/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_RTNETLINK_H
#define __NET_RTNETLINK_H
#include <linux/rtnetlink.h>
#include <net/netlink.h>
typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *,
struct netlink_ext_ack *);
typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *);
enum rtnl_link_flags {
RTNL_FLAG_DOIT_UNLOCKED = 1,
};
void rtnl_register(int protocol, int msgtype,
rtnl_doit_func, rtnl_dumpit_func, unsigned int flags);
int rtnl_register_module(struct module *owner, int protocol, int msgtype,
rtnl_doit_func, rtnl_dumpit_func, unsigned int flags);
int rtnl_unregister(int protocol, int msgtype);
void rtnl_unregister_all(int protocol);
static inline int rtnl_msg_family(const struct nlmsghdr *nlh)
{
if (nlmsg_len(nlh) >= sizeof(struct rtgenmsg)) return ((struct rtgenmsg *) nlmsg_data(nlh))->rtgen_family;
else
return AF_UNSPEC;
}
/**
* struct rtnl_link_ops - rtnetlink link operations
*
* @list: Used internally
* @kind: Identifier
* @netns_refund: Physical device, move to init_net on netns exit
* @maxtype: Highest device specific netlink attribute number
* @policy: Netlink policy for device specific attribute validation
* @validate: Optional validation function for netlink/changelink parameters
* @alloc: netdev allocation function, can be %NULL and is then used
* in place of alloc_netdev_mqs(), in this case @priv_size
* and @setup are unused. Returns a netdev or ERR_PTR().
* @priv_size: sizeof net_device private space
* @setup: net_device setup function
* @newlink: Function for configuring and registering a new device
* @changelink: Function for changing parameters of an existing device
* @dellink: Function to remove a device
* @get_size: Function to calculate required room for dumping device
* specific netlink attributes
* @fill_info: Function to dump device specific netlink attributes
* @get_xstats_size: Function to calculate required room for dumping device
* specific statistics
* @fill_xstats: Function to dump device specific statistics
* @get_num_tx_queues: Function to determine number of transmit queues
* to create when creating a new device.
* @get_num_rx_queues: Function to determine number of receive queues
* to create when creating a new device.
* @get_link_net: Function to get the i/o netns of the device
* @get_linkxstats_size: Function to calculate the required room for
* dumping device-specific extended link stats
* @fill_linkxstats: Function to dump device-specific extended link stats
*/
struct rtnl_link_ops {
struct list_head list;
const char *kind;
size_t priv_size;
struct net_device *(*alloc)(struct nlattr *tb[],
const char *ifname,
unsigned char name_assign_type,
unsigned int num_tx_queues,
unsigned int num_rx_queues);
void (*setup)(struct net_device *dev);
bool netns_refund;
unsigned int maxtype;
const struct nla_policy *policy;
int (*validate)(struct nlattr *tb[],
struct nlattr *data[],
struct netlink_ext_ack *extack);
int (*newlink)(struct net *src_net,
struct net_device *dev,
struct nlattr *tb[],
struct nlattr *data[],
struct netlink_ext_ack *extack);
int (*changelink)(struct net_device *dev,
struct nlattr *tb[],
struct nlattr *data[],
struct netlink_ext_ack *extack);
void (*dellink)(struct net_device *dev,
struct list_head *head);
size_t (*get_size)(const struct net_device *dev);
int (*fill_info)(struct sk_buff *skb,
const struct net_device *dev);
size_t (*get_xstats_size)(const struct net_device *dev);
int (*fill_xstats)(struct sk_buff *skb,
const struct net_device *dev);
unsigned int (*get_num_tx_queues)(void);
unsigned int (*get_num_rx_queues)(void);
unsigned int slave_maxtype;
const struct nla_policy *slave_policy;
int (*slave_changelink)(struct net_device *dev,
struct net_device *slave_dev,
struct nlattr *tb[],
struct nlattr *data[],
struct netlink_ext_ack *extack);
size_t (*get_slave_size)(const struct net_device *dev,
const struct net_device *slave_dev);
int (*fill_slave_info)(struct sk_buff *skb,
const struct net_device *dev,
const struct net_device *slave_dev);
struct net *(*get_link_net)(const struct net_device *dev);
size_t (*get_linkxstats_size)(const struct net_device *dev,
int attr);
int (*fill_linkxstats)(struct sk_buff *skb,
const struct net_device *dev,
int *prividx, int attr);
};
int __rtnl_link_register(struct rtnl_link_ops *ops);
void __rtnl_link_unregister(struct rtnl_link_ops *ops);
int rtnl_link_register(struct rtnl_link_ops *ops);
void rtnl_link_unregister(struct rtnl_link_ops *ops);
/**
* struct rtnl_af_ops - rtnetlink address family operations
*
* @list: Used internally
* @family: Address family
* @fill_link_af: Function to fill IFLA_AF_SPEC with address family
* specific netlink attributes.
* @get_link_af_size: Function to calculate size of address family specific
* netlink attributes.
* @validate_link_af: Validate a IFLA_AF_SPEC attribute, must check attr
* for invalid configuration settings.
* @set_link_af: Function to parse a IFLA_AF_SPEC attribute and modify
* net_device accordingly.
*/
struct rtnl_af_ops {
struct list_head list;
int family;
int (*fill_link_af)(struct sk_buff *skb,
const struct net_device *dev,
u32 ext_filter_mask);
size_t (*get_link_af_size)(const struct net_device *dev,
u32 ext_filter_mask);
int (*validate_link_af)(const struct net_device *dev,
const struct nlattr *attr,
struct netlink_ext_ack *extack);
int (*set_link_af)(struct net_device *dev,
const struct nlattr *attr,
struct netlink_ext_ack *extack);
int (*fill_stats_af)(struct sk_buff *skb,
const struct net_device *dev);
size_t (*get_stats_af_size)(const struct net_device *dev);
};
void rtnl_af_register(struct rtnl_af_ops *ops);
void rtnl_af_unregister(struct rtnl_af_ops *ops);
struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]);
struct net_device *rtnl_create_link(struct net *net, const char *ifname,
unsigned char name_assign_type,
const struct rtnl_link_ops *ops,
struct nlattr *tb[],
struct netlink_ext_ack *extack);
int rtnl_delete_link(struct net_device *dev);
int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm);
int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len,
struct netlink_ext_ack *exterr);
struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid);
#define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind)
#endif
// SPDX-License-Identifier: GPL-2.0
/*
* Block multiqueue core code
*
* Copyright (C) 2013-2014 Jens Axboe
* Copyright (C) 2013-2014 Christoph Hellwig
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
static int blk_mq_poll_stats_bkt(const struct request *rq)
{
int ddir, sectors, bucket;
ddir = rq_data_dir(rq);
sectors = blk_rq_stats_sectors(rq);
bucket = ddir + 2 * ilog2(sectors);
if (bucket < 0)
return -1;
else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
return bucket;
}
/*
* Check if any of the ctx, dispatch list or elevator
* have pending work in this hardware queue.
*/
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
return !list_empty_careful(&hctx->dispatch) || sbitmap_any_bit_set(&hctx->ctx_map) ||
blk_mq_sched_has_work(hctx);
}
/*
* Mark this ctx as having pending work in this hardware queue
*/
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
const int bit = ctx->index_hw[hctx->type];
if (!sbitmap_test_bit(&hctx->ctx_map, bit))
sbitmap_set_bit(&hctx->ctx_map, bit);
}
static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
const int bit = ctx->index_hw[hctx->type];
sbitmap_clear_bit(&hctx->ctx_map, bit);
}
struct mq_inflight {
struct block_device *part;
unsigned int inflight[2];
};
static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv,
bool reserved)
{
struct mq_inflight *mi = priv;
if ((!mi->part->bd_partno || rq->part == mi->part) &&
blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
mi->inflight[rq_data_dir(rq)]++;
return true;
}
unsigned int blk_mq_in_flight(struct request_queue *q,
struct block_device *part)
{
struct mq_inflight mi = { .part = part };
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
return mi.inflight[0] + mi.inflight[1];
}
void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part };
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
inflight[0] = mi.inflight[0];
inflight[1] = mi.inflight[1];
}
void blk_freeze_queue_start(struct request_queue *q)
{
mutex_lock(&q->mq_freeze_lock);
if (++q->mq_freeze_depth == 1) {
percpu_ref_kill(&q->q_usage_counter);
mutex_unlock(&q->mq_freeze_lock);
if (queue_is_mq(q))
blk_mq_run_hw_queues(q, false);
} else {
mutex_unlock(&q->mq_freeze_lock);
}
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
void blk_mq_freeze_queue_wait(struct request_queue *q)
{
wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
unsigned long timeout)
{
return wait_event_timeout(q->mq_freeze_wq,
percpu_ref_is_zero(&q->q_usage_counter),
timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
/*
* Guarantee no request is in use, so we can change any data structure of
* the queue afterward.
*/
void blk_freeze_queue(struct request_queue *q)
{
/*
* In the !blk_mq case we are only calling this to kill the
* q_usage_counter, otherwise this increases the freeze depth
* and waits for it to return to zero. For this reason there is
* no blk_unfreeze_queue(), and blk_freeze_queue() is not
* exported to drivers as the only user for unfreeze is blk_mq.
*/
blk_freeze_queue_start(q);
blk_mq_freeze_queue_wait(q);
}
void blk_mq_freeze_queue(struct request_queue *q)
{
/*
* ...just an alias to keep freeze and unfreeze actions balanced
* in the blk_mq_* namespace
*/
blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
mutex_lock(&q->mq_freeze_lock);
if (force_atomic)
q->q_usage_counter.data->force_atomic = true; q->mq_freeze_depth--; WARN_ON_ONCE(q->mq_freeze_depth < 0); if (!q->mq_freeze_depth) { percpu_ref_resurrect(&q->q_usage_counter);
wake_up_all(&q->mq_freeze_wq);
}
mutex_unlock(&q->mq_freeze_lock);
}
void blk_mq_unfreeze_queue(struct request_queue *q)
{
__blk_mq_unfreeze_queue(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
/*
* FIXME: replace the scsi_internal_device_*block_nowait() calls in the
* mpt3sas driver such that this function can be removed.
*/
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
/**
* blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
* @q: request queue.
*
* Note: this function does not prevent that the struct request end_io()
* callback function is invoked. Once this function is returned, we make
* sure no dispatch can happen until the queue is unquiesced via
* blk_mq_unquiesce_queue().
*/
void blk_mq_quiesce_queue(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
bool rcu = false;
blk_mq_quiesce_queue_nowait(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->flags & BLK_MQ_F_BLOCKING)
synchronize_srcu(hctx->srcu);
else
rcu = true;
}
if (rcu)
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
/*
* blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
* @q: request queue.
*
* This function recovers queue into the state before quiescing
* which is done by blk_mq_quiesce_queue.
*/
void blk_mq_unquiesce_queue(struct request_queue *q)
{
blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
/* dispatch requests which are inserted during quiescing */
blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_wakeup_all(hctx->tags, true);
}
/*
* Only need start/end time stamping if we have iostat or
* blk stats enabled, or using an IO scheduler.
*/
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
}
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
unsigned int tag, u64 alloc_time_ns)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
if (data->q->elevator) {
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
} else {
rq->tag = tag;
rq->internal_tag = BLK_MQ_NO_TAG;
}
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = data->q;
rq->mq_ctx = data->ctx;
rq->mq_hctx = data->hctx;
rq->rq_flags = 0;
rq->cmd_flags = data->cmd_flags;
if (data->flags & BLK_MQ_REQ_PM)
rq->rq_flags |= RQF_PM; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; INIT_LIST_HEAD(&rq->queuelist);
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->rq_disk = NULL;
rq->part = NULL;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
rq->alloc_time_ns = alloc_time_ns;
#endif
if (blk_mq_need_time_stamp(rq))
rq->start_time_ns = ktime_get_ns();
else
rq->start_time_ns = 0; rq->io_start_time_ns = 0;
rq->stats_sectors = 0;
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
#endif
blk_crypto_rq_set_defaults(rq);
/* tag was already set */
WRITE_ONCE(rq->deadline, 0);
rq->timeout = 0;
rq->end_io = NULL;
rq->end_io_data = NULL;
data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
refcount_set(&rq->ref, 1);
if (!op_is_flush(data->cmd_flags)) {
struct elevator_queue *e = data->q->elevator;
rq->elv.icq = NULL;
if (e && e->type->ops.prepare_request) { if (e->type->icq_cache) blk_mq_sched_assign_ioc(rq); e->type->ops.prepare_request(rq);
rq->rq_flags |= RQF_ELVPRIV;
}
}
data->hctx->queued++;
return rq;
}
static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
{
struct request_queue *q = data->q;
struct elevator_queue *e = q->elevator;
u64 alloc_time_ns = 0;
unsigned int tag;
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT; if (e) {
/*
* Flush/passthrough requests are special and go directly to the
* dispatch list. Don't include reserved tags in the
* limiting, as it isn't useful.
*/
if (!op_is_flush(data->cmd_flags) &&
!blk_op_is_passthrough(data->cmd_flags) &&
e->type->ops.limit_depth && !(data->flags & BLK_MQ_REQ_RESERVED)) e->type->ops.limit_depth(data->cmd_flags, data);
}
retry:
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
if (!e)
blk_mq_tag_busy(data->hctx);
/*
* Waiting allocations only fail because of an inactive hctx. In that
* case just retry the hctx assignment and tag allocation as CPU hotplug
* should have migrated us to an online CPU by now.
*/
tag = blk_mq_get_tag(data);
if (tag == BLK_MQ_NO_TAG) {
if (data->flags & BLK_MQ_REQ_NOWAIT)
return NULL;
/*
* Give up the CPU and sleep for a random short time to ensure
* that thread using a realtime scheduling class are migrated
* off the CPU, and thus off the hctx that is going away.
*/
msleep(3);
goto retry;
}
return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
}
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
blk_mq_req_flags_t flags)
{
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.cmd_flags = op,
};
struct request *rq;
int ret;
ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
rq = __blk_mq_alloc_request(&data);
if (!rq)
goto out_queue_exit;
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
return rq;
out_queue_exit:
blk_queue_exit(q);
return ERR_PTR(-EWOULDBLOCK);
}
EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
.cmd_flags = op,
};
u64 alloc_time_ns = 0;
unsigned int cpu;
unsigned int tag;
int ret;
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
/*
* If the tag allocator sleeps we could get an allocation for a
* different hardware context. No need to complicate the low level
* allocator for this for the rare use case of a command tied to
* a specific queue.
*/
if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
return ERR_PTR(-EINVAL);
if (hctx_idx >= q->nr_hw_queues)
return ERR_PTR(-EIO);
ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
/*
* Check if the hardware context is actually mapped to anything.
* If not tell the caller that it should skip this queue.
*/
ret = -EXDEV;
data.hctx = q->queue_hw_ctx[hctx_idx];
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
data.ctx = __blk_mq_get_ctx(q, cpu);
if (!q->elevator)
blk_mq_tag_busy(data.hctx);
ret = -EWOULDBLOCK;
tag = blk_mq_get_tag(&data);
if (tag == BLK_MQ_NO_TAG)
goto out_queue_exit;
return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
out_queue_exit:
blk_queue_exit(q);
return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
static void __blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
const int sched_tag = rq->internal_tag;
blk_crypto_free_request(rq);
blk_pm_mark_last_busy(rq);
rq->mq_hctx = NULL;
if (rq->tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
if (sched_tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
blk_queue_exit(q);
}
void blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if (rq->rq_flags & RQF_ELVPRIV) {
if (e && e->type->ops.finish_request)
e->type->ops.finish_request(rq);
if (rq->elv.icq) {
put_io_context(rq->elv.icq->ioc);
rq->elv.icq = NULL;
}
}
ctx->rq_completed[rq_is_sync(rq)]++;
if (rq->rq_flags & RQF_MQ_INFLIGHT)
__blk_mq_dec_active_requests(hctx);
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
laptop_io_completion(q->disk->bdi);
rq_qos_done(q, rq);
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
u64 now = 0;
if (blk_mq_need_time_stamp(rq))
now = ktime_get_ns();
if (rq->rq_flags & RQF_STATS) {
blk_mq_poll_stats_start(rq->q);
blk_stat_add(rq, now);
}
blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
if (rq->end_io) {
rq_qos_done(rq->q, rq);
rq->end_io(rq, error);
} else {
blk_mq_free_request(rq);
}
}
EXPORT_SYMBOL(__blk_mq_end_request);
void blk_mq_end_request(struct request *rq, blk_status_t error)
{
if (blk_update_request(rq, error, blk_rq_bytes(rq)))
BUG();
__blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);
static void blk_complete_reqs(struct llist_head *list)
{
struct llist_node *entry = llist_reverse_order(llist_del_all(list));
struct request *rq, *next;
llist_for_each_entry_safe(rq, next, entry, ipi_list)
rq->q->mq_ops->complete(rq);
}
static __latent_entropy void blk_done_softirq(struct softirq_action *h)
{
blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
}
static int blk_softirq_cpu_dead(unsigned int cpu)
{
blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
return 0;
}
static void __blk_mq_complete_request_remote(void *data)
{
__raise_softirq_irqoff(BLOCK_SOFTIRQ);
}
static inline bool blk_mq_complete_need_ipi(struct request *rq)
{
int cpu = raw_smp_processor_id();
if (!IS_ENABLED(CONFIG_SMP) ||
!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
return false;
/*
* With force threaded interrupts enabled, raising softirq from an SMP
* function call will always result in waking the ksoftirqd thread.
* This is probably worse than completing the request on a different
* cache domain.
*/
if (force_irqthreads())
return false;
/* same CPU or cache domain? Complete locally */
if (cpu == rq->mq_ctx->cpu ||
(!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
cpus_share_cache(cpu, rq->mq_ctx->cpu)))
return false;
/* don't try to IPI to an offline CPU */
return cpu_online(rq->mq_ctx->cpu);
}
static void blk_mq_complete_send_ipi(struct request *rq)
{
struct llist_head *list;
unsigned int cpu;
cpu = rq->mq_ctx->cpu;
list = &per_cpu(blk_cpu_done, cpu);
if (llist_add(&rq->ipi_list, list)) {
INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
smp_call_function_single_async(cpu, &rq->csd);
}
}
static void blk_mq_raise_softirq(struct request *rq)
{
struct llist_head *list;
preempt_disable();
list = this_cpu_ptr(&blk_cpu_done);
if (llist_add(&rq->ipi_list, list))
raise_softirq(BLOCK_SOFTIRQ);
preempt_enable();
}
bool blk_mq_complete_request_remote(struct request *rq)
{
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
/*
* For a polled request, always complete locallly, it's pointless
* to redirect the completion.
*/
if (rq->cmd_flags & REQ_HIPRI)
return false;
if (blk_mq_complete_need_ipi(rq)) {
blk_mq_complete_send_ipi(rq);
return true;
}
if (rq->q->nr_hw_queues == 1) {
blk_mq_raise_softirq(rq);
return true;
}
return false;
}
EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
*
* Description:
* Complete a request by scheduling the ->complete_rq operation.
**/
void blk_mq_complete_request(struct request *rq)
{
if (!blk_mq_complete_request_remote(rq))
rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
__releases(hctx->srcu)
{
if (!(hctx->flags & BLK_MQ_F_BLOCKING))
rcu_read_unlock();
else
srcu_read_unlock(hctx->srcu, srcu_idx);
}
static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
__acquires(hctx->srcu)
{
if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
/* shut up gcc false positive */
*srcu_idx = 0;
rcu_read_lock();
} else
*srcu_idx = srcu_read_lock(hctx->srcu);
}
/**
* blk_mq_start_request - Start processing a request
* @rq: Pointer to request to be started
*
* Function used by device drivers to notify the block layer that a request
* is going to be processed now, so blk layer can do proper initializations
* such as starting the timeout timer.
*/
void blk_mq_start_request(struct request *rq)
{
struct request_queue *q = rq->q;
trace_block_rq_issue(rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
rq->io_start_time_ns = ktime_get_ns();
rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
rq_qos_issue(q, rq);
}
WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); blk_add_timer(rq);
WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
q->integrity.profile->prepare_fn(rq);
#endif
}
EXPORT_SYMBOL(blk_mq_start_request);
static void __blk_mq_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
blk_mq_put_driver_tag(rq);
trace_block_rq_requeue(rq);
rq_qos_requeue(q, rq);
if (blk_mq_request_started(rq)) {
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
rq->rq_flags &= ~RQF_TIMED_OUT;
}
}
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
__blk_mq_requeue_request(rq);
/* this request will be re-inserted to io scheduler queue */
blk_mq_sched_requeue_request(rq);
blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);
static void blk_mq_requeue_work(struct work_struct *work)
{
struct request_queue *q =
container_of(work, struct request_queue, requeue_work.work);
LIST_HEAD(rq_list);
struct request *rq, *next;
spin_lock_irq(&q->requeue_lock);
list_splice_init(&q->requeue_list, &rq_list);
spin_unlock_irq(&q->requeue_lock);
list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
continue;
rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
/*
* If RQF_DONTPREP, rq has contained some driver specific
* data, so insert it to hctx dispatch list to avoid any
* merge.
*/
if (rq->rq_flags & RQF_DONTPREP)
blk_mq_request_bypass_insert(rq, false, false);
else
blk_mq_sched_insert_request(rq, true, false, false);
}
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
blk_mq_sched_insert_request(rq, false, false, false);
}
blk_mq_run_hw_queues(q, false);
}
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
bool kick_requeue_list)
{
struct request_queue *q = rq->q;
unsigned long flags;
/*
* We abuse this flag that is otherwise used by the I/O scheduler to
* request head insertion from the workqueue.
*/
BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
spin_lock_irqsave(&q->requeue_lock, flags);
if (at_head) {
rq->rq_flags |= RQF_SOFTBARRIER;
list_add(&rq->queuelist, &q->requeue_list);
} else {
list_add_tail(&rq->queuelist, &q->requeue_list);
}
spin_unlock_irqrestore(&q->requeue_lock, flags);
if (kick_requeue_list)
blk_mq_kick_requeue_list(q);
}
void blk_mq_kick_requeue_list(struct request_queue *q)
{
kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
void blk_mq_delay_kick_requeue_list(struct request_queue *q,
unsigned long msecs)
{
kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
if (tag < tags->nr_tags) {
prefetch(tags->rqs[tag]);
return tags->rqs[tag];
}
return NULL;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);
static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
void *priv, bool reserved)
{
/*
* If we find a request that isn't idle and the queue matches,
* we know the queue is busy. Return false to stop the iteration.
*/
if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
bool *busy = priv;
*busy = true;
return false;
}
return true;
}
bool blk_mq_queue_inflight(struct request_queue *q)
{
bool busy = false;
blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
return busy;
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
static void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
req->rq_flags |= RQF_TIMED_OUT;
if (req->q->mq_ops->timeout) {
enum blk_eh_timer_return ret;
ret = req->q->mq_ops->timeout(req, reserved);
if (ret == BLK_EH_DONE)
return;
WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
}
blk_add_timer(req);
}
static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
{
unsigned long deadline;
if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
return false;
if (rq->rq_flags & RQF_TIMED_OUT)
return false;
deadline = READ_ONCE(rq->deadline);
if (time_after_eq(jiffies, deadline))
return true;
if (*next == 0)
*next = deadline;
else if (time_after(*next, deadline))
*next = deadline;
return false;
}
void blk_mq_put_rq_ref(struct request *rq)
{
if (is_flush_rq(rq))
rq->end_io(rq, 0);
else if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
}
static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
unsigned long *next = priv;
/*
* blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
* be reallocated underneath the timeout handler's processing, then
* the expire check is reliable. If the request is not expired, then
* it was completed and reallocated as a new request after returning
* from blk_mq_check_expired().
*/
if (blk_mq_req_expired(rq, next))
blk_mq_rq_timed_out(rq, reserved);
return true;
}
static void blk_mq_timeout_work(struct work_struct *work)
{
struct request_queue *q =
container_of(work, struct request_queue, timeout_work);
unsigned long next = 0;
struct blk_mq_hw_ctx *hctx;
int i;
/* A deadlock might occur if a request is stuck requiring a
* timeout at the same time a queue freeze is waiting
* completion, since the timeout code would not be able to
* acquire the queue reference here.
*
* That's why we don't use blk_queue_enter here; instead, we use
* percpu_ref_tryget directly, because we need to be able to
* obtain a reference even in the short window between the queue
* starting to freeze, by dropping the first reference in
* blk_freeze_queue_start, and the moment the last request is
* consumed, marked by the instant q_usage_counter reaches
* zero.
*/
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
if (next != 0) {
mod_timer(&q->timeout, next);
} else {
/*
* Request timeouts are handled as a forward rolling timer. If
* we end up here it means that no requests are pending and
* also that no request has been pending for a while. Mark
* each hctx as idle.
*/
queue_for_each_hw_ctx(q, hctx, i) {
/* the hctx may be unmapped, so check it here */
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
}
}
blk_queue_exit(q);
}
struct flush_busy_ctx_data {
struct blk_mq_hw_ctx *hctx;
struct list_head *list;
};
static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
struct flush_busy_ctx_data *flush_data = data;
struct blk_mq_hw_ctx *hctx = flush_data->hctx;
struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
enum hctx_type type = hctx->type;
spin_lock(&ctx->lock);
list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
sbitmap_clear_bit(sb, bitnr);
spin_unlock(&ctx->lock);
return true;
}
/*
* Process software queues that have been marked busy, splicing them
* to the for-dispatch
*/
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
struct flush_busy_ctx_data data = {
.hctx = hctx,
.list = list,
};
sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
struct dispatch_rq_data {
struct blk_mq_hw_ctx *hctx;
struct request *rq;
};
static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
void *data)
{
struct dispatch_rq_data *dispatch_data = data;
struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
enum hctx_type type = hctx->type;
spin_lock(&ctx->lock);
if (!list_empty(&ctx->rq_lists[type])) {
dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
list_del_init(&dispatch_data->rq->queuelist);
if (list_empty(&ctx->rq_lists[type]))
sbitmap_clear_bit(sb, bitnr);
}
spin_unlock(&ctx->lock);
return !dispatch_data->rq;
}
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start)
{
unsigned off = start ? start->index_hw[hctx->type] : 0;
struct dispatch_rq_data data = {
.hctx = hctx,
.rq = NULL,
};
__sbitmap_for_each_set(&hctx->ctx_map, off,
dispatch_rq_from_ctx, &data);
return data.rq;
}
static inline unsigned int queued_to_index(unsigned int queued)
{
if (!queued)
return 0;
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
static bool __blk_mq_get_driver_tag(struct request *rq)
{
struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
int tag;
blk_mq_tag_busy(rq->mq_hctx);
if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { bt = rq->mq_hctx->tags->breserved_tags;
tag_offset = 0;
} else {
if (!hctx_may_queue(rq->mq_hctx, bt))
return false;
}
tag = __sbitmap_queue_get(bt);
if (tag == BLK_MQ_NO_TAG)
return false;
rq->tag = tag + tag_offset;
return true;
}
bool blk_mq_get_driver_tag(struct request *rq)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
return false;
if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && !(rq->rq_flags & RQF_MQ_INFLIGHT)) { rq->rq_flags |= RQF_MQ_INFLIGHT;
__blk_mq_inc_active_requests(hctx);
}
hctx->tags->rqs[rq->tag] = rq; return true;
}
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
int flags, void *key)
{
struct blk_mq_hw_ctx *hctx;
hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
spin_lock(&hctx->dispatch_wait_lock);
if (!list_empty(&wait->entry)) {
struct sbitmap_queue *sbq;
list_del_init(&wait->entry);
sbq = hctx->tags->bitmap_tags;
atomic_dec(&sbq->ws_active);
}
spin_unlock(&hctx->dispatch_wait_lock);
blk_mq_run_hw_queue(hctx, true);
return 1;
}
/*
* Mark us waiting for a tag. For shared tags, this involves hooking us into
* the tag wakeups. For non-shared tags, we can simply mark us needing a
* restart. For both cases, take care to check the condition again after
* marking us as waiting.
*/
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
struct wait_queue_head *wq;
wait_queue_entry_t *wait;
bool ret;
if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { blk_mq_sched_mark_restart_hctx(hctx);
/*
* It's possible that a tag was freed in the window between the
* allocation failure and adding the hardware queue to the wait
* queue.
*
* Don't clear RESTART here, someone else could have set it.
* At most this will cost an extra queue run.
*/
return blk_mq_get_driver_tag(rq);
}
wait = &hctx->dispatch_wait;
if (!list_empty_careful(&wait->entry))
return false;
wq = &bt_wait_ptr(sbq, hctx)->wait;
spin_lock_irq(&wq->lock);
spin_lock(&hctx->dispatch_wait_lock);
if (!list_empty(&wait->entry)) {
spin_unlock(&hctx->dispatch_wait_lock);
spin_unlock_irq(&wq->lock);
return false;
}
atomic_inc(&sbq->ws_active);
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
__add_wait_queue(wq, wait);
/*
* It's possible that a tag was freed in the window between the
* allocation failure and adding the hardware queue to the wait
* queue.
*/
ret = blk_mq_get_driver_tag(rq);
if (!ret) {
spin_unlock(&hctx->dispatch_wait_lock);
spin_unlock_irq(&wq->lock);
return false;
}
/*
* We got a tag, remove ourselves from the wait queue to ensure
* someone else gets the wakeup.
*/
list_del_init(&wait->entry);
atomic_dec(&sbq->ws_active);
spin_unlock(&hctx->dispatch_wait_lock);
spin_unlock_irq(&wq->lock);
return true;
}
#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
/*
* Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
* - EWMA is one simple way to compute running average value
* - weight(7/8 and 1/8) is applied so that it can decrease exponentially
* - take 4 as factor for avoiding to get too small(0) result, and this
* factor doesn't matter because EWMA decreases exponentially
*/
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
unsigned int ewma;
ewma = hctx->dispatch_busy;
if (!ewma && !busy)
return;
ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
if (busy)
ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
hctx->dispatch_busy = ewma;
}
#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
static void blk_mq_handle_dev_resource(struct request *rq,
struct list_head *list)
{
struct request *next =
list_first_entry_or_null(list, struct request, queuelist);
/*
* If an I/O scheduler has been configured and we got a driver tag for
* the next request already, free it.
*/
if (next)
blk_mq_put_driver_tag(next);
list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq);
}
static void blk_mq_handle_zone_resource(struct request *rq,
struct list_head *zone_list)
{
/*
* If we end up here it is because we cannot dispatch a request to a
* specific zone due to LLD level zone-write locking or other zone
* related resource not being available. In this case, set the request
* aside in zone_list for retrying it later.
*/
list_add(&rq->queuelist, zone_list);
__blk_mq_requeue_request(rq);
}
enum prep_dispatch {
PREP_DISPATCH_OK,
PREP_DISPATCH_NO_TAG,
PREP_DISPATCH_NO_BUDGET,
};
static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
bool need_budget)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
int budget_token = -1;
if (need_budget) { budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) {
blk_mq_put_driver_tag(rq);
return PREP_DISPATCH_NO_BUDGET;
}
blk_mq_set_rq_budget_token(rq, budget_token);
}
if (!blk_mq_get_driver_tag(rq)) {
/*
* The initial allocation attempt failed, so we need to
* rerun the hardware queue when a tag is freed. The
* waitqueue takes care of that. If the queue is run
* before we add this entry back on the dispatch list,
* we'll re-run it below.
*/
if (!blk_mq_mark_tag_wait(hctx, rq)) {
/*
* All budgets not got from this function will be put
* together during handling partial dispatch
*/
if (need_budget) blk_mq_put_dispatch_budget(rq->q, budget_token);
return PREP_DISPATCH_NO_TAG;
}
}
return PREP_DISPATCH_OK;
}
/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
static void blk_mq_release_budgets(struct request_queue *q,
struct list_head *list)
{
struct request *rq;
list_for_each_entry(rq, list, queuelist) {
int budget_token = blk_mq_get_rq_budget_token(rq);
if (budget_token >= 0)
blk_mq_put_dispatch_budget(q, budget_token);
}
}
/*
* Returns true if we did some work AND can potentially do more.
*/
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
unsigned int nr_budgets)
{
enum prep_dispatch prep;
struct request_queue *q = hctx->queue;
struct request *rq, *nxt;
int errors, queued;
blk_status_t ret = BLK_STS_OK;
LIST_HEAD(zone_list);
bool needs_resource = false;
if (list_empty(list))
return false;
/*
* Now process all the entries, sending them to the driver.
*/
errors = queued = 0;
do {
struct blk_mq_queue_data bd;
rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx);
prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
if (prep != PREP_DISPATCH_OK)
break;
list_del_init(&rq->queuelist);
bd.rq = rq;
/*
* Flag last if we have no more requests, or if we have more
* but can't assign a driver tag to it.
*/
if (list_empty(list))
bd.last = true;
else {
nxt = list_first_entry(list, struct request, queuelist);
bd.last = !blk_mq_get_driver_tag(nxt);
}
/*
* once the request is queued to lld, no need to cover the
* budget any more
*/
if (nr_budgets) nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
queued++;
break;
case BLK_STS_RESOURCE:
needs_resource = true;
fallthrough;
case BLK_STS_DEV_RESOURCE:
blk_mq_handle_dev_resource(rq, list);
goto out;
case BLK_STS_ZONE_RESOURCE:
/*
* Move the request to zone_list and keep going through
* the dispatch list to find more requests the drive can
* accept.
*/
blk_mq_handle_zone_resource(rq, &zone_list);
needs_resource = true;
break;
default:
errors++;
blk_mq_end_request(rq, ret);
}
} while (!list_empty(list));
out:
if (!list_empty(&zone_list))
list_splice_tail_init(&zone_list, list);
hctx->dispatched[queued_to_index(queued)]++;
/* If we didn't flush the entire list, we could have told the driver
* there was more coming, but that turned out to be a lie.
*/
if ((!list_empty(list) || errors) && q->mq_ops->commit_rqs && queued) q->mq_ops->commit_rqs(hctx);
/*
* Any items that need requeuing? Stuff them into hctx->dispatch,
* that is where we will continue on next queue run.
*/
if (!list_empty(list)) {
bool needs_restart;
/* For non-shared tags, the RESTART check will suffice */
bool no_tag = prep == PREP_DISPATCH_NO_TAG && (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); if (nr_budgets)
blk_mq_release_budgets(q, list);
spin_lock(&hctx->lock);
list_splice_tail_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
/*
* Order adding requests to hctx->dispatch and checking
* SCHED_RESTART flag. The pair of this smp_mb() is the one
* in blk_mq_sched_restart(). Avoid restart code path to
* miss the new added requests to hctx->dispatch, meantime
* SCHED_RESTART is observed here.
*/
smp_mb();
/*
* If SCHED_RESTART was set by the caller of this function and
* it is no longer set that means that it was cleared by another
* thread and hence that a queue rerun is needed.
*
* If 'no_tag' is set, that means that we failed getting
* a driver tag with an I/O scheduler attached. If our dispatch
* waitqueue is no longer active, ensure that we run the queue
* AFTER adding our entries back to the list.
*
* If no I/O scheduler has been configured it is possible that
* the hardware queue got stopped and restarted before requests
* were pushed back onto the dispatch list. Rerun the queue to
* avoid starvation. Notes:
* - blk_mq_run_hw_queue() checks whether or not a queue has
* been stopped before rerunning a queue.
* - Some but not all block drivers stop a queue before
* returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
* and dm-rq.
*
* If driver returns BLK_STS_RESOURCE and SCHED_RESTART
* bit is set, run queue after a delay to avoid IO stalls
* that could otherwise occur if the queue is idle. We'll do
* similar if we couldn't get budget or couldn't lock a zone
* and SCHED_RESTART is set.
*/
needs_restart = blk_mq_sched_needs_restart(hctx);
if (prep == PREP_DISPATCH_NO_BUDGET)
needs_resource = true;
if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); else if (needs_restart && needs_resource)
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
blk_mq_update_dispatch_busy(hctx, true);
return false;
} else
blk_mq_update_dispatch_busy(hctx, false);
return (queued + errors) != 0;
}
/**
* __blk_mq_run_hw_queue - Run a hardware queue.
* @hctx: Pointer to the hardware queue to run.
*
* Send pending requests to the hardware.
*/
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
int srcu_idx;
/*
* We can't run the queue inline with ints disabled. Ensure that
* we catch bad users of this early.
*/
WARN_ON_ONCE(in_interrupt()); might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
hctx_lock(hctx, &srcu_idx);
blk_mq_sched_dispatch_requests(hctx);
hctx_unlock(hctx, srcu_idx);
}
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
{
int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
if (cpu >= nr_cpu_ids)
cpu = cpumask_first(hctx->cpumask);
return cpu;
}
/*
* It'd be great if the workqueue API had a way to pass
* in a mask and had some smarts for more clever placement.
* For now we just round-robin here, switching for every
* BLK_MQ_CPU_WORK_BATCH queued items.
*/
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
bool tried = false;
int next_cpu = hctx->next_cpu;
if (hctx->queue->nr_hw_queues == 1)
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
select_cpu:
next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
next_cpu = blk_mq_first_mapped_cpu(hctx);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
/*
* Do unbound schedule if we can't find a online CPU for this hctx,
* and it should only happen in the path of handling CPU DEAD.
*/
if (!cpu_online(next_cpu)) {
if (!tried) {
tried = true;
goto select_cpu;
}
/*
* Make sure to re-select CPU next time once after CPUs
* in hctx->cpumask become online again.
*/
hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = 1;
return WORK_CPU_UNBOUND;
}
hctx->next_cpu = next_cpu;
return next_cpu;
}
/**
* __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
* @hctx: Pointer to the hardware queue to run.
* @async: If we want to run the queue asynchronously.
* @msecs: Milliseconds of delay to wait before running the queue.
*
* If !@async, try to run the queue now. Else, run the queue asynchronously and
* with a delay of @msecs.
*/
static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
unsigned long msecs)
{
if (unlikely(blk_mq_hctx_stopped(hctx)))
return;
if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { int cpu = get_cpu();
if (cpumask_test_cpu(cpu, hctx->cpumask)) {
__blk_mq_run_hw_queue(hctx);
put_cpu();
return;
}
put_cpu();
}
kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
msecs_to_jiffies(msecs));
}
/**
* blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
* @hctx: Pointer to the hardware queue to run.
* @msecs: Milliseconds of delay to wait before running the queue.
*
* Run a hardware queue asynchronously with a delay of @msecs.
*/
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
__blk_mq_delay_run_hw_queue(hctx, true, msecs);
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
/**
* blk_mq_run_hw_queue - Start to run a hardware queue.
* @hctx: Pointer to the hardware queue to run.
* @async: If we want to run the queue asynchronously.
*
* Check if the request queue is not in a quiesced state and if there are
* pending requests to be sent. If this is true, run the queue to send requests
* to hardware.
*/
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
int srcu_idx;
bool need_run;
/*
* When queue is quiesced, we may be switching io scheduler, or
* updating nr_hw_queues, or other things, and we can't run queue
* any more, even __blk_mq_hctx_has_pending() can't be called safely.
*
* And queue will be rerun in blk_mq_unquiesce_queue() if it is
* quiesced.
*/
hctx_lock(hctx, &srcu_idx);
need_run = !blk_queue_quiesced(hctx->queue) &&
blk_mq_hctx_has_pending(hctx);
hctx_unlock(hctx, srcu_idx);
if (need_run)
__blk_mq_delay_run_hw_queue(hctx, async, 0);
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);
/*
* Is the request queue handled by an IO scheduler that does not respect
* hardware queues when dispatching?
*/
static bool blk_mq_has_sqsched(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
if (e && e->type->ops.dispatch_request && !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
return true;
return false;
}
/*
* Return prefered queue to dispatch from (if any) for non-mq aware IO
* scheduler.
*/
static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
/*
* If the IO scheduler does not respect hardware queues when
* dispatching, we just don't bother with multiple HW queues and
* dispatch from hctx for the current CPU since running multiple queues
* just causes lock contention inside the scheduler and pointless cache
* bouncing.
*/
hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT,
raw_smp_processor_id());
if (!blk_mq_hctx_stopped(hctx))
return hctx;
return NULL;
}
/**
* blk_mq_run_hw_queues - Run all hardware queues in a request queue.
* @q: Pointer to the request queue to run.
* @async: If we want to run the queue asynchronously.
*/
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx, *sq_hctx;
int i;
sq_hctx = NULL;
if (blk_mq_has_sqsched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_hctx_stopped(hctx))
continue;
/*
* Dispatch from this hctx either if there's no hctx preferred
* by IO scheduler or if it has requests that bypass the
* scheduler.
*/
if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_run_hw_queue(hctx, async);
}
}
EXPORT_SYMBOL(blk_mq_run_hw_queues);
/**
* blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
* @q: Pointer to the request queue to run.
* @msecs: Milliseconds of delay to wait before running the queues.
*/
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
struct blk_mq_hw_ctx *hctx, *sq_hctx;
int i;
sq_hctx = NULL;
if (blk_mq_has_sqsched(q))
sq_hctx = blk_mq_get_sq_hctx(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_hctx_stopped(hctx))
continue;
/*
* Dispatch from this hctx either if there's no hctx preferred
* by IO scheduler or if it has requests that bypass the
* scheduler.
*/
if (!sq_hctx || sq_hctx == hctx ||
!list_empty_careful(&hctx->dispatch))
blk_mq_delay_run_hw_queue(hctx, msecs);
}
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
/**
* blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
* @q: request queue.
*
* The caller is responsible for serializing this function against
* blk_mq_{start,stop}_hw_queue().
*/
bool blk_mq_queue_stopped(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hctx_stopped(hctx))
return true;
return false;
}
EXPORT_SYMBOL(blk_mq_queue_stopped);
/*
* This function is often used for pausing .queue_rq() by driver when
* there isn't enough resource or some conditions aren't satisfied, and
* BLK_STS_RESOURCE is usually returned.
*
* We do not guarantee that dispatch can be drained or blocked
* after blk_mq_stop_hw_queue() returns. Please use
* blk_mq_quiesce_queue() for that requirement.
*/
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
cancel_delayed_work(&hctx->run_work);
set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
/*
* This function is often used for pausing .queue_rq() by driver when
* there isn't enough resource or some conditions aren't satisfied, and
* BLK_STS_RESOURCE is usually returned.
*
* We do not guarantee that dispatch can be drained or blocked
* after blk_mq_stop_hw_queues() returns. Please use
* blk_mq_quiesce_queue() for that requirement.
*/
void blk_mq_stop_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
blk_mq_run_hw_queue(hctx, false);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);
void blk_mq_start_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_start_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
if (!blk_mq_hctx_stopped(hctx))
return;
clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
blk_mq_run_hw_queue(hctx, async);
}
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_start_stopped_hw_queue(hctx, async);
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
static void blk_mq_run_work_fn(struct work_struct *work)
{
struct blk_mq_hw_ctx *hctx;
hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
/*
* If we are stopped, don't run the queue.
*/
if (blk_mq_hctx_stopped(hctx))
return;
__blk_mq_run_hw_queue(hctx);
}
static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
struct request *rq,
bool at_head)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
enum hctx_type type = hctx->type;
lockdep_assert_held(&ctx->lock);
trace_block_rq_insert(rq);
if (at_head)
list_add(&rq->queuelist, &ctx->rq_lists[type]);
else
list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
}
void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
lockdep_assert_held(&ctx->lock);
__blk_mq_insert_req_list(hctx, rq, at_head);
blk_mq_hctx_mark_pending(hctx, ctx);
}
/**
* blk_mq_request_bypass_insert - Insert a request at dispatch list.
* @rq: Pointer to request to be inserted.
* @at_head: true if the request should be inserted at the head of the list.
* @run_queue: If we should run the hardware queue after inserting the request.
*
* Should only be used carefully, when the caller knows we want to
* bypass a potential IO scheduler on the target device.
*/
void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
bool run_queue)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
spin_lock(&hctx->lock);
if (at_head)
list_add(&rq->queuelist, &hctx->dispatch);
else
list_add_tail(&rq->queuelist, &hctx->dispatch);
spin_unlock(&hctx->lock);
if (run_queue)
blk_mq_run_hw_queue(hctx, false);
}
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list)
{
struct request *rq;
enum hctx_type type = hctx->type;
/*
* preemption doesn't flush plug list, so it's possible ctx->cpu is
* offline now
*/
list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx);
trace_block_rq_insert(rq);
}
spin_lock(&ctx->lock);
list_splice_tail_init(list, &ctx->rq_lists[type]);
blk_mq_hctx_mark_pending(hctx, ctx);
spin_unlock(&ctx->lock);
}
static int plug_rq_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
struct request *rqa = container_of(a, struct request, queuelist);
struct request *rqb = container_of(b, struct request, queuelist);
if (rqa->mq_ctx != rqb->mq_ctx)
return rqa->mq_ctx > rqb->mq_ctx;
if (rqa->mq_hctx != rqb->mq_hctx)
return rqa->mq_hctx > rqb->mq_hctx;
return blk_rq_pos(rqa) > blk_rq_pos(rqb);
}
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
LIST_HEAD(list);
if (list_empty(&plug->mq_list))
return;
list_splice_init(&plug->mq_list, &list);
if (plug->rq_count > 2 && plug->multiple_queues) list_sort(NULL, &list, plug_rq_cmp); plug->rq_count = 0;
do {
struct list_head rq_list;
struct request *rq, *head_rq = list_entry_rq(list.next);
struct list_head *pos = &head_rq->queuelist; /* skip first */
struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
unsigned int depth = 1;
list_for_each_continue(pos, &list) {
rq = list_entry_rq(pos);
BUG_ON(!rq->q); if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
break;
depth++;
}
list_cut_before(&rq_list, &list, pos);
trace_block_unplug(head_rq->q, depth, !from_schedule); blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
from_schedule);
} while(!list_empty(&list));
}
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
unsigned int nr_segs)
{
int err;
if (bio->bi_opf & REQ_RAHEAD)
rq->cmd_flags |= REQ_FAILFAST_MASK; rq->__sector = bio->bi_iter.bi_sector;
rq->write_hint = bio->bi_write_hint;
blk_rq_bio_prep(rq, bio, nr_segs);
/* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
WARN_ON_ONCE(err);
blk_account_io_start(rq);
}
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq,
blk_qc_t *cookie, bool last)
{
struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = {
.rq = rq,
.last = last,
};
blk_qc_t new_cookie;
blk_status_t ret;
new_cookie = request_to_qc_t(hctx, rq);
/*
* For OK queue, we are done. For error, caller may kill it.
* Any other error (busy), just add it to our list as we
* previously would have done.
*/
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
blk_mq_update_dispatch_busy(hctx, false);
*cookie = new_cookie;
break;
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
blk_mq_update_dispatch_busy(hctx, true);
__blk_mq_requeue_request(rq);
break;
default:
blk_mq_update_dispatch_busy(hctx, false);
*cookie = BLK_QC_T_NONE;
break;
}
return ret;
}
static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq,
blk_qc_t *cookie,
bool bypass_insert, bool last)
{
struct request_queue *q = rq->q;
bool run_queue = true;
int budget_token;
/*
* RCU or SRCU read lock is needed before checking quiesced flag.
*
* When queue is stopped or quiesced, ignore 'bypass_insert' from
* blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
* and avoid driver to try to dispatch again.
*/
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
run_queue = false;
bypass_insert = false;
goto insert;
}
if (q->elevator && !bypass_insert)
goto insert;
budget_token = blk_mq_get_dispatch_budget(q);
if (budget_token < 0)
goto insert;
blk_mq_set_rq_budget_token(rq, budget_token);
if (!blk_mq_get_driver_tag(rq)) {
blk_mq_put_dispatch_budget(q, budget_token);
goto insert;
}
return __blk_mq_issue_directly(hctx, rq, cookie, last);
insert:
if (bypass_insert)
return BLK_STS_RESOURCE;
blk_mq_sched_insert_request(rq, false, run_queue, false); return BLK_STS_OK;
}
/**
* blk_mq_try_issue_directly - Try to send a request directly to device driver.
* @hctx: Pointer of the associated hardware queue.
* @rq: Pointer to request to be sent.
* @cookie: Request queue cookie.
*
* If the device has enough resources to accept a new request now, send the
* request directly to device driver. Else, insert at hctx->dispatch queue, so
* we can try send it another time in the future. Requests inserted at this
* queue have higher priority.
*/
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq, blk_qc_t *cookie)
{
blk_status_t ret;
int srcu_idx;
might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
hctx_lock(hctx, &srcu_idx);
ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
blk_mq_request_bypass_insert(rq, false, true); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); hctx_unlock(hctx, srcu_idx);
}
blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
blk_status_t ret;
int srcu_idx;
blk_qc_t unused_cookie;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
hctx_lock(hctx, &srcu_idx);
ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
hctx_unlock(hctx, srcu_idx);
return ret;
}
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list)
{
int queued = 0;
int errors = 0;
while (!list_empty(list)) {
blk_status_t ret;
struct request *rq = list_first_entry(list, struct request,
queuelist);
list_del_init(&rq->queuelist);
ret = blk_mq_request_issue_directly(rq, list_empty(list));
if (ret != BLK_STS_OK) {
if (ret == BLK_STS_RESOURCE ||
ret == BLK_STS_DEV_RESOURCE) {
blk_mq_request_bypass_insert(rq, false,
list_empty(list));
break;
}
blk_mq_end_request(rq, ret);
errors++;
} else
queued++;
}
/*
* If we didn't flush the entire list, we could have told
* the driver there was more coming, but that turned out to
* be a lie.
*/
if ((!list_empty(list) || errors) && hctx->queue->mq_ops->commit_rqs && queued) hctx->queue->mq_ops->commit_rqs(hctx);
}
static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
{
list_add_tail(&rq->queuelist, &plug->mq_list);
plug->rq_count++;
if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
struct request *tmp;
tmp = list_first_entry(&plug->mq_list, struct request,
queuelist);
if (tmp->q != rq->q) plug->multiple_queues = true;
}
}
/*
* Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
* queues. This is important for md arrays to benefit from merging
* requests.
*/
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
if (plug->multiple_queues)
return BLK_MAX_REQUEST_COUNT * 2;
return BLK_MAX_REQUEST_COUNT;
}
/**
* blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
*
* Builds up a request structure from @q and @bio and send to the device. The
* request may not be queued directly to hardware if:
* * This request can be merged with another one
* * We want to place request at plug queue for possible future merging
* * There is an IO scheduler active at this queue
*
* It will not queue the request if there is an error with the bio, or at the
* request creation.
*
* Returns: Request queue cookie.
*/
blk_qc_t blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
struct blk_mq_alloc_data data = {
.q = q,
};
struct request *rq;
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
unsigned int nr_segs;
blk_qc_t cookie;
blk_status_t ret;
bool hipri;
blk_queue_bounce(q, &bio);
__blk_queue_split(&bio, &nr_segs);
if (!bio_integrity_prep(bio))
goto queue_exit;
if (!is_flush_fua && !blk_queue_nomerges(q) &&
blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
goto queue_exit;
if (blk_mq_sched_bio_merge(q, bio, nr_segs))
goto queue_exit;
rq_qos_throttle(q, bio); hipri = bio->bi_opf & REQ_HIPRI;
data.cmd_flags = bio->bi_opf;
rq = __blk_mq_alloc_request(&data);
if (unlikely(!rq)) {
rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
goto queue_exit;
}
trace_block_getrq(bio); rq_qos_track(q, rq, bio); cookie = request_to_qc_t(data.hctx, rq);
blk_mq_bio_to_request(rq, bio, nr_segs);
ret = blk_crypto_init_request(rq);
if (ret != BLK_STS_OK) {
bio->bi_status = ret;
bio_endio(bio);
blk_mq_free_request(rq);
return BLK_QC_T_NONE;
}
plug = blk_mq_plug(q, bio);
if (unlikely(is_flush_fua)) {
/* Bypass scheduler for flush requests */
blk_insert_flush(rq);
blk_mq_run_hw_queue(data.hctx, true);
} else if (plug && (q->nr_hw_queues == 1 || blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) || q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
/*
* Use plugging if we have a ->commit_rqs() hook as well, as
* we know the driver uses bd->last in a smart fashion.
*
* Use normal plugging if this disk is slow HDD, as sequential
* IO may benefit a lot from plug merging.
*/
unsigned int request_count = plug->rq_count;
struct request *last = NULL;
if (!request_count)
trace_block_plug(q);
else
last = list_entry_rq(plug->mq_list.prev); if (request_count >= blk_plug_max_rq_count(plug) || (last && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_flush_plug_list(plug, false);
trace_block_plug(q);
}
blk_add_rq_to_plug(plug, rq); } else if (q->elevator) {
/* Insert the request at the IO scheduler queue */
blk_mq_sched_insert_request(rq, false, true, true);
} else if (plug && !blk_queue_nomerges(q)) {
/*
* We do limited plugging. If the bio can be merged, do that.
* Otherwise the existing request in the plug list will be
* issued. So the plug list will have one request at most
* The plug list might get flushed before this. If that happens,
* the plug list is empty, and same_queue_rq is invalid.
*/
if (list_empty(&plug->mq_list)) same_queue_rq = NULL; if (same_queue_rq) { list_del_init(&same_queue_rq->queuelist);
plug->rq_count--;
}
blk_add_rq_to_plug(plug, rq);
trace_block_plug(q);
if (same_queue_rq) { data.hctx = same_queue_rq->mq_hctx;
trace_block_unplug(q, 1, true);
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie);
}
} else if ((q->nr_hw_queues > 1 && is_sync) || !data.hctx->dispatch_busy) {
/*
* There is no scheduler and we can try to send directly
* to the hardware.
*/
blk_mq_try_issue_directly(data.hctx, rq, &cookie);
} else {
/* Default case. */
blk_mq_sched_insert_request(rq, false, true, true);
}
if (!hipri)
return BLK_QC_T_NONE;
return cookie;
queue_exit:
blk_queue_exit(q); return BLK_QC_T_NONE;
}
static size_t order_to_size(unsigned int order)
{
return (size_t)PAGE_SIZE << order;
}
/* called before freeing request pool in @tags */
static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags, unsigned int hctx_idx)
{
struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
struct page *page;
unsigned long flags;
list_for_each_entry(page, &tags->page_list, lru) {
unsigned long start = (unsigned long)page_address(page);
unsigned long end = start + order_to_size(page->private);
int i;
for (i = 0; i < set->queue_depth; i++) {
struct request *rq = drv_tags->rqs[i];
unsigned long rq_addr = (unsigned long)rq;
if (rq_addr >= start && rq_addr < end) {
WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
cmpxchg(&drv_tags->rqs[i], rq, NULL);
}
}
}
/*
* Wait until all pending iteration is done.
*
* Request reference is cleared and it is guaranteed to be observed
* after the ->lock is released.
*/
spin_lock_irqsave(&drv_tags->lock, flags);
spin_unlock_irqrestore(&drv_tags->lock, flags);
}
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
{
struct page *page;
if (tags->rqs && set->ops->exit_request) {
int i;
for (i = 0; i < tags->nr_tags; i++) {
struct request *rq = tags->static_rqs[i];
if (!rq)
continue;
set->ops->exit_request(set, rq, hctx_idx);
tags->static_rqs[i] = NULL;
}
}
blk_mq_clear_rq_mapping(set, tags, hctx_idx);
while (!list_empty(&tags->page_list)) {
page = list_first_entry(&tags->page_list, struct page, lru);
list_del_init(&page->lru);
/*
* Remove kmemleak object previously allocated in
* blk_mq_alloc_rqs().
*/
kmemleak_free(page_address(page));
__free_pages(page, page->private);
}
}
void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
{
kfree(tags->rqs);
tags->rqs = NULL;
kfree(tags->static_rqs);
tags->static_rqs = NULL;
blk_mq_free_tags(tags, flags);
}
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx,
unsigned int nr_tags,
unsigned int reserved_tags,
unsigned int flags)
{
struct blk_mq_tags *tags;
int node;
node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
if (!tags)
return NULL;
tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
if (!tags->rqs) {
blk_mq_free_tags(tags, flags);
return NULL;
}
tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
if (!tags->static_rqs) {
kfree(tags->rqs);
blk_mq_free_tags(tags, flags);
return NULL;
}
return tags;
}
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
unsigned int hctx_idx, int node)
{
int ret;
if (set->ops->init_request) {
ret = set->ops->init_request(set, rq, hctx_idx, node);
if (ret)
return ret;
}
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
return 0;
}
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx, unsigned int depth)
{
unsigned int i, j, entries_per_page, max_order = 4;
size_t rq_size, left;
int node;
node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
INIT_LIST_HEAD(&tags->page_list);
/*
* rq_size is the size of the request plus driver payload, rounded
* to the cacheline size
*/
rq_size = round_up(sizeof(struct request) + set->cmd_size,
cache_line_size());
left = rq_size * depth;
for (i = 0; i < depth; ) {
int this_order = max_order;
struct page *page;
int to_do;
void *p;
while (this_order && left < order_to_size(this_order - 1))
this_order--;
do {
page = alloc_pages_node(node,
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
this_order);
if (page)
break;
if (!this_order--)
break;
if (order_to_size(this_order) < rq_size)
break;
} while (1);
if (!page)
goto fail;
page->private = this_order;
list_add_tail(&page->lru, &tags->page_list);
p = page_address(page);
/*
* Allow kmemleak to scan these pages as they contain pointers
* to additional allocations like via ops->init_request().
*/
kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
entries_per_page = order_to_size(this_order) / rq_size;
to_do = min(entries_per_page, depth - i);
left -= to_do * rq_size;
for (j = 0; j < to_do; j++) {
struct request *rq = p;
tags->static_rqs[i] = rq;
if (blk_mq_init_request(set, rq, hctx_idx, node)) {
tags->static_rqs[i] = NULL;
goto fail;
}
p += rq_size;
i++;
}
}
return 0;
fail:
blk_mq_free_rqs(set, tags, hctx_idx);
return -ENOMEM;
}
struct rq_iter_data {
struct blk_mq_hw_ctx *hctx;
bool has_rq;
};
static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
{
struct rq_iter_data *iter_data = data;
if (rq->mq_hctx != iter_data->hctx)
return true;
iter_data->has_rq = true;
return false;
}
static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->sched_tags ?
hctx->sched_tags : hctx->tags;
struct rq_iter_data data = {
.hctx = hctx,
};
blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
return data.has_rq;
}
static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
struct blk_mq_hw_ctx *hctx)
{
if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
return false;
if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
return false;
return true;
}
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
{
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
struct blk_mq_hw_ctx, cpuhp_online);
if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
!blk_mq_last_cpu_in_hctx(cpu, hctx))
return 0;
/*
* Prevent new request from being allocated on the current hctx.
*
* The smp_mb__after_atomic() Pairs with the implied barrier in
* test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is
* seen once we return from the tag allocator.
*/
set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
smp_mb__after_atomic();
/*
* Try to grab a reference to the queue and wait for any outstanding
* requests. If we could not grab a reference the queue has been
* frozen and there are no requests.
*/
if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
while (blk_mq_hctx_has_requests(hctx))
msleep(5);
percpu_ref_put(&hctx->queue->q_usage_counter);
}
return 0;
}
static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
{
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
struct blk_mq_hw_ctx, cpuhp_online);
if (cpumask_test_cpu(cpu, hctx->cpumask))
clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
return 0;
}
/*
* 'cpu' is going away. splice any existing rq_list entries from this
* software queue to the hw queue dispatch list, and ensure that it
* gets run.
*/
static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
{
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
LIST_HEAD(tmp);
enum hctx_type type;
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
if (!cpumask_test_cpu(cpu, hctx->cpumask))
return 0;
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
type = hctx->type;
spin_lock(&ctx->lock);
if (!list_empty(&ctx->rq_lists[type])) {
list_splice_init(&ctx->rq_lists[type], &tmp);
blk_mq_hctx_clear_pending(hctx, ctx);
}
spin_unlock(&ctx->lock);
if (list_empty(&tmp))
return 0;
spin_lock(&hctx->lock);
list_splice_tail_init(&tmp, &hctx->dispatch);
spin_unlock(&hctx->lock);
blk_mq_run_hw_queue(hctx, true);
return 0;
}
static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
{
if (!(hctx->flags & BLK_MQ_F_STACKING))
cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
&hctx->cpuhp_online);
cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
&hctx->cpuhp_dead);
}
/*
* Before freeing hw queue, clearing the flush request reference in
* tags->rqs[] for avoiding potential UAF.
*/
static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
unsigned int queue_depth, struct request *flush_rq)
{
int i;
unsigned long flags;
/* The hw queue may not be mapped yet */
if (!tags)
return;
WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
for (i = 0; i < queue_depth; i++)
cmpxchg(&tags->rqs[i], flush_rq, NULL);
/*
* Wait until all pending iteration is done.
*
* Request reference is cleared and it is guaranteed to be observed
* after the ->lock is released.
*/
spin_lock_irqsave(&tags->lock, flags);
spin_unlock_irqrestore(&tags->lock, flags);
}
/* hctx->ctxs will be freed in queue's release handler */
static void blk_mq_exit_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
struct request *flush_rq = hctx->fq->flush_rq;
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
set->queue_depth, flush_rq);
if (set->ops->exit_request)
set->ops->exit_request(set, flush_rq, hctx_idx);
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
blk_mq_remove_cpuhp(hctx);
spin_lock(&q->unused_hctx_lock);
list_add(&hctx->hctx_list, &q->unused_hctx_list);
spin_unlock(&q->unused_hctx_lock);
}
static void blk_mq_exit_hw_queues(struct request_queue *q,
struct blk_mq_tag_set *set, int nr_queue)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
queue_for_each_hw_ctx(q, hctx, i) {
if (i == nr_queue)
break;
blk_mq_debugfs_unregister_hctx(hctx);
blk_mq_exit_hctx(q, set, hctx, i);
}
}
static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
{
int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
__alignof__(struct blk_mq_hw_ctx)) !=
sizeof(struct blk_mq_hw_ctx));
if (tag_set->flags & BLK_MQ_F_BLOCKING)
hw_ctx_size += sizeof(struct srcu_struct);
return hw_ctx_size;
}
static int blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
hctx->queue_num = hctx_idx;
if (!(hctx->flags & BLK_MQ_F_STACKING))
cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
&hctx->cpuhp_online);
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
hctx->tags = set->tags[hctx_idx];
if (set->ops->init_hctx &&
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
goto unregister_cpu_notifier;
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
hctx->numa_node))
goto exit_hctx;
return 0;
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
unregister_cpu_notifier:
blk_mq_remove_cpuhp(hctx);
return -1;
}
static struct blk_mq_hw_ctx *
blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
int node)
{
struct blk_mq_hw_ctx *hctx;
gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
if (!hctx)
goto fail_alloc_hctx;
if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
goto free_hctx;
atomic_set(&hctx->nr_active, 0);
if (node == NUMA_NO_NODE)
node = set->numa_node;
hctx->numa_node = node;
INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
hctx->queue = q;
hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
INIT_LIST_HEAD(&hctx->hctx_list);
/*
* Allocate space for all possible cpus to avoid allocation at
* runtime
*/
hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
gfp, node);
if (!hctx->ctxs)
goto free_cpumask;
if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
gfp, node, false, false))
goto free_ctxs;
hctx->nr_ctx = 0;
spin_lock_init(&hctx->dispatch_wait_lock);
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
if (!hctx->fq)
goto free_bitmap;
if (hctx->flags & BLK_MQ_F_BLOCKING)
init_srcu_struct(hctx->srcu);
blk_mq_hctx_kobj_init(hctx);
return hctx;
free_bitmap:
sbitmap_free(&hctx->ctx_map);
free_ctxs:
kfree(hctx->ctxs);
free_cpumask:
free_cpumask_var(hctx->cpumask);
free_hctx:
kfree(hctx);
fail_alloc_hctx:
return NULL;
}
static void blk_mq_init_cpu_queues(struct request_queue *q,
unsigned int nr_hw_queues)
{
struct blk_mq_tag_set *set = q->tag_set;
unsigned int i, j;
for_each_possible_cpu(i) {
struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
struct blk_mq_hw_ctx *hctx;
int k;
__ctx->cpu = i;
spin_lock_init(&__ctx->lock);
for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
INIT_LIST_HEAD(&__ctx->rq_lists[k]);
__ctx->queue = q;
/*
* Set local node, IFF we have more than one hw queue. If
* not, we remain on the home node of the device
*/
for (j = 0; j < set->nr_maps; j++) {
hctx = blk_mq_map_queue_type(q, j, i);
if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
hctx->numa_node = cpu_to_node(i);
}
}
}
static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
int hctx_idx)
{
unsigned int flags = set->flags;
int ret = 0;
set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
set->queue_depth, set->reserved_tags, flags);
if (!set->tags[hctx_idx])
return false;
ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
set->queue_depth);
if (!ret)
return true;
blk_mq_free_rq_map(set->tags[hctx_idx], flags);
set->tags[hctx_idx] = NULL;
return false;
}
static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
unsigned int hctx_idx)
{
unsigned int flags = set->flags;
if (set->tags && set->tags[hctx_idx]) {
blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
blk_mq_free_rq_map(set->tags[hctx_idx], flags);
set->tags[hctx_idx] = NULL;
}
}
static void blk_mq_map_swqueue(struct request_queue *q)
{
unsigned int i, j, hctx_idx;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
hctx->dispatch_from = NULL;
}
/*
* Map software to hardware queues.
*
* If the cpu isn't present, the cpu is mapped to first hctx.
*/
for_each_possible_cpu(i) {
ctx = per_cpu_ptr(q->queue_ctx, i);
for (j = 0; j < set->nr_maps; j++) {
if (!set->map[j].nr_queues) {
ctx->hctxs[j] = blk_mq_map_queue_type(q,
HCTX_TYPE_DEFAULT, i);
continue;
}
hctx_idx = set->map[j].mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx] &&
!__blk_mq_alloc_map_and_request(set, hctx_idx)) {
/*
* If tags initialization fail for some hctx,
* that hctx won't be brought online. In this
* case, remap the current ctx to hctx[0] which
* is guaranteed to always have tags allocated
*/
set->map[j].mq_map[i] = 0;
}
hctx = blk_mq_map_queue_type(q, j, i);
ctx->hctxs[j] = hctx;
/*
* If the CPU is already set in the mask, then we've
* mapped this one already. This can happen if
* devices share queues across queue maps.
*/
if (cpumask_test_cpu(i, hctx->cpumask))
continue;
cpumask_set_cpu(i, hctx->cpumask);
hctx->type = j;
ctx->index_hw[hctx->type] = hctx->nr_ctx;
hctx->ctxs[hctx->nr_ctx++] = ctx;
/*
* If the nr_ctx type overflows, we have exceeded the
* amount of sw queues we can support.
*/
BUG_ON(!hctx->nr_ctx);
}
for (; j < HCTX_MAX_TYPES; j++)
ctx->hctxs[j] = blk_mq_map_queue_type(q,
HCTX_TYPE_DEFAULT, i);
}
queue_for_each_hw_ctx(q, hctx, i) {
/*
* If no software queues are mapped to this hardware queue,
* disable it and free the request entries.
*/
if (!hctx->nr_ctx) {
/* Never unmap queue 0. We need it as a
* fallback in case of a new remap fails
* allocation
*/
if (i && set->tags[i])
blk_mq_free_map_and_requests(set, i);
hctx->tags = NULL;
continue;
}
hctx->tags = set->tags[i];
WARN_ON(!hctx->tags);
/*
* Set the map size to the number of mapped software queues.
* This is more accurate and more efficient than looping
* over all possibly mapped software queues.
*/
sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
/*
* Initialize batch roundrobin counts
*/
hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
}
/*
* Caller needs to ensure that we're either frozen/quiesced, or that
* the queue isn't live yet.
*/
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i) {
if (shared) {
hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
} else {
blk_mq_tag_idle(hctx);
hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
}
}
}
static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
bool shared)
{
struct request_queue *q;
lockdep_assert_held(&set->tag_list_lock);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_freeze_queue(q);
queue_set_hctx_shared(q, shared);
blk_mq_unfreeze_queue(q);
}
}
static void blk_mq_del_queue_tag_set(struct request_queue *q)
{
struct blk_mq_tag_set *set = q->tag_set;
mutex_lock(&set->tag_list_lock);
list_del(&q->tag_set_list);
if (list_is_singular(&set->tag_list)) {
/* just transitioned to unshared */
set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
/* update existing queue */
blk_mq_update_tag_set_shared(set, false);
}
mutex_unlock(&set->tag_list_lock);
INIT_LIST_HEAD(&q->tag_set_list);
}
static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
struct request_queue *q)
{
mutex_lock(&set->tag_list_lock);
/*
* Check to see if we're transitioning to shared (from 1 to 2 queues).
*/
if (!list_empty(&set->tag_list) &&
!(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
/* update existing queue */
blk_mq_update_tag_set_shared(set, true);
}
if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
queue_set_hctx_shared(q, true);
list_add_tail(&q->tag_set_list, &set->tag_list);
mutex_unlock(&set->tag_list_lock);
}
/* All allocations will be freed in release handler of q->mq_kobj */
static int blk_mq_alloc_ctxs(struct request_queue *q)
{
struct blk_mq_ctxs *ctxs;
int cpu;
ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
if (!ctxs)
return -ENOMEM;
ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
if (!ctxs->queue_ctx)
goto fail;
for_each_possible_cpu(cpu) {
struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
ctx->ctxs = ctxs;
}
q->mq_kobj = &ctxs->kobj;
q->queue_ctx = ctxs->queue_ctx;
return 0;
fail:
kfree(ctxs);
return -ENOMEM;
}
/*
* It is the actual release handler for mq, but we do it from
* request queue's release handler for avoiding use-after-free
* and headache because q->mq_kobj shouldn't have been introduced,
* but we can't group ctx/kctx kobj without it.
*/
void blk_mq_release(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx, *next;
int i;
queue_for_each_hw_ctx(q, hctx, i)
WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
/* all hctx are in .unused_hctx_list now */
list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
list_del_init(&hctx->hctx_list);
kobject_put(&hctx->kobj);
}
kfree(q->queue_hw_ctx);
/*
* release .mq_kobj and sw queue's kobject now because
* both share lifetime with request queue.
*/
blk_mq_sysfs_deinit(q);
}
static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
void *queuedata)
{
struct request_queue *q;
int ret;
q = blk_alloc_queue(set->numa_node);
if (!q)
return ERR_PTR(-ENOMEM);
q->queuedata = queuedata;
ret = blk_mq_init_allocated_queue(set, q);
if (ret) {
blk_cleanup_queue(q);
return ERR_PTR(ret);
}
return q;
}
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
return blk_mq_init_queue_data(set, NULL);
}
EXPORT_SYMBOL(blk_mq_init_queue);
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
struct lock_class_key *lkclass)
{
struct request_queue *q;
struct gendisk *disk;
q = blk_mq_init_queue_data(set, queuedata);
if (IS_ERR(q))
return ERR_CAST(q);
disk = __alloc_disk_node(q, set->numa_node, lkclass);
if (!disk) {
blk_cleanup_queue(q);
return ERR_PTR(-ENOMEM);
}
return disk;
}
EXPORT_SYMBOL(__blk_mq_alloc_disk);
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
struct blk_mq_tag_set *set, struct request_queue *q,
int hctx_idx, int node)
{
struct blk_mq_hw_ctx *hctx = NULL, *tmp;
/* reuse dead hctx first */
spin_lock(&q->unused_hctx_lock);
list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
if (tmp->numa_node == node) {
hctx = tmp;
break;
}
}
if (hctx)
list_del_init(&hctx->hctx_list);
spin_unlock(&q->unused_hctx_lock);
if (!hctx)
hctx = blk_mq_alloc_hctx(q, set, node);
if (!hctx)
goto fail;
if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
goto free_hctx;
return hctx;
free_hctx:
kobject_put(&hctx->kobj);
fail:
return NULL;
}
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct request_queue *q)
{
int i, j, end;
struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
if (q->nr_hw_queues < set->nr_hw_queues) {
struct blk_mq_hw_ctx **new_hctxs;
new_hctxs = kcalloc_node(set->nr_hw_queues,
sizeof(*new_hctxs), GFP_KERNEL,
set->numa_node);
if (!new_hctxs)
return;
if (hctxs)
memcpy(new_hctxs, hctxs, q->nr_hw_queues *
sizeof(*hctxs));
q->queue_hw_ctx = new_hctxs;
kfree(hctxs);
hctxs = new_hctxs;
}
/* protect against switching io scheduler */
mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
int node;
struct blk_mq_hw_ctx *hctx;
node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
/*
* If the hw queue has been mapped to another numa node,
* we need to realloc the hctx. If allocation fails, fallback
* to use the previous one.
*/
if (hctxs[i] && (hctxs[i]->numa_node == node))
continue;
hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
if (hctx) {
if (hctxs[i])
blk_mq_exit_hctx(q, set, hctxs[i], i);
hctxs[i] = hctx;
} else {
if (hctxs[i])
pr_warn("Allocate new hctx on node %d fails,\
fallback to previous one on node %d\n",
node, hctxs[i]->numa_node);
else
break;
}
}
/*
* Increasing nr_hw_queues fails. Free the newly allocated
* hctxs and keep the previous q->nr_hw_queues.
*/
if (i != set->nr_hw_queues) {
j = q->nr_hw_queues;
end = i;
} else {
j = i;
end = q->nr_hw_queues;
q->nr_hw_queues = set->nr_hw_queues;
}
for (; j < end; j++) {
struct blk_mq_hw_ctx *hctx = hctxs[j];
if (hctx) {
if (hctx->tags)
blk_mq_free_map_and_requests(set, j);
blk_mq_exit_hctx(q, set, hctx, j);
hctxs[j] = NULL;
}
}
mutex_unlock(&q->sysfs_lock);
}
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
/* mark the queue as mq asap */
q->mq_ops = set->ops;
q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
blk_mq_poll_stats_bkt,
BLK_MQ_POLL_STATS_BKTS, q);
if (!q->poll_cb)
goto err_exit;
if (blk_mq_alloc_ctxs(q))
goto err_poll;
/* init q->mq_kobj and sw queues' kobjects */
blk_mq_sysfs_init(q);
INIT_LIST_HEAD(&q->unused_hctx_list);
spin_lock_init(&q->unused_hctx_lock);
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
q->tag_set = set;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
if (set->nr_maps > HCTX_TYPE_POLL &&
set->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
q->nr_requests = set->queue_depth;
/*
* Default to classic polling
*/
q->poll_nsec = BLK_MQ_POLL_CLASSIC;
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
return 0;
err_hctxs:
kfree(q->queue_hw_ctx);
q->nr_hw_queues = 0;
blk_mq_sysfs_deinit(q);
err_poll:
blk_stat_free_callback(q->poll_cb);
q->poll_cb = NULL;
err_exit:
q->mq_ops = NULL;
return -ENOMEM;
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
/* tags can _not_ be used after returning from blk_mq_exit_queue */
void blk_mq_exit_queue(struct request_queue *q)
{
struct blk_mq_tag_set *set = q->tag_set;
/* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
/* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */
blk_mq_del_queue_tag_set(q);
}
static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
int i;
for (i = 0; i < set->nr_hw_queues; i++) {
if (!__blk_mq_alloc_map_and_request(set, i))
goto out_unwind;
cond_resched();
}
return 0;
out_unwind:
while (--i >= 0)
blk_mq_free_map_and_requests(set, i);
return -ENOMEM;
}
/*
* Allocate the request maps associated with this tag_set. Note that this
* may reduce the depth asked for, if memory is tight. set->queue_depth
* will be updated to reflect the allocated depth.
*/
static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
{
unsigned int depth;
int err;
depth = set->queue_depth;
do {
err = __blk_mq_alloc_rq_maps(set);
if (!err)
break;
set->queue_depth >>= 1;
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
err = -ENOMEM;
break;
}
} while (set->queue_depth);
if (!set->queue_depth || err) {
pr_err("blk-mq: failed to allocate request map\n");
return -ENOMEM;
}
if (depth != set->queue_depth)
pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
depth, set->queue_depth);
return 0;
}
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
/*
* blk_mq_map_queues() and multiple .map_queues() implementations
* expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
* number of hardware queues.
*/
if (set->nr_maps == 1)
set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
if (set->ops->map_queues && !is_kdump_kernel()) {
int i;
/*
* transport .map_queues is usually done in the following
* way:
*
* for (queue = 0; queue < set->nr_hw_queues; queue++) {
* mask = get_cpu_mask(queue)
* for_each_cpu(cpu, mask)
* set->map[x].mq_map[cpu] = queue;
* }
*
* When we need to remap, the table has to be cleared for
* killing stale mapping since one CPU may not be mapped
* to any hw queue.
*/
for (i = 0; i < set->nr_maps; i++)
blk_mq_clear_mq_map(&set->map[i]);
return set->ops->map_queues(set);
} else {
BUG_ON(set->nr_maps > 1);
return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}
}
static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
int cur_nr_hw_queues, int new_nr_hw_queues)
{
struct blk_mq_tags **new_tags;
if (cur_nr_hw_queues >= new_nr_hw_queues)
return 0;
new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
GFP_KERNEL, set->numa_node);
if (!new_tags)
return -ENOMEM;
if (set->tags)
memcpy(new_tags, set->tags, cur_nr_hw_queues *
sizeof(*set->tags));
kfree(set->tags);
set->tags = new_tags;
set->nr_hw_queues = new_nr_hw_queues;
return 0;
}
static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
int new_nr_hw_queues)
{
return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
}
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
* requested depth down, if it's too large. In that case, the set
* value will be stored in set->queue_depth.
*/
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
int i, ret;
BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
if (!set->nr_hw_queues)
return -EINVAL;
if (!set->queue_depth)
return -EINVAL;
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
return -EINVAL;
if (!set->ops->queue_rq)
return -EINVAL;
if (!set->ops->get_budget ^ !set->ops->put_budget)
return -EINVAL;
if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
pr_info("blk-mq: reduced tag depth to %u\n",
BLK_MQ_MAX_DEPTH);
set->queue_depth = BLK_MQ_MAX_DEPTH;
}
if (!set->nr_maps)
set->nr_maps = 1;
else if (set->nr_maps > HCTX_MAX_TYPES)
return -EINVAL;
/*
* If a crashdump is active, then we are potentially in a very
* memory constrained environment. Limit us to 1 queue and
* 64 tags to prevent using too much memory.
*/
if (is_kdump_kernel()) {
set->nr_hw_queues = 1;
set->nr_maps = 1;
set->queue_depth = min(64U, set->queue_depth);
}
/*
* There is no use for more h/w queues than cpus if we just have
* a single map
*/
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
set->nr_hw_queues = nr_cpu_ids;
if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
return -ENOMEM;
ret = -ENOMEM;
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
sizeof(set->map[i].mq_map[0]),
GFP_KERNEL, set->numa_node);
if (!set->map[i].mq_map)
goto out_free_mq_map;
set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
}
ret = blk_mq_update_queue_map(set);
if (ret)
goto out_free_mq_map;
ret = blk_mq_alloc_map_and_requests(set);
if (ret)
goto out_free_mq_map;
if (blk_mq_is_sbitmap_shared(set->flags)) {
atomic_set(&set->active_queues_shared_sbitmap, 0);
if (blk_mq_init_shared_sbitmap(set)) {
ret = -ENOMEM;
goto out_free_mq_rq_maps;
}
}
mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list);
return 0;
out_free_mq_rq_maps:
for (i = 0; i < set->nr_hw_queues; i++)
blk_mq_free_map_and_requests(set, i);
out_free_mq_map:
for (i = 0; i < set->nr_maps; i++) {
kfree(set->map[i].mq_map);
set->map[i].mq_map = NULL;
}
kfree(set->tags);
set->tags = NULL;
return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
/* allocate and initialize a tagset for a simple single-queue device */
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int queue_depth,
unsigned int set_flags)
{
memset(set, 0, sizeof(*set));
set->ops = ops;
set->nr_hw_queues = 1;
set->nr_maps = 1;
set->queue_depth = queue_depth;
set->numa_node = NUMA_NO_NODE;
set->flags = set_flags;
return blk_mq_alloc_tag_set(set);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
int i, j;
for (i = 0; i < set->nr_hw_queues; i++)
blk_mq_free_map_and_requests(set, i);
if (blk_mq_is_sbitmap_shared(set->flags))
blk_mq_exit_shared_sbitmap(set);
for (j = 0; j < set->nr_maps; j++) {
kfree(set->map[j].mq_map);
set->map[j].mq_map = NULL;
}
kfree(set->tags);
set->tags = NULL;
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
struct blk_mq_tag_set *set = q->tag_set;
struct blk_mq_hw_ctx *hctx;
int i, ret;
if (!set)
return -EINVAL;
if (q->nr_requests == nr)
return 0;
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx->tags)
continue;
/*
* If we're using an MQ scheduler, just update the scheduler
* queue depth. This is similar to what the old code would do.
*/
if (!hctx->sched_tags) {
ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
false);
if (!ret && blk_mq_is_sbitmap_shared(set->flags))
blk_mq_tag_resize_shared_sbitmap(set, nr);
} else {
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
nr, true);
if (blk_mq_is_sbitmap_shared(set->flags)) {
hctx->sched_tags->bitmap_tags =
&q->sched_bitmap_tags;
hctx->sched_tags->breserved_tags =
&q->sched_breserved_tags;
}
}
if (ret)
break;
if (q->elevator && q->elevator->type->ops.depth_updated)
q->elevator->type->ops.depth_updated(hctx);
}
if (!ret) {
q->nr_requests = nr;
if (q->elevator && blk_mq_is_sbitmap_shared(set->flags))
sbitmap_queue_resize(&q->sched_bitmap_tags,
nr - set->reserved_tags);
}
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
return ret;
}
/*
* request_queue and elevator_type pair.
* It is just used by __blk_mq_update_nr_hw_queues to cache
* the elevator_type associated with a request_queue.
*/
struct blk_mq_qe_pair {
struct list_head node;
struct request_queue *q;
struct elevator_type *type;
};
/*
* Cache the elevator_type in qe pair list and switch the
* io scheduler to 'none'
*/
static bool blk_mq_elv_switch_none(struct list_head *head,
struct request_queue *q)
{
struct blk_mq_qe_pair *qe;
if (!q->elevator)
return true;
qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
if (!qe)
return false;
INIT_LIST_HEAD(&qe->node);
qe->q = q;
qe->type = q->elevator->type;
list_add(&qe->node, head);
mutex_lock(&q->sysfs_lock);
/*
* After elevator_switch_mq, the previous elevator_queue will be
* released by elevator_release. The reference of the io scheduler
* module get by elevator_get will also be put. So we need to get
* a reference of the io scheduler module here to prevent it to be
* removed.
*/
__module_get(qe->type->elevator_owner);
elevator_switch_mq(q, NULL);
mutex_unlock(&q->sysfs_lock);
return true;
}
static void blk_mq_elv_switch_back(struct list_head *head,
struct request_queue *q)
{
struct blk_mq_qe_pair *qe;
struct elevator_type *t = NULL;
list_for_each_entry(qe, head, node)
if (qe->q == q) {
t = qe->type;
break;
}
if (!t)
return;
list_del(&qe->node);
kfree(qe);
mutex_lock(&q->sysfs_lock);
elevator_switch_mq(q, t);
mutex_unlock(&q->sysfs_lock);
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
struct request_queue *q;
LIST_HEAD(head);
int prev_nr_hw_queues;
lockdep_assert_held(&set->tag_list_lock);
if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
nr_hw_queues = nr_cpu_ids;
if (nr_hw_queues < 1)
return;
if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
return;
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue(q);
/*
* Switch IO scheduler to 'none', cleaning up the data associated
* with the previous scheduler. We will switch back once we are done
* updating the new sw to hw queue mappings.
*/
list_for_each_entry(q, &set->tag_list, tag_set_list)
if (!blk_mq_elv_switch_none(&head, q))
goto switch_back;
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
blk_mq_sysfs_unregister(q);
}
prev_nr_hw_queues = set->nr_hw_queues;
if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
0)
goto reregister;
set->nr_hw_queues = nr_hw_queues;
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q);
if (q->nr_hw_queues != set->nr_hw_queues) {
pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
nr_hw_queues, prev_nr_hw_queues);
set->nr_hw_queues = prev_nr_hw_queues;
blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
goto fallback;
}
blk_mq_map_swqueue(q);
}
reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register(q);
blk_mq_debugfs_register_hctxs(q);
}
switch_back:
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_elv_switch_back(&head, q);
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_unfreeze_queue(q);
}
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
mutex_lock(&set->tag_list_lock);
__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
/* Enable polling stats and return whether they were already enabled. */
static bool blk_poll_stats_enable(struct request_queue *q)
{
if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
return true;
blk_stat_add_callback(q, q->poll_cb);
return false;
}
static void blk_mq_poll_stats_start(struct request_queue *q)
{
/*
* We don't arm the callback if polling stats are not enabled or the
* callback is already active.
*/
if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
blk_stat_is_active(q->poll_cb))
return;
blk_stat_activate_msecs(q->poll_cb, 100);
}
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
{
struct request_queue *q = cb->data;
int bucket;
for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
if (cb->stat[bucket].nr_samples)
q->poll_stat[bucket] = cb->stat[bucket];
}
}
static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
struct request *rq)
{
unsigned long ret = 0;
int bucket;
/*
* If stats collection isn't on, don't sleep but turn it on for
* future users
*/
if (!blk_poll_stats_enable(q))
return 0;
/*
* As an optimistic guess, use half of the mean service time
* for this type of request. We can (and should) make this smarter.
* For instance, if the completion latencies are tight, we can
* get closer than just half the mean. This is especially
* important on devices where the completion latencies are longer
* than ~10 usec. We do use the stats for the relevant IO size
* if available which does lead to better estimates.
*/
bucket = blk_mq_poll_stats_bkt(rq);
if (bucket < 0)
return ret;
if (q->poll_stat[bucket].nr_samples)
ret = (q->poll_stat[bucket].mean + 1) / 2;
return ret;
}
static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
struct request *rq)
{
struct hrtimer_sleeper hs;
enum hrtimer_mode mode;
unsigned int nsecs;
ktime_t kt;
if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
return false;
/*
* If we get here, hybrid polling is enabled. Hence poll_nsec can be:
*
* 0: use half of prev avg
* >0: use this specific value
*/
if (q->poll_nsec > 0)
nsecs = q->poll_nsec;
else
nsecs = blk_mq_poll_nsecs(q, rq);
if (!nsecs)
return false;
rq->rq_flags |= RQF_MQ_POLL_SLEPT;
/*
* This will be replaced with the stats tracking code, using
* 'avg_completion_time / 2' as the pre-sleep target.
*/
kt = nsecs;
mode = HRTIMER_MODE_REL;
hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
hrtimer_set_expires(&hs.timer, kt);
do {
if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
break;
set_current_state(TASK_UNINTERRUPTIBLE);
hrtimer_sleeper_start_expires(&hs, mode);
if (hs.task)
io_schedule();
hrtimer_cancel(&hs.timer);
mode = HRTIMER_MODE_ABS;
} while (hs.task && !signal_pending(current));
__set_current_state(TASK_RUNNING);
destroy_hrtimer_on_stack(&hs.timer);
return true;
}
static bool blk_mq_poll_hybrid(struct request_queue *q,
struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
{
struct request *rq;
if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
return false;
if (!blk_qc_t_is_internal(cookie))
rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
else {
rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
/*
* With scheduling, if the request has completed, we'll
* get a NULL return here, as we clear the sched tag when
* that happens. The request still remains valid, like always,
* so we should be safe with just the NULL check.
*/
if (!rq)
return false;
}
return blk_mq_poll_hybrid_sleep(q, rq);
}
/**
* blk_poll - poll for IO completions
* @q: the queue
* @cookie: cookie passed back at IO submission time
* @spin: whether to spin for completions
*
* Description:
* Poll for completions on the passed in queue. Returns number of
* completed entries found. If @spin is true, then blk_poll will continue
* looping until at least one completion is found, unless the task is
* otherwise marked running (or we need to reschedule).
*/
int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
{
struct blk_mq_hw_ctx *hctx;
unsigned int state;
if (!blk_qc_t_valid(cookie) ||
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
if (current->plug)
blk_flush_plug_list(current->plug, false);
hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
/*
* If we sleep, have the caller restart the poll loop to reset
* the state. Like for the other success return cases, the
* caller is responsible for checking if the IO completed. If
* the IO isn't complete, we'll get called again and will go
* straight to the busy poll loop. If specified not to spin,
* we also should not sleep.
*/
if (spin && blk_mq_poll_hybrid(q, hctx, cookie))
return 1;
hctx->poll_considered++;
state = get_current_state();
do {
int ret;
hctx->poll_invoked++;
ret = q->mq_ops->poll(hctx);
if (ret > 0) {
hctx->poll_success++;
__set_current_state(TASK_RUNNING);
return ret;
}
if (signal_pending_state(state, current))
__set_current_state(TASK_RUNNING);
if (task_is_running(current))
return 1;
if (ret < 0 || !spin)
break;
cpu_relax();
} while (!need_resched());
__set_current_state(TASK_RUNNING);
return 0;
}
EXPORT_SYMBOL_GPL(blk_poll);
unsigned int blk_mq_rq_cpu(struct request *rq)
{
return rq->mq_ctx->cpu;
}
EXPORT_SYMBOL(blk_mq_rq_cpu);
void blk_mq_cancel_work_sync(struct request_queue *q)
{
if (queue_is_mq(q)) {
struct blk_mq_hw_ctx *hctx;
int i;
cancel_delayed_work_sync(&q->requeue_work);
queue_for_each_hw_ctx(q, hctx, i)
cancel_delayed_work_sync(&hctx->run_work);
}
}
static int __init blk_mq_init(void)
{
int i;
for_each_possible_cpu(i)
init_llist_head(&per_cpu(blk_cpu_done, i));
open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
"block/softirq:dead", NULL,
blk_softirq_cpu_dead);
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
blk_mq_hctx_notify_online,
blk_mq_hctx_notify_offline);
return 0;
}
subsys_initcall(blk_mq_init);
// SPDX-License-Identifier: GPL-2.0
/*
* device.h - generic, centralized driver model
*
* Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org>
* Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de>
* Copyright (c) 2008-2009 Novell Inc.
*
* See Documentation/driver-api/driver-model/ for more information.
*/
#ifndef _DEVICE_H_
#define _DEVICE_H_
#include <linux/dev_printk.h>
#include <linux/energy_model.h>
#include <linux/ioport.h>
#include <linux/kobject.h>
#include <linux/klist.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/pm.h>
#include <linux/atomic.h>
#include <linux/uidgid.h>
#include <linux/gfp.h>
#include <linux/overflow.h>
#include <linux/device/bus.h>
#include <linux/device/class.h>
#include <linux/device/driver.h>
#include <asm/device.h>
struct device;
struct device_private;
struct device_driver;
struct driver_private;
struct module;
struct class;
struct subsys_private;
struct device_node;
struct fwnode_handle;
struct iommu_ops;
struct iommu_group;
struct dev_pin_info;
struct dev_iommu;
/**
* struct subsys_interface - interfaces to device functions
* @name: name of the device function
* @subsys: subsystem of the devices to attach to
* @node: the list of functions registered at the subsystem
* @add_dev: device hookup to device function handler
* @remove_dev: device hookup to device function handler
*
* Simple interfaces attached to a subsystem. Multiple interfaces can
* attach to a subsystem and its devices. Unlike drivers, they do not
* exclusively claim or control devices. Interfaces usually represent
* a specific functionality of a subsystem/class of devices.
*/
struct subsys_interface {
const char *name;
struct bus_type *subsys;
struct list_head node;
int (*add_dev)(struct device *dev, struct subsys_interface *sif);
void (*remove_dev)(struct device *dev, struct subsys_interface *sif);
};
int subsys_interface_register(struct subsys_interface *sif);
void subsys_interface_unregister(struct subsys_interface *sif);
int subsys_system_register(struct bus_type *subsys,
const struct attribute_group **groups);
int subsys_virtual_register(struct bus_type *subsys,
const struct attribute_group **groups);
/*
* The type of device, "struct device" is embedded in. A class
* or bus can contain devices of different types
* like "partitions" and "disks", "mouse" and "event".
* This identifies the device type and carries type-specific
* information, equivalent to the kobj_type of a kobject.
* If "name" is specified, the uevent will contain it in
* the DEVTYPE variable.
*/
struct device_type {
const char *name;
const struct attribute_group **groups;
int (*uevent)(struct device *dev, struct kobj_uevent_env *env);
char *(*devnode)(struct device *dev, umode_t *mode,
kuid_t *uid, kgid_t *gid);
void (*release)(struct device *dev);
const struct dev_pm_ops *pm;
};
/* interface for exporting device attributes */
struct device_attribute {
struct attribute attr;
ssize_t (*show)(struct device *dev, struct device_attribute *attr,
char *buf);
ssize_t (*store)(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count);
};
struct dev_ext_attribute {
struct device_attribute attr;
void *var;
};
ssize_t device_show_ulong(struct device *dev, struct device_attribute *attr,
char *buf);
ssize_t device_store_ulong(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count);
ssize_t device_show_int(struct device *dev, struct device_attribute *attr,
char *buf);
ssize_t device_store_int(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count);
ssize_t device_show_bool(struct device *dev, struct device_attribute *attr,
char *buf);
ssize_t device_store_bool(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count);
#define DEVICE_ATTR(_name, _mode, _show, _store) \
struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store)
#define DEVICE_ATTR_PREALLOC(_name, _mode, _show, _store) \
struct device_attribute dev_attr_##_name = \
__ATTR_PREALLOC(_name, _mode, _show, _store)
#define DEVICE_ATTR_RW(_name) \
struct device_attribute dev_attr_##_name = __ATTR_RW(_name)
#define DEVICE_ATTR_ADMIN_RW(_name) \
struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600)
#define DEVICE_ATTR_RO(_name) \
struct device_attribute dev_attr_##_name = __ATTR_RO(_name)
#define DEVICE_ATTR_ADMIN_RO(_name) \
struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400)
#define DEVICE_ATTR_WO(_name) \
struct device_attribute dev_attr_##_name = __ATTR_WO(_name)
#define DEVICE_ULONG_ATTR(_name, _mode, _var) \
struct dev_ext_attribute dev_attr_##_name = \
{ __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) }
#define DEVICE_INT_ATTR(_name, _mode, _var) \
struct dev_ext_attribute dev_attr_##_name = \
{ __ATTR(_name, _mode, device_show_int, device_store_int), &(_var) }
#define DEVICE_BOOL_ATTR(_name, _mode, _var) \
struct dev_ext_attribute dev_attr_##_name = \
{ __ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) }
#define DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \
struct device_attribute dev_attr_##_name = \
__ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store)
int device_create_file(struct device *device,
const struct device_attribute *entry);
void device_remove_file(struct device *dev,
const struct device_attribute *attr);
bool device_remove_file_self(struct device *dev,
const struct device_attribute *attr);
int __must_check device_create_bin_file(struct device *dev,
const struct bin_attribute *attr);
void device_remove_bin_file(struct device *dev,
const struct bin_attribute *attr);
/* device resource management */
typedef void (*dr_release_t)(struct device *dev, void *res);
typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data);
void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
int nid, const char *name) __malloc;
#define devres_alloc(release, size, gfp) \
__devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release)
#define devres_alloc_node(release, size, gfp, nid) \
__devres_alloc_node(release, size, gfp, nid, #release)
void devres_for_each_res(struct device *dev, dr_release_t release,
dr_match_t match, void *match_data,
void (*fn)(struct device *, void *, void *),
void *data);
void devres_free(void *res);
void devres_add(struct device *dev, void *res);
void *devres_find(struct device *dev, dr_release_t release,
dr_match_t match, void *match_data);
void *devres_get(struct device *dev, void *new_res,
dr_match_t match, void *match_data);
void *devres_remove(struct device *dev, dr_release_t release,
dr_match_t match, void *match_data);
int devres_destroy(struct device *dev, dr_release_t release,
dr_match_t match, void *match_data);
int devres_release(struct device *dev, dr_release_t release,
dr_match_t match, void *match_data);
/* devres group */
void * __must_check devres_open_group(struct device *dev, void *id, gfp_t gfp);
void devres_close_group(struct device *dev, void *id);
void devres_remove_group(struct device *dev, void *id);
int devres_release_group(struct device *dev, void *id);
/* managed devm_k.alloc/kfree for device drivers */
void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc;
void *devm_krealloc(struct device *dev, void *ptr, size_t size,
gfp_t gfp) __must_check;
__printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp,
const char *fmt, va_list ap) __malloc;
__printf(3, 4) char *devm_kasprintf(struct device *dev, gfp_t gfp,
const char *fmt, ...) __malloc;
static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp)
{
return devm_kmalloc(dev, size, gfp | __GFP_ZERO);
}
static inline void *devm_kmalloc_array(struct device *dev,
size_t n, size_t size, gfp_t flags)
{
size_t bytes;
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
return devm_kmalloc(dev, bytes, flags);
}
static inline void *devm_kcalloc(struct device *dev,
size_t n, size_t size, gfp_t flags)
{
return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO);
}
void devm_kfree(struct device *dev, const void *p);
char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc;
const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp);
void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp);
unsigned long devm_get_free_pages(struct device *dev,
gfp_t gfp_mask, unsigned int order);
void devm_free_pages(struct device *dev, unsigned long addr);
void __iomem *devm_ioremap_resource(struct device *dev,
const struct resource *res);
void __iomem *devm_ioremap_resource_wc(struct device *dev,
const struct resource *res);
void __iomem *devm_of_iomap(struct device *dev,
struct device_node *node, int index,
resource_size_t *size);
/* allows to add/remove a custom action to devres stack */
int devm_add_action(struct device *dev, void (*action)(void *), void *data);
void devm_remove_action(struct device *dev, void (*action)(void *), void *data);
void devm_release_action(struct device *dev, void (*action)(void *), void *data);
static inline int devm_add_action_or_reset(struct device *dev,
void (*action)(void *), void *data)
{
int ret;
ret = devm_add_action(dev, action, data);
if (ret)
action(data);
return ret;
}
/**
* devm_alloc_percpu - Resource-managed alloc_percpu
* @dev: Device to allocate per-cpu memory for
* @type: Type to allocate per-cpu memory for
*
* Managed alloc_percpu. Per-cpu memory allocated with this function is
* automatically freed on driver detach.
*
* RETURNS:
* Pointer to allocated memory on success, NULL on failure.
*/
#define devm_alloc_percpu(dev, type) \
((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), \
__alignof__(type)))
void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
size_t align);
void devm_free_percpu(struct device *dev, void __percpu *pdata);
struct device_dma_parameters {
/*
* a low level driver may set these to teach IOMMU code about
* sg limitations.
*/
unsigned int max_segment_size;
unsigned int min_align_mask;
unsigned long segment_boundary_mask;
};
/**
* enum device_link_state - Device link states.
* @DL_STATE_NONE: The presence of the drivers is not being tracked.
* @DL_STATE_DORMANT: None of the supplier/consumer drivers is present.
* @DL_STATE_AVAILABLE: The supplier driver is present, but the consumer is not.
* @DL_STATE_CONSUMER_PROBE: The consumer is probing (supplier driver present).
* @DL_STATE_ACTIVE: Both the supplier and consumer drivers are present.
* @DL_STATE_SUPPLIER_UNBIND: The supplier driver is unbinding.
*/
enum device_link_state {
DL_STATE_NONE = -1,
DL_STATE_DORMANT = 0,
DL_STATE_AVAILABLE,
DL_STATE_CONSUMER_PROBE,
DL_STATE_ACTIVE,
DL_STATE_SUPPLIER_UNBIND,
};
/*
* Device link flags.
*
* STATELESS: The core will not remove this link automatically.
* AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind.
* PM_RUNTIME: If set, the runtime PM framework will use this link.
* RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation.
* AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind.
* AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds.
* MANAGED: The core tracks presence of supplier/consumer drivers (internal).
* SYNC_STATE_ONLY: Link only affects sync_state() behavior.
* INFERRED: Inferred from data (eg: firmware) and not from driver actions.
*/
#define DL_FLAG_STATELESS BIT(0)
#define DL_FLAG_AUTOREMOVE_CONSUMER BIT(1)
#define DL_FLAG_PM_RUNTIME BIT(2)
#define DL_FLAG_RPM_ACTIVE BIT(3)
#define DL_FLAG_AUTOREMOVE_SUPPLIER BIT(4)
#define DL_FLAG_AUTOPROBE_CONSUMER BIT(5)
#define DL_FLAG_MANAGED BIT(6)
#define DL_FLAG_SYNC_STATE_ONLY BIT(7)
#define DL_FLAG_INFERRED BIT(8)
/**
* enum dl_dev_state - Device driver presence tracking information.
* @DL_DEV_NO_DRIVER: There is no driver attached to the device.
* @DL_DEV_PROBING: A driver is probing.
* @DL_DEV_DRIVER_BOUND: The driver has been bound to the device.
* @DL_DEV_UNBINDING: The driver is unbinding from the device.
*/
enum dl_dev_state {
DL_DEV_NO_DRIVER = 0,
DL_DEV_PROBING,
DL_DEV_DRIVER_BOUND,
DL_DEV_UNBINDING,
};
/**
* enum device_removable - Whether the device is removable. The criteria for a
* device to be classified as removable is determined by its subsystem or bus.
* @DEVICE_REMOVABLE_NOT_SUPPORTED: This attribute is not supported for this
* device (default).
* @DEVICE_REMOVABLE_UNKNOWN: Device location is Unknown.
* @DEVICE_FIXED: Device is not removable by the user.
* @DEVICE_REMOVABLE: Device is removable by the user.
*/
enum device_removable {
DEVICE_REMOVABLE_NOT_SUPPORTED = 0, /* must be 0 */
DEVICE_REMOVABLE_UNKNOWN,
DEVICE_FIXED,
DEVICE_REMOVABLE,
};
/**
* struct dev_links_info - Device data related to device links.
* @suppliers: List of links to supplier devices.
* @consumers: List of links to consumer devices.
* @defer_sync: Hook to global list of devices that have deferred sync_state.
* @status: Driver status information.
*/
struct dev_links_info {
struct list_head suppliers;
struct list_head consumers;
struct list_head defer_sync;
enum dl_dev_state status;
};
/**
* struct device - The basic device structure
* @parent: The device's "parent" device, the device to which it is attached.
* In most cases, a parent device is some sort of bus or host
* controller. If parent is NULL, the device, is a top-level device,
* which is not usually what you want.
* @p: Holds the private data of the driver core portions of the device.
* See the comment of the struct device_private for detail.
* @kobj: A top-level, abstract class from which other classes are derived.
* @init_name: Initial name of the device.
* @type: The type of device.
* This identifies the device type and carries type-specific
* information.
* @mutex: Mutex to synchronize calls to its driver.
* @lockdep_mutex: An optional debug lock that a subsystem can use as a
* peer lock to gain localized lockdep coverage of the device_lock.
* @bus: Type of bus device is on.
* @driver: Which driver has allocated this
* @platform_data: Platform data specific to the device.
* Example: For devices on custom boards, as typical of embedded
* and SOC based hardware, Linux often uses platform_data to point
* to board-specific structures describing devices and how they
* are wired. That can include what ports are available, chip
* variants, which GPIO pins act in what additional roles, and so
* on. This shrinks the "Board Support Packages" (BSPs) and
* minimizes board-specific #ifdefs in drivers.
* @driver_data: Private pointer for driver specific info.
* @links: Links to suppliers and consumers of this device.
* @power: For device power management.
* See Documentation/driver-api/pm/devices.rst for details.
* @pm_domain: Provide callbacks that are executed during system suspend,
* hibernation, system resume and during runtime PM transitions
* along with subsystem-level and driver-level callbacks.
* @em_pd: device's energy model performance domain
* @pins: For device pin management.
* See Documentation/driver-api/pin-control.rst for details.
* @msi_lock: Lock to protect MSI mask cache and mask register
* @msi_list: Hosts MSI descriptors
* @msi_domain: The generic MSI domain this device is using.
* @numa_node: NUMA node this device is close to.
* @dma_ops: DMA mapping operations for this device.
* @dma_mask: Dma mask (if dma'ble device).
* @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all
* hardware supports 64-bit addresses for consistent allocations
* such descriptors.
* @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller
* DMA limit than the device itself supports.
* @dma_range_map: map for DMA memory ranges relative to that of RAM
* @dma_parms: A low level driver may set these to teach IOMMU code about
* segment limitations.
* @dma_pools: Dma pools (if dma'ble device).
* @dma_mem: Internal for coherent mem override.
* @cma_area: Contiguous memory area for dma allocations
* @dma_io_tlb_mem: Pointer to the swiotlb pool used. Not for driver use.
* @archdata: For arch-specific additions.
* @of_node: Associated device tree node.
* @fwnode: Associated device node supplied by platform firmware.
* @devt: For creating the sysfs "dev".
* @id: device instance
* @devres_lock: Spinlock to protect the resource of the device.
* @devres_head: The resources list of the device.
* @knode_class: The node used to add the device to the class list.
* @class: The class of the device.
* @groups: Optional attribute groups.
* @release: Callback to free the device after all references have
* gone away. This should be set by the allocator of the
* device (i.e. the bus driver that discovered the device).
* @iommu_group: IOMMU group the device belongs to.
* @iommu: Per device generic IOMMU runtime data
* @removable: Whether the device can be removed from the system. This
* should be set by the subsystem / bus driver that discovered
* the device.
*
* @offline_disabled: If set, the device is permanently online.
* @offline: Set after successful invocation of bus type's .offline().
* @of_node_reused: Set if the device-tree node is shared with an ancestor
* device.
* @state_synced: The hardware state of this device has been synced to match
* the software state of this device by calling the driver/bus
* sync_state() callback.
* @can_match: The device has matched with a driver at least once or it is in
* a bus (like AMBA) which can't check for matching drivers until
* other devices probe successfully.
* @dma_coherent: this particular device is dma coherent, even if the
* architecture supports non-coherent devices.
* @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
* streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
* and optionall (if the coherent mask is large enough) also
* for dma allocations. This flag is managed by the dma ops
* instance from ->dma_supported.
*
* At the lowest level, every device in a Linux system is represented by an
* instance of struct device. The device structure contains the information
* that the device model core needs to model the system. Most subsystems,
* however, track additional information about the devices they host. As a
* result, it is rare for devices to be represented by bare device structures;
* instead, that structure, like kobject structures, is usually embedded within
* a higher-level representation of the device.
*/
struct device {
struct kobject kobj;
struct device *parent;
struct device_private *p;
const char *init_name; /* initial name of the device */
const struct device_type *type;
struct bus_type *bus; /* type of bus device is on */
struct device_driver *driver; /* which driver has allocated this
device */
void *platform_data; /* Platform specific data, device
core doesn't touch it */
void *driver_data; /* Driver data, set and get with
dev_set_drvdata/dev_get_drvdata */
#ifdef CONFIG_PROVE_LOCKING
struct mutex lockdep_mutex;
#endif
struct mutex mutex; /* mutex to synchronize calls to
* its driver.
*/
struct dev_links_info links;
struct dev_pm_info power;
struct dev_pm_domain *pm_domain;
#ifdef CONFIG_ENERGY_MODEL
struct em_perf_domain *em_pd;
#endif
#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
struct irq_domain *msi_domain;
#endif
#ifdef CONFIG_PINCTRL
struct dev_pin_info *pins;
#endif
#ifdef CONFIG_GENERIC_MSI_IRQ
raw_spinlock_t msi_lock;
struct list_head msi_list;
#endif
#ifdef CONFIG_DMA_OPS
const struct dma_map_ops *dma_ops;
#endif
u64 *dma_mask; /* dma mask (if dma'able device) */
u64 coherent_dma_mask;/* Like dma_mask, but for
alloc_coherent mappings as
not all hardware supports
64 bit addresses for consistent
allocations such descriptors. */
u64 bus_dma_limit; /* upstream dma constraint */
const struct bus_dma_region *dma_range_map;
struct device_dma_parameters *dma_parms;
struct list_head dma_pools; /* dma pools (if dma'ble) */
#ifdef CONFIG_DMA_DECLARE_COHERENT
struct dma_coherent_mem *dma_mem; /* internal for coherent mem
override */
#endif
#ifdef CONFIG_DMA_CMA
struct cma *cma_area; /* contiguous memory area for dma
allocations */
#endif
#ifdef CONFIG_SWIOTLB
struct io_tlb_mem *dma_io_tlb_mem;
#endif
/* arch specific additions */
struct dev_archdata archdata;
struct device_node *of_node; /* associated device tree node */
struct fwnode_handle *fwnode; /* firmware device node */
#ifdef CONFIG_NUMA
int numa_node; /* NUMA node this device is close to */
#endif
dev_t devt; /* dev_t, creates the sysfs "dev" */
u32 id; /* device instance */
spinlock_t devres_lock;
struct list_head devres_head;
struct class *class;
const struct attribute_group **groups; /* optional groups */
void (*release)(struct device *dev);
struct iommu_group *iommu_group;
struct dev_iommu *iommu;
enum device_removable removable;
bool offline_disabled:1;
bool offline:1;
bool of_node_reused:1;
bool state_synced:1;
bool can_match:1;
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
bool dma_coherent:1;
#endif
#ifdef CONFIG_DMA_OPS_BYPASS
bool dma_ops_bypass : 1;
#endif
};
/**
* struct device_link - Device link representation.
* @supplier: The device on the supplier end of the link.
* @s_node: Hook to the supplier device's list of links to consumers.
* @consumer: The device on the consumer end of the link.
* @c_node: Hook to the consumer device's list of links to suppliers.
* @link_dev: device used to expose link details in sysfs
* @status: The state of the link (with respect to the presence of drivers).
* @flags: Link flags.
* @rpm_active: Whether or not the consumer device is runtime-PM-active.
* @kref: Count repeated addition of the same link.
* @rm_work: Work structure used for removing the link.
* @supplier_preactivated: Supplier has been made active before consumer probe.
*/
struct device_link {
struct device *supplier;
struct list_head s_node;
struct device *consumer;
struct list_head c_node;
struct device link_dev;
enum device_link_state status;
u32 flags;
refcount_t rpm_active;
struct kref kref;
struct work_struct rm_work;
bool supplier_preactivated; /* Owned by consumer probe. */
};
static inline struct device *kobj_to_dev(struct kobject *kobj)
{
return container_of(kobj, struct device, kobj);
}
/**
* device_iommu_mapped - Returns true when the device DMA is translated
* by an IOMMU
* @dev: Device to perform the check on
*/
static inline bool device_iommu_mapped(struct device *dev)
{
return (dev->iommu_group != NULL);
}
/* Get the wakeup routines, which depend on struct device */
#include <linux/pm_wakeup.h>
static inline const char *dev_name(const struct device *dev)
{
/* Use the init name until the kobject becomes available */
if (dev->init_name)
return dev->init_name;
return kobject_name(&dev->kobj);
}
/**
* dev_bus_name - Return a device's bus/class name, if at all possible
* @dev: struct device to get the bus/class name of
*
* Will return the name of the bus/class the device is attached to. If it is
* not attached to a bus/class, an empty string will be returned.
*/
static inline const char *dev_bus_name(const struct device *dev)
{
return dev->bus ? dev->bus->name : (dev->class ? dev->class->name : "");
}
__printf(2, 3) int dev_set_name(struct device *dev, const char *name, ...);
#ifdef CONFIG_NUMA
static inline int dev_to_node(struct device *dev)
{
return dev->numa_node;
}
static inline void set_dev_node(struct device *dev, int node)
{
dev->numa_node = node;
}
#else
static inline int dev_to_node(struct device *dev)
{
return NUMA_NO_NODE;
}
static inline void set_dev_node(struct device *dev, int node)
{
}
#endif
static inline struct irq_domain *dev_get_msi_domain(const struct device *dev)
{
#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
return dev->msi_domain;
#else
return NULL;
#endif
}
static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d)
{
#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
dev->msi_domain = d;
#endif
}
static inline void *dev_get_drvdata(const struct device *dev)
{
return dev->driver_data;
}
static inline void dev_set_drvdata(struct device *dev, void *data)
{
dev->driver_data = data;
}
static inline struct pm_subsys_data *dev_to_psd(struct device *dev)
{
return dev ? dev->power.subsys_data : NULL;
}
static inline unsigned int dev_get_uevent_suppress(const struct device *dev)
{
return dev->kobj.uevent_suppress;
}
static inline void dev_set_uevent_suppress(struct device *dev, int val)
{
dev->kobj.uevent_suppress = val;
}
static inline int device_is_registered(struct device *dev)
{
return dev->kobj.state_in_sysfs;
}
static inline void device_enable_async_suspend(struct device *dev)
{
if (!dev->power.is_prepared)
dev->power.async_suspend = true;
}
static inline void device_disable_async_suspend(struct device *dev)
{
if (!dev->power.is_prepared)
dev->power.async_suspend = false;
}
static inline bool device_async_suspend_enabled(struct device *dev)
{
return !!dev->power.async_suspend;
}
static inline bool device_pm_not_required(struct device *dev)
{
return dev->power.no_pm;
}
static inline void device_set_pm_not_required(struct device *dev)
{
dev->power.no_pm = true;
}
static inline void dev_pm_syscore_device(struct device *dev, bool val)
{
#ifdef CONFIG_PM_SLEEP
dev->power.syscore = val;
#endif
}
static inline void dev_pm_set_driver_flags(struct device *dev, u32 flags)
{
dev->power.driver_flags = flags;
}
static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags)
{
return !!(dev->power.driver_flags & flags);
}
static inline void device_lock(struct device *dev)
{
mutex_lock(&dev->mutex);
}
static inline int device_lock_interruptible(struct device *dev)
{
return mutex_lock_interruptible(&dev->mutex);
}
static inline int device_trylock(struct device *dev)
{
return mutex_trylock(&dev->mutex);
}
static inline void device_unlock(struct device *dev)
{
mutex_unlock(&dev->mutex);
}
static inline void device_lock_assert(struct device *dev)
{
lockdep_assert_held(&dev->mutex);
}
static inline struct device_node *dev_of_node(struct device *dev)
{
if (!IS_ENABLED(CONFIG_OF) || !dev)
return NULL;
return dev->of_node;
}
static inline bool dev_has_sync_state(struct device *dev)
{
if (!dev)
return false;
if (dev->driver && dev->driver->sync_state)
return true;
if (dev->bus && dev->bus->sync_state)
return true;
return false;
}
static inline void dev_set_removable(struct device *dev,
enum device_removable removable)
{
dev->removable = removable;
}
static inline bool dev_is_removable(struct device *dev)
{
return dev->removable == DEVICE_REMOVABLE;
}
static inline bool dev_removable_is_valid(struct device *dev)
{
return dev->removable != DEVICE_REMOVABLE_NOT_SUPPORTED;
}
/*
* High level routines for use by the bus drivers
*/
int __must_check device_register(struct device *dev);
void device_unregister(struct device *dev);
void device_initialize(struct device *dev);
int __must_check device_add(struct device *dev);
void device_del(struct device *dev);
int device_for_each_child(struct device *dev, void *data,
int (*fn)(struct device *dev, void *data));
int device_for_each_child_reverse(struct device *dev, void *data,
int (*fn)(struct device *dev, void *data));
struct device *device_find_child(struct device *dev, void *data,
int (*match)(struct device *dev, void *data));
struct device *device_find_child_by_name(struct device *parent,
const char *name);
int device_rename(struct device *dev, const char *new_name);
int device_move(struct device *dev, struct device *new_parent,
enum dpm_order dpm_order);
int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid);
const char *device_get_devnode(struct device *dev, umode_t *mode, kuid_t *uid,
kgid_t *gid, const char **tmp);
int device_is_dependent(struct device *dev, void *target);
static inline bool device_supports_offline(struct device *dev)
{
return dev->bus && dev->bus->offline && dev->bus->online;
}
void lock_device_hotplug(void);
void unlock_device_hotplug(void);
int lock_device_hotplug_sysfs(void);
int device_offline(struct device *dev);
int device_online(struct device *dev);
void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
void device_set_of_node_from_dev(struct device *dev, const struct device *dev2);
void device_set_node(struct device *dev, struct fwnode_handle *fwnode);
static inline int dev_num_vf(struct device *dev)
{
if (dev->bus && dev->bus->num_vf)
return dev->bus->num_vf(dev);
return 0;
}
/*
* Root device objects for grouping under /sys/devices
*/
struct device *__root_device_register(const char *name, struct module *owner);
/* This is a macro to avoid include problems with THIS_MODULE */
#define root_device_register(name) \
__root_device_register(name, THIS_MODULE)
void root_device_unregister(struct device *root);
static inline void *dev_get_platdata(const struct device *dev)
{
return dev->platform_data;
}
/*
* Manual binding of a device to driver. See drivers/base/bus.c
* for information on use.
*/
int __must_check device_driver_attach(struct device_driver *drv,
struct device *dev);
int __must_check device_bind_driver(struct device *dev);
void device_release_driver(struct device *dev);
int __must_check device_attach(struct device *dev);
int __must_check driver_attach(struct device_driver *drv);
void device_initial_probe(struct device *dev);
int __must_check device_reprobe(struct device *dev);
bool device_is_bound(struct device *dev);
/*
* Easy functions for dynamically creating devices on the fly
*/
__printf(5, 6) struct device *
device_create(struct class *cls, struct device *parent, dev_t devt,
void *drvdata, const char *fmt, ...);
__printf(6, 7) struct device *
device_create_with_groups(struct class *cls, struct device *parent, dev_t devt,
void *drvdata, const struct attribute_group **groups,
const char *fmt, ...);
void device_destroy(struct class *cls, dev_t devt);
int __must_check device_add_groups(struct device *dev,
const struct attribute_group **groups);
void device_remove_groups(struct device *dev,
const struct attribute_group **groups);
static inline int __must_check device_add_group(struct device *dev,
const struct attribute_group *grp)
{
const struct attribute_group *groups[] = { grp, NULL };
return device_add_groups(dev, groups);
}
static inline void device_remove_group(struct device *dev,
const struct attribute_group *grp)
{
const struct attribute_group *groups[] = { grp, NULL };
return device_remove_groups(dev, groups);
}
int __must_check devm_device_add_groups(struct device *dev,
const struct attribute_group **groups);
void devm_device_remove_groups(struct device *dev,
const struct attribute_group **groups);
int __must_check devm_device_add_group(struct device *dev,
const struct attribute_group *grp);
void devm_device_remove_group(struct device *dev,
const struct attribute_group *grp);
/*
* Platform "fixup" functions - allow the platform to have their say
* about devices and actions that the general device layer doesn't
* know about.
*/
/* Notify platform of device discovery */
extern int (*platform_notify)(struct device *dev);
extern int (*platform_notify_remove)(struct device *dev);
/*
* get_device - atomically increment the reference count for the device.
*
*/
struct device *get_device(struct device *dev);
void put_device(struct device *dev);
bool kill_device(struct device *dev);
#ifdef CONFIG_DEVTMPFS
int devtmpfs_mount(void);
#else
static inline int devtmpfs_mount(void) { return 0; }
#endif
/* drivers/base/power/shutdown.c */
void device_shutdown(void);
/* debugging and troubleshooting/diagnostic helpers. */
const char *dev_driver_string(const struct device *dev);
/* Device links interface. */
struct device_link *device_link_add(struct device *consumer,
struct device *supplier, u32 flags);
void device_link_del(struct device_link *link);
void device_link_remove(void *consumer, struct device *supplier);
void device_links_supplier_sync_state_pause(void);
void device_links_supplier_sync_state_resume(void);
extern __printf(3, 4)
int dev_err_probe(const struct device *dev, int err, const char *fmt, ...);
/* Create alias, so I can be autoloaded. */
#define MODULE_ALIAS_CHARDEV(major,minor) \
MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor))
#define MODULE_ALIAS_CHARDEV_MAJOR(major) \
MODULE_ALIAS("char-major-" __stringify(major) "-*")
#ifdef CONFIG_SYSFS_DEPRECATED
extern long sysfs_deprecated;
#else
#define sysfs_deprecated 0
#endif
#endif /* _DEVICE_H_ */
// SPDX-License-Identifier: GPL-2.0+
/*
* XArray implementation
* Copyright (c) 2017-2018 Microsoft Corporation
* Copyright (c) 2018-2020 Oracle
* Author: Matthew Wilcox <willy@infradead.org>
*/
#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/xarray.h>
/*
* Coding conventions in this file:
*
* @xa is used to refer to the entire xarray.
* @xas is the 'xarray operation state'. It may be either a pointer to
* an xa_state, or an xa_state stored on the stack. This is an unfortunate
* ambiguity.
* @index is the index of the entry being operated on
* @mark is an xa_mark_t; a small number indicating one of the mark bits.
* @node refers to an xa_node; usually the primary one being operated on by
* this function.
* @offset is the index into the slots array inside an xa_node.
* @parent refers to the @xa_node closer to the head than @node.
* @entry refers to something stored in a slot in the xarray
*/
static inline unsigned int xa_lock_type(const struct xarray *xa)
{
return (__force unsigned int)xa->xa_flags & 3;
}
static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type)
{
if (lock_type == XA_LOCK_IRQ)
xas_lock_irq(xas);
else if (lock_type == XA_LOCK_BH)
xas_lock_bh(xas);
else
xas_lock(xas);
}
static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type)
{
if (lock_type == XA_LOCK_IRQ)
xas_unlock_irq(xas);
else if (lock_type == XA_LOCK_BH)
xas_unlock_bh(xas);
else
xas_unlock(xas);
}
static inline bool xa_track_free(const struct xarray *xa)
{
return xa->xa_flags & XA_FLAGS_TRACK_FREE;
}
static inline bool xa_zero_busy(const struct xarray *xa)
{
return xa->xa_flags & XA_FLAGS_ZERO_BUSY;
}
static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark)
{
if (!(xa->xa_flags & XA_FLAGS_MARK(mark)))
xa->xa_flags |= XA_FLAGS_MARK(mark);
}
static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark)
{
if (xa->xa_flags & XA_FLAGS_MARK(mark)) xa->xa_flags &= ~(XA_FLAGS_MARK(mark));
}
static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark)
{
return node->marks[(__force unsigned)mark];
}
static inline bool node_get_mark(struct xa_node *node,
unsigned int offset, xa_mark_t mark)
{
return test_bit(offset, node_marks(node, mark));
}
/* returns true if the bit was set */
static inline bool node_set_mark(struct xa_node *node, unsigned int offset,
xa_mark_t mark)
{
return __test_and_set_bit(offset, node_marks(node, mark));
}
/* returns true if the bit was set */
static inline bool node_clear_mark(struct xa_node *node, unsigned int offset,
xa_mark_t mark)
{
return __test_and_clear_bit(offset, node_marks(node, mark));
}
static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark)
{
return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE);
}
static inline void node_mark_all(struct xa_node *node, xa_mark_t mark)
{
bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE);
}
#define mark_inc(mark) do { \
mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \
} while (0)
/*
* xas_squash_marks() - Merge all marks to the first entry
* @xas: Array operation state.
*
* Set a mark on the first entry if any entry has it set. Clear marks on
* all sibling entries.
*/
static void xas_squash_marks(const struct xa_state *xas)
{
unsigned int mark = 0;
unsigned int limit = xas->xa_offset + xas->xa_sibs + 1;
if (!xas->xa_sibs)
return;
do {
unsigned long *marks = xas->xa_node->marks[mark];
if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit)
continue;
__set_bit(xas->xa_offset, marks);
bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs);
} while (mark++ != (__force unsigned)XA_MARK_MAX);
}
/* extracts the offset within this node from the index */
static unsigned int get_offset(unsigned long index, struct xa_node *node)
{
return (index >> node->shift) & XA_CHUNK_MASK;
}
static void xas_set_offset(struct xa_state *xas)
{
xas->xa_offset = get_offset(xas->xa_index, xas->xa_node);
}
/* move the index either forwards (find) or backwards (sibling slot) */
static void xas_move_index(struct xa_state *xas, unsigned long offset)
{
unsigned int shift = xas->xa_node->shift;
xas->xa_index &= ~XA_CHUNK_MASK << shift;
xas->xa_index += offset << shift;
}
static void xas_advance(struct xa_state *xas)
{
xas->xa_offset++;
xas_move_index(xas, xas->xa_offset);
}
static void *set_bounds(struct xa_state *xas)
{
xas->xa_node = XAS_BOUNDS;
return NULL;
}
/*
* Starts a walk. If the @xas is already valid, we assume that it's on
* the right path and just return where we've got to. If we're in an
* error state, return NULL. If the index is outside the current scope
* of the xarray, return NULL without changing @xas->xa_node. Otherwise
* set @xas->xa_node to NULL and return the current head of the array.
*/
static void *xas_start(struct xa_state *xas)
{
void *entry;
if (xas_valid(xas))
return xas_reload(xas);
if (xas_error(xas))
return NULL;
entry = xa_head(xas->xa);
if (!xa_is_node(entry)) {
if (xas->xa_index)
return set_bounds(xas);
} else {
if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK)
return set_bounds(xas);
}
xas->xa_node = NULL; return entry;
}
static void *xas_descend(struct xa_state *xas, struct xa_node *node)
{
unsigned int offset = get_offset(xas->xa_index, node);
void *entry = xa_entry(xas->xa, node, offset);
xas->xa_node = node;
if (xa_is_sibling(entry)) {
offset = xa_to_sibling(entry);
entry = xa_entry(xas->xa, node, offset);
}
xas->xa_offset = offset;
return entry;
}
/**
* xas_load() - Load an entry from the XArray (advanced).
* @xas: XArray operation state.
*
* Usually walks the @xas to the appropriate state to load the entry
* stored at xa_index. However, it will do nothing and return %NULL if
* @xas is in an error state. xas_load() will never expand the tree.
*
* If the xa_state is set up to operate on a multi-index entry, xas_load()
* may return %NULL or an internal entry, even if there are entries
* present within the range specified by @xas.
*
* Context: Any context. The caller should hold the xa_lock or the RCU lock.
* Return: Usually an entry in the XArray, but see description for exceptions.
*/
void *xas_load(struct xa_state *xas)
{
void *entry = xas_start(xas);
while (xa_is_node(entry)) {
struct xa_node *node = xa_to_node(entry);
if (xas->xa_shift > node->shift)
break;
entry = xas_descend(xas, node);
if (node->shift == 0)
break;
}
return entry;
}
EXPORT_SYMBOL_GPL(xas_load);
/* Move the radix tree node cache here */
extern struct kmem_cache *radix_tree_node_cachep;
extern void radix_tree_node_rcu_free(struct rcu_head *head);
#define XA_RCU_FREE ((struct xarray *)1)
static void xa_node_free(struct xa_node *node)
{
XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
node->array = XA_RCU_FREE;
call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}
/*
* xas_destroy() - Free any resources allocated during the XArray operation.
* @xas: XArray operation state.
*
* This function is now internal-only.
*/
static void xas_destroy(struct xa_state *xas)
{
struct xa_node *next, *node = xas->xa_alloc;
while (node) {
XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
next = rcu_dereference_raw(node->parent);
radix_tree_node_rcu_free(&node->rcu_head);
xas->xa_alloc = node = next;
}
}
/**
* xas_nomem() - Allocate memory if needed.
* @xas: XArray operation state.
* @gfp: Memory allocation flags.
*
* If we need to add new nodes to the XArray, we try to allocate memory
* with GFP_NOWAIT while holding the lock, which will usually succeed.
* If it fails, @xas is flagged as needing memory to continue. The caller
* should drop the lock and call xas_nomem(). If xas_nomem() succeeds,
* the caller should retry the operation.
*
* Forward progress is guaranteed as one node is allocated here and
* stored in the xa_state where it will be found by xas_alloc(). More
* nodes will likely be found in the slab allocator, but we do not tie
* them up here.
*
* Return: true if memory was needed, and was successfully allocated.
*/
bool xas_nomem(struct xa_state *xas, gfp_t gfp)
{
if (xas->xa_node != XA_ERROR(-ENOMEM)) {
xas_destroy(xas);
return false;
}
if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp);
if (!xas->xa_alloc)
return false;
xas->xa_alloc->parent = NULL;
XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list));
xas->xa_node = XAS_RESTART;
return true;
}
EXPORT_SYMBOL_GPL(xas_nomem);
/*
* __xas_nomem() - Drop locks and allocate memory if needed.
* @xas: XArray operation state.
* @gfp: Memory allocation flags.
*
* Internal variant of xas_nomem().
*
* Return: true if memory was needed, and was successfully allocated.
*/
static bool __xas_nomem(struct xa_state *xas, gfp_t gfp)
__must_hold(xas->xa->xa_lock)
{
unsigned int lock_type = xa_lock_type(xas->xa);
if (xas->xa_node != XA_ERROR(-ENOMEM)) {
xas_destroy(xas);
return false;
}
if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
gfp |= __GFP_ACCOUNT;
if (gfpflags_allow_blocking(gfp)) {
xas_unlock_type(xas, lock_type);
xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp);
xas_lock_type(xas, lock_type);
} else {
xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp);
}
if (!xas->xa_alloc)
return false;
xas->xa_alloc->parent = NULL;
XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list));
xas->xa_node = XAS_RESTART;
return true;
}
static void xas_update(struct xa_state *xas, struct xa_node *node)
{
if (xas->xa_update)
xas->xa_update(node);
else
XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
}
static void *xas_alloc(struct xa_state *xas, unsigned int shift)
{
struct xa_node *parent = xas->xa_node;
struct xa_node *node = xas->xa_alloc;
if (xas_invalid(xas))
return NULL;
if (node) { xas->xa_alloc = NULL;
} else {
gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;
if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
gfp |= __GFP_ACCOUNT;
node = kmem_cache_alloc(radix_tree_node_cachep, gfp);
if (!node) {
xas_set_err(xas, -ENOMEM);
return NULL;
}
}
if (parent) { node->offset = xas->xa_offset;
parent->count++;
XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE);
xas_update(xas, parent);
}
XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
node->shift = shift;
node->count = 0;
node->nr_values = 0;
RCU_INIT_POINTER(node->parent, xas->xa_node);
node->array = xas->xa;
return node;
}
#ifdef CONFIG_XARRAY_MULTI
/* Returns the number of indices covered by a given xa_state */
static unsigned long xas_size(const struct xa_state *xas)
{
return (xas->xa_sibs + 1UL) << xas->xa_shift;
}
#endif
/*
* Use this to calculate the maximum index that will need to be created
* in order to add the entry described by @xas. Because we cannot store a
* multi-index entry at index 0, the calculation is a little more complex
* than you might expect.
*/
static unsigned long xas_max(struct xa_state *xas)
{
unsigned long max = xas->xa_index;
#ifdef CONFIG_XARRAY_MULTI
if (xas->xa_shift || xas->xa_sibs) {
unsigned long mask = xas_size(xas) - 1;
max |= mask;
if (mask == max)
max++;
}
#endif
return max;
}
/* The maximum index that can be contained in the array without expanding it */
static unsigned long max_index(void *entry)
{
if (!xa_is_node(entry))
return 0;
return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1;
}
static void xas_shrink(struct xa_state *xas)
{
struct xarray *xa = xas->xa;
struct xa_node *node = xas->xa_node;
for (;;) {
void *entry;
XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
if (node->count != 1)
break;
entry = xa_entry_locked(xa, node, 0);
if (!entry)
break;
if (!xa_is_node(entry) && node->shift)
break;
if (xa_is_zero(entry) && xa_zero_busy(xa))
entry = NULL;
xas->xa_node = XAS_BOUNDS;
RCU_INIT_POINTER(xa->xa_head, entry);
if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK))
xa_mark_clear(xa, XA_FREE_MARK);
node->count = 0;
node->nr_values = 0;
if (!xa_is_node(entry))
RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY); xas_update(xas, node);
xa_node_free(node);
if (!xa_is_node(entry))
break;
node = xa_to_node(entry);
node->parent = NULL;
}
}
/*
* xas_delete_node() - Attempt to delete an xa_node
* @xas: Array operation state.
*
* Attempts to delete the @xas->xa_node. This will fail if xa->node has
* a non-zero reference count.
*/
static void xas_delete_node(struct xa_state *xas)
{
struct xa_node *node = xas->xa_node;
for (;;) {
struct xa_node *parent;
XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
if (node->count)
break;
parent = xa_parent_locked(xas->xa, node);
xas->xa_node = parent;
xas->xa_offset = node->offset;
xa_node_free(node);
if (!parent) {
xas->xa->xa_head = NULL;
xas->xa_node = XAS_BOUNDS;
return;
}
parent->slots[xas->xa_offset] = NULL;
parent->count--;
XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE);
node = parent;
xas_update(xas, node);
}
if (!node->parent)
xas_shrink(xas);
}
/**
* xas_free_nodes() - Free this node and all nodes that it references
* @xas: Array operation state.
* @top: Node to free
*
* This node has been removed from the tree. We must now free it and all
* of its subnodes. There may be RCU walkers with references into the tree,
* so we must replace all entries with retry markers.
*/
static void xas_free_nodes(struct xa_state *xas, struct xa_node *top)
{
unsigned int offset = 0;
struct xa_node *node = top;
for (;;) {
void *entry = xa_entry_locked(xas->xa, node, offset);
if (node->shift && xa_is_node(entry)) {
node = xa_to_node(entry);
offset = 0;
continue;
}
if (entry)
RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY);
offset++;
while (offset == XA_CHUNK_SIZE) {
struct xa_node *parent;
parent = xa_parent_locked(xas->xa, node);
offset = node->offset + 1;
node->count = 0;
node->nr_values = 0;
xas_update(xas, node);
xa_node_free(node);
if (node == top)
return;
node = parent;
}
}
}
/*
* xas_expand adds nodes to the head of the tree until it has reached
* sufficient height to be able to contain @xas->xa_index
*/
static int xas_expand(struct xa_state *xas, void *head)
{
struct xarray *xa = xas->xa;
struct xa_node *node = NULL;
unsigned int shift = 0;
unsigned long max = xas_max(xas);
if (!head) {
if (max == 0)
return 0;
while ((max >> shift) >= XA_CHUNK_SIZE) shift += XA_CHUNK_SHIFT;
return shift + XA_CHUNK_SHIFT;
} else if (xa_is_node(head)) {
node = xa_to_node(head);
shift = node->shift + XA_CHUNK_SHIFT;
}
xas->xa_node = NULL;
while (max > max_index(head)) {
xa_mark_t mark = 0;
XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
node = xas_alloc(xas, shift);
if (!node)
return -ENOMEM;
node->count = 1;
if (xa_is_value(head))
node->nr_values = 1; RCU_INIT_POINTER(node->slots[0], head);
/* Propagate the aggregated mark info to the new child */
for (;;) {
if (xa_track_free(xa) && mark == XA_FREE_MARK) {
node_mark_all(node, XA_FREE_MARK);
if (!xa_marked(xa, XA_FREE_MARK)) {
node_clear_mark(node, 0, XA_FREE_MARK);
xa_mark_set(xa, XA_FREE_MARK);
}
} else if (xa_marked(xa, mark)) {
node_set_mark(node, 0, mark);
}
if (mark == XA_MARK_MAX)
break;
mark_inc(mark);
}
/*
* Now that the new node is fully initialised, we can add
* it to the tree
*/
if (xa_is_node(head)) {
xa_to_node(head)->offset = 0;
rcu_assign_pointer(xa_to_node(head)->parent, node);
}
head = xa_mk_node(node);
rcu_assign_pointer(xa->xa_head, head);
xas_update(xas, node);
shift += XA_CHUNK_SHIFT;
}
xas->xa_node = node;
return shift;
}
/*
* xas_create() - Create a slot to store an entry in.
* @xas: XArray operation state.
* @allow_root: %true if we can store the entry in the root directly
*
* Most users will not need to call this function directly, as it is called
* by xas_store(). It is useful for doing conditional store operations
* (see the xa_cmpxchg() implementation for an example).
*
* Return: If the slot already existed, returns the contents of this slot.
* If the slot was newly created, returns %NULL. If it failed to create the
* slot, returns %NULL and indicates the error in @xas.
*/
static void *xas_create(struct xa_state *xas, bool allow_root)
{
struct xarray *xa = xas->xa;
void *entry;
void __rcu **slot;
struct xa_node *node = xas->xa_node;
int shift;
unsigned int order = xas->xa_shift;
if (xas_top(node)) {
entry = xa_head_locked(xa);
xas->xa_node = NULL;
if (!entry && xa_zero_busy(xa))
entry = XA_ZERO_ENTRY;
shift = xas_expand(xas, entry);
if (shift < 0)
return NULL;
if (!shift && !allow_root)
shift = XA_CHUNK_SHIFT;
entry = xa_head_locked(xa);
slot = &xa->xa_head;
} else if (xas_error(xas)) {
return NULL;
} else if (node) { unsigned int offset = xas->xa_offset;
shift = node->shift;
entry = xa_entry_locked(xa, node, offset);
slot = &node->slots[offset];
} else {
shift = 0;
entry = xa_head_locked(xa);
slot = &xa->xa_head;
}
while (shift > order) { shift -= XA_CHUNK_SHIFT;
if (!entry) {
node = xas_alloc(xas, shift);
if (!node)
break;
if (xa_track_free(xa))
node_mark_all(node, XA_FREE_MARK);
rcu_assign_pointer(*slot, xa_mk_node(node));
} else if (xa_is_node(entry)) {
node = xa_to_node(entry);
} else {
break;
}
entry = xas_descend(xas, node);
slot = &node->slots[xas->xa_offset];
}
return entry;
}
/**
* xas_create_range() - Ensure that stores to this range will succeed
* @xas: XArray operation state.
*
* Creates all of the slots in the range covered by @xas. Sets @xas to
* create single-index entries and positions it at the beginning of the
* range. This is for the benefit of users which have not yet been
* converted to use multi-index entries.
*/
void xas_create_range(struct xa_state *xas)
{
unsigned long index = xas->xa_index;
unsigned char shift = xas->xa_shift;
unsigned char sibs = xas->xa_sibs;
xas->xa_index |= ((sibs + 1UL) << shift) - 1;
if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift) xas->xa_offset |= sibs; xas->xa_shift = 0;
xas->xa_sibs = 0;
for (;;) {
xas_create(xas, true);
if (xas_error(xas))
goto restore;
if (xas->xa_index <= (index | XA_CHUNK_MASK))
goto success;
xas->xa_index -= XA_CHUNK_SIZE;
for (;;) {
struct xa_node *node = xas->xa_node;
if (node->shift >= shift)
break;
xas->xa_node = xa_parent_locked(xas->xa, node);
xas->xa_offset = node->offset - 1;
if (node->offset != 0)
break;
}
}
restore:
xas->xa_shift = shift;
xas->xa_sibs = sibs;
xas->xa_index = index;
return;
success:
xas->xa_index = index;
if (xas->xa_node)
xas_set_offset(xas);
}
EXPORT_SYMBOL_GPL(xas_create_range);
static void update_node(struct xa_state *xas, struct xa_node *node,
int count, int values)
{
if (!node || (!count && !values))
return;
node->count += count;
node->nr_values += values;
XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE);
xas_update(xas, node);
if (count < 0)
xas_delete_node(xas);
}
/**
* xas_store() - Store this entry in the XArray.
* @xas: XArray operation state.
* @entry: New entry.
*
* If @xas is operating on a multi-index entry, the entry returned by this
* function is essentially meaningless (it may be an internal entry or it
* may be %NULL, even if there are non-NULL entries at some of the indices
* covered by the range). This is not a problem for any current users,
* and can be changed if needed.
*
* Return: The old entry at this index.
*/
void *xas_store(struct xa_state *xas, void *entry)
{
struct xa_node *node;
void __rcu **slot = &xas->xa->xa_head;
unsigned int offset, max;
int count = 0;
int values = 0;
void *first, *next;
bool value = xa_is_value(entry);
if (entry) {
bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry); first = xas_create(xas, allow_root);
} else {
first = xas_load(xas);
}
if (xas_invalid(xas))
return first;
node = xas->xa_node;
if (node && (xas->xa_shift < node->shift)) xas->xa_sibs = 0; if ((first == entry) && !xas->xa_sibs)
return first;
next = first;
offset = xas->xa_offset;
max = xas->xa_offset + xas->xa_sibs;
if (node) {
slot = &node->slots[offset]; if (xas->xa_sibs)
xas_squash_marks(xas);
}
if (!entry) xas_init_marks(xas);
for (;;) {
/*
* Must clear the marks before setting the entry to NULL,
* otherwise xas_for_each_marked may find a NULL entry and
* stop early. rcu_assign_pointer contains a release barrier
* so the mark clearing will appear to happen before the
* entry is set to NULL.
*/
rcu_assign_pointer(*slot, entry); if (xa_is_node(next) && (!node || node->shift))
xas_free_nodes(xas, xa_to_node(next));
if (!node)
break;
count += !next - !entry;
values += !xa_is_value(first) - !value;
if (entry) {
if (offset == max)
break;
if (!xa_is_sibling(entry))
entry = xa_mk_sibling(xas->xa_offset);
} else {
if (offset == XA_CHUNK_MASK)
break;
}
next = xa_entry_locked(xas->xa, node, ++offset);
if (!xa_is_sibling(next)) {
if (!entry && (offset > max))
break;
first = next;
}
slot++;
}
update_node(xas, node, count, values);
return first;
}
EXPORT_SYMBOL_GPL(xas_store);
/**
* xas_get_mark() - Returns the state of this mark.
* @xas: XArray operation state.
* @mark: Mark number.
*
* Return: true if the mark is set, false if the mark is clear or @xas
* is in an error state.
*/
bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark)
{
if (xas_invalid(xas))
return false;
if (!xas->xa_node)
return xa_marked(xas->xa, mark);
return node_get_mark(xas->xa_node, xas->xa_offset, mark);
}
EXPORT_SYMBOL_GPL(xas_get_mark);
/**
* xas_set_mark() - Sets the mark on this entry and its parents.
* @xas: XArray operation state.
* @mark: Mark number.
*
* Sets the specified mark on this entry, and walks up the tree setting it
* on all the ancestor entries. Does nothing if @xas has not been walked to
* an entry, or is in an error state.
*/
void xas_set_mark(const struct xa_state *xas, xa_mark_t mark)
{
struct xa_node *node = xas->xa_node; unsigned int offset = xas->xa_offset;
if (xas_invalid(xas))
return;
while (node) {
if (node_set_mark(node, offset, mark))
return;
offset = node->offset;
node = xa_parent_locked(xas->xa, node);
}
if (!xa_marked(xas->xa, mark))
xa_mark_set(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_set_mark);
/**
* xas_clear_mark() - Clears the mark on this entry and its parents.
* @xas: XArray operation state.
* @mark: Mark number.
*
* Clears the specified mark on this entry, and walks back to the head
* attempting to clear it on all the ancestor entries. Does nothing if
* @xas has not been walked to an entry, or is in an error state.
*/
void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark)
{
struct xa_node *node = xas->xa_node; unsigned int offset = xas->xa_offset;
if (xas_invalid(xas))
return;
while (node) {
if (!node_clear_mark(node, offset, mark))
return;
if (node_any_mark(node, mark))
return;
offset = node->offset;
node = xa_parent_locked(xas->xa, node);
}
if (xa_marked(xas->xa, mark))
xa_mark_clear(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_clear_mark);
/**
* xas_init_marks() - Initialise all marks for the entry
* @xas: Array operations state.
*
* Initialise all marks for the entry specified by @xas. If we're tracking
* free entries with a mark, we need to set it on all entries. All other
* marks are cleared.
*
* This implementation is not as efficient as it could be; we may walk
* up the tree multiple times.
*/
void xas_init_marks(const struct xa_state *xas)
{
xa_mark_t mark = 0;
for (;;) {
if (xa_track_free(xas->xa) && mark == XA_FREE_MARK) xas_set_mark(xas, mark);
else
xas_clear_mark(xas, mark);
if (mark == XA_MARK_MAX)
break;
mark_inc(mark);
}
}
EXPORT_SYMBOL_GPL(xas_init_marks);
#ifdef CONFIG_XARRAY_MULTI
static unsigned int node_get_marks(struct xa_node *node, unsigned int offset)
{
unsigned int marks = 0;
xa_mark_t mark = XA_MARK_0;
for (;;) {
if (node_get_mark(node, offset, mark))
marks |= 1 << (__force unsigned int)mark;
if (mark == XA_MARK_MAX)
break;
mark_inc(mark);
}
return marks;
}
static void node_set_marks(struct xa_node *node, unsigned int offset,
struct xa_node *child, unsigned int marks)
{
xa_mark_t mark = XA_MARK_0;
for (;;) {
if (marks & (1 << (__force unsigned int)mark)) {
node_set_mark(node, offset, mark);
if (child)
node_mark_all(child, mark);
}
if (mark == XA_MARK_MAX)
break;
mark_inc(mark);
}
}
/**
* xas_split_alloc() - Allocate memory for splitting an entry.
* @xas: XArray operation state.
* @entry: New entry which will be stored in the array.
* @order: Current entry order.
* @gfp: Memory allocation flags.
*
* This function should be called before calling xas_split().
* If necessary, it will allocate new nodes (and fill them with @entry)
* to prepare for the upcoming split of an entry of @order size into
* entries of the order stored in the @xas.
*
* Context: May sleep if @gfp flags permit.
*/
void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order,
gfp_t gfp)
{
unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
unsigned int mask = xas->xa_sibs;
/* XXX: no support for splitting really large entries yet */
if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT < order))
goto nomem;
if (xas->xa_shift + XA_CHUNK_SHIFT > order)
return;
do {
unsigned int i;
void *sibling = NULL;
struct xa_node *node;
node = kmem_cache_alloc(radix_tree_node_cachep, gfp);
if (!node)
goto nomem;
node->array = xas->xa;
for (i = 0; i < XA_CHUNK_SIZE; i++) {
if ((i & mask) == 0) {
RCU_INIT_POINTER(node->slots[i], entry);
sibling = xa_mk_sibling(i);
} else {
RCU_INIT_POINTER(node->slots[i], sibling);
}
}
RCU_INIT_POINTER(node->parent, xas->xa_alloc);
xas->xa_alloc = node;
} while (sibs-- > 0);
return;
nomem:
xas_destroy(xas);
xas_set_err(xas, -ENOMEM);
}
EXPORT_SYMBOL_GPL(xas_split_alloc);
/**
* xas_split() - Split a multi-index entry into smaller entries.
* @xas: XArray operation state.
* @entry: New entry to store in the array.
* @order: Current entry order.
*
* The size of the new entries is set in @xas. The value in @entry is
* copied to all the replacement entries.
*
* Context: Any context. The caller should hold the xa_lock.
*/
void xas_split(struct xa_state *xas, void *entry, unsigned int order)
{
unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
unsigned int offset, marks;
struct xa_node *node;
void *curr = xas_load(xas);
int values = 0;
node = xas->xa_node;
if (xas_top(node))
return;
marks = node_get_marks(node, xas->xa_offset);
offset = xas->xa_offset + sibs;
do {
if (xas->xa_shift < node->shift) {
struct xa_node *child = xas->xa_alloc;
xas->xa_alloc = rcu_dereference_raw(child->parent);
child->shift = node->shift - XA_CHUNK_SHIFT;
child->offset = offset;
child->count = XA_CHUNK_SIZE;
child->nr_values = xa_is_value(entry) ?
XA_CHUNK_SIZE : 0;
RCU_INIT_POINTER(child->parent, node);
node_set_marks(node, offset, child, marks);
rcu_assign_pointer(node->slots[offset],
xa_mk_node(child));
if (xa_is_value(curr))
values--;
xas_update(xas, child);
} else {
unsigned int canon = offset - xas->xa_sibs;
node_set_marks(node, canon, NULL, marks);
rcu_assign_pointer(node->slots[canon], entry);
while (offset > canon)
rcu_assign_pointer(node->slots[offset--],
xa_mk_sibling(canon));
values += (xa_is_value(entry) - xa_is_value(curr)) *
(xas->xa_sibs + 1);
}
} while (offset-- > xas->xa_offset);
node->nr_values += values;
xas_update(xas, node);
}
EXPORT_SYMBOL_GPL(xas_split);
#endif
/**
* xas_pause() - Pause a walk to drop a lock.
* @xas: XArray operation state.
*
* Some users need to pause a walk and drop the lock they're holding in
* order to yield to a higher priority thread or carry out an operation
* on an entry. Those users should call this function before they drop
* the lock. It resets the @xas to be suitable for the next iteration
* of the loop after the user has reacquired the lock. If most entries
* found during a walk require you to call xas_pause(), the xa_for_each()
* iterator may be more appropriate.
*
* Note that xas_pause() only works for forward iteration. If a user needs
* to pause a reverse iteration, we will need a xas_pause_rev().
*/
void xas_pause(struct xa_state *xas)
{
struct xa_node *node = xas->xa_node;
if (xas_invalid(xas))
return;
xas->xa_node = XAS_RESTART;
if (node) {
unsigned long offset = xas->xa_offset;
while (++offset < XA_CHUNK_SIZE) {
if (!xa_is_sibling(xa_entry(xas->xa, node, offset)))
break;
}
xas->xa_index += (offset - xas->xa_offset) << node->shift;
if (xas->xa_index == 0)
xas->xa_node = XAS_BOUNDS;
} else {
xas->xa_index++;
}
}
EXPORT_SYMBOL_GPL(xas_pause);
/*
* __xas_prev() - Find the previous entry in the XArray.
* @xas: XArray operation state.
*
* Helper function for xas_prev() which handles all the complex cases
* out of line.
*/
void *__xas_prev(struct xa_state *xas)
{
void *entry;
if (!xas_frozen(xas->xa_node)) xas->xa_index--; if (!xas->xa_node)
return set_bounds(xas);
if (xas_not_node(xas->xa_node))
return xas_load(xas); if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) xas->xa_offset--; while (xas->xa_offset == 255) { xas->xa_offset = xas->xa_node->offset - 1;
xas->xa_node = xa_parent(xas->xa, xas->xa_node);
if (!xas->xa_node)
return set_bounds(xas);
}
for (;;) {
entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
if (!xa_is_node(entry))
return entry;
xas->xa_node = xa_to_node(entry);
xas_set_offset(xas);
}
}
EXPORT_SYMBOL_GPL(__xas_prev);
/*
* __xas_next() - Find the next entry in the XArray.
* @xas: XArray operation state.
*
* Helper function for xas_next() which handles all the complex cases
* out of line.
*/
void *__xas_next(struct xa_state *xas)
{
void *entry;
if (!xas_frozen(xas->xa_node)) xas->xa_index++; if (!xas->xa_node)
return set_bounds(xas);
if (xas_not_node(xas->xa_node))
return xas_load(xas); if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) xas->xa_offset++; while (xas->xa_offset == XA_CHUNK_SIZE) { xas->xa_offset = xas->xa_node->offset + 1;
xas->xa_node = xa_parent(xas->xa, xas->xa_node);
if (!xas->xa_node)
return set_bounds(xas);
}
for (;;) {
entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
if (!xa_is_node(entry))
return entry;
xas->xa_node = xa_to_node(entry);
xas_set_offset(xas);
}
}
EXPORT_SYMBOL_GPL(__xas_next);
/**
* xas_find() - Find the next present entry in the XArray.
* @xas: XArray operation state.
* @max: Highest index to return.
*
* If the @xas has not yet been walked to an entry, return the entry
* which has an index >= xas.xa_index. If it has been walked, the entry
* currently being pointed at has been processed, and so we move to the
* next entry.
*
* If no entry is found and the array is smaller than @max, the iterator
* is set to the smallest index not yet in the array. This allows @xas
* to be immediately passed to xas_store().
*
* Return: The entry, if found, otherwise %NULL.
*/
void *xas_find(struct xa_state *xas, unsigned long max)
{
void *entry;
if (xas_error(xas) || xas->xa_node == XAS_BOUNDS)
return NULL;
if (xas->xa_index > max) return set_bounds(xas); if (!xas->xa_node) { xas->xa_index = 1;
return set_bounds(xas);
} else if (xas->xa_node == XAS_RESTART) { entry = xas_load(xas); if (entry || xas_not_node(xas->xa_node))
return entry;
} else if (!xas->xa_node->shift && xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) { xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1;
}
xas_advance(xas);
while (xas->xa_node && (xas->xa_index <= max)) { if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { xas->xa_offset = xas->xa_node->offset + 1;
xas->xa_node = xa_parent(xas->xa, xas->xa_node);
continue;
}
entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
if (xa_is_node(entry)) {
xas->xa_node = xa_to_node(entry);
xas->xa_offset = 0;
continue;
}
if (entry && !xa_is_sibling(entry))
return entry;
xas_advance(xas);
}
if (!xas->xa_node)
xas->xa_node = XAS_BOUNDS;
return NULL;
}
EXPORT_SYMBOL_GPL(xas_find);
/**
* xas_find_marked() - Find the next marked entry in the XArray.
* @xas: XArray operation state.
* @max: Highest index to return.
* @mark: Mark number to search for.
*
* If the @xas has not yet been walked to an entry, return the marked entry
* which has an index >= xas.xa_index. If it has been walked, the entry
* currently being pointed at has been processed, and so we return the
* first marked entry with an index > xas.xa_index.
*
* If no marked entry is found and the array is smaller than @max, @xas is
* set to the bounds state and xas->xa_index is set to the smallest index
* not yet in the array. This allows @xas to be immediately passed to
* xas_store().
*
* If no entry is found before @max is reached, @xas is set to the restart
* state.
*
* Return: The entry, if found, otherwise %NULL.
*/
void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
{
bool advance = true;
unsigned int offset;
void *entry;
if (xas_error(xas))
return NULL;
if (xas->xa_index > max)
goto max;
if (!xas->xa_node) { xas->xa_index = 1;
goto out;
} else if (xas_top(xas->xa_node)) {
advance = false;
entry = xa_head(xas->xa);
xas->xa_node = NULL;
if (xas->xa_index > max_index(entry))
goto out;
if (!xa_is_node(entry)) {
if (xa_marked(xas->xa, mark))
return entry;
xas->xa_index = 1;
goto out;
}
xas->xa_node = xa_to_node(entry);
xas->xa_offset = xas->xa_index >> xas->xa_node->shift;
}
while (xas->xa_index <= max) { if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { xas->xa_offset = xas->xa_node->offset + 1;
xas->xa_node = xa_parent(xas->xa, xas->xa_node);
if (!xas->xa_node)
break;
advance = false;
continue;
}
if (!advance) {
entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
if (xa_is_sibling(entry)) {
xas->xa_offset = xa_to_sibling(entry);
xas_move_index(xas, xas->xa_offset);
}
}
offset = xas_find_chunk(xas, advance, mark);
if (offset > xas->xa_offset) {
advance = false;
xas_move_index(xas, offset);
/* Mind the wrap */
if ((xas->xa_index - 1) >= max)
goto max;
xas->xa_offset = offset;
if (offset == XA_CHUNK_SIZE)
continue;
}
entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK))
continue;
if (!xa_is_node(entry))
return entry;
xas->xa_node = xa_to_node(entry);
xas_set_offset(xas);
}
out:
if (xas->xa_index > max)
goto max;
return set_bounds(xas);
max:
xas->xa_node = XAS_RESTART;
return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_marked);
/**
* xas_find_conflict() - Find the next present entry in a range.
* @xas: XArray operation state.
*
* The @xas describes both a range and a position within that range.
*
* Context: Any context. Expects xa_lock to be held.
* Return: The next entry in the range covered by @xas or %NULL.
*/
void *xas_find_conflict(struct xa_state *xas)
{
void *curr;
if (xas_error(xas))
return NULL;
if (!xas->xa_node)
return NULL;
if (xas_top(xas->xa_node)) { curr = xas_start(xas);
if (!curr)
return NULL;
while (xa_is_node(curr)) {
struct xa_node *node = xa_to_node(curr);
curr = xas_descend(xas, node);
}
if (curr)
return curr;
}
if (xas->xa_node->shift > xas->xa_shift)
return NULL;
for (;;) {
if (xas->xa_node->shift == xas->xa_shift) { if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs)
break;
} else if (xas->xa_offset == XA_CHUNK_MASK) { xas->xa_offset = xas->xa_node->offset;
xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node);
if (!xas->xa_node)
break;
continue;
}
curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset);
if (xa_is_sibling(curr))
continue;
while (xa_is_node(curr)) {
xas->xa_node = xa_to_node(curr);
xas->xa_offset = 0;
curr = xa_entry_locked(xas->xa, xas->xa_node, 0);
}
if (curr)
return curr;
}
xas->xa_offset -= xas->xa_sibs; return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_conflict);
/**
* xa_load() - Load an entry from an XArray.
* @xa: XArray.
* @index: index into array.
*
* Context: Any context. Takes and releases the RCU lock.
* Return: The entry at @index in @xa.
*/
void *xa_load(struct xarray *xa, unsigned long index)
{
XA_STATE(xas, xa, index);
void *entry;
rcu_read_lock();
do {
entry = xas_load(&xas);
if (xa_is_zero(entry))
entry = NULL;
} while (xas_retry(&xas, entry));
rcu_read_unlock();
return entry;
}
EXPORT_SYMBOL(xa_load);
static void *xas_result(struct xa_state *xas, void *curr)
{
if (xa_is_zero(curr))
return NULL;
if (xas_error(xas))
curr = xas->xa_node;
return curr;
}
/**
* __xa_erase() - Erase this entry from the XArray while locked.
* @xa: XArray.
* @index: Index into array.
*
* After this function returns, loading from @index will return %NULL.
* If the index is part of a multi-index entry, all indices will be erased
* and none of the entries will be part of a multi-index entry.
*
* Context: Any context. Expects xa_lock to be held on entry.
* Return: The entry which used to be at this index.
*/
void *__xa_erase(struct xarray *xa, unsigned long index)
{
XA_STATE(xas, xa, index);
return xas_result(&xas, xas_store(&xas, NULL));
}
EXPORT_SYMBOL(__xa_erase);
/**
* xa_erase() - Erase this entry from the XArray.
* @xa: XArray.
* @index: Index of entry.
*
* After this function returns, loading from @index will return %NULL.
* If the index is part of a multi-index entry, all indices will be erased
* and none of the entries will be part of a multi-index entry.
*
* Context: Any context. Takes and releases the xa_lock.
* Return: The entry which used to be at this index.
*/
void *xa_erase(struct xarray *xa, unsigned long index)
{
void *entry;
xa_lock(xa);
entry = __xa_erase(xa, index);
xa_unlock(xa);
return entry;
}
EXPORT_SYMBOL(xa_erase);
/**
* __xa_store() - Store this entry in the XArray.
* @xa: XArray.
* @index: Index into array.
* @entry: New entry.
* @gfp: Memory allocation flags.
*
* You must already be holding the xa_lock when calling this function.
* It will drop the lock if needed to allocate memory, and then reacquire
* it afterwards.
*
* Context: Any context. Expects xa_lock to be held on entry. May
* release and reacquire xa_lock if @gfp flags permit.
* Return: The old entry at this index or xa_err() if an error happened.
*/
void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
XA_STATE(xas, xa, index);
void *curr;
if (WARN_ON_ONCE(xa_is_advanced(entry)))
return XA_ERROR(-EINVAL);
if (xa_track_free(xa) && !entry)
entry = XA_ZERO_ENTRY;
do {
curr = xas_store(&xas, entry);
if (xa_track_free(xa))
xas_clear_mark(&xas, XA_FREE_MARK);
} while (__xas_nomem(&xas, gfp));
return xas_result(&xas, curr);
}
EXPORT_SYMBOL(__xa_store);
/**
* xa_store() - Store this entry in the XArray.
* @xa: XArray.
* @index: Index into array.
* @entry: New entry.
* @gfp: Memory allocation flags.
*
* After this function returns, loads from this index will return @entry.
* Storing into an existing multi-index entry updates the entry of every index.
* The marks associated with @index are unaffected unless @entry is %NULL.
*
* Context: Any context. Takes and releases the xa_lock.
* May sleep if the @gfp flags permit.
* Return: The old entry at this index on success, xa_err(-EINVAL) if @entry
* cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation
* failed.
*/
void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
void *curr;
xa_lock(xa);
curr = __xa_store(xa, index, entry, gfp);
xa_unlock(xa);
return curr;
}
EXPORT_SYMBOL(xa_store);
/**
* __xa_cmpxchg() - Store this entry in the XArray.
* @xa: XArray.
* @index: Index into array.
* @old: Old value to test against.
* @entry: New entry.
* @gfp: Memory allocation flags.
*
* You must already be holding the xa_lock when calling this function.
* It will drop the lock if needed to allocate memory, and then reacquire
* it afterwards.
*
* Context: Any context. Expects xa_lock to be held on entry. May
* release and reacquire xa_lock if @gfp flags permit.
* Return: The old entry at this index or xa_err() if an error happened.
*/
void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
void *old, void *entry, gfp_t gfp)
{
XA_STATE(xas, xa, index);
void *curr;
if (WARN_ON_ONCE(xa_is_advanced(entry)))
return XA_ERROR(-EINVAL);
do {
curr = xas_load(&xas);
if (curr == old) {
xas_store(&xas, entry);
if (xa_track_free(xa) && entry && !curr)
xas_clear_mark(&xas, XA_FREE_MARK);
}
} while (__xas_nomem(&xas, gfp));
return xas_result(&xas, curr);
}
EXPORT_SYMBOL(__xa_cmpxchg);
/**
* __xa_insert() - Store this entry in the XArray if no entry is present.
* @xa: XArray.
* @index: Index into array.
* @entry: New entry.
* @gfp: Memory allocation flags.
*
* Inserting a NULL entry will store a reserved entry (like xa_reserve())
* if no entry is present. Inserting will fail if a reserved entry is
* present, even though loading from this index will return NULL.
*
* Context: Any context. Expects xa_lock to be held on entry. May
* release and reacquire xa_lock if @gfp flags permit.
* Return: 0 if the store succeeded. -EBUSY if another entry was present.
* -ENOMEM if memory could not be allocated.
*/
int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
XA_STATE(xas, xa, index);
void *curr;
if (WARN_ON_ONCE(xa_is_advanced(entry)))
return -EINVAL;
if (!entry)
entry = XA_ZERO_ENTRY;
do {
curr = xas_load(&xas);
if (!curr) {
xas_store(&xas, entry);
if (xa_track_free(xa))
xas_clear_mark(&xas, XA_FREE_MARK);
} else {
xas_set_err(&xas, -EBUSY);
}
} while (__xas_nomem(&xas, gfp));
return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_insert);
#ifdef CONFIG_XARRAY_MULTI
static void xas_set_range(struct xa_state *xas, unsigned long first,
unsigned long last)
{
unsigned int shift = 0;
unsigned long sibs = last - first;
unsigned int offset = XA_CHUNK_MASK;
xas_set(xas, first);
while ((first & XA_CHUNK_MASK) == 0) {
if (sibs < XA_CHUNK_MASK)
break;
if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK))
break;
shift += XA_CHUNK_SHIFT;
if (offset == XA_CHUNK_MASK)
offset = sibs & XA_CHUNK_MASK;
sibs >>= XA_CHUNK_SHIFT;
first >>= XA_CHUNK_SHIFT;
}
offset = first & XA_CHUNK_MASK;
if (offset + sibs > XA_CHUNK_MASK)
sibs = XA_CHUNK_MASK - offset;
if ((((first + sibs + 1) << shift) - 1) > last)
sibs -= 1;
xas->xa_shift = shift;
xas->xa_sibs = sibs;
}
/**
* xa_store_range() - Store this entry at a range of indices in the XArray.
* @xa: XArray.
* @first: First index to affect.
* @last: Last index to affect.
* @entry: New entry.
* @gfp: Memory allocation flags.
*
* After this function returns, loads from any index between @first and @last,
* inclusive will return @entry.
* Storing into an existing multi-index entry updates the entry of every index.
* The marks associated with @index are unaffected unless @entry is %NULL.
*
* Context: Process context. Takes and releases the xa_lock. May sleep
* if the @gfp flags permit.
* Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in
* an XArray, or xa_err(-ENOMEM) if memory allocation failed.
*/
void *xa_store_range(struct xarray *xa, unsigned long first,
unsigned long last, void *entry, gfp_t gfp)
{
XA_STATE(xas, xa, 0);
if (WARN_ON_ONCE(xa_is_internal(entry)))
return XA_ERROR(-EINVAL);
if (last < first)
return XA_ERROR(-EINVAL);
do {
xas_lock(&xas);
if (entry) {
unsigned int order = BITS_PER_LONG;
if (last + 1)
order = __ffs(last + 1);
xas_set_order(&xas, last, order);
xas_create(&xas, true);
if (xas_error(&xas))
goto unlock;
}
do {
xas_set_range(&xas, first, last);
xas_store(&xas, entry);
if (xas_error(&xas))
goto unlock;
first += xas_size(&xas);
} while (first <= last);
unlock:
xas_unlock(&xas);
} while (xas_nomem(&xas, gfp));
return xas_result(&xas, NULL);
}
EXPORT_SYMBOL(xa_store_range);
/**
* xa_get_order() - Get the order of an entry.
* @xa: XArray.
* @index: Index of the entry.
*
* Return: A number between 0 and 63 indicating the order of the entry.
*/
int xa_get_order(struct xarray *xa, unsigned long index)
{
XA_STATE(xas, xa, index);
void *entry;
int order = 0;
rcu_read_lock();
entry = xas_load(&xas);
if (!entry)
goto unlock;
if (!xas.xa_node)
goto unlock;
for (;;) {
unsigned int slot = xas.xa_offset + (1 << order);
if (slot >= XA_CHUNK_SIZE)
break;
if (!xa_is_sibling(xas.xa_node->slots[slot]))
break;
order++;
}
order += xas.xa_node->shift;
unlock:
rcu_read_unlock();
return order;
}
EXPORT_SYMBOL(xa_get_order);
#endif /* CONFIG_XARRAY_MULTI */
/**
* __xa_alloc() - Find somewhere to store this entry in the XArray.
* @xa: XArray.
* @id: Pointer to ID.
* @limit: Range for allocated ID.
* @entry: New entry.
* @gfp: Memory allocation flags.
*
* Finds an empty entry in @xa between @limit.min and @limit.max,
* stores the index into the @id pointer, then stores the entry at
* that index. A concurrent lookup will not see an uninitialised @id.
*
* Context: Any context. Expects xa_lock to be held on entry. May
* release and reacquire xa_lock if @gfp flags permit.
* Return: 0 on success, -ENOMEM if memory could not be allocated or
* -EBUSY if there are no free entries in @limit.
*/
int __xa_alloc(struct xarray *xa, u32 *id, void *entry,
struct xa_limit limit, gfp_t gfp)
{
XA_STATE(xas, xa, 0);
if (WARN_ON_ONCE(xa_is_advanced(entry)))
return -EINVAL;
if (WARN_ON_ONCE(!xa_track_free(xa)))
return -EINVAL;
if (!entry)
entry = XA_ZERO_ENTRY;
do {
xas.xa_index = limit.min;
xas_find_marked(&xas, limit.max, XA_FREE_MARK);
if (xas.xa_node == XAS_RESTART)
xas_set_err(&xas, -EBUSY);
else
*id = xas.xa_index;
xas_store(&xas, entry);
xas_clear_mark(&xas, XA_FREE_MARK);
} while (__xas_nomem(&xas, gfp));
return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_alloc);
/**
* __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
* @xa: XArray.
* @id: Pointer to ID.
* @entry: New entry.
* @limit: Range of allocated ID.
* @next: Pointer to next ID to allocate.
* @gfp: Memory allocation flags.
*
* Finds an empty entry in @xa between @limit.min and @limit.max,
* stores the index into the @id pointer, then stores the entry at
* that index. A concurrent lookup will not see an uninitialised @id.
* The search for an empty entry will start at @next and will wrap
* around if necessary.
*
* Context: Any context. Expects xa_lock to be held on entry. May
* release and reacquire xa_lock if @gfp flags permit.
* Return: 0 if the allocation succeeded without wrapping. 1 if the
* allocation succeeded after wrapping, -ENOMEM if memory could not be
* allocated or -EBUSY if there are no free entries in @limit.
*/
int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
struct xa_limit limit, u32 *next, gfp_t gfp)
{
u32 min = limit.min;
int ret;
limit.min = max(min, *next);
ret = __xa_alloc(xa, id, entry, limit, gfp);
if ((xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED) && ret == 0) {
xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED;
ret = 1;
}
if (ret < 0 && limit.min > min) {
limit.min = min;
ret = __xa_alloc(xa, id, entry, limit, gfp);
if (ret == 0)
ret = 1;
}
if (ret >= 0) {
*next = *id + 1;
if (*next == 0)
xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED;
}
return ret;
}
EXPORT_SYMBOL(__xa_alloc_cyclic);
/**
* __xa_set_mark() - Set this mark on this entry while locked.
* @xa: XArray.
* @index: Index of entry.
* @mark: Mark number.
*
* Attempting to set a mark on a %NULL entry does not succeed.
*
* Context: Any context. Expects xa_lock to be held on entry.
*/
void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
XA_STATE(xas, xa, index);
void *entry = xas_load(&xas);
if (entry)
xas_set_mark(&xas, mark);
}
EXPORT_SYMBOL(__xa_set_mark);
/**
* __xa_clear_mark() - Clear this mark on this entry while locked.
* @xa: XArray.
* @index: Index of entry.
* @mark: Mark number.
*
* Context: Any context. Expects xa_lock to be held on entry.
*/
void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
XA_STATE(xas, xa, index);
void *entry = xas_load(&xas);
if (entry)
xas_clear_mark(&xas, mark);
}
EXPORT_SYMBOL(__xa_clear_mark);
/**
* xa_get_mark() - Inquire whether this mark is set on this entry.
* @xa: XArray.
* @index: Index of entry.
* @mark: Mark number.
*
* This function uses the RCU read lock, so the result may be out of date
* by the time it returns. If you need the result to be stable, use a lock.
*
* Context: Any context. Takes and releases the RCU lock.
* Return: True if the entry at @index has this mark set, false if it doesn't.
*/
bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
XA_STATE(xas, xa, index);
void *entry;
rcu_read_lock();
entry = xas_start(&xas);
while (xas_get_mark(&xas, mark)) {
if (!xa_is_node(entry))
goto found;
entry = xas_descend(&xas, xa_to_node(entry));
}
rcu_read_unlock();
return false;
found:
rcu_read_unlock();
return true;
}
EXPORT_SYMBOL(xa_get_mark);
/**
* xa_set_mark() - Set this mark on this entry.
* @xa: XArray.
* @index: Index of entry.
* @mark: Mark number.
*
* Attempting to set a mark on a %NULL entry does not succeed.
*
* Context: Process context. Takes and releases the xa_lock.
*/
void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
xa_lock(xa);
__xa_set_mark(xa, index, mark);
xa_unlock(xa);
}
EXPORT_SYMBOL(xa_set_mark);
/**
* xa_clear_mark() - Clear this mark on this entry.
* @xa: XArray.
* @index: Index of entry.
* @mark: Mark number.
*
* Clearing a mark always succeeds.
*
* Context: Process context. Takes and releases the xa_lock.
*/
void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
xa_lock(xa);
__xa_clear_mark(xa, index, mark);
xa_unlock(xa);
}
EXPORT_SYMBOL(xa_clear_mark);
/**
* xa_find() - Search the XArray for an entry.
* @xa: XArray.
* @indexp: Pointer to an index.
* @max: Maximum index to search to.
* @filter: Selection criterion.
*
* Finds the entry in @xa which matches the @filter, and has the lowest
* index that is at least @indexp and no more than @max.
* If an entry is found, @indexp is updated to be the index of the entry.
* This function is protected by the RCU read lock, so it may not find
* entries which are being simultaneously added. It will not return an
* %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
*
* Context: Any context. Takes and releases the RCU lock.
* Return: The entry, if found, otherwise %NULL.
*/
void *xa_find(struct xarray *xa, unsigned long *indexp,
unsigned long max, xa_mark_t filter)
{
XA_STATE(xas, xa, *indexp);
void *entry;
rcu_read_lock();
do {
if ((__force unsigned int)filter < XA_MAX_MARKS) entry = xas_find_marked(&xas, max, filter);
else
entry = xas_find(&xas, max);
} while (xas_retry(&xas, entry));
rcu_read_unlock();
if (entry)
*indexp = xas.xa_index; return entry;
}
EXPORT_SYMBOL(xa_find);
static bool xas_sibling(struct xa_state *xas)
{
struct xa_node *node = xas->xa_node;
unsigned long mask;
if (!IS_ENABLED(CONFIG_XARRAY_MULTI) || !node)
return false;
mask = (XA_CHUNK_SIZE << node->shift) - 1;
return (xas->xa_index & mask) >
((unsigned long)xas->xa_offset << node->shift);
}
/**
* xa_find_after() - Search the XArray for a present entry.
* @xa: XArray.
* @indexp: Pointer to an index.
* @max: Maximum index to search to.
* @filter: Selection criterion.
*
* Finds the entry in @xa which matches the @filter and has the lowest
* index that is above @indexp and no more than @max.
* If an entry is found, @indexp is updated to be the index of the entry.
* This function is protected by the RCU read lock, so it may miss entries
* which are being simultaneously added. It will not return an
* %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
*
* Context: Any context. Takes and releases the RCU lock.
* Return: The pointer, if found, otherwise %NULL.
*/
void *xa_find_after(struct xarray *xa, unsigned long *indexp,
unsigned long max, xa_mark_t filter)
{
XA_STATE(xas, xa, *indexp + 1);
void *entry;
if (xas.xa_index == 0)
return NULL;
rcu_read_lock();
for (;;) {
if ((__force unsigned int)filter < XA_MAX_MARKS)
entry = xas_find_marked(&xas, max, filter);
else
entry = xas_find(&xas, max);
if (xas_invalid(&xas))
break;
if (xas_sibling(&xas))
continue;
if (!xas_retry(&xas, entry))
break;
}
rcu_read_unlock();
if (entry)
*indexp = xas.xa_index;
return entry;
}
EXPORT_SYMBOL(xa_find_after);
static unsigned int xas_extract_present(struct xa_state *xas, void **dst,
unsigned long max, unsigned int n)
{
void *entry;
unsigned int i = 0;
rcu_read_lock();
xas_for_each(xas, entry, max) {
if (xas_retry(xas, entry))
continue;
dst[i++] = entry;
if (i == n)
break;
}
rcu_read_unlock();
return i;
}
static unsigned int xas_extract_marked(struct xa_state *xas, void **dst,
unsigned long max, unsigned int n, xa_mark_t mark)
{
void *entry;
unsigned int i = 0;
rcu_read_lock();
xas_for_each_marked(xas, entry, max, mark) {
if (xas_retry(xas, entry))
continue;
dst[i++] = entry;
if (i == n)
break;
}
rcu_read_unlock();
return i;
}
/**
* xa_extract() - Copy selected entries from the XArray into a normal array.
* @xa: The source XArray to copy from.
* @dst: The buffer to copy entries into.
* @start: The first index in the XArray eligible to be selected.
* @max: The last index in the XArray eligible to be selected.
* @n: The maximum number of entries to copy.
* @filter: Selection criterion.
*
* Copies up to @n entries that match @filter from the XArray. The
* copied entries will have indices between @start and @max, inclusive.
*
* The @filter may be an XArray mark value, in which case entries which are
* marked with that mark will be copied. It may also be %XA_PRESENT, in
* which case all entries which are not %NULL will be copied.
*
* The entries returned may not represent a snapshot of the XArray at a
* moment in time. For example, if another thread stores to index 5, then
* index 10, calling xa_extract() may return the old contents of index 5
* and the new contents of index 10. Indices not modified while this
* function is running will not be skipped.
*
* If you need stronger guarantees, holding the xa_lock across calls to this
* function will prevent concurrent modification.
*
* Context: Any context. Takes and releases the RCU lock.
* Return: The number of entries copied.
*/
unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start,
unsigned long max, unsigned int n, xa_mark_t filter)
{
XA_STATE(xas, xa, start);
if (!n)
return 0;
if ((__force unsigned int)filter < XA_MAX_MARKS)
return xas_extract_marked(&xas, dst, max, n, filter);
return xas_extract_present(&xas, dst, max, n);
}
EXPORT_SYMBOL(xa_extract);
/**
* xa_delete_node() - Private interface for workingset code.
* @node: Node to be removed from the tree.
* @update: Function to call to update ancestor nodes.
*
* Context: xa_lock must be held on entry and will not be released.
*/
void xa_delete_node(struct xa_node *node, xa_update_node_t update)
{
struct xa_state xas = {
.xa = node->array,
.xa_index = (unsigned long)node->offset <<
(node->shift + XA_CHUNK_SHIFT),
.xa_shift = node->shift + XA_CHUNK_SHIFT,
.xa_offset = node->offset,
.xa_node = xa_parent_locked(node->array, node),
.xa_update = update,
};
xas_store(&xas, NULL);
}
EXPORT_SYMBOL_GPL(xa_delete_node); /* For the benefit of the test suite */
/**
* xa_destroy() - Free all internal data structures.
* @xa: XArray.
*
* After calling this function, the XArray is empty and has freed all memory
* allocated for its internal data structures. You are responsible for
* freeing the objects referenced by the XArray.
*
* Context: Any context. Takes and releases the xa_lock, interrupt-safe.
*/
void xa_destroy(struct xarray *xa)
{
XA_STATE(xas, xa, 0);
unsigned long flags;
void *entry;
xas.xa_node = NULL;
xas_lock_irqsave(&xas, flags);
entry = xa_head_locked(xa);
RCU_INIT_POINTER(xa->xa_head, NULL);
xas_init_marks(&xas);
if (xa_zero_busy(xa))
xa_mark_clear(xa, XA_FREE_MARK);
/* lockdep checks we're still holding the lock in xas_free_nodes() */
if (xa_is_node(entry))
xas_free_nodes(&xas, xa_to_node(entry));
xas_unlock_irqrestore(&xas, flags);
}
EXPORT_SYMBOL(xa_destroy);
#ifdef XA_DEBUG
void xa_dump_node(const struct xa_node *node)
{
unsigned i, j;
if (!node)
return;
if ((unsigned long)node & 3) {
pr_cont("node %px\n", node);
return;
}
pr_cont("node %px %s %d parent %px shift %d count %d values %d "
"array %px list %px %px marks",
node, node->parent ? "offset" : "max", node->offset,
node->parent, node->shift, node->count, node->nr_values,
node->array, node->private_list.prev, node->private_list.next);
for (i = 0; i < XA_MAX_MARKS; i++)
for (j = 0; j < XA_MARK_LONGS; j++)
pr_cont(" %lx", node->marks[i][j]);
pr_cont("\n");
}
void xa_dump_index(unsigned long index, unsigned int shift)
{
if (!shift)
pr_info("%lu: ", index);
else if (shift >= BITS_PER_LONG)
pr_info("0-%lu: ", ~0UL);
else
pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1));
}
void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift)
{
if (!entry)
return;
xa_dump_index(index, shift);
if (xa_is_node(entry)) {
if (shift == 0) {
pr_cont("%px\n", entry);
} else {
unsigned long i;
struct xa_node *node = xa_to_node(entry);
xa_dump_node(node);
for (i = 0; i < XA_CHUNK_SIZE; i++)
xa_dump_entry(node->slots[i],
index + (i << node->shift), node->shift);
}
} else if (xa_is_value(entry))
pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry),
xa_to_value(entry), entry);
else if (!xa_is_internal(entry))
pr_cont("%px\n", entry);
else if (xa_is_retry(entry))
pr_cont("retry (%ld)\n", xa_to_internal(entry));
else if (xa_is_sibling(entry))
pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry));
else if (xa_is_zero(entry))
pr_cont("zero (%ld)\n", xa_to_internal(entry));
else
pr_cont("UNKNOWN ENTRY (%px)\n", entry);
}
void xa_dump(const struct xarray *xa)
{
void *entry = xa->xa_head;
unsigned int shift = 0;
pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry,
xa->xa_flags, xa_marked(xa, XA_MARK_0),
xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2));
if (xa_is_node(entry))
shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT;
xa_dump_entry(entry, 0, shift);
}
#endif
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set btrfs_bioset;
static inline bool extent_state_in_tree(const struct extent_state *state)
{
return !RB_EMPTY_NODE(&state->rb_node);
}
#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(states);
static DEFINE_SPINLOCK(leak_lock);
static inline void btrfs_leak_debug_add(spinlock_t *lock,
struct list_head *new,
struct list_head *head)
{
unsigned long flags;
spin_lock_irqsave(lock, flags);
list_add(new, head);
spin_unlock_irqrestore(lock, flags);
}
static inline void btrfs_leak_debug_del(spinlock_t *lock,
struct list_head *entry)
{
unsigned long flags;
spin_lock_irqsave(lock, flags);
list_del(entry);
spin_unlock_irqrestore(lock, flags);
}
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
struct extent_buffer *eb;
unsigned long flags;
/*
* If we didn't get into open_ctree our allocated_ebs will not be
* initialized, so just skip this.
*/
if (!fs_info->allocated_ebs.next)
return;
spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
while (!list_empty(&fs_info->allocated_ebs)) {
eb = list_first_entry(&fs_info->allocated_ebs,
struct extent_buffer, leak_list);
pr_err(
"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
btrfs_header_owner(eb));
list_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}
static inline void btrfs_extent_state_leak_debug_check(void)
{
struct extent_state *state;
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
state->start, state->end, state->state,
extent_state_in_tree(state),
refcount_read(&state->refs));
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
}
}
#define btrfs_debug_check_extent_io_range(tree, start, end) \
__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree, u64 start, u64 end)
{
struct inode *inode = tree->private_data;
u64 isize;
if (!inode || !is_data_inode(inode))
return;
isize = i_size_read(inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
"%s: ino %llu isize %llu odd range [%llu,%llu]",
caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
}
}
#else
#define btrfs_leak_debug_add(lock, new, head) do {} while (0)
#define btrfs_leak_debug_del(lock, entry) do {} while (0)
#define btrfs_extent_state_leak_debug_check() do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
struct tree_entry {
u64 start;
u64 end;
struct rb_node rb_node;
};
struct extent_page_data {
struct btrfs_bio_ctrl bio_ctrl;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
unsigned int extent_locked:1;
/* tells the submit_bio code to use REQ_SYNC */
unsigned int sync_io:1;
};
static int add_extent_changeset(struct extent_state *state, u32 bits,
struct extent_changeset *changeset,
int set)
{
int ret;
if (!changeset)
return 0;
if (set && (state->state & bits) == bits)
return 0;
if (!set && (state->state & bits) == 0)
return 0;
changeset->bytes_changed += state->end - state->start + 1;
ret = ulist_add(&changeset->range_changed, state->start, state->end,
GFP_ATOMIC);
return ret;
}
int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags)
{
blk_status_t ret = 0;
struct extent_io_tree *tree = bio->bi_private;
bio->bi_private = NULL;
/* Caller should ensure the bio has at least some range added */
ASSERT(bio->bi_iter.bi_size);
if (is_data_inode(tree->private_data)) ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
bio_flags);
else
ret = btrfs_submit_metadata_bio(tree->private_data, bio,
mirror_num, bio_flags);
return blk_status_to_errno(ret);
}
/* Cleanup unsubmitted bios */
static void end_write_bio(struct extent_page_data *epd, int ret)
{
struct bio *bio = epd->bio_ctrl.bio;
if (bio) {
bio->bi_status = errno_to_blk_status(ret);
bio_endio(bio);
epd->bio_ctrl.bio = NULL;
}
}
/*
* Submit bio from extent page data via submit_one_bio
*
* Return 0 if everything is OK.
* Return <0 for error.
*/
static int __must_check flush_write_bio(struct extent_page_data *epd)
{
int ret = 0;
struct bio *bio = epd->bio_ctrl.bio;
if (bio) {
ret = submit_one_bio(bio, 0, 0);
/*
* Clean up of epd->bio is handled by its endio function.
* And endio is either triggered by successful bio execution
* or the error handler of submit bio hook.
* So at this point, no matter what happened, we don't need
* to clean up epd->bio.
*/
epd->bio_ctrl.bio = NULL;
}
return ret;
}
int __init extent_state_cache_init(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0,
SLAB_MEM_SPREAD, NULL);
if (!extent_state_cache)
return -ENOMEM;
return 0;
}
int __init extent_io_init(void)
{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
return -ENOMEM;
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_io_bio, bio),
BIOSET_NEED_BVECS))
goto free_buffer_cache;
if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
goto free_bioset;
return 0;
free_bioset:
bioset_exit(&btrfs_bioset);
free_buffer_cache:
kmem_cache_destroy(extent_buffer_cache);
extent_buffer_cache = NULL;
return -ENOMEM;
}
void __cold extent_state_cache_exit(void)
{
btrfs_extent_state_leak_debug_check();
kmem_cache_destroy(extent_state_cache);
}
void __cold extent_io_exit(void)
{
/*
* Make sure all delayed rcu free are flushed before we
* destroy caches.
*/
rcu_barrier();
kmem_cache_destroy(extent_buffer_cache);
bioset_exit(&btrfs_bioset);
}
/*
* For the file_extent_tree, we want to hold the inode lock when we lookup and
* update the disk_i_size, but lockdep will complain because our io_tree we hold
* the tree lock and get the inode lock when setting delalloc. These two things
* are unrelated, so make a class for the file_extent_tree so we don't get the
* two locking patterns mixed up.
*/
static struct lock_class_key file_extent_tree_class;
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data)
{
tree->fs_info = fs_info;
tree->state = RB_ROOT;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
tree->private_data = private_data;
tree->owner = owner;
if (owner == IO_TREE_INODE_FILE_EXTENT)
lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
void extent_io_tree_release(struct extent_io_tree *tree)
{
spin_lock(&tree->lock);
/*
* Do a single barrier for the waitqueue_active check here, the state
* of the waitqueue should not change once extent_io_tree_release is
* called.
*/
smp_mb();
while (!RB_EMPTY_ROOT(&tree->state)) {
struct rb_node *node;
struct extent_state *state;
node = rb_first(&tree->state);
state = rb_entry(node, struct extent_state, rb_node);
rb_erase(&state->rb_node, &tree->state);
RB_CLEAR_NODE(&state->rb_node);
/*
* btree io trees aren't supposed to have tasks waiting for
* changes in the flags of extent states ever.
*/
ASSERT(!waitqueue_active(&state->wq));
free_extent_state(state);
cond_resched_lock(&tree->lock);
}
spin_unlock(&tree->lock);
}
static struct extent_state *alloc_extent_state(gfp_t mask)
{
struct extent_state *state;
/*
* The given mask might be not appropriate for the slab allocator,
* drop the unsupported bits
*/
mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
state = kmem_cache_alloc(extent_state_cache, mask);
if (!state)
return state;
state->state = 0;
state->failrec = NULL;
RB_CLEAR_NODE(&state->rb_node);
btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
trace_alloc_extent_state(state, mask, _RET_IP_);
return state;
}
void free_extent_state(struct extent_state *state)
{
if (!state)
return;
if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del(&leak_lock, &state->leak_list);
trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state);
}
}
static struct rb_node *tree_insert(struct rb_root *root,
struct rb_node *search_start,
u64 offset,
struct rb_node *node,
struct rb_node ***p_in,
struct rb_node **parent_in)
{
struct rb_node **p;
struct rb_node *parent = NULL;
struct tree_entry *entry;
if (p_in && parent_in) { p = *p_in;
parent = *parent_in;
goto do_insert;
}
p = search_start ? &search_start : &root->rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct tree_entry, rb_node);
if (offset < entry->start) p = &(*p)->rb_left; else if (offset > entry->end) p = &(*p)->rb_right;
else
return parent;
}
do_insert:
rb_link_node(node, parent, p);
rb_insert_color(node, root);
return NULL;
}
/**
* Search @tree for an entry that contains @offset. Such entry would have
* entry->start <= offset && entry->end >= offset.
*
* @tree: the tree to search
* @offset: offset that should fall within an entry in @tree
* @next_ret: pointer to the first entry whose range ends after @offset
* @prev_ret: pointer to the first entry whose range begins before @offset
* @p_ret: pointer where new node should be anchored (used when inserting an
* entry in the tree)
* @parent_ret: points to entry which would have been the parent of the entry,
* containing @offset
*
* This function returns a pointer to the entry that contains @offset byte
* address. If no such entry exists, then NULL is returned and the other
* pointer arguments to the function are filled, otherwise the found entry is
* returned and other pointers are left untouched.
*/
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
struct rb_node **next_ret,
struct rb_node **prev_ret,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
struct rb_root *root = &tree->state;
struct rb_node **n = &root->rb_node;
struct rb_node *prev = NULL;
struct rb_node *orig_prev = NULL;
struct tree_entry *entry;
struct tree_entry *prev_entry = NULL;
while (*n) {
prev = *n;
entry = rb_entry(prev, struct tree_entry, rb_node);
prev_entry = entry;
if (offset < entry->start) n = &(*n)->rb_left; else if (offset > entry->end) n = &(*n)->rb_right;
else
return *n;
}
if (p_ret) *p_ret = n; if (parent_ret) *parent_ret = prev;
if (next_ret) {
orig_prev = prev;
while (prev && offset > prev_entry->end) { prev = rb_next(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
*next_ret = prev;
prev = orig_prev;
}
if (prev_ret) {
prev_entry = rb_entry(prev, struct tree_entry, rb_node); while (prev && offset < prev_entry->start) { prev = rb_prev(prev);
prev_entry = rb_entry(prev, struct tree_entry, rb_node);
}
*prev_ret = prev;
}
return NULL;
}
static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
u64 offset,
struct rb_node ***p_ret,
struct rb_node **parent_ret)
{
struct rb_node *next= NULL;
struct rb_node *ret;
ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
if (!ret) return next;
return ret;
}
static inline struct rb_node *tree_search(struct extent_io_tree *tree,
u64 offset)
{
return tree_search_for_insert(tree, offset, NULL, NULL);
}
/*
* utility function to look for merge candidates inside a given range.
* Any extents with matching state are merged together into a single
* extent in the tree. Extents with EXTENT_IO in their state field
* are not merged because the end_io handlers need to be able to do
* operations on them without sleeping (or doing allocations/splits).
*
* This should be called with the tree lock held.
*/
static void merge_state(struct extent_io_tree *tree,
struct extent_state *state)
{
struct extent_state *other;
struct rb_node *other_node;
if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
return;
other_node = rb_prev(&state->rb_node);
if (other_node) {
other = rb_entry(other_node, struct extent_state, rb_node); if (other->end == state->start - 1 && other->state == state->state) { if (tree->private_data &&
is_data_inode(tree->private_data))
btrfs_merge_delalloc_extent(tree->private_data,
state, other);
state->start = other->start;
rb_erase(&other->rb_node, &tree->state);
RB_CLEAR_NODE(&other->rb_node);
free_extent_state(other);
}
}
other_node = rb_next(&state->rb_node);
if (other_node) {
other = rb_entry(other_node, struct extent_state, rb_node); if (other->start == state->end + 1 && other->state == state->state) { if (tree->private_data &&
is_data_inode(tree->private_data))
btrfs_merge_delalloc_extent(tree->private_data,
state, other);
state->end = other->end;
rb_erase(&other->rb_node, &tree->state);
RB_CLEAR_NODE(&other->rb_node);
free_extent_state(other);
}
}
}
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state, u32 *bits,
struct extent_changeset *changeset);
/*
* insert an extent_state struct into the tree. 'bits' are set on the
* struct before it is inserted.
*
* This may return -EEXIST if the extent is already there, in which case the
* state struct is freed.
*
* The tree lock is not taken internally. This is a utility function and
* probably isn't what you want to call (see set/clear_extent_bit).
*/
static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
struct rb_node ***p,
struct rb_node **parent,
u32 *bits, struct extent_changeset *changeset)
{
struct rb_node *node;
if (end < start) {
btrfs_err(tree->fs_info,
"insert state: end < start %llu %llu", end, start);
WARN_ON(1);
}
state->start = start;
state->end = end;
set_state_bits(tree, state, bits, changeset);
node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
if (node) {
struct extent_state *found;
found = rb_entry(node, struct extent_state, rb_node);
btrfs_err(tree->fs_info,
"found node %llu %llu on insert of %llu %llu",
found->start, found->end, start, end);
return -EEXIST;
}
merge_state(tree, state);
return 0;
}
/*
* split a given extent state struct in two, inserting the preallocated
* struct 'prealloc' as the newly created second half. 'split' indicates an
* offset inside 'orig' where it should be split.
*
* Before calling,
* the tree has 'orig' at [orig->start, orig->end]. After calling, there
* are two extent state structs in the tree:
* prealloc: [orig->start, split - 1]
* orig: [ split, orig->end ]
*
* The tree locks are not taken by this function. They need to be held
* by the caller.
*/
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct extent_state *prealloc, u64 split)
{
struct rb_node *node;
if (tree->private_data && is_data_inode(tree->private_data)) btrfs_split_delalloc_extent(tree->private_data, orig, split); prealloc->start = orig->start;
prealloc->end = split - 1;
prealloc->state = orig->state;
orig->start = split;
node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
&prealloc->rb_node, NULL, NULL);
if (node) {
free_extent_state(prealloc);
return -EEXIST;
}
return 0;
}
static struct extent_state *next_state(struct extent_state *state)
{
struct rb_node *next = rb_next(&state->rb_node);
if (next)
return rb_entry(next, struct extent_state, rb_node);
else
return NULL;
}
/*
* utility function to clear some bits in an extent state struct.
* it will optionally wake up anyone waiting on this state (wake == 1).
*
* If no bits are set on the state struct after clearing things, the
* struct is freed and removed from the tree
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
u32 *bits, int wake,
struct extent_changeset *changeset)
{
struct extent_state *next;
u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
int ret;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; WARN_ON(range > tree->dirty_bytes); tree->dirty_bytes -= range;
}
if (tree->private_data && is_data_inode(tree->private_data)) btrfs_clear_delalloc_extent(tree->private_data, state, bits);
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0); state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq); if (state->state == 0) {
next = next_state(state);
if (extent_state_in_tree(state)) {
rb_erase(&state->rb_node, &tree->state);
RB_CLEAR_NODE(&state->rb_node);
free_extent_state(state);
} else {
WARN_ON(1);
}
} else {
merge_state(tree, state);
next = next_state(state);
}
return next;
}
static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
if (!prealloc) prealloc = alloc_extent_state(GFP_ATOMIC);
return prealloc;
}
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
btrfs_panic(tree->fs_info, err,
"locking error: extent tree was modified by another thread while locked");
}
/*
* clear some bits on a range in the tree. This may require splitting
* or inserting elements in the tree, so the gfp mask is used to
* indicate which allocations or sleeping are allowed.
*
* pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
* the given range from the tree regardless of state (ie for truncate).
*
* the range [start, end] is inclusive.
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, int wake, int delete,
struct extent_state **cached_state,
gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
struct extent_state *prealloc = NULL;
struct rb_node *node;
u64 last_end;
int err;
int clear = 0;
btrfs_debug_check_extent_io_range(tree, start, end);
trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; if (delete) bits |= ~EXTENT_CTLBITS; if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
clear = 1;
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
* is the case if we only have in the tree extent states that
* cover our input range and don't cover too any other range.
* If we end up needing a new extent state we allocate it later.
*/
prealloc = alloc_extent_state(mask);
}
spin_lock(&tree->lock);
if (cached_state) {
cached = *cached_state;
if (clear) {
*cached_state = NULL;
cached_state = NULL;
}
if (cached && extent_state_in_tree(cached) && cached->start <= start && cached->end > start) { if (clear) refcount_dec(&cached->refs);
state = cached;
goto hit_next;
}
if (clear) free_extent_state(cached);
}
/*
* this search will find the extents that end after
* our range starts
*/
node = tree_search(tree, start);
if (!node)
goto out;
state = rb_entry(node, struct extent_state, rb_node);
hit_next:
if (state->start > end)
goto out;
WARN_ON(state->end < start);
last_end = state->end;
/* the state doesn't have the wanted bits, go ahead */
if (!(state->state & bits)) {
state = next_state(state);
goto next;
}
/*
* | ---- desired range ---- |
* | state | or
* | ------------- state -------------- |
*
* We need to split the extent we found, and may flip
* bits on second half.
*
* If the extent we found extends past our range, we
* just split and search again. It'll get split again
* the next time though.
*
* If the extent we found is inside our range, we clear
* the desired bit on it.
*/
if (state->start < start) {
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start);
if (err)
extent_io_tree_panic(tree, err);
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) { state = clear_state_bit(tree, state, &bits, wake,
changeset);
goto next;
}
goto search_again;
}
/*
* | ---- desired range ---- |
* | state |
* We need to split the extent, and clear the bit
* on the first half
*/
if (state->start <= end && state->end > end) {
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1);
if (err)
extent_io_tree_panic(tree, err);
if (wake) wake_up(&state->wq); clear_state_bit(tree, prealloc, &bits, wake, changeset);
prealloc = NULL;
goto out;
}
state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
if (last_end == (u64)-1)
goto out;
start = last_end + 1; if (start <= end && state && !need_resched())
goto hit_next;
search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
out:
spin_unlock(&tree->lock);
if (prealloc)
free_extent_state(prealloc); return 0;
}
static void wait_on_state(struct extent_io_tree *tree,
struct extent_state *state)
__releases(tree->lock)
__acquires(tree->lock)
{
DEFINE_WAIT(wait);
prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock(&tree->lock);
schedule();
spin_lock(&tree->lock);
finish_wait(&state->wq, &wait);
}
/*
* waits for one or more bits to clear on a range in the state tree.
* The range [start, end] is inclusive.
* The tree lock is taken by this function
*/
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits)
{
struct extent_state *state;
struct rb_node *node;
btrfs_debug_check_extent_io_range(tree, start, end);
spin_lock(&tree->lock);
again:
while (1) {
/*
* this search will find all the extents that end after
* our range starts
*/
node = tree_search(tree, start);
process_node:
if (!node)
break;
state = rb_entry(node, struct extent_state, rb_node);
if (state->start > end)
goto out;
if (state->state & bits) {
start = state->start;
refcount_inc(&state->refs);
wait_on_state(tree, state);
free_extent_state(state);
goto again;
}
start = state->end + 1;
if (start > end)
break;
if (!cond_resched_lock(&tree->lock)) { node = rb_next(node);
goto process_node;
}
}
out:
spin_unlock(&tree->lock);
}
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
u32 *bits, struct extent_changeset *changeset)
{
u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
int ret;
if (tree->private_data && is_data_inode(tree->private_data)) btrfs_set_delalloc_extent(tree->private_data, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
ret = add_extent_changeset(state, bits_to_set, changeset, 1);
BUG_ON(ret < 0); state->state |= bits_to_set;
}
static void cache_state_if_flags(struct extent_state *state,
struct extent_state **cached_ptr,
unsigned flags)
{
if (cached_ptr && !(*cached_ptr)) { if (!flags || (state->state & flags)) { *cached_ptr = state;
refcount_inc(&state->refs);
}
}
}
static void cache_state(struct extent_state *state,
struct extent_state **cached_ptr)
{
return cache_state_if_flags(state, cached_ptr,
EXTENT_LOCKED | EXTENT_BOUNDARY);
}
/*
* set some bits on a range in the tree. This may require allocations or
* sleeping, so the gfp mask is used to indicate what is allowed.
*
* If any of the exclusive bits are set, this will fail with -EEXIST if some
* part of the range already has the desired bits set. The start of the
* existing range is returned in failed_start in this case.
*
* [start, end] is inclusive This takes the tree lock.
*/
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
u32 exclusive_bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask,
struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct rb_node *node;
struct rb_node **p;
struct rb_node *parent;
int err = 0;
u64 last_start;
u64 last_end;
btrfs_debug_check_extent_io_range(tree, start, end);
trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
if (exclusive_bits)
ASSERT(failed_start);
else
ASSERT(failed_start == NULL);
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
* is the case if we only have in the tree extent states that
* cover our input range and don't cover too any other range.
* If we end up needing a new extent state we allocate it later.
*/
prealloc = alloc_extent_state(mask);
}
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
if (state->start <= start && state->end > start &&
extent_state_in_tree(state)) {
node = &state->rb_node;
goto hit_next;
}
}
/*
* this search will find all the extents that end after
* our range starts.
*/
node = tree_search_for_insert(tree, start, &p, &parent);
if (!node) {
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc); err = insert_state(tree, prealloc, start, end,
&p, &parent, &bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
prealloc = NULL;
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
hit_next:
last_start = state->start;
last_end = state->end;
/*
* | ---- desired range ---- |
* | state |
*
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) { if (state->state & exclusive_bits) { *failed_start = state->start;
err = -EEXIST;
goto out;
}
set_state_bits(tree, state, &bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
state = next_state(state);
if (start < end && state && state->start == start &&
!need_resched())
goto hit_next;
goto search_again;
}
/*
* | ---- desired range ---- |
* | state |
* or
* | ------------- state -------------- |
*
* We need to split the extent we found, and may flip bits on
* second half.
*
* If the extent we found extends past our
* range, we just split and search again. It'll get split
* again the next time though.
*
* If the extent we found is inside our range, we set the
* desired bit on it.
*/
if (state->start < start) { if (state->state & exclusive_bits) { *failed_start = start;
err = -EEXIST;
goto out;
}
/*
* If this extent already has all the bits we want set, then
* skip it, not necessary to split it or do anything with it.
*/
if ((state->state & bits) == bits) { start = state->end + 1;
cache_state(state, cached_state);
goto search_again;
}
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start);
if (err)
extent_io_tree_panic(tree, err);
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) { set_state_bits(tree, state, &bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
state = next_state(state);
if (start < end && state && state->start == start &&
!need_resched())
goto hit_next;
}
goto search_again;
}
/*
* | ---- desired range ---- |
* | state | or | state |
*
* There's a hole, we need to insert something in it and
* ignore the extent we found.
*/
if (state->start > start) {
u64 this_end;
if (end < last_start)
this_end = end;
else
this_end = last_start - 1;
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
/*
* Avoid to free 'prealloc' if it can be merged with
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
NULL, NULL, &bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
prealloc = NULL;
start = this_end + 1;
goto search_again;
}
/*
* | ---- desired range ---- |
* | state |
* We need to split the extent, and set the bit
* on the first half
*/
if (state->start <= end && state->end > end) { if (state->state & exclusive_bits) { *failed_start = start;
err = -EEXIST;
goto out;
}
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1);
if (err)
extent_io_tree_panic(tree, err);
set_state_bits(tree, prealloc, &bits, changeset);
cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
goto out;
}
search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
out:
spin_unlock(&tree->lock);
if (prealloc)
free_extent_state(prealloc); return err;
}
/**
* convert_extent_bit - convert all bits in a given range from one bit to
* another
* @tree: the io tree to search
* @start: the start offset in bytes
* @end: the end offset in bytes (inclusive)
* @bits: the bits to set in this range
* @clear_bits: the bits to clear in this range
* @cached_state: state that we're going to cache
*
* This will go through and set bits for the given range. If any states exist
* already in this range they are set with the given bit and cleared of the
* clear_bits. This is only meant to be used by things that are mergeable, ie
* converting from say DELALLOC to DIRTY. This is not meant to be used with
* boundary bits like LOCK.
*
* All allocations are done with GFP_NOFS.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, u32 clear_bits,
struct extent_state **cached_state)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct rb_node *node;
struct rb_node **p;
struct rb_node *parent;
int err = 0;
u64 last_start;
u64 last_end;
bool first_iteration = true;
btrfs_debug_check_extent_io_range(tree, start, end);
trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
clear_bits);
again:
if (!prealloc) {
/*
* Best effort, don't worry if extent state allocation fails
* here for the first iteration. We might have a cached state
* that matches exactly the target range, in which case no
* extent state allocations are needed. We'll only know this
* after locking the tree.
*/
prealloc = alloc_extent_state(GFP_NOFS); if (!prealloc && !first_iteration)
return -ENOMEM;
}
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
if (state->start <= start && state->end > start &&
extent_state_in_tree(state)) {
node = &state->rb_node;
goto hit_next;
}
}
/*
* this search will find all the extents that end after
* our range starts.
*/
node = tree_search_for_insert(tree, start, &p, &parent);
if (!node) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
err = insert_state(tree, prealloc, start, end,
&p, &parent, &bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
prealloc = NULL;
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
hit_next:
last_start = state->start;
last_end = state->end;
/*
* | ---- desired range ---- |
* | state |
*
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) { set_state_bits(tree, state, &bits, NULL);
cache_state(state, cached_state);
state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1; if (start < end && state && state->start == start &&
!need_resched())
goto hit_next;
goto search_again;
}
/*
* | ---- desired range ---- |
* | state |
* or
* | ------------- state -------------- |
*
* We need to split the extent we found, and may flip bits on
* second half.
*
* If the extent we found extends past our
* range, we just split and search again. It'll get split
* again the next time though.
*
* If the extent we found is inside our range, we set the
* desired bit on it.
*/
if (state->start < start) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
err = split_state(tree, state, prealloc, start);
if (err)
extent_io_tree_panic(tree, err);
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) { set_state_bits(tree, state, &bits, NULL);
cache_state(state, cached_state);
state = clear_state_bit(tree, state, &clear_bits, 0,
NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1; if (start < end && state && state->start == start &&
!need_resched())
goto hit_next;
}
goto search_again;
}
/*
* | ---- desired range ---- |
* | state | or | state |
*
* There's a hole, we need to insert something in it and
* ignore the extent we found.
*/
if (state->start > start) {
u64 this_end;
if (end < last_start)
this_end = end;
else
this_end = last_start - 1;
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
/*
* Avoid to free 'prealloc' if it can be merged with
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
NULL, NULL, &bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
prealloc = NULL;
start = this_end + 1;
goto search_again;
}
/*
* | ---- desired range ---- |
* | state |
* We need to split the extent, and set the bit
* on the first half
*/
if (state->start <= end && state->end > end) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
err = split_state(tree, state, prealloc, end + 1);
if (err)
extent_io_tree_panic(tree, err);
set_state_bits(tree, prealloc, &bits, NULL);
cache_state(prealloc, cached_state);
clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
prealloc = NULL;
goto out;
}
search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
cond_resched();
first_iteration = false;
goto again;
out:
spin_unlock(&tree->lock);
if (prealloc) free_extent_state(prealloc);
return err;
}
/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset)
{
/*
* We don't support EXTENT_LOCKED yet, as current changeset will
* record any bits changed, so for EXTENT_LOCKED case, it will
* either fail with -EEXIST or changeset will record the whole
* range.
*/
BUG_ON(bits & EXTENT_LOCKED);
return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
changeset);
}
int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits)
{
return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
GFP_NOWAIT, NULL);
}
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, int wake, int delete,
struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, bits, wake, delete,
cached, GFP_NOFS, NULL);
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset)
{
/*
* Don't support EXTENT_LOCKED case, same reason as
* set_record_extent_bits().
*/
BUG_ON(bits & EXTENT_LOCKED); return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
changeset);
}
/*
* either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state)
{
int err;
u64 failed_start;
while (1) {
err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
EXTENT_LOCKED, &failed_start,
cached_state, GFP_NOFS, NULL);
if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
} else
break;
WARN_ON(start > end);
}
return err;
}
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
int err;
u64 failed_start;
err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
&failed_start, NULL, GFP_NOFS, NULL);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
EXTENT_LOCKED, 1, 0, NULL);
return 0;
}
return 1;
}
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
while (index <= end_index) {
page = find_get_page(inode->i_mapping, index);
BUG_ON(!page); /* Pages should be in the extent_io_tree */
clear_page_dirty_for_io(page);
put_page(page);
index++;
}
}
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
while (index <= end_index) {
page = find_get_page(inode->i_mapping, index);
BUG_ON(!page); /* Pages should be in the extent_io_tree */
__set_page_dirty_nobuffers(page);
account_page_redirty(page);
put_page(page);
index++;
}
}
/* find the first state struct with 'bits' set after 'start', and
* return it. tree->lock must be held. NULL will returned if
* nothing was found after 'start'
*/
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
{
struct rb_node *node;
struct extent_state *state;
/*
* this search will find all the extents that end after
* our range starts.
*/
node = tree_search(tree, start);
if (!node)
goto out;
while (1) {
state = rb_entry(node, struct extent_state, rb_node); if (state->end >= start && (state->state & bits))
return state;
node = rb_next(node);
if (!node)
break;
}
out:
return NULL;
}
/*
* Find the first offset in the io tree with one or more @bits set.
*
* Note: If there are multiple bits set in @bits, any of them will match.
*
* Return 0 if we find something, and update @start_ret and @end_ret.
* Return 1 if we found nothing.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits,
struct extent_state **cached_state)
{
struct extent_state *state;
int ret = 1;
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
if (state->end == start - 1 && extent_state_in_tree(state)) {
while ((state = next_state(state)) != NULL) {
if (state->state & bits)
goto got_it;
}
free_extent_state(*cached_state);
*cached_state = NULL;
goto out;
}
free_extent_state(*cached_state);
*cached_state = NULL;
}
state = find_first_extent_bit_state(tree, start, bits);
got_it:
if (state) {
cache_state_if_flags(state, cached_state, 0);
*start_ret = state->start;
*end_ret = state->end;
ret = 0;
}
out:
spin_unlock(&tree->lock);
return ret;
}
/**
* Find a contiguous area of bits
*
* @tree: io tree to check
* @start: offset to start the search from
* @start_ret: the first offset we found with the bits set
* @end_ret: the final contiguous range of the bits that were set
* @bits: bits to look for
*
* set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
* to set bits appropriately, and then merge them again. During this time it
* will drop the tree->lock, so use this helper if you want to find the actual
* contiguous area for given bits. We will search to the first bit we find, and
* then walk down the tree until we find a non-contiguous area. The area
* returned will be the full contiguous area with the bits set.
*/
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
int ret = 1;
spin_lock(&tree->lock);
state = find_first_extent_bit_state(tree, start, bits);
if (state) {
*start_ret = state->start;
*end_ret = state->end;
while ((state = next_state(state)) != NULL) {
if (state->start > (*end_ret + 1))
break;
*end_ret = state->end;
}
ret = 0;
}
spin_unlock(&tree->lock);
return ret;
}
/**
* Find the first range that has @bits not set. This range could start before
* @start.
*
* @tree: the tree to search
* @start: offset at/after which the found extent should start
* @start_ret: records the beginning of the range
* @end_ret: records the end of the range (inclusive)
* @bits: the set of bits which must be unset
*
* Since unallocated range is also considered one which doesn't have the bits
* set it's possible that @end_ret contains -1, this happens in case the range
* spans (last_range_end, end of device]. In this case it's up to the caller to
* trim @end_ret to the appropriate size.
*/
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
struct rb_node *node, *prev = NULL, *next;
spin_lock(&tree->lock);
/* Find first extent with bits cleared */
while (1) {
node = __etree_search(tree, start, &next, &prev, NULL, NULL);
if (!node && !next && !prev) {
/*
* Tree is completely empty, send full range and let
* caller deal with it
*/
*start_ret = 0;
*end_ret = -1;
goto out;
} else if (!node && !next) {
/*
* We are past the last allocated chunk, set start at
* the end of the last extent.
*/
state = rb_entry(prev, struct extent_state, rb_node);
*start_ret = state->end + 1;
*end_ret = -1;
goto out;
} else if (!node) {
node = next;
}
/*
* At this point 'node' either contains 'start' or start is
* before 'node'
*/
state = rb_entry(node, struct extent_state, rb_node);
if (in_range(start, state->start, state->end - state->start + 1)) {
if (state->state & bits) {
/*
* |--range with bits sets--|
* |
* start
*/
start = state->end + 1;
} else {
/*
* 'start' falls within a range that doesn't
* have the bits set, so take its start as
* the beginning of the desired range
*
* |--range with bits cleared----|
* |
* start
*/
*start_ret = state->start;
break;
}
} else {
/*
* |---prev range---|---hole/unset---|---node range---|
* |
* start
*
* or
*
* |---hole/unset--||--first node--|
* 0 |
* start
*/
if (prev) {
state = rb_entry(prev, struct extent_state,
rb_node);
*start_ret = state->end + 1;
} else {
*start_ret = 0;
}
break;
}
}
/*
* Find the longest stretch from start until an entry which has the
* bits set
*/
while (1) {
state = rb_entry(node, struct extent_state, rb_node);
if (state->end >= start && !(state->state & bits)) {
*end_ret = state->end;
} else {
*end_ret = state->start - 1;
break;
}
node = rb_next(node);
if (!node)
break;
}
out:
spin_unlock(&tree->lock);
}
/*
* find a contiguous range of bytes in the file marked as delalloc, not
* more than 'max_bytes'. start and end are used to return the range,
*
* true is returned if we find something, false if nothing was in the tree
*/
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state)
{
struct rb_node *node;
struct extent_state *state;
u64 cur_start = *start;
bool found = false;
u64 total_bytes = 0;
spin_lock(&tree->lock);
/*
* this search will find all the extents that end after
* our range starts.
*/
node = tree_search(tree, cur_start);
if (!node) {
*end = (u64)-1;
goto out;
}
while (1) {
state = rb_entry(node, struct extent_state, rb_node);
if (found && (state->start != cur_start ||
(state->state & EXTENT_BOUNDARY))) {
goto out;
}
if (!(state->state & EXTENT_DELALLOC)) { if (!found) *end = state->end;
goto out;
}
if (!found) { *start = state->start;
*cached_state = state;
refcount_inc(&state->refs);
}
found = true;
*end = state->end;
cur_start = state->end + 1;
node = rb_next(node);
total_bytes += state->end - state->start + 1;
if (total_bytes >= max_bytes)
break;
if (!node)
break;
}
out:
spin_unlock(&tree->lock);
return found;
}
/*
* Process one page for __process_pages_contig().
*
* Return >0 if we hit @page == @locked_page.
* Return 0 if we updated the page status.
* Return -EGAIN if the we need to try again.
* (For PAGE_LOCK case but got dirty page or page not belong to mapping)
*/
static int process_one_page(struct btrfs_fs_info *fs_info,
struct address_space *mapping,
struct page *page, struct page *locked_page,
unsigned long page_ops, u64 start, u64 end)
{
u32 len;
ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
len = end + 1 - start;
if (page_ops & PAGE_SET_ORDERED)
btrfs_page_clamp_set_ordered(fs_info, page, start, len); if (page_ops & PAGE_SET_ERROR) btrfs_page_clamp_set_error(fs_info, page, start, len); if (page_ops & PAGE_START_WRITEBACK) { btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
btrfs_page_clamp_set_writeback(fs_info, page, start, len);
}
if (page_ops & PAGE_END_WRITEBACK) btrfs_page_clamp_clear_writeback(fs_info, page, start, len); if (page == locked_page)
return 1;
if (page_ops & PAGE_LOCK) {
int ret;
ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
if (ret)
return ret;
if (!PageDirty(page) || page->mapping != mapping) { btrfs_page_end_writer_lock(fs_info, page, start, len);
return -EAGAIN;
}
}
if (page_ops & PAGE_UNLOCK) btrfs_page_end_writer_lock(fs_info, page, start, len);
return 0;
}
static int __process_pages_contig(struct address_space *mapping,
struct page *locked_page,
u64 start, u64 end, unsigned long page_ops,
u64 *processed_end)
{
struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
pgoff_t start_index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
pgoff_t index = start_index;
unsigned long nr_pages = end_index - start_index + 1;
unsigned long pages_processed = 0;
struct page *pages[16];
int err = 0;
int i;
if (page_ops & PAGE_LOCK) {
ASSERT(page_ops == PAGE_LOCK);
ASSERT(processed_end && *processed_end == start);
}
if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
mapping_set_error(mapping, -EIO);
while (nr_pages > 0) {
int found_pages;
found_pages = find_get_pages_contig(mapping, index,
min_t(unsigned long,
nr_pages, ARRAY_SIZE(pages)), pages);
if (found_pages == 0) {
/*
* Only if we're going to lock these pages, we can find
* nothing at @index.
*/
ASSERT(page_ops & PAGE_LOCK);
err = -EAGAIN;
goto out;
}
for (i = 0; i < found_pages; i++) {
int process_ret;
process_ret = process_one_page(fs_info, mapping,
pages[i], locked_page, page_ops,
start, end);
if (process_ret < 0) { for (; i < found_pages; i++) put_page(pages[i]);
err = -EAGAIN;
goto out;
}
put_page(pages[i]); pages_processed++;
}
nr_pages -= found_pages;
index += found_pages;
cond_resched();
}
out:
if (err && processed_end) {
/*
* Update @processed_end. I know this is awful since it has
* two different return value patterns (inclusive vs exclusive).
*
* But the exclusive pattern is necessary if @start is 0, or we
* underflow and check against processed_end won't work as
* expected.
*/
if (pages_processed) *processed_end = min(end,
((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
else
*processed_end = start;
}
return err;
}
static noinline void __unlock_for_delalloc(struct inode *inode,
struct page *locked_page,
u64 start, u64 end)
{
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
ASSERT(locked_page);
if (index == locked_page->index && end_index == index)
return;
__process_pages_contig(inode->i_mapping, locked_page, start, end,
PAGE_UNLOCK, NULL);
}
static noinline int lock_delalloc_pages(struct inode *inode,
struct page *locked_page,
u64 delalloc_start,
u64 delalloc_end)
{
unsigned long index = delalloc_start >> PAGE_SHIFT; unsigned long end_index = delalloc_end >> PAGE_SHIFT;
u64 processed_end = delalloc_start;
int ret;
ASSERT(locked_page);
if (index == locked_page->index && index == end_index)
return 0;
ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
delalloc_end, PAGE_LOCK, &processed_end);
if (ret == -EAGAIN && processed_end > delalloc_start) __unlock_for_delalloc(inode, locked_page, delalloc_start,
processed_end);
return ret;
}
/*
* Find and lock a contiguous range of bytes in the file marked as delalloc, no
* more than @max_bytes. @Start and @end are used to return the range,
*
* Return: true if we find something
* false if nothing was in the tree
*/
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
u64 *end)
{
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
bool found;
struct extent_state *cached_state = NULL;
int ret;
int loops = 0;
again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
max_bytes, &cached_state);
if (!found || delalloc_end <= *start) { *start = delalloc_start;
*end = delalloc_end;
free_extent_state(cached_state);
return false;
}
/*
* start comes from the offset of locked_page. We have to lock
* pages in order, so we can't process delalloc bytes before
* locked_page
*/
if (delalloc_start < *start) delalloc_start = *start;
/*
* make sure to limit the number of pages we try to lock down
*/
if (delalloc_end + 1 - delalloc_start > max_bytes) delalloc_end = delalloc_start + max_bytes - 1;
/* step two, lock all the pages after the page that has start */
ret = lock_delalloc_pages(inode, locked_page,
delalloc_start, delalloc_end);
ASSERT(!ret || ret == -EAGAIN);
if (ret == -EAGAIN) {
/* some of the pages are gone, lets avoid looping by
* shortening the size of the delalloc range we're searching
*/
free_extent_state(cached_state);
cached_state = NULL;
if (!loops) {
max_bytes = PAGE_SIZE;
loops = 1;
goto again;
} else {
found = false;
goto out_failed;
}
}
/* step three, lock the state bits for the whole range */
lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, 1, cached_state);
if (!ret) {
unlock_extent_cached(tree, delalloc_start, delalloc_end,
&cached_state);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
goto again;
}
free_extent_state(cached_state);
*start = delalloc_start;
*end = delalloc_end;
out_failed:
return found;
}
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 clear_bits, unsigned long page_ops)
{
clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
start, end, page_ops, NULL);
}
/*
* count the number of bytes in the tree that have a given bit(s)
* set. This can be fairly slow, except for EXTENT_DIRTY which is
* cached. The total number found is returned.
*/
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
u32 bits, int contig)
{
struct rb_node *node;
struct extent_state *state;
u64 cur_start = *start;
u64 total_bytes = 0;
u64 last = 0;
int found = 0;
if (WARN_ON(search_end <= cur_start))
return 0;
spin_lock(&tree->lock);
if (cur_start == 0 && bits == EXTENT_DIRTY) { total_bytes = tree->dirty_bytes;
goto out;
}
/*
* this search will find all the extents that end after
* our range starts.
*/
node = tree_search(tree, cur_start);
if (!node)
goto out;
while (1) {
state = rb_entry(node, struct extent_state, rb_node);
if (state->start > search_end)
break;
if (contig && found && state->start > last + 1)
break;
if (state->end >= cur_start && (state->state & bits) == bits) { total_bytes += min(search_end, state->end) + 1 -
max(cur_start, state->start);
if (total_bytes >= max_bytes)
break;
if (!found) { *start = max(cur_start, state->start);
found = 1;
}
last = state->end;
} else if (contig && found) {
break;
}
node = rb_next(node);
if (!node)
break;
}
out:
spin_unlock(&tree->lock);
return total_bytes;
}
/*
* set the private field for a given byte offset in the tree. If there isn't
* an extent_state there already, this does nothing.
*/
int set_state_failrec(struct extent_io_tree *tree, u64 start,
struct io_failure_record *failrec)
{
struct rb_node *node;
struct extent_state *state;
int ret = 0;
spin_lock(&tree->lock);
/*
* this search will find all the extents that end after
* our range starts.
*/
node = tree_search(tree, start);
if (!node) {
ret = -ENOENT;
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
if (state->start != start) {
ret = -ENOENT;
goto out;
}
state->failrec = failrec;
out:
spin_unlock(&tree->lock);
return ret;
}
struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
{
struct rb_node *node;
struct extent_state *state;
struct io_failure_record *failrec;
spin_lock(&tree->lock);
/*
* this search will find all the extents that end after
* our range starts.
*/
node = tree_search(tree, start);
if (!node) {
failrec = ERR_PTR(-ENOENT);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
if (state->start != start) {
failrec = ERR_PTR(-ENOENT);
goto out;
}
failrec = state->failrec;
out:
spin_unlock(&tree->lock);
return failrec;
}
/*
* searches a range in the state tree for a given mask.
* If 'filled' == 1, this returns 1 only if every extent in the tree
* has the bits set. Otherwise, 1 is returned if any bit in the
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
int bitset = 0;
spin_lock(&tree->lock);
if (cached && extent_state_in_tree(cached) && cached->start <= start && cached->end > start)
node = &cached->rb_node;
else
node = tree_search(tree, start);
while (node && start <= end) {
state = rb_entry(node, struct extent_state, rb_node);
if (filled && state->start > start) {
bitset = 0;
break;
}
if (state->start > end)
break;
if (state->state & bits) {
bitset = 1;
if (!filled)
break;
} else if (filled) {
bitset = 0;
break;
}
if (state->end == (u64)-1)
break;
start = state->end + 1;
if (start > end)
break;
node = rb_next(node);
if (!node) {
if (filled)
bitset = 0;
break;
}
}
spin_unlock(&tree->lock);
return bitset;
}
int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec)
{
int ret;
int err = 0;
set_state_failrec(failure_tree, rec->start, NULL);
ret = clear_extent_bits(failure_tree, rec->start,
rec->start + rec->len - 1,
EXTENT_LOCKED | EXTENT_DIRTY);
if (ret)
err = ret;
ret = clear_extent_bits(io_tree, rec->start,
rec->start + rec->len - 1,
EXTENT_DAMAGED);
if (ret && !err)
err = ret;
kfree(rec);
return err;
}
/*
* this bypasses the standard btrfs submit functions deliberately, as
* the standard behavior is to write all copies in a raid setup. here we only
* want to write the one bad copy. so we do the mapping for ourselves and issue
* submit_bio directly.
* to avoid any synchronization issues, wait for the data after writing, which
* actually prevents the read that triggered the error from finishing.
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num)
{
struct bio *bio;
struct btrfs_device *dev;
u64 map_length = 0;
u64 sector;
struct btrfs_bio *bbio = NULL;
int ret;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num); if (btrfs_is_zoned(fs_info)) return btrfs_repair_one_zone(fs_info, logical); bio = btrfs_io_bio_alloc(1);
bio->bi_iter.bi_size = 0;
map_length = length;
/*
* Avoid races with device replace and make sure our bbio has devices
* associated to its stripes that don't go away while we are doing the
* read repair operation.
*/
btrfs_bio_counter_inc_blocked(fs_info);
if (btrfs_is_parity_mirror(fs_info, logical, length)) {
/*
* Note that we don't use BTRFS_MAP_WRITE because it's supposed
* to update all raid stripes, but here we just want to correct
* bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
* stripe's dev and sector.
*/
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
&map_length, &bbio, 0);
if (ret) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
ASSERT(bbio->mirror_num == 1);
} else {
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
&map_length, &bbio, mirror_num);
if (ret) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
BUG_ON(mirror_num != bbio->mirror_num);
}
sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
bio->bi_iter.bi_sector = sector;
dev = bbio->stripes[bbio->mirror_num - 1].dev;
btrfs_put_bbio(bbio);
if (!dev || !dev->bdev || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
bio_set_dev(bio, dev->bdev);
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
bio_add_page(bio, page, length, pg_offset);
if (btrfsic_submit_bio_wait(bio)) {
/* try to remap that extent elsewhere? */
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
return -EIO;
}
btrfs_info_rl_in_rcu(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
ino, start,
rcu_str_deref(dev->name), sector);
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return 0;
}
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 start = eb->start;
int i, num_pages = num_extent_pages(eb);
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i];
ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
start - page_offset(p), mirror_num);
if (ret)
break;
start += PAGE_SIZE;
}
return ret;
}
/*
* each time an IO finishes, we do a fast check in the IO failure tree
* to see if we need to process or clean up an io_failure_record
*/
int clean_io_failure(struct btrfs_fs_info *fs_info,
struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree, u64 start,
struct page *page, u64 ino, unsigned int pg_offset)
{
u64 private;
struct io_failure_record *failrec;
struct extent_state *state;
int num_copies;
int ret;
private = 0;
ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
EXTENT_DIRTY, 0);
if (!ret)
return 0;
failrec = get_state_failrec(failure_tree, start);
if (IS_ERR(failrec))
return 0;
BUG_ON(!failrec->this_mirror);
if (sb_rdonly(fs_info->sb))
goto out;
spin_lock(&io_tree->lock);
state = find_first_extent_bit_state(io_tree,
failrec->start,
EXTENT_LOCKED);
spin_unlock(&io_tree->lock);
if (state && state->start <= failrec->start &&
state->end >= failrec->start + failrec->len - 1) {
num_copies = btrfs_num_copies(fs_info, failrec->logical,
failrec->len);
if (num_copies > 1) {
repair_io_failure(fs_info, ino, start, failrec->len,
failrec->logical, page, pg_offset,
failrec->failed_mirror);
}
}
out:
free_io_failure(failure_tree, io_tree, failrec);
return 0;
}
/*
* Can be called when
* - hold extent lock
* - under ordered extent
* - the inode is freeing
*/
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
{
struct extent_io_tree *failure_tree = &inode->io_failure_tree;
struct io_failure_record *failrec;
struct extent_state *state, *next;
if (RB_EMPTY_ROOT(&failure_tree->state))
return;
spin_lock(&failure_tree->lock);
state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
while (state) {
if (state->start > end)
break;
ASSERT(state->end <= end);
next = next_state(state);
failrec = state->failrec;
free_extent_state(state);
kfree(failrec);
state = next;
}
spin_unlock(&failure_tree->lock);
}
static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
u64 start)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct io_failure_record *failrec;
struct extent_map *em;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
const u32 sectorsize = fs_info->sectorsize;
int ret;
u64 logical;
failrec = get_state_failrec(failure_tree, start);
if (!IS_ERR(failrec)) {
btrfs_debug(fs_info,
"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
failrec->logical, failrec->start, failrec->len);
/*
* when data can be on disk more than twice, add to failrec here
* (e.g. with a list for failed_mirror) to make
* clean_io_failure() clean all those errors at once.
*/
return failrec;
}
failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
if (!failrec)
return ERR_PTR(-ENOMEM);
failrec->start = start;
failrec->len = sectorsize;
failrec->this_mirror = 0;
failrec->bio_flags = 0;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
if (!em) {
read_unlock(&em_tree->lock);
kfree(failrec);
return ERR_PTR(-EIO);
}
if (em->start > start || em->start + em->len <= start) {
free_extent_map(em);
em = NULL;
}
read_unlock(&em_tree->lock);
if (!em) {
kfree(failrec);
return ERR_PTR(-EIO);
}
logical = start - em->start;
logical = em->block_start + logical;
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
logical = em->block_start;
failrec->bio_flags = EXTENT_BIO_COMPRESSED;
extent_set_compress_type(&failrec->bio_flags, em->compress_type);
}
btrfs_debug(fs_info,
"Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
logical, start, failrec->len);
failrec->logical = logical;
free_extent_map(em);
/* Set the bits in the private failure tree */
ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
EXTENT_LOCKED | EXTENT_DIRTY);
if (ret >= 0) {
ret = set_state_failrec(failure_tree, start, failrec);
/* Set the bits in the inode's tree */
ret = set_extent_bits(tree, start, start + sectorsize - 1,
EXTENT_DAMAGED);
} else if (ret < 0) {
kfree(failrec);
return ERR_PTR(ret);
}
return failrec;
}
static bool btrfs_check_repairable(struct inode *inode,
struct io_failure_record *failrec,
int failed_mirror)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int num_copies;
num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
if (num_copies == 1) {
/*
* we only have a single copy of the data, so don't bother with
* all the retry and error correction code that follows. no
* matter what the error is, it is very likely to persist.
*/
btrfs_debug(fs_info,
"Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
num_copies, failrec->this_mirror, failed_mirror);
return false;
}
/* The failure record should only contain one sector */
ASSERT(failrec->len == fs_info->sectorsize);
/*
* There are two premises:
* a) deliver good data to the caller
* b) correct the bad sectors on disk
*
* Since we're only doing repair for one sector, we only need to get
* a good copy of the failed sector and if we succeed, we have setup
* everything for repair_io_failure to do the rest for us.
*/
failrec->failed_mirror = failed_mirror;
failrec->this_mirror++;
if (failrec->this_mirror == failed_mirror)
failrec->this_mirror++;
if (failrec->this_mirror > num_copies) {
btrfs_debug(fs_info,
"Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
num_copies, failrec->this_mirror, failed_mirror);
return false;
}
return true;
}
int btrfs_repair_one_sector(struct inode *inode,
struct bio *failed_bio, u32 bio_offset,
struct page *page, unsigned int pgoff,
u64 start, int failed_mirror,
submit_bio_hook_t *submit_bio_hook)
{
struct io_failure_record *failrec;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
struct btrfs_io_bio *repair_io_bio;
blk_status_t status;
btrfs_debug(fs_info,
"repair read error: read error at %llu", start);
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
failrec = btrfs_get_io_failure_record(inode, start);
if (IS_ERR(failrec))
return PTR_ERR(failrec);
if (!btrfs_check_repairable(inode, failrec, failed_mirror)) {
free_io_failure(failure_tree, tree, failrec);
return -EIO;
}
repair_bio = btrfs_io_bio_alloc(1);
repair_io_bio = btrfs_io_bio(repair_bio);
repair_bio->bi_opf = REQ_OP_READ;
repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
repair_bio->bi_private = failed_bio->bi_private;
if (failed_io_bio->csum) {
const u32 csum_size = fs_info->csum_size;
repair_io_bio->csum = repair_io_bio->csum_inline;
memcpy(repair_io_bio->csum,
failed_io_bio->csum + csum_size * icsum, csum_size);
}
bio_add_page(repair_bio, page, failrec->len, pgoff);
repair_io_bio->logical = failrec->start;
repair_io_bio->iter = repair_bio->bi_iter;
btrfs_debug(btrfs_sb(inode->i_sb),
"repair read error: submitting new read to mirror %d",
failrec->this_mirror);
status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
failrec->bio_flags);
if (status) {
free_io_failure(failure_tree, tree, failrec);
bio_put(repair_bio);
}
return blk_status_to_errno(status);
}
static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
ASSERT(page_offset(page) <= start &&
start + len <= page_offset(page) + PAGE_SIZE);
if (uptodate) {
if (fsverity_active(page->mapping->host) &&
!PageError(page) &&
!PageUptodate(page) &&
start < i_size_read(page->mapping->host) &&
!fsverity_verify_page(page)) {
btrfs_page_set_error(fs_info, page, start, len);
} else {
btrfs_page_set_uptodate(fs_info, page, start, len);
}
} else {
btrfs_page_clear_uptodate(fs_info, page, start, len);
btrfs_page_set_error(fs_info, page, start, len);
}
if (fs_info->sectorsize == PAGE_SIZE) unlock_page(page);
else
btrfs_subpage_end_reader(fs_info, page, start, len);
}
static blk_status_t submit_read_repair(struct inode *inode,
struct bio *failed_bio, u32 bio_offset,
struct page *page, unsigned int pgoff,
u64 start, u64 end, int failed_mirror,
unsigned int error_bitmap,
submit_bio_hook_t *submit_bio_hook)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
int error = 0;
int i;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
/* We're here because we had some read errors or csum mismatch */
ASSERT(error_bitmap);
/*
* We only get called on buffered IO, thus page must be mapped and bio
* must not be cloned.
*/
ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED));
/* Iterate through all the sectors in the range */
for (i = 0; i < nr_bits; i++) {
const unsigned int offset = i * sectorsize;
struct extent_state *cached = NULL;
bool uptodate = false;
int ret;
if (!(error_bitmap & (1U << i))) {
/*
* This sector has no error, just end the page read
* and unlock the range.
*/
uptodate = true;
goto next;
}
ret = btrfs_repair_one_sector(inode, failed_bio,
bio_offset + offset,
page, pgoff + offset, start + offset,
failed_mirror, submit_bio_hook);
if (!ret) {
/*
* We have submitted the read repair, the page release
* will be handled by the endio function of the
* submitted repair bio.
* Thus we don't need to do any thing here.
*/
continue;
}
/*
* Repair failed, just record the error but still continue.
* Or the remaining sectors will not be properly unlocked.
*/
if (!error)
error = ret;
next:
end_page_read(page, uptodate, start + offset, sectorsize);
if (uptodate)
set_extent_uptodate(&BTRFS_I(inode)->io_tree,
start + offset,
start + offset + sectorsize - 1,
&cached, GFP_ATOMIC);
unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree,
start + offset,
start + offset + sectorsize - 1,
&cached);
}
return errno_to_blk_status(error);
}
/* lots and lots of room for performance fixes in the end_bio funcs */
void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
struct btrfs_inode *inode;
const bool uptodate = (err == 0);
int ret = 0;
ASSERT(page && page->mapping);
inode = BTRFS_I(page->mapping->host);
btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
if (!uptodate) {
const struct btrfs_fs_info *fs_info = inode->root->fs_info;
u32 len;
ASSERT(end + 1 - start <= U32_MAX);
len = end + 1 - start;
btrfs_page_clear_uptodate(fs_info, page, start, len);
btrfs_page_set_error(fs_info, page, start, len);
ret = err < 0 ? err : -EIO;
mapping_set_error(page->mapping, ret);
}
}
/*
* after a writepage IO is done, we need to:
* clear the uptodate bits on error
* clear the writeback bits in the extent tree for this IO
* end_page_writeback if the page has no more pending IO
*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
static void end_bio_extent_writepage(struct bio *bio)
{
int error = blk_status_to_errno(bio->bi_status);
struct bio_vec *bvec;
u64 start;
u64 end;
struct bvec_iter_all iter_all;
bool first_bvec = true;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
/* Our read/write should always be sector aligned. */
if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
btrfs_err(fs_info,
"partial page write in btrfs with offset %u and length %u",
bvec->bv_offset, bvec->bv_len);
else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
btrfs_info(fs_info,
"incomplete page write with offset %u and length %u",
bvec->bv_offset, bvec->bv_len);
start = page_offset(page) + bvec->bv_offset;
end = start + bvec->bv_len - 1;
if (first_bvec) {
btrfs_record_physical_zoned(inode, start, bio);
first_bvec = false;
}
end_extent_writepage(page, error, start, end);
btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
}
bio_put(bio);
}
/*
* Record previously processed extent range
*
* For endio_readpage_release_extent() to handle a full extent range, reducing
* the extent io operations.
*/
struct processed_extent {
struct btrfs_inode *inode;
/* Start of the range in @inode */
u64 start;
/* End of the range in @inode */
u64 end;
bool uptodate;
};
/*
* Try to release processed extent range
*
* May not release the extent range right now if the current range is
* contiguous to processed extent.
*
* Will release processed extent when any of @inode, @uptodate, the range is
* no longer contiguous to the processed range.
*
* Passing @inode == NULL will force processed extent to be released.
*/
static void endio_readpage_release_extent(struct processed_extent *processed,
struct btrfs_inode *inode, u64 start, u64 end,
bool uptodate)
{
struct extent_state *cached = NULL;
struct extent_io_tree *tree;
/* The first extent, initialize @processed */
if (!processed->inode)
goto update;
/*
* Contiguous to processed extent, just uptodate the end.
*
* Several things to notice:
*
* - bio can be merged as long as on-disk bytenr is contiguous
* This means we can have page belonging to other inodes, thus need to
* check if the inode still matches.
* - bvec can contain range beyond current page for multi-page bvec
* Thus we need to do processed->end + 1 >= start check
*/
if (processed->inode == inode && processed->uptodate == uptodate &&
processed->end + 1 >= start && end >= processed->end) {
processed->end = end;
return;
}
tree = &processed->inode->io_tree;
/*
* Now we don't have range contiguous to the processed range, release
* the processed range now.
*/
if (processed->uptodate && tree->track_uptodate)
set_extent_uptodate(tree, processed->start, processed->end,
&cached, GFP_ATOMIC);
unlock_extent_cached_atomic(tree, processed->start, processed->end,
&cached);
update:
/* Update processed to current range */
processed->inode = inode;
processed->start = start;
processed->end = end;
processed->uptodate = uptodate;
}
static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{
ASSERT(PageLocked(page));
if (fs_info->sectorsize == PAGE_SIZE)
return;
ASSERT(PagePrivate(page));
btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
}
/*
* Find extent buffer for a givne bytenr.
*
* This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
* in endio context.
*/
static struct extent_buffer *find_extent_buffer_readpage(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
struct extent_buffer *eb;
/*
* For regular sectorsize, we can use page->private to grab extent
* buffer
*/
if (fs_info->sectorsize == PAGE_SIZE) {
ASSERT(PagePrivate(page) && page->private);
return (struct extent_buffer *)page->private;
}
/* For subpage case, we need to lookup buffer radix tree */
rcu_read_lock();
eb = radix_tree_lookup(&fs_info->buffer_radix,
bytenr >> fs_info->sectorsize_bits);
rcu_read_unlock();
ASSERT(eb);
return eb;
}
/*
* after a readpage IO is done, we need to:
* clear the uptodate bits on error
* set the uptodate bits if things worked
* set the page up to date if all extents in the tree are uptodate
* clear the lock bit in the extent tree
* unlock the page if there are no other extents locked for it
*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct extent_io_tree *tree, *failure_tree;
struct processed_extent processed = { 0 };
/*
* The offset to the beginning of a bio, since one bio can never be
* larger than UINT_MAX, u32 here is enough.
*/
u32 bio_offset = 0;
int mirror;
int ret;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
bool uptodate = !bio->bi_status;
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
unsigned int error_bitmap = (unsigned int)-1;
u64 start;
u64 end;
u32 len;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
bio->bi_iter.bi_sector, bio->bi_status,
io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
/*
* We always issue full-sector reads, but if some block in a
* page fails to read, blk_update_request() will advance
* bv_offset and adjust bv_len to compensate. Print a warning
* for unaligned offsets, and an error if they don't add up to
* a full sector.
*/
if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
btrfs_err(fs_info,
"partial page read in btrfs with offset %u and length %u",
bvec->bv_offset, bvec->bv_len);
else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
sectorsize))
btrfs_info(fs_info,
"incomplete page read with offset %u and length %u",
bvec->bv_offset, bvec->bv_len);
start = page_offset(page) + bvec->bv_offset;
end = start + bvec->bv_len - 1;
len = bvec->bv_len;
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
if (is_data_inode(inode)) {
error_bitmap = btrfs_verify_data_csum(io_bio,
bio_offset, page, start, end);
ret = error_bitmap;
} else {
ret = btrfs_validate_metadata_buffer(io_bio,
page, start, end, mirror);
}
if (ret)
uptodate = false;
else
clean_io_failure(BTRFS_I(inode)->root->fs_info,
failure_tree, tree, start,
page,
btrfs_ino(BTRFS_I(inode)), 0);
}
if (likely(uptodate))
goto readpage_ok;
if (is_data_inode(inode)) {
/*
* btrfs_submit_read_repair() will handle all the good
* and bad sectors, we just continue to the next bvec.
*/
submit_read_repair(inode, bio, bio_offset, page,
start - page_offset(page), start,
end, mirror, error_bitmap,
btrfs_submit_data_bio);
ASSERT(bio_offset + len > bio_offset);
bio_offset += len;
continue;
} else {
struct extent_buffer *eb;
eb = find_extent_buffer_readpage(fs_info, page, start);
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = mirror;
atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
&eb->bflags))
btree_readahead_hook(eb, -EIO);
}
readpage_ok:
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
pgoff_t end_index = i_size >> PAGE_SHIFT;
/*
* Zero out the remaining part if this range straddles
* i_size.
*
* Here we should only zero the range inside the bvec,
* not touch anything else.
*
* NOTE: i_size is exclusive while end is inclusive.
*/
if (page->index == end_index && i_size <= end) {
u32 zero_start = max(offset_in_page(i_size),
offset_in_page(start));
zero_user_segment(page, zero_start,
offset_in_page(end) + 1);
}
}
ASSERT(bio_offset + len > bio_offset);
bio_offset += len;
/* Update page status and unlock */
end_page_read(page, uptodate, start, len);
endio_readpage_release_extent(&processed, BTRFS_I(inode),
start, end, PageUptodate(page));
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
btrfs_io_bio_free_csum(io_bio);
bio_put(bio);
}
/*
* Initialize the members up to but not including 'bio'. Use after allocating a
* new bio by bio_alloc_bioset as it does not initialize the bytes outside of
* 'bio' because use of __GFP_ZERO is not supported.
*/
static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
{
memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
}
/*
* The following helpers allocate a bio. As it's backed by a bioset, it'll
* never fail. We're returning a bio right now but you can call btrfs_io_bio
* for the appropriate container_of magic
*/
struct bio *btrfs_bio_alloc(u64 first_byte)
{
struct bio *bio;
bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset);
bio->bi_iter.bi_sector = first_byte >> 9;
btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
}
struct bio *btrfs_bio_clone(struct bio *bio)
{
struct btrfs_io_bio *btrfs_bio;
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
btrfs_bio = btrfs_io_bio(new);
btrfs_io_bio_init(btrfs_bio);
btrfs_bio->iter = bio->bi_iter;
return new;
}
struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
{
struct bio *bio;
/* Bio allocation backed by a bioset does not fail */
bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
}
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
{
struct bio *bio;
struct btrfs_io_bio *btrfs_bio;
ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
/* this will never fail when it's backed by a bioset */
bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
btrfs_bio = btrfs_io_bio(bio);
btrfs_io_bio_init(btrfs_bio);
bio_trim(bio, offset >> 9, size >> 9);
btrfs_bio->iter = bio->bi_iter;
return bio;
}
/**
* Attempt to add a page to bio
*
* @bio: destination bio
* @page: page to add to the bio
* @disk_bytenr: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
* @pg_offset: starting offset in the page
* @size: portion of page that we want to write
* @prev_bio_flags: flags of previous bio to see if we can merge the current one
* @bio_flags: flags of the current bio to see if we can merge them
*
* Attempt to add a page to bio considering stripe alignment etc.
*
* Return >= 0 for the number of bytes added to the bio.
* Can return 0 if the current bio is already at stripe/zone boundary.
* Return <0 for error.
*/
static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
struct page *page,
u64 disk_bytenr, unsigned int size,
unsigned int pg_offset,
unsigned long bio_flags)
{
struct bio *bio = bio_ctrl->bio;
u32 bio_size = bio->bi_iter.bi_size;
u32 real_size;
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
bool contig;
int ret;
ASSERT(bio);
/* The limit should be calculated when bio_ctrl->bio is allocated */
ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); if (bio_ctrl->bio_flags != bio_flags)
return 0;
if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
contig = bio->bi_iter.bi_sector == sector;
else
contig = bio_end_sector(bio) == sector; if (!contig)
return 0;
real_size = min(bio_ctrl->len_to_oe_boundary,
bio_ctrl->len_to_stripe_boundary) - bio_size;
real_size = min(real_size, size);
/*
* If real_size is 0, never call bio_add_*_page(), as even size is 0,
* bio will still execute its endio function on the page!
*/
if (real_size == 0)
return 0;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
else
ret = bio_add_page(bio, page, real_size, pg_offset);
return ret;
}
static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
struct btrfs_inode *inode, u64 file_offset)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_io_geometry geom;
struct btrfs_ordered_extent *ordered;
struct extent_map *em;
u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
int ret;
/*
* Pages for compressed extent are never submitted to disk directly,
* thus it has no real boundary, just set them to U32_MAX.
*
* The split happens for real compressed bio, which happens in
* btrfs_submit_compressed_read/write().
*/
if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) {
bio_ctrl->len_to_oe_boundary = U32_MAX;
bio_ctrl->len_to_stripe_boundary = U32_MAX;
return 0;
}
em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
logical, &geom);
free_extent_map(em);
if (ret < 0) {
return ret;
}
if (geom.len > U32_MAX)
bio_ctrl->len_to_stripe_boundary = U32_MAX;
else
bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
if (!btrfs_is_zoned(fs_info) ||
bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { bio_ctrl->len_to_oe_boundary = U32_MAX;
return 0;
}
/* Ordered extent not yet created, so we're good */
ordered = btrfs_lookup_ordered_extent(inode, file_offset);
if (!ordered) {
bio_ctrl->len_to_oe_boundary = U32_MAX;
return 0;
}
bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
ordered->disk_bytenr + ordered->disk_num_bytes - logical);
btrfs_put_ordered_extent(ordered);
return 0;
}
static int alloc_new_bio(struct btrfs_inode *inode,
struct btrfs_bio_ctrl *bio_ctrl,
struct writeback_control *wbc,
unsigned int opf,
bio_end_io_t end_io_func,
u64 disk_bytenr, u32 offset, u64 file_offset,
unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio;
int ret;
/*
* For compressed page range, its disk_bytenr is always @disk_bytenr
* passed in, no matter if we have added any range into previous bio.
*/
if (bio_flags & EXTENT_BIO_COMPRESSED)
bio = btrfs_bio_alloc(disk_bytenr);
else
bio = btrfs_bio_alloc(disk_bytenr + offset); bio_ctrl->bio = bio;
bio_ctrl->bio_flags = bio_flags;
bio->bi_end_io = end_io_func;
bio->bi_private = &inode->io_tree;
bio->bi_write_hint = inode->vfs_inode.i_write_hint;
bio->bi_opf = opf;
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
goto error;
if (wbc) {
struct block_device *bdev;
bdev = fs_info->fs_devices->latest_dev->bdev; bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
}
if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
struct btrfs_device *device;
device = btrfs_zoned_get_device(fs_info, disk_bytenr,
fs_info->sectorsize);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
btrfs_io_bio(bio)->device = device;
}
return 0;
error:
bio_ctrl->bio = NULL;
bio->bi_status = errno_to_blk_status(ret);
bio_endio(bio);
return ret;
}
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @wbc: optional writeback control for io accounting
* @page: page to add to the bio
* @disk_bytenr: logical bytenr where the write will be
* @size: portion of page that we want to write to
* @pg_offset: offset of the new bio or to check whether we are adding
* a contiguous page to the previous one
* @bio_ret: must be valid pointer, newly allocated bio will be stored there
* @end_io_func: end_io callback for new bio
* @mirror_num: desired mirror to read/write
* @prev_bio_flags: flags of previous bio to see if we can merge the current one
* @bio_flags: flags of the current bio to see if we can merge them
*/
static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
struct btrfs_bio_ctrl *bio_ctrl,
struct page *page, u64 disk_bytenr,
size_t size, unsigned long pg_offset,
bio_end_io_t end_io_func,
int mirror_num,
unsigned long bio_flags,
bool force_bio_submit)
{
int ret = 0;
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
unsigned int cur = pg_offset;
ASSERT(bio_ctrl);
ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
pg_offset + size <= PAGE_SIZE);
if (force_bio_submit && bio_ctrl->bio) { ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
bio_ctrl->bio = NULL;
if (ret < 0)
return ret;
}
while (cur < pg_offset + size) { u32 offset = cur - pg_offset;
int added;
/* Allocate new bio if needed */
if (!bio_ctrl->bio) {
ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
end_io_func, disk_bytenr, offset,
page_offset(page) + cur,
bio_flags);
if (ret < 0)
return ret;
}
/*
* We must go through btrfs_bio_add_page() to ensure each
* page range won't cross various boundaries.
*/
if (bio_flags & EXTENT_BIO_COMPRESSED) added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
size - offset, pg_offset + offset,
bio_flags);
else
added = btrfs_bio_add_page(bio_ctrl, page,
disk_bytenr + offset, size - offset,
pg_offset + offset, bio_flags);
/* Metadata page range should never be split */
if (!is_data_inode(&inode->vfs_inode))
ASSERT(added == 0 || added == size - offset);
/* At least we added some page, update the account */
if (wbc && added)
wbc_account_cgroup_owner(wbc, page, added);
/* We have reached boundary, submit right now */
if (added < size - offset) {
/* The bio should contain some page(s) */
ASSERT(bio_ctrl->bio->bi_iter.bi_size);
ret = submit_one_bio(bio_ctrl->bio, mirror_num,
bio_ctrl->bio_flags);
bio_ctrl->bio = NULL;
if (ret < 0)
return ret;
}
cur += added;
}
return 0;
}
static int attach_extent_buffer_page(struct extent_buffer *eb,
struct page *page,
struct btrfs_subpage *prealloc)
{
struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0;
/*
* If the page is mapped to btree inode, we should hold the private
* lock to prevent race.
* For cloned or dummy extent buffers, their pages are not mapped and
* will not race with any other ebs.
*/
if (page->mapping)
lockdep_assert_held(&page->mapping->private_lock);
if (fs_info->sectorsize == PAGE_SIZE) {
if (!PagePrivate(page))
attach_page_private(page, eb);
else
WARN_ON(page->private != (unsigned long)eb);
return 0;
}
/* Already mapped, just free prealloc */
if (PagePrivate(page)) {
btrfs_free_subpage(prealloc); return 0;
}
if (prealloc)
/* Has preallocated memory for subpage */
attach_page_private(page, prealloc);
else
/* Do new allocation to attach subpage */
ret = btrfs_attach_subpage(fs_info, page,
BTRFS_SUBPAGE_METADATA);
return ret;
}
int set_page_extent_mapped(struct page *page)
{
struct btrfs_fs_info *fs_info;
ASSERT(page->mapping);
if (PagePrivate(page))
return 0;
fs_info = btrfs_sb(page->mapping->host->i_sb);
if (fs_info->sectorsize < PAGE_SIZE)
return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
return 0;
}
void clear_page_extent_mapped(struct page *page)
{
struct btrfs_fs_info *fs_info;
ASSERT(page->mapping);
if (!PagePrivate(page))
return;
fs_info = btrfs_sb(page->mapping->host->i_sb);
if (fs_info->sectorsize < PAGE_SIZE)
return btrfs_detach_subpage(fs_info, page);
detach_page_private(page);
}
static struct extent_map *
__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
u64 start, u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
if (em_cached && *em_cached) {
em = *em_cached;
if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { refcount_inc(&em->refs);
return em;
}
free_extent_map(em);
*em_cached = NULL;
}
em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em;
}
return em;
}
/*
* basic readpage implementation. Locked extent state structs are inserted
* into the tree that are removed when the IO is done (by the end_io
* handlers)
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 start = page_offset(page);
const u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
u64 extent_offset;
u64 last_byte = i_size_read(inode);
u64 block_start;
u64 cur_end;
struct extent_map *em;
int ret = 0;
size_t pg_offset = 0;
size_t iosize;
size_t blocksize = inode->i_sb->s_blocksize;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_extent(tree, start, end);
btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
unlock_page(page);
goto out;
}
if (!PageUptodate(page)) {
if (cleancache_get_page(page) == 0) {
BUG_ON(blocksize != PAGE_SIZE);
unlock_extent(tree, start, end);
unlock_page(page);
goto out;
}
}
if (page->index == last_byte >> PAGE_SHIFT) {
size_t zero_offset = offset_in_page(last_byte);
if (zero_offset) {
iosize = PAGE_SIZE - zero_offset;
memzero_page(page, zero_offset, iosize);
flush_dcache_page(page);
}
}
begin_page_read(fs_info, page);
while (cur <= end) {
unsigned long this_bio_flag = 0;
bool force_bio_submit = false;
u64 disk_bytenr;
if (cur >= last_byte) { struct extent_state *cached = NULL;
iosize = PAGE_SIZE - pg_offset;
memzero_page(page, pg_offset, iosize);
flush_dcache_page(page);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
cur + iosize - 1, &cached);
end_page_read(page, true, cur, iosize);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, em_cached);
if (IS_ERR_OR_NULL(em)) {
unlock_extent(tree, cur, end);
end_page_read(page, false, cur, end + 1 - cur);
break;
}
extent_offset = cur - em->start; BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
this_bio_flag |= EXTENT_BIO_COMPRESSED;
extent_set_compress_type(&this_bio_flag,
em->compress_type);
}
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
iosize = ALIGN(iosize, blocksize);
if (this_bio_flag & EXTENT_BIO_COMPRESSED)
disk_bytenr = em->block_start;
else
disk_bytenr = em->block_start + extent_offset;
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
/*
* If we have a file range that points to a compressed extent
* and it's followed by a consecutive file range that points
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
* single bio to populate the pages for the 2 ranges because
* this makes the compressed extent read zero out the pages
* belonging to the 2nd range. Imagine the following scenario:
*
* File layout
* [0 - 8K] [8K - 24K]
* | |
* | |
* points to extent X, points to extent X,
* offset 4K, length of 8K offset 0, length 16K
*
* [extent X, compressed length = 4K uncompressed length = 16K]
*
* If the bio to read the compressed extent covers both ranges,
* it will decompress extent X into the pages belonging to the
* first range and then it will stop, zeroing out the remaining
* pages that belong to the other range that points to extent X.
* So here we make sure we submit 2 bios, one for the first
* range and another one for the third range. Both will target
* the same physical extent from disk, but we can't currently
* make the compressed bio endio callback populate the pages
* for both ranges because each compressed bio is tightly
* coupled with a single extent map, and each range can have
* an extent map with a different offset value relative to the
* uncompressed data of our extent and different lengths. This
* is a corner case so we prioritize correctness over
* non-optimal behavior (submitting 2 bios for the same extent).
*/
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && prev_em_start && *prev_em_start != (u64)-1 &&
*prev_em_start != em->start)
force_bio_submit = true;
if (prev_em_start) *prev_em_start = em->start; free_extent_map(em);
em = NULL;
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
struct extent_state *cached = NULL;
memzero_page(page, pg_offset, iosize);
flush_dcache_page(page);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
cur + iosize - 1, &cached);
end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
/* the get_extent function already copied into the page */
if (test_range_bit(tree, cur, cur_end,
EXTENT_UPTODATE, 1, NULL)) {
unlock_extent(tree, cur, cur + iosize - 1);
end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
/* we have an inline extent but it didn't get marked up
* to date. Error out
*/
if (block_start == EXTENT_MAP_INLINE) { unlock_extent(tree, cur, cur + iosize - 1);
end_page_read(page, false, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
bio_ctrl, page, disk_bytenr, iosize,
pg_offset,
end_bio_extent_readpage, 0,
this_bio_flag,
force_bio_submit);
if (ret) {
unlock_extent(tree, cur, cur + iosize - 1);
end_page_read(page, false, cur, iosize);
goto out;
}
cur = cur + iosize;
pg_offset += iosize;
}
out:
return ret;
}
static inline void contiguous_readpages(struct page *pages[], int nr_pages,
u64 start, u64 end,
struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl,
u64 *prev_em_start)
{
struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
int index;
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) { btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
static void update_nr_written(struct writeback_control *wbc,
unsigned long nr_written)
{
wbc->nr_to_write -= nr_written;
}
/*
* helper for __extent_writepage, doing all of the delayed allocation setup.
*
* This returns 1 if btrfs_run_delalloc_range function did all the work required
* to write the page (copy into inline extent). In this case the IO has
* been started and the page is already unlocked.
*
* This returns 0 if all went well (page still locked)
* This returns < 0 if there were errors (page still locked)
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc,
u64 delalloc_start, unsigned long *nr_written)
{
u64 page_end = delalloc_start + PAGE_SIZE - 1;
bool found;
u64 delalloc_to_write = 0;
u64 delalloc_end = 0;
int ret;
int page_started = 0;
while (delalloc_end < page_end) { found = find_lock_delalloc_range(&inode->vfs_inode, page,
&delalloc_start,
&delalloc_end);
if (!found) {
delalloc_start = delalloc_end + 1;
continue;
}
ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
delalloc_end, &page_started, nr_written, wbc);
if (ret) {
btrfs_page_set_error(inode->root->fs_info, page,
page_offset(page), PAGE_SIZE);
return ret;
}
/*
* delalloc_end is already one less than the total length, so
* we don't subtract one from PAGE_SIZE
*/
delalloc_to_write += (delalloc_end - delalloc_start +
PAGE_SIZE) >> PAGE_SHIFT;
delalloc_start = delalloc_end + 1;
}
if (wbc->nr_to_write < delalloc_to_write) {
int thresh = 8192;
if (delalloc_to_write < thresh * 2)
thresh = delalloc_to_write;
wbc->nr_to_write = min_t(u64, delalloc_to_write,
thresh);
}
/* did the fill delalloc function already unlock and start
* the IO?
*/
if (page_started) {
/*
* we've unlocked the page, so we can't update
* the mapping's writeback index, just update
* nr_to_write.
*/
wbc->nr_to_write -= *nr_written;
return 1;
}
return 0;
}
/*
* Find the first byte we need to write.
*
* For subpage, one page can contain several sectors, and
* __extent_writepage_io() will just grab all extent maps in the page
* range and try to submit all non-inline/non-compressed extents.
*
* This is a big problem for subpage, we shouldn't re-submit already written
* data at all.
* This function will lookup subpage dirty bit to find which range we really
* need to submit.
*
* Return the next dirty range in [@start, @end).
* If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
*/
static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
struct page *page, u64 *start, u64 *end)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
u64 orig_start = *start;
/* Declare as unsigned long so we can use bitmap ops */
unsigned long dirty_bitmap;
unsigned long flags;
int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits;
int range_start_bit = nbits;
int range_end_bit;
/*
* For regular sector size == page size case, since one page only
* contains one sector, we return the page offset directly.
*/
if (fs_info->sectorsize == PAGE_SIZE) {
*start = page_offset(page);
*end = page_offset(page) + PAGE_SIZE;
return;
}
/* We should have the page locked, but just in case */
spin_lock_irqsave(&subpage->lock, flags);
dirty_bitmap = subpage->dirty_bitmap;
spin_unlock_irqrestore(&subpage->lock, flags);
bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit,
BTRFS_SUBPAGE_BITMAP_SIZE);
*start = page_offset(page) + range_start_bit * fs_info->sectorsize;
*end = page_offset(page) + range_end_bit * fs_info->sectorsize;
}
/*
* helper for __extent_writepage. This calls the writepage start hooks,
* and does the loop to map the page into extents and bios.
*
* We return 1 if the IO is started and the page is unlocked,
* 0 if all went well (page still locked)
* < 0 if there were errors (page still locked)
*/
static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
struct page *page,
struct writeback_control *wbc,
struct extent_page_data *epd,
loff_t i_size,
unsigned long nr_written,
int *nr_ret)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 cur = page_offset(page);
u64 end = cur + PAGE_SIZE - 1;
u64 extent_offset;
u64 block_start;
struct extent_map *em;
int ret = 0;
int nr = 0;
u32 opf = REQ_OP_WRITE;
const unsigned int write_flags = wbc_to_write_flags(wbc);
bool compressed;
ret = btrfs_writepage_cow_fixup(page);
if (ret) {
/* Fixup worker will requeue */
redirty_page_for_writepage(wbc, page);
update_nr_written(wbc, nr_written);
unlock_page(page);
return 1;
}
/*
* we don't want to touch the inode after unlocking the page,
* so we update the mapping writeback index now
*/
update_nr_written(wbc, nr_written + 1); while (cur <= end) {
u64 disk_bytenr;
u64 em_end;
u64 dirty_range_start = cur;
u64 dirty_range_end;
u32 iosize;
if (cur >= i_size) { btrfs_writepage_endio_finish_ordered(inode, page, cur,
end, true);
/*
* This range is beyond i_size, thus we don't need to
* bother writing back.
* But we still need to clear the dirty subpage bit, or
* the next time the page gets dirtied, we will try to
* writeback the sectors with subpage dirty bits,
* causing writeback without ordered extent.
*/
btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
break;
}
find_next_dirty_byte(fs_info, page, &dirty_range_start,
&dirty_range_end);
if (cur < dirty_range_start) {
cur = dirty_range_start;
continue;
}
em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
if (IS_ERR_OR_NULL(em)) {
btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
break;
}
extent_offset = cur - em->start;
em_end = extent_map_end(em);
ASSERT(cur <= em_end);
ASSERT(cur < end);
ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
block_start = em->block_start;
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
disk_bytenr = em->block_start + extent_offset;
/*
* Note that em_end from extent_map_end() and dirty_range_end from
* find_next_dirty_byte() are all exclusive
*/
iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
if (btrfs_use_zone_append(inode, em->block_start))
opf = REQ_OP_ZONE_APPEND;
free_extent_map(em);
em = NULL;
/*
* compressed and inline extents are written through other
* paths in the FS
*/
if (compressed || block_start == EXTENT_MAP_HOLE ||
block_start == EXTENT_MAP_INLINE) {
if (compressed)
nr++;
else
btrfs_writepage_endio_finish_ordered(inode,
page, cur, cur + iosize - 1, true);
btrfs_page_clear_dirty(fs_info, page, cur, iosize);
cur += iosize;
continue;
}
btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
btrfs_err(inode->root->fs_info,
"page %lu not writeback, cur %llu end %llu",
page->index, cur, end);
}
/*
* Although the PageDirty bit is cleared before entering this
* function, subpage dirty bit is not cleared.
* So clear subpage dirty bit here so next time we won't submit
* page for range already written to disk.
*/
btrfs_page_clear_dirty(fs_info, page, cur, iosize);
ret = submit_extent_page(opf | write_flags, wbc,
&epd->bio_ctrl, page,
disk_bytenr, iosize,
cur - page_offset(page),
end_bio_extent_writepage,
0, 0, false);
if (ret) {
btrfs_page_set_error(fs_info, page, cur, iosize);
if (PageWriteback(page))
btrfs_page_clear_writeback(fs_info, page, cur,
iosize);
}
cur += iosize;
nr++;
}
/*
* If we finish without problem, we should not only clear page dirty,
* but also empty subpage dirty bits
*/
if (!ret) btrfs_page_assert_not_dirty(fs_info, page); *nr_ret = nr; return ret;
}
/*
* the writepage semantics are similar to regular writepage. extent
* records are inserted to lock ranges in the tree, and as dirty areas
* are found, they are marked writeback. Then the lock bits are removed
* and the end_io handler clears the writeback ranges
*
* Return 0 if everything goes well.
* Return <0 for error.
*/
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct extent_page_data *epd)
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
u64 page_end = start + PAGE_SIZE - 1;
int ret;
int nr = 0;
size_t pg_offset;
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
unsigned long nr_written = 0;
trace___extent_writepage(page, inode, wbc);
WARN_ON(!PageLocked(page));
btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
page_offset(page), PAGE_SIZE); pg_offset = offset_in_page(i_size); if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
return 0;
}
if (page->index == end_index) {
memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
flush_dcache_page(page);
}
ret = set_page_extent_mapped(page);
if (ret < 0) {
SetPageError(page);
goto done;
}
if (!epd->extent_locked) { ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
&nr_written);
if (ret == 1)
return 0;
if (ret)
goto done;
}
ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
nr_written, &nr);
if (ret == 1)
return 0;
done:
if (nr == 0) {
/* make sure the mapping tag for page dirty gets cleared */
set_page_writeback(page);
end_page_writeback(page);
}
/*
* Here we used to have a check for PageError() and then set @ret and
* call end_extent_writepage().
*
* But in fact setting @ret here will cause different error paths
* between subpage and regular sectorsize.
*
* For regular page size, we never submit current page, but only add
* current page to current bio.
* The bio submission can only happen in next page.
* Thus if we hit the PageError() branch, @ret is already set to
* non-zero value and will not get updated for regular sectorsize.
*
* But for subpage case, it's possible we submit part of current page,
* thus can get PageError() set by submitted bio of the same page,
* while our @ret is still 0.
*
* So here we unify the behavior and don't set @ret.
* Error can still be properly passed to higher layer as page will
* be set error, here we just don't handle the IO failure.
*
* NOTE: This is just a hotfix for subpage.
* The root fix will be properly ending ordered extent when we hit
* an error during writeback.
*
* But that needs a bigger refactoring, as we not only need to grab the
* submitted OE, but also need to know exactly at which bytenr we hit
* the error.
* Currently the full page based __extent_writepage_io() is not
* capable of that.
*/
if (PageError(page))
end_extent_writepage(page, ret, start, page_end);
unlock_page(page);
ASSERT(ret <= 0);
return ret;
}
void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
{
wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
TASK_UNINTERRUPTIBLE);
}
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
smp_mb__after_atomic();
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
/*
* Lock extent buffer status and pages for writeback.
*
* May try to flush write bio if we can't get the lock.
*
* Return 0 if the extent buffer doesn't need to be submitted.
* (E.g. the extent buffer is not dirty)
* Return >0 is the extent buffer is submitted to bio.
* Return <0 if something went wrong, no page is locked.
*/
static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
struct extent_page_data *epd)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int i, num_pages, failed_page_nr;
int flush = 0;
int ret = 0;
if (!btrfs_try_tree_write_lock(eb)) {
ret = flush_write_bio(epd);
if (ret < 0)
return ret;
flush = 1;
btrfs_tree_lock(eb);
}
if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { btrfs_tree_unlock(eb);
if (!epd->sync_io)
return 0; if (!flush) {
ret = flush_write_bio(epd);
if (ret < 0)
return ret;
flush = 1;
}
while (1) {
wait_on_extent_buffer_writeback(eb);
btrfs_tree_lock(eb);
if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
break;
btrfs_tree_unlock(eb);
}
}
/*
* We need to do this to prevent races in people who check if the eb is
* under IO since we can end up having no IO bits set for a short period
* of time.
*/
spin_lock(&eb->refs_lock);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
spin_unlock(&eb->refs_lock);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
-eb->len,
fs_info->dirty_metadata_batch);
ret = 1;
} else {
spin_unlock(&eb->refs_lock);
}
btrfs_tree_unlock(eb);
/*
* Either we don't need to submit any tree block, or we're submitting
* subpage eb.
* Subpage metadata doesn't use page locking at all, so we can skip
* the page locking.
*/
if (!ret || fs_info->sectorsize < PAGE_SIZE)
return ret;
num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i];
if (!trylock_page(p)) {
if (!flush) {
int err;
err = flush_write_bio(epd);
if (err < 0) {
ret = err;
failed_page_nr = i;
goto err_unlock;
}
flush = 1;
}
lock_page(p);
}
}
return ret;
err_unlock:
/* Unlock already locked pages */
for (i = 0; i < failed_page_nr; i++) unlock_page(eb->pages[i]);
/*
* Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
* Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
* be made and undo everything done before.
*/
btrfs_tree_lock(eb);
spin_lock(&eb->refs_lock);
set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
end_extent_buffer_writeback(eb);
spin_unlock(&eb->refs_lock);
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
fs_info->dirty_metadata_batch);
btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
btrfs_tree_unlock(eb);
return ret;
}
static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
btrfs_page_set_error(fs_info, page, eb->start, eb->len);
if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
return;
/*
* A read may stumble upon this buffer later, make sure that it gets an
* error and knows there was an error.
*/
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
/*
* If we error out, we should add back the dirty_metadata_bytes
* to make it consistent.
*/
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
eb->len, fs_info->dirty_metadata_batch);
/*
* If writeback for a btree extent that doesn't belong to a log tree
* failed, increment the counter transaction->eb_write_errors.
* We do this because while the transaction is running and before it's
* committing (when we call filemap_fdata[write|wait]_range against
* the btree inode), we might have
* btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
* returns an error or an error happens during writeback, when we're
* committing the transaction we wouldn't know about it, since the pages
* can be no longer dirty nor marked anymore for writeback (if a
* subsequent modification to the extent buffer didn't happen before the
* transaction commit), which makes filemap_fdata[write|wait]_range not
* able to find the pages tagged with SetPageError at transaction
* commit time. So if this happens we must abort the transaction,
* otherwise we commit a super block with btree roots that point to
* btree nodes/leafs whose content on disk is invalid - either garbage
* or the content of some node/leaf from a past generation that got
* cowed or deleted and is no longer valid.
*
* Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
* not be enough - we need to distinguish between log tree extents vs
* non-log tree extents, and the next filemap_fdatawait_range() call
* will catch and clear such errors in the mapping - and that call might
* be from a log sync and not from a transaction commit. Also, checking
* for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
* not done and would not be reliable - the eb might have been released
* from memory and reading it back again means that flag would not be
* set (since it's a runtime flag, not persisted on disk).
*
* Using the flags below in the btree inode also makes us achieve the
* goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
* writeback for all dirty pages and before filemap_fdatawait_range()
* is called, the writeback for all dirty pages had already finished
* with errors - because we were not using AS_EIO/AS_ENOSPC,
* filemap_fdatawait_range() would return success, as it could not know
* that writeback errors happened (the pages were no longer tagged for
* writeback).
*/
switch (eb->log_index) {
case -1:
set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
break;
case 0:
set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
break;
case 1:
set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
break;
default:
BUG(); /* unexpected, logic error */
}
}
/*
* The endio specific version which won't touch any unsafe spinlock in endio
* context.
*/
static struct extent_buffer *find_extent_buffer_nolock(
struct btrfs_fs_info *fs_info, u64 start)
{
struct extent_buffer *eb;
rcu_read_lock();
eb = radix_tree_lookup(&fs_info->buffer_radix,
start >> fs_info->sectorsize_bits);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
return eb;
}
rcu_read_unlock();
return NULL;
}
/*
* The endio function for subpage extent buffer write.
*
* Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
* after all extent buffers in the page has finished their writeback.
*/
static void end_bio_subpage_eb_writepage(struct bio *bio)
{
struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
ASSERT(fs_info->sectorsize < PAGE_SIZE);
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
u64 bvec_start = page_offset(page) + bvec->bv_offset;
u64 bvec_end = bvec_start + bvec->bv_len - 1;
u64 cur_bytenr = bvec_start;
ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
/* Iterate through all extent buffers in the range */
while (cur_bytenr <= bvec_end) {
struct extent_buffer *eb;
int done;
/*
* Here we can't use find_extent_buffer(), as it may
* try to lock eb->refs_lock, which is not safe in endio
* context.
*/
eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
ASSERT(eb);
cur_bytenr = eb->start + eb->len;
ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
done = atomic_dec_and_test(&eb->io_pages);
ASSERT(done);
if (bio->bi_status ||
test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
set_btree_ioerr(page, eb);
}
btrfs_subpage_clear_writeback(fs_info, page, eb->start,
eb->len);
end_extent_buffer_writeback(eb);
/*
* free_extent_buffer() will grab spinlock which is not
* safe in endio context. Thus here we manually dec
* the ref.
*/
atomic_dec(&eb->refs);
}
}
bio_put(bio);
}
static void end_bio_extent_buffer_writepage(struct bio *bio)
{
struct bio_vec *bvec;
struct extent_buffer *eb;
int done;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
eb = (struct extent_buffer *)page->private;
BUG_ON(!eb);
done = atomic_dec_and_test(&eb->io_pages);
if (bio->bi_status ||
test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
set_btree_ioerr(page, eb);
}
end_page_writeback(page);
if (!done)
continue;
end_extent_buffer_writeback(eb);
}
bio_put(bio);
}
static void prepare_eb_write(struct extent_buffer *eb)
{
u32 nritems;
unsigned long start;
unsigned long end;
clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
atomic_set(&eb->io_pages, num_extent_pages(eb));
/* Set btree blocks beyond nritems with 0 to avoid stale content */
nritems = btrfs_header_nritems(eb);
if (btrfs_header_level(eb) > 0) {
end = btrfs_node_key_ptr_offset(nritems);
memzero_extent_buffer(eb, end, eb->len - end);
} else {
/*
* Leaf:
* header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
*/
start = btrfs_item_nr_offset(nritems); end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
memzero_extent_buffer(eb, start, end - start);
}
}
/*
* Unlike the work in write_one_eb(), we rely completely on extent locking.
* Page locking is only utilized at minimum to keep the VMM code happy.
*/
static int write_one_subpage_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page = eb->pages[0];
unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
bool no_dirty_ebs = false;
int ret;
prepare_eb_write(eb);
/* clear_page_dirty_for_io() in subpage helper needs page locked */
lock_page(page);
btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
/* Check if this is the last dirty bit to update nr_written */
no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
eb->start, eb->len);
if (no_dirty_ebs)
clear_page_dirty_for_io(page);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
&epd->bio_ctrl, page, eb->start, eb->len,
eb->start - page_offset(page),
end_bio_subpage_eb_writepage, 0, 0, false);
if (ret) {
btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
set_btree_ioerr(page, eb);
unlock_page(page);
if (atomic_dec_and_test(&eb->io_pages))
end_extent_buffer_writeback(eb);
return -EIO;
}
unlock_page(page);
/*
* Submission finished without problem, if no range of the page is
* dirty anymore, we have submitted a page. Update nr_written in wbc.
*/
if (no_dirty_ebs)
update_nr_written(wbc, 1);
return ret;
}
static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
u64 disk_bytenr = eb->start;
int i, num_pages;
unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
int ret = 0;
prepare_eb_write(eb);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i];
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
&epd->bio_ctrl, p, disk_bytenr,
PAGE_SIZE, 0,
end_bio_extent_buffer_writepage,
0, 0, false);
if (ret) {
set_btree_ioerr(p, eb);
if (PageWriteback(p))
end_page_writeback(p); if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
end_extent_buffer_writeback(eb);
ret = -EIO;
break;
}
disk_bytenr += PAGE_SIZE;
update_nr_written(wbc, 1);
unlock_page(p);
}
if (unlikely(ret)) {
for (; i < num_pages; i++) { struct page *p = eb->pages[i];
clear_page_dirty_for_io(p);
unlock_page(p);
}
}
return ret;
}
/*
* Submit one subpage btree page.
*
* The main difference to submit_eb_page() is:
* - Page locking
* For subpage, we don't rely on page locking at all.
*
* - Flush write bio
* We only flush bio if we may be unable to fit current extent buffers into
* current bio.
*
* Return >=0 for the number of submitted extent buffers.
* Return <0 for fatal error.
*/
static int submit_eb_subpage(struct page *page,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
int submitted = 0;
u64 page_start = page_offset(page);
int bit_start = 0;
const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
int ret;
/* Lock and write each dirty extent buffers in the range */
while (bit_start < nbits) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
struct extent_buffer *eb;
unsigned long flags;
u64 start;
/*
* Take private lock to ensure the subpage won't be detached
* in the meantime.
*/
spin_lock(&page->mapping->private_lock);
if (!PagePrivate(page)) {
spin_unlock(&page->mapping->private_lock);
break;
}
spin_lock_irqsave(&subpage->lock, flags);
if (!((1 << bit_start) & subpage->dirty_bitmap)) {
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&page->mapping->private_lock);
bit_start++;
continue;
}
start = page_start + bit_start * fs_info->sectorsize;
bit_start += sectors_per_node;
/*
* Here we just want to grab the eb without touching extra
* spin locks, so call find_extent_buffer_nolock().
*/
eb = find_extent_buffer_nolock(fs_info, start);
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&page->mapping->private_lock);
/*
* The eb has already reached 0 refs thus find_extent_buffer()
* doesn't return it. We don't need to write back such eb
* anyway.
*/
if (!eb)
continue;
ret = lock_extent_buffer_for_io(eb, epd);
if (ret == 0) {
free_extent_buffer(eb);
continue;
}
if (ret < 0) {
free_extent_buffer(eb);
goto cleanup;
}
ret = write_one_subpage_eb(eb, wbc, epd);
free_extent_buffer(eb);
if (ret < 0)
goto cleanup;
submitted++;
}
return submitted;
cleanup:
/* We hit error, end bio for the submitted extent buffers */
end_write_bio(epd, ret);
return ret;
}
/*
* Submit all page(s) of one extent buffer.
*
* @page: the page of one extent buffer
* @eb_context: to determine if we need to submit this page, if current page
* belongs to this eb, we don't need to submit
*
* The caller should pass each page in their bytenr order, and here we use
* @eb_context to determine if we have submitted pages of one extent buffer.
*
* If we have, we just skip until we hit a new page that doesn't belong to
* current @eb_context.
*
* If not, we submit all the page(s) of the extent buffer.
*
* Return >0 if we have submitted the extent buffer successfully.
* Return 0 if we don't need to submit the page, as it's already submitted by
* previous call.
* Return <0 for fatal error.
*/
static int submit_eb_page(struct page *page, struct writeback_control *wbc,
struct extent_page_data *epd,
struct extent_buffer **eb_context)
{
struct address_space *mapping = page->mapping;
struct btrfs_block_group *cache = NULL;
struct extent_buffer *eb;
int ret;
if (!PagePrivate(page))
return 0;
if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
return submit_eb_subpage(page, wbc, epd);
spin_lock(&mapping->private_lock);
if (!PagePrivate(page)) {
spin_unlock(&mapping->private_lock);
return 0;
}
eb = (struct extent_buffer *)page->private;
/*
* Shouldn't happen and normally this would be a BUG_ON but no point
* crashing the machine for something we can survive anyway.
*/
if (WARN_ON(!eb)) {
spin_unlock(&mapping->private_lock);
return 0;
}
if (eb == *eb_context) {
spin_unlock(&mapping->private_lock);
return 0;
}
ret = atomic_inc_not_zero(&eb->refs);
spin_unlock(&mapping->private_lock);
if (!ret)
return 0;
if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
/*
* If for_sync, this hole will be filled with
* trasnsaction commit.
*/
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
ret = -EAGAIN;
else
ret = 0;
free_extent_buffer(eb);
return ret;
}
*eb_context = eb;
ret = lock_extent_buffer_for_io(eb, epd);
if (ret <= 0) {
btrfs_revert_meta_write_pointer(cache, eb);
if (cache)
btrfs_put_block_group(cache);
free_extent_buffer(eb);
return ret;
}
if (cache)
btrfs_put_block_group(cache);
ret = write_one_eb(eb, wbc, epd);
free_extent_buffer(eb);
if (ret < 0)
return ret;
return 1;
}
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct extent_buffer *eb_context = NULL;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
struct pagevec pvec;
int nr_pages;
pgoff_t index;
pgoff_t end; /* Inclusive */
int scanned = 0;
xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
/*
* Start from the beginning does not need to cycle over the
* range, mark it as scanned.
*/
scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
scanned = 1;
}
if (wbc->sync_mode == WB_SYNC_ALL)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
btrfs_zoned_meta_io_lock(fs_info);
retry:
if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
tag))) {
unsigned i;
for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i];
ret = submit_eb_page(page, wbc, &epd, &eb_context);
if (ret == 0)
continue;
if (ret < 0) {
done = 1;
break;
}
/*
* the filesystem may choose to bump up nr_to_write.
* We have to make sure to honor the new nr_to_write
* at any time
*/
nr_to_write_done = wbc->nr_to_write <= 0;
}
pagevec_release(&pvec);
cond_resched();
}
if (!scanned && !done) {
/*
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
*/
scanned = 1;
index = 0;
goto retry;
}
if (ret < 0) { end_write_bio(&epd, ret);
goto out;
}
/*
* If something went wrong, don't allow any metadata write bio to be
* submitted.
*
* This would prevent use-after-free if we had dirty pages not
* cleaned up, which can still happen by fuzzed images.
*
* - Bad extent tree
* Allowing existing tree block to be allocated for other trees.
*
* - Log tree operations
* Exiting tree blocks get allocated to log tree, bumps its
* generation, then get cleaned in tree re-balance.
* Such tree block will not be written back, since it's clean,
* thus no WRITTEN flag set.
* And after log writes back, this tree block is not traced by
* any dirty extent_io_tree.
*
* - Offending tree block gets re-dirtied from its original owner
* Since it has bumped generation, no WRITTEN flag, it can be
* reused without COWing. This tree block will not be traced
* by btrfs_transaction::dirty_pages.
*
* Now such dirty tree block will not be cleaned by any dirty
* extent io tree. Thus we don't want to submit such wild eb
* if the fs already has error.
*/
if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
ret = flush_write_bio(&epd);
} else {
ret = -EROFS;
end_write_bio(&epd, ret);
}
out:
btrfs_zoned_meta_io_unlock(fs_info);
return ret;
}
/**
* Walk the list of dirty pages of the given address space and write all of them.
*
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
* @epd: holds context for the write, namely the bio
*
* If a page is already under I/O, write_cache_pages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
* and msync() need to guarantee that all the data which was dirty at the time
* the call was made get new I/O started against them. If wbc->sync_mode is
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete.
*/
static int extent_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
struct inode *inode = mapping->host;
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
struct pagevec pvec;
int nr_pages;
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
int range_whole = 0;
int scanned = 0;
xa_mark_t tag;
/*
* We have to hold onto the inode so that ordered extents can do their
* work when the IO finishes. The alternative to this is failing to add
* an ordered extent if the igrab() fails there and that is a huge pain
* to deal with, so instead just hold onto the inode throughout the
* writepages operation. If it fails here we are freeing up the inode
* anyway and we'd rather not waste our time writing out stuff that is
* going to be truncated anyway.
*/
if (!igrab(inode))
return 0;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
/*
* Start from the beginning does not need to cycle over the
* range, mark it as scanned.
*/
scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
scanned = 1;
}
/*
* We do the tagged writepage as long as the snapshot flush bit is set
* and we are the first one who do the filemap_flush() on this inode.
*
* The nr_to_write == LONG_MAX is needed to make sure other flushers do
* not race in and drop the bit.
*/
if (range_whole && wbc->nr_to_write == LONG_MAX &&
test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
&BTRFS_I(inode)->runtime_flags)) wbc->tagged_writepages = 1; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
&index, end, tag))) {
unsigned i;
for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i];
done_index = page->index + 1;
/*
* At this point we hold neither the i_pages lock nor
* the page lock: the page may be truncated or
* invalidated (changing page->mapping to NULL),
* or even swizzled back from swapper_space to
* tmpfs file mapping
*/
if (!trylock_page(page)) {
ret = flush_write_bio(epd);
BUG_ON(ret < 0);
lock_page(page);
}
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
continue;
}
if (wbc->sync_mode != WB_SYNC_NONE) {
if (PageWriteback(page)) {
ret = flush_write_bio(epd);
BUG_ON(ret < 0);
}
wait_on_page_writeback(page);
}
if (PageWriteback(page) ||
!clear_page_dirty_for_io(page)) { unlock_page(page);
continue;
}
ret = __extent_writepage(page, wbc, epd);
if (ret < 0) {
done = 1;
break;
}
/*
* the filesystem may choose to bump up nr_to_write.
* We have to make sure to honor the new nr_to_write
* at any time
*/
nr_to_write_done = wbc->nr_to_write <= 0;
}
pagevec_release(&pvec);
cond_resched();
}
if (!scanned && !done) {
/*
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
*/
scanned = 1;
index = 0;
/*
* If we're looping we could run into a page that is locked by a
* writer and that writer could be waiting on writeback for a
* page in our current bio, and thus deadlock, so flush the
* write bio here.
*/
ret = flush_write_bio(epd);
if (!ret)
goto retry;
}
if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) mapping->writeback_index = done_index; btrfs_add_delayed_iput(inode);
return ret;
}
int extent_write_full_page(struct page *page, struct writeback_control *wbc)
{
int ret;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
ret = __extent_writepage(page, wbc, &epd);
ASSERT(ret <= 0);
if (ret < 0) {
end_write_bio(&epd, ret);
return ret;
}
ret = flush_write_bio(&epd);
ASSERT(ret <= 0);
return ret;
}
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode)
{
int ret = 0;
struct address_space *mapping = inode->i_mapping;
struct page *page;
unsigned long nr_pages = (end - start + PAGE_SIZE) >>
PAGE_SHIFT;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
};
struct writeback_control wbc_writepages = {
.sync_mode = mode,
.nr_to_write = nr_pages * 2,
.range_start = start,
.range_end = end + 1,
/* We're called from an async helper function */
.punt_to_cgroup = 1,
.no_cgroup_owner = 1,
};
wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
while (start <= end) {
page = find_get_page(mapping, start >> PAGE_SHIFT);
if (clear_page_dirty_for_io(page))
ret = __extent_writepage(page, &wbc_writepages, &epd);
else {
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
page, start, start + PAGE_SIZE - 1, true);
unlock_page(page);
}
put_page(page);
start += PAGE_SIZE;
}
ASSERT(ret <= 0);
if (ret == 0)
ret = flush_write_bio(&epd);
else
end_write_bio(&epd, ret);
wbc_detach_inode(&wbc_writepages);
return ret;
}
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root);
const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info);
int ret = 0;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
/*
* Allow only a single thread to do the reloc work in zoned mode to
* protect the write pointer updates.
*/
if (data_reloc && zoned) btrfs_inode_lock(inode, 0); ret = extent_write_cache_pages(mapping, wbc, &epd);
if (data_reloc && zoned)
btrfs_inode_unlock(inode, 0);
ASSERT(ret <= 0);
if (ret < 0) { end_write_bio(&epd, ret);
return ret;
}
ret = flush_write_bio(&epd);
return ret;
}
void extent_readahead(struct readahead_control *rac)
{
struct btrfs_bio_ctrl bio_ctrl = { 0 };
struct page *pagepool[16];
struct extent_map *em_cached = NULL;
u64 prev_em_start = (u64)-1;
int nr;
while ((nr = readahead_page_batch(rac, pagepool))) {
u64 contig_start = readahead_pos(rac);
u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
contiguous_readpages(pagepool, nr, contig_start, contig_end,
&em_cached, &bio_ctrl, &prev_em_start);
}
if (em_cached) free_extent_map(em_cached); if (bio_ctrl.bio) { if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags))
return;
}
}
/*
* basic invalidatepage code, this waits on any locked or writeback
* ranges corresponding to the page, and then deletes any extent state
* records from the tree
*/
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset)
{
struct extent_state *cached_state = NULL;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
size_t blocksize = page->mapping->host->i_sb->s_blocksize;
/* This function is only called for the btree inode */
ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
start += ALIGN(offset, blocksize);
if (start > end)
return 0;
lock_extent_bits(tree, start, end, &cached_state);
wait_on_page_writeback(page);
/*
* Currently for btree io tree, only EXTENT_LOCKED is utilized,
* so here we only need to unlock the extent range to free any
* existing extent state.
*/
unlock_extent_cached(tree, start, end, &cached_state);
return 0;
}
/*
* a helper for releasepage, this tests for areas of the page that
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
*/
static int try_release_extent_state(struct extent_io_tree *tree,
struct page *page, gfp_t mask)
{
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
int ret = 1;
if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
ret = 0;
} else {
/*
* At this point we can safely clear everything except the
* locked bit, the nodatasum bit and the delalloc new bit.
* The delalloc new bit will be cleared by ordered extent
* completion.
*/
ret = __clear_extent_bit(tree, start, end,
~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
0, 0, NULL, mask, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
*/
if (ret < 0)
ret = 0;
else
ret = 1;
}
return ret;
}
/*
* a helper for releasepage. As long as there are no locked extents
* in the range corresponding to the page, both state records and extent
* map records are removed
*/
int try_release_extent_mapping(struct page *page, gfp_t mask)
{
struct extent_map *em;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
struct extent_io_tree *tree = &btrfs_inode->io_tree;
struct extent_map_tree *map = &btrfs_inode->extent_tree;
if (gfpflags_allow_blocking(mask) &&
page->mapping->host->i_size > SZ_16M) {
u64 len;
while (start <= end) {
struct btrfs_fs_info *fs_info;
u64 cur_gen;
len = end - start + 1;
write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
if (!em) {
write_unlock(&map->lock);
break;
}
if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || em->start != start) {
write_unlock(&map->lock);
free_extent_map(em);
break;
}
if (test_range_bit(tree, em->start, extent_map_end(em) - 1,
EXTENT_LOCKED, 0, NULL))
goto next;
/*
* If it's not in the list of modified extents, used
* by a fast fsync, we can remove it. If it's being
* logged we can safely remove it since fsync took an
* extra reference on the em.
*/
if (list_empty(&em->list) ||
test_bit(EXTENT_FLAG_LOGGING, &em->flags))
goto remove_em;
/*
* If it's in the list of modified extents, remove it
* only if its generation is older then the current one,
* in which case we don't need it for a fast fsync.
* Otherwise don't remove it, we could be racing with an
* ongoing fast fsync that could miss the new extent.
*/
fs_info = btrfs_inode->root->fs_info;
spin_lock(&fs_info->trans_lock);
cur_gen = fs_info->generation;
spin_unlock(&fs_info->trans_lock);
if (em->generation >= cur_gen)
goto next;
remove_em:
/*
* We only remove extent maps that are not in the list of
* modified extents or that are in the list but with a
* generation lower then the current generation, so there
* is no need to set the full fsync flag on the inode (it
* hurts the fsync performance for workloads with a data
* size that exceeds or is close to the system's memory).
*/
remove_extent_mapping(map, em);
/* once for the rb tree */
free_extent_map(em);
next:
start = extent_map_end(em);
write_unlock(&map->lock);
/* once for us */
free_extent_map(em);
cond_resched(); /* Allow large-extent preemption. */
}
}
return try_release_extent_state(tree, page, mask);
}
/*
* helper function for fiemap, which doesn't want to see any holes.
* This maps until we find something past 'last'
*/
static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
u64 offset, u64 last)
{
u64 sectorsize = btrfs_inode_sectorsize(inode);
struct extent_map *em;
u64 len;
if (offset >= last)
return NULL;
while (1) {
len = last - offset;
if (len == 0)
break;
len = ALIGN(len, sectorsize);
em = btrfs_get_extent_fiemap(inode, offset, len);
if (IS_ERR_OR_NULL(em))
return em;
/* if this isn't a hole return it */
if (em->block_start != EXTENT_MAP_HOLE)
return em;
/* this is a hole, advance to the next extent */
offset = extent_map_end(em);
free_extent_map(em);
if (offset >= last)
break;
}
return NULL;
}
/*
* To cache previous fiemap extent
*
* Will be used for merging fiemap extent
*/
struct fiemap_cache {
u64 offset;
u64 phys;
u64 len;
u32 flags;
bool cached;
};
/*
* Helper to submit fiemap extent.
*
* Will try to merge current fiemap extent specified by @offset, @phys,
* @len and @flags with cached one.
* And only when we fails to merge, cached one will be submitted as
* fiemap extent.
*
* Return value is the same as fiemap_fill_next_extent().
*/
static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
struct fiemap_cache *cache,
u64 offset, u64 phys, u64 len, u32 flags)
{
int ret = 0;
if (!cache->cached)
goto assign;
/*
* Sanity check, extent_fiemap() should have ensured that new
* fiemap extent won't overlap with cached one.
* Not recoverable.
*
* NOTE: Physical address can overlap, due to compression
*/
if (cache->offset + cache->len > offset) {
WARN_ON(1);
return -EINVAL;
}
/*
* Only merges fiemap extents if
* 1) Their logical addresses are continuous
*
* 2) Their physical addresses are continuous
* So truly compressed (physical size smaller than logical size)
* extents won't get merged with each other
*
* 3) Share same flags except FIEMAP_EXTENT_LAST
* So regular extent won't get merged with prealloc extent
*/
if (cache->offset + cache->len == offset &&
cache->phys + cache->len == phys &&
(cache->flags & ~FIEMAP_EXTENT_LAST) ==
(flags & ~FIEMAP_EXTENT_LAST)) {
cache->len += len;
cache->flags |= flags;
goto try_submit_last;
}
/* Not mergeable, need to submit cached one */
ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
cache->len, cache->flags);
cache->cached = false;
if (ret)
return ret;
assign:
cache->cached = true;
cache->offset = offset;
cache->phys = phys;
cache->len = len;
cache->flags = flags;
try_submit_last:
if (cache->flags & FIEMAP_EXTENT_LAST) {
ret = fiemap_fill_next_extent(fieinfo, cache->offset,
cache->phys, cache->len, cache->flags);
cache->cached = false;
}
return ret;
}
/*
* Emit last fiemap cache
*
* The last fiemap cache may still be cached in the following case:
* 0 4k 8k
* |<- Fiemap range ->|
* |<------------ First extent ----------->|
*
* In this case, the first extent range will be cached but not emitted.
* So we must emit it before ending extent_fiemap().
*/
static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
struct fiemap_cache *cache)
{
int ret;
if (!cache->cached)
return 0;
ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
cache->len, cache->flags);
cache->cached = false;
if (ret > 0)
ret = 0;
return ret;
}
int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int ret = 0;
u64 off;
u64 max = start + len;
u32 flags = 0;
u32 found_type;
u64 last;
u64 last_for_get_extent = 0;
u64 disko = 0;
u64 isize = i_size_read(&inode->vfs_inode);
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
struct btrfs_root *root = inode->root;
struct fiemap_cache cache = { 0 };
struct ulist *roots;
struct ulist *tmp_ulist;
int end = 0;
u64 em_start = 0;
u64 em_len = 0;
u64 em_end = 0;
if (len == 0)
return -EINVAL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
roots = ulist_alloc(GFP_KERNEL);
tmp_ulist = ulist_alloc(GFP_KERNEL);
if (!roots || !tmp_ulist) {
ret = -ENOMEM;
goto out_free_ulist;
}
/*
* We can't initialize that to 'start' as this could miss extents due
* to extent item merging
*/
off = 0;
start = round_down(start, btrfs_inode_sectorsize(inode));
len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
/*
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
0);
if (ret < 0) {
goto out_free_ulist;
} else {
WARN_ON(!ret);
if (ret == 1)
ret = 0;
}
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
found_type = found_key.type;
/* No extents, but there might be delalloc bits */
if (found_key.objectid != btrfs_ino(inode) ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/* have to trust i_size as the end */
last = (u64)-1;
last_for_get_extent = isize;
} else {
/*
* remember the start of the last extent. There are a
* bunch of different factors that go into the length of the
* extent, so its much less complex to remember where it started
*/
last = found_key.offset;
last_for_get_extent = last + 1;
}
btrfs_release_path(path);
/*
* we might have some extents allocated but more delalloc past those
* extents. so, we trust isize unless the start of the last extent is
* beyond isize
*/
if (last < isize) {
last = (u64)-1;
last_for_get_extent = isize;
}
lock_extent_bits(&inode->io_tree, start, start + len - 1,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent);
if (!em)
goto out;
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
}
while (!end) {
u64 offset_in_extent = 0;
/* break if the extent we found is outside the range */
if (em->start >= max || extent_map_end(em) < off)
break;
/*
* get_extent may return an extent that starts before our
* requested range. We have to make sure the ranges
* we return to fiemap always move forward and don't
* overlap, so adjust the offsets here
*/
em_start = max(em->start, off);
/*
* record the offset from the start of the extent
* for adjusting the disk offset below. Only do this if the
* extent isn't compressed since our in ram offset may be past
* what we have actually allocated on disk.
*/
if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
offset_in_extent = em_start - em->start;
em_end = extent_map_end(em);
em_len = em_end - em_start;
flags = 0;
if (em->block_start < EXTENT_MAP_LAST_BYTE)
disko = em->block_start + offset_in_extent;
else
disko = 0;
/*
* bump off for our next call to get_extent
*/
off = extent_map_end(em);
if (off >= max)
end = 1;
if (em->block_start == EXTENT_MAP_LAST_BYTE) {
end = 1;
flags |= FIEMAP_EXTENT_LAST;
} else if (em->block_start == EXTENT_MAP_INLINE) {
flags |= (FIEMAP_EXTENT_DATA_INLINE |
FIEMAP_EXTENT_NOT_ALIGNED);
} else if (em->block_start == EXTENT_MAP_DELALLOC) {
flags |= (FIEMAP_EXTENT_DELALLOC |
FIEMAP_EXTENT_UNKNOWN);
} else if (fieinfo->fi_extents_max) {
u64 bytenr = em->block_start -
(em->start - em->orig_start);
/*
* As btrfs supports shared space, this information
* can be exported to userspace tools via
* flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
* then we're just getting a count and we can skip the
* lookup stuff.
*/
ret = btrfs_check_shared(root, btrfs_ino(inode),
bytenr, roots, tmp_ulist);
if (ret < 0)
goto out_free;
if (ret)
flags |= FIEMAP_EXTENT_SHARED;
ret = 0;
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
flags |= FIEMAP_EXTENT_UNWRITTEN;
free_extent_map(em);
em = NULL;
if ((em_start >= last) || em_len == (u64)-1 ||
(last == (u64)-1 && isize <= em_end)) {
flags |= FIEMAP_EXTENT_LAST;
end = 1;
}
/* now scan forward to see if this is really the last extent. */
em = get_extent_skip_holes(inode, off, last_for_get_extent);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
}
if (!em) {
flags |= FIEMAP_EXTENT_LAST;
end = 1;
}
ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
em_len, flags);
if (ret) {
if (ret == 1)
ret = 0;
goto out_free;
}
}
out_free:
if (!ret)
ret = emit_last_fiemap_cache(fieinfo, &cache);
free_extent_map(em);
out:
unlock_extent_cached(&inode->io_tree, start, start + len - 1,
&cached_state);
out_free_ulist:
btrfs_free_path(path);
ulist_free(roots);
ulist_free(tmp_ulist);
return ret;
}
static void __free_extent_buffer(struct extent_buffer *eb)
{
kmem_cache_free(extent_buffer_cache, eb);
}
int extent_buffer_under_io(const struct extent_buffer *eb)
{
return (atomic_read(&eb->io_pages) || test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
{
struct btrfs_subpage *subpage;
lockdep_assert_held(&page->mapping->private_lock);
if (PagePrivate(page)) {
subpage = (struct btrfs_subpage *)page->private;
if (atomic_read(&subpage->eb_refs))
return true;
/*
* Even there is no eb refs here, we may still have
* end_page_read() call relying on page::private.
*/
if (atomic_read(&subpage->readers))
return true;
}
return false;
}
static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
/*
* For mapped eb, we're going to change the page private, which should
* be done under the private_lock.
*/
if (mapped)
spin_lock(&page->mapping->private_lock);
if (!PagePrivate(page)) {
if (mapped)
spin_unlock(&page->mapping->private_lock);
return;
}
if (fs_info->sectorsize == PAGE_SIZE) {
/*
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
* and have this page now attached to the new eb. So
* only clear page_private if it's still connected to
* this eb.
*/
if (PagePrivate(page) &&
page->private == (unsigned long)eb) { BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(PageDirty(page)); BUG_ON(PageWriteback(page));
/*
* We need to make sure we haven't be attached
* to a new eb.
*/
detach_page_private(page);
}
if (mapped) spin_unlock(&page->mapping->private_lock);
return;
}
/*
* For subpage, we can have dummy eb with page private. In this case,
* we can directly detach the private as such page is only attached to
* one dummy eb, no sharing.
*/
if (!mapped) {
btrfs_detach_subpage(fs_info, page);
return;
}
btrfs_page_dec_eb_refs(fs_info, page);
/*
* We can only detach the page private if there are no other ebs in the
* page range and no unfinished IO.
*/
if (!page_range_has_eb(fs_info, page))
btrfs_detach_subpage(fs_info, page); spin_unlock(&page->mapping->private_lock);
}
/* Release all pages attached to the extent buffer */
static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
{
int i;
int num_pages;
ASSERT(!extent_buffer_under_io(eb));
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) { struct page *page = eb->pages[i];
if (!page)
continue;
detach_extent_buffer_page(eb, page);
/* One for when we allocated the page */
put_page(page);
}
}
/*
* Helper for releasing the extent buffer.
*/
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
btrfs_release_extent_buffer_pages(eb);
btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
__free_extent_buffer(eb);
}
static struct extent_buffer *
__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
unsigned long len)
{
struct extent_buffer *eb = NULL;
eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
eb->start = start;
eb->len = len;
eb->fs_info = fs_info;
eb->bflags = 0;
init_rwsem(&eb->lock);
btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
&fs_info->allocated_ebs);
INIT_LIST_HEAD(&eb->release_list);
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
atomic_set(&eb->io_pages, 0);
ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
return eb;
}
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
int i;
struct page *p;
struct extent_buffer *new;
int num_pages = num_extent_pages(src);
new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
if (new == NULL)
return NULL;
/*
* Set UNMAPPED before calling btrfs_release_extent_buffer(), as
* btrfs_release_extent_buffer() have different behavior for
* UNMAPPED subpage extent buffer.
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
for (i = 0; i < num_pages; i++) {
int ret;
p = alloc_page(GFP_NOFS);
if (!p) {
btrfs_release_extent_buffer(new);
return NULL;
}
ret = attach_extent_buffer_page(new, p, NULL);
if (ret < 0) {
put_page(p);
btrfs_release_extent_buffer(new);
return NULL;
}
WARN_ON(PageDirty(p));
new->pages[i] = p;
copy_page(page_address(p), page_address(src->pages[i]));
}
set_extent_buffer_uptodate(new);
return new;
}
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len)
{
struct extent_buffer *eb;
int num_pages;
int i;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
int ret;
eb->pages[i] = alloc_page(GFP_NOFS);
if (!eb->pages[i])
goto err;
ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
if (ret < 0)
goto err;
}
set_extent_buffer_uptodate(eb);
btrfs_set_header_nritems(eb, 0);
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
return eb;
err:
for (; i > 0; i--) {
detach_extent_buffer_page(eb, eb->pages[i - 1]);
__free_page(eb->pages[i - 1]);
}
__free_extent_buffer(eb);
return NULL;
}
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start)
{
return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
}
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
/*
* The TREE_REF bit is first set when the extent_buffer is added
* to the radix tree. It is also reset, if unset, when a new reference
* is created by find_extent_buffer.
*
* It is only cleared in two cases: freeing the last non-tree
* reference to the extent_buffer when its STALE bit is set or
* calling releasepage when the tree reference is the only reference.
*
* In both cases, care is taken to ensure that the extent_buffer's
* pages are not under io. However, releasepage can be concurrently
* called with creating new references, which is prone to race
* conditions between the calls to check_buffer_tree_ref in those
* codepaths and clearing TREE_REF in try_release_extent_buffer.
*
* The actual lifetime of the extent_buffer in the radix tree is
* adequately protected by the refcount, but the TREE_REF bit and
* its corresponding reference are not. To protect against this
* class of races, we call check_buffer_tree_ref from the codepaths
* which trigger io after they set eb->io_pages. Note that once io is
* initiated, TREE_REF can no longer be cleared, so that is the
* moment at which any such race is best fixed.
*/
refs = atomic_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
return;
spin_lock(&eb->refs_lock);
if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
atomic_inc(&eb->refs);
spin_unlock(&eb->refs_lock);
}
static void mark_extent_buffer_accessed(struct extent_buffer *eb,
struct page *accessed)
{
int num_pages, i;
check_buffer_tree_ref(eb);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i];
if (p != accessed)
mark_page_accessed(p);
}
}
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start)
{
struct extent_buffer *eb;
eb = find_extent_buffer_nolock(fs_info, start);
if (!eb)
return NULL;
/*
* Lock our eb's refs_lock to avoid races with free_extent_buffer().
* When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
* another task running free_extent_buffer() might have seen that flag
* set, eb->refs == 2, that the buffer isn't under IO (dirty and
* writeback flags not set) and it's still in the tree (flag
* EXTENT_BUFFER_TREE_REF set), therefore being in the process of
* decrementing the extent buffer's reference count twice. So here we
* could race and increment the eb's reference count, clear its stale
* flag, mark it as dirty and drop our reference before the other task
* finishes executing free_extent_buffer, which would later result in
* an attempt to free an extent buffer that is dirty.
*/
if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
spin_lock(&eb->refs_lock);
spin_unlock(&eb->refs_lock);
}
mark_extent_buffer_accessed(eb, NULL); return eb;
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start)
{
struct extent_buffer *eb, *exists = NULL;
int ret;
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
eb = alloc_dummy_extent_buffer(fs_info, start);
if (!eb)
return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
again:
ret = radix_tree_preload(GFP_NOFS);
if (ret) {
exists = ERR_PTR(ret);
goto free_eb;
}
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
start >> fs_info->sectorsize_bits, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
exists = find_extent_buffer(fs_info, start);
if (exists)
goto free_eb;
else
goto again;
}
check_buffer_tree_ref(eb);
set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
return eb;
free_eb:
btrfs_release_extent_buffer(eb);
return exists;
}
#endif
static struct extent_buffer *grab_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page)
{
struct extent_buffer *exists;
/*
* For subpage case, we completely rely on radix tree to ensure we
* don't try to insert two ebs for the same bytenr. So here we always
* return NULL and just continue.
*/
if (fs_info->sectorsize < PAGE_SIZE)
return NULL;
/* Page not yet attached to an extent buffer */
if (!PagePrivate(page))
return NULL;
/*
* We could have already allocated an eb for this page and attached one
* so lets see if we can get a ref on the existing eb, and if we can we
* know it's good and we can just return that one, else we know we can
* just overwrite page->private.
*/
exists = (struct extent_buffer *)page->private;
if (atomic_inc_not_zero(&exists->refs))
return exists;
WARN_ON(PageDirty(page));
detach_page_private(page);
return NULL;
}
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
unsigned long len = fs_info->nodesize;
int num_pages;
int i;
unsigned long index = start >> PAGE_SHIFT;
struct extent_buffer *eb;
struct extent_buffer *exists = NULL;
struct page *p;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
int uptodate = 1;
int ret;
if (!IS_ALIGNED(start, fs_info->sectorsize)) {
btrfs_err(fs_info, "bad tree block start %llu", start);
return ERR_PTR(-EINVAL);
}
#if BITS_PER_LONG == 32
if (start >= MAX_LFS_FILESIZE) {
btrfs_err_rl(fs_info,
"extent buffer %llu is beyond 32bit page cache limit", start);
btrfs_err_32bit_limit(fs_info);
return ERR_PTR(-EOVERFLOW);
}
if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
btrfs_warn_32bit_limit(fs_info);
#endif
if (fs_info->sectorsize < PAGE_SIZE && offset_in_page(start) + len > PAGE_SIZE) {
btrfs_err(fs_info,
"tree block crosses page boundary, start %llu nodesize %lu",
start, len);
return ERR_PTR(-EINVAL);
}
eb = find_extent_buffer(fs_info, start); if (eb)
return eb;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return ERR_PTR(-ENOMEM);
btrfs_set_buffer_lockdep_class(owner_root, eb, level);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++, index++) {
struct btrfs_subpage *prealloc = NULL;
p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
if (!p) {
exists = ERR_PTR(-ENOMEM);
goto free_eb;
}
/*
* Preallocate page->private for subpage case, so that we won't
* allocate memory with private_lock hold. The memory will be
* freed by attach_extent_buffer_page() or freed manually if
* we exit earlier.
*
* Although we have ensured one subpage eb can only have one
* page, but it may change in the future for 16K page size
* support, so we still preallocate the memory in the loop.
*/
ret = btrfs_alloc_subpage(fs_info, &prealloc,
BTRFS_SUBPAGE_METADATA);
if (ret < 0) {
unlock_page(p);
put_page(p);
exists = ERR_PTR(ret);
goto free_eb;
}
spin_lock(&mapping->private_lock);
exists = grab_extent_buffer(fs_info, p);
if (exists) {
spin_unlock(&mapping->private_lock);
unlock_page(p);
put_page(p);
mark_extent_buffer_accessed(exists, p);
btrfs_free_subpage(prealloc);
goto free_eb;
}
/* Should not fail, as we have preallocated the memory */
ret = attach_extent_buffer_page(eb, p, prealloc);
ASSERT(!ret);
/*
* To inform we have extra eb under allocation, so that
* detach_extent_buffer_page() won't release the page private
* when the eb hasn't yet been inserted into radix tree.
*
* The ref will be decreased when the eb released the page, in
* detach_extent_buffer_page().
* Thus needs no special handling in error path.
*/
btrfs_page_inc_eb_refs(fs_info, p);
spin_unlock(&mapping->private_lock);
WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
/*
* We can't unlock the pages just yet since the extent buffer
* hasn't been properly inserted in the radix tree, this
* opens a race with btree_releasepage which can free a page
* while we are still filling in all pages for the buffer and
* we could crash.
*/
}
if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
ret = radix_tree_preload(GFP_NOFS);
if (ret) {
exists = ERR_PTR(ret);
goto free_eb;
}
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
start >> fs_info->sectorsize_bits, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
exists = find_extent_buffer(fs_info, start);
if (exists)
goto free_eb;
else
goto again;
}
/* add one reference for the tree */
check_buffer_tree_ref(eb);
set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
/*
* Now it's safe to unlock the pages because any calls to
* btree_releasepage will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
for (i = 0; i < num_pages; i++)
unlock_page(eb->pages[i]);
return eb;
free_eb:
WARN_ON(!atomic_dec_and_test(&eb->refs)); for (i = 0; i < num_pages; i++) { if (eb->pages[i]) unlock_page(eb->pages[i]);
}
btrfs_release_extent_buffer(eb);
return exists;
}
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
struct extent_buffer *eb =
container_of(head, struct extent_buffer, rcu_head);
__free_extent_buffer(eb);
}
static int release_extent_buffer(struct extent_buffer *eb)
__releases(&eb->refs_lock)
{
lockdep_assert_held(&eb->refs_lock);
WARN_ON(atomic_read(&eb->refs) == 0);
if (atomic_dec_and_test(&eb->refs)) {
if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { struct btrfs_fs_info *fs_info = eb->fs_info;
spin_unlock(&eb->refs_lock);
spin_lock(&fs_info->buffer_lock);
radix_tree_delete(&fs_info->buffer_radix,
eb->start >> fs_info->sectorsize_bits);
spin_unlock(&fs_info->buffer_lock);
} else {
spin_unlock(&eb->refs_lock);
}
btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
__free_extent_buffer(eb);
return 1;
}
#endif
call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
return 1;
}
spin_unlock(&eb->refs_lock);
return 0;
}
void free_extent_buffer(struct extent_buffer *eb)
{
int refs;
int old;
if (!eb)
return;
while (1) {
refs = atomic_read(&eb->refs);
if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
refs == 1))
break;
old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
if (old == refs)
return;
}
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) == 2 &&
test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
!extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
atomic_dec(&eb->refs);
/*
* I know this is terrible, but it's temporary until we stop tracking
* the uptodate bits and such for the extent buffers.
*/
release_extent_buffer(eb);
}
void free_extent_buffer_stale(struct extent_buffer *eb)
{
if (!eb)
return;
spin_lock(&eb->refs_lock);
set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
atomic_dec(&eb->refs);
release_extent_buffer(eb);
}
static void btree_clear_page_dirty(struct page *page)
{
ASSERT(PageDirty(page));
ASSERT(PageLocked(page));
clear_page_dirty_for_io(page);
xa_lock_irq(&page->mapping->i_pages);
if (!PageDirty(page))
__xa_clear_mark(&page->mapping->i_pages,
page_index(page), PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&page->mapping->i_pages);
}
static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page = eb->pages[0];
bool last;
/* btree_clear_page_dirty() needs page locked */
lock_page(page);
last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
eb->len);
if (last)
btree_clear_page_dirty(page); unlock_page(page); WARN_ON(atomic_read(&eb->refs) == 0);
}
void clear_extent_buffer_dirty(const struct extent_buffer *eb)
{
int i;
int num_pages;
struct page *page;
if (eb->fs_info->sectorsize < PAGE_SIZE)
return clear_subpage_extent_buffer_dirty(eb);
num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i];
if (!PageDirty(page))
continue;
lock_page(page);
btree_clear_page_dirty(page);
ClearPageError(page);
unlock_page(page);
}
WARN_ON(atomic_read(&eb->refs) == 0);
}
bool set_extent_buffer_dirty(struct extent_buffer *eb)
{
int i;
int num_pages;
bool was_dirty;
check_buffer_tree_ref(eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
num_pages = num_extent_pages(eb);
WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); if (!was_dirty) {
bool subpage = eb->fs_info->sectorsize < PAGE_SIZE;
/*
* For subpage case, we can have other extent buffers in the
* same page, and in clear_subpage_extent_buffer_dirty() we
* have to clear page dirty without subpage lock held.
* This can cause race where our page gets dirty cleared after
* we just set it.
*
* Thankfully, clear_subpage_extent_buffer_dirty() has locked
* its page for other reasons, we can use page lock to prevent
* the above race.
*/
if (subpage)
lock_page(eb->pages[0]); for (i = 0; i < num_pages; i++)
btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
eb->start, eb->len); if (subpage) unlock_page(eb->pages[0]);
}
#ifdef CONFIG_BTRFS_DEBUG
for (i = 0; i < num_pages; i++)
ASSERT(PageDirty(eb->pages[i]));
#endif
return was_dirty;
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page;
int num_pages;
int i;
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) { page = eb->pages[i];
if (page)
btrfs_page_clear_uptodate(fs_info, page,
eb->start, eb->len);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page;
int num_pages;
int i;
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i]; btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
}
}
static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
struct page *page = eb->pages[0];
struct btrfs_bio_ctrl bio_ctrl = { 0 };
int ret = 0;
ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
ASSERT(PagePrivate(page));
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
if (wait == WAIT_NONE) {
if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
return -EAGAIN;
} else {
ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
if (ret < 0)
return ret;
}
ret = 0;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
PageUptodate(page) ||
btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
return ret;
}
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, 1);
check_buffer_tree_ref(eb);
btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
page, eb->start, eb->len,
eb->start - page_offset(page),
end_bio_extent_readpage, mirror_num, 0,
true);
if (ret) {
/*
* In the endio function, if we hit something wrong we will
* increase the io_pages, so here we need to decrease it for
* error path.
*/
atomic_dec(&eb->io_pages);
}
if (bio_ctrl.bio) {
int tmp;
tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0);
bio_ctrl.bio = NULL;
if (tmp < 0)
return tmp;
}
if (ret || wait != WAIT_COMPLETE)
return ret;
wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
ret = -EIO;
return ret;
}
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
{
int i;
struct page *page;
int err;
int ret = 0;
int locked_pages = 0;
int all_uptodate = 1;
int num_pages;
unsigned long num_reads = 0;
struct btrfs_bio_ctrl bio_ctrl = { 0 };
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
/*
* We could have had EXTENT_BUFFER_UPTODATE cleared by the write
* operation, which could potentially still be in flight. In this case
* we simply want to return an error.
*/
if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
return -EIO;
if (eb->fs_info->sectorsize < PAGE_SIZE)
return read_extent_buffer_subpage(eb, wait, mirror_num);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
if (wait == WAIT_NONE) {
/*
* WAIT_NONE is only utilized by readahead. If we can't
* acquire the lock atomically it means either the eb
* is being read out or under modification.
* Either way the eb will be or has been cached,
* readahead can exit safely.
*/
if (!trylock_page(page))
goto unlock_exit;
} else {
lock_page(page);
}
locked_pages++;
}
/*
* We need to firstly lock all pages to make sure that
* the uptodate bit of our pages won't be affected by
* clear_extent_buffer_uptodate().
*/
for (i = 0; i < num_pages; i++) { page = eb->pages[i];
if (!PageUptodate(page)) {
num_reads++;
all_uptodate = 0;
}
}
if (all_uptodate) {
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
goto unlock_exit;
}
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
/*
* It is possible for releasepage to clear the TREE_REF bit before we
* set io_pages. See check_buffer_tree_ref for a more detailed comment.
*/
check_buffer_tree_ref(eb);
for (i = 0; i < num_pages; i++) { page = eb->pages[i];
if (!PageUptodate(page)) {
if (ret) {
atomic_dec(&eb->io_pages);
unlock_page(page);
continue;
}
ClearPageError(page);
err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
&bio_ctrl, page, page_offset(page),
PAGE_SIZE, 0, end_bio_extent_readpage,
mirror_num, 0, false);
if (err) {
/*
* We failed to submit the bio so it's the
* caller's responsibility to perform cleanup
* i.e unlock page/set error bit.
*/
ret = err;
SetPageError(page);
unlock_page(page);
atomic_dec(&eb->io_pages);
}
} else {
unlock_page(page);
}
}
if (bio_ctrl.bio) { err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags);
bio_ctrl.bio = NULL;
if (err)
return err;
}
if (ret || wait != WAIT_COMPLETE)
return ret;
for (i = 0; i < num_pages; i++) { page = eb->pages[i];
wait_on_page_locked(page);
if (!PageUptodate(page))
ret = -EIO;
}
return ret;
unlock_exit:
while (locked_pages > 0) { locked_pages--;
page = eb->pages[locked_pages];
unlock_page(page);
}
return ret;
}
static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
unsigned long len)
{
btrfs_warn(eb->fs_info,
"access to eb bytenr %llu len %lu out of range start %lu len %lu",
eb->start, eb->len, start, len);
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
return true;
}
/*
* Check if the [start, start + len) range is valid before reading/writing
* the eb.
* NOTE: @start and @len are offset inside the eb, not logical address.
*
* Caller should not touch the dst/src memory if this function returns error.
*/
static inline int check_eb_range(const struct extent_buffer *eb,
unsigned long start, unsigned long len)
{
unsigned long offset;
/* start, start + len should not go beyond eb->len nor overflow */
if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
return report_eb_range(eb, start, len);
return false;
}
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
struct page *page;
char *kaddr;
char *dst = (char *)dstv;
unsigned long i = get_eb_page_index(start);
if (check_eb_range(eb, start, len))
return;
offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
memcpy(dst, kaddr + offset, cur);
dst += cur;
len -= cur;
offset = 0;
i++;
}
}
int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
void __user *dstv,
unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
struct page *page;
char *kaddr;
char __user *dst = (char __user *)dstv;
unsigned long i = get_eb_page_index(start);
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
}
dst += cur;
len -= cur;
offset = 0;
i++;
}
return ret;
}
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
unsigned long i = get_eb_page_index(start);
int ret = 0;
if (check_eb_range(eb, start, len))
return -EINVAL;
offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
ret = memcmp(ptr, kaddr + offset, cur);
if (ret)
break;
ptr += cur;
len -= cur;
offset = 0;
i++;
}
return ret;
}
/*
* Check that the extent buffer is uptodate.
*
* For regular sector size == PAGE_SIZE case, check if @page is uptodate.
* For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
*/
static void assert_eb_page_uptodate(const struct extent_buffer *eb,
struct page *page)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
/*
* If we are using the commit root we could potentially clear a page
* Uptodate while we're using the extent buffer that we've previously
* looked up. We don't want to complain in this case, as the page was
* valid before, we just didn't write it out. Instead we want to catch
* the case where we didn't actually read the block properly, which
* would have !PageUptodate && !PageError, as we clear PageError before
* reading.
*/
if (fs_info->sectorsize < PAGE_SIZE) {
bool uptodate, error;
uptodate = btrfs_subpage_test_uptodate(fs_info, page,
eb->start, eb->len);
error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
WARN_ON(!uptodate && !error);
} else {
WARN_ON(!PageUptodate(page) && !PageError(page));
}
}
void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *srcv)
{
char *kaddr;
assert_eb_page_uptodate(eb, eb->pages[0]);
kaddr = page_address(eb->pages[0]) +
get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
chunk_tree_uuid));
memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
}
void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
{
char *kaddr;
assert_eb_page_uptodate(eb, eb->pages[0]);
kaddr = page_address(eb->pages[0]) +
get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
}
void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
struct page *page;
char *kaddr;
char *src = (char *)srcv;
unsigned long i = get_eb_page_index(start);
WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
if (check_eb_range(eb, start, len))
return;
offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
assert_eb_page_uptodate(eb, page);
cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
memcpy(kaddr + offset, src, cur);
src += cur;
len -= cur;
offset = 0;
i++;
}
}
void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
unsigned long len)
{
size_t cur;
size_t offset;
struct page *page;
char *kaddr;
unsigned long i = get_eb_page_index(start);
if (check_eb_range(eb, start, len))
return;
offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
assert_eb_page_uptodate(eb, page);
cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
memset(kaddr + offset, 0, cur);
len -= cur;
offset = 0;
i++;
}
}
void copy_extent_buffer_full(const struct extent_buffer *dst,
const struct extent_buffer *src)
{
int i;
int num_pages;
ASSERT(dst->len == src->len); if (dst->fs_info->sectorsize == PAGE_SIZE) {
num_pages = num_extent_pages(dst);
for (i = 0; i < num_pages; i++)
copy_page(page_address(dst->pages[i]),
page_address(src->pages[i]));
} else {
size_t src_offset = get_eb_offset_in_page(src, 0);
size_t dst_offset = get_eb_offset_in_page(dst, 0);
ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
memcpy(page_address(dst->pages[0]) + dst_offset,
page_address(src->pages[0]) + src_offset,
src->len);
}
}
void copy_extent_buffer(const struct extent_buffer *dst,
const struct extent_buffer *src,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
u64 dst_len = dst->len;
size_t cur;
size_t offset;
struct page *page;
char *kaddr;
unsigned long i = get_eb_page_index(dst_offset);
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(src, src_offset, len))
return;
WARN_ON(src->len != dst_len); offset = get_eb_offset_in_page(dst, dst_offset);
while (len > 0) {
page = dst->pages[i];
assert_eb_page_uptodate(dst, page);
cur = min(len, (unsigned long)(PAGE_SIZE - offset));
kaddr = page_address(page);
read_extent_buffer(src, kaddr + offset, src_offset, cur);
src_offset += cur;
len -= cur;
offset = 0;
i++;
}
}
/*
* eb_bitmap_offset() - calculate the page and offset of the byte containing the
* given bit number
* @eb: the extent buffer
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number
* @page_index: return index of the page in the extent buffer that contains the
* given bit number
* @page_offset: return offset into the page given by page_index
*
* This helper hides the ugliness of finding the byte in an extent buffer which
* contains a given bit.
*/
static inline void eb_bitmap_offset(const struct extent_buffer *eb,
unsigned long start, unsigned long nr,
unsigned long *page_index,
size_t *page_offset)
{
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
/*
* The byte we want is the offset of the extent buffer + the offset of
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
offset = start + offset_in_page(eb->start) + byte_offset;
*page_index = offset >> PAGE_SHIFT;
*page_offset = offset_in_page(offset);
}
/**
* extent_buffer_test_bit - determine whether a bit in a bitmap item is set
* @eb: the extent buffer
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number to test
*/
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long nr)
{
u8 *kaddr;
struct page *page;
unsigned long i;
size_t offset;
eb_bitmap_offset(eb, start, nr, &i, &offset);
page = eb->pages[i];
assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
/**
* extent_buffer_bitmap_set - set an area of a bitmap
* @eb: the extent buffer
* @start: offset of the bitmap item in the extent buffer
* @pos: bit number of the first bit
* @len: number of bits to set
*/
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len)
{
u8 *kaddr;
struct page *page;
unsigned long i;
size_t offset;
const unsigned int size = pos + len;
int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
eb_bitmap_offset(eb, start, pos, &i, &offset);
page = eb->pages[i];
assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
while (len >= bits_to_set) {
kaddr[offset] |= mask_to_set;
len -= bits_to_set;
bits_to_set = BITS_PER_BYTE;
mask_to_set = ~0;
if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
}
}
if (len) {
mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
kaddr[offset] |= mask_to_set;
}
}
/**
* extent_buffer_bitmap_clear - clear an area of a bitmap
* @eb: the extent buffer
* @start: offset of the bitmap item in the extent buffer
* @pos: bit number of the first bit
* @len: number of bits to clear
*/
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
unsigned long start, unsigned long pos,
unsigned long len)
{
u8 *kaddr;
struct page *page;
unsigned long i;
size_t offset;
const unsigned int size = pos + len;
int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
eb_bitmap_offset(eb, start, pos, &i, &offset);
page = eb->pages[i];
assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
while (len >= bits_to_clear) {
kaddr[offset] &= ~mask_to_clear;
len -= bits_to_clear;
bits_to_clear = BITS_PER_BYTE;
mask_to_clear = ~0;
if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
}
}
if (len) {
mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
kaddr[offset] &= ~mask_to_clear;
}
}
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
unsigned long distance = (src > dst) ? src - dst : dst - src; return distance < len;
}
static void copy_pages(struct page *dst_page, struct page *src_page,
unsigned long dst_off, unsigned long src_off,
unsigned long len)
{
char *dst_kaddr = page_address(dst_page);
char *src_kaddr;
int must_memmove = 0;
if (dst_page != src_page) {
src_kaddr = page_address(src_page);
} else {
src_kaddr = dst_kaddr;
if (areas_overlap(src_off, dst_off, len))
must_memmove = 1;
}
if (must_memmove)
memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
else
memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}
void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
unsigned long dst_i;
unsigned long src_i;
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(dst, src_offset, len))
return;
while (len > 0) { dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
src_off_in_page = get_eb_offset_in_page(dst, src_offset);
dst_i = get_eb_page_index(dst_offset);
src_i = get_eb_page_index(src_offset);
cur = min(len, (unsigned long)(PAGE_SIZE -
src_off_in_page));
cur = min_t(unsigned long, cur,
(unsigned long)(PAGE_SIZE - dst_off_in_page));
copy_pages(dst->pages[dst_i], dst->pages[src_i],
dst_off_in_page, src_off_in_page, cur);
src_offset += cur;
dst_offset += cur;
len -= cur;
}
}
void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
unsigned long dst_end = dst_offset + len - 1;
unsigned long src_end = src_offset + len - 1;
unsigned long dst_i;
unsigned long src_i;
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(dst, src_offset, len))
return;
if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
}
while (len > 0) {
dst_i = get_eb_page_index(dst_end);
src_i = get_eb_page_index(src_end);
dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
src_off_in_page = get_eb_offset_in_page(dst, src_end);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
copy_pages(dst->pages[dst_i], dst->pages[src_i],
dst_off_in_page - cur + 1,
src_off_in_page - cur + 1, cur);
dst_end -= cur;
src_end -= cur;
len -= cur;
}
}
static struct extent_buffer *get_next_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
struct extent_buffer *found = NULL;
u64 page_start = page_offset(page);
int ret;
int i;
ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
lockdep_assert_held(&fs_info->buffer_lock);
ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
bytenr >> fs_info->sectorsize_bits,
PAGE_SIZE / fs_info->nodesize);
for (i = 0; i < ret; i++) {
/* Already beyond page end */
if (gang[i]->start >= page_start + PAGE_SIZE)
break;
/* Found one */
if (gang[i]->start >= bytenr) {
found = gang[i];
break;
}
}
return found;
}
static int try_release_subpage_extent_buffer(struct page *page)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
u64 cur = page_offset(page);
const u64 end = page_offset(page) + PAGE_SIZE;
int ret;
while (cur < end) {
struct extent_buffer *eb = NULL;
/*
* Unlike try_release_extent_buffer() which uses page->private
* to grab buffer, for subpage case we rely on radix tree, thus
* we need to ensure radix tree consistency.
*
* We also want an atomic snapshot of the radix tree, thus go
* with spinlock rather than RCU.
*/
spin_lock(&fs_info->buffer_lock);
eb = get_next_extent_buffer(fs_info, page, cur);
if (!eb) {
/* No more eb in the page range after or at cur */
spin_unlock(&fs_info->buffer_lock);
break;
}
cur = eb->start + eb->len;
/*
* The same as try_release_extent_buffer(), to ensure the eb
* won't disappear out from under us.
*/
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
spin_unlock(&fs_info->buffer_lock);
break;
}
spin_unlock(&fs_info->buffer_lock);
/*
* If tree ref isn't set then we know the ref on this eb is a
* real ref, so just return, this eb will likely be freed soon
* anyway.
*/
if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
spin_unlock(&eb->refs_lock);
break;
}
/*
* Here we don't care about the return value, we will always
* check the page private at the end. And
* release_extent_buffer() will release the refs_lock.
*/
release_extent_buffer(eb);
}
/*
* Finally to check if we have cleared page private, as if we have
* released all ebs in the page, the page private should be cleared now.
*/
spin_lock(&page->mapping->private_lock);
if (!PagePrivate(page))
ret = 1;
else
ret = 0;
spin_unlock(&page->mapping->private_lock);
return ret;
}
int try_release_extent_buffer(struct page *page)
{
struct extent_buffer *eb;
if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
return try_release_subpage_extent_buffer(page);
/*
* We need to make sure nobody is changing page->private, as we rely on
* page->private as the pointer to extent buffer.
*/
spin_lock(&page->mapping->private_lock);
if (!PagePrivate(page)) {
spin_unlock(&page->mapping->private_lock);
return 1;
}
eb = (struct extent_buffer *)page->private; BUG_ON(!eb);
/*
* This is a little awful but should be ok, we need to make sure that
* the eb doesn't disappear out from under us while we're looking at
* this page.
*/
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
spin_unlock(&page->mapping->private_lock);
return 0;
}
spin_unlock(&page->mapping->private_lock);
/*
* If tree ref isn't set then we know the ref on this eb is a real ref,
* so just return, this page will likely be freed soon anyway.
*/
if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
spin_unlock(&eb->refs_lock);
return 0;
}
return release_extent_buffer(eb);
}
/*
* btrfs_readahead_tree_block - attempt to readahead a child block
* @fs_info: the fs_info
* @bytenr: bytenr to read
* @owner_root: objectid of the root that owns this eb
* @gen: generation for the uptodate check, can be 0
* @level: level for the eb
*
* Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
* normal uptodate check of the eb, without checking the generation. If we have
* to read the block we will not block on anything.
*/
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level)
{
struct extent_buffer *eb;
int ret;
eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
if (IS_ERR(eb))
return;
if (btrfs_buffer_uptodate(eb, gen, 1)) {
free_extent_buffer(eb);
return;
}
ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
if (ret < 0)
free_extent_buffer_stale(eb);
else
free_extent_buffer(eb);
}
/*
* btrfs_readahead_node_child - readahead a node's child block
* @node: parent node we're reading from
* @slot: slot in the parent node for the child we want to read
*
* A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
* the slot in the node provided.
*/
void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
{
btrfs_readahead_tree_block(node->fs_info,
btrfs_node_blockptr(node, slot),
btrfs_header_owner(node),
btrfs_node_ptr_generation(node, slot),
btrfs_header_level(node) - 1);
}
// SPDX-License-Identifier: GPL-2.0
#include <linux/spinlock.h>
#include <linux/task_work.h>
#include <linux/tracehook.h>
static struct callback_head work_exited; /* all we need is ->next == NULL */
/**
* task_work_add - ask the @task to execute @work->func()
* @task: the task which should run the callback
* @work: the callback to run
* @notify: how to notify the targeted task
*
* Queue @work for task_work_run() below and notify the @task if @notify
* is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the
* it will interrupt the targeted task and run the task_work. @TWA_RESUME
* work is run only when the task exits the kernel and returns to user mode,
* or before entering guest mode. Fails if the @task is exiting/exited and thus
* it can't process this @work. Otherwise @work->func() will be called when the
* @task goes through one of the aforementioned transitions, or exits.
*
* If the targeted task is exiting, then an error is returned and the work item
* is not queued. It's up to the caller to arrange for an alternative mechanism
* in that case.
*
* Note: there is no ordering guarantee on works queued here. The task_work
* list is LIFO.
*
* RETURNS:
* 0 if succeeds or -ESRCH.
*/
int task_work_add(struct task_struct *task, struct callback_head *work,
enum task_work_notify_mode notify)
{
struct callback_head *head;
/* record the work call stack in order to print it in KASAN reports */
kasan_record_aux_stack(work);
do {
head = READ_ONCE(task->task_works);
if (unlikely(head == &work_exited))
return -ESRCH; work->next = head;
} while (cmpxchg(&task->task_works, head, work) != head);
switch (notify) {
case TWA_NONE:
break;
case TWA_RESUME:
set_notify_resume(task);
break;
case TWA_SIGNAL:
set_notify_signal(task);
break;
default:
WARN_ON_ONCE(1);
break;
}
return 0;
}
/**
* task_work_cancel_match - cancel a pending work added by task_work_add()
* @task: the task which should execute the work
* @match: match function to call
*
* RETURNS:
* The found work or NULL if not found.
*/
struct callback_head *
task_work_cancel_match(struct task_struct *task,
bool (*match)(struct callback_head *, void *data),
void *data)
{
struct callback_head **pprev = &task->task_works;
struct callback_head *work;
unsigned long flags;
if (likely(!task->task_works))
return NULL;
/*
* If cmpxchg() fails we continue without updating pprev.
* Either we raced with task_work_add() which added the
* new entry before this work, we will find it again. Or
* we raced with task_work_run(), *pprev == NULL/exited.
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
while ((work = READ_ONCE(*pprev))) {
if (!match(work, data))
pprev = &work->next;
else if (cmpxchg(pprev, work, work->next) == work)
break;
}
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return work;
}
static bool task_work_func_match(struct callback_head *cb, void *data)
{
return cb->func == data;
}
/**
* task_work_cancel - cancel a pending work added by task_work_add()
* @task: the task which should execute the work
* @func: identifies the work to remove
*
* Find the last queued pending work with ->func == @func and remove
* it from queue.
*
* RETURNS:
* The found work or NULL if not found.
*/
struct callback_head *
task_work_cancel(struct task_struct *task, task_work_func_t func)
{
return task_work_cancel_match(task, task_work_func_match, func);
}
/**
* task_work_run - execute the works added by task_work_add()
*
* Flush the pending works. Should be used by the core kernel code.
* Called before the task returns to the user-mode or stops, or when
* it exits. In the latter case task_work_add() can no longer add the
* new work after task_work_run() returns.
*/
void task_work_run(void)
{
struct task_struct *task = current;
struct callback_head *work, *head, *next;
for (;;) {
/*
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
do {
head = NULL;
work = READ_ONCE(task->task_works);
if (!work) {
if (task->flags & PF_EXITING)
head = &work_exited;
else
break;
}
} while (cmpxchg(&task->task_works, work, head) != work); if (!work)
break;
/*
* Synchronize with task_work_cancel(). It can not remove
* the first entry == work, cmpxchg(task_works) must fail.
* But it can remove another entry from the ->next list.
*/
raw_spin_lock_irq(&task->pi_lock);
raw_spin_unlock_irq(&task->pi_lock);
do {
next = work->next;
work->func(work);
work = next;
cond_resched();
} while (work);
}
}
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Scatterlist Cryptographic API.
*
* Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
* Copyright (c) 2002 David S. Miller (davem@redhat.com)
* Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
*
* Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
* and Nettle, by Niels Möller.
*/
#ifndef _LINUX_CRYPTO_H
#define _LINUX_CRYPTO_H
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/bug.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/completion.h>
/*
* Autoloaded crypto modules should only use a prefixed name to avoid allowing
* arbitrary modules to be loaded. Loading from userspace may still need the
* unprefixed names, so retains those aliases as well.
* This uses __MODULE_INFO directly instead of MODULE_ALIAS because pre-4.3
* gcc (e.g. avr32 toolchain) uses __LINE__ for uniqueness, and this macro
* expands twice on the same line. Instead, use a separate base name for the
* alias.
*/
#define MODULE_ALIAS_CRYPTO(name) \
__MODULE_INFO(alias, alias_userspace, name); \
__MODULE_INFO(alias, alias_crypto, "crypto-" name)
/*
* Algorithm masks and types.
*/
#define CRYPTO_ALG_TYPE_MASK 0x0000000f
#define CRYPTO_ALG_TYPE_CIPHER 0x00000001
#define CRYPTO_ALG_TYPE_COMPRESS 0x00000002
#define CRYPTO_ALG_TYPE_AEAD 0x00000003
#define CRYPTO_ALG_TYPE_SKCIPHER 0x00000005
#define CRYPTO_ALG_TYPE_KPP 0x00000008
#define CRYPTO_ALG_TYPE_ACOMPRESS 0x0000000a
#define CRYPTO_ALG_TYPE_SCOMPRESS 0x0000000b
#define CRYPTO_ALG_TYPE_RNG 0x0000000c
#define CRYPTO_ALG_TYPE_AKCIPHER 0x0000000d
#define CRYPTO_ALG_TYPE_HASH 0x0000000e
#define CRYPTO_ALG_TYPE_SHASH 0x0000000e
#define CRYPTO_ALG_TYPE_AHASH 0x0000000f
#define CRYPTO_ALG_TYPE_HASH_MASK 0x0000000e
#define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000e
#define CRYPTO_ALG_TYPE_ACOMPRESS_MASK 0x0000000e
#define CRYPTO_ALG_LARVAL 0x00000010
#define CRYPTO_ALG_DEAD 0x00000020
#define CRYPTO_ALG_DYING 0x00000040
#define CRYPTO_ALG_ASYNC 0x00000080
/*
* Set if the algorithm (or an algorithm which it uses) requires another
* algorithm of the same type to handle corner cases.
*/
#define CRYPTO_ALG_NEED_FALLBACK 0x00000100
/*
* Set if the algorithm has passed automated run-time testing. Note that
* if there is no run-time testing for a given algorithm it is considered
* to have passed.
*/
#define CRYPTO_ALG_TESTED 0x00000400
/*
* Set if the algorithm is an instance that is built from templates.
*/
#define CRYPTO_ALG_INSTANCE 0x00000800
/* Set this bit if the algorithm provided is hardware accelerated but
* not available to userspace via instruction set or so.
*/
#define CRYPTO_ALG_KERN_DRIVER_ONLY 0x00001000
/*
* Mark a cipher as a service implementation only usable by another
* cipher and never by a normal user of the kernel crypto API
*/
#define CRYPTO_ALG_INTERNAL 0x00002000
/*
* Set if the algorithm has a ->setkey() method but can be used without
* calling it first, i.e. there is a default key.
*/
#define CRYPTO_ALG_OPTIONAL_KEY 0x00004000
/*
* Don't trigger module loading
*/
#define CRYPTO_NOLOAD 0x00008000
/*
* The algorithm may allocate memory during request processing, i.e. during
* encryption, decryption, or hashing. Users can request an algorithm with this
* flag unset if they can't handle memory allocation failures.
*
* This flag is currently only implemented for algorithms of type "skcipher",
* "aead", "ahash", "shash", and "cipher". Algorithms of other types might not
* have this flag set even if they allocate memory.
*
* In some edge cases, algorithms can allocate memory regardless of this flag.
* To avoid these cases, users must obey the following usage constraints:
* skcipher:
* - The IV buffer and all scatterlist elements must be aligned to the
* algorithm's alignmask.
* - If the data were to be divided into chunks of size
* crypto_skcipher_walksize() (with any remainder going at the end), no
* chunk can cross a page boundary or a scatterlist element boundary.
* aead:
* - The IV buffer and all scatterlist elements must be aligned to the
* algorithm's alignmask.
* - The first scatterlist element must contain all the associated data,
* and its pages must be !PageHighMem.
* - If the plaintext/ciphertext were to be divided into chunks of size
* crypto_aead_walksize() (with the remainder going at the end), no chunk
* can cross a page boundary or a scatterlist element boundary.
* ahash:
* - The result buffer must be aligned to the algorithm's alignmask.
* - crypto_ahash_finup() must not be used unless the algorithm implements
* ->finup() natively.
*/
#define CRYPTO_ALG_ALLOCATES_MEMORY 0x00010000
/*
* Transform masks and values (for crt_flags).
*/
#define CRYPTO_TFM_NEED_KEY 0x00000001
#define CRYPTO_TFM_REQ_MASK 0x000fff00
#define CRYPTO_TFM_REQ_FORBID_WEAK_KEYS 0x00000100
#define CRYPTO_TFM_REQ_MAY_SLEEP 0x00000200
#define CRYPTO_TFM_REQ_MAY_BACKLOG 0x00000400
/*
* Miscellaneous stuff.
*/
#define CRYPTO_MAX_ALG_NAME 128
/*
* The macro CRYPTO_MINALIGN_ATTR (along with the void * type in the actual
* declaration) is used to ensure that the crypto_tfm context structure is
* aligned correctly for the given architecture so that there are no alignment
* faults for C data types. On architectures that support non-cache coherent
* DMA, such as ARM or arm64, it also takes into account the minimal alignment
* that is required to ensure that the context struct member does not share any
* cachelines with the rest of the struct. This is needed to ensure that cache
* maintenance for non-coherent DMA (cache invalidation in particular) does not
* affect data that may be accessed by the CPU concurrently.
*/
#define CRYPTO_MINALIGN ARCH_KMALLOC_MINALIGN
#define CRYPTO_MINALIGN_ATTR __attribute__ ((__aligned__(CRYPTO_MINALIGN)))
struct scatterlist;
struct crypto_async_request;
struct crypto_tfm;
struct crypto_type;
typedef void (*crypto_completion_t)(struct crypto_async_request *req, int err);
/**
* DOC: Block Cipher Context Data Structures
*
* These data structures define the operating context for each block cipher
* type.
*/
struct crypto_async_request {
struct list_head list;
crypto_completion_t complete;
void *data;
struct crypto_tfm *tfm;
u32 flags;
};
/**
* DOC: Block Cipher Algorithm Definitions
*
* These data structures define modular crypto algorithm implementations,
* managed via crypto_register_alg() and crypto_unregister_alg().
*/
/**
* struct cipher_alg - single-block symmetric ciphers definition
* @cia_min_keysize: Minimum key size supported by the transformation. This is
* the smallest key length supported by this transformation
* algorithm. This must be set to one of the pre-defined
* values as this is not hardware specific. Possible values
* for this field can be found via git grep "_MIN_KEY_SIZE"
* include/crypto/
* @cia_max_keysize: Maximum key size supported by the transformation. This is
* the largest key length supported by this transformation
* algorithm. This must be set to one of the pre-defined values
* as this is not hardware specific. Possible values for this
* field can be found via git grep "_MAX_KEY_SIZE"
* include/crypto/
* @cia_setkey: Set key for the transformation. This function is used to either
* program a supplied key into the hardware or store the key in the
* transformation context for programming it later. Note that this
* function does modify the transformation context. This function
* can be called multiple times during the existence of the
* transformation object, so one must make sure the key is properly
* reprogrammed into the hardware. This function is also
* responsible for checking the key length for validity.
* @cia_encrypt: Encrypt a single block. This function is used to encrypt a
* single block of data, which must be @cra_blocksize big. This
* always operates on a full @cra_blocksize and it is not possible
* to encrypt a block of smaller size. The supplied buffers must
* therefore also be at least of @cra_blocksize size. Both the
* input and output buffers are always aligned to @cra_alignmask.
* In case either of the input or output buffer supplied by user
* of the crypto API is not aligned to @cra_alignmask, the crypto
* API will re-align the buffers. The re-alignment means that a
* new buffer will be allocated, the data will be copied into the
* new buffer, then the processing will happen on the new buffer,
* then the data will be copied back into the original buffer and
* finally the new buffer will be freed. In case a software
* fallback was put in place in the @cra_init call, this function
* might need to use the fallback if the algorithm doesn't support
* all of the key sizes. In case the key was stored in
* transformation context, the key might need to be re-programmed
* into the hardware in this function. This function shall not
* modify the transformation context, as this function may be
* called in parallel with the same transformation object.
* @cia_decrypt: Decrypt a single block. This is a reverse counterpart to
* @cia_encrypt, and the conditions are exactly the same.
*
* All fields are mandatory and must be filled.
*/
struct cipher_alg {
unsigned int cia_min_keysize;
unsigned int cia_max_keysize;
int (*cia_setkey)(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen);
void (*cia_encrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
void (*cia_decrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
};
/**
* struct compress_alg - compression/decompression algorithm
* @coa_compress: Compress a buffer of specified length, storing the resulting
* data in the specified buffer. Return the length of the
* compressed data in dlen.
* @coa_decompress: Decompress the source buffer, storing the uncompressed
* data in the specified buffer. The length of the data is
* returned in dlen.
*
* All fields are mandatory.
*/
struct compress_alg {
int (*coa_compress)(struct crypto_tfm *tfm, const u8 *src,
unsigned int slen, u8 *dst, unsigned int *dlen);
int (*coa_decompress)(struct crypto_tfm *tfm, const u8 *src,
unsigned int slen, u8 *dst, unsigned int *dlen);
};
#ifdef CONFIG_CRYPTO_STATS
/*
* struct crypto_istat_aead - statistics for AEAD algorithm
* @encrypt_cnt: number of encrypt requests
* @encrypt_tlen: total data size handled by encrypt requests
* @decrypt_cnt: number of decrypt requests
* @decrypt_tlen: total data size handled by decrypt requests
* @err_cnt: number of error for AEAD requests
*/
struct crypto_istat_aead {
atomic64_t encrypt_cnt;
atomic64_t encrypt_tlen;
atomic64_t decrypt_cnt;
atomic64_t decrypt_tlen;
atomic64_t err_cnt;
};
/*
* struct crypto_istat_akcipher - statistics for akcipher algorithm
* @encrypt_cnt: number of encrypt requests
* @encrypt_tlen: total data size handled by encrypt requests
* @decrypt_cnt: number of decrypt requests
* @decrypt_tlen: total data size handled by decrypt requests
* @verify_cnt: number of verify operation
* @sign_cnt: number of sign requests
* @err_cnt: number of error for akcipher requests
*/
struct crypto_istat_akcipher {
atomic64_t encrypt_cnt;
atomic64_t encrypt_tlen;
atomic64_t decrypt_cnt;
atomic64_t decrypt_tlen;
atomic64_t verify_cnt;
atomic64_t sign_cnt;
atomic64_t err_cnt;
};
/*
* struct crypto_istat_cipher - statistics for cipher algorithm
* @encrypt_cnt: number of encrypt requests
* @encrypt_tlen: total data size handled by encrypt requests
* @decrypt_cnt: number of decrypt requests
* @decrypt_tlen: total data size handled by decrypt requests
* @err_cnt: number of error for cipher requests
*/
struct crypto_istat_cipher {
atomic64_t encrypt_cnt;
atomic64_t encrypt_tlen;
atomic64_t decrypt_cnt;
atomic64_t decrypt_tlen;
atomic64_t err_cnt;
};
/*
* struct crypto_istat_compress - statistics for compress algorithm
* @compress_cnt: number of compress requests
* @compress_tlen: total data size handled by compress requests
* @decompress_cnt: number of decompress requests
* @decompress_tlen: total data size handled by decompress requests
* @err_cnt: number of error for compress requests
*/
struct crypto_istat_compress {
atomic64_t compress_cnt;
atomic64_t compress_tlen;
atomic64_t decompress_cnt;
atomic64_t decompress_tlen;
atomic64_t err_cnt;
};
/*
* struct crypto_istat_hash - statistics for has algorithm
* @hash_cnt: number of hash requests
* @hash_tlen: total data size hashed
* @err_cnt: number of error for hash requests
*/
struct crypto_istat_hash {
atomic64_t hash_cnt;
atomic64_t hash_tlen;
atomic64_t err_cnt;
};
/*
* struct crypto_istat_kpp - statistics for KPP algorithm
* @setsecret_cnt: number of setsecrey operation
* @generate_public_key_cnt: number of generate_public_key operation
* @compute_shared_secret_cnt: number of compute_shared_secret operation
* @err_cnt: number of error for KPP requests
*/
struct crypto_istat_kpp {
atomic64_t setsecret_cnt;
atomic64_t generate_public_key_cnt;
atomic64_t compute_shared_secret_cnt;
atomic64_t err_cnt;
};
/*
* struct crypto_istat_rng: statistics for RNG algorithm
* @generate_cnt: number of RNG generate requests
* @generate_tlen: total data size of generated data by the RNG
* @seed_cnt: number of times the RNG was seeded
* @err_cnt: number of error for RNG requests
*/
struct crypto_istat_rng {
atomic64_t generate_cnt;
atomic64_t generate_tlen;
atomic64_t seed_cnt;
atomic64_t err_cnt;
};
#endif /* CONFIG_CRYPTO_STATS */
#define cra_cipher cra_u.cipher
#define cra_compress cra_u.compress
/**
* struct crypto_alg - definition of a cryptograpic cipher algorithm
* @cra_flags: Flags describing this transformation. See include/linux/crypto.h
* CRYPTO_ALG_* flags for the flags which go in here. Those are
* used for fine-tuning the description of the transformation
* algorithm.
* @cra_blocksize: Minimum block size of this transformation. The size in bytes
* of the smallest possible unit which can be transformed with
* this algorithm. The users must respect this value.
* In case of HASH transformation, it is possible for a smaller
* block than @cra_blocksize to be passed to the crypto API for
* transformation, in case of any other transformation type, an
* error will be returned upon any attempt to transform smaller
* than @cra_blocksize chunks.
* @cra_ctxsize: Size of the operational context of the transformation. This
* value informs the kernel crypto API about the memory size
* needed to be allocated for the transformation context.
* @cra_alignmask: Alignment mask for the input and output data buffer. The data
* buffer containing the input data for the algorithm must be
* aligned to this alignment mask. The data buffer for the
* output data must be aligned to this alignment mask. Note that
* the Crypto API will do the re-alignment in software, but
* only under special conditions and there is a performance hit.
* The re-alignment happens at these occasions for different
* @cra_u types: cipher -- For both input data and output data
* buffer; ahash -- For output hash destination buf; shash --
* For output hash destination buf.
* This is needed on hardware which is flawed by design and
* cannot pick data from arbitrary addresses.
* @cra_priority: Priority of this transformation implementation. In case
* multiple transformations with same @cra_name are available to
* the Crypto API, the kernel will use the one with highest
* @cra_priority.
* @cra_name: Generic name (usable by multiple implementations) of the
* transformation algorithm. This is the name of the transformation
* itself. This field is used by the kernel when looking up the
* providers of particular transformation.
* @cra_driver_name: Unique name of the transformation provider. This is the
* name of the provider of the transformation. This can be any
* arbitrary value, but in the usual case, this contains the
* name of the chip or provider and the name of the
* transformation algorithm.
* @cra_type: Type of the cryptographic transformation. This is a pointer to
* struct crypto_type, which implements callbacks common for all
* transformation types. There are multiple options, such as
* &crypto_skcipher_type, &crypto_ahash_type, &crypto_rng_type.
* This field might be empty. In that case, there are no common
* callbacks. This is the case for: cipher, compress, shash.
* @cra_u: Callbacks implementing the transformation. This is a union of
* multiple structures. Depending on the type of transformation selected
* by @cra_type and @cra_flags above, the associated structure must be
* filled with callbacks. This field might be empty. This is the case
* for ahash, shash.
* @cra_init: Initialize the cryptographic transformation object. This function
* is used to initialize the cryptographic transformation object.
* This function is called only once at the instantiation time, right
* after the transformation context was allocated. In case the
* cryptographic hardware has some special requirements which need to
* be handled by software, this function shall check for the precise
* requirement of the transformation and put any software fallbacks
* in place.
* @cra_exit: Deinitialize the cryptographic transformation object. This is a
* counterpart to @cra_init, used to remove various changes set in
* @cra_init.
* @cra_u.cipher: Union member which contains a single-block symmetric cipher
* definition. See @struct @cipher_alg.
* @cra_u.compress: Union member which contains a (de)compression algorithm.
* See @struct @compress_alg.
* @cra_module: Owner of this transformation implementation. Set to THIS_MODULE
* @cra_list: internally used
* @cra_users: internally used
* @cra_refcnt: internally used
* @cra_destroy: internally used
*
* @stats: union of all possible crypto_istat_xxx structures
* @stats.aead: statistics for AEAD algorithm
* @stats.akcipher: statistics for akcipher algorithm
* @stats.cipher: statistics for cipher algorithm
* @stats.compress: statistics for compress algorithm
* @stats.hash: statistics for hash algorithm
* @stats.rng: statistics for rng algorithm
* @stats.kpp: statistics for KPP algorithm
*
* The struct crypto_alg describes a generic Crypto API algorithm and is common
* for all of the transformations. Any variable not documented here shall not
* be used by a cipher implementation as it is internal to the Crypto API.
*/
struct crypto_alg {
struct list_head cra_list;
struct list_head cra_users;
u32 cra_flags;
unsigned int cra_blocksize;
unsigned int cra_ctxsize;
unsigned int cra_alignmask;
int cra_priority;
refcount_t cra_refcnt;
char cra_name[CRYPTO_MAX_ALG_NAME];
char cra_driver_name[CRYPTO_MAX_ALG_NAME];
const struct crypto_type *cra_type;
union {
struct cipher_alg cipher;
struct compress_alg compress;
} cra_u;
int (*cra_init)(struct crypto_tfm *tfm);
void (*cra_exit)(struct crypto_tfm *tfm);
void (*cra_destroy)(struct crypto_alg *alg);
struct module *cra_module;
#ifdef CONFIG_CRYPTO_STATS
union {
struct crypto_istat_aead aead;
struct crypto_istat_akcipher akcipher;
struct crypto_istat_cipher cipher;
struct crypto_istat_compress compress;
struct crypto_istat_hash hash;
struct crypto_istat_rng rng;
struct crypto_istat_kpp kpp;
} stats;
#endif /* CONFIG_CRYPTO_STATS */
} CRYPTO_MINALIGN_ATTR;
#ifdef CONFIG_CRYPTO_STATS
void crypto_stats_init(struct crypto_alg *alg);
void crypto_stats_get(struct crypto_alg *alg);
void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret);
void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret);
void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg);
void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg);
void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg);
void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg);
void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg);
void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg);
void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg);
void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg);
void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret);
void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret);
void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret);
void crypto_stats_rng_seed(struct crypto_alg *alg, int ret);
void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret);
void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg);
#else
static inline void crypto_stats_init(struct crypto_alg *alg)
{}
static inline void crypto_stats_get(struct crypto_alg *alg)
{}
static inline void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret)
{}
static inline void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret)
{}
static inline void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret)
{}
static inline void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret)
{}
static inline void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret)
{}
static inline void crypto_stats_rng_seed(struct crypto_alg *alg, int ret)
{}
static inline void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret)
{}
static inline void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg)
{}
static inline void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg)
{}
#endif
/*
* A helper struct for waiting for completion of async crypto ops
*/
struct crypto_wait {
struct completion completion;
int err;
};
/*
* Macro for declaring a crypto op async wait object on stack
*/
#define DECLARE_CRYPTO_WAIT(_wait) \
struct crypto_wait _wait = { \
COMPLETION_INITIALIZER_ONSTACK((_wait).completion), 0 }
/*
* Async ops completion helper functioons
*/
void crypto_req_done(struct crypto_async_request *req, int err);
static inline int crypto_wait_req(int err, struct crypto_wait *wait)
{
switch (err) {
case -EINPROGRESS:
case -EBUSY:
wait_for_completion(&wait->completion);
reinit_completion(&wait->completion);
err = wait->err;
break;
}
return err;
}
static inline void crypto_init_wait(struct crypto_wait *wait)
{
init_completion(&wait->completion);
}
/*
* Algorithm registration interface.
*/
int crypto_register_alg(struct crypto_alg *alg);
void crypto_unregister_alg(struct crypto_alg *alg);
int crypto_register_algs(struct crypto_alg *algs, int count);
void crypto_unregister_algs(struct crypto_alg *algs, int count);
/*
* Algorithm query interface.
*/
int crypto_has_alg(const char *name, u32 type, u32 mask);
/*
* Transforms: user-instantiated objects which encapsulate algorithms
* and core processing logic. Managed via crypto_alloc_*() and
* crypto_free_*(), as well as the various helpers below.
*/
struct crypto_tfm {
u32 crt_flags;
int node;
void (*exit)(struct crypto_tfm *tfm);
struct crypto_alg *__crt_alg;
void *__crt_ctx[] CRYPTO_MINALIGN_ATTR;
};
struct crypto_comp {
struct crypto_tfm base;
};
/*
* Transform user interface.
*/
struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask);
void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm);
static inline void crypto_free_tfm(struct crypto_tfm *tfm)
{
return crypto_destroy_tfm(tfm, tfm);
}
int alg_test(const char *driver, const char *alg, u32 type, u32 mask);
/*
* Transform helpers which query the underlying algorithm.
*/
static inline const char *crypto_tfm_alg_name(struct crypto_tfm *tfm)
{
return tfm->__crt_alg->cra_name;
}
static inline const char *crypto_tfm_alg_driver_name(struct crypto_tfm *tfm)
{
return tfm->__crt_alg->cra_driver_name;
}
static inline int crypto_tfm_alg_priority(struct crypto_tfm *tfm)
{
return tfm->__crt_alg->cra_priority;
}
static inline u32 crypto_tfm_alg_type(struct crypto_tfm *tfm)
{
return tfm->__crt_alg->cra_flags & CRYPTO_ALG_TYPE_MASK;
}
static inline unsigned int crypto_tfm_alg_blocksize(struct crypto_tfm *tfm)
{
return tfm->__crt_alg->cra_blocksize;
}
static inline unsigned int crypto_tfm_alg_alignmask(struct crypto_tfm *tfm)
{
return tfm->__crt_alg->cra_alignmask;
}
static inline u32 crypto_tfm_get_flags(struct crypto_tfm *tfm)
{
return tfm->crt_flags;
}
static inline void crypto_tfm_set_flags(struct crypto_tfm *tfm, u32 flags)
{
tfm->crt_flags |= flags;
}
static inline void crypto_tfm_clear_flags(struct crypto_tfm *tfm, u32 flags)
{
tfm->crt_flags &= ~flags;
}
static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm)
{
return tfm->__crt_ctx;
}
static inline unsigned int crypto_tfm_ctx_alignment(void)
{
struct crypto_tfm *tfm;
return __alignof__(tfm->__crt_ctx);
}
static inline struct crypto_comp *__crypto_comp_cast(struct crypto_tfm *tfm)
{
return (struct crypto_comp *)tfm;
}
static inline struct crypto_comp *crypto_alloc_comp(const char *alg_name,
u32 type, u32 mask)
{
type &= ~CRYPTO_ALG_TYPE_MASK;
type |= CRYPTO_ALG_TYPE_COMPRESS;
mask |= CRYPTO_ALG_TYPE_MASK;
return __crypto_comp_cast(crypto_alloc_base(alg_name, type, mask));
}
static inline struct crypto_tfm *crypto_comp_tfm(struct crypto_comp *tfm)
{
return &tfm->base;
}
static inline void crypto_free_comp(struct crypto_comp *tfm)
{
crypto_free_tfm(crypto_comp_tfm(tfm));
}
static inline int crypto_has_comp(const char *alg_name, u32 type, u32 mask)
{
type &= ~CRYPTO_ALG_TYPE_MASK;
type |= CRYPTO_ALG_TYPE_COMPRESS;
mask |= CRYPTO_ALG_TYPE_MASK;
return crypto_has_alg(alg_name, type, mask);
}
static inline const char *crypto_comp_name(struct crypto_comp *tfm)
{
return crypto_tfm_alg_name(crypto_comp_tfm(tfm));
}
int crypto_comp_compress(struct crypto_comp *tfm,
const u8 *src, unsigned int slen,
u8 *dst, unsigned int *dlen);
int crypto_comp_decompress(struct crypto_comp *tfm,
const u8 *src, unsigned int slen,
u8 *dst, unsigned int *dlen);
#endif /* _LINUX_CRYPTO_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"
#include "compression.h"
#include "tree-checker.h"
#include "ref-verify.h"
#include "block-group.h"
#include "discard.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
BTRFS_SUPER_FLAG_ERROR |\
BTRFS_SUPER_FLAG_SEEDING |\
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages,
int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
/*
* btrfs_end_io_wq structs are used to do processing in task context when an IO
* is complete. This is used during reads to verify checksums, and it is used
* by writes to insert metadata for new file extents after IO is complete.
*/
struct btrfs_end_io_wq {
struct bio *bio;
bio_end_io_t *end_io;
void *private;
struct btrfs_fs_info *info;
blk_status_t status;
enum btrfs_wq_endio_type metadata;
struct btrfs_work work;
};
static struct kmem_cache *btrfs_end_io_wq_cache;
int __init btrfs_end_io_wq_init(void)
{
btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
sizeof(struct btrfs_end_io_wq),
0,
SLAB_MEM_SPREAD,
NULL);
if (!btrfs_end_io_wq_cache)
return -ENOMEM;
return 0;
}
void __cold btrfs_end_io_wq_exit(void)
{
kmem_cache_destroy(btrfs_end_io_wq_cache);
}
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
if (fs_info->csum_shash)
crypto_free_shash(fs_info->csum_shash);
}
/*
* async submit bios are used to offload expensive checksumming
* onto the worker threads. They checksum file and metadata bios
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
struct inode *inode;
struct bio *bio;
extent_submit_bio_start_t *submit_bio_start;
int mirror_num;
/* Optional parameter for submit_bio_start used by direct io */
u64 dio_file_offset;
struct btrfs_work work;
blk_status_t status;
};
/*
* Lockdep class keys for extent_buffer->lock's in this root. For a given
* eb, the lockdep key is determined by the btrfs_root it belongs to and
* the level the eb occupies in the tree.
*
* Different roots are used for different purposes and may nest inside each
* other and they require separate keysets. As lockdep keys should be
* static, assign keysets according to the purpose of the root as indicated
* by btrfs_root->root_key.objectid. This ensures that all special purpose
* roots have separate keysets.
*
* Lock-nesting across peer nodes is always done with the immediate parent
* node locked thus preventing deadlock. As lockdep doesn't know this, use
* subclass to avoid triggering lockdep warning in such cases.
*
* The key is set by the readpage_end_io_hook after the buffer has passed
* csum validation but before the pages are unlocked. It is also set by
* btrfs_init_new_buffer on freshly allocated blocks.
*
* We also add a check to make sure the highest level of the tree is the
* same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
* needs update as well.
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
# error
# endif
#define DEFINE_LEVEL(stem, level) \
.names[level] = "btrfs-" stem "-0" #level,
#define DEFINE_NAME(stem) \
DEFINE_LEVEL(stem, 0) \
DEFINE_LEVEL(stem, 1) \
DEFINE_LEVEL(stem, 2) \
DEFINE_LEVEL(stem, 3) \
DEFINE_LEVEL(stem, 4) \
DEFINE_LEVEL(stem, 5) \
DEFINE_LEVEL(stem, 6) \
DEFINE_LEVEL(stem, 7)
static struct btrfs_lockdep_keyset {
u64 id; /* root objectid */
/* Longest entry: btrfs-free-space-00 */
char names[BTRFS_MAX_LEVEL][20];
struct lock_class_key keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
{ .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") },
{ .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") },
{ .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") },
{ .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") },
{ .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") },
{ .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") },
{ .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") },
{ .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") },
{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
{ .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
{ .id = 0, DEFINE_NAME("tree") },
};
#undef DEFINE_LEVEL
#undef DEFINE_NAME
void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
int level)
{
struct btrfs_lockdep_keyset *ks;
BUG_ON(level >= ARRAY_SIZE(ks->keys));
/* find the matching keyset, id 0 is the default entry */
for (ks = btrfs_lockdep_keysets; ks->id; ks++)
if (ks->id == objectid)
break;
lockdep_set_class_and_name(&eb->lock,
&ks->keys[level], ks->names[level]);
}
#endif
/*
* Compute the csum of a btree block and store the result to provided buffer.
*/
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
const int num_pages = num_extent_pages(buf);
const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
first_page_part - BTRFS_CSUM_SIZE);
for (i = 1; i < num_pages; i++) {
kaddr = page_address(buf->pages[i]);
crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
memset(result, 0, BTRFS_CSUM_SIZE);
crypto_shash_final(shash, result);
}
/*
* we can't consider a given block up to date unless the transid of the
* block matches the transid in the parent node's pointer. This is how we
* detect blocks that either didn't get written at all or got written
* in the wrong place.
*/
static int verify_parent_transid(struct extent_io_tree *io_tree,
struct extent_buffer *eb, u64 parent_transid,
int atomic)
{
struct extent_state *cached_state = NULL;
int ret;
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
if (atomic)
return -EAGAIN;
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
goto out;
}
btrfs_err_rl(eb->fs_info,
"parent transid verify failed on %llu wanted %llu found %llu",
eb->start,
parent_transid, btrfs_header_generation(eb));
ret = 1;
clear_extent_buffer_uptodate(eb);
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
return ret;
}
static bool btrfs_supported_super_csum(u16 csum_type)
{
switch (csum_type) {
case BTRFS_CSUM_TYPE_CRC32:
case BTRFS_CSUM_TYPE_XXHASH:
case BTRFS_CSUM_TYPE_SHA256:
case BTRFS_CSUM_TYPE_BLAKE2:
return true;
default:
return false;
}
}
/*
* Return 0 if the superblock checksum type matches the checksum value of that
* algorithm. Pass the raw disk superblock data.
*/
static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
char *raw_disk_sb)
{
struct btrfs_super_block *disk_sb =
(struct btrfs_super_block *)raw_disk_sb;
char result[BTRFS_CSUM_SIZE];
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
shash->tfm = fs_info->csum_shash;
/*
* The super_block structure does not span the whole
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
* filled with zeros and is included in the checksum.
*/
crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
if (memcmp(disk_sb->csum, result, fs_info->csum_size))
return 1;
return 0;
}
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
struct btrfs_key *first_key, u64 parent_transid)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int found_level;
struct btrfs_key found_key;
int ret;
found_level = btrfs_header_level(eb);
if (found_level != level) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: tree level check failed\n");
btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
eb->start, level, found_level);
return -EIO;
}
if (!first_key)
return 0;
/*
* For live tree block (new tree blocks in current transaction),
* we need proper lock context to avoid race, which is impossible here.
* So we only checks tree blocks which is read from disk, whose
* generation <= fs_info->last_trans_committed.
*/
if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
return 0;
/* We have @first_key, so this @eb must have at least one item */
if (btrfs_header_nritems(eb) == 0) {
btrfs_err(fs_info,
"invalid tree nritems, bytenr=%llu nritems=0 expect >0",
eb->start);
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
return -EUCLEAN;
}
if (found_level)
btrfs_node_key_to_cpu(eb, &found_key, 0);
else
btrfs_item_key_to_cpu(eb, &found_key, 0);
ret = btrfs_comp_cpu_keys(first_key, &found_key); if (ret) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: tree first key check failed\n");
btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
eb->start, parent_transid, first_key->objectid,
first_key->type, first_key->offset,
found_key.objectid, found_key.type,
found_key.offset);
}
return ret;
}
/*
* helper to read a given tree block, doing retries as required when
* the checksums don't match and we have alternate mirrors to try.
*
* @parent_transid: expected transid, skip check if 0
* @level: expected level, mandatory check
* @first_key: expected key of first slot, skip check if NULL
*/
static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
u64 parent_transid, int level,
struct btrfs_key *first_key)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
int failed = 0;
int ret;
int num_copies = 0;
int mirror_num = 0;
int failed_mirror = 0;
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
while (1) {
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
if (!ret) {
if (verify_parent_transid(io_tree, eb,
parent_transid, 0))
ret = -EIO;
else if (btrfs_verify_level_key(eb, level,
first_key, parent_transid))
ret = -EUCLEAN;
else
break;
}
num_copies = btrfs_num_copies(fs_info,
eb->start, eb->len);
if (num_copies == 1)
break;
if (!failed_mirror) {
failed = 1;
failed_mirror = eb->read_mirror;
}
mirror_num++;
if (mirror_num == failed_mirror)
mirror_num++; if (mirror_num > num_copies)
break;
}
if (failed && !ret && failed_mirror) btrfs_repair_eb_io_failure(eb, failed_mirror); return ret;
}
static int csum_one_extent_buffer(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u8 result[BTRFS_CSUM_SIZE];
int ret;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
offsetof(struct btrfs_header, fsid),
BTRFS_FSID_SIZE) == 0);
csum_tree_block(eb, result);
if (btrfs_header_level(eb))
ret = btrfs_check_node(eb);
else
ret = btrfs_check_leaf_full(eb); if (ret < 0)
goto error;
/*
* Also check the generation, the eb reached here must be newer than
* last committed. Or something seriously wrong happened.
*/
if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"block=%llu bad generation, have %llu expect > %llu",
eb->start, btrfs_header_generation(eb),
fs_info->last_trans_committed);
goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size); return 0;
error:
btrfs_print_tree(eb, 0);
btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
eb->start);
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
return ret;
}
/* Checksum all dirty extent buffers in one bio_vec */
static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
struct bio_vec *bvec)
{
struct page *page = bvec->bv_page;
u64 bvec_start = page_offset(page) + bvec->bv_offset;
u64 cur;
int ret = 0;
for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
cur += fs_info->nodesize) {
struct extent_buffer *eb;
bool uptodate;
eb = find_extent_buffer(fs_info, cur);
uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
fs_info->nodesize);
/* A dirty eb shouldn't disappear from buffer_radix */
if (WARN_ON(!eb))
return -EUCLEAN;
if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
free_extent_buffer(eb);
return -EUCLEAN;
}
if (WARN_ON(!uptodate)) {
free_extent_buffer(eb);
return -EUCLEAN;
}
ret = csum_one_extent_buffer(eb);
free_extent_buffer(eb);
if (ret < 0)
return ret;
}
return ret;
}
/*
* Checksum a dirty tree block before IO. This has extra checks to make sure
* we only fill in the checksum field in the first page of a multi-page block.
* For subpage extent buffers we need bvec to also read the offset in the page.
*/
static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
{
struct page *page = bvec->bv_page;
u64 start = page_offset(page);
u64 found_start;
struct extent_buffer *eb;
if (fs_info->sectorsize < PAGE_SIZE)
return csum_dirty_subpage_buffers(fs_info, bvec);
eb = (struct extent_buffer *)page->private;
if (page != eb->pages[0])
return 0;
found_start = btrfs_header_bytenr(eb);
if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
WARN_ON(found_start != 0);
return 0;
}
/*
* Please do not consolidate these warnings into a single if.
* It is useful to know what went wrong.
*/
if (WARN_ON(found_start != start))
return -EUCLEAN;
if (WARN_ON(!PageUptodate(page)))
return -EUCLEAN;
return csum_one_extent_buffer(eb);
}
static int check_tree_block_fsid(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
u8 fsid[BTRFS_FSID_SIZE];
u8 *metadata_uuid;
read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
BTRFS_FSID_SIZE);
/*
* Checking the incompat flag is only valid for the current fs. For
* seed devices it's forbidden to have their uuid changed so reading
* ->fsid in this case is fine
*/
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
metadata_uuid = fs_devices->metadata_uuid;
else
metadata_uuid = fs_devices->fsid;
if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
return 0;
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
return 0;
return 1;
}
/* Do basic extent buffer checks at read time */
static int validate_extent_buffer(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
const u32 csum_size = fs_info->csum_size;
u8 found_level;
u8 result[BTRFS_CSUM_SIZE];
const u8 *header_csum;
int ret = 0;
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
eb->start, found_start);
ret = -EIO;
goto out;
}
if (check_tree_block_fsid(eb)) {
btrfs_err_rl(fs_info, "bad fsid on block %llu",
eb->start);
ret = -EIO;
goto out;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
btrfs_err(fs_info, "bad tree block level %d on %llu",
(int)btrfs_header_level(eb), eb->start);
ret = -EIO;
goto out;
}
csum_tree_block(eb, result);
header_csum = page_address(eb->pages[0]) +
get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
"checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
eb->start,
CSUM_FMT_VALUE(csum_size, header_csum),
CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb));
ret = -EUCLEAN;
goto out;
}
/*
* If this is a leaf block and it is corrupt, set the corrupt bit so
* that we don't try and read the other copies of this block, just
* return -EIO.
*/
if (found_level == 0 && btrfs_check_leaf_full(eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
if (found_level > 0 && btrfs_check_node(eb))
ret = -EIO;
if (!ret)
set_extent_buffer_uptodate(eb);
else
btrfs_err(fs_info,
"block=%llu read time tree block corruption detected",
eb->start);
out:
return ret;
}
static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
int mirror)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
struct extent_buffer *eb;
bool reads_done;
int ret = 0;
/*
* We don't allow bio merge for subpage metadata read, so we should
* only get one eb for each endio hook.
*/
ASSERT(end == start + fs_info->nodesize - 1);
ASSERT(PagePrivate(page));
eb = find_extent_buffer(fs_info, start);
/*
* When we are reading one tree block, eb must have been inserted into
* the radix tree. If not, something is wrong.
*/
ASSERT(eb);
reads_done = atomic_dec_and_test(&eb->io_pages);
/* Subpage read must finish in page read */
ASSERT(reads_done);
eb->read_mirror = mirror;
if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
ret = -EIO;
goto err;
}
ret = validate_extent_buffer(eb);
if (ret < 0)
goto err;
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
btree_readahead_hook(eb, ret);
set_extent_buffer_uptodate(eb);
free_extent_buffer(eb);
return ret;
err:
/*
* end_bio_extent_readpage decrements io_pages in case of error,
* make sure it has something to decrement.
*/
atomic_inc(&eb->io_pages);
clear_extent_buffer_uptodate(eb);
free_extent_buffer(eb);
return ret;
}
int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
struct page *page, u64 start, u64 end,
int mirror)
{
struct extent_buffer *eb;
int ret = 0;
int reads_done;
ASSERT(page->private);
if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
return validate_subpage_buffer(page, start, end, mirror);
eb = (struct extent_buffer *)page->private;
/*
* The pending IO might have been the only thing that kept this buffer
* in memory. Make sure we have a ref for all this other checks
*/
atomic_inc(&eb->refs);
reads_done = atomic_dec_and_test(&eb->io_pages);
if (!reads_done)
goto err;
eb->read_mirror = mirror;
if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
ret = -EIO;
goto err;
}
ret = validate_extent_buffer(eb);
err:
if (reads_done &&
test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
btree_readahead_hook(eb, ret);
if (ret) {
/*
* our io error hook is going to dec the io pages
* again, we have to make sure it has something
* to decrement
*/
atomic_inc(&eb->io_pages);
clear_extent_buffer_uptodate(eb);
}
free_extent_buffer(eb);
return ret;
}
static void end_workqueue_bio(struct bio *bio)
{
struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
struct btrfs_workqueue *wq;
fs_info = end_io_wq->info;
end_io_wq->status = bio->bi_status;
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) wq = fs_info->endio_meta_write_workers; else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) wq = fs_info->endio_freespace_worker; else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) wq = fs_info->endio_raid56_workers;
else
wq = fs_info->endio_write_workers;
} else {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) wq = fs_info->endio_raid56_workers; else if (end_io_wq->metadata) wq = fs_info->endio_meta_workers;
else
wq = fs_info->endio_workers;
}
btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
btrfs_queue_work(wq, &end_io_wq->work);
}
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata)
{
struct btrfs_end_io_wq *end_io_wq;
end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
if (!end_io_wq)
return BLK_STS_RESOURCE;
end_io_wq->private = bio->bi_private;
end_io_wq->end_io = bio->bi_end_io;
end_io_wq->info = info;
end_io_wq->status = 0;
end_io_wq->bio = bio;
end_io_wq->metadata = metadata;
bio->bi_private = end_io_wq;
bio->bi_end_io = end_workqueue_bio;
return 0;
}
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
ret = async->submit_bio_start(async->inode, async->bio,
async->dio_file_offset);
if (ret)
async->status = ret;
}
/*
* In order to insert checksums into the metadata in large chunks, we wait
* until bio submission time. All the pages in the bio are checksummed and
* sums are attached onto the ordered extent record.
*
* At IO completion time the csums attached on the ordered extent record are
* inserted into the tree.
*/
static void run_one_async_done(struct btrfs_work *work)
{
struct async_submit_bio *async;
struct inode *inode;
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
inode = async->inode;
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
async->bio->bi_status = async->status;
bio_endio(async->bio);
return;
}
/*
* All of the bios that pass through here are from async helpers.
* Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
* This changes nothing when cgroups aren't in use.
*/
async->bio->bi_opf |= REQ_CGROUP_PUNT;
ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
if (ret) {
async->bio->bi_status = ret;
bio_endio(async->bio);
}
}
static void run_one_async_free(struct btrfs_work *work)
{
struct async_submit_bio *async;
async = container_of(work, struct async_submit_bio, work);
kfree(async);
}
blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
if (!async)
return BLK_STS_RESOURCE;
async->inode = inode;
async->bio = bio;
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
run_one_async_free);
async->dio_file_offset = dio_file_offset;
async->status = 0;
if (op_is_sync(bio->bi_opf))
btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); return 0;}
static blk_status_t btree_csum_one_bio(struct bio *bio)
{
struct bio_vec *bvec;
struct btrfs_root *root;
int ret = 0;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
ret = csum_dirty_buffer(root->fs_info, bvec);
if (ret)
break;
}
return errno_to_blk_status(ret);
}
static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
u64 dio_file_offset)
{
/*
* when we're called for a write, we're already in the async
* submission context. Just jump into btrfs_map_bio
*/
return btree_csum_one_bio(bio);
}
static bool should_async_write(struct btrfs_fs_info *fs_info,
struct btrfs_inode *bi)
{
if (btrfs_is_zoned(fs_info))
return false;
if (atomic_read(&bi->sync_writers))
return false;
if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
return false;
return true;
}
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
blk_status_t ret;
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
*/
ret = btrfs_bio_wq_end_io(fs_info, bio,
BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else if (!should_async_write(fs_info, BTRFS_I(inode))) {
ret = btree_csum_one_bio(bio);
if (ret)
goto out_w_error;
ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else {
/*
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
0, btree_submit_bio_start);
}
if (ret)
goto out_w_error;
return 0;
out_w_error:
bio->bi_status = ret;
bio_endio(bio);
return ret;
}
#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
struct page *newpage, struct page *page,
enum migrate_mode mode)
{
/*
* we can't safely write a btree page from here,
* we haven't done the locking hook
*/
if (PageDirty(page))
return -EAGAIN;
/*
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
return migrate_page(mapping, newpage, page, mode);
}
#endif
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct btrfs_fs_info *fs_info;
int ret;
if (wbc->sync_mode == WB_SYNC_NONE) { if (wbc->for_kupdate)
return 0;
fs_info = BTRFS_I(mapping->host)->root->fs_info;
/* this is a bit racy, but that's ok */
ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
BTRFS_DIRTY_METADATA_THRESH,
fs_info->dirty_metadata_batch);
if (ret < 0)
return 0;
}
return btree_write_cache_pages(mapping, wbc);
}
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
if (PageWriteback(page) || PageDirty(page))
return 0;
return try_release_extent_buffer(page);
}
static void btree_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
extent_invalidatepage(tree, page, offset);
btree_releasepage(page, GFP_NOFS);
if (PagePrivate(page)) {
btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
"page private not zero on page %llu",
(unsigned long long)page_offset(page));
detach_page_private(page);
}
}
static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
struct btrfs_subpage *subpage;
struct extent_buffer *eb;
int cur_bit = 0;
u64 page_start = page_offset(page);
if (fs_info->sectorsize == PAGE_SIZE) {
BUG_ON(!PagePrivate(page));
eb = (struct extent_buffer *)page->private;
BUG_ON(!eb);
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
btrfs_assert_tree_locked(eb);
return __set_page_dirty_nobuffers(page);
}
ASSERT(PagePrivate(page) && page->private);
subpage = (struct btrfs_subpage *)page->private;
ASSERT(subpage->dirty_bitmap);
while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
unsigned long flags;
u64 cur;
u16 tmp = (1 << cur_bit);
spin_lock_irqsave(&subpage->lock, flags);
if (!(tmp & subpage->dirty_bitmap)) {
spin_unlock_irqrestore(&subpage->lock, flags);
cur_bit++;
continue;
}
spin_unlock_irqrestore(&subpage->lock, flags);
cur = page_start + cur_bit * fs_info->sectorsize;
eb = find_extent_buffer(fs_info, cur);
ASSERT(eb);
ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
ASSERT(atomic_read(&eb->refs));
btrfs_assert_tree_locked(eb);
free_extent_buffer(eb);
cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
}
#endif
return __set_page_dirty_nobuffers(page);
}
static const struct address_space_operations btree_aops = {
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
#endif
.set_page_dirty = btree_set_page_dirty,
};
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root,
int level)
{
if (btrfs_is_testing(fs_info))
return alloc_test_extent_buffer(fs_info, bytenr);
return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
}
/*
* Read tree block at logical address @bytenr and do variant basic but critical
* verification.
*
* @owner_root: the objectid of the root owner for this block.
* @parent_transid: expected transid of this tree block, skip check if 0
* @level: expected level, mandatory check
* @first_key: expected key in slot 0, skip check if NULL
*/
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 owner_root, u64 parent_transid,
int level, struct btrfs_key *first_key)
{
struct extent_buffer *buf = NULL;
int ret;
buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
if (IS_ERR(buf))
return buf;
ret = btree_read_extent_buffer_pages(buf, parent_transid,
level, first_key);
if (ret) { free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
return buf;
}
void btrfs_clean_tree_block(struct extent_buffer *buf)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
if (btrfs_header_generation(buf) ==
fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
-buf->len,
fs_info->dirty_metadata_batch);
clear_extent_buffer_dirty(buf);
}
}
}
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
root->fs_info = fs_info;
root->node = NULL;
root->commit_root = NULL;
root->state = 0;
root->orphan_cleanup_state = 0;
root->last_trans = 0;
root->free_objectid = 0;
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
root->block_rsv = NULL;
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->root_list);
INIT_LIST_HEAD(&root->delalloc_inodes);
INIT_LIST_HEAD(&root->delalloc_root);
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->reloc_dirty_list);
INIT_LIST_HEAD(&root->logged_list[0]);
INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->inode_lock);
spin_lock_init(&root->delalloc_lock);
spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
spin_lock_init(&root->log_extents_lock[0]);
spin_lock_init(&root->log_extents_lock[1]);
spin_lock_init(&root->qgroup_meta_rsv_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
mutex_init(&root->ordered_extent_mutex);
mutex_init(&root->delalloc_mutex);
init_waitqueue_head(&root->qgroup_flush_wait);
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
INIT_LIST_HEAD(&root->log_ctxs[0]);
INIT_LIST_HEAD(&root->log_ctxs[1]);
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
refcount_set(&root->refs, 1);
atomic_set(&root->snapshot_force_cow, 0);
atomic_set(&root->nr_swapfiles, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
extent_io_tree_init(fs_info, &root->log_csum_range,
IO_TREE_LOG_CSUM_RANGE, NULL);
}
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
root->root_key.objectid = objectid;
root->anon_dev = 0;
spin_lock_init(&root->root_item_lock);
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&root->leak_list);
spin_lock(&fs_info->fs_roots_radix_lock);
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
}
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
u64 objectid, gfp_t flags)
{
struct btrfs_root *root = kzalloc(sizeof(*root), flags);
if (root)
__setup_root(root, fs_info, objectid);
return root;
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
if (!fs_info)
return ERR_PTR(-EINVAL);
root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);
/* We don't use the stripesize in selftest, set it as sectorsize */
root->alloc_bytenr = 0;
return root;
}
#endif
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct extent_buffer *leaf;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root;
struct btrfs_key key;
unsigned int nofs_flag;
int ret = 0;
/*
* We're holding a transaction handle, so use a NOFS memory allocation
* context to avoid deadlock if reclaim happens.
*/
nofs_flag = memalloc_nofs_save();
root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
if (!root)
return ERR_PTR(-ENOMEM);
root->root_key.objectid = objectid;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
goto fail_unlock;
}
root->node = leaf;
btrfs_mark_buffer_dirty(leaf);
root->commit_root = btrfs_root_node(root);
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
btrfs_set_root_flags(&root->root_item, 0);
btrfs_set_root_limit(&root->root_item, 0);
btrfs_set_root_bytenr(&root->root_item, leaf->start);
btrfs_set_root_generation(&root->root_item, trans->transid);
btrfs_set_root_level(&root->root_item, 0);
btrfs_set_root_refs(&root->root_item, 1);
btrfs_set_root_used(&root->root_item, leaf->len);
btrfs_set_root_last_snapshot(&root->root_item, 0);
btrfs_set_root_dirid(&root->root_item, 0);
if (is_fstree(objectid))
generate_random_guid(root->root_item.uuid);
else
export_guid(root->root_item.uuid, &guid_null);
btrfs_set_root_drop_level(&root->root_item, 0);
btrfs_tree_unlock(leaf);
key.objectid = objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = 0;
ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
if (ret)
goto fail;
return root;
fail_unlock:
if (leaf)
btrfs_tree_unlock(leaf);
fail:
btrfs_put_root(root);
return ERR_PTR(ret);
}
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
if (!root)
return ERR_PTR(-ENOMEM);
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
return root;
}
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct extent_buffer *leaf;
/*
* DON'T set SHAREABLE bit for log trees.
*
* Log trees are not exposed to user space thus can't be snapshotted,
* and they go away before a real commit is actually done.
*
* They do store pointers to file data extents, and those reference
* counts still get updated (along with back refs to the log tree).
*/
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf))
return PTR_ERR(leaf); root->node = leaf;
btrfs_mark_buffer_dirty(root->node);
btrfs_tree_unlock(root->node);
return 0;
}
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
struct btrfs_root *log_root;
log_root = alloc_log_tree(trans, fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
if (!btrfs_is_zoned(fs_info)) { int ret = btrfs_alloc_log_tree_node(trans, log_root);
if (ret) {
btrfs_put_root(log_root); return ret;
}
}
WARN_ON(fs_info->log_root_tree); fs_info->log_root_tree = log_root; return 0;
}
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log_root;
struct btrfs_inode_item *inode_item;
int ret;
log_root = alloc_log_tree(trans, fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
ret = btrfs_alloc_log_tree_node(trans, log_root);
if (ret) {
btrfs_put_root(log_root);
return ret;
}
log_root->last_trans = trans->transid; log_root->root_key.offset = root->root_key.objectid;
inode_item = &log_root->root_item.inode;
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
btrfs_set_stack_inode_nbytes(inode_item,
fs_info->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_node(&log_root->root_item, log_root->node);
WARN_ON(root->log_root); root->log_root = log_root;
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
return 0;
}
static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
struct btrfs_path *path,
struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_fs_info *fs_info = tree_root->fs_info;
u64 generation;
int ret;
int level;
root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
if (!root)
return ERR_PTR(-ENOMEM);
ret = btrfs_find_root(tree_root, key, path,
&root->root_item, &root->root_key);
if (ret) {
if (ret > 0)
ret = -ENOENT;
goto fail;
}
generation = btrfs_root_generation(&root->root_item);
level = btrfs_root_level(&root->root_item);
root->node = read_tree_block(fs_info,
btrfs_root_bytenr(&root->root_item),
key->objectid, generation, level, NULL);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
goto fail;
} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
goto fail;
}
root->commit_root = btrfs_root_node(root);
return root;
fail:
btrfs_put_root(root);
return ERR_PTR(ret);
}
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_path *path;
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
root = read_tree_root_path(tree_root, path, key);
btrfs_free_path(path);
return root;
}
/*
* Initialize subvolume root in-memory structure
*
* @anon_dev: anonymous device to attach to the root, if zero, allocate new
*/
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
int ret;
unsigned int nofs_flag;
/*
* We might be called under a transaction (e.g. indirect backref
* resolution) which could deadlock if it triggers memory reclaim
*/
nofs_flag = memalloc_nofs_save();
ret = btrfs_drew_lock_init(&root->snapshot_lock);
memalloc_nofs_restore(nofs_flag);
if (ret)
goto fail;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
!btrfs_is_data_reloc_root(root)) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
/*
* Don't assign anonymous block device to roots that are not exposed to
* userspace, the id pool is limited to 1M
*/
if (is_fstree(root->root_key.objectid) &&
btrfs_root_refs(&root->root_item) > 0) {
if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); if (ret)
goto fail;
} else {
root->anon_dev = anon_dev;
}
}
mutex_lock(&root->objectid_mutex);
ret = btrfs_init_root_free_objectid(root);
if (ret) {
mutex_unlock(&root->objectid_mutex);
goto fail;
}
ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
mutex_unlock(&root->objectid_mutex);
return 0;
fail:
/* The caller is responsible to call btrfs_free_fs_root */
return ret;
}
static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
u64 root_id)
{
struct btrfs_root *root;
spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
(unsigned long)root_id);
if (root)
root = btrfs_grab_root(root);
spin_unlock(&fs_info->fs_roots_radix_lock);
return root;
}
static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
u64 objectid)
{
if (objectid == BTRFS_ROOT_TREE_OBJECTID) return btrfs_grab_root(fs_info->tree_root); if (objectid == BTRFS_EXTENT_TREE_OBJECTID) return btrfs_grab_root(fs_info->extent_root); if (objectid == BTRFS_CHUNK_TREE_OBJECTID) return btrfs_grab_root(fs_info->chunk_root); if (objectid == BTRFS_DEV_TREE_OBJECTID) return btrfs_grab_root(fs_info->dev_root); if (objectid == BTRFS_CSUM_TREE_OBJECTID) return btrfs_grab_root(fs_info->csum_root); if (objectid == BTRFS_QUOTA_TREE_OBJECTID) return btrfs_grab_root(fs_info->quota_root) ? fs_info->quota_root : ERR_PTR(-ENOENT); if (objectid == BTRFS_UUID_TREE_OBJECTID) return btrfs_grab_root(fs_info->uuid_root) ? fs_info->uuid_root : ERR_PTR(-ENOENT); if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) return btrfs_grab_root(fs_info->free_space_root) ? fs_info->free_space_root : ERR_PTR(-ENOENT);
return NULL;
}
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root)
{
int ret;
ret = radix_tree_preload(GFP_NOFS); if (ret)
return ret;
spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
if (ret == 0) {
btrfs_grab_root(root);
set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
}
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
return ret;
}
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
struct btrfs_root *root;
while (!list_empty(&fs_info->allocated_roots)) {
char buf[BTRFS_ROOT_NAME_BUF_LEN];
root = list_first_entry(&fs_info->allocated_roots,
struct btrfs_root, leak_list);
btrfs_err(fs_info, "leaked root %s refcount %d",
btrfs_root_name(&root->root_key, buf),
refcount_read(&root->refs));
while (refcount_read(&root->refs) > 1)
btrfs_put_root(root);
btrfs_put_root(root);
}
#endif
}
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
percpu_counter_destroy(&fs_info->ordered_bytes);
percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
btrfs_free_csum_hash(fs_info);
btrfs_free_stripe_hash_table(fs_info);
btrfs_free_ref_cache(fs_info);
kfree(fs_info->balance_ctl);
kfree(fs_info->delayed_root);
btrfs_put_root(fs_info->extent_root);
btrfs_put_root(fs_info->tree_root);
btrfs_put_root(fs_info->chunk_root);
btrfs_put_root(fs_info->dev_root);
btrfs_put_root(fs_info->csum_root);
btrfs_put_root(fs_info->quota_root);
btrfs_put_root(fs_info->uuid_root);
btrfs_put_root(fs_info->free_space_root);
btrfs_put_root(fs_info->fs_root);
btrfs_put_root(fs_info->data_reloc_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
kvfree(fs_info);
}
/*
* Get an in-memory reference of a root structure.
*
* For essential trees like root/extent tree, we grab it from fs_info directly.
* For subvolume trees, we check the cached filesystem roots first. If not
* found, then read it from disk and add it to cached fs roots.
*
* Caller should release the root by calling btrfs_put_root() after the usage.
*
* NOTE: Reloc and log trees can't be read by this function as they share the
* same root objectid.
*
* @objectid: root id
* @anon_dev: preallocated anonymous block device number for new roots,
* pass 0 for new allocation.
* @check_ref: whether to check root item references, If true, return -ENOENT
* for orphan roots
*/
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
u64 objectid, dev_t anon_dev,
bool check_ref)
{
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
int ret;
root = btrfs_get_global_root(fs_info, objectid); if (root)
return root;
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
/* Shouldn't get preallocated anon_dev for cached roots */
ASSERT(!anon_dev);
if (check_ref && btrfs_root_refs(&root->root_item) == 0) { btrfs_put_root(root);
return ERR_PTR(-ENOENT);
}
return root;
}
key.objectid = objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
root = btrfs_read_tree_root(fs_info->tree_root, &key);
if (IS_ERR(root))
return root;
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
ret = -ENOENT;
goto fail;
}
ret = btrfs_init_fs_root(root, anon_dev);
if (ret)
goto fail;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto fail;
}
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
btrfs_free_path(path);
if (ret < 0)
goto fail;
if (ret == 0) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
if (ret == -EEXIST) { btrfs_put_root(root); goto again;
}
goto fail;
}
return root;
fail:
/*
* If our caller provided us an anonymous device, then it's his
* responsability to free it in case we fail. So we have to set our
* root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
* and once again by our caller.
*/
if (anon_dev) root->anon_dev = 0; btrfs_put_root(root);
return ERR_PTR(ret);
}
/*
* Get in-memory reference of a root structure
*
* @objectid: tree objectid
* @check_ref: if set, verify that the tree exists and the item has at least
* one reference
*/
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref)
{
return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
}
/*
* Get in-memory reference of a root structure, created as new, optionally pass
* the anonymous block device id
*
* @objectid: tree objectid
* @anon_dev: if zero, allocate a new anonymous block device or use the
* parameter value
*/
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, dev_t anon_dev)
{
return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}
/*
* btrfs_get_fs_root_commit_root - return a root for the given objectid
* @fs_info: the fs_info
* @objectid: the objectid we need to lookup
*
* This is exclusively used for backref walking, and exists specifically because
* of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref
* creation time, which means we may have to read the tree_root in order to look
* up a fs root that is not in memory. If the root is not in memory we will
* read the tree root commit root and look up the fs root from there. This is a
* temporary root, it will not be inserted into the radix tree as it doesn't
* have the most uptodate information, it'll simply be discarded once the
* backref code is finished using the root.
*/
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
u64 objectid)
{
struct btrfs_root *root;
struct btrfs_key key;
ASSERT(path->search_commit_root && path->skip_locking);
/*
* This can return -ENOENT if we ask for a root that doesn't exist, but
* since this is called via the backref walking code we won't be looking
* up a root that doesn't exist, unless there's corruption. So if root
* != NULL just return it.
*/
root = btrfs_get_global_root(fs_info, objectid);
if (root)
return root;
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root)
return root;
key.objectid = objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
root = read_tree_root_path(fs_info->tree_root, path, &key);
btrfs_release_path(path);
return root;
}
/*
* called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens
*/
static void end_workqueue_fn(struct btrfs_work *work)
{
struct bio *bio;
struct btrfs_end_io_wq *end_io_wq;
end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
bio = end_io_wq->bio;
bio->bi_status = end_io_wq->status;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
bio_endio(bio);
kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
}
static int cleaner_kthread(void *arg)
{
struct btrfs_root *root = arg;
struct btrfs_fs_info *fs_info = root->fs_info;
int again;
while (1) {
again = 0;
set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
/* Make the cleaner go to sleep early. */
if (btrfs_need_cleaner_sleep(fs_info))
goto sleep;
/*
* Do not do anything if we might cause open_ctree() to block
* before we have finished mounting the filesystem.
*/
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
goto sleep;
if (!mutex_trylock(&fs_info->cleaner_mutex))
goto sleep;
/*
* Avoid the problem that we change the status of the fs
* during the above check and trylock.
*/
if (btrfs_need_cleaner_sleep(fs_info)) {
mutex_unlock(&fs_info->cleaner_mutex);
goto sleep;
}
btrfs_run_delayed_iputs(fs_info);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&fs_info->cleaner_mutex);
/*
* The defragger has dealt with the R/O remount and umount,
* needn't do anything special here.
*/
btrfs_run_defrag_inodes(fs_info);
/*
* Acquires fs_info->reclaim_bgs_lock to avoid racing
* with relocation (btrfs_relocate_chunk) and relocation
* acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
* after acquiring fs_info->reclaim_bgs_lock. So we
* can't hold, nor need to, fs_info->cleaner_mutex when deleting
* unused block groups.
*/
btrfs_delete_unused_bgs(fs_info);
/*
* Reclaim block groups in the reclaim_bgs list after we deleted
* all unused block_groups. This possibly gives us some more free
* space.
*/
btrfs_reclaim_bgs(fs_info);
sleep:
clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
if (kthread_should_park())
kthread_parkme();
if (kthread_should_stop())
return 0;
if (!again) {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
__set_current_state(TASK_RUNNING);
}
}
}
static int transaction_kthread(void *arg)
{
struct btrfs_root *root = arg;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_transaction *cur;
u64 transid;
time64_t delta;
unsigned long delay;
bool cannot_commit;
do {
cannot_commit = false;
delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
mutex_lock(&fs_info->transaction_kthread_mutex);
spin_lock(&fs_info->trans_lock);
cur = fs_info->running_transaction;
if (!cur) {
spin_unlock(&fs_info->trans_lock);
goto sleep;
}
delta = ktime_get_seconds() - cur->start_time;
if (cur->state < TRANS_STATE_COMMIT_START &&
delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
delay -= msecs_to_jiffies((delta - 1) * 1000);
delay = min(delay,
msecs_to_jiffies(fs_info->commit_interval * 1000));
goto sleep;
}
transid = cur->transid;
spin_unlock(&fs_info->trans_lock);
/* If the file system is aborted, this will always fail. */
trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT)
cannot_commit = true;
goto sleep;
}
if (transid == trans->transid) {
btrfs_commit_transaction(trans);
} else {
btrfs_end_transaction(trans);
}
sleep:
wake_up_process(fs_info->cleaner_kthread);
mutex_unlock(&fs_info->transaction_kthread_mutex);
if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
&fs_info->fs_state)))
btrfs_cleanup_transaction(fs_info);
if (!kthread_should_stop() &&
(!btrfs_transaction_blocked(fs_info) ||
cannot_commit))
schedule_timeout_interruptible(delay);
} while (!kthread_should_stop());
return 0;
}
/*
* This will find the highest generation in the array of root backups. The
* index of the highest array is returned, or -EINVAL if we can't find
* anything.
*
* We check to make sure the array is valid by comparing the
* generation of the latest root in the array with the generation
* in the super block. If they don't match we pitch it.
*/
static int find_newest_super_backup(struct btrfs_fs_info *info)
{
const u64 newest_gen = btrfs_super_generation(info->super_copy);
u64 cur;
struct btrfs_root_backup *root_backup;
int i;
for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { root_backup = info->super_copy->super_roots + i;
cur = btrfs_backup_tree_root_gen(root_backup);
if (cur == newest_gen)
return i;
}
return -EINVAL;
}
/*
* copy all the root pointers into the super backup array.
* this will bump the backup pointer by one when it is
* done
*/
static void backup_super_roots(struct btrfs_fs_info *info)
{
const int next_backup = info->backup_root_index;
struct btrfs_root_backup *root_backup;
root_backup = info->super_for_commit->super_roots + next_backup;
/*
* make sure all of our padding and empty slots get zero filled
* regardless of which ones we use today
*/
memset(root_backup, 0, sizeof(*root_backup));
info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
btrfs_set_backup_tree_root_gen(root_backup,
btrfs_header_generation(info->tree_root->node));
btrfs_set_backup_tree_root_level(root_backup,
btrfs_header_level(info->tree_root->node));
btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
btrfs_set_backup_chunk_root_gen(root_backup,
btrfs_header_generation(info->chunk_root->node));
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
btrfs_set_backup_extent_root_gen(root_backup,
btrfs_header_generation(info->extent_root->node));
btrfs_set_backup_extent_root_level(root_backup,
btrfs_header_level(info->extent_root->node));
/*
* we might commit during log recovery, which happens before we set
* the fs_root. Make sure it is valid before we fill it in.
*/
if (info->fs_root && info->fs_root->node) { btrfs_set_backup_fs_root(root_backup,
info->fs_root->node->start);
btrfs_set_backup_fs_root_gen(root_backup,
btrfs_header_generation(info->fs_root->node));
btrfs_set_backup_fs_root_level(root_backup,
btrfs_header_level(info->fs_root->node));
}
btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
btrfs_set_backup_dev_root_gen(root_backup,
btrfs_header_generation(info->dev_root->node));
btrfs_set_backup_dev_root_level(root_backup,
btrfs_header_level(info->dev_root->node));
btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
btrfs_set_backup_csum_root_gen(root_backup,
btrfs_header_generation(info->csum_root->node));
btrfs_set_backup_csum_root_level(root_backup,
btrfs_header_level(info->csum_root->node));
btrfs_set_backup_total_bytes(root_backup,
btrfs_super_total_bytes(info->super_copy));
btrfs_set_backup_bytes_used(root_backup,
btrfs_super_bytes_used(info->super_copy));
btrfs_set_backup_num_devices(root_backup,
btrfs_super_num_devices(info->super_copy));
/*
* if we don't copy this out to the super_copy, it won't get remembered
* for the next commit
*/
memcpy(&info->super_copy->super_roots,
&info->super_for_commit->super_roots,
sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
}
/*
* read_backup_root - Reads a backup root based on the passed priority. Prio 0
* is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
*
* fs_info - filesystem whose backup roots need to be read
* priority - priority of backup root required
*
* Returns backup root index on success and -EINVAL otherwise.
*/
static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
{
int backup_index = find_newest_super_backup(fs_info);
struct btrfs_super_block *super = fs_info->super_copy;
struct btrfs_root_backup *root_backup;
if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
if (priority == 0)
return backup_index;
backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
backup_index %= BTRFS_NUM_BACKUP_ROOTS;
} else {
return -EINVAL;
}
root_backup = super->super_roots + backup_index;
btrfs_set_super_generation(super,
btrfs_backup_tree_root_gen(root_backup));
btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
btrfs_set_super_root_level(super,
btrfs_backup_tree_root_level(root_backup));
btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
/*
* Fixme: the total bytes and num_devices need to match or we should
* need a fsck
*/
btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
return backup_index;
}
/* helper to cleanup workers */
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
btrfs_destroy_workqueue(fs_info->fixup_workers);
btrfs_destroy_workqueue(fs_info->delalloc_workers);
btrfs_destroy_workqueue(fs_info->workers);
btrfs_destroy_workqueue(fs_info->endio_workers);
btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
btrfs_destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
btrfs_destroy_workqueue(fs_info->caching_workers);
btrfs_destroy_workqueue(fs_info->readahead_workers);
btrfs_destroy_workqueue(fs_info->flush_workers);
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
if (fs_info->discard_ctl.discard_workers)
destroy_workqueue(fs_info->discard_ctl.discard_workers);
/*
* Now that all other work queues are destroyed, we can safely destroy
* the queues used for metadata I/O, since tasks from those other work
* queues can do metadata I/O operations.
*/
btrfs_destroy_workqueue(fs_info->endio_meta_workers);
btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
}
static void free_root_extent_buffers(struct btrfs_root *root)
{
if (root) { free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
root->node = NULL;
root->commit_root = NULL;
}
}
/* helper to cleanup tree roots */
static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
free_root_extent_buffers(info->tree_root); free_root_extent_buffers(info->dev_root); free_root_extent_buffers(info->extent_root); free_root_extent_buffers(info->csum_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); free_root_extent_buffers(info->free_space_root);
}
void btrfs_put_root(struct btrfs_root *root)
{
if (!root)
return;
if (refcount_dec_and_test(&root->refs)) { WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); btrfs_drew_lock_destroy(&root->snapshot_lock);
free_root_extent_buffers(root);
#ifdef CONFIG_BTRFS_DEBUG
spin_lock(&root->fs_info->fs_roots_radix_lock);
list_del_init(&root->leak_list);
spin_unlock(&root->fs_info->fs_roots_radix_lock);
#endif
kfree(root);
}
}
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
int ret;
struct btrfs_root *gang[8];
int i;
while (!list_empty(&fs_info->dead_roots)) { gang[0] = list_entry(fs_info->dead_roots.next,
struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
btrfs_drop_and_free_fs_root(fs_info, gang[0]); btrfs_put_root(gang[0]);
}
while (1) {
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, 0,
ARRAY_SIZE(gang));
if (!ret)
break;
for (i = 0; i < ret; i++) btrfs_drop_and_free_fs_root(fs_info, gang[i]);
}
}
static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
{
mutex_init(&fs_info->scrub_lock);
atomic_set(&fs_info->scrubs_running, 0);
atomic_set(&fs_info->scrub_pause_req, 0);
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
init_waitqueue_head(&fs_info->scrub_pause_wait);
refcount_set(&fs_info->scrub_workers_refcnt, 0);
}
static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
{
spin_lock_init(&fs_info->balance_lock);
mutex_init(&fs_info->balance_mutex);
atomic_set(&fs_info->balance_pause_req, 0);
atomic_set(&fs_info->balance_cancel_req, 0);
fs_info->balance_ctl = NULL;
init_waitqueue_head(&fs_info->balance_wait_q);
atomic_set(&fs_info->reloc_cancel_req, 0);
}
static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
{
struct inode *inode = fs_info->btree_inode;
inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
set_nlink(inode, 1);
/*
* we set the i_size on the btree inode to the max possible int.
* the real end of the address space is determined by all of
* the devices in the system
*/
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &btree_aops;
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
IO_TREE_BTREE_INODE_IO, inode);
BTRFS_I(inode)->io_tree.track_uptodate = false;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
btrfs_insert_inode_hash(inode);
}
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
{
mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
init_rwsem(&fs_info->dev_replace.rwsem);
init_waitqueue_head(&fs_info->dev_replace.replace_wait);
}
static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
{
spin_lock_init(&fs_info->qgroup_lock);
mutex_init(&fs_info->qgroup_ioctl_lock);
fs_info->qgroup_tree = RB_ROOT;
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
fs_info->qgroup_seq = 1;
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
mutex_init(&fs_info->qgroup_rescan_lock);
}
static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
fs_info->workers =
btrfs_alloc_workqueue(fs_info, "worker",
flags | WQ_HIGHPRI, max_active, 16);
fs_info->delalloc_workers =
btrfs_alloc_workqueue(fs_info, "delalloc",
flags, max_active, 2);
fs_info->flush_workers =
btrfs_alloc_workqueue(fs_info, "flush_delalloc",
flags, max_active, 0);
fs_info->caching_workers =
btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
fs_info->fixup_workers =
btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
/*
* endios are largely parallel and should have a very
* low idle thresh
*/
fs_info->endio_workers =
btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
fs_info->endio_meta_workers =
btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
max_active, 4);
fs_info->endio_meta_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
max_active, 2);
fs_info->endio_raid56_workers =
btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
max_active, 4);
fs_info->rmw_workers =
btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
fs_info->endio_freespace_worker =
btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
max_active, 0);
fs_info->delayed_workers =
btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
max_active, 0);
fs_info->readahead_workers =
btrfs_alloc_workqueue(fs_info, "readahead", flags,
max_active, 2);
fs_info->qgroup_rescan_workers =
btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
fs_info->discard_ctl.discard_workers =
alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
fs_info->discard_ctl.discard_workers)) {
return -ENOMEM;
}
return 0;
}
static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
{
struct crypto_shash *csum_shash;
const char *csum_driver = btrfs_super_csum_driver(csum_type);
csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
if (IS_ERR(csum_shash)) {
btrfs_err(fs_info, "error allocating %s hash for checksum",
csum_driver);
return PTR_ERR(csum_shash);
}
fs_info->csum_shash = csum_shash;
return 0;
}
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
int ret;
struct btrfs_root *log_tree_root;
struct btrfs_super_block *disk_super = fs_info->super_copy;
u64 bytenr = btrfs_super_log_root(disk_super);
int level = btrfs_super_log_root_level(disk_super);
if (fs_devices->rw_devices == 0) {
btrfs_warn(fs_info, "log replay required on RO media");
return -EIO;
}
log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
GFP_KERNEL);
if (!log_tree_root)
return -ENOMEM;
log_tree_root->node = read_tree_block(fs_info, bytenr,
BTRFS_TREE_LOG_OBJECTID,
fs_info->generation + 1, level,
NULL);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
log_tree_root->node = NULL;
btrfs_put_root(log_tree_root);
return ret;
} else if (!extent_buffer_uptodate(log_tree_root->node)) {
btrfs_err(fs_info, "failed to read log tree");
btrfs_put_root(log_tree_root);
return -EIO;
}
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Failed to recover log tree");
btrfs_put_root(log_tree_root);
return ret;
}
if (sb_rdonly(fs_info->sb)) { ret = btrfs_commit_super(fs_info);
if (ret)
return ret;
}
return 0;
}
static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root;
struct btrfs_key location;
int ret;
BUG_ON(!fs_info->tree_root); location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = 0;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
ret = PTR_ERR(root);
goto out;
}
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->extent_root = root;
}
location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
ret = PTR_ERR(root);
goto out;
}
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->dev_root = root;
}
/* Initialize fs_info for all devices in any case */
btrfs_init_devices_late(fs_info);
/* If IGNOREDATACSUMS is set don't bother reading the csum root. */
if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
location.objectid = BTRFS_CSUM_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
ret = PTR_ERR(root);
goto out;
}
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->csum_root = root;
}
}
/*
* This tree can share blocks with some other fs tree during relocation
* and we need a proper setup by btrfs_get_fs_root
*/
root = btrfs_get_fs_root(tree_root->fs_info,
BTRFS_DATA_RELOC_TREE_OBJECTID, true);
if (IS_ERR(root)) {
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
ret = PTR_ERR(root);
goto out;
}
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->data_reloc_root = root;
}
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (!IS_ERR(root)) {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
fs_info->quota_root = root;
}
location.objectid = BTRFS_UUID_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
ret = PTR_ERR(root);
if (ret != -ENOENT)
goto out;
}
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->uuid_root = root;
}
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
ret = PTR_ERR(root);
goto out;
}
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->free_space_root = root;
}
}
return 0;
out:
btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
location.objectid, ret);
return ret;
}
/*
* Real super block validation
* NOTE: super csum type and incompat features will not be checked here.
*
* @sb: super block to check
* @mirror_num: the super block number to check its bytenr:
* 0 the primary (1st) sb
* 1, 2 2nd and 3rd backup copy
* -1 skip bytenr check
*/
static int validate_super(struct btrfs_fs_info *fs_info,
struct btrfs_super_block *sb, int mirror_num)
{
u64 nodesize = btrfs_super_nodesize(sb);
u64 sectorsize = btrfs_super_sectorsize(sb);
int ret = 0;
if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
btrfs_err(fs_info, "no valid FS found");
ret = -EINVAL;
}
if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
ret = -EINVAL;
}
if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
btrfs_err(fs_info, "tree_root level too big: %d >= %d",
btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
ret = -EINVAL;
}
if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
ret = -EINVAL;
}
if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
btrfs_err(fs_info, "log_root level too big: %d >= %d",
btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
ret = -EINVAL;
}
/*
* Check sectorsize and nodesize first, other check will need it.
* Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
*/
if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
ret = -EINVAL;
}
/*
* For 4K page size, we only support 4K sector size.
* For 64K page size, we support read-write for 64K sector size, and
* read-only for 4K sector size.
*/
if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
(PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
sectorsize != SZ_64K))) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
ret = -EINVAL;
}
if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
ret = -EINVAL;
}
if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
le32_to_cpu(sb->__unused_leafsize), nodesize);
ret = -EINVAL;
}
/* Root alignment check */
if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
btrfs_warn(fs_info, "tree_root block unaligned: %llu",
btrfs_super_root(sb));
ret = -EINVAL;
}
if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
btrfs_super_chunk_root(sb));
ret = -EINVAL;
}
if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
btrfs_warn(fs_info, "log_root block unaligned: %llu",
btrfs_super_log_root(sb));
ret = -EINVAL;
}
if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
BTRFS_FSID_SIZE)) {
btrfs_err(fs_info,
"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
ret = -EINVAL;
}
if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
memcmp(fs_info->fs_devices->metadata_uuid,
fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) { btrfs_err(fs_info,
"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
fs_info->super_copy->metadata_uuid,
fs_info->fs_devices->metadata_uuid);
ret = -EINVAL;
}
if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
"dev_item UUID does not match metadata fsid: %pU != %pU",
fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
ret = -EINVAL;
}
/*
* Hint to catch really bogus numbers, bitflips or so, more exact checks are
* done later
*/
if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
btrfs_err(fs_info, "bytes_used is too small %llu",
btrfs_super_bytes_used(sb));
ret = -EINVAL;
}
if (!is_power_of_2(btrfs_super_stripesize(sb))) {
btrfs_err(fs_info, "invalid stripesize %u",
btrfs_super_stripesize(sb));
ret = -EINVAL;
}
if (btrfs_super_num_devices(sb) > (1UL << 31))
btrfs_warn(fs_info, "suspicious number of devices: %llu",
btrfs_super_num_devices(sb));
if (btrfs_super_num_devices(sb) == 0) {
btrfs_err(fs_info, "number of devices is 0");
ret = -EINVAL;
}
if (mirror_num >= 0 &&
btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
btrfs_err(fs_info, "super offset mismatch %llu != %u",
btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
ret = -EINVAL;
}
/*
* Obvious sys_chunk_array corruptions, it must hold at least one key
* and one chunk
*/
if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
btrfs_err(fs_info, "system chunk array too big %u > %u",
btrfs_super_sys_array_size(sb),
BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
ret = -EINVAL;
}
if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+ sizeof(struct btrfs_chunk)) {
btrfs_err(fs_info, "system chunk array too small %u < %zu",
btrfs_super_sys_array_size(sb),
sizeof(struct btrfs_disk_key)
+ sizeof(struct btrfs_chunk));
ret = -EINVAL;
}
/*
* The generation is a global counter, we'll trust it more than the others
* but it's still possible that it's the one that's wrong.
*/
if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
btrfs_warn(fs_info,
"suspicious: generation < chunk_root_generation: %llu < %llu",
btrfs_super_generation(sb),
btrfs_super_chunk_root_generation(sb));
if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
&& btrfs_super_cache_generation(sb) != (u64)-1)
btrfs_warn(fs_info,
"suspicious: generation < cache_generation: %llu < %llu",
btrfs_super_generation(sb),
btrfs_super_cache_generation(sb));
return ret;
}
/*
* Validation of super block at mount time.
* Some checks already done early at mount time, like csum type and incompat
* flags will be skipped.
*/
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
{
return validate_super(fs_info, fs_info->super_copy, 0);
}
/*
* Validation of super block at write time.
* Some checks like bytenr check will be skipped as their values will be
* overwritten soon.
* Extra checks like csum type and incompat flags will be done here.
*/
static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
struct btrfs_super_block *sb)
{
int ret;
ret = validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
ret = -EUCLEAN;
btrfs_err(fs_info, "invalid csum type, has %u want %u",
btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
goto out;
}
if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"invalid incompat flags, has 0x%llx valid mask 0x%llx",
btrfs_super_incompat_flags(sb),
(unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
goto out;
}
out:
if (ret < 0)
btrfs_err(fs_info,
"super block corruption detected before writing it to disk");
return ret;
}
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
{
int backup_index = find_newest_super_backup(fs_info);
struct btrfs_super_block *sb = fs_info->super_copy;
struct btrfs_root *tree_root = fs_info->tree_root;
bool handle_error = false;
int ret = 0;
int i;
for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
u64 generation;
int level;
if (handle_error) {
if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); tree_root->node = NULL;
if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
break;
free_root_pointers(fs_info, 0);
/*
* Don't use the log in recovery mode, it won't be
* valid
*/
btrfs_set_super_log_root(sb, 0);
/* We can't trust the free space cache either */
btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
ret = read_backup_root(fs_info, i);
backup_index = ret;
if (ret < 0)
return ret;
}
generation = btrfs_super_generation(sb);
level = btrfs_super_root_level(sb);
tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
BTRFS_ROOT_TREE_OBJECTID,
generation, level, NULL);
if (IS_ERR(tree_root->node)) {
handle_error = true;
ret = PTR_ERR(tree_root->node);
tree_root->node = NULL;
btrfs_warn(fs_info, "couldn't read tree root");
continue;
} else if (!extent_buffer_uptodate(tree_root->node)) {
handle_error = true;
ret = -EIO;
btrfs_warn(fs_info, "error while reading tree root");
continue;
}
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
btrfs_set_root_refs(&tree_root->root_item, 1);
/*
* No need to hold btrfs_root::objectid_mutex since the fs
* hasn't been fully initialised and we are the only user
*/
ret = btrfs_init_root_free_objectid(tree_root);
if (ret < 0) {
handle_error = true;
continue;
}
ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
ret = btrfs_read_roots(fs_info);
if (ret < 0) {
handle_error = true;
continue;
}
/* All successful */
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
fs_info->last_reloc_trans = 0;
/* Always begin writing backup roots after the one being used */
if (backup_index < 0) {
fs_info->backup_root_index = 0;
} else {
fs_info->backup_root_index = backup_index + 1;
fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
}
break;
}
return ret;
}
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
INIT_LIST_HEAD(&fs_info->delalloc_roots);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
spin_lock_init(&fs_info->treelog_bg_lock);
spin_lock_init(&fs_info->relocation_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->reclaim_bgs_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->zoned_meta_io_lock);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
INIT_LIST_HEAD(&fs_info->reclaim_bgs);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&fs_info->allocated_roots);
INIT_LIST_HEAD(&fs_info->allocated_ebs);
spin_lock_init(&fs_info->eb_leak_lock);
#endif
extent_map_tree_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
BTRFS_BLOCK_RSV_DELOPS);
btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
BTRFS_BLOCK_RSV_DELREFS);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->defrag_running, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
atomic_set(&fs_info->nr_delayed_iputs, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
atomic64_set(&fs_info->free_chunk_space, 0);
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
/* readahead state */
INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
spin_lock_init(&fs_info->reada_lock);
btrfs_init_ref_verify(fs_info);
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
INIT_LIST_HEAD(&fs_info->ordered_roots);
spin_lock_init(&fs_info->ordered_root_lock);
btrfs_init_scrub(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
fs_info->check_integrity_print_mask = 0;
#endif
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
fs_info->first_logical_byte = (u64)-1;
extent_io_tree_init(fs_info, &fs_info->excluded_extents,
IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->ro_block_group_mutex);
init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
sema_init(&fs_info->uuid_tree_rescan_sem, 1);
btrfs_init_dev_replace_locks(fs_info);
btrfs_init_qgroup(fs_info);
btrfs_discard_init(fs_info);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
init_waitqueue_head(&fs_info->transaction_throttle);
init_waitqueue_head(&fs_info->transaction_wait);
init_waitqueue_head(&fs_info->transaction_blocked_wait);
init_waitqueue_head(&fs_info->async_submit_wait);
init_waitqueue_head(&fs_info->delayed_iputs_wait);
/* Usable values until the real ones are cached from the superblock */
fs_info->nodesize = 4096;
fs_info->sectorsize = 4096;
fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
spin_lock_init(&fs_info->swapfile_pins_lock);
fs_info->swapfile_pins = RB_ROOT;
fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
}
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
{
int ret;
fs_info->sb = sb;
sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
fs_info->dirty_metadata_batch = PAGE_SIZE *
(1 + ilog2(nr_cpu_ids));
ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
GFP_KERNEL);
if (ret)
return ret;
fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
GFP_KERNEL);
if (!fs_info->delayed_root)
return -ENOMEM;
btrfs_init_delayed_root(fs_info->delayed_root);
if (sb_rdonly(sb))
set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); return btrfs_alloc_stripe_hash_table(fs_info);
}
static int btrfs_uuid_rescan_kthread(void *data)
{
struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
int ret;
/*
* 1st step is to iterate through the existing UUID tree and
* to delete all entries that contain outdated data.
* 2nd step is to add all missing entries to the UUID tree.
*/
ret = btrfs_uuid_tree_iterate(fs_info);
if (ret < 0) {
if (ret != -EINTR)
btrfs_warn(fs_info, "iterating uuid_tree failed %d",
ret);
up(&fs_info->uuid_tree_rescan_sem);
return ret;
}
return btrfs_uuid_scan_kthread(data);
}
static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
{
struct task_struct *task;
down(&fs_info->uuid_tree_rescan_sem);
task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
if (IS_ERR(task)) {
/* fs_info->update_uuid_tree_gen remains 0 in all error case */
btrfs_warn(fs_info, "failed to start uuid_rescan task");
up(&fs_info->uuid_tree_rescan_sem);
return PTR_ERR(task);
}
return 0;
}
/*
* Some options only have meaning at mount time and shouldn't persist across
* remounts, or be displayed. Clear these at the end of mount and remount
* code paths.
*/
void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
{
btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
}
/*
* Mounting logic specific to read-write file systems. Shared by open_ctree
* and btrfs_remount when remounting from read-only to read-write.
*/
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
{
int ret;
const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
bool clear_free_space_tree = false;
if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
clear_free_space_tree = true;
} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
btrfs_warn(fs_info, "free space tree is invalid");
clear_free_space_tree = true;
}
if (clear_free_space_tree) {
btrfs_info(fs_info, "clearing free space tree");
ret = btrfs_clear_free_space_tree(fs_info);
if (ret) {
btrfs_warn(fs_info,
"failed to clear free space tree: %d", ret);
goto out;
}
}
/*
* btrfs_find_orphan_roots() is responsible for finding all the dead
* roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
* them into the fs_info->fs_roots_radix tree. This must be done before
* calling btrfs_orphan_cleanup() on the tree root. If we don't do it
* first, then btrfs_orphan_cleanup() will delete a dead root's orphan
* item before the root's tree is deleted - this means that if we unmount
* or crash before the deletion completes, on the next mount we will not
* delete what remains of the tree because the orphan item does not
* exists anymore, which is what tells us we have a pending deletion.
*/
ret = btrfs_find_orphan_roots(fs_info);
if (ret)
goto out;
ret = btrfs_cleanup_fs_roots(fs_info);
if (ret)
goto out;
down_read(&fs_info->cleanup_work_sem);
if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
(ret = btrfs_orphan_cleanup(fs_info->tree_root))) { up_read(&fs_info->cleanup_work_sem);
goto out;
}
up_read(&fs_info->cleanup_work_sem);
mutex_lock(&fs_info->cleaner_mutex);
ret = btrfs_recover_relocation(fs_info->tree_root);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
goto out;
}
if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "creating free space tree");
ret = btrfs_create_free_space_tree(fs_info);
if (ret) {
btrfs_warn(fs_info,
"failed to create free space tree: %d", ret);
goto out;
}
}
if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
if (ret)
goto out;
}
ret = btrfs_resume_balance_async(fs_info);
if (ret)
goto out;
ret = btrfs_resume_dev_replace_async(fs_info);
if (ret) {
btrfs_warn(fs_info, "failed to resume dev_replace");
goto out;
}
btrfs_qgroup_rescan_resume(fs_info);
if (!fs_info->uuid_root) {
btrfs_info(fs_info, "creating UUID tree");
ret = btrfs_create_uuid_tree(fs_info);
if (ret) {
btrfs_warn(fs_info,
"failed to create the UUID tree %d", ret);
goto out;
}
}
out:
return ret;
}
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
char *options)
{
u32 sectorsize;
u32 nodesize;
u32 stripesize;
u64 generation;
u64 features;
u16 csum_type;
struct btrfs_super_block *disk_super;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *tree_root;
struct btrfs_root *chunk_root;
int ret;
int err = -EINVAL;
int level;
ret = init_mount_fs_info(fs_info, sb);
if (ret) {
err = ret;
goto fail;
}
/* These need to be init'ed before we start creating inodes and such. */
tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
GFP_KERNEL);
fs_info->tree_root = tree_root;
chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
GFP_KERNEL);
fs_info->chunk_root = chunk_root;
if (!tree_root || !chunk_root) {
err = -ENOMEM;
goto fail;
}
fs_info->btree_inode = new_inode(sb);
if (!fs_info->btree_inode) {
err = -ENOMEM;
goto fail;
}
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
btrfs_init_btree_inode(fs_info);
invalidate_bdev(fs_devices->latest_dev->bdev);
/*
* Read super block and check the signature bytes only
*/
disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
if (IS_ERR(disk_super)) {
err = PTR_ERR(disk_super);
goto fail_alloc;
}
/*
* Verify the type first, if that or the checksum value are
* corrupted, we'll find out
*/
csum_type = btrfs_super_csum_type(disk_super);
if (!btrfs_supported_super_csum(csum_type)) {
btrfs_err(fs_info, "unsupported checksum algorithm: %u",
csum_type);
err = -EINVAL;
btrfs_release_disk_super(disk_super);
goto fail_alloc;
}
fs_info->csum_size = btrfs_super_csum_size(disk_super);
ret = btrfs_init_csum_hash(fs_info, csum_type);
if (ret) {
err = ret;
btrfs_release_disk_super(disk_super);
goto fail_alloc;
}
/*
* We want to check superblock checksum, the type is stored inside.
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/
if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch");
err = -EINVAL;
btrfs_release_disk_super(disk_super);
goto fail_alloc;
}
/*
* super_copy is zeroed at allocation time and we never touch the
* following bytes up to INFO_SIZE, the checksum is calculated from
* the whole block of INFO_SIZE
*/
memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
btrfs_release_disk_super(disk_super);
disk_super = fs_info->super_copy;
features = btrfs_super_flags(disk_super);
if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
btrfs_set_super_flags(disk_super, features);
btrfs_info(fs_info,
"found metadata UUID change in progress flag, clearing");
}
memcpy(fs_info->super_for_commit, fs_info->super_copy,
sizeof(*fs_info->super_for_commit));
ret = btrfs_validate_mount_super(fs_info);
if (ret) {
btrfs_err(fs_info, "superblock contains fatal errors");
err = -EINVAL;
goto fail_alloc;
}
if (!btrfs_super_root(disk_super))
goto fail_alloc;
/* check FS state, whether FS is broken. */
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
/*
* In the long term, we'll store the compression type in the super
* block, and it'll be used for per file compression control.
*/
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
/*
* Flag our filesystem as having big metadata blocks if they are bigger
* than the page size.
*/
if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) btrfs_info(fs_info,
"flagging fs with big metadata feature");
features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
}
/* Set up fs_info before parsing mount options */
nodesize = btrfs_super_nodesize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
stripesize = sectorsize;
fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
ret = btrfs_parse_options(fs_info, options, sb->s_flags);
if (ret) {
err = ret;
goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super) &
~BTRFS_FEATURE_INCOMPAT_SUPP;
if (features) {
btrfs_err(fs_info,
"cannot mount because of unsupported optional features (%llx)",
features);
err = -EINVAL;
goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super);
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents");
/*
* mixed block groups end up with duplicate but slightly offset
* extent buffers for the same range. It leads to corruptions
*/
if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
(sectorsize != nodesize)) {
btrfs_err(fs_info,
"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
nodesize, sectorsize);
goto fail_alloc;
}
/*
* Needn't use the lock because there is no other task which will
* update the flag.
*/
btrfs_set_super_incompat_flags(disk_super, features);
features = btrfs_super_compat_ro_flags(disk_super) &
~BTRFS_FEATURE_COMPAT_RO_SUPP;
if (!sb_rdonly(sb) && features) { btrfs_err(fs_info,
"cannot mount read-write because of unsupported optional features (%llx)",
features);
err = -EINVAL;
goto fail_alloc;
}
if (sectorsize != PAGE_SIZE) { btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
}
if (sectorsize != PAGE_SIZE) {
if (btrfs_super_incompat_flags(fs_info->super_copy) &
BTRFS_FEATURE_INCOMPAT_RAID56) {
btrfs_err(fs_info,
"RAID56 is not yet supported for sector size %u with page size %lu",
sectorsize, PAGE_SIZE);
err = -EINVAL;
goto fail_alloc;
}
}
ret = btrfs_init_workqueues(fs_info, fs_devices);
if (ret) {
err = ret;
goto fail_sb_buffer;
}
sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_sys_array(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
btrfs_err(fs_info, "failed to read the system array: %d", ret);
goto fail_sb_buffer;
}
generation = btrfs_super_chunk_root_generation(disk_super);
level = btrfs_super_chunk_root_level(disk_super);
chunk_root->node = read_tree_block(fs_info,
btrfs_super_chunk_root(disk_super),
BTRFS_CHUNK_TREE_OBJECTID,
generation, level, NULL);
if (IS_ERR(chunk_root->node) ||
!extent_buffer_uptodate(chunk_root->node)) {
btrfs_err(fs_info, "failed to read chunk root");
if (!IS_ERR(chunk_root->node))
free_extent_buffer(chunk_root->node); chunk_root->node = NULL;
goto fail_tree_roots;
}
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
chunk_root->commit_root = btrfs_root_node(chunk_root);
read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
offsetof(struct btrfs_header, chunk_tree_uuid),
BTRFS_UUID_SIZE);
ret = btrfs_read_chunk_tree(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
goto fail_tree_roots;
}
/*
* At this point we know all the devices that make this filesystem,
* including the seed devices but we don't know yet if the replace
* target is required. So free devices that are not part of this
* filesystem but skip the replace target device which is checked
* below in btrfs_init_dev_replace().
*/
btrfs_free_extra_devids(fs_devices);
if (!fs_devices->latest_dev->bdev) {
btrfs_err(fs_info, "failed to read devices");
goto fail_tree_roots;
}
ret = init_tree_roots(fs_info);
if (ret)
goto fail_tree_roots;
/*
* Get zone type information of zoned block devices. This will also
* handle emulation of a zoned filesystem if a regular device has the
* zoned incompat feature flag set.
*/
ret = btrfs_get_dev_zone_info_all_devices(fs_info);
if (ret) {
btrfs_err(fs_info,
"zoned: failed to read device zone info: %d",
ret);
goto fail_block_groups;
}
/*
* If we have a uuid root and we're not being told to rescan we need to
* check the generation here so we can set the
* BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the
* transaction during a balance or the log replay without updating the
* uuid generation, and then if we crash we would rescan the uuid tree,
* even though it was perfectly fine.
*/
if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) && fs_info->generation == btrfs_super_uuid_tree_generation(disk_super)) set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); ret = btrfs_verify_dev_extents(fs_info);
if (ret) {
btrfs_err(fs_info,
"failed to verify dev extents against chunks: %d",
ret);
goto fail_block_groups;
}
ret = btrfs_recover_balance(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to recover balance: %d", ret);
goto fail_block_groups;
}
ret = btrfs_init_dev_stats(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
goto fail_block_groups;
}
ret = btrfs_init_dev_replace(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
goto fail_block_groups;
}
ret = btrfs_check_zoned_mode(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to initialize zoned mode: %d",
ret);
goto fail_block_groups;
}
ret = btrfs_sysfs_add_fsid(fs_devices);
if (ret) {
btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
ret);
goto fail_block_groups;
}
ret = btrfs_sysfs_add_mounted(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
goto fail_fsdev_sysfs;
}
ret = btrfs_init_space_info(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to initialize space info: %d", ret);
goto fail_sysfs;
}
ret = btrfs_read_block_groups(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to read block groups: %d", ret);
goto fail_sysfs;
}
btrfs_free_zone_cache(fs_info);
if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info,
"writable mount is not allowed due to too many missing devices");
goto fail_sysfs;
}
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
"btrfs-cleaner");
if (IS_ERR(fs_info->cleaner_kthread))
goto fail_sysfs;
fs_info->transaction_kthread = kthread_run(transaction_kthread,
tree_root,
"btrfs-transaction");
if (IS_ERR(fs_info->transaction_kthread))
goto fail_cleaner;
if (!btrfs_test_opt(fs_info, NOSSD) && !fs_info->fs_devices->rotating) { btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
}
/*
* Mount does not set all options immediately, we can do it now and do
* not have to wait for transaction commit
*/
btrfs_apply_pending_changes(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
ret = btrfsic_mount(fs_info, fs_devices,
btrfs_test_opt(fs_info,
CHECK_INTEGRITY_DATA) ? 1 : 0,
fs_info->check_integrity_print_mask);
if (ret)
btrfs_warn(fs_info,
"failed to initialize integrity check module: %d",
ret);
}
#endif
ret = btrfs_read_qgroup_config(fs_info);
if (ret)
goto fail_trans_kthread;
if (btrfs_build_ref_tree(fs_info))
btrfs_err(fs_info, "couldn't build ref tree");
/* do not make disk changes in broken FS or nologreplay is given */
if (btrfs_super_log_root(disk_super) != 0 &&
!btrfs_test_opt(fs_info, NOLOGREPLAY)) { btrfs_info(fs_info, "start tree-log replay");
ret = btrfs_replay_log(fs_info, fs_devices);
if (ret) {
err = ret;
goto fail_qgroup;
}
}
fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
btrfs_warn(fs_info, "failed to read fs tree: %d", err);
fs_info->fs_root = NULL;
goto fail_qgroup;
}
if (sb_rdonly(sb))
goto clear_oneshot;
ret = btrfs_start_pre_rw_mount(fs_info);
if (ret) {
close_ctree(fs_info);
return ret;
}
btrfs_discard_resume(fs_info);
if (fs_info->uuid_root &&
(btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) { btrfs_info(fs_info, "checking UUID tree");
ret = btrfs_check_uuid_tree(fs_info);
if (ret) {
btrfs_warn(fs_info,
"failed to check the UUID tree: %d", ret);
close_ctree(fs_info);
return ret;
}
}
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
/* Kick the cleaner thread so it'll start deleting snapshots. */
if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
clear_oneshot:
btrfs_clear_oneshot_options(fs_info);
return 0;
fail_qgroup:
btrfs_free_qgroup_config(fs_info);
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
btrfs_cleanup_transaction(fs_info);
btrfs_free_fs_roots(fs_info);
fail_cleaner:
kthread_stop(fs_info->cleaner_kthread);
/*
* make sure we're done with the btree inode before we stop our
* kthreads
*/
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
fail_sysfs:
btrfs_sysfs_remove_mounted(fs_info);
fail_fsdev_sysfs:
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
fail_block_groups:
btrfs_put_block_group_cache(fs_info);
fail_tree_roots:
if (fs_info->data_reloc_root) btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root); free_root_pointers(fs_info, true);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
fail_alloc:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
iput(fs_info->btree_inode);
fail:
btrfs_close_devices(fs_info->fs_devices); return err;
}
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
static void btrfs_end_super_write(struct bio *bio)
{
struct btrfs_device *device = bio->bi_private;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
struct page *page;
bio_for_each_segment_all(bvec, bio, iter_all) {
page = bvec->bv_page;
if (bio->bi_status) {
btrfs_warn_rl_in_rcu(device->fs_info,
"lost page write due to IO error on %s (%d)",
rcu_str_deref(device->name),
blk_status_to_errno(bio->bi_status));
ClearPageUptodate(page);
SetPageError(page);
btrfs_dev_stat_inc_and_print(device,
BTRFS_DEV_STAT_WRITE_ERRS);
} else {
SetPageUptodate(page);
}
put_page(page);
unlock_page(page);
}
bio_put(bio);
}
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
int copy_num)
{
struct btrfs_super_block *super;
struct page *page;
u64 bytenr, bytenr_orig;
struct address_space *mapping = bdev->bd_inode->i_mapping;
int ret;
bytenr_orig = btrfs_sb_offset(copy_num);
ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
if (ret == -ENOENT)
return ERR_PTR(-EINVAL);
else if (ret)
return ERR_PTR(ret);
if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
return ERR_PTR(-EINVAL);
page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
if (IS_ERR(page))
return ERR_CAST(page);
super = page_address(page);
if (btrfs_super_magic(super) != BTRFS_MAGIC) {
btrfs_release_disk_super(super);
return ERR_PTR(-ENODATA);
}
if (btrfs_super_bytenr(super) != bytenr_orig) {
btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
return super;
}
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
{
struct btrfs_super_block *super, *latest = NULL;
int i;
u64 transid = 0;
/* we would like to check all the supers, but that would make
* a btrfs mount succeed after a mkfs from a different FS.
* So, we need to add a special mount option to scan for
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
super = btrfs_read_dev_one_super(bdev, i);
if (IS_ERR(super))
continue;
if (!latest || btrfs_super_generation(super) > transid) {
if (latest)
btrfs_release_disk_super(super);
latest = super;
transid = btrfs_super_generation(super);
}
}
return super;
}
/*
* Write superblock @sb to the @device. Do not wait for completion, all the
* pages we use for writing are locked.
*
* Write @max_mirrors copies of the superblock, where 0 means default that fit
* the expected device size at commit time. Note that max_mirrors must be
* same for write and wait phases.
*
* Return number of errors when page is not found or submission fails.
*/
static int write_dev_supers(struct btrfs_device *device,
struct btrfs_super_block *sb, int max_mirrors)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct address_space *mapping = device->bdev->bd_inode->i_mapping;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
int i;
int errors = 0;
int ret;
u64 bytenr, bytenr_orig;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
shash->tfm = fs_info->csum_shash; for (i = 0; i < max_mirrors; i++) {
struct page *page;
struct bio *bio;
struct btrfs_super_block *disk_super;
bytenr_orig = btrfs_sb_offset(i);
ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
if (ret == -ENOENT) {
continue;
} else if (ret < 0) {
btrfs_err(device->fs_info,
"couldn't get super block location for mirror %d",
i);
errors++;
continue;
}
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
btrfs_set_super_bytenr(sb, bytenr_orig);
crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
sb->csum);
page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
GFP_NOFS);
if (!page) {
btrfs_err(device->fs_info,
"couldn't get super block page for bytenr %llu",
bytenr);
errors++;
continue;
}
/* Bump the refcount for wait_dev_supers() */
get_page(page);
disk_super = page_address(page);
memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
/*
* Directly use bios here instead of relying on the page cache
* to do I/O, so we don't lose the ability to do integrity
* checking.
*/
bio = bio_alloc(GFP_NOFS, 1);
bio_set_dev(bio, device->bdev);
bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
bio->bi_private = device;
bio->bi_end_io = btrfs_end_super_write;
__bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
offset_in_page(bytenr));
/*
* We FUA only the first super block. The others we allow to
* go down lazy and there's a short window where the on-disk
* copies might still contain the older version.
*/
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; btrfsic_submit_bio(bio);
btrfs_advance_sb_log(device, i);
}
return errors < i ? 0 : -1;}
/*
* Wait for write completion of superblocks done by write_dev_supers,
* @max_mirrors same for write and wait phases.
*
* Return number of errors when page is not found or not marked up to
* date.
*/
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
{
int i;
int errors = 0;
bool primary_failed = false;
int ret;
u64 bytenr;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
for (i = 0; i < max_mirrors; i++) {
struct page *page;
ret = btrfs_sb_log_location(device, i, READ, &bytenr);
if (ret == -ENOENT) {
break;
} else if (ret < 0) {
errors++;
if (i == 0)
primary_failed = true;
continue;
}
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
page = find_get_page(device->bdev->bd_inode->i_mapping,
bytenr >> PAGE_SHIFT);
if (!page) {
errors++;
if (i == 0)
primary_failed = true;
continue;
}
/* Page is submitted locked and unlocked once the IO completes */
wait_on_page_locked(page);
if (PageError(page)) {
errors++;
if (i == 0)
primary_failed = true;
}
/* Drop our reference */
put_page(page);
/* Drop the reference from the writing run */
put_page(page);
}
/* log error, force error return */
if (primary_failed) {
btrfs_err(device->fs_info, "error writing primary super block to device %llu",
device->devid);
return -1;
}
return errors < i ? 0 : -1;
}
/*
* endio for the write_dev_flush, this will wake anyone waiting
* for the barrier when it is done
*/
static void btrfs_end_empty_barrier(struct bio *bio)
{
complete(bio->bi_private);
}
/*
* Submit a flush request to the device if it supports it. Error handling is
* done in the waiting counterpart.
*/
static void write_dev_flush(struct btrfs_device *device)
{
struct bio *bio = device->flush_bio;
#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
/*
* When a disk has write caching disabled, we skip submission of a bio
* with flush and sync requests before writing the superblock, since
* it's not needed. However when the integrity checker is enabled, this
* results in reports that there are metadata blocks referred by a
* superblock that were not properly flushed. So don't skip the bio
* submission only when the integrity checker is enabled for the sake
* of simplicity, since this is a debug tool and not meant for use in
* non-debug builds.
*/
struct request_queue *q = bdev_get_queue(device->bdev);
if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
return;
#endif
bio_reset(bio);
bio->bi_end_io = btrfs_end_empty_barrier;
bio_set_dev(bio, device->bdev);
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
btrfsic_submit_bio(bio);
set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}
/*
* If the flush bio has been submitted by write_dev_flush, wait for it.
*/
static blk_status_t wait_dev_flush(struct btrfs_device *device)
{
struct bio *bio = device->flush_bio;
if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
return BLK_STS_OK;
clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
wait_for_completion_io(&device->flush_wait);
return bio->bi_status;
}
static int check_barrier_error(struct btrfs_fs_info *fs_info)
{
if (!btrfs_check_rw_degradable(fs_info, NULL))
return -EIO;
return 0;
}
/*
* send an empty flush down to each device in parallel,
* then wait for them
*/
static int barrier_all_devices(struct btrfs_fs_info *info)
{
struct list_head *head;
struct btrfs_device *dev;
int errors_wait = 0;
blk_status_t ret;
lockdep_assert_held(&info->fs_devices->device_list_mutex);
/* send down all the barriers */
head = &info->fs_devices->devices;
list_for_each_entry(dev, head, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
continue;
if (!dev->bdev)
continue;
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
write_dev_flush(dev);
dev->last_flush_error = BLK_STS_OK;
}
/* wait for all the barriers */
list_for_each_entry(dev, head, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
continue;
if (!dev->bdev) { errors_wait++;
continue;
}
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
ret = wait_dev_flush(dev);
if (ret) {
dev->last_flush_error = ret;
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
errors_wait++;
}
}
if (errors_wait) {
/*
* At some point we need the status of all disks
* to arrive at the volume status. So error checking
* is being pushed to a separate loop.
*/
return check_barrier_error(info);
}
return 0;
}
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
{
int raid_type;
int min_tolerated = INT_MAX;
if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) min_tolerated = min_t(int, min_tolerated,
btrfs_raid_array[BTRFS_RAID_SINGLE].
tolerated_failures);
for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { if (raid_type == BTRFS_RAID_SINGLE)
continue;
if (!(flags & btrfs_raid_array[raid_type].bg_flag))
continue;
min_tolerated = min_t(int, min_tolerated,
btrfs_raid_array[raid_type].
tolerated_failures);
}
if (min_tolerated == INT_MAX) {
pr_warn("BTRFS: unknown raid flag: %llu", flags);
min_tolerated = 0;
}
return min_tolerated;
}
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
{
struct list_head *head;
struct btrfs_device *dev;
struct btrfs_super_block *sb;
struct btrfs_dev_item *dev_item;
int ret;
int do_barriers;
int max_errors;
int total_errors = 0;
u64 flags;
do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
/*
* max_mirrors == 0 indicates we're from commit_transaction,
* not from fsync where the tree roots in fs_info have not
* been consistent on disk.
*/
if (max_mirrors == 0)
backup_super_roots(fs_info);
sb = fs_info->super_for_commit;
dev_item = &sb->dev_item;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
head = &fs_info->fs_devices->devices;
max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
if (do_barriers) {
ret = barrier_all_devices(fs_info);
if (ret) {
mutex_unlock(
&fs_info->fs_devices->device_list_mutex);
btrfs_handle_fs_error(fs_info, ret,
"errors while submitting device barriers.");
return ret;
}
}
list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) {
total_errors++;
continue;
}
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
btrfs_set_stack_device_generation(dev_item, 0);
btrfs_set_stack_device_type(dev_item, dev->type);
btrfs_set_stack_device_id(dev_item, dev->devid);
btrfs_set_stack_device_total_bytes(dev_item,
dev->commit_total_bytes);
btrfs_set_stack_device_bytes_used(dev_item,
dev->commit_bytes_used);
btrfs_set_stack_device_io_align(dev_item, dev->io_align);
btrfs_set_stack_device_io_width(dev_item, dev->io_width);
btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
BTRFS_FSID_SIZE);
flags = btrfs_super_flags(sb);
btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
ret = btrfs_validate_write_super(fs_info, sb);
if (ret < 0) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_handle_fs_error(fs_info, -EUCLEAN,
"unexpected superblock corruption detected");
return -EUCLEAN;
}
ret = write_dev_supers(dev, sb, max_mirrors);
if (ret)
total_errors++;
}
if (total_errors > max_errors) {
btrfs_err(fs_info, "%d errors while writing supers",
total_errors);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* FUA is masked off if unsupported and can't be the reason */
btrfs_handle_fs_error(fs_info, -EIO,
"%d errors while writing supers",
total_errors);
return -EIO;
}
total_errors = 0;
list_for_each_entry(dev, head, dev_list) { if (!dev->bdev)
continue;
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
continue;
ret = wait_dev_supers(dev, max_mirrors);
if (ret)
total_errors++;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
btrfs_handle_fs_error(fs_info, -EIO,
"%d errors while writing supers",
total_errors);
return -EIO;
}
return 0;
}
/* Drop a fs root from the radix tree and free it. */
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root)
{
bool drop_ref = false;
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid);
if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
ASSERT(root->log_root == NULL);
if (root->reloc_root) { btrfs_put_root(root->reloc_root);
root->reloc_root = NULL;
}
}
if (drop_ref) btrfs_put_root(root);
}
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
u64 root_objectid = 0;
struct btrfs_root *gang[8];
int i = 0;
int err = 0;
unsigned int ret = 0;
while (1) {
spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang));
if (!ret) {
spin_unlock(&fs_info->fs_roots_radix_lock);
break;
}
root_objectid = gang[ret - 1]->root_key.objectid + 1; for (i = 0; i < ret; i++) {
/* Avoid to grab roots in dead_roots */
if (btrfs_root_refs(&gang[i]->root_item) == 0) { gang[i] = NULL;
continue;
}
/* grab all the search result for later use */
gang[i] = btrfs_grab_root(gang[i]);
}
spin_unlock(&fs_info->fs_roots_radix_lock);
for (i = 0; i < ret; i++) { if (!gang[i])
continue;
root_objectid = gang[i]->root_key.objectid;
err = btrfs_orphan_cleanup(gang[i]);
if (err)
break;
btrfs_put_root(gang[i]);
}
root_objectid++;
}
/* release the uncleaned roots due to error */
for (; i < ret; i++) {
if (gang[i])
btrfs_put_root(gang[i]);
}
return err;
}
int btrfs_commit_super(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
mutex_lock(&fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
wake_up_process(fs_info->cleaner_kthread);
/* wait until ongoing cleanup work done */
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
return btrfs_commit_transaction(trans);
}
void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
int ret;
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
/*
* We don't want the cleaner to start new transactions, add more delayed
* iputs, etc. while we're closing. We can't use kthread_stop() yet
* because that frees the task_struct, and the transaction kthread might
* still try to wake up the cleaner.
*/
kthread_park(fs_info->cleaner_kthread);
/*
* If we had UNFINISHED_DROPS we could still be processing them, so
* clear that bit and wake up relocation so it can stop.
*/
btrfs_wake_unfinished_drop(fs_info);
/* wait for the qgroup rescan worker to stop */
btrfs_qgroup_wait_for_completion(fs_info, false);
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
/* avoid complains from lockdep et al., set sem back to initial state */
up(&fs_info->uuid_tree_rescan_sem);
/* pause restriper - we want to resume on mount */
btrfs_pause_balance(fs_info);
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
/* wait for any defraggers to finish */
wait_event(fs_info->transaction_wait,
(atomic_read(&fs_info->defrag_running) == 0));
/* clear out the rbtree of defraggable inodes */
btrfs_cleanup_defrag_inodes(fs_info);
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
cancel_work_sync(&fs_info->reclaim_bgs_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
if (!sb_rdonly(fs_info->sb)) {
/*
* The cleaner kthread is stopped, so do one final pass over
* unused block groups.
*/
btrfs_delete_unused_bgs(fs_info);
/*
* There might be existing delayed inode workers still running
* and holding an empty delayed inode item. We must wait for
* them to complete first because they can create a transaction.
* This happens when someone calls btrfs_balance_delayed_items()
* and then a transaction commit runs the same delayed nodes
* before any delayed worker has done something with the nodes.
* We must wait for any worker here and not at transaction
* commit time since that could cause a deadlock.
* This is a very rare case.
*/
btrfs_flush_workqueue(fs_info->delayed_workers);
ret = btrfs_commit_super(fs_info);
if (ret)
btrfs_err(fs_info, "commit super ret %d", ret);
}
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
btrfs_error_commit_super(fs_info);
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
if (btrfs_check_quota_leak(fs_info)) {
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
btrfs_err(fs_info, "qgroup reserved space leaked");
}
btrfs_free_qgroup_config(fs_info);
ASSERT(list_empty(&fs_info->delalloc_roots));
if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
btrfs_info(fs_info, "at unmount delalloc count %lld",
percpu_counter_sum(&fs_info->delalloc_bytes));
}
if (percpu_counter_sum(&fs_info->ordered_bytes))
btrfs_info(fs_info, "at unmount dio bytes count %lld",
percpu_counter_sum(&fs_info->ordered_bytes));
btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
btrfs_put_block_group_cache(fs_info);
/*
* we must make sure there is not any read request to
* submit after we stopping all workers.
*/
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
/* We shouldn't have any transaction open at this point */
ASSERT(list_empty(&fs_info->trans_list));
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
free_root_pointers(fs_info, true);
btrfs_free_fs_roots(fs_info);
/*
* We must free the block groups after dropping the fs_roots as we could
* have had an IO error and have left over tree log blocks that aren't
* cleaned up until the fs roots are freed. This makes the block group
* accounting appear to be wrong because there's pending reserved bytes,
* so make sure we do the block group cleanup afterwards.
*/
btrfs_free_block_groups(fs_info);
iput(fs_info->btree_inode);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
btrfsic_unmount(fs_info->fs_devices);
#endif
btrfs_mapping_tree_free(&fs_info->mapping_tree);
btrfs_close_devices(fs_info->fs_devices);
}
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic)
{
int ret;
struct inode *btree_inode = buf->pages[0]->mapping->host;
ret = extent_buffer_uptodate(buf);
if (!ret)
return ret;
ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
parent_transid, atomic);
if (ret == -EAGAIN)
return ret;
return !ret;
}
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
u64 transid = btrfs_header_generation(buf);
int was_dirty;
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
* This is a fast path so only do this check if we have sanity tests
* enabled. Normal people shouldn't be using unmapped buffers as dirty
* outside of the sanity tests.
*/
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
btrfs_assert_tree_locked(buf);
if (transid != fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
buf->start, transid, fs_info->generation);
was_dirty = set_extent_buffer_dirty(buf);
if (!was_dirty)
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
buf->len,
fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
/*
* Since btrfs_mark_buffer_dirty() can be called with item pointer set
* but item data not updated.
* So here we should only check item pointers, not item data.
*/
if (btrfs_header_level(buf) == 0 &&
btrfs_check_leaf_relaxed(buf)) {
btrfs_print_leaf(buf);
ASSERT(0);
}
#endif
}
static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
int flush_delayed)
{
/*
* looks as though older kernels can get into trouble with
* this code, they end up stuck in balance_dirty_pages forever
*/
int ret;
if (current->flags & PF_MEMALLOC)
return;
if (flush_delayed) btrfs_balance_delayed_items(fs_info); ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
BTRFS_DIRTY_METADATA_THRESH,
fs_info->dirty_metadata_batch);
if (ret > 0) {
balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
}
}
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
{
__btrfs_btree_balance_dirty(fs_info, 1);
}
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
{
__btrfs_btree_balance_dirty(fs_info, 0);
}
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
struct btrfs_key *first_key)
{
return btree_read_extent_buffer_pages(buf, parent_transid,
level, first_key);
}
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
/* cleanup FS via transaction */
btrfs_cleanup_transaction(fs_info);
mutex_lock(&fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
}
static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *gang[8];
u64 root_objectid = 0;
int ret;
spin_lock(&fs_info->fs_roots_radix_lock);
while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang))) != 0) {
int i;
for (i = 0; i < ret; i++) gang[i] = btrfs_grab_root(gang[i]);
spin_unlock(&fs_info->fs_roots_radix_lock);
for (i = 0; i < ret; i++) { if (!gang[i])
continue;
root_objectid = gang[i]->root_key.objectid;
btrfs_free_log(NULL, gang[i]);
btrfs_put_root(gang[i]);
}
root_objectid++;
spin_lock(&fs_info->fs_roots_radix_lock);
}
spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log_root_tree(NULL, fs_info);
}
static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct btrfs_ordered_extent *ordered;
spin_lock(&root->ordered_extent_lock);
/*
* This will just short circuit the ordered completion stuff which will
* make sure the ordered extent gets properly cleaned up.
*/
list_for_each_entry(ordered, &root->ordered_extents,
root_extent_list)
set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
spin_unlock(&root->ordered_extent_lock);
}
static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
struct list_head splice;
INIT_LIST_HEAD(&splice);
spin_lock(&fs_info->ordered_root_lock);
list_splice_init(&fs_info->ordered_roots, &splice);
while (!list_empty(&splice)) {
root = list_first_entry(&splice, struct btrfs_root,
ordered_root);
list_move_tail(&root->ordered_root,
&fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
btrfs_destroy_ordered_extents(root);
cond_resched();
spin_lock(&fs_info->ordered_root_lock);
}
spin_unlock(&fs_info->ordered_root_lock);
/*
* We need this here because if we've been flipped read-only we won't
* get sync() from the umount, so we need to make sure any ordered
* extents that haven't had their dirty pages IO start writeout yet
* actually get run and error out properly.
*/
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
int ret = 0;
delayed_refs = &trans->delayed_refs;
spin_lock(&delayed_refs->lock);
if (atomic_read(&delayed_refs->num_entries) == 0) {
spin_unlock(&delayed_refs->lock);
btrfs_debug(fs_info, "delayed_refs has NO entry");
return ret;
}
while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
struct rb_node *n;
bool pin_bytes = false;
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
if (btrfs_delayed_ref_lock(delayed_refs, head))
continue;
spin_lock(&head->lock);
while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
ref = rb_entry(n, struct btrfs_delayed_ref_node,
ref_node);
ref->in_tree = 0;
rb_erase_cached(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
}
if (head->must_insert_reserved)
pin_bytes = true;
btrfs_free_delayed_extent_op(head->extent_op);
btrfs_delete_ref_head(delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
if (pin_bytes) {
struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, head->bytenr);
BUG_ON(!cache);
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned += head->num_bytes;
btrfs_space_info_update_bytes_pinned(fs_info,
cache->space_info, head->num_bytes);
cache->reserved -= head->num_bytes;
cache->space_info->bytes_reserved -= head->num_bytes;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
btrfs_put_block_group(cache);
btrfs_error_unpin_extent_range(fs_info, head->bytenr,
head->bytenr + head->num_bytes - 1);
}
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
spin_lock(&delayed_refs->lock);
}
btrfs_qgroup_destroy_extent_records(trans);
spin_unlock(&delayed_refs->lock);
return ret;
}
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;
INIT_LIST_HEAD(&splice);
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
struct inode *inode = NULL;
btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
delalloc_inodes);
__btrfs_del_delalloc_inode(root, btrfs_inode);
spin_unlock(&root->delalloc_lock);
/*
* Make sure we get a live inode and that it'll not disappear
* meanwhile.
*/
inode = igrab(&btrfs_inode->vfs_inode);
if (inode) {
invalidate_inode_pages2(inode->i_mapping);
iput(inode);
}
spin_lock(&root->delalloc_lock);
}
spin_unlock(&root->delalloc_lock);
}
static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
struct list_head splice;
INIT_LIST_HEAD(&splice);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
while (!list_empty(&splice)) {
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_root(root);
BUG_ON(!root);
spin_unlock(&fs_info->delalloc_root_lock);
btrfs_destroy_delalloc_inodes(root);
btrfs_put_root(root);
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
}
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages,
int mark)
{
int ret;
struct extent_buffer *eb;
u64 start = 0;
u64 end;
while (1) {
ret = find_first_extent_bit(dirty_pages, start, &start, &end,
mark, NULL);
if (ret)
break;
clear_extent_bits(dirty_pages, start, end, mark);
while (start <= end) {
eb = find_extent_buffer(fs_info, start);
start += fs_info->nodesize;
if (!eb)
continue;
wait_on_extent_buffer_writeback(eb);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
&eb->bflags))
clear_extent_buffer_dirty(eb);
free_extent_buffer_stale(eb);
}
}
return ret;
}
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
struct extent_io_tree *unpin)
{
u64 start;
u64 end;
int ret;
while (1) {
struct extent_state *cached_state = NULL;
/*
* The btrfs_finish_extent_commit() may get the same range as
* ours between find_first_extent_bit and clear_extent_dirty.
* Hence, hold the unused_bg_unpin_mutex to avoid double unpin
* the same extent range.
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, &cached_state);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
clear_extent_dirty(unpin, start, end, &cached_state);
free_extent_state(cached_state);
btrfs_error_unpin_extent_range(fs_info, start, end);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
}
return 0;
}
static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
{
struct inode *inode;
inode = cache->io_ctl.inode;
if (inode) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
cache->io_ctl.inode = NULL;
iput(inode);
}
ASSERT(cache->io_ctl.pages == NULL);
btrfs_put_block_group(cache);
}
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
struct btrfs_fs_info *fs_info)
{
struct btrfs_block_group *cache;
spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group,
dirty_list);
if (!list_empty(&cache->io_list)) {
spin_unlock(&cur_trans->dirty_bgs_lock);
list_del_init(&cache->io_list);
btrfs_cleanup_bg_io(cache);
spin_lock(&cur_trans->dirty_bgs_lock);
}
list_del_init(&cache->dirty_list);
spin_lock(&cache->lock);
cache->disk_cache_state = BTRFS_DC_ERROR;
spin_unlock(&cache->lock);
spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_put_block_group(cache);
btrfs_delayed_refs_rsv_release(fs_info, 1);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
/*
* Refer to the definition of io_bgs member for details why it's safe
* to use it without any locking
*/
while (!list_empty(&cur_trans->io_bgs)) {
cache = list_first_entry(&cur_trans->io_bgs,
struct btrfs_block_group,
io_list);
list_del_init(&cache->io_list);
spin_lock(&cache->lock);
cache->disk_cache_state = BTRFS_DC_ERROR;
spin_unlock(&cache->lock);
btrfs_cleanup_bg_io(cache);
}
}
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_fs_info *fs_info)
{
struct btrfs_device *dev, *tmp;
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
ASSERT(list_empty(&cur_trans->dirty_bgs));
ASSERT(list_empty(&cur_trans->io_bgs));
list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
post_commit_list) {
list_del_init(&dev->post_commit_list);
}
btrfs_destroy_delayed_refs(cur_trans, fs_info);
cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&fs_info->transaction_blocked_wait);
cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&fs_info->transaction_wait);
btrfs_destroy_delayed_inodes(fs_info);
btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
btrfs_free_redirty_list(cur_trans);
cur_trans->state =TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
}
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
{
struct btrfs_transaction *t;
mutex_lock(&fs_info->transaction_kthread_mutex);
spin_lock(&fs_info->trans_lock);
while (!list_empty(&fs_info->trans_list)) {
t = list_first_entry(&fs_info->trans_list,
struct btrfs_transaction, list);
if (t->state >= TRANS_STATE_COMMIT_START) {
refcount_inc(&t->use_count);
spin_unlock(&fs_info->trans_lock);
btrfs_wait_for_commit(fs_info, t->transid);
btrfs_put_transaction(t);
spin_lock(&fs_info->trans_lock);
continue;
}
if (t == fs_info->running_transaction) { t->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
/*
* We wait for 0 num_writers since we don't hold a trans
* handle open currently for this transaction.
*/
wait_event(t->writer_wait,
atomic_read(&t->num_writers) == 0);
} else {
spin_unlock(&fs_info->trans_lock);
}
btrfs_cleanup_one_transaction(t, fs_info);
spin_lock(&fs_info->trans_lock);
if (t == fs_info->running_transaction)
fs_info->running_transaction = NULL; list_del_init(&t->list);
spin_unlock(&fs_info->trans_lock);
btrfs_put_transaction(t);
trace_btrfs_transaction_commit(fs_info->tree_root);
spin_lock(&fs_info->trans_lock);
}
spin_unlock(&fs_info->trans_lock);
btrfs_destroy_all_ordered_extents(fs_info);
btrfs_destroy_delayed_inodes(fs_info);
btrfs_assert_delayed_root_empty(fs_info);
btrfs_destroy_all_delalloc_inodes(fs_info);
btrfs_drop_all_logs(fs_info);
mutex_unlock(&fs_info->transaction_kthread_mutex);
return 0;
}
int btrfs_init_root_free_objectid(struct btrfs_root *root)
{
struct btrfs_path *path;
int ret;
struct extent_buffer *l;
struct btrfs_key search_key;
struct btrfs_key found_key;
int slot;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
search_key.type = -1;
search_key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto error;
BUG_ON(ret == 0); /* Corruption */ if (path->slots[0] > 0) { slot = path->slots[0] - 1;
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
root->free_objectid = max_t(u64, found_key.objectid + 1,
BTRFS_FIRST_FREE_OBJECTID);
} else {
root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
}
ret = 0;
error:
btrfs_free_path(path); return ret;
}
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
{
int ret;
mutex_lock(&root->objectid_mutex);
if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
btrfs_warn(root->fs_info,
"the objectid of root %llu reaches its highest value",
root->root_key.objectid);
ret = -ENOSPC;
goto out;
}
*objectid = root->free_objectid++;
ret = 0;
out:
mutex_unlock(&root->objectid_mutex);
return ret;
}
// SPDX-License-Identifier: GPL-2.0-only
/*
* lib/parser.c - simple parser for mount, etc. options.
*/
#include <linux/ctype.h>
#include <linux/types.h>
#include <linux/export.h>
#include <linux/kstrtox.h>
#include <linux/parser.h>
#include <linux/slab.h>
#include <linux/string.h>
/**
* match_one - Determines if a string matches a simple pattern
* @s: the string to examine for presence of the pattern
* @p: the string containing the pattern
* @args: array of %MAX_OPT_ARGS &substring_t elements. Used to return match
* locations.
*
* Description: Determines if the pattern @p is present in string @s. Can only
* match extremely simple token=arg style patterns. If the pattern is found,
* the location(s) of the arguments will be returned in the @args array.
*/
static int match_one(char *s, const char *p, substring_t args[])
{
char *meta;
int argc = 0;
if (!p)
return 1;
while(1) {
int len = -1;
meta = strchr(p, '%');
if (!meta)
return strcmp(p, s) == 0; if (strncmp(p, s, meta-p))
return 0;
s += meta - p;
p = meta + 1;
if (isdigit(*p))
len = simple_strtoul(p, (char **) &p, 10); else if (*p == '%') { if (*s++ != '%')
return 0;
p++;
continue;
}
if (argc >= MAX_OPT_ARGS)
return 0;
args[argc].from = s;
switch (*p++) {
case 's': {
size_t str_len = strlen(s);
if (str_len == 0)
return 0;
if (len == -1 || len > str_len) len = str_len; args[argc].to = s + len;
break;
}
case 'd':
simple_strtol(s, &args[argc].to, 0);
goto num;
case 'u':
simple_strtoul(s, &args[argc].to, 0);
goto num;
case 'o':
simple_strtoul(s, &args[argc].to, 8);
goto num;
case 'x':
simple_strtoul(s, &args[argc].to, 16);
num:
if (args[argc].to == args[argc].from)
return 0;
break;
default:
return 0;
}
s = args[argc].to;
argc++;
}
}
/**
* match_token - Find a token (and optional args) in a string
* @s: the string to examine for token/argument pairs
* @table: match_table_t describing the set of allowed option tokens and the
* arguments that may be associated with them. Must be terminated with a
* &struct match_token whose pattern is set to the NULL pointer.
* @args: array of %MAX_OPT_ARGS &substring_t elements. Used to return match
* locations.
*
* Description: Detects which if any of a set of token strings has been passed
* to it. Tokens can include up to %MAX_OPT_ARGS instances of basic c-style
* format identifiers which will be taken into account when matching the
* tokens, and whose locations will be returned in the @args array.
*/
int match_token(char *s, const match_table_t table, substring_t args[])
{
const struct match_token *p;
for (p = table; !match_one(s, p->pattern, args) ; p++)
;
return p->token;
}
EXPORT_SYMBOL(match_token);
/**
* match_number - scan a number in the given base from a substring_t
* @s: substring to be scanned
* @result: resulting integer on success
* @base: base to use when converting string
*
* Description: Given a &substring_t and a base, attempts to parse the substring
* as a number in that base.
*
* Return: On success, sets @result to the integer represented by the
* string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
*/
static int match_number(substring_t *s, int *result, int base)
{
char *endp;
char *buf;
int ret;
long val;
buf = match_strdup(s);
if (!buf)
return -ENOMEM;
ret = 0;
val = simple_strtol(buf, &endp, base);
if (endp == buf)
ret = -EINVAL;
else if (val < (long)INT_MIN || val > (long)INT_MAX)
ret = -ERANGE;
else
*result = (int) val; kfree(buf); return ret;
}
/**
* match_u64int - scan a number in the given base from a substring_t
* @s: substring to be scanned
* @result: resulting u64 on success
* @base: base to use when converting string
*
* Description: Given a &substring_t and a base, attempts to parse the substring
* as a number in that base.
*
* Return: On success, sets @result to the integer represented by the
* string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
*/
static int match_u64int(substring_t *s, u64 *result, int base)
{
char *buf;
int ret;
u64 val;
buf = match_strdup(s);
if (!buf)
return -ENOMEM;
ret = kstrtoull(buf, base, &val);
if (!ret)
*result = val; kfree(buf); return ret;
}
/**
* match_int - scan a decimal representation of an integer from a substring_t
* @s: substring_t to be scanned
* @result: resulting integer on success
*
* Description: Attempts to parse the &substring_t @s as a decimal integer.
*
* Return: On success, sets @result to the integer represented by the string
* and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
*/
int match_int(substring_t *s, int *result)
{
return match_number(s, result, 0);
}
EXPORT_SYMBOL(match_int);
/**
* match_uint - scan a decimal representation of an integer from a substring_t
* @s: substring_t to be scanned
* @result: resulting integer on success
*
* Description: Attempts to parse the &substring_t @s as a decimal integer.
*
* Return: On success, sets @result to the integer represented by the string
* and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
*/
int match_uint(substring_t *s, unsigned int *result)
{
int err = -ENOMEM;
char *buf = match_strdup(s);
if (buf) {
err = kstrtouint(buf, 10, result);
kfree(buf);
}
return err;
}
EXPORT_SYMBOL(match_uint);
/**
* match_u64 - scan a decimal representation of a u64 from
* a substring_t
* @s: substring_t to be scanned
* @result: resulting unsigned long long on success
*
* Description: Attempts to parse the &substring_t @s as a long decimal
* integer.
*
* Return: On success, sets @result to the integer represented by the string
* and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
*/
int match_u64(substring_t *s, u64 *result)
{
return match_u64int(s, result, 0);
}
EXPORT_SYMBOL(match_u64);
/**
* match_octal - scan an octal representation of an integer from a substring_t
* @s: substring_t to be scanned
* @result: resulting integer on success
*
* Description: Attempts to parse the &substring_t @s as an octal integer.
*
* Return: On success, sets @result to the integer represented by the string
* and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
*/
int match_octal(substring_t *s, int *result)
{
return match_number(s, result, 8);
}
EXPORT_SYMBOL(match_octal);
/**
* match_hex - scan a hex representation of an integer from a substring_t
* @s: substring_t to be scanned
* @result: resulting integer on success
*
* Description: Attempts to parse the &substring_t @s as a hexadecimal integer.
*
* Return: On success, sets @result to the integer represented by the string
* and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
*/
int match_hex(substring_t *s, int *result)
{
return match_number(s, result, 16);
}
EXPORT_SYMBOL(match_hex);
/**
* match_wildcard - parse if a string matches given wildcard pattern
* @pattern: wildcard pattern
* @str: the string to be parsed
*
* Description: Parse the string @str to check if matches wildcard
* pattern @pattern. The pattern may contain two types of wildcards:
* '*' - matches zero or more characters
* '?' - matches one character
*
* Return: If the @str matches the @pattern, return true, else return false.
*/
bool match_wildcard(const char *pattern, const char *str)
{
const char *s = str;
const char *p = pattern;
bool star = false;
while (*s) {
switch (*p) {
case '?':
s++;
p++;
break;
case '*':
star = true;
str = s;
if (!*++p)
return true;
pattern = p;
break;
default:
if (*s == *p) {
s++;
p++;
} else {
if (!star)
return false;
str++;
s = str;
p = pattern;
}
break;
}
}
if (*p == '*')
++p;
return !*p;
}
EXPORT_SYMBOL(match_wildcard);
/**
* match_strlcpy - Copy the characters from a substring_t to a sized buffer
* @dest: where to copy to
* @src: &substring_t to copy
* @size: size of destination buffer
*
* Description: Copy the characters in &substring_t @src to the
* c-style string @dest. Copy no more than @size - 1 characters, plus
* the terminating NUL.
*
* Return: length of @src.
*/
size_t match_strlcpy(char *dest, const substring_t *src, size_t size)
{
size_t ret = src->to - src->from;
if (size) {
size_t len = ret >= size ? size - 1 : ret;
memcpy(dest, src->from, len);
dest[len] = '\0';
}
return ret;
}
EXPORT_SYMBOL(match_strlcpy);
/**
* match_strdup - allocate a new string with the contents of a substring_t
* @s: &substring_t to copy
*
* Description: Allocates and returns a string filled with the contents of
* the &substring_t @s. The caller is responsible for freeing the returned
* string with kfree().
*
* Return: the address of the newly allocated NUL-terminated string or
* %NULL on error.
*/
char *match_strdup(const substring_t *s)
{
return kmemdup_nul(s->from, s->to - s->from, GFP_KERNEL);
}
EXPORT_SYMBOL(match_strdup);
// SPDX-License-Identifier: GPL-2.0+
/*
* Driver core for serial ports
*
* Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
*
* Copyright 1999 ARM Limited
* Copyright (C) 2000-2001 Deep Blue Solutions Ltd.
*/
#include <linux/module.h>
#include <linux/tty.h>
#include <linux/tty_flip.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/gpio/consumer.h>
#include <linux/of.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/serial.h> /* for serial_state and serial_icounter_struct */
#include <linux/serial_core.h>
#include <linux/sysrq.h>
#include <linux/delay.h>
#include <linux/mutex.h>
#include <linux/security.h>
#include <linux/irq.h>
#include <linux/uaccess.h>
/*
* This is used to lock changes in serial line configuration.
*/
static DEFINE_MUTEX(port_mutex);
/*
* lockdep: port->lock is initialized in two places, but we
* want only one lock-class:
*/
static struct lock_class_key port_lock_key;
#define HIGH_BITS_OFFSET ((sizeof(long)-sizeof(int))*8)
static void uart_change_speed(struct tty_struct *tty, struct uart_state *state,
struct ktermios *old_termios);
static void uart_wait_until_sent(struct tty_struct *tty, int timeout);
static void uart_change_pm(struct uart_state *state,
enum uart_pm_state pm_state);
static void uart_port_shutdown(struct tty_port *port);
static int uart_dcd_enabled(struct uart_port *uport)
{
return !!(uport->status & UPSTAT_DCD_ENABLE);
}
static inline struct uart_port *uart_port_ref(struct uart_state *state)
{
if (atomic_add_unless(&state->refcount, 1, 0))
return state->uart_port;
return NULL;
}
static inline void uart_port_deref(struct uart_port *uport)
{
if (atomic_dec_and_test(&uport->state->refcount))
wake_up(&uport->state->remove_wait);
}
#define uart_port_lock(state, flags) \
({ \
struct uart_port *__uport = uart_port_ref(state); \
if (__uport) \
spin_lock_irqsave(&__uport->lock, flags); \
__uport; \
})
#define uart_port_unlock(uport, flags) \
({ \
struct uart_port *__uport = uport; \
if (__uport) { \
spin_unlock_irqrestore(&__uport->lock, flags); \
uart_port_deref(__uport); \
} \
})
static inline struct uart_port *uart_port_check(struct uart_state *state)
{
lockdep_assert_held(&state->port.mutex);
return state->uart_port;
}
/*
* This routine is used by the interrupt handler to schedule processing in
* the software interrupt portion of the driver.
*/
void uart_write_wakeup(struct uart_port *port)
{
struct uart_state *state = port->state;
/*
* This means you called this function _after_ the port was
* closed. No cookie for you.
*/
BUG_ON(!state);
tty_port_tty_wakeup(&state->port);
}
static void uart_stop(struct tty_struct *tty)
{
struct uart_state *state = tty->driver_data;
struct uart_port *port;
unsigned long flags;
port = uart_port_lock(state, flags);
if (port)
port->ops->stop_tx(port);
uart_port_unlock(port, flags);
}
static void __uart_start(struct tty_struct *tty)
{
struct uart_state *state = tty->driver_data;
struct uart_port *port = state->uart_port;
if (port && !uart_tx_stopped(port))
port->ops->start_tx(port);
}
static void uart_start(struct tty_struct *tty)
{
struct uart_state *state = tty->driver_data;
struct uart_port *port;
unsigned long flags;
port = uart_port_lock(state, flags);
__uart_start(tty);
uart_port_unlock(port, flags);
}
static void
uart_update_mctrl(struct uart_port *port, unsigned int set, unsigned int clear)
{
unsigned long flags;
unsigned int old;
spin_lock_irqsave(&port->lock, flags);
old = port->mctrl;
port->mctrl = (old & ~clear) | set;
if (old != port->mctrl)
port->ops->set_mctrl(port, port->mctrl);
spin_unlock_irqrestore(&port->lock, flags);
}
#define uart_set_mctrl(port, set) uart_update_mctrl(port, set, 0)
#define uart_clear_mctrl(port, clear) uart_update_mctrl(port, 0, clear)
static void uart_port_dtr_rts(struct uart_port *uport, int raise)
{
int rs485_on = uport->rs485_config &&
(uport->rs485.flags & SER_RS485_ENABLED);
int RTS_after_send = !!(uport->rs485.flags & SER_RS485_RTS_AFTER_SEND);
if (raise) {
if (rs485_on && RTS_after_send) {
uart_set_mctrl(uport, TIOCM_DTR);
uart_clear_mctrl(uport, TIOCM_RTS);
} else {
uart_set_mctrl(uport, TIOCM_DTR | TIOCM_RTS);
}
} else {
unsigned int clear = TIOCM_DTR;
clear |= (!rs485_on || RTS_after_send) ? TIOCM_RTS : 0;
uart_clear_mctrl(uport, clear);
}
}
/*
* Startup the port. This will be called once per open. All calls
* will be serialised by the per-port mutex.
*/
static int uart_port_startup(struct tty_struct *tty, struct uart_state *state,
int init_hw)
{
struct uart_port *uport = uart_port_check(state);
unsigned long flags;
unsigned long page;
int retval = 0;
if (uport->type == PORT_UNKNOWN)
return 1;
/*
* Make sure the device is in D0 state.
*/
uart_change_pm(state, UART_PM_STATE_ON);
/*
* Initialise and allocate the transmit and temporary
* buffer.
*/
page = get_zeroed_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
uart_port_lock(state, flags);
if (!state->xmit.buf) {
state->xmit.buf = (unsigned char *) page;
uart_circ_clear(&state->xmit);
uart_port_unlock(uport, flags);
} else {
uart_port_unlock(uport, flags);
/*
* Do not free() the page under the port lock, see
* uart_shutdown().
*/
free_page(page);
}
retval = uport->ops->startup(uport);
if (retval == 0) {
if (uart_console(uport) && uport->cons->cflag) {
tty->termios.c_cflag = uport->cons->cflag;
tty->termios.c_ispeed = uport->cons->ispeed;
tty->termios.c_ospeed = uport->cons->ospeed;
uport->cons->cflag = 0;
uport->cons->ispeed = 0;
uport->cons->ospeed = 0;
}
/*
* Initialise the hardware port settings.
*/
uart_change_speed(tty, state, NULL);
/*
* Setup the RTS and DTR signals once the
* port is open and ready to respond.
*/
if (init_hw && C_BAUD(tty))
uart_port_dtr_rts(uport, 1);
}
/*
* This is to allow setserial on this port. People may want to set
* port/irq/type and then reconfigure the port properly if it failed
* now.
*/
if (retval && capable(CAP_SYS_ADMIN))
return 1;
return retval;
}
static int uart_startup(struct tty_struct *tty, struct uart_state *state,
int init_hw)
{
struct tty_port *port = &state->port;
int retval;
if (tty_port_initialized(port))
return 0;
retval = uart_port_startup(tty, state, init_hw);
if (retval)
set_bit(TTY_IO_ERROR, &tty->flags);
return retval;
}
/*
* This routine will shutdown a serial port; interrupts are disabled, and
* DTR is dropped if the hangup on close termio flag is on. Calls to
* uart_shutdown are serialised by the per-port semaphore.
*
* uport == NULL if uart_port has already been removed
*/
static void uart_shutdown(struct tty_struct *tty, struct uart_state *state)
{
struct uart_port *uport = uart_port_check(state);
struct tty_port *port = &state->port;
unsigned long flags;
char *xmit_buf = NULL;
/*
* Set the TTY IO error marker
*/
if (tty)
set_bit(TTY_IO_ERROR, &tty->flags);
if (tty_port_initialized(port)) {
tty_port_set_initialized(port, 0);
/*
* Turn off DTR and RTS early.
*/
if (uport && uart_console(uport) && tty) {
uport->cons->cflag = tty->termios.c_cflag;
uport->cons->ispeed = tty->termios.c_ispeed;
uport->cons->ospeed = tty->termios.c_ospeed;
}
if (!tty || C_HUPCL(tty))
uart_port_dtr_rts(uport, 0);
uart_port_shutdown(port);
}
/*
* It's possible for shutdown to be called after suspend if we get
* a DCD drop (hangup) at just the right time. Clear suspended bit so
* we don't try to resume a port that has been shutdown.
*/
tty_port_set_suspended(port, 0);
/*
* Do not free() the transmit buffer page under the port lock since
* this can create various circular locking scenarios. For instance,
* console driver may need to allocate/free a debug object, which
* can endup in printk() recursion.
*/
uart_port_lock(state, flags);
xmit_buf = state->xmit.buf;
state->xmit.buf = NULL;
uart_port_unlock(uport, flags);
if (xmit_buf)
free_page((unsigned long)xmit_buf);
}
/**
* uart_update_timeout - update per-port FIFO timeout.
* @port: uart_port structure describing the port
* @cflag: termios cflag value
* @baud: speed of the port
*
* Set the port FIFO timeout value. The @cflag value should
* reflect the actual hardware settings.
*/
void
uart_update_timeout(struct uart_port *port, unsigned int cflag,
unsigned int baud)
{
unsigned int size;
size = tty_get_frame_size(cflag) * port->fifosize;
/*
* Figure the timeout to send the above number of bits.
* Add .02 seconds of slop
*/
port->timeout = (HZ * size) / baud + HZ/50;
}
EXPORT_SYMBOL(uart_update_timeout);
/**
* uart_get_baud_rate - return baud rate for a particular port
* @port: uart_port structure describing the port in question.
* @termios: desired termios settings.
* @old: old termios (or NULL)
* @min: minimum acceptable baud rate
* @max: maximum acceptable baud rate
*
* Decode the termios structure into a numeric baud rate,
* taking account of the magic 38400 baud rate (with spd_*
* flags), and mapping the %B0 rate to 9600 baud.
*
* If the new baud rate is invalid, try the old termios setting.
* If it's still invalid, we try 9600 baud.
*
* Update the @termios structure to reflect the baud rate
* we're actually going to be using. Don't do this for the case
* where B0 is requested ("hang up").
*/
unsigned int
uart_get_baud_rate(struct uart_port *port, struct ktermios *termios,
struct ktermios *old, unsigned int min, unsigned int max)
{
unsigned int try;
unsigned int baud;
unsigned int altbaud;
int hung_up = 0;
upf_t flags = port->flags & UPF_SPD_MASK;
switch (flags) {
case UPF_SPD_HI:
altbaud = 57600;
break;
case UPF_SPD_VHI:
altbaud = 115200;
break;
case UPF_SPD_SHI:
altbaud = 230400;
break;
case UPF_SPD_WARP:
altbaud = 460800;
break;
default:
altbaud = 38400;
break;
}
for (try = 0; try < 2; try++) {
baud = tty_termios_baud_rate(termios);
/*
* The spd_hi, spd_vhi, spd_shi, spd_warp kludge...
* Die! Die! Die!
*/
if (try == 0 && baud == 38400)
baud = altbaud;
/*
* Special case: B0 rate.
*/
if (baud == 0) {
hung_up = 1;
baud = 9600;
}
if (baud >= min && baud <= max)
return baud;
/*
* Oops, the quotient was zero. Try again with
* the old baud rate if possible.
*/
termios->c_cflag &= ~CBAUD;
if (old) {
baud = tty_termios_baud_rate(old);
if (!hung_up)
tty_termios_encode_baud_rate(termios,
baud, baud);
old = NULL;
continue;
}
/*
* As a last resort, if the range cannot be met then clip to
* the nearest chip supported rate.
*/
if (!hung_up) {
if (baud <= min)
tty_termios_encode_baud_rate(termios,
min + 1, min + 1);
else
tty_termios_encode_baud_rate(termios,
max - 1, max - 1);
}
}
/* Should never happen */
WARN_ON(1);
return 0;
}
EXPORT_SYMBOL(uart_get_baud_rate);
/**
* uart_get_divisor - return uart clock divisor
* @port: uart_port structure describing the port.
* @baud: desired baud rate
*
* Calculate the uart clock divisor for the port.
*/
unsigned int
uart_get_divisor(struct uart_port *port, unsigned int baud)
{
unsigned int quot;
/*
* Old custom speed handling.
*/
if (baud == 38400 && (port->flags & UPF_SPD_MASK) == UPF_SPD_CUST)
quot = port->custom_divisor;
else
quot = DIV_ROUND_CLOSEST(port->uartclk, 16 * baud);
return quot;
}
EXPORT_SYMBOL(uart_get_divisor);
/* Caller holds port mutex */
static void uart_change_speed(struct tty_struct *tty, struct uart_state *state,
struct ktermios *old_termios)
{
struct uart_port *uport = uart_port_check(state);
struct ktermios *termios;
int hw_stopped;
/*
* If we have no tty, termios, or the port does not exist,
* then we can't set the parameters for this port.
*/
if (!tty || uport->type == PORT_UNKNOWN)
return;
termios = &tty->termios;
uport->ops->set_termios(uport, termios, old_termios);
/*
* Set modem status enables based on termios cflag
*/
spin_lock_irq(&uport->lock);
if (termios->c_cflag & CRTSCTS)
uport->status |= UPSTAT_CTS_ENABLE;
else
uport->status &= ~UPSTAT_CTS_ENABLE;
if (termios->c_cflag & CLOCAL)
uport->status &= ~UPSTAT_DCD_ENABLE;
else
uport->status |= UPSTAT_DCD_ENABLE;
/* reset sw-assisted CTS flow control based on (possibly) new mode */
hw_stopped = uport->hw_stopped;
uport->hw_stopped = uart_softcts_mode(uport) &&
!(uport->ops->get_mctrl(uport) & TIOCM_CTS);
if (uport->hw_stopped) {
if (!hw_stopped)
uport->ops->stop_tx(uport);
} else {
if (hw_stopped)
__uart_start(tty);
}
spin_unlock_irq(&uport->lock);
}
static int uart_put_char(struct tty_struct *tty, unsigned char c)
{
struct uart_state *state = tty->driver_data;
struct uart_port *port;
struct circ_buf *circ;
unsigned long flags;
int ret = 0;
circ = &state->xmit;
port = uart_port_lock(state, flags);
if (!circ->buf) {
uart_port_unlock(port, flags);
return 0;
}
if (port && uart_circ_chars_free(circ) != 0) {
circ->buf[circ->head] = c;
circ->head = (circ->head + 1) & (UART_XMIT_SIZE - 1);
ret = 1;
}
uart_port_unlock(port, flags);
return ret;
}
static void uart_flush_chars(struct tty_struct *tty)
{
uart_start(tty);
}
static int uart_write(struct tty_struct *tty,
const unsigned char *buf, int count)
{
struct uart_state *state = tty->driver_data;
struct uart_port *port;
struct circ_buf *circ;
unsigned long flags;
int c, ret = 0;
/*
* This means you called this function _after_ the port was
* closed. No cookie for you.
*/
if (!state) {
WARN_ON(1);
return -EL3HLT;
}
port = uart_port_lock(state, flags);
circ = &state->xmit;
if (!circ->buf) {
uart_port_unlock(port, flags);
return 0;
}
while (port) {
c = CIRC_SPACE_TO_END(circ->head, circ->tail, UART_XMIT_SIZE);
if (count < c)
c = count;
if (c <= 0)
break;
memcpy(circ->buf + circ->head, buf, c);
circ->head = (circ->head + c) & (UART_XMIT_SIZE - 1);
buf += c;
count -= c;
ret += c;
}
__uart_start(tty);
uart_port_unlock(port, flags);
return ret;
}
static unsigned int uart_write_room(struct tty_struct *tty)
{
struct uart_state *state = tty->driver_data;
struct uart_port *port;
unsigned long flags;
unsigned int ret;
port = uart_port_lock(state, flags);
ret = uart_circ_chars_free(&state->xmit);
uart_port_unlock(port, flags);
return ret;
}
static unsigned int uart_chars_in_buffer(struct tty_struct *tty)
{
struct uart_state *state = tty->driver_data;
struct uart_port *port;
unsigned long flags;
unsigned int ret;
port = uart_port_lock(state, flags);
ret = uart_circ_chars_pending(&state->xmit);
uart_port_unlock(port, flags);
return ret;
}
static void uart_flush_buffer(struct tty_struct *tty)
{
struct uart_state *state = tty->driver_data;
struct uart_port *port;
unsigned long flags;
/*
* This means you called this function _after_ the port was
* closed. No cookie for you.
*/
if (!state) {
WARN_ON(1);
return;
}
pr_debug("uart_flush_buffer(%d) called\n", tty->index);
port = uart_port_lock(state, flags);
if (!port)
return;
uart_circ_clear(&state->xmit);
if (port->ops->flush_buffer)
port->ops->flush_buffer(port);
uart_port_unlock(port, flags);
tty_port_tty_wakeup(&state->port);
}
/*
* This function performs low-level write of high-priority XON/XOFF
* character and accounting for it.
*
* Requires uart_port to implement .serial_out().
*/
void uart_xchar_out(struct uart_port *uport, int offset)
{
serial_port_out(uport, offset, uport->x_char);
uport->icount.tx++;
uport->x_char = 0;
}
EXPORT_SYMBOL_GPL(uart_xchar_out);
/*
* This function is used to send a high-priority XON/XOFF character to
* the device
*/
static void uart_send_xchar(struct tty_struct *tty, char ch)
{
struct uart_state *state = tty->driver_data;
struct uart_port *port;
unsigned long flags;
port = uart_port_ref(state);
if (!port)
return;
if (port->ops->send_xchar)
port->ops->send_xchar(port, ch);
else {
spin_lock_irqsave(&port->lock, flags);
port->x_char = ch;
if (ch)
port->ops->start_tx(port);
spin_unlock_irqrestore(&port->lock, flags);
}
uart_port_deref(port);
}
static void uart_throttle(struct tty_struct *tty)
{
struct uart_state *state = tty->driver_data;
upstat_t mask = UPSTAT_SYNC_FIFO;
struct uart_port *port;
port = uart_port_ref(state);
if (!port)
return;
if (I_IXOFF(tty))
mask |= UPSTAT_AUTOXOFF;
if (C_CRTSCTS(tty))
mask |= UPSTAT_AUTORTS;
if (port->status & mask) {
port->ops->throttle(port);
mask &= ~port->status;
}
if (mask & UPSTAT_AUTORTS)
uart_clear_mctrl(port, TIOCM_RTS);
if (mask & UPSTAT_AUTOXOFF)
uart_send_xchar(tty, STOP_CHAR(tty));
uart_port_deref(port);
}
static void uart_unthrottle(struct tty_struct *tty)
{
struct uart_state *state = tty->driver_data;
upstat_t mask = UPSTAT_SYNC_FIFO;
struct uart_port *port;
port = uart_port_ref(state);
if (!port)
return;
if (I_IXOFF(tty))
mask |= UPSTAT_AUTOXOFF;
if (C_CRTSCTS(tty))
mask |= UPSTAT_AUTORTS;
if (port->status & mask) {
port->ops->unthrottle(port);
mask &= ~port->status;
}
if (mask & UPSTAT_AUTORTS)
uart_set_mctrl(port, TIOCM_RTS);
if (mask & UPSTAT_AUTOXOFF)
uart_send_xchar(tty, START_CHAR(tty));
uart_port_deref(port);
}
static int uart_get_info(struct tty_port *port, struct serial_struct *retinfo)
{
struct uart_state *state = container_of(port, struct uart_state, port);
struct uart_port *uport;
int ret = -ENODEV;
/*
* Ensure the state we copy is consistent and no hardware changes
* occur as we go
*/
mutex_lock(&port->mutex);
uport = uart_port_check(state);
if (!uport)
goto out;
retinfo->type = uport->type;
retinfo->line = uport->line;
retinfo->port = uport->iobase;
if (HIGH_BITS_OFFSET)
retinfo->port_high = (long) uport->iobase >> HIGH_BITS_OFFSET;
retinfo->irq = uport->irq;
retinfo->flags = (__force int)uport->flags;
retinfo->xmit_fifo_size = uport->fifosize;
retinfo->baud_base = uport->uartclk / 16;
retinfo->close_delay = jiffies_to_msecs(port->close_delay) / 10;
retinfo->closing_wait = port->closing_wait == ASYNC_CLOSING_WAIT_NONE ?
ASYNC_CLOSING_WAIT_NONE :
jiffies_to_msecs(port->closing_wait) / 10;
retinfo->custom_divisor = uport->custom_divisor;
retinfo->hub6 = uport->hub6;
retinfo->io_type = uport->iotype;
retinfo->iomem_reg_shift = uport->regshift;
retinfo->iomem_base = (void *)(unsigned long)uport->mapbase;
ret = 0;
out:
mutex_unlock(&port->mutex);
return ret;
}
static int uart_get_info_user(struct tty_struct *tty,
struct serial_struct *ss)
{
struct uart_state *state = tty->driver_data;
struct tty_port *port = &state->port;
return uart_get_info(port, ss) < 0 ? -EIO : 0;
}
static int uart_set_info(struct tty_struct *tty, struct tty_port *port,
struct uart_state *state,
struct serial_struct *new_info)
{
struct uart_port *uport = uart_port_check(state);
unsigned long new_port;
unsigned int change_irq, change_port, closing_wait;
unsigned int old_custom_divisor, close_delay;
upf_t old_flags, new_flags;
int retval = 0;
if (!uport)
return -EIO;
new_port = new_info->port;
if (HIGH_BITS_OFFSET)
new_port += (unsigned long) new_info->port_high << HIGH_BITS_OFFSET;
new_info->irq = irq_canonicalize(new_info->irq);
close_delay = msecs_to_jiffies(new_info->close_delay * 10);
closing_wait = new_info->closing_wait == ASYNC_CLOSING_WAIT_NONE ?
ASYNC_CLOSING_WAIT_NONE :
msecs_to_jiffies(new_info->closing_wait * 10);
change_irq = !(uport->flags & UPF_FIXED_PORT)
&& new_info->irq != uport->irq;
/*
* Since changing the 'type' of the port changes its resource
* allocations, we should treat type changes the same as
* IO port changes.
*/
change_port = !(uport->flags & UPF_FIXED_PORT)
&& (new_port != uport->iobase ||
(unsigned long)new_info->iomem_base != uport->mapbase ||
new_info->hub6 != uport->hub6 ||
new_info->io_type != uport->iotype ||
new_info->iomem_reg_shift != uport->regshift ||
new_info->type != uport->type);
old_flags = uport->flags;
new_flags = (__force upf_t)new_info->flags;
old_custom_divisor = uport->custom_divisor;
if (!capable(CAP_SYS_ADMIN)) {
retval = -EPERM;
if (change_irq || change_port ||
(new_info->baud_base != uport->uartclk / 16) ||
(close_delay != port->close_delay) ||
(closing_wait != port->closing_wait) ||
(new_info->xmit_fifo_size &&
new_info->xmit_fifo_size != uport->fifosize) ||
(((new_flags ^ old_flags) & ~UPF_USR_MASK) != 0))
goto exit;
uport->flags = ((uport->flags & ~UPF_USR_MASK) |
(new_flags & UPF_USR_MASK));
uport->custom_divisor = new_info->custom_divisor;
goto check_and_exit;
}
if (change_irq || change_port) {
retval = security_locked_down(LOCKDOWN_TIOCSSERIAL);
if (retval)
goto exit;
}
/*
* Ask the low level driver to verify the settings.
*/
if (uport->ops->verify_port)
retval = uport->ops->verify_port(uport, new_info);
if ((new_info->irq >= nr_irqs) || (new_info->irq < 0) ||
(new_info->baud_base < 9600))
retval = -EINVAL;
if (retval)
goto exit;
if (change_port || change_irq) {
retval = -EBUSY;
/*
* Make sure that we are the sole user of this port.
*/
if (tty_port_users(port) > 1)
goto exit;
/*
* We need to shutdown the serial port at the old
* port/type/irq combination.
*/
uart_shutdown(tty, state);
}
if (change_port) {
unsigned long old_iobase, old_mapbase;
unsigned int old_type, old_iotype, old_hub6, old_shift;
old_iobase = uport->iobase;
old_mapbase = uport->mapbase;
old_type = uport->type;
old_hub6 = uport->hub6;
old_iotype = uport->iotype;
old_shift = uport->regshift;
/*
* Free and release old regions
*/
if (old_type != PORT_UNKNOWN && uport->ops->release_port)
uport->ops->release_port(uport);
uport->iobase = new_port;
uport->type = new_info->type;
uport->hub6 = new_info->hub6;
uport->iotype = new_info->io_type;
uport->regshift = new_info->iomem_reg_shift;
uport->mapbase = (unsigned long)new_info->iomem_base;
/*
* Claim and map the new regions
*/
if (uport->type != PORT_UNKNOWN && uport->ops->request_port) {
retval = uport->ops->request_port(uport);
} else {
/* Always success - Jean II */
retval = 0;
}
/*
* If we fail to request resources for the
* new port, try to restore the old settings.
*/
if (retval) {
uport->iobase = old_iobase;
uport->type = old_type;
uport->hub6 = old_hub6;
uport->iotype = old_iotype;
uport->regshift = old_shift;
uport->mapbase = old_mapbase;
if (old_type != PORT_UNKNOWN) {
retval = uport->ops->request_port(uport);
/*
* If we failed to restore the old settings,
* we fail like this.
*/
if (retval)
uport->type = PORT_UNKNOWN;
/*
* We failed anyway.
*/
retval = -EBUSY;
}
/* Added to return the correct error -Ram Gupta */
goto exit;
}
}
if (change_irq)
uport->irq = new_info->irq;
if (!(uport->flags & UPF_FIXED_PORT))
uport->uartclk = new_info->baud_base * 16;
uport->flags = (uport->flags & ~UPF_CHANGE_MASK) |
(new_flags & UPF_CHANGE_MASK);
uport->custom_divisor = new_info->custom_divisor;
port->close_delay = close_delay;
port->closing_wait = closing_wait;
if (new_info->xmit_fifo_size)
uport->fifosize = new_info->xmit_fifo_size;
check_and_exit:
retval = 0;
if (uport->type == PORT_UNKNOWN)
goto exit;
if (tty_port_initialized(port)) {
if (((old_flags ^ uport->flags) & UPF_SPD_MASK) ||
old_custom_divisor != uport->custom_divisor) {
/*
* If they're setting up a custom divisor or speed,
* instead of clearing it, then bitch about it.
*/
if (uport->flags & UPF_SPD_MASK) {
dev_notice_ratelimited(uport->dev,
"%s sets custom speed on %s. This is deprecated.\n",
current->comm,
tty_name(port->tty));
}
uart_change_speed(tty, state, NULL);
}
} else {
retval = uart_startup(tty, state, 1);
if (retval == 0)
tty_port_set_initialized(port, true);
if (retval > 0)
retval = 0;
}
exit:
return retval;
}
static int uart_set_info_user(struct tty_struct *tty, struct serial_struct *ss)
{
struct uart_state *state = tty->driver_data;
struct tty_port *port = &state->port;
int retval;
down_write(&tty->termios_rwsem);
/*
* This semaphore protects port->count. It is also
* very useful to prevent opens. Also, take the
* port configuration semaphore to make sure that a
* module insertion/removal doesn't change anything
* under us.
*/
mutex_lock(&port->mutex);
retval = uart_set_info(tty, port, state, ss);
mutex_unlock(&port->mutex);
up_write(&tty->termios_rwsem);
return retval;
}
/**
* uart_get_lsr_info - get line status register info
* @tty: tty associated with the UART
* @state: UART being queried
* @value: returned modem value
*/
static int uart_get_lsr_info(struct tty_struct *tty,
struct uart_state *state, unsigned int __user *value)
{
struct uart_port *uport = uart_port_check(state);
unsigned int result;
result = uport->ops->tx_empty(uport);
/*
* If we're about to load something into the transmit
* register, we'll pretend the transmitter isn't empty to
* avoid a race condition (depending on when the transmit
* interrupt happens).
*/
if (uport->x_char ||
((uart_circ_chars_pending(&state->xmit) > 0) &&
!uart_tx_stopped(uport)))
result &= ~TIOCSER_TEMT;
return put_user(result, value);
}
static int uart_tiocmget(struct tty_struct *tty)
{
struct uart_state *state = tty->driver_data;
struct tty_port *port = &state->port;
struct uart_port *uport;
int result = -EIO;
mutex_lock(&port->mutex);
uport = uart_port_check(state);
if (!uport)
goto out;
if (!tty_io_error(tty)) {
result = uport->mctrl;
spin_lock_irq(&uport->lock);
result |= uport->ops->get_mctrl(uport);
spin_unlock_irq(&uport->lock);
}
out:
mutex_unlock(&port->mutex);
return result;
}
static int
uart_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear)
{
struct uart_state *state = tty->driver_data;
struct tty_port *port = &state->port;
struct uart_port *uport;
int ret = -EIO;
mutex_lock(&port->mutex);
uport = uart_port_check(state);
if (!uport)
goto out;
if (!tty_io_error(tty)) {
if (uport->rs485.flags & SER_RS485_ENABLED) {
set &= ~TIOCM_RTS;
clear &= ~TIOCM_RTS;
}
uart_update_mctrl(uport, set, clear);
ret = 0;
}
out:
mutex_unlock(&port->mutex);
return ret;
}
static int uart_break_ctl(struct tty_struct *tty, int break_state)
{
struct uart_state *state = tty->driver_data;
struct tty_port *port = &state->port;
struct uart_port *uport;
int ret = -EIO;
mutex_lock(&port->mutex);
uport = uart_port_check(state);
if (!uport)
goto out;
if (uport->type != PORT_UNKNOWN && uport->ops->break_ctl)
uport->ops->break_ctl(uport, break_state);
ret = 0;
out:
mutex_unlock(&port->mutex);
return ret;
}
static int uart_do_autoconfig(struct tty_struct *tty, struct uart_state *state)
{
struct tty_port *port = &state->port;
struct uart_port *uport;
int flags, ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/*
* Take the per-port semaphore. This prevents count from
* changing, and hence any extra opens of the port while
* we're auto-configuring.
*/
if (mutex_lock_interruptible(&port->mutex))
return -ERESTARTSYS;
uport = uart_port_check(state);
if (!uport) {
ret = -EIO;
goto out;
}
ret = -EBUSY;
if (tty_port_users(port) == 1) {
uart_shutdown(tty, state);
/*
* If we already have a port type configured,
* we must release its resources.
*/
if (uport->type != PORT_UNKNOWN && uport->ops->release_port)
uport->ops->release_port(uport);
flags = UART_CONFIG_TYPE;
if (uport->flags & UPF_AUTO_IRQ)
flags |= UART_CONFIG_IRQ;
/*
* This will claim the ports resources if
* a port is found.
*/
uport->ops->config_port(uport, flags);
ret = uart_startup(tty, state, 1);
if (ret == 0)
tty_port_set_initialized(port, true);
if (ret > 0)
ret = 0;
}
out:
mutex_unlock(&port->mutex);
return ret;
}
static void uart_enable_ms(struct uart_port *uport)
{
/*
* Force modem status interrupts on
*/
if (uport->ops->enable_ms)
uport->ops->enable_ms(uport);
}
/*
* Wait for any of the 4 modem inputs (DCD,RI,DSR,CTS) to change
* - mask passed in arg for lines of interest
* (use |'ed TIOCM_RNG/DSR/CD/CTS for masking)
* Caller should use TIOCGICOUNT to see which one it was
*
* FIXME: This wants extracting into a common all driver implementation
* of TIOCMWAIT using tty_port.
*/
static int uart_wait_modem_status(struct uart_state *state, unsigned long arg)
{
struct uart_port *uport;
struct tty_port *port = &state->port;
DECLARE_WAITQUEUE(wait, current);
struct uart_icount cprev, cnow;
int ret;
/*
* note the counters on entry
*/
uport = uart_port_ref(state);
if (!uport)
return -EIO;
spin_lock_irq(&uport->lock);
memcpy(&cprev, &uport->icount, sizeof(struct uart_icount));
uart_enable_ms(uport);
spin_unlock_irq(&uport->lock);
add_wait_queue(&port->delta_msr_wait, &wait);
for (;;) {
spin_lock_irq(&uport->lock);
memcpy(&cnow, &uport->icount, sizeof(struct uart_icount));
spin_unlock_irq(&uport->lock);
set_current_state(TASK_INTERRUPTIBLE);
if (((arg & TIOCM_RNG) && (cnow.rng != cprev.rng)) ||
((arg & TIOCM_DSR) && (cnow.dsr != cprev.dsr)) ||
((arg & TIOCM_CD) && (cnow.dcd != cprev.dcd)) ||
((arg & TIOCM_CTS) && (cnow.cts != cprev.cts))) {
ret = 0;
break;
}
schedule();
/* see if a signal did it */
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
}
cprev = cnow;
}
__set_current_state(TASK_RUNNING);
remove_wait_queue(&port->delta_msr_wait, &wait);
uart_port_deref(uport);
return ret;
}
/*
* Get counter of input serial line interrupts (DCD,RI,DSR,CTS)
* Return: write counters to the user passed counter struct
* NB: both 1->0 and 0->1 transitions are counted except for
* RI where only 0->1 is counted.
*/
static int uart_get_icount(struct tty_struct *tty,
struct serial_icounter_struct *icount)
{
struct uart_state *state = tty->driver_data;
struct uart_icount cnow;
struct uart_port *uport;
uport = uart_port_ref(state);
if (!uport)
return -EIO;
spin_lock_irq(&uport->lock);
memcpy(&cnow, &uport->icount, sizeof(struct uart_icount));
spin_unlock_irq(&uport->lock);
uart_port_deref(uport);
icount->cts = cnow.cts;
icount->dsr = cnow.dsr;
icount->rng = cnow.rng;
icount->dcd = cnow.dcd;
icount->rx = cnow.rx;
icount->tx = cnow.tx;
icount->frame = cnow.frame;
icount->overrun = cnow.overrun;
icount->parity = cnow.parity;
icount->brk = cnow.brk;
icount->buf_overrun = cnow.buf_overrun;
return 0;
}
static int uart_get_rs485_config(struct uart_port *port,
struct serial_rs485 __user *rs485)
{
unsigned long flags;
struct serial_rs485 aux;
spin_lock_irqsave(&port->lock, flags);
aux = port->rs485;
spin_unlock_irqrestore(&port->lock, flags);
if (copy_to_user(rs485, &aux, sizeof(aux)))
return -EFAULT;
return 0;
}
static int uart_set_rs485_config(struct uart_port *port,
struct serial_rs485 __user *rs485_user)
{
struct serial_rs485 rs485;
int ret;
unsigned long flags;
if (!port->rs485_config)
return -ENOTTY;
if (copy_from_user(&rs485, rs485_user, sizeof(*rs485_user)))
return -EFAULT;
spin_lock_irqsave(&port->lock, flags);
ret = port->rs485_config(port, &rs485);
spin_unlock_irqrestore(&port->lock, flags);
if (ret)
return ret;
if (copy_to_user(rs485_user, &port->rs485, sizeof(port->rs485)))
return -EFAULT;
return 0;
}
static int uart_get_iso7816_config(struct uart_port *port,
struct serial_iso7816 __user *iso7816)
{
unsigned long flags;
struct serial_iso7816 aux;
if (!port->iso7816_config)
return -ENOTTY;
spin_lock_irqsave(&port->lock, flags);
aux = port->iso7816;
spin_unlock_irqrestore(&port->lock, flags);
if (copy_to_user(iso7816, &aux, sizeof(aux)))
return -EFAULT;
return 0;
}
static int uart_set_iso7816_config(struct uart_port *port,
struct serial_iso7816 __user *iso7816_user)
{
struct serial_iso7816 iso7816;
int i, ret;
unsigned long flags;
if (!port->iso7816_config)
return -ENOTTY;
if (copy_from_user(&iso7816, iso7816_user, sizeof(*iso7816_user)))
return -EFAULT;
/*
* There are 5 words reserved for future use. Check that userspace
* doesn't put stuff in there to prevent breakages in the future.
*/
for (i = 0; i < 5; i++)
if (iso7816.reserved[i])
return -EINVAL;
spin_lock_irqsave(&port->lock, flags);
ret = port->iso7816_config(port, &iso7816);
spin_unlock_irqrestore(&port->lock, flags);
if (ret)
return ret;
if (copy_to_user(iso7816_user, &port->iso7816, sizeof(port->iso7816)))
return -EFAULT;
return 0;
}
/*
* Called via sys_ioctl. We can use spin_lock_irq() here.
*/
static int
uart_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg)
{
struct uart_state *state = tty->driver_data;
struct tty_port *port = &state->port;
struct uart_port *uport;
void __user *uarg = (void __user *)arg;
int ret = -ENOIOCTLCMD;
/*
* These ioctls don't rely on the hardware to be present.
*/
switch (cmd) {
case TIOCSERCONFIG:
down_write(&tty->termios_rwsem);
ret = uart_do_autoconfig(tty, state);
up_write(&tty->termios_rwsem);
break;
}
if (ret != -ENOIOCTLCMD)
goto out;
if (tty_io_error(tty)) {
ret = -EIO;
goto out;
}
/*
* The following should only be used when hardware is present.
*/
switch (cmd) {
case TIOCMIWAIT:
ret = uart_wait_modem_status(state, arg);
break;
}
if (ret != -ENOIOCTLCMD)
goto out;
mutex_lock(&port->mutex);
uport = uart_port_check(state);
if (!uport || tty_io_error(tty)) {
ret = -EIO;
goto out_up;
}
/*
* All these rely on hardware being present and need to be
* protected against the tty being hung up.
*/
switch (cmd) {
case TIOCSERGETLSR: /* Get line status register */
ret = uart_get_lsr_info(tty, state, uarg);
break;
case TIOCGRS485:
ret = uart_get_rs485_config(uport, uarg);
break;
case TIOCSRS485:
ret = uart_set_rs485_config(uport, uarg);
break;
case TIOCSISO7816:
ret = uart_set_iso7816_config(state->uart_port, uarg);
break;
case TIOCGISO7816:
ret = uart_get_iso7816_config(state->uart_port, uarg);
break;
default:
if (uport->ops->ioctl)
ret = uport->ops->ioctl(uport, cmd, arg);
break;
}
out_up:
mutex_unlock(&port->mutex);
out:
return ret;
}
static void uart_set_ldisc(struct tty_struct *tty)
{
struct uart_state *state = tty->driver_data;
struct uart_port *uport;
struct tty_port *port = &state->port;
if (!tty_port_initialized(port))
return;
mutex_lock(&state->port.mutex);
uport = uart_port_check(state);
if (uport && uport->ops->set_ldisc)
uport->ops->set_ldisc(uport, &tty->termios);
mutex_unlock(&state->port.mutex);
}
static void uart_set_termios(struct tty_struct *tty,
struct ktermios *old_termios)
{
struct uart_state *state = tty->driver_data;
struct uart_port *uport;
unsigned int cflag = tty->termios.c_cflag;
unsigned int iflag_mask = IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK;
bool sw_changed = false;
mutex_lock(&state->port.mutex);
uport = uart_port_check(state);
if (!uport)
goto out;
/*
* Drivers doing software flow control also need to know
* about changes to these input settings.
*/
if (uport->flags & UPF_SOFT_FLOW) {
iflag_mask |= IXANY|IXON|IXOFF;
sw_changed =
tty->termios.c_cc[VSTART] != old_termios->c_cc[VSTART] ||
tty->termios.c_cc[VSTOP] != old_termios->c_cc[VSTOP];
}
/*
* These are the bits that are used to setup various
* flags in the low level driver. We can ignore the Bfoo
* bits in c_cflag; c_[io]speed will always be set
* appropriately by set_termios() in tty_ioctl.c
*/
if ((cflag ^ old_termios->c_cflag) == 0 &&
tty->termios.c_ospeed == old_termios->c_ospeed &&
tty->termios.c_ispeed == old_termios->c_ispeed &&
((tty->termios.c_iflag ^ old_termios->c_iflag) & iflag_mask) == 0 &&
!sw_changed) {
goto out;
}
uart_change_speed(tty, state, old_termios);
/* reload cflag from termios; port driver may have overridden flags */
cflag = tty->termios.c_cflag;
/* Handle transition to B0 status */
if ((old_termios->c_cflag & CBAUD) && !(cflag & CBAUD))
uart_clear_mctrl(uport, TIOCM_RTS | TIOCM_DTR);
/* Handle transition away from B0 status */
else if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
unsigned int mask = TIOCM_DTR;
if (!(cflag & CRTSCTS) || !tty_throttled(tty))
mask |= TIOCM_RTS;
uart_set_mctrl(uport, mask);
}
out:
mutex_unlock(&state->port.mutex);
}
/*
* Calls to uart_close() are serialised via the tty_lock in
* drivers/tty/tty_io.c:tty_release()
* drivers/tty/tty_io.c:do_tty_hangup()
*/
static void uart_close(struct tty_struct *tty, struct file *filp)
{
struct uart_state *state = tty->driver_data;
if (!state) {
struct uart_driver *drv = tty->driver->driver_state;
struct tty_port *port;
state = drv->state + tty->index;
port = &state->port;
spin_lock_irq(&port->lock);
--port->count;
spin_unlock_irq(&port->lock);
return;
}
pr_debug("uart_close(%d) called\n", tty->index);
tty_port_close(tty->port, tty, filp);
}
static void uart_tty_port_shutdown(struct tty_port *port)
{
struct uart_state *state = container_of(port, struct uart_state, port);
struct uart_port *uport = uart_port_check(state);
char *buf;
/*
* At this point, we stop accepting input. To do this, we
* disable the receive line status interrupts.
*/
if (WARN(!uport, "detached port still initialized!\n"))
return;
spin_lock_irq(&uport->lock);
uport->ops->stop_rx(uport);
spin_unlock_irq(&uport->lock);
uart_port_shutdown(port);
/*
* It's possible for shutdown to be called after suspend if we get
* a DCD drop (hangup) at just the right time. Clear suspended bit so
* we don't try to resume a port that has been shutdown.
*/
tty_port_set_suspended(port, 0);
/*
* Free the transmit buffer.
*/
spin_lock_irq(&uport->lock);
buf = state->xmit.buf;
state->xmit.buf = NULL;
spin_unlock_irq(&uport->lock);
if (buf)
free_page((unsigned long)buf);
uart_change_pm(state, UART_PM_STATE_OFF);
}
static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
{
struct uart_state *state = tty->driver_data;
struct uart_port *port;
unsigned long char_time, expire;
port = uart_port_ref(state);
if (!port)
return;
if (port->type == PORT_UNKNOWN || port->fifosize == 0) {
uart_port_deref(port);
return;
}
/*
* Set the check interval to be 1/5 of the estimated time to
* send a single character, and make it at least 1. The check
* interval should also be less than the timeout.
*
* Note: we have to use pretty tight timings here to satisfy
* the NIST-PCTS.
*/
char_time = (port->timeout - HZ/50) / port->fifosize;
char_time = char_time / 5;
if (char_time == 0)
char_time = 1;
if (timeout && timeout < char_time)
char_time = timeout;
/*
* If the transmitter hasn't cleared in twice the approximate
* amount of time to send the entire FIFO, it probably won't
* ever clear. This assumes the UART isn't doing flow
* control, which is currently the case. Hence, if it ever
* takes longer than port->timeout, this is probably due to a
* UART bug of some kind. So, we clamp the timeout parameter at
* 2*port->timeout.
*/
if (timeout == 0 || timeout > 2 * port->timeout)
timeout = 2 * port->timeout;
expire = jiffies + timeout;
pr_debug("uart_wait_until_sent(%d), jiffies=%lu, expire=%lu...\n",
port->line, jiffies, expire);
/*
* Check whether the transmitter is empty every 'char_time'.
* 'timeout' / 'expire' give us the maximum amount of time
* we wait.
*/
while (!port->ops->tx_empty(port)) {
msleep_interruptible(jiffies_to_msecs(char_time));
if (signal_pending(current))
break;
if (time_after(jiffies, expire))
break;
}
uart_port_deref(port);
}
/*
* Calls to uart_hangup() are serialised by the tty_lock in
* drivers/tty/tty_io.c:do_tty_hangup()
* This runs from a workqueue and can sleep for a _short_ time only.
*/
static void uart_hangup(struct tty_struct *tty)
{
struct uart_state *state = tty->driver_data;
struct tty_port *port = &state->port;
struct uart_port *uport;
unsigned long flags;
pr_debug("uart_hangup(%d)\n", tty->index);
mutex_lock(&port->mutex);
uport = uart_port_check(state);
WARN(!uport, "hangup of detached port!\n");
if (tty_port_active(port)) {
uart_flush_buffer(tty);
uart_shutdown(tty, state);
spin_lock_irqsave(&port->lock, flags);
port->count = 0;
spin_unlock_irqrestore(&port->lock, flags);
tty_port_set_active(port, 0);
tty_port_tty_set(port, NULL);
if (uport && !uart_console(uport))
uart_change_pm(state, UART_PM_STATE_OFF);
wake_up_interruptible(&port->open_wait);
wake_up_interruptible(&port->delta_msr_wait);
}
mutex_unlock(&port->mutex);
}
/* uport == NULL if uart_port has already been removed */
static void uart_port_shutdown(struct tty_port *port)
{
struct uart_state *state = container_of(port, struct uart_state, port);
struct uart_port *uport = uart_port_check(state);
/*
* clear delta_msr_wait queue to avoid mem leaks: we may free
* the irq here so the queue might never be woken up. Note
* that we won't end up waiting on delta_msr_wait again since
* any outstanding file descriptors should be pointing at
* hung_up_tty_fops now.
*/
wake_up_interruptible(&port->delta_msr_wait);
/*
* Free the IRQ and disable the port.
*/
if (uport)
uport->ops->shutdown(uport);
/*
* Ensure that the IRQ handler isn't running on another CPU.
*/
if (uport)
synchronize_irq(uport->irq);
}
static int uart_carrier_raised(struct tty_port *port)
{
struct uart_state *state = container_of(port, struct uart_state, port);
struct uart_port *uport;
int mctrl;
uport = uart_port_ref(state);
/*
* Should never observe uport == NULL since checks for hangup should
* abort the tty_port_block_til_ready() loop before checking for carrier
* raised -- but report carrier raised if it does anyway so open will
* continue and not sleep
*/
if (WARN_ON(!uport))
return 1;
spin_lock_irq(&uport->lock);
uart_enable_ms(uport);
mctrl = uport->ops->get_mctrl(uport);
spin_unlock_irq(&uport->lock);
uart_port_deref(uport);
if (mctrl & TIOCM_CAR)
return 1;
return 0;
}
static void uart_dtr_rts(struct tty_port *port, int raise)
{
struct uart_state *state = container_of(port, struct uart_state, port);
struct uart_port *uport;
uport = uart_port_ref(state);
if (!uport)
return;
uart_port_dtr_rts(uport, raise);
uart_port_deref(uport);
}
static int uart_install(struct tty_driver *driver, struct tty_struct *tty)
{
struct uart_driver *drv = driver->driver_state;
struct uart_state *state = drv->state + tty->index;
tty->driver_data = state;
return tty_standard_install(driver, tty);
}
/*
* Calls to uart_open are serialised by the tty_lock in
* drivers/tty/tty_io.c:tty_open()
* Note that if this fails, then uart_close() _will_ be called.
*
* In time, we want to scrap the "opening nonpresent ports"
* behaviour and implement an alternative way for setserial
* to set base addresses/ports/types. This will allow us to
* get rid of a certain amount of extra tests.
*/
static int uart_open(struct tty_struct *tty, struct file *filp)
{
struct uart_state *state = tty->driver_data;
int retval;
retval = tty_port_open(&state->port, tty, filp);
if (retval > 0)
retval = 0;
return retval;
}
static int uart_port_activate(struct tty_port *port, struct tty_struct *tty)
{
struct uart_state *state = container_of(port, struct uart_state, port);
struct uart_port *uport;
int ret;
uport = uart_port_check(state);
if (!uport || uport->flags & UPF_DEAD)
return -ENXIO;
/*
* Start up the serial port.
*/
ret = uart_startup(tty, state, 0);
if (ret > 0)
tty_port_set_active(port, 1);
return ret;
}
static const char *uart_type(struct uart_port *port)
{
const char *str = NULL;
if (port->ops->type)
str = port->ops->type(port);
if (!str)
str = "unknown";
return str;
}
#ifdef CONFIG_PROC_FS
static void uart_line_info(struct seq_file *m, struct uart_driver *drv, int i)
{
struct uart_state *state = drv->state + i;
struct tty_port *port = &state->port;
enum uart_pm_state pm_state;
struct uart_port *uport;
char stat_buf[32];
unsigned int status;
int mmio;
mutex_lock(&port->mutex);
uport = uart_port_check(state);
if (!uport)
goto out;
mmio = uport->iotype >= UPIO_MEM;
seq_printf(m, "%d: uart:%s %s%08llX irq:%d",
uport->line, uart_type(uport),
mmio ? "mmio:0x" : "port:",
mmio ? (unsigned long long)uport->mapbase
: (unsigned long long)uport->iobase,
uport->irq);
if (uport->type == PORT_UNKNOWN) {
seq_putc(m, '\n');
goto out;
}
if (capable(CAP_SYS_ADMIN)) {
pm_state = state->pm_state;
if (pm_state != UART_PM_STATE_ON)
uart_change_pm(state, UART_PM_STATE_ON);
spin_lock_irq(&uport->lock);
status = uport->ops->get_mctrl(uport);
spin_unlock_irq(&uport->lock);
if (pm_state != UART_PM_STATE_ON)
uart_change_pm(state, pm_state);
seq_printf(m, " tx:%d rx:%d",
uport->icount.tx, uport->icount.rx);
if (uport->icount.frame)
seq_printf(m, " fe:%d", uport->icount.frame);
if (uport->icount.parity)
seq_printf(m, " pe:%d", uport->icount.parity);
if (uport->icount.brk)
seq_printf(m, " brk:%d", uport->icount.brk);
if (uport->icount.overrun)
seq_printf(m, " oe:%d", uport->icount.overrun);
if (uport->icount.buf_overrun)
seq_printf(m, " bo:%d", uport->icount.buf_overrun);
#define INFOBIT(bit, str) \
if (uport->mctrl & (bit)) \
strncat(stat_buf, (str), sizeof(stat_buf) - \
strlen(stat_buf) - 2)
#define STATBIT(bit, str) \
if (status & (bit)) \
strncat(stat_buf, (str), sizeof(stat_buf) - \
strlen(stat_buf) - 2)
stat_buf[0] = '\0';
stat_buf[1] = '\0';
INFOBIT(TIOCM_RTS, "|RTS");
STATBIT(TIOCM_CTS, "|CTS");
INFOBIT(TIOCM_DTR, "|DTR");
STATBIT(TIOCM_DSR, "|DSR");
STATBIT(TIOCM_CAR, "|CD");
STATBIT(TIOCM_RNG, "|RI");
if (stat_buf[0])
stat_buf[0] = ' ';
seq_puts(m, stat_buf);
}
seq_putc(m, '\n');
#undef STATBIT
#undef INFOBIT
out:
mutex_unlock(&port->mutex);
}
static int uart_proc_show(struct seq_file *m, void *v)
{
struct tty_driver *ttydrv = m->private;
struct uart_driver *drv = ttydrv->driver_state;
int i;
seq_printf(m, "serinfo:1.0 driver%s%s revision:%s\n", "", "", "");
for (i = 0; i < drv->nr; i++)
uart_line_info(m, drv, i);
return 0;
}
#endif
static inline bool uart_console_enabled(struct uart_port *port)
{
return uart_console(port) && (port->cons->flags & CON_ENABLED);
}
static void uart_port_spin_lock_init(struct uart_port *port)
{
spin_lock_init(&port->lock);
lockdep_set_class(&port->lock, &port_lock_key);
}
#if defined(CONFIG_SERIAL_CORE_CONSOLE) || defined(CONFIG_CONSOLE_POLL)
/**
* uart_console_write - write a console message to a serial port
* @port: the port to write the message
* @s: array of characters
* @count: number of characters in string to write
* @putchar: function to write character to port
*/
void uart_console_write(struct uart_port *port, const char *s,
unsigned int count,
void (*putchar)(struct uart_port *, int))
{
unsigned int i;
for (i = 0; i < count; i++, s++) { if (*s == '\n') putchar(port, '\r'); putchar(port, *s);
}
}
EXPORT_SYMBOL_GPL(uart_console_write);
/*
* Check whether an invalid uart number has been specified, and
* if so, search for the first available port that does have
* console support.
*/
struct uart_port * __init
uart_get_console(struct uart_port *ports, int nr, struct console *co)
{
int idx = co->index;
if (idx < 0 || idx >= nr || (ports[idx].iobase == 0 &&
ports[idx].membase == NULL))
for (idx = 0; idx < nr; idx++)
if (ports[idx].iobase != 0 ||
ports[idx].membase != NULL)
break;
co->index = idx;
return ports + idx;
}
/**
* uart_parse_earlycon - Parse earlycon options
* @p: ptr to 2nd field (ie., just beyond '<name>,')
* @iotype: ptr for decoded iotype (out)
* @addr: ptr for decoded mapbase/iobase (out)
* @options: ptr for <options> field; NULL if not present (out)
*
* Decodes earlycon kernel command line parameters of the form
* earlycon=<name>,io|mmio|mmio16|mmio32|mmio32be|mmio32native,<addr>,<options>
* console=<name>,io|mmio|mmio16|mmio32|mmio32be|mmio32native,<addr>,<options>
*
* The optional form
*
* earlycon=<name>,0x<addr>,<options>
* console=<name>,0x<addr>,<options>
*
* is also accepted; the returned @iotype will be UPIO_MEM.
*
* Returns 0 on success or -EINVAL on failure
*/
int uart_parse_earlycon(char *p, unsigned char *iotype, resource_size_t *addr,
char **options)
{
if (strncmp(p, "mmio,", 5) == 0) {
*iotype = UPIO_MEM;
p += 5;
} else if (strncmp(p, "mmio16,", 7) == 0) {
*iotype = UPIO_MEM16;
p += 7;
} else if (strncmp(p, "mmio32,", 7) == 0) {
*iotype = UPIO_MEM32;
p += 7;
} else if (strncmp(p, "mmio32be,", 9) == 0) {
*iotype = UPIO_MEM32BE;
p += 9;
} else if (strncmp(p, "mmio32native,", 13) == 0) {
*iotype = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) ?
UPIO_MEM32BE : UPIO_MEM32;
p += 13;
} else if (strncmp(p, "io,", 3) == 0) {
*iotype = UPIO_PORT;
p += 3;
} else if (strncmp(p, "0x", 2) == 0) {
*iotype = UPIO_MEM;
} else {
return -EINVAL;
}
/*
* Before you replace it with kstrtoull(), think about options separator
* (',') it will not tolerate
*/
*addr = simple_strtoull(p, NULL, 0);
p = strchr(p, ',');
if (p)
p++;
*options = p;
return 0;
}
EXPORT_SYMBOL_GPL(uart_parse_earlycon);
/**
* uart_parse_options - Parse serial port baud/parity/bits/flow control.
* @options: pointer to option string
* @baud: pointer to an 'int' variable for the baud rate.
* @parity: pointer to an 'int' variable for the parity.
* @bits: pointer to an 'int' variable for the number of data bits.
* @flow: pointer to an 'int' variable for the flow control character.
*
* uart_parse_options decodes a string containing the serial console
* options. The format of the string is <baud><parity><bits><flow>,
* eg: 115200n8r
*/
void
uart_parse_options(const char *options, int *baud, int *parity,
int *bits, int *flow)
{
const char *s = options;
*baud = simple_strtoul(s, NULL, 10);
while (*s >= '0' && *s <= '9')
s++;
if (*s)
*parity = *s++;
if (*s)
*bits = *s++ - '0';
if (*s)
*flow = *s;
}
EXPORT_SYMBOL_GPL(uart_parse_options);
/**
* uart_set_options - setup the serial console parameters
* @port: pointer to the serial ports uart_port structure
* @co: console pointer
* @baud: baud rate
* @parity: parity character - 'n' (none), 'o' (odd), 'e' (even)
* @bits: number of data bits
* @flow: flow control character - 'r' (rts)
*/
int
uart_set_options(struct uart_port *port, struct console *co,
int baud, int parity, int bits, int flow)
{
struct ktermios termios;
static struct ktermios dummy;
/*
* Ensure that the serial-console lock is initialised early.
*
* Note that the console-enabled check is needed because of kgdboc,
* which can end up calling uart_set_options() for an already enabled
* console via tty_find_polling_driver() and uart_poll_init().
*/
if (!uart_console_enabled(port) && !port->console_reinit)
uart_port_spin_lock_init(port);
memset(&termios, 0, sizeof(struct ktermios));
termios.c_cflag |= CREAD | HUPCL | CLOCAL;
tty_termios_encode_baud_rate(&termios, baud, baud);
if (bits == 7)
termios.c_cflag |= CS7;
else
termios.c_cflag |= CS8;
switch (parity) {
case 'o': case 'O':
termios.c_cflag |= PARODD;
fallthrough;
case 'e': case 'E':
termios.c_cflag |= PARENB;
break;
}
if (flow == 'r')
termios.c_cflag |= CRTSCTS;
/*
* some uarts on other side don't support no flow control.
* So we set * DTR in host uart to make them happy
*/
port->mctrl |= TIOCM_DTR;
port->ops->set_termios(port, &termios, &dummy);
/*
* Allow the setting of the UART parameters with a NULL console
* too:
*/
if (co) {
co->cflag = termios.c_cflag;
co->ispeed = termios.c_ispeed;
co->ospeed = termios.c_ospeed;
}
return 0;
}
EXPORT_SYMBOL_GPL(uart_set_options);
#endif /* CONFIG_SERIAL_CORE_CONSOLE */
/**
* uart_change_pm - set power state of the port
*
* @state: port descriptor
* @pm_state: new state
*
* Locking: port->mutex has to be held
*/
static void uart_change_pm(struct uart_state *state,
enum uart_pm_state pm_state)
{
struct uart_port *port = uart_port_check(state);
if (state->pm_state != pm_state) {
if (port && port->ops->pm)
port->ops->pm(port, pm_state, state->pm_state);
state->pm_state = pm_state;
}
}
struct uart_match {
struct uart_port *port;
struct uart_driver *driver;
};
static int serial_match_port(struct device *dev, void *data)
{
struct uart_match *match = data;
struct tty_driver *tty_drv = match->driver->tty_driver;
dev_t devt = MKDEV(tty_drv->major, tty_drv->minor_start) +
match->port->line;
return dev->devt == devt; /* Actually, only one tty per port */
}
int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport)
{
struct uart_state *state = drv->state + uport->line;
struct tty_port *port = &state->port;
struct device *tty_dev;
struct uart_match match = {uport, drv};
mutex_lock(&port->mutex);
tty_dev = device_find_child(uport->dev, &match, serial_match_port);
if (tty_dev && device_may_wakeup(tty_dev)) {
enable_irq_wake(uport->irq);
put_device(tty_dev);
mutex_unlock(&port->mutex);
return 0;
}
put_device(tty_dev);
/* Nothing to do if the console is not suspending */
if (!console_suspend_enabled && uart_console(uport))
goto unlock;
uport->suspended = 1;
if (tty_port_initialized(port)) {
const struct uart_ops *ops = uport->ops;
int tries;
tty_port_set_suspended(port, 1);
tty_port_set_initialized(port, 0);
spin_lock_irq(&uport->lock);
ops->stop_tx(uport);
ops->set_mctrl(uport, 0);
ops->stop_rx(uport);
spin_unlock_irq(&uport->lock);
/*
* Wait for the transmitter to empty.
*/
for (tries = 3; !ops->tx_empty(uport) && tries; tries--)
msleep(10);
if (!tries)
dev_err(uport->dev, "%s: Unable to drain transmitter\n",
uport->name);
ops->shutdown(uport);
}
/*
* Disable the console device before suspending.
*/
if (uart_console(uport))
console_stop(uport->cons);
uart_change_pm(state, UART_PM_STATE_OFF);
unlock:
mutex_unlock(&port->mutex);
return 0;
}
int uart_resume_port(struct uart_driver *drv, struct uart_port *uport)
{
struct uart_state *state = drv->state + uport->line;
struct tty_port *port = &state->port;
struct device *tty_dev;
struct uart_match match = {uport, drv};
struct ktermios termios;
mutex_lock(&port->mutex);
tty_dev = device_find_child(uport->dev, &match, serial_match_port);
if (!uport->suspended && device_may_wakeup(tty_dev)) {
if (irqd_is_wakeup_set(irq_get_irq_data((uport->irq))))
disable_irq_wake(uport->irq);
put_device(tty_dev);
mutex_unlock(&port->mutex);
return 0;
}
put_device(tty_dev);
uport->suspended = 0;
/*
* Re-enable the console device after suspending.
*/
if (uart_console(uport)) {
/*
* First try to use the console cflag setting.
*/
memset(&termios, 0, sizeof(struct ktermios));
termios.c_cflag = uport->cons->cflag;
termios.c_ispeed = uport->cons->ispeed;
termios.c_ospeed = uport->cons->ospeed;
/*
* If that's unset, use the tty termios setting.
*/
if (port->tty && termios.c_cflag == 0)
termios = port->tty->termios;
if (console_suspend_enabled)
uart_change_pm(state, UART_PM_STATE_ON);
uport->ops->set_termios(uport, &termios, NULL);
if (console_suspend_enabled)
console_start(uport->cons);
}
if (tty_port_suspended(port)) {
const struct uart_ops *ops = uport->ops;
int ret;
uart_change_pm(state, UART_PM_STATE_ON);
spin_lock_irq(&uport->lock);
ops->set_mctrl(uport, 0);
spin_unlock_irq(&uport->lock);
if (console_suspend_enabled || !uart_console(uport)) {
/* Protected by port mutex for now */
struct tty_struct *tty = port->tty;
ret = ops->startup(uport);
if (ret == 0) {
if (tty)
uart_change_speed(tty, state, NULL);
spin_lock_irq(&uport->lock);
ops->set_mctrl(uport, uport->mctrl);
ops->start_tx(uport);
spin_unlock_irq(&uport->lock);
tty_port_set_initialized(port, 1);
} else {
/*
* Failed to resume - maybe hardware went away?
* Clear the "initialized" flag so we won't try
* to call the low level drivers shutdown method.
*/
uart_shutdown(tty, state);
}
}
tty_port_set_suspended(port, 0);
}
mutex_unlock(&port->mutex);
return 0;
}
static inline void
uart_report_port(struct uart_driver *drv, struct uart_port *port)
{
char address[64];
switch (port->iotype) {
case UPIO_PORT:
snprintf(address, sizeof(address), "I/O 0x%lx", port->iobase);
break;
case UPIO_HUB6:
snprintf(address, sizeof(address),
"I/O 0x%lx offset 0x%x", port->iobase, port->hub6);
break;
case UPIO_MEM:
case UPIO_MEM16:
case UPIO_MEM32:
case UPIO_MEM32BE:
case UPIO_AU:
case UPIO_TSI:
snprintf(address, sizeof(address),
"MMIO 0x%llx", (unsigned long long)port->mapbase);
break;
default:
strlcpy(address, "*unknown*", sizeof(address));
break;
}
pr_info("%s%s%s at %s (irq = %d, base_baud = %d) is a %s\n",
port->dev ? dev_name(port->dev) : "",
port->dev ? ": " : "",
port->name,
address, port->irq, port->uartclk / 16, uart_type(port));
/* The magic multiplier feature is a bit obscure, so report it too. */
if (port->flags & UPF_MAGIC_MULTIPLIER)
pr_info("%s%s%s extra baud rates supported: %d, %d",
port->dev ? dev_name(port->dev) : "",
port->dev ? ": " : "",
port->name,
port->uartclk / 8, port->uartclk / 4);
}
static void
uart_configure_port(struct uart_driver *drv, struct uart_state *state,
struct uart_port *port)
{
unsigned int flags;
/*
* If there isn't a port here, don't do anything further.
*/
if (!port->iobase && !port->mapbase && !port->membase)
return;
/*
* Now do the auto configuration stuff. Note that config_port
* is expected to claim the resources and map the port for us.
*/
flags = 0;
if (port->flags & UPF_AUTO_IRQ)
flags |= UART_CONFIG_IRQ;
if (port->flags & UPF_BOOT_AUTOCONF) {
if (!(port->flags & UPF_FIXED_TYPE)) {
port->type = PORT_UNKNOWN;
flags |= UART_CONFIG_TYPE;
}
port->ops->config_port(port, flags);
}
if (port->type != PORT_UNKNOWN) {
unsigned long flags;
uart_report_port(drv, port);
/* Power up port for set_mctrl() */
uart_change_pm(state, UART_PM_STATE_ON);
/*
* Ensure that the modem control lines are de-activated.
* keep the DTR setting that is set in uart_set_options()
* We probably don't need a spinlock around this, but
*/
spin_lock_irqsave(&port->lock, flags);
port->mctrl &= TIOCM_DTR;
port->ops->set_mctrl(port, port->mctrl);
spin_unlock_irqrestore(&port->lock, flags);
/*
* If this driver supports console, and it hasn't been
* successfully registered yet, try to re-register it.
* It may be that the port was not available.
*/
if (port->cons && !(port->cons->flags & CON_ENABLED))
register_console(port->cons);
/*
* Power down all ports by default, except the
* console if we have one.
*/
if (!uart_console(port))
uart_change_pm(state, UART_PM_STATE_OFF);
}
}
#ifdef CONFIG_CONSOLE_POLL
static int uart_poll_init(struct tty_driver *driver, int line, char *options)
{
struct uart_driver *drv = driver->driver_state;
struct uart_state *state = drv->state + line;
struct tty_port *tport;
struct uart_port *port;
int baud = 9600;
int bits = 8;
int parity = 'n';
int flow = 'n';
int ret = 0;
tport = &state->port;
mutex_lock(&tport->mutex);
port = uart_port_check(state);
if (!port || !(port->ops->poll_get_char && port->ops->poll_put_char)) {
ret = -1;
goto out;
}
if (port->ops->poll_init) {
/*
* We don't set initialized as we only initialized the hw,
* e.g. state->xmit is still uninitialized.
*/
if (!tty_port_initialized(tport))
ret = port->ops->poll_init(port);
}
if (!ret && options) {
uart_parse_options(options, &baud, &parity, &bits, &flow);
ret = uart_set_options(port, NULL, baud, parity, bits, flow);
}
out:
mutex_unlock(&tport->mutex);
return ret;
}
static int uart_poll_get_char(struct tty_driver *driver, int line)
{
struct uart_driver *drv = driver->driver_state;
struct uart_state *state = drv->state + line;
struct uart_port *port;
int ret = -1;
port = uart_port_ref(state);
if (port) {
ret = port->ops->poll_get_char(port);
uart_port_deref(port);
}
return ret;
}
static void uart_poll_put_char(struct tty_driver *driver, int line, char ch)
{
struct uart_driver *drv = driver->driver_state;
struct uart_state *state = drv->state + line;
struct uart_port *port;
port = uart_port_ref(state);
if (!port)
return;
if (ch == '\n')
port->ops->poll_put_char(port, '\r');
port->ops->poll_put_char(port, ch);
uart_port_deref(port);
}
#endif
static const struct tty_operations uart_ops = {
.install = uart_install,
.open = uart_open,
.close = uart_close,
.write = uart_write,
.put_char = uart_put_char,
.flush_chars = uart_flush_chars,
.write_room = uart_write_room,
.chars_in_buffer= uart_chars_in_buffer,
.flush_buffer = uart_flush_buffer,
.ioctl = uart_ioctl,
.throttle = uart_throttle,
.unthrottle = uart_unthrottle,
.send_xchar = uart_send_xchar,
.set_termios = uart_set_termios,
.set_ldisc = uart_set_ldisc,
.stop = uart_stop,
.start = uart_start,
.hangup = uart_hangup,
.break_ctl = uart_break_ctl,
.wait_until_sent= uart_wait_until_sent,
#ifdef CONFIG_PROC_FS
.proc_show = uart_proc_show,
#endif
.tiocmget = uart_tiocmget,
.tiocmset = uart_tiocmset,
.set_serial = uart_set_info_user,
.get_serial = uart_get_info_user,
.get_icount = uart_get_icount,
#ifdef CONFIG_CONSOLE_POLL
.poll_init = uart_poll_init,
.poll_get_char = uart_poll_get_char,
.poll_put_char = uart_poll_put_char,
#endif
};
static const struct tty_port_operations uart_port_ops = {
.carrier_raised = uart_carrier_raised,
.dtr_rts = uart_dtr_rts,
.activate = uart_port_activate,
.shutdown = uart_tty_port_shutdown,
};
/**
* uart_register_driver - register a driver with the uart core layer
* @drv: low level driver structure
*
* Register a uart driver with the core driver. We in turn register
* with the tty layer, and initialise the core driver per-port state.
*
* We have a proc file in /proc/tty/driver which is named after the
* normal driver.
*
* drv->port should be NULL, and the per-port structures should be
* registered using uart_add_one_port after this call has succeeded.
*/
int uart_register_driver(struct uart_driver *drv)
{
struct tty_driver *normal;
int i, retval = -ENOMEM;
BUG_ON(drv->state);
/*
* Maybe we should be using a slab cache for this, especially if
* we have a large number of ports to handle.
*/
drv->state = kcalloc(drv->nr, sizeof(struct uart_state), GFP_KERNEL);
if (!drv->state)
goto out;
normal = tty_alloc_driver(drv->nr, TTY_DRIVER_REAL_RAW |
TTY_DRIVER_DYNAMIC_DEV);
if (IS_ERR(normal)) {
retval = PTR_ERR(normal);
goto out_kfree;
}
drv->tty_driver = normal;
normal->driver_name = drv->driver_name;
normal->name = drv->dev_name;
normal->major = drv->major;
normal->minor_start = drv->minor;
normal->type = TTY_DRIVER_TYPE_SERIAL;
normal->subtype = SERIAL_TYPE_NORMAL;
normal->init_termios = tty_std_termios;
normal->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;
normal->init_termios.c_ispeed = normal->init_termios.c_ospeed = 9600;
normal->driver_state = drv;
tty_set_operations(normal, &uart_ops);
/*
* Initialise the UART state(s).
*/
for (i = 0; i < drv->nr; i++) {
struct uart_state *state = drv->state + i;
struct tty_port *port = &state->port;
tty_port_init(port);
port->ops = &uart_port_ops;
}
retval = tty_register_driver(normal);
if (retval >= 0)
return retval;
for (i = 0; i < drv->nr; i++)
tty_port_destroy(&drv->state[i].port);
tty_driver_kref_put(normal);
out_kfree:
kfree(drv->state);
out:
return retval;
}
/**
* uart_unregister_driver - remove a driver from the uart core layer
* @drv: low level driver structure
*
* Remove all references to a driver from the core driver. The low
* level driver must have removed all its ports via the
* uart_remove_one_port() if it registered them with uart_add_one_port().
* (ie, drv->port == NULL)
*/
void uart_unregister_driver(struct uart_driver *drv)
{
struct tty_driver *p = drv->tty_driver;
unsigned int i;
tty_unregister_driver(p);
tty_driver_kref_put(p);
for (i = 0; i < drv->nr; i++)
tty_port_destroy(&drv->state[i].port);
kfree(drv->state);
drv->state = NULL;
drv->tty_driver = NULL;
}
struct tty_driver *uart_console_device(struct console *co, int *index)
{
struct uart_driver *p = co->data;
*index = co->index;
return p->tty_driver;
}
EXPORT_SYMBOL_GPL(uart_console_device);
static ssize_t uartclk_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
uart_get_info(port, &tmp);
return sprintf(buf, "%d\n", tmp.baud_base * 16);
}
static ssize_t type_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
uart_get_info(port, &tmp);
return sprintf(buf, "%d\n", tmp.type);
}
static ssize_t line_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
uart_get_info(port, &tmp);
return sprintf(buf, "%d\n", tmp.line);
}
static ssize_t port_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
unsigned long ioaddr;
uart_get_info(port, &tmp);
ioaddr = tmp.port;
if (HIGH_BITS_OFFSET)
ioaddr |= (unsigned long)tmp.port_high << HIGH_BITS_OFFSET;
return sprintf(buf, "0x%lX\n", ioaddr);
}
static ssize_t irq_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
uart_get_info(port, &tmp);
return sprintf(buf, "%d\n", tmp.irq);
}
static ssize_t flags_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
uart_get_info(port, &tmp);
return sprintf(buf, "0x%X\n", tmp.flags);
}
static ssize_t xmit_fifo_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
uart_get_info(port, &tmp);
return sprintf(buf, "%d\n", tmp.xmit_fifo_size);
}
static ssize_t close_delay_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
uart_get_info(port, &tmp);
return sprintf(buf, "%d\n", tmp.close_delay);
}
static ssize_t closing_wait_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
uart_get_info(port, &tmp);
return sprintf(buf, "%d\n", tmp.closing_wait);
}
static ssize_t custom_divisor_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
uart_get_info(port, &tmp);
return sprintf(buf, "%d\n", tmp.custom_divisor);
}
static ssize_t io_type_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
uart_get_info(port, &tmp);
return sprintf(buf, "%d\n", tmp.io_type);
}
static ssize_t iomem_base_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
uart_get_info(port, &tmp);
return sprintf(buf, "0x%lX\n", (unsigned long)tmp.iomem_base);
}
static ssize_t iomem_reg_shift_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct serial_struct tmp;
struct tty_port *port = dev_get_drvdata(dev);
uart_get_info(port, &tmp);
return sprintf(buf, "%d\n", tmp.iomem_reg_shift);
}
static ssize_t console_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct tty_port *port = dev_get_drvdata(dev);
struct uart_state *state = container_of(port, struct uart_state, port);
struct uart_port *uport;
bool console = false;
mutex_lock(&port->mutex);
uport = uart_port_check(state);
if (uport)
console = uart_console_enabled(uport);
mutex_unlock(&port->mutex);
return sprintf(buf, "%c\n", console ? 'Y' : 'N');
}
static ssize_t console_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct tty_port *port = dev_get_drvdata(dev);
struct uart_state *state = container_of(port, struct uart_state, port);
struct uart_port *uport;
bool oldconsole, newconsole;
int ret;
ret = kstrtobool(buf, &newconsole);
if (ret)
return ret;
mutex_lock(&port->mutex);
uport = uart_port_check(state);
if (uport) {
oldconsole = uart_console_enabled(uport);
if (oldconsole && !newconsole) {
ret = unregister_console(uport->cons);
} else if (!oldconsole && newconsole) {
if (uart_console(uport)) {
uport->console_reinit = 1;
register_console(uport->cons);
} else {
ret = -ENOENT;
}
}
} else {
ret = -ENXIO;
}
mutex_unlock(&port->mutex);
return ret < 0 ? ret : count;
}
static DEVICE_ATTR_RO(uartclk);
static DEVICE_ATTR_RO(type);
static DEVICE_ATTR_RO(line);
static DEVICE_ATTR_RO(port);
static DEVICE_ATTR_RO(irq);
static DEVICE_ATTR_RO(flags);
static DEVICE_ATTR_RO(xmit_fifo_size);
static DEVICE_ATTR_RO(close_delay);
static DEVICE_ATTR_RO(closing_wait);
static DEVICE_ATTR_RO(custom_divisor);
static DEVICE_ATTR_RO(io_type);
static DEVICE_ATTR_RO(iomem_base);
static DEVICE_ATTR_RO(iomem_reg_shift);
static DEVICE_ATTR_RW(console);
static struct attribute *tty_dev_attrs[] = {
&dev_attr_uartclk.attr,
&dev_attr_type.attr,
&dev_attr_line.attr,
&dev_attr_port.attr,
&dev_attr_irq.attr,
&dev_attr_flags.attr,
&dev_attr_xmit_fifo_size.attr,
&dev_attr_close_delay.attr,
&dev_attr_closing_wait.attr,
&dev_attr_custom_divisor.attr,
&dev_attr_io_type.attr,
&dev_attr_iomem_base.attr,
&dev_attr_iomem_reg_shift.attr,
&dev_attr_console.attr,
NULL
};
static const struct attribute_group tty_dev_attr_group = {
.attrs = tty_dev_attrs,
};
/**
* uart_add_one_port - attach a driver-defined port structure
* @drv: pointer to the uart low level driver structure for this port
* @uport: uart port structure to use for this port.
*
* Context: task context, might sleep
*
* This allows the driver to register its own uart_port structure
* with the core driver. The main purpose is to allow the low
* level uart drivers to expand uart_port, rather than having yet
* more levels of structures.
*/
int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport)
{
struct uart_state *state;
struct tty_port *port;
int ret = 0;
struct device *tty_dev;
int num_groups;
if (uport->line >= drv->nr)
return -EINVAL;
state = drv->state + uport->line;
port = &state->port;
mutex_lock(&port_mutex);
mutex_lock(&port->mutex);
if (state->uart_port) {
ret = -EINVAL;
goto out;
}
/* Link the port to the driver state table and vice versa */
atomic_set(&state->refcount, 1);
init_waitqueue_head(&state->remove_wait);
state->uart_port = uport;
uport->state = state;
state->pm_state = UART_PM_STATE_UNDEFINED;
uport->cons = drv->cons;
uport->minor = drv->tty_driver->minor_start + uport->line;
uport->name = kasprintf(GFP_KERNEL, "%s%d", drv->dev_name,
drv->tty_driver->name_base + uport->line);
if (!uport->name) {
ret = -ENOMEM;
goto out;
}
/*
* If this port is in use as a console then the spinlock is already
* initialised.
*/
if (!uart_console_enabled(uport))
uart_port_spin_lock_init(uport);
if (uport->cons && uport->dev)
of_console_check(uport->dev->of_node, uport->cons->name, uport->line);
tty_port_link_device(port, drv->tty_driver, uport->line);
uart_configure_port(drv, state, uport);
port->console = uart_console(uport);
num_groups = 2;
if (uport->attr_group)
num_groups++;
uport->tty_groups = kcalloc(num_groups, sizeof(*uport->tty_groups),
GFP_KERNEL);
if (!uport->tty_groups) {
ret = -ENOMEM;
goto out;
}
uport->tty_groups[0] = &tty_dev_attr_group;
if (uport->attr_group)
uport->tty_groups[1] = uport->attr_group;
/*
* Register the port whether it's detected or not. This allows
* setserial to be used to alter this port's parameters.
*/
tty_dev = tty_port_register_device_attr_serdev(port, drv->tty_driver,
uport->line, uport->dev, port, uport->tty_groups);
if (!IS_ERR(tty_dev)) {
device_set_wakeup_capable(tty_dev, 1);
} else {
dev_err(uport->dev, "Cannot register tty device on line %d\n",
uport->line);
}
/*
* Ensure UPF_DEAD is not set.
*/
uport->flags &= ~UPF_DEAD;
out:
mutex_unlock(&port->mutex);
mutex_unlock(&port_mutex);
return ret;
}
/**
* uart_remove_one_port - detach a driver defined port structure
* @drv: pointer to the uart low level driver structure for this port
* @uport: uart port structure for this port
*
* Context: task context, might sleep
*
* This unhooks (and hangs up) the specified port structure from the
* core driver. No further calls will be made to the low-level code
* for this port.
*/
int uart_remove_one_port(struct uart_driver *drv, struct uart_port *uport)
{
struct uart_state *state = drv->state + uport->line;
struct tty_port *port = &state->port;
struct uart_port *uart_port;
struct tty_struct *tty;
int ret = 0;
mutex_lock(&port_mutex);
/*
* Mark the port "dead" - this prevents any opens from
* succeeding while we shut down the port.
*/
mutex_lock(&port->mutex);
uart_port = uart_port_check(state);
if (uart_port != uport)
dev_alert(uport->dev, "Removing wrong port: %p != %p\n",
uart_port, uport);
if (!uart_port) {
mutex_unlock(&port->mutex);
ret = -EINVAL;
goto out;
}
uport->flags |= UPF_DEAD;
mutex_unlock(&port->mutex);
/*
* Remove the devices from the tty layer
*/
tty_port_unregister_device(port, drv->tty_driver, uport->line);
tty = tty_port_tty_get(port);
if (tty) {
tty_vhangup(port->tty);
tty_kref_put(tty);
}
/*
* If the port is used as a console, unregister it
*/
if (uart_console(uport))
unregister_console(uport->cons);
/*
* Free the port IO and memory resources, if any.
*/
if (uport->type != PORT_UNKNOWN && uport->ops->release_port)
uport->ops->release_port(uport);
kfree(uport->tty_groups);
kfree(uport->name);
/*
* Indicate that there isn't a port here anymore.
*/
uport->type = PORT_UNKNOWN;
mutex_lock(&port->mutex);
WARN_ON(atomic_dec_return(&state->refcount) < 0);
wait_event(state->remove_wait, !atomic_read(&state->refcount));
state->uart_port = NULL;
mutex_unlock(&port->mutex);
out:
mutex_unlock(&port_mutex);
return ret;
}
/*
* Are the two ports equivalent?
*/
bool uart_match_port(const struct uart_port *port1,
const struct uart_port *port2)
{
if (port1->iotype != port2->iotype)
return false;
switch (port1->iotype) {
case UPIO_PORT:
return port1->iobase == port2->iobase;
case UPIO_HUB6:
return port1->iobase == port2->iobase &&
port1->hub6 == port2->hub6;
case UPIO_MEM:
case UPIO_MEM16:
case UPIO_MEM32:
case UPIO_MEM32BE:
case UPIO_AU:
case UPIO_TSI:
return port1->mapbase == port2->mapbase;
}
return false;
}
EXPORT_SYMBOL(uart_match_port);
/**
* uart_handle_dcd_change - handle a change of carrier detect state
* @uport: uart_port structure for the open port
* @status: new carrier detect status, nonzero if active
*
* Caller must hold uport->lock
*/
void uart_handle_dcd_change(struct uart_port *uport, unsigned int status)
{
struct tty_port *port = &uport->state->port;
struct tty_struct *tty = port->tty;
struct tty_ldisc *ld;
lockdep_assert_held_once(&uport->lock);
if (tty) {
ld = tty_ldisc_ref(tty);
if (ld) {
if (ld->ops->dcd_change)
ld->ops->dcd_change(tty, status);
tty_ldisc_deref(ld);
}
}
uport->icount.dcd++;
if (uart_dcd_enabled(uport)) {
if (status)
wake_up_interruptible(&port->open_wait);
else if (tty)
tty_hangup(tty);
}
}
EXPORT_SYMBOL_GPL(uart_handle_dcd_change);
/**
* uart_handle_cts_change - handle a change of clear-to-send state
* @uport: uart_port structure for the open port
* @status: new clear to send status, nonzero if active
*
* Caller must hold uport->lock
*/
void uart_handle_cts_change(struct uart_port *uport, unsigned int status)
{
lockdep_assert_held_once(&uport->lock);
uport->icount.cts++;
if (uart_softcts_mode(uport)) {
if (uport->hw_stopped) {
if (status) {
uport->hw_stopped = 0;
uport->ops->start_tx(uport);
uart_write_wakeup(uport);
}
} else {
if (!status) {
uport->hw_stopped = 1;
uport->ops->stop_tx(uport);
}
}
}
}
EXPORT_SYMBOL_GPL(uart_handle_cts_change);
/**
* uart_insert_char - push a char to the uart layer
*
* User is responsible to call tty_flip_buffer_push when they are done with
* insertion.
*
* @port: corresponding port
* @status: state of the serial port RX buffer (LSR for 8250)
* @overrun: mask of overrun bits in @status
* @ch: character to push
* @flag: flag for the character (see TTY_NORMAL and friends)
*/
void uart_insert_char(struct uart_port *port, unsigned int status,
unsigned int overrun, unsigned int ch, unsigned int flag)
{
struct tty_port *tport = &port->state->port;
if ((status & port->ignore_status_mask & ~overrun) == 0)
if (tty_insert_flip_char(tport, ch, flag) == 0)
++port->icount.buf_overrun;
/*
* Overrun is special. Since it's reported immediately,
* it doesn't affect the current character.
*/
if (status & ~port->ignore_status_mask & overrun)
if (tty_insert_flip_char(tport, 0, TTY_OVERRUN) == 0)
++port->icount.buf_overrun;
}
EXPORT_SYMBOL_GPL(uart_insert_char);
#ifdef CONFIG_MAGIC_SYSRQ_SERIAL
static const char sysrq_toggle_seq[] = CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE;
static void uart_sysrq_on(struct work_struct *w)
{
int sysrq_toggle_seq_len = strlen(sysrq_toggle_seq);
sysrq_toggle_support(1);
pr_info("SysRq is enabled by magic sequence '%*pE' on serial\n",
sysrq_toggle_seq_len, sysrq_toggle_seq);
}
static DECLARE_WORK(sysrq_enable_work, uart_sysrq_on);
/**
* uart_try_toggle_sysrq - Enables SysRq from serial line
* @port: uart_port structure where char(s) after BREAK met
* @ch: new character in the sequence after received BREAK
*
* Enables magic SysRq when the required sequence is met on port
* (see CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE).
*
* Returns false if @ch is out of enabling sequence and should be
* handled some other way, true if @ch was consumed.
*/
bool uart_try_toggle_sysrq(struct uart_port *port, unsigned int ch)
{
int sysrq_toggle_seq_len = strlen(sysrq_toggle_seq);
if (!sysrq_toggle_seq_len)
return false;
BUILD_BUG_ON(ARRAY_SIZE(sysrq_toggle_seq) >= U8_MAX);
if (sysrq_toggle_seq[port->sysrq_seq] != ch) {
port->sysrq_seq = 0;
return false;
}
if (++port->sysrq_seq < sysrq_toggle_seq_len) {
port->sysrq = jiffies + SYSRQ_TIMEOUT;
return true;
}
schedule_work(&sysrq_enable_work);
port->sysrq = 0;
return true;
}
EXPORT_SYMBOL_GPL(uart_try_toggle_sysrq);
#endif
EXPORT_SYMBOL(uart_write_wakeup);
EXPORT_SYMBOL(uart_register_driver);
EXPORT_SYMBOL(uart_unregister_driver);
EXPORT_SYMBOL(uart_suspend_port);
EXPORT_SYMBOL(uart_resume_port);
EXPORT_SYMBOL(uart_add_one_port);
EXPORT_SYMBOL(uart_remove_one_port);
/**
* uart_get_rs485_mode() - retrieve rs485 properties for given uart
* @port: uart device's target port
*
* This function implements the device tree binding described in
* Documentation/devicetree/bindings/serial/rs485.txt.
*/
int uart_get_rs485_mode(struct uart_port *port)
{
struct serial_rs485 *rs485conf = &port->rs485;
struct device *dev = port->dev;
u32 rs485_delay[2];
int ret;
ret = device_property_read_u32_array(dev, "rs485-rts-delay",
rs485_delay, 2);
if (!ret) {
rs485conf->delay_rts_before_send = rs485_delay[0];
rs485conf->delay_rts_after_send = rs485_delay[1];
} else {
rs485conf->delay_rts_before_send = 0;
rs485conf->delay_rts_after_send = 0;
}
/*
* Clear full-duplex and enabled flags, set RTS polarity to active high
* to get to a defined state with the following properties:
*/
rs485conf->flags &= ~(SER_RS485_RX_DURING_TX | SER_RS485_ENABLED |
SER_RS485_TERMINATE_BUS |
SER_RS485_RTS_AFTER_SEND);
rs485conf->flags |= SER_RS485_RTS_ON_SEND;
if (device_property_read_bool(dev, "rs485-rx-during-tx"))
rs485conf->flags |= SER_RS485_RX_DURING_TX;
if (device_property_read_bool(dev, "linux,rs485-enabled-at-boot-time"))
rs485conf->flags |= SER_RS485_ENABLED;
if (device_property_read_bool(dev, "rs485-rts-active-low")) {
rs485conf->flags &= ~SER_RS485_RTS_ON_SEND;
rs485conf->flags |= SER_RS485_RTS_AFTER_SEND;
}
/*
* Disabling termination by default is the safe choice: Else if many
* bus participants enable it, no communication is possible at all.
* Works fine for short cables and users may enable for longer cables.
*/
port->rs485_term_gpio = devm_gpiod_get_optional(dev, "rs485-term",
GPIOD_OUT_LOW);
if (IS_ERR(port->rs485_term_gpio)) {
ret = PTR_ERR(port->rs485_term_gpio);
port->rs485_term_gpio = NULL;
return dev_err_probe(dev, ret, "Cannot get rs485-term-gpios\n");
}
return 0;
}
EXPORT_SYMBOL_GPL(uart_get_rs485_mode);
MODULE_DESCRIPTION("Serial driver core");
MODULE_LICENSE("GPL");
// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/filesystems.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* table of configured filesystems
*/
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs_parser.h>
/*
* Handling of filesystem drivers list.
* Rules:
* Inclusion to/removals from/scanning of list are protected by spinlock.
* During the unload module must call unregister_filesystem().
* We can access the fields of list element if:
* 1) spinlock is held or
* 2) we hold the reference to the module.
* The latter can be guaranteed by call of try_module_get(); if it
* returned 0 we must skip the element, otherwise we got the reference.
* Once the reference is obtained we can drop the spinlock.
*/
static struct file_system_type *file_systems;
static DEFINE_RWLOCK(file_systems_lock);
/* WARNING: This can be used only if we _already_ own a reference */
struct file_system_type *get_filesystem(struct file_system_type *fs)
{
__module_get(fs->owner);
return fs;
}
void put_filesystem(struct file_system_type *fs)
{
module_put(fs->owner);
}
static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
struct file_system_type **p;
for (p = &file_systems; *p; p = &(*p)->next) if (strncmp((*p)->name, name, len) == 0 && !(*p)->name[len])
break;
return p;
}
/**
* register_filesystem - register a new filesystem
* @fs: the file system structure
*
* Adds the file system passed to the list of file systems the kernel
* is aware of for mount and other syscalls. Returns 0 on success,
* or a negative errno code on an error.
*
* The &struct file_system_type that is passed is linked into the kernel
* structures and must not be freed until the file system has been
* unregistered.
*/
int register_filesystem(struct file_system_type * fs)
{
int res = 0;
struct file_system_type ** p;
if (fs->parameters &&
!fs_validate_description(fs->name, fs->parameters))
return -EINVAL;
BUG_ON(strchr(fs->name, '.'));
if (fs->next)
return -EBUSY;
write_lock(&file_systems_lock);
p = find_filesystem(fs->name, strlen(fs->name));
if (*p)
res = -EBUSY;
else
*p = fs;
write_unlock(&file_systems_lock);
return res;
}
EXPORT_SYMBOL(register_filesystem);
/**
* unregister_filesystem - unregister a file system
* @fs: filesystem to unregister
*
* Remove a file system that was previously successfully registered
* with the kernel. An error is returned if the file system is not found.
* Zero is returned on a success.
*
* Once this function has returned the &struct file_system_type structure
* may be freed or reused.
*/
int unregister_filesystem(struct file_system_type * fs)
{
struct file_system_type ** tmp;
write_lock(&file_systems_lock);
tmp = &file_systems;
while (*tmp) {
if (fs == *tmp) {
*tmp = fs->next;
fs->next = NULL;
write_unlock(&file_systems_lock);
synchronize_rcu();
return 0;
}
tmp = &(*tmp)->next;
}
write_unlock(&file_systems_lock);
return -EINVAL;
}
EXPORT_SYMBOL(unregister_filesystem);
#ifdef CONFIG_SYSFS_SYSCALL
static int fs_index(const char __user * __name)
{
struct file_system_type * tmp;
struct filename *name;
int err, index;
name = getname(__name);
err = PTR_ERR(name);
if (IS_ERR(name))
return err;
err = -EINVAL;
read_lock(&file_systems_lock);
for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
if (strcmp(tmp->name, name->name) == 0) {
err = index;
break;
}
}
read_unlock(&file_systems_lock);
putname(name);
return err;
}
static int fs_name(unsigned int index, char __user * buf)
{
struct file_system_type * tmp;
int len, res;
read_lock(&file_systems_lock);
for (tmp = file_systems; tmp; tmp = tmp->next, index--)
if (index <= 0 && try_module_get(tmp->owner))
break;
read_unlock(&file_systems_lock);
if (!tmp)
return -EINVAL;
/* OK, we got the reference, so we can safely block */
len = strlen(tmp->name) + 1;
res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
put_filesystem(tmp);
return res;
}
static int fs_maxindex(void)
{
struct file_system_type * tmp;
int index;
read_lock(&file_systems_lock);
for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
;
read_unlock(&file_systems_lock);
return index;
}
/*
* Whee.. Weird sysv syscall.
*/
SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
{
int retval = -EINVAL;
switch (option) {
case 1:
retval = fs_index((const char __user *) arg1);
break;
case 2:
retval = fs_name(arg1, (char __user *) arg2);
break;
case 3:
retval = fs_maxindex();
break;
}
return retval;
}
#endif
int __init list_bdev_fs_names(char *buf, size_t size)
{
struct file_system_type *p;
size_t len;
int count = 0;
read_lock(&file_systems_lock);
for (p = file_systems; p; p = p->next) {
if (!(p->fs_flags & FS_REQUIRES_DEV))
continue;
len = strlen(p->name) + 1;
if (len > size) {
pr_warn("%s: truncating file system list\n", __func__);
break;
}
memcpy(buf, p->name, len);
buf += len;
size -= len;
count++;
}
read_unlock(&file_systems_lock);
return count;
}
#ifdef CONFIG_PROC_FS
static int filesystems_proc_show(struct seq_file *m, void *v)
{
struct file_system_type * tmp;
read_lock(&file_systems_lock);
tmp = file_systems;
while (tmp) {
seq_printf(m, "%s\t%s\n",
(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
tmp->name);
tmp = tmp->next;
}
read_unlock(&file_systems_lock);
return 0;
}
static int __init proc_filesystems_init(void)
{
proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
return 0;
}
module_init(proc_filesystems_init);
#endif
static struct file_system_type *__get_fs_type(const char *name, int len)
{
struct file_system_type *fs;
read_lock(&file_systems_lock);
fs = *(find_filesystem(name, len));
if (fs && !try_module_get(fs->owner))
fs = NULL;
read_unlock(&file_systems_lock);
return fs;
}
struct file_system_type *get_fs_type(const char *name)
{
struct file_system_type *fs;
const char *dot = strchr(name, '.'); int len = dot ? dot - name : strlen(name);
fs = __get_fs_type(name, len);
if (!fs && (request_module("fs-%.*s", len, name) == 0)) { fs = __get_fs_type(name, len);
if (!fs)
pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n",
len, name);
}
if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
put_filesystem(fs);
fs = NULL;
}
return fs;
}
EXPORT_SYMBOL(get_fs_type);
// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/percpu-vm.c - vmalloc area based chunk allocation
*
* Copyright (C) 2010 SUSE Linux Products GmbH
* Copyright (C) 2010 Tejun Heo <tj@kernel.org>
*
* Chunks are mapped into vmalloc areas and populated page by page.
* This is the default chunk allocator.
*/
#include "internal.h"
static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
/* must not be used on pre-mapped chunk */
WARN_ON(chunk->immutable);
return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
}
/**
* pcpu_get_pages - get temp pages array
*
* Returns pointer to array of pointers to struct page which can be indexed
* with pcpu_page_idx(). Note that there is only one array and accesses
* should be serialized by pcpu_alloc_mutex.
*
* RETURNS:
* Pointer to temp pages array on success.
*/
static struct page **pcpu_get_pages(void)
{
static struct page **pages;
size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
lockdep_assert_held(&pcpu_alloc_mutex);
if (!pages)
pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL);
return pages;
}
/**
* pcpu_free_pages - free pages which were allocated for @chunk
* @chunk: chunk pages were allocated for
* @pages: array of pages to be freed, indexed by pcpu_page_idx()
* @page_start: page index of the first page to be freed
* @page_end: page index of the last page to be freed + 1
*
* Free pages [@page_start and @page_end) in @pages for all units.
* The pages were allocated for @chunk.
*/
static void pcpu_free_pages(struct pcpu_chunk *chunk,
struct page **pages, int page_start, int page_end)
{
unsigned int cpu;
int i;
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
struct page *page = pages[pcpu_page_idx(cpu, i)];
if (page)
__free_page(page);
}
}
}
/**
* pcpu_alloc_pages - allocates pages for @chunk
* @chunk: target chunk
* @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
* @page_start: page index of the first page to be allocated
* @page_end: page index of the last page to be allocated + 1
* @gfp: allocation flags passed to the underlying allocator
*
* Allocate pages [@page_start,@page_end) into @pages for all units.
* The allocation is for @chunk. Percpu core doesn't care about the
* content of @pages and will pass it verbatim to pcpu_map_pages().
*/
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
struct page **pages, int page_start, int page_end,
gfp_t gfp)
{
unsigned int cpu, tcpu;
int i;
gfp |= __GFP_HIGHMEM;
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
if (!*pagep)
goto err;
}
}
return 0;
err:
while (--i >= page_start)
__free_page(pages[pcpu_page_idx(cpu, i)]);
for_each_possible_cpu(tcpu) {
if (tcpu == cpu)
break;
for (i = page_start; i < page_end; i++)
__free_page(pages[pcpu_page_idx(tcpu, i)]);
}
return -ENOMEM;
}
/**
* pcpu_pre_unmap_flush - flush cache prior to unmapping
* @chunk: chunk the regions to be flushed belongs to
* @page_start: page index of the first page to be flushed
* @page_end: page index of the last page to be flushed + 1
*
* Pages in [@page_start,@page_end) of @chunk are about to be
* unmapped. Flush cache. As each flushing trial can be very
* expensive, issue flush on the whole region at once rather than
* doing it for each cpu. This could be an overkill but is more
* scalable.
*/
static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
flush_cache_vunmap(
pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}
static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT));
}
/**
* pcpu_unmap_pages - unmap pages out of a pcpu_chunk
* @chunk: chunk of interest
* @pages: pages array which can be used to pass information to free
* @page_start: page index of the first page to unmap
* @page_end: page index of the last page to unmap + 1
*
* For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
* Corresponding elements in @pages were cleared by the caller and can
* be used to carry information to pcpu_free_pages() which will be
* called after all unmaps are finished. The caller should call
* proper pre/post flush functions.
*/
static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
struct page **pages, int page_start, int page_end)
{
unsigned int cpu;
int i;
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
struct page *page;
page = pcpu_chunk_page(chunk, cpu, i);
WARN_ON(!page);
pages[pcpu_page_idx(cpu, i)] = page;
}
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
page_end - page_start);
}
}
/**
* pcpu_post_unmap_tlb_flush - flush TLB after unmapping
* @chunk: pcpu_chunk the regions to be flushed belong to
* @page_start: page index of the first page to be flushed
* @page_end: page index of the last page to be flushed + 1
*
* Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
* TLB for the regions. This can be skipped if the area is to be
* returned to vmalloc as vmalloc will handle TLB flushing lazily.
*
* As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
* for the whole region.
*/
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
flush_tlb_kernel_range(
pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
int nr_pages)
{
return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT),
PAGE_KERNEL, pages, PAGE_SHIFT);
}
/**
* pcpu_map_pages - map pages into a pcpu_chunk
* @chunk: chunk of interest
* @pages: pages array containing pages to be mapped
* @page_start: page index of the first page to map
* @page_end: page index of the last page to map + 1
*
* For each cpu, map pages [@page_start,@page_end) into @chunk. The
* caller is responsible for calling pcpu_post_map_flush() after all
* mappings are complete.
*
* This function is responsible for setting up whatever is necessary for
* reverse lookup (addr -> chunk).
*/
static int pcpu_map_pages(struct pcpu_chunk *chunk,
struct page **pages, int page_start, int page_end)
{
unsigned int cpu, tcpu;
int i, err;
for_each_possible_cpu(cpu) {
err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
&pages[pcpu_page_idx(cpu, page_start)],
page_end - page_start);
if (err < 0)
goto err;
for (i = page_start; i < page_end; i++)
pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
chunk);
}
return 0;
err:
for_each_possible_cpu(tcpu) {
if (tcpu == cpu)
break;
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
page_end - page_start);
}
pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
return err;
}
/**
* pcpu_post_map_flush - flush cache after mapping
* @chunk: pcpu_chunk the regions to be flushed belong to
* @page_start: page index of the first page to be flushed
* @page_end: page index of the last page to be flushed + 1
*
* Pages [@page_start,@page_end) of @chunk have been mapped. Flush
* cache.
*
* As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
* for the whole region.
*/
static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
flush_cache_vmap(
pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}
/**
* pcpu_populate_chunk - populate and map an area of a pcpu_chunk
* @chunk: chunk of interest
* @page_start: the start page
* @page_end: the end page
* @gfp: allocation flags passed to the underlying memory allocator
*
* For each cpu, populate and map pages [@page_start,@page_end) into
* @chunk.
*
* CONTEXT:
* pcpu_alloc_mutex, does GFP_KERNEL allocation.
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end, gfp_t gfp)
{
struct page **pages;
pages = pcpu_get_pages();
if (!pages)
return -ENOMEM;
if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp))
return -ENOMEM;
if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
pcpu_free_pages(chunk, pages, page_start, page_end);
return -ENOMEM;
}
pcpu_post_map_flush(chunk, page_start, page_end);
return 0;
}
/**
* pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
* @chunk: chunk to depopulate
* @page_start: the start page
* @page_end: the end page
*
* For each cpu, depopulate and unmap pages [@page_start,@page_end)
* from @chunk.
*
* Caller is required to call pcpu_post_unmap_tlb_flush() if not returning the
* region back to vmalloc() which will lazily flush the tlb.
*
* CONTEXT:
* pcpu_alloc_mutex.
*/
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
struct page **pages;
/*
* If control reaches here, there must have been at least one
* successful population attempt so the temp pages array must
* be available now.
*/
pages = pcpu_get_pages();
BUG_ON(!pages);
/* unmap and free */
pcpu_pre_unmap_flush(chunk, page_start, page_end);
pcpu_unmap_pages(chunk, pages, page_start, page_end);
pcpu_free_pages(chunk, pages, page_start, page_end);
}
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
struct vm_struct **vms;
chunk = pcpu_alloc_chunk(gfp);
if (!chunk)
return NULL;
vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
pcpu_nr_groups, pcpu_atom_size);
if (!vms) {
pcpu_free_chunk(chunk);
return NULL;
}
chunk->data = vms;
chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
pcpu_stats_chunk_alloc();
trace_percpu_create_chunk(chunk->base_addr);
return chunk;
}
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
pcpu_stats_chunk_dealloc();
trace_percpu_destroy_chunk(chunk->base_addr);
if (chunk->data)
pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
pcpu_free_chunk(chunk);
}
static struct page *pcpu_addr_to_page(void *addr)
{
return vmalloc_to_page(addr);
}
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
{
/* no extra restriction */
return 0;
}
/**
* pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim
* @chunk: chunk of interest
*
* This is the entry point for percpu reclaim. If a chunk qualifies, it is then
* isolated and managed in separate lists at the back of pcpu_slot: sidelined
* and to_depopulate respectively. The to_depopulate list holds chunks slated
* for depopulation. They no longer contribute to pcpu_nr_empty_pop_pages once
* they are on this list. Once depopulated, they are moved onto the sidelined
* list which enables them to be pulled back in for allocation if no other chunk
* can suffice the allocation.
*/
static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
{
/* do not reclaim either the first chunk or reserved chunk */
if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk)
return false;
/*
* If it is isolated, it may be on the sidelined list so move it back to
* the to_depopulate list. If we hit at least 1/4 pages empty pages AND
* there is no system-wide shortage of empty pages aside from this
* chunk, move it to the to_depopulate list.
*/
return ((chunk->isolated && chunk->nr_empty_pop_pages) ||
(pcpu_nr_empty_pop_pages >
(PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) && chunk->nr_empty_pop_pages >= chunk->nr_pages / 4));
}
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Authors: Lotsa people, from code originally in tcp
*/
#ifndef _INET_HASHTABLES_H
#define _INET_HASHTABLES_H
#include <linux/interrupt.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/wait.h>
#include <net/inet_connection_sock.h>
#include <net/inet_sock.h>
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/netns/hash.h>
#include <linux/refcount.h>
#include <asm/byteorder.h>
/* This is for all connections with a full identity, no wildcards.
* The 'e' prefix stands for Establish, but we really put all sockets
* but LISTEN ones.
*/
struct inet_ehash_bucket {
struct hlist_nulls_head chain;
};
/* There are a few simple rules, which allow for local port reuse by
* an application. In essence:
*
* 1) Sockets bound to different interfaces may share a local port.
* Failing that, goto test 2.
* 2) If all sockets have sk->sk_reuse set, and none of them are in
* TCP_LISTEN state, the port may be shared.
* Failing that, goto test 3.
* 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local
* address, and none of them are the same, the port may be
* shared.
* Failing this, the port cannot be shared.
*
* The interesting point, is test #2. This is what an FTP server does
* all day. To optimize this case we use a specific flag bit defined
* below. As we add sockets to a bind bucket list, we perform a
* check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN))
* As long as all sockets added to a bind bucket pass this test,
* the flag bit will be set.
* The resulting situation is that tcp_v[46]_verify_bind() can just check
* for this flag bit, if it is set and the socket trying to bind has
* sk->sk_reuse set, we don't even have to walk the owners list at all,
* we return that it is ok to bind this socket to the requested local port.
*
* Sounds like a lot of work, but it is worth it. In a more naive
* implementation (ie. current FreeBSD etc.) the entire list of ports
* must be walked for each data port opened by an ftp server. Needless
* to say, this does not scale at all. With a couple thousand FTP
* users logged onto your box, isn't it nice to know that new data
* ports are created in O(1) time? I thought so. ;-) -DaveM
*/
#define FASTREUSEPORT_ANY 1
#define FASTREUSEPORT_STRICT 2
struct inet_bind_bucket {
possible_net_t ib_net;
int l3mdev;
unsigned short port;
signed char fastreuse;
signed char fastreuseport;
kuid_t fastuid;
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr fast_v6_rcv_saddr;
#endif
__be32 fast_rcv_saddr;
unsigned short fast_sk_family;
bool fast_ipv6_only;
struct hlist_node node;
struct hlist_head owners;
};
static inline struct net *ib_net(struct inet_bind_bucket *ib)
{
return read_pnet(&ib->ib_net);
}
#define inet_bind_bucket_for_each(tb, head) \
hlist_for_each_entry(tb, head, node)
struct inet_bind_hashbucket {
spinlock_t lock;
struct hlist_head chain;
};
/* Sockets can be hashed in established or listening table.
* We must use different 'nulls' end-of-chain value for all hash buckets :
* A socket might transition from ESTABLISH to LISTEN state without
* RCU grace period. A lookup in ehash table needs to handle this case.
*/
#define LISTENING_NULLS_BASE (1U << 29)
struct inet_listen_hashbucket {
spinlock_t lock;
unsigned int count;
union {
struct hlist_head head;
struct hlist_nulls_head nulls_head;
};
};
/* This is for listening sockets, thus all sockets which possess wildcards. */
#define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
struct inet_hashinfo {
/* This is for sockets with full identity only. Sockets here will
* always be without wildcards and will have the following invariant:
*
* TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
*
*/
struct inet_ehash_bucket *ehash;
spinlock_t *ehash_locks;
unsigned int ehash_mask;
unsigned int ehash_locks_mask;
/* Ok, let's try this, I give up, we do need a local binding
* TCP hash as well as the others for fast bind/connect.
*/
struct kmem_cache *bind_bucket_cachep;
struct inet_bind_hashbucket *bhash;
unsigned int bhash_size;
/* The 2nd listener table hashed by local port and address */
unsigned int lhash2_mask;
struct inet_listen_hashbucket *lhash2;
/* All the above members are written once at bootup and
* never written again _or_ are predominantly read-access.
*
* Now align to a new cache line as all the following members
* might be often dirty.
*/
/* All sockets in TCP_LISTEN state will be in listening_hash.
* This is the only table where wildcard'd TCP sockets can
* exist. listening_hash is only hashed by local port number.
* If lhash2 is initialized, the same socket will also be hashed
* to lhash2 by port and address.
*/
struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE]
____cacheline_aligned_in_smp;
};
#define inet_lhash2_for_each_icsk_continue(__icsk) \
hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node)
#define inet_lhash2_for_each_icsk(__icsk, list) \
hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node)
#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)
static inline struct inet_listen_hashbucket *
inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
{
return &h->lhash2[hash & h->lhash2_mask];
}
static inline struct inet_ehash_bucket *inet_ehash_bucket(
struct inet_hashinfo *hashinfo,
unsigned int hash)
{
return &hashinfo->ehash[hash & hashinfo->ehash_mask];
}
static inline spinlock_t *inet_ehash_lockp(
struct inet_hashinfo *hashinfo,
unsigned int hash)
{
return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask];
}
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo);
static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h)
{
kfree(h->lhash2);
h->lhash2 = NULL;
}
static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
{
kvfree(hashinfo->ehash_locks);
hashinfo->ehash_locks = NULL;
}
static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if,
int dif, int sdif)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
return inet_bound_dev_eq(!!net->ipv4.sysctl_tcp_l3mdev_accept,
bound_dev_if, dif, sdif);
#else
return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
#endif
}
struct inet_bind_bucket *
inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
const unsigned short snum, int l3mdev);
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
struct inet_bind_bucket *tb);
static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
const u32 bhash_size)
{
return (lport + net_hash_mix(net)) & (bhash_size - 1);
}
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum);
/* These can have wildcards, don't try too hard. */
static inline u32 inet_lhashfn(const struct net *net, const unsigned short num)
{
return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1);
}
static inline int inet_sk_listen_hashfn(const struct sock *sk)
{
return inet_lhashfn(sock_net(sk), inet_sk(sk)->inet_num);
}
/* Caller must disable local BH processing. */
int __inet_inherit_port(const struct sock *sk, struct sock *child);
void inet_put_port(struct sock *sk);
void inet_hashinfo_init(struct inet_hashinfo *h);
void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
unsigned long numentries, int scale,
unsigned long low_limit,
unsigned long high_limit);
int inet_hashinfo2_init_mod(struct inet_hashinfo *h);
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk);
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk,
bool *found_dup_sk);
int __inet_hash(struct sock *sk, struct sock *osk);
int inet_hash(struct sock *sk);
void inet_unhash(struct sock *sk);
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be32 saddr, const __be16 sport,
const __be32 daddr,
const unsigned short hnum,
const int dif, const int sdif);
static inline struct sock *inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
__be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif, int sdif)
{
return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
daddr, ntohs(dport), dif, sdif);
}
/* Socket demux engine toys. */
/* What happens here is ugly; there's a pair of adjacent fields in
struct inet_sock; __be16 dport followed by __u16 num. We want to
search by pair, so we combine the keys into a single 32bit value
and compare with 32bit value read from &...->dport. Let's at least
make sure that it's not mixed with anything else...
On 64bit targets we combine comparisons with pair of adjacent __be32
fields in the same way.
*/
#ifdef __BIG_ENDIAN
#define INET_COMBINED_PORTS(__sport, __dport) \
((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport)))
#else /* __LITTLE_ENDIAN */
#define INET_COMBINED_PORTS(__sport, __dport) \
((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport)))
#endif
#if (BITS_PER_LONG == 64)
#ifdef __BIG_ENDIAN
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
const __addrpair __name = (__force __addrpair) ( \
(((__force __u64)(__be32)(__saddr)) << 32) | \
((__force __u64)(__be32)(__daddr)))
#else /* __LITTLE_ENDIAN */
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
const __addrpair __name = (__force __addrpair) ( \
(((__force __u64)(__be32)(__daddr)) << 32) | \
((__force __u64)(__be32)(__saddr)))
#endif /* __BIG_ENDIAN */
#define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \
(((__sk)->sk_portpair == (__ports)) && \
((__sk)->sk_addrpair == (__cookie)) && \
(((__sk)->sk_bound_dev_if == (__dif)) || \
((__sk)->sk_bound_dev_if == (__sdif))) && \
net_eq(sock_net(__sk), (__net)))
#else /* 32-bit arch */
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
const int __name __deprecated __attribute__((unused))
#define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \
(((__sk)->sk_portpair == (__ports)) && \
((__sk)->sk_daddr == (__saddr)) && \
((__sk)->sk_rcv_saddr == (__daddr)) && \
(((__sk)->sk_bound_dev_if == (__dif)) || \
((__sk)->sk_bound_dev_if == (__sdif))) && \
net_eq(sock_net(__sk), (__net)))
#endif /* 64-bit arch */
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
* not check it for lookups anymore, thanks Alexey. -DaveM
*/
struct sock *__inet_lookup_established(struct net *net,
struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const u16 hnum,
const int dif, const int sdif);
static inline struct sock *
inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const __be16 dport,
const int dif)
{
return __inet_lookup_established(net, hashinfo, saddr, sport, daddr,
ntohs(dport), dif, 0);
}
static inline struct sock *__inet_lookup(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const __be16 dport,
const int dif, const int sdif,
bool *refcounted)
{
u16 hnum = ntohs(dport);
struct sock *sk;
sk = __inet_lookup_established(net, hashinfo, saddr, sport,
daddr, hnum, dif, sdif);
*refcounted = true;
if (sk)
return sk;
*refcounted = false;
return __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
sport, daddr, hnum, dif, sdif);
}
static inline struct sock *inet_lookup(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const __be16 dport,
const int dif)
{
struct sock *sk;
bool refcounted;
sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
dport, dif, 0, &refcounted);
if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
}
static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
struct sk_buff *skb,
int doff,
const __be16 sport,
const __be16 dport,
const int sdif,
bool *refcounted)
{
struct sock *sk = skb_steal_sock(skb, refcounted);
const struct iphdr *iph = ip_hdr(skb);
if (sk)
return sk;
return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
doff, iph->saddr, sport,
iph->daddr, dport, inet_iif(skb), sdif,
refcounted);
}
u32 inet6_ehashfn(const struct net *net,
const struct in6_addr *laddr, const u16 lport,
const struct in6_addr *faddr, const __be16 fport);
static inline void sk_daddr_set(struct sock *sk, __be32 addr)
{
sk->sk_daddr = addr; /* alias of inet_daddr */
#if IS_ENABLED(CONFIG_IPV6)
ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr);
#endif
}
static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr)
{
sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */
#if IS_ENABLED(CONFIG_IPV6)
ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr);
#endif
}
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u32 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
struct sock *, __u16,
struct inet_timewait_sock **));
int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk);
#endif /* _INET_HASHTABLES_H */
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
/*
* Notifier list for kernel code which wants to be called
* at shutdown. This is used to stop any idling DMA operations
* and the like.
*/
BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
/*
* Notifier chain core routines. The exported routines below
* are layered on top of these, with appropriate locking added.
*/
static int notifier_chain_register(struct notifier_block **nl,
struct notifier_block *n)
{
while ((*nl) != NULL) {
if (unlikely((*nl) == n)) {
WARN(1, "double register detected");
return 0;
}
if (n->priority > (*nl)->priority)
break;
nl = &((*nl)->next);
}
n->next = *nl;
rcu_assign_pointer(*nl, n);
return 0;
}
static int notifier_chain_unregister(struct notifier_block **nl,
struct notifier_block *n)
{
while ((*nl) != NULL) {
if ((*nl) == n) {
rcu_assign_pointer(*nl, n->next);
return 0;
}
nl = &((*nl)->next);
}
return -ENOENT;
}
/**
* notifier_call_chain - Informs the registered notifiers about an event.
* @nl: Pointer to head of the blocking notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
* @nr_to_call: Number of notifier functions to be called. Don't care
* value of this parameter is -1.
* @nr_calls: Records the number of notifications sent. Don't care
* value of this field is NULL.
* @returns: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
static int notifier_call_chain(struct notifier_block **nl,
unsigned long val, void *v,
int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
struct notifier_block *nb, *next_nb;
nb = rcu_dereference_raw(*nl);
while (nb && nr_to_call) { next_nb = rcu_dereference_raw(nb->next);
#ifdef CONFIG_DEBUG_NOTIFIERS
if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
WARN(1, "Invalid notifier called!");
nb = next_nb;
continue;
}
#endif
ret = nb->notifier_call(nb, val, v);
if (nr_calls)
(*nr_calls)++;
if (ret & NOTIFY_STOP_MASK)
break;
nb = next_nb;
nr_to_call--;
}
return ret;
}
NOKPROBE_SYMBOL(notifier_call_chain);
/**
* notifier_call_chain_robust - Inform the registered notifiers about an event
* and rollback on error.
* @nl: Pointer to head of the blocking notifier chain
* @val_up: Value passed unmodified to the notifier function
* @val_down: Value passed unmodified to the notifier function when recovering
* from an error on @val_up
* @v Pointer passed unmodified to the notifier function
*
* NOTE: It is important the @nl chain doesn't change between the two
* invocations of notifier_call_chain() such that we visit the
* exact same notifier callbacks; this rules out any RCU usage.
*
* Returns: the return value of the @val_up call.
*/
static int notifier_call_chain_robust(struct notifier_block **nl,
unsigned long val_up, unsigned long val_down,
void *v)
{
int ret, nr = 0;
ret = notifier_call_chain(nl, val_up, v, -1, &nr);
if (ret & NOTIFY_STOP_MASK)
notifier_call_chain(nl, val_down, v, nr-1, NULL);
return ret;
}
/*
* Atomic notifier chain routines. Registration and unregistration
* use a spinlock, and call_chain is synchronized by RCU (no locks).
*/
/**
* atomic_notifier_chain_register - Add notifier to an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @n: New entry in notifier chain
*
* Adds a notifier to an atomic notifier chain.
*
* Currently always returns zero.
*/
int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
struct notifier_block *n)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&nh->lock, flags);
ret = notifier_chain_register(&nh->head, n);
spin_unlock_irqrestore(&nh->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
/**
* atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @n: Entry to remove from notifier chain
*
* Removes a notifier from an atomic notifier chain.
*
* Returns zero on success or %-ENOENT on failure.
*/
int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
struct notifier_block *n)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&nh->lock, flags);
ret = notifier_chain_unregister(&nh->head, n);
spin_unlock_irqrestore(&nh->lock, flags);
synchronize_rcu();
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
/**
* atomic_notifier_call_chain - Call functions in an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
*
* Calls each function in a notifier chain in turn. The functions
* run in an atomic context, so they must not block.
* This routine uses RCU to synchronize with changes to the chain.
*
* If the return value of the notifier can be and'ed
* with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
* will return immediately, with the return value of
* the notifier function which halted execution.
* Otherwise the return value is the return value
* of the last notifier function called.
*/
int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
unsigned long val, void *v)
{
int ret;
rcu_read_lock();
ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
NOKPROBE_SYMBOL(atomic_notifier_call_chain);
/*
* Blocking notifier chain routines. All access to the chain is
* synchronized by an rwsem.
*/
/**
* blocking_notifier_chain_register - Add notifier to a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @n: New entry in notifier chain
*
* Adds a notifier to a blocking notifier chain.
* Must be called in process context.
*
* Currently always returns zero.
*/
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
struct notifier_block *n)
{
int ret;
/*
* This code gets used during boot-up, when task switching is
* not yet working and interrupts must remain disabled. At
* such times we must not call down_write().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
return notifier_chain_register(&nh->head, n);
down_write(&nh->rwsem);
ret = notifier_chain_register(&nh->head, n);
up_write(&nh->rwsem);
return ret;
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
/**
* blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @n: Entry to remove from notifier chain
*
* Removes a notifier from a blocking notifier chain.
* Must be called from process context.
*
* Returns zero on success or %-ENOENT on failure.
*/
int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
struct notifier_block *n)
{
int ret;
/*
* This code gets used during boot-up, when task switching is
* not yet working and interrupts must remain disabled. At
* such times we must not call down_write().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
return notifier_chain_unregister(&nh->head, n);
down_write(&nh->rwsem);
ret = notifier_chain_unregister(&nh->head, n);
up_write(&nh->rwsem);
return ret;
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
unsigned long val_up, unsigned long val_down, void *v)
{
int ret = NOTIFY_DONE;
/*
* We check the head outside the lock, but if this access is
* racy then it does not matter what the result of the test
* is, we re-check the list after having taken the lock anyway:
*/
if (rcu_access_pointer(nh->head)) {
down_read(&nh->rwsem);
ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
up_read(&nh->rwsem);
}
return ret;
}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust);
/**
* blocking_notifier_call_chain - Call functions in a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
*
* If the return value of the notifier can be and'ed
* with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
* will return immediately, with the return value of
* the notifier function which halted execution.
* Otherwise the return value is the return value
* of the last notifier function called.
*/
int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
unsigned long val, void *v)
{
int ret = NOTIFY_DONE;
/*
* We check the head outside the lock, but if this access is
* racy then it does not matter what the result of the test
* is, we re-check the list after having taken the lock anyway:
*/
if (rcu_access_pointer(nh->head)) {
down_read(&nh->rwsem);
ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
up_read(&nh->rwsem);
}
return ret;
}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
/*
* Raw notifier chain routines. There is no protection;
* the caller must provide it. Use at your own risk!
*/
/**
* raw_notifier_chain_register - Add notifier to a raw notifier chain
* @nh: Pointer to head of the raw notifier chain
* @n: New entry in notifier chain
*
* Adds a notifier to a raw notifier chain.
* All locking must be provided by the caller.
*
* Currently always returns zero.
*/
int raw_notifier_chain_register(struct raw_notifier_head *nh,
struct notifier_block *n)
{
return notifier_chain_register(&nh->head, n);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
/**
* raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
* @nh: Pointer to head of the raw notifier chain
* @n: Entry to remove from notifier chain
*
* Removes a notifier from a raw notifier chain.
* All locking must be provided by the caller.
*
* Returns zero on success or %-ENOENT on failure.
*/
int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
struct notifier_block *n)
{
return notifier_chain_unregister(&nh->head, n);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
unsigned long val_up, unsigned long val_down, void *v)
{
return notifier_call_chain_robust(&nh->head, val_up, val_down, v);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust);
/**
* raw_notifier_call_chain - Call functions in a raw notifier chain
* @nh: Pointer to head of the raw notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
*
* Calls each function in a notifier chain in turn. The functions
* run in an undefined context.
* All locking must be provided by the caller.
*
* If the return value of the notifier can be and'ed
* with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
* will return immediately, with the return value of
* the notifier function which halted execution.
* Otherwise the return value is the return value
* of the last notifier function called.
*/
int raw_notifier_call_chain(struct raw_notifier_head *nh,
unsigned long val, void *v)
{
return notifier_call_chain(&nh->head, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
#ifdef CONFIG_SRCU
/*
* SRCU notifier chain routines. Registration and unregistration
* use a mutex, and call_chain is synchronized by SRCU (no locks).
*/
/**
* srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
* @nh: Pointer to head of the SRCU notifier chain
* @n: New entry in notifier chain
*
* Adds a notifier to an SRCU notifier chain.
* Must be called in process context.
*
* Currently always returns zero.
*/
int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
struct notifier_block *n)
{
int ret;
/*
* This code gets used during boot-up, when task switching is
* not yet working and interrupts must remain disabled. At
* such times we must not call mutex_lock().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
return notifier_chain_register(&nh->head, n);
mutex_lock(&nh->mutex);
ret = notifier_chain_register(&nh->head, n);
mutex_unlock(&nh->mutex);
return ret;
}
EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);
/**
* srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
* @nh: Pointer to head of the SRCU notifier chain
* @n: Entry to remove from notifier chain
*
* Removes a notifier from an SRCU notifier chain.
* Must be called from process context.
*
* Returns zero on success or %-ENOENT on failure.
*/
int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
struct notifier_block *n)
{
int ret;
/*
* This code gets used during boot-up, when task switching is
* not yet working and interrupts must remain disabled. At
* such times we must not call mutex_lock().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
return notifier_chain_unregister(&nh->head, n);
mutex_lock(&nh->mutex);
ret = notifier_chain_unregister(&nh->head, n);
mutex_unlock(&nh->mutex);
synchronize_srcu(&nh->srcu);
return ret;
}
EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
/**
* srcu_notifier_call_chain - Call functions in an SRCU notifier chain
* @nh: Pointer to head of the SRCU notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
*
* If the return value of the notifier can be and'ed
* with %NOTIFY_STOP_MASK then srcu_notifier_call_chain()
* will return immediately, with the return value of
* the notifier function which halted execution.
* Otherwise the return value is the return value
* of the last notifier function called.
*/
int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
unsigned long val, void *v)
{
int ret;
int idx;
idx = srcu_read_lock(&nh->srcu);
ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
srcu_read_unlock(&nh->srcu, idx);
return ret;
}
EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
/**
* srcu_init_notifier_head - Initialize an SRCU notifier head
* @nh: Pointer to head of the srcu notifier chain
*
* Unlike other sorts of notifier heads, SRCU notifier heads require
* dynamic initialization. Be sure to call this routine before
* calling any of the other SRCU notifier routines for this head.
*
* If an SRCU notifier head is deallocated, it must first be cleaned
* up by calling srcu_cleanup_notifier_head(). Otherwise the head's
* per-cpu data (used by the SRCU mechanism) will leak.
*/
void srcu_init_notifier_head(struct srcu_notifier_head *nh)
{
mutex_init(&nh->mutex);
if (init_srcu_struct(&nh->srcu) < 0)
BUG();
nh->head = NULL;
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
#endif /* CONFIG_SRCU */
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace notify_die(enum die_val val, const char *str,
struct pt_regs *regs, long err, int trap, int sig)
{
struct die_args args = {
.regs = regs,
.str = str,
.err = err,
.trapnr = trap,
.signr = sig,
};
RCU_LOCKDEP_WARN(!rcu_is_watching(),
"notify_die called but RCU thinks we're quiescent");
return atomic_notifier_call_chain(&die_chain, val, &args);
}
NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);
int unregister_die_notifier(struct notifier_block *nb)
{
return atomic_notifier_chain_unregister(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_die_notifier);
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
#include <linux/uaccess.h>
#include "internal.h"
/**
* kfree_const - conditionally free memory
* @x: pointer to the memory
*
* Function calls kfree only if @x is not in .rodata section.
*/
void kfree_const(const void *x)
{
if (!is_kernel_rodata((unsigned long)x)) kfree(x);
}
EXPORT_SYMBOL(kfree_const);
/**
* kstrdup - allocate space for and copy an existing string
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
* Return: newly allocated copy of @s or %NULL in case of error
*/
char *kstrdup(const char *s, gfp_t gfp)
{
size_t len;
char *buf;
if (!s)
return NULL;
len = strlen(s) + 1;
buf = kmalloc_track_caller(len, gfp);
if (buf)
memcpy(buf, s, len);
return buf;
}
EXPORT_SYMBOL(kstrdup);
/**
* kstrdup_const - conditionally duplicate an existing const string
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
* Note: Strings allocated by kstrdup_const should be freed by kfree_const and
* must not be passed to krealloc().
*
* Return: source string if it is in .rodata section otherwise
* fallback to kstrdup.
*/
const char *kstrdup_const(const char *s, gfp_t gfp)
{
if (is_kernel_rodata((unsigned long)s))
return s;
return kstrdup(s, gfp);}
EXPORT_SYMBOL(kstrdup_const);
/**
* kstrndup - allocate space for and copy an existing string
* @s: the string to duplicate
* @max: read at most @max chars from @s
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
* Note: Use kmemdup_nul() instead if the size is known exactly.
*
* Return: newly allocated copy of @s or %NULL in case of error
*/
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
size_t len;
char *buf;
if (!s)
return NULL;
len = strnlen(s, max);
buf = kmalloc_track_caller(len+1, gfp);
if (buf) {
memcpy(buf, s, len);
buf[len] = '\0';
}
return buf;
}
EXPORT_SYMBOL(kstrndup);
/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
* @len: memory region length
* @gfp: GFP mask to use
*
* Return: newly allocated copy of @src or %NULL in case of error
*/
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
void *p;
p = kmalloc_track_caller(len, gfp);
if (p)
memcpy(p, src, len); return p;
}
EXPORT_SYMBOL(kmemdup);
/**
* kmemdup_nul - Create a NUL-terminated string from unterminated data
* @s: The data to stringify
* @len: The size of the data
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
* Return: newly allocated copy of @s with NUL-termination or %NULL in
* case of error
*/
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
char *buf;
if (!s)
return NULL;
buf = kmalloc_track_caller(len + 1, gfp);
if (buf) {
memcpy(buf, s, len); buf[len] = '\0';
}
return buf;
}
EXPORT_SYMBOL(kmemdup_nul);
/**
* memdup_user - duplicate memory region from user space
*
* @src: source address in user space
* @len: number of bytes to copy
*
* Return: an ERR_PTR() on failure. Result is physically
* contiguous, to be freed by kfree().
*/
void *memdup_user(const void __user *src, size_t len)
{
void *p;
p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
if (!p)
return ERR_PTR(-ENOMEM);
if (copy_from_user(p, src, len)) { kfree(p);
return ERR_PTR(-EFAULT);
}
return p;
}
EXPORT_SYMBOL(memdup_user);
/**
* vmemdup_user - duplicate memory region from user space
*
* @src: source address in user space
* @len: number of bytes to copy
*
* Return: an ERR_PTR() on failure. Result may be not
* physically contiguous. Use kvfree() to free.
*/
void *vmemdup_user(const void __user *src, size_t len)
{
void *p;
p = kvmalloc(len, GFP_USER);
if (!p)
return ERR_PTR(-ENOMEM);
if (copy_from_user(p, src, len)) {
kvfree(p);
return ERR_PTR(-EFAULT);
}
return p;
}
EXPORT_SYMBOL(vmemdup_user);
/**
* strndup_user - duplicate an existing string from user space
* @s: The string to duplicate
* @n: Maximum number of bytes to copy, including the trailing NUL.
*
* Return: newly allocated copy of @s or an ERR_PTR() in case of error
*/
char *strndup_user(const char __user *s, long n)
{
char *p;
long length;
length = strnlen_user(s, n);
if (!length)
return ERR_PTR(-EFAULT);
if (length > n)
return ERR_PTR(-EINVAL);
p = memdup_user(s, length);
if (IS_ERR(p))
return p;
p[length - 1] = '\0'; return p;
}
EXPORT_SYMBOL(strndup_user);
/**
* memdup_user_nul - duplicate memory region from user space and NUL-terminate
*
* @src: source address in user space
* @len: number of bytes to copy
*
* Return: an ERR_PTR() on failure.
*/
void *memdup_user_nul(const void __user *src, size_t len)
{
char *p;
/*
* Always use GFP_KERNEL, since copy_from_user() can sleep and
* cause pagefault, which makes it pointless to use GFP_NOFS
* or GFP_ATOMIC.
*/
p = kmalloc_track_caller(len + 1, GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
if (copy_from_user(p, src, len)) {
kfree(p);
return ERR_PTR(-EFAULT);
}
p[len] = '\0';
return p;
}
EXPORT_SYMBOL(memdup_user_nul);
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
struct vm_area_struct *next;
vma->vm_prev = prev;
if (prev) {
next = prev->vm_next;
prev->vm_next = vma;
} else {
next = mm->mmap;
mm->mmap = vma;
}
vma->vm_next = next;
if (next)
next->vm_prev = vma;
}
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
struct vm_area_struct *prev, *next;
next = vma->vm_next;
prev = vma->vm_prev;
if (prev)
prev->vm_next = next;
else
mm->mmap = next;
if (next)
next->vm_prev = prev;
}
/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
struct task_struct * __maybe_unused t = current;
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
/*
* Change backing file, only valid to use during initial VMA setup.
*/
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
/* Changing an anonymous vma with this is illegal */
get_file(file);
swap(vma->vm_file, file);
fput(file);
}
EXPORT_SYMBOL(vma_set_file);
#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
#endif
unsigned long randomize_stack_top(unsigned long stack_top)
{
unsigned long random_variable = 0;
if (current->flags & PF_RANDOMIZE) {
random_variable = get_random_long();
random_variable &= STACK_RND_MASK;
random_variable <<= PAGE_SHIFT;
}
#ifdef CONFIG_STACK_GROWSUP
return PAGE_ALIGN(stack_top) + random_variable;
#else
return PAGE_ALIGN(stack_top) - random_variable;
#endif
}
#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
/* Is the current task 32bit ? */
if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
return randomize_page(mm->brk, SZ_32M);
return randomize_page(mm->brk, SZ_1G);
}
unsigned long arch_mmap_rnd(void)
{
unsigned long rnd;
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
if (is_compat_task())
rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
return rnd << PAGE_SHIFT;
}
static int mmap_is_legacy(struct rlimit *rlim_stack)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
if (rlim_stack->rlim_cur == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
}
/*
* Leave enough space between the mmap area and the stack to honour ulimit in
* the face of randomisation.
*/
#define MIN_GAP (SZ_128M)
#define MAX_GAP (STACK_TOP / 6 * 5)
static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
unsigned long gap = rlim_stack->rlim_cur;
unsigned long pad = stack_guard_gap;
/* Account for stack randomization if necessary */
if (current->flags & PF_RANDOMIZE)
pad += (STACK_RND_MASK << PAGE_SHIFT);
/* Values close to RLIM_INFINITY can overflow. */
if (gap + pad > gap)
gap += pad;
if (gap < MIN_GAP)
gap = MIN_GAP;
else if (gap > MAX_GAP)
gap = MAX_GAP;
return PAGE_ALIGN(STACK_TOP - gap - rnd);
}
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
unsigned long random_factor = 0UL;
if (current->flags & PF_RANDOMIZE)
random_factor = arch_mmap_rnd();
if (mmap_is_legacy(rlim_stack)) {
mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
mm->mmap_base = mmap_base(random_factor, rlim_stack);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
}
#endif
/**
* __account_locked_vm - account locked pages to an mm's locked_vm
* @mm: mm to account against
* @pages: number of pages to account
* @inc: %true if @pages should be considered positive, %false if not
* @task: task used to check RLIMIT_MEMLOCK
* @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
*
* Assumes @task and @mm are valid (i.e. at least one reference on each), and
* that mmap_lock is held as writer.
*
* Return:
* * 0 on success
* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
*/
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
struct task_struct *task, bool bypass_rlim)
{
unsigned long locked_vm, limit;
int ret = 0;
mmap_assert_write_locked(mm);
locked_vm = mm->locked_vm;
if (inc) {
if (!bypass_rlim) {
limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked_vm + pages > limit)
ret = -ENOMEM;
}
if (!ret)
mm->locked_vm = locked_vm + pages;
} else {
WARN_ON_ONCE(pages > locked_vm);
mm->locked_vm = locked_vm - pages;
}
pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
(void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
ret ? " - exceeded" : "");
return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);
/**
* account_locked_vm - account locked pages to an mm's locked_vm
* @mm: mm to account against, may be NULL
* @pages: number of pages to account
* @inc: %true if @pages should be considered positive, %false if not
*
* Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
*
* Return:
* * 0 on success, or if mm is NULL
* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
*/
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
int ret;
if (pages == 0 || !mm)
return 0;
mmap_write_lock(mm);
ret = __account_locked_vm(mm, pages, inc, current,
capable(CAP_IPC_LOCK));
mmap_write_unlock(mm);
return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
unsigned long ret;
struct mm_struct *mm = current->mm;
unsigned long populate;
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
&uf);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(ret, populate);
}
return ret;
}
unsigned long vm_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long offset)
{
if (unlikely(offset + PAGE_ALIGN(len) < offset))
return -EINVAL;
if (unlikely(offset_in_page(offset)))
return -EINVAL;
return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);
/**
* kvmalloc_node - attempt to allocate physically contiguous memory, but upon
* failure, fall back to non-contiguous (vmalloc) allocation.
* @size: size of the request.
* @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
* @node: numa node to allocate from
*
* Uses kmalloc to get the memory but if the allocation fails then falls back
* to the vmalloc allocator. Use kvfree for freeing the memory.
*
* Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
* __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
* preferable to the vmalloc fallback, due to visible performance drawbacks.
*
* Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
* fall back to vmalloc.
*
* Return: pointer to the allocated memory of %NULL in case of failure
*/
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
gfp_t kmalloc_flags = flags;
void *ret;
/*
* vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
* so the given set of flags has to be compatible.
*/
if ((flags & GFP_KERNEL) != GFP_KERNEL)
return kmalloc_node(size, flags, node);
/*
* We want to attempt a large physically contiguous block first because
* it is less likely to fragment multiple larger blocks and therefore
* contribute to a long term fragmentation less than vmalloc fallback.
* However make sure that larger requests are not too disruptive - no
* OOM killer and no allocation failure warnings as we have a fallback.
*/
if (size > PAGE_SIZE) { kmalloc_flags |= __GFP_NOWARN; if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY;
}
ret = kmalloc_node(size, kmalloc_flags, node);
/*
* It doesn't really make sense to fallback to vmalloc for sub page
* requests
*/
if (ret || size <= PAGE_SIZE)
return ret;
/* Don't even allow crazy sizes */
if (unlikely(size > INT_MAX)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN));
return NULL;
}
return __vmalloc_node(size, 1, flags, node,
__builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);
/**
* kvfree() - Free memory.
* @addr: Pointer to allocated memory.
*
* kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
* It is slightly more efficient to use kfree() or vfree() if you are certain
* that you know which one to use.
*
* Context: Either preemptible task context or not-NMI interrupt.
*/
void kvfree(const void *addr)
{
if (is_vmalloc_addr(addr)) vfree(addr);
else
kfree(addr);
}
EXPORT_SYMBOL(kvfree);
/**
* kvfree_sensitive - Free a data object containing sensitive information.
* @addr: address of the data object to be freed.
* @len: length of the data object.
*
* Use the special memzero_explicit() function to clear the content of a
* kvmalloc'ed object containing sensitive data to make sure that the
* compiler won't optimize out the data clearing.
*/
void kvfree_sensitive(const void *addr, size_t len)
{
if (likely(!ZERO_OR_NULL_PTR(addr))) {
memzero_explicit((void *)addr, len);
kvfree(addr);
}
}
EXPORT_SYMBOL(kvfree_sensitive);
void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
{
void *newp;
if (oldsize >= newsize)
return (void *)p;
newp = kvmalloc(newsize, flags);
if (!newp)
return NULL;
memcpy(newp, p, oldsize);
kvfree(p);
return newp;
}
EXPORT_SYMBOL(kvrealloc);
static inline void *__page_rmapping(struct page *page)
{
unsigned long mapping;
mapping = (unsigned long)page->mapping;
mapping &= ~PAGE_MAPPING_FLAGS;
return (void *)mapping;
}
/* Neutral page->mapping pointer to address_space or anon_vma or other */
void *page_rmapping(struct page *page)
{
page = compound_head(page);
return __page_rmapping(page);
}
/*
* Return true if this page is mapped into pagetables.
* For compound page it returns true if any subpage of compound page is mapped.
*/
bool page_mapped(struct page *page)
{
int i;
if (likely(!PageCompound(page)))
return atomic_read(&page->_mapcount) >= 0;
page = compound_head(page);
if (atomic_read(compound_mapcount_ptr(page)) >= 0)
return true;
if (PageHuge(page))
return false;
for (i = 0; i < compound_nr(page); i++) { if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
return false;
}
EXPORT_SYMBOL(page_mapped);
struct anon_vma *page_anon_vma(struct page *page)
{
unsigned long mapping;
page = compound_head(page);
mapping = (unsigned long)page->mapping;
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
return NULL;
return __page_rmapping(page);
}
struct address_space *page_mapping(struct page *page)
{
struct address_space *mapping;
page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
return NULL;
if (unlikely(PageSwapCache(page))) {
swp_entry_t entry;
entry.val = page_private(page); return swap_address_space(entry);
}
mapping = page->mapping;
if ((unsigned long)mapping & PAGE_MAPPING_ANON)
return NULL;
return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);}
EXPORT_SYMBOL(page_mapping);
/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
{
int ret;
ret = atomic_read(&page->_mapcount) + 1;
/*
* For file THP page->_mapcount contains total number of mapping
* of the page: no need to look into compound_mapcount.
*/
if (!PageAnon(page) && !PageHuge(page))
return ret;
page = compound_head(page);
ret += atomic_read(compound_mapcount_ptr(page)) + 1;
if (PageDoubleMap(page))
ret--;
return ret;
}
EXPORT_SYMBOL_GPL(__page_mapcount);
void copy_huge_page(struct page *dst, struct page *src)
{
unsigned i, nr = compound_nr(src);
for (i = 0; i < nr; i++) {
cond_resched();
copy_highpage(nth_page(dst, i), nth_page(src, i));
}
}
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
sysctl_overcommit_kbytes = 0;
return ret;
}
static void sync_overcommit_as(struct work_struct *dummy)
{
percpu_counter_sync(&vm_committed_as);
}
int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int new_policy = -1;
int ret;
/*
* The deviation of sync_overcommit_as could be big with loose policy
* like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
* strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
* with the strict "NEVER", and to avoid possible race condition (even
* though user usually won't too frequently do the switching to policy
* OVERCOMMIT_NEVER), the switch is done in the following order:
* 1. changing the batch
* 2. sync percpu count on each CPU
* 3. switch the policy
*/
if (write) {
t = *table;
t.data = &new_policy;
ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (ret || new_policy == -1)
return ret;
mm_compute_batch(new_policy);
if (new_policy == OVERCOMMIT_NEVER)
schedule_on_each_cpu(sync_overcommit_as);
sysctl_overcommit_memory = new_policy;
} else {
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
return ret;
}
int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
sysctl_overcommit_ratio = 0;
return ret;
}
/*
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
*/
unsigned long vm_commit_limit(void)
{
unsigned long allowed;
if (sysctl_overcommit_kbytes)
allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
else
allowed = ((totalram_pages() - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100);
allowed += total_swap_pages;
return allowed;
}
/*
* Make sure vm_committed_as in one cacheline and not cacheline shared with
* other variables. It can be updated by several CPUs frequently.
*/
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
/*
* The global memory commitment made in the system can be a metric
* that can be used to drive ballooning decisions when Linux is hosted
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
* balancing memory across competing virtual machines that are hosted.
* Several metrics drive this policy engine including the guest reported
* memory commitment.
*
* The time cost of this is very low for small platforms, and for big
* platform like a 2S/36C/72T Skylake server, in worst case where
* vm_committed_as's spinlock is under severe contention, the time cost
* could be about 30~40 microseconds.
*/
unsigned long vm_memory_committed(void)
{
return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);
/*
* Check that a process has enough memory to allocate a new virtual
* mapping. 0 means there is enough memory for the allocation to
* succeed and -ENOMEM implies there is not.
*
* We currently support three overcommit policies, which are set via the
* vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst
*
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
* Additional code 2002 Jul 20 by Robert Love.
*
* cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
*
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic.
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
long allowed;
vm_acct_memory(pages);
/*
* Sometimes we want to use more memory than we have
*/
if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
if (pages > totalram_pages() + total_swap_pages)
goto error;
return 0;
}
allowed = vm_commit_limit();
/*
* Reserve some for root
*/
if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
/*
* Don't let a single process grow so big a user can't recover
*/
if (mm) {
long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed -= min_t(long, mm->total_vm / 32, reserve);
}
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
vm_unacct_memory(pages);
return -ENOMEM;
}
/**
* get_cmdline() - copy the cmdline value to a buffer.
* @task: the task whose cmdline value to copy.
* @buffer: the buffer to copy to.
* @buflen: the length of the buffer. Larger cmdline values are truncated
* to this length.
*
* Return: the size of the cmdline field copied. Note that the copy does
* not guarantee an ending NULL byte.
*/
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
int res = 0;
unsigned int len;
struct mm_struct *mm = get_task_mm(task);
unsigned long arg_start, arg_end, env_start, env_end;
if (!mm)
goto out;
if (!mm->arg_end)
goto out_mm; /* Shh! No looking before we're done */
spin_lock(&mm->arg_lock);
arg_start = mm->arg_start;
arg_end = mm->arg_end;
env_start = mm->env_start;
env_end = mm->env_end;
spin_unlock(&mm->arg_lock);
len = arg_end - arg_start;
if (len > buflen)
len = buflen;
res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
/*
* If the nul at the end of args has been overwritten, then
* assume application is using setproctitle(3).
*/
if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
len = strnlen(buffer, res);
if (len < res) {
res = len;
} else {
len = env_end - env_start;
if (len > buflen - res)
len = buflen - res;
res += access_process_vm(task, env_start,
buffer+res, len,
FOLL_FORCE);
res = strnlen(buffer, res);
}
}
out_mm:
mmput(mm);
out:
return res;
}
int __weak memcmp_pages(struct page *page1, struct page *page2)
{
char *addr1, *addr2;
int ret;
addr1 = kmap_atomic(page1);
addr2 = kmap_atomic(page2);
ret = memcmp(addr1, addr2, PAGE_SIZE);
kunmap_atomic(addr2);
kunmap_atomic(addr1);
return ret;
}
#ifdef CONFIG_PRINTK
/**
* mem_dump_obj - Print available provenance information
* @object: object for which to find provenance information.
*
* This function uses pr_cont(), so that the caller is expected to have
* printed out whatever preamble is appropriate. The provenance information
* depends on the type of object and on how much debugging is enabled.
* For example, for a slab-cache object, the slab name is printed, and,
* if available, the return address and stack trace from the allocation
* and last free path of that object.
*/
void mem_dump_obj(void *object)
{
const char *type;
if (kmem_valid_obj(object)) {
kmem_dump_obj(object);
return;
}
if (vmalloc_dump_obj(object))
return;
if (virt_addr_valid(object))
type = "non-slab/vmalloc memory";
else if (object == NULL)
type = "NULL pointer";
else if (object == ZERO_SIZE_PTR)
type = "zero-size pointer";
else
type = "non-paged memory";
pr_cont(" %s\n", type);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif
/*
* A driver might set a page logically offline -- PageOffline() -- and
* turn the page inaccessible in the hypervisor; after that, access to page
* content can be fatal.
*
* Some special PFN walkers -- i.e., /proc/kcore -- read content of random
* pages after checking PageOffline(); however, these PFN walkers can race
* with drivers that set PageOffline().
*
* page_offline_freeze()/page_offline_thaw() allows for a subsystem to
* synchronize with such drivers, achieving that a page cannot be set
* PageOffline() while frozen.
*
* page_offline_begin()/page_offline_end() is used by drivers that care about
* such races when setting a page PageOffline().
*/
static DECLARE_RWSEM(page_offline_rwsem);
void page_offline_freeze(void)
{
down_read(&page_offline_rwsem);
}
void page_offline_thaw(void)
{
up_read(&page_offline_rwsem);
}
void page_offline_begin(void)
{
down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);
void page_offline_end(void)
{
up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/swap.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*/
/*
* This file contains the default values for the operation of the
* Linux VM subsystem. Fine-tuning documentation can be found in
* Documentation/admin-guide/sysctl/vm.rst.
* Started 18.12.91
* Swap aging added 23.2.95, Stephen Tweedie.
* Buffermem limits added 12.3.98, Rik van Riel.
*/
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>
#include <linux/buffer_head.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
/* Protecting only lru_rotate.pvec which requires disabling interrupts */
struct lru_rotate {
local_lock_t lock;
struct pagevec pvec;
};
static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
.lock = INIT_LOCAL_LOCK(lock),
};
/*
* The following struct pagevec are grouped together because they are protected
* by disabling preemption (and interrupts remain enabled).
*/
struct lru_pvecs {
local_lock_t lock;
struct pagevec lru_add;
struct pagevec lru_deactivate_file;
struct pagevec lru_deactivate;
struct pagevec lru_lazyfree;
#ifdef CONFIG_SMP
struct pagevec activate_page;
#endif
};
static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
.lock = INIT_LOCAL_LOCK(lock),
};
/*
* This path almost never happens for VM activity - pages are normally
* freed via pagevecs. But it gets used by networking.
*/
static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
struct lruvec *lruvec;
unsigned long flags;
lruvec = lock_page_lruvec_irqsave(page, &flags);
del_page_from_lru_list(page, lruvec);
__clear_page_lru_flags(page);
unlock_page_lruvec_irqrestore(lruvec, flags);
}
__ClearPageWaiters(page);
}
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
mem_cgroup_uncharge(page);
free_unref_page(page, 0);
}
static void __put_compound_page(struct page *page)
{
/*
* __page_cache_release() is supposed to be called for thp, not for
* hugetlb. This is because hugetlb page does never have PageLRU set
* (it's never listed to any LRU lists) and no memcg routines should
* be called for hugetlb (it has a separate hugetlb_cgroup.)
*/
if (!PageHuge(page)) __page_cache_release(page);
destroy_compound_page(page);
}
void __put_page(struct page *page)
{
if (is_zone_device_page(page)) {
put_dev_pagemap(page->pgmap);
/*
* The page belongs to the device that created pgmap. Do
* not return it to page allocator.
*/
return;
}
if (unlikely(PageCompound(page)))
__put_compound_page(page);
else
__put_single_page(page);
}
EXPORT_SYMBOL(__put_page);
/**
* put_pages_list() - release a list of pages
* @pages: list of pages threaded on page->lru
*
* Release a list of pages which are strung together on page.lru. Currently
* used by read_cache_pages() and related error recovery code.
*/
void put_pages_list(struct list_head *pages)
{
while (!list_empty(pages)) {
struct page *victim;
victim = lru_to_page(pages);
list_del(&victim->lru);
put_page(victim);
}
}
EXPORT_SYMBOL(put_pages_list);
/*
* get_kernel_pages() - pin kernel pages in memory
* @kiov: An array of struct kvec structures
* @nr_segs: number of segments to pin
* @write: pinning for read/write, currently ignored
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_segs long.
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno. Each page returned must be released
* with a put_page() call when it is finished with.
*/
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
struct page **pages)
{
int seg;
for (seg = 0; seg < nr_segs; seg++) {
if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
return seg;
pages[seg] = kmap_to_page(kiov[seg].iov_base);
get_page(pages[seg]);
}
return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);
static void pagevec_lru_move_fn(struct pagevec *pvec,
void (*move_fn)(struct page *page, struct lruvec *lruvec))
{
int i;
struct lruvec *lruvec = NULL;
unsigned long flags = 0;
for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i];
/* block memcg migration during page moving between lru */
if (!TestClearPageLRU(page))
continue;
lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
(*move_fn)(page, lruvec);
SetPageLRU(page);
}
if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags);
release_pages(pvec->pages, pvec->nr);
pagevec_reinit(pvec);
}
static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
{
if (!PageUnevictable(page)) {
del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
add_page_to_lru_list_tail(page, lruvec);
__count_vm_events(PGROTATED, thp_nr_pages(page));
}
}
/* return true if pagevec needs to drain */
static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
{
bool ret = false;
if (!pagevec_add(pvec, page) || PageCompound(page) ||
lru_cache_disabled())
ret = true;
return ret;
}
/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim. If it still appears to be reclaimable, move it to the tail of the
* inactive list.
*
* rotate_reclaimable_page() must disable IRQs, to prevent nasty races.
*/
void rotate_reclaimable_page(struct page *page)
{
if (!PageLocked(page) && !PageDirty(page) &&
!PageUnevictable(page) && PageLRU(page)) {
struct pagevec *pvec;
unsigned long flags;
get_page(page);
local_lock_irqsave(&lru_rotate.lock, flags);
pvec = this_cpu_ptr(&lru_rotate.pvec);
if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
}
void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
{
do {
unsigned long lrusize;
/*
* Hold lruvec->lru_lock is safe here, since
* 1) The pinned lruvec in reclaim, or
* 2) From a pre-LRU page during refault (which also holds the
* rcu lock, so would be safe even if the page was on the LRU
* and could move simultaneously to a new lruvec).
*/
spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
lruvec->file_cost += nr_pages;
else
lruvec->anon_cost += nr_pages;
/*
* Decay previous events
*
* Because workloads change over time (and to avoid
* overflow) we keep these statistics as a floating
* average, which ends up weighing recent refaults
* more than old ones.
*/
lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
lruvec_page_state(lruvec, NR_ACTIVE_FILE);
if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
lruvec->file_cost /= 2;
lruvec->anon_cost /= 2;
}
spin_unlock_irq(&lruvec->lru_lock);
} while ((lruvec = parent_lruvec(lruvec)));
}
void lru_note_cost_page(struct page *page)
{
lru_note_cost(mem_cgroup_page_lruvec(page),
page_is_file_lru(page), thp_nr_pages(page));
}
static void __activate_page(struct page *page, struct lruvec *lruvec)
{
if (!PageActive(page) && !PageUnevictable(page)) {
int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec);
SetPageActive(page);
add_page_to_lru_list(page, lruvec);
trace_mm_lru_activate(page);
__count_vm_events(PGACTIVATE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
nr_pages);
}
}
#ifdef CONFIG_SMP
static void activate_page_drain(int cpu)
{
struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, __activate_page);
}
static bool need_activate_page_drain(int cpu)
{
return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
}
static void activate_page(struct page *page)
{
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
struct pagevec *pvec;
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.activate_page);
get_page(page);
if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, __activate_page); local_unlock(&lru_pvecs.lock);
}
}
#else
static inline void activate_page_drain(int cpu)
{
}
static void activate_page(struct page *page)
{
struct lruvec *lruvec;
page = compound_head(page);
if (TestClearPageLRU(page)) {
lruvec = lock_page_lruvec_irq(page);
__activate_page(page, lruvec);
unlock_page_lruvec_irq(lruvec);
SetPageLRU(page);
}
}
#endif
static void __lru_cache_activate_page(struct page *page)
{
struct pagevec *pvec;
int i;
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
/*
* Search backwards on the optimistic assumption that the page being
* activated has just been added to this pagevec. Note that only
* the local pagevec is examined as a !PageLRU page could be in the
* process of being released, reclaimed, migrated or on a remote
* pagevec that is currently being drained. Furthermore, marking
* a remote pagevec's page PageActive potentially hits a race where
* a page is marked PageActive just after it is added to the inactive
* list causing accounting errors and BUG_ON checks to trigger.
*/
for (i = pagevec_count(pvec) - 1; i >= 0; i--) { struct page *pagevec_page = pvec->pages[i];
if (pagevec_page == page) {
SetPageActive(page);
break;
}
}
local_unlock(&lru_pvecs.lock);
}
/*
* Mark a page as having seen activity.
*
* inactive,unreferenced -> inactive,referenced
* inactive,referenced -> active,unreferenced
* active,unreferenced -> active,referenced
*
* When a newly allocated page is not yet visible, so safe for non-atomic ops,
* __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
*/
void mark_page_accessed(struct page *page)
{
page = compound_head(page);
if (!PageReferenced(page)) {
SetPageReferenced(page);
} else if (PageUnevictable(page)) {
/*
* Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
* this list is never rotated or maintained, so marking an
* evictable page accessed has no effect.
*/
} else if (!PageActive(page)) {
/*
* If the page is on the LRU, queue it for activation via
* lru_pvecs.activate_page. Otherwise, assume the page is on a
* pagevec, mark it active and it'll be moved to the active
* LRU on the next drain.
*/
if (PageLRU(page))
activate_page(page);
else
__lru_cache_activate_page(page);
ClearPageReferenced(page);
workingset_activation(page);
}
if (page_is_idle(page))
clear_page_idle(page);
}
EXPORT_SYMBOL(mark_page_accessed);
/**
* lru_cache_add - add a page to a page list
* @page: the page to be added to the LRU.
*
* Queue the page for addition to the LRU via pagevec. The decision on whether
* to add the page to the [in]active [file|anon] list is deferred until the
* pagevec is drained. This gives a chance for the caller of lru_cache_add()
* have the page added to the active list using mark_page_accessed().
*/
void lru_cache_add(struct page *page)
{
struct pagevec *pvec;
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
get_page(page);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
if (pagevec_add_and_need_flush(pvec, page))
__pagevec_lru_add(pvec); local_unlock(&lru_pvecs.lock);
}
EXPORT_SYMBOL(lru_cache_add);
/**
* lru_cache_add_inactive_or_unevictable
* @page: the page to be added to LRU
* @vma: vma in which page is mapped for determining reclaimability
*
* Place @page on the inactive or unevictable LRU list, depending on its
* evictability.
*/
void lru_cache_add_inactive_or_unevictable(struct page *page,
struct vm_area_struct *vma)
{
bool unevictable;
VM_BUG_ON_PAGE(PageLRU(page), page);
unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
int nr_pages = thp_nr_pages(page);
/*
* We use the irq-unsafe __mod_zone_page_state because this
* counter is not modified from interrupt context, and the pte
* lock is held(spinlock), which implies preemption disabled.
*/
__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
}
lru_cache_add(page);
}
/*
* If the page can not be invalidated, it is moved to the
* inactive list to speed up its reclaim. It is moved to the
* head of the list, rather than the tail, to give the flusher
* threads some time to write it out, as this is much more
* effective than the single-page writeout from reclaim.
*
* If the page isn't page_mapped and dirty/writeback, the page
* could reclaim asap using PG_reclaim.
*
* 1. active, mapped page -> none
* 2. active, dirty/writeback page -> inactive, head, PG_reclaim
* 3. inactive, mapped page -> none
* 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
* 5. inactive, clean -> inactive, tail
* 6. Others -> none
*
* In 4, why it moves inactive's head, the VM expects the page would
* be write it out by flusher threads as this is much more effective
* than the single-page writeout from reclaim.
*/
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
{
bool active = PageActive(page);
int nr_pages = thp_nr_pages(page);
if (PageUnevictable(page))
return;
/* Some processes are using the page */
if (page_mapped(page))
return;
del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
ClearPageReferenced(page);
if (PageWriteback(page) || PageDirty(page)) {
/*
* PG_reclaim could be raced with end_page_writeback
* It can make readahead confusing. But race window
* is _really_ small and it's non-critical problem.
*/
add_page_to_lru_list(page, lruvec);
SetPageReclaim(page);
} else {
/*
* The page's writeback ends up during pagevec
* We move that page into tail of inactive.
*/
add_page_to_lru_list_tail(page, lruvec);
__count_vm_events(PGROTATED, nr_pages);
}
if (active) {
__count_vm_events(PGDEACTIVATE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
nr_pages);
}
}
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
{
if (PageActive(page) && !PageUnevictable(page)) {
int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
ClearPageReferenced(page);
add_page_to_lru_list(page, lruvec);
__count_vm_events(PGDEACTIVATE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
nr_pages);
}
}
static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
{
if (PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
ClearPageReferenced(page);
/*
* Lazyfree pages are clean anonymous pages. They have
* PG_swapbacked flag cleared, to distinguish them from normal
* anonymous pages
*/
ClearPageSwapBacked(page);
add_page_to_lru_list(page, lruvec);
__count_vm_events(PGLAZYFREE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
nr_pages);
}
}
/*
* Drain pages out of the cpu's pagevecs.
* Either "cpu" is the current CPU, and preemption has already been
* disabled; or "cpu" is being hot-unplugged, and is already dead.
*/
void lru_add_drain_cpu(int cpu)
{
struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);
if (pagevec_count(pvec))
__pagevec_lru_add(pvec); pvec = &per_cpu(lru_rotate.pvec, cpu);
/* Disabling interrupts below acts as a compiler barrier. */
if (data_race(pagevec_count(pvec))) {
unsigned long flags;
/* No harm done if a racing interrupt already did this */
local_lock_irqsave(&lru_rotate.lock, flags);
pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_fn); pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
activate_page_drain(cpu);
}
/**
* deactivate_file_page - forcefully deactivate a file page
* @page: page to deactivate
*
* This function hints the VM that @page is a good reclaim candidate,
* for example if its invalidation fails due to the page being dirty
* or under writeback.
*/
void deactivate_file_page(struct page *page)
{
/*
* In a workload with many unevictable page such as mprotect,
* unevictable page deactivation for accelerating reclaim is pointless.
*/
if (PageUnevictable(page))
return;
if (likely(get_page_unless_zero(page))) {
struct pagevec *pvec;
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); local_unlock(&lru_pvecs.lock);
}
}
/*
* deactivate_page - deactivate a page
* @page: page to deactivate
*
* deactivate_page() moves @page to the inactive list if @page was on the active
* list and was not an unevictable page. This is done to accelerate the reclaim
* of @page.
*/
void deactivate_page(struct page *page)
{
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
struct pagevec *pvec;
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
get_page(page);
if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, lru_deactivate_fn);
local_unlock(&lru_pvecs.lock);
}
}
/**
* mark_page_lazyfree - make an anon page lazyfree
* @page: page to deactivate
*
* mark_page_lazyfree() moves @page to the inactive file list.
* This is done to accelerate the reclaim of @page.
*/
void mark_page_lazyfree(struct page *page)
{
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
struct pagevec *pvec;
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
get_page(page);
if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
local_unlock(&lru_pvecs.lock);
}
}
void lru_add_drain(void)
{
local_lock(&lru_pvecs.lock);
lru_add_drain_cpu(smp_processor_id());
local_unlock(&lru_pvecs.lock);
}
/*
* It's called from per-cpu workqueue context in SMP case so
* lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on
* the same cpu. It shouldn't be a problem in !SMP case since
* the core is only one and the locks will disable preemption.
*/
static void lru_add_and_bh_lrus_drain(void)
{
local_lock(&lru_pvecs.lock);
lru_add_drain_cpu(smp_processor_id());
local_unlock(&lru_pvecs.lock);
invalidate_bh_lrus_cpu();
}
void lru_add_drain_cpu_zone(struct zone *zone)
{
local_lock(&lru_pvecs.lock);
lru_add_drain_cpu(smp_processor_id());
drain_local_pages(zone);
local_unlock(&lru_pvecs.lock);
}
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
lru_add_and_bh_lrus_drain();
}
/*
* Doesn't need any cpu hotplug locking because we do rely on per-cpu
* kworkers being shut down before our page_alloc_cpu_dead callback is
* executed on the offlined cpu.
* Calling this function with cpu hotplug locks held can actually lead
* to obscure indirect dependencies via WQ context.
*/
inline void __lru_add_drain_all(bool force_all_cpus)
{
/*
* lru_drain_gen - Global pages generation number
*
* (A) Definition: global lru_drain_gen = x implies that all generations
* 0 < n <= x are already *scheduled* for draining.
*
* This is an optimization for the highly-contended use case where a
* user space workload keeps constantly generating a flow of pages for
* each CPU.
*/
static unsigned int lru_drain_gen;
static struct cpumask has_work;
static DEFINE_MUTEX(lock);
unsigned cpu, this_gen;
/*
* Make sure nobody triggers this path before mm_percpu_wq is fully
* initialized.
*/
if (WARN_ON(!mm_percpu_wq))
return;
/*
* Guarantee pagevec counter stores visible by this CPU are visible to
* other CPUs before loading the current drain generation.
*/
smp_mb();
/*
* (B) Locally cache global LRU draining generation number
*
* The read barrier ensures that the counter is loaded before the mutex
* is taken. It pairs with smp_mb() inside the mutex critical section
* at (D).
*/
this_gen = smp_load_acquire(&lru_drain_gen);
mutex_lock(&lock);
/*
* (C) Exit the draining operation if a newer generation, from another
* lru_add_drain_all(), was already scheduled for draining. Check (A).
*/
if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
goto done;
/*
* (D) Increment global generation number
*
* Pairs with smp_load_acquire() at (B), outside of the critical
* section. Use a full memory barrier to guarantee that the new global
* drain generation number is stored before loading pagevec counters.
*
* This pairing must be done here, before the for_each_online_cpu loop
* below which drains the page vectors.
*
* Let x, y, and z represent some system CPU numbers, where x < y < z.
* Assume CPU #z is in the middle of the for_each_online_cpu loop
* below and has already reached CPU #y's per-cpu data. CPU #x comes
* along, adds some pages to its per-cpu vectors, then calls
* lru_add_drain_all().
*
* If the paired barrier is done at any later step, e.g. after the
* loop, CPU #x will just exit at (C) and miss flushing out all of its
* added pages.
*/
WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
smp_mb();
cpumask_clear(&has_work);
for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
if (force_all_cpus ||
pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
need_activate_page_drain(cpu) ||
has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
__cpumask_set_cpu(cpu, &has_work);
}
}
for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu));
done:
mutex_unlock(&lock);
}
void lru_add_drain_all(void)
{
__lru_add_drain_all(false);
}
#else
void lru_add_drain_all(void)
{
lru_add_drain();
}
#endif /* CONFIG_SMP */
atomic_t lru_disable_count = ATOMIC_INIT(0);
/*
* lru_cache_disable() needs to be called before we start compiling
* a list of pages to be migrated using isolate_lru_page().
* It drains pages on LRU cache and then disable on all cpus until
* lru_cache_enable is called.
*
* Must be paired with a call to lru_cache_enable().
*/
void lru_cache_disable(void)
{
atomic_inc(&lru_disable_count);
#ifdef CONFIG_SMP
/*
* lru_add_drain_all in the force mode will schedule draining on
* all online CPUs so any calls of lru_cache_disabled wrapped by
* local_lock or preemption disabled would be ordered by that.
* The atomic operation doesn't need to have stronger ordering
* requirements because that is enforeced by the scheduling
* guarantees.
*/
__lru_add_drain_all(true);
#else
lru_add_and_bh_lrus_drain();
#endif
}
/**
* release_pages - batched put_page()
* @pages: array of pages to release
* @nr: number of pages
*
* Decrement the reference count on all the pages in @pages. If it
* fell to zero, remove the page from the LRU and free it.
*/
void release_pages(struct page **pages, int nr)
{
int i;
LIST_HEAD(pages_to_free);
struct lruvec *lruvec = NULL;
unsigned long flags;
unsigned int lock_batch;
for (i = 0; i < nr; i++) { struct page *page = pages[i];
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
* same lruvec. The lock is held only if lruvec != NULL.
*/
if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
page = compound_head(page);
if (is_huge_zero_page(page))
continue;
if (is_zone_device_page(page)) {
if (lruvec) {
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
/*
* ZONE_DEVICE pages that return 'false' from
* page_is_devmap_managed() do not require special
* processing, and instead, expect a call to
* put_page_testzero().
*/
if (page_is_devmap_managed(page)) {
put_devmap_managed_page(page);
continue;
}
if (put_page_testzero(page))
put_dev_pagemap(page->pgmap);
continue;
}
if (!put_page_testzero(page))
continue;
if (PageCompound(page)) {
if (lruvec) {
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
__put_compound_page(page);
continue;
}
if (PageLRU(page)) {
struct lruvec *prev_lruvec = lruvec;
lruvec = relock_page_lruvec_irqsave(page, lruvec,
&flags);
if (prev_lruvec != lruvec)
lock_batch = 0;
del_page_from_lru_list(page, lruvec);
__clear_page_lru_flags(page);
}
__ClearPageWaiters(page);
list_add(&page->lru, &pages_to_free);
}
if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags);
mem_cgroup_uncharge_list(&pages_to_free);
free_unref_page_list(&pages_to_free);
}
EXPORT_SYMBOL(release_pages);
/*
* The pages which we're about to release may be in the deferred lru-addition
* queues. That would prevent them from really being freed right now. That's
* OK from a correctness point of view but is inefficient - those pages may be
* cache-warm and we want to give them back to the page allocator ASAP.
*
* So __pagevec_release() will drain those queues here. __pagevec_lru_add()
* and __pagevec_lru_add_active() call release_pages() directly to avoid
* mutual recursion.
*/
void __pagevec_release(struct pagevec *pvec)
{
if (!pvec->percpu_pvec_drained) {
lru_add_drain();
pvec->percpu_pvec_drained = true;
}
release_pages(pvec->pages, pagevec_count(pvec));
pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
{
int was_unevictable = TestClearPageUnevictable(page);
int nr_pages = thp_nr_pages(page);
VM_BUG_ON_PAGE(PageLRU(page), page);
/*
* Page becomes evictable in two ways:
* 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
* 2) Before acquiring LRU lock to put the page to correct LRU and then
* a) do PageLRU check with lock [check_move_unevictable_pages]
* b) do PageLRU check before lock [clear_page_mlock]
*
* (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
* following strict ordering:
*
* #0: __pagevec_lru_add_fn #1: clear_page_mlock
*
* SetPageLRU() TestClearPageMlocked()
* smp_mb() // explicit ordering // above provides strict
* // ordering
* PageMlocked() PageLRU()
*
*
* if '#1' does not observe setting of PG_lru by '#0' and fails
* isolation, the explicit barrier will make sure that page_evictable
* check will put the page in correct LRU. Without smp_mb(), SetPageLRU
* can be reordered after PageMlocked check and can make '#1' to fail
* the isolation of the page whose Mlocked bit is cleared (#0 is also
* looking at the same page) and the evictable page will be stranded
* in an unevictable LRU.
*/
SetPageLRU(page);
smp_mb__after_atomic();
if (page_evictable(page)) {
if (was_unevictable)
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
} else {
ClearPageActive(page);
SetPageUnevictable(page);
if (!was_unevictable)
__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
}
add_page_to_lru_list(page, lruvec);
trace_mm_lru_insertion(page);
}
/*
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
*/
void __pagevec_lru_add(struct pagevec *pvec)
{
int i;
struct lruvec *lruvec = NULL;
unsigned long flags = 0;
for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i];
lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
__pagevec_lru_add_fn(page, lruvec);
}
if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags);
release_pages(pvec->pages, pvec->nr);
pagevec_reinit(pvec);
}
/**
* pagevec_remove_exceptionals - pagevec exceptionals pruning
* @pvec: The pagevec to prune
*
* find_get_entries() fills both pages and XArray value entries (aka
* exceptional entries) into the pagevec. This function prunes all
* exceptionals from @pvec without leaving holes, so that it can be
* passed on to page-only pagevec operations.
*/
void pagevec_remove_exceptionals(struct pagevec *pvec)
{
int i, j;
for (i = 0, j = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i];
if (!xa_is_value(page))
pvec->pages[j++] = page;
}
pvec->nr = j;
}
/**
* pagevec_lookup_range - gang pagecache lookup
* @pvec: Where the resulting pages are placed
* @mapping: The address_space to search
* @start: The starting page index
* @end: The final page index
*
* pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
* pages in the mapping starting from index @start and upto index @end
* (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a
* reference against the pages in @pvec.
*
* The search returns a group of mapping-contiguous pages with ascending
* indexes. There may be holes in the indices due to not-present pages. We
* also update @start to index the next page for the traversal.
*
* pagevec_lookup_range() returns the number of pages which were found. If this
* number is smaller than PAGEVEC_SIZE, the end of specified range has been
* reached.
*/
unsigned pagevec_lookup_range(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *start, pgoff_t end)
{
pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
pvec->pages);
return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_range);
unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, pgoff_t end,
xa_mark_t tag)
{
pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
PAGEVEC_SIZE, pvec->pages);
return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_range_tag);
/*
* Perform any setup for the swap system
*/
void __init swap_setup(void)
{
unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
page_cluster = 2;
else
page_cluster = 3;
/*
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
}
#ifdef CONFIG_DEV_PAGEMAP_OPS
void put_devmap_managed_page(struct page *page)
{
int count;
if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
return;
count = page_ref_dec_return(page);
/*
* devmap page refcounts are 1-based, rather than 0-based: if
* refcount is 1, then the page is free and the refcount is
* stable because nobody holds a reference on the page.
*/
if (count == 1)
free_devmap_managed_page(page);
else if (!count)
__put_page(page);
}
EXPORT_SYMBOL(put_devmap_managed_page);
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_WAIT_BIT_H
#define _LINUX_WAIT_BIT_H
/*
* Linux wait-bit related types and methods:
*/
#include <linux/wait.h>
struct wait_bit_key {
void *flags;
int bit_nr;
unsigned long timeout;
};
struct wait_bit_queue_entry {
struct wait_bit_key key;
struct wait_queue_entry wq_entry;
};
#define __WAIT_BIT_KEY_INITIALIZER(word, bit) \
{ .flags = word, .bit_nr = bit, }
typedef int wait_bit_action_f(struct wait_bit_key *key, int mode);
void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit);
int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
void wake_up_bit(void *word, int bit);
int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode);
int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
struct wait_queue_head *bit_waitqueue(void *word, int bit);
extern void __init wait_bit_init(void);
int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
#define DEFINE_WAIT_BIT(name, word, bit) \
struct wait_bit_queue_entry name = { \
.key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \
.wq_entry = { \
.private = current, \
.func = wake_bit_function, \
.entry = \
LIST_HEAD_INIT((name).wq_entry.entry), \
}, \
}
extern int bit_wait(struct wait_bit_key *key, int mode);
extern int bit_wait_io(struct wait_bit_key *key, int mode);
extern int bit_wait_timeout(struct wait_bit_key *key, int mode);
extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode);
/**
* wait_on_bit - wait for a bit to be cleared
* @word: the word being waited on, a kernel virtual address
* @bit: the bit of the word being waited on
* @mode: the task state to sleep in
*
* There is a standard hashed waitqueue table for generic use. This
* is the part of the hashtable's accessor API that waits on a bit.
* For instance, if one were to have waiters on a bitflag, one would
* call wait_on_bit() in threads waiting for the bit to clear.
* One uses wait_on_bit() where one is waiting for the bit to clear,
* but has no intention of setting it.
* Returned value will be zero if the bit was cleared, or non-zero
* if the process received a signal and the mode permitted wakeup
* on that signal.
*/
static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
if (!test_bit(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait,
mode);
}
/**
* wait_on_bit_io - wait for a bit to be cleared
* @word: the word being waited on, a kernel virtual address
* @bit: the bit of the word being waited on
* @mode: the task state to sleep in
*
* Use the standard hashed waitqueue table to wait for a bit
* to be cleared. This is similar to wait_on_bit(), but calls
* io_schedule() instead of schedule() for the actual waiting.
*
* Returned value will be zero if the bit was cleared, or non-zero
* if the process received a signal and the mode permitted wakeup
* on that signal.
*/
static inline int
wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
if (!test_bit(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit,
bit_wait_io,
mode);
}
/**
* wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses
* @word: the word being waited on, a kernel virtual address
* @bit: the bit of the word being waited on
* @mode: the task state to sleep in
* @timeout: timeout, in jiffies
*
* Use the standard hashed waitqueue table to wait for a bit
* to be cleared. This is similar to wait_on_bit(), except also takes a
* timeout parameter.
*
* Returned value will be zero if the bit was cleared before the
* @timeout elapsed, or non-zero if the @timeout elapsed or process
* received a signal and the mode permitted wakeup on that signal.
*/
static inline int
wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
unsigned long timeout)
{
might_sleep();
if (!test_bit(bit, word))
return 0;
return out_of_line_wait_on_bit_timeout(word, bit,
bit_wait_timeout,
mode, timeout);
}
/**
* wait_on_bit_action - wait for a bit to be cleared
* @word: the word being waited on, a kernel virtual address
* @bit: the bit of the word being waited on
* @action: the function used to sleep, which may take special actions
* @mode: the task state to sleep in
*
* Use the standard hashed waitqueue table to wait for a bit
* to be cleared, and allow the waiting action to be specified.
* This is like wait_on_bit() but allows fine control of how the waiting
* is done.
*
* Returned value will be zero if the bit was cleared, or non-zero
* if the process received a signal and the mode permitted wakeup
* on that signal.
*/
static inline int
wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
unsigned mode)
{
might_sleep();
if (!test_bit(bit, word))
return 0;
return out_of_line_wait_on_bit(word, bit, action, mode);
}
/**
* wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it
* @word: the word being waited on, a kernel virtual address
* @bit: the bit of the word being waited on
* @mode: the task state to sleep in
*
* There is a standard hashed waitqueue table for generic use. This
* is the part of the hashtable's accessor API that waits on a bit
* when one intends to set it, for instance, trying to lock bitflags.
* For instance, if one were to have waiters trying to set bitflag
* and waiting for it to clear before setting it, one would call
* wait_on_bit() in threads waiting to be able to set the bit.
* One uses wait_on_bit_lock() where one is waiting for the bit to
* clear with the intention of setting it, and when done, clearing it.
*
* Returns zero if the bit was (eventually) found to be clear and was
* set. Returns non-zero if a signal was delivered to the process and
* the @mode allows that signal to wake the process.
*/
static inline int
wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
if (!test_and_set_bit(bit, word))
return 0;
return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode);
}
/**
* wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it
* @word: the word being waited on, a kernel virtual address
* @bit: the bit of the word being waited on
* @mode: the task state to sleep in
*
* Use the standard hashed waitqueue table to wait for a bit
* to be cleared and then to atomically set it. This is similar
* to wait_on_bit(), but calls io_schedule() instead of schedule()
* for the actual waiting.
*
* Returns zero if the bit was (eventually) found to be clear and was
* set. Returns non-zero if a signal was delivered to the process and
* the @mode allows that signal to wake the process.
*/
static inline int
wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
{
might_sleep();
if (!test_and_set_bit(bit, word))
return 0;
return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode);
}
/**
* wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it
* @word: the word being waited on, a kernel virtual address
* @bit: the bit of the word being waited on
* @action: the function used to sleep, which may take special actions
* @mode: the task state to sleep in
*
* Use the standard hashed waitqueue table to wait for a bit
* to be cleared and then to set it, and allow the waiting action
* to be specified.
* This is like wait_on_bit() but allows fine control of how the waiting
* is done.
*
* Returns zero if the bit was (eventually) found to be clear and was
* set. Returns non-zero if a signal was delivered to the process and
* the @mode allows that signal to wake the process.
*/
static inline int
wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
unsigned mode)
{
might_sleep();
if (!test_and_set_bit(bit, word))
return 0;
return out_of_line_wait_on_bit_lock(word, bit, action, mode);
}
extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags);
extern void wake_up_var(void *var);
extern wait_queue_head_t *__var_waitqueue(void *p);
#define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \
({ \
__label__ __out; \
struct wait_queue_head *__wq_head = __var_waitqueue(var); \
struct wait_bit_queue_entry __wbq_entry; \
long __ret = ret; /* explicit shadow */ \
\
init_wait_var_entry(&__wbq_entry, var, \
exclusive ? WQ_FLAG_EXCLUSIVE : 0); \
for (;;) { \
long __int = prepare_to_wait_event(__wq_head, \
&__wbq_entry.wq_entry, \
state); \
if (condition) \
break; \
\
if (___wait_is_interruptible(state) && __int) { \
__ret = __int; \
goto __out; \
} \
\
cmd; \
} \
finish_wait(__wq_head, &__wbq_entry.wq_entry); \
__out: __ret; \
})
#define __wait_var_event(var, condition) \
___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
schedule())
#define wait_var_event(var, condition) \
do { \
might_sleep(); \
if (condition) \
break; \
__wait_var_event(var, condition); \
} while (0)
#define __wait_var_event_killable(var, condition) \
___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \
schedule())
#define wait_var_event_killable(var, condition) \
({ \
int __ret = 0; \
might_sleep(); \
if (!(condition)) \
__ret = __wait_var_event_killable(var, condition); \
__ret; \
})
#define __wait_var_event_timeout(var, condition, timeout) \
___wait_var_event(var, ___wait_cond_timeout(condition), \
TASK_UNINTERRUPTIBLE, 0, timeout, \
__ret = schedule_timeout(__ret))
#define wait_var_event_timeout(var, condition, timeout) \
({ \
long __ret = timeout; \
might_sleep(); \
if (!___wait_cond_timeout(condition)) \
__ret = __wait_var_event_timeout(var, condition, timeout); \
__ret; \
})
#define __wait_var_event_interruptible(var, condition) \
___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0, \
schedule())
#define wait_var_event_interruptible(var, condition) \
({ \
int __ret = 0; \
might_sleep(); \
if (!(condition)) \
__ret = __wait_var_event_interruptible(var, condition); \
__ret; \
})
/**
* clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit
*
* @bit: the bit of the word being waited on
* @word: the word being waited on, a kernel virtual address
*
* You can use this helper if bitflags are manipulated atomically rather than
* non-atomically under a lock.
*/
static inline void clear_and_wake_up_bit(int bit, void *word)
{
clear_bit_unlock(bit, word);
/* See wake_up_bit() for which memory barrier you need to use. */
smp_mb__after_atomic();
wake_up_bit(word, bit);
}
#endif /* _LINUX_WAIT_BIT_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_BL_H
#define _LINUX_RCULIST_BL_H
/*
* RCU-protected bl list version. See include/linux/list_bl.h.
*/
#include <linux/list_bl.h>
#include <linux/rcupdate.h>
static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
struct hlist_bl_node *n)
{
LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
LIST_BL_LOCKMASK);
rcu_assign_pointer(h->first,
(struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK));
}
static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
{
return (struct hlist_bl_node *)
((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK);
}
/**
* hlist_bl_del_rcu - deletes entry from hash list without re-initialization
* @n: the element to delete from the hash list.
*
* Note: hlist_bl_unhashed() on entry does not return true after this,
* the entry is in an undefined state. It is useful for RCU based
* lockfree traversal.
*
* In particular, it means that we can not poison the forward
* pointers that may still be used for walking the hash list.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as hlist_bl_add_head_rcu()
* or hlist_bl_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* hlist_bl_for_each_entry().
*/
static inline void hlist_bl_del_rcu(struct hlist_bl_node *n)
{
__hlist_bl_del(n);
n->pprev = LIST_POISON2;
}
/**
* hlist_bl_add_head_rcu
* @n: the element to add to the hash list.
* @h: the list to add to.
*
* Description:
* Adds the specified element to the specified hlist_bl,
* while permitting racing traversals.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as hlist_bl_add_head_rcu()
* or hlist_bl_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency
* problems on Alpha CPUs. Regardless of the type of CPU, the
* list-traversal primitive must be guarded by rcu_read_lock().
*/
static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
struct hlist_bl_head *h)
{
struct hlist_bl_node *first;
/* don't need hlist_bl_first_rcu because we're under lock */
first = hlist_bl_first(h);
n->next = first;
if (first)
first->pprev = &n->next; n->pprev = &h->first;
/* need _rcu because we can have concurrent lock free readers */
hlist_bl_set_first_rcu(h, n);
}
/**
* hlist_bl_for_each_entry_rcu - iterate over rcu list of given type
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct hlist_bl_node to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the hlist_bl_node within the struct.
*
*/
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \
for (pos = hlist_bl_first_rcu(head); \
pos && \
({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
pos = rcu_dereference_raw(pos->next))
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_NETLINK_H
#define __NET_NETLINK_H
#include <linux/types.h>
#include <linux/netlink.h>
#include <linux/jiffies.h>
#include <linux/in6.h>
/* ========================================================================
* Netlink Messages and Attributes Interface (As Seen On TV)
* ------------------------------------------------------------------------
* Messages Interface
* ------------------------------------------------------------------------
*
* Message Format:
* <--- nlmsg_total_size(payload) --->
* <-- nlmsg_msg_size(payload) ->
* +----------+- - -+-------------+- - -+-------- - -
* | nlmsghdr | Pad | Payload | Pad | nlmsghdr
* +----------+- - -+-------------+- - -+-------- - -
* nlmsg_data(nlh)---^ ^
* nlmsg_next(nlh)-----------------------+
*
* Payload Format:
* <---------------------- nlmsg_len(nlh) --------------------->
* <------ hdrlen ------> <- nlmsg_attrlen(nlh, hdrlen) ->
* +----------------------+- - -+--------------------------------+
* | Family Header | Pad | Attributes |
* +----------------------+- - -+--------------------------------+
* nlmsg_attrdata(nlh, hdrlen)---^
*
* Data Structures:
* struct nlmsghdr netlink message header
*
* Message Construction:
* nlmsg_new() create a new netlink message
* nlmsg_put() add a netlink message to an skb
* nlmsg_put_answer() callback based nlmsg_put()
* nlmsg_end() finalize netlink message
* nlmsg_get_pos() return current position in message
* nlmsg_trim() trim part of message
* nlmsg_cancel() cancel message construction
* nlmsg_free() free a netlink message
*
* Message Sending:
* nlmsg_multicast() multicast message to several groups
* nlmsg_unicast() unicast a message to a single socket
* nlmsg_notify() send notification message
*
* Message Length Calculations:
* nlmsg_msg_size(payload) length of message w/o padding
* nlmsg_total_size(payload) length of message w/ padding
* nlmsg_padlen(payload) length of padding at tail
*
* Message Payload Access:
* nlmsg_data(nlh) head of message payload
* nlmsg_len(nlh) length of message payload
* nlmsg_attrdata(nlh, hdrlen) head of attributes data
* nlmsg_attrlen(nlh, hdrlen) length of attributes data
*
* Message Parsing:
* nlmsg_ok(nlh, remaining) does nlh fit into remaining bytes?
* nlmsg_next(nlh, remaining) get next netlink message
* nlmsg_parse() parse attributes of a message
* nlmsg_find_attr() find an attribute in a message
* nlmsg_for_each_msg() loop over all messages
* nlmsg_validate() validate netlink message incl. attrs
* nlmsg_for_each_attr() loop over all attributes
*
* Misc:
* nlmsg_report() report back to application?
*
* ------------------------------------------------------------------------
* Attributes Interface
* ------------------------------------------------------------------------
*
* Attribute Format:
* <------- nla_total_size(payload) ------->
* <---- nla_attr_size(payload) ----->
* +----------+- - -+- - - - - - - - - +- - -+-------- - -
* | Header | Pad | Payload | Pad | Header
* +----------+- - -+- - - - - - - - - +- - -+-------- - -
* <- nla_len(nla) -> ^
* nla_data(nla)----^ |
* nla_next(nla)-----------------------------'
*
* Data Structures:
* struct nlattr netlink attribute header
*
* Attribute Construction:
* nla_reserve(skb, type, len) reserve room for an attribute
* nla_reserve_nohdr(skb, len) reserve room for an attribute w/o hdr
* nla_put(skb, type, len, data) add attribute to skb
* nla_put_nohdr(skb, len, data) add attribute w/o hdr
* nla_append(skb, len, data) append data to skb
*
* Attribute Construction for Basic Types:
* nla_put_u8(skb, type, value) add u8 attribute to skb
* nla_put_u16(skb, type, value) add u16 attribute to skb
* nla_put_u32(skb, type, value) add u32 attribute to skb
* nla_put_u64_64bit(skb, type,
* value, padattr) add u64 attribute to skb
* nla_put_s8(skb, type, value) add s8 attribute to skb
* nla_put_s16(skb, type, value) add s16 attribute to skb
* nla_put_s32(skb, type, value) add s32 attribute to skb
* nla_put_s64(skb, type, value,
* padattr) add s64 attribute to skb
* nla_put_string(skb, type, str) add string attribute to skb
* nla_put_flag(skb, type) add flag attribute to skb
* nla_put_msecs(skb, type, jiffies,
* padattr) add msecs attribute to skb
* nla_put_in_addr(skb, type, addr) add IPv4 address attribute to skb
* nla_put_in6_addr(skb, type, addr) add IPv6 address attribute to skb
*
* Nested Attributes Construction:
* nla_nest_start(skb, type) start a nested attribute
* nla_nest_end(skb, nla) finalize a nested attribute
* nla_nest_cancel(skb, nla) cancel nested attribute construction
*
* Attribute Length Calculations:
* nla_attr_size(payload) length of attribute w/o padding
* nla_total_size(payload) length of attribute w/ padding
* nla_padlen(payload) length of padding
*
* Attribute Payload Access:
* nla_data(nla) head of attribute payload
* nla_len(nla) length of attribute payload
*
* Attribute Payload Access for Basic Types:
* nla_get_u8(nla) get payload for a u8 attribute
* nla_get_u16(nla) get payload for a u16 attribute
* nla_get_u32(nla) get payload for a u32 attribute
* nla_get_u64(nla) get payload for a u64 attribute
* nla_get_s8(nla) get payload for a s8 attribute
* nla_get_s16(nla) get payload for a s16 attribute
* nla_get_s32(nla) get payload for a s32 attribute
* nla_get_s64(nla) get payload for a s64 attribute
* nla_get_flag(nla) return 1 if flag is true
* nla_get_msecs(nla) get payload for a msecs attribute
*
* Attribute Misc:
* nla_memcpy(dest, nla, count) copy attribute into memory
* nla_memcmp(nla, data, size) compare attribute with memory area
* nla_strscpy(dst, nla, size) copy attribute to a sized string
* nla_strcmp(nla, str) compare attribute with string
*
* Attribute Parsing:
* nla_ok(nla, remaining) does nla fit into remaining bytes?
* nla_next(nla, remaining) get next netlink attribute
* nla_validate() validate a stream of attributes
* nla_validate_nested() validate a stream of nested attributes
* nla_find() find attribute in stream of attributes
* nla_find_nested() find attribute in nested attributes
* nla_parse() parse and validate stream of attrs
* nla_parse_nested() parse nested attributes
* nla_for_each_attr() loop over all attributes
* nla_for_each_nested() loop over the nested attributes
*=========================================================================
*/
/**
* Standard attribute types to specify validation policy
*/
enum {
NLA_UNSPEC,
NLA_U8,
NLA_U16,
NLA_U32,
NLA_U64,
NLA_STRING,
NLA_FLAG,
NLA_MSECS,
NLA_NESTED,
NLA_NESTED_ARRAY,
NLA_NUL_STRING,
NLA_BINARY,
NLA_S8,
NLA_S16,
NLA_S32,
NLA_S64,
NLA_BITFIELD32,
NLA_REJECT,
__NLA_TYPE_MAX,
};
#define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1)
struct netlink_range_validation {
u64 min, max;
};
struct netlink_range_validation_signed {
s64 min, max;
};
enum nla_policy_validation {
NLA_VALIDATE_NONE,
NLA_VALIDATE_RANGE,
NLA_VALIDATE_RANGE_WARN_TOO_LONG,
NLA_VALIDATE_MIN,
NLA_VALIDATE_MAX,
NLA_VALIDATE_MASK,
NLA_VALIDATE_RANGE_PTR,
NLA_VALIDATE_FUNCTION,
};
/**
* struct nla_policy - attribute validation policy
* @type: Type of attribute or NLA_UNSPEC
* @validation_type: type of attribute validation done in addition to
* type-specific validation (e.g. range, function call), see
* &enum nla_policy_validation
* @len: Type specific length of payload
*
* Policies are defined as arrays of this struct, the array must be
* accessible by attribute type up to the highest identifier to be expected.
*
* Meaning of `len' field:
* NLA_STRING Maximum length of string
* NLA_NUL_STRING Maximum length of string (excluding NUL)
* NLA_FLAG Unused
* NLA_BINARY Maximum length of attribute payload
* (but see also below with the validation type)
* NLA_NESTED,
* NLA_NESTED_ARRAY Length verification is done by checking len of
* nested header (or empty); len field is used if
* nested_policy is also used, for the max attr
* number in the nested policy.
* NLA_U8, NLA_U16,
* NLA_U32, NLA_U64,
* NLA_S8, NLA_S16,
* NLA_S32, NLA_S64,
* NLA_MSECS Leaving the length field zero will verify the
* given type fits, using it verifies minimum length
* just like "All other"
* NLA_BITFIELD32 Unused
* NLA_REJECT Unused
* All other Minimum length of attribute payload
*
* Meaning of validation union:
* NLA_BITFIELD32 This is a 32-bit bitmap/bitselector attribute and
* `bitfield32_valid' is the u32 value of valid flags
* NLA_REJECT This attribute is always rejected and `reject_message'
* may point to a string to report as the error instead
* of the generic one in extended ACK.
* NLA_NESTED `nested_policy' to a nested policy to validate, must
* also set `len' to the max attribute number. Use the
* provided NLA_POLICY_NESTED() macro.
* Note that nla_parse() will validate, but of course not
* parse, the nested sub-policies.
* NLA_NESTED_ARRAY `nested_policy' points to a nested policy to validate,
* must also set `len' to the max attribute number. Use
* the provided NLA_POLICY_NESTED_ARRAY() macro.
* The difference to NLA_NESTED is the structure:
* NLA_NESTED has the nested attributes directly inside
* while an array has the nested attributes at another
* level down and the attribute types directly in the
* nesting don't matter.
* NLA_U8,
* NLA_U16,
* NLA_U32,
* NLA_U64,
* NLA_S8,
* NLA_S16,
* NLA_S32,
* NLA_S64 The `min' and `max' fields are used depending on the
* validation_type field, if that is min/max/range then
* the min, max or both are used (respectively) to check
* the value of the integer attribute.
* Note that in the interest of code simplicity and
* struct size both limits are s16, so you cannot
* enforce a range that doesn't fall within the range
* of s16 - do that as usual in the code instead.
* Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and
* NLA_POLICY_RANGE() macros.
* NLA_U8,
* NLA_U16,
* NLA_U32,
* NLA_U64 If the validation_type field instead is set to
* NLA_VALIDATE_RANGE_PTR, `range' must be a pointer
* to a struct netlink_range_validation that indicates
* the min/max values.
* Use NLA_POLICY_FULL_RANGE().
* NLA_S8,
* NLA_S16,
* NLA_S32,
* NLA_S64 If the validation_type field instead is set to
* NLA_VALIDATE_RANGE_PTR, `range_signed' must be a
* pointer to a struct netlink_range_validation_signed
* that indicates the min/max values.
* Use NLA_POLICY_FULL_RANGE_SIGNED().
*
* NLA_BINARY If the validation type is like the ones for integers
* above, then the min/max length (not value like for
* integers) of the attribute is enforced.
*
* All other Unused - but note that it's a union
*
* Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN:
* NLA_BINARY Validation function called for the attribute.
* All other Unused - but note that it's a union
*
* Example:
*
* static const u32 myvalidflags = 0xff231023;
*
* static const struct nla_policy my_policy[ATTR_MAX+1] = {
* [ATTR_FOO] = { .type = NLA_U16 },
* [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ },
* [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)),
* [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags),
* };
*/
struct nla_policy {
u8 type;
u8 validation_type;
u16 len;
union {
const u32 bitfield32_valid;
const u32 mask;
const char *reject_message;
const struct nla_policy *nested_policy;
struct netlink_range_validation *range;
struct netlink_range_validation_signed *range_signed;
struct {
s16 min, max;
};
int (*validate)(const struct nlattr *attr,
struct netlink_ext_ack *extack);
/* This entry is special, and used for the attribute at index 0
* only, and specifies special data about the policy, namely it
* specifies the "boundary type" where strict length validation
* starts for any attribute types >= this value, also, strict
* nesting validation starts here.
*
* Additionally, it means that NLA_UNSPEC is actually NLA_REJECT
* for any types >= this, so need to use NLA_POLICY_MIN_LEN() to
* get the previous pure { .len = xyz } behaviour. The advantage
* of this is that types not specified in the policy will be
* rejected.
*
* For completely new families it should be set to 1 so that the
* validation is enforced for all attributes. For existing ones
* it should be set at least when new attributes are added to
* the enum used by the policy, and be set to the new value that
* was added to enforce strict validation from thereon.
*/
u16 strict_start_type;
};
};
#define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN)
#define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN)
#define _NLA_POLICY_NESTED(maxattr, policy) \
{ .type = NLA_NESTED, .nested_policy = policy, .len = maxattr }
#define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \
{ .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr }
#define NLA_POLICY_NESTED(policy) \
_NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy)
#define NLA_POLICY_NESTED_ARRAY(policy) \
_NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy)
#define NLA_POLICY_BITFIELD32(valid) \
{ .type = NLA_BITFIELD32, .bitfield32_valid = valid }
#define __NLA_IS_UINT_TYPE(tp) \
(tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || tp == NLA_U64)
#define __NLA_IS_SINT_TYPE(tp) \
(tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64)
#define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition))
#define NLA_ENSURE_UINT_TYPE(tp) \
(__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + tp)
#define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp) \
(__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \
tp == NLA_MSECS || \
tp == NLA_BINARY) + tp)
#define NLA_ENSURE_SINT_TYPE(tp) \
(__NLA_ENSURE(__NLA_IS_SINT_TYPE(tp)) + tp)
#define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \
(__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \
__NLA_IS_SINT_TYPE(tp) || \
tp == NLA_MSECS || \
tp == NLA_BINARY) + tp)
#define NLA_ENSURE_NO_VALIDATION_PTR(tp) \
(__NLA_ENSURE(tp != NLA_BITFIELD32 && \
tp != NLA_REJECT && \
tp != NLA_NESTED && \
tp != NLA_NESTED_ARRAY) + tp)
#define NLA_POLICY_RANGE(tp, _min, _max) { \
.type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \
.validation_type = NLA_VALIDATE_RANGE, \
.min = _min, \
.max = _max \
}
#define NLA_POLICY_FULL_RANGE(tp, _range) { \
.type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp), \
.validation_type = NLA_VALIDATE_RANGE_PTR, \
.range = _range, \
}
#define NLA_POLICY_FULL_RANGE_SIGNED(tp, _range) { \
.type = NLA_ENSURE_SINT_TYPE(tp), \
.validation_type = NLA_VALIDATE_RANGE_PTR, \
.range_signed = _range, \
}
#define NLA_POLICY_MIN(tp, _min) { \
.type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \
.validation_type = NLA_VALIDATE_MIN, \
.min = _min, \
}
#define NLA_POLICY_MAX(tp, _max) { \
.type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \
.validation_type = NLA_VALIDATE_MAX, \
.max = _max, \
}
#define NLA_POLICY_MASK(tp, _mask) { \
.type = NLA_ENSURE_UINT_TYPE(tp), \
.validation_type = NLA_VALIDATE_MASK, \
.mask = _mask, \
}
#define NLA_POLICY_VALIDATE_FN(tp, fn, ...) { \
.type = NLA_ENSURE_NO_VALIDATION_PTR(tp), \
.validation_type = NLA_VALIDATE_FUNCTION, \
.validate = fn, \
.len = __VA_ARGS__ + 0, \
}
#define NLA_POLICY_EXACT_LEN(_len) NLA_POLICY_RANGE(NLA_BINARY, _len, _len)
#define NLA_POLICY_EXACT_LEN_WARN(_len) { \
.type = NLA_BINARY, \
.validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG, \
.min = _len, \
.max = _len \
}
#define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len)
/**
* struct nl_info - netlink source information
* @nlh: Netlink message header of original request
* @nl_net: Network namespace
* @portid: Netlink PORTID of requesting application
* @skip_notify: Skip netlink notifications to user space
* @skip_notify_kernel: Skip selected in-kernel notifications
*/
struct nl_info {
struct nlmsghdr *nlh;
struct net *nl_net;
u32 portid;
u8 skip_notify:1,
skip_notify_kernel:1;
};
/**
* enum netlink_validation - netlink message/attribute validation levels
* @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about
* extra data at the end of the message, attributes being longer than
* they should be, or unknown attributes being present.
* @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing.
* @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING
* this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict().
* @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy.
* This can safely be set by the kernel when the given policy has no
* NLA_UNSPEC anymore, and can thus be used to ensure policy entries
* are enforced going forward.
* @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g.
* U8, U16, U32 must have exact size, etc.)
* @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY)
* and unset for other policies.
*/
enum netlink_validation {
NL_VALIDATE_LIBERAL = 0,
NL_VALIDATE_TRAILING = BIT(0),
NL_VALIDATE_MAXTYPE = BIT(1),
NL_VALIDATE_UNSPEC = BIT(2),
NL_VALIDATE_STRICT_ATTRS = BIT(3),
NL_VALIDATE_NESTED = BIT(4),
};
#define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\
NL_VALIDATE_MAXTYPE)
#define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\
NL_VALIDATE_MAXTYPE |\
NL_VALIDATE_UNSPEC |\
NL_VALIDATE_STRICT_ATTRS |\
NL_VALIDATE_NESTED)
int netlink_rcv_skb(struct sk_buff *skb,
int (*cb)(struct sk_buff *, struct nlmsghdr *,
struct netlink_ext_ack *));
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
unsigned int group, int report, gfp_t flags);
int __nla_validate(const struct nlattr *head, int len, int maxtype,
const struct nla_policy *policy, unsigned int validate,
struct netlink_ext_ack *extack);
int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
int len, const struct nla_policy *policy, unsigned int validate,
struct netlink_ext_ack *extack);
int nla_policy_len(const struct nla_policy *, int);
struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype);
ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize);
char *nla_strdup(const struct nlattr *nla, gfp_t flags);
int nla_memcpy(void *dest, const struct nlattr *src, int count);
int nla_memcmp(const struct nlattr *nla, const void *data, size_t size);
int nla_strcmp(const struct nlattr *nla, const char *str);
struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen);
struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
int attrlen, int padattr);
void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen);
struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype,
int attrlen, int padattr);
void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
const void *data);
void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
const void *data, int padattr);
void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data);
int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data);
int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
const void *data, int padattr);
int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data);
int nla_append(struct sk_buff *skb, int attrlen, const void *data);
/**************************************************************************
* Netlink Messages
**************************************************************************/
/**
* nlmsg_msg_size - length of netlink message not including padding
* @payload: length of message payload
*/
static inline int nlmsg_msg_size(int payload)
{
return NLMSG_HDRLEN + payload;
}
/**
* nlmsg_total_size - length of netlink message including padding
* @payload: length of message payload
*/
static inline int nlmsg_total_size(int payload)
{
return NLMSG_ALIGN(nlmsg_msg_size(payload));
}
/**
* nlmsg_padlen - length of padding at the message's tail
* @payload: length of message payload
*/
static inline int nlmsg_padlen(int payload)
{
return nlmsg_total_size(payload) - nlmsg_msg_size(payload);
}
/**
* nlmsg_data - head of message payload
* @nlh: netlink message header
*/
static inline void *nlmsg_data(const struct nlmsghdr *nlh)
{
return (unsigned char *) nlh + NLMSG_HDRLEN;
}
/**
* nlmsg_len - length of message payload
* @nlh: netlink message header
*/
static inline int nlmsg_len(const struct nlmsghdr *nlh)
{
return nlh->nlmsg_len - NLMSG_HDRLEN;
}
/**
* nlmsg_attrdata - head of attributes data
* @nlh: netlink message header
* @hdrlen: length of family specific header
*/
static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh,
int hdrlen)
{
unsigned char *data = nlmsg_data(nlh);
return (struct nlattr *) (data + NLMSG_ALIGN(hdrlen));
}
/**
* nlmsg_attrlen - length of attributes data
* @nlh: netlink message header
* @hdrlen: length of family specific header
*/
static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen)
{
return nlmsg_len(nlh) - NLMSG_ALIGN(hdrlen);
}
/**
* nlmsg_ok - check if the netlink message fits into the remaining bytes
* @nlh: netlink message header
* @remaining: number of bytes remaining in message stream
*/
static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining)
{
return (remaining >= (int) sizeof(struct nlmsghdr) &&
nlh->nlmsg_len >= sizeof(struct nlmsghdr) && nlh->nlmsg_len <= remaining);
}
/**
* nlmsg_next - next netlink message in message stream
* @nlh: netlink message header
* @remaining: number of bytes remaining in message stream
*
* Returns the next netlink message in the message stream and
* decrements remaining by the size of the current message.
*/
static inline struct nlmsghdr *
nlmsg_next(const struct nlmsghdr *nlh, int *remaining)
{
int totlen = NLMSG_ALIGN(nlh->nlmsg_len);
*remaining -= totlen;
return (struct nlmsghdr *) ((unsigned char *) nlh + totlen);
}
/**
* nla_parse - Parse a stream of attributes into a tb buffer
* @tb: destination array with maxtype+1 elements
* @maxtype: maximum attribute type to be expected
* @head: head of attribute stream
* @len: length of attribute stream
* @policy: validation policy
* @extack: extended ACK pointer
*
* Parses a stream of attributes and stores a pointer to each attribute in
* the tb array accessible via the attribute type. Attributes with a type
* exceeding maxtype will be rejected, policy must be specified, attributes
* will be validated in the strictest way possible.
*
* Returns 0 on success or a negative error code.
*/
static inline int nla_parse(struct nlattr **tb, int maxtype,
const struct nlattr *head, int len,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
return __nla_parse(tb, maxtype, head, len, policy,
NL_VALIDATE_STRICT, extack);
}
/**
* nla_parse_deprecated - Parse a stream of attributes into a tb buffer
* @tb: destination array with maxtype+1 elements
* @maxtype: maximum attribute type to be expected
* @head: head of attribute stream
* @len: length of attribute stream
* @policy: validation policy
* @extack: extended ACK pointer
*
* Parses a stream of attributes and stores a pointer to each attribute in
* the tb array accessible via the attribute type. Attributes with a type
* exceeding maxtype will be ignored and attributes from the policy are not
* always strictly validated (only for new attributes).
*
* Returns 0 on success or a negative error code.
*/
static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype,
const struct nlattr *head, int len,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
return __nla_parse(tb, maxtype, head, len, policy,
NL_VALIDATE_LIBERAL, extack);
}
/**
* nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer
* @tb: destination array with maxtype+1 elements
* @maxtype: maximum attribute type to be expected
* @head: head of attribute stream
* @len: length of attribute stream
* @policy: validation policy
* @extack: extended ACK pointer
*
* Parses a stream of attributes and stores a pointer to each attribute in
* the tb array accessible via the attribute type. Attributes with a type
* exceeding maxtype will be rejected as well as trailing data, but the
* policy is not completely strictly validated (only for new attributes).
*
* Returns 0 on success or a negative error code.
*/
static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype,
const struct nlattr *head,
int len,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
return __nla_parse(tb, maxtype, head, len, policy,
NL_VALIDATE_DEPRECATED_STRICT, extack);
}
/**
* __nlmsg_parse - parse attributes of a netlink message
* @nlh: netlink message header
* @hdrlen: length of family specific header
* @tb: destination array with maxtype+1 elements
* @maxtype: maximum attribute type to be expected
* @policy: validation policy
* @validate: validation strictness
* @extack: extended ACK report struct
*
* See nla_parse()
*/
static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
struct nlattr *tb[], int maxtype,
const struct nla_policy *policy,
unsigned int validate,
struct netlink_ext_ack *extack)
{
if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) { NL_SET_ERR_MSG(extack, "Invalid header length"); return -EINVAL;
}
return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
nlmsg_attrlen(nlh, hdrlen), policy, validate,
extack);
}
/**
* nlmsg_parse - parse attributes of a netlink message
* @nlh: netlink message header
* @hdrlen: length of family specific header
* @tb: destination array with maxtype+1 elements
* @maxtype: maximum attribute type to be expected
* @extack: extended ACK report struct
*
* See nla_parse()
*/
static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
struct nlattr *tb[], int maxtype,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
NL_VALIDATE_STRICT, extack);
}
/**
* nlmsg_parse_deprecated - parse attributes of a netlink message
* @nlh: netlink message header
* @hdrlen: length of family specific header
* @tb: destination array with maxtype+1 elements
* @maxtype: maximum attribute type to be expected
* @extack: extended ACK report struct
*
* See nla_parse_deprecated()
*/
static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen,
struct nlattr *tb[], int maxtype,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
NL_VALIDATE_LIBERAL, extack);
}
/**
* nlmsg_parse_deprecated_strict - parse attributes of a netlink message
* @nlh: netlink message header
* @hdrlen: length of family specific header
* @tb: destination array with maxtype+1 elements
* @maxtype: maximum attribute type to be expected
* @extack: extended ACK report struct
*
* See nla_parse_deprecated_strict()
*/
static inline int
nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen,
struct nlattr *tb[], int maxtype,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
NL_VALIDATE_DEPRECATED_STRICT, extack);
}
/**
* nlmsg_find_attr - find a specific attribute in a netlink message
* @nlh: netlink message header
* @hdrlen: length of familiy specific header
* @attrtype: type of attribute to look for
*
* Returns the first attribute which matches the specified type.
*/
static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh,
int hdrlen, int attrtype)
{
return nla_find(nlmsg_attrdata(nlh, hdrlen),
nlmsg_attrlen(nlh, hdrlen), attrtype);
}
/**
* nla_validate_deprecated - Validate a stream of attributes
* @head: head of attribute stream
* @len: length of attribute stream
* @maxtype: maximum attribute type to be expected
* @policy: validation policy
* @validate: validation strictness
* @extack: extended ACK report struct
*
* Validates all attributes in the specified attribute stream against the
* specified policy. Validation is done in liberal mode.
* See documenation of struct nla_policy for more details.
*
* Returns 0 on success or a negative error code.
*/
static inline int nla_validate_deprecated(const struct nlattr *head, int len,
int maxtype,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_LIBERAL,
extack);
}
/**
* nla_validate - Validate a stream of attributes
* @head: head of attribute stream
* @len: length of attribute stream
* @maxtype: maximum attribute type to be expected
* @policy: validation policy
* @extack: extended ACK report struct
*
* Validates all attributes in the specified attribute stream against the
* specified policy. Validation is done in strict mode.
* See documenation of struct nla_policy for more details.
*
* Returns 0 on success or a negative error code.
*/
static inline int nla_validate(const struct nlattr *head, int len, int maxtype,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_STRICT,
extack);
}
/**
* nlmsg_validate_deprecated - validate a netlink message including attributes
* @nlh: netlinket message header
* @hdrlen: length of familiy specific header
* @maxtype: maximum attribute type to be expected
* @policy: validation policy
* @extack: extended ACK report struct
*/
static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh,
int hdrlen, int maxtype,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
return -EINVAL;
return __nla_validate(nlmsg_attrdata(nlh, hdrlen),
nlmsg_attrlen(nlh, hdrlen), maxtype,
policy, NL_VALIDATE_LIBERAL, extack);
}
/**
* nlmsg_report - need to report back to application?
* @nlh: netlink message header
*
* Returns 1 if a report back to the application is requested.
*/
static inline int nlmsg_report(const struct nlmsghdr *nlh)
{
return nlh ? !!(nlh->nlmsg_flags & NLM_F_ECHO) : 0;
}
/**
* nlmsg_for_each_attr - iterate over a stream of attributes
* @pos: loop counter, set to current attribute
* @nlh: netlink message header
* @hdrlen: length of familiy specific header
* @rem: initialized to len, holds bytes currently remaining in stream
*/
#define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \
nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \
nlmsg_attrlen(nlh, hdrlen), rem)
/**
* nlmsg_put - Add a new netlink message to an skb
* @skb: socket buffer to store message in
* @portid: netlink PORTID of requesting application
* @seq: sequence number of message
* @type: message type
* @payload: length of message payload
* @flags: message flags
*
* Returns NULL if the tailroom of the skb is insufficient to store
* the message header and payload.
*/
static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
int type, int payload, int flags)
{
if (unlikely(skb_tailroom(skb) < nlmsg_total_size(payload)))
return NULL;
return __nlmsg_put(skb, portid, seq, type, payload, flags);
}
/**
* nlmsg_put_answer - Add a new callback based netlink message to an skb
* @skb: socket buffer to store message in
* @cb: netlink callback
* @type: message type
* @payload: length of message payload
* @flags: message flags
*
* Returns NULL if the tailroom of the skb is insufficient to store
* the message header and payload.
*/
static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb,
struct netlink_callback *cb,
int type, int payload,
int flags)
{
return nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
type, payload, flags);
}
/**
* nlmsg_new - Allocate a new netlink message
* @payload: size of the message payload
* @flags: the type of memory to allocate.
*
* Use NLMSG_DEFAULT_SIZE if the size of the payload isn't known
* and a good default is needed.
*/
static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags)
{
return alloc_skb(nlmsg_total_size(payload), flags);
}
/**
* nlmsg_end - Finalize a netlink message
* @skb: socket buffer the message is stored in
* @nlh: netlink message header
*
* Corrects the netlink message header to include the appeneded
* attributes. Only necessary if attributes have been added to
* the message.
*/
static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh)
{
nlh->nlmsg_len = skb_tail_pointer(skb) - (unsigned char *)nlh;
}
/**
* nlmsg_get_pos - return current position in netlink message
* @skb: socket buffer the message is stored in
*
* Returns a pointer to the current tail of the message.
*/
static inline void *nlmsg_get_pos(struct sk_buff *skb)
{
return skb_tail_pointer(skb);
}
/**
* nlmsg_trim - Trim message to a mark
* @skb: socket buffer the message is stored in
* @mark: mark to trim to
*
* Trims the message to the provided mark.
*/
static inline void nlmsg_trim(struct sk_buff *skb, const void *mark)
{
if (mark) { WARN_ON((unsigned char *) mark < skb->data); skb_trim(skb, (unsigned char *) mark - skb->data);
}
}
/**
* nlmsg_cancel - Cancel construction of a netlink message
* @skb: socket buffer the message is stored in
* @nlh: netlink message header
*
* Removes the complete netlink message including all
* attributes from the socket buffer again.
*/
static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh)
{
nlmsg_trim(skb, nlh);
}
/**
* nlmsg_free - free a netlink message
* @skb: socket buffer of netlink message
*/
static inline void nlmsg_free(struct sk_buff *skb)
{
kfree_skb(skb);
}
/**
* nlmsg_multicast - multicast a netlink message
* @sk: netlink socket to spread messages to
* @skb: netlink message as socket buffer
* @portid: own netlink portid to avoid sending to yourself
* @group: multicast group id
* @flags: allocation flags
*/
static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb,
u32 portid, unsigned int group, gfp_t flags)
{
int err;
NETLINK_CB(skb).dst_group = group;
err = netlink_broadcast(sk, skb, portid, group, flags);
if (err > 0)
err = 0;
return err;
}
/**
* nlmsg_unicast - unicast a netlink message
* @sk: netlink socket to spread message to
* @skb: netlink message as socket buffer
* @portid: netlink portid of the destination socket
*/
static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 portid)
{
int err;
err = netlink_unicast(sk, skb, portid, MSG_DONTWAIT);
if (err > 0)
err = 0;
return err;
}
/**
* nlmsg_for_each_msg - iterate over a stream of messages
* @pos: loop counter, set to current message
* @head: head of message stream
* @len: length of message stream
* @rem: initialized to len, holds bytes currently remaining in stream
*/
#define nlmsg_for_each_msg(pos, head, len, rem) \
for (pos = head, rem = len; \
nlmsg_ok(pos, rem); \
pos = nlmsg_next(pos, &(rem)))
/**
* nl_dump_check_consistent - check if sequence is consistent and advertise if not
* @cb: netlink callback structure that stores the sequence number
* @nlh: netlink message header to write the flag to
*
* This function checks if the sequence (generation) number changed during dump
* and if it did, advertises it in the netlink message header.
*
* The correct way to use it is to set cb->seq to the generation counter when
* all locks for dumping have been acquired, and then call this function for
* each message that is generated.
*
* Note that due to initialisation concerns, 0 is an invalid sequence number
* and must not be used by code that uses this functionality.
*/
static inline void
nl_dump_check_consistent(struct netlink_callback *cb,
struct nlmsghdr *nlh)
{
if (cb->prev_seq && cb->seq != cb->prev_seq) nlh->nlmsg_flags |= NLM_F_DUMP_INTR; cb->prev_seq = cb->seq;
}
/**************************************************************************
* Netlink Attributes
**************************************************************************/
/**
* nla_attr_size - length of attribute not including padding
* @payload: length of payload
*/
static inline int nla_attr_size(int payload)
{
return NLA_HDRLEN + payload;
}
/**
* nla_total_size - total length of attribute including padding
* @payload: length of payload
*/
static inline int nla_total_size(int payload)
{
return NLA_ALIGN(nla_attr_size(payload));
}
/**
* nla_padlen - length of padding at the tail of attribute
* @payload: length of payload
*/
static inline int nla_padlen(int payload)
{
return nla_total_size(payload) - nla_attr_size(payload);
}
/**
* nla_type - attribute type
* @nla: netlink attribute
*/
static inline int nla_type(const struct nlattr *nla)
{
return nla->nla_type & NLA_TYPE_MASK;
}
/**
* nla_data - head of payload
* @nla: netlink attribute
*/
static inline void *nla_data(const struct nlattr *nla)
{
return (char *) nla + NLA_HDRLEN;
}
/**
* nla_len - length of payload
* @nla: netlink attribute
*/
static inline int nla_len(const struct nlattr *nla)
{
return nla->nla_len - NLA_HDRLEN;
}
/**
* nla_ok - check if the netlink attribute fits into the remaining bytes
* @nla: netlink attribute
* @remaining: number of bytes remaining in attribute stream
*/
static inline int nla_ok(const struct nlattr *nla, int remaining)
{
return remaining >= (int) sizeof(*nla) &&
nla->nla_len >= sizeof(*nla) && nla->nla_len <= remaining;
}
/**
* nla_next - next netlink attribute in attribute stream
* @nla: netlink attribute
* @remaining: number of bytes remaining in attribute stream
*
* Returns the next netlink attribute in the attribute stream and
* decrements remaining by the size of the current attribute.
*/
static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
{
unsigned int totlen = NLA_ALIGN(nla->nla_len);
*remaining -= totlen;
return (struct nlattr *) ((char *) nla + totlen);
}
/**
* nla_find_nested - find attribute in a set of nested attributes
* @nla: attribute containing the nested attributes
* @attrtype: type of attribute to look for
*
* Returns the first attribute which matches the specified type.
*/
static inline struct nlattr *
nla_find_nested(const struct nlattr *nla, int attrtype)
{
return nla_find(nla_data(nla), nla_len(nla), attrtype);
}
/**
* nla_parse_nested - parse nested attributes
* @tb: destination array with maxtype+1 elements
* @maxtype: maximum attribute type to be expected
* @nla: attribute containing the nested attributes
* @policy: validation policy
* @extack: extended ACK report struct
*
* See nla_parse()
*/
static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
const struct nlattr *nla,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
if (!(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing");
return -EINVAL;
}
return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy,
NL_VALIDATE_STRICT, extack);
}
/**
* nla_parse_nested_deprecated - parse nested attributes
* @tb: destination array with maxtype+1 elements
* @maxtype: maximum attribute type to be expected
* @nla: attribute containing the nested attributes
* @policy: validation policy
* @extack: extended ACK report struct
*
* See nla_parse_deprecated()
*/
static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype,
const struct nlattr *nla,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy,
NL_VALIDATE_LIBERAL, extack);
}
/**
* nla_put_u8 - Add a u8 netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
*/
static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value)
{
/* temporary variables to work around GCC PR81715 with asan-stack=1 */
u8 tmp = value;
return nla_put(skb, attrtype, sizeof(u8), &tmp);
}
/**
* nla_put_u16 - Add a u16 netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
*/
static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value)
{
u16 tmp = value;
return nla_put(skb, attrtype, sizeof(u16), &tmp);
}
/**
* nla_put_be16 - Add a __be16 netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
*/
static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value)
{
__be16 tmp = value;
return nla_put(skb, attrtype, sizeof(__be16), &tmp);
}
/**
* nla_put_net16 - Add 16-bit network byte order netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
*/
static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value)
{
__be16 tmp = value;
return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, tmp);
}
/**
* nla_put_le16 - Add a __le16 netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
*/
static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value)
{
__le16 tmp = value;
return nla_put(skb, attrtype, sizeof(__le16), &tmp);
}
/**
* nla_put_u32 - Add a u32 netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
*/
static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value)
{
u32 tmp = value;
return nla_put(skb, attrtype, sizeof(u32), &tmp);
}
/**
* nla_put_be32 - Add a __be32 netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
*/
static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value)
{
__be32 tmp = value;
return nla_put(skb, attrtype, sizeof(__be32), &tmp);
}
/**
* nla_put_net32 - Add 32-bit network byte order netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
*/
static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value)
{
__be32 tmp = value;
return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, tmp);
}
/**
* nla_put_le32 - Add a __le32 netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
*/
static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value)
{
__le32 tmp = value;
return nla_put(skb, attrtype, sizeof(__le32), &tmp);
}
/**
* nla_put_u64_64bit - Add a u64 netlink attribute to a skb and align it
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
* @padattr: attribute type for the padding
*/
static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype,
u64 value, int padattr)
{
u64 tmp = value;
return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr);
}
/**
* nla_put_be64 - Add a __be64 netlink attribute to a socket buffer and align it
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
* @padattr: attribute type for the padding
*/
static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value,
int padattr)
{
__be64 tmp = value;
return nla_put_64bit(skb, attrtype, sizeof(__be64), &tmp, padattr);
}
/**
* nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
* @padattr: attribute type for the padding
*/
static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value,
int padattr)
{
__be64 tmp = value;
return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, tmp,
padattr);
}
/**
* nla_put_le64 - Add a __le64 netlink attribute to a socket buffer and align it
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
* @padattr: attribute type for the padding
*/
static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value,
int padattr)
{
__le64 tmp = value;
return nla_put_64bit(skb, attrtype, sizeof(__le64), &tmp, padattr);
}
/**
* nla_put_s8 - Add a s8 netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
*/
static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value)
{
s8 tmp = value;
return nla_put(skb, attrtype, sizeof(s8), &tmp);
}
/**
* nla_put_s16 - Add a s16 netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
*/
static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value)
{
s16 tmp = value;
return nla_put(skb, attrtype, sizeof(s16), &tmp);
}
/**
* nla_put_s32 - Add a s32 netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
*/
static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value)
{
s32 tmp = value;
return nla_put(skb, attrtype, sizeof(s32), &tmp);
}
/**
* nla_put_s64 - Add a s64 netlink attribute to a socket buffer and align it
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: numeric value
* @padattr: attribute type for the padding
*/
static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value,
int padattr)
{
s64 tmp = value;
return nla_put_64bit(skb, attrtype, sizeof(s64), &tmp, padattr);
}
/**
* nla_put_string - Add a string netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @str: NUL terminated string
*/
static inline int nla_put_string(struct sk_buff *skb, int attrtype,
const char *str)
{
return nla_put(skb, attrtype, strlen(str) + 1, str);
}
/**
* nla_put_flag - Add a flag netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
*/
static inline int nla_put_flag(struct sk_buff *skb, int attrtype)
{
return nla_put(skb, attrtype, 0, NULL);
}
/**
* nla_put_msecs - Add a msecs netlink attribute to a skb and align it
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @njiffies: number of jiffies to convert to msecs
* @padattr: attribute type for the padding
*/
static inline int nla_put_msecs(struct sk_buff *skb, int attrtype,
unsigned long njiffies, int padattr)
{
u64 tmp = jiffies_to_msecs(njiffies);
return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr);
}
/**
* nla_put_in_addr - Add an IPv4 address netlink attribute to a socket
* buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @addr: IPv4 address
*/
static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype,
__be32 addr)
{
__be32 tmp = addr;
return nla_put_be32(skb, attrtype, tmp);
}
/**
* nla_put_in6_addr - Add an IPv6 address netlink attribute to a socket
* buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @addr: IPv6 address
*/
static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype,
const struct in6_addr *addr)
{
return nla_put(skb, attrtype, sizeof(*addr), addr);
}
/**
* nla_put_bitfield32 - Add a bitfield32 netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @value: value carrying bits
* @selector: selector of valid bits
*/
static inline int nla_put_bitfield32(struct sk_buff *skb, int attrtype,
__u32 value, __u32 selector)
{
struct nla_bitfield32 tmp = { value, selector, };
return nla_put(skb, attrtype, sizeof(tmp), &tmp);
}
/**
* nla_get_u32 - return payload of u32 attribute
* @nla: u32 netlink attribute
*/
static inline u32 nla_get_u32(const struct nlattr *nla)
{
return *(u32 *) nla_data(nla);
}
/**
* nla_get_be32 - return payload of __be32 attribute
* @nla: __be32 netlink attribute
*/
static inline __be32 nla_get_be32(const struct nlattr *nla)
{
return *(__be32 *) nla_data(nla);
}
/**
* nla_get_le32 - return payload of __le32 attribute
* @nla: __le32 netlink attribute
*/
static inline __le32 nla_get_le32(const struct nlattr *nla)
{
return *(__le32 *) nla_data(nla);
}
/**
* nla_get_u16 - return payload of u16 attribute
* @nla: u16 netlink attribute
*/
static inline u16 nla_get_u16(const struct nlattr *nla)
{
return *(u16 *) nla_data(nla);
}
/**
* nla_get_be16 - return payload of __be16 attribute
* @nla: __be16 netlink attribute
*/
static inline __be16 nla_get_be16(const struct nlattr *nla)
{
return *(__be16 *) nla_data(nla);
}
/**
* nla_get_le16 - return payload of __le16 attribute
* @nla: __le16 netlink attribute
*/
static inline __le16 nla_get_le16(const struct nlattr *nla)
{
return *(__le16 *) nla_data(nla);
}
/**
* nla_get_u8 - return payload of u8 attribute
* @nla: u8 netlink attribute
*/
static inline u8 nla_get_u8(const struct nlattr *nla)
{
return *(u8 *) nla_data(nla);
}
/**
* nla_get_u64 - return payload of u64 attribute
* @nla: u64 netlink attribute
*/
static inline u64 nla_get_u64(const struct nlattr *nla)
{
u64 tmp;
nla_memcpy(&tmp, nla, sizeof(tmp));
return tmp;
}
/**
* nla_get_be64 - return payload of __be64 attribute
* @nla: __be64 netlink attribute
*/
static inline __be64 nla_get_be64(const struct nlattr *nla)
{
__be64 tmp;
nla_memcpy(&tmp, nla, sizeof(tmp));
return tmp;
}
/**
* nla_get_le64 - return payload of __le64 attribute
* @nla: __le64 netlink attribute
*/
static inline __le64 nla_get_le64(const struct nlattr *nla)
{
return *(__le64 *) nla_data(nla);
}
/**
* nla_get_s32 - return payload of s32 attribute
* @nla: s32 netlink attribute
*/
static inline s32 nla_get_s32(const struct nlattr *nla)
{
return *(s32 *) nla_data(nla);
}
/**
* nla_get_s16 - return payload of s16 attribute
* @nla: s16 netlink attribute
*/
static inline s16 nla_get_s16(const struct nlattr *nla)
{
return *(s16 *) nla_data(nla);
}
/**
* nla_get_s8 - return payload of s8 attribute
* @nla: s8 netlink attribute
*/
static inline s8 nla_get_s8(const struct nlattr *nla)
{
return *(s8 *) nla_data(nla);
}
/**
* nla_get_s64 - return payload of s64 attribute
* @nla: s64 netlink attribute
*/
static inline s64 nla_get_s64(const struct nlattr *nla)
{
s64 tmp;
nla_memcpy(&tmp, nla, sizeof(tmp));
return tmp;
}
/**
* nla_get_flag - return payload of flag attribute
* @nla: flag netlink attribute
*/
static inline int nla_get_flag(const struct nlattr *nla)
{
return !!nla;
}
/**
* nla_get_msecs - return payload of msecs attribute
* @nla: msecs netlink attribute
*
* Returns the number of milliseconds in jiffies.
*/
static inline unsigned long nla_get_msecs(const struct nlattr *nla)
{
u64 msecs = nla_get_u64(nla);
return msecs_to_jiffies((unsigned long) msecs);
}
/**
* nla_get_in_addr - return payload of IPv4 address attribute
* @nla: IPv4 address netlink attribute
*/
static inline __be32 nla_get_in_addr(const struct nlattr *nla)
{
return *(__be32 *) nla_data(nla);
}
/**
* nla_get_in6_addr - return payload of IPv6 address attribute
* @nla: IPv6 address netlink attribute
*/
static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla)
{
struct in6_addr tmp;
nla_memcpy(&tmp, nla, sizeof(tmp));
return tmp;
}
/**
* nla_get_bitfield32 - return payload of 32 bitfield attribute
* @nla: nla_bitfield32 attribute
*/
static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla)
{
struct nla_bitfield32 tmp;
nla_memcpy(&tmp, nla, sizeof(tmp));
return tmp;
}
/**
* nla_memdup - duplicate attribute memory (kmemdup)
* @src: netlink attribute to duplicate from
* @gfp: GFP mask
*/
static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp)
{
return kmemdup(nla_data(src), nla_len(src), gfp);
}
/**
* nla_nest_start_noflag - Start a new level of nested attributes
* @skb: socket buffer to add attributes to
* @attrtype: attribute type of container
*
* This function exists for backward compatibility to use in APIs which never
* marked their nest attributes with NLA_F_NESTED flag. New APIs should use
* nla_nest_start() which sets the flag.
*
* Returns the container attribute or NULL on error
*/
static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb,
int attrtype)
{
struct nlattr *start = (struct nlattr *)skb_tail_pointer(skb);
if (nla_put(skb, attrtype, 0, NULL) < 0)
return NULL;
return start;
}
/**
* nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED
* @skb: socket buffer to add attributes to
* @attrtype: attribute type of container
*
* Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED
* flag. This is the preferred function to use in new code.
*
* Returns the container attribute or NULL on error
*/
static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
{
return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED);
}
/**
* nla_nest_end - Finalize nesting of attributes
* @skb: socket buffer the attributes are stored in
* @start: container attribute
*
* Corrects the container attribute header to include the all
* appeneded attributes.
*
* Returns the total data length of the skb.
*/
static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start)
{
start->nla_len = skb_tail_pointer(skb) - (unsigned char *)start; return skb->len;
}
/**
* nla_nest_cancel - Cancel nesting of attributes
* @skb: socket buffer the message is stored in
* @start: container attribute
*
* Removes the container attribute and including all nested
* attributes. Returns -EMSGSIZE
*/
static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
{
nlmsg_trim(skb, start);
}
/**
* __nla_validate_nested - Validate a stream of nested attributes
* @start: container attribute
* @maxtype: maximum attribute type to be expected
* @policy: validation policy
* @validate: validation strictness
* @extack: extended ACK report struct
*
* Validates all attributes in the nested attribute stream against the
* specified policy. Attributes with a type exceeding maxtype will be
* ignored. See documenation of struct nla_policy for more details.
*
* Returns 0 on success or a negative error code.
*/
static inline int __nla_validate_nested(const struct nlattr *start, int maxtype,
const struct nla_policy *policy,
unsigned int validate,
struct netlink_ext_ack *extack)
{
return __nla_validate(nla_data(start), nla_len(start), maxtype, policy,
validate, extack);
}
static inline int
nla_validate_nested(const struct nlattr *start, int maxtype,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
return __nla_validate_nested(start, maxtype, policy,
NL_VALIDATE_STRICT, extack);
}
static inline int
nla_validate_nested_deprecated(const struct nlattr *start, int maxtype,
const struct nla_policy *policy,
struct netlink_ext_ack *extack)
{
return __nla_validate_nested(start, maxtype, policy,
NL_VALIDATE_LIBERAL, extack);
}
/**
* nla_need_padding_for_64bit - test 64-bit alignment of the next attribute
* @skb: socket buffer the message is stored in
*
* Return true if padding is needed to align the next attribute (nla_data()) to
* a 64-bit aligned area.
*/
static inline bool nla_need_padding_for_64bit(struct sk_buff *skb)
{
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/* The nlattr header is 4 bytes in size, that's why we test
* if the skb->data _is_ aligned. A NOP attribute, plus
* nlattr header for next attribute, will make nla_data()
* 8-byte aligned.
*/
if (IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8))
return true;
#endif
return false;
}
/**
* nla_align_64bit - 64-bit align the nla_data() of next attribute
* @skb: socket buffer the message is stored in
* @padattr: attribute type for the padding
*
* Conditionally emit a padding netlink attribute in order to make
* the next attribute we emit have a 64-bit aligned nla_data() area.
* This will only be done in architectures which do not have
* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined.
*
* Returns zero on success or a negative error code.
*/
static inline int nla_align_64bit(struct sk_buff *skb, int padattr)
{
if (nla_need_padding_for_64bit(skb) &&
!nla_reserve(skb, padattr, 0))
return -EMSGSIZE;
return 0;
}
/**
* nla_total_size_64bit - total length of attribute including padding
* @payload: length of payload
*/
static inline int nla_total_size_64bit(int payload)
{
return NLA_ALIGN(nla_attr_size(payload))
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ NLA_ALIGN(nla_attr_size(0))
#endif
;
}
/**
* nla_for_each_attr - iterate over a stream of attributes
* @pos: loop counter, set to current attribute
* @head: head of attribute stream
* @len: length of attribute stream
* @rem: initialized to len, holds bytes currently remaining in stream
*/
#define nla_for_each_attr(pos, head, len, rem) \
for (pos = head, rem = len; \
nla_ok(pos, rem); \
pos = nla_next(pos, &(rem)))
/**
* nla_for_each_nested - iterate over nested attributes
* @pos: loop counter, set to current attribute
* @nla: attribute containing the nested attributes
* @rem: initialized to len, holds bytes currently remaining in stream
*/
#define nla_for_each_nested(pos, nla, rem) \
nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem)
/**
* nla_is_last - Test if attribute is last in stream
* @nla: attribute to test
* @rem: bytes remaining in stream
*/
static inline bool nla_is_last(const struct nlattr *nla, int rem)
{
return nla->nla_len == rem;
}
void nla_get_range_unsigned(const struct nla_policy *pt,
struct netlink_range_validation *range);
void nla_get_range_signed(const struct nla_policy *pt,
struct netlink_range_validation_signed *range);
struct netlink_policy_dump_state;
int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate,
const struct nla_policy *policy,
unsigned int maxtype);
int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state,
const struct nla_policy *policy,
unsigned int maxtype);
bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state);
int netlink_policy_dump_write(struct sk_buff *skb,
struct netlink_policy_dump_state *state);
int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt);
int netlink_policy_dump_write_attr(struct sk_buff *skb,
const struct nla_policy *pt,
int nestattr);
void netlink_policy_dump_free(struct netlink_policy_dump_state *state);
#endif
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Macros for manipulating and testing page->flags
*/
#ifndef PAGE_FLAGS_H
#define PAGE_FLAGS_H
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/mmdebug.h>
#ifndef __GENERATING_BOUNDS_H
#include <linux/mm_types.h>
#include <generated/bounds.h>
#endif /* !__GENERATING_BOUNDS_H */
/*
* Various page->flags bits:
*
* PG_reserved is set for special pages. The "struct page" of such a page
* should in general not be touched (e.g. set dirty) except by its owner.
* Pages marked as PG_reserved include:
* - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS,
* initrd, HW tables)
* - Pages reserved or allocated early during boot (before the page allocator
* was initialized). This includes (depending on the architecture) the
* initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much
* much more. Once (if ever) freed, PG_reserved is cleared and they will
* be given to the page allocator.
* - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying
* to read/write these pages might end badly. Don't touch!
* - The zero page(s)
* - Pages not added to the page allocator when onlining a section because
* they were excluded via the online_page_callback() or because they are
* PG_hwpoison.
* - Pages allocated in the context of kexec/kdump (loaded kernel image,
* control pages, vmcoreinfo)
* - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are
* not marked PG_reserved (as they might be in use by somebody else who does
* not respect the caching strategy).
* - Pages part of an offline section (struct pages of offline sections should
* not be trusted as they will be initialized when first onlined).
* - MCA pages on ia64
* - Pages holding CPU notes for POWER Firmware Assisted Dump
* - Device memory (e.g. PMEM, DAX, HMM)
* Some PG_reserved pages will be excluded from the hibernation image.
* PG_reserved does in general not hinder anybody from dumping or swapping
* and is no longer required for remap_pfn_range(). ioremap might require it.
* Consequently, PG_reserved for a page mapped into user space can indicate
* the zero page, the vDSO, MMIO pages or device memory.
*
* The PG_private bitflag is set on pagecache pages if they contain filesystem
* specific data (which is normally at page->private). It can be used by
* private allocations for its own usage.
*
* During initiation of disk I/O, PG_locked is set. This bit is set before I/O
* and cleared when writeback _starts_ or when read _completes_. PG_writeback
* is set before writeback starts and cleared when it finishes.
*
* PG_locked also pins a page in pagecache, and blocks truncation of the file
* while it is held.
*
* page_waitqueue(page) is a wait queue of all tasks waiting for the page
* to become unlocked.
*
* PG_swapbacked is set when a page uses swap as a backing storage. This are
* usually PageAnon or shmem pages but please note that even anonymous pages
* might lose their PG_swapbacked flag when they simply can be dropped (e.g. as
* a result of MADV_FREE).
*
* PG_uptodate tells whether the page's contents is valid. When a read
* completes, the page becomes uptodate, unless a disk I/O error happened.
*
* PG_referenced, PG_reclaim are used for page reclaim for anonymous and
* file-backed pagecache (see mm/vmscan.c).
*
* PG_error is set to indicate that an I/O error occurred on this page.
*
* PG_arch_1 is an architecture specific page state bit. The generic code
* guarantees that this bit is cleared for a page when it first is entered into
* the page cache.
*
* PG_hwpoison indicates that a page got corrupted in hardware and contains
* data with incorrect ECC bits that triggered a machine check. Accessing is
* not safe since it may cause another machine check. Don't touch!
*/
/*
* Don't use the pageflags directly. Use the PageFoo macros.
*
* The page flags field is split into two parts, the main flags area
* which extends from the low bits upwards, and the fields area which
* extends from the high bits downwards.
*
* | FIELD | ... | FLAGS |
* N-1 ^ 0
* (NR_PAGEFLAGS)
*
* The fields area is reserved for fields mapping zone, node (for NUMA) and
* SPARSEMEM section (for variants of SPARSEMEM that require section ids like
* SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP).
*/
enum pageflags {
PG_locked, /* Page is locked. Don't touch. */
PG_referenced,
PG_uptodate,
PG_dirty,
PG_lru,
PG_active,
PG_workingset,
PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
PG_error,
PG_slab,
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
PG_arch_1,
PG_reserved,
PG_private, /* If pagecache, has fs-private data */
PG_private_2, /* If pagecache, has fs aux data */
PG_writeback, /* Page is under writeback */
PG_head, /* A head page */
PG_mappedtodisk, /* Has blocks allocated on-disk */
PG_reclaim, /* To be reclaimed asap */
PG_swapbacked, /* Page is backed by RAM/swap */
PG_unevictable, /* Page is "unevictable" */
#ifdef CONFIG_MMU
PG_mlocked, /* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
PG_uncached, /* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
PG_young,
PG_idle,
#endif
#ifdef CONFIG_64BIT
PG_arch_2,
#endif
#ifdef CONFIG_KASAN_HW_TAGS
PG_skip_kasan_poison,
#endif
__NR_PAGEFLAGS,
/* Filesystems */
PG_checked = PG_owner_priv_1,
/* SwapBacked */
PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */
/* Two page bits are conscripted by FS-Cache to maintain local caching
* state. These bits are set on pages belonging to the netfs's inodes
* when those inodes are being locally cached.
*/
PG_fscache = PG_private_2, /* page backed by cache */
/* XEN */
/* Pinned in Xen as a read-only pagetable page. */
PG_pinned = PG_owner_priv_1,
/* Pinned as part of domain save (see xen_mm_pin_all()). */
PG_savepinned = PG_dirty,
/* Has a grant mapping of another (foreign) domain's page. */
PG_foreign = PG_owner_priv_1,
/* Remapped by swiotlb-xen. */
PG_xen_remapped = PG_owner_priv_1,
/* SLOB */
PG_slob_free = PG_private,
/* Compound pages. Stored in first tail page's flags */
PG_double_map = PG_workingset,
#ifdef CONFIG_MEMORY_FAILURE
/*
* Compound pages. Stored in first tail page's flags.
* Indicates that at least one subpage is hwpoisoned in the
* THP.
*/
PG_has_hwpoisoned = PG_mappedtodisk,
#endif
/* non-lru isolated movable page */
PG_isolated = PG_reclaim,
/* Only valid for buddy pages. Used to track pages that are reported */
PG_reported = PG_uptodate,
};
#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
#ifndef __GENERATING_BOUNDS_H
static inline unsigned long _compound_head(const struct page *page)
{
unsigned long head = READ_ONCE(page->compound_head);
if (unlikely(head & 1))
return head - 1; return (unsigned long)page;
}
#define compound_head(page) ((typeof(page))_compound_head(page))
static __always_inline int PageTail(struct page *page)
{
return READ_ONCE(page->compound_head) & 1;
}
static __always_inline int PageCompound(struct page *page)
{
return test_bit(PG_head, &page->flags) || PageTail(page);
}
#define PAGE_POISON_PATTERN -1l
static inline int PagePoisoned(const struct page *page)
{
return page->flags == PAGE_POISON_PATTERN;
}
#ifdef CONFIG_DEBUG_VM
void page_init_poison(struct page *page, size_t size);
#else
static inline void page_init_poison(struct page *page, size_t size)
{
}
#endif
/*
* Page flags policies wrt compound pages
*
* PF_POISONED_CHECK
* check if this struct page poisoned/uninitialized
*
* PF_ANY:
* the page flag is relevant for small, head and tail pages.
*
* PF_HEAD:
* for compound page all operations related to the page flag applied to
* head page.
*
* PF_ONLY_HEAD:
* for compound page, callers only ever operate on the head page.
*
* PF_NO_TAIL:
* modifications of the page flag must be done on small or head pages,
* checks can be done on tail pages too.
*
* PF_NO_COMPOUND:
* the page flag is not relevant for compound pages.
*
* PF_SECOND:
* the page flag is stored in the first tail page.
*/
#define PF_POISONED_CHECK(page) ({ \
VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \
page; })
#define PF_ANY(page, enforce) PF_POISONED_CHECK(page)
#define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page))
#define PF_ONLY_HEAD(page, enforce) ({ \
VM_BUG_ON_PGFLAGS(PageTail(page), page); \
PF_POISONED_CHECK(page); })
#define PF_NO_TAIL(page, enforce) ({ \
VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \
PF_POISONED_CHECK(compound_head(page)); })
#define PF_NO_COMPOUND(page, enforce) ({ \
VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \
PF_POISONED_CHECK(page); })
#define PF_SECOND(page, enforce) ({ \
VM_BUG_ON_PGFLAGS(!PageHead(page), page); \
PF_POISONED_CHECK(&page[1]); })
/*
* Macros to create function definitions for page flags
*/
#define TESTPAGEFLAG(uname, lname, policy) \
static __always_inline int Page##uname(struct page *page) \
{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
#define SETPAGEFLAG(uname, lname, policy) \
static __always_inline void SetPage##uname(struct page *page) \
{ set_bit(PG_##lname, &policy(page, 1)->flags); }
#define CLEARPAGEFLAG(uname, lname, policy) \
static __always_inline void ClearPage##uname(struct page *page) \
{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define __SETPAGEFLAG(uname, lname, policy) \
static __always_inline void __SetPage##uname(struct page *page) \
{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
#define __CLEARPAGEFLAG(uname, lname, policy) \
static __always_inline void __ClearPage##uname(struct page *page) \
{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define TESTSETFLAG(uname, lname, policy) \
static __always_inline int TestSetPage##uname(struct page *page) \
{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
#define TESTCLEARFLAG(uname, lname, policy) \
static __always_inline int TestClearPage##uname(struct page *page) \
{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define PAGEFLAG(uname, lname, policy) \
TESTPAGEFLAG(uname, lname, policy) \
SETPAGEFLAG(uname, lname, policy) \
CLEARPAGEFLAG(uname, lname, policy)
#define __PAGEFLAG(uname, lname, policy) \
TESTPAGEFLAG(uname, lname, policy) \
__SETPAGEFLAG(uname, lname, policy) \
__CLEARPAGEFLAG(uname, lname, policy)
#define TESTSCFLAG(uname, lname, policy) \
TESTSETFLAG(uname, lname, policy) \
TESTCLEARFLAG(uname, lname, policy)
#define TESTPAGEFLAG_FALSE(uname) \
static inline int Page##uname(const struct page *page) { return 0; }
#define SETPAGEFLAG_NOOP(uname) \
static inline void SetPage##uname(struct page *page) { }
#define CLEARPAGEFLAG_NOOP(uname) \
static inline void ClearPage##uname(struct page *page) { }
#define __CLEARPAGEFLAG_NOOP(uname) \
static inline void __ClearPage##uname(struct page *page) { }
#define TESTSETFLAG_FALSE(uname) \
static inline int TestSetPage##uname(struct page *page) { return 0; }
#define TESTCLEARFLAG_FALSE(uname) \
static inline int TestClearPage##uname(struct page *page) { return 0; }
#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \
SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)
#define TESTSCFLAG_FALSE(uname) \
TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
__PAGEFLAG(Locked, locked, PF_NO_TAIL)PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)PAGEFLAG(Referenced, referenced, PF_HEAD)
TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
__SETPAGEFLAG(Referenced, referenced, PF_HEAD)PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
__CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) TESTCLEARFLAG(LRU, lru, PF_HEAD)PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
TESTCLEARFLAG(Active, active, PF_HEAD)
PAGEFLAG(Workingset, workingset, PF_HEAD)
TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
__PAGEFLAG(Slab, slab, PF_NO_TAIL)
__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
/* Xen */
PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND);
PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
__CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
__SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
__CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
__SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
/*
* Private page markings that may be used by the filesystem that owns the page
* for its own purposes.
* - PG_private and PG_private_2 cause releasepage() and co to be invoked
*/
PAGEFLAG(Private, private, PF_ANY)PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
/*
* Only test-and-set exist for PG_writeback. The unconditional operators are
* risky: they bypass page accounting.
*/
TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
/* PG_readahead is only used for reads; PG_reclaim is only for writes */
PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
#ifdef CONFIG_HIGHMEM
/*
* Must use a macro here due to header dependency issues. page_zone() is not
* available at this point.
*/
#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
#else
PAGEFLAG_FALSE(HighMem)
#endif
#ifdef CONFIG_SWAP
static __always_inline int PageSwapCache(struct page *page)
{
#ifdef CONFIG_THP_SWAP
page = compound_head(page);
#endif
return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags);
}
SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(SwapCache)
#endif
PAGEFLAG(Unevictable, unevictable, PF_HEAD) __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD) TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
#ifdef CONFIG_MMU
PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
__CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
TESTSCFLAG_FALSE(Mlocked)
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(Uncached)
#endif
#ifdef CONFIG_MEMORY_FAILURE
PAGEFLAG(HWPoison, hwpoison, PF_ANY)
TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
extern bool take_page_off_buddy(struct page *page);
#else
PAGEFLAG_FALSE(HWPoison)
#define __PG_HWPOISON 0
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
TESTPAGEFLAG(Young, young, PF_ANY)
SETPAGEFLAG(Young, young, PF_ANY)
TESTCLEARFLAG(Young, young, PF_ANY)
PAGEFLAG(Idle, idle, PF_ANY)
#endif
#ifdef CONFIG_KASAN_HW_TAGS
PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD)
#else
PAGEFLAG_FALSE(SkipKASanPoison)
#endif
/*
* PageReported() is used to track reported free pages within the Buddy
* allocator. We can use the non-atomic version of the test and set
* operations as both should be shielded with the zone lock to prevent
* any possible races on the setting or clearing of the bit.
*/
__PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
/*
* On an anonymous page mapped into a user virtual memory area,
* page->mapping points to its anon_vma, not to a struct address_space;
* with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h.
*
* On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
* the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON
* bit; and then page->mapping points, not to an anon_vma, but to a private
* structure which KSM associates with that merged page. See ksm.h.
*
* PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable
* page and then page->mapping points a struct address_space.
*
* Please note that, confusingly, "page_mapping" refers to the inode
* address_space which maps the page from disk; whereas "page_mapped"
* refers to user virtual address space into which the page is mapped.
*/
#define PAGE_MAPPING_ANON 0x1
#define PAGE_MAPPING_MOVABLE 0x2
#define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
static __always_inline int PageMappingFlags(struct page *page)
{
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0;
}
static __always_inline int PageAnon(struct page *page)
{
page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}
static __always_inline int __PageMovable(struct page *page)
{
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
PAGE_MAPPING_MOVABLE;
}
#ifdef CONFIG_KSM
/*
* A KSM page is one of those write-protected "shared pages" or "merged pages"
* which KSM maps into multiple mms, wherever identical anonymous page content
* is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any
* anon_vma, but to that page's node of the stable tree.
*/
static __always_inline int PageKsm(struct page *page)
{
page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
PAGE_MAPPING_KSM;
}
#else
TESTPAGEFLAG_FALSE(Ksm)
#endif
u64 stable_page_flags(struct page *page);
static inline int PageUptodate(struct page *page)
{
int ret;
page = compound_head(page);
ret = test_bit(PG_uptodate, &(page)->flags);
/*
* Must ensure that the data we read out of the page is loaded
* _after_ we've loaded page->flags to check for PageUptodate.
* We can skip the barrier if the page is not uptodate, because
* we wouldn't be reading anything from it.
*
* See SetPageUptodate() for the other side of the story.
*/
if (ret)
smp_rmb(); return ret;
}
static __always_inline void __SetPageUptodate(struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
smp_wmb();
__set_bit(PG_uptodate, &page->flags);
}
static __always_inline void SetPageUptodate(struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
/*
* Memory barrier must be issued before setting the PG_uptodate bit,
* so that all previous stores issued in order to bring the page
* uptodate are actually visible before PageUptodate becomes true.
*/
smp_wmb();
set_bit(PG_uptodate, &page->flags);
}
CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
int test_clear_page_writeback(struct page *page);
int __test_set_page_writeback(struct page *page, bool keep_write);
#define test_set_page_writeback(page) \
__test_set_page_writeback(page, false)
#define test_set_page_writeback_keepwrite(page) \
__test_set_page_writeback(page, true)
static inline void set_page_writeback(struct page *page)
{
test_set_page_writeback(page);
}
static inline void set_page_writeback_keepwrite(struct page *page)
{
test_set_page_writeback_keepwrite(page);
}
__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
static __always_inline void set_compound_head(struct page *page, struct page *head)
{
WRITE_ONCE(page->compound_head, (unsigned long)head + 1);
}
static __always_inline void clear_compound_head(struct page *page)
{
WRITE_ONCE(page->compound_head, 0);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void ClearPageCompound(struct page *page)
{
BUG_ON(!PageHead(page));
ClearPageHead(page);
}
#endif
#define PG_head_mask ((1UL << PG_head))
#ifdef CONFIG_HUGETLB_PAGE
int PageHuge(struct page *page);
int PageHeadHuge(struct page *page);
#else
TESTPAGEFLAG_FALSE(Huge)
TESTPAGEFLAG_FALSE(HeadHuge)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* PageHuge() only returns true for hugetlbfs pages, but not for
* normal or transparent huge pages.
*
* PageTransHuge() returns true for both transparent huge and
* hugetlbfs pages, but not normal pages. PageTransHuge() can only be
* called only in the core VM paths where hugetlbfs pages can't exist.
*/
static inline int PageTransHuge(struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
return PageHead(page);
}
/*
* PageTransCompound returns true for both transparent huge pages
* and hugetlbfs pages, so it should only be called when it's known
* that hugetlbfs pages aren't involved.
*/
static inline int PageTransCompound(struct page *page)
{
return PageCompound(page);
}
/*
* PageTransTail returns true for both transparent huge pages
* and hugetlbfs pages, so it should only be called when it's known
* that hugetlbfs pages aren't involved.
*/
static inline int PageTransTail(struct page *page)
{
return PageTail(page);
}
/*
* PageDoubleMap indicates that the compound page is mapped with PTEs as well
* as PMDs.
*
* This is required for optimization of rmap operations for THP: we can postpone
* per small page mapcount accounting (and its overhead from atomic operations)
* until the first PMD split.
*
* For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up
* by one. This reference will go away with last compound_mapcount.
*
* See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap().
*/
PAGEFLAG(DoubleMap, double_map, PF_SECOND)
TESTSCFLAG(DoubleMap, double_map, PF_SECOND)
#else
TESTPAGEFLAG_FALSE(TransHuge)
TESTPAGEFLAG_FALSE(TransCompound)
TESTPAGEFLAG_FALSE(TransCompoundMap)
TESTPAGEFLAG_FALSE(TransTail)
PAGEFLAG_FALSE(DoubleMap)
TESTSCFLAG_FALSE(DoubleMap)
#endif
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
* PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the
* compound page.
*
* This flag is set by hwpoison handler. Cleared by THP split or free page.
*/
PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
#else
PAGEFLAG_FALSE(HasHWPoisoned)
TESTSCFLAG_FALSE(HasHWPoisoned)
#endif
/*
* Check if a page is currently marked HWPoisoned. Note that this check is
* best effort only and inherently racy: there is no way to synchronize with
* failing hardware.
*/
static inline bool is_page_hwpoison(struct page *page)
{
if (PageHWPoison(page))
return true;
return PageHuge(page) && PageHWPoison(compound_head(page));
}
/*
* For pages that are never mapped to userspace (and aren't PageSlab),
* page_type may be used. Because it is initialised to -1, we invert the
* sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
* __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and
* low bits so that an underflow or overflow of page_mapcount() won't be
* mistaken for a page type value.
*/
#define PAGE_TYPE_BASE 0xf0000000
/* Reserve 0x0000007f to catch underflows of page_mapcount */
#define PAGE_MAPCOUNT_RESERVE -128
#define PG_buddy 0x00000080
#define PG_offline 0x00000100
#define PG_table 0x00000200
#define PG_guard 0x00000400
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
static inline int page_has_type(struct page *page)
{
return (int)page->page_type < PAGE_MAPCOUNT_RESERVE;
}
#define PAGE_TYPE_OPS(uname, lname) \
static __always_inline int Page##uname(struct page *page) \
{ \
return PageType(page, PG_##lname); \
} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
VM_BUG_ON_PAGE(!PageType(page, 0), page); \
page->page_type &= ~PG_##lname; \
} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
page->page_type |= PG_##lname; \
}
/*
* PageBuddy() indicates that the page is free and in the buddy system
* (see mm/page_alloc.c).
*/
PAGE_TYPE_OPS(Buddy, buddy)
/*
* PageOffline() indicates that the page is logically offline although the
* containing section is online. (e.g. inflated in a balloon driver or
* not onlined when onlining the section).
* The content of these pages is effectively stale. Such pages should not
* be touched (read/write/dump/save) except by their owner.
*
* If a driver wants to allow to offline unmovable PageOffline() pages without
* putting them back to the buddy, it can do so via the memory notifier by
* decrementing the reference count in MEM_GOING_OFFLINE and incrementing the
* reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline()
* pages (now with a reference count of zero) are treated like free pages,
* allowing the containing memory block to get offlined. A driver that
* relies on this feature is aware that re-onlining the memory block will
* require to re-set the pages PageOffline() and not giving them to the
* buddy via online_page_callback_t.
*
* There are drivers that mark a page PageOffline() and expect there won't be
* any further access to page content. PFN walkers that read content of random
* pages should check PageOffline() and synchronize with such drivers using
* page_offline_freeze()/page_offline_thaw().
*/
PAGE_TYPE_OPS(Offline, offline)
extern void page_offline_freeze(void);
extern void page_offline_thaw(void);
extern void page_offline_begin(void);
extern void page_offline_end(void);
/*
* Marks pages in use as page tables.
*/
PAGE_TYPE_OPS(Table, table)
/*
* Marks guardpages used with debug_pagealloc.
*/
PAGE_TYPE_OPS(Guard, guard)
extern bool is_free_buddy_page(struct page *page);
__PAGEFLAG(Isolated, isolated, PF_ANY);
/*
* If network-based swap is enabled, sl*b must keep track of whether pages
* were allocated from pfmemalloc reserves.
*/
static inline int PageSlabPfmemalloc(struct page *page)
{
VM_BUG_ON_PAGE(!PageSlab(page), page);
return PageActive(page);
}
/*
* A version of PageSlabPfmemalloc() for opportunistic checks where the page
* might have been freed under us and not be a PageSlab anymore.
*/
static inline int __PageSlabPfmemalloc(struct page *page)
{
return PageActive(page);
}
static inline void SetPageSlabPfmemalloc(struct page *page)
{
VM_BUG_ON_PAGE(!PageSlab(page), page);
SetPageActive(page);
}
static inline void __ClearPageSlabPfmemalloc(struct page *page)
{
VM_BUG_ON_PAGE(!PageSlab(page), page);
__ClearPageActive(page);
}
static inline void ClearPageSlabPfmemalloc(struct page *page)
{
VM_BUG_ON_PAGE(!PageSlab(page), page);
ClearPageActive(page);
}
#ifdef CONFIG_MMU
#define __PG_MLOCKED (1UL << PG_mlocked)
#else
#define __PG_MLOCKED 0
#endif
/*
* Flags checked when a page is freed. Pages being freed should not have
* these flags set. If they are, there is a problem.
*/
#define PAGE_FLAGS_CHECK_AT_FREE \
(1UL << PG_lru | 1UL << PG_locked | \
1UL << PG_private | 1UL << PG_private_2 | \
1UL << PG_writeback | 1UL << PG_reserved | \
1UL << PG_slab | 1UL << PG_active | \
1UL << PG_unevictable | __PG_MLOCKED)
/*
* Flags checked when a page is prepped for return by the page allocator.
* Pages being prepped should not have these flags set. If they are set,
* there has been a kernel bug or struct page corruption.
*
* __PG_HWPOISON is exceptional because it needs to be kept beyond page's
* alloc-free cycle to prevent from reusing the page.
*/
#define PAGE_FLAGS_CHECK_AT_PREP \
(PAGEFLAGS_MASK & ~__PG_HWPOISON)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
/**
* page_has_private - Determine if page has private stuff
* @page: The page to be checked
*
* Determine if a page has private stuff, indicating that release routines
* should be invoked upon it.
*/
static inline int page_has_private(struct page *page)
{
return !!(page->flags & PAGE_FLAGS_PRIVATE);
}
#undef PF_ANY
#undef PF_HEAD
#undef PF_ONLY_HEAD
#undef PF_NO_TAIL
#undef PF_NO_COMPOUND
#undef PF_SECOND
#endif /* !__GENERATING_BOUNDS_H */
#endif /* PAGE_FLAGS_H */
// SPDX-License-Identifier: GPL-2.0
/*
* NETLINK Netlink attributes
*
* Authors: Thomas Graf <tgraf@suug.ch>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*/
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/netlink.h>
/* For these data types, attribute length should be exactly the given
* size. However, to maintain compatibility with broken commands, if the
* attribute length does not match the expected size a warning is emitted
* to the user that the command is sending invalid data and needs to be fixed.
*/
static const u8 nla_attr_len[NLA_TYPE_MAX+1] = {
[NLA_U8] = sizeof(u8),
[NLA_U16] = sizeof(u16),
[NLA_U32] = sizeof(u32),
[NLA_U64] = sizeof(u64),
[NLA_S8] = sizeof(s8),
[NLA_S16] = sizeof(s16),
[NLA_S32] = sizeof(s32),
[NLA_S64] = sizeof(s64),
};
static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
[NLA_U8] = sizeof(u8),
[NLA_U16] = sizeof(u16),
[NLA_U32] = sizeof(u32),
[NLA_U64] = sizeof(u64),
[NLA_MSECS] = sizeof(u64),
[NLA_NESTED] = NLA_HDRLEN,
[NLA_S8] = sizeof(s8),
[NLA_S16] = sizeof(s16),
[NLA_S32] = sizeof(s32),
[NLA_S64] = sizeof(s64),
};
/*
* Nested policies might refer back to the original
* policy in some cases, and userspace could try to
* abuse that and recurse by nesting in the right
* ways. Limit recursion to avoid this problem.
*/
#define MAX_POLICY_RECURSION_DEPTH 10
static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype,
const struct nla_policy *policy,
unsigned int validate,
struct netlink_ext_ack *extack,
struct nlattr **tb, unsigned int depth);
static int validate_nla_bitfield32(const struct nlattr *nla,
const u32 valid_flags_mask)
{
const struct nla_bitfield32 *bf = nla_data(nla);
if (!valid_flags_mask)
return -EINVAL;
/*disallow invalid bit selector */
if (bf->selector & ~valid_flags_mask)
return -EINVAL;
/*disallow invalid bit values */
if (bf->value & ~valid_flags_mask)
return -EINVAL;
/*disallow valid bit values that are not selected*/
if (bf->value & ~bf->selector)
return -EINVAL;
return 0;
}
static int nla_validate_array(const struct nlattr *head, int len, int maxtype,
const struct nla_policy *policy,
struct netlink_ext_ack *extack,
unsigned int validate, unsigned int depth)
{
const struct nlattr *entry;
int rem;
nla_for_each_attr(entry, head, len, rem) {
int ret;
if (nla_len(entry) == 0)
continue;
if (nla_len(entry) < NLA_HDRLEN) { NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy,
"Array element too short");
return -ERANGE;
}
ret = __nla_validate_parse(nla_data(entry), nla_len(entry),
maxtype, policy, validate, extack,
NULL, depth + 1);
if (ret < 0)
return ret;
}
return 0;
}
void nla_get_range_unsigned(const struct nla_policy *pt,
struct netlink_range_validation *range)
{
WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR &&
(pt->min < 0 || pt->max < 0));
range->min = 0;
switch (pt->type) {
case NLA_U8:
range->max = U8_MAX;
break;
case NLA_U16:
case NLA_BINARY:
range->max = U16_MAX;
break;
case NLA_U32:
range->max = U32_MAX;
break;
case NLA_U64:
case NLA_MSECS:
range->max = U64_MAX;
break;
default:
WARN_ON_ONCE(1);
return;
}
switch (pt->validation_type) {
case NLA_VALIDATE_RANGE:
case NLA_VALIDATE_RANGE_WARN_TOO_LONG:
range->min = pt->min;
range->max = pt->max;
break;
case NLA_VALIDATE_RANGE_PTR:
*range = *pt->range;
break;
case NLA_VALIDATE_MIN:
range->min = pt->min;
break;
case NLA_VALIDATE_MAX:
range->max = pt->max;
break;
default:
break;
}
}
static int nla_validate_range_unsigned(const struct nla_policy *pt,
const struct nlattr *nla,
struct netlink_ext_ack *extack,
unsigned int validate)
{
struct netlink_range_validation range;
u64 value;
switch (pt->type) {
case NLA_U8:
value = nla_get_u8(nla);
break;
case NLA_U16:
value = nla_get_u16(nla);
break;
case NLA_U32:
value = nla_get_u32(nla);
break;
case NLA_U64:
case NLA_MSECS:
value = nla_get_u64(nla);
break;
case NLA_BINARY:
value = nla_len(nla);
break;
default:
return -EINVAL;
}
nla_get_range_unsigned(pt, &range);
if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG &&
pt->type == NLA_BINARY && value > range.max) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
current->comm, pt->type);
if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
"invalid attribute length");
return -EINVAL;
}
/* this assumes min <= max (don't validate against min) */
return 0;
}
if (value < range.min || value > range.max) { bool binary = pt->type == NLA_BINARY;
if (binary)
NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
"binary attribute size out of range");
else
NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
"integer out of range");
return -ERANGE;
}
return 0;
}
void nla_get_range_signed(const struct nla_policy *pt,
struct netlink_range_validation_signed *range)
{
switch (pt->type) {
case NLA_S8:
range->min = S8_MIN;
range->max = S8_MAX;
break;
case NLA_S16:
range->min = S16_MIN;
range->max = S16_MAX;
break;
case NLA_S32:
range->min = S32_MIN;
range->max = S32_MAX;
break;
case NLA_S64:
range->min = S64_MIN;
range->max = S64_MAX;
break;
default:
WARN_ON_ONCE(1);
return;
}
switch (pt->validation_type) {
case NLA_VALIDATE_RANGE:
range->min = pt->min;
range->max = pt->max;
break;
case NLA_VALIDATE_RANGE_PTR:
*range = *pt->range_signed;
break;
case NLA_VALIDATE_MIN:
range->min = pt->min;
break;
case NLA_VALIDATE_MAX:
range->max = pt->max;
break;
default:
break;
}
}
static int nla_validate_int_range_signed(const struct nla_policy *pt,
const struct nlattr *nla,
struct netlink_ext_ack *extack)
{
struct netlink_range_validation_signed range;
s64 value;
switch (pt->type) {
case NLA_S8:
value = nla_get_s8(nla);
break;
case NLA_S16:
value = nla_get_s16(nla);
break;
case NLA_S32:
value = nla_get_s32(nla);
break;
case NLA_S64:
value = nla_get_s64(nla);
break;
default:
return -EINVAL;
}
nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
"integer out of range");
return -ERANGE;
}
return 0;
}
static int nla_validate_int_range(const struct nla_policy *pt,
const struct nlattr *nla,
struct netlink_ext_ack *extack,
unsigned int validate)
{
switch (pt->type) {
case NLA_U8:
case NLA_U16:
case NLA_U32:
case NLA_U64:
case NLA_MSECS:
case NLA_BINARY:
return nla_validate_range_unsigned(pt, nla, extack, validate);
case NLA_S8:
case NLA_S16:
case NLA_S32:
case NLA_S64:
return nla_validate_int_range_signed(pt, nla, extack);
default:
WARN_ON(1);
return -EINVAL;
}
}
static int nla_validate_mask(const struct nla_policy *pt,
const struct nlattr *nla,
struct netlink_ext_ack *extack)
{
u64 value;
switch (pt->type) {
case NLA_U8:
value = nla_get_u8(nla);
break;
case NLA_U16:
value = nla_get_u16(nla);
break;
case NLA_U32:
value = nla_get_u32(nla);
break;
case NLA_U64:
value = nla_get_u64(nla);
break;
default:
return -EINVAL;
}
if (value & ~(u64)pt->mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set");
return -EINVAL;
}
return 0;
}
static int validate_nla(const struct nlattr *nla, int maxtype,
const struct nla_policy *policy, unsigned int validate,
struct netlink_ext_ack *extack, unsigned int depth)
{
u16 strict_start_type = policy[0].strict_start_type;
const struct nla_policy *pt;
int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla);
int err = -ERANGE;
if (strict_start_type && type >= strict_start_type) validate |= NL_VALIDATE_STRICT;
if (type <= 0 || type > maxtype)
return 0;
pt = &policy[type]; BUG_ON(pt->type > NLA_TYPE_MAX); if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
current->comm, type);
if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
"invalid attribute length");
return -EINVAL;
}
}
if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
"NLA_F_NESTED is missing");
return -EINVAL;
}
if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
"NLA_F_NESTED not expected");
return -EINVAL;
}
}
switch (pt->type) {
case NLA_REJECT:
if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla);
extack->_msg = pt->reject_message;
return -EINVAL;
}
err = -EINVAL;
goto out_err;
case NLA_FLAG:
if (attrlen > 0)
goto out_err;
break;
case NLA_BITFIELD32:
if (attrlen != sizeof(struct nla_bitfield32))
goto out_err;
err = validate_nla_bitfield32(nla, pt->bitfield32_valid);
if (err)
goto out_err;
break;
case NLA_NUL_STRING:
if (pt->len) minlen = min_t(int, attrlen, pt->len + 1);
else
minlen = attrlen;
if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) {
err = -EINVAL;
goto out_err;
}
fallthrough;
case NLA_STRING:
if (attrlen < 1)
goto out_err;
if (pt->len) {
char *buf = nla_data(nla);
if (buf[attrlen - 1] == '\0')
attrlen--;
if (attrlen > pt->len)
goto out_err;
}
break;
case NLA_BINARY:
if (pt->len && attrlen > pt->len)
goto out_err;
break;
case NLA_NESTED:
/* a nested attributes is allowed to be empty; if its not,
* it must have a size of at least NLA_HDRLEN.
*/
if (attrlen == 0)
break;
if (attrlen < NLA_HDRLEN)
goto out_err;
if (pt->nested_policy) { err = __nla_validate_parse(nla_data(nla), nla_len(nla),
pt->len, pt->nested_policy,
validate, extack, NULL,
depth + 1);
if (err < 0) {
/*
* return directly to preserve the inner
* error message/attribute pointer
*/
return err;
}
}
break;
case NLA_NESTED_ARRAY:
/* a nested array attribute is allowed to be empty; if its not,
* it must have a size of at least NLA_HDRLEN.
*/
if (attrlen == 0)
break;
if (attrlen < NLA_HDRLEN)
goto out_err;
if (pt->nested_policy) {
int err;
err = nla_validate_array(nla_data(nla), nla_len(nla),
pt->len, pt->nested_policy,
extack, validate, depth);
if (err < 0) {
/*
* return directly to preserve the inner
* error message/attribute pointer
*/
return err;
}
}
break;
case NLA_UNSPEC:
if (validate & NL_VALIDATE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, nla,
"Unsupported attribute");
return -EINVAL;
}
if (attrlen < pt->len)
goto out_err;
break;
default:
if (pt->len) minlen = pt->len;
else
minlen = nla_attr_minlen[pt->type];
if (attrlen < minlen)
goto out_err;
}
/* further validation */
switch (pt->validation_type) {
case NLA_VALIDATE_NONE:
/* nothing to do */
break;
case NLA_VALIDATE_RANGE_PTR:
case NLA_VALIDATE_RANGE:
case NLA_VALIDATE_RANGE_WARN_TOO_LONG:
case NLA_VALIDATE_MIN:
case NLA_VALIDATE_MAX:
err = nla_validate_int_range(pt, nla, extack, validate);
if (err)
return err;
break;
case NLA_VALIDATE_MASK:
err = nla_validate_mask(pt, nla, extack);
if (err)
return err;
break;
case NLA_VALIDATE_FUNCTION:
if (pt->validate) { err = pt->validate(nla, extack);
if (err)
return err;
}
break;
}
return 0;
out_err:
NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
"Attribute failed policy validation");
return err;
}
static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype,
const struct nla_policy *policy,
unsigned int validate,
struct netlink_ext_ack *extack,
struct nlattr **tb, unsigned int depth)
{
const struct nlattr *nla;
int rem;
if (depth >= MAX_POLICY_RECURSION_DEPTH) { NL_SET_ERR_MSG(extack,
"allowed policy recursion depth exceeded");
return -EINVAL;
}
if (tb) memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
nla_for_each_attr(nla, head, len, rem) {
u16 type = nla_type(nla); if (type == 0 || type > maxtype) { if (validate & NL_VALIDATE_MAXTYPE) { NL_SET_ERR_MSG_ATTR(extack, nla,
"Unknown attribute type");
return -EINVAL;
}
continue;
}
if (policy) {
int err = validate_nla(nla, maxtype, policy,
validate, extack, depth);
if (err < 0)
return err;
}
if (tb) tb[type] = (struct nlattr *)nla;
}
if (unlikely(rem > 0)) { pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n",
rem, current->comm);
NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes"); if (validate & NL_VALIDATE_TRAILING)
return -EINVAL;
}
return 0;
}
/**
* __nla_validate - Validate a stream of attributes
* @head: head of attribute stream
* @len: length of attribute stream
* @maxtype: maximum attribute type to be expected
* @policy: validation policy
* @validate: validation strictness
* @extack: extended ACK report struct
*
* Validates all attributes in the specified attribute stream against the
* specified policy. Validation depends on the validate flags passed, see
* &enum netlink_validation for more details on that.
* See documentation of struct nla_policy for more details.
*
* Returns 0 on success or a negative error code.
*/
int __nla_validate(const struct nlattr *head, int len, int maxtype,
const struct nla_policy *policy, unsigned int validate,
struct netlink_ext_ack *extack)
{
return __nla_validate_parse(head, len, maxtype, policy, validate,
extack, NULL, 0);
}
EXPORT_SYMBOL(__nla_validate);
/**
* nla_policy_len - Determine the max. length of a policy
* @policy: policy to use
* @n: number of policies
*
* Determines the max. length of the policy. It is currently used
* to allocated Netlink buffers roughly the size of the actual
* message.
*
* Returns 0 on success or a negative error code.
*/
int
nla_policy_len(const struct nla_policy *p, int n)
{
int i, len = 0;
for (i = 0; i < n; i++, p++) {
if (p->len)
len += nla_total_size(p->len);
else if (nla_attr_len[p->type])
len += nla_total_size(nla_attr_len[p->type]);
else if (nla_attr_minlen[p->type])
len += nla_total_size(nla_attr_minlen[p->type]);
}
return len;
}
EXPORT_SYMBOL(nla_policy_len);
/**
* __nla_parse - Parse a stream of attributes into a tb buffer
* @tb: destination array with maxtype+1 elements
* @maxtype: maximum attribute type to be expected
* @head: head of attribute stream
* @len: length of attribute stream
* @policy: validation policy
* @validate: validation strictness
* @extack: extended ACK pointer
*
* Parses a stream of attributes and stores a pointer to each attribute in
* the tb array accessible via the attribute type.
* Validation is controlled by the @validate parameter.
*
* Returns 0 on success or a negative error code.
*/
int __nla_parse(struct nlattr **tb, int maxtype,
const struct nlattr *head, int len,
const struct nla_policy *policy, unsigned int validate,
struct netlink_ext_ack *extack)
{
return __nla_validate_parse(head, len, maxtype, policy, validate,
extack, tb, 0);
}
EXPORT_SYMBOL(__nla_parse);
/**
* nla_find - Find a specific attribute in a stream of attributes
* @head: head of attribute stream
* @len: length of attribute stream
* @attrtype: type of attribute to look for
*
* Returns the first attribute in the stream matching the specified type.
*/
struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype)
{
const struct nlattr *nla;
int rem;
nla_for_each_attr(nla, head, len, rem)
if (nla_type(nla) == attrtype)
return (struct nlattr *)nla;
return NULL;
}
EXPORT_SYMBOL(nla_find);
/**
* nla_strscpy - Copy string attribute payload into a sized buffer
* @dst: Where to copy the string to.
* @nla: Attribute to copy the string from.
* @dstsize: Size of destination buffer.
*
* Copies at most dstsize - 1 bytes into the destination buffer.
* Unlike strlcpy the destination buffer is always padded out.
*
* Return:
* * srclen - Returns @nla length (not including the trailing %NUL).
* * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater
* than @dstsize.
*/
ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize)
{
size_t srclen = nla_len(nla);
char *src = nla_data(nla);
ssize_t ret;
size_t len;
if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX))
return -E2BIG;
if (srclen > 0 && src[srclen - 1] == '\0')
srclen--;
if (srclen >= dstsize) {
len = dstsize - 1;
ret = -E2BIG;
} else {
len = srclen;
ret = len;
}
memcpy(dst, src, len);
/* Zero pad end of dst. */
memset(dst + len, 0, dstsize - len);
return ret;
}
EXPORT_SYMBOL(nla_strscpy);
/**
* nla_strdup - Copy string attribute payload into a newly allocated buffer
* @nla: attribute to copy the string from
* @flags: the type of memory to allocate (see kmalloc).
*
* Returns a pointer to the allocated buffer or NULL on error.
*/
char *nla_strdup(const struct nlattr *nla, gfp_t flags)
{
size_t srclen = nla_len(nla);
char *src = nla_data(nla), *dst;
if (srclen > 0 && src[srclen - 1] == '\0')
srclen--;
dst = kmalloc(srclen + 1, flags);
if (dst != NULL) {
memcpy(dst, src, srclen);
dst[srclen] = '\0';
}
return dst;
}
EXPORT_SYMBOL(nla_strdup);
/**
* nla_memcpy - Copy a netlink attribute into another memory area
* @dest: where to copy to memcpy
* @src: netlink attribute to copy from
* @count: size of the destination area
*
* Note: The number of bytes copied is limited by the length of
* attribute's payload. memcpy
*
* Returns the number of bytes copied.
*/
int nla_memcpy(void *dest, const struct nlattr *src, int count)
{
int minlen = min_t(int, count, nla_len(src));
memcpy(dest, nla_data(src), minlen);
if (count > minlen)
memset(dest + minlen, 0, count - minlen);
return minlen;
}
EXPORT_SYMBOL(nla_memcpy);
/**
* nla_memcmp - Compare an attribute with sized memory area
* @nla: netlink attribute
* @data: memory area
* @size: size of memory area
*/
int nla_memcmp(const struct nlattr *nla, const void *data,
size_t size)
{
int d = nla_len(nla) - size;
if (d == 0)
d = memcmp(nla_data(nla), data, size);
return d;
}
EXPORT_SYMBOL(nla_memcmp);
/**
* nla_strcmp - Compare a string attribute against a string
* @nla: netlink string attribute
* @str: another string
*/
int nla_strcmp(const struct nlattr *nla, const char *str)
{
int len = strlen(str);
char *buf = nla_data(nla);
int attrlen = nla_len(nla);
int d;
while (attrlen > 0 && buf[attrlen - 1] == '\0')
attrlen--;
d = attrlen - len;
if (d == 0)
d = memcmp(nla_data(nla), str, len);
return d;
}
EXPORT_SYMBOL(nla_strcmp);
#ifdef CONFIG_NET
/**
* __nla_reserve - reserve room for attribute on the skb
* @skb: socket buffer to reserve room on
* @attrtype: attribute type
* @attrlen: length of attribute payload
*
* Adds a netlink attribute header to a socket buffer and reserves
* room for the payload but does not copy it.
*
* The caller is responsible to ensure that the skb provides enough
* tailroom for the attribute header and payload.
*/
struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
{
struct nlattr *nla;
nla = skb_put(skb, nla_total_size(attrlen));
nla->nla_type = attrtype;
nla->nla_len = nla_attr_size(attrlen);
memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen));
return nla;
}
EXPORT_SYMBOL(__nla_reserve);
/**
* __nla_reserve_64bit - reserve room for attribute on the skb and align it
* @skb: socket buffer to reserve room on
* @attrtype: attribute type
* @attrlen: length of attribute payload
* @padattr: attribute type for the padding
*
* Adds a netlink attribute header to a socket buffer and reserves
* room for the payload but does not copy it. It also ensure that this
* attribute will have a 64-bit aligned nla_data() area.
*
* The caller is responsible to ensure that the skb provides enough
* tailroom for the attribute header and payload.
*/
struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
int attrlen, int padattr)
{
nla_align_64bit(skb, padattr);
return __nla_reserve(skb, attrtype, attrlen);
}
EXPORT_SYMBOL(__nla_reserve_64bit);
/**
* __nla_reserve_nohdr - reserve room for attribute without header
* @skb: socket buffer to reserve room on
* @attrlen: length of attribute payload
*
* Reserves room for attribute payload without a header.
*
* The caller is responsible to ensure that the skb provides enough
* tailroom for the payload.
*/
void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
{
return skb_put_zero(skb, NLA_ALIGN(attrlen));
}
EXPORT_SYMBOL(__nla_reserve_nohdr);
/**
* nla_reserve - reserve room for attribute on the skb
* @skb: socket buffer to reserve room on
* @attrtype: attribute type
* @attrlen: length of attribute payload
*
* Adds a netlink attribute header to a socket buffer and reserves
* room for the payload but does not copy it.
*
* Returns NULL if the tailroom of the skb is insufficient to store
* the attribute header and payload.
*/
struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
{
if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen)))
return NULL;
return __nla_reserve(skb, attrtype, attrlen);
}
EXPORT_SYMBOL(nla_reserve);
/**
* nla_reserve_64bit - reserve room for attribute on the skb and align it
* @skb: socket buffer to reserve room on
* @attrtype: attribute type
* @attrlen: length of attribute payload
* @padattr: attribute type for the padding
*
* Adds a netlink attribute header to a socket buffer and reserves
* room for the payload but does not copy it. It also ensure that this
* attribute will have a 64-bit aligned nla_data() area.
*
* Returns NULL if the tailroom of the skb is insufficient to store
* the attribute header and payload.
*/
struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen,
int padattr)
{
size_t len;
if (nla_need_padding_for_64bit(skb))
len = nla_total_size_64bit(attrlen);
else
len = nla_total_size(attrlen);
if (unlikely(skb_tailroom(skb) < len))
return NULL;
return __nla_reserve_64bit(skb, attrtype, attrlen, padattr);
}
EXPORT_SYMBOL(nla_reserve_64bit);
/**
* nla_reserve_nohdr - reserve room for attribute without header
* @skb: socket buffer to reserve room on
* @attrlen: length of attribute payload
*
* Reserves room for attribute payload without a header.
*
* Returns NULL if the tailroom of the skb is insufficient to store
* the attribute payload.
*/
void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
{
if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen)))
return NULL;
return __nla_reserve_nohdr(skb, attrlen);
}
EXPORT_SYMBOL(nla_reserve_nohdr);
/**
* __nla_put - Add a netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @attrlen: length of attribute payload
* @data: head of attribute payload
*
* The caller is responsible to ensure that the skb provides enough
* tailroom for the attribute header and payload.
*/
void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
const void *data)
{
struct nlattr *nla;
nla = __nla_reserve(skb, attrtype, attrlen);
memcpy(nla_data(nla), data, attrlen);
}
EXPORT_SYMBOL(__nla_put);
/**
* __nla_put_64bit - Add a netlink attribute to a socket buffer and align it
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @attrlen: length of attribute payload
* @data: head of attribute payload
* @padattr: attribute type for the padding
*
* The caller is responsible to ensure that the skb provides enough
* tailroom for the attribute header and payload.
*/
void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
const void *data, int padattr)
{
struct nlattr *nla;
nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr);
memcpy(nla_data(nla), data, attrlen);
}
EXPORT_SYMBOL(__nla_put_64bit);
/**
* __nla_put_nohdr - Add a netlink attribute without header
* @skb: socket buffer to add attribute to
* @attrlen: length of attribute payload
* @data: head of attribute payload
*
* The caller is responsible to ensure that the skb provides enough
* tailroom for the attribute payload.
*/
void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
{
void *start;
start = __nla_reserve_nohdr(skb, attrlen);
memcpy(start, data, attrlen);
}
EXPORT_SYMBOL(__nla_put_nohdr);
/**
* nla_put - Add a netlink attribute to a socket buffer
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @attrlen: length of attribute payload
* @data: head of attribute payload
*
* Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
* the attribute header and payload.
*/
int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
{
if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen)))
return -EMSGSIZE;
__nla_put(skb, attrtype, attrlen, data); return 0;
}
EXPORT_SYMBOL(nla_put);
/**
* nla_put_64bit - Add a netlink attribute to a socket buffer and align it
* @skb: socket buffer to add attribute to
* @attrtype: attribute type
* @attrlen: length of attribute payload
* @data: head of attribute payload
* @padattr: attribute type for the padding
*
* Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
* the attribute header and payload.
*/
int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
const void *data, int padattr)
{
size_t len;
if (nla_need_padding_for_64bit(skb))
len = nla_total_size_64bit(attrlen);
else
len = nla_total_size(attrlen);
if (unlikely(skb_tailroom(skb) < len))
return -EMSGSIZE;
__nla_put_64bit(skb, attrtype, attrlen, data, padattr);
return 0;
}
EXPORT_SYMBOL(nla_put_64bit);
/**
* nla_put_nohdr - Add a netlink attribute without header
* @skb: socket buffer to add attribute to
* @attrlen: length of attribute payload
* @data: head of attribute payload
*
* Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
* the attribute payload.
*/
int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
{
if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen)))
return -EMSGSIZE;
__nla_put_nohdr(skb, attrlen, data);
return 0;
}
EXPORT_SYMBOL(nla_put_nohdr);
/**
* nla_append - Add a netlink attribute without header or padding
* @skb: socket buffer to add attribute to
* @attrlen: length of attribute payload
* @data: head of attribute payload
*
* Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
* the attribute payload.
*/
int nla_append(struct sk_buff *skb, int attrlen, const void *data)
{
if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen)))
return -EMSGSIZE;
skb_put_data(skb, data, attrlen);
return 0;
}
EXPORT_SYMBOL(nla_append);
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BIT_SPINLOCK_H
#define __LINUX_BIT_SPINLOCK_H
#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/atomic.h>
#include <linux/bug.h>
/*
* bit-based spin_lock()
*
* Don't use this unless you really need to: spin_lock() and spin_unlock()
* are significantly faster.
*/
static inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
/*
* Assuming the lock is uncontended, this never enters
* the body of the outer loop. If it is contended, then
* within the inner loop a non-atomic test is used to
* busywait with less bus contention for a good time to
* attempt to acquire the lock bit.
*/
preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
preempt_enable();
do {
cpu_relax();
} while (test_bit(bitnum, addr));
preempt_disable();
}
#endif
__acquire(bitlock);
}
/*
* Return true if it was acquired
*/
static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
{
preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
if (unlikely(test_and_set_bit_lock(bitnum, addr))) {
preempt_enable();
return 0;
}
#endif
__acquire(bitlock);
return 1;
}
/*
* bit-based spin_unlock()
*/
static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
clear_bit_unlock(bitnum, addr);
#endif
preempt_enable();
__release(bitlock);
}
/*
* bit-based spin_unlock()
* non-atomic version, which can be used eg. if the bit lock itself is
* protecting the rest of the flags in the word.
*/
static inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
__clear_bit_unlock(bitnum, addr);
#endif
preempt_enable();
__release(bitlock);
}
/*
* Return true if the lock is held.
*/
static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
return test_bit(bitnum, addr);
#elif defined CONFIG_PREEMPT_COUNT
return preempt_count();
#else
return 1;
#endif
}
#endif /* __LINUX_BIT_SPINLOCK_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H
struct btrfs_space_info {
spinlock_t lock;
u64 total_bytes; /* total bytes in the space,
this doesn't take mirrors into account */
u64 bytes_used; /* total bytes used,
this doesn't take mirrors into account */
u64 bytes_pinned; /* total bytes pinned, will be freed when the
transaction finishes */
u64 bytes_reserved; /* total bytes the allocator has reserved for
current allocations */
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
u64 bytes_zone_unusable; /* total bytes that are unusable until
resetting the device zone */
u64 max_extent_size; /* This will hold the maximum extent size of
the space info if we had an ENOSPC in the
allocator. */
int clamp; /* Used to scale our threshold for preemptive
flushing. The value is >> clamp, so turns
out to be a 2^clamp divisor. */
unsigned int full:1; /* indicates that we cannot allocate any more
chunks for this space */
unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
unsigned int flush:1; /* set if we are trying to make space */
unsigned int force_alloc; /* set if we need to force a chunk
alloc for this space */
u64 disk_used; /* total bytes used on disk */
u64 disk_total; /* total bytes on disk, takes mirrors into
account */
u64 flags;
struct list_head list;
/* Protected by the spinlock 'lock'. */
struct list_head ro_bgs;
struct list_head priority_tickets;
struct list_head tickets;
/*
* Size of space that needs to be reclaimed in order to satisfy pending
* tickets
*/
u64 reclaim_size;
/*
* tickets_id just indicates the next ticket will be handled, so note
* it's not stored per ticket.
*/
u64 tickets_id;
struct rw_semaphore groups_sem;
/* for block groups in our same type */
struct list_head block_groups[BTRFS_NR_RAID_TYPES];
struct kobject kobj;
struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
};
struct reserve_ticket {
u64 bytes;
int error;
bool steal;
struct list_head list;
wait_queue_head_t wait;
};
static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
{
return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
}
/*
*
* Declare a helper function to detect underflow of various space info members
*/
#define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \
static inline void \
btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \
struct btrfs_space_info *sinfo, \
s64 bytes) \
{ \
const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \
lockdep_assert_held(&sinfo->lock); \
trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \
trace_btrfs_space_reservation(fs_info, trace_name, \
sinfo->flags, abs_bytes, \
bytes > 0); \
if (bytes < 0 && sinfo->name < -bytes) { \
WARN_ON(1); \
sinfo->name = 0; \
return; \
} \
sinfo->name += bytes; \
}
DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
u64 bytes_readonly, u64 bytes_zone_unusable,
struct btrfs_space_info **space_info);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
bool may_use_included);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush);
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info);
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
static inline void btrfs_space_info_free_bytes_may_use(
struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 num_bytes)
{
spin_lock(&space_info->lock);
btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
#endif /* BTRFS_SPACE_INFO_H */
// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/file.c
*
* Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
*
* Manage the dynamic fd arrays in the process files_struct.
*/
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <net/sock.h>
#include "internal.h"
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
unsigned int sysctl_nr_open_max =
__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
static void __free_fdtable(struct fdtable *fdt)
{
kvfree(fdt->fd);
kvfree(fdt->open_fds);
kfree(fdt);
}
static void free_fdtable_rcu(struct rcu_head *rcu)
{
__free_fdtable(container_of(rcu, struct fdtable, rcu));
}
#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long))
/*
* Copy 'count' fd bits from the old table to the new table and clear the extra
* space if any. This does not copy the file pointers. Called with the files
* spinlock held for write.
*/
static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
unsigned int count)
{
unsigned int cpy, set;
cpy = count / BITS_PER_BYTE;
set = (nfdt->max_fds - count) / BITS_PER_BYTE;
memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
memset((char *)nfdt->open_fds + cpy, 0, set);
memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
memset((char *)nfdt->close_on_exec + cpy, 0, set);
cpy = BITBIT_SIZE(count);
set = BITBIT_SIZE(nfdt->max_fds) - cpy;
memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
memset((char *)nfdt->full_fds_bits + cpy, 0, set);
}
/*
* Copy all file descriptors from the old table to the new, expanded table and
* clear the extra space. Called with the files spinlock held for write.
*/
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
size_t cpy, set;
BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *);
set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
memcpy(nfdt->fd, ofdt->fd, cpy);
memset((char *)nfdt->fd + cpy, 0, set);
copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}
/*
* Note how the fdtable bitmap allocations very much have to be a multiple of
* BITS_PER_LONG. This is not only because we walk those things in chunks of
* 'unsigned long' in some places, but simply because that is how the Linux
* kernel bitmaps are defined to work: they are not "bits in an array of bytes",
* they are very much "bits in an array of unsigned long".
*
* The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
* by that "1024/sizeof(ptr)" before, we already know there are sufficient
* clear low bits. Clang seems to realize that, gcc ends up being confused.
*
* On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
* let's consider it documentation (and maybe a test-case for gcc to improve
* its code generation ;)
*/
static struct fdtable * alloc_fdtable(unsigned int nr)
{
struct fdtable *fdt;
void *data;
/*
* Figure out how many fds we actually want to support in this fdtable.
* Allocation steps are keyed to the size of the fdarray, since it
* grows far faster than any of the other dynamic data. We try to fit
* the fdarray into comfortable page-tuned chunks: starting at 1024B
* and growing in powers of two from there on.
*/
nr /= (1024 / sizeof(struct file *));
nr = roundup_pow_of_two(nr + 1);
nr *= (1024 / sizeof(struct file *));
nr = ALIGN(nr, BITS_PER_LONG);
/*
* Note that this can drive nr *below* what we had passed if sysctl_nr_open
* had been set lower between the check in expand_files() and here. Deal
* with that in caller, it's cheaper that way.
*
* We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
* bitmaps handling below becomes unpleasant, to put it mildly...
*/
if (unlikely(nr > sysctl_nr_open))
nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt)
goto out;
fdt->max_fds = nr;
data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
if (!data)
goto out_fdt;
fdt->fd = data;
data = kvmalloc(max_t(size_t,
2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
GFP_KERNEL_ACCOUNT);
if (!data)
goto out_arr;
fdt->open_fds = data;
data += nr / BITS_PER_BYTE;
fdt->close_on_exec = data;
data += nr / BITS_PER_BYTE;
fdt->full_fds_bits = data;
return fdt;
out_arr:
kvfree(fdt->fd);
out_fdt:
kfree(fdt);
out:
return NULL;
}
/*
* Expand the file descriptor table.
* This function will allocate a new fdtable and both fd array and fdset, of
* the given size.
* Return <0 error code on error; 1 on successful completion.
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_fdtable(struct files_struct *files, unsigned int nr)
__releases(files->file_lock)
__acquires(files->file_lock)
{
struct fdtable *new_fdt, *cur_fdt;
spin_unlock(&files->file_lock);
new_fdt = alloc_fdtable(nr);
/* make sure all fd_install() have seen resize_in_progress
* or have finished their rcu_read_lock_sched() section.
*/
if (atomic_read(&files->count) > 1)
synchronize_rcu();
spin_lock(&files->file_lock);
if (!new_fdt)
return -ENOMEM;
/*
* extremely unlikely race - sysctl_nr_open decreased between the check in
* caller and alloc_fdtable(). Cheaper to catch it here...
*/
if (unlikely(new_fdt->max_fds <= nr)) { __free_fdtable(new_fdt);
return -EMFILE;
}
cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds);
copy_fdtable(new_fdt, cur_fdt);
rcu_assign_pointer(files->fdt, new_fdt);
if (cur_fdt != &files->fdtab)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
/* coupled with smp_rmb() in fd_install() */
smp_wmb();
return 1;
}
/*
* Expand files.
* This function will expand the file structures, if the requested size exceeds
* the current capacity and there is room for expansion.
* Return <0 error code on error; 0 when nothing done; 1 when files were
* expanded and execution may have blocked.
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_files(struct files_struct *files, unsigned int nr)
__releases(files->file_lock)
__acquires(files->file_lock)
{
struct fdtable *fdt;
int expanded = 0;
repeat:
fdt = files_fdtable(files);
/* Do we need to expand? */
if (nr < fdt->max_fds)
return expanded;
/* Can we expand? */
if (nr >= sysctl_nr_open)
return -EMFILE;
if (unlikely(files->resize_in_progress)) {
spin_unlock(&files->file_lock);
expanded = 1;
wait_event(files->resize_wait, !files->resize_in_progress);
spin_lock(&files->file_lock);
goto repeat;
}
/* All good, so we try */
files->resize_in_progress = true;
expanded = expand_fdtable(files, nr);
files->resize_in_progress = false;
wake_up_all(&files->resize_wait);
return expanded;
}
static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
__set_bit(fd, fdt->close_on_exec);
}
static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
if (test_bit(fd, fdt->close_on_exec))
__clear_bit(fd, fdt->close_on_exec);
}
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{
__set_bit(fd, fdt->open_fds);
fd /= BITS_PER_LONG;
if (!~fdt->open_fds[fd])
__set_bit(fd, fdt->full_fds_bits);
}
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
__clear_bit(fd, fdt->open_fds);
__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
}
static unsigned int count_open_files(struct fdtable *fdt)
{
unsigned int size = fdt->max_fds;
unsigned int i;
/* Find the last open fd */
for (i = size / BITS_PER_LONG; i > 0; ) {
if (fdt->open_fds[--i])
break;
}
i = (i + 1) * BITS_PER_LONG;
return i;
}
/*
* Note that a sane fdtable size always has to be a multiple of
* BITS_PER_LONG, since we have bitmaps that are sized by this.
*
* 'max_fds' will normally already be properly aligned, but it
* turns out that in the close_range() -> __close_range() ->
* unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
* up having a 'max_fds' value that isn't already aligned.
*
* Rather than make close_range() have to worry about this,
* just make that BITS_PER_LONG alignment be part of a sane
* fdtable size. Becuase that's really what it is.
*/
static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
{
unsigned int count;
count = count_open_files(fdt);
if (max_fds < NR_OPEN_DEFAULT)
max_fds = NR_OPEN_DEFAULT;
return ALIGN(min(count, max_fds), BITS_PER_LONG);
}
/*
* Allocate a new files structure and copy contents from the
* passed in files structure.
* errorp will be valid only when the returned files_struct is NULL.
*/
struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
{
struct files_struct *newf;
struct file **old_fds, **new_fds;
unsigned int open_files, i;
struct fdtable *old_fdt, *new_fdt;
*errorp = -ENOMEM;
newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
if (!newf)
goto out;
atomic_set(&newf->count, 1);
spin_lock_init(&newf->file_lock);
newf->resize_in_progress = false;
init_waitqueue_head(&newf->resize_wait);
newf->next_fd = 0;
new_fdt = &newf->fdtab;
new_fdt->max_fds = NR_OPEN_DEFAULT;
new_fdt->close_on_exec = newf->close_on_exec_init;
new_fdt->open_fds = newf->open_fds_init;
new_fdt->full_fds_bits = newf->full_fds_bits_init;
new_fdt->fd = &newf->fd_array[0];
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
open_files = sane_fdtable_size(old_fdt, max_fds);
/*
* Check whether we need to allocate a larger fd array and fd set.
*/
while (unlikely(open_files > new_fdt->max_fds)) {
spin_unlock(&oldf->file_lock);
if (new_fdt != &newf->fdtab)
__free_fdtable(new_fdt);
new_fdt = alloc_fdtable(open_files - 1);
if (!new_fdt) {
*errorp = -ENOMEM;
goto out_release;
}
/* beyond sysctl_nr_open; nothing to do */
if (unlikely(new_fdt->max_fds < open_files)) {
__free_fdtable(new_fdt);
*errorp = -EMFILE;
goto out_release;
}
/*
* Reacquire the oldf lock and a pointer to its fd table
* who knows it may have a new bigger fd table. We need
* the latest pointer.
*/
spin_lock(&oldf->file_lock);
old_fdt = files_fdtable(oldf);
open_files = sane_fdtable_size(old_fdt, max_fds);
}
copy_fd_bitmaps(new_fdt, old_fdt, open_files);
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
if (f) {
get_file(f);
} else {
/*
* The fd may be claimed in the fd bitmap but not yet
* instantiated in the files array if a sibling thread
* is partway through open(). So make sure that this
* fd is available to the new process.
*/
__clear_open_fd(open_files - i, new_fdt);
}
rcu_assign_pointer(*new_fds++, f);
}
spin_unlock(&oldf->file_lock);
/* clear the remainder */
memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
rcu_assign_pointer(newf->fdt, new_fdt);
return newf;
out_release:
kmem_cache_free(files_cachep, newf);
out:
return NULL;
}
static struct fdtable *close_files(struct files_struct * files)
{
/*
* It is safe to dereference the fd table without RCU or
* ->file_lock because this is the last reference to the
* files structure.
*/
struct fdtable *fdt = rcu_dereference_raw(files->fdt);
unsigned int i, j = 0;
for (;;) {
unsigned long set;
i = j * BITS_PER_LONG;
if (i >= fdt->max_fds)
break;
set = fdt->open_fds[j++];
while (set) {
if (set & 1) {
struct file * file = xchg(&fdt->fd[i], NULL);
if (file) {
filp_close(file, files);
cond_resched();
}
}
i++;
set >>= 1;
}
}
return fdt;
}
void put_files_struct(struct files_struct *files)
{
if (atomic_dec_and_test(&files->count)) {
struct fdtable *fdt = close_files(files);
/* free the arrays if they are not embedded */
if (fdt != &files->fdtab)
__free_fdtable(fdt);
kmem_cache_free(files_cachep, files);
}
}
void exit_files(struct task_struct *tsk)
{
struct files_struct * files = tsk->files;
if (files) {
task_lock(tsk);
tsk->files = NULL;
task_unlock(tsk);
put_files_struct(files);
}
}
struct files_struct init_files = {
.count = ATOMIC_INIT(1),
.fdt = &init_files.fdtab,
.fdtab = {
.max_fds = NR_OPEN_DEFAULT,
.fd = &init_files.fd_array[0],
.close_on_exec = init_files.close_on_exec_init,
.open_fds = init_files.open_fds_init,
.full_fds_bits = init_files.full_fds_bits_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
.resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
unsigned int maxfd = fdt->max_fds;
unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
if (bitbit > maxfd)
return maxfd;
if (bitbit > start)
start = bitbit;
return find_next_zero_bit(fdt->open_fds, maxfd, start);
}
/*
* allocate a file descriptor, mark it busy.
*/
static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
struct files_struct *files = current->files;
unsigned int fd;
int error;
struct fdtable *fdt;
spin_lock(&files->file_lock);
repeat:
fdt = files_fdtable(files);
fd = start;
if (fd < files->next_fd)
fd = files->next_fd;
if (fd < fdt->max_fds)
fd = find_next_fd(fdt, fd);
/*
* N.B. For clone tasks sharing a files structure, this test
* will limit the total number of files that can be opened.
*/
error = -EMFILE;
if (fd >= end)
goto out;
error = expand_files(files, fd);
if (error < 0)
goto out;
/*
* If we needed to expand the fs array we
* might have blocked - try again.
*/
if (error)
goto repeat;
if (start <= files->next_fd) files->next_fd = fd + 1;
__set_open_fd(fd, fdt);
if (flags & O_CLOEXEC)
__set_close_on_exec(fd, fdt);
else
__clear_close_on_exec(fd, fdt);
error = fd;
#if 1
/* Sanity check */
if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
rcu_assign_pointer(fdt->fd[fd], NULL);
}
#endif
out:
spin_unlock(&files->file_lock);
return error;
}
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
return alloc_fd(0, nofile, flags);
}
int get_unused_fd_flags(unsigned flags)
{
return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
}
EXPORT_SYMBOL(get_unused_fd_flags);
static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
struct fdtable *fdt = files_fdtable(files);
__clear_open_fd(fd, fdt);
if (fd < files->next_fd)
files->next_fd = fd;
}
void put_unused_fd(unsigned int fd)
{
struct files_struct *files = current->files;
spin_lock(&files->file_lock);
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
}
EXPORT_SYMBOL(put_unused_fd);
/*
* Install a file pointer in the fd array.
*
* The VFS is full of places where we drop the files lock between
* setting the open_fds bitmap and installing the file in the file
* array. At any such point, we are vulnerable to a dup2() race
* installing a file in the array before us. We need to detect this and
* fput() the struct file we are about to overwrite in this case.
*
* It should never happen - if we allow dup2() do it, _really_ bad things
* will follow.
*
* This consumes the "file" refcount, so callers should treat it
* as if they had called fput(file).
*/
void fd_install(unsigned int fd, struct file *file)
{
struct files_struct *files = current->files;
struct fdtable *fdt;
rcu_read_lock_sched();
if (unlikely(files->resize_in_progress)) {
rcu_read_unlock_sched();
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
return;
}
/* coupled with smp_wmb() in expand_fdtable() */
smp_rmb();
fdt = rcu_dereference_sched(files->fdt);
BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file);
rcu_read_unlock_sched();
}
EXPORT_SYMBOL(fd_install);
/**
* pick_file - return file associatd with fd
* @files: file struct to retrieve file from
* @fd: file descriptor to retrieve file for
*
* If this functions returns an EINVAL error pointer the fd was beyond the
* current maximum number of file descriptors for that fdtable.
*
* Returns: The file associated with @fd, on error returns an error pointer.
*/
static struct file *pick_file(struct files_struct *files, unsigned fd)
{
struct file *file;
struct fdtable *fdt;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (fd >= fdt->max_fds) {
file = ERR_PTR(-EINVAL);
goto out_unlock;
}
file = fdt->fd[fd];
if (!file) {
file = ERR_PTR(-EBADF);
goto out_unlock;
}
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
out_unlock:
spin_unlock(&files->file_lock);
return file;
}
int close_fd(unsigned fd)
{
struct files_struct *files = current->files;
struct file *file;
file = pick_file(files, fd);
if (IS_ERR(file))
return -EBADF;
return filp_close(file, files);}
EXPORT_SYMBOL(close_fd); /* for ksys_close() */
/**
* last_fd - return last valid index into fd table
* @cur_fds: files struct
*
* Context: Either rcu read lock or files_lock must be held.
*
* Returns: Last valid index into fdtable.
*/
static inline unsigned last_fd(struct fdtable *fdt)
{
return fdt->max_fds - 1;
}
static inline void __range_cloexec(struct files_struct *cur_fds,
unsigned int fd, unsigned int max_fd)
{
struct fdtable *fdt;
/* make sure we're using the correct maximum value */
spin_lock(&cur_fds->file_lock);
fdt = files_fdtable(cur_fds);
max_fd = min(last_fd(fdt), max_fd);
if (fd <= max_fd)
bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
spin_unlock(&cur_fds->file_lock);
}
static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
unsigned int max_fd)
{
while (fd <= max_fd) {
struct file *file;
file = pick_file(cur_fds, fd++);
if (!IS_ERR(file)) {
/* found a valid file to close */
filp_close(file, cur_fds);
cond_resched();
continue;
}
/* beyond the last fd in that table */
if (PTR_ERR(file) == -EINVAL)
return;
}
}
/**
* __close_range() - Close all file descriptors in a given range.
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
*
* This closes a range of file descriptors. All file descriptors
* from @fd up to and including @max_fd are closed.
*/
int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
{
struct task_struct *me = current;
struct files_struct *cur_fds = me->files, *fds = NULL;
if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
return -EINVAL;
if (fd > max_fd)
return -EINVAL;
if (flags & CLOSE_RANGE_UNSHARE) {
int ret;
unsigned int max_unshare_fds = NR_OPEN_MAX;
/*
* If the caller requested all fds to be made cloexec we always
* copy all of the file descriptors since they still want to
* use them.
*/
if (!(flags & CLOSE_RANGE_CLOEXEC)) {
/*
* If the requested range is greater than the current
* maximum, we're closing everything so only copy all
* file descriptors beneath the lowest file descriptor.
*/
rcu_read_lock();
if (max_fd >= last_fd(files_fdtable(cur_fds)))
max_unshare_fds = fd;
rcu_read_unlock();
}
ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
if (ret)
return ret;
/*
* We used to share our file descriptor table, and have now
* created a private one, make sure we're using it below.
*/
if (fds)
swap(cur_fds, fds);
}
if (flags & CLOSE_RANGE_CLOEXEC)
__range_cloexec(cur_fds, fd, max_fd);
else
__range_close(cur_fds, fd, max_fd);
if (fds) {
/*
* We're done closing the files we were supposed to. Time to install
* the new file descriptor table and drop the old one.
*/
task_lock(me);
me->files = cur_fds;
task_unlock(me);
put_files_struct(fds);
}
return 0;
}
/*
* See close_fd_get_file() below, this variant assumes current->files->file_lock
* is held.
*/
int __close_fd_get_file(unsigned int fd, struct file **res)
{
struct files_struct *files = current->files;
struct file *file;
struct fdtable *fdt;
fdt = files_fdtable(files);
if (fd >= fdt->max_fds)
goto out_err;
file = fdt->fd[fd];
if (!file)
goto out_err;
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
get_file(file);
*res = file;
return 0;
out_err:
*res = NULL;
return -ENOENT;
}
/*
* variant of close_fd that gets a ref on the file for later fput.
* The caller must ensure that filp_close() called on the file, and then
* an fput().
*/
int close_fd_get_file(unsigned int fd, struct file **res)
{
struct files_struct *files = current->files;
int ret;
spin_lock(&files->file_lock);
ret = __close_fd_get_file(fd, res);
spin_unlock(&files->file_lock);
return ret;
}
void do_close_on_exec(struct files_struct *files)
{
unsigned i;
struct fdtable *fdt;
/* exec unshares first */
spin_lock(&files->file_lock);
for (i = 0; ; i++) {
unsigned long set;
unsigned fd = i * BITS_PER_LONG;
fdt = files_fdtable(files);
if (fd >= fdt->max_fds)
break;
set = fdt->close_on_exec[i];
if (!set)
continue;
fdt->close_on_exec[i] = 0;
for ( ; set ; fd++, set >>= 1) {
struct file *file;
if (!(set & 1))
continue;
file = fdt->fd[fd];
if (!file)
continue;
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
filp_close(file, files);
cond_resched();
spin_lock(&files->file_lock);
}
}
spin_unlock(&files->file_lock);
}
static inline struct file *__fget_files_rcu(struct files_struct *files,
unsigned int fd, fmode_t mask, unsigned int refs)
{
for (;;) {
struct file *file;
struct fdtable *fdt = rcu_dereference_raw(files->fdt);
struct file __rcu **fdentry;
if (unlikely(fd >= fdt->max_fds))
return NULL;
fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
file = rcu_dereference_raw(*fdentry);
if (unlikely(!file))
return NULL;
if (unlikely(file->f_mode & mask))
return NULL;
/*
* Ok, we have a file pointer. However, because we do
* this all locklessly under RCU, we may be racing with
* that file being closed.
*
* Such a race can take two forms:
*
* (a) the file ref already went down to zero,
* and get_file_rcu_many() fails. Just try
* again:
*/
if (unlikely(!get_file_rcu_many(file, refs)))
continue;
/*
* (b) the file table entry has changed under us.
* Note that we don't need to re-check the 'fdt->fd'
* pointer having changed, because it always goes
* hand-in-hand with 'fdt'.
*
* If so, we need to put our refs and try again.
*/
if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || unlikely(rcu_dereference_raw(*fdentry) != file)) { fput_many(file, refs);
continue;
}
/*
* Ok, we have a ref to the file, and checked that it
* still exists.
*/
return file;
}
}
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
fmode_t mask, unsigned int refs)
{
struct file *file;
rcu_read_lock();
file = __fget_files_rcu(files, fd, mask, refs);
rcu_read_unlock();
return file;
}
static inline struct file *__fget(unsigned int fd, fmode_t mask,
unsigned int refs)
{
return __fget_files(current->files, fd, mask, refs);
}
struct file *fget_many(unsigned int fd, unsigned int refs)
{
return __fget(fd, FMODE_PATH, refs);
}
struct file *fget(unsigned int fd)
{
return __fget(fd, FMODE_PATH, 1);
}
EXPORT_SYMBOL(fget);
struct file *fget_raw(unsigned int fd)
{
return __fget(fd, 0, 1);
}
EXPORT_SYMBOL(fget_raw);
struct file *fget_task(struct task_struct *task, unsigned int fd)
{
struct file *file = NULL;
task_lock(task);
if (task->files)
file = __fget_files(task->files, fd, 0, 1);
task_unlock(task);
return file;
}
struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
struct file *file = NULL;
task_lock(task);
files = task->files;
if (files)
file = files_lookup_fd_rcu(files, fd);
task_unlock(task);
return file;
}
struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
unsigned int fd = *ret_fd;
struct file *file = NULL;
task_lock(task);
files = task->files;
if (files) {
for (; fd < files_fdtable(files)->max_fds; fd++) {
file = files_lookup_fd_rcu(files, fd);
if (file)
break;
}
}
task_unlock(task);
*ret_fd = fd;
return file;
}
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
*
* You can use this instead of fget if you satisfy all of the following
* conditions:
* 1) You must call fput_light before exiting the syscall and returning control
* to userspace (i.e. you cannot remember the returned struct file * after
* returning to userspace).
* 2) You must not call filp_close on the returned struct file * in between
* calls to fget_light and fput_light.
* 3) You must not clone the current task in between the calls to fget_light
* and fput_light.
*
* The fput_needed flag returned by fget_light should be passed to the
* corresponding fput_light.
*/
static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
struct files_struct *files = current->files;
struct file *file;
if (atomic_read(&files->count) == 1) {
file = files_lookup_fd_raw(files, fd);
if (!file || unlikely(file->f_mode & mask)) return 0; return (unsigned long)file;
} else {
file = __fget(fd, mask, 1);
if (!file)
return 0;
return FDPUT_FPUT | (unsigned long)file;
}
}
unsigned long __fdget(unsigned int fd)
{
return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(__fdget);
unsigned long __fdget_raw(unsigned int fd)
{
return __fget_light(fd, 0);
}
unsigned long __fdget_pos(unsigned int fd)
{
unsigned long v = __fdget(fd);
struct file *file = (struct file *)(v & ~3);
if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
if (file_count(file) > 1) { v |= FDPUT_POS_UNLOCK;
mutex_lock(&file->f_pos_lock);
}
}
return v;
}
void __f_unlock_pos(struct file *f)
{
mutex_unlock(&f->f_pos_lock);
}
/*
* We only lock f_pos if we have threads or if the file might be
* shared with another process. In both cases we'll have an elevated
* file count (done either by fdget() or by fork()).
*/
void set_close_on_exec(unsigned int fd, int flag)
{
struct files_struct *files = current->files;
struct fdtable *fdt;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (flag)
__set_close_on_exec(fd, fdt);
else
__clear_close_on_exec(fd, fdt);
spin_unlock(&files->file_lock);
}
bool get_close_on_exec(unsigned int fd)
{
struct files_struct *files = current->files;
struct fdtable *fdt;
bool res;
rcu_read_lock();
fdt = files_fdtable(files);
res = close_on_exec(fd, fdt);
rcu_read_unlock();
return res;
}
static int do_dup2(struct files_struct *files,
struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
{
struct file *tofree;
struct fdtable *fdt;
/*
* We need to detect attempts to do dup2() over allocated but still
* not finished descriptor. NB: OpenBSD avoids that at the price of
* extra work in their equivalent of fget() - they insert struct
* file immediately after grabbing descriptor, mark it larval if
* more work (e.g. actual opening) is needed and make sure that
* fget() treats larval files as absent. Potentially interesting,
* but while extra work in fget() is trivial, locking implications
* and amount of surgery on open()-related paths in VFS are not.
* FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
* deadlocks in rather amusing ways, AFAICS. All of that is out of
* scope of POSIX or SUS, since neither considers shared descriptor
* tables and this condition does not arise without those.
*/
fdt = files_fdtable(files);
tofree = fdt->fd[fd];
if (!tofree && fd_is_open(fd, fdt))
goto Ebusy;
get_file(file);
rcu_assign_pointer(fdt->fd[fd], file);
__set_open_fd(fd, fdt);
if (flags & O_CLOEXEC)
__set_close_on_exec(fd, fdt);
else
__clear_close_on_exec(fd, fdt);
spin_unlock(&files->file_lock);
if (tofree)
filp_close(tofree, files);
return fd;
Ebusy:
spin_unlock(&files->file_lock);
return -EBUSY;
}
int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
int err;
struct files_struct *files = current->files;
if (!file)
return close_fd(fd);
if (fd >= rlimit(RLIMIT_NOFILE))
return -EBADF;
spin_lock(&files->file_lock);
err = expand_files(files, fd);
if (unlikely(err < 0))
goto out_unlock;
return do_dup2(files, file, fd, flags);
out_unlock:
spin_unlock(&files->file_lock);
return err;
}
/**
* __receive_fd() - Install received file into file descriptor table
* @file: struct file that was received from another process
* @ufd: __user pointer to write new fd number to
* @o_flags: the O_* flags to apply to the new fd entry
*
* Installs a received file into the file descriptor table, with appropriate
* checks and count updates. Optionally writes the fd number to userspace, if
* @ufd is non-NULL.
*
* This helper handles its own reference counting of the incoming
* struct file.
*
* Returns newly install fd or -ve on error.
*/
int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
int new_fd;
int error;
error = security_file_receive(file);
if (error)
return error;
new_fd = get_unused_fd_flags(o_flags);
if (new_fd < 0)
return new_fd;
if (ufd) {
error = put_user(new_fd, ufd);
if (error) {
put_unused_fd(new_fd);
return error;
}
}
fd_install(new_fd, get_file(file));
__receive_sock(file);
return new_fd;
}
int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
int error;
error = security_file_receive(file);
if (error)
return error;
error = replace_fd(new_fd, file, o_flags);
if (error)
return error;
__receive_sock(file);
return new_fd;
}
int receive_fd(struct file *file, unsigned int o_flags)
{
return __receive_fd(file, NULL, o_flags);
}
EXPORT_SYMBOL_GPL(receive_fd);
static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
int err = -EBADF;
struct file *file;
struct files_struct *files = current->files;
if ((flags & ~O_CLOEXEC) != 0)
return -EINVAL;
if (unlikely(oldfd == newfd))
return -EINVAL;
if (newfd >= rlimit(RLIMIT_NOFILE))
return -EBADF;
spin_lock(&files->file_lock);
err = expand_files(files, newfd);
file = files_lookup_fd_locked(files, oldfd);
if (unlikely(!file))
goto Ebadf;
if (unlikely(err < 0)) {
if (err == -EMFILE)
goto Ebadf;
goto out_unlock;
}
return do_dup2(files, file, newfd, flags);
Ebadf:
err = -EBADF;
out_unlock:
spin_unlock(&files->file_lock);
return err;
}
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
return ksys_dup3(oldfd, newfd, flags);
}
SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
if (unlikely(newfd == oldfd)) { /* corner case */
struct files_struct *files = current->files;
int retval = oldfd;
rcu_read_lock();
if (!files_lookup_fd_rcu(files, oldfd))
retval = -EBADF;
rcu_read_unlock();
return retval;
}
return ksys_dup3(oldfd, newfd, 0);
}
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
int ret = -EBADF;
struct file *file = fget_raw(fildes);
if (file) {
ret = get_unused_fd_flags(0);
if (ret >= 0)
fd_install(ret, file);
else
fput(file);
}
return ret;
}
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
unsigned long nofile = rlimit(RLIMIT_NOFILE);
int err;
if (from >= nofile)
return -EINVAL;
err = alloc_fd(from, nofile, flags);
if (err >= 0) {
get_file(file);
fd_install(err, file);
}
return err;
}
int iterate_fd(struct files_struct *files, unsigned n,
int (*f)(const void *, struct file *, unsigned),
const void *p)
{
struct fdtable *fdt;
int res = 0;
if (!files)
return 0;
spin_lock(&files->file_lock);
for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
struct file *file;
file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
if (!file)
continue;
res = f(p, file, n);
if (res)
break;
}
spin_unlock(&files->file_lock);
return res;
}
EXPORT_SYMBOL(iterate_fd);
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* This file contains the interface functions for the various time related
* system calls: time, stime, gettimeofday, settimeofday, adjtime
*
* Modification history:
*
* 1993-09-02 Philip Gladstone
* Created file with time related functions from sched/core.c and adjtimex()
* 1993-10-08 Torsten Duwe
* adjtime interface update and CMOS clock write code
* 1995-08-13 Torsten Duwe
* kernel PLL updated to 1994-12-13 specs (rfc-1589)
* 1999-01-16 Ulrich Windl
* Introduced error checking for many cases in adjtimex().
* Updated NTP code according to technical memorandum Jan '96
* "A Kernel Model for Precision Timekeeping" by Dave Mills
* Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
* (Even though the technical memorandum forbids it)
* 2004-07-14 Christoph Lameter
* Added getnstimeofday to allow the posix timer functions to return
* with nanosecond accuracy
*/
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/timex.h>
#include <linux/capability.h>
#include <linux/timekeeper_internal.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/fs.h>
#include <linux/math64.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
#include <asm/unistd.h>
#include <generated/timeconst.h>
#include "timekeeping.h"
/*
* The timezone where the local system is located. Used as a default by some
* programs who obtain this value by using gettimeofday.
*/
struct timezone sys_tz;
EXPORT_SYMBOL(sys_tz);
#ifdef __ARCH_WANT_SYS_TIME
/*
* sys_time() can be implemented in user-level using
* sys_gettimeofday(). Is this for backwards compatibility? If so,
* why not move it into the appropriate arch directory (for those
* architectures that need it).
*/
SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc)
{
__kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds();
if (tloc) {
if (put_user(i,tloc))
return -EFAULT;
}
force_successful_syscall_return();
return i;
}
/*
* sys_stime() can be implemented in user-level using
* sys_settimeofday(). Is this for backwards compatibility? If so,
* why not move it into the appropriate arch directory (for those
* architectures that need it).
*/
SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr)
{
struct timespec64 tv;
int err;
if (get_user(tv.tv_sec, tptr))
return -EFAULT;
tv.tv_nsec = 0;
err = security_settime64(&tv, NULL);
if (err)
return err;
do_settimeofday64(&tv);
return 0;
}
#endif /* __ARCH_WANT_SYS_TIME */
#ifdef CONFIG_COMPAT_32BIT_TIME
#ifdef __ARCH_WANT_SYS_TIME32
/* old_time32_t is a 32 bit "long" and needs to get converted. */
SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc)
{
old_time32_t i;
i = (old_time32_t)ktime_get_real_seconds();
if (tloc) {
if (put_user(i,tloc))
return -EFAULT;
}
force_successful_syscall_return();
return i;
}
SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr)
{
struct timespec64 tv;
int err;
if (get_user(tv.tv_sec, tptr))
return -EFAULT;
tv.tv_nsec = 0;
err = security_settime64(&tv, NULL);
if (err)
return err;
do_settimeofday64(&tv);
return 0;
}
#endif /* __ARCH_WANT_SYS_TIME32 */
#endif
SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv,
struct timezone __user *, tz)
{
if (likely(tv != NULL)) {
struct timespec64 ts;
ktime_get_real_ts64(&ts);
if (put_user(ts.tv_sec, &tv->tv_sec) ||
put_user(ts.tv_nsec / 1000, &tv->tv_usec))
return -EFAULT;
}
if (unlikely(tz != NULL)) {
if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
return -EFAULT;
}
return 0;
}
/*
* In case for some reason the CMOS clock has not already been running
* in UTC, but in some local time: The first time we set the timezone,
* we will warp the clock so that it is ticking UTC time instead of
* local time. Presumably, if someone is setting the timezone then we
* are running in an environment where the programs understand about
* timezones. This should be done at boot time in the /etc/rc script,
* as soon as possible, so that the clock can be set right. Otherwise,
* various programs will get confused when the clock gets warped.
*/
int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz)
{
static int firsttime = 1;
int error = 0;
if (tv && !timespec64_valid_settod(tv))
return -EINVAL;
error = security_settime64(tv, tz);
if (error)
return error;
if (tz) {
/* Verify we're within the +-15 hrs range */
if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60)
return -EINVAL;
sys_tz = *tz;
update_vsyscall_tz();
if (firsttime) {
firsttime = 0;
if (!tv)
timekeeping_warp_clock();
}
}
if (tv)
return do_settimeofday64(tv);
return 0;
}
SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv,
struct timezone __user *, tz)
{
struct timespec64 new_ts;
struct timezone new_tz;
if (tv) {
if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
get_user(new_ts.tv_nsec, &tv->tv_usec))
return -EFAULT;
if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0)
return -EINVAL;
new_ts.tv_nsec *= NSEC_PER_USEC;
}
if (tz) {
if (copy_from_user(&new_tz, tz, sizeof(*tz)))
return -EFAULT;
}
return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv,
struct timezone __user *, tz)
{
if (tv) {
struct timespec64 ts;
ktime_get_real_ts64(&ts);
if (put_user(ts.tv_sec, &tv->tv_sec) ||
put_user(ts.tv_nsec / 1000, &tv->tv_usec))
return -EFAULT;
}
if (tz) {
if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
return -EFAULT;
}
return 0;
}
COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
struct timezone __user *, tz)
{
struct timespec64 new_ts;
struct timezone new_tz;
if (tv) {
if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
get_user(new_ts.tv_nsec, &tv->tv_usec))
return -EFAULT;
if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0)
return -EINVAL;
new_ts.tv_nsec *= NSEC_PER_USEC;
}
if (tz) {
if (copy_from_user(&new_tz, tz, sizeof(*tz)))
return -EFAULT;
}
return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
#endif
#ifdef CONFIG_64BIT
SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p)
{
struct __kernel_timex txc; /* Local copy of parameter */
int ret;
/* Copy the user data space into the kernel copy
* structure. But bear in mind that the structures
* may change
*/
if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex)))
return -EFAULT;
ret = do_adjtimex(&txc);
return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? -EFAULT : ret;
}
#endif
#ifdef CONFIG_COMPAT_32BIT_TIME
int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp)
{
struct old_timex32 tx32;
memset(txc, 0, sizeof(struct __kernel_timex));
if (copy_from_user(&tx32, utp, sizeof(struct old_timex32)))
return -EFAULT;
txc->modes = tx32.modes;
txc->offset = tx32.offset;
txc->freq = tx32.freq;
txc->maxerror = tx32.maxerror;
txc->esterror = tx32.esterror;
txc->status = tx32.status;
txc->constant = tx32.constant;
txc->precision = tx32.precision;
txc->tolerance = tx32.tolerance;
txc->time.tv_sec = tx32.time.tv_sec;
txc->time.tv_usec = tx32.time.tv_usec;
txc->tick = tx32.tick;
txc->ppsfreq = tx32.ppsfreq;
txc->jitter = tx32.jitter;
txc->shift = tx32.shift;
txc->stabil = tx32.stabil;
txc->jitcnt = tx32.jitcnt;
txc->calcnt = tx32.calcnt;
txc->errcnt = tx32.errcnt;
txc->stbcnt = tx32.stbcnt;
return 0;
}
int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc)
{
struct old_timex32 tx32;
memset(&tx32, 0, sizeof(struct old_timex32));
tx32.modes = txc->modes;
tx32.offset = txc->offset;
tx32.freq = txc->freq;
tx32.maxerror = txc->maxerror;
tx32.esterror = txc->esterror;
tx32.status = txc->status;
tx32.constant = txc->constant;
tx32.precision = txc->precision;
tx32.tolerance = txc->tolerance;
tx32.time.tv_sec = txc->time.tv_sec;
tx32.time.tv_usec = txc->time.tv_usec;
tx32.tick = txc->tick;
tx32.ppsfreq = txc->ppsfreq;
tx32.jitter = txc->jitter;
tx32.shift = txc->shift;
tx32.stabil = txc->stabil;
tx32.jitcnt = txc->jitcnt;
tx32.calcnt = txc->calcnt;
tx32.errcnt = txc->errcnt;
tx32.stbcnt = txc->stbcnt;
tx32.tai = txc->tai;
if (copy_to_user(utp, &tx32, sizeof(struct old_timex32)))
return -EFAULT;
return 0;
}
SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
{
struct __kernel_timex txc;
int err, ret;
err = get_old_timex32(&txc, utp);
if (err)
return err;
ret = do_adjtimex(&txc);
err = put_old_timex32(utp, &txc);
if (err)
return err;
return ret;
}
#endif
/*
* Convert jiffies to milliseconds and back.
*
* Avoid unnecessary multiplications/divisions in the
* two most common HZ cases:
*/
unsigned int jiffies_to_msecs(const unsigned long j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
return (MSEC_PER_SEC / HZ) * j;
#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >>
HZ_TO_MSEC_SHR32;
# else
return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_msecs);
unsigned int jiffies_to_usecs(const unsigned long j)
{
/*
* Hz usually doesn't go much further MSEC_PER_SEC.
* jiffies_to_usecs() and usecs_to_jiffies() depend on that.
*/
BUILD_BUG_ON(HZ > USEC_PER_SEC);
#if !(USEC_PER_SEC % HZ)
return (USEC_PER_SEC / HZ) * j;
#else
# if BITS_PER_LONG == 32
return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
# else
return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_usecs);
/*
* mktime64 - Converts date to seconds.
* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
* Assumes input in normal date format, i.e. 1980-12-31 23:59:59
* => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
*
* [For the Julian calendar (which was used in Russia before 1917,
* Britain & colonies before 1752, anywhere else before 1582,
* and is still in use by some communities) leave out the
* -year/100+year/400 terms, and add 10.]
*
* This algorithm was first published by Gauss (I think).
*
* A leap second can be indicated by calling this function with sec as
* 60 (allowable under ISO 8601). The leap second is treated the same
* as the following second since they don't exist in UNIX time.
*
* An encoding of midnight at the end of the day as 24:00:00 - ie. midnight
* tomorrow - (allowable under ISO 8601) is supported.
*/
time64_t mktime64(const unsigned int year0, const unsigned int mon0,
const unsigned int day, const unsigned int hour,
const unsigned int min, const unsigned int sec)
{
unsigned int mon = mon0, year = year0;
/* 1..12 -> 11,12,1..10 */
if (0 >= (int) (mon -= 2)) {
mon += 12; /* Puts Feb last since it has leap day */
year -= 1;
}
return ((((time64_t)
(year/4 - year/100 + year/400 + 367*mon/12 + day) +
year*365 - 719499
)*24 + hour /* now have hours - midnight tomorrow handled here */
)*60 + min /* now have minutes */
)*60 + sec; /* finally seconds */
}
EXPORT_SYMBOL(mktime64);
struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec)
{
struct timespec64 ts = ns_to_timespec64(nsec);
struct __kernel_old_timeval tv;
tv.tv_sec = ts.tv_sec;
tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000;
return tv;
}
EXPORT_SYMBOL(ns_to_kernel_old_timeval);
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
*
* @ts: pointer to timespec variable to be set
* @sec: seconds to set
* @nsec: nanoseconds to set
*
* Set seconds and nanoseconds field of a timespec variable and
* normalize to the timespec storage format
*
* Note: The tv_nsec part is always in the range of
* 0 <= tv_nsec < NSEC_PER_SEC
* For negative values only the tv_sec field is negative !
*/
void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
{
while (nsec >= NSEC_PER_SEC) {
/*
* The following asm() prevents the compiler from
* optimising this loop into a modulo operation. See
* also __iter_div_u64_rem() in include/linux/time.h
*/
asm("" : "+rm"(nsec));
nsec -= NSEC_PER_SEC;
++sec;
}
while (nsec < 0) {
asm("" : "+rm"(nsec));
nsec += NSEC_PER_SEC;
--sec;
}
ts->tv_sec = sec;
ts->tv_nsec = nsec;
}
EXPORT_SYMBOL(set_normalized_timespec64);
/**
* ns_to_timespec64 - Convert nanoseconds to timespec64
* @nsec: the nanoseconds value to be converted
*
* Returns the timespec64 representation of the nsec parameter.
*/
struct timespec64 ns_to_timespec64(const s64 nsec)
{
struct timespec64 ts = { 0, 0 };
s32 rem;
if (likely(nsec > 0)) {
ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
ts.tv_nsec = rem;
} else if (nsec < 0) {
/*
* With negative times, tv_sec points to the earlier
* second, and tv_nsec counts the nanoseconds since
* then, so tv_nsec is always a positive number.
*/
ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1;
ts.tv_nsec = NSEC_PER_SEC - rem - 1;
}
return ts;
}
EXPORT_SYMBOL(ns_to_timespec64);
/**
* msecs_to_jiffies: - convert milliseconds to jiffies
* @m: time in milliseconds
*
* conversion is done as follows:
*
* - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
*
* - 'too large' values [that would result in larger than
* MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
*
* - all other values are converted to jiffies by either multiplying
* the input value by a factor or dividing it with a factor and
* handling any 32-bit overflows.
* for the details see __msecs_to_jiffies()
*
* msecs_to_jiffies() checks for the passed in value being a constant
* via __builtin_constant_p() allowing gcc to eliminate most of the
* code, __msecs_to_jiffies() is called if the value passed does not
* allow constant folding and the actual conversion must be done at
* runtime.
* the _msecs_to_jiffies helpers are the HZ dependent conversion
* routines found in include/linux/jiffies.h
*/
unsigned long __msecs_to_jiffies(const unsigned int m)
{
/*
* Negative value, means infinite timeout:
*/
if ((int)m < 0)
return MAX_JIFFY_OFFSET;
return _msecs_to_jiffies(m);
}
EXPORT_SYMBOL(__msecs_to_jiffies);
unsigned long __usecs_to_jiffies(const unsigned int u)
{
if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
return MAX_JIFFY_OFFSET;
return _usecs_to_jiffies(u);
}
EXPORT_SYMBOL(__usecs_to_jiffies);
/*
* The TICK_NSEC - 1 rounds up the value to the next resolution. Note
* that a remainder subtract here would not do the right thing as the
* resolution values don't fall on second boundaries. I.e. the line:
* nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
* Note that due to the small error in the multiplier here, this
* rounding is incorrect for sufficiently large values of tv_nsec, but
* well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're
* OK.
*
* Rather, we just shift the bits off the right.
*
* The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
* value to a scaled second value.
*/
unsigned long
timespec64_to_jiffies(const struct timespec64 *value)
{
u64 sec = value->tv_sec;
long nsec = value->tv_nsec + TICK_NSEC - 1;
if (sec >= MAX_SEC_IN_JIFFIES){
sec = MAX_SEC_IN_JIFFIES;
nsec = 0;
}
return ((sec * SEC_CONVERSION) +
(((u64)nsec * NSEC_CONVERSION) >>
(NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
}
EXPORT_SYMBOL(timespec64_to_jiffies);
void
jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
/*
* Convert jiffies to nanoseconds and separate with
* one divide.
*/
u32 rem;
value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
NSEC_PER_SEC, &rem);
value->tv_nsec = rem;
}
EXPORT_SYMBOL(jiffies_to_timespec64);
/*
* Convert jiffies/jiffies_64 to clock_t and back.
*/
clock_t jiffies_to_clock_t(unsigned long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
# if HZ < USER_HZ
return x * (USER_HZ / HZ);
# else
return x / (HZ / USER_HZ);
# endif
#else
return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
#endif
}
EXPORT_SYMBOL(jiffies_to_clock_t);
unsigned long clock_t_to_jiffies(unsigned long x)
{
#if (HZ % USER_HZ)==0
if (x >= ~0UL / (HZ / USER_HZ))
return ~0UL;
return x * (HZ / USER_HZ);
#else
/* Don't worry about loss of precision here .. */
if (x >= ~0UL / HZ * USER_HZ)
return ~0UL;
/* .. but do try to contain it here */
return div_u64((u64)x * HZ, USER_HZ);
#endif
}
EXPORT_SYMBOL(clock_t_to_jiffies);
u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
# if HZ < USER_HZ
x = div_u64(x * USER_HZ, HZ);
# elif HZ > USER_HZ
x = div_u64(x, HZ / USER_HZ);
# else
/* Nothing to do */
# endif
#else
/*
* There are better ways that don't overflow early,
* but even this doesn't overflow in hundreds of years
* in 64 bits, so..
*/
x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
#endif
return x;
}
EXPORT_SYMBOL(jiffies_64_to_clock_t);
u64 nsec_to_clock_t(u64 x)
{
#if (NSEC_PER_SEC % USER_HZ) == 0
return div_u64(x, NSEC_PER_SEC / USER_HZ);
#elif (USER_HZ % 512) == 0
return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
#else
/*
* max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
* overflow after 64.99 years.
* exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
*/
return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
#endif
}
u64 jiffies64_to_nsecs(u64 j)
{
#if !(NSEC_PER_SEC % HZ)
return (NSEC_PER_SEC / HZ) * j;
# else
return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN);
#endif
}
EXPORT_SYMBOL(jiffies64_to_nsecs);
u64 jiffies64_to_msecs(const u64 j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
return (MSEC_PER_SEC / HZ) * j;
#else
return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
#endif
}
EXPORT_SYMBOL(jiffies64_to_msecs);
/**
* nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
*
* @n: nsecs in u64
*
* Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
* And this doesn't return MAX_JIFFY_OFFSET since this function is designed
* for scheduler, not for use in device drivers to calculate timeout value.
*
* note:
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
*/
u64 nsecs_to_jiffies64(u64 n)
{
#if (NSEC_PER_SEC % HZ) == 0
/* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
return div_u64(n, NSEC_PER_SEC / HZ);
#elif (HZ % 512) == 0
/* overflow after 292 years if HZ = 1024 */
return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
#else
/*
* Generic case - optimized for cases where HZ is a multiple of 3.
* overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
*/
return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
#endif
}
EXPORT_SYMBOL(nsecs_to_jiffies64);
/**
* nsecs_to_jiffies - Convert nsecs in u64 to jiffies
*
* @n: nsecs in u64
*
* Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
* And this doesn't return MAX_JIFFY_OFFSET since this function is designed
* for scheduler, not for use in device drivers to calculate timeout value.
*
* note:
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
*/
unsigned long nsecs_to_jiffies(u64 n)
{
return (unsigned long)nsecs_to_jiffies64(n);
}
EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
/*
* Add two timespec64 values and do a safety check for overflow.
* It's assumed that both values are valid (>= 0).
* And, each timespec64 is in normalized form.
*/
struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
const struct timespec64 rhs)
{
struct timespec64 res;
set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec,
lhs.tv_nsec + rhs.tv_nsec);
if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
res.tv_sec = TIME64_MAX;
res.tv_nsec = 0;
}
return res;
}
int get_timespec64(struct timespec64 *ts,
const struct __kernel_timespec __user *uts)
{
struct __kernel_timespec kts;
int ret;
ret = copy_from_user(&kts, uts, sizeof(kts));
if (ret)
return -EFAULT;
ts->tv_sec = kts.tv_sec;
/* Zero out the padding in compat mode */
if (in_compat_syscall())
kts.tv_nsec &= 0xFFFFFFFFUL;
/* In 32-bit mode, this drops the padding */
ts->tv_nsec = kts.tv_nsec; return 0;
}
EXPORT_SYMBOL_GPL(get_timespec64);
int put_timespec64(const struct timespec64 *ts,
struct __kernel_timespec __user *uts)
{
struct __kernel_timespec kts = {
.tv_sec = ts->tv_sec,
.tv_nsec = ts->tv_nsec
};
return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(put_timespec64);
static int __get_old_timespec32(struct timespec64 *ts64,
const struct old_timespec32 __user *cts)
{
struct old_timespec32 ts;
int ret;
ret = copy_from_user(&ts, cts, sizeof(ts));
if (ret)
return -EFAULT;
ts64->tv_sec = ts.tv_sec;
ts64->tv_nsec = ts.tv_nsec;
return 0;
}
static int __put_old_timespec32(const struct timespec64 *ts64,
struct old_timespec32 __user *cts)
{
struct old_timespec32 ts = {
.tv_sec = ts64->tv_sec,
.tv_nsec = ts64->tv_nsec
};
return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
}
int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
else
return __get_old_timespec32(ts, uts);
}
EXPORT_SYMBOL_GPL(get_old_timespec32);
int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
else
return __put_old_timespec32(ts, uts);
}
EXPORT_SYMBOL_GPL(put_old_timespec32);
int get_itimerspec64(struct itimerspec64 *it,
const struct __kernel_itimerspec __user *uit)
{
int ret;
ret = get_timespec64(&it->it_interval, &uit->it_interval);
if (ret)
return ret;
ret = get_timespec64(&it->it_value, &uit->it_value);
return ret;
}
EXPORT_SYMBOL_GPL(get_itimerspec64);
int put_itimerspec64(const struct itimerspec64 *it,
struct __kernel_itimerspec __user *uit)
{
int ret;
ret = put_timespec64(&it->it_interval, &uit->it_interval);
if (ret)
return ret;
ret = put_timespec64(&it->it_value, &uit->it_value);
return ret;
}
EXPORT_SYMBOL_GPL(put_itimerspec64);
int get_old_itimerspec32(struct itimerspec64 *its,
const struct old_itimerspec32 __user *uits)
{
if (__get_old_timespec32(&its->it_interval, &uits->it_interval) ||
__get_old_timespec32(&its->it_value, &uits->it_value))
return -EFAULT;
return 0;
}
EXPORT_SYMBOL_GPL(get_old_itimerspec32);
int put_old_itimerspec32(const struct itimerspec64 *its,
struct old_itimerspec32 __user *uits)
{
if (__put_old_timespec32(&its->it_interval, &uits->it_interval) ||
__put_old_timespec32(&its->it_value, &uits->it_value))
return -EFAULT;
return 0;
}
EXPORT_SYMBOL_GPL(put_old_itimerspec32);
// SPDX-License-Identifier: GPL-2.0-only
/*
* (C) 1997 Linus Torvalds
* (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
*/
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/memblock.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/prefetch.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <trace/events/writeback.h>
#include "internal.h"
/*
* Inode locking rules:
*
* inode->i_lock protects:
* inode->i_state, inode->i_hash, __iget()
* Inode LRU list locks protect:
* inode->i_sb->s_inode_lru, inode->i_lru
* inode->i_sb->s_inode_list_lock protects:
* inode->i_sb->s_inodes, inode->i_sb_list
* bdi->wb.list_lock protects:
* bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
* inode_hash_lock protects:
* inode_hashtable, inode->i_hash
*
* Lock ordering:
*
* inode->i_sb->s_inode_list_lock
* inode->i_lock
* Inode LRU list locks
*
* bdi->wb.list_lock
* inode->i_lock
*
* inode_hash_lock
* inode->i_sb->s_inode_list_lock
* inode->i_lock
*
* iunique_lock
* inode_hash_lock
*/
static unsigned int i_hash_mask __read_mostly;
static unsigned int i_hash_shift __read_mostly;
static struct hlist_head *inode_hashtable __read_mostly;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
/*
* Empty aops. Can be used for the cases where the user does not
* define any of the address_space operations.
*/
const struct address_space_operations empty_aops = {
};
EXPORT_SYMBOL(empty_aops);
/*
* Statistics gathering..
*/
struct inodes_stat_t inodes_stat;
static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);
static struct kmem_cache *inode_cachep __read_mostly;
static long get_nr_inodes(void)
{
int i;
long sum = 0;
for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 0 : sum;
}
static inline long get_nr_inodes_unused(void)
{
int i;
long sum = 0;
for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum;
}
long get_nr_dirty_inodes(void)
{
/* not actually dirty inodes, but a wild approximation */
long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
return nr_dirty > 0 ? nr_dirty : 0;
}
/*
* Handle nr_inode sysctl
*/
#ifdef CONFIG_SYSCTL
int proc_nr_inodes(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
inodes_stat.nr_inodes = get_nr_inodes();
inodes_stat.nr_unused = get_nr_inodes_unused();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif
static int no_open(struct inode *inode, struct file *file)
{
return -ENXIO;
}
/**
* inode_init_always - perform inode structure initialisation
* @sb: superblock inode belongs to
* @inode: inode to initialise
*
* These are initializations that need to be done on every inode
* allocation as the fields are not initialised by slab allocation.
*/
int inode_init_always(struct super_block *sb, struct inode *inode)
{
static const struct inode_operations empty_iops;
static const struct file_operations no_open_fops = {.open = no_open};
struct address_space *const mapping = &inode->i_data;
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
atomic64_set(&inode->i_sequence, 0);
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
inode->i_fop = &no_open_fops;
inode->i_ino = 0;
inode->__i_nlink = 1;
inode->i_opflags = 0;
if (sb->s_xattr)
inode->i_opflags |= IOP_XATTR;
i_uid_write(inode, 0);
i_gid_write(inode, 0);
atomic_set(&inode->i_writecount, 0);
inode->i_size = 0;
inode->i_write_hint = WRITE_LIFE_NOT_SET;
inode->i_blocks = 0;
inode->i_bytes = 0;
inode->i_generation = 0;
inode->i_pipe = NULL;
inode->i_cdev = NULL;
inode->i_link = NULL;
inode->i_dir_seq = 0;
inode->i_rdev = 0;
inode->dirtied_when = 0;
#ifdef CONFIG_CGROUP_WRITEBACK
inode->i_wb_frn_winner = 0;
inode->i_wb_frn_avg_time = 0;
inode->i_wb_frn_history = 0;
#endif
if (security_inode_alloc(inode))
goto out;
spin_lock_init(&inode->i_lock);
lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
init_rwsem(&inode->i_rwsem);
lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
atomic_set(&inode->i_dio_count, 0);
mapping->a_ops = &empty_aops;
mapping->host = inode;
mapping->flags = 0;
if (sb->s_type->fs_flags & FS_THP_SUPPORT)
__set_bit(AS_THP_SUPPORT, &mapping->flags); mapping->wb_err = 0;
atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
atomic_set(&mapping->nr_thps, 0);
#endif
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->private_data = NULL;
mapping->writeback_index = 0;
init_rwsem(&mapping->invalidate_lock);
lockdep_set_class_and_name(&mapping->invalidate_lock,
&sb->s_type->invalidate_lock_key,
"mapping.invalidate_lock");
inode->i_private = NULL;
inode->i_mapping = mapping;
INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif
#ifdef CONFIG_FSNOTIFY
inode->i_fsnotify_mask = 0;
#endif
inode->i_flctx = NULL;
this_cpu_inc(nr_inodes);
return 0;
out:
return -ENOMEM;
}
EXPORT_SYMBOL(inode_init_always);
void free_inode_nonrcu(struct inode *inode)
{
kmem_cache_free(inode_cachep, inode);
}
EXPORT_SYMBOL(free_inode_nonrcu);
static void i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
if (inode->free_inode)
inode->free_inode(inode);
else
free_inode_nonrcu(inode);
}
static struct inode *alloc_inode(struct super_block *sb)
{
const struct super_operations *ops = sb->s_op;
struct inode *inode;
if (ops->alloc_inode)
inode = ops->alloc_inode(sb);
else
inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); if (!inode)
return NULL;
if (unlikely(inode_init_always(sb, inode))) { if (ops->destroy_inode) { ops->destroy_inode(inode);
if (!ops->free_inode)
return NULL;
}
inode->free_inode = ops->free_inode;
i_callback(&inode->i_rcu);
return NULL;
}
return inode;
}
void __destroy_inode(struct inode *inode)
{
BUG_ON(inode_has_buffers(inode));
inode_detach_wb(inode);
security_inode_free(inode);
fsnotify_inode_delete(inode);
locks_free_lock_context(inode);
if (!inode->i_nlink) {
WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); atomic_long_dec(&inode->i_sb->s_remove_count);
}
#ifdef CONFIG_FS_POSIX_ACL
if (inode->i_acl && !is_uncached_acl(inode->i_acl))
posix_acl_release(inode->i_acl);
if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
posix_acl_release(inode->i_default_acl);
#endif
this_cpu_dec(nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);
static void destroy_inode(struct inode *inode)
{
const struct super_operations *ops = inode->i_sb->s_op; BUG_ON(!list_empty(&inode->i_lru)); __destroy_inode(inode);
if (ops->destroy_inode) {
ops->destroy_inode(inode);
if (!ops->free_inode)
return;
}
inode->free_inode = ops->free_inode;
call_rcu(&inode->i_rcu, i_callback);
}
/**
* drop_nlink - directly drop an inode's link count
* @inode: inode
*
* This is a low-level filesystem helper to replace any
* direct filesystem manipulation of i_nlink. In cases
* where we are attempting to track writes to the
* filesystem, a decrement to zero means an imminent
* write when the file is truncated and actually unlinked
* on the filesystem.
*/
void drop_nlink(struct inode *inode)
{
WARN_ON(inode->i_nlink == 0); inode->__i_nlink--;
if (!inode->i_nlink)
atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(drop_nlink);
/**
* clear_nlink - directly zero an inode's link count
* @inode: inode
*
* This is a low-level filesystem helper to replace any
* direct filesystem manipulation of i_nlink. See
* drop_nlink() for why we care about i_nlink hitting zero.
*/
void clear_nlink(struct inode *inode)
{
if (inode->i_nlink) { inode->__i_nlink = 0;
atomic_long_inc(&inode->i_sb->s_remove_count);
}
}
EXPORT_SYMBOL(clear_nlink);
/**
* set_nlink - directly set an inode's link count
* @inode: inode
* @nlink: new nlink (should be non-zero)
*
* This is a low-level filesystem helper to replace any
* direct filesystem manipulation of i_nlink.
*/
void set_nlink(struct inode *inode, unsigned int nlink)
{
if (!nlink) {
clear_nlink(inode);
} else {
/* Yes, some filesystems do change nlink from zero to one */
if (inode->i_nlink == 0) atomic_long_dec(&inode->i_sb->s_remove_count); inode->__i_nlink = nlink;
}
}
EXPORT_SYMBOL(set_nlink);
/**
* inc_nlink - directly increment an inode's link count
* @inode: inode
*
* This is a low-level filesystem helper to replace any
* direct filesystem manipulation of i_nlink. Currently,
* it is only here for parity with dec_nlink().
*/
void inc_nlink(struct inode *inode)
{
if (unlikely(inode->i_nlink == 0)) { WARN_ON(!(inode->i_state & I_LINKABLE)); atomic_long_dec(&inode->i_sb->s_remove_count);
}
inode->__i_nlink++;
}
EXPORT_SYMBOL(inc_nlink);
static void __address_space_init_once(struct address_space *mapping)
{
xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
mapping->i_mmap = RB_ROOT_CACHED;
}
void address_space_init_once(struct address_space *mapping)
{
memset(mapping, 0, sizeof(*mapping));
__address_space_init_once(mapping);
}
EXPORT_SYMBOL(address_space_init_once);
/*
* These are initializations that only need to be done
* once, because the fields are idempotent across use
* of the inode, so let the slab aware of that.
*/
void inode_init_once(struct inode *inode)
{
memset(inode, 0, sizeof(*inode));
INIT_HLIST_NODE(&inode->i_hash);
INIT_LIST_HEAD(&inode->i_devices);
INIT_LIST_HEAD(&inode->i_io_list);
INIT_LIST_HEAD(&inode->i_wb_list);
INIT_LIST_HEAD(&inode->i_lru);
__address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
}
EXPORT_SYMBOL(inode_init_once);
static void init_once(void *foo)
{
struct inode *inode = (struct inode *) foo;
inode_init_once(inode);
}
/*
* inode->i_lock must be held
*/
void __iget(struct inode *inode)
{
atomic_inc(&inode->i_count);
}
/*
* get additional reference to inode; caller must already hold one.
*/
void ihold(struct inode *inode)
{
WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}
EXPORT_SYMBOL(ihold);
static void inode_lru_list_add(struct inode *inode)
{
if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused);
else
inode->i_state |= I_REFERENCED;
}
/*
* Add inode to LRU if needed (inode is unused and clean).
*
* Needs inode->i_lock held.
*/
void inode_add_lru(struct inode *inode)
{
if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
I_FREEING | I_WILL_FREE)) &&
!atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
inode_lru_list_add(inode);
}
static void inode_lru_list_del(struct inode *inode)
{
if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused);
}
/**
* inode_sb_list_add - add inode to the superblock list of inodes
* @inode: inode to add
*/
void inode_sb_list_add(struct inode *inode)
{
spin_lock(&inode->i_sb->s_inode_list_lock);
list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
spin_unlock(&inode->i_sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);
static inline void inode_sb_list_del(struct inode *inode)
{
if (!list_empty(&inode->i_sb_list)) { spin_lock(&inode->i_sb->s_inode_list_lock);
list_del_init(&inode->i_sb_list);
spin_unlock(&inode->i_sb->s_inode_list_lock);
}
}
static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
unsigned long tmp;
tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
L1_CACHE_BYTES;
tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
return tmp & i_hash_mask;
}
/**
* __insert_inode_hash - hash an inode
* @inode: unhashed inode
* @hashval: unsigned long value used to locate this object in the
* inode_hashtable.
*
* Add an inode to the inode hash for this superblock.
*/
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
spin_lock(&inode_hash_lock);
spin_lock(&inode->i_lock);
hlist_add_head_rcu(&inode->i_hash, b);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);
/**
* __remove_inode_hash - remove an inode from the hash
* @inode: inode to unhash
*
* Remove an inode from the superblock.
*/
void __remove_inode_hash(struct inode *inode)
{
spin_lock(&inode_hash_lock);
spin_lock(&inode->i_lock);
hlist_del_init_rcu(&inode->i_hash);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__remove_inode_hash);
void clear_inode(struct inode *inode)
{
/*
* We have to cycle the i_pages lock here because reclaim can be in the
* process of removing the last page (in __delete_from_page_cache())
* and we must not free the mapping under it.
*/
xa_lock_irq(&inode->i_data.i_pages);
BUG_ON(inode->i_data.nrpages);
/*
* Almost always, mapping_empty(&inode->i_data) here; but there are
* two known and long-standing ways in which nodes may get left behind
* (when deep radix-tree node allocation failed partway; or when THP
* collapse_file() failed). Until those two known cases are cleaned up,
* or a cleanup function is called here, do not BUG_ON(!mapping_empty),
* nor even WARN_ON(!mapping_empty).
*/
xa_unlock_irq(&inode->i_data.i_pages);
BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list));
/* don't need i_lock here, no concurrent mods to i_state */
inode->i_state = I_FREEING | I_CLEAR;
}
EXPORT_SYMBOL(clear_inode);
/*
* Free the inode passed in, removing it from the lists it is still connected
* to. We remove any pages still attached to the inode and wait for any IO that
* is still in progress before finally destroying the inode.
*
* An inode must already be marked I_FREEING so that we avoid the inode being
* moved back onto lists if we race with other code that manipulates the lists
* (e.g. writeback_single_inode). The caller is responsible for setting this.
*
* An inode must already be removed from the LRU list before being evicted from
* the cache. This should occur atomically with setting the I_FREEING state
* flag, so no inodes here should ever be on the LRU when being evicted.
*/
static void evict(struct inode *inode)
{
const struct super_operations *op = inode->i_sb->s_op; BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru));
if (!list_empty(&inode->i_io_list))
inode_io_list_del(inode);
inode_sb_list_del(inode);
/*
* Wait for flusher thread to be done with the inode so that filesystem
* does not start destroying it while writeback is still running. Since
* the inode has I_FREEING set, flusher thread won't start new work on
* the inode. We just have to wait for running writeback to finish.
*/
inode_wait_for_writeback(inode);
if (op->evict_inode) {
op->evict_inode(inode);
} else {
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
}
if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode);
remove_inode_hash(inode);
spin_lock(&inode->i_lock);
wake_up_bit(&inode->i_state, __I_NEW);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
spin_unlock(&inode->i_lock);
destroy_inode(inode);
}
/*
* dispose_list - dispose of the contents of a local list
* @head: the head of the list to free
*
* Dispose-list gets a local list with local inodes in it, so it doesn't
* need to worry about list corruption and SMP locks.
*/
static void dispose_list(struct list_head *head)
{
while (!list_empty(head)) {
struct inode *inode;
inode = list_first_entry(head, struct inode, i_lru);
list_del_init(&inode->i_lru);
evict(inode);
cond_resched();
}
}
/**
* evict_inodes - evict all evictable inodes for a superblock
* @sb: superblock to operate on
*
* Make sure that no inodes with zero refcount are retained. This is
* called by superblock shutdown after having SB_ACTIVE flag removed,
* so any inode reaching zero refcount during or after that call will
* be immediately evicted.
*/
void evict_inodes(struct super_block *sb)
{
struct inode *inode, *next;
LIST_HEAD(dispose);
again:
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count))
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
spin_unlock(&inode->i_lock);
continue;
}
inode->i_state |= I_FREEING;
inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
/*
* We can have a ton of inodes to evict at unmount time given
* enough memory, check to see if we need to go to sleep for a
* bit so we don't livelock.
*/
if (need_resched()) {
spin_unlock(&sb->s_inode_list_lock);
cond_resched();
dispose_list(&dispose);
goto again;
}
}
spin_unlock(&sb->s_inode_list_lock);
dispose_list(&dispose);
}
EXPORT_SYMBOL_GPL(evict_inodes);
/**
* invalidate_inodes - attempt to free all inodes on a superblock
* @sb: superblock to operate on
* @kill_dirty: flag to guide handling of dirty inodes
*
* Attempts to free all inodes for a given superblock. If there were any
* busy inodes return a non-zero value, else zero.
* If @kill_dirty is set, discard dirty inodes too, otherwise treat
* them as busy.
*/
int invalidate_inodes(struct super_block *sb, bool kill_dirty)
{
int busy = 0;
struct inode *inode, *next;
LIST_HEAD(dispose);
again:
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
spin_unlock(&inode->i_lock);
continue;
}
if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
spin_unlock(&inode->i_lock);
busy = 1;
continue;
}
if (atomic_read(&inode->i_count)) {
spin_unlock(&inode->i_lock);
busy = 1;
continue;
}
inode->i_state |= I_FREEING;
inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
if (need_resched()) {
spin_unlock(&sb->s_inode_list_lock);
cond_resched();
dispose_list(&dispose);
goto again;
}
}
spin_unlock(&sb->s_inode_list_lock);
dispose_list(&dispose);
return busy;
}
/*
* Isolate the inode from the LRU in preparation for freeing it.
*
* Any inodes which are pinned purely because of attached pagecache have their
* pagecache removed. If the inode has metadata buffers attached to
* mapping->private_list then try to remove them.
*
* If the inode has the I_REFERENCED flag set, then it means that it has been
* used recently - the flag is set in iput_final(). When we encounter such an
* inode, clear the flag and move it to the back of the LRU so it gets another
* pass through the LRU before it gets reclaimed. This is necessary because of
* the fact we are doing lazy LRU updates to minimise lock contention so the
* LRU does not have strict ordering. Hence we don't want to reclaim inodes
* with this flag set because they are the inodes that are out of order.
*/
static enum lru_status inode_lru_isolate(struct list_head *item,
struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
struct list_head *freeable = arg;
struct inode *inode = container_of(item, struct inode, i_lru);
/*
* we are inverting the lru lock/inode->i_lock here, so use a trylock.
* If we fail to get the lock, just skip it.
*/
if (!spin_trylock(&inode->i_lock))
return LRU_SKIP;
/*
* Referenced or dirty inodes are still in use. Give them another pass
* through the LRU as we canot reclaim them now.
*/
if (atomic_read(&inode->i_count) ||
(inode->i_state & ~I_REFERENCED)) {
list_lru_isolate(lru, &inode->i_lru);
spin_unlock(&inode->i_lock);
this_cpu_dec(nr_unused);
return LRU_REMOVED;
}
/* recently referenced inodes get one more pass */
if (inode->i_state & I_REFERENCED) {
inode->i_state &= ~I_REFERENCED;
spin_unlock(&inode->i_lock);
return LRU_ROTATE;
}
if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
__iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(lru_lock);
if (remove_inode_buffers(inode)) {
unsigned long reap;
reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
if (current_is_kswapd())
__count_vm_events(KSWAPD_INODESTEAL, reap);
else
__count_vm_events(PGINODESTEAL, reap);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += reap;
}
iput(inode);
spin_lock(lru_lock);
return LRU_RETRY;
}
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
list_lru_isolate_move(lru, &inode->i_lru, freeable);
spin_unlock(&inode->i_lock);
this_cpu_dec(nr_unused);
return LRU_REMOVED;
}
/*
* Walk the superblock inode LRU for freeable inodes and attempt to free them.
* This is called from the superblock shrinker function with a number of inodes
* to trim from the LRU. Inodes to be freed are moved to a temporary list and
* then are freed outside inode_lock by dispose_list().
*/
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
LIST_HEAD(freeable);
long freed;
freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
inode_lru_isolate, &freeable);
dispose_list(&freeable);
return freed;
}
static void __wait_on_freeing_inode(struct inode *inode);
/*
* Called with the inode lock held.
*/
static struct inode *find_inode(struct super_block *sb,
struct hlist_head *head,
int (*test)(struct inode *, void *),
void *data)
{
struct inode *inode = NULL;
repeat:
hlist_for_each_entry(inode, head, i_hash) { if (inode->i_sb != sb)
continue;
if (!test(inode, data))
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
__wait_on_freeing_inode(inode);
goto repeat;
}
if (unlikely(inode->i_state & I_CREATING)) {
spin_unlock(&inode->i_lock);
return ERR_PTR(-ESTALE);
}
__iget(inode);
spin_unlock(&inode->i_lock);
return inode;
}
return NULL;
}
/*
* find_inode_fast is the fast path version of find_inode, see the comment at
* iget_locked for details.
*/
static struct inode *find_inode_fast(struct super_block *sb,
struct hlist_head *head, unsigned long ino)
{
struct inode *inode = NULL;
repeat:
hlist_for_each_entry(inode, head, i_hash) { if (inode->i_ino != ino)
continue;
if (inode->i_sb != sb)
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
__wait_on_freeing_inode(inode);
goto repeat;
}
if (unlikely(inode->i_state & I_CREATING)) {
spin_unlock(&inode->i_lock);
return ERR_PTR(-ESTALE);
}
__iget(inode);
spin_unlock(&inode->i_lock);
return inode;
}
return NULL;
}
/*
* Each cpu owns a range of LAST_INO_BATCH numbers.
* 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
* to renew the exhausted range.
*
* This does not significantly increase overflow rate because every CPU can
* consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
* NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
* 2^32 range, and is a worst-case. Even a 50% wastage would only increase
* overflow rate by 2x, which does not seem too significant.
*
* On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
* error if st_ino won't fit in target struct field. Use 32bit counter
* here to attempt to avoid that.
*/
#define LAST_INO_BATCH 1024
static DEFINE_PER_CPU(unsigned int, last_ino);
unsigned int get_next_ino(void)
{
unsigned int *p = &get_cpu_var(last_ino);
unsigned int res = *p;
#ifdef CONFIG_SMP
if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
static atomic_t shared_last_ino;
int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
res = next - LAST_INO_BATCH;
}
#endif
res++;
/* get_next_ino should not provide a 0 inode number */
if (unlikely(!res))
res++;
*p = res;
put_cpu_var(last_ino);
return res;
}
EXPORT_SYMBOL(get_next_ino);
/**
* new_inode_pseudo - obtain an inode
* @sb: superblock
*
* Allocates a new inode for given superblock.
* Inode wont be chained in superblock s_inodes list
* This means :
* - fs can't be unmount
* - quotas, fsnotify, writeback can't work
*/
struct inode *new_inode_pseudo(struct super_block *sb)
{
struct inode *inode = alloc_inode(sb);
if (inode) {
spin_lock(&inode->i_lock);
inode->i_state = 0;
spin_unlock(&inode->i_lock);
INIT_LIST_HEAD(&inode->i_sb_list);
}
return inode;
}
/**
* new_inode - obtain an inode
* @sb: superblock
*
* Allocates a new inode for given superblock. The default gfp_mask
* for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
* If HIGHMEM pages are unsuitable or it is known that pages allocated
* for the page cache are not reclaimable or migratable,
* mapping_set_gfp_mask() must be called with suitable flags on the
* newly created inode's mapping
*
*/
struct inode *new_inode(struct super_block *sb)
{
struct inode *inode;
spin_lock_prefetch(&sb->s_inode_list_lock);
inode = new_inode_pseudo(sb);
if (inode)
inode_sb_list_add(inode); return inode;
}
EXPORT_SYMBOL(new_inode);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void lockdep_annotate_inode_mutex_key(struct inode *inode)
{
if (S_ISDIR(inode->i_mode)) {
struct file_system_type *type = inode->i_sb->s_type;
/* Set new key only if filesystem hasn't already changed it */
if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
/*
* ensure nobody is actually holding i_mutex
*/
// mutex_destroy(&inode->i_mutex);
init_rwsem(&inode->i_rwsem);
lockdep_set_class(&inode->i_rwsem,
&type->i_mutex_dir_key);
}
}
}
EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
#endif
/**
* unlock_new_inode - clear the I_NEW state and wake up any waiters
* @inode: new inode to unlock
*
* Called when the inode is fully initialised to clear the new state of the
* inode and wake up anyone waiting for the inode to finish initialisation.
*/
void unlock_new_inode(struct inode *inode)
{
lockdep_annotate_inode_mutex_key(inode);
spin_lock(&inode->i_lock);
WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING;
smp_mb();
wake_up_bit(&inode->i_state, __I_NEW);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(unlock_new_inode);
void discard_new_inode(struct inode *inode)
{
lockdep_annotate_inode_mutex_key(inode);
spin_lock(&inode->i_lock);
WARN_ON(!(inode->i_state & I_NEW));
inode->i_state &= ~I_NEW;
smp_mb();
wake_up_bit(&inode->i_state, __I_NEW);
spin_unlock(&inode->i_lock);
iput(inode);
}
EXPORT_SYMBOL(discard_new_inode);
/**
* lock_two_nondirectories - take two i_mutexes on non-directory objects
*
* Lock any non-NULL argument that is not a directory.
* Zero, one or two objects may be locked by this function.
*
* @inode1: first inode to lock
* @inode2: second inode to lock
*/
void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
if (inode1 > inode2)
swap(inode1, inode2);
if (inode1 && !S_ISDIR(inode1->i_mode))
inode_lock(inode1);
if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
inode_lock_nested(inode2, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);
/**
* unlock_two_nondirectories - release locks from lock_two_nondirectories()
* @inode1: first inode to unlock
* @inode2: second inode to unlock
*/
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
if (inode1 && !S_ISDIR(inode1->i_mode))
inode_unlock(inode1);
if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
inode_unlock(inode2);
}
EXPORT_SYMBOL(unlock_two_nondirectories);
/**
* inode_insert5 - obtain an inode from a mounted file system
* @inode: pre-allocated inode to use for insert to cache
* @hashval: hash value (usually inode number) to get
* @test: callback used for comparisons between inodes
* @set: callback used to initialize a new struct inode
* @data: opaque data pointer to pass to @test and @set
*
* Search for the inode specified by @hashval and @data in the inode cache,
* and if present it is return it with an increased reference count. This is
* a variant of iget5_locked() for callers that don't want to fail on memory
* allocation of inode.
*
* If the inode is not in cache, insert the pre-allocated inode to cache and
* return it locked, hashed, and with the I_NEW flag set. The file system gets
* to fill it in before unlocking it via unlock_new_inode().
*
* Note both @test and @set are called with the inode_hash_lock held, so can't
* sleep.
*/
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
int (*test)(struct inode *, void *),
int (*set)(struct inode *, void *), void *data)
{
struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
struct inode *old;
bool creating = inode->i_state & I_CREATING;
again:
spin_lock(&inode_hash_lock);
old = find_inode(inode->i_sb, head, test, data);
if (unlikely(old)) {
/*
* Uhhuh, somebody else created the same inode under us.
* Use the old inode instead of the preallocated one.
*/
spin_unlock(&inode_hash_lock);
if (IS_ERR(old))
return NULL;
wait_on_inode(old);
if (unlikely(inode_unhashed(old))) {
iput(old);
goto again;
}
return old;
}
if (set && unlikely(set(inode, data))) {
inode = NULL;
goto unlock;
}
/*
* Return the locked inode with I_NEW set, the
* caller is responsible for filling in the contents
*/
spin_lock(&inode->i_lock);
inode->i_state |= I_NEW;
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
if (!creating)
inode_sb_list_add(inode);
unlock:
spin_unlock(&inode_hash_lock);
return inode;
}
EXPORT_SYMBOL(inode_insert5);
/**
* iget5_locked - obtain an inode from a mounted file system
* @sb: super block of file system
* @hashval: hash value (usually inode number) to get
* @test: callback used for comparisons between inodes
* @set: callback used to initialize a new struct inode
* @data: opaque data pointer to pass to @test and @set
*
* Search for the inode specified by @hashval and @data in the inode cache,
* and if present it is return it with an increased reference count. This is
* a generalized version of iget_locked() for file systems where the inode
* number is not sufficient for unique identification of an inode.
*
* If the inode is not in cache, allocate a new inode and return it locked,
* hashed, and with the I_NEW flag set. The file system gets to fill it in
* before unlocking it via unlock_new_inode().
*
* Note both @test and @set are called with the inode_hash_lock held, so can't
* sleep.
*/
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *),
int (*set)(struct inode *, void *), void *data)
{
struct inode *inode = ilookup5(sb, hashval, test, data);
if (!inode) {
struct inode *new = alloc_inode(sb);
if (new) {
new->i_state = 0;
inode = inode_insert5(new, hashval, test, set, data);
if (unlikely(inode != new))
destroy_inode(new);
}
}
return inode;
}
EXPORT_SYMBOL(iget5_locked);
/**
* iget_locked - obtain an inode from a mounted file system
* @sb: super block of file system
* @ino: inode number to get
*
* Search for the inode specified by @ino in the inode cache and if present
* return it with an increased reference count. This is for file systems
* where the inode number is sufficient for unique identification of an inode.
*
* If the inode is not in cache, allocate a new inode and return it locked,
* hashed, and with the I_NEW flag set. The file system gets to fill it in
* before unlocking it via unlock_new_inode().
*/
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
again:
spin_lock(&inode_hash_lock);
inode = find_inode_fast(sb, head, ino);
spin_unlock(&inode_hash_lock);
if (inode) {
if (IS_ERR(inode))
return NULL;
wait_on_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
}
return inode;
}
inode = alloc_inode(sb);
if (inode) {
struct inode *old;
spin_lock(&inode_hash_lock);
/* We released the lock, so.. */
old = find_inode_fast(sb, head, ino);
if (!old) {
inode->i_ino = ino;
spin_lock(&inode->i_lock);
inode->i_state = I_NEW;
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
inode_sb_list_add(inode);
spin_unlock(&inode_hash_lock);
/* Return the locked inode with I_NEW set, the
* caller is responsible for filling in the contents
*/
return inode;
}
/*
* Uhhuh, somebody else created the same inode under
* us. Use the old inode instead of the one we just
* allocated.
*/
spin_unlock(&inode_hash_lock);
destroy_inode(inode);
if (IS_ERR(old))
return NULL;
inode = old;
wait_on_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
}
}
return inode;
}
EXPORT_SYMBOL(iget_locked);
/*
* search the inode cache for a matching inode number.
* If we find one, then the inode number we are trying to
* allocate is not unique and so we should not use it.
*
* Returns 1 if the inode number is unique, 0 if it is not.
*/
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
struct hlist_head *b = inode_hashtable + hash(sb, ino);
struct inode *inode;
hlist_for_each_entry_rcu(inode, b, i_hash) {
if (inode->i_ino == ino && inode->i_sb == sb)
return 0;
}
return 1;
}
/**
* iunique - get a unique inode number
* @sb: superblock
* @max_reserved: highest reserved inode number
*
* Obtain an inode number that is unique on the system for a given
* superblock. This is used by file systems that have no natural
* permanent inode numbering system. An inode number is returned that
* is higher than the reserved limit but unique.
*
* BUGS:
* With a large number of inodes live on the file system this function
* currently becomes quite slow.
*/
ino_t iunique(struct super_block *sb, ino_t max_reserved)
{
/*
* On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
* error if st_ino won't fit in target struct field. Use 32bit counter
* here to attempt to avoid that.
*/
static DEFINE_SPINLOCK(iunique_lock);
static unsigned int counter;
ino_t res;
rcu_read_lock();
spin_lock(&iunique_lock);
do {
if (counter <= max_reserved)
counter = max_reserved + 1;
res = counter++;
} while (!test_inode_iunique(sb, res));
spin_unlock(&iunique_lock);
rcu_read_unlock();
return res;
}
EXPORT_SYMBOL(iunique);
struct inode *igrab(struct inode *inode)
{
spin_lock(&inode->i_lock);
if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
__iget(inode);
spin_unlock(&inode->i_lock);
} else {
spin_unlock(&inode->i_lock);
/*
* Handle the case where s_op->clear_inode is not been
* called yet, and somebody is calling igrab
* while the inode is getting freed.
*/
inode = NULL;
}
return inode;
}
EXPORT_SYMBOL(igrab);
/**
* ilookup5_nowait - search for an inode in the inode cache
* @sb: super block of file system to search
* @hashval: hash value (usually inode number) to search for
* @test: callback used for comparisons between inodes
* @data: opaque data pointer to pass to @test
*
* Search for the inode specified by @hashval and @data in the inode cache.
* If the inode is in the cache, the inode is returned with an incremented
* reference count.
*
* Note: I_NEW is not waited upon so you have to be very careful what you do
* with the returned inode. You probably should be using ilookup5() instead.
*
* Note2: @test is called with the inode_hash_lock held, so can't sleep.
*/
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode;
spin_lock(&inode_hash_lock);
inode = find_inode(sb, head, test, data);
spin_unlock(&inode_hash_lock);
return IS_ERR(inode) ? NULL : inode;
}
EXPORT_SYMBOL(ilookup5_nowait);
/**
* ilookup5 - search for an inode in the inode cache
* @sb: super block of file system to search
* @hashval: hash value (usually inode number) to search for
* @test: callback used for comparisons between inodes
* @data: opaque data pointer to pass to @test
*
* Search for the inode specified by @hashval and @data in the inode cache,
* and if the inode is in the cache, return the inode with an incremented
* reference count. Waits on I_NEW before returning the inode.
* returned with an incremented reference count.
*
* This is a generalized version of ilookup() for file systems where the
* inode number is not sufficient for unique identification of an inode.
*
* Note: @test is called with the inode_hash_lock held, so can't sleep.
*/
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
struct inode *inode;
again:
inode = ilookup5_nowait(sb, hashval, test, data);
if (inode) {
wait_on_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
}
}
return inode;
}
EXPORT_SYMBOL(ilookup5);
/**
* ilookup - search for an inode in the inode cache
* @sb: super block of file system to search
* @ino: inode number to search for
*
* Search for the inode @ino in the inode cache, and if the inode is in the
* cache, the inode is returned with an incremented reference count.
*/
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
again:
spin_lock(&inode_hash_lock);
inode = find_inode_fast(sb, head, ino);
spin_unlock(&inode_hash_lock);
if (inode) {
if (IS_ERR(inode))
return NULL;
wait_on_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
}
}
return inode;
}
EXPORT_SYMBOL(ilookup);
/**
* find_inode_nowait - find an inode in the inode cache
* @sb: super block of file system to search
* @hashval: hash value (usually inode number) to search for
* @match: callback used for comparisons between inodes
* @data: opaque data pointer to pass to @match
*
* Search for the inode specified by @hashval and @data in the inode
* cache, where the helper function @match will return 0 if the inode
* does not match, 1 if the inode does match, and -1 if the search
* should be stopped. The @match function must be responsible for
* taking the i_lock spin_lock and checking i_state for an inode being
* freed or being initialized, and incrementing the reference count
* before returning 1. It also must not sleep, since it is called with
* the inode_hash_lock spinlock held.
*
* This is a even more generalized version of ilookup5() when the
* function must never block --- find_inode() can block in
* __wait_on_freeing_inode() --- or when the caller can not increment
* the reference count because the resulting iput() might cause an
* inode eviction. The tradeoff is that the @match funtion must be
* very carefully implemented.
*/
struct inode *find_inode_nowait(struct super_block *sb,
unsigned long hashval,
int (*match)(struct inode *, unsigned long,
void *),
void *data)
{
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode, *ret_inode = NULL;
int mval;
spin_lock(&inode_hash_lock);
hlist_for_each_entry(inode, head, i_hash) {
if (inode->i_sb != sb)
continue;
mval = match(inode, hashval, data);
if (mval == 0)
continue;
if (mval == 1)
ret_inode = inode;
goto out;
}
out:
spin_unlock(&inode_hash_lock);
return ret_inode;
}
EXPORT_SYMBOL(find_inode_nowait);
/**
* find_inode_rcu - find an inode in the inode cache
* @sb: Super block of file system to search
* @hashval: Key to hash
* @test: Function to test match on an inode
* @data: Data for test function
*
* Search for the inode specified by @hashval and @data in the inode cache,
* where the helper function @test will return 0 if the inode does not match
* and 1 if it does. The @test function must be responsible for taking the
* i_lock spin_lock and checking i_state for an inode being freed or being
* initialized.
*
* If successful, this will return the inode for which the @test function
* returned 1 and NULL otherwise.
*
* The @test function is not permitted to take a ref on any inode presented.
* It is also not permitted to sleep.
*
* The caller must hold the RCU read lock.
*/
struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
"suspicious find_inode_rcu() usage");
hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_sb == sb &&
!(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) &&
test(inode, data))
return inode;
}
return NULL;
}
EXPORT_SYMBOL(find_inode_rcu);
/**
* find_inode_by_ino_rcu - Find an inode in the inode cache
* @sb: Super block of file system to search
* @ino: The inode number to match
*
* Search for the inode specified by @hashval and @data in the inode cache,
* where the helper function @test will return 0 if the inode does not match
* and 1 if it does. The @test function must be responsible for taking the
* i_lock spin_lock and checking i_state for an inode being freed or being
* initialized.
*
* If successful, this will return the inode for which the @test function
* returned 1 and NULL otherwise.
*
* The @test function is not permitted to take a ref on any inode presented.
* It is also not permitted to sleep.
*
* The caller must hold the RCU read lock.
*/
struct inode *find_inode_by_ino_rcu(struct super_block *sb,
unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
"suspicious find_inode_by_ino_rcu() usage");
hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_ino == ino &&
inode->i_sb == sb &&
!(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
return inode;
}
return NULL;
}
EXPORT_SYMBOL(find_inode_by_ino_rcu);
int insert_inode_locked(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
ino_t ino = inode->i_ino;
struct hlist_head *head = inode_hashtable + hash(sb, ino);
while (1) {
struct inode *old = NULL;
spin_lock(&inode_hash_lock);
hlist_for_each_entry(old, head, i_hash) {
if (old->i_ino != ino)
continue;
if (old->i_sb != sb)
continue;
spin_lock(&old->i_lock);
if (old->i_state & (I_FREEING|I_WILL_FREE)) {
spin_unlock(&old->i_lock);
continue;
}
break;
}
if (likely(!old)) {
spin_lock(&inode->i_lock);
inode->i_state |= I_NEW | I_CREATING;
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_hash_lock);
return 0;
}
if (unlikely(old->i_state & I_CREATING)) {
spin_unlock(&old->i_lock);
spin_unlock(&inode_hash_lock);
return -EBUSY;
}
__iget(old);
spin_unlock(&old->i_lock);
spin_unlock(&inode_hash_lock);
wait_on_inode(old);
if (unlikely(!inode_unhashed(old))) {
iput(old);
return -EBUSY;
}
iput(old);
}
}
EXPORT_SYMBOL(insert_inode_locked);
int insert_inode_locked4(struct inode *inode, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
struct inode *old;
inode->i_state |= I_CREATING;
old = inode_insert5(inode, hashval, test, NULL, data);
if (old != inode) {
iput(old);
return -EBUSY;
}
return 0;
}
EXPORT_SYMBOL(insert_inode_locked4);
int generic_delete_inode(struct inode *inode)
{
return 1;
}
EXPORT_SYMBOL(generic_delete_inode);
/*
* Called when we're dropping the last reference
* to an inode.
*
* Call the FS "drop_inode()" function, defaulting to
* the legacy UNIX filesystem behaviour. If it tells
* us to evict inode, do so. Otherwise, retain inode
* in cache if fs is alive, sync and evict if fs is
* shutting down.
*/
static void iput_final(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
const struct super_operations *op = inode->i_sb->s_op;
unsigned long state;
int drop;
WARN_ON(inode->i_state & I_NEW); if (op->drop_inode) drop = op->drop_inode(inode);
else
drop = generic_drop_inode(inode);
if (!drop &&
!(inode->i_state & I_DONTCACHE) && (sb->s_flags & SB_ACTIVE)) { inode_add_lru(inode);
spin_unlock(&inode->i_lock);
return;
}
state = inode->i_state;
if (!drop) {
WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
spin_unlock(&inode->i_lock);
write_inode_now(inode, 1);
spin_lock(&inode->i_lock);
state = inode->i_state;
WARN_ON(state & I_NEW); state &= ~I_WILL_FREE;
}
WRITE_ONCE(inode->i_state, state | I_FREEING);
if (!list_empty(&inode->i_lru))
inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
evict(inode);
}
/**
* iput - put an inode
* @inode: inode to put
*
* Puts an inode, dropping its usage count. If the inode use count hits
* zero, the inode is then freed and may also be destroyed.
*
* Consequently, iput() can sleep.
*/
void iput(struct inode *inode)
{
if (!inode)
return;
BUG_ON(inode->i_state & I_CLEAR);
retry:
if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
atomic_inc(&inode->i_count);
spin_unlock(&inode->i_lock);
trace_writeback_lazytime_iput(inode);
mark_inode_dirty_sync(inode);
goto retry;
}
iput_final(inode);
}
}
EXPORT_SYMBOL(iput);
#ifdef CONFIG_BLOCK
/**
* bmap - find a block number in a file
* @inode: inode owning the block number being requested
* @block: pointer containing the block to find
*
* Replaces the value in ``*block`` with the block number on the device holding
* corresponding to the requested block number in the file.
* That is, asked for block 4 of inode 1 the function will replace the
* 4 in ``*block``, with disk block relative to the disk start that holds that
* block of the file.
*
* Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a
* hole, returns 0 and ``*block`` is also set to 0.
*/
int bmap(struct inode *inode, sector_t *block)
{
if (!inode->i_mapping->a_ops->bmap)
return -EINVAL;
*block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block);
return 0;
}
EXPORT_SYMBOL(bmap);
#endif
/*
* With relative atime, only update atime if the previous atime is
* earlier than either the ctime or mtime or if at least a day has
* passed since the last atime update.
*/
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
struct timespec64 now)
{
if (!(mnt->mnt_flags & MNT_RELATIME))
return 1;
/*
* Is mtime younger than atime? If yes, update atime:
*/
if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
return 1;
/*
* Is ctime younger than atime? If yes, update atime:
*/
if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
return 1;
/*
* Is the previous atime value older than a day? If yes,
* update atime:
*/
if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
return 1;
/*
* Good, we can skip the atime update:
*/
return 0;
}
int generic_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
int dirty_flags = 0;
if (flags & (S_ATIME | S_CTIME | S_MTIME)) { if (flags & S_ATIME) inode->i_atime = *time; if (flags & S_CTIME) inode->i_ctime = *time; if (flags & S_MTIME) inode->i_mtime = *time; if (inode->i_sb->s_flags & SB_LAZYTIME)
dirty_flags |= I_DIRTY_TIME;
else
dirty_flags |= I_DIRTY_SYNC;
}
if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false)) dirty_flags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, dirty_flags);
return 0;
}
EXPORT_SYMBOL(generic_update_time);
/*
* This does the actual work of updating an inodes time or version. Must have
* had called mnt_want_write() before calling this.
*/
int inode_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
if (inode->i_op->update_time) return inode->i_op->update_time(inode, time, flags); return generic_update_time(inode, time, flags);
}
EXPORT_SYMBOL(inode_update_time);
/**
* atime_needs_update - update the access time
* @path: the &struct path to update
* @inode: inode to update
*
* Update the accessed time on an inode and mark it for writeback.
* This function automatically handles read only file systems and media,
* as well as the "noatime" flag and inode specific "noatime" markers.
*/
bool atime_needs_update(const struct path *path, struct inode *inode)
{
struct vfsmount *mnt = path->mnt;
struct timespec64 now;
if (inode->i_flags & S_NOATIME) return false;
/* Atime updates will likely cause i_uid and i_gid to be written
* back improprely if their true value is unknown to the vfs.
*/
if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode))
return false;
if (IS_NOATIME(inode))
return false;
if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
return false;
if (mnt->mnt_flags & MNT_NOATIME)
return false;
if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
return false;
now = current_time(inode);
if (!relatime_need_update(mnt, inode, now))
return false;
if (timespec64_equal(&inode->i_atime, &now))
return false;
return true;
}
void touch_atime(const struct path *path)
{
struct vfsmount *mnt = path->mnt;
struct inode *inode = d_inode(path->dentry);
struct timespec64 now;
if (!atime_needs_update(path, inode))
return; if (!sb_start_write_trylock(inode->i_sb))
return;
if (__mnt_want_write(mnt) != 0)
goto skip_update;
/*
* File systems can error out when updating inodes if they need to
* allocate new space to modify an inode (such is the case for
* Btrfs), but since we touch atime while walking down the path we
* really don't care if we failed to update the atime of the file,
* so just ignore the return value.
* We may also fail on filesystems that have the ability to make parts
* of the fs read only, e.g. subvolumes in Btrfs.
*/
now = current_time(inode);
inode_update_time(inode, &now, S_ATIME);
__mnt_drop_write(mnt);
skip_update:
sb_end_write(inode->i_sb);
}
EXPORT_SYMBOL(touch_atime);
/*
* The logic we want is
*
* if suid or (sgid and xgrp)
* remove privs
*/
int should_remove_suid(struct dentry *dentry)
{
umode_t mode = d_inode(dentry)->i_mode;
int kill = 0;
/* suid always must be killed */
if (unlikely(mode & S_ISUID))
kill = ATTR_KILL_SUID;
/*
* sgid without any exec bits is just a mandatory locking mark; leave
* it alone. If some exec bits are set, it's a real sgid; kill it.
*/
if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
kill |= ATTR_KILL_SGID;
if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
return kill;
return 0;
}
EXPORT_SYMBOL(should_remove_suid);
/*
* Return mask of changes for notify_change() that need to be done as a
* response to write or truncate. Return 0 if nothing has to be changed.
* Negative value on error (change should be denied).
*/
int dentry_needs_remove_privs(struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
int mask = 0;
int ret;
if (IS_NOSEC(inode))
return 0;
mask = should_remove_suid(dentry);
ret = security_inode_need_killpriv(dentry);
if (ret < 0)
return ret;
if (ret) mask |= ATTR_KILL_PRIV;
return mask;
}
static int __remove_privs(struct user_namespace *mnt_userns,
struct dentry *dentry, int kill)
{
struct iattr newattrs;
newattrs.ia_valid = ATTR_FORCE | kill;
/*
* Note we call this on write, so notify_change will not
* encounter any conflicting delegations:
*/
return notify_change(mnt_userns, dentry, &newattrs, NULL);
}
/*
* Remove special file priviledges (suid, capabilities) when file is written
* to or truncated.
*/
int file_remove_privs(struct file *file)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
int kill;
int error = 0;
/*
* Fast path for nothing security related.
* As well for non-regular files, e.g. blkdev inodes.
* For example, blkdev_write_iter() might get here
* trying to remove privs which it is not allowed to.
*/
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
kill = dentry_needs_remove_privs(dentry);
if (kill < 0)
return kill;
if (kill) error = __remove_privs(file_mnt_user_ns(file), dentry, kill); if (!error)
inode_has_no_xattr(inode);
return error;
}
EXPORT_SYMBOL(file_remove_privs);
/**
* file_update_time - update mtime and ctime time
* @file: file accessed
*
* Update the mtime and ctime members of an inode and mark the inode
* for writeback. Note that this function is meant exclusively for
* usage in the file write path of filesystems, and filesystems may
* choose to explicitly ignore update via this function with the
* S_NOCMTIME inode flag, e.g. for network filesystem where these
* timestamps are handled by the server. This can return an error for
* file systems who need to allocate space in order to update an inode.
*/
int file_update_time(struct file *file)
{
struct inode *inode = file_inode(file);
struct timespec64 now;
int sync_it = 0;
int ret;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0; now = current_time(inode);
if (!timespec64_equal(&inode->i_mtime, &now))
sync_it = S_MTIME;
if (!timespec64_equal(&inode->i_ctime, &now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) sync_it |= S_VERSION; if (!sync_it)
return 0;
/* Finally allowed to write? Takes lock. */
if (__mnt_want_write_file(file))
return 0;
ret = inode_update_time(inode, &now, sync_it);
__mnt_drop_write_file(file);
return ret;
}
EXPORT_SYMBOL(file_update_time);
/* Caller must hold the file's inode lock */
int file_modified(struct file *file)
{
int err;
/*
* Clear the security bits if the process is not being run by root.
* This keeps people from modifying setuid and setgid binaries.
*/
err = file_remove_privs(file);
if (err)
return err;
if (unlikely(file->f_mode & FMODE_NOCMTIME))
return 0;
return file_update_time(file);
}
EXPORT_SYMBOL(file_modified);
int inode_needs_sync(struct inode *inode)
{
if (IS_SYNC(inode))
return 1;
if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
return 1;
return 0;
}
EXPORT_SYMBOL(inode_needs_sync);
/*
* If we try to find an inode in the inode hash while it is being
* deleted, we have to wait until the filesystem completes its
* deletion before reporting that it isn't found. This function waits
* until the deletion _might_ have completed. Callers are responsible
* to recheck inode state.
*
* It doesn't matter if I_NEW is not set initially, a call to
* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
* will DTRT.
*/
static void __wait_on_freeing_inode(struct inode *inode)
{
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
wq = bit_waitqueue(&inode->i_state, __I_NEW);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_hash_lock);
schedule();
finish_wait(wq, &wait.wq_entry);
spin_lock(&inode_hash_lock);
}
static __initdata unsigned long ihash_entries;
static int __init set_ihash_entries(char *str)
{
if (!str)
return 0;
ihash_entries = simple_strtoul(str, &str, 0);
return 1;
}
__setup("ihash_entries=", set_ihash_entries);
/*
* Initialize the waitqueues and inode hash table.
*/
void __init inode_init_early(void)
{
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
*/
if (hashdist)
return;
inode_hashtable =
alloc_large_system_hash("Inode-cache",
sizeof(struct hlist_head),
ihash_entries,
14,
HASH_EARLY | HASH_ZERO,
&i_hash_shift,
&i_hash_mask,
0,
0);
}
void __init inode_init(void)
{
/* inode slab cache */
inode_cachep = kmem_cache_create("inode_cache",
sizeof(struct inode),
0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
/* Hash may have been set up in inode_init_early */
if (!hashdist)
return;
inode_hashtable =
alloc_large_system_hash("Inode-cache",
sizeof(struct hlist_head),
ihash_entries,
14,
HASH_ZERO,
&i_hash_shift,
&i_hash_mask,
0,
0);
}
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_mode = mode;
if (S_ISCHR(mode)) {
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
} else if (S_ISBLK(mode)) {
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
} else if (S_ISFIFO(mode))
inode->i_fop = &pipefifo_fops;
else if (S_ISSOCK(mode))
; /* leave it no_open_fops */
else
printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
" inode %s:%lu\n", mode, inode->i_sb->s_id,
inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);
/**
* inode_init_owner - Init uid,gid,mode for new inode according to posix standards
* @mnt_userns: User namespace of the mount the inode was created from
* @inode: New inode
* @dir: Directory inode
* @mode: mode of the new inode
*
* If the inode has been created through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions
* and initializing i_uid and i_gid. On non-idmapped mounts or if permission
* checking is to be performed on the raw inode simply passs init_user_ns.
*/
void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
const struct inode *dir, umode_t mode)
{
inode_fsuid_set(inode, mnt_userns);
if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid;
/* Directories are special, and always inherit S_ISGID */
if (S_ISDIR(mode))
mode |= S_ISGID; else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
!in_group_p(i_gid_into_mnt(mnt_userns, dir)) &&
!capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID)) mode &= ~S_ISGID;
} else
inode_fsgid_set(inode, mnt_userns);
inode->i_mode = mode;
}
EXPORT_SYMBOL(inode_init_owner);
/**
* inode_owner_or_capable - check current task permissions to inode
* @mnt_userns: user namespace of the mount the inode was found from
* @inode: inode being checked
*
* Return true if current either has CAP_FOWNER in a namespace with the
* inode owner uid mapped, or owns the file.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
bool inode_owner_or_capable(struct user_namespace *mnt_userns,
const struct inode *inode)
{
kuid_t i_uid;
struct user_namespace *ns;
i_uid = i_uid_into_mnt(mnt_userns, inode);
if (uid_eq(current_fsuid(), i_uid))
return true;
ns = current_user_ns();
if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER))
return true;
return false;
}
EXPORT_SYMBOL(inode_owner_or_capable);
/*
* Direct i/o helper functions
*/
static void __inode_dio_wait(struct inode *inode)
{
wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
do {
prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
if (atomic_read(&inode->i_dio_count))
schedule();
} while (atomic_read(&inode->i_dio_count));
finish_wait(wq, &q.wq_entry);
}
/**
* inode_dio_wait - wait for outstanding DIO requests to finish
* @inode: inode to wait for
*
* Waits for all pending direct I/O requests to finish so that we can
* proceed with a truncate or equivalent operation.
*
* Must be called under a lock that serializes taking new references
* to i_dio_count, usually by inode->i_mutex.
*/
void inode_dio_wait(struct inode *inode)
{
if (atomic_read(&inode->i_dio_count))
__inode_dio_wait(inode);
}
EXPORT_SYMBOL(inode_dio_wait);
/*
* inode_set_flags - atomically set some inode flags
*
* Note: the caller should be holding i_mutex, or else be sure that
* they have exclusive access to the inode structure (i.e., while the
* inode is being instantiated). The reason for the cmpxchg() loop
* --- which wouldn't be necessary if all code paths which modify
* i_flags actually followed this rule, is that there is at least one
* code path which doesn't today so we use cmpxchg() out of an abundance
* of caution.
*
* In the long run, i_mutex is overkill, and we should probably look
* at using the i_lock spinlock to protect i_flags, and then make sure
* it is so documented in include/linux/fs.h and that all code follows
* the locking convention!!
*/
void inode_set_flags(struct inode *inode, unsigned int flags,
unsigned int mask)
{
WARN_ON_ONCE(flags & ~mask);
set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);
void inode_nohighmem(struct inode *inode)
{
mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
}
EXPORT_SYMBOL(inode_nohighmem);
/**
* timestamp_truncate - Truncate timespec to a granularity
* @t: Timespec
* @inode: inode being updated
*
* Truncate a timespec to the granularity supported by the fs
* containing the inode. Always rounds down. gran must
* not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
*/
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
{
struct super_block *sb = inode->i_sb;
unsigned int gran = sb->s_time_gran;
t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max);
if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
t.tv_nsec = 0;
/* Avoid division in the common cases 1 ns and 1 s. */
if (gran == 1)
; /* nothing */
else if (gran == NSEC_PER_SEC)
t.tv_nsec = 0;
else if (gran > 1 && gran < NSEC_PER_SEC) t.tv_nsec -= t.tv_nsec % gran;
else
WARN(1, "invalid file time granularity: %u", gran); return t;
}
EXPORT_SYMBOL(timestamp_truncate);
/**
* current_time - Return FS time
* @inode: inode.
*
* Return the current time truncated to the time granularity supported by
* the fs.
*
* Note that inode and inode->sb cannot be NULL.
* Otherwise, the function warns and returns time without truncation.
*/
struct timespec64 current_time(struct inode *inode)
{
struct timespec64 now;
ktime_get_coarse_real_ts64(&now);
if (unlikely(!inode->i_sb)) {
WARN(1, "current_time() called with uninitialized super_block in the inode");
return now;
}
return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGEMAP_H
#define _LINUX_PAGEMAP_H
/*
* Copyright 1995 Linus Torvalds
*/
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <linux/bitops.h>
#include <linux/hardirq.h> /* for in_interrupt() */
#include <linux/hugetlb_inline.h>
struct pagevec;
static inline bool mapping_empty(struct address_space *mapping)
{
return xa_empty(&mapping->i_pages);
}
/*
* Bits in mapping->flags.
*/
enum mapping_flags {
AS_EIO = 0, /* IO error on async write */
AS_ENOSPC = 1, /* ENOSPC on async write */
AS_MM_ALL_LOCKS = 2, /* under mm_take_all_locks() */
AS_UNEVICTABLE = 3, /* e.g., ramdisk, SHM_LOCK */
AS_EXITING = 4, /* final truncate in progress */
/* writeback related tags are not used */
AS_NO_WRITEBACK_TAGS = 5,
AS_THP_SUPPORT = 6, /* THPs supported */
};
/**
* mapping_set_error - record a writeback error in the address_space
* @mapping: the mapping in which an error should be set
* @error: the error to set in the mapping
*
* When writeback fails in some way, we must record that error so that
* userspace can be informed when fsync and the like are called. We endeavor
* to report errors on any file that was open at the time of the error. Some
* internal callers also need to know when writeback errors have occurred.
*
* When a writeback error occurs, most filesystems will want to call
* mapping_set_error to record the error in the mapping so that it can be
* reported when the application calls fsync(2).
*/
static inline void mapping_set_error(struct address_space *mapping, int error)
{
if (likely(!error))
return;
/* Record in wb_err for checkers using errseq_t based tracking */
__filemap_set_wb_err(mapping, error);
/* Record it in superblock */
if (mapping->host)
errseq_set(&mapping->host->i_sb->s_wb_err, error);
/* Record it in flags for now, for legacy callers */
if (error == -ENOSPC)
set_bit(AS_ENOSPC, &mapping->flags);
else
set_bit(AS_EIO, &mapping->flags);
}
static inline void mapping_set_unevictable(struct address_space *mapping)
{
set_bit(AS_UNEVICTABLE, &mapping->flags);
}
static inline void mapping_clear_unevictable(struct address_space *mapping)
{
clear_bit(AS_UNEVICTABLE, &mapping->flags);
}
static inline bool mapping_unevictable(struct address_space *mapping)
{
return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags);
}
static inline void mapping_set_exiting(struct address_space *mapping)
{
set_bit(AS_EXITING, &mapping->flags);
}
static inline int mapping_exiting(struct address_space *mapping)
{
return test_bit(AS_EXITING, &mapping->flags);
}
static inline void mapping_set_no_writeback_tags(struct address_space *mapping)
{
set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}
static inline int mapping_use_writeback_tags(struct address_space *mapping)
{
return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
return mapping->gfp_mask;
}
/* Restricts the given gfp_mask to what the mapping allows. */
static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
gfp_t gfp_mask)
{
return mapping_gfp_mask(mapping) & gfp_mask;
}
/*
* This is non-atomic. Only to be used before the mapping is activated.
* Probably needs a barrier...
*/
static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
{
m->gfp_mask = mask;
}
static inline bool mapping_thp_support(struct address_space *mapping)
{
return test_bit(AS_THP_SUPPORT, &mapping->flags);
}
static inline int filemap_nr_thps(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
return atomic_read(&mapping->nr_thps);
#else
return 0;
#endif
}
static inline void filemap_nr_thps_inc(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
if (!mapping_thp_support(mapping))
atomic_inc(&mapping->nr_thps);
#else
WARN_ON_ONCE(1);
#endif
}
static inline void filemap_nr_thps_dec(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
if (!mapping_thp_support(mapping))
atomic_dec(&mapping->nr_thps);
#else
WARN_ON_ONCE(1);
#endif
}
void release_pages(struct page **pages, int nr);
/*
* For file cache pages, return the address_space, otherwise return NULL
*/
static inline struct address_space *page_mapping_file(struct page *page)
{
if (unlikely(PageSwapCache(page)))
return NULL;
return page_mapping(page);
}
/*
* speculatively take a reference to a page.
* If the page is free (_refcount == 0), then _refcount is untouched, and 0
* is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
*
* This function must be called inside the same rcu_read_lock() section as has
* been used to lookup the page in the pagecache radix-tree (or page table):
* this allows allocators to use a synchronize_rcu() to stabilize _refcount.
*
* Unless an RCU grace period has passed, the count of all pages coming out
* of the allocator must be considered unstable. page_count may return higher
* than expected, and put_page must be able to do the right thing when the
* page has been finished with, no matter what it is subsequently allocated
* for (because put_page is what is used here to drop an invalid speculative
* reference).
*
* This is the interesting part of the lockless pagecache (and lockless
* get_user_pages) locking protocol, where the lookup-side (eg. find_get_page)
* has the following pattern:
* 1. find page in radix tree
* 2. conditionally increment refcount
* 3. check the page is still in pagecache (if no, goto 1)
*
* Remove-side that cares about stability of _refcount (eg. reclaim) has the
* following (with the i_pages lock held):
* A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
* B. remove page from pagecache
* C. free the page
*
* There are 2 critical interleavings that matter:
* - 2 runs before A: in this case, A sees elevated refcount and bails out
* - A runs before 2: in this case, 2 sees zero refcount and retries;
* subsequently, B will complete and 1 will find no page, causing the
* lookup to return NULL.
*
* It is possible that between 1 and 2, the page is removed then the exact same
* page is inserted into the same position in pagecache. That's OK: the
* old find_get_page using a lock could equally have run before or after
* such a re-insertion, depending on order that locks are granted.
*
* Lookups racing against pagecache insertion isn't a big problem: either 1
* will find the page or it will not. Likewise, the old find_get_page could run
* either before the insertion or afterwards, depending on timing.
*/
static inline int __page_cache_add_speculative(struct page *page, int count)
{
#ifdef CONFIG_TINY_RCU
# ifdef CONFIG_PREEMPT_COUNT
VM_BUG_ON(!in_atomic() && !irqs_disabled());
# endif
/*
* Preempt must be disabled here - we rely on rcu_read_lock doing
* this for us.
*
* Pagecache won't be truncated from interrupt context, so if we have
* found a page in the radix tree here, we have pinned its refcount by
* disabling preempt, and hence no need for the "speculative get" that
* SMP requires.
*/
VM_BUG_ON_PAGE(page_count(page) == 0, page);
page_ref_add(page, count);
#else
if (unlikely(!page_ref_add_unless(page, count, 0))) {
/*
* Either the page has been freed, or will be freed.
* In either case, retry here and the caller should
* do the right thing (see comments above).
*/
return 0;
}
#endif
VM_BUG_ON_PAGE(PageTail(page), page);
return 1;
}
static inline int page_cache_get_speculative(struct page *page)
{
return __page_cache_add_speculative(page, 1);
}
static inline int page_cache_add_speculative(struct page *page, int count)
{
return __page_cache_add_speculative(page, count);
}
/**
* attach_page_private - Attach private data to a page.
* @page: Page to attach data to.
* @data: Data to attach to page.
*
* Attaching private data to a page increments the page's reference count.
* The data must be detached before the page will be freed.
*/
static inline void attach_page_private(struct page *page, void *data)
{
get_page(page);
set_page_private(page, (unsigned long)data);
SetPagePrivate(page);
}
/**
* detach_page_private - Detach private data from a page.
* @page: Page to detach data from.
*
* Removes the data that was previously attached to the page and decrements
* the refcount on the page.
*
* Return: Data that was attached to the page.
*/
static inline void *detach_page_private(struct page *page)
{
void *data = (void *)page_private(page);
if (!PagePrivate(page))
return NULL;
ClearPagePrivate(page);
set_page_private(page, 0);
put_page(page);
return data;
}
#ifdef CONFIG_NUMA
extern struct page *__page_cache_alloc(gfp_t gfp);
#else
static inline struct page *__page_cache_alloc(gfp_t gfp)
{
return alloc_pages(gfp, 0);
}
#endif
static inline struct page *page_cache_alloc(struct address_space *x)
{
return __page_cache_alloc(mapping_gfp_mask(x));
}
static inline gfp_t readahead_gfp_mask(struct address_space *x)
{
return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
}
typedef int filler_t(void *, struct page *);
pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan);
pgoff_t page_cache_prev_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan);
#define FGP_ACCESSED 0x00000001
#define FGP_LOCK 0x00000002
#define FGP_CREAT 0x00000004
#define FGP_WRITE 0x00000008
#define FGP_NOFS 0x00000010
#define FGP_NOWAIT 0x00000020
#define FGP_FOR_MMAP 0x00000040
#define FGP_HEAD 0x00000080
#define FGP_ENTRY 0x00000100
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
int fgp_flags, gfp_t cache_gfp_mask);
/**
* find_get_page - find and get a page reference
* @mapping: the address_space to search
* @offset: the page index
*
* Looks up the page cache slot at @mapping & @offset. If there is a
* page cache page, it is returned with an increased refcount.
*
* Otherwise, %NULL is returned.
*/
static inline struct page *find_get_page(struct address_space *mapping,
pgoff_t offset)
{
return pagecache_get_page(mapping, offset, 0, 0);
}
static inline struct page *find_get_page_flags(struct address_space *mapping,
pgoff_t offset, int fgp_flags)
{
return pagecache_get_page(mapping, offset, fgp_flags, 0);
}
/**
* find_lock_page - locate, pin and lock a pagecache page
* @mapping: the address_space to search
* @index: the page index
*
* Looks up the page cache entry at @mapping & @index. If there is a
* page cache page, it is returned locked and with an increased
* refcount.
*
* Context: May sleep.
* Return: A struct page or %NULL if there is no page in the cache for this
* index.
*/
static inline struct page *find_lock_page(struct address_space *mapping,
pgoff_t index)
{
return pagecache_get_page(mapping, index, FGP_LOCK, 0);
}
/**
* find_lock_head - Locate, pin and lock a pagecache page.
* @mapping: The address_space to search.
* @index: The page index.
*
* Looks up the page cache entry at @mapping & @index. If there is a
* page cache page, its head page is returned locked and with an increased
* refcount.
*
* Context: May sleep.
* Return: A struct page which is !PageTail, or %NULL if there is no page
* in the cache for this index.
*/
static inline struct page *find_lock_head(struct address_space *mapping,
pgoff_t index)
{
return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0);
}
/**
* find_or_create_page - locate or add a pagecache page
* @mapping: the page's address_space
* @index: the page's index into the mapping
* @gfp_mask: page allocation mode
*
* Looks up the page cache slot at @mapping & @offset. If there is a
* page cache page, it is returned locked and with an increased
* refcount.
*
* If the page is not present, a new page is allocated using @gfp_mask
* and added to the page cache and the VM's LRU list. The page is
* returned locked and with an increased refcount.
*
* On memory exhaustion, %NULL is returned.
*
* find_or_create_page() may sleep, even if @gfp_flags specifies an
* atomic allocation!
*/
static inline struct page *find_or_create_page(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask)
{
return pagecache_get_page(mapping, index,
FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
gfp_mask);
}
/**
* grab_cache_page_nowait - returns locked page at given index in given cache
* @mapping: target address_space
* @index: the page index
*
* Same as grab_cache_page(), but do not wait if the page is unavailable.
* This is intended for speculative data generators, where the data can
* be regenerated if the page couldn't be grabbed. This routine should
* be safe to call while holding the lock for another page.
*
* Clear __GFP_FS when allocating the page to avoid recursion into the fs
* and deadlock against the caller's locked page.
*/
static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
pgoff_t index)
{
return pagecache_get_page(mapping, index,
FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
mapping_gfp_mask(mapping));
}
/* Does this page contain this index? */
static inline bool thp_contains(struct page *head, pgoff_t index)
{
/* HugeTLBfs indexes the page cache in units of hpage_size */
if (PageHuge(head))
return head->index == index;
return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL));
}
/*
* Given the page we found in the page cache, return the page corresponding
* to this index in the file
*/
static inline struct page *find_subpage(struct page *head, pgoff_t index)
{
/* HugeTLBfs wants the head page regardless */
if (PageHuge(head))
return head;
return head + (index & (thp_nr_pages(head) - 1));
}
unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
pgoff_t end, unsigned int nr_pages,
struct page **pages);
static inline unsigned find_get_pages(struct address_space *mapping,
pgoff_t *start, unsigned int nr_pages,
struct page **pages)
{
return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages,
pages);
}
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages);
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
struct page **pages);
static inline unsigned find_get_pages_tag(struct address_space *mapping,
pgoff_t *index, xa_mark_t tag, unsigned int nr_pages,
struct page **pages)
{
return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag,
nr_pages, pages);
}
struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags);
/*
* Returns locked page at given index in given cache, creating it if needed.
*/
static inline struct page *grab_cache_page(struct address_space *mapping,
pgoff_t index)
{
return find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
}
extern struct page * read_cache_page(struct address_space *mapping,
pgoff_t index, filler_t *filler, void *data);
extern struct page * read_cache_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern int read_cache_pages(struct address_space *mapping,
struct list_head *pages, filler_t *filler, void *data);
static inline struct page *read_mapping_page(struct address_space *mapping,
pgoff_t index, void *data)
{
return read_cache_page(mapping, index, NULL, data);
}
/*
* Get index of the page within radix-tree (but not for hugetlb pages).
* (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
*/
static inline pgoff_t page_to_index(struct page *page)
{
struct page *head;
if (likely(!PageTransTail(page)))
return page->index;
head = compound_head(page);
/*
* We don't initialize ->index for tail pages: calculate based on
* head page
*/
return head->index + page - head;
}
extern pgoff_t hugetlb_basepage_index(struct page *page);
/*
* Get the offset in PAGE_SIZE (even for hugetlb pages).
* (TODO: hugetlb pages should have ->index in PAGE_SIZE)
*/
static inline pgoff_t page_to_pgoff(struct page *page)
{
if (unlikely(PageHuge(page)))
return hugetlb_basepage_index(page);
return page_to_index(page);
}
/*
* Return byte-offset into filesystem object for page.
*/
static inline loff_t page_offset(struct page *page)
{
return ((loff_t)page->index) << PAGE_SHIFT;
}
static inline loff_t page_file_offset(struct page *page)
{
return ((loff_t)page_index(page)) << PAGE_SHIFT;
}
extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
unsigned long address);
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
unsigned long address)
{
pgoff_t pgoff;
if (unlikely(is_vm_hugetlb_page(vma)))
return linear_hugepage_index(vma, address); pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
return pgoff;
}
struct wait_page_key {
struct page *page;
int bit_nr;
int page_match;
};
struct wait_page_queue {
struct page *page;
int bit_nr;
wait_queue_entry_t wait;
};
static inline bool wake_page_match(struct wait_page_queue *wait_page,
struct wait_page_key *key)
{
if (wait_page->page != key->page)
return false;
key->page_match = 1;
if (wait_page->bit_nr != key->bit_nr)
return false;
return true;
}
extern void __lock_page(struct page *page);
extern int __lock_page_killable(struct page *page);
extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags);
extern void unlock_page(struct page *page);
/*
* Return true if the page was successfully locked
*/
static inline int trylock_page(struct page *page)
{
page = compound_head(page); return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
}
/*
* lock_page may only be called if we have the page's inode pinned.
*/
static inline void lock_page(struct page *page)
{
might_sleep();
if (!trylock_page(page))
__lock_page(page);
}
/*
* lock_page_killable is like lock_page but can be interrupted by fatal
* signals. It returns 0 if it locked the page and -EINTR if it was
* killed while waiting.
*/
static inline int lock_page_killable(struct page *page)
{
might_sleep();
if (!trylock_page(page))
return __lock_page_killable(page);
return 0;
}
/*
* lock_page_async - Lock the page, unless this would block. If the page
* is already locked, then queue a callback when the page becomes unlocked.
* This callback can then retry the operation.
*
* Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page
* was already locked and the callback defined in 'wait' was queued.
*/
static inline int lock_page_async(struct page *page,
struct wait_page_queue *wait)
{
if (!trylock_page(page))
return __lock_page_async(page, wait);
return 0;
}
/*
* lock_page_or_retry - Lock the page, unless this would block and the
* caller indicated that it can handle a retry.
*
* Return value and mmap_lock implications depend on flags; see
* __lock_page_or_retry().
*/
static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags)
{
might_sleep();
return trylock_page(page) || __lock_page_or_retry(page, mm, flags);
}
/*
* This is exported only for wait_on_page_locked/wait_on_page_writeback, etc.,
* and should not be used directly.
*/
extern void wait_on_page_bit(struct page *page, int bit_nr);
extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
/*
* Wait for a page to be unlocked.
*
* This must be called with the caller "holding" the page,
* ie with increased "page->count" so that the page won't
* go away during the wait..
*/
static inline void wait_on_page_locked(struct page *page)
{
if (PageLocked(page))
wait_on_page_bit(compound_head(page), PG_locked);
}
static inline int wait_on_page_locked_killable(struct page *page)
{
if (!PageLocked(page))
return 0;
return wait_on_page_bit_killable(compound_head(page), PG_locked);
}
int put_and_wait_on_page_locked(struct page *page, int state);
void wait_on_page_writeback(struct page *page);
int wait_on_page_writeback_killable(struct page *page);
extern void end_page_writeback(struct page *page);
void wait_for_stable_page(struct page *page);
void __set_page_dirty(struct page *, struct address_space *, int warn);
int __set_page_dirty_nobuffers(struct page *page);
int __set_page_dirty_no_writeback(struct page *page);
void page_endio(struct page *page, bool is_write, int err);
/**
* set_page_private_2 - Set PG_private_2 on a page and take a ref
* @page: The page.
*
* Set the PG_private_2 flag on a page and take the reference needed for the VM
* to handle its lifetime correctly. This sets the flag and takes the
* reference unconditionally, so care must be taken not to set the flag again
* if it's already set.
*/
static inline void set_page_private_2(struct page *page)
{
page = compound_head(page);
get_page(page);
SetPagePrivate2(page);
}
void end_page_private_2(struct page *page);
void wait_on_page_private_2(struct page *page);
int wait_on_page_private_2_killable(struct page *page);
/*
* Add an arbitrary waiter to a page's wait queue
*/
extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
/*
* Fault in userspace address range.
*/
size_t fault_in_writeable(char __user *uaddr, size_t size);
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
size_t fault_in_readable(const char __user *uaddr, size_t size);
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void delete_from_page_cache(struct page *page);
extern void __delete_from_page_cache(struct page *page, void *shadow);
void replace_page_cache_page(struct page *old, struct page *new);
void delete_from_page_cache_batch(struct address_space *mapping,
struct pagevec *pvec);
loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
int whence);
/*
* Like add_to_page_cache_locked, but used to add newly allocated pages:
* the page is new, so we can just run __SetPageLocked() against it.
*/
static inline int add_to_page_cache(struct page *page,
struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
int error;
__SetPageLocked(page);
error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
if (unlikely(error))
__ClearPageLocked(page);
return error;
}
/**
* struct readahead_control - Describes a readahead request.
*
* A readahead request is for consecutive pages. Filesystems which
* implement the ->readahead method should call readahead_page() or
* readahead_page_batch() in a loop and attempt to start I/O against
* each page in the request.
*
* Most of the fields in this struct are private and should be accessed
* by the functions below.
*
* @file: The file, used primarily by network filesystems for authentication.
* May be NULL if invoked internally by the filesystem.
* @mapping: Readahead this filesystem object.
* @ra: File readahead state. May be NULL.
*/
struct readahead_control {
struct file *file;
struct address_space *mapping;
struct file_ra_state *ra;
/* private: use the readahead_* accessors instead */
pgoff_t _index;
unsigned int _nr_pages;
unsigned int _batch_count;
};
#define DEFINE_READAHEAD(ractl, f, r, m, i) \
struct readahead_control ractl = { \
.file = f, \
.mapping = m, \
.ra = r, \
._index = i, \
}
#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE)
void page_cache_ra_unbounded(struct readahead_control *,
unsigned long nr_to_read, unsigned long lookahead_count);
void page_cache_sync_ra(struct readahead_control *, unsigned long req_count);
void page_cache_async_ra(struct readahead_control *, struct page *,
unsigned long req_count);
void readahead_expand(struct readahead_control *ractl,
loff_t new_start, size_t new_len);
/**
* page_cache_sync_readahead - generic file readahead
* @mapping: address_space which holds the pagecache and I/O vectors
* @ra: file_ra_state which holds the readahead state
* @file: Used by the filesystem for authentication.
* @index: Index of first page to be read.
* @req_count: Total number of pages being read by the caller.
*
* page_cache_sync_readahead() should be called when a cache miss happened:
* it will submit the read. The readahead logic may decide to piggyback more
* pages onto the read request if access patterns suggest it will improve
* performance.
*/
static inline
void page_cache_sync_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *file, pgoff_t index,
unsigned long req_count)
{
DEFINE_READAHEAD(ractl, file, ra, mapping, index);
page_cache_sync_ra(&ractl, req_count);
}
/**
* page_cache_async_readahead - file readahead for marked pages
* @mapping: address_space which holds the pagecache and I/O vectors
* @ra: file_ra_state which holds the readahead state
* @file: Used by the filesystem for authentication.
* @page: The page at @index which triggered the readahead call.
* @index: Index of first page to be read.
* @req_count: Total number of pages being read by the caller.
*
* page_cache_async_readahead() should be called when a page is used which
* is marked as PageReadahead; this is a marker to suggest that the application
* has used up enough of the readahead window that we should start pulling in
* more pages.
*/
static inline
void page_cache_async_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *file,
struct page *page, pgoff_t index, unsigned long req_count)
{
DEFINE_READAHEAD(ractl, file, ra, mapping, index);
page_cache_async_ra(&ractl, page, req_count);
}
/**
* readahead_page - Get the next page to read.
* @rac: The current readahead request.
*
* Context: The page is locked and has an elevated refcount. The caller
* should decreases the refcount once the page has been submitted for I/O
* and unlock the page once all I/O to that page has completed.
* Return: A pointer to the next page, or %NULL if we are done.
*/
static inline struct page *readahead_page(struct readahead_control *rac)
{
struct page *page;
BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count;
rac->_index += rac->_batch_count;
if (!rac->_nr_pages) {
rac->_batch_count = 0;
return NULL;
}
page = xa_load(&rac->mapping->i_pages, rac->_index);
VM_BUG_ON_PAGE(!PageLocked(page), page);
rac->_batch_count = thp_nr_pages(page);
return page;
}
static inline unsigned int __readahead_batch(struct readahead_control *rac,
struct page **array, unsigned int array_sz)
{
unsigned int i = 0;
XA_STATE(xas, &rac->mapping->i_pages, 0);
struct page *page;
BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count;
rac->_index += rac->_batch_count;
rac->_batch_count = 0;
xas_set(&xas, rac->_index);
rcu_read_lock();
xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) {
if (xas_retry(&xas, page))
continue;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);
array[i++] = page;
rac->_batch_count += thp_nr_pages(page);
/*
* The page cache isn't using multi-index entries yet,
* so the xas cursor needs to be manually moved to the
* next index. This can be removed once the page cache
* is converted.
*/
if (PageHead(page))
xas_set(&xas, rac->_index + rac->_batch_count); if (i == array_sz)
break;
}
rcu_read_unlock();
return i;
}
/**
* readahead_page_batch - Get a batch of pages to read.
* @rac: The current readahead request.
* @array: An array of pointers to struct page.
*
* Context: The pages are locked and have an elevated refcount. The caller
* should decreases the refcount once the page has been submitted for I/O
* and unlock the page once all I/O to that page has completed.
* Return: The number of pages placed in the array. 0 indicates the request
* is complete.
*/
#define readahead_page_batch(rac, array) \
__readahead_batch(rac, array, ARRAY_SIZE(array))
/**
* readahead_pos - The byte offset into the file of this readahead request.
* @rac: The readahead request.
*/
static inline loff_t readahead_pos(struct readahead_control *rac)
{
return (loff_t)rac->_index * PAGE_SIZE;
}
/**
* readahead_length - The number of bytes in this readahead request.
* @rac: The readahead request.
*/
static inline size_t readahead_length(struct readahead_control *rac)
{
return rac->_nr_pages * PAGE_SIZE;
}
/**
* readahead_index - The index of the first page in this readahead request.
* @rac: The readahead request.
*/
static inline pgoff_t readahead_index(struct readahead_control *rac)
{
return rac->_index;
}
/**
* readahead_count - The number of pages in this readahead request.
* @rac: The readahead request.
*/
static inline unsigned int readahead_count(struct readahead_control *rac)
{
return rac->_nr_pages;
}
/**
* readahead_batch_length - The number of bytes in the current batch.
* @rac: The readahead request.
*/
static inline size_t readahead_batch_length(struct readahead_control *rac)
{
return rac->_batch_count * PAGE_SIZE;
}
static inline unsigned long dir_pages(struct inode *inode)
{
return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >>
PAGE_SHIFT;
}
/**
* page_mkwrite_check_truncate - check if page was truncated
* @page: the page to check
* @inode: the inode to check the page against
*
* Returns the number of bytes in the page up to EOF,
* or -EFAULT if the page was truncated.
*/
static inline int page_mkwrite_check_truncate(struct page *page,
struct inode *inode)
{
loff_t size = i_size_read(inode);
pgoff_t index = size >> PAGE_SHIFT;
int offset = offset_in_page(size);
if (page->mapping != inode->i_mapping)
return -EFAULT;
/* page is wholly inside EOF */
if (page->index < index)
return PAGE_SIZE;
/* page is wholly past EOF */
if (page->index > index || !offset)
return -EFAULT;
/* page is partially inside EOF */
return offset;
}
/**
* i_blocks_per_page - How many blocks fit in this page.
* @inode: The inode which contains the blocks.
* @page: The page (head page if the page is a THP).
*
* If the block size is larger than the size of this page, return zero.
*
* Context: The caller should hold a refcount on the page to prevent it
* from being split.
* Return: The number of filesystem blocks covered by this page.
*/
static inline
unsigned int i_blocks_per_page(struct inode *inode, struct page *page)
{
return thp_size(page) >> inode->i_blkbits;
}
#endif /* _LINUX_PAGEMAP_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Workingset detection
*
* Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
*/
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
/*
* Double CLOCK lists
*
* Per node, two clock lists are maintained for file pages: the
* inactive and the active list. Freshly faulted pages start out at
* the head of the inactive list and page reclaim scans pages from the
* tail. Pages that are accessed multiple times on the inactive list
* are promoted to the active list, to protect them from reclaim,
* whereas active pages are demoted to the inactive list when the
* active list grows too big.
*
* fault ------------------------+
* |
* +--------------+ | +-------------+
* reclaim <- | inactive | <-+-- demotion | active | <--+
* +--------------+ +-------------+ |
* | |
* +-------------- promotion ------------------+
*
*
* Access frequency and refault distance
*
* A workload is thrashing when its pages are frequently used but they
* are evicted from the inactive list every time before another access
* would have promoted them to the active list.
*
* In cases where the average access distance between thrashing pages
* is bigger than the size of memory there is nothing that can be
* done - the thrashing set could never fit into memory under any
* circumstance.
*
* However, the average access distance could be bigger than the
* inactive list, yet smaller than the size of memory. In this case,
* the set could fit into memory if it weren't for the currently
* active pages - which may be used more, hopefully less frequently:
*
* +-memory available to cache-+
* | |
* +-inactive------+-active----+
* a b | c d e f g h i | J K L M N |
* +---------------+-----------+
*
* It is prohibitively expensive to accurately track access frequency
* of pages. But a reasonable approximation can be made to measure
* thrashing on the inactive list, after which refaulting pages can be
* activated optimistically to compete with the existing active pages.
*
* Approximating inactive page access frequency - Observations:
*
* 1. When a page is accessed for the first time, it is added to the
* head of the inactive list, slides every existing inactive page
* towards the tail by one slot, and pushes the current tail page
* out of memory.
*
* 2. When a page is accessed for the second time, it is promoted to
* the active list, shrinking the inactive list by one slot. This
* also slides all inactive pages that were faulted into the cache
* more recently than the activated page towards the tail of the
* inactive list.
*
* Thus:
*
* 1. The sum of evictions and activations between any two points in
* time indicate the minimum number of inactive pages accessed in
* between.
*
* 2. Moving one inactive page N page slots towards the tail of the
* list requires at least N inactive page accesses.
*
* Combining these:
*
* 1. When a page is finally evicted from memory, the number of
* inactive pages accessed while the page was in cache is at least
* the number of page slots on the inactive list.
*
* 2. In addition, measuring the sum of evictions and activations (E)
* at the time of a page's eviction, and comparing it to another
* reading (R) at the time the page faults back into memory tells
* the minimum number of accesses while the page was not cached.
* This is called the refault distance.
*
* Because the first access of the page was the fault and the second
* access the refault, we combine the in-cache distance with the
* out-of-cache distance to get the complete minimum access distance
* of this page:
*
* NR_inactive + (R - E)
*
* And knowing the minimum access distance of a page, we can easily
* tell if the page would be able to stay in cache assuming all page
* slots in the cache were available:
*
* NR_inactive + (R - E) <= NR_inactive + NR_active
*
* which can be further simplified to
*
* (R - E) <= NR_active
*
* Put into words, the refault distance (out-of-cache) can be seen as
* a deficit in inactive list space (in-cache). If the inactive list
* had (R - E) more page slots, the page would not have been evicted
* in between accesses, but activated instead. And on a full system,
* the only thing eating into inactive list space is active pages.
*
*
* Refaulting inactive pages
*
* All that is known about the active list is that the pages have been
* accessed more than once in the past. This means that at any given
* time there is actually a good chance that pages on the active list
* are no longer in active use.
*
* So when a refault distance of (R - E) is observed and there are at
* least (R - E) active pages, the refaulting page is activated
* optimistically in the hope that (R - E) active pages are actually
* used less frequently than the refaulting page - or even not used at
* all anymore.
*
* That means if inactive cache is refaulting with a suitable refault
* distance, we assume the cache workingset is transitioning and put
* pressure on the current active list.
*
* If this is wrong and demotion kicks in, the pages which are truly
* used more frequently will be reactivated while the less frequently
* used once will be evicted from memory.
*
* But if this is right, the stale pages will be pushed out of memory
* and the used pages get to stay in cache.
*
* Refaulting active pages
*
* If on the other hand the refaulting pages have recently been
* deactivated, it means that the active list is no longer protecting
* actively used cache from reclaim. The cache is NOT transitioning to
* a different workingset; the existing workingset is thrashing in the
* space allocated to the page cache.
*
*
* Implementation
*
* For each node's LRU lists, a counter for inactive evictions and
* activations is maintained (node->nonresident_age).
*
* On eviction, a snapshot of this counter (along with some bits to
* identify the node) is stored in the now empty page cache
* slot of the evicted page. This is called a shadow entry.
*
* On cache misses for which there are shadow entries, an eligible
* refault distance will immediately activate the refaulting page.
*/
#define WORKINGSET_SHIFT 1
#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
WORKINGSET_SHIFT + NODES_SHIFT + \
MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
/*
* Eviction timestamps need to be able to cover the full range of
* actionable refaults. However, bits are tight in the xarray
* entry, and after storing the identifier for the lruvec there might
* not be enough left to represent every single actionable refault. In
* that case, we have to sacrifice granularity for distance, and group
* evictions into coarser buckets by shaving off lower timestamp bits.
*/
static unsigned int bucket_order __read_mostly;
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
bool workingset)
{
eviction >>= bucket_order;
eviction &= EVICTION_MASK;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
eviction = (eviction << WORKINGSET_SHIFT) | workingset;
return xa_mk_value(eviction);
}
static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
unsigned long *evictionp, bool *workingsetp)
{
unsigned long entry = xa_to_value(shadow);
int memcgid, nid;
bool workingset;
workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
entry >>= WORKINGSET_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
entry >>= MEM_CGROUP_ID_SHIFT;
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order;
*workingsetp = workingset;
}
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
* @lruvec: the lruvec that was aged
* @nr_pages: the number of pages to count
*
* As in-memory pages are aged, non-resident pages need to be aged as
* well, in order for the refault distances later on to be comparable
* to the in-memory dimensions. This function allows reclaim and LRU
* operations to drive the non-resident aging along in parallel.
*/
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
{
/*
* Reclaiming a cgroup means reclaiming all its children in a
* round-robin fashion. That means that each cgroup has an LRU
* order that is composed of the LRU orders of its child
* cgroups; and every page has an LRU position not just in the
* cgroup that owns it, but in all of that group's ancestors.
*
* So when the physical inactive list of a leaf cgroup ages,
* the virtual inactive lists of all its parents, including
* the root cgroup's, age as well.
*/
do {
atomic_long_add(nr_pages, &lruvec->nonresident_age);
} while ((lruvec = parent_lruvec(lruvec)));
}
/**
* workingset_eviction - note the eviction of a page from memory
* @target_memcg: the cgroup that is causing the reclaim
* @page: the page being evicted
*
* Return: a shadow entry to be stored in @page->mapping->i_pages in place
* of the evicted @page so that a later refault can be detected.
*/
void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
{
struct pglist_data *pgdat = page_pgdat(page);
unsigned long eviction;
struct lruvec *lruvec;
int memcgid;
/* Page is fully exclusive and pins page's memory cgroup pointer */
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
workingset_age_nonresident(lruvec, thp_nr_pages(page));
return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}
/**
* workingset_refault - evaluate the refault of a previously evicted page
* @page: the freshly allocated replacement page
* @shadow: shadow entry of the evicted page
*
* Calculates and evaluates the refault distance of the previously
* evicted page in the context of the node and the memcg whose memory
* pressure caused the eviction.
*/
void workingset_refault(struct page *page, void *shadow)
{
bool file = page_is_file_lru(page);
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
unsigned long workingset_size;
struct pglist_data *pgdat;
struct mem_cgroup *memcg;
unsigned long eviction;
struct lruvec *lruvec;
unsigned long refault;
bool workingset;
int memcgid;
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
rcu_read_lock();
/*
* Look up the memcg associated with the stored ID. It might
* have been deleted since the page's eviction.
*
* Note that in rare events the ID could have been recycled
* for a new cgroup that refaults a shared page. This is
* impossible to tell from the available data. However, this
* should be a rare and limited disturbance, and activations
* are always speculative anyway. Ultimately, it's the aging
* algorithm's job to shake out the minimum access frequency
* for the active cache.
*
* XXX: On !CONFIG_MEMCG, this will always return NULL; it
* would be better if the root_mem_cgroup existed in all
* configurations instead.
*/
eviction_memcg = mem_cgroup_from_id(memcgid);
if (!mem_cgroup_disabled() && !eviction_memcg)
goto out;
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->nonresident_age);
/*
* Calculate the refault distance
*
* The unsigned subtraction here gives an accurate distance
* across nonresident_age overflows in most cases. There is a
* special case: usually, shadow entries have a short lifetime
* and are either refaulted or reclaimed along with the inode
* before they get too old. But it is not impossible for the
* nonresident_age to lap a shadow entry in the field, which
* can then result in a false small refault distance, leading
* to a false activation should this old entry actually
* refault again. However, earlier kernels used to deactivate
* unconditionally with *every* reclaim invocation for the
* longest time, so the occasional inappropriate activation
* leading to pressure on the active list is not a problem.
*/
refault_distance = (refault - eviction) & EVICTION_MASK;
/*
* The activation decision for this page is made at the level
* where the eviction occurred, as that is where the LRU order
* during page reclaim is being determined.
*
* However, the cgroup that will own the page is the one that
* is actually experiencing the refault event.
*/
memcg = page_memcg(page);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
mem_cgroup_flush_stats_delayed();
/*
* Compare the distance to the existing workingset size. We
* don't activate pages that couldn't stay resident even if
* all the memory was available to the workingset. Whether
* workingset competition needs to consider anon or not depends
* on having swap.
*/
workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
if (!file) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_INACTIVE_FILE);
}
if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_ACTIVE_ANON);
if (file) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_INACTIVE_ANON);
}
}
if (refault_distance > workingset_size)
goto out;
SetPageActive(page);
workingset_age_nonresident(lruvec, thp_nr_pages(page));
inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
/* Page was active prior to eviction */
if (workingset) {
SetPageWorkingset(page);
/* XXX: Move to lru_cache_add() when it supports new vs putback */
lru_note_cost_page(page);
inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
}
out:
rcu_read_unlock();
}
/**
* workingset_activation - note a page activation
* @page: page that is being activated
*/
void workingset_activation(struct page *page)
{
struct mem_cgroup *memcg;
struct lruvec *lruvec;
rcu_read_lock();
/*
* Filter non-memcg pages here, e.g. unmap can call
* mark_page_accessed() on VDSO pages.
*
* XXX: See workingset_refault() - this should return
* root_mem_cgroup even for !CONFIG_MEMCG.
*/
memcg = page_memcg_rcu(page);
if (!mem_cgroup_disabled() && !memcg)
goto out;
lruvec = mem_cgroup_page_lruvec(page);
workingset_age_nonresident(lruvec, thp_nr_pages(page));
out:
rcu_read_unlock();
}
/*
* Shadow entries reflect the share of the working set that does not
* fit into memory, so their number depends on the access pattern of
* the workload. In most cases, they will refault or get reclaimed
* along with the inode, but a (malicious) workload that streams
* through files with a total size several times that of available
* memory, while preventing the inodes from being reclaimed, can
* create excessive amounts of shadow nodes. To keep a lid on this,
* track shadow nodes and reclaim them when they grow way past the
* point where they would still be useful.
*/
static struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
/*
* Track non-empty nodes that contain only shadow entries;
* unlink those that contain pages or are being freed.
*
* Avoid acquiring the list_lru lock when the nodes are
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
list_lru_add(&shadow_nodes, &node->private_list);
__inc_lruvec_kmem_state(node, WORKINGSET_NODES);
}
} else {
if (!list_empty(&node->private_list)) {
list_lru_del(&shadow_nodes, &node->private_list);
__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
}
}
}
static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct shrink_control *sc)
{
unsigned long max_nodes;
unsigned long nodes;
unsigned long pages;
nodes = list_lru_shrink_count(&shadow_nodes, sc);
if (!nodes)
return SHRINK_EMPTY;
/*
* Approximate a reasonable limit for the nodes
* containing shadow entries. We don't need to keep more
* shadow entries than possible pages on the active list,
* since refault distances bigger than that are dismissed.
*
* The size of the active list converges toward 100% of
* overall page cache as memory grows, with only a tiny
* inactive list. Assume the total cache size for that.
*
* Nodes might be sparsely populated, with only one shadow
* entry in the extreme case. Obviously, we cannot keep one
* node for every eligible shadow entry, so compromise on a
* worst-case density of 1/8th. Below that, not all eligible
* refaults can be detected anymore.
*
* On 64-bit with 7 xa_nodes per page and 64 slots
* each, this will reclaim shadow entries when they consume
* ~1.8% of available memory:
*
* PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
*/
#ifdef CONFIG_MEMCG
if (sc->memcg) {
struct lruvec *lruvec;
int i;
lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
pages += lruvec_page_state_local(lruvec,
NR_LRU_BASE + i);
pages += lruvec_page_state_local(
lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
pages += lruvec_page_state_local(
lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
} else
#endif
pages = node_present_pages(sc->nid);
max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
if (nodes <= max_nodes)
return 0;
return nodes - max_nodes;
}
static enum lru_status shadow_lru_isolate(struct list_head *item,
struct list_lru_one *lru,
spinlock_t *lru_lock,
void *arg) __must_hold(lru_lock)
{
struct xa_node *node = container_of(item, struct xa_node, private_list);
struct address_space *mapping;
int ret;
/*
* Page cache insertions and deletions synchronously maintain
* the shadow node LRU under the i_pages lock and the
* lru_lock. Because the page cache tree is emptied before
* the inode can be destroyed, holding the lru_lock pins any
* address_space that has nodes on the LRU.
*
* We can then safely transition to the i_pages lock to
* pin only the address_space of the particular node we want
* to reclaim, take the node off-LRU, and drop the lru_lock.
*/
mapping = container_of(node->array, struct address_space, i_pages);
/* Coming from the list, invert the lock order */
if (!xa_trylock(&mapping->i_pages)) {
spin_unlock_irq(lru_lock);
ret = LRU_RETRY;
goto out;
}
list_lru_isolate(lru, item);
__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
spin_unlock(lru_lock);
/*
* The nodes should only contain one or more shadow entries,
* no pages, so we expect to be able to remove them all and
* delete and free the empty node afterwards.
*/
if (WARN_ON_ONCE(!node->nr_values))
goto out_invalid;
if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
xa_delete_node(node, workingset_update_node);
__inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
out_invalid:
xa_unlock_irq(&mapping->i_pages);
ret = LRU_REMOVED_RETRY;
out:
cond_resched();
spin_lock_irq(lru_lock);
return ret;
}
static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
struct shrink_control *sc)
{
/* list_lru lock nests inside the IRQ-safe i_pages lock */
return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
NULL);
}
static struct shrinker workingset_shadow_shrinker = {
.count_objects = count_shadow_nodes,
.scan_objects = scan_shadow_nodes,
.seeks = 0, /* ->count reports only fully expendable nodes */
.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};
/*
* Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
* i_pages lock.
*/
static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
unsigned int timestamp_bits;
unsigned int max_order;
int ret;
BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
/*
* Calculate the eviction bucket size to cover the longest
* actionable refault distance, which is currently half of
* memory (totalram_pages/2). However, memory hotplug may add
* some more pages at runtime, so keep working with up to
* double the initial memory by using totalram_pages as-is.
*/
timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
max_order = fls_long(totalram_pages() - 1);
if (max_order > timestamp_bits)
bucket_order = max_order - timestamp_bits;
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
ret = prealloc_shrinker(&workingset_shadow_shrinker);
if (ret)
goto err;
ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
&workingset_shadow_shrinker);
if (ret)
goto err_list_lru;
register_shrinker_prepared(&workingset_shadow_shrinker);
return 0;
err_list_lru:
free_prealloced_shrinker(&workingset_shadow_shrinker);
err:
return ret;
}
module_init(workingset_init);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_DCACHE_H
#define __LINUX_DCACHE_H
#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/math.h>
#include <linux/rculist.h>
#include <linux/rculist_bl.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/cache.h>
#include <linux/rcupdate.h>
#include <linux/lockref.h>
#include <linux/stringhash.h>
#include <linux/wait.h>
struct path;
struct vfsmount;
/*
* linux/include/linux/dcache.h
*
* Dirent cache data structures
*
* (C) Copyright 1997 Thomas Schoebel-Theuer,
* with heavy changes by Linus Torvalds
*/
#define IS_ROOT(x) ((x) == (x)->d_parent)
/* The hash is always the low bits of hash_len */
#ifdef __LITTLE_ENDIAN
#define HASH_LEN_DECLARE u32 hash; u32 len
#define bytemask_from_count(cnt) (~(~0ul << (cnt)*8))
#else
#define HASH_LEN_DECLARE u32 len; u32 hash
#define bytemask_from_count(cnt) (~(~0ul >> (cnt)*8))
#endif
/*
* "quick string" -- eases parameter passing, but more importantly
* saves "metadata" about the string (ie length and the hash).
*
* hash comes first so it snuggles against d_parent in the
* dentry.
*/
struct qstr {
union {
struct {
HASH_LEN_DECLARE;
};
u64 hash_len;
};
const unsigned char *name;
};
#define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
extern const struct qstr empty_name;
extern const struct qstr slash_name;
extern const struct qstr dotdot_name;
struct dentry_stat_t {
long nr_dentry;
long nr_unused;
long age_limit; /* age in seconds */
long want_pages; /* pages requested by system */
long nr_negative; /* # of unused negative dentries */
long dummy; /* Reserved for future use */
};
extern struct dentry_stat_t dentry_stat;
/*
* Try to keep struct dentry aligned on 64 byte cachelines (this will
* give reasonable cacheline footprint with larger lines without the
* large memory footprint increase).
*/
#ifdef CONFIG_64BIT
# define DNAME_INLINE_LEN 32 /* 192 bytes */
#else
# ifdef CONFIG_SMP
# define DNAME_INLINE_LEN 36 /* 128 bytes */
# else
# define DNAME_INLINE_LEN 40 /* 128 bytes */
# endif
#endif
#define d_lock d_lockref.lock
struct dentry {
/* RCU lookup touched fields */
unsigned int d_flags; /* protected by d_lock */
seqcount_spinlock_t d_seq; /* per dentry seqlock */
struct hlist_bl_node d_hash; /* lookup hash list */
struct dentry *d_parent; /* parent directory */
struct qstr d_name;
struct inode *d_inode; /* Where the name belongs to - NULL is
* negative */
unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
/* Ref lookup also touches following */
struct lockref d_lockref; /* per-dentry lock and refcount */
const struct dentry_operations *d_op;
struct super_block *d_sb; /* The root of the dentry tree */
unsigned long d_time; /* used by d_revalidate */
void *d_fsdata; /* fs-specific data */
union {
struct list_head d_lru; /* LRU list */
wait_queue_head_t *d_wait; /* in-lookup ones only */
};
struct list_head d_child; /* child of parent list */
struct list_head d_subdirs; /* our children */
/*
* d_alias and d_rcu can share memory
*/
union {
struct hlist_node d_alias; /* inode alias list */
struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */
struct rcu_head d_rcu;
} d_u;
} __randomize_layout;
/*
* dentry->d_lock spinlock nesting subclasses:
*
* 0: normal
* 1: nested
*/
enum dentry_d_lock_class
{
DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */
DENTRY_D_LOCK_NESTED
};
struct dentry_operations {
int (*d_revalidate)(struct dentry *, unsigned int);
int (*d_weak_revalidate)(struct dentry *, unsigned int);
int (*d_hash)(const struct dentry *, struct qstr *);
int (*d_compare)(const struct dentry *,
unsigned int, const char *, const struct qstr *);
int (*d_delete)(const struct dentry *);
int (*d_init)(struct dentry *);
void (*d_release)(struct dentry *);
void (*d_prune)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)(struct dentry *, char *, int);
struct vfsmount *(*d_automount)(struct path *);
int (*d_manage)(const struct path *, bool);
struct dentry *(*d_real)(struct dentry *, const struct inode *);
} ____cacheline_aligned;
/*
* Locking rules for dentry_operations callbacks are to be found in
* Documentation/filesystems/locking.rst. Keep it updated!
*
* FUrther descriptions are found in Documentation/filesystems/vfs.rst.
* Keep it updated too!
*/
/* d_flags entries */
#define DCACHE_OP_HASH 0x00000001
#define DCACHE_OP_COMPARE 0x00000002
#define DCACHE_OP_REVALIDATE 0x00000004
#define DCACHE_OP_DELETE 0x00000008
#define DCACHE_OP_PRUNE 0x00000010
#define DCACHE_DISCONNECTED 0x00000020
/* This dentry is possibly not currently connected to the dcache tree, in
* which case its parent will either be itself, or will have this flag as
* well. nfsd will not use a dentry with this bit set, but will first
* endeavour to clear the bit either by discovering that it is connected,
* or by performing lookup operations. Any filesystem which supports
* nfsd_operations MUST have a lookup function which, if it finds a
* directory inode with a DCACHE_DISCONNECTED dentry, will d_move that
* dentry into place and return that dentry rather than the passed one,
* typically using d_splice_alias. */
#define DCACHE_REFERENCED 0x00000040 /* Recently used, don't discard. */
#define DCACHE_DONTCACHE 0x00000080 /* Purge from memory on final dput() */
#define DCACHE_CANT_MOUNT 0x00000100
#define DCACHE_GENOCIDE 0x00000200
#define DCACHE_SHRINK_LIST 0x00000400
#define DCACHE_OP_WEAK_REVALIDATE 0x00000800
#define DCACHE_NFSFS_RENAMED 0x00001000
/* this dentry has been "silly renamed" and has to be deleted on the last
* dput() */
#define DCACHE_COOKIE 0x00002000 /* For use by dcookie subsystem */
#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x00004000
/* Parent inode is watched by some fsnotify listener */
#define DCACHE_DENTRY_KILLED 0x00008000
#define DCACHE_MOUNTED 0x00010000 /* is a mountpoint */
#define DCACHE_NEED_AUTOMOUNT 0x00020000 /* handle automount on this dir */
#define DCACHE_MANAGE_TRANSIT 0x00040000 /* manage transit from this dirent */
#define DCACHE_MANAGED_DENTRY \
(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)
#define DCACHE_LRU_LIST 0x00080000
#define DCACHE_ENTRY_TYPE 0x00700000
#define DCACHE_MISS_TYPE 0x00000000 /* Negative dentry (maybe fallthru to nowhere) */
#define DCACHE_WHITEOUT_TYPE 0x00100000 /* Whiteout dentry (stop pathwalk) */
#define DCACHE_DIRECTORY_TYPE 0x00200000 /* Normal directory */
#define DCACHE_AUTODIR_TYPE 0x00300000 /* Lookupless directory (presumed automount) */
#define DCACHE_REGULAR_TYPE 0x00400000 /* Regular file type (or fallthru to such) */
#define DCACHE_SPECIAL_TYPE 0x00500000 /* Other file type (or fallthru to such) */
#define DCACHE_SYMLINK_TYPE 0x00600000 /* Symlink (or fallthru to such) */
#define DCACHE_MAY_FREE 0x00800000
#define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */
#define DCACHE_NOKEY_NAME 0x02000000 /* Encrypted name encoded without key */
#define DCACHE_OP_REAL 0x04000000
#define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */
#define DCACHE_DENTRY_CURSOR 0x20000000
#define DCACHE_NORCU 0x40000000 /* No RCU delay for freeing */
extern seqlock_t rename_lock;
/*
* These are the low-level FS interfaces to the dcache..
*/
extern void d_instantiate(struct dentry *, struct inode *);
extern void d_instantiate_new(struct dentry *, struct inode *);
extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *);
extern void __d_drop(struct dentry *dentry);
extern void d_drop(struct dentry *dentry);
extern void d_delete(struct dentry *);
extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op);
/* allocate/de-allocate */
extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
extern struct dentry * d_alloc_anon(struct super_block *);
extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
wait_queue_head_t *);
extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
extern struct dentry *d_find_any_alias(struct inode *inode);
extern struct dentry * d_obtain_alias(struct inode *);
extern struct dentry * d_obtain_root(struct inode *);
extern void shrink_dcache_sb(struct super_block *);
extern void shrink_dcache_parent(struct dentry *);
extern void shrink_dcache_for_umount(struct super_block *);
extern void d_invalidate(struct dentry *);
/* only used at mount-time */
extern struct dentry * d_make_root(struct inode *);
/* <clickety>-<click> the ramfs-type tree */
extern void d_genocide(struct dentry *);
extern void d_tmpfile(struct dentry *, struct inode *);
extern struct dentry *d_find_alias(struct inode *);
extern void d_prune_aliases(struct inode *);
extern struct dentry *d_find_alias_rcu(struct inode *);
/* test whether we have any submounts in a subdir tree */
extern int path_has_submounts(const struct path *);
/*
* This adds the entry to the hash queues.
*/
extern void d_rehash(struct dentry *);
extern void d_add(struct dentry *, struct inode *);
/* used for rename() and baskets */
extern void d_move(struct dentry *, struct dentry *);
extern void d_exchange(struct dentry *, struct dentry *);
extern struct dentry *d_ancestor(struct dentry *, struct dentry *);
/* appendix may either be NULL or be used for transname suffixes */
extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
const struct qstr *name, unsigned *seq);
static inline unsigned d_count(const struct dentry *dentry)
{
return dentry->d_lockref.count;
}
/*
* helper function for dentry_operations.d_dname() members
*/
extern __printf(4, 5)
char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
extern char *__d_path(const struct path *, const struct path *, char *, int);
extern char *d_absolute_path(const struct path *, char *, int);
extern char *d_path(const struct path *, char *, int);
extern char *dentry_path_raw(const struct dentry *, char *, int);
extern char *dentry_path(const struct dentry *, char *, int);
/* Allocation counts.. */
/**
* dget, dget_dlock - get a reference to a dentry
* @dentry: dentry to get a reference to
*
* Given a dentry or %NULL pointer increment the reference count
* if appropriate and return the dentry. A dentry will not be
* destroyed when it has references.
*/
static inline struct dentry *dget_dlock(struct dentry *dentry)
{
if (dentry)
dentry->d_lockref.count++;
return dentry;
}
static inline struct dentry *dget(struct dentry *dentry)
{
if (dentry) lockref_get(&dentry->d_lockref);
return dentry;
}
extern struct dentry *dget_parent(struct dentry *dentry);
/**
* d_unhashed - is dentry hashed
* @dentry: entry to check
*
* Returns true if the dentry passed is not currently hashed.
*/
static inline int d_unhashed(const struct dentry *dentry)
{
return hlist_bl_unhashed(&dentry->d_hash);
}
static inline int d_unlinked(const struct dentry *dentry)
{
return d_unhashed(dentry) && !IS_ROOT(dentry);
}
static inline int cant_mount(const struct dentry *dentry)
{
return (dentry->d_flags & DCACHE_CANT_MOUNT);
}
static inline void dont_mount(struct dentry *dentry)
{
spin_lock(&dentry->d_lock);
dentry->d_flags |= DCACHE_CANT_MOUNT;
spin_unlock(&dentry->d_lock);
}
extern void __d_lookup_done(struct dentry *);
static inline int d_in_lookup(const struct dentry *dentry)
{
return dentry->d_flags & DCACHE_PAR_LOOKUP;
}
static inline void d_lookup_done(struct dentry *dentry)
{
if (unlikely(d_in_lookup(dentry))) {
spin_lock(&dentry->d_lock);
__d_lookup_done(dentry);
spin_unlock(&dentry->d_lock);
}
}
extern void dput(struct dentry *);
static inline bool d_managed(const struct dentry *dentry)
{
return dentry->d_flags & DCACHE_MANAGED_DENTRY;
}
static inline bool d_mountpoint(const struct dentry *dentry)
{
return dentry->d_flags & DCACHE_MOUNTED;
}
/*
* Directory cache entry type accessor functions.
*/
static inline unsigned __d_entry_type(const struct dentry *dentry)
{
return dentry->d_flags & DCACHE_ENTRY_TYPE;
}
static inline bool d_is_miss(const struct dentry *dentry)
{
return __d_entry_type(dentry) == DCACHE_MISS_TYPE;
}
static inline bool d_is_whiteout(const struct dentry *dentry)
{
return __d_entry_type(dentry) == DCACHE_WHITEOUT_TYPE;
}
static inline bool d_can_lookup(const struct dentry *dentry)
{
return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE;
}
static inline bool d_is_autodir(const struct dentry *dentry)
{
return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE;
}
static inline bool d_is_dir(const struct dentry *dentry)
{
return d_can_lookup(dentry) || d_is_autodir(dentry);
}
static inline bool d_is_symlink(const struct dentry *dentry)
{
return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE;
}
static inline bool d_is_reg(const struct dentry *dentry)
{
return __d_entry_type(dentry) == DCACHE_REGULAR_TYPE;
}
static inline bool d_is_special(const struct dentry *dentry)
{
return __d_entry_type(dentry) == DCACHE_SPECIAL_TYPE;
}
static inline bool d_is_file(const struct dentry *dentry)
{
return d_is_reg(dentry) || d_is_special(dentry);
}
static inline bool d_is_negative(const struct dentry *dentry)
{
// TODO: check d_is_whiteout(dentry) also.
return d_is_miss(dentry);
}
static inline bool d_flags_negative(unsigned flags)
{
return (flags & DCACHE_ENTRY_TYPE) == DCACHE_MISS_TYPE;
}
static inline bool d_is_positive(const struct dentry *dentry)
{
return !d_is_negative(dentry);
}
/**
* d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs)
* @dentry: The dentry in question
*
* Returns true if the dentry represents either an absent name or a name that
* doesn't map to an inode (ie. ->d_inode is NULL). The dentry could represent
* a true miss, a whiteout that isn't represented by a 0,0 chardev or a
* fallthrough marker in an opaque directory.
*
* Note! (1) This should be used *only* by a filesystem to examine its own
* dentries. It should not be used to look at some other filesystem's
* dentries. (2) It should also be used in combination with d_inode() to get
* the inode. (3) The dentry may have something attached to ->d_lower and the
* type field of the flags may be set to something other than miss or whiteout.
*/
static inline bool d_really_is_negative(const struct dentry *dentry)
{
return dentry->d_inode == NULL;
}
/**
* d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs)
* @dentry: The dentry in question
*
* Returns true if the dentry represents a name that maps to an inode
* (ie. ->d_inode is not NULL). The dentry might still represent a whiteout if
* that is represented on medium as a 0,0 chardev.
*
* Note! (1) This should be used *only* by a filesystem to examine its own
* dentries. It should not be used to look at some other filesystem's
* dentries. (2) It should also be used in combination with d_inode() to get
* the inode.
*/
static inline bool d_really_is_positive(const struct dentry *dentry)
{
return dentry->d_inode != NULL;
}
static inline int simple_positive(const struct dentry *dentry)
{
return d_really_is_positive(dentry) && !d_unhashed(dentry);
}
extern void d_set_fallthru(struct dentry *dentry);
static inline bool d_is_fallthru(const struct dentry *dentry)
{
return dentry->d_flags & DCACHE_FALLTHRU;
}
extern int sysctl_vfs_cache_pressure;
static inline unsigned long vfs_pressure_ratio(unsigned long val)
{
return mult_frac(val, sysctl_vfs_cache_pressure, 100);
}
/**
* d_inode - Get the actual inode of this dentry
* @dentry: The dentry to query
*
* This is the helper normal filesystems should use to get at their own inodes
* in their own dentries and ignore the layering superimposed upon them.
*/
static inline struct inode *d_inode(const struct dentry *dentry)
{
return dentry->d_inode;
}
/**
* d_inode_rcu - Get the actual inode of this dentry with READ_ONCE()
* @dentry: The dentry to query
*
* This is the helper normal filesystems should use to get at their own inodes
* in their own dentries and ignore the layering superimposed upon them.
*/
static inline struct inode *d_inode_rcu(const struct dentry *dentry)
{
return READ_ONCE(dentry->d_inode);
}
/**
* d_backing_inode - Get upper or lower inode we should be using
* @upper: The upper layer
*
* This is the helper that should be used to get at the inode that will be used
* if this dentry were to be opened as a file. The inode may be on the upper
* dentry or it may be on a lower dentry pinned by the upper.
*
* Normal filesystems should not use this to access their own inodes.
*/
static inline struct inode *d_backing_inode(const struct dentry *upper)
{
struct inode *inode = upper->d_inode;
return inode;
}
/**
* d_backing_dentry - Get upper or lower dentry we should be using
* @upper: The upper layer
*
* This is the helper that should be used to get the dentry of the inode that
* will be used if this dentry were opened as a file. It may be the upper
* dentry or it may be a lower dentry pinned by the upper.
*
* Normal filesystems should not use this to access their own dentries.
*/
static inline struct dentry *d_backing_dentry(struct dentry *upper)
{
return upper;
}
/**
* d_real - Return the real dentry
* @dentry: the dentry to query
* @inode: inode to select the dentry from multiple layers (can be NULL)
*
* If dentry is on a union/overlay, then return the underlying, real dentry.
* Otherwise return the dentry itself.
*
* See also: Documentation/filesystems/vfs.rst
*/
static inline struct dentry *d_real(struct dentry *dentry,
const struct inode *inode)
{
if (unlikely(dentry->d_flags & DCACHE_OP_REAL))
return dentry->d_op->d_real(dentry, inode);
else
return dentry;
}
/**
* d_real_inode - Return the real inode
* @dentry: The dentry to query
*
* If dentry is on a union/overlay, then return the underlying, real inode.
* Otherwise return d_inode().
*/
static inline struct inode *d_real_inode(const struct dentry *dentry)
{
/* This usage of d_real() results in const dentry */
return d_backing_inode(d_real((struct dentry *) dentry, NULL));
}
struct name_snapshot {
struct qstr name;
unsigned char inline_name[DNAME_INLINE_LEN];
};
void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
void release_dentry_name_snapshot(struct name_snapshot *);
#endif /* __LINUX_DCACHE_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "print-tree.h"
#include "compression.h"
#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
size) - 1))
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
PAGE_SIZE))
/**
* Set inode's size according to filesystem options
*
* @inode: inode we want to update the disk_i_size for
* @new_i_size: i_size we want to set to, 0 if we use i_size
*
* With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read()
* returns as it is perfectly fine with a file that has holes without hole file
* extent items.
*
* However without NO_HOLES we need to only return the area that is contiguous
* from the 0 offset of the file. Otherwise we could end up adjust i_size up
* to an extent that has a gap in between.
*
* Finally new_i_size should only be set in the case of truncate where we're not
* ready to use i_size_read() as the limiter yet.
*/
void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start, end, i_size;
int ret;
i_size = new_i_size ?: i_size_read(&inode->vfs_inode); if (btrfs_fs_incompat(fs_info, NO_HOLES)) { inode->disk_i_size = i_size; return;
}
spin_lock(&inode->lock);
ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start,
&end, EXTENT_DIRTY);
if (!ret && start == 0) i_size = min(i_size, end + 1);
else
i_size = 0;
inode->disk_i_size = i_size;
spin_unlock(&inode->lock);
}
/**
* Mark range within a file as having a new extent inserted
*
* @inode: inode being modified
* @start: start file offset of the file extent we've inserted
* @len: logical length of the file extent item
*
* Call when we are inserting a new file extent where there was none before.
* Does not need to call this in the case where we're replacing an existing file
* extent, however if not sure it's fine to call this multiple times.
*
* The start and len must match the file extent item, so thus must be sectorsize
* aligned.
*/
int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len)
{
if (len == 0)
return 0;
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize)); if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
return set_extent_bits(&inode->file_extent_tree, start, start + len - 1,
EXTENT_DIRTY);
}
/**
* Marks an inode range as not having a backing extent
*
* @inode: inode being modified
* @start: start file offset of the file extent we've inserted
* @len: logical length of the file extent item
*
* Called when we drop a file extent, for example when we truncate. Doesn't
* need to be called for cases where we're replacing a file extent, like when
* we've COWed a file extent.
*
* The start and len must match the file extent item, so thus must be sectorsize
* aligned.
*/
int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len)
{
if (len == 0)
return 0;
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
len == (u64)-1);
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
return clear_extent_bit(&inode->file_extent_tree, start,
start + len - 1, EXTENT_DIRTY, 0, 0, NULL);
}
static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
u16 csum_size)
{
u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size;
return ncsums * fs_info->sectorsize;
}
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
u64 disk_offset, u64 disk_num_bytes,
u64 num_bytes, u64 offset, u64 ram_bytes,
u8 compression, u8 encryption, u16 other_encoding)
{
int ret = 0;
struct btrfs_file_extent_item *item;
struct btrfs_key file_key;
struct btrfs_path *path;
struct extent_buffer *leaf;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
file_key.objectid = objectid;
file_key.offset = pos;
file_key.type = BTRFS_EXTENT_DATA_KEY;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
sizeof(*item));
if (ret < 0)
goto out;
BUG_ON(ret); /* Can't happen */ leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
btrfs_set_file_extent_offset(leaf, item, offset);
btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
btrfs_set_file_extent_generation(leaf, item, trans->transid);
btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
btrfs_set_file_extent_compression(leaf, item, compression);
btrfs_set_file_extent_encryption(leaf, item, encryption);
btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path); return ret;
}
static struct btrfs_csum_item *
btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
u64 bytenr, int cow)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
struct btrfs_key file_key;
struct btrfs_key found_key;
struct btrfs_csum_item *item;
struct extent_buffer *leaf;
u64 csum_offset = 0;
const u32 csum_size = fs_info->csum_size;
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
file_key.offset = bytenr;
file_key.type = BTRFS_EXTENT_CSUM_KEY;
ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
if (ret < 0)
goto fail;
leaf = path->nodes[0];
if (ret > 0) {
ret = 1;
if (path->slots[0] == 0)
goto fail;
path->slots[0]--;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.type != BTRFS_EXTENT_CSUM_KEY)
goto fail;
csum_offset = (bytenr - found_key.offset) >>
fs_info->sectorsize_bits;
csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
csums_in_item /= csum_size;
if (csum_offset == csums_in_item) {
ret = -EFBIG;
goto fail;
} else if (csum_offset > csums_in_item) {
goto fail;
}
}
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
item = (struct btrfs_csum_item *)((unsigned char *)item +
csum_offset * csum_size);
return item;
fail:
if (ret > 0)
ret = -ENOENT;
return ERR_PTR(ret);
}
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
u64 offset, int mod)
{
struct btrfs_key file_key;
int ins_len = mod < 0 ? -1 : 0;
int cow = mod != 0;
file_key.objectid = objectid;
file_key.offset = offset;
file_key.type = BTRFS_EXTENT_DATA_KEY;
return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
}
/*
* Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and
* estore the result to @dst.
*
* Return >0 for the number of sectors we found.
* Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum
* for it. Caller may want to try next sector until one range is hit.
* Return <0 for fatal error.
*/
static int search_csum_tree(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 disk_bytenr,
u64 len, u8 *dst)
{
struct btrfs_csum_item *item = NULL;
struct btrfs_key key;
const u32 sectorsize = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
u32 itemsize;
int ret;
u64 csum_start;
u64 csum_len;
ASSERT(IS_ALIGNED(disk_bytenr, sectorsize) &&
IS_ALIGNED(len, sectorsize));
/* Check if the current csum item covers disk_bytenr */
if (path->nodes[0]) {
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
csum_start = key.offset;
csum_len = (itemsize / csum_size) * sectorsize;
if (in_range(disk_bytenr, csum_start, csum_len))
goto found;
}
/* Current item doesn't contain the desired range, search again */
btrfs_release_path(path);
item = btrfs_lookup_csum(NULL, fs_info->csum_root, path, disk_bytenr, 0);
if (IS_ERR(item)) {
ret = PTR_ERR(item);
goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
csum_start = key.offset;
csum_len = (itemsize / csum_size) * sectorsize;
ASSERT(in_range(disk_bytenr, csum_start, csum_len));
found:
ret = (min(csum_start + csum_len, disk_bytenr + len) -
disk_bytenr) >> fs_info->sectorsize_bits;
read_extent_buffer(path->nodes[0], dst, (unsigned long)item,
ret * csum_size);
out:
if (ret == -ENOENT || ret == -EFBIG)
ret = 0;
return ret;
}
/*
* Locate the file_offset of @cur_disk_bytenr of a @bio.
*
* Bio of btrfs represents read range of
* [bi_sector << 9, bi_sector << 9 + bi_size).
* Knowing this, we can iterate through each bvec to locate the page belong to
* @cur_disk_bytenr and get the file offset.
*
* @inode is used to determine if the bvec page really belongs to @inode.
*
* Return 0 if we can't find the file offset
* Return >0 if we find the file offset and restore it to @file_offset_ret
*/
static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
u64 disk_bytenr, u64 *file_offset_ret)
{
struct bvec_iter iter;
struct bio_vec bvec;
u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT;
int ret = 0;
bio_for_each_segment(bvec, bio, iter) {
struct page *page = bvec.bv_page;
if (cur > disk_bytenr)
break;
if (cur + bvec.bv_len <= disk_bytenr) {
cur += bvec.bv_len;
continue;
}
ASSERT(in_range(disk_bytenr, cur, bvec.bv_len));
if (page->mapping && page->mapping->host &&
page->mapping->host == inode) {
ret = 1;
*file_offset_ret = page_offset(page) + bvec.bv_offset +
disk_bytenr - cur;
break;
}
}
return ret;
}
/**
* Lookup the checksum for the read bio in csum tree.
*
* @inode: inode that the bio is for.
* @bio: bio to look up.
* @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
* checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
* NULL, the checksum buffer is allocated and returned in
* btrfs_io_bio(bio)->csum instead.
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_path *path;
const u32 sectorsize = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
u32 orig_len = bio->bi_iter.bi_size;
u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 cur_disk_bytenr;
u8 *csum;
const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
int count = 0;
if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
return BLK_STS_OK;
/*
* This function is only called for read bio.
*
* This means two things:
* - All our csums should only be in csum tree
* No ordered extents csums, as ordered extents are only for write
* path.
* - No need to bother any other info from bvec
* Since we're looking up csums, the only important info is the
* disk_bytenr and the length, which can be extracted from bi_iter
* directly.
*/
ASSERT(bio_op(bio) == REQ_OP_READ);
path = btrfs_alloc_path();
if (!path)
return BLK_STS_RESOURCE;
if (!dst) {
struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
GFP_NOFS);
if (!btrfs_bio->csum) {
btrfs_free_path(path);
return BLK_STS_RESOURCE;
}
} else {
btrfs_bio->csum = btrfs_bio->csum_inline;
}
csum = btrfs_bio->csum;
} else {
csum = dst;
}
/*
* If requested number of sectors is larger than one leaf can contain,
* kick the readahead for csum tree.
*/
if (nblocks > fs_info->csums_per_leaf) path->reada = READA_FORWARD;
/*
* the free space stuff is only read when it hasn't been
* updated in the current transaction. So, we can safely
* read from the commit root and sidestep a nasty deadlock
* between reading the free space cache and updating the csum tree.
*/
if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
path->search_commit_root = 1;
path->skip_locking = 1;
}
for (cur_disk_bytenr = orig_disk_bytenr;
cur_disk_bytenr < orig_disk_bytenr + orig_len;
cur_disk_bytenr += (count * sectorsize)) {
u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr;
unsigned int sector_offset;
u8 *csum_dst;
/*
* Although both cur_disk_bytenr and orig_disk_bytenr is u64,
* we're calculating the offset to the bio start.
*
* Bio size is limited to UINT_MAX, thus unsigned int is large
* enough to contain the raw result, not to mention the right
* shifted result.
*/
ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX);
sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >>
fs_info->sectorsize_bits;
csum_dst = csum + sector_offset * csum_size;
count = search_csum_tree(fs_info, path, cur_disk_bytenr,
search_len, csum_dst);
if (count <= 0) {
/*
* Either we hit a critical error or we didn't find
* the csum.
* Either way, we put zero into the csums dst, and skip
* to the next sector.
*/
memset(csum_dst, 0, csum_size);
count = 1;
/*
* For data reloc inode, we need to mark the range
* NODATASUM so that balance won't report false csum
* error.
*/
if (BTRFS_I(inode)->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
u64 file_offset;
int ret;
ret = search_file_offset_in_bio(bio, inode,
cur_disk_bytenr, &file_offset);
if (ret)
set_extent_bits(io_tree, file_offset,
file_offset + sectorsize - 1,
EXTENT_NODATASUM);
} else {
btrfs_warn_rl(fs_info,
"csum hole found for disk bytenr range [%llu, %llu)",
cur_disk_bytenr, cur_disk_bytenr + sectorsize);
}
}
}
btrfs_free_path(path);
return BLK_STS_OK;
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_ordered_sum *sums;
struct btrfs_csum_item *item;
LIST_HEAD(tmplist);
unsigned long offset;
int ret;
size_t size;
u64 csum_end;
const u32 csum_size = fs_info->csum_size;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
if (search_commit) { path->skip_locking = 1;
path->reada = READA_FORWARD;
path->search_commit_root = 1;
}
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.offset = start;
key.type = BTRFS_EXTENT_CSUM_KEY;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto fail;
if (ret > 0 && path->slots[0] > 0) { leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
key.type == BTRFS_EXTENT_CSUM_KEY) { offset = (start - key.offset) >> fs_info->sectorsize_bits;
if (offset * csum_size <
btrfs_item_size_nr(leaf, path->slots[0] - 1))
path->slots[0]--;
}
}
while (start <= end) { leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto fail;
if (ret > 0)
break;
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
key.type != BTRFS_EXTENT_CSUM_KEY || key.offset > end)
break;
if (key.offset > start)
start = key.offset;
size = btrfs_item_size_nr(leaf, path->slots[0]);
csum_end = key.offset + (size / csum_size) * fs_info->sectorsize;
if (csum_end <= start) {
path->slots[0]++;
continue;
}
csum_end = min(csum_end, end + 1);
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
while (start < csum_end) {
size = min_t(size_t, csum_end - start,
max_ordered_sum_bytes(fs_info, csum_size));
sums = kzalloc(btrfs_ordered_sum_size(fs_info, size),
GFP_NOFS);
if (!sums) {
ret = -ENOMEM;
goto fail;
}
sums->bytenr = start;
sums->len = (int)size;
offset = (start - key.offset) >> fs_info->sectorsize_bits;
offset *= csum_size;
size >>= fs_info->sectorsize_bits;
read_extent_buffer(path->nodes[0],
sums->sums,
((unsigned long)item) + offset,
csum_size * size);
start += fs_info->sectorsize * size;
list_add_tail(&sums->list, &tmplist);
}
path->slots[0]++;
}
ret = 0;
fail:
while (ret < 0 && !list_empty(&tmplist)) {
sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
list_del(&sums->list);
kfree(sums);
}
list_splice_tail(&tmplist, list);
btrfs_free_path(path); return ret;
}
/*
* btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio
* @inode: Owner of the data inside the bio
* @bio: Contains the data to be checksummed
* @file_start: offset in file this bio begins to describe
* @contig: Boolean. If true/1 means all bio vecs in this bio are
* contiguous and they begin at @file_start in the file. False/0
* means this bio can contain potentially discontiguous bio vecs
* so the logical offset of each should be calculated separately.
*/
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
u64 file_start, int contig)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
char *data;
struct bvec_iter iter;
struct bio_vec bvec;
int index;
int nr_sectors;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
int i;
u64 offset;
unsigned nofs_flag;
nofs_flag = memalloc_nofs_save();
sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
if (!sums)
return BLK_STS_RESOURCE;
sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
if (contig)
offset = file_start;
else
offset = 0; /* shut up gcc */
sums->bytenr = bio->bi_iter.bi_sector << 9;
index = 0;
shash->tfm = fs_info->csum_shash;
bio_for_each_segment(bvec, bio, iter) {
if (!contig)
offset = page_offset(bvec.bv_page) + bvec.bv_offset; if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset);
/*
* The bio range is not covered by any ordered extent,
* must be a code logic error.
*/
if (unlikely(!ordered)) {
WARN(1, KERN_WARNING
"no ordered extent for root %llu ino %llu offset %llu\n",
inode->root->root_key.objectid,
btrfs_ino(inode), offset);
kvfree(sums);
return BLK_STS_IOERR;
}
}
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
bvec.bv_len + fs_info->sectorsize
- 1);
for (i = 0; i < nr_sectors; i++) {
if (offset >= ordered->file_offset + ordered->num_bytes ||
offset < ordered->file_offset) {
unsigned long bytes_left;
sums->len = this_sum_bytes;
this_sum_bytes = 0;
btrfs_add_ordered_sum(ordered, sums);
btrfs_put_ordered_extent(ordered);
bytes_left = bio->bi_iter.bi_size - total_bytes;
nofs_flag = memalloc_nofs_save();
sums = kvzalloc(btrfs_ordered_sum_size(fs_info,
bytes_left), GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
BUG_ON(!sums); /* -ENOMEM */ sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode,
offset);
ASSERT(ordered); /* Logic error */
sums->bytenr = (bio->bi_iter.bi_sector << 9)
+ total_bytes;
index = 0;
}
data = kmap_atomic(bvec.bv_page);
crypto_shash_digest(shash, data + bvec.bv_offset
+ (i * fs_info->sectorsize),
fs_info->sectorsize,
sums->sums + index);
kunmap_atomic(data);
index += fs_info->csum_size;
offset += fs_info->sectorsize;
this_sum_bytes += fs_info->sectorsize;
total_bytes += fs_info->sectorsize;
}
}
this_sum_bytes = 0;
btrfs_add_ordered_sum(ordered, sums);
btrfs_put_ordered_extent(ordered);
return 0;
}
/*
* helper function for csum removal, this expects the
* key to describe the csum pointed to by the path, and it expects
* the csum to overlap the range [bytenr, len]
*
* The csum should not be entirely contained in the range and the
* range should not be entirely contained in the csum.
*
* This calls btrfs_truncate_item with the correct args based on the
* overlap, and fixes up the key as required.
*/
static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_key *key,
u64 bytenr, u64 len)
{
struct extent_buffer *leaf;
const u32 csum_size = fs_info->csum_size;
u64 csum_end;
u64 end_byte = bytenr + len;
u32 blocksize_bits = fs_info->sectorsize_bits;
leaf = path->nodes[0];
csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
csum_end <<= blocksize_bits;
csum_end += key->offset;
if (key->offset < bytenr && csum_end <= end_byte) {
/*
* [ bytenr - len ]
* [ ]
* [csum ]
* A simple truncate off the end of the item
*/
u32 new_size = (bytenr - key->offset) >> blocksize_bits;
new_size *= csum_size;
btrfs_truncate_item(path, new_size, 1);
} else if (key->offset >= bytenr && csum_end > end_byte &&
end_byte > key->offset) {
/*
* [ bytenr - len ]
* [ ]
* [csum ]
* we need to truncate from the beginning of the csum
*/
u32 new_size = (csum_end - end_byte) >> blocksize_bits;
new_size *= csum_size;
btrfs_truncate_item(path, new_size, 0);
key->offset = end_byte;
btrfs_set_item_key_safe(fs_info, path, key);
} else {
BUG();
}
}
/*
* deletes the csum items from the csum tree for a given
* range of bytes.
*/
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path;
struct btrfs_key key;
u64 end_byte = bytenr + len;
u64 csum_end;
struct extent_buffer *leaf;
int ret = 0;
const u32 csum_size = fs_info->csum_size;
u32 blocksize_bits = fs_info->sectorsize_bits;
ASSERT(root == fs_info->csum_root ||
root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
while (1) {
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.offset = end_byte - 1;
key.type = BTRFS_EXTENT_CSUM_KEY;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--; } else if (ret < 0) {
break;
}
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
key.type != BTRFS_EXTENT_CSUM_KEY) {
break;
}
if (key.offset >= end_byte)
break;
csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
csum_end <<= blocksize_bits;
csum_end += key.offset;
/* this csum ends before we start, we're done */
if (csum_end <= bytenr)
break;
/* delete the entire item, it is inside our range */
if (key.offset >= bytenr && csum_end <= end_byte) {
int del_nr = 1;
/*
* Check how many csum items preceding this one in this
* leaf correspond to our range and then delete them all
* at once.
*/
if (key.offset > bytenr && path->slots[0] > 0) { int slot = path->slots[0] - 1; while (slot >= 0) {
struct btrfs_key pk;
btrfs_item_key_to_cpu(leaf, &pk, slot);
if (pk.offset < bytenr ||
pk.type != BTRFS_EXTENT_CSUM_KEY || pk.objectid !=
BTRFS_EXTENT_CSUM_OBJECTID)
break;
path->slots[0] = slot;
del_nr++;
key.offset = pk.offset;
slot--;
}
}
ret = btrfs_del_items(trans, root, path,
path->slots[0], del_nr);
if (ret)
break;
if (key.offset == bytenr)
break;
} else if (key.offset < bytenr && csum_end > end_byte) {
unsigned long offset;
unsigned long shift_len;
unsigned long item_offset;
/*
* [ bytenr - len ]
* [csum ]
*
* Our bytes are in the middle of the csum,
* we need to split this item and insert a new one.
*
* But we can't drop the path because the
* csum could change, get removed, extended etc.
*
* The trick here is the max size of a csum item leaves
* enough room in the tree block for a single
* item header. So, we split the item in place,
* adding a new header pointing to the existing
* bytes. Then we loop around again and we have
* a nicely formed csum item that we can neatly
* truncate.
*/
offset = (bytenr - key.offset) >> blocksize_bits; offset *= csum_size;
shift_len = (len >> blocksize_bits) * csum_size;
item_offset = btrfs_item_ptr_offset(leaf,
path->slots[0]);
memzero_extent_buffer(leaf, item_offset + offset,
shift_len);
key.offset = bytenr;
/*
* btrfs_split_item returns -EAGAIN when the
* item changed size or key
*/
ret = btrfs_split_item(trans, root, path, &key, offset);
if (ret && ret != -EAGAIN) { btrfs_abort_transaction(trans, ret);
break;
}
ret = 0;
key.offset = end_byte - 1;
} else {
truncate_one_csum(fs_info, path, &key, bytenr, len);
if (key.offset < bytenr)
break;
}
btrfs_release_path(path);
}
btrfs_free_path(path); return ret;
}
static int find_next_csum_offset(struct btrfs_root *root,
struct btrfs_path *path,
u64 *next_offset)
{
const u32 nritems = btrfs_header_nritems(path->nodes[0]);
struct btrfs_key found_key;
int slot = path->slots[0] + 1;
int ret;
if (nritems == 0 || slot >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
return ret;
} else if (ret > 0) { *next_offset = (u64)-1; return 0;
}
slot = path->slots[0];
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
found_key.type != BTRFS_EXTENT_CSUM_KEY)
*next_offset = (u64)-1;
else
*next_offset = found_key.offset;
return 0;
}
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key file_key;
struct btrfs_key found_key;
struct btrfs_path *path;
struct btrfs_csum_item *item;
struct btrfs_csum_item *item_end;
struct extent_buffer *leaf = NULL;
u64 next_offset;
u64 total_bytes = 0;
u64 csum_offset;
u64 bytenr;
u32 ins_size;
int index = 0;
int found_next;
int ret;
const u32 csum_size = fs_info->csum_size;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
again:
next_offset = (u64)-1;
found_next = 0;
bytenr = sums->bytenr + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
file_key.offset = bytenr;
file_key.type = BTRFS_EXTENT_CSUM_KEY;
item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
if (!IS_ERR(item)) {
ret = 0;
leaf = path->nodes[0];
item_end = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_csum_item);
item_end = (struct btrfs_csum_item *)((char *)item_end +
btrfs_item_size_nr(leaf, path->slots[0]));
goto found;
}
ret = PTR_ERR(item);
if (ret != -EFBIG && ret != -ENOENT)
goto out;
if (ret == -EFBIG) {
u32 item_size;
/* we found one, but it isn't big enough yet */
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
if ((item_size / csum_size) >=
MAX_CSUM_ITEMS(fs_info, csum_size)) {
/* already at max size, make a new one */
goto insert;
}
} else {
/* We didn't find a csum item, insert one. */
ret = find_next_csum_offset(root, path, &next_offset);
if (ret < 0)
goto out;
found_next = 1;
goto insert;
}
/*
* At this point, we know the tree has a checksum item that ends at an
* offset matching the start of the checksum range we want to insert.
* We try to extend that item as much as possible and then add as many
* checksums to it as they fit.
*
* First check if the leaf has enough free space for at least one
* checksum. If it has go directly to the item extension code, otherwise
* release the path and do a search for insertion before the extension.
*/
if (btrfs_leaf_free_space(leaf) >= csum_size) { btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
csum_offset = (bytenr - found_key.offset) >>
fs_info->sectorsize_bits;
goto extend_csum;
}
btrfs_release_path(path);
path->search_for_extension = 1;
ret = btrfs_search_slot(trans, root, &file_key, path,
csum_size, 1);
path->search_for_extension = 0;
if (ret < 0)
goto out;
if (ret > 0) { if (path->slots[0] == 0)
goto insert;
path->slots[0]--;
}
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits;
if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
csum_offset >= MAX_CSUM_ITEMS(fs_info, csum_size)) {
goto insert;
}
extend_csum:
if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
csum_size) {
int extend_nr;
u64 tmp;
u32 diff;
tmp = sums->len - total_bytes;
tmp >>= fs_info->sectorsize_bits;
WARN_ON(tmp < 1); extend_nr = max_t(int, 1, tmp);
/*
* A log tree can already have checksum items with a subset of
* the checksums we are trying to log. This can happen after
* doing a sequence of partial writes into prealloc extents and
* fsyncs in between, with a full fsync logging a larger subrange
* of an extent for which a previous fast fsync logged a smaller
* subrange. And this happens in particular due to merging file
* extent items when we complete an ordered extent for a range
* covered by a prealloc extent - this is done at
* btrfs_mark_extent_written().
*
* So if we try to extend the previous checksum item, which has
* a range that ends at the start of the range we want to insert,
* make sure we don't extend beyond the start offset of the next
* checksum item. If we are at the last item in the leaf, then
* forget the optimization of extending and add a new checksum
* item - it is not worth the complexity of releasing the path,
* getting the first key for the next leaf, repeat the btree
* search, etc, because log trees are temporary anyway and it
* would only save a few bytes of leaf space.
*/
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
if (path->slots[0] + 1 >=
btrfs_header_nritems(path->nodes[0])) {
ret = find_next_csum_offset(root, path, &next_offset);
if (ret < 0)
goto out;
found_next = 1;
goto insert;
}
ret = find_next_csum_offset(root, path, &next_offset);
if (ret < 0)
goto out;
tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits;
if (tmp <= INT_MAX)
extend_nr = min_t(int, extend_nr, tmp);
}
diff = (csum_offset + extend_nr) * csum_size;
diff = min(diff,
MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
diff = min_t(u32, btrfs_leaf_free_space(leaf), diff);
diff /= csum_size;
diff *= csum_size;
btrfs_extend_item(path, diff);
ret = 0;
goto csum;
}
insert:
btrfs_release_path(path);
csum_offset = 0;
if (found_next) {
u64 tmp;
tmp = sums->len - total_bytes;
tmp >>= fs_info->sectorsize_bits;
tmp = min(tmp, (next_offset - file_key.offset) >>
fs_info->sectorsize_bits);
tmp = max_t(u64, 1, tmp);
tmp = min_t(u64, tmp, MAX_CSUM_ITEMS(fs_info, csum_size));
ins_size = csum_size * tmp;
} else {
ins_size = csum_size;
}
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
ins_size);
if (ret < 0)
goto out;
if (WARN_ON(ret != 0))
goto out;
leaf = path->nodes[0];
csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
item_end = (struct btrfs_csum_item *)((unsigned char *)item +
btrfs_item_size_nr(leaf, path->slots[0]));
item = (struct btrfs_csum_item *)((unsigned char *)item +
csum_offset * csum_size);
found:
ins_size = (u32)(sums->len - total_bytes) >> fs_info->sectorsize_bits;
ins_size *= csum_size;
ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
ins_size);
write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
ins_size);
index += ins_size;
ins_size /= csum_size;
total_bytes += ins_size * fs_info->sectorsize;
btrfs_mark_buffer_dirty(path->nodes[0]);
if (total_bytes < sums->len) {
btrfs_release_path(path);
cond_resched();
goto again;
}
out:
btrfs_free_path(path); return ret;
}
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
const bool new_inline,
struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf = path->nodes[0];
const int slot = path->slots[0];
struct btrfs_key key;
u64 extent_start, extent_end;
u64 bytenr;
u8 type = btrfs_file_extent_type(leaf, fi);
int compress_type = btrfs_file_extent_compression(leaf, fi);
btrfs_item_key_to_cpu(leaf, &key, slot);
extent_start = key.offset;
extent_end = btrfs_file_extent_end(path);
em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
em->start = extent_start;
em->len = extent_end - extent_start;
em->orig_start = extent_start -
btrfs_file_extent_offset(leaf, fi);
em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
if (bytenr == 0) {
em->block_start = EXTENT_MAP_HOLE;
return;
}
if (compress_type != BTRFS_COMPRESS_NONE) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
em->compress_type = compress_type;
em->block_start = bytenr;
em->block_len = em->orig_block_len;
} else {
bytenr += btrfs_file_extent_offset(leaf, fi);
em->block_start = bytenr;
em->block_len = em->len;
if (type == BTRFS_FILE_EXTENT_PREALLOC)
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
}
} else if (type == BTRFS_FILE_EXTENT_INLINE) { em->block_start = EXTENT_MAP_INLINE;
em->start = extent_start;
em->len = extent_end - extent_start;
/*
* Initialize orig_start and block_len with the same values
* as in inode.c:btrfs_get_extent().
*/
em->orig_start = EXTENT_MAP_HOLE;
em->block_len = (u64)-1;
if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
em->compress_type = compress_type;
}
} else {
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "
"root %llu", type, btrfs_ino(inode), extent_start,
root->root_key.objectid);
}
}
/*
* Returns the end offset (non inclusive) of the file extent item the given path
* points to. If it points to an inline extent, the returned offset is rounded
* up to the sector size.
*/
u64 btrfs_file_extent_end(const struct btrfs_path *path)
{
const struct extent_buffer *leaf = path->nodes[0];
const int slot = path->slots[0];
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 end;
btrfs_item_key_to_cpu(leaf, &key, slot);
ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
end = btrfs_file_extent_ram_bytes(leaf, fi);
end = ALIGN(key.offset + end, leaf->fs_info->sectorsize);
} else {
end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
}
return end;
}
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 1994 Linus Torvalds
*
* Pentium III FXSR, SSE support
* General FPU state handling cleanups
* Gareth Hughes <gareth@valinux.com>, May 2000
* x86-64 work by Andi Kleen 2002
*/
#ifndef _ASM_X86_FPU_INTERNAL_H
#define _ASM_X86_FPU_INTERNAL_H
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/user.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/xcr.h>
#include <asm/cpufeature.h>
#include <asm/trace/fpu.h>
/*
* High level FPU state handling functions:
*/
extern int fpu__restore_sig(void __user *buf, int ia32_frame);
extern void fpu__drop(struct fpu *fpu);
extern void fpu__clear_user_states(struct fpu *fpu);
extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
extern void fpu_sync_fpstate(struct fpu *fpu);
/* Clone and exit operations */
extern int fpu_clone(struct task_struct *dst);
extern void fpu_flush_thread(void);
/*
* Boot time FPU initialization functions:
*/
extern void fpu__init_cpu(void);
extern void fpu__init_system_xstate(void);
extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(struct cpuinfo_x86 *c);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
/*
* Debugging facility:
*/
#ifdef CONFIG_X86_DEBUG_FPU
# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
#else
# define WARN_ON_FPU(x) ({ (void)(x); 0; })
#endif
/*
* FPU related CPU feature flag helper routines:
*/
static __always_inline __pure bool use_xsaveopt(void)
{
return static_cpu_has(X86_FEATURE_XSAVEOPT);
}
static __always_inline __pure bool use_xsave(void)
{
return static_cpu_has(X86_FEATURE_XSAVE);
}
static __always_inline __pure bool use_fxsr(void)
{
return static_cpu_has(X86_FEATURE_FXSR);
}
/*
* fpstate handling functions:
*/
extern union fpregs_state init_fpstate;
extern void fpstate_init(union fpregs_state *state);
#ifdef CONFIG_MATH_EMULATION
extern void fpstate_init_soft(struct swregs_state *soft);
#else
static inline void fpstate_init_soft(struct swregs_state *soft) {}
#endif
extern void save_fpregs_to_fpstate(struct fpu *fpu);
/* Returns 0 or the negated trap number, which results in -EFAULT for #PF */
#define user_insn(insn, output, input...) \
({ \
int err; \
\
might_fault(); \
\
asm volatile(ASM_STAC "\n" \
"1: " #insn "\n" \
"2: " ASM_CLAC "\n" \
".section .fixup,\"ax\"\n" \
"3: negl %%eax\n" \
" jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE_FAULT(1b, 3b) \
: [err] "=a" (err), output \
: "0"(0), input); \
err; \
})
#define kernel_insn_err(insn, output, input...) \
({ \
int err; \
asm volatile("1:" #insn "\n\t" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: movl $-1,%[err]\n" \
" jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE(1b, 3b) \
: [err] "=r" (err), output \
: "0"(0), input); \
err; \
})
#define kernel_insn(insn, output, input...) \
asm volatile("1:" #insn "\n\t" \
"2:\n" \
_ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \
: output : input)
static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx)
{
return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx));
}
static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
else
return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
}
static inline void fxrstor(struct fxregs_state *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
else
kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
}
static inline int fxrstor_safe(struct fxregs_state *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
else
return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
}
static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
else
return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
}
static inline void frstor(struct fregs_state *fx)
{
kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}
static inline int frstor_safe(struct fregs_state *fx)
{
return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}
static inline int frstor_from_user_sigframe(struct fregs_state __user *fx)
{
return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}
static inline void fxsave(struct fxregs_state *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
else
asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
}
/* These macros all use (%edi)/(%rdi) as the single memory argument. */
#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
/*
* After this @err contains 0 on success or the negated trap number when
* the operation raises an exception. For faults this results in -EFAULT.
*/
#define XSTATE_OP(op, st, lmask, hmask, err) \
asm volatile("1:" op "\n\t" \
"xor %[err], %[err]\n" \
"2:\n\t" \
".pushsection .fixup,\"ax\"\n\t" \
"3: negl %%eax\n\t" \
"jmp 2b\n\t" \
".popsection\n\t" \
_ASM_EXTABLE_FAULT(1b, 3b) \
: [err] "=a" (err) \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
/*
* If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
* format and supervisor states in addition to modified optimization in
* XSAVEOPT.
*
* Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
* supports modified optimization which is not supported by XSAVE.
*
* We use XSAVE as a fallback.
*
* The 661 label is defined in the ALTERNATIVE* macros as the address of the
* original instruction which gets replaced. We need to use it here as the
* address of the instruction where we might get an exception at.
*/
#define XSTATE_XSAVE(st, lmask, hmask, err) \
asm volatile(ALTERNATIVE_2(XSAVE, \
XSAVEOPT, X86_FEATURE_XSAVEOPT, \
XSAVES, X86_FEATURE_XSAVES) \
"\n" \
"xor %[err], %[err]\n" \
"3:\n" \
".pushsection .fixup,\"ax\"\n" \
"4: movl $-2, %[err]\n" \
"jmp 3b\n" \
".popsection\n" \
_ASM_EXTABLE(661b, 4b) \
: [err] "=r" (err) \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
/*
* Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
* XSAVE area format.
*/
#define XSTATE_XRESTORE(st, lmask, hmask) \
asm volatile(ALTERNATIVE(XRSTOR, \
XRSTORS, X86_FEATURE_XSAVES) \
"\n" \
"3:\n" \
_ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\
: \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
/*
* This function is called only during boot time when x86 caps are not set
* up and alternative can not be used yet.
*/
static inline void os_xrstor_booting(struct xregs_state *xstate)
{
u64 mask = xfeatures_mask_fpstate();
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
WARN_ON(system_state != SYSTEM_BOOTING);
if (boot_cpu_has(X86_FEATURE_XSAVES))
XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
else
XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
/*
* We should never fault when copying from a kernel buffer, and the FPU
* state we set at boot time should be valid.
*/
WARN_ON_FPU(err);
}
/*
* Save processor xstate to xsave area.
*
* Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features
* and command line options. The choice is permanent until the next reboot.
*/
static inline void os_xsave(struct xregs_state *xstate)
{
u64 mask = xfeatures_mask_all;
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
WARN_ON_FPU(!alternatives_patched);
XSTATE_XSAVE(xstate, lmask, hmask, err);
/* We should never fault when copying to a kernel buffer: */
WARN_ON_FPU(err);
}
/*
* Restore processor xstate from xsave area.
*
* Uses XRSTORS when XSAVES is used, XRSTOR otherwise.
*/
static inline void os_xrstor(struct xregs_state *xstate, u64 mask)
{
u32 lmask = mask;
u32 hmask = mask >> 32;
XSTATE_XRESTORE(xstate, lmask, hmask);
}
/*
* Save xstate to user space xsave area.
*
* We don't use modified optimization because xrstor/xrstors might track
* a different application.
*
* We don't use compacted format xsave area for
* backward compatibility for old applications which don't understand
* compacted format of xsave area.
*/
static inline int xsave_to_user_sigframe(struct xregs_state __user *buf)
{
/*
* Include the features which are not xsaved/rstored by the kernel
* internally, e.g. PKRU. That's user space ABI and also required
* to allow the signal handler to modify PKRU.
*/
u64 mask = xfeatures_mask_uabi();
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
/*
* Clear the xsave header first, so that reserved fields are
* initialized to zero.
*/
err = __clear_user(&buf->header, sizeof(buf->header));
if (unlikely(err))
return -EFAULT;
stac();
XSTATE_OP(XSAVE, buf, lmask, hmask, err);
clac();
return err;
}
/*
* Restore xstate from user space xsave area.
*/
static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask)
{
struct xregs_state *xstate = ((__force struct xregs_state *)buf);
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
stac();
XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
clac();
return err;
}
/*
* Restore xstate from kernel space xsave area, return an error code instead of
* an exception.
*/
static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask)
{
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
if (cpu_feature_enabled(X86_FEATURE_XSAVES))
XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
else
XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
return err;
}
extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask);
static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate)
{
__restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate());
}
extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
/*
* FPU context switch related helper methods:
*/
DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
/*
* The in-register FPU state for an FPU context on a CPU is assumed to be
* valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
* matches the FPU.
*
* If the FPU register state is valid, the kernel can skip restoring the
* FPU state from memory.
*
* Any code that clobbers the FPU registers or updates the in-memory
* FPU state for a task MUST let the rest of the kernel know that the
* FPU registers are no longer valid for this task.
*
* Either one of these invalidation functions is enough. Invalidate
* a resource you control: CPU if using the CPU for something else
* (with preemption disabled), FPU for the current task, or a task that
* is prevented from running by the current task.
*/
static inline void __cpu_invalidate_fpregs_state(void)
{
__this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}
static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
{
fpu->last_cpu = -1;
}
static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{
return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
}
/*
* These generally need preemption protection to work,
* do try to avoid using these on their own:
*/
static inline void fpregs_deactivate(struct fpu *fpu)
{
this_cpu_write(fpu_fpregs_owner_ctx, NULL);
trace_x86_fpu_regs_deactivated(fpu);
}
static inline void fpregs_activate(struct fpu *fpu)
{
this_cpu_write(fpu_fpregs_owner_ctx, fpu);
trace_x86_fpu_regs_activated(fpu);
}
/* Internal helper for switch_fpu_return() and signal frame setup */
static inline void fpregs_restore_userregs(void)
{
struct fpu *fpu = ¤t->thread.fpu;
int cpu = smp_processor_id();
if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
return;
if (!fpregs_state_valid(fpu, cpu)) {
u64 mask;
/*
* This restores _all_ xstate which has not been
* established yet.
*
* If PKRU is enabled, then the PKRU value is already
* correct because it was either set in switch_to() or in
* flush_thread(). So it is excluded because it might be
* not up to date in current->thread.fpu.xsave state.
*/
mask = xfeatures_mask_restore_user() |
xfeatures_mask_supervisor();
__restore_fpregs_from_fpstate(&fpu->state, mask);
fpregs_activate(fpu);
fpu->last_cpu = cpu;
}
clear_thread_flag(TIF_NEED_FPU_LOAD);
}
/*
* FPU state switching for scheduling.
*
* This is a two-stage process:
*
* - switch_fpu_prepare() saves the old state.
* This is done within the context of the old process.
*
* - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state
* will get loaded on return to userspace, or when the kernel needs it.
*
* If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers
* are saved in the current thread's FPU register state.
*
* If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not
* hold current()'s FPU registers. It is required to load the
* registers before returning to userland or using the content
* otherwise.
*
* The FPU context is only stored/restored for a user task and
* PF_KTHREAD is used to distinguish between kernel and user threads.
*/
static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
{
if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) {
save_fpregs_to_fpstate(old_fpu);
/*
* The save operation preserved register state, so the
* fpu_fpregs_owner_ctx is still @old_fpu. Store the
* current CPU number in @old_fpu, so the next return
* to user space can avoid the FPU register restore
* when is returns on the same CPU and still owns the
* context.
*/
old_fpu->last_cpu = cpu;
trace_x86_fpu_regs_deactivated(old_fpu);
}
}
/*
* Misc helper functions:
*/
/*
* Delay loading of the complete FPU state until the return to userland.
* PKRU is handled separately.
*/
static inline void switch_fpu_finish(struct fpu *new_fpu)
{
if (cpu_feature_enabled(X86_FEATURE_FPU))
set_thread_flag(TIF_NEED_FPU_LOAD);
}
#endif /* _ASM_X86_FPU_INTERNAL_H */
// SPDX-License-Identifier: GPL-2.0-only
#include <net/tcp.h>
/* The bandwidth estimator estimates the rate at which the network
* can currently deliver outbound data packets for this flow. At a high
* level, it operates by taking a delivery rate sample for each ACK.
*
* A rate sample records the rate at which the network delivered packets
* for this flow, calculated over the time interval between the transmission
* of a data packet and the acknowledgment of that packet.
*
* Specifically, over the interval between each transmit and corresponding ACK,
* the estimator generates a delivery rate sample. Typically it uses the rate
* at which packets were acknowledged. However, the approach of using only the
* acknowledgment rate faces a challenge under the prevalent ACK decimation or
* compression: packets can temporarily appear to be delivered much quicker
* than the bottleneck rate. Since it is physically impossible to do that in a
* sustained fashion, when the estimator notices that the ACK rate is faster
* than the transmit rate, it uses the latter:
*
* send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
* ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
* bw = min(send_rate, ack_rate)
*
* Notice the estimator essentially estimates the goodput, not always the
* network bottleneck link rate when the sending or receiving is limited by
* other factors like applications or receiver window limits. The estimator
* deliberately avoids using the inter-packet spacing approach because that
* approach requires a large number of samples and sophisticated filtering.
*
* TCP flows can often be application-limited in request/response workloads.
* The estimator marks a bandwidth sample as application-limited if there
* was some moment during the sampled window of packets when there was no data
* ready to send in the write queue.
*/
/* Snapshot the current delivery information in the skb, to generate
* a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
*/
void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
/* In general we need to start delivery rate samples from the
* time we received the most recent ACK, to ensure we include
* the full time the network needs to deliver all in-flight
* packets. If there are no packets in flight yet, then we
* know that any ACKs after now indicate that the network was
* able to deliver those packets completely in the sampling
* interval between now and the next ACK.
*
* Note that we use packets_out instead of tcp_packets_in_flight(tp)
* because the latter is a guess based on RTO and loss-marking
* heuristics. We don't want spurious RTOs or loss markings to cause
* a spuriously small time interval, causing a spuriously high
* bandwidth estimate.
*/
if (!tp->packets_out) { u64 tstamp_us = tcp_skb_timestamp_us(skb);
tp->first_tx_mstamp = tstamp_us;
tp->delivered_mstamp = tstamp_us;
}
TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
}
/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
* delivery information when the skb was last transmitted.
*
* If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
* called multiple times. We favor the information from the most recently
* sent skb, i.e., the skb with the most recently sent time and the highest
* sequence.
*/
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
u64 tx_tstamp;
if (!scb->tx.delivered_mstamp)
return;
tx_tstamp = tcp_skb_timestamp_us(skb);
if (!rs->prior_delivered ||
tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
scb->end_seq, rs->last_end_seq)) {
rs->prior_delivered = scb->tx.delivered;
rs->prior_mstamp = scb->tx.delivered_mstamp;
rs->is_app_limited = scb->tx.is_app_limited;
rs->is_retrans = scb->sacked & TCPCB_RETRANS;
rs->last_end_seq = scb->end_seq;
/* Record send time of most recently ACKed packet: */
tp->first_tx_mstamp = tx_tstamp;
/* Find the duration of the "send phase" of this window: */
rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
scb->tx.first_tx_mstamp);
}
/* Mark off the skb delivered once it's sacked to avoid being
* used again when it's cumulatively acked. For acked packets
* we don't need to reset since it'll be freed soon.
*/
if (scb->sacked & TCPCB_SACKED_ACKED)
scb->tx.delivered_mstamp = 0;
}
/* Update the connection delivery information and generate a rate sample. */
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
bool is_sack_reneg, struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 snd_us, ack_us;
/* Clear app limited if bubble is acked and gone. */
if (tp->app_limited && after(tp->delivered, tp->app_limited))
tp->app_limited = 0;
/* TODO: there are multiple places throughout tcp_ack() to get
* current time. Refactor the code using a new "tcp_acktag_state"
* to carry current time, flags, stats like "tcp_sacktag_state".
*/
if (delivered)
tp->delivered_mstamp = tp->tcp_mstamp;
rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
rs->losses = lost; /* freshly marked lost */
/* Return an invalid sample if no timing information is available or
* in recovery from loss with SACK reneging. Rate samples taken during
* a SACK reneging event may overestimate bw by including packets that
* were SACKed before the reneg.
*/
if (!rs->prior_mstamp || is_sack_reneg) {
rs->delivered = -1;
rs->interval_us = -1;
return;
}
rs->delivered = tp->delivered - rs->prior_delivered;
/* Model sending data and receiving ACKs as separate pipeline phases
* for a window. Usually the ACK phase is longer, but with ACK
* compression the send phase can be longer. To be safe we use the
* longer phase.
*/
snd_us = rs->interval_us; /* send phase */
ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
rs->prior_mstamp); /* ack phase */
rs->interval_us = max(snd_us, ack_us);
/* Record both segment send and ack receive intervals */
rs->snd_interval_us = snd_us;
rs->rcv_interval_us = ack_us;
/* Normally we expect interval_us >= min-rtt.
* Note that rate may still be over-estimated when a spuriously
* retransmistted skb was first (s)acked because "interval_us"
* is under-estimated (up to an RTT). However continuously
* measuring the delivery rate during loss recovery is crucial
* for connections suffer heavy or prolonged losses.
*/
if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
if (!rs->is_retrans)
pr_debug("tcp rate: %ld %d %u %u %u\n",
rs->interval_us, rs->delivered,
inet_csk(sk)->icsk_ca_state,
tp->rx_opt.sack_ok, tcp_min_rtt(tp));
rs->interval_us = -1;
return;
}
/* Record the last non-app-limited or the highest app-limited bw */
if (!rs->is_app_limited ||
((u64)rs->delivered * tp->rate_interval_us >=
(u64)tp->rate_delivered * rs->interval_us)) {
tp->rate_delivered = rs->delivered;
tp->rate_interval_us = rs->interval_us;
tp->rate_app_limited = rs->is_app_limited;
}
}
/* If a gap is detected between sends, mark the socket application-limited. */
void tcp_rate_check_app_limited(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (/* We have less than one packet to send. */
tp->write_seq - tp->snd_nxt < tp->mss_cache &&
/* Nothing in sending host's qdisc queues or NIC tx queue. */
sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
/* We are not limited by CWND. */
tcp_packets_in_flight(tp) < tp->snd_cwnd &&
/* All lost packets have been retransmitted. */
tp->lost_out <= tp->retrans_out)
tp->app_limited =
(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
}
EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_SCSI_CMND_H
#define _SCSI_SCSI_CMND_H
#include <linux/dma-mapping.h>
#include <linux/blkdev.h>
#include <linux/t10-pi.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/scatterlist.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_request.h>
struct Scsi_Host;
struct scsi_driver;
/*
* MAX_COMMAND_SIZE is:
* The longest fixed-length SCSI CDB as per the SCSI standard.
* fixed-length means: commands that their size can be determined
* by their opcode and the CDB does not carry a length specifier, (unlike
* the VARIABLE_LENGTH_CMD(0x7f) command). This is actually not exactly
* true and the SCSI standard also defines extended commands and
* vendor specific commands that can be bigger than 16 bytes. The kernel
* will support these using the same infrastructure used for VARLEN CDB's.
* So in effect MAX_COMMAND_SIZE means the maximum size command scsi-ml
* supports without specifying a cmd_len by ULD's
*/
#define MAX_COMMAND_SIZE 16
#if (MAX_COMMAND_SIZE > BLK_MAX_CDB)
# error MAX_COMMAND_SIZE can not be bigger than BLK_MAX_CDB
#endif
struct scsi_data_buffer {
struct sg_table table;
unsigned length;
};
/* embedded in scsi_cmnd */
struct scsi_pointer {
char *ptr; /* data pointer */
int this_residual; /* left in this buffer */
struct scatterlist *buffer; /* which buffer */
int buffers_residual; /* how many buffers left */
dma_addr_t dma_handle;
volatile int Status;
volatile int Message;
volatile int have_data_in;
volatile int sent_command;
volatile int phase;
};
/* for scmd->flags */
#define SCMD_TAGGED (1 << 0)
#define SCMD_INITIALIZED (1 << 1)
#define SCMD_LAST (1 << 2)
/* flags preserved across unprep / reprep */
#define SCMD_PRESERVED_FLAGS (SCMD_INITIALIZED)
/* for scmd->state */
#define SCMD_STATE_COMPLETE 0
#define SCMD_STATE_INFLIGHT 1
struct scsi_cmnd {
struct scsi_request req;
struct scsi_device *device;
struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */
struct delayed_work abort_work;
struct rcu_head rcu;
int eh_eflags; /* Used by error handlr */
int budget_token;
/*
* This is set to jiffies as it was when the command was first
* allocated. It is used to time how long the command has
* been outstanding
*/
unsigned long jiffies_at_alloc;
int retries;
int allowed;
unsigned char prot_op;
unsigned char prot_type;
unsigned char prot_flags;
unsigned short cmd_len;
enum dma_data_direction sc_data_direction;
/* These elements define the operation we are about to perform */
unsigned char *cmnd;
/* These elements define the operation we ultimately want to perform */
struct scsi_data_buffer sdb;
struct scsi_data_buffer *prot_sdb;
unsigned underflow; /* Return error if less than
this amount is transferred */
unsigned transfersize; /* How much we are guaranteed to
transfer with each SCSI transfer
(ie, between disconnect /
reconnects. Probably == sector
size */
unsigned char *sense_buffer;
/* obtained by REQUEST SENSE when
* CHECK CONDITION is received on original
* command (auto-sense). Length must be
* SCSI_SENSE_BUFFERSIZE bytes. */
/* Low-level done function - can be used by low-level driver to point
* to completion function. Not used by mid/upper level code. */
void (*scsi_done) (struct scsi_cmnd *);
/*
* The following fields can be written to by the host specific code.
* Everything else should be left alone.
*/
struct scsi_pointer SCp; /* Scratchpad used by some host adapters */
unsigned char *host_scribble; /* The host adapter is allowed to
* call scsi_malloc and get some memory
* and hang it here. The host adapter
* is also expected to call scsi_free
* to release this memory. (The memory
* obtained by scsi_malloc is guaranteed
* to be at an address < 16Mb). */
int result; /* Status code from lower level driver */
int flags; /* Command flags */
unsigned long state; /* Command completion state */
unsigned int extra_len; /* length of alignment and padding */
};
/* Variant of blk_mq_rq_from_pdu() that verifies the type of its argument. */
static inline struct request *scsi_cmd_to_rq(struct scsi_cmnd *scmd)
{
return blk_mq_rq_from_pdu(scmd);
}
/*
* Return the driver private allocation behind the command.
* Only works if cmd_size is set in the host template.
*/
static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd)
{
return cmd + 1;
}
/* make sure not to use it with passthrough commands */
static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd)
{
struct request *rq = scsi_cmd_to_rq(cmd);
return *(struct scsi_driver **)rq->rq_disk->private_data;
}
extern void scsi_finish_command(struct scsi_cmnd *cmd);
extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
size_t *offset, size_t *len);
extern void scsi_kunmap_atomic_sg(void *virt);
blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd);
void scsi_free_sgtables(struct scsi_cmnd *cmd);
#ifdef CONFIG_SCSI_DMA
extern int scsi_dma_map(struct scsi_cmnd *cmd);
extern void scsi_dma_unmap(struct scsi_cmnd *cmd);
#else /* !CONFIG_SCSI_DMA */
static inline int scsi_dma_map(struct scsi_cmnd *cmd) { return -ENOSYS; }
static inline void scsi_dma_unmap(struct scsi_cmnd *cmd) { }
#endif /* !CONFIG_SCSI_DMA */
static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd)
{
return cmd->sdb.table.nents;
}
static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd)
{
return cmd->sdb.table.sgl;
}
static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd)
{
return cmd->sdb.length;
}
static inline void scsi_set_resid(struct scsi_cmnd *cmd, unsigned int resid)
{
cmd->req.resid_len = resid;
}
static inline unsigned int scsi_get_resid(struct scsi_cmnd *cmd)
{
return cmd->req.resid_len;
}
#define scsi_for_each_sg(cmd, sg, nseg, __i) \
for_each_sg(scsi_sglist(cmd), sg, nseg, __i)
static inline int scsi_sg_copy_from_buffer(struct scsi_cmnd *cmd,
void *buf, int buflen)
{
return sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd),
buf, buflen);
}
static inline int scsi_sg_copy_to_buffer(struct scsi_cmnd *cmd,
void *buf, int buflen)
{
return sg_copy_to_buffer(scsi_sglist(cmd), scsi_sg_count(cmd),
buf, buflen);
}
static inline sector_t scsi_get_sector(struct scsi_cmnd *scmd)
{
return blk_rq_pos(scsi_cmd_to_rq(scmd));
}
static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd)
{
unsigned int shift = ilog2(scmd->device->sector_size) - SECTOR_SHIFT;
return blk_rq_pos(scsi_cmd_to_rq(scmd)) >> shift;
}
static inline unsigned int scsi_logical_block_count(struct scsi_cmnd *scmd)
{
unsigned int shift = ilog2(scmd->device->sector_size) - SECTOR_SHIFT;
return blk_rq_bytes(scsi_cmd_to_rq(scmd)) >> shift;
}
/*
* The operations below are hints that tell the controller driver how
* to handle I/Os with DIF or similar types of protection information.
*/
enum scsi_prot_operations {
/* Normal I/O */
SCSI_PROT_NORMAL = 0,
/* OS-HBA: Protected, HBA-Target: Unprotected */
SCSI_PROT_READ_INSERT,
SCSI_PROT_WRITE_STRIP,
/* OS-HBA: Unprotected, HBA-Target: Protected */
SCSI_PROT_READ_STRIP,
SCSI_PROT_WRITE_INSERT,
/* OS-HBA: Protected, HBA-Target: Protected */
SCSI_PROT_READ_PASS,
SCSI_PROT_WRITE_PASS,
};
static inline void scsi_set_prot_op(struct scsi_cmnd *scmd, unsigned char op)
{
scmd->prot_op = op;
}
static inline unsigned char scsi_get_prot_op(struct scsi_cmnd *scmd)
{
return scmd->prot_op;
}
enum scsi_prot_flags {
SCSI_PROT_TRANSFER_PI = 1 << 0,
SCSI_PROT_GUARD_CHECK = 1 << 1,
SCSI_PROT_REF_CHECK = 1 << 2,
SCSI_PROT_REF_INCREMENT = 1 << 3,
SCSI_PROT_IP_CHECKSUM = 1 << 4,
};
/*
* The controller usually does not know anything about the target it
* is communicating with. However, when DIX is enabled the controller
* must be know target type so it can verify the protection
* information passed along with the I/O.
*/
enum scsi_prot_target_type {
SCSI_PROT_DIF_TYPE0 = 0,
SCSI_PROT_DIF_TYPE1,
SCSI_PROT_DIF_TYPE2,
SCSI_PROT_DIF_TYPE3,
};
static inline void scsi_set_prot_type(struct scsi_cmnd *scmd, unsigned char type)
{
scmd->prot_type = type;
}
static inline unsigned char scsi_get_prot_type(struct scsi_cmnd *scmd)
{
return scmd->prot_type;
}
static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd)
{
struct request *rq = blk_mq_rq_from_pdu(scmd);
return t10_pi_ref_tag(rq);
}
static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd)
{
return scmd->device->sector_size;
}
static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd)
{
return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0;
}
static inline struct scatterlist *scsi_prot_sglist(struct scsi_cmnd *cmd)
{
return cmd->prot_sdb ? cmd->prot_sdb->table.sgl : NULL;
}
static inline struct scsi_data_buffer *scsi_prot(struct scsi_cmnd *cmd)
{
return cmd->prot_sdb;
}
#define scsi_for_each_prot_sg(cmd, sg, nseg, __i) \
for_each_sg(scsi_prot_sglist(cmd), sg, nseg, __i)
static inline void set_status_byte(struct scsi_cmnd *cmd, char status)
{
cmd->result = (cmd->result & 0xffffff00) | status;
}
static inline u8 get_status_byte(struct scsi_cmnd *cmd)
{
return cmd->result & 0xff;
}
static inline void set_host_byte(struct scsi_cmnd *cmd, char status)
{
cmd->result = (cmd->result & 0xff00ffff) | (status << 16);
}
static inline u8 get_host_byte(struct scsi_cmnd *cmd)
{
return (cmd->result >> 16) & 0xff;
}
/**
* scsi_msg_to_host_byte() - translate message byte
*
* Translate the SCSI parallel message byte to a matching
* host byte setting. A message of COMMAND_COMPLETE indicates
* a successful command execution, any other message indicate
* an error. As the messages themselves only have a meaning
* for the SCSI parallel protocol this function translates
* them into a matching host byte value for SCSI EH.
*/
static inline void scsi_msg_to_host_byte(struct scsi_cmnd *cmd, u8 msg)
{
switch (msg) {
case COMMAND_COMPLETE:
break;
case ABORT_TASK_SET:
set_host_byte(cmd, DID_ABORT);
break;
case TARGET_RESET:
set_host_byte(cmd, DID_RESET);
break;
default:
set_host_byte(cmd, DID_ERROR);
break;
}
}
static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd)
{
unsigned int xfer_len = scmd->sdb.length;
unsigned int prot_interval = scsi_prot_interval(scmd);
if (scmd->prot_flags & SCSI_PROT_TRANSFER_PI)
xfer_len += (xfer_len >> ilog2(prot_interval)) * 8;
return xfer_len;
}
extern void scsi_build_sense(struct scsi_cmnd *scmd, int desc,
u8 key, u8 asc, u8 ascq);
#endif /* _SCSI_SCSI_CMND_H */
/* inflate.c -- zlib decompression
* Copyright (C) 1995-2005 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Based on zlib 1.2.3 but modified for the Linux Kernel by
* Richard Purdie <richard@openedhand.com>
*
* Changes mainly for static instead of dynamic memory allocation
*
*/
#include <linux/zutil.h>
#include "inftrees.h"
#include "inflate.h"
#include "inffast.h"
#include "infutil.h"
/* architecture-specific bits */
#ifdef CONFIG_ZLIB_DFLTCC
# include "../zlib_dfltcc/dfltcc.h"
#else
#define INFLATE_RESET_HOOK(strm) do {} while (0)
#define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0)
#define INFLATE_NEED_UPDATEWINDOW(strm) 1
#define INFLATE_NEED_CHECKSUM(strm) 1
#endif
int zlib_inflate_workspacesize(void)
{
return sizeof(struct inflate_workspace);
}
int zlib_inflateReset(z_streamp strm)
{
struct inflate_state *state;
if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR;
state = (struct inflate_state *)strm->state;
strm->total_in = strm->total_out = state->total = 0;
strm->msg = NULL;
strm->adler = 1; /* to support ill-conceived Java test suite */
state->mode = HEAD;
state->last = 0;
state->havedict = 0;
state->dmax = 32768U;
state->hold = 0;
state->bits = 0;
state->lencode = state->distcode = state->next = state->codes;
/* Initialise Window */
state->wsize = 1U << state->wbits;
state->write = 0;
state->whave = 0;
INFLATE_RESET_HOOK(strm);
return Z_OK;
}
int zlib_inflateInit2(z_streamp strm, int windowBits)
{
struct inflate_state *state;
if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; /* in case we return an error */
state = &WS(strm)->inflate_state;
strm->state = (struct internal_state *)state;
if (windowBits < 0) {
state->wrap = 0;
windowBits = -windowBits;
}
else {
state->wrap = (windowBits >> 4) + 1;
}
if (windowBits < 8 || windowBits > 15) {
return Z_STREAM_ERROR;
}
state->wbits = (unsigned)windowBits;
#ifdef CONFIG_ZLIB_DFLTCC
/*
* DFLTCC requires the window to be page aligned.
* Thus, we overallocate and take the aligned portion of the buffer.
*/
state->window = PTR_ALIGN(&WS(strm)->working_window[0], PAGE_SIZE);
#else
state->window = &WS(strm)->working_window[0];
#endif
return zlib_inflateReset(strm);
}
/*
Return state with length and distance decoding tables and index sizes set to
fixed code decoding. This returns fixed tables from inffixed.h.
*/
static void zlib_fixedtables(struct inflate_state *state)
{
# include "inffixed.h"
state->lencode = lenfix;
state->lenbits = 9;
state->distcode = distfix;
state->distbits = 5;
}
/*
Update the window with the last wsize (normally 32K) bytes written before
returning. This is only called when a window is already in use, or when
output has been written during this inflate call, but the end of the deflate
stream has not been reached yet. It is also called to window dictionary data
when a dictionary is loaded.
Providing output buffers larger than 32K to inflate() should provide a speed
advantage, since only the last 32K of output is copied to the sliding window
upon return from inflate(), and since all distances after the first 32K of
output will fall in the output data, making match copies simpler and faster.
The advantage may be dependent on the size of the processor's data caches.
*/
static void zlib_updatewindow(z_streamp strm, unsigned out)
{
struct inflate_state *state;
unsigned copy, dist;
state = (struct inflate_state *)strm->state;
/* copy state->wsize or less output bytes into the circular window */
copy = out - strm->avail_out;
if (copy >= state->wsize) {
memcpy(state->window, strm->next_out - state->wsize, state->wsize);
state->write = 0;
state->whave = state->wsize;
}
else {
dist = state->wsize - state->write;
if (dist > copy) dist = copy;
memcpy(state->window + state->write, strm->next_out - copy, dist);
copy -= dist;
if (copy) {
memcpy(state->window, strm->next_out - copy, copy);
state->write = copy;
state->whave = state->wsize;
}
else {
state->write += dist; if (state->write == state->wsize) state->write = 0; if (state->whave < state->wsize) state->whave += dist;
}
}
}
/*
* At the end of a Deflate-compressed PPP packet, we expect to have seen
* a `stored' block type value but not the (zero) length bytes.
*/
/*
Returns true if inflate is currently at the end of a block generated by
Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP
implementation to provide an additional safety check. PPP uses
Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored
block. When decompressing, PPP checks that at the end of input packet,
inflate is waiting for these length bytes.
*/
static int zlib_inflateSyncPacket(z_streamp strm)
{
struct inflate_state *state;
if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR;
state = (struct inflate_state *)strm->state;
if (state->mode == STORED && state->bits == 0) { state->mode = TYPE;
return Z_OK;
}
return Z_DATA_ERROR;
}
/* Macros for inflate(): */
/* check function to use adler32() for zlib or crc32() for gzip */
#define UPDATE(check, buf, len) zlib_adler32(check, buf, len)
/* Load registers with state in inflate() for speed */
#define LOAD() \
do { \
put = strm->next_out; \
left = strm->avail_out; \
next = strm->next_in; \
have = strm->avail_in; \
hold = state->hold; \
bits = state->bits; \
} while (0)
/* Restore state from registers in inflate() */
#define RESTORE() \
do { \
strm->next_out = put; \
strm->avail_out = left; \
strm->next_in = next; \
strm->avail_in = have; \
state->hold = hold; \
state->bits = bits; \
} while (0)
/* Clear the input bit accumulator */
#define INITBITS() \
do { \
hold = 0; \
bits = 0; \
} while (0)
/* Get a byte of input into the bit accumulator, or return from inflate()
if there is no input available. */
#define PULLBYTE() \
do { \
if (have == 0) goto inf_leave; \
have--; \
hold += (unsigned long)(*next++) << bits; \
bits += 8; \
} while (0)
/* Assure that there are at least n bits in the bit accumulator. If there is
not enough available input to do that, then return from inflate(). */
#define NEEDBITS(n) \
do { \
while (bits < (unsigned)(n)) \
PULLBYTE(); \
} while (0)
/* Return the low n bits of the bit accumulator (n < 16) */
#define BITS(n) \
((unsigned)hold & ((1U << (n)) - 1))
/* Remove n bits from the bit accumulator */
#define DROPBITS(n) \
do { \
hold >>= (n); \
bits -= (unsigned)(n); \
} while (0)
/* Remove zero to seven bits as needed to go to a byte boundary */
#define BYTEBITS() \
do { \
hold >>= bits & 7; \
bits -= bits & 7; \
} while (0)
/*
inflate() uses a state machine to process as much input data and generate as
much output data as possible before returning. The state machine is
structured roughly as follows:
for (;;) switch (state) {
...
case STATEn:
if (not enough input data or output space to make progress)
return;
... make progress ...
state = STATEm;
break;
...
}
so when inflate() is called again, the same case is attempted again, and
if the appropriate resources are provided, the machine proceeds to the
next state. The NEEDBITS() macro is usually the way the state evaluates
whether it can proceed or should return. NEEDBITS() does the return if
the requested bits are not available. The typical use of the BITS macros
is:
NEEDBITS(n);
... do something with BITS(n) ...
DROPBITS(n);
where NEEDBITS(n) either returns from inflate() if there isn't enough
input left to load n bits into the accumulator, or it continues. BITS(n)
gives the low n bits in the accumulator. When done, DROPBITS(n) drops
the low n bits off the accumulator. INITBITS() clears the accumulator
and sets the number of available bits to zero. BYTEBITS() discards just
enough bits to put the accumulator on a byte boundary. After BYTEBITS()
and a NEEDBITS(8), then BITS(8) would return the next byte in the stream.
NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return
if there is no input available. The decoding of variable length codes uses
PULLBYTE() directly in order to pull just enough bytes to decode the next
code, and no more.
Some states loop until they get enough input, making sure that enough
state information is maintained to continue the loop where it left off
if NEEDBITS() returns in the loop. For example, want, need, and keep
would all have to actually be part of the saved state in case NEEDBITS()
returns:
case STATEw:
while (want < need) {
NEEDBITS(n);
keep[want++] = BITS(n);
DROPBITS(n);
}
state = STATEx;
case STATEx:
As shown above, if the next state is also the next case, then the break
is omitted.
A state may also return if there is not enough output space available to
complete that state. Those states are copying stored data, writing a
literal byte, and copying a matching string.
When returning, a "goto inf_leave" is used to update the total counters,
update the check value, and determine whether any progress has been made
during that inflate() call in order to return the proper return code.
Progress is defined as a change in either strm->avail_in or strm->avail_out.
When there is a window, goto inf_leave will update the window with the last
output written. If a goto inf_leave occurs in the middle of decompression
and there is no window currently, goto inf_leave will create one and copy
output to the window for the next call of inflate().
In this implementation, the flush parameter of inflate() only affects the
return code (per zlib.h). inflate() always writes as much as possible to
strm->next_out, given the space available and the provided input--the effect
documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers
the allocation of and copying into a sliding window until necessary, which
provides the effect documented in zlib.h for Z_FINISH when the entire input
stream available. So the only thing the flush parameter actually does is:
when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it
will return Z_BUF_ERROR if it has not reached the end of the stream.
*/
int zlib_inflate(z_streamp strm, int flush)
{
struct inflate_state *state;
const unsigned char *next; /* next input */
unsigned char *put; /* next output */
unsigned have, left; /* available input and output */
unsigned long hold; /* bit buffer */
unsigned bits; /* bits in bit buffer */
unsigned in, out; /* save starting available input and output */
unsigned copy; /* number of stored or match bytes to copy */
unsigned char *from; /* where to copy match bytes from */
code this; /* current decoding table entry */
code last; /* parent table entry */
unsigned len; /* length to copy for repeats, bits to drop */
int ret; /* return code */
static const unsigned short order[19] = /* permutation of code lengths */
{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
/* Do not check for strm->next_out == NULL here as ppc zImage
inflates to strm->next_out = 0 */
if (strm == NULL || strm->state == NULL || (strm->next_in == NULL && strm->avail_in != 0))
return Z_STREAM_ERROR;
state = (struct inflate_state *)strm->state;
if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ LOAD();
in = have;
out = left;
ret = Z_OK;
for (;;)
switch (state->mode) {
case HEAD:
if (state->wrap == 0) { state->mode = TYPEDO; break;
}
NEEDBITS(16);
if (
((BITS(8) << 8) + (hold >> 8)) % 31) { strm->msg = (char *)"incorrect header check";
state->mode = BAD;
break;
}
if (BITS(4) != Z_DEFLATED) { strm->msg = (char *)"unknown compression method";
state->mode = BAD;
break;
}
DROPBITS(4);
len = BITS(4) + 8;
if (len > state->wbits) {
strm->msg = (char *)"invalid window size";
state->mode = BAD;
break;
}
state->dmax = 1U << len;
strm->adler = state->check = zlib_adler32(0L, NULL, 0);
state->mode = hold & 0x200 ? DICTID : TYPE;
INITBITS();
break;
case DICTID:
NEEDBITS(32); strm->adler = state->check = REVERSE(hold);
INITBITS();
state->mode = DICT;
fallthrough;
case DICT:
if (state->havedict == 0) { RESTORE();
return Z_NEED_DICT;
}
strm->adler = state->check = zlib_adler32(0L, NULL, 0);
state->mode = TYPE;
fallthrough;
case TYPE:
if (flush == Z_BLOCK) goto inf_leave;
fallthrough;
case TYPEDO:
INFLATE_TYPEDO_HOOK(strm, flush);
if (state->last) { BYTEBITS();
state->mode = CHECK;
break;
}
NEEDBITS(3); state->last = BITS(1);
DROPBITS(1);
switch (BITS(2)) {
case 0: /* stored block */
state->mode = STORED;
break;
case 1: /* fixed block */
zlib_fixedtables(state);
state->mode = LEN; /* decode codes */
break;
case 2: /* dynamic block */
state->mode = TABLE;
break;
case 3:
strm->msg = (char *)"invalid block type";
state->mode = BAD;
}
DROPBITS(2);
break;
case STORED:
BYTEBITS(); /* go to byte boundary */ NEEDBITS(32); if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { strm->msg = (char *)"invalid stored block lengths";
state->mode = BAD;
break;
}
state->length = (unsigned)hold & 0xffff;
INITBITS();
state->mode = COPY;
fallthrough;
case COPY:
copy = state->length;
if (copy) { if (copy > have) copy = have;
if (copy > left) copy = left;
if (copy == 0) goto inf_leave;
memcpy(put, next, copy);
have -= copy;
next += copy;
left -= copy;
put += copy;
state->length -= copy;
break;
}
state->mode = TYPE;
break;
case TABLE:
NEEDBITS(14); state->nlen = BITS(5) + 257;
DROPBITS(5);
state->ndist = BITS(5) + 1;
DROPBITS(5);
state->ncode = BITS(4) + 4;
DROPBITS(4);
#ifndef PKZIP_BUG_WORKAROUND
if (state->nlen > 286 || state->ndist > 30) { strm->msg = (char *)"too many length or distance symbols";
state->mode = BAD;
break;
}
#endif
state->have = 0;
state->mode = LENLENS;
fallthrough;
case LENLENS:
while (state->have < state->ncode) { NEEDBITS(3); state->lens[order[state->have++]] = (unsigned short)BITS(3);
DROPBITS(3);
}
while (state->have < 19) state->lens[order[state->have++]] = 0; state->next = state->codes;
state->lencode = (code const *)(state->next);
state->lenbits = 7;
ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next),
&(state->lenbits), state->work);
if (ret) {
strm->msg = (char *)"invalid code lengths set";
state->mode = BAD;
break;
}
state->have = 0;
state->mode = CODELENS;
fallthrough;
case CODELENS:
while (state->have < state->nlen + state->ndist) {
for (;;) {
this = state->lencode[BITS(state->lenbits)];
if ((unsigned)(this.bits) <= bits) break;
PULLBYTE();
}
if (this.val < 16) {
NEEDBITS(this.bits);
DROPBITS(this.bits);
state->lens[state->have++] = this.val;
}
else {
if (this.val == 16) { NEEDBITS(this.bits + 2); DROPBITS(this.bits);
if (state->have == 0) {
strm->msg = (char *)"invalid bit length repeat";
state->mode = BAD;
break;
}
len = state->lens[state->have - 1];
copy = 3 + BITS(2);
DROPBITS(2);
}
else if (this.val == 17) { NEEDBITS(this.bits + 3); DROPBITS(this.bits);
len = 0;
copy = 3 + BITS(3);
DROPBITS(3);
}
else {
NEEDBITS(this.bits + 7); DROPBITS(this.bits);
len = 0;
copy = 11 + BITS(7);
DROPBITS(7);
}
if (state->have + copy > state->nlen + state->ndist) { strm->msg = (char *)"invalid bit length repeat";
state->mode = BAD;
break;
}
while (copy--) state->lens[state->have++] = (unsigned short)len;
}
}
/* handle error breaks in while */
if (state->mode == BAD) break;
/* build code tables */
state->next = state->codes;
state->lencode = (code const *)(state->next);
state->lenbits = 9;
ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next),
&(state->lenbits), state->work);
if (ret) {
strm->msg = (char *)"invalid literal/lengths set";
state->mode = BAD;
break;
}
state->distcode = (code const *)(state->next);
state->distbits = 6;
ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist,
&(state->next), &(state->distbits), state->work);
if (ret) {
strm->msg = (char *)"invalid distances set";
state->mode = BAD;
break;
}
state->mode = LEN;
fallthrough;
case LEN:
if (have >= 6 && left >= 258) { RESTORE();
inflate_fast(strm, out);
LOAD();
break;
}
for (;;) {
this = state->lencode[BITS(state->lenbits)];
if ((unsigned)(this.bits) <= bits) break;
PULLBYTE();
}
if (this.op && (this.op & 0xf0) == 0) {
last = this;
for (;;) {
this = state->lencode[last.val +
(BITS(last.bits + last.op) >> last.bits)];
if ((unsigned)(last.bits + this.bits) <= bits) break;
PULLBYTE();
}
DROPBITS(last.bits);
}
DROPBITS(this.bits);
state->length = (unsigned)this.val;
if ((int)(this.op) == 0) {
state->mode = LIT;
break;
}
if (this.op & 32) { state->mode = TYPE;
break;
}
if (this.op & 64) { strm->msg = (char *)"invalid literal/length code";
state->mode = BAD;
break;
}
state->extra = (unsigned)(this.op) & 15;
state->mode = LENEXT;
fallthrough;
case LENEXT:
if (state->extra) { NEEDBITS(state->extra); state->length += BITS(state->extra);
DROPBITS(state->extra);
}
state->mode = DIST;
fallthrough;
case DIST:
for (;;) {
this = state->distcode[BITS(state->distbits)];
if ((unsigned)(this.bits) <= bits) break;
PULLBYTE();
}
if ((this.op & 0xf0) == 0) {
last = this;
for (;;) {
this = state->distcode[last.val +
(BITS(last.bits + last.op) >> last.bits)];
if ((unsigned)(last.bits + this.bits) <= bits) break;
PULLBYTE();
}
DROPBITS(last.bits);
}
DROPBITS(this.bits);
if (this.op & 64) {
strm->msg = (char *)"invalid distance code";
state->mode = BAD;
break;
}
state->offset = (unsigned)this.val;
state->extra = (unsigned)(this.op) & 15;
state->mode = DISTEXT;
fallthrough;
case DISTEXT:
if (state->extra) { NEEDBITS(state->extra); state->offset += BITS(state->extra);
DROPBITS(state->extra);
}
#ifdef INFLATE_STRICT
if (state->offset > state->dmax) {
strm->msg = (char *)"invalid distance too far back";
state->mode = BAD;
break;
}
#endif
if (state->offset > state->whave + out - left) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD;
break;
}
state->mode = MATCH;
fallthrough;
case MATCH:
if (left == 0) goto inf_leave; copy = out - left;
if (state->offset > copy) { /* copy from window */
copy = state->offset - copy;
if (copy > state->write) {
copy -= state->write;
from = state->window + (state->wsize - copy);
}
else
from = state->window + (state->write - copy); if (copy > state->length) copy = state->length;
}
else { /* copy from output */
from = put - state->offset;
copy = state->length;
}
if (copy > left) copy = left;
left -= copy;
state->length -= copy;
do {
*put++ = *from++;
} while (--copy);
if (state->length == 0) state->mode = LEN;
break;
case LIT:
if (left == 0) goto inf_leave; *put++ = (unsigned char)(state->length);
left--;
state->mode = LEN;
break;
case CHECK:
if (state->wrap) { NEEDBITS(32); out -= left;
strm->total_out += out;
state->total += out;
if (INFLATE_NEED_CHECKSUM(strm) && out)
strm->adler = state->check = UPDATE(state->check, put - out, out);
out = left;
if ((
REVERSE(hold)) != state->check) { strm->msg = (char *)"incorrect data check";
state->mode = BAD;
break;
}
INITBITS();
}
state->mode = DONE;
fallthrough;
case DONE:
ret = Z_STREAM_END;
goto inf_leave;
case BAD:
ret = Z_DATA_ERROR;
goto inf_leave;
case MEM:
return Z_MEM_ERROR;
case SYNC:
default:
return Z_STREAM_ERROR;
}
/*
Return from inflate(), updating the total counts and the check value.
If there was no progress during the inflate() call, return a buffer
error. Call zlib_updatewindow() to create and/or update the window state.
*/
inf_leave:
RESTORE();
if (INFLATE_NEED_UPDATEWINDOW(strm) &&
(state->wsize || (state->mode < CHECK && out != strm->avail_out))) zlib_updatewindow(strm, out); in -= strm->avail_in;
out -= strm->avail_out;
strm->total_in += in;
strm->total_out += out;
state->total += out;
if (INFLATE_NEED_CHECKSUM(strm) && state->wrap && out) strm->adler = state->check = UPDATE(state->check, strm->next_out - out, out); strm->data_type = state->bits + (state->last ? 64 : 0) + (state->mode == TYPE ? 128 : 0); if (flush == Z_PACKET_FLUSH && ret == Z_OK && strm->avail_out != 0 && strm->avail_in == 0)
return zlib_inflateSyncPacket(strm);
if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK)
ret = Z_BUF_ERROR;
return ret;
}
int zlib_inflateEnd(z_streamp strm)
{
if (strm == NULL || strm->state == NULL)
return Z_STREAM_ERROR;
return Z_OK;
}
/*
* This subroutine adds the data at next_in/avail_in to the output history
* without performing any output. The output buffer must be "caught up";
* i.e. no pending output but this should always be the case. The state must
* be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit,
* the output will also be caught up, and the checksum will have been updated
* if need be.
*/
int zlib_inflateIncomp(z_stream *z)
{
struct inflate_state *state = (struct inflate_state *)z->state;
Byte *saved_no = z->next_out;
uInt saved_ao = z->avail_out;
if (state->mode != TYPE && state->mode != HEAD)
return Z_DATA_ERROR;
/* Setup some variables to allow misuse of updateWindow */
z->avail_out = 0;
z->next_out = (unsigned char*)z->next_in + z->avail_in;
zlib_updatewindow(z, z->avail_in);
/* Restore saved variables */
z->avail_out = saved_ao;
z->next_out = saved_no;
z->adler = state->check =
UPDATE(state->check, z->next_in, z->avail_in);
z->total_out += z->avail_in;
z->total_in += z->avail_in;
z->next_in += z->avail_in;
state->total += z->avail_in;
z->avail_in = 0;
return Z_OK;
}
// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/cpumask.h>
#include <linux/export.h>
#include <linux/memblock.h>
#include <linux/numa.h>
/**
* cpumask_next - get the next cpu in a cpumask
* @n: the cpu prior to the place to search (ie. return will be > @n)
* @srcp: the cpumask pointer
*
* Returns >= nr_cpu_ids if no further cpus set.
*/
unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
/* -1 is a legal arg here. */
if (n != -1)
cpumask_check(n);
return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
}
EXPORT_SYMBOL(cpumask_next);
/**
* cpumask_next_and - get the next cpu in *src1p & *src2p
* @n: the cpu prior to the place to search (ie. return will be > @n)
* @src1p: the first cpumask pointer
* @src2p: the second cpumask pointer
*
* Returns >= nr_cpu_ids if no further cpus set in both.
*/
int cpumask_next_and(int n, const struct cpumask *src1p,
const struct cpumask *src2p)
{
/* -1 is a legal arg here. */
if (n != -1)
cpumask_check(n);
return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits, n + 1);
}
EXPORT_SYMBOL(cpumask_next_and);
/**
* cpumask_any_but - return a "random" in a cpumask, but not this one.
* @mask: the cpumask to search
* @cpu: the cpu to ignore.
*
* Often used to find any cpu but smp_processor_id() in a mask.
* Returns >= nr_cpu_ids if no cpus set.
*/
int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
{
unsigned int i;
cpumask_check(cpu);
for_each_cpu(i, mask) if (i != cpu)
break;
return i;
}
EXPORT_SYMBOL(cpumask_any_but);
/**
* cpumask_next_wrap - helper to implement for_each_cpu_wrap
* @n: the cpu prior to the place to search
* @mask: the cpumask pointer
* @start: the start point of the iteration
* @wrap: assume @n crossing @start terminates the iteration
*
* Returns >= nr_cpu_ids on completion
*
* Note: the @wrap argument is required for the start condition when
* we cannot assume @start is set in @mask.
*/
int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
{
int next;
again:
next = cpumask_next(n, mask);
if (wrap && n < start && next >= start) {
return nr_cpumask_bits;
} else if (next >= nr_cpumask_bits) {
wrap = true;
n = -1;
goto again;
}
return next;
}
EXPORT_SYMBOL(cpumask_next_wrap);
/* These are not inline because of header tangles. */
#ifdef CONFIG_CPUMASK_OFFSTACK
/**
* alloc_cpumask_var_node - allocate a struct cpumask on a given node
* @mask: pointer to cpumask_var_t where the cpumask is returned
* @flags: GFP_ flags
*
* Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
* a nop returning a constant 1 (in <linux/cpumask.h>)
* Returns TRUE if memory allocation succeeded, FALSE otherwise.
*
* In addition, mask will be NULL if this fails. Note that gcc is
* usually smart enough to know that mask can never be NULL if
* CONFIG_CPUMASK_OFFSTACK=n, so does code elimination in that case
* too.
*/
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
*mask = kmalloc_node(cpumask_size(), flags, node);
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
if (!*mask) {
printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
dump_stack();
}
#endif
return *mask != NULL;
}
EXPORT_SYMBOL(alloc_cpumask_var_node);
bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node);
}
EXPORT_SYMBOL(zalloc_cpumask_var_node);
/**
* alloc_cpumask_var - allocate a struct cpumask
* @mask: pointer to cpumask_var_t where the cpumask is returned
* @flags: GFP_ flags
*
* Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
* a nop returning a constant 1 (in <linux/cpumask.h>).
*
* See alloc_cpumask_var_node.
*/
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE);
}
EXPORT_SYMBOL(alloc_cpumask_var);
bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
return alloc_cpumask_var(mask, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(zalloc_cpumask_var);
/**
* alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena.
* @mask: pointer to cpumask_var_t where the cpumask is returned
*
* Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
* a nop (in <linux/cpumask.h>).
* Either returns an allocated (zero-filled) cpumask, or causes the
* system to panic.
*/
void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
*mask = memblock_alloc(cpumask_size(), SMP_CACHE_BYTES);
if (!*mask)
panic("%s: Failed to allocate %u bytes\n", __func__,
cpumask_size());
}
/**
* free_cpumask_var - frees memory allocated for a struct cpumask.
* @mask: cpumask to free
*
* This is safe on a NULL mask.
*/
void free_cpumask_var(cpumask_var_t mask)
{
kfree(mask);
}
EXPORT_SYMBOL(free_cpumask_var);
/**
* free_bootmem_cpumask_var - frees result of alloc_bootmem_cpumask_var
* @mask: cpumask to free
*/
void __init free_bootmem_cpumask_var(cpumask_var_t mask)
{
memblock_free_early(__pa(mask), cpumask_size());
}
#endif
/**
* cpumask_local_spread - select the i'th cpu with local numa cpu's first
* @i: index number
* @node: local numa_node
*
* This function selects an online CPU according to a numa aware policy;
* local cpus are returned first, followed by non-local ones, then it
* wraps around.
*
* It's not very efficient, but useful for setup.
*/
unsigned int cpumask_local_spread(unsigned int i, int node)
{
int cpu;
/* Wrap: we always want a cpu. */
i %= num_online_cpus();
if (node == NUMA_NO_NODE) {
for_each_cpu(cpu, cpu_online_mask)
if (i-- == 0)
return cpu;
} else {
/* NUMA first. */
for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
if (i-- == 0)
return cpu;
for_each_cpu(cpu, cpu_online_mask) {
/* Skip NUMA nodes, done above. */
if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
continue;
if (i-- == 0)
return cpu;
}
}
BUG();
}
EXPORT_SYMBOL(cpumask_local_spread);
static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);
/**
* Returns an arbitrary cpu within srcp1 & srcp2.
*
* Iterated calls using the same srcp1 and srcp2 will be distributed within
* their intersection.
*
* Returns >= nr_cpu_ids if the intersection is empty.
*/
int cpumask_any_and_distribute(const struct cpumask *src1p,
const struct cpumask *src2p)
{
int next, prev;
/* NOTE: our first selection will skip 0. */
prev = __this_cpu_read(distribute_cpu_mask_prev);
next = cpumask_next_and(prev, src1p, src2p);
if (next >= nr_cpu_ids)
next = cpumask_first_and(src1p, src2p);
if (next < nr_cpu_ids)
__this_cpu_write(distribute_cpu_mask_prev, next);
return next;
}
EXPORT_SYMBOL(cpumask_any_and_distribute);
int cpumask_any_distribute(const struct cpumask *srcp)
{
int next, prev;
/* NOTE: our first selection will skip 0. */
prev = __this_cpu_read(distribute_cpu_mask_prev);
next = cpumask_next(prev, srcp);
if (next >= nr_cpu_ids)
next = cpumask_first(srcp);
if (next < nr_cpu_ids)
__this_cpu_write(distribute_cpu_mask_prev, next);
return next;
}
EXPORT_SYMBOL(cpumask_any_distribute);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_WORD_AT_A_TIME_H
#define _ASM_WORD_AT_A_TIME_H
#include <linux/kernel.h>
/*
* This is largely generic for little-endian machines, but the
* optimal byte mask counting is probably going to be something
* that is architecture-specific. If you have a reliably fast
* bit count instruction, that might be better than the multiply
* and shift, for example.
*/
struct word_at_a_time {
const unsigned long one_bits, high_bits;
};
#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
#ifdef CONFIG_64BIT
/*
* Jan Achrenius on G+: microoptimized version of
* the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
* that works for the bytemasks without having to
* mask them first.
*/
static inline long count_masked_bytes(unsigned long mask)
{
return mask*0x0001020304050608ul >> 56;
}
#else /* 32-bit case */
/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
static inline long count_masked_bytes(long mask)
{
/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
long a = (0x0ff0001+mask) >> 23;
/* Fix the 1 for 00 case */
return a & mask;
}
#endif
/* Return nonzero if it has a zero */
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
*bits = mask;
return mask;
}
static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
{
return bits;
}
static inline unsigned long create_zero_mask(unsigned long bits)
{
bits = (bits - 1) & ~bits;
return bits >> 7;
}
/* The mask we created is directly usable as a bytemask */
#define zero_bytemask(mask) (mask)
static inline unsigned long find_zero(unsigned long mask)
{
return count_masked_bytes(mask);
}
/*
* Load an unaligned word from kernel space.
*
* In the (very unlikely) case of the word being a page-crosser
* and the next page not being mapped, take the exception and
* return zeroes in the non-existing part.
*/
static inline unsigned long load_unaligned_zeropad(const void *addr)
{
unsigned long ret, dummy;
asm(
"1:\tmov %2,%0\n"
"2:\n"
".section .fixup,\"ax\"\n"
"3:\t"
"lea %2,%1\n\t"
"and %3,%1\n\t"
"mov (%1),%0\n\t"
"leal %2,%%ecx\n\t"
"andl %4,%%ecx\n\t"
"shll $3,%%ecx\n\t"
"shr %%cl,%0\n\t"
"jmp 2b\n"
".previous\n"
_ASM_EXTABLE(1b, 3b)
:"=&r" (ret),"=&c" (dummy)
:"m" (*(unsigned long *)addr),
"i" (-sizeof(unsigned long)),
"i" (sizeof(unsigned long)-1));
return ret;
}
#endif /* _ASM_WORD_AT_A_TIME_H */
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
#endif
#ifdef CONFIG_HIGHPTE
#define PGTABLE_HIGHMEM __GFP_HIGHMEM
#else
#define PGTABLE_HIGHMEM 0
#endif
#ifndef CONFIG_PARAVIRT
static inline
void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
tlb_remove_page(tlb, table);
}
#endif
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
pgtable_t pte_alloc_one(struct mm_struct *mm)
{
return __pte_alloc_one(mm, __userpte_alloc_gfp);
}
static int __init setup_userpte(char *arg)
{
if (!arg)
return -EINVAL;
/*
* "userpte=nohigh" disables allocation of user pagetables in
* high memory.
*/
if (strcmp(arg, "nohigh") == 0)
__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
else
return -EINVAL;
return 0;
}
early_param("userpte", setup_userpte);
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_pte_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
paravirt_tlb_remove_table(tlb, pte);
}
#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
struct page *page = virt_to_page(pmd);
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
/*
* NOTE! For PAE, any changes to the top page-directory-pointer-table
* entries need a full cr3 reload to flush.
*/
#ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
#endif
pgtable_pmd_page_dtor(page);
paravirt_tlb_remove_table(tlb, page);
}
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
paravirt_tlb_remove_table(tlb, virt_to_page(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
static inline void pgd_list_add(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
list_add(&page->lru, &pgd_list);
}
static inline void pgd_list_del(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
list_del(&page->lru);
}
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
#define MAX_UNSHARED_PTRS_PER_PGD \
max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
virt_to_page(pgd)->pt_mm = mm;
}
struct mm_struct *pgd_page_get_mm(struct page *page)
{
return page->pt_mm;
}
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
if (CONFIG_PGTABLE_LEVELS == 2 ||
(CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
CONFIG_PGTABLE_LEVELS >= 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
}
/* list required to sync kernel mapping updates */
if (!SHARED_KERNEL_PMD) {
pgd_set_mm(pgd, mm);
pgd_list_add(pgd);
}
}
static void pgd_dtor(pgd_t *pgd)
{
if (SHARED_KERNEL_PMD)
return;
spin_lock(&pgd_lock);
pgd_list_del(pgd);
spin_unlock(&pgd_lock);
}
/*
* List of all pgd's needed for non-PAE so it can invalidate entries
* in both cached and uncached pgd's; not needed for PAE since the
* kernel pmd is shared. If PAE were not to share the pmd a similar
* tactic would be needed. This is essentially codepath-based locking
* against pageattr.c; it is the unique case in which a valid change
* of kernel pagetables can't be lazily synchronized by vmalloc faults.
* vmalloc faults work because attached pagetables are never freed.
* -- nyc
*/
#ifdef CONFIG_X86_PAE
/*
* In PAE mode, we need to do a cr3 reload (=tlb flush) when
* updating the top-level pagetable entries to guarantee the
* processor notices the update. Since this is expensive, and
* all 4 top-level entries are used almost immediately in a
* new process's life, we just pre-populate them here.
*
* Also, if we're in a paravirt environment where the kernel pmd is
* not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
* and initialize the kernel pmds here.
*/
#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
#define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD
/*
* We allocate separate PMDs for the kernel part of the user page-table
* when PTI is enabled. We need them to map the per-process LDT into the
* user-space page-table.
*/
#define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \
KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
/* Note: almost everything apart from _PAGE_PRESENT is
reserved at the pmd (PDPT) level. */
set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
/*
* According to Intel App note "TLBs, Paging-Structure Caches,
* and Their Invalidation", April 2007, document 317080-001,
* section 8.1: in PAE mode we explicitly have to flush the
* TLB via cr3 if the top-level pgd is changed...
*/
flush_tlb_mm(mm);
}
#else /* !CONFIG_X86_PAE */
/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS 0
#define MAX_PREALLOCATED_PMDS 0
#define PREALLOCATED_USER_PMDS 0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif /* CONFIG_X86_PAE */
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
int i;
for (i = 0; i < count; i++)
if (pmds[i]) {
pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
free_page((unsigned long)pmds[i]);
mm_dec_nr_pmds(mm);
}
}
static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
int i;
bool failed = false;
gfp_t gfp = GFP_PGTABLE_USER;
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
for (i = 0; i < count; i++) {
pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
if (!pmd)
failed = true;
if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
free_page((unsigned long)pmd);
pmd = NULL;
failed = true;
}
if (pmd)
mm_inc_nr_pmds(mm);
pmds[i] = pmd;
}
if (failed) {
free_pmds(mm, pmds, count);
return -ENOMEM;
}
return 0;
}
/*
* Mop up any pmd pages which may still be attached to the pgd.
* Normally they will be freed by munmap/exit_mmap, but any pmd we
* preallocate which never got a corresponding vma will need to be
* freed manually.
*/
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
pgd_t pgd = *pgdp;
if (pgd_val(pgd) != 0) {
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
pgd_clear(pgdp);
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
mm_dec_nr_pmds(mm);
}
}
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
int i;
for (i = 0; i < PREALLOCATED_PMDS; i++)
mop_up_one_pmd(mm, &pgdp[i]);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
if (!boot_cpu_has(X86_FEATURE_PTI))
return;
pgdp = kernel_to_user_pgdp(pgdp);
for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
p4d_t *p4d;
pud_t *pud;
int i;
if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
return;
p4d = p4d_offset(pgd, 0);
pud = pud_offset(p4d, 0);
for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
pmd_t *pmd = pmds[i];
if (i >= KERNEL_PGD_BOUNDARY)
memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
sizeof(pmd_t) * PTRS_PER_PMD);
pud_populate(mm, pud, pmd);
}
}
#ifdef CONFIG_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
pgd_t *k_pgd, pmd_t *pmds[])
{
pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
p4d_t *u_p4d;
pud_t *u_pud;
int i;
u_p4d = p4d_offset(u_pgd, 0);
u_pud = pud_offset(u_p4d, 0);
s_pgd += KERNEL_PGD_BOUNDARY;
u_pud += KERNEL_PGD_BOUNDARY;
for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
pmd_t *pmd = pmds[i];
memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
sizeof(pmd_t) * PTRS_PER_PMD);
pud_populate(mm, u_pud, pmd);
}
}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif
/*
* Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
* assumes that pgd should be in one page.
*
* But kernel with PAE paging that is not running as a Xen domain
* only needs to allocate 32 bytes for pgd instead of one page.
*/
#ifdef CONFIG_X86_PAE
#include <linux/slab.h>
#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
#define PGD_ALIGN 32
static struct kmem_cache *pgd_cache;
void __init pgtable_cache_init(void)
{
/*
* When PAE kernel is running as a Xen domain, it does not use
* shared kernel pmd. And this requires a whole page for pgd.
*/
if (!SHARED_KERNEL_PMD)
return;
/*
* when PAE kernel is not running as a Xen domain, it uses
* shared kernel pmd. Shared kernel pmd does not require a whole
* page for pgd. We are able to just allocate a 32-byte for pgd.
* During boot time, we create a 32-byte slab for pgd table allocation.
*/
pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
SLAB_PANIC, NULL);
}
static inline pgd_t *_pgd_alloc(void)
{
/*
* If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain.
* We allocate one page for pgd.
*/
if (!SHARED_KERNEL_PMD)
return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
PGD_ALLOCATION_ORDER);
/*
* Now PAE kernel is not running as a Xen domain. We can allocate
* a 32-byte slab for pgd to save memory space.
*/
return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
}
static inline void _pgd_free(pgd_t *pgd)
{
if (!SHARED_KERNEL_PMD)
free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
else
kmem_cache_free(pgd_cache, pgd);
}
#else
static inline pgd_t *_pgd_alloc(void)
{
return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
pmd_t *pmds[MAX_PREALLOCATED_PMDS];
pgd = _pgd_alloc();
if (pgd == NULL)
goto out;
mm->pgd = pgd;
if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
goto out_free_pgd;
if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
goto out_free_pmds;
if (paravirt_pgd_alloc(mm) != 0)
goto out_free_user_pmds;
/*
* Make sure that pre-populating the pmds is atomic with
* respect to anything walking the pgd_list, so that they
* never see a partially populated pgd.
*/
spin_lock(&pgd_lock);
pgd_ctor(mm, pgd);
pgd_prepopulate_pmd(mm, pgd, pmds);
pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
spin_unlock(&pgd_lock);
return pgd;
out_free_user_pmds:
free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
_pgd_free(pgd);
out:
return NULL;
}
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
pgd_mop_up_pmds(mm, pgd);
pgd_dtor(pgd);
paravirt_pgd_free(mm, pgd);
_pgd_free(pgd);
}
/*
* Used to set accessed or dirty bits in the page table entries
* on other architectures. On x86, the accessed and dirty bits
* are tracked by hardware. However, do_wp_page calls this function
* to also make the pte writeable at the same time the dirty bit is
* set. In that case we do actually need to write the PTE.
*/
int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
{
int changed = !pte_same(*ptep, entry); if (changed && dirty)
set_pte(ptep, entry);
return changed;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
pmd_t entry, int dirty)
{
int changed = !pmd_same(*pmdp, entry);
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
if (changed && dirty) {
set_pmd(pmdp, entry);
/*
* We had a write-protection fault here and changed the pmd
* to to more permissive. No need to flush the TLB for that,
* #PF is architecturally guaranteed to do that and in the
* worst-case we'll generate a spurious fault.
*/
}
return changed;
}
int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
pud_t *pudp, pud_t entry, int dirty)
{
int changed = !pud_same(*pudp, entry);
VM_BUG_ON(address & ~HPAGE_PUD_MASK);
if (changed && dirty) {
set_pud(pudp, entry);
/*
* We had a write-protection fault here and changed the pud
* to to more permissive. No need to flush the TLB for that,
* #PF is architecturally guaranteed to do that and in the
* worst-case we'll generate a spurious fault.
*/
}
return changed;
}
#endif
int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
int ret = 0;
if (pte_young(*ptep))
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *) &ptep->pte);
return ret;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
int ret = 0;
if (pmd_young(*pmdp))
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *)pmdp);
return ret;
}
int pudp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp)
{
int ret = 0;
if (pud_young(*pudp))
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *)pudp);
return ret;
}
#endif
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
/*
* On x86 CPUs, clearing the accessed bit without a TLB flush
* doesn't cause data corruption. [ It could cause incorrect
* page aging and the (mistaken) reclaim of hot pages, but the
* chance of that should be relatively low. ]
*
* So as a performance optimization don't flush the TLB when
* clearing the accessed bit, it will eventually be flushed by
* a context switch or a VM operation anyway. [ In the rare
* event of it not getting flushed for a long time the delay
* shouldn't really matter because there's no real memory
* pressure for swapout to react to. ]
*/
return ptep_test_and_clear_young(vma, address, ptep);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
int young;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
young = pmdp_test_and_clear_young(vma, address, pmdp);
if (young)
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return young;
}
#endif
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
*
* Can be used to relocate the fixmap area and poke a hole in the top
* of kernel address space to make room for a hypervisor.
*/
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
BUG_ON(fixmaps_set > 0);
__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
-reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}
int fixmaps_set;
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
unsigned long address = __fix_to_virt(idx);
#ifdef CONFIG_X86_64
/*
* Ensure that the static initial page tables are covering the
* fixmap completely.
*/
BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
(FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif
if (idx >= __end_of_fixed_addresses) {
BUG();
return;
}
set_pte_vaddr(address, pte);
fixmaps_set++;
}
void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
phys_addr_t phys, pgprot_t flags)
{
/* Sanitize 'prot' against any unsupported bits: */
pgprot_val(flags) &= __default_kernel_pte_mask;
__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#ifdef CONFIG_X86_5LEVEL
/**
* p4d_set_huge - setup kernel P4D mapping
*
* No 512GB pages yet -- always return 0
*/
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
return 0;
}
/**
* p4d_clear_huge - clear kernel P4D mapping when it is set
*
* No 512GB pages yet -- always return 0
*/
int p4d_clear_huge(p4d_t *p4d)
{
return 0;
}
#endif
/**
* pud_set_huge - setup kernel PUD mapping
*
* MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
* function sets up a huge page only if any of the following conditions are met:
*
* - MTRRs are disabled, or
*
* - MTRRs are enabled and the range is completely covered by a single MTRR, or
*
* - MTRRs are enabled and the corresponding MTRR memory type is WB, which
* has no effect on the requested PAT memory type.
*
* Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
* page mapping attempt fails.
*
* Returns 1 on success and 0 on failure.
*/
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
u8 mtrr, uniform;
mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
(mtrr != MTRR_TYPE_WRBACK))
return 0;
/* Bail out if we are we on a populated non-leaf entry: */
if (pud_present(*pud) && !pud_huge(*pud))
return 0;
set_pte((pte_t *)pud, pfn_pte(
(u64)addr >> PAGE_SHIFT,
__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
return 1;
}
/**
* pmd_set_huge - setup kernel PMD mapping
*
* See text over pud_set_huge() above.
*
* Returns 1 on success and 0 on failure.
*/
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
u8 mtrr, uniform;
mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
(mtrr != MTRR_TYPE_WRBACK)) {
pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
__func__, addr, addr + PMD_SIZE);
return 0;
}
/* Bail out if we are we on a populated non-leaf entry: */
if (pmd_present(*pmd) && !pmd_huge(*pmd))
return 0;
set_pte((pte_t *)pmd, pfn_pte(
(u64)addr >> PAGE_SHIFT,
__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
return 1;
}
/**
* pud_clear_huge - clear kernel PUD mapping when it is set
*
* Returns 1 on success and 0 on failure (no PUD map is found).
*/
int pud_clear_huge(pud_t *pud)
{ if (pud_large(*pud)) {
pud_clear(pud);
return 1;
}
return 0;
}
/**
* pmd_clear_huge - clear kernel PMD mapping when it is set
*
* Returns 1 on success and 0 on failure (no PMD map is found).
*/
int pmd_clear_huge(pmd_t *pmd)
{
if (pmd_large(*pmd)) {
pmd_clear(pmd);
return 1;
}
return 0;
}
#ifdef CONFIG_X86_64
/**
* pud_free_pmd_page - Clear pud entry and free pmd page.
* @pud: Pointer to a PUD.
* @addr: Virtual address associated with pud.
*
* Context: The pud range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*
* NOTE: Callers must allow a single page allocation.
*/
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
pmd_t *pmd, *pmd_sv;
pte_t *pte;
int i;
pmd = pud_pgtable(*pud);
pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
if (!pmd_sv)
return 0;
for (i = 0; i < PTRS_PER_PMD; i++) {
pmd_sv[i] = pmd[i];
if (!pmd_none(pmd[i]))
pmd_clear(&pmd[i]);
}
pud_clear(pud);
/* INVLPG to clear all paging-structure caches */
flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
for (i = 0; i < PTRS_PER_PMD; i++) {
if (!pmd_none(pmd_sv[i])) {
pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
free_page((unsigned long)pte);
}
}
free_page((unsigned long)pmd_sv);
pgtable_pmd_page_dtor(virt_to_page(pmd));
free_page((unsigned long)pmd);
return 1;
}
/**
* pmd_free_pte_page - Clear pmd entry and free pte page.
* @pmd: Pointer to a PMD.
* @addr: Virtual address associated with pmd.
*
* Context: The pmd range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*/
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
pte = (pte_t *)pmd_page_vaddr(*pmd);
pmd_clear(pmd);
/* INVLPG to clear all paging-structure caches */
flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
free_page((unsigned long)pte);
return 1;
}
#else /* !CONFIG_X86_64 */
/*
* Disable free page handling on x86-PAE. This assures that ioremap()
* does not update sync'd pmd entries. See vmalloc_sync_one().
*/
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
return pmd_none(*pmd);
}
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2011 STRATO AG
* written by Arne Jansen <sensille@gmx.net>
*/
#include <linux/slab.h>
#include "ulist.h"
#include "ctree.h"
/*
* ulist is a generic data structure to hold a collection of unique u64
* values. The only operations it supports is adding to the list and
* enumerating it.
* It is possible to store an auxiliary value along with the key.
*
* A sample usage for ulists is the enumeration of directed graphs without
* visiting a node twice. The pseudo-code could look like this:
*
* ulist = ulist_alloc();
* ulist_add(ulist, root);
* ULIST_ITER_INIT(&uiter);
*
* while ((elem = ulist_next(ulist, &uiter)) {
* for (all child nodes n in elem)
* ulist_add(ulist, n);
* do something useful with the node;
* }
* ulist_free(ulist);
*
* This assumes the graph nodes are addressable by u64. This stems from the
* usage for tree enumeration in btrfs, where the logical addresses are
* 64 bit.
*
* It is also useful for tree enumeration which could be done elegantly
* recursively, but is not possible due to kernel stack limitations. The
* loop would be similar to the above.
*/
/**
* ulist_init - freshly initialize a ulist
* @ulist: the ulist to initialize
*
* Note: don't use this function to init an already used ulist, use
* ulist_reinit instead.
*/
void ulist_init(struct ulist *ulist)
{
INIT_LIST_HEAD(&ulist->nodes);
ulist->root = RB_ROOT;
ulist->nnodes = 0;
}
/**
* ulist_release - free up additionally allocated memory for the ulist
* @ulist: the ulist from which to free the additional memory
*
* This is useful in cases where the base 'struct ulist' has been statically
* allocated.
*/
void ulist_release(struct ulist *ulist)
{
struct ulist_node *node;
struct ulist_node *next;
list_for_each_entry_safe(node, next, &ulist->nodes, list) { kfree(node);
}
ulist->root = RB_ROOT;
INIT_LIST_HEAD(&ulist->nodes);
}
/**
* ulist_reinit - prepare a ulist for reuse
* @ulist: ulist to be reused
*
* Free up all additional memory allocated for the list elements and reinit
* the ulist.
*/
void ulist_reinit(struct ulist *ulist)
{
ulist_release(ulist);
ulist_init(ulist);
}
/**
* ulist_alloc - dynamically allocate a ulist
* @gfp_mask: allocation flags to for base allocation
*
* The allocated ulist will be returned in an initialized state.
*/
struct ulist *ulist_alloc(gfp_t gfp_mask)
{
struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
if (!ulist)
return NULL;
ulist_init(ulist);
return ulist;
}
/**
* ulist_free - free dynamically allocated ulist
* @ulist: ulist to free
*
* It is not necessary to call ulist_release before.
*/
void ulist_free(struct ulist *ulist)
{
if (!ulist)
return;
ulist_release(ulist);
kfree(ulist);
}
static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
{
struct rb_node *n = ulist->root.rb_node;
struct ulist_node *u = NULL;
while (n) {
u = rb_entry(n, struct ulist_node, rb_node);
if (u->val < val)
n = n->rb_right;
else if (u->val > val)
n = n->rb_left;
else
return u;
}
return NULL;
}
static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
{
rb_erase(&node->rb_node, &ulist->root);
list_del(&node->list);
kfree(node);
BUG_ON(ulist->nnodes == 0);
ulist->nnodes--;
}
static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
{
struct rb_node **p = &ulist->root.rb_node;
struct rb_node *parent = NULL;
struct ulist_node *cur = NULL;
while (*p) {
parent = *p;
cur = rb_entry(parent, struct ulist_node, rb_node);
if (cur->val < ins->val)
p = &(*p)->rb_right;
else if (cur->val > ins->val)
p = &(*p)->rb_left;
else
return -EEXIST;
}
rb_link_node(&ins->rb_node, parent, p);
rb_insert_color(&ins->rb_node, &ulist->root);
return 0;
}
/**
* ulist_add - add an element to the ulist
* @ulist: ulist to add the element to
* @val: value to add to ulist
* @aux: auxiliary value to store along with val
* @gfp_mask: flags to use for allocation
*
* Note: locking must be provided by the caller. In case of rwlocks write
* locking is needed
*
* Add an element to a ulist. The @val will only be added if it doesn't
* already exist. If it is added, the auxiliary value @aux is stored along with
* it. In case @val already exists in the ulist, @aux is ignored, even if
* it differs from the already stored value.
*
* ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
* inserted.
* In case of allocation failure -ENOMEM is returned and the ulist stays
* unaltered.
*/
int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
{
return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
}
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
u64 *old_aux, gfp_t gfp_mask)
{
int ret;
struct ulist_node *node;
node = ulist_rbtree_search(ulist, val);
if (node) {
if (old_aux)
*old_aux = node->aux;
return 0;
}
node = kmalloc(sizeof(*node), gfp_mask);
if (!node)
return -ENOMEM;
node->val = val;
node->aux = aux;
ret = ulist_rbtree_insert(ulist, node);
ASSERT(!ret);
list_add_tail(&node->list, &ulist->nodes);
ulist->nnodes++;
return 1;
}
/*
* ulist_del - delete one node from ulist
* @ulist: ulist to remove node from
* @val: value to delete
* @aux: aux to delete
*
* The deletion will only be done when *BOTH* val and aux matches.
* Return 0 for successful delete.
* Return > 0 for not found.
*/
int ulist_del(struct ulist *ulist, u64 val, u64 aux)
{
struct ulist_node *node;
node = ulist_rbtree_search(ulist, val);
/* Not found */
if (!node)
return 1;
if (node->aux != aux)
return 1;
/* Found and delete */
ulist_rbtree_erase(ulist, node);
return 0;
}
/**
* ulist_next - iterate ulist
* @ulist: ulist to iterate
* @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator)
*
* Note: locking must be provided by the caller. In case of rwlocks only read
* locking is needed
*
* This function is used to iterate an ulist.
* It returns the next element from the ulist or %NULL when the
* end is reached. No guarantee is made with respect to the order in which
* the elements are returned. They might neither be returned in order of
* addition nor in ascending order.
* It is allowed to call ulist_add during an enumeration. Newly added items
* are guaranteed to show up in the running enumeration.
*/
struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
{
struct ulist_node *node;
if (list_empty(&ulist->nodes))
return NULL;
if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes)
return NULL;
if (uiter->cur_list) {
uiter->cur_list = uiter->cur_list->next;
} else {
uiter->cur_list = ulist->nodes.next;
}
node = list_entry(uiter->cur_list, struct ulist_node, list);
return node;
}
// SPDX-License-Identifier: GPL-2.0
/*
* Implementation of the multi-level security (MLS) policy.
*
* Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
/*
* Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
*
* Support for enhanced MLS infrastructure.
*
* Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
*/
/*
* Updated: Hewlett-Packard <paul@paul-moore.com>
*
* Added support to import/export the MLS label from NetLabel
*
* (c) Copyright Hewlett-Packard Development Company, L.P., 2006
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <net/netlabel.h>
#include "sidtab.h"
#include "mls.h"
#include "policydb.h"
#include "services.h"
/*
* Return the length in bytes for the MLS fields of the
* security context string representation of `context'.
*/
int mls_compute_context_len(struct policydb *p, struct context *context)
{
int i, l, len, head, prev;
char *nm;
struct ebitmap *e;
struct ebitmap_node *node;
if (!p->mls_enabled)
return 0;
len = 1; /* for the beginning ":" */
for (l = 0; l < 2; l++) {
int index_sens = context->range.level[l].sens;
len += strlen(sym_name(p, SYM_LEVELS, index_sens - 1));
/* categories */
head = -2;
prev = -2;
e = &context->range.level[l].cat;
ebitmap_for_each_positive_bit(e, node, i) {
if (i - prev > 1) {
/* one or more negative bits are skipped */
if (head != prev) {
nm = sym_name(p, SYM_CATS, prev);
len += strlen(nm) + 1;
}
nm = sym_name(p, SYM_CATS, i);
len += strlen(nm) + 1;
head = i;
}
prev = i;
}
if (prev != head) {
nm = sym_name(p, SYM_CATS, prev);
len += strlen(nm) + 1;
}
if (l == 0) {
if (mls_level_eq(&context->range.level[0],
&context->range.level[1]))
break;
else
len++;
}
}
return len;
}
/*
* Write the security context string representation of
* the MLS fields of `context' into the string `*scontext'.
* Update `*scontext' to point to the end of the MLS fields.
*/
void mls_sid_to_context(struct policydb *p,
struct context *context,
char **scontext)
{
char *scontextp, *nm;
int i, l, head, prev;
struct ebitmap *e;
struct ebitmap_node *node;
if (!p->mls_enabled)
return;
scontextp = *scontext;
*scontextp = ':';
scontextp++;
for (l = 0; l < 2; l++) {
strcpy(scontextp, sym_name(p, SYM_LEVELS,
context->range.level[l].sens - 1));
scontextp += strlen(scontextp);
/* categories */
head = -2;
prev = -2;
e = &context->range.level[l].cat;
ebitmap_for_each_positive_bit(e, node, i) {
if (i - prev > 1) {
/* one or more negative bits are skipped */
if (prev != head) {
if (prev - head > 1)
*scontextp++ = '.';
else
*scontextp++ = ',';
nm = sym_name(p, SYM_CATS, prev);
strcpy(scontextp, nm);
scontextp += strlen(nm);
}
if (prev < 0)
*scontextp++ = ':';
else
*scontextp++ = ',';
nm = sym_name(p, SYM_CATS, i);
strcpy(scontextp, nm);
scontextp += strlen(nm);
head = i;
}
prev = i;
}
if (prev != head) {
if (prev - head > 1)
*scontextp++ = '.';
else
*scontextp++ = ',';
nm = sym_name(p, SYM_CATS, prev);
strcpy(scontextp, nm);
scontextp += strlen(nm);
}
if (l == 0) {
if (mls_level_eq(&context->range.level[0],
&context->range.level[1]))
break;
else
*scontextp++ = '-';
}
}
*scontext = scontextp;
return;
}
int mls_level_isvalid(struct policydb *p, struct mls_level *l)
{
struct level_datum *levdatum;
if (!l->sens || l->sens > p->p_levels.nprim) return 0;
levdatum = symtab_search(&p->p_levels,
sym_name(p, SYM_LEVELS, l->sens - 1));
if (!levdatum)
return 0;
/*
* Return 1 iff all the bits set in l->cat are also be set in
* levdatum->level->cat and no bit in l->cat is larger than
* p->p_cats.nprim.
*/
return ebitmap_contains(&levdatum->level->cat, &l->cat,
p->p_cats.nprim);
}
int mls_range_isvalid(struct policydb *p, struct mls_range *r)
{
return (mls_level_isvalid(p, &r->level[0]) && mls_level_isvalid(p, &r->level[1]) &&
mls_level_dom(&r->level[1], &r->level[0]));
}
/*
* Return 1 if the MLS fields in the security context
* structure `c' are valid. Return 0 otherwise.
*/
int mls_context_isvalid(struct policydb *p, struct context *c)
{
struct user_datum *usrdatum;
if (!p->mls_enabled) return 1; if (!mls_range_isvalid(p, &c->range))
return 0;
if (c->role == OBJECT_R_VAL)
return 1;
/*
* User must be authorized for the MLS range.
*/
if (!c->user || c->user > p->p_users.nprim)
return 0;
usrdatum = p->user_val_to_struct[c->user - 1];
if (!mls_range_contains(usrdatum->range, c->range))
return 0; /* user may not be associated with range */
return 1;
}
/*
* Set the MLS fields in the security context structure
* `context' based on the string representation in
* the string `scontext'.
*
* This function modifies the string in place, inserting
* NULL characters to terminate the MLS fields.
*
* If a def_sid is provided and no MLS field is present,
* copy the MLS field of the associated default context.
* Used for upgraded to MLS systems where objects may lack
* MLS fields.
*
* Policy read-lock must be held for sidtab lookup.
*
*/
int mls_context_to_sid(struct policydb *pol,
char oldc,
char *scontext,
struct context *context,
struct sidtab *s,
u32 def_sid)
{
char *sensitivity, *cur_cat, *next_cat, *rngptr;
struct level_datum *levdatum;
struct cat_datum *catdatum, *rngdatum;
int l, rc, i;
char *rangep[2];
if (!pol->mls_enabled) {
/*
* With no MLS, only return -EINVAL if there is a MLS field
* and it did not come from an xattr.
*/
if (oldc && def_sid == SECSID_NULL)
return -EINVAL;
return 0;
}
/*
* No MLS component to the security context, try and map to
* default if provided.
*/
if (!oldc) {
struct context *defcon;
if (def_sid == SECSID_NULL)
return -EINVAL;
defcon = sidtab_search(s, def_sid);
if (!defcon)
return -EINVAL;
return mls_context_cpy(context, defcon);
}
/*
* If we're dealing with a range, figure out where the two parts
* of the range begin.
*/
rangep[0] = scontext;
rangep[1] = strchr(scontext, '-');
if (rangep[1]) {
rangep[1][0] = '\0';
rangep[1]++;
}
/* For each part of the range: */
for (l = 0; l < 2; l++) {
/* Split sensitivity and category set. */
sensitivity = rangep[l];
if (sensitivity == NULL)
break;
next_cat = strchr(sensitivity, ':');
if (next_cat)
*(next_cat++) = '\0';
/* Parse sensitivity. */
levdatum = symtab_search(&pol->p_levels, sensitivity);
if (!levdatum)
return -EINVAL; context->range.level[l].sens = levdatum->level->sens;
/* Extract category set. */
while (next_cat != NULL) {
cur_cat = next_cat;
next_cat = strchr(next_cat, ',');
if (next_cat != NULL)
*(next_cat++) = '\0';
/* Separate into range if exists */
rngptr = strchr(cur_cat, '.');
if (rngptr != NULL) {
/* Remove '.' */
*rngptr++ = '\0';
}
catdatum = symtab_search(&pol->p_cats, cur_cat);
if (!catdatum)
return -EINVAL;
rc = ebitmap_set_bit(&context->range.level[l].cat,
catdatum->value - 1, 1);
if (rc)
return rc;
/* If range, set all categories in range */
if (rngptr == NULL)
continue;
rngdatum = symtab_search(&pol->p_cats, rngptr);
if (!rngdatum)
return -EINVAL;
if (catdatum->value >= rngdatum->value)
return -EINVAL;
for (i = catdatum->value; i < rngdatum->value; i++) { rc = ebitmap_set_bit(&context->range.level[l].cat, i, 1);
if (rc)
return rc;
}
}
}
/* If we didn't see a '-', the range start is also the range end. */
if (rangep[1] == NULL) { context->range.level[1].sens = context->range.level[0].sens;
rc = ebitmap_cpy(&context->range.level[1].cat,
&context->range.level[0].cat);
if (rc)
return rc;
}
return 0;
}
/*
* Set the MLS fields in the security context structure
* `context' based on the string representation in
* the string `str'. This function will allocate temporary memory with the
* given constraints of gfp_mask.
*/
int mls_from_string(struct policydb *p, char *str, struct context *context,
gfp_t gfp_mask)
{
char *tmpstr;
int rc;
if (!p->mls_enabled)
return -EINVAL;
tmpstr = kstrdup(str, gfp_mask);
if (!tmpstr) {
rc = -ENOMEM;
} else {
rc = mls_context_to_sid(p, ':', tmpstr, context,
NULL, SECSID_NULL);
kfree(tmpstr);
}
return rc;
}
/*
* Copies the MLS range `range' into `context'.
*/
int mls_range_set(struct context *context,
struct mls_range *range)
{
int l, rc = 0;
/* Copy the MLS range into the context */
for (l = 0; l < 2; l++) {
context->range.level[l].sens = range->level[l].sens;
rc = ebitmap_cpy(&context->range.level[l].cat,
&range->level[l].cat);
if (rc)
break;
}
return rc;
}
int mls_setup_user_range(struct policydb *p,
struct context *fromcon, struct user_datum *user,
struct context *usercon)
{
if (p->mls_enabled) {
struct mls_level *fromcon_sen = &(fromcon->range.level[0]);
struct mls_level *fromcon_clr = &(fromcon->range.level[1]);
struct mls_level *user_low = &(user->range.level[0]);
struct mls_level *user_clr = &(user->range.level[1]);
struct mls_level *user_def = &(user->dfltlevel);
struct mls_level *usercon_sen = &(usercon->range.level[0]);
struct mls_level *usercon_clr = &(usercon->range.level[1]);
/* Honor the user's default level if we can */
if (mls_level_between(user_def, fromcon_sen, fromcon_clr))
*usercon_sen = *user_def;
else if (mls_level_between(fromcon_sen, user_def, user_clr))
*usercon_sen = *fromcon_sen;
else if (mls_level_between(fromcon_clr, user_low, user_def))
*usercon_sen = *user_low;
else
return -EINVAL;
/* Lower the clearance of available contexts
if the clearance of "fromcon" is lower than
that of the user's default clearance (but
only if the "fromcon" clearance dominates
the user's computed sensitivity level) */
if (mls_level_dom(user_clr, fromcon_clr))
*usercon_clr = *fromcon_clr;
else if (mls_level_dom(fromcon_clr, user_clr))
*usercon_clr = *user_clr;
else
return -EINVAL;
}
return 0;
}
/*
* Convert the MLS fields in the security context
* structure `oldc' from the values specified in the
* policy `oldp' to the values specified in the policy `newp',
* storing the resulting context in `newc'.
*/
int mls_convert_context(struct policydb *oldp,
struct policydb *newp,
struct context *oldc,
struct context *newc)
{
struct level_datum *levdatum;
struct cat_datum *catdatum;
struct ebitmap_node *node;
int l, i;
if (!oldp->mls_enabled || !newp->mls_enabled)
return 0;
for (l = 0; l < 2; l++) {
char *name = sym_name(oldp, SYM_LEVELS,
oldc->range.level[l].sens - 1);
levdatum = symtab_search(&newp->p_levels, name);
if (!levdatum)
return -EINVAL;
newc->range.level[l].sens = levdatum->level->sens;
ebitmap_for_each_positive_bit(&oldc->range.level[l].cat,
node, i) {
int rc;
catdatum = symtab_search(&newp->p_cats,
sym_name(oldp, SYM_CATS, i));
if (!catdatum)
return -EINVAL;
rc = ebitmap_set_bit(&newc->range.level[l].cat,
catdatum->value - 1, 1);
if (rc)
return rc;
}
}
return 0;
}
int mls_compute_sid(struct policydb *p,
struct context *scontext,
struct context *tcontext,
u16 tclass,
u32 specified,
struct context *newcontext,
bool sock)
{
struct range_trans rtr;
struct mls_range *r;
struct class_datum *cladatum;
int default_range = 0;
if (!p->mls_enabled)
return 0;
switch (specified) {
case AVTAB_TRANSITION:
/* Look for a range transition rule. */
rtr.source_type = scontext->type;
rtr.target_type = tcontext->type;
rtr.target_class = tclass;
r = policydb_rangetr_search(p, &rtr);
if (r)
return mls_range_set(newcontext, r);
if (tclass && tclass <= p->p_classes.nprim) { cladatum = p->class_val_to_struct[tclass - 1];
if (cladatum)
default_range = cladatum->default_range;
}
switch (default_range) {
case DEFAULT_SOURCE_LOW:
return mls_context_cpy_low(newcontext, scontext);
case DEFAULT_SOURCE_HIGH:
return mls_context_cpy_high(newcontext, scontext);
case DEFAULT_SOURCE_LOW_HIGH:
return mls_context_cpy(newcontext, scontext);
case DEFAULT_TARGET_LOW:
return mls_context_cpy_low(newcontext, tcontext);
case DEFAULT_TARGET_HIGH:
return mls_context_cpy_high(newcontext, tcontext);
case DEFAULT_TARGET_LOW_HIGH:
return mls_context_cpy(newcontext, tcontext);
case DEFAULT_GLBLUB:
return mls_context_glblub(newcontext,
scontext, tcontext);
}
fallthrough;
case AVTAB_CHANGE:
if ((tclass == p->process_class) || sock)
/* Use the process MLS attributes. */
return mls_context_cpy(newcontext, scontext);
else
/* Use the process effective MLS attributes. */
return mls_context_cpy_low(newcontext, scontext);
case AVTAB_MEMBER:
/* Use the process effective MLS attributes. */
return mls_context_cpy_low(newcontext, scontext);
}
return -EINVAL;
}
#ifdef CONFIG_NETLABEL
/**
* mls_export_netlbl_lvl - Export the MLS sensitivity levels to NetLabel
* @context: the security context
* @secattr: the NetLabel security attributes
*
* Description:
* Given the security context copy the low MLS sensitivity level into the
* NetLabel MLS sensitivity level field.
*
*/
void mls_export_netlbl_lvl(struct policydb *p,
struct context *context,
struct netlbl_lsm_secattr *secattr)
{
if (!p->mls_enabled)
return;
secattr->attr.mls.lvl = context->range.level[0].sens - 1;
secattr->flags |= NETLBL_SECATTR_MLS_LVL;
}
/**
* mls_import_netlbl_lvl - Import the NetLabel MLS sensitivity levels
* @context: the security context
* @secattr: the NetLabel security attributes
*
* Description:
* Given the security context and the NetLabel security attributes, copy the
* NetLabel MLS sensitivity level into the context.
*
*/
void mls_import_netlbl_lvl(struct policydb *p,
struct context *context,
struct netlbl_lsm_secattr *secattr)
{
if (!p->mls_enabled)
return;
context->range.level[0].sens = secattr->attr.mls.lvl + 1;
context->range.level[1].sens = context->range.level[0].sens;
}
/**
* mls_export_netlbl_cat - Export the MLS categories to NetLabel
* @context: the security context
* @secattr: the NetLabel security attributes
*
* Description:
* Given the security context copy the low MLS categories into the NetLabel
* MLS category field. Returns zero on success, negative values on failure.
*
*/
int mls_export_netlbl_cat(struct policydb *p,
struct context *context,
struct netlbl_lsm_secattr *secattr)
{
int rc;
if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_export(&context->range.level[0].cat,
&secattr->attr.mls.cat);
if (rc == 0 && secattr->attr.mls.cat != NULL) secattr->flags |= NETLBL_SECATTR_MLS_CAT;
return rc;
}
/**
* mls_import_netlbl_cat - Import the MLS categories from NetLabel
* @context: the security context
* @secattr: the NetLabel security attributes
*
* Description:
* Copy the NetLabel security attributes into the SELinux context; since the
* NetLabel security attribute only contains a single MLS category use it for
* both the low and high categories of the context. Returns zero on success,
* negative values on failure.
*
*/
int mls_import_netlbl_cat(struct policydb *p,
struct context *context,
struct netlbl_lsm_secattr *secattr)
{
int rc;
if (!p->mls_enabled)
return 0;
rc = ebitmap_netlbl_import(&context->range.level[0].cat,
secattr->attr.mls.cat);
if (rc)
goto import_netlbl_cat_failure;
memcpy(&context->range.level[1].cat, &context->range.level[0].cat,
sizeof(context->range.level[0].cat));
return 0;
import_netlbl_cat_failure:
ebitmap_destroy(&context->range.level[0].cat);
return rc;
}
#endif /* CONFIG_NETLABEL */
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/core/gen_stats.c
*
* Authors: Thomas Graf <tgraf@suug.ch>
* Jamal Hadi Salim
* Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* See Documentation/networking/gen_stats.rst
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/socket.h>
#include <linux/rtnetlink.h>
#include <linux/gen_stats.h>
#include <net/netlink.h>
#include <net/gen_stats.h>
static inline int
gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
{
if (nla_put_64bit(d->skb, type, size, buf, padattr))
goto nla_put_failure;
return 0;
nla_put_failure:
if (d->lock)
spin_unlock_bh(d->lock);
kfree(d->xstats);
d->xstats = NULL;
d->xstats_len = 0;
return -1;
}
/**
* gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
* @skb: socket buffer to put statistics TLVs into
* @type: TLV type for top level statistic TLV
* @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV
* @xstats_type: TLV type for backward compatibility xstats TLV
* @lock: statistics lock
* @d: dumping handle
* @padattr: padding attribute
*
* Initializes the dumping handle, grabs the statistic lock and appends
* an empty TLV header to the socket buffer for use a container for all
* other statistic TLVS.
*
* The dumping handle is marked to be in backward compatibility mode telling
* all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats.
*
* Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
int xstats_type, spinlock_t *lock,
struct gnet_dump *d, int padattr)
__acquires(lock)
{
memset(d, 0, sizeof(*d));
if (type)
d->tail = (struct nlattr *)skb_tail_pointer(skb); d->skb = skb;
d->compat_tc_stats = tc_stats_type;
d->compat_xstats = xstats_type;
d->padattr = padattr;
if (lock) {
d->lock = lock;
spin_lock_bh(lock);
}
if (d->tail) {
int ret = gnet_stats_copy(d, type, NULL, 0, padattr);
/* The initial attribute added in gnet_stats_copy() may be
* preceded by a padding attribute, in which case d->tail will
* end up pointing at the padding instead of the real attribute.
* Fix this so gnet_stats_finish_copy() adjusts the length of
* the right attribute.
*/
if (ret == 0 && d->tail->nla_type == padattr)
d->tail = (struct nlattr *)((char *)d->tail +
NLA_ALIGN(d->tail->nla_len));
return ret;
}
return 0;
}
EXPORT_SYMBOL(gnet_stats_start_copy_compat);
/**
* gnet_stats_start_copy - start dumping procedure in compatibility mode
* @skb: socket buffer to put statistics TLVs into
* @type: TLV type for top level statistic TLV
* @lock: statistics lock
* @d: dumping handle
* @padattr: padding attribute
*
* Initializes the dumping handle, grabs the statistic lock and appends
* an empty TLV header to the socket buffer for use a container for all
* other statistic TLVS.
*
* Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
struct gnet_dump *d, int padattr)
{
return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d, padattr);
}
EXPORT_SYMBOL(gnet_stats_start_copy);
static void
__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu)
{
int i;
for_each_possible_cpu(i) { struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
unsigned int start;
u64 bytes, packets;
do {
start = u64_stats_fetch_begin_irq(&bcpu->syncp);
bytes = bcpu->bstats.bytes;
packets = bcpu->bstats.packets;
} while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
bstats->bytes += bytes;
bstats->packets += packets;
}
}
void
__gnet_stats_copy_basic(const seqcount_t *running,
struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
{
unsigned int seq;
if (cpu) {
__gnet_stats_copy_basic_cpu(bstats, cpu);
return;
}
do {
if (running) seq = read_seqcount_begin(running); bstats->bytes = b->bytes;
bstats->packets = b->packets;
} while (running && read_seqcount_retry(running, seq));
}
EXPORT_SYMBOL(__gnet_stats_copy_basic);
static int
___gnet_stats_copy_basic(const seqcount_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b,
int type)
{
struct gnet_stats_basic_packed bstats = {0};
__gnet_stats_copy_basic(running, &bstats, cpu, b);
if (d->compat_tc_stats && type == TCA_STATS_BASIC) { d->tc_stats.bytes = bstats.bytes;
d->tc_stats.packets = bstats.packets;
}
if (d->tail) {
struct gnet_stats_basic sb;
int res;
memset(&sb, 0, sizeof(sb));
sb.bytes = bstats.bytes;
sb.packets = bstats.packets;
res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD);
if (res < 0 || sb.packets == bstats.packets)
return res;
/* emit 64bit stats only if needed */
return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets,
sizeof(bstats.packets), TCA_STATS_PAD);
}
return 0;
}
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
* @running: seqcount_t pointer
* @d: dumping handle
* @cpu: copy statistic per cpu
* @b: basic statistics
*
* Appends the basic statistics to the top level TLV created by
* gnet_stats_start_copy().
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_copy_basic(const seqcount_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
{
return ___gnet_stats_copy_basic(running, d, cpu, b,
TCA_STATS_BASIC);
}
EXPORT_SYMBOL(gnet_stats_copy_basic);
/**
* gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV
* @running: seqcount_t pointer
* @d: dumping handle
* @cpu: copy statistic per cpu
* @b: basic statistics
*
* Appends the basic statistics to the top level TLV created by
* gnet_stats_start_copy().
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_copy_basic_hw(const seqcount_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
{
return ___gnet_stats_copy_basic(running, d, cpu, b,
TCA_STATS_BASIC_HW);
}
EXPORT_SYMBOL(gnet_stats_copy_basic_hw);
/**
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
* @d: dumping handle
* @rate_est: rate estimator
*
* Appends the rate estimator statistics to the top level TLV created by
* gnet_stats_start_copy().
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_copy_rate_est(struct gnet_dump *d,
struct net_rate_estimator __rcu **rate_est)
{
struct gnet_stats_rate_est64 sample;
struct gnet_stats_rate_est est;
int res;
if (!gen_estimator_read(rate_est, &sample)) return 0; est.bps = min_t(u64, UINT_MAX, sample.bps);
/* we have some time before reaching 2^32 packets per second */
est.pps = sample.pps;
if (d->compat_tc_stats) {
d->tc_stats.bps = est.bps;
d->tc_stats.pps = est.pps;
}
if (d->tail) {
res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
TCA_STATS_PAD);
if (res < 0 || est.bps == sample.bps)
return res;
/* emit 64bit stats only if needed */
return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
sizeof(sample), TCA_STATS_PAD);
}
return 0;
}
EXPORT_SYMBOL(gnet_stats_copy_rate_est);
static void
__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
const struct gnet_stats_queue __percpu *q)
{
int i;
for_each_possible_cpu(i) {
const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);
qstats->qlen = 0;
qstats->backlog += qcpu->backlog;
qstats->drops += qcpu->drops;
qstats->requeues += qcpu->requeues;
qstats->overlimits += qcpu->overlimits;
}
}
void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
const struct gnet_stats_queue __percpu *cpu,
const struct gnet_stats_queue *q,
__u32 qlen)
{
if (cpu) {
__gnet_stats_copy_queue_cpu(qstats, cpu);
} else {
qstats->qlen = q->qlen;
qstats->backlog = q->backlog;
qstats->drops = q->drops;
qstats->requeues = q->requeues;
qstats->overlimits = q->overlimits;
}
qstats->qlen = qlen;
}
EXPORT_SYMBOL(__gnet_stats_copy_queue);
/**
* gnet_stats_copy_queue - copy queue statistics into statistics TLV
* @d: dumping handle
* @cpu_q: per cpu queue statistics
* @q: queue statistics
* @qlen: queue length statistics
*
* Appends the queue statistics to the top level TLV created by
* gnet_stats_start_copy(). Using per cpu queue statistics if
* they are available.
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_copy_queue(struct gnet_dump *d,
struct gnet_stats_queue __percpu *cpu_q,
struct gnet_stats_queue *q, __u32 qlen)
{
struct gnet_stats_queue qstats = {0};
__gnet_stats_copy_queue(&qstats, cpu_q, q, qlen);
if (d->compat_tc_stats) {
d->tc_stats.drops = qstats.drops;
d->tc_stats.qlen = qstats.qlen;
d->tc_stats.backlog = qstats.backlog;
d->tc_stats.overlimits = qstats.overlimits;
}
if (d->tail)
return gnet_stats_copy(d, TCA_STATS_QUEUE,
&qstats, sizeof(qstats),
TCA_STATS_PAD);
return 0;
}
EXPORT_SYMBOL(gnet_stats_copy_queue);
/**
* gnet_stats_copy_app - copy application specific statistics into statistics TLV
* @d: dumping handle
* @st: application specific statistics data
* @len: length of data
*
* Appends the application specific statistics to the top level TLV created by
* gnet_stats_start_copy() and remembers the data for XSTATS if the dumping
* handle is in backward compatibility mode.
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
{
if (d->compat_xstats) {
d->xstats = kmemdup(st, len, GFP_ATOMIC);
if (!d->xstats)
goto err_out;
d->xstats_len = len;
}
if (d->tail)
return gnet_stats_copy(d, TCA_STATS_APP, st, len,
TCA_STATS_PAD);
return 0;
err_out:
if (d->lock)
spin_unlock_bh(d->lock);
d->xstats_len = 0;
return -1;
}
EXPORT_SYMBOL(gnet_stats_copy_app);
/**
* gnet_stats_finish_copy - finish dumping procedure
* @d: dumping handle
*
* Corrects the length of the top level TLV to include all TLVs added
* by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs
* if gnet_stats_start_copy_compat() was used and releases the statistics
* lock.
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_finish_copy(struct gnet_dump *d)
{
if (d->tail) d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail; if (d->compat_tc_stats) if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats,
sizeof(d->tc_stats), d->padattr) < 0)
return -1;
if (d->compat_xstats && d->xstats) { if (gnet_stats_copy(d, d->compat_xstats, d->xstats,
d->xstats_len, d->padattr) < 0)
return -1;
}
if (d->lock)
spin_unlock_bh(d->lock);
kfree(d->xstats);
d->xstats = NULL;
d->xstats_len = 0;
return 0;
}
EXPORT_SYMBOL(gnet_stats_finish_copy);
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/fs/namespace.c
*
* (C) Copyright Al Viro 2000, 2001
*
* Based on code from fs/super.c, copyright Linus Torvalds and others.
* Heavily rewritten.
*/
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
#include <linux/init.h> /* init_rootfs */
#include <linux/fs_struct.h> /* get_fs_root et.al. */
#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include "pnode.h"
#include "internal.h"
/* Maximum number of mounts in a mount namespace */
unsigned int sysctl_mount_max __read_mostly = 100000;
static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
static unsigned int mp_hash_shift __read_mostly;
static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
if (!str)
return 0;
mhash_entries = simple_strtoul(str, &str, 0);
return 1;
}
__setup("mhash_entries=", set_mhash_entries);
static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
if (!str)
return 0;
mphash_entries = simple_strtoul(str, &str, 0);
return 1;
}
__setup("mphash_entries=", set_mphash_entries);
static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
struct mount_kattr {
unsigned int attr_set;
unsigned int attr_clr;
unsigned int propagation;
unsigned int lookup_flags;
bool recurse;
struct user_namespace *mnt_userns;
};
/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);
/*
* vfsmount lock may be taken for read to prevent changes to the
* vfsmount hash, ie. during mountpoint lookups or walking back
* up the tree.
*
* It should be taken for write in all cases where the vfsmount
* tree or hash is modified or when a vfsmount structure is modified.
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
static inline void lock_mount_hash(void)
{
write_seqlock(&mount_lock);
}
static inline void unlock_mount_hash(void)
{
write_sequnlock(&mount_lock);
}
static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
tmp = tmp + (tmp >> m_hash_shift);
return &mount_hashtable[tmp & m_hash_mask];
}
static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
tmp = tmp + (tmp >> mp_hash_shift);
return &mountpoint_hashtable[tmp & mp_hash_mask];
}
static int mnt_alloc_id(struct mount *mnt)
{
int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
if (res < 0)
return res;
mnt->mnt_id = res;
return 0;
}
static void mnt_free_id(struct mount *mnt)
{
ida_free(&mnt_id_ida, mnt->mnt_id);
}
/*
* Allocate a new peer group ID
*/
static int mnt_alloc_group_id(struct mount *mnt)
{
int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
if (res < 0)
return res;
mnt->mnt_group_id = res;
return 0;
}
/*
* Release a peer group ID
*/
void mnt_release_group_id(struct mount *mnt)
{
ida_free(&mnt_group_ida, mnt->mnt_group_id);
mnt->mnt_group_id = 0;
}
/*
* vfsmount lock must be held for read
*/
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
preempt_disable();
mnt->mnt_count += n;
preempt_enable();
#endif
}
/*
* vfsmount lock must be held for write
*/
int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
int count = 0;
int cpu;
for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
}
return count;
#else
return mnt->mnt_count;
#endif
}
static struct mount *alloc_vfsmnt(const char *name)
{
struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
if (mnt) {
int err;
err = mnt_alloc_id(mnt);
if (err)
goto out_free_cache;
if (name) {
mnt->mnt_devname = kstrdup_const(name,
GFP_KERNEL_ACCOUNT);
if (!mnt->mnt_devname)
goto out_free_id;
}
#ifdef CONFIG_SMP
mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
if (!mnt->mnt_pcp)
goto out_free_devname;
this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
mnt->mnt_count = 1;
mnt->mnt_writers = 0;
#endif
INIT_HLIST_NODE(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
INIT_LIST_HEAD(&mnt->mnt_mounts);
INIT_LIST_HEAD(&mnt->mnt_list);
INIT_LIST_HEAD(&mnt->mnt_expire);
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
INIT_HLIST_NODE(&mnt->mnt_mp_list);
INIT_LIST_HEAD(&mnt->mnt_umounting);
INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
mnt->mnt.mnt_userns = &init_user_ns;
}
return mnt;
#ifdef CONFIG_SMP
out_free_devname:
kfree_const(mnt->mnt_devname);
#endif
out_free_id:
mnt_free_id(mnt);
out_free_cache:
kmem_cache_free(mnt_cache, mnt);
return NULL;
}
/*
* Most r/o checks on a fs are for operations that take
* discrete amounts of time, like a write() or unlink().
* We must keep track of when those operations start
* (for permission checks) and when they end, so that
* we can determine when writes are able to occur to
* a filesystem.
*/
/*
* __mnt_is_readonly: check whether a mount is read-only
* @mnt: the mount to check for its write status
*
* This shouldn't be used directly ouside of the VFS.
* It does not guarantee that the filesystem will stay
* r/w, just that it is right *now*. This can not and
* should not be used in place of IS_RDONLY(inode).
* mnt_want/drop_write() will _keep_ the filesystem
* r/w.
*/
bool __mnt_is_readonly(struct vfsmount *mnt)
{
return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
mnt->mnt_writers++;
#endif
}
static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
mnt->mnt_writers--;
#endif
}
static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
unsigned int count = 0;
int cpu;
for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
}
return count;
#else
return mnt->mnt_writers;
#endif
}
static int mnt_is_readonly(struct vfsmount *mnt)
{
if (mnt->mnt_sb->s_readonly_remount)
return 1;
/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
smp_rmb();
return __mnt_is_readonly(mnt);
}
/*
* Most r/o & frozen checks on a fs are for operations that take discrete
* amounts of time, like a write() or unlink(). We must keep track of when
* those operations start (for permission checks) and when they end, so that we
* can determine when writes are able to occur to a filesystem.
*/
/**
* __mnt_want_write - get write access to a mount without freeze protection
* @m: the mount on which to take a write
*
* This tells the low-level filesystem that a write is about to be performed to
* it, and makes sure that writes are allowed (mnt it read-write) before
* returning success. This operation does not protect against filesystem being
* frozen. When the write operation is finished, __mnt_drop_write() must be
* called. This is effectively a refcount.
*/
int __mnt_want_write(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
int ret = 0;
preempt_disable();
mnt_inc_writers(mnt);
/*
* The store to mnt_inc_writers must be visible before we pass
* MNT_WRITE_HOLD loop below, so that the slowpath can see our
* incremented count after it has set MNT_WRITE_HOLD.
*/
smp_mb();
while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
cpu_relax();
/*
* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
* be set to match its requirements. So we must not load that until
* MNT_WRITE_HOLD is cleared.
*/
smp_rmb();
if (mnt_is_readonly(m)) {
mnt_dec_writers(mnt);
ret = -EROFS;
}
preempt_enable();
return ret;
}
/**
* mnt_want_write - get write access to a mount
* @m: the mount on which to take a write
*
* This tells the low-level filesystem that a write is about to be performed to
* it, and makes sure that writes are allowed (mount is read-write, filesystem
* is not frozen) before returning success. When the write operation is
* finished, mnt_drop_write() must be called. This is effectively a refcount.
*/
int mnt_want_write(struct vfsmount *m)
{
int ret;
sb_start_write(m->mnt_sb);
ret = __mnt_want_write(m);
if (ret)
sb_end_write(m->mnt_sb); return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
/**
* __mnt_want_write_file - get write access to a file's mount
* @file: the file who's mount on which to take a write
*
* This is like __mnt_want_write, but if the file is already open for writing it
* skips incrementing mnt_writers (since the open file already has a reference)
* and instead only does the check for emergency r/o remounts. This must be
* paired with __mnt_drop_write_file.
*/
int __mnt_want_write_file(struct file *file)
{
if (file->f_mode & FMODE_WRITER) {
/*
* Superblock may have become readonly while there are still
* writable fd's, e.g. due to a fs error with errors=remount-ro
*/
if (__mnt_is_readonly(file->f_path.mnt))
return -EROFS;
return 0;
}
return __mnt_want_write(file->f_path.mnt);
}
/**
* mnt_want_write_file - get write access to a file's mount
* @file: the file who's mount on which to take a write
*
* This is like mnt_want_write, but if the file is already open for writing it
* skips incrementing mnt_writers (since the open file already has a reference)
* and instead only does the freeze protection and the check for emergency r/o
* remounts. This must be paired with mnt_drop_write_file.
*/
int mnt_want_write_file(struct file *file)
{
int ret;
sb_start_write(file_inode(file)->i_sb);
ret = __mnt_want_write_file(file);
if (ret)
sb_end_write(file_inode(file)->i_sb);
return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);
/**
* __mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
*
* Tells the low-level filesystem that we are done
* performing writes to it. Must be matched with
* __mnt_want_write() call above.
*/
void __mnt_drop_write(struct vfsmount *mnt)
{
preempt_disable();
mnt_dec_writers(real_mount(mnt));
preempt_enable();
}
/**
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
*
* Tells the low-level filesystem that we are done performing writes to it and
* also allows filesystem to be frozen again. Must be matched with
* mnt_want_write() call above.
*/
void mnt_drop_write(struct vfsmount *mnt)
{
__mnt_drop_write(mnt);
sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
void __mnt_drop_write_file(struct file *file)
{
if (!(file->f_mode & FMODE_WRITER)) __mnt_drop_write(file->f_path.mnt);
}
void mnt_drop_write_file(struct file *file)
{
__mnt_drop_write_file(file);
sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);
static inline int mnt_hold_writers(struct mount *mnt)
{
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
/*
* After storing MNT_WRITE_HOLD, we'll read the counters. This store
* should be visible before we do.
*/
smp_mb();
/*
* With writers on hold, if this value is zero, then there are
* definitely no active writers (although held writers may subsequently
* increment the count, they'll have to wait, and decrement it after
* seeing MNT_READONLY).
*
* It is OK to have counter incremented on one CPU and decremented on
* another: the sum will add up correctly. The danger would be when we
* sum up each counter, if we read a counter before it is incremented,
* but then read another CPU's count which it has been subsequently
* decremented from -- we would see more decrements than we should.
* MNT_WRITE_HOLD protects against this scenario, because
* mnt_want_write first increments count, then smp_mb, then spins on
* MNT_WRITE_HOLD, so it can't be decremented by another CPU while
* we're counting up here.
*/
if (mnt_get_writers(mnt) > 0)
return -EBUSY;
return 0;
}
static inline void mnt_unhold_writers(struct mount *mnt)
{
/*
* MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
* that become unheld will see MNT_READONLY.
*/
smp_wmb();
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
}
static int mnt_make_readonly(struct mount *mnt)
{
int ret;
ret = mnt_hold_writers(mnt);
if (!ret)
mnt->mnt.mnt_flags |= MNT_READONLY;
mnt_unhold_writers(mnt);
return ret;
}
int sb_prepare_remount_readonly(struct super_block *sb)
{
struct mount *mnt;
int err = 0;
/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
if (atomic_long_read(&sb->s_remove_count))
return -EBUSY;
lock_mount_hash();
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
smp_mb();
if (mnt_get_writers(mnt) > 0) {
err = -EBUSY;
break;
}
}
}
if (!err && atomic_long_read(&sb->s_remove_count))
err = -EBUSY;
if (!err) {
sb->s_readonly_remount = 1;
smp_wmb();
}
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
}
unlock_mount_hash();
return err;
}
static void free_vfsmnt(struct mount *mnt)
{
struct user_namespace *mnt_userns;
mnt_userns = mnt_user_ns(&mnt->mnt);
if (mnt_userns != &init_user_ns)
put_user_ns(mnt_userns);
kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
#endif
kmem_cache_free(mnt_cache, mnt);
}
static void delayed_free_vfsmnt(struct rcu_head *head)
{
free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}
/* call under rcu_read_lock */
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
struct mount *mnt;
if (read_seqretry(&mount_lock, seq))
return 1;
if (bastard == NULL) return 0;
mnt = real_mount(bastard);
mnt_add_count(mnt, 1);
smp_mb(); // see mntput_no_expire()
if (likely(!read_seqretry(&mount_lock, seq)))
return 0;
if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { mnt_add_count(mnt, -1);
return 1;
}
lock_mount_hash();
if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
mnt_add_count(mnt, -1);
unlock_mount_hash();
return 1;
}
unlock_mount_hash();
/* caller will mntput() */
return -1;
}
/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
int res = __legitimize_mnt(bastard, seq); if (likely(!res))
return true;
if (unlikely(res < 0)) {
rcu_read_unlock();
mntput(bastard);
rcu_read_lock();
}
return false;
}
/*
* find the first mount at @dentry on vfsmount @mnt.
* call under rcu_read_lock()
*/
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
struct hlist_head *head = m_hash(mnt, dentry);
struct mount *p;
hlist_for_each_entry_rcu(p, head, mnt_hash) if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
return p;
return NULL;
}
/*
* lookup_mnt - Return the first child mount mounted at path
*
* "First" means first mounted chronologically. If you create the
* following mounts:
*
* mount /dev/sda1 /mnt
* mount /dev/sda2 /mnt
* mount /dev/sda3 /mnt
*
* Then lookup_mnt() on the base /mnt dentry in the root mount will
* return successively the root dentry and vfsmount of /dev/sda1, then
* /dev/sda2, then /dev/sda3, then NULL.
*
* lookup_mnt takes a reference to the found vfsmount.
*/
struct vfsmount *lookup_mnt(const struct path *path)
{
struct mount *child_mnt;
struct vfsmount *m;
unsigned seq;
rcu_read_lock();
do {
seq = read_seqbegin(&mount_lock);
child_mnt = __lookup_mnt(path->mnt, path->dentry);
m = child_mnt ? &child_mnt->mnt : NULL; } while (!legitimize_mnt(m, seq));
rcu_read_unlock();
return m;
}
static inline void lock_ns_list(struct mnt_namespace *ns)
{
spin_lock(&ns->ns_lock);
}
static inline void unlock_ns_list(struct mnt_namespace *ns)
{
spin_unlock(&ns->ns_lock);
}
static inline bool mnt_is_cursor(struct mount *mnt)
{
return mnt->mnt.mnt_flags & MNT_CURSOR;
}
/*
* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
* current mount namespace.
*
* The common case is dentries are not mountpoints at all and that
* test is handled inline. For the slow case when we are actually
* dealing with a mountpoint of some kind, walk through all of the
* mounts in the current mount namespace and test to see if the dentry
* is a mountpoint.
*
* The mount_hashtable is not usable in the context because we
* need to identify all mounts that may be in the current mount
* namespace not just a mount that happens to have some specified
* parent mount.
*/
bool __is_local_mountpoint(struct dentry *dentry)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct mount *mnt;
bool is_covered = false;
down_read(&namespace_sem);
lock_ns_list(ns);
list_for_each_entry(mnt, &ns->list, mnt_list) { if (mnt_is_cursor(mnt))
continue;
is_covered = (mnt->mnt_mountpoint == dentry);
if (is_covered)
break;
}
unlock_ns_list(ns);
up_read(&namespace_sem);
return is_covered;
}
static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
struct hlist_head *chain = mp_hash(dentry);
struct mountpoint *mp;
hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { mp->m_count++; return mp;
}
}
return NULL;
}
static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
struct mountpoint *mp, *new = NULL;
int ret;
if (d_mountpoint(dentry)) {
/* might be worth a WARN_ON() */
if (d_unlinked(dentry))
return ERR_PTR(-ENOENT);
mountpoint:
read_seqlock_excl(&mount_lock);
mp = lookup_mountpoint(dentry);
read_sequnlock_excl(&mount_lock);
if (mp)
goto done;
}
if (!new)
new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
if (!new)
return ERR_PTR(-ENOMEM);
/* Exactly one processes may set d_mounted */
ret = d_set_mounted(dentry);
/* Someone else set d_mounted? */
if (ret == -EBUSY)
goto mountpoint;
/* The dentry is not available as a mountpoint? */
mp = ERR_PTR(ret); if (ret)
goto done;
/* Add the new mountpoint to the hash table */
read_seqlock_excl(&mount_lock);
new->m_dentry = dget(dentry);
new->m_count = 1;
hlist_add_head(&new->m_hash, mp_hash(dentry));
INIT_HLIST_HEAD(&new->m_list);
read_sequnlock_excl(&mount_lock);
mp = new;
new = NULL;
done:
kfree(new); return mp;
}
/*
* vfsmount lock must be held. Additionally, the caller is responsible
* for serializing calls for given disposal list.
*/
static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
{
if (!--mp->m_count) { struct dentry *dentry = mp->m_dentry; BUG_ON(!hlist_empty(&mp->m_list));
spin_lock(&dentry->d_lock);
dentry->d_flags &= ~DCACHE_MOUNTED;
spin_unlock(&dentry->d_lock);
dput_to_list(dentry, list);
hlist_del(&mp->m_hash);
kfree(mp);
}
}
/* called with namespace_lock and vfsmount lock */
static void put_mountpoint(struct mountpoint *mp)
{
__put_mountpoint(mp, &ex_mountpoints);
}
static inline int check_mnt(struct mount *mnt)
{
return mnt->mnt_ns == current->nsproxy->mnt_ns;
}
/*
* vfsmount lock must be held for write
*/
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
if (ns) { ns->event = ++event;
wake_up_interruptible(&ns->poll);
}
}
/*
* vfsmount lock must be held for write
*/
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
if (ns && ns->event != event) {
ns->event = event;
wake_up_interruptible(&ns->poll);
}
}
/*
* vfsmount lock must be held for write
*/
static struct mountpoint *unhash_mnt(struct mount *mnt)
{
struct mountpoint *mp;
mnt->mnt_parent = mnt;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
hlist_del_init_rcu(&mnt->mnt_hash);
hlist_del_init(&mnt->mnt_mp_list);
mp = mnt->mnt_mp;
mnt->mnt_mp = NULL;
return mp;
}
/*
* vfsmount lock must be held for write
*/
static void umount_mnt(struct mount *mnt)
{
put_mountpoint(unhash_mnt(mnt));
}
/*
* vfsmount lock must be held for write
*/
void mnt_set_mountpoint(struct mount *mnt,
struct mountpoint *mp,
struct mount *child_mnt)
{
mp->m_count++;
mnt_add_count(mnt, 1); /* essentially, that's mntget */
child_mnt->mnt_mountpoint = mp->m_dentry;
child_mnt->mnt_parent = mnt;
child_mnt->mnt_mp = mp;
hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}
static void __attach_mnt(struct mount *mnt, struct mount *parent)
{
hlist_add_head_rcu(&mnt->mnt_hash,
m_hash(&parent->mnt, mnt->mnt_mountpoint));
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}
/*
* vfsmount lock must be held for write
*/
static void attach_mnt(struct mount *mnt,
struct mount *parent,
struct mountpoint *mp)
{
mnt_set_mountpoint(parent, mp, mnt);
__attach_mnt(mnt, parent);
}
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
struct mountpoint *old_mp = mnt->mnt_mp;
struct mount *old_parent = mnt->mnt_parent;
list_del_init(&mnt->mnt_child);
hlist_del_init(&mnt->mnt_mp_list);
hlist_del_init_rcu(&mnt->mnt_hash);
attach_mnt(mnt, parent, mp);
put_mountpoint(old_mp);
mnt_add_count(old_parent, -1);
}
/*
* vfsmount lock must be held for write
*/
static void commit_tree(struct mount *mnt)
{
struct mount *parent = mnt->mnt_parent;
struct mount *m;
LIST_HEAD(head);
struct mnt_namespace *n = parent->mnt_ns;
BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list);
list_for_each_entry(m, &head, mnt_list)
m->mnt_ns = n; list_splice(&head, n->list.prev); n->mounts += n->pending_mounts;
n->pending_mounts = 0;
__attach_mnt(mnt, parent);
touch_mnt_namespace(n);
}
static struct mount *next_mnt(struct mount *p, struct mount *root)
{
struct list_head *next = p->mnt_mounts.next;
if (next == &p->mnt_mounts) {
while (1) {
if (p == root)
return NULL;
next = p->mnt_child.next;
if (next != &p->mnt_parent->mnt_mounts)
break;
p = p->mnt_parent;
}
}
return list_entry(next, struct mount, mnt_child);
}
static struct mount *skip_mnt_tree(struct mount *p)
{
struct list_head *prev = p->mnt_mounts.prev;
while (prev != &p->mnt_mounts) {
p = list_entry(prev, struct mount, mnt_child); prev = p->mnt_mounts.prev;
}
return p;
}
/**
* vfs_create_mount - Create a mount for a configured superblock
* @fc: The configuration context with the superblock attached
*
* Create a mount to an already configured superblock. If necessary, the
* caller should invoke vfs_get_tree() before calling this.
*
* Note that this does not attach the mount to anything.
*/
struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
struct mount *mnt;
if (!fc->root)
return ERR_PTR(-EINVAL);
mnt = alloc_vfsmnt(fc->source ?: "none");
if (!mnt)
return ERR_PTR(-ENOMEM);
if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; atomic_inc(&fc->root->d_sb->s_active);
mnt->mnt.mnt_sb = fc->root->d_sb;
mnt->mnt.mnt_root = dget(fc->root);
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
lock_mount_hash();
list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
unlock_mount_hash();
return &mnt->mnt;
}
EXPORT_SYMBOL(vfs_create_mount);
struct vfsmount *fc_mount(struct fs_context *fc)
{
int err = vfs_get_tree(fc);
if (!err) {
up_write(&fc->root->d_sb->s_umount); return vfs_create_mount(fc);
}
return ERR_PTR(err);
}
EXPORT_SYMBOL(fc_mount);
struct vfsmount *vfs_kern_mount(struct file_system_type *type,
int flags, const char *name,
void *data)
{
struct fs_context *fc;
struct vfsmount *mnt;
int ret = 0;
if (!type)
return ERR_PTR(-EINVAL);
fc = fs_context_for_mount(type, flags);
if (IS_ERR(fc))
return ERR_CAST(fc); if (name) ret = vfs_parse_fs_string(fc, "source",
name, strlen(name));
if (!ret)
ret = parse_monolithic_mount_data(fc, data);
if (!ret)
mnt = fc_mount(fc);
else
mnt = ERR_PTR(ret); put_fs_context(fc); return mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
struct vfsmount *
vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
const char *name, void *data)
{
/* Until it is worked out how to pass the user namespace
* through from the parent mount to the submount don't support
* unprivileged mounts with submounts.
*/
if (mountpoint->d_sb->s_user_ns != &init_user_ns)
return ERR_PTR(-EPERM);
return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
}
EXPORT_SYMBOL_GPL(vfs_submount);
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
int flag)
{
struct super_block *sb = old->mnt.mnt_sb;
struct mount *mnt;
int err;
mnt = alloc_vfsmnt(old->mnt_devname);
if (!mnt)
return ERR_PTR(-ENOMEM);
if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
mnt->mnt_group_id = 0; /* not a peer of original */
else
mnt->mnt_group_id = old->mnt_group_id; if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
err = mnt_alloc_group_id(mnt);
if (err)
goto out_free;
}
mnt->mnt.mnt_flags = old->mnt.mnt_flags;
mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
atomic_inc(&sb->s_active);
mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt);
if (mnt->mnt.mnt_userns != &init_user_ns)
mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
lock_mount_hash();
list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
unlock_mount_hash();
if ((flag & CL_SLAVE) ||
((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { list_add(&mnt->mnt_slave, &old->mnt_slave_list);
mnt->mnt_master = old;
CLEAR_MNT_SHARED(mnt);
} else if (!(flag & CL_PRIVATE)) { if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) list_add(&mnt->mnt_share, &old->mnt_share); if (IS_MNT_SLAVE(old)) list_add(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master;
} else {
CLEAR_MNT_SHARED(mnt);
}
if (flag & CL_MAKE_SHARED)
set_mnt_shared(mnt);
/* stick the duplicate mount on the same expiry list
* as the original if that was on one */
if (flag & CL_EXPIRE) { if (!list_empty(&old->mnt_expire)) list_add(&mnt->mnt_expire, &old->mnt_expire);
}
return mnt;
out_free:
mnt_free_id(mnt);
free_vfsmnt(mnt);
return ERR_PTR(err);
}
static void cleanup_mnt(struct mount *mnt)
{
struct hlist_node *p;
struct mount *m;
/*
* The warning here probably indicates that somebody messed
* up a mnt_want/drop_write() pair. If this happens, the
* filesystem was probably unable to make r/w->r/o transitions.
* The locking used to deal with mnt_count decrement provides barriers,
* so mnt_get_writers() below is safe.
*/
WARN_ON(mnt_get_writers(mnt)); if (unlikely(mnt->mnt_pins.first)) mnt_pin_kill(mnt); hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
hlist_del(&m->mnt_umount);
mntput(&m->mnt);
}
fsnotify_vfsmount_delete(&mnt->mnt);
dput(mnt->mnt.mnt_root);
deactivate_super(mnt->mnt.mnt_sb);
mnt_free_id(mnt);
call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}
static void __cleanup_mnt(struct rcu_head *head)
{
cleanup_mnt(container_of(head, struct mount, mnt_rcu));
}
static LLIST_HEAD(delayed_mntput_list);
static void delayed_mntput(struct work_struct *unused)
{
struct llist_node *node = llist_del_all(&delayed_mntput_list);
struct mount *m, *t;
llist_for_each_entry_safe(m, t, node, mnt_llist)
cleanup_mnt(m);
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
static void mntput_no_expire(struct mount *mnt)
{
LIST_HEAD(list);
int count;
rcu_read_lock();
if (likely(READ_ONCE(mnt->mnt_ns))) {
/*
* Since we don't do lock_mount_hash() here,
* ->mnt_ns can change under us. However, if it's
* non-NULL, then there's a reference that won't
* be dropped until after an RCU delay done after
* turning ->mnt_ns NULL. So if we observe it
* non-NULL under rcu_read_lock(), the reference
* we are dropping is not the final one.
*/
mnt_add_count(mnt, -1);
rcu_read_unlock();
return;
}
lock_mount_hash();
/*
* make sure that if __legitimize_mnt() has not seen us grab
* mount_lock, we'll see their refcount increment here.
*/
smp_mb();
mnt_add_count(mnt, -1);
count = mnt_get_count(mnt);
if (count != 0) {
WARN_ON(count < 0);
rcu_read_unlock();
unlock_mount_hash();
return;
}
if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
rcu_read_unlock();
unlock_mount_hash();
return;
}
mnt->mnt.mnt_flags |= MNT_DOOMED;
rcu_read_unlock();
list_del(&mnt->mnt_instance);
if (unlikely(!list_empty(&mnt->mnt_mounts))) {
struct mount *p, *tmp;
list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { __put_mountpoint(unhash_mnt(p), &list);
hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
}
}
unlock_mount_hash();
shrink_dentry_list(&list);
if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
struct task_struct *task = current;
if (likely(!(task->flags & PF_KTHREAD))) {
init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
return;
}
if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
schedule_delayed_work(&delayed_mntput_work, 1);
return;
}
cleanup_mnt(mnt);
}
void mntput(struct vfsmount *mnt)
{
if (mnt) {
struct mount *m = real_mount(mnt);
/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
if (unlikely(m->mnt_expiry_mark)) m->mnt_expiry_mark = 0; mntput_no_expire(m);
}
}
EXPORT_SYMBOL(mntput);
struct vfsmount *mntget(struct vfsmount *mnt)
{
if (mnt) mnt_add_count(real_mount(mnt), 1); return mnt;
}
EXPORT_SYMBOL(mntget);
/**
* path_is_mountpoint() - Check if path is a mount in the current namespace.
* @path: path to check
*
* d_mountpoint() can only be used reliably to establish if a dentry is
* not mounted in any namespace and that common case is handled inline.
* d_mountpoint() isn't aware of the possibility there may be multiple
* mounts using a given dentry in a different namespace. This function
* checks if the passed in path is a mountpoint rather than the dentry
* alone.
*/
bool path_is_mountpoint(const struct path *path)
{
unsigned seq;
bool res;
if (!d_mountpoint(path->dentry))
return false;
rcu_read_lock();
do {
seq = read_seqbegin(&mount_lock);
res = __path_is_mountpoint(path);
} while (read_seqretry(&mount_lock, seq));
rcu_read_unlock();
return res;
}
EXPORT_SYMBOL(path_is_mountpoint);
struct vfsmount *mnt_clone_internal(const struct path *path)
{
struct mount *p;
p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
if (IS_ERR(p))
return ERR_CAST(p);
p->mnt.mnt_flags |= MNT_INTERNAL;
return &p->mnt;
}
#ifdef CONFIG_PROC_FS
static struct mount *mnt_list_next(struct mnt_namespace *ns,
struct list_head *p)
{
struct mount *mnt, *ret = NULL;
lock_ns_list(ns);
list_for_each_continue(p, &ns->list) {
mnt = list_entry(p, typeof(*mnt), mnt_list);
if (!mnt_is_cursor(mnt)) {
ret = mnt;
break;
}
}
unlock_ns_list(ns);
return ret;
}
/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_mounts *p = m->private;
struct list_head *prev;
down_read(&namespace_sem);
if (!*pos) {
prev = &p->ns->list;
} else {
prev = &p->cursor.mnt_list;
/* Read after we'd reached the end? */
if (list_empty(prev))
return NULL;
}
return mnt_list_next(p->ns, prev);
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
struct proc_mounts *p = m->private;
struct mount *mnt = v;
++*pos;
return mnt_list_next(p->ns, &mnt->mnt_list);
}
static void m_stop(struct seq_file *m, void *v)
{
struct proc_mounts *p = m->private;
struct mount *mnt = v;
lock_ns_list(p->ns);
if (mnt)
list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
else
list_del_init(&p->cursor.mnt_list);
unlock_ns_list(p->ns);
up_read(&namespace_sem);
}
static int m_show(struct seq_file *m, void *v)
{
struct proc_mounts *p = m->private;
struct mount *r = v;
return p->show(m, &r->mnt);
}
const struct seq_operations mounts_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = m_show,
};
void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
{
down_read(&namespace_sem);
lock_ns_list(ns);
list_del(&cursor->mnt_list);
unlock_ns_list(ns);
up_read(&namespace_sem);
}
#endif /* CONFIG_PROC_FS */
/**
* may_umount_tree - check if a mount tree is busy
* @m: root of mount tree
*
* This is called to check if a tree of mounts has any
* open files, pwds, chroots or sub mounts that are
* busy.
*/
int may_umount_tree(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
int actual_refs = 0;
int minimum_refs = 0;
struct mount *p;
BUG_ON(!m);
/* write lock needed for mnt_get_count */
lock_mount_hash();
for (p = mnt; p; p = next_mnt(p, mnt)) {
actual_refs += mnt_get_count(p);
minimum_refs += 2;
}
unlock_mount_hash();
if (actual_refs > minimum_refs)
return 0;
return 1;
}
EXPORT_SYMBOL(may_umount_tree);
/**
* may_umount - check if a mount point is busy
* @mnt: root of mount
*
* This is called to check if a mount point has any
* open files, pwds, chroots or sub mounts. If the
* mount has sub mounts this will return busy
* regardless of whether the sub mounts are busy.
*
* Doesn't take quota and stuff into account. IOW, in some cases it will
* give false negatives. The main reason why it's here is that we need
* a non-destructive way to look for easily umountable filesystems.
*/
int may_umount(struct vfsmount *mnt)
{
int ret = 1;
down_read(&namespace_sem);
lock_mount_hash();
if (propagate_mount_busy(real_mount(mnt), 2))
ret = 0;
unlock_mount_hash();
up_read(&namespace_sem);
return ret;
}
EXPORT_SYMBOL(may_umount);
static void namespace_unlock(void)
{
struct hlist_head head;
struct hlist_node *p;
struct mount *m;
LIST_HEAD(list);
hlist_move_list(&unmounted, &head);
list_splice_init(&ex_mountpoints, &list);
up_write(&namespace_sem);
shrink_dentry_list(&list);
if (likely(hlist_empty(&head)))
return; synchronize_rcu_expedited(); hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
hlist_del(&m->mnt_umount);
mntput(&m->mnt);
}
}
static inline void namespace_lock(void)
{
down_write(&namespace_sem);
}
enum umount_tree_flags {
UMOUNT_SYNC = 1,
UMOUNT_PROPAGATE = 2,
UMOUNT_CONNECTED = 4,
};
static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
{
/* Leaving mounts connected is only valid for lazy umounts */
if (how & UMOUNT_SYNC)
return true;
/* A mount without a parent has nothing to be connected to */
if (!mnt_has_parent(mnt))
return true;
/* Because the reference counting rules change when mounts are
* unmounted and connected, umounted mounts may not be
* connected to mounted mounts.
*/
if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
return true;
/* Has it been requested that the mount remain connected? */
if (how & UMOUNT_CONNECTED)
return false;
/* Is the mount locked such that it needs to remain connected? */
if (IS_MNT_LOCKED(mnt))
return false;
/* By default disconnect the mount */
return true;
}
/*
* mount_lock must be held
* namespace_sem must be held for write
*/
static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
{
LIST_HEAD(tmp_list);
struct mount *p;
if (how & UMOUNT_PROPAGATE)
propagate_mount_unlock(mnt);
/* Gather the mounts to umount */
for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT;
list_move(&p->mnt_list, &tmp_list);
}
/* Hide the mounts from mnt_mounts */
list_for_each_entry(p, &tmp_list, mnt_list) { list_del_init(&p->mnt_child);
}
/* Add propogated mounts to the tmp_list */
if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); while (!list_empty(&tmp_list)) {
struct mnt_namespace *ns;
bool disconnect;
p = list_first_entry(&tmp_list, struct mount, mnt_list);
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
ns = p->mnt_ns;
if (ns) {
ns->mounts--;
__touch_mnt_namespace(ns);
}
p->mnt_ns = NULL;
if (how & UMOUNT_SYNC)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
disconnect = disconnect_mount(p, how);
if (mnt_has_parent(p)) {
mnt_add_count(p->mnt_parent, -1);
if (!disconnect) {
/* Don't forget about p */
list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
} else {
umount_mnt(p);
}
}
change_mnt_propagation(p, MS_PRIVATE);
if (disconnect)
hlist_add_head(&p->mnt_umount, &unmounted);
}
}
static void shrink_submounts(struct mount *mnt);
static int do_umount_root(struct super_block *sb)
{
int ret = 0;
down_write(&sb->s_umount);
if (!sb_rdonly(sb)) {
struct fs_context *fc;
fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
SB_RDONLY);
if (IS_ERR(fc)) {
ret = PTR_ERR(fc);
} else {
ret = parse_monolithic_mount_data(fc, NULL);
if (!ret)
ret = reconfigure_super(fc);
put_fs_context(fc);
}
}
up_write(&sb->s_umount);
return ret;
}
static int do_umount(struct mount *mnt, int flags)
{
struct super_block *sb = mnt->mnt.mnt_sb;
int retval;
retval = security_sb_umount(&mnt->mnt, flags);
if (retval)
return retval;
/*
* Allow userspace to request a mountpoint be expired rather than
* unmounting unconditionally. Unmount only happens if:
* (1) the mark is already set (the mark is cleared by mntput())
* (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
*/
if (flags & MNT_EXPIRE) {
if (&mnt->mnt == current->fs->root.mnt ||
flags & (MNT_FORCE | MNT_DETACH))
return -EINVAL;
/*
* probably don't strictly need the lock here if we examined
* all race cases, but it's a slowpath.
*/
lock_mount_hash();
if (mnt_get_count(mnt) != 2) {
unlock_mount_hash();
return -EBUSY;
}
unlock_mount_hash();
if (!xchg(&mnt->mnt_expiry_mark, 1))
return -EAGAIN;
}
/*
* If we may have to abort operations to get out of this
* mount, and they will themselves hold resources we must
* allow the fs to do things. In the Unix tradition of
* 'Gee thats tricky lets do it in userspace' the umount_begin
* might fail to complete on the first run through as other tasks
* must return, and the like. Thats for the mount program to worry
* about for the moment.
*/
if (flags & MNT_FORCE && sb->s_op->umount_begin) {
sb->s_op->umount_begin(sb);
}
/*
* No sense to grab the lock for this test, but test itself looks
* somewhat bogus. Suggestions for better replacement?
* Ho-hum... In principle, we might treat that as umount + switch
* to rootfs. GC would eventually take care of the old vfsmount.
* Actually it makes sense, especially if rootfs would contain a
* /reboot - static binary that would close all descriptors and
* call reboot(9). Then init(8) could umount root and exec /reboot.
*/
if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
/*
* Special case for "unmounting" root ...
* we just try to remount it readonly.
*/
if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
return do_umount_root(sb);
}
namespace_lock();
lock_mount_hash();
/* Recheck MNT_LOCKED with the locks held */
retval = -EINVAL;
if (mnt->mnt.mnt_flags & MNT_LOCKED)
goto out;
event++;
if (flags & MNT_DETACH) {
if (!list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
shrink_submounts(mnt);
retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) {
if (!list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0;
}
}
out:
unlock_mount_hash();
namespace_unlock();
return retval;
}
/*
* __detach_mounts - lazily unmount all mounts on the specified dentry
*
* During unlink, rmdir, and d_drop it is possible to loose the path
* to an existing mountpoint, and wind up leaking the mount.
* detach_mounts allows lazily unmounting those mounts instead of
* leaking them.
*
* The caller may hold dentry->d_inode->i_mutex.
*/
void __detach_mounts(struct dentry *dentry)
{
struct mountpoint *mp;
struct mount *mnt;
namespace_lock();
lock_mount_hash();
mp = lookup_mountpoint(dentry);
if (!mp)
goto out_unlock;
event++;
while (!hlist_empty(&mp->m_list)) {
mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
umount_mnt(mnt);
hlist_add_head(&mnt->mnt_umount, &unmounted);
}
else umount_tree(mnt, UMOUNT_CONNECTED);
}
put_mountpoint(mp);
out_unlock:
unlock_mount_hash();
namespace_unlock();
}
/*
* Is the caller allowed to modify his namespace?
*/
static inline bool may_mount(void)
{
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
static void warn_mandlock(void)
{
pr_warn_once("=======================================================\n"
"WARNING: The mand mount option has been deprecated and\n"
" and is ignored by this kernel. Remove the mand\n"
" option from the mount to silence this warning.\n"
"=======================================================\n");
}
static int can_umount(const struct path *path, int flags)
{
struct mount *mnt = real_mount(path->mnt);
if (!may_mount())
return -EPERM;
if (path->dentry != path->mnt->mnt_root)
return -EINVAL;
if (!check_mnt(mnt))
return -EINVAL;
if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
return -EINVAL;
if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
// caller is responsible for flags being sane
int path_umount(struct path *path, int flags)
{
struct mount *mnt = real_mount(path->mnt);
int ret;
ret = can_umount(path, flags);
if (!ret)
ret = do_umount(mnt, flags);
/* we mustn't call path_put() as that would clear mnt_expiry_mark */
dput(path->dentry);
mntput_no_expire(mnt);
return ret;
}
static int ksys_umount(char __user *name, int flags)
{
int lookup_flags = LOOKUP_MOUNTPOINT;
struct path path;
int ret;
// basic validity checks done first
if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
return -EINVAL;
if (!(flags & UMOUNT_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
if (ret)
return ret;
return path_umount(&path, flags);
}
SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
return ksys_umount(name, flags);
}
#ifdef __ARCH_WANT_SYS_OLDUMOUNT
/*
* The 2.0 compatible umount. No flags.
*/
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
return ksys_umount(name, 0);
}
#endif
static bool is_mnt_ns_file(struct dentry *dentry)
{
/* Is this a proxy for a mount namespace? */
return dentry->d_op == &ns_dentry_operations &&
dentry->d_fsdata == &mntns_operations;
}
static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
return container_of(ns, struct mnt_namespace, ns);
}
struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
{
return &mnt->ns;
}
static bool mnt_ns_loop(struct dentry *dentry)
{
/* Could bind mounting the mount namespace inode cause a
* mount namespace loop?
*/
struct mnt_namespace *mnt_ns;
if (!is_mnt_ns_file(dentry))
return false;
mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
}
struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
int flag)
{
struct mount *res, *p, *q, *r, *parent;
if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
return ERR_PTR(-EINVAL);
if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
return ERR_PTR(-EINVAL);
res = q = clone_mnt(mnt, dentry, flag);
if (IS_ERR(q))
return q;
q->mnt_mountpoint = mnt->mnt_mountpoint;
p = mnt;
list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
struct mount *s;
if (!is_subdir(r->mnt_mountpoint, dentry))
continue;
for (s = r; s; s = next_mnt(s, r)) { if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { if (s->mnt.mnt_flags & MNT_LOCKED) {
/* Both unbindable and locked. */
q = ERR_PTR(-EPERM);
goto out;
} else {
s = skip_mnt_tree(s);
continue;
}
}
if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(s->mnt.mnt_root)) {
s = skip_mnt_tree(s);
continue;
}
while (p != s->mnt_parent) { p = p->mnt_parent;
q = q->mnt_parent;
}
p = s;
parent = q;
q = clone_mnt(p, p->mnt.mnt_root, flag);
if (IS_ERR(q))
goto out;
lock_mount_hash();
list_add_tail(&q->mnt_list, &res->mnt_list);
attach_mnt(q, parent, p->mnt_mp);
unlock_mount_hash();
}
}
return res;
out:
if (res) {
lock_mount_hash();
umount_tree(res, UMOUNT_SYNC);
unlock_mount_hash();
}
return q;
}
/* Caller should check returned pointer for errors */
struct vfsmount *collect_mounts(const struct path *path)
{
struct mount *tree;
namespace_lock();
if (!check_mnt(real_mount(path->mnt)))
tree = ERR_PTR(-EINVAL);
else
tree = copy_tree(real_mount(path->mnt), path->dentry,
CL_COPY_ALL | CL_PRIVATE);
namespace_unlock();
if (IS_ERR(tree))
return ERR_CAST(tree);
return &tree->mnt;
}
static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
void dissolve_on_fput(struct vfsmount *mnt)
{
struct mnt_namespace *ns;
namespace_lock();
lock_mount_hash();
ns = real_mount(mnt)->mnt_ns;
if (ns) {
if (is_anon_ns(ns))
umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
else
ns = NULL;
}
unlock_mount_hash();
namespace_unlock();
if (ns)
free_mnt_ns(ns);
}
void drop_collected_mounts(struct vfsmount *mnt)
{
namespace_lock();
lock_mount_hash();
umount_tree(real_mount(mnt), 0);
unlock_mount_hash();
namespace_unlock();
}
static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
struct mount *child;
list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { if (!is_subdir(child->mnt_mountpoint, dentry))
continue;
if (child->mnt.mnt_flags & MNT_LOCKED)
return true;
}
return false;
}
/**
* clone_private_mount - create a private clone of a path
* @path: path to clone
*
* This creates a new vfsmount, which will be the clone of @path. The new mount
* will not be attached anywhere in the namespace and will be private (i.e.
* changes to the originating mount won't be propagated into this).
*
* Release with mntput().
*/
struct vfsmount *clone_private_mount(const struct path *path)
{
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
down_read(&namespace_sem);
if (IS_MNT_UNBINDABLE(old_mnt))
goto invalid;
if (!check_mnt(old_mnt))
goto invalid;
if (has_locked_children(old_mnt, path->dentry))
goto invalid;
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
up_read(&namespace_sem);
if (IS_ERR(new_mnt))
return ERR_CAST(new_mnt);
/* Longterm mount to be removed by kern_unmount*() */
new_mnt->mnt_ns = MNT_NS_INTERNAL;
return &new_mnt->mnt;
invalid:
up_read(&namespace_sem);
return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(clone_private_mount);
int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
struct vfsmount *root)
{
struct mount *mnt;
int res = f(root, arg);
if (res)
return res;
list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
res = f(&mnt->mnt, arg);
if (res)
return res;
}
return 0;
}
static void lock_mnt_tree(struct mount *mnt)
{
struct mount *p;
for (p = mnt; p; p = next_mnt(p, mnt)) {
int flags = p->mnt.mnt_flags;
/* Don't allow unprivileged users to change mount flags */
flags |= MNT_LOCK_ATIME;
if (flags & MNT_READONLY)
flags |= MNT_LOCK_READONLY;
if (flags & MNT_NODEV)
flags |= MNT_LOCK_NODEV;
if (flags & MNT_NOSUID)
flags |= MNT_LOCK_NOSUID;
if (flags & MNT_NOEXEC)
flags |= MNT_LOCK_NOEXEC;
/* Don't allow unprivileged users to reveal what is under a mount */
if (list_empty(&p->mnt_expire))
flags |= MNT_LOCKED;
p->mnt.mnt_flags = flags;
}
}
static void cleanup_group_ids(struct mount *mnt, struct mount *end)
{
struct mount *p;
for (p = mnt; p != end; p = next_mnt(p, mnt)) {
if (p->mnt_group_id && !IS_MNT_SHARED(p))
mnt_release_group_id(p);
}
}
static int invent_group_ids(struct mount *mnt, bool recurse)
{
struct mount *p;
for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
int err = mnt_alloc_group_id(p);
if (err) {
cleanup_group_ids(mnt, p);
return err;
}
}
}
return 0;
}
int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
{
unsigned int max = READ_ONCE(sysctl_mount_max);
unsigned int mounts = 0, old, pending, sum;
struct mount *p;
for (p = mnt; p; p = next_mnt(p, mnt))
mounts++; old = ns->mounts;
pending = ns->pending_mounts;
sum = old + pending;
if ((old > sum) || (pending > sum) ||
(max < sum) ||
(mounts > (max - sum)))
return -ENOSPC;
ns->pending_mounts = pending + mounts; return 0;
}
/*
* @source_mnt : mount tree to be attached
* @nd : place the mount tree @source_mnt is attached
* @parent_nd : if non-null, detach the source_mnt from its parent and
* store the parent mount and mountpoint dentry.
* (done when source_mnt is moved)
*
* NOTE: in the table below explains the semantics when a source mount
* of a given type is attached to a destination mount of a given type.
* ---------------------------------------------------------------------------
* | BIND MOUNT OPERATION |
* |**************************************************************************
* | source-->| shared | private | slave | unbindable |
* | dest | | | | |
* | | | | | | |
* | v | | | | |
* |**************************************************************************
* | shared | shared (++) | shared (+) | shared(+++)| invalid |
* | | | | | |
* |non-shared| shared (+) | private | slave (*) | invalid |
* ***************************************************************************
* A bind operation clones the source mount and mounts the clone on the
* destination mount.
*
* (++) the cloned mount is propagated to all the mounts in the propagation
* tree of the destination mount and the cloned mount is added to
* the peer group of the source mount.
* (+) the cloned mount is created under the destination mount and is marked
* as shared. The cloned mount is added to the peer group of the source
* mount.
* (+++) the mount is propagated to all the mounts in the propagation tree
* of the destination mount and the cloned mount is made slave
* of the same master as that of the source mount. The cloned mount
* is marked as 'shared and slave'.
* (*) the cloned mount is made a slave of the same master as that of the
* source mount.
*
* ---------------------------------------------------------------------------
* | MOVE MOUNT OPERATION |
* |**************************************************************************
* | source-->| shared | private | slave | unbindable |
* | dest | | | | |
* | | | | | | |
* | v | | | | |
* |**************************************************************************
* | shared | shared (+) | shared (+) | shared(+++) | invalid |
* | | | | | |
* |non-shared| shared (+*) | private | slave (*) | unbindable |
* ***************************************************************************
*
* (+) the mount is moved to the destination. And is then propagated to
* all the mounts in the propagation tree of the destination mount.
* (+*) the mount is moved to the destination.
* (+++) the mount is moved to the destination and is then propagated to
* all the mounts belonging to the destination mount's propagation tree.
* the mount is marked as 'shared and slave'.
* (*) the mount continues to be a slave at the new location.
*
* if the source mount is a tree, the operations explained above is
* applied to each mount in the tree.
* Must be called without spinlocks held, since this function can sleep
* in allocations.
*/
static int attach_recursive_mnt(struct mount *source_mnt,
struct mount *dest_mnt,
struct mountpoint *dest_mp,
bool moving)
{
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
HLIST_HEAD(tree_list);
struct mnt_namespace *ns = dest_mnt->mnt_ns;
struct mountpoint *smp;
struct mount *child, *p;
struct hlist_node *n;
int err;
/* Preallocate a mountpoint in case the new mounts need
* to be tucked under other mounts.
*/
smp = get_mountpoint(source_mnt->mnt.mnt_root);
if (IS_ERR(smp))
return PTR_ERR(smp);
/* Is there space to add these mounts to the mount namespace? */
if (!moving) { err = count_mounts(ns, source_mnt);
if (err)
goto out;
}
if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true);
if (err)
goto out;
err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
lock_mount_hash();
if (err)
goto out_cleanup_ids;
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
set_mnt_shared(p);
} else {
lock_mount_hash();
}
if (moving) { unhash_mnt(source_mnt);
attach_mnt(source_mnt, dest_mnt, dest_mp);
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
if (source_mnt->mnt_ns) {
/* move from anon - the caller will destroy */
list_del_init(&source_mnt->mnt_ns->list);
}
mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
commit_tree(source_mnt);
}
hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
struct mount *q;
hlist_del_init(&child->mnt_hash);
q = __lookup_mnt(&child->mnt_parent->mnt,
child->mnt_mountpoint);
if (q)
mnt_change_mountpoint(child, smp, q);
/* Notice when we are propagating across user namespaces */
if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); child->mnt.mnt_flags &= ~MNT_LOCKED;
commit_tree(child);
}
put_mountpoint(smp);
unlock_mount_hash();
return 0;
out_cleanup_ids:
while (!hlist_empty(&tree_list)) {
child = hlist_entry(tree_list.first, struct mount, mnt_hash);
child->mnt_parent->mnt_ns->pending_mounts = 0;
umount_tree(child, UMOUNT_SYNC);
}
unlock_mount_hash();
cleanup_group_ids(source_mnt, NULL);
out:
ns->pending_mounts = 0;
read_seqlock_excl(&mount_lock);
put_mountpoint(smp);
read_sequnlock_excl(&mount_lock);
return err;
}
static struct mountpoint *lock_mount(struct path *path)
{
struct vfsmount *mnt;
struct dentry *dentry = path->dentry;
retry:
inode_lock(dentry->d_inode);
if (unlikely(cant_mount(dentry))) {
inode_unlock(dentry->d_inode);
return ERR_PTR(-ENOENT);
}
namespace_lock();
mnt = lookup_mnt(path);
if (likely(!mnt)) {
struct mountpoint *mp = get_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock();
inode_unlock(dentry->d_inode);
return mp;
}
return mp;
}
namespace_unlock();
inode_unlock(path->dentry->d_inode);
path_put(path);
path->mnt = mnt;
dentry = path->dentry = dget(mnt->mnt_root);
goto retry;
}
static void unlock_mount(struct mountpoint *where)
{
struct dentry *dentry = where->m_dentry;
read_seqlock_excl(&mount_lock);
put_mountpoint(where);
read_sequnlock_excl(&mount_lock);
namespace_unlock();
inode_unlock(dentry->d_inode);
}
static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
{
if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
return -EINVAL;
if (d_is_dir(mp->m_dentry) !=
d_is_dir(mnt->mnt.mnt_root))
return -ENOTDIR;
return attach_recursive_mnt(mnt, p, mp, false);
}
/*
* Sanity check the flags to change_mnt_propagation.
*/
static int flags_to_propagation_type(int ms_flags)
{
int type = ms_flags & ~(MS_REC | MS_SILENT);
/* Fail if any non-propagation flags are set */
if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
return 0;
/* Only one propagation flag should be set */
if (!is_power_of_2(type))
return 0;
return type;
}
/*
* recursively change the type of the mountpoint.
*/
static int do_change_type(struct path *path, int ms_flags)
{
struct mount *m;
struct mount *mnt = real_mount(path->mnt);
int recurse = ms_flags & MS_REC;
int type;
int err = 0;
if (path->dentry != path->mnt->mnt_root)
return -EINVAL;
type = flags_to_propagation_type(ms_flags);
if (!type)
return -EINVAL;
namespace_lock();
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
if (err)
goto out_unlock;
}
lock_mount_hash();
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
change_mnt_propagation(m, type);
unlock_mount_hash();
out_unlock:
namespace_unlock();
return err;
}
static struct mount *__do_loopback(struct path *old_path, int recurse)
{
struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
if (IS_MNT_UNBINDABLE(old)) return mnt; if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
return mnt;
if (!recurse && has_locked_children(old, old_path->dentry))
return mnt;
if (recurse)
mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
else
mnt = clone_mnt(old, old_path->dentry, 0);
if (!IS_ERR(mnt))
mnt->mnt.mnt_flags &= ~MNT_LOCKED;
return mnt;
}
/*
* do loopback mount.
*/
static int do_loopback(struct path *path, const char *old_name,
int recurse)
{
struct path old_path;
struct mount *mnt = NULL, *parent;
struct mountpoint *mp;
int err;
if (!old_name || !*old_name)
return -EINVAL;
err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
if (err)
return err;
err = -EINVAL;
if (mnt_ns_loop(old_path.dentry))
goto out;
mp = lock_mount(path);
if (IS_ERR(mp)) {
err = PTR_ERR(mp);
goto out;
}
parent = real_mount(path->mnt);
if (!check_mnt(parent))
goto out2;
mnt = __do_loopback(&old_path, recurse);
if (IS_ERR(mnt)) {
err = PTR_ERR(mnt);
goto out2;
}
err = graft_tree(mnt, parent, mp);
if (err) {
lock_mount_hash();
umount_tree(mnt, UMOUNT_SYNC);
unlock_mount_hash();
}
out2:
unlock_mount(mp);
out:
path_put(&old_path);
return err;
}
static struct file *open_detached_copy(struct path *path, bool recursive)
{
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
struct mount *mnt, *p;
struct file *file;
if (IS_ERR(ns))
return ERR_CAST(ns);
namespace_lock();
mnt = __do_loopback(path, recursive);
if (IS_ERR(mnt)) {
namespace_unlock();
free_mnt_ns(ns);
return ERR_CAST(mnt);
}
lock_mount_hash();
for (p = mnt; p; p = next_mnt(p, mnt)) {
p->mnt_ns = ns;
ns->mounts++;
}
ns->root = mnt;
list_add_tail(&ns->list, &mnt->mnt_list);
mntget(&mnt->mnt);
unlock_mount_hash();
namespace_unlock();
mntput(path->mnt);
path->mnt = &mnt->mnt;
file = dentry_open(path, O_PATH, current_cred());
if (IS_ERR(file))
dissolve_on_fput(path->mnt);
else
file->f_mode |= FMODE_NEED_UNMOUNT;
return file;
}
SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
{
struct file *file;
struct path path;
int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
bool detached = flags & OPEN_TREE_CLONE;
int error;
int fd;
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
OPEN_TREE_CLOEXEC))
return -EINVAL;
if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
return -EINVAL;
if (flags & AT_NO_AUTOMOUNT)
lookup_flags &= ~LOOKUP_AUTOMOUNT;
if (flags & AT_SYMLINK_NOFOLLOW)
lookup_flags &= ~LOOKUP_FOLLOW;
if (flags & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
if (detached && !may_mount())
return -EPERM;
fd = get_unused_fd_flags(flags & O_CLOEXEC);
if (fd < 0)
return fd;
error = user_path_at(dfd, filename, lookup_flags, &path);
if (unlikely(error)) {
file = ERR_PTR(error);
} else {
if (detached)
file = open_detached_copy(&path, flags & AT_RECURSIVE);
else
file = dentry_open(&path, O_PATH, current_cred());
path_put(&path);
}
if (IS_ERR(file)) {
put_unused_fd(fd);
return PTR_ERR(file);
}
fd_install(fd, file);
return fd;
}
/*
* Don't allow locked mount flags to be cleared.
*
* No locks need to be held here while testing the various MNT_LOCK
* flags because those flags can never be cleared once they are set.
*/
static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
{
unsigned int fl = mnt->mnt.mnt_flags;
if ((fl & MNT_LOCK_READONLY) &&
!(mnt_flags & MNT_READONLY)) return false; if ((fl & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV))
return false;
if ((fl & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID))
return false;
if ((fl & MNT_LOCK_NOEXEC) && !(mnt_flags & MNT_NOEXEC))
return false;
if ((fl & MNT_LOCK_ATIME) && ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
return false;
return true;
}
static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
{
bool readonly_request = (mnt_flags & MNT_READONLY);
if (readonly_request == __mnt_is_readonly(&mnt->mnt))
return 0;
if (readonly_request)
return mnt_make_readonly(mnt);
mnt->mnt.mnt_flags &= ~MNT_READONLY;
return 0;
}
static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
{
mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
mnt->mnt.mnt_flags = mnt_flags;
touch_mnt_namespace(mnt->mnt_ns);
}
static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
{
struct super_block *sb = mnt->mnt_sb;
if (!__mnt_is_readonly(mnt) &&
(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf = (char *)__get_free_page(GFP_KERNEL);
char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
struct tm tm;
time64_to_tm(sb->s_time_max, 0, &tm);
pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n",
sb->s_type->name,
is_mounted(mnt) ? "remounted" : "mounted",
mntpath,
tm.tm_year+1900, (unsigned long long)sb->s_time_max);
free_page((unsigned long)buf);
}
}
/*
* Handle reconfiguration of the mountpoint only without alteration of the
* superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND
* to mount(2).
*/
static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
{
struct super_block *sb = path->mnt->mnt_sb;
struct mount *mnt = real_mount(path->mnt);
int ret;
if (!check_mnt(mnt))
return -EINVAL;
if (path->dentry != mnt->mnt.mnt_root)
return -EINVAL;
if (!can_change_locked_flags(mnt, mnt_flags))
return -EPERM;
/*
* We're only checking whether the superblock is read-only not
* changing it, so only take down_read(&sb->s_umount).
*/
down_read(&sb->s_umount);
lock_mount_hash();
ret = change_mount_ro_state(mnt, mnt_flags);
if (ret == 0)
set_mount_attributes(mnt, mnt_flags);
unlock_mount_hash();
up_read(&sb->s_umount);
mnt_warn_timestamp_expiry(path, &mnt->mnt);
return ret;
}
/*
* change filesystem flags. dir should be a physical root of filesystem.
* If you've mounted a non-root directory somewhere and want to do remount
* on it - tough luck.
*/
static int do_remount(struct path *path, int ms_flags, int sb_flags,
int mnt_flags, void *data)
{
int err;
struct super_block *sb = path->mnt->mnt_sb;
struct mount *mnt = real_mount(path->mnt);
struct fs_context *fc;
if (!check_mnt(mnt))
return -EINVAL;
if (path->dentry != path->mnt->mnt_root)
return -EINVAL;
if (!can_change_locked_flags(mnt, mnt_flags))
return -EPERM;
fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
if (IS_ERR(fc))
return PTR_ERR(fc);
fc->oldapi = true;
err = parse_monolithic_mount_data(fc, data);
if (!err) {
down_write(&sb->s_umount);
err = -EPERM;
if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
err = reconfigure_super(fc);
if (!err) {
lock_mount_hash();
set_mount_attributes(mnt, mnt_flags);
unlock_mount_hash();
}
}
up_write(&sb->s_umount);
}
mnt_warn_timestamp_expiry(path, &mnt->mnt);
put_fs_context(fc);
return err;
}
static inline int tree_contains_unbindable(struct mount *mnt)
{
struct mount *p;
for (p = mnt; p; p = next_mnt(p, mnt)) { if (IS_MNT_UNBINDABLE(p))
return 1;
}
return 0;
}
/*
* Check that there aren't references to earlier/same mount namespaces in the
* specified subtree. Such references can act as pins for mount namespaces
* that aren't checked by the mount-cycle checking code, thereby allowing
* cycles to be made.
*/
static bool check_for_nsfs_mounts(struct mount *subtree)
{
struct mount *p;
bool ret = false;
lock_mount_hash();
for (p = subtree; p; p = next_mnt(p, subtree))
if (mnt_ns_loop(p->mnt.mnt_root))
goto out;
ret = true;
out:
unlock_mount_hash();
return ret;
}
static int do_set_group(struct path *from_path, struct path *to_path)
{
struct mount *from, *to;
int err;
from = real_mount(from_path->mnt);
to = real_mount(to_path->mnt);
namespace_lock();
err = -EINVAL;
/* To and From must be mounted */
if (!is_mounted(&from->mnt))
goto out;
if (!is_mounted(&to->mnt))
goto out;
err = -EPERM;
/* We should be allowed to modify mount namespaces of both mounts */
if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
goto out;
if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
goto out;
err = -EINVAL;
/* To and From paths should be mount roots */
if (from_path->dentry != from_path->mnt->mnt_root)
goto out;
if (to_path->dentry != to_path->mnt->mnt_root)
goto out;
/* Setting sharing groups is only allowed across same superblock */
if (from->mnt.mnt_sb != to->mnt.mnt_sb)
goto out;
/* From mount root should be wider than To mount root */
if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
goto out;
/* From mount should not have locked children in place of To's root */
if (has_locked_children(from, to->mnt.mnt_root))
goto out;
/* Setting sharing groups is only allowed on private mounts */
if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
goto out;
/* From should not be private */
if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
goto out;
if (IS_MNT_SLAVE(from)) {
struct mount *m = from->mnt_master;
list_add(&to->mnt_slave, &m->mnt_slave_list);
to->mnt_master = m;
}
if (IS_MNT_SHARED(from)) {
to->mnt_group_id = from->mnt_group_id;
list_add(&to->mnt_share, &from->mnt_share);
lock_mount_hash();
set_mnt_shared(to);
unlock_mount_hash();
}
err = 0;
out:
namespace_unlock();
return err;
}
static int do_move_mount(struct path *old_path, struct path *new_path)
{
struct mnt_namespace *ns;
struct mount *p;
struct mount *old;
struct mount *parent;
struct mountpoint *mp, *old_mp;
int err;
bool attached;
mp = lock_mount(new_path);
if (IS_ERR(mp))
return PTR_ERR(mp);
old = real_mount(old_path->mnt); p = real_mount(new_path->mnt);
parent = old->mnt_parent;
attached = mnt_has_parent(old);
old_mp = old->mnt_mp;
ns = old->mnt_ns;
err = -EINVAL;
/* The mountpoint must be in our namespace. */
if (!check_mnt(p))
goto out;
/* The thing moved must be mounted... */
if (!is_mounted(&old->mnt))
goto out;
/* ... and either ours or the root of anon namespace */
if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
goto out;
if (old->mnt.mnt_flags & MNT_LOCKED)
goto out;
if (old_path->dentry != old_path->mnt->mnt_root)
goto out;
if (d_is_dir(new_path->dentry) !=
d_is_dir(old_path->dentry))
goto out;
/*
* Don't move a mount residing in a shared parent.
*/
if (attached && IS_MNT_SHARED(parent))
goto out;
/*
* Don't move a mount tree containing unbindable mounts to a destination
* mount which is shared.
*/
if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
goto out;
err = -ELOOP;
if (!check_for_nsfs_mounts(old))
goto out;
for (; mnt_has_parent(p); p = p->mnt_parent)
if (p == old)
goto out;
err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp,
attached);
if (err)
goto out;
/* if the mount is moved, it should no longer be expire
* automatically */
list_del_init(&old->mnt_expire);
if (attached)
put_mountpoint(old_mp);
out:
unlock_mount(mp); if (!err) {
if (attached)
mntput_no_expire(parent);
else
free_mnt_ns(ns);
}
return err;
}
static int do_move_mount_old(struct path *path, const char *old_name)
{
struct path old_path;
int err;
if (!old_name || !*old_name)
return -EINVAL;
err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err)
return err;
err = do_move_mount(&old_path, path);
path_put(&old_path);
return err;
}
/*
* add a mount into a namespace's mount tree
*/
static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
struct path *path, int mnt_flags)
{
struct mount *parent = real_mount(path->mnt);
mnt_flags &= ~MNT_INTERNAL_FLAGS;
if (unlikely(!check_mnt(parent))) {
/* that's acceptable only for automounts done in private ns */
if (!(mnt_flags & MNT_SHRINKABLE))
return -EINVAL;
/* ... and for those we'd better have mountpoint still alive */
if (!parent->mnt_ns)
return -EINVAL;
}
/* Refuse the same filesystem on the same mount point */
if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path->mnt->mnt_root == path->dentry)
return -EBUSY;
if (d_is_symlink(newmnt->mnt.mnt_root))
return -EINVAL;
newmnt->mnt.mnt_flags = mnt_flags; return graft_tree(newmnt, parent, mp);
}
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
/*
* Create a new mount using a superblock configuration and request it
* be added to the namespace tree.
*/
static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
unsigned int mnt_flags)
{
struct vfsmount *mnt;
struct mountpoint *mp;
struct super_block *sb = fc->root->d_sb;
int error;
error = security_sb_kern_mount(sb);
if (!error && mount_too_revealing(sb, &mnt_flags))
error = -EPERM;
if (unlikely(error)) {
fc_drop_locked(fc);
return error;
}
up_write(&sb->s_umount);
mnt = vfs_create_mount(fc);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
mnt_warn_timestamp_expiry(mountpoint, mnt);
mp = lock_mount(mountpoint);
if (IS_ERR(mp)) {
mntput(mnt);
return PTR_ERR(mp);
}
error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
unlock_mount(mp);
if (error < 0)
mntput(mnt);
return error;
}
/*
* create a new mount for userspace and request it to be added into the
* namespace's tree
*/
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
struct fs_context *fc;
const char *subtype = NULL;
int err = 0;
if (!fstype)
return -EINVAL;
type = get_fs_type(fstype);
if (!type)
return -ENODEV;
if (type->fs_flags & FS_HAS_SUBTYPE) { subtype = strchr(fstype, '.');
if (subtype) {
subtype++;
if (!*subtype) {
put_filesystem(type);
return -EINVAL;
}
}
}
fc = fs_context_for_mount(type, sb_flags);
put_filesystem(type);
if (IS_ERR(fc))
return PTR_ERR(fc);
if (subtype) err = vfs_parse_fs_string(fc, "subtype",
subtype, strlen(subtype));
if (!err && name) err = vfs_parse_fs_string(fc, "source", name, strlen(name));
if (!err)
err = parse_monolithic_mount_data(fc, data); if (!err && !mount_capable(fc))
err = -EPERM;
if (!err)
err = vfs_get_tree(fc);
if (!err)
err = do_new_mount_fc(fc, path, mnt_flags); put_fs_context(fc);
return err;
}
int finish_automount(struct vfsmount *m, struct path *path)
{
struct dentry *dentry = path->dentry;
struct mountpoint *mp;
struct mount *mnt;
int err;
if (!m)
return 0;
if (IS_ERR(m))
return PTR_ERR(m);
mnt = real_mount(m);
/* The new mount record should have at least 2 refs to prevent it being
* expired before we get a chance to add it
*/
BUG_ON(mnt_get_count(mnt) < 2);
if (m->mnt_sb == path->mnt->mnt_sb &&
m->mnt_root == dentry) {
err = -ELOOP;
goto discard;
}
/*
* we don't want to use lock_mount() - in this case finding something
* that overmounts our mountpoint to be means "quitely drop what we've
* got", not "try to mount it on top".
*/
inode_lock(dentry->d_inode);
namespace_lock();
if (unlikely(cant_mount(dentry))) {
err = -ENOENT;
goto discard_locked;
}
rcu_read_lock();
if (unlikely(__lookup_mnt(path->mnt, dentry))) {
rcu_read_unlock();
err = 0;
goto discard_locked;
}
rcu_read_unlock();
mp = get_mountpoint(dentry);
if (IS_ERR(mp)) {
err = PTR_ERR(mp);
goto discard_locked;
}
err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
unlock_mount(mp);
if (unlikely(err))
goto discard;
mntput(m);
return 0;
discard_locked:
namespace_unlock();
inode_unlock(dentry->d_inode);
discard:
/* remove m from any expiration list it may be on */
if (!list_empty(&mnt->mnt_expire)) {
namespace_lock();
list_del_init(&mnt->mnt_expire);
namespace_unlock();
}
mntput(m);
mntput(m);
return err;
}
/**
* mnt_set_expiry - Put a mount on an expiration list
* @mnt: The mount to list.
* @expiry_list: The list to add the mount to.
*/
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
namespace_lock();
list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);
/*
* process a list of expirable mountpoints with the intent of discarding any
* mountpoints that aren't in use and haven't been touched since last we came
* here
*/
void mark_mounts_for_expiry(struct list_head *mounts)
{
struct mount *mnt, *next;
LIST_HEAD(graveyard);
if (list_empty(mounts))
return;
namespace_lock();
lock_mount_hash();
/* extract from the expiration list every vfsmount that matches the
* following criteria:
* - only referenced by its parent vfsmount
* - still marked for expiry (marked on the last call here; marks are
* cleared by mntput())
*/
list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
if (!xchg(&mnt->mnt_expiry_mark, 1) ||
propagate_mount_busy(mnt, 1))
continue;
list_move(&mnt->mnt_expire, &graveyard);
}
while (!list_empty(&graveyard)) {
mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
touch_mnt_namespace(mnt->mnt_ns);
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
}
unlock_mount_hash();
namespace_unlock();
}
EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
/*
* Ripoff of 'select_parent()'
*
* search the list of submounts for a given mountpoint, and move any
* shrinkable submounts to the 'graveyard' list.
*/
static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
struct mount *this_parent = parent;
struct list_head *next;
int found = 0;
repeat:
next = this_parent->mnt_mounts.next;
resume:
while (next != &this_parent->mnt_mounts) {
struct list_head *tmp = next;
struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
next = tmp->next;
if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
continue;
/*
* Descend a level if the d_mounts list is non-empty.
*/
if (!list_empty(&mnt->mnt_mounts)) {
this_parent = mnt;
goto repeat;
}
if (!propagate_mount_busy(mnt, 1)) {
list_move_tail(&mnt->mnt_expire, graveyard);
found++;
}
}
/*
* All done at this level ... ascend and resume the search
*/
if (this_parent != parent) {
next = this_parent->mnt_child.next;
this_parent = this_parent->mnt_parent;
goto resume;
}
return found;
}
/*
* process a list of expirable mountpoints with the intent of discarding any
* submounts of a specific parent mountpoint
*
* mount_lock must be held for write
*/
static void shrink_submounts(struct mount *mnt)
{
LIST_HEAD(graveyard);
struct mount *m;
/* extract submounts of 'mountpoint' from the expiration list */
while (select_submounts(mnt, &graveyard)) {
while (!list_empty(&graveyard)) {
m = list_first_entry(&graveyard, struct mount,
mnt_expire);
touch_mnt_namespace(m->mnt_ns);
umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
}
}
}
static void *copy_mount_options(const void __user * data)
{
char *copy;
unsigned left, offset;
if (!data)
return NULL;
copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!copy)
return ERR_PTR(-ENOMEM);
left = copy_from_user(copy, data, PAGE_SIZE);
/*
* Not all architectures have an exact copy_from_user(). Resort to
* byte at a time.
*/
offset = PAGE_SIZE - left;
while (left) {
char c;
if (get_user(c, (const char __user *)data + offset))
break;
copy[offset] = c;
left--;
offset++;
}
if (left == PAGE_SIZE) { kfree(copy);
return ERR_PTR(-EFAULT);
}
return copy;
}
static char *copy_mount_string(const void __user *data)
{
return data ? strndup_user(data, PATH_MAX) : NULL;
}
/*
* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
*
* data is a (void *) that can point to any structure up to
* PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
* information (or be NULL).
*
* Pre-0.97 versions of mount() didn't have a flags word.
* When the flags word was introduced its top half was required
* to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
* Therefore, if this magic number is present, it carries no information
* and must be discarded.
*/
int path_mount(const char *dev_name, struct path *path,
const char *type_page, unsigned long flags, void *data_page)
{
unsigned int mnt_flags = 0, sb_flags;
int ret;
/* Discard magic */
if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK;
/* Basic sanity checks */
if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; if (flags & MS_NOUSER)
return -EINVAL;
ret = security_sb_mount(dev_name, path, type_page, flags, data_page); if (ret)
return ret;
if (!may_mount())
return -EPERM;
if (flags & SB_MANDLOCK)
warn_mandlock();
/* Default to relatime unless overriden */
if (!(flags & MS_NOATIME))
mnt_flags |= MNT_RELATIME;
/* Separate the per-mountpoint flags */
if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; if (flags & MS_NODEV) mnt_flags |= MNT_NODEV; if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; if (flags & MS_NOATIME) mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; if (flags & MS_NOSYMFOLLOW) mnt_flags |= MNT_NOSYMFOLLOW;
/* The default atime for remount is preservation */
if ((flags & MS_REMOUNT) &&
((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
MS_STRICTATIME)) == 0)) {
mnt_flags &= ~MNT_ATIME_MASK;
mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
}
sb_flags = flags & (SB_RDONLY |
SB_SYNCHRONOUS |
SB_MANDLOCK |
SB_DIRSYNC |
SB_SILENT |
SB_POSIXACL |
SB_LAZYTIME |
SB_I_VERSION);
if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
return do_reconfigure_mnt(path, mnt_flags);
if (flags & MS_REMOUNT)
return do_remount(path, flags, sb_flags, mnt_flags, data_page);
if (flags & MS_BIND)
return do_loopback(path, dev_name, flags & MS_REC);
if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return do_change_type(path, flags); if (flags & MS_MOVE)
return do_move_mount_old(path, dev_name);
return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
data_page);
}
long do_mount(const char *dev_name, const char __user *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{
struct path path;
int ret;
ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
if (ret)
return ret; ret = path_mount(dev_name, &path, type_page, flags, data_page);
path_put(&path);
return ret;
}
static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
{
return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
}
static void dec_mnt_namespaces(struct ucounts *ucounts)
{
dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
}
static void free_mnt_ns(struct mnt_namespace *ns)
{
if (!is_anon_ns(ns)) ns_free_inum(&ns->ns); dec_mnt_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
kfree(ns);
}
/*
* Assign a sequence number so we can detect when we attempt to bind
* mount a reference to an older mount namespace into the current
* mount namespace, preventing reference counting loops. A 64bit
* number incrementing at 10Ghz will take 12,427 years to wrap which
* is effectively never, so we can ignore the possibility.
*/
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
struct mnt_namespace *new_ns;
struct ucounts *ucounts;
int ret;
ucounts = inc_mnt_namespaces(user_ns);
if (!ucounts)
return ERR_PTR(-ENOSPC);
new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns) {
dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
}
if (!anon) {
ret = ns_alloc_inum(&new_ns->ns);
if (ret) {
kfree(new_ns);
dec_mnt_namespaces(ucounts);
return ERR_PTR(ret);
}
}
new_ns->ns.ops = &mntns_operations;
if (!anon)
new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
refcount_set(&new_ns->ns.count, 1);
INIT_LIST_HEAD(&new_ns->list);
init_waitqueue_head(&new_ns->poll);
spin_lock_init(&new_ns->ns_lock);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
return new_ns;
}
__latent_entropy
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
struct mnt_namespace *new_ns;
struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
struct mount *p, *q;
struct mount *old;
struct mount *new;
int copy_flags;
BUG_ON(!ns);
if (likely(!(flags & CLONE_NEWNS))) {
get_mnt_ns(ns);
return ns;
}
old = ns->root;
new_ns = alloc_mnt_ns(user_ns, false);
if (IS_ERR(new_ns))
return new_ns;
namespace_lock();
/* First pass: copy the tree topology */
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
if (user_ns != ns->user_ns)
copy_flags |= CL_SHARED_TO_SLAVE;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
free_mnt_ns(new_ns);
return ERR_CAST(new);
}
if (user_ns != ns->user_ns) {
lock_mount_hash();
lock_mnt_tree(new);
unlock_mount_hash();
}
new_ns->root = new;
list_add_tail(&new_ns->list, &new->mnt_list);
/*
* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
* as belonging to new namespace. We have already acquired a private
* fs_struct, so tsk->fs->lock is not needed.
*/
p = old;
q = new;
while (p) {
q->mnt_ns = new_ns;
new_ns->mounts++;
if (new_fs) {
if (&p->mnt == new_fs->root.mnt) {
new_fs->root.mnt = mntget(&q->mnt);
rootmnt = &p->mnt;
}
if (&p->mnt == new_fs->pwd.mnt) {
new_fs->pwd.mnt = mntget(&q->mnt);
pwdmnt = &p->mnt;
}
}
p = next_mnt(p, old);
q = next_mnt(q, new);
if (!q)
break;
while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(p, old);
}
namespace_unlock();
if (rootmnt)
mntput(rootmnt);
if (pwdmnt)
mntput(pwdmnt);
return new_ns;
}
struct dentry *mount_subtree(struct vfsmount *m, const char *name)
{
struct mount *mnt = real_mount(m);
struct mnt_namespace *ns;
struct super_block *s;
struct path path;
int err;
ns = alloc_mnt_ns(&init_user_ns, true);
if (IS_ERR(ns)) {
mntput(m);
return ERR_CAST(ns);
}
mnt->mnt_ns = ns;
ns->root = mnt;
ns->mounts++;
list_add(&mnt->mnt_list, &ns->list);
err = vfs_path_lookup(m->mnt_root, m,
name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
put_mnt_ns(ns);
if (err)
return ERR_PTR(err);
/* trade a vfsmount reference for active sb one */
s = path.mnt->mnt_sb;
atomic_inc(&s->s_active);
mntput(path.mnt);
/* lock the sucker */
down_write(&s->s_umount);
/* ... and return the root of (sub)tree on it */
return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
char __user *, type, unsigned long, flags, void __user *, data)
{
int ret;
char *kernel_type;
char *kernel_dev;
void *options;
kernel_type = copy_mount_string(type);
ret = PTR_ERR(kernel_type);
if (IS_ERR(kernel_type))
goto out_type;
kernel_dev = copy_mount_string(dev_name);
ret = PTR_ERR(kernel_dev);
if (IS_ERR(kernel_dev))
goto out_dev;
options = copy_mount_options(data);
ret = PTR_ERR(options);
if (IS_ERR(options))
goto out_data;
ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
kfree(options);
out_data:
kfree(kernel_dev);
out_dev:
kfree(kernel_type);
out_type:
return ret;
}
#define FSMOUNT_VALID_FLAGS \
(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \
MOUNT_ATTR_NOSYMFOLLOW)
#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
#define MOUNT_SETATTR_PROPAGATION_FLAGS \
(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
{
unsigned int mnt_flags = 0;
if (attr_flags & MOUNT_ATTR_RDONLY)
mnt_flags |= MNT_READONLY;
if (attr_flags & MOUNT_ATTR_NOSUID)
mnt_flags |= MNT_NOSUID;
if (attr_flags & MOUNT_ATTR_NODEV)
mnt_flags |= MNT_NODEV;
if (attr_flags & MOUNT_ATTR_NOEXEC)
mnt_flags |= MNT_NOEXEC;
if (attr_flags & MOUNT_ATTR_NODIRATIME)
mnt_flags |= MNT_NODIRATIME;
if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW)
mnt_flags |= MNT_NOSYMFOLLOW;
return mnt_flags;
}
/*
* Create a kernel mount representation for a new, prepared superblock
* (specified by fs_fd) and attach to an open_tree-like file descriptor.
*/
SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
unsigned int, attr_flags)
{
struct mnt_namespace *ns;
struct fs_context *fc;
struct file *file;
struct path newmount;
struct mount *mnt;
struct fd f;
unsigned int mnt_flags = 0;
long ret;
if (!may_mount())
return -EPERM;
if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
return -EINVAL;
if (attr_flags & ~FSMOUNT_VALID_FLAGS)
return -EINVAL;
mnt_flags = attr_flags_to_mnt_flags(attr_flags);
switch (attr_flags & MOUNT_ATTR__ATIME) {
case MOUNT_ATTR_STRICTATIME:
break;
case MOUNT_ATTR_NOATIME:
mnt_flags |= MNT_NOATIME;
break;
case MOUNT_ATTR_RELATIME:
mnt_flags |= MNT_RELATIME;
break;
default:
return -EINVAL;
}
f = fdget(fs_fd);
if (!f.file)
return -EBADF;
ret = -EINVAL;
if (f.file->f_op != &fscontext_fops)
goto err_fsfd;
fc = f.file->private_data;
ret = mutex_lock_interruptible(&fc->uapi_mutex);
if (ret < 0)
goto err_fsfd;
/* There must be a valid superblock or we can't mount it */
ret = -EINVAL;
if (!fc->root)
goto err_unlock;
ret = -EPERM;
if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
pr_warn("VFS: Mount too revealing\n");
goto err_unlock;
}
ret = -EBUSY;
if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
goto err_unlock;
if (fc->sb_flags & SB_MANDLOCK)
warn_mandlock();
newmount.mnt = vfs_create_mount(fc);
if (IS_ERR(newmount.mnt)) {
ret = PTR_ERR(newmount.mnt);
goto err_unlock;
}
newmount.dentry = dget(fc->root);
newmount.mnt->mnt_flags = mnt_flags;
/* We've done the mount bit - now move the file context into more or
* less the same state as if we'd done an fspick(). We don't want to
* do any memory allocation or anything like that at this point as we
* don't want to have to handle any errors incurred.
*/
vfs_clean_context(fc);
ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
if (IS_ERR(ns)) {
ret = PTR_ERR(ns);
goto err_path;
}
mnt = real_mount(newmount.mnt);
mnt->mnt_ns = ns;
ns->root = mnt;
ns->mounts = 1;
list_add(&mnt->mnt_list, &ns->list);
mntget(newmount.mnt);
/* Attach to an apparent O_PATH fd with a note that we need to unmount
* it, not just simply put it.
*/
file = dentry_open(&newmount, O_PATH, fc->cred);
if (IS_ERR(file)) {
dissolve_on_fput(newmount.mnt);
ret = PTR_ERR(file);
goto err_path;
}
file->f_mode |= FMODE_NEED_UNMOUNT;
ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
if (ret >= 0)
fd_install(ret, file);
else
fput(file);
err_path:
path_put(&newmount);
err_unlock:
mutex_unlock(&fc->uapi_mutex);
err_fsfd:
fdput(f);
return ret;
}
/*
* Move a mount from one place to another. In combination with
* fsopen()/fsmount() this is used to install a new mount and in combination
* with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
* a mount subtree.
*
* Note the flags value is a combination of MOVE_MOUNT_* flags.
*/
SYSCALL_DEFINE5(move_mount,
int, from_dfd, const char __user *, from_pathname,
int, to_dfd, const char __user *, to_pathname,
unsigned int, flags)
{
struct path from_path, to_path;
unsigned int lflags;
int ret = 0;
if (!may_mount())
return -EPERM;
if (flags & ~MOVE_MOUNT__MASK)
return -EINVAL;
/* If someone gives a pathname, they aren't permitted to move
* from an fd that requires unmount as we can't get at the flag
* to clear it afterwards.
*/
lflags = 0;
if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW;
if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
if (ret < 0)
return ret;
lflags = 0;
if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW;
if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
if (ret < 0)
goto out_from;
ret = security_move_mount(&from_path, &to_path);
if (ret < 0)
goto out_to;
if (flags & MOVE_MOUNT_SET_GROUP)
ret = do_set_group(&from_path, &to_path);
else
ret = do_move_mount(&from_path, &to_path);
out_to:
path_put(&to_path);
out_from:
path_put(&from_path);
return ret;
}
/*
* Return true if path is reachable from root
*
* namespace_sem or mount_lock is held
*/
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
const struct path *root)
{
while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
dentry = mnt->mnt_mountpoint;
mnt = mnt->mnt_parent;
}
return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}
bool path_is_under(const struct path *path1, const struct path *path2)
{
bool res;
read_seqlock_excl(&mount_lock);
res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
read_sequnlock_excl(&mount_lock);
return res;
}
EXPORT_SYMBOL(path_is_under);
/*
* pivot_root Semantics:
* Moves the root file system of the current process to the directory put_old,
* makes new_root as the new root file system of the current process, and sets
* root/cwd of all processes which had them on the current root to new_root.
*
* Restrictions:
* The new_root and put_old must be directories, and must not be on the
* same file system as the current process root. The put_old must be
* underneath new_root, i.e. adding a non-zero number of /.. to the string
* pointed to by put_old must yield the same directory as new_root. No other
* file system may be mounted on put_old. After all, new_root is a mountpoint.
*
* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
* See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
* in this situation.
*
* Notes:
* - we don't move root/cwd if they are not at the root (reason: if something
* cared enough to change them, it's probably wrong to force them elsewhere)
* - it's okay to pick a root that isn't the root of a file system, e.g.
* /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
* though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
* first.
*/
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
const char __user *, put_old)
{
struct path new, old, root;
struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
struct mountpoint *old_mp, *root_mp;
int error;
if (!may_mount())
return -EPERM;
error = user_path_at(AT_FDCWD, new_root,
LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
if (error)
goto out0;
error = user_path_at(AT_FDCWD, put_old,
LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
if (error)
goto out1;
error = security_sb_pivotroot(&old, &new);
if (error)
goto out2;
get_fs_root(current->fs, &root);
old_mp = lock_mount(&old);
error = PTR_ERR(old_mp);
if (IS_ERR(old_mp))
goto out3;
error = -EINVAL;
new_mnt = real_mount(new.mnt);
root_mnt = real_mount(root.mnt);
old_mnt = real_mount(old.mnt);
ex_parent = new_mnt->mnt_parent;
root_parent = root_mnt->mnt_parent;
if (IS_MNT_SHARED(old_mnt) ||
IS_MNT_SHARED(ex_parent) ||
IS_MNT_SHARED(root_parent))
goto out4;
if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
goto out4;
if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
goto out4;
error = -ENOENT;
if (d_unlinked(new.dentry))
goto out4;
error = -EBUSY;
if (new_mnt == root_mnt || old_mnt == root_mnt)
goto out4; /* loop, on the same file system */
error = -EINVAL;
if (root.mnt->mnt_root != root.dentry)
goto out4; /* not a mountpoint */
if (!mnt_has_parent(root_mnt))
goto out4; /* not attached */
if (new.mnt->mnt_root != new.dentry)
goto out4; /* not a mountpoint */
if (!mnt_has_parent(new_mnt))
goto out4; /* not attached */
/* make sure we can reach put_old from new_root */
if (!is_path_reachable(old_mnt, old.dentry, &new))
goto out4;
/* make certain new is below the root */
if (!is_path_reachable(new_mnt, new.dentry, &root))
goto out4;
lock_mount_hash();
umount_mnt(new_mnt);
root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */
if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
new_mnt->mnt.mnt_flags |= MNT_LOCKED;
root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
}
/* mount old root on put_old */
attach_mnt(root_mnt, old_mnt, old_mp);
/* mount new_root on / */
attach_mnt(new_mnt, root_parent, root_mp);
mnt_add_count(root_parent, -1);
touch_mnt_namespace(current->nsproxy->mnt_ns);
/* A moved mount should not expire automatically */
list_del_init(&new_mnt->mnt_expire);
put_mountpoint(root_mp);
unlock_mount_hash();
chroot_fs_refs(&root, &new);
error = 0;
out4:
unlock_mount(old_mp);
if (!error)
mntput_no_expire(ex_parent);
out3:
path_put(&root);
out2:
path_put(&old);
out1:
path_put(&new);
out0:
return error;
}
static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
{
unsigned int flags = mnt->mnt.mnt_flags;
/* flags to clear */
flags &= ~kattr->attr_clr;
/* flags to raise */
flags |= kattr->attr_set;
return flags;
}
static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
struct vfsmount *m = &mnt->mnt;
if (!kattr->mnt_userns)
return 0;
/*
* Once a mount has been idmapped we don't allow it to change its
* mapping. It makes things simpler and callers can just create
* another bind-mount they can idmap if they want to.
*/
if (mnt_user_ns(m) != &init_user_ns)
return -EPERM;
/* The underlying filesystem doesn't support idmapped mounts yet. */
if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
return -EINVAL;
/* Don't yet support filesystem mountable in user namespaces. */
if (m->mnt_sb->s_user_ns != &init_user_ns)
return -EINVAL;
/* We're not controlling the superblock. */
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/* Mount has already been visible in the filesystem hierarchy. */
if (!is_anon_ns(mnt->mnt_ns))
return -EINVAL;
return 0;
}
static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
struct mount *mnt, int *err)
{
struct mount *m = mnt, *last = NULL;
if (!is_mounted(&m->mnt)) {
*err = -EINVAL;
goto out;
}
if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) {
*err = -EINVAL;
goto out;
}
do {
unsigned int flags;
flags = recalc_flags(kattr, m);
if (!can_change_locked_flags(m, flags)) {
*err = -EPERM;
goto out;
}
*err = can_idmap_mount(kattr, m);
if (*err)
goto out;
last = m;
if ((kattr->attr_set & MNT_READONLY) &&
!(m->mnt.mnt_flags & MNT_READONLY)) {
*err = mnt_hold_writers(m);
if (*err)
goto out;
}
} while (kattr->recurse && (m = next_mnt(m, mnt)));
out:
return last;
}
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
struct user_namespace *mnt_userns;
if (!kattr->mnt_userns)
return;
mnt_userns = get_user_ns(kattr->mnt_userns);
/* Pairs with smp_load_acquire() in mnt_user_ns(). */
smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
}
static void mount_setattr_commit(struct mount_kattr *kattr,
struct mount *mnt, struct mount *last,
int err)
{
struct mount *m = mnt;
do {
if (!err) {
unsigned int flags;
do_idmap_mount(kattr, m);
flags = recalc_flags(kattr, m);
WRITE_ONCE(m->mnt.mnt_flags, flags);
}
/*
* We either set MNT_READONLY above so make it visible
* before ~MNT_WRITE_HOLD or we failed to recursively
* apply mount options.
*/
if ((kattr->attr_set & MNT_READONLY) &&
(m->mnt.mnt_flags & MNT_WRITE_HOLD))
mnt_unhold_writers(m);
if (!err && kattr->propagation)
change_mnt_propagation(m, kattr->propagation);
/*
* On failure, only cleanup until we found the first mount
* we failed to handle.
*/
if (err && m == last)
break;
} while (kattr->recurse && (m = next_mnt(m, mnt)));
if (!err)
touch_mnt_namespace(mnt->mnt_ns);
}
static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
{
struct mount *mnt = real_mount(path->mnt), *last = NULL;
int err = 0;
if (path->dentry != mnt->mnt.mnt_root)
return -EINVAL;
if (kattr->propagation) {
/*
* Only take namespace_lock() if we're actually changing
* propagation.
*/
namespace_lock();
if (kattr->propagation == MS_SHARED) {
err = invent_group_ids(mnt, kattr->recurse);
if (err) {
namespace_unlock();
return err;
}
}
}
lock_mount_hash();
/*
* Get the mount tree in a shape where we can change mount
* properties without failure.
*/
last = mount_setattr_prepare(kattr, mnt, &err);
if (last) /* Commit all changes or revert to the old state. */
mount_setattr_commit(kattr, mnt, last, err);
unlock_mount_hash();
if (kattr->propagation) {
namespace_unlock();
if (err)
cleanup_group_ids(mnt, NULL);
}
return err;
}
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
struct mount_kattr *kattr, unsigned int flags)
{
int err = 0;
struct ns_common *ns;
struct user_namespace *mnt_userns;
struct file *file;
if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
return 0;
/*
* We currently do not support clearing an idmapped mount. If this ever
* is a use-case we can revisit this but for now let's keep it simple
* and not allow it.
*/
if (attr->attr_clr & MOUNT_ATTR_IDMAP)
return -EINVAL;
if (attr->userns_fd > INT_MAX)
return -EINVAL;
file = fget(attr->userns_fd);
if (!file)
return -EBADF;
if (!proc_ns_file(file)) {
err = -EINVAL;
goto out_fput;
}
ns = get_proc_ns(file_inode(file));
if (ns->ops->type != CLONE_NEWUSER) {
err = -EINVAL;
goto out_fput;
}
/*
* The init_user_ns is used to indicate that a vfsmount is not idmapped.
* This is simpler than just having to treat NULL as unmapped. Users
* wanting to idmap a mount to init_user_ns can just use a namespace
* with an identity mapping.
*/
mnt_userns = container_of(ns, struct user_namespace, ns);
if (mnt_userns == &init_user_ns) {
err = -EPERM;
goto out_fput;
}
kattr->mnt_userns = get_user_ns(mnt_userns);
out_fput:
fput(file);
return err;
}
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
struct mount_kattr *kattr, unsigned int flags)
{
unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
if (flags & AT_NO_AUTOMOUNT)
lookup_flags &= ~LOOKUP_AUTOMOUNT;
if (flags & AT_SYMLINK_NOFOLLOW)
lookup_flags &= ~LOOKUP_FOLLOW;
if (flags & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
*kattr = (struct mount_kattr) {
.lookup_flags = lookup_flags,
.recurse = !!(flags & AT_RECURSIVE),
};
if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
return -EINVAL;
if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
return -EINVAL;
kattr->propagation = attr->propagation;
if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
return -EINVAL;
kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
/*
* Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
* users wanting to transition to a different atime setting cannot
* simply specify the atime setting in @attr_set, but must also
* specify MOUNT_ATTR__ATIME in the @attr_clr field.
* So ensure that MOUNT_ATTR__ATIME can't be partially set in
* @attr_clr and that @attr_set can't have any atime bits set if
* MOUNT_ATTR__ATIME isn't set in @attr_clr.
*/
if (attr->attr_clr & MOUNT_ATTR__ATIME) {
if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
return -EINVAL;
/*
* Clear all previous time settings as they are mutually
* exclusive.
*/
kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
switch (attr->attr_set & MOUNT_ATTR__ATIME) {
case MOUNT_ATTR_RELATIME:
kattr->attr_set |= MNT_RELATIME;
break;
case MOUNT_ATTR_NOATIME:
kattr->attr_set |= MNT_NOATIME;
break;
case MOUNT_ATTR_STRICTATIME:
break;
default:
return -EINVAL;
}
} else {
if (attr->attr_set & MOUNT_ATTR__ATIME)
return -EINVAL;
}
return build_mount_idmapped(attr, usize, kattr, flags);
}
static void finish_mount_kattr(struct mount_kattr *kattr)
{
put_user_ns(kattr->mnt_userns);
kattr->mnt_userns = NULL;
}
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
unsigned int, flags, struct mount_attr __user *, uattr,
size_t, usize)
{
int err;
struct path target;
struct mount_attr attr;
struct mount_kattr kattr;
BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
if (flags & ~(AT_EMPTY_PATH |
AT_RECURSIVE |
AT_SYMLINK_NOFOLLOW |
AT_NO_AUTOMOUNT))
return -EINVAL;
if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
return -EINVAL;
if (!may_mount())
return -EPERM;
err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
if (err)
return err;
/* Don't bother walking through the mounts if this is a nop. */
if (attr.attr_set == 0 &&
attr.attr_clr == 0 &&
attr.propagation == 0)
return 0;
err = build_mount_kattr(&attr, usize, &kattr, flags);
if (err)
return err;
err = user_path_at(dfd, path, kattr.lookup_flags, &target);
if (!err) {
err = do_mount_setattr(&target, &kattr);
path_put(&target);
}
finish_mount_kattr(&kattr);
return err;
}
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
struct mount *m;
struct mnt_namespace *ns;
struct path root;
mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
if (IS_ERR(mnt))
panic("Can't create rootfs");
ns = alloc_mnt_ns(&init_user_ns, false);
if (IS_ERR(ns))
panic("Can't allocate initial namespace");
m = real_mount(mnt);
m->mnt_ns = ns;
ns->root = m;
ns->mounts = 1;
list_add(&m->mnt_list, &ns->list);
init_task.nsproxy->mnt_ns = ns;
get_mnt_ns(ns);
root.mnt = mnt;
root.dentry = mnt->mnt_root;
mnt->mnt_flags |= MNT_LOCKED;
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
}
void __init mnt_init(void)
{
int err;
mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
mount_hashtable = alloc_large_system_hash("Mount-cache",
sizeof(struct hlist_head),
mhash_entries, 19,
HASH_ZERO,
&m_hash_shift, &m_hash_mask, 0, 0);
mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
sizeof(struct hlist_head),
mphash_entries, 19,
HASH_ZERO,
&mp_hash_shift, &mp_hash_mask, 0, 0);
if (!mount_hashtable || !mountpoint_hashtable)
panic("Failed to allocate mount hash table\n");
kernfs_init();
err = sysfs_init();
if (err)
printk(KERN_WARNING "%s: sysfs_init error: %d\n",
__func__, err);
fs_kobj = kobject_create_and_add("fs", NULL);
if (!fs_kobj)
printk(KERN_WARNING "%s: kobj create error\n", __func__);
shmem_init();
init_rootfs();
init_mount_tree();
}
void put_mnt_ns(struct mnt_namespace *ns)
{
if (!refcount_dec_and_test(&ns->ns.count))
return;
drop_collected_mounts(&ns->root->mnt);
free_mnt_ns(ns);
}
struct vfsmount *kern_mount(struct file_system_type *type)
{
struct vfsmount *mnt;
mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
if (!IS_ERR(mnt)) {
/*
* it is a longterm mount, don't release mnt until
* we unmount before file sys is unregistered
*/
real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
}
return mnt;
}
EXPORT_SYMBOL_GPL(kern_mount);
void kern_unmount(struct vfsmount *mnt)
{
/* release long term mount so mount point can be released */
if (!IS_ERR_OR_NULL(mnt)) {
real_mount(mnt)->mnt_ns = NULL;
synchronize_rcu(); /* yecchhh... */
mntput(mnt);
}
}
EXPORT_SYMBOL(kern_unmount);
void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
{
unsigned int i;
for (i = 0; i < num; i++)
if (mnt[i])
real_mount(mnt[i])->mnt_ns = NULL;
synchronize_rcu_expedited();
for (i = 0; i < num; i++)
mntput(mnt[i]);
}
EXPORT_SYMBOL(kern_unmount_array);
bool our_mnt(struct vfsmount *mnt)
{
return check_mnt(real_mount(mnt));
}
bool current_chrooted(void)
{
/* Does the current process have a non-standard root */
struct path ns_root;
struct path fs_root;
bool chrooted;
/* Find the namespace root */
ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt;
ns_root.dentry = ns_root.mnt->mnt_root;
path_get(&ns_root);
while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
;
get_fs_root(current->fs, &fs_root);
chrooted = !path_equal(&fs_root, &ns_root);
path_put(&fs_root);
path_put(&ns_root);
return chrooted;
}
static bool mnt_already_visible(struct mnt_namespace *ns,
const struct super_block *sb,
int *new_mnt_flags)
{
int new_flags = *new_mnt_flags;
struct mount *mnt;
bool visible = false;
down_read(&namespace_sem);
lock_ns_list(ns);
list_for_each_entry(mnt, &ns->list, mnt_list) {
struct mount *child;
int mnt_flags;
if (mnt_is_cursor(mnt))
continue;
if (mnt->mnt.mnt_sb->s_type != sb->s_type)
continue;
/* This mount is not fully visible if it's root directory
* is not the root directory of the filesystem.
*/
if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
continue;
/* A local view of the mount flags */
mnt_flags = mnt->mnt.mnt_flags;
/* Don't miss readonly hidden in the superblock flags */
if (sb_rdonly(mnt->mnt.mnt_sb))
mnt_flags |= MNT_LOCK_READONLY;
/* Verify the mount flags are equal to or more permissive
* than the proposed new mount.
*/
if ((mnt_flags & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY))
continue;
if ((mnt_flags & MNT_LOCK_ATIME) && ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
continue;
/* This mount is not fully visible if there are any
* locked child mounts that cover anything except for
* empty directories.
*/
list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode;
/* Only worry about locked mounts */
if (!(child->mnt.mnt_flags & MNT_LOCKED))
continue;
/* Is the directory permanetly empty? */
if (!is_empty_dir_inode(inode))
goto next;
}
/* Preserve the locked attributes */
*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
MNT_LOCK_ATIME);
visible = true;
goto found;
next: ;
}
found:
unlock_ns_list(ns);
up_read(&namespace_sem);
return visible;
}
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
{
const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
unsigned long s_iflags;
if (ns->user_ns == &init_user_ns)
return false;
/* Can this filesystem be too revealing? */
s_iflags = sb->s_iflags;
if (!(s_iflags & SB_I_USERNS_VISIBLE))
return false;
if ((s_iflags & required_iflags) != required_iflags) { WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
required_iflags);
return true;
}
return !mnt_already_visible(ns, sb, new_mnt_flags);
}
bool mnt_may_suid(struct vfsmount *mnt)
{
/*
* Foreign mounts (accessed via fchdir or through /proc
* symlinks) are always treated as if they are nosuid. This
* prevents namespaces from trusting potentially unsafe
* suid/sgid bits, file caps, or security labels that originate
* in other namespaces.
*/
return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
current_in_userns(mnt->mnt_sb->s_user_ns);
}
static struct ns_common *mntns_get(struct task_struct *task)
{
struct ns_common *ns = NULL;
struct nsproxy *nsproxy;
task_lock(task);
nsproxy = task->nsproxy;
if (nsproxy) {
ns = &nsproxy->mnt_ns->ns;
get_mnt_ns(to_mnt_ns(ns));
}
task_unlock(task);
return ns;
}
static void mntns_put(struct ns_common *ns)
{
put_mnt_ns(to_mnt_ns(ns));
}
static int mntns_install(struct nsset *nsset, struct ns_common *ns)
{
struct nsproxy *nsproxy = nsset->nsproxy;
struct fs_struct *fs = nsset->fs;
struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
struct user_namespace *user_ns = nsset->cred->user_ns;
struct path root;
int err;
if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(user_ns, CAP_SYS_CHROOT) ||
!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
if (is_anon_ns(mnt_ns))
return -EINVAL;
if (fs->users != 1)
return -EINVAL;
get_mnt_ns(mnt_ns);
old_mnt_ns = nsproxy->mnt_ns;
nsproxy->mnt_ns = mnt_ns;
/* Find the root */
err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
"/", LOOKUP_DOWN, &root);
if (err) {
/* revert to old namespace */
nsproxy->mnt_ns = old_mnt_ns;
put_mnt_ns(mnt_ns);
return err;
}
put_mnt_ns(old_mnt_ns);
/* Update the pwd and root */
set_fs_pwd(fs, &root);
set_fs_root(fs, &root);
path_put(&root);
return 0;
}
static struct user_namespace *mntns_owner(struct ns_common *ns)
{
return to_mnt_ns(ns)->user_ns;
}
const struct proc_ns_operations mntns_operations = {
.name = "mnt",
.type = CLONE_NEWNS,
.get = mntns_get,
.put = mntns_put,
.install = mntns_install,
.owner = mntns_owner,
};
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
*
* Provides a framework for enqueueing and running callbacks from hardirq
* context. The enqueueing is NMI-safe.
*/
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
#include <linux/sched.h>
#include <linux/tick.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <linux/kasan.h>
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
/*
* Claim the entry so that no one else will poke at it.
*/
static bool irq_work_claim(struct irq_work *work)
{
int oflags;
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);
/*
* If the work is already pending, no need to raise the IPI.
* The pairing smp_mb() in irq_work_single() makes sure
* everything we did before is visible.
*/
if (oflags & IRQ_WORK_PENDING)
return false;
return true;
}
void __weak arch_irq_work_raise(void)
{
/*
* Lame architectures will get the timer tick callback
*/
}
/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
/* If the work is "lazy", handle it from next tick if any */
if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise();
} else {
if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list)))
arch_irq_work_raise();
}
}
/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
{
/* Only queue if not already pending */
if (!irq_work_claim(work))
return false;
/* Queue the entry and raise the IPI if needed. */
preempt_disable();
__irq_work_queue_local(work);
preempt_enable();
return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue);
/*
* Enqueue the irq_work @work on @cpu unless it's already pending
* somewhere.
*
* Can be re-enqueued while the callback is still in progress.
*/
bool irq_work_queue_on(struct irq_work *work, int cpu)
{
#ifndef CONFIG_SMP
return irq_work_queue(work);
#else /* CONFIG_SMP: */
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(cpu));
/* Only queue if not already pending */
if (!irq_work_claim(work))
return false;
kasan_record_aux_stack(work);
preempt_disable();
if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
__smp_call_single_queue(cpu, &work->node.llist);
} else {
__irq_work_queue_local(work);
}
preempt_enable();
return true;
#endif /* CONFIG_SMP */
}
bool irq_work_needs_cpu(void)
{
struct llist_head *raised, *lazy;
raised = this_cpu_ptr(&raised_list);
lazy = this_cpu_ptr(&lazy_list);
if (llist_empty(raised) || arch_irq_work_has_interrupt())
if (llist_empty(lazy))
return false;
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
return true;
}
void irq_work_single(void *arg)
{
struct irq_work *work = arg;
int flags;
/*
* Clear the PENDING bit, after this point the @work can be re-used.
* The PENDING bit acts as a lock, and we own it, so we can clear it
* without atomic ops.
*/
flags = atomic_read(&work->node.a_flags);
flags &= ~IRQ_WORK_PENDING;
atomic_set(&work->node.a_flags, flags);
/*
* See irq_work_claim().
*/
smp_mb();
lockdep_irq_work_enter(flags);
work->func(work);
lockdep_irq_work_exit(flags);
/*
* Clear the BUSY bit, if set, and return to the free state if no-one
* else claimed it meanwhile.
*/
(void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
}
static void irq_work_run_list(struct llist_head *list)
{
struct irq_work *work, *tmp;
struct llist_node *llnode;
BUG_ON(!irqs_disabled());
if (llist_empty(list))
return;
llnode = llist_del_all(list);
llist_for_each_entry_safe(work, tmp, llnode, node.llist)
irq_work_single(work);
}
/*
* hotplug calls this through:
* hotplug_cfd() -> flush_smp_call_function_queue()
*/
void irq_work_run(void)
{
irq_work_run_list(this_cpu_ptr(&raised_list));
irq_work_run_list(this_cpu_ptr(&lazy_list));
}
EXPORT_SYMBOL_GPL(irq_work_run);
void irq_work_tick(void)
{
struct llist_head *raised = this_cpu_ptr(&raised_list);
if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
irq_work_run_list(raised);
irq_work_run_list(this_cpu_ptr(&lazy_list));
}
/*
* Synchronize against the irq_work @entry, ensures the entry is not
* currently in use.
*/
void irq_work_sync(struct irq_work *work)
{
lockdep_assert_irqs_enabled();
while (irq_work_is_busy(work))
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/fs/open.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/module.h>
#include <linux/tty.h>
#include <linux/namei.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/rcupdate.h>
#include <linux/audit.h>
#include <linux/falloc.h>
#include <linux/fs_struct.h>
#include <linux/ima.h>
#include <linux/dnotify.h>
#include <linux/compat.h>
#include "internal.h"
int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry,
loff_t length, unsigned int time_attrs, struct file *filp)
{
int ret;
struct iattr newattrs;
/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
if (length < 0)
return -EINVAL;
newattrs.ia_size = length;
newattrs.ia_valid = ATTR_SIZE | time_attrs;
if (filp) {
newattrs.ia_file = filp;
newattrs.ia_valid |= ATTR_FILE;
}
/* Remove suid, sgid, and file capabilities on truncate too */
ret = dentry_needs_remove_privs(dentry);
if (ret < 0)
return ret;
if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; inode_lock(dentry->d_inode);
/* Note any delegations or leases have already been broken: */
ret = notify_change(mnt_userns, dentry, &newattrs, NULL);
inode_unlock(dentry->d_inode);
return ret;
}
long vfs_truncate(const struct path *path, loff_t length)
{
struct user_namespace *mnt_userns;
struct inode *inode;
long error;
inode = path->dentry->d_inode;
/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
if (S_ISDIR(inode->i_mode))
return -EISDIR;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
error = mnt_want_write(path->mnt); if (error)
goto out;
mnt_userns = mnt_user_ns(path->mnt);
error = inode_permission(mnt_userns, inode, MAY_WRITE);
if (error)
goto mnt_drop_write_and_out;
error = -EPERM;
if (IS_APPEND(inode))
goto mnt_drop_write_and_out;
error = get_write_access(inode);
if (error)
goto mnt_drop_write_and_out;
/*
* Make sure that there are no leases. get_write_access() protects
* against the truncate racing with a lease-granting setlease().
*/
error = break_lease(inode, O_WRONLY);
if (error)
goto put_write_and_out;
error = security_path_truncate(path);
if (!error)
error = do_truncate(mnt_userns, path->dentry, length, 0, NULL);
put_write_and_out:
put_write_access(inode);
mnt_drop_write_and_out:
mnt_drop_write(path->mnt);
out:
return error;
}
EXPORT_SYMBOL_GPL(vfs_truncate);
long do_sys_truncate(const char __user *pathname, loff_t length)
{
unsigned int lookup_flags = LOOKUP_FOLLOW;
struct path path;
int error;
if (length < 0) /* sorry, but loff_t says... */
return -EINVAL;
retry:
error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (!error) {
error = vfs_truncate(&path, length);
path_put(&path);
}
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
return error;
}
SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
{
return do_sys_truncate(path, length);
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
{
return do_sys_truncate(path, length);
}
#endif
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
struct inode *inode;
struct dentry *dentry;
struct fd f;
int error;
error = -EINVAL;
if (length < 0)
goto out;
error = -EBADF;
f = fdget(fd);
if (!f.file)
goto out;
/* explicitly opened as large or we are on 64-bit box */
if (f.file->f_flags & O_LARGEFILE)
small = 0;
dentry = f.file->f_path.dentry;
inode = dentry->d_inode;
error = -EINVAL;
if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
goto out_putf;
error = -EINVAL;
/* Cannot ftruncate over 2^31 bytes without large file support */
if (small && length > MAX_NON_LFS)
goto out_putf;
error = -EPERM;
/* Check IS_APPEND on real upper inode */
if (IS_APPEND(file_inode(f.file)))
goto out_putf;
sb_start_write(inode->i_sb);
error = security_path_truncate(&f.file->f_path);
if (!error)
error = do_truncate(file_mnt_user_ns(f.file), dentry, length,
ATTR_MTIME | ATTR_CTIME, f.file);
sb_end_write(inode->i_sb);
out_putf:
fdput(f);
out:
return error;
}
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
{
return do_sys_ftruncate(fd, length, 1);
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
{
return do_sys_ftruncate(fd, length, 1);
}
#endif
/* LFS versions of truncate are only needed on 32 bit machines */
#if BITS_PER_LONG == 32
SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
{
return do_sys_truncate(path, length);
}
SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
{
return do_sys_ftruncate(fd, length, 0);
}
#endif /* BITS_PER_LONG == 32 */
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
long ret;
if (offset < 0 || len <= 0)
return -EINVAL;
/* Return error if mode is not supported */
if (mode & ~FALLOC_FL_SUPPORTED_MASK)
return -EOPNOTSUPP;
/* Punch hole and zero range are mutually exclusive */
if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
/* Punch hole must have keep size set */
if ((mode & FALLOC_FL_PUNCH_HOLE) &&
!(mode & FALLOC_FL_KEEP_SIZE))
return -EOPNOTSUPP;
/* Collapse range should only be used exclusively. */
if ((mode & FALLOC_FL_COLLAPSE_RANGE) && (mode & ~FALLOC_FL_COLLAPSE_RANGE))
return -EINVAL;
/* Insert range should only be used exclusively. */
if ((mode & FALLOC_FL_INSERT_RANGE) && (mode & ~FALLOC_FL_INSERT_RANGE))
return -EINVAL;
/* Unshare range should only be used with allocate mode. */
if ((mode & FALLOC_FL_UNSHARE_RANGE) && (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
return -EINVAL;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
/*
* We can only allow pure fallocate on append only files
*/
if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
return -EPERM;
if (IS_IMMUTABLE(inode))
return -EPERM;
/*
* We cannot allow any fallocate operation on an active swapfile
*/
if (IS_SWAPFILE(inode))
return -ETXTBSY;
/*
* Revalidate the write permissions, in case security policy has
* changed since the files were opened.
*/
ret = security_file_permission(file, MAY_WRITE);
if (ret)
return ret;
if (S_ISFIFO(inode->i_mode))
return -ESPIPE;
if (S_ISDIR(inode->i_mode))
return -EISDIR;
if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
return -ENODEV;
/* Check for wrap through zero too */
if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
return -EFBIG;
if (!file->f_op->fallocate)
return -EOPNOTSUPP;
file_start_write(file);
ret = file->f_op->fallocate(file, mode, offset, len);
/*
* Create inotify and fanotify events.
*
* To keep the logic simple always create events if fallocate succeeds.
* This implies that events are even created if the file size remains
* unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
*/
if (ret == 0)
fsnotify_modify(file);
file_end_write(file);
return ret;
}
EXPORT_SYMBOL_GPL(vfs_fallocate);
int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
{
struct fd f = fdget(fd);
int error = -EBADF;
if (f.file) {
error = vfs_fallocate(f.file, mode, offset, len);
fdput(f);
}
return error;
}
SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
{
return ksys_fallocate(fd, mode, offset, len);
}
/*
* access() needs to use the real uid/gid, not the effective uid/gid.
* We do this by temporarily clearing all FS-related capabilities and
* switching the fsuid/fsgid around to the real ones.
*/
static const struct cred *access_override_creds(void)
{
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred)
return NULL;
override_cred->fsuid = override_cred->uid;
override_cred->fsgid = override_cred->gid;
if (!issecure(SECURE_NO_SETUID_FIXUP)) {
/* Clear the capabilities if we switch to a non-root user */
kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
if (!uid_eq(override_cred->uid, root_uid))
cap_clear(override_cred->cap_effective);
else
override_cred->cap_effective =
override_cred->cap_permitted;
}
/*
* The new set of credentials can *only* be used in
* task-synchronous circumstances, and does not need
* RCU freeing, unless somebody then takes a separate
* reference to it.
*
* NOTE! This is _only_ true because this credential
* is used purely for override_creds() that installs
* it as the subjective cred. Other threads will be
* accessing ->real_cred, not the subjective cred.
*
* If somebody _does_ make a copy of this (using the
* 'get_current_cred()' function), that will clear the
* non_rcu field, because now that other user may be
* expecting RCU freeing. But normal thread-synchronous
* cred accesses will keep things non-RCY.
*/
override_cred->non_rcu = 1;
old_cred = override_creds(override_cred);
/* override_cred() gets its own ref */
put_cred(override_cred);
return old_cred;
}
static long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
{
struct path path;
struct inode *inode;
int res;
unsigned int lookup_flags = LOOKUP_FOLLOW;
const struct cred *old_cred = NULL;
if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
return -EINVAL;
if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
return -EINVAL;
if (flags & AT_SYMLINK_NOFOLLOW)
lookup_flags &= ~LOOKUP_FOLLOW;
if (flags & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
if (!(flags & AT_EACCESS)) {
old_cred = access_override_creds();
if (!old_cred)
return -ENOMEM;
}
retry:
res = user_path_at(dfd, filename, lookup_flags, &path);
if (res)
goto out;
inode = d_backing_inode(path.dentry);
if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
/*
* MAY_EXEC on regular files is denied if the fs is mounted
* with the "noexec" flag.
*/
res = -EACCES;
if (path_noexec(&path))
goto out_path_release;
}
res = inode_permission(mnt_user_ns(path.mnt), inode, mode | MAY_ACCESS);
/* SuS v2 requires we report a read only fs too */
if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
goto out_path_release;
/*
* This is a rare case where using __mnt_is_readonly()
* is OK without a mnt_want/drop_write() pair. Since
* no actual write to the fs is performed here, we do
* not need to telegraph to that to anyone.
*
* By doing this, we accept that this access is
* inherently racy and know that the fs may change
* state before we even see this result.
*/
if (__mnt_is_readonly(path.mnt))
res = -EROFS;
out_path_release:
path_put(&path);
if (retry_estale(res, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
out:
if (old_cred)
revert_creds(old_cred);
return res;
}
SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
{
return do_faccessat(dfd, filename, mode, 0);
}
SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode,
int, flags)
{
return do_faccessat(dfd, filename, mode, flags);
}
SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
{
return do_faccessat(AT_FDCWD, filename, mode, 0);
}
SYSCALL_DEFINE1(chdir, const char __user *, filename)
{
struct path path;
int error;
unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
retry:
error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
if (error)
goto out;
error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
set_fs_pwd(current->fs, &path);
dput_and_out:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
out:
return error;
}
SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
struct fd f = fdget_raw(fd);
int error;
error = -EBADF;
if (!f.file)
goto out;
error = -ENOTDIR;
if (!d_can_lookup(f.file->f_path.dentry))
goto out_putf;
error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, &f.file->f_path);
out_putf:
fdput(f);
out:
return error;
}
SYSCALL_DEFINE1(chroot, const char __user *, filename)
{
struct path path;
int error;
unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
retry:
error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
if (error)
goto out;
error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
if (error)
goto dput_and_out;
error = -EPERM;
if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
goto dput_and_out;
error = security_path_chroot(&path);
if (error)
goto dput_and_out;
set_fs_root(current->fs, &path);
error = 0;
dput_and_out:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
out:
return error;
}
int chmod_common(const struct path *path, umode_t mode)
{
struct inode *inode = path->dentry->d_inode;
struct inode *delegated_inode = NULL;
struct iattr newattrs;
int error;
error = mnt_want_write(path->mnt);
if (error)
return error;
retry_deleg:
inode_lock(inode);
error = security_path_chmod(path, mode);
if (error)
goto out_unlock;
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
error = notify_change(mnt_user_ns(path->mnt), path->dentry,
&newattrs, &delegated_inode);
out_unlock:
inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
}
mnt_drop_write(path->mnt); return error;
}
int vfs_fchmod(struct file *file, umode_t mode)
{
audit_file(file);
return chmod_common(&file->f_path, mode);
}
SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
{
struct fd f = fdget(fd);
int err = -EBADF;
if (f.file) {
err = vfs_fchmod(f.file, mode);
fdput(f);
}
return err;
}
static int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
{
struct path path;
int error;
unsigned int lookup_flags = LOOKUP_FOLLOW;
retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (!error) {
error = chmod_common(&path, mode);
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
}
return error;
}
SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
umode_t, mode)
{
return do_fchmodat(dfd, filename, mode);
}
SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
{
return do_fchmodat(AT_FDCWD, filename, mode);
}
int chown_common(const struct path *path, uid_t user, gid_t group)
{
struct user_namespace *mnt_userns;
struct inode *inode = path->dentry->d_inode;
struct inode *delegated_inode = NULL;
int error;
struct iattr newattrs;
kuid_t uid;
kgid_t gid;
uid = make_kuid(current_user_ns(), user);
gid = make_kgid(current_user_ns(), group);
mnt_userns = mnt_user_ns(path->mnt);
uid = kuid_from_mnt(mnt_userns, uid);
gid = kgid_from_mnt(mnt_userns, gid);
retry_deleg:
newattrs.ia_valid = ATTR_CTIME;
if (user != (uid_t) -1) {
if (!uid_valid(uid))
return -EINVAL;
newattrs.ia_valid |= ATTR_UID;
newattrs.ia_uid = uid;
}
if (group != (gid_t) -1) {
if (!gid_valid(gid))
return -EINVAL;
newattrs.ia_valid |= ATTR_GID;
newattrs.ia_gid = gid;
}
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |=
ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
inode_lock(inode);
error = security_path_chown(path, uid, gid);
if (!error)
error = notify_change(mnt_userns, path->dentry, &newattrs,
&delegated_inode);
inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
}
return error;
}
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
int flag)
{
struct path path;
int error = -EINVAL;
int lookup_flags;
if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
goto out;
lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
if (flag & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
goto out;
error = mnt_want_write(path.mnt);
if (error)
goto out_release;
error = chown_common(&path, user, group);
mnt_drop_write(path.mnt);
out_release:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
out:
return error;
}
SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
gid_t, group, int, flag)
{
return do_fchownat(dfd, filename, user, group, flag);
}
SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
{
return do_fchownat(AT_FDCWD, filename, user, group, 0);
}
SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
{
return do_fchownat(AT_FDCWD, filename, user, group,
AT_SYMLINK_NOFOLLOW);
}
int vfs_fchown(struct file *file, uid_t user, gid_t group)
{
int error;
error = mnt_want_write_file(file);
if (error)
return error;
audit_file(file);
error = chown_common(&file->f_path, user, group);
mnt_drop_write_file(file);
return error;
}
int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
{
struct fd f = fdget(fd);
int error = -EBADF;
if (f.file) {
error = vfs_fchown(f.file, user, group);
fdput(f);
}
return error;
}
SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
{
return ksys_fchown(fd, user, group);
}
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *))
{
static const struct file_operations empty_fops = {};
int error;
path_get(&f->f_path);
f->f_inode = inode;
f->f_mapping = inode->i_mapping;
f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
f->f_sb_err = file_sample_sb_err(f);
if (unlikely(f->f_flags & O_PATH)) {
f->f_mode = FMODE_PATH | FMODE_OPENED;
f->f_op = &empty_fops;
return 0;
}
if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = get_write_access(inode);
if (unlikely(error))
goto cleanup_file;
error = __mnt_want_write(f->f_path.mnt);
if (unlikely(error)) {
put_write_access(inode);
goto cleanup_file;
}
f->f_mode |= FMODE_WRITER;
}
/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); if (WARN_ON(!f->f_op)) {
error = -ENODEV;
goto cleanup_all;
}
error = security_file_open(f);
if (error)
goto cleanup_all;
error = break_lease(locks_inode(f), f->f_flags);
if (error)
goto cleanup_all;
/* normally all 3 are set; ->open() can clear them if needed */
f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
if (!open)
open = f->f_op->open;
if (open) {
error = open(inode, f); if (error)
goto cleanup_all;
}
f->f_mode |= FMODE_OPENED;
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(inode);
if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; f->f_write_hint = WRITE_LIFE_NOT_SET;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
/* NB: we're sure to have correct a_ops only after f_op->open */
if (f->f_flags & O_DIRECT) {
if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
return -EINVAL;
}
/*
* XXX: Huge page cache doesn't support writing yet. Drop all page
* cache for this file before processing writes.
*/
if (f->f_mode & FMODE_WRITE) {
/*
* Paired with smp_mb() in collapse_file() to ensure nr_thps
* is up to date and the update to i_writecount by
* get_write_access() is visible. Ensures subsequent insertion
* of THPs into the page cache will fail.
*/
smp_mb();
if (filemap_nr_thps(inode->i_mapping)) {
struct address_space *mapping = inode->i_mapping;
filemap_invalidate_lock(inode->i_mapping);
/*
* unmap_mapping_range just need to be called once
* here, because the private pages is not need to be
* unmapped mapping (e.g. data segment of dynamic
* shared libraries here).
*/
unmap_mapping_range(mapping, 0, 0, 0);
truncate_inode_pages(mapping, 0);
filemap_invalidate_unlock(inode->i_mapping);
}
}
return 0;
cleanup_all:
if (WARN_ON_ONCE(error > 0))
error = -EINVAL;
fops_put(f->f_op); if (f->f_mode & FMODE_WRITER) {
put_write_access(inode);
__mnt_drop_write(f->f_path.mnt);
}
cleanup_file:
path_put(&f->f_path);
f->f_path.mnt = NULL;
f->f_path.dentry = NULL;
f->f_inode = NULL;
return error;
}
/**
* finish_open - finish opening a file
* @file: file pointer
* @dentry: pointer to dentry
* @open: open callback
* @opened: state of open
*
* This can be used to finish opening a file passed to i_op->atomic_open().
*
* If the open callback is set to NULL, then the standard f_op->open()
* filesystem callback is substituted.
*
* NB: the dentry reference is _not_ consumed. If, for example, the dentry is
* the return value of d_splice_alias(), then the caller needs to perform dput()
* on it after finish_open().
*
* Returns zero on success or -errno if the open failed.
*/
int finish_open(struct file *file, struct dentry *dentry,
int (*open)(struct inode *, struct file *))
{
BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
file->f_path.dentry = dentry;
return do_dentry_open(file, d_backing_inode(dentry), open);
}
EXPORT_SYMBOL(finish_open);
/**
* finish_no_open - finish ->atomic_open() without opening the file
*
* @file: file pointer
* @dentry: dentry or NULL (as returned from ->lookup())
*
* This can be used to set the result of a successful lookup in ->atomic_open().
*
* NB: unlike finish_open() this function does consume the dentry reference and
* the caller need not dput() it.
*
* Returns "0" which must be the return value of ->atomic_open() after having
* called this function.
*/
int finish_no_open(struct file *file, struct dentry *dentry)
{
file->f_path.dentry = dentry;
return 0;
}
EXPORT_SYMBOL(finish_no_open);
char *file_path(struct file *filp, char *buf, int buflen)
{
return d_path(&filp->f_path, buf, buflen);
}
EXPORT_SYMBOL(file_path);
/**
* vfs_open - open the file at the given path
* @path: path to open
* @file: newly allocated file with f_flag initialized
* @cred: credentials to use
*/
int vfs_open(const struct path *path, struct file *file)
{
file->f_path = *path;
return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
}
struct file *dentry_open(const struct path *path, int flags,
const struct cred *cred)
{
int error;
struct file *f;
validate_creds(cred);
/* We must always pass in a valid mount pointer. */
BUG_ON(!path->mnt);
f = alloc_empty_file(flags, cred);
if (!IS_ERR(f)) {
error = vfs_open(path, f);
if (error) {
fput(f);
f = ERR_PTR(error);
}
}
return f;
}
EXPORT_SYMBOL(dentry_open);
struct file *open_with_fake_path(const struct path *path, int flags,
struct inode *inode, const struct cred *cred)
{
struct file *f = alloc_empty_file_noaccount(flags, cred);
if (!IS_ERR(f)) {
int error;
f->f_path = *path;
error = do_dentry_open(f, inode, NULL);
if (error) {
fput(f);
f = ERR_PTR(error);
}
}
return f;
}
EXPORT_SYMBOL(open_with_fake_path);
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
inline struct open_how build_open_how(int flags, umode_t mode)
{
struct open_how how = {
.flags = flags & VALID_OPEN_FLAGS,
.mode = mode & S_IALLUGO,
};
/* O_PATH beats everything else. */
if (how.flags & O_PATH) how.flags &= O_PATH_FLAGS;
/* Modes should only be set for create-like flags. */
if (!WILL_CREATE(how.flags))
how.mode = 0;
return how;
}
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
u64 flags = how->flags;
u64 strip = FMODE_NONOTIFY | O_CLOEXEC;
int lookup_flags = 0;
int acc_mode = ACC_MODE(flags);
BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
"struct open_flags doesn't yet handle flags > 32 bits");
/*
* Strip flags that either shouldn't be set by userspace like
* FMODE_NONOTIFY or that aren't relevant in determining struct
* open_flags like O_CLOEXEC.
*/
flags &= ~strip;
/*
* Older syscalls implicitly clear all of the invalid flags or argument
* values before calling build_open_flags(), but openat2(2) checks all
* of its arguments.
*/
if (flags & ~VALID_OPEN_FLAGS)
return -EINVAL;
if (how->resolve & ~VALID_RESOLVE_FLAGS)
return -EINVAL;
/* Scoping flags are mutually exclusive. */
if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
return -EINVAL;
/* Deal with the mode. */
if (WILL_CREATE(flags)) { if (how->mode & ~S_IALLUGO)
return -EINVAL;
op->mode = how->mode | S_IFREG;
} else {
if (how->mode != 0)
return -EINVAL;
op->mode = 0;
}
/*
* In order to ensure programs get explicit errors when trying to use
* O_TMPFILE on old kernels, O_TMPFILE is implemented such that it
* looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we
* have to require userspace to explicitly set it.
*/
if (flags & __O_TMPFILE) { if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
return -EINVAL;
if (!(acc_mode & MAY_WRITE))
return -EINVAL;
}
if (flags & O_PATH) {
/* O_PATH only permits certain other flags to be set. */
if (flags & ~O_PATH_FLAGS)
return -EINVAL;
acc_mode = 0;
}
/*
* O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
* check for O_DSYNC if the need any syncing at all we enforce it's
* always set instead of having to deal with possibly weird behaviour
* for malicious applications setting only __O_SYNC.
*/
if (flags & __O_SYNC) flags |= O_DSYNC;
op->open_flag = flags;
/* O_TRUNC implies we need access checks for write permissions */
if (flags & O_TRUNC)
acc_mode |= MAY_WRITE;
/* Allow the LSM permission hook to distinguish append
access from general write access. */
if (flags & O_APPEND) acc_mode |= MAY_APPEND; op->acc_mode = acc_mode; op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL;
flags |= O_NOFOLLOW;
}
}
if (flags & O_DIRECTORY)
lookup_flags |= LOOKUP_DIRECTORY;
if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (how->resolve & RESOLVE_NO_XDEV) lookup_flags |= LOOKUP_NO_XDEV; if (how->resolve & RESOLVE_NO_MAGICLINKS) lookup_flags |= LOOKUP_NO_MAGICLINKS; if (how->resolve & RESOLVE_NO_SYMLINKS) lookup_flags |= LOOKUP_NO_SYMLINKS; if (how->resolve & RESOLVE_BENEATH) lookup_flags |= LOOKUP_BENEATH; if (how->resolve & RESOLVE_IN_ROOT) lookup_flags |= LOOKUP_IN_ROOT; if (how->resolve & RESOLVE_CACHED) {
/* Don't bother even trying for create/truncate/tmpfile open */
if (flags & (O_TRUNC | O_CREAT | O_TMPFILE))
return -EAGAIN;
lookup_flags |= LOOKUP_CACHED;
}
op->lookup_flags = lookup_flags;
return 0;
}
/**
* file_open_name - open file and return file pointer
*
* @name: struct filename containing path to open
* @flags: open flags as per the open(2) second argument
* @mode: mode for the new file if O_CREAT is set, else ignored
*
* This is the helper to open a file from kernelspace if you really
* have to. But in generally you should not do this, so please move
* along, nothing to see here..
*/
struct file *file_open_name(struct filename *name, int flags, umode_t mode)
{
struct open_flags op;
struct open_how how = build_open_how(flags, mode);
int err = build_open_flags(&how, &op);
if (err)
return ERR_PTR(err);
return do_filp_open(AT_FDCWD, name, &op);
}
/**
* filp_open - open file and return file pointer
*
* @filename: path to open
* @flags: open flags as per the open(2) second argument
* @mode: mode for the new file if O_CREAT is set, else ignored
*
* This is the helper to open a file from kernelspace if you really
* have to. But in generally you should not do this, so please move
* along, nothing to see here..
*/
struct file *filp_open(const char *filename, int flags, umode_t mode)
{
struct filename *name = getname_kernel(filename);
struct file *file = ERR_CAST(name);
if (!IS_ERR(name)) {
file = file_open_name(name, flags, mode);
putname(name);
}
return file;
}
EXPORT_SYMBOL(filp_open);
struct file *file_open_root(const struct path *root,
const char *filename, int flags, umode_t mode)
{
struct open_flags op;
struct open_how how = build_open_how(flags, mode);
int err = build_open_flags(&how, &op);
if (err)
return ERR_PTR(err);
return do_file_open_root(root, filename, &op);
}
EXPORT_SYMBOL(file_open_root);
static long do_sys_openat2(int dfd, const char __user *filename,
struct open_how *how)
{
struct open_flags op;
int fd = build_open_flags(how, &op);
struct filename *tmp;
if (fd)
return fd;
tmp = getname(filename);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
fd = get_unused_fd_flags(how->flags);
if (fd >= 0) {
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
put_unused_fd(fd);
fd = PTR_ERR(f);
} else {
fsnotify_open(f);
fd_install(fd, f);
}
}
putname(tmp); return fd;
}
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_how how = build_open_how(flags, mode);
return do_sys_openat2(dfd, filename, &how);
}SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
if (force_o_largefile())
flags |= O_LARGEFILE;
return do_sys_open(AT_FDCWD, filename, flags, mode);
}
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
umode_t, mode)
{
if (force_o_largefile())
flags |= O_LARGEFILE;
return do_sys_open(dfd, filename, flags, mode);
}
SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
struct open_how __user *, how, size_t, usize)
{
int err;
struct open_how tmp;
BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0);
BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST);
if (unlikely(usize < OPEN_HOW_SIZE_VER0))
return -EINVAL;
err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize);
if (err)
return err;
/* O_LARGEFILE is only allowed for non-O_PATH. */
if (!(tmp.flags & O_PATH) && force_o_largefile())
tmp.flags |= O_LARGEFILE;
return do_sys_openat2(dfd, filename, &tmp);
}
#ifdef CONFIG_COMPAT
/*
* Exactly like sys_open(), except that it doesn't set the
* O_LARGEFILE flag.
*/
COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
return do_sys_open(AT_FDCWD, filename, flags, mode);
}
/*
* Exactly like sys_openat(), except that it doesn't set the
* O_LARGEFILE flag.
*/
COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
{
return do_sys_open(dfd, filename, flags, mode);
}
#endif
#ifndef __alpha__
/*
* For backward compatibility? Maybe this should be moved
* into arch/i386 instead?
*/
SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
{
int flags = O_CREAT | O_WRONLY | O_TRUNC;
if (force_o_largefile())
flags |= O_LARGEFILE;
return do_sys_open(AT_FDCWD, pathname, flags, mode);
}
#endif
/*
* "id" is the POSIX thread ID. We use the
* files pointer for this..
*/
int filp_close(struct file *filp, fl_owner_t id)
{
int retval = 0;
if (!file_count(filp)) {
printk(KERN_ERR "VFS: Close: file count is 0\n");
return 0;
}
if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); if (likely(!(filp->f_mode & FMODE_PATH))) { dnotify_flush(filp, id);
locks_remove_posix(filp, id);
}
fput(filp); return retval;
}
EXPORT_SYMBOL(filp_close);
/*
* Careful here! We test whether the file pointer is NULL before
* releasing the fd. This ensures that one clone task can't release
* an fd while another clone is opening it.
*/
SYSCALL_DEFINE1(close, unsigned int, fd)
{
int retval = close_fd(fd);
/* can't restart close syscall because file table entry was cleared */
if (unlikely(retval == -ERESTARTSYS ||
retval == -ERESTARTNOINTR ||
retval == -ERESTARTNOHAND ||
retval == -ERESTART_RESTARTBLOCK))
retval = -EINTR;
return retval;
}
/**
* close_range() - Close all file descriptors in a given range.
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
* @flags: reserved for future extensions
*
* This closes a range of file descriptors. All file descriptors
* from @fd up to and including @max_fd are closed.
* Currently, errors to close a given file descriptor are ignored.
*/
SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
unsigned int, flags)
{
return __close_range(fd, max_fd, flags);
}
/*
* This routine simulates a hangup on the tty, to arrange that users
* are given clean terminals at login time.
*/
SYSCALL_DEFINE0(vhangup)
{
if (capable(CAP_SYS_TTY_CONFIG)) {
tty_vhangup_self();
return 0;
}
return -EPERM;
}
/*
* Called when an inode is about to be open.
* We use this to disallow opening large files on 32bit systems if
* the caller didn't specify O_LARGEFILE. On 64bit systems we force
* on this flag in sys_open.
*/
int generic_file_open(struct inode * inode, struct file * filp)
{
if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
return -EOVERFLOW;
return 0;
}
EXPORT_SYMBOL(generic_file_open);
/*
* This is used by subsystems that don't want seekable
* file descriptors. The function is not supposed to ever fail, the only
* reason it returns an 'int' and not 'void' is so that it can be plugged
* directly into file_operations structure.
*/
int nonseekable_open(struct inode *inode, struct file *filp)
{
filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
return 0;
}
EXPORT_SYMBOL(nonseekable_open);
/*
* stream_open is used by subsystems that want stream-like file descriptors.
* Such file descriptors are not seekable and don't have notion of position
* (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
* Contrary to file descriptors of other regular files, .read() and .write()
* can run simultaneously.
*
* stream_open never fails and is marked to return int so that it could be
* directly used as file_operations.open .
*/
int stream_open(struct inode *inode, struct file *filp)
{
filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS);
filp->f_mode |= FMODE_STREAM;
return 0;
}
EXPORT_SYMBOL(stream_open);
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/secretmem.h>
#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include "internal.h"
struct follow_page_context {
struct dev_pagemap *pgmap;
unsigned int page_mask;
};
static void hpage_pincount_add(struct page *page, int refs)
{
VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
VM_BUG_ON_PAGE(page != compound_head(page), page);
atomic_add(refs, compound_pincount_ptr(page));
}
static void hpage_pincount_sub(struct page *page, int refs)
{
VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
VM_BUG_ON_PAGE(page != compound_head(page), page);
atomic_sub(refs, compound_pincount_ptr(page));
}
/* Equivalent to calling put_page() @refs times. */
static void put_page_refs(struct page *page, int refs)
{
#ifdef CONFIG_DEBUG_VM
if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page))
return;
#endif
/*
* Calling put_page() for each ref is unnecessarily slow. Only the last
* ref needs a put_page().
*/
if (refs > 1)
page_ref_sub(page, refs - 1);
put_page(page);
}
/*
* Return the compound head page with ref appropriately incremented,
* or NULL if that failed.
*/
static inline struct page *try_get_compound_head(struct page *page, int refs)
{
struct page *head = compound_head(page);
if (WARN_ON_ONCE(page_ref_count(head) < 0))
return NULL;
if (unlikely(!page_cache_add_speculative(head, refs)))
return NULL;
/*
* At this point we have a stable reference to the head page; but it
* could be that between the compound_head() lookup and the refcount
* increment, the compound page was split, in which case we'd end up
* holding a reference on a page that has nothing to do with the page
* we were given anymore.
* So now that the head page is stable, recheck that the pages still
* belong together.
*/
if (unlikely(compound_head(page) != head)) { put_page_refs(head, refs);
return NULL;
}
return head;
}
/**
* try_grab_compound_head() - attempt to elevate a page's refcount, by a
* flags-dependent amount.
*
* Even though the name includes "compound_head", this function is still
* appropriate for callers that have a non-compound @page to get.
*
* @page: pointer to page to be grabbed
* @refs: the value to (effectively) add to the page's refcount
* @flags: gup flags: these are the FOLL_* flag values.
*
* "grab" names in this file mean, "look at flags to decide whether to use
* FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
*
* Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
* same time. (That's true throughout the get_user_pages*() and
* pin_user_pages*() APIs.) Cases:
*
* FOLL_GET: page's refcount will be incremented by @refs.
*
* FOLL_PIN on compound pages that are > two pages long: page's refcount will
* be incremented by @refs, and page[2].hpage_pinned_refcount will be
* incremented by @refs * GUP_PIN_COUNTING_BIAS.
*
* FOLL_PIN on normal pages, or compound pages that are two pages long:
* page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS.
*
* Return: head page (with refcount appropriately incremented) for success, or
* NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
* considered failure, and furthermore, a likely bug in the caller, so a warning
* is also emitted.
*/
__maybe_unused struct page *try_grab_compound_head(struct page *page,
int refs, unsigned int flags)
{
if (flags & FOLL_GET)
return try_get_compound_head(page, refs);
else if (flags & FOLL_PIN) {
/*
* Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
* right zone, so fail and let the caller fall back to the slow
* path.
*/
if (unlikely((flags & FOLL_LONGTERM) &&
!is_pinnable_page(page)))
return NULL;
/*
* CAUTION: Don't use compound_head() on the page before this
* point, the result won't be stable.
*/
page = try_get_compound_head(page, refs);
if (!page)
return NULL;
/*
* When pinning a compound page of order > 1 (which is what
* hpage_pincount_available() checks for), use an exact count to
* track it, via hpage_pincount_add/_sub().
*
* However, be sure to *also* increment the normal page refcount
* field at least once, so that the page really is pinned.
* That's why the refcount from the earlier
* try_get_compound_head() is left intact.
*/
if (hpage_pincount_available(page))
hpage_pincount_add(page, refs);
else
page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1)); mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
refs);
return page;
}
WARN_ON_ONCE(1);
return NULL;
}
static void put_compound_head(struct page *page, int refs, unsigned int flags)
{
if (flags & FOLL_PIN) {
mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
refs);
if (hpage_pincount_available(page))
hpage_pincount_sub(page, refs);
else
refs *= GUP_PIN_COUNTING_BIAS;
}
put_page_refs(page, refs);
}
/**
* try_grab_page() - elevate a page's refcount by a flag-dependent amount
*
* This might not do anything at all, depending on the flags argument.
*
* "grab" names in this file mean, "look at flags to decide whether to use
* FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
*
* @page: pointer to page to be grabbed
* @flags: gup flags: these are the FOLL_* flag values.
*
* Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
* time. Cases: please see the try_grab_compound_head() documentation, with
* "refs=1".
*
* Return: true for success, or if no action was required (if neither FOLL_PIN
* nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
* FOLL_PIN was set, but the page could not be grabbed.
*/
bool __must_check try_grab_page(struct page *page, unsigned int flags)
{
WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); if (flags & FOLL_GET)
return try_get_page(page);
else if (flags & FOLL_PIN) {
int refs = 1;
page = compound_head(page);
if (WARN_ON_ONCE(page_ref_count(page) <= 0))
return false;
if (hpage_pincount_available(page))
hpage_pincount_add(page, 1);
else
refs = GUP_PIN_COUNTING_BIAS;
/*
* Similar to try_grab_compound_head(): even if using the
* hpage_pincount_add/_sub() routines, be sure to
* *also* increment the normal page refcount field at least
* once, so that the page really is pinned.
*/
page_ref_add(page, refs);
mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
}
return true;
}
/**
* unpin_user_page() - release a dma-pinned page
* @page: pointer to page to be released
*
* Pages that were pinned via pin_user_pages*() must be released via either
* unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
* that such pages can be separately tracked and uniquely handled. In
* particular, interactions with RDMA and filesystems need special handling.
*/
void unpin_user_page(struct page *page)
{
put_compound_head(compound_head(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);
static inline void compound_range_next(unsigned long i, unsigned long npages,
struct page **list, struct page **head,
unsigned int *ntails)
{
struct page *next, *page;
unsigned int nr = 1;
if (i >= npages)
return;
next = *list + i;
page = compound_head(next);
if (PageCompound(page) && compound_order(page) >= 1)
nr = min_t(unsigned int,
page + compound_nr(page) - next, npages - i);
*head = page;
*ntails = nr;
}
#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \
for (__i = 0, \
compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \
__i < __npages; __i += __ntails, \
compound_range_next(__i, __npages, __list, &(__head), &(__ntails)))
static inline void compound_next(unsigned long i, unsigned long npages,
struct page **list, struct page **head,
unsigned int *ntails)
{
struct page *page;
unsigned int nr;
if (i >= npages)
return;
page = compound_head(list[i]);
for (nr = i + 1; nr < npages; nr++) {
if (compound_head(list[nr]) != page)
break;
}
*head = page;
*ntails = nr - i;
}
#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \
for (__i = 0, \
compound_next(__i, __npages, __list, &(__head), &(__ntails)); \
__i < __npages; __i += __ntails, \
compound_next(__i, __npages, __list, &(__head), &(__ntails)))
/**
* unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* @pages: array of pages to be maybe marked dirty, and definitely released.
* @npages: number of pages in the @pages array.
* @make_dirty: whether to mark the pages dirty
*
* "gup-pinned page" refers to a page that has had one of the get_user_pages()
* variants called on that page.
*
* For each page in the @pages array, make that page (or its head page, if a
* compound page) dirty, if @make_dirty is true, and if the page was previously
* listed as clean. In any case, releases all pages using unpin_user_page(),
* possibly via unpin_user_pages(), for the non-dirty case.
*
* Please see the unpin_user_page() documentation for details.
*
* set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
* required, then the caller should a) verify that this is really correct,
* because _lock() is usually required, and b) hand code it:
* set_page_dirty_lock(), unpin_user_page().
*
*/
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty)
{
unsigned long index;
struct page *head;
unsigned int ntails;
if (!make_dirty) {
unpin_user_pages(pages, npages);
return;
}
for_each_compound_head(index, pages, npages, head, ntails) {
/*
* Checking PageDirty at this point may race with
* clear_page_dirty_for_io(), but that's OK. Two key
* cases:
*
* 1) This code sees the page as already dirty, so it
* skips the call to set_page_dirty(). That could happen
* because clear_page_dirty_for_io() called
* page_mkclean(), followed by set_page_dirty().
* However, now the page is going to get written back,
* which meets the original intention of setting it
* dirty, so all is well: clear_page_dirty_for_io() goes
* on to call TestClearPageDirty(), and write the page
* back.
*
* 2) This code sees the page as clean, so it calls
* set_page_dirty(). The page stays dirty, despite being
* written back, so it gets written back again in the
* next writeback cycle. This is harmless.
*/
if (!PageDirty(head))
set_page_dirty_lock(head);
put_compound_head(head, ntails, FOLL_PIN);
}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
/**
* unpin_user_page_range_dirty_lock() - release and optionally dirty
* gup-pinned page range
*
* @page: the starting page of a range maybe marked dirty, and definitely released.
* @npages: number of consecutive pages to release.
* @make_dirty: whether to mark the pages dirty
*
* "gup-pinned page range" refers to a range of pages that has had one of the
* pin_user_pages() variants called on that page.
*
* For the page ranges defined by [page .. page+npages], make that range (or
* its head pages, if a compound page) dirty, if @make_dirty is true, and if the
* page range was previously listed as clean.
*
* set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
* required, then the caller should a) verify that this is really correct,
* because _lock() is usually required, and b) hand code it:
* set_page_dirty_lock(), unpin_user_page().
*
*/
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
bool make_dirty)
{
unsigned long index;
struct page *head;
unsigned int ntails;
for_each_compound_range(index, &page, npages, head, ntails) {
if (make_dirty && !PageDirty(head))
set_page_dirty_lock(head);
put_compound_head(head, ntails, FOLL_PIN);
}
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
/**
* unpin_user_pages() - release an array of gup-pinned pages.
* @pages: array of pages to be marked dirty and released.
* @npages: number of pages in the @pages array.
*
* For each page in the @pages array, release the page using unpin_user_page().
*
* Please see the unpin_user_page() documentation for details.
*/
void unpin_user_pages(struct page **pages, unsigned long npages)
{
unsigned long index;
struct page *head;
unsigned int ntails;
/*
* If this WARN_ON() fires, then the system *might* be leaking pages (by
* leaving them pinned), but probably not. More likely, gup/pup returned
* a hard -ERRNO error to the caller, who erroneously passed it here.
*/
if (WARN_ON(IS_ERR_VALUE(npages)))
return;
for_each_compound_head(index, pages, npages, head, ntails)
put_compound_head(head, ntails, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_pages);
/*
* Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
* lifecycle. Avoid setting the bit unless necessary, or it might cause write
* cache bouncing on large SMP machines for concurrent pinned gups.
*/
static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
{
if (!test_bit(MMF_HAS_PINNED, mm_flags))
set_bit(MMF_HAS_PINNED, mm_flags);
}
#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags)
{
/*
* When core dumping an enormous anonymous area that nobody
* has touched so far, we don't want to allocate unnecessary pages or
* page tables. Return error instead of NULL to skip handle_mm_fault,
* then get_dump_page() will return NULL to leave a hole in the dump.
* But we can only make this optimization where a hole would surely
* be zero-filled if handle_mm_fault() actually did handle it.
*/
if ((flags & FOLL_DUMP) && (vma_is_anonymous(vma) || !vma->vm_ops->fault))
return ERR_PTR(-EFAULT);
return NULL;
}
static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
pte_t *pte, unsigned int flags)
{
/* No page to get reference */
if (flags & FOLL_GET)
return -EFAULT;
if (flags & FOLL_TOUCH) { pte_t entry = *pte;
if (flags & FOLL_WRITE)
entry = pte_mkdirty(entry);
entry = pte_mkyoung(entry);
if (!pte_same(*pte, entry)) {
set_pte_at(vma->vm_mm, address, pte, entry);
update_mmu_cache(vma, address, pte);
}
}
/* Proper page table entry exists, but no corresponding struct page */
return -EEXIST;
}
/*
* FOLL_FORCE can write to even unwritable pte's, but only
* after we've gone through a COW cycle and they are dirty.
*/
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
return pte_write(pte) ||
((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, unsigned int flags,
struct dev_pagemap **pgmap)
{
struct mm_struct *mm = vma->vm_mm;
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
int ret;
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
pte = *ptep;
if (!pte_present(pte)) {
swp_entry_t entry;
/*
* KSM's break_ksm() relies upon recognizing a ksm page
* even while it is being migrated, so for that case we
* need migration_entry_wait().
*/
if (likely(!(flags & FOLL_MIGRATION)))
goto no_page;
if (pte_none(pte))
goto no_page;
entry = pte_to_swp_entry(pte);
if (!is_migration_entry(entry))
goto no_page;
pte_unmap_unlock(ptep, ptl);
migration_entry_wait(mm, pmd, address);
goto retry;
}
if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
pte_unmap_unlock(ptep, ptl);
return NULL;
}
page = vm_normal_page(vma, address, pte); if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
/*
* Only return device mapping pages in the FOLL_GET or FOLL_PIN
* case since they are only valid while holding the pgmap
* reference.
*/
*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
if (*pgmap)
page = pte_page(pte);
else
goto no_page;
} else if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
page = ERR_PTR(-EFAULT);
goto out;
}
if (is_zero_pfn(pte_pfn(pte))) {
page = pte_page(pte);
} else {
ret = follow_pfn_pte(vma, address, ptep, flags);
page = ERR_PTR(ret);
goto out;
}
}
/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
if (unlikely(!try_grab_page(page, flags))) {
page = ERR_PTR(-ENOMEM);
goto out;
}
/*
* We need to make the page accessible if and only if we are going
* to access its content (the FOLL_PIN case). Please see
* Documentation/core-api/pin_user_pages.rst for details.
*/
if (flags & FOLL_PIN) {
ret = arch_make_page_accessible(page);
if (ret) {
unpin_user_page(page);
page = ERR_PTR(ret);
goto out;
}
}
if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
set_page_dirty(page);
/*
* pte_mkyoung() would be more correct here, but atomic care
* is needed to avoid losing the dirty bit: it is easier to use
* mark_page_accessed().
*/
mark_page_accessed(page);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/* Do not mlock pte-mapped THP */
if (PageTransCompound(page))
goto out;
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
* which might bounce very badly if there is contention.
*
* If the page is already locked, we don't need to
* handle it now - vmscan will handle it later if and
* when it attempts to reclaim the page.
*/
if (page->mapping && trylock_page(page)) { lru_add_drain(); /* push cached pages to LRU */
/*
* Because we lock page here, and migration is
* blocked by the pte's page reference, and we
* know the page is still mapped, we don't even
* need to check for file-cache page truncation.
*/
mlock_vma_page(page);
unlock_page(page);
}
}
out:
pte_unmap_unlock(ptep, ptl);
return page;
no_page:
pte_unmap_unlock(ptep, ptl);
if (!pte_none(pte))
return NULL;
return no_page_table(vma, flags);
}
static struct page *follow_pmd_mask(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
unsigned int flags,
struct follow_page_context *ctx)
{
pmd_t *pmd, pmdval;
spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
pmd = pmd_offset(pudp, address);
/*
* The READ_ONCE() will stabilize the pmdval in a register or
* on the stack so that it will stop changing under the code.
*/
pmdval = READ_ONCE(*pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { page = follow_huge_pmd(mm, address, pmd, flags);
if (page)
return page;
return no_page_table(vma, flags);
}
if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
page = follow_huge_pd(vma, address,
__hugepd(pmd_val(pmdval)), flags,
PMD_SHIFT);
if (page)
return page;
return no_page_table(vma, flags);
}
retry:
if (!pmd_present(pmdval)) {
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
VM_BUG_ON(thp_migration_supported() &&
!is_pmd_migration_entry(pmdval));
if (is_pmd_migration_entry(pmdval))
pmd_migration_entry_wait(mm, pmd);
pmdval = READ_ONCE(*pmd);
/*
* MADV_DONTNEED may convert the pmd to null because
* mmap_lock is held in read mode
*/
if (pmd_none(pmdval))
return no_page_table(vma, flags);
goto retry;
}
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
spin_unlock(ptl);
if (page)
return page;
}
if (likely(!pmd_trans_huge(pmdval)))
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
return no_page_table(vma, flags);
retry_locked:
ptl = pmd_lock(mm, pmd);
if (unlikely(pmd_none(*pmd))) {
spin_unlock(ptl);
return no_page_table(vma, flags);
}
if (unlikely(!pmd_present(*pmd))) {
spin_unlock(ptl);
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
pmd_migration_entry_wait(mm, pmd);
goto retry_locked;
}
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
if (flags & FOLL_SPLIT_PMD) {
int ret;
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
ret = 0;
split_huge_pmd(vma, pmd, address);
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
} else {
spin_unlock(ptl);
split_huge_pmd(vma, pmd, address);
ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
}
return ret ? ERR_PTR(ret) :
follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
page = follow_trans_huge_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
ctx->page_mask = HPAGE_PMD_NR - 1;
return page;
}
static struct page *follow_pud_mask(struct vm_area_struct *vma,
unsigned long address, p4d_t *p4dp,
unsigned int flags,
struct follow_page_context *ctx)
{
pud_t *pud;
spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
pud = pud_offset(p4dp, address);
if (pud_none(*pud))
return no_page_table(vma, flags);
if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) { page = follow_huge_pud(mm, address, pud, flags);
if (page)
return page;
return no_page_table(vma, flags);
}
if (is_hugepd(__hugepd(pud_val(*pud)))) {
page = follow_huge_pd(vma, address,
__hugepd(pud_val(*pud)), flags,
PUD_SHIFT);
if (page)
return page;
return no_page_table(vma, flags);
}
if (pud_devmap(*pud)) {
ptl = pud_lock(mm, pud);
page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
spin_unlock(ptl);
if (page)
return page;
}
if (unlikely(pud_bad(*pud)))
return no_page_table(vma, flags);
return follow_pmd_mask(vma, address, pud, flags, ctx);
}
static struct page *follow_p4d_mask(struct vm_area_struct *vma,
unsigned long address, pgd_t *pgdp,
unsigned int flags,
struct follow_page_context *ctx)
{
p4d_t *p4d;
struct page *page;
p4d = p4d_offset(pgdp, address);
if (p4d_none(*p4d))
return no_page_table(vma, flags);
BUILD_BUG_ON(p4d_huge(*p4d));
if (unlikely(p4d_bad(*p4d)))
return no_page_table(vma, flags);
if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
page = follow_huge_pd(vma, address,
__hugepd(p4d_val(*p4d)), flags,
P4D_SHIFT);
if (page)
return page;
return no_page_table(vma, flags);
}
return follow_pud_mask(vma, address, p4d, flags, ctx);
}
/**
* follow_page_mask - look up a page descriptor from a user-virtual address
* @vma: vm_area_struct mapping @address
* @address: virtual address to look up
* @flags: flags modifying lookup behaviour
* @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
* pointer to output page_mask
*
* @flags can have FOLL_ flags set, defined in <linux/mm.h>
*
* When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
* the device's dev_pagemap metadata to avoid repeating expensive lookups.
*
* On output, the @ctx->page_mask is set according to the size of the page.
*
* Return: the mapped (struct page *), %NULL if no mapping exists, or
* an error pointer if there is a mapping to something not represented
* by a page descriptor (see also vm_normal_page()).
*/
static struct page *follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
struct follow_page_context *ctx)
{
pgd_t *pgd;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
ctx->page_mask = 0;
/* make this handle hugepd */
page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
if (!IS_ERR(page)) {
WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
return page;
}
pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
return no_page_table(vma, flags);
if (pgd_huge(*pgd)) {
page = follow_huge_pgd(mm, address, pgd, flags);
if (page)
return page;
return no_page_table(vma, flags);
}
if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
page = follow_huge_pd(vma, address,
__hugepd(pgd_val(*pgd)), flags,
PGDIR_SHIFT);
if (page)
return page;
return no_page_table(vma, flags);
}
return follow_p4d_mask(vma, address, pgd, flags, ctx);
}
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
unsigned int foll_flags)
{
struct follow_page_context ctx = { NULL };
struct page *page;
if (vma_is_secretmem(vma))
return NULL;
page = follow_page_mask(vma, address, foll_flags, &ctx);
if (ctx.pgmap)
put_dev_pagemap(ctx.pgmap);
return page;
}
static int get_gate_page(struct mm_struct *mm, unsigned long address,
unsigned int gup_flags, struct vm_area_struct **vma,
struct page **page)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
int ret = -EFAULT;
/* user gate pages are read-only */
if (gup_flags & FOLL_WRITE)
return -EFAULT;
if (address > TASK_SIZE)
pgd = pgd_offset_k(address);
else
pgd = pgd_offset_gate(mm, address); if (pgd_none(*pgd))
return -EFAULT;
p4d = p4d_offset(pgd, address);
if (p4d_none(*p4d))
return -EFAULT;
pud = pud_offset(p4d, address);
if (pud_none(*pud))
return -EFAULT;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return -EFAULT;
VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map(pmd, address);
if (pte_none(*pte))
goto unmap;
*vma = get_gate_vma(mm);
if (!page)
goto out;
*page = vm_normal_page(*vma, address, *pte);
if (!*page) {
if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
goto unmap;
*page = pte_page(*pte);
}
if (unlikely(!try_grab_page(*page, gup_flags))) {
ret = -ENOMEM;
goto unmap;
}
out:
ret = 0;
unmap:
pte_unmap(pte);
return ret;
}
/*
* mmap_lock must be held on entry. If @locked != NULL and *@flags
* does not include FOLL_NOWAIT, the mmap_lock may be released. If it
* is, *@locked will be set to 0 and -EBUSY returned.
*/
static int faultin_page(struct vm_area_struct *vma,
unsigned long address, unsigned int *flags, int *locked)
{
unsigned int fault_flags = 0;
vm_fault_t ret;
/* mlock all present pages, but do not fault in new pages */
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
return -ENOENT;
if (*flags & FOLL_NOFAULT)
return -EFAULT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
fault_flags |= FAULT_FLAG_REMOTE; if (locked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) {
/*
* Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
* can co-exist
*/
fault_flags |= FAULT_FLAG_TRIED;
}
ret = handle_mm_fault(vma, address, fault_flags, NULL);
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
if (err)
return err;
BUG();
}
if (ret & VM_FAULT_RETRY) { if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *locked = 0;
return -EBUSY;
}
/*
* The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
* necessary, even if maybe_mkwrite decided not to set pte_write. We
* can thus safely do subsequent page lookups as if they were reads.
* But only do so when looping for pte_write is futile: in some cases
* userspace may also be wanting to write to the gotten user page,
* which a read fault here might prevent (a readonly page might get
* reCOWed by userspace write).
*/
if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) *flags |= FOLL_COW;
return 0;
}
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
vm_flags_t vm_flags = vma->vm_flags;
int write = (gup_flags & FOLL_WRITE);
int foreign = (gup_flags & FOLL_REMOTE);
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT; if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
return -EFAULT;
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
return -EOPNOTSUPP;
if (vma_is_secretmem(vma))
return -EFAULT;
if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
/*
* We used to let the write,force case do COW in a
* VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
* set a breakpoint in a read-only mapping of an
* executable, without corrupting the file (yet only
* when that file had been opened for writing!).
* Anon pages in shared mappings are surprising: now
* just reject it.
*/
if (!is_cow_mapping(vm_flags))
return -EFAULT;
}
} else if (!(vm_flags & VM_READ)) { if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
/*
* Is there actually any vma we can reach here which does not
* have VM_MAYREAD set?
*/
if (!(vm_flags & VM_MAYREAD))
return -EFAULT;
}
/*
* gups are always data accesses, not instruction
* fetches, so execute=false here
*/
if (!arch_vma_access_permitted(vma, write, false, foreign))
return -EFAULT;
return 0;
}
/**
* __get_user_pages() - pin user pages in memory
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @locked: whether we're still with the mmap_lock held
*
* Returns either number of pages pinned (which may be less than the
* number requested), or an error. Details about the return value:
*
* -- If nr_pages is 0, returns 0.
* -- If nr_pages is >0, but no pages were pinned, returns -errno.
* -- If nr_pages is >0, and some pages were pinned, returns the number of
* pages pinned. Again, this may be less than nr_pages.
* -- 0 return value is possible when the fault would need to be retried.
*
* The caller is responsible for releasing returned @pages, via put_page().
*
* @vmas are valid only as long as mmap_lock is held.
*
* Must be called with mmap_lock held. It may be released. See below.
*
* __get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
* __get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
* locks can't be held over the syscall boundary.
*
* If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
* the page is written to, set_page_dirty (or set_page_dirty_lock, as
* appropriate) must be called after the page is finished with, and
* before put_page is called.
*
* If @locked != NULL, *@locked will be set to 0 when mmap_lock is
* released by an up_read(). That can happen if @gup_flags does not
* have FOLL_NOWAIT.
*
* A caller using such a combination of @locked and @gup_flags
* must therefore hold the mmap_lock for reading only, and recognize
* when it's been released. Otherwise, it must be held for either
* reading or writing and will not be released.
*
* In most cases, get_user_pages or get_user_pages_fast should be used
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
*/
static long __get_user_pages(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
struct follow_page_context ctx = { NULL };
if (!nr_pages)
return 0;
start = untagged_addr(start);
VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
/*
* If FOLL_FORCE is set then do not force a full fault as the hinting
* fault information is unrelated to the reference behaviour of a task
* using the address space
*/
if (!(gup_flags & FOLL_FORCE)) gup_flags |= FOLL_NUMA;
do {
struct page *page;
unsigned int foll_flags = gup_flags;
unsigned int page_increm;
/* first iteration or cross vma bound */
if (!vma || start >= vma->vm_end) { vma = find_extend_vma(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
pages ? &pages[i] : NULL);
if (ret)
goto out;
ctx.page_mask = 0;
goto next_page;
}
if (!vma) {
ret = -EFAULT;
goto out;
}
ret = check_vma_flags(vma, gup_flags);
if (ret)
goto out;
if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
gup_flags, locked);
if (locked && *locked == 0) {
/*
* We've got a VM_FAULT_RETRY
* and we've lost mmap_lock.
* We must stop here.
*/
BUG_ON(gup_flags & FOLL_NOWAIT);
goto out;
}
continue;
}
}
retry:
/*
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
if (fatal_signal_pending(current)) {
ret = -EINTR;
goto out;
}
cond_resched();
page = follow_page_mask(vma, start, foll_flags, &ctx);
if (!page) {
ret = faultin_page(vma, start, &foll_flags, locked);
switch (ret) {
case 0:
goto retry;
case -EBUSY:
ret = 0;
fallthrough;
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
goto out;
case -ENOENT:
goto next_page;
}
BUG();
} else if (PTR_ERR(page) == -EEXIST) {
/*
* Proper page table entry exists, but no corresponding
* struct page.
*/
goto next_page;
} else if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
}
if (pages) { pages[i] = page;
flush_anon_page(vma, page, start);
flush_dcache_page(page);
ctx.page_mask = 0;
}
next_page:
if (vmas) { vmas[i] = vma;
ctx.page_mask = 0;
}
page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
if (page_increm > nr_pages) page_increm = nr_pages; i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
} while (nr_pages);
out:
if (ctx.pgmap)
put_dev_pagemap(ctx.pgmap);
return i ? i : ret;
}
static bool vma_permits_fault(struct vm_area_struct *vma,
unsigned int fault_flags)
{
bool write = !!(fault_flags & FAULT_FLAG_WRITE); bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
if (!(vm_flags & vma->vm_flags))
return false;
/*
* The architecture might have a hardware protection
* mechanism other than read/write that can deny access.
*
* gup always represents data access, not instruction
* fetches, so execute=false here:
*/
if (!arch_vma_access_permitted(vma, write, false, foreign))
return false;
return true;
}
/**
* fixup_user_fault() - manually resolve a user page fault
* @mm: mm_struct of target mm
* @address: user address
* @fault_flags:flags to pass down to handle_mm_fault()
* @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller
* does not allow retry. If NULL, the caller must guarantee
* that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
*
* This is meant to be called in the specific scenario where for locking reasons
* we try to access user memory in atomic context (within a pagefault_disable()
* section), this returns -EFAULT, and we want to resolve the user fault before
* trying again.
*
* Typically this is meant to be used by the futex code.
*
* The main difference with get_user_pages() is that this function will
* unconditionally call handle_mm_fault() which will in turn perform all the
* necessary SW fixup of the dirty and young bits in the PTE, while
* get_user_pages() only guarantees to update these in the struct page.
*
* This is important for some architectures where those bits also gate the
* access permission to the page because they are maintained in software. On
* such architectures, gup() will not be enough to make a subsequent access
* succeed.
*
* This function will not return with an unlocked mmap_lock. So it has not the
* same semantics wrt the @mm->mmap_lock as does filemap_fault().
*/
int fixup_user_fault(struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked)
{
struct vm_area_struct *vma;
vm_fault_t ret;
address = untagged_addr(address);
if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
retry:
vma = find_extend_vma(mm, address); if (!vma || address < vma->vm_start) return -EFAULT;
if (!vma_permits_fault(vma, fault_flags))
return -EFAULT;
if ((fault_flags & FAULT_FLAG_KILLABLE) &&
fatal_signal_pending(current))
return -EINTR;
ret = handle_mm_fault(vma, address, fault_flags, NULL);
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, 0);
if (err)
return err;
BUG();
}
if (ret & VM_FAULT_RETRY) {
mmap_read_lock(mm);
*unlocked = true;
fault_flags |= FAULT_FLAG_TRIED;
goto retry;
}
return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);
/*
* Please note that this function, unlike __get_user_pages will not
* return 0 for nr_pages > 0 without FOLL_NOWAIT
*/
static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
struct vm_area_struct **vmas,
int *locked,
unsigned int flags)
{
long ret, pages_done;
bool lock_dropped;
if (locked) {
/* if VM_FAULT_RETRY can be returned, vmas become invalid */
BUG_ON(vmas);
/* check caller initialized locked */
BUG_ON(*locked != 1);
}
if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags);
/*
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
* is to set FOLL_GET if the caller wants pages[] filled in (but has
* carelessly failed to specify FOLL_GET), so keep doing that, but only
* for FOLL_GET, not for the newer FOLL_PIN.
*
* FOLL_PIN always expects pages to be non-null, but no need to assert
* that here, as any failures will be obvious enough.
*/
if (pages && !(flags & FOLL_PIN)) flags |= FOLL_GET;
pages_done = 0;
lock_dropped = false;
for (;;) {
ret = __get_user_pages(mm, start, nr_pages, flags, pages,
vmas, locked);
if (!locked)
/* VM_FAULT_RETRY couldn't trigger, bypass */
return ret;
/* VM_FAULT_RETRY cannot return errors */
if (!*locked) {
BUG_ON(ret < 0); BUG_ON(ret >= nr_pages);
}
if (ret > 0) { nr_pages -= ret;
pages_done += ret;
if (!nr_pages)
break;
}
if (*locked) {
/*
* VM_FAULT_RETRY didn't trigger or it was a
* FOLL_NOWAIT.
*/
if (!pages_done)
pages_done = ret;
break;
}
/*
* VM_FAULT_RETRY triggered, so seek to the faulting offset.
* For the prefault case (!pages) we only update counts.
*/
if (likely(pages)) pages += ret; start += ret << PAGE_SHIFT;
lock_dropped = true;
retry:
/*
* Repeat on the address that fired VM_FAULT_RETRY
* with both FAULT_FLAG_ALLOW_RETRY and
* FAULT_FLAG_TRIED. Note that GUP can be interrupted
* by fatal signals, so we need to check it before we
* start trying again otherwise it can loop forever.
*/
if (fatal_signal_pending(current)) {
if (!pages_done)
pages_done = -EINTR;
break;
}
ret = mmap_read_lock_killable(mm);
if (ret) {
BUG_ON(ret > 0); if (!pages_done)
pages_done = ret;
break;
}
*locked = 1;
ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
pages, NULL, locked);
if (!*locked) {
/* Continue to retry until we succeeded */
BUG_ON(ret != 0);
goto retry;
}
if (ret != 1) { BUG_ON(ret > 1); if (!pages_done)
pages_done = ret;
break;
}
nr_pages--;
pages_done++;
if (!nr_pages)
break;
if (likely(pages)) pages++; start += PAGE_SIZE;
}
if (lock_dropped && *locked) {
/*
* We must let the caller know we temporarily dropped the lock
* and so the critical section protected by it was lost.
*/
mmap_read_unlock(mm);
*locked = 0;
}
return pages_done;
}
/**
* populate_vma_page_range() - populate a range of pages in the vma.
* @vma: target vma
* @start: start address
* @end: end address
* @locked: whether the mmap_lock is still held
*
* This takes care of mlocking the pages too if VM_LOCKED is set.
*
* Return either number of pages pinned in the vma, or a negative error
* code on error.
*
* vma->vm_mm->mmap_lock must be held.
*
* If @locked is NULL, it may be held for read or write and will
* be unperturbed.
*
* If @locked is non-NULL, it must held for read only and may be
* released. If it's released, *@locked will be set to 0.
*/
long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *locked)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long nr_pages = (end - start) / PAGE_SIZE;
int gup_flags;
VM_BUG_ON(!PAGE_ALIGNED(start));
VM_BUG_ON(!PAGE_ALIGNED(end));
VM_BUG_ON_VMA(start < vma->vm_start, vma);
VM_BUG_ON_VMA(end > vma->vm_end, vma);
mmap_assert_locked(mm);
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
if (vma->vm_flags & VM_LOCKONFAULT)
gup_flags &= ~FOLL_POPULATE;
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
* and we would not want to dirty them for nothing.
*/
if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
gup_flags |= FOLL_WRITE;
/*
* We want mlock to succeed for regions that have any permissions
* other than PROT_NONE.
*/
if (vma_is_accessible(vma))
gup_flags |= FOLL_FORCE;
/*
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
return __get_user_pages(mm, start, nr_pages, gup_flags,
NULL, NULL, locked);
}
/*
* faultin_vma_page_range() - populate (prefault) page tables inside the
* given VMA range readable/writable
*
* This takes care of mlocking the pages, too, if VM_LOCKED is set.
*
* @vma: target vma
* @start: start address
* @end: end address
* @write: whether to prefault readable or writable
* @locked: whether the mmap_lock is still held
*
* Returns either number of processed pages in the vma, or a negative error
* code on error (see __get_user_pages()).
*
* vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
* covered by the VMA.
*
* If @locked is NULL, it may be held for read or write and will be unperturbed.
*
* If @locked is non-NULL, it must held for read only and may be released. If
* it's released, *@locked will be set to 0.
*/
long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, bool write, int *locked)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long nr_pages = (end - start) / PAGE_SIZE;
int gup_flags;
VM_BUG_ON(!PAGE_ALIGNED(start));
VM_BUG_ON(!PAGE_ALIGNED(end));
VM_BUG_ON_VMA(start < vma->vm_start, vma);
VM_BUG_ON_VMA(end > vma->vm_end, vma);
mmap_assert_locked(mm);
/*
* FOLL_TOUCH: Mark page accessed and thereby young; will also mark
* the page dirty with FOLL_WRITE -- which doesn't make a
* difference with !FOLL_FORCE, because the page is writable
* in the page table.
* FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
* a poisoned page.
* FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT.
* !FOLL_FORCE: Require proper access permissions.
*/
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON;
if (write)
gup_flags |= FOLL_WRITE;
/*
* We want to report -EINVAL instead of -EFAULT for any permission
* problems or incompatible mappings.
*/
if (check_vma_flags(vma, gup_flags))
return -EINVAL;
return __get_user_pages(mm, start, nr_pages, gup_flags,
NULL, NULL, locked);
}
/*
* __mm_populate - populate and/or mlock pages within a range of address space.
*
* This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
* flags. VMAs must be already marked with the desired vm_flags, and
* mmap_lock must not be held.
*/
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
struct mm_struct *mm = current->mm;
unsigned long end, nstart, nend;
struct vm_area_struct *vma = NULL;
int locked = 0;
long ret = 0;
end = start + len;
for (nstart = start; nstart < end; nstart = nend) {
/*
* We want to fault in pages for [nstart; end) address range.
* Find first corresponding VMA.
*/
if (!locked) {
locked = 1;
mmap_read_lock(mm);
vma = find_vma(mm, nstart);
} else if (nstart >= vma->vm_end)
vma = vma->vm_next;
if (!vma || vma->vm_start >= end)
break;
/*
* Set [nstart; nend) to intersection of desired address
* range with the first VMA. Also, skip undesirable VMA types.
*/
nend = min(end, vma->vm_end);
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
continue;
if (nstart < vma->vm_start)
nstart = vma->vm_start;
/*
* Now fault in a range of pages. populate_vma_page_range()
* double checks the vma flags, so that it won't mlock pages
* if the vma was already munlocked.
*/
ret = populate_vma_page_range(vma, nstart, nend, &locked);
if (ret < 0) {
if (ignore_errors) {
ret = 0;
continue; /* continue at next VMA */
}
break;
}
nend = nstart + ret * PAGE_SIZE;
ret = 0;
}
if (locked)
mmap_read_unlock(mm);
return ret; /* 0 or negative error code */
}
#else /* CONFIG_MMU */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
struct vm_area_struct **vmas, int *locked,
unsigned int foll_flags)
{
struct vm_area_struct *vma;
unsigned long vm_flags;
long i;
/* calculate required read or write permissions.
* If FOLL_FORCE is set, we only require the "MAY" flags.
*/
vm_flags = (foll_flags & FOLL_WRITE) ?
(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
vm_flags &= (foll_flags & FOLL_FORCE) ?
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
for (i = 0; i < nr_pages; i++) {
vma = find_vma(mm, start);
if (!vma)
goto finish_or_fault;
/* protect what we can, including chardevs */
if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
!(vm_flags & vma->vm_flags))
goto finish_or_fault;
if (pages) {
pages[i] = virt_to_page(start);
if (pages[i])
get_page(pages[i]);
}
if (vmas)
vmas[i] = vma;
start = (start + PAGE_SIZE) & PAGE_MASK;
}
return i;
finish_or_fault:
return i ? : -EFAULT;
}
#endif /* !CONFIG_MMU */
/**
* fault_in_writeable - fault in userspace address range for writing
* @uaddr: start of address range
* @size: size of address range
*
* Returns the number of bytes not faulted in (like copy_to_user() and
* copy_from_user()).
*/
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
char __user *start = uaddr, *end;
if (unlikely(size == 0))
return 0;
if (!PAGE_ALIGNED(uaddr)) {
if (unlikely(__put_user(0, uaddr) != 0))
return size;
uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
}
end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
if (unlikely(end < start))
end = NULL;
while (uaddr != end) {
if (unlikely(__put_user(0, uaddr) != 0))
goto out;
uaddr += PAGE_SIZE;
}
out:
if (size > uaddr - start)
return size - (uaddr - start);
return 0;
}
EXPORT_SYMBOL(fault_in_writeable);
/*
* fault_in_safe_writeable - fault in an address range for writing
* @uaddr: start of address range
* @size: length of address range
*
* Faults in an address range for writing. This is primarily useful when we
* already know that some or all of the pages in the address range aren't in
* memory.
*
* Unlike fault_in_writeable(), this function is non-destructive.
*
* Note that we don't pin or otherwise hold the pages referenced that we fault
* in. There's no guarantee that they'll stay in memory for any duration of
* time.
*
* Returns the number of bytes not faulted in, like copy_to_user() and
* copy_from_user().
*/
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
unsigned long start = (unsigned long)uaddr, end;
struct mm_struct *mm = current->mm;
bool unlocked = false;
if (unlikely(size == 0))
return 0; end = PAGE_ALIGN(start + size);
if (end < start)
end = 0;
mmap_read_lock(mm);
do {
if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
break;
start = (start + PAGE_SIZE) & PAGE_MASK;
} while (start != end);
mmap_read_unlock(mm);
if (size > (unsigned long)uaddr - start)
return size - ((unsigned long)uaddr - start);
return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
/**
* fault_in_readable - fault in userspace address range for reading
* @uaddr: start of user address range
* @size: size of user address range
*
* Returns the number of bytes not faulted in (like copy_to_user() and
* copy_from_user()).
*/
size_t fault_in_readable(const char __user *uaddr, size_t size)
{
const char __user *start = uaddr, *end;
volatile char c;
if (unlikely(size == 0)) return 0; if (!PAGE_ALIGNED(uaddr)) { if (unlikely(__get_user(c, uaddr) != 0))
return size;
uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
}
end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
if (unlikely(end < start))
end = NULL;
while (uaddr != end) { if (unlikely(__get_user(c, uaddr) != 0))
goto out;
uaddr += PAGE_SIZE;
}
out:
(void)c;
if (size > uaddr - start)
return size - (uaddr - start);
return 0;
}
EXPORT_SYMBOL(fault_in_readable);
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
*
* Returns struct page pointer of user page pinned for dump,
* to be freed afterwards by put_page().
*
* Returns NULL on any kind of failure - a hole must then be inserted into
* the corefile, to preserve alignment with its headers; and also returns
* NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
* allowing a hole to be left in the corefile to save disk space.
*
* Called without mmap_lock (takes and releases the mmap_lock by itself).
*/
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
struct mm_struct *mm = current->mm;
struct page *page;
int locked = 1;
int ret;
if (mmap_read_lock_killable(mm))
return NULL;
ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
if (locked)
mmap_read_unlock(mm);
return (ret == 1) ? page : NULL;
}
#endif /* CONFIG_ELF_CORE */
#ifdef CONFIG_MIGRATION
/*
* Check whether all pages are pinnable, if so return number of pages. If some
* pages are not pinnable, migrate them, and unpin all pages. Return zero if
* pages were migrated, or if some pages were not successfully isolated.
* Return negative error if migration fails.
*/
static long check_and_migrate_movable_pages(unsigned long nr_pages,
struct page **pages,
unsigned int gup_flags)
{
unsigned long i;
unsigned long isolation_error_count = 0;
bool drain_allow = true;
LIST_HEAD(movable_page_list);
long ret = 0;
struct page *prev_head = NULL;
struct page *head;
struct migration_target_control mtc = {
.nid = NUMA_NO_NODE,
.gfp_mask = GFP_USER | __GFP_NOWARN,
};
for (i = 0; i < nr_pages; i++) {
head = compound_head(pages[i]);
if (head == prev_head)
continue;
prev_head = head;
/*
* If we get a movable page, since we are going to be pinning
* these entries, try to move them out if possible.
*/
if (!is_pinnable_page(head)) {
if (PageHuge(head)) {
if (!isolate_huge_page(head, &movable_page_list))
isolation_error_count++;
} else {
if (!PageLRU(head) && drain_allow) {
lru_add_drain_all();
drain_allow = false;
}
if (isolate_lru_page(head)) {
isolation_error_count++;
continue;
}
list_add_tail(&head->lru, &movable_page_list);
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON +
page_is_file_lru(head),
thp_nr_pages(head));
}
}
}
/*
* If list is empty, and no isolation errors, means that all pages are
* in the correct zone.
*/
if (list_empty(&movable_page_list) && !isolation_error_count)
return nr_pages;
if (gup_flags & FOLL_PIN) {
unpin_user_pages(pages, nr_pages);
} else {
for (i = 0; i < nr_pages; i++)
put_page(pages[i]);
}
if (!list_empty(&movable_page_list)) {
ret = migrate_pages(&movable_page_list, alloc_migration_target,
NULL, (unsigned long)&mtc, MIGRATE_SYNC,
MR_LONGTERM_PIN, NULL);
if (ret && !list_empty(&movable_page_list))
putback_movable_pages(&movable_page_list);
}
return ret > 0 ? -ENOMEM : ret;
}
#else
static long check_and_migrate_movable_pages(unsigned long nr_pages,
struct page **pages,
unsigned int gup_flags)
{
return nr_pages;
}
#endif /* CONFIG_MIGRATION */
/*
* __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
* allows us to process the FOLL_LONGTERM flag.
*/
static long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
struct vm_area_struct **vmas,
unsigned int gup_flags)
{
unsigned int flags;
long rc;
if (!(gup_flags & FOLL_LONGTERM))
return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
NULL, gup_flags);
flags = memalloc_pin_save();
do {
rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
NULL, gup_flags);
if (rc <= 0)
break;
rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
} while (!rc);
memalloc_pin_restore(flags);
return rc;
}
static bool is_valid_gup_flags(unsigned int gup_flags)
{
/*
* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
* never directly by the caller, so enforce that with an assertion:
*/
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return false;
/*
* FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
* that is, FOLL_LONGTERM is a specific case, more restrictive case of
* FOLL_PIN.
*/
if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
return false;
return true;
}
#ifdef CONFIG_MMU
static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
/*
* Parts of FOLL_LONGTERM behavior are incompatible with
* FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
* vmas. However, this only comes up if locked is set, and there are
* callers that do request FOLL_LONGTERM, but do not set locked. So,
* allow what we can.
*/
if (gup_flags & FOLL_LONGTERM) {
if (WARN_ON_ONCE(locked))
return -EINVAL;
/*
* This will check the vmas (even if our vmas arg is NULL)
* and return -ENOTSUPP if DAX isn't allowed in this case:
*/
return __gup_longterm_locked(mm, start, nr_pages, pages,
vmas, gup_flags | FOLL_TOUCH |
FOLL_REMOTE);
}
return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
locked,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
/**
* get_user_pages_remote() - pin user pages in memory
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
*
* Returns either number of pages pinned (which may be less than the
* number requested), or an error. Details about the return value:
*
* -- If nr_pages is 0, returns 0.
* -- If nr_pages is >0, but no pages were pinned, returns -errno.
* -- If nr_pages is >0, and some pages were pinned, returns the number of
* pages pinned. Again, this may be less than nr_pages.
*
* The caller is responsible for releasing returned @pages, via put_page().
*
* @vmas are valid only as long as mmap_lock is held.
*
* Must be called with mmap_lock held for read or write.
*
* get_user_pages_remote walks a process's page tables and takes a reference
* to each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
* get_user_pages_remote returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
* locks can't be held over the syscall boundary.
*
* If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
* is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
* be called after the page is finished with, and before put_page is called.
*
* get_user_pages_remote is typically used for fewer-copy IO operations,
* to get a handle on the memory by some means other than accesses
* via the user virtual addresses. The pages may be submitted for
* DMA to devices or accessed via their kernel linear mapping (via the
* kmap APIs). Care should be taken to use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
*
* get_user_pages_remote should be phased out in favor of
* get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
* should use get_user_pages_remote because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
}
EXPORT_SYMBOL(get_user_pages_remote);
#else /* CONFIG_MMU */
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
return 0;
}
static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
return 0;
}
#endif /* !CONFIG_MMU */
/**
* get_user_pages() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
*
* This is the same as get_user_pages_remote(), just with a less-flexible
* calling convention where we assume that the mm being operated on belongs to
* the current task, and doesn't allow passing of a locked parameter. We also
* obviously don't pass FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
/**
* get_user_pages_locked() - variant of get_user_pages()
*
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
*
* It is suitable to replace the form:
*
* mmap_read_lock(mm);
* do_something()
* get_user_pages(mm, ..., pages, NULL);
* mmap_read_unlock(mm);
*
* to:
*
* int locked = 1;
* mmap_read_lock(mm);
* do_something()
* get_user_pages_locked(mm, ..., pages, &locked);
* if (locked)
* mmap_read_unlock(mm);
*
* We can leverage the VM_FAULT_RETRY functionality in the page fault
* paths better by using either get_user_pages_locked() or
* get_user_pages_unlocked().
*
*/
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
int *locked)
{
/*
* FIXME: Current FOLL_LONGTERM behavior is incompatible with
* FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
* vmas. As there are no users of this flag in this call we simply
* disallow this option for now.
*/
if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
return -EINVAL;
/*
* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
* never directly by the caller, so enforce that:
*/
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
return __get_user_pages_locked(current->mm, start, nr_pages,
pages, NULL, locked,
gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);
/*
* get_user_pages_unlocked() is suitable to replace the form:
*
* mmap_read_lock(mm);
* get_user_pages(mm, ..., pages, NULL);
* mmap_read_unlock(mm);
*
* with:
*
* get_user_pages_unlocked(mm, ..., pages);
*
* It is functionally equivalent to get_user_pages_fast so
* get_user_pages_fast should be used instead if specific gup_flags
* (e.g. FOLL_FORCE) are not required.
*/
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
{
struct mm_struct *mm = current->mm;
int locked = 1;
long ret;
/*
* FIXME: Current FOLL_LONGTERM behavior is incompatible with
* FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
* vmas. As there are no users of this flag in this call we simply
* disallow this option for now.
*/
if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
return -EINVAL;
mmap_read_lock(mm);
ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
&locked, gup_flags | FOLL_TOUCH);
if (locked)
mmap_read_unlock(mm);
return ret;
}
EXPORT_SYMBOL(get_user_pages_unlocked);
/*
* Fast GUP
*
* get_user_pages_fast attempts to pin user pages by walking the page
* tables directly and avoids taking locks. Thus the walker needs to be
* protected from page table pages being freed from under it, and should
* block any THP splits.
*
* One way to achieve this is to have the walker disable interrupts, and
* rely on IPIs from the TLB flushing code blocking before the page table
* pages are freed. This is unsuitable for architectures that do not need
* to broadcast an IPI when invalidating TLBs.
*
* Another way to achieve this is to batch up page table containing pages
* belonging to more than one mm_user, then rcu_sched a callback to free those
* pages. Disabling interrupts will allow the fast_gup walker to both block
* the rcu_sched callback, and an IPI that we broadcast for splitting THPs
* (which is a relatively rare event). The code below adopts this strategy.
*
* Before activating this code, please be aware that the following assumptions
* are currently made:
*
* *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
* free pages containing page tables or TLB flushing requires IPI broadcast.
*
* *) ptes can be read atomically by the architecture.
*
* *) access_ok is sufficient to validate userspace address ranges.
*
* The last two assumptions can be relaxed by the addition of helper functions.
*
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
#ifdef CONFIG_HAVE_FAST_GUP
static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
unsigned int flags,
struct page **pages)
{
while ((*nr) - nr_start) { struct page *page = pages[--(*nr)];
ClearPageReferenced(page);
if (flags & FOLL_PIN)
unpin_user_page(page);
else
put_page(page);
}
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
pte_t *ptep, *ptem;
ptem = ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = ptep_get_lockless(ptep);
struct page *head, *page;
/*
* Similar to the PMD case below, NUMA hinting must take slow
* path using the pte_protnone check.
*/
if (pte_protnone(pte))
goto pte_unmap;
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
goto pte_unmap;
if (pte_devmap(pte)) {
if (unlikely(flags & FOLL_LONGTERM))
goto pte_unmap;
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, flags, pages);
goto pte_unmap;
}
} else if (pte_special(pte))
goto pte_unmap;
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
head = try_grab_compound_head(page, 1, flags);
if (!head)
goto pte_unmap;
if (unlikely(page_is_secretmem(page))) {
put_compound_head(head, 1, flags);
goto pte_unmap;
}
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
put_compound_head(head, 1, flags);
goto pte_unmap;
}
VM_BUG_ON_PAGE(compound_head(page) != head, page);
/*
* We need to make the page accessible if and only if we are
* going to access its content (the FOLL_PIN case). Please
* see Documentation/core-api/pin_user_pages.rst for
* details.
*/
if (flags & FOLL_PIN) {
ret = arch_make_page_accessible(page);
if (ret) {
unpin_user_page(page);
goto pte_unmap;
}
}
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
} while (ptep++, addr += PAGE_SIZE, addr != end);
ret = 1;
pte_unmap:
if (pgmap)
put_dev_pagemap(pgmap);
pte_unmap(ptem);
return ret;
}
#else
/*
* If we can't determine whether or not a pte is special, then fail immediately
* for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
* to be special.
*
* For a futex to be placed on a THP tail page, get_futex_key requires a
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
return 0;
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
int nr_start = *nr;
struct dev_pagemap *pgmap = NULL;
int ret = 1;
do {
struct page *page = pfn_to_page(pfn);
pgmap = get_dev_pagemap(pfn, pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, flags, pages);
ret = 0;
break;
}
SetPageReferenced(page);
pages[*nr] = page;
if (unlikely(!try_grab_page(page, flags))) {
undo_dev_pagemap(nr, nr_start, flags, pages);
ret = 0;
break;
}
(*nr)++;
pfn++;
} while (addr += PAGE_SIZE, addr != end);
put_dev_pagemap(pgmap);
return ret;
}
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
unsigned long fault_pfn;
int nr_start = *nr;
fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
return 0;
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
undo_dev_pagemap(nr, nr_start, flags, pages);
return 0;
}
return 1;
}
static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
unsigned long fault_pfn;
int nr_start = *nr;
fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
return 0;
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
undo_dev_pagemap(nr, nr_start, flags, pages);
return 0;
}
return 1;
}
#else
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
BUILD_BUG();
return 0;
}
static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
BUILD_BUG();
return 0;
}
#endif
static int record_subpages(struct page *page, unsigned long addr,
unsigned long end, struct page **pages)
{
int nr;
for (nr = 0; addr != end; addr += PAGE_SIZE) pages[nr++] = page++;
return nr;
}
#ifdef CONFIG_ARCH_HAS_HUGEPD
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
{
unsigned long __boundary = (addr + sz) & ~(sz-1);
return (__boundary - 1 < end - 1) ? __boundary : end;
}
static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
unsigned long pte_end;
struct page *head, *page;
pte_t pte;
int refs;
pte_end = (addr + sz) & ~(sz-1);
if (pte_end < end)
end = pte_end;
pte = huge_ptep_get(ptep);
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
return 0;
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
head = pte_page(pte);
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
head = try_grab_compound_head(head, refs, flags);
if (!head)
return 0;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
put_compound_head(head, refs, flags);
return 0;
}
*nr += refs;
SetPageReferenced(head);
return 1;
}
static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
unsigned int pdshift, unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
pte_t *ptep;
unsigned long sz = 1UL << hugepd_shift(hugepd);
unsigned long next;
ptep = hugepte_offset(hugepd, addr, pdshift);
do {
next = hugepte_addr_end(addr, end, sz);
if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
return 0;
} while (ptep++, addr = next, addr != end);
return 1;
}
#else
static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
unsigned int pdshift, unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
return 0;
}
#endif /* CONFIG_ARCH_HAS_HUGEPD */
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
if (pmd_devmap(orig)) {
if (unlikely(flags & FOLL_LONGTERM))
return 0;
return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
pages, nr);
}
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
head = try_grab_compound_head(pmd_page(orig), refs, flags);
if (!head)
return 0;
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
put_compound_head(head, refs, flags);
return 0;
}
*nr += refs;
SetPageReferenced(head);
return 1;
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
if (!pud_access_permitted(orig, flags & FOLL_WRITE))
return 0;
if (pud_devmap(orig)) {
if (unlikely(flags & FOLL_LONGTERM))
return 0;
return __gup_device_huge_pud(orig, pudp, addr, end, flags,
pages, nr);
}
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
head = try_grab_compound_head(pud_page(orig), refs, flags);
if (!head)
return 0;
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
put_compound_head(head, refs, flags);
return 0;
}
*nr += refs;
SetPageReferenced(head);
return 1;
}
static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
int refs;
struct page *head, *page;
if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
head = try_grab_compound_head(pgd_page(orig), refs, flags);
if (!head)
return 0;
if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
put_compound_head(head, refs, flags);
return 0;
}
*nr += refs;
SetPageReferenced(head);
return 1;
}
static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
pmdp = pmd_offset_lockless(pudp, pud, addr);
do {
pmd_t pmd = READ_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
if (!pmd_present(pmd))
return 0;
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
pmd_devmap(pmd))) {
/*
* NUMA hinting faults need to be handled in the GUP
* slowpath for accounting purposes and so that they
* can be serialised against THP migration.
*/
if (pmd_protnone(pmd))
return 0;
if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
pages, nr))
return 0;
} else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
/*
* architecture have different format for hugetlbfs
* pmd format and THP pmd format
*/
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
} else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
return 1;
}
static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
pudp = pud_offset_lockless(p4dp, p4d, addr);
do {
pud_t pud = READ_ONCE(*pudp);
next = pud_addr_end(addr, end);
if (unlikely(!pud_present(pud)))
return 0;
if (unlikely(pud_huge(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, flags,
pages, nr))
return 0;
} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
PUD_SHIFT, next, flags, pages, nr))
return 0;
} else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);
return 1;
}
static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
p4d_t *p4dp;
p4dp = p4d_offset_lockless(pgdp, pgd, addr);
do {
p4d_t p4d = READ_ONCE(*p4dp);
next = p4d_addr_end(addr, end);
if (p4d_none(p4d))
return 0;
BUILD_BUG_ON(p4d_huge(p4d));
if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
P4D_SHIFT, next, flags, pages, nr))
return 0;
} else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
return 1;
}
static void gup_pgd_range(unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pgd_t *pgdp;
pgdp = pgd_offset(current->mm, addr);
do {
pgd_t pgd = READ_ONCE(*pgdp);
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
return;
if (unlikely(pgd_huge(pgd))) {
if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
pages, nr))
return;
} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
PGDIR_SHIFT, next, flags, pages, nr))
return;
} else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
#else
static inline void gup_pgd_range(unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
}
#endif /* CONFIG_HAVE_FAST_GUP */
#ifndef gup_fast_permitted
/*
* Check if it's allowed to use get_user_pages_fast_only() for the range, or
* we need to fall back to the slow version:
*/
static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
return true;
}
#endif
static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
int ret;
/*
* FIXME: FOLL_LONGTERM does not work with
* get_user_pages_unlocked() (see comments in that function)
*/
if (gup_flags & FOLL_LONGTERM) {
mmap_read_lock(current->mm);
ret = __gup_longterm_locked(current->mm,
start, nr_pages,
pages, NULL, gup_flags);
mmap_read_unlock(current->mm);
} else {
ret = get_user_pages_unlocked(start, nr_pages,
pages, gup_flags);
}
return ret;
}
static unsigned long lockless_pages_from_mm(unsigned long start,
unsigned long end,
unsigned int gup_flags,
struct page **pages)
{
unsigned long flags;
int nr_pinned = 0;
unsigned seq;
if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
!gup_fast_permitted(start, end))
return 0;
if (gup_flags & FOLL_PIN) {
seq = raw_read_seqcount(¤t->mm->write_protect_seq);
if (seq & 1)
return 0;
}
/*
* Disable interrupts. The nested form is used, in order to allow full,
* general purpose use of this routine.
*
* With interrupts disabled, we block page table pages from being freed
* from under us. See struct mmu_table_batch comments in
* include/asm-generic/tlb.h for more details.
*
* We do not adopt an rcu_read_lock() here as we also want to block IPIs
* that come from THPs splitting.
*/
local_irq_save(flags);
gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
local_irq_restore(flags);
/*
* When pinning pages for DMA there could be a concurrent write protect
* from fork() via copy_page_range(), in this case always fail fast GUP.
*/
if (gup_flags & FOLL_PIN) {
if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) {
unpin_user_pages(pages, nr_pinned);
return 0;
}
}
return nr_pinned;
}
static int internal_get_user_pages_fast(unsigned long start,
unsigned long nr_pages,
unsigned int gup_flags,
struct page **pages)
{
unsigned long len, end;
unsigned long nr_pinned;
int ret;
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN | FOLL_GET |
FOLL_FAST_ONLY | FOLL_NOFAULT)))
return -EINVAL;
if (gup_flags & FOLL_PIN)
mm_set_has_pinned_flag(¤t->mm->flags);
if (!(gup_flags & FOLL_FAST_ONLY))
might_lock_read(¤t->mm->mmap_lock);
start = untagged_addr(start) & PAGE_MASK;
len = nr_pages << PAGE_SHIFT;
if (check_add_overflow(start, len, &end))
return 0;
if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
return nr_pinned;
/* Slow path: try to get the remaining pages with get_user_pages */
start += nr_pinned << PAGE_SHIFT;
pages += nr_pinned;
ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
pages);
if (ret < 0) {
/*
* The caller has to unpin the pages we already pinned so
* returning -errno is not an option
*/
if (nr_pinned) return nr_pinned;
return ret;
}
return ret + nr_pinned;
}
/**
* get_user_pages_fast_only() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
* Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
* the regular GUP.
* Note a difference with get_user_pages_fast: this always returns the
* number of pages pinned, 0 if no pages were pinned.
*
* If the architecture does not support this function, simply return with no
* pages pinned.
*
* Careful, careful! COW breaking can go either way, so a non-write
* access can get ambiguous page results. If you call this function without
* 'write' set, you'd better be sure that you're ok with that ambiguity.
*/
int get_user_pages_fast_only(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
int nr_pinned;
/*
* Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
* because gup fast is always a "pin with a +1 page refcount" request.
*
* FOLL_FAST_ONLY is required in order to match the API description of
* this routine: no fall back to regular ("slow") GUP.
*/
gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
pages);
/*
* As specified in the API description above, this routine is not
* allowed to return negative values. However, the common core
* routine internal_get_user_pages_fast() *can* return -errno.
* Therefore, correct for that here:
*/
if (nr_pinned < 0)
nr_pinned = 0;
return nr_pinned;
}
EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
* Attempt to pin user pages in memory without taking mm->mmap_lock.
* If not successful, it will fall back to taking the lock and
* calling get_user_pages().
*
* Returns number of pages pinned. This may be fewer than the number requested.
* If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
* -errno.
*/
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
/*
* The caller may or may not have explicitly set FOLL_GET; either way is
* OK. However, internally (within mm/gup.c), gup fast variants must set
* FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
* request.
*/
gup_flags |= FOLL_GET; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
/**
* pin_user_pages_fast() - pin user pages in memory without taking locks
*
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
* Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
* get_user_pages_fast() for documentation on the function arguments, because
* the arguments here are identical.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for further details.
*/
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
gup_flags |= FOLL_PIN;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
/*
* This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
* is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
*
* The API rules are the same, too: no negative values may be returned.
*/
int pin_user_pages_fast_only(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
int nr_pinned;
/*
* FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
* rules require returning 0, rather than -errno:
*/
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return 0;
/*
* FOLL_FAST_ONLY is required in order to match the API description of
* this routine: no fall back to regular ("slow") GUP.
*/
gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
pages);
/*
* This routine is not allowed to return negative values. However,
* internal_get_user_pages_fast() *can* return -errno. Therefore,
* correct for that here:
*/
if (nr_pinned < 0)
nr_pinned = 0;
return nr_pinned;
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
/**
* pin_user_pages_remote() - pin pages of a remote process
*
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
*
* Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
* get_user_pages_remote() for documentation on the function arguments, because
* the arguments here are identical.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for details.
*/
long pin_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
gup_flags |= FOLL_PIN;
return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
}
EXPORT_SYMBOL(pin_user_pages_remote);
/**
* pin_user_pages() - pin user pages in memory for use by other devices
*
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
*
* Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
* FOLL_PIN is set.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for details.
*/
long pin_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
gup_flags |= FOLL_PIN;
return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
/*
* pin_user_pages_unlocked() is the FOLL_PIN variant of
* get_user_pages_unlocked(). Behavior is the same, except that this one sets
* FOLL_PIN and rejects FOLL_GET.
*/
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
{
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
gup_flags |= FOLL_PIN;
return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
/*
* pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
* Behavior is the same, except that this one sets FOLL_PIN and rejects
* FOLL_GET.
*/
long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
int *locked)
{
/*
* FIXME: Current FOLL_LONGTERM behavior is incompatible with
* FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
* vmas. As there are no users of this flag in this call we simply
* disallow this option for now.
*/
if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
return -EINVAL;
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
gup_flags |= FOLL_PIN;
return __get_user_pages_locked(current->mm, start, nr_pages,
pages, NULL, locked,
gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(pin_user_pages_locked);
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2011, 2012 STRATO. All rights reserved.
*/
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"
/*
* This is only the first step towards a full-features scrub. It reads all
* extent and super block and verifies the checksums. In case a bad checksum
* is found or the extent cannot be read, good data will be written back if
* any can be found.
*
* Future enhancements:
* - In case an unrepairable extent is encountered, track which files are
* affected and report them
* - track and record media errors, throw out bad devices
* - add a mode to also read unallocated space
*/
struct scrub_block;
struct scrub_ctx;
/*
* the following three values only influence the performance.
* The last one configures the number of parallel and outstanding I/O
* operations. The first two values configure an upper limit for the number
* of (dynamically allocated) pages that are added to a bio.
*/
#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
/*
* the following value times PAGE_SIZE needs to be large enough to match the
* largest node/leaf/sector size that shall be supported.
* Values larger than BTRFS_STRIPE_LEN are not supported.
*/
#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
struct scrub_recover {
refcount_t refs;
struct btrfs_bio *bbio;
u64 map_length;
};
struct scrub_page {
struct scrub_block *sblock;
struct page *page;
struct btrfs_device *dev;
struct list_head list;
u64 flags; /* extent flags */
u64 generation;
u64 logical;
u64 physical;
u64 physical_for_dev_replace;
atomic_t refs;
u8 mirror_num;
unsigned int have_csum:1;
unsigned int io_error:1;
u8 csum[BTRFS_CSUM_SIZE];
struct scrub_recover *recover;
};
struct scrub_bio {
int index;
struct scrub_ctx *sctx;
struct btrfs_device *dev;
struct bio *bio;
blk_status_t status;
u64 logical;
u64 physical;
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
#else
struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
int page_count;
int next_free;
struct btrfs_work work;
};
struct scrub_block {
struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
int page_count;
atomic_t outstanding_pages;
refcount_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
struct scrub_parity *sparity;
struct {
unsigned int header_error:1;
unsigned int checksum_error:1;
unsigned int no_io_error_seen:1;
unsigned int generation_error:1; /* also sets header_error */
/* The following is for the data used to check parity */
/* It is for the data with checksum */
unsigned int data_corrected:1;
};
struct btrfs_work work;
};
/* Used for the chunks with parity stripe such RAID5/6 */
struct scrub_parity {
struct scrub_ctx *sctx;
struct btrfs_device *scrub_dev;
u64 logic_start;
u64 logic_end;
int nsectors;
u32 stripe_len;
refcount_t refs;
struct list_head spages;
/* Work of parity check and repair */
struct btrfs_work work;
/* Mark the parity blocks which have data */
unsigned long *dbitmap;
/*
* Mark the parity blocks which have data, but errors happen when
* read data or check data
*/
unsigned long *ebitmap;
unsigned long bitmap[];
};
struct scrub_ctx {
struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
struct btrfs_fs_info *fs_info;
int first_free;
int curr;
atomic_t bios_in_flight;
atomic_t workers_pending;
spinlock_t list_lock;
wait_queue_head_t list_wait;
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
int pages_per_rd_bio;
/* State of IO submission throttling affecting the associated device */
ktime_t throttle_deadline;
u64 throttle_sent;
int is_dev_replace;
u64 write_pointer;
struct scrub_bio *wr_curr_bio;
struct mutex wr_lock;
int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
struct btrfs_device *wr_tgtdev;
bool flush_all_writes;
/*
* statistics
*/
struct btrfs_scrub_progress stat;
spinlock_t stat_lock;
/*
* Use a ref counter to avoid use-after-free issues. Scrub workers
* decrement bios_in_flight and workers_pending and then do a wakeup
* on the list_wait wait queue. We must ensure the main scrub task
* doesn't free the scrub context before or while the workers are
* doing the wakeup() call.
*/
refcount_t refs;
};
struct scrub_warning {
struct btrfs_path *path;
u64 extent_item_size;
const char *errstr;
u64 physical;
u64 logical;
struct btrfs_device *dev;
};
struct full_stripe_lock {
struct rb_node node;
u64 logical;
u64 refs;
struct mutex mutex;
};
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum,
u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
u64 extent_logical, u32 extent_len,
u64 *extent_physical,
struct btrfs_device **extent_dev,
int *extent_mirror_num);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static void scrub_put_ctx(struct scrub_ctx *sctx);
static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
{
return spage->recover &&
(spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
refcount_inc(&sctx->refs);
atomic_inc(&sctx->bios_in_flight);
}
static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
atomic_dec(&sctx->bios_in_flight);
wake_up(&sctx->list_wait);
scrub_put_ctx(sctx);
}
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
while (atomic_read(&fs_info->scrub_pause_req)) {
mutex_unlock(&fs_info->scrub_lock);
wait_event(fs_info->scrub_pause_wait,
atomic_read(&fs_info->scrub_pause_req) == 0);
mutex_lock(&fs_info->scrub_lock);
}
}
static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
atomic_inc(&fs_info->scrubs_paused);
wake_up(&fs_info->scrub_pause_wait);
}
static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
mutex_lock(&fs_info->scrub_lock);
__scrub_blocked_if_needed(fs_info);
atomic_dec(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
wake_up(&fs_info->scrub_pause_wait);
}
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
scrub_pause_on(fs_info);
scrub_pause_off(fs_info);
}
/*
* Insert new full stripe lock into full stripe locks tree
*
* Return pointer to existing or newly inserted full_stripe_lock structure if
* everything works well.
* Return ERR_PTR(-ENOMEM) if we failed to allocate memory
*
* NOTE: caller must hold full_stripe_locks_root->lock before calling this
* function
*/
static struct full_stripe_lock *insert_full_stripe_lock(
struct btrfs_full_stripe_locks_tree *locks_root,
u64 fstripe_logical)
{
struct rb_node **p;
struct rb_node *parent = NULL;
struct full_stripe_lock *entry;
struct full_stripe_lock *ret;
lockdep_assert_held(&locks_root->lock);
p = &locks_root->root.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct full_stripe_lock, node);
if (fstripe_logical < entry->logical) {
p = &(*p)->rb_left;
} else if (fstripe_logical > entry->logical) {
p = &(*p)->rb_right;
} else {
entry->refs++;
return entry;
}
}
/*
* Insert new lock.
*/
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return ERR_PTR(-ENOMEM);
ret->logical = fstripe_logical;
ret->refs = 1;
mutex_init(&ret->mutex);
rb_link_node(&ret->node, parent, p);
rb_insert_color(&ret->node, &locks_root->root);
return ret;
}
/*
* Search for a full stripe lock of a block group
*
* Return pointer to existing full stripe lock if found
* Return NULL if not found
*/
static struct full_stripe_lock *search_full_stripe_lock(
struct btrfs_full_stripe_locks_tree *locks_root,
u64 fstripe_logical)
{
struct rb_node *node;
struct full_stripe_lock *entry;
lockdep_assert_held(&locks_root->lock);
node = locks_root->root.rb_node;
while (node) {
entry = rb_entry(node, struct full_stripe_lock, node);
if (fstripe_logical < entry->logical)
node = node->rb_left;
else if (fstripe_logical > entry->logical)
node = node->rb_right;
else
return entry;
}
return NULL;
}
/*
* Helper to get full stripe logical from a normal bytenr.
*
* Caller must ensure @cache is a RAID56 block group.
*/
static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
{
u64 ret;
/*
* Due to chunk item size limit, full stripe length should not be
* larger than U32_MAX. Just a sanity check here.
*/
WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
/*
* round_down() can only handle power of 2, while RAID56 full
* stripe length can be 64KiB * n, so we need to manually round down.
*/
ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
cache->full_stripe_len + cache->start;
return ret;
}
/*
* Lock a full stripe to avoid concurrency of recovery and read
*
* It's only used for profiles with parities (RAID5/6), for other profiles it
* does nothing.
*
* Return 0 if we locked full stripe covering @bytenr, with a mutex held.
* So caller must call unlock_full_stripe() at the same context.
*
* Return <0 if encounters error.
*/
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
bool *locked_ret)
{
struct btrfs_block_group *bg_cache;
struct btrfs_full_stripe_locks_tree *locks_root;
struct full_stripe_lock *existing;
u64 fstripe_start;
int ret = 0;
*locked_ret = false;
bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg_cache) {
ASSERT(0);
return -ENOENT;
}
/* Profiles not based on parity don't need full stripe lock */
if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
goto out;
locks_root = &bg_cache->full_stripe_locks_root;
fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
/* Now insert the full stripe lock */
mutex_lock(&locks_root->lock);
existing = insert_full_stripe_lock(locks_root, fstripe_start);
mutex_unlock(&locks_root->lock);
if (IS_ERR(existing)) {
ret = PTR_ERR(existing);
goto out;
}
mutex_lock(&existing->mutex);
*locked_ret = true;
out:
btrfs_put_block_group(bg_cache);
return ret;
}
/*
* Unlock a full stripe.
*
* NOTE: Caller must ensure it's the same context calling corresponding
* lock_full_stripe().
*
* Return 0 if we unlock full stripe without problem.
* Return <0 for error
*/
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
bool locked)
{
struct btrfs_block_group *bg_cache;
struct btrfs_full_stripe_locks_tree *locks_root;
struct full_stripe_lock *fstripe_lock;
u64 fstripe_start;
bool freeit = false;
int ret = 0;
/* If we didn't acquire full stripe lock, no need to continue */
if (!locked)
return 0;
bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg_cache) {
ASSERT(0);
return -ENOENT;
}
if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
goto out;
locks_root = &bg_cache->full_stripe_locks_root;
fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
mutex_lock(&locks_root->lock);
fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
/* Unpaired unlock_full_stripe() detected */
if (!fstripe_lock) {
WARN_ON(1);
ret = -ENOENT;
mutex_unlock(&locks_root->lock);
goto out;
}
if (fstripe_lock->refs == 0) {
WARN_ON(1);
btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
fstripe_lock->logical);
} else {
fstripe_lock->refs--;
}
if (fstripe_lock->refs == 0) {
rb_erase(&fstripe_lock->node, &locks_root->root);
freeit = true;
}
mutex_unlock(&locks_root->lock);
mutex_unlock(&fstripe_lock->mutex);
if (freeit)
kfree(fstripe_lock);
out:
btrfs_put_block_group(bg_cache);
return ret;
}
static void scrub_free_csums(struct scrub_ctx *sctx)
{
while (!list_empty(&sctx->csum_list)) {
struct btrfs_ordered_sum *sum;
sum = list_first_entry(&sctx->csum_list,
struct btrfs_ordered_sum, list);
list_del(&sum->list);
kfree(sum);
}
}
static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
int i;
if (!sctx)
return;
/* this can happen when scrub is cancelled */
if (sctx->curr != -1) {
struct scrub_bio *sbio = sctx->bios[sctx->curr];
for (i = 0; i < sbio->page_count; i++) {
WARN_ON(!sbio->pagev[i]->page);
scrub_block_put(sbio->pagev[i]->sblock);
}
bio_put(sbio->bio);
}
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
struct scrub_bio *sbio = sctx->bios[i];
if (!sbio)
break;
kfree(sbio);
}
kfree(sctx->wr_curr_bio);
scrub_free_csums(sctx);
kfree(sctx);
}
static void scrub_put_ctx(struct scrub_ctx *sctx)
{
if (refcount_dec_and_test(&sctx->refs))
scrub_free_ctx(sctx);
}
static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
struct btrfs_fs_info *fs_info, int is_dev_replace)
{
struct scrub_ctx *sctx;
int i;
sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
if (!sctx)
goto nomem;
refcount_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
sctx->curr = -1;
sctx->fs_info = fs_info;
INIT_LIST_HEAD(&sctx->csum_list);
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
struct scrub_bio *sbio;
sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
if (!sbio)
goto nomem;
sctx->bios[i] = sbio;
sbio->index = i;
sbio->sctx = sctx;
sbio->page_count = 0;
btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
NULL);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
else
sctx->bios[i]->next_free = -1;
}
sctx->first_free = 0;
atomic_set(&sctx->bios_in_flight, 0);
atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
init_waitqueue_head(&sctx->list_wait);
sctx->throttle_deadline = 0;
WARN_ON(sctx->wr_curr_bio != NULL);
mutex_init(&sctx->wr_lock);
sctx->wr_curr_bio = NULL;
if (is_dev_replace) {
WARN_ON(!fs_info->dev_replace.tgtdev);
sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
sctx->flush_all_writes = false;
}
return sctx;
nomem:
scrub_free_ctx(sctx);
return ERR_PTR(-ENOMEM);
}
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
void *warn_ctx)
{
u32 nlink;
int ret;
int i;
unsigned nofs_flag;
struct extent_buffer *eb;
struct btrfs_inode_item *inode_item;
struct scrub_warning *swarn = warn_ctx;
struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
struct inode_fs_paths *ipath = NULL;
struct btrfs_root *local_root;
struct btrfs_key key;
local_root = btrfs_get_fs_root(fs_info, root, true);
if (IS_ERR(local_root)) {
ret = PTR_ERR(local_root);
goto err;
}
/*
* this makes the path point to (inum INODE_ITEM ioff)
*/
key.objectid = inum;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
if (ret) {
btrfs_put_root(local_root);
btrfs_release_path(swarn->path);
goto err;
}
eb = swarn->path->nodes[0];
inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
struct btrfs_inode_item);
nlink = btrfs_inode_nlink(eb, inode_item);
btrfs_release_path(swarn->path);
/*
* init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
* uses GFP_NOFS in this context, so we keep it consistent but it does
* not seem to be strictly necessary.
*/
nofs_flag = memalloc_nofs_save();
ipath = init_ipath(4096, local_root, swarn->path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(ipath)) {
btrfs_put_root(local_root);
ret = PTR_ERR(ipath);
ipath = NULL;
goto err;
}
ret = paths_from_inode(inum, ipath);
if (ret < 0)
goto err;
/*
* we deliberately ignore the bit ipath might have been too small to
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
swarn->errstr, swarn->logical,
rcu_str_deref(swarn->dev->name),
swarn->physical,
root, inum, offset,
fs_info->sectorsize, nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
btrfs_put_root(local_root);
free_ipath(ipath);
return 0;
err:
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
swarn->errstr, swarn->logical,
rcu_str_deref(swarn->dev->name),
swarn->physical,
root, inum, offset, ret);
free_ipath(ipath);
return 0;
}
static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
struct btrfs_device *dev;
struct btrfs_fs_info *fs_info;
struct btrfs_path *path;
struct btrfs_key found_key;
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct scrub_warning swarn;
unsigned long ptr = 0;
u64 extent_item_pos;
u64 flags = 0;
u64 ref_root;
u32 item_size;
u8 ref_level = 0;
int ret;
WARN_ON(sblock->page_count < 1);
dev = sblock->pagev[0]->dev;
fs_info = sblock->sctx->fs_info;
path = btrfs_alloc_path();
if (!path)
return;
swarn.physical = sblock->pagev[0]->physical;
swarn.logical = sblock->pagev[0]->logical;
swarn.errstr = errstr;
swarn.dev = NULL;
ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
&flags);
if (ret < 0)
goto out;
extent_item_pos = swarn.logical - found_key.objectid;
swarn.extent_item_size = found_key.offset;
eb = path->nodes[0];
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
item_size = btrfs_item_size_nr(eb, path->slots[0]);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
do {
ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
item_size, &ref_root,
&ref_level);
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
errstr, swarn.logical,
rcu_str_deref(dev->name),
swarn.physical,
ref_level ? "node" : "leaf",
ret < 0 ? -1 : ref_level,
ret < 0 ? -1 : ref_root);
} while (ret != 1);
btrfs_release_path(path);
} else {
btrfs_release_path(path);
swarn.path = path;
swarn.dev = dev;
iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, 1,
scrub_print_warning_inode, &swarn, false);
}
out:
btrfs_free_path(path);
}
static inline void scrub_get_recover(struct scrub_recover *recover)
{
refcount_inc(&recover->refs);
}
static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
struct scrub_recover *recover)
{
if (refcount_dec_and_test(&recover->refs)) {
btrfs_bio_counter_dec(fs_info);
btrfs_put_bbio(recover->bbio);
kfree(recover);
}
}
/*
* scrub_handle_errored_block gets called when either verification of the
* pages failed or the bio failed to read, e.g. with EIO. In the latter
* case, this function handles all pages in the bio, even though only one
* may be bad.
* The goal of this function is to repair the errored block by using the
* contents of one of the mirrors.
*/
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
struct scrub_ctx *sctx = sblock_to_check->sctx;
struct btrfs_device *dev;
struct btrfs_fs_info *fs_info;
u64 logical;
unsigned int failed_mirror_index;
unsigned int is_metadata;
unsigned int have_csum;
struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
struct scrub_block *sblock_bad;
int ret;
int mirror_index;
int page_num;
int success;
bool full_stripe_locked;
unsigned int nofs_flag;
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
BUG_ON(sblock_to_check->page_count < 1);
fs_info = sctx->fs_info;
if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
/*
* if we find an error in a super block, we just report it.
* They will get written with the next transaction commit
* anyway
*/
spin_lock(&sctx->stat_lock);
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
return 0;
}
logical = sblock_to_check->pagev[0]->logical;
BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
is_metadata = !(sblock_to_check->pagev[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
have_csum = sblock_to_check->pagev[0]->have_csum;
dev = sblock_to_check->pagev[0]->dev;
if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace)
return btrfs_repair_one_zone(fs_info, logical);
/*
* We must use GFP_NOFS because the scrub task might be waiting for a
* worker task executing this function and in turn a transaction commit
* might be waiting the scrub task to pause (which needs to wait for all
* the worker tasks to complete before pausing).
* We do allocations in the workers through insert_full_stripe_lock()
* and scrub_add_page_to_wr_bio(), which happens down the call chain of
* this function.
*/
nofs_flag = memalloc_nofs_save();
/*
* For RAID5/6, race can happen for a different device scrub thread.
* For data corruption, Parity and Data threads will both try
* to recovery the data.
* Race can lead to doubly added csum error, or even unrecoverable
* error.
*/
ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
if (ret < 0) {
memalloc_nofs_restore(nofs_flag);
spin_lock(&sctx->stat_lock);
if (ret == -ENOMEM)
sctx->stat.malloc_errors++;
sctx->stat.read_errors++;
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
return ret;
}
/*
* read all mirrors one after the other. This includes to
* re-read the extent or metadata block that failed (that was
* the cause that this fixup code is called) another time,
* sector by sector this time in order to know which sectors
* caused I/O errors and which ones are good (for all mirrors).
* It is the goal to handle the situation when more than one
* mirror contains I/O errors, but the errors do not
* overlap, i.e. the data can be repaired by selecting the
* sectors from those mirrors without I/O error on the
* particular sectors. One example (with blocks >= 2 * sectorsize)
* would be that mirror #1 has an I/O error on the first sector,
* the second sector is good, and mirror #2 has an I/O error on
* the second sector, but the first sector is good.
* Then the first sector of the first mirror can be repaired by
* taking the first sector of the second mirror, and the
* second sector of the second mirror can be repaired by
* copying the contents of the 2nd sector of the 1st mirror.
* One more note: if the sectors of one mirror contain I/O
* errors, the checksum cannot be verified. In order to get
* the best data for repairing, the first attempt is to find
* a mirror without I/O errors and with a validated checksum.
* Only if this is not possible, the sectors are picked from
* mirrors with I/O errors without considering the checksum.
* If the latter is the case, at the end, the checksum of the
* repaired area is verified in order to correctly maintain
* the statistics.
*/
sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
sizeof(*sblocks_for_recheck), GFP_KERNEL);
if (!sblocks_for_recheck) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
sctx->stat.read_errors++;
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
/* setup the context, map the logical blocks and alloc the pages */
ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
if (ret) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
sblock_bad = sblocks_for_recheck + failed_mirror_index;
/* build and submit the bios for the failed mirror, check checksums */
scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
/*
* the error disappeared after reading page by page, or
* the area was part of a huge bio and other parts of the
* bio caused I/O errors, or the block layer merged several
* read requests into one and the error is caused by a
* different bio (usually one of the two latter cases is
* the cause)
*/
spin_lock(&sctx->stat_lock);
sctx->stat.unverified_errors++;
sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
if (sctx->is_dev_replace)
scrub_write_block_to_dev_replace(sblock_bad);
goto out;
}
if (!sblock_bad->no_io_error_seen) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
if (__ratelimit(&rs))
scrub_print_warning("i/o error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.csum_errors++;
spin_unlock(&sctx->stat_lock);
if (__ratelimit(&rs))
scrub_print_warning("checksum error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
} else if (sblock_bad->header_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.verify_errors++;
spin_unlock(&sctx->stat_lock);
if (__ratelimit(&rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
if (sblock_bad->generation_error)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
else
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
}
if (sctx->readonly) {
ASSERT(!sctx->is_dev_replace);
goto out;
}
/*
* now build and submit the bios for the other mirrors, check
* checksums.
* First try to pick the mirror which is completely without I/O
* errors and also does not have a checksum error.
* If one is found, and if a checksum is present, the full block
* that is known to contain an error is rewritten. Afterwards
* the block is known to be corrected.
* If a mirror is found which is completely correct, and no
* checksum is present, only those pages are rewritten that had
* an I/O error in the block to be repaired, since it cannot be
* determined, which copy of the other pages is better (and it
* could happen otherwise that a correct page would be
* overwritten by a bad one).
*/
for (mirror_index = 0; ;mirror_index++) {
struct scrub_block *sblock_other;
if (mirror_index == failed_mirror_index)
continue;
/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
if (mirror_index >= BTRFS_MAX_MIRRORS)
break;
if (!sblocks_for_recheck[mirror_index].page_count)
break;
sblock_other = sblocks_for_recheck + mirror_index;
} else {
struct scrub_recover *r = sblock_bad->pagev[0]->recover;
int max_allowed = r->bbio->num_stripes -
r->bbio->num_tgtdevs;
if (mirror_index >= max_allowed)
break;
if (!sblocks_for_recheck[1].page_count)
break;
ASSERT(failed_mirror_index == 0);
sblock_other = sblocks_for_recheck + 1;
sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
}
/* build and submit the bios, check checksums */
scrub_recheck_block(fs_info, sblock_other, 0);
if (!sblock_other->header_error &&
!sblock_other->checksum_error &&
sblock_other->no_io_error_seen) {
if (sctx->is_dev_replace) {
scrub_write_block_to_dev_replace(sblock_other);
goto corrected_error;
} else {
ret = scrub_repair_block_from_good_copy(
sblock_bad, sblock_other);
if (!ret)
goto corrected_error;
}
}
}
if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
goto did_not_correct_error;
/*
* In case of I/O errors in the area that is supposed to be
* repaired, continue by picking good copies of those sectors.
* Select the good sectors from mirrors to rewrite bad sectors from
* the area to fix. Afterwards verify the checksum of the block
* that is supposed to be repaired. This verification step is
* only done for the purpose of statistic counting and for the
* final scrub report, whether errors remain.
* A perfect algorithm could make use of the checksum and try
* all possible combinations of sectors from the different mirrors
* until the checksum verification succeeds. For example, when
* the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
* of mirror #2 is readable but the final checksum test fails,
* then the 2nd sector of mirror #3 could be tried, whether now
* the final checksum succeeds. But this would be a rare
* exception and is therefore not implemented. At least it is
* avoided that the good copy is overwritten.
* A more useful improvement would be to pick the sectors
* without I/O error based on sector sizes (512 bytes on legacy
* disks) instead of on sectorsize. Then maybe 512 byte of one
* mirror could be repaired by taking 512 byte of a different
* mirror, even if other 512 byte sectors in the same sectorsize
* area are unreadable.
*/
success = 1;
for (page_num = 0; page_num < sblock_bad->page_count;
page_num++) {
struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
struct scrub_block *sblock_other = NULL;
/* skip no-io-error page in scrub */
if (!spage_bad->io_error && !sctx->is_dev_replace)
continue;
if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
/*
* In case of dev replace, if raid56 rebuild process
* didn't work out correct data, then copy the content
* in sblock_bad to make sure target device is identical
* to source device, instead of writing garbage data in
* sblock_for_recheck array to target device.
*/
sblock_other = NULL;
} else if (spage_bad->io_error) {
/* try to find no-io-error page in mirrors */
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
sblocks_for_recheck[mirror_index].page_count > 0;
mirror_index++) {
if (!sblocks_for_recheck[mirror_index].
pagev[page_num]->io_error) {
sblock_other = sblocks_for_recheck +
mirror_index;
break;
}
}
if (!sblock_other)
success = 0;
}
if (sctx->is_dev_replace) {
/*
* did not find a mirror to fetch the page
* from. scrub_write_page_to_dev_replace()
* handles this case (page->io_error), by
* filling the block with zeros before
* submitting the write request
*/
if (!sblock_other)
sblock_other = sblock_bad;
if (scrub_write_page_to_dev_replace(sblock_other,
page_num) != 0) {
atomic64_inc(
&fs_info->dev_replace.num_write_errors);
success = 0;
}
} else if (sblock_other) {
ret = scrub_repair_page_from_good_copy(sblock_bad,
sblock_other,
page_num, 0);
if (0 == ret)
spage_bad->io_error = 0;
else
success = 0;
}
}
if (success && !sctx->is_dev_replace) {
if (is_metadata || have_csum) {
/*
* need to verify the checksum now that all
* sectors on disk are repaired (the write
* request for data to be repaired is on its way).
* Just be lazy and use scrub_recheck_block()
* which re-reads the data before the checksum
* is verified, but most likely the data comes out
* of the page cache.
*/
scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error &&
!sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen)
goto corrected_error;
else
goto did_not_correct_error;
} else {
corrected_error:
spin_lock(&sctx->stat_lock);
sctx->stat.corrected_errors++;
sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"fixed up error at logical %llu on dev %s",
logical, rcu_str_deref(dev->name));
}
} else {
did_not_correct_error:
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"unable to fixup (regular) error at logical %llu on dev %s",
logical, rcu_str_deref(dev->name));
}
out:
if (sblocks_for_recheck) {
for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
mirror_index++) {
struct scrub_block *sblock = sblocks_for_recheck +
mirror_index;
struct scrub_recover *recover;
int page_index;
for (page_index = 0; page_index < sblock->page_count;
page_index++) {
sblock->pagev[page_index]->sblock = NULL;
recover = sblock->pagev[page_index]->recover;
if (recover) {
scrub_put_recover(fs_info, recover);
sblock->pagev[page_index]->recover =
NULL;
}
scrub_page_put(sblock->pagev[page_index]);
}
}
kfree(sblocks_for_recheck);
}
ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
memalloc_nofs_restore(nofs_flag);
if (ret < 0)
return ret;
return 0;
}
static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
{
if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
return 2;
else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
return 3;
else
return (int)bbio->num_stripes;
}
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
u64 *raid_map,
u64 mapped_length,
int nstripes, int mirror,
int *stripe_index,
u64 *stripe_offset)
{
int i;
if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
/* RAID5/6 */
for (i = 0; i < nstripes; i++) {
if (raid_map[i] == RAID6_Q_STRIPE ||
raid_map[i] == RAID5_P_STRIPE)
continue;
if (logical >= raid_map[i] &&
logical < raid_map[i] + mapped_length)
break;
}
*stripe_index = i;
*stripe_offset = logical - raid_map[i];
} else {
/* The other RAID type */
*stripe_index = mirror;
*stripe_offset = 0;
}
}
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck)
{
struct scrub_ctx *sctx = original_sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 length = original_sblock->page_count * fs_info->sectorsize;
u64 logical = original_sblock->pagev[0]->logical;
u64 generation = original_sblock->pagev[0]->generation;
u64 flags = original_sblock->pagev[0]->flags;
u64 have_csum = original_sblock->pagev[0]->have_csum;
struct scrub_recover *recover;
struct btrfs_bio *bbio;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
int stripe_index;
int page_index = 0;
int mirror_index;
int nmirrors;
int ret;
/*
* note: the two members refs and outstanding_pages
* are not used (and not set) in the blocks that are used for
* the recheck procedure
*/
while (length > 0) {
sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
bbio = NULL;
/*
* With a length of sectorsize, each returned stripe represents
* one mirror
*/
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
logical, &mapped_length, &bbio);
if (ret || !bbio || mapped_length < sublen) {
btrfs_put_bbio(bbio);
btrfs_bio_counter_dec(fs_info);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
btrfs_put_bbio(bbio);
btrfs_bio_counter_dec(fs_info);
return -ENOMEM;
}
refcount_set(&recover->refs, 1);
recover->bbio = bbio;
recover->map_length = mapped_length;
BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
struct scrub_page *spage;
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
spage = kzalloc(sizeof(*spage), GFP_NOFS);
if (!spage) {
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
scrub_page_get(spage);
sblock->pagev[page_index] = spage;
spage->sblock = sblock;
spage->flags = flags;
spage->generation = generation;
spage->logical = logical;
spage->have_csum = have_csum;
if (have_csum)
memcpy(spage->csum,
original_sblock->pagev[0]->csum,
sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
bbio->map_type,
bbio->raid_map,
mapped_length,
bbio->num_stripes -
bbio->num_tgtdevs,
mirror_index,
&stripe_index,
&stripe_offset);
spage->physical = bbio->stripes[stripe_index].physical +
stripe_offset;
spage->dev = bbio->stripes[stripe_index].dev;
BUG_ON(page_index >= original_sblock->page_count);
spage->physical_for_dev_replace =
original_sblock->pagev[page_index]->
physical_for_dev_replace;
/* for missing devices, dev->bdev is NULL */
spage->mirror_num = mirror_index + 1;
sblock->page_count++;
spage->page = alloc_page(GFP_NOFS);
if (!spage->page)
goto leave_nomem;
scrub_get_recover(recover);
spage->recover = recover;
}
scrub_put_recover(fs_info, recover);
length -= sublen;
logical += sublen;
page_index++;
}
return 0;
}
static void scrub_bio_wait_endio(struct bio *bio)
{
complete(bio->bi_private);
}
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
struct scrub_page *spage)
{
DECLARE_COMPLETION_ONSTACK(done);
int ret;
int mirror_num;
bio->bi_iter.bi_sector = spage->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
mirror_num = spage->sblock->pagev[0]->mirror_num;
ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
spage->recover->map_length,
mirror_num, 0);
if (ret)
return ret;
wait_for_completion_io(&done);
return blk_status_to_errno(bio->bi_status);
}
static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock)
{
struct scrub_page *first_page = sblock->pagev[0];
struct bio *bio;
int page_num;
/* All pages in sblock belong to the same stripe on the same device. */
ASSERT(first_page->dev);
if (!first_page->dev->bdev)
goto out;
bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
bio_set_dev(bio, first_page->dev->bdev);
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct scrub_page *spage = sblock->pagev[page_num];
WARN_ON(!spage->page);
bio_add_page(bio, spage->page, PAGE_SIZE, 0);
}
if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
bio_put(bio);
goto out;
}
bio_put(bio);
scrub_recheck_block_checksum(sblock);
return;
out:
for (page_num = 0; page_num < sblock->page_count; page_num++)
sblock->pagev[page_num]->io_error = 1;
sblock->no_io_error_seen = 0;
}
/*
* this function will check the on disk data for checksum errors, header
* errors and read I/O errors. If any I/O errors happen, the exact pages
* which are errored are marked as being bad. The goal is to enable scrub
* to take those pages that are not errored from all the mirrors so that
* the pages that are errored in the just handled mirror can be repaired.
*/
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int retry_failed_mirror)
{
int page_num;
sblock->no_io_error_seen = 1;
/* short cut for raid56 */
if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
return scrub_recheck_block_on_raid56(fs_info, sblock);
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
struct scrub_page *spage = sblock->pagev[page_num];
if (spage->dev->bdev == NULL) {
spage->io_error = 1;
sblock->no_io_error_seen = 0;
continue;
}
WARN_ON(!spage->page);
bio = btrfs_io_bio_alloc(1);
bio_set_dev(bio, spage->dev->bdev);
bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
bio->bi_iter.bi_sector = spage->physical >> 9;
bio->bi_opf = REQ_OP_READ;
if (btrfsic_submit_bio_wait(bio)) {
spage->io_error = 1;
sblock->no_io_error_seen = 0;
}
bio_put(bio);
}
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
}
static inline int scrub_check_fsid(u8 fsid[],
struct scrub_page *spage)
{
struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
int ret;
ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
return !ret;
}
static void scrub_recheck_block_checksum(struct scrub_block *sblock)
{
sblock->header_error = 0;
sblock->checksum_error = 0;
sblock->generation_error = 0;
if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
scrub_checksum_data(sblock);
else
scrub_checksum_tree_block(sblock);
}
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good)
{
int page_num;
int ret = 0;
for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
int ret_sub;
ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
sblock_good,
page_num, 1);
if (ret_sub)
ret = ret_sub;
}
return ret;
}
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int page_num, int force_write)
{
struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
struct scrub_page *spage_good = sblock_good->pagev[page_num];
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
const u32 sectorsize = fs_info->sectorsize;
BUG_ON(spage_bad->page == NULL);
BUG_ON(spage_good->page == NULL);
if (force_write || sblock_bad->header_error ||
sblock_bad->checksum_error || spage_bad->io_error) {
struct bio *bio;
int ret;
if (!spage_bad->dev->bdev) {
btrfs_warn_rl(fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
return -EIO;
}
bio = btrfs_io_bio_alloc(1);
bio_set_dev(bio, spage_bad->dev->bdev);
bio->bi_iter.bi_sector = spage_bad->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
if (ret != sectorsize) {
bio_put(bio);
return -EIO;
}
if (btrfsic_submit_bio_wait(bio)) {
btrfs_dev_stat_inc_and_print(spage_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
atomic64_inc(&fs_info->dev_replace.num_write_errors);
bio_put(bio);
return -EIO;
}
bio_put(bio);
}
return 0;
}
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
int page_num;
/*
* This block is used for the check of the parity on the source device,
* so the data needn't be written into the destination device.
*/
if (sblock->sparity)
return;
for (page_num = 0; page_num < sblock->page_count; page_num++) {
int ret;
ret = scrub_write_page_to_dev_replace(sblock, page_num);
if (ret)
atomic64_inc(&fs_info->dev_replace.num_write_errors);
}
}
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
int page_num)
{
struct scrub_page *spage = sblock->pagev[page_num];
BUG_ON(spage->page == NULL);
if (spage->io_error)
clear_page(page_address(spage->page));
return scrub_add_page_to_wr_bio(sblock->sctx, spage);
}
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
int ret = 0;
u64 length;
if (!btrfs_is_zoned(sctx->fs_info))
return 0;
if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
return 0;
if (sctx->write_pointer < physical) {
length = physical - sctx->write_pointer;
ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
sctx->write_pointer, length);
if (!ret)
sctx->write_pointer = physical;
}
return ret;
}
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
struct scrub_bio *sbio;
int ret;
const u32 sectorsize = sctx->fs_info->sectorsize;
mutex_lock(&sctx->wr_lock);
again:
if (!sctx->wr_curr_bio) {
sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
GFP_KERNEL);
if (!sctx->wr_curr_bio) {
mutex_unlock(&sctx->wr_lock);
return -ENOMEM;
}
sctx->wr_curr_bio->sctx = sctx;
sctx->wr_curr_bio->page_count = 0;
}
sbio = sctx->wr_curr_bio;
if (sbio->page_count == 0) {
struct bio *bio;
ret = fill_writer_pointer_gap(sctx,
spage->physical_for_dev_replace);
if (ret) {
mutex_unlock(&sctx->wr_lock);
return ret;
}
sbio->physical = spage->physical_for_dev_replace;
sbio->logical = spage->logical;
sbio->dev = sctx->wr_tgtdev;
bio = sbio->bio;
if (!bio) {
bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
sbio->bio = bio;
}
bio->bi_private = sbio;
bio->bi_end_io = scrub_wr_bio_end_io;
bio_set_dev(bio, sbio->dev->bdev);
bio->bi_iter.bi_sector = sbio->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
sbio->status = 0;
} else if (sbio->physical + sbio->page_count * sectorsize !=
spage->physical_for_dev_replace ||
sbio->logical + sbio->page_count * sectorsize !=
spage->logical) {
scrub_wr_submit(sctx);
goto again;
}
ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
if (ret != sectorsize) {
if (sbio->page_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
mutex_unlock(&sctx->wr_lock);
return -EIO;
}
scrub_wr_submit(sctx);
goto again;
}
sbio->pagev[sbio->page_count] = spage;
scrub_page_get(spage);
sbio->page_count++;
if (sbio->page_count == sctx->pages_per_wr_bio)
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
return 0;
}
static void scrub_wr_submit(struct scrub_ctx *sctx)
{
struct scrub_bio *sbio;
if (!sctx->wr_curr_bio)
return;
sbio = sctx->wr_curr_bio;
sctx->wr_curr_bio = NULL;
WARN_ON(!sbio->bio->bi_bdev);
scrub_pending_bio_inc(sctx);
/* process all writes in a single worker thread. Then the block layer
* orders the requests before sending them to the driver which
* doubled the write performance on spinning disks when measured
* with Linux 3.5 */
btrfsic_submit_bio(sbio->bio);
if (btrfs_is_zoned(sctx->fs_info))
sctx->write_pointer = sbio->physical + sbio->page_count *
sctx->fs_info->sectorsize;
}
static void scrub_wr_bio_end_io(struct bio *bio)
{
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
sbio->status = bio->bi_status;
sbio->bio = bio;
btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
struct scrub_ctx *sctx = sbio->sctx;
int i;
WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
if (sbio->status) {
struct btrfs_dev_replace *dev_replace =
&sbio->sctx->fs_info->dev_replace;
for (i = 0; i < sbio->page_count; i++) {
struct scrub_page *spage = sbio->pagev[i];
spage->io_error = 1;
atomic64_inc(&dev_replace->num_write_errors);
}
}
for (i = 0; i < sbio->page_count; i++)
scrub_page_put(sbio->pagev[i]);
bio_put(sbio->bio);
kfree(sbio);
scrub_pending_bio_dec(sctx);
}
static int scrub_checksum(struct scrub_block *sblock)
{
u64 flags;
int ret;
/*
* No need to initialize these stats currently,
* because this function only use return value
* instead of these stats value.
*
* Todo:
* always use stats
*/
sblock->header_error = 0;
sblock->generation_error = 0;
sblock->checksum_error = 0;
WARN_ON(sblock->page_count < 1);
flags = sblock->pagev[0]->flags;
ret = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA)
ret = scrub_checksum_data(sblock);
else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
ret = scrub_checksum_tree_block(sblock);
else if (flags & BTRFS_EXTENT_FLAG_SUPER)
(void)scrub_checksum_super(sblock);
else
WARN_ON(1);
if (ret)
scrub_handle_errored_block(sblock);
return ret;
}
static int scrub_checksum_data(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 csum[BTRFS_CSUM_SIZE];
struct scrub_page *spage;
char *kaddr;
BUG_ON(sblock->page_count < 1);
spage = sblock->pagev[0];
if (!spage->have_csum)
return 0;
kaddr = page_address(spage->page);
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
/*
* In scrub_pages() and scrub_pages_for_parity() we ensure each spage
* only contains one sector of data.
*/
crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
if (memcmp(csum, spage->csum, fs_info->csum_size))
sblock->checksum_error = 1;
return sblock->checksum_error;
}
static int scrub_checksum_tree_block(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_header *h;
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
/*
* This is done in sectorsize steps even for metadata as there's a
* constraint for nodesize to be aligned to sectorsize. This will need
* to change so we don't misuse data and metadata units like that.
*/
const u32 sectorsize = sctx->fs_info->sectorsize;
const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
int i;
struct scrub_page *spage;
char *kaddr;
BUG_ON(sblock->page_count < 1);
/* Each member in pagev is just one block, not a full page */
ASSERT(sblock->page_count == num_sectors);
spage = sblock->pagev[0];
kaddr = page_address(spage->page);
h = (struct btrfs_header *)kaddr;
memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
/*
* we don't use the getter functions here, as we
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
if (spage->logical != btrfs_stack_header_bytenr(h))
sblock->header_error = 1;
if (spage->generation != btrfs_stack_header_generation(h)) {
sblock->header_error = 1;
sblock->generation_error = 1;
}
if (!scrub_check_fsid(h->fsid, spage))
sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
sblock->header_error = 1;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
sectorsize - BTRFS_CSUM_SIZE);
for (i = 1; i < num_sectors; i++) {
kaddr = page_address(sblock->pagev[i]->page);
crypto_shash_update(shash, kaddr, sectorsize);
}
crypto_shash_final(shash, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
sblock->checksum_error = 1;
return sblock->header_error || sblock->checksum_error;
}
static int scrub_checksum_super(struct scrub_block *sblock)
{
struct btrfs_super_block *s;
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
struct scrub_page *spage;
char *kaddr;
int fail_gen = 0;
int fail_cor = 0;
BUG_ON(sblock->page_count < 1);
spage = sblock->pagev[0];
kaddr = page_address(spage->page);
s = (struct btrfs_super_block *)kaddr;
if (spage->logical != btrfs_super_bytenr(s))
++fail_cor;
if (spage->generation != btrfs_super_generation(s))
++fail_gen;
if (!scrub_check_fsid(s->fsid, spage))
++fail_cor;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
++fail_cor;
if (fail_cor + fail_gen) {
/*
* if we find an error in a super block, we just report it.
* They will get written with the next transaction commit
* anyway
*/
spin_lock(&sctx->stat_lock);
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
if (fail_cor)
btrfs_dev_stat_inc_and_print(spage->dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
else
btrfs_dev_stat_inc_and_print(spage->dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
}
return fail_cor + fail_gen;
}
static void scrub_block_get(struct scrub_block *sblock)
{
refcount_inc(&sblock->refs);
}
static void scrub_block_put(struct scrub_block *sblock)
{
if (refcount_dec_and_test(&sblock->refs)) {
int i;
if (sblock->sparity)
scrub_parity_put(sblock->sparity);
for (i = 0; i < sblock->page_count; i++)
scrub_page_put(sblock->pagev[i]);
kfree(sblock);
}
}
static void scrub_page_get(struct scrub_page *spage)
{
atomic_inc(&spage->refs);
}
static void scrub_page_put(struct scrub_page *spage)
{
if (atomic_dec_and_test(&spage->refs)) {
if (spage->page)
__free_page(spage->page);
kfree(spage);
}
}
/*
* Throttling of IO submission, bandwidth-limit based, the timeslice is 1
* second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
*/
static void scrub_throttle(struct scrub_ctx *sctx)
{
const int time_slice = 1000;
struct scrub_bio *sbio;
struct btrfs_device *device;
s64 delta;
ktime_t now;
u32 div;
u64 bwlimit;
sbio = sctx->bios[sctx->curr];
device = sbio->dev;
bwlimit = READ_ONCE(device->scrub_speed_max);
if (bwlimit == 0)
return;
/*
* Slice is divided into intervals when the IO is submitted, adjust by
* bwlimit and maximum of 64 intervals.
*/
div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
div = min_t(u32, 64, div);
/* Start new epoch, set deadline */
now = ktime_get();
if (sctx->throttle_deadline == 0) {
sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
sctx->throttle_sent = 0;
}
/* Still in the time to send? */
if (ktime_before(now, sctx->throttle_deadline)) {
/* If current bio is within the limit, send it */
sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
if (sctx->throttle_sent <= div_u64(bwlimit, div))
return;
/* We're over the limit, sleep until the rest of the slice */
delta = ktime_ms_delta(sctx->throttle_deadline, now);
} else {
/* New request after deadline, start new epoch */
delta = 0;
}
if (delta) {
long timeout;
timeout = div_u64(delta * HZ, 1000);
schedule_timeout_interruptible(timeout);
}
/* Next call will start the deadline period */
sctx->throttle_deadline = 0;
}
static void scrub_submit(struct scrub_ctx *sctx)
{
struct scrub_bio *sbio;
if (sctx->curr == -1)
return;
scrub_throttle(sctx);
sbio = sctx->bios[sctx->curr];
sctx->curr = -1;
scrub_pending_bio_inc(sctx);
btrfsic_submit_bio(sbio->bio);
}
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
struct scrub_block *sblock = spage->sblock;
struct scrub_bio *sbio;
const u32 sectorsize = sctx->fs_info->sectorsize;
int ret;
again:
/*
* grab a fresh bio or wait for one to become available
*/
while (sctx->curr == -1) {
spin_lock(&sctx->list_lock);
sctx->curr = sctx->first_free;
if (sctx->curr != -1) {
sctx->first_free = sctx->bios[sctx->curr]->next_free;
sctx->bios[sctx->curr]->next_free = -1;
sctx->bios[sctx->curr]->page_count = 0;
spin_unlock(&sctx->list_lock);
} else {
spin_unlock(&sctx->list_lock);
wait_event(sctx->list_wait, sctx->first_free != -1);
}
}
sbio = sctx->bios[sctx->curr];
if (sbio->page_count == 0) {
struct bio *bio;
sbio->physical = spage->physical;
sbio->logical = spage->logical;
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
sbio->bio = bio;
}
bio->bi_private = sbio;
bio->bi_end_io = scrub_bio_end_io;
bio_set_dev(bio, sbio->dev->bdev);
bio->bi_iter.bi_sector = sbio->physical >> 9;
bio->bi_opf = REQ_OP_READ;
sbio->status = 0;
} else if (sbio->physical + sbio->page_count * sectorsize !=
spage->physical ||
sbio->logical + sbio->page_count * sectorsize !=
spage->logical ||
sbio->dev != spage->dev) {
scrub_submit(sctx);
goto again;
}
sbio->pagev[sbio->page_count] = spage;
ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
if (ret != sectorsize) {
if (sbio->page_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
return -EIO;
}
scrub_submit(sctx);
goto again;
}
scrub_block_get(sblock); /* one for the page added to the bio */
atomic_inc(&sblock->outstanding_pages);
sbio->page_count++;
if (sbio->page_count == sctx->pages_per_rd_bio)
scrub_submit(sctx);
return 0;
}
static void scrub_missing_raid56_end_io(struct bio *bio)
{
struct scrub_block *sblock = bio->bi_private;
struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
if (bio->bi_status)
sblock->no_io_error_seen = 0;
bio_put(bio);
btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
}
static void scrub_missing_raid56_worker(struct btrfs_work *work)
{
struct scrub_block *sblock = container_of(work, struct scrub_block, work);
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 logical;
struct btrfs_device *dev;
logical = sblock->pagev[0]->logical;
dev = sblock->pagev[0]->dev;
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
if (!sblock->no_io_error_seen) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"IO error rebuilding logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
} else if (sblock->header_error || sblock->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"failed to rebuild valid logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
} else {
scrub_write_block_to_dev_replace(sblock);
}
if (sctx->is_dev_replace && sctx->flush_all_writes) {
mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
}
scrub_block_put(sblock);
scrub_pending_bio_dec(sctx);
}
static void scrub_missing_raid56_pages(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 length = sblock->page_count * PAGE_SIZE;
u64 logical = sblock->pagev[0]->logical;
struct btrfs_bio *bbio = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
int ret;
int i;
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&length, &bbio);
if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
if (WARN_ON(!sctx->is_dev_replace ||
!(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
/*
* We shouldn't be scrubbing a missing device. Even for dev
* replace, we should only get here for RAID 5/6. We either
* managed to mount something with no mirrors remaining or
* there's a bug in scrub_remap_extent()/btrfs_map_block().
*/
goto bbio_out;
}
bio = btrfs_io_bio_alloc(0);
bio->bi_iter.bi_sector = logical >> 9;
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
if (!rbio)
goto rbio_out;
for (i = 0; i < sblock->page_count; i++) {
struct scrub_page *spage = sblock->pagev[i];
raid56_add_scrub_pages(rbio, spage->page, spage->logical);
}
btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
scrub_block_get(sblock);
scrub_pending_bio_inc(sctx);
raid56_submit_missing_rbio(rbio);
return;
rbio_out:
bio_put(bio);
bbio_out:
btrfs_bio_counter_dec(fs_info);
btrfs_put_bbio(bbio);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
}
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum,
u64 physical_for_dev_replace)
{
struct scrub_block *sblock;
const u32 sectorsize = sctx->fs_info->sectorsize;
int index;
sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
return -ENOMEM;
}
/* one ref inside this function, plus one for each page added to
* a bio later on */
refcount_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
for (index = 0; len > 0; index++) {
struct scrub_page *spage;
/*
* Here we will allocate one page for one sector to scrub.
* This is fine if PAGE_SIZE == sectorsize, but will cost
* more memory for PAGE_SIZE > sectorsize case.
*/
u32 l = min(sectorsize, len);
spage = kzalloc(sizeof(*spage), GFP_KERNEL);
if (!spage) {
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_block_put(sblock);
return -ENOMEM;
}
BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
scrub_page_get(spage);
sblock->pagev[index] = spage;
spage->sblock = sblock;
spage->dev = dev;
spage->flags = flags;
spage->generation = gen;
spage->logical = logical;
spage->physical = physical;
spage->physical_for_dev_replace = physical_for_dev_replace;
spage->mirror_num = mirror_num;
if (csum) {
spage->have_csum = 1;
memcpy(spage->csum, csum, sctx->fs_info->csum_size);
} else {
spage->have_csum = 0;
}
sblock->page_count++;
spage->page = alloc_page(GFP_KERNEL);
if (!spage->page)
goto leave_nomem;
len -= l;
logical += l;
physical += l;
physical_for_dev_replace += l;
}
WARN_ON(sblock->page_count == 0);
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
/*
* This case should only be hit for RAID 5/6 device replace. See
* the comment in scrub_missing_raid56_pages() for details.
*/
scrub_missing_raid56_pages(sblock);
} else {
for (index = 0; index < sblock->page_count; index++) {
struct scrub_page *spage = sblock->pagev[index];
int ret;
ret = scrub_add_page_to_rd_bio(sctx, spage);
if (ret) {
scrub_block_put(sblock);
return ret;
}
}
if (flags & BTRFS_EXTENT_FLAG_SUPER)
scrub_submit(sctx);
}
/* last one frees, either here or in bio completion for last page */
scrub_block_put(sblock);
return 0;
}
static void scrub_bio_end_io(struct bio *bio)
{
struct scrub_bio *sbio = bio->bi_private;
struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
sbio->status = bio->bi_status;
sbio->bio = bio;
btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
}
static void scrub_bio_end_io_worker(struct btrfs_work *work)
{
struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
struct scrub_ctx *sctx = sbio->sctx;
int i;
BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
if (sbio->status) {
for (i = 0; i < sbio->page_count; i++) {
struct scrub_page *spage = sbio->pagev[i];
spage->io_error = 1;
spage->sblock->no_io_error_seen = 0;
}
}
/* now complete the scrub_block items that have all pages completed */
for (i = 0; i < sbio->page_count; i++) {
struct scrub_page *spage = sbio->pagev[i];
struct scrub_block *sblock = spage->sblock;
if (atomic_dec_and_test(&sblock->outstanding_pages))
scrub_block_complete(sblock);
scrub_block_put(sblock);
}
bio_put(sbio->bio);
sbio->bio = NULL;
spin_lock(&sctx->list_lock);
sbio->next_free = sctx->first_free;
sctx->first_free = sbio->index;
spin_unlock(&sctx->list_lock);
if (sctx->is_dev_replace && sctx->flush_all_writes) {
mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
}
scrub_pending_bio_dec(sctx);
}
static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
unsigned long *bitmap,
u64 start, u32 len)
{
u64 offset;
u32 nsectors;
u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
if (len >= sparity->stripe_len) {
bitmap_set(bitmap, 0, sparity->nsectors);
return;
}
start -= sparity->logic_start;
start = div64_u64_rem(start, sparity->stripe_len, &offset);
offset = offset >> sectorsize_bits;
nsectors = len >> sectorsize_bits;
if (offset + nsectors <= sparity->nsectors) {
bitmap_set(bitmap, offset, nsectors);
return;
}
bitmap_set(bitmap, offset, sparity->nsectors - offset);
bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
}
static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
u64 start, u32 len)
{
__scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
}
static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
u64 start, u32 len)
{
__scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
}
static void scrub_block_complete(struct scrub_block *sblock)
{
int corrupted = 0;
if (!sblock->no_io_error_seen) {
corrupted = 1;
scrub_handle_errored_block(sblock);
} else {
/*
* if has checksum error, write via repair mechanism in
* dev replace case, otherwise write here in dev replace
* case.
*/
corrupted = scrub_checksum(sblock);
if (!corrupted && sblock->sctx->is_dev_replace)
scrub_write_block_to_dev_replace(sblock);
}
if (sblock->sparity && corrupted && !sblock->data_corrected) {
u64 start = sblock->pagev[0]->logical;
u64 end = sblock->pagev[sblock->page_count - 1]->logical +
sblock->sctx->fs_info->sectorsize;
ASSERT(end - start <= U32_MAX);
scrub_parity_mark_sectors_error(sblock->sparity,
start, end - start);
}
}
static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
{
sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
list_del(&sum->list);
kfree(sum);
}
/*
* Find the desired csum for range [logical, logical + sectorsize), and store
* the csum into @csum.
*
* The search source is sctx->csum_list, which is a pre-populated list
* storing bytenr ordered csum ranges. We're responsible to cleanup any range
* that is before @logical.
*
* Return 0 if there is no csum for the range.
* Return 1 if there is csum for the range and copied to @csum.
*/
static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
{
bool found = false;
while (!list_empty(&sctx->csum_list)) {
struct btrfs_ordered_sum *sum = NULL;
unsigned long index;
unsigned long num_sectors;
sum = list_first_entry(&sctx->csum_list,
struct btrfs_ordered_sum, list);
/* The current csum range is beyond our range, no csum found */
if (sum->bytenr > logical)
break;
/*
* The current sum is before our bytenr, since scrub is always
* done in bytenr order, the csum will never be used anymore,
* clean it up so that later calls won't bother with the range,
* and continue search the next range.
*/
if (sum->bytenr + sum->len <= logical) {
drop_csum_range(sctx, sum);
continue;
}
/* Now the csum range covers our bytenr, copy the csum */
found = true;
index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
sctx->fs_info->csum_size);
/* Cleanup the range if we're at the end of the csum range */
if (index == num_sectors - 1)
drop_csum_range(sctx, sum);
break;
}
if (!found)
return 0;
return 1;
}
/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u64 physical_for_dev_replace)
{
int ret;
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
blocksize = map->stripe_len;
else
blocksize = sctx->fs_info->sectorsize;
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed++;
sctx->stat.data_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
blocksize = map->stripe_len;
else
blocksize = sctx->fs_info->nodesize;
spin_lock(&sctx->stat_lock);
sctx->stat.tree_extents_scrubbed++;
sctx->stat.tree_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else {
blocksize = sctx->fs_info->sectorsize;
WARN_ON(1);
}
while (len) {
u32 l = min(len, blocksize);
int have_csum = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
++sctx->stat.no_csum;
}
ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
mirror_num, have_csum ? csum : NULL,
physical_for_dev_replace);
if (ret)
return ret;
len -= l;
logical += l;
physical += l;
physical_for_dev_replace += l;
}
return 0;
}
static int scrub_pages_for_parity(struct scrub_parity *sparity,
u64 logical, u32 len,
u64 physical, struct btrfs_device *dev,
u64 flags, u64 gen, int mirror_num, u8 *csum)
{
struct scrub_ctx *sctx = sparity->sctx;
struct scrub_block *sblock;
const u32 sectorsize = sctx->fs_info->sectorsize;
int index;
ASSERT(IS_ALIGNED(len, sectorsize));
sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
return -ENOMEM;
}
/* one ref inside this function, plus one for each page added to
* a bio later on */
refcount_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
sblock->sparity = sparity;
scrub_parity_get(sparity);
for (index = 0; len > 0; index++) {
struct scrub_page *spage;
spage = kzalloc(sizeof(*spage), GFP_KERNEL);
if (!spage) {
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_block_put(sblock);
return -ENOMEM;
}
BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
/* For scrub block */
scrub_page_get(spage);
sblock->pagev[index] = spage;
/* For scrub parity */
scrub_page_get(spage);
list_add_tail(&spage->list, &sparity->spages);
spage->sblock = sblock;
spage->dev = dev;
spage->flags = flags;
spage->generation = gen;
spage->logical = logical;
spage->physical = physical;
spage->mirror_num = mirror_num;
if (csum) {
spage->have_csum = 1;
memcpy(spage->csum, csum, sctx->fs_info->csum_size);
} else {
spage->have_csum = 0;
}
sblock->page_count++;
spage->page = alloc_page(GFP_KERNEL);
if (!spage->page)
goto leave_nomem;
/* Iterate over the stripe range in sectorsize steps */
len -= sectorsize;
logical += sectorsize;
physical += sectorsize;
}
WARN_ON(sblock->page_count == 0);
for (index = 0; index < sblock->page_count; index++) {
struct scrub_page *spage = sblock->pagev[index];
int ret;
ret = scrub_add_page_to_rd_bio(sctx, spage);
if (ret) {
scrub_block_put(sblock);
return ret;
}
}
/* last one frees, either here or in bio completion for last page */
scrub_block_put(sblock);
return 0;
}
static int scrub_extent_for_parity(struct scrub_parity *sparity,
u64 logical, u32 len,
u64 physical, struct btrfs_device *dev,
u64 flags, u64 gen, int mirror_num)
{
struct scrub_ctx *sctx = sparity->sctx;
int ret;
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
scrub_parity_mark_sectors_error(sparity, logical, len);
return 0;
}
if (flags & BTRFS_EXTENT_FLAG_DATA) {
blocksize = sparity->stripe_len;
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
blocksize = sparity->stripe_len;
} else {
blocksize = sctx->fs_info->sectorsize;
WARN_ON(1);
}
while (len) {
u32 l = min(len, blocksize);
int have_csum = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
goto skip;
}
ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
flags, gen, mirror_num,
have_csum ? csum : NULL);
if (ret)
return ret;
skip:
len -= l;
logical += l;
physical += l;
}
return 0;
}
/*
* Given a physical address, this will calculate it's
* logical offset. if this is a parity stripe, it will return
* the most left data stripe's logical offset.
*
* return 0 if it is a data stripe, 1 means parity stripe.
*/
static int get_raid56_logic_offset(u64 physical, int num,
struct map_lookup *map, u64 *offset,
u64 *stripe_start)
{
int i;
int j = 0;
u64 stripe_nr;
u64 last_offset;
u32 stripe_index;
u32 rot;
const int data_stripes = nr_data_stripes(map);
last_offset = (physical - map->stripes[num].physical) * data_stripes;
if (stripe_start)
*stripe_start = last_offset;
*offset = last_offset;
for (i = 0; i < data_stripes; i++) {
*offset = last_offset + i * map->stripe_len;
stripe_nr = div64_u64(*offset, map->stripe_len);
stripe_nr = div_u64(stripe_nr, data_stripes);
/* Work out the disk rotation on this stripe-set */
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
/* calculate which stripe this data locates */
rot += i;
stripe_index = rot % map->num_stripes;
if (stripe_index == num)
return 0;
if (stripe_index < num)
j++;
}
*offset = last_offset + j * map->stripe_len;
return 1;
}
static void scrub_free_parity(struct scrub_parity *sparity)
{
struct scrub_ctx *sctx = sparity->sctx;
struct scrub_page *curr, *next;
int nbits;
nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
if (nbits) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors += nbits;
sctx->stat.uncorrectable_errors += nbits;
spin_unlock(&sctx->stat_lock);
}
list_for_each_entry_safe(curr, next, &sparity->spages, list) {
list_del_init(&curr->list);
scrub_page_put(curr);
}
kfree(sparity);
}
static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
{
struct scrub_parity *sparity = container_of(work, struct scrub_parity,
work);
struct scrub_ctx *sctx = sparity->sctx;
scrub_free_parity(sparity);
scrub_pending_bio_dec(sctx);
}
static void scrub_parity_bio_endio(struct bio *bio)
{
struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
if (bio->bi_status)
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
bio_put(bio);
btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
NULL);
btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
}
static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
{
struct scrub_ctx *sctx = sparity->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct bio *bio;
struct btrfs_raid_bio *rbio;
struct btrfs_bio *bbio = NULL;
u64 length;
int ret;
if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
sparity->nsectors))
goto out;
length = sparity->logic_end - sparity->logic_start;
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
&length, &bbio);
if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
bio = btrfs_io_bio_alloc(0);
bio->bi_iter.bi_sector = sparity->logic_start >> 9;
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
length, sparity->scrub_dev,
sparity->dbitmap,
sparity->nsectors);
if (!rbio)
goto rbio_out;
scrub_pending_bio_inc(sctx);
raid56_parity_submit_scrub_rbio(rbio);
return;
rbio_out:
bio_put(bio);
bbio_out:
btrfs_bio_counter_dec(fs_info);
btrfs_put_bbio(bbio);
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
out:
scrub_free_parity(sparity);
}
static inline int scrub_calc_parity_bitmap_len(int nsectors)
{
return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
}
static void scrub_parity_get(struct scrub_parity *sparity)
{
refcount_inc(&sparity->refs);
}
static void scrub_parity_put(struct scrub_parity *sparity)
{
if (!refcount_dec_and_test(&sparity->refs))
return;
scrub_parity_check_and_repair(sparity);
}
static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *sdev,
struct btrfs_path *path,
u64 logic_start,
u64 logic_end)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
struct btrfs_bio *bbio = NULL;
u64 flags;
int ret;
int slot;
struct extent_buffer *l;
struct btrfs_key key;
u64 generation;
u64 extent_logical;
u64 extent_physical;
/* Check the comment in scrub_stripe() for why u32 is enough here */
u32 extent_len;
u64 mapped_length;
struct btrfs_device *extent_dev;
struct scrub_parity *sparity;
int nsectors;
int bitmap_len;
int extent_mirror_num;
int stop_loop = 0;
ASSERT(map->stripe_len <= U32_MAX);
nsectors = map->stripe_len >> fs_info->sectorsize_bits;
bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
GFP_NOFS);
if (!sparity) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
return -ENOMEM;
}
ASSERT(map->stripe_len <= U32_MAX);
sparity->stripe_len = map->stripe_len;
sparity->nsectors = nsectors;
sparity->sctx = sctx;
sparity->scrub_dev = sdev;
sparity->logic_start = logic_start;
sparity->logic_end = logic_end;
refcount_set(&sparity->refs, 1);
INIT_LIST_HEAD(&sparity->spages);
sparity->dbitmap = sparity->bitmap;
sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
ret = 0;
while (logic_start < logic_end) {
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
key.objectid = logic_start;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
ret = btrfs_previous_extent_item(root, path, 0);
if (ret < 0)
goto out;
if (ret > 0) {
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &key,
path, 0, 0);
if (ret < 0)
goto out;
}
}
stop_loop = 0;
while (1) {
u64 bytes;
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto out;
stop_loop = 1;
break;
}
btrfs_item_key_to_cpu(l, &key, slot);
if (key.type != BTRFS_EXTENT_ITEM_KEY &&
key.type != BTRFS_METADATA_ITEM_KEY)
goto next;
if (key.type == BTRFS_METADATA_ITEM_KEY)
bytes = fs_info->nodesize;
else
bytes = key.offset;
if (key.objectid + bytes <= logic_start)
goto next;
if (key.objectid >= logic_end) {
stop_loop = 1;
break;
}
while (key.objectid >= logic_start + map->stripe_len)
logic_start += map->stripe_len;
extent = btrfs_item_ptr(l, slot,
struct btrfs_extent_item);
flags = btrfs_extent_flags(l, extent);
generation = btrfs_extent_generation(l, extent);
if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
(key.objectid < logic_start ||
key.objectid + bytes >
logic_start + map->stripe_len)) {
btrfs_err(fs_info,
"scrub: tree block %llu spanning stripes, ignored. logical=%llu",
key.objectid, logic_start);
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
goto next;
}
again:
extent_logical = key.objectid;
ASSERT(bytes <= U32_MAX);
extent_len = bytes;
if (extent_logical < logic_start) {
extent_len -= logic_start - extent_logical;
extent_logical = logic_start;
}
if (extent_logical + extent_len >
logic_start + map->stripe_len)
extent_len = logic_start + map->stripe_len -
extent_logical;
scrub_parity_mark_sectors_data(sparity, extent_logical,
extent_len);
mapped_length = extent_len;
bbio = NULL;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
extent_logical, &mapped_length, &bbio,
0);
if (!ret) {
if (!bbio || mapped_length < extent_len)
ret = -EIO;
}
if (ret) {
btrfs_put_bbio(bbio);
goto out;
}
extent_physical = bbio->stripes[0].physical;
extent_mirror_num = bbio->mirror_num;
extent_dev = bbio->stripes[0].dev;
btrfs_put_bbio(bbio);
ret = btrfs_lookup_csums_range(csum_root,
extent_logical,
extent_logical + extent_len - 1,
&sctx->csum_list, 1);
if (ret)
goto out;
ret = scrub_extent_for_parity(sparity, extent_logical,
extent_len,
extent_physical,
extent_dev, flags,
generation,
extent_mirror_num);
scrub_free_csums(sctx);
if (ret)
goto out;
if (extent_logical + extent_len <
key.objectid + bytes) {
logic_start += map->stripe_len;
if (logic_start >= logic_end) {
stop_loop = 1;
break;
}
if (logic_start < key.objectid + bytes) {
cond_resched();
goto again;
}
}
next:
path->slots[0]++;
}
btrfs_release_path(path);
if (stop_loop)
break;
logic_start += map->stripe_len;
}
out:
if (ret < 0) {
ASSERT(logic_end - logic_start <= U32_MAX);
scrub_parity_mark_sectors_error(sparity, logic_start,
logic_end - logic_start);
}
scrub_parity_put(sparity);
scrub_submit(sctx);
mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
btrfs_release_path(path);
return ret < 0 ? ret : 0;
}
static void sync_replace_for_zoned(struct scrub_ctx *sctx)
{
if (!btrfs_is_zoned(sctx->fs_info))
return;
sctx->flush_all_writes = true;
scrub_submit(sctx);
mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
}
static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
u64 physical, u64 physical_end)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
int ret = 0;
if (!btrfs_is_zoned(fs_info))
return 0;
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
mutex_lock(&sctx->wr_lock);
if (sctx->write_pointer < physical_end) {
ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
physical,
sctx->write_pointer);
if (ret)
btrfs_err(fs_info,
"zoned: failed to recover write pointer");
}
mutex_unlock(&sctx->wr_lock);
btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
return ret;
}
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
int num, u64 base, u64 length,
struct btrfs_block_group *cache)
{
struct btrfs_path *path, *ppath;
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
struct blk_plug plug;
u64 flags;
int ret;
int slot;
u64 nstripes;
struct extent_buffer *l;
u64 physical;
u64 logical;
u64 logic_end;
u64 physical_end;
u64 generation;
int mirror_num;
struct reada_control *reada1;
struct reada_control *reada2;
struct btrfs_key key;
struct btrfs_key key_end;
u64 increment = map->stripe_len;
u64 offset;
u64 extent_logical;
u64 extent_physical;
/*
* Unlike chunk length, extent length should never go beyond
* BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
*/
u32 extent_len;
u64 stripe_logical;
u64 stripe_end;
struct btrfs_device *extent_dev;
int extent_mirror_num;
int stop_loop = 0;
physical = map->stripes[num].physical;
offset = 0;
nstripes = div64_u64(length, map->stripe_len);
mirror_num = 1;
increment = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
offset = map->stripe_len * num;
increment = map->stripe_len * map->num_stripes;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
int factor = map->num_stripes / map->sub_stripes;
offset = map->stripe_len * (num / map->sub_stripes);
increment = map->stripe_len * factor;
mirror_num = num % map->sub_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
mirror_num = num % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
mirror_num = num % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
get_raid56_logic_offset(physical, num, map, &offset, NULL);
increment = map->stripe_len * nr_data_stripes(map);
}
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ppath = btrfs_alloc_path();
if (!ppath) {
btrfs_free_path(path);
return -ENOMEM;
}
/*
* work on commit root. The related disk blocks are static as
* long as COW is applied. This means, it is save to rewrite
* them to repair disk errors without any race conditions
*/
path->search_commit_root = 1;
path->skip_locking = 1;
ppath->search_commit_root = 1;
ppath->skip_locking = 1;
/*
* trigger the readahead for extent tree csum tree and wait for
* completion. During readahead, the scrub is officially paused
* to not hold off transaction commits
*/
logical = base + offset;
physical_end = physical + nstripes * map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
get_raid56_logic_offset(physical_end, num,
map, &logic_end, NULL);
logic_end += base;
} else {
logic_end = logical + increment * nstripes;
}
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
scrub_blocked_if_needed(fs_info);
/* FIXME it might be better to start readahead at commit root */
key.objectid = logical;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = (u64)0;
key_end.objectid = logic_end;
key_end.type = BTRFS_METADATA_ITEM_KEY;
key_end.offset = (u64)-1;
reada1 = btrfs_reada_add(root, &key, &key_end);
if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = logical;
key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key_end.type = BTRFS_EXTENT_CSUM_KEY;
key_end.offset = logic_end;
reada2 = btrfs_reada_add(csum_root, &key, &key_end);
} else {
reada2 = NULL;
}
if (!IS_ERR(reada1))
btrfs_reada_wait(reada1);
if (!IS_ERR_OR_NULL(reada2))
btrfs_reada_wait(reada2);
/*
* collect all data csums for the stripe to avoid seeking during
* the scrub. This might currently (crc32) end up to be about 1MB
*/
blk_start_plug(&plug);
if (sctx->is_dev_replace &&
btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
mutex_lock(&sctx->wr_lock);
sctx->write_pointer = physical;
mutex_unlock(&sctx->wr_lock);
sctx->flush_all_writes = true;
}
/*
* now find all extents for each stripe and scrub them
*/
ret = 0;
while (physical < physical_end) {
/*
* canceled?
*/
if (atomic_read(&fs_info->scrub_cancel_req) ||
atomic_read(&sctx->cancel_req)) {
ret = -ECANCELED;
goto out;
}
/*
* check to see if we have to pause
*/
if (atomic_read(&fs_info->scrub_pause_req)) {
/* push queued extents */
sctx->flush_all_writes = true;
scrub_submit(sctx);
mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
sctx->flush_all_writes = false;
scrub_blocked_if_needed(fs_info);
}
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = get_raid56_logic_offset(physical, num, map,
&logical,
&stripe_logical);
logical += base;
if (ret) {
/* it is parity strip */
stripe_logical += base;
stripe_end = stripe_logical + increment;
ret = scrub_raid56_parity(sctx, map, scrub_dev,
ppath, stripe_logical,
stripe_end);
if (ret)
goto out;
goto skip;
}
}
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
key.objectid = logical;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
ret = btrfs_previous_extent_item(root, path, 0);
if (ret < 0)
goto out;
if (ret > 0) {
/* there's no smaller item, so stick with the
* larger one */
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &key,
path, 0, 0);
if (ret < 0)
goto out;
}
}
stop_loop = 0;
while (1) {
u64 bytes;
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto out;
stop_loop = 1;
break;
}
btrfs_item_key_to_cpu(l, &key, slot);
if (key.type != BTRFS_EXTENT_ITEM_KEY &&
key.type != BTRFS_METADATA_ITEM_KEY)
goto next;
if (key.type == BTRFS_METADATA_ITEM_KEY)
bytes = fs_info->nodesize;
else
bytes = key.offset;
if (key.objectid + bytes <= logical)
goto next;
if (key.objectid >= logical + map->stripe_len) {
/* out of this device extent */
if (key.objectid >= logic_end)
stop_loop = 1;
break;
}
/*
* If our block group was removed in the meanwhile, just
* stop scrubbing since there is no point in continuing.
* Continuing would prevent reusing its device extents
* for new block groups for a long time.
*/
spin_lock(&cache->lock);
if (cache->removed) {
spin_unlock(&cache->lock);
ret = 0;
goto out;
}
spin_unlock(&cache->lock);
extent = btrfs_item_ptr(l, slot,
struct btrfs_extent_item);
flags = btrfs_extent_flags(l, extent);
generation = btrfs_extent_generation(l, extent);
if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
(key.objectid < logical ||
key.objectid + bytes >
logical + map->stripe_len)) {
btrfs_err(fs_info,
"scrub: tree block %llu spanning stripes, ignored. logical=%llu",
key.objectid, logical);
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
goto next;
}
again:
extent_logical = key.objectid;
ASSERT(bytes <= U32_MAX);
extent_len = bytes;
/*
* trim extent to this stripe
*/
if (extent_logical < logical) {
extent_len -= logical - extent_logical;
extent_logical = logical;
}
if (extent_logical + extent_len >
logical + map->stripe_len) {
extent_len = logical + map->stripe_len -
extent_logical;
}
extent_physical = extent_logical - logical + physical;
extent_dev = scrub_dev;
extent_mirror_num = mirror_num;
if (sctx->is_dev_replace)
scrub_remap_extent(fs_info, extent_logical,
extent_len, &extent_physical,
&extent_dev,
&extent_mirror_num);
if (flags & BTRFS_EXTENT_FLAG_DATA) {
ret = btrfs_lookup_csums_range(csum_root,
extent_logical,
extent_logical + extent_len - 1,
&sctx->csum_list, 1);
if (ret)
goto out;
}
ret = scrub_extent(sctx, map, extent_logical, extent_len,
extent_physical, extent_dev, flags,
generation, extent_mirror_num,
extent_logical - logical + physical);
scrub_free_csums(sctx);
if (ret)
goto out;
if (sctx->is_dev_replace)
sync_replace_for_zoned(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
/*
* loop until we find next data stripe
* or we have finished all stripes.
*/
loop:
physical += map->stripe_len;
ret = get_raid56_logic_offset(physical,
num, map, &logical,
&stripe_logical);
logical += base;
if (ret && physical < physical_end) {
stripe_logical += base;
stripe_end = stripe_logical +
increment;
ret = scrub_raid56_parity(sctx,
map, scrub_dev, ppath,
stripe_logical,
stripe_end);
if (ret)
goto out;
goto loop;
}
} else {
physical += map->stripe_len;
logical += increment;
}
if (logical < key.objectid + bytes) {
cond_resched();
goto again;
}
if (physical >= physical_end) {
stop_loop = 1;
break;
}
}
next:
path->slots[0]++;
}
btrfs_release_path(path);
skip:
logical += increment;
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
if (stop_loop)
sctx->stat.last_physical = map->stripes[num].physical +
length;
else
sctx->stat.last_physical = physical;
spin_unlock(&sctx->stat_lock);
if (stop_loop)
break;
}
out:
/* push queued extents */
scrub_submit(sctx);
mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
blk_finish_plug(&plug);
btrfs_free_path(path);
btrfs_free_path(ppath);
if (sctx->is_dev_replace && ret >= 0) {
int ret2;
ret2 = sync_write_pointer_for_zoned(sctx, base + offset,
map->stripes[num].physical,
physical_end);
if (ret2)
ret = ret2;
}
return ret < 0 ? ret : 0;
}
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
u64 chunk_offset, u64 length,
u64 dev_offset,
struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
struct extent_map *em;
int i;
int ret = 0;
read_lock(&map_tree->lock);
em = lookup_extent_mapping(map_tree, chunk_offset, 1);
read_unlock(&map_tree->lock);
if (!em) {
/*
* Might have been an unused block group deleted by the cleaner
* kthread or relocation.
*/
spin_lock(&cache->lock);
if (!cache->removed)
ret = -EINVAL;
spin_unlock(&cache->lock);
return ret;
}
map = em->map_lookup;
if (em->start != chunk_offset)
goto out;
if (em->len < length)
goto out;
for (i = 0; i < map->num_stripes; ++i) {
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
ret = scrub_stripe(sctx, map, scrub_dev, i,
chunk_offset, length, cache);
if (ret)
goto out;
}
}
out:
free_extent_map(em);
return ret;
}
static int finish_extent_writes_for_zoned(struct btrfs_root *root,
struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_trans_handle *trans;
if (!btrfs_is_zoned(fs_info))
return 0;
btrfs_wait_block_group_reservations(cache);
btrfs_wait_nocow_writers(cache);
btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
return btrfs_commit_transaction(trans);
}
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev, u64 start, u64 end)
{
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_root *root = fs_info->dev_root;
u64 length;
u64 chunk_offset;
int ret = 0;
int ro_set;
int slot;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_block_group *cache;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
key.objectid = scrub_dev->devid;
key.offset = 0ull;
key.type = BTRFS_DEV_EXTENT_KEY;
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
break;
if (ret > 0) {
if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
break;
if (ret > 0) {
ret = 0;
break;
}
} else {
ret = 0;
}
}
l = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
if (found_key.objectid != scrub_dev->devid)
break;
if (found_key.type != BTRFS_DEV_EXTENT_KEY)
break;
if (found_key.offset >= end)
break;
if (found_key.offset < key.offset)
break;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
if (found_key.offset + length <= start)
goto skip;
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
/*
* get a reference on the corresponding block group to prevent
* the chunk from going away while we scrub it
*/
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
/* some chunks are removed but not committed to disk yet,
* continue scrubbing */
if (!cache)
goto skip;
if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
spin_lock(&cache->lock);
if (!cache->to_copy) {
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
goto skip;
}
spin_unlock(&cache->lock);
}
/*
* Make sure that while we are scrubbing the corresponding block
* group doesn't get its logical address and its device extents
* reused for another block group, which can possibly be of a
* different type and different profile. We do this to prevent
* false error detections and crashes due to bogus attempts to
* repair extents.
*/
spin_lock(&cache->lock);
if (cache->removed) {
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
goto skip;
}
btrfs_freeze_block_group(cache);
spin_unlock(&cache->lock);
/*
* we need call btrfs_inc_block_group_ro() with scrubs_paused,
* to avoid deadlock caused by:
* btrfs_inc_block_group_ro()
* -> btrfs_wait_for_commit()
* -> btrfs_commit_transaction()
* -> btrfs_scrub_pause()
*/
scrub_pause_on(fs_info);
/*
* Don't do chunk preallocation for scrub.
*
* This is especially important for SYSTEM bgs, or we can hit
* -EFBIG from btrfs_finish_chunk_alloc() like:
* 1. The only SYSTEM bg is marked RO.
* Since SYSTEM bg is small, that's pretty common.
* 2. New SYSTEM bg will be allocated
* Due to regular version will allocate new chunk.
* 3. New SYSTEM bg is empty and will get cleaned up
* Before cleanup really happens, it's marked RO again.
* 4. Empty SYSTEM bg get scrubbed
* We go back to 2.
*
* This can easily boost the amount of SYSTEM chunks if cleaner
* thread can't be triggered fast enough, and use up all space
* of btrfs_super_block::sys_chunk_array
*
* While for dev replace, we need to try our best to mark block
* group RO, to prevent race between:
* - Write duplication
* Contains latest data
* - Scrub copy
* Contains data from commit tree
*
* If target block group is not marked RO, nocow writes can
* be overwritten by scrub copy, causing data corruption.
* So for dev-replace, it's not allowed to continue if a block
* group is not RO.
*/
ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
if (!ret && sctx->is_dev_replace) {
ret = finish_extent_writes_for_zoned(root, cache);
if (ret) {
btrfs_dec_block_group_ro(cache);
scrub_pause_off(fs_info);
btrfs_put_block_group(cache);
break;
}
}
if (ret == 0) {
ro_set = 1;
} else if (ret == -ENOSPC && !sctx->is_dev_replace) {
/*
* btrfs_inc_block_group_ro return -ENOSPC when it
* failed in creating new chunk for metadata.
* It is not a problem for scrub, because
* metadata are always cowed, and our scrub paused
* commit_transactions.
*/
ro_set = 0;
} else if (ret == -ETXTBSY) {
btrfs_warn(fs_info,
"skipping scrub of block group %llu due to active swapfile",
cache->start);
scrub_pause_off(fs_info);
ret = 0;
goto skip_unfreeze;
} else {
btrfs_warn(fs_info,
"failed setting block group ro: %d", ret);
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
scrub_pause_off(fs_info);
break;
}
/*
* Now the target block is marked RO, wait for nocow writes to
* finish before dev-replace.
* COW is fine, as COW never overwrites extents in commit tree.
*/
if (sctx->is_dev_replace) {
btrfs_wait_nocow_writers(cache);
btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
cache->length);
}
scrub_pause_off(fs_info);
down_write(&dev_replace->rwsem);
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
up_write(&dev_replace->rwsem);
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
found_key.offset, cache);
/*
* flush, submit all pending read and write bios, afterwards
* wait for them.
* Note that in the dev replace case, a read request causes
* write requests that are submitted in the read completion
* worker. Therefore in the current situation, it is required
* that all write requests are flushed, so that all read and
* write requests are really completed when bios_in_flight
* changes to 0.
*/
sctx->flush_all_writes = true;
scrub_submit(sctx);
mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
scrub_pause_on(fs_info);
/*
* must be called before we decrease @scrub_paused.
* make sure we don't block transaction commit while
* we are waiting pending workers finished.
*/
wait_event(sctx->list_wait,
atomic_read(&sctx->workers_pending) == 0);
sctx->flush_all_writes = false;
scrub_pause_off(fs_info);
if (sctx->is_dev_replace &&
!btrfs_finish_block_group_to_copy(dev_replace->srcdev,
cache, found_key.offset))
ro_set = 0;
down_write(&dev_replace->rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
up_write(&dev_replace->rwsem);
if (ro_set)
btrfs_dec_block_group_ro(cache);
/*
* We might have prevented the cleaner kthread from deleting
* this block group if it was already unused because we raced
* and set it to RO mode first. So add it back to the unused
* list, otherwise it might not ever be deleted unless a manual
* balance is triggered or it becomes used and unused again.
*/
spin_lock(&cache->lock);
if (!cache->removed && !cache->ro && cache->reserved == 0 &&
cache->used == 0) {
spin_unlock(&cache->lock);
if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_queue_work(&fs_info->discard_ctl,
cache);
else
btrfs_mark_bg_unused(cache);
} else {
spin_unlock(&cache->lock);
}
skip_unfreeze:
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
if (ret)
break;
if (sctx->is_dev_replace &&
atomic64_read(&dev_replace->num_write_errors) > 0) {
ret = -EIO;
break;
}
if (sctx->stat.malloc_errors > 0) {
ret = -ENOMEM;
break;
}
skip:
key.offset = found_key.offset + length;
btrfs_release_path(path);
}
btrfs_free_path(path);
return ret;
}
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev)
{
int i;
u64 bytenr;
u64 gen;
int ret;
struct btrfs_fs_info *fs_info = sctx->fs_info;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
/* Seed devices of a new filesystem has their own generation. */
if (scrub_dev->fs_devices != fs_info->fs_devices)
gen = scrub_dev->generation;
else
gen = fs_info->last_trans_committed;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >
scrub_dev->commit_total_bytes)
break;
if (!btrfs_check_super_location(scrub_dev, bytenr))
continue;
ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
NULL, bytenr);
if (ret)
return ret;
}
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
return 0;
}
static void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
&fs_info->scrub_lock)) {
struct btrfs_workqueue *scrub_workers = NULL;
struct btrfs_workqueue *scrub_wr_comp = NULL;
struct btrfs_workqueue *scrub_parity = NULL;
scrub_workers = fs_info->scrub_workers;
scrub_wr_comp = fs_info->scrub_wr_completion_workers;
scrub_parity = fs_info->scrub_parity_workers;
fs_info->scrub_workers = NULL;
fs_info->scrub_wr_completion_workers = NULL;
fs_info->scrub_parity_workers = NULL;
mutex_unlock(&fs_info->scrub_lock);
btrfs_destroy_workqueue(scrub_workers);
btrfs_destroy_workqueue(scrub_wr_comp);
btrfs_destroy_workqueue(scrub_parity);
}
}
/*
* get a reference count on fs_info->scrub_workers. start worker if necessary
*/
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
struct btrfs_workqueue *scrub_workers = NULL;
struct btrfs_workqueue *scrub_wr_comp = NULL;
struct btrfs_workqueue *scrub_parity = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
int ret = -ENOMEM;
if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
return 0;
scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
is_dev_replace ? 1 : max_active, 4);
if (!scrub_workers)
goto fail_scrub_workers;
scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
if (!scrub_wr_comp)
goto fail_scrub_wr_completion_workers;
scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
max_active, 2);
if (!scrub_parity)
goto fail_scrub_parity_workers;
mutex_lock(&fs_info->scrub_lock);
if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
ASSERT(fs_info->scrub_workers == NULL &&
fs_info->scrub_wr_completion_workers == NULL &&
fs_info->scrub_parity_workers == NULL);
fs_info->scrub_workers = scrub_workers;
fs_info->scrub_wr_completion_workers = scrub_wr_comp;
fs_info->scrub_parity_workers = scrub_parity;
refcount_set(&fs_info->scrub_workers_refcnt, 1);
mutex_unlock(&fs_info->scrub_lock);
return 0;
}
/* Other thread raced in and created the workers for us */
refcount_inc(&fs_info->scrub_workers_refcnt);
mutex_unlock(&fs_info->scrub_lock);
ret = 0;
btrfs_destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
btrfs_destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
btrfs_destroy_workqueue(scrub_workers);
fail_scrub_workers:
return ret;
}
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace)
{
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
/*
* in this case scrub is unable to calculate the checksum
* the way scrub is implemented. Do not handle this
* situation at all because it won't ever happen.
*/
btrfs_err(fs_info,
"scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
fs_info->nodesize,
BTRFS_STRIPE_LEN);
return -EINVAL;
}
if (fs_info->nodesize >
PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
/*
* would exhaust the array bounds of pagev member in
* struct scrub_block
*/
btrfs_err(fs_info,
"scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
fs_info->nodesize,
SCRUB_MAX_PAGES_PER_BLOCK,
fs_info->sectorsize,
SCRUB_MAX_PAGES_PER_BLOCK);
return -EINVAL;
}
/* Allocate outside of device_list_mutex */
sctx = scrub_setup_ctx(fs_info, is_dev_replace);
if (IS_ERR(sctx))
return PTR_ERR(sctx);
ret = scrub_workers_get(fs_info, is_dev_replace);
if (ret)
goto out_free_ctx;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -ENODEV;
goto out;
}
if (!is_dev_replace && !readonly &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_err_in_rcu(fs_info,
"scrub on devid %llu: filesystem on %s is not writable",
devid, rcu_str_deref(dev->name));
ret = -EROFS;
goto out;
}
mutex_lock(&fs_info->scrub_lock);
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EIO;
goto out;
}
down_read(&fs_info->dev_replace.rwsem);
if (dev->scrub_ctx ||
(!is_dev_replace &&
btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
up_read(&fs_info->dev_replace.rwsem);
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EINPROGRESS;
goto out;
}
up_read(&fs_info->dev_replace.rwsem);
sctx->readonly = readonly;
dev->scrub_ctx = sctx;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/*
* checking @scrub_pause_req here, we can avoid
* race between committing transaction and scrubbing.
*/
__scrub_blocked_if_needed(fs_info);
atomic_inc(&fs_info->scrubs_running);
mutex_unlock(&fs_info->scrub_lock);
/*
* In order to avoid deadlock with reclaim when there is a transaction
* trying to pause scrub, make sure we use GFP_NOFS for all the
* allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
* invoked by our callees. The pausing request is done when the
* transaction commit starts, and it blocks the transaction until scrub
* is paused (done at specific points at scrub_stripe() or right above
* before incrementing fs_info->scrubs_running).
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
* kick off writing super in log tree sync.
*/
mutex_lock(&fs_info->fs_devices->device_list_mutex);
ret = scrub_supers(sctx, dev);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
}
if (!ret)
ret = scrub_enumerate_chunks(sctx, dev, start, end);
memalloc_nofs_restore(nofs_flag);
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
wake_up(&fs_info->scrub_pause_wait);
wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
if (progress)
memcpy(progress, &sctx->stat, sizeof(*progress));
if (!is_dev_replace)
btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
ret ? "not finished" : "finished", devid, ret);
mutex_lock(&fs_info->scrub_lock);
dev->scrub_ctx = NULL;
mutex_unlock(&fs_info->scrub_lock);
scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
return ret;
out:
scrub_workers_put(fs_info);
out_free_ctx:
scrub_free_ctx(sctx);
return ret;
}
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
mutex_lock(&fs_info->scrub_lock);
atomic_inc(&fs_info->scrub_pause_req);
while (atomic_read(&fs_info->scrubs_paused) !=
atomic_read(&fs_info->scrubs_running)) {
mutex_unlock(&fs_info->scrub_lock); wait_event(fs_info->scrub_pause_wait,
atomic_read(&fs_info->scrubs_paused) ==
atomic_read(&fs_info->scrubs_running));
mutex_lock(&fs_info->scrub_lock);
}
mutex_unlock(&fs_info->scrub_lock);
}
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
atomic_dec(&fs_info->scrub_pause_req);
wake_up(&fs_info->scrub_pause_wait);
}
int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
mutex_lock(&fs_info->scrub_lock);
if (!atomic_read(&fs_info->scrubs_running)) {
mutex_unlock(&fs_info->scrub_lock);
return -ENOTCONN;
}
atomic_inc(&fs_info->scrub_cancel_req);
while (atomic_read(&fs_info->scrubs_running)) {
mutex_unlock(&fs_info->scrub_lock); wait_event(fs_info->scrub_pause_wait,
atomic_read(&fs_info->scrubs_running) == 0);
mutex_lock(&fs_info->scrub_lock);
}
atomic_dec(&fs_info->scrub_cancel_req);
mutex_unlock(&fs_info->scrub_lock);
return 0;
}
int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
struct scrub_ctx *sctx;
mutex_lock(&fs_info->scrub_lock);
sctx = dev->scrub_ctx;
if (!sctx) {
mutex_unlock(&fs_info->scrub_lock);
return -ENOTCONN;
}
atomic_inc(&sctx->cancel_req);
while (dev->scrub_ctx) {
mutex_unlock(&fs_info->scrub_lock);
wait_event(fs_info->scrub_pause_wait,
dev->scrub_ctx == NULL);
mutex_lock(&fs_info->scrub_lock);
}
mutex_unlock(&fs_info->scrub_lock);
return 0;
}
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress)
{
struct btrfs_device *dev;
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
memcpy(progress, &sctx->stat, sizeof(*progress));
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
u64 extent_logical, u32 extent_len,
u64 *extent_physical,
struct btrfs_device **extent_dev,
int *extent_mirror_num)
{
u64 mapped_length;
struct btrfs_bio *bbio = NULL;
int ret;
mapped_length = extent_len;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
&mapped_length, &bbio, 0);
if (ret || !bbio || mapped_length < extent_len ||
!bbio->stripes[0].dev->bdev) {
btrfs_put_bbio(bbio);
return;
}
*extent_physical = bbio->stripes[0].physical;
*extent_mirror_num = bbio->mirror_num;
*extent_dev = bbio->stripes[0].dev;
btrfs_put_bbio(bbio);
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
* Copyright (c) 2016-2021 Christoph Hellwig.
*/
#include <linux/fs.h>
#include <linux/iomap.h>
#include "trace.h"
static inline int iomap_iter_advance(struct iomap_iter *iter)
{
/* handle the previous iteration (if any) */
if (iter->iomap.length) { if (iter->processed <= 0) return iter->processed; if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
return -EIO;
iter->pos += iter->processed;
iter->len -= iter->processed;
if (!iter->len)
return 0;
}
/* clear the state for the next iteration */
iter->processed = 0;
memset(&iter->iomap, 0, sizeof(iter->iomap));
memset(&iter->srcmap, 0, sizeof(iter->srcmap));
return 1;
}
static inline void iomap_iter_done(struct iomap_iter *iter)
{
WARN_ON_ONCE(iter->iomap.offset > iter->pos); WARN_ON_ONCE(iter->iomap.length == 0); WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) trace_iomap_iter_srcmap(iter->inode, &iter->srcmap);
}
/**
* iomap_iter - iterate over a ranges in a file
* @iter: iteration structue
* @ops: iomap ops provided by the file system
*
* Iterate over filesystem-provided space mappings for the provided file range.
*
* This function handles cleanup of resources acquired for iteration when the
* filesystem indicates there are no more space mappings, which means that this
* function must be called in a loop that continues as long it returns a
* positive value. If 0 or a negative value is returned, the caller must not
* return to the loop body. Within a loop body, there are two ways to break out
* of the loop body: leave @iter.processed unchanged, or set it to a negative
* errno.
*/
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops)
{
int ret;
if (iter->iomap.length && ops->iomap_end) { ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter),
iter->processed > 0 ? iter->processed : 0,
iter->flags, &iter->iomap);
if (ret < 0 && !iter->processed)
return ret;
}
trace_iomap_iter(iter, ops, _RET_IP_);
ret = iomap_iter_advance(iter);
if (ret <= 0)
return ret;
ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags,
&iter->iomap, &iter->srcmap);
if (ret < 0)
return ret;
iomap_iter_done(iter);
return 1;
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X_TABLES_H
#define _X_TABLES_H
#include <linux/netdevice.h>
#include <linux/static_key.h>
#include <linux/netfilter.h>
#include <uapi/linux/netfilter/x_tables.h>
/* Test a struct->invflags and a boolean for inequality */
#define NF_INVF(ptr, flag, boolean) \
((boolean) ^ !!((ptr)->invflags & (flag)))
/**
* struct xt_action_param - parameters for matches/targets
*
* @match: the match extension
* @target: the target extension
* @matchinfo: per-match data
* @targetinfo: per-target data
* @state: pointer to hook state this packet came from
* @fragoff: packet is a fragment, this is the data offset
* @thoff: position of transport header relative to skb->data
*
* Fields written to by extensions:
*
* @hotdrop: drop packet if we had inspection problems
*/
struct xt_action_param {
union {
const struct xt_match *match;
const struct xt_target *target;
};
union {
const void *matchinfo, *targinfo;
};
const struct nf_hook_state *state;
unsigned int thoff;
u16 fragoff;
bool hotdrop;
};
static inline struct net *xt_net(const struct xt_action_param *par)
{
return par->state->net;
}
static inline struct net_device *xt_in(const struct xt_action_param *par)
{
return par->state->in;
}
static inline const char *xt_inname(const struct xt_action_param *par)
{
return par->state->in->name;
}
static inline struct net_device *xt_out(const struct xt_action_param *par)
{
return par->state->out;
}
static inline const char *xt_outname(const struct xt_action_param *par)
{
return par->state->out->name;
}
static inline unsigned int xt_hooknum(const struct xt_action_param *par)
{
return par->state->hook;
}
static inline u_int8_t xt_family(const struct xt_action_param *par)
{
return par->state->pf;
}
/**
* struct xt_mtchk_param - parameters for match extensions'
* checkentry functions
*
* @net: network namespace through which the check was invoked
* @table: table the rule is tried to be inserted into
* @entryinfo: the family-specific rule data
* (struct ipt_ip, ip6t_ip, arpt_arp or (note) ebt_entry)
* @match: struct xt_match through which this function was invoked
* @matchinfo: per-match data
* @hook_mask: via which hooks the new rule is reachable
* Other fields as above.
*/
struct xt_mtchk_param {
struct net *net;
const char *table;
const void *entryinfo;
const struct xt_match *match;
void *matchinfo;
unsigned int hook_mask;
u_int8_t family;
bool nft_compat;
};
/**
* struct xt_mdtor_param - match destructor parameters
* Fields as above.
*/
struct xt_mtdtor_param {
struct net *net;
const struct xt_match *match;
void *matchinfo;
u_int8_t family;
};
/**
* struct xt_tgchk_param - parameters for target extensions'
* checkentry functions
*
* @entryinfo: the family-specific rule data
* (struct ipt_entry, ip6t_entry, arpt_entry, ebt_entry)
*
* Other fields see above.
*/
struct xt_tgchk_param {
struct net *net;
const char *table;
const void *entryinfo;
const struct xt_target *target;
void *targinfo;
unsigned int hook_mask;
u_int8_t family;
bool nft_compat;
};
/* Target destructor parameters */
struct xt_tgdtor_param {
struct net *net;
const struct xt_target *target;
void *targinfo;
u_int8_t family;
};
struct xt_match {
struct list_head list;
const char name[XT_EXTENSION_MAXNAMELEN];
u_int8_t revision;
/* Return true or false: return FALSE and set *hotdrop = 1 to
force immediate packet drop. */
/* Arguments changed since 2.6.9, as this must now handle
non-linear skb, using skb_header_pointer and
skb_ip_make_writable. */
bool (*match)(const struct sk_buff *skb,
struct xt_action_param *);
/* Called when user tries to insert an entry of this type. */
int (*checkentry)(const struct xt_mtchk_param *);
/* Called when entry of this type deleted. */
void (*destroy)(const struct xt_mtdtor_param *);
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
/* Called when userspace align differs from kernel space one */
void (*compat_from_user)(void *dst, const void *src);
int (*compat_to_user)(void __user *dst, const void *src);
#endif
/* Set this to THIS_MODULE if you are a module, otherwise NULL */
struct module *me;
const char *table;
unsigned int matchsize;
unsigned int usersize;
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
unsigned int compatsize;
#endif
unsigned int hooks;
unsigned short proto;
unsigned short family;
};
/* Registration hooks for targets. */
struct xt_target {
struct list_head list;
const char name[XT_EXTENSION_MAXNAMELEN];
u_int8_t revision;
/* Returns verdict. Argument order changed since 2.6.9, as this
must now handle non-linear skbs, using skb_copy_bits and
skb_ip_make_writable. */
unsigned int (*target)(struct sk_buff *skb,
const struct xt_action_param *);
/* Called when user tries to insert an entry of this type:
hook_mask is a bitmask of hooks from which it can be
called. */
/* Should return 0 on success or an error code otherwise (-Exxxx). */
int (*checkentry)(const struct xt_tgchk_param *);
/* Called when entry of this type deleted. */
void (*destroy)(const struct xt_tgdtor_param *);
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
/* Called when userspace align differs from kernel space one */
void (*compat_from_user)(void *dst, const void *src);
int (*compat_to_user)(void __user *dst, const void *src);
#endif
/* Set this to THIS_MODULE if you are a module, otherwise NULL */
struct module *me;
const char *table;
unsigned int targetsize;
unsigned int usersize;
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
unsigned int compatsize;
#endif
unsigned int hooks;
unsigned short proto;
unsigned short family;
};
/* Furniture shopping... */
struct xt_table {
struct list_head list;
/* What hooks you will enter on */
unsigned int valid_hooks;
/* Man behind the curtain... */
struct xt_table_info *private;
/* hook ops that register the table with the netfilter core */
struct nf_hook_ops *ops;
/* Set this to THIS_MODULE if you are a module, otherwise NULL */
struct module *me;
u_int8_t af; /* address/protocol family */
int priority; /* hook order */
/* A unique name... */
const char name[XT_TABLE_MAXNAMELEN];
};
#include <linux/netfilter_ipv4.h>
/* The table itself */
struct xt_table_info {
/* Size per table */
unsigned int size;
/* Number of entries: FIXME. --RR */
unsigned int number;
/* Initial number of entries. Needed for module usage count */
unsigned int initial_entries;
/* Entry points and underflows */
unsigned int hook_entry[NF_INET_NUMHOOKS];
unsigned int underflow[NF_INET_NUMHOOKS];
/*
* Number of user chains. Since tables cannot have loops, at most
* @stacksize jumps (number of user chains) can possibly be made.
*/
unsigned int stacksize;
void ***jumpstack;
unsigned char entries[] __aligned(8);
};
int xt_register_target(struct xt_target *target);
void xt_unregister_target(struct xt_target *target);
int xt_register_targets(struct xt_target *target, unsigned int n);
void xt_unregister_targets(struct xt_target *target, unsigned int n);
int xt_register_match(struct xt_match *target);
void xt_unregister_match(struct xt_match *target);
int xt_register_matches(struct xt_match *match, unsigned int n);
void xt_unregister_matches(struct xt_match *match, unsigned int n);
int xt_check_entry_offsets(const void *base, const char *elems,
unsigned int target_offset,
unsigned int next_offset);
int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks);
unsigned int *xt_alloc_entry_offsets(unsigned int size);
bool xt_find_jump_offset(const unsigned int *offsets,
unsigned int target, unsigned int size);
int xt_check_proc_name(const char *name, unsigned int size);
int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto,
bool inv_proto);
int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto,
bool inv_proto);
int xt_match_to_user(const struct xt_entry_match *m,
struct xt_entry_match __user *u);
int xt_target_to_user(const struct xt_entry_target *t,
struct xt_entry_target __user *u);
int xt_data_to_user(void __user *dst, const void *src,
int usersize, int size, int aligned_size);
void *xt_copy_counters(sockptr_t arg, unsigned int len,
struct xt_counters_info *info);
struct xt_counters *xt_counters_alloc(unsigned int counters);
struct xt_table *xt_register_table(struct net *net,
const struct xt_table *table,
struct xt_table_info *bootstrap,
struct xt_table_info *newinfo);
void *xt_unregister_table(struct xt_table *table);
struct xt_table_info *xt_replace_table(struct xt_table *table,
unsigned int num_counters,
struct xt_table_info *newinfo,
int *error);
struct xt_match *xt_find_match(u8 af, const char *name, u8 revision);
struct xt_match *xt_request_find_match(u8 af, const char *name, u8 revision);
struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision);
int xt_find_revision(u8 af, const char *name, u8 revision, int target,
int *err);
struct xt_table *xt_find_table(struct net *net, u8 af, const char *name);
struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
const char *name);
struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af,
const char *name);
void xt_table_unlock(struct xt_table *t);
int xt_proto_init(struct net *net, u_int8_t af);
void xt_proto_fini(struct net *net, u_int8_t af);
struct xt_table_info *xt_alloc_table_info(unsigned int size);
void xt_free_table_info(struct xt_table_info *info);
/**
* xt_recseq - recursive seqcount for netfilter use
*
* Packet processing changes the seqcount only if no recursion happened
* get_counters() can use read_seqcount_begin()/read_seqcount_retry(),
* because we use the normal seqcount convention :
* Low order bit set to 1 if a writer is active.
*/
DECLARE_PER_CPU(seqcount_t, xt_recseq);
/* xt_tee_enabled - true if x_tables needs to handle reentrancy
*
* Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
*/
extern struct static_key xt_tee_enabled;
/**
* xt_write_recseq_begin - start of a write section
*
* Begin packet processing : all readers must wait the end
* 1) Must be called with preemption disabled
* 2) softirqs must be disabled too (or we should use this_cpu_add())
* Returns :
* 1 if no recursion on this cpu
* 0 if recursion detected
*/
static inline unsigned int xt_write_recseq_begin(void)
{
unsigned int addend;
/*
* Low order bit of sequence is set if we already
* called xt_write_recseq_begin().
*/
addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1;
/*
* This is kind of a write_seqcount_begin(), but addend is 0 or 1
* We dont check addend value to avoid a test and conditional jump,
* since addend is most likely 1
*/
__this_cpu_add(xt_recseq.sequence, addend);
smp_mb();
return addend;
}
/**
* xt_write_recseq_end - end of a write section
* @addend: return value from previous xt_write_recseq_begin()
*
* End packet processing : all readers can proceed
* 1) Must be called with preemption disabled
* 2) softirqs must be disabled too (or we should use this_cpu_add())
*/
static inline void xt_write_recseq_end(unsigned int addend)
{
/* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
smp_wmb();
__this_cpu_add(xt_recseq.sequence, addend);
}
/*
* This helper is performance critical and must be inlined
*/
static inline unsigned long ifname_compare_aligned(const char *_a,
const char *_b,
const char *_mask)
{
const unsigned long *a = (const unsigned long *)_a;
const unsigned long *b = (const unsigned long *)_b;
const unsigned long *mask = (const unsigned long *)_mask;
unsigned long ret;
ret = (a[0] ^ b[0]) & mask[0];
if (IFNAMSIZ > sizeof(unsigned long))
ret |= (a[1] ^ b[1]) & mask[1];
if (IFNAMSIZ > 2 * sizeof(unsigned long))
ret |= (a[2] ^ b[2]) & mask[2];
if (IFNAMSIZ > 3 * sizeof(unsigned long))
ret |= (a[3] ^ b[3]) & mask[3];
BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long));
return ret;
}
struct xt_percpu_counter_alloc_state {
unsigned int off;
const char __percpu *mem;
};
bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
struct xt_counters *counter);
void xt_percpu_counter_free(struct xt_counters *cnt);
static inline struct xt_counters *
xt_get_this_cpu_counter(struct xt_counters *cnt)
{
if (nr_cpu_ids > 1) return this_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt);
return cnt;
}
static inline struct xt_counters *
xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
{
if (nr_cpu_ids > 1)
return per_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt, cpu);
return cnt;
}
struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *);
int xt_register_template(const struct xt_table *t, int(*table_init)(struct net *net));
void xt_unregister_template(const struct xt_table *t);
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
#include <net/compat.h>
struct compat_xt_entry_match {
union {
struct {
u_int16_t match_size;
char name[XT_FUNCTION_MAXNAMELEN - 1];
u_int8_t revision;
} user;
struct {
u_int16_t match_size;
compat_uptr_t match;
} kernel;
u_int16_t match_size;
} u;
unsigned char data[];
};
struct compat_xt_entry_target {
union {
struct {
u_int16_t target_size;
char name[XT_FUNCTION_MAXNAMELEN - 1];
u_int8_t revision;
} user;
struct {
u_int16_t target_size;
compat_uptr_t target;
} kernel;
u_int16_t target_size;
} u;
unsigned char data[];
};
/* FIXME: this works only on 32 bit tasks
* need to change whole approach in order to calculate align as function of
* current task alignment */
struct compat_xt_counters {
compat_u64 pcnt, bcnt; /* Packet and byte counters */
};
struct compat_xt_counters_info {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t num_counters;
struct compat_xt_counters counters[];
};
struct _compat_xt_align {
__u8 u8;
__u16 u16;
__u32 u32;
compat_u64 u64;
};
#define COMPAT_XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _compat_xt_align))
void xt_compat_lock(u_int8_t af);
void xt_compat_unlock(u_int8_t af);
int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta);
void xt_compat_flush_offsets(u_int8_t af);
int xt_compat_init_offsets(u8 af, unsigned int number);
int xt_compat_calc_jump(u_int8_t af, unsigned int offset);
int xt_compat_match_offset(const struct xt_match *match);
void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
unsigned int *size);
int xt_compat_match_to_user(const struct xt_entry_match *m,
void __user **dstptr, unsigned int *size);
int xt_compat_target_offset(const struct xt_target *target);
void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
unsigned int *size);
int xt_compat_target_to_user(const struct xt_entry_target *t,
void __user **dstptr, unsigned int *size);
int xt_compat_check_entry_offsets(const void *base, const char *elems,
unsigned int target_offset,
unsigned int next_offset);
#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
#endif /* _X_TABLES_H */
// SPDX-License-Identifier: GPL-2.0
/*
* High-level sync()-related operations
*/
#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/namei.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <linux/linkage.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/backing-dev.h>
#include "internal.h"
#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
SYNC_FILE_RANGE_WAIT_AFTER)
/*
* Write out and wait upon all dirty data associated with this
* superblock. Filesystem data as well as the underlying block
* device. Takes the superblock lock.
*/
int sync_filesystem(struct super_block *sb)
{
int ret = 0;
/*
* We need to be protected against the filesystem going from
* r/o to r/w or vice versa.
*/
WARN_ON(!rwsem_is_locked(&sb->s_umount));
/*
* No point in syncing out anything if the filesystem is read-only.
*/
if (sb_rdonly(sb))
return 0;
/*
* Do the filesystem syncing work. For simple filesystems
* writeback_inodes_sb(sb) just dirties buffers with inodes so we have
* to submit I/O for these buffers via sync_blockdev(). This also
* speeds up the wait == 1 case since in that case write_inode()
* methods call sync_dirty_buffer() and thus effectively write one block
* at a time.
*/
writeback_inodes_sb(sb, WB_REASON_SYNC);
if (sb->s_op->sync_fs) {
ret = sb->s_op->sync_fs(sb, 0);
if (ret)
return ret;
}
ret = sync_blockdev_nowait(sb->s_bdev);
if (ret)
return ret;
sync_inodes_sb(sb);
if (sb->s_op->sync_fs) {
ret = sb->s_op->sync_fs(sb, 1);
if (ret)
return ret;
}
return sync_blockdev(sb->s_bdev);
}
EXPORT_SYMBOL(sync_filesystem);
static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
if (!sb_rdonly(sb))
sync_inodes_sb(sb);
}
static void sync_fs_one_sb(struct super_block *sb, void *arg)
{
if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
sb->s_op->sync_fs)
sb->s_op->sync_fs(sb, *(int *)arg);
}
/*
* Sync everything. We start by waking flusher threads so that most of
* writeback runs on all devices in parallel. Then we sync all inodes reliably
* which effectively also waits for all flusher threads to finish doing
* writeback. At this point all data is on disk so metadata should be stable
* and we tell filesystems to sync their metadata via ->sync_fs() calls.
* Finally, we writeout all block devices because some filesystems (e.g. ext2)
* just write metadata (such as inodes or bitmaps) to block device page cache
* and do not sync it on their own in ->sync_fs().
*/
void ksys_sync(void)
{
int nowait = 0, wait = 1;
wakeup_flusher_threads(WB_REASON_SYNC);
iterate_supers(sync_inodes_one_sb, NULL);
iterate_supers(sync_fs_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &wait);
sync_bdevs(false);
sync_bdevs(true);
if (unlikely(laptop_mode))
laptop_sync_completion();
}
SYSCALL_DEFINE0(sync)
{
ksys_sync();
return 0;
}
static void do_sync_work(struct work_struct *work)
{
int nowait = 0;
/*
* Sync twice to reduce the possibility we skipped some inodes / pages
* because they were temporarily locked
*/
iterate_supers(sync_inodes_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &nowait);
sync_bdevs(false);
iterate_supers(sync_inodes_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &nowait);
sync_bdevs(false);
printk("Emergency Sync complete\n");
kfree(work);
}
void emergency_sync(void)
{
struct work_struct *work;
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (work) {
INIT_WORK(work, do_sync_work);
schedule_work(work);
}
}
/*
* sync a single super
*/
SYSCALL_DEFINE1(syncfs, int, fd)
{
struct fd f = fdget(fd);
struct super_block *sb;
int ret, ret2;
if (!f.file)
return -EBADF;
sb = f.file->f_path.dentry->d_sb;
down_read(&sb->s_umount);
ret = sync_filesystem(sb);
up_read(&sb->s_umount);
ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);
fdput(f);
return ret ? ret : ret2;
}
/**
* vfs_fsync_range - helper to sync a range of data & metadata to disk
* @file: file to sync
* @start: offset in bytes of the beginning of data range to sync
* @end: offset in bytes of the end of data range (inclusive)
* @datasync: perform only datasync
*
* Write back data in range @start..@end and metadata for @file to disk. If
* @datasync is set only metadata needed to access modified file data is
* written.
*/
int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host; if (!file->f_op->fsync)
return -EINVAL;
if (!datasync && (inode->i_state & I_DIRTY_TIME))
mark_inode_dirty_sync(inode);
return file->f_op->fsync(file, start, end, datasync);
}
EXPORT_SYMBOL(vfs_fsync_range);
/**
* vfs_fsync - perform a fsync or fdatasync on a file
* @file: file to sync
* @datasync: only perform a fdatasync operation
*
* Write back data and metadata for @file to disk. If @datasync is
* set only metadata needed to access modified file data is written.
*/
int vfs_fsync(struct file *file, int datasync)
{
return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
}
EXPORT_SYMBOL(vfs_fsync);
static int do_fsync(unsigned int fd, int datasync)
{
struct fd f = fdget(fd);
int ret = -EBADF;
if (f.file) {
ret = vfs_fsync(f.file, datasync);
fdput(f);
}
return ret;
}
SYSCALL_DEFINE1(fsync, unsigned int, fd)
{
return do_fsync(fd, 0);
}
SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
{
return do_fsync(fd, 1);
}
int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
unsigned int flags)
{
int ret;
struct address_space *mapping;
loff_t endbyte; /* inclusive */
umode_t i_mode;
ret = -EINVAL;
if (flags & ~VALID_FLAGS)
goto out;
endbyte = offset + nbytes;
if ((s64)offset < 0)
goto out;
if ((s64)endbyte < 0)
goto out;
if (endbyte < offset)
goto out;
if (sizeof(pgoff_t) == 4) {
if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
/*
* The range starts outside a 32 bit machine's
* pagecache addressing capabilities. Let it "succeed"
*/
ret = 0;
goto out;
}
if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
/*
* Out to EOF
*/
nbytes = 0;
}
}
if (nbytes == 0)
endbyte = LLONG_MAX;
else
endbyte--; /* inclusive */
i_mode = file_inode(file)->i_mode;
ret = -ESPIPE;
if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
!S_ISLNK(i_mode))
goto out;
mapping = file->f_mapping;
ret = 0;
if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
ret = file_fdatawait_range(file, offset, endbyte);
if (ret < 0)
goto out;
}
if (flags & SYNC_FILE_RANGE_WRITE) {
int sync_mode = WB_SYNC_NONE;
if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
SYNC_FILE_RANGE_WRITE_AND_WAIT)
sync_mode = WB_SYNC_ALL;
ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
sync_mode);
if (ret < 0)
goto out;
}
if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
ret = file_fdatawait_range(file, offset, endbyte);
out:
return ret;
}
/*
* ksys_sync_file_range() permits finely controlled syncing over a segment of
* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
* zero then ksys_sync_file_range() will operate from offset out to EOF.
*
* The flag bits are:
*
* SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
* before performing the write.
*
* SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
* range which are not presently under writeback. Note that this may block for
* significant periods due to exhaustion of disk request structures.
*
* SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
* after performing the write.
*
* Useful combinations of the flag bits are:
*
* SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
* in the range which were dirty on entry to ksys_sync_file_range() are placed
* under writeout. This is a start-write-for-data-integrity operation.
*
* SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
* are not presently under writeout. This is an asynchronous flush-to-disk
* operation. Not suitable for data integrity operations.
*
* SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
* completion of writeout of all pages in the range. This will be used after an
* earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
* for that operation to complete and to return the result.
*
* SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER
* (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT):
* a traditional sync() operation. This is a write-for-data-integrity operation
* which will ensure that all pages in the range which were dirty on entry to
* ksys_sync_file_range() are written to disk. It should be noted that disk
* caches are not flushed by this call, so there are no guarantees here that the
* data will be available on disk after a crash.
*
*
* SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
* I/O errors or ENOSPC conditions and will return those to the caller, after
* clearing the EIO and ENOSPC flags in the address_space.
*
* It should be noted that none of these operations write out the file's
* metadata. So unless the application is strictly performing overwrites of
* already-instantiated disk blocks, there are no guarantees here that the data
* will be available after a crash.
*/
int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
unsigned int flags)
{
int ret;
struct fd f;
ret = -EBADF;
f = fdget(fd);
if (f.file)
ret = sync_file_range(f.file, offset, nbytes, flags);
fdput(f);
return ret;
}
SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
unsigned int, flags)
{
return ksys_sync_file_range(fd, offset, nbytes, flags);
}
/* It would be nice if people remember that not all the world's an i386
when they introduce new system calls */
SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
loff_t, offset, loff_t, nbytes)
{
return ksys_sync_file_range(fd, offset, nbytes, flags);
}
// SPDX-License-Identifier: GPL-2.0+
/*
* 2002-10-15 Posix Clocks & timers
* by George Anzinger george@mvista.com
* Copyright (C) 2002 2003 by MontaVista Software.
*
* 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
* Copyright (C) 2004 Boris Hu
*
* These are all the functions necessary to implement POSIX clocks & timers
*/
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/mutex.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/hash.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/export.h>
#include <linux/hashtable.h>
#include <linux/compat.h>
#include <linux/nospec.h>
#include <linux/time_namespace.h>
#include "timekeeping.h"
#include "posix-timers.h"
/*
* Management arrays for POSIX timers. Timers are now kept in static hash table
* with 512 entries.
* Timer ids are allocated by local routine, which selects proper hash head by
* key, constructed from current->signal address and per signal struct counter.
* This keeps timer ids unique per process, but now they can intersect between
* processes.
*/
/*
* Lets keep our timers in a slab cache :-)
*/
static struct kmem_cache *posix_timers_cache;
static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
static DEFINE_SPINLOCK(hash_lock);
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;
/*
* we assume that the new SIGEV_THREAD_ID shares no bits with the other
* SIGEV values. Here we put out an error if this assumption fails.
*/
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif
/*
* The timer ID is turned into a timer address by idr_find().
* Verifying a valid ID consists of:
*
* a) checking that idr_find() returns other than -1.
* b) checking that the timer id matches the one in the timer itself.
* c) that the timer owner is in the callers thread group.
*/
/*
* CLOCKs: The POSIX standard calls for a couple of clocks and allows us
* to implement others. This structure defines the various
* clocks.
*
* RESOLUTION: Clock resolution is used to round up timer and interval
* times, NOT to report clock times, which are reported with as
* much resolution as the system can muster. In some cases this
* resolution may depend on the underlying clock hardware and
* may not be quantifiable until run time, and only then is the
* necessary code is written. The standard says we should say
* something about this issue in the documentation...
*
* FUNCTIONS: The CLOCKs structure defines possible functions to
* handle various clock functions.
*
* The standard POSIX timer management code assumes the
* following: 1.) The k_itimer struct (sched.h) is used for
* the timer. 2.) The list, it_lock, it_clock, it_id and
* it_pid fields are not modified by timer code.
*
* Permissions: It is assumed that the clock_settime() function defined
* for each clock will take care of permission checks. Some
* clocks may be set able by any user (i.e. local process
* clocks) others not. Currently the only set able clock we
* have is CLOCK_REALTIME and its high res counter part, both of
* which we beg off on and pass to do_sys_settimeofday().
*/
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
#define lock_timer(tid, flags) \
({ struct k_itimer *__timr; \
__cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
__timr; \
})
static int hash(struct signal_struct *sig, unsigned int nr)
{
return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
}
static struct k_itimer *__posix_timers_find(struct hlist_head *head,
struct signal_struct *sig,
timer_t id)
{
struct k_itimer *timer;
hlist_for_each_entry_rcu(timer, head, t_hash,
lockdep_is_held(&hash_lock)) {
if ((timer->it_signal == sig) && (timer->it_id == id))
return timer;
}
return NULL;
}
static struct k_itimer *posix_timer_by_id(timer_t id)
{
struct signal_struct *sig = current->signal;
struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
return __posix_timers_find(head, sig, id);
}
static int posix_timer_add(struct k_itimer *timer)
{
struct signal_struct *sig = current->signal;
int first_free_id = sig->posix_timer_id;
struct hlist_head *head;
int ret = -ENOENT;
do {
spin_lock(&hash_lock);
head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
hlist_add_head_rcu(&timer->t_hash, head);
ret = sig->posix_timer_id;
}
if (++sig->posix_timer_id < 0)
sig->posix_timer_id = 0;
if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
/* Loop over all possible ids completed */
ret = -EAGAIN;
spin_unlock(&hash_lock);
} while (ret == -ENOENT);
return ret;
}
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
{
spin_unlock_irqrestore(&timr->it_lock, flags);
}
/* Get clock_realtime */
static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_real_ts64(tp);
return 0;
}
static ktime_t posix_get_realtime_ktime(clockid_t which_clock)
{
return ktime_get_real();
}
/* Set clock_realtime */
static int posix_clock_realtime_set(const clockid_t which_clock,
const struct timespec64 *tp)
{
return do_sys_settimeofday64(tp, NULL);
}
static int posix_clock_realtime_adj(const clockid_t which_clock,
struct __kernel_timex *t)
{
return do_adjtimex(t);
}
/*
* Get monotonic time for posix timers
*/
static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_ts64(tp);
timens_add_monotonic(tp);
return 0;
}
static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
{
return ktime_get();
}
/*
* Get monotonic-raw time for posix timers
*/
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_raw_ts64(tp);
timens_add_monotonic(tp);
return 0;
}
static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_coarse_real_ts64(tp);
return 0;
}
static int posix_get_monotonic_coarse(clockid_t which_clock,
struct timespec64 *tp)
{
ktime_get_coarse_ts64(tp);
timens_add_monotonic(tp);
return 0;
}
static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp)
{
*tp = ktime_to_timespec64(KTIME_LOW_RES);
return 0;
}
static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_boottime_ts64(tp);
timens_add_boottime(tp);
return 0;
}
static ktime_t posix_get_boottime_ktime(const clockid_t which_clock)
{
return ktime_get_boottime();
}
static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_clocktai_ts64(tp);
return 0;
}
static ktime_t posix_get_tai_ktime(clockid_t which_clock)
{
return ktime_get_clocktai();
}
static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
{
tp->tv_sec = 0;
tp->tv_nsec = hrtimer_resolution;
return 0;
}
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
static __init int init_posix_timers(void)
{
posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof(struct k_itimer), 0,
SLAB_PANIC | SLAB_ACCOUNT, NULL);
return 0;
}
__initcall(init_posix_timers);
/*
* The siginfo si_overrun field and the return value of timer_getoverrun(2)
* are of type int. Clamp the overrun value to INT_MAX
*/
static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval)
{
s64 sum = timr->it_overrun_last + (s64)baseval;
return sum > (s64)INT_MAX ? INT_MAX : (int)sum;
}
static void common_hrtimer_rearm(struct k_itimer *timr)
{
struct hrtimer *timer = &timr->it.real.timer;
timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
timr->it_interval);
hrtimer_restart(timer);
}
/*
* This function is exported for use by the signal deliver code. It is
* called just prior to the info block being released and passes that
* block to us. It's function is to update the overrun entry AND to
* restart the timer. It should only be called if the timer is to be
* restarted (i.e. we have flagged this in the sys_private entry of the
* info block).
*
* To protect against the timer going away while the interrupt is queued,
* we require that the it_requeue_pending flag be set.
*/
void posixtimer_rearm(struct kernel_siginfo *info)
{
struct k_itimer *timr;
unsigned long flags;
timr = lock_timer(info->si_tid, &flags);
if (!timr)
return;
if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) {
timr->kclock->timer_rearm(timr);
timr->it_active = 1;
timr->it_overrun_last = timr->it_overrun;
timr->it_overrun = -1LL;
++timr->it_requeue_pending;
info->si_overrun = timer_overrun_to_int(timr, info->si_overrun);
}
unlock_timer(timr, flags);
}
int posix_timer_event(struct k_itimer *timr, int si_private)
{
enum pid_type type;
int ret;
/*
* FIXME: if ->sigq is queued we can race with
* dequeue_signal()->posixtimer_rearm().
*
* If dequeue_signal() sees the "right" value of
* si_sys_private it calls posixtimer_rearm().
* We re-queue ->sigq and drop ->it_lock().
* posixtimer_rearm() locks the timer
* and re-schedules it while ->sigq is pending.
* Not really bad, but not that we want.
*/
timr->sigq->info.si_sys_private = si_private;
type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID;
ret = send_sigqueue(timr->sigq, timr->it_pid, type);
/* If we failed to send the signal the timer stops. */
return ret > 0;
}
/*
* This function gets called when a POSIX.1b interval timer expires. It
* is used as a callback from the kernel internal timer. The
* run_timer_list code ALWAYS calls with interrupts on.
* This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
*/
static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
{
struct k_itimer *timr;
unsigned long flags;
int si_private = 0;
enum hrtimer_restart ret = HRTIMER_NORESTART;
timr = container_of(timer, struct k_itimer, it.real.timer);
spin_lock_irqsave(&timr->it_lock, flags);
timr->it_active = 0;
if (timr->it_interval != 0)
si_private = ++timr->it_requeue_pending;
if (posix_timer_event(timr, si_private)) {
/*
* signal was not sent because of sig_ignor
* we will not get a call back to restart it AND
* it should be restarted.
*/
if (timr->it_interval != 0) {
ktime_t now = hrtimer_cb_get_time(timer);
/*
* FIXME: What we really want, is to stop this
* timer completely and restart it in case the
* SIG_IGN is removed. This is a non trivial
* change which involves sighand locking
* (sigh !), which we don't want to do late in
* the release cycle.
*
* For now we just let timers with an interval
* less than a jiffie expire every jiffie to
* avoid softirq starvation in case of SIG_IGN
* and a very small interval, which would put
* the timer right back on the softirq pending
* list. By moving now ahead of time we trick
* hrtimer_forward() to expire the timer
* later, while we still maintain the overrun
* accuracy, but have some inconsistency in
* the timer_gettime() case. This is at least
* better than a starved softirq. A more
* complex fix which solves also another related
* inconsistency is already in the pipeline.
*/
#ifdef CONFIG_HIGH_RES_TIMERS
{
ktime_t kj = NSEC_PER_SEC / HZ;
if (timr->it_interval < kj)
now = ktime_add(now, kj);
}
#endif
timr->it_overrun += hrtimer_forward(timer, now,
timr->it_interval);
ret = HRTIMER_RESTART;
++timr->it_requeue_pending;
timr->it_active = 1;
}
}
unlock_timer(timr, flags);
return ret;
}
static struct pid *good_sigevent(sigevent_t * event)
{
struct pid *pid = task_tgid(current);
struct task_struct *rtn;
switch (event->sigev_notify) {
case SIGEV_SIGNAL | SIGEV_THREAD_ID:
pid = find_vpid(event->sigev_notify_thread_id);
rtn = pid_task(pid, PIDTYPE_PID);
if (!rtn || !same_thread_group(rtn, current))
return NULL;
fallthrough;
case SIGEV_SIGNAL:
case SIGEV_THREAD:
if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
return NULL;
fallthrough;
case SIGEV_NONE:
return pid;
default:
return NULL;
}
}
static struct k_itimer * alloc_posix_timer(void)
{
struct k_itimer *tmr;
tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
if (!tmr)
return tmr;
if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
kmem_cache_free(posix_timers_cache, tmr);
return NULL;
}
clear_siginfo(&tmr->sigq->info);
return tmr;
}
static void k_itimer_rcu_free(struct rcu_head *head)
{
struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);
kmem_cache_free(posix_timers_cache, tmr);
}
#define IT_ID_SET 1
#define IT_ID_NOT_SET 0
static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
{
if (it_id_set) {
unsigned long flags;
spin_lock_irqsave(&hash_lock, flags);
hlist_del_rcu(&tmr->t_hash);
spin_unlock_irqrestore(&hash_lock, flags);
}
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
call_rcu(&tmr->rcu, k_itimer_rcu_free);
}
static int common_timer_create(struct k_itimer *new_timer)
{
hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
return 0;
}
/* Create a POSIX.1b interval timer. */
static int do_timer_create(clockid_t which_clock, struct sigevent *event,
timer_t __user *created_timer_id)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct k_itimer *new_timer;
int error, new_timer_id;
int it_id_set = IT_ID_NOT_SET;
if (!kc)
return -EINVAL;
if (!kc->timer_create)
return -EOPNOTSUPP;
new_timer = alloc_posix_timer();
if (unlikely(!new_timer))
return -EAGAIN;
spin_lock_init(&new_timer->it_lock);
new_timer_id = posix_timer_add(new_timer);
if (new_timer_id < 0) {
error = new_timer_id;
goto out;
}
it_id_set = IT_ID_SET;
new_timer->it_id = (timer_t) new_timer_id;
new_timer->it_clock = which_clock;
new_timer->kclock = kc;
new_timer->it_overrun = -1LL;
if (event) {
rcu_read_lock();
new_timer->it_pid = get_pid(good_sigevent(event));
rcu_read_unlock();
if (!new_timer->it_pid) {
error = -EINVAL;
goto out;
}
new_timer->it_sigev_notify = event->sigev_notify;
new_timer->sigq->info.si_signo = event->sigev_signo;
new_timer->sigq->info.si_value = event->sigev_value;
} else {
new_timer->it_sigev_notify = SIGEV_SIGNAL;
new_timer->sigq->info.si_signo = SIGALRM;
memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t));
new_timer->sigq->info.si_value.sival_int = new_timer->it_id;
new_timer->it_pid = get_pid(task_tgid(current));
}
new_timer->sigq->info.si_tid = new_timer->it_id;
new_timer->sigq->info.si_code = SI_TIMER;
if (copy_to_user(created_timer_id,
&new_timer_id, sizeof (new_timer_id))) {
error = -EFAULT;
goto out;
}
error = kc->timer_create(new_timer);
if (error)
goto out;
spin_lock_irq(¤t->sighand->siglock);
new_timer->it_signal = current->signal;
list_add(&new_timer->list, ¤t->signal->posix_timers);
spin_unlock_irq(¤t->sighand->siglock);
return 0;
/*
* In the case of the timer belonging to another task, after
* the task is unlocked, the timer is owned by the other task
* and may cease to exist at any time. Don't use or modify
* new_timer after the unlock call.
*/
out:
release_posix_timer(new_timer, it_id_set);
return error;
}
SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
struct sigevent __user *, timer_event_spec,
timer_t __user *, created_timer_id)
{
if (timer_event_spec) {
sigevent_t event;
if (copy_from_user(&event, timer_event_spec, sizeof (event)))
return -EFAULT;
return do_timer_create(which_clock, &event, created_timer_id);
}
return do_timer_create(which_clock, NULL, created_timer_id);
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
struct compat_sigevent __user *, timer_event_spec,
timer_t __user *, created_timer_id)
{
if (timer_event_spec) {
sigevent_t event;
if (get_compat_sigevent(&event, timer_event_spec))
return -EFAULT;
return do_timer_create(which_clock, &event, created_timer_id);
}
return do_timer_create(which_clock, NULL, created_timer_id);
}
#endif
/*
* Locking issues: We need to protect the result of the id look up until
* we get the timer locked down so it is not deleted under us. The
* removal is done under the idr spinlock so we use that here to bridge
* the find to the timer lock. To avoid a dead lock, the timer id MUST
* be release with out holding the timer lock.
*/
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
/*
* timer_t could be any type >= int and we want to make sure any
* @timer_id outside positive int range fails lookup.
*/
if ((unsigned long long)timer_id > INT_MAX)
return NULL;
rcu_read_lock();
timr = posix_timer_by_id(timer_id);
if (timr) {
spin_lock_irqsave(&timr->it_lock, *flags);
if (timr->it_signal == current->signal) {
rcu_read_unlock();
return timr;
}
spin_unlock_irqrestore(&timr->it_lock, *flags);
}
rcu_read_unlock();
return NULL;
}
static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now)
{
struct hrtimer *timer = &timr->it.real.timer;
return __hrtimer_expires_remaining_adjusted(timer, now);
}
static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now)
{
struct hrtimer *timer = &timr->it.real.timer;
return hrtimer_forward(timer, now, timr->it_interval);
}
/*
* Get the time remaining on a POSIX.1b interval timer. This function
* is ALWAYS called with spin_lock_irq on the timer, thus it must not
* mess with irq.
*
* We have a couple of messes to clean up here. First there is the case
* of a timer that has a requeue pending. These timers should appear to
* be in the timer list with an expiry as if we were to requeue them
* now.
*
* The second issue is the SIGEV_NONE timer which may be active but is
* not really ever put in the timer list (to save system resources).
* This timer may be expired, and if so, we will do it here. Otherwise
* it is the same as a requeue pending timer WRT to what we should
* report.
*/
void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
const struct k_clock *kc = timr->kclock;
ktime_t now, remaining, iv;
bool sig_none;
sig_none = timr->it_sigev_notify == SIGEV_NONE;
iv = timr->it_interval;
/* interval timer ? */
if (iv) {
cur_setting->it_interval = ktime_to_timespec64(iv);
} else if (!timr->it_active) {
/*
* SIGEV_NONE oneshot timers are never queued. Check them
* below.
*/
if (!sig_none)
return;
}
now = kc->clock_get_ktime(timr->it_clock);
/*
* When a requeue is pending or this is a SIGEV_NONE timer move the
* expiry time forward by intervals, so expiry is > now.
*/
if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none))
timr->it_overrun += kc->timer_forward(timr, now);
remaining = kc->timer_remaining(timr, now);
/* Return 0 only, when the timer is expired and not pending */
if (remaining <= 0) {
/*
* A single shot SIGEV_NONE timer must return 0, when
* it is expired !
*/
if (!sig_none)
cur_setting->it_value.tv_nsec = 1;
} else {
cur_setting->it_value = ktime_to_timespec64(remaining);
}
}
/* Get the time remaining on a POSIX.1b interval timer. */
static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
{
struct k_itimer *timr;
const struct k_clock *kc;
unsigned long flags;
int ret = 0;
timr = lock_timer(timer_id, &flags);
if (!timr)
return -EINVAL;
memset(setting, 0, sizeof(*setting));
kc = timr->kclock;
if (WARN_ON_ONCE(!kc || !kc->timer_get))
ret = -EINVAL;
else
kc->timer_get(timr, setting);
unlock_timer(timr, flags);
return ret;
}
/* Get the time remaining on a POSIX.1b interval timer. */
SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
struct __kernel_itimerspec __user *, setting)
{
struct itimerspec64 cur_setting;
int ret = do_timer_gettime(timer_id, &cur_setting);
if (!ret) {
if (put_itimerspec64(&cur_setting, setting))
ret = -EFAULT;
}
return ret;
}
#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
struct old_itimerspec32 __user *, setting)
{
struct itimerspec64 cur_setting;
int ret = do_timer_gettime(timer_id, &cur_setting);
if (!ret) {
if (put_old_itimerspec32(&cur_setting, setting))
ret = -EFAULT;
}
return ret;
}
#endif
/*
* Get the number of overruns of a POSIX.1b interval timer. This is to
* be the overrun of the timer last delivered. At the same time we are
* accumulating overruns on the next timer. The overrun is frozen when
* the signal is delivered, either at the notify time (if the info block
* is not queued) or at the actual delivery time (as we are informed by
* the call back to posixtimer_rearm(). So all we need to do is
* to pick up the frozen overrun.
*/
SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
{
struct k_itimer *timr;
int overrun;
unsigned long flags;
timr = lock_timer(timer_id, &flags);
if (!timr)
return -EINVAL;
overrun = timer_overrun_to_int(timr, 0);
unlock_timer(timr, flags);
return overrun;
}
static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
bool absolute, bool sigev_none)
{
struct hrtimer *timer = &timr->it.real.timer;
enum hrtimer_mode mode;
mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
/*
* Posix magic: Relative CLOCK_REALTIME timers are not affected by
* clock modifications, so they become CLOCK_MONOTONIC based under the
* hood. See hrtimer_init(). Update timr->kclock, so the generic
* functions which use timr->kclock->clock_get_*() work.
*
* Note: it_clock stays unmodified, because the next timer_set() might
* use ABSTIME, so it needs to switch back.
*/
if (timr->it_clock == CLOCK_REALTIME)
timr->kclock = absolute ? &clock_realtime : &clock_monotonic;
hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
timr->it.real.timer.function = posix_timer_fn;
if (!absolute)
expires = ktime_add_safe(expires, timer->base->get_time());
hrtimer_set_expires(timer, expires);
if (!sigev_none)
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
{
return hrtimer_try_to_cancel(&timr->it.real.timer);
}
static void common_timer_wait_running(struct k_itimer *timer)
{
hrtimer_cancel_wait_running(&timer->it.real.timer);
}
/*
* On PREEMPT_RT this prevent priority inversion against softirq kthread in
* case it gets preempted while executing a timer callback. See comments in
* hrtimer_cancel_wait_running. For PREEMPT_RT=n this just results in a
* cpu_relax().
*/
static struct k_itimer *timer_wait_running(struct k_itimer *timer,
unsigned long *flags)
{
const struct k_clock *kc = READ_ONCE(timer->kclock);
timer_t timer_id = READ_ONCE(timer->it_id);
/* Prevent kfree(timer) after dropping the lock */
rcu_read_lock();
unlock_timer(timer, *flags);
if (!WARN_ON_ONCE(!kc->timer_wait_running))
kc->timer_wait_running(timer);
rcu_read_unlock();
/* Relock the timer. It might be not longer hashed. */
return lock_timer(timer_id, flags);
}
/* Set a POSIX.1b interval timer. */
int common_timer_set(struct k_itimer *timr, int flags,
struct itimerspec64 *new_setting,
struct itimerspec64 *old_setting)
{
const struct k_clock *kc = timr->kclock;
bool sigev_none;
ktime_t expires;
if (old_setting)
common_timer_get(timr, old_setting);
/* Prevent rearming by clearing the interval */
timr->it_interval = 0;
/*
* Careful here. On SMP systems the timer expiry function could be
* active and spinning on timr->it_lock.
*/
if (kc->timer_try_to_cancel(timr) < 0)
return TIMER_RETRY;
timr->it_active = 0;
timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
~REQUEUE_PENDING;
timr->it_overrun_last = 0;
/* Switch off the timer when it_value is zero */
if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
return 0;
timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
expires = timespec64_to_ktime(new_setting->it_value);
if (flags & TIMER_ABSTIME)
expires = timens_ktime_to_host(timr->it_clock, expires);
sigev_none = timr->it_sigev_notify == SIGEV_NONE;
kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
timr->it_active = !sigev_none;
return 0;
}
static int do_timer_settime(timer_t timer_id, int tmr_flags,
struct itimerspec64 *new_spec64,
struct itimerspec64 *old_spec64)
{
const struct k_clock *kc;
struct k_itimer *timr;
unsigned long flags;
int error = 0;
if (!timespec64_valid(&new_spec64->it_interval) ||
!timespec64_valid(&new_spec64->it_value))
return -EINVAL;
if (old_spec64)
memset(old_spec64, 0, sizeof(*old_spec64));
timr = lock_timer(timer_id, &flags);
retry:
if (!timr)
return -EINVAL;
kc = timr->kclock;
if (WARN_ON_ONCE(!kc || !kc->timer_set))
error = -EINVAL;
else
error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64);
if (error == TIMER_RETRY) {
// We already got the old time...
old_spec64 = NULL;
/* Unlocks and relocks the timer if it still exists */
timr = timer_wait_running(timr, &flags);
goto retry;
}
unlock_timer(timr, flags);
return error;
}
/* Set a POSIX.1b interval timer */
SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
const struct __kernel_itimerspec __user *, new_setting,
struct __kernel_itimerspec __user *, old_setting)
{
struct itimerspec64 new_spec, old_spec;
struct itimerspec64 *rtn = old_setting ? &old_spec : NULL;
int error = 0;
if (!new_setting)
return -EINVAL;
if (get_itimerspec64(&new_spec, new_setting))
return -EFAULT;
error = do_timer_settime(timer_id, flags, &new_spec, rtn);
if (!error && old_setting) {
if (put_itimerspec64(&old_spec, old_setting))
error = -EFAULT;
}
return error;
}
#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags,
struct old_itimerspec32 __user *, new,
struct old_itimerspec32 __user *, old)
{
struct itimerspec64 new_spec, old_spec;
struct itimerspec64 *rtn = old ? &old_spec : NULL;
int error = 0;
if (!new)
return -EINVAL;
if (get_old_itimerspec32(&new_spec, new))
return -EFAULT;
error = do_timer_settime(timer_id, flags, &new_spec, rtn);
if (!error && old) {
if (put_old_itimerspec32(&old_spec, old))
error = -EFAULT;
}
return error;
}
#endif
int common_timer_del(struct k_itimer *timer)
{
const struct k_clock *kc = timer->kclock;
timer->it_interval = 0;
if (kc->timer_try_to_cancel(timer) < 0)
return TIMER_RETRY;
timer->it_active = 0;
return 0;
}
static inline int timer_delete_hook(struct k_itimer *timer)
{
const struct k_clock *kc = timer->kclock;
if (WARN_ON_ONCE(!kc || !kc->timer_del))
return -EINVAL;
return kc->timer_del(timer);
}
/* Delete a POSIX.1b interval timer. */
SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
{
struct k_itimer *timer;
unsigned long flags;
timer = lock_timer(timer_id, &flags);
retry_delete:
if (!timer)
return -EINVAL;
if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) {
/* Unlocks and relocks the timer if it still exists */
timer = timer_wait_running(timer, &flags);
goto retry_delete;
}
spin_lock(¤t->sighand->siglock);
list_del(&timer->list);
spin_unlock(¤t->sighand->siglock);
/*
* This keeps any tasks waiting on the spin lock from thinking
* they got something (see the lock code above).
*/
timer->it_signal = NULL;
unlock_timer(timer, flags);
release_posix_timer(timer, IT_ID_SET);
return 0;
}
/*
* return timer owned by the process, used by exit_itimers
*/
static void itimer_delete(struct k_itimer *timer)
{
retry_delete:
spin_lock_irq(&timer->it_lock);
if (timer_delete_hook(timer) == TIMER_RETRY) {
spin_unlock_irq(&timer->it_lock);
goto retry_delete;
}
list_del(&timer->list);
spin_unlock_irq(&timer->it_lock);
release_posix_timer(timer, IT_ID_SET);
}
/*
* This is called by do_exit or de_thread, only when there are no more
* references to the shared signal_struct.
*/
void exit_itimers(struct signal_struct *sig)
{
struct k_itimer *tmr;
while (!list_empty(&sig->posix_timers)) {
tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
itimer_delete(tmr);
}
}
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct __kernel_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 new_tp;
if (!kc || !kc->clock_set)
return -EINVAL;
if (get_timespec64(&new_tp, tp))
return -EFAULT;
return kc->clock_set(which_clock, &new_tp);
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
struct __kernel_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 kernel_tp;
int error;
if (!kc)
return -EINVAL;
error = kc->clock_get_timespec(which_clock, &kernel_tp);
if (!error && put_timespec64(&kernel_tp, tp))
error = -EFAULT;
return error;
}
int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
if (!kc)
return -EINVAL;
if (!kc->clock_adj)
return -EOPNOTSUPP;
return kc->clock_adj(which_clock, ktx);
}
SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
struct __kernel_timex __user *, utx)
{
struct __kernel_timex ktx;
int err;
if (copy_from_user(&ktx, utx, sizeof(ktx)))
return -EFAULT;
err = do_clock_adjtime(which_clock, &ktx);
if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
return -EFAULT;
return err;
}
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct __kernel_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 rtn_tp;
int error;
if (!kc)
return -EINVAL;
error = kc->clock_getres(which_clock, &rtn_tp);
if (!error && tp && put_timespec64(&rtn_tp, tp))
error = -EFAULT;
return error;
}
#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock,
struct old_timespec32 __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 ts;
if (!kc || !kc->clock_set)
return -EINVAL;
if (get_old_timespec32(&ts, tp))
return -EFAULT;
return kc->clock_set(which_clock, &ts);
}
SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
struct old_timespec32 __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 ts;
int err;
if (!kc)
return -EINVAL;
err = kc->clock_get_timespec(which_clock, &ts);
if (!err && put_old_timespec32(&ts, tp))
err = -EFAULT;
return err;
}
SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock,
struct old_timex32 __user *, utp)
{
struct __kernel_timex ktx;
int err;
err = get_old_timex32(&ktx, utp);
if (err)
return err;
err = do_clock_adjtime(which_clock, &ktx);
if (err >= 0 && put_old_timex32(utp, &ktx))
return -EFAULT;
return err;
}
SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
struct old_timespec32 __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 ts;
int err;
if (!kc)
return -EINVAL;
err = kc->clock_getres(which_clock, &ts);
if (!err && tp && put_old_timespec32(&ts, tp))
return -EFAULT;
return err;
}
#endif
/*
* nanosleep for monotonic and realtime clocks
*/
static int common_nsleep(const clockid_t which_clock, int flags,
const struct timespec64 *rqtp)
{
ktime_t texp = timespec64_to_ktime(*rqtp);
return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
static int common_nsleep_timens(const clockid_t which_clock, int flags,
const struct timespec64 *rqtp)
{
ktime_t texp = timespec64_to_ktime(*rqtp);
if (flags & TIMER_ABSTIME)
texp = timens_ktime_to_host(which_clock, texp);
return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
const struct __kernel_timespec __user *, rqtp,
struct __kernel_timespec __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 t;
if (!kc) return -EINVAL; if (!kc->nsleep)
return -EOPNOTSUPP;
if (get_timespec64(&t, rqtp))
return -EFAULT;
if (!timespec64_valid(&t))
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
return kc->nsleep(which_clock, flags, &t);
}
#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
struct old_timespec32 __user *, rqtp,
struct old_timespec32 __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 t;
if (!kc)
return -EINVAL;
if (!kc->nsleep)
return -EOPNOTSUPP;
if (get_old_timespec32(&t, rqtp))
return -EFAULT;
if (!timespec64_valid(&t))
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
return kc->nsleep(which_clock, flags, &t);
}
#endif
static const struct k_clock clock_realtime = {
.clock_getres = posix_get_hrtimer_res,
.clock_get_timespec = posix_get_realtime_timespec,
.clock_get_ktime = posix_get_realtime_ktime,
.clock_set = posix_clock_realtime_set,
.clock_adj = posix_clock_realtime_adj,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
.timer_get = common_timer_get,
.timer_del = common_timer_del,
.timer_rearm = common_hrtimer_rearm,
.timer_forward = common_hrtimer_forward,
.timer_remaining = common_hrtimer_remaining,
.timer_try_to_cancel = common_hrtimer_try_to_cancel,
.timer_wait_running = common_timer_wait_running,
.timer_arm = common_hrtimer_arm,
};
static const struct k_clock clock_monotonic = {
.clock_getres = posix_get_hrtimer_res,
.clock_get_timespec = posix_get_monotonic_timespec,
.clock_get_ktime = posix_get_monotonic_ktime,
.nsleep = common_nsleep_timens,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
.timer_get = common_timer_get,
.timer_del = common_timer_del,
.timer_rearm = common_hrtimer_rearm,
.timer_forward = common_hrtimer_forward,
.timer_remaining = common_hrtimer_remaining,
.timer_try_to_cancel = common_hrtimer_try_to_cancel,
.timer_wait_running = common_timer_wait_running,
.timer_arm = common_hrtimer_arm,
};
static const struct k_clock clock_monotonic_raw = {
.clock_getres = posix_get_hrtimer_res,
.clock_get_timespec = posix_get_monotonic_raw,
};
static const struct k_clock clock_realtime_coarse = {
.clock_getres = posix_get_coarse_res,
.clock_get_timespec = posix_get_realtime_coarse,
};
static const struct k_clock clock_monotonic_coarse = {
.clock_getres = posix_get_coarse_res,
.clock_get_timespec = posix_get_monotonic_coarse,
};
static const struct k_clock clock_tai = {
.clock_getres = posix_get_hrtimer_res,
.clock_get_ktime = posix_get_tai_ktime,
.clock_get_timespec = posix_get_tai_timespec,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
.timer_get = common_timer_get,
.timer_del = common_timer_del,
.timer_rearm = common_hrtimer_rearm,
.timer_forward = common_hrtimer_forward,
.timer_remaining = common_hrtimer_remaining,
.timer_try_to_cancel = common_hrtimer_try_to_cancel,
.timer_wait_running = common_timer_wait_running,
.timer_arm = common_hrtimer_arm,
};
static const struct k_clock clock_boottime = {
.clock_getres = posix_get_hrtimer_res,
.clock_get_ktime = posix_get_boottime_ktime,
.clock_get_timespec = posix_get_boottime_timespec,
.nsleep = common_nsleep_timens,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
.timer_get = common_timer_get,
.timer_del = common_timer_del,
.timer_rearm = common_hrtimer_rearm,
.timer_forward = common_hrtimer_forward,
.timer_remaining = common_hrtimer_remaining,
.timer_try_to_cancel = common_hrtimer_try_to_cancel,
.timer_wait_running = common_timer_wait_running,
.timer_arm = common_hrtimer_arm,
};
static const struct k_clock * const posix_clocks[] = {
[CLOCK_REALTIME] = &clock_realtime,
[CLOCK_MONOTONIC] = &clock_monotonic,
[CLOCK_PROCESS_CPUTIME_ID] = &clock_process,
[CLOCK_THREAD_CPUTIME_ID] = &clock_thread,
[CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw,
[CLOCK_REALTIME_COARSE] = &clock_realtime_coarse,
[CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse,
[CLOCK_BOOTTIME] = &clock_boottime,
[CLOCK_REALTIME_ALARM] = &alarm_clock,
[CLOCK_BOOTTIME_ALARM] = &alarm_clock,
[CLOCK_TAI] = &clock_tai,
};
static const struct k_clock *clockid_to_kclock(const clockid_t id)
{
clockid_t idx = id;
if (id < 0) {
return (id & CLOCKFD_MASK) == CLOCKFD ?
&clock_posix_dynamic : &clock_posix_cpu;
}
if (id >= ARRAY_SIZE(posix_clocks))
return NULL;
return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))];
}
// SPDX-License-Identifier: GPL-2.0
/*
* Floating proportions with flexible aging period
*
* Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
*
* The goal of this code is: Given different types of event, measure proportion
* of each type of event over time. The proportions are measured with
* exponentially decaying history to give smooth transitions. A formula
* expressing proportion of event of type 'j' is:
*
* p_{j} = (\Sum_{i>=0} x_{i,j}/2^{i+1})/(\Sum_{i>=0} x_i/2^{i+1})
*
* Where x_{i,j} is j's number of events in i-th last time period and x_i is
* total number of events in i-th last time period.
*
* Note that p_{j}'s are normalised, i.e.
*
* \Sum_{j} p_{j} = 1,
*
* This formula can be straightforwardly computed by maintaining denominator
* (let's call it 'd') and for each event type its numerator (let's call it
* 'n_j'). When an event of type 'j' happens, we simply need to do:
* n_j++; d++;
*
* When a new period is declared, we could do:
* d /= 2
* for each j
* n_j /= 2
*
* To avoid iteration over all event types, we instead shift numerator of event
* j lazily when someone asks for a proportion of event j or when event j
* occurs. This can bit trivially implemented by remembering last period in
* which something happened with proportion of type j.
*/
#include <linux/flex_proportions.h>
int fprop_global_init(struct fprop_global *p, gfp_t gfp)
{
int err;
p->period = 0;
/* Use 1 to avoid dealing with periods with 0 events... */
err = percpu_counter_init(&p->events, 1, gfp);
if (err)
return err;
seqcount_init(&p->sequence);
return 0;
}
void fprop_global_destroy(struct fprop_global *p)
{
percpu_counter_destroy(&p->events);
}
/*
* Declare @periods new periods. It is upto the caller to make sure period
* transitions cannot happen in parallel.
*
* The function returns true if the proportions are still defined and false
* if aging zeroed out all events. This can be used to detect whether declaring
* further periods has any effect.
*/
bool fprop_new_period(struct fprop_global *p, int periods)
{
s64 events;
unsigned long flags;
local_irq_save(flags);
events = percpu_counter_sum(&p->events);
/*
* Don't do anything if there are no events.
*/
if (events <= 1) {
local_irq_restore(flags);
return false;
}
write_seqcount_begin(&p->sequence);
if (periods < 64)
events -= events >> periods;
/* Use addition to avoid losing events happening between sum and set */
percpu_counter_add(&p->events, -events);
p->period += periods;
write_seqcount_end(&p->sequence);
local_irq_restore(flags);
return true;
}
/*
* ---- SINGLE ----
*/
int fprop_local_init_single(struct fprop_local_single *pl)
{
pl->events = 0;
pl->period = 0;
raw_spin_lock_init(&pl->lock);
return 0;
}
void fprop_local_destroy_single(struct fprop_local_single *pl)
{
}
static void fprop_reflect_period_single(struct fprop_global *p,
struct fprop_local_single *pl)
{
unsigned int period = p->period;
unsigned long flags;
/* Fast path - period didn't change */
if (pl->period == period)
return;
raw_spin_lock_irqsave(&pl->lock, flags);
/* Someone updated pl->period while we were spinning? */
if (pl->period >= period) {
raw_spin_unlock_irqrestore(&pl->lock, flags);
return;
}
/* Aging zeroed our fraction? */
if (period - pl->period < BITS_PER_LONG)
pl->events >>= period - pl->period;
else
pl->events = 0;
pl->period = period;
raw_spin_unlock_irqrestore(&pl->lock, flags);
}
/* Event of type pl happened */
void __fprop_inc_single(struct fprop_global *p, struct fprop_local_single *pl)
{
fprop_reflect_period_single(p, pl);
pl->events++;
percpu_counter_add(&p->events, 1);
}
/* Return fraction of events of type pl */
void fprop_fraction_single(struct fprop_global *p,
struct fprop_local_single *pl,
unsigned long *numerator, unsigned long *denominator)
{
unsigned int seq;
s64 num, den;
do {
seq = read_seqcount_begin(&p->sequence);
fprop_reflect_period_single(p, pl);
num = pl->events;
den = percpu_counter_read_positive(&p->events);
} while (read_seqcount_retry(&p->sequence, seq));
/*
* Make fraction <= 1 and denominator > 0 even in presence of percpu
* counter errors
*/
if (den <= num) {
if (num)
den = num;
else
den = 1;
}
*denominator = den;
*numerator = num;
}
/*
* ---- PERCPU ----
*/
#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp)
{
int err;
err = percpu_counter_init(&pl->events, 0, gfp);
if (err)
return err;
pl->period = 0;
raw_spin_lock_init(&pl->lock);
return 0;
}
void fprop_local_destroy_percpu(struct fprop_local_percpu *pl)
{
percpu_counter_destroy(&pl->events);
}
static void fprop_reflect_period_percpu(struct fprop_global *p,
struct fprop_local_percpu *pl)
{
unsigned int period = p->period;
unsigned long flags;
/* Fast path - period didn't change */
if (pl->period == period)
return;
raw_spin_lock_irqsave(&pl->lock, flags);
/* Someone updated pl->period while we were spinning? */
if (pl->period >= period) {
raw_spin_unlock_irqrestore(&pl->lock, flags);
return;
}
/* Aging zeroed our fraction? */
if (period - pl->period < BITS_PER_LONG) { s64 val = percpu_counter_read(&pl->events);
if (val < (nr_cpu_ids * PROP_BATCH))
val = percpu_counter_sum(&pl->events);
percpu_counter_add_batch(&pl->events,
-val + (val >> (period-pl->period)), PROP_BATCH);
} else
percpu_counter_set(&pl->events, 0); pl->period = period;
raw_spin_unlock_irqrestore(&pl->lock, flags);
}
/* Event of type pl happened */
void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
{
fprop_reflect_period_percpu(p, pl);
percpu_counter_add_batch(&pl->events, 1, PROP_BATCH);
percpu_counter_add(&p->events, 1);
}
void fprop_fraction_percpu(struct fprop_global *p,
struct fprop_local_percpu *pl,
unsigned long *numerator, unsigned long *denominator)
{
unsigned int seq;
s64 num, den;
do {
seq = read_seqcount_begin(&p->sequence);
fprop_reflect_period_percpu(p, pl);
num = percpu_counter_read_positive(&pl->events);
den = percpu_counter_read_positive(&p->events);
} while (read_seqcount_retry(&p->sequence, seq));
/*
* Make fraction <= 1 and denominator > 0 even in presence of percpu
* counter errors
*/
if (den <= num) {
if (num)
den = num;
else
den = 1;
}
*denominator = den;
*numerator = num;
}
/*
* Like __fprop_inc_percpu() except that event is counted only if the given
* type has fraction smaller than @max_frac/FPROP_FRAC_BASE
*/
void __fprop_inc_percpu_max(struct fprop_global *p,
struct fprop_local_percpu *pl, int max_frac)
{
if (unlikely(max_frac < FPROP_FRAC_BASE)) {
unsigned long numerator, denominator;
fprop_fraction_percpu(p, pl, &numerator, &denominator); if (numerator >
(((u64)denominator) * max_frac) >> FPROP_FRAC_SHIFT)
return;
}
__fprop_inc_percpu(p, pl);
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_GFP_H
#define __LINUX_GFP_H
#include <linux/mmdebug.h>
#include <linux/mmzone.h>
#include <linux/stddef.h>
#include <linux/linkage.h>
#include <linux/topology.h>
/* The typedef is in types.h but we want the documentation here */
#if 0
/**
* typedef gfp_t - Memory allocation flags.
*
* GFP flags are commonly used throughout Linux to indicate how memory
* should be allocated. The GFP acronym stands for get_free_pages(),
* the underlying memory allocation function. Not every GFP flag is
* supported by every function which may allocate memory. Most users
* will want to use a plain ``GFP_KERNEL``.
*/
typedef unsigned int __bitwise gfp_t;
#endif
struct vm_area_struct;
/*
* In case of changes, please don't forget to update
* include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
*/
/* Plain integer GFP bitmasks. Do not use this directly. */
#define ___GFP_DMA 0x01u
#define ___GFP_HIGHMEM 0x02u
#define ___GFP_DMA32 0x04u
#define ___GFP_MOVABLE 0x08u
#define ___GFP_RECLAIMABLE 0x10u
#define ___GFP_HIGH 0x20u
#define ___GFP_IO 0x40u
#define ___GFP_FS 0x80u
#define ___GFP_ZERO 0x100u
#define ___GFP_ATOMIC 0x200u
#define ___GFP_DIRECT_RECLAIM 0x400u
#define ___GFP_KSWAPD_RECLAIM 0x800u
#define ___GFP_WRITE 0x1000u
#define ___GFP_NOWARN 0x2000u
#define ___GFP_RETRY_MAYFAIL 0x4000u
#define ___GFP_NOFAIL 0x8000u
#define ___GFP_NORETRY 0x10000u
#define ___GFP_MEMALLOC 0x20000u
#define ___GFP_COMP 0x40000u
#define ___GFP_NOMEMALLOC 0x80000u
#define ___GFP_HARDWALL 0x100000u
#define ___GFP_THISNODE 0x200000u
#define ___GFP_ACCOUNT 0x400000u
#define ___GFP_ZEROTAGS 0x800000u
#define ___GFP_SKIP_KASAN_POISON 0x1000000u
#ifdef CONFIG_LOCKDEP
#define ___GFP_NOLOCKDEP 0x2000000u
#else
#define ___GFP_NOLOCKDEP 0
#endif
/* If the above are modified, __GFP_BITS_SHIFT may need updating */
/*
* Physical address zone modifiers (see linux/mmzone.h - low four bits)
*
* Do not put any conditional on these. If necessary modify the definitions
* without the underscores and use them consistently. The definitions here may
* be used in bit comparisons.
*/
#define __GFP_DMA ((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */
#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
/**
* DOC: Page mobility and placement hints
*
* Page mobility and placement hints
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* These flags provide hints about how mobile the page is. Pages with similar
* mobility are placed within the same pageblocks to minimise problems due
* to external fragmentation.
*
* %__GFP_MOVABLE (also a zone modifier) indicates that the page can be
* moved by page migration during memory compaction or can be reclaimed.
*
* %__GFP_RECLAIMABLE is used for slab allocations that specify
* SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
*
* %__GFP_WRITE indicates the caller intends to dirty the page. Where possible,
* these pages will be spread between local zones to avoid all the dirty
* pages being in one zone (fair zone allocation policy).
*
* %__GFP_HARDWALL enforces the cpuset memory allocation policy.
*
* %__GFP_THISNODE forces the allocation to be satisfied from the requested
* node with no fallbacks or placement policy enforcements.
*
* %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
*/
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE)
#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL)
#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)
#define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT)
/**
* DOC: Watermark modifiers
*
* Watermark modifiers -- controls access to emergency reserves
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* %__GFP_HIGH indicates that the caller is high-priority and that granting
* the request is necessary before the system can make forward progress.
* For example, creating an IO context to clean pages.
*
* %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
* high priority. Users are typically interrupt handlers. This may be
* used in conjunction with %__GFP_HIGH
*
* %__GFP_MEMALLOC allows access to all memory. This should only be used when
* the caller guarantees the allocation will allow more memory to be freed
* very shortly e.g. process exiting or swapping. Users either should
* be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
* Users of this flag have to be extremely careful to not deplete the reserve
* completely and implement a throttling mechanism which controls the
* consumption of the reserve based on the amount of freed memory.
* Usage of a pre-allocated pool (e.g. mempool) should be always considered
* before using this flag.
*
* %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
* This takes precedence over the %__GFP_MEMALLOC flag if both are set.
*/
#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC)
#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH)
#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
/**
* DOC: Reclaim modifiers
*
* Reclaim modifiers
* ~~~~~~~~~~~~~~~~~
* Please note that all the following flags are only applicable to sleepable
* allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them).
*
* %__GFP_IO can start physical IO.
*
* %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the
* allocator recursing into the filesystem which might already be holding
* locks.
*
* %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
* This flag can be cleared to avoid unnecessary delays when a fallback
* option is available.
*
* %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
* the low watermark is reached and have it reclaim pages until the high
* watermark is reached. A caller may wish to clear this flag when fallback
* options are available and the reclaim is likely to disrupt the system. The
* canonical example is THP allocation where a fallback is cheap but
* reclaim/compaction may cause indirect stalls.
*
* %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
*
* The default allocator behavior depends on the request size. We have a concept
* of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
* !costly allocations are too essential to fail so they are implicitly
* non-failing by default (with some exceptions like OOM victims might fail so
* the caller still has to check for failures) while costly requests try to be
* not disruptive and back off even without invoking the OOM killer.
* The following three modifiers might be used to override some of these
* implicit rules
*
* %__GFP_NORETRY: The VM implementation will try only very lightweight
* memory direct reclaim to get some memory under memory pressure (thus
* it can sleep). It will avoid disruptive actions like OOM killer. The
* caller must handle the failure which is quite likely to happen under
* heavy memory pressure. The flag is suitable when failure can easily be
* handled at small cost, such as reduced throughput
*
* %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
* procedures that have previously failed if there is some indication
* that progress has been made else where. It can wait for other
* tasks to attempt high level approaches to freeing memory such as
* compaction (which removes fragmentation) and page-out.
* There is still a definite limit to the number of retries, but it is
* a larger limit than with %__GFP_NORETRY.
* Allocations with this flag may fail, but only when there is
* genuinely little unused memory. While these allocations do not
* directly trigger the OOM killer, their failure indicates that
* the system is likely to need to use the OOM killer soon. The
* caller must handle failure, but can reasonably do so by failing
* a higher-level request, or completing it only in a much less
* efficient manner.
* If the allocation does fail, and the caller is in a position to
* free some non-essential memory, doing so could benefit the system
* as a whole.
*
* %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
* cannot handle allocation failures. The allocation could block
* indefinitely but will never return with failure. Testing for
* failure is pointless.
* New users should be evaluated carefully (and the flag should be
* used only when there is no reasonable failure policy) but it is
* definitely preferable to use the flag rather than opencode endless
* loop around allocator.
* Using this flag for costly allocations is _highly_ discouraged.
*/
#define __GFP_IO ((__force gfp_t)___GFP_IO)
#define __GFP_FS ((__force gfp_t)___GFP_FS)
#define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */
#define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
#define __GFP_RETRY_MAYFAIL ((__force gfp_t)___GFP_RETRY_MAYFAIL)
#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL)
#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY)
/**
* DOC: Action modifiers
*
* Action modifiers
* ~~~~~~~~~~~~~~~~
*
* %__GFP_NOWARN suppresses allocation failure reports.
*
* %__GFP_COMP address compound page metadata.
*
* %__GFP_ZERO returns a zeroed page on success.
*
* %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if
* __GFP_ZERO is set.
*
* %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned
* on deallocation. Typically used for userspace pages. Currently only has an
* effect in HW tags mode.
*/
#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN)
#define __GFP_COMP ((__force gfp_t)___GFP_COMP)
#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO)
#define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS)
#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
/* Disable lockdep for GFP context tracking */
#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
/* Room for N __GFP_FOO bits */
#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/**
* DOC: Useful GFP flag combinations
*
* Useful GFP flag combinations
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* Useful GFP flag combinations that are commonly used. It is recommended
* that subsystems start with one of these combinations and then set/clear
* %__GFP_FOO flags as necessary.
*
* %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
* watermark is applied to allow access to "atomic reserves".
* The current implementation doesn't support NMI and few other strict
* non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT.
*
* %GFP_KERNEL is typical for kernel-internal allocations. The caller requires
* %ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
*
* %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
* accounted to kmemcg.
*
* %GFP_NOWAIT is for kernel allocations that should not stall for direct
* reclaim, start physical IO or use any filesystem callback.
*
* %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
* that do not require the starting of any physical IO.
* Please try to avoid using this flag directly and instead use
* memalloc_noio_{save,restore} to mark the whole scope which cannot
* perform any IO with a short explanation why. All allocation requests
* will inherit GFP_NOIO implicitly.
*
* %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
* Please try to avoid using this flag directly and instead use
* memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
* recurse into the FS layer with a short explanation why. All allocation
* requests will inherit GFP_NOFS implicitly.
*
* %GFP_USER is for userspace allocations that also need to be directly
* accessibly by the kernel or hardware. It is typically used by hardware
* for buffers that are mapped to userspace (e.g. graphics) that hardware
* still must DMA to. cpuset limits are enforced for these allocations.
*
* %GFP_DMA exists for historical reasons and should be avoided where possible.
* The flags indicates that the caller requires that the lowest zone be
* used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
* it would require careful auditing as some users really require it and
* others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the
* lowest zone as a type of emergency reserve.
*
* %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
* address.
*
* %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
* do not need to be directly accessible by the kernel but that cannot
* move once in use. An example may be a hardware allocation that maps
* data directly into userspace but has no addressing limitations.
*
* %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
* need direct access to but can use kmap() when access is required. They
* are expected to be movable via page reclaim or page migration. Typically,
* pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE.
*
* %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They
* are compound allocations that will generally fail quickly if memory is not
* available and will not wake kswapd/kcompactd on failure. The _LIGHT
* version does not attempt reclaim/compaction at all and is by default used
* in page fault path, while the non-light is used by khugepaged.
*/
#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
#define GFP_NOIO (__GFP_RECLAIM)
#define GFP_NOFS (__GFP_RECLAIM | __GFP_IO)
#define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_DMA __GFP_DMA
#define GFP_DMA32 __GFP_DMA32
#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM)
#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE | \
__GFP_SKIP_KASAN_POISON)
#define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
__GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
#define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
/* Convert GFP flags to their corresponding migrate type */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
#define GFP_MOVABLE_SHIFT 3
static inline int gfp_migratetype(const gfp_t gfp_flags)
{
VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
if (unlikely(page_group_by_mobility_disabled))
return MIGRATE_UNMOVABLE;
/* Group based on mobility */
return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
}
#undef GFP_MOVABLE_MASK
#undef GFP_MOVABLE_SHIFT
static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
}
/**
* gfpflags_normal_context - is gfp_flags a normal sleepable context?
* @gfp_flags: gfp_flags to test
*
* Test whether @gfp_flags indicates that the allocation is from the
* %current context and allowed to sleep.
*
* An allocation being allowed to block doesn't mean it owns the %current
* context. When direct reclaim path tries to allocate memory, the
* allocation context is nested inside whatever %current was doing at the
* time of the original allocation. The nested allocation may be allowed
* to block but modifying anything %current owns can corrupt the outer
* context's expectations.
*
* %true result from this function indicates that the allocation context
* can sleep and use anything that's associated with %current.
*/
static inline bool gfpflags_normal_context(const gfp_t gfp_flags)
{
return (gfp_flags & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC)) ==
__GFP_DIRECT_RECLAIM;
}
#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
#define OPT_ZONE_HIGHMEM ZONE_NORMAL
#endif
#ifdef CONFIG_ZONE_DMA
#define OPT_ZONE_DMA ZONE_DMA
#else
#define OPT_ZONE_DMA ZONE_NORMAL
#endif
#ifdef CONFIG_ZONE_DMA32
#define OPT_ZONE_DMA32 ZONE_DMA32
#else
#define OPT_ZONE_DMA32 ZONE_NORMAL
#endif
/*
* GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
* zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT
* bits long and there are 16 of them to cover all possible combinations of
* __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
*
* The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
* But GFP_MOVABLE is not only a zone specifier but also an allocation
* policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
* Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1".
*
* bit result
* =================
* 0x0 => NORMAL
* 0x1 => DMA or NORMAL
* 0x2 => HIGHMEM or NORMAL
* 0x3 => BAD (DMA+HIGHMEM)
* 0x4 => DMA32 or NORMAL
* 0x5 => BAD (DMA+DMA32)
* 0x6 => BAD (HIGHMEM+DMA32)
* 0x7 => BAD (HIGHMEM+DMA32+DMA)
* 0x8 => NORMAL (MOVABLE+0)
* 0x9 => DMA or NORMAL (MOVABLE+DMA)
* 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too)
* 0xb => BAD (MOVABLE+HIGHMEM+DMA)
* 0xc => DMA32 or NORMAL (MOVABLE+DMA32)
* 0xd => BAD (MOVABLE+DMA32+DMA)
* 0xe => BAD (MOVABLE+DMA32+HIGHMEM)
* 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
*
* GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
*/
#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
/* ZONE_DEVICE is not a valid GFP zone specifier */
#define GFP_ZONES_SHIFT 2
#else
#define GFP_ZONES_SHIFT ZONES_SHIFT
#endif
#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
#endif
#define GFP_ZONE_TABLE ( \
(ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \
| (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \
| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \
| (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \
| (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \
| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \
| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\
| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\
)
/*
* GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32
* __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per
* entry starting with bit 0. Bit is set if the combination is not
* allowed.
*/
#define GFP_ZONE_BAD ( \
1 << (___GFP_DMA | ___GFP_HIGHMEM) \
| 1 << (___GFP_DMA | ___GFP_DMA32) \
| 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \
| 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \
| 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \
| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \
| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \
| 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \
)
static inline enum zone_type gfp_zone(gfp_t flags)
{
enum zone_type z;
int bit = (__force int) (flags & GFP_ZONEMASK);
z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) &
((1 << GFP_ZONES_SHIFT) - 1);
VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
return z;
}
/*
* There is only one page-allocator function, and two main namespaces to
* it. The alloc_page*() variants return 'struct page *' and as such
* can allocate highmem pages, the *get*page*() variants return
* virtual kernel addresses to the allocated page(s).
*/
static inline int gfp_zonelist(gfp_t flags)
{
#ifdef CONFIG_NUMA
if (unlikely(flags & __GFP_THISNODE))
return ZONELIST_NOFALLBACK;
#endif
return ZONELIST_FALLBACK;
}
/*
* We get the zone list from the current node and the gfp_mask.
* This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones.
* There are two zonelists per node, one for all zones with memory and
* one containing just zones from the node the zonelist belongs to.
*
* For the case of non-NUMA systems the NODE_DATA() gets optimized to
* &contig_page_data at compile-time.
*/
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}
#ifndef HAVE_ARCH_FREE_PAGE
static inline void arch_free_page(struct page *page, int order) { }
#endif
#ifndef HAVE_ARCH_ALLOC_PAGE
static inline void arch_alloc_page(struct page *page, int order) { }
#endif
#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
static inline int arch_make_page_accessible(struct page *page)
{
return 0;
}
#endif
struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask);
unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
struct list_head *page_list,
struct page **page_array);
/* Bulk allocate order-0 pages */
static inline unsigned long
alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
{
return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL);
}
static inline unsigned long
alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array)
{
return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array);
}
static inline unsigned long
alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array)
{
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array);
}
/*
* Allocate pages, preferring the node given as nid. The node must be valid and
* online. For more general interface, see alloc_pages_node().
*/
static inline struct page *
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid));
return __alloc_pages(gfp_mask, order, nid, NULL);
}
/*
* Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
* prefer the current CPU's closest node. Otherwise node must be valid and
* online.
*/
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
return __alloc_pages_node(nid, gfp_mask, order);
}
#ifdef CONFIG_NUMA
struct page *alloc_pages(gfp_t gfp, unsigned int order);
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
struct vm_area_struct *vma, unsigned long addr,
int node, bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
#else
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{
return alloc_pages_node(numa_node_id(), gfp_mask, order);
}
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
alloc_pages(gfp_mask, order)
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
alloc_pages(gfp_mask, order)
#endif
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_page_vma(gfp_mask, vma, addr) \
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
void free_pages_exact(void *virt, size_t size);
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
#define __get_free_page(gfp_mask) \
__get_free_pages((gfp_mask), 0)
#define __get_dma_pages(gfp_mask, order) \
__get_free_pages((gfp_mask) | GFP_DMA, (order))
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
struct page_frag_cache;
extern void __page_frag_cache_drain(struct page *page, unsigned int count);
extern void *page_frag_alloc_align(struct page_frag_cache *nc,
unsigned int fragsz, gfp_t gfp_mask,
unsigned int align_mask);
static inline void *page_frag_alloc(struct page_frag_cache *nc,
unsigned int fragsz, gfp_t gfp_mask)
{
return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
}
extern void page_frag_free(void *addr);
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)
void page_alloc_init(void);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
void page_alloc_init_late(void);
/*
* gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
* GFP flags are used before interrupts are enabled. Once interrupts are
* enabled, it is set to __GFP_BITS_MASK while the system is running. During
* hibernation, it is used by PM to avoid I/O during memory allocation while
* devices are suspended.
*/
extern gfp_t gfp_allowed_mask;
/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
extern void pm_restrict_gfp_mask(void);
extern void pm_restore_gfp_mask(void);
extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
#ifdef CONFIG_PM_SLEEP
extern bool pm_suspended_storage(void);
#else
static inline bool pm_suspended_storage(void)
{
return false;
}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_CONTIG_ALLOC
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask);
extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
int nid, nodemask_t *nodemask);
#endif
void free_contig_range(unsigned long pfn, unsigned long nr_pages);
#ifdef CONFIG_CMA
/* CMA stuff */
extern void init_cma_reserved_pageblock(struct page *page);
#endif
#endif /* __LINUX_GFP_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Red Hat. All rights reserved.
*/
#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "misc.h"
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
#define FORCE_EXTENT_THRESHOLD SZ_1M
struct btrfs_trim_range {
u64 start;
u64 bytes;
struct list_head list;
};
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info, u64 *offset,
u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes);
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
u64 offset)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_key location;
struct btrfs_disk_key disk_key;
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
struct inode *inode = NULL;
unsigned nofs_flag;
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
key.type = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ERR_PTR(ret); if (ret > 0) { btrfs_release_path(path);
return ERR_PTR(-ENOENT);
}
leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
btrfs_free_space_key(leaf, header, &disk_key);
btrfs_disk_key_to_cpu(&location, &disk_key);
btrfs_release_path(path);
/*
* We are often under a trans handle at this point, so we need to make
* sure NOFS is set to keep us from deadlocking.
*/
nofs_flag = memalloc_nofs_save();
inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
return inode;
mapping_set_gfp_mask(inode->i_mapping,
mapping_gfp_constraint(inode->i_mapping,
~(__GFP_FS | __GFP_HIGHMEM)));
return inode;
}
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct inode *inode = NULL;
u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
spin_lock(&block_group->lock);
if (block_group->inode)
inode = igrab(block_group->inode);
spin_unlock(&block_group->lock);
if (inode)
return inode;
inode = __lookup_free_space_inode(fs_info->tree_root, path,
block_group->start);
if (IS_ERR(inode))
return inode;
spin_lock(&block_group->lock);
if (!((BTRFS_I(inode)->flags & flags) == flags)) {
btrfs_info(fs_info, "Old style space inode found, converting.");
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
BTRFS_INODE_NODATACOW;
block_group->disk_cache_state = BTRFS_DC_CLEAR;
}
if (!block_group->iref) { block_group->inode = igrab(inode);
block_group->iref = 1;
}
spin_unlock(&block_group->lock);
return inode;
}
static int __create_free_space_inode(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 ino, u64 offset)
{
struct btrfs_key key;
struct btrfs_disk_key disk_key;
struct btrfs_free_space_header *header;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
/* We inline CRCs for the free disk space cache */
const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
int ret;
ret = btrfs_insert_empty_inode(trans, root, path, ino);
if (ret)
return ret;
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
btrfs_item_key(leaf, &disk_key, path->slots[0]);
memzero_extent_buffer(leaf, (unsigned long)inode_item,
sizeof(*inode_item));
btrfs_set_inode_generation(leaf, inode_item, trans->transid);
btrfs_set_inode_size(leaf, inode_item, 0);
btrfs_set_inode_nbytes(leaf, inode_item, 0);
btrfs_set_inode_uid(leaf, inode_item, 0);
btrfs_set_inode_gid(leaf, inode_item, 0);
btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, inode_item, flags);
btrfs_set_inode_nlink(leaf, inode_item, 1);
btrfs_set_inode_transid(leaf, inode_item, trans->transid);
btrfs_set_inode_block_group(leaf, inode_item, offset);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
key.type = 0;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_free_space_header));
if (ret < 0) {
btrfs_release_path(path);
return ret;
}
leaf = path->nodes[0];
header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
btrfs_set_free_space_key(leaf, header, &disk_key);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
return 0;
}
int create_free_space_inode(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
int ret;
u64 ino;
ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
if (ret < 0)
return ret;
return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
ino, block_group->start);
}
/*
* inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
* handles lookup, otherwise it takes ownership and iputs the inode.
* Don't reuse an inode pointer after passing it into this function.
*/
int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_block_group *block_group)
{
struct btrfs_path *path;
struct btrfs_key key;
int ret = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
if (!inode) inode = lookup_free_space_inode(block_group, path);
if (IS_ERR(inode)) {
if (PTR_ERR(inode) != -ENOENT)
ret = PTR_ERR(inode);
goto out;
}
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
btrfs_add_delayed_iput(inode);
goto out;
}
clear_nlink(inode);
/* One for the block groups ref */
spin_lock(&block_group->lock);
if (block_group->iref) {
block_group->iref = 0;
block_group->inode = NULL;
spin_unlock(&block_group->lock);
iput(inode);
} else {
spin_unlock(&block_group->lock);
}
/* One for the lookup ref */
btrfs_add_delayed_iput(inode);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.type = 0;
key.offset = block_group->start;
ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
-1, 1);
if (ret) {
if (ret > 0)
ret = 0;
goto out;
}
ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
out:
btrfs_free_path(path); return ret;
}
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
u64 needed_bytes;
int ret;
/* 1 for slack space, 1 for updating the inode */
needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
btrfs_calc_metadata_size(fs_info, 1);
spin_lock(&rsv->lock);
if (rsv->reserved < needed_bytes)
ret = -ENOSPC;
else
ret = 0;
spin_unlock(&rsv->lock);
return ret;
}
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
bool locked = false;
if (block_group) {
struct btrfs_path *path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto fail;
}
locked = true;
mutex_lock(&trans->transaction->cache_write_mutex);
if (!list_empty(&block_group->io_list)) {
list_del_init(&block_group->io_list);
btrfs_wait_cache_io(trans, block_group, path);
btrfs_put_block_group(block_group);
}
/*
* now that we've truncated the cache away, its no longer
* setup or written
*/
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_CLEAR;
spin_unlock(&block_group->lock);
btrfs_free_path(path);
}
btrfs_i_size_write(BTRFS_I(inode), 0);
truncate_pagecache(inode, 0);
/*
* We skip the throttling logic for free space cache inodes, so we don't
* need to check for -EAGAIN.
*/
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
0, BTRFS_EXTENT_DATA_KEY, NULL);
if (ret)
goto fail;
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
fail:
if (locked)
mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) btrfs_abort_transaction(trans, ret); return ret;
}
static void readahead_cache(struct inode *inode)
{
struct file_ra_state ra;
unsigned long last_index;
file_ra_state_init(&ra, inode->i_mapping);
last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}
static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
int write)
{
int num_pages;
num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
/* Make sure we can fit our crcs and generation into the first page */
if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
return -ENOSPC;
memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
if (!io_ctl->pages)
return -ENOMEM;
io_ctl->num_pages = num_pages;
io_ctl->fs_info = btrfs_sb(inode->i_sb);
io_ctl->inode = inode;
return 0;
}
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
kfree(io_ctl->pages);
io_ctl->pages = NULL;
}
static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
if (io_ctl->cur) { io_ctl->cur = NULL; io_ctl->orig = NULL;
}
}
static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
ASSERT(io_ctl->index < io_ctl->num_pages);
io_ctl->page = io_ctl->pages[io_ctl->index++];
io_ctl->cur = page_address(io_ctl->page);
io_ctl->orig = io_ctl->cur;
io_ctl->size = PAGE_SIZE;
if (clear)
clear_page(io_ctl->cur);
}
static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
int i;
io_ctl_unmap_page(io_ctl);
for (i = 0; i < io_ctl->num_pages; i++) { if (io_ctl->pages[i]) {
ClearPageChecked(io_ctl->pages[i]);
unlock_page(io_ctl->pages[i]);
put_page(io_ctl->pages[i]);
}
}
}
static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
struct page *page;
struct inode *inode = io_ctl->inode;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int i;
for (i = 0; i < io_ctl->num_pages; i++) {
int ret;
page = find_or_create_page(inode->i_mapping, i, mask);
if (!page) {
io_ctl_drop_pages(io_ctl);
return -ENOMEM;
}
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);
put_page(page);
io_ctl_drop_pages(io_ctl);
return ret;
}
io_ctl->pages[i] = page;
if (uptodate && !PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
if (page->mapping != inode->i_mapping) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"free space cache page truncated");
io_ctl_drop_pages(io_ctl);
return -EIO;
}
if (!PageUptodate(page)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"error reading free space cache");
io_ctl_drop_pages(io_ctl);
return -EIO;
}
}
}
for (i = 0; i < io_ctl->num_pages; i++) clear_page_dirty_for_io(io_ctl->pages[i]); return 0;
}
static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
io_ctl_map_page(io_ctl, 1);
/*
* Skip the csum areas. If we don't check crcs then we just have a
* 64bit chunk at the front of the first page.
*/
io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
put_unaligned_le64(generation, io_ctl->cur);
io_ctl->cur += sizeof(u64);
}
static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
u64 cache_gen;
/*
* Skip the crc area. If we don't check crcs then we just have a 64bit
* chunk at the front of the first page.
*/
io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
cache_gen = get_unaligned_le64(io_ctl->cur);
if (cache_gen != generation) {
btrfs_err_rl(io_ctl->fs_info,
"space cache generation (%llu) does not match inode (%llu)",
cache_gen, generation);
io_ctl_unmap_page(io_ctl);
return -EIO;
}
io_ctl->cur += sizeof(u64);
return 0;
}
static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
u32 *tmp;
u32 crc = ~(u32)0;
unsigned offset = 0;
if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
btrfs_crc32c_final(crc, (u8 *)&crc);
io_ctl_unmap_page(io_ctl);
tmp = page_address(io_ctl->pages[0]);
tmp += index;
*tmp = crc;
}
static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
u32 *tmp, val;
u32 crc = ~(u32)0;
unsigned offset = 0;
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
tmp = page_address(io_ctl->pages[0]);
tmp += index;
val = *tmp;
io_ctl_map_page(io_ctl, 0);
crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
btrfs_crc32c_final(crc, (u8 *)&crc);
if (val != crc) {
btrfs_err_rl(io_ctl->fs_info,
"csum mismatch on free space cache");
io_ctl_unmap_page(io_ctl);
return -EIO;
}
return 0;
}
static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
void *bitmap)
{
struct btrfs_free_space_entry *entry;
if (!io_ctl->cur)
return -ENOSPC;
entry = io_ctl->cur;
put_unaligned_le64(offset, &entry->offset);
put_unaligned_le64(bytes, &entry->bytes);
entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
BTRFS_FREE_SPACE_EXTENT;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
io_ctl->size -= sizeof(struct btrfs_free_space_entry);
if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
return 0;
io_ctl_set_crc(io_ctl, io_ctl->index - 1);
/* No more pages to map */
if (io_ctl->index >= io_ctl->num_pages)
return 0;
/* map the next page */
io_ctl_map_page(io_ctl, 1);
return 0;
}
static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
if (!io_ctl->cur)
return -ENOSPC;
/*
* If we aren't at the start of the current page, unmap this one and
* map the next one if there is any left.
*/
if (io_ctl->cur != io_ctl->orig) { io_ctl_set_crc(io_ctl, io_ctl->index - 1);
if (io_ctl->index >= io_ctl->num_pages)
return -ENOSPC;
io_ctl_map_page(io_ctl, 0);
}
copy_page(io_ctl->cur, bitmap);
io_ctl_set_crc(io_ctl, io_ctl->index - 1);
if (io_ctl->index < io_ctl->num_pages)
io_ctl_map_page(io_ctl, 0);
return 0;
}
static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
/*
* If we're not on the boundary we know we've modified the page and we
* need to crc the page.
*/
if (io_ctl->cur != io_ctl->orig) io_ctl_set_crc(io_ctl, io_ctl->index - 1);
else
io_ctl_unmap_page(io_ctl);
while (io_ctl->index < io_ctl->num_pages) {
io_ctl_map_page(io_ctl, 1);
io_ctl_set_crc(io_ctl, io_ctl->index - 1);
}
}
static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
struct btrfs_free_space *entry, u8 *type)
{
struct btrfs_free_space_entry *e;
int ret;
if (!io_ctl->cur) {
ret = io_ctl_check_crc(io_ctl, io_ctl->index);
if (ret)
return ret;
}
e = io_ctl->cur;
entry->offset = get_unaligned_le64(&e->offset);
entry->bytes = get_unaligned_le64(&e->bytes);
*type = e->type;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
io_ctl->size -= sizeof(struct btrfs_free_space_entry);
if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
return 0;
io_ctl_unmap_page(io_ctl);
return 0;
}
static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
struct btrfs_free_space *entry)
{
int ret;
ret = io_ctl_check_crc(io_ctl, io_ctl->index);
if (ret)
return ret;
copy_page(entry->bitmap, io_ctl->cur);
io_ctl_unmap_page(io_ctl);
return 0;
}
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
struct btrfs_block_group *block_group = ctl->private;
u64 max_bytes;
u64 bitmap_bytes;
u64 extent_bytes;
u64 size = block_group->length;
u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
max_bitmaps = max_t(u64, max_bitmaps, 1);
ASSERT(ctl->total_bitmaps <= max_bitmaps);
/*
* We are trying to keep the total amount of memory used per 1GiB of
* space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
* mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
* bitmaps, we may end up using more memory than this.
*/
if (size < SZ_1G)
max_bytes = MAX_CACHE_BYTES_PER_GIG;
else
max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
bitmap_bytes = ctl->total_bitmaps * ctl->unit;
/*
* we want the extent entry threshold to always be at most 1/2 the max
* bytes we can have, or whatever is less than that.
*/
extent_bytes = max_bytes - bitmap_bytes;
extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
ctl->extents_thresh =
div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}
static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_path *path, u64 offset)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
struct btrfs_io_ctl io_ctl;
struct btrfs_key key;
struct btrfs_free_space *e, *n;
LIST_HEAD(bitmaps);
u64 num_entries;
u64 num_bitmaps;
u64 generation;
u8 type;
int ret = 0;
/* Nothing in the space cache, goodbye */
if (!i_size_read(inode))
return 0;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
key.type = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return 0;
else if (ret > 0) {
btrfs_release_path(path);
return 0;
}
ret = -1;
leaf = path->nodes[0];
header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
num_entries = btrfs_free_space_entries(leaf, header);
num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
generation = btrfs_free_space_generation(leaf, header);
btrfs_release_path(path);
if (!BTRFS_I(inode)->generation) {
btrfs_info(fs_info,
"the free space cache file (%llu) is invalid, skip it",
offset);
return 0;
}
if (BTRFS_I(inode)->generation != generation) {
btrfs_err(fs_info,
"free space inode generation (%llu) did not match free space cache generation (%llu)",
BTRFS_I(inode)->generation, generation);
return 0;
}
if (!num_entries)
return 0;
ret = io_ctl_init(&io_ctl, inode, 0);
if (ret)
return ret;
readahead_cache(inode);
ret = io_ctl_prepare_pages(&io_ctl, true);
if (ret)
goto out;
ret = io_ctl_check_crc(&io_ctl, 0);
if (ret)
goto free_cache;
ret = io_ctl_check_generation(&io_ctl, generation);
if (ret)
goto free_cache;
while (num_entries) {
e = kmem_cache_zalloc(btrfs_free_space_cachep,
GFP_NOFS);
if (!e) {
ret = -ENOMEM;
goto free_cache;
}
ret = io_ctl_read_entry(&io_ctl, e, &type);
if (ret) {
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
if (!e->bytes) {
ret = -1;
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
if (type == BTRFS_FREE_SPACE_EXTENT) {
spin_lock(&ctl->tree_lock);
ret = link_free_space(ctl, e);
spin_unlock(&ctl->tree_lock);
if (ret) {
btrfs_err(fs_info,
"Duplicate entries in free space cache, dumping");
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
} else {
ASSERT(num_bitmaps);
num_bitmaps--;
e->bitmap = kmem_cache_zalloc(
btrfs_free_space_bitmap_cachep, GFP_NOFS);
if (!e->bitmap) {
ret = -ENOMEM;
kmem_cache_free(
btrfs_free_space_cachep, e);
goto free_cache;
}
spin_lock(&ctl->tree_lock);
ret = link_free_space(ctl, e);
ctl->total_bitmaps++;
recalculate_thresholds(ctl);
spin_unlock(&ctl->tree_lock);
if (ret) {
btrfs_err(fs_info,
"Duplicate entries in free space cache, dumping");
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
list_add_tail(&e->list, &bitmaps);
}
num_entries--;
}
io_ctl_unmap_page(&io_ctl);
/*
* We add the bitmaps at the end of the entries in order that
* the bitmap entries are added to the cache.
*/
list_for_each_entry_safe(e, n, &bitmaps, list) {
list_del_init(&e->list);
ret = io_ctl_read_bitmap(&io_ctl, e);
if (ret)
goto free_cache;
}
io_ctl_drop_pages(&io_ctl);
ret = 1;
out:
io_ctl_free(&io_ctl);
return ret;
free_cache:
io_ctl_drop_pages(&io_ctl);
__btrfs_remove_free_space_cache(ctl);
goto out;
}
static int copy_free_space_cache(struct btrfs_block_group *block_group,
struct btrfs_free_space_ctl *ctl)
{
struct btrfs_free_space *info;
struct rb_node *n;
int ret = 0;
while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (!info->bitmap) {
unlink_free_space(ctl, info);
ret = btrfs_add_free_space(block_group, info->offset,
info->bytes);
kmem_cache_free(btrfs_free_space_cachep, info);
} else {
u64 offset = info->offset;
u64 bytes = ctl->unit;
while (search_bitmap(ctl, info, &offset, &bytes,
false) == 0) {
ret = btrfs_add_free_space(block_group, offset,
bytes);
if (ret)
break;
bitmap_clear_bits(ctl, info, offset, bytes);
offset = info->offset;
bytes = ctl->unit;
}
free_bitmap(ctl, info);
}
cond_resched();
}
return ret;
}
int load_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space_ctl tmp_ctl = {};
struct inode *inode;
struct btrfs_path *path;
int ret = 0;
bool matched;
u64 used = block_group->used;
/*
* Because we could potentially discard our loaded free space, we want
* to load everything into a temporary structure first, and then if it's
* valid copy it all into the actual free space ctl.
*/
btrfs_init_free_space_ctl(block_group, &tmp_ctl);
/*
* If this block group has been marked to be cleared for one reason or
* another then we can't trust the on disk cache, so just return.
*/
spin_lock(&block_group->lock);
if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
spin_unlock(&block_group->lock);
return 0;
}
spin_unlock(&block_group->lock);
path = btrfs_alloc_path();
if (!path)
return 0;
path->search_commit_root = 1;
path->skip_locking = 1;
/*
* We must pass a path with search_commit_root set to btrfs_iget in
* order to avoid a deadlock when allocating extents for the tree root.
*
* When we are COWing an extent buffer from the tree root, when looking
* for a free extent, at extent-tree.c:find_free_extent(), we can find
* block group without its free space cache loaded. When we find one
* we must load its space cache which requires reading its free space
* cache's inode item from the root tree. If this inode item is located
* in the same leaf that we started COWing before, then we end up in
* deadlock on the extent buffer (trying to read lock it when we
* previously write locked it).
*
* It's safe to read the inode item using the commit root because
* block groups, once loaded, stay in memory forever (until they are
* removed) as well as their space caches once loaded. New block groups
* once created get their ->cached field set to BTRFS_CACHE_FINISHED so
* we will never try to read their inode item while the fs is mounted.
*/
inode = lookup_free_space_inode(block_group, path);
if (IS_ERR(inode)) {
btrfs_free_path(path);
return 0;
}
/* We may have converted the inode and made the cache invalid. */
spin_lock(&block_group->lock);
if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
spin_unlock(&block_group->lock);
btrfs_free_path(path);
goto out;
}
spin_unlock(&block_group->lock);
ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
path, block_group->start);
btrfs_free_path(path);
if (ret <= 0)
goto out;
matched = (tmp_ctl.free_space == (block_group->length - used -
block_group->bytes_super));
if (matched) {
ret = copy_free_space_cache(block_group, &tmp_ctl);
/*
* ret == 1 means we successfully loaded the free space cache,
* so we need to re-set it here.
*/
if (ret == 0)
ret = 1;
} else {
__btrfs_remove_free_space_cache(&tmp_ctl);
btrfs_warn(fs_info,
"block group %llu has wrong amount of free space",
block_group->start);
ret = -1;
}
out:
if (ret < 0) {
/* This cache is bogus, make sure it gets cleared */
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_CLEAR;
spin_unlock(&block_group->lock);
ret = 0;
btrfs_warn(fs_info,
"failed to load free space cache for block group %llu, rebuilding it now",
block_group->start);
}
spin_lock(&ctl->tree_lock);
btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
iput(inode);
return ret;
}
static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group *block_group,
int *entries, int *bitmaps,
struct list_head *bitmap_list)
{
int ret;
struct btrfs_free_cluster *cluster = NULL;
struct btrfs_free_cluster *cluster_locked = NULL;
struct rb_node *node = rb_first(&ctl->free_space_offset);
struct btrfs_trim_range *trim_entry;
/* Get the cluster for this block_group if it exists */
if (block_group && !list_empty(&block_group->cluster_list)) { cluster = list_entry(block_group->cluster_list.next,
struct btrfs_free_cluster,
block_group_list);
}
if (!node && cluster) {
cluster_locked = cluster;
spin_lock(&cluster_locked->lock);
node = rb_first(&cluster->root);
cluster = NULL;
}
/* Write out the extent entries */
while (node) {
struct btrfs_free_space *e;
e = rb_entry(node, struct btrfs_free_space, offset_index);
*entries += 1;
ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
e->bitmap);
if (ret)
goto fail;
if (e->bitmap) { list_add_tail(&e->list, bitmap_list);
*bitmaps += 1;
}
node = rb_next(node); if (!node && cluster) { node = rb_first(&cluster->root);
cluster_locked = cluster;
spin_lock(&cluster_locked->lock);
cluster = NULL;
}
}
if (cluster_locked) {
spin_unlock(&cluster_locked->lock);
cluster_locked = NULL;
}
/*
* Make sure we don't miss any range that was removed from our rbtree
* because trimming is running. Otherwise after a umount+mount (or crash
* after committing the transaction) we would leak free space and get
* an inconsistent free space cache report from fsck.
*/
list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { ret = io_ctl_add_entry(io_ctl, trim_entry->start,
trim_entry->bytes, NULL);
if (ret)
goto fail;
*entries += 1;
}
return 0;
fail:
if (cluster_locked)
spin_unlock(&cluster_locked->lock);
return -ENOSPC;
}
static noinline_for_stack int
update_cache_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode,
struct btrfs_path *path, u64 offset,
int entries, int bitmaps)
{
struct btrfs_key key;
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
key.type = 0;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
EXTENT_DELALLOC, 0, 0, NULL);
goto fail;
}
leaf = path->nodes[0];
if (ret > 0) {
struct btrfs_key found_key;
ASSERT(path->slots[0]);
path->slots[0]--;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
inode->i_size - 1, EXTENT_DELALLOC, 0,
0, NULL);
btrfs_release_path(path);
goto fail;
}
}
BTRFS_I(inode)->generation = trans->transid;
header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
btrfs_set_free_space_entries(leaf, header, entries);
btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
btrfs_set_free_space_generation(leaf, header, trans->transid);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
return 0;
fail:
return -1;
}
static noinline_for_stack int write_pinned_extent_entries(
struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
int *entries)
{
u64 start, extent_start, extent_end, len;
struct extent_io_tree *unpin = NULL;
int ret;
if (!block_group) return 0;
/*
* We want to add any pinned extents to our free space cache
* so we don't leak the space
*
* We shouldn't have switched the pinned extents yet so this is the
* right one
*/
unpin = &trans->transaction->pinned_extents;
start = block_group->start;
while (start < block_group->start + block_group->length) {
ret = find_first_extent_bit(unpin, start,
&extent_start, &extent_end,
EXTENT_DIRTY, NULL);
if (ret)
return 0;
/* This pinned extent is out of our range */
if (extent_start >= block_group->start + block_group->length)
return 0;
extent_start = max(extent_start, start);
extent_end = min(block_group->start + block_group->length,
extent_end + 1);
len = extent_end - extent_start;
*entries += 1;
ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
if (ret)
return -ENOSPC;
start = extent_end;
}
return 0;
}
static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
struct btrfs_free_space *entry, *next;
int ret;
/* Write out the bitmaps */
list_for_each_entry_safe(entry, next, bitmap_list, list) { ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
if (ret)
return -ENOSPC;
list_del_init(&entry->list);
}
return 0;
}
static int flush_dirty_cache(struct inode *inode)
{
int ret;
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
EXTENT_DELALLOC, 0, 0, NULL);
return ret;
}
static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
struct btrfs_free_space *entry, *next;
list_for_each_entry_safe(entry, next, bitmap_list, list)
list_del_init(&entry->list);
}
static void noinline_for_stack
cleanup_write_cache_enospc(struct inode *inode,
struct btrfs_io_ctl *io_ctl,
struct extent_state **cached_state)
{
io_ctl_drop_pages(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, cached_state);
}
static int __btrfs_wait_cache_io(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
struct btrfs_path *path, u64 offset)
{
int ret;
struct inode *inode = io_ctl->inode;
if (!inode)
return 0;
/* Flush the dirty pages in the cache file. */
ret = flush_dirty_cache(inode);
if (ret)
goto out;
/* Update the cache item to tell everyone this cache file is valid. */
ret = update_cache_item(trans, root, inode, path, offset,
io_ctl->entries, io_ctl->bitmaps);
out:
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
if (block_group)
btrfs_debug(root->fs_info,
"failed to write free space cache for block group %llu error %d",
block_group->start, ret);
}
btrfs_update_inode(trans, root, BTRFS_I(inode));
if (block_group) {
/* the dirty list is protected by the dirty_bgs_lock */
spin_lock(&trans->transaction->dirty_bgs_lock);
/* the disk_cache_state is protected by the block group lock */
spin_lock(&block_group->lock);
/*
* only mark this as written if we didn't get put back on
* the dirty list while waiting for IO. Otherwise our
* cache state won't be right, and we won't get written again
*/
if (!ret && list_empty(&block_group->dirty_list))
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
else if (ret)
block_group->disk_cache_state = BTRFS_DC_ERROR;
spin_unlock(&block_group->lock);
spin_unlock(&trans->transaction->dirty_bgs_lock);
io_ctl->inode = NULL;
iput(inode);
}
return ret;
}
int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
block_group, &block_group->io_ctl,
path, block_group->start);
}
/**
* Write out cached info to an inode
*
* @root: root the inode belongs to
* @inode: freespace inode we are writing out
* @ctl: free space cache we are going to write out
* @block_group: block_group for this cache if it belongs to a block_group
* @io_ctl: holds context for the io
* @trans: the trans handle
*
* This function writes out a free space cache struct to disk for quick recovery
* on mount. This will return 0 if it was successful in writing the cache out,
* or an errno if it was not.
*/
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
struct btrfs_trans_handle *trans)
{
struct extent_state *cached_state = NULL;
LIST_HEAD(bitmap_list);
int entries = 0;
int bitmaps = 0;
int ret;
int must_iput = 0;
if (!i_size_read(inode))
return -EIO;
WARN_ON(io_ctl->pages); ret = io_ctl_init(io_ctl, inode, 1);
if (ret)
return ret;
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { down_write(&block_group->data_rwsem);
spin_lock(&block_group->lock);
if (block_group->delalloc_bytes) {
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
up_write(&block_group->data_rwsem);
BTRFS_I(inode)->generation = 0;
ret = 0;
must_iput = 1;
goto out;
}
spin_unlock(&block_group->lock);
}
/* Lock all pages first so we can lock the extent safely. */
ret = io_ctl_prepare_pages(io_ctl, false);
if (ret)
goto out_unlock;
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
&cached_state);
io_ctl_set_generation(io_ctl, trans->transid);
mutex_lock(&ctl->cache_writeout_mutex);
/* Write out the extent entries in the free space cache */
spin_lock(&ctl->tree_lock);
ret = write_cache_extent_entries(io_ctl, ctl,
block_group, &entries, &bitmaps,
&bitmap_list);
if (ret)
goto out_nospc_locked;
/*
* Some spaces that are freed in the current transaction are pinned,
* they will be added into free space cache after the transaction is
* committed, we shouldn't lose them.
*
* If this changes while we are working we'll get added back to
* the dirty list and redo it. No locking needed
*/
ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
if (ret)
goto out_nospc_locked;
/*
* At last, we write out all the bitmaps and keep cache_writeout_mutex
* locked while doing it because a concurrent trim can be manipulating
* or freeing the bitmap.
*/
ret = write_bitmap_entries(io_ctl, &bitmap_list);
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
if (ret)
goto out_nospc;
/* Zero out the rest of the pages just to make sure */
io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */
ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
io_ctl->num_pages, 0, i_size_read(inode),
&cached_state, false);
if (ret)
goto out_nospc;
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem);
/*
* Release the pages and unlock the extent, we will flush
* them out later
*/
io_ctl_drop_pages(io_ctl);
io_ctl_free(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state);
/*
* at this point the pages are under IO and we're happy,
* The caller is responsible for waiting on them and updating
* the cache and the inode
*/
io_ctl->entries = entries;
io_ctl->bitmaps = bitmaps;
ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
if (ret)
goto out;
return 0;
out_nospc_locked:
cleanup_bitmap_list(&bitmap_list);
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
out_nospc:
cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
out_unlock:
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem);
out:
io_ctl->inode = NULL;
io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, BTRFS_I(inode));
if (must_iput)
iput(inode);
return ret;
}
int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct inode *inode;
int ret = 0;
spin_lock(&block_group->lock);
if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
spin_unlock(&block_group->lock);
return 0;
}
spin_unlock(&block_group->lock);
inode = lookup_free_space_inode(block_group, path);
if (IS_ERR(inode))
return 0;
ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
block_group, &block_group->io_ctl, trans);
if (ret) {
btrfs_debug(fs_info,
"failed to write free space cache for block group %llu error %d",
block_group->start, ret);
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_ERROR;
spin_unlock(&block_group->lock);
block_group->io_ctl.inode = NULL;
iput(inode);
}
/*
* if ret == 0 the caller is expected to call btrfs_wait_cache_io
* to wait for IO and put the inode
*/
return ret;
}
static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
u64 offset)
{
ASSERT(offset >= bitmap_start);
offset -= bitmap_start;
return (unsigned long)(div_u64(offset, unit));
}
static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
return (unsigned long)(div_u64(bytes, unit));
}
static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
u64 offset)
{
u64 bitmap_start;
u64 bytes_per_bitmap;
bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
bitmap_start = offset - ctl->start;
bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
bitmap_start *= bytes_per_bitmap;
bitmap_start += ctl->start;
return bitmap_start;
}
static int tree_insert_offset(struct rb_root *root, u64 offset,
struct rb_node *node, int bitmap)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct btrfs_free_space *info;
while (*p) {
parent = *p;
info = rb_entry(parent, struct btrfs_free_space, offset_index);
if (offset < info->offset) {
p = &(*p)->rb_left;
} else if (offset > info->offset) {
p = &(*p)->rb_right;
} else {
/*
* we could have a bitmap entry and an extent entry
* share the same offset. If this is the case, we want
* the extent entry to always be found first if we do a
* linear search through the tree, since we want to have
* the quickest allocation time, and allocating from an
* extent is faster than allocating from a bitmap. So
* if we're inserting a bitmap and we find an entry at
* this offset, we want to go right, or after this entry
* logically. If we are inserting an extent and we've
* found a bitmap, we want to go left, or before
* logically.
*/
if (bitmap) { if (info->bitmap) { WARN_ON_ONCE(1);
return -EEXIST;
}
p = &(*p)->rb_right;
} else {
if (!info->bitmap) { WARN_ON_ONCE(1); return -EEXIST;
}
p = &(*p)->rb_left;
}
}
}
rb_link_node(node, parent, p);
rb_insert_color(node, root);
return 0;
}
/*
* searches the tree for the given offset.
*
* fuzzy - If this is set, then we are trying to make an allocation, and we just
* want a section that has at least bytes size and comes at or after the given
* offset.
*/
static struct btrfs_free_space *
tree_search_offset(struct btrfs_free_space_ctl *ctl,
u64 offset, int bitmap_only, int fuzzy)
{
struct rb_node *n = ctl->free_space_offset.rb_node;
struct btrfs_free_space *entry, *prev = NULL;
/* find entry that is closest to the 'offset' */
while (1) {
if (!n) {
entry = NULL;
break;
}
entry = rb_entry(n, struct btrfs_free_space, offset_index);
prev = entry;
if (offset < entry->offset) n = n->rb_left; else if (offset > entry->offset) n = n->rb_right;
else
break;
}
if (bitmap_only) {
if (!entry)
return NULL; if (entry->bitmap)
return entry;
/*
* bitmap entry and extent entry may share same offset,
* in that case, bitmap entry comes after extent entry.
*/
n = rb_next(n);
if (!n)
return NULL;
entry = rb_entry(n, struct btrfs_free_space, offset_index);
if (entry->offset != offset)
return NULL;
WARN_ON(!entry->bitmap);
return entry;
} else if (entry) { if (entry->bitmap) {
/*
* if previous extent entry covers the offset,
* we should return it instead of the bitmap entry
*/
n = rb_prev(&entry->offset_index);
if (n) {
prev = rb_entry(n, struct btrfs_free_space,
offset_index);
if (!prev->bitmap && prev->offset + prev->bytes > offset)
entry = prev;
}
}
return entry;
}
if (!prev)
return NULL;
/* find last entry before the 'offset' */
entry = prev;
if (entry->offset > offset) { n = rb_prev(&entry->offset_index);
if (n) {
entry = rb_entry(n, struct btrfs_free_space,
offset_index);
ASSERT(entry->offset <= offset);
} else {
if (fuzzy)
return entry;
else
return NULL;
}
}
if (entry->bitmap) { n = rb_prev(&entry->offset_index);
if (n) {
prev = rb_entry(n, struct btrfs_free_space,
offset_index);
if (!prev->bitmap && prev->offset + prev->bytes > offset)
return prev;
}
if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
return entry;
} else if (entry->offset + entry->bytes > offset)
return entry;
if (!fuzzy)
return NULL;
while (1) {
if (entry->bitmap) {
if (entry->offset + BITS_PER_BITMAP *
ctl->unit > offset)
break;
} else {
if (entry->offset + entry->bytes > offset)
break;
}
n = rb_next(&entry->offset_index);
if (!n)
return NULL;
entry = rb_entry(n, struct btrfs_free_space, offset_index);
}
return entry;
}
static inline void
__unlink_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
rb_erase(&info->offset_index, &ctl->free_space_offset); ctl->free_extents--; if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]--;
ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
}
}
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
__unlink_free_space(ctl, info);
ctl->free_space -= info->bytes;
}
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
int ret = 0;
ASSERT(info->bytes || info->bitmap);
ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
&info->offset_index, (info->bitmap != NULL));
if (ret)
return ret;
if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]++;
ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
}
ctl->free_space += info->bytes;
ctl->free_extents++;
return ret;
}
static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info,
u64 offset, u64 bytes)
{
unsigned long start, count, end;
int extent_delta = -1;
start = offset_to_bit(info->offset, ctl->unit, offset);
count = bytes_to_bits(bytes, ctl->unit);
end = start + count;
ASSERT(end <= BITS_PER_BITMAP);
bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
if (info->max_extent_size > ctl->unit)
info->max_extent_size = 0; if (start && test_bit(start - 1, info->bitmap))
extent_delta++;
if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) extent_delta++; info->bitmap_extents += extent_delta;
if (!btrfs_free_space_trimmed(info)) {
ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
}
}
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes)
{
__bitmap_clear_bits(ctl, info, offset, bytes);
ctl->free_space -= bytes;
}
static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes)
{
unsigned long start, count, end;
int extent_delta = 1;
start = offset_to_bit(info->offset, ctl->unit, offset);
count = bytes_to_bits(bytes, ctl->unit);
end = start + count;
ASSERT(end <= BITS_PER_BITMAP);
bitmap_set(info->bitmap, start, count);
info->bytes += bytes;
ctl->free_space += bytes;
if (start && test_bit(start - 1, info->bitmap))
extent_delta--;
if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) extent_delta--; info->bitmap_extents += extent_delta;
if (!btrfs_free_space_trimmed(info)) {
ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
}
}
/*
* If we can not find suitable extent, we will use bytes to record
* the size of the max extent.
*/
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info, u64 *offset,
u64 *bytes, bool for_alloc)
{
unsigned long found_bits = 0;
unsigned long max_bits = 0;
unsigned long bits, i;
unsigned long next_zero;
unsigned long extent_bits;
/*
* Skip searching the bitmap if we don't have a contiguous section that
* is large enough for this allocation.
*/
if (for_alloc &&
bitmap_info->max_extent_size &&
bitmap_info->max_extent_size < *bytes) {
*bytes = bitmap_info->max_extent_size;
return -1;
}
i = offset_to_bit(bitmap_info->offset, ctl->unit,
max_t(u64, *offset, bitmap_info->offset));
bits = bytes_to_bits(*bytes, ctl->unit);
for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
if (for_alloc && bits == 1) {
found_bits = 1;
break;
}
next_zero = find_next_zero_bit(bitmap_info->bitmap,
BITS_PER_BITMAP, i);
extent_bits = next_zero - i;
if (extent_bits >= bits) {
found_bits = extent_bits;
break;
} else if (extent_bits > max_bits) {
max_bits = extent_bits;
}
i = next_zero;
}
if (found_bits) { *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
*bytes = (u64)(found_bits) * ctl->unit;
return 0;
}
*bytes = (u64)(max_bits) * ctl->unit;
bitmap_info->max_extent_size = *bytes;
return -1;
}
static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
{
if (entry->bitmap) return entry->max_extent_size;
return entry->bytes;
}
/* Cache the size of the max extent in bytes */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
unsigned long align, u64 *max_extent_size)
{
struct btrfs_free_space *entry;
struct rb_node *node;
u64 tmp;
u64 align_off;
int ret;
if (!ctl->free_space_offset.rb_node)
goto out;
entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
if (!entry)
goto out;
for (node = &entry->offset_index; node; node = rb_next(node)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bytes < *bytes) { *max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
continue;
}
/* make sure the space returned is big enough
* to match our requested alignment
*/
if (*bytes >= align) { tmp = entry->offset - ctl->start + align - 1;
tmp = div64_u64(tmp, align);
tmp = tmp * align + ctl->start;
align_off = tmp - entry->offset;
} else {
align_off = 0;
tmp = entry->offset;
}
if (entry->bytes < *bytes + align_off) {
*max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
continue;
}
if (entry->bitmap) { u64 size = *bytes;
ret = search_bitmap(ctl, entry, &tmp, &size, true);
if (!ret) {
*offset = tmp;
*bytes = size;
return entry;
} else {
*max_extent_size =
max(get_max_extent_size(entry),
*max_extent_size);
}
continue;
}
*offset = tmp;
*bytes = entry->bytes - align_off;
return entry;
}
out:
return NULL;
}
static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset)
{
info->offset = offset_to_bitmap(ctl, offset);
info->bytes = 0;
info->bitmap_extents = 0;
INIT_LIST_HEAD(&info->list);
link_free_space(ctl, info);
ctl->total_bitmaps++;
recalculate_thresholds(ctl);
}
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info)
{
/*
* Normally when this is called, the bitmap is completely empty. However,
* if we are blowing up the free space cache for one reason or another
* via __btrfs_remove_free_space_cache(), then it may not be freed and
* we may leave stats on the table.
*/
if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { ctl->discardable_extents[BTRFS_STAT_CURR] -=
bitmap_info->bitmap_extents;
ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
}
unlink_free_space(ctl, bitmap_info);
kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
ctl->total_bitmaps--;
recalculate_thresholds(ctl);
}
static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info,
u64 *offset, u64 *bytes)
{
u64 end;
u64 search_start, search_bytes;
int ret;
again:
end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
/*
* We need to search for bits in this bitmap. We could only cover some
* of the extent in this bitmap thanks to how we add space, so we need
* to search for as much as it as we can and clear that amount, and then
* go searching for the next bit.
*/
search_start = *offset;
search_bytes = ctl->unit;
search_bytes = min(search_bytes, end - search_start + 1);
ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
false);
if (ret < 0 || search_start != *offset)
return -EINVAL;
/* We may have found more bits than what we need */
search_bytes = min(search_bytes, *bytes);
/* Cannot clear past the end of the bitmap */
search_bytes = min(search_bytes, end - search_start + 1);
bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
*offset += search_bytes;
*bytes -= search_bytes;
if (*bytes) {
struct rb_node *next = rb_next(&bitmap_info->offset_index);
if (!bitmap_info->bytes)
free_bitmap(ctl, bitmap_info);
/*
* no entry after this bitmap, but we still have bytes to
* remove, so something has gone wrong.
*/
if (!next)
return -EINVAL;
bitmap_info = rb_entry(next, struct btrfs_free_space,
offset_index);
/*
* if the next entry isn't a bitmap we need to return to let the
* extent stuff do its work.
*/
if (!bitmap_info->bitmap)
return -EAGAIN;
/*
* Ok the next item is a bitmap, but it may not actually hold
* the information for the rest of this free space stuff, so
* look for it, and if we don't find it return so we can try
* everything over again.
*/
search_start = *offset;
search_bytes = ctl->unit;
ret = search_bitmap(ctl, bitmap_info, &search_start,
&search_bytes, false);
if (ret < 0 || search_start != *offset)
return -EAGAIN;
goto again;
} else if (!bitmap_info->bytes)
free_bitmap(ctl, bitmap_info);
return 0;
}
static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes, enum btrfs_trim_state trim_state)
{
u64 bytes_to_set = 0;
u64 end;
/*
* This is a tradeoff to make bitmap trim state minimal. We mark the
* whole bitmap untrimmed if at any point we add untrimmed regions.
*/
if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { if (btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR] +=
info->bitmap_extents;
ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
}
info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
}
end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); bytes_to_set = min(end - offset, bytes);
bitmap_set_bits(ctl, info, offset, bytes_to_set);
/*
* We set some bytes, we have no idea what the max extent size is
* anymore.
*/
info->max_extent_size = 0;
return bytes_to_set;
}
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
struct btrfs_block_group *block_group = ctl->private; struct btrfs_fs_info *fs_info = block_group->fs_info;
bool forced = false;
#ifdef CONFIG_BTRFS_DEBUG
if (btrfs_should_fragment_free_space(block_group))
forced = true;
#endif
/* This is a way to reclaim large regions from the bitmaps. */
if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) return false;
/*
* If we are below the extents threshold then we can add this as an
* extent, and don't have to deal with the bitmap
*/
if (!forced && ctl->free_extents < ctl->extents_thresh) {
/*
* If this block group has some small extents we don't want to
* use up all of our free slots in the cache with them, we want
* to reserve them to larger extents, however if we have plenty
* of cache left then go ahead an dadd them, no sense in adding
* the overhead of a bitmap if we don't have to.
*/
if (info->bytes <= fs_info->sectorsize * 8) {
if (ctl->free_extents * 3 <= ctl->extents_thresh)
return false;
} else {
return false;
}
}
/*
* The original block groups from mkfs can be really small, like 8
* megabytes, so don't bother with a bitmap for those entries. However
* some block groups can be smaller than what a bitmap would cover but
* are still large enough that they could overflow the 32k memory limit,
* so allow those block groups to still be allowed to have a bitmap
* entry.
*/
if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
return false;
return true;
}
static const struct btrfs_free_space_op free_space_op = {
.use_bitmap = use_bitmap,
};
static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
struct btrfs_free_space *bitmap_info;
struct btrfs_block_group *block_group = NULL;
int added = 0;
u64 bytes, offset, bytes_added;
enum btrfs_trim_state trim_state;
int ret;
bytes = info->bytes;
offset = info->offset;
trim_state = info->trim_state;
if (!ctl->op->use_bitmap(ctl, info))
return 0;
if (ctl->op == &free_space_op) block_group = ctl->private;
again:
/*
* Since we link bitmaps right into the cluster we need to see if we
* have a cluster here, and if so and it has our bitmap we need to add
* the free space to that bitmap.
*/
if (block_group && !list_empty(&block_group->cluster_list)) {
struct btrfs_free_cluster *cluster;
struct rb_node *node;
struct btrfs_free_space *entry;
cluster = list_entry(block_group->cluster_list.next,
struct btrfs_free_cluster,
block_group_list);
spin_lock(&cluster->lock);
node = rb_first(&cluster->root);
if (!node) {
spin_unlock(&cluster->lock);
goto no_cluster_bitmap;
}
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (!entry->bitmap) {
spin_unlock(&cluster->lock);
goto no_cluster_bitmap;
}
if (entry->offset == offset_to_bitmap(ctl, offset)) {
bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
bytes, trim_state);
bytes -= bytes_added;
offset += bytes_added;
}
spin_unlock(&cluster->lock);
if (!bytes) {
ret = 1;
goto out;
}
}
no_cluster_bitmap:
bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!bitmap_info) {
ASSERT(added == 0);
goto new_bitmap;
}
bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
trim_state);
bytes -= bytes_added;
offset += bytes_added;
added = 0;
if (!bytes) {
ret = 1;
goto out;
} else
goto again;
new_bitmap:
if (info && info->bitmap) {
add_new_bitmap(ctl, info, offset);
added = 1;
info = NULL;
goto again;
} else {
spin_unlock(&ctl->tree_lock);
/* no pre-allocated info, allocate a new one */
if (!info) {
info = kmem_cache_zalloc(btrfs_free_space_cachep,
GFP_NOFS);
if (!info) {
spin_lock(&ctl->tree_lock);
ret = -ENOMEM;
goto out;
}
}
/* allocate the bitmap */
info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
GFP_NOFS);
info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
spin_lock(&ctl->tree_lock);
if (!info->bitmap) {
ret = -ENOMEM;
goto out;
}
goto again;
}
out:
if (info) { if (info->bitmap) kmem_cache_free(btrfs_free_space_bitmap_cachep,
info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, info);
}
return ret;
}
/*
* Free space merging rules:
* 1) Merge trimmed areas together
* 2) Let untrimmed areas coalesce with trimmed areas
* 3) Always pull neighboring regions from bitmaps
*
* The above rules are for when we merge free space based on btrfs_trim_state.
* Rules 2 and 3 are subtle because they are suboptimal, but are done for the
* same reason: to promote larger extent regions which makes life easier for
* find_free_extent(). Rule 2 enables coalescing based on the common path
* being returning free space from btrfs_finish_extent_commit(). So when free
* space is trimmed, it will prevent aggregating trimmed new region and
* untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents
* and provide find_free_extent() with the largest extents possible hoping for
* the reuse path.
*/
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, bool update_stat)
{
struct btrfs_free_space *left_info = NULL;
struct btrfs_free_space *right_info;
bool merged = false;
u64 offset = info->offset;
u64 bytes = info->bytes;
const bool is_trimmed = btrfs_free_space_trimmed(info);
/*
* first we want to see if there is free space adjacent to the range we
* are adding, if there is remove that struct and add a new one to
* cover the entire range
*/
right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
if (right_info && rb_prev(&right_info->offset_index)) left_info = rb_entry(rb_prev(&right_info->offset_index),
struct btrfs_free_space, offset_index);
else if (!right_info)
left_info = tree_search_offset(ctl, offset - 1, 0, 0);
/* See try_merge_free_space() comment. */
if (right_info && !right_info->bitmap && (!is_trimmed || btrfs_free_space_trimmed(right_info))) { if (update_stat)
unlink_free_space(ctl, right_info);
else
__unlink_free_space(ctl, right_info);
info->bytes += right_info->bytes;
kmem_cache_free(btrfs_free_space_cachep, right_info);
merged = true;
}
/* See try_merge_free_space() comment. */
if (left_info && !left_info->bitmap && left_info->offset + left_info->bytes == offset && (!is_trimmed || btrfs_free_space_trimmed(left_info))) { if (update_stat)
unlink_free_space(ctl, left_info);
else
__unlink_free_space(ctl, left_info);
info->offset = left_info->offset;
info->bytes += left_info->bytes;
kmem_cache_free(btrfs_free_space_cachep, left_info);
merged = true;
}
return merged;
}
static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info,
bool update_stat)
{
struct btrfs_free_space *bitmap;
unsigned long i;
unsigned long j;
const u64 end = info->offset + info->bytes;
const u64 bitmap_offset = offset_to_bitmap(ctl, end);
u64 bytes;
bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
if (!bitmap)
return false;
i = offset_to_bit(bitmap->offset, ctl->unit, end);
j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
if (j == i)
return false;
bytes = (j - i) * ctl->unit;
info->bytes += bytes;
/* See try_merge_free_space() comment. */
if (!btrfs_free_space_trimmed(bitmap))
info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; if (update_stat)
bitmap_clear_bits(ctl, bitmap, end, bytes);
else
__bitmap_clear_bits(ctl, bitmap, end, bytes);
if (!bitmap->bytes) free_bitmap(ctl, bitmap);
return true;
}
static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info,
bool update_stat)
{
struct btrfs_free_space *bitmap;
u64 bitmap_offset;
unsigned long i;
unsigned long j;
unsigned long prev_j;
u64 bytes;
bitmap_offset = offset_to_bitmap(ctl, info->offset);
/* If we're on a boundary, try the previous logical bitmap. */
if (bitmap_offset == info->offset) {
if (info->offset == 0)
return false;
bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
}
bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
if (!bitmap)
return false;
i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
j = 0;
prev_j = (unsigned long)-1;
for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { if (j > i)
break;
prev_j = j;
}
if (prev_j == i)
return false;
if (prev_j == (unsigned long)-1) bytes = (i + 1) * ctl->unit;
else
bytes = (i - prev_j) * ctl->unit; info->offset -= bytes;
info->bytes += bytes;
/* See try_merge_free_space() comment. */
if (!btrfs_free_space_trimmed(bitmap))
info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; if (update_stat)
bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
else
__bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
if (!bitmap->bytes) free_bitmap(ctl, bitmap);
return true;
}
/*
* We prefer always to allocate from extent entries, both for clustered and
* non-clustered allocation requests. So when attempting to add a new extent
* entry, try to see if there's adjacent free space in bitmap entries, and if
* there is, migrate that space from the bitmaps to the extent.
* Like this we get better chances of satisfying space allocation requests
* because we attempt to satisfy them based on a single cache entry, and never
* on 2 or more entries - even if the entries represent a contiguous free space
* region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
* ends).
*/
static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info,
bool update_stat)
{
/*
* Only work with disconnected entries, as we can change their offset,
* and must be extent entries.
*/
ASSERT(!info->bitmap);
ASSERT(RB_EMPTY_NODE(&info->offset_index));
if (ctl->total_bitmaps > 0) {
bool stole_end;
bool stole_front = false;
stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); if (ctl->total_bitmaps > 0)
stole_front = steal_from_bitmap_to_front(ctl, info,
update_stat);
if (stole_end || stole_front)
try_merge_free_space(ctl, info, update_stat);
}
}
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_free_space_ctl *ctl,
u64 offset, u64 bytes,
enum btrfs_trim_state trim_state)
{
struct btrfs_block_group *block_group = ctl->private;
struct btrfs_free_space *info;
int ret = 0;
u64 filter_bytes = bytes;
ASSERT(!btrfs_is_zoned(fs_info));
info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
if (!info)
return -ENOMEM;
info->offset = offset;
info->bytes = bytes;
info->trim_state = trim_state;
RB_CLEAR_NODE(&info->offset_index);
spin_lock(&ctl->tree_lock);
if (try_merge_free_space(ctl, info, true))
goto link;
/*
* There was no extent directly to the left or right of this new
* extent then we know we're going to have to allocate a new extent, so
* before we do that see if we need to drop this into a bitmap
*/
ret = insert_into_bitmap(ctl, info);
if (ret < 0) {
goto out;
} else if (ret) {
ret = 0;
goto out;
}
link:
/*
* Only steal free space from adjacent bitmaps if we're sure we're not
* going to add the new free space to existing bitmap entries - because
* that would mean unnecessary work that would be reverted. Therefore
* attempt to steal space from bitmaps if we're adding an extent entry.
*/
steal_from_bitmap(ctl, info, true);
filter_bytes = max(filter_bytes, info->bytes);
ret = link_free_space(ctl, info);
if (ret)
kmem_cache_free(btrfs_free_space_cachep, info);
out:
btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
if (ret) {
btrfs_crit(fs_info, "unable to add free space :%d", ret);
ASSERT(ret != -EEXIST);
}
if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { btrfs_discard_check_filter(block_group, filter_bytes); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
}
return ret;
}
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 bytenr, u64 size, bool used)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
spin_lock(&ctl->tree_lock);
if (!used)
to_free = size;
else if (offset >= block_group->alloc_offset)
to_free = size;
else if (offset + size <= block_group->alloc_offset)
to_free = 0;
else
to_free = offset + size - block_group->alloc_offset;
to_unusable = size - to_free;
ctl->free_space += to_free;
/*
* If the block group is read-only, we should account freed space into
* bytes_readonly.
*/
if (!block_group->ro)
block_group->zone_unusable += to_unusable;
spin_unlock(&ctl->tree_lock);
if (!used) {
spin_lock(&block_group->lock);
block_group->alloc_offset -= size;
spin_unlock(&block_group->lock);
}
/* All the region is now unusable. Mark it as unused and reclaim */
if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
} else if (bg_reclaim_threshold &&
block_group->zone_unusable >=
div_factor_fine(block_group->length, bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}
return 0;
}
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size)
{
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
if (btrfs_is_zoned(block_group->fs_info)) return __btrfs_add_free_space_zoned(block_group, bytenr, size,
true);
if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
trim_state = BTRFS_TRIM_STATE_TRIMMED;
return __btrfs_add_free_space(block_group->fs_info,
block_group->free_space_ctl,
bytenr, size, trim_state);
}
int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
u64 bytenr, u64 size)
{
if (btrfs_is_zoned(block_group->fs_info))
return __btrfs_add_free_space_zoned(block_group, bytenr, size,
false);
return btrfs_add_free_space(block_group, bytenr, size);
}
/*
* This is a subtle distinction because when adding free space back in general,
* we want it to be added as untrimmed for async. But in the case where we add
* it on loading of a block group, we want to consider it trimmed.
*/
int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
u64 bytenr, u64 size)
{
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
if (btrfs_is_zoned(block_group->fs_info)) return __btrfs_add_free_space_zoned(block_group, bytenr, size,
true);
if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
trim_state = BTRFS_TRIM_STATE_TRIMMED;
return __btrfs_add_free_space(block_group->fs_info,
block_group->free_space_ctl,
bytenr, size, trim_state);
}
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 offset, u64 bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
int ret;
bool re_search = false;
if (btrfs_is_zoned(block_group->fs_info)) {
/*
* This can happen with conventional zones when replaying log.
* Since the allocation info of tree-log nodes are not recorded
* to the extent-tree, calculate_alloc_pointer() failed to
* advance the allocation pointer after last allocated tree log
* node blocks.
*
* This function is called from
* btrfs_pin_extent_for_log_replay() when replaying the log.
* Advance the pointer not to overwrite the tree-log nodes.
*/
if (block_group->start + block_group->alloc_offset <
offset + bytes) {
block_group->alloc_offset =
offset + bytes - block_group->start;
}
return 0;
}
spin_lock(&ctl->tree_lock);
again:
ret = 0;
if (!bytes)
goto out_lock;
info = tree_search_offset(ctl, offset, 0, 0);
if (!info) {
/*
* oops didn't find an extent that matched the space we wanted
* to remove, look for a bitmap instead
*/
info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!info) {
/*
* If we found a partial bit of our free space in a
* bitmap but then couldn't find the other part this may
* be a problem, so WARN about it.
*/
WARN_ON(re_search);
goto out_lock;
}
}
re_search = false;
if (!info->bitmap) {
unlink_free_space(ctl, info);
if (offset == info->offset) {
u64 to_free = min(bytes, info->bytes);
info->bytes -= to_free;
info->offset += to_free;
if (info->bytes) {
ret = link_free_space(ctl, info);
WARN_ON(ret);
} else {
kmem_cache_free(btrfs_free_space_cachep, info);
}
offset += to_free;
bytes -= to_free;
goto again;
} else {
u64 old_end = info->bytes + info->offset;
info->bytes = offset - info->offset;
ret = link_free_space(ctl, info);
WARN_ON(ret);
if (ret)
goto out_lock;
/* Not enough bytes in this entry to satisfy us */
if (old_end < offset + bytes) {
bytes -= old_end - offset;
offset = old_end;
goto again;
} else if (old_end == offset + bytes) {
/* all done */
goto out_lock;
}
spin_unlock(&ctl->tree_lock);
ret = __btrfs_add_free_space(block_group->fs_info, ctl,
offset + bytes,
old_end - (offset + bytes),
info->trim_state);
WARN_ON(ret);
goto out;
}
}
ret = remove_from_bitmap(ctl, info, &offset, &bytes);
if (ret == -EAGAIN) {
re_search = true;
goto again;
}
out_lock:
btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
out:
return ret;
}
void btrfs_dump_free_space(struct btrfs_block_group *block_group,
u64 bytes)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
struct rb_node *n;
int count = 0;
/*
* Zoned btrfs does not use free space tree and cluster. Just print
* out the free space after the allocation offset.
*/
if (btrfs_is_zoned(fs_info)) {
btrfs_info(fs_info, "free space %llu",
block_group->length - block_group->alloc_offset);
return;
}
spin_lock(&ctl->tree_lock);
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes && !block_group->ro)
count++;
btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
info->offset, info->bytes,
(info->bitmap) ? "yes" : "no");
}
spin_unlock(&ctl->tree_lock);
btrfs_info(fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes");
btrfs_info(fs_info,
"%d blocks of free space at or bigger than bytes is", count);
}
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
struct btrfs_free_space_ctl *ctl)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
spin_lock_init(&ctl->tree_lock);
ctl->unit = fs_info->sectorsize;
ctl->start = block_group->start;
ctl->private = block_group;
ctl->op = &free_space_op;
INIT_LIST_HEAD(&ctl->trimming_ranges);
mutex_init(&ctl->cache_writeout_mutex);
/*
* we only want to have 32k of ram per block group for keeping
* track of free space, and if we pass 1/2 of that we want to
* start converting things over to using bitmaps
*/
ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
}
/*
* for a given cluster, put all of its extents back into the free
* space cache. If the block group passed doesn't match the block group
* pointed to by the cluster, someone else raced in and freed the
* cluster already. In that case, we just return without changing anything
*/
static void __btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
struct rb_node *node;
spin_lock(&cluster->lock);
if (cluster->block_group != block_group) {
spin_unlock(&cluster->lock);
return;
}
cluster->block_group = NULL;
cluster->window_start = 0;
list_del_init(&cluster->block_group_list);
node = rb_first(&cluster->root);
while (node) {
bool bitmap;
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
rb_erase(&entry->offset_index, &cluster->root);
RB_CLEAR_NODE(&entry->offset_index);
bitmap = (entry->bitmap != NULL);
if (!bitmap) {
/* Merging treats extents as if they were new */
if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--;
ctl->discardable_bytes[BTRFS_STAT_CURR] -=
entry->bytes;
}
try_merge_free_space(ctl, entry, false);
steal_from_bitmap(ctl, entry, false);
/* As we insert directly, update these statistics */
if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]++;
ctl->discardable_bytes[BTRFS_STAT_CURR] +=
entry->bytes;
}
}
tree_insert_offset(&ctl->free_space_offset,
entry->offset, &entry->offset_index, bitmap);
}
cluster->root = RB_ROOT;
spin_unlock(&cluster->lock);
btrfs_put_block_group(block_group);
}
static void __btrfs_remove_free_space_cache_locked(
struct btrfs_free_space_ctl *ctl)
{
struct btrfs_free_space *info;
struct rb_node *node;
while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
info = rb_entry(node, struct btrfs_free_space, offset_index);
if (!info->bitmap) {
unlink_free_space(ctl, info);
kmem_cache_free(btrfs_free_space_cachep, info);
} else {
free_bitmap(ctl, info);
}
cond_resched_lock(&ctl->tree_lock);
}
}
void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
spin_lock(&ctl->tree_lock);
__btrfs_remove_free_space_cache_locked(ctl);
if (ctl->private)
btrfs_discard_update_discardable(ctl->private);
spin_unlock(&ctl->tree_lock);
}
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_cluster *cluster;
struct list_head *head;
spin_lock(&ctl->tree_lock);
while ((head = block_group->cluster_list.next) !=
&block_group->cluster_list) {
cluster = list_entry(head, struct btrfs_free_cluster,
block_group_list);
WARN_ON(cluster->block_group != block_group); __btrfs_return_cluster_to_free_space(block_group, cluster);
cond_resched_lock(&ctl->tree_lock);
}
__btrfs_remove_free_space_cache_locked(ctl);
btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
}
/**
* btrfs_is_free_space_trimmed - see if everything is trimmed
* @block_group: block_group of interest
*
* Walk @block_group's free space rb_tree to determine if everything is trimmed.
*/
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
struct rb_node *node;
bool ret = true;
spin_lock(&ctl->tree_lock);
node = rb_first(&ctl->free_space_offset);
while (node) {
info = rb_entry(node, struct btrfs_free_space, offset_index);
if (!btrfs_free_space_trimmed(info)) {
ret = false;
break;
}
node = rb_next(node);
}
spin_unlock(&ctl->tree_lock);
return ret;
}
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_discard_ctl *discard_ctl =
&block_group->fs_info->discard_ctl;
struct btrfs_free_space *entry = NULL;
u64 bytes_search = bytes + empty_size;
u64 ret = 0;
u64 align_gap = 0;
u64 align_gap_len = 0;
enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
ASSERT(!btrfs_is_zoned(block_group->fs_info));
spin_lock(&ctl->tree_lock);
entry = find_free_space(ctl, &offset, &bytes_search,
block_group->full_stripe_len, max_extent_size);
if (!entry)
goto out;
ret = offset;
if (entry->bitmap) {
bitmap_clear_bits(ctl, entry, offset, bytes);
if (!btrfs_free_space_trimmed(entry))
atomic64_add(bytes, &discard_ctl->discard_bytes_saved); if (!entry->bytes) free_bitmap(ctl, entry);
} else {
unlink_free_space(ctl, entry);
align_gap_len = offset - entry->offset;
align_gap = entry->offset;
align_gap_trim_state = entry->trim_state;
if (!btrfs_free_space_trimmed(entry))
atomic64_add(bytes, &discard_ctl->discard_bytes_saved); entry->offset = offset + bytes; WARN_ON(entry->bytes < bytes + align_gap_len); entry->bytes -= bytes + align_gap_len;
if (!entry->bytes)
kmem_cache_free(btrfs_free_space_cachep, entry);
else
link_free_space(ctl, entry);
}
out:
btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
if (align_gap_len)
__btrfs_add_free_space(block_group->fs_info, ctl,
align_gap, align_gap_len,
align_gap_trim_state);
return ret;
}
/*
* given a cluster, put all of its extents back into the free space
* cache. If a block group is passed, this function will only free
* a cluster that belongs to the passed block group.
*
* Otherwise, it'll get a reference on the block group pointed to by the
* cluster and remove the cluster from it.
*/
void btrfs_return_cluster_to_free_space(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl;
/* first, get a safe pointer to the block group */
spin_lock(&cluster->lock);
if (!block_group) {
block_group = cluster->block_group;
if (!block_group) {
spin_unlock(&cluster->lock);
return;
}
} else if (cluster->block_group != block_group) {
/* someone else has already freed it don't redo their work */
spin_unlock(&cluster->lock);
return;
}
btrfs_get_block_group(block_group);
spin_unlock(&cluster->lock);
ctl = block_group->free_space_ctl;
/* now return any extents the cluster had on it */
spin_lock(&ctl->tree_lock);
__btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&ctl->tree_lock);
btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
/* finally drop our ref */
btrfs_put_block_group(block_group);
}
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
struct btrfs_free_space *entry,
u64 bytes, u64 min_start,
u64 *max_extent_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
int err;
u64 search_start = cluster->window_start;
u64 search_bytes = bytes;
u64 ret = 0;
search_start = min_start;
search_bytes = bytes;
err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
if (err) {
*max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
return 0;
}
ret = search_start;
__bitmap_clear_bits(ctl, entry, ret, bytes);
return ret;
}
/*
* given a cluster, try to allocate 'bytes' from it, returns 0
* if it couldn't find anything suitably large, or a logical disk offset
* if things worked out
*/
u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start, u64 *max_extent_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_discard_ctl *discard_ctl =
&block_group->fs_info->discard_ctl;
struct btrfs_free_space *entry = NULL;
struct rb_node *node;
u64 ret = 0;
ASSERT(!btrfs_is_zoned(block_group->fs_info));
spin_lock(&cluster->lock);
if (bytes > cluster->max_size)
goto out;
if (cluster->block_group != block_group)
goto out;
node = rb_first(&cluster->root);
if (!node)
goto out;
entry = rb_entry(node, struct btrfs_free_space, offset_index);
while (1) {
if (entry->bytes < bytes) *max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
if (entry->bytes < bytes || (!entry->bitmap && entry->offset < min_start)) {
node = rb_next(&entry->offset_index);
if (!node)
break;
entry = rb_entry(node, struct btrfs_free_space,
offset_index);
continue;
}
if (entry->bitmap) {
ret = btrfs_alloc_from_bitmap(block_group,
cluster, entry, bytes,
cluster->window_start,
max_extent_size);
if (ret == 0) {
node = rb_next(&entry->offset_index);
if (!node)
break;
entry = rb_entry(node, struct btrfs_free_space,
offset_index);
continue;
}
cluster->window_start += bytes;
} else {
ret = entry->offset;
entry->offset += bytes;
entry->bytes -= bytes;
}
break;
}
out:
spin_unlock(&cluster->lock);
if (!ret)
return 0;
spin_lock(&ctl->tree_lock);
if (!btrfs_free_space_trimmed(entry))
atomic64_add(bytes, &discard_ctl->discard_bytes_saved); ctl->free_space -= bytes; if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
spin_lock(&cluster->lock);
if (entry->bytes == 0) {
rb_erase(&entry->offset_index, &cluster->root);
ctl->free_extents--;
if (entry->bitmap) {
kmem_cache_free(btrfs_free_space_bitmap_cachep,
entry->bitmap);
ctl->total_bitmaps--;
recalculate_thresholds(ctl);
} else if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--;
}
kmem_cache_free(btrfs_free_space_cachep, entry);
}
spin_unlock(&cluster->lock);
spin_unlock(&ctl->tree_lock);
return ret;
}
static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_space *entry,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes,
u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
unsigned long next_zero;
unsigned long i;
unsigned long want_bits;
unsigned long min_bits;
unsigned long found_bits;
unsigned long max_bits = 0;
unsigned long start = 0;
unsigned long total_found = 0;
int ret;
i = offset_to_bit(entry->offset, ctl->unit,
max_t(u64, offset, entry->offset));
want_bits = bytes_to_bits(bytes, ctl->unit);
min_bits = bytes_to_bits(min_bytes, ctl->unit);
/*
* Don't bother looking for a cluster in this bitmap if it's heavily
* fragmented.
*/
if (entry->max_extent_size &&
entry->max_extent_size < cont1_bytes)
return -ENOSPC;
again:
found_bits = 0;
for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
next_zero = find_next_zero_bit(entry->bitmap,
BITS_PER_BITMAP, i);
if (next_zero - i >= min_bits) {
found_bits = next_zero - i;
if (found_bits > max_bits)
max_bits = found_bits;
break;
}
if (next_zero - i > max_bits)
max_bits = next_zero - i;
i = next_zero;
}
if (!found_bits) {
entry->max_extent_size = (u64)max_bits * ctl->unit;
return -ENOSPC;
}
if (!total_found) {
start = i;
cluster->max_size = 0;
}
total_found += found_bits;
if (cluster->max_size < found_bits * ctl->unit)
cluster->max_size = found_bits * ctl->unit;
if (total_found < want_bits || cluster->max_size < cont1_bytes) {
i = next_zero + 1;
goto again;
}
cluster->window_start = start * ctl->unit + entry->offset;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 1);
ASSERT(!ret); /* -EEXIST; Logic error */
trace_btrfs_setup_cluster(block_group, cluster,
total_found * ctl->unit, 1);
return 0;
}
/*
* This searches the block group for just extents to fill the cluster with.
* Try to find a cluster with at least bytes total bytes, at least one
* extent of cont1_bytes, and other clusters of at least min_bytes.
*/
static noinline int
setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *first = NULL;
struct btrfs_free_space *entry = NULL;
struct btrfs_free_space *last;
struct rb_node *node;
u64 window_free;
u64 max_extent;
u64 total_size = 0;
entry = tree_search_offset(ctl, offset, 0, 1);
if (!entry)
return -ENOSPC;
/*
* We don't want bitmaps, so just move along until we find a normal
* extent entry.
*/
while (entry->bitmap || entry->bytes < min_bytes) { if (entry->bitmap && list_empty(&entry->list))
list_add_tail(&entry->list, bitmaps);
node = rb_next(&entry->offset_index);
if (!node)
return -ENOSPC;
entry = rb_entry(node, struct btrfs_free_space, offset_index);
}
window_free = entry->bytes;
max_extent = entry->bytes;
first = entry;
last = entry;
for (node = rb_next(&entry->offset_index); node; node = rb_next(&entry->offset_index)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bitmap) { if (list_empty(&entry->list))
list_add_tail(&entry->list, bitmaps);
continue;
}
if (entry->bytes < min_bytes)
continue;
last = entry;
window_free += entry->bytes;
if (entry->bytes > max_extent)
max_extent = entry->bytes;
}
if (window_free < bytes || max_extent < cont1_bytes)
return -ENOSPC;
cluster->window_start = first->offset;
node = &first->offset_index;
/*
* now we've found our entries, pull them out of the free space
* cache and put them into the cluster rbtree
*/
do {
int ret;
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index); if (entry->bitmap || entry->bytes < min_bytes)
continue;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 0);
total_size += entry->bytes;
ASSERT(!ret); /* -EEXIST; Logic error */
} while (node && entry != last); cluster->max_size = max_extent;
trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
return 0;
}
/*
* This specifically looks for bitmaps that may work in the cluster, we assume
* that we have already failed to find extents that will work.
*/
static noinline int
setup_cluster_bitmap(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry = NULL;
int ret = -ENOSPC;
u64 bitmap_offset = offset_to_bitmap(ctl, offset);
if (ctl->total_bitmaps == 0)
return -ENOSPC;
/*
* The bitmap that covers offset won't be in the list unless offset
* is just its start offset.
*/
if (!list_empty(bitmaps))
entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
if (!entry || entry->offset != bitmap_offset) {
entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
if (entry && list_empty(&entry->list))
list_add(&entry->list, bitmaps);
}
list_for_each_entry(entry, bitmaps, list) {
if (entry->bytes < bytes)
continue;
ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
bytes, cont1_bytes, min_bytes);
if (!ret)
return 0;
}
/*
* The bitmaps list has all the bitmaps that record free space
* starting after offset, so no more search is required.
*/
return -ENOSPC;
}
/*
* here we try to find a cluster of blocks in a block group. The goal
* is to find at least bytes+empty_size.
* We might not find them all in one contiguous area.
*
* returns zero and sets up cluster if things worked out, otherwise
* it returns -enospc
*/
int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes, u64 empty_size)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry, *tmp;
LIST_HEAD(bitmaps);
u64 min_bytes;
u64 cont1_bytes;
int ret;
/*
* Choose the minimum extent size we'll require for this
* cluster. For SSD_SPREAD, don't allow any fragmentation.
* For metadata, allow allocates with smaller extents. For
* data, keep it dense.
*/
if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
cont1_bytes = bytes;
min_bytes = fs_info->sectorsize;
} else {
cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
min_bytes = fs_info->sectorsize;
}
spin_lock(&ctl->tree_lock);
/*
* If we know we don't have enough space to make a cluster don't even
* bother doing all the work to try and find one.
*/
if (ctl->free_space < bytes) {
spin_unlock(&ctl->tree_lock);
return -ENOSPC;
}
spin_lock(&cluster->lock);
/* someone already found a cluster, hooray */
if (cluster->block_group) {
ret = 0;
goto out;
}
trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
min_bytes);
ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
bytes + empty_size,
cont1_bytes, min_bytes);
if (ret)
ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
offset, bytes + empty_size,
cont1_bytes, min_bytes);
/* Clear our temporary list */
list_for_each_entry_safe(entry, tmp, &bitmaps, list)
list_del_init(&entry->list);
if (!ret) { btrfs_get_block_group(block_group);
list_add_tail(&cluster->block_group_list,
&block_group->cluster_list);
cluster->block_group = block_group;
} else {
trace_btrfs_failed_cluster_setup(block_group);
}
out:
spin_unlock(&cluster->lock);
spin_unlock(&ctl->tree_lock);
return ret;
}
/*
* simple code to zero out a cluster
*/
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
spin_lock_init(&cluster->lock);
spin_lock_init(&cluster->refill_lock);
cluster->root = RB_ROOT;
cluster->max_size = 0;
cluster->fragmented = false;
INIT_LIST_HEAD(&cluster->block_group_list);
cluster->block_group = NULL;
}
static int do_trimming(struct btrfs_block_group *block_group,
u64 *total_trimmed, u64 start, u64 bytes,
u64 reserved_start, u64 reserved_bytes,
enum btrfs_trim_state reserved_trim_state,
struct btrfs_trim_range *trim_entry)
{
struct btrfs_space_info *space_info = block_group->space_info;
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
int ret;
int update = 0;
const u64 end = start + bytes;
const u64 reserved_end = reserved_start + reserved_bytes;
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
u64 trimmed = 0;
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
if (!block_group->ro) {
block_group->reserved += reserved_bytes;
space_info->bytes_reserved += reserved_bytes;
update = 1;
}
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
if (!ret) {
*total_trimmed += trimmed;
trim_state = BTRFS_TRIM_STATE_TRIMMED;
}
mutex_lock(&ctl->cache_writeout_mutex);
if (reserved_start < start)
__btrfs_add_free_space(fs_info, ctl, reserved_start,
start - reserved_start,
reserved_trim_state);
if (start + bytes < reserved_start + reserved_bytes)
__btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
reserved_trim_state);
__btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
list_del(&trim_entry->list);
mutex_unlock(&ctl->cache_writeout_mutex);
if (update) {
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
if (block_group->ro)
space_info->bytes_readonly += reserved_bytes;
block_group->reserved -= reserved_bytes;
space_info->bytes_reserved -= reserved_bytes;
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
}
return ret;
}
/*
* If @async is set, then we will trim 1 region and return.
*/
static int trim_no_bitmap(struct btrfs_block_group *block_group,
u64 *total_trimmed, u64 start, u64 end, u64 minlen,
bool async)
{
struct btrfs_discard_ctl *discard_ctl =
&block_group->fs_info->discard_ctl;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
struct rb_node *node;
int ret = 0;
u64 extent_start;
u64 extent_bytes;
enum btrfs_trim_state extent_trim_state;
u64 bytes;
const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
while (start < end) {
struct btrfs_trim_range trim_entry;
mutex_lock(&ctl->cache_writeout_mutex);
spin_lock(&ctl->tree_lock);
if (ctl->free_space < minlen)
goto out_unlock;
entry = tree_search_offset(ctl, start, 0, 1);
if (!entry)
goto out_unlock;
/* Skip bitmaps and if async, already trimmed entries */
while (entry->bitmap ||
(async && btrfs_free_space_trimmed(entry))) {
node = rb_next(&entry->offset_index);
if (!node)
goto out_unlock;
entry = rb_entry(node, struct btrfs_free_space,
offset_index);
}
if (entry->offset >= end)
goto out_unlock;
extent_start = entry->offset;
extent_bytes = entry->bytes;
extent_trim_state = entry->trim_state;
if (async) {
start = entry->offset;
bytes = entry->bytes;
if (bytes < minlen) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
goto next;
}
unlink_free_space(ctl, entry);
/*
* Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
* If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
* X when we come back around. So trim it now.
*/
if (max_discard_size &&
bytes >= (max_discard_size +
BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
bytes = max_discard_size;
extent_bytes = max_discard_size;
entry->offset += max_discard_size;
entry->bytes -= max_discard_size;
link_free_space(ctl, entry);
} else {
kmem_cache_free(btrfs_free_space_cachep, entry);
}
} else {
start = max(start, extent_start);
bytes = min(extent_start + extent_bytes, end) - start;
if (bytes < minlen) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
goto next;
}
unlink_free_space(ctl, entry);
kmem_cache_free(btrfs_free_space_cachep, entry);
}
spin_unlock(&ctl->tree_lock);
trim_entry.start = extent_start;
trim_entry.bytes = extent_bytes;
list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
mutex_unlock(&ctl->cache_writeout_mutex);
ret = do_trimming(block_group, total_trimmed, start, bytes,
extent_start, extent_bytes, extent_trim_state,
&trim_entry);
if (ret) {
block_group->discard_cursor = start + bytes;
break;
}
next:
start += bytes;
block_group->discard_cursor = start;
if (async && *total_trimmed)
break;
if (fatal_signal_pending(current)) {
ret = -ERESTARTSYS;
break;
}
cond_resched();
}
return ret;
out_unlock:
block_group->discard_cursor = btrfs_block_group_end(block_group);
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
return ret;
}
/*
* If we break out of trimming a bitmap prematurely, we should reset the
* trimming bit. In a rather contrieved case, it's possible to race here so
* reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
*
* start = start of bitmap
* end = near end of bitmap
*
* Thread 1: Thread 2:
* trim_bitmaps(start)
* trim_bitmaps(end)
* end_trimming_bitmap()
* reset_trimming_bitmap()
*/
static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
{
struct btrfs_free_space *entry;
spin_lock(&ctl->tree_lock);
entry = tree_search_offset(ctl, offset, 1, 0);
if (entry) {
if (btrfs_free_space_trimmed(entry)) {
ctl->discardable_extents[BTRFS_STAT_CURR] +=
entry->bitmap_extents;
ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
}
entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
}
spin_unlock(&ctl->tree_lock);
}
static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *entry)
{
if (btrfs_free_space_trimming_bitmap(entry)) {
entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
ctl->discardable_extents[BTRFS_STAT_CURR] -=
entry->bitmap_extents;
ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
}
}
/*
* If @async is set, then we will trim 1 region and return.
*/
static int trim_bitmaps(struct btrfs_block_group *block_group,
u64 *total_trimmed, u64 start, u64 end, u64 minlen,
u64 maxlen, bool async)
{
struct btrfs_discard_ctl *discard_ctl =
&block_group->fs_info->discard_ctl;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
int ret = 0;
int ret2;
u64 bytes;
u64 offset = offset_to_bitmap(ctl, start);
const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
while (offset < end) {
bool next_bitmap = false;
struct btrfs_trim_range trim_entry;
mutex_lock(&ctl->cache_writeout_mutex);
spin_lock(&ctl->tree_lock);
if (ctl->free_space < minlen) {
block_group->discard_cursor =
btrfs_block_group_end(block_group);
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
break;
}
entry = tree_search_offset(ctl, offset, 1, 0);
/*
* Bitmaps are marked trimmed lossily now to prevent constant
* discarding of the same bitmap (the reason why we are bound
* by the filters). So, retrim the block group bitmaps when we
* are preparing to punt to the unused_bgs list. This uses
* @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
* which is the only discard index which sets minlen to 0.
*/
if (!entry || (async && minlen && start == offset &&
btrfs_free_space_trimmed(entry))) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
next_bitmap = true;
goto next;
}
/*
* Async discard bitmap trimming begins at by setting the start
* to be key.objectid and the offset_to_bitmap() aligns to the
* start of the bitmap. This lets us know we are fully
* scanning the bitmap rather than only some portion of it.
*/
if (start == offset)
entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
bytes = minlen;
ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
if (ret2 || start >= end) {
/*
* We lossily consider a bitmap trimmed if we only skip
* over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
*/
if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
end_trimming_bitmap(ctl, entry);
else
entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
next_bitmap = true;
goto next;
}
/*
* We already trimmed a region, but are using the locking above
* to reset the trim_state.
*/
if (async && *total_trimmed) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
goto out;
}
bytes = min(bytes, end - start);
if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
goto next;
}
/*
* Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
* If X < @minlen, we won't trim X when we come back around.
* So trim it now. We differ here from trimming extents as we
* don't keep individual state per bit.
*/
if (async &&
max_discard_size &&
bytes > (max_discard_size + minlen))
bytes = max_discard_size;
bitmap_clear_bits(ctl, entry, start, bytes);
if (entry->bytes == 0)
free_bitmap(ctl, entry);
spin_unlock(&ctl->tree_lock);
trim_entry.start = start;
trim_entry.bytes = bytes;
list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
mutex_unlock(&ctl->cache_writeout_mutex);
ret = do_trimming(block_group, total_trimmed, start, bytes,
start, bytes, 0, &trim_entry);
if (ret) {
reset_trimming_bitmap(ctl, offset);
block_group->discard_cursor =
btrfs_block_group_end(block_group);
break;
}
next:
if (next_bitmap) {
offset += BITS_PER_BITMAP * ctl->unit;
start = offset;
} else {
start += bytes;
}
block_group->discard_cursor = start;
if (fatal_signal_pending(current)) {
if (start != offset)
reset_trimming_bitmap(ctl, offset);
ret = -ERESTARTSYS;
break;
}
cond_resched();
}
if (offset >= end)
block_group->discard_cursor = end;
out:
return ret;
}
int btrfs_trim_block_group(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
int ret;
u64 rem = 0;
ASSERT(!btrfs_is_zoned(block_group->fs_info));
*trimmed = 0;
spin_lock(&block_group->lock);
if (block_group->removed) {
spin_unlock(&block_group->lock);
return 0;
}
btrfs_freeze_block_group(block_group);
spin_unlock(&block_group->lock);
ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
if (ret)
goto out;
ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
/* If we ended in the middle of a bitmap, reset the trimming flag */
if (rem)
reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
btrfs_unfreeze_block_group(block_group);
return ret;
}
int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen,
bool async)
{
int ret;
*trimmed = 0;
spin_lock(&block_group->lock);
if (block_group->removed) {
spin_unlock(&block_group->lock);
return 0;
}
btrfs_freeze_block_group(block_group);
spin_unlock(&block_group->lock);
ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
btrfs_unfreeze_block_group(block_group);
return ret;
}
int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen,
u64 maxlen, bool async)
{
int ret;
*trimmed = 0;
spin_lock(&block_group->lock);
if (block_group->removed) {
spin_unlock(&block_group->lock);
return 0;
}
btrfs_freeze_block_group(block_group);
spin_unlock(&block_group->lock);
ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
async);
btrfs_unfreeze_block_group(block_group);
return ret;
}
bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
return btrfs_super_cache_generation(fs_info->super_copy);
}
static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans)
{
struct btrfs_block_group *block_group;
struct rb_node *node;
int ret = 0;
btrfs_info(fs_info, "cleaning free space cache v1");
node = rb_first(&fs_info->block_group_cache_tree);
while (node) { block_group = rb_entry(node, struct btrfs_block_group, cache_node);
ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
if (ret)
goto out;
node = rb_next(node);
}
out:
return ret;
}
int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
struct btrfs_trans_handle *trans;
int ret;
/*
* update_super_roots will appropriately set or unset
* super_copy->cache_generation based on SPACE_CACHE and
* BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
* transaction commit whether we are enabling space cache v1 and don't
* have any other work to do, or are disabling it and removing free
* space inodes.
*/
trans = btrfs_start_transaction(fs_info->tree_root, 0);
if (IS_ERR(trans))
return PTR_ERR(trans);
if (!active) {
set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
ret = cleanup_free_space_cache_v1(fs_info, trans);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
}
}
ret = btrfs_commit_transaction(trans);
out:
clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
return ret;
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
* Use this if you need to make a bitmap or extent entry specifically, it
* doesn't do any of the merging that add_free_space does, this acts a lot like
* how the free space cache loading stuff works, so you can get really weird
* configurations.
*/
int test_add_free_space_entry(struct btrfs_block_group *cache,
u64 offset, u64 bytes, bool bitmap)
{
struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
struct btrfs_free_space *info = NULL, *bitmap_info;
void *map = NULL;
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
u64 bytes_added;
int ret;
again:
if (!info) {
info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
if (!info)
return -ENOMEM;
}
if (!bitmap) {
spin_lock(&ctl->tree_lock);
info->offset = offset;
info->bytes = bytes;
info->max_extent_size = 0;
ret = link_free_space(ctl, info);
spin_unlock(&ctl->tree_lock);
if (ret)
kmem_cache_free(btrfs_free_space_cachep, info);
return ret;
}
if (!map) {
map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
if (!map) {
kmem_cache_free(btrfs_free_space_cachep, info);
return -ENOMEM;
}
}
spin_lock(&ctl->tree_lock);
bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!bitmap_info) {
info->bitmap = map;
map = NULL;
add_new_bitmap(ctl, info, offset);
bitmap_info = info;
info = NULL;
}
bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
trim_state);
bytes -= bytes_added;
offset += bytes_added;
spin_unlock(&ctl->tree_lock);
if (bytes)
goto again;
if (info)
kmem_cache_free(btrfs_free_space_cachep, info);
if (map)
kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
return 0;
}
/*
* Checks to see if the given range is in the free space cache. This is really
* just used to check the absence of space, so if there is free space in the
* range at all we will return 1.
*/
int test_check_exists(struct btrfs_block_group *cache,
u64 offset, u64 bytes)
{
struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
struct btrfs_free_space *info;
int ret = 0;
spin_lock(&ctl->tree_lock);
info = tree_search_offset(ctl, offset, 0, 0);
if (!info) {
info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!info)
goto out;
}
have_info:
if (info->bitmap) {
u64 bit_off, bit_bytes;
struct rb_node *n;
struct btrfs_free_space *tmp;
bit_off = offset;
bit_bytes = ctl->unit;
ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
if (!ret) {
if (bit_off == offset) {
ret = 1;
goto out;
} else if (bit_off > offset &&
offset + bytes > bit_off) {
ret = 1;
goto out;
}
}
n = rb_prev(&info->offset_index);
while (n) {
tmp = rb_entry(n, struct btrfs_free_space,
offset_index);
if (tmp->offset + tmp->bytes < offset)
break;
if (offset + bytes < tmp->offset) {
n = rb_prev(&tmp->offset_index);
continue;
}
info = tmp;
goto have_info;
}
n = rb_next(&info->offset_index);
while (n) {
tmp = rb_entry(n, struct btrfs_free_space,
offset_index);
if (offset + bytes < tmp->offset)
break;
if (tmp->offset + tmp->bytes < offset) {
n = rb_next(&tmp->offset_index);
continue;
}
info = tmp;
goto have_info;
}
ret = 0;
goto out;
}
if (info->offset == offset) {
ret = 1;
goto out;
}
if (offset > info->offset && offset < info->offset + info->bytes)
ret = 1;
out:
spin_unlock(&ctl->tree_lock);
return ret;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Swap reorganised 29.12.95, Stephen Tweedie.
* kswapd added: 7.1.96 sct
* Removed kswapd_ctl limits, and swap out as many pages as needed
* to bring the system back to freepages.high: 2.4.97, Rik van Riel.
* Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
* Multiqueue VM started 5.8.00, Rik van Riel.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h> /* for try_to_release_page(),
buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include <linux/swapops.h>
#include <linux/balloon_compaction.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
struct scan_control {
/* How many pages shrink_list() should reclaim */
unsigned long nr_to_reclaim;
/*
* Nodemask of nodes allowed by the caller. If NULL, all nodes
* are scanned.
*/
nodemask_t *nodemask;
/*
* The memory cgroup that hit its limit and as a result is the
* primary target of this reclaim invocation.
*/
struct mem_cgroup *target_mem_cgroup;
/*
* Scan pressure balancing between anon and file LRUs
*/
unsigned long anon_cost;
unsigned long file_cost;
/* Can active pages be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
unsigned int may_deactivate:2;
unsigned int force_deactivate:1;
unsigned int skipped_deactivate:1;
/* Writepage batching in laptop mode; RECLAIM_WRITE */
unsigned int may_writepage:1;
/* Can mapped pages be reclaimed? */
unsigned int may_unmap:1;
/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;
/*
* Cgroup memory below memory.low is protected as long as we
* don't threaten to OOM. If any cgroup is reclaimed at
* reduced force or passed over entirely due to its memory.low
* setting (memcg_low_skipped), and nothing is reclaimed as a
* result, then go back for one more cycle that reclaims the protected
* memory (memcg_low_reclaim) to avert OOM.
*/
unsigned int memcg_low_reclaim:1;
unsigned int memcg_low_skipped:1;
unsigned int hibernation_mode:1;
/* One of the zones is ready for compaction */
unsigned int compaction_ready:1;
/* There is easily reclaimable cold cache in the current node */
unsigned int cache_trim_mode:1;
/* The file pages on the current node are dangerously low */
unsigned int file_is_tiny:1;
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
/* Allocation order */
s8 order;
/* Scan (total_size >> priority) pages at once */
s8 priority;
/* The highest zone to isolate pages for reclaim from */
s8 reclaim_idx;
/* This context's GFP mask */
gfp_t gfp_mask;
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
/* Number of pages freed so far during a call to shrink_zones() */
unsigned long nr_reclaimed;
struct {
unsigned int dirty;
unsigned int unqueued_dirty;
unsigned int congested;
unsigned int writeback;
unsigned int immediate;
unsigned int file_taken;
unsigned int taken;
} nr;
/* for recording the reclaimed slab by now */
struct reclaim_state reclaim_state;
};
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field) \
do { \
if ((_page)->lru.prev != _base) { \
struct page *prev; \
\
prev = lru_to_page(&(_page->lru)); \
prefetchw(&prev->_field); \
} \
} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
/*
* From 0 .. 200. Higher means more swappy.
*/
int vm_swappiness = 60;
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
{
/* Check for an overwrite */
WARN_ON_ONCE(rs && task->reclaim_state);
/* Check for the nulling of an already-nulled member */
WARN_ON_ONCE(!rs && !task->reclaim_state);
task->reclaim_state = rs;
}
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG
static int shrinker_nr_max;
/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
static inline int shrinker_map_size(int nr_items)
{
return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
}
static inline int shrinker_defer_size(int nr_items)
{
return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
}
static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
int nid)
{
return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
lockdep_is_held(&shrinker_rwsem));
}
static int expand_one_shrinker_info(struct mem_cgroup *memcg,
int map_size, int defer_size,
int old_map_size, int old_defer_size)
{
struct shrinker_info *new, *old;
struct mem_cgroup_per_node *pn;
int nid;
int size = map_size + defer_size;
for_each_node(nid) {
pn = memcg->nodeinfo[nid];
old = shrinker_info_protected(memcg, nid);
/* Not yet online memcg */
if (!old)
return 0;
new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
if (!new)
return -ENOMEM;
new->nr_deferred = (atomic_long_t *)(new + 1);
new->map = (void *)new->nr_deferred + defer_size;
/* map: set all old bits, clear all new bits */
memset(new->map, (int)0xff, old_map_size);
memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
/* nr_deferred: copy old values, clear all new values */
memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
memset((void *)new->nr_deferred + old_defer_size, 0,
defer_size - old_defer_size);
rcu_assign_pointer(pn->shrinker_info, new);
kvfree_rcu(old, rcu);
}
return 0;
}
void free_shrinker_info(struct mem_cgroup *memcg)
{
struct mem_cgroup_per_node *pn;
struct shrinker_info *info;
int nid;
for_each_node(nid) {
pn = memcg->nodeinfo[nid];
info = rcu_dereference_protected(pn->shrinker_info, true);
kvfree(info);
rcu_assign_pointer(pn->shrinker_info, NULL);
}
}
int alloc_shrinker_info(struct mem_cgroup *memcg)
{
struct shrinker_info *info;
int nid, size, ret = 0;
int map_size, defer_size = 0;
down_write(&shrinker_rwsem);
map_size = shrinker_map_size(shrinker_nr_max);
defer_size = shrinker_defer_size(shrinker_nr_max);
size = map_size + defer_size;
for_each_node(nid) {
info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
if (!info) {
free_shrinker_info(memcg);
ret = -ENOMEM;
break;
}
info->nr_deferred = (atomic_long_t *)(info + 1);
info->map = (void *)info->nr_deferred + defer_size;
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
}
up_write(&shrinker_rwsem);
return ret;
}
static inline bool need_expand(int nr_max)
{
return round_up(nr_max, BITS_PER_LONG) >
round_up(shrinker_nr_max, BITS_PER_LONG);
}
static int expand_shrinker_info(int new_id)
{
int ret = 0;
int new_nr_max = new_id + 1;
int map_size, defer_size = 0;
int old_map_size, old_defer_size = 0;
struct mem_cgroup *memcg;
if (!need_expand(new_nr_max))
goto out;
if (!root_mem_cgroup)
goto out;
lockdep_assert_held(&shrinker_rwsem);
map_size = shrinker_map_size(new_nr_max);
defer_size = shrinker_defer_size(new_nr_max);
old_map_size = shrinker_map_size(shrinker_nr_max);
old_defer_size = shrinker_defer_size(shrinker_nr_max);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
ret = expand_one_shrinker_info(memcg, map_size, defer_size,
old_map_size, old_defer_size);
if (ret) {
mem_cgroup_iter_break(NULL, memcg);
goto out;
}
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
if (!ret)
shrinker_nr_max = new_nr_max;
return ret;
}
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
struct shrinker_info *info;
rcu_read_lock();
info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
/* Pairs with smp mb in shrink_slab() */
smp_mb__before_atomic();
set_bit(shrinker_id, info->map);
rcu_read_unlock();
}
}
static DEFINE_IDR(shrinker_idr);
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
int id, ret = -ENOMEM;
if (mem_cgroup_disabled())
return -ENOSYS;
down_write(&shrinker_rwsem);
/* This may call shrinker, so it must use down_read_trylock() */
id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
if (id >= shrinker_nr_max) {
if (expand_shrinker_info(id)) {
idr_remove(&shrinker_idr, id);
goto unlock;
}
}
shrinker->id = id;
ret = 0;
unlock:
up_write(&shrinker_rwsem);
return ret;
}
static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
int id = shrinker->id;
BUG_ON(id < 0);
lockdep_assert_held(&shrinker_rwsem);
idr_remove(&shrinker_idr, id);
}
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
struct mem_cgroup *memcg)
{
struct shrinker_info *info;
info = shrinker_info_protected(memcg, nid);
return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
struct mem_cgroup *memcg)
{
struct shrinker_info *info;
info = shrinker_info_protected(memcg, nid);
return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
int i, nid;
long nr;
struct mem_cgroup *parent;
struct shrinker_info *child_info, *parent_info;
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
/* Prevent from concurrent shrinker_info expand */
down_read(&shrinker_rwsem);
for_each_node(nid) {
child_info = shrinker_info_protected(memcg, nid);
parent_info = shrinker_info_protected(parent, nid);
for (i = 0; i < shrinker_nr_max; i++) {
nr = atomic_long_read(&child_info->nr_deferred[i]);
atomic_long_add(nr, &parent_info->nr_deferred[i]);
}
}
up_read(&shrinker_rwsem);
}
static bool cgroup_reclaim(struct scan_control *sc)
{
return sc->target_mem_cgroup;
}
/**
* writeback_throttling_sane - is the usual dirty throttling mechanism available?
* @sc: scan_control in question
*
* The normal page dirty throttling mechanism in balance_dirty_pages() is
* completely broken with the legacy memcg and direct stalling in
* shrink_page_list() is used for throttling instead, which lacks all the
* niceties such as fairness, adaptive pausing, bandwidth proportional
* allocation and configurability.
*
* This function tests whether the vmscan currently in progress can assume
* that the normal dirty throttling mechanism is operational.
*/
static bool writeback_throttling_sane(struct scan_control *sc)
{
if (!cgroup_reclaim(sc))
return true;
#ifdef CONFIG_CGROUP_WRITEBACK
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return true;
#endif
return false;
}
#else
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
return -ENOSYS;
}
static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
struct mem_cgroup *memcg)
{
return 0;
}
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
struct mem_cgroup *memcg)
{
return 0;
}
static bool cgroup_reclaim(struct scan_control *sc)
{
return false;
}
static bool writeback_throttling_sane(struct scan_control *sc)
{
return true;
}
#endif
static long xchg_nr_deferred(struct shrinker *shrinker,
struct shrink_control *sc)
{
int nid = sc->nid;
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
nid = 0;
if (sc->memcg &&
(shrinker->flags & SHRINKER_MEMCG_AWARE))
return xchg_nr_deferred_memcg(nid, shrinker,
sc->memcg);
return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}
static long add_nr_deferred(long nr, struct shrinker *shrinker,
struct shrink_control *sc)
{
int nid = sc->nid;
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
nid = 0;
if (sc->memcg &&
(shrinker->flags & SHRINKER_MEMCG_AWARE))
return add_nr_deferred_memcg(nr, nid, shrinker,
sc->memcg);
return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}
static bool can_demote(int nid, struct scan_control *sc)
{
if (!numa_demotion_enabled)
return false;
if (sc) {
if (sc->no_demotion)
return false;
/* It is pointless to do demotion in memcg reclaim */
if (cgroup_reclaim(sc))
return false;
}
if (next_demotion_node(nid) == NUMA_NO_NODE)
return false;
return true;
}
static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
int nid,
struct scan_control *sc)
{
if (memcg == NULL) {
/*
* For non-memcg reclaim, is there
* space in any swap device?
*/
if (get_nr_swap_pages() > 0)
return true;
} else {
/* Is the memcg below its swap limit? */
if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
return true;
}
/*
* The page can not be swapped.
*
* Can it be reclaimed from this node via demotion?
*/
return can_demote(nid, sc);
}
/*
* This misses isolated pages which are not accounted for to save counters.
* As the data only determines if reclaim or compaction continues, it is
* not expected that isolated pages will be a dominating factor.
*/
unsigned long zone_reclaimable_pages(struct zone *zone)
{
unsigned long nr;
nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
return nr;
}
/**
* lruvec_lru_size - Returns the number of pages on the given LRU list.
* @lruvec: lru vector
* @lru: lru to use
* @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
*/
static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
int zone_idx)
{
unsigned long size = 0;
int zid;
for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
if (!managed_zone(zone))
continue;
if (!mem_cgroup_disabled())
size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
else
size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
}
return size;
}
/*
* Add a shrinker callback to be called from the vm.
*/
int prealloc_shrinker(struct shrinker *shrinker)
{
unsigned int size;
int err;
if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
err = prealloc_memcg_shrinker(shrinker);
if (err != -ENOSYS)
return err;
shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
}
size = sizeof(*shrinker->nr_deferred);
if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
if (!shrinker->nr_deferred)
return -ENOMEM;
return 0;
}
void free_prealloced_shrinker(struct shrinker *shrinker)
{
if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem);
unregister_memcg_shrinker(shrinker);
up_write(&shrinker_rwsem);
return;
}
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
}
void register_shrinker_prepared(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
shrinker->flags |= SHRINKER_REGISTERED;
up_write(&shrinker_rwsem);
}
int register_shrinker(struct shrinker *shrinker)
{
int err = prealloc_shrinker(shrinker);
if (err)
return err;
register_shrinker_prepared(shrinker);
return 0;
}
EXPORT_SYMBOL(register_shrinker);
/*
* Remove one
*/
void unregister_shrinker(struct shrinker *shrinker)
{
if (!(shrinker->flags & SHRINKER_REGISTERED))
return;
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
shrinker->flags &= ~SHRINKER_REGISTERED;
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
unregister_memcg_shrinker(shrinker);
up_write(&shrinker_rwsem);
kfree(shrinker->nr_deferred);
shrinker->nr_deferred = NULL;
}
EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
struct shrinker *shrinker, int priority)
{
unsigned long freed = 0;
unsigned long long delta;
long total_scan;
long freeable;
long nr;
long new_nr;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
long scanned = 0, next_deferred;
freeable = shrinker->count_objects(shrinker, shrinkctl);
if (freeable == 0 || freeable == SHRINK_EMPTY)
return freeable;
/*
* copy the current shrinker scan count into a local variable
* and zero it so that other concurrent shrinker invocations
* don't also do this scanning work.
*/
nr = xchg_nr_deferred(shrinker, shrinkctl);
if (shrinker->seeks) {
delta = freeable >> priority;
delta *= 4;
do_div(delta, shrinker->seeks);
} else {
/*
* These objects don't require any IO to create. Trim
* them aggressively under memory pressure to keep
* them from causing refetches in the IO caches.
*/
delta = freeable / 2;
}
total_scan = nr >> priority;
total_scan += delta;
total_scan = min(total_scan, (2 * freeable));
trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
freeable, delta, total_scan, priority);
/*
* Normally, we should not scan less than batch_size objects in one
* pass to avoid too frequent shrinker calls, but if the slab has less
* than batch_size objects in total and we are really tight on memory,
* we will try to reclaim all available objects, otherwise we can end
* up failing allocations although there are plenty of reclaimable
* objects spread over several slabs with usage less than the
* batch_size.
*
* We detect the "tight on memory" situations by looking at the total
* number of objects we want to scan (total_scan). If it is greater
* than the total number of objects on slab (freeable), we must be
* scanning at high prio and therefore should try to reclaim as much as
* possible.
*/
while (total_scan >= batch_size ||
total_scan >= freeable) {
unsigned long ret;
unsigned long nr_to_scan = min(batch_size, total_scan);
shrinkctl->nr_to_scan = nr_to_scan;
shrinkctl->nr_scanned = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
total_scan -= shrinkctl->nr_scanned;
scanned += shrinkctl->nr_scanned;
cond_resched();
}
/*
* The deferred work is increased by any new work (delta) that wasn't
* done, decreased by old deferred work that was done now.
*
* And it is capped to two times of the freeable items.
*/
next_deferred = max_t(long, (nr + delta - scanned), 0);
next_deferred = min(next_deferred, (2 * freeable));
/*
* move the unused scan count back into the shrinker in a
* manner that handles concurrent updates.
*/
new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
return freed;
}
#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
struct shrinker_info *info;
unsigned long ret, freed = 0;
int i;
if (!mem_cgroup_online(memcg))
return 0;
if (!down_read_trylock(&shrinker_rwsem))
return 0;
info = shrinker_info_protected(memcg, nid);
if (unlikely(!info))
goto unlock;
for_each_set_bit(i, info->map, shrinker_nr_max) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
.memcg = memcg,
};
struct shrinker *shrinker;
shrinker = idr_find(&shrinker_idr, i);
if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
if (!shrinker)
clear_bit(i, info->map);
continue;
}
/* Call non-slab shrinkers even though kmem is disabled */
if (!memcg_kmem_enabled() &&
!(shrinker->flags & SHRINKER_NONSLAB))
continue;
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY) {
clear_bit(i, info->map);
/*
* After the shrinker reported that it had no objects to
* free, but before we cleared the corresponding bit in
* the memcg shrinker map, a new object might have been
* added. To make sure, we have the bit set in this
* case, we invoke the shrinker one more time and reset
* the bit if it reports that it is not empty anymore.
* The memory barrier here pairs with the barrier in
* set_shrinker_bit():
*
* list_lru_add() shrink_slab_memcg()
* list_add_tail() clear_bit()
* <MB> <MB>
* set_bit() do_shrink_slab()
*/
smp_mb__after_atomic();
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY)
ret = 0;
else
set_shrinker_bit(memcg, nid, i);
}
freed += ret;
if (rwsem_is_contended(&shrinker_rwsem)) {
freed = freed ? : 1;
break;
}
}
unlock:
up_read(&shrinker_rwsem);
return freed;
}
#else /* CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
return 0;
}
#endif /* CONFIG_MEMCG */
/**
* shrink_slab - shrink slab caches
* @gfp_mask: allocation context
* @nid: node whose slab caches to target
* @memcg: memory cgroup whose slab caches to target
* @priority: the reclaim priority
*
* Call the shrink functions to age shrinkable caches.
*
* @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
* unaware shrinkers will receive a node id of 0 instead.
*
* @memcg specifies the memory cgroup to target. Unaware shrinkers
* are called only if it is the root cgroup.
*
* @priority is sc->priority, we take the number of objects and >> by priority
* in order to get the scan target.
*
* Returns the number of reclaimed slab objects.
*/
static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg,
int priority)
{
unsigned long ret, freed = 0;
struct shrinker *shrinker;
/*
* The root memcg might be allocated even though memcg is disabled
* via "cgroup_disable=memory" boot parameter. This could make
* mem_cgroup_is_root() return false, then just run memcg slab
* shrink, but skip global shrink. This may result in premature
* oom.
*/
if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
if (!down_read_trylock(&shrinker_rwsem))
goto out;
list_for_each_entry(shrinker, &shrinker_list, list) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
.memcg = memcg,
};
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY)
ret = 0;
freed += ret;
/*
* Bail out if someone want to register a new shrinker to
* prevent the registration from being stalled for long periods
* by parallel ongoing shrinking.
*/
if (rwsem_is_contended(&shrinker_rwsem)) {
freed = freed ? : 1;
break;
}
}
up_read(&shrinker_rwsem);
out:
cond_resched();
return freed;
}
void drop_slab_node(int nid)
{
unsigned long freed;
int shift = 0;
do {
struct mem_cgroup *memcg = NULL;
if (fatal_signal_pending(current))
return;
freed = 0;
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
} while ((freed >> shift++) > 1);
}
void drop_slab(void)
{
int nid;
for_each_online_node(nid)
drop_slab_node(nid);
}
static inline int is_page_cache_freeable(struct page *page)
{
/*
* A freeable page cache page is referenced only by the caller
* that isolated the page, the page cache and optional buffer
* heads at page->private.
*/
int page_cache_pins = thp_nr_pages(page);
return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}
static int may_write_to_inode(struct inode *inode)
{
if (current->flags & PF_SWAPWRITE)
return 1;
if (!inode_write_congested(inode))
return 1;
if (inode_to_bdi(inode) == current->backing_dev_info)
return 1;
return 0;
}
/*
* We detected a synchronous write error writing a page out. Probably
* -ENOSPC. We need to propagate that into the address_space for a subsequent
* fsync(), msync() or close().
*
* The tricky part is that after writepage we cannot touch the mapping: nothing
* prevents it from being freed up. But we have a ref on the page and once
* that page is locked, the mapping is pinned.
*
* We're allowed to run sleeping lock_page() here because we know the caller has
* __GFP_FS.
*/
static void handle_write_error(struct address_space *mapping,
struct page *page, int error)
{
lock_page(page);
if (page_mapping(page) == mapping)
mapping_set_error(mapping, error);
unlock_page(page);
}
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
PAGE_KEEP,
/* move page to the active list, page is locked */
PAGE_ACTIVATE,
/* page has been sent to the disk successfully, page is unlocked */
PAGE_SUCCESS,
/* page is clean and locked */
PAGE_CLEAN,
} pageout_t;
/*
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
*/
static pageout_t pageout(struct page *page, struct address_space *mapping)
{
/*
* If the page is dirty, only perform writeback if that write
* will be non-blocking. To prevent this allocation from being
* stalled by pagecache activity. But note that there may be
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
* If this process is currently in __generic_file_write_iter() against
* this page's queue, we can perform writeback even if that
* will block.
*
* If the page is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
*/
if (!is_page_cache_freeable(page))
return PAGE_KEEP;
if (!mapping) {
/*
* Some data journaling orphaned pages can have
* page->mapping == NULL while being dirty with clean buffers.
*/
if (page_has_private(page)) {
if (try_to_free_buffers(page)) {
ClearPageDirty(page);
pr_info("%s: orphaned page\n", __func__);
return PAGE_CLEAN;
}
}
return PAGE_KEEP;
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
if (!may_write_to_inode(mapping->host))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
int res;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = SWAP_CLUSTER_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
.for_reclaim = 1,
};
SetPageReclaim(page);
res = mapping->a_ops->writepage(page, &wbc);
if (res < 0)
handle_write_error(mapping, page, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
trace_mm_vmscan_writepage(page);
inc_node_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
return PAGE_CLEAN;
}
/*
* Same as remove_mapping, but if the page is removed from the mapping, it
* gets returned with a refcount of 0.
*/
static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed, struct mem_cgroup *target_memcg)
{
int refcount;
void *shadow = NULL;
BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page));
xa_lock_irq(&mapping->i_pages);
/*
* The non racy check for a busy page.
*
* Must be careful with the order of the tests. When someone has
* a ref to the page, it may be possible that they dirty it then
* drop the reference. So if PageDirty is tested before page_count
* here, then the following race may occur:
*
* get_user_pages(&page);
* [user mapping goes away]
* write_to(page);
* !PageDirty(page) [good]
* SetPageDirty(page);
* put_page(page);
* !page_count(page) [good, discard it]
*
* [oops, our write_to data is lost]
*
* Reversing the order of the tests ensures such a situation cannot
* escape unnoticed. The smp_rmb is needed to ensure the page->flags
* load is not satisfied before that of page->_refcount.
*
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under the i_pages lock, then this ordering is not required.
*/
refcount = 1 + compound_nr(page);
if (!page_ref_freeze(page, refcount))
goto cannot_free;
/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_ref_unfreeze(page, refcount);
goto cannot_free;
}
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(page, target_memcg); __delete_from_swap_cache(page, swap, shadow);
xa_unlock_irq(&mapping->i_pages);
put_swap_page(page, swap);
} else {
void (*freepage)(struct page *);
freepage = mapping->a_ops->freepage;
/*
* Remember a shadow entry for reclaimed file cache in
* order to detect refaults, thus thrashing, later on.
*
* But don't store shadows in an address space that is
* already exiting. This is not just an optimization,
* inode reclaim needs to empty out the radix tree or
* the nodes are lost. Don't plant shadows behind its
* back.
*
* We also don't store shadows for DAX mappings because the
* only page cache pages found in these are zero pages
* covering holes, and because we don't want to mix DAX
* exceptional entries and shadow exceptional entries in the
* same address_space.
*/
if (reclaimed && page_is_file_lru(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow);
xa_unlock_irq(&mapping->i_pages);
if (freepage != NULL)
freepage(page);
}
return 1;
cannot_free:
xa_unlock_irq(&mapping->i_pages);
return 0;
}
/*
* Attempt to detach a locked page from its ->mapping. If it is dirty or if
* someone else has a ref on the page, abort and return 0. If it was
* successfully detached, return 1. Assumes the caller has a single ref on
* this page.
*/
int remove_mapping(struct address_space *mapping, struct page *page)
{
if (__remove_mapping(mapping, page, false, NULL)) {
/*
* Unfreezing the refcount with 1 rather than 2 effectively
* drops the pagecache ref for us without requiring another
* atomic operation.
*/
page_ref_unfreeze(page, 1);
return 1;
}
return 0;
}
/**
* putback_lru_page - put previously isolated page onto appropriate LRU list
* @page: page to be put back to appropriate lru list
*
* Add previously isolated @page to appropriate LRU list.
* Page may still be unevictable for other reasons.
*
* lru_lock must not be held, interrupts must be enabled.
*/
void putback_lru_page(struct page *page)
{
lru_cache_add(page);
put_page(page); /* drop ref from isolate */
}
enum page_references {
PAGEREF_RECLAIM,
PAGEREF_RECLAIM_CLEAN,
PAGEREF_KEEP,
PAGEREF_ACTIVATE,
};
static enum page_references page_check_references(struct page *page,
struct scan_control *sc)
{
int referenced_ptes, referenced_page;
unsigned long vm_flags;
referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
&vm_flags);
referenced_page = TestClearPageReferenced(page);
/*
* Mlock lost the isolation race with us. Let try_to_unmap()
* move the page to the unevictable list.
*/
if (vm_flags & VM_LOCKED)
return PAGEREF_RECLAIM;
if (referenced_ptes) {
/*
* All mapped pages start out with page table
* references from the instantiating fault, so we need
* to look twice if a mapped file page is used more
* than once.
*
* Mark it and spare it for another trip around the
* inactive list. Another page table reference will
* lead to its activation.
*
* Note: the mark is set for activated pages as well
* so that recently deactivated but used pages are
* quickly recovered.
*/
SetPageReferenced(page);
if (referenced_page || referenced_ptes > 1)
return PAGEREF_ACTIVATE;
/*
* Activate file-backed executable pages after first usage.
*/
if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
return PAGEREF_ACTIVATE;
return PAGEREF_KEEP;
}
/* Reclaim if clean, defer dirty pages to writeback */
if (referenced_page && !PageSwapBacked(page))
return PAGEREF_RECLAIM_CLEAN;
return PAGEREF_RECLAIM;
}
/* Check if a page is dirty or under writeback */
static void page_check_dirty_writeback(struct page *page,
bool *dirty, bool *writeback)
{
struct address_space *mapping;
/*
* Anonymous pages are not handled by flushers and must be written
* from reclaim context. Do not stall reclaim based on them
*/
if (!page_is_file_lru(page) ||
(PageAnon(page) && !PageSwapBacked(page))) {
*dirty = false;
*writeback = false;
return;
}
/* By default assume that the page flags are accurate */
*dirty = PageDirty(page);
*writeback = PageWriteback(page);
/* Verify dirty/writeback state if the filesystem supports it */
if (!page_has_private(page))
return;
mapping = page_mapping(page);
if (mapping && mapping->a_ops->is_dirty_writeback)
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}
static struct page *alloc_demote_page(struct page *page, unsigned long node)
{
struct migration_target_control mtc = {
/*
* Allocate from 'node', or fail quickly and quietly.
* When this happens, 'page' will likely just be discarded
* instead of migrated.
*/
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
__GFP_THISNODE | __GFP_NOWARN |
__GFP_NOMEMALLOC | GFP_NOWAIT,
.nid = node
};
return alloc_migration_target(page, (unsigned long)&mtc);
}
/*
* Take pages on @demote_list and attempt to demote them to
* another node. Pages which are not demoted are left on
* @demote_pages.
*/
static unsigned int demote_page_list(struct list_head *demote_pages,
struct pglist_data *pgdat)
{
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
int err;
if (list_empty(demote_pages))
return 0;
if (target_nid == NUMA_NO_NODE)
return 0;
/* Demotion ignores all cpuset and mempolicy settings */
err = migrate_pages(demote_pages, alloc_demote_page, NULL,
target_nid, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
if (current_is_kswapd())
__count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
else
__count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
return nr_succeeded;
}
/*
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned int shrink_page_list(struct list_head *page_list,
struct pglist_data *pgdat,
struct scan_control *sc,
struct reclaim_stat *stat,
bool ignore_references)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
LIST_HEAD(demote_pages);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
memset(stat, 0, sizeof(*stat));
cond_resched();
do_demote_pass = can_demote(pgdat->node_id, sc);
retry:
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
enum page_references references = PAGEREF_RECLAIM;
bool dirty, writeback, may_enter_fs;
unsigned int nr_pages;
cond_resched();
page = lru_to_page(page_list);
list_del(&page->lru);
if (!trylock_page(page))
goto keep;
VM_BUG_ON_PAGE(PageActive(page), page);
nr_pages = compound_nr(page);
/* Account the number of base pages even though THP */
sc->nr_scanned += nr_pages;
if (unlikely(!page_evictable(page)))
goto activate_locked;
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested which affects wait_iff_congested. kswapd
* will stall and start writing pages if the tail of the LRU
* is all dirty unqueued pages.
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
stat->nr_dirty++;
if (dirty && !writeback)
stat->nr_unqueued_dirty++;
/*
* Treat this page as congested if the underlying BDI is or if
* pages are cycling through the LRU so quickly that the
* pages marked for immediate reclaim are making it to the
* end of the LRU a second time.
*/
mapping = page_mapping(page);
if (((dirty || writeback) && mapping &&
inode_write_congested(mapping->host)) ||
(writeback && PageReclaim(page)))
stat->nr_congested++;
/*
* If a page at the tail of the LRU is under writeback, there
* are three cases to consider.
*
* 1) If reclaim is encountering an excessive number of pages
* under writeback and this page is both under writeback and
* PageReclaim then it indicates that pages are being queued
* for IO but are being recycled through the LRU before the
* IO can complete. Waiting on the page itself risks an
* indefinite stall if it is impossible to writeback the
* page due to IO error or disconnected storage so instead
* note that the LRU is being scanned too quickly and the
* caller can stall after page list has been processed.
*
* 2) Global or new memcg reclaim encounters a page that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
* not to fs). In this case mark the page for immediate
* reclaim and continue scanning.
*
* Require may_enter_fs because we would wait on fs, which
* may not have submitted IO yet. And the loop driver might
* enter reclaim, and deadlock if it waits on a page for
* which it is needed to do the write (loop masks off
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
* 3) Legacy memcg encounters a page that is already marked
* PageReclaim. memcg does not have any dirty pages
* throttling so we could easily OOM just because too many
* pages are in writeback and there is nothing else to
* reclaim. Wait for the writeback to complete.
*
* In cases 1) and 2) we activate the pages to get them out of
* the way while we continue scanning for clean pages on the
* inactive list and refilling from the active list. The
* observation here is that waiting for disk writes is more
* expensive than potentially causing reloads down the line.
* Since they're marked for immediate reclaim, they won't put
* memory pressure on the cache working set any longer than it
* takes to write them to disk.
*/
if (PageWriteback(page)) {
/* Case 1 above */
if (current_is_kswapd() &&
PageReclaim(page) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
stat->nr_immediate++;
goto activate_locked;
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!PageReclaim(page) || !may_enter_fs) {
/*
* This is slightly racy - end_page_writeback()
* might have just cleared PageReclaim, then
* setting PageReclaim here end up interpreted
* as PageReadahead - but that does not matter
* enough to care. What we do want is for this
* page to have PageReclaim set next time memcg
* reclaim reaches the tests above, so it will
* then wait_on_page_writeback() to avoid OOM;
* and it's also appropriate in global reclaim.
*/
SetPageReclaim(page);
stat->nr_writeback++;
goto activate_locked;
/* Case 3 above */
} else {
unlock_page(page);
wait_on_page_writeback(page);
/* then go back and try same page again */
list_add_tail(&page->lru, page_list);
continue;
}
}
if (!ignore_references)
references = page_check_references(page, sc);
switch (references) {
case PAGEREF_ACTIVATE:
goto activate_locked;
case PAGEREF_KEEP:
stat->nr_ref_keep += nr_pages;
goto keep_locked;
case PAGEREF_RECLAIM:
case PAGEREF_RECLAIM_CLEAN:
; /* try to reclaim the page below */
}
/*
* Before reclaiming the page, try to relocate
* its contents to another node.
*/
if (do_demote_pass &&
(thp_migration_supported() || !PageTransHuge(page))) {
list_add(&page->lru, &demote_pages);
unlock_page(page);
continue;
}
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
* Lazyfree page could be freed directly
*/
if (PageAnon(page) && PageSwapBacked(page)) {
if (!PageSwapCache(page)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
if (page_maybe_dma_pinned(page))
goto keep_locked;
if (PageTransHuge(page)) {
/* cannot split THP, skip it */
if (!can_split_huge_page(page, NULL))
goto activate_locked;
/*
* Split pages without a PMD map right
* away. Chances are some or all of the
* tail pages can be freed without IO.
*/
if (!compound_mapcount(page) &&
split_huge_page_to_list(page,
page_list))
goto activate_locked;
}
if (!add_to_swap(page)) {
if (!PageTransHuge(page))
goto activate_locked_split;
/* Fallback to swap normal pages */
if (split_huge_page_to_list(page,
page_list))
goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
count_vm_event(THP_SWPOUT_FALLBACK);
#endif
if (!add_to_swap(page))
goto activate_locked_split;
}
may_enter_fs = true;
/* Adding to swap updated mapping */
mapping = page_mapping(page);
}
} else if (unlikely(PageTransHuge(page))) {
/* Split file THP */
if (split_huge_page_to_list(page, page_list))
goto keep_locked;
}
/*
* THP may get split above, need minus tail pages and update
* nr_pages to avoid accounting tail pages twice.
*
* The tail pages that are added into swap cache successfully
* reach here.
*/
if ((nr_pages > 1) && !PageTransHuge(page)) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
if (page_mapped(page)) {
enum ttu_flags flags = TTU_BATCH_FLUSH;
bool was_swapbacked = PageSwapBacked(page);
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
try_to_unmap(page, flags);
if (page_mapped(page)) {
stat->nr_unmap_fail += nr_pages;
if (!was_swapbacked && PageSwapBacked(page))
stat->nr_lazyfree_fail += nr_pages;
goto activate_locked;
}
}
if (PageDirty(page)) {
/*
* Only kswapd can writeback filesystem pages
* to avoid risk of stack overflow. But avoid
* injecting inefficient single-page IO into
* flusher writeback as much as possible: only
* write pages when we've encountered many
* dirty pages, and when we've already scanned
* the rest of the LRU for clean pages and see
* the same dirty pages again (PageReclaim).
*/
if (page_is_file_lru(page) &&
(!current_is_kswapd() || !PageReclaim(page) ||
!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
* except we already have the page isolated
* and know it's dirty
*/
inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
SetPageReclaim(page);
goto activate_locked;
}
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
if (!sc->may_writepage)
goto keep_locked;
/*
* Page is dirty. Flush the TLB if a writable entry
* potentially exists to avoid CPU writes after IO
* starts and then write it out here.
*/
try_to_unmap_flush_dirty();
switch (pageout(page, mapping)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
stat->nr_pageout += thp_nr_pages(page);
if (PageWriteback(page))
goto keep;
if (PageDirty(page))
goto keep;
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
*/
if (!trylock_page(page))
goto keep;
if (PageDirty(page) || PageWriteback(page))
goto keep_locked;
mapping = page_mapping(page);
fallthrough;
case PAGE_CLEAN:
; /* try to free the page below */
}
}
/*
* If the page has buffers, try to free the buffer mappings
* associated with this page. If we succeed we try to free
* the page as well.
*
* We do this even if the page is PageDirty().
* try_to_release_page() does not perform I/O, but it is
* possible for a page to have PageDirty set, but it is actually
* clean (all its buffers are clean). This happens if the
* buffers were written out directly, with submit_bh(). ext3
* will do this, as well as the blockdev mapping.
* try_to_release_page() will discover that cleanness and will
* drop the buffers and mark the page clean - it can be freed.
*
* Rarely, pages can have buffers and no ->mapping. These are
* the pages which were not successfully invalidated in
* truncate_cleanup_page(). We try to drop those buffers here
* and if that worked, and the page is no longer mapped into
* process address space (page_count == 1) it can be freed.
* Otherwise, leave the page on the LRU so it is swappable.
*/
if (page_has_private(page)) {
if (!try_to_release_page(page, sc->gfp_mask))
goto activate_locked;
if (!mapping && page_count(page) == 1) {
unlock_page(page);
if (put_page_testzero(page))
goto free_it;
else {
/*
* rare race with speculative reference.
* the speculative reference will free
* this page shortly, so we may
* increment nr_reclaimed here (and
* leave it off the LRU).
*/
nr_reclaimed++;
continue;
}
}
}
if (PageAnon(page) && !PageSwapBacked(page)) {
/* follow __remove_mapping for reference */
if (!page_ref_freeze(page, 1))
goto keep_locked;
/*
* The page has only one reference left, which is
* from the isolation. After the caller puts the
* page back on lru and drops the reference, the
* page will be freed anyway. It doesn't matter
* which lru it goes. So we don't bother checking
* PageDirty here.
*/
count_vm_event(PGLAZYFREED);
count_memcg_page_event(page, PGLAZYFREED);
} else if (!mapping || !__remove_mapping(mapping, page, true,
sc->target_mem_cgroup))
goto keep_locked;
unlock_page(page);
free_it:
/*
* THP may get swapped out in a whole, need account
* all base pages.
*/
nr_reclaimed += nr_pages;
/*
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
*/
if (unlikely(PageTransHuge(page)))
destroy_compound_page(page);
else
list_add(&page->lru, &free_pages);
continue;
activate_locked_split:
/*
* The tail pages that are failed to add into swap cache
* reach here. Fixup nr_scanned and nr_pages.
*/
if (nr_pages > 1) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
PageMlocked(page)))
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
if (!PageMlocked(page)) {
int type = page_is_file_lru(page);
SetPageActive(page);
stat->nr_activate[type] += nr_pages;
count_memcg_page_event(page, PGACTIVATE);
}
keep_locked:
unlock_page(page);
keep:
list_add(&page->lru, &ret_pages);
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
/* 'page_list' is always empty here */
/* Migrate pages selected for demotion */
nr_reclaimed += demote_page_list(&demote_pages, pgdat);
/* Pages that could not be demoted are still in @demote_pages */
if (!list_empty(&demote_pages)) {
/* Pages which failed to demoted go back on @page_list for retry: */
list_splice_init(&demote_pages, page_list);
do_demote_pass = false;
goto retry;
}
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
mem_cgroup_uncharge_list(&free_pages);
try_to_unmap_flush();
free_unref_page_list(&free_pages);
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *page_list)
{
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_unmap = 1,
};
struct reclaim_stat stat;
unsigned int nr_reclaimed;
struct page *page, *next;
LIST_HEAD(clean_pages);
unsigned int noreclaim_flag;
list_for_each_entry_safe(page, next, page_list, lru) {
if (!PageHuge(page) && page_is_file_lru(page) &&
!PageDirty(page) && !__PageMovable(page) &&
!PageUnevictable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
}
/*
* We should be safe here since we are only dealing with file pages and
* we are not kswapd and therefore cannot write dirty file pages. But
* call memalloc_noreclaim_save() anyway, just in case these conditions
* change in the future.
*/
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
&stat, true);
memalloc_noreclaim_restore(noreclaim_flag);
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
-(long)nr_reclaimed);
/*
* Since lazyfree pages are isolated from file LRU from the beginning,
* they will rotate back to anonymous LRU in the end if it failed to
* discard so isolated count will be mismatched.
* Compensate the isolated count for both LRU lists.
*/
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
stat.nr_lazyfree_fail);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
-(long)stat.nr_lazyfree_fail);
return nr_reclaimed;
}
/*
* Attempt to remove the specified page from its LRU. Only take this page
* if it is of the appropriate PageActive status. Pages which are being
* freed elsewhere are also ignored.
*
* page: page to consider
* mode: one of the LRU isolation modes defined above
*
* returns true on success, false on failure.
*/
bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
{
/* Only take pages on the LRU. */
if (!PageLRU(page))
return false;
/* Compaction should not handle unevictable pages but CMA can do so */
if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
return false;
/*
* To minimise LRU disruption, the caller can indicate that it only
* wants to isolate pages it will be able to operate on without
* blocking - clean pages for the most part.
*
* ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
* that it is possible to migrate without blocking
*/
if (mode & ISOLATE_ASYNC_MIGRATE) {
/* All the caller can do on PageWriteback is block */
if (PageWriteback(page))
return false;
if (PageDirty(page)) {
struct address_space *mapping;
bool migrate_dirty;
/*
* Only pages without mappings or that have a
* ->migratepage callback are possible to migrate
* without blocking. However, we can be racing with
* truncation so it's necessary to lock the page
* to stabilise the mapping as truncation holds
* the page lock until after the page is removed
* from the page cache.
*/
if (!trylock_page(page))
return false;
mapping = page_mapping(page);
migrate_dirty = !mapping || mapping->a_ops->migratepage;
unlock_page(page);
if (!migrate_dirty)
return false;
}
}
if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
return false;
return true;
}
/*
* Update LRU sizes after isolating pages. The LRU size updates must
* be complete before mem_cgroup_update_lru_size due to a sanity check.
*/
static __always_inline void update_lru_sizes(struct lruvec *lruvec,
enum lru_list lru, unsigned long *nr_zone_taken)
{
int zid;
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
if (!nr_zone_taken[zid])
continue;
update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
}
}
/*
* Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
*
* lruvec->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
* For pagecache intensive workloads, this function is the hottest
* spot in the kernel (apart from copy_*_user functions).
*
* Lru_lock must be held before calling this function.
*
* @nr_to_scan: The number of eligible pages to look through on the list.
* @lruvec: The LRU vector to pull pages from.
* @dst: The temp list to put pages on to.
* @nr_scanned: The number of pages that were scanned.
* @sc: The scan_control struct for this reclaim session
* @lru: LRU list id for isolating
*
* returns how many pages were moved onto *@dst.
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
enum lru_list lru)
{
struct list_head *src = &lruvec->lists[lru];
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
LIST_HEAD(pages_skipped);
isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
total_scan = 0;
scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
struct page *page;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
nr_pages = compound_nr(page);
total_scan += nr_pages;
if (page_zonenum(page) > sc->reclaim_idx) {
list_move(&page->lru, &pages_skipped);
nr_skipped[page_zonenum(page)] += nr_pages;
continue;
}
/*
* Do not count skipped pages because that makes the function
* return with no isolated pages if the LRU mostly contains
* ineligible pages. This causes the VM to not reclaim any
* pages, triggering a premature OOM.
*
* Account all tail pages of THP. This would not cause
* premature OOM since __isolate_lru_page() returns -EBUSY
* only when the page is being freed somewhere else.
*/
scan += nr_pages;
if (!__isolate_lru_page_prepare(page, mode)) {
/* It is being freed elsewhere */
list_move(&page->lru, src);
continue;
}
/*
* Be careful not to clear PageLRU until after we're
* sure the page is not being freed elsewhere -- the
* page release code relies on it.
*/
if (unlikely(!get_page_unless_zero(page))) {
list_move(&page->lru, src);
continue;
}
if (!TestClearPageLRU(page)) {
/* Another thread is already isolating this page */
put_page(page);
list_move(&page->lru, src);
continue;
}
nr_taken += nr_pages;
nr_zone_taken[page_zonenum(page)] += nr_pages;
list_move(&page->lru, dst);
}
/*
* Splice any skipped pages to the start of the LRU list. Note that
* this disrupts the LRU order when reclaiming for lower zones but
* we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
* scanning would soon rescan the same pages to skip and put the
* system at risk of premature OOM.
*/
if (!list_empty(&pages_skipped)) {
int zid;
list_splice(&pages_skipped, src);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
if (!nr_skipped[zid])
continue;
__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
skipped += nr_skipped[zid];
}
}
*nr_scanned = total_scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
total_scan, skipped, nr_taken, mode, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
return nr_taken;
}
/**
* isolate_lru_page - tries to isolate a page from its LRU list
* @page: page to isolate from its LRU list
*
* Isolates a @page from an LRU list, clears PageLRU and adjusts the
* vmstat statistic corresponding to whatever LRU list the page was on.
*
* Returns 0 if the page was removed from an LRU list.
* Returns -EBUSY if the page was not on an LRU list.
*
* The returned page will have PageLRU() cleared. If it was found on
* the active list, it will have PageActive set. If it was found on
* the unevictable list, it will have the PageUnevictable bit set. That flag
* may need to be cleared by the caller before letting the page go.
*
* The vmstat statistic corresponding to the list on which the page was
* found will be decremented.
*
* Restrictions:
*
* (1) Must be called with an elevated refcount on the page. This is a
* fundamental difference from isolate_lru_pages (which is called
* without a stable reference).
* (2) the lru_lock must not be held.
* (3) interrupts must be enabled.
*/
int isolate_lru_page(struct page *page)
{
int ret = -EBUSY;
VM_BUG_ON_PAGE(!page_count(page), page);
WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
if (TestClearPageLRU(page)) {
struct lruvec *lruvec;
get_page(page);
lruvec = lock_page_lruvec_irq(page);
del_page_from_lru_list(page, lruvec);
unlock_page_lruvec_irq(lruvec);
ret = 0;
}
return ret;
}
/*
* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
* then get rescheduled. When there are massive number of tasks doing page
* allocation, such sleeping direct reclaimers may keep piling up on each CPU,
* the LRU list will go small and be scanned faster than necessary, leading to
* unnecessary swapping, thrashing and OOM.
*/
static int too_many_isolated(struct pglist_data *pgdat, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
if (current_is_kswapd())
return 0;
if (!writeback_throttling_sane(sc))
return 0;
if (file) {
inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
} else {
inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
}
/*
* GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
* won't get blocked by normal direct-reclaimers, forming a circular
* deadlock.
*/
if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
inactive >>= 3;
return isolated > inactive;
}
/*
* move_pages_to_lru() moves pages from private @list to appropriate LRU list.
* On return, @list is reused as a list of pages to be freed by the caller.
*
* Returns the number of pages moved to the given lruvec.
*/
static unsigned int move_pages_to_lru(struct lruvec *lruvec,
struct list_head *list)
{
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
struct page *page;
while (!list_empty(list)) {
page = lru_to_page(list);
VM_BUG_ON_PAGE(PageLRU(page), page);
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
spin_unlock_irq(&lruvec->lru_lock);
putback_lru_page(page);
spin_lock_irq(&lruvec->lru_lock);
continue;
}
/*
* The SetPageLRU needs to be kept here for list integrity.
* Otherwise:
* #0 move_pages_to_lru #1 release_pages
* if !put_page_testzero
* if (put_page_testzero())
* !PageLRU //skip lru_lock
* SetPageLRU()
* list_add(&page->lru,)
* list_add(&page->lru,)
*/
SetPageLRU(page);
if (unlikely(put_page_testzero(page))) {
__clear_page_lru_flags(page);
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&lruvec->lru_lock);
destroy_compound_page(page);
spin_lock_irq(&lruvec->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
continue;
}
/*
* All pages were isolated from the same lruvec (and isolation
* inhibits memcg migration).
*/
VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page);
add_page_to_lru_list(page, lruvec);
nr_pages = thp_nr_pages(page);
nr_moved += nr_pages;
if (PageActive(page))
workingset_age_nonresident(lruvec, nr_pages);
}
/*
* To save our caller's stack, now use input list for pages to free.
*/
list_splice(&pages_to_free, list);
return nr_moved;
}
/*
* If a kernel thread (such as nfsd for loop-back mounts) services
* a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
* In that case we should only throttle if the backing device it is
* writing to is congested. In other cases it is safe to throttle.
*/
static int current_may_throttle(void)
{
return !(current->flags & PF_LOCAL_THROTTLE) ||
current->backing_dev_info == NULL ||
bdi_write_congested(current->backing_dev_info);
}
/*
* shrink_inactive_list() is a helper for shrink_node(). It returns the number
* of reclaimed pages
*/
static unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
struct scan_control *sc, enum lru_list lru)
{
LIST_HEAD(page_list);
unsigned long nr_scanned;
unsigned int nr_reclaimed = 0;
unsigned long nr_taken;
struct reclaim_stat stat;
bool file = is_file_lru(lru);
enum vm_event_item item;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
bool stalled = false;
while (unlikely(too_many_isolated(pgdat, file, sc))) {
if (stalled)
return 0;
/* wait a bit for the reclaimer. */
msleep(100);
stalled = true;
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
return SWAP_CLUSTER_MAX;
}
lru_add_drain();
spin_lock_irq(&lruvec->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
if (nr_taken == 0)
return 0;
nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
spin_lock_irq(&lruvec->lru_lock);
move_pages_to_lru(lruvec, &page_list);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
lru_note_cost(lruvec, file, stat.nr_pageout);
mem_cgroup_uncharge_list(&page_list);
free_unref_page_list(&page_list);
/*
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not doing their job. This can
* happen when memory pressure pushes dirty pages to the end of
* the LRU before the dirty limits are breached and the dirty
* data has expired. It can also happen when the proportion of
* dirty pages grows not through writes but through memory
* pressure reclaiming all the clean cache. And in some cases,
* the flushers simply cannot keep up with the allocation
* rate. Nudge the flusher threads in case they are asleep.
*/
if (stat.nr_unqueued_dirty == nr_taken)
wakeup_flusher_threads(WB_REASON_VMSCAN);
sc->nr.dirty += stat.nr_dirty;
sc->nr.congested += stat.nr_congested;
sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
sc->nr.writeback += stat.nr_writeback;
sc->nr.immediate += stat.nr_immediate;
sc->nr.taken += nr_taken;
if (file)
sc->nr.file_taken += nr_taken;
trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
nr_scanned, nr_reclaimed, &stat, sc->priority, file);
return nr_reclaimed;
}
/*
* shrink_active_list() moves pages from the active LRU to the inactive LRU.
*
* We move them the other way if the page is referenced by one or more
* processes.
*
* If the pages are mostly unmapped, the processing is fast and it is
* appropriate to hold lru_lock across the whole operation. But if
* the pages are mapped, the processing is slow (page_referenced()), so
* we should drop lru_lock around each page. It's impossible to balance
* this, so instead we remove the pages from the LRU while processing them.
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
*
* The downside is that we have to touch page->_refcount against each page.
* But we had to alter page->flags anyway.
*/
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
enum lru_list lru)
{
unsigned long nr_taken;
unsigned long nr_scanned;
unsigned long vm_flags;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct page *page;
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
lru_add_drain();
spin_lock_irq(&lruvec->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
if (!cgroup_reclaim(sc))
__count_vm_events(PGREFILL, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&lruvec->lru_lock);
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
putback_lru_page(page);
continue;
}
if (unlikely(buffer_heads_over_limit)) {
if (page_has_private(page) && trylock_page(page)) {
if (page_has_private(page))
try_to_release_page(page, 0);
unlock_page(page);
}
}
if (page_referenced(page, 0, sc->target_mem_cgroup,
&vm_flags)) {
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
* memory under moderate memory pressure. Anon pages
* are not likely to be evicted by use-once streaming
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
nr_rotated += thp_nr_pages(page);
list_add(&page->lru, &l_active);
continue;
}
}
ClearPageActive(page); /* we are de-activating */
SetPageWorkingset(page);
list_add(&page->lru, &l_inactive);
}
/*
* Move pages back to the lru list.
*/
spin_lock_irq(&lruvec->lru_lock);
nr_activate = move_pages_to_lru(lruvec, &l_active);
nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
/* Keep all free pages in l_active list */
list_splice(&l_inactive, &l_active);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
unsigned long reclaim_pages(struct list_head *page_list)
{
int nid = NUMA_NO_NODE;
unsigned int nr_reclaimed = 0;
LIST_HEAD(node_page_list);
struct reclaim_stat dummy_stat;
struct page *page;
unsigned int noreclaim_flag;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_writepage = 1,
.may_unmap = 1,
.may_swap = 1,
.no_demotion = 1,
};
noreclaim_flag = memalloc_noreclaim_save();
while (!list_empty(page_list)) {
page = lru_to_page(page_list);
if (nid == NUMA_NO_NODE) {
nid = page_to_nid(page);
INIT_LIST_HEAD(&node_page_list);
}
if (nid == page_to_nid(page)) {
ClearPageActive(page);
list_move(&page->lru, &node_page_list);
continue;
}
nr_reclaimed += shrink_page_list(&node_page_list,
NODE_DATA(nid),
&sc, &dummy_stat, false);
while (!list_empty(&node_page_list)) {
page = lru_to_page(&node_page_list);
list_del(&page->lru);
putback_lru_page(page);
}
nid = NUMA_NO_NODE;
}
if (!list_empty(&node_page_list)) {
nr_reclaimed += shrink_page_list(&node_page_list,
NODE_DATA(nid),
&sc, &dummy_stat, false);
while (!list_empty(&node_page_list)) {
page = lru_to_page(&node_page_list);
list_del(&page->lru);
putback_lru_page(page);
}
}
memalloc_noreclaim_restore(noreclaim_flag);
return nr_reclaimed;
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
if (sc->may_deactivate & (1 << is_file_lru(lru)))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
else
sc->skipped_deactivate = 1;
return 0;
}
return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}
/*
* The inactive anon list should be small enough that the VM never has
* to do too much work.
*
* The inactive file list should be small enough to leave most memory
* to the established workingset on the scan-resistant active list,
* but large enough to avoid thrashing the aggregate readahead window.
*
* Both inactive lists should also be large enough that each inactive
* page has a chance to be referenced again before it is reclaimed.
*
* If that fails and refaulting is observed, the inactive list grows.
*
* The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
* on this LRU, maintained by the pageout code. An inactive_ratio
* of 3 means 3:1 or 25% of the pages are kept on the inactive list.
*
* total target max
* memory ratio inactive
* -------------------------------------
* 10MB 1 5MB
* 100MB 1 50MB
* 1GB 3 250MB
* 10GB 10 0.9GB
* 100GB 31 3GB
* 1TB 101 10GB
* 10TB 320 32GB
*/
static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
{
enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
unsigned long inactive, active;
unsigned long inactive_ratio;
unsigned long gb;
inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
inactive_ratio = int_sqrt(10 * gb);
else
inactive_ratio = 1;
return inactive * inactive_ratio < active;
}
enum scan_balance {
SCAN_EQUAL,
SCAN_FRACT,
SCAN_ANON,
SCAN_FILE,
};
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
* by looking at the fraction of the pages scanned we did rotate back
* onto the active list instead of evict.
*
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
unsigned long ap, fp;
enum lru_list lru;
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
scan_balance = SCAN_FILE;
goto out;
}
/*
* Global reclaim will swap to prevent OOM even with no
* swappiness, but memcg users want to use this knob to
* disable swapping for individual groups completely when
* using the memory controller's swap limit feature would be
* too expensive.
*/
if (cgroup_reclaim(sc) && !swappiness) {
scan_balance = SCAN_FILE;
goto out;
}
/*
* Do not apply any pressure balancing cleverness when the
* system is close to OOM, scan both anon and file equally
* (unless the swappiness setting disagrees with swapping).
*/
if (!sc->priority && swappiness) {
scan_balance = SCAN_EQUAL;
goto out;
}
/*
* If the system is almost out of file pages, force-scan anon.
*/
if (sc->file_is_tiny) {
scan_balance = SCAN_ANON;
goto out;
}
/*
* If there is enough inactive page cache, we do not reclaim
* anything from the anonymous working right now.
*/
if (sc->cache_trim_mode) {
scan_balance = SCAN_FILE;
goto out;
}
scan_balance = SCAN_FRACT;
/*
* Calculate the pressure balance between anon and file pages.
*
* The amount of pressure we put on each LRU is inversely
* proportional to the cost of reclaiming each list, as
* determined by the share of pages that are refaulting, times
* the relative IO cost of bringing back a swapped out
* anonymous page vs reloading a filesystem page (swappiness).
*
* Although we limit that influence to ensure no list gets
* left behind completely: at least a third of the pressure is
* applied, before swappiness.
*
* With swappiness at 100, anon and file have equal IO cost.
*/
total_cost = sc->anon_cost + sc->file_cost;
anon_cost = total_cost + sc->anon_cost;
file_cost = total_cost + sc->file_cost;
total_cost = anon_cost + file_cost;
ap = swappiness * (total_cost + 1);
ap /= anon_cost + 1;
fp = (200 - swappiness) * (total_cost + 1);
fp /= file_cost + 1;
fraction[0] = ap;
fraction[1] = fp;
denominator = ap + fp;
out:
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
unsigned long lruvec_size;
unsigned long low, min;
unsigned long scan;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
mem_cgroup_protection(sc->target_mem_cgroup, memcg,
&min, &low);
if (min || low) {
/*
* Scale a cgroup's reclaim pressure by proportioning
* its current usage to its memory.low or memory.min
* setting.
*
* This is important, as otherwise scanning aggression
* becomes extremely binary -- from nothing as we
* approach the memory protection threshold, to totally
* nominal as we exceed it. This results in requiring
* setting extremely liberal protection thresholds. It
* also means we simply get no protection at all if we
* set it too low, which is not ideal.
*
* If there is any protection in place, we reduce scan
* pressure by how much of the total memory used is
* within protection thresholds.
*
* There is one special case: in the first reclaim pass,
* we skip over all groups that are within their low
* protection. If that fails to reclaim enough pages to
* satisfy the reclaim goal, we come back and override
* the best-effort low protection. However, we still
* ideally want to honor how well-behaved groups are in
* that case instead of simply punishing them all
* equally. As such, we reclaim them based on how much
* memory they are using, reducing the scan pressure
* again by how much of the total memory used is under
* hard protection.
*/
unsigned long cgroup_size = mem_cgroup_size(memcg);
unsigned long protection;
/* memory.low scaling, make sure we retry before OOM */
if (!sc->memcg_low_reclaim && low > min) {
protection = low;
sc->memcg_low_skipped = 1;
} else {
protection = min;
}
/* Avoid TOCTOU with earlier protection check */
cgroup_size = max(cgroup_size, protection);
scan = lruvec_size - lruvec_size * protection /
(cgroup_size + 1);
/*
* Minimally target SWAP_CLUSTER_MAX pages to keep
* reclaim moving forwards, avoiding decrementing
* sc->priority further than desirable.
*/
scan = max(scan, SWAP_CLUSTER_MAX);
} else {
scan = lruvec_size;
}
scan >>= sc->priority;
/*
* If the cgroup's already been deleted, make sure to
* scrape out the remaining cache.
*/
if (!scan && !mem_cgroup_online(memcg))
scan = min(lruvec_size, SWAP_CLUSTER_MAX);
switch (scan_balance) {
case SCAN_EQUAL:
/* Scan lists relative to size */
break;
case SCAN_FRACT:
/*
* Scan types proportional to swappiness and
* their relative recent reclaim efficiency.
* Make sure we don't miss the last page on
* the offlined memory cgroups because of a
* round-off error.
*/
scan = mem_cgroup_online(memcg) ?
div64_u64(scan * fraction[file], denominator) :
DIV64_U64_ROUND_UP(scan * fraction[file],
denominator);
break;
case SCAN_FILE:
case SCAN_ANON:
/* Scan one type exclusively */
if ((scan_balance == SCAN_FILE) != file)
scan = 0;
break;
default:
/* Look ma, no brain */
BUG();
}
nr[lru] = scan;
}
}
/*
* Anonymous LRU management is a waste if there is
* ultimately no way to reclaim the memory.
*/
static bool can_age_anon_pages(struct pglist_data *pgdat,
struct scan_control *sc)
{
/* Aging the anon LRU is valuable if swap is present: */
if (total_swap_pages > 0)
return true;
/* Also valuable if anon pages can be demoted: */
return can_demote(pgdat->node_id, sc);
}
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;
bool scan_adjusted;
get_scan_count(lruvec, sc, nr);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
/*
* Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
* event that can occur when there is little memory pressure e.g.
* multiple streaming readers/writers. Hence, we do not abort scanning
* when the requested number of pages are reclaimed when scanning at
* DEF_PRIORITY on the assumption that the fact we are direct
* reclaiming implies that kswapd is not keeping up and it is best to
* do a batch of work at once. For memcg reclaim one check is made to
* abort proportional reclaim if either the file or anon lru has already
* dropped to zero at the first pass.
*/
scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
unsigned long nr_anon, nr_file, percentage;
unsigned long nr_scanned;
for_each_evictable_lru(lru) {
if (nr[lru]) {
nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
lruvec, sc);
}
}
cond_resched();
if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
continue;
/*
* For kswapd and memcg, reclaim at least the number of pages
* requested. Ensure that the anon and file LRUs are scanned
* proportionally what was requested by get_scan_count(). We
* stop reclaiming one LRU and reduce the amount scanning
* proportional to the original scan target.
*/
nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
/*
* It's just vindictive to attack the larger once the smaller
* has gone to zero. And given the way we stop scanning the
* smaller below, this makes sure that we only make one nudge
* towards proportionality once we've got nr_to_reclaim.
*/
if (!nr_file || !nr_anon)
break;
if (nr_file > nr_anon) {
unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
targets[LRU_ACTIVE_ANON] + 1;
lru = LRU_BASE;
percentage = nr_anon * 100 / scan_target;
} else {
unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
targets[LRU_ACTIVE_FILE] + 1;
lru = LRU_FILE;
percentage = nr_file * 100 / scan_target;
}
/* Stop scanning the smaller of the LRU */
nr[lru] = 0;
nr[lru + LRU_ACTIVE] = 0;
/*
* Recalculate the other LRU scan count based on its original
* scan target and the percentage scanning already complete
*/
lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
lru += LRU_ACTIVE;
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
scan_adjusted = true;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
sc->priority < DEF_PRIORITY - 2))
return true;
return false;
}
/*
* Reclaim/compaction is used for high-order allocation requests. It reclaims
* order-0 pages before compacting the zone. should_continue_reclaim() returns
* true if more pages should be reclaimed such that when the page allocator
* calls try_to_compact_pages() that it will have enough free pages to succeed.
* It will give up earlier than that if there is difficulty reclaiming pages.
*/
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long nr_reclaimed,
struct scan_control *sc)
{
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
int z;
/* If not in reclaim/compaction mode, stop */
if (!in_reclaim_compaction(sc))
return false;
/*
* Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
* number of pages that were scanned. This will return to the caller
* with the risk reclaim/compaction and the resulting allocation attempt
* fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
* allocations through requiring that the full LRU list has been scanned
* first, by assuming that zero delta of sc->nr_scanned means full LRU
* scan, but that approximation was wrong, and there were corner cases
* where always a non-zero amount of pages were scanned.
*/
if (!nr_reclaimed)
return false;
/* If compaction would go ahead or the allocation would succeed, stop */
for (z = 0; z <= sc->reclaim_idx; z++) {
struct zone *zone = &pgdat->node_zones[z];
if (!managed_zone(zone))
continue;
switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
case COMPACT_SUCCESS:
case COMPACT_CONTINUE:
return false;
default:
/* check next zone */
;
}
}
/*
* If we have not reclaimed enough pages for compaction and the
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = compact_gap(sc->order);
inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
return inactive_lru_pages > pages_for_compaction;
}
static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
{
struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
struct mem_cgroup *memcg;
memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
unsigned long reclaimed;
unsigned long scanned;
/*
* This loop can become CPU-bound when target memcgs
* aren't eligible for reclaim - either because they
* don't have any reclaimable pages, or because their
* memory is explicitly protected. Avoid soft lockups.
*/
cond_resched();
mem_cgroup_calculate_protection(target_memcg, memcg);
if (mem_cgroup_below_min(memcg)) {
/*
* Hard protection.
* If there is no reclaimable memory, OOM.
*/
continue;
} else if (mem_cgroup_below_low(memcg)) {
/*
* Soft protection.
* Respect the protection only as long as
* there is an unprotected supply
* of reclaimable memory from other cgroups.
*/
if (!sc->memcg_low_reclaim) {
sc->memcg_low_skipped = 1;
continue;
}
memcg_memory_event(memcg, MEMCG_LOW);
}
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
shrink_lruvec(lruvec, sc);
shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
sc->priority);
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
struct lruvec *target_lruvec;
bool reclaimable = false;
unsigned long file;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
/*
* Flush the memory cgroup stats, so that we read accurate per-memcg
* lruvec stats for heuristics.
*/
mem_cgroup_flush_stats();
memset(&sc->nr, 0, sizeof(sc->nr));
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
/*
* Determine the scan balance between anon and file LRUs.
*/
spin_lock_irq(&target_lruvec->lru_lock);
sc->anon_cost = target_lruvec->anon_cost;
sc->file_cost = target_lruvec->file_cost;
spin_unlock_irq(&target_lruvec->lru_lock);
/*
* Target desirable inactive:active list ratios for the anon
* and file LRU lists.
*/
if (!sc->force_deactivate) {
unsigned long refaults;
refaults = lruvec_page_state(target_lruvec,
WORKINGSET_ACTIVATE_ANON);
if (refaults != target_lruvec->refaults[0] ||
inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
sc->may_deactivate |= DEACTIVATE_ANON;
else
sc->may_deactivate &= ~DEACTIVATE_ANON;
/*
* When refaults are being observed, it means a new
* workingset is being established. Deactivate to get
* rid of any stale active pages quickly.
*/
refaults = lruvec_page_state(target_lruvec,
WORKINGSET_ACTIVATE_FILE);
if (refaults != target_lruvec->refaults[1] ||
inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
sc->may_deactivate |= DEACTIVATE_FILE;
else
sc->may_deactivate &= ~DEACTIVATE_FILE;
} else
sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
/*
* If we have plenty of inactive file pages that aren't
* thrashing, try to reclaim those first before touching
* anonymous pages.
*/
file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
sc->cache_trim_mode = 1;
else
sc->cache_trim_mode = 0;
/*
* Prevent the reclaimer from falling into the cache trap: as
* cache pages start out inactive, every cache fault will tip
* the scan balance towards the file LRU. And as the file LRU
* shrinks, so does the window for rotation from references.
* This means we have a runaway feedback loop where a tiny
* thrashing file LRU becomes infinitely more attractive than
* anon pages. Try to detect this based on file LRU size.
*/
if (!cgroup_reclaim(sc)) {
unsigned long total_high_wmark = 0;
unsigned long free, anon;
int z;
free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
file = node_page_state(pgdat, NR_ACTIVE_FILE) +
node_page_state(pgdat, NR_INACTIVE_FILE);
for (z = 0; z < MAX_NR_ZONES; z++) {
struct zone *zone = &pgdat->node_zones[z];
if (!managed_zone(zone))
continue;
total_high_wmark += high_wmark_pages(zone);
}
/*
* Consider anon: if that's low too, this isn't a
* runaway file reclaim problem, but rather just
* extreme pressure. Reclaim as per usual then.
*/
anon = node_page_state(pgdat, NR_INACTIVE_ANON);
sc->file_is_tiny =
file + free <= total_high_wmark &&
!(sc->may_deactivate & DEACTIVATE_ANON) &&
anon >> sc->priority;
}
shrink_node_memcgs(pgdat, sc);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
/* Record the subtree's reclaim efficiency */
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
if (current_is_kswapd()) {
/*
* If reclaim is isolating dirty pages under writeback,
* it implies that the long-lived page allocation rate
* is exceeding the page laundering rate. Either the
* global limits are not being effective at throttling
* processes due to the page distribution throughout
* zones or there is heavy usage of a slow backing
* device. The only option is to throttle from reclaim
* context which is not ideal as there is no guarantee
* the dirtying process is throttled in the same way
* balance_dirty_pages() manages.
*
* Once a node is flagged PGDAT_WRITEBACK, kswapd will
* count the number of pages under pages flagged for
* immediate reclaim and stall if any are encountered
* in the nr_immediate check below.
*/
if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/* Allow kswapd to start writing pages during reclaim.*/
if (sc->nr.unqueued_dirty == sc->nr.file_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);
/*
* If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
* implies that pages are cycling through the LRU
* faster than they are written so also forcibly stall.
*/
if (sc->nr.immediate)
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
/*
* Tag a node/memcg as congested if all the dirty pages
* scanned were backed by a congested BDI and
* wait_iff_congested will stall.
*
* Legacy memcg will stall in page writeback so avoid forcibly
* stalling in wait_iff_congested().
*/
if ((current_is_kswapd() ||
(cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
/*
* Stall direct reclaim for IO completions if underlying BDIs
* and node is congested. Allow kswapd to continue until it
* starts encountering unqueued dirty pages or cycling through
* the LRU too quickly.
*/
if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
wait_iff_congested(BLK_RW_ASYNC, HZ/10);
if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc))
goto again;
/*
* Kswapd gives up on balancing particular nodes after too
* many failures to reclaim anything from them and goes to
* sleep. On reclaim progress, reset the failure counter. A
* successful direct reclaim run will revive a dormant kswapd.
*/
if (reclaimable)
pgdat->kswapd_failures = 0;
}
/*
* Returns true if compaction should go ahead for a costly-order request, or
* the allocation would already succeed without compaction. Return false if we
* should reclaim first.
*/
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
unsigned long watermark;
enum compact_result suitable;
suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
if (suitable == COMPACT_SUCCESS)
/* Allocation should succeed already. Don't reclaim. */
return true;
if (suitable == COMPACT_SKIPPED)
/* Compaction cannot yet proceed. Do reclaim. */
return false;
/*
* Compaction is already possible, but it takes time to run and there
* are potentially other callers using the pages just freed. So proceed
* with reclaim to make a buffer of free pages available to give
* compaction a reasonable chance of completing and allocating the page.
* Note that we won't actually reclaim the whole buffer in one attempt
* as the target watermark in should_continue_reclaim() is lower. But if
* we are already above the high+gap watermark, don't reclaim at all.
*/
watermark = high_wmark_pages(zone) + compact_gap(sc->order);
return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
}
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
* request.
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
gfp_t orig_mask;
pg_data_t *last_pgdat = NULL;
/*
* If the number of buffer_heads in the machine exceeds the maximum
* allowed level, force direct reclaim to scan the highmem zone as
* highmem pages could be pinning lowmem pages storing buffer_heads
*/
orig_mask = sc->gfp_mask;
if (buffer_heads_over_limit) {
sc->gfp_mask |= __GFP_HIGHMEM;
sc->reclaim_idx = gfp_zone(sc->gfp_mask);
}
for_each_zone_zonelist_nodemask(zone, z, zonelist,
sc->reclaim_idx, sc->nodemask) {
/*
* Take care memory controller reclaiming has small influence
* to global LRU.
*/
if (!cgroup_reclaim(sc)) {
if (!cpuset_zone_allowed(zone,
GFP_KERNEL | __GFP_HARDWALL))
continue;
/*
* If we already have plenty of memory free for
* compaction in this zone, don't free any more.
* Even though compaction is invoked for any
* non-zero order, only frequent costly order
* reclamation is disruptive enough to become a
* noticeable problem, like transparent huge
* page allocations.
*/
if (IS_ENABLED(CONFIG_COMPACTION) &&
sc->order > PAGE_ALLOC_COSTLY_ORDER &&
compaction_ready(zone, sc)) {
sc->compaction_ready = true;
continue;
}
/*
* Shrink each node in the zonelist once. If the
* zonelist is ordered by zone (not the default) then a
* node may be shrunk multiple times but in that case
* the user prefers lower zones being preserved.
*/
if (zone->zone_pgdat == last_pgdat)
continue;
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
* scanned pages. This works for global memory pressure
* and balancing, not for a memcg's limit.
*/
nr_soft_scanned = 0;
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
sc->order, sc->gfp_mask,
&nr_soft_scanned);
sc->nr_reclaimed += nr_soft_reclaimed;
sc->nr_scanned += nr_soft_scanned;
/* need some check for avoid more shrink_zone() */
}
/* See comment about same check for global reclaim above */
if (zone->zone_pgdat == last_pgdat)
continue;
last_pgdat = zone->zone_pgdat;
shrink_node(zone->zone_pgdat, sc);
}
/*
* Restore to original mask to avoid the impact on the caller if we
* promoted it to __GFP_HIGHMEM.
*/
sc->gfp_mask = orig_mask;
}
static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
{
struct lruvec *target_lruvec;
unsigned long refaults;
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
target_lruvec->refaults[0] = refaults;
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
target_lruvec->refaults[1] = refaults;
}
/*
* This is the main entry point to direct page reclaim.
*
* If a full scan of the inactive list fails to free enough memory then we
* are "out of memory" and something needs to be killed.
*
* If the caller is !__GFP_FS then the probability of a failure is reasonably
* high - the zone may be full of dirty or under-writeback pages, which this
* caller can't do much about. We kick the writeback threads and take explicit
* naps in the hope that some of these pages can be written. But if the
* allocating task holds filesystem locks which prevent writeout this might not
* work, and the allocation attempt will fail.
*
* returns: 0, if no pages reclaimed
* else, the number of pages reclaimed
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int initial_priority = sc->priority;
pg_data_t *last_pgdat;
struct zoneref *z;
struct zone *zone;
retry:
delayacct_freepages_start();
if (!cgroup_reclaim(sc))
__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
do {
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
shrink_zones(zonelist, sc);
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
break;
if (sc->compaction_ready)
break;
/*
* If we're getting trouble reclaiming, start doing
* writepage even in laptop mode.
*/
if (sc->priority < DEF_PRIORITY - 2)
sc->may_writepage = 1;
} while (--sc->priority >= 0);
last_pgdat = NULL;
for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
sc->nodemask) {
if (zone->zone_pgdat == last_pgdat)
continue;
last_pgdat = zone->zone_pgdat;
snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
if (cgroup_reclaim(sc)) {
struct lruvec *lruvec;
lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
zone->zone_pgdat);
clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
}
}
delayacct_freepages_end();
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
/* Aborted reclaim to try compaction? don't OOM, then */
if (sc->compaction_ready)
return 1;
/*
* We make inactive:active ratio decisions based on the node's
* composition of memory, but a restrictive reclaim_idx or a
* memory.low cgroup setting can exempt large amounts of
* memory from reclaim. Neither of which are very common, so
* instead of doing costly eligibility calculations of the
* entire cgroup subtree up front, we assume the estimates are
* good, and retry with forcible deactivation if that fails.
*/
if (sc->skipped_deactivate) {
sc->priority = initial_priority;
sc->force_deactivate = 1;
sc->skipped_deactivate = 0;
goto retry;
}
/* Untapped cgroup reserves? Don't OOM, retry. */
if (sc->memcg_low_skipped) {
sc->priority = initial_priority;
sc->force_deactivate = 0;
sc->memcg_low_reclaim = 1;
sc->memcg_low_skipped = 0;
goto retry;
}
return 0;
}
static bool allow_direct_reclaim(pg_data_t *pgdat)
{
struct zone *zone;
unsigned long pfmemalloc_reserve = 0;
unsigned long free_pages = 0;
int i;
bool wmark_ok;
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
if (!managed_zone(zone))
continue;
if (!zone_reclaimable_pages(zone))
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
free_pages += zone_page_state(zone, NR_FREE_PAGES);
}
/* If there are no reserves (unexpected config) then do not throttle */
if (!pfmemalloc_reserve)
return true;
wmark_ok = free_pages > pfmemalloc_reserve / 2;
/* kswapd must be awake if processes are being throttled */
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
wake_up_interruptible(&pgdat->kswapd_wait);
}
return wmark_ok;
}
/*
* Throttle direct reclaimers if backing storage is backed by the network
* and the PFMEMALLOC reserve for the preferred node is getting dangerously
* depleted. kswapd will continue to make progress and wake the processes
* when the low watermark is reached.
*
* Returns true if a fatal signal was delivered during throttling. If this
* happens, the page allocator should not consider triggering the OOM killer.
*/
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
struct zoneref *z;
struct zone *zone;
pg_data_t *pgdat = NULL;
/*
* Kernel threads should not be throttled as they may be indirectly
* responsible for cleaning pages necessary for reclaim to make forward
* progress. kjournald for example may enter direct reclaim while
* committing a transaction where throttling it could forcing other
* processes to block on log_wait_commit().
*/
if (current->flags & PF_KTHREAD)
goto out;
/*
* If a fatal signal is pending, this process should not throttle.
* It should return quickly so it can exit and free its memory
*/
if (fatal_signal_pending(current))
goto out;
/*
* Check if the pfmemalloc reserves are ok by finding the first node
* with a usable ZONE_NORMAL or lower zone. The expectation is that
* GFP_KERNEL will be required for allocating network buffers when
* swapping over the network so ZONE_HIGHMEM is unusable.
*
* Throttling is based on the first usable node and throttled processes
* wait on a queue until kswapd makes progress and wakes them. There
* is an affinity then between processes waking up and where reclaim
* progress has been made assuming the process wakes on the same node.
* More importantly, processes running on remote nodes will not compete
* for remote pfmemalloc reserves and processes on different nodes
* should make reasonable progress.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(gfp_mask), nodemask) {
if (zone_idx(zone) > ZONE_NORMAL)
continue;
/* Throttle based on the first usable node */
pgdat = zone->zone_pgdat;
if (allow_direct_reclaim(pgdat))
goto out;
break;
}
/* If no zone was usable by the allocation flags then do not throttle */
if (!pgdat)
goto out;
/* Account for the throttling */
count_vm_event(PGSCAN_DIRECT_THROTTLE);
/*
* If the caller cannot enter the filesystem, it's possible that it
* is due to the caller holding an FS lock or performing a journal
* transaction in the case of a filesystem like ext[3|4]. In this case,
* it is not safe to block on pfmemalloc_wait as kswapd could be
* blocked waiting on the same lock. Instead, throttle for up to a
* second before continuing.
*/
if (!(gfp_mask & __GFP_FS))
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
allow_direct_reclaim(pgdat), HZ);
else
/* Throttle until kswapd wakes the process */
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
allow_direct_reclaim(pgdat));
if (fatal_signal_pending(current))
return true;
out:
return false;
}
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned long nr_reclaimed;
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.gfp_mask = current_gfp_context(gfp_mask),
.reclaim_idx = gfp_zone(gfp_mask),
.order = order,
.nodemask = nodemask,
.priority = DEF_PRIORITY,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = 1,
};
/*
* scan_control uses s8 fields for order, priority, and reclaim_idx.
* Confirm they are large enough for max values.
*/
BUILD_BUG_ON(MAX_ORDER > S8_MAX);
BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
/*
* Do not enter reclaim if fatal signal was delivered while throttled.
* 1 is returned so that the page allocator does not OOM kill at this
* point.
*/
if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
return 1;
set_task_reclaim_state(current, &sc.reclaim_state);
trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
set_task_reclaim_state(current, NULL);
return nr_reclaimed;
}
#ifdef CONFIG_MEMCG
/* Only used by soft limit reclaim. Do not reuse for anything else. */
unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
gfp_t gfp_mask, bool noswap,
pg_data_t *pgdat,
unsigned long *nr_scanned)
{
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.target_mem_cgroup = memcg,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
};
WARN_ON_ONCE(!current->reclaim_state);
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
sc.gfp_mask);
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
* if we don't reclaim here, the shrink_node from balance_pgdat
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
shrink_lruvec(lruvec, &sc);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
*nr_scanned = sc.nr_scanned;
return sc.nr_reclaimed;
}
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
bool may_swap)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
.reclaim_idx = MAX_NR_ZONES - 1,
.target_mem_cgroup = memcg,
.priority = DEF_PRIORITY,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = may_swap,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
* equal pressure on all the nodes. This is based on the assumption that
* the reclaim does not bail out early.
*/
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
set_task_reclaim_state(current, &sc.reclaim_state);
trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
memalloc_noreclaim_restore(noreclaim_flag);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
set_task_reclaim_state(current, NULL);
return nr_reclaimed;
}
#endif
static void age_active_anon(struct pglist_data *pgdat,
struct scan_control *sc)
{
struct mem_cgroup *memcg;
struct lruvec *lruvec;
if (!can_age_anon_pages(pgdat, sc))
return;
lruvec = mem_cgroup_lruvec(NULL, pgdat);
if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
return;
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
lruvec = mem_cgroup_lruvec(memcg, pgdat);
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
memcg = mem_cgroup_iter(NULL, memcg, NULL);
} while (memcg);
}
static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
{
int i;
struct zone *zone;
/*
* Check for watermark boosts top-down as the higher zones
* are more likely to be boosted. Both watermarks and boosts
* should not be checked at the same time as reclaim would
* start prematurely when there is no boosting and a lower
* zone is balanced.
*/
for (i = highest_zoneidx; i >= 0; i--) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
if (zone->watermark_boost)
return true;
}
return false;
}
/*
* Returns true if there is an eligible zone balanced for the request order
* and highest_zoneidx
*/
static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
{
int i;
unsigned long mark = -1;
struct zone *zone;
/*
* Check watermarks bottom-up as lower zones are more likely to
* meet watermarks.
*/
for (i = 0; i <= highest_zoneidx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
mark = high_wmark_pages(zone);
if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
return true;
}
/*
* If a node has no populated zone within highest_zoneidx, it does not
* need balancing by definition. This can happen if a zone-restricted
* allocation tries to wake a remote kswapd.
*/
if (mark == -1)
return true;
return false;
}
/* Clear pgdat state for congested, dirty or under writeback. */
static void clear_pgdat_congested(pg_data_t *pgdat)
{
struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
clear_bit(PGDAT_DIRTY, &pgdat->flags);
clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}
/*
* Prepare kswapd for sleeping. This verifies that there are no processes
* waiting in throttle_direct_reclaim() and that watermarks have been met.
*
* Returns true if kswapd is ready to sleep
*/
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
int highest_zoneidx)
{
/*
* The throttled processes are normally woken up in balance_pgdat() as
* soon as allow_direct_reclaim() is true. But there is a potential
* race between when kswapd checks the watermarks and a process gets
* throttled. There is also a potential race if processes get
* throttled, kswapd wakes, a large process exits thereby balancing the
* zones, which causes kswapd to exit balance_pgdat() before reaching
* the wake up checks. If kswapd is going to sleep, no process should
* be sleeping on pfmemalloc_wait, so wake them now if necessary. If
* the wake up is premature, processes will wake kswapd and get
* throttled again. The difference from wake ups in balance_pgdat() is
* that here we are under prepare_to_wait().
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait))
wake_up_all(&pgdat->pfmemalloc_wait);
/* Hopeless node, leave it to direct reclaim */
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
clear_pgdat_congested(pgdat);
return true;
}
return false;
}
/*
* kswapd shrinks a node of pages that are at or below the highest usable
* zone that is currently unbalanced.
*
* Returns true if kswapd scanned at least the requested number of pages to
* reclaim or if the lack of progress was due to pages under writeback.
* This is used to determine if the scanning priority needs to be raised.
*/
static bool kswapd_shrink_node(pg_data_t *pgdat,
struct scan_control *sc)
{
struct zone *zone;
int z;
/* Reclaim a number of pages proportional to the number of zones */
sc->nr_to_reclaim = 0;
for (z = 0; z <= sc->reclaim_idx; z++) {
zone = pgdat->node_zones + z;
if (!managed_zone(zone))
continue;
sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
}
/*
* Historically care was taken to put equal pressure on all zones but
* now pressure is applied based on node LRU order.
*/
shrink_node(pgdat, sc);
/*
* Fragmentation may mean that the system cannot be rebalanced for
* high-order allocations. If twice the allocation size has been
* reclaimed then recheck watermarks only at order-0 to prevent
* excessive reclaim. Assume that a process requested a high-order
* can direct reclaim/compact.
*/
if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
sc->order = 0;
return sc->nr_scanned >= sc->nr_to_reclaim;
}
/* Page allocator PCP high watermark is lowered if reclaim is active. */
static inline void
update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
{
int i;
struct zone *zone;
for (i = 0; i <= highest_zoneidx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
if (active)
set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
else
clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
}
}
static inline void
set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
{
update_reclaim_active(pgdat, highest_zoneidx, true);
}
static inline void
clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
{
update_reclaim_active(pgdat, highest_zoneidx, false);
}
/*
* For kswapd, balance_pgdat() will reclaim pages across a node from zones
* that are eligible for use by the caller until at least one zone is
* balanced.
*
* Returns the order kswapd finished reclaiming at.
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
* found to have free_pages <= high_wmark_pages(zone), any page in that zone
* or lower is eligible for reclaim until at least one usable zone is
* balanced.
*/
static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
int i;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
unsigned long pflags;
unsigned long nr_boost_reclaim;
unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
bool boosted;
struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
.may_unmap = 1,
};
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
__fs_reclaim_acquire(_THIS_IP_);
count_vm_event(PAGEOUTRUN);
/*
* Account for the reclaim boost. Note that the zone boost is left in
* place so that parallel allocations that are near the watermark will
* stall or direct reclaim until kswapd is finished.
*/
nr_boost_reclaim = 0;
for (i = 0; i <= highest_zoneidx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
nr_boost_reclaim += zone->watermark_boost;
zone_boosts[i] = zone->watermark_boost;
}
boosted = nr_boost_reclaim;
restart:
set_reclaim_active(pgdat, highest_zoneidx);
sc.priority = DEF_PRIORITY;
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
bool balanced;
bool ret;
sc.reclaim_idx = highest_zoneidx;
/*
* If the number of buffer_heads exceeds the maximum allowed
* then consider reclaiming from all zones. This has a dual
* purpose -- on 64-bit systems it is expected that
* buffer_heads are stripped during active rotation. On 32-bit
* systems, highmem pages can pin lowmem memory and shrinking
* buffers can relieve lowmem pressure. Reclaim may still not
* go ahead if all eligible zones for the original allocation
* request are balanced to avoid excessive reclaim from kswapd.
*/
if (buffer_heads_over_limit) {
for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
sc.reclaim_idx = i;
break;
}
}
/*
* If the pgdat is imbalanced then ignore boosting and preserve
* the watermarks for a later time and restart. Note that the
* zone watermarks will be still reset at the end of balancing
* on the grounds that the normal reclaim should be enough to
* re-evaluate if boosting is required when kswapd next wakes.
*/
balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
if (!balanced && nr_boost_reclaim) {
nr_boost_reclaim = 0;
goto restart;
}
/*
* If boosting is not active then only reclaim if there are no
* eligible zones. Note that sc.reclaim_idx is not used as
* buffer_heads_over_limit may have adjusted it.
*/
if (!nr_boost_reclaim && balanced)
goto out;
/* Limit the priority of boosting to avoid reclaim writeback */
if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
raise_priority = false;
/*
* Do not writeback or swap pages for boosted reclaim. The
* intent is to relieve pressure not issue sub-optimal IO
* from reclaim context. If no pages are reclaimed, the
* reclaim will be aborted.
*/
sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
sc.may_swap = !nr_boost_reclaim;
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming. All
* pages are rotated regardless of classzone as this is
* about consistent aging.
*/
age_active_anon(pgdat, &sc);
/*
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
*/
if (sc.priority < DEF_PRIORITY - 2)
sc.may_writepage = 1;
/* Call soft limit reclaim before calling shrink_node. */
sc.nr_scanned = 0;
nr_soft_scanned = 0;
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
sc.gfp_mask, &nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;
/*
* There should be no need to raise the scanning priority if
* enough pages are already being scanned that that high
* watermark would be met at 100% efficiency.
*/
if (kswapd_shrink_node(pgdat, &sc))
raise_priority = false;
/*
* If the low watermark is met there is no need for processes
* to be throttled on pfmemalloc_wait as they should not be
* able to safely make forward progress. Wake them
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
allow_direct_reclaim(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
__fs_reclaim_release(_THIS_IP_);
ret = try_to_freeze();
__fs_reclaim_acquire(_THIS_IP_);
if (ret || kthread_should_stop())
break;
/*
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
/*
* If reclaim made no progress for a boost, stop reclaim as
* IO cannot be queued and it could be an infinite loop in
* extreme circumstances.
*/
if (nr_boost_reclaim && !nr_reclaimed)
break;
if (raise_priority || !nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);
if (!sc.nr_reclaimed)
pgdat->kswapd_failures++;
out:
clear_reclaim_active(pgdat, highest_zoneidx);
/* If reclaim was boosted, account for the reclaim done in this pass */
if (boosted) {
unsigned long flags;
for (i = 0; i <= highest_zoneidx; i++) {
if (!zone_boosts[i])
continue;
/* Increments are under the zone lock */
zone = pgdat->node_zones + i;
spin_lock_irqsave(&zone->lock, flags);
zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
spin_unlock_irqrestore(&zone->lock, flags);
}
/*
* As there is now likely space, wakeup kcompact to defragment
* pageblocks.
*/
wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
}
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release(_THIS_IP_);
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
* entered the allocator slow path while kswapd was awake, order will
* remain at the higher level.
*/
return sc.order;
}
/*
* The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
* be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
* not a valid index then either kswapd runs for first time or kswapd couldn't
* sleep after previous reclaim attempt (node is still unbalanced). In that
* case return the zone index of the previous kswapd reclaim cycle.
*/
static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
enum zone_type prev_highest_zoneidx)
{
enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
}
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
unsigned int highest_zoneidx)
{
long remaining = 0;
DEFINE_WAIT(wait);
if (freezing(current) || kthread_should_stop())
return;
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/*
* Try to sleep for a short interval. Note that kcompactd will only be
* woken if it is possible to sleep for a short interval. This is
* deliberate on the assumption that if reclaim cannot keep an
* eligible zone balanced that it's also unlikely that compaction will
* succeed.
*/
if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
/*
* Compaction records what page blocks it recently failed to
* isolate pages from and skips them in the future scanning.
* When kswapd is going to sleep, it is reasonable to assume
* that pages and compaction may succeed so reset the cache.
*/
reset_isolation_suitable(pgdat);
/*
* We have freed the memory, now we should compact it to make
* allocation of the requested order possible.
*/
wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
remaining = schedule_timeout(HZ/10);
/*
* If woken prematurely then reset kswapd_highest_zoneidx and
* order. The values will either be from a wakeup request or
* the previous request that slept prematurely.
*/
if (remaining) {
WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
kswapd_highest_zoneidx(pgdat,
highest_zoneidx));
if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
}
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
}
/*
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
if (!remaining &&
prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
* vmstat counters are not perfectly accurate and the estimated
* value for counters such as NR_FREE_PAGES can deviate from the
* true value by nr_online_cpus * threshold. To avoid the zone
* watermarks being breached while under pressure, we reduce the
* per-cpu vmstat threshold while kswapd is awake and restore
* them before going back to sleep.
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
if (!kthread_should_stop())
schedule();
set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
} else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
else
count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
}
finish_wait(&pgdat->kswapd_wait, &wait);
}
/*
* The background pageout daemon, started as a kernel thread
* from the init process.
*
* This basically trickles out pages so that we have _some_
* free memory available even if there is no other activity
* that frees anything up. This is needed for things like routing
* etc, where we otherwise might have all activity going on in
* asynchronous contexts that cannot page things out.
*
* If there are applications that are active memory-allocators
* (most normal use), this basically shouldn't matter.
*/
static int kswapd(void *p)
{
unsigned int alloc_order, reclaim_order;
unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
/*
* Tell the memory management that we're a "memory allocator",
* and that if we need more memory we should get access to it
* regardless (see "__alloc_pages()"). "kswapd" should
* never get caught in the normal page freeing logic.
*
* (Kswapd normally doesn't need memory anyway, but sometimes
* you need a small amount of memory in order to be able to
* page out something else, and this flag essentially protects
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
for ( ; ; ) {
bool ret;
alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
highest_zoneidx = kswapd_highest_zoneidx(pgdat,
highest_zoneidx);
kswapd_try_sleep:
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
highest_zoneidx);
/* Read the new order and highest_zoneidx */
alloc_order = READ_ONCE(pgdat->kswapd_order);
highest_zoneidx = kswapd_highest_zoneidx(pgdat,
highest_zoneidx);
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
ret = try_to_freeze();
if (kthread_should_stop())
break;
/*
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
if (ret)
continue;
/*
* Reclaim begins at the requested order but if a high-order
* reclaim fails then kswapd falls back to reclaiming for
* order-0. If that happens, kswapd will consider sleeping
* for the order it finished reclaiming at (reclaim_order)
* but kcompactd is woken to compact for the original
* request (alloc_order).
*/
trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
alloc_order);
reclaim_order = balance_pgdat(pgdat, alloc_order,
highest_zoneidx);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
return 0;
}
/*
* A zone is low on free memory or too fragmented for high-order memory. If
* kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
* pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim
* has failed or is not needed, still wake up kcompactd if only compaction is
* needed.
*/
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
enum zone_type highest_zoneidx)
{
pg_data_t *pgdat;
enum zone_type curr_idx;
if (!managed_zone(zone))
return;
if (!cpuset_zone_allowed(zone, gfp_flags))
return;
pgdat = zone->zone_pgdat;
curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
if (READ_ONCE(pgdat->kswapd_order) < order)
WRITE_ONCE(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
/* Hopeless node, leave it to direct reclaim if possible */
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
(pgdat_balanced(pgdat, order, highest_zoneidx) &&
!pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
/*
* There may be plenty of free memory available, but it's too
* fragmented for high-order allocations. Wake up kcompactd
* and rely on compaction_suitable() to determine if it's
* needed. If it fails, it will defer subsequent attempts to
* ratelimit its work.
*/
if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
wakeup_kcompactd(pgdat, order, highest_zoneidx);
return;
}
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
gfp_flags);
wake_up_interruptible(&pgdat->kswapd_wait);
}
#ifdef CONFIG_HIBERNATION
/*
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
* freed pages.
*
* Rather than trying to age LRUs the aim is to preserve the overall
* LRU order by reclaiming preferentially
* inactive > active > active referenced > active mapped
*/
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
struct scan_control sc = {
.nr_to_reclaim = nr_to_reclaim,
.gfp_mask = GFP_HIGHUSER_MOVABLE,
.reclaim_idx = MAX_NR_ZONES - 1,
.priority = DEF_PRIORITY,
.may_writepage = 1,
.may_unmap = 1,
.may_swap = 1,
.hibernation_mode = 1,
};
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
fs_reclaim_acquire(sc.gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
set_task_reclaim_state(current, &sc.reclaim_state);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
set_task_reclaim_state(current, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
return nr_reclaimed;
}
#endif /* CONFIG_HIBERNATION */
/*
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
void kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
if (pgdat->kswapd)
return;
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
pgdat->kswapd = NULL;
}
}
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
* hold mem_hotplug_begin/end().
*/
void kswapd_stop(int nid)
{
struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
if (kswapd) {
kthread_stop(kswapd);
NODE_DATA(nid)->kswapd = NULL;
}
}
static int __init kswapd_init(void)
{
int nid;
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
return 0;
}
module_init(kswapd_init)
#ifdef CONFIG_NUMA
/*
* Node reclaim mode
*
* If non-zero call node_reclaim when the number of free pages falls below
* the watermarks.
*/
int node_reclaim_mode __read_mostly;
/*
* Priority for NODE_RECLAIM. This determines the fraction of pages
* of a node considered for each zone_reclaim. 4 scans 1/16th of
* a zone.
*/
#define NODE_RECLAIM_PRIORITY 4
/*
* Percentage of pages in a zone that must be unmapped for node_reclaim to
* occur.
*/
int sysctl_min_unmapped_ratio = 1;
/*
* If the number of slab pages in a zone grows beyond this percentage then
* slab reclaim needs to occur.
*/
int sysctl_min_slab_ratio = 5;
static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
{
unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
node_page_state(pgdat, NR_ACTIVE_FILE);
/*
* It's possible for there to be more file mapped pages than
* accounted for by the pages on the file LRU lists because
* tmpfs pages accounted for as ANON can also be FILE_MAPPED
*/
return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
{
unsigned long nr_pagecache_reclaimable;
unsigned long delta = 0;
/*
* If RECLAIM_UNMAP is set, then all file pages are considered
* potentially reclaimable. Otherwise, we have to worry about
* pages like swapcache and node_unmapped_file_pages() provides
* a better estimate
*/
if (node_reclaim_mode & RECLAIM_UNMAP)
nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
else
nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
/* If we can't clean pages, remove dirty pages from consideration */
if (!(node_reclaim_mode & RECLAIM_WRITE))
delta += node_page_state(pgdat, NR_FILE_DIRTY);
/* Watch for any possible underflows due to delta */
if (unlikely(delta > nr_pagecache_reclaimable))
delta = nr_pagecache_reclaimable;
return nr_pagecache_reclaimable - delta;
}
/*
* Try to free up some pages from this node through reclaim.
*/
static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = current_gfp_context(gfp_mask),
.order = order,
.priority = NODE_RECLAIM_PRIORITY,
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
.reclaim_idx = gfp_zone(gfp_mask),
};
unsigned long pflags;
trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
sc.gfp_mask);
cond_resched();
psi_memstall_enter(&pflags);
fs_reclaim_acquire(sc.gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
* and we also need to be able to write out pages for RECLAIM_WRITE
* and RECLAIM_UNMAP.
*/
noreclaim_flag = memalloc_noreclaim_save();
p->flags |= PF_SWAPWRITE;
set_task_reclaim_state(p, &sc.reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
* Free memory by calling shrink node with increasing
* priorities until we have enough memory freed.
*/
do {
shrink_node(pgdat, &sc);
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
set_task_reclaim_state(p, NULL);
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
psi_memstall_leave(&pflags);
trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
return sc.nr_reclaimed >= nr_pages;
}
int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
int ret;
/*
* Node reclaim reclaims unmapped file backed pages and
* slab pages if we are over the defined limits.
*
* A small portion of unmapped file backed pages is needed for
* file I/O otherwise pages read by file I/O will be immediately
* thrown out if the node is overallocated. So we do not reclaim
* if less than a specified percentage of the node is used by
* unmapped file backed pages.
*/
if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
pgdat->min_slab_pages)
return NODE_RECLAIM_FULL;
/*
* Do not scan if the allocation should not be delayed.
*/
if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
return NODE_RECLAIM_NOSCAN;
/*
* Only run node reclaim on the local node or on nodes that do not
* have associated processors. This will favor the local processor
* over remote processors and spread off node memory allocations
* as wide as possible.
*/
if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
return NODE_RECLAIM_NOSCAN;
if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
return NODE_RECLAIM_NOSCAN;
ret = __node_reclaim(pgdat, gfp_mask, order);
clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (!ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
return ret;
}
#endif
/**
* check_move_unevictable_pages - check pages for evictability and move to
* appropriate zone lru list
* @pvec: pagevec with lru pages to check
*
* Checks pages for evictability, if an evictable page is in the unevictable
* lru list, moves it to the appropriate evictable lru list. This function
* should be only used for lru pages.
*/
void check_move_unevictable_pages(struct pagevec *pvec)
{
struct lruvec *lruvec = NULL;
int pgscanned = 0;
int pgrescued = 0;
int i;
for (i = 0; i < pvec->nr; i++) {
struct page *page = pvec->pages[i];
int nr_pages;
if (PageTransTail(page))
continue;
nr_pages = thp_nr_pages(page);
pgscanned += nr_pages;
/* block memcg migration during page moving between lru */
if (!TestClearPageLRU(page))
continue;
lruvec = relock_page_lruvec_irq(page, lruvec);
if (page_evictable(page) && PageUnevictable(page)) {
del_page_from_lru_list(page, lruvec);
ClearPageUnevictable(page);
add_page_to_lru_list(page, lruvec);
pgrescued += nr_pages;
}
SetPageLRU(page);
}
if (lruvec) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
unlock_page_lruvec_irq(lruvec);
} else if (pgscanned) {
count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SWIOTLB_H
#define __LINUX_SWIOTLB_H
#include <linux/device.h>
#include <linux/dma-direction.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/limits.h>
#include <linux/spinlock.h>
struct device;
struct page;
struct scatterlist;
enum swiotlb_force {
SWIOTLB_NORMAL, /* Default - depending on HW DMA mask etc. */
SWIOTLB_FORCE, /* swiotlb=force */
SWIOTLB_NO_FORCE, /* swiotlb=noforce */
};
/*
* Maximum allowable number of contiguous slabs to map,
* must be a power of 2. What is the appropriate value ?
* The complexity of {map,unmap}_single is linearly dependent on this value.
*/
#define IO_TLB_SEGSIZE 128
/*
* log of the size of each IO TLB slab. The number of slabs is command line
* controllable.
*/
#define IO_TLB_SHIFT 11
#define IO_TLB_SIZE (1 << IO_TLB_SHIFT)
/* default to 64MB */
#define IO_TLB_DEFAULT_SIZE (64UL<<20)
extern void swiotlb_init(int verbose);
int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
unsigned long swiotlb_size_or_default(void);
extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
extern int swiotlb_late_init_with_default_size(size_t default_size);
extern void __init swiotlb_update_mem_attributes(void);
phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
size_t mapping_size, size_t alloc_size,
unsigned int alloc_aligned_mask, enum dma_data_direction dir,
unsigned long attrs);
extern void swiotlb_tbl_unmap_single(struct device *hwdev,
phys_addr_t tlb_addr,
size_t mapping_size,
enum dma_data_direction dir,
unsigned long attrs);
void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir);
void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir);
dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
size_t size, enum dma_data_direction dir, unsigned long attrs);
#ifdef CONFIG_SWIOTLB
extern enum swiotlb_force swiotlb_force;
/**
* struct io_tlb_mem - IO TLB Memory Pool Descriptor
*
* @start: The start address of the swiotlb memory pool. Used to do a quick
* range check to see if the memory was in fact allocated by this
* API.
* @end: The end address of the swiotlb memory pool. Used to do a quick
* range check to see if the memory was in fact allocated by this
* API.
* @nslabs: The number of IO TLB blocks (in groups of 64) between @start and
* @end. For default swiotlb, this is command line adjustable via
* setup_io_tlb_npages.
* @used: The number of used IO TLB block.
* @list: The free list describing the number of free entries available
* from each index.
* @index: The index to start searching in the next round.
* @orig_addr: The original address corresponding to a mapped entry.
* @alloc_size: Size of the allocated buffer.
* @lock: The lock to protect the above data structures in the map and
* unmap calls.
* @debugfs: The dentry to debugfs.
* @late_alloc: %true if allocated using the page allocator
* @force_bounce: %true if swiotlb bouncing is forced
* @for_alloc: %true if the pool is used for memory allocation
*/
struct io_tlb_mem {
phys_addr_t start;
phys_addr_t end;
unsigned long nslabs;
unsigned long used;
unsigned int index;
spinlock_t lock;
struct dentry *debugfs;
bool late_alloc;
bool force_bounce;
bool for_alloc;
struct io_tlb_slot {
phys_addr_t orig_addr;
size_t alloc_size;
unsigned int list;
} *slots;
};
extern struct io_tlb_mem io_tlb_default_mem;
static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
return mem && paddr >= mem->start && paddr < mem->end;
}
static inline bool is_swiotlb_force_bounce(struct device *dev)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
return mem && mem->force_bounce;
}
void __init swiotlb_exit(void);
unsigned int swiotlb_max_segment(void);
size_t swiotlb_max_mapping_size(struct device *dev);
bool is_swiotlb_active(struct device *dev);
void __init swiotlb_adjust_size(unsigned long size);
#else
#define swiotlb_force SWIOTLB_NO_FORCE
static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
{
return false;
}
static inline bool is_swiotlb_force_bounce(struct device *dev)
{
return false;
}
static inline void swiotlb_exit(void)
{
}
static inline unsigned int swiotlb_max_segment(void)
{
return 0;
}
static inline size_t swiotlb_max_mapping_size(struct device *dev)
{
return SIZE_MAX;
}
static inline bool is_swiotlb_active(struct device *dev)
{
return false;
}
static inline void swiotlb_adjust_size(unsigned long size)
{
}
#endif /* CONFIG_SWIOTLB */
extern void swiotlb_print_info(void);
extern void swiotlb_set_max_segment(unsigned int);
#ifdef CONFIG_DMA_RESTRICTED_POOL
struct page *swiotlb_alloc(struct device *dev, size_t size);
bool swiotlb_free(struct device *dev, struct page *page, size_t size);
static inline bool is_swiotlb_for_alloc(struct device *dev)
{
return dev->dma_io_tlb_mem->for_alloc;
}
#else
static inline struct page *swiotlb_alloc(struct device *dev, size_t size)
{
return NULL;
}
static inline bool swiotlb_free(struct device *dev, struct page *page,
size_t size)
{
return false;
}
static inline bool is_swiotlb_for_alloc(struct device *dev)
{
return false;
}
#endif /* CONFIG_DMA_RESTRICTED_POOL */
#endif /* __LINUX_SWIOTLB_H */
// SPDX-License-Identifier: GPL-2.0
#include "misc.h"
#include "ctree.h"
#include "block-rsv.h"
#include "space-info.h"
#include "transaction.h"
#include "block-group.h"
/*
* HOW DO BLOCK RESERVES WORK
*
* Think of block_rsv's as buckets for logically grouped metadata
* reservations. Each block_rsv has a ->size and a ->reserved. ->size is
* how large we want our block rsv to be, ->reserved is how much space is
* currently reserved for this block reserve.
*
* ->failfast exists for the truncate case, and is described below.
*
* NORMAL OPERATION
*
* -> Reserve
* Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill
*
* We call into btrfs_reserve_metadata_bytes() with our bytes, which is
* accounted for in space_info->bytes_may_use, and then add the bytes to
* ->reserved, and ->size in the case of btrfs_block_rsv_add.
*
* ->size is an over-estimation of how much we may use for a particular
* operation.
*
* -> Use
* Entrance: btrfs_use_block_rsv
*
* When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv()
* to determine the appropriate block_rsv to use, and then verify that
* ->reserved has enough space for our tree block allocation. Once
* successful we subtract fs_info->nodesize from ->reserved.
*
* -> Finish
* Entrance: btrfs_block_rsv_release
*
* We are finished with our operation, subtract our individual reservation
* from ->size, and then subtract ->size from ->reserved and free up the
* excess if there is any.
*
* There is some logic here to refill the delayed refs rsv or the global rsv
* as needed, otherwise the excess is subtracted from
* space_info->bytes_may_use.
*
* TYPES OF BLOCK RESERVES
*
* BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK
* These behave normally, as described above, just within the confines of the
* lifetime of their particular operation (transaction for the whole trans
* handle lifetime, for example).
*
* BLOCK_RSV_GLOBAL
* It is impossible to properly account for all the space that may be required
* to make our extent tree updates. This block reserve acts as an overflow
* buffer in case our delayed refs reserve does not reserve enough space to
* update the extent tree.
*
* We can steal from this in some cases as well, notably on evict() or
* truncate() in order to help users recover from ENOSPC conditions.
*
* BLOCK_RSV_DELALLOC
* The individual item sizes are determined by the per-inode size
* calculations, which are described with the delalloc code. This is pretty
* straightforward, it's just the calculation of ->size encodes a lot of
* different items, and thus it gets used when updating inodes, inserting file
* extents, and inserting checksums.
*
* BLOCK_RSV_DELREFS
* We keep a running tally of how many delayed refs we have on the system.
* We assume each one of these delayed refs are going to use a full
* reservation. We use the transaction items and pre-reserve space for every
* operation, and use this reservation to refill any gap between ->size and
* ->reserved that may exist.
*
* From there it's straightforward, removing a delayed ref means we remove its
* count from ->size and free up reservations as necessary. Since this is
* the most dynamic block reserve in the system, we will try to refill this
* block reserve first with any excess returned by any other block reserve.
*
* BLOCK_RSV_EMPTY
* This is the fallback block reserve to make us try to reserve space if we
* don't have a specific bucket for this allocation. It is mostly used for
* updating the device tree and such, since that is a separate pool we're
* content to just reserve space from the space_info on demand.
*
* BLOCK_RSV_TEMP
* This is used by things like truncate and iput. We will temporarily
* allocate a block reserve, set it to some size, and then truncate bytes
* until we have no space left. With ->failfast set we'll simply return
* ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try
* to make a new reservation. This is because these operations are
* unbounded, so we want to do as much work as we can, and then back off and
* re-reserve.
*/
static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes,
u64 *qgroup_to_release_ret)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
u64 qgroup_to_release = 0;
u64 ret;
spin_lock(&block_rsv->lock);
if (num_bytes == (u64)-1) {
num_bytes = block_rsv->size;
qgroup_to_release = block_rsv->qgroup_rsv_size;
}
block_rsv->size -= num_bytes;
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
block_rsv->reserved = block_rsv->size;
block_rsv->full = 1;
} else {
num_bytes = 0;
}
if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { qgroup_to_release = block_rsv->qgroup_rsv_reserved -
block_rsv->qgroup_rsv_size;
block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
} else {
qgroup_to_release = 0;
}
spin_unlock(&block_rsv->lock);
ret = num_bytes;
if (num_bytes > 0) {
if (dest) {
spin_lock(&dest->lock);
if (!dest->full) {
u64 bytes_to_add;
bytes_to_add = dest->size - dest->reserved;
bytes_to_add = min(num_bytes, bytes_to_add);
dest->reserved += bytes_to_add;
if (dest->reserved >= dest->size)
dest->full = 1; num_bytes -= bytes_to_add;
}
spin_unlock(&dest->lock);
}
if (num_bytes)
btrfs_space_info_free_bytes_may_use(fs_info,
space_info,
num_bytes);
}
if (qgroup_to_release_ret) *qgroup_to_release_ret = qgroup_to_release;
return ret;
}
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
struct btrfs_block_rsv *dst, u64 num_bytes,
bool update_size)
{
int ret;
ret = btrfs_block_rsv_use_bytes(src, num_bytes);
if (ret)
return ret;
btrfs_block_rsv_add_bytes(dst, num_bytes, update_size); return 0;
}
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
{
memset(rsv, 0, sizeof(*rsv));
spin_lock_init(&rsv->lock);
rsv->type = type;
}
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
unsigned short type)
{
btrfs_init_block_rsv(rsv, type);
rsv->space_info = btrfs_find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
}
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type)
{
struct btrfs_block_rsv *block_rsv;
block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
if (!block_rsv)
return NULL;
btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); return block_rsv;
}
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
if (!rsv)
return;
btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
kfree(rsv);
}
int btrfs_block_rsv_add(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush)
{
int ret;
if (num_bytes == 0)
return 0;
ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret)
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
return ret;
}
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
{
u64 num_bytes = 0;
int ret = -ENOSPC;
if (!block_rsv)
return 0;
spin_lock(&block_rsv->lock);
num_bytes = div_factor(block_rsv->size, min_factor);
if (block_rsv->reserved >= num_bytes)
ret = 0;
spin_unlock(&block_rsv->lock);
return ret;
}
int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush)
{
u64 num_bytes = 0;
int ret = -ENOSPC;
if (!block_rsv)
return 0;
spin_lock(&block_rsv->lock);
num_bytes = min_reserved;
if (block_rsv->reserved >= num_bytes)
ret = 0;
else
num_bytes -= block_rsv->reserved;
spin_unlock(&block_rsv->lock);
if (!ret)
return 0;
ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
}
return ret;
}
u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
u64 *qgroup_to_release)
{
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_block_rsv *target = NULL;
/*
* If we are the delayed_rsv then push to the global rsv, otherwise dump
* into the delayed rsv if it is not full.
*/
if (block_rsv == delayed_rsv)
target = global_rsv;
else if (block_rsv != global_rsv && !delayed_rsv->full)
target = delayed_rsv;
if (target && block_rsv->space_info != target->space_info)
target = NULL;
return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
qgroup_to_release);
}
int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
{
int ret = -ENOSPC;
spin_lock(&block_rsv->lock);
if (block_rsv->reserved >= num_bytes) {
block_rsv->reserved -= num_bytes;
if (block_rsv->reserved < block_rsv->size)
block_rsv->full = 0;
ret = 0;
}
spin_unlock(&block_rsv->lock);
return ret;
}
void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes, bool update_size)
{
spin_lock(&block_rsv->lock);
block_rsv->reserved += num_bytes;
if (update_size)
block_rsv->size += num_bytes; else if (block_rsv->reserved >= block_rsv->size) block_rsv->full = 1;
spin_unlock(&block_rsv->lock);
}
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor)
{
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 min_bytes;
if (global_rsv->space_info != dest->space_info)
return -ENOSPC;
spin_lock(&global_rsv->lock);
min_bytes = div_factor(global_rsv->size, min_factor);
if (global_rsv->reserved < min_bytes + num_bytes) {
spin_unlock(&global_rsv->lock);
return -ENOSPC;
}
global_rsv->reserved -= num_bytes;
if (global_rsv->reserved < global_rsv->size)
global_rsv->full = 0;
spin_unlock(&global_rsv->lock);
btrfs_block_rsv_add_bytes(dest, num_bytes, true);
return 0;
}
void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
struct btrfs_space_info *sinfo = block_rsv->space_info;
u64 num_bytes;
unsigned min_items;
/*
* The global block rsv is based on the size of the extent tree, the
* checksum tree and the root tree. If the fs is empty we want to set
* it to a minimal amount for safety.
*/
num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
btrfs_root_used(&fs_info->csum_root->root_item) +
btrfs_root_used(&fs_info->tree_root->root_item);
/*
* We at a minimum are going to modify the csum root, the tree root, and
* the extent root.
*/
min_items = 3;
/*
* But we also want to reserve enough space so we can do the fallback
* global reserve for an unlink, which is an additional 5 items (see the
* comment in __unlink_start_trans for what we're modifying.)
*
* But we also need space for the delayed ref updates from the unlink,
* so its 10, 5 for the actual operation, and 5 for the delayed ref
* updates.
*/
min_items += 10;
num_bytes = max_t(u64, num_bytes,
btrfs_calc_insert_metadata_size(fs_info, min_items));
spin_lock(&sinfo->lock);
spin_lock(&block_rsv->lock);
block_rsv->size = min_t(u64, num_bytes, SZ_512M);
if (block_rsv->reserved < block_rsv->size) {
num_bytes = block_rsv->size - block_rsv->reserved;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
num_bytes);
block_rsv->reserved = block_rsv->size;
} else if (block_rsv->reserved > block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
-num_bytes);
block_rsv->reserved = block_rsv->size;
btrfs_try_granting_tickets(fs_info, sinfo);
}
if (block_rsv->reserved == block_rsv->size)
block_rsv->full = 1;
else
block_rsv->full = 0;
if (block_rsv->size >= sinfo->total_bytes)
sinfo->force_alloc = CHUNK_ALLOC_FORCE;
spin_unlock(&block_rsv->lock);
spin_unlock(&sinfo->lock);
}
void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *space_info;
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
fs_info->chunk_block_rsv.space_info = space_info;
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
fs_info->global_block_rsv.space_info = space_info;
fs_info->trans_block_rsv.space_info = space_info;
fs_info->empty_block_rsv.space_info = space_info;
fs_info->delayed_block_rsv.space_info = space_info;
fs_info->delayed_refs_rsv.space_info = space_info;
/*
* Our various recovery options can leave us with NULL roots, so check
* here and just bail before we go dereferencing NULLs everywhere.
*/
if (!fs_info->extent_root || !fs_info->csum_root || !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root)
return;
fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
if (fs_info->quota_root)
fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
btrfs_update_global_block_rsv(fs_info);
}
void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1,
NULL);
WARN_ON(fs_info->trans_block_rsv.size > 0); WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_block_rsv.size > 0); WARN_ON(fs_info->delayed_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); WARN_ON(fs_info->delayed_refs_rsv.size > 0);
}
static struct btrfs_block_rsv *get_block_rsv(
const struct btrfs_trans_handle *trans,
const struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv = NULL;
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
(root == fs_info->csum_root && trans->adding_csums) || (root == fs_info->uuid_root)) block_rsv = trans->block_rsv;
if (!block_rsv)
block_rsv = root->block_rsv;
if (!block_rsv)
block_rsv = &fs_info->empty_block_rsv;
return block_rsv;
}
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u32 blocksize)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
bool global_updated = false;
block_rsv = get_block_rsv(trans, root);
if (unlikely(block_rsv->size == 0))
goto try_reserve;
again:
ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
if (!ret)
return block_rsv;
if (block_rsv->failfast) return ERR_PTR(ret); if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
global_updated = true;
btrfs_update_global_block_rsv(fs_info);
goto again;
}
/*
* The global reserve still exists to save us from ourselves, so don't
* warn_on if we are short on our delayed refs reserve.
*/
if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL * 10,
/*DEFAULT_RATELIMIT_BURST*/ 1);
if (__ratelimit(&_rs)) WARN(1, KERN_DEBUG
"BTRFS: block rsv %d returned %d\n",
block_rsv->type, ret);
}
try_reserve:
ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
/*
* If we couldn't reserve metadata bytes try and use some from
* the global reserve if its space type is the same as the global
* reservation.
*/
if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && block_rsv->space_info == global_rsv->space_info) {
ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
if (!ret)
return global_rsv;
}
return ERR_PTR(ret);
}
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
*/
/*
* fsnotify inode mark locking/lifetime/and refcnting
*
* REFCNT:
* The group->recnt and mark->refcnt tell how many "things" in the kernel
* currently are referencing the objects. Both kind of objects typically will
* live inside the kernel with a refcnt of 2, one for its creation and one for
* the reference a group and a mark hold to each other.
* If you are holding the appropriate locks, you can take a reference and the
* object itself is guaranteed to survive until the reference is dropped.
*
* LOCKING:
* There are 3 locks involved with fsnotify inode marks and they MUST be taken
* in order as follows:
*
* group->mark_mutex
* mark->lock
* mark->connector->lock
*
* group->mark_mutex protects the marks_list anchored inside a given group and
* each mark is hooked via the g_list. It also protects the groups private
* data (i.e group limits).
* mark->lock protects the marks attributes like its masks and flags.
* Furthermore it protects the access to a reference of the group that the mark
* is assigned to as well as the access to a reference of the inode/vfsmount
* that is being watched by the mark.
*
* mark->connector->lock protects the list of marks anchored inside an
* inode / vfsmount and each mark is hooked via the i_list.
*
* A list of notification marks relating to inode / mnt is contained in
* fsnotify_mark_connector. That structure is alive as long as there are any
* marks in the list and is also protected by fsnotify_mark_srcu. A mark gets
* detached from fsnotify_mark_connector when last reference to the mark is
* dropped. Thus having mark reference is enough to protect mark->connector
* pointer and to make sure fsnotify_mark_connector cannot disappear. Also
* because we remove mark from g_list before dropping mark reference associated
* with that, any mark found through g_list is guaranteed to have
* mark->connector set until we drop group->mark_mutex.
*
* LIFETIME:
* Inode marks survive between when they are added to an inode and when their
* refcnt==0. Marks are also protected by fsnotify_mark_srcu.
*
* The inode mark can be cleared for a number of different reasons including:
* - The inode is unlinked for the last time. (fsnotify_inode_remove)
* - The inode is being evicted from cache. (fsnotify_inode_delete)
* - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes)
* - Something explicitly requests that it be removed. (fsnotify_destroy_mark)
* - The fsnotify_group associated with the mark is going away and all such marks
* need to be cleaned up. (fsnotify_clear_marks_by_group)
*
* This has the very interesting property of being able to run concurrently with
* any (or all) other directions.
*/
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
#include <linux/ratelimit.h>
#include <linux/atomic.h>
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
#define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */
struct srcu_struct fsnotify_mark_srcu;
struct kmem_cache *fsnotify_mark_connector_cachep;
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
static struct fsnotify_mark_connector *connector_destroy_list;
static void fsnotify_mark_destroy_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
static void fsnotify_connector_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
WARN_ON_ONCE(!refcount_read(&mark->refcnt));
refcount_inc(&mark->refcnt);
}
static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn)
{
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
return &fsnotify_conn_inode(conn)->i_fsnotify_mask;
else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT)
return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask;
else if (conn->type == FSNOTIFY_OBJ_TYPE_SB)
return &fsnotify_conn_sb(conn)->s_fsnotify_mask;
return NULL;
}
__u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn)
{
if (WARN_ON(!fsnotify_valid_obj_type(conn->type)))
return 0;
return *fsnotify_conn_mask_p(conn);
}
static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
u32 new_mask = 0;
struct fsnotify_mark *mark;
assert_spin_locked(&conn->lock);
/* We can get detached connector here when inode is getting unlinked. */
if (!fsnotify_valid_obj_type(conn->type))
return;
hlist_for_each_entry(mark, &conn->list, obj_list) {
if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)
new_mask |= mark->mask;
}
*fsnotify_conn_mask_p(conn) = new_mask;
}
/*
* Calculate mask of events for a list of marks. The caller must make sure
* connector and connector->obj cannot disappear under us. Callers achieve
* this by holding a mark->lock or mark->group->mark_mutex for a mark on this
* list.
*/
void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
if (!conn)
return;
spin_lock(&conn->lock);
__fsnotify_recalc_mask(conn);
spin_unlock(&conn->lock);
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
__fsnotify_update_child_dentry_flags(
fsnotify_conn_inode(conn));
}
/* Free all connectors queued for freeing once SRCU period ends */
static void fsnotify_connector_destroy_workfn(struct work_struct *work)
{
struct fsnotify_mark_connector *conn, *free;
spin_lock(&destroy_lock);
conn = connector_destroy_list;
connector_destroy_list = NULL;
spin_unlock(&destroy_lock);
synchronize_srcu(&fsnotify_mark_srcu);
while (conn) {
free = conn;
conn = conn->destroy_next;
kmem_cache_free(fsnotify_mark_connector_cachep, free);
}
}
static void fsnotify_get_inode_ref(struct inode *inode)
{
ihold(inode);
atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
}
static void fsnotify_put_inode_ref(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
iput(inode);
if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
wake_up_var(&sb->s_fsnotify_connectors);
}
static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn)
{
struct super_block *sb = fsnotify_connector_sb(conn);
if (sb)
atomic_long_inc(&sb->s_fsnotify_connectors);
}
static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn)
{
struct super_block *sb = fsnotify_connector_sb(conn);
if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
wake_up_var(&sb->s_fsnotify_connectors);
}
static void *fsnotify_detach_connector_from_object(
struct fsnotify_mark_connector *conn,
unsigned int *type)
{
struct inode *inode = NULL;
*type = conn->type;
if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED)
return NULL;
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
inode->i_fsnotify_mask = 0;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
}
fsnotify_put_sb_connectors(conn);
rcu_assign_pointer(*(conn->obj), NULL);
conn->obj = NULL;
conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
return inode;
}
static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
{
struct fsnotify_group *group = mark->group;
if (WARN_ON_ONCE(!group))
return;
group->ops->free_mark(mark);
fsnotify_put_group(group);
}
/* Drop object reference originally held by a connector */
static void fsnotify_drop_object(unsigned int type, void *objp)
{
if (!objp)
return;
/* Currently only inode references are passed to be dropped */
if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
return;
fsnotify_put_inode_ref(objp);
}
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector);
void *objp = NULL;
unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED;
bool free_conn = false;
/* Catch marks that were actually never attached to object */
if (!conn) {
if (refcount_dec_and_test(&mark->refcnt))
fsnotify_final_mark_destroy(mark);
return;
}
/*
* We have to be careful so that traversals of obj_list under lock can
* safely grab mark reference.
*/
if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock))
return;
hlist_del_init_rcu(&mark->obj_list);
if (hlist_empty(&conn->list)) {
objp = fsnotify_detach_connector_from_object(conn, &type);
free_conn = true;
} else {
__fsnotify_recalc_mask(conn);
}
WRITE_ONCE(mark->connector, NULL);
spin_unlock(&conn->lock);
fsnotify_drop_object(type, objp);
if (free_conn) {
spin_lock(&destroy_lock);
conn->destroy_next = connector_destroy_list;
connector_destroy_list = conn;
spin_unlock(&destroy_lock);
queue_work(system_unbound_wq, &connector_reaper_work);
}
/*
* Note that we didn't update flags telling whether inode cares about
* what's happening with children. We update these flags from
* __fsnotify_parent() lazily when next event happens on one of our
* children.
*/
spin_lock(&destroy_lock);
list_add(&mark->g_list, &destroy_list);
spin_unlock(&destroy_lock);
queue_delayed_work(system_unbound_wq, &reaper_work,
FSNOTIFY_REAPER_DELAY);
}
EXPORT_SYMBOL_GPL(fsnotify_put_mark);
/*
* Get mark reference when we found the mark via lockless traversal of object
* list. Mark can be already removed from the list by now and on its way to be
* destroyed once SRCU period ends.
*
* Also pin the group so it doesn't disappear under us.
*/
static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
{
if (!mark)
return true;
if (refcount_inc_not_zero(&mark->refcnt)) {
spin_lock(&mark->lock);
if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) {
/* mark is attached, group is still alive then */
atomic_inc(&mark->group->user_waits);
spin_unlock(&mark->lock);
return true;
}
spin_unlock(&mark->lock);
fsnotify_put_mark(mark);
}
return false;
}
/*
* Puts marks and wakes up group destruction if necessary.
*
* Pairs with fsnotify_get_mark_safe()
*/
static void fsnotify_put_mark_wake(struct fsnotify_mark *mark)
{
if (mark) {
struct fsnotify_group *group = mark->group;
fsnotify_put_mark(mark);
/*
* We abuse notification_waitq on group shutdown for waiting for
* all marks pinned when waiting for userspace.
*/
if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
wake_up(&group->notification_waitq);
}
}
bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info)
__releases(&fsnotify_mark_srcu)
{
int type;
fsnotify_foreach_obj_type(type) {
/* This can fail if mark is being removed */
if (!fsnotify_get_mark_safe(iter_info->marks[type])) {
__release(&fsnotify_mark_srcu);
goto fail;
}
}
/*
* Now that both marks are pinned by refcount in the inode / vfsmount
* lists, we can drop SRCU lock, and safely resume the list iteration
* once userspace returns.
*/
srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx);
return true;
fail:
for (type--; type >= 0; type--)
fsnotify_put_mark_wake(iter_info->marks[type]);
return false;
}
void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
__acquires(&fsnotify_mark_srcu)
{
int type;
iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
fsnotify_foreach_obj_type(type)
fsnotify_put_mark_wake(iter_info->marks[type]);
}
/*
* Mark mark as detached, remove it from group list. Mark still stays in object
* list until its last reference is dropped. Note that we rely on mark being
* removed from group list before corresponding reference to it is dropped. In
* particular we rely on mark->connector being valid while we hold
* group->mark_mutex if we found the mark through g_list.
*
* Must be called with group->mark_mutex held. The caller must either hold
* reference to the mark or be protected by fsnotify_mark_srcu.
*/
void fsnotify_detach_mark(struct fsnotify_mark *mark)
{
struct fsnotify_group *group = mark->group;
WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));
WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
refcount_read(&mark->refcnt) < 1 +
!!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));
spin_lock(&mark->lock);
/* something else already called this function on this mark */
if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
spin_unlock(&mark->lock);
return;
}
mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
list_del_init(&mark->g_list);
spin_unlock(&mark->lock);
/* Drop mark reference acquired in fsnotify_add_mark_locked() */
fsnotify_put_mark(mark);
}
/*
* Free fsnotify mark. The mark is actually only marked as being freed. The
* freeing is actually happening only once last reference to the mark is
* dropped from a workqueue which first waits for srcu period end.
*
* Caller must have a reference to the mark or be protected by
* fsnotify_mark_srcu.
*/
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
struct fsnotify_group *group = mark->group;
spin_lock(&mark->lock);
/* something else already called this function on this mark */
if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
spin_unlock(&mark->lock);
return;
}
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
/*
* Some groups like to know that marks are being freed. This is a
* callback to the group function to let it know that this mark
* is being freed.
*/
if (group->ops->freeing_mark)
group->ops->freeing_mark(mark, group);
}
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group)
{
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
fsnotify_detach_mark(mark);
mutex_unlock(&group->mark_mutex);
fsnotify_free_mark(mark);
}
EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);
/*
* Sorting function for lists of fsnotify marks.
*
* Fanotify supports different notification classes (reflected as priority of
* notification group). Events shall be passed to notification groups in
* decreasing priority order. To achieve this marks in notification lists for
* inodes and vfsmounts are sorted so that priorities of corresponding groups
* are descending.
*
* Furthermore correct handling of the ignore mask requires processing inode
* and vfsmount marks of each group together. Using the group address as
* further sort criterion provides a unique sorting order and thus we can
* merge inode and vfsmount lists of marks in linear time and find groups
* present in both lists.
*
* A return value of 1 signifies that b has priority over a.
* A return value of 0 signifies that the two marks have to be handled together.
* A return value of -1 signifies that a has priority over b.
*/
int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
{
if (a == b)
return 0;
if (!a)
return 1;
if (!b)
return -1;
if (a->priority < b->priority)
return 1;
if (a->priority > b->priority)
return -1;
if (a < b) return 1;
return -1;
}
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
unsigned int type,
__kernel_fsid_t *fsid)
{
struct inode *inode = NULL;
struct fsnotify_mark_connector *conn;
conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
if (!conn)
return -ENOMEM;
spin_lock_init(&conn->lock);
INIT_HLIST_HEAD(&conn->list);
conn->type = type;
conn->obj = connp;
/* Cache fsid of filesystem containing the object */
if (fsid) {
conn->fsid = *fsid;
conn->flags = FSNOTIFY_CONN_FLAG_HAS_FSID;
} else {
conn->fsid.val[0] = conn->fsid.val[1] = 0;
conn->flags = 0;
}
if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = fsnotify_conn_inode(conn);
fsnotify_get_inode_ref(inode);
}
fsnotify_get_sb_connectors(conn);
/*
* cmpxchg() provides the barrier so that readers of *connp can see
* only initialized structure
*/
if (cmpxchg(connp, NULL, conn)) {
/* Someone else created list structure for us */
if (inode)
fsnotify_put_inode_ref(inode);
fsnotify_put_sb_connectors(conn);
kmem_cache_free(fsnotify_mark_connector_cachep, conn);
}
return 0;
}
/*
* Get mark connector, make sure it is alive and return with its lock held.
* This is for users that get connector pointer from inode or mount. Users that
* hold reference to a mark on the list may directly lock connector->lock as
* they are sure list cannot go away under them.
*/
static struct fsnotify_mark_connector *fsnotify_grab_connector(
fsnotify_connp_t *connp)
{
struct fsnotify_mark_connector *conn;
int idx;
idx = srcu_read_lock(&fsnotify_mark_srcu);
conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
if (!conn)
goto out;
spin_lock(&conn->lock);
if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) {
spin_unlock(&conn->lock);
srcu_read_unlock(&fsnotify_mark_srcu, idx);
return NULL;
}
out:
srcu_read_unlock(&fsnotify_mark_srcu, idx);
return conn;}
/*
* Add mark into proper place in given list of marks. These marks may be used
* for the fsnotify backend to determine which event types should be delivered
* to which group and for which inodes. These marks are ordered according to
* priority, highest number first, and then by the group's location in memory.
*/
static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int type,
int allow_dups, __kernel_fsid_t *fsid)
{
struct fsnotify_mark *lmark, *last = NULL;
struct fsnotify_mark_connector *conn;
int cmp;
int err = 0;
if (WARN_ON(!fsnotify_valid_obj_type(type)))
return -EINVAL;
/* Backend is expected to check for zero fsid (e.g. tmpfs) */
if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1]))
return -ENODEV;
restart:
spin_lock(&mark->lock);
conn = fsnotify_grab_connector(connp);
if (!conn) {
spin_unlock(&mark->lock);
err = fsnotify_attach_connector_to_object(connp, type, fsid);
if (err)
return err;
goto restart;
} else if (fsid && !(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) {
conn->fsid = *fsid;
/* Pairs with smp_rmb() in fanotify_get_fsid() */
smp_wmb();
conn->flags |= FSNOTIFY_CONN_FLAG_HAS_FSID;
} else if (fsid && (conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID) &&
(fsid->val[0] != conn->fsid.val[0] ||
fsid->val[1] != conn->fsid.val[1])) {
/*
* Backend is expected to check for non uniform fsid
* (e.g. btrfs), but maybe we missed something?
* Only allow setting conn->fsid once to non zero fsid.
* inotify and non-fid fanotify groups do not set nor test
* conn->fsid.
*/
pr_warn_ratelimited("%s: fsid mismatch on object of type %u: "
"%x.%x != %x.%x\n", __func__, conn->type,
fsid->val[0], fsid->val[1],
conn->fsid.val[0], conn->fsid.val[1]);
err = -EXDEV;
goto out_err;
}
/* is mark the first mark? */
if (hlist_empty(&conn->list)) {
hlist_add_head_rcu(&mark->obj_list, &conn->list);
goto added;
}
/* should mark be in the middle of the current list? */
hlist_for_each_entry(lmark, &conn->list, obj_list) {
last = lmark;
if ((lmark->group == mark->group) &&
(lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) &&
!allow_dups) {
err = -EEXIST;
goto out_err;
}
cmp = fsnotify_compare_groups(lmark->group, mark->group);
if (cmp >= 0) {
hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
goto added;
}
}
BUG_ON(last == NULL);
/* mark should be the last entry. last is the current last entry */
hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
added:
/*
* Since connector is attached to object using cmpxchg() we are
* guaranteed that connector initialization is fully visible by anyone
* seeing mark->connector set.
*/
WRITE_ONCE(mark->connector, conn);
out_err:
spin_unlock(&conn->lock);
spin_unlock(&mark->lock);
return err;
}
/*
* Attach an initialized mark to a given group and fs object.
* These marks may be used for the fsnotify backend to determine which
* event types should be delivered to which group.
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int type,
int allow_dups, __kernel_fsid_t *fsid)
{
struct fsnotify_group *group = mark->group;
int ret = 0;
BUG_ON(!mutex_is_locked(&group->mark_mutex));
/*
* LOCKING ORDER!!!!
* group->mark_mutex
* mark->lock
* mark->connector->lock
*/
spin_lock(&mark->lock);
mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
list_add(&mark->g_list, &group->marks_list);
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid);
if (ret)
goto err;
if (mark->mask)
fsnotify_recalc_mask(mark->connector);
return ret;
err:
spin_lock(&mark->lock);
mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE |
FSNOTIFY_MARK_FLAG_ATTACHED);
list_del_init(&mark->g_list);
spin_unlock(&mark->lock);
fsnotify_put_mark(mark);
return ret;
}
int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
unsigned int type, int allow_dups, __kernel_fsid_t *fsid)
{
int ret;
struct fsnotify_group *group = mark->group;
mutex_lock(&group->mark_mutex);
ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid);
mutex_unlock(&group->mark_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(fsnotify_add_mark);
/*
* Given a list of marks, find the mark associated with given group. If found
* take a reference to that mark and return it, else return NULL.
*/
struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
struct fsnotify_group *group)
{
struct fsnotify_mark_connector *conn;
struct fsnotify_mark *mark;
conn = fsnotify_grab_connector(connp);
if (!conn)
return NULL;
hlist_for_each_entry(mark, &conn->list, obj_list) {
if (mark->group == group &&
(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
fsnotify_get_mark(mark);
spin_unlock(&conn->lock);
return mark;
}
}
spin_unlock(&conn->lock);
return NULL;
}
EXPORT_SYMBOL_GPL(fsnotify_find_mark);
/* Clear any marks in a group with given type mask */
void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
unsigned int type_mask)
{
struct fsnotify_mark *lmark, *mark;
LIST_HEAD(to_free);
struct list_head *head = &to_free;
/* Skip selection step if we want to clear all marks. */
if (type_mask == FSNOTIFY_OBJ_ALL_TYPES_MASK) {
head = &group->marks_list;
goto clear;
}
/*
* We have to be really careful here. Anytime we drop mark_mutex, e.g.
* fsnotify_clear_marks_by_inode() can come and free marks. Even in our
* to_free list so we have to use mark_mutex even when accessing that
* list. And freeing mark requires us to drop mark_mutex. So we can
* reliably free only the first mark in the list. That's why we first
* move marks to free to to_free list in one go and then free marks in
* to_free list one by one.
*/
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
if ((1U << mark->connector->type) & type_mask)
list_move(&mark->g_list, &to_free);
}
mutex_unlock(&group->mark_mutex);
clear:
while (1) {
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
if (list_empty(head)) {
mutex_unlock(&group->mark_mutex);
break;
}
mark = list_first_entry(head, struct fsnotify_mark, g_list);
fsnotify_get_mark(mark);
fsnotify_detach_mark(mark);
mutex_unlock(&group->mark_mutex);
fsnotify_free_mark(mark);
fsnotify_put_mark(mark);
}
}
/* Destroy all marks attached to an object via connector */
void fsnotify_destroy_marks(fsnotify_connp_t *connp)
{
struct fsnotify_mark_connector *conn;
struct fsnotify_mark *mark, *old_mark = NULL;
void *objp;
unsigned int type;
conn = fsnotify_grab_connector(connp);
if (!conn)
return;
/*
* We have to be careful since we can race with e.g.
* fsnotify_clear_marks_by_group() and once we drop the conn->lock, the
* list can get modified. However we are holding mark reference and
* thus our mark cannot be removed from obj_list so we can continue
* iteration after regaining conn->lock.
*/
hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark);
spin_unlock(&conn->lock);
if (old_mark)
fsnotify_put_mark(old_mark);
old_mark = mark;
fsnotify_destroy_mark(mark, mark->group);
spin_lock(&conn->lock);
}
/*
* Detach list from object now so that we don't pin inode until all
* mark references get dropped. It would lead to strange results such
* as delaying inode deletion or blocking unmount.
*/
objp = fsnotify_detach_connector_from_object(conn, &type);
spin_unlock(&conn->lock);
if (old_mark)
fsnotify_put_mark(old_mark); fsnotify_drop_object(type, objp);
}
/*
* Nothing fancy, just initialize lists and locks and counters.
*/
void fsnotify_init_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group)
{
memset(mark, 0, sizeof(*mark));
spin_lock_init(&mark->lock);
refcount_set(&mark->refcnt, 1);
fsnotify_get_group(group);
mark->group = group;
WRITE_ONCE(mark->connector, NULL);
}
EXPORT_SYMBOL_GPL(fsnotify_init_mark);
/*
* Destroy all marks in destroy_list, waits for SRCU period to finish before
* actually freeing marks.
*/
static void fsnotify_mark_destroy_workfn(struct work_struct *work)
{
struct fsnotify_mark *mark, *next;
struct list_head private_destroy_list;
spin_lock(&destroy_lock);
/* exchange the list head */
list_replace_init(&destroy_list, &private_destroy_list);
spin_unlock(&destroy_lock);
synchronize_srcu(&fsnotify_mark_srcu);
list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
list_del_init(&mark->g_list);
fsnotify_final_mark_destroy(mark);
}
}
/* Wait for all marks queued for destruction to be actually destroyed */
void fsnotify_wait_marks_destroyed(void)
{
flush_delayed_work(&reaper_work);
}
EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Directory notifications for Linux.
*
* Copyright (C) 2000,2001,2002 Stephen Rothwell
*
* Copyright (C) 2009 Eric Paris <Red Hat Inc>
* dnotify was largly rewritten to use the new fsnotify infrastructure
*/
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/dnotify.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>
int dir_notify_enable __read_mostly = 1;
static struct kmem_cache *dnotify_struct_cache __read_mostly;
static struct kmem_cache *dnotify_mark_cache __read_mostly;
static struct fsnotify_group *dnotify_group __read_mostly;
/*
* dnotify will attach one of these to each inode (i_fsnotify_marks) which
* is being watched by dnotify. If multiple userspace applications are watching
* the same directory with dnotify their information is chained in dn
*/
struct dnotify_mark {
struct fsnotify_mark fsn_mark;
struct dnotify_struct *dn;
};
/*
* When a process starts or stops watching an inode the set of events which
* dnotify cares about for that inode may change. This function runs the
* list of everything receiving dnotify events about this directory and calculates
* the set of all those events. After it updates what dnotify is interested in
* it calls the fsnotify function so it can update the set of all events relevant
* to this inode.
*/
static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
{
__u32 new_mask = 0;
struct dnotify_struct *dn;
struct dnotify_mark *dn_mark = container_of(fsn_mark,
struct dnotify_mark,
fsn_mark);
assert_spin_locked(&fsn_mark->lock);
for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next)
new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
if (fsn_mark->mask == new_mask)
return;
fsn_mark->mask = new_mask;
fsnotify_recalc_mask(fsn_mark->connector);
}
/*
* Mains fsnotify call where events are delivered to dnotify.
* Find the dnotify mark on the relevant inode, run the list of dnotify structs
* on that mark and determine which of them has expressed interest in receiving
* events of this type. When found send the correct process and signal and
* destroy the dnotify struct if it was not registered to receive multiple
* events.
*/
static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
struct inode *inode, struct inode *dir,
const struct qstr *name, u32 cookie)
{
struct dnotify_mark *dn_mark;
struct dnotify_struct *dn;
struct dnotify_struct **prev;
struct fown_struct *fown;
__u32 test_mask = mask & ~FS_EVENT_ON_CHILD;
/* not a dir, dnotify doesn't care */
if (!dir && !(mask & FS_ISDIR))
return 0;
dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
spin_lock(&inode_mark->lock);
prev = &dn_mark->dn;
while ((dn = *prev) != NULL) {
if ((dn->dn_mask & test_mask) == 0) {
prev = &dn->dn_next;
continue;
}
fown = &dn->dn_filp->f_owner;
send_sigio(fown, dn->dn_fd, POLL_MSG);
if (dn->dn_mask & FS_DN_MULTISHOT)
prev = &dn->dn_next;
else {
*prev = dn->dn_next;
kmem_cache_free(dnotify_struct_cache, dn);
dnotify_recalc_inode_mask(inode_mark);
}
}
spin_unlock(&inode_mark->lock);
return 0;
}
static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
{
struct dnotify_mark *dn_mark = container_of(fsn_mark,
struct dnotify_mark,
fsn_mark);
BUG_ON(dn_mark->dn);
kmem_cache_free(dnotify_mark_cache, dn_mark);
}
static const struct fsnotify_ops dnotify_fsnotify_ops = {
.handle_inode_event = dnotify_handle_event,
.free_mark = dnotify_free_mark,
};
/*
* Called every time a file is closed. Looks first for a dnotify mark on the
* inode. If one is found run all of the ->dn structures attached to that
* mark for one relevant to this process closing the file and remove that
* dnotify_struct. If that was the last dnotify_struct also remove the
* fsnotify_mark.
*/
void dnotify_flush(struct file *filp, fl_owner_t id)
{
struct fsnotify_mark *fsn_mark;
struct dnotify_mark *dn_mark;
struct dnotify_struct *dn;
struct dnotify_struct **prev;
struct inode *inode;
bool free = false;
inode = file_inode(filp);
if (!S_ISDIR(inode->i_mode))
return;
fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
if (!fsn_mark)
return;
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
mutex_lock(&dnotify_group->mark_mutex);
spin_lock(&fsn_mark->lock);
prev = &dn_mark->dn;
while ((dn = *prev) != NULL) {
if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { *prev = dn->dn_next;
kmem_cache_free(dnotify_struct_cache, dn);
dnotify_recalc_inode_mask(fsn_mark);
break;
}
prev = &dn->dn_next;
}
spin_unlock(&fsn_mark->lock);
/* nothing else could have found us thanks to the dnotify_groups
mark_mutex */
if (dn_mark->dn == NULL) {
fsnotify_detach_mark(fsn_mark);
free = true;
}
mutex_unlock(&dnotify_group->mark_mutex);
if (free)
fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
}
/* this conversion is done only at watch creation */
static __u32 convert_arg(unsigned long arg)
{
__u32 new_mask = FS_EVENT_ON_CHILD;
if (arg & DN_MULTISHOT)
new_mask |= FS_DN_MULTISHOT;
if (arg & DN_DELETE)
new_mask |= (FS_DELETE | FS_MOVED_FROM);
if (arg & DN_MODIFY)
new_mask |= FS_MODIFY;
if (arg & DN_ACCESS)
new_mask |= FS_ACCESS;
if (arg & DN_ATTRIB)
new_mask |= FS_ATTRIB;
if (arg & DN_RENAME)
new_mask |= FS_DN_RENAME;
if (arg & DN_CREATE)
new_mask |= (FS_CREATE | FS_MOVED_TO);
return new_mask;
}
/*
* If multiple processes watch the same inode with dnotify there is only one
* dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct
* onto that mark. This function either attaches the new dnotify_struct onto
* that list, or it |= the mask onto an existing dnofiy_struct.
*/
static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
fl_owner_t id, int fd, struct file *filp, __u32 mask)
{
struct dnotify_struct *odn;
odn = dn_mark->dn;
while (odn != NULL) {
/* adding more events to existing dnofiy_struct? */
if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
odn->dn_fd = fd;
odn->dn_mask |= mask;
return -EEXIST;
}
odn = odn->dn_next;
}
dn->dn_mask = mask;
dn->dn_fd = fd;
dn->dn_filp = filp;
dn->dn_owner = id;
dn->dn_next = dn_mark->dn;
dn_mark->dn = dn;
return 0;
}
/*
* When a process calls fcntl to attach a dnotify watch to a directory it ends
* up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be
* attached to the fsnotify_mark.
*/
int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
{
struct dnotify_mark *new_dn_mark, *dn_mark;
struct fsnotify_mark *new_fsn_mark, *fsn_mark;
struct dnotify_struct *dn;
struct inode *inode;
fl_owner_t id = current->files;
struct file *f;
int destroy = 0, error = 0;
__u32 mask;
/* we use these to tell if we need to kfree */
new_fsn_mark = NULL;
dn = NULL;
if (!dir_notify_enable) {
error = -EINVAL;
goto out_err;
}
/* a 0 mask means we are explicitly removing the watch */
if ((arg & ~DN_MULTISHOT) == 0) {
dnotify_flush(filp, id);
error = 0;
goto out_err;
}
/* dnotify only works on directories */
inode = file_inode(filp);
if (!S_ISDIR(inode->i_mode)) {
error = -ENOTDIR;
goto out_err;
}
/*
* convert the userspace DN_* "arg" to the internal FS_*
* defined in fsnotify
*/
mask = convert_arg(arg);
error = security_path_notify(&filp->f_path, mask,
FSNOTIFY_OBJ_TYPE_INODE);
if (error)
goto out_err;
/* expect most fcntl to add new rather than augment old */
dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
if (!dn) {
error = -ENOMEM;
goto out_err;
}
/* new fsnotify mark, we expect most fcntl calls to add a new mark */
new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
if (!new_dn_mark) {
error = -ENOMEM;
goto out_err;
}
/* set up the new_fsn_mark and new_dn_mark */
new_fsn_mark = &new_dn_mark->fsn_mark;
fsnotify_init_mark(new_fsn_mark, dnotify_group);
new_fsn_mark->mask = mask;
new_dn_mark->dn = NULL;
/* this is needed to prevent the fcntl/close race described below */
mutex_lock(&dnotify_group->mark_mutex);
/* add the new_fsn_mark or find an old one. */
fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
if (fsn_mark) {
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
spin_lock(&fsn_mark->lock);
} else {
error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0);
if (error) {
mutex_unlock(&dnotify_group->mark_mutex);
goto out_err;
}
spin_lock(&new_fsn_mark->lock);
fsn_mark = new_fsn_mark;
dn_mark = new_dn_mark;
/* we used new_fsn_mark, so don't free it */
new_fsn_mark = NULL;
}
rcu_read_lock();
f = lookup_fd_rcu(fd);
rcu_read_unlock();
/* if (f != filp) means that we lost a race and another task/thread
* actually closed the fd we are still playing with before we grabbed
* the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the
* fd is the only time we clean up the marks we need to get our mark
* off the list. */
if (f != filp) {
/* if we added ourselves, shoot ourselves, it's possible that
* the flush actually did shoot this fsn_mark. That's fine too
* since multiple calls to destroy_mark is perfectly safe, if
* we found a dn_mark already attached to the inode, just sod
* off silently as the flush at close time dealt with it.
*/
if (dn_mark == new_dn_mark)
destroy = 1;
error = 0;
goto out;
}
__f_setown(filp, task_pid(current), PIDTYPE_TGID, 0);
error = attach_dn(dn, dn_mark, id, fd, filp, mask);
/* !error means that we attached the dn to the dn_mark, so don't free it */
if (!error)
dn = NULL;
/* -EEXIST means that we didn't add this new dn and used an old one.
* that isn't an error (and the unused dn should be freed) */
else if (error == -EEXIST)
error = 0;
dnotify_recalc_inode_mask(fsn_mark);
out:
spin_unlock(&fsn_mark->lock);
if (destroy)
fsnotify_detach_mark(fsn_mark);
mutex_unlock(&dnotify_group->mark_mutex);
if (destroy)
fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
out_err:
if (new_fsn_mark)
fsnotify_put_mark(new_fsn_mark);
if (dn)
kmem_cache_free(dnotify_struct_cache, dn);
return error;
}
static int __init dnotify_init(void)
{
dnotify_struct_cache = KMEM_CACHE(dnotify_struct,
SLAB_PANIC|SLAB_ACCOUNT);
dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);
dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
if (IS_ERR(dnotify_group))
panic("unable to allocate fsnotify group for dnotify\n");
return 0;
}
module_init(dnotify_init)
// SPDX-License-Identifier: GPL-2.0-or-later
/* bit search implementation
*
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* Copyright (C) 2008 IBM Corporation
* 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au>
* (Inspired by David Howell's find_next_bit implementation)
*
* Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
* size and improve performance, 2015.
*/
#include <linux/bitops.h>
#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/swab.h>
#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \
!defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \
!defined(find_next_and_bit)
/*
* This is a common helper function for find_next_bit, find_next_zero_bit, and
* find_next_and_bit. The differences are:
* - The "invert" argument, which is XORed with each fetched word before
* searching it for one bits.
* - The optional "addr2", which is anded with "addr1" if present.
*/
unsigned long _find_next_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long nbits,
unsigned long start, unsigned long invert, unsigned long le)
{
unsigned long tmp, mask;
if (unlikely(start >= nbits))
return nbits;
tmp = addr1[start / BITS_PER_LONG];
if (addr2)
tmp &= addr2[start / BITS_PER_LONG]; tmp ^= invert;
/* Handle 1st word. */
mask = BITMAP_FIRST_WORD_MASK(start);
if (le)
mask = swab(mask);
tmp &= mask;
start = round_down(start, BITS_PER_LONG);
while (!tmp) { start += BITS_PER_LONG;
if (start >= nbits)
return nbits;
tmp = addr1[start / BITS_PER_LONG];
if (addr2)
tmp &= addr2[start / BITS_PER_LONG]; tmp ^= invert;
}
if (le)
tmp = swab(tmp);
return min(start + __ffs(tmp), nbits);
}
EXPORT_SYMBOL(_find_next_bit);
#endif
#ifndef find_first_bit
/*
* Find the first set bit in a memory region.
*/
unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
{
unsigned long idx;
for (idx = 0; idx * BITS_PER_LONG < size; idx++) { if (addr[idx]) return min(idx * BITS_PER_LONG + __ffs(addr[idx]), size);
}
return size;
}
EXPORT_SYMBOL(_find_first_bit);
#endif
#ifndef find_first_zero_bit
/*
* Find the first cleared bit in a memory region.
*/
unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size)
{
unsigned long idx;
for (idx = 0; idx * BITS_PER_LONG < size; idx++) { if (addr[idx] != ~0UL) return min(idx * BITS_PER_LONG + ffz(addr[idx]), size);
}
return size;
}
EXPORT_SYMBOL(_find_first_zero_bit);
#endif
#ifndef find_last_bit
unsigned long _find_last_bit(const unsigned long *addr, unsigned long size)
{
if (size) { unsigned long val = BITMAP_LAST_WORD_MASK(size);
unsigned long idx = (size-1) / BITS_PER_LONG;
do {
val &= addr[idx];
if (val)
return idx * BITS_PER_LONG + __fls(val);
val = ~0ul;
} while (idx--);
}
return size;
}
EXPORT_SYMBOL(_find_last_bit);
#endif
unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr,
unsigned long size, unsigned long offset)
{
offset = find_next_bit(addr, size, offset);
if (offset == size)
return size;
offset = round_down(offset, 8);
*clump = bitmap_get_value8(addr, offset);
return offset;
}
EXPORT_SYMBOL(find_next_clump8);
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Red Hat. All rights reserved.
*/
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/rwsem.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/posix_acl_xattr.h>
#include <linux/iversion.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "xattr.h"
#include "disk-io.h"
#include "props.h"
#include "locking.h"
int btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size)
{
struct btrfs_dir_item *di;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
struct extent_buffer *leaf;
int ret = 0;
unsigned long data_ptr;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/* lookup the xattr by name */
di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, strlen(name), 0);
if (!di) {
ret = -ENODATA;
goto out;
} else if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
}
leaf = path->nodes[0];
/* if size is 0, that means we want the size of the attr */
if (!size) {
ret = btrfs_dir_data_len(leaf, di);
goto out;
}
/* now get the data out of our dir_item */
if (btrfs_dir_data_len(leaf, di) > size) {
ret = -ERANGE;
goto out;
}
/*
* The way things are packed into the leaf is like this
* |struct btrfs_dir_item|name|data|
* where name is the xattr name, so security.foo, and data is the
* content of the xattr. data_ptr points to the location in memory
* where the data starts in the in memory leaf
*/
data_ptr = (unsigned long)((char *)(di + 1) +
btrfs_dir_name_len(leaf, di));
read_extent_buffer(leaf, buffer, data_ptr,
btrfs_dir_data_len(leaf, di));
ret = btrfs_dir_data_len(leaf, di);
out:
btrfs_free_path(path); return ret;
}
int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
const char *name, const void *value, size_t size, int flags)
{
struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
size_t name_len = strlen(name);
int ret = 0;
ASSERT(trans);
if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info))
return -ENOSPC;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->skip_release_on_error = 1;
if (!value) {
di = btrfs_lookup_xattr(trans, root, path,
btrfs_ino(BTRFS_I(inode)), name, name_len, -1);
if (!di && (flags & XATTR_REPLACE))
ret = -ENODATA;
else if (IS_ERR(di))
ret = PTR_ERR(di);
else if (di)
ret = btrfs_delete_one_dir_name(trans, root, path, di);
goto out;
}
/*
* For a replace we can't just do the insert blindly.
* Do a lookup first (read-only btrfs_search_slot), and return if xattr
* doesn't exist. If it exists, fall down below to the insert/replace
* path - we can't race with a concurrent xattr delete, because the VFS
* locks the inode's i_mutex before calling setxattr or removexattr.
*/
if (flags & XATTR_REPLACE) {
ASSERT(inode_is_locked(inode));
di = btrfs_lookup_xattr(NULL, root, path,
btrfs_ino(BTRFS_I(inode)), name, name_len, 0);
if (!di)
ret = -ENODATA;
else if (IS_ERR(di))
ret = PTR_ERR(di);
if (ret)
goto out;
btrfs_release_path(path);
di = NULL;
}
ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(BTRFS_I(inode)),
name, name_len, value, size);
if (ret == -EOVERFLOW) {
/*
* We have an existing item in a leaf, split_leaf couldn't
* expand it. That item might have or not a dir_item that
* matches our target xattr, so lets check.
*/
ret = 0;
btrfs_assert_tree_locked(path->nodes[0]);
di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) {
ret = -ENOSPC;
goto out;
}
} else if (ret == -EEXIST) {
ret = 0;
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
ASSERT(di); /* logic error */
} else if (ret) {
goto out;
}
if (di && (flags & XATTR_CREATE)) {
ret = -EEXIST;
goto out;
}
if (di) {
/*
* We're doing a replace, and it must be atomic, that is, at
* any point in time we have either the old or the new xattr
* value in the tree. We don't want readers (getxattr and
* listxattrs) to miss a value, this is specially important
* for ACLs.
*/
const int slot = path->slots[0];
struct extent_buffer *leaf = path->nodes[0];
const u16 old_data_len = btrfs_dir_data_len(leaf, di);
const u32 item_size = btrfs_item_size_nr(leaf, slot);
const u32 data_size = sizeof(*di) + name_len + size;
struct btrfs_item *item;
unsigned long data_ptr;
char *ptr;
if (size > old_data_len) {
if (btrfs_leaf_free_space(leaf) <
(size - old_data_len)) {
ret = -ENOSPC;
goto out;
}
}
if (old_data_len + name_len + sizeof(*di) == item_size) {
/* No other xattrs packed in the same leaf item. */
if (size > old_data_len)
btrfs_extend_item(path, size - old_data_len); else if (size < old_data_len) btrfs_truncate_item(path, data_size, 1);
} else {
/* There are other xattrs packed in the same item. */
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret)
goto out;
btrfs_extend_item(path, data_size);
}
item = btrfs_item_nr(slot);
ptr = btrfs_item_ptr(leaf, slot, char);
ptr += btrfs_item_size(leaf, item) - data_size;
di = (struct btrfs_dir_item *)ptr;
btrfs_set_dir_data_len(leaf, di, size);
data_ptr = ((unsigned long)(di + 1)) + name_len;
write_extent_buffer(leaf, value, data_ptr, size);
btrfs_mark_buffer_dirty(leaf);
} else {
/*
* Insert, and we had space for the xattr, so path->slots[0] is
* where our xattr dir_item is and btrfs_insert_xattr_item()
* filled it.
*/
}
out:
btrfs_free_path(path);
if (!ret) {
set_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags);
clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags);
}
return ret;
}
/*
* @value: "" makes the attribute to empty, NULL removes it
*/
int btrfs_setxattr_trans(struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
const bool start_trans = (current->journal_info == NULL);
int ret;
if (start_trans) {
/*
* 1 unit for inserting/updating/deleting the xattr
* 1 unit for the inode item update
*/
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans))
return PTR_ERR(trans);
} else {
/*
* This can happen when smack is enabled and a directory is being
* created. It happens through d_instantiate_new(), which calls
* smack_d_instantiate(), which in turn calls __vfs_setxattr() to
* set the transmute xattr (XATTR_NAME_SMACKTRANSMUTE) on the
* inode. We have already reserved space for the xattr and inode
* update at btrfs_mkdir(), so just use the transaction handle.
* We don't join or start a transaction, as that will reset the
* block_rsv of the handle and trigger a warning for the start
* case.
*/
ASSERT(strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN) == 0);
trans = current->journal_info;
}
ret = btrfs_setxattr(trans, inode, name, value, size, flags); if (ret)
goto out;
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
BUG_ON(ret);
out:
if (start_trans) btrfs_end_transaction(trans);
return ret;
}
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
struct btrfs_key key;
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
int ret = 0;
size_t total_size = 0, size_left = size;
/*
* ok we want all objects associated with this id.
* NOTE: we set key.offset = 0; because we want to start with the
* first xattr that we find and walk forward
*/
key.objectid = btrfs_ino(BTRFS_I(inode));
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = READA_FORWARD;
/* search for our xattrs */
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto err;
while (1) {
struct extent_buffer *leaf;
int slot;
struct btrfs_dir_item *di;
struct btrfs_key found_key;
u32 item_size;
u32 cur;
leaf = path->nodes[0];
slot = path->slots[0];
/* this is where we start walking through the path */
if (slot >= btrfs_header_nritems(leaf)) {
/*
* if we've reached the last slot in this leaf we need
* to go to the next leaf and reset everything
*/
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto err; else if (ret > 0)
break;
continue;
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
break;
if (found_key.type < BTRFS_XATTR_ITEM_KEY)
goto next_item;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
item_size = btrfs_item_size_nr(leaf, slot);
cur = 0;
while (cur < item_size) {
u16 name_len = btrfs_dir_name_len(leaf, di);
u16 data_len = btrfs_dir_data_len(leaf, di);
u32 this_len = sizeof(*di) + name_len + data_len;
unsigned long name_ptr = (unsigned long)(di + 1);
total_size += name_len + 1;
/*
* We are just looking for how big our buffer needs to
* be.
*/
if (!size)
goto next;
if (!buffer || (name_len + 1) > size_left) {
ret = -ERANGE;
goto err;
}
read_extent_buffer(leaf, buffer, name_ptr, name_len);
buffer[name_len] = '\0';
size_left -= name_len + 1;
buffer += name_len + 1;
next:
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
next_item:
path->slots[0]++;
}
ret = total_size;
err:
btrfs_free_path(path); return ret;
}
static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
const char *name, void *buffer, size_t size)
{
name = xattr_full_name(handler, name);
return btrfs_getxattr(inode, name, buffer, size);
}
static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *buffer,
size_t size, int flags)
{
name = xattr_full_name(handler, name);
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
{
int ret;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
name = xattr_full_name(handler, name);
ret = btrfs_validate_prop(name, value, size);
if (ret)
return ret;
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_set_prop(trans, inode, name, value, size, flags);
if (!ret) {
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
BUG_ON(ret);
}
btrfs_end_transaction(trans);
return ret;
}
static const struct xattr_handler btrfs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = btrfs_xattr_handler_get,
.set = btrfs_xattr_handler_set,
};
static const struct xattr_handler btrfs_trusted_xattr_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.get = btrfs_xattr_handler_get,
.set = btrfs_xattr_handler_set,
};
static const struct xattr_handler btrfs_user_xattr_handler = {
.prefix = XATTR_USER_PREFIX,
.get = btrfs_xattr_handler_get,
.set = btrfs_xattr_handler_set,
};
static const struct xattr_handler btrfs_btrfs_xattr_handler = {
.prefix = XATTR_BTRFS_PREFIX,
.get = btrfs_xattr_handler_get,
.set = btrfs_xattr_handler_set_prop,
};
const struct xattr_handler *btrfs_xattr_handlers[] = {
&btrfs_security_xattr_handler,
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
#endif
&btrfs_trusted_xattr_handler,
&btrfs_user_xattr_handler,
&btrfs_btrfs_xattr_handler,
NULL,
};
static int btrfs_initxattrs(struct inode *inode,
const struct xattr *xattr_array, void *fs_private)
{
struct btrfs_trans_handle *trans = fs_private;
const struct xattr *xattr;
unsigned int nofs_flag;
char *name;
int err = 0;
/*
* We're holding a transaction handle, so use a NOFS memory allocation
* context to avoid deadlock if reclaim happens.
*/
nofs_flag = memalloc_nofs_save();
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
strlen(xattr->name) + 1, GFP_KERNEL);
if (!name) {
err = -ENOMEM;
break;
}
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
err = btrfs_setxattr(trans, inode, name, xattr->value,
xattr->value_len, 0);
kfree(name);
if (err < 0)
break;
}
memalloc_nofs_restore(nofs_flag);
return err;
}
int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr)
{
return security_inode_init_security(inode, dir, qstr,
&btrfs_initxattrs, trans);
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H
#include <linux/huge_mm.h>
#include <linux/swap.h>
/**
* page_is_file_lru - should the page be on a file LRU or anon LRU?
* @page: the page to test
*
* Returns 1 if @page is a regular filesystem backed page cache page or a lazily
* freed anonymous page (e.g. via MADV_FREE). Returns 0 if @page is a normal
* anonymous page, a tmpfs page or otherwise ram or swap backed page. Used by
* functions that manipulate the LRU lists, to sort a page onto the right LRU
* list.
*
* We would like to get this info without a page flag, but the state
* needs to survive until the page is last deleted from the LRU, which
* could be as far down as __page_cache_release.
*/
static inline int page_is_file_lru(struct page *page)
{
return !PageSwapBacked(page);
}
static __always_inline void update_lru_size(struct lruvec *lruvec,
enum lru_list lru, enum zone_type zid,
int nr_pages)
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
#ifdef CONFIG_MEMCG
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}
/**
* __clear_page_lru_flags - clear page lru flags before releasing a page
* @page: the page that was on lru and now has a zero reference
*/
static __always_inline void __clear_page_lru_flags(struct page *page)
{
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
/* this shouldn't happen, so leave the flags to bad_page() */
if (PageActive(page) && PageUnevictable(page))
return;
__ClearPageActive(page);
__ClearPageUnevictable(page);
}
/**
* page_lru - which LRU list should a page be on?
* @page: the page to test
*
* Returns the LRU list a page should be on, as an index
* into the array of LRU lists.
*/
static __always_inline enum lru_list page_lru(struct page *page)
{
enum lru_list lru;
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
if (PageUnevictable(page))
return LRU_UNEVICTABLE;
lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
if (PageActive(page)) lru += LRU_ACTIVE;
return lru;
}
static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec)
{
enum lru_list lru = page_lru(page);
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add(&page->lru, &lruvec->lists[lru]);
}
static __always_inline void add_page_to_lru_list_tail(struct page *page,
struct lruvec *lruvec)
{
enum lru_list lru = page_lru(page);
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add_tail(&page->lru, &lruvec->lists[lru]);
}
static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec)
{
list_del(&page->lru);
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-thp_nr_pages(page));
}
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KERNEL_H
#define _LINUX_KERNEL_H
#include <linux/stdarg.h>
#include <linux/align.h>
#include <linux/limits.h>
#include <linux/linkage.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/bitops.h>
#include <linux/kstrtox.h>
#include <linux/log2.h>
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/typecheck.h>
#include <linux/panic.h>
#include <linux/printk.h>
#include <linux/build_bug.h>
#include <linux/static_call_types.h>
#include <asm/byteorder.h>
#include <uapi/linux/kernel.h>
#define STACK_MAGIC 0xdeadbeef
/**
* REPEAT_BYTE - repeat the value @x multiple times as an unsigned long value
* @x: value to repeat
*
* NOTE: @x is not checked for > 0xff; larger values produce odd results.
*/
#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x))
/* generic data direction definitions */
#define READ 0
#define WRITE 1
/**
* ARRAY_SIZE - get the number of elements in array @arr
* @arr: array to be sized
*/
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
#define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL)
#define u64_to_user_ptr(x) ( \
{ \
typecheck(u64, (x)); \
(void __user *)(uintptr_t)(x); \
} \
)
#define typeof_member(T, m) typeof(((T*)0)->m)
#define _RET_IP_ (unsigned long)__builtin_return_address(0)
#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })
/**
* upper_32_bits - return bits 32-63 of a number
* @n: the number we're accessing
*
* A basic shift-right of a 64- or 32-bit quantity. Use this to suppress
* the "right shift count >= width of type" warning when that quantity is
* 32-bits.
*/
#define upper_32_bits(n) ((u32)(((n) >> 16) >> 16))
/**
* lower_32_bits - return bits 0-31 of a number
* @n: the number we're accessing
*/
#define lower_32_bits(n) ((u32)((n) & 0xffffffff))
/**
* upper_16_bits - return bits 16-31 of a number
* @n: the number we're accessing
*/
#define upper_16_bits(n) ((u16)((n) >> 16))
/**
* lower_16_bits - return bits 0-15 of a number
* @n: the number we're accessing
*/
#define lower_16_bits(n) ((u16)((n) & 0xffff))
struct completion;
struct user;
#ifdef CONFIG_PREEMPT_VOLUNTARY
extern int __cond_resched(void);
# define might_resched() __cond_resched()
#elif defined(CONFIG_PREEMPT_DYNAMIC)
extern int __cond_resched(void);
DECLARE_STATIC_CALL(might_resched, __cond_resched);
static __always_inline void might_resched(void)
{
static_call_mod(might_resched)();
}
#else
# define might_resched() do { } while (0)
#endif /* CONFIG_PREEMPT_* */
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
extern void ___might_sleep(const char *file, int line, int preempt_offset);
extern void __might_sleep(const char *file, int line, int preempt_offset);
extern void __cant_sleep(const char *file, int line, int preempt_offset);
extern void __cant_migrate(const char *file, int line);
/**
* might_sleep - annotation for functions that can sleep
*
* this macro will print a stack trace if it is executed in an atomic
* context (spinlock, irq-handler, ...). Additional sections where blocking is
* not allowed can be annotated with non_block_start() and non_block_end()
* pairs.
*
* This is a useful debugging help to be able to catch problems early and not
* be bitten later when the calling function happens to sleep when it is not
* supposed to.
*/
# define might_sleep() \
do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
/**
* cant_sleep - annotation for functions that cannot sleep
*
* this macro will print a stack trace if it is executed with preemption enabled
*/
# define cant_sleep() \
do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
# define sched_annotate_sleep() (current->task_state_change = 0)
/**
* cant_migrate - annotation for functions that cannot migrate
*
* Will print a stack trace if executed in code which is migratable
*/
# define cant_migrate() \
do { \
if (IS_ENABLED(CONFIG_SMP)) \
__cant_migrate(__FILE__, __LINE__); \
} while (0)
/**
* non_block_start - annotate the start of section where sleeping is prohibited
*
* This is on behalf of the oom reaper, specifically when it is calling the mmu
* notifiers. The problem is that if the notifier were to block on, for example,
* mutex_lock() and if the process which holds that mutex were to perform a
* sleeping memory allocation, the oom reaper is now blocked on completion of
* that memory allocation. Other blocking calls like wait_event() pose similar
* issues.
*/
# define non_block_start() (current->non_block_count++)
/**
* non_block_end - annotate the end of section where sleeping is prohibited
*
* Closes a section opened by non_block_start().
*/
# define non_block_end() WARN_ON(current->non_block_count-- == 0)
#else
static inline void ___might_sleep(const char *file, int line,
int preempt_offset) { }
static inline void __might_sleep(const char *file, int line,
int preempt_offset) { }
# define might_sleep() do { might_resched(); } while (0)
# define cant_sleep() do { } while (0)
# define cant_migrate() do { } while (0)
# define sched_annotate_sleep() do { } while (0)
# define non_block_start() do { } while (0)
# define non_block_end() do { } while (0)
#endif
#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
#if defined(CONFIG_MMU) && \
(defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP))
#define might_fault() __might_fault(__FILE__, __LINE__)
void __might_fault(const char *file, int line);
#else
static inline void might_fault(void) { }
#endif
void do_exit(long error_code) __noreturn;
void complete_and_exit(struct completion *, long) __noreturn;
extern int num_to_str(char *buf, int size,
unsigned long long num, unsigned int width);
/* lib/printf utilities */
extern __printf(2, 3) int sprintf(char *buf, const char * fmt, ...);
extern __printf(2, 0) int vsprintf(char *buf, const char *, va_list);
extern __printf(3, 4)
int snprintf(char *buf, size_t size, const char *fmt, ...);
extern __printf(3, 0)
int vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
extern __printf(3, 4)
int scnprintf(char *buf, size_t size, const char *fmt, ...);
extern __printf(3, 0)
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
extern __printf(2, 3) __malloc
char *kasprintf(gfp_t gfp, const char *fmt, ...);
extern __printf(2, 0) __malloc
char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
extern __printf(2, 0)
const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args);
extern __scanf(2, 3)
int sscanf(const char *, const char *, ...);
extern __scanf(2, 0)
int vsscanf(const char *, const char *, va_list);
extern int no_hash_pointers_enable(char *str);
extern int get_option(char **str, int *pint);
extern char *get_options(const char *str, int nints, int *ints);
extern unsigned long long memparse(const char *ptr, char **retptr);
extern bool parse_option_str(const char *str, const char *option);
extern char *next_arg(char *args, char **param, char **val);
extern int core_kernel_text(unsigned long addr);
extern int init_kernel_text(unsigned long addr);
extern int core_kernel_data(unsigned long addr);
extern int __kernel_text_address(unsigned long addr);
extern int kernel_text_address(unsigned long addr);
extern int func_ptr_is_kernel_text(void *ptr);
extern void bust_spinlocks(int yes);
extern int root_mountflags;
extern bool early_boot_irqs_disabled;
/*
* Values used for system_state. Ordering of the states must not be changed
* as code checks for <, <=, >, >= STATE.
*/
extern enum system_states {
SYSTEM_BOOTING,
SYSTEM_SCHEDULING,
SYSTEM_RUNNING,
SYSTEM_HALT,
SYSTEM_POWER_OFF,
SYSTEM_RESTART,
SYSTEM_SUSPEND,
} system_state;
extern const char hex_asc[];
#define hex_asc_lo(x) hex_asc[((x) & 0x0f)]
#define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4]
static inline char *hex_byte_pack(char *buf, u8 byte)
{
*buf++ = hex_asc_hi(byte);
*buf++ = hex_asc_lo(byte);
return buf;
}
extern const char hex_asc_upper[];
#define hex_asc_upper_lo(x) hex_asc_upper[((x) & 0x0f)]
#define hex_asc_upper_hi(x) hex_asc_upper[((x) & 0xf0) >> 4]
static inline char *hex_byte_pack_upper(char *buf, u8 byte)
{
*buf++ = hex_asc_upper_hi(byte);
*buf++ = hex_asc_upper_lo(byte);
return buf;
}
extern int hex_to_bin(unsigned char ch);
extern int __must_check hex2bin(u8 *dst, const char *src, size_t count);
extern char *bin2hex(char *dst, const void *src, size_t count);
bool mac_pton(const char *s, u8 *mac);
/*
* General tracing related utility functions - trace_printk(),
* tracing_on/tracing_off and tracing_start()/tracing_stop
*
* Use tracing_on/tracing_off when you want to quickly turn on or off
* tracing. It simply enables or disables the recording of the trace events.
* This also corresponds to the user space /sys/kernel/debug/tracing/tracing_on
* file, which gives a means for the kernel and userspace to interact.
* Place a tracing_off() in the kernel where you want tracing to end.
* From user space, examine the trace, and then echo 1 > tracing_on
* to continue tracing.
*
* tracing_stop/tracing_start has slightly more overhead. It is used
* by things like suspend to ram where disabling the recording of the
* trace is not enough, but tracing must actually stop because things
* like calling smp_processor_id() may crash the system.
*
* Most likely, you want to use tracing_on/tracing_off.
*/
enum ftrace_dump_mode {
DUMP_NONE,
DUMP_ALL,
DUMP_ORIG,
};
#ifdef CONFIG_TRACING
void tracing_on(void);
void tracing_off(void);
int tracing_is_on(void);
void tracing_snapshot(void);
void tracing_snapshot_alloc(void);
extern void tracing_start(void);
extern void tracing_stop(void);
static inline __printf(1, 2)
void ____trace_printk_check_format(const char *fmt, ...)
{
}
#define __trace_printk_check_format(fmt, args...) \
do { \
if (0) \
____trace_printk_check_format(fmt, ##args); \
} while (0)
/**
* trace_printk - printf formatting in the ftrace buffer
* @fmt: the printf format for printing
*
* Note: __trace_printk is an internal function for trace_printk() and
* the @ip is passed in via the trace_printk() macro.
*
* This function allows a kernel developer to debug fast path sections
* that printk is not appropriate for. By scattering in various
* printk like tracing in the code, a developer can quickly see
* where problems are occurring.
*
* This is intended as a debugging tool for the developer only.
* Please refrain from leaving trace_printks scattered around in
* your code. (Extra memory is used for special buffers that are
* allocated when trace_printk() is used.)
*
* A little optimization trick is done here. If there's only one
* argument, there's no need to scan the string for printf formats.
* The trace_puts() will suffice. But how can we take advantage of
* using trace_puts() when trace_printk() has only one argument?
* By stringifying the args and checking the size we can tell
* whether or not there are args. __stringify((__VA_ARGS__)) will
* turn into "()\0" with a size of 3 when there are no args, anything
* else will be bigger. All we need to do is define a string to this,
* and then take its size and compare to 3. If it's bigger, use
* do_trace_printk() otherwise, optimize it to trace_puts(). Then just
* let gcc optimize the rest.
*/
#define trace_printk(fmt, ...) \
do { \
char _______STR[] = __stringify((__VA_ARGS__)); \
if (sizeof(_______STR) > 3) \
do_trace_printk(fmt, ##__VA_ARGS__); \
else \
trace_puts(fmt); \
} while (0)
#define do_trace_printk(fmt, args...) \
do { \
static const char *trace_printk_fmt __used \
__section("__trace_printk_fmt") = \
__builtin_constant_p(fmt) ? fmt : NULL; \
\
__trace_printk_check_format(fmt, ##args); \
\
if (__builtin_constant_p(fmt)) \
__trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args); \
else \
__trace_printk(_THIS_IP_, fmt, ##args); \
} while (0)
extern __printf(2, 3)
int __trace_bprintk(unsigned long ip, const char *fmt, ...);
extern __printf(2, 3)
int __trace_printk(unsigned long ip, const char *fmt, ...);
/**
* trace_puts - write a string into the ftrace buffer
* @str: the string to record
*
* Note: __trace_bputs is an internal function for trace_puts and
* the @ip is passed in via the trace_puts macro.
*
* This is similar to trace_printk() but is made for those really fast
* paths that a developer wants the least amount of "Heisenbug" effects,
* where the processing of the print format is still too much.
*
* This function allows a kernel developer to debug fast path sections
* that printk is not appropriate for. By scattering in various
* printk like tracing in the code, a developer can quickly see
* where problems are occurring.
*
* This is intended as a debugging tool for the developer only.
* Please refrain from leaving trace_puts scattered around in
* your code. (Extra memory is used for special buffers that are
* allocated when trace_puts() is used.)
*
* Returns: 0 if nothing was written, positive # if string was.
* (1 when __trace_bputs is used, strlen(str) when __trace_puts is used)
*/
#define trace_puts(str) ({ \
static const char *trace_printk_fmt __used \
__section("__trace_printk_fmt") = \
__builtin_constant_p(str) ? str : NULL; \
\
if (__builtin_constant_p(str)) \
__trace_bputs(_THIS_IP_, trace_printk_fmt); \
else \
__trace_puts(_THIS_IP_, str, strlen(str)); \
})
extern int __trace_bputs(unsigned long ip, const char *str);
extern int __trace_puts(unsigned long ip, const char *str, int size);
extern void trace_dump_stack(int skip);
/*
* The double __builtin_constant_p is because gcc will give us an error
* if we try to allocate the static variable to fmt if it is not a
* constant. Even with the outer if statement.
*/
#define ftrace_vprintk(fmt, vargs) \
do { \
if (__builtin_constant_p(fmt)) { \
static const char *trace_printk_fmt __used \
__section("__trace_printk_fmt") = \
__builtin_constant_p(fmt) ? fmt : NULL; \
\
__ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \
} else \
__ftrace_vprintk(_THIS_IP_, fmt, vargs); \
} while (0)
extern __printf(2, 0) int
__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);
extern __printf(2, 0) int
__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
#else
static inline void tracing_start(void) { }
static inline void tracing_stop(void) { }
static inline void trace_dump_stack(int skip) { }
static inline void tracing_on(void) { }
static inline void tracing_off(void) { }
static inline int tracing_is_on(void) { return 0; }
static inline void tracing_snapshot(void) { }
static inline void tracing_snapshot_alloc(void) { }
static inline __printf(1, 2)
int trace_printk(const char *fmt, ...)
{
return 0;
}
static __printf(1, 0) inline int
ftrace_vprintk(const char *fmt, va_list ap)
{
return 0;
}
static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
#endif /* CONFIG_TRACING */
/* This counts to 12. Any more, it will return 13th argument. */
#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n
#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
#define __CONCAT(a, b) a ## b
#define CONCATENATE(a, b) __CONCAT(a, b)
/**
* container_of - cast a member of a structure out to the containing structure
* @ptr: the pointer to the member.
* @type: the type of the container struct this is embedded in.
* @member: the name of the member within the struct.
*
*/
#define container_of(ptr, type, member) ({ \
void *__mptr = (void *)(ptr); \
BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
!__same_type(*(ptr), void), \
"pointer type mismatch in container_of()"); \
((type *)(__mptr - offsetof(type, member))); })
/**
* container_of_safe - cast a member of a structure out to the containing structure
* @ptr: the pointer to the member.
* @type: the type of the container struct this is embedded in.
* @member: the name of the member within the struct.
*
* If IS_ERR_OR_NULL(ptr), ptr is returned unchanged.
*/
#define container_of_safe(ptr, type, member) ({ \
void *__mptr = (void *)(ptr); \
BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
!__same_type(*(ptr), void), \
"pointer type mismatch in container_of()"); \
IS_ERR_OR_NULL(__mptr) ? ERR_CAST(__mptr) : \
((type *)(__mptr - offsetof(type, member))); })
/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
#endif
/* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */
#define VERIFY_OCTAL_PERMISSIONS(perms) \
(BUILD_BUG_ON_ZERO((perms) < 0) + \
BUILD_BUG_ON_ZERO((perms) > 0777) + \
/* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */ \
BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) + \
BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) + \
/* USER_WRITABLE >= GROUP_WRITABLE */ \
BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) + \
/* OTHER_WRITABLE? Generally considered a bad idea. */ \
BUILD_BUG_ON_ZERO((perms) & 2) + \
(perms))
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
* Declarations for Reverse Mapping functions in mm/rmap.c
*/
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
/*
* The anon_vma heads a list of private "related" vmas, to scan if
* an anonymous page pointing to this anon_vma needs to be unmapped:
* the vmas on the list will be related by forking, or by splitting.
*
* Since vmas come and go as they are split and merged (particularly
* in mprotect), the mapping field of an anonymous page cannot point
* directly to a vma: instead it points to an anon_vma, on whose list
* the related vmas can be easily linked or unlinked.
*
* After unlinking the last vma on the list, we must garbage collect
* the anon_vma object itself: we're guaranteed no page can be
* pointing to this anon_vma once its vma list is empty.
*/
struct anon_vma {
struct anon_vma *root; /* Root of this anon_vma tree */
struct rw_semaphore rwsem; /* W: modification, R: walking the list */
/*
* The refcount is taken on an anon_vma when there is no
* guarantee that the vma of page tables will exist for
* the duration of the operation. A caller that takes
* the reference is responsible for clearing up the
* anon_vma if they are the last user on release
*/
atomic_t refcount;
/*
* Count of child anon_vmas and VMAs which points to this anon_vma.
*
* This counter is used for making decision about reusing anon_vma
* instead of forking new one. See comments in function anon_vma_clone.
*/
unsigned degree;
struct anon_vma *parent; /* Parent of this anon_vma */
/*
* NOTE: the LSB of the rb_root.rb_node is set by
* mm_take_all_locks() _after_ taking the above lock. So the
* rb_root must only be read/written after taking the above lock
* to be sure to see a valid next pointer. The LSB bit itself
* is serialized by a system wide lock only visible to
* mm_take_all_locks() (mm_all_locks_mutex).
*/
/* Interval tree of private "related" vmas */
struct rb_root_cached rb_root;
};
/*
* The copy-on-write semantics of fork mean that an anon_vma
* can become associated with multiple processes. Furthermore,
* each child process will have its own anon_vma, where new
* pages for that process are instantiated.
*
* This structure allows us to find the anon_vmas associated
* with a VMA, or the VMAs associated with an anon_vma.
* The "same_vma" list contains the anon_vma_chains linking
* all the anon_vmas associated with this VMA.
* The "rb" field indexes on an interval tree the anon_vma_chains
* which link all the VMAs associated with this anon_vma.
*/
struct anon_vma_chain {
struct vm_area_struct *vma;
struct anon_vma *anon_vma;
struct list_head same_vma; /* locked by mmap_lock & page_table_lock */
struct rb_node rb; /* locked by anon_vma->rwsem */
unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
unsigned long cached_vma_start, cached_vma_last;
#endif
};
enum ttu_flags {
TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */
TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */
TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */
TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */
TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible
* and caller guarantees they will
* do a final flush if necessary */
TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock:
* caller holds it */
};
#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
atomic_inc(&anon_vma->refcount);
}
void __put_anon_vma(struct anon_vma *anon_vma);
static inline void put_anon_vma(struct anon_vma *anon_vma)
{
if (atomic_dec_and_test(&anon_vma->refcount)) __put_anon_vma(anon_vma);
}
static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
down_write(&anon_vma->root->rwsem);
}
static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
up_write(&anon_vma->root->rwsem);
}
static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
down_read(&anon_vma->root->rwsem);
}
static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
up_read(&anon_vma->root->rwsem);
}
/*
* anon_vma helper functions.
*/
void anon_vma_init(void); /* create anon_vma_cachep */
int __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
if (likely(vma->anon_vma))
return 0;
return __anon_vma_prepare(vma);
}
static inline void anon_vma_merge(struct vm_area_struct *vma,
struct vm_area_struct *next)
{
VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
unlink_anon_vmas(next);
}
struct anon_vma *page_get_anon_vma(struct page *page);
/* bitflags for do_page_add_anon_rmap() */
#define RMAP_EXCLUSIVE 0x01
#define RMAP_COMPOUND 0x02
/*
* rmap interfaces called when adding or removing pte of page
*/
void page_move_anon_rmap(struct page *, struct vm_area_struct *);
void page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, bool);
void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, int);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, bool);
void page_add_file_rmap(struct page *, bool);
void page_remove_rmap(struct page *, bool);
void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
static inline void page_dup_rmap(struct page *page, bool compound)
{
atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
}
/*
* Called from mm/vmscan.c to handle paging out
*/
int page_referenced(struct page *, int is_locked,
struct mem_cgroup *memcg, unsigned long *vm_flags);
void try_to_migrate(struct page *page, enum ttu_flags flags);
void try_to_unmap(struct page *, enum ttu_flags flags);
int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct page **pages,
void *arg);
/* Avoid racy checks */
#define PVMW_SYNC (1 << 0)
/* Look for migarion entries rather than present PTEs */
#define PVMW_MIGRATION (1 << 1)
struct page_vma_mapped_walk {
struct page *page;
struct vm_area_struct *vma;
unsigned long address;
pmd_t *pmd;
pte_t *pte;
spinlock_t *ptl;
unsigned int flags;
};
static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
if (pvmw->pte && !PageHuge(pvmw->page))
pte_unmap(pvmw->pte);
if (pvmw->ptl)
spin_unlock(pvmw->ptl);
}
bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
/*
* Used by swapoff to help locate where page is expected in vma.
*/
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
/*
* Cleans the PTEs of shared mappings.
* (and since clean PTEs should also be readonly, write protects them too)
*
* returns the number of cleaned PTEs.
*/
int page_mkclean(struct page *);
/*
* called in munlock()/munmap() path to check for other vmas holding
* the page mlocked.
*/
void page_mlock(struct page *page);
void remove_migration_ptes(struct page *old, struct page *new, bool locked);
/*
* Called by memory-failure.c to kill processes.
*/
struct anon_vma *page_lock_anon_vma_read(struct page *page);
void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
/*
* rmap_walk_control: To control rmap traversing for specific needs
*
* arg: passed to rmap_one() and invalid_vma()
* rmap_one: executed on each vma where page is mapped
* done: for checking traversing termination condition
* anon_lock: for getting anon_lock by optimized way rather than default
* invalid_vma: for skipping uninterested vma
*/
struct rmap_walk_control {
void *arg;
/*
* Return false if page table scanning in rmap_walk should be stopped.
* Otherwise, return true.
*/
bool (*rmap_one)(struct page *page, struct vm_area_struct *vma,
unsigned long addr, void *arg);
int (*done)(struct page *page);
struct anon_vma *(*anon_lock)(struct page *page);
bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
void rmap_walk(struct page *page, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
#else /* !CONFIG_MMU */
#define anon_vma_init() do {} while (0)
#define anon_vma_prepare(vma) (0)
#define anon_vma_link(vma) do {} while (0)
static inline int page_referenced(struct page *page, int is_locked,
struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
*vm_flags = 0;
return 0;
}
static inline void try_to_unmap(struct page *page, enum ttu_flags flags)
{
}
static inline int page_mkclean(struct page *page)
{
return 0;
}
#endif /* CONFIG_MMU */
#endif /* _LINUX_RMAP_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 1999 Eric Youngdale
* Copyright (C) 2014 Christoph Hellwig
*
* SCSI queueing library.
* Initial versions: Eric Youngdale (eric@andante.org).
* Based upon conversations with large numbers
* of people at Linux Expo.
*/
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/completion.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/delay.h>
#include <linux/hardirq.h>
#include <linux/scatterlist.h>
#include <linux/blk-mq.h>
#include <linux/ratelimit.h>
#include <asm/unaligned.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_dbg.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_driver.h>
#include <scsi/scsi_eh.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_transport.h> /* __scsi_init_queue() */
#include <scsi/scsi_dh.h>
#include <trace/events/scsi.h>
#include "scsi_debugfs.h"
#include "scsi_priv.h"
#include "scsi_logging.h"
/*
* Size of integrity metadata is usually small, 1 inline sg should
* cover normal cases.
*/
#ifdef CONFIG_ARCH_NO_SG_CHAIN
#define SCSI_INLINE_PROT_SG_CNT 0
#define SCSI_INLINE_SG_CNT 0
#else
#define SCSI_INLINE_PROT_SG_CNT 1
#define SCSI_INLINE_SG_CNT 2
#endif
static struct kmem_cache *scsi_sense_cache;
static DEFINE_MUTEX(scsi_sense_cache_mutex);
static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd);
int scsi_init_sense_cache(struct Scsi_Host *shost)
{
int ret = 0;
mutex_lock(&scsi_sense_cache_mutex);
if (!scsi_sense_cache) {
scsi_sense_cache =
kmem_cache_create_usercopy("scsi_sense_cache",
SCSI_SENSE_BUFFERSIZE, 0, SLAB_HWCACHE_ALIGN,
0, SCSI_SENSE_BUFFERSIZE, NULL);
if (!scsi_sense_cache)
ret = -ENOMEM;
}
mutex_unlock(&scsi_sense_cache_mutex);
return ret;
}
/*
* When to reinvoke queueing after a resource shortage. It's 3 msecs to
* not change behaviour from the previous unplug mechanism, experimentation
* may prove this needs changing.
*/
#define SCSI_QUEUE_DELAY 3
static void
scsi_set_blocked(struct scsi_cmnd *cmd, int reason)
{
struct Scsi_Host *host = cmd->device->host;
struct scsi_device *device = cmd->device;
struct scsi_target *starget = scsi_target(device);
/*
* Set the appropriate busy bit for the device/host.
*
* If the host/device isn't busy, assume that something actually
* completed, and that we should be able to queue a command now.
*
* Note that the prior mid-layer assumption that any host could
* always queue at least one command is now broken. The mid-layer
* will implement a user specifiable stall (see
* scsi_host.max_host_blocked and scsi_device.max_device_blocked)
* if a command is requeued with no other commands outstanding
* either for the device or for the host.
*/
switch (reason) {
case SCSI_MLQUEUE_HOST_BUSY:
atomic_set(&host->host_blocked, host->max_host_blocked);
break;
case SCSI_MLQUEUE_DEVICE_BUSY:
case SCSI_MLQUEUE_EH_RETRY:
atomic_set(&device->device_blocked,
device->max_device_blocked);
break;
case SCSI_MLQUEUE_TARGET_BUSY:
atomic_set(&starget->target_blocked,
starget->max_target_blocked);
break;
}
}
static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd)
{
struct request *rq = scsi_cmd_to_rq(cmd);
if (rq->rq_flags & RQF_DONTPREP) {
rq->rq_flags &= ~RQF_DONTPREP;
scsi_mq_uninit_cmd(cmd);
} else {
WARN_ON_ONCE(true);
}
blk_mq_requeue_request(rq, true);
}
/**
* __scsi_queue_insert - private queue insertion
* @cmd: The SCSI command being requeued
* @reason: The reason for the requeue
* @unbusy: Whether the queue should be unbusied
*
* This is a private queue insertion. The public interface
* scsi_queue_insert() always assumes the queue should be unbusied
* because it's always called before the completion. This function is
* for a requeue after completion, which should only occur in this
* file.
*/
static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy)
{
struct scsi_device *device = cmd->device;
SCSI_LOG_MLQUEUE(1, scmd_printk(KERN_INFO, cmd,
"Inserting command %p into mlqueue\n", cmd));
scsi_set_blocked(cmd, reason);
/*
* Decrement the counters, since these commands are no longer
* active on the host/device.
*/
if (unbusy)
scsi_device_unbusy(device, cmd);
/*
* Requeue this command. It will go before all other commands
* that are already in the queue. Schedule requeue work under
* lock such that the kblockd_schedule_work() call happens
* before blk_cleanup_queue() finishes.
*/
cmd->result = 0;
blk_mq_requeue_request(scsi_cmd_to_rq(cmd), true);
}
/**
* scsi_queue_insert - Reinsert a command in the queue.
* @cmd: command that we are adding to queue.
* @reason: why we are inserting command to queue.
*
* We do this for one of two cases. Either the host is busy and it cannot accept
* any more commands for the time being, or the device returned QUEUE_FULL and
* can accept no more commands.
*
* Context: This could be called either from an interrupt context or a normal
* process context.
*/
void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
{
__scsi_queue_insert(cmd, reason, true);
}
/**
* __scsi_execute - insert request and wait for the result
* @sdev: scsi device
* @cmd: scsi command
* @data_direction: data direction
* @buffer: data buffer
* @bufflen: len of buffer
* @sense: optional sense buffer
* @sshdr: optional decoded sense header
* @timeout: request timeout in HZ
* @retries: number of times to retry request
* @flags: flags for ->cmd_flags
* @rq_flags: flags for ->rq_flags
* @resid: optional residual length
*
* Returns the scsi_cmnd result field if a command was executed, or a negative
* Linux error code if we didn't get that far.
*/
int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
int data_direction, void *buffer, unsigned bufflen,
unsigned char *sense, struct scsi_sense_hdr *sshdr,
int timeout, int retries, u64 flags, req_flags_t rq_flags,
int *resid)
{
struct request *req;
struct scsi_request *rq;
int ret;
req = blk_get_request(sdev->request_queue,
data_direction == DMA_TO_DEVICE ?
REQ_OP_DRV_OUT : REQ_OP_DRV_IN,
rq_flags & RQF_PM ? BLK_MQ_REQ_PM : 0);
if (IS_ERR(req))
return PTR_ERR(req);
rq = scsi_req(req);
if (bufflen) {
ret = blk_rq_map_kern(sdev->request_queue, req,
buffer, bufflen, GFP_NOIO);
if (ret)
goto out;
}
rq->cmd_len = COMMAND_SIZE(cmd[0]);
memcpy(rq->cmd, cmd, rq->cmd_len);
rq->retries = retries;
req->timeout = timeout;
req->cmd_flags |= flags;
req->rq_flags |= rq_flags | RQF_QUIET;
/*
* head injection *required* here otherwise quiesce won't work
*/
blk_execute_rq(NULL, req, 1);
/*
* Some devices (USB mass-storage in particular) may transfer
* garbage data together with a residue indicating that the data
* is invalid. Prevent the garbage from being misinterpreted
* and prevent security leaks by zeroing out the excess data.
*/
if (unlikely(rq->resid_len > 0 && rq->resid_len <= bufflen))
memset(buffer + (bufflen - rq->resid_len), 0, rq->resid_len);
if (resid)
*resid = rq->resid_len;
if (sense && rq->sense_len)
memcpy(sense, rq->sense, SCSI_SENSE_BUFFERSIZE);
if (sshdr)
scsi_normalize_sense(rq->sense, rq->sense_len, sshdr);
ret = rq->result;
out:
blk_put_request(req);
return ret;
}
EXPORT_SYMBOL(__scsi_execute);
/*
* Wake up the error handler if necessary. Avoid as follows that the error
* handler is not woken up if host in-flight requests number ==
* shost->host_failed: use call_rcu() in scsi_eh_scmd_add() in combination
* with an RCU read lock in this function to ensure that this function in
* its entirety either finishes before scsi_eh_scmd_add() increases the
* host_failed counter or that it notices the shost state change made by
* scsi_eh_scmd_add().
*/
static void scsi_dec_host_busy(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
{
unsigned long flags;
rcu_read_lock();
__clear_bit(SCMD_STATE_INFLIGHT, &cmd->state);
if (unlikely(scsi_host_in_recovery(shost))) {
spin_lock_irqsave(shost->host_lock, flags);
if (shost->host_failed || shost->host_eh_scheduled)
scsi_eh_wakeup(shost);
spin_unlock_irqrestore(shost->host_lock, flags);
}
rcu_read_unlock();
}
void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd)
{
struct Scsi_Host *shost = sdev->host;
struct scsi_target *starget = scsi_target(sdev);
scsi_dec_host_busy(shost, cmd);
if (starget->can_queue > 0)
atomic_dec(&starget->target_busy);
sbitmap_put(&sdev->budget_map, cmd->budget_token);
cmd->budget_token = -1;
}
static void scsi_kick_queue(struct request_queue *q)
{
blk_mq_run_hw_queues(q, false);
}
/*
* Called for single_lun devices on IO completion. Clear starget_sdev_user,
* and call blk_run_queue for all the scsi_devices on the target -
* including current_sdev first.
*
* Called with *no* scsi locks held.
*/
static void scsi_single_lun_run(struct scsi_device *current_sdev)
{
struct Scsi_Host *shost = current_sdev->host;
struct scsi_device *sdev, *tmp;
struct scsi_target *starget = scsi_target(current_sdev);
unsigned long flags;
spin_lock_irqsave(shost->host_lock, flags);
starget->starget_sdev_user = NULL;
spin_unlock_irqrestore(shost->host_lock, flags);
/*
* Call blk_run_queue for all LUNs on the target, starting with
* current_sdev. We race with others (to set starget_sdev_user),
* but in most cases, we will be first. Ideally, each LU on the
* target would get some limited time or requests on the target.
*/
scsi_kick_queue(current_sdev->request_queue);
spin_lock_irqsave(shost->host_lock, flags);
if (starget->starget_sdev_user)
goto out;
list_for_each_entry_safe(sdev, tmp, &starget->devices,
same_target_siblings) {
if (sdev == current_sdev)
continue;
if (scsi_device_get(sdev))
continue;
spin_unlock_irqrestore(shost->host_lock, flags);
scsi_kick_queue(sdev->request_queue);
spin_lock_irqsave(shost->host_lock, flags);
scsi_device_put(sdev);
}
out:
spin_unlock_irqrestore(shost->host_lock, flags);
}
static inline bool scsi_device_is_busy(struct scsi_device *sdev)
{
if (scsi_device_busy(sdev) >= sdev->queue_depth)
return true;
if (atomic_read(&sdev->device_blocked) > 0)
return true;
return false;
}
static inline bool scsi_target_is_busy(struct scsi_target *starget)
{
if (starget->can_queue > 0) {
if (atomic_read(&starget->target_busy) >= starget->can_queue)
return true;
if (atomic_read(&starget->target_blocked) > 0)
return true;
}
return false;
}
static inline bool scsi_host_is_busy(struct Scsi_Host *shost)
{
if (atomic_read(&shost->host_blocked) > 0)
return true;
if (shost->host_self_blocked)
return true;
return false;
}
static void scsi_starved_list_run(struct Scsi_Host *shost)
{
LIST_HEAD(starved_list);
struct scsi_device *sdev;
unsigned long flags;
spin_lock_irqsave(shost->host_lock, flags);
list_splice_init(&shost->starved_list, &starved_list);
while (!list_empty(&starved_list)) {
struct request_queue *slq;
/*
* As long as shost is accepting commands and we have
* starved queues, call blk_run_queue. scsi_request_fn
* drops the queue_lock and can add us back to the
* starved_list.
*
* host_lock protects the starved_list and starved_entry.
* scsi_request_fn must get the host_lock before checking
* or modifying starved_list or starved_entry.
*/
if (scsi_host_is_busy(shost))
break;
sdev = list_entry(starved_list.next,
struct scsi_device, starved_entry);
list_del_init(&sdev->starved_entry);
if (scsi_target_is_busy(scsi_target(sdev))) {
list_move_tail(&sdev->starved_entry,
&shost->starved_list);
continue;
}
/*
* Once we drop the host lock, a racing scsi_remove_device()
* call may remove the sdev from the starved list and destroy
* it and the queue. Mitigate by taking a reference to the
* queue and never touching the sdev again after we drop the
* host lock. Note: if __scsi_remove_device() invokes
* blk_cleanup_queue() before the queue is run from this
* function then blk_run_queue() will return immediately since
* blk_cleanup_queue() marks the queue with QUEUE_FLAG_DYING.
*/
slq = sdev->request_queue;
if (!blk_get_queue(slq))
continue;
spin_unlock_irqrestore(shost->host_lock, flags);
scsi_kick_queue(slq);
blk_put_queue(slq);
spin_lock_irqsave(shost->host_lock, flags);
}
/* put any unprocessed entries back */
list_splice(&starved_list, &shost->starved_list);
spin_unlock_irqrestore(shost->host_lock, flags);
}
/**
* scsi_run_queue - Select a proper request queue to serve next.
* @q: last request's queue
*
* The previous command was completely finished, start a new one if possible.
*/
static void scsi_run_queue(struct request_queue *q)
{
struct scsi_device *sdev = q->queuedata;
if (scsi_target(sdev)->single_lun)
scsi_single_lun_run(sdev);
if (!list_empty(&sdev->host->starved_list))
scsi_starved_list_run(sdev->host);
blk_mq_run_hw_queues(q, false);
}
void scsi_requeue_run_queue(struct work_struct *work)
{
struct scsi_device *sdev;
struct request_queue *q;
sdev = container_of(work, struct scsi_device, requeue_work);
q = sdev->request_queue;
scsi_run_queue(q);
}
void scsi_run_host_queues(struct Scsi_Host *shost)
{
struct scsi_device *sdev;
shost_for_each_device(sdev, shost)
scsi_run_queue(sdev->request_queue);
}
static void scsi_uninit_cmd(struct scsi_cmnd *cmd)
{
if (!blk_rq_is_passthrough(scsi_cmd_to_rq(cmd))) {
struct scsi_driver *drv = scsi_cmd_to_driver(cmd);
if (drv->uninit_command)
drv->uninit_command(cmd);
}
}
void scsi_free_sgtables(struct scsi_cmnd *cmd)
{
if (cmd->sdb.table.nents)
sg_free_table_chained(&cmd->sdb.table,
SCSI_INLINE_SG_CNT);
if (scsi_prot_sg_count(cmd))
sg_free_table_chained(&cmd->prot_sdb->table,
SCSI_INLINE_PROT_SG_CNT);
}
EXPORT_SYMBOL_GPL(scsi_free_sgtables);
static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd)
{
scsi_free_sgtables(cmd);
scsi_uninit_cmd(cmd);
}
static void scsi_run_queue_async(struct scsi_device *sdev)
{
if (scsi_target(sdev)->single_lun ||
!list_empty(&sdev->host->starved_list)) {
kblockd_schedule_work(&sdev->requeue_work);
} else {
/*
* smp_mb() present in sbitmap_queue_clear() or implied in
* .end_io is for ordering writing .device_busy in
* scsi_device_unbusy() and reading sdev->restarts.
*/
int old = atomic_read(&sdev->restarts);
/*
* ->restarts has to be kept as non-zero if new budget
* contention occurs.
*
* No need to run queue when either another re-run
* queue wins in updating ->restarts or a new budget
* contention occurs.
*/
if (old && atomic_cmpxchg(&sdev->restarts, old, 0) == old)
blk_mq_run_hw_queues(sdev->request_queue, true);
}
}
/* Returns false when no more bytes to process, true if there are more */
static bool scsi_end_request(struct request *req, blk_status_t error,
unsigned int bytes)
{
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
struct scsi_device *sdev = cmd->device;
struct request_queue *q = sdev->request_queue;
if (blk_update_request(req, error, bytes))
return true;
if (blk_queue_add_random(q))
add_disk_randomness(req->rq_disk);
if (!blk_rq_is_passthrough(req)) {
WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED));
cmd->flags &= ~SCMD_INITIALIZED;
}
/*
* Calling rcu_barrier() is not necessary here because the
* SCSI error handler guarantees that the function called by
* call_rcu() has been called before scsi_end_request() is
* called.
*/
destroy_rcu_head(&cmd->rcu);
/*
* In the MQ case the command gets freed by __blk_mq_end_request,
* so we have to do all cleanup that depends on it earlier.
*
* We also can't kick the queues from irq context, so we
* will have to defer it to a workqueue.
*/
scsi_mq_uninit_cmd(cmd);
/*
* queue is still alive, so grab the ref for preventing it
* from being cleaned up during running queue.
*/
percpu_ref_get(&q->q_usage_counter);
__blk_mq_end_request(req, error);
scsi_run_queue_async(sdev);
percpu_ref_put(&q->q_usage_counter);
return false;
}
/**
* scsi_result_to_blk_status - translate a SCSI result code into blk_status_t
* @cmd: SCSI command
* @result: scsi error code
*
* Translate a SCSI result code into a blk_status_t value. May reset the host
* byte of @cmd->result.
*/
static blk_status_t scsi_result_to_blk_status(struct scsi_cmnd *cmd, int result)
{
switch (host_byte(result)) {
case DID_OK:
if (scsi_status_is_good(result))
return BLK_STS_OK;
return BLK_STS_IOERR;
case DID_TRANSPORT_FAILFAST:
case DID_TRANSPORT_MARGINAL:
return BLK_STS_TRANSPORT;
case DID_TARGET_FAILURE:
set_host_byte(cmd, DID_OK);
return BLK_STS_TARGET;
case DID_NEXUS_FAILURE:
set_host_byte(cmd, DID_OK);
return BLK_STS_NEXUS;
case DID_ALLOC_FAILURE:
set_host_byte(cmd, DID_OK);
return BLK_STS_NOSPC;
case DID_MEDIUM_ERROR:
set_host_byte(cmd, DID_OK);
return BLK_STS_MEDIUM;
default:
return BLK_STS_IOERR;
}
}
/* Helper for scsi_io_completion() when "reprep" action required. */
static void scsi_io_completion_reprep(struct scsi_cmnd *cmd,
struct request_queue *q)
{
/* A new command will be prepared and issued. */
scsi_mq_requeue_cmd(cmd);
}
static bool scsi_cmd_runtime_exceeced(struct scsi_cmnd *cmd)
{
struct request *req = scsi_cmd_to_rq(cmd);
unsigned long wait_for;
if (cmd->allowed == SCSI_CMD_RETRIES_NO_LIMIT)
return false;
wait_for = (cmd->allowed + 1) * req->timeout;
if (time_before(cmd->jiffies_at_alloc + wait_for, jiffies)) {
scmd_printk(KERN_ERR, cmd, "timing out command, waited %lus\n",
wait_for/HZ);
return true;
}
return false;
}
/* Helper for scsi_io_completion() when special action required. */
static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result)
{
struct request_queue *q = cmd->device->request_queue;
struct request *req = scsi_cmd_to_rq(cmd);
int level = 0;
enum {ACTION_FAIL, ACTION_REPREP, ACTION_RETRY,
ACTION_DELAYED_RETRY} action;
struct scsi_sense_hdr sshdr;
bool sense_valid;
bool sense_current = true; /* false implies "deferred sense" */
blk_status_t blk_stat;
sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
if (sense_valid)
sense_current = !scsi_sense_is_deferred(&sshdr);
blk_stat = scsi_result_to_blk_status(cmd, result);
if (host_byte(result) == DID_RESET) {
/* Third party bus reset or reset for error recovery
* reasons. Just retry the command and see what
* happens.
*/
action = ACTION_RETRY;
} else if (sense_valid && sense_current) {
switch (sshdr.sense_key) {
case UNIT_ATTENTION:
if (cmd->device->removable) {
/* Detected disc change. Set a bit
* and quietly refuse further access.
*/
cmd->device->changed = 1;
action = ACTION_FAIL;
} else {
/* Must have been a power glitch, or a
* bus reset. Could not have been a
* media change, so we just retry the
* command and see what happens.
*/
action = ACTION_RETRY;
}
break;
case ILLEGAL_REQUEST:
/* If we had an ILLEGAL REQUEST returned, then
* we may have performed an unsupported
* command. The only thing this should be
* would be a ten byte read where only a six
* byte read was supported. Also, on a system
* where READ CAPACITY failed, we may have
* read past the end of the disk.
*/
if ((cmd->device->use_10_for_rw &&
sshdr.asc == 0x20 && sshdr.ascq == 0x00) &&
(cmd->cmnd[0] == READ_10 ||
cmd->cmnd[0] == WRITE_10)) {
/* This will issue a new 6-byte command. */
cmd->device->use_10_for_rw = 0;
action = ACTION_REPREP;
} else if (sshdr.asc == 0x10) /* DIX */ {
action = ACTION_FAIL;
blk_stat = BLK_STS_PROTECTION;
/* INVALID COMMAND OPCODE or INVALID FIELD IN CDB */
} else if (sshdr.asc == 0x20 || sshdr.asc == 0x24) {
action = ACTION_FAIL;
blk_stat = BLK_STS_TARGET;
} else
action = ACTION_FAIL;
break;
case ABORTED_COMMAND:
action = ACTION_FAIL;
if (sshdr.asc == 0x10) /* DIF */
blk_stat = BLK_STS_PROTECTION;
break;
case NOT_READY:
/* If the device is in the process of becoming
* ready, or has a temporary blockage, retry.
*/
if (sshdr.asc == 0x04) {
switch (sshdr.ascq) {
case 0x01: /* becoming ready */
case 0x04: /* format in progress */
case 0x05: /* rebuild in progress */
case 0x06: /* recalculation in progress */
case 0x07: /* operation in progress */
case 0x08: /* Long write in progress */
case 0x09: /* self test in progress */
case 0x11: /* notify (enable spinup) required */
case 0x14: /* space allocation in progress */
case 0x1a: /* start stop unit in progress */
case 0x1b: /* sanitize in progress */
case 0x1d: /* configuration in progress */
case 0x24: /* depopulation in progress */
action = ACTION_DELAYED_RETRY;
break;
case 0x0a: /* ALUA state transition */
blk_stat = BLK_STS_AGAIN;
fallthrough;
default:
action = ACTION_FAIL;
break;
}
} else
action = ACTION_FAIL;
break;
case VOLUME_OVERFLOW:
/* See SSC3rXX or current. */
action = ACTION_FAIL;
break;
case DATA_PROTECT:
action = ACTION_FAIL;
if ((sshdr.asc == 0x0C && sshdr.ascq == 0x12) ||
(sshdr.asc == 0x55 &&
(sshdr.ascq == 0x0E || sshdr.ascq == 0x0F))) {
/* Insufficient zone resources */
blk_stat = BLK_STS_ZONE_OPEN_RESOURCE;
}
break;
default:
action = ACTION_FAIL;
break;
}
} else
action = ACTION_FAIL;
if (action != ACTION_FAIL && scsi_cmd_runtime_exceeced(cmd))
action = ACTION_FAIL;
switch (action) {
case ACTION_FAIL:
/* Give up and fail the remainder of the request */
if (!(req->rq_flags & RQF_QUIET)) {
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
if (unlikely(scsi_logging_level))
level =
SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT,
SCSI_LOG_MLCOMPLETE_BITS);
/*
* if logging is enabled the failure will be printed
* in scsi_log_completion(), so avoid duplicate messages
*/
if (!level && __ratelimit(&_rs)) {
scsi_print_result(cmd, NULL, FAILED);
if (sense_valid)
scsi_print_sense(cmd);
scsi_print_command(cmd);
}
}
if (!scsi_end_request(req, blk_stat, blk_rq_err_bytes(req)))
return;
fallthrough;
case ACTION_REPREP:
scsi_io_completion_reprep(cmd, q);
break;
case ACTION_RETRY:
/* Retry the same command immediately */
__scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY, false);
break;
case ACTION_DELAYED_RETRY:
/* Retry the same command after a delay */
__scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY, false);
break;
}
}
/*
* Helper for scsi_io_completion() when cmd->result is non-zero. Returns a
* new result that may suppress further error checking. Also modifies
* *blk_statp in some cases.
*/
static int scsi_io_completion_nz_result(struct scsi_cmnd *cmd, int result,
blk_status_t *blk_statp)
{
bool sense_valid;
bool sense_current = true; /* false implies "deferred sense" */
struct request *req = scsi_cmd_to_rq(cmd);
struct scsi_sense_hdr sshdr;
sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
if (sense_valid)
sense_current = !scsi_sense_is_deferred(&sshdr);
if (blk_rq_is_passthrough(req)) {
if (sense_valid) {
/*
* SG_IO wants current and deferred errors
*/
scsi_req(req)->sense_len =
min(8 + cmd->sense_buffer[7],
SCSI_SENSE_BUFFERSIZE);
}
if (sense_current)
*blk_statp = scsi_result_to_blk_status(cmd, result);
} else if (blk_rq_bytes(req) == 0 && sense_current) {
/*
* Flush commands do not transfers any data, and thus cannot use
* good_bytes != blk_rq_bytes(req) as the signal for an error.
* This sets *blk_statp explicitly for the problem case.
*/
*blk_statp = scsi_result_to_blk_status(cmd, result);
}
/*
* Recovered errors need reporting, but they're always treated as
* success, so fiddle the result code here. For passthrough requests
* we already took a copy of the original into sreq->result which
* is what gets returned to the user
*/
if (sense_valid && (sshdr.sense_key == RECOVERED_ERROR)) {
bool do_print = true;
/*
* if ATA PASS-THROUGH INFORMATION AVAILABLE [0x0, 0x1d]
* skip print since caller wants ATA registers. Only occurs
* on SCSI ATA PASS_THROUGH commands when CK_COND=1
*/
if ((sshdr.asc == 0x0) && (sshdr.ascq == 0x1d))
do_print = false;
else if (req->rq_flags & RQF_QUIET)
do_print = false;
if (do_print)
scsi_print_sense(cmd);
result = 0;
/* for passthrough, *blk_statp may be set */
*blk_statp = BLK_STS_OK;
}
/*
* Another corner case: the SCSI status byte is non-zero but 'good'.
* Example: PRE-FETCH command returns SAM_STAT_CONDITION_MET when
* it is able to fit nominated LBs in its cache (and SAM_STAT_GOOD
* if it can't fit). Treat SAM_STAT_CONDITION_MET and the related
* intermediate statuses (both obsolete in SAM-4) as good.
*/
if ((result & 0xff) && scsi_status_is_good(result)) {
result = 0;
*blk_statp = BLK_STS_OK;
}
return result;
}
/**
* scsi_io_completion - Completion processing for SCSI commands.
* @cmd: command that is finished.
* @good_bytes: number of processed bytes.
*
* We will finish off the specified number of sectors. If we are done, the
* command block will be released and the queue function will be goosed. If we
* are not done then we have to figure out what to do next:
*
* a) We can call scsi_io_completion_reprep(). The request will be
* unprepared and put back on the queue. Then a new command will
* be created for it. This should be used if we made forward
* progress, or if we want to switch from READ(10) to READ(6) for
* example.
*
* b) We can call scsi_io_completion_action(). The request will be
* put back on the queue and retried using the same command as
* before, possibly after a delay.
*
* c) We can call scsi_end_request() with blk_stat other than
* BLK_STS_OK, to fail the remainder of the request.
*/
void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
{
int result = cmd->result;
struct request_queue *q = cmd->device->request_queue;
struct request *req = scsi_cmd_to_rq(cmd);
blk_status_t blk_stat = BLK_STS_OK;
if (unlikely(result)) /* a nz result may or may not be an error */
result = scsi_io_completion_nz_result(cmd, result, &blk_stat);
if (unlikely(blk_rq_is_passthrough(req))) {
/*
* scsi_result_to_blk_status may have reset the host_byte
*/
scsi_req(req)->result = cmd->result;
}
/*
* Next deal with any sectors which we were able to correctly
* handle.
*/
SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, cmd,
"%u sectors total, %d bytes done.\n",
blk_rq_sectors(req), good_bytes));
/*
* Failed, zero length commands always need to drop down
* to retry code. Fast path should return in this block.
*/
if (likely(blk_rq_bytes(req) > 0 || blk_stat == BLK_STS_OK)) {
if (likely(!scsi_end_request(req, blk_stat, good_bytes)))
return; /* no bytes remaining */
}
/* Kill remainder if no retries. */
if (unlikely(blk_stat && scsi_noretry_cmd(cmd))) {
if (scsi_end_request(req, blk_stat, blk_rq_bytes(req)))
WARN_ONCE(true,
"Bytes remaining after failed, no-retry command");
return;
}
/*
* If there had been no error, but we have leftover bytes in the
* requeues just queue the command up again.
*/
if (likely(result == 0))
scsi_io_completion_reprep(cmd, q);
else
scsi_io_completion_action(cmd, result);
}
static inline bool scsi_cmd_needs_dma_drain(struct scsi_device *sdev,
struct request *rq)
{
return sdev->dma_drain_len && blk_rq_is_passthrough(rq) &&
!op_is_write(req_op(rq)) &&
sdev->host->hostt->dma_need_drain(rq);
}
/**
* scsi_alloc_sgtables - Allocate and initialize data and integrity scatterlists
* @cmd: SCSI command data structure to initialize.
*
* Initializes @cmd->sdb and also @cmd->prot_sdb if data integrity is enabled
* for @cmd.
*
* Returns:
* * BLK_STS_OK - on success
* * BLK_STS_RESOURCE - if the failure is retryable
* * BLK_STS_IOERR - if the failure is fatal
*/
blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd)
{
struct scsi_device *sdev = cmd->device;
struct request *rq = scsi_cmd_to_rq(cmd);
unsigned short nr_segs = blk_rq_nr_phys_segments(rq);
struct scatterlist *last_sg = NULL;
blk_status_t ret;
bool need_drain = scsi_cmd_needs_dma_drain(sdev, rq);
int count;
if (WARN_ON_ONCE(!nr_segs))
return BLK_STS_IOERR;
/*
* Make sure there is space for the drain. The driver must adjust
* max_hw_segments to be prepared for this.
*/
if (need_drain) nr_segs++;
/*
* If sg table allocation fails, requeue request later.
*/
if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, nr_segs,
cmd->sdb.table.sgl, SCSI_INLINE_SG_CNT)))
return BLK_STS_RESOURCE;
/*
* Next, walk the list, and fill in the addresses and sizes of
* each segment.
*/
count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg);
if (blk_rq_bytes(rq) & rq->q->dma_pad_mask) {
unsigned int pad_len =
(rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
last_sg->length += pad_len;
cmd->extra_len += pad_len;
}
if (need_drain) { sg_unmark_end(last_sg);
last_sg = sg_next(last_sg);
sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len);
sg_mark_end(last_sg);
cmd->extra_len += sdev->dma_drain_len;
count++;
}
BUG_ON(count > cmd->sdb.table.nents); cmd->sdb.table.nents = count; cmd->sdb.length = blk_rq_payload_bytes(rq);
if (blk_integrity_rq(rq)) {
struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
int ivecs;
if (WARN_ON_ONCE(!prot_sdb)) {
/*
* This can happen if someone (e.g. multipath)
* queues a command to a device on an adapter
* that does not support DIX.
*/
ret = BLK_STS_IOERR;
goto out_free_sgtables;
}
ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
if (sg_alloc_table_chained(&prot_sdb->table, ivecs,
prot_sdb->table.sgl,
SCSI_INLINE_PROT_SG_CNT)) {
ret = BLK_STS_RESOURCE;
goto out_free_sgtables;
}
count = blk_rq_map_integrity_sg(rq->q, rq->bio,
prot_sdb->table.sgl);
BUG_ON(count > ivecs);
BUG_ON(count > queue_max_integrity_segments(rq->q));
cmd->prot_sdb = prot_sdb;
cmd->prot_sdb->table.nents = count;
}
return BLK_STS_OK;
out_free_sgtables:
scsi_free_sgtables(cmd);
return ret;
}
EXPORT_SYMBOL(scsi_alloc_sgtables);
/**
* scsi_initialize_rq - initialize struct scsi_cmnd partially
* @rq: Request associated with the SCSI command to be initialized.
*
* This function initializes the members of struct scsi_cmnd that must be
* initialized before request processing starts and that won't be
* reinitialized if a SCSI command is requeued.
*
* Called from inside blk_get_request() for pass-through requests and from
* inside scsi_init_command() for filesystem requests.
*/
static void scsi_initialize_rq(struct request *rq)
{
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
struct scsi_request *req = &cmd->req;
memset(req->__cmd, 0, sizeof(req->__cmd));
req->cmd = req->__cmd;
req->cmd_len = BLK_MAX_CDB;
req->sense_len = 0;
init_rcu_head(&cmd->rcu);
cmd->jiffies_at_alloc = jiffies;
cmd->retries = 0;
}
/*
* Only called when the request isn't completed by SCSI, and not freed by
* SCSI
*/
static void scsi_cleanup_rq(struct request *rq)
{
if (rq->rq_flags & RQF_DONTPREP) {
scsi_mq_uninit_cmd(blk_mq_rq_to_pdu(rq));
rq->rq_flags &= ~RQF_DONTPREP;
}
}
/* Called before a request is prepared. See also scsi_mq_prep_fn(). */
void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
{
void *buf = cmd->sense_buffer;
void *prot = cmd->prot_sdb;
struct request *rq = scsi_cmd_to_rq(cmd);
unsigned int flags = cmd->flags & SCMD_PRESERVED_FLAGS;
unsigned long jiffies_at_alloc;
int retries, to_clear;
bool in_flight;
int budget_token = cmd->budget_token;
if (!blk_rq_is_passthrough(rq) && !(flags & SCMD_INITIALIZED)) {
flags |= SCMD_INITIALIZED;
scsi_initialize_rq(rq);
}
jiffies_at_alloc = cmd->jiffies_at_alloc;
retries = cmd->retries;
in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state);
/*
* Zero out the cmd, except for the embedded scsi_request. Only clear
* the driver-private command data if the LLD does not supply a
* function to initialize that data.
*/
to_clear = sizeof(*cmd) - sizeof(cmd->req);
if (!dev->host->hostt->init_cmd_priv)
to_clear += dev->host->hostt->cmd_size; memset((char *)cmd + sizeof(cmd->req), 0, to_clear);
cmd->device = dev;
cmd->sense_buffer = buf;
cmd->prot_sdb = prot;
cmd->flags = flags;
INIT_LIST_HEAD(&cmd->eh_entry);
INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
cmd->jiffies_at_alloc = jiffies_at_alloc;
cmd->retries = retries;
if (in_flight)
__set_bit(SCMD_STATE_INFLIGHT, &cmd->state);
cmd->budget_token = budget_token;}
static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev,
struct request *req)
{
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
/*
* Passthrough requests may transfer data, in which case they must
* a bio attached to them. Or they might contain a SCSI command
* that does not transfer data, in which case they may optionally
* submit a request without an attached bio.
*/
if (req->bio) { blk_status_t ret = scsi_alloc_sgtables(cmd); if (unlikely(ret != BLK_STS_OK))
return ret;
} else {
BUG_ON(blk_rq_bytes(req)); memset(&cmd->sdb, 0, sizeof(cmd->sdb));
}
cmd->cmd_len = scsi_req(req)->cmd_len;
cmd->cmnd = scsi_req(req)->cmd;
cmd->transfersize = blk_rq_bytes(req);
cmd->allowed = scsi_req(req)->retries;
return BLK_STS_OK;
}
static blk_status_t
scsi_device_state_check(struct scsi_device *sdev, struct request *req)
{
switch (sdev->sdev_state) {
case SDEV_CREATED:
return BLK_STS_OK;
case SDEV_OFFLINE:
case SDEV_TRANSPORT_OFFLINE:
/*
* If the device is offline we refuse to process any
* commands. The device must be brought online
* before trying any recovery commands.
*/
if (!sdev->offline_already) { sdev->offline_already = true;
sdev_printk(KERN_ERR, sdev,
"rejecting I/O to offline device\n");
}
return BLK_STS_IOERR;
case SDEV_DEL:
/*
* If the device is fully deleted, we refuse to
* process any commands as well.
*/
sdev_printk(KERN_ERR, sdev,
"rejecting I/O to dead device\n");
return BLK_STS_IOERR;
case SDEV_BLOCK:
case SDEV_CREATED_BLOCK:
return BLK_STS_RESOURCE;
case SDEV_QUIESCE:
/*
* If the device is blocked we only accept power management
* commands.
*/
if (req && WARN_ON_ONCE(!(req->rq_flags & RQF_PM)))
return BLK_STS_RESOURCE;
return BLK_STS_OK;
default:
/*
* For any other not fully online state we only allow
* power management commands.
*/
if (req && !(req->rq_flags & RQF_PM))
return BLK_STS_IOERR;
return BLK_STS_OK;
}
}
/*
* scsi_dev_queue_ready: if we can send requests to sdev, assign one token
* and return the token else return -1.
*/
static inline int scsi_dev_queue_ready(struct request_queue *q,
struct scsi_device *sdev)
{
int token;
token = sbitmap_get(&sdev->budget_map);
if (atomic_read(&sdev->device_blocked)) {
if (token < 0)
goto out;
if (scsi_device_busy(sdev) > 1)
goto out_dec;
/*
* unblock after device_blocked iterates to zero
*/
if (atomic_dec_return(&sdev->device_blocked) > 0)
goto out_dec;
SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev,
"unblocking device at zero depth\n"));
}
return token;
out_dec:
if (token >= 0)
sbitmap_put(&sdev->budget_map, token);
out:
return -1;
}
/*
* scsi_target_queue_ready: checks if there we can send commands to target
* @sdev: scsi device on starget to check.
*/
static inline int scsi_target_queue_ready(struct Scsi_Host *shost,
struct scsi_device *sdev)
{
struct scsi_target *starget = scsi_target(sdev);
unsigned int busy;
if (starget->single_lun) {
spin_lock_irq(shost->host_lock); if (starget->starget_sdev_user &&
starget->starget_sdev_user != sdev) {
spin_unlock_irq(shost->host_lock);
return 0;
}
starget->starget_sdev_user = sdev;
spin_unlock_irq(shost->host_lock);
}
if (starget->can_queue <= 0)
return 1;
busy = atomic_inc_return(&starget->target_busy) - 1;
if (atomic_read(&starget->target_blocked) > 0) {
if (busy)
goto starved;
/*
* unblock after target_blocked iterates to zero
*/
if (atomic_dec_return(&starget->target_blocked) > 0)
goto out_dec;
SCSI_LOG_MLQUEUE(3, starget_printk(KERN_INFO, starget,
"unblocking target at zero depth\n"));
}
if (busy >= starget->can_queue)
goto starved;
return 1;
starved:
spin_lock_irq(shost->host_lock);
list_move_tail(&sdev->starved_entry, &shost->starved_list);
spin_unlock_irq(shost->host_lock);
out_dec:
if (starget->can_queue > 0)
atomic_dec(&starget->target_busy);
return 0;
}
/*
* scsi_host_queue_ready: if we can send requests to shost, return 1 else
* return 0. We must end up running the queue again whenever 0 is
* returned, else IO can hang.
*/
static inline int scsi_host_queue_ready(struct request_queue *q,
struct Scsi_Host *shost,
struct scsi_device *sdev,
struct scsi_cmnd *cmd)
{
if (scsi_host_in_recovery(shost))
return 0;
if (atomic_read(&shost->host_blocked) > 0) { if (scsi_host_busy(shost) > 0)
goto starved;
/*
* unblock after host_blocked iterates to zero
*/
if (atomic_dec_return(&shost->host_blocked) > 0)
goto out_dec;
SCSI_LOG_MLQUEUE(3,
shost_printk(KERN_INFO, shost,
"unblocking host at zero depth\n"));
}
if (shost->host_self_blocked)
goto starved;
/* We're OK to process the command, so we can't be starved */
if (!list_empty(&sdev->starved_entry)) {
spin_lock_irq(shost->host_lock);
if (!list_empty(&sdev->starved_entry))
list_del_init(&sdev->starved_entry);
spin_unlock_irq(shost->host_lock);
}
__set_bit(SCMD_STATE_INFLIGHT, &cmd->state);
return 1;
starved:
spin_lock_irq(shost->host_lock);
if (list_empty(&sdev->starved_entry))
list_add_tail(&sdev->starved_entry, &shost->starved_list); spin_unlock_irq(shost->host_lock);
out_dec:
scsi_dec_host_busy(shost, cmd);
return 0;
}
/*
* Busy state exporting function for request stacking drivers.
*
* For efficiency, no lock is taken to check the busy state of
* shost/starget/sdev, since the returned value is not guaranteed and
* may be changed after request stacking drivers call the function,
* regardless of taking lock or not.
*
* When scsi can't dispatch I/Os anymore and needs to kill I/Os scsi
* needs to return 'not busy'. Otherwise, request stacking drivers
* may hold requests forever.
*/
static bool scsi_mq_lld_busy(struct request_queue *q)
{
struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost;
if (blk_queue_dying(q))
return false;
shost = sdev->host;
/*
* Ignore host/starget busy state.
* Since block layer does not have a concept of fairness across
* multiple queues, congestion of host/starget needs to be handled
* in SCSI layer.
*/
if (scsi_host_in_recovery(shost) || scsi_device_is_busy(sdev))
return true;
return false;
}
/*
* Block layer request completion callback. May be called from interrupt
* context.
*/
static void scsi_complete(struct request *rq)
{
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
enum scsi_disposition disposition;
INIT_LIST_HEAD(&cmd->eh_entry);
atomic_inc(&cmd->device->iodone_cnt);
if (cmd->result)
atomic_inc(&cmd->device->ioerr_cnt);
disposition = scsi_decide_disposition(cmd);
if (disposition != SUCCESS && scsi_cmd_runtime_exceeced(cmd))
disposition = SUCCESS;
scsi_log_completion(cmd, disposition);
switch (disposition) {
case SUCCESS:
scsi_finish_command(cmd);
break;
case NEEDS_RETRY:
scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
break;
case ADD_TO_MLQUEUE:
scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
break;
default:
scsi_eh_scmd_add(cmd);
break;
}
}
/**
* scsi_dispatch_cmd - Dispatch a command to the low-level driver.
* @cmd: command block we are dispatching.
*
* Return: nonzero return request was rejected and device's queue needs to be
* plugged.
*/
static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
{
struct Scsi_Host *host = cmd->device->host;
int rtn = 0;
atomic_inc(&cmd->device->iorequest_cnt);
/* check if the device is still usable */
if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
/* in SDEV_DEL we error all commands. DID_NO_CONNECT
* returns an immediate error upwards, and signals
* that the device is no longer present */
cmd->result = DID_NO_CONNECT << 16;
goto done;
}
/* Check to see if the scsi lld made this device blocked. */
if (unlikely(scsi_device_blocked(cmd->device))) {
/*
* in blocked state, the command is just put back on
* the device queue. The suspend state has already
* blocked the queue so future requests should not
* occur until the device transitions out of the
* suspend state.
*/
SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
"queuecommand : device blocked\n"));
return SCSI_MLQUEUE_DEVICE_BUSY;
}
/* Store the LUN value in cmnd, if needed. */
if (cmd->device->lun_in_cdb) cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |
(cmd->device->lun << 5 & 0xe0);
scsi_log_send(cmd);
/*
* Before we queue this command, check if the command
* length exceeds what the host adapter can handle.
*/
if (cmd->cmd_len > cmd->device->host->max_cmd_len) {
SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
"queuecommand : command too long. "
"cdb_size=%d host->max_cmd_len=%d\n",
cmd->cmd_len, cmd->device->host->max_cmd_len));
cmd->result = (DID_ABORT << 16);
goto done;
}
if (unlikely(host->shost_state == SHOST_DEL)) { cmd->result = (DID_NO_CONNECT << 16);
goto done;
}
trace_scsi_dispatch_cmd_start(cmd);
rtn = host->hostt->queuecommand(host, cmd);
if (rtn) {
trace_scsi_dispatch_cmd_error(cmd, rtn);
if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
rtn != SCSI_MLQUEUE_TARGET_BUSY)
rtn = SCSI_MLQUEUE_HOST_BUSY;
SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
"queuecommand : request rejected\n"));
}
return rtn;
done:
cmd->scsi_done(cmd);
return 0;
}
/* Size in bytes of the sg-list stored in the scsi-mq command-private data. */
static unsigned int scsi_mq_inline_sgl_size(struct Scsi_Host *shost)
{
return min_t(unsigned int, shost->sg_tablesize, SCSI_INLINE_SG_CNT) *
sizeof(struct scatterlist);
}
static blk_status_t scsi_prepare_cmd(struct request *req)
{
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
struct scsi_device *sdev = req->q->queuedata;
struct Scsi_Host *shost = sdev->host;
struct scatterlist *sg;
scsi_init_command(sdev, cmd);
cmd->prot_op = SCSI_PROT_NORMAL;
if (blk_rq_bytes(req))
cmd->sc_data_direction = rq_dma_dir(req);
else
cmd->sc_data_direction = DMA_NONE; sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
cmd->sdb.table.sgl = sg;
if (scsi_host_get_prot(shost)) {
memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));
cmd->prot_sdb->table.sgl =
(struct scatterlist *)(cmd->prot_sdb + 1);
}
/*
* Special handling for passthrough commands, which don't go to the ULP
* at all:
*/
if (blk_rq_is_passthrough(req))
return scsi_setup_scsi_cmnd(sdev, req);
if (sdev->handler && sdev->handler->prep_fn) { blk_status_t ret = sdev->handler->prep_fn(sdev, req);
if (ret != BLK_STS_OK)
return ret;
}
cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd;
memset(cmd->cmnd, 0, BLK_MAX_CDB);
return scsi_cmd_to_driver(cmd)->init_command(cmd);
}
static void scsi_mq_done(struct scsi_cmnd *cmd)
{
if (unlikely(blk_should_fake_timeout(scsi_cmd_to_rq(cmd)->q)))
return;
if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state)))
return;
trace_scsi_dispatch_cmd_done(cmd);
blk_mq_complete_request(scsi_cmd_to_rq(cmd));
}
static void scsi_mq_put_budget(struct request_queue *q, int budget_token)
{
struct scsi_device *sdev = q->queuedata;
sbitmap_put(&sdev->budget_map, budget_token);
}
static int scsi_mq_get_budget(struct request_queue *q)
{
struct scsi_device *sdev = q->queuedata;
int token = scsi_dev_queue_ready(q, sdev);
if (token >= 0)
return token;
atomic_inc(&sdev->restarts);
/*
* Orders atomic_inc(&sdev->restarts) and atomic_read(&sdev->device_busy).
* .restarts must be incremented before .device_busy is read because the
* code in scsi_run_queue_async() depends on the order of these operations.
*/
smp_mb__after_atomic();
/*
* If all in-flight requests originated from this LUN are completed
* before reading .device_busy, sdev->device_busy will be observed as
* zero, then blk_mq_delay_run_hw_queues() will dispatch this request
* soon. Otherwise, completion of one of these requests will observe
* the .restarts flag, and the request queue will be run for handling
* this request, see scsi_end_request().
*/
if (unlikely(scsi_device_busy(sdev) == 0 &&
!scsi_device_blocked(sdev)))
blk_mq_delay_run_hw_queues(sdev->request_queue, SCSI_QUEUE_DELAY);
return -1;
}
static void scsi_mq_set_rq_budget_token(struct request *req, int token)
{
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
cmd->budget_token = token;
}
static int scsi_mq_get_rq_budget_token(struct request *req)
{
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
return cmd->budget_token;
}
static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct request *req = bd->rq;
struct request_queue *q = req->q;
struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost = sdev->host;
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
blk_status_t ret;
int reason;
WARN_ON_ONCE(cmd->budget_token < 0);
/*
* If the device is not in running state we will reject some or all
* commands.
*/
if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
ret = scsi_device_state_check(sdev, req);
if (ret != BLK_STS_OK)
goto out_put_budget;
}
ret = BLK_STS_RESOURCE;
if (!scsi_target_queue_ready(shost, sdev))
goto out_put_budget;
if (!scsi_host_queue_ready(q, shost, sdev, cmd))
goto out_dec_target_busy;
if (!(req->rq_flags & RQF_DONTPREP)) {
ret = scsi_prepare_cmd(req);
if (ret != BLK_STS_OK)
goto out_dec_host_busy;
req->rq_flags |= RQF_DONTPREP;
} else {
clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
}
cmd->flags &= SCMD_PRESERVED_FLAGS;
if (sdev->simple_tags)
cmd->flags |= SCMD_TAGGED; if (bd->last) cmd->flags |= SCMD_LAST;
scsi_set_resid(cmd, 0);
memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
cmd->scsi_done = scsi_mq_done;
blk_mq_start_request(req);
reason = scsi_dispatch_cmd(cmd);
if (reason) {
scsi_set_blocked(cmd, reason);
ret = BLK_STS_RESOURCE;
goto out_dec_host_busy;
}
return BLK_STS_OK;
out_dec_host_busy:
scsi_dec_host_busy(shost, cmd);
out_dec_target_busy:
if (scsi_target(sdev)->can_queue > 0) atomic_dec(&scsi_target(sdev)->target_busy);
out_put_budget:
scsi_mq_put_budget(q, cmd->budget_token);
cmd->budget_token = -1;
switch (ret) {
case BLK_STS_OK:
break;
case BLK_STS_RESOURCE:
case BLK_STS_ZONE_RESOURCE:
if (scsi_device_blocked(sdev))
ret = BLK_STS_DEV_RESOURCE;
break;
case BLK_STS_AGAIN:
scsi_req(req)->result = DID_BUS_BUSY << 16;
if (req->rq_flags & RQF_DONTPREP)
scsi_mq_uninit_cmd(cmd);
break;
default:
if (unlikely(!scsi_device_online(sdev)))
scsi_req(req)->result = DID_NO_CONNECT << 16;
else
scsi_req(req)->result = DID_ERROR << 16;
/*
* Make sure to release all allocated resources when
* we hit an error, as we will never see this command
* again.
*/
if (req->rq_flags & RQF_DONTPREP)
scsi_mq_uninit_cmd(cmd); scsi_run_queue_async(sdev); break;
}
return ret;
}
static enum blk_eh_timer_return scsi_timeout(struct request *req,
bool reserved)
{
if (reserved)
return BLK_EH_RESET_TIMER;
return scsi_times_out(req);
}
static int scsi_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
unsigned int hctx_idx, unsigned int numa_node)
{
struct Scsi_Host *shost = set->driver_data;
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
struct scatterlist *sg;
int ret = 0;
cmd->sense_buffer =
kmem_cache_alloc_node(scsi_sense_cache, GFP_KERNEL, numa_node);
if (!cmd->sense_buffer)
return -ENOMEM;
cmd->req.sense = cmd->sense_buffer;
if (scsi_host_get_prot(shost)) {
sg = (void *)cmd + sizeof(struct scsi_cmnd) +
shost->hostt->cmd_size;
cmd->prot_sdb = (void *)sg + scsi_mq_inline_sgl_size(shost);
}
if (shost->hostt->init_cmd_priv) {
ret = shost->hostt->init_cmd_priv(shost, cmd);
if (ret < 0)
kmem_cache_free(scsi_sense_cache, cmd->sense_buffer);
}
return ret;
}
static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq,
unsigned int hctx_idx)
{
struct Scsi_Host *shost = set->driver_data;
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
if (shost->hostt->exit_cmd_priv)
shost->hostt->exit_cmd_priv(shost, cmd);
kmem_cache_free(scsi_sense_cache, cmd->sense_buffer);
}
static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx)
{
struct Scsi_Host *shost = hctx->driver_data;
if (shost->hostt->mq_poll)
return shost->hostt->mq_poll(shost, hctx->queue_num);
return 0;
}
static int scsi_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct Scsi_Host *shost = data;
hctx->driver_data = shost;
return 0;
}
static int scsi_map_queues(struct blk_mq_tag_set *set)
{
struct Scsi_Host *shost = container_of(set, struct Scsi_Host, tag_set);
if (shost->hostt->map_queues)
return shost->hostt->map_queues(shost);
return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}
void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
{
struct device *dev = shost->dma_dev;
/*
* this limit is imposed by hardware restrictions
*/
blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize,
SG_MAX_SEGMENTS));
if (scsi_host_prot_dma(shost)) {
shost->sg_prot_tablesize =
min_not_zero(shost->sg_prot_tablesize,
(unsigned short)SCSI_MAX_PROT_SG_SEGMENTS);
BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize);
}
if (dev->dma_mask) {
shost->max_sectors = min_t(unsigned int, shost->max_sectors,
dma_max_mapping_size(dev) >> SECTOR_SHIFT);
}
blk_queue_max_hw_sectors(q, shost->max_sectors);
blk_queue_segment_boundary(q, shost->dma_boundary);
dma_set_seg_boundary(dev, shost->dma_boundary);
blk_queue_max_segment_size(q, shost->max_segment_size);
blk_queue_virt_boundary(q, shost->virt_boundary_mask);
dma_set_max_seg_size(dev, queue_max_segment_size(q));
/*
* Set a reasonable default alignment: The larger of 32-byte (dword),
* which is a common minimum for HBAs, and the minimum DMA alignment,
* which is set by the platform.
*
* Devices that require a bigger alignment can increase it later.
*/
blk_queue_dma_alignment(q, max(4, dma_get_cache_alignment()) - 1);
}
EXPORT_SYMBOL_GPL(__scsi_init_queue);
static const struct blk_mq_ops scsi_mq_ops_no_commit = {
.get_budget = scsi_mq_get_budget,
.put_budget = scsi_mq_put_budget,
.queue_rq = scsi_queue_rq,
.complete = scsi_complete,
.timeout = scsi_timeout,
#ifdef CONFIG_BLK_DEBUG_FS
.show_rq = scsi_show_rq,
#endif
.init_request = scsi_mq_init_request,
.exit_request = scsi_mq_exit_request,
.initialize_rq_fn = scsi_initialize_rq,
.cleanup_rq = scsi_cleanup_rq,
.busy = scsi_mq_lld_busy,
.map_queues = scsi_map_queues,
.init_hctx = scsi_init_hctx,
.poll = scsi_mq_poll,
.set_rq_budget_token = scsi_mq_set_rq_budget_token,
.get_rq_budget_token = scsi_mq_get_rq_budget_token,
};
static void scsi_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
struct Scsi_Host *shost = hctx->driver_data;
shost->hostt->commit_rqs(shost, hctx->queue_num);
}
static const struct blk_mq_ops scsi_mq_ops = {
.get_budget = scsi_mq_get_budget,
.put_budget = scsi_mq_put_budget,
.queue_rq = scsi_queue_rq,
.commit_rqs = scsi_commit_rqs,
.complete = scsi_complete,
.timeout = scsi_timeout,
#ifdef CONFIG_BLK_DEBUG_FS
.show_rq = scsi_show_rq,
#endif
.init_request = scsi_mq_init_request,
.exit_request = scsi_mq_exit_request,
.initialize_rq_fn = scsi_initialize_rq,
.cleanup_rq = scsi_cleanup_rq,
.busy = scsi_mq_lld_busy,
.map_queues = scsi_map_queues,
.init_hctx = scsi_init_hctx,
.poll = scsi_mq_poll,
.set_rq_budget_token = scsi_mq_set_rq_budget_token,
.get_rq_budget_token = scsi_mq_get_rq_budget_token,
};
int scsi_mq_setup_tags(struct Scsi_Host *shost)
{
unsigned int cmd_size, sgl_size;
struct blk_mq_tag_set *tag_set = &shost->tag_set;
sgl_size = max_t(unsigned int, sizeof(struct scatterlist),
scsi_mq_inline_sgl_size(shost));
cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size + sgl_size;
if (scsi_host_get_prot(shost))
cmd_size += sizeof(struct scsi_data_buffer) +
sizeof(struct scatterlist) * SCSI_INLINE_PROT_SG_CNT;
memset(tag_set, 0, sizeof(*tag_set));
if (shost->hostt->commit_rqs)
tag_set->ops = &scsi_mq_ops;
else
tag_set->ops = &scsi_mq_ops_no_commit;
tag_set->nr_hw_queues = shost->nr_hw_queues ? : 1;
tag_set->nr_maps = shost->nr_maps ? : 1;
tag_set->queue_depth = shost->can_queue;
tag_set->cmd_size = cmd_size;
tag_set->numa_node = NUMA_NO_NODE;
tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
tag_set->flags |=
BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
tag_set->driver_data = shost;
if (shost->host_tagset)
tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
return blk_mq_alloc_tag_set(tag_set);
}
void scsi_mq_destroy_tags(struct Scsi_Host *shost)
{
blk_mq_free_tag_set(&shost->tag_set);
}
/**
* scsi_device_from_queue - return sdev associated with a request_queue
* @q: The request queue to return the sdev from
*
* Return the sdev associated with a request queue or NULL if the
* request_queue does not reference a SCSI device.
*/
struct scsi_device *scsi_device_from_queue(struct request_queue *q)
{
struct scsi_device *sdev = NULL;
if (q->mq_ops == &scsi_mq_ops_no_commit ||
q->mq_ops == &scsi_mq_ops)
sdev = q->queuedata;
if (!sdev || !get_device(&sdev->sdev_gendev))
sdev = NULL;
return sdev;
}
/**
* scsi_block_requests - Utility function used by low-level drivers to prevent
* further commands from being queued to the device.
* @shost: host in question
*
* There is no timer nor any other means by which the requests get unblocked
* other than the low-level driver calling scsi_unblock_requests().
*/
void scsi_block_requests(struct Scsi_Host *shost)
{
shost->host_self_blocked = 1;
}
EXPORT_SYMBOL(scsi_block_requests);
/**
* scsi_unblock_requests - Utility function used by low-level drivers to allow
* further commands to be queued to the device.
* @shost: host in question
*
* There is no timer nor any other means by which the requests get unblocked
* other than the low-level driver calling scsi_unblock_requests(). This is done
* as an API function so that changes to the internals of the scsi mid-layer
* won't require wholesale changes to drivers that use this feature.
*/
void scsi_unblock_requests(struct Scsi_Host *shost)
{
shost->host_self_blocked = 0;
scsi_run_host_queues(shost);
}
EXPORT_SYMBOL(scsi_unblock_requests);
void scsi_exit_queue(void)
{
kmem_cache_destroy(scsi_sense_cache);
}
/**
* scsi_mode_select - issue a mode select
* @sdev: SCSI device to be queried
* @pf: Page format bit (1 == standard, 0 == vendor specific)
* @sp: Save page bit (0 == don't save, 1 == save)
* @modepage: mode page being requested
* @buffer: request buffer (may not be smaller than eight bytes)
* @len: length of request buffer.
* @timeout: command timeout
* @retries: number of retries before failing
* @data: returns a structure abstracting the mode header data
* @sshdr: place to put sense data (or NULL if no sense to be collected).
* must be SCSI_SENSE_BUFFERSIZE big.
*
* Returns zero if successful; negative error number or scsi
* status on error
*
*/
int
scsi_mode_select(struct scsi_device *sdev, int pf, int sp, int modepage,
unsigned char *buffer, int len, int timeout, int retries,
struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr)
{
unsigned char cmd[10];
unsigned char *real_buffer;
int ret;
memset(cmd, 0, sizeof(cmd));
cmd[1] = (pf ? 0x10 : 0) | (sp ? 0x01 : 0);
if (sdev->use_10_for_ms) {
if (len > 65535)
return -EINVAL;
real_buffer = kmalloc(8 + len, GFP_KERNEL);
if (!real_buffer)
return -ENOMEM;
memcpy(real_buffer + 8, buffer, len);
len += 8;
real_buffer[0] = 0;
real_buffer[1] = 0;
real_buffer[2] = data->medium_type;
real_buffer[3] = data->device_specific;
real_buffer[4] = data->longlba ? 0x01 : 0;
real_buffer[5] = 0;
real_buffer[6] = data->block_descriptor_length >> 8;
real_buffer[7] = data->block_descriptor_length;
cmd[0] = MODE_SELECT_10;
cmd[7] = len >> 8;
cmd[8] = len;
} else {
if (len > 255 || data->block_descriptor_length > 255 ||
data->longlba)
return -EINVAL;
real_buffer = kmalloc(4 + len, GFP_KERNEL);
if (!real_buffer)
return -ENOMEM;
memcpy(real_buffer + 4, buffer, len);
len += 4;
real_buffer[0] = 0;
real_buffer[1] = data->medium_type;
real_buffer[2] = data->device_specific;
real_buffer[3] = data->block_descriptor_length;
cmd[0] = MODE_SELECT;
cmd[4] = len;
}
ret = scsi_execute_req(sdev, cmd, DMA_TO_DEVICE, real_buffer, len,
sshdr, timeout, retries, NULL);
kfree(real_buffer);
return ret;
}
EXPORT_SYMBOL_GPL(scsi_mode_select);
/**
* scsi_mode_sense - issue a mode sense, falling back from 10 to six bytes if necessary.
* @sdev: SCSI device to be queried
* @dbd: set to prevent mode sense from returning block descriptors
* @modepage: mode page being requested
* @buffer: request buffer (may not be smaller than eight bytes)
* @len: length of request buffer.
* @timeout: command timeout
* @retries: number of retries before failing
* @data: returns a structure abstracting the mode header data
* @sshdr: place to put sense data (or NULL if no sense to be collected).
* must be SCSI_SENSE_BUFFERSIZE big.
*
* Returns zero if successful, or a negative error number on failure
*/
int
scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage,
unsigned char *buffer, int len, int timeout, int retries,
struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr)
{
unsigned char cmd[12];
int use_10_for_ms;
int header_length;
int result, retry_count = retries;
struct scsi_sense_hdr my_sshdr;
memset(data, 0, sizeof(*data));
memset(&cmd[0], 0, 12);
dbd = sdev->set_dbd_for_ms ? 8 : dbd;
cmd[1] = dbd & 0x18; /* allows DBD and LLBA bits */
cmd[2] = modepage;
/* caller might not be interested in sense, but we need it */
if (!sshdr)
sshdr = &my_sshdr;
retry:
use_10_for_ms = sdev->use_10_for_ms || len > 255;
if (use_10_for_ms) {
if (len < 8 || len > 65535)
return -EINVAL;
cmd[0] = MODE_SENSE_10;
put_unaligned_be16(len, &cmd[7]);
header_length = 8;
} else {
if (len < 4)
return -EINVAL;
cmd[0] = MODE_SENSE;
cmd[4] = len;
header_length = 4;
}
memset(buffer, 0, len);
result = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buffer, len,
sshdr, timeout, retries, NULL);
if (result < 0)
return result;
/* This code looks awful: what it's doing is making sure an
* ILLEGAL REQUEST sense return identifies the actual command
* byte as the problem. MODE_SENSE commands can return
* ILLEGAL REQUEST if the code page isn't supported */
if (!scsi_status_is_good(result)) {
if (scsi_sense_valid(sshdr)) {
if ((sshdr->sense_key == ILLEGAL_REQUEST) &&
(sshdr->asc == 0x20) && (sshdr->ascq == 0)) {
/*
* Invalid command operation code: retry using
* MODE SENSE(6) if this was a MODE SENSE(10)
* request, except if the request mode page is
* too large for MODE SENSE single byte
* allocation length field.
*/
if (use_10_for_ms) {
if (len > 255)
return -EIO;
sdev->use_10_for_ms = 0;
goto retry;
}
}
if (scsi_status_is_check_condition(result) &&
sshdr->sense_key == UNIT_ATTENTION &&
retry_count) {
retry_count--;
goto retry;
}
}
return -EIO;
}
if (unlikely(buffer[0] == 0x86 && buffer[1] == 0x0b &&
(modepage == 6 || modepage == 8))) {
/* Initio breakage? */
header_length = 0;
data->length = 13;
data->medium_type = 0;
data->device_specific = 0;
data->longlba = 0;
data->block_descriptor_length = 0;
} else if (use_10_for_ms) {
data->length = get_unaligned_be16(&buffer[0]) + 2;
data->medium_type = buffer[2];
data->device_specific = buffer[3];
data->longlba = buffer[4] & 0x01;
data->block_descriptor_length = get_unaligned_be16(&buffer[6]);
} else {
data->length = buffer[0] + 1;
data->medium_type = buffer[1];
data->device_specific = buffer[2];
data->block_descriptor_length = buffer[3];
}
data->header_length = header_length;
return 0;
}
EXPORT_SYMBOL(scsi_mode_sense);
/**
* scsi_test_unit_ready - test if unit is ready
* @sdev: scsi device to change the state of.
* @timeout: command timeout
* @retries: number of retries before failing
* @sshdr: outpout pointer for decoded sense information.
*
* Returns zero if unsuccessful or an error if TUR failed. For
* removable media, UNIT_ATTENTION sets ->changed flag.
**/
int
scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries,
struct scsi_sense_hdr *sshdr)
{
char cmd[] = {
TEST_UNIT_READY, 0, 0, 0, 0, 0,
};
int result;
/* try to eat the UNIT_ATTENTION if there are enough retries */
do {
result = scsi_execute_req(sdev, cmd, DMA_NONE, NULL, 0, sshdr,
timeout, 1, NULL);
if (sdev->removable && scsi_sense_valid(sshdr) &&
sshdr->sense_key == UNIT_ATTENTION)
sdev->changed = 1;
} while (scsi_sense_valid(sshdr) &&
sshdr->sense_key == UNIT_ATTENTION && --retries);
return result;
}
EXPORT_SYMBOL(scsi_test_unit_ready);
/**
* scsi_device_set_state - Take the given device through the device state model.
* @sdev: scsi device to change the state of.
* @state: state to change to.
*
* Returns zero if successful or an error if the requested
* transition is illegal.
*/
int
scsi_device_set_state(struct scsi_device *sdev, enum scsi_device_state state)
{
enum scsi_device_state oldstate = sdev->sdev_state;
if (state == oldstate)
return 0;
switch (state) {
case SDEV_CREATED:
switch (oldstate) {
case SDEV_CREATED_BLOCK:
break;
default:
goto illegal;
}
break;
case SDEV_RUNNING:
switch (oldstate) {
case SDEV_CREATED:
case SDEV_OFFLINE:
case SDEV_TRANSPORT_OFFLINE:
case SDEV_QUIESCE:
case SDEV_BLOCK:
break;
default:
goto illegal;
}
break;
case SDEV_QUIESCE:
switch (oldstate) {
case SDEV_RUNNING:
case SDEV_OFFLINE:
case SDEV_TRANSPORT_OFFLINE:
break;
default:
goto illegal;
}
break;
case SDEV_OFFLINE:
case SDEV_TRANSPORT_OFFLINE:
switch (oldstate) {
case SDEV_CREATED:
case SDEV_RUNNING:
case SDEV_QUIESCE:
case SDEV_BLOCK:
break;
default:
goto illegal;
}
break;
case SDEV_BLOCK:
switch (oldstate) {
case SDEV_RUNNING:
case SDEV_CREATED_BLOCK:
case SDEV_QUIESCE:
case SDEV_OFFLINE:
break;
default:
goto illegal;
}
break;
case SDEV_CREATED_BLOCK:
switch (oldstate) {
case SDEV_CREATED:
break;
default:
goto illegal;
}
break;
case SDEV_CANCEL:
switch (oldstate) {
case SDEV_CREATED:
case SDEV_RUNNING:
case SDEV_QUIESCE:
case SDEV_OFFLINE:
case SDEV_TRANSPORT_OFFLINE:
break;
default:
goto illegal;
}
break;
case SDEV_DEL:
switch (oldstate) {
case SDEV_CREATED:
case SDEV_RUNNING:
case SDEV_OFFLINE:
case SDEV_TRANSPORT_OFFLINE:
case SDEV_CANCEL:
case SDEV_BLOCK:
case SDEV_CREATED_BLOCK:
break;
default:
goto illegal;
}
break;
}
sdev->offline_already = false;
sdev->sdev_state = state;
return 0;
illegal:
SCSI_LOG_ERROR_RECOVERY(1,
sdev_printk(KERN_ERR, sdev,
"Illegal state transition %s->%s",
scsi_device_state_name(oldstate),
scsi_device_state_name(state))
);
return -EINVAL;
}
EXPORT_SYMBOL(scsi_device_set_state);
/**
* scsi_evt_emit - emit a single SCSI device uevent
* @sdev: associated SCSI device
* @evt: event to emit
*
* Send a single uevent (scsi_event) to the associated scsi_device.
*/
static void scsi_evt_emit(struct scsi_device *sdev, struct scsi_event *evt)
{
int idx = 0;
char *envp[3];
switch (evt->evt_type) {
case SDEV_EVT_MEDIA_CHANGE:
envp[idx++] = "SDEV_MEDIA_CHANGE=1";
break;
case SDEV_EVT_INQUIRY_CHANGE_REPORTED:
scsi_rescan_device(&sdev->sdev_gendev);
envp[idx++] = "SDEV_UA=INQUIRY_DATA_HAS_CHANGED";
break;
case SDEV_EVT_CAPACITY_CHANGE_REPORTED:
envp[idx++] = "SDEV_UA=CAPACITY_DATA_HAS_CHANGED";
break;
case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED:
envp[idx++] = "SDEV_UA=THIN_PROVISIONING_SOFT_THRESHOLD_REACHED";
break;
case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED:
envp[idx++] = "SDEV_UA=MODE_PARAMETERS_CHANGED";
break;
case SDEV_EVT_LUN_CHANGE_REPORTED:
envp[idx++] = "SDEV_UA=REPORTED_LUNS_DATA_HAS_CHANGED";
break;
case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED:
envp[idx++] = "SDEV_UA=ASYMMETRIC_ACCESS_STATE_CHANGED";
break;
case SDEV_EVT_POWER_ON_RESET_OCCURRED:
envp[idx++] = "SDEV_UA=POWER_ON_RESET_OCCURRED";
break;
default:
/* do nothing */
break;
}
envp[idx++] = NULL;
kobject_uevent_env(&sdev->sdev_gendev.kobj, KOBJ_CHANGE, envp);
}
/**
* scsi_evt_thread - send a uevent for each scsi event
* @work: work struct for scsi_device
*
* Dispatch queued events to their associated scsi_device kobjects
* as uevents.
*/
void scsi_evt_thread(struct work_struct *work)
{
struct scsi_device *sdev;
enum scsi_device_event evt_type;
LIST_HEAD(event_list);
sdev = container_of(work, struct scsi_device, event_work);
for (evt_type = SDEV_EVT_FIRST; evt_type <= SDEV_EVT_LAST; evt_type++)
if (test_and_clear_bit(evt_type, sdev->pending_events))
sdev_evt_send_simple(sdev, evt_type, GFP_KERNEL);
while (1) {
struct scsi_event *evt;
struct list_head *this, *tmp;
unsigned long flags;
spin_lock_irqsave(&sdev->list_lock, flags);
list_splice_init(&sdev->event_list, &event_list);
spin_unlock_irqrestore(&sdev->list_lock, flags);
if (list_empty(&event_list))
break;
list_for_each_safe(this, tmp, &event_list) {
evt = list_entry(this, struct scsi_event, node);
list_del(&evt->node);
scsi_evt_emit(sdev, evt);
kfree(evt);
}
}
}
/**
* sdev_evt_send - send asserted event to uevent thread
* @sdev: scsi_device event occurred on
* @evt: event to send
*
* Assert scsi device event asynchronously.
*/
void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt)
{
unsigned long flags;
#if 0
/* FIXME: currently this check eliminates all media change events
* for polled devices. Need to update to discriminate between AN
* and polled events */
if (!test_bit(evt->evt_type, sdev->supported_events)) {
kfree(evt);
return;
}
#endif
spin_lock_irqsave(&sdev->list_lock, flags);
list_add_tail(&evt->node, &sdev->event_list);
schedule_work(&sdev->event_work);
spin_unlock_irqrestore(&sdev->list_lock, flags);
}
EXPORT_SYMBOL_GPL(sdev_evt_send);
/**
* sdev_evt_alloc - allocate a new scsi event
* @evt_type: type of event to allocate
* @gfpflags: GFP flags for allocation
*
* Allocates and returns a new scsi_event.
*/
struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type,
gfp_t gfpflags)
{
struct scsi_event *evt = kzalloc(sizeof(struct scsi_event), gfpflags);
if (!evt)
return NULL;
evt->evt_type = evt_type;
INIT_LIST_HEAD(&evt->node);
/* evt_type-specific initialization, if any */
switch (evt_type) {
case SDEV_EVT_MEDIA_CHANGE:
case SDEV_EVT_INQUIRY_CHANGE_REPORTED:
case SDEV_EVT_CAPACITY_CHANGE_REPORTED:
case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED:
case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED:
case SDEV_EVT_LUN_CHANGE_REPORTED:
case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED:
case SDEV_EVT_POWER_ON_RESET_OCCURRED:
default:
/* do nothing */
break;
}
return evt;
}
EXPORT_SYMBOL_GPL(sdev_evt_alloc);
/**
* sdev_evt_send_simple - send asserted event to uevent thread
* @sdev: scsi_device event occurred on
* @evt_type: type of event to send
* @gfpflags: GFP flags for allocation
*
* Assert scsi device event asynchronously, given an event type.
*/
void sdev_evt_send_simple(struct scsi_device *sdev,
enum scsi_device_event evt_type, gfp_t gfpflags)
{
struct scsi_event *evt = sdev_evt_alloc(evt_type, gfpflags);
if (!evt) {
sdev_printk(KERN_ERR, sdev, "event %d eaten due to OOM\n",
evt_type);
return;
}
sdev_evt_send(sdev, evt);
}
EXPORT_SYMBOL_GPL(sdev_evt_send_simple);
/**
* scsi_device_quiesce - Block all commands except power management.
* @sdev: scsi device to quiesce.
*
* This works by trying to transition to the SDEV_QUIESCE state
* (which must be a legal transition). When the device is in this
* state, only power management requests will be accepted, all others will
* be deferred.
*
* Must be called with user context, may sleep.
*
* Returns zero if unsuccessful or an error if not.
*/
int
scsi_device_quiesce(struct scsi_device *sdev)
{
struct request_queue *q = sdev->request_queue;
int err;
/*
* It is allowed to call scsi_device_quiesce() multiple times from
* the same context but concurrent scsi_device_quiesce() calls are
* not allowed.
*/
WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current);
if (sdev->quiesced_by == current)
return 0;
blk_set_pm_only(q);
blk_mq_freeze_queue(q);
/*
* Ensure that the effect of blk_set_pm_only() will be visible
* for percpu_ref_tryget() callers that occur after the queue
* unfreeze even if the queue was already frozen before this function
* was called. See also https://lwn.net/Articles/573497/.
*/
synchronize_rcu();
blk_mq_unfreeze_queue(q);
mutex_lock(&sdev->state_mutex);
err = scsi_device_set_state(sdev, SDEV_QUIESCE);
if (err == 0)
sdev->quiesced_by = current;
else
blk_clear_pm_only(q);
mutex_unlock(&sdev->state_mutex);
return err;
}
EXPORT_SYMBOL(scsi_device_quiesce);
/**
* scsi_device_resume - Restart user issued commands to a quiesced device.
* @sdev: scsi device to resume.
*
* Moves the device from quiesced back to running and restarts the
* queues.
*
* Must be called with user context, may sleep.
*/
void scsi_device_resume(struct scsi_device *sdev)
{
/* check if the device state was mutated prior to resume, and if
* so assume the state is being managed elsewhere (for example
* device deleted during suspend)
*/
mutex_lock(&sdev->state_mutex);
if (sdev->sdev_state == SDEV_QUIESCE)
scsi_device_set_state(sdev, SDEV_RUNNING);
if (sdev->quiesced_by) {
sdev->quiesced_by = NULL;
blk_clear_pm_only(sdev->request_queue);
}
mutex_unlock(&sdev->state_mutex);
}
EXPORT_SYMBOL(scsi_device_resume);
static void
device_quiesce_fn(struct scsi_device *sdev, void *data)
{
scsi_device_quiesce(sdev);
}
void
scsi_target_quiesce(struct scsi_target *starget)
{
starget_for_each_device(starget, NULL, device_quiesce_fn);
}
EXPORT_SYMBOL(scsi_target_quiesce);
static void
device_resume_fn(struct scsi_device *sdev, void *data)
{
scsi_device_resume(sdev);
}
void
scsi_target_resume(struct scsi_target *starget)
{
starget_for_each_device(starget, NULL, device_resume_fn);
}
EXPORT_SYMBOL(scsi_target_resume);
/**
* scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state
* @sdev: device to block
*
* Pause SCSI command processing on the specified device. Does not sleep.
*
* Returns zero if successful or a negative error code upon failure.
*
* Notes:
* This routine transitions the device to the SDEV_BLOCK state (which must be
* a legal transition). When the device is in this state, command processing
* is paused until the device leaves the SDEV_BLOCK state. See also
* scsi_internal_device_unblock_nowait().
*/
int scsi_internal_device_block_nowait(struct scsi_device *sdev)
{
struct request_queue *q = sdev->request_queue;
int err = 0;
err = scsi_device_set_state(sdev, SDEV_BLOCK);
if (err) {
err = scsi_device_set_state(sdev, SDEV_CREATED_BLOCK);
if (err)
return err;
}
/*
* The device has transitioned to SDEV_BLOCK. Stop the
* block layer from calling the midlayer with this device's
* request queue.
*/
blk_mq_quiesce_queue_nowait(q);
return 0;
}
EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait);
/**
* scsi_internal_device_block - try to transition to the SDEV_BLOCK state
* @sdev: device to block
*
* Pause SCSI command processing on the specified device and wait until all
* ongoing scsi_request_fn() / scsi_queue_rq() calls have finished. May sleep.
*
* Returns zero if successful or a negative error code upon failure.
*
* Note:
* This routine transitions the device to the SDEV_BLOCK state (which must be
* a legal transition). When the device is in this state, command processing
* is paused until the device leaves the SDEV_BLOCK state. See also
* scsi_internal_device_unblock().
*/
static int scsi_internal_device_block(struct scsi_device *sdev)
{
struct request_queue *q = sdev->request_queue;
int err;
mutex_lock(&sdev->state_mutex);
err = scsi_internal_device_block_nowait(sdev);
if (err == 0)
blk_mq_quiesce_queue(q);
mutex_unlock(&sdev->state_mutex);
return err;
}
void scsi_start_queue(struct scsi_device *sdev)
{
struct request_queue *q = sdev->request_queue;
blk_mq_unquiesce_queue(q);
}
/**
* scsi_internal_device_unblock_nowait - resume a device after a block request
* @sdev: device to resume
* @new_state: state to set the device to after unblocking
*
* Restart the device queue for a previously suspended SCSI device. Does not
* sleep.
*
* Returns zero if successful or a negative error code upon failure.
*
* Notes:
* This routine transitions the device to the SDEV_RUNNING state or to one of
* the offline states (which must be a legal transition) allowing the midlayer
* to goose the queue for this device.
*/
int scsi_internal_device_unblock_nowait(struct scsi_device *sdev,
enum scsi_device_state new_state)
{
switch (new_state) {
case SDEV_RUNNING:
case SDEV_TRANSPORT_OFFLINE:
break;
default:
return -EINVAL;
}
/*
* Try to transition the scsi device to SDEV_RUNNING or one of the
* offlined states and goose the device queue if successful.
*/
switch (sdev->sdev_state) {
case SDEV_BLOCK:
case SDEV_TRANSPORT_OFFLINE:
sdev->sdev_state = new_state;
break;
case SDEV_CREATED_BLOCK:
if (new_state == SDEV_TRANSPORT_OFFLINE ||
new_state == SDEV_OFFLINE)
sdev->sdev_state = new_state;
else
sdev->sdev_state = SDEV_CREATED;
break;
case SDEV_CANCEL:
case SDEV_OFFLINE:
break;
default:
return -EINVAL;
}
scsi_start_queue(sdev);
return 0;
}
EXPORT_SYMBOL_GPL(scsi_internal_device_unblock_nowait);
/**
* scsi_internal_device_unblock - resume a device after a block request
* @sdev: device to resume
* @new_state: state to set the device to after unblocking
*
* Restart the device queue for a previously suspended SCSI device. May sleep.
*
* Returns zero if successful or a negative error code upon failure.
*
* Notes:
* This routine transitions the device to the SDEV_RUNNING state or to one of
* the offline states (which must be a legal transition) allowing the midlayer
* to goose the queue for this device.
*/
static int scsi_internal_device_unblock(struct scsi_device *sdev,
enum scsi_device_state new_state)
{
int ret;
mutex_lock(&sdev->state_mutex);
ret = scsi_internal_device_unblock_nowait(sdev, new_state);
mutex_unlock(&sdev->state_mutex);
return ret;
}
static void
device_block(struct scsi_device *sdev, void *data)
{
int ret;
ret = scsi_internal_device_block(sdev);
WARN_ONCE(ret, "scsi_internal_device_block(%s) failed: ret = %d\n",
dev_name(&sdev->sdev_gendev), ret);
}
static int
target_block(struct device *dev, void *data)
{
if (scsi_is_target_device(dev))
starget_for_each_device(to_scsi_target(dev), NULL,
device_block);
return 0;
}
void
scsi_target_block(struct device *dev)
{
if (scsi_is_target_device(dev))
starget_for_each_device(to_scsi_target(dev), NULL,
device_block);
else
device_for_each_child(dev, NULL, target_block);
}
EXPORT_SYMBOL_GPL(scsi_target_block);
static void
device_unblock(struct scsi_device *sdev, void *data)
{
scsi_internal_device_unblock(sdev, *(enum scsi_device_state *)data);
}
static int
target_unblock(struct device *dev, void *data)
{
if (scsi_is_target_device(dev))
starget_for_each_device(to_scsi_target(dev), data,
device_unblock);
return 0;
}
void
scsi_target_unblock(struct device *dev, enum scsi_device_state new_state)
{
if (scsi_is_target_device(dev))
starget_for_each_device(to_scsi_target(dev), &new_state,
device_unblock);
else
device_for_each_child(dev, &new_state, target_unblock);
}
EXPORT_SYMBOL_GPL(scsi_target_unblock);
int
scsi_host_block(struct Scsi_Host *shost)
{
struct scsi_device *sdev;
int ret = 0;
/*
* Call scsi_internal_device_block_nowait so we can avoid
* calling synchronize_rcu() for each LUN.
*/
shost_for_each_device(sdev, shost) {
mutex_lock(&sdev->state_mutex);
ret = scsi_internal_device_block_nowait(sdev);
mutex_unlock(&sdev->state_mutex);
if (ret) {
scsi_device_put(sdev);
break;
}
}
/*
* SCSI never enables blk-mq's BLK_MQ_F_BLOCKING flag so
* calling synchronize_rcu() once is enough.
*/
WARN_ON_ONCE(shost->tag_set.flags & BLK_MQ_F_BLOCKING);
if (!ret)
synchronize_rcu();
return ret;
}
EXPORT_SYMBOL_GPL(scsi_host_block);
int
scsi_host_unblock(struct Scsi_Host *shost, int new_state)
{
struct scsi_device *sdev;
int ret = 0;
shost_for_each_device(sdev, shost) {
ret = scsi_internal_device_unblock(sdev, new_state);
if (ret) {
scsi_device_put(sdev);
break;
}
}
return ret;
}
EXPORT_SYMBOL_GPL(scsi_host_unblock);
/**
* scsi_kmap_atomic_sg - find and atomically map an sg-elemnt
* @sgl: scatter-gather list
* @sg_count: number of segments in sg
* @offset: offset in bytes into sg, on return offset into the mapped area
* @len: bytes to map, on return number of bytes mapped
*
* Returns virtual address of the start of the mapped page
*/
void *scsi_kmap_atomic_sg(struct scatterlist *sgl, int sg_count,
size_t *offset, size_t *len)
{
int i;
size_t sg_len = 0, len_complete = 0;
struct scatterlist *sg;
struct page *page;
WARN_ON(!irqs_disabled());
for_each_sg(sgl, sg, sg_count, i) {
len_complete = sg_len; /* Complete sg-entries */
sg_len += sg->length;
if (sg_len > *offset)
break;
}
if (unlikely(i == sg_count)) {
printk(KERN_ERR "%s: Bytes in sg: %zu, requested offset %zu, "
"elements %d\n",
__func__, sg_len, *offset, sg_count);
WARN_ON(1);
return NULL;
}
/* Offset starting from the beginning of first page in this sg-entry */
*offset = *offset - len_complete + sg->offset;
/* Assumption: contiguous pages can be accessed as "page + i" */
page = nth_page(sg_page(sg), (*offset >> PAGE_SHIFT));
*offset &= ~PAGE_MASK;
/* Bytes in this sg-entry from *offset to the end of the page */
sg_len = PAGE_SIZE - *offset;
if (*len > sg_len)
*len = sg_len;
return kmap_atomic(page);
}
EXPORT_SYMBOL(scsi_kmap_atomic_sg);
/**
* scsi_kunmap_atomic_sg - atomically unmap a virtual address, previously mapped with scsi_kmap_atomic_sg
* @virt: virtual address to be unmapped
*/
void scsi_kunmap_atomic_sg(void *virt)
{
kunmap_atomic(virt);
}
EXPORT_SYMBOL(scsi_kunmap_atomic_sg);
void sdev_disable_disk_events(struct scsi_device *sdev)
{
atomic_inc(&sdev->disk_events_disable_depth);
}
EXPORT_SYMBOL(sdev_disable_disk_events);
void sdev_enable_disk_events(struct scsi_device *sdev)
{
if (WARN_ON_ONCE(atomic_read(&sdev->disk_events_disable_depth) <= 0))
return;
atomic_dec(&sdev->disk_events_disable_depth);
}
EXPORT_SYMBOL(sdev_enable_disk_events);
static unsigned char designator_prio(const unsigned char *d)
{
if (d[1] & 0x30)
/* not associated with LUN */
return 0;
if (d[3] == 0)
/* invalid length */
return 0;
/*
* Order of preference for lun descriptor:
* - SCSI name string
* - NAA IEEE Registered Extended
* - EUI-64 based 16-byte
* - EUI-64 based 12-byte
* - NAA IEEE Registered
* - NAA IEEE Extended
* - EUI-64 based 8-byte
* - SCSI name string (truncated)
* - T10 Vendor ID
* as longer descriptors reduce the likelyhood
* of identification clashes.
*/
switch (d[1] & 0xf) {
case 8:
/* SCSI name string, variable-length UTF-8 */
return 9;
case 3:
switch (d[4] >> 4) {
case 6:
/* NAA registered extended */
return 8;
case 5:
/* NAA registered */
return 5;
case 4:
/* NAA extended */
return 4;
case 3:
/* NAA locally assigned */
return 1;
default:
break;
}
break;
case 2:
switch (d[3]) {
case 16:
/* EUI64-based, 16 byte */
return 7;
case 12:
/* EUI64-based, 12 byte */
return 6;
case 8:
/* EUI64-based, 8 byte */
return 3;
default:
break;
}
break;
case 1:
/* T10 vendor ID */
return 1;
default:
break;
}
return 0;
}
/**
* scsi_vpd_lun_id - return a unique device identification
* @sdev: SCSI device
* @id: buffer for the identification
* @id_len: length of the buffer
*
* Copies a unique device identification into @id based
* on the information in the VPD page 0x83 of the device.
* The string will be formatted as a SCSI name string.
*
* Returns the length of the identification or error on failure.
* If the identifier is longer than the supplied buffer the actual
* identifier length is returned and the buffer is not zero-padded.
*/
int scsi_vpd_lun_id(struct scsi_device *sdev, char *id, size_t id_len)
{
u8 cur_id_prio = 0;
u8 cur_id_size = 0;
const unsigned char *d, *cur_id_str;
const struct scsi_vpd *vpd_pg83;
int id_size = -EINVAL;
rcu_read_lock();
vpd_pg83 = rcu_dereference(sdev->vpd_pg83);
if (!vpd_pg83) {
rcu_read_unlock();
return -ENXIO;
}
/* The id string must be at least 20 bytes + terminating NULL byte */
if (id_len < 21) {
rcu_read_unlock();
return -EINVAL;
}
memset(id, 0, id_len);
for (d = vpd_pg83->data + 4;
d < vpd_pg83->data + vpd_pg83->len;
d += d[3] + 4) {
u8 prio = designator_prio(d);
if (prio == 0 || cur_id_prio > prio)
continue;
switch (d[1] & 0xf) {
case 0x1:
/* T10 Vendor ID */
if (cur_id_size > d[3])
break;
cur_id_prio = prio;
cur_id_size = d[3];
if (cur_id_size + 4 > id_len)
cur_id_size = id_len - 4;
cur_id_str = d + 4;
id_size = snprintf(id, id_len, "t10.%*pE",
cur_id_size, cur_id_str);
break;
case 0x2:
/* EUI-64 */
cur_id_prio = prio;
cur_id_size = d[3];
cur_id_str = d + 4;
switch (cur_id_size) {
case 8:
id_size = snprintf(id, id_len,
"eui.%8phN",
cur_id_str);
break;
case 12:
id_size = snprintf(id, id_len,
"eui.%12phN",
cur_id_str);
break;
case 16:
id_size = snprintf(id, id_len,
"eui.%16phN",
cur_id_str);
break;
default:
break;
}
break;
case 0x3:
/* NAA */
cur_id_prio = prio;
cur_id_size = d[3];
cur_id_str = d + 4;
switch (cur_id_size) {
case 8:
id_size = snprintf(id, id_len,
"naa.%8phN",
cur_id_str);
break;
case 16:
id_size = snprintf(id, id_len,
"naa.%16phN",
cur_id_str);
break;
default:
break;
}
break;
case 0x8:
/* SCSI name string */
if (cur_id_size > d[3])
break;
/* Prefer others for truncated descriptor */
if (d[3] > id_len) {
prio = 2;
if (cur_id_prio > prio)
break;
}
cur_id_prio = prio;
cur_id_size = id_size = d[3];
cur_id_str = d + 4;
if (cur_id_size >= id_len)
cur_id_size = id_len - 1;
memcpy(id, cur_id_str, cur_id_size);
break;
default:
break;
}
}
rcu_read_unlock();
return id_size;
}
EXPORT_SYMBOL(scsi_vpd_lun_id);
/*
* scsi_vpd_tpg_id - return a target port group identifier
* @sdev: SCSI device
*
* Returns the Target Port Group identifier from the information
* froom VPD page 0x83 of the device.
*
* Returns the identifier or error on failure.
*/
int scsi_vpd_tpg_id(struct scsi_device *sdev, int *rel_id)
{
const unsigned char *d;
const struct scsi_vpd *vpd_pg83;
int group_id = -EAGAIN, rel_port = -1;
rcu_read_lock();
vpd_pg83 = rcu_dereference(sdev->vpd_pg83);
if (!vpd_pg83) {
rcu_read_unlock();
return -ENXIO;
}
d = vpd_pg83->data + 4;
while (d < vpd_pg83->data + vpd_pg83->len) {
switch (d[1] & 0xf) {
case 0x4:
/* Relative target port */
rel_port = get_unaligned_be16(&d[6]);
break;
case 0x5:
/* Target port group */
group_id = get_unaligned_be16(&d[6]);
break;
default:
break;
}
d += d[3] + 4;
}
rcu_read_unlock();
if (group_id >= 0 && rel_id && rel_port != -1)
*rel_id = rel_port;
return group_id;
}
EXPORT_SYMBOL(scsi_vpd_tpg_id);
/**
* scsi_build_sense - build sense data for a command
* @scmd: scsi command for which the sense should be formatted
* @desc: Sense format (non-zero == descriptor format,
* 0 == fixed format)
* @key: Sense key
* @asc: Additional sense code
* @ascq: Additional sense code qualifier
*
**/
void scsi_build_sense(struct scsi_cmnd *scmd, int desc, u8 key, u8 asc, u8 ascq)
{
scsi_build_sense_buffer(desc, scmd->sense_buffer, key, asc, ascq);
scmd->result = SAM_STAT_CHECK_CONDITION;
}
EXPORT_SYMBOL_GPL(scsi_build_sense);
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* libata-sff.c - helper library for PCI IDE BMDMA
*
* Copyright 2003-2006 Red Hat, Inc. All rights reserved.
* Copyright 2003-2006 Jeff Garzik
*
* libata documentation is available via 'make {ps|pdf}docs',
* as Documentation/driver-api/libata.rst
*
* Hardware documentation available from http://www.t13.org/ and
* http://www.sata-io.org/
*/
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/module.h>
#include <linux/libata.h>
#include <linux/highmem.h>
#include "libata.h"
static struct workqueue_struct *ata_sff_wq;
const struct ata_port_operations ata_sff_port_ops = {
.inherits = &ata_base_port_ops,
.qc_prep = ata_noop_qc_prep,
.qc_issue = ata_sff_qc_issue,
.qc_fill_rtf = ata_sff_qc_fill_rtf,
.freeze = ata_sff_freeze,
.thaw = ata_sff_thaw,
.prereset = ata_sff_prereset,
.softreset = ata_sff_softreset,
.hardreset = sata_sff_hardreset,
.postreset = ata_sff_postreset,
.error_handler = ata_sff_error_handler,
.sff_dev_select = ata_sff_dev_select,
.sff_check_status = ata_sff_check_status,
.sff_tf_load = ata_sff_tf_load,
.sff_tf_read = ata_sff_tf_read,
.sff_exec_command = ata_sff_exec_command,
.sff_data_xfer = ata_sff_data_xfer,
.sff_drain_fifo = ata_sff_drain_fifo,
.lost_interrupt = ata_sff_lost_interrupt,
};
EXPORT_SYMBOL_GPL(ata_sff_port_ops);
/**
* ata_sff_check_status - Read device status reg & clear interrupt
* @ap: port where the device is
*
* Reads ATA taskfile status register for currently-selected device
* and return its value. This also clears pending interrupts
* from this device
*
* LOCKING:
* Inherited from caller.
*/
u8 ata_sff_check_status(struct ata_port *ap)
{
return ioread8(ap->ioaddr.status_addr);
}
EXPORT_SYMBOL_GPL(ata_sff_check_status);
/**
* ata_sff_altstatus - Read device alternate status reg
* @ap: port where the device is
*
* Reads ATA taskfile alternate status register for
* currently-selected device and return its value.
*
* Note: may NOT be used as the check_altstatus() entry in
* ata_port_operations.
*
* LOCKING:
* Inherited from caller.
*/
static u8 ata_sff_altstatus(struct ata_port *ap)
{
if (ap->ops->sff_check_altstatus)
return ap->ops->sff_check_altstatus(ap);
return ioread8(ap->ioaddr.altstatus_addr);
}
/**
* ata_sff_irq_status - Check if the device is busy
* @ap: port where the device is
*
* Determine if the port is currently busy. Uses altstatus
* if available in order to avoid clearing shared IRQ status
* when finding an IRQ source. Non ctl capable devices don't
* share interrupt lines fortunately for us.
*
* LOCKING:
* Inherited from caller.
*/
static u8 ata_sff_irq_status(struct ata_port *ap)
{
u8 status;
if (ap->ops->sff_check_altstatus || ap->ioaddr.altstatus_addr) {
status = ata_sff_altstatus(ap);
/* Not us: We are busy */
if (status & ATA_BUSY)
return status;
}
/* Clear INTRQ latch */
status = ap->ops->sff_check_status(ap);
return status;
}
/**
* ata_sff_sync - Flush writes
* @ap: Port to wait for.
*
* CAUTION:
* If we have an mmio device with no ctl and no altstatus
* method this will fail. No such devices are known to exist.
*
* LOCKING:
* Inherited from caller.
*/
static void ata_sff_sync(struct ata_port *ap)
{
if (ap->ops->sff_check_altstatus) ap->ops->sff_check_altstatus(ap); else if (ap->ioaddr.altstatus_addr) ioread8(ap->ioaddr.altstatus_addr);
}
/**
* ata_sff_pause - Flush writes and wait 400nS
* @ap: Port to pause for.
*
* CAUTION:
* If we have an mmio device with no ctl and no altstatus
* method this will fail. No such devices are known to exist.
*
* LOCKING:
* Inherited from caller.
*/
void ata_sff_pause(struct ata_port *ap)
{
ata_sff_sync(ap);
ndelay(400);
}
EXPORT_SYMBOL_GPL(ata_sff_pause);
/**
* ata_sff_dma_pause - Pause before commencing DMA
* @ap: Port to pause for.
*
* Perform I/O fencing and ensure sufficient cycle delays occur
* for the HDMA1:0 transition
*/
void ata_sff_dma_pause(struct ata_port *ap)
{
if (ap->ops->sff_check_altstatus || ap->ioaddr.altstatus_addr) {
/* An altstatus read will cause the needed delay without
messing up the IRQ status */
ata_sff_altstatus(ap);
return;
}
/* There are no DMA controllers without ctl. BUG here to ensure
we never violate the HDMA1:0 transition timing and risk
corruption. */
BUG();
}
EXPORT_SYMBOL_GPL(ata_sff_dma_pause);
/**
* ata_sff_busy_sleep - sleep until BSY clears, or timeout
* @ap: port containing status register to be polled
* @tmout_pat: impatience timeout in msecs
* @tmout: overall timeout in msecs
*
* Sleep until ATA Status register bit BSY clears,
* or a timeout occurs.
*
* LOCKING:
* Kernel thread context (may sleep).
*
* RETURNS:
* 0 on success, -errno otherwise.
*/
int ata_sff_busy_sleep(struct ata_port *ap,
unsigned long tmout_pat, unsigned long tmout)
{
unsigned long timer_start, timeout;
u8 status;
status = ata_sff_busy_wait(ap, ATA_BUSY, 300);
timer_start = jiffies;
timeout = ata_deadline(timer_start, tmout_pat);
while (status != 0xff && (status & ATA_BUSY) &&
time_before(jiffies, timeout)) {
ata_msleep(ap, 50);
status = ata_sff_busy_wait(ap, ATA_BUSY, 3);
}
if (status != 0xff && (status & ATA_BUSY))
ata_port_warn(ap,
"port is slow to respond, please be patient (Status 0x%x)\n",
status);
timeout = ata_deadline(timer_start, tmout);
while (status != 0xff && (status & ATA_BUSY) &&
time_before(jiffies, timeout)) {
ata_msleep(ap, 50);
status = ap->ops->sff_check_status(ap);
}
if (status == 0xff)
return -ENODEV;
if (status & ATA_BUSY) {
ata_port_err(ap,
"port failed to respond (%lu secs, Status 0x%x)\n",
DIV_ROUND_UP(tmout, 1000), status);
return -EBUSY;
}
return 0;
}
EXPORT_SYMBOL_GPL(ata_sff_busy_sleep);
static int ata_sff_check_ready(struct ata_link *link)
{
u8 status = link->ap->ops->sff_check_status(link->ap);
return ata_check_ready(status);
}
/**
* ata_sff_wait_ready - sleep until BSY clears, or timeout
* @link: SFF link to wait ready status for
* @deadline: deadline jiffies for the operation
*
* Sleep until ATA Status register bit BSY clears, or timeout
* occurs.
*
* LOCKING:
* Kernel thread context (may sleep).
*
* RETURNS:
* 0 on success, -errno otherwise.
*/
int ata_sff_wait_ready(struct ata_link *link, unsigned long deadline)
{
return ata_wait_ready(link, deadline, ata_sff_check_ready);
}
EXPORT_SYMBOL_GPL(ata_sff_wait_ready);
/**
* ata_sff_set_devctl - Write device control reg
* @ap: port where the device is
* @ctl: value to write
*
* Writes ATA taskfile device control register.
*
* Note: may NOT be used as the sff_set_devctl() entry in
* ata_port_operations.
*
* LOCKING:
* Inherited from caller.
*/
static void ata_sff_set_devctl(struct ata_port *ap, u8 ctl)
{
if (ap->ops->sff_set_devctl)
ap->ops->sff_set_devctl(ap, ctl);
else
iowrite8(ctl, ap->ioaddr.ctl_addr);
}
/**
* ata_sff_dev_select - Select device 0/1 on ATA bus
* @ap: ATA channel to manipulate
* @device: ATA device (numbered from zero) to select
*
* Use the method defined in the ATA specification to
* make either device 0, or device 1, active on the
* ATA channel. Works with both PIO and MMIO.
*
* May be used as the dev_select() entry in ata_port_operations.
*
* LOCKING:
* caller.
*/
void ata_sff_dev_select(struct ata_port *ap, unsigned int device)
{
u8 tmp;
if (device == 0)
tmp = ATA_DEVICE_OBS;
else
tmp = ATA_DEVICE_OBS | ATA_DEV1;
iowrite8(tmp, ap->ioaddr.device_addr);
ata_sff_pause(ap); /* needed; also flushes, for mmio */
}
EXPORT_SYMBOL_GPL(ata_sff_dev_select);
/**
* ata_dev_select - Select device 0/1 on ATA bus
* @ap: ATA channel to manipulate
* @device: ATA device (numbered from zero) to select
* @wait: non-zero to wait for Status register BSY bit to clear
* @can_sleep: non-zero if context allows sleeping
*
* Use the method defined in the ATA specification to
* make either device 0, or device 1, active on the
* ATA channel.
*
* This is a high-level version of ata_sff_dev_select(), which
* additionally provides the services of inserting the proper
* pauses and status polling, where needed.
*
* LOCKING:
* caller.
*/
static void ata_dev_select(struct ata_port *ap, unsigned int device,
unsigned int wait, unsigned int can_sleep)
{
if (ata_msg_probe(ap)) ata_port_info(ap, "ata_dev_select: ENTER, device %u, wait %u\n",
device, wait);
if (wait)
ata_wait_idle(ap);
ap->ops->sff_dev_select(ap, device);
if (wait) {
if (can_sleep && ap->link.device[device].class == ATA_DEV_ATAPI)
ata_msleep(ap, 150);
ata_wait_idle(ap);
}
}
/**
* ata_sff_irq_on - Enable interrupts on a port.
* @ap: Port on which interrupts are enabled.
*
* Enable interrupts on a legacy IDE device using MMIO or PIO,
* wait for idle, clear any pending interrupts.
*
* Note: may NOT be used as the sff_irq_on() entry in
* ata_port_operations.
*
* LOCKING:
* Inherited from caller.
*/
void ata_sff_irq_on(struct ata_port *ap)
{
struct ata_ioports *ioaddr = &ap->ioaddr;
if (ap->ops->sff_irq_on) {
ap->ops->sff_irq_on(ap);
return;
}
ap->ctl &= ~ATA_NIEN;
ap->last_ctl = ap->ctl;
if (ap->ops->sff_set_devctl || ioaddr->ctl_addr)
ata_sff_set_devctl(ap, ap->ctl);
ata_wait_idle(ap);
if (ap->ops->sff_irq_clear)
ap->ops->sff_irq_clear(ap);
}
EXPORT_SYMBOL_GPL(ata_sff_irq_on);
/**
* ata_sff_tf_load - send taskfile registers to host controller
* @ap: Port to which output is sent
* @tf: ATA taskfile register set
*
* Outputs ATA taskfile to standard ATA host controller.
*
* LOCKING:
* Inherited from caller.
*/
void ata_sff_tf_load(struct ata_port *ap, const struct ata_taskfile *tf)
{
struct ata_ioports *ioaddr = &ap->ioaddr;
unsigned int is_addr = tf->flags & ATA_TFLAG_ISADDR;
if (tf->ctl != ap->last_ctl) {
if (ioaddr->ctl_addr) iowrite8(tf->ctl, ioaddr->ctl_addr); ap->last_ctl = tf->ctl;
ata_wait_idle(ap);
}
if (is_addr && (tf->flags & ATA_TFLAG_LBA48)) { WARN_ON_ONCE(!ioaddr->ctl_addr); iowrite8(tf->hob_feature, ioaddr->feature_addr);
iowrite8(tf->hob_nsect, ioaddr->nsect_addr);
iowrite8(tf->hob_lbal, ioaddr->lbal_addr);
iowrite8(tf->hob_lbam, ioaddr->lbam_addr);
iowrite8(tf->hob_lbah, ioaddr->lbah_addr);
VPRINTK("hob: feat 0x%X nsect 0x%X, lba 0x%X 0x%X 0x%X\n",
tf->hob_feature,
tf->hob_nsect,
tf->hob_lbal,
tf->hob_lbam,
tf->hob_lbah);
}
if (is_addr) {
iowrite8(tf->feature, ioaddr->feature_addr);
iowrite8(tf->nsect, ioaddr->nsect_addr);
iowrite8(tf->lbal, ioaddr->lbal_addr);
iowrite8(tf->lbam, ioaddr->lbam_addr);
iowrite8(tf->lbah, ioaddr->lbah_addr);
VPRINTK("feat 0x%X nsect 0x%X lba 0x%X 0x%X 0x%X\n",
tf->feature,
tf->nsect,
tf->lbal,
tf->lbam,
tf->lbah);
}
if (tf->flags & ATA_TFLAG_DEVICE) { iowrite8(tf->device, ioaddr->device_addr);
VPRINTK("device 0x%X\n", tf->device);
}
ata_wait_idle(ap);
}
EXPORT_SYMBOL_GPL(ata_sff_tf_load);
/**
* ata_sff_tf_read - input device's ATA taskfile shadow registers
* @ap: Port from which input is read
* @tf: ATA taskfile register set for storing input
*
* Reads ATA taskfile registers for currently-selected device
* into @tf. Assumes the device has a fully SFF compliant task file
* layout and behaviour. If you device does not (eg has a different
* status method) then you will need to provide a replacement tf_read
*
* LOCKING:
* Inherited from caller.
*/
void ata_sff_tf_read(struct ata_port *ap, struct ata_taskfile *tf)
{
struct ata_ioports *ioaddr = &ap->ioaddr;
tf->command = ata_sff_check_status(ap);
tf->feature = ioread8(ioaddr->error_addr);
tf->nsect = ioread8(ioaddr->nsect_addr);
tf->lbal = ioread8(ioaddr->lbal_addr);
tf->lbam = ioread8(ioaddr->lbam_addr);
tf->lbah = ioread8(ioaddr->lbah_addr);
tf->device = ioread8(ioaddr->device_addr);
if (tf->flags & ATA_TFLAG_LBA48) {
if (likely(ioaddr->ctl_addr)) {
iowrite8(tf->ctl | ATA_HOB, ioaddr->ctl_addr);
tf->hob_feature = ioread8(ioaddr->error_addr);
tf->hob_nsect = ioread8(ioaddr->nsect_addr);
tf->hob_lbal = ioread8(ioaddr->lbal_addr);
tf->hob_lbam = ioread8(ioaddr->lbam_addr);
tf->hob_lbah = ioread8(ioaddr->lbah_addr);
iowrite8(tf->ctl, ioaddr->ctl_addr);
ap->last_ctl = tf->ctl;
} else
WARN_ON_ONCE(1);
}
}
EXPORT_SYMBOL_GPL(ata_sff_tf_read);
/**
* ata_sff_exec_command - issue ATA command to host controller
* @ap: port to which command is being issued
* @tf: ATA taskfile register set
*
* Issues ATA command, with proper synchronization with interrupt
* handler / other threads.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
void ata_sff_exec_command(struct ata_port *ap, const struct ata_taskfile *tf)
{
DPRINTK("ata%u: cmd 0x%X\n", ap->print_id, tf->command);
iowrite8(tf->command, ap->ioaddr.command_addr);
ata_sff_pause(ap);
}
EXPORT_SYMBOL_GPL(ata_sff_exec_command);
/**
* ata_tf_to_host - issue ATA taskfile to host controller
* @ap: port to which command is being issued
* @tf: ATA taskfile register set
*
* Issues ATA taskfile register set to ATA host controller,
* with proper synchronization with interrupt handler and
* other threads.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
static inline void ata_tf_to_host(struct ata_port *ap,
const struct ata_taskfile *tf)
{
ap->ops->sff_tf_load(ap, tf);
ap->ops->sff_exec_command(ap, tf);
}
/**
* ata_sff_data_xfer - Transfer data by PIO
* @qc: queued command
* @buf: data buffer
* @buflen: buffer length
* @rw: read/write
*
* Transfer data from/to the device data register by PIO.
*
* LOCKING:
* Inherited from caller.
*
* RETURNS:
* Bytes consumed.
*/
unsigned int ata_sff_data_xfer(struct ata_queued_cmd *qc, unsigned char *buf,
unsigned int buflen, int rw)
{
struct ata_port *ap = qc->dev->link->ap;
void __iomem *data_addr = ap->ioaddr.data_addr;
unsigned int words = buflen >> 1;
/* Transfer multiple of 2 bytes */
if (rw == READ)
ioread16_rep(data_addr, buf, words);
else
iowrite16_rep(data_addr, buf, words);
/* Transfer trailing byte, if any. */
if (unlikely(buflen & 0x01)) {
unsigned char pad[2] = { };
/* Point buf to the tail of buffer */
buf += buflen - 1;
/*
* Use io*16_rep() accessors here as well to avoid pointlessly
* swapping bytes to and from on the big endian machines...
*/
if (rw == READ) {
ioread16_rep(data_addr, pad, 1);
*buf = pad[0];
} else {
pad[0] = *buf;
iowrite16_rep(data_addr, pad, 1);
}
words++;
}
return words << 1;
}
EXPORT_SYMBOL_GPL(ata_sff_data_xfer);
/**
* ata_sff_data_xfer32 - Transfer data by PIO
* @qc: queued command
* @buf: data buffer
* @buflen: buffer length
* @rw: read/write
*
* Transfer data from/to the device data register by PIO using 32bit
* I/O operations.
*
* LOCKING:
* Inherited from caller.
*
* RETURNS:
* Bytes consumed.
*/
unsigned int ata_sff_data_xfer32(struct ata_queued_cmd *qc, unsigned char *buf,
unsigned int buflen, int rw)
{
struct ata_device *dev = qc->dev;
struct ata_port *ap = dev->link->ap;
void __iomem *data_addr = ap->ioaddr.data_addr;
unsigned int words = buflen >> 2;
int slop = buflen & 3;
if (!(ap->pflags & ATA_PFLAG_PIO32))
return ata_sff_data_xfer(qc, buf, buflen, rw);
/* Transfer multiple of 4 bytes */
if (rw == READ)
ioread32_rep(data_addr, buf, words);
else
iowrite32_rep(data_addr, buf, words);
/* Transfer trailing bytes, if any */
if (unlikely(slop)) {
unsigned char pad[4] = { };
/* Point buf to the tail of buffer */
buf += buflen - slop;
/*
* Use io*_rep() accessors here as well to avoid pointlessly
* swapping bytes to and from on the big endian machines...
*/
if (rw == READ) {
if (slop < 3)
ioread16_rep(data_addr, pad, 1);
else
ioread32_rep(data_addr, pad, 1);
memcpy(buf, pad, slop);
} else {
memcpy(pad, buf, slop);
if (slop < 3)
iowrite16_rep(data_addr, pad, 1);
else
iowrite32_rep(data_addr, pad, 1);
}
}
return (buflen + 1) & ~1;
}
EXPORT_SYMBOL_GPL(ata_sff_data_xfer32);
static void ata_pio_xfer(struct ata_queued_cmd *qc, struct page *page,
unsigned int offset, size_t xfer_size)
{
bool do_write = (qc->tf.flags & ATA_TFLAG_WRITE);
unsigned char *buf;
buf = kmap_atomic(page);
qc->ap->ops->sff_data_xfer(qc, buf + offset, xfer_size, do_write);
kunmap_atomic(buf);
if (!do_write && !PageSlab(page))
flush_dcache_page(page);
}
/**
* ata_pio_sector - Transfer a sector of data.
* @qc: Command on going
*
* Transfer qc->sect_size bytes of data from/to the ATA device.
*
* LOCKING:
* Inherited from caller.
*/
static void ata_pio_sector(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct page *page;
unsigned int offset;
if (!qc->cursg) {
qc->curbytes = qc->nbytes;
return;
}
if (qc->curbytes == qc->nbytes - qc->sect_size)
ap->hsm_task_state = HSM_ST_LAST;
page = sg_page(qc->cursg);
offset = qc->cursg->offset + qc->cursg_ofs;
/* get the current page and offset */
page = nth_page(page, (offset >> PAGE_SHIFT));
offset %= PAGE_SIZE;
DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");
/*
* Split the transfer when it splits a page boundary. Note that the
* split still has to be dword aligned like all ATA data transfers.
*/
WARN_ON_ONCE(offset % 4);
if (offset + qc->sect_size > PAGE_SIZE) {
unsigned int split_len = PAGE_SIZE - offset;
ata_pio_xfer(qc, page, offset, split_len);
ata_pio_xfer(qc, nth_page(page, 1), 0,
qc->sect_size - split_len);
} else {
ata_pio_xfer(qc, page, offset, qc->sect_size);
}
qc->curbytes += qc->sect_size;
qc->cursg_ofs += qc->sect_size;
if (qc->cursg_ofs == qc->cursg->length) {
qc->cursg = sg_next(qc->cursg);
if (!qc->cursg)
ap->hsm_task_state = HSM_ST_LAST;
qc->cursg_ofs = 0;
}
}
/**
* ata_pio_sectors - Transfer one or many sectors.
* @qc: Command on going
*
* Transfer one or many sectors of data from/to the
* ATA device for the DRQ request.
*
* LOCKING:
* Inherited from caller.
*/
static void ata_pio_sectors(struct ata_queued_cmd *qc)
{
if (is_multi_taskfile(&qc->tf)) {
/* READ/WRITE MULTIPLE */
unsigned int nsect;
WARN_ON_ONCE(qc->dev->multi_count == 0);
nsect = min((qc->nbytes - qc->curbytes) / qc->sect_size,
qc->dev->multi_count);
while (nsect--)
ata_pio_sector(qc);
} else
ata_pio_sector(qc);
ata_sff_sync(qc->ap); /* flush */
}
/**
* atapi_send_cdb - Write CDB bytes to hardware
* @ap: Port to which ATAPI device is attached.
* @qc: Taskfile currently active
*
* When device has indicated its readiness to accept
* a CDB, this function is called. Send the CDB.
*
* LOCKING:
* caller.
*/
static void atapi_send_cdb(struct ata_port *ap, struct ata_queued_cmd *qc)
{
/* send SCSI cdb */
DPRINTK("send cdb\n");
WARN_ON_ONCE(qc->dev->cdb_len < 12);
ap->ops->sff_data_xfer(qc, qc->cdb, qc->dev->cdb_len, 1);
ata_sff_sync(ap);
/* FIXME: If the CDB is for DMA do we need to do the transition delay
or is bmdma_start guaranteed to do it ? */
switch (qc->tf.protocol) {
case ATAPI_PROT_PIO:
ap->hsm_task_state = HSM_ST;
break;
case ATAPI_PROT_NODATA:
ap->hsm_task_state = HSM_ST_LAST;
break;
#ifdef CONFIG_ATA_BMDMA
case ATAPI_PROT_DMA:
ap->hsm_task_state = HSM_ST_LAST;
/* initiate bmdma */
ap->ops->bmdma_start(qc);
break;
#endif /* CONFIG_ATA_BMDMA */
default:
BUG();
}
}
/**
* __atapi_pio_bytes - Transfer data from/to the ATAPI device.
* @qc: Command on going
* @bytes: number of bytes
*
* Transfer Transfer data from/to the ATAPI device.
*
* LOCKING:
* Inherited from caller.
*
*/
static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
{
int rw = (qc->tf.flags & ATA_TFLAG_WRITE) ? WRITE : READ;
struct ata_port *ap = qc->ap;
struct ata_device *dev = qc->dev;
struct ata_eh_info *ehi = &dev->link->eh_info;
struct scatterlist *sg;
struct page *page;
unsigned char *buf;
unsigned int offset, count, consumed;
next_sg:
sg = qc->cursg;
if (unlikely(!sg)) {
ata_ehi_push_desc(ehi, "unexpected or too much trailing data "
"buf=%u cur=%u bytes=%u",
qc->nbytes, qc->curbytes, bytes);
return -1;
}
page = sg_page(sg);
offset = sg->offset + qc->cursg_ofs;
/* get the current page and offset */
page = nth_page(page, (offset >> PAGE_SHIFT));
offset %= PAGE_SIZE;
/* don't overrun current sg */
count = min(sg->length - qc->cursg_ofs, bytes);
/* don't cross page boundaries */
count = min(count, (unsigned int)PAGE_SIZE - offset);
DPRINTK("data %s\n", qc->tf.flags & ATA_TFLAG_WRITE ? "write" : "read");
/* do the actual data transfer */
buf = kmap_atomic(page);
consumed = ap->ops->sff_data_xfer(qc, buf + offset, count, rw);
kunmap_atomic(buf);
bytes -= min(bytes, consumed);
qc->curbytes += count;
qc->cursg_ofs += count;
if (qc->cursg_ofs == sg->length) {
qc->cursg = sg_next(qc->cursg);
qc->cursg_ofs = 0;
}
/*
* There used to be a WARN_ON_ONCE(qc->cursg && count != consumed);
* Unfortunately __atapi_pio_bytes doesn't know enough to do the WARN
* check correctly as it doesn't know if it is the last request being
* made. Somebody should implement a proper sanity check.
*/
if (bytes)
goto next_sg;
return 0;
}
/**
* atapi_pio_bytes - Transfer data from/to the ATAPI device.
* @qc: Command on going
*
* Transfer Transfer data from/to the ATAPI device.
*
* LOCKING:
* Inherited from caller.
*/
static void atapi_pio_bytes(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct ata_device *dev = qc->dev;
struct ata_eh_info *ehi = &dev->link->eh_info;
unsigned int ireason, bc_lo, bc_hi, bytes;
int i_write, do_write = (qc->tf.flags & ATA_TFLAG_WRITE) ? 1 : 0;
/* Abuse qc->result_tf for temp storage of intermediate TF
* here to save some kernel stack usage.
* For normal completion, qc->result_tf is not relevant. For
* error, qc->result_tf is later overwritten by ata_qc_complete().
* So, the correctness of qc->result_tf is not affected.
*/
ap->ops->sff_tf_read(ap, &qc->result_tf);
ireason = qc->result_tf.nsect;
bc_lo = qc->result_tf.lbam;
bc_hi = qc->result_tf.lbah;
bytes = (bc_hi << 8) | bc_lo;
/* shall be cleared to zero, indicating xfer of data */
if (unlikely(ireason & ATAPI_COD))
goto atapi_check;
/* make sure transfer direction matches expected */
i_write = ((ireason & ATAPI_IO) == 0) ? 1 : 0;
if (unlikely(do_write != i_write))
goto atapi_check;
if (unlikely(!bytes))
goto atapi_check;
VPRINTK("ata%u: xfering %d bytes\n", ap->print_id, bytes);
if (unlikely(__atapi_pio_bytes(qc, bytes)))
goto err_out;
ata_sff_sync(ap); /* flush */
return;
atapi_check:
ata_ehi_push_desc(ehi, "ATAPI check failed (ireason=0x%x bytes=%u)",
ireason, bytes);
err_out:
qc->err_mask |= AC_ERR_HSM;
ap->hsm_task_state = HSM_ST_ERR;
}
/**
* ata_hsm_ok_in_wq - Check if the qc can be handled in the workqueue.
* @ap: the target ata_port
* @qc: qc on going
*
* RETURNS:
* 1 if ok in workqueue, 0 otherwise.
*/
static inline int ata_hsm_ok_in_wq(struct ata_port *ap,
struct ata_queued_cmd *qc)
{
if (qc->tf.flags & ATA_TFLAG_POLLING)
return 1;
if (ap->hsm_task_state == HSM_ST_FIRST) {
if (qc->tf.protocol == ATA_PROT_PIO &&
(qc->tf.flags & ATA_TFLAG_WRITE))
return 1;
if (ata_is_atapi(qc->tf.protocol) &&
!(qc->dev->flags & ATA_DFLAG_CDB_INTR))
return 1;
}
return 0;
}
/**
* ata_hsm_qc_complete - finish a qc running on standard HSM
* @qc: Command to complete
* @in_wq: 1 if called from workqueue, 0 otherwise
*
* Finish @qc which is running on standard HSM.
*
* LOCKING:
* If @in_wq is zero, spin_lock_irqsave(host lock).
* Otherwise, none on entry and grabs host lock.
*/
static void ata_hsm_qc_complete(struct ata_queued_cmd *qc, int in_wq)
{
struct ata_port *ap = qc->ap;
if (ap->ops->error_handler) {
if (in_wq) {
/* EH might have kicked in while host lock is
* released.
*/
qc = ata_qc_from_tag(ap, qc->tag);
if (qc) {
if (likely(!(qc->err_mask & AC_ERR_HSM))) {
ata_sff_irq_on(ap);
ata_qc_complete(qc);
} else
ata_port_freeze(ap);
}
} else {
if (likely(!(qc->err_mask & AC_ERR_HSM)))
ata_qc_complete(qc);
else
ata_port_freeze(ap);
}
} else {
if (in_wq) {
ata_sff_irq_on(ap);
ata_qc_complete(qc);
} else
ata_qc_complete(qc);
}
}
/**
* ata_sff_hsm_move - move the HSM to the next state.
* @ap: the target ata_port
* @qc: qc on going
* @status: current device status
* @in_wq: 1 if called from workqueue, 0 otherwise
*
* RETURNS:
* 1 when poll next status needed, 0 otherwise.
*/
int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc,
u8 status, int in_wq)
{
struct ata_link *link = qc->dev->link;
struct ata_eh_info *ehi = &link->eh_info;
int poll_next;
lockdep_assert_held(ap->lock);
WARN_ON_ONCE((qc->flags & ATA_QCFLAG_ACTIVE) == 0);
/* Make sure ata_sff_qc_issue() does not throw things
* like DMA polling into the workqueue. Notice that
* in_wq is not equivalent to (qc->tf.flags & ATA_TFLAG_POLLING).
*/
WARN_ON_ONCE(in_wq != ata_hsm_ok_in_wq(ap, qc));
fsm_start:
DPRINTK("ata%u: protocol %d task_state %d (dev_stat 0x%X)\n",
ap->print_id, qc->tf.protocol, ap->hsm_task_state, status);
switch (ap->hsm_task_state) {
case HSM_ST_FIRST:
/* Send first data block or PACKET CDB */
/* If polling, we will stay in the work queue after
* sending the data. Otherwise, interrupt handler
* takes over after sending the data.
*/
poll_next = (qc->tf.flags & ATA_TFLAG_POLLING);
/* check device status */
if (unlikely((status & ATA_DRQ) == 0)) {
/* handle BSY=0, DRQ=0 as error */
if (likely(status & (ATA_ERR | ATA_DF)))
/* device stops HSM for abort/error */
qc->err_mask |= AC_ERR_DEV;
else {
/* HSM violation. Let EH handle this */
ata_ehi_push_desc(ehi,
"ST_FIRST: !(DRQ|ERR|DF)");
qc->err_mask |= AC_ERR_HSM;
}
ap->hsm_task_state = HSM_ST_ERR;
goto fsm_start;
}
/* Device should not ask for data transfer (DRQ=1)
* when it finds something wrong.
* We ignore DRQ here and stop the HSM by
* changing hsm_task_state to HSM_ST_ERR and
* let the EH abort the command or reset the device.
*/
if (unlikely(status & (ATA_ERR | ATA_DF))) {
/* Some ATAPI tape drives forget to clear the ERR bit
* when doing the next command (mostly request sense).
* We ignore ERR here to workaround and proceed sending
* the CDB.
*/
if (!(qc->dev->horkage & ATA_HORKAGE_STUCK_ERR)) {
ata_ehi_push_desc(ehi, "ST_FIRST: "
"DRQ=1 with device error, "
"dev_stat 0x%X", status);
qc->err_mask |= AC_ERR_HSM;
ap->hsm_task_state = HSM_ST_ERR;
goto fsm_start;
}
}
if (qc->tf.protocol == ATA_PROT_PIO) {
/* PIO data out protocol.
* send first data block.
*/
/* ata_pio_sectors() might change the state
* to HSM_ST_LAST. so, the state is changed here
* before ata_pio_sectors().
*/
ap->hsm_task_state = HSM_ST;
ata_pio_sectors(qc);
} else
/* send CDB */
atapi_send_cdb(ap, qc);
/* if polling, ata_sff_pio_task() handles the rest.
* otherwise, interrupt handler takes over from here.
*/
break;
case HSM_ST:
/* complete command or read/write the data register */
if (qc->tf.protocol == ATAPI_PROT_PIO) {
/* ATAPI PIO protocol */
if ((status & ATA_DRQ) == 0) {
/* No more data to transfer or device error.
* Device error will be tagged in HSM_ST_LAST.
*/
ap->hsm_task_state = HSM_ST_LAST;
goto fsm_start;
}
/* Device should not ask for data transfer (DRQ=1)
* when it finds something wrong.
* We ignore DRQ here and stop the HSM by
* changing hsm_task_state to HSM_ST_ERR and
* let the EH abort the command or reset the device.
*/
if (unlikely(status & (ATA_ERR | ATA_DF))) {
ata_ehi_push_desc(ehi, "ST-ATAPI: "
"DRQ=1 with device error, "
"dev_stat 0x%X", status);
qc->err_mask |= AC_ERR_HSM;
ap->hsm_task_state = HSM_ST_ERR;
goto fsm_start;
}
atapi_pio_bytes(qc);
if (unlikely(ap->hsm_task_state == HSM_ST_ERR))
/* bad ireason reported by device */
goto fsm_start;
} else {
/* ATA PIO protocol */
if (unlikely((status & ATA_DRQ) == 0)) {
/* handle BSY=0, DRQ=0 as error */
if (likely(status & (ATA_ERR | ATA_DF))) {
/* device stops HSM for abort/error */
qc->err_mask |= AC_ERR_DEV;
/* If diagnostic failed and this is
* IDENTIFY, it's likely a phantom
* device. Mark hint.
*/
if (qc->dev->horkage &
ATA_HORKAGE_DIAGNOSTIC)
qc->err_mask |=
AC_ERR_NODEV_HINT;
} else {
/* HSM violation. Let EH handle this.
* Phantom devices also trigger this
* condition. Mark hint.
*/
ata_ehi_push_desc(ehi, "ST-ATA: "
"DRQ=0 without device error, "
"dev_stat 0x%X", status);
qc->err_mask |= AC_ERR_HSM |
AC_ERR_NODEV_HINT;
}
ap->hsm_task_state = HSM_ST_ERR;
goto fsm_start;
}
/* For PIO reads, some devices may ask for
* data transfer (DRQ=1) alone with ERR=1.
* We respect DRQ here and transfer one
* block of junk data before changing the
* hsm_task_state to HSM_ST_ERR.
*
* For PIO writes, ERR=1 DRQ=1 doesn't make
* sense since the data block has been
* transferred to the device.
*/
if (unlikely(status & (ATA_ERR | ATA_DF))) {
/* data might be corrputed */
qc->err_mask |= AC_ERR_DEV;
if (!(qc->tf.flags & ATA_TFLAG_WRITE)) {
ata_pio_sectors(qc);
status = ata_wait_idle(ap);
}
if (status & (ATA_BUSY | ATA_DRQ)) {
ata_ehi_push_desc(ehi, "ST-ATA: "
"BUSY|DRQ persists on ERR|DF, "
"dev_stat 0x%X", status);
qc->err_mask |= AC_ERR_HSM;
}
/* There are oddball controllers with
* status register stuck at 0x7f and
* lbal/m/h at zero which makes it
* pass all other presence detection
* mechanisms we have. Set NODEV_HINT
* for it. Kernel bz#7241.
*/
if (status == 0x7f)
qc->err_mask |= AC_ERR_NODEV_HINT;
/* ata_pio_sectors() might change the
* state to HSM_ST_LAST. so, the state
* is changed after ata_pio_sectors().
*/
ap->hsm_task_state = HSM_ST_ERR;
goto fsm_start;
}
ata_pio_sectors(qc);
if (ap->hsm_task_state == HSM_ST_LAST &&
(!(qc->tf.flags & ATA_TFLAG_WRITE))) {
/* all data read */
status = ata_wait_idle(ap);
goto fsm_start;
}
}
poll_next = 1;
break;
case HSM_ST_LAST:
if (unlikely(!ata_ok(status))) {
qc->err_mask |= __ac_err_mask(status);
ap->hsm_task_state = HSM_ST_ERR;
goto fsm_start;
}
/* no more data to transfer */
DPRINTK("ata%u: dev %u command complete, drv_stat 0x%x\n",
ap->print_id, qc->dev->devno, status);
WARN_ON_ONCE(qc->err_mask & (AC_ERR_DEV | AC_ERR_HSM));
ap->hsm_task_state = HSM_ST_IDLE;
/* complete taskfile transaction */
ata_hsm_qc_complete(qc, in_wq);
poll_next = 0;
break;
case HSM_ST_ERR:
ap->hsm_task_state = HSM_ST_IDLE;
/* complete taskfile transaction */
ata_hsm_qc_complete(qc, in_wq);
poll_next = 0;
break;
default:
poll_next = 0;
WARN(true, "ata%d: SFF host state machine in invalid state %d",
ap->print_id, ap->hsm_task_state);
}
return poll_next;
}
EXPORT_SYMBOL_GPL(ata_sff_hsm_move);
void ata_sff_queue_work(struct work_struct *work)
{
queue_work(ata_sff_wq, work);
}
EXPORT_SYMBOL_GPL(ata_sff_queue_work);
void ata_sff_queue_delayed_work(struct delayed_work *dwork, unsigned long delay)
{
queue_delayed_work(ata_sff_wq, dwork, delay);
}
EXPORT_SYMBOL_GPL(ata_sff_queue_delayed_work);
void ata_sff_queue_pio_task(struct ata_link *link, unsigned long delay)
{
struct ata_port *ap = link->ap;
WARN_ON((ap->sff_pio_task_link != NULL) &&
(ap->sff_pio_task_link != link));
ap->sff_pio_task_link = link;
/* may fail if ata_sff_flush_pio_task() in progress */
ata_sff_queue_delayed_work(&ap->sff_pio_task, msecs_to_jiffies(delay));
}
EXPORT_SYMBOL_GPL(ata_sff_queue_pio_task);
void ata_sff_flush_pio_task(struct ata_port *ap)
{
DPRINTK("ENTER\n");
cancel_delayed_work_sync(&ap->sff_pio_task);
/*
* We wanna reset the HSM state to IDLE. If we do so without
* grabbing the port lock, critical sections protected by it which
* expect the HSM state to stay stable may get surprised. For
* example, we may set IDLE in between the time
* __ata_sff_port_intr() checks for HSM_ST_IDLE and before it calls
* ata_sff_hsm_move() causing ata_sff_hsm_move() to BUG().
*/
spin_lock_irq(ap->lock);
ap->hsm_task_state = HSM_ST_IDLE;
spin_unlock_irq(ap->lock);
ap->sff_pio_task_link = NULL;
if (ata_msg_ctl(ap))
ata_port_dbg(ap, "%s: EXIT\n", __func__);
}
static void ata_sff_pio_task(struct work_struct *work)
{
struct ata_port *ap =
container_of(work, struct ata_port, sff_pio_task.work);
struct ata_link *link = ap->sff_pio_task_link;
struct ata_queued_cmd *qc;
u8 status;
int poll_next;
spin_lock_irq(ap->lock);
BUG_ON(ap->sff_pio_task_link == NULL);
/* qc can be NULL if timeout occurred */
qc = ata_qc_from_tag(ap, link->active_tag);
if (!qc) {
ap->sff_pio_task_link = NULL;
goto out_unlock;
}
fsm_start:
WARN_ON_ONCE(ap->hsm_task_state == HSM_ST_IDLE);
/*
* This is purely heuristic. This is a fast path.
* Sometimes when we enter, BSY will be cleared in
* a chk-status or two. If not, the drive is probably seeking
* or something. Snooze for a couple msecs, then
* chk-status again. If still busy, queue delayed work.
*/
status = ata_sff_busy_wait(ap, ATA_BUSY, 5);
if (status & ATA_BUSY) {
spin_unlock_irq(ap->lock);
ata_msleep(ap, 2);
spin_lock_irq(ap->lock);
status = ata_sff_busy_wait(ap, ATA_BUSY, 10);
if (status & ATA_BUSY) {
ata_sff_queue_pio_task(link, ATA_SHORT_PAUSE);
goto out_unlock;
}
}
/*
* hsm_move() may trigger another command to be processed.
* clean the link beforehand.
*/
ap->sff_pio_task_link = NULL;
/* move the HSM */
poll_next = ata_sff_hsm_move(ap, qc, status, 1);
/* another command or interrupt handler
* may be running at this point.
*/
if (poll_next)
goto fsm_start;
out_unlock:
spin_unlock_irq(ap->lock);
}
/**
* ata_sff_qc_issue - issue taskfile to a SFF controller
* @qc: command to issue to device
*
* This function issues a PIO or NODATA command to a SFF
* controller.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*
* RETURNS:
* Zero on success, AC_ERR_* mask on failure
*/
unsigned int ata_sff_qc_issue(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct ata_link *link = qc->dev->link;
/* Use polling pio if the LLD doesn't handle
* interrupt driven pio and atapi CDB interrupt.
*/
if (ap->flags & ATA_FLAG_PIO_POLLING)
qc->tf.flags |= ATA_TFLAG_POLLING;
/* select the device */
ata_dev_select(ap, qc->dev->devno, 1, 0);
/* start the command */
switch (qc->tf.protocol) {
case ATA_PROT_NODATA:
if (qc->tf.flags & ATA_TFLAG_POLLING)
ata_qc_set_polling(qc);
ata_tf_to_host(ap, &qc->tf);
ap->hsm_task_state = HSM_ST_LAST;
if (qc->tf.flags & ATA_TFLAG_POLLING)
ata_sff_queue_pio_task(link, 0);
break;
case ATA_PROT_PIO:
if (qc->tf.flags & ATA_TFLAG_POLLING)
ata_qc_set_polling(qc);
ata_tf_to_host(ap, &qc->tf);
if (qc->tf.flags & ATA_TFLAG_WRITE) {
/* PIO data out protocol */
ap->hsm_task_state = HSM_ST_FIRST;
ata_sff_queue_pio_task(link, 0);
/* always send first data block using the
* ata_sff_pio_task() codepath.
*/
} else {
/* PIO data in protocol */
ap->hsm_task_state = HSM_ST;
if (qc->tf.flags & ATA_TFLAG_POLLING)
ata_sff_queue_pio_task(link, 0);
/* if polling, ata_sff_pio_task() handles the
* rest. otherwise, interrupt handler takes
* over from here.
*/
}
break;
case ATAPI_PROT_PIO:
case ATAPI_PROT_NODATA:
if (qc->tf.flags & ATA_TFLAG_POLLING)
ata_qc_set_polling(qc);
ata_tf_to_host(ap, &qc->tf);
ap->hsm_task_state = HSM_ST_FIRST;
/* send cdb by polling if no cdb interrupt */
if ((!(qc->dev->flags & ATA_DFLAG_CDB_INTR)) ||
(qc->tf.flags & ATA_TFLAG_POLLING)) ata_sff_queue_pio_task(link, 0);
break;
default:
return AC_ERR_SYSTEM;
}
return 0;
}
EXPORT_SYMBOL_GPL(ata_sff_qc_issue);
/**
* ata_sff_qc_fill_rtf - fill result TF using ->sff_tf_read
* @qc: qc to fill result TF for
*
* @qc is finished and result TF needs to be filled. Fill it
* using ->sff_tf_read.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*
* RETURNS:
* true indicating that result TF is successfully filled.
*/
bool ata_sff_qc_fill_rtf(struct ata_queued_cmd *qc)
{
qc->ap->ops->sff_tf_read(qc->ap, &qc->result_tf);
return true;
}
EXPORT_SYMBOL_GPL(ata_sff_qc_fill_rtf);
static unsigned int ata_sff_idle_irq(struct ata_port *ap)
{
ap->stats.idle_irq++;
#ifdef ATA_IRQ_TRAP
if ((ap->stats.idle_irq % 1000) == 0) {
ap->ops->sff_check_status(ap);
if (ap->ops->sff_irq_clear)
ap->ops->sff_irq_clear(ap);
ata_port_warn(ap, "irq trap\n");
return 1;
}
#endif
return 0; /* irq not handled */
}
static unsigned int __ata_sff_port_intr(struct ata_port *ap,
struct ata_queued_cmd *qc,
bool hsmv_on_idle)
{
u8 status;
VPRINTK("ata%u: protocol %d task_state %d\n",
ap->print_id, qc->tf.protocol, ap->hsm_task_state);
/* Check whether we are expecting interrupt in this state */
switch (ap->hsm_task_state) {
case HSM_ST_FIRST:
/* Some pre-ATAPI-4 devices assert INTRQ
* at this state when ready to receive CDB.
*/
/* Check the ATA_DFLAG_CDB_INTR flag is enough here.
* The flag was turned on only for atapi devices. No
* need to check ata_is_atapi(qc->tf.protocol) again.
*/
if (!(qc->dev->flags & ATA_DFLAG_CDB_INTR))
return ata_sff_idle_irq(ap);
break;
case HSM_ST_IDLE:
return ata_sff_idle_irq(ap);
default:
break;
}
/* check main status, clearing INTRQ if needed */
status = ata_sff_irq_status(ap);
if (status & ATA_BUSY) {
if (hsmv_on_idle) {
/* BMDMA engine is already stopped, we're screwed */
qc->err_mask |= AC_ERR_HSM;
ap->hsm_task_state = HSM_ST_ERR;
} else
return ata_sff_idle_irq(ap);
}
/* clear irq events */
if (ap->ops->sff_irq_clear)
ap->ops->sff_irq_clear(ap);
ata_sff_hsm_move(ap, qc, status, 0);
return 1; /* irq handled */
}
/**
* ata_sff_port_intr - Handle SFF port interrupt
* @ap: Port on which interrupt arrived (possibly...)
* @qc: Taskfile currently active in engine
*
* Handle port interrupt for given queued command.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*
* RETURNS:
* One if interrupt was handled, zero if not (shared irq).
*/
unsigned int ata_sff_port_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
{
return __ata_sff_port_intr(ap, qc, false);
}
EXPORT_SYMBOL_GPL(ata_sff_port_intr);
static inline irqreturn_t __ata_sff_interrupt(int irq, void *dev_instance,
unsigned int (*port_intr)(struct ata_port *, struct ata_queued_cmd *))
{
struct ata_host *host = dev_instance;
bool retried = false;
unsigned int i;
unsigned int handled, idle, polling;
unsigned long flags;
/* TODO: make _irqsave conditional on x86 PCI IDE legacy mode */
spin_lock_irqsave(&host->lock, flags);
retry:
handled = idle = polling = 0;
for (i = 0; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
struct ata_queued_cmd *qc;
qc = ata_qc_from_tag(ap, ap->link.active_tag);
if (qc) {
if (!(qc->tf.flags & ATA_TFLAG_POLLING))
handled |= port_intr(ap, qc);
else
polling |= 1 << i;
} else
idle |= 1 << i;
}
/*
* If no port was expecting IRQ but the controller is actually
* asserting IRQ line, nobody cared will ensue. Check IRQ
* pending status if available and clear spurious IRQ.
*/
if (!handled && !retried) {
bool retry = false;
for (i = 0; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
if (polling & (1 << i))
continue;
if (!ap->ops->sff_irq_check ||
!ap->ops->sff_irq_check(ap))
continue;
if (idle & (1 << i)) {
ap->ops->sff_check_status(ap);
if (ap->ops->sff_irq_clear)
ap->ops->sff_irq_clear(ap);
} else {
/* clear INTRQ and check if BUSY cleared */
if (!(ap->ops->sff_check_status(ap) & ATA_BUSY))
retry |= true;
/*
* With command in flight, we can't do
* sff_irq_clear() w/o racing with completion.
*/
}
}
if (retry) {
retried = true;
goto retry;
}
}
spin_unlock_irqrestore(&host->lock, flags);
return IRQ_RETVAL(handled);
}
/**
* ata_sff_interrupt - Default SFF ATA host interrupt handler
* @irq: irq line (unused)
* @dev_instance: pointer to our ata_host information structure
*
* Default interrupt handler for PCI IDE devices. Calls
* ata_sff_port_intr() for each port that is not disabled.
*
* LOCKING:
* Obtains host lock during operation.
*
* RETURNS:
* IRQ_NONE or IRQ_HANDLED.
*/
irqreturn_t ata_sff_interrupt(int irq, void *dev_instance)
{
return __ata_sff_interrupt(irq, dev_instance, ata_sff_port_intr);
}
EXPORT_SYMBOL_GPL(ata_sff_interrupt);
/**
* ata_sff_lost_interrupt - Check for an apparent lost interrupt
* @ap: port that appears to have timed out
*
* Called from the libata error handlers when the core code suspects
* an interrupt has been lost. If it has complete anything we can and
* then return. Interface must support altstatus for this faster
* recovery to occur.
*
* Locking:
* Caller holds host lock
*/
void ata_sff_lost_interrupt(struct ata_port *ap)
{
u8 status;
struct ata_queued_cmd *qc;
/* Only one outstanding command per SFF channel */
qc = ata_qc_from_tag(ap, ap->link.active_tag);
/* We cannot lose an interrupt on a non-existent or polled command */
if (!qc || qc->tf.flags & ATA_TFLAG_POLLING)
return;
/* See if the controller thinks it is still busy - if so the command
isn't a lost IRQ but is still in progress */
status = ata_sff_altstatus(ap);
if (status & ATA_BUSY)
return;
/* There was a command running, we are no longer busy and we have
no interrupt. */
ata_port_warn(ap, "lost interrupt (Status 0x%x)\n",
status);
/* Run the host interrupt logic as if the interrupt had not been
lost */
ata_sff_port_intr(ap, qc);
}
EXPORT_SYMBOL_GPL(ata_sff_lost_interrupt);
/**
* ata_sff_freeze - Freeze SFF controller port
* @ap: port to freeze
*
* Freeze SFF controller port.
*
* LOCKING:
* Inherited from caller.
*/
void ata_sff_freeze(struct ata_port *ap)
{
ap->ctl |= ATA_NIEN;
ap->last_ctl = ap->ctl;
if (ap->ops->sff_set_devctl || ap->ioaddr.ctl_addr)
ata_sff_set_devctl(ap, ap->ctl);
/* Under certain circumstances, some controllers raise IRQ on
* ATA_NIEN manipulation. Also, many controllers fail to mask
* previously pending IRQ on ATA_NIEN assertion. Clear it.
*/
ap->ops->sff_check_status(ap);
if (ap->ops->sff_irq_clear)
ap->ops->sff_irq_clear(ap);
}
EXPORT_SYMBOL_GPL(ata_sff_freeze);
/**
* ata_sff_thaw - Thaw SFF controller port
* @ap: port to thaw
*
* Thaw SFF controller port.
*
* LOCKING:
* Inherited from caller.
*/
void ata_sff_thaw(struct ata_port *ap)
{
/* clear & re-enable interrupts */
ap->ops->sff_check_status(ap);
if (ap->ops->sff_irq_clear)
ap->ops->sff_irq_clear(ap);
ata_sff_irq_on(ap);
}
EXPORT_SYMBOL_GPL(ata_sff_thaw);
/**
* ata_sff_prereset - prepare SFF link for reset
* @link: SFF link to be reset
* @deadline: deadline jiffies for the operation
*
* SFF link @link is about to be reset. Initialize it. It first
* calls ata_std_prereset() and wait for !BSY if the port is
* being softreset.
*
* LOCKING:
* Kernel thread context (may sleep)
*
* RETURNS:
* 0 on success, -errno otherwise.
*/
int ata_sff_prereset(struct ata_link *link, unsigned long deadline)
{
struct ata_eh_context *ehc = &link->eh_context;
int rc;
rc = ata_std_prereset(link, deadline);
if (rc)
return rc;
/* if we're about to do hardreset, nothing more to do */
if (ehc->i.action & ATA_EH_HARDRESET)
return 0;
/* wait for !BSY if we don't know that no device is attached */
if (!ata_link_offline(link)) {
rc = ata_sff_wait_ready(link, deadline);
if (rc && rc != -ENODEV) {
ata_link_warn(link,
"device not ready (errno=%d), forcing hardreset\n",
rc);
ehc->i.action |= ATA_EH_HARDRESET;
}
}
return 0;
}
EXPORT_SYMBOL_GPL(ata_sff_prereset);
/**
* ata_devchk - PATA device presence detection
* @ap: ATA channel to examine
* @device: Device to examine (starting at zero)
*
* This technique was originally described in
* Hale Landis's ATADRVR (www.ata-atapi.com), and
* later found its way into the ATA/ATAPI spec.
*
* Write a pattern to the ATA shadow registers,
* and if a device is present, it will respond by
* correctly storing and echoing back the
* ATA shadow register contents.
*
* LOCKING:
* caller.
*/
static unsigned int ata_devchk(struct ata_port *ap, unsigned int device)
{
struct ata_ioports *ioaddr = &ap->ioaddr;
u8 nsect, lbal;
ap->ops->sff_dev_select(ap, device);
iowrite8(0x55, ioaddr->nsect_addr);
iowrite8(0xaa, ioaddr->lbal_addr);
iowrite8(0xaa, ioaddr->nsect_addr);
iowrite8(0x55, ioaddr->lbal_addr);
iowrite8(0x55, ioaddr->nsect_addr);
iowrite8(0xaa, ioaddr->lbal_addr);
nsect = ioread8(ioaddr->nsect_addr);
lbal = ioread8(ioaddr->lbal_addr);
if ((nsect == 0x55) && (lbal == 0xaa))
return 1; /* we found a device */
return 0; /* nothing found */
}
/**
* ata_sff_dev_classify - Parse returned ATA device signature
* @dev: ATA device to classify (starting at zero)
* @present: device seems present
* @r_err: Value of error register on completion
*
* After an event -- SRST, E.D.D., or SATA COMRESET -- occurs,
* an ATA/ATAPI-defined set of values is placed in the ATA
* shadow registers, indicating the results of device detection
* and diagnostics.
*
* Select the ATA device, and read the values from the ATA shadow
* registers. Then parse according to the Error register value,
* and the spec-defined values examined by ata_dev_classify().
*
* LOCKING:
* caller.
*
* RETURNS:
* Device type - %ATA_DEV_ATA, %ATA_DEV_ATAPI or %ATA_DEV_NONE.
*/
unsigned int ata_sff_dev_classify(struct ata_device *dev, int present,
u8 *r_err)
{
struct ata_port *ap = dev->link->ap;
struct ata_taskfile tf;
unsigned int class;
u8 err;
ap->ops->sff_dev_select(ap, dev->devno);
memset(&tf, 0, sizeof(tf));
ap->ops->sff_tf_read(ap, &tf);
err = tf.feature;
if (r_err)
*r_err = err;
/* see if device passed diags: continue and warn later */
if (err == 0)
/* diagnostic fail : do nothing _YET_ */
dev->horkage |= ATA_HORKAGE_DIAGNOSTIC;
else if (err == 1)
/* do nothing */ ;
else if ((dev->devno == 0) && (err == 0x81))
/* do nothing */ ;
else
return ATA_DEV_NONE;
/* determine if device is ATA or ATAPI */
class = ata_dev_classify(&tf);
if (class == ATA_DEV_UNKNOWN) {
/* If the device failed diagnostic, it's likely to
* have reported incorrect device signature too.
* Assume ATA device if the device seems present but
* device signature is invalid with diagnostic
* failure.
*/
if (present && (dev->horkage & ATA_HORKAGE_DIAGNOSTIC))
class = ATA_DEV_ATA;
else
class = ATA_DEV_NONE;
} else if ((class == ATA_DEV_ATA) &&
(ap->ops->sff_check_status(ap) == 0))
class = ATA_DEV_NONE;
return class;
}
EXPORT_SYMBOL_GPL(ata_sff_dev_classify);
/**
* ata_sff_wait_after_reset - wait for devices to become ready after reset
* @link: SFF link which is just reset
* @devmask: mask of present devices
* @deadline: deadline jiffies for the operation
*
* Wait devices attached to SFF @link to become ready after
* reset. It contains preceding 150ms wait to avoid accessing TF
* status register too early.
*
* LOCKING:
* Kernel thread context (may sleep).
*
* RETURNS:
* 0 on success, -ENODEV if some or all of devices in @devmask
* don't seem to exist. -errno on other errors.
*/
int ata_sff_wait_after_reset(struct ata_link *link, unsigned int devmask,
unsigned long deadline)
{
struct ata_port *ap = link->ap;
struct ata_ioports *ioaddr = &ap->ioaddr;
unsigned int dev0 = devmask & (1 << 0);
unsigned int dev1 = devmask & (1 << 1);
int rc, ret = 0;
ata_msleep(ap, ATA_WAIT_AFTER_RESET);
/* always check readiness of the master device */
rc = ata_sff_wait_ready(link, deadline);
/* -ENODEV means the odd clown forgot the D7 pulldown resistor
* and TF status is 0xff, bail out on it too.
*/
if (rc)
return rc;
/* if device 1 was found in ata_devchk, wait for register
* access briefly, then wait for BSY to clear.
*/
if (dev1) {
int i;
ap->ops->sff_dev_select(ap, 1);
/* Wait for register access. Some ATAPI devices fail
* to set nsect/lbal after reset, so don't waste too
* much time on it. We're gonna wait for !BSY anyway.
*/
for (i = 0; i < 2; i++) {
u8 nsect, lbal;
nsect = ioread8(ioaddr->nsect_addr);
lbal = ioread8(ioaddr->lbal_addr);
if ((nsect == 1) && (lbal == 1))
break;
ata_msleep(ap, 50); /* give drive a breather */
}
rc = ata_sff_wait_ready(link, deadline);
if (rc) {
if (rc != -ENODEV)
return rc;
ret = rc;
}
}
/* is all this really necessary? */
ap->ops->sff_dev_select(ap, 0);
if (dev1)
ap->ops->sff_dev_select(ap, 1);
if (dev0)
ap->ops->sff_dev_select(ap, 0);
return ret;
}
EXPORT_SYMBOL_GPL(ata_sff_wait_after_reset);
static int ata_bus_softreset(struct ata_port *ap, unsigned int devmask,
unsigned long deadline)
{
struct ata_ioports *ioaddr = &ap->ioaddr;
DPRINTK("ata%u: bus reset via SRST\n", ap->print_id);
if (ap->ioaddr.ctl_addr) {
/* software reset. causes dev0 to be selected */
iowrite8(ap->ctl, ioaddr->ctl_addr);
udelay(20); /* FIXME: flush */
iowrite8(ap->ctl | ATA_SRST, ioaddr->ctl_addr);
udelay(20); /* FIXME: flush */
iowrite8(ap->ctl, ioaddr->ctl_addr);
ap->last_ctl = ap->ctl;
}
/* wait the port to become ready */
return ata_sff_wait_after_reset(&ap->link, devmask, deadline);
}
/**
* ata_sff_softreset - reset host port via ATA SRST
* @link: ATA link to reset
* @classes: resulting classes of attached devices
* @deadline: deadline jiffies for the operation
*
* Reset host port using ATA SRST.
*
* LOCKING:
* Kernel thread context (may sleep)
*
* RETURNS:
* 0 on success, -errno otherwise.
*/
int ata_sff_softreset(struct ata_link *link, unsigned int *classes,
unsigned long deadline)
{
struct ata_port *ap = link->ap;
unsigned int slave_possible = ap->flags & ATA_FLAG_SLAVE_POSS;
unsigned int devmask = 0;
int rc;
u8 err;
DPRINTK("ENTER\n");
/* determine if device 0/1 are present */
if (ata_devchk(ap, 0))
devmask |= (1 << 0);
if (slave_possible && ata_devchk(ap, 1))
devmask |= (1 << 1);
/* select device 0 again */
ap->ops->sff_dev_select(ap, 0);
/* issue bus reset */
DPRINTK("about to softreset, devmask=%x\n", devmask);
rc = ata_bus_softreset(ap, devmask, deadline);
/* if link is occupied, -ENODEV too is an error */
if (rc && (rc != -ENODEV || sata_scr_valid(link))) {
ata_link_err(link, "SRST failed (errno=%d)\n", rc);
return rc;
}
/* determine by signature whether we have ATA or ATAPI devices */
classes[0] = ata_sff_dev_classify(&link->device[0],
devmask & (1 << 0), &err);
if (slave_possible && err != 0x81)
classes[1] = ata_sff_dev_classify(&link->device[1],
devmask & (1 << 1), &err);
DPRINTK("EXIT, classes[0]=%u [1]=%u\n", classes[0], classes[1]);
return 0;
}
EXPORT_SYMBOL_GPL(ata_sff_softreset);
/**
* sata_sff_hardreset - reset host port via SATA phy reset
* @link: link to reset
* @class: resulting class of attached device
* @deadline: deadline jiffies for the operation
*
* SATA phy-reset host port using DET bits of SControl register,
* wait for !BSY and classify the attached device.
*
* LOCKING:
* Kernel thread context (may sleep)
*
* RETURNS:
* 0 on success, -errno otherwise.
*/
int sata_sff_hardreset(struct ata_link *link, unsigned int *class,
unsigned long deadline)
{
struct ata_eh_context *ehc = &link->eh_context;
const unsigned long *timing = sata_ehc_deb_timing(ehc);
bool online;
int rc;
rc = sata_link_hardreset(link, timing, deadline, &online,
ata_sff_check_ready);
if (online)
*class = ata_sff_dev_classify(link->device, 1, NULL);
DPRINTK("EXIT, class=%u\n", *class);
return rc;
}
EXPORT_SYMBOL_GPL(sata_sff_hardreset);
/**
* ata_sff_postreset - SFF postreset callback
* @link: the target SFF ata_link
* @classes: classes of attached devices
*
* This function is invoked after a successful reset. It first
* calls ata_std_postreset() and performs SFF specific postreset
* processing.
*
* LOCKING:
* Kernel thread context (may sleep)
*/
void ata_sff_postreset(struct ata_link *link, unsigned int *classes)
{
struct ata_port *ap = link->ap;
ata_std_postreset(link, classes);
/* is double-select really necessary? */
if (classes[0] != ATA_DEV_NONE)
ap->ops->sff_dev_select(ap, 1);
if (classes[1] != ATA_DEV_NONE)
ap->ops->sff_dev_select(ap, 0);
/* bail out if no device is present */
if (classes[0] == ATA_DEV_NONE && classes[1] == ATA_DEV_NONE) {
DPRINTK("EXIT, no device\n");
return;
}
/* set up device control */
if (ap->ops->sff_set_devctl || ap->ioaddr.ctl_addr) {
ata_sff_set_devctl(ap, ap->ctl);
ap->last_ctl = ap->ctl;
}
}
EXPORT_SYMBOL_GPL(ata_sff_postreset);
/**
* ata_sff_drain_fifo - Stock FIFO drain logic for SFF controllers
* @qc: command
*
* Drain the FIFO and device of any stuck data following a command
* failing to complete. In some cases this is necessary before a
* reset will recover the device.
*
*/
void ata_sff_drain_fifo(struct ata_queued_cmd *qc)
{
int count;
struct ata_port *ap;
/* We only need to flush incoming data when a command was running */
if (qc == NULL || qc->dma_dir == DMA_TO_DEVICE)
return;
ap = qc->ap;
/* Drain up to 64K of data before we give up this recovery method */
for (count = 0; (ap->ops->sff_check_status(ap) & ATA_DRQ)
&& count < 65536; count += 2)
ioread16(ap->ioaddr.data_addr);
/* Can become DEBUG later */
if (count)
ata_port_dbg(ap, "drained %d bytes to clear DRQ\n", count);
}
EXPORT_SYMBOL_GPL(ata_sff_drain_fifo);
/**
* ata_sff_error_handler - Stock error handler for SFF controller
* @ap: port to handle error for
*
* Stock error handler for SFF controller. It can handle both
* PATA and SATA controllers. Many controllers should be able to
* use this EH as-is or with some added handling before and
* after.
*
* LOCKING:
* Kernel thread context (may sleep)
*/
void ata_sff_error_handler(struct ata_port *ap)
{
ata_reset_fn_t softreset = ap->ops->softreset;
ata_reset_fn_t hardreset = ap->ops->hardreset;
struct ata_queued_cmd *qc;
unsigned long flags;
qc = __ata_qc_from_tag(ap, ap->link.active_tag);
if (qc && !(qc->flags & ATA_QCFLAG_FAILED))
qc = NULL;
spin_lock_irqsave(ap->lock, flags);
/*
* We *MUST* do FIFO draining before we issue a reset as
* several devices helpfully clear their internal state and
* will lock solid if we touch the data port post reset. Pass
* qc in case anyone wants to do different PIO/DMA recovery or
* has per command fixups
*/
if (ap->ops->sff_drain_fifo)
ap->ops->sff_drain_fifo(qc);
spin_unlock_irqrestore(ap->lock, flags);
/* ignore built-in hardresets if SCR access is not available */
if ((hardreset == sata_std_hardreset ||
hardreset == sata_sff_hardreset) && !sata_scr_valid(&ap->link))
hardreset = NULL;
ata_do_eh(ap, ap->ops->prereset, softreset, hardreset,
ap->ops->postreset);
}
EXPORT_SYMBOL_GPL(ata_sff_error_handler);
/**
* ata_sff_std_ports - initialize ioaddr with standard port offsets.
* @ioaddr: IO address structure to be initialized
*
* Utility function which initializes data_addr, error_addr,
* feature_addr, nsect_addr, lbal_addr, lbam_addr, lbah_addr,
* device_addr, status_addr, and command_addr to standard offsets
* relative to cmd_addr.
*
* Does not set ctl_addr, altstatus_addr, bmdma_addr, or scr_addr.
*/
void ata_sff_std_ports(struct ata_ioports *ioaddr)
{
ioaddr->data_addr = ioaddr->cmd_addr + ATA_REG_DATA;
ioaddr->error_addr = ioaddr->cmd_addr + ATA_REG_ERR;
ioaddr->feature_addr = ioaddr->cmd_addr + ATA_REG_FEATURE;
ioaddr->nsect_addr = ioaddr->cmd_addr + ATA_REG_NSECT;
ioaddr->lbal_addr = ioaddr->cmd_addr + ATA_REG_LBAL;
ioaddr->lbam_addr = ioaddr->cmd_addr + ATA_REG_LBAM;
ioaddr->lbah_addr = ioaddr->cmd_addr + ATA_REG_LBAH;
ioaddr->device_addr = ioaddr->cmd_addr + ATA_REG_DEVICE;
ioaddr->status_addr = ioaddr->cmd_addr + ATA_REG_STATUS;
ioaddr->command_addr = ioaddr->cmd_addr + ATA_REG_CMD;
}
EXPORT_SYMBOL_GPL(ata_sff_std_ports);
#ifdef CONFIG_PCI
static int ata_resources_present(struct pci_dev *pdev, int port)
{
int i;
/* Check the PCI resources for this channel are enabled */
port = port * 2;
for (i = 0; i < 2; i++) {
if (pci_resource_start(pdev, port + i) == 0 ||
pci_resource_len(pdev, port + i) == 0)
return 0;
}
return 1;
}
/**
* ata_pci_sff_init_host - acquire native PCI ATA resources and init host
* @host: target ATA host
*
* Acquire native PCI ATA resources for @host and initialize the
* first two ports of @host accordingly. Ports marked dummy are
* skipped and allocation failure makes the port dummy.
*
* Note that native PCI resources are valid even for legacy hosts
* as we fix up pdev resources array early in boot, so this
* function can be used for both native and legacy SFF hosts.
*
* LOCKING:
* Inherited from calling layer (may sleep).
*
* RETURNS:
* 0 if at least one port is initialized, -ENODEV if no port is
* available.
*/
int ata_pci_sff_init_host(struct ata_host *host)
{
struct device *gdev = host->dev;
struct pci_dev *pdev = to_pci_dev(gdev);
unsigned int mask = 0;
int i, rc;
/* request, iomap BARs and init port addresses accordingly */
for (i = 0; i < 2; i++) {
struct ata_port *ap = host->ports[i];
int base = i * 2;
void __iomem * const *iomap;
if (ata_port_is_dummy(ap))
continue;
/* Discard disabled ports. Some controllers show
* their unused channels this way. Disabled ports are
* made dummy.
*/
if (!ata_resources_present(pdev, i)) {
ap->ops = &ata_dummy_port_ops;
continue;
}
rc = pcim_iomap_regions(pdev, 0x3 << base,
dev_driver_string(gdev));
if (rc) {
dev_warn(gdev,
"failed to request/iomap BARs for port %d (errno=%d)\n",
i, rc);
if (rc == -EBUSY)
pcim_pin_device(pdev);
ap->ops = &ata_dummy_port_ops;
continue;
}
host->iomap = iomap = pcim_iomap_table(pdev);
ap->ioaddr.cmd_addr = iomap[base];
ap->ioaddr.altstatus_addr =
ap->ioaddr.ctl_addr = (void __iomem *)
((unsigned long)iomap[base + 1] | ATA_PCI_CTL_OFS);
ata_sff_std_ports(&ap->ioaddr);
ata_port_desc(ap, "cmd 0x%llx ctl 0x%llx",
(unsigned long long)pci_resource_start(pdev, base),
(unsigned long long)pci_resource_start(pdev, base + 1));
mask |= 1 << i;
}
if (!mask) {
dev_err(gdev, "no available native port\n");
return -ENODEV;
}
return 0;
}
EXPORT_SYMBOL_GPL(ata_pci_sff_init_host);
/**
* ata_pci_sff_prepare_host - helper to prepare PCI PIO-only SFF ATA host
* @pdev: target PCI device
* @ppi: array of port_info, must be enough for two ports
* @r_host: out argument for the initialized ATA host
*
* Helper to allocate PIO-only SFF ATA host for @pdev, acquire
* all PCI resources and initialize it accordingly in one go.
*
* LOCKING:
* Inherited from calling layer (may sleep).
*
* RETURNS:
* 0 on success, -errno otherwise.
*/
int ata_pci_sff_prepare_host(struct pci_dev *pdev,
const struct ata_port_info * const *ppi,
struct ata_host **r_host)
{
struct ata_host *host;
int rc;
if (!devres_open_group(&pdev->dev, NULL, GFP_KERNEL))
return -ENOMEM;
host = ata_host_alloc_pinfo(&pdev->dev, ppi, 2);
if (!host) {
dev_err(&pdev->dev, "failed to allocate ATA host\n");
rc = -ENOMEM;
goto err_out;
}
rc = ata_pci_sff_init_host(host);
if (rc)
goto err_out;
devres_remove_group(&pdev->dev, NULL);
*r_host = host;
return 0;
err_out:
devres_release_group(&pdev->dev, NULL);
return rc;
}
EXPORT_SYMBOL_GPL(ata_pci_sff_prepare_host);
/**
* ata_pci_sff_activate_host - start SFF host, request IRQ and register it
* @host: target SFF ATA host
* @irq_handler: irq_handler used when requesting IRQ(s)
* @sht: scsi_host_template to use when registering the host
*
* This is the counterpart of ata_host_activate() for SFF ATA
* hosts. This separate helper is necessary because SFF hosts
* use two separate interrupts in legacy mode.
*
* LOCKING:
* Inherited from calling layer (may sleep).
*
* RETURNS:
* 0 on success, -errno otherwise.
*/
int ata_pci_sff_activate_host(struct ata_host *host,
irq_handler_t irq_handler,
struct scsi_host_template *sht)
{
struct device *dev = host->dev;
struct pci_dev *pdev = to_pci_dev(dev);
const char *drv_name = dev_driver_string(host->dev);
int legacy_mode = 0, rc;
rc = ata_host_start(host);
if (rc)
return rc;
if ((pdev->class >> 8) == PCI_CLASS_STORAGE_IDE) {
u8 tmp8, mask = 0;
/*
* ATA spec says we should use legacy mode when one
* port is in legacy mode, but disabled ports on some
* PCI hosts appear as fixed legacy ports, e.g SB600/700
* on which the secondary port is not wired, so
* ignore ports that are marked as 'dummy' during
* this check
*/
pci_read_config_byte(pdev, PCI_CLASS_PROG, &tmp8);
if (!ata_port_is_dummy(host->ports[0]))
mask |= (1 << 0);
if (!ata_port_is_dummy(host->ports[1]))
mask |= (1 << 2);
if ((tmp8 & mask) != mask)
legacy_mode = 1;
}
if (!devres_open_group(dev, NULL, GFP_KERNEL))
return -ENOMEM;
if (!legacy_mode && pdev->irq) {
int i;
rc = devm_request_irq(dev, pdev->irq, irq_handler,
IRQF_SHARED, drv_name, host);
if (rc)
goto out;
for (i = 0; i < 2; i++) {
if (ata_port_is_dummy(host->ports[i]))
continue;
ata_port_desc(host->ports[i], "irq %d", pdev->irq);
}
} else if (legacy_mode) {
if (!ata_port_is_dummy(host->ports[0])) {
rc = devm_request_irq(dev, ATA_PRIMARY_IRQ(pdev),
irq_handler, IRQF_SHARED,
drv_name, host);
if (rc)
goto out;
ata_port_desc(host->ports[0], "irq %d",
ATA_PRIMARY_IRQ(pdev));
}
if (!ata_port_is_dummy(host->ports[1])) {
rc = devm_request_irq(dev, ATA_SECONDARY_IRQ(pdev),
irq_handler, IRQF_SHARED,
drv_name, host);
if (rc)
goto out;
ata_port_desc(host->ports[1], "irq %d",
ATA_SECONDARY_IRQ(pdev));
}
}
rc = ata_host_register(host, sht);
out:
if (rc == 0)
devres_remove_group(dev, NULL);
else
devres_release_group(dev, NULL);
return rc;
}
EXPORT_SYMBOL_GPL(ata_pci_sff_activate_host);
static const struct ata_port_info *ata_sff_find_valid_pi(
const struct ata_port_info * const *ppi)
{
int i;
/* look up the first valid port_info */
for (i = 0; i < 2 && ppi[i]; i++)
if (ppi[i]->port_ops != &ata_dummy_port_ops)
return ppi[i];
return NULL;
}
static int ata_pci_init_one(struct pci_dev *pdev,
const struct ata_port_info * const *ppi,
struct scsi_host_template *sht, void *host_priv,
int hflags, bool bmdma)
{
struct device *dev = &pdev->dev;
const struct ata_port_info *pi;
struct ata_host *host = NULL;
int rc;
DPRINTK("ENTER\n");
pi = ata_sff_find_valid_pi(ppi);
if (!pi) {
dev_err(&pdev->dev, "no valid port_info specified\n");
return -EINVAL;
}
if (!devres_open_group(dev, NULL, GFP_KERNEL))
return -ENOMEM;
rc = pcim_enable_device(pdev);
if (rc)
goto out;
#ifdef CONFIG_ATA_BMDMA
if (bmdma)
/* prepare and activate BMDMA host */
rc = ata_pci_bmdma_prepare_host(pdev, ppi, &host);
else
#endif
/* prepare and activate SFF host */
rc = ata_pci_sff_prepare_host(pdev, ppi, &host);
if (rc)
goto out;
host->private_data = host_priv;
host->flags |= hflags;
#ifdef CONFIG_ATA_BMDMA
if (bmdma) {
pci_set_master(pdev);
rc = ata_pci_sff_activate_host(host, ata_bmdma_interrupt, sht);
} else
#endif
rc = ata_pci_sff_activate_host(host, ata_sff_interrupt, sht);
out:
if (rc == 0)
devres_remove_group(&pdev->dev, NULL);
else
devres_release_group(&pdev->dev, NULL);
return rc;
}
/**
* ata_pci_sff_init_one - Initialize/register PIO-only PCI IDE controller
* @pdev: Controller to be initialized
* @ppi: array of port_info, must be enough for two ports
* @sht: scsi_host_template to use when registering the host
* @host_priv: host private_data
* @hflag: host flags
*
* This is a helper function which can be called from a driver's
* xxx_init_one() probe function if the hardware uses traditional
* IDE taskfile registers and is PIO only.
*
* ASSUMPTION:
* Nobody makes a single channel controller that appears solely as
* the secondary legacy port on PCI.
*
* LOCKING:
* Inherited from PCI layer (may sleep).
*
* RETURNS:
* Zero on success, negative on errno-based value on error.
*/
int ata_pci_sff_init_one(struct pci_dev *pdev,
const struct ata_port_info * const *ppi,
struct scsi_host_template *sht, void *host_priv, int hflag)
{
return ata_pci_init_one(pdev, ppi, sht, host_priv, hflag, 0);
}
EXPORT_SYMBOL_GPL(ata_pci_sff_init_one);
#endif /* CONFIG_PCI */
/*
* BMDMA support
*/
#ifdef CONFIG_ATA_BMDMA
const struct ata_port_operations ata_bmdma_port_ops = {
.inherits = &ata_sff_port_ops,
.error_handler = ata_bmdma_error_handler,
.post_internal_cmd = ata_bmdma_post_internal_cmd,
.qc_prep = ata_bmdma_qc_prep,
.qc_issue = ata_bmdma_qc_issue,
.sff_irq_clear = ata_bmdma_irq_clear,
.bmdma_setup = ata_bmdma_setup,
.bmdma_start = ata_bmdma_start,
.bmdma_stop = ata_bmdma_stop,
.bmdma_status = ata_bmdma_status,
.port_start = ata_bmdma_port_start,
};
EXPORT_SYMBOL_GPL(ata_bmdma_port_ops);
const struct ata_port_operations ata_bmdma32_port_ops = {
.inherits = &ata_bmdma_port_ops,
.sff_data_xfer = ata_sff_data_xfer32,
.port_start = ata_bmdma_port_start32,
};
EXPORT_SYMBOL_GPL(ata_bmdma32_port_ops);
/**
* ata_bmdma_fill_sg - Fill PCI IDE PRD table
* @qc: Metadata associated with taskfile to be transferred
*
* Fill PCI IDE PRD (scatter-gather) table with segments
* associated with the current disk command.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*
*/
static void ata_bmdma_fill_sg(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct ata_bmdma_prd *prd = ap->bmdma_prd;
struct scatterlist *sg;
unsigned int si, pi;
pi = 0;
for_each_sg(qc->sg, sg, qc->n_elem, si) {
u32 addr, offset;
u32 sg_len, len;
/* determine if physical DMA addr spans 64K boundary.
* Note h/w doesn't support 64-bit, so we unconditionally
* truncate dma_addr_t to u32.
*/
addr = (u32) sg_dma_address(sg);
sg_len = sg_dma_len(sg);
while (sg_len) { offset = addr & 0xffff;
len = sg_len;
if ((offset + sg_len) > 0x10000)
len = 0x10000 - offset; prd[pi].addr = cpu_to_le32(addr);
prd[pi].flags_len = cpu_to_le32(len & 0xffff);
VPRINTK("PRD[%u] = (0x%X, 0x%X)\n", pi, addr, len);
pi++;
sg_len -= len;
addr += len;
}
}
prd[pi - 1].flags_len |= cpu_to_le32(ATA_PRD_EOT);
}
/**
* ata_bmdma_fill_sg_dumb - Fill PCI IDE PRD table
* @qc: Metadata associated with taskfile to be transferred
*
* Fill PCI IDE PRD (scatter-gather) table with segments
* associated with the current disk command. Perform the fill
* so that we avoid writing any length 64K records for
* controllers that don't follow the spec.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*
*/
static void ata_bmdma_fill_sg_dumb(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct ata_bmdma_prd *prd = ap->bmdma_prd;
struct scatterlist *sg;
unsigned int si, pi;
pi = 0;
for_each_sg(qc->sg, sg, qc->n_elem, si) {
u32 addr, offset;
u32 sg_len, len, blen;
/* determine if physical DMA addr spans 64K boundary.
* Note h/w doesn't support 64-bit, so we unconditionally
* truncate dma_addr_t to u32.
*/
addr = (u32) sg_dma_address(sg);
sg_len = sg_dma_len(sg);
while (sg_len) {
offset = addr & 0xffff;
len = sg_len;
if ((offset + sg_len) > 0x10000)
len = 0x10000 - offset;
blen = len & 0xffff;
prd[pi].addr = cpu_to_le32(addr);
if (blen == 0) {
/* Some PATA chipsets like the CS5530 can't
cope with 0x0000 meaning 64K as the spec
says */
prd[pi].flags_len = cpu_to_le32(0x8000);
blen = 0x8000;
prd[++pi].addr = cpu_to_le32(addr + 0x8000);
}
prd[pi].flags_len = cpu_to_le32(blen);
VPRINTK("PRD[%u] = (0x%X, 0x%X)\n", pi, addr, len);
pi++;
sg_len -= len;
addr += len;
}
}
prd[pi - 1].flags_len |= cpu_to_le32(ATA_PRD_EOT);
}
/**
* ata_bmdma_qc_prep - Prepare taskfile for submission
* @qc: Metadata associated with taskfile to be prepared
*
* Prepare ATA taskfile for submission.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
enum ata_completion_errors ata_bmdma_qc_prep(struct ata_queued_cmd *qc)
{
if (!(qc->flags & ATA_QCFLAG_DMAMAP))
return AC_ERR_OK;
ata_bmdma_fill_sg(qc);
return AC_ERR_OK;
}
EXPORT_SYMBOL_GPL(ata_bmdma_qc_prep);
/**
* ata_bmdma_dumb_qc_prep - Prepare taskfile for submission
* @qc: Metadata associated with taskfile to be prepared
*
* Prepare ATA taskfile for submission.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
enum ata_completion_errors ata_bmdma_dumb_qc_prep(struct ata_queued_cmd *qc)
{
if (!(qc->flags & ATA_QCFLAG_DMAMAP))
return AC_ERR_OK;
ata_bmdma_fill_sg_dumb(qc);
return AC_ERR_OK;
}
EXPORT_SYMBOL_GPL(ata_bmdma_dumb_qc_prep);
/**
* ata_bmdma_qc_issue - issue taskfile to a BMDMA controller
* @qc: command to issue to device
*
* This function issues a PIO, NODATA or DMA command to a
* SFF/BMDMA controller. PIO and NODATA are handled by
* ata_sff_qc_issue().
*
* LOCKING:
* spin_lock_irqsave(host lock)
*
* RETURNS:
* Zero on success, AC_ERR_* mask on failure
*/
unsigned int ata_bmdma_qc_issue(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct ata_link *link = qc->dev->link;
/* defer PIO handling to sff_qc_issue */
if (!ata_is_dma(qc->tf.protocol)) return ata_sff_qc_issue(qc);
/* select the device */
ata_dev_select(ap, qc->dev->devno, 1, 0);
/* start the command */
switch (qc->tf.protocol) {
case ATA_PROT_DMA:
WARN_ON_ONCE(qc->tf.flags & ATA_TFLAG_POLLING); ap->ops->sff_tf_load(ap, &qc->tf); /* load tf registers */
ap->ops->bmdma_setup(qc); /* set up bmdma */
ap->ops->bmdma_start(qc); /* initiate bmdma */
ap->hsm_task_state = HSM_ST_LAST;
break;
case ATAPI_PROT_DMA:
WARN_ON_ONCE(qc->tf.flags & ATA_TFLAG_POLLING); ap->ops->sff_tf_load(ap, &qc->tf); /* load tf registers */
ap->ops->bmdma_setup(qc); /* set up bmdma */
ap->hsm_task_state = HSM_ST_FIRST;
/* send cdb by polling if no cdb interrupt */
if (!(qc->dev->flags & ATA_DFLAG_CDB_INTR))
ata_sff_queue_pio_task(link, 0);
break;
default:
WARN_ON(1); return AC_ERR_SYSTEM;
}
return 0;
}
EXPORT_SYMBOL_GPL(ata_bmdma_qc_issue);
/**
* ata_bmdma_port_intr - Handle BMDMA port interrupt
* @ap: Port on which interrupt arrived (possibly...)
* @qc: Taskfile currently active in engine
*
* Handle port interrupt for given queued command.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*
* RETURNS:
* One if interrupt was handled, zero if not (shared irq).
*/
unsigned int ata_bmdma_port_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
{
struct ata_eh_info *ehi = &ap->link.eh_info;
u8 host_stat = 0;
bool bmdma_stopped = false;
unsigned int handled;
if (ap->hsm_task_state == HSM_ST_LAST && ata_is_dma(qc->tf.protocol)) {
/* check status of DMA engine */
host_stat = ap->ops->bmdma_status(ap);
VPRINTK("ata%u: host_stat 0x%X\n", ap->print_id, host_stat);
/* if it's not our irq... */
if (!(host_stat & ATA_DMA_INTR))
return ata_sff_idle_irq(ap);
/* before we do anything else, clear DMA-Start bit */
ap->ops->bmdma_stop(qc);
bmdma_stopped = true;
if (unlikely(host_stat & ATA_DMA_ERR)) {
/* error when transferring data to/from memory */
qc->err_mask |= AC_ERR_HOST_BUS;
ap->hsm_task_state = HSM_ST_ERR;
}
}
handled = __ata_sff_port_intr(ap, qc, bmdma_stopped);
if (unlikely(qc->err_mask) && ata_is_dma(qc->tf.protocol))
ata_ehi_push_desc(ehi, "BMDMA stat 0x%x", host_stat);
return handled;
}
EXPORT_SYMBOL_GPL(ata_bmdma_port_intr);
/**
* ata_bmdma_interrupt - Default BMDMA ATA host interrupt handler
* @irq: irq line (unused)
* @dev_instance: pointer to our ata_host information structure
*
* Default interrupt handler for PCI IDE devices. Calls
* ata_bmdma_port_intr() for each port that is not disabled.
*
* LOCKING:
* Obtains host lock during operation.
*
* RETURNS:
* IRQ_NONE or IRQ_HANDLED.
*/
irqreturn_t ata_bmdma_interrupt(int irq, void *dev_instance)
{
return __ata_sff_interrupt(irq, dev_instance, ata_bmdma_port_intr);
}
EXPORT_SYMBOL_GPL(ata_bmdma_interrupt);
/**
* ata_bmdma_error_handler - Stock error handler for BMDMA controller
* @ap: port to handle error for
*
* Stock error handler for BMDMA controller. It can handle both
* PATA and SATA controllers. Most BMDMA controllers should be
* able to use this EH as-is or with some added handling before
* and after.
*
* LOCKING:
* Kernel thread context (may sleep)
*/
void ata_bmdma_error_handler(struct ata_port *ap)
{
struct ata_queued_cmd *qc;
unsigned long flags;
bool thaw = false;
qc = __ata_qc_from_tag(ap, ap->link.active_tag);
if (qc && !(qc->flags & ATA_QCFLAG_FAILED))
qc = NULL;
/* reset PIO HSM and stop DMA engine */
spin_lock_irqsave(ap->lock, flags);
if (qc && ata_is_dma(qc->tf.protocol)) {
u8 host_stat;
host_stat = ap->ops->bmdma_status(ap);
/* BMDMA controllers indicate host bus error by
* setting DMA_ERR bit and timing out. As it wasn't
* really a timeout event, adjust error mask and
* cancel frozen state.
*/
if (qc->err_mask == AC_ERR_TIMEOUT && (host_stat & ATA_DMA_ERR)) {
qc->err_mask = AC_ERR_HOST_BUS;
thaw = true;
}
ap->ops->bmdma_stop(qc);
/* if we're gonna thaw, make sure IRQ is clear */
if (thaw) {
ap->ops->sff_check_status(ap);
if (ap->ops->sff_irq_clear)
ap->ops->sff_irq_clear(ap);
}
}
spin_unlock_irqrestore(ap->lock, flags);
if (thaw)
ata_eh_thaw_port(ap);
ata_sff_error_handler(ap);
}
EXPORT_SYMBOL_GPL(ata_bmdma_error_handler);
/**
* ata_bmdma_post_internal_cmd - Stock post_internal_cmd for BMDMA
* @qc: internal command to clean up
*
* LOCKING:
* Kernel thread context (may sleep)
*/
void ata_bmdma_post_internal_cmd(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
unsigned long flags;
if (ata_is_dma(qc->tf.protocol)) {
spin_lock_irqsave(ap->lock, flags);
ap->ops->bmdma_stop(qc);
spin_unlock_irqrestore(ap->lock, flags);
}
}
EXPORT_SYMBOL_GPL(ata_bmdma_post_internal_cmd);
/**
* ata_bmdma_irq_clear - Clear PCI IDE BMDMA interrupt.
* @ap: Port associated with this ATA transaction.
*
* Clear interrupt and error flags in DMA status register.
*
* May be used as the irq_clear() entry in ata_port_operations.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
void ata_bmdma_irq_clear(struct ata_port *ap)
{
void __iomem *mmio = ap->ioaddr.bmdma_addr;
if (!mmio)
return;
iowrite8(ioread8(mmio + ATA_DMA_STATUS), mmio + ATA_DMA_STATUS);
}
EXPORT_SYMBOL_GPL(ata_bmdma_irq_clear);
/**
* ata_bmdma_setup - Set up PCI IDE BMDMA transaction
* @qc: Info associated with this ATA transaction.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
void ata_bmdma_setup(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
unsigned int rw = (qc->tf.flags & ATA_TFLAG_WRITE);
u8 dmactl;
/* load PRD table addr. */
mb(); /* make sure PRD table writes are visible to controller */
iowrite32(ap->bmdma_prd_dma, ap->ioaddr.bmdma_addr + ATA_DMA_TABLE_OFS);
/* specify data direction, triple-check start bit is clear */
dmactl = ioread8(ap->ioaddr.bmdma_addr + ATA_DMA_CMD);
dmactl &= ~(ATA_DMA_WR | ATA_DMA_START);
if (!rw)
dmactl |= ATA_DMA_WR; iowrite8(dmactl, ap->ioaddr.bmdma_addr + ATA_DMA_CMD);
/* issue r/w command */
ap->ops->sff_exec_command(ap, &qc->tf);
}
EXPORT_SYMBOL_GPL(ata_bmdma_setup);
/**
* ata_bmdma_start - Start a PCI IDE BMDMA transaction
* @qc: Info associated with this ATA transaction.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
void ata_bmdma_start(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
u8 dmactl;
/* start host DMA transaction */
dmactl = ioread8(ap->ioaddr.bmdma_addr + ATA_DMA_CMD);
iowrite8(dmactl | ATA_DMA_START, ap->ioaddr.bmdma_addr + ATA_DMA_CMD);
/* Strictly, one may wish to issue an ioread8() here, to
* flush the mmio write. However, control also passes
* to the hardware at this point, and it will interrupt
* us when we are to resume control. So, in effect,
* we don't care when the mmio write flushes.
* Further, a read of the DMA status register _immediately_
* following the write may not be what certain flaky hardware
* is expected, so I think it is best to not add a readb()
* without first all the MMIO ATA cards/mobos.
* Or maybe I'm just being paranoid.
*
* FIXME: The posting of this write means I/O starts are
* unnecessarily delayed for MMIO
*/
}
EXPORT_SYMBOL_GPL(ata_bmdma_start);
/**
* ata_bmdma_stop - Stop PCI IDE BMDMA transfer
* @qc: Command we are ending DMA for
*
* Clears the ATA_DMA_START flag in the dma control register
*
* May be used as the bmdma_stop() entry in ata_port_operations.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
void ata_bmdma_stop(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
void __iomem *mmio = ap->ioaddr.bmdma_addr;
/* clear start/stop bit */
iowrite8(ioread8(mmio + ATA_DMA_CMD) & ~ATA_DMA_START,
mmio + ATA_DMA_CMD);
/* one-PIO-cycle guaranteed wait, per spec, for HDMA1:0 transition */
ata_sff_dma_pause(ap);
}
EXPORT_SYMBOL_GPL(ata_bmdma_stop);
/**
* ata_bmdma_status - Read PCI IDE BMDMA status
* @ap: Port associated with this ATA transaction.
*
* Read and return BMDMA status register.
*
* May be used as the bmdma_status() entry in ata_port_operations.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
u8 ata_bmdma_status(struct ata_port *ap)
{
return ioread8(ap->ioaddr.bmdma_addr + ATA_DMA_STATUS);
}
EXPORT_SYMBOL_GPL(ata_bmdma_status);
/**
* ata_bmdma_port_start - Set port up for bmdma.
* @ap: Port to initialize
*
* Called just after data structures for each port are
* initialized. Allocates space for PRD table.
*
* May be used as the port_start() entry in ata_port_operations.
*
* LOCKING:
* Inherited from caller.
*/
int ata_bmdma_port_start(struct ata_port *ap)
{
if (ap->mwdma_mask || ap->udma_mask) {
ap->bmdma_prd =
dmam_alloc_coherent(ap->host->dev, ATA_PRD_TBL_SZ,
&ap->bmdma_prd_dma, GFP_KERNEL);
if (!ap->bmdma_prd)
return -ENOMEM;
}
return 0;
}
EXPORT_SYMBOL_GPL(ata_bmdma_port_start);
/**
* ata_bmdma_port_start32 - Set port up for dma.
* @ap: Port to initialize
*
* Called just after data structures for each port are
* initialized. Enables 32bit PIO and allocates space for PRD
* table.
*
* May be used as the port_start() entry in ata_port_operations for
* devices that are capable of 32bit PIO.
*
* LOCKING:
* Inherited from caller.
*/
int ata_bmdma_port_start32(struct ata_port *ap)
{
ap->pflags |= ATA_PFLAG_PIO32 | ATA_PFLAG_PIO32CHANGE;
return ata_bmdma_port_start(ap);
}
EXPORT_SYMBOL_GPL(ata_bmdma_port_start32);
#ifdef CONFIG_PCI
/**
* ata_pci_bmdma_clear_simplex - attempt to kick device out of simplex
* @pdev: PCI device
*
* Some PCI ATA devices report simplex mode but in fact can be told to
* enter non simplex mode. This implements the necessary logic to
* perform the task on such devices. Calling it on other devices will
* have -undefined- behaviour.
*/
int ata_pci_bmdma_clear_simplex(struct pci_dev *pdev)
{
unsigned long bmdma = pci_resource_start(pdev, 4);
u8 simplex;
if (bmdma == 0)
return -ENOENT;
simplex = inb(bmdma + 0x02);
outb(simplex & 0x60, bmdma + 0x02);
simplex = inb(bmdma + 0x02);
if (simplex & 0x80)
return -EOPNOTSUPP;
return 0;
}
EXPORT_SYMBOL_GPL(ata_pci_bmdma_clear_simplex);
static void ata_bmdma_nodma(struct ata_host *host, const char *reason)
{
int i;
dev_err(host->dev, "BMDMA: %s, falling back to PIO\n", reason);
for (i = 0; i < 2; i++) {
host->ports[i]->mwdma_mask = 0;
host->ports[i]->udma_mask = 0;
}
}
/**
* ata_pci_bmdma_init - acquire PCI BMDMA resources and init ATA host
* @host: target ATA host
*
* Acquire PCI BMDMA resources and initialize @host accordingly.
*
* LOCKING:
* Inherited from calling layer (may sleep).
*/
void ata_pci_bmdma_init(struct ata_host *host)
{
struct device *gdev = host->dev;
struct pci_dev *pdev = to_pci_dev(gdev);
int i, rc;
/* No BAR4 allocation: No DMA */
if (pci_resource_start(pdev, 4) == 0) {
ata_bmdma_nodma(host, "BAR4 is zero");
return;
}
/*
* Some controllers require BMDMA region to be initialized
* even if DMA is not in use to clear IRQ status via
* ->sff_irq_clear method. Try to initialize bmdma_addr
* regardless of dma masks.
*/
rc = dma_set_mask_and_coherent(&pdev->dev, ATA_DMA_MASK);
if (rc)
ata_bmdma_nodma(host, "failed to set dma mask");
/* request and iomap DMA region */
rc = pcim_iomap_regions(pdev, 1 << 4, dev_driver_string(gdev));
if (rc) {
ata_bmdma_nodma(host, "failed to request/iomap BAR4");
return;
}
host->iomap = pcim_iomap_table(pdev);
for (i = 0; i < 2; i++) {
struct ata_port *ap = host->ports[i];
void __iomem *bmdma = host->iomap[4] + 8 * i;
if (ata_port_is_dummy(ap))
continue;
ap->ioaddr.bmdma_addr = bmdma;
if ((!(ap->flags & ATA_FLAG_IGN_SIMPLEX)) &&
(ioread8(bmdma + 2) & 0x80))
host->flags |= ATA_HOST_SIMPLEX;
ata_port_desc(ap, "bmdma 0x%llx",
(unsigned long long)pci_resource_start(pdev, 4) + 8 * i);
}
}
EXPORT_SYMBOL_GPL(ata_pci_bmdma_init);
/**
* ata_pci_bmdma_prepare_host - helper to prepare PCI BMDMA ATA host
* @pdev: target PCI device
* @ppi: array of port_info, must be enough for two ports
* @r_host: out argument for the initialized ATA host
*
* Helper to allocate BMDMA ATA host for @pdev, acquire all PCI
* resources and initialize it accordingly in one go.
*
* LOCKING:
* Inherited from calling layer (may sleep).
*
* RETURNS:
* 0 on success, -errno otherwise.
*/
int ata_pci_bmdma_prepare_host(struct pci_dev *pdev,
const struct ata_port_info * const * ppi,
struct ata_host **r_host)
{
int rc;
rc = ata_pci_sff_prepare_host(pdev, ppi, r_host);
if (rc)
return rc;
ata_pci_bmdma_init(*r_host);
return 0;
}
EXPORT_SYMBOL_GPL(ata_pci_bmdma_prepare_host);
/**
* ata_pci_bmdma_init_one - Initialize/register BMDMA PCI IDE controller
* @pdev: Controller to be initialized
* @ppi: array of port_info, must be enough for two ports
* @sht: scsi_host_template to use when registering the host
* @host_priv: host private_data
* @hflags: host flags
*
* This function is similar to ata_pci_sff_init_one() but also
* takes care of BMDMA initialization.
*
* LOCKING:
* Inherited from PCI layer (may sleep).
*
* RETURNS:
* Zero on success, negative on errno-based value on error.
*/
int ata_pci_bmdma_init_one(struct pci_dev *pdev,
const struct ata_port_info * const * ppi,
struct scsi_host_template *sht, void *host_priv,
int hflags)
{
return ata_pci_init_one(pdev, ppi, sht, host_priv, hflags, 1);
}
EXPORT_SYMBOL_GPL(ata_pci_bmdma_init_one);
#endif /* CONFIG_PCI */
#endif /* CONFIG_ATA_BMDMA */
/**
* ata_sff_port_init - Initialize SFF/BMDMA ATA port
* @ap: Port to initialize
*
* Called on port allocation to initialize SFF/BMDMA specific
* fields.
*
* LOCKING:
* None.
*/
void ata_sff_port_init(struct ata_port *ap)
{
INIT_DELAYED_WORK(&ap->sff_pio_task, ata_sff_pio_task);
ap->ctl = ATA_DEVCTL_OBS;
ap->last_ctl = 0xFF;
}
int __init ata_sff_init(void)
{
ata_sff_wq = alloc_workqueue("ata_sff", WQ_MEM_RECLAIM, WQ_MAX_ACTIVE);
if (!ata_sff_wq)
return -ENOMEM;
return 0;
}
void ata_sff_exit(void)
{
destroy_workqueue(ata_sff_wq);
}
/*
* random.c -- A strong random number generator
*
* Copyright (C) 2017 Jason A. Donenfeld <Jason@zx2c4.com>. All
* Rights Reserved.
*
* Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005
*
* Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All
* rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, and the entire permission notice in its entirety,
* including the disclaimer of warranties.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* ALTERNATIVELY, this product may be distributed under the terms of
* the GNU General Public License, in which case the provisions of the GPL are
* required INSTEAD OF the above restrictions. (This clause is
* necessary due to a potential bad interaction between the GPL and
* the restrictions contained in a BSD-style copyright.)
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
* WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/*
* (now, with legal B.S. out of the way.....)
*
* This routine gathers environmental noise from device drivers, etc.,
* and returns good random numbers, suitable for cryptographic use.
* Besides the obvious cryptographic uses, these numbers are also good
* for seeding TCP sequence numbers, and other places where it is
* desirable to have numbers which are not only random, but hard to
* predict by an attacker.
*
* Theory of operation
* ===================
*
* Computers are very predictable devices. Hence it is extremely hard
* to produce truly random numbers on a computer --- as opposed to
* pseudo-random numbers, which can easily generated by using a
* algorithm. Unfortunately, it is very easy for attackers to guess
* the sequence of pseudo-random number generators, and for some
* applications this is not acceptable. So instead, we must try to
* gather "environmental noise" from the computer's environment, which
* must be hard for outside attackers to observe, and use that to
* generate random numbers. In a Unix environment, this is best done
* from inside the kernel.
*
* Sources of randomness from the environment include inter-keyboard
* timings, inter-interrupt timings from some interrupts, and other
* events which are both (a) non-deterministic and (b) hard for an
* outside observer to measure. Randomness from these sources are
* added to an "entropy pool", which is mixed using a CRC-like function.
* This is not cryptographically strong, but it is adequate assuming
* the randomness is not chosen maliciously, and it is fast enough that
* the overhead of doing it on every interrupt is very reasonable.
* As random bytes are mixed into the entropy pool, the routines keep
* an *estimate* of how many bits of randomness have been stored into
* the random number generator's internal state.
*
* When random bytes are desired, they are obtained by taking the SHA
* hash of the contents of the "entropy pool". The SHA hash avoids
* exposing the internal state of the entropy pool. It is believed to
* be computationally infeasible to derive any useful information
* about the input of SHA from its output. Even if it is possible to
* analyze SHA in some clever way, as long as the amount of data
* returned from the generator is less than the inherent entropy in
* the pool, the output data is totally unpredictable. For this
* reason, the routine decreases its internal estimate of how many
* bits of "true randomness" are contained in the entropy pool as it
* outputs random numbers.
*
* If this estimate goes to zero, the routine can still generate
* random numbers; however, an attacker may (at least in theory) be
* able to infer the future output of the generator from prior
* outputs. This requires successful cryptanalysis of SHA, which is
* not believed to be feasible, but there is a remote possibility.
* Nonetheless, these numbers should be useful for the vast majority
* of purposes.
*
* Exported interfaces ---- output
* ===============================
*
* There are four exported interfaces; two for use within the kernel,
* and two or use from userspace.
*
* Exported interfaces ---- userspace output
* -----------------------------------------
*
* The userspace interfaces are two character devices /dev/random and
* /dev/urandom. /dev/random is suitable for use when very high
* quality randomness is desired (for example, for key generation or
* one-time pads), as it will only return a maximum of the number of
* bits of randomness (as estimated by the random number generator)
* contained in the entropy pool.
*
* The /dev/urandom device does not have this limit, and will return
* as many bytes as are requested. As more and more random bytes are
* requested without giving time for the entropy pool to recharge,
* this will result in random numbers that are merely cryptographically
* strong. For many applications, however, this is acceptable.
*
* Exported interfaces ---- kernel output
* --------------------------------------
*
* The primary kernel interface is
*
* void get_random_bytes(void *buf, int nbytes);
*
* This interface will return the requested number of random bytes,
* and place it in the requested buffer. This is equivalent to a
* read from /dev/urandom.
*
* For less critical applications, there are the functions:
*
* u32 get_random_u32()
* u64 get_random_u64()
* unsigned int get_random_int()
* unsigned long get_random_long()
*
* These are produced by a cryptographic RNG seeded from get_random_bytes,
* and so do not deplete the entropy pool as much. These are recommended
* for most in-kernel operations *if the result is going to be stored in
* the kernel*.
*
* Specifically, the get_random_int() family do not attempt to do
* "anti-backtracking". If you capture the state of the kernel (e.g.
* by snapshotting the VM), you can figure out previous get_random_int()
* return values. But if the value is stored in the kernel anyway,
* this is not a problem.
*
* It *is* safe to expose get_random_int() output to attackers (e.g. as
* network cookies); given outputs 1..n, it's not feasible to predict
* outputs 0 or n+1. The only concern is an attacker who breaks into
* the kernel later; the get_random_int() engine is not reseeded as
* often as the get_random_bytes() one.
*
* get_random_bytes() is needed for keys that need to stay secret after
* they are erased from the kernel. For example, any key that will
* be wrapped and stored encrypted. And session encryption keys: we'd
* like to know that after the session is closed and the keys erased,
* the plaintext is unrecoverable to someone who recorded the ciphertext.
*
* But for network ports/cookies, stack canaries, PRNG seeds, address
* space layout randomization, session *authentication* keys, or other
* applications where the sensitive data is stored in the kernel in
* plaintext for as long as it's sensitive, the get_random_int() family
* is just fine.
*
* Consider ASLR. We want to keep the address space secret from an
* outside attacker while the process is running, but once the address
* space is torn down, it's of no use to an attacker any more. And it's
* stored in kernel data structures as long as it's alive, so worrying
* about an attacker's ability to extrapolate it from the get_random_int()
* CRNG is silly.
*
* Even some cryptographic keys are safe to generate with get_random_int().
* In particular, keys for SipHash are generally fine. Here, knowledge
* of the key authorizes you to do something to a kernel object (inject
* packets to a network connection, or flood a hash table), and the
* key is stored with the object being protected. Once it goes away,
* we no longer care if anyone knows the key.
*
* prandom_u32()
* -------------
*
* For even weaker applications, see the pseudorandom generator
* prandom_u32(), prandom_max(), and prandom_bytes(). If the random
* numbers aren't security-critical at all, these are *far* cheaper.
* Useful for self-tests, random error simulation, randomized backoffs,
* and any other application where you trust that nobody is trying to
* maliciously mess with you by guessing the "random" numbers.
*
* Exported interfaces ---- input
* ==============================
*
* The current exported interfaces for gathering environmental noise
* from the devices are:
*
* void add_device_randomness(const void *buf, unsigned int size);
* void add_input_randomness(unsigned int type, unsigned int code,
* unsigned int value);
* void add_interrupt_randomness(int irq, int irq_flags);
* void add_disk_randomness(struct gendisk *disk);
*
* add_device_randomness() is for adding data to the random pool that
* is likely to differ between two devices (or possibly even per boot).
* This would be things like MAC addresses or serial numbers, or the
* read-out of the RTC. This does *not* add any actual entropy to the
* pool, but it initializes the pool to different values for devices
* that might otherwise be identical and have very little entropy
* available to them (particularly common in the embedded world).
*
* add_input_randomness() uses the input layer interrupt timing, as well as
* the event type information from the hardware.
*
* add_interrupt_randomness() uses the interrupt timing as random
* inputs to the entropy pool. Using the cycle counters and the irq source
* as inputs, it feeds the randomness roughly once a second.
*
* add_disk_randomness() uses what amounts to the seek time of block
* layer request events, on a per-disk_devt basis, as input to the
* entropy pool. Note that high-speed solid state drives with very low
* seek times do not make for good sources of entropy, as their seek
* times are usually fairly consistent.
*
* All of these routines try to estimate how many bits of randomness a
* particular randomness source. They do this by keeping track of the
* first and second order deltas of the event timings.
*
* Ensuring unpredictability at system startup
* ============================================
*
* When any operating system starts up, it will go through a sequence
* of actions that are fairly predictable by an adversary, especially
* if the start-up does not involve interaction with a human operator.
* This reduces the actual number of bits of unpredictability in the
* entropy pool below the value in entropy_count. In order to
* counteract this effect, it helps to carry information in the
* entropy pool across shut-downs and start-ups. To do this, put the
* following lines an appropriate script which is run during the boot
* sequence:
*
* echo "Initializing random number generator..."
* random_seed=/var/run/random-seed
* # Carry a random seed from start-up to start-up
* # Load and then save the whole entropy pool
* if [ -f $random_seed ]; then
* cat $random_seed >/dev/urandom
* else
* touch $random_seed
* fi
* chmod 600 $random_seed
* dd if=/dev/urandom of=$random_seed count=1 bs=512
*
* and the following lines in an appropriate script which is run as
* the system is shutdown:
*
* # Carry a random seed from shut-down to start-up
* # Save the whole entropy pool
* echo "Saving random seed..."
* random_seed=/var/run/random-seed
* touch $random_seed
* chmod 600 $random_seed
* dd if=/dev/urandom of=$random_seed count=1 bs=512
*
* For example, on most modern systems using the System V init
* scripts, such code fragments would be found in
* /etc/rc.d/init.d/random. On older Linux systems, the correct script
* location might be in /etc/rcb.d/rc.local or /etc/rc.d/rc.0.
*
* Effectively, these commands cause the contents of the entropy pool
* to be saved at shut-down time and reloaded into the entropy pool at
* start-up. (The 'dd' in the addition to the bootup script is to
* make sure that /etc/random-seed is different for every start-up,
* even if the system crashes without executing rc.0.) Even with
* complete knowledge of the start-up activities, predicting the state
* of the entropy pool requires knowledge of the previous history of
* the system.
*
* Configuring the /dev/random driver under Linux
* ==============================================
*
* The /dev/random driver under Linux uses minor numbers 8 and 9 of
* the /dev/mem major number (#1). So if your system does not have
* /dev/random and /dev/urandom created already, they can be created
* by using the commands:
*
* mknod /dev/random c 1 8
* mknod /dev/urandom c 1 9
*
* Acknowledgements:
* =================
*
* Ideas for constructing this random number generator were derived
* from Pretty Good Privacy's random number generator, and from private
* discussions with Phil Karn. Colin Plumb provided a faster random
* number generator, which speed up the mixing function of the entropy
* pool, taken from PGPfone. Dale Worley has also contributed many
* useful ideas and suggestions to improve this driver.
*
* Any flaws in the design are solely my responsibility, and should
* not be attributed to the Phil, Colin, or any of authors of PGP.
*
* Further background information on this topic may be obtained from
* RFC 1750, "Randomness Recommendations for Security", by Donald
* Eastlake, Steve Crocker, and Jeff Schiller.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/utsname.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/percpu.h>
#include <linux/fips.h>
#include <linux/ptrace.h>
#include <linux/workqueue.h>
#include <linux/irq.h>
#include <linux/ratelimit.h>
#include <linux/syscalls.h>
#include <linux/completion.h>
#include <linux/uuid.h>
#include <crypto/chacha.h>
#include <crypto/sha1.h>
#include <asm/processor.h>
#include <linux/uaccess.h>
#include <asm/irq.h>
#include <asm/irq_regs.h>
#include <asm/io.h>
#define CREATE_TRACE_POINTS
#include <trace/events/random.h>
/* #define ADD_INTERRUPT_BENCH */
/*
* Configuration information
*/
#define INPUT_POOL_SHIFT 12
#define INPUT_POOL_WORDS (1 << (INPUT_POOL_SHIFT-5))
#define OUTPUT_POOL_SHIFT 10
#define OUTPUT_POOL_WORDS (1 << (OUTPUT_POOL_SHIFT-5))
#define EXTRACT_SIZE 10
#define LONGS(x) (((x) + sizeof(unsigned long) - 1)/sizeof(unsigned long))
/*
* To allow fractional bits to be tracked, the entropy_count field is
* denominated in units of 1/8th bits.
*
* 2*(ENTROPY_SHIFT + poolbitshift) must <= 31, or the multiply in
* credit_entropy_bits() needs to be 64 bits wide.
*/
#define ENTROPY_SHIFT 3
#define ENTROPY_BITS(r) ((r)->entropy_count >> ENTROPY_SHIFT)
/*
* If the entropy count falls under this number of bits, then we
* should wake up processes which are selecting or polling on write
* access to /dev/random.
*/
static int random_write_wakeup_bits = 28 * OUTPUT_POOL_WORDS;
/*
* Originally, we used a primitive polynomial of degree .poolwords
* over GF(2). The taps for various sizes are defined below. They
* were chosen to be evenly spaced except for the last tap, which is 1
* to get the twisting happening as fast as possible.
*
* For the purposes of better mixing, we use the CRC-32 polynomial as
* well to make a (modified) twisted Generalized Feedback Shift
* Register. (See M. Matsumoto & Y. Kurita, 1992. Twisted GFSR
* generators. ACM Transactions on Modeling and Computer Simulation
* 2(3):179-194. Also see M. Matsumoto & Y. Kurita, 1994. Twisted
* GFSR generators II. ACM Transactions on Modeling and Computer
* Simulation 4:254-266)
*
* Thanks to Colin Plumb for suggesting this.
*
* The mixing operation is much less sensitive than the output hash,
* where we use SHA-1. All that we want of mixing operation is that
* it be a good non-cryptographic hash; i.e. it not produce collisions
* when fed "random" data of the sort we expect to see. As long as
* the pool state differs for different inputs, we have preserved the
* input entropy and done a good job. The fact that an intelligent
* attacker can construct inputs that will produce controlled
* alterations to the pool's state is not important because we don't
* consider such inputs to contribute any randomness. The only
* property we need with respect to them is that the attacker can't
* increase his/her knowledge of the pool's state. Since all
* additions are reversible (knowing the final state and the input,
* you can reconstruct the initial state), if an attacker has any
* uncertainty about the initial state, he/she can only shuffle that
* uncertainty about, but never cause any collisions (which would
* decrease the uncertainty).
*
* Our mixing functions were analyzed by Lacharme, Roeck, Strubel, and
* Videau in their paper, "The Linux Pseudorandom Number Generator
* Revisited" (see: http://eprint.iacr.org/2012/251.pdf). In their
* paper, they point out that we are not using a true Twisted GFSR,
* since Matsumoto & Kurita used a trinomial feedback polynomial (that
* is, with only three taps, instead of the six that we are using).
* As a result, the resulting polynomial is neither primitive nor
* irreducible, and hence does not have a maximal period over
* GF(2**32). They suggest a slight change to the generator
* polynomial which improves the resulting TGFSR polynomial to be
* irreducible, which we have made here.
*/
static const struct poolinfo {
int poolbitshift, poolwords, poolbytes, poolfracbits;
#define S(x) ilog2(x)+5, (x), (x)*4, (x) << (ENTROPY_SHIFT+5)
int tap1, tap2, tap3, tap4, tap5;
} poolinfo_table[] = {
/* was: x^128 + x^103 + x^76 + x^51 +x^25 + x + 1 */
/* x^128 + x^104 + x^76 + x^51 +x^25 + x + 1 */
{ S(128), 104, 76, 51, 25, 1 },
};
/*
* Static global variables
*/
static DECLARE_WAIT_QUEUE_HEAD(random_write_wait);
static struct fasync_struct *fasync;
static DEFINE_SPINLOCK(random_ready_list_lock);
static LIST_HEAD(random_ready_list);
struct crng_state {
__u32 state[16];
unsigned long init_time;
spinlock_t lock;
};
static struct crng_state primary_crng = {
.lock = __SPIN_LOCK_UNLOCKED(primary_crng.lock),
};
/*
* crng_init = 0 --> Uninitialized
* 1 --> Initialized
* 2 --> Initialized from input_pool
*
* crng_init is protected by primary_crng->lock, and only increases
* its value (from 0->1->2).
*/
static int crng_init = 0;
static bool crng_need_final_init = false;
#define crng_ready() (likely(crng_init > 1))
static int crng_init_cnt = 0;
static unsigned long crng_global_init_time = 0;
#define CRNG_INIT_CNT_THRESH (2*CHACHA_KEY_SIZE)
static void _extract_crng(struct crng_state *crng, __u8 out[CHACHA_BLOCK_SIZE]);
static void _crng_backtrack_protect(struct crng_state *crng,
__u8 tmp[CHACHA_BLOCK_SIZE], int used);
static void process_random_ready_list(void);
static void _get_random_bytes(void *buf, int nbytes);
static struct ratelimit_state unseeded_warning =
RATELIMIT_STATE_INIT("warn_unseeded_randomness", HZ, 3);
static struct ratelimit_state urandom_warning =
RATELIMIT_STATE_INIT("warn_urandom_randomness", HZ, 3);
static int ratelimit_disable __read_mostly;
module_param_named(ratelimit_disable, ratelimit_disable, int, 0644);
MODULE_PARM_DESC(ratelimit_disable, "Disable random ratelimit suppression");
/**********************************************************************
*
* OS independent entropy store. Here are the functions which handle
* storing entropy in an entropy pool.
*
**********************************************************************/
struct entropy_store;
struct entropy_store {
/* read-only data: */
const struct poolinfo *poolinfo;
__u32 *pool;
const char *name;
/* read-write data: */
spinlock_t lock;
unsigned short add_ptr;
unsigned short input_rotate;
int entropy_count;
unsigned int last_data_init:1;
__u8 last_data[EXTRACT_SIZE];
};
static ssize_t extract_entropy(struct entropy_store *r, void *buf,
size_t nbytes, int min, int rsvd);
static ssize_t _extract_entropy(struct entropy_store *r, void *buf,
size_t nbytes, int fips);
static void crng_reseed(struct crng_state *crng, struct entropy_store *r);
static __u32 input_pool_data[INPUT_POOL_WORDS] __latent_entropy;
static struct entropy_store input_pool = {
.poolinfo = &poolinfo_table[0],
.name = "input",
.lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
.pool = input_pool_data
};
static __u32 const twist_table[8] = {
0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
/*
* This function adds bytes into the entropy "pool". It does not
* update the entropy estimate. The caller should call
* credit_entropy_bits if this is appropriate.
*
* The pool is stirred with a primitive polynomial of the appropriate
* degree, and then twisted. We twist by three bits at a time because
* it's cheap to do so and helps slightly in the expected case where
* the entropy is concentrated in the low-order bits.
*/
static void _mix_pool_bytes(struct entropy_store *r, const void *in,
int nbytes)
{
unsigned long i, tap1, tap2, tap3, tap4, tap5;
int input_rotate;
int wordmask = r->poolinfo->poolwords - 1;
const char *bytes = in;
__u32 w;
tap1 = r->poolinfo->tap1;
tap2 = r->poolinfo->tap2;
tap3 = r->poolinfo->tap3;
tap4 = r->poolinfo->tap4;
tap5 = r->poolinfo->tap5;
input_rotate = r->input_rotate;
i = r->add_ptr;
/* mix one byte at a time to simplify size handling and churn faster */
while (nbytes--) {
w = rol32(*bytes++, input_rotate);
i = (i - 1) & wordmask;
/* XOR in the various taps */
w ^= r->pool[i];
w ^= r->pool[(i + tap1) & wordmask];
w ^= r->pool[(i + tap2) & wordmask];
w ^= r->pool[(i + tap3) & wordmask];
w ^= r->pool[(i + tap4) & wordmask];
w ^= r->pool[(i + tap5) & wordmask];
/* Mix the result back in with a twist */
r->pool[i] = (w >> 3) ^ twist_table[w & 7];
/*
* Normally, we add 7 bits of rotation to the pool.
* At the beginning of the pool, add an extra 7 bits
* rotation, so that successive passes spread the
* input bits across the pool evenly.
*/
input_rotate = (input_rotate + (i ? 7 : 14)) & 31;
}
r->input_rotate = input_rotate;
r->add_ptr = i;
}
static void __mix_pool_bytes(struct entropy_store *r, const void *in,
int nbytes)
{
trace_mix_pool_bytes_nolock(r->name, nbytes, _RET_IP_);
_mix_pool_bytes(r, in, nbytes);
}
static void mix_pool_bytes(struct entropy_store *r, const void *in,
int nbytes)
{
unsigned long flags;
trace_mix_pool_bytes(r->name, nbytes, _RET_IP_);
spin_lock_irqsave(&r->lock, flags);
_mix_pool_bytes(r, in, nbytes);
spin_unlock_irqrestore(&r->lock, flags);
}
struct fast_pool {
__u32 pool[4];
unsigned long last;
unsigned short reg_idx;
unsigned char count;
};
/*
* This is a fast mixing routine used by the interrupt randomness
* collector. It's hardcoded for an 128 bit pool and assumes that any
* locks that might be needed are taken by the caller.
*/
static void fast_mix(struct fast_pool *f)
{
__u32 a = f->pool[0], b = f->pool[1];
__u32 c = f->pool[2], d = f->pool[3];
a += b; c += d;
b = rol32(b, 6); d = rol32(d, 27);
d ^= a; b ^= c;
a += b; c += d;
b = rol32(b, 16); d = rol32(d, 14);
d ^= a; b ^= c;
a += b; c += d;
b = rol32(b, 6); d = rol32(d, 27);
d ^= a; b ^= c;
a += b; c += d;
b = rol32(b, 16); d = rol32(d, 14);
d ^= a; b ^= c;
f->pool[0] = a; f->pool[1] = b;
f->pool[2] = c; f->pool[3] = d;
f->count++;
}
static void process_random_ready_list(void)
{
unsigned long flags;
struct random_ready_callback *rdy, *tmp;
spin_lock_irqsave(&random_ready_list_lock, flags);
list_for_each_entry_safe(rdy, tmp, &random_ready_list, list) {
struct module *owner = rdy->owner;
list_del_init(&rdy->list);
rdy->func(rdy);
module_put(owner);
}
spin_unlock_irqrestore(&random_ready_list_lock, flags);
}
/*
* Credit (or debit) the entropy store with n bits of entropy.
* Use credit_entropy_bits_safe() if the value comes from userspace
* or otherwise should be checked for extreme values.
*/
static void credit_entropy_bits(struct entropy_store *r, int nbits)
{
int entropy_count, orig;
const int pool_size = r->poolinfo->poolfracbits;
int nfrac = nbits << ENTROPY_SHIFT;
if (!nbits)
return;
retry:
entropy_count = orig = READ_ONCE(r->entropy_count);
if (nfrac < 0) {
/* Debit */
entropy_count += nfrac;
} else {
/*
* Credit: we have to account for the possibility of
* overwriting already present entropy. Even in the
* ideal case of pure Shannon entropy, new contributions
* approach the full value asymptotically:
*
* entropy <- entropy + (pool_size - entropy) *
* (1 - exp(-add_entropy/pool_size))
*
* For add_entropy <= pool_size/2 then
* (1 - exp(-add_entropy/pool_size)) >=
* (add_entropy/pool_size)*0.7869...
* so we can approximate the exponential with
* 3/4*add_entropy/pool_size and still be on the
* safe side by adding at most pool_size/2 at a time.
*
* The use of pool_size-2 in the while statement is to
* prevent rounding artifacts from making the loop
* arbitrarily long; this limits the loop to log2(pool_size)*2
* turns no matter how large nbits is.
*/
int pnfrac = nfrac;
const int s = r->poolinfo->poolbitshift + ENTROPY_SHIFT + 2;
/* The +2 corresponds to the /4 in the denominator */
do {
unsigned int anfrac = min(pnfrac, pool_size/2);
unsigned int add =
((pool_size - entropy_count)*anfrac*3) >> s;
entropy_count += add;
pnfrac -= anfrac;
} while (unlikely(entropy_count < pool_size-2 && pnfrac));
}
if (WARN_ON(entropy_count < 0)) {
pr_warn("negative entropy/overflow: pool %s count %d\n",
r->name, entropy_count);
entropy_count = 0;
} else if (entropy_count > pool_size)
entropy_count = pool_size;
if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig)
goto retry;
trace_credit_entropy_bits(r->name, nbits,
entropy_count >> ENTROPY_SHIFT, _RET_IP_);
if (r == &input_pool) {
int entropy_bits = entropy_count >> ENTROPY_SHIFT;
if (crng_init < 2 && entropy_bits >= 128)
crng_reseed(&primary_crng, r);
}
}
static int credit_entropy_bits_safe(struct entropy_store *r, int nbits)
{
const int nbits_max = r->poolinfo->poolwords * 32;
if (nbits < 0)
return -EINVAL;
/* Cap the value to avoid overflows */
nbits = min(nbits, nbits_max);
credit_entropy_bits(r, nbits);
return 0;
}
/*********************************************************************
*
* CRNG using CHACHA20
*
*********************************************************************/
#define CRNG_RESEED_INTERVAL (300*HZ)
static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);
#ifdef CONFIG_NUMA
/*
* Hack to deal with crazy userspace progams when they are all trying
* to access /dev/urandom in parallel. The programs are almost
* certainly doing something terribly wrong, but we'll work around
* their brain damage.
*/
static struct crng_state **crng_node_pool __read_mostly;
#endif
static void invalidate_batched_entropy(void);
static void numa_crng_init(void);
static bool trust_cpu __ro_after_init = IS_ENABLED(CONFIG_RANDOM_TRUST_CPU);
static int __init parse_trust_cpu(char *arg)
{
return kstrtobool(arg, &trust_cpu);
}
early_param("random.trust_cpu", parse_trust_cpu);
static bool crng_init_try_arch(struct crng_state *crng)
{
int i;
bool arch_init = true;
unsigned long rv;
for (i = 4; i < 16; i++) {
if (!arch_get_random_seed_long(&rv) &&
!arch_get_random_long(&rv)) {
rv = random_get_entropy();
arch_init = false;
}
crng->state[i] ^= rv;
}
return arch_init;
}
static bool __init crng_init_try_arch_early(struct crng_state *crng)
{
int i;
bool arch_init = true;
unsigned long rv;
for (i = 4; i < 16; i++) {
if (!arch_get_random_seed_long_early(&rv) &&
!arch_get_random_long_early(&rv)) {
rv = random_get_entropy();
arch_init = false;
}
crng->state[i] ^= rv;
}
return arch_init;
}
static void __maybe_unused crng_initialize_secondary(struct crng_state *crng)
{
chacha_init_consts(crng->state);
_get_random_bytes(&crng->state[4], sizeof(__u32) * 12);
crng_init_try_arch(crng);
crng->init_time = jiffies - CRNG_RESEED_INTERVAL - 1;
}
static void __init crng_initialize_primary(struct crng_state *crng)
{
chacha_init_consts(crng->state);
_extract_entropy(&input_pool, &crng->state[4], sizeof(__u32) * 12, 0);
if (crng_init_try_arch_early(crng) && trust_cpu) {
invalidate_batched_entropy();
numa_crng_init();
crng_init = 2;
pr_notice("crng done (trusting CPU's manufacturer)\n");
}
crng->init_time = jiffies - CRNG_RESEED_INTERVAL - 1;
}
static void crng_finalize_init(struct crng_state *crng)
{
if (crng != &primary_crng || crng_init >= 2)
return;
if (!system_wq) {
/* We can't call numa_crng_init until we have workqueues,
* so mark this for processing later. */
crng_need_final_init = true;
return;
}
invalidate_batched_entropy();
numa_crng_init();
crng_init = 2;
process_random_ready_list();
wake_up_interruptible(&crng_init_wait);
kill_fasync(&fasync, SIGIO, POLL_IN);
pr_notice("crng init done\n");
if (unseeded_warning.missed) {
pr_notice("%d get_random_xx warning(s) missed due to ratelimiting\n",
unseeded_warning.missed);
unseeded_warning.missed = 0;
}
if (urandom_warning.missed) {
pr_notice("%d urandom warning(s) missed due to ratelimiting\n",
urandom_warning.missed);
urandom_warning.missed = 0;
}
}
#ifdef CONFIG_NUMA
static void do_numa_crng_init(struct work_struct *work)
{
int i;
struct crng_state *crng;
struct crng_state **pool;
pool = kcalloc(nr_node_ids, sizeof(*pool), GFP_KERNEL|__GFP_NOFAIL);
for_each_online_node(i) {
crng = kmalloc_node(sizeof(struct crng_state),
GFP_KERNEL | __GFP_NOFAIL, i);
spin_lock_init(&crng->lock);
crng_initialize_secondary(crng);
pool[i] = crng;
}
/* pairs with READ_ONCE() in select_crng() */
if (cmpxchg_release(&crng_node_pool, NULL, pool) != NULL) {
for_each_node(i)
kfree(pool[i]);
kfree(pool);
}
}
static DECLARE_WORK(numa_crng_init_work, do_numa_crng_init);
static void numa_crng_init(void)
{
schedule_work(&numa_crng_init_work);
}
static struct crng_state *select_crng(void)
{
struct crng_state **pool;
int nid = numa_node_id();
/* pairs with cmpxchg_release() in do_numa_crng_init() */
pool = READ_ONCE(crng_node_pool);
if (pool && pool[nid])
return pool[nid];
return &primary_crng;
}
#else
static void numa_crng_init(void) {}
static struct crng_state *select_crng(void)
{
return &primary_crng;
}
#endif
/*
* crng_fast_load() can be called by code in the interrupt service
* path. So we can't afford to dilly-dally. Returns the number of
* bytes processed from cp.
*/
static size_t crng_fast_load(const char *cp, size_t len)
{
unsigned long flags;
char *p;
size_t ret = 0;
if (!spin_trylock_irqsave(&primary_crng.lock, flags))
return 0;
if (crng_init != 0) {
spin_unlock_irqrestore(&primary_crng.lock, flags);
return 0;
}
p = (unsigned char *) &primary_crng.state[4];
while (len > 0 && crng_init_cnt < CRNG_INIT_CNT_THRESH) {
p[crng_init_cnt % CHACHA_KEY_SIZE] ^= *cp;
cp++; crng_init_cnt++; len--; ret++;
}
spin_unlock_irqrestore(&primary_crng.lock, flags);
if (crng_init_cnt >= CRNG_INIT_CNT_THRESH) {
invalidate_batched_entropy();
crng_init = 1;
pr_notice("fast init done\n");
}
return ret;
}
/*
* crng_slow_load() is called by add_device_randomness, which has two
* attributes. (1) We can't trust the buffer passed to it is
* guaranteed to be unpredictable (so it might not have any entropy at
* all), and (2) it doesn't have the performance constraints of
* crng_fast_load().
*
* So we do something more comprehensive which is guaranteed to touch
* all of the primary_crng's state, and which uses a LFSR with a
* period of 255 as part of the mixing algorithm. Finally, we do
* *not* advance crng_init_cnt since buffer we may get may be something
* like a fixed DMI table (for example), which might very well be
* unique to the machine, but is otherwise unvarying.
*/
static int crng_slow_load(const char *cp, size_t len)
{
unsigned long flags;
static unsigned char lfsr = 1;
unsigned char tmp;
unsigned i, max = CHACHA_KEY_SIZE;
const char * src_buf = cp;
char * dest_buf = (char *) &primary_crng.state[4];
if (!spin_trylock_irqsave(&primary_crng.lock, flags))
return 0;
if (crng_init != 0) {
spin_unlock_irqrestore(&primary_crng.lock, flags);
return 0;
}
if (len > max)
max = len;
for (i = 0; i < max ; i++) {
tmp = lfsr;
lfsr >>= 1;
if (tmp & 1)
lfsr ^= 0xE1;
tmp = dest_buf[i % CHACHA_KEY_SIZE];
dest_buf[i % CHACHA_KEY_SIZE] ^= src_buf[i % len] ^ lfsr;
lfsr += (tmp << 3) | (tmp >> 5);
}
spin_unlock_irqrestore(&primary_crng.lock, flags);
return 1;
}
static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
{
unsigned long flags;
int i, num;
union {
__u8 block[CHACHA_BLOCK_SIZE];
__u32 key[8];
} buf;
if (r) {
num = extract_entropy(r, &buf, 32, 16, 0);
if (num == 0)
return;
} else {
_extract_crng(&primary_crng, buf.block);
_crng_backtrack_protect(&primary_crng, buf.block,
CHACHA_KEY_SIZE);
}
spin_lock_irqsave(&crng->lock, flags);
for (i = 0; i < 8; i++) {
unsigned long rv;
if (!arch_get_random_seed_long(&rv) &&
!arch_get_random_long(&rv))
rv = random_get_entropy();
crng->state[i+4] ^= buf.key[i] ^ rv;
}
memzero_explicit(&buf, sizeof(buf));
WRITE_ONCE(crng->init_time, jiffies);
spin_unlock_irqrestore(&crng->lock, flags);
crng_finalize_init(crng);
}
static void _extract_crng(struct crng_state *crng,
__u8 out[CHACHA_BLOCK_SIZE])
{
unsigned long v, flags, init_time;
if (crng_ready()) { init_time = READ_ONCE(crng->init_time);
if (time_after(READ_ONCE(crng_global_init_time), init_time) ||
time_after(jiffies, init_time + CRNG_RESEED_INTERVAL)) crng_reseed(crng, crng == &primary_crng ?
&input_pool : NULL);
}
spin_lock_irqsave(&crng->lock, flags);
if (arch_get_random_long(&v))
crng->state[14] ^= v; chacha20_block(&crng->state[0], out);
if (crng->state[12] == 0)
crng->state[13]++;
spin_unlock_irqrestore(&crng->lock, flags);
}
static void extract_crng(__u8 out[CHACHA_BLOCK_SIZE])
{
_extract_crng(select_crng(), out);
}
/*
* Use the leftover bytes from the CRNG block output (if there is
* enough) to mutate the CRNG key to provide backtracking protection.
*/
static void _crng_backtrack_protect(struct crng_state *crng,
__u8 tmp[CHACHA_BLOCK_SIZE], int used)
{
unsigned long flags;
__u32 *s, *d;
int i;
used = round_up(used, sizeof(__u32));
if (used + CHACHA_KEY_SIZE > CHACHA_BLOCK_SIZE) {
extract_crng(tmp);
used = 0;
}
spin_lock_irqsave(&crng->lock, flags);
s = (__u32 *) &tmp[used];
d = &crng->state[4];
for (i=0; i < 8; i++)
*d++ ^= *s++;
spin_unlock_irqrestore(&crng->lock, flags);
}
static void crng_backtrack_protect(__u8 tmp[CHACHA_BLOCK_SIZE], int used)
{
_crng_backtrack_protect(select_crng(), tmp, used);
}
static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
{
ssize_t ret = 0, i = CHACHA_BLOCK_SIZE;
__u8 tmp[CHACHA_BLOCK_SIZE] __aligned(4);
int large_request = (nbytes > 256);
while (nbytes) {
if (large_request && need_resched()) {
if (signal_pending(current)) {
if (ret == 0)
ret = -ERESTARTSYS;
break;
}
schedule();
}
extract_crng(tmp);
i = min_t(int, nbytes, CHACHA_BLOCK_SIZE);
if (copy_to_user(buf, tmp, i)) {
ret = -EFAULT;
break;
}
nbytes -= i;
buf += i;
ret += i;
}
crng_backtrack_protect(tmp, i);
/* Wipe data just written to memory */
memzero_explicit(tmp, sizeof(tmp));
return ret;
}
/*********************************************************************
*
* Entropy input management
*
*********************************************************************/
/* There is one of these per entropy source */
struct timer_rand_state {
cycles_t last_time;
long last_delta, last_delta2;
};
#define INIT_TIMER_RAND_STATE { INITIAL_JIFFIES, };
/*
* Add device- or boot-specific data to the input pool to help
* initialize it.
*
* None of this adds any entropy; it is meant to avoid the problem of
* the entropy pool having similar initial state across largely
* identical devices.
*/
void add_device_randomness(const void *buf, unsigned int size)
{
unsigned long time = random_get_entropy() ^ jiffies;
unsigned long flags;
if (!crng_ready() && size)
crng_slow_load(buf, size);
trace_add_device_randomness(size, _RET_IP_);
spin_lock_irqsave(&input_pool.lock, flags);
_mix_pool_bytes(&input_pool, buf, size);
_mix_pool_bytes(&input_pool, &time, sizeof(time));
spin_unlock_irqrestore(&input_pool.lock, flags);
}
EXPORT_SYMBOL(add_device_randomness);
static struct timer_rand_state input_timer_state = INIT_TIMER_RAND_STATE;
/*
* This function adds entropy to the entropy "pool" by using timing
* delays. It uses the timer_rand_state structure to make an estimate
* of how many bits of entropy this call has added to the pool.
*
* The number "num" is also added to the pool - it should somehow describe
* the type of event which just happened. This is currently 0-255 for
* keyboard scan codes, and 256 upwards for interrupts.
*
*/
static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
{
struct entropy_store *r;
struct {
long jiffies;
unsigned cycles;
unsigned num;
} sample;
long delta, delta2, delta3;
sample.jiffies = jiffies;
sample.cycles = random_get_entropy();
sample.num = num;
r = &input_pool;
mix_pool_bytes(r, &sample, sizeof(sample));
/*
* Calculate number of bits of randomness we probably added.
* We take into account the first, second and third-order deltas
* in order to make our estimate.
*/
delta = sample.jiffies - READ_ONCE(state->last_time);
WRITE_ONCE(state->last_time, sample.jiffies);
delta2 = delta - READ_ONCE(state->last_delta);
WRITE_ONCE(state->last_delta, delta);
delta3 = delta2 - READ_ONCE(state->last_delta2);
WRITE_ONCE(state->last_delta2, delta2);
if (delta < 0)
delta = -delta;
if (delta2 < 0)
delta2 = -delta2;
if (delta3 < 0)
delta3 = -delta3;
if (delta > delta2)
delta = delta2;
if (delta > delta3)
delta = delta3;
/*
* delta is now minimum absolute delta.
* Round down by 1 bit on general principles,
* and limit entropy estimate to 12 bits.
*/
credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
}
void add_input_randomness(unsigned int type, unsigned int code,
unsigned int value)
{
static unsigned char last_value;
/* ignore autorepeat and the like */
if (value == last_value)
return;
last_value = value;
add_timer_randomness(&input_timer_state,
(type << 4) ^ code ^ (code >> 4) ^ value);
trace_add_input_randomness(ENTROPY_BITS(&input_pool));
}
EXPORT_SYMBOL_GPL(add_input_randomness);
static DEFINE_PER_CPU(struct fast_pool, irq_randomness);
#ifdef ADD_INTERRUPT_BENCH
static unsigned long avg_cycles, avg_deviation;
#define AVG_SHIFT 8 /* Exponential average factor k=1/256 */
#define FIXED_1_2 (1 << (AVG_SHIFT-1))
static void add_interrupt_bench(cycles_t start)
{
long delta = random_get_entropy() - start;
/* Use a weighted moving average */
delta = delta - ((avg_cycles + FIXED_1_2) >> AVG_SHIFT);
avg_cycles += delta;
/* And average deviation */
delta = abs(delta) - ((avg_deviation + FIXED_1_2) >> AVG_SHIFT);
avg_deviation += delta;
}
#else
#define add_interrupt_bench(x)
#endif
static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
{
__u32 *ptr = (__u32 *) regs;
unsigned int idx;
if (regs == NULL)
return 0;
idx = READ_ONCE(f->reg_idx);
if (idx >= sizeof(struct pt_regs) / sizeof(__u32))
idx = 0;
ptr += idx++;
WRITE_ONCE(f->reg_idx, idx);
return *ptr;
}
void add_interrupt_randomness(int irq, int irq_flags)
{
struct entropy_store *r;
struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
struct pt_regs *regs = get_irq_regs();
unsigned long now = jiffies;
cycles_t cycles = random_get_entropy();
__u32 c_high, j_high;
__u64 ip;
if (cycles == 0)
cycles = get_reg(fast_pool, regs);
c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
j_high = (sizeof(now) > 4) ? now >> 32 : 0;
fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
fast_pool->pool[1] ^= now ^ c_high;
ip = regs ? instruction_pointer(regs) : _RET_IP_;
fast_pool->pool[2] ^= ip;
fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
get_reg(fast_pool, regs);
fast_mix(fast_pool);
add_interrupt_bench(cycles);
if (unlikely(crng_init == 0)) {
if ((fast_pool->count >= 64) &&
crng_fast_load((char *) fast_pool->pool,
sizeof(fast_pool->pool)) > 0) {
fast_pool->count = 0;
fast_pool->last = now;
}
return;
}
if ((fast_pool->count < 64) &&
!time_after(now, fast_pool->last + HZ))
return;
r = &input_pool;
if (!spin_trylock(&r->lock))
return;
fast_pool->last = now;
__mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool));
spin_unlock(&r->lock);
fast_pool->count = 0;
/* award one bit for the contents of the fast pool */
credit_entropy_bits(r, 1);
}
EXPORT_SYMBOL_GPL(add_interrupt_randomness);
#ifdef CONFIG_BLOCK
void add_disk_randomness(struct gendisk *disk)
{
if (!disk || !disk->random)
return;
/* first major is 1, so we get >= 0x200 here */
add_timer_randomness(disk->random, 0x100 + disk_devt(disk));
trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool));
}
EXPORT_SYMBOL_GPL(add_disk_randomness);
#endif
/*********************************************************************
*
* Entropy extraction routines
*
*********************************************************************/
/*
* This function decides how many bytes to actually take from the
* given pool, and also debits the entropy count accordingly.
*/
static size_t account(struct entropy_store *r, size_t nbytes, int min,
int reserved)
{
int entropy_count, orig, have_bytes;
size_t ibytes, nfrac;
BUG_ON(r->entropy_count > r->poolinfo->poolfracbits);
/* Can we pull enough? */
retry:
entropy_count = orig = READ_ONCE(r->entropy_count);
ibytes = nbytes;
/* never pull more than available */
have_bytes = entropy_count >> (ENTROPY_SHIFT + 3);
if ((have_bytes -= reserved) < 0)
have_bytes = 0;
ibytes = min_t(size_t, ibytes, have_bytes);
if (ibytes < min)
ibytes = 0;
if (WARN_ON(entropy_count < 0)) {
pr_warn("negative entropy count: pool %s count %d\n",
r->name, entropy_count);
entropy_count = 0;
}
nfrac = ibytes << (ENTROPY_SHIFT + 3);
if ((size_t) entropy_count > nfrac)
entropy_count -= nfrac;
else
entropy_count = 0;
if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig)
goto retry;
trace_debit_entropy(r->name, 8 * ibytes);
if (ibytes && ENTROPY_BITS(r) < random_write_wakeup_bits) {
wake_up_interruptible(&random_write_wait);
kill_fasync(&fasync, SIGIO, POLL_OUT);
}
return ibytes;
}
/*
* This function does the actual extraction for extract_entropy.
*
* Note: we assume that .poolwords is a multiple of 16 words.
*/
static void extract_buf(struct entropy_store *r, __u8 *out)
{
int i;
union {
__u32 w[5];
unsigned long l[LONGS(20)];
} hash;
__u32 workspace[SHA1_WORKSPACE_WORDS];
unsigned long flags;
/*
* If we have an architectural hardware random number
* generator, use it for SHA's initial vector
*/
sha1_init(hash.w);
for (i = 0; i < LONGS(20); i++) {
unsigned long v;
if (!arch_get_random_long(&v))
break;
hash.l[i] = v;
}
/* Generate a hash across the pool, 16 words (512 bits) at a time */
spin_lock_irqsave(&r->lock, flags);
for (i = 0; i < r->poolinfo->poolwords; i += 16)
sha1_transform(hash.w, (__u8 *)(r->pool + i), workspace);
/*
* We mix the hash back into the pool to prevent backtracking
* attacks (where the attacker knows the state of the pool
* plus the current outputs, and attempts to find previous
* ouputs), unless the hash function can be inverted. By
* mixing at least a SHA1 worth of hash data back, we make
* brute-forcing the feedback as hard as brute-forcing the
* hash.
*/
__mix_pool_bytes(r, hash.w, sizeof(hash.w));
spin_unlock_irqrestore(&r->lock, flags);
memzero_explicit(workspace, sizeof(workspace));
/*
* In case the hash function has some recognizable output
* pattern, we fold it in half. Thus, we always feed back
* twice as much data as we output.
*/
hash.w[0] ^= hash.w[3];
hash.w[1] ^= hash.w[4];
hash.w[2] ^= rol32(hash.w[2], 16);
memcpy(out, &hash, EXTRACT_SIZE);
memzero_explicit(&hash, sizeof(hash));
}
static ssize_t _extract_entropy(struct entropy_store *r, void *buf,
size_t nbytes, int fips)
{
ssize_t ret = 0, i;
__u8 tmp[EXTRACT_SIZE];
unsigned long flags;
while (nbytes) {
extract_buf(r, tmp);
if (fips) {
spin_lock_irqsave(&r->lock, flags);
if (!memcmp(tmp, r->last_data, EXTRACT_SIZE))
panic("Hardware RNG duplicated output!\n");
memcpy(r->last_data, tmp, EXTRACT_SIZE);
spin_unlock_irqrestore(&r->lock, flags);
}
i = min_t(int, nbytes, EXTRACT_SIZE);
memcpy(buf, tmp, i);
nbytes -= i;
buf += i;
ret += i;
}
/* Wipe data just returned from memory */
memzero_explicit(tmp, sizeof(tmp));
return ret;
}
/*
* This function extracts randomness from the "entropy pool", and
* returns it in a buffer.
*
* The min parameter specifies the minimum amount we can pull before
* failing to avoid races that defeat catastrophic reseeding while the
* reserved parameter indicates how much entropy we must leave in the
* pool after each pull to avoid starving other readers.
*/
static ssize_t extract_entropy(struct entropy_store *r, void *buf,
size_t nbytes, int min, int reserved)
{
__u8 tmp[EXTRACT_SIZE];
unsigned long flags;
/* if last_data isn't primed, we need EXTRACT_SIZE extra bytes */
if (fips_enabled) {
spin_lock_irqsave(&r->lock, flags);
if (!r->last_data_init) {
r->last_data_init = 1;
spin_unlock_irqrestore(&r->lock, flags);
trace_extract_entropy(r->name, EXTRACT_SIZE,
ENTROPY_BITS(r), _RET_IP_);
extract_buf(r, tmp);
spin_lock_irqsave(&r->lock, flags);
memcpy(r->last_data, tmp, EXTRACT_SIZE);
}
spin_unlock_irqrestore(&r->lock, flags);
}
trace_extract_entropy(r->name, nbytes, ENTROPY_BITS(r), _RET_IP_);
nbytes = account(r, nbytes, min, reserved);
return _extract_entropy(r, buf, nbytes, fips_enabled);
}
#define warn_unseeded_randomness(previous) \
_warn_unseeded_randomness(__func__, (void *) _RET_IP_, (previous))
static void _warn_unseeded_randomness(const char *func_name, void *caller,
void **previous)
{
#ifdef CONFIG_WARN_ALL_UNSEEDED_RANDOM
const bool print_once = false;
#else
static bool print_once __read_mostly;
#endif
if (print_once || crng_ready() || (previous && (caller == READ_ONCE(*previous))))
return;
WRITE_ONCE(*previous, caller);
#ifndef CONFIG_WARN_ALL_UNSEEDED_RANDOM
print_once = true;
#endif
if (__ratelimit(&unseeded_warning))
printk_deferred(KERN_NOTICE "random: %s called from %pS "
"with crng_init=%d\n", func_name, caller,
crng_init);
}
/*
* This function is the exported kernel interface. It returns some
* number of good random numbers, suitable for key generation, seeding
* TCP sequence numbers, etc. It does not rely on the hardware random
* number generator. For random bytes direct from the hardware RNG
* (when available), use get_random_bytes_arch(). In order to ensure
* that the randomness provided by this function is okay, the function
* wait_for_random_bytes() should be called and return 0 at least once
* at any point prior.
*/
static void _get_random_bytes(void *buf, int nbytes)
{
__u8 tmp[CHACHA_BLOCK_SIZE] __aligned(4);
trace_get_random_bytes(nbytes, _RET_IP_);
while (nbytes >= CHACHA_BLOCK_SIZE) {
extract_crng(buf);
buf += CHACHA_BLOCK_SIZE;
nbytes -= CHACHA_BLOCK_SIZE;
}
if (nbytes > 0) {
extract_crng(tmp);
memcpy(buf, tmp, nbytes);
crng_backtrack_protect(tmp, nbytes);
} else
crng_backtrack_protect(tmp, CHACHA_BLOCK_SIZE);
memzero_explicit(tmp, sizeof(tmp));
}
void get_random_bytes(void *buf, int nbytes)
{
static void *previous;
warn_unseeded_randomness(&previous);
_get_random_bytes(buf, nbytes);
}
EXPORT_SYMBOL(get_random_bytes);
/*
* Each time the timer fires, we expect that we got an unpredictable
* jump in the cycle counter. Even if the timer is running on another
* CPU, the timer activity will be touching the stack of the CPU that is
* generating entropy..
*
* Note that we don't re-arm the timer in the timer itself - we are
* happy to be scheduled away, since that just makes the load more
* complex, but we do not want the timer to keep ticking unless the
* entropy loop is running.
*
* So the re-arming always happens in the entropy loop itself.
*/
static void entropy_timer(struct timer_list *t)
{
credit_entropy_bits(&input_pool, 1);
}
/*
* If we have an actual cycle counter, see if we can
* generate enough entropy with timing noise
*/
static void try_to_generate_entropy(void)
{
struct {
unsigned long now;
struct timer_list timer;
} stack;
stack.now = random_get_entropy();
/* Slow counter - or none. Don't even bother */
if (stack.now == random_get_entropy())
return;
timer_setup_on_stack(&stack.timer, entropy_timer, 0);
while (!crng_ready()) {
if (!timer_pending(&stack.timer))
mod_timer(&stack.timer, jiffies+1);
mix_pool_bytes(&input_pool, &stack.now, sizeof(stack.now));
schedule();
stack.now = random_get_entropy();
}
del_timer_sync(&stack.timer);
destroy_timer_on_stack(&stack.timer);
mix_pool_bytes(&input_pool, &stack.now, sizeof(stack.now));
}
/*
* Wait for the urandom pool to be seeded and thus guaranteed to supply
* cryptographically secure random numbers. This applies to: the /dev/urandom
* device, the get_random_bytes function, and the get_random_{u32,u64,int,long}
* family of functions. Using any of these functions without first calling
* this function forfeits the guarantee of security.
*
* Returns: 0 if the urandom pool has been seeded.
* -ERESTARTSYS if the function was interrupted by a signal.
*/
int wait_for_random_bytes(void)
{
if (likely(crng_ready()))
return 0;
do {
int ret;
ret = wait_event_interruptible_timeout(crng_init_wait, crng_ready(), HZ);
if (ret)
return ret > 0 ? 0 : ret;
try_to_generate_entropy();
} while (!crng_ready());
return 0;
}
EXPORT_SYMBOL(wait_for_random_bytes);
/*
* Returns whether or not the urandom pool has been seeded and thus guaranteed
* to supply cryptographically secure random numbers. This applies to: the
* /dev/urandom device, the get_random_bytes function, and the get_random_{u32,
* ,u64,int,long} family of functions.
*
* Returns: true if the urandom pool has been seeded.
* false if the urandom pool has not been seeded.
*/
bool rng_is_initialized(void)
{
return crng_ready();
}
EXPORT_SYMBOL(rng_is_initialized);
/*
* Add a callback function that will be invoked when the nonblocking
* pool is initialised.
*
* returns: 0 if callback is successfully added
* -EALREADY if pool is already initialised (callback not called)
* -ENOENT if module for callback is not alive
*/
int add_random_ready_callback(struct random_ready_callback *rdy)
{
struct module *owner;
unsigned long flags;
int err = -EALREADY;
if (crng_ready())
return err;
owner = rdy->owner;
if (!try_module_get(owner))
return -ENOENT;
spin_lock_irqsave(&random_ready_list_lock, flags);
if (crng_ready())
goto out;
owner = NULL;
list_add(&rdy->list, &random_ready_list);
err = 0;
out:
spin_unlock_irqrestore(&random_ready_list_lock, flags);
module_put(owner);
return err;
}
EXPORT_SYMBOL(add_random_ready_callback);
/*
* Delete a previously registered readiness callback function.
*/
void del_random_ready_callback(struct random_ready_callback *rdy)
{
unsigned long flags;
struct module *owner = NULL;
spin_lock_irqsave(&random_ready_list_lock, flags);
if (!list_empty(&rdy->list)) {
list_del_init(&rdy->list);
owner = rdy->owner;
}
spin_unlock_irqrestore(&random_ready_list_lock, flags);
module_put(owner);
}
EXPORT_SYMBOL(del_random_ready_callback);
/*
* This function will use the architecture-specific hardware random
* number generator if it is available. The arch-specific hw RNG will
* almost certainly be faster than what we can do in software, but it
* is impossible to verify that it is implemented securely (as
* opposed, to, say, the AES encryption of a sequence number using a
* key known by the NSA). So it's useful if we need the speed, but
* only if we're willing to trust the hardware manufacturer not to
* have put in a back door.
*
* Return number of bytes filled in.
*/
int __must_check get_random_bytes_arch(void *buf, int nbytes)
{
int left = nbytes;
char *p = buf;
trace_get_random_bytes_arch(left, _RET_IP_);
while (left) {
unsigned long v;
int chunk = min_t(int, left, sizeof(unsigned long));
if (!arch_get_random_long(&v))
break;
memcpy(p, &v, chunk);
p += chunk;
left -= chunk;
}
return nbytes - left;
}
EXPORT_SYMBOL(get_random_bytes_arch);
/*
* init_std_data - initialize pool with system data
*
* @r: pool to initialize
*
* This function clears the pool's entropy count and mixes some system
* data into the pool to prepare it for use. The pool is not cleared
* as that can only decrease the entropy in the pool.
*/
static void __init init_std_data(struct entropy_store *r)
{
int i;
ktime_t now = ktime_get_real();
unsigned long rv;
mix_pool_bytes(r, &now, sizeof(now));
for (i = r->poolinfo->poolbytes; i > 0; i -= sizeof(rv)) {
if (!arch_get_random_seed_long(&rv) &&
!arch_get_random_long(&rv))
rv = random_get_entropy();
mix_pool_bytes(r, &rv, sizeof(rv));
}
mix_pool_bytes(r, utsname(), sizeof(*(utsname())));
}
/*
* Note that setup_arch() may call add_device_randomness()
* long before we get here. This allows seeding of the pools
* with some platform dependent data very early in the boot
* process. But it limits our options here. We must use
* statically allocated structures that already have all
* initializations complete at compile time. We should also
* take care not to overwrite the precious per platform data
* we were given.
*/
int __init rand_initialize(void)
{
init_std_data(&input_pool);
if (crng_need_final_init)
crng_finalize_init(&primary_crng);
crng_initialize_primary(&primary_crng);
crng_global_init_time = jiffies;
if (ratelimit_disable) {
urandom_warning.interval = 0;
unseeded_warning.interval = 0;
}
return 0;
}
#ifdef CONFIG_BLOCK
void rand_initialize_disk(struct gendisk *disk)
{
struct timer_rand_state *state;
/*
* If kzalloc returns null, we just won't use that entropy
* source.
*/
state = kzalloc(sizeof(struct timer_rand_state), GFP_KERNEL);
if (state) {
state->last_time = INITIAL_JIFFIES;
disk->random = state;
}
}
#endif
static ssize_t
urandom_read_nowarn(struct file *file, char __user *buf, size_t nbytes,
loff_t *ppos)
{
int ret;
nbytes = min_t(size_t, nbytes, INT_MAX >> (ENTROPY_SHIFT + 3));
ret = extract_crng_user(buf, nbytes);
trace_urandom_read(8 * nbytes, 0, ENTROPY_BITS(&input_pool));
return ret;
}
static ssize_t
urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
{
unsigned long flags;
static int maxwarn = 10;
if (!crng_ready() && maxwarn > 0) {
maxwarn--;
if (__ratelimit(&urandom_warning))
pr_notice("%s: uninitialized urandom read (%zd bytes read)\n",
current->comm, nbytes);
spin_lock_irqsave(&primary_crng.lock, flags);
crng_init_cnt = 0;
spin_unlock_irqrestore(&primary_crng.lock, flags);
}
return urandom_read_nowarn(file, buf, nbytes, ppos);
}
static ssize_t
random_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
{
int ret;
ret = wait_for_random_bytes();
if (ret != 0)
return ret;
return urandom_read_nowarn(file, buf, nbytes, ppos);
}
static __poll_t
random_poll(struct file *file, poll_table * wait)
{
__poll_t mask;
poll_wait(file, &crng_init_wait, wait);
poll_wait(file, &random_write_wait, wait);
mask = 0;
if (crng_ready())
mask |= EPOLLIN | EPOLLRDNORM;
if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits)
mask |= EPOLLOUT | EPOLLWRNORM;
return mask;
}
static int
write_pool(struct entropy_store *r, const char __user *buffer, size_t count)
{
size_t bytes;
__u32 t, buf[16];
const char __user *p = buffer;
while (count > 0) {
int b, i = 0;
bytes = min(count, sizeof(buf));
if (copy_from_user(&buf, p, bytes))
return -EFAULT;
for (b = bytes ; b > 0 ; b -= sizeof(__u32), i++) {
if (!arch_get_random_int(&t))
break;
buf[i] ^= t;
}
count -= bytes;
p += bytes;
mix_pool_bytes(r, buf, bytes);
cond_resched();
}
return 0;
}
static ssize_t random_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
size_t ret;
ret = write_pool(&input_pool, buffer, count);
if (ret)
return ret;
return (ssize_t)count;
}
static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
int size, ent_count;
int __user *p = (int __user *)arg;
int retval;
switch (cmd) {
case RNDGETENTCNT:
/* inherently racy, no point locking */
ent_count = ENTROPY_BITS(&input_pool);
if (put_user(ent_count, p))
return -EFAULT;
return 0;
case RNDADDTOENTCNT:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (get_user(ent_count, p))
return -EFAULT;
return credit_entropy_bits_safe(&input_pool, ent_count);
case RNDADDENTROPY:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (get_user(ent_count, p++))
return -EFAULT;
if (ent_count < 0)
return -EINVAL;
if (get_user(size, p++))
return -EFAULT;
retval = write_pool(&input_pool, (const char __user *)p,
size);
if (retval < 0)
return retval;
return credit_entropy_bits_safe(&input_pool, ent_count);
case RNDZAPENTCNT:
case RNDCLEARPOOL:
/*
* Clear the entropy pool counters. We no longer clear
* the entropy pool, as that's silly.
*/
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (xchg(&input_pool.entropy_count, 0) && random_write_wakeup_bits) {
wake_up_interruptible(&random_write_wait);
kill_fasync(&fasync, SIGIO, POLL_OUT);
}
return 0;
case RNDRESEEDCRNG:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (crng_init < 2)
return -ENODATA;
crng_reseed(&primary_crng, &input_pool);
WRITE_ONCE(crng_global_init_time, jiffies - 1);
return 0;
default:
return -EINVAL;
}
}
static int random_fasync(int fd, struct file *filp, int on)
{
return fasync_helper(fd, filp, on, &fasync);
}
const struct file_operations random_fops = {
.read = random_read,
.write = random_write,
.poll = random_poll,
.unlocked_ioctl = random_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.fasync = random_fasync,
.llseek = noop_llseek,
};
const struct file_operations urandom_fops = {
.read = urandom_read,
.write = random_write,
.unlocked_ioctl = random_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.fasync = random_fasync,
.llseek = noop_llseek,
};
SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
unsigned int, flags)
{
int ret;
if (flags & ~(GRND_NONBLOCK|GRND_RANDOM|GRND_INSECURE))
return -EINVAL;
/*
* Requesting insecure and blocking randomness at the same time makes
* no sense.
*/
if ((flags & (GRND_INSECURE|GRND_RANDOM)) == (GRND_INSECURE|GRND_RANDOM))
return -EINVAL;
if (count > INT_MAX)
count = INT_MAX;
if (!(flags & GRND_INSECURE) && !crng_ready()) {
if (flags & GRND_NONBLOCK)
return -EAGAIN;
ret = wait_for_random_bytes();
if (unlikely(ret))
return ret;
}
return urandom_read_nowarn(NULL, buf, count, NULL);
}
/********************************************************************
*
* Sysctl interface
*
********************************************************************/
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
static int min_write_thresh;
static int max_write_thresh = INPUT_POOL_WORDS * 32;
static int random_min_urandom_seed = 60;
static char sysctl_bootid[16];
/*
* This function is used to return both the bootid UUID, and random
* UUID. The difference is in whether table->data is NULL; if it is,
* then a new UUID is generated and returned to the user.
*
* If the user accesses this via the proc interface, the UUID will be
* returned as an ASCII string in the standard UUID format; if via the
* sysctl system call, as 16 bytes of binary data.
*/
static int proc_do_uuid(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table fake_table;
unsigned char buf[64], tmp_uuid[16], *uuid;
uuid = table->data;
if (!uuid) {
uuid = tmp_uuid;
generate_random_uuid(uuid);
} else {
static DEFINE_SPINLOCK(bootid_spinlock);
spin_lock(&bootid_spinlock);
if (!uuid[8])
generate_random_uuid(uuid);
spin_unlock(&bootid_spinlock);
}
sprintf(buf, "%pU", uuid);
fake_table.data = buf;
fake_table.maxlen = sizeof(buf);
return proc_dostring(&fake_table, write, buffer, lenp, ppos);
}
/*
* Return entropy available scaled to integral bits
*/
static int proc_do_entropy(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table fake_table;
int entropy_count;
entropy_count = *(int *)table->data >> ENTROPY_SHIFT;
fake_table.data = &entropy_count;
fake_table.maxlen = sizeof(entropy_count);
return proc_dointvec(&fake_table, write, buffer, lenp, ppos);
}
static int sysctl_poolsize = INPUT_POOL_WORDS * 32;
extern struct ctl_table random_table[];
struct ctl_table random_table[] = {
{
.procname = "poolsize",
.data = &sysctl_poolsize,
.maxlen = sizeof(int),
.mode = 0444,
.proc_handler = proc_dointvec,
},
{
.procname = "entropy_avail",
.maxlen = sizeof(int),
.mode = 0444,
.proc_handler = proc_do_entropy,
.data = &input_pool.entropy_count,
},
{
.procname = "write_wakeup_threshold",
.data = &random_write_wakeup_bits,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &min_write_thresh,
.extra2 = &max_write_thresh,
},
{
.procname = "urandom_min_reseed_secs",
.data = &random_min_urandom_seed,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "boot_id",
.data = &sysctl_bootid,
.maxlen = 16,
.mode = 0444,
.proc_handler = proc_do_uuid,
},
{
.procname = "uuid",
.maxlen = 16,
.mode = 0444,
.proc_handler = proc_do_uuid,
},
#ifdef ADD_INTERRUPT_BENCH
{
.procname = "add_interrupt_avg_cycles",
.data = &avg_cycles,
.maxlen = sizeof(avg_cycles),
.mode = 0444,
.proc_handler = proc_doulongvec_minmax,
},
{
.procname = "add_interrupt_avg_deviation",
.data = &avg_deviation,
.maxlen = sizeof(avg_deviation),
.mode = 0444,
.proc_handler = proc_doulongvec_minmax,
},
#endif
{ }
};
#endif /* CONFIG_SYSCTL */
struct batched_entropy {
union {
u64 entropy_u64[CHACHA_BLOCK_SIZE / sizeof(u64)];
u32 entropy_u32[CHACHA_BLOCK_SIZE / sizeof(u32)];
};
unsigned int position;
spinlock_t batch_lock;
};
/*
* Get a random word for internal kernel use only. The quality of the random
* number is good as /dev/urandom, but there is no backtrack protection, with
* the goal of being quite fast and not depleting entropy. In order to ensure
* that the randomness provided by this function is okay, the function
* wait_for_random_bytes() should be called and return 0 at least once at any
* point prior.
*/
static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64) = {
.batch_lock = __SPIN_LOCK_UNLOCKED(batched_entropy_u64.lock),
};
u64 get_random_u64(void)
{
u64 ret;
unsigned long flags;
struct batched_entropy *batch;
static void *previous;
warn_unseeded_randomness(&previous);
batch = raw_cpu_ptr(&batched_entropy_u64);
spin_lock_irqsave(&batch->batch_lock, flags);
if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) {
extract_crng((u8 *)batch->entropy_u64);
batch->position = 0;
}
ret = batch->entropy_u64[batch->position++];
spin_unlock_irqrestore(&batch->batch_lock, flags);
return ret;
}
EXPORT_SYMBOL(get_random_u64);
static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32) = {
.batch_lock = __SPIN_LOCK_UNLOCKED(batched_entropy_u32.lock),
};
u32 get_random_u32(void)
{
u32 ret;
unsigned long flags;
struct batched_entropy *batch;
static void *previous;
warn_unseeded_randomness(&previous);
batch = raw_cpu_ptr(&batched_entropy_u32);
spin_lock_irqsave(&batch->batch_lock, flags);
if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) { extract_crng((u8 *)batch->entropy_u32);
batch->position = 0;
}
ret = batch->entropy_u32[batch->position++];
spin_unlock_irqrestore(&batch->batch_lock, flags);
return ret;
}
EXPORT_SYMBOL(get_random_u32);
/* It's important to invalidate all potential batched entropy that might
* be stored before the crng is initialized, which we can do lazily by
* simply resetting the counter to zero so that it's re-extracted on the
* next usage. */
static void invalidate_batched_entropy(void)
{
int cpu;
unsigned long flags;
for_each_possible_cpu (cpu) {
struct batched_entropy *batched_entropy;
batched_entropy = per_cpu_ptr(&batched_entropy_u32, cpu);
spin_lock_irqsave(&batched_entropy->batch_lock, flags);
batched_entropy->position = 0;
spin_unlock(&batched_entropy->batch_lock);
batched_entropy = per_cpu_ptr(&batched_entropy_u64, cpu);
spin_lock(&batched_entropy->batch_lock);
batched_entropy->position = 0;
spin_unlock_irqrestore(&batched_entropy->batch_lock, flags);
}
}
/**
* randomize_page - Generate a random, page aligned address
* @start: The smallest acceptable address the caller will take.
* @range: The size of the area, starting at @start, within which the
* random address must fall.
*
* If @start + @range would overflow, @range is capped.
*
* NOTE: Historical use of randomize_range, which this replaces, presumed that
* @start was already page aligned. We now align it regardless.
*
* Return: A page aligned address within [start, start + range). On error,
* @start is returned.
*/
unsigned long
randomize_page(unsigned long start, unsigned long range)
{
if (!PAGE_ALIGNED(start)) {
range -= PAGE_ALIGN(start) - start;
start = PAGE_ALIGN(start);
}
if (start > ULONG_MAX - range)
range = ULONG_MAX - start;
range >>= PAGE_SHIFT;
if (range == 0)
return start;
return start + (get_random_long() % range << PAGE_SHIFT);
}
/* Interface for in-kernel drivers of true hardware RNGs.
* Those devices may produce endless random bits and will be throttled
* when our pool is full.
*/
void add_hwgenerator_randomness(const char *buffer, size_t count,
size_t entropy)
{
struct entropy_store *poolp = &input_pool;
if (unlikely(crng_init == 0)) {
size_t ret = crng_fast_load(buffer, count);
count -= ret;
buffer += ret;
if (!count || crng_init == 0)
return;
}
/* Suspend writing if we're above the trickle threshold.
* We'll be woken up again once below random_write_wakeup_thresh,
* or when the calling thread is about to terminate.
*/
wait_event_interruptible(random_write_wait,
!system_wq || kthread_should_stop() ||
ENTROPY_BITS(&input_pool) <= random_write_wakeup_bits);
mix_pool_bytes(poolp, buffer, count);
credit_entropy_bits(poolp, entropy);
}
EXPORT_SYMBOL_GPL(add_hwgenerator_randomness);
/* Handle random seed passed by bootloader.
* If the seed is trustworthy, it would be regarded as hardware RNGs. Otherwise
* it would be regarded as device data.
* The decision is controlled by CONFIG_RANDOM_TRUST_BOOTLOADER.
*/
void add_bootloader_randomness(const void *buf, unsigned int size)
{
if (IS_ENABLED(CONFIG_RANDOM_TRUST_BOOTLOADER))
add_hwgenerator_randomness(buf, size, size * 8);
else
add_device_randomness(buf, size);
}
EXPORT_SYMBOL_GPL(add_bootloader_randomness);
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
* Copyright (C) 2014 Fujitsu. All rights reserved.
*/
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
#include "async-thread.h"
#include "ctree.h"
enum {
WORK_DONE_BIT,
WORK_ORDER_DONE_BIT,
WORK_HIGH_PRIO_BIT,
};
#define NO_THRESHOLD (-1)
#define DFT_THRESHOLD (32)
struct __btrfs_workqueue {
struct workqueue_struct *normal_wq;
/* File system this workqueue services */
struct btrfs_fs_info *fs_info;
/* List head pointing to ordered work list */
struct list_head ordered_list;
/* Spinlock for ordered_list */
spinlock_t list_lock;
/* Thresholding related variants */
atomic_t pending;
/* Up limit of concurrency workers */
int limit_active;
/* Current number of concurrency workers */
int current_active;
/* Threshold to change current_active */
int thresh;
unsigned int count;
spinlock_t thres_lock;
};
struct btrfs_workqueue {
struct __btrfs_workqueue *normal;
struct __btrfs_workqueue *high;
};
struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
{
return wq->fs_info;
}
struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
{
return work->wq->fs_info;
}
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
{
/*
* We could compare wq->normal->pending with num_online_cpus()
* to support "thresh == NO_THRESHOLD" case, but it requires
* moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
* postpone it until someone needs the support of that case.
*/
if (wq->normal->thresh == NO_THRESHOLD)
return false;
return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
}
static struct __btrfs_workqueue *
__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
unsigned int flags, int limit_active, int thresh)
{
struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
ret->fs_info = fs_info;
ret->limit_active = limit_active;
atomic_set(&ret->pending, 0);
if (thresh == 0)
thresh = DFT_THRESHOLD;
/* For low threshold, disabling threshold is a better choice */
if (thresh < DFT_THRESHOLD) { ret->current_active = limit_active;
ret->thresh = NO_THRESHOLD;
} else {
/*
* For threshold-able wq, let its concurrency grow on demand.
* Use minimal max_active at alloc time to reduce resource
* usage.
*/
ret->current_active = 1;
ret->thresh = thresh;
}
if (flags & WQ_HIGHPRI) ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
ret->current_active, name);
else
ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
ret->current_active, name);
if (!ret->normal_wq) { kfree(ret);
return NULL;
}
INIT_LIST_HEAD(&ret->ordered_list);
spin_lock_init(&ret->list_lock);
spin_lock_init(&ret->thres_lock);
trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
return ret;
}
static inline void
__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
const char *name,
unsigned int flags,
int limit_active,
int thresh)
{
struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
ret->normal = __btrfs_alloc_workqueue(fs_info, name,
flags & ~WQ_HIGHPRI,
limit_active, thresh);
if (!ret->normal) {
kfree(ret);
return NULL;
}
if (flags & WQ_HIGHPRI) { ret->high = __btrfs_alloc_workqueue(fs_info, name, flags,
limit_active, thresh);
if (!ret->high) {
__btrfs_destroy_workqueue(ret->normal);
kfree(ret);
return NULL;
}
}
return ret;
}
/*
* Hook for threshold which will be called in btrfs_queue_work.
* This hook WILL be called in IRQ handler context,
* so workqueue_set_max_active MUST NOT be called in this hook
*/
static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
{
if (wq->thresh == NO_THRESHOLD)
return;
atomic_inc(&wq->pending);
}
/*
* Hook for threshold which will be called before executing the work,
* This hook is called in kthread content.
* So workqueue_set_max_active is called here.
*/
static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
{
int new_current_active;
long pending;
int need_change = 0;
if (wq->thresh == NO_THRESHOLD)
return;
atomic_dec(&wq->pending);
spin_lock(&wq->thres_lock);
/*
* Use wq->count to limit the calling frequency of
* workqueue_set_max_active.
*/
wq->count++;
wq->count %= (wq->thresh / 4);
if (!wq->count)
goto out;
new_current_active = wq->current_active;
/*
* pending may be changed later, but it's OK since we really
* don't need it so accurate to calculate new_max_active.
*/
pending = atomic_read(&wq->pending);
if (pending > wq->thresh)
new_current_active++;
if (pending < wq->thresh / 2)
new_current_active--;
new_current_active = clamp_val(new_current_active, 1, wq->limit_active);
if (new_current_active != wq->current_active) {
need_change = 1;
wq->current_active = new_current_active;
}
out:
spin_unlock(&wq->thres_lock);
if (need_change) {
workqueue_set_max_active(wq->normal_wq, wq->current_active);
}
}
static void run_ordered_work(struct __btrfs_workqueue *wq,
struct btrfs_work *self)
{
struct list_head *list = &wq->ordered_list;
struct btrfs_work *work;
spinlock_t *lock = &wq->list_lock;
unsigned long flags;
bool free_self = false;
while (1) {
spin_lock_irqsave(lock, flags);
if (list_empty(list))
break;
work = list_entry(list->next, struct btrfs_work,
ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
/*
* Orders all subsequent loads after reading WORK_DONE_BIT,
* paired with the smp_mb__before_atomic in btrfs_work_helper
* this guarantees that the ordered function will see all
* updates from ordinary work function.
*/
smp_rmb();
/*
* we are going to call the ordered done function, but
* we leave the work item on the list as a barrier so
* that later work items that are done don't have their
* functions called before this one returns
*/
if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
break;
trace_btrfs_ordered_sched(work);
spin_unlock_irqrestore(lock, flags);
work->ordered_func(work);
/* now take the lock again and drop our item from the list */
spin_lock_irqsave(lock, flags);
list_del(&work->ordered_list);
spin_unlock_irqrestore(lock, flags);
if (work == self) {
/*
* This is the work item that the worker is currently
* executing.
*
* The kernel workqueue code guarantees non-reentrancy
* of work items. I.e., if a work item with the same
* address and work function is queued twice, the second
* execution is blocked until the first one finishes. A
* work item may be freed and recycled with the same
* work function; the workqueue code assumes that the
* original work item cannot depend on the recycled work
* item in that case (see find_worker_executing_work()).
*
* Note that different types of Btrfs work can depend on
* each other, and one type of work on one Btrfs
* filesystem may even depend on the same type of work
* on another Btrfs filesystem via, e.g., a loop device.
* Therefore, we must not allow the current work item to
* be recycled until we are really done, otherwise we
* break the above assumption and can deadlock.
*/
free_self = true;
} else {
/*
* We don't want to call the ordered free functions with
* the lock held.
*/
work->ordered_free(work);
/* NB: work must not be dereferenced past this point. */
trace_btrfs_all_work_done(wq->fs_info, work);
}
}
spin_unlock_irqrestore(lock, flags);
if (free_self) {
self->ordered_free(self);
/* NB: self must not be dereferenced past this point. */
trace_btrfs_all_work_done(wq->fs_info, self);
}
}
static void btrfs_work_helper(struct work_struct *normal_work)
{
struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
normal_work);
struct __btrfs_workqueue *wq;
int need_order = 0;
/*
* We should not touch things inside work in the following cases:
* 1) after work->func() if it has no ordered_free
* Since the struct is freed in work->func().
* 2) after setting WORK_DONE_BIT
* The work may be freed in other threads almost instantly.
* So we save the needed things here.
*/
if (work->ordered_func)
need_order = 1;
wq = work->wq;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
work->func(work);
if (need_order) {
/*
* Ensures all memory accesses done in the work function are
* ordered before setting the WORK_DONE_BIT. Ensuring the thread
* which is going to executed the ordered work sees them.
* Pairs with the smp_rmb in run_ordered_work.
*/
smp_mb__before_atomic();
set_bit(WORK_DONE_BIT, &work->flags);
run_ordered_work(wq, work);
} else {
/* NB: work must not be dereferenced past this point. */
trace_btrfs_all_work_done(wq->fs_info, work);
}
}
void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
btrfs_func_t ordered_func, btrfs_func_t ordered_free)
{
work->func = func;
work->ordered_func = ordered_func;
work->ordered_free = ordered_free;
INIT_WORK(&work->normal_work, btrfs_work_helper);
INIT_LIST_HEAD(&work->ordered_list);
work->flags = 0;
}
static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
struct btrfs_work *work)
{
unsigned long flags;
work->wq = wq;
thresh_queue_hook(wq);
if (work->ordered_func) {
spin_lock_irqsave(&wq->list_lock, flags);
list_add_tail(&work->ordered_list, &wq->ordered_list);
spin_unlock_irqrestore(&wq->list_lock, flags);
}
trace_btrfs_work_queued(work);
queue_work(wq->normal_wq, &work->normal_work);
}
void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work)
{
struct __btrfs_workqueue *dest_wq;
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
dest_wq = wq->high;
else
dest_wq = wq->normal;
__btrfs_queue_work(dest_wq, work);
}
static inline void
__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
{
destroy_workqueue(wq->normal_wq);
trace_btrfs_workqueue_destroy(wq);
kfree(wq);
}
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
if (!wq)
return;
if (wq->high)
__btrfs_destroy_workqueue(wq->high);
__btrfs_destroy_workqueue(wq->normal);
kfree(wq);
}
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
{
if (!wq)
return;
wq->normal->limit_active = limit_active;
if (wq->high)
wq->high->limit_active = limit_active;
}
void btrfs_set_work_high_priority(struct btrfs_work *work)
{
set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
}
void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
{
if (wq->high) flush_workqueue(wq->high->normal_wq); flush_workqueue(wq->normal->normal_wq);
}
// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/super.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* super.c contains code to handle: - mount structures
* - super-block tables
* - filesystem drivers list
* - mount system call
* - umount system call
* - ustat system call
*
* GK 2/5/95 - Changed to support mounting the root fs via NFS
*
* Added kerneld support: Jacques Gelinas and Bjorn Ekwall
* Added change_root: Werner Almesberger & Hans Lermen, Feb '96
* Added options to /proc/mounts:
* Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
* Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
* Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
*/
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/writeback.h> /* for the emergency remount stuff */
#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/rculist_bl.h>
#include <linux/cleancache.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
#include <linux/fs_context.h>
#include <uapi/linux/mount.h>
#include "internal.h"
static int thaw_super_locked(struct super_block *sb);
static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);
static char *sb_writers_name[SB_FREEZE_LEVELS] = {
"sb_writers",
"sb_pagefaults",
"sb_internal",
};
/*
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
* If that happens we could trigger unregistering the shrinker from within the
* shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
* take a passive reference to the superblock to avoid this from occurring.
*/
static unsigned long super_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct super_block *sb;
long fs_objects = 0;
long total_objects;
long freed = 0;
long dentries;
long inodes;
sb = container_of(shrink, struct super_block, s_shrink);
/*
* Deadlock avoidance. We may hold various FS locks, and we don't want
* to recurse into the FS that called us in clear_inode() and friends..
*/
if (!(sc->gfp_mask & __GFP_FS))
return SHRINK_STOP;
if (!trylock_super(sb))
return SHRINK_STOP;
if (sb->s_op->nr_cached_objects)
fs_objects = sb->s_op->nr_cached_objects(sb, sc);
inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
total_objects = dentries + inodes + fs_objects + 1;
if (!total_objects)
total_objects = 1;
/* proportion the scan between the caches */
dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
/*
* prune the dcache first as the icache is pinned by it, then
* prune the icache, followed by the filesystem specific caches
*
* Ensure that we always scan at least one object - memcg kmem
* accounting uses this to fully empty the caches.
*/
sc->nr_to_scan = dentries + 1;
freed = prune_dcache_sb(sb, sc);
sc->nr_to_scan = inodes + 1;
freed += prune_icache_sb(sb, sc);
if (fs_objects) {
sc->nr_to_scan = fs_objects + 1;
freed += sb->s_op->free_cached_objects(sb, sc);
}
up_read(&sb->s_umount);
return freed;
}
static unsigned long super_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct super_block *sb;
long total_objects = 0;
sb = container_of(shrink, struct super_block, s_shrink);
/*
* We don't call trylock_super() here as it is a scalability bottleneck,
* so we're exposed to partial setup state. The shrinker rwsem does not
* protect filesystem operations backing list_lru_shrink_count() or
* s_op->nr_cached_objects(). Counts can change between
* super_cache_count and super_cache_scan, so we really don't need locks
* here.
*
* However, if we are currently mounting the superblock, the underlying
* filesystem might be in a state of partial construction and hence it
* is dangerous to access it. trylock_super() uses a SB_BORN check to
* avoid this situation, so do the same here. The memory barrier is
* matched with the one in mount_fs() as we don't hold locks here.
*/
if (!(sb->s_flags & SB_BORN))
return 0;
smp_rmb();
if (sb->s_op && sb->s_op->nr_cached_objects)
total_objects = sb->s_op->nr_cached_objects(sb, sc);
total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
if (!total_objects)
return SHRINK_EMPTY;
total_objects = vfs_pressure_ratio(total_objects);
return total_objects;
}
static void destroy_super_work(struct work_struct *work)
{
struct super_block *s = container_of(work, struct super_block,
destroy_work);
int i;
for (i = 0; i < SB_FREEZE_LEVELS; i++)
percpu_free_rwsem(&s->s_writers.rw_sem[i]);
kfree(s);
}
static void destroy_super_rcu(struct rcu_head *head)
{
struct super_block *s = container_of(head, struct super_block, rcu);
INIT_WORK(&s->destroy_work, destroy_super_work);
schedule_work(&s->destroy_work);
}
/* Free a superblock that has never been seen by anyone */
static void destroy_unused_super(struct super_block *s)
{
if (!s)
return;
up_write(&s->s_umount);
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
security_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
free_prealloced_shrinker(&s->s_shrink);
/* no delays needed */
destroy_super_work(&s->destroy_work);
}
/**
* alloc_super - create new superblock
* @type: filesystem type superblock should belong to
* @flags: the mount flags
* @user_ns: User namespace for the super_block
*
* Allocates and initializes a new &struct super_block. alloc_super()
* returns a pointer new superblock or %NULL if allocation had failed.
*/
static struct super_block *alloc_super(struct file_system_type *type, int flags,
struct user_namespace *user_ns)
{
struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
static const struct super_operations default_op;
int i;
if (!s)
return NULL;
INIT_LIST_HEAD(&s->s_mounts);
s->s_user_ns = get_user_ns(user_ns);
init_rwsem(&s->s_umount);
lockdep_set_class(&s->s_umount, &type->s_umount_key);
/*
* sget() can have s_umount recursion.
*
* When it cannot find a suitable sb, it allocates a new
* one (this one), and tries again to find a suitable old
* one.
*
* In case that succeeds, it will acquire the s_umount
* lock of the old one. Since these are clearly distrinct
* locks, and this object isn't exposed yet, there's no
* risk of deadlocks.
*
* Annotate this by putting this lock in a different
* subclass.
*/
down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
if (security_sb_alloc(s))
goto fail;
for (i = 0; i < SB_FREEZE_LEVELS; i++) { if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
sb_writers_name[i],
&type->s_writers_key[i]))
goto fail;
}
init_waitqueue_head(&s->s_writers.wait_unfrozen);
s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags;
if (s->s_user_ns != &init_user_ns)
s->s_iflags |= SB_I_NODEV;
INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_roots);
mutex_init(&s->s_sync_lock);
INIT_LIST_HEAD(&s->s_inodes);
spin_lock_init(&s->s_inode_list_lock);
INIT_LIST_HEAD(&s->s_inodes_wb);
spin_lock_init(&s->s_inode_wblist_lock);
s->s_count = 1;
atomic_set(&s->s_active, 1);
mutex_init(&s->s_vfs_rename_mutex);
lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
init_rwsem(&s->s_dquot.dqio_sem);
s->s_maxbytes = MAX_NON_LFS;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
s->s_time_min = TIME64_MIN;
s->s_time_max = TIME64_MAX;
s->cleancache_poolid = CLEANCACHE_NO_POOL;
s->s_shrink.seeks = DEFAULT_SEEKS;
s->s_shrink.scan_objects = super_cache_scan;
s->s_shrink.count_objects = super_cache_count;
s->s_shrink.batch = 1024;
s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
if (prealloc_shrinker(&s->s_shrink))
goto fail;
if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
goto fail;
if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
goto fail;
return s;
fail:
destroy_unused_super(s);
return NULL;
}
/* Superblock refcounting */
/*
* Drop a superblock's refcount. The caller must hold sb_lock.
*/
static void __put_super(struct super_block *s)
{
if (!--s->s_count) { list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s);
fscrypt_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
call_rcu(&s->rcu, destroy_super_rcu);
}
}
/**
* put_super - drop a temporary reference to superblock
* @sb: superblock in question
*
* Drops a temporary reference, frees superblock if there's no
* references left.
*/
void put_super(struct super_block *sb)
{
spin_lock(&sb_lock);
__put_super(sb);
spin_unlock(&sb_lock);
}
/**
* deactivate_locked_super - drop an active reference to superblock
* @s: superblock to deactivate
*
* Drops an active reference to superblock, converting it into a temporary
* one if there is no other active references left. In that case we
* tell fs driver to shut it down and drop the temporary reference we
* had just acquired.
*
* Caller holds exclusive lock on superblock; that lock is released.
*/
void deactivate_locked_super(struct super_block *s)
{
struct file_system_type *fs = s->s_type;
if (atomic_dec_and_test(&s->s_active)) {
cleancache_invalidate_fs(s);
unregister_shrinker(&s->s_shrink);
fs->kill_sb(s);
/*
* Since list_lru_destroy() may sleep, we cannot call it from
* put_super(), where we hold the sb_lock. Therefore we destroy
* the lru lists right now.
*/
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
put_filesystem(fs);
put_super(s);
} else {
up_write(&s->s_umount);
}
}
EXPORT_SYMBOL(deactivate_locked_super);
/**
* deactivate_super - drop an active reference to superblock
* @s: superblock to deactivate
*
* Variant of deactivate_locked_super(), except that superblock is *not*
* locked by caller. If we are going to drop the final active reference,
* lock will be acquired prior to that.
*/
void deactivate_super(struct super_block *s)
{
if (!atomic_add_unless(&s->s_active, -1, 1)) { down_write(&s->s_umount);
deactivate_locked_super(s);
}
}
EXPORT_SYMBOL(deactivate_super);
/**
* grab_super - acquire an active reference
* @s: reference we are trying to make active
*
* Tries to acquire an active reference. grab_super() is used when we
* had just found a superblock in super_blocks or fs_type->fs_supers
* and want to turn it into a full-blown active reference. grab_super()
* is called with sb_lock held and drops it. Returns 1 in case of
* success, 0 if we had failed (superblock contents was already dead or
* dying when grab_super() had been called). Note that this is only
* called for superblocks not in rundown mode (== ones still on ->fs_supers
* of their type), so increment of ->s_count is OK here.
*/
static int grab_super(struct super_block *s) __releases(sb_lock)
{
s->s_count++;
spin_unlock(&sb_lock);
down_write(&s->s_umount);
if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) { put_super(s); return 1;
}
up_write(&s->s_umount);
put_super(s);
return 0;
}
/*
* trylock_super - try to grab ->s_umount shared
* @sb: reference we are trying to grab
*
* Try to prevent fs shutdown. This is used in places where we
* cannot take an active reference but we need to ensure that the
* filesystem is not shut down while we are working on it. It returns
* false if we cannot acquire s_umount or if we lose the race and
* filesystem already got into shutdown, and returns true with the s_umount
* lock held in read mode in case of success. On successful return,
* the caller must drop the s_umount lock when done.
*
* Note that unlike get_super() et.al. this one does *not* bump ->s_count.
* The reason why it's safe is that we are OK with doing trylock instead
* of down_read(). There's a couple of places that are OK with that, but
* it's very much not a general-purpose interface.
*/
bool trylock_super(struct super_block *sb)
{
if (down_read_trylock(&sb->s_umount)) {
if (!hlist_unhashed(&sb->s_instances) &&
sb->s_root && (sb->s_flags & SB_BORN))
return true;
up_read(&sb->s_umount);
}
return false;
}
/**
* generic_shutdown_super - common helper for ->kill_sb()
* @sb: superblock to kill
*
* generic_shutdown_super() does all fs-independent work on superblock
* shutdown. Typical ->kill_sb() should pick all fs-specific objects
* that need destruction out of superblock, call generic_shutdown_super()
* and release aforementioned objects. Note: dentries and inodes _are_
* taken care of and do not need specific handling.
*
* Upon calling this function, the filesystem may no longer alter or
* rearrange the set of dentries belonging to this super_block, nor may it
* change the attachments of dentries to inodes.
*/
void generic_shutdown_super(struct super_block *sb)
{
const struct super_operations *sop = sb->s_op; if (sb->s_root) {
shrink_dcache_for_umount(sb);
sync_filesystem(sb);
sb->s_flags &= ~SB_ACTIVE;
cgroup_writeback_umount();
/* evict all inodes with zero refcount */
evict_inodes(sb);
/* only nonzero refcount inodes can have marks */
fsnotify_sb_delete(sb);
security_sb_delete(sb);
if (sb->s_dio_done_wq) {
destroy_workqueue(sb->s_dio_done_wq);
sb->s_dio_done_wq = NULL;
}
if (sop->put_super) sop->put_super(sb); if (!list_empty(&sb->s_inodes)) {
printk("VFS: Busy inodes after unmount of %s. "
"Self-destruct in 5 seconds. Have a nice day...\n",
sb->s_id);
}
}
spin_lock(&sb_lock);
/* should be initialized for __put_super_and_need_restart() */
hlist_del_init(&sb->s_instances);
spin_unlock(&sb_lock);
up_write(&sb->s_umount);
if (sb->s_bdi != &noop_backing_dev_info) {
bdi_put(sb->s_bdi);
sb->s_bdi = &noop_backing_dev_info;
}
}
EXPORT_SYMBOL(generic_shutdown_super);
bool mount_capable(struct fs_context *fc)
{
if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) return capable(CAP_SYS_ADMIN);
else
return ns_capable(fc->user_ns, CAP_SYS_ADMIN);
}
/**
* sget_fc - Find or create a superblock
* @fc: Filesystem context.
* @test: Comparison callback
* @set: Setup callback
*
* Find or create a superblock using the parameters stored in the filesystem
* context and the two callback functions.
*
* If an extant superblock is matched, then that will be returned with an
* elevated reference count that the caller must transfer or discard.
*
* If no match is made, a new superblock will be allocated and basic
* initialisation will be performed (s_type, s_fs_info and s_id will be set and
* the set() callback will be invoked), the superblock will be published and it
* will be returned in a partially constructed state with SB_BORN and SB_ACTIVE
* as yet unset.
*/
struct super_block *sget_fc(struct fs_context *fc,
int (*test)(struct super_block *, struct fs_context *),
int (*set)(struct super_block *, struct fs_context *))
{
struct super_block *s = NULL;
struct super_block *old;
struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns;
int err;
retry:
spin_lock(&sb_lock);
if (test) {
hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
if (test(old, fc))
goto share_extant_sb;
}
}
if (!s) {
spin_unlock(&sb_lock);
s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
}
s->s_fs_info = fc->s_fs_info;
err = set(s, fc);
if (err) {
s->s_fs_info = NULL;
spin_unlock(&sb_lock);
destroy_unused_super(s);
return ERR_PTR(err);
}
fc->s_fs_info = NULL;
s->s_type = fc->fs_type;
s->s_iflags |= fc->s_iflags;
strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
list_add_tail(&s->s_list, &super_blocks);
hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(s->s_type);
register_shrinker_prepared(&s->s_shrink);
return s;
share_extant_sb:
if (user_ns != old->s_user_ns) {
spin_unlock(&sb_lock);
destroy_unused_super(s);
return ERR_PTR(-EBUSY);
}
if (!grab_super(old))
goto retry;
destroy_unused_super(s);
return old;
}
EXPORT_SYMBOL(sget_fc);
/**
* sget - find or create a superblock
* @type: filesystem type superblock should belong to
* @test: comparison callback
* @set: setup callback
* @flags: mount flags
* @data: argument to each of them
*/
struct super_block *sget(struct file_system_type *type,
int (*test)(struct super_block *,void *),
int (*set)(struct super_block *,void *),
int flags,
void *data)
{
struct user_namespace *user_ns = current_user_ns();
struct super_block *s = NULL;
struct super_block *old;
int err;
/* We don't yet pass the user namespace of the parent
* mount through to here so always use &init_user_ns
* until that changes.
*/
if (flags & SB_SUBMOUNT)
user_ns = &init_user_ns;
retry:
spin_lock(&sb_lock);
if (test) {
hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data))
continue;
if (user_ns != old->s_user_ns) {
spin_unlock(&sb_lock);
destroy_unused_super(s);
return ERR_PTR(-EBUSY);
}
if (!grab_super(old))
goto retry;
destroy_unused_super(s);
return old;
}
}
if (!s) {
spin_unlock(&sb_lock);
s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
}
err = set(s, data);
if (err) {
spin_unlock(&sb_lock);
destroy_unused_super(s);
return ERR_PTR(err);
}
s->s_type = type;
strlcpy(s->s_id, type->name, sizeof(s->s_id));
list_add_tail(&s->s_list, &super_blocks);
hlist_add_head(&s->s_instances, &type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(type);
register_shrinker_prepared(&s->s_shrink);
return s;
}
EXPORT_SYMBOL(sget);
void drop_super(struct super_block *sb)
{
up_read(&sb->s_umount);
put_super(sb);
}
EXPORT_SYMBOL(drop_super);
void drop_super_exclusive(struct super_block *sb)
{
up_write(&sb->s_umount);
put_super(sb);
}
EXPORT_SYMBOL(drop_super_exclusive);
static void __iterate_supers(void (*f)(struct super_block *))
{
struct super_block *sb, *p = NULL;
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (hlist_unhashed(&sb->s_instances))
continue;
sb->s_count++;
spin_unlock(&sb_lock);
f(sb);
spin_lock(&sb_lock);
if (p)
__put_super(p);
p = sb;
}
if (p)
__put_super(p);
spin_unlock(&sb_lock);
}
/**
* iterate_supers - call function for all active superblocks
* @f: function to call
* @arg: argument to pass to it
*
* Scans the superblock list and calls given function, passing it
* locked superblock and given argument.
*/
void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
{
struct super_block *sb, *p = NULL;
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (hlist_unhashed(&sb->s_instances))
continue;
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
if (sb->s_root && (sb->s_flags & SB_BORN))
f(sb, arg);
up_read(&sb->s_umount);
spin_lock(&sb_lock);
if (p)
__put_super(p);
p = sb;
}
if (p)
__put_super(p);
spin_unlock(&sb_lock);
}
/**
* iterate_supers_type - call function for superblocks of given type
* @type: fs type
* @f: function to call
* @arg: argument to pass to it
*
* Scans the superblock list and calls given function, passing it
* locked superblock and given argument.
*/
void iterate_supers_type(struct file_system_type *type,
void (*f)(struct super_block *, void *), void *arg)
{
struct super_block *sb, *p = NULL;
spin_lock(&sb_lock);
hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
if (sb->s_root && (sb->s_flags & SB_BORN))
f(sb, arg);
up_read(&sb->s_umount);
spin_lock(&sb_lock);
if (p)
__put_super(p);
p = sb;
}
if (p)
__put_super(p);
spin_unlock(&sb_lock);
}
EXPORT_SYMBOL(iterate_supers_type);
/**
* get_super - get the superblock of a device
* @bdev: device to get the superblock for
*
* Scans the superblock list and finds the superblock of the file system
* mounted on the device given. %NULL is returned if no match is found.
*/
struct super_block *get_super(struct block_device *bdev)
{
struct super_block *sb;
if (!bdev)
return NULL;
spin_lock(&sb_lock);
rescan:
list_for_each_entry(sb, &super_blocks, s_list) { if (hlist_unhashed(&sb->s_instances))
continue;
if (sb->s_bdev == bdev) { sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
/* still alive? */
if (sb->s_root && (sb->s_flags & SB_BORN))
return sb;
up_read(&sb->s_umount);
/* nope, got unmounted */
spin_lock(&sb_lock);
__put_super(sb);
goto rescan;
}
}
spin_unlock(&sb_lock);
return NULL;
}
/**
* get_active_super - get an active reference to the superblock of a device
* @bdev: device to get the superblock for
*
* Scans the superblock list and finds the superblock of the file system
* mounted on the device given. Returns the superblock with an active
* reference or %NULL if none was found.
*/
struct super_block *get_active_super(struct block_device *bdev)
{
struct super_block *sb;
if (!bdev)
return NULL;
restart:
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (hlist_unhashed(&sb->s_instances))
continue;
if (sb->s_bdev == bdev) {
if (!grab_super(sb))
goto restart;
up_write(&sb->s_umount);
return sb;
}
}
spin_unlock(&sb_lock);
return NULL;
}
struct super_block *user_get_super(dev_t dev, bool excl)
{
struct super_block *sb;
spin_lock(&sb_lock);
rescan:
list_for_each_entry(sb, &super_blocks, s_list) {
if (hlist_unhashed(&sb->s_instances))
continue;
if (sb->s_dev == dev) {
sb->s_count++;
spin_unlock(&sb_lock);
if (excl)
down_write(&sb->s_umount);
else
down_read(&sb->s_umount);
/* still alive? */
if (sb->s_root && (sb->s_flags & SB_BORN))
return sb;
if (excl)
up_write(&sb->s_umount);
else
up_read(&sb->s_umount);
/* nope, got unmounted */
spin_lock(&sb_lock);
__put_super(sb);
goto rescan;
}
}
spin_unlock(&sb_lock);
return NULL;
}
/**
* reconfigure_super - asks filesystem to change superblock parameters
* @fc: The superblock and configuration
*
* Alters the configuration parameters of a live superblock.
*/
int reconfigure_super(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
int retval;
bool remount_ro = false;
bool force = fc->sb_flags & SB_FORCE;
if (fc->sb_flags_mask & ~MS_RMT_MASK)
return -EINVAL;
if (sb->s_writers.frozen != SB_UNFROZEN)
return -EBUSY;
retval = security_sb_remount(sb, fc->security);
if (retval)
return retval;
if (fc->sb_flags_mask & SB_RDONLY) {
#ifdef CONFIG_BLOCK
if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && bdev_read_only(sb->s_bdev))
return -EACCES;
#endif
remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb);
}
if (remount_ro) {
if (!hlist_empty(&sb->s_pins)) {
up_write(&sb->s_umount);
group_pin_kill(&sb->s_pins);
down_write(&sb->s_umount);
if (!sb->s_root)
return 0;
if (sb->s_writers.frozen != SB_UNFROZEN)
return -EBUSY;
remount_ro = !sb_rdonly(sb);
}
}
shrink_dcache_sb(sb);
/* If we are reconfiguring to RDONLY and current sb is read/write,
* make sure there are no files open for writing.
*/
if (remount_ro) {
if (force) { sb->s_readonly_remount = 1;
smp_wmb();
} else {
retval = sb_prepare_remount_readonly(sb);
if (retval)
return retval;
}
}
if (fc->ops->reconfigure) { retval = fc->ops->reconfigure(fc);
if (retval) {
if (!force)
goto cancel_readonly;
/* If forced remount, go ahead despite any errors */
WARN(1, "forced remount of a %s fs returned %i\n",
sb->s_type->name, retval);
}
}
WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) |
(fc->sb_flags & fc->sb_flags_mask)));
/* Needs to be ordered wrt mnt_is_readonly() */
smp_wmb();
sb->s_readonly_remount = 0;
/*
* Some filesystems modify their metadata via some other path than the
* bdev buffer cache (eg. use a private mapping, or directories in
* pagecache, etc). Also file data modifications go via their own
* mappings. So If we try to mount readonly then copy the filesystem
* from bdev, we could get stale data, so invalidate it to give a best
* effort at coherency.
*/
if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev);
return 0;
cancel_readonly:
sb->s_readonly_remount = 0; return retval;
}
static void do_emergency_remount_callback(struct super_block *sb)
{
down_write(&sb->s_umount);
if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
!sb_rdonly(sb)) {
struct fs_context *fc;
fc = fs_context_for_reconfigure(sb->s_root,
SB_RDONLY | SB_FORCE, SB_RDONLY);
if (!IS_ERR(fc)) {
if (parse_monolithic_mount_data(fc, NULL) == 0)
(void)reconfigure_super(fc);
put_fs_context(fc);
}
}
up_write(&sb->s_umount);
}
static void do_emergency_remount(struct work_struct *work)
{
__iterate_supers(do_emergency_remount_callback);
kfree(work);
printk("Emergency Remount complete\n");
}
void emergency_remount(void)
{
struct work_struct *work;
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (work) {
INIT_WORK(work, do_emergency_remount);
schedule_work(work);
}
}
static void do_thaw_all_callback(struct super_block *sb)
{
down_write(&sb->s_umount);
if (sb->s_root && sb->s_flags & SB_BORN) {
emergency_thaw_bdev(sb);
thaw_super_locked(sb);
} else {
up_write(&sb->s_umount);
}
}
static void do_thaw_all(struct work_struct *work)
{
__iterate_supers(do_thaw_all_callback);
kfree(work);
printk(KERN_WARNING "Emergency Thaw complete\n");
}
/**
* emergency_thaw_all -- forcibly thaw every frozen filesystem
*
* Used for emergency unfreeze of all filesystems via SysRq
*/
void emergency_thaw_all(void)
{
struct work_struct *work;
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (work) {
INIT_WORK(work, do_thaw_all);
schedule_work(work);
}
}
static DEFINE_IDA(unnamed_dev_ida);
/**
* get_anon_bdev - Allocate a block device for filesystems which don't have one.
* @p: Pointer to a dev_t.
*
* Filesystems which don't use real block devices can call this function
* to allocate a virtual block device.
*
* Context: Any context. Frequently called while holding sb_lock.
* Return: 0 on success, -EMFILE if there are no anonymous bdevs left
* or -ENOMEM if memory allocation failed.
*/
int get_anon_bdev(dev_t *p)
{
int dev;
/*
* Many userspace utilities consider an FSID of 0 invalid.
* Always return at least 1 from get_anon_bdev.
*/
dev = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1,
GFP_ATOMIC);
if (dev == -ENOSPC)
dev = -EMFILE;
if (dev < 0)
return dev;
*p = MKDEV(0, dev); return 0;
}
EXPORT_SYMBOL(get_anon_bdev);
void free_anon_bdev(dev_t dev)
{
ida_free(&unnamed_dev_ida, MINOR(dev));
}
EXPORT_SYMBOL(free_anon_bdev);
int set_anon_super(struct super_block *s, void *data)
{
return get_anon_bdev(&s->s_dev);
}
EXPORT_SYMBOL(set_anon_super);
void kill_anon_super(struct super_block *sb)
{
dev_t dev = sb->s_dev;
generic_shutdown_super(sb);
free_anon_bdev(dev);
}
EXPORT_SYMBOL(kill_anon_super);
void kill_litter_super(struct super_block *sb)
{
if (sb->s_root)
d_genocide(sb->s_root);
kill_anon_super(sb);
}
EXPORT_SYMBOL(kill_litter_super);
int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
{
return set_anon_super(sb, NULL);
}
EXPORT_SYMBOL(set_anon_super_fc);
static int test_keyed_super(struct super_block *sb, struct fs_context *fc)
{
return sb->s_fs_info == fc->s_fs_info;
}
static int test_single_super(struct super_block *s, struct fs_context *fc)
{
return 1;
}
/**
* vfs_get_super - Get a superblock with a search key set in s_fs_info.
* @fc: The filesystem context holding the parameters
* @keying: How to distinguish superblocks
* @fill_super: Helper to initialise a new superblock
*
* Search for a superblock and create a new one if not found. The search
* criterion is controlled by @keying. If the search fails, a new superblock
* is created and @fill_super() is called to initialise it.
*
* @keying can take one of a number of values:
*
* (1) vfs_get_single_super - Only one superblock of this type may exist on the
* system. This is typically used for special system filesystems.
*
* (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have
* distinct keys (where the key is in s_fs_info). Searching for the same
* key again will turn up the superblock for that key.
*
* (3) vfs_get_independent_super - Multiple superblocks may exist and are
* unkeyed. Each call will get a new superblock.
*
* A permissions check is made by sget_fc() unless we're getting a superblock
* for a kernel-internal mount or a submount.
*/
int vfs_get_super(struct fs_context *fc,
enum vfs_get_super_keying keying,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
int (*test)(struct super_block *, struct fs_context *);
struct super_block *sb;
int err;
switch (keying) {
case vfs_get_single_super:
case vfs_get_single_reconf_super:
test = test_single_super;
break;
case vfs_get_keyed_super:
test = test_keyed_super;
break;
case vfs_get_independent_super:
test = NULL;
break;
default:
BUG();
}
sb = sget_fc(fc, test, set_anon_super_fc);
if (IS_ERR(sb))
return PTR_ERR(sb);
if (!sb->s_root) {
err = fill_super(sb, fc);
if (err)
goto error;
sb->s_flags |= SB_ACTIVE;
fc->root = dget(sb->s_root);
} else {
fc->root = dget(sb->s_root);
if (keying == vfs_get_single_reconf_super) {
err = reconfigure_super(fc);
if (err < 0) {
dput(fc->root);
fc->root = NULL;
goto error;
}
}
}
return 0;
error:
deactivate_locked_super(sb);
return err;
}
EXPORT_SYMBOL(vfs_get_super);
int get_tree_nodev(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
return vfs_get_super(fc, vfs_get_independent_super, fill_super);
}
EXPORT_SYMBOL(get_tree_nodev);
int get_tree_single(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
return vfs_get_super(fc, vfs_get_single_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single);
int get_tree_single_reconf(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
return vfs_get_super(fc, vfs_get_single_reconf_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single_reconf);
int get_tree_keyed(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc),
void *key)
{
fc->s_fs_info = key;
return vfs_get_super(fc, vfs_get_keyed_super, fill_super);
}
EXPORT_SYMBOL(get_tree_keyed);
#ifdef CONFIG_BLOCK
static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue))
s->s_iflags |= SB_I_STABLE_WRITES;
return 0;
}
static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
{
return set_bdev_super(s, fc->sget_key);
}
static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
{
return s->s_bdev == fc->sget_key;
}
/**
* get_tree_bdev - Get a superblock based on a single block device
* @fc: The filesystem context holding the parameters
* @fill_super: Helper to initialise a new superblock
*/
int get_tree_bdev(struct fs_context *fc,
int (*fill_super)(struct super_block *,
struct fs_context *))
{
struct block_device *bdev;
struct super_block *s;
fmode_t mode = FMODE_READ | FMODE_EXCL;
int error = 0;
if (!(fc->sb_flags & SB_RDONLY))
mode |= FMODE_WRITE;
if (!fc->source)
return invalf(fc, "No source specified");
bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type);
if (IS_ERR(bdev)) {
errorf(fc, "%s: Can't open blockdev", fc->source);
return PTR_ERR(bdev);
}
/* Once the superblock is inserted into the list by sget_fc(), s_umount
* will protect the lockfs code from trying to start a snapshot while
* we are mounting
*/
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (bdev->bd_fsfreeze_count > 0) {
mutex_unlock(&bdev->bd_fsfreeze_mutex);
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
blkdev_put(bdev, mode);
return -EBUSY;
}
fc->sb_flags |= SB_NOSEC;
fc->sget_key = bdev;
s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
if (IS_ERR(s)) {
blkdev_put(bdev, mode);
return PTR_ERR(s);
}
if (s->s_root) {
/* Don't summarily change the RO/RW state. */
if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
warnf(fc, "%pg: Can't mount, would change RO state", bdev);
deactivate_locked_super(s);
blkdev_put(bdev, mode);
return -EBUSY;
}
/*
* s_umount nests inside open_mutex during
* __invalidate_device(). blkdev_put() acquires
* open_mutex and can't be called under s_umount. Drop
* s_umount temporarily. This is safe as we're
* holding an active reference.
*/
up_write(&s->s_umount);
blkdev_put(bdev, mode);
down_write(&s->s_umount);
} else {
s->s_mode = mode;
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, fc);
if (error) {
deactivate_locked_super(s);
return error;
}
s->s_flags |= SB_ACTIVE;
bdev->bd_super = s;
}
BUG_ON(fc->root);
fc->root = dget(s->s_root);
return 0;
}
EXPORT_SYMBOL(get_tree_bdev);
static int test_bdev_super(struct super_block *s, void *data)
{
return (void *)s->s_bdev == data;
}
struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int))
{
struct block_device *bdev;
struct super_block *s;
fmode_t mode = FMODE_READ | FMODE_EXCL;
int error = 0;
if (!(flags & SB_RDONLY))
mode |= FMODE_WRITE;
bdev = blkdev_get_by_path(dev_name, mode, fs_type);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
/*
* once the super is inserted into the list by sget, s_umount
* will protect the lockfs code from trying to start a snapshot
* while we are mounting
*/
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (bdev->bd_fsfreeze_count > 0) {
mutex_unlock(&bdev->bd_fsfreeze_mutex);
error = -EBUSY;
goto error_bdev;
}
s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
bdev);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
if (IS_ERR(s))
goto error_s;
if (s->s_root) {
if ((flags ^ s->s_flags) & SB_RDONLY) {
deactivate_locked_super(s);
error = -EBUSY;
goto error_bdev;
}
/*
* s_umount nests inside open_mutex during
* __invalidate_device(). blkdev_put() acquires
* open_mutex and can't be called under s_umount. Drop
* s_umount temporarily. This is safe as we're
* holding an active reference.
*/
up_write(&s->s_umount);
blkdev_put(bdev, mode);
down_write(&s->s_umount);
} else {
s->s_mode = mode;
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
if (error) {
deactivate_locked_super(s);
goto error;
}
s->s_flags |= SB_ACTIVE;
bdev->bd_super = s;
}
return dget(s->s_root);
error_s:
error = PTR_ERR(s);
error_bdev:
blkdev_put(bdev, mode);
error:
return ERR_PTR(error);
}
EXPORT_SYMBOL(mount_bdev);
void kill_block_super(struct super_block *sb)
{
struct block_device *bdev = sb->s_bdev;
fmode_t mode = sb->s_mode;
bdev->bd_super = NULL;
generic_shutdown_super(sb);
sync_blockdev(bdev);
WARN_ON_ONCE(!(mode & FMODE_EXCL));
blkdev_put(bdev, mode | FMODE_EXCL);
}
EXPORT_SYMBOL(kill_block_super);
#endif
struct dentry *mount_nodev(struct file_system_type *fs_type,
int flags, void *data,
int (*fill_super)(struct super_block *, void *, int))
{
int error;
struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);
if (IS_ERR(s))
return ERR_CAST(s);
error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
if (error) {
deactivate_locked_super(s);
return ERR_PTR(error);
}
s->s_flags |= SB_ACTIVE;
return dget(s->s_root);
}
EXPORT_SYMBOL(mount_nodev);
int reconfigure_single(struct super_block *s,
int flags, void *data)
{
struct fs_context *fc;
int ret;
/* The caller really need to be passing fc down into mount_single(),
* then a chunk of this can be removed. [Bollocks -- AV]
* Better yet, reconfiguration shouldn't happen, but rather the second
* mount should be rejected if the parameters are not compatible.
*/
fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK);
if (IS_ERR(fc))
return PTR_ERR(fc);
ret = parse_monolithic_mount_data(fc, data);
if (ret < 0)
goto out;
ret = reconfigure_super(fc);
out:
put_fs_context(fc);
return ret;
}
static int compare_single(struct super_block *s, void *p)
{
return 1;
}
struct dentry *mount_single(struct file_system_type *fs_type,
int flags, void *data,
int (*fill_super)(struct super_block *, void *, int))
{
struct super_block *s;
int error;
s = sget(fs_type, compare_single, set_anon_super, flags, NULL);
if (IS_ERR(s))
return ERR_CAST(s);
if (!s->s_root) {
error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
if (!error)
s->s_flags |= SB_ACTIVE;
} else {
error = reconfigure_single(s, flags, data);
}
if (unlikely(error)) {
deactivate_locked_super(s);
return ERR_PTR(error);
}
return dget(s->s_root);
}
EXPORT_SYMBOL(mount_single);
/**
* vfs_get_tree - Get the mountable root
* @fc: The superblock configuration context.
*
* The filesystem is invoked to get or create a superblock which can then later
* be used for mounting. The filesystem places a pointer to the root to be
* used for mounting in @fc->root.
*/
int vfs_get_tree(struct fs_context *fc)
{
struct super_block *sb;
int error;
if (fc->root)
return -EBUSY;
/* Get the mountable root in fc->root, with a ref on the root and a ref
* on the superblock.
*/
error = fc->ops->get_tree(fc);
if (error < 0)
return error;
if (!fc->root) {
pr_err("Filesystem %s get_tree() didn't set fc->root\n",
fc->fs_type->name);
/* We don't know what the locking state of the superblock is -
* if there is a superblock.
*/
BUG();
}
sb = fc->root->d_sb; WARN_ON(!sb->s_bdi);
/*
* Write barrier is for super_cache_count(). We place it before setting
* SB_BORN as the data dependency between the two functions is the
* superblock structure contents that we just set up, not the SB_BORN
* flag.
*/
smp_wmb();
sb->s_flags |= SB_BORN;
error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
if (unlikely(error)) {
fc_drop_locked(fc);
return error;
}
/*
* filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
* but s_maxbytes was an unsigned long long for many releases. Throw
* this warning for a little while to try and catch filesystems that
* violate this rule.
*/
WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
"negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes);
return 0;
}
EXPORT_SYMBOL(vfs_get_tree);
/*
* Setup private BDI for given superblock. It gets automatically cleaned up
* in generic_shutdown_super().
*/
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
{
struct backing_dev_info *bdi;
int err;
va_list args;
bdi = bdi_alloc(NUMA_NO_NODE);
if (!bdi)
return -ENOMEM;
va_start(args, fmt);
err = bdi_register_va(bdi, fmt, args);
va_end(args);
if (err) {
bdi_put(bdi);
return err;
}
WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; return 0;
}
EXPORT_SYMBOL(super_setup_bdi_name);
/*
* Setup private BDI for given superblock. I gets automatically cleaned up
* in generic_shutdown_super().
*/
int super_setup_bdi(struct super_block *sb)
{
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name,
atomic_long_inc_return(&bdi_seq));
}
EXPORT_SYMBOL(super_setup_bdi);
/**
* sb_wait_write - wait until all writers to given file system finish
* @sb: the super for which we wait
* @level: type of writers we wait for (normal vs page fault)
*
* This function waits until there are no writers of given type to given file
* system.
*/
static void sb_wait_write(struct super_block *sb, int level)
{
percpu_down_write(sb->s_writers.rw_sem + level-1);
}
/*
* We are going to return to userspace and forget about these locks, the
* ownership goes to the caller of thaw_super() which does unlock().
*/
static void lockdep_sb_freeze_release(struct super_block *sb)
{
int level;
for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}
/*
* Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb).
*/
static void lockdep_sb_freeze_acquire(struct super_block *sb)
{
int level;
for (level = 0; level < SB_FREEZE_LEVELS; ++level)
percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}
static void sb_freeze_unlock(struct super_block *sb, int level)
{
for (level--; level >= 0; level--)
percpu_up_write(sb->s_writers.rw_sem + level);
}
/**
* freeze_super - lock the filesystem and force it into a consistent state
* @sb: the super to lock
*
* Syncs the super to make sure the filesystem is consistent and calls the fs's
* freeze_fs. Subsequent calls to this without first thawing the fs will return
* -EBUSY.
*
* During this function, sb->s_writers.frozen goes through these values:
*
* SB_UNFROZEN: File system is normal, all writes progress as usual.
*
* SB_FREEZE_WRITE: The file system is in the process of being frozen. New
* writes should be blocked, though page faults are still allowed. We wait for
* all writes to complete and then proceed to the next stage.
*
* SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
* but internal fs threads can still modify the filesystem (although they
* should not dirty new pages or inodes), writeback can run etc. After waiting
* for all running page faults we sync the filesystem which will clean all
* dirty pages and inodes (no new dirty pages or inodes can be created when
* sync is running).
*
* SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
* modification are blocked (e.g. XFS preallocation truncation on inode
* reclaim). This is usually implemented by blocking new transactions for
* filesystems that have them and need this additional guard. After all
* internal writers are finished we call ->freeze_fs() to finish filesystem
* freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
* mostly auxiliary for filesystems to verify they do not modify frozen fs.
*
* sb->s_writers.frozen is protected by sb->s_umount.
*/
int freeze_super(struct super_block *sb)
{
int ret;
atomic_inc(&sb->s_active);
down_write(&sb->s_umount);
if (sb->s_writers.frozen != SB_UNFROZEN) {
deactivate_locked_super(sb);
return -EBUSY;
}
if (!(sb->s_flags & SB_BORN)) {
up_write(&sb->s_umount);
return 0; /* sic - it's "nothing to do" */
}
if (sb_rdonly(sb)) {
/* Nothing to do really... */
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
up_write(&sb->s_umount);
return 0;
}
sb->s_writers.frozen = SB_FREEZE_WRITE;
/* Release s_umount to preserve sb_start_write -> s_umount ordering */
up_write(&sb->s_umount);
sb_wait_write(sb, SB_FREEZE_WRITE);
down_write(&sb->s_umount);
/* Now we go and block page faults... */
sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
/* All writers are done so after syncing there won't be dirty data */
ret = sync_filesystem(sb);
if (ret) {
sb->s_writers.frozen = SB_UNFROZEN;
sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
return ret;
}
/* Now wait for internal filesystem counter */
sb->s_writers.frozen = SB_FREEZE_FS;
sb_wait_write(sb, SB_FREEZE_FS);
if (sb->s_op->freeze_fs) {
ret = sb->s_op->freeze_fs(sb);
if (ret) {
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
sb->s_writers.frozen = SB_UNFROZEN;
sb_freeze_unlock(sb, SB_FREEZE_FS);
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
return ret;
}
}
/*
* For debugging purposes so that fs can warn if it sees write activity
* when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
*/
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
lockdep_sb_freeze_release(sb);
up_write(&sb->s_umount);
return 0;
}
EXPORT_SYMBOL(freeze_super);
static int thaw_super_locked(struct super_block *sb)
{
int error;
if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
up_write(&sb->s_umount);
return -EINVAL;
}
if (sb_rdonly(sb)) {
sb->s_writers.frozen = SB_UNFROZEN;
goto out;
}
lockdep_sb_freeze_acquire(sb);
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
if (error) {
printk(KERN_ERR
"VFS:Filesystem thaw failed\n");
lockdep_sb_freeze_release(sb);
up_write(&sb->s_umount);
return error;
}
}
sb->s_writers.frozen = SB_UNFROZEN;
sb_freeze_unlock(sb, SB_FREEZE_FS);
out:
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
return 0;
}
/**
* thaw_super -- unlock filesystem
* @sb: the super to thaw
*
* Unlocks the filesystem and marks it writeable again after freeze_super().
*/
int thaw_super(struct super_block *sb)
{
down_write(&sb->s_umount);
return thaw_super_locked(sb);
}
EXPORT_SYMBOL(thaw_super);
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "%s: " fmt, __func__
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/percpu-refcount.h>
/*
* Initially, a percpu refcount is just a set of percpu counters. Initially, we
* don't try to detect the ref hitting 0 - which means that get/put can just
* increment or decrement the local counter. Note that the counter on a
* particular cpu can (and will) wrap - this is fine, when we go to shutdown the
* percpu counters will all sum to the correct value
*
* (More precisely: because modular arithmetic is commutative the sum of all the
* percpu_count vars will be equal to what it would have been if all the gets
* and puts were done to a single integer, even if some of the percpu integers
* overflow or underflow).
*
* The real trick to implementing percpu refcounts is shutdown. We can't detect
* the ref hitting 0 on every put - this would require global synchronization
* and defeat the whole purpose of using percpu refs.
*
* What we do is require the user to keep track of the initial refcount; we know
* the ref can't hit 0 before the user drops the initial ref, so as long as we
* convert to non percpu mode before the initial ref is dropped everything
* works.
*
* Converting to non percpu mode is done with some RCUish stuff in
* percpu_ref_kill. Additionally, we need a bias value so that the
* atomic_long_t can't hit 0 before we've added up all the percpu refs.
*/
#define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1))
static DEFINE_SPINLOCK(percpu_ref_switch_lock);
static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);
static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
{
return (unsigned long __percpu *) (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD);
}
/**
* percpu_ref_init - initialize a percpu refcount
* @ref: percpu_ref to initialize
* @release: function which will be called when refcount hits 0
* @flags: PERCPU_REF_INIT_* flags
* @gfp: allocation mask to use
*
* Initializes @ref. @ref starts out in percpu mode with a refcount of 1 unless
* @flags contains PERCPU_REF_INIT_ATOMIC or PERCPU_REF_INIT_DEAD. These flags
* change the start state to atomic with the latter setting the initial refcount
* to 0. See the definitions of PERCPU_REF_INIT_* flags for flag behaviors.
*
* Note that @release must not sleep - it may potentially be called from RCU
* callback context by percpu_ref_kill().
*/
int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
unsigned int flags, gfp_t gfp)
{
size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS,
__alignof__(unsigned long));
unsigned long start_count = 0;
struct percpu_ref_data *data;
ref->percpu_count_ptr = (unsigned long)
__alloc_percpu_gfp(sizeof(unsigned long), align, gfp);
if (!ref->percpu_count_ptr)
return -ENOMEM;
data = kzalloc(sizeof(*ref->data), gfp);
if (!data) {
free_percpu((void __percpu *)ref->percpu_count_ptr);
return -ENOMEM;
}
data->force_atomic = flags & PERCPU_REF_INIT_ATOMIC;
data->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT;
if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) {
ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
data->allow_reinit = true;
} else {
start_count += PERCPU_COUNT_BIAS;
}
if (flags & PERCPU_REF_INIT_DEAD)
ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
else
start_count++;
atomic_long_set(&data->count, start_count);
data->release = release;
data->confirm_switch = NULL;
data->ref = ref;
ref->data = data;
return 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_init);
static void __percpu_ref_exit(struct percpu_ref *ref)
{
unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
if (percpu_count) {
/* non-NULL confirm_switch indicates switching in progress */
WARN_ON_ONCE(ref->data && ref->data->confirm_switch);
free_percpu(percpu_count);
ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
}
}
/**
* percpu_ref_exit - undo percpu_ref_init()
* @ref: percpu_ref to exit
*
* This function exits @ref. The caller is responsible for ensuring that
* @ref is no longer in active use. The usual places to invoke this
* function from are the @ref->release() callback or in init failure path
* where percpu_ref_init() succeeded but other parts of the initialization
* of the embedding object failed.
*/
void percpu_ref_exit(struct percpu_ref *ref)
{
struct percpu_ref_data *data = ref->data;
unsigned long flags;
__percpu_ref_exit(ref);
if (!data)
return;
spin_lock_irqsave(&percpu_ref_switch_lock, flags);
ref->percpu_count_ptr |= atomic_long_read(&ref->data->count) <<
__PERCPU_REF_FLAG_BITS;
ref->data = NULL;
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
kfree(data);
}
EXPORT_SYMBOL_GPL(percpu_ref_exit);
static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
{
struct percpu_ref_data *data = container_of(rcu,
struct percpu_ref_data, rcu);
struct percpu_ref *ref = data->ref;
data->confirm_switch(ref);
data->confirm_switch = NULL;
wake_up_all(&percpu_ref_switch_waitq);
if (!data->allow_reinit)
__percpu_ref_exit(ref);
/* drop ref from percpu_ref_switch_to_atomic() */
percpu_ref_put(ref);
}
static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
{
struct percpu_ref_data *data = container_of(rcu,
struct percpu_ref_data, rcu);
struct percpu_ref *ref = data->ref;
unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
static atomic_t underflows;
unsigned long count = 0;
int cpu;
for_each_possible_cpu(cpu)
count += *per_cpu_ptr(percpu_count, cpu);
pr_debug("global %lu percpu %lu\n",
atomic_long_read(&data->count), count);
/*
* It's crucial that we sum the percpu counters _before_ adding the sum
* to &ref->count; since gets could be happening on one cpu while puts
* happen on another, adding a single cpu's count could cause
* @ref->count to hit 0 before we've got a consistent value - but the
* sum of all the counts will be consistent and correct.
*
* Subtracting the bias value then has to happen _after_ adding count to
* &ref->count; we need the bias value to prevent &ref->count from
* reaching 0 before we add the percpu counts. But doing it at the same
* time is equivalent and saves us atomic operations:
*/
atomic_long_add((long)count - PERCPU_COUNT_BIAS, &data->count);
if (WARN_ONCE(atomic_long_read(&data->count) <= 0,
"percpu ref (%ps) <= 0 (%ld) after switching to atomic",
data->release, atomic_long_read(&data->count)) &&
atomic_inc_return(&underflows) < 4) {
pr_err("%s(): percpu_ref underflow", __func__);
mem_dump_obj(data);
}
/* @ref is viewed as dead on all CPUs, send out switch confirmation */
percpu_ref_call_confirm_rcu(rcu);
}
static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref)
{
}
static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch)
{
if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { if (confirm_switch) confirm_switch(ref);
return;
}
/* switching from percpu to atomic */
ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
/*
* Non-NULL ->confirm_switch is used to indicate that switching is
* in progress. Use noop one if unspecified.
*/
ref->data->confirm_switch = confirm_switch ?:
percpu_ref_noop_confirm_switch;
percpu_ref_get(ref); /* put after confirmation */
call_rcu(&ref->data->rcu, percpu_ref_switch_to_atomic_rcu);
}
static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
int cpu;
BUG_ON(!percpu_count); if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
return;
if (WARN_ON_ONCE(!ref->data->allow_reinit))
return;
atomic_long_add(PERCPU_COUNT_BIAS, &ref->data->count);
/*
* Restore per-cpu operation. smp_store_release() is paired
* with READ_ONCE() in __ref_is_percpu() and guarantees that the
* zeroing is visible to all percpu accesses which can see the
* following __PERCPU_REF_ATOMIC clearing.
*/
for_each_possible_cpu(cpu) *per_cpu_ptr(percpu_count, cpu) = 0; smp_store_release(&ref->percpu_count_ptr,
ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
}
static void __percpu_ref_switch_mode(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch)
{
struct percpu_ref_data *data = ref->data;
lockdep_assert_held(&percpu_ref_switch_lock);
/*
* If the previous ATOMIC switching hasn't finished yet, wait for
* its completion. If the caller ensures that ATOMIC switching
* isn't in progress, this function can be called from any context.
*/
wait_event_lock_irq(percpu_ref_switch_waitq, !data->confirm_switch,
percpu_ref_switch_lock);
if (data->force_atomic || percpu_ref_is_dying(ref))
__percpu_ref_switch_to_atomic(ref, confirm_switch);
else
__percpu_ref_switch_to_percpu(ref);
}
/**
* percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
* @ref: percpu_ref to switch to atomic mode
* @confirm_switch: optional confirmation callback
*
* There's no reason to use this function for the usual reference counting.
* Use percpu_ref_kill[_and_confirm]().
*
* Schedule switching of @ref to atomic mode. All its percpu counts will
* be collected to the main atomic counter. On completion, when all CPUs
* are guaraneed to be in atomic mode, @confirm_switch, which may not
* block, is invoked. This function may be invoked concurrently with all
* the get/put operations and can safely be mixed with kill and reinit
* operations. Note that @ref will stay in atomic mode across kill/reinit
* cycles until percpu_ref_switch_to_percpu() is called.
*
* This function may block if @ref is in the process of switching to atomic
* mode. If the caller ensures that @ref is not in the process of
* switching to atomic mode, this function can be called from any context.
*/
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch)
{
unsigned long flags;
spin_lock_irqsave(&percpu_ref_switch_lock, flags);
ref->data->force_atomic = true;
__percpu_ref_switch_mode(ref, confirm_switch);
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);
/**
* percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode
* @ref: percpu_ref to switch to atomic mode
*
* Schedule switching the ref to atomic mode, and wait for the
* switch to complete. Caller must ensure that no other thread
* will switch back to percpu mode.
*/
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref)
{
percpu_ref_switch_to_atomic(ref, NULL);
wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);
/**
* percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
* @ref: percpu_ref to switch to percpu mode
*
* There's no reason to use this function for the usual reference counting.
* To re-use an expired ref, use percpu_ref_reinit().
*
* Switch @ref to percpu mode. This function may be invoked concurrently
* with all the get/put operations and can safely be mixed with kill and
* reinit operations. This function reverses the sticky atomic state set
* by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is
* dying or dead, the actual switching takes place on the following
* percpu_ref_reinit().
*
* This function may block if @ref is in the process of switching to atomic
* mode. If the caller ensures that @ref is not in the process of
* switching to atomic mode, this function can be called from any context.
*/
void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
unsigned long flags;
spin_lock_irqsave(&percpu_ref_switch_lock, flags);
ref->data->force_atomic = false;
__percpu_ref_switch_mode(ref, NULL);
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);
/**
* percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation
* @ref: percpu_ref to kill
* @confirm_kill: optional confirmation callback
*
* Equivalent to percpu_ref_kill() but also schedules kill confirmation if
* @confirm_kill is not NULL. @confirm_kill, which may not block, will be
* called after @ref is seen as dead from all CPUs at which point all
* further invocations of percpu_ref_tryget_live() will fail. See
* percpu_ref_tryget_live() for details.
*
* This function normally doesn't block and can be called from any context
* but it may block if @confirm_kill is specified and @ref is in the
* process of switching to atomic mode by percpu_ref_switch_to_atomic().
*
* There are no implied RCU grace periods between kill and release.
*/
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
percpu_ref_func_t *confirm_kill)
{
unsigned long flags;
spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ONCE(percpu_ref_is_dying(ref),
"%s called more than once on %ps!", __func__,
ref->data->release);
ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
__percpu_ref_switch_mode(ref, confirm_kill);
percpu_ref_put(ref);
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
/**
* percpu_ref_is_zero - test whether a percpu refcount reached zero
* @ref: percpu_ref to test
*
* Returns %true if @ref reached zero.
*
* This function is safe to call as long as @ref is between init and exit.
*/
bool percpu_ref_is_zero(struct percpu_ref *ref)
{
unsigned long __percpu *percpu_count;
unsigned long count, flags;
if (__ref_is_percpu(ref, &percpu_count))
return false;
/* protect us from being destroyed */
spin_lock_irqsave(&percpu_ref_switch_lock, flags);
if (ref->data)
count = atomic_long_read(&ref->data->count);
else
count = ref->percpu_count_ptr >> __PERCPU_REF_FLAG_BITS;
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
return count == 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_is_zero);
/**
* percpu_ref_reinit - re-initialize a percpu refcount
* @ref: perpcu_ref to re-initialize
*
* Re-initialize @ref so that it's in the same state as when it finished
* percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been
* initialized successfully and reached 0 but not exited.
*
* Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
* this function is in progress.
*/
void percpu_ref_reinit(struct percpu_ref *ref)
{
WARN_ON_ONCE(!percpu_ref_is_zero(ref));
percpu_ref_resurrect(ref);
}
EXPORT_SYMBOL_GPL(percpu_ref_reinit);
/**
* percpu_ref_resurrect - modify a percpu refcount from dead to live
* @ref: perpcu_ref to resurrect
*
* Modify @ref so that it's in the same state as before percpu_ref_kill() was
* called. @ref must be dead but must not yet have exited.
*
* If @ref->release() frees @ref then the caller is responsible for
* guaranteeing that @ref->release() does not get called while this
* function is in progress.
*
* Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
* this function is in progress.
*/
void percpu_ref_resurrect(struct percpu_ref *ref)
{
unsigned long __percpu *percpu_count;
unsigned long flags;
spin_lock_irqsave(&percpu_ref_switch_lock, flags); WARN_ON_ONCE(!percpu_ref_is_dying(ref)); WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
percpu_ref_get(ref);
__percpu_ref_switch_mode(ref, NULL);
spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_resurrect);
/*
* Written by: Matthew Dobson, IBM Corporation
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send feedback to <colpatch@us.ibm.com>
*/
#ifndef _ASM_X86_TOPOLOGY_H
#define _ASM_X86_TOPOLOGY_H
/*
* to preserve the visibility of NUMA_NO_NODE definition,
* moved to there from here. May be used independent of
* CONFIG_NUMA.
*/
#include <linux/numa.h>
#ifdef CONFIG_NUMA
#include <linux/cpumask.h>
#include <asm/mpspec.h>
#include <asm/percpu.h>
/* Mappings between logical cpu number and node number */
DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
/*
* override generic percpu implementation of cpu_to_node
*/
extern int __cpu_to_node(int cpu);
#define cpu_to_node __cpu_to_node
extern int early_cpu_to_node(int cpu);
#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
/* Same function but used if called before per_cpu areas are setup */
static inline int early_cpu_to_node(int cpu)
{
return early_per_cpu(x86_cpu_to_node_map, cpu);
}
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
/* Mappings between node number and cpus on that node. */
extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
extern const struct cpumask *cpumask_of_node(int node);
#else
/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
static inline const struct cpumask *cpumask_of_node(int node)
{
return node_to_cpumask_map[node];
}
#endif
extern void setup_node_to_cpumask_map(void);
#define pcibus_to_node(bus) __pcibus_to_node(bus)
extern int __node_distance(int, int);
#define node_distance(a, b) __node_distance(a, b)
#else /* !CONFIG_NUMA */
static inline int numa_node_id(void)
{
return 0;
}
/*
* indicate override:
*/
#define numa_node_id numa_node_id
static inline int early_cpu_to_node(int cpu)
{
return 0;
}
static inline void setup_node_to_cpumask_map(void) { }
#endif
#include <asm-generic/topology.h>
extern const struct cpumask *cpu_coregroup_mask(int cpu);
#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id)
#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
#define topology_logical_die_id(cpu) (cpu_data(cpu).logical_die_id)
#define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id)
#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
extern unsigned int __max_die_per_package;
#ifdef CONFIG_SMP
#define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu))
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
extern unsigned int __max_logical_packages;
#define topology_max_packages() (__max_logical_packages)
static inline int topology_max_die_per_package(void)
{
return __max_die_per_package;
}
extern int __max_smt_threads;
static inline int topology_max_smt_threads(void)
{
return __max_smt_threads;
}
int topology_update_package_map(unsigned int apicid, unsigned int cpu);
int topology_update_die_map(unsigned int dieid, unsigned int cpu);
int topology_phys_to_logical_pkg(unsigned int pkg);
int topology_phys_to_logical_die(unsigned int die, unsigned int cpu);
bool topology_is_primary_thread(unsigned int cpu);
bool topology_smt_supported(void);
#else
#define topology_max_packages() (1)
static inline int
topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
static inline int
topology_update_die_map(unsigned int dieid, unsigned int cpu) { return 0; }
static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
static inline int topology_phys_to_logical_die(unsigned int die,
unsigned int cpu) { return 0; }
static inline int topology_max_die_per_package(void) { return 1; }
static inline int topology_max_smt_threads(void) { return 1; }
static inline bool topology_is_primary_thread(unsigned int cpu) { return true; }
static inline bool topology_smt_supported(void) { return false; }
#endif
static inline void arch_fix_phys_package_id(int num, u32 slot)
{
}
struct pci_bus;
int x86_pci_root_bus_node(int bus);
void x86_pci_root_bus_resources(int bus, struct list_head *resources);
extern bool x86_topology_update;
#ifdef CONFIG_SCHED_MC_PRIO
#include <asm/percpu.h>
DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
extern unsigned int __read_mostly sysctl_sched_itmt_enabled;
/* Interface to set priority of a cpu */
void sched_set_itmt_core_prio(int prio, int core_cpu);
/* Interface to notify scheduler that system supports ITMT */
int sched_set_itmt_support(void);
/* Interface to notify scheduler that system revokes ITMT support */
void sched_clear_itmt_support(void);
#else /* CONFIG_SCHED_MC_PRIO */
#define sysctl_sched_itmt_enabled 0
static inline void sched_set_itmt_core_prio(int prio, int core_cpu)
{
}
static inline int sched_set_itmt_support(void)
{
return 0;
}
static inline void sched_clear_itmt_support(void)
{
}
#endif /* CONFIG_SCHED_MC_PRIO */
#if defined(CONFIG_SMP) && defined(CONFIG_X86_64)
#include <asm/cpufeature.h>
DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key);
#define arch_scale_freq_invariant() static_branch_likely(&arch_scale_freq_key)
DECLARE_PER_CPU(unsigned long, arch_freq_scale);
static inline long arch_scale_freq_capacity(int cpu)
{
return per_cpu(arch_freq_scale, cpu);
}
#define arch_scale_freq_capacity arch_scale_freq_capacity
extern void arch_scale_freq_tick(void);
#define arch_scale_freq_tick arch_scale_freq_tick
extern void arch_set_max_freq_ratio(bool turbo_disabled);
#else
static inline void arch_set_max_freq_ratio(bool turbo_disabled)
{
}
#endif
#if defined(CONFIG_ACPI_CPPC_LIB) && defined(CONFIG_SMP)
void init_freq_invariance_cppc(void);
#define init_freq_invariance_cppc init_freq_invariance_cppc
#endif
#endif /* _ASM_X86_TOPOLOGY_H */
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Fast and scalable bitmaps.
*
* Copyright (C) 2016 Facebook
* Copyright (C) 2013-2014 Jens Axboe
*/
#ifndef __LINUX_SCALE_BITMAP_H
#define __LINUX_SCALE_BITMAP_H
#include <linux/kernel.h>
#include <linux/slab.h>
struct seq_file;
/**
* struct sbitmap_word - Word in a &struct sbitmap.
*/
struct sbitmap_word {
/**
* @depth: Number of bits being used in @word/@cleared
*/
unsigned long depth;
/**
* @word: word holding free bits
*/
unsigned long word ____cacheline_aligned_in_smp;
/**
* @cleared: word holding cleared bits
*/
unsigned long cleared ____cacheline_aligned_in_smp;
} ____cacheline_aligned_in_smp;
/**
* struct sbitmap - Scalable bitmap.
*
* A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This
* trades off higher memory usage for better scalability.
*/
struct sbitmap {
/**
* @depth: Number of bits used in the whole bitmap.
*/
unsigned int depth;
/**
* @shift: log2(number of bits used per word)
*/
unsigned int shift;
/**
* @map_nr: Number of words (cachelines) being used for the bitmap.
*/
unsigned int map_nr;
/**
* @round_robin: Allocate bits in strict round-robin order.
*/
bool round_robin;
/**
* @map: Allocated bitmap.
*/
struct sbitmap_word *map;
/*
* @alloc_hint: Cache of last successfully allocated or freed bit.
*
* This is per-cpu, which allows multiple users to stick to different
* cachelines until the map is exhausted.
*/
unsigned int __percpu *alloc_hint;
};
#define SBQ_WAIT_QUEUES 8
#define SBQ_WAKE_BATCH 8
/**
* struct sbq_wait_state - Wait queue in a &struct sbitmap_queue.
*/
struct sbq_wait_state {
/**
* @wait_cnt: Number of frees remaining before we wake up.
*/
atomic_t wait_cnt;
/**
* @wait: Wait queue.
*/
wait_queue_head_t wait;
} ____cacheline_aligned_in_smp;
/**
* struct sbitmap_queue - Scalable bitmap with the added ability to wait on free
* bits.
*
* A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to
* avoid contention on the wait queue spinlock. This ensures that we don't hit a
* scalability wall when we run out of free bits and have to start putting tasks
* to sleep.
*/
struct sbitmap_queue {
/**
* @sb: Scalable bitmap.
*/
struct sbitmap sb;
/**
* @wake_batch: Number of bits which must be freed before we wake up any
* waiters.
*/
unsigned int wake_batch;
/**
* @wake_index: Next wait queue in @ws to wake up.
*/
atomic_t wake_index;
/**
* @ws: Wait queues.
*/
struct sbq_wait_state *ws;
/*
* @ws_active: count of currently active ws waitqueues
*/
atomic_t ws_active;
/**
* @min_shallow_depth: The minimum shallow depth which may be passed to
* sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
*/
unsigned int min_shallow_depth;
};
/**
* sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node.
* @sb: Bitmap to initialize.
* @depth: Number of bits to allocate.
* @shift: Use 2^@shift bits per word in the bitmap; if a negative number if
* given, a good default is chosen.
* @flags: Allocation flags.
* @node: Memory node to allocate on.
* @round_robin: If true, be stricter about allocation order; always allocate
* starting from the last allocated bit. This is less efficient
* than the default behavior (false).
* @alloc_hint: If true, apply percpu hint for where to start searching for
* a free bit.
*
* Return: Zero on success or negative errno on failure.
*/
int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
gfp_t flags, int node, bool round_robin, bool alloc_hint);
/**
* sbitmap_free() - Free memory used by a &struct sbitmap.
* @sb: Bitmap to free.
*/
static inline void sbitmap_free(struct sbitmap *sb)
{
free_percpu(sb->alloc_hint);
kfree(sb->map);
sb->map = NULL;
}
/**
* sbitmap_resize() - Resize a &struct sbitmap.
* @sb: Bitmap to resize.
* @depth: New number of bits to resize to.
*
* Doesn't reallocate anything. It's up to the caller to ensure that the new
* depth doesn't exceed the depth that the sb was initialized with.
*/
void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
/**
* sbitmap_get() - Try to allocate a free bit from a &struct sbitmap.
* @sb: Bitmap to allocate from.
*
* This operation provides acquire barrier semantics if it succeeds.
*
* Return: Non-negative allocated bit number if successful, -1 otherwise.
*/
int sbitmap_get(struct sbitmap *sb);
/**
* sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
* limiting the depth used from each word.
* @sb: Bitmap to allocate from.
* @shallow_depth: The maximum number of bits to allocate from a single word.
*
* This rather specific operation allows for having multiple users with
* different allocation limits. E.g., there can be a high-priority class that
* uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
* with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
* class can only allocate half of the total bits in the bitmap, preventing it
* from starving out the high-priority class.
*
* Return: Non-negative allocated bit number if successful, -1 otherwise.
*/
int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth);
/**
* sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
* @sb: Bitmap to check.
*
* Return: true if any bit in the bitmap is set, false otherwise.
*/
bool sbitmap_any_bit_set(const struct sbitmap *sb);
#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);
/**
* __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
* @start: Where to start the iteration.
* @sb: Bitmap to iterate over.
* @fn: Callback. Should return true to continue or false to break early.
* @data: Pointer to pass to callback.
*
* This is inline even though it's non-trivial so that the function calls to the
* callback will hopefully get optimized away.
*/
static inline void __sbitmap_for_each_set(struct sbitmap *sb,
unsigned int start,
sb_for_each_fn fn, void *data)
{
unsigned int index;
unsigned int nr;
unsigned int scanned = 0;
if (start >= sb->depth)
start = 0;
index = SB_NR_TO_INDEX(sb, start);
nr = SB_NR_TO_BIT(sb, start);
while (scanned < sb->depth) {
unsigned long word;
unsigned int depth = min_t(unsigned int,
sb->map[index].depth - nr,
sb->depth - scanned);
scanned += depth;
word = sb->map[index].word & ~sb->map[index].cleared;
if (!word)
goto next;
/*
* On the first iteration of the outer loop, we need to add the
* bit offset back to the size of the word for find_next_bit().
* On all other iterations, nr is zero, so this is a noop.
*/
depth += nr;
while (1) {
nr = find_next_bit(&word, depth, nr);
if (nr >= depth)
break;
if (!fn(sb, (index << sb->shift) + nr, data))
return;
nr++;
}
next:
nr = 0;
if (++index >= sb->map_nr)
index = 0;
}
}
/**
* sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
* @sb: Bitmap to iterate over.
* @fn: Callback. Should return true to continue or false to break early.
* @data: Pointer to pass to callback.
*/
static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
void *data)
{
__sbitmap_for_each_set(sb, 0, fn, data);
}
static inline unsigned long *__sbitmap_word(struct sbitmap *sb,
unsigned int bitnr)
{
return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word;
}
/* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */
static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr)
{
set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
}
static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
{
clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
}
/*
* This one is special, since it doesn't actually clear the bit, rather it
* sets the corresponding bit in the ->cleared mask instead. Paired with
* the caller doing sbitmap_deferred_clear() if a given index is full, which
* will clear the previously freed entries in the corresponding ->word.
*/
static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
{
unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
}
/*
* Pair of sbitmap_get, and this one applies both cleared bit and
* allocation hint.
*/
static inline void sbitmap_put(struct sbitmap *sb, unsigned int bitnr)
{
sbitmap_deferred_clear_bit(sb, bitnr);
if (likely(sb->alloc_hint && !sb->round_robin && bitnr < sb->depth)) *raw_cpu_ptr(sb->alloc_hint) = bitnr;
}
static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
{
return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
}
static inline int sbitmap_calculate_shift(unsigned int depth)
{
int shift = ilog2(BITS_PER_LONG);
/*
* If the bitmap is small, shrink the number of bits per word so
* we spread over a few cachelines, at least. If less than 4
* bits, just forget about it, it's not going to work optimally
* anyway.
*/
if (depth >= 4) {
while ((4U << shift) > depth)
shift--;
}
return shift;
}
/**
* sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file.
* @sb: Bitmap to show.
* @m: struct seq_file to write to.
*
* This is intended for debugging. The format may change at any time.
*/
void sbitmap_show(struct sbitmap *sb, struct seq_file *m);
/**
* sbitmap_weight() - Return how many set and not cleared bits in a &struct
* sbitmap.
* @sb: Bitmap to check.
*
* Return: How many set and not cleared bits set
*/
unsigned int sbitmap_weight(const struct sbitmap *sb);
/**
* sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct
* seq_file.
* @sb: Bitmap to show.
* @m: struct seq_file to write to.
*
* This is intended for debugging. The output isn't guaranteed to be internally
* consistent.
*/
void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m);
/**
* sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific
* memory node.
* @sbq: Bitmap queue to initialize.
* @depth: See sbitmap_init_node().
* @shift: See sbitmap_init_node().
* @round_robin: See sbitmap_get().
* @flags: Allocation flags.
* @node: Memory node to allocate on.
*
* Return: Zero on success or negative errno on failure.
*/
int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
int shift, bool round_robin, gfp_t flags, int node);
/**
* sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue.
*
* @sbq: Bitmap queue to free.
*/
static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
{
kfree(sbq->ws);
sbitmap_free(&sbq->sb);
}
/**
* sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
* @sbq: Bitmap queue to resize.
* @depth: New number of bits to resize to.
*
* Like sbitmap_resize(), this doesn't reallocate anything. It has to do
* some extra work on the &struct sbitmap_queue, so it's not safe to just
* resize the underlying &struct sbitmap.
*/
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
/**
* __sbitmap_queue_get() - Try to allocate a free bit from a &struct
* sbitmap_queue with preemption already disabled.
* @sbq: Bitmap queue to allocate from.
*
* Return: Non-negative allocated bit number if successful, -1 otherwise.
*/
int __sbitmap_queue_get(struct sbitmap_queue *sbq);
/**
* __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
* sbitmap_queue, limiting the depth used from each word, with preemption
* already disabled.
* @sbq: Bitmap queue to allocate from.
* @shallow_depth: The maximum number of bits to allocate from a single word.
* See sbitmap_get_shallow().
*
* If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
* initializing @sbq.
*
* Return: Non-negative allocated bit number if successful, -1 otherwise.
*/
int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
unsigned int shallow_depth);
/**
* sbitmap_queue_get() - Try to allocate a free bit from a &struct
* sbitmap_queue.
* @sbq: Bitmap queue to allocate from.
* @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
* sbitmap_queue_clear()).
*
* Return: Non-negative allocated bit number if successful, -1 otherwise.
*/
static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
unsigned int *cpu)
{
int nr;
*cpu = get_cpu();
nr = __sbitmap_queue_get(sbq);
put_cpu();
return nr;
}
/**
* sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
* sbitmap_queue, limiting the depth used from each word.
* @sbq: Bitmap queue to allocate from.
* @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
* sbitmap_queue_clear()).
* @shallow_depth: The maximum number of bits to allocate from a single word.
* See sbitmap_get_shallow().
*
* If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
* initializing @sbq.
*
* Return: Non-negative allocated bit number if successful, -1 otherwise.
*/
static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
unsigned int *cpu,
unsigned int shallow_depth)
{
int nr;
*cpu = get_cpu();
nr = __sbitmap_queue_get_shallow(sbq, shallow_depth);
put_cpu();
return nr;
}
/**
* sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the
* minimum shallow depth that will be used.
* @sbq: Bitmap queue in question.
* @min_shallow_depth: The minimum shallow depth that will be passed to
* sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
*
* sbitmap_queue_clear() batches wakeups as an optimization. The batch size
* depends on the depth of the bitmap. Since the shallow allocation functions
* effectively operate with a different depth, the shallow depth must be taken
* into account when calculating the batch size. This function must be called
* with the minimum shallow depth that will be used. Failure to do so can result
* in missed wakeups.
*/
void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
unsigned int min_shallow_depth);
/**
* sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
* &struct sbitmap_queue.
* @sbq: Bitmap to free from.
* @nr: Bit number to free.
* @cpu: CPU the bit was allocated on.
*/
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
unsigned int cpu);
static inline int sbq_index_inc(int index)
{
return (index + 1) & (SBQ_WAIT_QUEUES - 1);
}
static inline void sbq_index_atomic_inc(atomic_t *index)
{
int old = atomic_read(index);
int new = sbq_index_inc(old);
atomic_cmpxchg(index, old, new);
}
/**
* sbq_wait_ptr() - Get the next wait queue to use for a &struct
* sbitmap_queue.
* @sbq: Bitmap queue to wait on.
* @wait_index: A counter per "user" of @sbq.
*/
static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
atomic_t *wait_index)
{
struct sbq_wait_state *ws;
ws = &sbq->ws[atomic_read(wait_index)];
sbq_index_atomic_inc(wait_index);
return ws;
}
/**
* sbitmap_queue_wake_all() - Wake up everything waiting on a &struct
* sbitmap_queue.
* @sbq: Bitmap queue to wake up.
*/
void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);
/**
* sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue
* on a &struct sbitmap_queue.
* @sbq: Bitmap queue to wake up.
*/
void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
/**
* sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct
* seq_file.
* @sbq: Bitmap queue to show.
* @m: struct seq_file to write to.
*
* This is intended for debugging. The format may change at any time.
*/
void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);
struct sbq_wait {
struct sbitmap_queue *sbq; /* if set, sbq_wait is accounted */
struct wait_queue_entry wait;
};
#define DEFINE_SBQ_WAIT(name) \
struct sbq_wait name = { \
.sbq = NULL, \
.wait = { \
.private = current, \
.func = autoremove_wake_function, \
.entry = LIST_HEAD_INIT((name).wait.entry), \
} \
}
/*
* Wrapper around prepare_to_wait_exclusive(), which maintains some extra
* internal state.
*/
void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
struct sbq_wait_state *ws,
struct sbq_wait *sbq_wait, int state);
/*
* Must be paired with sbitmap_prepare_to_wait().
*/
void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
struct sbq_wait *sbq_wait);
/*
* Wrapper around add_wait_queue(), which maintains some extra internal state
*/
void sbitmap_add_wait_queue(struct sbitmap_queue *sbq,
struct sbq_wait_state *ws,
struct sbq_wait *sbq_wait);
/*
* Must be paired with sbitmap_add_wait_queue()
*/
void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait);
#endif /* __LINUX_SCALE_BITMAP_H */
// SPDX-License-Identifier: GPL-2.0
/*
* USB-ACPI glue code
*
* Copyright 2012 Red Hat <mjg@redhat.com>
*/
#include <linux/module.h>
#include <linux/usb.h>
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/pci.h>
#include <linux/usb/hcd.h>
#include "hub.h"
/**
* usb_acpi_power_manageable - check whether usb port has
* acpi power resource.
* @hdev: USB device belonging to the usb hub
* @index: port index based zero
*
* Return true if the port has acpi power resource and false if no.
*/
bool usb_acpi_power_manageable(struct usb_device *hdev, int index)
{
acpi_handle port_handle;
int port1 = index + 1;
port_handle = usb_get_hub_port_acpi_handle(hdev,
port1);
if (port_handle)
return acpi_bus_power_manageable(port_handle);
else
return false;
}
EXPORT_SYMBOL_GPL(usb_acpi_power_manageable);
/**
* usb_acpi_set_power_state - control usb port's power via acpi power
* resource
* @hdev: USB device belonging to the usb hub
* @index: port index based zero
* @enable: power state expected to be set
*
* Notice to use usb_acpi_power_manageable() to check whether the usb port
* has acpi power resource before invoking this function.
*
* Returns 0 on success, else negative errno.
*/
int usb_acpi_set_power_state(struct usb_device *hdev, int index, bool enable)
{
struct usb_hub *hub = usb_hub_to_struct_hub(hdev);
struct usb_port *port_dev;
acpi_handle port_handle;
unsigned char state;
int port1 = index + 1;
int error = -EINVAL;
if (!hub)
return -ENODEV;
port_dev = hub->ports[port1 - 1];
port_handle = (acpi_handle) usb_get_hub_port_acpi_handle(hdev, port1);
if (!port_handle)
return error;
if (enable)
state = ACPI_STATE_D0;
else
state = ACPI_STATE_D3_COLD;
error = acpi_bus_set_power(port_handle, state);
if (!error)
dev_dbg(&port_dev->dev, "acpi: power was set to %d\n", enable);
else
dev_dbg(&port_dev->dev, "acpi: power failed to be set\n");
return error;
}
EXPORT_SYMBOL_GPL(usb_acpi_set_power_state);
static enum usb_port_connect_type usb_acpi_get_connect_type(acpi_handle handle,
struct acpi_pld_info *pld)
{
enum usb_port_connect_type connect_type = USB_PORT_CONNECT_TYPE_UNKNOWN;
struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
union acpi_object *upc = NULL;
acpi_status status;
/*
* According to 9.14 in ACPI Spec 6.2. _PLD indicates whether usb port
* is user visible and _UPC indicates whether it is connectable. If
* the port was visible and connectable, it could be freely connected
* and disconnected with USB devices. If no visible and connectable,
* a usb device is directly hard-wired to the port. If no visible and
* no connectable, the port would be not used.
*/
status = acpi_evaluate_object(handle, "_UPC", NULL, &buffer);
if (ACPI_FAILURE(status))
goto out;
upc = buffer.pointer;
if (!upc || (upc->type != ACPI_TYPE_PACKAGE) || upc->package.count != 4)
goto out;
if (upc->package.elements[0].integer.value)
if (pld->user_visible)
connect_type = USB_PORT_CONNECT_TYPE_HOT_PLUG;
else
connect_type = USB_PORT_CONNECT_TYPE_HARD_WIRED;
else if (!pld->user_visible)
connect_type = USB_PORT_NOT_USED;
out:
kfree(upc);
return connect_type;
}
/*
* Private to usb-acpi, all the core needs to know is that
* port_dev->location is non-zero when it has been set by the firmware.
*/
#define USB_ACPI_LOCATION_VALID (1 << 31)
static struct acpi_device *usb_acpi_find_port(struct acpi_device *parent,
int raw)
{
struct acpi_device *adev;
if (!parent)
return NULL;
list_for_each_entry(adev, &parent->children, node) {
if (acpi_device_adr(adev) == raw)
return adev;
}
return acpi_find_child_device(parent, raw, false);
}
static struct acpi_device *
usb_acpi_get_companion_for_port(struct usb_port *port_dev)
{
struct usb_device *udev;
struct acpi_device *adev;
acpi_handle *parent_handle;
int port1;
/* Get the struct usb_device point of port's hub */
udev = to_usb_device(port_dev->dev.parent->parent);
/*
* The root hub ports' parent is the root hub. The non-root-hub
* ports' parent is the parent hub port which the hub is
* connected to.
*/
if (!udev->parent) {
adev = ACPI_COMPANION(&udev->dev);
port1 = usb_hcd_find_raw_port_number(bus_to_hcd(udev->bus),
port_dev->portnum);
} else {
parent_handle = usb_get_hub_port_acpi_handle(udev->parent,
udev->portnum);
if (!parent_handle)
return NULL;
acpi_bus_get_device(parent_handle, &adev);
port1 = port_dev->portnum;
}
return usb_acpi_find_port(adev, port1);
}
static struct acpi_device *
usb_acpi_find_companion_for_port(struct usb_port *port_dev)
{
struct acpi_device *adev;
struct acpi_pld_info *pld;
acpi_handle *handle;
acpi_status status;
adev = usb_acpi_get_companion_for_port(port_dev);
if (!adev)
return NULL;
handle = adev->handle;
status = acpi_get_physical_device_location(handle, &pld);
if (ACPI_SUCCESS(status) && pld) {
port_dev->location = USB_ACPI_LOCATION_VALID
| pld->group_token << 8 | pld->group_position;
port_dev->connect_type = usb_acpi_get_connect_type(handle, pld);
ACPI_FREE(pld);
}
return adev;
}
static struct acpi_device *
usb_acpi_find_companion_for_device(struct usb_device *udev)
{
struct acpi_device *adev;
struct usb_port *port_dev;
struct usb_hub *hub;
if (!udev->parent) {
/* root hub is only child (_ADR=0) under its parent, the HC */
adev = ACPI_COMPANION(udev->dev.parent);
return acpi_find_child_device(adev, 0, false);
}
hub = usb_hub_to_struct_hub(udev->parent);
if (!hub)
return NULL;
/*
* This is an embedded USB device connected to a port and such
* devices share port's ACPI companion.
*/
port_dev = hub->ports[udev->portnum - 1];
return usb_acpi_get_companion_for_port(port_dev);
}
static struct acpi_device *usb_acpi_find_companion(struct device *dev)
{
/*
* The USB hierarchy like following:
*
* Device (EHC1)
* Device (HUBN)
* Device (PR01)
* Device (PR11)
* Device (PR12)
* Device (FN12)
* Device (FN13)
* Device (PR13)
* ...
* where HUBN is root hub, and PRNN are USB ports and devices
* connected to them, and FNNN are individualk functions for
* connected composite USB devices. PRNN and FNNN may contain
* _CRS and other methods describing sideband resources for
* the connected device.
*
* On the kernel side both root hub and embedded USB devices are
* represented as instances of usb_device structure, and ports
* are represented as usb_port structures, so the whole process
* is split into 2 parts: finding companions for devices and
* finding companions for ports.
*
* Note that we do not handle individual functions of composite
* devices yet, for that we would need to assign companions to
* devices corresponding to USB interfaces.
*/
if (is_usb_device(dev))
return usb_acpi_find_companion_for_device(to_usb_device(dev));
else if (is_usb_port(dev))
return usb_acpi_find_companion_for_port(to_usb_port(dev));
return NULL;
}
static bool usb_acpi_bus_match(struct device *dev)
{
return is_usb_device(dev) || is_usb_port(dev);
}
static struct acpi_bus_type usb_acpi_bus = {
.name = "USB",
.match = usb_acpi_bus_match,
.find_companion = usb_acpi_find_companion,
};
int usb_acpi_register(void)
{
return register_acpi_bus_type(&usb_acpi_bus);
}
void usb_acpi_unregister(void)
{
unregister_acpi_bus_type(&usb_acpi_bus);
}
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* libata-core.c - helper library for ATA
*
* Copyright 2003-2004 Red Hat, Inc. All rights reserved.
* Copyright 2003-2004 Jeff Garzik
*
* libata documentation is available via 'make {ps|pdf}docs',
* as Documentation/driver-api/libata.rst
*
* Hardware documentation available from http://www.t13.org/ and
* http://www.sata-io.org/
*
* Standards documents from:
* http://www.t13.org (ATA standards, PCI DMA IDE spec)
* http://www.t10.org (SCSI MMC - for ATAPI MMC)
* http://www.sata-io.org (SATA)
* http://www.compactflash.org (CF)
* http://www.qic.org (QIC157 - Tape and DSC)
* http://www.ce-ata.org (CE-ATA: not supported)
*
* libata is essentially a library of internal helper functions for
* low-level ATA host controller drivers. As such, the API/ABI is
* likely to change as new drivers are added and updated.
* Do not depend on ABI/API stability.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/timer.h>
#include <linux/time.h>
#include <linux/interrupt.h>
#include <linux/completion.h>
#include <linux/suspend.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <linux/io.h>
#include <linux/log2.h>
#include <linux/slab.h>
#include <linux/glob.h>
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_host.h>
#include <linux/libata.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
#include <linux/cdrom.h>
#include <linux/ratelimit.h>
#include <linux/leds.h>
#include <linux/pm_runtime.h>
#include <linux/platform_device.h>
#include <asm/setup.h>
#define CREATE_TRACE_POINTS
#include <trace/events/libata.h>
#include "libata.h"
#include "libata-transport.h"
const struct ata_port_operations ata_base_port_ops = {
.prereset = ata_std_prereset,
.postreset = ata_std_postreset,
.error_handler = ata_std_error_handler,
.sched_eh = ata_std_sched_eh,
.end_eh = ata_std_end_eh,
};
const struct ata_port_operations sata_port_ops = {
.inherits = &ata_base_port_ops,
.qc_defer = ata_std_qc_defer,
.hardreset = sata_std_hardreset,
};
EXPORT_SYMBOL_GPL(sata_port_ops);
static unsigned int ata_dev_init_params(struct ata_device *dev,
u16 heads, u16 sectors);
static unsigned int ata_dev_set_xfermode(struct ata_device *dev);
static void ata_dev_xfermask(struct ata_device *dev);
static unsigned long ata_dev_blacklisted(const struct ata_device *dev);
atomic_t ata_print_id = ATOMIC_INIT(0);
#ifdef CONFIG_ATA_FORCE
struct ata_force_param {
const char *name;
u8 cbl;
u8 spd_limit;
unsigned long xfer_mask;
unsigned int horkage_on;
unsigned int horkage_off;
u16 lflags;
};
struct ata_force_ent {
int port;
int device;
struct ata_force_param param;
};
static struct ata_force_ent *ata_force_tbl;
static int ata_force_tbl_size;
static char ata_force_param_buf[COMMAND_LINE_SIZE] __initdata;
/* param_buf is thrown away after initialization, disallow read */
module_param_string(force, ata_force_param_buf, sizeof(ata_force_param_buf), 0);
MODULE_PARM_DESC(force, "Force ATA configurations including cable type, link speed and transfer mode (see Documentation/admin-guide/kernel-parameters.rst for details)");
#endif
static int atapi_enabled = 1;
module_param(atapi_enabled, int, 0444);
MODULE_PARM_DESC(atapi_enabled, "Enable discovery of ATAPI devices (0=off, 1=on [default])");
static int atapi_dmadir = 0;
module_param(atapi_dmadir, int, 0444);
MODULE_PARM_DESC(atapi_dmadir, "Enable ATAPI DMADIR bridge support (0=off [default], 1=on)");
int atapi_passthru16 = 1;
module_param(atapi_passthru16, int, 0444);
MODULE_PARM_DESC(atapi_passthru16, "Enable ATA_16 passthru for ATAPI devices (0=off, 1=on [default])");
int libata_fua = 0;
module_param_named(fua, libata_fua, int, 0444);
MODULE_PARM_DESC(fua, "FUA support (0=off [default], 1=on)");
static int ata_ignore_hpa;
module_param_named(ignore_hpa, ata_ignore_hpa, int, 0644);
MODULE_PARM_DESC(ignore_hpa, "Ignore HPA limit (0=keep BIOS limits, 1=ignore limits, using full disk)");
static int libata_dma_mask = ATA_DMA_MASK_ATA|ATA_DMA_MASK_ATAPI|ATA_DMA_MASK_CFA;
module_param_named(dma, libata_dma_mask, int, 0444);
MODULE_PARM_DESC(dma, "DMA enable/disable (0x1==ATA, 0x2==ATAPI, 0x4==CF)");
static int ata_probe_timeout;
module_param(ata_probe_timeout, int, 0444);
MODULE_PARM_DESC(ata_probe_timeout, "Set ATA probing timeout (seconds)");
int libata_noacpi = 0;
module_param_named(noacpi, libata_noacpi, int, 0444);
MODULE_PARM_DESC(noacpi, "Disable the use of ACPI in probe/suspend/resume (0=off [default], 1=on)");
int libata_allow_tpm = 0;
module_param_named(allow_tpm, libata_allow_tpm, int, 0444);
MODULE_PARM_DESC(allow_tpm, "Permit the use of TPM commands (0=off [default], 1=on)");
static int atapi_an;
module_param(atapi_an, int, 0444);
MODULE_PARM_DESC(atapi_an, "Enable ATAPI AN media presence notification (0=0ff [default], 1=on)");
MODULE_AUTHOR("Jeff Garzik");
MODULE_DESCRIPTION("Library module for ATA devices");
MODULE_LICENSE("GPL");
MODULE_VERSION(DRV_VERSION);
static inline bool ata_dev_print_info(struct ata_device *dev)
{
struct ata_eh_context *ehc = &dev->link->eh_context;
return ehc->i.flags & ATA_EHI_PRINTINFO;
}
static bool ata_sstatus_online(u32 sstatus)
{
return (sstatus & 0xf) == 0x3;
}
/**
* ata_link_next - link iteration helper
* @link: the previous link, NULL to start
* @ap: ATA port containing links to iterate
* @mode: iteration mode, one of ATA_LITER_*
*
* LOCKING:
* Host lock or EH context.
*
* RETURNS:
* Pointer to the next link.
*/
struct ata_link *ata_link_next(struct ata_link *link, struct ata_port *ap,
enum ata_link_iter_mode mode)
{
BUG_ON(mode != ATA_LITER_EDGE &&
mode != ATA_LITER_PMP_FIRST && mode != ATA_LITER_HOST_FIRST);
/* NULL link indicates start of iteration */
if (!link)
switch (mode) {
case ATA_LITER_EDGE:
case ATA_LITER_PMP_FIRST:
if (sata_pmp_attached(ap))
return ap->pmp_link;
fallthrough;
case ATA_LITER_HOST_FIRST:
return &ap->link;
}
/* we just iterated over the host link, what's next? */
if (link == &ap->link)
switch (mode) {
case ATA_LITER_HOST_FIRST:
if (sata_pmp_attached(ap))
return ap->pmp_link;
fallthrough;
case ATA_LITER_PMP_FIRST:
if (unlikely(ap->slave_link))
return ap->slave_link;
fallthrough;
case ATA_LITER_EDGE:
return NULL;
}
/* slave_link excludes PMP */
if (unlikely(link == ap->slave_link))
return NULL;
/* we were over a PMP link */
if (++link < ap->pmp_link + ap->nr_pmp_links)
return link;
if (mode == ATA_LITER_PMP_FIRST)
return &ap->link;
return NULL;
}
EXPORT_SYMBOL_GPL(ata_link_next);
/**
* ata_dev_next - device iteration helper
* @dev: the previous device, NULL to start
* @link: ATA link containing devices to iterate
* @mode: iteration mode, one of ATA_DITER_*
*
* LOCKING:
* Host lock or EH context.
*
* RETURNS:
* Pointer to the next device.
*/
struct ata_device *ata_dev_next(struct ata_device *dev, struct ata_link *link,
enum ata_dev_iter_mode mode)
{
BUG_ON(mode != ATA_DITER_ENABLED && mode != ATA_DITER_ENABLED_REVERSE &&
mode != ATA_DITER_ALL && mode != ATA_DITER_ALL_REVERSE);
/* NULL dev indicates start of iteration */
if (!dev)
switch (mode) {
case ATA_DITER_ENABLED:
case ATA_DITER_ALL:
dev = link->device;
goto check;
case ATA_DITER_ENABLED_REVERSE:
case ATA_DITER_ALL_REVERSE:
dev = link->device + ata_link_max_devices(link) - 1;
goto check;
}
next:
/* move to the next one */
switch (mode) {
case ATA_DITER_ENABLED:
case ATA_DITER_ALL:
if (++dev < link->device + ata_link_max_devices(link))
goto check;
return NULL;
case ATA_DITER_ENABLED_REVERSE:
case ATA_DITER_ALL_REVERSE:
if (--dev >= link->device)
goto check;
return NULL;
}
check:
if ((mode == ATA_DITER_ENABLED || mode == ATA_DITER_ENABLED_REVERSE) &&
!ata_dev_enabled(dev))
goto next;
return dev;
}
EXPORT_SYMBOL_GPL(ata_dev_next);
/**
* ata_dev_phys_link - find physical link for a device
* @dev: ATA device to look up physical link for
*
* Look up physical link which @dev is attached to. Note that
* this is different from @dev->link only when @dev is on slave
* link. For all other cases, it's the same as @dev->link.
*
* LOCKING:
* Don't care.
*
* RETURNS:
* Pointer to the found physical link.
*/
struct ata_link *ata_dev_phys_link(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
if (!ap->slave_link)
return dev->link;
if (!dev->devno)
return &ap->link;
return ap->slave_link;
}
#ifdef CONFIG_ATA_FORCE
/**
* ata_force_cbl - force cable type according to libata.force
* @ap: ATA port of interest
*
* Force cable type according to libata.force and whine about it.
* The last entry which has matching port number is used, so it
* can be specified as part of device force parameters. For
* example, both "a:40c,1.00:udma4" and "1.00:40c,udma4" have the
* same effect.
*
* LOCKING:
* EH context.
*/
void ata_force_cbl(struct ata_port *ap)
{
int i;
for (i = ata_force_tbl_size - 1; i >= 0; i--) {
const struct ata_force_ent *fe = &ata_force_tbl[i];
if (fe->port != -1 && fe->port != ap->print_id)
continue;
if (fe->param.cbl == ATA_CBL_NONE)
continue;
ap->cbl = fe->param.cbl;
ata_port_notice(ap, "FORCE: cable set to %s\n", fe->param.name);
return;
}
}
/**
* ata_force_link_limits - force link limits according to libata.force
* @link: ATA link of interest
*
* Force link flags and SATA spd limit according to libata.force
* and whine about it. When only the port part is specified
* (e.g. 1:), the limit applies to all links connected to both
* the host link and all fan-out ports connected via PMP. If the
* device part is specified as 0 (e.g. 1.00:), it specifies the
* first fan-out link not the host link. Device number 15 always
* points to the host link whether PMP is attached or not. If the
* controller has slave link, device number 16 points to it.
*
* LOCKING:
* EH context.
*/
static void ata_force_link_limits(struct ata_link *link)
{
bool did_spd = false;
int linkno = link->pmp;
int i;
if (ata_is_host_link(link))
linkno += 15;
for (i = ata_force_tbl_size - 1; i >= 0; i--) {
const struct ata_force_ent *fe = &ata_force_tbl[i];
if (fe->port != -1 && fe->port != link->ap->print_id)
continue;
if (fe->device != -1 && fe->device != linkno)
continue;
/* only honor the first spd limit */
if (!did_spd && fe->param.spd_limit) {
link->hw_sata_spd_limit = (1 << fe->param.spd_limit) - 1;
ata_link_notice(link, "FORCE: PHY spd limit set to %s\n",
fe->param.name);
did_spd = true;
}
/* let lflags stack */
if (fe->param.lflags) {
link->flags |= fe->param.lflags;
ata_link_notice(link,
"FORCE: link flag 0x%x forced -> 0x%x\n",
fe->param.lflags, link->flags);
}
}
}
/**
* ata_force_xfermask - force xfermask according to libata.force
* @dev: ATA device of interest
*
* Force xfer_mask according to libata.force and whine about it.
* For consistency with link selection, device number 15 selects
* the first device connected to the host link.
*
* LOCKING:
* EH context.
*/
static void ata_force_xfermask(struct ata_device *dev)
{
int devno = dev->link->pmp + dev->devno;
int alt_devno = devno;
int i;
/* allow n.15/16 for devices attached to host port */
if (ata_is_host_link(dev->link))
alt_devno += 15;
for (i = ata_force_tbl_size - 1; i >= 0; i--) {
const struct ata_force_ent *fe = &ata_force_tbl[i];
unsigned long pio_mask, mwdma_mask, udma_mask;
if (fe->port != -1 && fe->port != dev->link->ap->print_id)
continue;
if (fe->device != -1 && fe->device != devno &&
fe->device != alt_devno)
continue;
if (!fe->param.xfer_mask)
continue;
ata_unpack_xfermask(fe->param.xfer_mask,
&pio_mask, &mwdma_mask, &udma_mask);
if (udma_mask)
dev->udma_mask = udma_mask;
else if (mwdma_mask) {
dev->udma_mask = 0;
dev->mwdma_mask = mwdma_mask;
} else {
dev->udma_mask = 0;
dev->mwdma_mask = 0;
dev->pio_mask = pio_mask;
}
ata_dev_notice(dev, "FORCE: xfer_mask set to %s\n",
fe->param.name);
return;
}
}
/**
* ata_force_horkage - force horkage according to libata.force
* @dev: ATA device of interest
*
* Force horkage according to libata.force and whine about it.
* For consistency with link selection, device number 15 selects
* the first device connected to the host link.
*
* LOCKING:
* EH context.
*/
static void ata_force_horkage(struct ata_device *dev)
{
int devno = dev->link->pmp + dev->devno;
int alt_devno = devno;
int i;
/* allow n.15/16 for devices attached to host port */
if (ata_is_host_link(dev->link))
alt_devno += 15;
for (i = 0; i < ata_force_tbl_size; i++) {
const struct ata_force_ent *fe = &ata_force_tbl[i];
if (fe->port != -1 && fe->port != dev->link->ap->print_id)
continue;
if (fe->device != -1 && fe->device != devno &&
fe->device != alt_devno)
continue;
if (!(~dev->horkage & fe->param.horkage_on) &&
!(dev->horkage & fe->param.horkage_off))
continue;
dev->horkage |= fe->param.horkage_on;
dev->horkage &= ~fe->param.horkage_off;
ata_dev_notice(dev, "FORCE: horkage modified (%s)\n",
fe->param.name);
}
}
#else
static inline void ata_force_link_limits(struct ata_link *link) { }
static inline void ata_force_xfermask(struct ata_device *dev) { }
static inline void ata_force_horkage(struct ata_device *dev) { }
#endif
/**
* atapi_cmd_type - Determine ATAPI command type from SCSI opcode
* @opcode: SCSI opcode
*
* Determine ATAPI command type from @opcode.
*
* LOCKING:
* None.
*
* RETURNS:
* ATAPI_{READ|WRITE|READ_CD|PASS_THRU|MISC}
*/
int atapi_cmd_type(u8 opcode)
{
switch (opcode) {
case GPCMD_READ_10:
case GPCMD_READ_12:
return ATAPI_READ;
case GPCMD_WRITE_10:
case GPCMD_WRITE_12:
case GPCMD_WRITE_AND_VERIFY_10:
return ATAPI_WRITE;
case GPCMD_READ_CD:
case GPCMD_READ_CD_MSF:
return ATAPI_READ_CD;
case ATA_16:
case ATA_12:
if (atapi_passthru16)
return ATAPI_PASS_THRU;
fallthrough;
default:
return ATAPI_MISC;
}
}
EXPORT_SYMBOL_GPL(atapi_cmd_type);
static const u8 ata_rw_cmds[] = {
/* pio multi */
ATA_CMD_READ_MULTI,
ATA_CMD_WRITE_MULTI,
ATA_CMD_READ_MULTI_EXT,
ATA_CMD_WRITE_MULTI_EXT,
0,
0,
0,
ATA_CMD_WRITE_MULTI_FUA_EXT,
/* pio */
ATA_CMD_PIO_READ,
ATA_CMD_PIO_WRITE,
ATA_CMD_PIO_READ_EXT,
ATA_CMD_PIO_WRITE_EXT,
0,
0,
0,
0,
/* dma */
ATA_CMD_READ,
ATA_CMD_WRITE,
ATA_CMD_READ_EXT,
ATA_CMD_WRITE_EXT,
0,
0,
0,
ATA_CMD_WRITE_FUA_EXT
};
/**
* ata_rwcmd_protocol - set taskfile r/w commands and protocol
* @tf: command to examine and configure
* @dev: device tf belongs to
*
* Examine the device configuration and tf->flags to calculate
* the proper read/write commands and protocol to use.
*
* LOCKING:
* caller.
*/
static int ata_rwcmd_protocol(struct ata_taskfile *tf, struct ata_device *dev)
{
u8 cmd;
int index, fua, lba48, write;
fua = (tf->flags & ATA_TFLAG_FUA) ? 4 : 0;
lba48 = (tf->flags & ATA_TFLAG_LBA48) ? 2 : 0;
write = (tf->flags & ATA_TFLAG_WRITE) ? 1 : 0;
if (dev->flags & ATA_DFLAG_PIO) {
tf->protocol = ATA_PROT_PIO;
index = dev->multi_count ? 0 : 8;
} else if (lba48 && (dev->link->ap->flags & ATA_FLAG_PIO_LBA48)) {
/* Unable to use DMA due to host limitation */
tf->protocol = ATA_PROT_PIO;
index = dev->multi_count ? 0 : 8;
} else {
tf->protocol = ATA_PROT_DMA;
index = 16;
}
cmd = ata_rw_cmds[index + fua + lba48 + write];
if (cmd) {
tf->command = cmd; return 0;
}
return -1;
}
/**
* ata_tf_read_block - Read block address from ATA taskfile
* @tf: ATA taskfile of interest
* @dev: ATA device @tf belongs to
*
* LOCKING:
* None.
*
* Read block address from @tf. This function can handle all
* three address formats - LBA, LBA48 and CHS. tf->protocol and
* flags select the address format to use.
*
* RETURNS:
* Block address read from @tf.
*/
u64 ata_tf_read_block(const struct ata_taskfile *tf, struct ata_device *dev)
{
u64 block = 0;
if (tf->flags & ATA_TFLAG_LBA) {
if (tf->flags & ATA_TFLAG_LBA48) {
block |= (u64)tf->hob_lbah << 40;
block |= (u64)tf->hob_lbam << 32;
block |= (u64)tf->hob_lbal << 24;
} else
block |= (tf->device & 0xf) << 24;
block |= tf->lbah << 16;
block |= tf->lbam << 8;
block |= tf->lbal;
} else {
u32 cyl, head, sect;
cyl = tf->lbam | (tf->lbah << 8);
head = tf->device & 0xf;
sect = tf->lbal;
if (!sect) {
ata_dev_warn(dev,
"device reported invalid CHS sector 0\n");
return U64_MAX;
}
block = (cyl * dev->heads + head) * dev->sectors + sect - 1;
}
return block;
}
/**
* ata_build_rw_tf - Build ATA taskfile for given read/write request
* @tf: Target ATA taskfile
* @dev: ATA device @tf belongs to
* @block: Block address
* @n_block: Number of blocks
* @tf_flags: RW/FUA etc...
* @tag: tag
* @class: IO priority class
*
* LOCKING:
* None.
*
* Build ATA taskfile @tf for read/write request described by
* @block, @n_block, @tf_flags and @tag on @dev.
*
* RETURNS:
*
* 0 on success, -ERANGE if the request is too large for @dev,
* -EINVAL if the request is invalid.
*/
int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev,
u64 block, u32 n_block, unsigned int tf_flags,
unsigned int tag, int class)
{
tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
tf->flags |= tf_flags;
if (ata_ncq_enabled(dev) && !ata_tag_internal(tag)) {
/* yay, NCQ */
if (!lba_48_ok(block, n_block))
return -ERANGE; tf->protocol = ATA_PROT_NCQ;
tf->flags |= ATA_TFLAG_LBA | ATA_TFLAG_LBA48;
if (tf->flags & ATA_TFLAG_WRITE)
tf->command = ATA_CMD_FPDMA_WRITE;
else
tf->command = ATA_CMD_FPDMA_READ;
tf->nsect = tag << 3;
tf->hob_feature = (n_block >> 8) & 0xff;
tf->feature = n_block & 0xff;
tf->hob_lbah = (block >> 40) & 0xff;
tf->hob_lbam = (block >> 32) & 0xff;
tf->hob_lbal = (block >> 24) & 0xff;
tf->lbah = (block >> 16) & 0xff;
tf->lbam = (block >> 8) & 0xff;
tf->lbal = block & 0xff;
tf->device = ATA_LBA;
if (tf->flags & ATA_TFLAG_FUA)
tf->device |= 1 << 7; if (dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLE &&
class == IOPRIO_CLASS_RT)
tf->hob_nsect |= ATA_PRIO_HIGH << ATA_SHIFT_PRIO; } else if (dev->flags & ATA_DFLAG_LBA) { tf->flags |= ATA_TFLAG_LBA;
if (lba_28_ok(block, n_block)) {
/* use LBA28 */
tf->device |= (block >> 24) & 0xf;
} else if (lba_48_ok(block, n_block)) {
if (!(dev->flags & ATA_DFLAG_LBA48))
return -ERANGE;
/* use LBA48 */
tf->flags |= ATA_TFLAG_LBA48;
tf->hob_nsect = (n_block >> 8) & 0xff;
tf->hob_lbah = (block >> 40) & 0xff;
tf->hob_lbam = (block >> 32) & 0xff;
tf->hob_lbal = (block >> 24) & 0xff;
} else
/* request too large even for LBA48 */
return -ERANGE;
if (unlikely(ata_rwcmd_protocol(tf, dev) < 0))
return -EINVAL;
tf->nsect = n_block & 0xff;
tf->lbah = (block >> 16) & 0xff;
tf->lbam = (block >> 8) & 0xff;
tf->lbal = block & 0xff;
tf->device |= ATA_LBA;
} else {
/* CHS */
u32 sect, head, cyl, track;
/* The request -may- be too large for CHS addressing. */
if (!lba_28_ok(block, n_block))
return -ERANGE;
if (unlikely(ata_rwcmd_protocol(tf, dev) < 0))
return -EINVAL;
/* Convert LBA to CHS */
track = (u32)block / dev->sectors;
cyl = track / dev->heads;
head = track % dev->heads;
sect = (u32)block % dev->sectors + 1;
DPRINTK("block %u track %u cyl %u head %u sect %u\n",
(u32)block, track, cyl, head, sect);
/* Check whether the converted CHS can fit.
Cylinder: 0-65535
Head: 0-15
Sector: 1-255*/
if ((cyl >> 16) || (head >> 4) || (sect >> 8) || (!sect))
return -ERANGE;
tf->nsect = n_block & 0xff; /* Sector count 0 means 256 sectors */
tf->lbal = sect;
tf->lbam = cyl;
tf->lbah = cyl >> 8;
tf->device |= head;
}
return 0;
}
/**
* ata_pack_xfermask - Pack pio, mwdma and udma masks into xfer_mask
* @pio_mask: pio_mask
* @mwdma_mask: mwdma_mask
* @udma_mask: udma_mask
*
* Pack @pio_mask, @mwdma_mask and @udma_mask into a single
* unsigned int xfer_mask.
*
* LOCKING:
* None.
*
* RETURNS:
* Packed xfer_mask.
*/
unsigned long ata_pack_xfermask(unsigned long pio_mask,
unsigned long mwdma_mask,
unsigned long udma_mask)
{
return ((pio_mask << ATA_SHIFT_PIO) & ATA_MASK_PIO) |
((mwdma_mask << ATA_SHIFT_MWDMA) & ATA_MASK_MWDMA) |
((udma_mask << ATA_SHIFT_UDMA) & ATA_MASK_UDMA);
}
EXPORT_SYMBOL_GPL(ata_pack_xfermask);
/**
* ata_unpack_xfermask - Unpack xfer_mask into pio, mwdma and udma masks
* @xfer_mask: xfer_mask to unpack
* @pio_mask: resulting pio_mask
* @mwdma_mask: resulting mwdma_mask
* @udma_mask: resulting udma_mask
*
* Unpack @xfer_mask into @pio_mask, @mwdma_mask and @udma_mask.
* Any NULL destination masks will be ignored.
*/
void ata_unpack_xfermask(unsigned long xfer_mask, unsigned long *pio_mask,
unsigned long *mwdma_mask, unsigned long *udma_mask)
{
if (pio_mask)
*pio_mask = (xfer_mask & ATA_MASK_PIO) >> ATA_SHIFT_PIO;
if (mwdma_mask)
*mwdma_mask = (xfer_mask & ATA_MASK_MWDMA) >> ATA_SHIFT_MWDMA;
if (udma_mask)
*udma_mask = (xfer_mask & ATA_MASK_UDMA) >> ATA_SHIFT_UDMA;
}
static const struct ata_xfer_ent {
int shift, bits;
u8 base;
} ata_xfer_tbl[] = {
{ ATA_SHIFT_PIO, ATA_NR_PIO_MODES, XFER_PIO_0 },
{ ATA_SHIFT_MWDMA, ATA_NR_MWDMA_MODES, XFER_MW_DMA_0 },
{ ATA_SHIFT_UDMA, ATA_NR_UDMA_MODES, XFER_UDMA_0 },
{ -1, },
};
/**
* ata_xfer_mask2mode - Find matching XFER_* for the given xfer_mask
* @xfer_mask: xfer_mask of interest
*
* Return matching XFER_* value for @xfer_mask. Only the highest
* bit of @xfer_mask is considered.
*
* LOCKING:
* None.
*
* RETURNS:
* Matching XFER_* value, 0xff if no match found.
*/
u8 ata_xfer_mask2mode(unsigned long xfer_mask)
{
int highbit = fls(xfer_mask) - 1;
const struct ata_xfer_ent *ent;
for (ent = ata_xfer_tbl; ent->shift >= 0; ent++)
if (highbit >= ent->shift && highbit < ent->shift + ent->bits)
return ent->base + highbit - ent->shift;
return 0xff;
}
EXPORT_SYMBOL_GPL(ata_xfer_mask2mode);
/**
* ata_xfer_mode2mask - Find matching xfer_mask for XFER_*
* @xfer_mode: XFER_* of interest
*
* Return matching xfer_mask for @xfer_mode.
*
* LOCKING:
* None.
*
* RETURNS:
* Matching xfer_mask, 0 if no match found.
*/
unsigned long ata_xfer_mode2mask(u8 xfer_mode)
{
const struct ata_xfer_ent *ent;
for (ent = ata_xfer_tbl; ent->shift >= 0; ent++)
if (xfer_mode >= ent->base && xfer_mode < ent->base + ent->bits)
return ((2 << (ent->shift + xfer_mode - ent->base)) - 1)
& ~((1 << ent->shift) - 1);
return 0;
}
EXPORT_SYMBOL_GPL(ata_xfer_mode2mask);
/**
* ata_xfer_mode2shift - Find matching xfer_shift for XFER_*
* @xfer_mode: XFER_* of interest
*
* Return matching xfer_shift for @xfer_mode.
*
* LOCKING:
* None.
*
* RETURNS:
* Matching xfer_shift, -1 if no match found.
*/
int ata_xfer_mode2shift(unsigned long xfer_mode)
{
const struct ata_xfer_ent *ent;
for (ent = ata_xfer_tbl; ent->shift >= 0; ent++)
if (xfer_mode >= ent->base && xfer_mode < ent->base + ent->bits)
return ent->shift;
return -1;
}
EXPORT_SYMBOL_GPL(ata_xfer_mode2shift);
/**
* ata_mode_string - convert xfer_mask to string
* @xfer_mask: mask of bits supported; only highest bit counts.
*
* Determine string which represents the highest speed
* (highest bit in @modemask).
*
* LOCKING:
* None.
*
* RETURNS:
* Constant C string representing highest speed listed in
* @mode_mask, or the constant C string "<n/a>".
*/
const char *ata_mode_string(unsigned long xfer_mask)
{
static const char * const xfer_mode_str[] = {
"PIO0",
"PIO1",
"PIO2",
"PIO3",
"PIO4",
"PIO5",
"PIO6",
"MWDMA0",
"MWDMA1",
"MWDMA2",
"MWDMA3",
"MWDMA4",
"UDMA/16",
"UDMA/25",
"UDMA/33",
"UDMA/44",
"UDMA/66",
"UDMA/100",
"UDMA/133",
"UDMA7",
};
int highbit;
highbit = fls(xfer_mask) - 1;
if (highbit >= 0 && highbit < ARRAY_SIZE(xfer_mode_str))
return xfer_mode_str[highbit];
return "<n/a>";
}
EXPORT_SYMBOL_GPL(ata_mode_string);
const char *sata_spd_string(unsigned int spd)
{
static const char * const spd_str[] = {
"1.5 Gbps",
"3.0 Gbps",
"6.0 Gbps",
};
if (spd == 0 || (spd - 1) >= ARRAY_SIZE(spd_str))
return "<unknown>";
return spd_str[spd - 1];
}
/**
* ata_dev_classify - determine device type based on ATA-spec signature
* @tf: ATA taskfile register set for device to be identified
*
* Determine from taskfile register contents whether a device is
* ATA or ATAPI, as per "Signature and persistence" section
* of ATA/PI spec (volume 1, sect 5.14).
*
* LOCKING:
* None.
*
* RETURNS:
* Device type, %ATA_DEV_ATA, %ATA_DEV_ATAPI, %ATA_DEV_PMP,
* %ATA_DEV_ZAC, or %ATA_DEV_UNKNOWN the event of failure.
*/
unsigned int ata_dev_classify(const struct ata_taskfile *tf)
{
/* Apple's open source Darwin code hints that some devices only
* put a proper signature into the LBA mid/high registers,
* So, we only check those. It's sufficient for uniqueness.
*
* ATA/ATAPI-7 (d1532v1r1: Feb. 19, 2003) specified separate
* signatures for ATA and ATAPI devices attached on SerialATA,
* 0x3c/0xc3 and 0x69/0x96 respectively. However, SerialATA
* spec has never mentioned about using different signatures
* for ATA/ATAPI devices. Then, Serial ATA II: Port
* Multiplier specification began to use 0x69/0x96 to identify
* port multpliers and 0x3c/0xc3 to identify SEMB device.
* ATA/ATAPI-7 dropped descriptions about 0x3c/0xc3 and
* 0x69/0x96 shortly and described them as reserved for
* SerialATA.
*
* We follow the current spec and consider that 0x69/0x96
* identifies a port multiplier and 0x3c/0xc3 a SEMB device.
* Unfortunately, WDC WD1600JS-62MHB5 (a hard drive) reports
* SEMB signature. This is worked around in
* ata_dev_read_id().
*/
if ((tf->lbam == 0) && (tf->lbah == 0)) {
DPRINTK("found ATA device by sig\n");
return ATA_DEV_ATA;
}
if ((tf->lbam == 0x14) && (tf->lbah == 0xeb)) {
DPRINTK("found ATAPI device by sig\n");
return ATA_DEV_ATAPI;
}
if ((tf->lbam == 0x69) && (tf->lbah == 0x96)) {
DPRINTK("found PMP device by sig\n");
return ATA_DEV_PMP;
}
if ((tf->lbam == 0x3c) && (tf->lbah == 0xc3)) {
DPRINTK("found SEMB device by sig (could be ATA device)\n");
return ATA_DEV_SEMB;
}
if ((tf->lbam == 0xcd) && (tf->lbah == 0xab)) {
DPRINTK("found ZAC device by sig\n");
return ATA_DEV_ZAC;
}
DPRINTK("unknown device\n");
return ATA_DEV_UNKNOWN;
}
EXPORT_SYMBOL_GPL(ata_dev_classify);
/**
* ata_id_string - Convert IDENTIFY DEVICE page into string
* @id: IDENTIFY DEVICE results we will examine
* @s: string into which data is output
* @ofs: offset into identify device page
* @len: length of string to return. must be an even number.
*
* The strings in the IDENTIFY DEVICE page are broken up into
* 16-bit chunks. Run through the string, and output each
* 8-bit chunk linearly, regardless of platform.
*
* LOCKING:
* caller.
*/
void ata_id_string(const u16 *id, unsigned char *s,
unsigned int ofs, unsigned int len)
{
unsigned int c;
BUG_ON(len & 1);
while (len > 0) {
c = id[ofs] >> 8;
*s = c;
s++;
c = id[ofs] & 0xff;
*s = c;
s++;
ofs++;
len -= 2;
}
}
EXPORT_SYMBOL_GPL(ata_id_string);
/**
* ata_id_c_string - Convert IDENTIFY DEVICE page into C string
* @id: IDENTIFY DEVICE results we will examine
* @s: string into which data is output
* @ofs: offset into identify device page
* @len: length of string to return. must be an odd number.
*
* This function is identical to ata_id_string except that it
* trims trailing spaces and terminates the resulting string with
* null. @len must be actual maximum length (even number) + 1.
*
* LOCKING:
* caller.
*/
void ata_id_c_string(const u16 *id, unsigned char *s,
unsigned int ofs, unsigned int len)
{
unsigned char *p;
ata_id_string(id, s, ofs, len - 1);
p = s + strnlen(s, len - 1);
while (p > s && p[-1] == ' ')
p--;
*p = '\0';
}
EXPORT_SYMBOL_GPL(ata_id_c_string);
static u64 ata_id_n_sectors(const u16 *id)
{
if (ata_id_has_lba(id)) {
if (ata_id_has_lba48(id))
return ata_id_u64(id, ATA_ID_LBA_CAPACITY_2);
else
return ata_id_u32(id, ATA_ID_LBA_CAPACITY);
} else {
if (ata_id_current_chs_valid(id))
return id[ATA_ID_CUR_CYLS] * id[ATA_ID_CUR_HEADS] *
id[ATA_ID_CUR_SECTORS];
else
return id[ATA_ID_CYLS] * id[ATA_ID_HEADS] *
id[ATA_ID_SECTORS];
}
}
u64 ata_tf_to_lba48(const struct ata_taskfile *tf)
{
u64 sectors = 0;
sectors |= ((u64)(tf->hob_lbah & 0xff)) << 40;
sectors |= ((u64)(tf->hob_lbam & 0xff)) << 32;
sectors |= ((u64)(tf->hob_lbal & 0xff)) << 24;
sectors |= (tf->lbah & 0xff) << 16;
sectors |= (tf->lbam & 0xff) << 8;
sectors |= (tf->lbal & 0xff);
return sectors;
}
u64 ata_tf_to_lba(const struct ata_taskfile *tf)
{
u64 sectors = 0;
sectors |= (tf->device & 0x0f) << 24;
sectors |= (tf->lbah & 0xff) << 16;
sectors |= (tf->lbam & 0xff) << 8;
sectors |= (tf->lbal & 0xff);
return sectors;
}
/**
* ata_read_native_max_address - Read native max address
* @dev: target device
* @max_sectors: out parameter for the result native max address
*
* Perform an LBA48 or LBA28 native size query upon the device in
* question.
*
* RETURNS:
* 0 on success, -EACCES if command is aborted by the drive.
* -EIO on other errors.
*/
static int ata_read_native_max_address(struct ata_device *dev, u64 *max_sectors)
{
unsigned int err_mask;
struct ata_taskfile tf;
int lba48 = ata_id_has_lba48(dev->id);
ata_tf_init(dev, &tf);
/* always clear all address registers */
tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR;
if (lba48) {
tf.command = ATA_CMD_READ_NATIVE_MAX_EXT;
tf.flags |= ATA_TFLAG_LBA48;
} else
tf.command = ATA_CMD_READ_NATIVE_MAX;
tf.protocol = ATA_PROT_NODATA;
tf.device |= ATA_LBA;
err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
if (err_mask) {
ata_dev_warn(dev,
"failed to read native max address (err_mask=0x%x)\n",
err_mask);
if (err_mask == AC_ERR_DEV && (tf.feature & ATA_ABORTED))
return -EACCES;
return -EIO;
}
if (lba48)
*max_sectors = ata_tf_to_lba48(&tf) + 1;
else
*max_sectors = ata_tf_to_lba(&tf) + 1;
if (dev->horkage & ATA_HORKAGE_HPA_SIZE)
(*max_sectors)--;
return 0;
}
/**
* ata_set_max_sectors - Set max sectors
* @dev: target device
* @new_sectors: new max sectors value to set for the device
*
* Set max sectors of @dev to @new_sectors.
*
* RETURNS:
* 0 on success, -EACCES if command is aborted or denied (due to
* previous non-volatile SET_MAX) by the drive. -EIO on other
* errors.
*/
static int ata_set_max_sectors(struct ata_device *dev, u64 new_sectors)
{
unsigned int err_mask;
struct ata_taskfile tf;
int lba48 = ata_id_has_lba48(dev->id);
new_sectors--;
ata_tf_init(dev, &tf);
tf.flags |= ATA_TFLAG_DEVICE | ATA_TFLAG_ISADDR;
if (lba48) {
tf.command = ATA_CMD_SET_MAX_EXT;
tf.flags |= ATA_TFLAG_LBA48;
tf.hob_lbal = (new_sectors >> 24) & 0xff;
tf.hob_lbam = (new_sectors >> 32) & 0xff;
tf.hob_lbah = (new_sectors >> 40) & 0xff;
} else {
tf.command = ATA_CMD_SET_MAX;
tf.device |= (new_sectors >> 24) & 0xf;
}
tf.protocol = ATA_PROT_NODATA;
tf.device |= ATA_LBA;
tf.lbal = (new_sectors >> 0) & 0xff;
tf.lbam = (new_sectors >> 8) & 0xff;
tf.lbah = (new_sectors >> 16) & 0xff;
err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
if (err_mask) {
ata_dev_warn(dev,
"failed to set max address (err_mask=0x%x)\n",
err_mask);
if (err_mask == AC_ERR_DEV &&
(tf.feature & (ATA_ABORTED | ATA_IDNF)))
return -EACCES;
return -EIO;
}
return 0;
}
/**
* ata_hpa_resize - Resize a device with an HPA set
* @dev: Device to resize
*
* Read the size of an LBA28 or LBA48 disk with HPA features and resize
* it if required to the full size of the media. The caller must check
* the drive has the HPA feature set enabled.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
static int ata_hpa_resize(struct ata_device *dev)
{
bool print_info = ata_dev_print_info(dev);
bool unlock_hpa = ata_ignore_hpa || dev->flags & ATA_DFLAG_UNLOCK_HPA;
u64 sectors = ata_id_n_sectors(dev->id);
u64 native_sectors;
int rc;
/* do we need to do it? */
if ((dev->class != ATA_DEV_ATA && dev->class != ATA_DEV_ZAC) ||
!ata_id_has_lba(dev->id) || !ata_id_hpa_enabled(dev->id) ||
(dev->horkage & ATA_HORKAGE_BROKEN_HPA))
return 0;
/* read native max address */
rc = ata_read_native_max_address(dev, &native_sectors);
if (rc) {
/* If device aborted the command or HPA isn't going to
* be unlocked, skip HPA resizing.
*/
if (rc == -EACCES || !unlock_hpa) {
ata_dev_warn(dev,
"HPA support seems broken, skipping HPA handling\n");
dev->horkage |= ATA_HORKAGE_BROKEN_HPA;
/* we can continue if device aborted the command */
if (rc == -EACCES)
rc = 0;
}
return rc;
}
dev->n_native_sectors = native_sectors;
/* nothing to do? */
if (native_sectors <= sectors || !unlock_hpa) {
if (!print_info || native_sectors == sectors)
return 0;
if (native_sectors > sectors)
ata_dev_info(dev,
"HPA detected: current %llu, native %llu\n",
(unsigned long long)sectors,
(unsigned long long)native_sectors);
else if (native_sectors < sectors)
ata_dev_warn(dev,
"native sectors (%llu) is smaller than sectors (%llu)\n",
(unsigned long long)native_sectors,
(unsigned long long)sectors);
return 0;
}
/* let's unlock HPA */
rc = ata_set_max_sectors(dev, native_sectors);
if (rc == -EACCES) {
/* if device aborted the command, skip HPA resizing */
ata_dev_warn(dev,
"device aborted resize (%llu -> %llu), skipping HPA handling\n",
(unsigned long long)sectors,
(unsigned long long)native_sectors);
dev->horkage |= ATA_HORKAGE_BROKEN_HPA;
return 0;
} else if (rc)
return rc;
/* re-read IDENTIFY data */
rc = ata_dev_reread_id(dev, 0);
if (rc) {
ata_dev_err(dev,
"failed to re-read IDENTIFY data after HPA resizing\n");
return rc;
}
if (print_info) {
u64 new_sectors = ata_id_n_sectors(dev->id);
ata_dev_info(dev,
"HPA unlocked: %llu -> %llu, native %llu\n",
(unsigned long long)sectors,
(unsigned long long)new_sectors,
(unsigned long long)native_sectors);
}
return 0;
}
/**
* ata_dump_id - IDENTIFY DEVICE info debugging output
* @id: IDENTIFY DEVICE page to dump
*
* Dump selected 16-bit words from the given IDENTIFY DEVICE
* page.
*
* LOCKING:
* caller.
*/
static inline void ata_dump_id(const u16 *id)
{
DPRINTK("49==0x%04x "
"53==0x%04x "
"63==0x%04x "
"64==0x%04x "
"75==0x%04x \n",
id[49],
id[53],
id[63],
id[64],
id[75]);
DPRINTK("80==0x%04x "
"81==0x%04x "
"82==0x%04x "
"83==0x%04x "
"84==0x%04x \n",
id[80],
id[81],
id[82],
id[83],
id[84]);
DPRINTK("88==0x%04x "
"93==0x%04x\n",
id[88],
id[93]);
}
/**
* ata_id_xfermask - Compute xfermask from the given IDENTIFY data
* @id: IDENTIFY data to compute xfer mask from
*
* Compute the xfermask for this device. This is not as trivial
* as it seems if we must consider early devices correctly.
*
* FIXME: pre IDE drive timing (do we care ?).
*
* LOCKING:
* None.
*
* RETURNS:
* Computed xfermask
*/
unsigned long ata_id_xfermask(const u16 *id)
{
unsigned long pio_mask, mwdma_mask, udma_mask;
/* Usual case. Word 53 indicates word 64 is valid */
if (id[ATA_ID_FIELD_VALID] & (1 << 1)) {
pio_mask = id[ATA_ID_PIO_MODES] & 0x03;
pio_mask <<= 3;
pio_mask |= 0x7;
} else {
/* If word 64 isn't valid then Word 51 high byte holds
* the PIO timing number for the maximum. Turn it into
* a mask.
*/
u8 mode = (id[ATA_ID_OLD_PIO_MODES] >> 8) & 0xFF;
if (mode < 5) /* Valid PIO range */
pio_mask = (2 << mode) - 1;
else
pio_mask = 1;
/* But wait.. there's more. Design your standards by
* committee and you too can get a free iordy field to
* process. However its the speeds not the modes that
* are supported... Note drivers using the timing API
* will get this right anyway
*/
}
mwdma_mask = id[ATA_ID_MWDMA_MODES] & 0x07;
if (ata_id_is_cfa(id)) {
/*
* Process compact flash extended modes
*/
int pio = (id[ATA_ID_CFA_MODES] >> 0) & 0x7;
int dma = (id[ATA_ID_CFA_MODES] >> 3) & 0x7;
if (pio)
pio_mask |= (1 << 5);
if (pio > 1)
pio_mask |= (1 << 6);
if (dma)
mwdma_mask |= (1 << 3);
if (dma > 1)
mwdma_mask |= (1 << 4);
}
udma_mask = 0;
if (id[ATA_ID_FIELD_VALID] & (1 << 2))
udma_mask = id[ATA_ID_UDMA_MODES] & 0xff;
return ata_pack_xfermask(pio_mask, mwdma_mask, udma_mask);
}
EXPORT_SYMBOL_GPL(ata_id_xfermask);
static void ata_qc_complete_internal(struct ata_queued_cmd *qc)
{
struct completion *waiting = qc->private_data;
complete(waiting);
}
/**
* ata_exec_internal_sg - execute libata internal command
* @dev: Device to which the command is sent
* @tf: Taskfile registers for the command and the result
* @cdb: CDB for packet command
* @dma_dir: Data transfer direction of the command
* @sgl: sg list for the data buffer of the command
* @n_elem: Number of sg entries
* @timeout: Timeout in msecs (0 for default)
*
* Executes libata internal command with timeout. @tf contains
* command on entry and result on return. Timeout and error
* conditions are reported via return value. No recovery action
* is taken after a command times out. It's caller's duty to
* clean up after timeout.
*
* LOCKING:
* None. Should be called with kernel context, might sleep.
*
* RETURNS:
* Zero on success, AC_ERR_* mask on failure
*/
unsigned ata_exec_internal_sg(struct ata_device *dev,
struct ata_taskfile *tf, const u8 *cdb,
int dma_dir, struct scatterlist *sgl,
unsigned int n_elem, unsigned long timeout)
{
struct ata_link *link = dev->link;
struct ata_port *ap = link->ap;
u8 command = tf->command;
int auto_timeout = 0;
struct ata_queued_cmd *qc;
unsigned int preempted_tag;
u32 preempted_sactive;
u64 preempted_qc_active;
int preempted_nr_active_links;
DECLARE_COMPLETION_ONSTACK(wait);
unsigned long flags;
unsigned int err_mask;
int rc;
spin_lock_irqsave(ap->lock, flags);
/* no internal command while frozen */
if (ap->pflags & ATA_PFLAG_FROZEN) {
spin_unlock_irqrestore(ap->lock, flags);
return AC_ERR_SYSTEM;
}
/* initialize internal qc */
qc = __ata_qc_from_tag(ap, ATA_TAG_INTERNAL);
qc->tag = ATA_TAG_INTERNAL;
qc->hw_tag = 0;
qc->scsicmd = NULL;
qc->ap = ap;
qc->dev = dev;
ata_qc_reinit(qc);
preempted_tag = link->active_tag;
preempted_sactive = link->sactive;
preempted_qc_active = ap->qc_active;
preempted_nr_active_links = ap->nr_active_links;
link->active_tag = ATA_TAG_POISON;
link->sactive = 0;
ap->qc_active = 0;
ap->nr_active_links = 0;
/* prepare & issue qc */
qc->tf = *tf;
if (cdb)
memcpy(qc->cdb, cdb, ATAPI_CDB_LEN);
/* some SATA bridges need us to indicate data xfer direction */
if (tf->protocol == ATAPI_PROT_DMA && (dev->flags & ATA_DFLAG_DMADIR) &&
dma_dir == DMA_FROM_DEVICE)
qc->tf.feature |= ATAPI_DMADIR;
qc->flags |= ATA_QCFLAG_RESULT_TF;
qc->dma_dir = dma_dir;
if (dma_dir != DMA_NONE) {
unsigned int i, buflen = 0;
struct scatterlist *sg;
for_each_sg(sgl, sg, n_elem, i)
buflen += sg->length;
ata_sg_init(qc, sgl, n_elem);
qc->nbytes = buflen;
}
qc->private_data = &wait;
qc->complete_fn = ata_qc_complete_internal;
ata_qc_issue(qc);
spin_unlock_irqrestore(ap->lock, flags);
if (!timeout) {
if (ata_probe_timeout)
timeout = ata_probe_timeout * 1000;
else {
timeout = ata_internal_cmd_timeout(dev, command);
auto_timeout = 1;
}
}
if (ap->ops->error_handler)
ata_eh_release(ap);
rc = wait_for_completion_timeout(&wait, msecs_to_jiffies(timeout));
if (ap->ops->error_handler)
ata_eh_acquire(ap);
ata_sff_flush_pio_task(ap);
if (!rc) {
spin_lock_irqsave(ap->lock, flags);
/* We're racing with irq here. If we lose, the
* following test prevents us from completing the qc
* twice. If we win, the port is frozen and will be
* cleaned up by ->post_internal_cmd().
*/
if (qc->flags & ATA_QCFLAG_ACTIVE) {
qc->err_mask |= AC_ERR_TIMEOUT;
if (ap->ops->error_handler)
ata_port_freeze(ap);
else
ata_qc_complete(qc);
if (ata_msg_warn(ap))
ata_dev_warn(dev, "qc timeout (cmd 0x%x)\n",
command);
}
spin_unlock_irqrestore(ap->lock, flags);
}
/* do post_internal_cmd */
if (ap->ops->post_internal_cmd)
ap->ops->post_internal_cmd(qc);
/* perform minimal error analysis */
if (qc->flags & ATA_QCFLAG_FAILED) {
if (qc->result_tf.command & (ATA_ERR | ATA_DF))
qc->err_mask |= AC_ERR_DEV;
if (!qc->err_mask)
qc->err_mask |= AC_ERR_OTHER;
if (qc->err_mask & ~AC_ERR_OTHER)
qc->err_mask &= ~AC_ERR_OTHER;
} else if (qc->tf.command == ATA_CMD_REQ_SENSE_DATA) {
qc->result_tf.command |= ATA_SENSE;
}
/* finish up */
spin_lock_irqsave(ap->lock, flags);
*tf = qc->result_tf;
err_mask = qc->err_mask;
ata_qc_free(qc);
link->active_tag = preempted_tag;
link->sactive = preempted_sactive;
ap->qc_active = preempted_qc_active;
ap->nr_active_links = preempted_nr_active_links;
spin_unlock_irqrestore(ap->lock, flags);
if ((err_mask & AC_ERR_TIMEOUT) && auto_timeout)
ata_internal_cmd_timed_out(dev, command);
return err_mask;
}
/**
* ata_exec_internal - execute libata internal command
* @dev: Device to which the command is sent
* @tf: Taskfile registers for the command and the result
* @cdb: CDB for packet command
* @dma_dir: Data transfer direction of the command
* @buf: Data buffer of the command
* @buflen: Length of data buffer
* @timeout: Timeout in msecs (0 for default)
*
* Wrapper around ata_exec_internal_sg() which takes simple
* buffer instead of sg list.
*
* LOCKING:
* None. Should be called with kernel context, might sleep.
*
* RETURNS:
* Zero on success, AC_ERR_* mask on failure
*/
unsigned ata_exec_internal(struct ata_device *dev,
struct ata_taskfile *tf, const u8 *cdb,
int dma_dir, void *buf, unsigned int buflen,
unsigned long timeout)
{
struct scatterlist *psg = NULL, sg;
unsigned int n_elem = 0;
if (dma_dir != DMA_NONE) {
WARN_ON(!buf);
sg_init_one(&sg, buf, buflen);
psg = &sg;
n_elem++;
}
return ata_exec_internal_sg(dev, tf, cdb, dma_dir, psg, n_elem,
timeout);
}
/**
* ata_pio_need_iordy - check if iordy needed
* @adev: ATA device
*
* Check if the current speed of the device requires IORDY. Used
* by various controllers for chip configuration.
*/
unsigned int ata_pio_need_iordy(const struct ata_device *adev)
{
/* Don't set IORDY if we're preparing for reset. IORDY may
* lead to controller lock up on certain controllers if the
* port is not occupied. See bko#11703 for details.
*/
if (adev->link->ap->pflags & ATA_PFLAG_RESETTING)
return 0;
/* Controller doesn't support IORDY. Probably a pointless
* check as the caller should know this.
*/
if (adev->link->ap->flags & ATA_FLAG_NO_IORDY)
return 0;
/* CF spec. r4.1 Table 22 says no iordy on PIO5 and PIO6. */
if (ata_id_is_cfa(adev->id)
&& (adev->pio_mode == XFER_PIO_5 || adev->pio_mode == XFER_PIO_6))
return 0;
/* PIO3 and higher it is mandatory */
if (adev->pio_mode > XFER_PIO_2)
return 1;
/* We turn it on when possible */
if (ata_id_has_iordy(adev->id))
return 1;
return 0;
}
EXPORT_SYMBOL_GPL(ata_pio_need_iordy);
/**
* ata_pio_mask_no_iordy - Return the non IORDY mask
* @adev: ATA device
*
* Compute the highest mode possible if we are not using iordy. Return
* -1 if no iordy mode is available.
*/
static u32 ata_pio_mask_no_iordy(const struct ata_device *adev)
{
/* If we have no drive specific rule, then PIO 2 is non IORDY */
if (adev->id[ATA_ID_FIELD_VALID] & 2) { /* EIDE */
u16 pio = adev->id[ATA_ID_EIDE_PIO];
/* Is the speed faster than the drive allows non IORDY ? */
if (pio) {
/* This is cycle times not frequency - watch the logic! */
if (pio > 240) /* PIO2 is 240nS per cycle */
return 3 << ATA_SHIFT_PIO;
return 7 << ATA_SHIFT_PIO;
}
}
return 3 << ATA_SHIFT_PIO;
}
/**
* ata_do_dev_read_id - default ID read method
* @dev: device
* @tf: proposed taskfile
* @id: data buffer
*
* Issue the identify taskfile and hand back the buffer containing
* identify data. For some RAID controllers and for pre ATA devices
* this function is wrapped or replaced by the driver
*/
unsigned int ata_do_dev_read_id(struct ata_device *dev,
struct ata_taskfile *tf, u16 *id)
{
return ata_exec_internal(dev, tf, NULL, DMA_FROM_DEVICE,
id, sizeof(id[0]) * ATA_ID_WORDS, 0);
}
EXPORT_SYMBOL_GPL(ata_do_dev_read_id);
/**
* ata_dev_read_id - Read ID data from the specified device
* @dev: target device
* @p_class: pointer to class of the target device (may be changed)
* @flags: ATA_READID_* flags
* @id: buffer to read IDENTIFY data into
*
* Read ID data from the specified device. ATA_CMD_ID_ATA is
* performed on ATA devices and ATA_CMD_ID_ATAPI on ATAPI
* devices. This function also issues ATA_CMD_INIT_DEV_PARAMS
* for pre-ATA4 drives.
*
* FIXME: ATA_CMD_ID_ATA is optional for early drives and right
* now we abort if we hit that case.
*
* LOCKING:
* Kernel thread context (may sleep)
*
* RETURNS:
* 0 on success, -errno otherwise.
*/
int ata_dev_read_id(struct ata_device *dev, unsigned int *p_class,
unsigned int flags, u16 *id)
{
struct ata_port *ap = dev->link->ap;
unsigned int class = *p_class;
struct ata_taskfile tf;
unsigned int err_mask = 0;
const char *reason;
bool is_semb = class == ATA_DEV_SEMB;
int may_fallback = 1, tried_spinup = 0;
int rc;
if (ata_msg_ctl(ap))
ata_dev_dbg(dev, "%s: ENTER\n", __func__);
retry:
ata_tf_init(dev, &tf);
switch (class) {
case ATA_DEV_SEMB:
class = ATA_DEV_ATA; /* some hard drives report SEMB sig */
fallthrough;
case ATA_DEV_ATA:
case ATA_DEV_ZAC:
tf.command = ATA_CMD_ID_ATA;
break;
case ATA_DEV_ATAPI:
tf.command = ATA_CMD_ID_ATAPI;
break;
default:
rc = -ENODEV;
reason = "unsupported class";
goto err_out;
}
tf.protocol = ATA_PROT_PIO;
/* Some devices choke if TF registers contain garbage. Make
* sure those are properly initialized.
*/
tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
/* Device presence detection is unreliable on some
* controllers. Always poll IDENTIFY if available.
*/
tf.flags |= ATA_TFLAG_POLLING;
if (ap->ops->read_id)
err_mask = ap->ops->read_id(dev, &tf, id);
else
err_mask = ata_do_dev_read_id(dev, &tf, id);
if (err_mask) {
if (err_mask & AC_ERR_NODEV_HINT) {
ata_dev_dbg(dev, "NODEV after polling detection\n");
return -ENOENT;
}
if (is_semb) {
ata_dev_info(dev,
"IDENTIFY failed on device w/ SEMB sig, disabled\n");
/* SEMB is not supported yet */
*p_class = ATA_DEV_SEMB_UNSUP;
return 0;
}
if ((err_mask == AC_ERR_DEV) && (tf.feature & ATA_ABORTED)) {
/* Device or controller might have reported
* the wrong device class. Give a shot at the
* other IDENTIFY if the current one is
* aborted by the device.
*/
if (may_fallback) {
may_fallback = 0;
if (class == ATA_DEV_ATA)
class = ATA_DEV_ATAPI;
else
class = ATA_DEV_ATA;
goto retry;
}
/* Control reaches here iff the device aborted
* both flavors of IDENTIFYs which happens
* sometimes with phantom devices.
*/
ata_dev_dbg(dev,
"both IDENTIFYs aborted, assuming NODEV\n");
return -ENOENT;
}
rc = -EIO;
reason = "I/O error";
goto err_out;
}
if (dev->horkage & ATA_HORKAGE_DUMP_ID) {
ata_dev_dbg(dev, "dumping IDENTIFY data, "
"class=%d may_fallback=%d tried_spinup=%d\n",
class, may_fallback, tried_spinup);
print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET,
16, 2, id, ATA_ID_WORDS * sizeof(*id), true);
}
/* Falling back doesn't make sense if ID data was read
* successfully at least once.
*/
may_fallback = 0;
swap_buf_le16(id, ATA_ID_WORDS);
/* sanity check */
rc = -EINVAL;
reason = "device reports invalid type";
if (class == ATA_DEV_ATA || class == ATA_DEV_ZAC) {
if (!ata_id_is_ata(id) && !ata_id_is_cfa(id))
goto err_out;
if (ap->host->flags & ATA_HOST_IGNORE_ATA &&
ata_id_is_ata(id)) {
ata_dev_dbg(dev,
"host indicates ignore ATA devices, ignored\n");
return -ENOENT;
}
} else {
if (ata_id_is_ata(id))
goto err_out;
}
if (!tried_spinup && (id[2] == 0x37c8 || id[2] == 0x738c)) {
tried_spinup = 1;
/*
* Drive powered-up in standby mode, and requires a specific
* SET_FEATURES spin-up subcommand before it will accept
* anything other than the original IDENTIFY command.
*/
err_mask = ata_dev_set_feature(dev, SETFEATURES_SPINUP, 0);
if (err_mask && id[2] != 0x738c) {
rc = -EIO;
reason = "SPINUP failed";
goto err_out;
}
/*
* If the drive initially returned incomplete IDENTIFY info,
* we now must reissue the IDENTIFY command.
*/
if (id[2] == 0x37c8)
goto retry;
}
if ((flags & ATA_READID_POSTRESET) &&
(class == ATA_DEV_ATA || class == ATA_DEV_ZAC)) {
/*
* The exact sequence expected by certain pre-ATA4 drives is:
* SRST RESET
* IDENTIFY (optional in early ATA)
* INITIALIZE DEVICE PARAMETERS (later IDE and ATA)
* anything else..
* Some drives were very specific about that exact sequence.
*
* Note that ATA4 says lba is mandatory so the second check
* should never trigger.
*/
if (ata_id_major_version(id) < 4 || !ata_id_has_lba(id)) {
err_mask = ata_dev_init_params(dev, id[3], id[6]);
if (err_mask) {
rc = -EIO;
reason = "INIT_DEV_PARAMS failed";
goto err_out;
}
/* current CHS translation info (id[53-58]) might be
* changed. reread the identify device info.
*/
flags &= ~ATA_READID_POSTRESET;
goto retry;
}
}
*p_class = class;
return 0;
err_out:
if (ata_msg_warn(ap))
ata_dev_warn(dev, "failed to IDENTIFY (%s, err_mask=0x%x)\n",
reason, err_mask);
return rc;
}
/**
* ata_read_log_page - read a specific log page
* @dev: target device
* @log: log to read
* @page: page to read
* @buf: buffer to store read page
* @sectors: number of sectors to read
*
* Read log page using READ_LOG_EXT command.
*
* LOCKING:
* Kernel thread context (may sleep).
*
* RETURNS:
* 0 on success, AC_ERR_* mask otherwise.
*/
unsigned int ata_read_log_page(struct ata_device *dev, u8 log,
u8 page, void *buf, unsigned int sectors)
{
unsigned long ap_flags = dev->link->ap->flags;
struct ata_taskfile tf;
unsigned int err_mask;
bool dma = false;
DPRINTK("read log page - log 0x%x, page 0x%x\n", log, page);
/*
* Return error without actually issuing the command on controllers
* which e.g. lockup on a read log page.
*/
if (ap_flags & ATA_FLAG_NO_LOG_PAGE)
return AC_ERR_DEV;
retry:
ata_tf_init(dev, &tf);
if (ata_dma_enabled(dev) && ata_id_has_read_log_dma_ext(dev->id) &&
!(dev->horkage & ATA_HORKAGE_NO_DMA_LOG)) {
tf.command = ATA_CMD_READ_LOG_DMA_EXT;
tf.protocol = ATA_PROT_DMA;
dma = true;
} else {
tf.command = ATA_CMD_READ_LOG_EXT;
tf.protocol = ATA_PROT_PIO;
dma = false;
}
tf.lbal = log;
tf.lbam = page;
tf.nsect = sectors;
tf.hob_nsect = sectors >> 8;
tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_LBA48 | ATA_TFLAG_DEVICE;
err_mask = ata_exec_internal(dev, &tf, NULL, DMA_FROM_DEVICE,
buf, sectors * ATA_SECT_SIZE, 0);
if (err_mask) {
if (dma) {
dev->horkage |= ATA_HORKAGE_NO_DMA_LOG;
goto retry;
}
ata_dev_err(dev,
"Read log 0x%02x page 0x%02x failed, Emask 0x%x\n",
(unsigned int)log, (unsigned int)page, err_mask);
}
return err_mask;
}
static bool ata_log_supported(struct ata_device *dev, u8 log)
{
struct ata_port *ap = dev->link->ap;
if (ata_read_log_page(dev, ATA_LOG_DIRECTORY, 0, ap->sector_buf, 1))
return false;
return get_unaligned_le16(&ap->sector_buf[log * 2]) ? true : false;
}
static bool ata_identify_page_supported(struct ata_device *dev, u8 page)
{
struct ata_port *ap = dev->link->ap;
unsigned int err, i;
if (!ata_log_supported(dev, ATA_LOG_IDENTIFY_DEVICE)) {
ata_dev_warn(dev, "ATA Identify Device Log not supported\n");
return false;
}
/*
* Read IDENTIFY DEVICE data log, page 0, to figure out if the page is
* supported.
*/
err = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, 0, ap->sector_buf,
1);
if (err)
return false;
for (i = 0; i < ap->sector_buf[8]; i++) {
if (ap->sector_buf[9 + i] == page)
return true;
}
return false;
}
static int ata_do_link_spd_horkage(struct ata_device *dev)
{
struct ata_link *plink = ata_dev_phys_link(dev);
u32 target, target_limit;
if (!sata_scr_valid(plink))
return 0;
if (dev->horkage & ATA_HORKAGE_1_5_GBPS)
target = 1;
else
return 0;
target_limit = (1 << target) - 1;
/* if already on stricter limit, no need to push further */
if (plink->sata_spd_limit <= target_limit)
return 0;
plink->sata_spd_limit = target_limit;
/* Request another EH round by returning -EAGAIN if link is
* going faster than the target speed. Forward progress is
* guaranteed by setting sata_spd_limit to target_limit above.
*/
if (plink->sata_spd > target) {
ata_dev_info(dev, "applying link speed limit horkage to %s\n",
sata_spd_string(target));
return -EAGAIN;
}
return 0;
}
static inline u8 ata_dev_knobble(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
if (ata_dev_blacklisted(dev) & ATA_HORKAGE_BRIDGE_OK)
return 0;
return ((ap->cbl == ATA_CBL_SATA) && (!ata_id_is_sata(dev->id)));
}
static void ata_dev_config_ncq_send_recv(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
unsigned int err_mask;
if (!ata_log_supported(dev, ATA_LOG_NCQ_SEND_RECV)) {
ata_dev_warn(dev, "NCQ Send/Recv Log not supported\n");
return;
}
err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_SEND_RECV,
0, ap->sector_buf, 1);
if (!err_mask) {
u8 *cmds = dev->ncq_send_recv_cmds;
dev->flags |= ATA_DFLAG_NCQ_SEND_RECV;
memcpy(cmds, ap->sector_buf, ATA_LOG_NCQ_SEND_RECV_SIZE);
if (dev->horkage & ATA_HORKAGE_NO_NCQ_TRIM) {
ata_dev_dbg(dev, "disabling queued TRIM support\n");
cmds[ATA_LOG_NCQ_SEND_RECV_DSM_OFFSET] &=
~ATA_LOG_NCQ_SEND_RECV_DSM_TRIM;
}
}
}
static void ata_dev_config_ncq_non_data(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
unsigned int err_mask;
if (!ata_log_supported(dev, ATA_LOG_NCQ_NON_DATA)) {
ata_dev_warn(dev,
"NCQ Send/Recv Log not supported\n");
return;
}
err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_NON_DATA,
0, ap->sector_buf, 1);
if (!err_mask) {
u8 *cmds = dev->ncq_non_data_cmds;
memcpy(cmds, ap->sector_buf, ATA_LOG_NCQ_NON_DATA_SIZE);
}
}
static void ata_dev_config_ncq_prio(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
unsigned int err_mask;
if (!ata_identify_page_supported(dev, ATA_LOG_SATA_SETTINGS))
return;
err_mask = ata_read_log_page(dev,
ATA_LOG_IDENTIFY_DEVICE,
ATA_LOG_SATA_SETTINGS,
ap->sector_buf,
1);
if (err_mask)
goto not_supported;
if (!(ap->sector_buf[ATA_LOG_NCQ_PRIO_OFFSET] & BIT(3)))
goto not_supported;
dev->flags |= ATA_DFLAG_NCQ_PRIO;
return;
not_supported:
dev->flags &= ~ATA_DFLAG_NCQ_PRIO_ENABLE;
dev->flags &= ~ATA_DFLAG_NCQ_PRIO;
}
static bool ata_dev_check_adapter(struct ata_device *dev,
unsigned short vendor_id)
{
struct pci_dev *pcidev = NULL;
struct device *parent_dev = NULL;
for (parent_dev = dev->tdev.parent; parent_dev != NULL;
parent_dev = parent_dev->parent) {
if (dev_is_pci(parent_dev)) {
pcidev = to_pci_dev(parent_dev);
if (pcidev->vendor == vendor_id)
return true;
break;
}
}
return false;
}
static int ata_dev_config_ncq(struct ata_device *dev,
char *desc, size_t desc_sz)
{
struct ata_port *ap = dev->link->ap;
int hdepth = 0, ddepth = ata_id_queue_depth(dev->id);
unsigned int err_mask;
char *aa_desc = "";
if (!ata_id_has_ncq(dev->id)) {
desc[0] = '\0';
return 0;
}
if (!IS_ENABLED(CONFIG_SATA_HOST))
return 0;
if (dev->horkage & ATA_HORKAGE_NONCQ) {
snprintf(desc, desc_sz, "NCQ (not used)");
return 0;
}
if (dev->horkage & ATA_HORKAGE_NO_NCQ_ON_ATI &&
ata_dev_check_adapter(dev, PCI_VENDOR_ID_ATI)) {
snprintf(desc, desc_sz, "NCQ (not used)");
return 0;
}
if (ap->flags & ATA_FLAG_NCQ) {
hdepth = min(ap->scsi_host->can_queue, ATA_MAX_QUEUE);
dev->flags |= ATA_DFLAG_NCQ;
}
if (!(dev->horkage & ATA_HORKAGE_BROKEN_FPDMA_AA) &&
(ap->flags & ATA_FLAG_FPDMA_AA) &&
ata_id_has_fpdma_aa(dev->id)) {
err_mask = ata_dev_set_feature(dev, SETFEATURES_SATA_ENABLE,
SATA_FPDMA_AA);
if (err_mask) {
ata_dev_err(dev,
"failed to enable AA (error_mask=0x%x)\n",
err_mask);
if (err_mask != AC_ERR_DEV) {
dev->horkage |= ATA_HORKAGE_BROKEN_FPDMA_AA;
return -EIO;
}
} else
aa_desc = ", AA";
}
if (hdepth >= ddepth)
snprintf(desc, desc_sz, "NCQ (depth %d)%s", ddepth, aa_desc);
else
snprintf(desc, desc_sz, "NCQ (depth %d/%d)%s", hdepth,
ddepth, aa_desc);
if ((ap->flags & ATA_FLAG_FPDMA_AUX)) {
if (ata_id_has_ncq_send_and_recv(dev->id))
ata_dev_config_ncq_send_recv(dev);
if (ata_id_has_ncq_non_data(dev->id))
ata_dev_config_ncq_non_data(dev);
if (ata_id_has_ncq_prio(dev->id))
ata_dev_config_ncq_prio(dev);
}
return 0;
}
static void ata_dev_config_sense_reporting(struct ata_device *dev)
{
unsigned int err_mask;
if (!ata_id_has_sense_reporting(dev->id))
return;
if (ata_id_sense_reporting_enabled(dev->id))
return;
err_mask = ata_dev_set_feature(dev, SETFEATURE_SENSE_DATA, 0x1);
if (err_mask) {
ata_dev_dbg(dev,
"failed to enable Sense Data Reporting, Emask 0x%x\n",
err_mask);
}
}
static void ata_dev_config_zac(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
unsigned int err_mask;
u8 *identify_buf = ap->sector_buf;
dev->zac_zones_optimal_open = U32_MAX;
dev->zac_zones_optimal_nonseq = U32_MAX;
dev->zac_zones_max_open = U32_MAX;
/*
* Always set the 'ZAC' flag for Host-managed devices.
*/
if (dev->class == ATA_DEV_ZAC)
dev->flags |= ATA_DFLAG_ZAC;
else if (ata_id_zoned_cap(dev->id) == 0x01)
/*
* Check for host-aware devices.
*/
dev->flags |= ATA_DFLAG_ZAC;
if (!(dev->flags & ATA_DFLAG_ZAC))
return;
if (!ata_identify_page_supported(dev, ATA_LOG_ZONED_INFORMATION)) {
ata_dev_warn(dev,
"ATA Zoned Information Log not supported\n");
return;
}
/*
* Read IDENTIFY DEVICE data log, page 9 (Zoned-device information)
*/
err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE,
ATA_LOG_ZONED_INFORMATION,
identify_buf, 1);
if (!err_mask) {
u64 zoned_cap, opt_open, opt_nonseq, max_open;
zoned_cap = get_unaligned_le64(&identify_buf[8]);
if ((zoned_cap >> 63))
dev->zac_zoned_cap = (zoned_cap & 1);
opt_open = get_unaligned_le64(&identify_buf[24]);
if ((opt_open >> 63))
dev->zac_zones_optimal_open = (u32)opt_open;
opt_nonseq = get_unaligned_le64(&identify_buf[32]);
if ((opt_nonseq >> 63))
dev->zac_zones_optimal_nonseq = (u32)opt_nonseq;
max_open = get_unaligned_le64(&identify_buf[40]);
if ((max_open >> 63))
dev->zac_zones_max_open = (u32)max_open;
}
}
static void ata_dev_config_trusted(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
u64 trusted_cap;
unsigned int err;
if (!ata_id_has_trusted(dev->id))
return;
if (!ata_identify_page_supported(dev, ATA_LOG_SECURITY)) {
ata_dev_warn(dev,
"Security Log not supported\n");
return;
}
err = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_SECURITY,
ap->sector_buf, 1);
if (err)
return;
trusted_cap = get_unaligned_le64(&ap->sector_buf[40]);
if (!(trusted_cap & (1ULL << 63))) {
ata_dev_dbg(dev,
"Trusted Computing capability qword not valid!\n");
return;
}
if (trusted_cap & (1 << 0))
dev->flags |= ATA_DFLAG_TRUSTED;
}
static int ata_dev_config_lba(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
const u16 *id = dev->id;
const char *lba_desc;
char ncq_desc[24];
int ret;
dev->flags |= ATA_DFLAG_LBA;
if (ata_id_has_lba48(id)) {
lba_desc = "LBA48";
dev->flags |= ATA_DFLAG_LBA48;
if (dev->n_sectors >= (1UL << 28) &&
ata_id_has_flush_ext(id))
dev->flags |= ATA_DFLAG_FLUSH_EXT;
} else {
lba_desc = "LBA";
}
/* config NCQ */
ret = ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc));
/* print device info to dmesg */
if (ata_msg_drv(ap) && ata_dev_print_info(dev))
ata_dev_info(dev,
"%llu sectors, multi %u: %s %s\n",
(unsigned long long)dev->n_sectors,
dev->multi_count, lba_desc, ncq_desc);
return ret;
}
static void ata_dev_config_chs(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
const u16 *id = dev->id;
if (ata_id_current_chs_valid(id)) {
/* Current CHS translation is valid. */
dev->cylinders = id[54];
dev->heads = id[55];
dev->sectors = id[56];
} else {
/* Default translation */
dev->cylinders = id[1];
dev->heads = id[3];
dev->sectors = id[6];
}
/* print device info to dmesg */
if (ata_msg_drv(ap) && ata_dev_print_info(dev))
ata_dev_info(dev,
"%llu sectors, multi %u, CHS %u/%u/%u\n",
(unsigned long long)dev->n_sectors,
dev->multi_count, dev->cylinders,
dev->heads, dev->sectors);
}
static void ata_dev_config_devslp(struct ata_device *dev)
{
u8 *sata_setting = dev->link->ap->sector_buf;
unsigned int err_mask;
int i, j;
/*
* Check device sleep capability. Get DevSlp timing variables
* from SATA Settings page of Identify Device Data Log.
*/
if (!ata_id_has_devslp(dev->id) ||
!ata_identify_page_supported(dev, ATA_LOG_SATA_SETTINGS))
return;
err_mask = ata_read_log_page(dev,
ATA_LOG_IDENTIFY_DEVICE,
ATA_LOG_SATA_SETTINGS,
sata_setting, 1);
if (err_mask)
return;
dev->flags |= ATA_DFLAG_DEVSLP;
for (i = 0; i < ATA_LOG_DEVSLP_SIZE; i++) {
j = ATA_LOG_DEVSLP_OFFSET + i;
dev->devslp_timing[i] = sata_setting[j];
}
}
static void ata_dev_print_features(struct ata_device *dev)
{
if (!(dev->flags & ATA_DFLAG_FEATURES_MASK))
return;
ata_dev_info(dev,
"Features:%s%s%s%s%s\n",
dev->flags & ATA_DFLAG_TRUSTED ? " Trust" : "",
dev->flags & ATA_DFLAG_DA ? " Dev-Attention" : "",
dev->flags & ATA_DFLAG_DEVSLP ? " Dev-Sleep" : "",
dev->flags & ATA_DFLAG_NCQ_SEND_RECV ? " NCQ-sndrcv" : "",
dev->flags & ATA_DFLAG_NCQ_PRIO ? " NCQ-prio" : "");
}
/**
* ata_dev_configure - Configure the specified ATA/ATAPI device
* @dev: Target device to configure
*
* Configure @dev according to @dev->id. Generic and low-level
* driver specific fixups are also applied.
*
* LOCKING:
* Kernel thread context (may sleep)
*
* RETURNS:
* 0 on success, -errno otherwise
*/
int ata_dev_configure(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
bool print_info = ata_dev_print_info(dev);
const u16 *id = dev->id;
unsigned long xfer_mask;
unsigned int err_mask;
char revbuf[7]; /* XYZ-99\0 */
char fwrevbuf[ATA_ID_FW_REV_LEN+1];
char modelbuf[ATA_ID_PROD_LEN+1];
int rc;
if (!ata_dev_enabled(dev) && ata_msg_info(ap)) {
ata_dev_info(dev, "%s: ENTER/EXIT -- nodev\n", __func__);
return 0;
}
if (ata_msg_probe(ap))
ata_dev_dbg(dev, "%s: ENTER\n", __func__);
/* set horkage */
dev->horkage |= ata_dev_blacklisted(dev);
ata_force_horkage(dev);
if (dev->horkage & ATA_HORKAGE_DISABLE) {
ata_dev_info(dev, "unsupported device, disabling\n");
ata_dev_disable(dev);
return 0;
}
if ((!atapi_enabled || (ap->flags & ATA_FLAG_NO_ATAPI)) &&
dev->class == ATA_DEV_ATAPI) {
ata_dev_warn(dev, "WARNING: ATAPI is %s, device ignored\n",
atapi_enabled ? "not supported with this driver"
: "disabled");
ata_dev_disable(dev);
return 0;
}
rc = ata_do_link_spd_horkage(dev);
if (rc)
return rc;
/* some WD SATA-1 drives have issues with LPM, turn on NOLPM for them */
if ((dev->horkage & ATA_HORKAGE_WD_BROKEN_LPM) &&
(id[ATA_ID_SATA_CAPABILITY] & 0xe) == 0x2)
dev->horkage |= ATA_HORKAGE_NOLPM;
if (ap->flags & ATA_FLAG_NO_LPM)
dev->horkage |= ATA_HORKAGE_NOLPM;
if (dev->horkage & ATA_HORKAGE_NOLPM) {
ata_dev_warn(dev, "LPM support broken, forcing max_power\n");
dev->link->ap->target_lpm_policy = ATA_LPM_MAX_POWER;
}
/* let ACPI work its magic */
rc = ata_acpi_on_devcfg(dev);
if (rc)
return rc;
/* massage HPA, do it early as it might change IDENTIFY data */
rc = ata_hpa_resize(dev);
if (rc)
return rc;
/* print device capabilities */
if (ata_msg_probe(ap))
ata_dev_dbg(dev,
"%s: cfg 49:%04x 82:%04x 83:%04x 84:%04x "
"85:%04x 86:%04x 87:%04x 88:%04x\n",
__func__,
id[49], id[82], id[83], id[84],
id[85], id[86], id[87], id[88]);
/* initialize to-be-configured parameters */
dev->flags &= ~ATA_DFLAG_CFG_MASK;
dev->max_sectors = 0;
dev->cdb_len = 0;
dev->n_sectors = 0;
dev->cylinders = 0;
dev->heads = 0;
dev->sectors = 0;
dev->multi_count = 0;
/*
* common ATA, ATAPI feature tests
*/
/* find max transfer mode; for printk only */
xfer_mask = ata_id_xfermask(id);
if (ata_msg_probe(ap))
ata_dump_id(id);
/* SCSI only uses 4-char revisions, dump full 8 chars from ATA */
ata_id_c_string(dev->id, fwrevbuf, ATA_ID_FW_REV,
sizeof(fwrevbuf));
ata_id_c_string(dev->id, modelbuf, ATA_ID_PROD,
sizeof(modelbuf));
/* ATA-specific feature tests */
if (dev->class == ATA_DEV_ATA || dev->class == ATA_DEV_ZAC) {
if (ata_id_is_cfa(id)) {
/* CPRM may make this media unusable */
if (id[ATA_ID_CFA_KEY_MGMT] & 1)
ata_dev_warn(dev,
"supports DRM functions and may not be fully accessible\n");
snprintf(revbuf, 7, "CFA");
} else {
snprintf(revbuf, 7, "ATA-%d", ata_id_major_version(id));
/* Warn the user if the device has TPM extensions */
if (ata_id_has_tpm(id))
ata_dev_warn(dev,
"supports DRM functions and may not be fully accessible\n");
}
dev->n_sectors = ata_id_n_sectors(id);
/* get current R/W Multiple count setting */
if ((dev->id[47] >> 8) == 0x80 && (dev->id[59] & 0x100)) {
unsigned int max = dev->id[47] & 0xff;
unsigned int cnt = dev->id[59] & 0xff;
/* only recognize/allow powers of two here */
if (is_power_of_2(max) && is_power_of_2(cnt))
if (cnt <= max)
dev->multi_count = cnt;
}
/* print device info to dmesg */
if (ata_msg_drv(ap) && print_info)
ata_dev_info(dev, "%s: %s, %s, max %s\n",
revbuf, modelbuf, fwrevbuf,
ata_mode_string(xfer_mask));
if (ata_id_has_lba(id)) {
rc = ata_dev_config_lba(dev);
if (rc)
return rc;
} else {
ata_dev_config_chs(dev);
}
ata_dev_config_devslp(dev);
ata_dev_config_sense_reporting(dev);
ata_dev_config_zac(dev);
ata_dev_config_trusted(dev);
dev->cdb_len = 32;
if (ata_msg_drv(ap) && print_info)
ata_dev_print_features(dev);
}
/* ATAPI-specific feature tests */
else if (dev->class == ATA_DEV_ATAPI) {
const char *cdb_intr_string = "";
const char *atapi_an_string = "";
const char *dma_dir_string = "";
u32 sntf;
rc = atapi_cdb_len(id);
if ((rc < 12) || (rc > ATAPI_CDB_LEN)) {
if (ata_msg_warn(ap))
ata_dev_warn(dev, "unsupported CDB len\n");
rc = -EINVAL;
goto err_out_nosup;
}
dev->cdb_len = (unsigned int) rc;
/* Enable ATAPI AN if both the host and device have
* the support. If PMP is attached, SNTF is required
* to enable ATAPI AN to discern between PHY status
* changed notifications and ATAPI ANs.
*/
if (atapi_an &&
(ap->flags & ATA_FLAG_AN) && ata_id_has_atapi_AN(id) &&
(!sata_pmp_attached(ap) ||
sata_scr_read(&ap->link, SCR_NOTIFICATION, &sntf) == 0)) {
/* issue SET feature command to turn this on */
err_mask = ata_dev_set_feature(dev,
SETFEATURES_SATA_ENABLE, SATA_AN);
if (err_mask)
ata_dev_err(dev,
"failed to enable ATAPI AN (err_mask=0x%x)\n",
err_mask);
else {
dev->flags |= ATA_DFLAG_AN;
atapi_an_string = ", ATAPI AN";
}
}
if (ata_id_cdb_intr(dev->id)) {
dev->flags |= ATA_DFLAG_CDB_INTR;
cdb_intr_string = ", CDB intr";
}
if (atapi_dmadir || (dev->horkage & ATA_HORKAGE_ATAPI_DMADIR) || atapi_id_dmadir(dev->id)) {
dev->flags |= ATA_DFLAG_DMADIR;
dma_dir_string = ", DMADIR";
}
if (ata_id_has_da(dev->id)) {
dev->flags |= ATA_DFLAG_DA;
zpodd_init(dev);
}
/* print device info to dmesg */
if (ata_msg_drv(ap) && print_info)
ata_dev_info(dev,
"ATAPI: %s, %s, max %s%s%s%s\n",
modelbuf, fwrevbuf,
ata_mode_string(xfer_mask),
cdb_intr_string, atapi_an_string,
dma_dir_string);
}
/* determine max_sectors */
dev->max_sectors = ATA_MAX_SECTORS;
if (dev->flags & ATA_DFLAG_LBA48)
dev->max_sectors = ATA_MAX_SECTORS_LBA48;
/* Limit PATA drive on SATA cable bridge transfers to udma5,
200 sectors */
if (ata_dev_knobble(dev)) {
if (ata_msg_drv(ap) && print_info)
ata_dev_info(dev, "applying bridge limits\n");
dev->udma_mask &= ATA_UDMA5;
dev->max_sectors = ATA_MAX_SECTORS;
}
if ((dev->class == ATA_DEV_ATAPI) &&
(atapi_command_packet_set(id) == TYPE_TAPE)) {
dev->max_sectors = ATA_MAX_SECTORS_TAPE;
dev->horkage |= ATA_HORKAGE_STUCK_ERR;
}
if (dev->horkage & ATA_HORKAGE_MAX_SEC_128)
dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_128,
dev->max_sectors);
if (dev->horkage & ATA_HORKAGE_MAX_SEC_1024)
dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_1024,
dev->max_sectors);
if (dev->horkage & ATA_HORKAGE_MAX_SEC_LBA48)
dev->max_sectors = ATA_MAX_SECTORS_LBA48;
if (ap->ops->dev_config)
ap->ops->dev_config(dev);
if (dev->horkage & ATA_HORKAGE_DIAGNOSTIC) {
/* Let the user know. We don't want to disallow opens for
rescue purposes, or in case the vendor is just a blithering
idiot. Do this after the dev_config call as some controllers
with buggy firmware may want to avoid reporting false device
bugs */
if (print_info) {
ata_dev_warn(dev,
"Drive reports diagnostics failure. This may indicate a drive\n");
ata_dev_warn(dev,
"fault or invalid emulation. Contact drive vendor for information.\n");
}
}
if ((dev->horkage & ATA_HORKAGE_FIRMWARE_WARN) && print_info) {
ata_dev_warn(dev, "WARNING: device requires firmware update to be fully functional\n");
ata_dev_warn(dev, " contact the vendor or visit http://ata.wiki.kernel.org\n");
}
return 0;
err_out_nosup:
if (ata_msg_probe(ap))
ata_dev_dbg(dev, "%s: EXIT, err\n", __func__);
return rc;
}
/**
* ata_cable_40wire - return 40 wire cable type
* @ap: port
*
* Helper method for drivers which want to hardwire 40 wire cable
* detection.
*/
int ata_cable_40wire(struct ata_port *ap)
{
return ATA_CBL_PATA40;
}
EXPORT_SYMBOL_GPL(ata_cable_40wire);
/**
* ata_cable_80wire - return 80 wire cable type
* @ap: port
*
* Helper method for drivers which want to hardwire 80 wire cable
* detection.
*/
int ata_cable_80wire(struct ata_port *ap)
{
return ATA_CBL_PATA80;
}
EXPORT_SYMBOL_GPL(ata_cable_80wire);
/**
* ata_cable_unknown - return unknown PATA cable.
* @ap: port
*
* Helper method for drivers which have no PATA cable detection.
*/
int ata_cable_unknown(struct ata_port *ap)
{
return ATA_CBL_PATA_UNK;
}
EXPORT_SYMBOL_GPL(ata_cable_unknown);
/**
* ata_cable_ignore - return ignored PATA cable.
* @ap: port
*
* Helper method for drivers which don't use cable type to limit
* transfer mode.
*/
int ata_cable_ignore(struct ata_port *ap)
{
return ATA_CBL_PATA_IGN;
}
EXPORT_SYMBOL_GPL(ata_cable_ignore);
/**
* ata_cable_sata - return SATA cable type
* @ap: port
*
* Helper method for drivers which have SATA cables
*/
int ata_cable_sata(struct ata_port *ap)
{
return ATA_CBL_SATA;
}
EXPORT_SYMBOL_GPL(ata_cable_sata);
/**
* ata_bus_probe - Reset and probe ATA bus
* @ap: Bus to probe
*
* Master ATA bus probing function. Initiates a hardware-dependent
* bus reset, then attempts to identify any devices found on
* the bus.
*
* LOCKING:
* PCI/etc. bus probe sem.
*
* RETURNS:
* Zero on success, negative errno otherwise.
*/
int ata_bus_probe(struct ata_port *ap)
{
unsigned int classes[ATA_MAX_DEVICES];
int tries[ATA_MAX_DEVICES];
int rc;
struct ata_device *dev;
ata_for_each_dev(dev, &ap->link, ALL)
tries[dev->devno] = ATA_PROBE_MAX_TRIES;
retry:
ata_for_each_dev(dev, &ap->link, ALL) {
/* If we issue an SRST then an ATA drive (not ATAPI)
* may change configuration and be in PIO0 timing. If
* we do a hard reset (or are coming from power on)
* this is true for ATA or ATAPI. Until we've set a
* suitable controller mode we should not touch the
* bus as we may be talking too fast.
*/
dev->pio_mode = XFER_PIO_0;
dev->dma_mode = 0xff;
/* If the controller has a pio mode setup function
* then use it to set the chipset to rights. Don't
* touch the DMA setup as that will be dealt with when
* configuring devices.
*/
if (ap->ops->set_piomode)
ap->ops->set_piomode(ap, dev);
}
/* reset and determine device classes */
ap->ops->phy_reset(ap);
ata_for_each_dev(dev, &ap->link, ALL) {
if (dev->class != ATA_DEV_UNKNOWN)
classes[dev->devno] = dev->class;
else
classes[dev->devno] = ATA_DEV_NONE;
dev->class = ATA_DEV_UNKNOWN;
}
/* read IDENTIFY page and configure devices. We have to do the identify
specific sequence bass-ackwards so that PDIAG- is released by
the slave device */
ata_for_each_dev(dev, &ap->link, ALL_REVERSE) {
if (tries[dev->devno])
dev->class = classes[dev->devno];
if (!ata_dev_enabled(dev))
continue;
rc = ata_dev_read_id(dev, &dev->class, ATA_READID_POSTRESET,
dev->id);
if (rc)
goto fail;
}
/* Now ask for the cable type as PDIAG- should have been released */
if (ap->ops->cable_detect)
ap->cbl = ap->ops->cable_detect(ap);
/* We may have SATA bridge glue hiding here irrespective of
* the reported cable types and sensed types. When SATA
* drives indicate we have a bridge, we don't know which end
* of the link the bridge is which is a problem.
*/
ata_for_each_dev(dev, &ap->link, ENABLED)
if (ata_id_is_sata(dev->id))
ap->cbl = ATA_CBL_SATA;
/* After the identify sequence we can now set up the devices. We do
this in the normal order so that the user doesn't get confused */
ata_for_each_dev(dev, &ap->link, ENABLED) {
ap->link.eh_context.i.flags |= ATA_EHI_PRINTINFO;
rc = ata_dev_configure(dev);
ap->link.eh_context.i.flags &= ~ATA_EHI_PRINTINFO;
if (rc)
goto fail;
}
/* configure transfer mode */
rc = ata_set_mode(&ap->link, &dev);
if (rc)
goto fail;
ata_for_each_dev(dev, &ap->link, ENABLED)
return 0;
return -ENODEV;
fail:
tries[dev->devno]--;
switch (rc) {
case -EINVAL:
/* eeek, something went very wrong, give up */
tries[dev->devno] = 0;
break;
case -ENODEV:
/* give it just one more chance */
tries[dev->devno] = min(tries[dev->devno], 1);
fallthrough;
case -EIO:
if (tries[dev->devno] == 1) {
/* This is the last chance, better to slow
* down than lose it.
*/
sata_down_spd_limit(&ap->link, 0);
ata_down_xfermask_limit(dev, ATA_DNXFER_PIO);
}
}
if (!tries[dev->devno])
ata_dev_disable(dev);
goto retry;
}
/**
* sata_print_link_status - Print SATA link status
* @link: SATA link to printk link status about
*
* This function prints link speed and status of a SATA link.
*
* LOCKING:
* None.
*/
static void sata_print_link_status(struct ata_link *link)
{
u32 sstatus, scontrol, tmp;
if (sata_scr_read(link, SCR_STATUS, &sstatus))
return;
sata_scr_read(link, SCR_CONTROL, &scontrol);
if (ata_phys_link_online(link)) {
tmp = (sstatus >> 4) & 0xf;
ata_link_info(link, "SATA link up %s (SStatus %X SControl %X)\n",
sata_spd_string(tmp), sstatus, scontrol);
} else {
ata_link_info(link, "SATA link down (SStatus %X SControl %X)\n",
sstatus, scontrol);
}
}
/**
* ata_dev_pair - return other device on cable
* @adev: device
*
* Obtain the other device on the same cable, or if none is
* present NULL is returned
*/
struct ata_device *ata_dev_pair(struct ata_device *adev)
{
struct ata_link *link = adev->link;
struct ata_device *pair = &link->device[1 - adev->devno];
if (!ata_dev_enabled(pair))
return NULL;
return pair;
}
EXPORT_SYMBOL_GPL(ata_dev_pair);
/**
* sata_down_spd_limit - adjust SATA spd limit downward
* @link: Link to adjust SATA spd limit for
* @spd_limit: Additional limit
*
* Adjust SATA spd limit of @link downward. Note that this
* function only adjusts the limit. The change must be applied
* using sata_set_spd().
*
* If @spd_limit is non-zero, the speed is limited to equal to or
* lower than @spd_limit if such speed is supported. If
* @spd_limit is slower than any supported speed, only the lowest
* supported speed is allowed.
*
* LOCKING:
* Inherited from caller.
*
* RETURNS:
* 0 on success, negative errno on failure
*/
int sata_down_spd_limit(struct ata_link *link, u32 spd_limit)
{
u32 sstatus, spd, mask;
int rc, bit;
if (!sata_scr_valid(link))
return -EOPNOTSUPP;
/* If SCR can be read, use it to determine the current SPD.
* If not, use cached value in link->sata_spd.
*/
rc = sata_scr_read(link, SCR_STATUS, &sstatus);
if (rc == 0 && ata_sstatus_online(sstatus))
spd = (sstatus >> 4) & 0xf;
else
spd = link->sata_spd;
mask = link->sata_spd_limit;
if (mask <= 1)
return -EINVAL;
/* unconditionally mask off the highest bit */
bit = fls(mask) - 1;
mask &= ~(1 << bit);
/*
* Mask off all speeds higher than or equal to the current one. At
* this point, if current SPD is not available and we previously
* recorded the link speed from SStatus, the driver has already
* masked off the highest bit so mask should already be 1 or 0.
* Otherwise, we should not force 1.5Gbps on a link where we have
* not previously recorded speed from SStatus. Just return in this
* case.
*/
if (spd > 1)
mask &= (1 << (spd - 1)) - 1;
else
return -EINVAL;
/* were we already at the bottom? */
if (!mask)
return -EINVAL;
if (spd_limit) {
if (mask & ((1 << spd_limit) - 1))
mask &= (1 << spd_limit) - 1;
else {
bit = ffs(mask) - 1;
mask = 1 << bit;
}
}
link->sata_spd_limit = mask;
ata_link_warn(link, "limiting SATA link speed to %s\n",
sata_spd_string(fls(mask)));
return 0;
}
#ifdef CONFIG_ATA_ACPI
/**
* ata_timing_cycle2mode - find xfer mode for the specified cycle duration
* @xfer_shift: ATA_SHIFT_* value for transfer type to examine.
* @cycle: cycle duration in ns
*
* Return matching xfer mode for @cycle. The returned mode is of
* the transfer type specified by @xfer_shift. If @cycle is too
* slow for @xfer_shift, 0xff is returned. If @cycle is faster
* than the fastest known mode, the fasted mode is returned.
*
* LOCKING:
* None.
*
* RETURNS:
* Matching xfer_mode, 0xff if no match found.
*/
u8 ata_timing_cycle2mode(unsigned int xfer_shift, int cycle)
{
u8 base_mode = 0xff, last_mode = 0xff;
const struct ata_xfer_ent *ent;
const struct ata_timing *t;
for (ent = ata_xfer_tbl; ent->shift >= 0; ent++)
if (ent->shift == xfer_shift)
base_mode = ent->base;
for (t = ata_timing_find_mode(base_mode);
t && ata_xfer_mode2shift(t->mode) == xfer_shift; t++) {
unsigned short this_cycle;
switch (xfer_shift) {
case ATA_SHIFT_PIO:
case ATA_SHIFT_MWDMA:
this_cycle = t->cycle;
break;
case ATA_SHIFT_UDMA:
this_cycle = t->udma;
break;
default:
return 0xff;
}
if (cycle > this_cycle)
break;
last_mode = t->mode;
}
return last_mode;
}
#endif
/**
* ata_down_xfermask_limit - adjust dev xfer masks downward
* @dev: Device to adjust xfer masks
* @sel: ATA_DNXFER_* selector
*
* Adjust xfer masks of @dev downward. Note that this function
* does not apply the change. Invoking ata_set_mode() afterwards
* will apply the limit.
*
* LOCKING:
* Inherited from caller.
*
* RETURNS:
* 0 on success, negative errno on failure
*/
int ata_down_xfermask_limit(struct ata_device *dev, unsigned int sel)
{
char buf[32];
unsigned long orig_mask, xfer_mask;
unsigned long pio_mask, mwdma_mask, udma_mask;
int quiet, highbit;
quiet = !!(sel & ATA_DNXFER_QUIET);
sel &= ~ATA_DNXFER_QUIET;
xfer_mask = orig_mask = ata_pack_xfermask(dev->pio_mask,
dev->mwdma_mask,
dev->udma_mask);
ata_unpack_xfermask(xfer_mask, &pio_mask, &mwdma_mask, &udma_mask);
switch (sel) {
case ATA_DNXFER_PIO:
highbit = fls(pio_mask) - 1;
pio_mask &= ~(1 << highbit);
break;
case ATA_DNXFER_DMA:
if (udma_mask) {
highbit = fls(udma_mask) - 1;
udma_mask &= ~(1 << highbit);
if (!udma_mask)
return -ENOENT;
} else if (mwdma_mask) {
highbit = fls(mwdma_mask) - 1;
mwdma_mask &= ~(1 << highbit);
if (!mwdma_mask)
return -ENOENT;
}
break;
case ATA_DNXFER_40C:
udma_mask &= ATA_UDMA_MASK_40C;
break;
case ATA_DNXFER_FORCE_PIO0:
pio_mask &= 1;
fallthrough;
case ATA_DNXFER_FORCE_PIO:
mwdma_mask = 0;
udma_mask = 0;
break;
default:
BUG();
}
xfer_mask &= ata_pack_xfermask(pio_mask, mwdma_mask, udma_mask);
if (!(xfer_mask & ATA_MASK_PIO) || xfer_mask == orig_mask)
return -ENOENT;
if (!quiet) {
if (xfer_mask & (ATA_MASK_MWDMA | ATA_MASK_UDMA))
snprintf(buf, sizeof(buf), "%s:%s",
ata_mode_string(xfer_mask),
ata_mode_string(xfer_mask & ATA_MASK_PIO));
else
snprintf(buf, sizeof(buf), "%s",
ata_mode_string(xfer_mask));
ata_dev_warn(dev, "limiting speed to %s\n", buf);
}
ata_unpack_xfermask(xfer_mask, &dev->pio_mask, &dev->mwdma_mask,
&dev->udma_mask);
return 0;
}
static int ata_dev_set_mode(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
struct ata_eh_context *ehc = &dev->link->eh_context;
const bool nosetxfer = dev->horkage & ATA_HORKAGE_NOSETXFER;
const char *dev_err_whine = "";
int ign_dev_err = 0;
unsigned int err_mask = 0;
int rc;
dev->flags &= ~ATA_DFLAG_PIO;
if (dev->xfer_shift == ATA_SHIFT_PIO)
dev->flags |= ATA_DFLAG_PIO;
if (nosetxfer && ap->flags & ATA_FLAG_SATA && ata_id_is_sata(dev->id))
dev_err_whine = " (SET_XFERMODE skipped)";
else {
if (nosetxfer)
ata_dev_warn(dev,
"NOSETXFER but PATA detected - can't "
"skip SETXFER, might malfunction\n");
err_mask = ata_dev_set_xfermode(dev);
}
if (err_mask & ~AC_ERR_DEV)
goto fail;
/* revalidate */
ehc->i.flags |= ATA_EHI_POST_SETMODE;
rc = ata_dev_revalidate(dev, ATA_DEV_UNKNOWN, 0);
ehc->i.flags &= ~ATA_EHI_POST_SETMODE;
if (rc)
return rc;
if (dev->xfer_shift == ATA_SHIFT_PIO) {
/* Old CFA may refuse this command, which is just fine */
if (ata_id_is_cfa(dev->id))
ign_dev_err = 1;
/* Catch several broken garbage emulations plus some pre
ATA devices */
if (ata_id_major_version(dev->id) == 0 &&
dev->pio_mode <= XFER_PIO_2)
ign_dev_err = 1;
/* Some very old devices and some bad newer ones fail
any kind of SET_XFERMODE request but support PIO0-2
timings and no IORDY */
if (!ata_id_has_iordy(dev->id) && dev->pio_mode <= XFER_PIO_2)
ign_dev_err = 1;
}
/* Early MWDMA devices do DMA but don't allow DMA mode setting.
Don't fail an MWDMA0 set IFF the device indicates it is in MWDMA0 */
if (dev->xfer_shift == ATA_SHIFT_MWDMA &&
dev->dma_mode == XFER_MW_DMA_0 &&
(dev->id[63] >> 8) & 1)
ign_dev_err = 1;
/* if the device is actually configured correctly, ignore dev err */
if (dev->xfer_mode == ata_xfer_mask2mode(ata_id_xfermask(dev->id)))
ign_dev_err = 1;
if (err_mask & AC_ERR_DEV) {
if (!ign_dev_err)
goto fail;
else
dev_err_whine = " (device error ignored)";
}
DPRINTK("xfer_shift=%u, xfer_mode=0x%x\n",
dev->xfer_shift, (int)dev->xfer_mode);
if (!(ehc->i.flags & ATA_EHI_QUIET) ||
ehc->i.flags & ATA_EHI_DID_HARDRESET)
ata_dev_info(dev, "configured for %s%s\n",
ata_mode_string(ata_xfer_mode2mask(dev->xfer_mode)),
dev_err_whine);
return 0;
fail:
ata_dev_err(dev, "failed to set xfermode (err_mask=0x%x)\n", err_mask);
return -EIO;
}
/**
* ata_do_set_mode - Program timings and issue SET FEATURES - XFER
* @link: link on which timings will be programmed
* @r_failed_dev: out parameter for failed device
*
* Standard implementation of the function used to tune and set
* ATA device disk transfer mode (PIO3, UDMA6, etc.). If
* ata_dev_set_mode() fails, pointer to the failing device is
* returned in @r_failed_dev.
*
* LOCKING:
* PCI/etc. bus probe sem.
*
* RETURNS:
* 0 on success, negative errno otherwise
*/
int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev)
{
struct ata_port *ap = link->ap;
struct ata_device *dev;
int rc = 0, used_dma = 0, found = 0;
/* step 1: calculate xfer_mask */
ata_for_each_dev(dev, link, ENABLED) {
unsigned long pio_mask, dma_mask;
unsigned int mode_mask;
mode_mask = ATA_DMA_MASK_ATA;
if (dev->class == ATA_DEV_ATAPI)
mode_mask = ATA_DMA_MASK_ATAPI;
else if (ata_id_is_cfa(dev->id))
mode_mask = ATA_DMA_MASK_CFA;
ata_dev_xfermask(dev);
ata_force_xfermask(dev);
pio_mask = ata_pack_xfermask(dev->pio_mask, 0, 0);
if (libata_dma_mask & mode_mask)
dma_mask = ata_pack_xfermask(0, dev->mwdma_mask,
dev->udma_mask);
else
dma_mask = 0;
dev->pio_mode = ata_xfer_mask2mode(pio_mask);
dev->dma_mode = ata_xfer_mask2mode(dma_mask);
found = 1;
if (ata_dma_enabled(dev))
used_dma = 1;
}
if (!found)
goto out;
/* step 2: always set host PIO timings */
ata_for_each_dev(dev, link, ENABLED) {
if (dev->pio_mode == 0xff) {
ata_dev_warn(dev, "no PIO support\n");
rc = -EINVAL;
goto out;
}
dev->xfer_mode = dev->pio_mode;
dev->xfer_shift = ATA_SHIFT_PIO;
if (ap->ops->set_piomode)
ap->ops->set_piomode(ap, dev);
}
/* step 3: set host DMA timings */
ata_for_each_dev(dev, link, ENABLED) {
if (!ata_dma_enabled(dev))
continue;
dev->xfer_mode = dev->dma_mode;
dev->xfer_shift = ata_xfer_mode2shift(dev->dma_mode);
if (ap->ops->set_dmamode)
ap->ops->set_dmamode(ap, dev);
}
/* step 4: update devices' xfer mode */
ata_for_each_dev(dev, link, ENABLED) {
rc = ata_dev_set_mode(dev);
if (rc)
goto out;
}
/* Record simplex status. If we selected DMA then the other
* host channels are not permitted to do so.
*/
if (used_dma && (ap->host->flags & ATA_HOST_SIMPLEX))
ap->host->simplex_claimed = ap;
out:
if (rc)
*r_failed_dev = dev;
return rc;
}
EXPORT_SYMBOL_GPL(ata_do_set_mode);
/**
* ata_wait_ready - wait for link to become ready
* @link: link to be waited on
* @deadline: deadline jiffies for the operation
* @check_ready: callback to check link readiness
*
* Wait for @link to become ready. @check_ready should return
* positive number if @link is ready, 0 if it isn't, -ENODEV if
* link doesn't seem to be occupied, other errno for other error
* conditions.
*
* Transient -ENODEV conditions are allowed for
* ATA_TMOUT_FF_WAIT.
*
* LOCKING:
* EH context.
*
* RETURNS:
* 0 if @link is ready before @deadline; otherwise, -errno.
*/
int ata_wait_ready(struct ata_link *link, unsigned long deadline,
int (*check_ready)(struct ata_link *link))
{
unsigned long start = jiffies;
unsigned long nodev_deadline;
int warned = 0;
/* choose which 0xff timeout to use, read comment in libata.h */
if (link->ap->host->flags & ATA_HOST_PARALLEL_SCAN)
nodev_deadline = ata_deadline(start, ATA_TMOUT_FF_WAIT_LONG);
else
nodev_deadline = ata_deadline(start, ATA_TMOUT_FF_WAIT);
/* Slave readiness can't be tested separately from master. On
* M/S emulation configuration, this function should be called
* only on the master and it will handle both master and slave.
*/
WARN_ON(link == link->ap->slave_link);
if (time_after(nodev_deadline, deadline))
nodev_deadline = deadline;
while (1) {
unsigned long now = jiffies;
int ready, tmp;
ready = tmp = check_ready(link);
if (ready > 0)
return 0;
/*
* -ENODEV could be transient. Ignore -ENODEV if link
* is online. Also, some SATA devices take a long
* time to clear 0xff after reset. Wait for
* ATA_TMOUT_FF_WAIT[_LONG] on -ENODEV if link isn't
* offline.
*
* Note that some PATA controllers (pata_ali) explode
* if status register is read more than once when
* there's no device attached.
*/
if (ready == -ENODEV) {
if (ata_link_online(link))
ready = 0;
else if ((link->ap->flags & ATA_FLAG_SATA) &&
!ata_link_offline(link) &&
time_before(now, nodev_deadline))
ready = 0;
}
if (ready)
return ready;
if (time_after(now, deadline))
return -EBUSY;
if (!warned && time_after(now, start + 5 * HZ) &&
(deadline - now > 3 * HZ)) {
ata_link_warn(link,
"link is slow to respond, please be patient "
"(ready=%d)\n", tmp);
warned = 1;
}
ata_msleep(link->ap, 50);
}
}
/**
* ata_wait_after_reset - wait for link to become ready after reset
* @link: link to be waited on
* @deadline: deadline jiffies for the operation
* @check_ready: callback to check link readiness
*
* Wait for @link to become ready after reset.
*
* LOCKING:
* EH context.
*
* RETURNS:
* 0 if @link is ready before @deadline; otherwise, -errno.
*/
int ata_wait_after_reset(struct ata_link *link, unsigned long deadline,
int (*check_ready)(struct ata_link *link))
{
ata_msleep(link->ap, ATA_WAIT_AFTER_RESET);
return ata_wait_ready(link, deadline, check_ready);
}
EXPORT_SYMBOL_GPL(ata_wait_after_reset);
/**
* ata_std_prereset - prepare for reset
* @link: ATA link to be reset
* @deadline: deadline jiffies for the operation
*
* @link is about to be reset. Initialize it. Failure from
* prereset makes libata abort whole reset sequence and give up
* that port, so prereset should be best-effort. It does its
* best to prepare for reset sequence but if things go wrong, it
* should just whine, not fail.
*
* LOCKING:
* Kernel thread context (may sleep)
*
* RETURNS:
* 0 on success, -errno otherwise.
*/
int ata_std_prereset(struct ata_link *link, unsigned long deadline)
{
struct ata_port *ap = link->ap;
struct ata_eh_context *ehc = &link->eh_context;
const unsigned long *timing = sata_ehc_deb_timing(ehc);
int rc;
/* if we're about to do hardreset, nothing more to do */
if (ehc->i.action & ATA_EH_HARDRESET)
return 0;
/* if SATA, resume link */
if (ap->flags & ATA_FLAG_SATA) {
rc = sata_link_resume(link, timing, deadline);
/* whine about phy resume failure but proceed */
if (rc && rc != -EOPNOTSUPP)
ata_link_warn(link,
"failed to resume link for reset (errno=%d)\n",
rc);
}
/* no point in trying softreset on offline link */
if (ata_phys_link_offline(link))
ehc->i.action &= ~ATA_EH_SOFTRESET;
return 0;
}
EXPORT_SYMBOL_GPL(ata_std_prereset);
/**
* sata_std_hardreset - COMRESET w/o waiting or classification
* @link: link to reset
* @class: resulting class of attached device
* @deadline: deadline jiffies for the operation
*
* Standard SATA COMRESET w/o waiting or classification.
*
* LOCKING:
* Kernel thread context (may sleep)
*
* RETURNS:
* 0 if link offline, -EAGAIN if link online, -errno on errors.
*/
int sata_std_hardreset(struct ata_link *link, unsigned int *class,
unsigned long deadline)
{
const unsigned long *timing = sata_ehc_deb_timing(&link->eh_context);
bool online;
int rc;
/* do hardreset */
rc = sata_link_hardreset(link, timing, deadline, &online, NULL);
return online ? -EAGAIN : rc;
}
EXPORT_SYMBOL_GPL(sata_std_hardreset);
/**
* ata_std_postreset - standard postreset callback
* @link: the target ata_link
* @classes: classes of attached devices
*
* This function is invoked after a successful reset. Note that
* the device might have been reset more than once using
* different reset methods before postreset is invoked.
*
* LOCKING:
* Kernel thread context (may sleep)
*/
void ata_std_postreset(struct ata_link *link, unsigned int *classes)
{
u32 serror;
DPRINTK("ENTER\n");
/* reset complete, clear SError */
if (!sata_scr_read(link, SCR_ERROR, &serror))
sata_scr_write(link, SCR_ERROR, serror);
/* print link status */
sata_print_link_status(link);
DPRINTK("EXIT\n");
}
EXPORT_SYMBOL_GPL(ata_std_postreset);
/**
* ata_dev_same_device - Determine whether new ID matches configured device
* @dev: device to compare against
* @new_class: class of the new device
* @new_id: IDENTIFY page of the new device
*
* Compare @new_class and @new_id against @dev and determine
* whether @dev is the device indicated by @new_class and
* @new_id.
*
* LOCKING:
* None.
*
* RETURNS:
* 1 if @dev matches @new_class and @new_id, 0 otherwise.
*/
static int ata_dev_same_device(struct ata_device *dev, unsigned int new_class,
const u16 *new_id)
{
const u16 *old_id = dev->id;
unsigned char model[2][ATA_ID_PROD_LEN + 1];
unsigned char serial[2][ATA_ID_SERNO_LEN + 1];
if (dev->class != new_class) {
ata_dev_info(dev, "class mismatch %d != %d\n",
dev->class, new_class);
return 0;
}
ata_id_c_string(old_id, model[0], ATA_ID_PROD, sizeof(model[0]));
ata_id_c_string(new_id, model[1], ATA_ID_PROD, sizeof(model[1]));
ata_id_c_string(old_id, serial[0], ATA_ID_SERNO, sizeof(serial[0]));
ata_id_c_string(new_id, serial[1], ATA_ID_SERNO, sizeof(serial[1]));
if (strcmp(model[0], model[1])) {
ata_dev_info(dev, "model number mismatch '%s' != '%s'\n",
model[0], model[1]);
return 0;
}
if (strcmp(serial[0], serial[1])) {
ata_dev_info(dev, "serial number mismatch '%s' != '%s'\n",
serial[0], serial[1]);
return 0;
}
return 1;
}
/**
* ata_dev_reread_id - Re-read IDENTIFY data
* @dev: target ATA device
* @readid_flags: read ID flags
*
* Re-read IDENTIFY page and make sure @dev is still attached to
* the port.
*
* LOCKING:
* Kernel thread context (may sleep)
*
* RETURNS:
* 0 on success, negative errno otherwise
*/
int ata_dev_reread_id(struct ata_device *dev, unsigned int readid_flags)
{
unsigned int class = dev->class;
u16 *id = (void *)dev->link->ap->sector_buf;
int rc;
/* read ID data */
rc = ata_dev_read_id(dev, &class, readid_flags, id);
if (rc)
return rc;
/* is the device still there? */
if (!ata_dev_same_device(dev, class, id))
return -ENODEV;
memcpy(dev->id, id, sizeof(id[0]) * ATA_ID_WORDS);
return 0;
}
/**
* ata_dev_revalidate - Revalidate ATA device
* @dev: device to revalidate
* @new_class: new class code
* @readid_flags: read ID flags
*
* Re-read IDENTIFY page, make sure @dev is still attached to the
* port and reconfigure it according to the new IDENTIFY page.
*
* LOCKING:
* Kernel thread context (may sleep)
*
* RETURNS:
* 0 on success, negative errno otherwise
*/
int ata_dev_revalidate(struct ata_device *dev, unsigned int new_class,
unsigned int readid_flags)
{
u64 n_sectors = dev->n_sectors;
u64 n_native_sectors = dev->n_native_sectors;
int rc;
if (!ata_dev_enabled(dev))
return -ENODEV;
/* fail early if !ATA && !ATAPI to avoid issuing [P]IDENTIFY to PMP */
if (ata_class_enabled(new_class) &&
new_class != ATA_DEV_ATA &&
new_class != ATA_DEV_ATAPI &&
new_class != ATA_DEV_ZAC &&
new_class != ATA_DEV_SEMB) {
ata_dev_info(dev, "class mismatch %u != %u\n",
dev->class, new_class);
rc = -ENODEV;
goto fail;
}
/* re-read ID */
rc = ata_dev_reread_id(dev, readid_flags);
if (rc)
goto fail;
/* configure device according to the new ID */
rc = ata_dev_configure(dev);
if (rc)
goto fail;
/* verify n_sectors hasn't changed */
if (dev->class != ATA_DEV_ATA || !n_sectors ||
dev->n_sectors == n_sectors)
return 0;
/* n_sectors has changed */
ata_dev_warn(dev, "n_sectors mismatch %llu != %llu\n",
(unsigned long long)n_sectors,
(unsigned long long)dev->n_sectors);
/*
* Something could have caused HPA to be unlocked
* involuntarily. If n_native_sectors hasn't changed and the
* new size matches it, keep the device.
*/
if (dev->n_native_sectors == n_native_sectors &&
dev->n_sectors > n_sectors && dev->n_sectors == n_native_sectors) {
ata_dev_warn(dev,
"new n_sectors matches native, probably "
"late HPA unlock, n_sectors updated\n");
/* use the larger n_sectors */
return 0;
}
/*
* Some BIOSes boot w/o HPA but resume w/ HPA locked. Try
* unlocking HPA in those cases.
*
* https://bugzilla.kernel.org/show_bug.cgi?id=15396
*/
if (dev->n_native_sectors == n_native_sectors &&
dev->n_sectors < n_sectors && n_sectors == n_native_sectors &&
!(dev->horkage & ATA_HORKAGE_BROKEN_HPA)) {
ata_dev_warn(dev,
"old n_sectors matches native, probably "
"late HPA lock, will try to unlock HPA\n");
/* try unlocking HPA */
dev->flags |= ATA_DFLAG_UNLOCK_HPA;
rc = -EIO;
} else
rc = -ENODEV;
/* restore original n_[native_]sectors and fail */
dev->n_native_sectors = n_native_sectors;
dev->n_sectors = n_sectors;
fail:
ata_dev_err(dev, "revalidation failed (errno=%d)\n", rc);
return rc;
}
struct ata_blacklist_entry {
const char *model_num;
const char *model_rev;
unsigned long horkage;
};
static const struct ata_blacklist_entry ata_device_blacklist [] = {
/* Devices with DMA related problems under Linux */
{ "WDC AC11000H", NULL, ATA_HORKAGE_NODMA },
{ "WDC AC22100H", NULL, ATA_HORKAGE_NODMA },
{ "WDC AC32500H", NULL, ATA_HORKAGE_NODMA },
{ "WDC AC33100H", NULL, ATA_HORKAGE_NODMA },
{ "WDC AC31600H", NULL, ATA_HORKAGE_NODMA },
{ "WDC AC32100H", "24.09P07", ATA_HORKAGE_NODMA },
{ "WDC AC23200L", "21.10N21", ATA_HORKAGE_NODMA },
{ "Compaq CRD-8241B", NULL, ATA_HORKAGE_NODMA },
{ "CRD-8400B", NULL, ATA_HORKAGE_NODMA },
{ "CRD-848[02]B", NULL, ATA_HORKAGE_NODMA },
{ "CRD-84", NULL, ATA_HORKAGE_NODMA },
{ "SanDisk SDP3B", NULL, ATA_HORKAGE_NODMA },
{ "SanDisk SDP3B-64", NULL, ATA_HORKAGE_NODMA },
{ "SANYO CD-ROM CRD", NULL, ATA_HORKAGE_NODMA },
{ "HITACHI CDR-8", NULL, ATA_HORKAGE_NODMA },
{ "HITACHI CDR-8[34]35",NULL, ATA_HORKAGE_NODMA },
{ "Toshiba CD-ROM XM-6202B", NULL, ATA_HORKAGE_NODMA },
{ "TOSHIBA CD-ROM XM-1702BC", NULL, ATA_HORKAGE_NODMA },
{ "CD-532E-A", NULL, ATA_HORKAGE_NODMA },
{ "E-IDE CD-ROM CR-840",NULL, ATA_HORKAGE_NODMA },
{ "CD-ROM Drive/F5A", NULL, ATA_HORKAGE_NODMA },
{ "WPI CDD-820", NULL, ATA_HORKAGE_NODMA },
{ "SAMSUNG CD-ROM SC-148C", NULL, ATA_HORKAGE_NODMA },
{ "SAMSUNG CD-ROM SC", NULL, ATA_HORKAGE_NODMA },
{ "ATAPI CD-ROM DRIVE 40X MAXIMUM",NULL,ATA_HORKAGE_NODMA },
{ "_NEC DV5800A", NULL, ATA_HORKAGE_NODMA },
{ "SAMSUNG CD-ROM SN-124", "N001", ATA_HORKAGE_NODMA },
{ "Seagate STT20000A", NULL, ATA_HORKAGE_NODMA },
{ " 2GB ATA Flash Disk", "ADMA428M", ATA_HORKAGE_NODMA },
{ "VRFDFC22048UCHC-TE*", NULL, ATA_HORKAGE_NODMA },
/* Odd clown on sil3726/4726 PMPs */
{ "Config Disk", NULL, ATA_HORKAGE_DISABLE },
/* Similar story with ASMedia 1092 */
{ "ASMT109x- Config", NULL, ATA_HORKAGE_DISABLE },
/* Weird ATAPI devices */
{ "TORiSAN DVD-ROM DRD-N216", NULL, ATA_HORKAGE_MAX_SEC_128 },
{ "QUANTUM DAT DAT72-000", NULL, ATA_HORKAGE_ATAPI_MOD16_DMA },
{ "Slimtype DVD A DS8A8SH", NULL, ATA_HORKAGE_MAX_SEC_LBA48 },
{ "Slimtype DVD A DS8A9SH", NULL, ATA_HORKAGE_MAX_SEC_LBA48 },
/*
* Causes silent data corruption with higher max sects.
* http://lkml.kernel.org/g/x49wpy40ysk.fsf@segfault.boston.devel.redhat.com
*/
{ "ST380013AS", "3.20", ATA_HORKAGE_MAX_SEC_1024 },
/*
* These devices time out with higher max sects.
* https://bugzilla.kernel.org/show_bug.cgi?id=121671
*/
{ "LITEON CX1-JB*-HP", NULL, ATA_HORKAGE_MAX_SEC_1024 },
{ "LITEON EP1-*", NULL, ATA_HORKAGE_MAX_SEC_1024 },
/* Devices we expect to fail diagnostics */
/* Devices where NCQ should be avoided */
/* NCQ is slow */
{ "WDC WD740ADFD-00", NULL, ATA_HORKAGE_NONCQ },
{ "WDC WD740ADFD-00NLR1", NULL, ATA_HORKAGE_NONCQ, },
/* http://thread.gmane.org/gmane.linux.ide/14907 */
{ "FUJITSU MHT2060BH", NULL, ATA_HORKAGE_NONCQ },
/* NCQ is broken */
{ "Maxtor *", "BANC*", ATA_HORKAGE_NONCQ },
{ "Maxtor 7V300F0", "VA111630", ATA_HORKAGE_NONCQ },
{ "ST380817AS", "3.42", ATA_HORKAGE_NONCQ },
{ "ST3160023AS", "3.42", ATA_HORKAGE_NONCQ },
{ "OCZ CORE_SSD", "02.10104", ATA_HORKAGE_NONCQ },
/* Seagate NCQ + FLUSH CACHE firmware bug */
{ "ST31500341AS", "SD1[5-9]", ATA_HORKAGE_NONCQ |
ATA_HORKAGE_FIRMWARE_WARN },
{ "ST31000333AS", "SD1[5-9]", ATA_HORKAGE_NONCQ |
ATA_HORKAGE_FIRMWARE_WARN },
{ "ST3640[36]23AS", "SD1[5-9]", ATA_HORKAGE_NONCQ |
ATA_HORKAGE_FIRMWARE_WARN },
{ "ST3320[68]13AS", "SD1[5-9]", ATA_HORKAGE_NONCQ |
ATA_HORKAGE_FIRMWARE_WARN },
/* drives which fail FPDMA_AA activation (some may freeze afterwards)
the ST disks also have LPM issues */
{ "ST1000LM024 HN-M101MBB", NULL, ATA_HORKAGE_BROKEN_FPDMA_AA |
ATA_HORKAGE_NOLPM, },
{ "VB0250EAVER", "HPG7", ATA_HORKAGE_BROKEN_FPDMA_AA },
/* Blacklist entries taken from Silicon Image 3124/3132
Windows driver .inf file - also several Linux problem reports */
{ "HTS541060G9SA00", "MB3OC60D", ATA_HORKAGE_NONCQ, },
{ "HTS541080G9SA00", "MB4OC60D", ATA_HORKAGE_NONCQ, },
{ "HTS541010G9SA00", "MBZOC60D", ATA_HORKAGE_NONCQ, },
/* https://bugzilla.kernel.org/show_bug.cgi?id=15573 */
{ "C300-CTFDDAC128MAG", "0001", ATA_HORKAGE_NONCQ, },
/* Sandisk SD7/8/9s lock up hard on large trims */
{ "SanDisk SD[789]*", NULL, ATA_HORKAGE_MAX_TRIM_128M, },
/* devices which puke on READ_NATIVE_MAX */
{ "HDS724040KLSA80", "KFAOA20N", ATA_HORKAGE_BROKEN_HPA, },
{ "WDC WD3200JD-00KLB0", "WD-WCAMR1130137", ATA_HORKAGE_BROKEN_HPA },
{ "WDC WD2500JD-00HBB0", "WD-WMAL71490727", ATA_HORKAGE_BROKEN_HPA },
{ "MAXTOR 6L080L4", "A93.0500", ATA_HORKAGE_BROKEN_HPA },
/* this one allows HPA unlocking but fails IOs on the area */
{ "OCZ-VERTEX", "1.30", ATA_HORKAGE_BROKEN_HPA },
/* Devices which report 1 sector over size HPA */
{ "ST340823A", NULL, ATA_HORKAGE_HPA_SIZE, },
{ "ST320413A", NULL, ATA_HORKAGE_HPA_SIZE, },
{ "ST310211A", NULL, ATA_HORKAGE_HPA_SIZE, },
/* Devices which get the IVB wrong */
{ "QUANTUM FIREBALLlct10 05", "A03.0900", ATA_HORKAGE_IVB, },
/* Maybe we should just blacklist TSSTcorp... */
{ "TSSTcorp CDDVDW SH-S202[HJN]", "SB0[01]", ATA_HORKAGE_IVB, },
/* Devices that do not need bridging limits applied */
{ "MTRON MSP-SATA*", NULL, ATA_HORKAGE_BRIDGE_OK, },
{ "BUFFALO HD-QSU2/R5", NULL, ATA_HORKAGE_BRIDGE_OK, },
/* Devices which aren't very happy with higher link speeds */
{ "WD My Book", NULL, ATA_HORKAGE_1_5_GBPS, },
{ "Seagate FreeAgent GoFlex", NULL, ATA_HORKAGE_1_5_GBPS, },
/*
* Devices which choke on SETXFER. Applies only if both the
* device and controller are SATA.
*/
{ "PIONEER DVD-RW DVRTD08", NULL, ATA_HORKAGE_NOSETXFER },
{ "PIONEER DVD-RW DVRTD08A", NULL, ATA_HORKAGE_NOSETXFER },
{ "PIONEER DVD-RW DVR-215", NULL, ATA_HORKAGE_NOSETXFER },
{ "PIONEER DVD-RW DVR-212D", NULL, ATA_HORKAGE_NOSETXFER },
{ "PIONEER DVD-RW DVR-216D", NULL, ATA_HORKAGE_NOSETXFER },
/* Crucial BX100 SSD 500GB has broken LPM support */
{ "CT500BX100SSD1", NULL, ATA_HORKAGE_NOLPM },
/* 512GB MX100 with MU01 firmware has both queued TRIM and LPM issues */
{ "Crucial_CT512MX100*", "MU01", ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM |
ATA_HORKAGE_NOLPM, },
/* 512GB MX100 with newer firmware has only LPM issues */
{ "Crucial_CT512MX100*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM |
ATA_HORKAGE_NOLPM, },
/* 480GB+ M500 SSDs have both queued TRIM and LPM issues */
{ "Crucial_CT480M500*", NULL, ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM |
ATA_HORKAGE_NOLPM, },
{ "Crucial_CT960M500*", NULL, ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM |
ATA_HORKAGE_NOLPM, },
/* These specific Samsung models/firmware-revs do not handle LPM well */
{ "SAMSUNG MZMPC128HBFU-000MV", "CXM14M1Q", ATA_HORKAGE_NOLPM, },
{ "SAMSUNG SSD PM830 mSATA *", "CXM13D1Q", ATA_HORKAGE_NOLPM, },
{ "SAMSUNG MZ7TD256HAFV-000L9", NULL, ATA_HORKAGE_NOLPM, },
{ "SAMSUNG MZ7TE512HMHP-000L1", "EXT06L0Q", ATA_HORKAGE_NOLPM, },
/* devices that don't properly handle queued TRIM commands */
{ "Micron_M500IT_*", "MU01", ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "Micron_M500_*", NULL, ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "Crucial_CT*M500*", NULL, ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "Micron_M5[15]0_*", "MU01", ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "Crucial_CT*M550*", "MU01", ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "Crucial_CT*MX100*", "MU01", ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "Samsung SSD 840 EVO*", NULL, ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_NO_DMA_LOG |
ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "Samsung SSD 840*", NULL, ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "Samsung SSD 850*", NULL, ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "Samsung SSD 860*", NULL, ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM |
ATA_HORKAGE_NO_NCQ_ON_ATI, },
{ "Samsung SSD 870*", NULL, ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM |
ATA_HORKAGE_NO_NCQ_ON_ATI, },
{ "FCCT*M500*", NULL, ATA_HORKAGE_NO_NCQ_TRIM |
ATA_HORKAGE_ZERO_AFTER_TRIM, },
/* devices that don't properly handle TRIM commands */
{ "SuperSSpeed S238*", NULL, ATA_HORKAGE_NOTRIM, },
{ "M88V29*", NULL, ATA_HORKAGE_NOTRIM, },
/*
* As defined, the DRAT (Deterministic Read After Trim) and RZAT
* (Return Zero After Trim) flags in the ATA Command Set are
* unreliable in the sense that they only define what happens if
* the device successfully executed the DSM TRIM command. TRIM
* is only advisory, however, and the device is free to silently
* ignore all or parts of the request.
*
* Whitelist drives that are known to reliably return zeroes
* after TRIM.
*/
/*
* The intel 510 drive has buggy DRAT/RZAT. Explicitly exclude
* that model before whitelisting all other intel SSDs.
*/
{ "INTEL*SSDSC2MH*", NULL, 0, },
{ "Micron*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "Crucial*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "INTEL*SSD*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "SSD*INTEL*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "Samsung*SSD*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "SAMSUNG*SSD*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "SAMSUNG*MZ7KM*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, },
{ "ST[1248][0248]0[FH]*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, },
/*
* Some WD SATA-I drives spin up and down erratically when the link
* is put into the slumber mode. We don't have full list of the
* affected devices. Disable LPM if the device matches one of the
* known prefixes and is SATA-1. As a side effect LPM partial is
* lost too.
*
* https://bugzilla.kernel.org/show_bug.cgi?id=57211
*/
{ "WDC WD800JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM },
{ "WDC WD1200JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM },
{ "WDC WD1600JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM },
{ "WDC WD2000JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM },
{ "WDC WD2500JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM },
{ "WDC WD3000JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM },
{ "WDC WD3200JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM },
/* End Marker */
{ }
};
static unsigned long ata_dev_blacklisted(const struct ata_device *dev)
{
unsigned char model_num[ATA_ID_PROD_LEN + 1];
unsigned char model_rev[ATA_ID_FW_REV_LEN + 1];
const struct ata_blacklist_entry *ad = ata_device_blacklist;
ata_id_c_string(dev->id, model_num, ATA_ID_PROD, sizeof(model_num));
ata_id_c_string(dev->id, model_rev, ATA_ID_FW_REV, sizeof(model_rev));
while (ad->model_num) {
if (glob_match(ad->model_num, model_num)) {
if (ad->model_rev == NULL)
return ad->horkage;
if (glob_match(ad->model_rev, model_rev))
return ad->horkage;
}
ad++;
}
return 0;
}
static int ata_dma_blacklisted(const struct ata_device *dev)
{
/* We don't support polling DMA.
* DMA blacklist those ATAPI devices with CDB-intr (and use PIO)
* if the LLDD handles only interrupts in the HSM_ST_LAST state.
*/
if ((dev->link->ap->flags & ATA_FLAG_PIO_POLLING) &&
(dev->flags & ATA_DFLAG_CDB_INTR))
return 1;
return (dev->horkage & ATA_HORKAGE_NODMA) ? 1 : 0;
}
/**
* ata_is_40wire - check drive side detection
* @dev: device
*
* Perform drive side detection decoding, allowing for device vendors
* who can't follow the documentation.
*/
static int ata_is_40wire(struct ata_device *dev)
{
if (dev->horkage & ATA_HORKAGE_IVB)
return ata_drive_40wire_relaxed(dev->id);
return ata_drive_40wire(dev->id);
}
/**
* cable_is_40wire - 40/80/SATA decider
* @ap: port to consider
*
* This function encapsulates the policy for speed management
* in one place. At the moment we don't cache the result but
* there is a good case for setting ap->cbl to the result when
* we are called with unknown cables (and figuring out if it
* impacts hotplug at all).
*
* Return 1 if the cable appears to be 40 wire.
*/
static int cable_is_40wire(struct ata_port *ap)
{
struct ata_link *link;
struct ata_device *dev;
/* If the controller thinks we are 40 wire, we are. */
if (ap->cbl == ATA_CBL_PATA40)
return 1;
/* If the controller thinks we are 80 wire, we are. */
if (ap->cbl == ATA_CBL_PATA80 || ap->cbl == ATA_CBL_SATA)
return 0;
/* If the system is known to be 40 wire short cable (eg
* laptop), then we allow 80 wire modes even if the drive
* isn't sure.
*/
if (ap->cbl == ATA_CBL_PATA40_SHORT)
return 0;
/* If the controller doesn't know, we scan.
*
* Note: We look for all 40 wire detects at this point. Any
* 80 wire detect is taken to be 80 wire cable because
* - in many setups only the one drive (slave if present) will
* give a valid detect
* - if you have a non detect capable drive you don't want it
* to colour the choice
*/
ata_for_each_link(link, ap, EDGE) {
ata_for_each_dev(dev, link, ENABLED) {
if (!ata_is_40wire(dev))
return 0;
}
}
return 1;
}
/**
* ata_dev_xfermask - Compute supported xfermask of the given device
* @dev: Device to compute xfermask for
*
* Compute supported xfermask of @dev and store it in
* dev->*_mask. This function is responsible for applying all
* known limits including host controller limits, device
* blacklist, etc...
*
* LOCKING:
* None.
*/
static void ata_dev_xfermask(struct ata_device *dev)
{
struct ata_link *link = dev->link;
struct ata_port *ap = link->ap;
struct ata_host *host = ap->host;
unsigned long xfer_mask;
/* controller modes available */
xfer_mask = ata_pack_xfermask(ap->pio_mask,
ap->mwdma_mask, ap->udma_mask);
/* drive modes available */
xfer_mask &= ata_pack_xfermask(dev->pio_mask,
dev->mwdma_mask, dev->udma_mask);
xfer_mask &= ata_id_xfermask(dev->id);
/*
* CFA Advanced TrueIDE timings are not allowed on a shared
* cable
*/
if (ata_dev_pair(dev)) {
/* No PIO5 or PIO6 */
xfer_mask &= ~(0x03 << (ATA_SHIFT_PIO + 5));
/* No MWDMA3 or MWDMA 4 */
xfer_mask &= ~(0x03 << (ATA_SHIFT_MWDMA + 3));
}
if (ata_dma_blacklisted(dev)) {
xfer_mask &= ~(ATA_MASK_MWDMA | ATA_MASK_UDMA);
ata_dev_warn(dev,
"device is on DMA blacklist, disabling DMA\n");
}
if ((host->flags & ATA_HOST_SIMPLEX) &&
host->simplex_claimed && host->simplex_claimed != ap) {
xfer_mask &= ~(ATA_MASK_MWDMA | ATA_MASK_UDMA);
ata_dev_warn(dev,
"simplex DMA is claimed by other device, disabling DMA\n");
}
if (ap->flags & ATA_FLAG_NO_IORDY)
xfer_mask &= ata_pio_mask_no_iordy(dev);
if (ap->ops->mode_filter)
xfer_mask = ap->ops->mode_filter(dev, xfer_mask);
/* Apply cable rule here. Don't apply it early because when
* we handle hot plug the cable type can itself change.
* Check this last so that we know if the transfer rate was
* solely limited by the cable.
* Unknown or 80 wire cables reported host side are checked
* drive side as well. Cases where we know a 40wire cable
* is used safely for 80 are not checked here.
*/
if (xfer_mask & (0xF8 << ATA_SHIFT_UDMA))
/* UDMA/44 or higher would be available */
if (cable_is_40wire(ap)) {
ata_dev_warn(dev,
"limited to UDMA/33 due to 40-wire cable\n");
xfer_mask &= ~(0xF8 << ATA_SHIFT_UDMA);
}
ata_unpack_xfermask(xfer_mask, &dev->pio_mask,
&dev->mwdma_mask, &dev->udma_mask);
}
/**
* ata_dev_set_xfermode - Issue SET FEATURES - XFER MODE command
* @dev: Device to which command will be sent
*
* Issue SET FEATURES - XFER MODE command to device @dev
* on port @ap.
*
* LOCKING:
* PCI/etc. bus probe sem.
*
* RETURNS:
* 0 on success, AC_ERR_* mask otherwise.
*/
static unsigned int ata_dev_set_xfermode(struct ata_device *dev)
{
struct ata_taskfile tf;
unsigned int err_mask;
/* set up set-features taskfile */
DPRINTK("set features - xfer mode\n");
/* Some controllers and ATAPI devices show flaky interrupt
* behavior after setting xfer mode. Use polling instead.
*/
ata_tf_init(dev, &tf);
tf.command = ATA_CMD_SET_FEATURES;
tf.feature = SETFEATURES_XFER;
tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_POLLING;
tf.protocol = ATA_PROT_NODATA;
/* If we are using IORDY we must send the mode setting command */
if (ata_pio_need_iordy(dev))
tf.nsect = dev->xfer_mode;
/* If the device has IORDY and the controller does not - turn it off */
else if (ata_id_has_iordy(dev->id))
tf.nsect = 0x01;
else /* In the ancient relic department - skip all of this */
return 0;
/* On some disks, this command causes spin-up, so we need longer timeout */
err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 15000);
DPRINTK("EXIT, err_mask=%x\n", err_mask);
return err_mask;
}
/**
* ata_dev_set_feature - Issue SET FEATURES - SATA FEATURES
* @dev: Device to which command will be sent
* @enable: Whether to enable or disable the feature
* @feature: The sector count represents the feature to set
*
* Issue SET FEATURES - SATA FEATURES command to device @dev
* on port @ap with sector count
*
* LOCKING:
* PCI/etc. bus probe sem.
*
* RETURNS:
* 0 on success, AC_ERR_* mask otherwise.
*/
unsigned int ata_dev_set_feature(struct ata_device *dev, u8 enable, u8 feature)
{
struct ata_taskfile tf;
unsigned int err_mask;
unsigned long timeout = 0;
/* set up set-features taskfile */
DPRINTK("set features - SATA features\n");
ata_tf_init(dev, &tf);
tf.command = ATA_CMD_SET_FEATURES;
tf.feature = enable;
tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
tf.protocol = ATA_PROT_NODATA;
tf.nsect = feature;
if (enable == SETFEATURES_SPINUP)
timeout = ata_probe_timeout ?
ata_probe_timeout * 1000 : SETFEATURES_SPINUP_TIMEOUT;
err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, timeout);
DPRINTK("EXIT, err_mask=%x\n", err_mask);
return err_mask;
}
EXPORT_SYMBOL_GPL(ata_dev_set_feature);
/**
* ata_dev_init_params - Issue INIT DEV PARAMS command
* @dev: Device to which command will be sent
* @heads: Number of heads (taskfile parameter)
* @sectors: Number of sectors (taskfile parameter)
*
* LOCKING:
* Kernel thread context (may sleep)
*
* RETURNS:
* 0 on success, AC_ERR_* mask otherwise.
*/
static unsigned int ata_dev_init_params(struct ata_device *dev,
u16 heads, u16 sectors)
{
struct ata_taskfile tf;
unsigned int err_mask;
/* Number of sectors per track 1-255. Number of heads 1-16 */
if (sectors < 1 || sectors > 255 || heads < 1 || heads > 16)
return AC_ERR_INVALID;
/* set up init dev params taskfile */
DPRINTK("init dev params \n");
ata_tf_init(dev, &tf);
tf.command = ATA_CMD_INIT_DEV_PARAMS;
tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
tf.protocol = ATA_PROT_NODATA;
tf.nsect = sectors;
tf.device |= (heads - 1) & 0x0f; /* max head = num. of heads - 1 */
err_mask = ata_exec_internal(dev, &tf, NULL, DMA_NONE, NULL, 0, 0);
/* A clean abort indicates an original or just out of spec drive
and we should continue as we issue the setup based on the
drive reported working geometry */
if (err_mask == AC_ERR_DEV && (tf.feature & ATA_ABORTED))
err_mask = 0;
DPRINTK("EXIT, err_mask=%x\n", err_mask);
return err_mask;
}
/**
* atapi_check_dma - Check whether ATAPI DMA can be supported
* @qc: Metadata associated with taskfile to check
*
* Allow low-level driver to filter ATA PACKET commands, returning
* a status indicating whether or not it is OK to use DMA for the
* supplied PACKET command.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*
* RETURNS: 0 when ATAPI DMA can be used
* nonzero otherwise
*/
int atapi_check_dma(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
/* Don't allow DMA if it isn't multiple of 16 bytes. Quite a
* few ATAPI devices choke on such DMA requests.
*/
if (!(qc->dev->horkage & ATA_HORKAGE_ATAPI_MOD16_DMA) &&
unlikely(qc->nbytes & 15))
return 1;
if (ap->ops->check_atapi_dma)
return ap->ops->check_atapi_dma(qc);
return 0;
}
/**
* ata_std_qc_defer - Check whether a qc needs to be deferred
* @qc: ATA command in question
*
* Non-NCQ commands cannot run with any other command, NCQ or
* not. As upper layer only knows the queue depth, we are
* responsible for maintaining exclusion. This function checks
* whether a new command @qc can be issued.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*
* RETURNS:
* ATA_DEFER_* if deferring is needed, 0 otherwise.
*/
int ata_std_qc_defer(struct ata_queued_cmd *qc)
{
struct ata_link *link = qc->dev->link;
if (ata_is_ncq(qc->tf.protocol)) {
if (!ata_tag_valid(link->active_tag))
return 0;
} else {
if (!ata_tag_valid(link->active_tag) && !link->sactive)
return 0;
}
return ATA_DEFER_LINK;
}
EXPORT_SYMBOL_GPL(ata_std_qc_defer);
enum ata_completion_errors ata_noop_qc_prep(struct ata_queued_cmd *qc)
{
return AC_ERR_OK;
}
EXPORT_SYMBOL_GPL(ata_noop_qc_prep);
/**
* ata_sg_init - Associate command with scatter-gather table.
* @qc: Command to be associated
* @sg: Scatter-gather table.
* @n_elem: Number of elements in s/g table.
*
* Initialize the data-related elements of queued_cmd @qc
* to point to a scatter-gather table @sg, containing @n_elem
* elements.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
unsigned int n_elem)
{
qc->sg = sg;
qc->n_elem = n_elem;
qc->cursg = qc->sg;
}
#ifdef CONFIG_HAS_DMA
/**
* ata_sg_clean - Unmap DMA memory associated with command
* @qc: Command containing DMA memory to be released
*
* Unmap all mapped DMA memory associated with this command.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
static void ata_sg_clean(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct scatterlist *sg = qc->sg;
int dir = qc->dma_dir;
WARN_ON_ONCE(sg == NULL);
VPRINTK("unmapping %u sg elements\n", qc->n_elem);
if (qc->n_elem)
dma_unmap_sg(ap->dev, sg, qc->orig_n_elem, dir);
qc->flags &= ~ATA_QCFLAG_DMAMAP;
qc->sg = NULL;
}
/**
* ata_sg_setup - DMA-map the scatter-gather table associated with a command.
* @qc: Command with scatter-gather table to be mapped.
*
* DMA-map the scatter-gather table associated with queued_cmd @qc.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*
* RETURNS:
* Zero on success, negative on error.
*
*/
static int ata_sg_setup(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
unsigned int n_elem;
VPRINTK("ENTER, ata%u\n", ap->print_id);
n_elem = dma_map_sg(ap->dev, qc->sg, qc->n_elem, qc->dma_dir);
if (n_elem < 1)
return -1;
VPRINTK("%d sg elements mapped\n", n_elem);
qc->orig_n_elem = qc->n_elem;
qc->n_elem = n_elem;
qc->flags |= ATA_QCFLAG_DMAMAP;
return 0;
}
#else /* !CONFIG_HAS_DMA */
static inline void ata_sg_clean(struct ata_queued_cmd *qc) {}
static inline int ata_sg_setup(struct ata_queued_cmd *qc) { return -1; }
#endif /* !CONFIG_HAS_DMA */
/**
* swap_buf_le16 - swap halves of 16-bit words in place
* @buf: Buffer to swap
* @buf_words: Number of 16-bit words in buffer.
*
* Swap halves of 16-bit words if needed to convert from
* little-endian byte order to native cpu byte order, or
* vice-versa.
*
* LOCKING:
* Inherited from caller.
*/
void swap_buf_le16(u16 *buf, unsigned int buf_words)
{
#ifdef __BIG_ENDIAN
unsigned int i;
for (i = 0; i < buf_words; i++)
buf[i] = le16_to_cpu(buf[i]);
#endif /* __BIG_ENDIAN */
}
/**
* ata_qc_new_init - Request an available ATA command, and initialize it
* @dev: Device from whom we request an available command structure
* @tag: tag
*
* LOCKING:
* None.
*/
struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag)
{
struct ata_port *ap = dev->link->ap;
struct ata_queued_cmd *qc;
/* no command while frozen */
if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
return NULL;
/* libsas case */
if (ap->flags & ATA_FLAG_SAS_HOST) { tag = ata_sas_allocate_tag(ap); if (tag < 0)
return NULL;
}
qc = __ata_qc_from_tag(ap, tag); qc->tag = qc->hw_tag = tag;
qc->scsicmd = NULL;
qc->ap = ap;
qc->dev = dev;
ata_qc_reinit(qc);
return qc;
}
/**
* ata_qc_free - free unused ata_queued_cmd
* @qc: Command to complete
*
* Designed to free unused ata_queued_cmd object
* in case something prevents using it.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
void ata_qc_free(struct ata_queued_cmd *qc)
{
struct ata_port *ap;
unsigned int tag;
WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */
ap = qc->ap;
qc->flags = 0;
tag = qc->tag;
if (ata_tag_valid(tag)) {
qc->tag = ATA_TAG_POISON;
if (ap->flags & ATA_FLAG_SAS_HOST)
ata_sas_free_tag(tag, ap);
}
}
void __ata_qc_complete(struct ata_queued_cmd *qc)
{
struct ata_port *ap;
struct ata_link *link;
WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */
WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE));
ap = qc->ap;
link = qc->dev->link;
if (likely(qc->flags & ATA_QCFLAG_DMAMAP))
ata_sg_clean(qc);
/* command should be marked inactive atomically with qc completion */
if (ata_is_ncq(qc->tf.protocol)) {
link->sactive &= ~(1 << qc->hw_tag);
if (!link->sactive)
ap->nr_active_links--;
} else {
link->active_tag = ATA_TAG_POISON;
ap->nr_active_links--;
}
/* clear exclusive status */
if (unlikely(qc->flags & ATA_QCFLAG_CLEAR_EXCL &&
ap->excl_link == link))
ap->excl_link = NULL;
/* atapi: mark qc as inactive to prevent the interrupt handler
* from completing the command twice later, before the error handler
* is called. (when rc != 0 and atapi request sense is needed)
*/
qc->flags &= ~ATA_QCFLAG_ACTIVE;
ap->qc_active &= ~(1ULL << qc->tag);
/* call completion callback */
qc->complete_fn(qc);
}
static void fill_result_tf(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
qc->result_tf.flags = qc->tf.flags;
ap->ops->qc_fill_rtf(qc);
}
static void ata_verify_xfer(struct ata_queued_cmd *qc)
{
struct ata_device *dev = qc->dev;
if (!ata_is_data(qc->tf.protocol))
return;
if ((dev->mwdma_mask || dev->udma_mask) && ata_is_pio(qc->tf.protocol))
return;
dev->flags &= ~ATA_DFLAG_DUBIOUS_XFER;
}
/**
* ata_qc_complete - Complete an active ATA command
* @qc: Command to complete
*
* Indicate to the mid and upper layers that an ATA command has
* completed, with either an ok or not-ok status.
*
* Refrain from calling this function multiple times when
* successfully completing multiple NCQ commands.
* ata_qc_complete_multiple() should be used instead, which will
* properly update IRQ expect state.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
void ata_qc_complete(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
/* Trigger the LED (if available) */
ledtrig_disk_activity(!!(qc->tf.flags & ATA_TFLAG_WRITE));
/* XXX: New EH and old EH use different mechanisms to
* synchronize EH with regular execution path.
*
* In new EH, a failed qc is marked with ATA_QCFLAG_FAILED.
* Normal execution path is responsible for not accessing a
* failed qc. libata core enforces the rule by returning NULL
* from ata_qc_from_tag() for failed qcs.
*
* Old EH depends on ata_qc_complete() nullifying completion
* requests if ATA_QCFLAG_EH_SCHEDULED is set. Old EH does
* not synchronize with interrupt handler. Only PIO task is
* taken care of.
*/
if (ap->ops->error_handler) {
struct ata_device *dev = qc->dev;
struct ata_eh_info *ehi = &dev->link->eh_info;
if (unlikely(qc->err_mask))
qc->flags |= ATA_QCFLAG_FAILED;
/*
* Finish internal commands without any further processing
* and always with the result TF filled.
*/
if (unlikely(ata_tag_internal(qc->tag))) {
fill_result_tf(qc);
trace_ata_qc_complete_internal(qc);
__ata_qc_complete(qc);
return;
}
/*
* Non-internal qc has failed. Fill the result TF and
* summon EH.
*/
if (unlikely(qc->flags & ATA_QCFLAG_FAILED)) {
fill_result_tf(qc);
trace_ata_qc_complete_failed(qc);
ata_qc_schedule_eh(qc);
return;
}
WARN_ON_ONCE(ap->pflags & ATA_PFLAG_FROZEN);
/* read result TF if requested */
if (qc->flags & ATA_QCFLAG_RESULT_TF)
fill_result_tf(qc);
trace_ata_qc_complete_done(qc);
/* Some commands need post-processing after successful
* completion.
*/
switch (qc->tf.command) {
case ATA_CMD_SET_FEATURES:
if (qc->tf.feature != SETFEATURES_WC_ON &&
qc->tf.feature != SETFEATURES_WC_OFF &&
qc->tf.feature != SETFEATURES_RA_ON &&
qc->tf.feature != SETFEATURES_RA_OFF)
break;
fallthrough;
case ATA_CMD_INIT_DEV_PARAMS: /* CHS translation changed */
case ATA_CMD_SET_MULTI: /* multi_count changed */
/* revalidate device */
ehi->dev_action[dev->devno] |= ATA_EH_REVALIDATE;
ata_port_schedule_eh(ap);
break;
case ATA_CMD_SLEEP:
dev->flags |= ATA_DFLAG_SLEEPING;
break;
}
if (unlikely(dev->flags & ATA_DFLAG_DUBIOUS_XFER))
ata_verify_xfer(qc);
__ata_qc_complete(qc);
} else {
if (qc->flags & ATA_QCFLAG_EH_SCHEDULED)
return;
/* read result TF if failed or requested */
if (qc->err_mask || qc->flags & ATA_QCFLAG_RESULT_TF)
fill_result_tf(qc);
__ata_qc_complete(qc);
}
}
EXPORT_SYMBOL_GPL(ata_qc_complete);
/**
* ata_qc_get_active - get bitmask of active qcs
* @ap: port in question
*
* LOCKING:
* spin_lock_irqsave(host lock)
*
* RETURNS:
* Bitmask of active qcs
*/
u64 ata_qc_get_active(struct ata_port *ap)
{
u64 qc_active = ap->qc_active;
/* ATA_TAG_INTERNAL is sent to hw as tag 0 */
if (qc_active & (1ULL << ATA_TAG_INTERNAL)) {
qc_active |= (1 << 0);
qc_active &= ~(1ULL << ATA_TAG_INTERNAL);
}
return qc_active;
}
EXPORT_SYMBOL_GPL(ata_qc_get_active);
/**
* ata_qc_issue - issue taskfile to device
* @qc: command to issue to device
*
* Prepare an ATA command to submission to device.
* This includes mapping the data into a DMA-able
* area, filling in the S/G table, and finally
* writing the taskfile to hardware, starting the command.
*
* LOCKING:
* spin_lock_irqsave(host lock)
*/
void ata_qc_issue(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct ata_link *link = qc->dev->link;
u8 prot = qc->tf.protocol;
/* Make sure only one non-NCQ command is outstanding. The
* check is skipped for old EH because it reuses active qc to
* request ATAPI sense.
*/
WARN_ON_ONCE(ap->ops->error_handler && ata_tag_valid(link->active_tag));
if (ata_is_ncq(prot)) {
WARN_ON_ONCE(link->sactive & (1 << qc->hw_tag)); if (!link->sactive) ap->nr_active_links++; link->sactive |= 1 << qc->hw_tag;
} else {
WARN_ON_ONCE(link->sactive); ap->nr_active_links++;
link->active_tag = qc->tag;
}
qc->flags |= ATA_QCFLAG_ACTIVE;
ap->qc_active |= 1ULL << qc->tag;
/*
* We guarantee to LLDs that they will have at least one
* non-zero sg if the command is a data command.
*/
if (ata_is_data(prot) && (!qc->sg || !qc->n_elem || !qc->nbytes))
goto sys_err;
if (ata_is_dma(prot) || (ata_is_pio(prot) &&
(ap->flags & ATA_FLAG_PIO_DMA)))
if (ata_sg_setup(qc))
goto sys_err;
/* if device is sleeping, schedule reset and abort the link */
if (unlikely(qc->dev->flags & ATA_DFLAG_SLEEPING)) { link->eh_info.action |= ATA_EH_RESET;
ata_ehi_push_desc(&link->eh_info, "waking up from sleep");
ata_link_abort(link);
return;
}
qc->err_mask |= ap->ops->qc_prep(qc);
if (unlikely(qc->err_mask))
goto err;
trace_ata_qc_issue(qc);
qc->err_mask |= ap->ops->qc_issue(qc);
if (unlikely(qc->err_mask))
goto err;
return;
sys_err:
qc->err_mask |= AC_ERR_SYSTEM;
err:
ata_qc_complete(qc);
}
/**
* ata_phys_link_online - test whether the given link is online
* @link: ATA link to test
*
* Test whether @link is online. Note that this function returns
* 0 if online status of @link cannot be obtained, so
* ata_link_online(link) != !ata_link_offline(link).
*
* LOCKING:
* None.
*
* RETURNS:
* True if the port online status is available and online.
*/
bool ata_phys_link_online(struct ata_link *link)
{
u32 sstatus;
if (sata_scr_read(link, SCR_STATUS, &sstatus) == 0 &&
ata_sstatus_online(sstatus))
return true;
return false;
}
/**
* ata_phys_link_offline - test whether the given link is offline
* @link: ATA link to test
*
* Test whether @link is offline. Note that this function
* returns 0 if offline status of @link cannot be obtained, so
* ata_link_online(link) != !ata_link_offline(link).
*
* LOCKING:
* None.
*
* RETURNS:
* True if the port offline status is available and offline.
*/
bool ata_phys_link_offline(struct ata_link *link)
{
u32 sstatus;
if (sata_scr_read(link, SCR_STATUS, &sstatus) == 0 &&
!ata_sstatus_online(sstatus))
return true;
return false;
}
/**
* ata_link_online - test whether the given link is online
* @link: ATA link to test
*
* Test whether @link is online. This is identical to
* ata_phys_link_online() when there's no slave link. When
* there's a slave link, this function should only be called on
* the master link and will return true if any of M/S links is
* online.
*
* LOCKING:
* None.
*
* RETURNS:
* True if the port online status is available and online.
*/
bool ata_link_online(struct ata_link *link)
{
struct ata_link *slave = link->ap->slave_link;
WARN_ON(link == slave); /* shouldn't be called on slave link */
return ata_phys_link_online(link) ||
(slave && ata_phys_link_online(slave));
}
EXPORT_SYMBOL_GPL(ata_link_online);
/**
* ata_link_offline - test whether the given link is offline
* @link: ATA link to test
*
* Test whether @link is offline. This is identical to
* ata_phys_link_offline() when there's no slave link. When
* there's a slave link, this function should only be called on
* the master link and will return true if both M/S links are
* offline.
*
* LOCKING:
* None.
*
* RETURNS:
* True if the port offline status is available and offline.
*/
bool ata_link_offline(struct ata_link *link)
{
struct ata_link *slave = link->ap->slave_link;
WARN_ON(link == slave); /* shouldn't be called on slave link */
return ata_phys_link_offline(link) &&
(!slave || ata_phys_link_offline(slave));
}
EXPORT_SYMBOL_GPL(ata_link_offline);
#ifdef CONFIG_PM
static void ata_port_request_pm(struct ata_port *ap, pm_message_t mesg,
unsigned int action, unsigned int ehi_flags,
bool async)
{
struct ata_link *link;
unsigned long flags;
/* Previous resume operation might still be in
* progress. Wait for PM_PENDING to clear.
*/
if (ap->pflags & ATA_PFLAG_PM_PENDING) {
ata_port_wait_eh(ap);
WARN_ON(ap->pflags & ATA_PFLAG_PM_PENDING);
}
/* request PM ops to EH */
spin_lock_irqsave(ap->lock, flags);
ap->pm_mesg = mesg;
ap->pflags |= ATA_PFLAG_PM_PENDING;
ata_for_each_link(link, ap, HOST_FIRST) {
link->eh_info.action |= action;
link->eh_info.flags |= ehi_flags;
}
ata_port_schedule_eh(ap);
spin_unlock_irqrestore(ap->lock, flags);
if (!async) {
ata_port_wait_eh(ap);
WARN_ON(ap->pflags & ATA_PFLAG_PM_PENDING);
}
}
/*
* On some hardware, device fails to respond after spun down for suspend. As
* the device won't be used before being resumed, we don't need to touch the
* device. Ask EH to skip the usual stuff and proceed directly to suspend.
*
* http://thread.gmane.org/gmane.linux.ide/46764
*/
static const unsigned int ata_port_suspend_ehi = ATA_EHI_QUIET
| ATA_EHI_NO_AUTOPSY
| ATA_EHI_NO_RECOVERY;
static void ata_port_suspend(struct ata_port *ap, pm_message_t mesg)
{
ata_port_request_pm(ap, mesg, 0, ata_port_suspend_ehi, false);
}
static void ata_port_suspend_async(struct ata_port *ap, pm_message_t mesg)
{
ata_port_request_pm(ap, mesg, 0, ata_port_suspend_ehi, true);
}
static int ata_port_pm_suspend(struct device *dev)
{
struct ata_port *ap = to_ata_port(dev);
if (pm_runtime_suspended(dev))
return 0;
ata_port_suspend(ap, PMSG_SUSPEND);
return 0;
}
static int ata_port_pm_freeze(struct device *dev)
{
struct ata_port *ap = to_ata_port(dev);
if (pm_runtime_suspended(dev))
return 0;
ata_port_suspend(ap, PMSG_FREEZE);
return 0;
}
static int ata_port_pm_poweroff(struct device *dev)
{
ata_port_suspend(to_ata_port(dev), PMSG_HIBERNATE);
return 0;
}
static const unsigned int ata_port_resume_ehi = ATA_EHI_NO_AUTOPSY
| ATA_EHI_QUIET;
static void ata_port_resume(struct ata_port *ap, pm_message_t mesg)
{
ata_port_request_pm(ap, mesg, ATA_EH_RESET, ata_port_resume_ehi, false);
}
static void ata_port_resume_async(struct ata_port *ap, pm_message_t mesg)
{
ata_port_request_pm(ap, mesg, ATA_EH_RESET, ata_port_resume_ehi, true);
}
static int ata_port_pm_resume(struct device *dev)
{
ata_port_resume_async(to_ata_port(dev), PMSG_RESUME);
pm_runtime_disable(dev);
pm_runtime_set_active(dev);
pm_runtime_enable(dev);
return 0;
}
/*
* For ODDs, the upper layer will poll for media change every few seconds,
* which will make it enter and leave suspend state every few seconds. And
* as each suspend will cause a hard/soft reset, the gain of runtime suspend
* is very little and the ODD may malfunction after constantly being reset.
* So the idle callback here will not proceed to suspend if a non-ZPODD capable
* ODD is attached to the port.
*/
static int ata_port_runtime_idle(struct device *dev)
{
struct ata_port *ap = to_ata_port(dev);
struct ata_link *link;
struct ata_device *adev;
ata_for_each_link(link, ap, HOST_FIRST) {
ata_for_each_dev(adev, link, ENABLED)
if (adev->class == ATA_DEV_ATAPI &&
!zpodd_dev_enabled(adev))
return -EBUSY;
}
return 0;
}
static int ata_port_runtime_suspend(struct device *dev)
{
ata_port_suspend(to_ata_port(dev), PMSG_AUTO_SUSPEND);
return 0;
}
static int ata_port_runtime_resume(struct device *dev)
{
ata_port_resume(to_ata_port(dev), PMSG_AUTO_RESUME);
return 0;
}
static const struct dev_pm_ops ata_port_pm_ops = {
.suspend = ata_port_pm_suspend,
.resume = ata_port_pm_resume,
.freeze = ata_port_pm_freeze,
.thaw = ata_port_pm_resume,
.poweroff = ata_port_pm_poweroff,
.restore = ata_port_pm_resume,
.runtime_suspend = ata_port_runtime_suspend,
.runtime_resume = ata_port_runtime_resume,
.runtime_idle = ata_port_runtime_idle,
};
/* sas ports don't participate in pm runtime management of ata_ports,
* and need to resume ata devices at the domain level, not the per-port
* level. sas suspend/resume is async to allow parallel port recovery
* since sas has multiple ata_port instances per Scsi_Host.
*/
void ata_sas_port_suspend(struct ata_port *ap)
{
ata_port_suspend_async(ap, PMSG_SUSPEND);
}
EXPORT_SYMBOL_GPL(ata_sas_port_suspend);
void ata_sas_port_resume(struct ata_port *ap)
{
ata_port_resume_async(ap, PMSG_RESUME);
}
EXPORT_SYMBOL_GPL(ata_sas_port_resume);
/**
* ata_host_suspend - suspend host
* @host: host to suspend
* @mesg: PM message
*
* Suspend @host. Actual operation is performed by port suspend.
*/
int ata_host_suspend(struct ata_host *host, pm_message_t mesg)
{
host->dev->power.power_state = mesg;
return 0;
}
EXPORT_SYMBOL_GPL(ata_host_suspend);
/**
* ata_host_resume - resume host
* @host: host to resume
*
* Resume @host. Actual operation is performed by port resume.
*/
void ata_host_resume(struct ata_host *host)
{
host->dev->power.power_state = PMSG_ON;
}
EXPORT_SYMBOL_GPL(ata_host_resume);
#endif
const struct device_type ata_port_type = {
.name = "ata_port",
#ifdef CONFIG_PM
.pm = &ata_port_pm_ops,
#endif
};
/**
* ata_dev_init - Initialize an ata_device structure
* @dev: Device structure to initialize
*
* Initialize @dev in preparation for probing.
*
* LOCKING:
* Inherited from caller.
*/
void ata_dev_init(struct ata_device *dev)
{
struct ata_link *link = ata_dev_phys_link(dev);
struct ata_port *ap = link->ap;
unsigned long flags;
/* SATA spd limit is bound to the attached device, reset together */
link->sata_spd_limit = link->hw_sata_spd_limit;
link->sata_spd = 0;
/* High bits of dev->flags are used to record warm plug
* requests which occur asynchronously. Synchronize using
* host lock.
*/
spin_lock_irqsave(ap->lock, flags);
dev->flags &= ~ATA_DFLAG_INIT_MASK;
dev->horkage = 0;
spin_unlock_irqrestore(ap->lock, flags);
memset((void *)dev + ATA_DEVICE_CLEAR_BEGIN, 0,
ATA_DEVICE_CLEAR_END - ATA_DEVICE_CLEAR_BEGIN);
dev->pio_mask = UINT_MAX;
dev->mwdma_mask = UINT_MAX;
dev->udma_mask = UINT_MAX;
}
/**
* ata_link_init - Initialize an ata_link structure
* @ap: ATA port link is attached to
* @link: Link structure to initialize
* @pmp: Port multiplier port number
*
* Initialize @link.
*
* LOCKING:
* Kernel thread context (may sleep)
*/
void ata_link_init(struct ata_port *ap, struct ata_link *link, int pmp)
{
int i;
/* clear everything except for devices */
memset((void *)link + ATA_LINK_CLEAR_BEGIN, 0,
ATA_LINK_CLEAR_END - ATA_LINK_CLEAR_BEGIN);
link->ap = ap;
link->pmp = pmp;
link->active_tag = ATA_TAG_POISON;
link->hw_sata_spd_limit = UINT_MAX;
/* can't use iterator, ap isn't initialized yet */
for (i = 0; i < ATA_MAX_DEVICES; i++) {
struct ata_device *dev = &link->device[i];
dev->link = link;
dev->devno = dev - link->device;
#ifdef CONFIG_ATA_ACPI
dev->gtf_filter = ata_acpi_gtf_filter;
#endif
ata_dev_init(dev);
}
}
/**
* sata_link_init_spd - Initialize link->sata_spd_limit
* @link: Link to configure sata_spd_limit for
*
* Initialize ``link->[hw_]sata_spd_limit`` to the currently
* configured value.
*
* LOCKING:
* Kernel thread context (may sleep).
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int sata_link_init_spd(struct ata_link *link)
{
u8 spd;
int rc;
rc = sata_scr_read(link, SCR_CONTROL, &link->saved_scontrol);
if (rc)
return rc;
spd = (link->saved_scontrol >> 4) & 0xf;
if (spd)
link->hw_sata_spd_limit &= (1 << spd) - 1;
ata_force_link_limits(link);
link->sata_spd_limit = link->hw_sata_spd_limit;
return 0;
}
/**
* ata_port_alloc - allocate and initialize basic ATA port resources
* @host: ATA host this allocated port belongs to
*
* Allocate and initialize basic ATA port resources.
*
* RETURNS:
* Allocate ATA port on success, NULL on failure.
*
* LOCKING:
* Inherited from calling layer (may sleep).
*/
struct ata_port *ata_port_alloc(struct ata_host *host)
{
struct ata_port *ap;
DPRINTK("ENTER\n");
ap = kzalloc(sizeof(*ap), GFP_KERNEL);
if (!ap)
return NULL;
ap->pflags |= ATA_PFLAG_INITIALIZING | ATA_PFLAG_FROZEN;
ap->lock = &host->lock;
ap->print_id = -1;
ap->local_port_no = -1;
ap->host = host;
ap->dev = host->dev;
#if defined(ATA_VERBOSE_DEBUG)
/* turn on all debugging levels */
ap->msg_enable = 0x00FF;
#elif defined(ATA_DEBUG)
ap->msg_enable = ATA_MSG_DRV | ATA_MSG_INFO | ATA_MSG_CTL | ATA_MSG_WARN | ATA_MSG_ERR;
#else
ap->msg_enable = ATA_MSG_DRV | ATA_MSG_ERR | ATA_MSG_WARN;
#endif
mutex_init(&ap->scsi_scan_mutex);
INIT_DELAYED_WORK(&ap->hotplug_task, ata_scsi_hotplug);
INIT_WORK(&ap->scsi_rescan_task, ata_scsi_dev_rescan);
INIT_LIST_HEAD(&ap->eh_done_q);
init_waitqueue_head(&ap->eh_wait_q);
init_completion(&ap->park_req_pending);
timer_setup(&ap->fastdrain_timer, ata_eh_fastdrain_timerfn,
TIMER_DEFERRABLE);
ap->cbl = ATA_CBL_NONE;
ata_link_init(ap, &ap->link, 0);
#ifdef ATA_IRQ_TRAP
ap->stats.unhandled_irq = 1;
ap->stats.idle_irq = 1;
#endif
ata_sff_port_init(ap);
return ap;
}
static void ata_devres_release(struct device *gendev, void *res)
{
struct ata_host *host = dev_get_drvdata(gendev);
int i;
for (i = 0; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
if (!ap)
continue;
if (ap->scsi_host)
scsi_host_put(ap->scsi_host);
}
dev_set_drvdata(gendev, NULL);
ata_host_put(host);
}
static void ata_host_release(struct kref *kref)
{
struct ata_host *host = container_of(kref, struct ata_host, kref);
int i;
for (i = 0; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
kfree(ap->pmp_link);
kfree(ap->slave_link);
kfree(ap);
host->ports[i] = NULL;
}
kfree(host);
}
void ata_host_get(struct ata_host *host)
{
kref_get(&host->kref);
}
void ata_host_put(struct ata_host *host)
{
kref_put(&host->kref, ata_host_release);
}
EXPORT_SYMBOL_GPL(ata_host_put);
/**
* ata_host_alloc - allocate and init basic ATA host resources
* @dev: generic device this host is associated with
* @max_ports: maximum number of ATA ports associated with this host
*
* Allocate and initialize basic ATA host resources. LLD calls
* this function to allocate a host, initializes it fully and
* attaches it using ata_host_register().
*
* @max_ports ports are allocated and host->n_ports is
* initialized to @max_ports. The caller is allowed to decrease
* host->n_ports before calling ata_host_register(). The unused
* ports will be automatically freed on registration.
*
* RETURNS:
* Allocate ATA host on success, NULL on failure.
*
* LOCKING:
* Inherited from calling layer (may sleep).
*/
struct ata_host *ata_host_alloc(struct device *dev, int max_ports)
{
struct ata_host *host;
size_t sz;
int i;
void *dr;
DPRINTK("ENTER\n");
/* alloc a container for our list of ATA ports (buses) */
sz = sizeof(struct ata_host) + (max_ports + 1) * sizeof(void *);
host = kzalloc(sz, GFP_KERNEL);
if (!host)
return NULL;
if (!devres_open_group(dev, NULL, GFP_KERNEL))
goto err_free;
dr = devres_alloc(ata_devres_release, 0, GFP_KERNEL);
if (!dr)
goto err_out;
devres_add(dev, dr);
dev_set_drvdata(dev, host);
spin_lock_init(&host->lock);
mutex_init(&host->eh_mutex);
host->dev = dev;
host->n_ports = max_ports;
kref_init(&host->kref);
/* allocate ports bound to this host */
for (i = 0; i < max_ports; i++) {
struct ata_port *ap;
ap = ata_port_alloc(host);
if (!ap)
goto err_out;
ap->port_no = i;
host->ports[i] = ap;
}
devres_remove_group(dev, NULL);
return host;
err_out:
devres_release_group(dev, NULL);
err_free:
kfree(host);
return NULL;
}
EXPORT_SYMBOL_GPL(ata_host_alloc);
/**
* ata_host_alloc_pinfo - alloc host and init with port_info array
* @dev: generic device this host is associated with
* @ppi: array of ATA port_info to initialize host with
* @n_ports: number of ATA ports attached to this host
*
* Allocate ATA host and initialize with info from @ppi. If NULL
* terminated, @ppi may contain fewer entries than @n_ports. The
* last entry will be used for the remaining ports.
*
* RETURNS:
* Allocate ATA host on success, NULL on failure.
*
* LOCKING:
* Inherited from calling layer (may sleep).
*/
struct ata_host *ata_host_alloc_pinfo(struct device *dev,
const struct ata_port_info * const * ppi,
int n_ports)
{
const struct ata_port_info *pi;
struct ata_host *host;
int i, j;
host = ata_host_alloc(dev, n_ports);
if (!host)
return NULL;
for (i = 0, j = 0, pi = NULL; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
if (ppi[j])
pi = ppi[j++];
ap->pio_mask = pi->pio_mask;
ap->mwdma_mask = pi->mwdma_mask;
ap->udma_mask = pi->udma_mask;
ap->flags |= pi->flags;
ap->link.flags |= pi->link_flags;
ap->ops = pi->port_ops;
if (!host->ops && (pi->port_ops != &ata_dummy_port_ops))
host->ops = pi->port_ops;
}
return host;
}
EXPORT_SYMBOL_GPL(ata_host_alloc_pinfo);
static void ata_host_stop(struct device *gendev, void *res)
{
struct ata_host *host = dev_get_drvdata(gendev);
int i;
WARN_ON(!(host->flags & ATA_HOST_STARTED));
for (i = 0; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
if (ap->ops->port_stop)
ap->ops->port_stop(ap);
}
if (host->ops->host_stop)
host->ops->host_stop(host);
}
/**
* ata_finalize_port_ops - finalize ata_port_operations
* @ops: ata_port_operations to finalize
*
* An ata_port_operations can inherit from another ops and that
* ops can again inherit from another. This can go on as many
* times as necessary as long as there is no loop in the
* inheritance chain.
*
* Ops tables are finalized when the host is started. NULL or
* unspecified entries are inherited from the closet ancestor
* which has the method and the entry is populated with it.
* After finalization, the ops table directly points to all the
* methods and ->inherits is no longer necessary and cleared.
*
* Using ATA_OP_NULL, inheriting ops can force a method to NULL.
*
* LOCKING:
* None.
*/
static void ata_finalize_port_ops(struct ata_port_operations *ops)
{
static DEFINE_SPINLOCK(lock);
const struct ata_port_operations *cur;
void **begin = (void **)ops;
void **end = (void **)&ops->inherits;
void **pp;
if (!ops || !ops->inherits)
return;
spin_lock(&lock);
for (cur = ops->inherits; cur; cur = cur->inherits) {
void **inherit = (void **)cur;
for (pp = begin; pp < end; pp++, inherit++)
if (!*pp)
*pp = *inherit;
}
for (pp = begin; pp < end; pp++)
if (IS_ERR(*pp))
*pp = NULL;
ops->inherits = NULL;
spin_unlock(&lock);
}
/**
* ata_host_start - start and freeze ports of an ATA host
* @host: ATA host to start ports for
*
* Start and then freeze ports of @host. Started status is
* recorded in host->flags, so this function can be called
* multiple times. Ports are guaranteed to get started only
* once. If host->ops isn't initialized yet, its set to the
* first non-dummy port ops.
*
* LOCKING:
* Inherited from calling layer (may sleep).
*
* RETURNS:
* 0 if all ports are started successfully, -errno otherwise.
*/
int ata_host_start(struct ata_host *host)
{
int have_stop = 0;
void *start_dr = NULL;
int i, rc;
if (host->flags & ATA_HOST_STARTED)
return 0;
ata_finalize_port_ops(host->ops);
for (i = 0; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
ata_finalize_port_ops(ap->ops);
if (!host->ops && !ata_port_is_dummy(ap))
host->ops = ap->ops;
if (ap->ops->port_stop)
have_stop = 1;
}
if (host->ops && host->ops->host_stop)
have_stop = 1;
if (have_stop) {
start_dr = devres_alloc(ata_host_stop, 0, GFP_KERNEL);
if (!start_dr)
return -ENOMEM;
}
for (i = 0; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
if (ap->ops->port_start) {
rc = ap->ops->port_start(ap);
if (rc) {
if (rc != -ENODEV)
dev_err(host->dev,
"failed to start port %d (errno=%d)\n",
i, rc);
goto err_out;
}
}
ata_eh_freeze_port(ap);
}
if (start_dr)
devres_add(host->dev, start_dr);
host->flags |= ATA_HOST_STARTED;
return 0;
err_out:
while (--i >= 0) {
struct ata_port *ap = host->ports[i];
if (ap->ops->port_stop)
ap->ops->port_stop(ap);
}
devres_free(start_dr);
return rc;
}
EXPORT_SYMBOL_GPL(ata_host_start);
/**
* ata_host_init - Initialize a host struct for sas (ipr, libsas)
* @host: host to initialize
* @dev: device host is attached to
* @ops: port_ops
*
*/
void ata_host_init(struct ata_host *host, struct device *dev,
struct ata_port_operations *ops)
{
spin_lock_init(&host->lock);
mutex_init(&host->eh_mutex);
host->n_tags = ATA_MAX_QUEUE;
host->dev = dev;
host->ops = ops;
kref_init(&host->kref);
}
EXPORT_SYMBOL_GPL(ata_host_init);
void __ata_port_probe(struct ata_port *ap)
{
struct ata_eh_info *ehi = &ap->link.eh_info;
unsigned long flags;
/* kick EH for boot probing */
spin_lock_irqsave(ap->lock, flags);
ehi->probe_mask |= ATA_ALL_DEVICES;
ehi->action |= ATA_EH_RESET;
ehi->flags |= ATA_EHI_NO_AUTOPSY | ATA_EHI_QUIET;
ap->pflags &= ~ATA_PFLAG_INITIALIZING;
ap->pflags |= ATA_PFLAG_LOADING;
ata_port_schedule_eh(ap);
spin_unlock_irqrestore(ap->lock, flags);
}
int ata_port_probe(struct ata_port *ap)
{
int rc = 0;
if (ap->ops->error_handler) {
__ata_port_probe(ap);
ata_port_wait_eh(ap);
} else {
DPRINTK("ata%u: bus probe begin\n", ap->print_id);
rc = ata_bus_probe(ap);
DPRINTK("ata%u: bus probe end\n", ap->print_id);
}
return rc;
}
static void async_port_probe(void *data, async_cookie_t cookie)
{
struct ata_port *ap = data;
/*
* If we're not allowed to scan this host in parallel,
* we need to wait until all previous scans have completed
* before going further.
* Jeff Garzik says this is only within a controller, so we
* don't need to wait for port 0, only for later ports.
*/
if (!(ap->host->flags & ATA_HOST_PARALLEL_SCAN) && ap->port_no != 0)
async_synchronize_cookie(cookie);
(void)ata_port_probe(ap);
/* in order to keep device order, we need to synchronize at this point */
async_synchronize_cookie(cookie);
ata_scsi_scan_host(ap, 1);
}
/**
* ata_host_register - register initialized ATA host
* @host: ATA host to register
* @sht: template for SCSI host
*
* Register initialized ATA host. @host is allocated using
* ata_host_alloc() and fully initialized by LLD. This function
* starts ports, registers @host with ATA and SCSI layers and
* probe registered devices.
*
* LOCKING:
* Inherited from calling layer (may sleep).
*
* RETURNS:
* 0 on success, -errno otherwise.
*/
int ata_host_register(struct ata_host *host, struct scsi_host_template *sht)
{
int i, rc;
host->n_tags = clamp(sht->can_queue, 1, ATA_MAX_QUEUE);
/* host must have been started */
if (!(host->flags & ATA_HOST_STARTED)) {
dev_err(host->dev, "BUG: trying to register unstarted host\n");
WARN_ON(1);
return -EINVAL;
}
/* Blow away unused ports. This happens when LLD can't
* determine the exact number of ports to allocate at
* allocation time.
*/
for (i = host->n_ports; host->ports[i]; i++)
kfree(host->ports[i]);
/* give ports names and add SCSI hosts */
for (i = 0; i < host->n_ports; i++) {
host->ports[i]->print_id = atomic_inc_return(&ata_print_id);
host->ports[i]->local_port_no = i + 1;
}
/* Create associated sysfs transport objects */
for (i = 0; i < host->n_ports; i++) {
rc = ata_tport_add(host->dev,host->ports[i]);
if (rc) {
goto err_tadd;
}
}
rc = ata_scsi_add_hosts(host, sht);
if (rc)
goto err_tadd;
/* set cable, sata_spd_limit and report */
for (i = 0; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
unsigned long xfer_mask;
/* set SATA cable type if still unset */
if (ap->cbl == ATA_CBL_NONE && (ap->flags & ATA_FLAG_SATA))
ap->cbl = ATA_CBL_SATA;
/* init sata_spd_limit to the current value */
sata_link_init_spd(&ap->link);
if (ap->slave_link)
sata_link_init_spd(ap->slave_link);
/* print per-port info to dmesg */
xfer_mask = ata_pack_xfermask(ap->pio_mask, ap->mwdma_mask,
ap->udma_mask);
if (!ata_port_is_dummy(ap)) {
ata_port_info(ap, "%cATA max %s %s\n",
(ap->flags & ATA_FLAG_SATA) ? 'S' : 'P',
ata_mode_string(xfer_mask),
ap->link.eh_info.desc);
ata_ehi_clear_desc(&ap->link.eh_info);
} else
ata_port_info(ap, "DUMMY\n");
}
/* perform each probe asynchronously */
for (i = 0; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
ap->cookie = async_schedule(async_port_probe, ap);
}
return 0;
err_tadd:
while (--i >= 0) {
ata_tport_delete(host->ports[i]);
}
return rc;
}
EXPORT_SYMBOL_GPL(ata_host_register);
/**
* ata_host_activate - start host, request IRQ and register it
* @host: target ATA host
* @irq: IRQ to request
* @irq_handler: irq_handler used when requesting IRQ
* @irq_flags: irq_flags used when requesting IRQ
* @sht: scsi_host_template to use when registering the host
*
* After allocating an ATA host and initializing it, most libata
* LLDs perform three steps to activate the host - start host,
* request IRQ and register it. This helper takes necessary
* arguments and performs the three steps in one go.
*
* An invalid IRQ skips the IRQ registration and expects the host to
* have set polling mode on the port. In this case, @irq_handler
* should be NULL.
*
* LOCKING:
* Inherited from calling layer (may sleep).
*
* RETURNS:
* 0 on success, -errno otherwise.
*/
int ata_host_activate(struct ata_host *host, int irq,
irq_handler_t irq_handler, unsigned long irq_flags,
struct scsi_host_template *sht)
{
int i, rc;
char *irq_desc;
rc = ata_host_start(host);
if (rc)
return rc;
/* Special case for polling mode */
if (!irq) {
WARN_ON(irq_handler);
return ata_host_register(host, sht);
}
irq_desc = devm_kasprintf(host->dev, GFP_KERNEL, "%s[%s]",
dev_driver_string(host->dev),
dev_name(host->dev));
if (!irq_desc)
return -ENOMEM;
rc = devm_request_irq(host->dev, irq, irq_handler, irq_flags,
irq_desc, host);
if (rc)
return rc;
for (i = 0; i < host->n_ports; i++)
ata_port_desc(host->ports[i], "irq %d", irq);
rc = ata_host_register(host, sht);
/* if failed, just free the IRQ and leave ports alone */
if (rc)
devm_free_irq(host->dev, irq, host);
return rc;
}
EXPORT_SYMBOL_GPL(ata_host_activate);
/**
* ata_port_detach - Detach ATA port in preparation of device removal
* @ap: ATA port to be detached
*
* Detach all ATA devices and the associated SCSI devices of @ap;
* then, remove the associated SCSI host. @ap is guaranteed to
* be quiescent on return from this function.
*
* LOCKING:
* Kernel thread context (may sleep).
*/
static void ata_port_detach(struct ata_port *ap)
{
unsigned long flags;
struct ata_link *link;
struct ata_device *dev;
if (!ap->ops->error_handler)
goto skip_eh;
/* tell EH we're leaving & flush EH */
spin_lock_irqsave(ap->lock, flags);
ap->pflags |= ATA_PFLAG_UNLOADING;
ata_port_schedule_eh(ap);
spin_unlock_irqrestore(ap->lock, flags);
/* wait till EH commits suicide */
ata_port_wait_eh(ap);
/* it better be dead now */
WARN_ON(!(ap->pflags & ATA_PFLAG_UNLOADED));
cancel_delayed_work_sync(&ap->hotplug_task);
skip_eh:
/* clean up zpodd on port removal */
ata_for_each_link(link, ap, HOST_FIRST) {
ata_for_each_dev(dev, link, ALL) {
if (zpodd_dev_enabled(dev))
zpodd_exit(dev);
}
}
if (ap->pmp_link) {
int i;
for (i = 0; i < SATA_PMP_MAX_PORTS; i++)
ata_tlink_delete(&ap->pmp_link[i]);
}
/* remove the associated SCSI host */
scsi_remove_host(ap->scsi_host);
ata_tport_delete(ap);
}
/**
* ata_host_detach - Detach all ports of an ATA host
* @host: Host to detach
*
* Detach all ports of @host.
*
* LOCKING:
* Kernel thread context (may sleep).
*/
void ata_host_detach(struct ata_host *host)
{
int i;
for (i = 0; i < host->n_ports; i++) {
/* Ensure ata_port probe has completed */
async_synchronize_cookie(host->ports[i]->cookie + 1);
ata_port_detach(host->ports[i]);
}
/* the host is dead now, dissociate ACPI */
ata_acpi_dissociate(host);
}
EXPORT_SYMBOL_GPL(ata_host_detach);
#ifdef CONFIG_PCI
/**
* ata_pci_remove_one - PCI layer callback for device removal
* @pdev: PCI device that was removed
*
* PCI layer indicates to libata via this hook that hot-unplug or
* module unload event has occurred. Detach all ports. Resource
* release is handled via devres.
*
* LOCKING:
* Inherited from PCI layer (may sleep).
*/
void ata_pci_remove_one(struct pci_dev *pdev)
{
struct ata_host *host = pci_get_drvdata(pdev);
ata_host_detach(host);
}
EXPORT_SYMBOL_GPL(ata_pci_remove_one);
void ata_pci_shutdown_one(struct pci_dev *pdev)
{
struct ata_host *host = pci_get_drvdata(pdev);
int i;
for (i = 0; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
ap->pflags |= ATA_PFLAG_FROZEN;
/* Disable port interrupts */
if (ap->ops->freeze)
ap->ops->freeze(ap);
/* Stop the port DMA engines */
if (ap->ops->port_stop)
ap->ops->port_stop(ap);
}
}
EXPORT_SYMBOL_GPL(ata_pci_shutdown_one);
/* move to PCI subsystem */
int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits)
{
unsigned long tmp = 0;
switch (bits->width) {
case 1: {
u8 tmp8 = 0;
pci_read_config_byte(pdev, bits->reg, &tmp8);
tmp = tmp8;
break;
}
case 2: {
u16 tmp16 = 0;
pci_read_config_word(pdev, bits->reg, &tmp16);
tmp = tmp16;
break;
}
case 4: {
u32 tmp32 = 0;
pci_read_config_dword(pdev, bits->reg, &tmp32);
tmp = tmp32;
break;
}
default:
return -EINVAL;
}
tmp &= bits->mask;
return (tmp == bits->val) ? 1 : 0;
}
EXPORT_SYMBOL_GPL(pci_test_config_bits);
#ifdef CONFIG_PM
void ata_pci_device_do_suspend(struct pci_dev *pdev, pm_message_t mesg)
{
pci_save_state(pdev);
pci_disable_device(pdev);
if (mesg.event & PM_EVENT_SLEEP)
pci_set_power_state(pdev, PCI_D3hot);
}
EXPORT_SYMBOL_GPL(ata_pci_device_do_suspend);
int ata_pci_device_do_resume(struct pci_dev *pdev)
{
int rc;
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
rc = pcim_enable_device(pdev);
if (rc) {
dev_err(&pdev->dev,
"failed to enable device after resume (%d)\n", rc);
return rc;
}
pci_set_master(pdev);
return 0;
}
EXPORT_SYMBOL_GPL(ata_pci_device_do_resume);
int ata_pci_device_suspend(struct pci_dev *pdev, pm_message_t mesg)
{
struct ata_host *host = pci_get_drvdata(pdev);
int rc = 0;
rc = ata_host_suspend(host, mesg);
if (rc)
return rc;
ata_pci_device_do_suspend(pdev, mesg);
return 0;
}
EXPORT_SYMBOL_GPL(ata_pci_device_suspend);
int ata_pci_device_resume(struct pci_dev *pdev)
{
struct ata_host *host = pci_get_drvdata(pdev);
int rc;
rc = ata_pci_device_do_resume(pdev);
if (rc == 0)
ata_host_resume(host);
return rc;
}
EXPORT_SYMBOL_GPL(ata_pci_device_resume);
#endif /* CONFIG_PM */
#endif /* CONFIG_PCI */
/**
* ata_platform_remove_one - Platform layer callback for device removal
* @pdev: Platform device that was removed
*
* Platform layer indicates to libata via this hook that hot-unplug or
* module unload event has occurred. Detach all ports. Resource
* release is handled via devres.
*
* LOCKING:
* Inherited from platform layer (may sleep).
*/
int ata_platform_remove_one(struct platform_device *pdev)
{
struct ata_host *host = platform_get_drvdata(pdev);
ata_host_detach(host);
return 0;
}
EXPORT_SYMBOL_GPL(ata_platform_remove_one);
#ifdef CONFIG_ATA_FORCE
static int __init ata_parse_force_one(char **cur,
struct ata_force_ent *force_ent,
const char **reason)
{
static const struct ata_force_param force_tbl[] __initconst = {
{ "40c", .cbl = ATA_CBL_PATA40 },
{ "80c", .cbl = ATA_CBL_PATA80 },
{ "short40c", .cbl = ATA_CBL_PATA40_SHORT },
{ "unk", .cbl = ATA_CBL_PATA_UNK },
{ "ign", .cbl = ATA_CBL_PATA_IGN },
{ "sata", .cbl = ATA_CBL_SATA },
{ "1.5Gbps", .spd_limit = 1 },
{ "3.0Gbps", .spd_limit = 2 },
{ "noncq", .horkage_on = ATA_HORKAGE_NONCQ },
{ "ncq", .horkage_off = ATA_HORKAGE_NONCQ },
{ "noncqtrim", .horkage_on = ATA_HORKAGE_NO_NCQ_TRIM },
{ "ncqtrim", .horkage_off = ATA_HORKAGE_NO_NCQ_TRIM },
{ "noncqati", .horkage_on = ATA_HORKAGE_NO_NCQ_ON_ATI },
{ "ncqati", .horkage_off = ATA_HORKAGE_NO_NCQ_ON_ATI },
{ "dump_id", .horkage_on = ATA_HORKAGE_DUMP_ID },
{ "pio0", .xfer_mask = 1 << (ATA_SHIFT_PIO + 0) },
{ "pio1", .xfer_mask = 1 << (ATA_SHIFT_PIO + 1) },
{ "pio2", .xfer_mask = 1 << (ATA_SHIFT_PIO + 2) },
{ "pio3", .xfer_mask = 1 << (ATA_SHIFT_PIO + 3) },
{ "pio4", .xfer_mask = 1 << (ATA_SHIFT_PIO + 4) },
{ "pio5", .xfer_mask = 1 << (ATA_SHIFT_PIO + 5) },
{ "pio6", .xfer_mask = 1 << (ATA_SHIFT_PIO + 6) },
{ "mwdma0", .xfer_mask = 1 << (ATA_SHIFT_MWDMA + 0) },
{ "mwdma1", .xfer_mask = 1 << (ATA_SHIFT_MWDMA + 1) },
{ "mwdma2", .xfer_mask = 1 << (ATA_SHIFT_MWDMA + 2) },
{ "mwdma3", .xfer_mask = 1 << (ATA_SHIFT_MWDMA + 3) },
{ "mwdma4", .xfer_mask = 1 << (ATA_SHIFT_MWDMA + 4) },
{ "udma0", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 0) },
{ "udma16", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 0) },
{ "udma/16", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 0) },
{ "udma1", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 1) },
{ "udma25", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 1) },
{ "udma/25", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 1) },
{ "udma2", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 2) },
{ "udma33", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 2) },
{ "udma/33", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 2) },
{ "udma3", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 3) },
{ "udma44", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 3) },
{ "udma/44", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 3) },
{ "udma4", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 4) },
{ "udma66", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 4) },
{ "udma/66", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 4) },
{ "udma5", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 5) },
{ "udma100", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 5) },
{ "udma/100", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 5) },
{ "udma6", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 6) },
{ "udma133", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 6) },
{ "udma/133", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 6) },
{ "udma7", .xfer_mask = 1 << (ATA_SHIFT_UDMA + 7) },
{ "nohrst", .lflags = ATA_LFLAG_NO_HRST },
{ "nosrst", .lflags = ATA_LFLAG_NO_SRST },
{ "norst", .lflags = ATA_LFLAG_NO_HRST | ATA_LFLAG_NO_SRST },
{ "rstonce", .lflags = ATA_LFLAG_RST_ONCE },
{ "atapi_dmadir", .horkage_on = ATA_HORKAGE_ATAPI_DMADIR },
{ "disable", .horkage_on = ATA_HORKAGE_DISABLE },
};
char *start = *cur, *p = *cur;
char *id, *val, *endp;
const struct ata_force_param *match_fp = NULL;
int nr_matches = 0, i;
/* find where this param ends and update *cur */
while (*p != '\0' && *p != ',')
p++;
if (*p == '\0')
*cur = p;
else
*cur = p + 1;
*p = '\0';
/* parse */
p = strchr(start, ':');
if (!p) {
val = strstrip(start);
goto parse_val;
}
*p = '\0';
id = strstrip(start);
val = strstrip(p + 1);
/* parse id */
p = strchr(id, '.');
if (p) {
*p++ = '\0';
force_ent->device = simple_strtoul(p, &endp, 10);
if (p == endp || *endp != '\0') {
*reason = "invalid device";
return -EINVAL;
}
}
force_ent->port = simple_strtoul(id, &endp, 10);
if (id == endp || *endp != '\0') {
*reason = "invalid port/link";
return -EINVAL;
}
parse_val:
/* parse val, allow shortcuts so that both 1.5 and 1.5Gbps work */
for (i = 0; i < ARRAY_SIZE(force_tbl); i++) {
const struct ata_force_param *fp = &force_tbl[i];
if (strncasecmp(val, fp->name, strlen(val)))
continue;
nr_matches++;
match_fp = fp;
if (strcasecmp(val, fp->name) == 0) {
nr_matches = 1;
break;
}
}
if (!nr_matches) {
*reason = "unknown value";
return -EINVAL;
}
if (nr_matches > 1) {
*reason = "ambiguous value";
return -EINVAL;
}
force_ent->param = *match_fp;
return 0;
}
static void __init ata_parse_force_param(void)
{
int idx = 0, size = 1;
int last_port = -1, last_device = -1;
char *p, *cur, *next;
/* calculate maximum number of params and allocate force_tbl */
for (p = ata_force_param_buf; *p; p++)
if (*p == ',')
size++;
ata_force_tbl = kcalloc(size, sizeof(ata_force_tbl[0]), GFP_KERNEL);
if (!ata_force_tbl) {
printk(KERN_WARNING "ata: failed to extend force table, "
"libata.force ignored\n");
return;
}
/* parse and populate the table */
for (cur = ata_force_param_buf; *cur != '\0'; cur = next) {
const char *reason = "";
struct ata_force_ent te = { .port = -1, .device = -1 };
next = cur;
if (ata_parse_force_one(&next, &te, &reason)) {
printk(KERN_WARNING "ata: failed to parse force "
"parameter \"%s\" (%s)\n",
cur, reason);
continue;
}
if (te.port == -1) {
te.port = last_port;
te.device = last_device;
}
ata_force_tbl[idx++] = te;
last_port = te.port;
last_device = te.device;
}
ata_force_tbl_size = idx;
}
static void ata_free_force_param(void)
{
kfree(ata_force_tbl);
}
#else
static inline void ata_parse_force_param(void) { }
static inline void ata_free_force_param(void) { }
#endif
static int __init ata_init(void)
{
int rc;
ata_parse_force_param();
rc = ata_sff_init();
if (rc) {
ata_free_force_param();
return rc;
}
libata_transport_init();
ata_scsi_transport_template = ata_attach_transport();
if (!ata_scsi_transport_template) {
ata_sff_exit();
rc = -ENOMEM;
goto err_out;
}
printk(KERN_DEBUG "libata version " DRV_VERSION " loaded.\n");
return 0;
err_out:
return rc;
}
static void __exit ata_exit(void)
{
ata_release_transport(ata_scsi_transport_template);
libata_transport_exit();
ata_sff_exit();
ata_free_force_param();
}
subsys_initcall(ata_init);
module_exit(ata_exit);
static DEFINE_RATELIMIT_STATE(ratelimit, HZ / 5, 1);
int ata_ratelimit(void)
{
return __ratelimit(&ratelimit);
}
EXPORT_SYMBOL_GPL(ata_ratelimit);
/**
* ata_msleep - ATA EH owner aware msleep
* @ap: ATA port to attribute the sleep to
* @msecs: duration to sleep in milliseconds
*
* Sleeps @msecs. If the current task is owner of @ap's EH, the
* ownership is released before going to sleep and reacquired
* after the sleep is complete. IOW, other ports sharing the
* @ap->host will be allowed to own the EH while this task is
* sleeping.
*
* LOCKING:
* Might sleep.
*/
void ata_msleep(struct ata_port *ap, unsigned int msecs)
{
bool owns_eh = ap && ap->host->eh_owner == current;
if (owns_eh)
ata_eh_release(ap);
if (msecs < 20) {
unsigned long usecs = msecs * USEC_PER_MSEC;
usleep_range(usecs, usecs + 50);
} else {
msleep(msecs);
}
if (owns_eh)
ata_eh_acquire(ap);
}
EXPORT_SYMBOL_GPL(ata_msleep);
/**
* ata_wait_register - wait until register value changes
* @ap: ATA port to wait register for, can be NULL
* @reg: IO-mapped register
* @mask: Mask to apply to read register value
* @val: Wait condition
* @interval: polling interval in milliseconds
* @timeout: timeout in milliseconds
*
* Waiting for some bits of register to change is a common
* operation for ATA controllers. This function reads 32bit LE
* IO-mapped register @reg and tests for the following condition.
*
* (*@reg & mask) != val
*
* If the condition is met, it returns; otherwise, the process is
* repeated after @interval_msec until timeout.
*
* LOCKING:
* Kernel thread context (may sleep)
*
* RETURNS:
* The final register value.
*/
u32 ata_wait_register(struct ata_port *ap, void __iomem *reg, u32 mask, u32 val,
unsigned long interval, unsigned long timeout)
{
unsigned long deadline;
u32 tmp;
tmp = ioread32(reg);
/* Calculate timeout _after_ the first read to make sure
* preceding writes reach the controller before starting to
* eat away the timeout.
*/
deadline = ata_deadline(jiffies, timeout);
while ((tmp & mask) == val && time_before(jiffies, deadline)) {
ata_msleep(ap, interval);
tmp = ioread32(reg);
}
return tmp;
}
EXPORT_SYMBOL_GPL(ata_wait_register);
/*
* Dummy port_ops
*/
static unsigned int ata_dummy_qc_issue(struct ata_queued_cmd *qc)
{
return AC_ERR_SYSTEM;
}
static void ata_dummy_error_handler(struct ata_port *ap)
{
/* truly dummy */
}
struct ata_port_operations ata_dummy_port_ops = {
.qc_prep = ata_noop_qc_prep,
.qc_issue = ata_dummy_qc_issue,
.error_handler = ata_dummy_error_handler,
.sched_eh = ata_std_sched_eh,
.end_eh = ata_std_end_eh,
};
EXPORT_SYMBOL_GPL(ata_dummy_port_ops);
const struct ata_port_info ata_dummy_port_info = {
.port_ops = &ata_dummy_port_ops,
};
EXPORT_SYMBOL_GPL(ata_dummy_port_info);
/*
* Utility print functions
*/
void ata_port_printk(const struct ata_port *ap, const char *level,
const char *fmt, ...)
{
struct va_format vaf;
va_list args;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
printk("%sata%u: %pV", level, ap->print_id, &vaf);
va_end(args);
}
EXPORT_SYMBOL(ata_port_printk);
void ata_link_printk(const struct ata_link *link, const char *level,
const char *fmt, ...)
{
struct va_format vaf;
va_list args;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
if (sata_pmp_attached(link->ap) || link->ap->slave_link)
printk("%sata%u.%02u: %pV",
level, link->ap->print_id, link->pmp, &vaf);
else
printk("%sata%u: %pV",
level, link->ap->print_id, &vaf);
va_end(args);
}
EXPORT_SYMBOL(ata_link_printk);
void ata_dev_printk(const struct ata_device *dev, const char *level,
const char *fmt, ...)
{
struct va_format vaf;
va_list args;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
printk("%sata%u.%02u: %pV",
level, dev->link->ap->print_id, dev->link->pmp + dev->devno,
&vaf);
va_end(args);
}
EXPORT_SYMBOL(ata_dev_printk);
void ata_print_version(const struct device *dev, const char *version)
{
dev_printk(KERN_DEBUG, dev, "version %s\n", version);
}
EXPORT_SYMBOL(ata_print_version);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_PGTABLE_INVERT_H
#define _ASM_PGTABLE_INVERT_H 1
#ifndef __ASSEMBLY__
/*
* A clear pte value is special, and doesn't get inverted.
*
* Note that even users that only pass a pgprot_t (rather
* than a full pte) won't trigger the special zero case,
* because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED
* set. So the all zero case really is limited to just the
* cleared page table entry case.
*/
static inline bool __pte_needs_invert(u64 val)
{
return val && !(val & _PAGE_PRESENT);
}
/* Get a mask to xor with the page table entry to get the correct pfn. */
static inline u64 protnone_mask(u64 val)
{
return __pte_needs_invert(val) ? ~0ull : 0;
}
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
{
/*
* When a PTE transitions from NONE to !NONE or vice-versa
* invert the PFN part to stop speculation.
* pte_pfn undoes this when needed.
*/
if (__pte_needs_invert(oldval) != __pte_needs_invert(val))
val = (val & ~mask) | (~val & mask);
return val;
}
#endif /* __ASSEMBLY__ */
#endif
// SPDX-License-Identifier: GPL-2.0-only
/*
* IPv6 library code, needed by static components when full IPv6 support is
* not configured or static.
*/
#include <linux/export.h>
#include <net/ipv6.h>
/*
* find out if nexthdr is a well-known extension header or a protocol
*/
bool ipv6_ext_hdr(u8 nexthdr)
{
/*
* find out if nexthdr is an extension header or a protocol
*/
return (nexthdr == NEXTHDR_HOP) ||
(nexthdr == NEXTHDR_ROUTING) ||
(nexthdr == NEXTHDR_FRAGMENT) ||
(nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) ||
(nexthdr == NEXTHDR_DEST);
}
EXPORT_SYMBOL(ipv6_ext_hdr);
/*
* Skip any extension headers. This is used by the ICMP module.
*
* Note that strictly speaking this conflicts with RFC 2460 4.0:
* ...The contents and semantics of each extension header determine whether
* or not to proceed to the next header. Therefore, extension headers must
* be processed strictly in the order they appear in the packet; a
* receiver must not, for example, scan through a packet looking for a
* particular kind of extension header and process that header prior to
* processing all preceding ones.
*
* We do exactly this. This is a protocol bug. We can't decide after a
* seeing an unknown discard-with-error flavour TLV option if it's a
* ICMP error message or not (errors should never be send in reply to
* ICMP error messages).
*
* But I see no other way to do this. This might need to be reexamined
* when Linux implements ESP (and maybe AUTH) headers.
* --AK
*
* This function parses (probably truncated) exthdr set "hdr".
* "nexthdrp" initially points to some place,
* where type of the first header can be found.
*
* It skips all well-known exthdrs, and returns pointer to the start
* of unparsable area i.e. the first header with unknown type.
* If it is not NULL *nexthdr is updated by type/protocol of this header.
*
* NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL.
* - it may return pointer pointing beyond end of packet,
* if the last recognized header is truncated in the middle.
* - if packet is truncated, so that all parsed headers are skipped,
* it returns NULL.
* - First fragment header is skipped, not-first ones
* are considered as unparsable.
* - Reports the offset field of the final fragment header so it is
* possible to tell whether this is a first fragment, later fragment,
* or not fragmented.
* - ESP is unparsable for now and considered like
* normal payload protocol.
* - Note also special handling of AUTH header. Thanks to IPsec wizards.
*
* --ANK (980726)
*/
int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
__be16 *frag_offp)
{
u8 nexthdr = *nexthdrp;
*frag_offp = 0;
while (ipv6_ext_hdr(nexthdr)) {
struct ipv6_opt_hdr _hdr, *hp;
int hdrlen;
if (nexthdr == NEXTHDR_NONE)
return -1;
hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
if (!hp)
return -1;
if (nexthdr == NEXTHDR_FRAGMENT) {
__be16 _frag_off, *fp;
fp = skb_header_pointer(skb,
start+offsetof(struct frag_hdr,
frag_off),
sizeof(_frag_off),
&_frag_off);
if (!fp)
return -1; *frag_offp = *fp;
if (ntohs(*frag_offp) & ~0x7)
break;
hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(hp);
else
hdrlen = ipv6_optlen(hp); nexthdr = hp->nexthdr;
start += hdrlen;
}
*nexthdrp = nexthdr; return start;
}
EXPORT_SYMBOL(ipv6_skip_exthdr);
int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type)
{
const unsigned char *nh = skb_network_header(skb);
int packet_len = skb_tail_pointer(skb) - skb_network_header(skb);
struct ipv6_opt_hdr *hdr;
int len;
if (offset + 2 > packet_len)
goto bad;
hdr = (struct ipv6_opt_hdr *)(nh + offset);
len = ((hdr->hdrlen + 1) << 3);
if (offset + len > packet_len)
goto bad;
offset += 2;
len -= 2;
while (len > 0) {
int opttype = nh[offset];
int optlen;
if (opttype == type)
return offset;
switch (opttype) {
case IPV6_TLV_PAD1:
optlen = 1;
break;
default:
optlen = nh[offset + 1] + 2;
if (optlen > len)
goto bad;
break;
}
offset += optlen;
len -= optlen;
}
/* not_found */
bad:
return -1;
}
EXPORT_SYMBOL_GPL(ipv6_find_tlv);
/*
* find the offset to specified header or the protocol number of last header
* if target < 0. "last header" is transport protocol header, ESP, or
* "No next header".
*
* Note that *offset is used as input/output parameter, and if it is not zero,
* then it must be a valid offset to an inner IPv6 header. This can be used
* to explore inner IPv6 header, eg. ICMPv6 error messages.
*
* If target header is found, its offset is set in *offset and return protocol
* number. Otherwise, return -1.
*
* If the first fragment doesn't contain the final protocol header or
* NEXTHDR_NONE it is considered invalid.
*
* Note that non-1st fragment is special case that "the protocol number
* of last header" is "next header" field in Fragment header. In this case,
* *offset is meaningless and fragment offset is stored in *fragoff if fragoff
* isn't NULL.
*
* if flags is not NULL and it's a fragment, then the frag flag
* IP6_FH_F_FRAG will be set. If it's an AH header, the
* IP6_FH_F_AUTH flag is set and target < 0, then this function will
* stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this
* function will skip all those routing headers, where segements_left was 0.
*/
int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
int target, unsigned short *fragoff, int *flags)
{
unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
u8 nexthdr = ipv6_hdr(skb)->nexthdr;
bool found;
if (fragoff)
*fragoff = 0;
if (*offset) {
struct ipv6hdr _ip6, *ip6;
ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6);
if (!ip6 || (ip6->version != 6))
return -EBADMSG;
start = *offset + sizeof(struct ipv6hdr);
nexthdr = ip6->nexthdr;
}
do {
struct ipv6_opt_hdr _hdr, *hp;
unsigned int hdrlen;
found = (nexthdr == target);
if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) {
if (target < 0 || found)
break;
return -ENOENT;
}
hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
if (!hp)
return -EBADMSG;
if (nexthdr == NEXTHDR_ROUTING) {
struct ipv6_rt_hdr _rh, *rh;
rh = skb_header_pointer(skb, start, sizeof(_rh),
&_rh);
if (!rh)
return -EBADMSG;
if (flags && (*flags & IP6_FH_F_SKIP_RH) &&
rh->segments_left == 0)
found = false;
}
if (nexthdr == NEXTHDR_FRAGMENT) {
unsigned short _frag_off;
__be16 *fp;
if (flags) /* Indicate that this is a fragment */
*flags |= IP6_FH_F_FRAG;
fp = skb_header_pointer(skb,
start+offsetof(struct frag_hdr,
frag_off),
sizeof(_frag_off),
&_frag_off);
if (!fp)
return -EBADMSG;
_frag_off = ntohs(*fp) & ~0x7;
if (_frag_off) {
if (target < 0 &&
((!ipv6_ext_hdr(hp->nexthdr)) ||
hp->nexthdr == NEXTHDR_NONE)) {
if (fragoff)
*fragoff = _frag_off;
return hp->nexthdr;
}
if (!found)
return -ENOENT;
if (fragoff)
*fragoff = _frag_off;
break;
}
hdrlen = 8;
} else if (nexthdr == NEXTHDR_AUTH) {
if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0))
break;
hdrlen = ipv6_authlen(hp);
} else
hdrlen = ipv6_optlen(hp);
if (!found) {
nexthdr = hp->nexthdr;
start += hdrlen;
}
} while (!found);
*offset = start;
return nexthdr;
}
EXPORT_SYMBOL(ipv6_find_hdr);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_NULLS_H
#define _LINUX_RCULIST_NULLS_H
#ifdef __KERNEL__
/*
* RCU-protected list version
*/
#include <linux/list_nulls.h>
#include <linux/rcupdate.h>
/**
* hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization
* @n: the element to delete from the hash list.
*
* Note: hlist_nulls_unhashed() on the node return true after this. It is
* useful for RCU based read lockfree traversal if the writer side
* must know if the list entry is still hashed or already unhashed.
*
* In particular, it means that we can not poison the forward pointers
* that may still be used for walking the hash list and we can only
* zero the pprev pointer so list_unhashed() will return true after
* this.
*
* The caller must take whatever precautions are necessary (such as
* holding appropriate locks) to avoid racing with another
* list-mutation primitive, such as hlist_nulls_add_head_rcu() or
* hlist_nulls_del_rcu(), running on this same list. However, it is
* perfectly legal to run concurrently with the _rcu list-traversal
* primitives, such as hlist_nulls_for_each_entry_rcu().
*/
static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
{
if (!hlist_nulls_unhashed(n)) {
__hlist_nulls_del(n); WRITE_ONCE(n->pprev, NULL);
}
}
/**
* hlist_nulls_first_rcu - returns the first element of the hash list.
* @head: the head of the list.
*/
#define hlist_nulls_first_rcu(head) \
(*((struct hlist_nulls_node __rcu __force **)&(head)->first))
/**
* hlist_nulls_next_rcu - returns the element of the list after @node.
* @node: element of the list.
*/
#define hlist_nulls_next_rcu(node) \
(*((struct hlist_nulls_node __rcu __force **)&(node)->next))
/**
* hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
* @n: the element to delete from the hash list.
*
* Note: hlist_nulls_unhashed() on entry does not return true after this,
* the entry is in an undefined state. It is useful for RCU based
* lockfree traversal.
*
* In particular, it means that we can not poison the forward
* pointers that may still be used for walking the hash list.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
* or hlist_nulls_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* hlist_nulls_for_each_entry().
*/
static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
{
__hlist_nulls_del(n);
WRITE_ONCE(n->pprev, LIST_POISON2);
}
/**
* hlist_nulls_add_head_rcu
* @n: the element to add to the hash list.
* @h: the list to add to.
*
* Description:
* Adds the specified element to the specified hlist_nulls,
* while permitting racing traversals.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
* or hlist_nulls_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
* problems on Alpha CPUs. Regardless of the type of CPU, the
* list-traversal primitive must be guarded by rcu_read_lock().
*/
static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
struct hlist_nulls_head *h)
{
struct hlist_nulls_node *first = h->first;
n->next = first;
WRITE_ONCE(n->pprev, &h->first);
rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
if (!is_a_nulls(first))
WRITE_ONCE(first->pprev, &n->next);
}
/**
* hlist_nulls_add_tail_rcu
* @n: the element to add to the hash list.
* @h: the list to add to.
*
* Description:
* Adds the specified element to the specified hlist_nulls,
* while permitting racing traversals.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
* or hlist_nulls_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
* problems on Alpha CPUs. Regardless of the type of CPU, the
* list-traversal primitive must be guarded by rcu_read_lock().
*/
static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
struct hlist_nulls_head *h)
{
struct hlist_nulls_node *i, *last = NULL;
/* Note: write side code, so rcu accessors are not needed. */
for (i = h->first; !is_a_nulls(i); i = i->next)
last = i;
if (last) {
n->next = last->next;
n->pprev = &last->next;
rcu_assign_pointer(hlist_next_rcu(last), n);
} else {
hlist_nulls_add_head_rcu(n, h);
}
}
/* after that hlist_nulls_del will work */
static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
{
n->pprev = &n->next;
n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL);
}
/**
* hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct hlist_nulls_node to use as a loop cursor.
* @head: the head of the list.
* @member: the name of the hlist_nulls_node within the struct.
*
* The barrier() is needed to make sure compiler doesn't cache first element [1],
* as this loop can be restarted [2]
* [1] Documentation/memory-barriers.txt around line 1533
* [2] Documentation/RCU/rculist_nulls.rst around line 146
*/
#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
for (({barrier();}), \
pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \
(!is_a_nulls(pos)) && \
({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))
/**
* hlist_nulls_for_each_entry_safe -
* iterate over list of given type safe against removal of list entry
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct hlist_nulls_node to use as a loop cursor.
* @head: the head of the list.
* @member: the name of the hlist_nulls_node within the struct.
*/
#define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \
for (({barrier();}), \
pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \
(!is_a_nulls(pos)) && \
({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); \
pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });)
#endif
#endif
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright (C) 2016 - 2020 Christoph Hellwig
*/
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/cleancache.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include "../fs/internal.h"
#include "blk.h"
struct bdev_inode {
struct block_device bdev;
struct inode vfs_inode;
};
static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
return container_of(inode, struct bdev_inode, vfs_inode);
}
struct block_device *I_BDEV(struct inode *inode)
{
return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);
static void bdev_write_inode(struct block_device *bdev)
{
struct inode *inode = bdev->bd_inode;
int ret;
spin_lock(&inode->i_lock);
while (inode->i_state & I_DIRTY) {
spin_unlock(&inode->i_lock);
ret = write_inode_now(inode, true);
if (ret) {
char name[BDEVNAME_SIZE];
pr_warn_ratelimited("VFS: Dirty inode writeback failed "
"for block device %s (err=%d).\n",
bdevname(bdev, name), ret);
}
spin_lock(&inode->i_lock);
}
spin_unlock(&inode->i_lock);
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
static void kill_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
if (mapping_empty(mapping))
return;
invalidate_bh_lrus();
truncate_inode_pages(mapping, 0);
}
/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
if (mapping->nrpages) {
invalidate_bh_lrus();
lru_add_drain_all(); /* make sure all lru add caches are flushed */
invalidate_mapping_pages(mapping, 0, -1);
}
/* 99% of the time, we don't need to flush the cleancache on the bdev.
* But, for the strange corners, lets be cautious
*/
cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);
/*
* Drop all buffers & page cache for given bdev range. This function bails
* with error if bdev has other exclusive owner (such as filesystem).
*/
int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
loff_t lstart, loff_t lend)
{
/*
* If we don't hold exclusive handle for the device, upgrade to it
* while we discard the buffer cache to avoid discarding buffers
* under live filesystem.
*/
if (!(mode & FMODE_EXCL)) { int err = bd_prepare_to_claim(bdev, truncate_bdev_range);
if (err)
goto invalidate;
}
truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); if (!(mode & FMODE_EXCL))
bd_abort_claiming(bdev, truncate_bdev_range);
return 0;
invalidate:
/*
* Someone else has handle exclusively open. Try invalidating instead.
* The 'end' argument is inclusive so the rounding is safe.
*/
return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
lstart >> PAGE_SHIFT,
lend >> PAGE_SHIFT);
}
static void set_init_blocksize(struct block_device *bdev)
{
unsigned int bsize = bdev_logical_block_size(bdev);
loff_t size = i_size_read(bdev->bd_inode);
while (bsize < PAGE_SIZE) { if (size & bsize)
break;
bsize <<= 1;
}
bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
int set_blocksize(struct block_device *bdev, int size)
{
/* Size must be a power of two, and between 512 and PAGE_SIZE */
if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) return -EINVAL;
/* Size cannot be smaller than the size supported by the device */
if (size < bdev_logical_block_size(bdev))
return -EINVAL;
/* Don't change the size if it is same as current */
if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
sync_blockdev(bdev);
bdev->bd_inode->i_blkbits = blksize_bits(size);
kill_bdev(bdev);
}
return 0;
}
EXPORT_SYMBOL(set_blocksize);
int sb_set_blocksize(struct super_block *sb, int size)
{
if (set_blocksize(sb->s_bdev, size))
return 0;
/* If we get here, we know size is power of two
* and it's value is between 512 and PAGE_SIZE */
sb->s_blocksize = size;
sb->s_blocksize_bits = blksize_bits(size);
return sb->s_blocksize;
}
EXPORT_SYMBOL(sb_set_blocksize);
int sb_min_blocksize(struct super_block *sb, int size)
{
int minsize = bdev_logical_block_size(sb->s_bdev);
if (size < minsize)
size = minsize;
return sb_set_blocksize(sb, size);
}
EXPORT_SYMBOL(sb_min_blocksize);
int sync_blockdev_nowait(struct block_device *bdev)
{
if (!bdev)
return 0;
return filemap_flush(bdev->bd_inode->i_mapping);
}
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
/*
* Write out and wait upon all the dirty data associated with a block
* device via its mapping. Does not take the superblock lock.
*/
int sync_blockdev(struct block_device *bdev)
{
if (!bdev)
return 0;
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}
EXPORT_SYMBOL(sync_blockdev);
/*
* Write out and wait upon all dirty data associated with this
* device. Filesystem data as well as the underlying block
* device. Takes the superblock lock.
*/
int fsync_bdev(struct block_device *bdev)
{
struct super_block *sb = get_super(bdev);
if (sb) {
int res = sync_filesystem(sb);
drop_super(sb);
return res;
}
return sync_blockdev(bdev);
}
EXPORT_SYMBOL(fsync_bdev);
/**
* freeze_bdev -- lock a filesystem and force it into a consistent state
* @bdev: blockdevice to lock
*
* If a superblock is found on this device, we take the s_umount semaphore
* on it to make sure nobody unmounts until the snapshot creation is done.
* The reference counter (bd_fsfreeze_count) guarantees that only the last
* unfreeze process can unfreeze the frozen filesystem actually when multiple
* freeze requests arrive simultaneously. It counts up in freeze_bdev() and
* count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
* actually.
*/
int freeze_bdev(struct block_device *bdev)
{
struct super_block *sb;
int error = 0;
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (++bdev->bd_fsfreeze_count > 1)
goto done;
sb = get_active_super(bdev);
if (!sb)
goto sync;
if (sb->s_op->freeze_super)
error = sb->s_op->freeze_super(sb);
else
error = freeze_super(sb);
deactivate_super(sb);
if (error) {
bdev->bd_fsfreeze_count--;
goto done;
}
bdev->bd_fsfreeze_sb = sb;
sync:
sync_blockdev(bdev);
done:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return error;
}
EXPORT_SYMBOL(freeze_bdev);
/**
* thaw_bdev -- unlock filesystem
* @bdev: blockdevice to unlock
*
* Unlocks the filesystem and marks it writeable again after freeze_bdev().
*/
int thaw_bdev(struct block_device *bdev)
{
struct super_block *sb;
int error = -EINVAL;
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (!bdev->bd_fsfreeze_count)
goto out;
error = 0;
if (--bdev->bd_fsfreeze_count > 0)
goto out;
sb = bdev->bd_fsfreeze_sb;
if (!sb)
goto out;
if (sb->s_op->thaw_super)
error = sb->s_op->thaw_super(sb);
else
error = thaw_super(sb);
if (error)
bdev->bd_fsfreeze_count++;
else
bdev->bd_fsfreeze_sb = NULL;
out:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return error;
}
EXPORT_SYMBOL(thaw_bdev);
/**
* bdev_read_page() - Start reading a page from a block device
* @bdev: The device to read the page from
* @sector: The offset on the device to read the page to (need not be aligned)
* @page: The page to read
*
* On entry, the page should be locked. It will be unlocked when the page
* has been read. If the block driver implements rw_page synchronously,
* that will be true on exit from this function, but it need not be.
*
* Errors returned by this function are usually "soft", eg out of memory, or
* queue full; callers should try a different route to read this page rather
* than propagate an error back up the stack.
*
* Return: negative errno if an error occurs, 0 if submission was successful.
*/
int bdev_read_page(struct block_device *bdev, sector_t sector,
struct page *page)
{
const struct block_device_operations *ops = bdev->bd_disk->fops;
int result = -EOPNOTSUPP;
if (!ops->rw_page || bdev_get_integrity(bdev))
return result;
result = blk_queue_enter(bdev->bd_disk->queue, 0);
if (result)
return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
REQ_OP_READ);
blk_queue_exit(bdev->bd_disk->queue);
return result;
}
/**
* bdev_write_page() - Start writing a page to a block device
* @bdev: The device to write the page to
* @sector: The offset on the device to write the page to (need not be aligned)
* @page: The page to write
* @wbc: The writeback_control for the write
*
* On entry, the page should be locked and not currently under writeback.
* On exit, if the write started successfully, the page will be unlocked and
* under writeback. If the write failed already (eg the driver failed to
* queue the page to the device), the page will still be locked. If the
* caller is a ->writepage implementation, it will need to unlock the page.
*
* Errors returned by this function are usually "soft", eg out of memory, or
* queue full; callers should try a different route to write this page rather
* than propagate an error back up the stack.
*
* Return: negative errno if an error occurs, 0 if submission was successful.
*/
int bdev_write_page(struct block_device *bdev, sector_t sector,
struct page *page, struct writeback_control *wbc)
{
int result;
const struct block_device_operations *ops = bdev->bd_disk->fops;
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
result = blk_queue_enter(bdev->bd_disk->queue, 0);
if (result)
return result;
set_page_writeback(page);
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
REQ_OP_WRITE);
if (result) {
end_page_writeback(page);
} else {
clean_page_buffers(page);
unlock_page(page);
}
blk_queue_exit(bdev->bd_disk->queue);
return result;
}
/*
* pseudo-fs
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock);
static struct kmem_cache * bdev_cachep __read_mostly;
static struct inode *bdev_alloc_inode(struct super_block *sb)
{
struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
if (!ei)
return NULL;
memset(&ei->bdev, 0, sizeof(ei->bdev));
return &ei->vfs_inode;
}
static void bdev_free_inode(struct inode *inode)
{
struct block_device *bdev = I_BDEV(inode);
free_percpu(bdev->bd_stats);
kfree(bdev->bd_meta_info);
if (!bdev_is_partition(bdev)) {
if (bdev->bd_disk && bdev->bd_disk->bdi)
bdi_put(bdev->bd_disk->bdi);
kfree(bdev->bd_disk);
}
if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
blk_free_ext_minor(MINOR(bdev->bd_dev));
kmem_cache_free(bdev_cachep, BDEV_I(inode));
}
static void init_once(void *data)
{
struct bdev_inode *ei = data;
inode_init_once(&ei->vfs_inode);
}
static void bdev_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
}
static const struct super_operations bdev_sops = {
.statfs = simple_statfs,
.alloc_inode = bdev_alloc_inode,
.free_inode = bdev_free_inode,
.drop_inode = generic_delete_inode,
.evict_inode = bdev_evict_inode,
};
static int bd_init_fs_context(struct fs_context *fc)
{
struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
if (!ctx)
return -ENOMEM;
fc->s_iflags |= SB_I_CGROUPWB;
ctx->ops = &bdev_sops;
return 0;
}
static struct file_system_type bd_type = {
.name = "bdev",
.init_fs_context = bd_init_fs_context,
.kill_sb = kill_anon_super,
};
struct super_block *blockdev_superblock __read_mostly;
EXPORT_SYMBOL_GPL(blockdev_superblock);
void __init bdev_cache_init(void)
{
int err;
static struct vfsmount *bd_mnt;
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
init_once);
err = register_filesystem(&bd_type);
if (err)
panic("Cannot register bdev pseudo-fs");
bd_mnt = kern_mount(&bd_type);
if (IS_ERR(bd_mnt))
panic("Cannot create bdev pseudo-fs");
blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
}
struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
struct block_device *bdev;
struct inode *inode;
inode = new_inode(blockdev_superblock);
if (!inode)
return NULL;
inode->i_mode = S_IFBLK;
inode->i_rdev = 0;
inode->i_data.a_ops = &def_blk_aops;
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
bdev = I_BDEV(inode);
mutex_init(&bdev->bd_fsfreeze_mutex);
spin_lock_init(&bdev->bd_size_lock);
bdev->bd_partno = partno;
bdev->bd_inode = inode;
bdev->bd_stats = alloc_percpu(struct disk_stats);
if (!bdev->bd_stats) {
iput(inode);
return NULL;
}
bdev->bd_disk = disk;
return bdev;
}
void bdev_add(struct block_device *bdev, dev_t dev)
{
bdev->bd_dev = dev;
bdev->bd_inode->i_rdev = dev;
bdev->bd_inode->i_ino = dev;
insert_inode_hash(bdev->bd_inode);
}
long nr_blockdev_pages(void)
{
struct inode *inode;
long ret = 0;
spin_lock(&blockdev_superblock->s_inode_list_lock);
list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
ret += inode->i_mapping->nrpages;
spin_unlock(&blockdev_superblock->s_inode_list_lock);
return ret;
}
/**
* bd_may_claim - test whether a block device can be claimed
* @bdev: block device of interest
* @whole: whole block device containing @bdev, may equal @bdev
* @holder: holder trying to claim @bdev
*
* Test whether @bdev can be claimed by @holder.
*
* CONTEXT:
* spin_lock(&bdev_lock).
*
* RETURNS:
* %true if @bdev can be claimed, %false otherwise.
*/
static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
void *holder)
{
if (bdev->bd_holder == holder)
return true; /* already a holder */
else if (bdev->bd_holder != NULL)
return false; /* held by someone else */
else if (whole == bdev)
return true; /* is a whole device which isn't held */
else if (whole->bd_holder == bd_may_claim)
return true; /* is a partition of a device that is being partitioned */
else if (whole->bd_holder != NULL)
return false; /* is a partition of a held device */
else
return true; /* is a partition of an un-held device */
}
/**
* bd_prepare_to_claim - claim a block device
* @bdev: block device of interest
* @holder: holder trying to claim @bdev
*
* Claim @bdev. This function fails if @bdev is already claimed by another
* holder and waits if another claiming is in progress. return, the caller
* has ownership of bd_claiming and bd_holder[s].
*
* RETURNS:
* 0 if @bdev can be claimed, -EBUSY otherwise.
*/
int bd_prepare_to_claim(struct block_device *bdev, void *holder)
{
struct block_device *whole = bdev_whole(bdev); if (WARN_ON_ONCE(!holder))
return -EINVAL;
retry:
spin_lock(&bdev_lock);
/* if someone else claimed, fail */
if (!bd_may_claim(bdev, whole, holder)) {
spin_unlock(&bdev_lock);
return -EBUSY;
}
/* if claiming is already in progress, wait for it to finish */
if (whole->bd_claiming) { wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
DEFINE_WAIT(wait);
prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock(&bdev_lock);
schedule();
finish_wait(wq, &wait);
goto retry;
}
/* yay, all mine */
whole->bd_claiming = holder;
spin_unlock(&bdev_lock);
return 0;
}
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
static void bd_clear_claiming(struct block_device *whole, void *holder)
{
lockdep_assert_held(&bdev_lock);
/* tell others that we're done */
BUG_ON(whole->bd_claiming != holder); whole->bd_claiming = NULL;
wake_up_bit(&whole->bd_claiming, 0);
}
/**
* bd_finish_claiming - finish claiming of a block device
* @bdev: block device of interest
* @holder: holder that has claimed @bdev
*
* Finish exclusive open of a block device. Mark the device as exlusively
* open by the holder and wake up all waiters for exclusive open to finish.
*/
static void bd_finish_claiming(struct block_device *bdev, void *holder)
{
struct block_device *whole = bdev_whole(bdev);
spin_lock(&bdev_lock);
BUG_ON(!bd_may_claim(bdev, whole, holder));
/*
* Note that for a whole device bd_holders will be incremented twice,
* and bd_holder will be set to bd_may_claim before being set to holder
*/
whole->bd_holders++;
whole->bd_holder = bd_may_claim;
bdev->bd_holders++;
bdev->bd_holder = holder;
bd_clear_claiming(whole, holder);
spin_unlock(&bdev_lock);
}
/**
* bd_abort_claiming - abort claiming of a block device
* @bdev: block device of interest
* @holder: holder that has claimed @bdev
*
* Abort claiming of a block device when the exclusive open failed. This can be
* also used when exclusive open is not actually desired and we just needed
* to block other exclusive openers for a while.
*/
void bd_abort_claiming(struct block_device *bdev, void *holder)
{
spin_lock(&bdev_lock);
bd_clear_claiming(bdev_whole(bdev), holder);
spin_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);
static void blkdev_flush_mapping(struct block_device *bdev)
{
WARN_ON_ONCE(bdev->bd_holders);
sync_blockdev(bdev);
kill_bdev(bdev);
bdev_write_inode(bdev);
}
static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
{
struct gendisk *disk = bdev->bd_disk;
int ret = 0;
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
if (ret) {
/* avoid ghost partitions on a removed medium */
if (ret == -ENOMEDIUM && test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, true);
return ret;
}
}
if (!bdev->bd_openers) set_init_blocksize(bdev); if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); bdev->bd_openers++; return 0;;
}
static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
{
if (!--bdev->bd_openers) blkdev_flush_mapping(bdev); if (bdev->bd_disk->fops->release) bdev->bd_disk->fops->release(bdev->bd_disk, mode);
}
static int blkdev_get_part(struct block_device *part, fmode_t mode)
{
struct gendisk *disk = part->bd_disk;
int ret;
if (part->bd_openers)
goto done;
ret = blkdev_get_whole(bdev_whole(part), mode);
if (ret)
return ret;
ret = -ENXIO;
if (!bdev_nr_sectors(part))
goto out_blkdev_put;
disk->open_partitions++;
set_init_blocksize(part);
done:
part->bd_openers++;
return 0;
out_blkdev_put:
blkdev_put_whole(bdev_whole(part), mode);
return ret;
}
static void blkdev_put_part(struct block_device *part, fmode_t mode)
{
struct block_device *whole = bdev_whole(part);
if (--part->bd_openers)
return;
blkdev_flush_mapping(part);
whole->bd_disk->open_partitions--;
blkdev_put_whole(whole, mode);
}
struct block_device *blkdev_get_no_open(dev_t dev)
{
struct block_device *bdev;
struct inode *inode;
inode = ilookup(blockdev_superblock, dev);
if (!inode) {
blk_request_module(dev);
inode = ilookup(blockdev_superblock, dev);
if (!inode)
return NULL;
}
/* switch from the inode reference to a device mode one: */
bdev = &BDEV_I(inode)->bdev; if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
bdev = NULL;
iput(inode); if (!bdev)
return NULL;
if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) || !try_module_get(bdev->bd_disk->fops->owner)) { put_device(&bdev->bd_device); return NULL;
}
return bdev;
}
void blkdev_put_no_open(struct block_device *bdev)
{
module_put(bdev->bd_disk->fops->owner);
put_device(&bdev->bd_device);
}
/**
* blkdev_get_by_dev - open a block device by device number
* @dev: device number of block device to open
* @mode: FMODE_* mask
* @holder: exclusive holder identifier
*
* Open the block device described by device number @dev. If @mode includes
* %FMODE_EXCL, the block device is opened with exclusive access. Specifying
* %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for
* the same @holder.
*
* Use this interface ONLY if you really do not have anything better - i.e. when
* you are behind a truly sucky interface and all you are given is a device
* number. Everything else should use blkdev_get_by_path().
*
* CONTEXT:
* Might sleep.
*
* RETURNS:
* Reference to the block_device on success, ERR_PTR(-errno) on failure.
*/
struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
{
bool unblock_events = true;
struct block_device *bdev;
struct gendisk *disk;
int ret;
ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
MAJOR(dev), MINOR(dev),
((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |
((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));
if (ret)
return ERR_PTR(ret);
bdev = blkdev_get_no_open(dev);
if (!bdev)
return ERR_PTR(-ENXIO); disk = bdev->bd_disk;
if (mode & FMODE_EXCL) {
ret = bd_prepare_to_claim(bdev, holder);
if (ret)
goto put_blkdev;
}
disk_block_events(disk);
mutex_lock(&disk->open_mutex);
ret = -ENXIO;
if (!disk_live(disk))
goto abort_claiming;
if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode);
else
ret = blkdev_get_whole(bdev, mode);
if (ret)
goto abort_claiming;
if (mode & FMODE_EXCL) {
bd_finish_claiming(bdev, holder);
/*
* Block event polling for write claims if requested. Any write
* holder makes the write_holder state stick until all are
* released. This is good enough and tracking individual
* writeable reference is too fragile given the way @mode is
* used in blkdev_get/put().
*/
if ((mode & FMODE_WRITE) && !bdev->bd_write_holder && (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { bdev->bd_write_holder = true;
unblock_events = false;
}
}
mutex_unlock(&disk->open_mutex);
if (unblock_events)
disk_unblock_events(disk);
return bdev;
abort_claiming:
if (mode & FMODE_EXCL) bd_abort_claiming(bdev, holder); mutex_unlock(&disk->open_mutex);
disk_unblock_events(disk);
put_blkdev:
blkdev_put_no_open(bdev);
return ERR_PTR(ret);
}
EXPORT_SYMBOL(blkdev_get_by_dev);
/**
* blkdev_get_by_path - open a block device by name
* @path: path to the block device to open
* @mode: FMODE_* mask
* @holder: exclusive holder identifier
*
* Open the block device described by the device file at @path. If @mode
* includes %FMODE_EXCL, the block device is opened with exclusive access.
* Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may
* nest for the same @holder.
*
* CONTEXT:
* Might sleep.
*
* RETURNS:
* Reference to the block_device on success, ERR_PTR(-errno) on failure.
*/
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
void *holder)
{
struct block_device *bdev;
dev_t dev;
int error;
error = lookup_bdev(path, &dev);
if (error)
return ERR_PTR(error); bdev = blkdev_get_by_dev(dev, mode, holder); if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { blkdev_put(bdev, mode);
return ERR_PTR(-EACCES);
}
return bdev;
}
EXPORT_SYMBOL(blkdev_get_by_path);
void blkdev_put(struct block_device *bdev, fmode_t mode)
{
struct gendisk *disk = bdev->bd_disk;
/*
* Sync early if it looks like we're the last one. If someone else
* opens the block device between now and the decrement of bd_openers
* then we did a sync that we didn't need to, but that's not the end
* of the world and we want to avoid long (could be several minute)
* syncs while holding the mutex.
*/
if (bdev->bd_openers == 1)
sync_blockdev(bdev);
mutex_lock(&disk->open_mutex);
if (mode & FMODE_EXCL) {
struct block_device *whole = bdev_whole(bdev);
bool bdev_free;
/*
* Release a claim on the device. The holder fields
* are protected with bdev_lock. open_mutex is to
* synchronize disk_holder unlinking.
*/
spin_lock(&bdev_lock);
WARN_ON_ONCE(--bdev->bd_holders < 0); WARN_ON_ONCE(--whole->bd_holders < 0); if ((bdev_free = !bdev->bd_holders)) bdev->bd_holder = NULL; if (!whole->bd_holders) whole->bd_holder = NULL;
spin_unlock(&bdev_lock);
/*
* If this was the last claim, remove holder link and
* unblock evpoll if it was a write holder.
*/
if (bdev_free && bdev->bd_write_holder) { disk_unblock_events(disk);
bdev->bd_write_holder = false;
}
}
/*
* Trigger event checking and tell drivers to flush MEDIA_CHANGE
* event. This is to ensure detection of media removal commanded
* from userland - e.g. eject(1).
*/
disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
if (bdev_is_partition(bdev))
blkdev_put_part(bdev, mode);
else
blkdev_put_whole(bdev, mode); mutex_unlock(&disk->open_mutex);
blkdev_put_no_open(bdev);
}
EXPORT_SYMBOL(blkdev_put);
/**
* lookup_bdev - lookup a struct block_device by name
* @pathname: special file representing the block device
* @dev: return value of the block device's dev_t
*
* Get a reference to the blockdevice at @pathname in the current
* namespace if possible and return it. Return ERR_PTR(error)
* otherwise.
*/
int lookup_bdev(const char *pathname, dev_t *dev)
{
struct inode *inode;
struct path path;
int error;
if (!pathname || !*pathname)
return -EINVAL;
error = kern_path(pathname, LOOKUP_FOLLOW, &path);
if (error)
return error;
inode = d_backing_inode(path.dentry);
error = -ENOTBLK;
if (!S_ISBLK(inode->i_mode))
goto out_path_put;
error = -EACCES;
if (!may_open_dev(&path))
goto out_path_put;
*dev = inode->i_rdev;
error = 0;
out_path_put:
path_put(&path); return error;
}
EXPORT_SYMBOL(lookup_bdev);
int __invalidate_device(struct block_device *bdev, bool kill_dirty)
{
struct super_block *sb = get_super(bdev);
int res = 0;
if (sb) {
/*
* no need to lock the super, get_super holds the
* read mutex so the filesystem cannot go away
* under us (->put_super runs with the write lock
* hold).
*/
shrink_dcache_sb(sb);
res = invalidate_inodes(sb, kill_dirty);
drop_super(sb);
}
invalidate_bdev(bdev);
return res;
}
EXPORT_SYMBOL(__invalidate_device);
void sync_bdevs(bool wait)
{
struct inode *inode, *old_inode = NULL;
spin_lock(&blockdev_superblock->s_inode_list_lock);
list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
struct address_space *mapping = inode->i_mapping;
struct block_device *bdev;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
mapping->nrpages == 0) {
spin_unlock(&inode->i_lock);
continue;
}
__iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(&blockdev_superblock->s_inode_list_lock);
/*
* We hold a reference to 'inode' so it couldn't have been
* removed from s_inodes list while we dropped the
* s_inode_list_lock We cannot iput the inode now as we can
* be holding the last reference and we cannot iput it under
* s_inode_list_lock. So we keep the reference and iput it
* later.
*/
iput(old_inode);
old_inode = inode;
bdev = I_BDEV(inode);
mutex_lock(&bdev->bd_disk->open_mutex);
if (!bdev->bd_openers) {
; /* skip */
} else if (wait) {
/*
* We keep the error status of individual mapping so
* that applications can catch the writeback error using
* fsync(2). See filemap_fdatawait_keep_errors() for
* details.
*/
filemap_fdatawait_keep_errors(inode->i_mapping);
} else {
filemap_fdatawrite(inode->i_mapping);
}
mutex_unlock(&bdev->bd_disk->open_mutex);
spin_lock(&blockdev_superblock->s_inode_list_lock);
}
spin_unlock(&blockdev_superblock->s_inode_list_lock);
iput(old_inode);
}
// SPDX-License-Identifier: GPL-2.0-only
/*
* fs/libfs.c
* Library for filesystems writers.
*/
#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mount.h>
#include <linux/vfs.h>
#include <linux/quotaops.h>
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>
#include <linux/fsnotify.h>
#include <linux/unicode.h>
#include <linux/fscrypt.h>
#include <linux/uaccess.h>
#include "internal.h"
int simple_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
generic_fillattr(&init_user_ns, inode, stat);
stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
return 0;
}
EXPORT_SYMBOL(simple_getattr);
int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
buf->f_type = dentry->d_sb->s_magic;
buf->f_bsize = PAGE_SIZE;
buf->f_namelen = NAME_MAX;
return 0;
}
EXPORT_SYMBOL(simple_statfs);
/*
* Retaining negative dentries for an in-memory filesystem just wastes
* memory and lookup time: arrange for them to be deleted immediately.
*/
int always_delete_dentry(const struct dentry *dentry)
{
return 1;
}
EXPORT_SYMBOL(always_delete_dentry);
const struct dentry_operations simple_dentry_operations = {
.d_delete = always_delete_dentry,
};
EXPORT_SYMBOL(simple_dentry_operations);
/*
* Lookup the data. This is trivial - if the dentry didn't already
* exist, we know it is negative. Set d_op to delete negative dentries.
*/
struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
if (!dentry->d_sb->s_d_op) d_set_d_op(dentry, &simple_dentry_operations); d_add(dentry, NULL); return NULL;
}
EXPORT_SYMBOL(simple_lookup);
int dcache_dir_open(struct inode *inode, struct file *file)
{
file->private_data = d_alloc_cursor(file->f_path.dentry);
return file->private_data ? 0 : -ENOMEM;
}
EXPORT_SYMBOL(dcache_dir_open);
int dcache_dir_close(struct inode *inode, struct file *file)
{
dput(file->private_data);
return 0;
}
EXPORT_SYMBOL(dcache_dir_close);
/* parent is locked at least shared */
/*
* Returns an element of siblings' list.
* We are looking for <count>th positive after <p>; if
* found, dentry is grabbed and returned to caller.
* If no such element exists, NULL is returned.
*/
static struct dentry *scan_positives(struct dentry *cursor,
struct list_head *p,
loff_t count,
struct dentry *last)
{
struct dentry *dentry = cursor->d_parent, *found = NULL;
spin_lock(&dentry->d_lock);
while ((p = p->next) != &dentry->d_subdirs) {
struct dentry *d = list_entry(p, struct dentry, d_child);
// we must at least skip cursors, to avoid livelocks
if (d->d_flags & DCACHE_DENTRY_CURSOR)
continue;
if (simple_positive(d) && !--count) {
spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(d))
found = dget_dlock(d);
spin_unlock(&d->d_lock);
if (likely(found))
break;
count = 1;
}
if (need_resched()) {
list_move(&cursor->d_child, p);
p = &cursor->d_child;
spin_unlock(&dentry->d_lock);
cond_resched();
spin_lock(&dentry->d_lock);
}
}
spin_unlock(&dentry->d_lock);
dput(last);
return found;
}
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
{
struct dentry *dentry = file->f_path.dentry;
switch (whence) {
case 1:
offset += file->f_pos;
fallthrough;
case 0:
if (offset >= 0)
break;
fallthrough;
default:
return -EINVAL;
}
if (offset != file->f_pos) {
struct dentry *cursor = file->private_data;
struct dentry *to = NULL;
inode_lock_shared(dentry->d_inode);
if (offset > 2)
to = scan_positives(cursor, &dentry->d_subdirs,
offset - 2, NULL);
spin_lock(&dentry->d_lock);
if (to)
list_move(&cursor->d_child, &to->d_child);
else
list_del_init(&cursor->d_child);
spin_unlock(&dentry->d_lock);
dput(to);
file->f_pos = offset;
inode_unlock_shared(dentry->d_inode);
}
return offset;
}
EXPORT_SYMBOL(dcache_dir_lseek);
/* Relationship between i_mode and the DT_xxx types */
static inline unsigned char dt_type(struct inode *inode)
{
return (inode->i_mode >> 12) & 15;
}
/*
* Directory is locked and all positive dentries in it are safe, since
* for ramfs-type trees they can't go away without unlink() or rmdir(),
* both impossible due to the lock on directory.
*/
int dcache_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct dentry *cursor = file->private_data;
struct list_head *anchor = &dentry->d_subdirs;
struct dentry *next = NULL;
struct list_head *p;
if (!dir_emit_dots(file, ctx))
return 0;
if (ctx->pos == 2)
p = anchor;
else if (!list_empty(&cursor->d_child))
p = &cursor->d_child;
else
return 0;
while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
d_inode(next)->i_ino, dt_type(d_inode(next))))
break;
ctx->pos++;
p = &next->d_child;
}
spin_lock(&dentry->d_lock);
if (next)
list_move_tail(&cursor->d_child, &next->d_child);
else
list_del_init(&cursor->d_child);
spin_unlock(&dentry->d_lock);
dput(next);
return 0;
}
EXPORT_SYMBOL(dcache_readdir);
ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
{
return -EISDIR;
}
EXPORT_SYMBOL(generic_read_dir);
const struct file_operations simple_dir_operations = {
.open = dcache_dir_open,
.release = dcache_dir_close,
.llseek = dcache_dir_lseek,
.read = generic_read_dir,
.iterate_shared = dcache_readdir,
.fsync = noop_fsync,
};
EXPORT_SYMBOL(simple_dir_operations);
const struct inode_operations simple_dir_inode_operations = {
.lookup = simple_lookup,
};
EXPORT_SYMBOL(simple_dir_inode_operations);
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
struct dentry *child = NULL;
struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs;
spin_lock(&parent->d_lock);
while ((p = p->next) != &parent->d_subdirs) { struct dentry *d = container_of(p, struct dentry, d_child);
if (simple_positive(d)) {
spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(d))
child = dget_dlock(d);
spin_unlock(&d->d_lock);
if (likely(child))
break;
}
}
spin_unlock(&parent->d_lock);
dput(prev);
return child;
}
void simple_recursive_removal(struct dentry *dentry,
void (*callback)(struct dentry *))
{
struct dentry *this = dget(dentry);
while (true) {
struct dentry *victim = NULL, *child;
struct inode *inode = this->d_inode;
inode_lock(inode);
if (d_is_dir(this))
inode->i_flags |= S_DEAD;
while ((child = find_next_child(this, victim)) == NULL) {
// kill and ascend
// update metadata while it's still locked
inode->i_ctime = current_time(inode);
clear_nlink(inode);
inode_unlock(inode);
victim = this;
this = this->d_parent;
inode = this->d_inode;
inode_lock(inode);
if (simple_positive(victim)) {
d_invalidate(victim); // avoid lost mounts
if (d_is_dir(victim))
fsnotify_rmdir(inode, victim);
else
fsnotify_unlink(inode, victim);
if (callback) callback(victim); dput(victim); // unpin it
}
if (victim == dentry) {
inode->i_ctime = inode->i_mtime =
current_time(inode);
if (d_is_dir(dentry))
drop_nlink(inode);
inode_unlock(inode);
dput(dentry);
return;
}
}
inode_unlock(inode);
this = child;
}
}
EXPORT_SYMBOL(simple_recursive_removal);
static const struct super_operations simple_super_operations = {
.statfs = simple_statfs,
};
static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
{
struct pseudo_fs_context *ctx = fc->fs_private;
struct inode *root;
s->s_maxbytes = MAX_LFS_FILESIZE;
s->s_blocksize = PAGE_SIZE;
s->s_blocksize_bits = PAGE_SHIFT;
s->s_magic = ctx->magic;
s->s_op = ctx->ops ?: &simple_super_operations;
s->s_xattr = ctx->xattr;
s->s_time_gran = 1;
root = new_inode(s);
if (!root)
return -ENOMEM;
/*
* since this is the first inode, make it number 1. New inodes created
* after this must take care not to collide with it (by passing
* max_reserved of 1 to iunique).
*/
root->i_ino = 1;
root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
s->s_root = d_make_root(root);
if (!s->s_root)
return -ENOMEM;
s->s_d_op = ctx->dops;
return 0;
}
static int pseudo_fs_get_tree(struct fs_context *fc)
{
return get_tree_nodev(fc, pseudo_fs_fill_super);
}
static void pseudo_fs_free(struct fs_context *fc)
{
kfree(fc->fs_private);
}
static const struct fs_context_operations pseudo_fs_context_ops = {
.free = pseudo_fs_free,
.get_tree = pseudo_fs_get_tree,
};
/*
* Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
* will never be mountable)
*/
struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
unsigned long magic)
{
struct pseudo_fs_context *ctx;
ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL);
if (likely(ctx)) {
ctx->magic = magic;
fc->fs_private = ctx;
fc->ops = &pseudo_fs_context_ops;
fc->sb_flags |= SB_NOUSER;
fc->global = true;
}
return ctx;
}
EXPORT_SYMBOL(init_pseudo);
int simple_open(struct inode *inode, struct file *file)
{
if (inode->i_private)
file->private_data = inode->i_private;
return 0;
}
EXPORT_SYMBOL(simple_open);
int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(old_dentry);
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
inc_nlink(inode);
ihold(inode);
dget(dentry);
d_instantiate(dentry, inode);
return 0;
}
EXPORT_SYMBOL(simple_link);
int simple_empty(struct dentry *dentry)
{
struct dentry *child;
int ret = 0;
spin_lock(&dentry->d_lock);
list_for_each_entry(child, &dentry->d_subdirs, d_child) {
spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(child)) {
spin_unlock(&child->d_lock);
goto out;
}
spin_unlock(&child->d_lock);
}
ret = 1;
out:
spin_unlock(&dentry->d_lock);
return ret;
}
EXPORT_SYMBOL(simple_empty);
int simple_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
drop_nlink(inode);
dput(dentry);
return 0;
}
EXPORT_SYMBOL(simple_unlink);
int simple_rmdir(struct inode *dir, struct dentry *dentry)
{
if (!simple_empty(dentry))
return -ENOTEMPTY;
drop_nlink(d_inode(dentry));
simple_unlink(dir, dentry);
drop_nlink(dir);
return 0;
}
EXPORT_SYMBOL(simple_rmdir);
int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags)
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = d_is_dir(old_dentry);
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
if (d_really_is_positive(new_dentry)) {
simple_unlink(new_dir, new_dentry);
if (they_are_dirs) {
drop_nlink(d_inode(new_dentry));
drop_nlink(old_dir);
}
} else if (they_are_dirs) {
drop_nlink(old_dir);
inc_nlink(new_dir);
}
old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
new_dir->i_mtime = inode->i_ctime = current_time(old_dir);
return 0;
}
EXPORT_SYMBOL(simple_rename);
/**
* simple_setattr - setattr for simple filesystem
* @mnt_userns: user namespace of the target mount
* @dentry: dentry
* @iattr: iattr structure
*
* Returns 0 on success, -error on failure.
*
* simple_setattr is a simple ->setattr implementation without a proper
* implementation of size changes.
*
* It can either be used for in-memory filesystems or special files
* on simple regular filesystems. Anything that needs to change on-disk
* or wire state on size changes needs its own setattr method.
*/
int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
int error;
error = setattr_prepare(mnt_userns, dentry, iattr);
if (error)
return error;
if (iattr->ia_valid & ATTR_SIZE)
truncate_setsize(inode, iattr->ia_size);
setattr_copy(mnt_userns, inode, iattr);
mark_inode_dirty(inode);
return 0;
}
EXPORT_SYMBOL(simple_setattr);
static int simple_readpage(struct file *file, struct page *page)
{
clear_highpage(page);
flush_dcache_page(page);
SetPageUptodate(page);
unlock_page(page);
return 0;
}
int simple_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct page *page;
pgoff_t index;
index = pos >> PAGE_SHIFT;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
if (!PageUptodate(page) && (len != PAGE_SIZE)) {
unsigned from = pos & (PAGE_SIZE - 1);
zero_user_segments(page, 0, from, from + len, PAGE_SIZE);
}
return 0;
}
EXPORT_SYMBOL(simple_write_begin);
/**
* simple_write_end - .write_end helper for non-block-device FSes
* @file: See .write_end of address_space_operations
* @mapping: "
* @pos: "
* @len: "
* @copied: "
* @page: "
* @fsdata: "
*
* simple_write_end does the minimum needed for updating a page after writing is
* done. It has the same API signature as the .write_end of
* address_space_operations vector. So it can just be set onto .write_end for
* FSes that don't need any other processing. i_mutex is assumed to be held.
* Block based filesystems should use generic_write_end().
* NOTE: Even though i_size might get updated by this function, mark_inode_dirty
* is not called, so a filesystem that actually does store data in .write_inode
* should extend on what's done here with a call to mark_inode_dirty() in the
* case that i_size has changed.
*
* Use *ONLY* with simple_readpage()
*/
static int simple_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = page->mapping->host;
loff_t last_pos = pos + copied;
/* zero the stale part of the page if we did a short copy */
if (!PageUptodate(page)) {
if (copied < len) {
unsigned from = pos & (PAGE_SIZE - 1);
zero_user(page, from + copied, len - copied);
}
SetPageUptodate(page);
}
/*
* No need to use i_size_read() here, the i_size
* cannot change under us because we hold the i_mutex.
*/
if (last_pos > inode->i_size)
i_size_write(inode, last_pos);
set_page_dirty(page);
unlock_page(page);
put_page(page);
return copied;
}
/*
* Provides ramfs-style behavior: data in the pagecache, but no writeback.
*/
const struct address_space_operations ram_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
.write_end = simple_write_end,
.set_page_dirty = __set_page_dirty_no_writeback,
};
EXPORT_SYMBOL(ram_aops);
/*
* the inodes created here are not hashed. If you use iunique to generate
* unique inode values later for this filesystem, then you must take care
* to pass it an appropriate max_reserved value to avoid collisions.
*/
int simple_fill_super(struct super_block *s, unsigned long magic,
const struct tree_descr *files)
{
struct inode *inode;
struct dentry *root;
struct dentry *dentry;
int i;
s->s_blocksize = PAGE_SIZE;
s->s_blocksize_bits = PAGE_SHIFT;
s->s_magic = magic;
s->s_op = &simple_super_operations;
s->s_time_gran = 1;
inode = new_inode(s);
if (!inode)
return -ENOMEM;
/*
* because the root inode is 1, the files array must not contain an
* entry at index 1
*/
inode->i_ino = 1;
inode->i_mode = S_IFDIR | 0755;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
root = d_make_root(inode);
if (!root)
return -ENOMEM;
for (i = 0; !files->name || files->name[0]; i++, files++) {
if (!files->name)
continue;
/* warn if it tries to conflict with the root inode */
if (unlikely(i == 1))
printk(KERN_WARNING "%s: %s passed in a files array"
"with an index of 1!\n", __func__,
s->s_type->name);
dentry = d_alloc_name(root, files->name);
if (!dentry)
goto out;
inode = new_inode(s);
if (!inode) {
dput(dentry);
goto out;
}
inode->i_mode = S_IFREG | files->mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_fop = files->ops;
inode->i_ino = i;
d_add(dentry, inode);
}
s->s_root = root;
return 0;
out:
d_genocide(root);
shrink_dcache_parent(root);
dput(root);
return -ENOMEM;
}
EXPORT_SYMBOL(simple_fill_super);
static DEFINE_SPINLOCK(pin_fs_lock);
int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
{
struct vfsmount *mnt = NULL;
spin_lock(&pin_fs_lock);
if (unlikely(!*mount)) {
spin_unlock(&pin_fs_lock);
mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
spin_lock(&pin_fs_lock);
if (!*mount)
*mount = mnt;
}
mntget(*mount);
++*count;
spin_unlock(&pin_fs_lock);
mntput(mnt);
return 0;
}
EXPORT_SYMBOL(simple_pin_fs);
void simple_release_fs(struct vfsmount **mount, int *count)
{
struct vfsmount *mnt;
spin_lock(&pin_fs_lock);
mnt = *mount;
if (!--*count)
*mount = NULL;
spin_unlock(&pin_fs_lock);
mntput(mnt);
}
EXPORT_SYMBOL(simple_release_fs);
/**
* simple_read_from_buffer - copy data from the buffer to user space
* @to: the user space buffer to read to
* @count: the maximum number of bytes to read
* @ppos: the current position in the buffer
* @from: the buffer to read from
* @available: the size of the buffer
*
* The simple_read_from_buffer() function reads up to @count bytes from the
* buffer @from at offset @ppos into the user space address starting at @to.
*
* On success, the number of bytes read is returned and the offset @ppos is
* advanced by this number, or negative value is returned on error.
**/
ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
const void *from, size_t available)
{
loff_t pos = *ppos;
size_t ret;
if (pos < 0)
return -EINVAL;
if (pos >= available || !count)
return 0;
if (count > available - pos)
count = available - pos;
ret = copy_to_user(to, from + pos, count);
if (ret == count)
return -EFAULT;
count -= ret;
*ppos = pos + count;
return count;
}
EXPORT_SYMBOL(simple_read_from_buffer);
/**
* simple_write_to_buffer - copy data from user space to the buffer
* @to: the buffer to write to
* @available: the size of the buffer
* @ppos: the current position in the buffer
* @from: the user space buffer to read from
* @count: the maximum number of bytes to read
*
* The simple_write_to_buffer() function reads up to @count bytes from the user
* space address starting at @from into the buffer @to at offset @ppos.
*
* On success, the number of bytes written is returned and the offset @ppos is
* advanced by this number, or negative value is returned on error.
**/
ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
const void __user *from, size_t count)
{
loff_t pos = *ppos;
size_t res;
if (pos < 0)
return -EINVAL;
if (pos >= available || !count)
return 0;
if (count > available - pos)
count = available - pos;
res = copy_from_user(to + pos, from, count);
if (res == count)
return -EFAULT;
count -= res;
*ppos = pos + count;
return count;
}
EXPORT_SYMBOL(simple_write_to_buffer);
/**
* memory_read_from_buffer - copy data from the buffer
* @to: the kernel space buffer to read to
* @count: the maximum number of bytes to read
* @ppos: the current position in the buffer
* @from: the buffer to read from
* @available: the size of the buffer
*
* The memory_read_from_buffer() function reads up to @count bytes from the
* buffer @from at offset @ppos into the kernel space address starting at @to.
*
* On success, the number of bytes read is returned and the offset @ppos is
* advanced by this number, or negative value is returned on error.
**/
ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
const void *from, size_t available)
{
loff_t pos = *ppos;
if (pos < 0)
return -EINVAL;
if (pos >= available)
return 0;
if (count > available - pos)
count = available - pos;
memcpy(to, from + pos, count);
*ppos = pos + count;
return count;
}
EXPORT_SYMBOL(memory_read_from_buffer);
/*
* Transaction based IO.
* The file expects a single write which triggers the transaction, and then
* possibly a read which collects the result - which is stored in a
* file-local buffer.
*/
void simple_transaction_set(struct file *file, size_t n)
{
struct simple_transaction_argresp *ar = file->private_data;
BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
/*
* The barrier ensures that ar->size will really remain zero until
* ar->data is ready for reading.
*/
smp_mb();
ar->size = n;
}
EXPORT_SYMBOL(simple_transaction_set);
char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
{
struct simple_transaction_argresp *ar;
static DEFINE_SPINLOCK(simple_transaction_lock);
if (size > SIMPLE_TRANSACTION_LIMIT - 1)
return ERR_PTR(-EFBIG);
ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL);
if (!ar)
return ERR_PTR(-ENOMEM);
spin_lock(&simple_transaction_lock);
/* only one write allowed per open */
if (file->private_data) {
spin_unlock(&simple_transaction_lock);
free_page((unsigned long)ar);
return ERR_PTR(-EBUSY);
}
file->private_data = ar;
spin_unlock(&simple_transaction_lock);
if (copy_from_user(ar->data, buf, size))
return ERR_PTR(-EFAULT);
return ar->data;
}
EXPORT_SYMBOL(simple_transaction_get);
ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
{
struct simple_transaction_argresp *ar = file->private_data;
if (!ar)
return 0;
return simple_read_from_buffer(buf, size, pos, ar->data, ar->size);
}
EXPORT_SYMBOL(simple_transaction_read);
int simple_transaction_release(struct inode *inode, struct file *file)
{
free_page((unsigned long)file->private_data);
return 0;
}
EXPORT_SYMBOL(simple_transaction_release);
/* Simple attribute files */
struct simple_attr {
int (*get)(void *, u64 *);
int (*set)(void *, u64);
char get_buf[24]; /* enough to store a u64 and "\n\0" */
char set_buf[24];
void *data;
const char *fmt; /* format for read operation */
struct mutex mutex; /* protects access to these buffers */
};
/* simple_attr_open is called by an actual attribute open file operation
* to set the attribute specific access operations. */
int simple_attr_open(struct inode *inode, struct file *file,
int (*get)(void *, u64 *), int (*set)(void *, u64),
const char *fmt)
{
struct simple_attr *attr;
attr = kzalloc(sizeof(*attr), GFP_KERNEL);
if (!attr)
return -ENOMEM;
attr->get = get;
attr->set = set;
attr->data = inode->i_private;
attr->fmt = fmt;
mutex_init(&attr->mutex);
file->private_data = attr;
return nonseekable_open(inode, file);
}
EXPORT_SYMBOL_GPL(simple_attr_open);
int simple_attr_release(struct inode *inode, struct file *file)
{
kfree(file->private_data);
return 0;
}
EXPORT_SYMBOL_GPL(simple_attr_release); /* GPL-only? This? Really? */
/* read from the buffer that is filled with the get function */
ssize_t simple_attr_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
struct simple_attr *attr;
size_t size;
ssize_t ret;
attr = file->private_data;
if (!attr->get)
return -EACCES;
ret = mutex_lock_interruptible(&attr->mutex);
if (ret)
return ret;
if (*ppos && attr->get_buf[0]) {
/* continued read */
size = strlen(attr->get_buf);
} else {
/* first read */
u64 val;
ret = attr->get(attr->data, &val);
if (ret)
goto out;
size = scnprintf(attr->get_buf, sizeof(attr->get_buf),
attr->fmt, (unsigned long long)val);
}
ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size);
out:
mutex_unlock(&attr->mutex);
return ret;
}
EXPORT_SYMBOL_GPL(simple_attr_read);
/* interpret the buffer as a number to call the set function with */
ssize_t simple_attr_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
struct simple_attr *attr;
unsigned long long val;
size_t size;
ssize_t ret;
attr = file->private_data;
if (!attr->set)
return -EACCES;
ret = mutex_lock_interruptible(&attr->mutex);
if (ret)
return ret;
ret = -EFAULT;
size = min(sizeof(attr->set_buf) - 1, len);
if (copy_from_user(attr->set_buf, buf, size))
goto out;
attr->set_buf[size] = '\0';
ret = kstrtoull(attr->set_buf, 0, &val);
if (ret)
goto out;
ret = attr->set(attr->data, val);
if (ret == 0)
ret = len; /* on success, claim we got the whole input */
out:
mutex_unlock(&attr->mutex);
return ret;
}
EXPORT_SYMBOL_GPL(simple_attr_write);
/**
* generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
* @sb: filesystem to do the file handle conversion on
* @fid: file handle to convert
* @fh_len: length of the file handle in bytes
* @fh_type: type of file handle
* @get_inode: filesystem callback to retrieve inode
*
* This function decodes @fid as long as it has one of the well-known
* Linux filehandle types and calls @get_inode on it to retrieve the
* inode for the object specified in the file handle.
*/
struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type, struct inode *(*get_inode)
(struct super_block *sb, u64 ino, u32 gen))
{
struct inode *inode = NULL;
if (fh_len < 2)
return NULL;
switch (fh_type) {
case FILEID_INO32_GEN:
case FILEID_INO32_GEN_PARENT:
inode = get_inode(sb, fid->i32.ino, fid->i32.gen);
break;
}
return d_obtain_alias(inode);
}
EXPORT_SYMBOL_GPL(generic_fh_to_dentry);
/**
* generic_fh_to_parent - generic helper for the fh_to_parent export operation
* @sb: filesystem to do the file handle conversion on
* @fid: file handle to convert
* @fh_len: length of the file handle in bytes
* @fh_type: type of file handle
* @get_inode: filesystem callback to retrieve inode
*
* This function decodes @fid as long as it has one of the well-known
* Linux filehandle types and calls @get_inode on it to retrieve the
* inode for the _parent_ object specified in the file handle if it
* is specified in the file handle, or NULL otherwise.
*/
struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type, struct inode *(*get_inode)
(struct super_block *sb, u64 ino, u32 gen))
{
struct inode *inode = NULL;
if (fh_len <= 2)
return NULL;
switch (fh_type) {
case FILEID_INO32_GEN_PARENT:
inode = get_inode(sb, fid->i32.parent_ino,
(fh_len > 3 ? fid->i32.parent_gen : 0));
break;
}
return d_obtain_alias(inode);
}
EXPORT_SYMBOL_GPL(generic_fh_to_parent);
/**
* __generic_file_fsync - generic fsync implementation for simple filesystems
*
* @file: file to synchronize
* @start: start offset in bytes
* @end: end offset in bytes (inclusive)
* @datasync: only synchronize essential metadata if true
*
* This is a generic implementation of the fsync method for simple
* filesystems which track all non-inode metadata in the buffers list
* hanging off the address_space structure.
*/
int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct inode *inode = file->f_mapping->host;
int err;
int ret;
err = file_write_and_wait_range(file, start, end);
if (err)
return err;
inode_lock(inode);
ret = sync_mapping_buffers(inode->i_mapping);
if (!(inode->i_state & I_DIRTY_ALL))
goto out;
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
goto out;
err = sync_inode_metadata(inode, 1);
if (ret == 0)
ret = err;
out:
inode_unlock(inode);
/* check and advance again to catch errors after syncing out buffers */
err = file_check_and_advance_wb_err(file);
if (ret == 0)
ret = err;
return ret;
}
EXPORT_SYMBOL(__generic_file_fsync);
/**
* generic_file_fsync - generic fsync implementation for simple filesystems
* with flush
* @file: file to synchronize
* @start: start offset in bytes
* @end: end offset in bytes (inclusive)
* @datasync: only synchronize essential metadata if true
*
*/
int generic_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct inode *inode = file->f_mapping->host;
int err;
err = __generic_file_fsync(file, start, end, datasync);
if (err)
return err;
return blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_file_fsync);
/**
* generic_check_addressable - Check addressability of file system
* @blocksize_bits: log of file system block size
* @num_blocks: number of blocks in file system
*
* Determine whether a file system with @num_blocks blocks (and a
* block size of 2**@blocksize_bits) is addressable by the sector_t
* and page cache of the system. Return 0 if so and -EFBIG otherwise.
*/
int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
{
u64 last_fs_block = num_blocks - 1;
u64 last_fs_page =
last_fs_block >> (PAGE_SHIFT - blocksize_bits);
if (unlikely(num_blocks == 0))
return 0;
if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT))
return -EINVAL;
if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
(last_fs_page > (pgoff_t)(~0ULL))) {
return -EFBIG;
}
return 0;
}
EXPORT_SYMBOL(generic_check_addressable);
/*
* No-op implementation of ->fsync for in-memory filesystems.
*/
int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
return 0;
}
EXPORT_SYMBOL(noop_fsync);
void noop_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
/*
* There is no page cache to invalidate in the dax case, however
* we need this callback defined to prevent falling back to
* block_invalidatepage() in do_invalidatepage().
*/
}
EXPORT_SYMBOL_GPL(noop_invalidatepage);
ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
/*
* iomap based filesystems support direct I/O without need for
* this callback. However, it still needs to be set in
* inode->a_ops so that open/fcntl know that direct I/O is
* generally supported.
*/
return -EINVAL;
}
EXPORT_SYMBOL_GPL(noop_direct_IO);
/* Because kfree isn't assignment-compatible with void(void*) ;-/ */
void kfree_link(void *p)
{
kfree(p);
}
EXPORT_SYMBOL(kfree_link);
struct inode *alloc_anon_inode(struct super_block *s)
{
static const struct address_space_operations anon_aops = {
.set_page_dirty = __set_page_dirty_no_writeback,
};
struct inode *inode = new_inode_pseudo(s);
if (!inode)
return ERR_PTR(-ENOMEM);
inode->i_ino = get_next_ino();
inode->i_mapping->a_ops = &anon_aops;
/*
* Mark the inode dirty from the very beginning,
* that way it will never be moved to the dirty
* list because mark_inode_dirty() will think
* that it already _is_ on the dirty list.
*/
inode->i_state = I_DIRTY;
inode->i_mode = S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
return inode;
}
EXPORT_SYMBOL(alloc_anon_inode);
/**
* simple_nosetlease - generic helper for prohibiting leases
* @filp: file pointer
* @arg: type of lease to obtain
* @flp: new lease supplied for insertion
* @priv: private data for lm_setup operation
*
* Generic helper for filesystems that do not wish to allow leases to be set.
* All arguments are ignored and it just returns -EINVAL.
*/
int
simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
void **priv)
{
return -EINVAL;
}
EXPORT_SYMBOL(simple_nosetlease);
/**
* simple_get_link - generic helper to get the target of "fast" symlinks
* @dentry: not used here
* @inode: the symlink inode
* @done: not used here
*
* Generic helper for filesystems to use for symlink inodes where a pointer to
* the symlink target is stored in ->i_link. NOTE: this isn't normally called,
* since as an optimization the path lookup code uses any non-NULL ->i_link
* directly, without calling ->get_link(). But ->get_link() still must be set,
* to mark the inode_operations as being for a symlink.
*
* Return: the symlink target
*/
const char *simple_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *done)
{
return inode->i_link;
}
EXPORT_SYMBOL(simple_get_link);
const struct inode_operations simple_symlink_inode_operations = {
.get_link = simple_get_link,
};
EXPORT_SYMBOL(simple_symlink_inode_operations);
/*
* Operations for a permanently empty directory.
*/
static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
return ERR_PTR(-ENOENT);
}
static int empty_dir_getattr(struct user_namespace *mnt_userns,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
generic_fillattr(&init_user_ns, inode, stat);
return 0;
}
static int empty_dir_setattr(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *attr)
{
return -EPERM;
}
static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
{
return -EOPNOTSUPP;
}
static const struct inode_operations empty_dir_inode_operations = {
.lookup = empty_dir_lookup,
.permission = generic_permission,
.setattr = empty_dir_setattr,
.getattr = empty_dir_getattr,
.listxattr = empty_dir_listxattr,
};
static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
{
/* An empty directory has two entries . and .. at offsets 0 and 1 */
return generic_file_llseek_size(file, offset, whence, 2, 2);
}
static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
{
dir_emit_dots(file, ctx);
return 0;
}
static const struct file_operations empty_dir_operations = {
.llseek = empty_dir_llseek,
.read = generic_read_dir,
.iterate_shared = empty_dir_readdir,
.fsync = noop_fsync,
};
void make_empty_dir_inode(struct inode *inode)
{
set_nlink(inode, 2);
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_rdev = 0;
inode->i_size = 0;
inode->i_blkbits = PAGE_SHIFT;
inode->i_blocks = 0;
inode->i_op = &empty_dir_inode_operations;
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &empty_dir_operations;
}
bool is_empty_dir_inode(struct inode *inode)
{
return (inode->i_fop == &empty_dir_operations) &&
(inode->i_op == &empty_dir_inode_operations);
}
#ifdef CONFIG_UNICODE
/*
* Determine if the name of a dentry should be casefolded.
*
* Return: if names will need casefolding
*/
static bool needs_casefold(const struct inode *dir)
{
return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding;
}
/**
* generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
* @dentry: dentry whose name we are checking against
* @len: len of name of dentry
* @str: str pointer to name of dentry
* @name: Name to compare against
*
* Return: 0 if names match, 1 if mismatch, or -ERRNO
*/
static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
const struct dentry *parent = READ_ONCE(dentry->d_parent);
const struct inode *dir = READ_ONCE(parent->d_inode);
const struct super_block *sb = dentry->d_sb;
const struct unicode_map *um = sb->s_encoding;
struct qstr qstr = QSTR_INIT(str, len);
char strbuf[DNAME_INLINE_LEN];
int ret;
if (!dir || !needs_casefold(dir))
goto fallback;
/*
* If the dentry name is stored in-line, then it may be concurrently
* modified by a rename. If this happens, the VFS will eventually retry
* the lookup, so it doesn't matter what ->d_compare() returns.
* However, it's unsafe to call utf8_strncasecmp() with an unstable
* string. Therefore, we have to copy the name into a temporary buffer.
*/
if (len <= DNAME_INLINE_LEN - 1) {
memcpy(strbuf, str, len);
strbuf[len] = 0;
qstr.name = strbuf;
/* prevent compiler from optimizing out the temporary buffer */
barrier();
}
ret = utf8_strncasecmp(um, name, &qstr);
if (ret >= 0)
return ret;
if (sb_has_strict_encoding(sb))
return -EINVAL;
fallback:
if (len != name->len)
return 1;
return !!memcmp(str, name->name, len);
}
/**
* generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
* @dentry: dentry of the parent directory
* @str: qstr of name whose hash we should fill in
*
* Return: 0 if hash was successful or unchanged, and -EINVAL on error
*/
static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
const struct inode *dir = READ_ONCE(dentry->d_inode);
struct super_block *sb = dentry->d_sb;
const struct unicode_map *um = sb->s_encoding;
int ret = 0;
if (!dir || !needs_casefold(dir))
return 0;
ret = utf8_casefold_hash(um, dentry, str);
if (ret < 0 && sb_has_strict_encoding(sb))
return -EINVAL;
return 0;
}
static const struct dentry_operations generic_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
.d_compare = generic_ci_d_compare,
};
#endif
#ifdef CONFIG_FS_ENCRYPTION
static const struct dentry_operations generic_encrypted_dentry_ops = {
.d_revalidate = fscrypt_d_revalidate,
};
#endif
#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE)
static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
.d_compare = generic_ci_d_compare,
.d_revalidate = fscrypt_d_revalidate,
};
#endif
/**
* generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry
* @dentry: dentry to set ops on
*
* Casefolded directories need d_hash and d_compare set, so that the dentries
* contained in them are handled case-insensitively. Note that these operations
* are needed on the parent directory rather than on the dentries in it, and
* while the casefolding flag can be toggled on and off on an empty directory,
* dentry_operations can't be changed later. As a result, if the filesystem has
* casefolding support enabled at all, we have to give all dentries the
* casefolding operations even if their inode doesn't have the casefolding flag
* currently (and thus the casefolding ops would be no-ops for now).
*
* Encryption works differently in that the only dentry operation it needs is
* d_revalidate, which it only needs on dentries that have the no-key name flag.
* The no-key flag can't be set "later", so we don't have to worry about that.
*
* Finally, to maximize compatibility with overlayfs (which isn't compatible
* with certain dentry operations) and to avoid taking an unnecessary
* performance hit, we use custom dentry_operations for each possible
* combination rather than always installing all operations.
*/
void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
{
#ifdef CONFIG_FS_ENCRYPTION
bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
#endif
#ifdef CONFIG_UNICODE
bool needs_ci_ops = dentry->d_sb->s_encoding;
#endif
#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE)
if (needs_encrypt_ops && needs_ci_ops) {
d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
return;
}
#endif
#ifdef CONFIG_FS_ENCRYPTION
if (needs_encrypt_ops) {
d_set_d_op(dentry, &generic_encrypted_dentry_ops);
return;
}
#endif
#ifdef CONFIG_UNICODE
if (needs_ci_ops) {
d_set_d_op(dentry, &generic_ci_dentry_ops);
return;
}
#endif
}
EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
// SPDX-License-Identifier: GPL-2.0
/*
* drivers/base/power/runtime.c - Helper functions for device runtime PM
*
* Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
* Copyright (C) 2010 Alan Stern <stern@rowland.harvard.edu>
*/
#include <linux/sched/mm.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/export.h>
#include <linux/pm_runtime.h>
#include <linux/pm_wakeirq.h>
#include <trace/events/rpm.h>
#include "../base.h"
#include "power.h"
typedef int (*pm_callback_t)(struct device *);
static pm_callback_t __rpm_get_callback(struct device *dev, size_t cb_offset)
{
pm_callback_t cb;
const struct dev_pm_ops *ops;
if (dev->pm_domain)
ops = &dev->pm_domain->ops;
else if (dev->type && dev->type->pm)
ops = dev->type->pm;
else if (dev->class && dev->class->pm)
ops = dev->class->pm;
else if (dev->bus && dev->bus->pm)
ops = dev->bus->pm;
else
ops = NULL;
if (ops)
cb = *(pm_callback_t *)((void *)ops + cb_offset);
else
cb = NULL;
if (!cb && dev->driver && dev->driver->pm)
cb = *(pm_callback_t *)((void *)dev->driver->pm + cb_offset);
return cb;
}
#define RPM_GET_CALLBACK(dev, callback) \
__rpm_get_callback(dev, offsetof(struct dev_pm_ops, callback))
static int rpm_resume(struct device *dev, int rpmflags);
static int rpm_suspend(struct device *dev, int rpmflags);
/**
* update_pm_runtime_accounting - Update the time accounting of power states
* @dev: Device to update the accounting for
*
* In order to be able to have time accounting of the various power states
* (as used by programs such as PowerTOP to show the effectiveness of runtime
* PM), we need to track the time spent in each state.
* update_pm_runtime_accounting must be called each time before the
* runtime_status field is updated, to account the time in the old state
* correctly.
*/
static void update_pm_runtime_accounting(struct device *dev)
{
u64 now, last, delta;
if (dev->power.disable_depth > 0)
return;
last = dev->power.accounting_timestamp;
now = ktime_get_mono_fast_ns();
dev->power.accounting_timestamp = now;
/*
* Because ktime_get_mono_fast_ns() is not monotonic during
* timekeeping updates, ensure that 'now' is after the last saved
* timesptamp.
*/
if (now < last)
return;
delta = now - last;
if (dev->power.runtime_status == RPM_SUSPENDED)
dev->power.suspended_time += delta;
else
dev->power.active_time += delta;
}
static void __update_runtime_status(struct device *dev, enum rpm_status status)
{
update_pm_runtime_accounting(dev);
dev->power.runtime_status = status;
}
static u64 rpm_get_accounted_time(struct device *dev, bool suspended)
{
u64 time;
unsigned long flags;
spin_lock_irqsave(&dev->power.lock, flags);
update_pm_runtime_accounting(dev);
time = suspended ? dev->power.suspended_time : dev->power.active_time;
spin_unlock_irqrestore(&dev->power.lock, flags);
return time;
}
u64 pm_runtime_active_time(struct device *dev)
{
return rpm_get_accounted_time(dev, false);
}
u64 pm_runtime_suspended_time(struct device *dev)
{
return rpm_get_accounted_time(dev, true);
}
EXPORT_SYMBOL_GPL(pm_runtime_suspended_time);
/**
* pm_runtime_deactivate_timer - Deactivate given device's suspend timer.
* @dev: Device to handle.
*/
static void pm_runtime_deactivate_timer(struct device *dev)
{
if (dev->power.timer_expires > 0) {
hrtimer_try_to_cancel(&dev->power.suspend_timer);
dev->power.timer_expires = 0;
}
}
/**
* pm_runtime_cancel_pending - Deactivate suspend timer and cancel requests.
* @dev: Device to handle.
*/
static void pm_runtime_cancel_pending(struct device *dev)
{
pm_runtime_deactivate_timer(dev);
/*
* In case there's a request pending, make sure its work function will
* return without doing anything.
*/
dev->power.request = RPM_REQ_NONE;
}
/*
* pm_runtime_autosuspend_expiration - Get a device's autosuspend-delay expiration time.
* @dev: Device to handle.
*
* Compute the autosuspend-delay expiration time based on the device's
* power.last_busy time. If the delay has already expired or is disabled
* (negative) or the power.use_autosuspend flag isn't set, return 0.
* Otherwise return the expiration time in nanoseconds (adjusted to be nonzero).
*
* This function may be called either with or without dev->power.lock held.
* Either way it can be racy, since power.last_busy may be updated at any time.
*/
u64 pm_runtime_autosuspend_expiration(struct device *dev)
{
int autosuspend_delay;
u64 expires;
if (!dev->power.use_autosuspend)
return 0;
autosuspend_delay = READ_ONCE(dev->power.autosuspend_delay);
if (autosuspend_delay < 0)
return 0;
expires = READ_ONCE(dev->power.last_busy);
expires += (u64)autosuspend_delay * NSEC_PER_MSEC;
if (expires > ktime_get_mono_fast_ns())
return expires; /* Expires in the future */
return 0;
}
EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration);
static int dev_memalloc_noio(struct device *dev, void *data)
{
return dev->power.memalloc_noio;
}
/*
* pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag.
* @dev: Device to handle.
* @enable: True for setting the flag and False for clearing the flag.
*
* Set the flag for all devices in the path from the device to the
* root device in the device tree if @enable is true, otherwise clear
* the flag for devices in the path whose siblings don't set the flag.
*
* The function should only be called by block device, or network
* device driver for solving the deadlock problem during runtime
* resume/suspend:
*
* If memory allocation with GFP_KERNEL is called inside runtime
* resume/suspend callback of any one of its ancestors(or the
* block device itself), the deadlock may be triggered inside the
* memory allocation since it might not complete until the block
* device becomes active and the involed page I/O finishes. The
* situation is pointed out first by Alan Stern. Network device
* are involved in iSCSI kind of situation.
*
* The lock of dev_hotplug_mutex is held in the function for handling
* hotplug race because pm_runtime_set_memalloc_noio() may be called
* in async probe().
*
* The function should be called between device_add() and device_del()
* on the affected device(block/network device).
*/
void pm_runtime_set_memalloc_noio(struct device *dev, bool enable)
{
static DEFINE_MUTEX(dev_hotplug_mutex);
mutex_lock(&dev_hotplug_mutex);
for (;;) {
bool enabled;
/* hold power lock since bitfield is not SMP-safe. */
spin_lock_irq(&dev->power.lock);
enabled = dev->power.memalloc_noio;
dev->power.memalloc_noio = enable;
spin_unlock_irq(&dev->power.lock);
/*
* not need to enable ancestors any more if the device
* has been enabled.
*/
if (enabled && enable)
break;
dev = dev->parent;
/*
* clear flag of the parent device only if all the
* children don't set the flag because ancestor's
* flag was set by any one of the descendants.
*/
if (!dev || (!enable &&
device_for_each_child(dev, NULL,
dev_memalloc_noio)))
break;
}
mutex_unlock(&dev_hotplug_mutex);
}
EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio);
/**
* rpm_check_suspend_allowed - Test whether a device may be suspended.
* @dev: Device to test.
*/
static int rpm_check_suspend_allowed(struct device *dev)
{
int retval = 0;
if (dev->power.runtime_error)
retval = -EINVAL;
else if (dev->power.disable_depth > 0)
retval = -EACCES;
else if (atomic_read(&dev->power.usage_count) > 0)
retval = -EAGAIN;
else if (!dev->power.ignore_children &&
atomic_read(&dev->power.child_count))
retval = -EBUSY;
/* Pending resume requests take precedence over suspends. */
else if ((dev->power.deferred_resume
&& dev->power.runtime_status == RPM_SUSPENDING)
|| (dev->power.request_pending
&& dev->power.request == RPM_REQ_RESUME))
retval = -EAGAIN;
else if (__dev_pm_qos_resume_latency(dev) == 0)
retval = -EPERM;
else if (dev->power.runtime_status == RPM_SUSPENDED)
retval = 1;
return retval;
}
static int rpm_get_suppliers(struct device *dev)
{
struct device_link *link;
list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
device_links_read_lock_held()) {
int retval;
if (!(link->flags & DL_FLAG_PM_RUNTIME))
continue;
retval = pm_runtime_get_sync(link->supplier);
/* Ignore suppliers with disabled runtime PM. */
if (retval < 0 && retval != -EACCES) {
pm_runtime_put_noidle(link->supplier);
return retval;
}
refcount_inc(&link->rpm_active);
}
return 0;
}
/**
* pm_runtime_release_supplier - Drop references to device link's supplier.
* @link: Target device link.
* @check_idle: Whether or not to check if the supplier device is idle.
*
* Drop all runtime PM references associated with @link to its supplier device
* and if @check_idle is set, check if that device is idle (and so it can be
* suspended).
*/
void pm_runtime_release_supplier(struct device_link *link, bool check_idle)
{
struct device *supplier = link->supplier;
/*
* The additional power.usage_count check is a safety net in case
* the rpm_active refcount becomes saturated, in which case
* refcount_dec_not_one() would return true forever, but it is not
* strictly necessary.
*/
while (refcount_dec_not_one(&link->rpm_active) &&
atomic_read(&supplier->power.usage_count) > 0)
pm_runtime_put_noidle(supplier);
if (check_idle)
pm_request_idle(supplier);
}
static void __rpm_put_suppliers(struct device *dev, bool try_to_suspend)
{
struct device_link *link;
list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
device_links_read_lock_held())
pm_runtime_release_supplier(link, try_to_suspend);
}
static void rpm_put_suppliers(struct device *dev)
{
__rpm_put_suppliers(dev, true);
}
static void rpm_suspend_suppliers(struct device *dev)
{
struct device_link *link;
int idx = device_links_read_lock();
list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
device_links_read_lock_held())
pm_request_idle(link->supplier);
device_links_read_unlock(idx);
}
/**
* __rpm_callback - Run a given runtime PM callback for a given device.
* @cb: Runtime PM callback to run.
* @dev: Device to run the callback for.
*/
static int __rpm_callback(int (*cb)(struct device *), struct device *dev)
__releases(&dev->power.lock) __acquires(&dev->power.lock)
{
int retval = 0, idx;
bool use_links = dev->power.links_count > 0;
if (dev->power.irq_safe) {
spin_unlock(&dev->power.lock);
} else {
spin_unlock_irq(&dev->power.lock);
/*
* Resume suppliers if necessary.
*
* The device's runtime PM status cannot change until this
* routine returns, so it is safe to read the status outside of
* the lock.
*/
if (use_links && dev->power.runtime_status == RPM_RESUMING) {
idx = device_links_read_lock();
retval = rpm_get_suppliers(dev);
if (retval) {
rpm_put_suppliers(dev);
goto fail;
}
device_links_read_unlock(idx);
}
}
if (cb)
retval = cb(dev);
if (dev->power.irq_safe) {
spin_lock(&dev->power.lock);
} else {
/*
* If the device is suspending and the callback has returned
* success, drop the usage counters of the suppliers that have
* been reference counted on its resume.
*
* Do that if resume fails too.
*/
if (use_links
&& ((dev->power.runtime_status == RPM_SUSPENDING && !retval)
|| (dev->power.runtime_status == RPM_RESUMING && retval))) {
idx = device_links_read_lock();
__rpm_put_suppliers(dev, false);
fail:
device_links_read_unlock(idx);
}
spin_lock_irq(&dev->power.lock);
}
return retval;
}
/**
* rpm_idle - Notify device bus type if the device can be suspended.
* @dev: Device to notify the bus type about.
* @rpmflags: Flag bits.
*
* Check if the device's runtime PM status allows it to be suspended. If
* another idle notification has been started earlier, return immediately. If
* the RPM_ASYNC flag is set then queue an idle-notification request; otherwise
* run the ->runtime_idle() callback directly. If the ->runtime_idle callback
* doesn't exist or if it returns 0, call rpm_suspend with the RPM_AUTO flag.
*
* This function must be called under dev->power.lock with interrupts disabled.
*/
static int rpm_idle(struct device *dev, int rpmflags)
{
int (*callback)(struct device *);
int retval;
trace_rpm_idle_rcuidle(dev, rpmflags);
retval = rpm_check_suspend_allowed(dev);
if (retval < 0)
; /* Conditions are wrong. */
/* Idle notifications are allowed only in the RPM_ACTIVE state. */
else if (dev->power.runtime_status != RPM_ACTIVE)
retval = -EAGAIN;
/*
* Any pending request other than an idle notification takes
* precedence over us, except that the timer may be running.
*/
else if (dev->power.request_pending &&
dev->power.request > RPM_REQ_IDLE)
retval = -EAGAIN;
/* Act as though RPM_NOWAIT is always set. */
else if (dev->power.idle_notification)
retval = -EINPROGRESS;
if (retval)
goto out;
/* Pending requests need to be canceled. */
dev->power.request = RPM_REQ_NONE;
callback = RPM_GET_CALLBACK(dev, runtime_idle);
/* If no callback assume success. */
if (!callback || dev->power.no_callbacks)
goto out;
/* Carry out an asynchronous or a synchronous idle notification. */
if (rpmflags & RPM_ASYNC) {
dev->power.request = RPM_REQ_IDLE;
if (!dev->power.request_pending) {
dev->power.request_pending = true;
queue_work(pm_wq, &dev->power.work);
}
trace_rpm_return_int_rcuidle(dev, _THIS_IP_, 0);
return 0;
}
dev->power.idle_notification = true;
retval = __rpm_callback(callback, dev);
dev->power.idle_notification = false;
wake_up_all(&dev->power.wait_queue);
out:
trace_rpm_return_int_rcuidle(dev, _THIS_IP_, retval);
return retval ? retval : rpm_suspend(dev, rpmflags | RPM_AUTO);
}
/**
* rpm_callback - Run a given runtime PM callback for a given device.
* @cb: Runtime PM callback to run.
* @dev: Device to run the callback for.
*/
static int rpm_callback(int (*cb)(struct device *), struct device *dev)
{
int retval;
if (dev->power.memalloc_noio) {
unsigned int noio_flag;
/*
* Deadlock might be caused if memory allocation with
* GFP_KERNEL happens inside runtime_suspend and
* runtime_resume callbacks of one block device's
* ancestor or the block device itself. Network
* device might be thought as part of iSCSI block
* device, so network device and its ancestor should
* be marked as memalloc_noio too.
*/
noio_flag = memalloc_noio_save();
retval = __rpm_callback(cb, dev);
memalloc_noio_restore(noio_flag);
} else {
retval = __rpm_callback(cb, dev);
}
dev->power.runtime_error = retval;
return retval != -EACCES ? retval : -EIO;
}
/**
* rpm_suspend - Carry out runtime suspend of given device.
* @dev: Device to suspend.
* @rpmflags: Flag bits.
*
* Check if the device's runtime PM status allows it to be suspended.
* Cancel a pending idle notification, autosuspend or suspend. If
* another suspend has been started earlier, either return immediately
* or wait for it to finish, depending on the RPM_NOWAIT and RPM_ASYNC
* flags. If the RPM_ASYNC flag is set then queue a suspend request;
* otherwise run the ->runtime_suspend() callback directly. When
* ->runtime_suspend succeeded, if a deferred resume was requested while
* the callback was running then carry it out, otherwise send an idle
* notification for its parent (if the suspend succeeded and both
* ignore_children of parent->power and irq_safe of dev->power are not set).
* If ->runtime_suspend failed with -EAGAIN or -EBUSY, and if the RPM_AUTO
* flag is set and the next autosuspend-delay expiration time is in the
* future, schedule another autosuspend attempt.
*
* This function must be called under dev->power.lock with interrupts disabled.
*/
static int rpm_suspend(struct device *dev, int rpmflags)
__releases(&dev->power.lock) __acquires(&dev->power.lock)
{
int (*callback)(struct device *);
struct device *parent = NULL;
int retval;
trace_rpm_suspend_rcuidle(dev, rpmflags);
repeat:
retval = rpm_check_suspend_allowed(dev);
if (retval < 0)
goto out; /* Conditions are wrong. */
/* Synchronous suspends are not allowed in the RPM_RESUMING state. */
if (dev->power.runtime_status == RPM_RESUMING && !(rpmflags & RPM_ASYNC))
retval = -EAGAIN;
if (retval)
goto out;
/* If the autosuspend_delay time hasn't expired yet, reschedule. */
if ((rpmflags & RPM_AUTO)
&& dev->power.runtime_status != RPM_SUSPENDING) {
u64 expires = pm_runtime_autosuspend_expiration(dev);
if (expires != 0) {
/* Pending requests need to be canceled. */
dev->power.request = RPM_REQ_NONE;
/*
* Optimization: If the timer is already running and is
* set to expire at or before the autosuspend delay,
* avoid the overhead of resetting it. Just let it
* expire; pm_suspend_timer_fn() will take care of the
* rest.
*/
if (!(dev->power.timer_expires &&
dev->power.timer_expires <= expires)) {
/*
* We add a slack of 25% to gather wakeups
* without sacrificing the granularity.
*/
u64 slack = (u64)READ_ONCE(dev->power.autosuspend_delay) *
(NSEC_PER_MSEC >> 2);
dev->power.timer_expires = expires;
hrtimer_start_range_ns(&dev->power.suspend_timer,
ns_to_ktime(expires),
slack,
HRTIMER_MODE_ABS);
}
dev->power.timer_autosuspends = 1;
goto out;
}
}
/* Other scheduled or pending requests need to be canceled. */
pm_runtime_cancel_pending(dev);
if (dev->power.runtime_status == RPM_SUSPENDING) {
DEFINE_WAIT(wait);
if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) {
retval = -EINPROGRESS;
goto out;
}
if (dev->power.irq_safe) {
spin_unlock(&dev->power.lock);
cpu_relax();
spin_lock(&dev->power.lock);
goto repeat;
}
/* Wait for the other suspend running in parallel with us. */
for (;;) {
prepare_to_wait(&dev->power.wait_queue, &wait,
TASK_UNINTERRUPTIBLE);
if (dev->power.runtime_status != RPM_SUSPENDING)
break;
spin_unlock_irq(&dev->power.lock);
schedule();
spin_lock_irq(&dev->power.lock);
}
finish_wait(&dev->power.wait_queue, &wait);
goto repeat;
}
if (dev->power.no_callbacks)
goto no_callback; /* Assume success. */
/* Carry out an asynchronous or a synchronous suspend. */
if (rpmflags & RPM_ASYNC) {
dev->power.request = (rpmflags & RPM_AUTO) ?
RPM_REQ_AUTOSUSPEND : RPM_REQ_SUSPEND;
if (!dev->power.request_pending) {
dev->power.request_pending = true;
queue_work(pm_wq, &dev->power.work);
}
goto out;
}
__update_runtime_status(dev, RPM_SUSPENDING);
callback = RPM_GET_CALLBACK(dev, runtime_suspend);
dev_pm_enable_wake_irq_check(dev, true);
retval = rpm_callback(callback, dev);
if (retval)
goto fail;
no_callback:
__update_runtime_status(dev, RPM_SUSPENDED);
pm_runtime_deactivate_timer(dev);
if (dev->parent) {
parent = dev->parent;
atomic_add_unless(&parent->power.child_count, -1, 0);
}
wake_up_all(&dev->power.wait_queue);
if (dev->power.deferred_resume) {
dev->power.deferred_resume = false;
rpm_resume(dev, 0);
retval = -EAGAIN;
goto out;
}
if (dev->power.irq_safe)
goto out;
/* Maybe the parent is now able to suspend. */
if (parent && !parent->power.ignore_children) {
spin_unlock(&dev->power.lock);
spin_lock(&parent->power.lock);
rpm_idle(parent, RPM_ASYNC);
spin_unlock(&parent->power.lock);
spin_lock(&dev->power.lock);
}
/* Maybe the suppliers are now able to suspend. */
if (dev->power.links_count > 0) {
spin_unlock_irq(&dev->power.lock);
rpm_suspend_suppliers(dev);
spin_lock_irq(&dev->power.lock);
}
out:
trace_rpm_return_int_rcuidle(dev, _THIS_IP_, retval);
return retval;
fail:
dev_pm_disable_wake_irq_check(dev);
__update_runtime_status(dev, RPM_ACTIVE);
dev->power.deferred_resume = false;
wake_up_all(&dev->power.wait_queue);
if (retval == -EAGAIN || retval == -EBUSY) {
dev->power.runtime_error = 0;
/*
* If the callback routine failed an autosuspend, and
* if the last_busy time has been updated so that there
* is a new autosuspend expiration time, automatically
* reschedule another autosuspend.
*/
if ((rpmflags & RPM_AUTO) &&
pm_runtime_autosuspend_expiration(dev) != 0)
goto repeat;
} else {
pm_runtime_cancel_pending(dev);
}
goto out;
}
/**
* rpm_resume - Carry out runtime resume of given device.
* @dev: Device to resume.
* @rpmflags: Flag bits.
*
* Check if the device's runtime PM status allows it to be resumed. Cancel
* any scheduled or pending requests. If another resume has been started
* earlier, either return immediately or wait for it to finish, depending on the
* RPM_NOWAIT and RPM_ASYNC flags. Similarly, if there's a suspend running in
* parallel with this function, either tell the other process to resume after
* suspending (deferred_resume) or wait for it to finish. If the RPM_ASYNC
* flag is set then queue a resume request; otherwise run the
* ->runtime_resume() callback directly. Queue an idle notification for the
* device if the resume succeeded.
*
* This function must be called under dev->power.lock with interrupts disabled.
*/
static int rpm_resume(struct device *dev, int rpmflags)
__releases(&dev->power.lock) __acquires(&dev->power.lock)
{
int (*callback)(struct device *);
struct device *parent = NULL;
int retval = 0;
trace_rpm_resume_rcuidle(dev, rpmflags);
repeat:
if (dev->power.runtime_error)
retval = -EINVAL;
else if (dev->power.disable_depth == 1 && dev->power.is_suspended
&& dev->power.runtime_status == RPM_ACTIVE)
retval = 1;
else if (dev->power.disable_depth > 0)
retval = -EACCES;
if (retval)
goto out;
/*
* Other scheduled or pending requests need to be canceled. Small
* optimization: If an autosuspend timer is running, leave it running
* rather than cancelling it now only to restart it again in the near
* future.
*/
dev->power.request = RPM_REQ_NONE;
if (!dev->power.timer_autosuspends)
pm_runtime_deactivate_timer(dev);
if (dev->power.runtime_status == RPM_ACTIVE) {
retval = 1;
goto out;
}
if (dev->power.runtime_status == RPM_RESUMING
|| dev->power.runtime_status == RPM_SUSPENDING) {
DEFINE_WAIT(wait);
if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) {
if (dev->power.runtime_status == RPM_SUSPENDING)
dev->power.deferred_resume = true;
else
retval = -EINPROGRESS;
goto out;
}
if (dev->power.irq_safe) {
spin_unlock(&dev->power.lock);
cpu_relax();
spin_lock(&dev->power.lock);
goto repeat;
}
/* Wait for the operation carried out in parallel with us. */
for (;;) {
prepare_to_wait(&dev->power.wait_queue, &wait,
TASK_UNINTERRUPTIBLE);
if (dev->power.runtime_status != RPM_RESUMING
&& dev->power.runtime_status != RPM_SUSPENDING)
break;
spin_unlock_irq(&dev->power.lock);
schedule();
spin_lock_irq(&dev->power.lock);
}
finish_wait(&dev->power.wait_queue, &wait);
goto repeat;
}
/*
* See if we can skip waking up the parent. This is safe only if
* power.no_callbacks is set, because otherwise we don't know whether
* the resume will actually succeed.
*/
if (dev->power.no_callbacks && !parent && dev->parent) {
spin_lock_nested(&dev->parent->power.lock, SINGLE_DEPTH_NESTING);
if (dev->parent->power.disable_depth > 0
|| dev->parent->power.ignore_children
|| dev->parent->power.runtime_status == RPM_ACTIVE) {
atomic_inc(&dev->parent->power.child_count);
spin_unlock(&dev->parent->power.lock);
retval = 1;
goto no_callback; /* Assume success. */
}
spin_unlock(&dev->parent->power.lock);
}
/* Carry out an asynchronous or a synchronous resume. */
if (rpmflags & RPM_ASYNC) {
dev->power.request = RPM_REQ_RESUME;
if (!dev->power.request_pending) {
dev->power.request_pending = true;
queue_work(pm_wq, &dev->power.work);
}
retval = 0;
goto out;
}
if (!parent && dev->parent) {
/*
* Increment the parent's usage counter and resume it if
* necessary. Not needed if dev is irq-safe; then the
* parent is permanently resumed.
*/
parent = dev->parent;
if (dev->power.irq_safe)
goto skip_parent;
spin_unlock(&dev->power.lock);
pm_runtime_get_noresume(parent);
spin_lock(&parent->power.lock);
/*
* Resume the parent if it has runtime PM enabled and not been
* set to ignore its children.
*/
if (!parent->power.disable_depth
&& !parent->power.ignore_children) {
rpm_resume(parent, 0);
if (parent->power.runtime_status != RPM_ACTIVE)
retval = -EBUSY;
}
spin_unlock(&parent->power.lock);
spin_lock(&dev->power.lock);
if (retval)
goto out;
goto repeat;
}
skip_parent:
if (dev->power.no_callbacks)
goto no_callback; /* Assume success. */
__update_runtime_status(dev, RPM_RESUMING);
callback = RPM_GET_CALLBACK(dev, runtime_resume);
dev_pm_disable_wake_irq_check(dev);
retval = rpm_callback(callback, dev);
if (retval) {
__update_runtime_status(dev, RPM_SUSPENDED);
pm_runtime_cancel_pending(dev);
dev_pm_enable_wake_irq_check(dev, false);
} else {
no_callback:
__update_runtime_status(dev, RPM_ACTIVE);
pm_runtime_mark_last_busy(dev);
if (parent)
atomic_inc(&parent->power.child_count);
}
wake_up_all(&dev->power.wait_queue);
if (retval >= 0)
rpm_idle(dev, RPM_ASYNC);
out:
if (parent && !dev->power.irq_safe) {
spin_unlock_irq(&dev->power.lock);
pm_runtime_put(parent);
spin_lock_irq(&dev->power.lock);
}
trace_rpm_return_int_rcuidle(dev, _THIS_IP_, retval);
return retval;
}
/**
* pm_runtime_work - Universal runtime PM work function.
* @work: Work structure used for scheduling the execution of this function.
*
* Use @work to get the device object the work is to be done for, determine what
* is to be done and execute the appropriate runtime PM function.
*/
static void pm_runtime_work(struct work_struct *work)
{
struct device *dev = container_of(work, struct device, power.work);
enum rpm_request req;
spin_lock_irq(&dev->power.lock);
if (!dev->power.request_pending)
goto out;
req = dev->power.request;
dev->power.request = RPM_REQ_NONE;
dev->power.request_pending = false;
switch (req) {
case RPM_REQ_NONE:
break;
case RPM_REQ_IDLE:
rpm_idle(dev, RPM_NOWAIT);
break;
case RPM_REQ_SUSPEND:
rpm_suspend(dev, RPM_NOWAIT);
break;
case RPM_REQ_AUTOSUSPEND:
rpm_suspend(dev, RPM_NOWAIT | RPM_AUTO);
break;
case RPM_REQ_RESUME:
rpm_resume(dev, RPM_NOWAIT);
break;
}
out:
spin_unlock_irq(&dev->power.lock);
}
/**
* pm_suspend_timer_fn - Timer function for pm_schedule_suspend().
* @timer: hrtimer used by pm_schedule_suspend().
*
* Check if the time is right and queue a suspend request.
*/
static enum hrtimer_restart pm_suspend_timer_fn(struct hrtimer *timer)
{
struct device *dev = container_of(timer, struct device, power.suspend_timer);
unsigned long flags;
u64 expires;
spin_lock_irqsave(&dev->power.lock, flags);
expires = dev->power.timer_expires;
/*
* If 'expires' is after the current time, we've been called
* too early.
*/
if (expires > 0 && expires < ktime_get_mono_fast_ns()) {
dev->power.timer_expires = 0;
rpm_suspend(dev, dev->power.timer_autosuspends ?
(RPM_ASYNC | RPM_AUTO) : RPM_ASYNC);
}
spin_unlock_irqrestore(&dev->power.lock, flags);
return HRTIMER_NORESTART;
}
/**
* pm_schedule_suspend - Set up a timer to submit a suspend request in future.
* @dev: Device to suspend.
* @delay: Time to wait before submitting a suspend request, in milliseconds.
*/
int pm_schedule_suspend(struct device *dev, unsigned int delay)
{
unsigned long flags;
u64 expires;
int retval;
spin_lock_irqsave(&dev->power.lock, flags);
if (!delay) {
retval = rpm_suspend(dev, RPM_ASYNC);
goto out;
}
retval = rpm_check_suspend_allowed(dev);
if (retval)
goto out;
/* Other scheduled or pending requests need to be canceled. */
pm_runtime_cancel_pending(dev);
expires = ktime_get_mono_fast_ns() + (u64)delay * NSEC_PER_MSEC;
dev->power.timer_expires = expires;
dev->power.timer_autosuspends = 0;
hrtimer_start(&dev->power.suspend_timer, expires, HRTIMER_MODE_ABS);
out:
spin_unlock_irqrestore(&dev->power.lock, flags);
return retval;
}
EXPORT_SYMBOL_GPL(pm_schedule_suspend);
/**
* __pm_runtime_idle - Entry point for runtime idle operations.
* @dev: Device to send idle notification for.
* @rpmflags: Flag bits.
*
* If the RPM_GET_PUT flag is set, decrement the device's usage count and
* return immediately if it is larger than zero. Then carry out an idle
* notification, either synchronous or asynchronous.
*
* This routine may be called in atomic context if the RPM_ASYNC flag is set,
* or if pm_runtime_irq_safe() has been called.
*/
int __pm_runtime_idle(struct device *dev, int rpmflags)
{
unsigned long flags;
int retval;
if (rpmflags & RPM_GET_PUT) {
if (!atomic_dec_and_test(&dev->power.usage_count)) {
trace_rpm_usage_rcuidle(dev, rpmflags);
return 0;
}
}
might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe);
spin_lock_irqsave(&dev->power.lock, flags);
retval = rpm_idle(dev, rpmflags);
spin_unlock_irqrestore(&dev->power.lock, flags);
return retval;
}
EXPORT_SYMBOL_GPL(__pm_runtime_idle);
/**
* __pm_runtime_suspend - Entry point for runtime put/suspend operations.
* @dev: Device to suspend.
* @rpmflags: Flag bits.
*
* If the RPM_GET_PUT flag is set, decrement the device's usage count and
* return immediately if it is larger than zero. Then carry out a suspend,
* either synchronous or asynchronous.
*
* This routine may be called in atomic context if the RPM_ASYNC flag is set,
* or if pm_runtime_irq_safe() has been called.
*/
int __pm_runtime_suspend(struct device *dev, int rpmflags)
{
unsigned long flags;
int retval;
if (rpmflags & RPM_GET_PUT) {
if (!atomic_dec_and_test(&dev->power.usage_count)) {
trace_rpm_usage_rcuidle(dev, rpmflags);
return 0;
}
}
might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe);
spin_lock_irqsave(&dev->power.lock, flags);
retval = rpm_suspend(dev, rpmflags);
spin_unlock_irqrestore(&dev->power.lock, flags);
return retval;
}
EXPORT_SYMBOL_GPL(__pm_runtime_suspend);
/**
* __pm_runtime_resume - Entry point for runtime resume operations.
* @dev: Device to resume.
* @rpmflags: Flag bits.
*
* If the RPM_GET_PUT flag is set, increment the device's usage count. Then
* carry out a resume, either synchronous or asynchronous.
*
* This routine may be called in atomic context if the RPM_ASYNC flag is set,
* or if pm_runtime_irq_safe() has been called.
*/
int __pm_runtime_resume(struct device *dev, int rpmflags)
{
unsigned long flags;
int retval;
might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe &&
dev->power.runtime_status != RPM_ACTIVE);
if (rpmflags & RPM_GET_PUT)
atomic_inc(&dev->power.usage_count);
spin_lock_irqsave(&dev->power.lock, flags);
retval = rpm_resume(dev, rpmflags);
spin_unlock_irqrestore(&dev->power.lock, flags);
return retval;
}
EXPORT_SYMBOL_GPL(__pm_runtime_resume);
/**
* pm_runtime_get_if_active - Conditionally bump up device usage counter.
* @dev: Device to handle.
* @ign_usage_count: Whether or not to look at the current usage counter value.
*
* Return -EINVAL if runtime PM is disabled for @dev.
*
* Otherwise, if the runtime PM status of @dev is %RPM_ACTIVE and either
* @ign_usage_count is %true or the runtime PM usage counter of @dev is not
* zero, increment the usage counter of @dev and return 1. Otherwise, return 0
* without changing the usage counter.
*
* If @ign_usage_count is %true, this function can be used to prevent suspending
* the device when its runtime PM status is %RPM_ACTIVE.
*
* If @ign_usage_count is %false, this function can be used to prevent
* suspending the device when both its runtime PM status is %RPM_ACTIVE and its
* runtime PM usage counter is not zero.
*
* The caller is responsible for decrementing the runtime PM usage counter of
* @dev after this function has returned a positive value for it.
*/
int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count)
{
unsigned long flags;
int retval;
spin_lock_irqsave(&dev->power.lock, flags);
if (dev->power.disable_depth > 0) {
retval = -EINVAL;
} else if (dev->power.runtime_status != RPM_ACTIVE) {
retval = 0;
} else if (ign_usage_count) {
retval = 1;
atomic_inc(&dev->power.usage_count);
} else {
retval = atomic_inc_not_zero(&dev->power.usage_count);
}
trace_rpm_usage_rcuidle(dev, 0);
spin_unlock_irqrestore(&dev->power.lock, flags);
return retval;
}
EXPORT_SYMBOL_GPL(pm_runtime_get_if_active);
/**
* __pm_runtime_set_status - Set runtime PM status of a device.
* @dev: Device to handle.
* @status: New runtime PM status of the device.
*
* If runtime PM of the device is disabled or its power.runtime_error field is
* different from zero, the status may be changed either to RPM_ACTIVE, or to
* RPM_SUSPENDED, as long as that reflects the actual state of the device.
* However, if the device has a parent and the parent is not active, and the
* parent's power.ignore_children flag is unset, the device's status cannot be
* set to RPM_ACTIVE, so -EBUSY is returned in that case.
*
* If successful, __pm_runtime_set_status() clears the power.runtime_error field
* and the device parent's counter of unsuspended children is modified to
* reflect the new status. If the new status is RPM_SUSPENDED, an idle
* notification request for the parent is submitted.
*
* If @dev has any suppliers (as reflected by device links to them), and @status
* is RPM_ACTIVE, they will be activated upfront and if the activation of one
* of them fails, the status of @dev will be changed to RPM_SUSPENDED (instead
* of the @status value) and the suppliers will be deacticated on exit. The
* error returned by the failing supplier activation will be returned in that
* case.
*/
int __pm_runtime_set_status(struct device *dev, unsigned int status)
{
struct device *parent = dev->parent;
bool notify_parent = false;
int error = 0;
if (status != RPM_ACTIVE && status != RPM_SUSPENDED)
return -EINVAL;
spin_lock_irq(&dev->power.lock);
/*
* Prevent PM-runtime from being enabled for the device or return an
* error if it is enabled already and working.
*/
if (dev->power.runtime_error || dev->power.disable_depth)
dev->power.disable_depth++;
else
error = -EAGAIN;
spin_unlock_irq(&dev->power.lock);
if (error)
return error;
/*
* If the new status is RPM_ACTIVE, the suppliers can be activated
* upfront regardless of the current status, because next time
* rpm_put_suppliers() runs, the rpm_active refcounts of the links
* involved will be dropped down to one anyway.
*/
if (status == RPM_ACTIVE) {
int idx = device_links_read_lock();
error = rpm_get_suppliers(dev);
if (error)
status = RPM_SUSPENDED;
device_links_read_unlock(idx);
}
spin_lock_irq(&dev->power.lock);
if (dev->power.runtime_status == status || !parent)
goto out_set;
if (status == RPM_SUSPENDED) {
atomic_add_unless(&parent->power.child_count, -1, 0);
notify_parent = !parent->power.ignore_children;
} else {
spin_lock_nested(&parent->power.lock, SINGLE_DEPTH_NESTING);
/*
* It is invalid to put an active child under a parent that is
* not active, has runtime PM enabled and the
* 'power.ignore_children' flag unset.
*/
if (!parent->power.disable_depth
&& !parent->power.ignore_children
&& parent->power.runtime_status != RPM_ACTIVE) {
dev_err(dev, "runtime PM trying to activate child device %s but parent (%s) is not active\n",
dev_name(dev),
dev_name(parent));
error = -EBUSY;
} else if (dev->power.runtime_status == RPM_SUSPENDED) {
atomic_inc(&parent->power.child_count);
}
spin_unlock(&parent->power.lock);
if (error) {
status = RPM_SUSPENDED;
goto out;
}
}
out_set:
__update_runtime_status(dev, status);
if (!error)
dev->power.runtime_error = 0;
out:
spin_unlock_irq(&dev->power.lock);
if (notify_parent)
pm_request_idle(parent);
if (status == RPM_SUSPENDED) {
int idx = device_links_read_lock();
rpm_put_suppliers(dev);
device_links_read_unlock(idx);
}
pm_runtime_enable(dev);
return error;
}
EXPORT_SYMBOL_GPL(__pm_runtime_set_status);
/**
* __pm_runtime_barrier - Cancel pending requests and wait for completions.
* @dev: Device to handle.
*
* Flush all pending requests for the device from pm_wq and wait for all
* runtime PM operations involving the device in progress to complete.
*
* Should be called under dev->power.lock with interrupts disabled.
*/
static void __pm_runtime_barrier(struct device *dev)
{
pm_runtime_deactivate_timer(dev);
if (dev->power.request_pending) {
dev->power.request = RPM_REQ_NONE;
spin_unlock_irq(&dev->power.lock);
cancel_work_sync(&dev->power.work);
spin_lock_irq(&dev->power.lock);
dev->power.request_pending = false;
}
if (dev->power.runtime_status == RPM_SUSPENDING
|| dev->power.runtime_status == RPM_RESUMING
|| dev->power.idle_notification) {
DEFINE_WAIT(wait);
/* Suspend, wake-up or idle notification in progress. */
for (;;) {
prepare_to_wait(&dev->power.wait_queue, &wait,
TASK_UNINTERRUPTIBLE);
if (dev->power.runtime_status != RPM_SUSPENDING
&& dev->power.runtime_status != RPM_RESUMING
&& !dev->power.idle_notification)
break;
spin_unlock_irq(&dev->power.lock);
schedule();
spin_lock_irq(&dev->power.lock);
}
finish_wait(&dev->power.wait_queue, &wait);
}
}
/**
* pm_runtime_barrier - Flush pending requests and wait for completions.
* @dev: Device to handle.
*
* Prevent the device from being suspended by incrementing its usage counter and
* if there's a pending resume request for the device, wake the device up.
* Next, make sure that all pending requests for the device have been flushed
* from pm_wq and wait for all runtime PM operations involving the device in
* progress to complete.
*
* Return value:
* 1, if there was a resume request pending and the device had to be woken up,
* 0, otherwise
*/
int pm_runtime_barrier(struct device *dev)
{
int retval = 0;
pm_runtime_get_noresume(dev);
spin_lock_irq(&dev->power.lock);
if (dev->power.request_pending
&& dev->power.request == RPM_REQ_RESUME) {
rpm_resume(dev, 0);
retval = 1;
}
__pm_runtime_barrier(dev);
spin_unlock_irq(&dev->power.lock);
pm_runtime_put_noidle(dev);
return retval;
}
EXPORT_SYMBOL_GPL(pm_runtime_barrier);
/**
* __pm_runtime_disable - Disable runtime PM of a device.
* @dev: Device to handle.
* @check_resume: If set, check if there's a resume request for the device.
*
* Increment power.disable_depth for the device and if it was zero previously,
* cancel all pending runtime PM requests for the device and wait for all
* operations in progress to complete. The device can be either active or
* suspended after its runtime PM has been disabled.
*
* If @check_resume is set and there's a resume request pending when
* __pm_runtime_disable() is called and power.disable_depth is zero, the
* function will wake up the device before disabling its runtime PM.
*/
void __pm_runtime_disable(struct device *dev, bool check_resume)
{
spin_lock_irq(&dev->power.lock);
if (dev->power.disable_depth > 0) {
dev->power.disable_depth++;
goto out;
}
/*
* Wake up the device if there's a resume request pending, because that
* means there probably is some I/O to process and disabling runtime PM
* shouldn't prevent the device from processing the I/O.
*/
if (check_resume && dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) {
/*
* Prevent suspends and idle notifications from being carried
* out after we have woken up the device.
*/
pm_runtime_get_noresume(dev);
rpm_resume(dev, 0);
pm_runtime_put_noidle(dev);
}
/* Update time accounting before disabling PM-runtime. */
update_pm_runtime_accounting(dev);
if (!dev->power.disable_depth++) __pm_runtime_barrier(dev);
out:
spin_unlock_irq(&dev->power.lock);
}
EXPORT_SYMBOL_GPL(__pm_runtime_disable);
/**
* pm_runtime_enable - Enable runtime PM of a device.
* @dev: Device to handle.
*/
void pm_runtime_enable(struct device *dev)
{
unsigned long flags;
spin_lock_irqsave(&dev->power.lock, flags);
if (dev->power.disable_depth > 0) {
dev->power.disable_depth--;
/* About to enable runtime pm, set accounting_timestamp to now */
if (!dev->power.disable_depth)
dev->power.accounting_timestamp = ktime_get_mono_fast_ns();
} else {
dev_warn(dev, "Unbalanced %s!\n", __func__);
}
WARN(!dev->power.disable_depth &&
dev->power.runtime_status == RPM_SUSPENDED &&
!dev->power.ignore_children &&
atomic_read(&dev->power.child_count) > 0,
"Enabling runtime PM for inactive device (%s) with active children\n",
dev_name(dev));
spin_unlock_irqrestore(&dev->power.lock, flags);
}
EXPORT_SYMBOL_GPL(pm_runtime_enable);
static void pm_runtime_disable_action(void *data)
{
pm_runtime_disable(data);
}
/**
* devm_pm_runtime_enable - devres-enabled version of pm_runtime_enable.
* @dev: Device to handle.
*/
int devm_pm_runtime_enable(struct device *dev)
{
pm_runtime_enable(dev);
return devm_add_action_or_reset(dev, pm_runtime_disable_action, dev);
}
EXPORT_SYMBOL_GPL(devm_pm_runtime_enable);
/**
* pm_runtime_forbid - Block runtime PM of a device.
* @dev: Device to handle.
*
* Increase the device's usage count and clear its power.runtime_auto flag,
* so that it cannot be suspended at run time until pm_runtime_allow() is called
* for it.
*/
void pm_runtime_forbid(struct device *dev)
{
spin_lock_irq(&dev->power.lock);
if (!dev->power.runtime_auto)
goto out;
dev->power.runtime_auto = false;
atomic_inc(&dev->power.usage_count);
rpm_resume(dev, 0);
out:
spin_unlock_irq(&dev->power.lock);
}
EXPORT_SYMBOL_GPL(pm_runtime_forbid);
/**
* pm_runtime_allow - Unblock runtime PM of a device.
* @dev: Device to handle.
*
* Decrease the device's usage count and set its power.runtime_auto flag.
*/
void pm_runtime_allow(struct device *dev)
{
spin_lock_irq(&dev->power.lock);
if (dev->power.runtime_auto)
goto out;
dev->power.runtime_auto = true;
if (atomic_dec_and_test(&dev->power.usage_count))
rpm_idle(dev, RPM_AUTO | RPM_ASYNC);
else
trace_rpm_usage_rcuidle(dev, RPM_AUTO | RPM_ASYNC);
out:
spin_unlock_irq(&dev->power.lock);
}
EXPORT_SYMBOL_GPL(pm_runtime_allow);
/**
* pm_runtime_no_callbacks - Ignore runtime PM callbacks for a device.
* @dev: Device to handle.
*
* Set the power.no_callbacks flag, which tells the PM core that this
* device is power-managed through its parent and has no runtime PM
* callbacks of its own. The runtime sysfs attributes will be removed.
*/
void pm_runtime_no_callbacks(struct device *dev)
{
spin_lock_irq(&dev->power.lock);
dev->power.no_callbacks = 1;
spin_unlock_irq(&dev->power.lock);
if (device_is_registered(dev))
rpm_sysfs_remove(dev);
}
EXPORT_SYMBOL_GPL(pm_runtime_no_callbacks);
/**
* pm_runtime_irq_safe - Leave interrupts disabled during callbacks.
* @dev: Device to handle
*
* Set the power.irq_safe flag, which tells the PM core that the
* ->runtime_suspend() and ->runtime_resume() callbacks for this device should
* always be invoked with the spinlock held and interrupts disabled. It also
* causes the parent's usage counter to be permanently incremented, preventing
* the parent from runtime suspending -- otherwise an irq-safe child might have
* to wait for a non-irq-safe parent.
*/
void pm_runtime_irq_safe(struct device *dev)
{
if (dev->parent)
pm_runtime_get_sync(dev->parent);
spin_lock_irq(&dev->power.lock);
dev->power.irq_safe = 1;
spin_unlock_irq(&dev->power.lock);
}
EXPORT_SYMBOL_GPL(pm_runtime_irq_safe);
/**
* update_autosuspend - Handle a change to a device's autosuspend settings.
* @dev: Device to handle.
* @old_delay: The former autosuspend_delay value.
* @old_use: The former use_autosuspend value.
*
* Prevent runtime suspend if the new delay is negative and use_autosuspend is
* set; otherwise allow it. Send an idle notification if suspends are allowed.
*
* This function must be called under dev->power.lock with interrupts disabled.
*/
static void update_autosuspend(struct device *dev, int old_delay, int old_use)
{
int delay = dev->power.autosuspend_delay;
/* Should runtime suspend be prevented now? */
if (dev->power.use_autosuspend && delay < 0) {
/* If it used to be allowed then prevent it. */
if (!old_use || old_delay >= 0) {
atomic_inc(&dev->power.usage_count);
rpm_resume(dev, 0);
} else {
trace_rpm_usage_rcuidle(dev, 0);
}
}
/* Runtime suspend should be allowed now. */
else {
/* If it used to be prevented then allow it. */
if (old_use && old_delay < 0)
atomic_dec(&dev->power.usage_count);
/* Maybe we can autosuspend now. */
rpm_idle(dev, RPM_AUTO);
}
}
/**
* pm_runtime_set_autosuspend_delay - Set a device's autosuspend_delay value.
* @dev: Device to handle.
* @delay: Value of the new delay in milliseconds.
*
* Set the device's power.autosuspend_delay value. If it changes to negative
* and the power.use_autosuspend flag is set, prevent runtime suspends. If it
* changes the other way, allow runtime suspends.
*/
void pm_runtime_set_autosuspend_delay(struct device *dev, int delay)
{
int old_delay, old_use;
spin_lock_irq(&dev->power.lock);
old_delay = dev->power.autosuspend_delay;
old_use = dev->power.use_autosuspend;
dev->power.autosuspend_delay = delay;
update_autosuspend(dev, old_delay, old_use);
spin_unlock_irq(&dev->power.lock);
}
EXPORT_SYMBOL_GPL(pm_runtime_set_autosuspend_delay);
/**
* __pm_runtime_use_autosuspend - Set a device's use_autosuspend flag.
* @dev: Device to handle.
* @use: New value for use_autosuspend.
*
* Set the device's power.use_autosuspend flag, and allow or prevent runtime
* suspends as needed.
*/
void __pm_runtime_use_autosuspend(struct device *dev, bool use)
{
int old_delay, old_use;
spin_lock_irq(&dev->power.lock);
old_delay = dev->power.autosuspend_delay;
old_use = dev->power.use_autosuspend;
dev->power.use_autosuspend = use;
update_autosuspend(dev, old_delay, old_use);
spin_unlock_irq(&dev->power.lock);
}
EXPORT_SYMBOL_GPL(__pm_runtime_use_autosuspend);
/**
* pm_runtime_init - Initialize runtime PM fields in given device object.
* @dev: Device object to initialize.
*/
void pm_runtime_init(struct device *dev)
{
dev->power.runtime_status = RPM_SUSPENDED;
dev->power.idle_notification = false;
dev->power.disable_depth = 1;
atomic_set(&dev->power.usage_count, 0);
dev->power.runtime_error = 0;
atomic_set(&dev->power.child_count, 0);
pm_suspend_ignore_children(dev, false);
dev->power.runtime_auto = true;
dev->power.request_pending = false;
dev->power.request = RPM_REQ_NONE;
dev->power.deferred_resume = false;
dev->power.needs_force_resume = 0;
INIT_WORK(&dev->power.work, pm_runtime_work);
dev->power.timer_expires = 0;
hrtimer_init(&dev->power.suspend_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
dev->power.suspend_timer.function = pm_suspend_timer_fn;
init_waitqueue_head(&dev->power.wait_queue);
}
/**
* pm_runtime_reinit - Re-initialize runtime PM fields in given device object.
* @dev: Device object to re-initialize.
*/
void pm_runtime_reinit(struct device *dev)
{
if (!pm_runtime_enabled(dev)) {
if (dev->power.runtime_status == RPM_ACTIVE)
pm_runtime_set_suspended(dev);
if (dev->power.irq_safe) {
spin_lock_irq(&dev->power.lock);
dev->power.irq_safe = 0;
spin_unlock_irq(&dev->power.lock);
if (dev->parent)
pm_runtime_put(dev->parent);
}
}
}
/**
* pm_runtime_remove - Prepare for removing a device from device hierarchy.
* @dev: Device object being removed from device hierarchy.
*/
void pm_runtime_remove(struct device *dev)
{
__pm_runtime_disable(dev, false);
pm_runtime_reinit(dev);
}
/**
* pm_runtime_get_suppliers - Resume and reference-count supplier devices.
* @dev: Consumer device.
*/
void pm_runtime_get_suppliers(struct device *dev)
{
struct device_link *link;
int idx;
idx = device_links_read_lock();
list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
device_links_read_lock_held())
if (link->flags & DL_FLAG_PM_RUNTIME) {
link->supplier_preactivated = true;
pm_runtime_get_sync(link->supplier);
refcount_inc(&link->rpm_active);
}
device_links_read_unlock(idx);
}
/**
* pm_runtime_put_suppliers - Drop references to supplier devices.
* @dev: Consumer device.
*/
void pm_runtime_put_suppliers(struct device *dev)
{
struct device_link *link;
unsigned long flags;
bool put;
int idx;
idx = device_links_read_lock();
list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
device_links_read_lock_held())
if (link->supplier_preactivated) {
link->supplier_preactivated = false;
spin_lock_irqsave(&dev->power.lock, flags);
put = pm_runtime_status_suspended(dev) &&
refcount_dec_not_one(&link->rpm_active);
spin_unlock_irqrestore(&dev->power.lock, flags);
if (put)
pm_runtime_put(link->supplier);
}
device_links_read_unlock(idx);
}
void pm_runtime_new_link(struct device *dev)
{
spin_lock_irq(&dev->power.lock);
dev->power.links_count++;
spin_unlock_irq(&dev->power.lock);
}
static void pm_runtime_drop_link_count(struct device *dev)
{
spin_lock_irq(&dev->power.lock);
WARN_ON(dev->power.links_count == 0);
dev->power.links_count--;
spin_unlock_irq(&dev->power.lock);
}
/**
* pm_runtime_drop_link - Prepare for device link removal.
* @link: Device link going away.
*
* Drop the link count of the consumer end of @link and decrement the supplier
* device's runtime PM usage counter as many times as needed to drop all of the
* PM runtime reference to it from the consumer.
*/
void pm_runtime_drop_link(struct device_link *link)
{
if (!(link->flags & DL_FLAG_PM_RUNTIME))
return;
pm_runtime_drop_link_count(link->consumer);
pm_runtime_release_supplier(link, true);
}
static bool pm_runtime_need_not_resume(struct device *dev)
{
return atomic_read(&dev->power.usage_count) <= 1 &&
(atomic_read(&dev->power.child_count) == 0 ||
dev->power.ignore_children);
}
/**
* pm_runtime_force_suspend - Force a device into suspend state if needed.
* @dev: Device to suspend.
*
* Disable runtime PM so we safely can check the device's runtime PM status and
* if it is active, invoke its ->runtime_suspend callback to suspend it and
* change its runtime PM status field to RPM_SUSPENDED. Also, if the device's
* usage and children counters don't indicate that the device was in use before
* the system-wide transition under way, decrement its parent's children counter
* (if there is a parent). Keep runtime PM disabled to preserve the state
* unless we encounter errors.
*
* Typically this function may be invoked from a system suspend callback to make
* sure the device is put into low power state and it should only be used during
* system-wide PM transitions to sleep states. It assumes that the analogous
* pm_runtime_force_resume() will be used to resume the device.
*/
int pm_runtime_force_suspend(struct device *dev)
{
int (*callback)(struct device *);
int ret;
pm_runtime_disable(dev);
if (pm_runtime_status_suspended(dev))
return 0;
callback = RPM_GET_CALLBACK(dev, runtime_suspend);
ret = callback ? callback(dev) : 0;
if (ret)
goto err;
/*
* If the device can stay in suspend after the system-wide transition
* to the working state that will follow, drop the children counter of
* its parent, but set its status to RPM_SUSPENDED anyway in case this
* function will be called again for it in the meantime.
*/
if (pm_runtime_need_not_resume(dev)) {
pm_runtime_set_suspended(dev);
} else {
__update_runtime_status(dev, RPM_SUSPENDED);
dev->power.needs_force_resume = 1;
}
return 0;
err:
pm_runtime_enable(dev);
return ret;
}
EXPORT_SYMBOL_GPL(pm_runtime_force_suspend);
/**
* pm_runtime_force_resume - Force a device into resume state if needed.
* @dev: Device to resume.
*
* Prior invoking this function we expect the user to have brought the device
* into low power state by a call to pm_runtime_force_suspend(). Here we reverse
* those actions and bring the device into full power, if it is expected to be
* used on system resume. In the other case, we defer the resume to be managed
* via runtime PM.
*
* Typically this function may be invoked from a system resume callback.
*/
int pm_runtime_force_resume(struct device *dev)
{
int (*callback)(struct device *);
int ret = 0;
if (!pm_runtime_status_suspended(dev) || !dev->power.needs_force_resume)
goto out;
/*
* The value of the parent's children counter is correct already, so
* just update the status of the device.
*/
__update_runtime_status(dev, RPM_ACTIVE);
callback = RPM_GET_CALLBACK(dev, runtime_resume);
ret = callback ? callback(dev) : 0;
if (ret) {
pm_runtime_set_suspended(dev);
goto out;
}
pm_runtime_mark_last_busy(dev);
out:
dev->power.needs_force_resume = 0;
pm_runtime_enable(dev);
return ret;
}
EXPORT_SYMBOL_GPL(pm_runtime_force_resume);
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
/*
* insert a name into a directory, doing overflow properly if there is a hash
* collision. data_size indicates how big the item inserted should be. On
* success a struct btrfs_dir_item pointer is returned, otherwise it is
* an ERR_PTR.
*
* The name is not copied into the dir item, you have to do that yourself.
*/
static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
*trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *cpu_key,
u32 data_size,
const char *name,
int name_len)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *ptr;
struct btrfs_item *item;
struct extent_buffer *leaf;
ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
if (ret == -EEXIST) {
struct btrfs_dir_item *di;
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
btrfs_extend_item(path, data_size); } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0];
item = btrfs_item_nr(path->slots[0]);
ptr = btrfs_item_ptr(leaf, path->slots[0], char);
BUG_ON(data_size > btrfs_item_size(leaf, item));
ptr += btrfs_item_size(leaf, item) - data_size;
return (struct btrfs_dir_item *)ptr;
}
/*
* xattrs work a lot like directories, this inserts an xattr item
* into the tree
*/
int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
const char *name, u16 name_len,
const void *data, u16 data_len)
{
int ret = 0;
struct btrfs_dir_item *dir_item;
unsigned long name_ptr, data_ptr;
struct btrfs_key key, location;
struct btrfs_disk_key disk_key;
struct extent_buffer *leaf;
u32 data_size;
if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info))
return -ENOSPC;
key.objectid = objectid;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
data_size = sizeof(*dir_item) + name_len + data_len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
name, name_len);
if (IS_ERR(dir_item))
return PTR_ERR(dir_item);
memset(&location, 0, sizeof(location));
leaf = path->nodes[0];
btrfs_cpu_key_to_disk(&disk_key, &location);
btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
btrfs_set_dir_name_len(leaf, dir_item, name_len);
btrfs_set_dir_transid(leaf, dir_item, trans->transid);
btrfs_set_dir_data_len(leaf, dir_item, data_len);
name_ptr = (unsigned long)(dir_item + 1);
data_ptr = (unsigned long)((char *)name_ptr + name_len);
write_extent_buffer(leaf, name, name_ptr, name_len);
write_extent_buffer(leaf, data, data_ptr, data_len);
btrfs_mark_buffer_dirty(path->nodes[0]);
return ret;
}
/*
* insert a directory item in the tree, doing all the magic for
* both indexes. 'dir' indicates which objectid to insert it into,
* 'location' is the key to stuff into the directory item, 'type' is the
* type of the inode we're pointing to, and 'index' is the sequence number
* to use for the second index (if one is created).
* Will return 0 or -ENOMEM
*/
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
int name_len, struct btrfs_inode *dir,
struct btrfs_key *location, u8 type, u64 index)
{
int ret = 0;
int ret2 = 0;
struct btrfs_root *root = dir->root;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct extent_buffer *leaf;
unsigned long name_ptr;
struct btrfs_key key;
struct btrfs_disk_key disk_key;
u32 data_size;
key.objectid = btrfs_ino(dir);
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
btrfs_cpu_key_to_disk(&disk_key, location);
data_size = sizeof(*dir_item) + name_len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
name, name_len);
if (IS_ERR(dir_item)) {
ret = PTR_ERR(dir_item);
if (ret == -EEXIST)
goto second_insert;
goto out_free;
}
leaf = path->nodes[0];
btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
btrfs_set_dir_type(leaf, dir_item, type);
btrfs_set_dir_data_len(leaf, dir_item, 0);
btrfs_set_dir_name_len(leaf, dir_item, name_len);
btrfs_set_dir_transid(leaf, dir_item, trans->transid);
name_ptr = (unsigned long)(dir_item + 1);
write_extent_buffer(leaf, name, name_ptr, name_len);
btrfs_mark_buffer_dirty(leaf);
second_insert:
/* FIXME, use some real flag for selecting the extra index */
if (root == root->fs_info->tree_root) {
ret = 0;
goto out_free;
}
btrfs_release_path(path);
ret2 = btrfs_insert_delayed_dir_index(trans, name, name_len, dir,
&disk_key, type, index);
out_free:
btrfs_free_path(path);
if (ret)
return ret;
if (ret2)
return ret2;
return 0;
}
static struct btrfs_dir_item *btrfs_lookup_match_dir(
struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, const char *name,
int name_len, int mod)
{
const int ins_len = (mod < 0 ? -1 : 0);
const int cow = (mod != 0);
int ret;
ret = btrfs_search_slot(trans, root, key, path, ins_len, cow);
if (ret < 0)
return ERR_PTR(ret); if (ret > 0)
return ERR_PTR(-ENOENT);
return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
}
/*
* Lookup for a directory item by name.
*
* @trans: The transaction handle to use. Can be NULL if @mod is 0.
* @root: The root of the target tree.
* @path: Path to use for the search.
* @dir: The inode number (objectid) of the directory.
* @name: The name associated to the directory entry we are looking for.
* @name_len: The length of the name.
* @mod: Used to indicate if the tree search is meant for a read only
* lookup, for a modification lookup or for a deletion lookup, so
* its value should be 0, 1 or -1, respectively.
*
* Returns: NULL if the dir item does not exists, an error pointer if an error
* happened, or a pointer to a dir item if a dir item exists for the given name.
*/
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
const char *name, int name_len,
int mod)
{
struct btrfs_key key;
struct btrfs_dir_item *di;
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
return di;
}
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const char *name, int name_len)
{
int ret;
struct btrfs_key key;
struct btrfs_dir_item *di;
int data_size;
struct extent_buffer *leaf;
int slot;
struct btrfs_path *path;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
/* Nothing found, we're safe */
if (ret == -ENOENT) {
ret = 0;
goto out;
}
if (ret < 0)
goto out;
}
/* we found an item, look for our name in the item */
if (di) {
/* our exact name was found */
ret = -EEXIST;
goto out;
}
/*
* see if there is room in the item to insert this
* name
*/
data_size = sizeof(*di) + name_len;
leaf = path->nodes[0];
slot = path->slots[0];
if (data_size + btrfs_item_size_nr(leaf, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
ret = -EOVERFLOW;
} else {
/* plenty of insertion room */
ret = 0;
}
out:
btrfs_free_path(path); return ret;
}
/*
* Lookup for a directory index item by name and index number.
*
* @trans: The transaction handle to use. Can be NULL if @mod is 0.
* @root: The root of the target tree.
* @path: Path to use for the search.
* @dir: The inode number (objectid) of the directory.
* @index: The index number.
* @name: The name associated to the directory entry we are looking for.
* @name_len: The length of the name.
* @mod: Used to indicate if the tree search is meant for a read only
* lookup, for a modification lookup or for a deletion lookup, so
* its value should be 0, 1 or -1, respectively.
*
* Returns: NULL if the dir index item does not exists, an error pointer if an
* error happened, or a pointer to a dir item if the dir index item exists and
* matches the criteria (name and index number).
*/
struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
u64 index, const char *name, int name_len,
int mod)
{
struct btrfs_dir_item *di;
struct btrfs_key key;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = index;
di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
if (di == ERR_PTR(-ENOENT))
return NULL;
return di;
}
struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const char *name, int name_len)
{
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
u32 nritems;
int ret;
key.objectid = dirid;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ERR_PTR(ret);
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
while (1) {
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)
break;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
continue;
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
break;
di = btrfs_match_dir_item_name(root->fs_info, path,
name, name_len);
if (di)
return di;
path->slots[0]++;
}
return NULL;
}
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
const char *name, u16 name_len,
int mod)
{
struct btrfs_key key;
struct btrfs_dir_item *di;
key.objectid = dir;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
return di;
}
/*
* helper function to look at the directory item pointed to by 'path'
* this walks through all the entries in a dir item and finds one
* for a specific name.
*/
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const char *name, int name_len)
{
struct btrfs_dir_item *dir_item;
unsigned long name_ptr;
u32 total_len;
u32 cur = 0;
u32 this_len;
struct extent_buffer *leaf;
leaf = path->nodes[0];
dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
total_len = btrfs_item_size_nr(leaf, path->slots[0]);
while (cur < total_len) {
this_len = sizeof(*dir_item) +
btrfs_dir_name_len(leaf, dir_item) +
btrfs_dir_data_len(leaf, dir_item);
name_ptr = (unsigned long)(dir_item + 1);
if (btrfs_dir_name_len(leaf, dir_item) == name_len && memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
return dir_item;
cur += this_len;
dir_item = (struct btrfs_dir_item *)((char *)dir_item +
this_len);
}
return NULL;
}
/*
* given a pointer into a directory item, delete it. This
* handles items that have more than one entry in them.
*/
int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_dir_item *di)
{
struct extent_buffer *leaf;
u32 sub_item_len;
u32 item_len;
int ret = 0;
leaf = path->nodes[0];
sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
btrfs_dir_data_len(leaf, di);
item_len = btrfs_item_size_nr(leaf, path->slots[0]);
if (sub_item_len == item_len) {
ret = btrfs_del_item(trans, root, path);
} else {
/* MARKER */
unsigned long ptr = (unsigned long)di;
unsigned long start;
start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_len - (ptr + sub_item_len - start));
btrfs_truncate_item(path, item_len - sub_item_len, 1);
}
return ret;
}
// SPDX-License-Identifier: GPL-2.0
#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
/*
* HOW DOES SPACE RESERVATION WORK
*
* If you want to know about delalloc specifically, there is a separate comment
* for that with the delalloc code. This comment is about how the whole system
* works generally.
*
* BASIC CONCEPTS
*
* 1) space_info. This is the ultimate arbiter of how much space we can use.
* There's a description of the bytes_ fields with the struct declaration,
* refer to that for specifics on each field. Suffice it to say that for
* reservations we care about total_bytes - SUM(space_info->bytes_) when
* determining if there is space to make an allocation. There is a space_info
* for METADATA, SYSTEM, and DATA areas.
*
* 2) block_rsv's. These are basically buckets for every different type of
* metadata reservation we have. You can see the comment in the block_rsv
* code on the rules for each type, but generally block_rsv->reserved is how
* much space is accounted for in space_info->bytes_may_use.
*
* 3) btrfs_calc*_size. These are the worst case calculations we used based
* on the number of items we will want to modify. We have one for changing
* items, and one for inserting new items. Generally we use these helpers to
* determine the size of the block reserves, and then use the actual bytes
* values to adjust the space_info counters.
*
* MAKING RESERVATIONS, THE NORMAL CASE
*
* We call into either btrfs_reserve_data_bytes() or
* btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
* num_bytes we want to reserve.
*
* ->reserve
* space_info->bytes_may_reserve += num_bytes
*
* ->extent allocation
* Call btrfs_add_reserved_bytes() which does
* space_info->bytes_may_reserve -= num_bytes
* space_info->bytes_reserved += extent_bytes
*
* ->insert reference
* Call btrfs_update_block_group() which does
* space_info->bytes_reserved -= extent_bytes
* space_info->bytes_used += extent_bytes
*
* MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
*
* Assume we are unable to simply make the reservation because we do not have
* enough space
*
* -> __reserve_bytes
* create a reserve_ticket with ->bytes set to our reservation, add it to
* the tail of space_info->tickets, kick async flush thread
*
* ->handle_reserve_ticket
* wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
* on the ticket.
*
* -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
* Flushes various things attempting to free up space.
*
* -> btrfs_try_granting_tickets()
* This is called by anything that either subtracts space from
* space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
* space_info->total_bytes. This loops through the ->priority_tickets and
* then the ->tickets list checking to see if the reservation can be
* completed. If it can the space is added to space_info->bytes_may_use and
* the ticket is woken up.
*
* -> ticket wakeup
* Check if ->bytes == 0, if it does we got our reservation and we can carry
* on, if not return the appropriate error (ENOSPC, but can be EINTR if we
* were interrupted.)
*
* MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
*
* Same as the above, except we add ourselves to the
* space_info->priority_tickets, and we do not use ticket->wait, we simply
* call flush_space() ourselves for the states that are safe for us to call
* without deadlocking and hope for the best.
*
* THE FLUSHING STATES
*
* Generally speaking we will have two cases for each state, a "nice" state
* and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to
* reduce the locking over head on the various trees, and even to keep from
* doing any work at all in the case of delayed refs. Each of these delayed
* things however hold reservations, and so letting them run allows us to
* reclaim space so we can make new reservations.
*
* FLUSH_DELAYED_ITEMS
* Every inode has a delayed item to update the inode. Take a simple write
* for example, we would update the inode item at write time to update the
* mtime, and then again at finish_ordered_io() time in order to update the
* isize or bytes. We keep these delayed items to coalesce these operations
* into a single operation done on demand. These are an easy way to reclaim
* metadata space.
*
* FLUSH_DELALLOC
* Look at the delalloc comment to get an idea of how much space is reserved
* for delayed allocation. We can reclaim some of this space simply by
* running delalloc, but usually we need to wait for ordered extents to
* reclaim the bulk of this space.
*
* FLUSH_DELAYED_REFS
* We have a block reserve for the outstanding delayed refs space, and every
* delayed ref operation holds a reservation. Running these is a quick way
* to reclaim space, but we want to hold this until the end because COW can
* churn a lot and we can avoid making some extent tree modifications if we
* are able to delay for as long as possible.
*
* ALLOC_CHUNK
* We will skip this the first time through space reservation, because of
* overcommit and we don't want to have a lot of useless metadata space when
* our worst case reservations will likely never come true.
*
* RUN_DELAYED_IPUTS
* If we're freeing inodes we're likely freeing checksums, file extent
* items, and extent tree items. Loads of space could be freed up by these
* operations, however they won't be usable until the transaction commits.
*
* COMMIT_TRANS
* This will commit the transaction. Historically we had a lot of logic
* surrounding whether or not we'd commit the transaction, but this waits born
* out of a pre-tickets era where we could end up committing the transaction
* thousands of times in a row without making progress. Now thanks to our
* ticketing system we know if we're not making progress and can error
* everybody out after a few commits rather than burning the disk hoping for
* a different answer.
*
* OVERCOMMIT
*
* Because we hold so many reservations for metadata we will allow you to
* reserve more space than is currently free in the currently allocate
* metadata space. This only happens with metadata, data does not allow
* overcommitting.
*
* You can see the current logic for when we allow overcommit in
* btrfs_can_overcommit(), but it only applies to unallocated space. If there
* is no unallocated space to be had, all reservations are kept within the
* free space in the allocated metadata chunks.
*
* Because of overcommitting, you generally want to use the
* btrfs_can_overcommit() logic for metadata allocations, as it does the right
* thing with or without extra unallocated space.
*/
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
bool may_use_included)
{
ASSERT(s_info);
return s_info->bytes_used + s_info->bytes_reserved +
s_info->bytes_pinned + s_info->bytes_readonly +
s_info->bytes_zone_unusable + (may_use_included ? s_info->bytes_may_use : 0);
}
/*
* after adding space to the filesystem, we need to clear the full flags
* on all the space infos.
*/
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
list_for_each_entry(found, head, list)
found->full = 0;
}
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
struct btrfs_space_info *space_info;
int i;
int ret;
space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
if (!space_info)
return -ENOMEM;
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem);
spin_lock_init(&space_info->lock);
space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
INIT_LIST_HEAD(&space_info->ro_bgs);
INIT_LIST_HEAD(&space_info->tickets);
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
ret = btrfs_sysfs_add_space_info_type(info, space_info);
if (ret)
return ret;
list_add(&space_info->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
info->data_sinfo = space_info;
return ret;
}
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
struct btrfs_super_block *disk_super;
u64 features;
u64 flags;
int mixed = 0;
int ret;
disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
return -EINVAL;
features = btrfs_super_incompat_flags(disk_super);
if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
mixed = 1;
flags = BTRFS_BLOCK_GROUP_SYSTEM;
ret = create_space_info(fs_info, flags);
if (ret)
goto out;
if (mixed) {
flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
ret = create_space_info(fs_info, flags);
} else {
flags = BTRFS_BLOCK_GROUP_METADATA;
ret = create_space_info(fs_info, flags);
if (ret)
goto out;
flags = BTRFS_BLOCK_GROUP_DATA;
ret = create_space_info(fs_info, flags);
}
out:
return ret;
}
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
u64 bytes_readonly, u64 bytes_zone_unusable,
struct btrfs_space_info **space_info)
{
struct btrfs_space_info *found;
int factor;
factor = btrfs_bg_type_to_factor(flags);
found = btrfs_find_space_info(info, flags);
ASSERT(found);
spin_lock(&found->lock);
found->total_bytes += total_bytes;
found->disk_total += total_bytes * factor;
found->bytes_used += bytes_used;
found->disk_used += bytes_used * factor;
found->bytes_readonly += bytes_readonly;
found->bytes_zone_unusable += bytes_zone_unusable;
if (total_bytes > 0)
found->full = 0; btrfs_try_granting_tickets(info, found);
spin_unlock(&found->lock);
*space_info = found;
}
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags)
{
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
list_for_each_entry(found, head, list) { if (found->flags & flags)
return found;
}
return NULL;
}
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
{
u64 profile;
u64 avail;
int factor;
if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
profile = btrfs_system_alloc_profile(fs_info);
else
profile = btrfs_metadata_alloc_profile(fs_info);
avail = atomic64_read(&fs_info->free_chunk_space);
/*
* If we have dup, raid1 or raid10 then only half of the free
* space is actually usable. For raid56, the space info used
* doesn't include the parity drive, so we don't have to
* change the math
*/
factor = btrfs_bg_type_to_factor(profile);
avail = div_u64(avail, factor);
/*
* If we aren't flushing all things, let us overcommit up to
* 1/2th of the space. If we can flush, don't let us overcommit
* too much, let it overcommit up to 1/8 of the space.
*/
if (flush == BTRFS_RESERVE_FLUSH_ALL)
avail >>= 3;
else
avail >>= 1; return avail;
}
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
u64 avail;
u64 used;
/* Don't overcommit when in mixed mode */
if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
return 0;
used = btrfs_space_info_used(space_info, true);
avail = calc_available_free_space(fs_info, space_info, flush);
if (used + bytes < space_info->total_bytes + avail)
return 1;
return 0;
}
static void remove_ticket(struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
if (!list_empty(&ticket->list)) {
list_del_init(&ticket->list);
ASSERT(space_info->reclaim_size >= ticket->bytes);
space_info->reclaim_size -= ticket->bytes;
}
}
/*
* This is for space we already have accounted in space_info->bytes_may_use, so
* basically when we're returning space from block_rsv's.
*/
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
struct list_head *head;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
lockdep_assert_held(&space_info->lock);
head = &space_info->priority_tickets;
again:
while (!list_empty(head)) {
struct reserve_ticket *ticket;
u64 used = btrfs_space_info_used(space_info, true);
ticket = list_first_entry(head, struct reserve_ticket, list);
/* Check and see if our ticket can be satisfied now. */
if ((used + ticket->bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
flush)) {
btrfs_space_info_update_bytes_may_use(fs_info,
space_info,
ticket->bytes);
remove_ticket(space_info, ticket);
ticket->bytes = 0;
space_info->tickets_id++;
wake_up(&ticket->wait);
} else {
break;
}
}
if (head == &space_info->priority_tickets) { head = &space_info->tickets;
flush = BTRFS_RESERVE_FLUSH_ALL;
goto again;
}
}
#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
do { \
struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
spin_lock(&__rsv->lock); \
btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
__rsv->size, __rsv->reserved); \
spin_unlock(&__rsv->lock); \
} while (0)
static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info)
{
lockdep_assert_held(&info->lock);
/* The free space could be negative in case of overcommit */
btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull",
info->flags,
(s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
info->total_bytes, info->bytes_used, info->bytes_pinned,
info->bytes_reserved, info->bytes_may_use,
info->bytes_readonly, info->bytes_zone_unusable);
DUMP_BLOCK_RSV(fs_info, global_block_rsv);
DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
{
struct btrfs_block_group *cache;
int index = 0;
spin_lock(&info->lock);
__btrfs_dump_space_info(fs_info, info);
spin_unlock(&info->lock);
if (!dump_block_groups)
return;
down_read(&info->groups_sem);
again:
list_for_each_entry(cache, &info->block_groups[index], list) {
spin_lock(&cache->lock);
btrfs_info(fs_info,
"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s",
cache->start, cache->length, cache->used, cache->pinned,
cache->reserved, cache->zone_unusable,
cache->ro ? "[readonly]" : "");
spin_unlock(&cache->lock);
btrfs_dump_free_space(cache, bytes);
}
if (++index < BTRFS_NR_RAID_TYPES)
goto again;
up_read(&info->groups_sem);
}
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
u64 bytes;
u64 nr;
bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
nr = div64_u64(to_reclaim, bytes);
if (!nr)
nr = 1;
return nr;
}
#define EXTENT_SIZE_PER_ITEM SZ_256K
/*
* shrink metadata reservation for delalloc
*/
static void shrink_delalloc(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 to_reclaim, bool wait_ordered,
bool for_preempt)
{
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 ordered_bytes;
u64 items;
long time_left;
int loops;
delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
if (delalloc_bytes == 0 && ordered_bytes == 0)
return;
/* Calc the number of the pages we need flush for space reservation */
if (to_reclaim == U64_MAX) {
items = U64_MAX;
} else {
/*
* to_reclaim is set to however much metadata we need to
* reclaim, but reclaiming that much data doesn't really track
* exactly. What we really want to do is reclaim full inode's
* worth of reservations, however that's not available to us
* here. We will take a fraction of the delalloc bytes for our
* flushing loops and hope for the best. Delalloc will expand
* the amount we write to cover an entire dirty extent, which
* will reclaim the metadata reservation for that range. If
* it's not enough subsequent flush stages will be more
* aggressive.
*/
to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
}
trans = (struct btrfs_trans_handle *)current->journal_info;
/*
* If we are doing more ordered than delalloc we need to just wait on
* ordered extents, otherwise we'll waste time trying to flush delalloc
* that likely won't give us the space back we need.
*/
if (ordered_bytes > delalloc_bytes && !for_preempt)
wait_ordered = true;
loops = 0;
while ((delalloc_bytes || ordered_bytes) && loops < 3) {
u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
long nr_pages = min_t(u64, temp, LONG_MAX);
int async_pages;
btrfs_start_delalloc_roots(fs_info, nr_pages, true);
/*
* We need to make sure any outstanding async pages are now
* processed before we continue. This is because things like
* sync_inode() try to be smart and skip writing if the inode is
* marked clean. We don't use filemap_fwrite for flushing
* because we want to control how many pages we write out at a
* time, thus this is the only safe way to make sure we've
* waited for outstanding compressed workers to have started
* their jobs and thus have ordered extents set up properly.
*
* This exists because we do not want to wait for each
* individual inode to finish its async work, we simply want to
* start the IO on everybody, and then come back here and wait
* for all of the async work to catch up. Once we're done with
* that we know we'll have ordered extents for everything and we
* can decide if we wait for that or not.
*
* If we choose to replace this in the future, make absolutely
* sure that the proper waiting is being done in the async case,
* as there have been bugs in that area before.
*/
async_pages = atomic_read(&fs_info->async_delalloc_pages);
if (!async_pages)
goto skip_async;
/*
* We don't want to wait forever, if we wrote less pages in this
* loop than we have outstanding, only wait for that number of
* pages, otherwise we can wait for all async pages to finish
* before continuing.
*/
if (async_pages > nr_pages)
async_pages -= nr_pages;
else
async_pages = 0;
wait_event(fs_info->async_submit_wait,
atomic_read(&fs_info->async_delalloc_pages) <=
async_pages);
skip_async:
loops++;
if (wait_ordered && !trans) {
btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
break;
}
/*
* If we are for preemption we just want a one-shot of delalloc
* flushing so we can stop flushing if we decide we don't need
* to anymore.
*/
if (for_preempt)
break;
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
spin_unlock(&space_info->lock);
break;
}
spin_unlock(&space_info->lock);
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
ordered_bytes = percpu_counter_sum_positive(
&fs_info->ordered_bytes);
}
}
/*
* Try to flush some data based on policy set by @state. This is only advisory
* and may fail for various reasons. The caller is supposed to examine the
* state of @space_info to detect the outcome.
*/
static void flush_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 num_bytes,
enum btrfs_flush_state state, bool for_preempt)
{
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_trans_handle *trans;
int nr;
int ret = 0;
switch (state) {
case FLUSH_DELAYED_ITEMS_NR:
case FLUSH_DELAYED_ITEMS:
if (state == FLUSH_DELAYED_ITEMS_NR)
nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
else
nr = -1;
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
break;
}
ret = btrfs_run_delayed_items_nr(trans, nr);
btrfs_end_transaction(trans);
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
case FLUSH_DELALLOC_FULL:
if (state == FLUSH_DELALLOC_FULL)
num_bytes = U64_MAX;
shrink_delalloc(fs_info, space_info, num_bytes,
state != FLUSH_DELALLOC, for_preempt);
break;
case FLUSH_DELAYED_REFS_NR:
case FLUSH_DELAYED_REFS:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
break;
}
if (state == FLUSH_DELAYED_REFS_NR)
nr = calc_reclaim_items_nr(fs_info, num_bytes);
else
nr = 0;
btrfs_run_delayed_refs(trans, nr);
btrfs_end_transaction(trans);
break;
case ALLOC_CHUNK:
case ALLOC_CHUNK_FORCE:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
break;
}
ret = btrfs_chunk_alloc(trans,
btrfs_get_alloc_profile(fs_info, space_info->flags),
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
if (ret > 0 || ret == -ENOSPC)
ret = 0;
break;
case RUN_DELAYED_IPUTS:
/*
* If we have pending delayed iputs then we could free up a
* bunch of pinned space, so make sure we run the iputs before
* we do our pinned bytes check below.
*/
btrfs_run_delayed_iputs(fs_info);
btrfs_wait_on_delayed_iputs(fs_info);
break;
case COMMIT_TRANS:
ASSERT(current->journal_info == NULL);
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
break;
}
ret = btrfs_commit_transaction(trans);
break;
default:
ret = -ENOSPC;
break;
}
trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
ret, for_preempt);
return;
}
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
u64 used;
u64 avail;
u64 to_reclaim = space_info->reclaim_size;
lockdep_assert_held(&space_info->lock);
avail = calc_available_free_space(fs_info, space_info,
BTRFS_RESERVE_FLUSH_ALL);
used = btrfs_space_info_used(space_info, true);
/*
* We may be flushing because suddenly we have less space than we had
* before, and now we're well over-committed based on our current free
* space. If that's the case add in our overage so we make sure to put
* appropriate pressure on the flushing state machine.
*/
if (space_info->total_bytes + avail < used)
to_reclaim += used - (space_info->total_bytes + avail);
return to_reclaim;
}
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
u64 global_rsv_size = fs_info->global_block_rsv.reserved;
u64 ordered, delalloc;
u64 thresh = div_factor_fine(space_info->total_bytes, 90);
u64 used;
/* If we're just plain full then async reclaim just slows us down. */
if ((space_info->bytes_used + space_info->bytes_reserved +
global_rsv_size) >= thresh)
return false;
used = space_info->bytes_may_use + space_info->bytes_pinned;
/* The total flushable belongs to the global rsv, don't flush. */
if (global_rsv_size >= used)
return false;
/*
* 128MiB is 1/4 of the maximum global rsv size. If we have less than
* that devoted to other reservations then there's no sense in flushing,
* we don't have a lot of things that need flushing.
*/
if (used - global_rsv_size <= SZ_128M)
return false;
/*
* We have tickets queued, bail so we don't compete with the async
* flushers.
*/
if (space_info->reclaim_size)
return false;
/*
* If we have over half of the free space occupied by reservations or
* pinned then we want to start flushing.
*
* We do not do the traditional thing here, which is to say
*
* if (used >= ((total_bytes + avail) / 2))
* return 1;
*
* because this doesn't quite work how we want. If we had more than 50%
* of the space_info used by bytes_used and we had 0 available we'd just
* constantly run the background flusher. Instead we want it to kick in
* if our reclaimable space exceeds our clamped free space.
*
* Our clamping range is 2^1 -> 2^8. Practically speaking that means
* the following:
*
* Amount of RAM Minimum threshold Maximum threshold
*
* 256GiB 1GiB 128GiB
* 128GiB 512MiB 64GiB
* 64GiB 256MiB 32GiB
* 32GiB 128MiB 16GiB
* 16GiB 64MiB 8GiB
*
* These are the range our thresholds will fall in, corresponding to how
* much delalloc we need for the background flusher to kick in.
*/
thresh = calc_available_free_space(fs_info, space_info,
BTRFS_RESERVE_FLUSH_ALL);
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_readonly + global_rsv_size;
if (used < space_info->total_bytes)
thresh += space_info->total_bytes - used; thresh >>= space_info->clamp;
used = space_info->bytes_pinned;
/*
* If we have more ordered bytes than delalloc bytes then we're either
* doing a lot of DIO, or we simply don't have a lot of delalloc waiting
* around. Preemptive flushing is only useful in that it can free up
* space before tickets need to wait for things to finish. In the case
* of ordered extents, preemptively waiting on ordered extents gets us
* nothing, if our reservations are tied up in ordered extents we'll
* simply have to slow down writers by forcing them to wait on ordered
* extents.
*
* In the case that ordered is larger than delalloc, only include the
* block reserves that we would actually be able to directly reclaim
* from. In this case if we're heavy on metadata operations this will
* clearly be heavy enough to warrant preemptive flushing. In the case
* of heavy DIO or ordered reservations, preemptive flushing will just
* waste time and cause us to slow down.
*
* We want to make sure we truly are maxed out on ordered however, so
* cut ordered in half, and if it's still higher than delalloc then we
* can keep flushing. This is to avoid the case where we start
* flushing, and now delalloc == ordered and we stop preemptively
* flushing when we could still have several gigs of delalloc to flush.
*/
ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
if (ordered >= delalloc)
used += fs_info->delayed_refs_rsv.reserved +
fs_info->delayed_block_rsv.reserved;
else
used += space_info->bytes_may_use - global_rsv_size; return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 min_bytes;
if (global_rsv->space_info != space_info)
return false;
spin_lock(&global_rsv->lock);
min_bytes = div_factor(global_rsv->size, 1);
if (global_rsv->reserved < min_bytes + ticket->bytes) {
spin_unlock(&global_rsv->lock);
return false;
}
global_rsv->reserved -= ticket->bytes;
remove_ticket(space_info, ticket);
ticket->bytes = 0;
wake_up(&ticket->wait);
space_info->tickets_id++;
if (global_rsv->reserved < global_rsv->size)
global_rsv->full = 0;
spin_unlock(&global_rsv->lock);
return true;
}
/*
* maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
* @fs_info - fs_info for this fs
* @space_info - the space info we were flushing
*
* We call this when we've exhausted our flushing ability and haven't made
* progress in satisfying tickets. The reservation code handles tickets in
* order, so if there is a large ticket first and then smaller ones we could
* very well satisfy the smaller tickets. This will attempt to wake up any
* tickets in the list to catch this case.
*
* This function returns true if it was able to make progress by clearing out
* other tickets, or if it stumbles across a ticket that was smaller than the
* first ticket.
*/
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
trace_btrfs_fail_all_tickets(fs_info, space_info);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
__btrfs_dump_space_info(fs_info, space_info);
}
while (!list_empty(&space_info->tickets) &&
tickets_id == space_info->tickets_id) {
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
if (ticket->steal &&
steal_from_global_rsv(fs_info, space_info, ticket))
return true;
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
btrfs_info(fs_info, "failing ticket with %llu bytes",
ticket->bytes);
remove_ticket(space_info, ticket);
ticket->error = -ENOSPC;
wake_up(&ticket->wait);
/*
* We're just throwing tickets away, so more flushing may not
* trip over btrfs_try_granting_tickets, so we need to call it
* here to see if we can make progress with the next ticket in
* the list.
*/
btrfs_try_granting_tickets(fs_info, space_info);
}
return (tickets_id != space_info->tickets_id);
}
/*
* This is for normal flushers, we can wait all goddamned day if we want to. We
* will loop and continuously try to flush as long as we are making progress.
* We count progress as clearing off tickets each time we have to loop.
*/
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
u64 to_reclaim;
enum btrfs_flush_state flush_state;
int commit_cycles = 0;
u64 last_tickets_id;
fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
if (!to_reclaim) {
space_info->flush = 0;
spin_unlock(&space_info->lock);
return;
}
last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock);
flush_state = FLUSH_DELAYED_ITEMS_NR;
do {
flush_space(fs_info, space_info, to_reclaim, flush_state, false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
spin_unlock(&space_info->lock);
return;
}
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
space_info);
if (last_tickets_id == space_info->tickets_id) {
flush_state++;
} else {
last_tickets_id = space_info->tickets_id;
flush_state = FLUSH_DELAYED_ITEMS_NR;
if (commit_cycles)
commit_cycles--;
}
/*
* We do not want to empty the system of delalloc unless we're
* under heavy pressure, so allow one trip through the flushing
* logic before we start doing a FLUSH_DELALLOC_FULL.
*/
if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
flush_state++;
/*
* We don't want to force a chunk allocation until we've tried
* pretty hard to reclaim space. Think of the case where we
* freed up a bunch of space and so have a lot of pinned space
* to reclaim. We would rather use that than possibly create a
* underutilized metadata chunk. So if this is our first run
* through the flushing state machine skip ALLOC_CHUNK_FORCE and
* commit the transaction. If nothing has changed the next go
* around then we can force a chunk allocation.
*/
if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
flush_state++;
if (flush_state > COMMIT_TRANS) {
commit_cycles++;
if (commit_cycles > 2) {
if (maybe_fail_all_tickets(fs_info, space_info)) {
flush_state = FLUSH_DELAYED_ITEMS_NR;
commit_cycles--;
} else {
space_info->flush = 0;
}
} else {
flush_state = FLUSH_DELAYED_ITEMS_NR;
}
}
spin_unlock(&space_info->lock);
} while (flush_state <= COMMIT_TRANS);
}
/*
* This handles pre-flushing of metadata space before we get to the point that
* we need to start blocking threads on tickets. The logic here is different
* from the other flush paths because it doesn't rely on tickets to tell us how
* much we need to flush, instead it attempts to keep us below the 80% full
* watermark of space by flushing whichever reservation pool is currently the
* largest.
*/
static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
{
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *delayed_block_rsv;
struct btrfs_block_rsv *delayed_refs_rsv;
struct btrfs_block_rsv *global_rsv;
struct btrfs_block_rsv *trans_rsv;
int loops = 0;
fs_info = container_of(work, struct btrfs_fs_info,
preempt_reclaim_work);
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
delayed_block_rsv = &fs_info->delayed_block_rsv;
delayed_refs_rsv = &fs_info->delayed_refs_rsv;
global_rsv = &fs_info->global_block_rsv;
trans_rsv = &fs_info->trans_block_rsv;
spin_lock(&space_info->lock);
while (need_preemptive_reclaim(fs_info, space_info)) {
enum btrfs_flush_state flush;
u64 delalloc_size = 0;
u64 to_reclaim, block_rsv_size;
u64 global_rsv_size = global_rsv->reserved;
loops++;
/*
* We don't have a precise counter for the metadata being
* reserved for delalloc, so we'll approximate it by subtracting
* out the block rsv's space from the bytes_may_use. If that
* amount is higher than the individual reserves, then we can
* assume it's tied up in delalloc reservations.
*/
block_rsv_size = global_rsv_size +
delayed_block_rsv->reserved +
delayed_refs_rsv->reserved +
trans_rsv->reserved;
if (block_rsv_size < space_info->bytes_may_use)
delalloc_size = space_info->bytes_may_use - block_rsv_size;
/*
* We don't want to include the global_rsv in our calculation,
* because that's space we can't touch. Subtract it from the
* block_rsv_size for the next checks.
*/
block_rsv_size -= global_rsv_size;
/*
* We really want to avoid flushing delalloc too much, as it
* could result in poor allocation patterns, so only flush it if
* it's larger than the rest of the pools combined.
*/
if (delalloc_size > block_rsv_size) {
to_reclaim = delalloc_size;
flush = FLUSH_DELALLOC;
} else if (space_info->bytes_pinned >
(delayed_block_rsv->reserved +
delayed_refs_rsv->reserved)) {
to_reclaim = space_info->bytes_pinned;
flush = COMMIT_TRANS;
} else if (delayed_block_rsv->reserved >
delayed_refs_rsv->reserved) {
to_reclaim = delayed_block_rsv->reserved;
flush = FLUSH_DELAYED_ITEMS_NR;
} else {
to_reclaim = delayed_refs_rsv->reserved;
flush = FLUSH_DELAYED_REFS_NR;
}
spin_unlock(&space_info->lock);
/*
* We don't want to reclaim everything, just a portion, so scale
* down the to_reclaim by 1/4. If it takes us down to 0,
* reclaim 1 items worth.
*/
to_reclaim >>= 2;
if (!to_reclaim)
to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
flush_space(fs_info, space_info, to_reclaim, flush, true);
cond_resched();
spin_lock(&space_info->lock);
}
/* We only went through once, back off our clamping. */
if (loops == 1 && !space_info->reclaim_size)
space_info->clamp = max(1, space_info->clamp - 1);
trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
spin_unlock(&space_info->lock);
}
/*
* FLUSH_DELALLOC_WAIT:
* Space is freed from flushing delalloc in one of two ways.
*
* 1) compression is on and we allocate less space than we reserved
* 2) we are overwriting existing space
*
* For #1 that extra space is reclaimed as soon as the delalloc pages are
* COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
* length to ->bytes_reserved, and subtracts the reserved space from
* ->bytes_may_use.
*
* For #2 this is trickier. Once the ordered extent runs we will drop the
* extent in the range we are overwriting, which creates a delayed ref for
* that freed extent. This however is not reclaimed until the transaction
* commits, thus the next stages.
*
* RUN_DELAYED_IPUTS
* If we are freeing inodes, we want to make sure all delayed iputs have
* completed, because they could have been on an inode with i_nlink == 0, and
* thus have been truncated and freed up space. But again this space is not
* immediately re-usable, it comes in the form of a delayed ref, which must be
* run and then the transaction must be committed.
*
* COMMIT_TRANS
* This is where we reclaim all of the pinned space generated by running the
* iputs
*
* ALLOC_CHUNK_FORCE
* For data we start with alloc chunk force, however we could have been full
* before, and then the transaction commit could have freed new block groups,
* so if we now have space to allocate do the force chunk allocation.
*/
static const enum btrfs_flush_state data_flush_states[] = {
FLUSH_DELALLOC_FULL,
RUN_DELAYED_IPUTS,
COMMIT_TRANS,
ALLOC_CHUNK_FORCE,
};
static void btrfs_async_reclaim_data_space(struct work_struct *work)
{
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
u64 last_tickets_id;
enum btrfs_flush_state flush_state = 0;
fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
space_info = fs_info->data_sinfo;
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
spin_unlock(&space_info->lock);
return;
}
last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock);
while (!space_info->full) {
flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
spin_unlock(&space_info->lock);
return;
}
last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock);
}
while (flush_state < ARRAY_SIZE(data_flush_states)) {
flush_space(fs_info, space_info, U64_MAX,
data_flush_states[flush_state], false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
spin_unlock(&space_info->lock);
return;
}
if (last_tickets_id == space_info->tickets_id) {
flush_state++;
} else {
last_tickets_id = space_info->tickets_id;
flush_state = 0;
}
if (flush_state >= ARRAY_SIZE(data_flush_states)) {
if (space_info->full) {
if (maybe_fail_all_tickets(fs_info, space_info))
flush_state = 0;
else
space_info->flush = 0;
} else {
flush_state = 0;
}
}
spin_unlock(&space_info->lock);
}
}
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
INIT_WORK(&fs_info->preempt_reclaim_work,
btrfs_preempt_reclaim_metadata_space);
}
static const enum btrfs_flush_state priority_flush_states[] = {
FLUSH_DELAYED_ITEMS_NR,
FLUSH_DELAYED_ITEMS,
ALLOC_CHUNK,
};
static const enum btrfs_flush_state evict_flush_states[] = {
FLUSH_DELAYED_ITEMS_NR,
FLUSH_DELAYED_ITEMS,
FLUSH_DELAYED_REFS_NR,
FLUSH_DELAYED_REFS,
FLUSH_DELALLOC,
FLUSH_DELALLOC_WAIT,
FLUSH_DELALLOC_FULL,
ALLOC_CHUNK,
COMMIT_TRANS,
};
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket,
const enum btrfs_flush_state *states,
int states_nr)
{
u64 to_reclaim;
int flush_state;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
if (!to_reclaim) {
spin_unlock(&space_info->lock);
return;
}
spin_unlock(&space_info->lock);
flush_state = 0;
do {
flush_space(fs_info, space_info, to_reclaim, states[flush_state],
false);
flush_state++;
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
return;
}
spin_unlock(&space_info->lock);
} while (flush_state < states_nr);
}
static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
while (!space_info->full) { flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
return;
}
spin_unlock(&space_info->lock);
}
}
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
DEFINE_WAIT(wait);
int ret = 0;
spin_lock(&space_info->lock);
while (ticket->bytes > 0 && ticket->error == 0) { ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
if (ret) {
/*
* Delete us from the list. After we unlock the space
* info, we don't want the async reclaim job to reserve
* space for this ticket. If that would happen, then the
* ticket's task would not known that space was reserved
* despite getting an error, resulting in a space leak
* (bytes_may_use counter of our space_info).
*/
remove_ticket(space_info, ticket);
ticket->error = -EINTR;
break;
}
spin_unlock(&space_info->lock);
schedule();
finish_wait(&ticket->wait, &wait);
spin_lock(&space_info->lock);
}
spin_unlock(&space_info->lock);
}
/**
* Do the appropriate flushing and waiting for a ticket
*
* @fs_info: the filesystem
* @space_info: space info for the reservation
* @ticket: ticket for the reservation
* @start_ns: timestamp when the reservation started
* @orig_bytes: amount of bytes originally reserved
* @flush: how much we can flush
*
* This does the work of figuring out how to flush for the ticket, waiting for
* the reservation, and returning the appropriate error if there is one.
*/
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket,
u64 start_ns, u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
int ret;
switch (flush) {
case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL:
case BTRFS_RESERVE_FLUSH_ALL_STEAL:
wait_reserve_ticket(fs_info, space_info, ticket);
break;
case BTRFS_RESERVE_FLUSH_LIMIT:
priority_reclaim_metadata_space(fs_info, space_info, ticket,
priority_flush_states,
ARRAY_SIZE(priority_flush_states));
break;
case BTRFS_RESERVE_FLUSH_EVICT:
priority_reclaim_metadata_space(fs_info, space_info, ticket,
evict_flush_states,
ARRAY_SIZE(evict_flush_states));
break;
case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
priority_reclaim_data_space(fs_info, space_info, ticket);
break;
default:
ASSERT(0);
break;
}
spin_lock(&space_info->lock);
ret = ticket->error;
if (ticket->bytes || ticket->error) {
/*
* We were a priority ticket, so we need to delete ourselves
* from the list. Because we could have other priority tickets
* behind us that require less space, run
* btrfs_try_granting_tickets() to see if their reservations can
* now be made.
*/
if (!list_empty(&ticket->list)) {
remove_ticket(space_info, ticket);
btrfs_try_granting_tickets(fs_info, space_info);
}
if (!ret)
ret = -ENOSPC;
}
spin_unlock(&space_info->lock);
ASSERT(list_empty(&ticket->list));
/*
* Check that we can't have an error set if the reservation succeeded,
* as that would confuse tasks and lead them to error out without
* releasing reserved space (if an error happens the expectation is that
* space wasn't reserved at all).
*/
ASSERT(!(ticket->bytes == 0 && ticket->error));
trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
start_ns, flush, ticket->error);
return ret;
}
/*
* This returns true if this flush state will go through the ordinary flushing
* code.
*/
static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
{
return (flush == BTRFS_RESERVE_FLUSH_ALL) ||
(flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
}
static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
/*
* If we're heavy on ordered operations then clamping won't help us. We
* need to clamp specifically to keep up with dirty'ing buffered
* writers, because there's not a 1:1 correlation of writing delalloc
* and freeing space, like there is with flushing delayed refs or
* delayed nodes. If we're already more ordered than delalloc then
* we're keeping up, otherwise we aren't and should probably clamp.
*/
if (ordered < delalloc)
space_info->clamp = min(space_info->clamp + 1, 8);
}
/**
* Try to reserve bytes from the block_rsv's space
*
* @fs_info: the filesystem
* @space_info: space info we want to allocate from
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
*
* This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
* flush out space to make room. It will do this by flushing delalloc if
* possible or committing the transaction. If flush is 0 then no attempts to
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
static int __reserve_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
struct work_struct *async_work;
struct reserve_ticket ticket;
u64 start_ns = 0;
u64 used;
int ret = 0;
bool pending_tickets;
ASSERT(orig_bytes);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
if (flush == BTRFS_RESERVE_FLUSH_DATA)
async_work = &fs_info->async_data_reclaim_work;
else
async_work = &fs_info->async_reclaim_work;
spin_lock(&space_info->lock);
ret = -ENOSPC;
used = btrfs_space_info_used(space_info, true);
/*
* We don't want NO_FLUSH allocations to jump everybody, they can
* generally handle ENOSPC in a different way, so treat them the same as
* normal flushers when it comes to skipping pending tickets.
*/
if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH)) pending_tickets = !list_empty(&space_info->tickets) || !list_empty(&space_info->priority_tickets);
else
pending_tickets = !list_empty(&space_info->priority_tickets);
/*
* Carry on if we have enough space (short-circuit) OR call
* can_overcommit() to ensure we can overcommit to continue.
*/
if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
ret = 0;
}
/*
* If we couldn't make a reservation then setup our reservation ticket
* and kick the async worker if it's not already running.
*
* If we are a priority flusher then we just need to add our ticket to
* the list and we will do our own flushing further down.
*/
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { ticket.bytes = orig_bytes;
ticket.error = 0;
space_info->reclaim_size += ticket.bytes;
init_waitqueue_head(&ticket.wait);
ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
if (trace_btrfs_reserve_ticket_enabled())
start_ns = ktime_get_ns();
if (flush == BTRFS_RESERVE_FLUSH_ALL || flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
flush == BTRFS_RESERVE_FLUSH_DATA) {
list_add_tail(&ticket.list, &space_info->tickets);
if (!space_info->flush) {
/*
* We were forced to add a reserve ticket, so
* our preemptive flushing is unable to keep
* up. Clamp down on the threshold for the
* preemptive flushing in order to keep up with
* the workload.
*/
maybe_clamp_preempt(fs_info, space_info);
space_info->flush = 1; trace_btrfs_trigger_flush(fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
queue_work(system_unbound_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
&space_info->priority_tickets);
}
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
/*
* We will do the space reservation dance during log replay,
* which means we won't have fs_info->fs_root set, so don't do
* the async reclaim as we will panic.
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && !work_busy(&fs_info->preempt_reclaim_work) && need_preemptive_reclaim(fs_info, space_info)) { trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
queue_work(system_unbound_wq,
&fs_info->preempt_reclaim_work);
}
}
spin_unlock(&space_info->lock);
if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
return ret;
return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
orig_bytes, flush);
}
/**
* Trye to reserve metadata bytes from the block_rsv's space
*
* @root: the root we're allocating for
* @block_rsv: block_rsv we're allocating for
* @orig_bytes: number of bytes we want
* @flush: whether or not we can flush to make our reservation
*
* This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
* flush out space to make room. It will do this by flushing delalloc if
* possible or committing the transaction. If flush is 0 then no attempts to
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
if (block_rsv != global_rsv &&
!btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
ret = 0;
}
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
block_rsv->space_info->flags,
orig_bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) btrfs_dump_space_info(fs_info, block_rsv->space_info,
orig_bytes, 0);
}
return ret;
}
/**
* Try to reserve data bytes for an allocation
*
* @fs_info: the filesystem
* @bytes: number of bytes we need
* @flush: how we are allowed to flush
*
* This will reserve bytes from the data space info. If there is not enough
* space then we will attempt to flush space as specified by flush.
*/
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
int ret;
ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
data_sinfo->flags, bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
}
return ret;
}
// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/attr.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
* changes by Thomas Schoebel-Theuer
*/
#include <linux/export.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/sched/signal.h>
#include <linux/capability.h>
#include <linux/fsnotify.h>
#include <linux/fcntl.h>
#include <linux/security.h>
#include <linux/evm.h>
#include <linux/ima.h>
/**
* chown_ok - verify permissions to chown inode
* @mnt_userns: user namespace of the mount @inode was found from
* @inode: inode to check permissions on
* @uid: uid to chown @inode to
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs init_user_ns.
*/
static bool chown_ok(struct user_namespace *mnt_userns,
const struct inode *inode,
kuid_t uid)
{
kuid_t kuid = i_uid_into_mnt(mnt_userns, inode);
if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid))
return true;
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
if (uid_eq(kuid, INVALID_UID) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
return true;
return false;
}
/**
* chgrp_ok - verify permissions to chgrp inode
* @mnt_userns: user namespace of the mount @inode was found from
* @inode: inode to check permissions on
* @gid: gid to chown @inode to
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs init_user_ns.
*/
static bool chgrp_ok(struct user_namespace *mnt_userns,
const struct inode *inode, kgid_t gid)
{
kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) &&
(in_group_p(gid) || gid_eq(gid, inode->i_gid)))
return true;
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
if (gid_eq(kgid, INVALID_GID) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
return true;
return false;
}
/**
* setattr_prepare - check if attribute changes to a dentry are allowed
* @mnt_userns: user namespace of the mount the inode was found from
* @dentry: dentry to check
* @attr: attributes to change
*
* Check if we are allowed to change the attributes contained in @attr
* in the given dentry. This includes the normal unix access permission
* checks, as well as checks for rlimits and others. The function also clears
* SGID bit from mode if user is not allowed to set it. Also file capabilities
* and IMA extended attributes are cleared if ATTR_KILL_PRIV is set.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs init_user_ns.
*
* Should be called as the first thing in ->setattr implementations,
* possibly after taking additional locks.
*/
int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
unsigned int ia_valid = attr->ia_valid;
/*
* First check size constraints. These can't be overriden using
* ATTR_FORCE.
*/
if (ia_valid & ATTR_SIZE) {
int error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
}
/* If force is set do it anyway. */
if (ia_valid & ATTR_FORCE)
goto kill_priv;
/* Make sure a caller can chown. */
if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid))
return -EPERM;
/* Make sure caller can chgrp. */
if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid))
return -EPERM;
/* Make sure a caller can chmod. */
if (ia_valid & ATTR_MODE) { if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
/* Also check the setgid bit! */
if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
i_gid_into_mnt(mnt_userns, inode)) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID;
}
/* Check for setting the inode time. */
if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
}
kill_priv:
/* User has permission for the change */
if (ia_valid & ATTR_KILL_PRIV) {
int error;
error = security_inode_killpriv(mnt_userns, dentry);
if (error)
return error;
}
return 0;
}
EXPORT_SYMBOL(setattr_prepare);
/**
* inode_newsize_ok - may this inode be truncated to a given size
* @inode: the inode to be truncated
* @offset: the new size to assign to the inode
*
* inode_newsize_ok must be called with i_mutex held.
*
* inode_newsize_ok will check filesystem limits and ulimits to check that the
* new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
* when necessary. Caller must not proceed with inode size change if failure is
* returned. @inode must be a file (not directory), with appropriate
* permissions to allow truncate (inode_newsize_ok does NOT check these
* conditions).
*
* Return: 0 on success, -ve errno on failure
*/
int inode_newsize_ok(const struct inode *inode, loff_t offset)
{
if (inode->i_size < offset) {
unsigned long limit;
limit = rlimit(RLIMIT_FSIZE);
if (limit != RLIM_INFINITY && offset > limit)
goto out_sig;
if (offset > inode->i_sb->s_maxbytes)
goto out_big;
} else {
/*
* truncation of in-use swapfiles is disallowed - it would
* cause subsequent swapout to scribble on the now-freed
* blocks.
*/
if (IS_SWAPFILE(inode))
return -ETXTBSY;
}
return 0;
out_sig:
send_sig(SIGXFSZ, current, 0);
out_big:
return -EFBIG;
}
EXPORT_SYMBOL(inode_newsize_ok);
/**
* setattr_copy - copy simple metadata updates into the generic inode
* @mnt_userns: user namespace of the mount the inode was found from
* @inode: the inode to be updated
* @attr: the new attributes
*
* setattr_copy must be called with i_mutex held.
*
* setattr_copy updates the inode's metadata with that specified
* in attr on idmapped mounts. If file ownership is changed setattr_copy
* doesn't map ia_uid and ia_gid. It will asssume the caller has already
* provided the intended values. Necessary permission checks to determine
* whether or not the S_ISGID property needs to be removed are performed with
* the correct idmapped mount permission helpers.
* Noticeably missing is inode size update, which is more complex
* as it requires pagecache updates.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs init_user_ns.
*
* The inode is not marked as dirty after this operation. The rationale is
* that for "simple" filesystems, the struct inode is the inode storage.
* The caller is free to mark the inode dirty afterwards if needed.
*/
void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
const struct iattr *attr)
{
unsigned int ia_valid = attr->ia_valid;
if (ia_valid & ATTR_UID)
inode->i_uid = attr->ia_uid; if (ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) inode->i_mtime = attr->ia_mtime; if (ia_valid & ATTR_CTIME) inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode;
kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
if (!in_group_p(kgid) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode;
}
}
EXPORT_SYMBOL(setattr_copy);
int may_setattr(struct user_namespace *mnt_userns, struct inode *inode,
unsigned int ia_valid)
{
int error;
if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
}
/*
* If utimes(2) and friends are called with times == NULL (or both
* times are UTIME_NOW), then we need to check for write permission
*/
if (ia_valid & ATTR_TOUCH) { if (IS_IMMUTABLE(inode))
return -EPERM;
if (!inode_owner_or_capable(mnt_userns, inode)) { error = inode_permission(mnt_userns, inode, MAY_WRITE); if (error)
return error;
}
}
return 0;
}
EXPORT_SYMBOL(may_setattr);
/**
* notify_change - modify attributes of a filesytem object
* @mnt_userns: user namespace of the mount the inode was found from
* @dentry: object affected
* @attr: new attributes
* @delegated_inode: returns inode, if the inode is delegated
*
* The caller must hold the i_mutex on the affected object.
*
* If notify_change discovers a delegation in need of breaking,
* it will return -EWOULDBLOCK and return a reference to the inode in
* delegated_inode. The caller should then break the delegation and
* retry. Because breaking a delegation may take a long time, the
* caller should drop the i_mutex before doing so.
*
* If file ownership is changed notify_change() doesn't map ia_uid and
* ia_gid. It will asssume the caller has already provided the intended values.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported. Also, passing NULL is fine for callers holding
* the file open for write, as there can be no conflicting delegation in
* that case.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs init_user_ns.
*/
int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr, struct inode **delegated_inode)
{
struct inode *inode = dentry->d_inode;
umode_t mode = inode->i_mode;
int error;
struct timespec64 now;
unsigned int ia_valid = attr->ia_valid;
WARN_ON_ONCE(!inode_is_locked(inode)); error = may_setattr(mnt_userns, inode, ia_valid);
if (error)
return error;
if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode;
/* Flag setting protected by i_mutex */
if (is_sxid(amode))
inode->i_flags &= ~S_NOSEC;
}
now = current_time(inode);
attr->ia_ctime = now;
if (!(ia_valid & ATTR_ATIME_SET))
attr->ia_atime = now;
else
attr->ia_atime = timestamp_truncate(attr->ia_atime, inode); if (!(ia_valid & ATTR_MTIME_SET)) attr->ia_mtime = now;
else
attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode); if (ia_valid & ATTR_KILL_PRIV) { error = security_inode_need_killpriv(dentry);
if (error < 0)
return error;
if (error == 0) ia_valid = attr->ia_valid &= ~ATTR_KILL_PRIV;
}
/*
* We now pass ATTR_KILL_S*ID to the lower level setattr function so
* that the function has the ability to reinterpret a mode change
* that's due to these bits. This adds an implicit restriction that
* no function will ever call notify_change with both ATTR_MODE and
* ATTR_KILL_S*ID set.
*/
if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) && (ia_valid & ATTR_MODE)) BUG(); if (ia_valid & ATTR_KILL_SUID) { if (mode & S_ISUID) { ia_valid = attr->ia_valid |= ATTR_MODE;
attr->ia_mode = (inode->i_mode & ~S_ISUID);
}
}
if (ia_valid & ATTR_KILL_SGID) { if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { if (!(ia_valid & ATTR_MODE)) { ia_valid = attr->ia_valid |= ATTR_MODE;
attr->ia_mode = inode->i_mode;
}
attr->ia_mode &= ~S_ISGID;
}
}
if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
return 0;
/*
* Verify that uid/gid changes are valid in the target
* namespace of the superblock.
*/
if (ia_valid & ATTR_UID &&
!kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
return -EOVERFLOW;
if (ia_valid & ATTR_GID &&
!kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
return -EOVERFLOW;
/* Don't allow modifications of files with invalid uids or
* gids unless those uids & gids are being made valid.
*/
if (!(ia_valid & ATTR_UID) &&
!uid_valid(i_uid_into_mnt(mnt_userns, inode)))
return -EOVERFLOW;
if (!(ia_valid & ATTR_GID) &&
!gid_valid(i_gid_into_mnt(mnt_userns, inode)))
return -EOVERFLOW;
error = security_inode_setattr(dentry, attr);
if (error)
return error;
error = try_break_deleg(inode, delegated_inode);
if (error)
return error;
if (inode->i_op->setattr) error = inode->i_op->setattr(mnt_userns, dentry, attr);
else
error = simple_setattr(mnt_userns, dentry, attr); if (!error) {
fsnotify_change(dentry, ia_valid);
ima_inode_post_setattr(mnt_userns, dentry);
evm_inode_post_setattr(dentry, ia_valid);
}
return error;
}
EXPORT_SYMBOL(notify_change);
// SPDX-License-Identifier: GPL-2.0-only
/*
* klist.c - Routines for manipulating klists.
*
* Copyright (C) 2005 Patrick Mochel
*
* This klist interface provides a couple of structures that wrap around
* struct list_head to provide explicit list "head" (struct klist) and list
* "node" (struct klist_node) objects. For struct klist, a spinlock is
* included that protects access to the actual list itself. struct
* klist_node provides a pointer to the klist that owns it and a kref
* reference count that indicates the number of current users of that node
* in the list.
*
* The entire point is to provide an interface for iterating over a list
* that is safe and allows for modification of the list during the
* iteration (e.g. insertion and removal), including modification of the
* current node on the list.
*
* It works using a 3rd object type - struct klist_iter - that is declared
* and initialized before an iteration. klist_next() is used to acquire the
* next element in the list. It returns NULL if there are no more items.
* Internally, that routine takes the klist's lock, decrements the
* reference count of the previous klist_node and increments the count of
* the next klist_node. It then drops the lock and returns.
*
* There are primitives for adding and removing nodes to/from a klist.
* When deleting, klist_del() will simply decrement the reference count.
* Only when the count goes to 0 is the node removed from the list.
* klist_remove() will try to delete the node from the list and block until
* it is actually removed. This is useful for objects (like devices) that
* have been removed from the system and must be freed (but must wait until
* all accessors have finished).
*/
#include <linux/klist.h>
#include <linux/export.h>
#include <linux/sched.h>
/*
* Use the lowest bit of n_klist to mark deleted nodes and exclude
* dead ones from iteration.
*/
#define KNODE_DEAD 1LU
#define KNODE_KLIST_MASK ~KNODE_DEAD
static struct klist *knode_klist(struct klist_node *knode)
{
return (struct klist *)
((unsigned long)knode->n_klist & KNODE_KLIST_MASK);
}
static bool knode_dead(struct klist_node *knode)
{
return (unsigned long)knode->n_klist & KNODE_DEAD;
}
static void knode_set_klist(struct klist_node *knode, struct klist *klist)
{
knode->n_klist = klist;
/* no knode deserves to start its life dead */
WARN_ON(knode_dead(knode));
}
static void knode_kill(struct klist_node *knode)
{
/* and no knode should die twice ever either, see we're very humane */
WARN_ON(knode_dead(knode)); *(unsigned long *)&knode->n_klist |= KNODE_DEAD;
}
/**
* klist_init - Initialize a klist structure.
* @k: The klist we're initializing.
* @get: The get function for the embedding object (NULL if none)
* @put: The put function for the embedding object (NULL if none)
*
* Initialises the klist structure. If the klist_node structures are
* going to be embedded in refcounted objects (necessary for safe
* deletion) then the get/put arguments are used to initialise
* functions that take and release references on the embedding
* objects.
*/
void klist_init(struct klist *k, void (*get)(struct klist_node *),
void (*put)(struct klist_node *))
{
INIT_LIST_HEAD(&k->k_list);
spin_lock_init(&k->k_lock);
k->get = get;
k->put = put;
}
EXPORT_SYMBOL_GPL(klist_init);
static void add_head(struct klist *k, struct klist_node *n)
{
spin_lock(&k->k_lock);
list_add(&n->n_node, &k->k_list);
spin_unlock(&k->k_lock);
}
static void add_tail(struct klist *k, struct klist_node *n)
{
spin_lock(&k->k_lock);
list_add_tail(&n->n_node, &k->k_list);
spin_unlock(&k->k_lock);
}
static void klist_node_init(struct klist *k, struct klist_node *n)
{
INIT_LIST_HEAD(&n->n_node);
kref_init(&n->n_ref);
knode_set_klist(n, k);
if (k->get) k->get(n);
}
/**
* klist_add_head - Initialize a klist_node and add it to front.
* @n: node we're adding.
* @k: klist it's going on.
*/
void klist_add_head(struct klist_node *n, struct klist *k)
{
klist_node_init(k, n);
add_head(k, n);
}
EXPORT_SYMBOL_GPL(klist_add_head);
/**
* klist_add_tail - Initialize a klist_node and add it to back.
* @n: node we're adding.
* @k: klist it's going on.
*/
void klist_add_tail(struct klist_node *n, struct klist *k)
{
klist_node_init(k, n);
add_tail(k, n);
}
EXPORT_SYMBOL_GPL(klist_add_tail);
/**
* klist_add_behind - Init a klist_node and add it after an existing node
* @n: node we're adding.
* @pos: node to put @n after
*/
void klist_add_behind(struct klist_node *n, struct klist_node *pos)
{
struct klist *k = knode_klist(pos);
klist_node_init(k, n);
spin_lock(&k->k_lock);
list_add(&n->n_node, &pos->n_node);
spin_unlock(&k->k_lock);
}
EXPORT_SYMBOL_GPL(klist_add_behind);
/**
* klist_add_before - Init a klist_node and add it before an existing node
* @n: node we're adding.
* @pos: node to put @n after
*/
void klist_add_before(struct klist_node *n, struct klist_node *pos)
{
struct klist *k = knode_klist(pos);
klist_node_init(k, n);
spin_lock(&k->k_lock);
list_add_tail(&n->n_node, &pos->n_node);
spin_unlock(&k->k_lock);
}
EXPORT_SYMBOL_GPL(klist_add_before);
struct klist_waiter {
struct list_head list;
struct klist_node *node;
struct task_struct *process;
int woken;
};
static DEFINE_SPINLOCK(klist_remove_lock);
static LIST_HEAD(klist_remove_waiters);
static void klist_release(struct kref *kref)
{
struct klist_waiter *waiter, *tmp;
struct klist_node *n = container_of(kref, struct klist_node, n_ref);
WARN_ON(!knode_dead(n));
list_del(&n->n_node);
spin_lock(&klist_remove_lock);
list_for_each_entry_safe(waiter, tmp, &klist_remove_waiters, list) { if (waiter->node != n)
continue;
list_del(&waiter->list);
waiter->woken = 1;
mb();
wake_up_process(waiter->process);
}
spin_unlock(&klist_remove_lock);
knode_set_klist(n, NULL);
}
static int klist_dec_and_del(struct klist_node *n)
{
return kref_put(&n->n_ref, klist_release);
}
static void klist_put(struct klist_node *n, bool kill)
{
struct klist *k = knode_klist(n);
void (*put)(struct klist_node *) = k->put;
spin_lock(&k->k_lock);
if (kill)
knode_kill(n);
if (!klist_dec_and_del(n))
put = NULL;
spin_unlock(&k->k_lock);
if (put)
put(n);
}
/**
* klist_del - Decrement the reference count of node and try to remove.
* @n: node we're deleting.
*/
void klist_del(struct klist_node *n)
{
klist_put(n, true);
}
EXPORT_SYMBOL_GPL(klist_del);
/**
* klist_remove - Decrement the refcount of node and wait for it to go away.
* @n: node we're removing.
*/
void klist_remove(struct klist_node *n)
{
struct klist_waiter waiter;
waiter.node = n;
waiter.process = current;
waiter.woken = 0;
spin_lock(&klist_remove_lock);
list_add(&waiter.list, &klist_remove_waiters);
spin_unlock(&klist_remove_lock);
klist_del(n);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (waiter.woken)
break;
schedule();
}
__set_current_state(TASK_RUNNING);
}
EXPORT_SYMBOL_GPL(klist_remove);
/**
* klist_node_attached - Say whether a node is bound to a list or not.
* @n: Node that we're testing.
*/
int klist_node_attached(struct klist_node *n)
{
return (n->n_klist != NULL);
}
EXPORT_SYMBOL_GPL(klist_node_attached);
/**
* klist_iter_init_node - Initialize a klist_iter structure.
* @k: klist we're iterating.
* @i: klist_iter we're filling.
* @n: node to start with.
*
* Similar to klist_iter_init(), but starts the action off with @n,
* instead of with the list head.
*/
void klist_iter_init_node(struct klist *k, struct klist_iter *i,
struct klist_node *n)
{
i->i_klist = k;
i->i_cur = NULL;
if (n && kref_get_unless_zero(&n->n_ref))
i->i_cur = n;
}
EXPORT_SYMBOL_GPL(klist_iter_init_node);
/**
* klist_iter_init - Iniitalize a klist_iter structure.
* @k: klist we're iterating.
* @i: klist_iter structure we're filling.
*
* Similar to klist_iter_init_node(), but start with the list head.
*/
void klist_iter_init(struct klist *k, struct klist_iter *i)
{
klist_iter_init_node(k, i, NULL);
}
EXPORT_SYMBOL_GPL(klist_iter_init);
/**
* klist_iter_exit - Finish a list iteration.
* @i: Iterator structure.
*
* Must be called when done iterating over list, as it decrements the
* refcount of the current node. Necessary in case iteration exited before
* the end of the list was reached, and always good form.
*/
void klist_iter_exit(struct klist_iter *i)
{
if (i->i_cur) {
klist_put(i->i_cur, false);
i->i_cur = NULL;
}
}
EXPORT_SYMBOL_GPL(klist_iter_exit);
static struct klist_node *to_klist_node(struct list_head *n)
{
return container_of(n, struct klist_node, n_node);
}
/**
* klist_prev - Ante up prev node in list.
* @i: Iterator structure.
*
* First grab list lock. Decrement the reference count of the previous
* node, if there was one. Grab the prev node, increment its reference
* count, drop the lock, and return that prev node.
*/
struct klist_node *klist_prev(struct klist_iter *i)
{
void (*put)(struct klist_node *) = i->i_klist->put;
struct klist_node *last = i->i_cur;
struct klist_node *prev;
unsigned long flags;
spin_lock_irqsave(&i->i_klist->k_lock, flags);
if (last) {
prev = to_klist_node(last->n_node.prev);
if (!klist_dec_and_del(last))
put = NULL;
} else
prev = to_klist_node(i->i_klist->k_list.prev);
i->i_cur = NULL;
while (prev != to_klist_node(&i->i_klist->k_list)) {
if (likely(!knode_dead(prev))) {
kref_get(&prev->n_ref);
i->i_cur = prev;
break;
}
prev = to_klist_node(prev->n_node.prev);
}
spin_unlock_irqrestore(&i->i_klist->k_lock, flags);
if (put && last)
put(last);
return i->i_cur;
}
EXPORT_SYMBOL_GPL(klist_prev);
/**
* klist_next - Ante up next node in list.
* @i: Iterator structure.
*
* First grab list lock. Decrement the reference count of the previous
* node, if there was one. Grab the next node, increment its reference
* count, drop the lock, and return that next node.
*/
struct klist_node *klist_next(struct klist_iter *i)
{
void (*put)(struct klist_node *) = i->i_klist->put;
struct klist_node *last = i->i_cur;
struct klist_node *next;
unsigned long flags;
spin_lock_irqsave(&i->i_klist->k_lock, flags);
if (last) {
next = to_klist_node(last->n_node.next);
if (!klist_dec_and_del(last))
put = NULL;
} else
next = to_klist_node(i->i_klist->k_list.next);
i->i_cur = NULL;
while (next != to_klist_node(&i->i_klist->k_list)) {
if (likely(!knode_dead(next))) {
kref_get(&next->n_ref);
i->i_cur = next;
break;
}
next = to_klist_node(next->n_node.next);
}
spin_unlock_irqrestore(&i->i_klist->k_lock, flags);
if (put && last)
put(last);
return i->i_cur;
}
EXPORT_SYMBOL_GPL(klist_next);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_IVERSION_H
#define _LINUX_IVERSION_H
#include <linux/fs.h>
/*
* The inode->i_version field:
* ---------------------------
* The change attribute (i_version) is mandated by NFSv4 and is mostly for
* knfsd, but is also used for other purposes (e.g. IMA). The i_version must
* appear different to observers if there was a change to the inode's data or
* metadata since it was last queried.
*
* Observers see the i_version as a 64-bit number that never decreases. If it
* remains the same since it was last checked, then nothing has changed in the
* inode. If it's different then something has changed. Observers cannot infer
* anything about the nature or magnitude of the changes from the value, only
* that the inode has changed in some fashion.
*
* Not all filesystems properly implement the i_version counter. Subsystems that
* want to use i_version field on an inode should first check whether the
* filesystem sets the SB_I_VERSION flag (usually via the IS_I_VERSION macro).
*
* Those that set SB_I_VERSION will automatically have their i_version counter
* incremented on writes to normal files. If the SB_I_VERSION is not set, then
* the VFS will not touch it on writes, and the filesystem can use it how it
* wishes. Note that the filesystem is always responsible for updating the
* i_version on namespace changes in directories (mkdir, rmdir, unlink, etc.).
* We consider these sorts of filesystems to have a kernel-managed i_version.
*
* It may be impractical for filesystems to keep i_version updates atomic with
* respect to the changes that cause them. They should, however, guarantee
* that i_version updates are never visible before the changes that caused
* them. Also, i_version updates should never be delayed longer than it takes
* the original change to reach disk.
*
* This implementation uses the low bit in the i_version field as a flag to
* track when the value has been queried. If it has not been queried since it
* was last incremented, we can skip the increment in most cases.
*
* In the event that we're updating the ctime, we will usually go ahead and
* bump the i_version anyway. Since that has to go to stable storage in some
* fashion, we might as well increment it as well.
*
* With this implementation, the value should always appear to observers to
* increase over time if the file has changed. It's recommended to use
* inode_eq_iversion() helper to compare values.
*
* Note that some filesystems (e.g. NFS and AFS) just use the field to store
* a server-provided value (for the most part). For that reason, those
* filesystems do not set SB_I_VERSION. These filesystems are considered to
* have a self-managed i_version.
*
* Persistently storing the i_version
* ----------------------------------
* Queries of the i_version field are not gated on them hitting the backing
* store. It's always possible that the host could crash after allowing
* a query of the value but before it has made it to disk.
*
* To mitigate this problem, filesystems should always use
* inode_set_iversion_queried when loading an existing inode from disk. This
* ensures that the next attempted inode increment will result in the value
* changing.
*
* Storing the value to disk therefore does not count as a query, so those
* filesystems should use inode_peek_iversion to grab the value to be stored.
* There is no need to flag the value as having been queried in that case.
*/
/*
* We borrow the lowest bit in the i_version to use as a flag to tell whether
* it has been queried since we last incremented it. If it has, then we must
* increment it on the next change. After that, we can clear the flag and
* avoid incrementing it again until it has again been queried.
*/
#define I_VERSION_QUERIED_SHIFT (1)
#define I_VERSION_QUERIED (1ULL << (I_VERSION_QUERIED_SHIFT - 1))
#define I_VERSION_INCREMENT (1ULL << I_VERSION_QUERIED_SHIFT)
/**
* inode_set_iversion_raw - set i_version to the specified raw value
* @inode: inode to set
* @val: new i_version value to set
*
* Set @inode's i_version field to @val. This function is for use by
* filesystems that self-manage the i_version.
*
* For example, the NFS client stores its NFSv4 change attribute in this way,
* and the AFS client stores the data_version from the server here.
*/
static inline void
inode_set_iversion_raw(struct inode *inode, u64 val)
{
atomic64_set(&inode->i_version, val);
}
/**
* inode_peek_iversion_raw - grab a "raw" iversion value
* @inode: inode from which i_version should be read
*
* Grab a "raw" inode->i_version value and return it. The i_version is not
* flagged or converted in any way. This is mostly used to access a self-managed
* i_version.
*
* With those filesystems, we want to treat the i_version as an entirely
* opaque value.
*/
static inline u64
inode_peek_iversion_raw(const struct inode *inode)
{
return atomic64_read(&inode->i_version);
}
/**
* inode_set_max_iversion_raw - update i_version new value is larger
* @inode: inode to set
* @val: new i_version to set
*
* Some self-managed filesystems (e.g Ceph) will only update the i_version
* value if the new value is larger than the one we already have.
*/
static inline void
inode_set_max_iversion_raw(struct inode *inode, u64 val)
{
u64 cur, old;
cur = inode_peek_iversion_raw(inode);
for (;;) {
if (cur > val)
break;
old = atomic64_cmpxchg(&inode->i_version, cur, val);
if (likely(old == cur))
break;
cur = old;
}
}
/**
* inode_set_iversion - set i_version to a particular value
* @inode: inode to set
* @val: new i_version value to set
*
* Set @inode's i_version field to @val. This function is for filesystems with
* a kernel-managed i_version, for initializing a newly-created inode from
* scratch.
*
* In this case, we do not set the QUERIED flag since we know that this value
* has never been queried.
*/
static inline void
inode_set_iversion(struct inode *inode, u64 val)
{
inode_set_iversion_raw(inode, val << I_VERSION_QUERIED_SHIFT);
}
/**
* inode_set_iversion_queried - set i_version to a particular value as quereied
* @inode: inode to set
* @val: new i_version value to set
*
* Set @inode's i_version field to @val, and flag it for increment on the next
* change.
*
* Filesystems that persistently store the i_version on disk should use this
* when loading an existing inode from disk.
*
* When loading in an i_version value from a backing store, we can't be certain
* that it wasn't previously viewed before being stored. Thus, we must assume
* that it was, to ensure that we don't end up handing out the same value for
* different versions of the same inode.
*/
static inline void
inode_set_iversion_queried(struct inode *inode, u64 val)
{
inode_set_iversion_raw(inode, (val << I_VERSION_QUERIED_SHIFT) |
I_VERSION_QUERIED);
}
/**
* inode_maybe_inc_iversion - increments i_version
* @inode: inode with the i_version that should be updated
* @force: increment the counter even if it's not necessary?
*
* Every time the inode is modified, the i_version field must be seen to have
* changed by any observer.
*
* If "force" is set or the QUERIED flag is set, then ensure that we increment
* the value, and clear the queried flag.
*
* In the common case where neither is set, then we can return "false" without
* updating i_version.
*
* If this function returns false, and no other metadata has changed, then we
* can avoid logging the metadata.
*/
static inline bool
inode_maybe_inc_iversion(struct inode *inode, bool force)
{
u64 cur, old, new;
/*
* The i_version field is not strictly ordered with any other inode
* information, but the legacy inode_inc_iversion code used a spinlock
* to serialize increments.
*
* Here, we add full memory barriers to ensure that any de-facto
* ordering with other info is preserved.
*
* This barrier pairs with the barrier in inode_query_iversion()
*/
smp_mb();
cur = inode_peek_iversion_raw(inode);
for (;;) {
/* If flag is clear then we needn't do anything */
if (!force && !(cur & I_VERSION_QUERIED))
return false;
/* Since lowest bit is flag, add 2 to avoid it */
new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
old = atomic64_cmpxchg(&inode->i_version, cur, new);
if (likely(old == cur))
break;
cur = old;
}
return true;
}
/**
* inode_inc_iversion - forcibly increment i_version
* @inode: inode that needs to be updated
*
* Forcbily increment the i_version field. This always results in a change to
* the observable value.
*/
static inline void
inode_inc_iversion(struct inode *inode)
{
inode_maybe_inc_iversion(inode, true);
}
/**
* inode_iversion_need_inc - is the i_version in need of being incremented?
* @inode: inode to check
*
* Returns whether the inode->i_version counter needs incrementing on the next
* change. Just fetch the value and check the QUERIED flag.
*/
static inline bool
inode_iversion_need_inc(struct inode *inode)
{
return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED;
}
/**
* inode_inc_iversion_raw - forcibly increment raw i_version
* @inode: inode that needs to be updated
*
* Forcbily increment the raw i_version field. This always results in a change
* to the raw value.
*
* NFS will use the i_version field to store the value from the server. It
* mostly treats it as opaque, but in the case where it holds a write
* delegation, it must increment the value itself. This function does that.
*/
static inline void
inode_inc_iversion_raw(struct inode *inode)
{
atomic64_inc(&inode->i_version);
}
/**
* inode_peek_iversion - read i_version without flagging it to be incremented
* @inode: inode from which i_version should be read
*
* Read the inode i_version counter for an inode without registering it as a
* query.
*
* This is typically used by local filesystems that need to store an i_version
* on disk. In that situation, it's not necessary to flag it as having been
* viewed, as the result won't be used to gauge changes from that point.
*/
static inline u64
inode_peek_iversion(const struct inode *inode)
{
return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT;
}
/**
* inode_query_iversion - read i_version for later use
* @inode: inode from which i_version should be read
*
* Read the inode i_version counter. This should be used by callers that wish
* to store the returned i_version for later comparison. This will guarantee
* that a later query of the i_version will result in a different value if
* anything has changed.
*
* In this implementation, we fetch the current value, set the QUERIED flag and
* then try to swap it into place with a cmpxchg, if it wasn't already set. If
* that fails, we try again with the newly fetched value from the cmpxchg.
*/
static inline u64
inode_query_iversion(struct inode *inode)
{
u64 cur, old, new;
cur = inode_peek_iversion_raw(inode);
for (;;) {
/* If flag is already set, then no need to swap */
if (cur & I_VERSION_QUERIED) {
/*
* This barrier (and the implicit barrier in the
* cmpxchg below) pairs with the barrier in
* inode_maybe_inc_iversion().
*/
smp_mb();
break;
}
new = cur | I_VERSION_QUERIED;
old = atomic64_cmpxchg(&inode->i_version, cur, new);
if (likely(old == cur))
break;
cur = old;
}
return cur >> I_VERSION_QUERIED_SHIFT;
}
/*
* For filesystems without any sort of change attribute, the best we can
* do is fake one up from the ctime:
*/
static inline u64 time_to_chattr(struct timespec64 *t)
{
u64 chattr = t->tv_sec;
chattr <<= 32;
chattr += t->tv_nsec;
return chattr;
}
/**
* inode_eq_iversion_raw - check whether the raw i_version counter has changed
* @inode: inode to check
* @old: old value to check against its i_version
*
* Compare the current raw i_version counter with a previous one. Returns true
* if they are the same or false if they are different.
*/
static inline bool
inode_eq_iversion_raw(const struct inode *inode, u64 old)
{
return inode_peek_iversion_raw(inode) == old;
}
/**
* inode_eq_iversion - check whether the i_version counter has changed
* @inode: inode to check
* @old: old value to check against its i_version
*
* Compare an i_version counter with a previous one. Returns true if they are
* the same, and false if they are different.
*
* Note that we don't need to set the QUERIED flag in this case, as the value
* in the inode is not being recorded for later use.
*/
static inline bool
inode_eq_iversion(const struct inode *inode, u64 old)
{
return inode_peek_iversion(inode) == old;
}
#endif
// SPDX-License-Identifier: GPL-2.0
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/sizes.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
/*
* This contains the logic to handle async discard.
*
* Async discard manages trimming of free space outside of transaction commit.
* Discarding is done by managing the block_groups on a LRU list based on free
* space recency. Two passes are used to first prioritize discarding extents
* and then allow for trimming in the bitmap the best opportunity to coalesce.
* The block_groups are maintained on multiple lists to allow for multiple
* passes with different discard filter requirements. A delayed work item is
* used to manage discarding with timeout determined by a max of the delay
* incurred by the iops rate limit, the byte rate limit, and the max delay of
* BTRFS_DISCARD_MAX_DELAY.
*
* Note, this only keeps track of block_groups that are explicitly for data.
* Mixed block_groups are not supported.
*
* The first list is special to manage discarding of fully free block groups.
* This is necessary because we issue a final trim for a full free block group
* after forgetting it. When a block group becomes unused, instead of directly
* being added to the unused_bgs list, we add it to this first list. Then
* from there, if it becomes fully discarded, we place it onto the unused_bgs
* list.
*
* The in-memory free space cache serves as the backing state for discard.
* Consequently this means there is no persistence. We opt to load all the
* block groups in as not discarded, so the mount case degenerates to the
* crashing case.
*
* As the free space cache uses bitmaps, there exists a tradeoff between
* ease/efficiency for find_free_extent() and the accuracy of discard state.
* Here we opt to let untrimmed regions merge with everything while only letting
* trimmed regions merge with other trimmed regions. This can cause
* overtrimming, but the coalescing benefit seems to be worth it. Additionally,
* bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
* the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
* this resets the state and we will retry trimming the whole bitmap. This is a
* tradeoff between discard state accuracy and the cost of accounting.
*/
/* This is an initial delay to give some chance for block reuse */
#define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC)
#define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC)
/* Target completion latency of discarding all discardable extents */
#define BTRFS_DISCARD_TARGET_MSEC (6 * 60 * 60UL * MSEC_PER_SEC)
#define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
#define BTRFS_DISCARD_MAX_IOPS (10U)
/* Montonically decreasing minimum length filters after index 0 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
0,
BTRFS_ASYNC_DISCARD_MAX_FILTER,
BTRFS_ASYNC_DISCARD_MIN_FILTER
};
static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
return &discard_ctl->discard_list[block_group->discard_index];
}
static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
if (!btrfs_run_discard_work(discard_ctl))
return;
if (list_empty(&block_group->discard_list) ||
block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
block_group->discard_index = BTRFS_DISCARD_INDEX_START;
block_group->discard_eligible_time = (ktime_get_ns() +
BTRFS_DISCARD_DELAY);
block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
}
list_move_tail(&block_group->discard_list,
get_discard_list(discard_ctl, block_group));
}
static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
if (!btrfs_is_block_group_data_only(block_group))
return;
spin_lock(&discard_ctl->lock);
__add_to_discard_list(discard_ctl, block_group);
spin_unlock(&discard_ctl->lock);
}
static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
spin_lock(&discard_ctl->lock);
if (!btrfs_run_discard_work(discard_ctl)) {
spin_unlock(&discard_ctl->lock);
return;
}
list_del_init(&block_group->discard_list);
block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
block_group->discard_eligible_time = (ktime_get_ns() +
BTRFS_DISCARD_UNUSED_DELAY);
block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
list_add_tail(&block_group->discard_list,
&discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
spin_unlock(&discard_ctl->lock);
}
static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
bool running = false;
spin_lock(&discard_ctl->lock);
if (block_group == discard_ctl->block_group) {
running = true;
discard_ctl->block_group = NULL;
}
block_group->discard_eligible_time = 0;
list_del_init(&block_group->discard_list);
spin_unlock(&discard_ctl->lock);
return running;
}
/**
* find_next_block_group - find block_group that's up next for discarding
* @discard_ctl: discard control
* @now: current time
*
* Iterate over the discard lists to find the next block_group up for
* discarding checking the discard_eligible_time of block_group.
*/
static struct btrfs_block_group *find_next_block_group(
struct btrfs_discard_ctl *discard_ctl,
u64 now)
{
struct btrfs_block_group *ret_block_group = NULL, *block_group;
int i;
for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
struct list_head *discard_list = &discard_ctl->discard_list[i];
if (!list_empty(discard_list)) {
block_group = list_first_entry(discard_list,
struct btrfs_block_group,
discard_list);
if (!ret_block_group)
ret_block_group = block_group;
if (ret_block_group->discard_eligible_time < now)
break;
if (ret_block_group->discard_eligible_time >
block_group->discard_eligible_time)
ret_block_group = block_group;
}
}
return ret_block_group;
}
/**
* Wrap find_next_block_group()
*
* @discard_ctl: discard control
* @discard_state: the discard_state of the block_group after state management
* @discard_index: the discard_index of the block_group after state management
* @now: time when discard was invoked, in ns
*
* This wraps find_next_block_group() and sets the block_group to be in use.
* discard_state's control flow is managed here. Variables related to
* discard_state are reset here as needed (eg discard_cursor). @discard_state
* and @discard_index are remembered as it may change while we're discarding,
* but we want the discard to execute in the context determined here.
*/
static struct btrfs_block_group *peek_discard_list(
struct btrfs_discard_ctl *discard_ctl,
enum btrfs_discard_state *discard_state,
int *discard_index, u64 now)
{
struct btrfs_block_group *block_group;
spin_lock(&discard_ctl->lock);
again:
block_group = find_next_block_group(discard_ctl, now);
if (block_group && now >= block_group->discard_eligible_time) {
if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
block_group->used != 0) {
if (btrfs_is_block_group_data_only(block_group))
__add_to_discard_list(discard_ctl, block_group);
else
list_del_init(&block_group->discard_list);
goto again;
}
if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
block_group->discard_cursor = block_group->start;
block_group->discard_state = BTRFS_DISCARD_EXTENTS;
}
discard_ctl->block_group = block_group;
}
if (block_group) {
*discard_state = block_group->discard_state;
*discard_index = block_group->discard_index;
}
spin_unlock(&discard_ctl->lock);
return block_group;
}
/**
* btrfs_discard_check_filter - updates a block groups filters
* @block_group: block group of interest
* @bytes: recently freed region size after coalescing
*
* Async discard maintains multiple lists with progressively smaller filters
* to prioritize discarding based on size. Should a free space that matches
* a larger filter be returned to the free_space_cache, prioritize that discard
* by moving @block_group to the proper filter.
*/
void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
u64 bytes)
{
struct btrfs_discard_ctl *discard_ctl;
if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
return;
discard_ctl = &block_group->fs_info->discard_ctl; if (block_group->discard_index > BTRFS_DISCARD_INDEX_START && bytes >= discard_minlen[block_group->discard_index - 1]) {
int i;
remove_from_discard_list(discard_ctl, block_group);
for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
i++) {
if (bytes >= discard_minlen[i]) { block_group->discard_index = i;
add_to_discard_list(discard_ctl, block_group);
break;
}
}
}
}
/**
* btrfs_update_discard_index - moves a block group along the discard lists
* @discard_ctl: discard control
* @block_group: block_group of interest
*
* Increment @block_group's discard_index. If it falls of the list, let it be.
* Otherwise add it back to the appropriate list.
*/
static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
block_group->discard_index++;
if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
block_group->discard_index = 1;
return;
}
add_to_discard_list(discard_ctl, block_group);
}
/**
* btrfs_discard_cancel_work - remove a block_group from the discard lists
* @discard_ctl: discard control
* @block_group: block_group of interest
*
* This removes @block_group from the discard lists. If necessary, it waits on
* the current work and then reschedules the delayed work.
*/
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
if (remove_from_discard_list(discard_ctl, block_group)) {
cancel_delayed_work_sync(&discard_ctl->work);
btrfs_discard_schedule_work(discard_ctl, true);
}
}
/**
* btrfs_discard_queue_work - handles queuing the block_groups
* @discard_ctl: discard control
* @block_group: block_group of interest
*
* This maintains the LRU order of the discard lists.
*/
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
return;
if (block_group->used == 0)
add_to_discard_unused_list(discard_ctl, block_group);
else
add_to_discard_list(discard_ctl, block_group);
if (!delayed_work_pending(&discard_ctl->work))
btrfs_discard_schedule_work(discard_ctl, false);
}
static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
u64 now, bool override)
{
struct btrfs_block_group *block_group;
if (!btrfs_run_discard_work(discard_ctl))
return;
if (!override && delayed_work_pending(&discard_ctl->work))
return;
block_group = find_next_block_group(discard_ctl, now);
if (block_group) {
u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
/*
* A single delayed workqueue item is responsible for
* discarding, so we can manage the bytes rate limit by keeping
* track of the previous discard.
*/
if (kbps_limit && discard_ctl->prev_discard) {
u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
u64 bps_delay = div64_u64(discard_ctl->prev_discard *
NSEC_PER_SEC, bps_limit);
delay = max(delay, bps_delay);
}
/*
* This timeout is to hopefully prevent immediate discarding
* in a recently allocated block group.
*/
if (now < block_group->discard_eligible_time) {
u64 bg_timeout = block_group->discard_eligible_time - now;
delay = max(delay, bg_timeout);
}
if (override && discard_ctl->prev_discard) {
u64 elapsed = now - discard_ctl->prev_discard_time;
if (delay > elapsed)
delay -= elapsed;
else
delay = 0;
}
mod_delayed_work(discard_ctl->discard_workers,
&discard_ctl->work, nsecs_to_jiffies(delay));
}
}
/*
* btrfs_discard_schedule_work - responsible for scheduling the discard work
* @discard_ctl: discard control
* @override: override the current timer
*
* Discards are issued by a delayed workqueue item. @override is used to
* update the current delay as the baseline delay interval is reevaluated on
* transaction commit. This is also maxed with any other rate limit.
*/
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
bool override)
{
const u64 now = ktime_get_ns();
spin_lock(&discard_ctl->lock);
__btrfs_discard_schedule_work(discard_ctl, now, override);
spin_unlock(&discard_ctl->lock);
}
/**
* btrfs_finish_discard_pass - determine next step of a block_group
* @discard_ctl: discard control
* @block_group: block_group of interest
*
* This determines the next step for a block group after it's finished going
* through a pass on a discard list. If it is unused and fully trimmed, we can
* mark it unused and send it to the unused_bgs path. Otherwise, pass it onto
* the appropriate filter list or let it fall off.
*/
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
remove_from_discard_list(discard_ctl, block_group);
if (block_group->used == 0) {
if (btrfs_is_free_space_trimmed(block_group))
btrfs_mark_bg_unused(block_group);
else
add_to_discard_unused_list(discard_ctl, block_group);
} else {
btrfs_update_discard_index(discard_ctl, block_group);
}
}
/**
* btrfs_discard_workfn - discard work function
* @work: work
*
* This finds the next block_group to start discarding and then discards a
* single region. It does this in a two-pass fashion: first extents and second
* bitmaps. Completely discarded block groups are sent to the unused_bgs path.
*/
static void btrfs_discard_workfn(struct work_struct *work)
{
struct btrfs_discard_ctl *discard_ctl;
struct btrfs_block_group *block_group;
enum btrfs_discard_state discard_state;
int discard_index = 0;
u64 trimmed = 0;
u64 minlen = 0;
u64 now = ktime_get_ns();
discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
block_group = peek_discard_list(discard_ctl, &discard_state,
&discard_index, now);
if (!block_group || !btrfs_run_discard_work(discard_ctl))
return;
if (now < block_group->discard_eligible_time) {
btrfs_discard_schedule_work(discard_ctl, false);
return;
}
/* Perform discarding */
minlen = discard_minlen[discard_index];
if (discard_state == BTRFS_DISCARD_BITMAPS) {
u64 maxlen = 0;
/*
* Use the previous levels minimum discard length as the max
* length filter. In the case something is added to make a
* region go beyond the max filter, the entire bitmap is set
* back to BTRFS_TRIM_STATE_UNTRIMMED.
*/
if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
maxlen = discard_minlen[discard_index - 1];
btrfs_trim_block_group_bitmaps(block_group, &trimmed,
block_group->discard_cursor,
btrfs_block_group_end(block_group),
minlen, maxlen, true);
discard_ctl->discard_bitmap_bytes += trimmed;
} else {
btrfs_trim_block_group_extents(block_group, &trimmed,
block_group->discard_cursor,
btrfs_block_group_end(block_group),
minlen, true);
discard_ctl->discard_extent_bytes += trimmed;
}
/* Determine next steps for a block_group */
if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
if (discard_state == BTRFS_DISCARD_BITMAPS) {
btrfs_finish_discard_pass(discard_ctl, block_group);
} else {
block_group->discard_cursor = block_group->start;
spin_lock(&discard_ctl->lock);
if (block_group->discard_state !=
BTRFS_DISCARD_RESET_CURSOR)
block_group->discard_state =
BTRFS_DISCARD_BITMAPS;
spin_unlock(&discard_ctl->lock);
}
}
now = ktime_get_ns();
spin_lock(&discard_ctl->lock);
discard_ctl->prev_discard = trimmed;
discard_ctl->prev_discard_time = now;
discard_ctl->block_group = NULL;
__btrfs_discard_schedule_work(discard_ctl, now, false);
spin_unlock(&discard_ctl->lock);
}
/**
* btrfs_run_discard_work - determines if async discard should be running
* @discard_ctl: discard control
*
* Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
*/
bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
{
struct btrfs_fs_info *fs_info = container_of(discard_ctl,
struct btrfs_fs_info,
discard_ctl);
return (!(fs_info->sb->s_flags & SB_RDONLY) &&
test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}
/**
* btrfs_discard_calc_delay - recalculate the base delay
* @discard_ctl: discard control
*
* Recalculate the base delay which is based off the total number of
* discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
* and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
*/
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
{
s32 discardable_extents;
s64 discardable_bytes;
u32 iops_limit;
unsigned long delay;
discardable_extents = atomic_read(&discard_ctl->discardable_extents);
if (!discardable_extents)
return;
spin_lock(&discard_ctl->lock);
/*
* The following is to fix a potential -1 discrepenancy that we're not
* sure how to reproduce. But given that this is the only place that
* utilizes these numbers and this is only called by from
* btrfs_finish_extent_commit() which is synchronized, we can correct
* here.
*/
if (discardable_extents < 0)
atomic_add(-discardable_extents,
&discard_ctl->discardable_extents);
discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
if (discardable_bytes < 0)
atomic64_add(-discardable_bytes,
&discard_ctl->discardable_bytes);
if (discardable_extents <= 0) {
spin_unlock(&discard_ctl->lock);
return;
}
iops_limit = READ_ONCE(discard_ctl->iops_limit);
if (iops_limit)
delay = MSEC_PER_SEC / iops_limit;
else
delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC,
BTRFS_DISCARD_MAX_DELAY_MSEC);
discard_ctl->delay_ms = delay;
spin_unlock(&discard_ctl->lock);
}
/**
* btrfs_discard_update_discardable - propagate discard counters
* @block_group: block_group of interest
*
* This propagates deltas of counters up to the discard_ctl. It maintains a
* current counter and a previous counter passing the delta up to the global
* stat. Then the current counter value becomes the previous counter value.
*/
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
struct btrfs_free_space_ctl *ctl;
struct btrfs_discard_ctl *discard_ctl;
s32 extents_delta;
s64 bytes_delta;
if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
!btrfs_is_block_group_data_only(block_group))
return;
ctl = block_group->free_space_ctl;
discard_ctl = &block_group->fs_info->discard_ctl;
lockdep_assert_held(&ctl->tree_lock);
extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
ctl->discardable_extents[BTRFS_STAT_PREV];
if (extents_delta) {
atomic_add(extents_delta, &discard_ctl->discardable_extents);
ctl->discardable_extents[BTRFS_STAT_PREV] =
ctl->discardable_extents[BTRFS_STAT_CURR];
}
bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
ctl->discardable_bytes[BTRFS_STAT_PREV];
if (bytes_delta) {
atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
ctl->discardable_bytes[BTRFS_STAT_PREV] =
ctl->discardable_bytes[BTRFS_STAT_CURR];
}
}
/**
* btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists
* @fs_info: fs_info of interest
*
* The unused_bgs list needs to be punted to the discard lists because the
* order of operations is changed. In the normal synchronous discard path, the
* block groups are trimmed via a single large trim in transaction commit. This
* is ultimately what we are trying to avoid with asynchronous discard. Thus,
* it must be done before going down the unused_bgs path.
*/
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_group *block_group, *next;
spin_lock(&fs_info->unused_bgs_lock);
/* We enabled async discard, so punt all to the queue */
list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
bg_list) {
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
}
spin_unlock(&fs_info->unused_bgs_lock);
}
/**
* btrfs_discard_purge_list - purge discard lists
* @discard_ctl: discard control
*
* If we are disabling async discard, we may have intercepted block groups that
* are completely free and ready for the unused_bgs path. As discarding will
* now happen in transaction commit or not at all, we can safely mark the
* corresponding block groups as unused and they will be sent on their merry
* way to the unused_bgs list.
*/
static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
{
struct btrfs_block_group *block_group, *next;
int i;
spin_lock(&discard_ctl->lock);
for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { list_for_each_entry_safe(block_group, next,
&discard_ctl->discard_list[i],
discard_list) {
list_del_init(&block_group->discard_list);
spin_unlock(&discard_ctl->lock);
if (block_group->used == 0)
btrfs_mark_bg_unused(block_group);
spin_lock(&discard_ctl->lock);
}
}
spin_unlock(&discard_ctl->lock);
}
void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
{
if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) { btrfs_discard_cleanup(fs_info);
return;
}
btrfs_discard_punt_unused_bgs_list(fs_info);
set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}
void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}
void btrfs_discard_init(struct btrfs_fs_info *fs_info)
{
struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
int i;
spin_lock_init(&discard_ctl->lock);
INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);
for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
discard_ctl->prev_discard = 0;
discard_ctl->prev_discard_time = 0;
atomic_set(&discard_ctl->discardable_extents, 0);
atomic64_set(&discard_ctl->discardable_bytes, 0);
discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
discard_ctl->kbps_limit = 0;
discard_ctl->discard_extent_bytes = 0;
discard_ctl->discard_bitmap_bytes = 0;
atomic64_set(&discard_ctl->discard_bytes_saved, 0);
}
void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
{
btrfs_discard_stop(fs_info);
cancel_delayed_work_sync(&fs_info->discard_ctl.work);
btrfs_discard_purge_list(&fs_info->discard_ctl);
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2009 Oracle. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
#include "tree-mod-log.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
struct kmem_cache *btrfs_delayed_data_ref_cachep;
struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
* delayed back reference update tracking. For subvolume trees
* we queue up extent allocations and backref maintenance for
* delayed processing. This avoids deep call chains where we
* add extents in the middle of btrfs_search_slot, and it allows
* us to buffer up frequently modified backrefs in an rb tree instead
* of hammering updates on the extent allocation tree.
*/
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
bool ret = false;
u64 reserved;
spin_lock(&global_rsv->lock);
reserved = global_rsv->reserved;
spin_unlock(&global_rsv->lock);
/*
* Since the global reserve is just kind of magic we don't really want
* to rely on it to save our bacon, so if our size is more than the
* delayed_refs_rsv and the global rsv then it's time to think about
* bailing.
*/
spin_lock(&delayed_refs_rsv->lock);
reserved += delayed_refs_rsv->reserved;
if (delayed_refs_rsv->size >= reserved)
ret = true;
spin_unlock(&delayed_refs_rsv->lock);
return ret;
}
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
{
u64 num_entries =
atomic_read(&trans->transaction->delayed_refs.num_entries);
u64 avg_runtime;
u64 val;
smp_mb();
avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
val = num_entries * avg_runtime;
if (val >= NSEC_PER_SEC)
return 1;
if (val >= NSEC_PER_SEC / 2)
return 2;
return btrfs_check_space_for_delayed_refs(trans->fs_info);
}
/**
* Release a ref head's reservation
*
* @fs_info: the filesystem
* @nr: number of items to drop
*
* This drops the delayed ref head's count from the delayed refs rsv and frees
* any excess reservation we had.
*/
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
{
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
u64 released = 0;
released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
if (released)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
0, released, 0);
}
/*
* btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
* @trans - the trans that may have generated delayed refs
*
* This is to be called anytime we may have adjusted trans->delayed_ref_updates,
* it'll calculate the additional size and add it to the delayed_refs_rsv.
*/
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
u64 num_bytes;
if (!trans->delayed_ref_updates)
return;
num_bytes = btrfs_calc_insert_metadata_size(fs_info,
trans->delayed_ref_updates);
spin_lock(&delayed_rsv->lock);
delayed_rsv->size += num_bytes;
delayed_rsv->full = 0;
spin_unlock(&delayed_rsv->lock);
trans->delayed_ref_updates = 0;
}
/**
* Transfer bytes to our delayed refs rsv
*
* @fs_info: the filesystem
* @src: source block rsv to transfer from
* @num_bytes: number of bytes to transfer
*
* This transfers up to the num_bytes amount from the src rsv to the
* delayed_refs_rsv. Any extra bytes are returned to the space info.
*/
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *src,
u64 num_bytes)
{
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
u64 to_free = 0;
spin_lock(&src->lock);
src->reserved -= num_bytes;
src->size -= num_bytes;
spin_unlock(&src->lock);
spin_lock(&delayed_refs_rsv->lock);
if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
u64 delta = delayed_refs_rsv->size -
delayed_refs_rsv->reserved;
if (num_bytes > delta) {
to_free = num_bytes - delta;
num_bytes = delta;
}
} else {
to_free = num_bytes;
num_bytes = 0;
}
if (num_bytes) delayed_refs_rsv->reserved += num_bytes;
if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
delayed_refs_rsv->full = 1;
spin_unlock(&delayed_refs_rsv->lock);
if (num_bytes)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
0, num_bytes, 1);
if (to_free) btrfs_space_info_free_bytes_may_use(fs_info,
delayed_refs_rsv->space_info, to_free);
}
/**
* Refill based on our delayed refs usage
*
* @fs_info: the filesystem
* @flush: control how we can flush for this reservation.
*
* This will refill the delayed block_rsv up to 1 items size worth of space and
* will return -ENOSPC if we can't make the reservation.
*/
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush)
{
struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1);
u64 num_bytes = 0;
int ret = -ENOSPC;
spin_lock(&block_rsv->lock);
if (block_rsv->reserved < block_rsv->size) {
num_bytes = block_rsv->size - block_rsv->reserved;
num_bytes = min(num_bytes, limit);
}
spin_unlock(&block_rsv->lock);
if (!num_bytes)
return 0;
ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
num_bytes, flush);
if (ret)
return ret;
btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
0, num_bytes, 1);
return 0;
}
/*
* compare two delayed tree backrefs with same bytenr and type
*/
static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
struct btrfs_delayed_tree_ref *ref2)
{
if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
if (ref1->root < ref2->root)
return -1;
if (ref1->root > ref2->root)
return 1;
} else {
if (ref1->parent < ref2->parent)
return -1;
if (ref1->parent > ref2->parent)
return 1;
}
return 0;
}
/*
* compare two delayed data backrefs with same bytenr and type
*/
static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
struct btrfs_delayed_data_ref *ref2)
{
if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { if (ref1->root < ref2->root)
return -1;
if (ref1->root > ref2->root)
return 1;
if (ref1->objectid < ref2->objectid)
return -1;
if (ref1->objectid > ref2->objectid)
return 1;
if (ref1->offset < ref2->offset)
return -1;
if (ref1->offset > ref2->offset)
return 1;
} else {
if (ref1->parent < ref2->parent)
return -1;
if (ref1->parent > ref2->parent)
return 1;
}
return 0;
}
static int comp_refs(struct btrfs_delayed_ref_node *ref1,
struct btrfs_delayed_ref_node *ref2,
bool check_seq)
{
int ret = 0;
if (ref1->type < ref2->type)
return -1;
if (ref1->type > ref2->type) return 1; if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
btrfs_delayed_node_to_tree_ref(ref2));
else
ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
btrfs_delayed_node_to_data_ref(ref2));
if (ret)
return ret;
if (check_seq) { if (ref1->seq < ref2->seq)
return -1;
if (ref1->seq > ref2->seq)
return 1;
}
return 0;
}
/* insert a new ref to head ref rbtree */
static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
struct rb_node *node)
{
struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *parent_node = NULL;
struct btrfs_delayed_ref_head *entry;
struct btrfs_delayed_ref_head *ins;
u64 bytenr;
bool leftmost = true;
ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
bytenr = ins->bytenr;
while (*p) {
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
href_node);
if (bytenr < entry->bytenr) { p = &(*p)->rb_left; } else if (bytenr > entry->bytenr) { p = &(*p)->rb_right;
leftmost = false;
} else {
return entry;
}
}
rb_link_node(node, parent_node, p);
rb_insert_color_cached(node, root, leftmost);
return NULL;
}
static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
struct btrfs_delayed_ref_node *ins)
{
struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *node = &ins->ref_node;
struct rb_node *parent_node = NULL;
struct btrfs_delayed_ref_node *entry;
bool leftmost = true;
while (*p) {
int comp;
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
ref_node);
comp = comp_refs(ins, entry, true);
if (comp < 0) {
p = &(*p)->rb_left; } else if (comp > 0) { p = &(*p)->rb_right;
leftmost = false;
} else {
return entry;
}
}
rb_link_node(node, parent_node, p);
rb_insert_color_cached(node, root, leftmost);
return NULL;
}
static struct btrfs_delayed_ref_head *find_first_ref_head(
struct btrfs_delayed_ref_root *dr)
{
struct rb_node *n;
struct btrfs_delayed_ref_head *entry;
n = rb_first_cached(&dr->href_root);
if (!n)
return NULL;
entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
return entry;
}
/*
* Find a head entry based on bytenr. This returns the delayed ref head if it
* was able to find one, or NULL if nothing was in that spot. If return_bigger
* is given, the next bigger entry is returned if no exact match is found.
*/
static struct btrfs_delayed_ref_head *find_ref_head(
struct btrfs_delayed_ref_root *dr, u64 bytenr,
bool return_bigger)
{
struct rb_root *root = &dr->href_root.rb_root;
struct rb_node *n;
struct btrfs_delayed_ref_head *entry;
n = root->rb_node;
entry = NULL;
while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
if (bytenr < entry->bytenr)
n = n->rb_left; else if (bytenr > entry->bytenr) n = n->rb_right;
else
return entry;
}
if (entry && return_bigger) { if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node);
if (!n)
return NULL;
entry = rb_entry(n, struct btrfs_delayed_ref_head,
href_node);
}
return entry;
}
return NULL;
}
int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
lockdep_assert_held(&delayed_refs->lock);
if (mutex_trylock(&head->mutex)) return 0; refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
spin_lock(&delayed_refs->lock);
if (RB_EMPTY_NODE(&head->href_node)) {
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
return -EAGAIN;
}
btrfs_put_delayed_ref_head(head);
return 0;
}
static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref)
{
lockdep_assert_held(&head->lock);
rb_erase_cached(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
}
static bool merge_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref,
u64 seq)
{
struct btrfs_delayed_ref_node *next;
struct rb_node *node = rb_next(&ref->ref_node);
bool done = false;
while (!done && node) {
int mod;
next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
node = rb_next(node); if (seq && next->seq >= seq)
break;
if (comp_refs(ref, next, false))
break;
if (ref->action == next->action) {
mod = next->ref_mod;
} else {
if (ref->ref_mod < next->ref_mod) {
swap(ref, next);
done = true;
}
mod = -next->ref_mod;
}
drop_delayed_ref(trans, delayed_refs, head, next);
ref->ref_mod += mod;
if (ref->ref_mod == 0) {
drop_delayed_ref(trans, delayed_refs, head, ref);
done = true;
} else {
/*
* Can't have multiples of the same ref on a tree block.
*/
WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
}
}
return done;
}
void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_node *ref;
struct rb_node *node;
u64 seq = 0;
lockdep_assert_held(&head->lock);
if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
return;
/* We don't have too many refs to merge for data. */
if (head->is_data)
return;
seq = btrfs_tree_mod_log_lowest_seq(fs_info);
again:
for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
continue;
if (merge_ref(trans, delayed_refs, head, ref, seq))
goto again;
}
}
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
int ret = 0;
u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);
if (min_seq != 0 && seq >= min_seq) {
btrfs_debug(fs_info,
"holding back delayed_ref %llu, lowest is %llu",
seq, min_seq);
ret = 1;
}
return ret;
}
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
struct btrfs_delayed_ref_root *delayed_refs)
{
struct btrfs_delayed_ref_head *head;
again:
head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
true);
if (!head && delayed_refs->run_delayed_start != 0) { delayed_refs->run_delayed_start = 0;
head = find_first_ref_head(delayed_refs);
}
if (!head)
return NULL;
while (head->processing) {
struct rb_node *node;
node = rb_next(&head->href_node);
if (!node) {
if (delayed_refs->run_delayed_start == 0)
return NULL;
delayed_refs->run_delayed_start = 0;
goto again;
}
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
}
head->processing = 1; WARN_ON(delayed_refs->num_heads_ready == 0); delayed_refs->num_heads_ready--;
delayed_refs->run_delayed_start = head->bytenr +
head->num_bytes;
return head;
}
void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
lockdep_assert_held(&delayed_refs->lock);
lockdep_assert_held(&head->lock);
rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
atomic_dec(&delayed_refs->num_entries);
delayed_refs->num_heads--;
if (head->processing == 0)
delayed_refs->num_heads_ready--;
}
/*
* Helper to insert the ref_node to the tail or merge with tail.
*
* Return 0 for insert.
* Return >0 for merge.
*/
static int insert_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *root,
struct btrfs_delayed_ref_head *href,
struct btrfs_delayed_ref_node *ref)
{
struct btrfs_delayed_ref_node *exist;
int mod;
int ret = 0;
spin_lock(&href->lock);
exist = tree_insert(&href->ref_tree, ref);
if (!exist)
goto inserted;
/* Now we are sure we can merge */
ret = 1;
if (exist->action == ref->action) {
mod = ref->ref_mod;
} else {
/* Need to change action */
if (exist->ref_mod < ref->ref_mod) { exist->action = ref->action;
mod = -exist->ref_mod;
exist->ref_mod = ref->ref_mod;
if (ref->action == BTRFS_ADD_DELAYED_REF)
list_add_tail(&exist->add_list,
&href->ref_add_list);
else if (ref->action == BTRFS_DROP_DELAYED_REF) {
ASSERT(!list_empty(&exist->add_list));
list_del(&exist->add_list);
} else {
ASSERT(0);
}
} else
mod = -ref->ref_mod;
}
exist->ref_mod += mod;
/* remove existing tail if its ref_mod is zero */
if (exist->ref_mod == 0)
drop_delayed_ref(trans, root, href, exist);
spin_unlock(&href->lock);
return ret;
inserted:
if (ref->action == BTRFS_ADD_DELAYED_REF)
list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries);
spin_unlock(&href->lock);
return ret;
}
/*
* helper function to update the accounting in the head ref
* existing and update must have the same bytenr
*/
static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *existing,
struct btrfs_delayed_ref_head *update)
{
struct btrfs_delayed_ref_root *delayed_refs =
&trans->transaction->delayed_refs;
struct btrfs_fs_info *fs_info = trans->fs_info;
int old_ref_mod;
BUG_ON(existing->is_data != update->is_data);
spin_lock(&existing->lock);
if (update->must_insert_reserved) {
/* if the extent was freed and then
* reallocated before the delayed ref
* entries were processed, we can end up
* with an existing head ref without
* the must_insert_reserved flag set.
* Set it again here
*/
existing->must_insert_reserved = update->must_insert_reserved;
/*
* update the num_bytes so we make sure the accounting
* is done correctly
*/
existing->num_bytes = update->num_bytes;
}
if (update->extent_op) { if (!existing->extent_op) { existing->extent_op = update->extent_op;
} else {
if (update->extent_op->update_key) {
memcpy(&existing->extent_op->key,
&update->extent_op->key,
sizeof(update->extent_op->key));
existing->extent_op->update_key = true;
}
if (update->extent_op->update_flags) { existing->extent_op->flags_to_set |=
update->extent_op->flags_to_set;
existing->extent_op->update_flags = true;
}
btrfs_free_delayed_extent_op(update->extent_op);
}
}
/*
* update the reference mod on the head to reflect this new operation,
* only need the lock for this case cause we could be processing it
* currently, for refs we just added we know we're a-ok.
*/
old_ref_mod = existing->total_ref_mod;
existing->ref_mod += update->ref_mod;
existing->total_ref_mod += update->ref_mod;
/*
* If we are going to from a positive ref mod to a negative or vice
* versa we need to make sure to adjust pending_csums accordingly.
*/
if (existing->is_data) {
u64 csum_leaves =
btrfs_csum_bytes_to_leaves(fs_info,
existing->num_bytes);
if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes;
btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
}
if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes;
trans->delayed_ref_updates += csum_leaves;
}
}
spin_unlock(&existing->lock);
}
static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
struct btrfs_qgroup_extent_record *qrecord,
u64 bytenr, u64 num_bytes, u64 ref_root,
u64 reserved, int action, bool is_data,
bool is_system)
{
int count_mod = 1;
int must_insert_reserved = 0;
/* If reserved is provided, it must be a data extent. */
BUG_ON(!is_data && reserved);
/*
* The head node stores the sum of all the mods, so dropping a ref
* should drop the sum in the head node by one.
*/
if (action == BTRFS_UPDATE_DELAYED_HEAD)
count_mod = 0;
else if (action == BTRFS_DROP_DELAYED_REF)
count_mod = -1;
/*
* BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
* accounting when the extent is finally added, or if a later
* modification deletes the delayed ref without ever inserting the
* extent into the extent allocation tree. ref->must_insert_reserved
* is the flag used to record that accounting mods are required.
*
* Once we record must_insert_reserved, switch the action to
* BTRFS_ADD_DELAYED_REF because other special casing is not required.
*/
if (action == BTRFS_ADD_DELAYED_EXTENT)
must_insert_reserved = 1;
else
must_insert_reserved = 0;
refcount_set(&head_ref->refs, 1);
head_ref->bytenr = bytenr;
head_ref->num_bytes = num_bytes;
head_ref->ref_mod = count_mod;
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
head_ref->is_system = is_system;
head_ref->ref_tree = RB_ROOT_CACHED;
INIT_LIST_HEAD(&head_ref->ref_add_list);
RB_CLEAR_NODE(&head_ref->href_node);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
if (qrecord) {
if (ref_root && reserved) { qrecord->data_rsv = reserved;
qrecord->data_rsv_refroot = ref_root;
}
qrecord->bytenr = bytenr;
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
}
}
/*
* helper function to actually insert a head node into the rbtree.
* this does all the dirty work in terms of maintaining the correct
* overall modification count.
*/
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_qgroup_extent_record *qrecord,
int action, int *qrecord_inserted_ret)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_root *delayed_refs;
int qrecord_inserted = 0;
delayed_refs = &trans->transaction->delayed_refs;
/* Record qgroup extent info if provided */
if (qrecord) {
if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
delayed_refs, qrecord))
kfree(qrecord);
else
qrecord_inserted = 1;
}
trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
update_existing_head_ref(trans, existing, head_ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
*/
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes;
trans->delayed_ref_updates +=
btrfs_csum_bytes_to_leaves(trans->fs_info,
head_ref->num_bytes);
}
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
trans->delayed_ref_updates++;
}
if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; return head_ref;
}
/*
* init_delayed_ref_common - Initialize the structure which represents a
* modification to a an extent.
*
* @fs_info: Internal to the mounted filesystem mount structure.
*
* @ref: The structure which is going to be initialized.
*
* @bytenr: The logical address of the extent for which a modification is
* going to be recorded.
*
* @num_bytes: Size of the extent whose modification is being recorded.
*
* @ref_root: The id of the root where this modification has originated, this
* can be either one of the well-known metadata trees or the
* subvolume id which references this extent.
*
* @action: Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
* BTRFS_ADD_DELAYED_EXTENT
*
* @ref_type: Holds the type of the extent which is being recorded, can be
* one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
* when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
* BTRFS_EXTENT_DATA_REF_KEY when recording data extent
*/
static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes, u64 ref_root,
int action, u8 ref_type)
{
u64 seq = 0;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
if (is_fstree(ref_root))
seq = atomic64_read(&fs_info->tree_mod_seq);
refcount_set(&ref->refs, 1);
ref->bytenr = bytenr;
ref->num_bytes = num_bytes;
ref->ref_mod = 1;
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
ref->seq = seq;
ref->type = ref_type;
RB_CLEAR_NODE(&ref->ref_node);
INIT_LIST_HEAD(&ref->add_list);
}
/*
* add a delayed tree ref. This does all of the accounting required
* to make sure the delayed ref is eventually processed before this
* transaction commits.
*/
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
int qrecord_inserted;
bool is_system;
int action = generic_ref->action;
int level = generic_ref->tree_ref.level;
int ret;
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
u8 ref_type;
is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
return -ENOMEM;
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && is_fstree(generic_ref->real_root) &&
is_fstree(generic_ref->tree_ref.root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) { kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
return -ENOMEM;
}
}
if (parent)
ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
else
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
generic_ref->tree_ref.root, action, ref_type);
ref->root = generic_ref->tree_ref.root;
ref->parent = parent;
ref->level = level;
init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
generic_ref->tree_ref.root, 0, action, false,
is_system);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
/*
* insert both the head node and the new ref without dropping
* the spin lock
*/
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
* Need to update the delayed_refs_rsv with any changes we may have
* made.
*/
btrfs_update_delayed_refs_rsv(trans);
trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
action == BTRFS_ADD_DELAYED_EXTENT ?
BTRFS_ADD_DELAYED_REF : action);
if (ret > 0) kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); if (qrecord_inserted) btrfs_qgroup_trace_extent_post(trans, record);
return 0;
}
/*
* add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
*/
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
u64 reserved)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
int qrecord_inserted;
int action = generic_ref->action;
int ret;
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
u64 ref_root = generic_ref->data_ref.ref_root;
u64 owner = generic_ref->data_ref.ino;
u64 offset = generic_ref->data_ref.offset;
u8 ref_type;
ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
if (parent)
ref_type = BTRFS_SHARED_DATA_REF_KEY;
else
ref_type = BTRFS_EXTENT_DATA_REF_KEY;
init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
ref_root, action, ref_type);
ref->root = ref_root;
ref->parent = parent;
ref->objectid = owner;
ref->offset = offset;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
return -ENOMEM;
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
is_fstree(ref_root) &&
is_fstree(generic_ref->real_root) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
kmem_cache_free(btrfs_delayed_ref_head_cachep,
head_ref);
return -ENOMEM;
}
}
init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
reserved, action, true, false);
head_ref->extent_op = NULL;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
/*
* insert both the head node and the new ref without dropping
* the spin lock
*/
head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
/*
* Need to update the delayed_refs_rsv with any changes we may have
* made.
*/
btrfs_update_delayed_refs_rsv(trans);
trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
action == BTRFS_ADD_DELAYED_EXTENT ?
BTRFS_ADD_DELAYED_REF : action);
if (ret > 0) kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); if (qrecord_inserted) return btrfs_qgroup_trace_extent_post(trans, record);
return 0;
}
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref)
return -ENOMEM;
init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
false);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
NULL);
spin_unlock(&delayed_refs->lock);
/*
* Need to update the delayed_refs_rsv with any changes we may have
* made.
*/
btrfs_update_delayed_refs_rsv(trans);
return 0;
}
/*
* This does a simple search for the head node for a given extent. Returns the
* head node if found, or NULL if not.
*/
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
lockdep_assert_held(&delayed_refs->lock);
return find_ref_head(delayed_refs, bytenr, false);
}
void __cold btrfs_delayed_ref_exit(void)
{
kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}
int __init btrfs_delayed_ref_init(void)
{
btrfs_delayed_ref_head_cachep = kmem_cache_create(
"btrfs_delayed_ref_head",
sizeof(struct btrfs_delayed_ref_head), 0,
SLAB_MEM_SPREAD, NULL);
if (!btrfs_delayed_ref_head_cachep)
goto fail;
btrfs_delayed_tree_ref_cachep = kmem_cache_create(
"btrfs_delayed_tree_ref",
sizeof(struct btrfs_delayed_tree_ref), 0,
SLAB_MEM_SPREAD, NULL);
if (!btrfs_delayed_tree_ref_cachep)
goto fail;
btrfs_delayed_data_ref_cachep = kmem_cache_create(
"btrfs_delayed_data_ref",
sizeof(struct btrfs_delayed_data_ref), 0,
SLAB_MEM_SPREAD, NULL);
if (!btrfs_delayed_data_ref_cachep)
goto fail;
btrfs_delayed_extent_op_cachep = kmem_cache_create(
"btrfs_delayed_extent_op",
sizeof(struct btrfs_delayed_extent_op), 0,
SLAB_MEM_SPREAD, NULL);
if (!btrfs_delayed_extent_op_cachep)
goto fail;
return 0;
fail:
btrfs_delayed_ref_exit();
return -ENOMEM;
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_IOMAP_H
#define LINUX_IOMAP_H 1
#include <linux/atomic.h>
#include <linux/bitmap.h>
#include <linux/blk_types.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/mm_types.h>
#include <linux/blkdev.h>
struct address_space;
struct fiemap_extent_info;
struct inode;
struct iomap_dio;
struct iomap_writepage_ctx;
struct iov_iter;
struct kiocb;
struct page;
struct vm_area_struct;
struct vm_fault;
/*
* Types of block ranges for iomap mappings:
*/
#define IOMAP_HOLE 0 /* no blocks allocated, need allocation */
#define IOMAP_DELALLOC 1 /* delayed allocation blocks */
#define IOMAP_MAPPED 2 /* blocks allocated at @addr */
#define IOMAP_UNWRITTEN 3 /* blocks allocated at @addr in unwritten state */
#define IOMAP_INLINE 4 /* data inline in the inode */
/*
* Flags reported by the file system from iomap_begin:
*
* IOMAP_F_NEW indicates that the blocks have been newly allocated and need
* zeroing for areas that no data is copied to.
*
* IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access
* written data and requires fdatasync to commit them to persistent storage.
* This needs to take into account metadata changes that *may* be made at IO
* completion, such as file size updates from direct IO.
*
* IOMAP_F_SHARED indicates that the blocks are shared, and will need to be
* unshared as part a write.
*
* IOMAP_F_MERGED indicates that the iomap contains the merge of multiple block
* mappings.
*
* IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of
* buffer heads for this mapping.
*/
#define IOMAP_F_NEW 0x01
#define IOMAP_F_DIRTY 0x02
#define IOMAP_F_SHARED 0x04
#define IOMAP_F_MERGED 0x08
#define IOMAP_F_BUFFER_HEAD 0x10
#define IOMAP_F_ZONE_APPEND 0x20
/*
* Flags set by the core iomap code during operations:
*
* IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size
* has changed as the result of this write operation.
*/
#define IOMAP_F_SIZE_CHANGED 0x100
/*
* Flags from 0x1000 up are for file system specific usage:
*/
#define IOMAP_F_PRIVATE 0x1000
/*
* Magic value for addr:
*/
#define IOMAP_NULL_ADDR -1ULL /* addr is not valid */
struct iomap_page_ops;
struct iomap {
u64 addr; /* disk offset of mapping, bytes */
loff_t offset; /* file offset of mapping, bytes */
u64 length; /* length of mapping, bytes */
u16 type; /* type of mapping */
u16 flags; /* flags for mapping */
struct block_device *bdev; /* block device for I/O */
struct dax_device *dax_dev; /* dax_dev for dax operations */
void *inline_data;
void *private; /* filesystem private */
const struct iomap_page_ops *page_ops;
};
static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
{
return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
}
/*
* Returns the inline data pointer for logical offset @pos.
*/
static inline void *iomap_inline_data(const struct iomap *iomap, loff_t pos)
{
return iomap->inline_data + pos - iomap->offset;
}
/*
* Check if the mapping's length is within the valid range for inline data.
* This is used to guard against accessing data beyond the page inline_data
* points at.
*/
static inline bool iomap_inline_data_valid(const struct iomap *iomap)
{
return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data);
}
/*
* When a filesystem sets page_ops in an iomap mapping it returns, page_prepare
* and page_done will be called for each page written to. This only applies to
* buffered writes as unbuffered writes will not typically have pages
* associated with them.
*
* When page_prepare succeeds, page_done will always be called to do any
* cleanup work necessary. In that page_done call, @page will be NULL if the
* associated page could not be obtained.
*/
struct iomap_page_ops {
int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len);
void (*page_done)(struct inode *inode, loff_t pos, unsigned copied,
struct page *page);
};
/*
* Flags for iomap_begin / iomap_end. No flag implies a read.
*/
#define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */
#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */
#define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */
#define IOMAP_FAULT (1 << 3) /* mapping for page fault */
#define IOMAP_DIRECT (1 << 4) /* direct I/O */
#define IOMAP_NOWAIT (1 << 5) /* do not block */
#define IOMAP_OVERWRITE_ONLY (1 << 6) /* only pure overwrites allowed */
#define IOMAP_UNSHARE (1 << 7) /* unshare_file_range */
struct iomap_ops {
/*
* Return the existing mapping at pos, or reserve space starting at
* pos for up to length, as long as we can do it as a single mapping.
* The actual length is returned in iomap->length.
*/
int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, struct iomap *iomap,
struct iomap *srcmap);
/*
* Commit and/or unreserve space previous allocated using iomap_begin.
* Written indicates the length of the successful write operation which
* needs to be commited, while the rest needs to be unreserved.
* Written might be zero if no data was written.
*/
int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap);
};
/**
* struct iomap_iter - Iterate through a range of a file
* @inode: Set at the start of the iteration and should not change.
* @pos: The current file position we are operating on. It is updated by
* calls to iomap_iter(). Treat as read-only in the body.
* @len: The remaining length of the file segment we're operating on.
* It is updated at the same time as @pos.
* @processed: The number of bytes processed by the body in the most recent
* iteration, or a negative errno. 0 causes the iteration to stop.
* @flags: Zero or more of the iomap_begin flags above.
* @iomap: Map describing the I/O iteration
* @srcmap: Source map for COW operations
*/
struct iomap_iter {
struct inode *inode;
loff_t pos;
u64 len;
s64 processed;
unsigned flags;
struct iomap iomap;
struct iomap srcmap;
};
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
/**
* iomap_length - length of the current iomap iteration
* @iter: iteration structure
*
* Returns the length that the operation applies to for the current iteration.
*/
static inline u64 iomap_length(const struct iomap_iter *iter)
{
u64 end = iter->iomap.offset + iter->iomap.length;
if (iter->srcmap.type != IOMAP_HOLE)
end = min(end, iter->srcmap.offset + iter->srcmap.length); return min(iter->len, end - iter->pos);
}
/**
* iomap_iter_srcmap - return the source map for the current iomap iteration
* @i: iteration structure
*
* Write operations on file systems with reflink support might require a
* source and a destination map. This function retourns the source map
* for a given operation, which may or may no be identical to the destination
* map in &i->iomap.
*/
static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i)
{
if (i->srcmap.type != IOMAP_HOLE)
return &i->srcmap;
return &i->iomap;
}
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops);
int iomap_readpage(struct page *page, const struct iomap_ops *ops);
void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
int iomap_is_partially_uptodate(struct page *page, unsigned long from,
unsigned long count);
int iomap_releasepage(struct page *page, gfp_t gfp_mask);
void iomap_invalidatepage(struct page *page, unsigned int offset,
unsigned int len);
#ifdef CONFIG_MIGRATION
int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page, enum migrate_mode mode);
#else
#define iomap_migrate_page NULL
#endif
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops);
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
const struct iomap_ops *ops);
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len, const struct iomap_ops *ops);
loff_t iomap_seek_hole(struct inode *inode, loff_t offset,
const struct iomap_ops *ops);
loff_t iomap_seek_data(struct inode *inode, loff_t offset,
const struct iomap_ops *ops);
sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
const struct iomap_ops *ops);
/*
* Structure for writeback I/O completions.
*/
struct iomap_ioend {
struct list_head io_list; /* next ioend in chain */
u16 io_type;
u16 io_flags; /* IOMAP_F_* */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of the extent */
loff_t io_offset; /* offset in the file */
struct bio *io_bio; /* bio being built */
struct bio io_inline_bio; /* MUST BE LAST! */
};
struct iomap_writeback_ops {
/*
* Required, maps the blocks so that writeback can be performed on
* the range starting at offset.
*/
int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
loff_t offset);
/*
* Optional, allows the file systems to perform actions just before
* submitting the bio and/or override the bio end_io handler for complex
* operations like copy on write extent manipulation or unwritten extent
* conversions.
*/
int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
/*
* Optional, allows the file system to discard state on a page where
* we failed to submit any I/O.
*/
void (*discard_page)(struct page *page, loff_t fileoff);
};
struct iomap_writepage_ctx {
struct iomap iomap;
struct iomap_ioend *ioend;
const struct iomap_writeback_ops *ops;
};
void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
struct list_head *more_ioends);
void iomap_sort_ioends(struct list_head *ioend_list);
int iomap_writepage(struct page *page, struct writeback_control *wbc,
struct iomap_writepage_ctx *wpc,
const struct iomap_writeback_ops *ops);
int iomap_writepages(struct address_space *mapping,
struct writeback_control *wbc, struct iomap_writepage_ctx *wpc,
const struct iomap_writeback_ops *ops);
/*
* Flags for direct I/O ->end_io:
*/
#define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */
#define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */
struct iomap_dio_ops {
int (*end_io)(struct kiocb *iocb, ssize_t size, int error,
unsigned flags);
blk_qc_t (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
loff_t file_offset);
};
/*
* Wait for the I/O to complete in iomap_dio_rw even if the kiocb is not
* synchronous.
*/
#define IOMAP_DIO_FORCE_WAIT (1 << 0)
/*
* Do not allocate blocks or zero partial blocks, but instead fall back to
* the caller by returning -EAGAIN. Used to optimize direct I/O writes that
* are not aligned to the file system block size.
*/
#define IOMAP_DIO_OVERWRITE_ONLY (1 << 1)
/*
* When a page fault occurs, return a partial synchronous result and allow
* the caller to retry the rest of the operation after dealing with the page
* fault.
*/
#define IOMAP_DIO_PARTIAL (1 << 2)
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, size_t done_before);
struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, size_t done_before);
ssize_t iomap_dio_complete(struct iomap_dio *dio);
int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
#ifdef CONFIG_SWAP
struct file;
struct swap_info_struct;
int iomap_swapfile_activate(struct swap_info_struct *sis,
struct file *swap_file, sector_t *pagespan,
const struct iomap_ops *ops);
#else
# define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO)
#endif /* CONFIG_SWAP */
#endif /* LINUX_IOMAP_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Resizable, Scalable, Concurrent Hash Table
*
* Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au>
* Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
* Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
*
* Code partially derived from nft_hash
* Rewritten with rehash code from br_multicast plus single list
* pointer as suggested by Josh Triplett
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#ifndef _LINUX_RHASHTABLE_H
#define _LINUX_RHASHTABLE_H
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jhash.h>
#include <linux/list_nulls.h>
#include <linux/workqueue.h>
#include <linux/rculist.h>
#include <linux/bit_spinlock.h>
#include <linux/rhashtable-types.h>
/*
* Objects in an rhashtable have an embedded struct rhash_head
* which is linked into as hash chain from the hash table - or one
* of two or more hash tables when the rhashtable is being resized.
* The end of the chain is marked with a special nulls marks which has
* the least significant bit set but otherwise stores the address of
* the hash bucket. This allows us to be sure we've found the end
* of the right list.
* The value stored in the hash bucket has BIT(0) used as a lock bit.
* This bit must be atomically set before any changes are made to
* the chain. To avoid dereferencing this pointer without clearing
* the bit first, we use an opaque 'struct rhash_lock_head *' for the
* pointer stored in the bucket. This struct needs to be defined so
* that rcu_dereference() works on it, but it has no content so a
* cast is needed for it to be useful. This ensures it isn't
* used by mistake with clearing the lock bit first.
*/
struct rhash_lock_head {};
/* Maximum chain length before rehash
*
* The maximum (not average) chain length grows with the size of the hash
* table, at a rate of (log N)/(log log N).
*
* The value of 16 is selected so that even if the hash table grew to
* 2^32 you would not expect the maximum chain length to exceed it
* unless we are under attack (or extremely unlucky).
*
* As this limit is only to detect attacks, we don't need to set it to a
* lower value as you'd need the chain length to vastly exceed 16 to have
* any real effect on the system.
*/
#define RHT_ELASTICITY 16u
/**
* struct bucket_table - Table of hash buckets
* @size: Number of hash buckets
* @nest: Number of bits of first-level nested table.
* @rehash: Current bucket being rehashed
* @hash_rnd: Random seed to fold into hash
* @walkers: List of active walkers
* @rcu: RCU structure for freeing the table
* @future_tbl: Table under construction during rehashing
* @ntbl: Nested table used when out of memory.
* @buckets: size * hash buckets
*/
struct bucket_table {
unsigned int size;
unsigned int nest;
u32 hash_rnd;
struct list_head walkers;
struct rcu_head rcu;
struct bucket_table __rcu *future_tbl;
struct lockdep_map dep_map;
struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
};
/*
* NULLS_MARKER() expects a hash value with the low
* bits mostly likely to be significant, and it discards
* the msb.
* We give it an address, in which the bottom bit is
* always 0, and the msb might be significant.
* So we shift the address down one bit to align with
* expectations and avoid losing a significant bit.
*
* We never store the NULLS_MARKER in the hash table
* itself as we need the lsb for locking.
* Instead we store a NULL
*/
#define RHT_NULLS_MARKER(ptr) \
((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
#define INIT_RHT_NULLS_HEAD(ptr) \
((ptr) = NULL)
static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
{
return ((unsigned long) ptr & 1);
}
static inline void *rht_obj(const struct rhashtable *ht,
const struct rhash_head *he)
{
return (char *)he - ht->p.head_offset;
}
static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
unsigned int hash)
{
return hash & (tbl->size - 1);
}
static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
const void *key, const struct rhashtable_params params,
unsigned int hash_rnd)
{
unsigned int hash;
/* params must be equal to ht->p if it isn't constant. */
if (!__builtin_constant_p(params.key_len))
hash = ht->p.hashfn(key, ht->key_len, hash_rnd);
else if (params.key_len) {
unsigned int key_len = params.key_len;
if (params.hashfn)
hash = params.hashfn(key, key_len, hash_rnd);
else if (key_len & (sizeof(u32) - 1))
hash = jhash(key, key_len, hash_rnd);
else
hash = jhash2(key, key_len / sizeof(u32), hash_rnd);
} else {
unsigned int key_len = ht->p.key_len;
if (params.hashfn)
hash = params.hashfn(key, key_len, hash_rnd);
else
hash = jhash(key, key_len, hash_rnd);
}
return hash;
}
static inline unsigned int rht_key_hashfn(
struct rhashtable *ht, const struct bucket_table *tbl,
const void *key, const struct rhashtable_params params)
{
unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd);
return rht_bucket_index(tbl, hash);
}
static inline unsigned int rht_head_hashfn(
struct rhashtable *ht, const struct bucket_table *tbl,
const struct rhash_head *he, const struct rhashtable_params params)
{
const char *ptr = rht_obj(ht, he);
return likely(params.obj_hashfn) ?
rht_bucket_index(tbl, params.obj_hashfn(ptr, params.key_len ?:
ht->p.key_len,
tbl->hash_rnd)) :
rht_key_hashfn(ht, tbl, ptr + params.key_offset, params);
}
/**
* rht_grow_above_75 - returns true if nelems > 0.75 * table-size
* @ht: hash table
* @tbl: current table
*/
static inline bool rht_grow_above_75(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
/* Expand table when exceeding 75% load */
return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) &&
(!ht->p.max_size || tbl->size < ht->p.max_size);
}
/**
* rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
* @ht: hash table
* @tbl: current table
*/
static inline bool rht_shrink_below_30(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
/* Shrink table beneath 30% load */
return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) &&
tbl->size > ht->p.min_size;
}
/**
* rht_grow_above_100 - returns true if nelems > table-size
* @ht: hash table
* @tbl: current table
*/
static inline bool rht_grow_above_100(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
return atomic_read(&ht->nelems) > tbl->size &&
(!ht->p.max_size || tbl->size < ht->p.max_size);
}
/**
* rht_grow_above_max - returns true if table is above maximum
* @ht: hash table
* @tbl: current table
*/
static inline bool rht_grow_above_max(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
return atomic_read(&ht->nelems) >= ht->max_elems;
}
#ifdef CONFIG_PROVE_LOCKING
int lockdep_rht_mutex_is_held(struct rhashtable *ht);
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
#else
static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
return 1;
}
static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
u32 hash)
{
return 1;
}
#endif /* CONFIG_PROVE_LOCKING */
void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
struct rhash_head *obj);
void rhashtable_walk_enter(struct rhashtable *ht,
struct rhashtable_iter *iter);
void rhashtable_walk_exit(struct rhashtable_iter *iter);
int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);
static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
{
(void)rhashtable_walk_start_check(iter);
}
void *rhashtable_walk_next(struct rhashtable_iter *iter);
void *rhashtable_walk_peek(struct rhashtable_iter *iter);
void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
void rhashtable_free_and_destroy(struct rhashtable *ht,
void (*free_fn)(void *ptr, void *arg),
void *arg);
void rhashtable_destroy(struct rhashtable *ht);
struct rhash_lock_head __rcu **rht_bucket_nested(
const struct bucket_table *tbl, unsigned int hash);
struct rhash_lock_head __rcu **__rht_bucket_nested(
const struct bucket_table *tbl, unsigned int hash);
struct rhash_lock_head __rcu **rht_bucket_nested_insert(
struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash);
#define rht_dereference(p, ht) \
rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))
#define rht_dereference_rcu(p, ht) \
rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht))
#define rht_dereference_bucket(p, tbl, hash) \
rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash))
#define rht_dereference_bucket_rcu(p, tbl, hash) \
rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash))
#define rht_entry(tpos, pos, member) \
({ tpos = container_of(pos, typeof(*tpos), member); 1; })
static inline struct rhash_lock_head __rcu *const *rht_bucket(
const struct bucket_table *tbl, unsigned int hash)
{
return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
&tbl->buckets[hash];
}
static inline struct rhash_lock_head __rcu **rht_bucket_var(
struct bucket_table *tbl, unsigned int hash)
{
return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) :
&tbl->buckets[hash];
}
static inline struct rhash_lock_head __rcu **rht_bucket_insert(
struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
{
return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) :
&tbl->buckets[hash];
}
/*
* We lock a bucket by setting BIT(0) in the pointer - this is always
* zero in real pointers. The NULLS mark is never stored in the bucket,
* rather we store NULL if the bucket is empty.
* bit_spin_locks do not handle contention well, but the whole point
* of the hashtable design is to achieve minimum per-bucket contention.
* A nested hash table might not have a bucket pointer. In that case
* we cannot get a lock. For remove and replace the bucket cannot be
* interesting and doesn't need locking.
* For insert we allocate the bucket if this is the last bucket_table,
* and then take the lock.
* Sometimes we unlock a bucket by writing a new pointer there. In that
* case we don't need to unlock, but we do need to reset state such as
* local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer()
* provides the same release semantics that bit_spin_unlock() provides,
* this is safe.
* When we write to a bucket without unlocking, we use rht_assign_locked().
*/
static inline void rht_lock(struct bucket_table *tbl,
struct rhash_lock_head __rcu **bkt)
{
local_bh_disable();
bit_spin_lock(0, (unsigned long *)bkt);
lock_map_acquire(&tbl->dep_map);
}
static inline void rht_lock_nested(struct bucket_table *tbl,
struct rhash_lock_head __rcu **bucket,
unsigned int subclass)
{
local_bh_disable();
bit_spin_lock(0, (unsigned long *)bucket);
lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_);
}
static inline void rht_unlock(struct bucket_table *tbl,
struct rhash_lock_head __rcu **bkt)
{
lock_map_release(&tbl->dep_map);
bit_spin_unlock(0, (unsigned long *)bkt);
local_bh_enable();
}
static inline struct rhash_head *__rht_ptr(
struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt)
{
return (struct rhash_head *)
((unsigned long)p & ~BIT(0) ?:
(unsigned long)RHT_NULLS_MARKER(bkt));
}
/*
* Where 'bkt' is a bucket and might be locked:
* rht_ptr_rcu() dereferences that pointer and clears the lock bit.
* rht_ptr() dereferences in a context where the bucket is locked.
* rht_ptr_exclusive() dereferences in a context where exclusive
* access is guaranteed, such as when destroying the table.
*/
static inline struct rhash_head *rht_ptr_rcu(
struct rhash_lock_head __rcu *const *bkt)
{
return __rht_ptr(rcu_dereference(*bkt), bkt);
}
static inline struct rhash_head *rht_ptr(
struct rhash_lock_head __rcu *const *bkt,
struct bucket_table *tbl,
unsigned int hash)
{
return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt);
}
static inline struct rhash_head *rht_ptr_exclusive(
struct rhash_lock_head __rcu *const *bkt)
{
return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt);
}
static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
struct rhash_head *obj)
{
if (rht_is_a_nulls(obj))
obj = NULL;
rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0)));
}
static inline void rht_assign_unlock(struct bucket_table *tbl,
struct rhash_lock_head __rcu **bkt,
struct rhash_head *obj)
{
if (rht_is_a_nulls(obj))
obj = NULL;
lock_map_release(&tbl->dep_map);
rcu_assign_pointer(*bkt, (void *)obj);
preempt_enable();
__release(bitlock);
local_bh_enable();
}
/**
* rht_for_each_from - iterate over hash chain from given head
* @pos: the &struct rhash_head to use as a loop cursor.
* @head: the &struct rhash_head to start from
* @tbl: the &struct bucket_table
* @hash: the hash value / bucket index
*/
#define rht_for_each_from(pos, head, tbl, hash) \
for (pos = head; \
!rht_is_a_nulls(pos); \
pos = rht_dereference_bucket((pos)->next, tbl, hash))
/**
* rht_for_each - iterate over hash chain
* @pos: the &struct rhash_head to use as a loop cursor.
* @tbl: the &struct bucket_table
* @hash: the hash value / bucket index
*/
#define rht_for_each(pos, tbl, hash) \
rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
tbl, hash)
/**
* rht_for_each_entry_from - iterate over hash chain from given head
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct rhash_head to use as a loop cursor.
* @head: the &struct rhash_head to start from
* @tbl: the &struct bucket_table
* @hash: the hash value / bucket index
* @member: name of the &struct rhash_head within the hashable struct.
*/
#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \
for (pos = head; \
(!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
pos = rht_dereference_bucket((pos)->next, tbl, hash))
/**
* rht_for_each_entry - iterate over hash chain of given type
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct rhash_head to use as a loop cursor.
* @tbl: the &struct bucket_table
* @hash: the hash value / bucket index
* @member: name of the &struct rhash_head within the hashable struct.
*/
#define rht_for_each_entry(tpos, pos, tbl, hash, member) \
rht_for_each_entry_from(tpos, pos, \
rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
tbl, hash, member)
/**
* rht_for_each_entry_safe - safely iterate over hash chain of given type
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct rhash_head to use as a loop cursor.
* @next: the &struct rhash_head to use as next in loop cursor.
* @tbl: the &struct bucket_table
* @hash: the hash value / bucket index
* @member: name of the &struct rhash_head within the hashable struct.
*
* This hash chain list-traversal primitive allows for the looped code to
* remove the loop cursor from the list.
*/
#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \
for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
next = !rht_is_a_nulls(pos) ? \
rht_dereference_bucket(pos->next, tbl, hash) : NULL; \
(!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
pos = next, \
next = !rht_is_a_nulls(pos) ? \
rht_dereference_bucket(pos->next, tbl, hash) : NULL)
/**
* rht_for_each_rcu_from - iterate over rcu hash chain from given head
* @pos: the &struct rhash_head to use as a loop cursor.
* @head: the &struct rhash_head to start from
* @tbl: the &struct bucket_table
* @hash: the hash value / bucket index
*
* This hash chain list-traversal primitive may safely run concurrently with
* the _rcu mutation primitives such as rhashtable_insert() as long as the
* traversal is guarded by rcu_read_lock().
*/
#define rht_for_each_rcu_from(pos, head, tbl, hash) \
for (({barrier(); }), \
pos = head; \
!rht_is_a_nulls(pos); \
pos = rcu_dereference_raw(pos->next))
/**
* rht_for_each_rcu - iterate over rcu hash chain
* @pos: the &struct rhash_head to use as a loop cursor.
* @tbl: the &struct bucket_table
* @hash: the hash value / bucket index
*
* This hash chain list-traversal primitive may safely run concurrently with
* the _rcu mutation primitives such as rhashtable_insert() as long as the
* traversal is guarded by rcu_read_lock().
*/
#define rht_for_each_rcu(pos, tbl, hash) \
for (({barrier(); }), \
pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \
!rht_is_a_nulls(pos); \
pos = rcu_dereference_raw(pos->next))
/**
* rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct rhash_head to use as a loop cursor.
* @head: the &struct rhash_head to start from
* @tbl: the &struct bucket_table
* @hash: the hash value / bucket index
* @member: name of the &struct rhash_head within the hashable struct.
*
* This hash chain list-traversal primitive may safely run concurrently with
* the _rcu mutation primitives such as rhashtable_insert() as long as the
* traversal is guarded by rcu_read_lock().
*/
#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
for (({barrier(); }), \
pos = head; \
(!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
pos = rht_dereference_bucket_rcu(pos->next, tbl, hash))
/**
* rht_for_each_entry_rcu - iterate over rcu hash chain of given type
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct rhash_head to use as a loop cursor.
* @tbl: the &struct bucket_table
* @hash: the hash value / bucket index
* @member: name of the &struct rhash_head within the hashable struct.
*
* This hash chain list-traversal primitive may safely run concurrently with
* the _rcu mutation primitives such as rhashtable_insert() as long as the
* traversal is guarded by rcu_read_lock().
*/
#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \
rht_for_each_entry_rcu_from(tpos, pos, \
rht_ptr_rcu(rht_bucket(tbl, hash)), \
tbl, hash, member)
/**
* rhl_for_each_rcu - iterate over rcu hash table list
* @pos: the &struct rlist_head to use as a loop cursor.
* @list: the head of the list
*
* This hash chain list-traversal primitive should be used on the
* list returned by rhltable_lookup.
*/
#define rhl_for_each_rcu(pos, list) \
for (pos = list; pos; pos = rcu_dereference_raw(pos->next))
/**
* rhl_for_each_entry_rcu - iterate over rcu hash table list of given type
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct rlist_head to use as a loop cursor.
* @list: the head of the list
* @member: name of the &struct rlist_head within the hashable struct.
*
* This hash chain list-traversal primitive should be used on the
* list returned by rhltable_lookup.
*/
#define rhl_for_each_entry_rcu(tpos, pos, list, member) \
for (pos = list; pos && rht_entry(tpos, pos, member); \
pos = rcu_dereference_raw(pos->next))
static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
const void *obj)
{
struct rhashtable *ht = arg->ht;
const char *ptr = obj;
return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
}
/* Internal function, do not use. */
static inline struct rhash_head *__rhashtable_lookup(
struct rhashtable *ht, const void *key,
const struct rhashtable_params params)
{
struct rhashtable_compare_arg arg = {
.ht = ht,
.key = key,
};
struct rhash_lock_head __rcu *const *bkt;
struct bucket_table *tbl;
struct rhash_head *he;
unsigned int hash;
tbl = rht_dereference_rcu(ht->tbl, ht);
restart:
hash = rht_key_hashfn(ht, tbl, key, params);
bkt = rht_bucket(tbl, hash);
do {
rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) :
rhashtable_compare(&arg, rht_obj(ht, he)))
continue;
return he;
}
/* An object might have been moved to a different hash chain,
* while we walk along it - better check and retry.
*/
} while (he != RHT_NULLS_MARKER(bkt));
/* Ensure we see any new tables. */
smp_rmb();
tbl = rht_dereference_rcu(tbl->future_tbl, ht);
if (unlikely(tbl))
goto restart;
return NULL;
}
/**
* rhashtable_lookup - search hash table
* @ht: hash table
* @key: the pointer to the key
* @params: hash table parameters
*
* Computes the hash value for the key and traverses the bucket chain looking
* for a entry with an identical key. The first matching entry is returned.
*
* This must only be called under the RCU read lock.
*
* Returns the first entry on which the compare function returned true.
*/
static inline void *rhashtable_lookup(
struct rhashtable *ht, const void *key,
const struct rhashtable_params params)
{
struct rhash_head *he = __rhashtable_lookup(ht, key, params);
return he ? rht_obj(ht, he) : NULL;
}
/**
* rhashtable_lookup_fast - search hash table, without RCU read lock
* @ht: hash table
* @key: the pointer to the key
* @params: hash table parameters
*
* Computes the hash value for the key and traverses the bucket chain looking
* for a entry with an identical key. The first matching entry is returned.
*
* Only use this function when you have other mechanisms guaranteeing
* that the object won't go away after the RCU read lock is released.
*
* Returns the first entry on which the compare function returned true.
*/
static inline void *rhashtable_lookup_fast(
struct rhashtable *ht, const void *key,
const struct rhashtable_params params)
{
void *obj;
rcu_read_lock();
obj = rhashtable_lookup(ht, key, params);
rcu_read_unlock();
return obj;
}
/**
* rhltable_lookup - search hash list table
* @hlt: hash table
* @key: the pointer to the key
* @params: hash table parameters
*
* Computes the hash value for the key and traverses the bucket chain looking
* for a entry with an identical key. All matching entries are returned
* in a list.
*
* This must only be called under the RCU read lock.
*
* Returns the list of entries that match the given key.
*/
static inline struct rhlist_head *rhltable_lookup(
struct rhltable *hlt, const void *key,
const struct rhashtable_params params)
{
struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params);
return he ? container_of(he, struct rhlist_head, rhead) : NULL;
}
/* Internal function, please use rhashtable_insert_fast() instead. This
* function returns the existing element already in hashes in there is a clash,
* otherwise it returns an error via ERR_PTR().
*/
static inline void *__rhashtable_insert_fast(
struct rhashtable *ht, const void *key, struct rhash_head *obj,
const struct rhashtable_params params, bool rhlist)
{
struct rhashtable_compare_arg arg = {
.ht = ht,
.key = key,
};
struct rhash_lock_head __rcu **bkt;
struct rhash_head __rcu **pprev;
struct bucket_table *tbl;
struct rhash_head *head;
unsigned int hash;
int elasticity;
void *data;
rcu_read_lock();
tbl = rht_dereference_rcu(ht->tbl, ht);
hash = rht_head_hashfn(ht, tbl, obj, params);
elasticity = RHT_ELASTICITY;
bkt = rht_bucket_insert(ht, tbl, hash);
data = ERR_PTR(-ENOMEM);
if (!bkt)
goto out;
pprev = NULL;
rht_lock(tbl, bkt);
if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
slow_path:
rht_unlock(tbl, bkt);
rcu_read_unlock();
return rhashtable_insert_slow(ht, key, obj);
}
rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
struct rhlist_head *plist;
struct rhlist_head *list;
elasticity--;
if (!key ||
(params.obj_cmpfn ?
params.obj_cmpfn(&arg, rht_obj(ht, head)) :
rhashtable_compare(&arg, rht_obj(ht, head)))) {
pprev = &head->next;
continue;
}
data = rht_obj(ht, head);
if (!rhlist)
goto out_unlock;
list = container_of(obj, struct rhlist_head, rhead);
plist = container_of(head, struct rhlist_head, rhead);
RCU_INIT_POINTER(list->next, plist);
head = rht_dereference_bucket(head->next, tbl, hash);
RCU_INIT_POINTER(list->rhead.next, head);
if (pprev) {
rcu_assign_pointer(*pprev, obj);
rht_unlock(tbl, bkt);
} else
rht_assign_unlock(tbl, bkt, obj);
data = NULL;
goto out;
}
if (elasticity <= 0)
goto slow_path;
data = ERR_PTR(-E2BIG);
if (unlikely(rht_grow_above_max(ht, tbl)))
goto out_unlock;
if (unlikely(rht_grow_above_100(ht, tbl)))
goto slow_path;
/* Inserting at head of list makes unlocking free. */
head = rht_ptr(bkt, tbl, hash);
RCU_INIT_POINTER(obj->next, head);
if (rhlist) {
struct rhlist_head *list;
list = container_of(obj, struct rhlist_head, rhead);
RCU_INIT_POINTER(list->next, NULL);
}
atomic_inc(&ht->nelems);
rht_assign_unlock(tbl, bkt, obj);
if (rht_grow_above_75(ht, tbl))
schedule_work(&ht->run_work);
data = NULL;
out:
rcu_read_unlock();
return data;
out_unlock:
rht_unlock(tbl, bkt);
goto out;
}
/**
* rhashtable_insert_fast - insert object into hash table
* @ht: hash table
* @obj: pointer to hash head inside object
* @params: hash table parameters
*
* Will take the per bucket bitlock to protect against mutual mutations
* on the same bucket. Multiple insertions may occur in parallel unless
* they map to the same bucket.
*
* It is safe to call this function from atomic context.
*
* Will trigger an automatic deferred table resizing if residency in the
* table grows beyond 70%.
*/
static inline int rhashtable_insert_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params)
{
void *ret;
ret = __rhashtable_insert_fast(ht, NULL, obj, params, false);
if (IS_ERR(ret))
return PTR_ERR(ret);
return ret == NULL ? 0 : -EEXIST;
}
/**
* rhltable_insert_key - insert object into hash list table
* @hlt: hash list table
* @key: the pointer to the key
* @list: pointer to hash list head inside object
* @params: hash table parameters
*
* Will take the per bucket bitlock to protect against mutual mutations
* on the same bucket. Multiple insertions may occur in parallel unless
* they map to the same bucket.
*
* It is safe to call this function from atomic context.
*
* Will trigger an automatic deferred table resizing if residency in the
* table grows beyond 70%.
*/
static inline int rhltable_insert_key(
struct rhltable *hlt, const void *key, struct rhlist_head *list,
const struct rhashtable_params params)
{
return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead,
params, true));
}
/**
* rhltable_insert - insert object into hash list table
* @hlt: hash list table
* @list: pointer to hash list head inside object
* @params: hash table parameters
*
* Will take the per bucket bitlock to protect against mutual mutations
* on the same bucket. Multiple insertions may occur in parallel unless
* they map to the same bucket.
*
* It is safe to call this function from atomic context.
*
* Will trigger an automatic deferred table resizing if residency in the
* table grows beyond 70%.
*/
static inline int rhltable_insert(
struct rhltable *hlt, struct rhlist_head *list,
const struct rhashtable_params params)
{
const char *key = rht_obj(&hlt->ht, &list->rhead);
key += params.key_offset;
return rhltable_insert_key(hlt, key, list, params);
}
/**
* rhashtable_lookup_insert_fast - lookup and insert object into hash table
* @ht: hash table
* @obj: pointer to hash head inside object
* @params: hash table parameters
*
* This lookup function may only be used for fixed key hash table (key_len
* parameter set). It will BUG() if used inappropriately.
*
* It is safe to call this function from atomic context.
*
* Will trigger an automatic deferred table resizing if residency in the
* table grows beyond 70%.
*/
static inline int rhashtable_lookup_insert_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params)
{
const char *key = rht_obj(ht, obj);
void *ret;
BUG_ON(ht->p.obj_hashfn);
ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
false);
if (IS_ERR(ret))
return PTR_ERR(ret);
return ret == NULL ? 0 : -EEXIST;
}
/**
* rhashtable_lookup_get_insert_fast - lookup and insert object into hash table
* @ht: hash table
* @obj: pointer to hash head inside object
* @params: hash table parameters
*
* Just like rhashtable_lookup_insert_fast(), but this function returns the
* object if it exists, NULL if it did not and the insertion was successful,
* and an ERR_PTR otherwise.
*/
static inline void *rhashtable_lookup_get_insert_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params)
{
const char *key = rht_obj(ht, obj);
BUG_ON(ht->p.obj_hashfn);
return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
false);
}
/**
* rhashtable_lookup_insert_key - search and insert object to hash table
* with explicit key
* @ht: hash table
* @key: key
* @obj: pointer to hash head inside object
* @params: hash table parameters
*
* Lookups may occur in parallel with hashtable mutations and resizing.
*
* Will trigger an automatic deferred table resizing if residency in the
* table grows beyond 70%.
*
* Returns zero on success.
*/
static inline int rhashtable_lookup_insert_key(
struct rhashtable *ht, const void *key, struct rhash_head *obj,
const struct rhashtable_params params)
{
void *ret;
BUG_ON(!ht->p.obj_hashfn || !key);
ret = __rhashtable_insert_fast(ht, key, obj, params, false);
if (IS_ERR(ret))
return PTR_ERR(ret);
return ret == NULL ? 0 : -EEXIST;
}
/**
* rhashtable_lookup_get_insert_key - lookup and insert object into hash table
* @ht: hash table
* @key: key
* @obj: pointer to hash head inside object
* @params: hash table parameters
*
* Just like rhashtable_lookup_insert_key(), but this function returns the
* object if it exists, NULL if it does not and the insertion was successful,
* and an ERR_PTR otherwise.
*/
static inline void *rhashtable_lookup_get_insert_key(
struct rhashtable *ht, const void *key, struct rhash_head *obj,
const struct rhashtable_params params)
{
BUG_ON(!ht->p.obj_hashfn || !key);
return __rhashtable_insert_fast(ht, key, obj, params, false);
}
/* Internal function, please use rhashtable_remove_fast() instead */
static inline int __rhashtable_remove_fast_one(
struct rhashtable *ht, struct bucket_table *tbl,
struct rhash_head *obj, const struct rhashtable_params params,
bool rhlist)
{
struct rhash_lock_head __rcu **bkt;
struct rhash_head __rcu **pprev;
struct rhash_head *he;
unsigned int hash;
int err = -ENOENT;
hash = rht_head_hashfn(ht, tbl, obj, params);
bkt = rht_bucket_var(tbl, hash);
if (!bkt)
return -ENOENT;
pprev = NULL;
rht_lock(tbl, bkt);
rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
struct rhlist_head *list;
list = container_of(he, struct rhlist_head, rhead);
if (he != obj) {
struct rhlist_head __rcu **lpprev;
pprev = &he->next;
if (!rhlist)
continue;
do {
lpprev = &list->next;
list = rht_dereference_bucket(list->next,
tbl, hash);
} while (list && obj != &list->rhead);
if (!list)
continue;
list = rht_dereference_bucket(list->next, tbl, hash);
RCU_INIT_POINTER(*lpprev, list);
err = 0;
break;
}
obj = rht_dereference_bucket(obj->next, tbl, hash);
err = 1;
if (rhlist) {
list = rht_dereference_bucket(list->next, tbl, hash);
if (list) {
RCU_INIT_POINTER(list->rhead.next, obj);
obj = &list->rhead;
err = 0;
}
}
if (pprev) {
rcu_assign_pointer(*pprev, obj);
rht_unlock(tbl, bkt);
} else {
rht_assign_unlock(tbl, bkt, obj);
}
goto unlocked;
}
rht_unlock(tbl, bkt);
unlocked:
if (err > 0) {
atomic_dec(&ht->nelems);
if (unlikely(ht->p.automatic_shrinking &&
rht_shrink_below_30(ht, tbl)))
schedule_work(&ht->run_work);
err = 0;
}
return err;
}
/* Internal function, please use rhashtable_remove_fast() instead */
static inline int __rhashtable_remove_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params, bool rhlist)
{
struct bucket_table *tbl;
int err;
rcu_read_lock();
tbl = rht_dereference_rcu(ht->tbl, ht);
/* Because we have already taken (and released) the bucket
* lock in old_tbl, if we find that future_tbl is not yet
* visible then that guarantees the entry to still be in
* the old tbl if it exists.
*/
while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params,
rhlist)) &&
(tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
;
rcu_read_unlock();
return err;
}
/**
* rhashtable_remove_fast - remove object from hash table
* @ht: hash table
* @obj: pointer to hash head inside object
* @params: hash table parameters
*
* Since the hash chain is single linked, the removal operation needs to
* walk the bucket chain upon removal. The removal operation is thus
* considerable slow if the hash table is not correctly sized.
*
* Will automatically shrink the table if permitted when residency drops
* below 30%.
*
* Returns zero on success, -ENOENT if the entry could not be found.
*/
static inline int rhashtable_remove_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params)
{
return __rhashtable_remove_fast(ht, obj, params, false);
}
/**
* rhltable_remove - remove object from hash list table
* @hlt: hash list table
* @list: pointer to hash list head inside object
* @params: hash table parameters
*
* Since the hash chain is single linked, the removal operation needs to
* walk the bucket chain upon removal. The removal operation is thus
* considerable slow if the hash table is not correctly sized.
*
* Will automatically shrink the table if permitted when residency drops
* below 30%
*
* Returns zero on success, -ENOENT if the entry could not be found.
*/
static inline int rhltable_remove(
struct rhltable *hlt, struct rhlist_head *list,
const struct rhashtable_params params)
{
return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true);
}
/* Internal function, please use rhashtable_replace_fast() instead */
static inline int __rhashtable_replace_fast(
struct rhashtable *ht, struct bucket_table *tbl,
struct rhash_head *obj_old, struct rhash_head *obj_new,
const struct rhashtable_params params)
{
struct rhash_lock_head __rcu **bkt;
struct rhash_head __rcu **pprev;
struct rhash_head *he;
unsigned int hash;
int err = -ENOENT;
/* Minimally, the old and new objects must have same hash
* (which should mean identifiers are the same).
*/
hash = rht_head_hashfn(ht, tbl, obj_old, params);
if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
return -EINVAL;
bkt = rht_bucket_var(tbl, hash);
if (!bkt)
return -ENOENT;
pprev = NULL;
rht_lock(tbl, bkt);
rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
if (he != obj_old) {
pprev = &he->next;
continue;
}
rcu_assign_pointer(obj_new->next, obj_old->next);
if (pprev) {
rcu_assign_pointer(*pprev, obj_new);
rht_unlock(tbl, bkt);
} else {
rht_assign_unlock(tbl, bkt, obj_new);
}
err = 0;
goto unlocked;
}
rht_unlock(tbl, bkt);
unlocked:
return err;
}
/**
* rhashtable_replace_fast - replace an object in hash table
* @ht: hash table
* @obj_old: pointer to hash head inside object being replaced
* @obj_new: pointer to hash head inside object which is new
* @params: hash table parameters
*
* Replacing an object doesn't affect the number of elements in the hash table
* or bucket, so we don't need to worry about shrinking or expanding the
* table here.
*
* Returns zero on success, -ENOENT if the entry could not be found,
* -EINVAL if hash is not the same for the old and new objects.
*/
static inline int rhashtable_replace_fast(
struct rhashtable *ht, struct rhash_head *obj_old,
struct rhash_head *obj_new,
const struct rhashtable_params params)
{
struct bucket_table *tbl;
int err;
rcu_read_lock();
tbl = rht_dereference_rcu(ht->tbl, ht);
/* Because we have already taken (and released) the bucket
* lock in old_tbl, if we find that future_tbl is not yet
* visible then that guarantees the entry to still be in
* the old tbl if it exists.
*/
while ((err = __rhashtable_replace_fast(ht, tbl, obj_old,
obj_new, params)) &&
(tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
;
rcu_read_unlock();
return err;
}
/**
* rhltable_walk_enter - Initialise an iterator
* @hlt: Table to walk over
* @iter: Hash table Iterator
*
* This function prepares a hash table walk.
*
* Note that if you restart a walk after rhashtable_walk_stop you
* may see the same object twice. Also, you may miss objects if
* there are removals in between rhashtable_walk_stop and the next
* call to rhashtable_walk_start.
*
* For a completely stable walk you should construct your own data
* structure outside the hash table.
*
* This function may be called from any process context, including
* non-preemptable context, but cannot be called from softirq or
* hardirq context.
*
* You must call rhashtable_walk_exit after this function returns.
*/
static inline void rhltable_walk_enter(struct rhltable *hlt,
struct rhashtable_iter *iter)
{
return rhashtable_walk_enter(&hlt->ht, iter);
}
/**
* rhltable_free_and_destroy - free elements and destroy hash list table
* @hlt: the hash list table to destroy
* @free_fn: callback to release resources of element
* @arg: pointer passed to free_fn
*
* See documentation for rhashtable_free_and_destroy.
*/
static inline void rhltable_free_and_destroy(struct rhltable *hlt,
void (*free_fn)(void *ptr,
void *arg),
void *arg)
{
return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg);
}
static inline void rhltable_destroy(struct rhltable *hlt)
{
return rhltable_free_and_destroy(hlt, NULL, NULL);
}
#endif /* _LINUX_RHASHTABLE_H */
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Support for INET connection oriented protocols.
*
* Authors: See the TCP sources
*/
#include <linux/module.h>
#include <linux/jhash.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
#include <net/tcp.h>
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
#if IS_ENABLED(CONFIG_IPV6)
/* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses
* if IPv6 only, and any IPv4 addresses
* if not IPv6 only
* match_sk*_wildcard == false: addresses must be exactly the same, i.e.
* IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
* and 0.0.0.0 equals to 0.0.0.0 only
*/
static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
const struct in6_addr *sk2_rcv_saddr6,
__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
bool sk1_ipv6only, bool sk2_ipv6only,
bool match_sk1_wildcard,
bool match_sk2_wildcard)
{
int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
/* if both are mapped, treat as IPv4 */
if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
if (!sk2_ipv6only) {
if (sk1_rcv_saddr == sk2_rcv_saddr)
return true;
return (match_sk1_wildcard && !sk1_rcv_saddr) ||
(match_sk2_wildcard && !sk2_rcv_saddr);
}
return false;
}
if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
return true;
if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard &&
!(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
return true;
if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard &&
!(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
return true;
if (sk2_rcv_saddr6 &&
ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6))
return true;
return false;
}
#endif
/* match_sk*_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
* match_sk*_wildcard == false: addresses must be exactly the same, i.e.
* 0.0.0.0 only equals to 0.0.0.0
*/
static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
bool sk2_ipv6only, bool match_sk1_wildcard,
bool match_sk2_wildcard)
{
if (!sk2_ipv6only) {
if (sk1_rcv_saddr == sk2_rcv_saddr)
return true;
return (match_sk1_wildcard && !sk1_rcv_saddr) ||
(match_sk2_wildcard && !sk2_rcv_saddr);
}
return false;
}
bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
bool match_wildcard)
{
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
inet6_rcv_saddr(sk2),
sk->sk_rcv_saddr,
sk2->sk_rcv_saddr,
ipv6_only_sock(sk),
ipv6_only_sock(sk2),
match_wildcard,
match_wildcard);
#endif
return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
ipv6_only_sock(sk2), match_wildcard,
match_wildcard);
}
EXPORT_SYMBOL(inet_rcv_saddr_equal);
bool inet_rcv_saddr_any(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
#endif
return !sk->sk_rcv_saddr;
}
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
unsigned int seq;
do {
seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
*low = net->ipv4.ip_local_ports.range[0];
*high = net->ipv4.ip_local_ports.range[1];
} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}
EXPORT_SYMBOL(inet_get_local_port_range);
static int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb,
bool relax, bool reuseport_ok)
{
struct sock *sk2;
bool reuseport_cb_ok;
bool reuse = sk->sk_reuse;
bool reuseport = !!sk->sk_reuseport;
struct sock_reuseport *reuseport_cb;
kuid_t uid = sock_i_uid((struct sock *)sk);
rcu_read_lock();
reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
/* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
rcu_read_unlock();
/*
* Unlike other sk lookup places we do not check
* for sk_net here, since _all_ the socks listed
* in tb->owners list belong to the same net - the
* one this bucket belongs to.
*/
sk_for_each_bound(sk2, &tb->owners) {
if (sk != sk2 &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
if (reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
if ((!relax ||
(!reuseport_ok &&
reuseport && sk2->sk_reuseport &&
reuseport_cb_ok &&
(sk2->sk_state == TCP_TIME_WAIT ||
uid_eq(uid, sock_i_uid(sk2))))) &&
inet_rcv_saddr_equal(sk, sk2, true))
break;
} else if (!reuseport_ok ||
!reuseport || !sk2->sk_reuseport ||
!reuseport_cb_ok ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2)))) {
if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
}
}
return sk2 != NULL;
}
/*
* Find an open port number for the socket. Returns with the
* inet_bind_hashbucket lock held.
*/
static struct inet_bind_hashbucket *
inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
{
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
int port = 0;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
bool relax = false;
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
u32 remaining, offset;
int l3mdev;
l3mdev = inet_sk_bound_l3mdev(sk);
ports_exhausted:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high);
high++; /* [32768, 60999] -> [32768, 61000[ */
if (high - low < 4)
attempt_half = 0;
if (attempt_half) {
int half = low + (((high - low) >> 2) << 1);
if (attempt_half == 1)
high = half;
else
low = half;
}
remaining = high - low;
if (likely(remaining > 1))
remaining &= ~1U;
offset = prandom_u32() % remaining;
/* __inet_hash_connect() favors ports having @low parity
* We do the opposite to not pollute connect() users.
*/
offset |= 1U;
other_parity_scan:
port = low + offset;
for (i = 0; i < remaining; i += 2, port += 2) {
if (unlikely(port >= high))
port -= remaining;
if (inet_is_local_reserved_port(net, port))
continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port) {
if (!inet_csk_bind_conflict(sk, tb, relax, false))
goto success;
goto next_port;
}
tb = NULL;
goto success;
next_port:
spin_unlock_bh(&head->lock);
cond_resched();
}
offset--;
if (!(offset & 1))
goto other_parity_scan;
if (attempt_half == 1) {
/* OK we now try the upper half of the range */
attempt_half = 2;
goto other_half_scan;
}
if (net->ipv4.sysctl_ip_autobind_reuse && !relax) {
/* We still have a chance to connect to different destinations */
relax = true;
goto ports_exhausted;
}
return NULL;
success:
*port_ret = port;
*tb_ret = tb;
return head;
}
static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
struct sock *sk)
{
kuid_t uid = sock_i_uid(sk);
if (tb->fastreuseport <= 0)
return 0;
if (!sk->sk_reuseport)
return 0;
if (rcu_access_pointer(sk->sk_reuseport_cb))
return 0;
if (!uid_eq(tb->fastuid, uid))
return 0;
/* We only need to check the rcv_saddr if this tb was once marked
* without fastreuseport and then was reset, as we can only know that
* the fast_*rcv_saddr doesn't have any conflicts with the socks on the
* owners list.
*/
if (tb->fastreuseport == FASTREUSEPORT_ANY)
return 1;
#if IS_ENABLED(CONFIG_IPV6)
if (tb->fast_sk_family == AF_INET6)
return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
inet6_rcv_saddr(sk),
tb->fast_rcv_saddr,
sk->sk_rcv_saddr,
tb->fast_ipv6_only,
ipv6_only_sock(sk), true, false);
#endif
return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
ipv6_only_sock(sk), true, false);
}
void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
struct sock *sk)
{
kuid_t uid = sock_i_uid(sk);
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
if (hlist_empty(&tb->owners)) {
tb->fastreuse = reuse;
if (sk->sk_reuseport) {
tb->fastreuseport = FASTREUSEPORT_ANY;
tb->fastuid = uid;
tb->fast_rcv_saddr = sk->sk_rcv_saddr;
tb->fast_ipv6_only = ipv6_only_sock(sk);
tb->fast_sk_family = sk->sk_family;
#if IS_ENABLED(CONFIG_IPV6)
tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
#endif
} else {
tb->fastreuseport = 0;
}
} else {
if (!reuse)
tb->fastreuse = 0;
if (sk->sk_reuseport) {
/* We didn't match or we don't have fastreuseport set on
* the tb, but we have sk_reuseport set on this socket
* and we know that there are no bind conflicts with
* this socket in this tb, so reset our tb's reuseport
* settings so that any subsequent sockets that match
* our current socket will be put on the fast path.
*
* If we reset we need to set FASTREUSEPORT_STRICT so we
* do extra checking for all subsequent sk_reuseport
* socks.
*/
if (!sk_reuseport_match(tb, sk)) {
tb->fastreuseport = FASTREUSEPORT_STRICT;
tb->fastuid = uid;
tb->fast_rcv_saddr = sk->sk_rcv_saddr;
tb->fast_ipv6_only = ipv6_only_sock(sk);
tb->fast_sk_family = sk->sk_family;
#if IS_ENABLED(CONFIG_IPV6)
tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
#endif
}
} else {
tb->fastreuseport = 0;
}
}
}
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
* We try to allocate an odd port (and leave even ports for connect())
*/
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
int ret = 1, port = snum;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
struct inet_bind_bucket *tb = NULL;
int l3mdev;
l3mdev = inet_sk_bound_l3mdev(sk);
if (!port) {
head = inet_csk_find_open_port(sk, &tb, &port);
if (!head)
return ret;
if (!tb)
goto tb_not_found;
goto success;
}
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port)
goto tb_found;
tb_not_found:
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
net, head, port, l3mdev);
if (!tb)
goto fail_unlock;
tb_found:
if (!hlist_empty(&tb->owners)) {
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
if ((tb->fastreuse > 0 && reuse) ||
sk_reuseport_match(tb, sk))
goto success;
if (inet_csk_bind_conflict(sk, tb, true, true))
goto fail_unlock;
}
success:
inet_csk_update_fastreuse(tb, sk);
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
ret = 0;
fail_unlock:
spin_unlock_bh(&head->lock);
return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);
/*
* Wait for an incoming connection, avoid race conditions. This must be called
* with the socket locked.
*/
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
struct inet_connection_sock *icsk = inet_csk(sk);
DEFINE_WAIT(wait);
int err;
/*
* True wake-one mechanism for incoming connections: only
* one process gets woken up, not the 'whole herd'.
* Since we do not 'race & poll' for established sockets
* anymore, the common case will execute the loop only once.
*
* Subtle issue: "add_wait_queue_exclusive()" will be added
* after any current non-exclusive waiters, and we know that
* it will always _stay_ after any new non-exclusive waiters
* because all non-exclusive waiters are added at the
* beginning of the wait-queue. As such, it's ok to "drop"
* our exclusiveness temporarily when we get woken up without
* having to remove and re-insert us on the wait queue.
*/
for (;;) {
prepare_to_wait_exclusive(sk_sleep(sk), &wait,
TASK_INTERRUPTIBLE);
release_sock(sk);
if (reqsk_queue_empty(&icsk->icsk_accept_queue))
timeo = schedule_timeout(timeo);
sched_annotate_sleep();
lock_sock(sk);
err = 0;
if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
break;
err = -EINVAL;
if (sk->sk_state != TCP_LISTEN)
break;
err = sock_intr_errno(timeo);
if (signal_pending(current))
break;
err = -EAGAIN;
if (!timeo)
break;
}
finish_wait(sk_sleep(sk), &wait);
return err;
}
/*
* This will accept the next outstanding connection.
*/
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct request_sock *req;
struct sock *newsk;
int error;
lock_sock(sk);
/* We need to make sure that this socket is listening,
* and that it has something pending.
*/
error = -EINVAL;
if (sk->sk_state != TCP_LISTEN)
goto out_err;
/* Find already established connection */
if (reqsk_queue_empty(queue)) {
long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
/* If this is a non blocking socket don't sleep */
error = -EAGAIN;
if (!timeo)
goto out_err;
error = inet_csk_wait_for_connect(sk, timeo);
if (error)
goto out_err;
}
req = reqsk_queue_remove(queue, sk);
newsk = req->sk;
if (sk->sk_protocol == IPPROTO_TCP &&
tcp_rsk(req)->tfo_listener) {
spin_lock_bh(&queue->fastopenq.lock);
if (tcp_rsk(req)->tfo_listener) {
/* We are still waiting for the final ACK from 3WHS
* so can't free req now. Instead, we set req->sk to
* NULL to signify that the child socket is taken
* so reqsk_fastopen_remove() will free the req
* when 3WHS finishes (or is aborted).
*/
req->sk = NULL;
req = NULL;
}
spin_unlock_bh(&queue->fastopenq.lock);
}
out:
release_sock(sk);
if (newsk && mem_cgroup_sockets_enabled) {
int amt;
/* atomically get the memory usage, set and charge the
* newsk->sk_memcg.
*/
lock_sock(newsk);
/* The socket has not been accepted yet, no need to look at
* newsk->sk_wmem_queued.
*/
amt = sk_mem_pages(newsk->sk_forward_alloc +
atomic_read(&newsk->sk_rmem_alloc));
mem_cgroup_sk_alloc(newsk);
if (newsk->sk_memcg && amt)
mem_cgroup_charge_skmem(newsk->sk_memcg, amt,
GFP_KERNEL | __GFP_NOFAIL);
release_sock(newsk);
}
if (req)
reqsk_put(req);
return newsk;
out_err:
newsk = NULL;
req = NULL;
*err = error;
goto out;
}
EXPORT_SYMBOL(inet_csk_accept);
/*
* Using different timers for retransmit, delayed acks and probes
* We may wish use just one timer maintaining a list of expire jiffies
* to optimize.
*/
void inet_csk_init_xmit_timers(struct sock *sk,
void (*retransmit_handler)(struct timer_list *t),
void (*delack_handler)(struct timer_list *t),
void (*keepalive_handler)(struct timer_list *t))
{
struct inet_connection_sock *icsk = inet_csk(sk);
timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
timer_setup(&sk->sk_timer, keepalive_handler, 0);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
EXPORT_SYMBOL(inet_csk_init_xmit_timers);
void inet_csk_clear_xmit_timers(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
sk_stop_timer(sk, &icsk->icsk_delack_timer);
sk_stop_timer(sk, &sk->sk_timer);
}
EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
void inet_csk_delete_keepalive_timer(struct sock *sk)
{
sk_stop_timer(sk, &sk->sk_timer);
}
EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
}
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
struct dst_entry *inet_csk_route_req(const struct sock *sk,
struct flowi4 *fl4,
const struct request_sock *req)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct net *net = read_pnet(&ireq->ireq_net);
struct ip_options_rcu *opt;
struct rtable *rt;
rcu_read_lock();
opt = rcu_dereference(ireq->ireq_opt);
flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
htons(ireq->ir_num), sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
goto route_err;
rcu_read_unlock();
return &rt->dst;
route_err:
ip_rt_put(rt);
no_route:
rcu_read_unlock();
__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);
struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
struct sock *newsk,
const struct request_sock *req)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct net *net = read_pnet(&ireq->ireq_net);
struct inet_sock *newinet = inet_sk(newsk);
struct ip_options_rcu *opt;
struct flowi4 *fl4;
struct rtable *rt;
opt = rcu_dereference(ireq->ireq_opt);
fl4 = &newinet->cork.fl.u.ip4;
flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
htons(ireq->ir_num), sk->sk_uid);
security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
goto route_err;
return &rt->dst;
route_err:
ip_rt_put(rt);
no_route:
__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
/* Decide when to expire the request and when to resend SYN-ACK */
static void syn_ack_recalc(struct request_sock *req,
const int max_syn_ack_retries,
const u8 rskq_defer_accept,
int *expire, int *resend)
{
if (!rskq_defer_accept) {
*expire = req->num_timeout >= max_syn_ack_retries;
*resend = 1;
return;
}
*expire = req->num_timeout >= max_syn_ack_retries &&
(!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept);
/* Do not resend while waiting for data after ACK,
* start to resend on end of deferring period to give
* last chance for data or ACK to create established socket.
*/
*resend = !inet_rsk(req)->acked ||
req->num_timeout >= rskq_defer_accept - 1;
}
int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
{
int err = req->rsk_ops->rtx_syn_ack(parent, req);
if (!err)
req->num_retrans++;
return err;
}
EXPORT_SYMBOL(inet_rtx_syn_ack);
static struct request_sock *inet_reqsk_clone(struct request_sock *req,
struct sock *sk)
{
struct sock *req_sk, *nreq_sk;
struct request_sock *nreq;
nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
if (!nreq) {
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
/* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
sock_put(sk);
return NULL;
}
req_sk = req_to_sk(req);
nreq_sk = req_to_sk(nreq);
memcpy(nreq_sk, req_sk,
offsetof(struct sock, sk_dontcopy_begin));
memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));
sk_node_init(&nreq_sk->sk_node);
nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
#endif
nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
nreq->rsk_listener = sk;
/* We need not acquire fastopenq->lock
* because the child socket is locked in inet_csk_listen_stop().
*/
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
return nreq;
}
static void reqsk_queue_migrated(struct request_sock_queue *queue,
const struct request_sock *req)
{
if (req->num_timeout == 0)
atomic_inc(&queue->young);
atomic_inc(&queue->qlen);
}
static void reqsk_migrate_reset(struct request_sock *req)
{
req->saved_syn = NULL;
#if IS_ENABLED(CONFIG_IPV6)
inet_rsk(req)->ipv6_opt = NULL;
inet_rsk(req)->pktopts = NULL;
#else
inet_rsk(req)->ireq_opt = NULL;
#endif
}
/* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock *req)
{
struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
bool found = false;
if (sk_hashed(req_to_sk(req))) {
spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
spin_lock(lock);
found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
spin_unlock(lock);
}
if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
reqsk_put(req);
return found;
}
bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
{
bool unlinked = reqsk_queue_unlink(req);
if (unlinked) {
reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
reqsk_put(req);
}
return unlinked;
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
{
inet_csk_reqsk_queue_drop(sk, req);
reqsk_put(req);
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
static void reqsk_timer_handler(struct timer_list *t)
{
struct request_sock *req = from_timer(req, t, rsk_timer);
struct request_sock *nreq = NULL, *oreq = req;
struct sock *sk_listener = req->rsk_listener;
struct inet_connection_sock *icsk;
struct request_sock_queue *queue;
struct net *net;
int max_syn_ack_retries, qlen, expire = 0, resend = 0;
if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {
struct sock *nsk;
nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL);
if (!nsk)
goto drop;
nreq = inet_reqsk_clone(req, nsk);
if (!nreq)
goto drop;
/* The new timer for the cloned req can decrease the 2
* by calling inet_csk_reqsk_queue_drop_and_put(), so
* hold another count to prevent use-after-free and
* call reqsk_put() just before return.
*/
refcount_set(&nreq->rsk_refcnt, 2 + 1);
timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req);
req = nreq;
sk_listener = nsk;
}
icsk = inet_csk(sk_listener);
net = sock_net(sk_listener);
max_syn_ack_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
/* Normally all the openreqs are young and become mature
* (i.e. converted to established socket) for first timeout.
* If synack was not acknowledged for 1 second, it means
* one of the following things: synack was lost, ack was lost,
* rtt is high or nobody planned to ack (i.e. synflood).
* When server is a bit loaded, queue is populated with old
* open requests, reducing effective size of queue.
* When server is well loaded, queue size reduces to zero
* after several minutes of work. It is not synflood,
* it is normal operation. The solution is pruning
* too old entries overriding normal timeout, when
* situation becomes dangerous.
*
* Essentially, we reserve half of room for young
* embrions; and abort old ones without pity, if old
* ones are about to clog our table.
*/
queue = &icsk->icsk_accept_queue;
qlen = reqsk_queue_len(queue);
if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) {
int young = reqsk_queue_len_young(queue) << 1;
while (max_syn_ack_retries > 2) {
if (qlen < young)
break;
max_syn_ack_retries--;
young <<= 1;
}
}
syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept),
&expire, &resend);
req->rsk_ops->syn_ack_timeout(req);
if (!expire &&
(!resend ||
!inet_rtx_syn_ack(sk_listener, req) ||
inet_rsk(req)->acked)) {
unsigned long timeo;
if (req->num_timeout++ == 0)
atomic_dec(&queue->young);
timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
mod_timer(&req->rsk_timer, jiffies + timeo);
if (!nreq)
return;
if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) {
/* delete timer */
inet_csk_reqsk_queue_drop(sk_listener, nreq);
goto no_ownership;
}
__NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS);
reqsk_migrate_reset(oreq);
reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq);
reqsk_put(oreq);
reqsk_put(nreq);
return;
}
/* Even if we can clone the req, we may need not retransmit any more
* SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another
* CPU may win the "own_req" race so that inet_ehash_insert() fails.
*/
if (nreq) {
__NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE);
no_ownership:
reqsk_migrate_reset(nreq);
reqsk_queue_removed(queue, nreq);
__reqsk_free(nreq);
}
drop:
inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq);
}
static void reqsk_queue_hash_req(struct request_sock *req,
unsigned long timeout)
{
timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
mod_timer(&req->rsk_timer, jiffies + timeout);
inet_ehash_insert(req_to_sk(req), NULL, NULL);
/* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized.
*/
smp_wmb();
refcount_set(&req->rsk_refcnt, 2 + 1);
}
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
unsigned long timeout)
{
reqsk_queue_hash_req(req, timeout);
inet_csk_reqsk_queue_added(sk);
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk,
const gfp_t priority)
{
struct inet_connection_sock *icsk = inet_csk(newsk);
if (!icsk->icsk_ulp_ops)
return;
if (icsk->icsk_ulp_ops->clone)
icsk->icsk_ulp_ops->clone(req, newsk, priority);
}
/**
* inet_csk_clone_lock - clone an inet socket, and lock its clone
* @sk: the socket to clone
* @req: request_sock
* @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
*
* Caller must unlock socket even in error path (bh_unlock_sock(newsk))
*/
struct sock *inet_csk_clone_lock(const struct sock *sk,
const struct request_sock *req,
const gfp_t priority)
{
struct sock *newsk = sk_clone_lock(sk, priority);
if (newsk) {
struct inet_connection_sock *newicsk = inet_csk(newsk);
inet_sk_set_state(newsk, TCP_SYN_RECV);
newicsk->icsk_bind_hash = NULL;
inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
/* listeners have SOCK_RCU_FREE, not the children */
sock_reset_flag(newsk, SOCK_RCU_FREE);
inet_sk(newsk)->mc_list = NULL;
newsk->sk_mark = inet_rsk(req)->ir_mark;
atomic64_set(&newsk->sk_cookie,
atomic64_read(&inet_rsk(req)->ir_cookie));
newicsk->icsk_retransmits = 0;
newicsk->icsk_backoff = 0;
newicsk->icsk_probes_out = 0;
newicsk->icsk_probes_tstamp = 0;
/* Deinitialize accept_queue to trap illegal accesses. */
memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
inet_clone_ulp(req, newsk, priority);
security_inet_csk_clone(newsk, req);
}
return newsk;
}
EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
/*
* At this point, there should be no process reference to this
* socket, and thus no user references at all. Therefore we
* can assume the socket waitqueue is inactive and nobody will
* try to jump onto it.
*/
void inet_csk_destroy_sock(struct sock *sk)
{
WARN_ON(sk->sk_state != TCP_CLOSE);
WARN_ON(!sock_flag(sk, SOCK_DEAD));
/* It cannot be in hash table! */
WARN_ON(!sk_unhashed(sk));
/* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
sk->sk_prot->destroy(sk);
sk_stream_kill_queues(sk);
xfrm_sk_free_policy(sk);
sk_refcnt_debug_release(sk);
this_cpu_dec(*sk->sk_prot->orphan_count);
sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);
/* This function allows to force a closure of a socket after the call to
* tcp/dccp_create_openreq_child().
*/
void inet_csk_prepare_forced_close(struct sock *sk)
__releases(&sk->sk_lock.slock)
{
/* sk_clone_lock locked the socket and set refcnt to 2 */
bh_unlock_sock(sk);
sock_put(sk);
inet_csk_prepare_for_destroy_sock(sk);
inet_sk(sk)->inet_num = 0;
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);
int inet_csk_listen_start(struct sock *sk, int backlog)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
int err = -EADDRINUSE;
reqsk_queue_alloc(&icsk->icsk_accept_queue);
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
/* There is race window here: we announce ourselves listening,
* but this transition is still not validated by get_port().
* It is OK, because this socket enters to hash table only
* after validation is complete.
*/
inet_sk_state_store(sk, TCP_LISTEN);
if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
inet->inet_sport = htons(inet->inet_num);
sk_dst_reset(sk);
err = sk->sk_prot->hash(sk);
if (likely(!err))
return 0;
}
inet_sk_set_state(sk, TCP_CLOSE);
return err;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
static void inet_child_forget(struct sock *sk, struct request_sock *req,
struct sock *child)
{
sk->sk_prot->disconnect(child, O_NONBLOCK);
sock_orphan(child);
this_cpu_inc(*sk->sk_prot->orphan_count);
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
BUG_ON(sk != req->rsk_listener);
/* Paranoid, to prevent race condition if
* an inbound pkt destined for child is
* blocked by sock lock in tcp_v4_rcv().
* Also to satisfy an assertion in
* tcp_v4_destroy_sock().
*/
RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL);
}
inet_csk_destroy_sock(child);
}
struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
struct request_sock *req,
struct sock *child)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
spin_lock(&queue->rskq_lock);
if (unlikely(sk->sk_state != TCP_LISTEN)) {
inet_child_forget(sk, req, child);
child = NULL;
} else {
req->sk = child;
req->dl_next = NULL;
if (queue->rskq_accept_head == NULL)
WRITE_ONCE(queue->rskq_accept_head, req);
else
queue->rskq_accept_tail->dl_next = req;
queue->rskq_accept_tail = req;
sk_acceptq_added(sk);
}
spin_unlock(&queue->rskq_lock);
return child;
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
struct request_sock *req, bool own_req)
{
if (own_req) {
inet_csk_reqsk_queue_drop(req->rsk_listener, req);
reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
if (sk != req->rsk_listener) {
/* another listening sk has been selected,
* migrate the req to it.
*/
struct request_sock *nreq;
/* hold a refcnt for the nreq->rsk_listener
* which is assigned in inet_reqsk_clone()
*/
sock_hold(sk);
nreq = inet_reqsk_clone(req, sk);
if (!nreq) {
inet_child_forget(sk, req, child);
goto child_put;
}
refcount_set(&nreq->rsk_refcnt, 1);
if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS);
reqsk_migrate_reset(req);
reqsk_put(req);
return child;
}
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
reqsk_migrate_reset(nreq);
__reqsk_free(nreq);
} else if (inet_csk_reqsk_queue_add(sk, req, child)) {
return child;
}
}
/* Too bad, another child took ownership of the request, undo. */
child_put:
bh_unlock_sock(child);
sock_put(child);
return NULL;
}
EXPORT_SYMBOL(inet_csk_complete_hashdance);
/*
* This routine closes sockets which have been at least partially
* opened, but not yet accepted.
*/
void inet_csk_listen_stop(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct request_sock *next, *req;
/* Following specs, it would be better either to send FIN
* (and enter FIN-WAIT-1, it is normal close)
* or to send active reset (abort).
* Certainly, it is pretty dangerous while synflood, but it is
* bad justification for our negligence 8)
* To be honest, we are not able to make either
* of the variants now. --ANK
*/
while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
struct sock *child = req->sk, *nsk;
struct request_sock *nreq;
local_bh_disable();
bh_lock_sock(child);
WARN_ON(sock_owned_by_user(child));
sock_hold(child);
nsk = reuseport_migrate_sock(sk, child, NULL);
if (nsk) {
nreq = inet_reqsk_clone(req, nsk);
if (nreq) {
refcount_set(&nreq->rsk_refcnt, 1);
if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
__NET_INC_STATS(sock_net(nsk),
LINUX_MIB_TCPMIGRATEREQSUCCESS);
reqsk_migrate_reset(req);
} else {
__NET_INC_STATS(sock_net(nsk),
LINUX_MIB_TCPMIGRATEREQFAILURE);
reqsk_migrate_reset(nreq);
__reqsk_free(nreq);
}
/* inet_csk_reqsk_queue_add() has already
* called inet_child_forget() on failure case.
*/
goto skip_child_forget;
}
}
inet_child_forget(sk, req, child);
skip_child_forget:
reqsk_put(req);
bh_unlock_sock(child);
local_bh_enable();
sock_put(child);
cond_resched();
}
if (queue->fastopenq.rskq_rst_head) {
/* Free all the reqs queued in rskq_rst_head. */
spin_lock_bh(&queue->fastopenq.lock);
req = queue->fastopenq.rskq_rst_head;
queue->fastopenq.rskq_rst_head = NULL;
spin_unlock_bh(&queue->fastopenq.lock);
while (req != NULL) {
next = req->dl_next;
reqsk_put(req);
req = next;
}
}
WARN_ON_ONCE(sk->sk_ack_backlog);
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
const struct inet_sock *inet = inet_sk(sk);
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = inet->inet_daddr;
sin->sin_port = inet->inet_dport;
}
EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
{
const struct inet_sock *inet = inet_sk(sk);
const struct ip_options_rcu *inet_opt;
__be32 daddr = inet->inet_daddr;
struct flowi4 *fl4;
struct rtable *rt;
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
fl4 = &fl->u.ip4;
rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
inet->inet_saddr, inet->inet_dport,
inet->inet_sport, sk->sk_protocol,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
if (IS_ERR(rt))
rt = NULL;
if (rt)
sk_setup_caps(sk, &rt->dst);
rcu_read_unlock();
return &rt->dst;
}
struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
{
struct dst_entry *dst = __sk_dst_check(sk, 0);
struct inet_sock *inet = inet_sk(sk);
if (!dst) {
dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
if (!dst)
goto out;
}
dst->ops->update_pmtu(dst, sk, NULL, mtu, true);
dst = __sk_dst_check(sk, 0);
if (!dst)
dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
out:
return dst;
}
EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
// SPDX-License-Identifier: GPL-2.0
/*
* class.c - basic device class management
*
* Copyright (c) 2002-3 Patrick Mochel
* Copyright (c) 2002-3 Open Source Development Labs
* Copyright (c) 2003-2004 Greg Kroah-Hartman
* Copyright (c) 2003-2004 IBM Corp.
*/
#include <linux/device/class.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/kdev_t.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/mutex.h>
#include "base.h"
#define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr)
static ssize_t class_attr_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
struct class_attribute *class_attr = to_class_attr(attr);
struct subsys_private *cp = to_subsys_private(kobj);
ssize_t ret = -EIO;
if (class_attr->show)
ret = class_attr->show(cp->class, class_attr, buf);
return ret;
}
static ssize_t class_attr_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
struct class_attribute *class_attr = to_class_attr(attr);
struct subsys_private *cp = to_subsys_private(kobj);
ssize_t ret = -EIO;
if (class_attr->store)
ret = class_attr->store(cp->class, class_attr, buf, count);
return ret;
}
static void class_release(struct kobject *kobj)
{
struct subsys_private *cp = to_subsys_private(kobj);
struct class *class = cp->class;
pr_debug("class '%s': release.\n", class->name);
if (class->class_release)
class->class_release(class);
else
pr_debug("class '%s' does not have a release() function, "
"be careful\n", class->name);
kfree(cp);
}
static const struct kobj_ns_type_operations *class_child_ns_type(struct kobject *kobj)
{
struct subsys_private *cp = to_subsys_private(kobj);
struct class *class = cp->class;
return class->ns_type;
}
static const struct sysfs_ops class_sysfs_ops = {
.show = class_attr_show,
.store = class_attr_store,
};
static struct kobj_type class_ktype = {
.sysfs_ops = &class_sysfs_ops,
.release = class_release,
.child_ns_type = class_child_ns_type,
};
/* Hotplug events for classes go to the class subsys */
static struct kset *class_kset;
int class_create_file_ns(struct class *cls, const struct class_attribute *attr,
const void *ns)
{
int error;
if (cls)
error = sysfs_create_file_ns(&cls->p->subsys.kobj,
&attr->attr, ns);
else
error = -EINVAL;
return error;
}
void class_remove_file_ns(struct class *cls, const struct class_attribute *attr,
const void *ns)
{
if (cls)
sysfs_remove_file_ns(&cls->p->subsys.kobj, &attr->attr, ns);
}
static struct class *class_get(struct class *cls)
{
if (cls)
kset_get(&cls->p->subsys);
return cls;
}
static void class_put(struct class *cls)
{
if (cls)
kset_put(&cls->p->subsys);
}
static struct device *klist_class_to_dev(struct klist_node *n)
{
struct device_private *p = to_device_private_class(n);
return p->device;
}
static void klist_class_dev_get(struct klist_node *n)
{
struct device *dev = klist_class_to_dev(n);
get_device(dev);
}
static void klist_class_dev_put(struct klist_node *n)
{
struct device *dev = klist_class_to_dev(n);
put_device(dev);
}
static int class_add_groups(struct class *cls,
const struct attribute_group **groups)
{
return sysfs_create_groups(&cls->p->subsys.kobj, groups);
}
static void class_remove_groups(struct class *cls,
const struct attribute_group **groups)
{
return sysfs_remove_groups(&cls->p->subsys.kobj, groups);
}
int __class_register(struct class *cls, struct lock_class_key *key)
{
struct subsys_private *cp;
int error;
pr_debug("device class '%s': registering\n", cls->name);
cp = kzalloc(sizeof(*cp), GFP_KERNEL);
if (!cp)
return -ENOMEM;
klist_init(&cp->klist_devices, klist_class_dev_get, klist_class_dev_put);
INIT_LIST_HEAD(&cp->interfaces);
kset_init(&cp->glue_dirs);
__mutex_init(&cp->mutex, "subsys mutex", key);
error = kobject_set_name(&cp->subsys.kobj, "%s", cls->name);
if (error) {
kfree(cp);
return error;
}
/* set the default /sys/dev directory for devices of this class */
if (!cls->dev_kobj)
cls->dev_kobj = sysfs_dev_char_kobj;
#if defined(CONFIG_BLOCK)
/* let the block class directory show up in the root of sysfs */
if (!sysfs_deprecated || cls != &block_class)
cp->subsys.kobj.kset = class_kset;
#else
cp->subsys.kobj.kset = class_kset;
#endif
cp->subsys.kobj.ktype = &class_ktype;
cp->class = cls;
cls->p = cp;
error = kset_register(&cp->subsys);
if (error) {
kfree(cp);
return error;
}
error = class_add_groups(class_get(cls), cls->class_groups);
class_put(cls);
return error;
}
EXPORT_SYMBOL_GPL(__class_register);
void class_unregister(struct class *cls)
{
pr_debug("device class '%s': unregistering\n", cls->name);
class_remove_groups(cls, cls->class_groups);
kset_unregister(&cls->p->subsys);
}
static void class_create_release(struct class *cls)
{
pr_debug("%s called for %s\n", __func__, cls->name);
kfree(cls);
}
/**
* __class_create - create a struct class structure
* @owner: pointer to the module that is to "own" this struct class
* @name: pointer to a string for the name of this class.
* @key: the lock_class_key for this class; used by mutex lock debugging
*
* This is used to create a struct class pointer that can then be used
* in calls to device_create().
*
* Returns &struct class pointer on success, or ERR_PTR() on error.
*
* Note, the pointer created here is to be destroyed when finished by
* making a call to class_destroy().
*/
struct class *__class_create(struct module *owner, const char *name,
struct lock_class_key *key)
{
struct class *cls;
int retval;
cls = kzalloc(sizeof(*cls), GFP_KERNEL);
if (!cls) {
retval = -ENOMEM;
goto error;
}
cls->name = name;
cls->owner = owner;
cls->class_release = class_create_release;
retval = __class_register(cls, key);
if (retval)
goto error;
return cls;
error:
kfree(cls);
return ERR_PTR(retval);
}
EXPORT_SYMBOL_GPL(__class_create);
/**
* class_destroy - destroys a struct class structure
* @cls: pointer to the struct class that is to be destroyed
*
* Note, the pointer to be destroyed must have been created with a call
* to class_create().
*/
void class_destroy(struct class *cls)
{
if ((cls == NULL) || (IS_ERR(cls)))
return;
class_unregister(cls);
}
/**
* class_dev_iter_init - initialize class device iterator
* @iter: class iterator to initialize
* @class: the class we wanna iterate over
* @start: the device to start iterating from, if any
* @type: device_type of the devices to iterate over, NULL for all
*
* Initialize class iterator @iter such that it iterates over devices
* of @class. If @start is set, the list iteration will start there,
* otherwise if it is NULL, the iteration starts at the beginning of
* the list.
*/
void class_dev_iter_init(struct class_dev_iter *iter, struct class *class,
struct device *start, const struct device_type *type)
{
struct klist_node *start_knode = NULL;
if (start)
start_knode = &start->p->knode_class;
klist_iter_init_node(&class->p->klist_devices, &iter->ki, start_knode);
iter->type = type;
}
EXPORT_SYMBOL_GPL(class_dev_iter_init);
/**
* class_dev_iter_next - iterate to the next device
* @iter: class iterator to proceed
*
* Proceed @iter to the next device and return it. Returns NULL if
* iteration is complete.
*
* The returned device is referenced and won't be released till
* iterator is proceed to the next device or exited. The caller is
* free to do whatever it wants to do with the device including
* calling back into class code.
*/
struct device *class_dev_iter_next(struct class_dev_iter *iter)
{
struct klist_node *knode;
struct device *dev;
while (1) {
knode = klist_next(&iter->ki);
if (!knode)
return NULL;
dev = klist_class_to_dev(knode);
if (!iter->type || iter->type == dev->type)
return dev;
}
}
EXPORT_SYMBOL_GPL(class_dev_iter_next);
/**
* class_dev_iter_exit - finish iteration
* @iter: class iterator to finish
*
* Finish an iteration. Always call this function after iteration is
* complete whether the iteration ran till the end or not.
*/
void class_dev_iter_exit(struct class_dev_iter *iter)
{
klist_iter_exit(&iter->ki);
}
EXPORT_SYMBOL_GPL(class_dev_iter_exit);
/**
* class_for_each_device - device iterator
* @class: the class we're iterating
* @start: the device to start with in the list, if any.
* @data: data for the callback
* @fn: function to be called for each device
*
* Iterate over @class's list of devices, and call @fn for each,
* passing it @data. If @start is set, the list iteration will start
* there, otherwise if it is NULL, the iteration starts at the
* beginning of the list.
*
* We check the return of @fn each time. If it returns anything
* other than 0, we break out and return that value.
*
* @fn is allowed to do anything including calling back into class
* code. There's no locking restriction.
*/
int class_for_each_device(struct class *class, struct device *start,
void *data, int (*fn)(struct device *, void *))
{
struct class_dev_iter iter;
struct device *dev;
int error = 0;
if (!class)
return -EINVAL;
if (!class->p) {
WARN(1, "%s called for class '%s' before it was initialized",
__func__, class->name);
return -EINVAL;
}
class_dev_iter_init(&iter, class, start, NULL);
while ((dev = class_dev_iter_next(&iter))) {
error = fn(dev, data);
if (error)
break;
}
class_dev_iter_exit(&iter);
return error;
}
EXPORT_SYMBOL_GPL(class_for_each_device);
/**
* class_find_device - device iterator for locating a particular device
* @class: the class we're iterating
* @start: Device to begin with
* @data: data for the match function
* @match: function to check device
*
* This is similar to the class_for_each_dev() function above, but it
* returns a reference to a device that is 'found' for later use, as
* determined by the @match callback.
*
* The callback should return 0 if the device doesn't match and non-zero
* if it does. If the callback returns non-zero, this function will
* return to the caller and not iterate over any more devices.
*
* Note, you will need to drop the reference with put_device() after use.
*
* @match is allowed to do anything including calling back into class
* code. There's no locking restriction.
*/
struct device *class_find_device(struct class *class, struct device *start,
const void *data,
int (*match)(struct device *, const void *))
{
struct class_dev_iter iter;
struct device *dev;
if (!class)
return NULL;
if (!class->p) {
WARN(1, "%s called for class '%s' before it was initialized",
__func__, class->name);
return NULL;
}
class_dev_iter_init(&iter, class, start, NULL);
while ((dev = class_dev_iter_next(&iter))) {
if (match(dev, data)) {
get_device(dev);
break;
}
}
class_dev_iter_exit(&iter);
return dev;
}
EXPORT_SYMBOL_GPL(class_find_device);
int class_interface_register(struct class_interface *class_intf)
{
struct class *parent;
struct class_dev_iter iter;
struct device *dev;
if (!class_intf || !class_intf->class)
return -ENODEV;
parent = class_get(class_intf->class);
if (!parent)
return -EINVAL;
mutex_lock(&parent->p->mutex);
list_add_tail(&class_intf->node, &parent->p->interfaces);
if (class_intf->add_dev) {
class_dev_iter_init(&iter, parent, NULL, NULL);
while ((dev = class_dev_iter_next(&iter)))
class_intf->add_dev(dev, class_intf);
class_dev_iter_exit(&iter);
}
mutex_unlock(&parent->p->mutex);
return 0;
}
void class_interface_unregister(struct class_interface *class_intf)
{
struct class *parent = class_intf->class;
struct class_dev_iter iter;
struct device *dev;
if (!parent)
return;
mutex_lock(&parent->p->mutex);
list_del_init(&class_intf->node);
if (class_intf->remove_dev) {
class_dev_iter_init(&iter, parent, NULL, NULL);
while ((dev = class_dev_iter_next(&iter)))
class_intf->remove_dev(dev, class_intf);
class_dev_iter_exit(&iter);
}
mutex_unlock(&parent->p->mutex);
class_put(parent);
}
ssize_t show_class_attr_string(struct class *class,
struct class_attribute *attr, char *buf)
{
struct class_attribute_string *cs;
cs = container_of(attr, struct class_attribute_string, attr);
return sysfs_emit(buf, "%s\n", cs->str);
}
EXPORT_SYMBOL_GPL(show_class_attr_string);
struct class_compat {
struct kobject *kobj;
};
/**
* class_compat_register - register a compatibility class
* @name: the name of the class
*
* Compatibility class are meant as a temporary user-space compatibility
* workaround when converting a family of class devices to a bus devices.
*/
struct class_compat *class_compat_register(const char *name)
{
struct class_compat *cls;
cls = kmalloc(sizeof(struct class_compat), GFP_KERNEL);
if (!cls)
return NULL;
cls->kobj = kobject_create_and_add(name, &class_kset->kobj);
if (!cls->kobj) {
kfree(cls);
return NULL;
}
return cls;
}
EXPORT_SYMBOL_GPL(class_compat_register);
/**
* class_compat_unregister - unregister a compatibility class
* @cls: the class to unregister
*/
void class_compat_unregister(struct class_compat *cls)
{
kobject_put(cls->kobj);
kfree(cls);
}
EXPORT_SYMBOL_GPL(class_compat_unregister);
/**
* class_compat_create_link - create a compatibility class device link to
* a bus device
* @cls: the compatibility class
* @dev: the target bus device
* @device_link: an optional device to which a "device" link should be created
*/
int class_compat_create_link(struct class_compat *cls, struct device *dev,
struct device *device_link)
{
int error;
error = sysfs_create_link(cls->kobj, &dev->kobj, dev_name(dev));
if (error)
return error;
/*
* Optionally add a "device" link (typically to the parent), as a
* class device would have one and we want to provide as much
* backwards compatibility as possible.
*/
if (device_link) {
error = sysfs_create_link(&dev->kobj, &device_link->kobj,
"device");
if (error)
sysfs_remove_link(cls->kobj, dev_name(dev));
}
return error;
}
EXPORT_SYMBOL_GPL(class_compat_create_link);
/**
* class_compat_remove_link - remove a compatibility class device link to
* a bus device
* @cls: the compatibility class
* @dev: the target bus device
* @device_link: an optional device to which a "device" link was previously
* created
*/
void class_compat_remove_link(struct class_compat *cls, struct device *dev,
struct device *device_link)
{
if (device_link)
sysfs_remove_link(&dev->kobj, "device");
sysfs_remove_link(cls->kobj, dev_name(dev));
}
EXPORT_SYMBOL_GPL(class_compat_remove_link);
int __init classes_init(void)
{
class_kset = kset_create_and_add("class", NULL, NULL);
if (!class_kset)
return -ENOMEM;
return 0;
}
EXPORT_SYMBOL_GPL(class_create_file_ns);
EXPORT_SYMBOL_GPL(class_remove_file_ns);
EXPORT_SYMBOL_GPL(class_unregister);
EXPORT_SYMBOL_GPL(class_destroy);
EXPORT_SYMBOL_GPL(class_interface_register);
EXPORT_SYMBOL_GPL(class_interface_unregister);
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Access vector cache interface for object managers.
*
* Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#ifndef _SELINUX_AVC_H_
#define _SELINUX_AVC_H_
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/kdev_t.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/audit.h>
#include <linux/lsm_audit.h>
#include <linux/in6.h>
#include "flask.h"
#include "av_permissions.h"
#include "security.h"
/*
* An entry in the AVC.
*/
struct avc_entry;
struct task_struct;
struct inode;
struct sock;
struct sk_buff;
/*
* AVC statistics
*/
struct avc_cache_stats {
unsigned int lookups;
unsigned int misses;
unsigned int allocations;
unsigned int reclaims;
unsigned int frees;
};
/*
* We only need this data after we have decided to send an audit message.
*/
struct selinux_audit_data {
u32 ssid;
u32 tsid;
u16 tclass;
u32 requested;
u32 audited;
u32 denied;
int result;
struct selinux_state *state;
};
/*
* AVC operations
*/
void __init avc_init(void);
static inline u32 avc_audit_required(u32 requested,
struct av_decision *avd,
int result,
u32 auditdeny,
u32 *deniedp)
{
u32 denied, audited;
denied = requested & ~avd->allowed;
if (unlikely(denied)) {
audited = denied & avd->auditdeny;
/*
* auditdeny is TRICKY! Setting a bit in
* this field means that ANY denials should NOT be audited if
* the policy contains an explicit dontaudit rule for that
* permission. Take notice that this is unrelated to the
* actual permissions that were denied. As an example lets
* assume:
*
* denied == READ
* avd.auditdeny & ACCESS == 0 (not set means explicit rule)
* auditdeny & ACCESS == 1
*
* We will NOT audit the denial even though the denied
* permission was READ and the auditdeny checks were for
* ACCESS
*/
if (auditdeny && !(auditdeny & avd->auditdeny))
audited = 0;
} else if (result)
audited = denied = requested;
else
audited = requested & avd->auditallow;
*deniedp = denied;
return audited;
}
int slow_avc_audit(struct selinux_state *state,
u32 ssid, u32 tsid, u16 tclass,
u32 requested, u32 audited, u32 denied, int result,
struct common_audit_data *a);
/**
* avc_audit - Audit the granting or denial of permissions.
* @ssid: source security identifier
* @tsid: target security identifier
* @tclass: target security class
* @requested: requested permissions
* @avd: access vector decisions
* @result: result from avc_has_perm_noaudit
* @a: auxiliary audit data
*
* Audit the granting or denial of permissions in accordance
* with the policy. This function is typically called by
* avc_has_perm() after a permission check, but can also be
* called directly by callers who use avc_has_perm_noaudit()
* in order to separate the permission check from the auditing.
* For example, this separation is useful when the permission check must
* be performed under a lock, to allow the lock to be released
* before calling the auditing code.
*/
static inline int avc_audit(struct selinux_state *state,
u32 ssid, u32 tsid,
u16 tclass, u32 requested,
struct av_decision *avd,
int result,
struct common_audit_data *a)
{
u32 audited, denied;
audited = avc_audit_required(requested, avd, result, 0, &denied);
if (likely(!audited))
return 0;
return slow_avc_audit(state, ssid, tsid, tclass,
requested, audited, denied, result,
a);
}
#define AVC_STRICT 1 /* Ignore permissive mode. */
#define AVC_EXTENDED_PERMS 2 /* update extended permissions */
int avc_has_perm_noaudit(struct selinux_state *state,
u32 ssid, u32 tsid,
u16 tclass, u32 requested,
unsigned flags,
struct av_decision *avd);
int avc_has_perm(struct selinux_state *state,
u32 ssid, u32 tsid,
u16 tclass, u32 requested,
struct common_audit_data *auditdata);
int avc_has_extended_perms(struct selinux_state *state,
u32 ssid, u32 tsid, u16 tclass, u32 requested,
u8 driver, u8 perm, struct common_audit_data *ad);
u32 avc_policy_seqno(struct selinux_state *state);
#define AVC_CALLBACK_GRANT 1
#define AVC_CALLBACK_TRY_REVOKE 2
#define AVC_CALLBACK_REVOKE 4
#define AVC_CALLBACK_RESET 8
#define AVC_CALLBACK_AUDITALLOW_ENABLE 16
#define AVC_CALLBACK_AUDITALLOW_DISABLE 32
#define AVC_CALLBACK_AUDITDENY_ENABLE 64
#define AVC_CALLBACK_AUDITDENY_DISABLE 128
#define AVC_CALLBACK_ADD_XPERMS 256
int avc_add_callback(int (*callback)(u32 event), u32 events);
/* Exported to selinuxfs */
struct selinux_avc;
int avc_get_hash_stats(struct selinux_avc *avc, char *page);
unsigned int avc_get_cache_threshold(struct selinux_avc *avc);
void avc_set_cache_threshold(struct selinux_avc *avc,
unsigned int cache_threshold);
/* Attempt to free avc node cache */
void avc_disable(void);
#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
DECLARE_PER_CPU(struct avc_cache_stats, avc_cache_stats);
#endif
#endif /* _SELINUX_AVC_H_ */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Operations on the network namespace
*/
#ifndef __NET_NET_NAMESPACE_H
#define __NET_NET_NAMESPACE_H
#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/sysctl.h>
#include <linux/uidgid.h>
#include <net/flow.h>
#include <net/netns/core.h>
#include <net/netns/mib.h>
#include <net/netns/unix.h>
#include <net/netns/packet.h>
#include <net/netns/ipv4.h>
#include <net/netns/ipv6.h>
#include <net/netns/nexthop.h>
#include <net/netns/ieee802154_6lowpan.h>
#include <net/netns/sctp.h>
#include <net/netns/netfilter.h>
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#include <net/netns/conntrack.h>
#endif
#include <net/netns/nftables.h>
#include <net/netns/xfrm.h>
#include <net/netns/mpls.h>
#include <net/netns/can.h>
#include <net/netns/xdp.h>
#include <net/netns/smc.h>
#include <net/netns/bpf.h>
#include <net/netns/mctp.h>
#include <linux/ns_common.h>
#include <linux/idr.h>
#include <linux/skbuff.h>
#include <linux/notifier.h>
struct user_namespace;
struct proc_dir_entry;
struct net_device;
struct sock;
struct ctl_table_header;
struct net_generic;
struct uevent_sock;
struct netns_ipvs;
struct bpf_prog;
#define NETDEV_HASHBITS 8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
struct net {
/* First cache line can be often dirtied.
* Do not place here read-mostly fields.
*/
refcount_t passive; /* To decide when the network
* namespace should be freed.
*/
spinlock_t rules_mod_lock;
unsigned int dev_unreg_count;
unsigned int dev_base_seq; /* protected by rtnl_mutex */
int ifindex;
spinlock_t nsid_lock;
atomic_t fnhe_genid;
struct list_head list; /* list of network namespaces */
struct list_head exit_list; /* To linked to call pernet exit
* methods on dead net (
* pernet_ops_rwsem read locked),
* or to unregister pernet ops
* (pernet_ops_rwsem write locked).
*/
struct llist_node cleanup_list; /* namespaces on death row */
#ifdef CONFIG_KEYS
struct key_tag *key_domain; /* Key domain of operation tag */
#endif
struct user_namespace *user_ns; /* Owning user namespace */
struct ucounts *ucounts;
struct idr netns_ids;
struct ns_common ns;
struct list_head dev_base_head;
struct proc_dir_entry *proc_net;
struct proc_dir_entry *proc_net_stat;
#ifdef CONFIG_SYSCTL
struct ctl_table_set sysctls;
#endif
struct sock *rtnl; /* rtnetlink socket */
struct sock *genl_sock;
struct uevent_sock *uevent_sock; /* uevent socket */
struct hlist_head *dev_name_head;
struct hlist_head *dev_index_head;
struct raw_notifier_head netdev_chain;
/* Note that @hash_mix can be read millions times per second,
* it is critical that it is on a read_mostly cache line.
*/
u32 hash_mix;
struct net_device *loopback_dev; /* The loopback */
/* core fib_rules */
struct list_head rules_ops;
struct netns_core core;
struct netns_mib mib;
struct netns_packet packet;
struct netns_unix unx;
struct netns_nexthop nexthop;
struct netns_ipv4 ipv4;
#if IS_ENABLED(CONFIG_IPV6)
struct netns_ipv6 ipv6;
#endif
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
struct netns_ieee802154_lowpan ieee802154_lowpan;
#endif
#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
struct netns_sctp sctp;
#endif
#ifdef CONFIG_NETFILTER
struct netns_nf nf;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct netns_ct ct;
#endif
#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
struct netns_nftables nft;
#endif
#endif
#ifdef CONFIG_WEXT_CORE
struct sk_buff_head wext_nlevents;
#endif
struct net_generic __rcu *gen;
/* Used to store attached BPF programs */
struct netns_bpf bpf;
/* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM
struct netns_xfrm xfrm;
#endif
u64 net_cookie; /* written once */
#if IS_ENABLED(CONFIG_IP_VS)
struct netns_ipvs *ipvs;
#endif
#if IS_ENABLED(CONFIG_MPLS)
struct netns_mpls mpls;
#endif
#if IS_ENABLED(CONFIG_CAN)
struct netns_can can;
#endif
#ifdef CONFIG_XDP_SOCKETS
struct netns_xdp xdp;
#endif
#if IS_ENABLED(CONFIG_MCTP)
struct netns_mctp mctp;
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
struct sock *crypto_nlsk;
#endif
struct sock *diag_nlsk;
#if IS_ENABLED(CONFIG_SMC)
struct netns_smc smc;
#endif
} __randomize_layout;
#include <linux/seq_file_net.h>
/* Init's network namespace */
extern struct net init_net;
#ifdef CONFIG_NET_NS
struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns,
struct net *old_net);
void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid);
void net_ns_barrier(void);
struct ns_common *get_net_ns(struct ns_common *ns);
struct net *get_net_ns_by_fd(int fd);
#else /* CONFIG_NET_NS */
#include <linux/sched.h>
#include <linux/nsproxy.h>
static inline struct net *copy_net_ns(unsigned long flags,
struct user_namespace *user_ns, struct net *old_net)
{
if (flags & CLONE_NEWNET)
return ERR_PTR(-EINVAL);
return old_net;
}
static inline void net_ns_get_ownership(const struct net *net,
kuid_t *uid, kgid_t *gid)
{
*uid = GLOBAL_ROOT_UID;
*gid = GLOBAL_ROOT_GID;
}
static inline void net_ns_barrier(void) {}
static inline struct ns_common *get_net_ns(struct ns_common *ns)
{
return ERR_PTR(-EINVAL);
}
static inline struct net *get_net_ns_by_fd(int fd)
{
return ERR_PTR(-EINVAL);
}
#endif /* CONFIG_NET_NS */
extern struct list_head net_namespace_list;
struct net *get_net_ns_by_pid(pid_t pid);
#ifdef CONFIG_SYSCTL
void ipx_register_sysctl(void);
void ipx_unregister_sysctl(void);
#else
#define ipx_register_sysctl()
#define ipx_unregister_sysctl()
#endif
#ifdef CONFIG_NET_NS
void __put_net(struct net *net);
static inline struct net *get_net(struct net *net)
{
refcount_inc(&net->ns.count);
return net;
}
static inline struct net *maybe_get_net(struct net *net)
{
/* Used when we know struct net exists but we
* aren't guaranteed a previous reference count
* exists. If the reference count is zero this
* function fails and returns NULL.
*/
if (!refcount_inc_not_zero(&net->ns.count))
net = NULL;
return net;
}
static inline void put_net(struct net *net)
{
if (refcount_dec_and_test(&net->ns.count))
__put_net(net);
}
static inline
int net_eq(const struct net *net1, const struct net *net2)
{
return net1 == net2;
}
static inline int check_net(const struct net *net)
{
return refcount_read(&net->ns.count) != 0;
}
void net_drop_ns(void *);
#else
static inline struct net *get_net(struct net *net)
{
return net;
}
static inline void put_net(struct net *net)
{
}
static inline struct net *maybe_get_net(struct net *net)
{
return net;
}
static inline
int net_eq(const struct net *net1, const struct net *net2)
{
return 1;
}
static inline int check_net(const struct net *net)
{
return 1;
}
#define net_drop_ns NULL
#endif
typedef struct {
#ifdef CONFIG_NET_NS
struct net *net;
#endif
} possible_net_t;
static inline void write_pnet(possible_net_t *pnet, struct net *net)
{
#ifdef CONFIG_NET_NS
pnet->net = net;
#endif
}
static inline struct net *read_pnet(const possible_net_t *pnet)
{
#ifdef CONFIG_NET_NS
return pnet->net;
#else
return &init_net;
#endif
}
/* Protected by net_rwsem */
#define for_each_net(VAR) \
list_for_each_entry(VAR, &net_namespace_list, list)
#define for_each_net_continue_reverse(VAR) \
list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list)
#define for_each_net_rcu(VAR) \
list_for_each_entry_rcu(VAR, &net_namespace_list, list)
#ifdef CONFIG_NET_NS
#define __net_init
#define __net_exit
#define __net_initdata
#define __net_initconst
#else
#define __net_init __init
#define __net_exit __ref
#define __net_initdata __initdata
#define __net_initconst __initconst
#endif
int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp);
int peernet2id(const struct net *net, struct net *peer);
bool peernet_has_id(const struct net *net, struct net *peer);
struct net *get_net_ns_by_id(const struct net *net, int id);
struct pernet_operations {
struct list_head list;
/*
* Below methods are called without any exclusive locks.
* More than one net may be constructed and destructed
* in parallel on several cpus. Every pernet_operations
* have to keep in mind all other pernet_operations and
* to introduce a locking, if they share common resources.
*
* The only time they are called with exclusive lock is
* from register_pernet_subsys(), unregister_pernet_subsys()
* register_pernet_device() and unregister_pernet_device().
*
* Exit methods using blocking RCU primitives, such as
* synchronize_rcu(), should be implemented via exit_batch.
* Then, destruction of a group of net requires single
* synchronize_rcu() related to these pernet_operations,
* instead of separate synchronize_rcu() for every net.
* Please, avoid synchronize_rcu() at all, where it's possible.
*
* Note that a combination of pre_exit() and exit() can
* be used, since a synchronize_rcu() is guaranteed between
* the calls.
*/
int (*init)(struct net *net);
void (*pre_exit)(struct net *net);
void (*exit)(struct net *net);
void (*exit_batch)(struct list_head *net_exit_list);
unsigned int *id;
size_t size;
};
/*
* Use these carefully. If you implement a network device and it
* needs per network namespace operations use device pernet operations,
* otherwise use pernet subsys operations.
*
* Network interfaces need to be removed from a dying netns _before_
* subsys notifiers can be called, as most of the network code cleanup
* (which is done from subsys notifiers) runs with the assumption that
* dev_remove_pack has been called so no new packets will arrive during
* and after the cleanup functions have been called. dev_remove_pack
* is not per namespace so instead the guarantee of no more packets
* arriving in a network namespace is provided by ensuring that all
* network devices and all sockets have left the network namespace
* before the cleanup methods are called.
*
* For the longest time the ipv4 icmp code was registered as a pernet
* device which caused kernel oops, and panics during network
* namespace cleanup. So please don't get this wrong.
*/
int register_pernet_subsys(struct pernet_operations *);
void unregister_pernet_subsys(struct pernet_operations *);
int register_pernet_device(struct pernet_operations *);
void unregister_pernet_device(struct pernet_operations *);
struct ctl_table;
#ifdef CONFIG_SYSCTL
int net_sysctl_init(void);
struct ctl_table_header *register_net_sysctl(struct net *net, const char *path,
struct ctl_table *table);
void unregister_net_sysctl_table(struct ctl_table_header *header);
#else
static inline int net_sysctl_init(void) { return 0; }
static inline struct ctl_table_header *register_net_sysctl(struct net *net,
const char *path, struct ctl_table *table)
{
return NULL;
}
static inline void unregister_net_sysctl_table(struct ctl_table_header *header)
{
}
#endif
static inline int rt_genid_ipv4(const struct net *net)
{
return atomic_read(&net->ipv4.rt_genid);
}
#if IS_ENABLED(CONFIG_IPV6)
static inline int rt_genid_ipv6(const struct net *net)
{
return atomic_read(&net->ipv6.fib6_sernum);
}
#endif
static inline void rt_genid_bump_ipv4(struct net *net)
{
atomic_inc(&net->ipv4.rt_genid);
}
extern void (*__fib6_flush_trees)(struct net *net);
static inline void rt_genid_bump_ipv6(struct net *net)
{
if (__fib6_flush_trees)
__fib6_flush_trees(net);
}
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
static inline struct netns_ieee802154_lowpan *
net_ieee802154_lowpan(struct net *net)
{
return &net->ieee802154_lowpan;
}
#endif
/* For callers who don't really care about whether it's IPv4 or IPv6 */
static inline void rt_genid_bump_all(struct net *net)
{
rt_genid_bump_ipv4(net);
rt_genid_bump_ipv6(net);
}
static inline int fnhe_genid(const struct net *net)
{
return atomic_read(&net->fnhe_genid);
}
static inline void fnhe_genid_bump(struct net *net)
{
atomic_inc(&net->fnhe_genid);
}
#ifdef CONFIG_NET
void net_ns_init(void);
#else
static inline void net_ns_init(void) {}
#endif
#endif /* __NET_NET_NAMESPACE_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_BLOCK_GROUP_H
#define BTRFS_BLOCK_GROUP_H
#include "free-space-cache.h"
enum btrfs_disk_cache_state {
BTRFS_DC_WRITTEN,
BTRFS_DC_ERROR,
BTRFS_DC_CLEAR,
BTRFS_DC_SETUP,
};
/*
* This describes the state of the block_group for async discard. This is due
* to the two pass nature of it where extent discarding is prioritized over
* bitmap discarding. BTRFS_DISCARD_RESET_CURSOR is set when we are resetting
* between lists to prevent contention for discard state variables
* (eg. discard_cursor).
*/
enum btrfs_discard_state {
BTRFS_DISCARD_EXTENTS,
BTRFS_DISCARD_BITMAPS,
BTRFS_DISCARD_RESET_CURSOR,
};
/*
* Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to
* only allocate a chunk if we really need one.
*
* CHUNK_ALLOC_LIMITED means to only try and allocate one if we have very few
* chunks already allocated. This is used as part of the clustering code to
* help make sure we have a good pool of storage to cluster in, without filling
* the FS with empty chunks
*
* CHUNK_ALLOC_FORCE means it must try to allocate one
*/
enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_NO_FORCE,
CHUNK_ALLOC_LIMITED,
CHUNK_ALLOC_FORCE,
};
struct btrfs_caching_control {
struct list_head list;
struct mutex mutex;
wait_queue_head_t wait;
struct btrfs_work work;
struct btrfs_block_group *block_group;
u64 progress;
refcount_t count;
};
/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M
struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
struct inode *inode;
spinlock_t lock;
u64 start;
u64 length;
u64 pinned;
u64 reserved;
u64 used;
u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
u64 cache_generation;
/*
* If the free space extent count exceeds this number, convert the block
* group to bitmaps.
*/
u32 bitmap_high_thresh;
/*
* If the free space extent count drops below this number, convert the
* block group back to extents.
*/
u32 bitmap_low_thresh;
/*
* It is just used for the delayed data space allocation because
* only the data space allocation and the relative metadata update
* can be done cross the transaction.
*/
struct rw_semaphore data_rwsem;
/* For raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
unsigned int ro;
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
unsigned int to_copy:1;
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
int disk_cache_state;
/* Cache tracking stuff */
int cached;
struct btrfs_caching_control *caching_ctl;
u64 last_byte_to_unpin;
struct btrfs_space_info *space_info;
/* Free space cache stuff */
struct btrfs_free_space_ctl *free_space_ctl;
/* Block group cache stuff */
struct rb_node cache_node;
/* For block groups in the same raid type */
struct list_head list;
refcount_t refs;
/*
* List of struct btrfs_free_clusters for this block group.
* Today it will only have one thing on it, but that may change
*/
struct list_head cluster_list;
/* For delayed block group creation or deletion of empty block groups */
struct list_head bg_list;
/* For read-only block groups */
struct list_head ro_list;
/*
* When non-zero it means the block group's logical address and its
* device extents can not be reused for future block group allocations
* until the counter goes down to 0. This is to prevent them from being
* reused while some task is still using the block group after it was
* deleted - we want to make sure they can only be reused for new block
* groups after that task is done with the deleted block group.
*/
atomic_t frozen;
/* For discard operations */
struct list_head discard_list;
int discard_index;
u64 discard_eligible_time;
u64 discard_cursor;
enum btrfs_discard_state discard_state;
/* For dirty block groups */
struct list_head dirty_list;
struct list_head io_list;
struct btrfs_io_ctl io_ctl;
/*
* Incremented when doing extent allocations and holding a read lock
* on the space_info's groups_sem semaphore.
* Decremented when an ordered extent that represents an IO against this
* block group's range is created (after it's added to its inode's
* root's list of ordered extents) or immediately after the allocation
* if it's a metadata extent or fallocate extent (for these cases we
* don't create ordered extents).
*/
atomic_t reservations;
/*
* Incremented while holding the spinlock *lock* by a task checking if
* it can perform a nocow write (incremented if the value for the *ro*
* field is 0). Decremented by such tasks once they create an ordered
* extent or before that if some error happens before reaching that step.
* This is to prevent races between block group relocation and nocow
* writes through direct IO.
*/
atomic_t nocow_writers;
/* Lock for free space tree operations. */
struct mutex free_space_lock;
/*
* Does the block group need to be added to the free space tree?
* Protected by free_space_lock.
*/
int needs_free_space;
/* Flag indicating this block group is placed on a sequential zone */
bool seq_zone;
/*
* Number of extents in this block group used for swap files.
* All accesses protected by the spinlock 'lock'.
*/
int swap_extents;
/* Record locked full stripes for RAID5/6 block group */
struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
/*
* Allocation offset for the block group to implement sequential
* allocation. This is used only on a zoned filesystem.
*/
u64 alloc_offset;
u64 zone_unusable;
u64 meta_write_pointer;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
{
return (block_group->start + block_group->length);
}
static inline bool btrfs_is_block_group_data_only(
struct btrfs_block_group *block_group)
{
/*
* In mixed mode the fragmentation is expected to be high, lowering the
* efficiency, so only proper data block groups are considered.
*/
return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
!(block_group->flags & BTRFS_BLOCK_GROUP_METADATA);
}
#ifdef CONFIG_BTRFS_DEBUG
static inline int btrfs_should_fragment_free_space(
struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
(btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
block_group->flags & BTRFS_BLOCK_GROUP_DATA);
}
#endif
struct btrfs_block_group *btrfs_lookup_first_block_group(
struct btrfs_fs_info *info, u64 bytenr);
struct btrfs_block_group *btrfs_lookup_block_group(
struct btrfs_fs_info *info, u64 bytenr);
struct btrfs_block_group *btrfs_next_block_group(
struct btrfs_block_group *cache);
void btrfs_get_block_group(struct btrfs_block_group *cache);
void btrfs_put_block_group(struct btrfs_block_group *cache);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache);
int btrfs_cache_block_group(struct btrfs_block_group *cache,
int load_cache_only);
void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache);
u64 add_new_free_space(struct btrfs_block_group *block_group,
u64 start, u64 end);
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
void btrfs_reclaim_bgs_work(struct work_struct *work);
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 bytes_used, u64 type,
u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc);
void btrfs_dec_block_group_ro(struct btrfs_block_group *cache);
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, int alloc);
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc);
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
u64 num_bytes, int delalloc);
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
struct btrfs_caching_control *caching_ctl);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
struct block_device *bdev, u64 physical, u64 **logical,
int *naddrs, int *stripe_len);
static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
{
return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
}
static inline u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
{
return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
}
static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
{
return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
}
static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
{
smp_mb();
return cache->cached == BTRFS_CACHE_FINISHED ||
cache->cached == BTRFS_CACHE_ERROR;
}
void btrfs_freeze_block_group(struct btrfs_block_group *cache);
void btrfs_unfreeze_block_group(struct btrfs_block_group *cache);
bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg);
void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount);
#endif /* BTRFS_BLOCK_GROUP_H */
// SPDX-License-Identifier: GPL-2.0
#include "tree-mod-log.h"
#include "disk-io.h"
struct tree_mod_root {
u64 logical;
u8 level;
};
struct tree_mod_elem {
struct rb_node node;
u64 logical;
u64 seq;
enum btrfs_mod_log_op op;
/*
* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
* operations.
*/
int slot;
/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
u64 generation;
/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
struct btrfs_disk_key key;
u64 blockptr;
/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
struct {
int dst_slot;
int nr_items;
} move;
/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
struct tree_mod_root old_root;
};
/*
* Pull a new tree mod seq number for our operation.
*/
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
return atomic64_inc_return(&fs_info->tree_mod_seq);
}
/*
* This adds a new blocker to the tree mod log's blocker list if the @elem
* passed does not already have a sequence number set. So when a caller expects
* to record tree modifications, it should ensure to set elem->seq to zero
* before calling btrfs_get_tree_mod_seq.
* Returns a fresh, unused tree log modification sequence number, even if no new
* blocker was added.
*/
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct btrfs_seq_list *elem)
{
write_lock(&fs_info->tree_mod_log_lock);
if (!elem->seq) {
elem->seq = btrfs_inc_tree_mod_seq(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
}
write_unlock(&fs_info->tree_mod_log_lock);
return elem->seq;
}
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct btrfs_seq_list *elem)
{
struct rb_root *tm_root;
struct rb_node *node;
struct rb_node *next;
struct tree_mod_elem *tm;
u64 min_seq = BTRFS_SEQ_LAST;
u64 seq_putting = elem->seq;
if (!seq_putting)
return;
write_lock(&fs_info->tree_mod_log_lock);
list_del(&elem->list);
elem->seq = 0;
if (list_empty(&fs_info->tree_mod_seq_list)) {
clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
} else {
struct btrfs_seq_list *first;
first = list_first_entry(&fs_info->tree_mod_seq_list,
struct btrfs_seq_list, list);
if (seq_putting > first->seq) {
/*
* Blocker with lower sequence number exists, we cannot
* remove anything from the log.
*/
write_unlock(&fs_info->tree_mod_log_lock);
return;
}
min_seq = first->seq;
}
/*
* Anything that's lower than the lowest existing (read: blocked)
* sequence number can be removed from the tree.
*/
tm_root = &fs_info->tree_mod_log;
for (node = rb_first(tm_root); node; node = next) {
next = rb_next(node);
tm = rb_entry(node, struct tree_mod_elem, node);
if (tm->seq >= min_seq)
continue;
rb_erase(node, tm_root);
kfree(tm);
}
write_unlock(&fs_info->tree_mod_log_lock);
}
/*
* Key order of the log:
* node/leaf start address -> sequence
*
* The 'start address' is the logical address of the *new* root node for root
* replace operations, or the logical address of the affected block for all
* other operations.
*/
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
struct tree_mod_elem *tm)
{
struct rb_root *tm_root;
struct rb_node **new;
struct rb_node *parent = NULL;
struct tree_mod_elem *cur;
lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
tm->seq = btrfs_inc_tree_mod_seq(fs_info);
tm_root = &fs_info->tree_mod_log;
new = &tm_root->rb_node;
while (*new) {
cur = rb_entry(*new, struct tree_mod_elem, node);
parent = *new;
if (cur->logical < tm->logical)
new = &((*new)->rb_left);
else if (cur->logical > tm->logical)
new = &((*new)->rb_right);
else if (cur->seq < tm->seq)
new = &((*new)->rb_left);
else if (cur->seq > tm->seq)
new = &((*new)->rb_right);
else
return -EEXIST;
}
rb_link_node(&tm->node, parent, new);
rb_insert_color(&tm->node, tm_root);
return 0;
}
/*
* Determines if logging can be omitted. Returns true if it can. Otherwise, it
* returns false with the tree_mod_log_lock acquired. The caller must hold
* this until all tree mod log insertions are recorded in the rb tree and then
* write unlock fs_info::tree_mod_log_lock.
*/
static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb)
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return true;
if (eb && btrfs_header_level(eb) == 0)
return true;
write_lock(&fs_info->tree_mod_log_lock);
if (list_empty(&(fs_info)->tree_mod_seq_list)) {
write_unlock(&fs_info->tree_mod_log_lock);
return true;
}
return false;
}
/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
struct extent_buffer *eb)
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return false;
if (eb && btrfs_header_level(eb) == 0)
return false;
return true;
}
static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
int slot,
enum btrfs_mod_log_op op,
gfp_t flags)
{
struct tree_mod_elem *tm;
tm = kzalloc(sizeof(*tm), flags);
if (!tm)
return NULL;
tm->logical = eb->start;
if (op != BTRFS_MOD_LOG_KEY_ADD) {
btrfs_node_key(eb, &tm->key, slot);
tm->blockptr = btrfs_node_blockptr(eb, slot);
}
tm->op = op;
tm->slot = slot;
tm->generation = btrfs_node_ptr_generation(eb, slot);
RB_CLEAR_NODE(&tm->node);
return tm;
}
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
enum btrfs_mod_log_op op, gfp_t flags)
{
struct tree_mod_elem *tm;
int ret;
if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
tm = alloc_tree_mod_elem(eb, slot, op, flags);
if (!tm)
return -ENOMEM;
if (tree_mod_dont_log(eb->fs_info, eb)) { kfree(tm); return 0;
}
ret = tree_mod_log_insert(eb->fs_info, tm);
write_unlock(&eb->fs_info->tree_mod_log_lock);
if (ret)
kfree(tm);
return ret;
}
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
int dst_slot, int src_slot,
int nr_items)
{
struct tree_mod_elem *tm = NULL;
struct tree_mod_elem **tm_list = NULL;
int ret = 0;
int i;
bool locked = false;
if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
if (!tm_list)
return -ENOMEM;
tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm) {
ret = -ENOMEM;
goto free_tms;
}
tm->logical = eb->start;
tm->slot = src_slot;
tm->move.dst_slot = dst_slot;
tm->move.nr_items = nr_items;
tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
}
}
if (tree_mod_dont_log(eb->fs_info, eb))
goto free_tms;
locked = true;
/*
* When we override something during the move, we log these removals.
* This can only happen when we move towards the beginning of the
* buffer, i.e. dst_slot < src_slot.
*/
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
if (ret)
goto free_tms;
}
ret = tree_mod_log_insert(eb->fs_info, tm);
if (ret)
goto free_tms;
write_unlock(&eb->fs_info->tree_mod_log_lock);
kfree(tm_list);
return 0;
free_tms:
for (i = 0; i < nr_items; i++) { if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); kfree(tm_list[i]);
}
if (locked) write_unlock(&eb->fs_info->tree_mod_log_lock); kfree(tm_list);
kfree(tm);
return ret;
}
static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
struct tree_mod_elem **tm_list,
int nritems)
{
int i, j;
int ret;
for (i = nritems - 1; i >= 0; i--) { ret = tree_mod_log_insert(fs_info, tm_list[i]);
if (ret) {
for (j = nritems - 1; j > i; j--) rb_erase(&tm_list[j]->node,
&fs_info->tree_mod_log);
return ret;
}
}
return 0;
}
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
struct extent_buffer *new_root,
bool log_removal)
{
struct btrfs_fs_info *fs_info = old_root->fs_info;
struct tree_mod_elem *tm = NULL;
struct tree_mod_elem **tm_list = NULL;
int nritems = 0;
int ret = 0;
int i;
if (!tree_mod_need_log(fs_info, NULL))
return 0;
if (log_removal && btrfs_header_level(old_root) > 0) {
nritems = btrfs_header_nritems(old_root);
tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
GFP_NOFS);
if (!tm_list) {
ret = -ENOMEM;
goto free_tms;
}
for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i,
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
}
}
}
tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm) {
ret = -ENOMEM;
goto free_tms;
}
tm->logical = new_root->start;
tm->old_root.logical = old_root->start;
tm->old_root.level = btrfs_header_level(old_root);
tm->generation = btrfs_header_generation(old_root);
tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
if (tree_mod_dont_log(fs_info, NULL))
goto free_tms;
if (tm_list)
ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
if (!ret)
ret = tree_mod_log_insert(fs_info, tm);
write_unlock(&fs_info->tree_mod_log_lock);
if (ret)
goto free_tms;
kfree(tm_list);
return ret;
free_tms:
if (tm_list) { for (i = 0; i < nritems; i++) kfree(tm_list[i]); kfree(tm_list);
}
kfree(tm); return ret;
}
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
u64 start, u64 min_seq,
bool smallest)
{
struct rb_root *tm_root;
struct rb_node *node;
struct tree_mod_elem *cur = NULL;
struct tree_mod_elem *found = NULL;
read_lock(&fs_info->tree_mod_log_lock);
tm_root = &fs_info->tree_mod_log;
node = tm_root->rb_node;
while (node) {
cur = rb_entry(node, struct tree_mod_elem, node);
if (cur->logical < start) {
node = node->rb_left;
} else if (cur->logical > start) {
node = node->rb_right;
} else if (cur->seq < min_seq) {
node = node->rb_left;
} else if (!smallest) {
/* We want the node with the highest seq */
if (found)
BUG_ON(found->seq > cur->seq);
found = cur;
node = node->rb_left;
} else if (cur->seq > min_seq) {
/* We want the node with the smallest seq */
if (found)
BUG_ON(found->seq < cur->seq);
found = cur;
node = node->rb_right;
} else {
found = cur;
break;
}
}
read_unlock(&fs_info->tree_mod_log_lock);
return found;
}
/*
* This returns the element from the log with the smallest time sequence
* value that's in the log (the oldest log item). Any element with a time
* sequence lower than min_seq will be ignored.
*/
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
u64 start, u64 min_seq)
{
return __tree_mod_log_search(fs_info, start, min_seq, true);
}
/*
* This returns the element from the log with the largest time sequence
* value that's in the log (the most recent log item). Any element with
* a time sequence lower than min_seq will be ignored.
*/
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
u64 start, u64 min_seq)
{
return __tree_mod_log_search(fs_info, start, min_seq, false);
}
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
struct extent_buffer *src,
unsigned long dst_offset,
unsigned long src_offset,
int nr_items)
{
struct btrfs_fs_info *fs_info = dst->fs_info;
int ret = 0;
struct tree_mod_elem **tm_list = NULL;
struct tree_mod_elem **tm_list_add, **tm_list_rem;
int i;
bool locked = false;
if (!tree_mod_need_log(fs_info, NULL))
return 0;
if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
return 0;
tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
GFP_NOFS);
if (!tm_list)
return -ENOMEM;
tm_list_add = tm_list;
tm_list_rem = tm_list + nr_items;
for (i = 0; i < nr_items; i++) {
tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
if (!tm_list_rem[i]) {
ret = -ENOMEM;
goto free_tms;
}
tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
if (!tm_list_add[i]) {
ret = -ENOMEM;
goto free_tms;
}
}
if (tree_mod_dont_log(fs_info, NULL))
goto free_tms;
locked = true;
for (i = 0; i < nr_items; i++) {
ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
if (ret)
goto free_tms;
ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
if (ret)
goto free_tms;
}
write_unlock(&fs_info->tree_mod_log_lock);
kfree(tm_list);
return 0;
free_tms:
for (i = 0; i < nr_items * 2; i++) {
if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
kfree(tm_list[i]);
}
if (locked)
write_unlock(&fs_info->tree_mod_log_lock);
kfree(tm_list);
return ret;
}
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
struct tree_mod_elem **tm_list = NULL;
int nritems = 0;
int i;
int ret = 0;
if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
nritems = btrfs_header_nritems(eb);
tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
if (!tm_list)
return -ENOMEM;
for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i,
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
}
}
if (tree_mod_dont_log(eb->fs_info, eb))
goto free_tms;
ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); write_unlock(&eb->fs_info->tree_mod_log_lock);
if (ret)
goto free_tms;
kfree(tm_list);
return 0;
free_tms:
for (i = 0; i < nritems; i++) kfree(tm_list[i]); kfree(tm_list); return ret;
}
/*
* Returns the logical address of the oldest predecessor of the given root.
* Entries older than time_seq are ignored.
*/
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
u64 time_seq)
{
struct tree_mod_elem *tm;
struct tree_mod_elem *found = NULL;
u64 root_logical = eb_root->start;
bool looped = false;
if (!time_seq)
return NULL;
/*
* The very last operation that's logged for a root is the replacement
* operation (if it is replaced at all). This has the logical address
* of the *new* root, making it the very first operation that's logged
* for this root.
*/
while (1) {
tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
time_seq);
if (!looped && !tm)
return NULL;
/*
* If there are no tree operation for the oldest root, we simply
* return it. This should only happen if that (old) root is at
* level 0.
*/
if (!tm)
break;
/*
* If there's an operation that's not a root replacement, we
* found the oldest version of our root. Normally, we'll find a
* BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
*/
if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
break;
found = tm;
root_logical = tm->old_root.logical;
looped = true;
}
/* If there's no old root to return, return what we found instead */
if (!found)
found = tm;
return found;
}
/*
* tm is a pointer to the first operation to rewind within eb. Then, all
* previous operations will be rewound (until we reach something older than
* time_seq).
*/
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb,
u64 time_seq,
struct tree_mod_elem *first_tm)
{
u32 n;
struct rb_node *next;
struct tree_mod_elem *tm = first_tm;
unsigned long o_dst;
unsigned long o_src;
unsigned long p_size = sizeof(struct btrfs_key_ptr);
n = btrfs_header_nritems(eb);
read_lock(&fs_info->tree_mod_log_lock);
while (tm && tm->seq >= time_seq) {
/*
* All the operations are recorded with the operator used for
* the modification. As we're going backwards, we do the
* opposite of each operation here.
*/
switch (tm->op) {
case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
BUG_ON(tm->slot < n);
fallthrough;
case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
case BTRFS_MOD_LOG_KEY_REMOVE:
btrfs_set_node_key(eb, &tm->key, tm->slot);
btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
n++;
break;
case BTRFS_MOD_LOG_KEY_REPLACE:
BUG_ON(tm->slot >= n);
btrfs_set_node_key(eb, &tm->key, tm->slot);
btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
break;
case BTRFS_MOD_LOG_KEY_ADD:
/* if a move operation is needed it's in the log */
n--;
break;
case BTRFS_MOD_LOG_MOVE_KEYS:
o_dst = btrfs_node_key_ptr_offset(tm->slot);
o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
memmove_extent_buffer(eb, o_dst, o_src,
tm->move.nr_items * p_size);
break;
case BTRFS_MOD_LOG_ROOT_REPLACE:
/*
* This operation is special. For roots, this must be
* handled explicitly before rewinding.
* For non-roots, this operation may exist if the node
* was a root: root A -> child B; then A gets empty and
* B is promoted to the new root. In the mod log, we'll
* have a root-replace operation for B, a tree block
* that is no root. We simply ignore that operation.
*/
break;
}
next = rb_next(&tm->node);
if (!next)
break;
tm = rb_entry(next, struct tree_mod_elem, node);
if (tm->logical != first_tm->logical)
break;
}
read_unlock(&fs_info->tree_mod_log_lock);
btrfs_set_header_nritems(eb, n);
}
/*
* Called with eb read locked. If the buffer cannot be rewound, the same buffer
* is returned. If rewind operations happen, a fresh buffer is returned. The
* returned buffer is always read-locked. If the returned buffer is not the
* input buffer, the lock on the input buffer is released and the input buffer
* is freed (its refcount is decremented).
*/
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct extent_buffer *eb,
u64 time_seq)
{
struct extent_buffer *eb_rewin;
struct tree_mod_elem *tm;
if (!time_seq)
return eb;
if (btrfs_header_level(eb) == 0)
return eb;
tm = tree_mod_log_search(fs_info, eb->start, time_seq);
if (!tm)
return eb;
if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
if (!eb_rewin) {
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
return NULL;
}
btrfs_set_header_bytenr(eb_rewin, eb->start);
btrfs_set_header_backref_rev(eb_rewin,
btrfs_header_backref_rev(eb));
btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
} else {
eb_rewin = btrfs_clone_extent_buffer(eb);
if (!eb_rewin) {
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
return NULL;
}
}
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
eb_rewin, btrfs_header_level(eb_rewin));
btrfs_tree_read_lock(eb_rewin);
tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
BTRFS_NODEPTRS_PER_BLOCK(fs_info));
return eb_rewin;
}
/*
* Rewind the state of @root's root node to the given @time_seq value.
* If there are no changes, the current root->root_node is returned. If anything
* changed in between, there's a fresh buffer allocated on which the rewind
* operations are done. In any case, the returned buffer is read locked.
* Returns NULL on error (with no locks held).
*/
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct tree_mod_elem *tm;
struct extent_buffer *eb = NULL;
struct extent_buffer *eb_root;
u64 eb_root_owner = 0;
struct extent_buffer *old;
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
u64 logical;
int level;
eb_root = btrfs_read_lock_root_node(root);
tm = tree_mod_log_oldest_root(eb_root, time_seq);
if (!tm)
return eb_root;
if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
old_root = &tm->old_root;
old_generation = tm->generation;
logical = old_root->logical;
level = old_root->level;
} else {
logical = eb_root->start;
level = btrfs_header_level(eb_root);
}
tm = tree_mod_log_search(fs_info, logical, time_seq);
if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
old = read_tree_block(fs_info, logical, root->root_key.objectid,
0, level, NULL);
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
if (!IS_ERR(old))
free_extent_buffer(old);
btrfs_warn(fs_info,
"failed to read tree block %llu from get_old_root",
logical);
} else {
struct tree_mod_elem *tm2;
btrfs_tree_read_lock(old);
eb = btrfs_clone_extent_buffer(old);
/*
* After the lookup for the most recent tree mod operation
* above and before we locked and cloned the extent buffer
* 'old', a new tree mod log operation may have been added.
* So lookup for a more recent one to make sure the number
* of mod log operations we replay is consistent with the
* number of items we have in the cloned extent buffer,
* otherwise we can hit a BUG_ON when rewinding the extent
* buffer.
*/
tm2 = tree_mod_log_search(fs_info, logical, time_seq);
btrfs_tree_read_unlock(old);
free_extent_buffer(old);
ASSERT(tm2);
ASSERT(tm2 == tm || tm2->seq > tm->seq);
if (!tm2 || tm2->seq < tm->seq) {
free_extent_buffer(eb);
return NULL;
}
tm = tm2;
}
} else if (old_root) {
eb_root_owner = btrfs_header_owner(eb_root);
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(fs_info, logical);
} else {
eb = btrfs_clone_extent_buffer(eb_root);
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
}
if (!eb)
return NULL;
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(eb, eb_root_owner);
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
btrfs_header_level(eb));
btrfs_tree_read_lock(eb);
if (tm)
tree_mod_log_rewind(fs_info, eb, time_seq, tm);
else
WARN_ON(btrfs_header_level(eb) != 0);
WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
return eb;
}
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
struct tree_mod_elem *tm;
int level;
struct extent_buffer *eb_root = btrfs_root_node(root);
tm = tree_mod_log_oldest_root(eb_root, time_seq);
if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
level = tm->old_root.level;
else
level = btrfs_header_level(eb_root);
free_extent_buffer(eb_root);
return level;
}
/*
* Return the lowest sequence number in the tree modification log.
*
* Return the sequence number of the oldest tree modification log user, which
* corresponds to the lowest sequence number of all existing users. If there are
* no users it returns 0.
*/
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
u64 ret = 0;
read_lock(&fs_info->tree_mod_log_lock);
if (!list_empty(&fs_info->tree_mod_seq_list)) {
struct btrfs_seq_list *elem;
elem = list_first_entry(&fs_info->tree_mod_seq_list,
struct btrfs_seq_list, list);
ret = elem->seq;
}
read_unlock(&fs_info->tree_mod_log_lock);
return ret;
}
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Tracing hooks
*
* Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved.
*
* This file defines hook entry points called by core code where
* user tracing/debugging support might need to do something. These
* entry points are called tracehook_*(). Each hook declared below
* has a detailed kerneldoc comment giving the context (locking et
* al) from which it is called, and the meaning of its return value.
*
* Each function here typically has only one call site, so it is ok
* to have some nontrivial tracehook_*() inlines. In all cases, the
* fast path when no tracing is enabled should be very short.
*
* The purpose of this file and the tracehook_* layer is to consolidate
* the interface that the kernel core and arch code uses to enable any
* user debugging or tracing facility (such as ptrace). The interfaces
* here are carefully documented so that maintainers of core and arch
* code do not need to think about the implementation details of the
* tracing facilities. Likewise, maintainers of the tracing code do not
* need to understand all the calling core or arch code in detail, just
* documented circumstances of each call, such as locking conditions.
*
* If the calling core code changes so that locking is different, then
* it is ok to change the interface documented here. The maintainer of
* core code changing should notify the maintainers of the tracing code
* that they need to work out the change.
*
* Some tracehook_*() inlines take arguments that the current tracing
* implementations might not necessarily use. These function signatures
* are chosen to pass in all the information that is on hand in the
* caller and might conceivably be relevant to a tracer, so that the
* core code won't have to be updated when tracing adds more features.
* If a call site changes so that some of those parameters are no longer
* already on hand without extra work, then the tracehook_* interface
* can change so there is no make-work burden on the core code. The
* maintainer of core code changing should notify the maintainers of the
* tracing code that they need to work out the change.
*/
#ifndef _LINUX_TRACEHOOK_H
#define _LINUX_TRACEHOOK_H 1
#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/task_work.h>
#include <linux/memcontrol.h>
#include <linux/blk-cgroup.h>
struct linux_binprm;
/*
* ptrace report for syscall entry and exit looks identical.
*/
static inline int ptrace_report_syscall(struct pt_regs *regs,
unsigned long message)
{
int ptrace = current->ptrace;
if (!(ptrace & PT_PTRACED))
return 0;
current->ptrace_message = message;
ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
/*
* this isn't the same as continuing with a signal, but it will do
* for normal use. strace only continues with a signal if the
* stopping signal is not SIGTRAP. -brl
*/
if (current->exit_code) {
send_sig(current->exit_code, current, 1);
current->exit_code = 0;
}
current->ptrace_message = 0;
return fatal_signal_pending(current);
}
/**
* tracehook_report_syscall_entry - task is about to attempt a system call
* @regs: user register state of current task
*
* This will be called if %SYSCALL_WORK_SYSCALL_TRACE or
* %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just
* entered the kernel for a system call. Full user register state is
* available here. Changing the values in @regs can affect the system
* call number and arguments to be tried. It is safe to block here,
* preventing the system call from beginning.
*
* Returns zero normally, or nonzero if the calling arch code should abort
* the system call. That must prevent normal entry so no system call is
* made. If @task ever returns to user mode after this, its register state
* is unspecified, but should be something harmless like an %ENOSYS error
* return. It should preserve enough information so that syscall_rollback()
* can work (see asm-generic/syscall.h).
*
* Called without locks, just after entering kernel mode.
*/
static inline __must_check int tracehook_report_syscall_entry(
struct pt_regs *regs)
{
return ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_ENTRY);
}
/**
* tracehook_report_syscall_exit - task has just finished a system call
* @regs: user register state of current task
* @step: nonzero if simulating single-step or block-step
*
* This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when
* the current task has just finished an attempted system call. Full
* user register state is available here. It is safe to block here,
* preventing signals from being processed.
*
* If @step is nonzero, this report is also in lieu of the normal
* trap that would follow the system call instruction because
* user_enable_block_step() or user_enable_single_step() was used.
* In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set.
*
* Called without locks, just before checking for pending signals.
*/
static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
{
if (step)
user_single_step_report(regs);
else
ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_EXIT);
}
/**
* tracehook_signal_handler - signal handler setup is complete
* @stepping: nonzero if debugger single-step or block-step in use
*
* Called by the arch code after a signal handler has been set up.
* Register and stack state reflects the user handler about to run.
* Signal mask changes have already been made.
*
* Called without locks, shortly before returning to user mode
* (or handling more signals).
*/
static inline void tracehook_signal_handler(int stepping)
{
if (stepping) ptrace_notify(SIGTRAP);
}
/**
* set_notify_resume - cause tracehook_notify_resume() to be called
* @task: task that will call tracehook_notify_resume()
*
* Calling this arranges that @task will call tracehook_notify_resume()
* before returning to user mode. If it's already running in user mode,
* it will enter the kernel and call tracehook_notify_resume() soon.
* If it's blocked, it will not be woken.
*/
static inline void set_notify_resume(struct task_struct *task)
{
#ifdef TIF_NOTIFY_RESUME
if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
kick_process(task);
#endif
}
/**
* tracehook_notify_resume - report when about to return to user mode
* @regs: user-mode registers of @current task
*
* This is called when %TIF_NOTIFY_RESUME has been set. Now we are
* about to return to user mode, and the user state in @regs can be
* inspected or adjusted. The caller in arch code has cleared
* %TIF_NOTIFY_RESUME before the call. If the flag gets set again
* asynchronously, this will be called again before we return to
* user mode.
*
* Called without locks.
*/
static inline void tracehook_notify_resume(struct pt_regs *regs)
{
clear_thread_flag(TIF_NOTIFY_RESUME);
/*
* This barrier pairs with task_work_add()->set_notify_resume() after
* hlist_add_head(task->task_works);
*/
smp_mb__after_atomic();
if (unlikely(current->task_works))
task_work_run();
#ifdef CONFIG_KEYS_REQUEST_CACHE
if (unlikely(current->cached_requested_key)) {
key_put(current->cached_requested_key);
current->cached_requested_key = NULL;
}
#endif
mem_cgroup_handle_over_high();
blkcg_maybe_throttle_current();
rseq_handle_notify_resume(NULL, regs);
}
/*
* called by exit_to_user_mode_loop() if ti_work & _TIF_NOTIFY_SIGNAL. This
* is currently used by TWA_SIGNAL based task_work, which requires breaking
* wait loops to ensure that task_work is noticed and run.
*/
static inline void tracehook_notify_signal(void)
{
clear_thread_flag(TIF_NOTIFY_SIGNAL);
smp_mb__after_atomic();
if (current->task_works)
task_work_run();
}
/*
* Called when we have work to process from exit_to_user_mode_loop()
*/
static inline void set_notify_signal(struct task_struct *task)
{
if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
!wake_up_state(task, TASK_INTERRUPTIBLE))
kick_process(task);
}
#endif /* <linux/tracehook.h> */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* workqueue.h --- work queue handling for Linux.
*/
#ifndef _LINUX_WORKQUEUE_H
#define _LINUX_WORKQUEUE_H
#include <linux/timer.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/rcupdate.h>
struct workqueue_struct;
struct work_struct;
typedef void (*work_func_t)(struct work_struct *work);
void delayed_work_timer_fn(struct timer_list *t);
/*
* The first word is the work queue pointer and the flags rolled into
* one
*/
#define work_data_bits(work) ((unsigned long *)(&(work)->data))
enum {
WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */
WORK_STRUCT_INACTIVE_BIT= 1, /* work item is inactive */
WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */
WORK_STRUCT_LINKED_BIT = 3, /* next work is linked to this one */
#ifdef CONFIG_DEBUG_OBJECTS_WORK
WORK_STRUCT_STATIC_BIT = 4, /* static initializer (debugobjects) */
WORK_STRUCT_COLOR_SHIFT = 5, /* color for workqueue flushing */
#else
WORK_STRUCT_COLOR_SHIFT = 4, /* color for workqueue flushing */
#endif
WORK_STRUCT_COLOR_BITS = 4,
WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT,
WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT,
WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT,
WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT,
#ifdef CONFIG_DEBUG_OBJECTS_WORK
WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT,
#else
WORK_STRUCT_STATIC = 0,
#endif
WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS),
/* not bound to any CPU, prefer the local CPU */
WORK_CPU_UNBOUND = NR_CPUS,
/*
* Reserve 8 bits off of pwq pointer w/ debugobjects turned off.
* This makes pwqs aligned to 256 bytes and allows 16 workqueue
* flush colors.
*/
WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT +
WORK_STRUCT_COLOR_BITS,
/* data contains off-queue information when !WORK_STRUCT_PWQ */
WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT,
__WORK_OFFQ_CANCELING = WORK_OFFQ_FLAG_BASE,
WORK_OFFQ_CANCELING = (1 << __WORK_OFFQ_CANCELING),
/*
* When a work item is off queue, its high bits point to the last
* pool it was on. Cap at 31 bits and use the highest number to
* indicate that no pool is associated.
*/
WORK_OFFQ_FLAG_BITS = 1,
WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS,
WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT,
WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31,
WORK_OFFQ_POOL_NONE = (1LU << WORK_OFFQ_POOL_BITS) - 1,
/* convenience constants */
WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1,
WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK,
WORK_STRUCT_NO_POOL = (unsigned long)WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT,
/* bit mask for work_busy() return values */
WORK_BUSY_PENDING = 1 << 0,
WORK_BUSY_RUNNING = 1 << 1,
/* maximum string length for set_worker_desc() */
WORKER_DESC_LEN = 24,
};
struct work_struct {
atomic_long_t data;
struct list_head entry;
work_func_t func;
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
};
#define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
#define WORK_DATA_STATIC_INIT() \
ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))
struct delayed_work {
struct work_struct work;
struct timer_list timer;
/* target workqueue and CPU ->timer uses to queue ->work */
struct workqueue_struct *wq;
int cpu;
};
struct rcu_work {
struct work_struct work;
struct rcu_head rcu;
/* target workqueue ->rcu uses to queue ->work */
struct workqueue_struct *wq;
};
/**
* struct workqueue_attrs - A struct for workqueue attributes.
*
* This can be used to change attributes of an unbound workqueue.
*/
struct workqueue_attrs {
/**
* @nice: nice level
*/
int nice;
/**
* @cpumask: allowed CPUs
*/
cpumask_var_t cpumask;
/**
* @no_numa: disable NUMA affinity
*
* Unlike other fields, ``no_numa`` isn't a property of a worker_pool. It
* only modifies how :c:func:`apply_workqueue_attrs` select pools and thus
* doesn't participate in pool hash calculations or equality comparisons.
*/
bool no_numa;
};
static inline struct delayed_work *to_delayed_work(struct work_struct *work)
{
return container_of(work, struct delayed_work, work);
}
static inline struct rcu_work *to_rcu_work(struct work_struct *work)
{
return container_of(work, struct rcu_work, work);
}
struct execute_work {
struct work_struct work;
};
#ifdef CONFIG_LOCKDEP
/*
* NB: because we have to copy the lockdep_map, setting _key
* here is required, otherwise it could get initialised to the
* copy of the lockdep_map!
*/
#define __WORK_INIT_LOCKDEP_MAP(n, k) \
.lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k),
#else
#define __WORK_INIT_LOCKDEP_MAP(n, k)
#endif
#define __WORK_INITIALIZER(n, f) { \
.data = WORK_DATA_STATIC_INIT(), \
.entry = { &(n).entry, &(n).entry }, \
.func = (f), \
__WORK_INIT_LOCKDEP_MAP(#n, &(n)) \
}
#define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \
.work = __WORK_INITIALIZER((n).work, (f)), \
.timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\
(tflags) | TIMER_IRQSAFE), \
}
#define DECLARE_WORK(n, f) \
struct work_struct n = __WORK_INITIALIZER(n, f)
#define DECLARE_DELAYED_WORK(n, f) \
struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0)
#define DECLARE_DEFERRABLE_WORK(n, f) \
struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE)
#ifdef CONFIG_DEBUG_OBJECTS_WORK
extern void __init_work(struct work_struct *work, int onstack);
extern void destroy_work_on_stack(struct work_struct *work);
extern void destroy_delayed_work_on_stack(struct delayed_work *work);
static inline unsigned int work_static(struct work_struct *work)
{
return *work_data_bits(work) & WORK_STRUCT_STATIC;
}
#else
static inline void __init_work(struct work_struct *work, int onstack) { }
static inline void destroy_work_on_stack(struct work_struct *work) { }
static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { }
static inline unsigned int work_static(struct work_struct *work) { return 0; }
#endif
/*
* initialize all of a work item in one go
*
* NOTE! No point in using "atomic_long_set()": using a direct
* assignment of the work data initializer allows the compiler
* to generate better code.
*/
#ifdef CONFIG_LOCKDEP
#define __INIT_WORK(_work, _func, _onstack) \
do { \
static struct lock_class_key __key; \
\
__init_work((_work), _onstack); \
(_work)->data = (atomic_long_t) WORK_DATA_INIT(); \
lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \
INIT_LIST_HEAD(&(_work)->entry); \
(_work)->func = (_func); \
} while (0)
#else
#define __INIT_WORK(_work, _func, _onstack) \
do { \
__init_work((_work), _onstack); \
(_work)->data = (atomic_long_t) WORK_DATA_INIT(); \
INIT_LIST_HEAD(&(_work)->entry); \
(_work)->func = (_func); \
} while (0)
#endif
#define INIT_WORK(_work, _func) \
__INIT_WORK((_work), (_func), 0)
#define INIT_WORK_ONSTACK(_work, _func) \
__INIT_WORK((_work), (_func), 1)
#define __INIT_DELAYED_WORK(_work, _func, _tflags) \
do { \
INIT_WORK(&(_work)->work, (_func)); \
__init_timer(&(_work)->timer, \
delayed_work_timer_fn, \
(_tflags) | TIMER_IRQSAFE); \
} while (0)
#define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags) \
do { \
INIT_WORK_ONSTACK(&(_work)->work, (_func)); \
__init_timer_on_stack(&(_work)->timer, \
delayed_work_timer_fn, \
(_tflags) | TIMER_IRQSAFE); \
} while (0)
#define INIT_DELAYED_WORK(_work, _func) \
__INIT_DELAYED_WORK(_work, _func, 0)
#define INIT_DELAYED_WORK_ONSTACK(_work, _func) \
__INIT_DELAYED_WORK_ONSTACK(_work, _func, 0)
#define INIT_DEFERRABLE_WORK(_work, _func) \
__INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE)
#define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func) \
__INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE)
#define INIT_RCU_WORK(_work, _func) \
INIT_WORK(&(_work)->work, (_func))
#define INIT_RCU_WORK_ONSTACK(_work, _func) \
INIT_WORK_ONSTACK(&(_work)->work, (_func))
/**
* work_pending - Find out whether a work item is currently pending
* @work: The work item in question
*/
#define work_pending(work) \
test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))
/**
* delayed_work_pending - Find out whether a delayable work item is currently
* pending
* @w: The work item in question
*/
#define delayed_work_pending(w) \
work_pending(&(w)->work)
/*
* Workqueue flags and constants. For details, please refer to
* Documentation/core-api/workqueue.rst.
*/
enum {
WQ_UNBOUND = 1 << 1, /* not bound to any cpu */
WQ_FREEZABLE = 1 << 2, /* freeze during suspend */
WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */
WQ_HIGHPRI = 1 << 4, /* high priority */
WQ_CPU_INTENSIVE = 1 << 5, /* cpu intensive workqueue */
WQ_SYSFS = 1 << 6, /* visible in sysfs, see workqueue_sysfs_register() */
/*
* Per-cpu workqueues are generally preferred because they tend to
* show better performance thanks to cache locality. Per-cpu
* workqueues exclude the scheduler from choosing the CPU to
* execute the worker threads, which has an unfortunate side effect
* of increasing power consumption.
*
* The scheduler considers a CPU idle if it doesn't have any task
* to execute and tries to keep idle cores idle to conserve power;
* however, for example, a per-cpu work item scheduled from an
* interrupt handler on an idle CPU will force the scheduler to
* execute the work item on that CPU breaking the idleness, which in
* turn may lead to more scheduling choices which are sub-optimal
* in terms of power consumption.
*
* Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default
* but become unbound if workqueue.power_efficient kernel param is
* specified. Per-cpu workqueues which are identified to
* contribute significantly to power-consumption are identified and
* marked with this flag and enabling the power_efficient mode
* leads to noticeable power saving at the cost of small
* performance disadvantage.
*
* http://thread.gmane.org/gmane.linux.kernel/1480396
*/
WQ_POWER_EFFICIENT = 1 << 7,
__WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */
__WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */
__WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */
__WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */
WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */
WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */
WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2,
};
/* unbound wq's aren't per-cpu, scale max_active according to #cpus */
#define WQ_UNBOUND_MAX_ACTIVE \
max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU)
/*
* System-wide workqueues which are always present.
*
* system_wq is the one used by schedule[_delayed]_work[_on]().
* Multi-CPU multi-threaded. There are users which expect relatively
* short queue flush time. Don't queue works which can run for too
* long.
*
* system_highpri_wq is similar to system_wq but for work items which
* require WQ_HIGHPRI.
*
* system_long_wq is similar to system_wq but may host long running
* works. Queue flushing might take relatively long.
*
* system_unbound_wq is unbound workqueue. Workers are not bound to
* any specific CPU, not concurrency managed, and all queued works are
* executed immediately as long as max_active limit is not reached and
* resources are available.
*
* system_freezable_wq is equivalent to system_wq except that it's
* freezable.
*
* *_power_efficient_wq are inclined towards saving power and converted
* into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise,
* they are same as their non-power-efficient counterparts - e.g.
* system_power_efficient_wq is identical to system_wq if
* 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info.
*/
extern struct workqueue_struct *system_wq;
extern struct workqueue_struct *system_highpri_wq;
extern struct workqueue_struct *system_long_wq;
extern struct workqueue_struct *system_unbound_wq;
extern struct workqueue_struct *system_freezable_wq;
extern struct workqueue_struct *system_power_efficient_wq;
extern struct workqueue_struct *system_freezable_power_efficient_wq;
/**
* alloc_workqueue - allocate a workqueue
* @fmt: printf format for the name of the workqueue
* @flags: WQ_* flags
* @max_active: max in-flight work items, 0 for default
* remaining args: args for @fmt
*
* Allocate a workqueue with the specified parameters. For detailed
* information on WQ_* flags, please refer to
* Documentation/core-api/workqueue.rst.
*
* RETURNS:
* Pointer to the allocated workqueue on success, %NULL on failure.
*/
__printf(1, 4) struct workqueue_struct *
alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...);
/**
* alloc_ordered_workqueue - allocate an ordered workqueue
* @fmt: printf format for the name of the workqueue
* @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
* @args...: args for @fmt
*
* Allocate an ordered workqueue. An ordered workqueue executes at
* most one work item at any given time in the queued order. They are
* implemented as unbound workqueues with @max_active of one.
*
* RETURNS:
* Pointer to the allocated workqueue on success, %NULL on failure.
*/
#define alloc_ordered_workqueue(fmt, flags, args...) \
alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | \
__WQ_ORDERED_EXPLICIT | (flags), 1, ##args)
#define create_workqueue(name) \
alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
#define create_freezable_workqueue(name) \
alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \
WQ_MEM_RECLAIM, 1, (name))
#define create_singlethread_workqueue(name) \
alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)
extern void destroy_workqueue(struct workqueue_struct *wq);
struct workqueue_attrs *alloc_workqueue_attrs(void);
void free_workqueue_attrs(struct workqueue_attrs *attrs);
int apply_workqueue_attrs(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs);
int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
struct work_struct *work);
extern bool queue_work_node(int node, struct workqueue_struct *wq,
struct work_struct *work);
extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
struct delayed_work *work, unsigned long delay);
extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
struct delayed_work *dwork, unsigned long delay);
extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork);
extern void flush_workqueue(struct workqueue_struct *wq);
extern void drain_workqueue(struct workqueue_struct *wq);
extern int schedule_on_each_cpu(work_func_t func);
int execute_in_process_context(work_func_t fn, struct execute_work *);
extern bool flush_work(struct work_struct *work);
extern bool cancel_work_sync(struct work_struct *work);
extern bool flush_delayed_work(struct delayed_work *dwork);
extern bool cancel_delayed_work(struct delayed_work *dwork);
extern bool cancel_delayed_work_sync(struct delayed_work *dwork);
extern bool flush_rcu_work(struct rcu_work *rwork);
extern void workqueue_set_max_active(struct workqueue_struct *wq,
int max_active);
extern struct work_struct *current_work(void);
extern bool current_is_workqueue_rescuer(void);
extern bool workqueue_congested(int cpu, struct workqueue_struct *wq);
extern unsigned int work_busy(struct work_struct *work);
extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
extern void print_worker_info(const char *log_lvl, struct task_struct *task);
extern void show_workqueue_state(void);
extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task);
/**
* queue_work - queue work on a workqueue
* @wq: workqueue to use
* @work: work to queue
*
* Returns %false if @work was already on a queue, %true otherwise.
*
* We queue the work to the CPU on which it was submitted, but if the CPU dies
* it can be processed by another CPU.
*
* Memory-ordering properties: If it returns %true, guarantees that all stores
* preceding the call to queue_work() in the program order will be visible from
* the CPU which will execute @work by the time such work executes, e.g.,
*
* { x is initially 0 }
*
* CPU0 CPU1
*
* WRITE_ONCE(x, 1); [ @work is being executed ]
* r0 = queue_work(wq, work); r1 = READ_ONCE(x);
*
* Forbids: r0 == true && r1 == 0
*/
static inline bool queue_work(struct workqueue_struct *wq,
struct work_struct *work)
{
return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}
/**
* queue_delayed_work - queue work on a workqueue after delay
* @wq: workqueue to use
* @dwork: delayable work to queue
* @delay: number of jiffies to wait before queueing
*
* Equivalent to queue_delayed_work_on() but tries to use the local CPU.
*/
static inline bool queue_delayed_work(struct workqueue_struct *wq,
struct delayed_work *dwork,
unsigned long delay)
{
return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}
/**
* mod_delayed_work - modify delay of or queue a delayed work
* @wq: workqueue to use
* @dwork: work to queue
* @delay: number of jiffies to wait before queueing
*
* mod_delayed_work_on() on local CPU.
*/
static inline bool mod_delayed_work(struct workqueue_struct *wq,
struct delayed_work *dwork,
unsigned long delay)
{
return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}
/**
* schedule_work_on - put work task on a specific cpu
* @cpu: cpu to put the work task on
* @work: job to be done
*
* This puts a job on a specific cpu
*/
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
return queue_work_on(cpu, system_wq, work);
}
/**
* schedule_work - put work task in global workqueue
* @work: job to be done
*
* Returns %false if @work was already on the kernel-global workqueue and
* %true otherwise.
*
* This puts a job in the kernel-global workqueue if it was not already
* queued and leaves it in the same position on the kernel-global
* workqueue otherwise.
*
* Shares the same memory-ordering properties of queue_work(), cf. the
* DocBook header of queue_work().
*/
static inline bool schedule_work(struct work_struct *work)
{
return queue_work(system_wq, work);
}
/**
* flush_scheduled_work - ensure that any scheduled work has run to completion.
*
* Forces execution of the kernel-global workqueue and blocks until its
* completion.
*
* Think twice before calling this function! It's very easy to get into
* trouble if you don't take great care. Either of the following situations
* will lead to deadlock:
*
* One of the work items currently on the workqueue needs to acquire
* a lock held by your code or its caller.
*
* Your code is running in the context of a work routine.
*
* They will be detected by lockdep when they occur, but the first might not
* occur very often. It depends on what work items are on the workqueue and
* what locks they need, which you have no control over.
*
* In most situations flushing the entire workqueue is overkill; you merely
* need to know that a particular work item isn't queued and isn't running.
* In such cases you should use cancel_delayed_work_sync() or
* cancel_work_sync() instead.
*/
static inline void flush_scheduled_work(void)
{
flush_workqueue(system_wq);
}
/**
* schedule_delayed_work_on - queue work in global workqueue on CPU after delay
* @cpu: cpu to use
* @dwork: job to be done
* @delay: number of jiffies to wait
*
* After waiting for a given time this puts a job in the kernel-global
* workqueue on the specified CPU.
*/
static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
unsigned long delay)
{
return queue_delayed_work_on(cpu, system_wq, dwork, delay);
}
/**
* schedule_delayed_work - put work task in global workqueue after delay
* @dwork: job to be done
* @delay: number of jiffies to wait or 0 for immediate execution
*
* After waiting for a given time this puts a job in the kernel-global
* workqueue.
*/
static inline bool schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
return queue_delayed_work(system_wq, dwork, delay);
}
#ifndef CONFIG_SMP
static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
return fn(arg);
}
static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
{
return fn(arg);
}
#else
long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER
extern void freeze_workqueues_begin(void);
extern bool freeze_workqueues_busy(void);
extern void thaw_workqueues(void);
#endif /* CONFIG_FREEZER */
#ifdef CONFIG_SYSFS
int workqueue_sysfs_register(struct workqueue_struct *wq);
#else /* CONFIG_SYSFS */
static inline int workqueue_sysfs_register(struct workqueue_struct *wq)
{ return 0; }
#endif /* CONFIG_SYSFS */
#ifdef CONFIG_WQ_WATCHDOG
void wq_watchdog_touch(int cpu);
#else /* CONFIG_WQ_WATCHDOG */
static inline void wq_watchdog_touch(int cpu) { }
#endif /* CONFIG_WQ_WATCHDOG */
#ifdef CONFIG_SMP
int workqueue_prepare_cpu(unsigned int cpu);
int workqueue_online_cpu(unsigned int cpu);
int workqueue_offline_cpu(unsigned int cpu);
#endif
void __init workqueue_init_early(void);
void __init workqueue_init(void);
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_SECTIONS_H_
#define _ASM_GENERIC_SECTIONS_H_
/* References to section boundaries */
#include <linux/compiler.h>
#include <linux/types.h>
/*
* Usage guidelines:
* _text, _data: architecture specific, don't use them in arch-independent code
* [_stext, _etext]: contains .text.* sections, may also contain .rodata.*
* and/or .init.* sections
* [_sdata, _edata]: contains .data.* sections, may also contain .rodata.*
* and/or .init.* sections.
* [__start_rodata, __end_rodata]: contains .rodata.* sections
* [__start_ro_after_init, __end_ro_after_init]:
* contains .data..ro_after_init section
* [__init_begin, __init_end]: contains .init.* sections, but .init.text.*
* may be out of this range on some architectures.
* [_sinittext, _einittext]: contains .init.text.* sections
* [__bss_start, __bss_stop]: contains BSS sections
*
* Following global variables are optional and may be unavailable on some
* architectures and/or kernel configurations.
* _text, _data
* __kprobes_text_start, __kprobes_text_end
* __entry_text_start, __entry_text_end
* __ctors_start, __ctors_end
* __irqentry_text_start, __irqentry_text_end
* __softirqentry_text_start, __softirqentry_text_end
* __start_opd, __end_opd
*/
extern char _text[], _stext[], _etext[];
extern char _data[], _sdata[], _edata[];
extern char __bss_start[], __bss_stop[];
extern char __init_begin[], __init_end[];
extern char _sinittext[], _einittext[];
extern char __start_ro_after_init[], __end_ro_after_init[];
extern char _end[];
extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
extern char __kprobes_text_start[], __kprobes_text_end[];
extern char __entry_text_start[], __entry_text_end[];
extern char __start_rodata[], __end_rodata[];
extern char __irqentry_text_start[], __irqentry_text_end[];
extern char __softirqentry_text_start[], __softirqentry_text_end[];
extern char __start_once[], __end_once[];
/* Start and end of .ctors section - used for constructor calls. */
extern char __ctors_start[], __ctors_end[];
/* Start and end of .opd section - used for function descriptors. */
extern char __start_opd[], __end_opd[];
/* Start and end of instrumentation protected text section */
extern char __noinstr_text_start[], __noinstr_text_end[];
extern __visible const void __nosave_begin, __nosave_end;
/* Function descriptor handling (if any). Override in asm/sections.h */
#ifndef dereference_function_descriptor
#define dereference_function_descriptor(p) ((void *)(p))
#define dereference_kernel_function_descriptor(p) ((void *)(p))
#endif
/* random extra sections (if any). Override
* in asm/sections.h */
#ifndef arch_is_kernel_text
static inline int arch_is_kernel_text(unsigned long addr)
{
return 0;
}
#endif
#ifndef arch_is_kernel_data
static inline int arch_is_kernel_data(unsigned long addr)
{
return 0;
}
#endif
/*
* Check if an address is part of freed initmem. This is needed on architectures
* with virt == phys kernel mapping, for code that wants to check if an address
* is part of a static object within [_stext, _end]. After initmem is freed,
* memory can be allocated from it, and such allocations would then have
* addresses within the range [_stext, _end].
*/
#ifndef arch_is_kernel_initmem_freed
static inline int arch_is_kernel_initmem_freed(unsigned long addr)
{
return 0;
}
#endif
/**
* memory_contains - checks if an object is contained within a memory region
* @begin: virtual address of the beginning of the memory region
* @end: virtual address of the end of the memory region
* @virt: virtual address of the memory object
* @size: size of the memory object
*
* Returns: true if the object specified by @virt and @size is entirely
* contained within the memory region defined by @begin and @end, false
* otherwise.
*/
static inline bool memory_contains(void *begin, void *end, void *virt,
size_t size)
{
return virt >= begin && virt + size <= end;
}
/**
* memory_intersects - checks if the region occupied by an object intersects
* with another memory region
* @begin: virtual address of the beginning of the memory regien
* @end: virtual address of the end of the memory region
* @virt: virtual address of the memory object
* @size: size of the memory object
*
* Returns: true if an object's memory region, specified by @virt and @size,
* intersects with the region specified by @begin and @end, false otherwise.
*/
static inline bool memory_intersects(void *begin, void *end, void *virt,
size_t size)
{
void *vend = virt + size;
return (virt >= begin && virt < end) || (vend >= begin && vend < end);
}
/**
* init_section_contains - checks if an object is contained within the init
* section
* @virt: virtual address of the memory object
* @size: size of the memory object
*
* Returns: true if the object specified by @virt and @size is entirely
* contained within the init section, false otherwise.
*/
static inline bool init_section_contains(void *virt, size_t size)
{
return memory_contains(__init_begin, __init_end, virt, size);
}
/**
* init_section_intersects - checks if the region occupied by an object
* intersects with the init section
* @virt: virtual address of the memory object
* @size: size of the memory object
*
* Returns: true if an object's memory region, specified by @virt and @size,
* intersects with the init section, false otherwise.
*/
static inline bool init_section_intersects(void *virt, size_t size)
{
return memory_intersects(__init_begin, __init_end, virt, size);
}
/**
* is_kernel_rodata - checks if the pointer address is located in the
* .rodata section
*
* @addr: address to check
*
* Returns: true if the address is located in .rodata, false otherwise.
*/
static inline bool is_kernel_rodata(unsigned long addr)
{
return addr >= (unsigned long)__start_rodata &&
addr < (unsigned long)__end_rodata;
}
#endif /* _ASM_GENERIC_SECTIONS_H_ */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007,2008 Oracle. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *ins_key, struct btrfs_path *path,
int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
struct extent_buffer *dst,
struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
int level, int slot);
static const struct btrfs_csums {
u16 size;
const char name[10];
const char driver[12];
} btrfs_csums[] = {
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
.driver = "blake2b-256" },
};
int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
u16 t = btrfs_super_csum_type(s);
/*
* csum type is validated at mount time
*/
return btrfs_csums[t].size;
}
const char *btrfs_super_csum_name(u16 csum_type)
{
/* csum type is validated at mount time */
return btrfs_csums[csum_type].name;
}
/*
* Return driver name if defined, otherwise the name that's also a valid driver
* name
*/
const char *btrfs_super_csum_driver(u16 csum_type)
{
/* csum type is validated at mount time */
return btrfs_csums[csum_type].driver[0] ? btrfs_csums[csum_type].driver :
btrfs_csums[csum_type].name;
}
size_t __attribute_const__ btrfs_get_num_csums(void)
{
return ARRAY_SIZE(btrfs_csums);
}
struct btrfs_path *btrfs_alloc_path(void)
{
return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
if (!p)
return;
btrfs_release_path(p); kmem_cache_free(btrfs_path_cachep, p);
}
/*
* path release drops references on the extent buffers in the path
* and it drops any locks held by this path
*
* It is safe to call this on paths that no locks or extent buffers held.
*/
noinline void btrfs_release_path(struct btrfs_path *p)
{
int i;
for (i = 0; i < BTRFS_MAX_LEVEL; i++) { p->slots[i] = 0;
if (!p->nodes[i])
continue;
if (p->locks[i]) { btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); p->locks[i] = 0;
}
free_extent_buffer(p->nodes[i]);
p->nodes[i] = NULL;
}
}
/*
* safely gets a reference on the root node of a tree. A lock
* is not taken, so a concurrent writer may put a different node
* at the root of the tree. See btrfs_lock_root_node for the
* looping required.
*
* The extent buffer returned by this has a reference taken, so
* it won't disappear. It may stop being the root of the tree
* at any time because there are no locks held.
*/
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
rcu_read_lock();
eb = rcu_dereference(root->node);
/*
* RCU really hurts here, we could free up the root node because
* it was COWed but we may not get the new root node yet so do
* the inc_not_zero dance and if it doesn't work then
* synchronize_rcu and try again.
*/
if (atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
break;
}
rcu_read_unlock();
synchronize_rcu();
}
return eb;
}
/*
* Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
* just get put onto a simple dirty list. Transaction walks this list to make
* sure they get properly updated on disk.
*/
static void add_root_to_dirty_list(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
!test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
return;
spin_lock(&fs_info->trans_lock);
if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
/* Want the extent tree to be the last on the list */
if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
list_move_tail(&root->dirty_list,
&fs_info->dirty_cowonly_roots);
else
list_move(&root->dirty_list,
&fs_info->dirty_cowonly_roots);
}
spin_unlock(&fs_info->trans_lock);
}
/*
* used by snapshot creation to make a copy of a root for a tree with
* a given objectid. The buffer with the new root node is returned in
* cow_ret, and this func returns zero on success or a negative error code.
*/
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *cow;
int ret = 0;
int level;
struct btrfs_disk_key disk_key;
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != root->last_trans);
level = btrfs_header_level(buf);
if (level == 0)
btrfs_item_key(buf, &disk_key, 0);
else
btrfs_node_key(buf, &disk_key, 0);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
&disk_key, level, buf->start, 0,
BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
copy_extent_buffer_full(cow, buf);
btrfs_set_header_bytenr(cow, cow->start);
btrfs_set_header_generation(cow, trans->transid);
btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
BTRFS_HEADER_FLAG_RELOC);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
else
btrfs_set_header_owner(cow, new_root_objectid);
write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
if (ret) {
btrfs_tree_unlock(cow);
free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
return 0;
}
/*
* check if the tree block can be shared by multiple trees
*/
int btrfs_block_can_be_shared(struct btrfs_root *root,
struct extent_buffer *buf)
{
/*
* Tree blocks not in shareable trees and tree roots are never shared.
* If a block was allocated after the last snapshot and the block was
* not allocated by tree relocation, we know the block is not shared.
*/
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && buf != root->node && buf != root->commit_root && (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item) ||
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
return 1;
return 0;
}
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer *cow,
int *last_ref)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 refs;
u64 owner;
u64 flags;
u64 new_flags = 0;
int ret;
/*
* Backrefs update rules:
*
* Always use full backrefs for extent pointers in tree block
* allocated by tree relocation.
*
* If a shared tree block is no longer referenced by its owner
* tree (btrfs_header_owner(buf) == root->root_key.objectid),
* use full backrefs for extent pointers in tree block.
*
* If a tree block is been relocating
* (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
* use full backrefs for extent pointers in tree block.
* The reason for this is some operations (such as drop tree)
* are only allowed for blocks use full backrefs.
*/
if (btrfs_block_can_be_shared(root, buf)) {
ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
btrfs_header_level(buf), 1,
&refs, &flags);
if (ret)
return ret;
if (refs == 0) {
ret = -EROFS;
btrfs_handle_fs_error(fs_info, ret, NULL);
return ret;
}
} else {
refs = 1;
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
else
flags = 0;
}
owner = btrfs_header_owner(buf);
BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
if (refs > 1) { if ((owner == root->root_key.objectid ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, 1);
if (ret)
return ret;
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_dec_ref(trans, root, buf, 0);
if (ret)
return ret;
ret = btrfs_inc_ref(trans, root, cow, 1);
if (ret)
return ret;
}
new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
} else {
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0); if (ret)
return ret;
}
if (new_flags != 0) {
int level = btrfs_header_level(buf);
ret = btrfs_set_disk_extent_flags(trans, buf,
new_flags, level, 0);
if (ret)
return ret;
}
} else {
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0); if (ret)
return ret;
ret = btrfs_dec_ref(trans, root, buf, 1);
if (ret)
return ret;
}
btrfs_clean_tree_block(buf);
*last_ref = 1;
}
return 0;
}
/*
* does the dirty work in cow of a single block. The parent block (if
* supplied) is updated to point to the new cow copy. The new buffer is marked
* dirty and returned locked. If you modify the block it needs to be marked
* dirty again.
*
* search_start -- an allocation hint for the new block
*
* empty_size -- a hint that you plan on doing more cow. This is the size in
* bytes the allocator should try to find free next to the block it returns.
* This is just a hint and may be ignored by the allocator.
*/
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
u64 search_start, u64 empty_size,
enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
struct extent_buffer *cow;
int level, ret;
int last_ref = 0;
int unlock_orig = 0;
u64 parent_start = 0;
if (*cow_ret == buf)
unlock_orig = 1;
btrfs_assert_tree_locked(buf);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != root->last_trans);
level = btrfs_header_level(buf);
if (level == 0)
btrfs_item_key(buf, &disk_key, 0);
else
btrfs_node_key(buf, &disk_key, 0); if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) parent_start = parent->start; cow = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, &disk_key, level,
search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
/* cow is set to blocking by btrfs_init_new_buffer */
copy_extent_buffer_full(cow, buf);
btrfs_set_header_bytenr(cow, cow->start);
btrfs_set_header_generation(cow, trans->transid);
btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
BTRFS_HEADER_FLAG_RELOC);
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
else
btrfs_set_header_owner(cow, root->root_key.objectid); write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
btrfs_tree_unlock(cow);
free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
btrfs_tree_unlock(cow);
free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
}
if (buf == root->node) { WARN_ON(parent && parent != buf); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
parent_start = buf->start;
atomic_inc(&cow->refs);
ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow);
btrfs_free_tree_block(trans, root, buf, parent_start,
last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent)); btrfs_tree_mod_log_insert_key(parent, parent_slot,
BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
if (last_ref) {
ret = btrfs_tree_mod_log_free_eb(buf); if (ret) { btrfs_tree_unlock(cow);
free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
}
btrfs_free_tree_block(trans, root, buf, parent_start,
last_ref);
}
if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf);
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
return 0;
}
static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
if (btrfs_is_testing(root->fs_info))
return 0;
/* Ensure we can see the FORCE_COW bit */
smp_mb__before_atomic();
/*
* We do not need to cow a block if
* 1) this block is not created or changed in this transaction;
* 2) this block does not belong to TREE_RELOC tree;
* 3) the root is not forced COW.
*
* What is forced COW:
* when we create snapshot during committing the transaction,
* after we've finished copying src root, we must COW the shared
* block to ensure the metadata consistency.
*/
if (btrfs_header_generation(buf) == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
!(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
return 0;
return 1;
}
/*
* cows a single block, see __btrfs_cow_block for the real work.
* This version of it has extra checks so that a block isn't COWed more than
* once per transaction, as long as it hasn't been written yet
*/
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
int ret;
if (test_bit(BTRFS_ROOT_DELETING, &root->state))
btrfs_err(fs_info,
"COW'ing blocks on a fs root that's being dropped");
if (trans->transaction != fs_info->running_transaction) WARN(1, KERN_CRIT "trans %llu running %llu\n",
trans->transid,
fs_info->running_transaction->transid);
if (trans->transid != fs_info->generation) WARN(1, KERN_CRIT "trans %llu running %llu\n",
trans->transid, fs_info->generation);
if (!should_cow_block(trans, root, buf)) {
*cow_ret = buf;
return 0;
}
search_start = buf->start & ~((u64)SZ_1G - 1);
/*
* Before CoWing this block for later modification, check if it's
* the subtree root and do the delayed subtree trace if needed.
*
* Also We don't care about the error, as it's handled internally.
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
parent_slot, cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
/*
* helper function for defrag to decide if two blocks pointed to by a
* node are actually close by
*/
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
if (blocknr < other && other - (blocknr + blocksize) < 32768)
return 1;
if (blocknr > other && blocknr - (other + blocksize) < 32768)
return 1;
return 0;
}
#ifdef __LITTLE_ENDIAN
/*
* Compare two keys, on little-endian the disk order is same as CPU order and
* we can avoid the conversion.
*/
static int comp_keys(const struct btrfs_disk_key *disk_key,
const struct btrfs_key *k2)
{
const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
return btrfs_comp_cpu_keys(k1, k2);
}
#else
/*
* compare two keys in a memcmp fashion
*/
static int comp_keys(const struct btrfs_disk_key *disk,
const struct btrfs_key *k2)
{
struct btrfs_key k1;
btrfs_disk_key_to_cpu(&k1, disk);
return btrfs_comp_cpu_keys(&k1, k2);
}
#endif
/*
* same as comp_keys only with two btrfs_key's
*/
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
if (k1->objectid > k2->objectid)
return 1;
if (k1->objectid < k2->objectid)
return -1;
if (k1->type > k2->type)
return 1;
if (k1->type < k2->type)
return -1;
if (k1->offset > k2->offset)
return 1;
if (k1->offset < k2->offset)
return -1;
return 0;
}
/*
* this is used by the defrag code to go through all the
* leaves pointed to by a node and reallocate them so that
* disk order is close to key order
*/
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
int start_slot, u64 *last_ret,
struct btrfs_key *progress)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *cur;
u64 blocknr;
u64 search_start = *last_ret;
u64 last_block = 0;
u64 other;
u32 parent_nritems;
int end_slot;
int i;
int err = 0;
u32 blocksize;
int progress_passed = 0;
struct btrfs_disk_key disk_key;
WARN_ON(trans->transaction != fs_info->running_transaction);
WARN_ON(trans->transid != fs_info->generation);
parent_nritems = btrfs_header_nritems(parent);
blocksize = fs_info->nodesize;
end_slot = parent_nritems - 1;
if (parent_nritems <= 1)
return 0;
for (i = start_slot; i <= end_slot; i++) {
int close = 1;
btrfs_node_key(parent, &disk_key, i);
if (!progress_passed && comp_keys(&disk_key, progress) < 0)
continue;
progress_passed = 1;
blocknr = btrfs_node_blockptr(parent, i);
if (last_block == 0)
last_block = blocknr;
if (i > 0) {
other = btrfs_node_blockptr(parent, i - 1);
close = close_blocks(blocknr, other, blocksize);
}
if (!close && i < end_slot) {
other = btrfs_node_blockptr(parent, i + 1);
close = close_blocks(blocknr, other, blocksize);
}
if (close) {
last_block = blocknr;
continue;
}
cur = btrfs_read_node_slot(parent, i);
if (IS_ERR(cur))
return PTR_ERR(cur);
if (search_start == 0)
search_start = last_block;
btrfs_tree_lock(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
(end_slot - i) * blocksize),
BTRFS_NESTING_COW);
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
break;
}
search_start = cur->start;
last_block = cur->start;
*last_ret = search_start;
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
}
return err;
}
/*
* search for key in the extent_buffer. The items start at offset p,
* and they are item_size apart.
*
* the slot in the array is returned via slot, and it points to
* the place where you would insert key if it is not found in
* the array.
*
* Slot may point to total number of items if the key is bigger than
* all of the keys
*/
static noinline int generic_bin_search(struct extent_buffer *eb,
unsigned long p, int item_size,
const struct btrfs_key *key, int *slot)
{
int low = 0;
int high = btrfs_header_nritems(eb);
int ret;
const int key_size = sizeof(struct btrfs_disk_key);
if (low > high) {
btrfs_err(eb->fs_info,
"%s: low (%d) > high (%d) eb %llu owner %llu level %d",
__func__, low, high, eb->start,
btrfs_header_owner(eb), btrfs_header_level(eb));
return -EINVAL;
}
while (low < high) {
unsigned long oip;
unsigned long offset;
struct btrfs_disk_key *tmp;
struct btrfs_disk_key unaligned;
int mid;
mid = (low + high) / 2;
offset = p + mid * item_size;
oip = offset_in_page(offset);
if (oip + key_size <= PAGE_SIZE) {
const unsigned long idx = get_eb_page_index(offset);
char *kaddr = page_address(eb->pages[idx]);
oip = get_eb_offset_in_page(eb, offset);
tmp = (struct btrfs_disk_key *)(kaddr + oip);
} else {
read_extent_buffer(eb, &unaligned, offset, key_size);
tmp = &unaligned;
}
ret = comp_keys(tmp, key);
if (ret < 0)
low = mid + 1; else if (ret > 0)
high = mid;
else {
*slot = mid;
return 0;
}
}
*slot = low; return 1;
}
/*
* simple bin_search frontend that does the right thing for
* leaves vs nodes
*/
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
int *slot)
{
if (btrfs_header_level(eb) == 0) return generic_bin_search(eb,
offsetof(struct btrfs_leaf, items),
sizeof(struct btrfs_item), key, slot);
else
return generic_bin_search(eb,
offsetof(struct btrfs_node, ptrs),
sizeof(struct btrfs_key_ptr), key, slot);
}
static void root_add_used(struct btrfs_root *root, u32 size)
{
spin_lock(&root->accounting_lock);
btrfs_set_root_used(&root->root_item,
btrfs_root_used(&root->root_item) + size);
spin_unlock(&root->accounting_lock);
}
static void root_sub_used(struct btrfs_root *root, u32 size)
{
spin_lock(&root->accounting_lock);
btrfs_set_root_used(&root->root_item,
btrfs_root_used(&root->root_item) - size);
spin_unlock(&root->accounting_lock);
}
/* given a node and slot number, this reads the blocks it points to. The
* extent buffer is returned with a reference taken (but unlocked).
*/
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
int slot)
{
int level = btrfs_header_level(parent);
struct extent_buffer *eb;
struct btrfs_key first_key;
if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); BUG_ON(level == 0);
btrfs_node_key_to_cpu(parent, &first_key, slot);
eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
btrfs_header_owner(parent),
btrfs_node_ptr_generation(parent, slot),
level - 1, &first_key);
if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
eb = ERR_PTR(-EIO);
}
return eb;
}
/*
* node level balancing, used to make sure nodes are in proper order for
* item deletion. We balance from the top down, so we have to make sure
* that a deletion won't leave an node completely empty later on.
*/
static noinline int balance_level(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *right = NULL;
struct extent_buffer *mid;
struct extent_buffer *left = NULL;
struct extent_buffer *parent = NULL;
int ret = 0;
int wret;
int pslot;
int orig_slot = path->slots[level];
u64 orig_ptr;
ASSERT(level > 0);
mid = path->nodes[level];
WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK); WARN_ON(btrfs_header_generation(mid) != trans->transid);
orig_ptr = btrfs_node_blockptr(mid, orig_slot);
if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; pslot = path->slots[level + 1];
}
/*
* deal with the case where there is only one pointer in the root
* by promoting the node below to a root
*/
if (!parent) {
struct extent_buffer *child;
if (btrfs_header_nritems(mid) != 1) return 0;
/* promote the child to a root */
child = btrfs_read_node_slot(mid, 0);
if (IS_ERR(child)) {
ret = PTR_ERR(child);
btrfs_handle_fs_error(fs_info, ret, NULL);
goto enospc;
}
btrfs_tree_lock(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
goto enospc;
}
ret = btrfs_tree_mod_log_insert_root(root->node, child, true); BUG_ON(ret < 0); rcu_assign_pointer(root->node, child);
add_root_to_dirty_list(root);
btrfs_tree_unlock(child);
path->locks[level] = 0;
path->nodes[level] = NULL;
btrfs_clean_tree_block(mid);
btrfs_tree_unlock(mid);
/* once for the path */
free_extent_buffer(mid);
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, root, mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
return 0;
}
if (btrfs_header_nritems(mid) >
BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
return 0;
left = btrfs_read_node_slot(parent, pslot - 1);
if (IS_ERR(left))
left = NULL; if (left) { __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left,
BTRFS_NESTING_LEFT_COW);
if (wret) {
ret = wret;
goto enospc;
}
}
right = btrfs_read_node_slot(parent, pslot + 1);
if (IS_ERR(right))
right = NULL; if (right) { __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right,
BTRFS_NESTING_RIGHT_COW);
if (wret) {
ret = wret;
goto enospc;
}
}
/* first, try to make some room in the middle buffer */
if (left) { orig_slot += btrfs_header_nritems(left);
wret = push_node_left(trans, left, mid, 1);
if (wret < 0)
ret = wret;
}
/*
* then try to empty the right most buffer into the middle
*/
if (right) { wret = push_node_left(trans, mid, right, 1); if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (btrfs_header_nritems(right) == 0) { btrfs_clean_tree_block(right);
btrfs_tree_unlock(right);
del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
btrfs_free_tree_block(trans, root, right, 0, 1);
free_extent_buffer_stale(right);
right = NULL;
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
}
}
if (btrfs_header_nritems(mid) == 1) {
/*
* we're not allowed to leave a node with one item in the
* tree during a delete. A deletion from lower in the tree
* could try to delete the only pointer in this node.
* So, pull some keys from the left.
* There has to be a left pointer at this point because
* otherwise we would have pulled some pointers from the
* right
*/
if (!left) {
ret = -EROFS;
btrfs_handle_fs_error(fs_info, ret, NULL);
goto enospc;
}
wret = balance_node_right(trans, mid, left);
if (wret < 0) {
ret = wret;
goto enospc;
}
if (wret == 1) { wret = push_node_left(trans, left, mid, 1);
if (wret < 0)
ret = wret;
}
BUG_ON(wret == 1);
}
if (btrfs_header_nritems(mid) == 0) { btrfs_clean_tree_block(mid);
btrfs_tree_unlock(mid);
del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
btrfs_free_tree_block(trans, root, mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
}
/* update the path */
if (left) { if (btrfs_header_nritems(left) > orig_slot) { atomic_inc(&left->refs);
/* left was locked after cow */
path->nodes[level] = left;
path->slots[level + 1] -= 1;
path->slots[level] = orig_slot;
if (mid) {
btrfs_tree_unlock(mid);
free_extent_buffer(mid);
}
} else {
orig_slot -= btrfs_header_nritems(left);
path->slots[level] = orig_slot;
}
}
/* double check we haven't messed things up */
if (orig_ptr !=
btrfs_node_blockptr(path->nodes[level], path->slots[level])) BUG();
enospc:
if (right) { btrfs_tree_unlock(right);
free_extent_buffer(right);
}
if (left) { if (path->nodes[level] != left) btrfs_tree_unlock(left); free_extent_buffer(left);
}
return ret;
}
/* Node balancing for insertion. Here we only split or push nodes around
* when they are completely full. This is also done top down, so we
* have to be pessimistic.
*/
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *right = NULL;
struct extent_buffer *mid;
struct extent_buffer *left = NULL;
struct extent_buffer *parent = NULL;
int ret = 0;
int wret;
int pslot;
int orig_slot = path->slots[level];
if (level == 0)
return 1;
mid = path->nodes[level];
WARN_ON(btrfs_header_generation(mid) != trans->transid);
if (level < BTRFS_MAX_LEVEL - 1) {
parent = path->nodes[level + 1];
pslot = path->slots[level + 1];
}
if (!parent)
return 1;
left = btrfs_read_node_slot(parent, pslot - 1);
if (IS_ERR(left))
left = NULL;
/* first, try to make some room in the middle buffer */
if (left) {
u32 left_nr;
__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
pslot - 1, &left,
BTRFS_NESTING_LEFT_COW);
if (ret)
wret = 1;
else {
wret = push_node_left(trans, left, mid, 0);
}
}
if (wret < 0)
ret = wret;
if (wret == 0) {
struct btrfs_disk_key disk_key;
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(left) > orig_slot) {
path->nodes[level] = left;
path->slots[level + 1] -= 1;
path->slots[level] = orig_slot;
btrfs_tree_unlock(mid);
free_extent_buffer(mid);
} else {
orig_slot -=
btrfs_header_nritems(left);
path->slots[level] = orig_slot;
btrfs_tree_unlock(left);
free_extent_buffer(left);
}
return 0;
}
btrfs_tree_unlock(left);
free_extent_buffer(left);
}
right = btrfs_read_node_slot(parent, pslot + 1);
if (IS_ERR(right))
right = NULL;
/*
* then try to empty the right most buffer into the middle
*/
if (right) {
u32 right_nr;
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
&right, BTRFS_NESTING_RIGHT_COW);
if (ret)
wret = 1;
else {
wret = balance_node_right(trans, right, mid);
}
}
if (wret < 0)
ret = wret;
if (wret == 0) {
struct btrfs_disk_key disk_key;
btrfs_node_key(right, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(mid) <= orig_slot) {
path->nodes[level] = right;
path->slots[level + 1] += 1;
path->slots[level] = orig_slot -
btrfs_header_nritems(mid);
btrfs_tree_unlock(mid);
free_extent_buffer(mid);
} else {
btrfs_tree_unlock(right);
free_extent_buffer(right);
}
return 0;
}
btrfs_tree_unlock(right);
free_extent_buffer(right);
}
return 1;
}
/*
* readahead one full node of leaves, finding things that are close
* to the block in 'slot', and triggering ra on them.
*/
static void reada_for_search(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
int level, int slot, u64 objectid)
{
struct extent_buffer *node;
struct btrfs_disk_key disk_key;
u32 nritems;
u64 search;
u64 target;
u64 nread = 0;
u64 nread_max;
u32 nr;
u32 blocksize;
u32 nscan = 0;
if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
return;
if (!path->nodes[level])
return;
node = path->nodes[level];
/*
* Since the time between visiting leaves is much shorter than the time
* between visiting nodes, limit read ahead of nodes to 1, to avoid too
* much IO at once (possibly random).
*/
if (path->reada == READA_FORWARD_ALWAYS) {
if (level > 1)
nread_max = node->fs_info->nodesize;
else
nread_max = SZ_128K;
} else {
nread_max = SZ_64K;
}
search = btrfs_node_blockptr(node, slot);
blocksize = fs_info->nodesize;
if (path->reada != READA_FORWARD_ALWAYS) {
struct extent_buffer *eb;
eb = find_extent_buffer(fs_info, search);
if (eb) {
free_extent_buffer(eb);
return;
}
}
target = search;
nritems = btrfs_header_nritems(node);
nr = slot;
while (1) {
if (path->reada == READA_BACK) {
if (nr == 0)
break;
nr--;
} else if (path->reada == READA_FORWARD ||
path->reada == READA_FORWARD_ALWAYS) {
nr++;
if (nr >= nritems)
break;
}
if (path->reada == READA_BACK && objectid) {
btrfs_node_key(node, &disk_key, nr);
if (btrfs_disk_key_objectid(&disk_key) != objectid)
break;
}
search = btrfs_node_blockptr(node, nr);
if (path->reada == READA_FORWARD_ALWAYS ||
(search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
btrfs_readahead_node_child(node, nr);
nread += blocksize;
}
nscan++;
if (nread > nread_max || nscan > 32)
break;
}
}
static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
struct extent_buffer *parent;
int slot;
int nritems;
parent = path->nodes[level + 1];
if (!parent)
return;
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
if (slot > 0)
btrfs_readahead_node_child(parent, slot - 1); if (slot + 1 < nritems) btrfs_readahead_node_child(parent, slot + 1);
}
/*
* when we walk down the tree, it is usually safe to unlock the higher layers
* in the tree. The exceptions are when our path goes through slot 0, because
* operations on the tree might require changing key pointers higher up in the
* tree.
*
* callers might also have set path->keep_locks, which tells this code to keep
* the lock if the path points to the last slot in the block. This is part of
* walking through the tree, and selecting the next slot in the higher block.
*
* lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
* if lowest_unlock is 1, level 0 won't be unlocked
*/
static noinline void unlock_up(struct btrfs_path *path, int level,
int lowest_unlock, int min_write_lock_level,
int *write_lock_level)
{
int i;
int skip_level = level;
int no_skips = 0;
struct extent_buffer *t;
for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i])
break;
if (!path->locks[i])
break;
if (!no_skips && path->slots[i] == 0) { skip_level = i + 1;
continue;
}
if (!no_skips && path->keep_locks) {
u32 nritems;
t = path->nodes[i];
nritems = btrfs_header_nritems(t); if (nritems < 1 || path->slots[i] >= nritems - 1) { skip_level = i + 1;
continue;
}
}
if (skip_level < i && i >= lowest_unlock)
no_skips = 1;
t = path->nodes[i];
if (i >= lowest_unlock && i > skip_level) { btrfs_tree_unlock_rw(t, path->locks[i]); path->locks[i] = 0; if (write_lock_level &&
i > min_write_lock_level &&
i <= *write_lock_level) { *write_lock_level = i - 1;
}
}
}
}
/*
* helper function for btrfs_search_slot. The goal is to find a block
* in cache without setting the path to blocking. If we find the block
* we return zero and the path is unchanged.
*
* If we can't find the block, we set the path blocking and do some
* reada. -EAGAIN is returned and the search must be repeated.
*/
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
struct extent_buffer **eb_ret, int level, int slot,
const struct btrfs_key *key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 blocknr;
u64 gen;
struct extent_buffer *tmp;
struct btrfs_key first_key;
int ret;
int parent_level;
blocknr = btrfs_node_blockptr(*eb_ret, slot);
gen = btrfs_node_ptr_generation(*eb_ret, slot);
parent_level = btrfs_header_level(*eb_ret);
btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
if (p->reada == READA_FORWARD_ALWAYS) reada_for_search(fs_info, p, level, slot, key->objectid);
/* first we do an atomic uptodate check */
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
/*
* Do extra check for first_key, eb can be stale due to
* being cached, read from scrub, or have multiple
* parents (shared tree blocks).
*/
if (btrfs_verify_level_key(tmp,
parent_level - 1, &first_key, gen)) {
free_extent_buffer(tmp);
return -EUCLEAN;
}
*eb_ret = tmp;
return 0;
}
/* now we're allowed to do a blocking uptodate check */
ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
if (!ret) {
*eb_ret = tmp; return 0;
}
free_extent_buffer(tmp);
btrfs_release_path(p);
return -EIO;
}
/*
* reduce lock contention at high levels
* of the btree by dropping locks before
* we read. Don't release the lock on the current
* level because we need to walk this node to figure
* out which blocks to read.
*/
btrfs_unlock_up_safe(p, level + 1);
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
ret = -EAGAIN;
tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
gen, parent_level - 1, &first_key);
if (!IS_ERR(tmp)) {
/*
* If the read above didn't mark this buffer up to date,
* it will never end up being up to date. Set ret to EIO now
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
if (!extent_buffer_uptodate(tmp))
ret = -EIO;
free_extent_buffer(tmp);
} else {
ret = PTR_ERR(tmp);
}
btrfs_release_path(p);
return ret;
}
/*
* helper function for btrfs_search_slot. This does all of the checks
* for node-level blocks and does any balancing required based on
* the ins_len.
*
* If no extra work was required, zero is returned. If we had to
* drop the path, -EAGAIN is returned and btrfs_search_slot must
* start over
*/
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *p,
struct extent_buffer *b, int level, int ins_len,
int *write_lock_level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
if (*write_lock_level < level + 1) {
*write_lock_level = level + 1;
btrfs_release_path(p);
return -EAGAIN;
}
reada_for_balance(p, level);
ret = split_node(trans, root, p, level);
b = p->nodes[level];
} else if (ins_len < 0 && btrfs_header_nritems(b) <
BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
if (*write_lock_level < level + 1) {
*write_lock_level = level + 1;
btrfs_release_path(p);
return -EAGAIN;
}
reada_for_balance(p, level);
ret = balance_level(trans, root, p, level);
if (ret)
return ret;
b = p->nodes[level];
if (!b) {
btrfs_release_path(p);
return -EAGAIN;
}
BUG_ON(btrfs_header_nritems(b) == 1);
}
return ret;
}
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
u64 iobjectid, u64 ioff, u8 key_type,
struct btrfs_key *found_key)
{
int ret;
struct btrfs_key key;
struct extent_buffer *eb;
ASSERT(path);
ASSERT(found_key);
key.type = key_type;
key.objectid = iobjectid;
key.offset = ioff;
ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
if (ret < 0)
return ret;
eb = path->nodes[0];
if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
ret = btrfs_next_leaf(fs_root, path);
if (ret)
return ret;
eb = path->nodes[0];
}
btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
if (found_key->type != key.type ||
found_key->objectid != key.objectid)
return 1;
return 0;
}
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
struct btrfs_path *p,
int write_lock_level)
{
struct extent_buffer *b;
int root_lock = 0;
int level = 0;
if (p->search_commit_root) { b = root->commit_root;
atomic_inc(&b->refs);
level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
* p->search_commit_root = 1.
*/
ASSERT(p->skip_locking == 1);
goto out;
}
if (p->skip_locking) { b = btrfs_root_node(root);
level = btrfs_header_level(b);
goto out;
}
/* We try very hard to do read locks on the root */
root_lock = BTRFS_READ_LOCK;
/*
* If the level is set to maximum, we can skip trying to get the read
* lock.
*/
if (write_lock_level < BTRFS_MAX_LEVEL) {
/*
* We don't know the level of the root node until we actually
* have it read locked
*/
b = btrfs_read_lock_root_node(root);
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
/* Whoops, must trade for write lock */
btrfs_tree_read_unlock(b);
free_extent_buffer(b);
}
b = btrfs_lock_root_node(root);
root_lock = BTRFS_WRITE_LOCK;
/* The level might have changed, check again */
level = btrfs_header_level(b);
out:
/*
* The root may have failed to write out at some point, and thus is no
* longer valid, return an error in this case.
*/
if (!extent_buffer_uptodate(b)) {
if (root_lock)
btrfs_tree_unlock_rw(b, root_lock);
free_extent_buffer(b);
return ERR_PTR(-EIO);
}
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
/*
* Callers are responsible for dropping b's references.
*/
return b;
}
/*
* Replace the extent buffer at the lowest level of the path with a cloned
* version. The purpose is to be able to use it safely, after releasing the
* commit root semaphore, even if relocation is happening in parallel, the
* transaction used for relocation is committed and the extent buffer is
* reallocated in the next transaction.
*
* This is used in a context where the caller does not prevent transaction
* commits from happening, either by holding a transaction handle or holding
* some lock, while it's doing searches through a commit root.
* At the moment it's only used for send operations.
*/
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
const int i = path->lowest_level;
const int slot = path->slots[i];
struct extent_buffer *lowest = path->nodes[i];
struct extent_buffer *clone;
ASSERT(path->need_commit_sem);
if (!lowest)
return 0;
lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
clone = btrfs_clone_extent_buffer(lowest);
if (!clone)
return -ENOMEM;
btrfs_release_path(path);
path->nodes[i] = clone;
path->slots[i] = slot;
return 0;
}
/*
* btrfs_search_slot - look for a key in a tree and perform necessary
* modifications to preserve tree invariants.
*
* @trans: Handle of transaction, used when modifying the tree
* @p: Holds all btree nodes along the search path
* @root: The root node of the tree
* @key: The key we are looking for
* @ins_len: Indicates purpose of search:
* >0 for inserts it's size of item inserted (*)
* <0 for deletions
* 0 for plain searches, not modifying the tree
*
* (*) If size of item inserted doesn't include
* sizeof(struct btrfs_item), then p->search_for_extension must
* be set.
* @cow: boolean should CoW operations be performed. Must always be 1
* when modifying the tree.
*
* If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
* If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
*
* If @key is found, 0 is returned and you can find the item in the leaf level
* of the path (level 0)
*
* If @key isn't found, 1 is returned and the leaf level of the path (level 0)
* points to the slot where it should be inserted
*
* If an error is encountered while searching the tree a negative error number
* is returned
*/
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_path *p,
int ins_len, int cow)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
int slot;
int ret;
int err;
int level;
int lowest_unlock = 1;
/* everything at write_lock_level or lower must be write locked */
int write_lock_level = 0;
u8 lowest_level = 0;
int min_write_lock_level;
int prev_cmp;
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); BUG_ON(!cow && ins_len); if (ins_len < 0) {
lowest_unlock = 2;
/* when we are removing items, we might have to go up to level
* two as we update tree pointers Make sure we keep write
* for those levels as well
*/
write_lock_level = 2; } else if (ins_len > 0) {
/*
* for inserting items, make sure we have a write lock on
* level 1 so we can update keys
*/
write_lock_level = 1;
}
if (!cow) write_lock_level = -1; if (cow && (p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL;
min_write_lock_level = write_lock_level;
if (p->need_commit_sem) {
ASSERT(p->search_commit_root);
down_read(&fs_info->commit_root_sem);
}
again:
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
goto done;
}
while (b) {
int dec = 0;
level = btrfs_header_level(b);
if (cow) {
bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
/*
* if we don't really need to cow this block
* then we don't want to set the path blocking,
* so we test it here
*/
if (!should_cow_block(trans, root, b))
goto cow_done;
/*
* must have write locks on this node and the
* parent
*/
if (level > write_lock_level || (level + 1 > write_lock_level &&
level + 1 < BTRFS_MAX_LEVEL &&
p->nodes[level + 1])) {
write_lock_level = level + 1;
btrfs_release_path(p);
goto again;
}
if (last_level) err = btrfs_cow_block(trans, root, b, NULL, 0,
&b,
BTRFS_NESTING_COW);
else
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
p->slots[level + 1], &b,
BTRFS_NESTING_COW);
if (err) {
ret = err;
goto done;
}
}
cow_done:
p->nodes[level] = b;
/*
* Leave path with blocking locks to avoid massive
* lock context switch, this is made on purpose.
*/
/*
* we have a lock on b and as long as we aren't changing
* the tree, there is no way to for the items in b to change.
* It is safe to drop the lock on our parent before we
* go through the expensive btree search on b.
*
* If we're inserting or deleting (ins_len != 0), then we might
* be changing slot zero, which may require changing the parent.
* So, we can't drop the lock until after we know which slot
* we're operating on.
*/
if (!ins_len && !p->keep_locks) { int u = level + 1; if (u < BTRFS_MAX_LEVEL && p->locks[u]) { btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]); p->locks[u] = 0;
}
}
/*
* If btrfs_bin_search returns an exact match (prev_cmp == 0)
* we can safely assume the target key will always be in slot 0
* on lower levels due to the invariants BTRFS' btree provides,
* namely that a btrfs_key_ptr entry always points to the
* lowest key in the child node, thus we can skip searching
* lower levels
*/
if (prev_cmp == 0) { slot = 0;
ret = 0;
} else {
ret = btrfs_bin_search(b, key, &slot);
prev_cmp = ret;
if (ret < 0)
goto done;
}
if (level == 0) { p->slots[level] = slot;
/*
* Item key already exists. In this case, if we are
* allowed to insert the item (for example, in dir_item
* case, item key collision is allowed), it will be
* merged with the original item. Only the item size
* grows, no new btrfs item will be added. If
* search_for_extension is not set, ins_len already
* accounts the size btrfs_item, deduct it here so leaf
* space check will be correct.
*/
if (ret == 0 && ins_len > 0 && !p->search_for_extension) { ASSERT(ins_len >= sizeof(struct btrfs_item));
ins_len -= sizeof(struct btrfs_item);
}
if (ins_len > 0 && btrfs_leaf_free_space(b) < ins_len) { if (write_lock_level < 1) { write_lock_level = 1;
btrfs_release_path(p);
goto again;
}
err = split_leaf(trans, root, key,
p, ins_len, ret == 0);
BUG_ON(err > 0); if (err) {
ret = err;
goto done;
}
}
if (!p->search_for_split) unlock_up(p, level, lowest_unlock,
min_write_lock_level, NULL);
goto done;
}
if (ret && slot > 0) {
dec = 1;
slot--;
}
p->slots[level] = slot;
err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
&write_lock_level);
if (err == -EAGAIN)
goto again;
if (err) {
ret = err;
goto done;
}
b = p->nodes[level];
slot = p->slots[level];
/*
* Slot 0 is special, if we change the key we have to update
* the parent pointer which means we must have a write lock on
* the parent
*/
if (slot == 0 && ins_len && write_lock_level < level + 1) { write_lock_level = level + 1;
btrfs_release_path(p);
goto again;
}
unlock_up(p, level, lowest_unlock, min_write_lock_level,
&write_lock_level);
if (level == lowest_level) {
if (dec) p->slots[level]++;
goto done;
}
err = read_block_for_search(root, p, &b, level, slot, key);
if (err == -EAGAIN)
goto again;
if (err) {
ret = err;
goto done;
}
if (!p->skip_locking) { level = btrfs_header_level(b);
if (level <= write_lock_level) {
btrfs_tree_lock(b);
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
btrfs_tree_read_lock(b); p->locks[level] = BTRFS_READ_LOCK;
}
p->nodes[level] = b;
}
}
ret = 1;
done:
if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); if (p->need_commit_sem) {
int ret2;
ret2 = finish_need_commit_sem_search(p);
up_read(&fs_info->commit_root_sem);
if (ret2)
ret = ret2;
}
return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
/*
* Like btrfs_search_slot, this looks for a key in the given tree. It uses the
* current state of the tree together with the operations recorded in the tree
* modification log to search for the key in a previous version of this tree, as
* denoted by the time_seq parameter.
*
* Naturally, there is no support for insert, delete or cow operations.
*
* The resulting path and return value will be set up as if we called
* btrfs_search_slot at that point in time with ins_len and cow both set to 0.
*/
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
struct btrfs_path *p, u64 time_seq)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
int slot;
int ret;
int err;
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
if (p->search_commit_root) {
BUG_ON(time_seq);
return btrfs_search_slot(NULL, root, key, p, 0, 0);
}
again:
b = btrfs_get_old_root(root, time_seq);
if (!b) {
ret = -EIO;
goto done;
}
level = btrfs_header_level(b);
p->locks[level] = BTRFS_READ_LOCK;
while (b) {
int dec = 0;
level = btrfs_header_level(b);
p->nodes[level] = b;
/*
* we have a lock on b and as long as we aren't changing
* the tree, there is no way to for the items in b to change.
* It is safe to drop the lock on our parent before we
* go through the expensive btree search on b.
*/
btrfs_unlock_up_safe(p, level + 1);
ret = btrfs_bin_search(b, key, &slot);
if (ret < 0)
goto done;
if (level == 0) {
p->slots[level] = slot;
unlock_up(p, level, lowest_unlock, 0, NULL);
goto done;
}
if (ret && slot > 0) {
dec = 1;
slot--;
}
p->slots[level] = slot;
unlock_up(p, level, lowest_unlock, 0, NULL);
if (level == lowest_level) {
if (dec)
p->slots[level]++;
goto done;
}
err = read_block_for_search(root, p, &b, level, slot, key);
if (err == -EAGAIN)
goto again;
if (err) {
ret = err;
goto done;
}
level = btrfs_header_level(b);
btrfs_tree_read_lock(b);
b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
if (!b) {
ret = -ENOMEM;
goto done;
}
p->locks[level] = BTRFS_READ_LOCK;
p->nodes[level] = b;
}
ret = 1;
done:
if (ret < 0)
btrfs_release_path(p);
return ret;
}
/*
* helper to use instead of search slot if no exact match is needed but
* instead the next or previous item should be returned.
* When find_higher is true, the next higher item is returned, the next lower
* otherwise.
* When return_any and find_higher are both true, and no higher item is found,
* return the next lower instead.
* When return_any is true and find_higher is false, and no lower item is found,
* return the next higher instead.
* It returns 0 if any item is found, 1 if none is found (tree empty), and
* < 0 on error
*/
int btrfs_search_slot_for_read(struct btrfs_root *root,
const struct btrfs_key *key,
struct btrfs_path *p, int find_higher,
int return_any)
{
int ret;
struct extent_buffer *leaf;
again:
ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
if (ret <= 0)
return ret;
/*
* a return value of 1 means the path is at the position where the
* item should be inserted. Normally this is the next bigger item,
* but in case the previous item is the last in a leaf, path points
* to the first free slot in the previous leaf, i.e. at an invalid
* item.
*/
leaf = p->nodes[0]; if (find_higher) {
if (p->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, p);
if (ret <= 0)
return ret;
if (!return_any)
return 1;
/*
* no higher item found, return the next
* lower instead
*/
return_any = 0;
find_higher = 0;
btrfs_release_path(p);
goto again;
}
} else {
if (p->slots[0] == 0) { ret = btrfs_prev_leaf(root, p);
if (ret < 0)
return ret;
if (!ret) { leaf = p->nodes[0];
if (p->slots[0] == btrfs_header_nritems(leaf))
p->slots[0]--;
return 0;
}
if (!return_any)
return 1;
/*
* no lower item found, return the next
* higher instead
*/
return_any = 0;
find_higher = 1;
btrfs_release_path(p);
goto again;
} else {
--p->slots[0];
}
}
return 0;
}
/*
* Execute search and call btrfs_previous_item to traverse backwards if the item
* was not found.
*
* Return 0 if found, 1 if not found and < 0 if error.
*/
int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_path *path)
{
int ret;
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
if (ret > 0)
ret = btrfs_previous_item(root, path, key->objectid, key->type); if (ret == 0) btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); return ret;
}
/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node is points to 'key'.
* This is used after shifting pointers to the left, so it stops
* fixing up pointers when a given leaf/node is not in slot 0 of the
* higher levels
*
*/
static void fixup_low_keys(struct btrfs_path *path,
struct btrfs_disk_key *key, int level)
{
int i;
struct extent_buffer *t;
int ret;
for (i = level; i < BTRFS_MAX_LEVEL; i++) { int tslot = path->slots[i];
if (!path->nodes[i])
break;
t = path->nodes[i];
ret = btrfs_tree_mod_log_insert_key(t, tslot,
BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
if (tslot != 0)
break;
}
}
/*
* update item key.
*
* This function isn't completely safe. It's the caller's responsibility
* that the new key won't break the order
*/
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const struct btrfs_key *new_key)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *eb;
int slot;
eb = path->nodes[0];
slot = path->slots[0];
if (slot > 0) {
btrfs_item_key(eb, &disk_key, slot - 1);
if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
btrfs_crit(fs_info,
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
slot, btrfs_disk_key_objectid(&disk_key),
btrfs_disk_key_type(&disk_key),
btrfs_disk_key_offset(&disk_key),
new_key->objectid, new_key->type,
new_key->offset);
btrfs_print_leaf(eb);
BUG();
}
}
if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1);
if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
btrfs_crit(fs_info,
"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
slot, btrfs_disk_key_objectid(&disk_key),
btrfs_disk_key_type(&disk_key),
btrfs_disk_key_offset(&disk_key),
new_key->objectid, new_key->type,
new_key->offset);
btrfs_print_leaf(eb);
BUG();
}
}
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(eb, &disk_key, slot);
btrfs_mark_buffer_dirty(eb);
if (slot == 0) fixup_low_keys(path, &disk_key, 1);
}
/*
* Check key order of two sibling extent buffers.
*
* Return true if something is wrong.
* Return false if everything is fine.
*
* Tree-checker only works inside one tree block, thus the following
* corruption can not be detected by tree-checker:
*
* Leaf @left | Leaf @right
* --------------------------------------------------------------
* | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
*
* Key f6 in leaf @left itself is valid, but not valid when the next
* key in leaf @right is 7.
* This can only be checked at tree block merge time.
* And since tree checker has ensured all key order in each tree block
* is correct, we only need to bother the last key of @left and the first
* key of @right.
*/
static bool check_sibling_keys(struct extent_buffer *left,
struct extent_buffer *right)
{
struct btrfs_key left_last;
struct btrfs_key right_first;
int level = btrfs_header_level(left);
int nr_left = btrfs_header_nritems(left);
int nr_right = btrfs_header_nritems(right);
/* No key to check in one of the tree blocks */
if (!nr_left || !nr_right)
return false;
if (level) {
btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
btrfs_node_key_to_cpu(right, &right_first, 0);
} else {
btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
btrfs_item_key_to_cpu(right, &right_first, 0);
}
if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
btrfs_crit(left->fs_info,
"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
left_last.objectid, left_last.type,
left_last.offset, right_first.objectid,
right_first.type, right_first.offset);
return true;
}
return false;
}
/*
* try to push data from one node into the next node left in the
* tree.
*
* returns 0 if some ptrs were pushed left, < 0 if there was some horrible
* error, and > 0 if there was no room in the left hand block.
*/
static int push_node_left(struct btrfs_trans_handle *trans,
struct extent_buffer *dst,
struct extent_buffer *src, int empty)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int push_items = 0;
int src_nritems;
int dst_nritems;
int ret = 0;
src_nritems = btrfs_header_nritems(src);
dst_nritems = btrfs_header_nritems(dst);
push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
WARN_ON(btrfs_header_generation(src) != trans->transid);
WARN_ON(btrfs_header_generation(dst) != trans->transid);
if (!empty && src_nritems <= 8)
return 1;
if (push_items <= 0)
return 1;
if (empty) {
push_items = min(src_nritems, push_items);
if (push_items < src_nritems) {
/* leave at least 8 pointers in the node if
* we aren't going to empty it
*/
if (src_nritems - push_items < 8) {
if (push_items <= 8)
return 1;
push_items -= 8;
}
}
} else
push_items = min(src_nritems - 8, push_items);
/* dst is the left eb, src is the middle eb */
if (check_sibling_keys(dst, src)) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
}
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(dst_nritems),
btrfs_node_key_ptr_offset(0),
push_items * sizeof(struct btrfs_key_ptr));
if (push_items < src_nritems) {
/*
* Don't call btrfs_tree_mod_log_insert_move() here, key removal
* was already fully logged by btrfs_tree_mod_log_eb_copy() above.
*/
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
(src_nritems - push_items) *
sizeof(struct btrfs_key_ptr));
}
btrfs_set_header_nritems(src, src_nritems - push_items);
btrfs_set_header_nritems(dst, dst_nritems + push_items);
btrfs_mark_buffer_dirty(src);
btrfs_mark_buffer_dirty(dst);
return ret;
}
/*
* try to push data from one node into the next node right in the
* tree.
*
* returns 0 if some ptrs were pushed, < 0 if there was some horrible
* error, and > 0 if there was no room in the right hand block.
*
* this will only push up to 1/2 the contents of the left node over
*/
static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst,
struct extent_buffer *src)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int push_items = 0;
int max_push;
int src_nritems;
int dst_nritems;
int ret = 0;
WARN_ON(btrfs_header_generation(src) != trans->transid);
WARN_ON(btrfs_header_generation(dst) != trans->transid);
src_nritems = btrfs_header_nritems(src);
dst_nritems = btrfs_header_nritems(dst);
push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
if (push_items <= 0)
return 1;
if (src_nritems < 4)
return 1;
max_push = src_nritems / 2 + 1;
/* don't try to empty the node */
if (max_push >= src_nritems)
return 1;
if (max_push < push_items)
push_items = max_push;
/* dst is the right eb, src is the middle eb */
if (check_sibling_keys(src, dst)) {
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
return ret;
}
ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
}
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(src_nritems - push_items),
push_items * sizeof(struct btrfs_key_ptr));
btrfs_set_header_nritems(src, src_nritems - push_items);
btrfs_set_header_nritems(dst, dst_nritems + push_items);
btrfs_mark_buffer_dirty(src);
btrfs_mark_buffer_dirty(dst);
return ret;
}
/*
* helper function to insert a new root level in the tree.
* A new node is allocated, and a single item is inserted to
* point to the existing root
*
* returns zero on success or < 0 on failure.
*/
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 lower_gen;
struct extent_buffer *lower;
struct extent_buffer *c;
struct extent_buffer *old;
struct btrfs_disk_key lower_key;
int ret;
BUG_ON(path->nodes[level]); BUG_ON(path->nodes[level-1] != root->node);
lower = path->nodes[level-1];
if (level == 1)
btrfs_item_key(lower, &lower_key, 0);
else
btrfs_node_key(lower, &lower_key, 0);
c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&lower_key, level, root->node->start, 0,
BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
root_add_used(root, fs_info->nodesize);
btrfs_set_header_nritems(c, 1);
btrfs_set_node_key(c, &lower_key, 0);
btrfs_set_node_blockptr(c, 0, lower->start);
lower_gen = btrfs_header_generation(lower);
WARN_ON(lower_gen != trans->transid);
btrfs_set_node_ptr_generation(c, 0, lower_gen);
btrfs_mark_buffer_dirty(c);
old = root->node;
ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
BUG_ON(ret < 0); rcu_assign_pointer(root->node, c);
/* the super has an extra ref to root->node */
free_extent_buffer(old);
add_root_to_dirty_list(root);
atomic_inc(&c->refs);
path->nodes[level] = c;
path->locks[level] = BTRFS_WRITE_LOCK;
path->slots[level] = 0;
return 0;
}
/*
* worker function to insert a single pointer in a node.
* the node should have enough room for the pointer already
*
* slot and level indicate where you want the key to go, and
* blocknr is the block the key points to.
*/
static void insert_ptr(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_disk_key *key, u64 bytenr,
int slot, int level)
{
struct extent_buffer *lower;
int nritems;
int ret;
BUG_ON(!path->nodes[level]);
btrfs_assert_tree_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); if (slot != nritems) { if (level) { ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
slot, nritems - slot);
BUG_ON(ret < 0);
}
memmove_extent_buffer(lower,
btrfs_node_key_ptr_offset(slot + 1),
btrfs_node_key_ptr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot,
BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
BUG_ON(ret < 0);
}
btrfs_set_node_key(lower, key, slot);
btrfs_set_node_blockptr(lower, slot, bytenr);
WARN_ON(trans->transid == 0);
btrfs_set_node_ptr_generation(lower, slot, trans->transid);
btrfs_set_header_nritems(lower, nritems + 1);
btrfs_mark_buffer_dirty(lower);
}
/*
* split the node at the specified level in path in two.
* The path is corrected to point to the appropriate node after the split
*
* Before splitting this tries to make some room in the node by pushing
* left and right, if either one works, it returns right away.
*
* returns 0 on success and < 0 on failure
*/
static noinline int split_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *c;
struct extent_buffer *split;
struct btrfs_disk_key disk_key;
int mid;
int ret;
u32 c_nritems;
c = path->nodes[level];
WARN_ON(btrfs_header_generation(c) != trans->transid);
if (c == root->node) {
/*
* trying to split the root, lets make a new one
*
* tree mod log: We don't log_removal old root in
* insert_new_root, because that root buffer will be kept as a
* normal node. We are going to log removal of half of the
* elements below with btrfs_tree_mod_log_eb_copy(). We're
* holding a tree lock on the buffer, which is why we cannot
* race with other tree_mod_log users.
*/
ret = insert_new_root(trans, root, path, level + 1);
if (ret)
return ret;
} else {
ret = push_nodes_for_insert(trans, root, path, level);
c = path->nodes[level];
if (!ret && btrfs_header_nritems(c) <
BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
return 0;
if (ret < 0)
return ret;
}
c_nritems = btrfs_header_nritems(c);
mid = (c_nritems + 1) / 2;
btrfs_node_key(c, &disk_key, mid);
split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&disk_key, level, c->start, 0,
BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
root_add_used(root, fs_info->nodesize);
ASSERT(btrfs_header_level(c) == level);
ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
}
copy_extent_buffer(split, c,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(mid),
(c_nritems - mid) * sizeof(struct btrfs_key_ptr));
btrfs_set_header_nritems(split, c_nritems - mid);
btrfs_set_header_nritems(c, mid);
btrfs_mark_buffer_dirty(c);
btrfs_mark_buffer_dirty(split);
insert_ptr(trans, path, &disk_key, split->start,
path->slots[level + 1] + 1, level + 1);
if (path->slots[level] >= mid) {
path->slots[level] -= mid;
btrfs_tree_unlock(c);
free_extent_buffer(c);
path->nodes[level] = split;
path->slots[level + 1] += 1;
} else {
btrfs_tree_unlock(split);
free_extent_buffer(split);
}
return 0;
}
/*
* how many bytes are required to store the items in a leaf. start
* and nr indicate which items in the leaf to check. This totals up the
* space used both by the item structs and the item data
*/
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
struct btrfs_item *start_item;
struct btrfs_item *end_item;
int data_len;
int nritems = btrfs_header_nritems(l);
int end = min(nritems, start + nr) - 1;
if (!nr)
return 0;
start_item = btrfs_item_nr(start);
end_item = btrfs_item_nr(end);
data_len = btrfs_item_offset(l, start_item) +
btrfs_item_size(l, start_item);
data_len = data_len - btrfs_item_offset(l, end_item);
data_len += sizeof(struct btrfs_item) * nr;
WARN_ON(data_len < 0);
return data_len;
}
/*
* The space between the end of the leaf items and
* the start of the leaf data. IOW, how much room
* the leaf has left for both items and data
*/
noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
int nritems = btrfs_header_nritems(leaf);
int ret;
ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
if (ret < 0) {
btrfs_crit(fs_info,
"leaf free space ret %d, leaf data size %lu, used %d nritems %d",
ret,
(unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
leaf_space_used(leaf, 0, nritems), nritems);
}
return ret;
}
/*
* min slot controls the lowest index we're willing to push to the
* right. We'll push up to and including min_slot, but no lower
*/
static noinline int __push_leaf_right(struct btrfs_path *path,
int data_size, int empty,
struct extent_buffer *right,
int free_space, u32 left_nritems,
u32 min_slot)
{
struct btrfs_fs_info *fs_info = right->fs_info;
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *upper = path->nodes[1];
struct btrfs_map_token token;
struct btrfs_disk_key disk_key;
int slot;
u32 i;
int push_space = 0;
int push_items = 0;
struct btrfs_item *item;
u32 nr;
u32 right_nritems;
u32 data_end;
u32 this_item_size;
if (empty)
nr = 0;
else
nr = max_t(u32, 1, min_slot); if (path->slots[0] >= left_nritems)
push_space += data_size;
slot = path->slots[1];
i = left_nritems - 1;
while (i >= nr) { item = btrfs_item_nr(i); if (!empty && push_items > 0) { if (path->slots[0] > i)
break;
if (path->slots[0] == i) { int space = btrfs_leaf_free_space(left); if (space + push_space * 2 > free_space)
break;
}
}
if (path->slots[0] == i) push_space += data_size;
this_item_size = btrfs_item_size(left, item);
if (this_item_size + sizeof(*item) + push_space > free_space)
break;
push_items++;
push_space += this_item_size + sizeof(*item);
if (i == 0)
break;
i--;
}
if (push_items == 0)
goto out_unlock;
WARN_ON(!empty && push_items == left_nritems);
/* push left to right */
right_nritems = btrfs_header_nritems(right);
push_space = btrfs_item_end_nr(left, left_nritems - push_items);
push_space -= leaf_data_end(left);
/* make room in the right data area */
data_end = leaf_data_end(right);
memmove_extent_buffer(right,
BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
BTRFS_LEAF_DATA_OFFSET + data_end,
BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
/* copy from the left data area */
copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
push_space);
memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
btrfs_item_nr_offset(0),
right_nritems * sizeof(struct btrfs_item));
/* copy the items from left to right */
copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(left_nritems - push_items),
push_items * sizeof(struct btrfs_item));
/* update the item pointers */
btrfs_init_map_token(&token, right);
right_nritems += push_items;
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(i);
push_space -= btrfs_token_item_size(&token, item);
btrfs_set_token_item_offset(&token, item, push_space);
}
left_nritems -= push_items;
btrfs_set_header_nritems(left, left_nritems);
if (left_nritems)
btrfs_mark_buffer_dirty(left);
else
btrfs_clean_tree_block(left);
btrfs_mark_buffer_dirty(right);
btrfs_item_key(right, &disk_key, 0);
btrfs_set_node_key(upper, &disk_key, slot + 1);
btrfs_mark_buffer_dirty(upper);
/* then fixup the leaf pointer in the path */
if (path->slots[0] >= left_nritems) {
path->slots[0] -= left_nritems;
if (btrfs_header_nritems(path->nodes[0]) == 0)
btrfs_clean_tree_block(path->nodes[0]); btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
path->slots[1] += 1;
} else {
btrfs_tree_unlock(right); free_extent_buffer(right);
}
return 0;
out_unlock:
btrfs_tree_unlock(right);
free_extent_buffer(right);
return 1;
}
/*
* push some data in the path leaf to the right, trying to free up at
* least data_size bytes. returns zero if the push worked, nonzero otherwise
*
* returns 1 if the push failed because the other node didn't have enough
* room, 0 if everything worked out and < 0 if there were major errors.
*
* this will push starting from min_slot to the end of the leaf. It won't
* push any slot lower than min_slot
*/
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path,
int min_data_size, int data_size,
int empty, u32 min_slot)
{
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *right;
struct extent_buffer *upper;
int slot;
int free_space;
u32 left_nritems;
int ret;
if (!path->nodes[1])
return 1;
slot = path->slots[1];
upper = path->nodes[1];
if (slot >= btrfs_header_nritems(upper) - 1)
return 1;
btrfs_assert_tree_locked(path->nodes[1]);
right = btrfs_read_node_slot(upper, slot + 1);
/*
* slot + 1 is not valid or we fail to read the right node,
* no big deal, just return.
*/
if (IS_ERR(right))
return 1;
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
free_space = btrfs_leaf_free_space(right);
if (free_space < data_size)
goto out_unlock;
/* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
free_space = btrfs_leaf_free_space(right);
if (free_space < data_size)
goto out_unlock;
left_nritems = btrfs_header_nritems(left);
if (left_nritems == 0)
goto out_unlock;
if (check_sibling_keys(left, right)) {
ret = -EUCLEAN;
btrfs_tree_unlock(right);
free_extent_buffer(right);
return ret;
}
if (path->slots[0] == left_nritems && !empty) {
/* Key greater than all keys in the leaf, right neighbor has
* enough room for it and we're not emptying our leaf to delete
* it, therefore use right neighbor to insert the new item and
* no need to touch/dirty our left leaf. */
btrfs_tree_unlock(left);
free_extent_buffer(left);
path->nodes[0] = right;
path->slots[0] = 0;
path->slots[1]++;
return 0;
}
return __push_leaf_right(path, min_data_size, empty,
right, free_space, left_nritems, min_slot);
out_unlock:
btrfs_tree_unlock(right);
free_extent_buffer(right);
return 1;
}
/*
* push some data in the path leaf to the left, trying to free up at
* least data_size bytes. returns zero if the push worked, nonzero otherwise
*
* max_slot can put a limit on how far into the leaf we'll push items. The
* item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
* items
*/
static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
int empty, struct extent_buffer *left,
int free_space, u32 right_nritems,
u32 max_slot)
{
struct btrfs_fs_info *fs_info = left->fs_info;
struct btrfs_disk_key disk_key;
struct extent_buffer *right = path->nodes[0];
int i;
int push_space = 0;
int push_items = 0;
struct btrfs_item *item;
u32 old_left_nritems;
u32 nr;
int ret = 0;
u32 this_item_size;
u32 old_left_item_size;
struct btrfs_map_token token;
if (empty)
nr = min(right_nritems, max_slot);
else
nr = min(right_nritems - 1, max_slot); for (i = 0; i < nr; i++) {
item = btrfs_item_nr(i);
if (!empty && push_items > 0) { if (path->slots[0] < i)
break;
if (path->slots[0] == i) { int space = btrfs_leaf_free_space(right); if (space + push_space * 2 > free_space)
break;
}
}
if (path->slots[0] == i) push_space += data_size;
this_item_size = btrfs_item_size(right, item);
if (this_item_size + sizeof(*item) + push_space > free_space)
break;
push_items++;
push_space += this_item_size + sizeof(*item);
}
if (push_items == 0) {
ret = 1;
goto out;
}
WARN_ON(!empty && push_items == btrfs_header_nritems(right));
/* push data from right to left */
copy_extent_buffer(left, right,
btrfs_item_nr_offset(btrfs_header_nritems(left)),
btrfs_item_nr_offset(0),
push_items * sizeof(struct btrfs_item));
push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
btrfs_item_offset_nr(right, push_items - 1);
copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(left) - push_space,
BTRFS_LEAF_DATA_OFFSET +
btrfs_item_offset_nr(right, push_items - 1),
push_space);
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
btrfs_init_map_token(&token, left);
old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(&token, item);
btrfs_set_token_item_offset(&token, item,
ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
/* fixup right node */
if (push_items > right_nritems)
WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
right_nritems);
if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) -
leaf_data_end(right);
memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(right), push_space);
memmove_extent_buffer(right, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(push_items),
(btrfs_header_nritems(right) - push_items) *
sizeof(struct btrfs_item));
}
btrfs_init_map_token(&token, right);
right_nritems -= push_items;
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
item = btrfs_item_nr(i);
push_space = push_space - btrfs_token_item_size(&token, item);
btrfs_set_token_item_offset(&token, item, push_space);
}
btrfs_mark_buffer_dirty(left);
if (right_nritems)
btrfs_mark_buffer_dirty(right);
else
btrfs_clean_tree_block(right);
btrfs_item_key(right, &disk_key, 0);
fixup_low_keys(path, &disk_key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
path->slots[0] += old_left_nritems;
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = left;
path->slots[1] -= 1;
} else {
btrfs_tree_unlock(left);
free_extent_buffer(left);
path->slots[0] -= push_items;
}
BUG_ON(path->slots[0] < 0);
return ret;
out:
btrfs_tree_unlock(left);
free_extent_buffer(left);
return ret;
}
/*
* push some data in the path leaf to the left, trying to free up at
* least data_size bytes. returns zero if the push worked, nonzero otherwise
*
* max_slot can put a limit on how far into the leaf we'll push items. The
* item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
* items
*/
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int min_data_size,
int data_size, int empty, u32 max_slot)
{
struct extent_buffer *right = path->nodes[0];
struct extent_buffer *left;
int slot;
int free_space;
u32 right_nritems;
int ret = 0;
slot = path->slots[1];
if (slot == 0)
return 1; if (!path->nodes[1])
return 1;
right_nritems = btrfs_header_nritems(right);
if (right_nritems == 0)
return 1;
btrfs_assert_tree_locked(path->nodes[1]);
left = btrfs_read_node_slot(path->nodes[1], slot - 1);
/*
* slot - 1 is not valid or we fail to read the left node,
* no big deal, just return.
*/
if (IS_ERR(left))
return 1;
__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
free_space = btrfs_leaf_free_space(left);
if (free_space < data_size) {
ret = 1;
goto out;
}
/* cow and double check */
ret = btrfs_cow_block(trans, root, left,
path->nodes[1], slot - 1, &left,
BTRFS_NESTING_LEFT_COW);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
if (ret == -ENOSPC)
ret = 1;
goto out;
}
free_space = btrfs_leaf_free_space(left);
if (free_space < data_size) {
ret = 1;
goto out;
}
if (check_sibling_keys(left, right)) {
ret = -EUCLEAN;
goto out;
}
return __push_leaf_left(path, min_data_size,
empty, left, free_space, right_nritems,
max_slot);
out:
btrfs_tree_unlock(left);
free_extent_buffer(left);
return ret;
}
/*
* split the path's leaf in two, making sure there is at least data_size
* available for the resulting leaf level of the path.
*/
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct extent_buffer *l,
struct extent_buffer *right,
int slot, int mid, int nritems)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int data_copy_size;
int rt_data_off;
int i;
struct btrfs_disk_key disk_key;
struct btrfs_map_token token;
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(mid),
nritems * sizeof(struct btrfs_item));
copy_extent_buffer(right, l,
BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
data_copy_size, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(l), data_copy_size);
rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
btrfs_init_map_token(&token, right);
for (i = 0; i < nritems; i++) {
struct btrfs_item *item = btrfs_item_nr(i);
u32 ioff;
ioff = btrfs_token_item_offset(&token, item);
btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
}
btrfs_set_header_nritems(l, mid);
btrfs_item_key(right, &disk_key, 0);
insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
btrfs_mark_buffer_dirty(right);
btrfs_mark_buffer_dirty(l);
BUG_ON(path->slots[0] != slot); if (mid <= slot) { btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
path->slots[0] -= mid;
path->slots[1] += 1;
} else {
btrfs_tree_unlock(right);
free_extent_buffer(right);
}
BUG_ON(path->slots[0] < 0);
}
/*
* double splits happen when we need to insert a big item in the middle
* of a leaf. A double split can leave us with 3 mostly empty leaves:
* leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
* A B C
*
* We avoid this by trying to push the items on either side of our target
* into the adjacent leaves. If all goes well we can avoid the double split
* completely.
*/
static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
int data_size)
{
int ret;
int progress = 0;
int slot;
u32 nritems;
int space_needed = data_size;
slot = path->slots[0];
if (slot < btrfs_header_nritems(path->nodes[0]))
space_needed -= btrfs_leaf_free_space(path->nodes[0]);
/*
* try to push all the items after our slot into the
* right leaf
*/
ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
if (ret < 0)
return ret;
if (ret == 0)
progress++;
nritems = btrfs_header_nritems(path->nodes[0]);
/*
* our goal is to get our slot at the start or end of a leaf. If
* we've done so we're done
*/
if (path->slots[0] == 0 || path->slots[0] == nritems)
return 0;
if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
return 0;
/* try to push all the items before our slot into the next leaf */
slot = path->slots[0];
space_needed = data_size;
if (slot > 0)
space_needed -= btrfs_leaf_free_space(path->nodes[0]);
ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
if (ret < 0)
return ret;
if (ret == 0)
progress++;
if (progress)
return 0;
return 1;
}
/*
* split the path's leaf in two, making sure there is at least data_size
* available for the resulting leaf level of the path.
*
* returns 0 if all went well and < 0 on failure.
*/
static noinline int split_leaf(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const struct btrfs_key *ins_key,
struct btrfs_path *path, int data_size,
int extend)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *l;
u32 nritems;
int mid;
int slot;
struct extent_buffer *right;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int wret;
int split;
int num_doubles = 0;
int tried_avoid_double = 0;
l = path->nodes[0];
slot = path->slots[0];
if (extend && data_size + btrfs_item_size_nr(l, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
return -EOVERFLOW;
/* first try to make some room by pushing left and right */
if (data_size && path->nodes[1]) {
int space_needed = data_size;
if (slot < btrfs_header_nritems(l)) space_needed -= btrfs_leaf_free_space(l); wret = push_leaf_right(trans, root, path, space_needed,
space_needed, 0, 0);
if (wret < 0)
return wret;
if (wret) {
space_needed = data_size;
if (slot > 0) space_needed -= btrfs_leaf_free_space(l); wret = push_leaf_left(trans, root, path, space_needed,
space_needed, 0, (u32)-1);
if (wret < 0)
return wret;
}
l = path->nodes[0];
/* did the pushes work? */
if (btrfs_leaf_free_space(l) >= data_size)
return 0;
}
if (!path->nodes[1]) { ret = insert_new_root(trans, root, path, 1);
if (ret)
return ret;
}
again:
split = 1;
l = path->nodes[0];
slot = path->slots[0];
nritems = btrfs_header_nritems(l);
mid = (nritems + 1) / 2;
if (mid <= slot) {
if (nritems == 1 || leaf_space_used(l, mid, nritems - mid) + data_size >
BTRFS_LEAF_DATA_SIZE(fs_info)) {
if (slot >= nritems) {
split = 0;
} else {
mid = slot;
if (mid != nritems &&
leaf_space_used(l, mid, nritems - mid) +
data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
if (data_size && !tried_avoid_double)
goto push_for_double;
split = 2;
}
}
}
} else {
if (leaf_space_used(l, 0, mid) + data_size >
BTRFS_LEAF_DATA_SIZE(fs_info)) {
if (!extend && data_size && slot == 0) {
split = 0;
} else if ((extend || !data_size) && slot == 0) {
mid = 1;
} else {
mid = slot;
if (mid != nritems && leaf_space_used(l, mid, nritems - mid) +
data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
if (data_size && !tried_avoid_double)
goto push_for_double;
split = 2;
}
}
}
}
if (split == 0)
btrfs_cpu_key_to_disk(&disk_key, ins_key);
else
btrfs_item_key(l, &disk_key, mid);
/*
* We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
* split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
* subclasses, which is 8 at the time of this patch, and we've maxed it
* out. In the future we could add a
* BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
* use BTRFS_NESTING_NEW_ROOT.
*/
right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
&disk_key, 0, l->start, 0,
num_doubles ? BTRFS_NESTING_NEW_ROOT :
BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
root_add_used(root, fs_info->nodesize);
if (split == 0) {
if (mid <= slot) {
btrfs_set_header_nritems(right, 0);
insert_ptr(trans, path, &disk_key,
right->start, path->slots[1] + 1, 1);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
path->slots[0] = 0;
path->slots[1] += 1;
} else {
btrfs_set_header_nritems(right, 0);
insert_ptr(trans, path, &disk_key,
right->start, path->slots[1], 1);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
path->slots[0] = 0;
if (path->slots[1] == 0)
fixup_low_keys(path, &disk_key, 1);
}
/*
* We create a new leaf 'right' for the required ins_len and
* we'll do btrfs_mark_buffer_dirty() on this leaf after copying
* the content of ins_len to 'right'.
*/
return ret;
}
copy_for_split(trans, path, l, right, slot, mid, nritems);
if (split == 2) {
BUG_ON(num_doubles != 0);
num_doubles++;
goto again;
}
return 0;
push_for_double:
push_for_double_split(trans, root, path, data_size);
tried_avoid_double = 1;
if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
return 0;
goto again;
}
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int ins_len)
{
struct btrfs_key key;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
u64 extent_len = 0;
u32 item_size;
int ret;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
key.type != BTRFS_EXTENT_CSUM_KEY);
if (btrfs_leaf_free_space(leaf) >= ins_len)
return 0;
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
if (key.type == BTRFS_EXTENT_DATA_KEY) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_len = btrfs_file_extent_num_bytes(leaf, fi);
}
btrfs_release_path(path);
path->keep_locks = 1;
path->search_for_split = 1;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
path->search_for_split = 0;
if (ret > 0)
ret = -EAGAIN;
if (ret < 0)
goto err;
ret = -EAGAIN;
leaf = path->nodes[0];
/* if our item isn't there, return now */
if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
goto err;
/* the leaf has changed, it now has room. return now */
if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
goto err;
if (key.type == BTRFS_EXTENT_DATA_KEY) { fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
goto err;
}
ret = split_leaf(trans, root, &key, path, ins_len, 1);
if (ret)
goto err;
path->keep_locks = 0;
btrfs_unlock_up_safe(path, 1);
return 0;
err:
path->keep_locks = 0;
return ret;
}
static noinline int split_item(struct btrfs_path *path,
const struct btrfs_key *new_key,
unsigned long split_offset)
{
struct extent_buffer *leaf;
struct btrfs_item *item;
struct btrfs_item *new_item;
int slot;
char *buf;
u32 nritems;
u32 item_size;
u32 orig_offset;
struct btrfs_disk_key disk_key;
leaf = path->nodes[0];
BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
item = btrfs_item_nr(path->slots[0]);
orig_offset = btrfs_item_offset(leaf, item);
item_size = btrfs_item_size(leaf, item);
buf = kmalloc(item_size, GFP_NOFS);
if (!buf)
return -ENOMEM;
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
path->slots[0]), item_size);
slot = path->slots[0] + 1;
nritems = btrfs_header_nritems(leaf);
if (slot != nritems) {
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
btrfs_item_nr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_item));
}
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(leaf, &disk_key, slot);
new_item = btrfs_item_nr(slot);
btrfs_set_item_offset(leaf, new_item, orig_offset);
btrfs_set_item_size(leaf, new_item, item_size - split_offset);
btrfs_set_item_offset(leaf, item,
orig_offset + item_size - split_offset);
btrfs_set_item_size(leaf, item, split_offset);
btrfs_set_header_nritems(leaf, nritems + 1);
/* write the data for the start of the original item */
write_extent_buffer(leaf, buf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
split_offset);
/* write the data for the new item */
write_extent_buffer(leaf, buf + split_offset,
btrfs_item_ptr_offset(leaf, slot),
item_size - split_offset);
btrfs_mark_buffer_dirty(leaf);
BUG_ON(btrfs_leaf_free_space(leaf) < 0);
kfree(buf);
return 0;
}
/*
* This function splits a single item into two items,
* giving 'new_key' to the new item and splitting the
* old one at split_offset (from the start of the item).
*
* The path may be released by this operation. After
* the split, the path is pointing to the old item. The
* new item is going to be in the same node as the old one.
*
* Note, the item being split must be smaller enough to live alone on
* a tree block with room for one extra struct btrfs_item
*
* This allows us to split the item in place, keeping a lock on the
* leaf the entire time.
*/
int btrfs_split_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
const struct btrfs_key *new_key,
unsigned long split_offset)
{
int ret;
ret = setup_leaf_for_split(trans, root, path,
sizeof(struct btrfs_item));
if (ret)
return ret;
ret = split_item(path, new_key, split_offset);
return ret;
}
/*
* This function duplicate a item, giving 'new_key' to the new item.
* It guarantees both items live in the same tree leaf and the new item
* is contiguous with the original item.
*
* This allows us to split file extent in place, keeping a lock on the
* leaf the entire time.
*/
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
const struct btrfs_key *new_key)
{
struct extent_buffer *leaf;
int ret;
u32 item_size;
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ret = setup_leaf_for_split(trans, root, path,
item_size + sizeof(struct btrfs_item));
if (ret)
return ret;
path->slots[0]++;
setup_items_for_insert(root, path, new_key, &item_size, 1);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
item_size);
return 0;
}
/*
* make the item pointed to by the path smaller. new_size indicates
* how small to make it, and from_end tells us if we just chop bytes
* off the end of the item or if we shift the item to chop bytes off
* the front.
*/
void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
int slot;
struct extent_buffer *leaf;
struct btrfs_item *item;
u32 nritems;
unsigned int data_end;
unsigned int old_data_start;
unsigned int old_size;
unsigned int size_diff;
int i;
struct btrfs_map_token token;
leaf = path->nodes[0];
slot = path->slots[0];
old_size = btrfs_item_size_nr(leaf, slot);
if (old_size == new_size)
return; nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
old_data_start = btrfs_item_offset_nr(leaf, slot);
size_diff = old_size - new_size;
BUG_ON(slot < 0); BUG_ON(slot >= nritems);
/*
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(&token, item);
btrfs_set_token_item_offset(&token, item, ioff + size_diff);
}
/* shift the data */
if (from_end) {
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data_start + new_size - data_end);
} else {
struct btrfs_disk_key disk_key;
u64 offset;
btrfs_item_key(leaf, &disk_key, slot);
if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
unsigned long ptr;
struct btrfs_file_extent_item *fi;
fi = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
fi = (struct btrfs_file_extent_item *)(
(unsigned long)fi - size_diff);
if (btrfs_file_extent_type(leaf, fi) ==
BTRFS_FILE_EXTENT_INLINE) {
ptr = btrfs_item_ptr_offset(leaf, slot);
memmove_extent_buffer(leaf, ptr,
(unsigned long)fi,
BTRFS_FILE_EXTENT_INLINE_DATA_START);
}
}
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data_start - data_end);
offset = btrfs_disk_key_offset(&disk_key);
btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
btrfs_set_item_key(leaf, &disk_key, slot);
if (slot == 0) fixup_low_keys(path, &disk_key, 1);
}
item = btrfs_item_nr(slot);
btrfs_set_item_size(leaf, item, new_size);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf);
BUG();
}
}
/*
* make the item pointed to by the path bigger, data_size is the added size.
*/
void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
int slot;
struct extent_buffer *leaf;
struct btrfs_item *item;
u32 nritems;
unsigned int data_end;
unsigned int old_data;
unsigned int old_size;
int i;
struct btrfs_map_token token;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
if (btrfs_leaf_free_space(leaf) < data_size) { btrfs_print_leaf(leaf);
BUG();
}
slot = path->slots[0];
old_data = btrfs_item_end_nr(leaf, slot);
BUG_ON(slot < 0); if (slot >= nritems) { btrfs_print_leaf(leaf);
btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
slot, nritems);
BUG();
}
/*
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(&token, item);
btrfs_set_token_item_offset(&token, item, ioff - data_size);
}
/* shift the data */
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data - data_end);
data_end = old_data;
old_size = btrfs_item_size_nr(leaf, slot);
item = btrfs_item_nr(slot);
btrfs_set_item_size(leaf, item, old_size + data_size);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf); BUG();
}
}
/**
* setup_items_for_insert - Helper called before inserting one or more items
* to a leaf. Main purpose is to save stack depth by doing the bulk of the work
* in a function that doesn't call btrfs_search_slot
*
* @root: root we are inserting items to
* @path: points to the leaf/slot where we are going to insert new items
* @cpu_key: array of keys for items to be inserted
* @data_size: size of the body of each item we are going to insert
* @nr: size of @cpu_key/@data_size arrays
*/
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
int nr)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
int i;
u32 nritems;
unsigned int data_end;
struct btrfs_disk_key disk_key;
struct extent_buffer *leaf;
int slot;
struct btrfs_map_token token;
u32 total_size;
u32 total_data = 0;
for (i = 0; i < nr; i++)
total_data += data_size[i]; total_size = total_data + (nr * sizeof(struct btrfs_item));
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
fixup_low_keys(path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
leaf = path->nodes[0];
slot = path->slots[0];
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
if (btrfs_leaf_free_space(leaf) < total_size) { btrfs_print_leaf(leaf);
btrfs_crit(fs_info, "not enough freespace need %u have %d",
total_size, btrfs_leaf_free_space(leaf));
BUG();
}
btrfs_init_map_token(&token, leaf);
if (slot != nritems) {
unsigned int old_data = btrfs_item_end_nr(leaf, slot);
if (old_data < data_end) {
btrfs_print_leaf(leaf);
btrfs_crit(fs_info,
"item at slot %d with data offset %u beyond data end of leaf %u",
slot, old_data, data_end);
BUG();
}
/*
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
for (i = slot; i < nritems; i++) {
u32 ioff;
item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(&token, item);
btrfs_set_token_item_offset(&token, item,
ioff - total_data);
}
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
btrfs_item_nr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_item));
/* shift the data */
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data - data_end);
data_end = old_data;
}
/* setup the item for the new data */
for (i = 0; i < nr; i++) { btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(slot + i);
data_end -= data_size[i];
btrfs_set_token_item_offset(&token, item, data_end);
btrfs_set_token_item_size(&token, item, data_size[i]);
}
btrfs_set_header_nritems(leaf, nritems + nr);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
btrfs_print_leaf(leaf); BUG();
}
}
/*
* Given a key and some data, insert items into the tree.
* This does all the path init required, making room in the tree if needed.
*/
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
int nr)
{
int ret = 0;
int slot;
int i;
u32 total_size = 0;
u32 total_data = 0;
for (i = 0; i < nr; i++) total_data += data_size[i]; total_size = total_data + (nr * sizeof(struct btrfs_item));
ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
if (ret == 0)
return -EEXIST;
if (ret < 0)
return ret;
slot = path->slots[0]; BUG_ON(slot < 0); setup_items_for_insert(root, path, cpu_key, data_size, nr); return 0;
}
/*
* Given a key and some data, insert an item into the tree.
* This does all the path init required, making room in the tree if needed.
*/
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *cpu_key, void *data,
u32 data_size)
{
int ret = 0;
struct btrfs_path *path;
struct extent_buffer *leaf;
unsigned long ptr;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
if (!ret) {
leaf = path->nodes[0];
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, data, ptr, data_size);
btrfs_mark_buffer_dirty(leaf);
}
btrfs_free_path(path);
return ret;
}
/*
* delete the pointer from a given node.
*
* the tree should have been previously balanced so the deletion does not
* empty a node.
*/
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
int level, int slot)
{
struct extent_buffer *parent = path->nodes[level];
u32 nritems;
int ret;
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
if (level) { ret = btrfs_tree_mod_log_insert_move(parent, slot,
slot + 1, nritems - slot - 1);
BUG_ON(ret < 0);
}
memmove_extent_buffer(parent,
btrfs_node_key_ptr_offset(slot),
btrfs_node_key_ptr_offset(slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1)); } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot,
BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
BUG_ON(ret < 0);
}
nritems--;
btrfs_set_header_nritems(parent, nritems);
if (nritems == 0 && parent == root->node) { BUG_ON(btrfs_header_level(root->node) != 1);
/* just turn the root into a leaf and break */
btrfs_set_header_level(root->node, 0);
} else if (slot == 0) {
struct btrfs_disk_key disk_key;
btrfs_node_key(parent, &disk_key, 0);
fixup_low_keys(path, &disk_key, level + 1);
}
btrfs_mark_buffer_dirty(parent);
}
/*
* a helper function to delete the leaf pointed to by path->slots[1] and
* path->nodes[1].
*
* This deletes the pointer in path->nodes[1] and frees the leaf
* block extent. zero is returned if it all worked out, < 0 otherwise.
*
* The path must have already been setup for deleting the leaf, including
* all the proper balancing. path->nodes[1] must be locked.
*/
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct extent_buffer *leaf)
{
WARN_ON(btrfs_header_generation(leaf) != trans->transid); del_ptr(root, path, 1, path->slots[1]);
/*
* btrfs_free_extent is expensive, we want to make sure we
* aren't holding any locks when we call it
*/
btrfs_unlock_up_safe(path, 0);
root_sub_used(root, leaf->len);
atomic_inc(&leaf->refs);
btrfs_free_tree_block(trans, root, leaf, 0, 1);
free_extent_buffer_stale(leaf);
}
/*
* delete the item at the leaf level in path. If that empties
* the leaf, remove it from the tree
*/
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int slot, int nr)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_item *item;
u32 last_off;
u32 dsize = 0;
int ret = 0;
int wret;
int i;
u32 nritems;
leaf = path->nodes[0];
last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
for (i = 0; i < nr; i++)
dsize += btrfs_item_size_nr(leaf, slot + i); nritems = btrfs_header_nritems(leaf);
if (slot + nr != nritems) {
int data_end = leaf_data_end(leaf);
struct btrfs_map_token token;
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + dsize,
BTRFS_LEAF_DATA_OFFSET + data_end,
last_off - data_end);
btrfs_init_map_token(&token, leaf);
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(&token, item);
btrfs_set_token_item_offset(&token, item, ioff + dsize);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
btrfs_item_nr_offset(slot + nr),
sizeof(struct btrfs_item) *
(nritems - slot - nr));
}
btrfs_set_header_nritems(leaf, nritems - nr);
nritems -= nr;
/* delete the leaf if we've emptied it */
if (nritems == 0) {
if (leaf == root->node) {
btrfs_set_header_level(leaf, 0);
} else {
btrfs_clean_tree_block(leaf);
btrfs_del_leaf(trans, root, path, leaf);
}
} else {
int used = leaf_space_used(leaf, 0, nritems);
if (slot == 0) {
struct btrfs_disk_key disk_key;
btrfs_item_key(leaf, &disk_key, 0);
fixup_low_keys(path, &disk_key, 1);
}
/* delete the leaf if it is mostly empty */
if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
*/
slot = path->slots[1];
atomic_inc(&leaf->refs);
wret = push_leaf_left(trans, root, path, 1, 1,
1, (u32)-1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (path->nodes[0] == leaf &&
btrfs_header_nritems(leaf)) {
wret = push_leaf_right(trans, root, path, 1,
1, 1, 0);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
}
if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot;
btrfs_del_leaf(trans, root, path, leaf);
free_extent_buffer(leaf);
ret = 0;
} else {
/* if we're still in the path, make sure
* we're dirty. Otherwise, one of the
* push_leaf functions must have already
* dirtied this buffer
*/
if (path->nodes[0] == leaf) btrfs_mark_buffer_dirty(leaf); free_extent_buffer(leaf);
}
} else {
btrfs_mark_buffer_dirty(leaf);
}
}
return ret;
}
/*
* search the tree again to find a leaf with lesser keys
* returns 0 if it found something or 1 if there are no lesser leaves.
* returns < 0 on io errors.
*
* This may release the path, and so you may lose any locks held at the
* time you call it.
*/
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
struct btrfs_key key;
struct btrfs_disk_key found_key;
int ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
if (key.offset > 0) {
key.offset--; } else if (key.type > 0) { key.type--;
key.offset = (u64)-1;
} else if (key.objectid > 0) { key.objectid--;
key.type = (u8)-1;
key.offset = (u64)-1;
} else {
return 1;
}
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ret;
btrfs_item_key(path->nodes[0], &found_key, 0);
ret = comp_keys(&found_key, &key);
/*
* We might have had an item with the previous key in the tree right
* before we released our path. And after we released our path, that
* item might have been pushed to the first slot (0) of the leaf we
* were holding due to a tree balance. Alternatively, an item with the
* previous key can exist as the only element of a leaf (big fat item).
* Therefore account for these 2 cases, so that our callers (like
* btrfs_previous_item) don't miss an existing item with a key matching
* the previous key we computed above.
*/
if (ret <= 0)
return 0;
return 1;
}
/*
* A helper function to walk down the tree starting at min_key, and looking
* for nodes or leaves that are have a minimum transaction id.
* This is used by the btree defrag code, and tree logging
*
* This does not cow, but it does stuff the starting key it finds back
* into min_key, so you can call btrfs_search_slot with cow=1 on the
* key and get a writable path.
*
* This honors path->lowest_level to prevent descent past a given level
* of the tree.
*
* min_trans indicates the oldest transaction that you are interested
* in walking through. Any nodes or leaves older than min_trans are
* skipped over (without reading them).
*
* returns zero if something useful was found, < 0 on error and 1 if there
* was nothing in the tree that matched the search criteria.
*/
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
struct btrfs_path *path,
u64 min_trans)
{
struct extent_buffer *cur;
struct btrfs_key found_key;
int slot;
int sret;
u32 nritems;
int level;
int ret = 1;
int keep_locks = path->keep_locks;
path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
level = btrfs_header_level(cur);
WARN_ON(path->nodes[level]); path->nodes[level] = cur;
path->locks[level] = BTRFS_READ_LOCK;
if (btrfs_header_generation(cur) < min_trans) {
ret = 1;
goto out;
}
while (1) {
nritems = btrfs_header_nritems(cur);
level = btrfs_header_level(cur);
sret = btrfs_bin_search(cur, min_key, &slot);
if (sret < 0) {
ret = sret;
goto out;
}
/* at the lowest level, we're done, setup the path and exit */
if (level == path->lowest_level) { if (slot >= nritems)
goto find_next_key;
ret = 0;
path->slots[level] = slot;
btrfs_item_key_to_cpu(cur, &found_key, slot);
goto out;
}
if (sret && slot > 0) slot--;
/*
* check this node pointer against the min_trans parameters.
* If it is too old, skip to the next one.
*/
while (slot < nritems) {
u64 gen;
gen = btrfs_node_ptr_generation(cur, slot);
if (gen < min_trans) {
slot++;
continue;
}
break;
}
find_next_key:
/*
* we didn't find a candidate key in this node, walk forward
* and find another one
*/
if (slot >= nritems) { path->slots[level] = slot;
sret = btrfs_find_next_key(root, path, min_key, level,
min_trans);
if (sret == 0) {
btrfs_release_path(path);
goto again;
} else {
goto out;
}
}
/* save our key for returning back */
btrfs_node_key_to_cpu(cur, &found_key, slot);
path->slots[level] = slot;
if (level == path->lowest_level) {
ret = 0;
goto out;
}
cur = btrfs_read_node_slot(cur, slot);
if (IS_ERR(cur)) {
ret = PTR_ERR(cur);
goto out;
}
btrfs_tree_read_lock(cur);
path->locks[level - 1] = BTRFS_READ_LOCK;
path->nodes[level - 1] = cur;
unlock_up(path, level, 1, 0, NULL);
}
out:
path->keep_locks = keep_locks; if (ret == 0) { btrfs_unlock_up_safe(path, path->lowest_level + 1);
memcpy(min_key, &found_key, sizeof(found_key));
}
return ret;
}
/*
* this is similar to btrfs_next_leaf, but does not try to preserve
* and fixup the path. It looks for and returns the next key in the
* tree based on the current path and the min_trans parameters.
*
* 0 is returned if another key is found, < 0 if there are any errors
* and 1 is returned if there are no higher keys in the tree
*
* path->keep_locks should be set to 1 on the search made before
* calling this function.
*/
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int level, u64 min_trans)
{
int slot;
struct extent_buffer *c;
WARN_ON(!path->keep_locks && !path->skip_locking); while (level < BTRFS_MAX_LEVEL) { if (!path->nodes[level])
return 1;
slot = path->slots[level] + 1;
c = path->nodes[level];
next:
if (slot >= btrfs_header_nritems(c)) {
int ret;
int orig_lowest;
struct btrfs_key cur_key;
if (level + 1 >= BTRFS_MAX_LEVEL || !path->nodes[level + 1]) return 1; if (path->locks[level + 1] || path->skip_locking) {
level++;
continue;
}
slot = btrfs_header_nritems(c) - 1;
if (level == 0)
btrfs_item_key_to_cpu(c, &cur_key, slot);
else
btrfs_node_key_to_cpu(c, &cur_key, slot);
orig_lowest = path->lowest_level;
btrfs_release_path(path);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, &cur_key, path,
0, 0);
path->lowest_level = orig_lowest;
if (ret < 0)
return ret;
c = path->nodes[level];
slot = path->slots[level];
if (ret == 0)
slot++; goto next;
}
if (level == 0)
btrfs_item_key_to_cpu(c, key, slot);
else {
u64 gen = btrfs_node_ptr_generation(c, slot);
if (gen < min_trans) {
slot++;
goto next;
}
btrfs_node_key_to_cpu(c, key, slot);
}
return 0;
}
return 1;
}
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq)
{
int slot;
int level;
struct extent_buffer *c;
struct extent_buffer *next;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
bool need_commit_sem = false;
u32 nritems;
int ret;
int i;
nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0)
return 1;
btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
level = 1;
next = NULL;
btrfs_release_path(path);
path->keep_locks = 1;
if (time_seq) {
ret = btrfs_search_old_slot(root, &key, path, time_seq);
} else {
if (path->need_commit_sem) { path->need_commit_sem = 0;
need_commit_sem = true;
down_read(&fs_info->commit_root_sem);
}
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
}
path->keep_locks = 0;
if (ret < 0)
goto done;
nritems = btrfs_header_nritems(path->nodes[0]);
/*
* by releasing the path above we dropped all our locks. A balance
* could have added more items next to the key that used to be
* at the very end of the block. So, check again here and
* advance the path if there are now more items available.
*/
if (nritems > 0 && path->slots[0] < nritems - 1) { if (ret == 0) path->slots[0]++;
ret = 0;
goto done;
}
/*
* So the above check misses one case:
* - after releasing the path above, someone has removed the item that
* used to be at the very end of the block, and balance between leafs
* gets another one with bigger key.offset to replace it.
*
* This one should be returned as well, or we can get leaf corruption
* later(esp. in __btrfs_drop_extents()).
*
* And a bit more explanation about this check,
* with ret > 0, the key isn't found, the path points to the slot
* where it should be inserted, so the path->slots[0] item must be the
* bigger one.
*/
if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
ret = 0;
goto done;
}
while (level < BTRFS_MAX_LEVEL) {
if (!path->nodes[level]) {
ret = 1;
goto done;
}
slot = path->slots[level] + 1;
c = path->nodes[level];
if (slot >= btrfs_header_nritems(c)) {
level++;
if (level == BTRFS_MAX_LEVEL) {
ret = 1;
goto done;
}
continue;
}
/*
* Our current level is where we're going to start from, and to
* make sure lockdep doesn't complain we need to drop our locks
* and nodes from 0 to our current level.
*/
for (i = 0; i < level; i++) { if (path->locks[level]) { btrfs_tree_read_unlock(path->nodes[i]);
path->locks[i] = 0;
}
free_extent_buffer(path->nodes[i]);
path->nodes[i] = NULL;
}
next = c;
ret = read_block_for_search(root, path, &next, level,
slot, &key);
if (ret == -EAGAIN)
goto again;
if (ret < 0) {
btrfs_release_path(path);
goto done;
}
if (!path->skip_locking) { ret = btrfs_try_tree_read_lock(next); if (!ret && time_seq) {
/*
* If we don't get the lock, we may be racing
* with push_leaf_left, holding that lock while
* itself waiting for the leaf we've currently
* locked. To solve this situation, we give up
* on our lock and cycle.
*/
free_extent_buffer(next);
btrfs_release_path(path);
cond_resched();
goto again;
}
if (!ret)
btrfs_tree_read_lock(next);
}
break;
}
path->slots[level] = slot;
while (1) {
level--;
path->nodes[level] = next;
path->slots[level] = 0;
if (!path->skip_locking)
path->locks[level] = BTRFS_READ_LOCK; if (!level)
break;
ret = read_block_for_search(root, path, &next, level,
0, &key);
if (ret == -EAGAIN)
goto again;
if (ret < 0) { btrfs_release_path(path);
goto done;
}
if (!path->skip_locking) btrfs_tree_read_lock(next);
}
ret = 0;
done:
unlock_up(path, 0, 1, 0, NULL);
if (need_commit_sem) {
int ret2;
path->need_commit_sem = 1;
ret2 = finish_need_commit_sem_search(path);
up_read(&fs_info->commit_root_sem);
if (ret2)
ret = ret2;
}
return ret;
}
/*
* this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
* searching until it gets past min_objectid or finds an item of 'type'
*
* returns 0 if something is found, 1 if nothing was found and < 0 on error
*/
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type)
{
struct btrfs_key found_key;
struct extent_buffer *leaf;
u32 nritems;
int ret;
while (1) {
if (path->slots[0] == 0) { ret = btrfs_prev_leaf(root, path);
if (ret != 0)
return ret;
} else {
path->slots[0]--;
}
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
if (nritems == 0)
return 1; if (path->slots[0] == nritems) path->slots[0]--;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid < min_objectid)
break;
if (found_key.type == type)
return 0;
if (found_key.objectid == min_objectid &&
found_key.type < type)
break;
}
return 1;
}
/*
* search in extent tree to find a previous Metadata/Data extent item with
* min objecitd.
*
* returns 0 if something is found, 1 if nothing was found and < 0 on error
*/
int btrfs_previous_extent_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid)
{
struct btrfs_key found_key;
struct extent_buffer *leaf;
u32 nritems;
int ret;
while (1) {
if (path->slots[0] == 0) {
ret = btrfs_prev_leaf(root, path);
if (ret != 0)
return ret;
} else {
path->slots[0]--;
}
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
if (nritems == 0)
return 1;
if (path->slots[0] == nritems)
path->slots[0]--;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid < min_objectid)
break;
if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
found_key.type == BTRFS_METADATA_ITEM_KEY)
return 0;
if (found_key.objectid == min_objectid &&
found_key.type < BTRFS_EXTENT_ITEM_KEY)
break;
}
return 1;
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef IOPRIO_H
#define IOPRIO_H
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/iocontext.h>
#include <uapi/linux/ioprio.h>
/*
* Default IO priority.
*/
#define IOPRIO_DEFAULT IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM)
/*
* Check that a priority value has a valid class.
*/
static inline bool ioprio_valid(unsigned short ioprio)
{
unsigned short class = IOPRIO_PRIO_CLASS(ioprio);
return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE;
}
/*
* if process has set io priority explicitly, use that. if not, convert
* the cpu scheduler nice value to an io priority
*/
static inline int task_nice_ioprio(struct task_struct *task)
{
return (task_nice(task) + 20) / 5;
}
/*
* This is for the case where the task hasn't asked for a specific IO class.
* Check for idle and rt task process, and return appropriate IO class.
*/
static inline int task_nice_ioclass(struct task_struct *task)
{
if (task->policy == SCHED_IDLE)
return IOPRIO_CLASS_IDLE;
else if (task_is_realtime(task))
return IOPRIO_CLASS_RT;
else
return IOPRIO_CLASS_BE;
}
/*
* If the calling process has set an I/O priority, use that. Otherwise, return
* the default I/O priority.
*/
static inline int get_current_ioprio(void)
{
struct io_context *ioc = current->io_context;
if (ioc)
return ioc->ioprio;
return IOPRIO_DEFAULT;
}
/*
* For inheritance, return the highest of the two given priorities
*/
extern int ioprio_best(unsigned short aprio, unsigned short bprio);
extern int set_task_ioprio(struct task_struct *task, int ioprio);
#ifdef CONFIG_BLOCK
extern int ioprio_check_cap(int ioprio);
#else
static inline int ioprio_check_cap(int ioprio)
{
return -ENOTBLK;
}
#endif /* CONFIG_BLOCK */
#endif
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) STRATO AG 2012. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
#include "block-group.h"
/*
* Device replace overview
*
* [Objective]
* To copy all extents (both new and on-disk) from source device to target
* device, while still keeping the filesystem read-write.
*
* [Method]
* There are two main methods involved:
*
* - Write duplication
*
* All new writes will be written to both target and source devices, so even
* if replace gets canceled, sources device still contains up-to-date data.
*
* Location: handle_ops_on_dev_replace() from __btrfs_map_block()
* Start: btrfs_dev_replace_start()
* End: btrfs_dev_replace_finishing()
* Content: Latest data/metadata
*
* - Copy existing extents
*
* This happens by re-using scrub facility, as scrub also iterates through
* existing extents from commit root.
*
* Location: scrub_write_block_to_dev_replace() from
* scrub_block_complete()
* Content: Data/meta from commit root.
*
* Due to the content difference, we need to avoid nocow write when dev-replace
* is happening. This is done by marking the block group read-only and waiting
* for NOCOW writes.
*
* After replace is done, the finishing part is done by swapping the target and
* source devices.
*
* Location: btrfs_dev_replace_update_device_in_mapping_tree() from
* btrfs_dev_replace_finishing()
*/
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
struct btrfs_key key;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct extent_buffer *eb;
int slot;
int ret = 0;
struct btrfs_path *path = NULL;
int item_size;
struct btrfs_dev_replace_item *ptr;
u64 src_devid;
if (!dev_root)
return 0;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
if (ret) {
no_valid_dev_replace_entry_found:
/*
* We don't have a replace item or it's corrupted. If there is
* a replace target, fail the mount.
*/
if (btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
ret = -EUCLEAN;
goto out;
}
ret = 0;
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->cont_reading_from_srcdev_mode =
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
dev_replace->time_started = 0;
dev_replace->time_stopped = 0;
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
dev_replace->cursor_left = 0;
dev_replace->committed_cursor_left = 0;
dev_replace->cursor_left_last_write_of_item = 0;
dev_replace->cursor_right = 0;
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
dev_replace->is_valid = 0;
dev_replace->item_needs_writeback = 0;
goto out;
}
slot = path->slots[0];
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, slot);
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
if (item_size != sizeof(struct btrfs_dev_replace_item)) {
btrfs_warn(fs_info,
"dev_replace entry found has unexpected size, ignore entry");
goto no_valid_dev_replace_entry_found;
}
src_devid = btrfs_dev_replace_src_devid(eb, ptr);
dev_replace->cont_reading_from_srcdev_mode =
btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
dev_replace->time_stopped =
btrfs_dev_replace_time_stopped(eb, ptr);
atomic64_set(&dev_replace->num_write_errors,
btrfs_dev_replace_num_write_errors(eb, ptr));
atomic64_set(&dev_replace->num_uncorrectable_read_errors,
btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
dev_replace->committed_cursor_left = dev_replace->cursor_left;
dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
dev_replace->is_valid = 1;
dev_replace->item_needs_writeback = 0;
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
/*
* We don't have an active replace item but if there is a
* replace target, fail the mount.
*/
if (btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
btrfs_err(fs_info,
"replace devid present without an active replace item");
ret = -EUCLEAN;
} else {
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
}
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
src_devid, NULL, NULL);
dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID,
NULL, NULL);
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
*/
if (!dev_replace->srcdev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
btrfs_warn(fs_info,
"srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
src_devid);
}
if (!dev_replace->tgtdev && !btrfs_test_opt(fs_info, DEGRADED)) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
btrfs_warn(fs_info,
"tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
BTRFS_DEV_REPLACE_DEVID);
}
if (dev_replace->tgtdev) {
if (dev_replace->srcdev) {
dev_replace->tgtdev->total_bytes =
dev_replace->srcdev->total_bytes;
dev_replace->tgtdev->disk_total_bytes =
dev_replace->srcdev->disk_total_bytes;
dev_replace->tgtdev->commit_total_bytes =
dev_replace->srcdev->commit_total_bytes;
dev_replace->tgtdev->bytes_used =
dev_replace->srcdev->bytes_used;
dev_replace->tgtdev->commit_bytes_used =
dev_replace->srcdev->commit_bytes_used;
}
set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
&dev_replace->tgtdev->dev_state); WARN_ON(fs_info->fs_devices->rw_devices == 0); dev_replace->tgtdev->io_width = fs_info->sectorsize;
dev_replace->tgtdev->io_align = fs_info->sectorsize;
dev_replace->tgtdev->sector_size = fs_info->sectorsize;
dev_replace->tgtdev->fs_info = fs_info;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&dev_replace->tgtdev->dev_state);
}
break;
}
out:
btrfs_free_path(path); return ret;
}
/*
* Initialize a new device for device replace target from a given source dev
* and path.
*
* Return 0 and new device in @device_out, otherwise return < 0
*/
static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
const char *device_path,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
struct btrfs_device *device;
struct block_device *bdev;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
*device_out = NULL;
if (srcdev->fs_devices->seeding) {
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
}
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
fs_info->bdev_holder);
if (IS_ERR(bdev)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
return PTR_ERR(bdev);
}
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
btrfs_err(fs_info,
"dev-replace: zoned type of target device mismatch with filesystem");
ret = -EINVAL;
goto error;
}
sync_blockdev(bdev);
list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
ret = -EEXIST;
goto error;
}
}
if (i_size_read(bdev->bd_inode) <
btrfs_device_get_total_bytes(srcdev)) {
btrfs_err(fs_info,
"target device is smaller than source device!");
ret = -EINVAL;
goto error;
}
device = btrfs_alloc_device(NULL, &devid, NULL);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
btrfs_free_device(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
device->total_bytes = btrfs_device_get_total_bytes(srcdev);
device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
device->bytes_used = btrfs_device_get_bytes_used(srcdev);
device->commit_total_bytes = srcdev->commit_total_bytes;
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_info->fs_devices;
ret = btrfs_get_dev_zone_info(device, false);
if (ret)
goto error;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
fs_info->fs_devices->open_devices++;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
*device_out = device;
return 0;
error:
blkdev_put(bdev, FMODE_EXCL);
return ret;
}
/*
* called from commit_transaction. Writes changed device replace state to
* disk.
*/
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_dev_replace_item *ptr;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
down_read(&dev_replace->rwsem);
if (!dev_replace->is_valid ||
!dev_replace->item_needs_writeback) { up_read(&dev_replace->rwsem); return 0;
}
up_read(&dev_replace->rwsem);
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
key.offset = 0;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
btrfs_warn(fs_info,
"error %d while searching for dev_replace item!",
ret);
goto out;
}
if (ret == 0 && btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
/*
* need to delete old one and insert a new one.
* Since no attempt is made to recover any old state, if the
* dev_replace state is 'running', the data on the target
* drive is lost.
* It would be possible to recover the state: just make sure
* that the beginning of the item is never changed and always
* contains all the essential information. Then read this
* minimal set of information and use it as a base for the
* new state.
*/
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
btrfs_warn(fs_info,
"delete too small dev_replace item failed %d!",
ret);
goto out;
}
ret = 1;
}
if (ret == 1) {
/* need to insert a new item */
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
btrfs_warn(fs_info,
"insert dev_replace item failed %d!", ret);
goto out;
}
}
eb = path->nodes[0];
ptr = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_dev_replace_item);
down_write(&dev_replace->rwsem);
if (dev_replace->srcdev)
btrfs_set_dev_replace_src_devid(eb, ptr,
dev_replace->srcdev->devid);
else
btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
dev_replace->cont_reading_from_srcdev_mode);
btrfs_set_dev_replace_replace_state(eb, ptr,
dev_replace->replace_state);
btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
btrfs_set_dev_replace_num_write_errors(eb, ptr,
atomic64_read(&dev_replace->num_write_errors));
btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
atomic64_read(&dev_replace->num_uncorrectable_read_errors));
dev_replace->cursor_left_last_write_of_item =
dev_replace->cursor_left;
btrfs_set_dev_replace_cursor_left(eb, ptr,
dev_replace->cursor_left_last_write_of_item);
btrfs_set_dev_replace_cursor_right(eb, ptr,
dev_replace->cursor_right);
dev_replace->item_needs_writeback = 0;
up_write(&dev_replace->rwsem);
btrfs_mark_buffer_dirty(eb);
out:
btrfs_free_path(path);
return ret;
}
static char* btrfs_dev_name(struct btrfs_device *device)
{
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
return "<missing disk>";
else
return rcu_str_deref(device->name);
}
static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
struct btrfs_device *src_dev)
{
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_block_group *cache;
struct btrfs_trans_handle *trans;
int ret = 0;
u64 chunk_offset;
/* Do not use "to_copy" on non zoned filesystem for now */
if (!btrfs_is_zoned(fs_info))
return 0;
mutex_lock(&fs_info->chunk_mutex);
/* Ensure we don't have pending new block group */
spin_lock(&fs_info->trans_lock);
while (fs_info->running_transaction &&
!list_empty(&fs_info->running_transaction->dev_update_list)) {
spin_unlock(&fs_info->trans_lock);
mutex_unlock(&fs_info->chunk_mutex);
trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
mutex_lock(&fs_info->chunk_mutex);
if (ret == -ENOENT) {
spin_lock(&fs_info->trans_lock);
continue;
} else {
goto unlock;
}
}
ret = btrfs_commit_transaction(trans);
mutex_lock(&fs_info->chunk_mutex);
if (ret)
goto unlock;
spin_lock(&fs_info->trans_lock);
}
spin_unlock(&fs_info->trans_lock);
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto unlock;
}
path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
key.objectid = src_dev->devid;
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto free_path;
if (ret > 0) {
if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto free_path;
if (ret > 0) {
ret = 0;
goto free_path;
}
} else {
ret = 0;
}
}
while (1) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != src_dev->devid)
break;
if (found_key.type != BTRFS_DEV_EXTENT_KEY)
break;
if (found_key.offset < key.offset)
break;
dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!cache)
goto skip;
spin_lock(&cache->lock);
cache->to_copy = 1;
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
skip:
ret = btrfs_next_item(root, path);
if (ret != 0) {
if (ret > 0)
ret = 0;
break;
}
}
free_path:
btrfs_free_path(path);
unlock:
mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
struct btrfs_block_group *cache,
u64 physical)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct extent_map *em;
struct map_lookup *map;
u64 chunk_offset = cache->start;
int num_extents, cur_extent;
int i;
/* Do not use "to_copy" on non zoned filesystem for now */
if (!btrfs_is_zoned(fs_info))
return true;
spin_lock(&cache->lock);
if (cache->removed) {
spin_unlock(&cache->lock);
return true;
}
spin_unlock(&cache->lock);
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
ASSERT(!IS_ERR(em));
map = em->map_lookup;
num_extents = cur_extent = 0;
for (i = 0; i < map->num_stripes; i++) {
/* We have more device extent to copy */
if (srcdev != map->stripes[i].dev)
continue;
num_extents++;
if (physical == map->stripes[i].physical)
cur_extent = i;
}
free_extent_map(em);
if (num_extents > 1 && cur_extent < num_extents - 1) {
/*
* Has more stripes on this device. Keep this block group
* readonly until we finish all the stripes.
*/
return false;
}
/* Last stripe on this device */
spin_lock(&cache->lock);
cache->to_copy = 0;
spin_unlock(&cache->lock);
return true;
}
static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
{
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int ret;
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
srcdev_name);
if (IS_ERR(src_device))
return PTR_ERR(src_device);
if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
btrfs_warn_in_rcu(fs_info,
"cannot replace device %s (devid %llu) due to active swapfile",
btrfs_dev_name(src_device), src_device->devid);
return -ETXTBSY;
}
/*
* Here we commit the transaction to make sure commit_total_bytes
* of all the devices are updated.
*/
trans = btrfs_attach_transaction(root);
if (!IS_ERR(trans)) {
ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
} else if (PTR_ERR(trans) != -ENOENT) {
return PTR_ERR(trans);
}
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
if (ret)
return ret;
ret = mark_block_group_to_copy(fs_info, src_device);
if (ret)
return ret;
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
ASSERT(0);
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
up_write(&dev_replace->rwsem);
goto leave;
}
dev_replace->cont_reading_from_srcdev_mode = read_src;
dev_replace->srcdev = src_device;
dev_replace->tgtdev = tgt_device;
btrfs_info_in_rcu(fs_info,
"dev_replace from %s (devid %llu) to %s started",
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name));
/*
* from now on, the writes to the srcdev are all duplicated to
* go to the tgtdev as well (refer to btrfs_map_block()).
*/
dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
dev_replace->time_started = ktime_get_real_seconds();
dev_replace->cursor_left = 0;
dev_replace->committed_cursor_left = 0;
dev_replace->cursor_left_last_write_of_item = 0;
dev_replace->cursor_right = 0;
dev_replace->is_valid = 1;
dev_replace->item_needs_writeback = 1;
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
ret = btrfs_sysfs_add_device(tgt_device);
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
/* Commit dev_replace state and reserve 1 item for it. */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
up_write(&dev_replace->rwsem);
goto leave;
}
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
/* the disk copy procedure reuses the scrub code */
ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
btrfs_device_get_total_bytes(src_device),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
if (ret == -EINPROGRESS)
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
return ret;
leave:
btrfs_destroy_dev_replace_tgtdev(tgt_device);
return ret;
}
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
int ret;
switch (args->start.cont_reading_from_srcdev_mode) {
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
break;
default:
return -EINVAL;
}
if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
args->start.tgtdev_name[0] == '\0')
return -EINVAL;
ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
args->start.srcdevid,
args->start.srcdev_name,
args->start.cont_reading_from_srcdev_mode);
args->result = ret;
/* don't warn if EINPROGRESS, someone else might be running scrub */
if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
return 0;
return ret;
}
/*
* blocked until all in-flight bios operations are finished.
*/
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
&fs_info->dev_replace.bio_counter));
}
/*
* we have removed target device, it is safe to allow new bios request.
*/
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
wake_up(&fs_info->dev_replace.replace_wait);
}
/*
* When finishing the device replace, before swapping the source device with the
* target device we must update the chunk allocation state in the target device,
* as it is empty because replace works by directly copying the chunks and not
* through the normal chunk allocation path.
*/
static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
struct btrfs_device *tgtdev)
{
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 found_start;
u64 found_end;
int ret = 0;
lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
while (!find_first_extent_bit(&srcdev->alloc_state, start,
&found_start, &found_end,
CHUNK_ALLOCATED, &cached_state)) {
ret = set_extent_bits(&tgtdev->alloc_state, found_start,
found_end, CHUNK_ALLOCATED);
if (ret)
break;
start = found_end + 1;
}
free_extent_state(cached_state);
return ret;
}
static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
u64 start = 0;
int i;
write_lock(&em_tree->lock);
do {
em = lookup_extent_mapping(em_tree, start, (u64)-1);
if (!em)
break;
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
start = em->start + em->len;
free_extent_map(em);
} while (start);
write_unlock(&em_tree->lock);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *tgt_device;
struct btrfs_device *src_device;
struct btrfs_root *root = fs_info->tree_root;
u8 uuid_tmp[BTRFS_UUID_SIZE];
struct btrfs_trans_handle *trans;
int ret = 0;
/* don't allow cancel or unmount to disturb the finishing procedure */
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
down_read(&dev_replace->rwsem);
/* was the operation canceled, or is it finished? */
if (dev_replace->replace_state !=
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
up_read(&dev_replace->rwsem);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return 0;
}
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
up_read(&dev_replace->rwsem);
/*
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
if (!scrub_ret)
btrfs_reada_remove_dev(src_device);
/*
* We have to use this loop approach because at this point src_device
* has to be available for transaction commit to complete, yet new
* chunks shouldn't be allocated on the device.
*/
while (1) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_reada_undo_remove_dev(src_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return PTR_ERR(trans);
}
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
/* Prevent write_all_supers() during the finishing procedure */
mutex_lock(&fs_info->fs_devices->device_list_mutex);
/* Prevent new chunks being allocated on the source device */
mutex_lock(&fs_info->chunk_mutex);
if (!list_empty(&src_device->post_commit_list)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&fs_info->chunk_mutex);
} else {
break;
}
}
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
: BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
/*
* Update allocation state in the new device and replace the old device
* with the new one in the mapping tree.
*/
if (!scrub_ret) {
scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
if (scrub_ret)
goto error;
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
src_device,
tgt_device);
} else {
if (scrub_ret != -ECANCELED)
btrfs_err_in_rcu(fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_reada_undo_remove_dev(src_device);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
btrfs_rm_dev_replace_unblocked(fs_info);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return scrub_ret;
}
btrfs_info_in_rcu(fs_info,
"dev_replace from %s (devid %llu) to %s finished",
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name));
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
tgt_device->devid = src_device->devid;
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
btrfs_device_set_disk_total_bytes(tgt_device,
src_device->disk_total_bytes);
btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
tgt_device->commit_bytes_used = src_device->bytes_used;
btrfs_assign_next_active_device(src_device, tgt_device);
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
fs_info->fs_devices->rw_devices++;
up_write(&dev_replace->rwsem);
btrfs_rm_dev_replace_blocked(fs_info);
btrfs_rm_dev_replace_remove_srcdev(src_device);
btrfs_rm_dev_replace_unblocked(fs_info);
/*
* Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
* update on-disk dev stats value during commit transaction
*/
atomic_inc(&tgt_device->dev_stats_ccnt);
/*
* this is again a consistent state where no dev_replace procedure
* is running, the target device is part of the filesystem, the
* source device is not part of the filesystem anymore and its 1st
* superblock is scratched out so that it is no longer marked to
* belong to this filesystem.
*/
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* replace the sysfs entry */
btrfs_sysfs_remove_device(src_device);
btrfs_sysfs_update_devid(tgt_device);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
btrfs_scratch_superblocks(fs_info, src_device->bdev,
src_device->name->str);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
if (!IS_ERR(trans))
btrfs_commit_transaction(trans);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
btrfs_rm_dev_replace_free_srcdev(src_device);
return 0;
}
/*
* Read progress of device replace status according to the state and last
* stored position. The value format is the same as for
* btrfs_dev_replace::progress_1000
*/
static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
u64 ret = 0;
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
ret = 0;
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
ret = 1000;
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
ret = div64_u64(dev_replace->cursor_left,
div_u64(btrfs_device_get_total_bytes(
dev_replace->srcdev), 1000));
break;
}
return ret;
}
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
down_read(&dev_replace->rwsem);
/* even if !dev_replace_is_valid, the values are good enough for
* the replace_status ioctl */
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
args->status.replace_state = dev_replace->replace_state;
args->status.time_started = dev_replace->time_started;
args->status.time_stopped = dev_replace->time_stopped;
args->status.num_write_errors =
atomic64_read(&dev_replace->num_write_errors);
args->status.num_uncorrectable_read_errors =
atomic64_read(&dev_replace->num_uncorrectable_read_errors);
args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
up_read(&dev_replace->rwsem);
}
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = fs_info->tree_root;
int result;
int ret;
if (sb_rdonly(fs_info->sb))
return -EROFS;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
up_write(&dev_replace->rwsem);
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
up_write(&dev_replace->rwsem);
ret = btrfs_scrub_cancel(fs_info);
if (ret < 0) {
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
} else {
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
/*
* btrfs_dev_replace_finishing() will handle the
* cleanup part
*/
btrfs_info_in_rcu(fs_info,
"dev_replace from %s (devid %llu) to %s canceled",
btrfs_dev_name(src_device), src_device->devid,
btrfs_dev_name(tgt_device));
}
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
/*
* Scrub doing the replace isn't running so we need to do the
* cleanup step of btrfs_dev_replace_finishing() here
*/
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
up_write(&dev_replace->rwsem);
/* Scrub for replace must not be running in suspended state */
ret = btrfs_scrub_cancel(fs_info);
ASSERT(ret != -ENOTCONN);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return PTR_ERR(trans);
}
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
btrfs_info_in_rcu(fs_info,
"suspended dev_replace from %s (devid %llu) to %s canceled",
btrfs_dev_name(src_device), src_device->devid,
btrfs_dev_name(tgt_device));
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
break;
default:
up_write(&dev_replace->rwsem);
result = -EINVAL;
}
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return result;
}
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
btrfs_info(fs_info, "suspending dev_replace for unmount");
break;
}
up_write(&dev_replace->rwsem);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
}
/* resume dev_replace procedure that was interrupted by unmount */
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
{
struct task_struct *task;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
up_write(&dev_replace->rwsem); return 0;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
break;
}
if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
btrfs_info(fs_info,
"cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
"you may cancel the operation after 'mount -o degraded'");
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
up_write(&dev_replace->rwsem);
return 0;
}
up_write(&dev_replace->rwsem);
/*
* This could collide with a paused balance, but the exclusive op logic
* should never allow both to start and pause. We don't want to allow
* dev-replace to start anyway.
*/
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
up_write(&dev_replace->rwsem);
btrfs_info(fs_info,
"cannot resume dev-replace, other exclusive operation running");
return 0;
}
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
return PTR_ERR_OR_ZERO(task);
}
static int btrfs_dev_replace_kthread(void *data)
{
struct btrfs_fs_info *fs_info = data;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
u64 progress;
int ret;
progress = btrfs_dev_replace_progress(fs_info);
progress = div_u64(progress, 10);
btrfs_info_in_rcu(fs_info,
"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
btrfs_dev_name(dev_replace->srcdev),
dev_replace->srcdev->devid,
btrfs_dev_name(dev_replace->tgtdev),
(unsigned int)progress);
ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
dev_replace->committed_cursor_left,
btrfs_device_get_total_bytes(dev_replace->srcdev),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret && ret != -ECANCELED);
btrfs_exclop_finish(fs_info);
return 0;
}
int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
{
if (!dev_replace->is_valid)
return 0;
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
return 0;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
/*
* return true even if tgtdev is missing (this is
* something that can happen if the dev_replace
* procedure is suspended by an umount and then
* the tgtdev is missing (or "btrfs dev scan") was
* not called and the filesystem is remounted
* in degraded state. This does not stop the
* dev_replace procedure. It needs to be canceled
* manually if the cancellation is wanted.
*/
break;
}
return 1;
}
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
percpu_counter_inc(&fs_info->dev_replace.bio_counter);
}
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount); cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
}
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
while (1) {
percpu_counter_inc(&fs_info->dev_replace.bio_counter);
if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state)))
break;
btrfs_bio_counter_dec(fs_info);
wait_event(fs_info->dev_replace.replace_wait,
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state));
}
}
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk).
*
* (C) SGI 2006, Christoph Lameter
* Cleaned up and restructured to ease the addition of alternative
* implementations of SLAB allocators.
* (C) Linux Foundation 2008-2013
* Unified interface for all slab allocators
*/
#ifndef _LINUX_SLAB_H
#define _LINUX_SLAB_H
#include <linux/gfp.h>
#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/percpu-refcount.h>
/*
* Flags to pass to kmem_cache_create().
* The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set.
*/
/* DEBUG: Perform (expensive) checks on alloc/free */
#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U)
/* DEBUG: Red zone objs in a cache */
#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U)
/* DEBUG: Poison objects */
#define SLAB_POISON ((slab_flags_t __force)0x00000800U)
/* Align objs on cache lines */
#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U)
/* Use GFP_DMA memory */
#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000U)
/* Use GFP_DMA32 memory */
#define SLAB_CACHE_DMA32 ((slab_flags_t __force)0x00008000U)
/* DEBUG: Store the last owner for bug hunting */
#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000U)
/* Panic if kmem_cache_create() fails */
#define SLAB_PANIC ((slab_flags_t __force)0x00040000U)
/*
* SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
*
* This delays freeing the SLAB page by a grace period, it does _NOT_
* delay object freeing. This means that if you do kmem_cache_free()
* that memory location is free to be reused at any time. Thus it may
* be possible to see another object there in the same RCU grace period.
*
* This feature only ensures the memory location backing the object
* stays valid, the trick to using this is relying on an independent
* object validation pass. Something like:
*
* rcu_read_lock()
* again:
* obj = lockless_lookup(key);
* if (obj) {
* if (!try_get_ref(obj)) // might fail for free objects
* goto again;
*
* if (obj->key != key) { // not the object we expected
* put_ref(obj);
* goto again;
* }
* }
* rcu_read_unlock();
*
* This is useful if we need to approach a kernel structure obliquely,
* from its address obtained without the usual locking. We can lock
* the structure to stabilize it and check it's still at the given address,
* only if we can be sure that the memory has not been meanwhile reused
* for some other kind of object (which our subsystem's lock might corrupt).
*
* rcu_read_lock before reading the address, then rcu_read_unlock after
* taking the spinlock within the structure expected at that address.
*
* Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
*/
/* Defer freeing slabs to RCU */
#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000U)
/* Spread some memory over cpuset */
#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000U)
/* Trace allocations and frees */
#define SLAB_TRACE ((slab_flags_t __force)0x00200000U)
/* Flag to prevent checks on free */
#ifdef CONFIG_DEBUG_OBJECTS
# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000U)
#else
# define SLAB_DEBUG_OBJECTS 0
#endif
/* Avoid kmemleak tracing */
#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000U)
/* Fault injection mark */
#ifdef CONFIG_FAILSLAB
# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000U)
#else
# define SLAB_FAILSLAB 0
#endif
/* Account to memcg */
#ifdef CONFIG_MEMCG_KMEM
# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U)
#else
# define SLAB_ACCOUNT 0
#endif
#ifdef CONFIG_KASAN
#define SLAB_KASAN ((slab_flags_t __force)0x08000000U)
#else
#define SLAB_KASAN 0
#endif
/* The following flags affect the page allocator grouping pages by mobility */
/* Objects are reclaimable */
#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U)
#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
/* Slab deactivation flag */
#define SLAB_DEACTIVATED ((slab_flags_t __force)0x10000000U)
/*
* ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
*
* Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
*
* ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can.
* Both make kfree a no-op.
*/
#define ZERO_SIZE_PTR ((void *)16)
#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
(unsigned long)ZERO_SIZE_PTR)
#include <linux/kasan.h>
struct mem_cgroup;
/*
* struct kmem_cache related prototypes
*/
void __init kmem_cache_init(void);
bool slab_is_available(void);
extern bool usercopy_fallback;
struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
unsigned int align, slab_flags_t flags,
void (*ctor)(void *));
struct kmem_cache *kmem_cache_create_usercopy(const char *name,
unsigned int size, unsigned int align,
slab_flags_t flags,
unsigned int useroffset, unsigned int usersize,
void (*ctor)(void *));
void kmem_cache_destroy(struct kmem_cache *);
int kmem_cache_shrink(struct kmem_cache *);
/*
* Please use this macro to create slab caches. Simply specify the
* name of the structure and maybe some flags that are listed above.
*
* The alignment of the struct determines object alignment. If you
* f.e. add ____cacheline_aligned_in_smp to the struct declaration
* then the objects will be properly aligned in SMP configurations.
*/
#define KMEM_CACHE(__struct, __flags) \
kmem_cache_create(#__struct, sizeof(struct __struct), \
__alignof__(struct __struct), (__flags), NULL)
/*
* To whitelist a single field for copying to/from usercopy, use this
* macro instead for KMEM_CACHE() above.
*/
#define KMEM_CACHE_USERCOPY(__struct, __flags, __field) \
kmem_cache_create_usercopy(#__struct, \
sizeof(struct __struct), \
__alignof__(struct __struct), (__flags), \
offsetof(struct __struct, __field), \
sizeof_field(struct __struct, __field), NULL)
/*
* Common kmalloc functions provided by all allocators
*/
void * __must_check krealloc(const void *, size_t, gfp_t);
void kfree(const void *);
void kfree_sensitive(const void *);
size_t __ksize(const void *);
size_t ksize(const void *);
#ifdef CONFIG_PRINTK
bool kmem_valid_obj(void *object);
void kmem_dump_obj(void *object);
#endif
#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
bool to_user);
#else
static inline void __check_heap_object(const void *ptr, unsigned long n,
struct page *page, bool to_user) { }
#endif
/*
* Some archs want to perform DMA into kmalloc caches and need a guaranteed
* alignment larger than the alignment of a 64-bit integer.
* Setting ARCH_KMALLOC_MINALIGN in arch headers allows that.
*/
#if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
#else
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif
/*
* Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
* Intended for arches that get misalignment faults even for 64 bit integer
* aligned buffers.
*/
#ifndef ARCH_SLAB_MINALIGN
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif
/*
* kmalloc and friends return ARCH_KMALLOC_MINALIGN aligned
* pointers. kmem_cache_alloc and friends return ARCH_SLAB_MINALIGN
* aligned pointers.
*/
#define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN)
#define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN)
#define __assume_page_alignment __assume_aligned(PAGE_SIZE)
/*
* Kmalloc array related definitions
*/
#ifdef CONFIG_SLAB
/*
* The largest kmalloc size supported by the SLAB allocators is
* 32 megabyte (2^25) or the maximum allocatable page order if that is
* less than 32 MB.
*
* WARNING: Its not easy to increase this value since the allocators have
* to do various tricks to work around compiler limitations in order to
* ensure proper constant folding.
*/
#define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
(MAX_ORDER + PAGE_SHIFT - 1) : 25)
#define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 5
#endif
#endif
#ifdef CONFIG_SLUB
/*
* SLUB directly allocates requests fitting in to an order-1 page
* (PAGE_SIZE*2). Larger requests are passed to the page allocator.
*/
#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 3
#endif
#endif
#ifdef CONFIG_SLOB
/*
* SLOB passes all requests larger than one page to the page allocator.
* No kmalloc array is necessary since objects of different sizes can
* be allocated from the same page.
*/
#define KMALLOC_SHIFT_HIGH PAGE_SHIFT
#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 3
#endif
#endif
/* Maximum allocatable size */
#define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX)
/* Maximum size for which we actually use a slab cache */
#define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH)
/* Maximum order allocatable via the slab allocator */
#define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT)
/*
* Kmalloc subsystem.
*/
#ifndef KMALLOC_MIN_SIZE
#define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
#endif
/*
* This restriction comes from byte sized index implementation.
* Page size is normally 2^12 bytes and, in this case, if we want to use
* byte sized index which can represent 2^8 entries, the size of the object
* should be equal or greater to 2^12 / 2^8 = 2^4 = 16.
* If minimum size of kmalloc is less than 16, we use it as minimum object
* size and give up to use byte sized index.
*/
#define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \
(KMALLOC_MIN_SIZE) : 16)
/*
* Whenever changing this, take care of that kmalloc_type() and
* create_kmalloc_caches() still work as intended.
*
* KMALLOC_NORMAL can contain only unaccounted objects whereas KMALLOC_CGROUP
* is for accounted but unreclaimable and non-dma objects. All the other
* kmem caches can have both accounted and unaccounted objects.
*/
enum kmalloc_cache_type {
KMALLOC_NORMAL = 0,
#ifndef CONFIG_ZONE_DMA
KMALLOC_DMA = KMALLOC_NORMAL,
#endif
#ifndef CONFIG_MEMCG_KMEM
KMALLOC_CGROUP = KMALLOC_NORMAL,
#else
KMALLOC_CGROUP,
#endif
KMALLOC_RECLAIM,
#ifdef CONFIG_ZONE_DMA
KMALLOC_DMA,
#endif
NR_KMALLOC_TYPES
};
#ifndef CONFIG_SLOB
extern struct kmem_cache *
kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
/*
* Define gfp bits that should not be set for KMALLOC_NORMAL.
*/
#define KMALLOC_NOT_NORMAL_BITS \
(__GFP_RECLAIMABLE | \
(IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \
(IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))
static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
{
/*
* The most common case is KMALLOC_NORMAL, so test for it
* with a single branch for all the relevant flags.
*/
if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0))
return KMALLOC_NORMAL;
/*
* At least one of the flags has to be set. Their priorities in
* decreasing order are:
* 1) __GFP_DMA
* 2) __GFP_RECLAIMABLE
* 3) __GFP_ACCOUNT
*/
if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA))
return KMALLOC_DMA;
if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE))
return KMALLOC_RECLAIM;
else
return KMALLOC_CGROUP;
}
/*
* Figure out which kmalloc slab an allocation of a certain size
* belongs to.
* 0 = zero alloc
* 1 = 65 .. 96 bytes
* 2 = 129 .. 192 bytes
* n = 2^(n-1)+1 .. 2^n
*
* Note: __kmalloc_index() is compile-time optimized, and not runtime optimized;
* typical usage is via kmalloc_index() and therefore evaluated at compile-time.
* Callers where !size_is_constant should only be test modules, where runtime
* overheads of __kmalloc_index() can be tolerated. Also see kmalloc_slab().
*/
static __always_inline unsigned int __kmalloc_index(size_t size,
bool size_is_constant)
{
if (!size)
return 0;
if (size <= KMALLOC_MIN_SIZE)
return KMALLOC_SHIFT_LOW;
if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
return 1;
if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
return 2;
if (size <= 8) return 3;
if (size <= 16) return 4;
if (size <= 32) return 5;
if (size <= 64) return 6;
if (size <= 128) return 7;
if (size <= 256) return 8;
if (size <= 512) return 9;
if (size <= 1024) return 10;
if (size <= 2 * 1024) return 11;
if (size <= 4 * 1024) return 12;
if (size <= 8 * 1024) return 13;
if (size <= 16 * 1024) return 14;
if (size <= 32 * 1024) return 15;
if (size <= 64 * 1024) return 16;
if (size <= 128 * 1024) return 17;
if (size <= 256 * 1024) return 18;
if (size <= 512 * 1024) return 19;
if (size <= 1024 * 1024) return 20;
if (size <= 2 * 1024 * 1024) return 21;
if (size <= 4 * 1024 * 1024) return 22;
if (size <= 8 * 1024 * 1024) return 23;
if (size <= 16 * 1024 * 1024) return 24;
if (size <= 32 * 1024 * 1024) return 25;
if ((IS_ENABLED(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 110000)
&& !IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
else
BUG();
/* Will never be reached. Needed because the compiler may complain */
return -1;
}
#define kmalloc_index(s) __kmalloc_index(s, true)
#endif /* !CONFIG_SLOB */
void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
void kmem_cache_free(struct kmem_cache *, void *);
/*
* Bulk allocation and freeing operations. These are accelerated in an
* allocator specific way to avoid taking locks repeatedly or building
* metadata structures unnecessarily.
*
* Note that interrupts must be enabled when calling these functions.
*/
void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
/*
* Caller must not use kfree_bulk() on memory not originally allocated
* by kmalloc(), because the SLOB allocator cannot handle this.
*/
static __always_inline void kfree_bulk(size_t size, void **p)
{
kmem_cache_free_bulk(NULL, size, p);
}
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
#else
static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
return __kmalloc(size, flags);
}
static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node)
{
return kmem_cache_alloc(s, flags);
}
#endif
#ifdef CONFIG_TRACING
extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;
#ifdef CONFIG_NUMA
extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
int node, size_t size) __assume_slab_alignment __malloc;
#else
static __always_inline void *
kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
int node, size_t size)
{
return kmem_cache_alloc_trace(s, gfpflags, size);
}
#endif /* CONFIG_NUMA */
#else /* CONFIG_TRACING */
static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
gfp_t flags, size_t size)
{
void *ret = kmem_cache_alloc(s, flags);
ret = kasan_kmalloc(s, ret, size, flags);
return ret;
}
static __always_inline void *
kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
int node, size_t size)
{
void *ret = kmem_cache_alloc_node(s, gfpflags, node);
ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
#endif /* CONFIG_TRACING */
extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
#ifdef CONFIG_TRACING
extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
#else
static __always_inline void *
kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
return kmalloc_order(size, flags, order);
}
#endif
static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
{
unsigned int order = get_order(size);
return kmalloc_order_trace(size, flags, order);
}
/**
* kmalloc - allocate memory
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
*
* kmalloc is the normal method of allocating memory
* for objects smaller than page size in the kernel.
*
* The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
* bytes. For @size of power of two bytes, the alignment is also guaranteed
* to be at least to the size.
*
* The @flags argument may be one of the GFP flags defined at
* include/linux/gfp.h and described at
* :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
*
* The recommended usage of the @flags is described at
* :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>`
*
* Below is a brief outline of the most useful GFP flags
*
* %GFP_KERNEL
* Allocate normal kernel ram. May sleep.
*
* %GFP_NOWAIT
* Allocation will not sleep.
*
* %GFP_ATOMIC
* Allocation will not sleep. May use emergency pools.
*
* %GFP_HIGHUSER
* Allocate memory from high memory on behalf of user.
*
* Also it is possible to set different flags by OR'ing
* in one or more of the following additional @flags:
*
* %__GFP_HIGH
* This allocation has high priority and may use emergency pools.
*
* %__GFP_NOFAIL
* Indicate that this allocation is in no way allowed to fail
* (think twice before using).
*
* %__GFP_NORETRY
* If memory is not immediately available,
* then give up at once.
*
* %__GFP_NOWARN
* If allocation fails, don't issue any warnings.
*
* %__GFP_RETRY_MAYFAIL
* Try really hard to succeed the allocation but fail
* eventually.
*/
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
if (__builtin_constant_p(size)) {
#ifndef CONFIG_SLOB
unsigned int index;
#endif
if (size > KMALLOC_MAX_CACHE_SIZE)
return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
index = kmalloc_index(size);
if (!index)
return ZERO_SIZE_PTR;
return kmem_cache_alloc_trace(
kmalloc_caches[kmalloc_type(flags)][index],
flags, size);
#endif
}
return __kmalloc(size, flags);
}
static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
{
#ifndef CONFIG_SLOB
if (__builtin_constant_p(size) &&
size <= KMALLOC_MAX_CACHE_SIZE) {
unsigned int i = kmalloc_index(size);
if (!i)
return ZERO_SIZE_PTR;
return kmem_cache_alloc_node_trace(
kmalloc_caches[kmalloc_type(flags)][i],
flags, node, size);
}
#endif
return __kmalloc_node(size, flags, node);
}
/**
* kmalloc_array - allocate memory for an array.
* @n: number of elements.
* @size: element size.
* @flags: the type of memory to allocate (see kmalloc).
*/
static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
{
size_t bytes;
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
if (__builtin_constant_p(n) && __builtin_constant_p(size))
return kmalloc(bytes, flags);
return __kmalloc(bytes, flags);
}
/**
* krealloc_array - reallocate memory for an array.
* @p: pointer to the memory chunk to reallocate
* @new_n: new number of elements to alloc
* @new_size: new size of a single member of the array
* @flags: the type of memory to allocate (see kmalloc)
*/
static __must_check inline void *
krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags)
{
size_t bytes;
if (unlikely(check_mul_overflow(new_n, new_size, &bytes)))
return NULL;
return krealloc(p, bytes, flags);
}
/**
* kcalloc - allocate memory for an array. The memory is set to zero.
* @n: number of elements.
* @size: element size.
* @flags: the type of memory to allocate (see kmalloc).
*/
static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
{
return kmalloc_array(n, size, flags | __GFP_ZERO);
}
/*
* kmalloc_track_caller is a special version of kmalloc that records the
* calling function of the routine calling it for slab leak tracking instead
* of just the calling function (confusing, eh?).
* It's useful when the call to kmalloc comes from a widely-used standard
* allocator where we care about the real place the memory allocation
* request comes from.
*/
extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
#define kmalloc_track_caller(size, flags) \
__kmalloc_track_caller(size, flags, _RET_IP_)
static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
int node)
{
size_t bytes;
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
if (__builtin_constant_p(n) && __builtin_constant_p(size))
return kmalloc_node(bytes, flags, node);
return __kmalloc_node(bytes, flags, node);
}
static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
{
return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
}
#ifdef CONFIG_NUMA
extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
#define kmalloc_node_track_caller(size, flags, node) \
__kmalloc_node_track_caller(size, flags, node, \
_RET_IP_)
#else /* CONFIG_NUMA */
#define kmalloc_node_track_caller(size, flags, node) \
kmalloc_track_caller(size, flags)
#endif /* CONFIG_NUMA */
/*
* Shortcuts
*/
static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
{
return kmem_cache_alloc(k, flags | __GFP_ZERO);
}
/**
* kzalloc - allocate memory. The memory is set to zero.
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate (see kmalloc).
*/
static inline void *kzalloc(size_t size, gfp_t flags)
{
return kmalloc(size, flags | __GFP_ZERO);
}
/**
* kzalloc_node - allocate zeroed memory from a particular memory node.
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate (see kmalloc).
* @node: memory node from which to allocate
*/
static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
{
return kmalloc_node(size, flags | __GFP_ZERO, node);
}
unsigned int kmem_cache_size(struct kmem_cache *s);
void __init kmem_cache_init_late(void);
#if defined(CONFIG_SMP) && defined(CONFIG_SLAB)
int slab_prepare_cpu(unsigned int cpu);
int slab_dead_cpu(unsigned int cpu);
#else
#define slab_prepare_cpu NULL
#define slab_dead_cpu NULL
#endif
#endif /* _LINUX_SLAB_H */
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Derived from arch/ppc/mm/extable.c and arch/i386/mm/extable.c.
*
* Copyright (C) 2004 Paul Mackerras, IBM Corp.
*/
#include <linux/bsearch.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sort.h>
#include <linux/uaccess.h>
#include <linux/extable.h>
#ifndef ARCH_HAS_RELATIVE_EXTABLE
#define ex_to_insn(x) ((x)->insn)
#else
static inline unsigned long ex_to_insn(const struct exception_table_entry *x)
{
return (unsigned long)&x->insn + x->insn;
}
#endif
#ifndef ARCH_HAS_RELATIVE_EXTABLE
#define swap_ex NULL
#else
static void swap_ex(void *a, void *b, int size)
{
struct exception_table_entry *x = a, *y = b, tmp;
int delta = b - a;
tmp = *x;
x->insn = y->insn + delta;
y->insn = tmp.insn - delta;
#ifdef swap_ex_entry_fixup
swap_ex_entry_fixup(x, y, tmp, delta);
#else
x->fixup = y->fixup + delta;
y->fixup = tmp.fixup - delta;
#endif
}
#endif /* ARCH_HAS_RELATIVE_EXTABLE */
/*
* The exception table needs to be sorted so that the binary
* search that we use to find entries in it works properly.
* This is used both for the kernel exception table and for
* the exception tables of modules that get loaded.
*/
static int cmp_ex_sort(const void *a, const void *b)
{
const struct exception_table_entry *x = a, *y = b;
/* avoid overflow */
if (ex_to_insn(x) > ex_to_insn(y))
return 1;
if (ex_to_insn(x) < ex_to_insn(y))
return -1;
return 0;
}
void sort_extable(struct exception_table_entry *start,
struct exception_table_entry *finish)
{
sort(start, finish - start, sizeof(struct exception_table_entry),
cmp_ex_sort, swap_ex);
}
#ifdef CONFIG_MODULES
/*
* If the exception table is sorted, any referring to the module init
* will be at the beginning or the end.
*/
void trim_init_extable(struct module *m)
{
/*trim the beginning*/
while (m->num_exentries &&
within_module_init(ex_to_insn(&m->extable[0]), m)) {
m->extable++;
m->num_exentries--;
}
/*trim the end*/
while (m->num_exentries &&
within_module_init(ex_to_insn(&m->extable[m->num_exentries - 1]),
m))
m->num_exentries--;
}
#endif /* CONFIG_MODULES */
static int cmp_ex_search(const void *key, const void *elt)
{
const struct exception_table_entry *_elt = elt;
unsigned long _key = *(unsigned long *)key;
/* avoid overflow */
if (_key > ex_to_insn(_elt))
return 1;
if (_key < ex_to_insn(_elt))
return -1;
return 0;
}
/*
* Search one exception table for an entry corresponding to the
* given instruction address, and return the address of the entry,
* or NULL if none is found.
* We use a binary search, and thus we assume that the table is
* already sorted.
*/
const struct exception_table_entry *
search_extable(const struct exception_table_entry *base,
const size_t num,
unsigned long value)
{
return bsearch(&value, base, num,
sizeof(struct exception_table_entry), cmp_ex_search);
}
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/extable.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
#include <xen/xen.h>
#include <asm/fpu/internal.h>
#include <asm/sev.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
typedef bool (*ex_handler_t)(const struct exception_table_entry *,
struct pt_regs *, int, unsigned long,
unsigned long);
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
return (unsigned long)&x->fixup + x->fixup;
}
static inline ex_handler_t
ex_fixup_handler(const struct exception_table_entry *x)
{
return (ex_handler_t)((unsigned long)&x->handler + x->handler);
}
__visible bool ex_handler_default(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr,
unsigned long error_code,
unsigned long fault_addr)
{
regs->ip = ex_fixup_addr(fixup);
return true;
}
EXPORT_SYMBOL(ex_handler_default);
__visible bool ex_handler_fault(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr,
unsigned long error_code,
unsigned long fault_addr)
{
regs->ip = ex_fixup_addr(fixup);
regs->ax = trapnr;
return true;
}
EXPORT_SYMBOL_GPL(ex_handler_fault);
/*
* Handler for when we fail to restore a task's FPU state. We should never get
* here because the FPU state of a task using the FPU (task->thread.fpu.state)
* should always be valid. However, past bugs have allowed userspace to set
* reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
* These caused XRSTOR to fail when switching to the task, leaking the FPU
* registers of the task previously executing on the CPU. Mitigate this class
* of vulnerability by restoring from the initial state (essentially, zeroing
* out all the FPU registers) if we can't restore from the task's FPU state.
*/
__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr,
unsigned long error_code,
unsigned long fault_addr)
{
regs->ip = ex_fixup_addr(fixup);
WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
(void *)instruction_pointer(regs));
__restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate());
return true;
}
EXPORT_SYMBOL_GPL(ex_handler_fprestore);
__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr,
unsigned long error_code,
unsigned long fault_addr)
{
WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
regs->ip = ex_fixup_addr(fixup);
return true;
}
EXPORT_SYMBOL(ex_handler_uaccess);
__visible bool ex_handler_copy(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr,
unsigned long error_code,
unsigned long fault_addr)
{
WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
regs->ip = ex_fixup_addr(fixup);
regs->ax = trapnr;
return true;
}
EXPORT_SYMBOL(ex_handler_copy);
__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr,
unsigned long error_code,
unsigned long fault_addr)
{
if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
(unsigned int)regs->cx, regs->ip, (void *)regs->ip))
show_stack_regs(regs);
/* Pretend that the read succeeded and returned 0. */
regs->ip = ex_fixup_addr(fixup);
regs->ax = 0;
regs->dx = 0;
return true;
}
EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr,
unsigned long error_code,
unsigned long fault_addr)
{
if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
(unsigned int)regs->cx, (unsigned int)regs->dx,
(unsigned int)regs->ax, regs->ip, (void *)regs->ip))
show_stack_regs(regs);
/* Pretend that the write succeeded. */
regs->ip = ex_fixup_addr(fixup);
return true;
}
EXPORT_SYMBOL(ex_handler_wrmsr_unsafe);
__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr,
unsigned long error_code,
unsigned long fault_addr)
{
if (static_cpu_has(X86_BUG_NULL_SEG))
asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
asm volatile ("mov %0, %%fs" : : "rm" (0));
return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr);
}
EXPORT_SYMBOL(ex_handler_clear_fs);
enum handler_type ex_get_fault_handler_type(unsigned long ip)
{
const struct exception_table_entry *e;
ex_handler_t handler;
e = search_exception_tables(ip);
if (!e)
return EX_HANDLER_NONE;
handler = ex_fixup_handler(e);
if (handler == ex_handler_fault)
return EX_HANDLER_FAULT;
else if (handler == ex_handler_uaccess || handler == ex_handler_copy)
return EX_HANDLER_UACCESS;
else
return EX_HANDLER_OTHER;
}
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
unsigned long fault_addr)
{
const struct exception_table_entry *e;
ex_handler_t handler;
#ifdef CONFIG_PNPBIOS
if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
extern u32 pnp_bios_is_utter_crap;
pnp_bios_is_utter_crap = 1;
printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
__asm__ volatile(
"movl %0, %%esp\n\t"
"jmp *%1\n\t"
: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
panic("do_trap: can't hit this");
}
#endif
e = search_exception_tables(regs->ip);
if (!e)
return 0;
handler = ex_fixup_handler(e);
return handler(e, regs, trapnr, error_code, fault_addr);
}
extern unsigned int early_recursion_flag;
/* Restricted version used during very early boot */
void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
{
/* Ignore early NMIs. */
if (trapnr == X86_TRAP_NMI)
return;
if (early_recursion_flag > 2)
goto halt_loop;
/*
* Old CPUs leave the high bits of CS on the stack
* undefined. I'm not sure which CPUs do this, but at least
* the 486 DX works this way.
* Xen pv domains are not using the default __KERNEL_CS.
*/
if (!xen_pv_domain() && regs->cs != __KERNEL_CS)
goto fail;
/*
* The full exception fixup machinery is available as soon as
* the early IDT is loaded. This means that it is the
* responsibility of extable users to either function correctly
* when handlers are invoked early or to simply avoid causing
* exceptions before they're ready to handle them.
*
* This is better than filtering which handlers can be used,
* because refusing to call a handler here is guaranteed to
* result in a hard-to-debug panic.
*
* Keep in mind that not all vectors actually get here. Early
* page faults, for example, are special.
*/
if (fixup_exception(regs, trapnr, regs->orig_ax, 0))
return;
if (trapnr == X86_TRAP_UD) {
if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
/* Skip the ud2. */
regs->ip += LEN_UD2;
return;
}
/*
* If this was a BUG and report_bug returns or if this
* was just a normal #UD, we want to continue onward and
* crash.
*/
}
fail:
early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
(unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
regs->orig_ax, read_cr2());
show_regs(regs);
halt_loop:
while (true)
halt();
}
// SPDX-License-Identifier: GPL-2.0
/*
* drivers/base/dd.c - The core device/driver interactions.
*
* This file contains the (sometimes tricky) code that controls the
* interactions between devices and drivers, which primarily includes
* driver binding and unbinding.
*
* All of this code used to exist in drivers/base/bus.c, but was
* relocated to here in the name of compartmentalization (since it wasn't
* strictly code just for the 'struct bus_type'.
*
* Copyright (c) 2002-5 Patrick Mochel
* Copyright (c) 2002-3 Open Source Development Labs
* Copyright (c) 2007-2009 Greg Kroah-Hartman <gregkh@suse.de>
* Copyright (c) 2007-2009 Novell Inc.
*/
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/dma-map-ops.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/async.h>
#include <linux/pm_runtime.h>
#include <linux/pinctrl/devinfo.h>
#include <linux/slab.h>
#include "base.h"
#include "power/power.h"
/*
* Deferred Probe infrastructure.
*
* Sometimes driver probe order matters, but the kernel doesn't always have
* dependency information which means some drivers will get probed before a
* resource it depends on is available. For example, an SDHCI driver may
* first need a GPIO line from an i2c GPIO controller before it can be
* initialized. If a required resource is not available yet, a driver can
* request probing to be deferred by returning -EPROBE_DEFER from its probe hook
*
* Deferred probe maintains two lists of devices, a pending list and an active
* list. A driver returning -EPROBE_DEFER causes the device to be added to the
* pending list. A successful driver probe will trigger moving all devices
* from the pending to the active list so that the workqueue will eventually
* retry them.
*
* The deferred_probe_mutex must be held any time the deferred_probe_*_list
* of the (struct device*)->p->deferred_probe pointers are manipulated
*/
static DEFINE_MUTEX(deferred_probe_mutex);
static LIST_HEAD(deferred_probe_pending_list);
static LIST_HEAD(deferred_probe_active_list);
static atomic_t deferred_trigger_count = ATOMIC_INIT(0);
static bool initcalls_done;
/* Save the async probe drivers' name from kernel cmdline */
#define ASYNC_DRV_NAMES_MAX_LEN 256
static char async_probe_drv_names[ASYNC_DRV_NAMES_MAX_LEN];
/*
* In some cases, like suspend to RAM or hibernation, It might be reasonable
* to prohibit probing of devices as it could be unsafe.
* Once defer_all_probes is true all drivers probes will be forcibly deferred.
*/
static bool defer_all_probes;
static void __device_set_deferred_probe_reason(const struct device *dev, char *reason)
{
kfree(dev->p->deferred_probe_reason);
dev->p->deferred_probe_reason = reason;
}
/*
* deferred_probe_work_func() - Retry probing devices in the active list.
*/
static void deferred_probe_work_func(struct work_struct *work)
{
struct device *dev;
struct device_private *private;
/*
* This block processes every device in the deferred 'active' list.
* Each device is removed from the active list and passed to
* bus_probe_device() to re-attempt the probe. The loop continues
* until every device in the active list is removed and retried.
*
* Note: Once the device is removed from the list and the mutex is
* released, it is possible for the device get freed by another thread
* and cause a illegal pointer dereference. This code uses
* get/put_device() to ensure the device structure cannot disappear
* from under our feet.
*/
mutex_lock(&deferred_probe_mutex);
while (!list_empty(&deferred_probe_active_list)) {
private = list_first_entry(&deferred_probe_active_list,
typeof(*dev->p), deferred_probe);
dev = private->device;
list_del_init(&private->deferred_probe);
get_device(dev);
__device_set_deferred_probe_reason(dev, NULL);
/*
* Drop the mutex while probing each device; the probe path may
* manipulate the deferred list
*/
mutex_unlock(&deferred_probe_mutex);
/*
* Force the device to the end of the dpm_list since
* the PM code assumes that the order we add things to
* the list is a good order for suspend but deferred
* probe makes that very unsafe.
*/
device_pm_move_to_tail(dev);
dev_dbg(dev, "Retrying from deferred list\n");
bus_probe_device(dev);
mutex_lock(&deferred_probe_mutex);
put_device(dev);
}
mutex_unlock(&deferred_probe_mutex);
}
static DECLARE_WORK(deferred_probe_work, deferred_probe_work_func);
void driver_deferred_probe_add(struct device *dev)
{
if (!dev->can_match)
return;
mutex_lock(&deferred_probe_mutex);
if (list_empty(&dev->p->deferred_probe)) {
dev_dbg(dev, "Added to deferred list\n");
list_add_tail(&dev->p->deferred_probe, &deferred_probe_pending_list);
}
mutex_unlock(&deferred_probe_mutex);
}
void driver_deferred_probe_del(struct device *dev)
{
mutex_lock(&deferred_probe_mutex);
if (!list_empty(&dev->p->deferred_probe)) {
dev_dbg(dev, "Removed from deferred list\n");
list_del_init(&dev->p->deferred_probe);
__device_set_deferred_probe_reason(dev, NULL);
}
mutex_unlock(&deferred_probe_mutex);
}
static bool driver_deferred_probe_enable = false;
/**
* driver_deferred_probe_trigger() - Kick off re-probing deferred devices
*
* This functions moves all devices from the pending list to the active
* list and schedules the deferred probe workqueue to process them. It
* should be called anytime a driver is successfully bound to a device.
*
* Note, there is a race condition in multi-threaded probe. In the case where
* more than one device is probing at the same time, it is possible for one
* probe to complete successfully while another is about to defer. If the second
* depends on the first, then it will get put on the pending list after the
* trigger event has already occurred and will be stuck there.
*
* The atomic 'deferred_trigger_count' is used to determine if a successful
* trigger has occurred in the midst of probing a driver. If the trigger count
* changes in the midst of a probe, then deferred processing should be triggered
* again.
*/
static void driver_deferred_probe_trigger(void)
{
if (!driver_deferred_probe_enable)
return;
/*
* A successful probe means that all the devices in the pending list
* should be triggered to be reprobed. Move all the deferred devices
* into the active list so they can be retried by the workqueue
*/
mutex_lock(&deferred_probe_mutex);
atomic_inc(&deferred_trigger_count);
list_splice_tail_init(&deferred_probe_pending_list,
&deferred_probe_active_list);
mutex_unlock(&deferred_probe_mutex);
/*
* Kick the re-probe thread. It may already be scheduled, but it is
* safe to kick it again.
*/
queue_work(system_unbound_wq, &deferred_probe_work);
}
/**
* device_block_probing() - Block/defer device's probes
*
* It will disable probing of devices and defer their probes instead.
*/
void device_block_probing(void)
{
defer_all_probes = true;
/* sync with probes to avoid races. */
wait_for_device_probe();
}
/**
* device_unblock_probing() - Unblock/enable device's probes
*
* It will restore normal behavior and trigger re-probing of deferred
* devices.
*/
void device_unblock_probing(void)
{
defer_all_probes = false;
driver_deferred_probe_trigger();
}
/**
* device_set_deferred_probe_reason() - Set defer probe reason message for device
* @dev: the pointer to the struct device
* @vaf: the pointer to va_format structure with message
*/
void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf)
{
const char *drv = dev_driver_string(dev);
char *reason;
mutex_lock(&deferred_probe_mutex);
reason = kasprintf(GFP_KERNEL, "%s: %pV", drv, vaf);
__device_set_deferred_probe_reason(dev, reason);
mutex_unlock(&deferred_probe_mutex);
}
/*
* deferred_devs_show() - Show the devices in the deferred probe pending list.
*/
static int deferred_devs_show(struct seq_file *s, void *data)
{
struct device_private *curr;
mutex_lock(&deferred_probe_mutex);
list_for_each_entry(curr, &deferred_probe_pending_list, deferred_probe)
seq_printf(s, "%s\t%s", dev_name(curr->device),
curr->device->p->deferred_probe_reason ?: "\n");
mutex_unlock(&deferred_probe_mutex);
return 0;
}
DEFINE_SHOW_ATTRIBUTE(deferred_devs);
int driver_deferred_probe_timeout;
EXPORT_SYMBOL_GPL(driver_deferred_probe_timeout);
static DECLARE_WAIT_QUEUE_HEAD(probe_timeout_waitqueue);
static int __init deferred_probe_timeout_setup(char *str)
{
int timeout;
if (!kstrtoint(str, 10, &timeout))
driver_deferred_probe_timeout = timeout;
return 1;
}
__setup("deferred_probe_timeout=", deferred_probe_timeout_setup);
/**
* driver_deferred_probe_check_state() - Check deferred probe state
* @dev: device to check
*
* Return:
* -ENODEV if initcalls have completed and modules are disabled.
* -ETIMEDOUT if the deferred probe timeout was set and has expired
* and modules are enabled.
* -EPROBE_DEFER in other cases.
*
* Drivers or subsystems can opt-in to calling this function instead of directly
* returning -EPROBE_DEFER.
*/
int driver_deferred_probe_check_state(struct device *dev)
{
if (!IS_ENABLED(CONFIG_MODULES) && initcalls_done) {
dev_warn(dev, "ignoring dependency for device, assuming no driver\n");
return -ENODEV;
}
if (!driver_deferred_probe_timeout && initcalls_done) {
dev_warn(dev, "deferred probe timeout, ignoring dependency\n");
return -ETIMEDOUT;
}
return -EPROBE_DEFER;
}
EXPORT_SYMBOL_GPL(driver_deferred_probe_check_state);
static void deferred_probe_timeout_work_func(struct work_struct *work)
{
struct device_private *p;
fw_devlink_drivers_done();
driver_deferred_probe_timeout = 0;
driver_deferred_probe_trigger();
flush_work(&deferred_probe_work);
mutex_lock(&deferred_probe_mutex);
list_for_each_entry(p, &deferred_probe_pending_list, deferred_probe)
dev_info(p->device, "deferred probe pending\n");
mutex_unlock(&deferred_probe_mutex);
wake_up_all(&probe_timeout_waitqueue);
}
static DECLARE_DELAYED_WORK(deferred_probe_timeout_work, deferred_probe_timeout_work_func);
/**
* deferred_probe_initcall() - Enable probing of deferred devices
*
* We don't want to get in the way when the bulk of drivers are getting probed.
* Instead, this initcall makes sure that deferred probing is delayed until
* late_initcall time.
*/
static int deferred_probe_initcall(void)
{
debugfs_create_file("devices_deferred", 0444, NULL, NULL,
&deferred_devs_fops);
driver_deferred_probe_enable = true;
driver_deferred_probe_trigger();
/* Sort as many dependencies as possible before exiting initcalls */
flush_work(&deferred_probe_work);
initcalls_done = true;
if (!IS_ENABLED(CONFIG_MODULES))
fw_devlink_drivers_done();
/*
* Trigger deferred probe again, this time we won't defer anything
* that is optional
*/
driver_deferred_probe_trigger();
flush_work(&deferred_probe_work);
if (driver_deferred_probe_timeout > 0) {
schedule_delayed_work(&deferred_probe_timeout_work,
driver_deferred_probe_timeout * HZ);
}
return 0;
}
late_initcall(deferred_probe_initcall);
static void __exit deferred_probe_exit(void)
{
debugfs_remove_recursive(debugfs_lookup("devices_deferred", NULL));
}
__exitcall(deferred_probe_exit);
/**
* device_is_bound() - Check if device is bound to a driver
* @dev: device to check
*
* Returns true if passed device has already finished probing successfully
* against a driver.
*
* This function must be called with the device lock held.
*/
bool device_is_bound(struct device *dev)
{
return dev->p && klist_node_attached(&dev->p->knode_driver);
}
static void driver_bound(struct device *dev)
{
if (device_is_bound(dev)) {
pr_warn("%s: device %s already bound\n",
__func__, kobject_name(&dev->kobj));
return;
}
pr_debug("driver: '%s': %s: bound to device '%s'\n", dev->driver->name,
__func__, dev_name(dev));
klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices);
device_links_driver_bound(dev);
device_pm_check_callbacks(dev);
/*
* Make sure the device is no longer in one of the deferred lists and
* kick off retrying all pending devices
*/
driver_deferred_probe_del(dev);
driver_deferred_probe_trigger();
if (dev->bus)
blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
BUS_NOTIFY_BOUND_DRIVER, dev);
kobject_uevent(&dev->kobj, KOBJ_BIND);
}
static ssize_t coredump_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
device_lock(dev);
dev->driver->coredump(dev);
device_unlock(dev);
return count;
}
static DEVICE_ATTR_WO(coredump);
static int driver_sysfs_add(struct device *dev)
{
int ret;
if (dev->bus)
blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
BUS_NOTIFY_BIND_DRIVER, dev);
ret = sysfs_create_link(&dev->driver->p->kobj, &dev->kobj,
kobject_name(&dev->kobj));
if (ret)
goto fail;
ret = sysfs_create_link(&dev->kobj, &dev->driver->p->kobj,
"driver");
if (ret)
goto rm_dev;
if (!IS_ENABLED(CONFIG_DEV_COREDUMP) || !dev->driver->coredump)
return 0;
ret = device_create_file(dev, &dev_attr_coredump);
if (!ret)
return 0;
sysfs_remove_link(&dev->kobj, "driver");
rm_dev:
sysfs_remove_link(&dev->driver->p->kobj,
kobject_name(&dev->kobj));
fail:
return ret;
}
static void driver_sysfs_remove(struct device *dev)
{
struct device_driver *drv = dev->driver;
if (drv) {
if (drv->coredump)
device_remove_file(dev, &dev_attr_coredump);
sysfs_remove_link(&drv->p->kobj, kobject_name(&dev->kobj));
sysfs_remove_link(&dev->kobj, "driver");
}
}
/**
* device_bind_driver - bind a driver to one device.
* @dev: device.
*
* Allow manual attachment of a driver to a device.
* Caller must have already set @dev->driver.
*
* Note that this does not modify the bus reference count.
* Please verify that is accounted for before calling this.
* (It is ok to call with no other effort from a driver's probe() method.)
*
* This function must be called with the device lock held.
*
* Callers should prefer to use device_driver_attach() instead.
*/
int device_bind_driver(struct device *dev)
{
int ret;
ret = driver_sysfs_add(dev);
if (!ret) {
device_links_force_bind(dev);
driver_bound(dev);
}
else if (dev->bus)
blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
BUS_NOTIFY_DRIVER_NOT_BOUND, dev);
return ret;
}
EXPORT_SYMBOL_GPL(device_bind_driver);
static atomic_t probe_count = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(probe_waitqueue);
static ssize_t state_synced_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
bool val;
device_lock(dev);
val = dev->state_synced;
device_unlock(dev);
return sysfs_emit(buf, "%u\n", val);
}
static DEVICE_ATTR_RO(state_synced);
static int call_driver_probe(struct device *dev, struct device_driver *drv)
{
int ret = 0;
if (dev->bus->probe)
ret = dev->bus->probe(dev);
else if (drv->probe)
ret = drv->probe(dev);
switch (ret) {
case 0:
break;
case -EPROBE_DEFER:
/* Driver requested deferred probing */
dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name);
break;
case -ENODEV:
case -ENXIO:
pr_debug("%s: probe of %s rejects match %d\n",
drv->name, dev_name(dev), ret);
break;
default:
/* driver matched but the probe failed */
pr_warn("%s: probe of %s failed with error %d\n",
drv->name, dev_name(dev), ret);
break;
}
return ret;
}
static int really_probe(struct device *dev, struct device_driver *drv)
{
bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) &&
!drv->suppress_bind_attrs;
int ret;
if (defer_all_probes) {
/*
* Value of defer_all_probes can be set only by
* device_block_probing() which, in turn, will call
* wait_for_device_probe() right after that to avoid any races.
*/
dev_dbg(dev, "Driver %s force probe deferral\n", drv->name);
return -EPROBE_DEFER;
}
ret = device_links_check_suppliers(dev);
if (ret)
return ret;
pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
drv->bus->name, __func__, drv->name, dev_name(dev));
if (!list_empty(&dev->devres_head)) {
dev_crit(dev, "Resources present before probing\n");
ret = -EBUSY;
goto done;
}
re_probe:
dev->driver = drv;
/* If using pinctrl, bind pins now before probing */
ret = pinctrl_bind_pins(dev);
if (ret)
goto pinctrl_bind_failed;
if (dev->bus->dma_configure) {
ret = dev->bus->dma_configure(dev);
if (ret)
goto probe_failed;
}
ret = driver_sysfs_add(dev);
if (ret) {
pr_err("%s: driver_sysfs_add(%s) failed\n",
__func__, dev_name(dev));
goto probe_failed;
}
if (dev->pm_domain && dev->pm_domain->activate) {
ret = dev->pm_domain->activate(dev);
if (ret)
goto probe_failed;
}
ret = call_driver_probe(dev, drv);
if (ret) {
/*
* Return probe errors as positive values so that the callers
* can distinguish them from other errors.
*/
ret = -ret;
goto probe_failed;
}
ret = device_add_groups(dev, drv->dev_groups);
if (ret) {
dev_err(dev, "device_add_groups() failed\n");
goto dev_groups_failed;
}
if (dev_has_sync_state(dev)) {
ret = device_create_file(dev, &dev_attr_state_synced);
if (ret) {
dev_err(dev, "state_synced sysfs add failed\n");
goto dev_sysfs_state_synced_failed;
}
}
if (test_remove) {
test_remove = false;
device_remove_file(dev, &dev_attr_state_synced);
device_remove_groups(dev, drv->dev_groups);
if (dev->bus->remove)
dev->bus->remove(dev);
else if (drv->remove)
drv->remove(dev);
devres_release_all(dev);
arch_teardown_dma_ops(dev);
kfree(dev->dma_range_map);
dev->dma_range_map = NULL;
driver_sysfs_remove(dev);
dev->driver = NULL;
dev_set_drvdata(dev, NULL);
if (dev->pm_domain && dev->pm_domain->dismiss)
dev->pm_domain->dismiss(dev);
pm_runtime_reinit(dev);
goto re_probe;
}
pinctrl_init_done(dev);
if (dev->pm_domain && dev->pm_domain->sync)
dev->pm_domain->sync(dev);
driver_bound(dev);
pr_debug("bus: '%s': %s: bound device %s to driver %s\n",
drv->bus->name, __func__, dev_name(dev), drv->name);
goto done;
dev_sysfs_state_synced_failed:
device_remove_groups(dev, drv->dev_groups);
dev_groups_failed:
if (dev->bus->remove)
dev->bus->remove(dev);
else if (drv->remove)
drv->remove(dev);
probe_failed:
if (dev->bus)
blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
BUS_NOTIFY_DRIVER_NOT_BOUND, dev);
pinctrl_bind_failed:
device_links_no_driver(dev);
devres_release_all(dev);
arch_teardown_dma_ops(dev);
kfree(dev->dma_range_map);
dev->dma_range_map = NULL;
driver_sysfs_remove(dev);
dev->driver = NULL;
dev_set_drvdata(dev, NULL);
if (dev->pm_domain && dev->pm_domain->dismiss)
dev->pm_domain->dismiss(dev);
pm_runtime_reinit(dev);
dev_pm_set_driver_flags(dev, 0);
done:
return ret;
}
/*
* For initcall_debug, show the driver probe time.
*/
static int really_probe_debug(struct device *dev, struct device_driver *drv)
{
ktime_t calltime, rettime;
int ret;
calltime = ktime_get();
ret = really_probe(dev, drv);
rettime = ktime_get();
pr_debug("probe of %s returned %d after %lld usecs\n",
dev_name(dev), ret, ktime_us_delta(rettime, calltime));
return ret;
}
/**
* driver_probe_done
* Determine if the probe sequence is finished or not.
*
* Should somehow figure out how to use a semaphore, not an atomic variable...
*/
int driver_probe_done(void)
{
int local_probe_count = atomic_read(&probe_count);
pr_debug("%s: probe_count = %d\n", __func__, local_probe_count);
if (local_probe_count)
return -EBUSY;
return 0;
}
/**
* wait_for_device_probe
* Wait for device probing to be completed.
*/
void wait_for_device_probe(void)
{
/* wait for probe timeout */
wait_event(probe_timeout_waitqueue, !driver_deferred_probe_timeout);
/* wait for the deferred probe workqueue to finish */
flush_work(&deferred_probe_work);
/* wait for the known devices to complete their probing */
wait_event(probe_waitqueue, atomic_read(&probe_count) == 0);
async_synchronize_full();
}
EXPORT_SYMBOL_GPL(wait_for_device_probe);
static int __driver_probe_device(struct device_driver *drv, struct device *dev)
{
int ret = 0;
if (dev->p->dead || !device_is_registered(dev))
return -ENODEV;
if (dev->driver)
return -EBUSY;
dev->can_match = true;
pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
drv->bus->name, __func__, dev_name(dev), drv->name);
pm_runtime_get_suppliers(dev);
if (dev->parent)
pm_runtime_get_sync(dev->parent);
pm_runtime_barrier(dev);
if (initcall_debug)
ret = really_probe_debug(dev, drv);
else
ret = really_probe(dev, drv);
pm_request_idle(dev);
if (dev->parent)
pm_runtime_put(dev->parent);
pm_runtime_put_suppliers(dev);
return ret;
}
/**
* driver_probe_device - attempt to bind device & driver together
* @drv: driver to bind a device to
* @dev: device to try to bind to the driver
*
* This function returns -ENODEV if the device is not registered, -EBUSY if it
* already has a driver, 0 if the device is bound successfully and a positive
* (inverted) error code for failures from the ->probe method.
*
* This function must be called with @dev lock held. When called for a
* USB interface, @dev->parent lock must be held as well.
*
* If the device has a parent, runtime-resume the parent before driver probing.
*/
static int driver_probe_device(struct device_driver *drv, struct device *dev)
{
int trigger_count = atomic_read(&deferred_trigger_count);
int ret;
atomic_inc(&probe_count);
ret = __driver_probe_device(drv, dev);
if (ret == -EPROBE_DEFER || ret == EPROBE_DEFER) {
driver_deferred_probe_add(dev);
/*
* Did a trigger occur while probing? Need to re-trigger if yes
*/
if (trigger_count != atomic_read(&deferred_trigger_count) &&
!defer_all_probes)
driver_deferred_probe_trigger();
}
atomic_dec(&probe_count);
wake_up_all(&probe_waitqueue);
return ret;
}
static inline bool cmdline_requested_async_probing(const char *drv_name)
{
return parse_option_str(async_probe_drv_names, drv_name);
}
/* The option format is "driver_async_probe=drv_name1,drv_name2,..." */
static int __init save_async_options(char *buf)
{
if (strlen(buf) >= ASYNC_DRV_NAMES_MAX_LEN)
pr_warn("Too long list of driver names for 'driver_async_probe'!\n");
strlcpy(async_probe_drv_names, buf, ASYNC_DRV_NAMES_MAX_LEN);
return 1;
}
__setup("driver_async_probe=", save_async_options);
bool driver_allows_async_probing(struct device_driver *drv)
{
switch (drv->probe_type) {
case PROBE_PREFER_ASYNCHRONOUS:
return true;
case PROBE_FORCE_SYNCHRONOUS:
return false;
default:
if (cmdline_requested_async_probing(drv->name))
return true;
if (module_requested_async_probing(drv->owner))
return true;
return false;
}
}
struct device_attach_data {
struct device *dev;
/*
* Indicates whether we are are considering asynchronous probing or
* not. Only initial binding after device or driver registration
* (including deferral processing) may be done asynchronously, the
* rest is always synchronous, as we expect it is being done by
* request from userspace.
*/
bool check_async;
/*
* Indicates if we are binding synchronous or asynchronous drivers.
* When asynchronous probing is enabled we'll execute 2 passes
* over drivers: first pass doing synchronous probing and second
* doing asynchronous probing (if synchronous did not succeed -
* most likely because there was no driver requiring synchronous
* probing - and we found asynchronous driver during first pass).
* The 2 passes are done because we can't shoot asynchronous
* probe for given device and driver from bus_for_each_drv() since
* driver pointer is not guaranteed to stay valid once
* bus_for_each_drv() iterates to the next driver on the bus.
*/
bool want_async;
/*
* We'll set have_async to 'true' if, while scanning for matching
* driver, we'll encounter one that requests asynchronous probing.
*/
bool have_async;
};
static int __device_attach_driver(struct device_driver *drv, void *_data)
{
struct device_attach_data *data = _data;
struct device *dev = data->dev;
bool async_allowed;
int ret;
ret = driver_match_device(drv, dev);
if (ret == 0) {
/* no match */
return 0;
} else if (ret == -EPROBE_DEFER) {
dev_dbg(dev, "Device match requests probe deferral\n");
dev->can_match = true;
driver_deferred_probe_add(dev);
} else if (ret < 0) {
dev_dbg(dev, "Bus failed to match device: %d\n", ret);
return ret;
} /* ret > 0 means positive match */
async_allowed = driver_allows_async_probing(drv);
if (async_allowed)
data->have_async = true;
if (data->check_async && async_allowed != data->want_async)
return 0;
/*
* Ignore errors returned by ->probe so that the next driver can try
* its luck.
*/
ret = driver_probe_device(drv, dev);
if (ret < 0)
return ret;
return ret == 0;
}
static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
{
struct device *dev = _dev;
struct device_attach_data data = {
.dev = dev,
.check_async = true,
.want_async = true,
};
device_lock(dev);
/*
* Check if device has already been removed or claimed. This may
* happen with driver loading, device discovery/registration,
* and deferred probe processing happens all at once with
* multiple threads.
*/
if (dev->p->dead || dev->driver)
goto out_unlock;
if (dev->parent)
pm_runtime_get_sync(dev->parent);
bus_for_each_drv(dev->bus, NULL, &data, __device_attach_driver);
dev_dbg(dev, "async probe completed\n");
pm_request_idle(dev);
if (dev->parent)
pm_runtime_put(dev->parent);
out_unlock:
device_unlock(dev);
put_device(dev);
}
static int __device_attach(struct device *dev, bool allow_async)
{
int ret = 0;
device_lock(dev);
if (dev->p->dead) {
goto out_unlock;
} else if (dev->driver) {
if (device_is_bound(dev)) {
ret = 1;
goto out_unlock;
}
ret = device_bind_driver(dev);
if (ret == 0)
ret = 1;
else {
dev->driver = NULL;
ret = 0;
}
} else {
struct device_attach_data data = {
.dev = dev,
.check_async = allow_async,
.want_async = false,
};
if (dev->parent)
pm_runtime_get_sync(dev->parent);
ret = bus_for_each_drv(dev->bus, NULL, &data,
__device_attach_driver);
if (!ret && allow_async && data.have_async) {
/*
* If we could not find appropriate driver
* synchronously and we are allowed to do
* async probes and there are drivers that
* want to probe asynchronously, we'll
* try them.
*/
dev_dbg(dev, "scheduling asynchronous probe\n");
get_device(dev);
async_schedule_dev(__device_attach_async_helper, dev);
} else {
pm_request_idle(dev);
}
if (dev->parent)
pm_runtime_put(dev->parent);
}
out_unlock:
device_unlock(dev);
return ret;
}
/**
* device_attach - try to attach device to a driver.
* @dev: device.
*
* Walk the list of drivers that the bus has and call
* driver_probe_device() for each pair. If a compatible
* pair is found, break out and return.
*
* Returns 1 if the device was bound to a driver;
* 0 if no matching driver was found;
* -ENODEV if the device is not registered.
*
* When called for a USB interface, @dev->parent lock must be held.
*/
int device_attach(struct device *dev)
{
return __device_attach(dev, false);
}
EXPORT_SYMBOL_GPL(device_attach);
void device_initial_probe(struct device *dev)
{
__device_attach(dev, true);
}
/*
* __device_driver_lock - acquire locks needed to manipulate dev->drv
* @dev: Device we will update driver info for
* @parent: Parent device. Needed if the bus requires parent lock
*
* This function will take the required locks for manipulating dev->drv.
* Normally this will just be the @dev lock, but when called for a USB
* interface, @parent lock will be held as well.
*/
static void __device_driver_lock(struct device *dev, struct device *parent)
{
if (parent && dev->bus->need_parent_lock)
device_lock(parent);
device_lock(dev);
}
/*
* __device_driver_unlock - release locks needed to manipulate dev->drv
* @dev: Device we will update driver info for
* @parent: Parent device. Needed if the bus requires parent lock
*
* This function will release the required locks for manipulating dev->drv.
* Normally this will just be the the @dev lock, but when called for a
* USB interface, @parent lock will be released as well.
*/
static void __device_driver_unlock(struct device *dev, struct device *parent)
{
device_unlock(dev);
if (parent && dev->bus->need_parent_lock)
device_unlock(parent);
}
/**
* device_driver_attach - attach a specific driver to a specific device
* @drv: Driver to attach
* @dev: Device to attach it to
*
* Manually attach driver to a device. Will acquire both @dev lock and
* @dev->parent lock if needed. Returns 0 on success, -ERR on failure.
*/
int device_driver_attach(struct device_driver *drv, struct device *dev)
{
int ret;
__device_driver_lock(dev, dev->parent);
ret = __driver_probe_device(drv, dev);
__device_driver_unlock(dev, dev->parent);
/* also return probe errors as normal negative errnos */
if (ret > 0)
ret = -ret;
if (ret == -EPROBE_DEFER)
return -EAGAIN;
return ret;
}
EXPORT_SYMBOL_GPL(device_driver_attach);
static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie)
{
struct device *dev = _dev;
struct device_driver *drv;
int ret;
__device_driver_lock(dev, dev->parent);
drv = dev->p->async_driver;
ret = driver_probe_device(drv, dev);
__device_driver_unlock(dev, dev->parent);
dev_dbg(dev, "driver %s async attach completed: %d\n", drv->name, ret);
put_device(dev);
}
static int __driver_attach(struct device *dev, void *data)
{
struct device_driver *drv = data;
int ret;
/*
* Lock device and try to bind to it. We drop the error
* here and always return 0, because we need to keep trying
* to bind to devices and some drivers will return an error
* simply if it didn't support the device.
*
* driver_probe_device() will spit a warning if there
* is an error.
*/
ret = driver_match_device(drv, dev);
if (ret == 0) {
/* no match */
return 0;
} else if (ret == -EPROBE_DEFER) {
dev_dbg(dev, "Device match requests probe deferral\n");
dev->can_match = true;
driver_deferred_probe_add(dev);
} else if (ret < 0) {
dev_dbg(dev, "Bus failed to match device: %d\n", ret);
return ret;
} /* ret > 0 means positive match */
if (driver_allows_async_probing(drv)) {
/*
* Instead of probing the device synchronously we will
* probe it asynchronously to allow for more parallelism.
*
* We only take the device lock here in order to guarantee
* that the dev->driver and async_driver fields are protected
*/
dev_dbg(dev, "probing driver %s asynchronously\n", drv->name);
device_lock(dev);
if (!dev->driver) {
get_device(dev);
dev->p->async_driver = drv;
async_schedule_dev(__driver_attach_async_helper, dev);
}
device_unlock(dev);
return 0;
}
__device_driver_lock(dev, dev->parent);
driver_probe_device(drv, dev);
__device_driver_unlock(dev, dev->parent);
return 0;
}
/**
* driver_attach - try to bind driver to devices.
* @drv: driver.
*
* Walk the list of devices that the bus has on it and try to
* match the driver with each one. If driver_probe_device()
* returns 0 and the @dev->driver is set, we've found a
* compatible pair.
*/
int driver_attach(struct device_driver *drv)
{
return bus_for_each_dev(drv->bus, NULL, drv, __driver_attach);
}
EXPORT_SYMBOL_GPL(driver_attach);
/*
* __device_release_driver() must be called with @dev lock held.
* When called for a USB interface, @dev->parent lock must be held as well.
*/
static void __device_release_driver(struct device *dev, struct device *parent)
{
struct device_driver *drv;
drv = dev->driver;
if (drv) {
pm_runtime_get_sync(dev);
while (device_links_busy(dev)) {
__device_driver_unlock(dev, parent);
device_links_unbind_consumers(dev);
__device_driver_lock(dev, parent);
/*
* A concurrent invocation of the same function might
* have released the driver successfully while this one
* was waiting, so check for that.
*/
if (dev->driver != drv) {
pm_runtime_put(dev);
return;
}
}
driver_sysfs_remove(dev);
if (dev->bus)
blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
BUS_NOTIFY_UNBIND_DRIVER,
dev);
pm_runtime_put_sync(dev);
device_remove_file(dev, &dev_attr_state_synced);
device_remove_groups(dev, drv->dev_groups);
if (dev->bus && dev->bus->remove)
dev->bus->remove(dev);
else if (drv->remove)
drv->remove(dev);
device_links_driver_cleanup(dev);
devres_release_all(dev);
arch_teardown_dma_ops(dev);
kfree(dev->dma_range_map);
dev->dma_range_map = NULL;
dev->driver = NULL;
dev_set_drvdata(dev, NULL);
if (dev->pm_domain && dev->pm_domain->dismiss)
dev->pm_domain->dismiss(dev);
pm_runtime_reinit(dev);
dev_pm_set_driver_flags(dev, 0);
klist_remove(&dev->p->knode_driver);
device_pm_check_callbacks(dev);
if (dev->bus)
blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
BUS_NOTIFY_UNBOUND_DRIVER,
dev);
kobject_uevent(&dev->kobj, KOBJ_UNBIND);
}
}
void device_release_driver_internal(struct device *dev,
struct device_driver *drv,
struct device *parent)
{
__device_driver_lock(dev, parent);
if (!drv || drv == dev->driver)
__device_release_driver(dev, parent);
__device_driver_unlock(dev, parent);
}
/**
* device_release_driver - manually detach device from driver.
* @dev: device.
*
* Manually detach device from driver.
* When called for a USB interface, @dev->parent lock must be held.
*
* If this function is to be called with @dev->parent lock held, ensure that
* the device's consumers are unbound in advance or that their locks can be
* acquired under the @dev->parent lock.
*/
void device_release_driver(struct device *dev)
{
/*
* If anyone calls device_release_driver() recursively from
* within their ->remove callback for the same device, they
* will deadlock right here.
*/
device_release_driver_internal(dev, NULL, NULL);
}
EXPORT_SYMBOL_GPL(device_release_driver);
/**
* device_driver_detach - detach driver from a specific device
* @dev: device to detach driver from
*
* Detach driver from device. Will acquire both @dev lock and @dev->parent
* lock if needed.
*/
void device_driver_detach(struct device *dev)
{
device_release_driver_internal(dev, NULL, dev->parent);
}
/**
* driver_detach - detach driver from all devices it controls.
* @drv: driver.
*/
void driver_detach(struct device_driver *drv)
{
struct device_private *dev_prv;
struct device *dev;
if (driver_allows_async_probing(drv))
async_synchronize_full();
for (;;) {
spin_lock(&drv->p->klist_devices.k_lock);
if (list_empty(&drv->p->klist_devices.k_list)) {
spin_unlock(&drv->p->klist_devices.k_lock);
break;
}
dev_prv = list_last_entry(&drv->p->klist_devices.k_list,
struct device_private,
knode_driver.n_node);
dev = dev_prv->device;
get_device(dev);
spin_unlock(&drv->p->klist_devices.k_lock);
device_release_driver_internal(dev, drv, dev->parent);
put_device(dev);
}
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PTRACE_H
#define _ASM_X86_PTRACE_H
#include <asm/segment.h>
#include <asm/page_types.h>
#include <uapi/asm/ptrace.h>
#ifndef __ASSEMBLY__
#ifdef __i386__
struct pt_regs {
/*
* NB: 32-bit x86 CPUs are inconsistent as what happens in the
* following cases (where %seg represents a segment register):
*
* - pushl %seg: some do a 16-bit write and leave the high
* bits alone
* - movl %seg, [mem]: some do a 16-bit write despite the movl
* - IDT entry: some (e.g. 486) will leave the high bits of CS
* and (if applicable) SS undefined.
*
* Fortunately, x86-32 doesn't read the high bits on POP or IRET,
* so we can just treat all of the segment registers as 16-bit
* values.
*/
unsigned long bx;
unsigned long cx;
unsigned long dx;
unsigned long si;
unsigned long di;
unsigned long bp;
unsigned long ax;
unsigned short ds;
unsigned short __dsh;
unsigned short es;
unsigned short __esh;
unsigned short fs;
unsigned short __fsh;
/*
* On interrupt, gs and __gsh store the vector number. They never
* store gs any more.
*/
unsigned short gs;
unsigned short __gsh;
/* On interrupt, this is the error code. */
unsigned long orig_ax;
unsigned long ip;
unsigned short cs;
unsigned short __csh;
unsigned long flags;
unsigned long sp;
unsigned short ss;
unsigned short __ssh;
};
#else /* __i386__ */
struct pt_regs {
/*
* C ABI says these regs are callee-preserved. They aren't saved on kernel entry
* unless syscall needs a complete, fully filled "struct pt_regs".
*/
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long bp;
unsigned long bx;
/* These regs are callee-clobbered. Always saved on kernel entry. */
unsigned long r11;
unsigned long r10;
unsigned long r9;
unsigned long r8;
unsigned long ax;
unsigned long cx;
unsigned long dx;
unsigned long si;
unsigned long di;
/*
* On syscall entry, this is syscall#. On CPU exception, this is error code.
* On hw interrupt, it's IRQ number:
*/
unsigned long orig_ax;
/* Return frame for iretq */
unsigned long ip;
unsigned long cs;
unsigned long flags;
unsigned long sp;
unsigned long ss;
/* top of stack page */
};
#endif /* !__i386__ */
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt_types.h>
#endif
#include <asm/proto.h>
struct cpuinfo_x86;
struct task_struct;
extern unsigned long profile_pc(struct pt_regs *regs);
extern unsigned long
convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code);
static inline unsigned long regs_return_value(struct pt_regs *regs)
{
return regs->ax;
}
static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
{
regs->ax = rc;
}
/*
* user_mode(regs) determines whether a register set came from user
* mode. On x86_32, this is true if V8086 mode was enabled OR if the
* register set was from protected mode with RPL-3 CS value. This
* tricky test checks that with one comparison.
*
* On x86_64, vm86 mode is mercifully nonexistent, and we don't need
* the extra check.
*/
static __always_inline int user_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL;
#else
return !!(regs->cs & 3);
#endif
}
static inline int v8086_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
return (regs->flags & X86_VM_MASK);
#else
return 0; /* No V86 mode support in long mode */
#endif
}
static inline bool user_64bit_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_64
#ifndef CONFIG_PARAVIRT_XXL
/*
* On non-paravirt systems, this is the only long mode CPL 3
* selector. We do not allow long mode selectors in the LDT.
*/
return regs->cs == __USER_CS;
#else
/* Headers are too twisted for this to go in paravirt.h. */
return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
#endif
#else /* !CONFIG_X86_64 */
return false;
#endif
}
/*
* Determine whether the register set came from any context that is running in
* 64-bit mode.
*/
static inline bool any_64bit_mode(struct pt_regs *regs)
{
#ifdef CONFIG_X86_64
return !user_mode(regs) || user_64bit_mode(regs);
#else
return false;
#endif
}
#ifdef CONFIG_X86_64
#define current_user_stack_pointer() current_pt_regs()->sp
#define compat_user_stack_pointer() current_pt_regs()->sp
static inline bool ip_within_syscall_gap(struct pt_regs *regs)
{
bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack);
#ifdef CONFIG_IA32_EMULATION
ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat &&
regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack);
#endif
return ret;
}
#endif
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
return regs->sp;
}
static inline unsigned long instruction_pointer(struct pt_regs *regs)
{
return regs->ip;
}
static inline void instruction_pointer_set(struct pt_regs *regs,
unsigned long val)
{
regs->ip = val;
}
static inline unsigned long frame_pointer(struct pt_regs *regs)
{
return regs->bp;
}
static inline unsigned long user_stack_pointer(struct pt_regs *regs)
{
return regs->sp;
}
static inline void user_stack_pointer_set(struct pt_regs *regs,
unsigned long val)
{
regs->sp = val;
}
static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
{
return !(regs->flags & X86_EFLAGS_IF);
}
/* Query offset/name of register from its name/offset */
extern int regs_query_register_offset(const char *name);
extern const char *regs_query_register_name(unsigned int offset);
#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
/**
* regs_get_register() - get register value from its offset
* @regs: pt_regs from which register value is gotten.
* @offset: offset number of the register.
*
* regs_get_register returns the value of a register. The @offset is the
* offset of the register in struct pt_regs address which specified by @regs.
* If @offset is bigger than MAX_REG_OFFSET, this returns 0.
*/
static inline unsigned long regs_get_register(struct pt_regs *regs,
unsigned int offset)
{
if (unlikely(offset > MAX_REG_OFFSET))
return 0;
#ifdef CONFIG_X86_32
/* The selector fields are 16-bit. */
if (offset == offsetof(struct pt_regs, cs) ||
offset == offsetof(struct pt_regs, ss) ||
offset == offsetof(struct pt_regs, ds) ||
offset == offsetof(struct pt_regs, es) ||
offset == offsetof(struct pt_regs, fs) ||
offset == offsetof(struct pt_regs, gs)) {
return *(u16 *)((unsigned long)regs + offset);
}
#endif
return *(unsigned long *)((unsigned long)regs + offset);
}
/**
* regs_within_kernel_stack() - check the address in the stack
* @regs: pt_regs which contains kernel stack pointer.
* @addr: address which is checked.
*
* regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
* If @addr is within the kernel stack, it returns true. If not, returns false.
*/
static inline int regs_within_kernel_stack(struct pt_regs *regs,
unsigned long addr)
{
return ((addr & ~(THREAD_SIZE - 1)) == (regs->sp & ~(THREAD_SIZE - 1)));
}
/**
* regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack
* @regs: pt_regs which contains kernel stack pointer.
* @n: stack entry number.
*
* regs_get_kernel_stack_nth() returns the address of the @n th entry of the
* kernel stack which is specified by @regs. If the @n th entry is NOT in
* the kernel stack, this returns NULL.
*/
static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n)
{
unsigned long *addr = (unsigned long *)regs->sp;
addr += n;
if (regs_within_kernel_stack(regs, (unsigned long)addr))
return addr;
else
return NULL;
}
/* To avoid include hell, we can't include uaccess.h */
extern long copy_from_kernel_nofault(void *dst, const void *src, size_t size);
/**
* regs_get_kernel_stack_nth() - get Nth entry of the stack
* @regs: pt_regs which contains kernel stack pointer.
* @n: stack entry number.
*
* regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
* is specified by @regs. If the @n th entry is NOT in the kernel stack
* this returns 0.
*/
static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
unsigned int n)
{
unsigned long *addr;
unsigned long val;
long ret;
addr = regs_get_kernel_stack_nth_addr(regs, n);
if (addr) {
ret = copy_from_kernel_nofault(&val, addr, sizeof(val));
if (!ret)
return val;
}
return 0;
}
/**
* regs_get_kernel_argument() - get Nth function argument in kernel
* @regs: pt_regs of that context
* @n: function argument number (start from 0)
*
* regs_get_argument() returns @n th argument of the function call.
* Note that this chooses most probably assignment, in some case
* it can be incorrect.
* This is expected to be called from kprobes or ftrace with regs
* where the top of stack is the return address.
*/
static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
unsigned int n)
{
static const unsigned int argument_offs[] = {
#ifdef __i386__
offsetof(struct pt_regs, ax),
offsetof(struct pt_regs, dx),
offsetof(struct pt_regs, cx),
#define NR_REG_ARGUMENTS 3
#else
offsetof(struct pt_regs, di),
offsetof(struct pt_regs, si),
offsetof(struct pt_regs, dx),
offsetof(struct pt_regs, cx),
offsetof(struct pt_regs, r8),
offsetof(struct pt_regs, r9),
#define NR_REG_ARGUMENTS 6
#endif
};
if (n >= NR_REG_ARGUMENTS) {
n -= NR_REG_ARGUMENTS - 1;
return regs_get_kernel_stack_nth(regs, n);
} else
return regs_get_register(regs, argument_offs[n]);
}
#define arch_has_single_step() (1)
#ifdef CONFIG_X86_DEBUGCTLMSR
#define arch_has_block_step() (1)
#else
#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
#endif
#define ARCH_HAS_USER_SINGLE_STEP_REPORT
struct user_desc;
extern int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info);
extern int do_set_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info, int can_allocate);
#ifdef CONFIG_X86_64
# define do_set_thread_area_64(p, s, t) do_arch_prctl_64(p, s, t)
#else
# define do_set_thread_area_64(p, s, t) (0)
#endif
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PTRACE_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H
#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>
#include <trace/events/tlb.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
extern atomic64_t last_mm_ctx_id;
#ifndef CONFIG_PARAVIRT_XXL
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
}
#endif /* !CONFIG_PARAVIRT_XXL */
#ifdef CONFIG_PERF_EVENTS
DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
void cr4_update_pce(void *ignored);
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* ldt_structs can be allocated, used, and freed, but they are never
* modified while live.
*/
struct ldt_struct {
/*
* Xen requires page-aligned LDTs with special permissions. This is
* needed to prevent us from installing evil descriptors such as
* call gates. On native, we could merge the ldt_struct and LDT
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries;
unsigned int nr_entries;
/*
* If PTI is in use, then the entries array is not mapped while we're
* in user mode. The whole array will be aliased at the addressed
* given by ldt_slot_va(slot). We use two slots so that we can allocate
* and map, and enable a new LDT without invalidating the mapping
* of an older, still-in-use LDT.
*
* slot will be -1 if this LDT doesn't have an alias mapping.
*/
int slot;
};
/*
* Used for LDT copy/destruction.
*/
static inline void init_new_context_ldt(struct mm_struct *mm)
{
mm->context.ldt = NULL;
init_rwsem(&mm->context.ldt_usr_sem);
}
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
void ldt_arch_exit_mmap(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline void init_new_context_ldt(struct mm_struct *mm) { }
static inline int ldt_dup_context(struct mm_struct *oldmm,
struct mm_struct *mm)
{
return 0;
}
static inline void destroy_context_ldt(struct mm_struct *mm) { }
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
extern void load_mm_ldt(struct mm_struct *mm);
extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next);
#else
static inline void load_mm_ldt(struct mm_struct *mm)
{
clear_LDT();
}
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
DEBUG_LOCKS_WARN_ON(preemptible());
}
#endif
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
/*
* Init a new mm. Used on mm copies, like at fork()
* and on mm's that are brand-new, like at execve().
*/
#define init_new_context init_new_context
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
mutex_init(&mm->context.lock);
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0);
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
/* pkey 0 is the default and allocated implicitly */
mm->context.pkey_allocation_map = 0x1;
/* -1 means unallocated or invalid */
mm->context.execute_only_pkey = -1;
}
#endif
init_new_context_ldt(mm);
return 0;
}
#define destroy_context destroy_context
static inline void destroy_context(struct mm_struct *mm)
{
destroy_context_ldt(mm);
}
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk);
extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk);
#define switch_mm_irqs_off switch_mm_irqs_off
#define activate_mm(prev, next) \
do { \
paravirt_activate_mm((prev), (next)); \
switch_mm((prev), (next), NULL); \
} while (0);
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm) \
do { \
lazy_load_gs(0); \
} while (0)
#else
#define deactivate_mm(tsk, mm) \
do { \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
#endif
static inline void arch_dup_pkeys(struct mm_struct *oldmm,
struct mm_struct *mm)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return;
/* Duplicate the oldmm pkey state in mm: */
mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
#endif
}
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
arch_dup_pkeys(oldmm, mm);
paravirt_arch_dup_mmap(oldmm, mm);
return ldt_dup_context(oldmm, mm);
}
static inline void arch_exit_mmap(struct mm_struct *mm)
{
paravirt_arch_exit_mmap(mm);
ldt_arch_exit_mmap(mm);
}
#ifdef CONFIG_X86_64
static inline bool is_64bit_mm(struct mm_struct *mm)
{
return !IS_ENABLED(CONFIG_IA32_EMULATION) ||
!(mm->context.flags & MM_CONTEXT_UPROBE_IA32);
}
#else
static inline bool is_64bit_mm(struct mm_struct *mm)
{
return false;
}
#endif
static inline void arch_unmap(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
}
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
* processes or any way to tell *which * PKRU in a threaded
* process we could use.
*
* So do not enforce things if the VMA is not from the current
* mm, or if we are in a kernel thread.
*/
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
bool write, bool execute, bool foreign)
{
/* pkeys never affect instruction fetches */
if (execute)
return true;
/* allow access if the VMA is not one from this process */
if (foreign || vma_is_foreign(vma))
return true;
return __pkru_allows_pkey(vma_pkey(vma), write);
}
unsigned long __get_current_cr3_fast(void);
#include <asm-generic/mmu_context.h>
#endif /* _ASM_X86_MMU_CONTEXT_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_H
#define _LINUX_SCHED_H
/*
* Define 'struct task_struct' and provide the main scheduler
* APIs (schedule(), wakeup variants, etc.)
*/
#include <uapi/linux/sched.h>
#include <asm/current.h>
#include <linux/pid.h>
#include <linux/sem.h>
#include <linux/shm.h>
#include <linux/mutex.h>
#include <linux/plist.h>
#include <linux/hrtimer.h>
#include <linux/irqflags.h>
#include <linux/seccomp.h>
#include <linux/nodemask.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
#include <linux/posix-timers.h>
#include <linux/rseq.h>
#include <linux/seqlock.h>
#include <linux/kcsan.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
struct backing_dev_info;
struct bio_list;
struct blk_plug;
struct bpf_local_storage;
struct bpf_run_ctx;
struct capture_control;
struct cfs_rq;
struct fs_struct;
struct futex_pi_state;
struct io_context;
struct io_uring_task;
struct mempolicy;
struct nameidata;
struct nsproxy;
struct perf_event_context;
struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
struct robust_list_head;
struct root_domain;
struct rq;
struct sched_attr;
struct sched_param;
struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
/*
* Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state().
*
* We have two separate sets of flags: task->state
* is about runnability, while task->exit_state are
* about the task exiting. Confusing, but this way
* modifying one set can't modify the other one by
* mistake.
*/
/* Used in tsk->state: */
#define TASK_RUNNING 0x0000
#define TASK_INTERRUPTIBLE 0x0001
#define TASK_UNINTERRUPTIBLE 0x0002
#define __TASK_STOPPED 0x0004
#define __TASK_TRACED 0x0008
/* Used in tsk->exit_state: */
#define EXIT_DEAD 0x0010
#define EXIT_ZOMBIE 0x0020
#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->state again: */
#define TASK_PARKED 0x0040
#define TASK_DEAD 0x0080
#define TASK_WAKEKILL 0x0100
#define TASK_WAKING 0x0200
#define TASK_NOLOAD 0x0400
#define TASK_NEW 0x0800
/* RT specific auxilliary flag to mark RT lock waiters */
#define TASK_RTLOCK_WAIT 0x1000
#define TASK_STATE_MAX 0x2000
/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
/* get_task_state(): */
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
__TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
TASK_PARKED)
#define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING)
#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
#define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
/*
* Special states are those that do not use the normal wait-loop pattern. See
* the comment with set_special_state().
*/
#define is_special_task_state(state) \
((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
# define debug_normal_state_change(state_value) \
do { \
WARN_ON_ONCE(is_special_task_state(state_value)); \
current->task_state_change = _THIS_IP_; \
} while (0)
# define debug_special_state_change(state_value) \
do { \
WARN_ON_ONCE(!is_special_task_state(state_value)); \
current->task_state_change = _THIS_IP_; \
} while (0)
# define debug_rtlock_wait_set_state() \
do { \
current->saved_state_change = current->task_state_change;\
current->task_state_change = _THIS_IP_; \
} while (0)
# define debug_rtlock_wait_restore_state() \
do { \
current->task_state_change = current->saved_state_change;\
} while (0)
#else
# define debug_normal_state_change(cond) do { } while (0)
# define debug_special_state_change(cond) do { } while (0)
# define debug_rtlock_wait_set_state() do { } while (0)
# define debug_rtlock_wait_restore_state() do { } while (0)
#endif
/*
* set_current_state() includes a barrier so that the write of current->state
* is correctly serialised wrt the caller's subsequent test of whether to
* actually sleep:
*
* for (;;) {
* set_current_state(TASK_UNINTERRUPTIBLE);
* if (CONDITION)
* break;
*
* schedule();
* }
* __set_current_state(TASK_RUNNING);
*
* If the caller does not need such serialisation (because, for instance, the
* CONDITION test and condition change and wakeup are under the same lock) then
* use __set_current_state().
*
* The above is typically ordered against the wakeup, which does:
*
* CONDITION = 1;
* wake_up_state(p, TASK_UNINTERRUPTIBLE);
*
* where wake_up_state()/try_to_wake_up() executes a full memory barrier before
* accessing p->state.
*
* Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
* once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
* TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
*
* However, with slightly different timing the wakeup TASK_RUNNING store can
* also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
* a problem either because that will result in one extra go around the loop
* and our @cond test will save the day.
*
* Also see the comments of try_to_wake_up().
*/
#define __set_current_state(state_value) \
do { \
debug_normal_state_change((state_value)); \
WRITE_ONCE(current->__state, (state_value)); \
} while (0)
#define set_current_state(state_value) \
do { \
debug_normal_state_change((state_value)); \
smp_store_mb(current->__state, (state_value)); \
} while (0)
/*
* set_special_state() should be used for those states when the blocking task
* can not use the regular condition based wait-loop. In that case we must
* serialize against wakeups such that any possible in-flight TASK_RUNNING
* stores will not collide with our state change.
*/
#define set_special_state(state_value) \
do { \
unsigned long flags; /* may shadow */ \
\
raw_spin_lock_irqsave(¤t->pi_lock, flags); \
debug_special_state_change((state_value)); \
WRITE_ONCE(current->__state, (state_value)); \
raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \
} while (0)
/*
* PREEMPT_RT specific variants for "sleeping" spin/rwlocks
*
* RT's spin/rwlock substitutions are state preserving. The state of the
* task when blocking on the lock is saved in task_struct::saved_state and
* restored after the lock has been acquired. These operations are
* serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
* lock related wakeups while the task is blocked on the lock are
* redirected to operate on task_struct::saved_state to ensure that these
* are not dropped. On restore task_struct::saved_state is set to
* TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
*
* The lock operation looks like this:
*
* current_save_and_set_rtlock_wait_state();
* for (;;) {
* if (try_lock())
* break;
* raw_spin_unlock_irq(&lock->wait_lock);
* schedule_rtlock();
* raw_spin_lock_irq(&lock->wait_lock);
* set_current_state(TASK_RTLOCK_WAIT);
* }
* current_restore_rtlock_saved_state();
*/
#define current_save_and_set_rtlock_wait_state() \
do { \
lockdep_assert_irqs_disabled(); \
raw_spin_lock(¤t->pi_lock); \
current->saved_state = current->__state; \
debug_rtlock_wait_set_state(); \
WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \
raw_spin_unlock(¤t->pi_lock); \
} while (0);
#define current_restore_rtlock_saved_state() \
do { \
lockdep_assert_irqs_disabled(); \
raw_spin_lock(¤t->pi_lock); \
debug_rtlock_wait_restore_state(); \
WRITE_ONCE(current->__state, current->saved_state); \
current->saved_state = TASK_RUNNING; \
raw_spin_unlock(¤t->pi_lock); \
} while (0);
#define get_current_state() READ_ONCE(current->__state)
/* Task command name length: */
#define TASK_COMM_LEN 16
extern void scheduler_tick(void);
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
extern long schedule_timeout(long timeout);
extern long schedule_timeout_interruptible(long timeout);
extern long schedule_timeout_killable(long timeout);
extern long schedule_timeout_uninterruptible(long timeout);
extern long schedule_timeout_idle(long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);
#ifdef CONFIG_PREEMPT_RT
extern void schedule_rtlock(void);
#endif
extern int __must_check io_schedule_prepare(void);
extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);
/**
* struct prev_cputime - snapshot of system and user cputime
* @utime: time spent in user mode
* @stime: time spent in system mode
* @lock: protects the above two fields
*
* Stores previous user/system time values such that we can guarantee
* monotonicity.
*/
struct prev_cputime {
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
u64 utime;
u64 stime;
raw_spinlock_t lock;
#endif
};
enum vtime_state {
/* Task is sleeping or running in a CPU with VTIME inactive: */
VTIME_INACTIVE = 0,
/* Task is idle */
VTIME_IDLE,
/* Task runs in kernelspace in a CPU with VTIME active: */
VTIME_SYS,
/* Task runs in userspace in a CPU with VTIME active: */
VTIME_USER,
/* Task runs as guests in a CPU with VTIME active: */
VTIME_GUEST,
};
struct vtime {
seqcount_t seqcount;
unsigned long long starttime;
enum vtime_state state;
unsigned int cpu;
u64 utime;
u64 stime;
u64 gtime;
};
/*
* Utilization clamp constraints.
* @UCLAMP_MIN: Minimum utilization
* @UCLAMP_MAX: Maximum utilization
* @UCLAMP_CNT: Utilization clamp constraints count
*/
enum uclamp_id {
UCLAMP_MIN = 0,
UCLAMP_MAX,
UCLAMP_CNT
};
#ifdef CONFIG_SMP
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
#endif
struct sched_info {
#ifdef CONFIG_SCHED_INFO
/* Cumulative counters: */
/* # of times we have run on this CPU: */
unsigned long pcount;
/* Time spent waiting on a runqueue: */
unsigned long long run_delay;
/* Timestamps: */
/* When did we last run on a CPU? */
unsigned long long last_arrival;
/* When were we last queued to run? */
unsigned long long last_queued;
#endif /* CONFIG_SCHED_INFO */
};
/*
* Integer metrics need fixed point arithmetic, e.g., sched/fair
* has a few: load, load_avg, util_avg, freq, and capacity.
*
* We define a basic fixed point arithmetic range, and then formalize
* all these metrics based on that basic range.
*/
# define SCHED_FIXEDPOINT_SHIFT 10
# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)
/* Increase resolution of cpu_capacity calculations */
# define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
# define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
struct load_weight {
unsigned long weight;
u32 inv_weight;
};
/**
* struct util_est - Estimation utilization of FAIR tasks
* @enqueued: instantaneous estimated utilization of a task/cpu
* @ewma: the Exponential Weighted Moving Average (EWMA)
* utilization of a task
*
* Support data structure to track an Exponential Weighted Moving Average
* (EWMA) of a FAIR task's utilization. New samples are added to the moving
* average each time a task completes an activation. Sample's weight is chosen
* so that the EWMA will be relatively insensitive to transient changes to the
* task's workload.
*
* The enqueued attribute has a slightly different meaning for tasks and cpus:
* - task: the task's util_avg at last task dequeue time
* - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
* Thus, the util_est.enqueued of a task represents the contribution on the
* estimated utilization of the CPU where that task is currently enqueued.
*
* Only for tasks we track a moving average of the past instantaneous
* estimated utilization. This allows to absorb sporadic drops in utilization
* of an otherwise almost periodic task.
*
* The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
* updates. When a task is dequeued, its util_est should not be updated if its
* util_avg has not been updated in the meantime.
* This information is mapped into the MSB bit of util_est.enqueued at dequeue
* time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
* for a task) it is safe to use MSB.
*/
struct util_est {
unsigned int enqueued;
unsigned int ewma;
#define UTIL_EST_WEIGHT_SHIFT 2
#define UTIL_AVG_UNCHANGED 0x80000000
} __attribute__((__aligned__(sizeof(u64))));
/*
* The load/runnable/util_avg accumulates an infinite geometric series
* (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
*
* [load_avg definition]
*
* load_avg = runnable% * scale_load_down(load)
*
* [runnable_avg definition]
*
* runnable_avg = runnable% * SCHED_CAPACITY_SCALE
*
* [util_avg definition]
*
* util_avg = running% * SCHED_CAPACITY_SCALE
*
* where runnable% is the time ratio that a sched_entity is runnable and
* running% the time ratio that a sched_entity is running.
*
* For cfs_rq, they are the aggregated values of all runnable and blocked
* sched_entities.
*
* The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
* capacity scaling. The scaling is done through the rq_clock_pelt that is used
* for computing those signals (see update_rq_clock_pelt())
*
* N.B., the above ratios (runnable% and running%) themselves are in the
* range of [0, 1]. To do fixed point arithmetics, we therefore scale them
* to as large a range as necessary. This is for example reflected by
* util_avg's SCHED_CAPACITY_SCALE.
*
* [Overflow issue]
*
* The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
* with the highest load (=88761), always runnable on a single cfs_rq,
* and should not overflow as the number already hits PID_MAX_LIMIT.
*
* For all other cases (including 32-bit kernels), struct load_weight's
* weight will overflow first before we do, because:
*
* Max(load_avg) <= Max(load.weight)
*
* Then it is the load_weight's responsibility to consider overflow
* issues.
*/
struct sched_avg {
u64 last_update_time;
u64 load_sum;
u64 runnable_sum;
u32 util_sum;
u32 period_contrib;
unsigned long load_avg;
unsigned long runnable_avg;
unsigned long util_avg;
struct util_est util_est;
} ____cacheline_aligned;
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 wait_max;
u64 wait_count;
u64 wait_sum;
u64 iowait_count;
u64 iowait_sum;
u64 sleep_start;
u64 sleep_max;
s64 sum_sleep_runtime;
u64 block_start;
u64 block_max;
u64 exec_max;
u64 slice_max;
u64 nr_migrations_cold;
u64 nr_failed_migrations_affine;
u64 nr_failed_migrations_running;
u64 nr_failed_migrations_hot;
u64 nr_forced_migrations;
u64 nr_wakeups;
u64 nr_wakeups_sync;
u64 nr_wakeups_migrate;
u64 nr_wakeups_local;
u64 nr_wakeups_remote;
u64 nr_wakeups_affine;
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
#endif
};
struct sched_entity {
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
u64 prev_sum_exec_runtime;
u64 nr_migrations;
struct sched_statistics statistics;
#ifdef CONFIG_FAIR_GROUP_SCHED
int depth;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
/* cached value of my_q->h_nr_running */
unsigned long runnable_weight;
#endif
#ifdef CONFIG_SMP
/*
* Per entity load average tracking.
*
* Put into separate cache line so it does not
* collide with read-mostly values above.
*/
struct sched_avg avg;
#endif
};
struct sched_rt_entity {
struct list_head run_list;
unsigned long timeout;
unsigned long watchdog_stamp;
unsigned int time_slice;
unsigned short on_rq;
unsigned short on_list;
struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity *parent;
/* rq on which this entity is (to be) queued: */
struct rt_rq *rt_rq;
/* rq "owned" by this entity/group: */
struct rt_rq *my_q;
#endif
} __randomize_layout;
struct sched_dl_entity {
struct rb_node rb_node;
/*
* Original scheduling parameters. Copied here from sched_attr
* during sched_setattr(), they will remain the same until
* the next sched_setattr().
*/
u64 dl_runtime; /* Maximum runtime for each instance */
u64 dl_deadline; /* Relative deadline of each instance */
u64 dl_period; /* Separation of two instances (period) */
u64 dl_bw; /* dl_runtime / dl_period */
u64 dl_density; /* dl_runtime / dl_deadline */
/*
* Actual scheduling parameters. Initialized with the values above,
* they are continuously updated during task execution. Note that
* the remaining runtime could be < 0 in case we are in overrun.
*/
s64 runtime; /* Remaining runtime for this instance */
u64 deadline; /* Absolute deadline for this instance */
unsigned int flags; /* Specifying the scheduler behaviour */
/*
* Some bool flags:
*
* @dl_throttled tells if we exhausted the runtime. If so, the
* task has to wait for a replenishment to be performed at the
* next firing of dl_timer.
*
* @dl_boosted tells if we are boosted due to DI. If so we are
* outside bandwidth enforcement mechanism (but only until we
* exit the critical section);
*
* @dl_yielded tells if task gave up the CPU before consuming
* all its available runtime during the last job.
*
* @dl_non_contending tells if the task is inactive while still
* contributing to the active utilization. In other words, it
* indicates if the inactive timer has been armed and its handler
* has not been executed yet. This flag is useful to avoid race
* conditions between the inactive timer handler and the wakeup
* code.
*
* @dl_overrun tells if the task asked to be informed about runtime
* overruns.
*/
unsigned int dl_throttled : 1;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
* own bandwidth to be enforced, thus we need one timer per task.
*/
struct hrtimer dl_timer;
/*
* Inactive timer, responsible for decreasing the active utilization
* at the "0-lag time". When a -deadline task blocks, it contributes
* to GRUB's active utilization until the "0-lag time", hence a
* timer is needed to decrease the active utilization at the correct
* time.
*/
struct hrtimer inactive_timer;
#ifdef CONFIG_RT_MUTEXES
/*
* Priority Inheritance. When a DEADLINE scheduling entity is boosted
* pi_se points to the donor, otherwise points to the dl_se it belongs
* to (the original one/itself).
*/
struct sched_dl_entity *pi_se;
#endif
};
#ifdef CONFIG_UCLAMP_TASK
/* Number of utilization clamp buckets (shorter alias) */
#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
/*
* Utilization clamp for a scheduling entity
* @value: clamp value "assigned" to a se
* @bucket_id: bucket index corresponding to the "assigned" value
* @active: the se is currently refcounted in a rq's bucket
* @user_defined: the requested clamp value comes from user-space
*
* The bucket_id is the index of the clamp bucket matching the clamp value
* which is pre-computed and stored to avoid expensive integer divisions from
* the fast path.
*
* The active bit is set whenever a task has got an "effective" value assigned,
* which can be different from the clamp value "requested" from user-space.
* This allows to know a task is refcounted in the rq's bucket corresponding
* to the "effective" bucket_id.
*
* The user_defined bit is set whenever a task has got a task-specific clamp
* value requested from userspace, i.e. the system defaults apply to this task
* just as a restriction. This allows to relax default clamps when a less
* restrictive task-specific value has been requested, thus allowing to
* implement a "nice" semantic. For example, a task running with a 20%
* default boost can still drop its own boosting to 0%.
*/
struct uclamp_se {
unsigned int value : bits_per(SCHED_CAPACITY_SCALE);
unsigned int bucket_id : bits_per(UCLAMP_BUCKETS);
unsigned int active : 1;
unsigned int user_defined : 1;
};
#endif /* CONFIG_UCLAMP_TASK */
union rcu_special {
struct {
u8 blocked;
u8 need_qs;
u8 exp_hint; /* Hint for performance. */
u8 need_mb; /* Readers need smp_mb(). */
} b; /* Bits. */
u32 s; /* Set of bits. */
};
enum perf_event_task_context {
perf_invalid_context = -1,
perf_hw_context = 0,
perf_sw_context,
perf_nr_task_contexts,
};
struct wake_q_node {
struct wake_q_node *next;
};
struct kmap_ctrl {
#ifdef CONFIG_KMAP_LOCAL
int idx;
pte_t pteval[KM_MAX_IDX];
#endif
};
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
* For reasons of header soup (see current_thread_info()), this
* must be the first element of task_struct.
*/
struct thread_info thread_info;
#endif
unsigned int __state;
#ifdef CONFIG_PREEMPT_RT
/* saved state for "spinlock sleepers" */
unsigned int saved_state;
#endif
/*
* This begins the randomizable portion of task_struct. Only
* scheduling-critical items should be added above here.
*/
randomized_struct_fields_start
void *stack;
refcount_t usage;
/* Per task flags (PF_*), defined further below: */
unsigned int flags;
unsigned int ptrace;
#ifdef CONFIG_SMP
int on_cpu;
struct __call_single_node wake_entry;
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* Current CPU: */
unsigned int cpu;
#endif
unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
struct task_struct *last_wakee;
/*
* recent_used_cpu is initially set as the last CPU used by a task
* that wakes affine another task. Waker/wakee relationships can
* push tasks around a CPU where each wakeup moves to the next one.
* Tracking a recently used CPU allows a quick search for a recently
* used CPU that may be idle.
*/
int recent_used_cpu;
int wake_cpu;
#endif
int on_rq;
int prio;
int static_prio;
int normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
#ifdef CONFIG_SCHED_CORE
struct rb_node core_node;
unsigned long core_cookie;
unsigned int core_occupation;
#endif
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
#ifdef CONFIG_UCLAMP_TASK
/*
* Clamp values requested for a scheduling entity.
* Must be updated with task_rq_lock() held.
*/
struct uclamp_se uclamp_req[UCLAMP_CNT];
/*
* Effective clamp values used for a scheduling entity.
* Must be updated with task_rq_lock() held.
*/
struct uclamp_se uclamp[UCLAMP_CNT];
#endif
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* List of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
unsigned int policy;
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t *user_cpus_ptr;
cpumask_t cpus_mask;
void *migration_pending;
#ifdef CONFIG_SMP
unsigned short migration_disabled;
#endif
unsigned short migration_flags;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
union rcu_special rcu_read_unlock_special;
struct list_head rcu_node_entry;
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
unsigned long rcu_tasks_nvcsw;
u8 rcu_tasks_holdout;
u8 rcu_tasks_idx;
int rcu_tasks_idle_cpu;
struct list_head rcu_tasks_holdout_list;
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
int trc_reader_nesting;
int trc_ipi_to_cpu;
union rcu_special trc_reader_special;
bool trc_reader_checked;
struct list_head trc_holdout_list;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
struct sched_info sched_info;
struct list_head tasks;
#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
#endif
struct mm_struct *mm;
struct mm_struct *active_mm;
/* Per-thread vma caching: */
struct vmacache vmacache;
#ifdef SPLIT_RSS_COUNTING
struct task_rss_stat rss_stat;
#endif
int exit_state;
int exit_code;
int exit_signal;
/* The signal sent when the parent dies: */
int pdeath_signal;
/* JOBCTL_*, siglock protected: */
unsigned long jobctl;
/* Used for emulating ABI behavior of previous Linux versions: */
unsigned int personality;
/* Scheduler bits, serialized by scheduler locks: */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
#ifdef CONFIG_PSI
unsigned sched_psi_wake_requeue:1;
#endif
/* Force alignment to the next boundary: */
unsigned :0;
/* Unserialized, strictly 'current' */
/*
* This field must not be in the scheduler word above due to wakelist
* queueing no longer being serialized by p->on_cpu. However:
*
* p->XXX = X; ttwu()
* schedule() if (p->on_rq && ..) // false
* smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true
* deactivate_task() ttwu_queue_wakelist())
* p->on_rq = 0; p->sched_remote_wakeup = Y;
*
* guarantees all stores of 'current' are visible before
* ->sched_remote_wakeup gets used, so it can be in this word.
*/
unsigned sched_remote_wakeup:1;
/* Bit to tell LSMs we're in execve(): */
unsigned in_execve:1;
unsigned in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
unsigned restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
/* disallow userland-initiated cgroup migration */
unsigned no_cgroup_migration:1;
/* task is frozen/stopped (used by the cgroup freezer) */
unsigned frozen:1;
#endif
#ifdef CONFIG_BLK_CGROUP
unsigned use_memdelay:1;
#endif
#ifdef CONFIG_PSI
/* Stalled due to lack of memory */
unsigned in_memstall:1;
#endif
#ifdef CONFIG_PAGE_OWNER
/* Used by page_owner=on to detect recursion in page tracking. */
unsigned in_page_owner:1;
#endif
#ifdef CONFIG_EVENTFD
/* Recursion prevention for eventfd_signal() */
unsigned in_eventfd_signal:1;
#endif
unsigned long atomic_flags; /* Flags requiring atomic access. */
struct restart_block restart_block;
pid_t pid;
pid_t tgid;
#ifdef CONFIG_STACKPROTECTOR
/* Canary value for the -fstack-protector GCC feature: */
unsigned long stack_canary;
#endif
/*
* Pointers to the (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->real_parent->pid)
*/
/* Real parent process: */
struct task_struct __rcu *real_parent;
/* Recipient of SIGCHLD, wait4() reports: */
struct task_struct __rcu *parent;
/*
* Children/sibling form the list of natural children:
*/
struct list_head children;
struct list_head sibling;
struct task_struct *group_leader;
/*
* 'ptraced' is the list of tasks this task is using ptrace() on.
*
* This includes both natural children and PTRACE_ATTACH targets.
* 'ptrace_entry' is this task's link on the p->parent->ptraced list.
*/
struct list_head ptraced;
struct list_head ptrace_entry;
/* PID/PID hash table linkage. */
struct pid *thread_pid;
struct hlist_node pid_links[PIDTYPE_MAX];
struct list_head thread_group;
struct list_head thread_node;
struct completion *vfork_done;
/* CLONE_CHILD_SETTID: */
int __user *set_child_tid;
/* CLONE_CHILD_CLEARTID: */
int __user *clear_child_tid;
/* PF_IO_WORKER */
void *pf_io_worker;
u64 utime;
u64 stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
u64 utimescaled;
u64 stimescaled;
#endif
u64 gtime;
struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
struct vtime vtime;
#endif
#ifdef CONFIG_NO_HZ_FULL
atomic_t tick_dep_mask;
#endif
/* Context switch counts: */
unsigned long nvcsw;
unsigned long nivcsw;
/* Monotonic time in nsecs: */
u64 start_time;
/* Boot based time in nsecs: */
u64 start_boottime;
/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
unsigned long min_flt;
unsigned long maj_flt;
/* Empty if CONFIG_POSIX_CPUTIMERS=n */
struct posix_cputimers posix_cputimers;
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
struct posix_cputimers_work posix_cputimers_work;
#endif
/* Process credentials: */
/* Tracer's credentials at attach: */
const struct cred __rcu *ptracer_cred;
/* Objective and real subjective task credentials (COW): */
const struct cred __rcu *real_cred;
/* Effective (overridable) subjective task credentials (COW): */
const struct cred __rcu *cred;
#ifdef CONFIG_KEYS
/* Cached requested key. */
struct key *cached_requested_key;
#endif
/*
* executable name, excluding path.
*
* - normally initialized setup_new_exec()
* - access it with [gs]et_task_comm()
* - lock it with task_lock()
*/
char comm[TASK_COMM_LEN];
struct nameidata *nameidata;
#ifdef CONFIG_SYSVIPC
struct sysv_sem sysvsem;
struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
unsigned long last_switch_time;
#endif
/* Filesystem information: */
struct fs_struct *fs;
/* Open file information: */
struct files_struct *files;
#ifdef CONFIG_IO_URING
struct io_uring_task *io_uring;
#endif
/* Namespaces: */
struct nsproxy *nsproxy;
/* Signal handlers: */
struct signal_struct *signal;
struct sighand_struct __rcu *sighand;
sigset_t blocked;
sigset_t real_blocked;
/* Restored if set_restore_sigmask() was used: */
sigset_t saved_sigmask;
struct sigpending pending;
unsigned long sas_ss_sp;
size_t sas_ss_size;
unsigned int sas_ss_flags;
struct callback_head *task_works;
#ifdef CONFIG_AUDIT
#ifdef CONFIG_AUDITSYSCALL
struct audit_context *audit_context;
#endif
kuid_t loginuid;
unsigned int sessionid;
#endif
struct seccomp seccomp;
struct syscall_user_dispatch syscall_dispatch;
/* Thread group tracking: */
u64 parent_exec_id;
u64 self_exec_id;
/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
spinlock_t alloc_lock;
/* Protection of the PI data structures: */
raw_spinlock_t pi_lock;
struct wake_q_node wake_q;
#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task: */
struct rb_root_cached pi_waiters;
/* Updated under owner's pi_lock and rq lock */
struct task_struct *pi_top_task;
/* Deadlock detection and priority inheritance handling: */
struct rt_mutex_waiter *pi_blocked_on;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
/* Mutex deadlock detection: */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
int non_block_count;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
struct irqtrace_events irqtrace;
unsigned int hardirq_threaded;
u64 hardirq_chain_key;
int softirqs_enabled;
int softirq_context;
int irq_config;
#endif
#ifdef CONFIG_PREEMPT_RT
int softirq_disable_cnt;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
u64 curr_chain_key;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
#endif
#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
unsigned int in_ubsan;
#endif
/* Journalling filesystem info: */
void *journal_info;
/* Stacked block device info: */
struct bio_list *bio_list;
#ifdef CONFIG_BLOCK
/* Stack plugging: */
struct blk_plug *plug;
#endif
/* VM state: */
struct reclaim_state *reclaim_state;
struct backing_dev_info *backing_dev_info;
struct io_context *io_context;
#ifdef CONFIG_COMPACTION
struct capture_control *capture_control;
#endif
/* Ptrace state: */
unsigned long ptrace_message;
kernel_siginfo_t *last_siginfo;
struct task_io_accounting ioac;
#ifdef CONFIG_PSI
/* Pressure stall state */
unsigned int psi_flags;
#endif
#ifdef CONFIG_TASK_XACCT
/* Accumulated RSS usage: */
u64 acct_rss_mem1;
/* Accumulated virtual memory usage: */
u64 acct_vm_mem1;
/* stime + utime since last update: */
u64 acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
/* Protected by ->alloc_lock: */
nodemask_t mems_allowed;
/* Sequence number to catch updates: */
seqcount_spinlock_t mems_allowed_seq;
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock: */
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock: */
struct list_head cg_list;
#endif
#ifdef CONFIG_X86_CPU_RESCTRL
u32 closid;
u32 rmid;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
struct mutex futex_exit_mutex;
unsigned int futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
/* Protected by alloc_lock: */
struct mempolicy *mempolicy;
short il_prev;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
int numa_preferred_nid;
unsigned long numa_migrate_retry;
/* Migration stamp: */
u64 node_stamp;
u64 last_task_numa_placement;
u64 last_sum_exec_runtime;
struct callback_head numa_work;
/*
* This pointer is only modified for current in syscall and
* pagefault context (and for tasks being destroyed), so it can be read
* from any of the following contexts:
* - RCU read-side critical section
* - current->numa_group from everywhere
* - task's runqueue locked, task not running
*/
struct numa_group __rcu *numa_group;
/*
* numa_faults is an array split into four regions:
* faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
* in this precise order.
*
* faults_memory: Exponential decaying average of faults on a per-node
* basis. Scheduling placement decisions are made based on these
* counts. The values remain static for the duration of a PTE scan.
* faults_cpu: Track the nodes the process was running on when a NUMA
* hinting fault was incurred.
* faults_memory_buffer and faults_cpu_buffer: Record faults per node
* during the current scan window. When the scan completes, the counts
* in faults_memory and faults_cpu decay and these values are copied.
*/
unsigned long *numa_faults;
unsigned long total_numa_faults;
/*
* numa_faults_locality tracks if faults recorded during the last
* scan window were remote/local or failed to migrate. The task scan
* period is adapted based on the locality of the faults with different
* weights depending on whether they were shared or private faults
*/
unsigned long numa_faults_locality[3];
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_RSEQ
struct rseq __user *rseq;
u32 rseq_sig;
/*
* RmW on rseq_event_mask must be performed atomically
* with respect to preemption.
*/
unsigned long rseq_event_mask;
#endif
struct tlbflush_unmap_batch tlb_ubc;
union {
refcount_t rcu_users;
struct rcu_head rcu;
};
/* Cache last used pipe for splice(): */
struct pipe_inode_info *splice_pipe;
struct page_frag task_frag;
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
unsigned int fail_nth;
#endif
/*
* When (nr_dirtied >= nr_dirtied_pause), it's time to call
* balance_dirty_pages() for a dirty throttling pause:
*/
int nr_dirtied;
int nr_dirtied_pause;
/* Start of a write-and-pause period: */
unsigned long dirty_paused_when;
#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
/*
* Time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
u64 timer_slack_ns;
u64 default_timer_slack_ns;
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
unsigned int kasan_depth;
#endif
#ifdef CONFIG_KCSAN
struct kcsan_ctx kcsan_ctx;
#ifdef CONFIG_TRACE_IRQFLAGS
struct irqtrace_events kcsan_save_irqtrace;
#endif
#endif
#if IS_ENABLED(CONFIG_KUNIT)
struct kunit *kunit_test;
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack: */
int curr_ret_stack;
int curr_ret_depth;
/* Stack of return addresses for return function tracing: */
struct ftrace_ret_stack *ret_stack;
/* Timestamp for last schedule: */
unsigned long long ftrace_timestamp;
/*
* Number of functions that haven't been traced
* because of depth overrun:
*/
atomic_t trace_overrun;
/* Pause tracing: */
atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
/* State flags for use by tracers: */
unsigned long trace;
/* Bitmask and counter of trace recursion: */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_KCOV
/* See kernel/kcov.c for more details. */
/* Coverage collection mode enabled for this task (0 if disabled): */
unsigned int kcov_mode;
/* Size of the kcov_area: */
unsigned int kcov_size;
/* Buffer for coverage collection: */
void *kcov_area;
/* KCOV descriptor wired with this task or NULL: */
struct kcov *kcov;
/* KCOV common handle for remote coverage collection: */
u64 kcov_handle;
/* KCOV sequence number: */
int kcov_sequence;
/* Collect coverage from softirq context: */
unsigned int kcov_softirq;
#endif
#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg_in_oom;
gfp_t memcg_oom_gfp_mask;
int memcg_oom_order;
/* Number of pages to reclaim on returning to userland: */
unsigned int memcg_nr_pages_over_high;
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
#endif
#ifdef CONFIG_BLK_CGROUP
struct request_queue *throttle_queue;
#endif
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
struct kmap_ctrl kmap_ctrl;
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long task_state_change;
# ifdef CONFIG_PREEMPT_RT
unsigned long saved_state_change;
# endif
#endif
int pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
struct timer_list oom_reaper_timer;
#endif
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* A live task holds one reference: */
refcount_t stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
int patch_state;
#endif
#ifdef CONFIG_SECURITY
/* Used by LSM modules for access restriction: */
void *security;
#endif
#ifdef CONFIG_BPF_SYSCALL
/* Used by BPF task local storage */
struct bpf_local_storage __rcu *bpf_storage;
/* Used for BPF run context */
struct bpf_run_ctx *bpf_ctx;
#endif
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
unsigned long lowest_stack;
unsigned long prev_lowest_stack;
#endif
#ifdef CONFIG_X86_MCE
void __user *mce_vaddr;
__u64 mce_kflags;
u64 mce_addr;
__u64 mce_ripv : 1,
mce_whole_page : 1,
__mce_reserved : 62;
struct callback_head mce_kill_me;
int mce_count;
#endif
#ifdef CONFIG_KRETPROBES
struct llist_head kretprobe_instances;
#endif
#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
/*
* If L1D flush is supported on mm context switch
* then we use this callback head to queue kill work
* to kill tasks that are not running on SMT disabled
* cores
*/
struct callback_head l1d_flush_kill;
#endif
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
*/
randomized_struct_fields_end
/* CPU-specific state of this task: */
struct thread_struct thread;
/*
* WARNING: on x86, 'thread_struct' contains a variable-sized
* structure. It *MUST* be at the end of 'task_struct'.
*
* Do not put anything below here!
*/
};
static inline struct pid *task_pid(struct task_struct *task)
{
return task->thread_pid;
}
/*
* the helpers to get the task's different pids as they are seen
* from various namespaces
*
* task_xid_nr() : global id, i.e. the id seen from the init namespace;
* task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
* current.
* task_xid_nr_ns() : id seen from the ns specified;
*
* see also pid_nr() etc in include/linux/pid.h
*/
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
static inline pid_t task_pid_nr(struct task_struct *tsk)
{
return tsk->pid;
}
static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
}
static inline pid_t task_pid_vnr(struct task_struct *tsk)
{
return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
}
static inline pid_t task_tgid_nr(struct task_struct *tsk)
{
return tsk->tgid;
}
/**
* pid_alive - check that a task structure is not stale
* @p: Task structure to be checked.
*
* Test if a process is not yet dead (at most zombie state)
* If pid_alive fails, then pointers within the task structure
* can be stale and must not be dereferenced.
*
* Return: 1 if the process is alive. 0 otherwise.
*/
static inline int pid_alive(const struct task_struct *p)
{
return p->thread_pid != NULL;
}
static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
}
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
{
return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
}
static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
}
static inline pid_t task_session_vnr(struct task_struct *tsk)
{
return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
}
static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
}
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
}
static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
{
pid_t pid = 0;
rcu_read_lock();
if (pid_alive(tsk))
pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
rcu_read_unlock();
return pid;
}
static inline pid_t task_ppid_nr(const struct task_struct *tsk)
{
return task_ppid_nr_ns(tsk, &init_pid_ns);
}
/* Obsolete, do not use: */
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
{
return task_pgrp_nr_ns(tsk, &init_pid_ns);
}
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
static inline unsigned int task_state_index(struct task_struct *tsk)
{
unsigned int tsk_state = READ_ONCE(tsk->__state);
unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
if (tsk_state == TASK_IDLE)
state = TASK_REPORT_IDLE;
/*
* We're lying here, but rather than expose a completely new task state
* to userspace, we can make this appear as if the task has gone through
* a regular rt_mutex_lock() call.
*/
if (tsk_state == TASK_RTLOCK_WAIT)
state = TASK_UNINTERRUPTIBLE;
return fls(state);
}
static inline char task_index_to_char(unsigned int state)
{
static const char state_char[] = "RSDTtXZPI";
BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1);
return state_char[state];
}
static inline char task_state_to_char(struct task_struct *tsk)
{
return task_index_to_char(task_state_index(tsk));
}
/**
* is_global_init - check if a task structure is init. Since init
* is free to have sub-threads we need to check tgid.
* @tsk: Task structure to be checked.
*
* Check if a task structure is the first user space task the kernel created.
*
* Return: 1 if the task structure is init. 0 otherwise.
*/
static inline int is_global_init(struct task_struct *tsk)
{
return task_tgid_nr(tsk) == 1;
}
extern struct pid *cad_pid;
/*
* Per process flags
*/
#define PF_VCPU 0x00000001 /* I'm a virtual CPU */
#define PF_IDLE 0x00000002 /* I am an IDLE thread */
#define PF_EXITING 0x00000004 /* Getting shut down */
#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
#define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */
#define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */
#define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* Dumped core */
#define PF_SIGNALED 0x00000400 /* Killed by a signal */
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
#define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
#define PF_FROZEN 0x00010000 /* Frozen for system suspend */
#define PF_KSWAPD 0x00020000 /* I am kswapd */
#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to,
* I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
/*
* Only the _current_ task can read/write to tsk->flags, but other
* tasks can access tsk->flags in readonly mode for example
* with tsk_used_math (like during threaded core dumping).
* There is however an exception to this rule during ptrace
* or during fork: the ptracer task is allowed to write to the
* child->flags of its traced child (same goes for fork, the parent
* can write to the child->flags), because we're guaranteed the
* child is not running and in turn not changing child->flags
* at the same time the parent does it.
*/
#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
#define clear_used_math() clear_stopped_child_used_math(current)
#define set_used_math() set_stopped_child_used_math(current)
#define conditional_stopped_child_used_math(condition, child) \
do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
#define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current)
#define copy_to_stopped_child_used_math(child) \
do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
#define used_math() tsk_used_math(current)
static __always_inline bool is_percpu_thread(void)
{
#ifdef CONFIG_SMP
return (current->flags & PF_NO_SETAFFINITY) &&
(current->nr_cpus_allowed == 1);
#else
return true;
#endif
}
/* Per-process atomic flags. */
#define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */
#define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */
#define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */
#define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */
#define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/
#define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */
#define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */
#define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */
#define TASK_PFA_TEST(name, func) \
static inline bool task_##func(struct task_struct *p) \
{ return test_bit(PFA_##name, &p->atomic_flags); }
#define TASK_PFA_SET(name, func) \
static inline void task_set_##func(struct task_struct *p) \
{ set_bit(PFA_##name, &p->atomic_flags); }
#define TASK_PFA_CLEAR(name, func) \
static inline void task_clear_##func(struct task_struct *p) \
{ clear_bit(PFA_##name, &p->atomic_flags); }
TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)
TASK_PFA_TEST(SPREAD_PAGE, spread_page)
TASK_PFA_SET(SPREAD_PAGE, spread_page)
TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)
TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
TASK_PFA_SET(SPREAD_SLAB, spread_slab)
TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
static inline void
current_restore_flags(unsigned long orig_flags, unsigned long flags)
{
current->flags &= ~flags;
current->flags |= orig_flags & flags;
}
extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
#ifdef CONFIG_SMP
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
extern void release_user_cpus_ptr(struct task_struct *p);
extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
#else
static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
}
static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
if (!cpumask_test_cpu(0, new_mask))
return -EINVAL;
return 0;
}
static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
{
if (src->user_cpus_ptr)
return -EINVAL;
return 0;
}
static inline void release_user_cpus_ptr(struct task_struct *p)
{
WARN_ON(p->user_cpus_ptr);
}
static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
return 0;
}
#endif
extern int yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);
/**
* task_nice - return the nice value of a given task.
* @p: the task in question.
*
* Return: The nice value [ -20 ... 0 ... 19 ].
*/
static inline int task_nice(const struct task_struct *p)
{
return PRIO_TO_NICE((p)->static_prio);
}
extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);
extern void sched_set_fifo_low(struct task_struct *p);
extern void sched_set_normal(struct task_struct *p, int nice);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
extern struct task_struct *idle_task(int cpu);
/**
* is_idle_task - is the specified task an idle task?
* @p: the task in question.
*
* Return: 1 if @p is an idle task. 0 otherwise.
*/
static __always_inline bool is_idle_task(const struct task_struct *p)
{
return !!(p->flags & PF_IDLE);
}
extern struct task_struct *curr_task(int cpu);
extern void ia64_set_curr_task(int cpu, struct task_struct *p);
void yield(void);
union thread_union {
#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
struct task_struct task;
#endif
#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info thread_info;
#endif
unsigned long stack[THREAD_SIZE/sizeof(long)];
};
#ifndef CONFIG_THREAD_INFO_IN_TASK
extern struct thread_info init_thread_info;
#endif
extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];
#ifdef CONFIG_THREAD_INFO_IN_TASK
static inline struct thread_info *task_thread_info(struct task_struct *task)
{
return &task->thread_info;
}
#elif !defined(__HAVE_THREAD_FUNCTIONS)
# define task_thread_info(task) ((struct thread_info *)(task)->stack)
#endif
/*
* find a task by one of its numerical ids
*
* find_task_by_pid_ns():
* finds a task by its pid in the specified namespace
* find_task_by_vpid():
* finds a task by its virtual pid
*
* see also find_vpid() etc in include/linux/pid.h
*/
extern struct task_struct *find_task_by_vpid(pid_t nr);
extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns);
/*
* find a task by its virtual pid and get the task struct
*/
extern struct task_struct *find_get_task_by_vpid(pid_t nr);
extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk);
#ifdef CONFIG_SMP
extern void kick_process(struct task_struct *tsk);
#else
static inline void kick_process(struct task_struct *tsk) { }
#endif
extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
static inline void set_task_comm(struct task_struct *tsk, const char *from)
{
__set_task_comm(tsk, from, false);
}
extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
#define get_task_comm(buf, tsk) ({ \
BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \
__get_task_comm(buf, sizeof(buf), tsk); \
})
#ifdef CONFIG_SMP
static __always_inline void scheduler_ipi(void)
{
/*
* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
* TIF_NEED_RESCHED remotely (for the first time) will also send
* this IPI.
*/
preempt_fold_need_resched();
}
extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
#else
static inline void scheduler_ipi(void) { }
static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
{
return 1;
}
#endif
/*
* Set thread flags in other task's structures.
* See asm/thread_info.h for TIF_xxxx flags available:
*/
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
set_ti_thread_flag(task_thread_info(tsk), flag);
}
static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
clear_ti_thread_flag(task_thread_info(tsk), flag);
}
static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag,
bool value)
{
update_ti_thread_flag(task_thread_info(tsk), flag, value);
}
static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
}
static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
}
static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
{
return test_ti_thread_flag(task_thread_info(tsk), flag);
}
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
static inline int test_tsk_need_resched(struct task_struct *tsk)
{
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}
/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
* value indicates whether a reschedule was done in fact.
* cond_resched_lock() will drop the spinlock before scheduling,
*/
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
extern int __cond_resched(void);
#ifdef CONFIG_PREEMPT_DYNAMIC
DECLARE_STATIC_CALL(cond_resched, __cond_resched);
static __always_inline int _cond_resched(void)
{
return static_call_mod(cond_resched)();
}
#else
static inline int _cond_resched(void)
{
return __cond_resched();
}
#endif /* CONFIG_PREEMPT_DYNAMIC */
#else
static inline int _cond_resched(void) { return 0; }
#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
#define cond_resched() ({ \
___might_sleep(__FILE__, __LINE__, 0); \
_cond_resched(); \
})
extern int __cond_resched_lock(spinlock_t *lock);
extern int __cond_resched_rwlock_read(rwlock_t *lock);
extern int __cond_resched_rwlock_write(rwlock_t *lock);
#define cond_resched_lock(lock) ({ \
___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
__cond_resched_lock(lock); \
})
#define cond_resched_rwlock_read(lock) ({ \
__might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
__cond_resched_rwlock_read(lock); \
})
#define cond_resched_rwlock_write(lock) ({ \
__might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
__cond_resched_rwlock_write(lock); \
})
static inline void cond_resched_rcu(void)
{
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
rcu_read_unlock();
cond_resched();
rcu_read_lock();
#endif
}
/*
* Does a critical section need to be broken due to another
* task waiting?: (technically does not depend on CONFIG_PREEMPTION,
* but a general need for low latency)
*/
static inline int spin_needbreak(spinlock_t *lock)
{
#ifdef CONFIG_PREEMPTION
return spin_is_contended(lock);
#else
return 0;
#endif
}
/*
* Check if a rwlock is contended.
* Returns non-zero if there is another task waiting on the rwlock.
* Returns zero if the lock is not contended or the system / underlying
* rwlock implementation does not support contention detection.
* Technically does not depend on CONFIG_PREEMPTION, but a general need
* for low latency.
*/
static inline int rwlock_needbreak(rwlock_t *lock)
{
#ifdef CONFIG_PREEMPTION
return rwlock_is_contended(lock);
#else
return 0;
#endif
}
static __always_inline bool need_resched(void)
{
return unlikely(tif_need_resched());
}
/*
* Wrappers for p->thread_info->cpu access. No-op on UP.
*/
#ifdef CONFIG_SMP
static inline unsigned int task_cpu(const struct task_struct *p)
{
#ifdef CONFIG_THREAD_INFO_IN_TASK
return READ_ONCE(p->cpu);
#else
return READ_ONCE(task_thread_info(p)->cpu);
#endif
}
extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
#else
static inline unsigned int task_cpu(const struct task_struct *p)
{
return 0;
}
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
}
#endif /* CONFIG_SMP */
extern bool sched_task_on_rq(struct task_struct *p);
/*
* In order to reduce various lock holder preemption latencies provide an
* interface to see if a vCPU is currently running or not.
*
* This allows us to terminate optimistic spin loops and block, analogous to
* the native optimistic spin heuristic of testing if the lock owner task is
* running or not.
*/
#ifndef vcpu_is_preempted
static inline bool vcpu_is_preempted(int cpu)
{
return false;
}
#endif
extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
#ifndef TASK_SIZE_OF
#define TASK_SIZE_OF(tsk) TASK_SIZE
#endif
#ifdef CONFIG_SMP
/* Returns effective CPU energy utilization, as seen by the scheduler */
unsigned long sched_cpu_util(int cpu, unsigned long max);
#endif /* CONFIG_SMP */
#ifdef CONFIG_RSEQ
/*
* Map the event mask on the user-space ABI enum rseq_cs_flags
* for direct mask checks.
*/
enum rseq_event_mask_bits {
RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
};
enum rseq_event_mask {
RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT),
RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT),
RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT),
};
static inline void rseq_set_notify_resume(struct task_struct *t)
{
if (t->rseq)
set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}
void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
struct pt_regs *regs)
{
if (current->rseq)
__rseq_handle_notify_resume(ksig, regs);
}
static inline void rseq_signal_deliver(struct ksignal *ksig,
struct pt_regs *regs)
{
preempt_disable();
__set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask);
preempt_enable();
rseq_handle_notify_resume(ksig, regs);
}
/* rseq_preempt() requires preemption to be disabled. */
static inline void rseq_preempt(struct task_struct *t)
{
__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
rseq_set_notify_resume(t);
}
/* rseq_migrate() requires preemption to be disabled. */
static inline void rseq_migrate(struct task_struct *t)
{
__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
rseq_set_notify_resume(t);
}
/*
* If parent process has a registered restartable sequences area, the
* child inherits. Unregister rseq for a clone with CLONE_VM set.
*/
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
if (clone_flags & CLONE_VM) {
t->rseq = NULL;
t->rseq_sig = 0;
t->rseq_event_mask = 0;
} else {
t->rseq = current->rseq;
t->rseq_sig = current->rseq_sig;
t->rseq_event_mask = current->rseq_event_mask;
}
}
static inline void rseq_execve(struct task_struct *t)
{
t->rseq = NULL;
t->rseq_sig = 0;
t->rseq_event_mask = 0;
}
#else
static inline void rseq_set_notify_resume(struct task_struct *t)
{
}
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
struct pt_regs *regs)
{
}
static inline void rseq_signal_deliver(struct ksignal *ksig,
struct pt_regs *regs)
{
}
static inline void rseq_preempt(struct task_struct *t)
{
}
static inline void rseq_migrate(struct task_struct *t)
{
}
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
}
static inline void rseq_execve(struct task_struct *t)
{
}
#endif
#ifdef CONFIG_DEBUG_RSEQ
void rseq_syscall(struct pt_regs *regs);
#else
static inline void rseq_syscall(struct pt_regs *regs)
{
}
#endif
const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
int sched_trace_rq_cpu(struct rq *rq);
int sched_trace_rq_cpu_capacity(struct rq *rq);
int sched_trace_rq_nr_running(struct rq *rq);
const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
#ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p);
extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
unsigned long uaddr);
#else
static inline void sched_core_free(struct task_struct *tsk) { }
static inline void sched_core_fork(struct task_struct *p) { }
#endif
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H
#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/srcu.h>
#include <linux/lockdep.h>
struct blk_mq_tags;
struct blk_flush_queue;
/**
* struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
* block device
*/
struct blk_mq_hw_ctx {
struct {
/** @lock: Protects the dispatch list. */
spinlock_t lock;
/**
* @dispatch: Used for requests that are ready to be
* dispatched to the hardware but for some reason (e.g. lack of
* resources) could not be sent to the hardware. As soon as the
* driver can send new requests, requests at this list will
* be sent first for a fairer dispatch.
*/
struct list_head dispatch;
/**
* @state: BLK_MQ_S_* flags. Defines the state of the hw
* queue (active, scheduled to restart, stopped).
*/
unsigned long state;
} ____cacheline_aligned_in_smp;
/**
* @run_work: Used for scheduling a hardware queue run at a later time.
*/
struct delayed_work run_work;
/** @cpumask: Map of available CPUs where this hctx can run. */
cpumask_var_t cpumask;
/**
* @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
* selection from @cpumask.
*/
int next_cpu;
/**
* @next_cpu_batch: Counter of how many works left in the batch before
* changing to the next CPU.
*/
int next_cpu_batch;
/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
unsigned long flags;
/**
* @sched_data: Pointer owned by the IO scheduler attached to a request
* queue. It's up to the IO scheduler how to use this pointer.
*/
void *sched_data;
/**
* @queue: Pointer to the request queue that owns this hardware context.
*/
struct request_queue *queue;
/** @fq: Queue of requests that need to perform a flush operation. */
struct blk_flush_queue *fq;
/**
* @driver_data: Pointer to data owned by the block driver that created
* this hctx
*/
void *driver_data;
/**
* @ctx_map: Bitmap for each software queue. If bit is on, there is a
* pending request in that software queue.
*/
struct sbitmap ctx_map;
/**
* @dispatch_from: Software queue to be used when no scheduler was
* selected.
*/
struct blk_mq_ctx *dispatch_from;
/**
* @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
* decide if the hw_queue is busy using Exponential Weighted Moving
* Average algorithm.
*/
unsigned int dispatch_busy;
/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
unsigned short type;
/** @nr_ctx: Number of software queues. */
unsigned short nr_ctx;
/** @ctxs: Array of software queues. */
struct blk_mq_ctx **ctxs;
/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
spinlock_t dispatch_wait_lock;
/**
* @dispatch_wait: Waitqueue to put requests when there is no tag
* available at the moment, to wait for another try in the future.
*/
wait_queue_entry_t dispatch_wait;
/**
* @wait_index: Index of next available dispatch_wait queue to insert
* requests.
*/
atomic_t wait_index;
/**
* @tags: Tags owned by the block driver. A tag at this set is only
* assigned when a request is dispatched from a hardware queue.
*/
struct blk_mq_tags *tags;
/**
* @sched_tags: Tags owned by I/O scheduler. If there is an I/O
* scheduler associated with a request queue, a tag is assigned when
* that request is allocated. Else, this member is not used.
*/
struct blk_mq_tags *sched_tags;
/** @queued: Number of queued requests. */
unsigned long queued;
/** @run: Number of dispatched requests. */
unsigned long run;
#define BLK_MQ_MAX_DISPATCH_ORDER 7
/** @dispatched: Number of dispatch requests by queue. */
unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
/** @numa_node: NUMA node the storage adapter has been connected to. */
unsigned int numa_node;
/** @queue_num: Index of this hardware queue. */
unsigned int queue_num;
/**
* @nr_active: Number of active requests. Only used when a tag set is
* shared across request queues.
*/
atomic_t nr_active;
/** @cpuhp_online: List to store request if CPU is going to die */
struct hlist_node cpuhp_online;
/** @cpuhp_dead: List to store request if some CPU die. */
struct hlist_node cpuhp_dead;
/** @kobj: Kernel object for sysfs. */
struct kobject kobj;
/** @poll_considered: Count times blk_poll() was called. */
unsigned long poll_considered;
/** @poll_invoked: Count how many requests blk_poll() polled. */
unsigned long poll_invoked;
/** @poll_success: Count how many polled requests were completed. */
unsigned long poll_success;
#ifdef CONFIG_BLK_DEBUG_FS
/**
* @debugfs_dir: debugfs directory for this hardware queue. Named
* as cpu<cpu_number>.
*/
struct dentry *debugfs_dir;
/** @sched_debugfs_dir: debugfs directory for the scheduler. */
struct dentry *sched_debugfs_dir;
#endif
/**
* @hctx_list: if this hctx is not in use, this is an entry in
* q->unused_hctx_list.
*/
struct list_head hctx_list;
/**
* @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
* blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
* blk_mq_hw_ctx_size().
*/
struct srcu_struct srcu[];
};
/**
* struct blk_mq_queue_map - Map software queues to hardware queues
* @mq_map: CPU ID to hardware queue index map. This is an array
* with nr_cpu_ids elements. Each element has a value in the range
* [@queue_offset, @queue_offset + @nr_queues).
* @nr_queues: Number of hardware queues to map CPU IDs onto.
* @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
* driver to map each hardware queue type (enum hctx_type) onto a distinct
* set of hardware queues.
*/
struct blk_mq_queue_map {
unsigned int *mq_map;
unsigned int nr_queues;
unsigned int queue_offset;
};
/**
* enum hctx_type - Type of hardware queue
* @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for.
* @HCTX_TYPE_READ: Just for READ I/O.
* @HCTX_TYPE_POLL: Polled I/O of any kind.
* @HCTX_MAX_TYPES: Number of types of hctx.
*/
enum hctx_type {
HCTX_TYPE_DEFAULT,
HCTX_TYPE_READ,
HCTX_TYPE_POLL,
HCTX_MAX_TYPES,
};
/**
* struct blk_mq_tag_set - tag set that can be shared between request queues
* @map: One or more ctx -> hctx mappings. One map exists for each
* hardware queue type (enum hctx_type) that the driver wishes
* to support. There are no restrictions on maps being of the
* same size, and it's perfectly legal to share maps between
* types.
* @nr_maps: Number of elements in the @map array. A number in the range
* [1, HCTX_MAX_TYPES].
* @ops: Pointers to functions that implement block driver behavior.
* @nr_hw_queues: Number of hardware queues supported by the block driver that
* owns this data structure.
* @queue_depth: Number of tags per hardware queue, reserved tags included.
* @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
* allocations.
* @cmd_size: Number of additional bytes to allocate per request. The block
* driver owns these additional bytes.
* @numa_node: NUMA node the storage adapter has been connected to.
* @timeout: Request processing timeout in jiffies.
* @flags: Zero or more BLK_MQ_F_* flags.
* @driver_data: Pointer to data owned by the block driver that created this
* tag set.
* @active_queues_shared_sbitmap:
* number of active request queues per tag set.
* @__bitmap_tags: A shared tags sbitmap, used over all hctx's
* @__breserved_tags:
* A shared reserved tags sbitmap, used over all hctx's
* @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues
* elements.
* @tag_list_lock: Serializes tag_list accesses.
* @tag_list: List of the request queues that use this tag set. See also
* request_queue.tag_set_list.
*/
struct blk_mq_tag_set {
struct blk_mq_queue_map map[HCTX_MAX_TYPES];
unsigned int nr_maps;
const struct blk_mq_ops *ops;
unsigned int nr_hw_queues;
unsigned int queue_depth;
unsigned int reserved_tags;
unsigned int cmd_size;
int numa_node;
unsigned int timeout;
unsigned int flags;
void *driver_data;
atomic_t active_queues_shared_sbitmap;
struct sbitmap_queue __bitmap_tags;
struct sbitmap_queue __breserved_tags;
struct blk_mq_tags **tags;
struct mutex tag_list_lock;
struct list_head tag_list;
};
/**
* struct blk_mq_queue_data - Data about a request inserted in a queue
*
* @rq: Request pointer.
* @last: If it is the last request in the queue.
*/
struct blk_mq_queue_data {
struct request *rq;
bool last;
};
typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
bool);
typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
/**
* struct blk_mq_ops - Callback functions that implements block driver
* behaviour.
*/
struct blk_mq_ops {
/**
* @queue_rq: Queue a new request from block IO.
*/
blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
const struct blk_mq_queue_data *);
/**
* @commit_rqs: If a driver uses bd->last to judge when to submit
* requests to hardware, it must define this function. In case of errors
* that make us stop issuing further requests, this hook serves the
* purpose of kicking the hardware (which the last request otherwise
* would have done).
*/
void (*commit_rqs)(struct blk_mq_hw_ctx *);
/**
* @get_budget: Reserve budget before queue request, once .queue_rq is
* run, it is driver's responsibility to release the
* reserved budget. Also we have to handle failure case
* of .get_budget for avoiding I/O deadlock.
*/
int (*get_budget)(struct request_queue *);
/**
* @put_budget: Release the reserved budget.
*/
void (*put_budget)(struct request_queue *, int);
/**
* @set_rq_budget_token: store rq's budget token
*/
void (*set_rq_budget_token)(struct request *, int);
/**
* @get_rq_budget_token: retrieve rq's budget token
*/
int (*get_rq_budget_token)(struct request *);
/**
* @timeout: Called on request timeout.
*/
enum blk_eh_timer_return (*timeout)(struct request *, bool);
/**
* @poll: Called to poll for completion of a specific tag.
*/
int (*poll)(struct blk_mq_hw_ctx *);
/**
* @complete: Mark the request as complete.
*/
void (*complete)(struct request *);
/**
* @init_hctx: Called when the block layer side of a hardware queue has
* been set up, allowing the driver to allocate/init matching
* structures.
*/
int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
/**
* @exit_hctx: Ditto for exit/teardown.
*/
void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
/**
* @init_request: Called for every command allocated by the block layer
* to allow the driver to set up driver specific data.
*
* Tag greater than or equal to queue_depth is for setting up
* flush request.
*/
int (*init_request)(struct blk_mq_tag_set *set, struct request *,
unsigned int, unsigned int);
/**
* @exit_request: Ditto for exit/teardown.
*/
void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
unsigned int);
/**
* @initialize_rq_fn: Called from inside blk_get_request().
*/
void (*initialize_rq_fn)(struct request *rq);
/**
* @cleanup_rq: Called before freeing one request which isn't completed
* yet, and usually for freeing the driver private data.
*/
void (*cleanup_rq)(struct request *);
/**
* @busy: If set, returns whether or not this queue currently is busy.
*/
bool (*busy)(struct request_queue *);
/**
* @map_queues: This allows drivers specify their own queue mapping by
* overriding the setup-time function that builds the mq_map.
*/
int (*map_queues)(struct blk_mq_tag_set *set);
#ifdef CONFIG_BLK_DEBUG_FS
/**
* @show_rq: Used by the debugfs implementation to show driver-specific
* information about a request.
*/
void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
enum {
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
/*
* Set when this device requires underlying blk-mq device for
* completing IO:
*/
BLK_MQ_F_STACKING = 1 << 2,
BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
BLK_MQ_F_BLOCKING = 1 << 5,
/* Do not allow an I/O scheduler to be configured. */
BLK_MQ_F_NO_SCHED = 1 << 6,
/*
* Select 'none' during queue registration in case of a single hwq
* or shared hwqs instead of 'mq-deadline'.
*/
BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7,
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
BLK_MQ_F_ALLOC_POLICY_BITS = 1,
BLK_MQ_S_STOPPED = 0,
BLK_MQ_S_TAG_ACTIVE = 1,
BLK_MQ_S_SCHED_RESTART = 2,
/* hw queue is inactive after all its CPUs become offline */
BLK_MQ_S_INACTIVE = 3,
BLK_MQ_MAX_DEPTH = 10240,
BLK_MQ_CPU_WORK_BATCH = 8,
};
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
struct lock_class_key *lkclass);
#define blk_mq_alloc_disk(set, queuedata) \
({ \
static struct lock_class_key __key; \
\
__blk_mq_alloc_disk(set, queuedata, &__key); \
})
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q);
void blk_mq_unregister_dev(struct device *, struct request_queue *);
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int queue_depth,
unsigned int set_flags);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
void blk_mq_free_request(struct request *rq);
bool blk_mq_queue_inflight(struct request_queue *q);
enum {
/* return when out of requests */
BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0),
/* allocate from reserved pool */
BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1),
/* set RQF_PM */
BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2),
};
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
unsigned int op, blk_mq_req_flags_t flags,
unsigned int hctx_idx);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
enum {
BLK_MQ_UNIQUE_TAG_BITS = 16,
BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};
u32 blk_mq_unique_tag(struct request *rq);
static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}
static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}
/**
* blk_mq_rq_state() - read the current MQ_RQ_* state of a request
* @rq: target request.
*/
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
return READ_ONCE(rq->state);
}
static inline int blk_mq_request_started(struct request *rq)
{
return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}
static inline int blk_mq_request_completed(struct request *rq)
{
return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}
/*
*
* Set the state to complete when completing a request from inside ->queue_rq.
* This is used by drivers that want to ensure special complete actions that
* need access to the request are called on failure, e.g. by nvme for
* multipathing.
*/
static inline void blk_mq_set_request_complete(struct request *rq)
{
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
}
void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
unsigned long timeout);
int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
void blk_mq_quiesce_queue_nowait(struct request_queue *q);
unsigned int blk_mq_rq_cpu(struct request *rq);
bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
return __blk_should_fake_timeout(q);
return false;
}
/**
* blk_mq_rq_from_pdu - cast a PDU to a request
* @pdu: the PDU (Protocol Data Unit) to be casted
*
* Return: request
*
* Driver command data is immediately after the request. So subtract request
* size to get back to the original request.
*/
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
return pdu - sizeof(struct request);
}
/**
* blk_mq_rq_to_pdu - cast a request to a PDU
* @rq: the request to be casted
*
* Return: pointer to the PDU
*
* Driver command data is immediately after the request. So add request to get
* the PDU.
*/
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
return rq + 1;
}
#define queue_for_each_hw_ctx(q, hctx, i) \
for ((i) = 0; (i) < (q)->nr_hw_queues && \
({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
#define hctx_for_each_ctx(hctx, ctx, i) \
for ((i) = 0; (i) < (hctx)->nr_ctx && \
({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
if (rq->tag != -1)
return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT); return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
BLK_QC_T_INTERNAL;
}
static inline void blk_mq_cleanup_rq(struct request *rq)
{
if (rq->q->mq_ops->cleanup_rq)
rq->q->mq_ops->cleanup_rq(rq);
}
static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
unsigned int nr_segs)
{
rq->nr_phys_segments = nr_segs;
rq->__data_len = bio->bi_iter.bi_size;
rq->bio = rq->biotail = bio;
rq->ioprio = bio_prio(bio);
if (bio->bi_bdev)
rq->rq_disk = bio->bi_bdev->bd_disk;
}
blk_qc_t blk_mq_submit_bio(struct bio *bio);
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
struct lock_class_key *key);
#endif
// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/workqueue.c - generic async execution with shared worker pool
*
* Copyright (C) 2002 Ingo Molnar
*
* Derived from the taskqueue/keventd code by:
* David Woodhouse <dwmw2@infradead.org>
* Andrew Morton
* Kai Petzke <wpp@marie.physik.tu-berlin.de>
* Theodore Ts'o <tytso@mit.edu>
*
* Made to use alloc_percpu by Christoph Lameter.
*
* Copyright (C) 2010 SUSE Linux Products GmbH
* Copyright (C) 2010 Tejun Heo <tj@kernel.org>
*
* This is the generic async execution mechanism. Work items as are
* executed in process context. The worker pool is shared and
* automatically managed. There are two worker pools for each CPU (one for
* normal work items and the other for high priority ones) and some extra
* pools for workqueues which are not bound to any specific CPU - the
* number of these backing pools is dynamic.
*
* Please read Documentation/core-api/workqueue.rst for details.
*/
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>
#include "workqueue_internal.h"
enum {
/*
* worker_pool flags
*
* A bound pool is either associated or disassociated with its CPU.
* While associated (!DISASSOCIATED), all workers are bound to the
* CPU and none has %WORKER_UNBOUND set and concurrency management
* is in effect.
*
* While DISASSOCIATED, the cpu may be offline and all workers have
* %WORKER_UNBOUND set and concurrency management disabled, and may
* be executing on any CPU. The pool behaves as an unbound one.
*
* Note that DISASSOCIATED should be flipped only while holding
* wq_pool_attach_mutex to avoid changing binding state while
* worker_attach_to_pool() is in progress.
*/
POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */
POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
/* worker flags */
WORKER_DIE = 1 << 1, /* die die die */
WORKER_IDLE = 1 << 2, /* is idle */
WORKER_PREP = 1 << 3, /* preparing to run works */
WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
WORKER_UNBOUND = 1 << 7, /* worker is unbound */
WORKER_REBOUND = 1 << 8, /* worker was rebound */
WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE |
WORKER_UNBOUND | WORKER_REBOUND,
NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
/* call for help after 10ms
(min two ticks) */
MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
CREATE_COOLDOWN = HZ, /* time to breath after fail */
/*
* Rescue workers are used only on emergencies and shared by
* all cpus. Give MIN_NICE.
*/
RESCUER_NICE_LEVEL = MIN_NICE,
HIGHPRI_NICE_LEVEL = MIN_NICE,
WQ_NAME_LEN = 24,
};
/*
* Structure fields follow one of the following exclusion rules.
*
* I: Modifiable by initialization/destruction paths and read-only for
* everyone else.
*
* P: Preemption protected. Disabling preemption is enough and should
* only be modified and accessed from the local cpu.
*
* L: pool->lock protected. Access with pool->lock held.
*
* X: During normal operation, modification requires pool->lock and should
* be done only from local cpu. Either disabling preemption on local
* cpu or grabbing pool->lock is enough for read access. If
* POOL_DISASSOCIATED is set, it's identical to L.
*
* A: wq_pool_attach_mutex protected.
*
* PL: wq_pool_mutex protected.
*
* PR: wq_pool_mutex protected for writes. RCU protected for reads.
*
* PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
*
* PWR: wq_pool_mutex and wq->mutex protected for writes. Either or
* RCU for reads.
*
* WQ: wq->mutex protected.
*
* WR: wq->mutex protected for writes. RCU protected for reads.
*
* MD: wq_mayday_lock protected.
*/
/* struct worker is defined in workqueue_internal.h */
struct worker_pool {
raw_spinlock_t lock; /* the pool lock */
int cpu; /* I: the associated cpu */
int node; /* I: the associated node ID */
int id; /* I: pool ID */
unsigned int flags; /* X: flags */
unsigned long watchdog_ts; /* L: watchdog timestamp */
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
int nr_idle; /* L: currently idle workers */
struct list_head idle_list; /* X: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
struct timer_list mayday_timer; /* L: SOS timer for workers */
/* a workers is either on busy_hash or idle_list, or the manager */
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
/* L: hash of busy workers */
struct worker *manager; /* L: purely informational */
struct list_head workers; /* A: attached workers */
struct completion *detach_completion; /* all workers detached */
struct ida worker_ida; /* worker IDs for task name */
struct workqueue_attrs *attrs; /* I: worker attributes */
struct hlist_node hash_node; /* PL: unbound_pool_hash node */
int refcnt; /* PL: refcnt for unbound pools */
/*
* The current concurrency level. As it's likely to be accessed
* from other CPUs during try_to_wake_up(), put it in a separate
* cacheline.
*/
atomic_t nr_running ____cacheline_aligned_in_smp;
/*
* Destruction of pool is RCU protected to allow dereferences
* from get_work_pool().
*/
struct rcu_head rcu;
} ____cacheline_aligned_in_smp;
/*
* The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
* of work_struct->data are used for flags and the remaining high bits
* point to the pwq; thus, pwqs need to be aligned at two's power of the
* number of flag bits.
*/
struct pool_workqueue {
struct worker_pool *pool; /* I: the associated pool */
struct workqueue_struct *wq; /* I: the owning workqueue */
int work_color; /* L: current color */
int flush_color; /* L: flushing color */
int refcnt; /* L: reference count */
int nr_in_flight[WORK_NR_COLORS];
/* L: nr of in_flight works */
/*
* nr_active management and WORK_STRUCT_INACTIVE:
*
* When pwq->nr_active >= max_active, new work item is queued to
* pwq->inactive_works instead of pool->worklist and marked with
* WORK_STRUCT_INACTIVE.
*
* All work items marked with WORK_STRUCT_INACTIVE do not participate
* in pwq->nr_active and all work items in pwq->inactive_works are
* marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE
* work items are in pwq->inactive_works. Some of them are ready to
* run in pool->worklist or worker->scheduled. Those work itmes are
* only struct wq_barrier which is used for flush_work() and should
* not participate in pwq->nr_active. For non-barrier work item, it
* is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
*/
int nr_active; /* L: nr of active works */
int max_active; /* L: max active works */
struct list_head inactive_works; /* L: inactive works */
struct list_head pwqs_node; /* WR: node on wq->pwqs */
struct list_head mayday_node; /* MD: node on wq->maydays */
/*
* Release of unbound pwq is punted to system_wq. See put_pwq()
* and pwq_unbound_release_workfn() for details. pool_workqueue
* itself is also RCU protected so that the first pwq can be
* determined without grabbing wq->mutex.
*/
struct work_struct unbound_release_work;
struct rcu_head rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);
/*
* Structure used to wait for workqueue flush.
*/
struct wq_flusher {
struct list_head list; /* WQ: list of flushers */
int flush_color; /* WQ: flush color waiting for */
struct completion done; /* flush completion */
};
struct wq_device;
/*
* The externally visible workqueue. It relays the issued work items to
* the appropriate worker_pool through its pool_workqueues.
*/
struct workqueue_struct {
struct list_head pwqs; /* WR: all pwqs of this wq */
struct list_head list; /* PR: list of all workqueues */
struct mutex mutex; /* protects this wq */
int work_color; /* WQ: current work color */
int flush_color; /* WQ: current flush color */
atomic_t nr_pwqs_to_flush; /* flush in progress */
struct wq_flusher *first_flusher; /* WQ: first flusher */
struct list_head flusher_queue; /* WQ: flush waiters */
struct list_head flusher_overflow; /* WQ: flush overflow list */
struct list_head maydays; /* MD: pwqs requesting rescue */
struct worker *rescuer; /* MD: rescue worker */
int nr_drainers; /* WQ: drain in progress */
int saved_max_active; /* WQ: saved pwq max_active */
struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */
#ifdef CONFIG_SYSFS
struct wq_device *wq_dev; /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
char *lock_name;
struct lock_class_key key;
struct lockdep_map lockdep_map;
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
/*
* Destruction of workqueue_struct is RCU protected to allow walking
* the workqueues list without grabbing wq_pool_mutex.
* This is used to dump all workqueues from sysrq.
*/
struct rcu_head rcu;
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};
static struct kmem_cache *pwq_cache;
static cpumask_var_t *wq_numa_possible_cpumask;
/* possible CPUs of each node */
static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);
/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);
static bool wq_online; /* can kworkers be created yet? */
static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
/* wait for manager to go away */
static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
/* PL: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);
/*
* Local execution of unbound work items is no longer guaranteed. The
* following always forces round-robin CPU selection on unbound work items
* to uncover usages which depend on it.
*/
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
#define assert_rcu_or_pool_mutex() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&wq_pool_mutex), \
"RCU or wq_pool_mutex should be held")
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&wq->mutex) && \
!lockdep_is_held(&wq_pool_mutex), \
"RCU, wq->mutex or wq_pool_mutex should be held")
#define for_each_cpu_worker_pool(pool, cpu) \
for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
(pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
(pool)++)
/**
* for_each_pool - iterate through all worker_pools in the system
* @pool: iteration cursor
* @pi: integer used for iteration
*
* This must be called either with wq_pool_mutex held or RCU read
* locked. If the pool needs to be used beyond the locking in effect, the
* caller is responsible for guaranteeing that the pool stays online.
*
* The if/else clause exists only for the lockdep assertion and can be
* ignored.
*/
#define for_each_pool(pool, pi) \
idr_for_each_entry(&worker_pool_idr, pool, pi) \
if (({ assert_rcu_or_pool_mutex(); false; })) { } \
else
/**
* for_each_pool_worker - iterate through all workers of a worker_pool
* @worker: iteration cursor
* @pool: worker_pool to iterate workers of
*
* This must be called with wq_pool_attach_mutex.
*
* The if/else clause exists only for the lockdep assertion and can be
* ignored.
*/
#define for_each_pool_worker(worker, pool) \
list_for_each_entry((worker), &(pool)->workers, node) \
if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
else
/**
* for_each_pwq - iterate through all pool_workqueues of the specified workqueue
* @pwq: iteration cursor
* @wq: the target workqueue
*
* This must be called either with wq->mutex held or RCU read locked.
* If the pwq needs to be used beyond the locking in effect, the caller is
* responsible for guaranteeing that the pwq stays online.
*
* The if/else clause exists only for the lockdep assertion and can be
* ignored.
*/
#define for_each_pwq(pwq, wq) \
list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \
lockdep_is_held(&(wq->mutex)))
#ifdef CONFIG_DEBUG_OBJECTS_WORK
static const struct debug_obj_descr work_debug_descr;
static void *work_debug_hint(void *addr)
{
return ((struct work_struct *) addr)->func;
}
static bool work_is_static_object(void *addr)
{
struct work_struct *work = addr;
return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
}
/*
* fixup_init is called when:
* - an active object is initialized
*/
static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
struct work_struct *work = addr;
switch (state) {
case ODEBUG_STATE_ACTIVE:
cancel_work_sync(work);
debug_object_init(work, &work_debug_descr);
return true;
default:
return false;
}
}
/*
* fixup_free is called when:
* - an active object is freed
*/
static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
struct work_struct *work = addr;
switch (state) {
case ODEBUG_STATE_ACTIVE:
cancel_work_sync(work);
debug_object_free(work, &work_debug_descr);
return true;
default:
return false;
}
}
static const struct debug_obj_descr work_debug_descr = {
.name = "work_struct",
.debug_hint = work_debug_hint,
.is_static_object = work_is_static_object,
.fixup_init = work_fixup_init,
.fixup_free = work_fixup_free,
};
static inline void debug_work_activate(struct work_struct *work)
{
debug_object_activate(work, &work_debug_descr);
}
static inline void debug_work_deactivate(struct work_struct *work)
{
debug_object_deactivate(work, &work_debug_descr);
}
void __init_work(struct work_struct *work, int onstack)
{
if (onstack)
debug_object_init_on_stack(work, &work_debug_descr);
else
debug_object_init(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);
void destroy_work_on_stack(struct work_struct *work)
{
debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);
void destroy_delayed_work_on_stack(struct delayed_work *work)
{
destroy_timer_on_stack(&work->timer);
debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
#else
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif
/**
* worker_pool_assign_id - allocate ID and assign it to @pool
* @pool: the pool pointer of interest
*
* Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
* successfully, -errno on failure.
*/
static int worker_pool_assign_id(struct worker_pool *pool)
{
int ret;
lockdep_assert_held(&wq_pool_mutex);
ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
GFP_KERNEL);
if (ret >= 0) {
pool->id = ret;
return 0;
}
return ret;
}
/**
* unbound_pwq_by_node - return the unbound pool_workqueue for the given node
* @wq: the target workqueue
* @node: the node ID
*
* This must be called with any of wq_pool_mutex, wq->mutex or RCU
* read locked.
* If the pwq needs to be used beyond the locking in effect, the caller is
* responsible for guaranteeing that the pwq stays online.
*
* Return: The unbound pool_workqueue for @node.
*/
static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
int node)
{
assert_rcu_or_wq_mutex_or_pool_mutex(wq);
/*
* XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
* delayed item is pending. The plan is to keep CPU -> NODE
* mapping valid and stable across CPU on/offlines. Once that
* happens, this workaround can be removed.
*/
if (unlikely(node == NUMA_NO_NODE))
return wq->dfl_pwq; return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
}
static unsigned int work_color_to_flags(int color)
{
return color << WORK_STRUCT_COLOR_SHIFT;
}
static int get_work_color(unsigned long work_data)
{
return (work_data >> WORK_STRUCT_COLOR_SHIFT) &
((1 << WORK_STRUCT_COLOR_BITS) - 1);
}
static int work_next_color(int color)
{
return (color + 1) % WORK_NR_COLORS;
}
/*
* While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
* contain the pointer to the queued pwq. Once execution starts, the flag
* is cleared and the high bits contain OFFQ flags and pool ID.
*
* set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
* and clear_work_data() can be used to set the pwq, pool or clear
* work->data. These functions should only be called while the work is
* owned - ie. while the PENDING bit is set.
*
* get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
* corresponding to a work. Pool is available once the work has been
* queued anywhere after initialization until it is sync canceled. pwq is
* available only while the work item is queued.
*
* %WORK_OFFQ_CANCELING is used to mark a work item which is being
* canceled. While being canceled, a work item may have its PENDING set
* but stay off timer and worklist for arbitrarily long and nobody should
* try to steal the PENDING bit.
*/
static inline void set_work_data(struct work_struct *work, unsigned long data,
unsigned long flags)
{
WARN_ON_ONCE(!work_pending(work)); atomic_long_set(&work->data, data | flags | work_static(work));
}
static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
unsigned long extra_flags)
{
set_work_data(work, (unsigned long)pwq,
WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
}
static void set_work_pool_and_keep_pending(struct work_struct *work,
int pool_id)
{
set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
WORK_STRUCT_PENDING);
}
static void set_work_pool_and_clear_pending(struct work_struct *work,
int pool_id)
{
/*
* The following wmb is paired with the implied mb in
* test_and_set_bit(PENDING) and ensures all updates to @work made
* here are visible to and precede any updates by the next PENDING
* owner.
*/
smp_wmb();
set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
/*
* The following mb guarantees that previous clear of a PENDING bit
* will not be reordered with any speculative LOADS or STORES from
* work->current_func, which is executed afterwards. This possible
* reordering can lead to a missed execution on attempt to queue
* the same @work. E.g. consider this case:
*
* CPU#0 CPU#1
* ---------------------------- --------------------------------
*
* 1 STORE event_indicated
* 2 queue_work_on() {
* 3 test_and_set_bit(PENDING)
* 4 } set_..._and_clear_pending() {
* 5 set_work_data() # clear bit
* 6 smp_mb()
* 7 work->current_func() {
* 8 LOAD event_indicated
* }
*
* Without an explicit full barrier speculative LOAD on line 8 can
* be executed before CPU#0 does STORE on line 1. If that happens,
* CPU#0 observes the PENDING bit is still set and new execution of
* a @work is not queued in a hope, that CPU#1 will eventually
* finish the queued @work. Meanwhile CPU#1 does not see
* event_indicated is set, because speculative LOAD was executed
* before actual STORE.
*/
smp_mb();
}
static void clear_work_data(struct work_struct *work)
{
smp_wmb(); /* see set_work_pool_and_clear_pending() */
set_work_data(work, WORK_STRUCT_NO_POOL, 0);
}
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
if (data & WORK_STRUCT_PWQ)
return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
else
return NULL;
}
/**
* get_work_pool - return the worker_pool a given work was associated with
* @work: the work item of interest
*
* Pools are created and destroyed under wq_pool_mutex, and allows read
* access under RCU read lock. As such, this function should be
* called under wq_pool_mutex or inside of a rcu_read_lock() region.
*
* All fields of the returned pool are accessible as long as the above
* mentioned locking is in effect. If the returned pool needs to be used
* beyond the critical section, the caller is responsible for ensuring the
* returned pool is and stays online.
*
* Return: The worker_pool @work was last associated with. %NULL if none.
*/
static struct worker_pool *get_work_pool(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
int pool_id;
assert_rcu_or_pool_mutex();
if (data & WORK_STRUCT_PWQ)
return ((struct pool_workqueue *)
(data & WORK_STRUCT_WQ_DATA_MASK))->pool; pool_id = data >> WORK_OFFQ_POOL_SHIFT;
if (pool_id == WORK_OFFQ_POOL_NONE)
return NULL;
return idr_find(&worker_pool_idr, pool_id);}
/**
* get_work_pool_id - return the worker pool ID a given work is associated with
* @work: the work item of interest
*
* Return: The worker_pool ID @work was last associated with.
* %WORK_OFFQ_POOL_NONE if none.
*/
static int get_work_pool_id(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
if (data & WORK_STRUCT_PWQ)
return ((struct pool_workqueue *)
(data & WORK_STRUCT_WQ_DATA_MASK))->pool->id; return data >> WORK_OFFQ_POOL_SHIFT;
}
static void mark_work_canceling(struct work_struct *work)
{
unsigned long pool_id = get_work_pool_id(work);
pool_id <<= WORK_OFFQ_POOL_SHIFT;
set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
}
static bool work_is_canceling(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
}
/*
* Policy functions. These define the policies on how the global worker
* pools are managed. Unless noted otherwise, these functions assume that
* they're being called with pool->lock held.
*/
static bool __need_more_worker(struct worker_pool *pool)
{
return !atomic_read(&pool->nr_running);
}
/*
* Need to wake up a worker? Called from anything but currently
* running workers.
*
* Note that, because unbound workers never contribute to nr_running, this
* function will always return %true for unbound pools as long as the
* worklist isn't empty.
*/
static bool need_more_worker(struct worker_pool *pool)
{
return !list_empty(&pool->worklist) && __need_more_worker(pool);
}
/* Can I start working? Called from busy but !running workers. */
static bool may_start_working(struct worker_pool *pool)
{
return pool->nr_idle;
}
/* Do I need to keep working? Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
return !list_empty(&pool->worklist) &&
atomic_read(&pool->nr_running) <= 1;
}
/* Do we need a new worker? Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
return need_more_worker(pool) && !may_start_working(pool);
}
/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
bool managing = pool->flags & POOL_MANAGER_ACTIVE;
int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
int nr_busy = pool->nr_workers - nr_idle;
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
/*
* Wake up functions.
*/
/* Return the first idle worker. Safe with preemption disabled */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
if (unlikely(list_empty(&pool->idle_list)))
return NULL;
return list_first_entry(&pool->idle_list, struct worker, entry);
}
/**
* wake_up_worker - wake up an idle worker
* @pool: worker pool to wake worker from
*
* Wake up the first idle worker of @pool.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
static void wake_up_worker(struct worker_pool *pool)
{
struct worker *worker = first_idle_worker(pool);
if (likely(worker))
wake_up_process(worker->task);
}
/**
* wq_worker_running - a worker is running again
* @task: task waking up
*
* This function is called when a worker returns from schedule()
*/
void wq_worker_running(struct task_struct *task)
{
struct worker *worker = kthread_data(task);
if (!worker->sleeping)
return;
/*
* If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
* and the nr_running increment below, we may ruin the nr_running reset
* and leave with an unexpected pool->nr_running == 1 on the newly unbound
* pool. Protect against such race.
*/
preempt_disable();
if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(&worker->pool->nr_running);
preempt_enable();
worker->sleeping = 0;
}
/**
* wq_worker_sleeping - a worker is going to sleep
* @task: task going to sleep
*
* This function is called from schedule() when a busy worker is
* going to sleep. Preemption needs to be disabled to protect ->sleeping
* assignment.
*/
void wq_worker_sleeping(struct task_struct *task)
{
struct worker *next, *worker = kthread_data(task);
struct worker_pool *pool;
/*
* Rescuers, which may not have all the fields set up like normal
* workers, also reach here, let's not access anything before
* checking NOT_RUNNING.
*/
if (worker->flags & WORKER_NOT_RUNNING)
return;
pool = worker->pool;
/* Return if preempted before wq_worker_running() was reached */
if (worker->sleeping)
return;
worker->sleeping = 1;
raw_spin_lock_irq(&pool->lock);
/*
* The counterpart of the following dec_and_test, implied mb,
* worklist not empty test sequence is in insert_work().
* Please read comment there.
*
* NOT_RUNNING is clear. This means that we're bound to and
* running on the local cpu w/ rq lock held and preemption
* disabled, which in turn means that none else could be
* manipulating idle_list, so dereferencing idle_list without pool
* lock is safe.
*/
if (atomic_dec_and_test(&pool->nr_running) &&
!list_empty(&pool->worklist)) {
next = first_idle_worker(pool);
if (next)
wake_up_process(next->task);
}
raw_spin_unlock_irq(&pool->lock);
}
/**
* wq_worker_last_func - retrieve worker's last work function
* @task: Task to retrieve last work function of.
*
* Determine the last function a worker executed. This is called from
* the scheduler to get a worker's last known identity.
*
* CONTEXT:
* raw_spin_lock_irq(rq->lock)
*
* This function is called during schedule() when a kworker is going
* to sleep. It's used by psi to identify aggregation workers during
* dequeuing, to allow periodic aggregation to shut-off when that
* worker is the last task in the system or cgroup to go to sleep.
*
* As this function doesn't involve any workqueue-related locking, it
* only returns stable values when called from inside the scheduler's
* queuing and dequeuing paths, when @task, which must be a kworker,
* is guaranteed to not be processing any works.
*
* Return:
* The last work function %current executed as a worker, NULL if it
* hasn't executed any work yet.
*/
work_func_t wq_worker_last_func(struct task_struct *task)
{
struct worker *worker = kthread_data(task);
return worker->last_func;
}
/**
* worker_set_flags - set worker flags and adjust nr_running accordingly
* @worker: self
* @flags: flags to set
*
* Set @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock)
*/
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
WARN_ON_ONCE(worker->task != current);
/* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
atomic_dec(&pool->nr_running);
}
worker->flags |= flags;
}
/**
* worker_clr_flags - clear worker flags and adjust nr_running accordingly
* @worker: self
* @flags: flags to clear
*
* Clear @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock)
*/
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
unsigned int oflags = worker->flags;
WARN_ON_ONCE(worker->task != current);
worker->flags &= ~flags;
/*
* If transitioning out of NOT_RUNNING, increment nr_running. Note
* that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
* of multiple flags, not a single flag.
*/
if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(&pool->nr_running);
}
/**
* find_worker_executing_work - find worker which is executing a work
* @pool: pool of interest
* @work: work to find worker for
*
* Find a worker which is executing @work on @pool by searching
* @pool->busy_hash which is keyed by the address of @work. For a worker
* to match, its current execution should match the address of @work and
* its work function. This is to avoid unwanted dependency between
* unrelated work executions through a work item being recycled while still
* being executed.
*
* This is a bit tricky. A work item may be freed once its execution
* starts and nothing prevents the freed area from being recycled for
* another work item. If the same work item address ends up being reused
* before the original execution finishes, workqueue will identify the
* recycled work item as currently executing and make it wait until the
* current execution finishes, introducing an unwanted dependency.
*
* This function checks the work item address and work function to avoid
* false positives. Note that this isn't complete as one may construct a
* work function which can introduce dependency onto itself through a
* recycled work item. Well, if somebody wants to shoot oneself in the
* foot that badly, there's only so much we can do, and if such deadlock
* actually occurs, it should be easy to locate the culprit work function.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*
* Return:
* Pointer to worker which is executing @work if found, %NULL
* otherwise.
*/
static struct worker *find_worker_executing_work(struct worker_pool *pool,
struct work_struct *work)
{
struct worker *worker;
hash_for_each_possible(pool->busy_hash, worker, hentry,
(unsigned long)work)
if (worker->current_work == work && worker->current_func == work->func)
return worker;
return NULL;
}
/**
* move_linked_works - move linked works to a list
* @work: start of series of works to be scheduled
* @head: target list to append @work to
* @nextp: out parameter for nested worklist walking
*
* Schedule linked works starting from @work to @head. Work series to
* be scheduled starts at @work and includes any consecutive work with
* WORK_STRUCT_LINKED set in its predecessor.
*
* If @nextp is not NULL, it's updated to point to the next work of
* the last scheduled work. This allows move_linked_works() to be
* nested inside outer list_for_each_entry_safe().
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
static void move_linked_works(struct work_struct *work, struct list_head *head,
struct work_struct **nextp)
{
struct work_struct *n;
/*
* Linked worklist will always end before the end of the list,
* use NULL for list head.
*/
list_for_each_entry_safe_from(work, n, NULL, entry) {
list_move_tail(&work->entry, head);
if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
break;
}
/*
* If we're already inside safe list traversal and have moved
* multiple works to the scheduled queue, the next position
* needs to be updated.
*/
if (nextp)
*nextp = n;
}
/**
* get_pwq - get an extra reference on the specified pool_workqueue
* @pwq: pool_workqueue to get
*
* Obtain an extra reference on @pwq. The caller should guarantee that
* @pwq has positive refcnt and be holding the matching pool->lock.
*/
static void get_pwq(struct pool_workqueue *pwq)
{
lockdep_assert_held(&pwq->pool->lock);
WARN_ON_ONCE(pwq->refcnt <= 0); pwq->refcnt++;
}
/**
* put_pwq - put a pool_workqueue reference
* @pwq: pool_workqueue to put
*
* Drop a reference of @pwq. If its refcnt reaches zero, schedule its
* destruction. The caller should be holding the matching pool->lock.
*/
static void put_pwq(struct pool_workqueue *pwq)
{
lockdep_assert_held(&pwq->pool->lock);
if (likely(--pwq->refcnt))
return;
if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
return;
/*
* @pwq can't be released under pool->lock, bounce to
* pwq_unbound_release_workfn(). This never recurses on the same
* pool->lock as this path is taken only for unbound workqueues and
* the release work item is scheduled on a per-cpu workqueue. To
* avoid lockdep warning, unbound pool->locks are given lockdep
* subclass of 1 in get_unbound_pool().
*/
schedule_work(&pwq->unbound_release_work);
}
/**
* put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
* @pwq: pool_workqueue to put (can be %NULL)
*
* put_pwq() with locking. This function also allows %NULL @pwq.
*/
static void put_pwq_unlocked(struct pool_workqueue *pwq)
{
if (pwq) {
/*
* As both pwqs and pools are RCU protected, the
* following lock operations are safe.
*/
raw_spin_lock_irq(&pwq->pool->lock);
put_pwq(pwq);
raw_spin_unlock_irq(&pwq->pool->lock);
}
}
static void pwq_activate_inactive_work(struct work_struct *work)
{
struct pool_workqueue *pwq = get_work_pwq(work);
trace_workqueue_activate_work(work);
if (list_empty(&pwq->pool->worklist))
pwq->pool->watchdog_ts = jiffies;
move_linked_works(work, &pwq->pool->worklist, NULL);
__clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work));
pwq->nr_active++;
}
static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
{
struct work_struct *work = list_first_entry(&pwq->inactive_works,
struct work_struct, entry);
pwq_activate_inactive_work(work);
}
/**
* pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
* @pwq: pwq of interest
* @work_data: work_data of work which left the queue
*
* A work either has completed or is removed from pending queue,
* decrement nr_in_flight of its pwq and handle workqueue flushing.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
{
int color = get_work_color(work_data);
if (!(work_data & WORK_STRUCT_INACTIVE)) {
pwq->nr_active--;
if (!list_empty(&pwq->inactive_works)) {
/* one down, submit an inactive one */
if (pwq->nr_active < pwq->max_active) pwq_activate_first_inactive(pwq);
}
}
pwq->nr_in_flight[color]--;
/* is flush in progress and are we at the flushing tip? */
if (likely(pwq->flush_color != color))
goto out_put;
/* are there still in-flight works? */
if (pwq->nr_in_flight[color])
goto out_put;
/* this pwq is done, clear flush_color */
pwq->flush_color = -1;
/*
* If this was the last pwq, wake up the first flusher. It
* will handle the rest.
*/
if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
complete(&pwq->wq->first_flusher->done);
out_put:
put_pwq(pwq);
}
/**
* try_to_grab_pending - steal work item from worklist and disable irq
* @work: work item to steal
* @is_dwork: @work is a delayed_work
* @flags: place to store irq state
*
* Try to grab PENDING bit of @work. This function can handle @work in any
* stable state - idle, on timer or on worklist.
*
* Return:
*
* ======== ================================================================
* 1 if @work was pending and we successfully stole PENDING
* 0 if @work was idle and we claimed PENDING
* -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
* -ENOENT if someone else is canceling @work, this state may persist
* for arbitrarily long
* ======== ================================================================
*
* Note:
* On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
* interrupted while holding PENDING and @work off queue, irq must be
* disabled on entry. This, combined with delayed_work->timer being
* irqsafe, ensures that we return -EAGAIN for finite short period of time.
*
* On successful return, >= 0, irq is disabled and the caller is
* responsible for releasing it using local_irq_restore(*@flags).
*
* This function is safe to call from any context including IRQ handler.
*/
static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
unsigned long *flags)
{
struct worker_pool *pool;
struct pool_workqueue *pwq;
local_irq_save(*flags);
/* try to steal the timer if it exists */
if (is_dwork) {
struct delayed_work *dwork = to_delayed_work(work);
/*
* dwork->timer is irqsafe. If del_timer() fails, it's
* guaranteed that the timer is not queued anywhere and not
* running on the local CPU.
*/
if (likely(del_timer(&dwork->timer)))
return 1;
}
/* try to claim PENDING the normal way */
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0;
rcu_read_lock();
/*
* The queueing is in progress, or it is already queued. Try to
* steal it from ->worklist without clearing WORK_STRUCT_PENDING.
*/
pool = get_work_pool(work);
if (!pool)
goto fail;
raw_spin_lock(&pool->lock);
/*
* work->data is guaranteed to point to pwq only while the work
* item is queued on pwq->wq, and both updating work->data to point
* to pwq on queueing and to pool on dequeueing are done under
* pwq->pool->lock. This in turn guarantees that, if work->data
* points to pwq which is associated with a locked pool, the work
* item is currently queued on that pool.
*/
pwq = get_work_pwq(work);
if (pwq && pwq->pool == pool) {
debug_work_deactivate(work);
/*
* A cancelable inactive work item must be in the
* pwq->inactive_works since a queued barrier can't be
* canceled (see the comments in insert_wq_barrier()).
*
* An inactive work item cannot be grabbed directly because
* it might have linked barrier work items which, if left
* on the inactive_works list, will confuse pwq->nr_active
* management later on and cause stall. Make sure the work
* item is activated before grabbing.
*/
if (*work_data_bits(work) & WORK_STRUCT_INACTIVE) pwq_activate_inactive_work(work); list_del_init(&work->entry);
pwq_dec_nr_in_flight(pwq, *work_data_bits(work));
/* work->data points to pwq iff queued, point to pool */
set_work_pool_and_keep_pending(work, pool->id);
raw_spin_unlock(&pool->lock);
rcu_read_unlock();
return 1;
}
raw_spin_unlock(&pool->lock);
fail:
rcu_read_unlock();
local_irq_restore(*flags);
if (work_is_canceling(work))
return -ENOENT;
cpu_relax();
return -EAGAIN;
}
/**
* insert_work - insert a work into a pool
* @pwq: pwq @work belongs to
* @work: work to insert
* @head: insertion point
* @extra_flags: extra WORK_STRUCT_* flags to set
*
* Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
* work_struct flags.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct list_head *head, unsigned int extra_flags)
{
struct worker_pool *pool = pwq->pool;
/* record the work call stack in order to print it in KASAN reports */
kasan_record_aux_stack(work);
/* we own @work, set data and link */
set_work_pwq(work, pwq, extra_flags);
list_add_tail(&work->entry, head);
get_pwq(pwq);
/*
* Ensure either wq_worker_sleeping() sees the above
* list_add_tail() or we see zero nr_running to avoid workers lying
* around lazily while there are works to be processed.
*/
smp_mb();
if (__need_more_worker(pool))
wake_up_worker(pool);
}
/*
* Test whether @work is being queued from another work executing on the
* same workqueue.
*/
static bool is_chained_work(struct workqueue_struct *wq)
{
struct worker *worker;
worker = current_wq_worker();
/*
* Return %true iff I'm a worker executing a work item on @wq. If
* I'm @worker, it's safe to dereference it without locking.
*/
return worker && worker->current_pwq->wq == wq;
}
/*
* When queueing an unbound work item to a wq, prefer local CPU if allowed
* by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to
* avoid perturbing sensitive tasks.
*/
static int wq_select_unbound_cpu(int cpu)
{
static bool printed_dbg_warning;
int new_cpu;
if (likely(!wq_debug_force_rr_cpu)) {
if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
return cpu;
} else if (!printed_dbg_warning) {
pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
printed_dbg_warning = true;
}
if (cpumask_empty(wq_unbound_cpumask))
return cpu;
new_cpu = __this_cpu_read(wq_rr_cpu_last);
new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
if (unlikely(new_cpu >= nr_cpu_ids)) {
new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
if (unlikely(new_cpu >= nr_cpu_ids))
return cpu;
}
__this_cpu_write(wq_rr_cpu_last, new_cpu);
return new_cpu;
}
static void __queue_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
struct pool_workqueue *pwq;
struct worker_pool *last_pool;
struct list_head *worklist;
unsigned int work_flags;
unsigned int req_cpu = cpu;
/*
* While a work item is PENDING && off queue, a task trying to
* steal the PENDING will busy-loop waiting for it to either get
* queued or lose PENDING. Grabbing PENDING and queueing should
* happen with IRQ disabled.
*/
lockdep_assert_irqs_disabled();
/* if draining, only works from the same workqueue are allowed */
if (unlikely(wq->flags & __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
rcu_read_lock();
retry:
/* pwq which will be used unless @work is executing elsewhere */
if (wq->flags & WQ_UNBOUND) { if (req_cpu == WORK_CPU_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id());
pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
} else {
if (req_cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
}
/*
* If @work was previously on a different pool, it might still be
* running there, in which case the work needs to be queued on that
* pool to guarantee non-reentrancy.
*/
last_pool = get_work_pool(work); if (last_pool && last_pool != pwq->pool) {
struct worker *worker;
raw_spin_lock(&last_pool->lock);
worker = find_worker_executing_work(last_pool, work);
if (worker && worker->current_pwq->wq == wq) {
pwq = worker->current_pwq;
} else {
/* meh... not running there, queue here */
raw_spin_unlock(&last_pool->lock);
raw_spin_lock(&pwq->pool->lock);
}
} else {
raw_spin_lock(&pwq->pool->lock);
}
/*
* pwq is determined and locked. For unbound pools, we could have
* raced with pwq release and it could already be dead. If its
* refcnt is zero, repeat pwq selection. Note that pwqs never die
* without another pwq replacing it in the numa_pwq_tbl or while
* work items are executing on it, so the retrying is guaranteed to
* make forward-progress.
*/
if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { raw_spin_unlock(&pwq->pool->lock);
cpu_relax();
goto retry;
}
/* oops */
WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
wq->name, cpu);
}
/* pwq determined, queue */
trace_workqueue_queue_work(req_cpu, pwq, work);
if (WARN_ON(!list_empty(&work->entry)))
goto out;
pwq->nr_in_flight[pwq->work_color]++;
work_flags = work_color_to_flags(pwq->work_color);
if (likely(pwq->nr_active < pwq->max_active)) {
trace_workqueue_activate_work(work);
pwq->nr_active++;
worklist = &pwq->pool->worklist;
if (list_empty(worklist))
pwq->pool->watchdog_ts = jiffies;
} else {
work_flags |= WORK_STRUCT_INACTIVE;
worklist = &pwq->inactive_works;
}
debug_work_activate(work);
insert_work(pwq, work, worklist, work_flags);
out:
raw_spin_unlock(&pwq->pool->lock);
rcu_read_unlock();
}
/**
* queue_work_on - queue work on specific cpu
* @cpu: CPU number to execute work on
* @wq: workqueue to use
* @work: work to queue
*
* We queue the work to a specific CPU, the caller must ensure it
* can't go away.
*
* Return: %false if @work was already on a queue, %true otherwise.
*/
bool queue_work_on(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
bool ret = false;
unsigned long flags;
local_irq_save(flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
__queue_work(cpu, wq, work);
ret = true;
}
local_irq_restore(flags);
return ret;
}
EXPORT_SYMBOL(queue_work_on);
/**
* workqueue_select_cpu_near - Select a CPU based on NUMA node
* @node: NUMA node ID that we want to select a CPU from
*
* This function will attempt to find a "random" cpu available on a given
* node. If there are no CPUs available on the given node it will return
* WORK_CPU_UNBOUND indicating that we should just schedule to any
* available CPU if we need to schedule this work.
*/
static int workqueue_select_cpu_near(int node)
{
int cpu;
/* No point in doing this if NUMA isn't enabled for workqueues */
if (!wq_numa_enabled)
return WORK_CPU_UNBOUND;
/* Delay binding to CPU if node is not valid or online */
if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
return WORK_CPU_UNBOUND;
/* Use local node/cpu if we are already there */
cpu = raw_smp_processor_id();
if (node == cpu_to_node(cpu))
return cpu;
/* Use "random" otherwise know as "first" online CPU of node */
cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);
/* If CPU is valid return that, otherwise just defer */
return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND;
}
/**
* queue_work_node - queue work on a "random" cpu for a given NUMA node
* @node: NUMA node that we are targeting the work for
* @wq: workqueue to use
* @work: work to queue
*
* We queue the work to a "random" CPU within a given NUMA node. The basic
* idea here is to provide a way to somehow associate work with a given
* NUMA node.
*
* This function will only make a best effort attempt at getting this onto
* the right NUMA node. If no node is requested or the requested node is
* offline then we just fall back to standard queue_work behavior.
*
* Currently the "random" CPU ends up being the first available CPU in the
* intersection of cpu_online_mask and the cpumask of the node, unless we
* are running on the node. In that case we just use the current CPU.
*
* Return: %false if @work was already on a queue, %true otherwise.
*/
bool queue_work_node(int node, struct workqueue_struct *wq,
struct work_struct *work)
{
unsigned long flags;
bool ret = false;
/*
* This current implementation is specific to unbound workqueues.
* Specifically we only return the first available CPU for a given
* node instead of cycling through individual CPUs within the node.
*
* If this is used with a per-cpu workqueue then the logic in
* workqueue_select_cpu_near would need to be updated to allow for
* some round robin type logic.
*/
WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));
local_irq_save(flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
int cpu = workqueue_select_cpu_near(node);
__queue_work(cpu, wq, work);
ret = true;
}
local_irq_restore(flags);
return ret;
}
EXPORT_SYMBOL_GPL(queue_work_node);
void delayed_work_timer_fn(struct timer_list *t)
{
struct delayed_work *dwork = from_timer(dwork, t, timer);
/* should have been called from irqsafe timer with irq already off */
__queue_work(dwork->cpu, dwork->wq, &dwork->work);
}
EXPORT_SYMBOL(delayed_work_timer_fn);
static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct delayed_work *dwork, unsigned long delay)
{
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
WARN_ON_ONCE(!wq); WARN_ON_FUNCTION_MISMATCH(timer->function, delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry));
/*
* If @delay is 0, queue @dwork->work immediately. This is for
* both optimization and correctness. The earliest @timer can
* expire is on the closest next tick and delayed_work users depend
* on that there's no such delay when @delay is 0.
*/
if (!delay) { __queue_work(cpu, wq, &dwork->work);
return;
}
dwork->wq = wq;
dwork->cpu = cpu;
timer->expires = jiffies + delay;
if (unlikely(cpu != WORK_CPU_UNBOUND))
add_timer_on(timer, cpu);
else
add_timer(timer);
}
/**
* queue_delayed_work_on - queue work on specific CPU after delay
* @cpu: CPU number to execute work on
* @wq: workqueue to use
* @dwork: work to queue
* @delay: number of jiffies to wait before queueing
*
* Return: %false if @work was already on a queue, %true otherwise. If
* @delay is zero and @dwork is idle, it will be scheduled for immediate
* execution.
*/
bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
struct delayed_work *dwork, unsigned long delay)
{
struct work_struct *work = &dwork->work;
bool ret = false;
unsigned long flags;
/* read the comment in __queue_work() */
local_irq_save(flags);
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
__queue_delayed_work(cpu, wq, dwork, delay);
ret = true;
}
local_irq_restore(flags);
return ret;
}
EXPORT_SYMBOL(queue_delayed_work_on);
/**
* mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
* @cpu: CPU number to execute work on
* @wq: workqueue to use
* @dwork: work to queue
* @delay: number of jiffies to wait before queueing
*
* If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
* modify @dwork's timer so that it expires after @delay. If @delay is
* zero, @work is guaranteed to be scheduled immediately regardless of its
* current state.
*
* Return: %false if @dwork was idle and queued, %true if @dwork was
* pending and its timer was modified.
*
* This function is safe to call from any context including IRQ handler.
* See try_to_grab_pending() for details.
*/
bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
struct delayed_work *dwork, unsigned long delay)
{
unsigned long flags;
int ret;
do {
ret = try_to_grab_pending(&dwork->work, true, &flags);
} while (unlikely(ret == -EAGAIN));
if (likely(ret >= 0)) { __queue_delayed_work(cpu, wq, dwork, delay);
local_irq_restore(flags);
}
/* -ENOENT from try_to_grab_pending() becomes %true */
return ret;
}
EXPORT_SYMBOL_GPL(mod_delayed_work_on);
static void rcu_work_rcufn(struct rcu_head *rcu)
{
struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu);
/* read the comment in __queue_work() */
local_irq_disable();
__queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
local_irq_enable();
}
/**
* queue_rcu_work - queue work after a RCU grace period
* @wq: workqueue to use
* @rwork: work to queue
*
* Return: %false if @rwork was already pending, %true otherwise. Note
* that a full RCU grace period is guaranteed only after a %true return.
* While @rwork is guaranteed to be executed after a %false return, the
* execution may happen before a full RCU grace period has passed.
*/
bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
{
struct work_struct *work = &rwork->work;
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
rwork->wq = wq;
call_rcu(&rwork->rcu, rcu_work_rcufn);
return true;
}
return false;
}
EXPORT_SYMBOL(queue_rcu_work);
/**
* worker_enter_idle - enter idle state
* @worker: worker which is entering idle state
*
* @worker is entering idle state. Update stats and idle timer if
* necessary.
*
* LOCKING:
* raw_spin_lock_irq(pool->lock).
*/
static void worker_enter_idle(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
WARN_ON_ONCE(!list_empty(&worker->entry) &&
(worker->hentry.next || worker->hentry.pprev)))
return;
/* can't use worker_set_flags(), also called from create_worker() */
worker->flags |= WORKER_IDLE;
pool->nr_idle++;
worker->last_active = jiffies;
/* idle_list is LIFO */
list_add(&worker->entry, &pool->idle_list);
if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
/*
* Sanity check nr_running. Because unbind_workers() releases
* pool->lock between setting %WORKER_UNBOUND and zapping
* nr_running, the warning may trigger spuriously. Check iff
* unbind is not in progress.
*/
WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
pool->nr_workers == pool->nr_idle &&
atomic_read(&pool->nr_running));
}
/**
* worker_leave_idle - leave idle state
* @worker: worker which is leaving idle state
*
* @worker is leaving idle state. Update stats.
*
* LOCKING:
* raw_spin_lock_irq(pool->lock).
*/
static void worker_leave_idle(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
return;
worker_clr_flags(worker, WORKER_IDLE);
pool->nr_idle--;
list_del_init(&worker->entry);
}
static struct worker *alloc_worker(int node)
{
struct worker *worker;
worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
if (worker) {
INIT_LIST_HEAD(&worker->entry);
INIT_LIST_HEAD(&worker->scheduled);
INIT_LIST_HEAD(&worker->node);
/* on creation a worker is in !idle && prep state */
worker->flags = WORKER_PREP;
}
return worker;
}
/**
* worker_attach_to_pool() - attach a worker to a pool
* @worker: worker to be attached
* @pool: the target pool
*
* Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and
* cpu-binding of @worker are kept coordinated with the pool across
* cpu-[un]hotplugs.
*/
static void worker_attach_to_pool(struct worker *worker,
struct worker_pool *pool)
{
mutex_lock(&wq_pool_attach_mutex);
/*
* The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
* stable across this function. See the comments above the flag
* definition for details.
*/
if (pool->flags & POOL_DISASSOCIATED)
worker->flags |= WORKER_UNBOUND;
else
kthread_set_per_cpu(worker->task, pool->cpu);
if (worker->rescue_wq)
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
list_add_tail(&worker->node, &pool->workers);
worker->pool = pool;
mutex_unlock(&wq_pool_attach_mutex);
}
/**
* worker_detach_from_pool() - detach a worker from its pool
* @worker: worker which is attached to its pool
*
* Undo the attaching which had been done in worker_attach_to_pool(). The
* caller worker shouldn't access to the pool after detached except it has
* other reference to the pool.
*/
static void worker_detach_from_pool(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
struct completion *detach_completion = NULL;
mutex_lock(&wq_pool_attach_mutex);
kthread_set_per_cpu(worker->task, -1);
list_del(&worker->node);
worker->pool = NULL;
if (list_empty(&pool->workers))
detach_completion = pool->detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
/* clear leftover flags without pool->lock after it is detached */
worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
if (detach_completion)
complete(detach_completion);
}
/**
* create_worker - create a new workqueue worker
* @pool: pool the new worker will belong to
*
* Create and start a new worker which is attached to @pool.
*
* CONTEXT:
* Might sleep. Does GFP_KERNEL allocations.
*
* Return:
* Pointer to the newly created worker.
*/
static struct worker *create_worker(struct worker_pool *pool)
{
struct worker *worker;
int id;
char id_buf[16];
/* ID is needed to determine kthread name */
id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
if (id < 0)
return NULL;
worker = alloc_worker(pool->node);
if (!worker)
goto fail;
worker->id = id;
if (pool->cpu >= 0)
snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
pool->attrs->nice < 0 ? "H" : "");
else
snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
"kworker/%s", id_buf);
if (IS_ERR(worker->task))
goto fail;
set_user_nice(worker->task, pool->attrs->nice);
kthread_bind_mask(worker->task, pool->attrs->cpumask);
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
/* start the newly created worker */
raw_spin_lock_irq(&pool->lock);
worker->pool->nr_workers++;
worker_enter_idle(worker);
wake_up_process(worker->task);
raw_spin_unlock_irq(&pool->lock);
return worker;
fail:
ida_free(&pool->worker_ida, id);
kfree(worker);
return NULL;
}
/**
* destroy_worker - destroy a workqueue worker
* @worker: worker to be destroyed
*
* Destroy @worker and adjust @pool stats accordingly. The worker should
* be idle.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
static void destroy_worker(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
lockdep_assert_held(&pool->lock);
/* sanity check frenzy */
if (WARN_ON(worker->current_work) ||
WARN_ON(!list_empty(&worker->scheduled)) ||
WARN_ON(!(worker->flags & WORKER_IDLE)))
return;
pool->nr_workers--;
pool->nr_idle--;
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
wake_up_process(worker->task);
}
static void idle_worker_timeout(struct timer_list *t)
{
struct worker_pool *pool = from_timer(pool, t, idle_timer);
raw_spin_lock_irq(&pool->lock);
while (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
/* idle_list is kept in LIFO order, check the last one */
worker = list_entry(pool->idle_list.prev, struct worker, entry);
expires = worker->last_active + IDLE_WORKER_TIMEOUT;
if (time_before(jiffies, expires)) {
mod_timer(&pool->idle_timer, expires);
break;
}
destroy_worker(worker);
}
raw_spin_unlock_irq(&pool->lock);
}
static void send_mayday(struct work_struct *work)
{
struct pool_workqueue *pwq = get_work_pwq(work);
struct workqueue_struct *wq = pwq->wq;
lockdep_assert_held(&wq_mayday_lock);
if (!wq->rescuer)
return;
/* mayday mayday mayday */
if (list_empty(&pwq->mayday_node)) {
/*
* If @pwq is for an unbound wq, its base ref may be put at
* any time due to an attribute change. Pin @pwq until the
* rescuer is done with it.
*/
get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
}
}
static void pool_mayday_timeout(struct timer_list *t)
{
struct worker_pool *pool = from_timer(pool, t, mayday_timer);
struct work_struct *work;
raw_spin_lock_irq(&pool->lock);
raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */
if (need_to_create_worker(pool)) {
/*
* We've been trying to create a new worker but
* haven't been successful. We might be hitting an
* allocation deadlock. Send distress signals to
* rescuers.
*/
list_for_each_entry(work, &pool->worklist, entry)
send_mayday(work);
}
raw_spin_unlock(&wq_mayday_lock);
raw_spin_unlock_irq(&pool->lock);
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
/**
* maybe_create_worker - create a new worker if necessary
* @pool: pool to create a new worker for
*
* Create a new worker for @pool if necessary. @pool is guaranteed to
* have at least one idle worker on return from this function. If
* creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
* sent to all rescuers with works scheduled on @pool to resolve
* possible allocation deadlock.
*
* On return, need_to_create_worker() is guaranteed to be %false and
* may_start_working() %true.
*
* LOCKING:
* raw_spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations. Called only from
* manager.
*/
static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
restart:
raw_spin_unlock_irq(&pool->lock);
/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
while (true) {
if (create_worker(pool) || !need_to_create_worker(pool))
break;
schedule_timeout_interruptible(CREATE_COOLDOWN);
if (!need_to_create_worker(pool))
break;
}
del_timer_sync(&pool->mayday_timer);
raw_spin_lock_irq(&pool->lock);
/*
* This is necessary even after a new worker was just successfully
* created as @pool->lock was dropped and the new worker might have
* already become busy.
*/
if (need_to_create_worker(pool))
goto restart;
}
/**
* manage_workers - manage worker pool
* @worker: self
*
* Assume the manager role and manage the worker pool @worker belongs
* to. At any given time, there can be only zero or one manager per
* pool. The exclusion is handled automatically by this function.
*
* The caller can safely start processing works on false return. On
* true return, it's guaranteed that need_to_create_worker() is false
* and may_start_working() is true.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations.
*
* Return:
* %false if the pool doesn't need management and the caller can safely
* start processing works, %true if management function was performed and
* the conditions that the caller verified before calling the function may
* no longer be true.
*/
static bool manage_workers(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
if (pool->flags & POOL_MANAGER_ACTIVE)
return false;
pool->flags |= POOL_MANAGER_ACTIVE;
pool->manager = worker;
maybe_create_worker(pool);
pool->manager = NULL;
pool->flags &= ~POOL_MANAGER_ACTIVE;
rcuwait_wake_up(&manager_wait);
return true;
}
/**
* process_one_work - process single work
* @worker: self
* @work: work to process
*
* Process @work. This function contains all the logics necessary to
* process a single work including synchronization against and
* interaction with other workers on the same cpu, queueing and
* flushing. As long as context requirement is met, any worker can
* call this function to process a work.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock) which is released and regrabbed.
*/
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool;
bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
unsigned long work_data;
struct worker *collision;
#ifdef CONFIG_LOCKDEP
/*
* It is permissible to free the struct work_struct from
* inside the function that is called from it, this we need to
* take into account for lockdep too. To avoid bogus "held
* lock freed" warnings as well as problems when looking into
* work->lockdep_map, make a copy and use that here.
*/
struct lockdep_map lockdep_map;
lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
/* ensure we're on the correct CPU */
WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
raw_smp_processor_id() != pool->cpu);
/*
* A single work shouldn't be executed concurrently by
* multiple workers on a single cpu. Check whether anyone is
* already processing the work. If so, defer the work to the
* currently executing one.
*/
collision = find_worker_executing_work(pool, work);
if (unlikely(collision)) {
move_linked_works(work, &collision->scheduled, NULL);
return;
}
/* claim and dequeue */
debug_work_deactivate(work);
hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
worker->current_work = work;
worker->current_func = work->func;
worker->current_pwq = pwq;
work_data = *work_data_bits(work);
worker->current_color = get_work_color(work_data);
/*
* Record wq name for cmdline and debug reporting, may get
* overridden through set_worker_desc().
*/
strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
list_del_init(&work->entry);
/*
* CPU intensive works don't participate in concurrency management.
* They're the scheduler's responsibility. This takes @worker out
* of concurrency management and the next code block will chain
* execution of the pending work items.
*/
if (unlikely(cpu_intensive))
worker_set_flags(worker, WORKER_CPU_INTENSIVE);
/*
* Wake up another worker if necessary. The condition is always
* false for normal per-cpu workers since nr_running would always
* be >= 1 at this point. This is used to chain execution of the
* pending work items for WORKER_NOT_RUNNING workers such as the
* UNBOUND and CPU_INTENSIVE ones.
*/
if (need_more_worker(pool))
wake_up_worker(pool);
/*
* Record the last pool and clear PENDING which should be the last
* update to @work. Also, do this inside @pool->lock so that
* PENDING and queued state changes happen together while IRQ is
* disabled.
*/
set_work_pool_and_clear_pending(work, pool->id);
raw_spin_unlock_irq(&pool->lock);
lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
/*
* Strictly speaking we should mark the invariant state without holding
* any locks, that is, before these two lock_map_acquire()'s.
*
* However, that would result in:
*
* A(W1)
* WFC(C)
* A(W1)
* C(C)
*
* Which would create W1->C->W1 dependencies, even though there is no
* actual deadlock possible. There are two solutions, using a
* read-recursive acquire on the work(queue) 'locks', but this will then
* hit the lockdep limitation on recursive locks, or simply discard
* these locks.
*
* AFAICT there is no possible deadlock scenario between the
* flush_work() and complete() primitives (except for single-threaded
* workqueues), so hiding them isn't a problem.
*/
lockdep_invariant_state(true);
trace_workqueue_execute_start(work);
worker->current_func(work);
/*
* While we must be careful to not use "work" after this, the trace
* point will only record its address.
*/
trace_workqueue_execute_end(work, worker->current_func);
lock_map_release(&lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
" last function: %ps\n",
current->comm, preempt_count(), task_pid_nr(current),
worker->current_func);
debug_show_held_locks(current);
dump_stack();
}
/*
* The following prevents a kworker from hogging CPU on !PREEMPTION
* kernels, where a requeueing work item waiting for something to
* happen could deadlock with stop_machine as such work item could
* indefinitely requeue itself while all other CPUs are trapped in
* stop_machine. At the same time, report a quiescent RCU state so
* the same condition doesn't freeze RCU.
*/
cond_resched();
raw_spin_lock_irq(&pool->lock);
/* clear cpu intensive status */
if (unlikely(cpu_intensive))
worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
/* tag the worker for identification in schedule() */
worker->last_func = worker->current_func;
/* we're done with it, release */
hash_del(&worker->hentry);
worker->current_work = NULL;
worker->current_func = NULL;
worker->current_pwq = NULL;
worker->current_color = INT_MAX;
pwq_dec_nr_in_flight(pwq, work_data);
}
/**
* process_scheduled_works - process scheduled works
* @worker: self
*
* Process all scheduled works. Please note that the scheduled list
* may change while processing a work, so this function repeatedly
* fetches a work from the top and executes it.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times.
*/
static void process_scheduled_works(struct worker *worker)
{
while (!list_empty(&worker->scheduled)) {
struct work_struct *work = list_first_entry(&worker->scheduled,
struct work_struct, entry);
process_one_work(worker, work);
}
}
static void set_pf_worker(bool val)
{
mutex_lock(&wq_pool_attach_mutex);
if (val)
current->flags |= PF_WQ_WORKER;
else
current->flags &= ~PF_WQ_WORKER;
mutex_unlock(&wq_pool_attach_mutex);
}
/**
* worker_thread - the worker thread function
* @__worker: self
*
* The worker thread function. All workers belong to a worker_pool -
* either a per-cpu one or dynamic unbound one. These workers process all
* work items regardless of their specific target workqueue. The only
* exception is work items which belong to workqueues with a rescuer which
* will be explained in rescuer_thread().
*
* Return: 0
*/
static int worker_thread(void *__worker)
{
struct worker *worker = __worker;
struct worker_pool *pool = worker->pool;
/* tell the scheduler that this is a workqueue worker */
set_pf_worker(true);
woke_up:
raw_spin_lock_irq(&pool->lock);
/* am I supposed to die? */
if (unlikely(worker->flags & WORKER_DIE)) {
raw_spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
set_pf_worker(false);
set_task_comm(worker->task, "kworker/dying");
ida_free(&pool->worker_ida, worker->id);
worker_detach_from_pool(worker);
kfree(worker);
return 0;
}
worker_leave_idle(worker);
recheck:
/* no more worker necessary? */
if (!need_more_worker(pool))
goto sleep;
/* do we need to manage? */
if (unlikely(!may_start_working(pool)) && manage_workers(worker))
goto recheck;
/*
* ->scheduled list can only be filled while a worker is
* preparing to process a work or actually processing it.
* Make sure nobody diddled with it while I was sleeping.
*/
WARN_ON_ONCE(!list_empty(&worker->scheduled));
/*
* Finish PREP stage. We're guaranteed to have at least one idle
* worker or that someone else has already assumed the manager
* role. This is where @worker starts participating in concurrency
* management if applicable and concurrency management is restored
* after being rebound. See rebind_workers() for details.
*/
worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
do {
struct work_struct *work =
list_first_entry(&pool->worklist,
struct work_struct, entry);
pool->watchdog_ts = jiffies;
if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
/* optimization path, not strictly necessary */
process_one_work(worker, work);
if (unlikely(!list_empty(&worker->scheduled)))
process_scheduled_works(worker);
} else {
move_linked_works(work, &worker->scheduled, NULL);
process_scheduled_works(worker);
}
} while (keep_working(pool));
worker_set_flags(worker, WORKER_PREP);
sleep:
/*
* pool->lock is held and there's no work to process and no need to
* manage, sleep. Workers are woken up only while holding
* pool->lock or from local cpu, so setting the current state
* before releasing pool->lock is enough to prevent losing any
* event.
*/
worker_enter_idle(worker);
__set_current_state(TASK_IDLE);
raw_spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
}
/**
* rescuer_thread - the rescuer thread function
* @__rescuer: self
*
* Workqueue rescuer thread function. There's one rescuer for each
* workqueue which has WQ_MEM_RECLAIM set.
*
* Regular work processing on a pool may block trying to create a new
* worker which uses GFP_KERNEL allocation which has slight chance of
* developing into deadlock if some works currently on the same queue
* need to be processed to satisfy the GFP_KERNEL allocation. This is
* the problem rescuer solves.
*
* When such condition is possible, the pool summons rescuers of all
* workqueues which have works queued on the pool and let them process
* those works so that forward progress can be guaranteed.
*
* This should happen rarely.
*
* Return: 0
*/
static int rescuer_thread(void *__rescuer)
{
struct worker *rescuer = __rescuer;
struct workqueue_struct *wq = rescuer->rescue_wq;
struct list_head *scheduled = &rescuer->scheduled;
bool should_stop;
set_user_nice(current, RESCUER_NICE_LEVEL);
/*
* Mark rescuer as worker too. As WORKER_PREP is never cleared, it
* doesn't participate in concurrency management.
*/
set_pf_worker(true);
repeat:
set_current_state(TASK_IDLE);
/*
* By the time the rescuer is requested to stop, the workqueue
* shouldn't have any work pending, but @wq->maydays may still have
* pwq(s) queued. This can happen by non-rescuer workers consuming
* all the work items before the rescuer got to them. Go through
* @wq->maydays processing before acting on should_stop so that the
* list is always empty on exit.
*/
should_stop = kthread_should_stop();
/* see whether any pwq is asking for help */
raw_spin_lock_irq(&wq_mayday_lock);
while (!list_empty(&wq->maydays)) {
struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
bool first = true;
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
raw_spin_unlock_irq(&wq_mayday_lock);
worker_attach_to_pool(rescuer, pool);
raw_spin_lock_irq(&pool->lock);
/*
* Slurp in all works issued via this workqueue and
* process'em.
*/
WARN_ON_ONCE(!list_empty(scheduled));
list_for_each_entry_safe(work, n, &pool->worklist, entry) {
if (get_work_pwq(work) == pwq) {
if (first)
pool->watchdog_ts = jiffies;
move_linked_works(work, scheduled, &n);
}
first = false;
}
if (!list_empty(scheduled)) {
process_scheduled_works(rescuer);
/*
* The above execution of rescued work items could
* have created more to rescue through
* pwq_activate_first_inactive() or chained
* queueing. Let's put @pwq back on mayday list so
* that such back-to-back work items, which may be
* being used to relieve memory pressure, don't
* incur MAYDAY_INTERVAL delay inbetween.
*/
if (pwq->nr_active && need_to_create_worker(pool)) {
raw_spin_lock(&wq_mayday_lock);
/*
* Queue iff we aren't racing destruction
* and somebody else hasn't queued it already.
*/
if (wq->rescuer && list_empty(&pwq->mayday_node)) {
get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
}
raw_spin_unlock(&wq_mayday_lock);
}
}
/*
* Put the reference grabbed by send_mayday(). @pool won't
* go away while we're still attached to it.
*/
put_pwq(pwq);
/*
* Leave this pool. If need_more_worker() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
* and stalling the execution.
*/
if (need_more_worker(pool))
wake_up_worker(pool);
raw_spin_unlock_irq(&pool->lock);
worker_detach_from_pool(rescuer);
raw_spin_lock_irq(&wq_mayday_lock);
}
raw_spin_unlock_irq(&wq_mayday_lock);
if (should_stop) {
__set_current_state(TASK_RUNNING);
set_pf_worker(false);
return 0;
}
/* rescuers should never participate in concurrency management */
WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
goto repeat;
}
/**
* check_flush_dependency - check for flush dependency sanity
* @target_wq: workqueue being flushed
* @target_work: work item being flushed (NULL for workqueue flushes)
*
* %current is trying to flush the whole @target_wq or @target_work on it.
* If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not
* reclaiming memory or running on a workqueue which doesn't have
* %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to
* a deadlock.
*/
static void check_flush_dependency(struct workqueue_struct *target_wq,
struct work_struct *target_work)
{
work_func_t target_func = target_work ? target_work->func : NULL;
struct worker *worker;
if (target_wq->flags & WQ_MEM_RECLAIM)
return;
worker = current_wq_worker();
WARN_ONCE(current->flags & PF_MEMALLOC,
"workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
current->pid, current->comm, target_wq->name, target_func);
WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
(WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
"workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
worker->current_pwq->wq->name, worker->current_func,
target_wq->name, target_func);
}
struct wq_barrier {
struct work_struct work;
struct completion done;
struct task_struct *task; /* purely informational */
};
static void wq_barrier_func(struct work_struct *work)
{
struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
complete(&barr->done);
}
/**
* insert_wq_barrier - insert a barrier work
* @pwq: pwq to insert barrier into
* @barr: wq_barrier to insert
* @target: target work to attach @barr to
* @worker: worker currently executing @target, NULL if @target is not executing
*
* @barr is linked to @target such that @barr is completed only after
* @target finishes execution. Please note that the ordering
* guarantee is observed only with respect to @target and on the local
* cpu.
*
* Currently, a queued barrier can't be canceled. This is because
* try_to_grab_pending() can't determine whether the work to be
* grabbed is at the head of the queue and thus can't clear LINKED
* flag of the previous work while there must be a valid next work
* after a work with LINKED flag set.
*
* Note that when @worker is non-NULL, @target may be modified
* underneath us, so we can't reliably determine pwq from @target.
*
* CONTEXT:
* raw_spin_lock_irq(pool->lock).
*/
static void insert_wq_barrier(struct pool_workqueue *pwq,
struct wq_barrier *barr,
struct work_struct *target, struct worker *worker)
{
unsigned int work_flags = 0;
unsigned int work_color;
struct list_head *head;
/*
* debugobject calls are safe here even with pool->lock locked
* as we know for sure that this will not trigger any of the
* checks and call back into the fixup functions where we
* might deadlock.
*/
INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
init_completion_map(&barr->done, &target->lockdep_map);
barr->task = current;
/* The barrier work item does not participate in pwq->nr_active. */
work_flags |= WORK_STRUCT_INACTIVE;
/*
* If @target is currently being executed, schedule the
* barrier to the worker; otherwise, put it after @target.
*/
if (worker) {
head = worker->scheduled.next;
work_color = worker->current_color;
} else {
unsigned long *bits = work_data_bits(target);
head = target->entry.next;
/* there can already be other linked works, inherit and set */
work_flags |= *bits & WORK_STRUCT_LINKED;
work_color = get_work_color(*bits);
__set_bit(WORK_STRUCT_LINKED_BIT, bits);
}
pwq->nr_in_flight[work_color]++;
work_flags |= work_color_to_flags(work_color);
debug_work_activate(&barr->work);
insert_work(pwq, &barr->work, head, work_flags);
}
/**
* flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
* @wq: workqueue being flushed
* @flush_color: new flush color, < 0 for no-op
* @work_color: new work color, < 0 for no-op
*
* Prepare pwqs for workqueue flushing.
*
* If @flush_color is non-negative, flush_color on all pwqs should be
* -1. If no pwq has in-flight commands at the specified color, all
* pwq->flush_color's stay at -1 and %false is returned. If any pwq
* has in flight commands, its pwq->flush_color is set to
* @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
* wakeup logic is armed and %true is returned.
*
* The caller should have initialized @wq->first_flusher prior to
* calling this function with non-negative @flush_color. If
* @flush_color is negative, no flush color update is done and %false
* is returned.
*
* If @work_color is non-negative, all pwqs should have the same
* work_color which is previous to @work_color and all will be
* advanced to @work_color.
*
* CONTEXT:
* mutex_lock(wq->mutex).
*
* Return:
* %true if @flush_color >= 0 and there's something to flush. %false
* otherwise.
*/
static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
int flush_color, int work_color)
{
bool wait = false;
struct pool_workqueue *pwq;
if (flush_color >= 0) {
WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
atomic_set(&wq->nr_pwqs_to_flush, 1);
}
for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool;
raw_spin_lock_irq(&pool->lock);
if (flush_color >= 0) {
WARN_ON_ONCE(pwq->flush_color != -1); if (pwq->nr_in_flight[flush_color]) { pwq->flush_color = flush_color;
atomic_inc(&wq->nr_pwqs_to_flush);
wait = true;
}
}
if (work_color >= 0) { WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); pwq->work_color = work_color;
}
raw_spin_unlock_irq(&pool->lock);
}
if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) complete(&wq->first_flusher->done); return wait;
}
/**
* flush_workqueue - ensure that any scheduled work has run to completion.
* @wq: workqueue to flush
*
* This function sleeps until all work items which were queued on entry
* have finished execution, but it is not livelocked by new incoming ones.
*/
void flush_workqueue(struct workqueue_struct *wq)
{
struct wq_flusher this_flusher = {
.list = LIST_HEAD_INIT(this_flusher.list),
.flush_color = -1,
.done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
};
int next_color;
if (WARN_ON(!wq_online)) return;
lock_map_acquire(&wq->lockdep_map);
lock_map_release(&wq->lockdep_map);
mutex_lock(&wq->mutex);
/*
* Start-to-wait phase
*/
next_color = work_next_color(wq->work_color);
if (next_color != wq->flush_color) {
/*
* Color space is not full. The current work_color
* becomes our flush_color and work_color is advanced
* by one.
*/
WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); this_flusher.flush_color = wq->work_color;
wq->work_color = next_color;
if (!wq->first_flusher) {
/* no flush in progress, become the first flusher */
WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); wq->first_flusher = &this_flusher;
if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
wq->work_color)) {
/* nothing to flush, done */
wq->flush_color = next_color;
wq->first_flusher = NULL;
goto out_unlock;
}
} else {
/* wait in queue */
WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); list_add_tail(&this_flusher.list, &wq->flusher_queue);
flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
}
} else {
/*
* Oops, color space is full, wait on overflow queue.
* The next flush completion will assign us
* flush_color and transfer to flusher_queue.
*/
list_add_tail(&this_flusher.list, &wq->flusher_overflow);
}
check_flush_dependency(wq, NULL);
mutex_unlock(&wq->mutex);
wait_for_completion(&this_flusher.done);
/*
* Wake-up-and-cascade phase
*
* First flushers are responsible for cascading flushes and
* handling overflow. Non-first flushers can simply return.
*/
if (READ_ONCE(wq->first_flusher) != &this_flusher)
return;
mutex_lock(&wq->mutex);
/* we might have raced, check again with mutex held */
if (wq->first_flusher != &this_flusher)
goto out_unlock;
WRITE_ONCE(wq->first_flusher, NULL); WARN_ON_ONCE(!list_empty(&this_flusher.list)); WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
while (true) {
struct wq_flusher *next, *tmp;
/* complete all the flushers sharing the current flush color */
list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { if (next->flush_color != wq->flush_color)
break;
list_del_init(&next->list);
complete(&next->done);
}
WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
wq->flush_color != work_next_color(wq->work_color));
/* this flush_color is finished, advance by one */
wq->flush_color = work_next_color(wq->flush_color);
/* one color has been freed, handle overflow queue */
if (!list_empty(&wq->flusher_overflow)) {
/*
* Assign the same color to all overflowed
* flushers, advance work_color and append to
* flusher_queue. This is the start-to-wait
* phase for these overflowed flushers.
*/
list_for_each_entry(tmp, &wq->flusher_overflow, list) tmp->flush_color = wq->work_color;
wq->work_color = work_next_color(wq->work_color);
list_splice_tail_init(&wq->flusher_overflow,
&wq->flusher_queue);
flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
}
if (list_empty(&wq->flusher_queue)) {
WARN_ON_ONCE(wq->flush_color != wq->work_color);
break;
}
/*
* Need to flush more colors. Make the next flusher
* the new first flusher and arm pwqs.
*/
WARN_ON_ONCE(wq->flush_color == wq->work_color); WARN_ON_ONCE(wq->flush_color != next->flush_color);
list_del_init(&next->list);
wq->first_flusher = next;
if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
break;
/*
* Meh... this color is already done, clear first
* flusher and repeat cascading.
*/
wq->first_flusher = NULL;
}
out_unlock:
mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL(flush_workqueue);
/**
* drain_workqueue - drain a workqueue
* @wq: workqueue to drain
*
* Wait until the workqueue becomes empty. While draining is in progress,
* only chain queueing is allowed. IOW, only currently pending or running
* work items on @wq can queue further work items on it. @wq is flushed
* repeatedly until it becomes empty. The number of flushing is determined
* by the depth of chaining and should be relatively short. Whine if it
* takes too long.
*/
void drain_workqueue(struct workqueue_struct *wq)
{
unsigned int flush_cnt = 0;
struct pool_workqueue *pwq;
/*
* __queue_work() needs to test whether there are drainers, is much
* hotter than drain_workqueue() and already looks at @wq->flags.
* Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
*/
mutex_lock(&wq->mutex);
if (!wq->nr_drainers++)
wq->flags |= __WQ_DRAINING; mutex_unlock(&wq->mutex);
reflush:
flush_workqueue(wq);
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq) {
bool drained;
raw_spin_lock_irq(&pwq->pool->lock); drained = !pwq->nr_active && list_empty(&pwq->inactive_works); raw_spin_unlock_irq(&pwq->pool->lock);
if (drained)
continue;
if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000))
pr_warn("workqueue %s: %s() isn't complete after %u tries\n",
wq->name, __func__, flush_cnt);
mutex_unlock(&wq->mutex);
goto reflush;
}
if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(drain_workqueue);
static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
bool from_cancel)
{
struct worker *worker = NULL;
struct worker_pool *pool;
struct pool_workqueue *pwq;
might_sleep();
rcu_read_lock();
pool = get_work_pool(work);
if (!pool) {
rcu_read_unlock();
return false;
}
raw_spin_lock_irq(&pool->lock);
/* see the comment in try_to_grab_pending() with the same code */
pwq = get_work_pwq(work);
if (pwq) {
if (unlikely(pwq->pool != pool))
goto already_gone;
} else {
worker = find_worker_executing_work(pool, work);
if (!worker)
goto already_gone;
pwq = worker->current_pwq;
}
check_flush_dependency(pwq->wq, work);
insert_wq_barrier(pwq, barr, work, worker);
raw_spin_unlock_irq(&pool->lock);
/*
* Force a lock recursion deadlock when using flush_work() inside a
* single-threaded or rescuer equipped workqueue.
*
* For single threaded workqueues the deadlock happens when the work
* is after the work issuing the flush_work(). For rescuer equipped
* workqueues the deadlock happens when the rescuer stalls, blocking
* forward progress.
*/
if (!from_cancel &&
(pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
}
rcu_read_unlock();
return true;
already_gone:
raw_spin_unlock_irq(&pool->lock);
rcu_read_unlock();
return false;
}
static bool __flush_work(struct work_struct *work, bool from_cancel)
{
struct wq_barrier barr;
if (WARN_ON(!wq_online))
return false;
if (WARN_ON(!work->func))
return false;
if (!from_cancel) {
lock_map_acquire(&work->lockdep_map);
lock_map_release(&work->lockdep_map);
}
if (start_flush_work(work, &barr, from_cancel)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
return true;
} else {
return false;
}
}
/**
* flush_work - wait for a work to finish executing the last queueing instance
* @work: the work to flush
*
* Wait until @work has finished execution. @work is guaranteed to be idle
* on return if it hasn't been requeued since flush started.
*
* Return:
* %true if flush_work() waited for the work to finish execution,
* %false if it was already idle.
*/
bool flush_work(struct work_struct *work)
{
return __flush_work(work, false);
}
EXPORT_SYMBOL_GPL(flush_work);
struct cwt_wait {
wait_queue_entry_t wait;
struct work_struct *work;
};
static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
if (cwait->work != key)
return 0;
return autoremove_wake_function(wait, mode, sync, key);
}
static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
{
static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
unsigned long flags;
int ret;
do {
ret = try_to_grab_pending(work, is_dwork, &flags);
/*
* If someone else is already canceling, wait for it to
* finish. flush_work() doesn't work for PREEMPT_NONE
* because we may get scheduled between @work's completion
* and the other canceling task resuming and clearing
* CANCELING - flush_work() will return false immediately
* as @work is no longer busy, try_to_grab_pending() will
* return -ENOENT as @work is still being canceled and the
* other canceling task won't be able to clear CANCELING as
* we're hogging the CPU.
*
* Let's wait for completion using a waitqueue. As this
* may lead to the thundering herd problem, use a custom
* wake function which matches @work along with exclusive
* wait and wakeup.
*/
if (unlikely(ret == -ENOENT)) {
struct cwt_wait cwait;
init_wait(&cwait.wait);
cwait.wait.func = cwt_wakefn;
cwait.work = work;
prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
TASK_UNINTERRUPTIBLE);
if (work_is_canceling(work))
schedule(); finish_wait(&cancel_waitq, &cwait.wait);
}
} while (unlikely(ret < 0));
/* tell other tasks trying to grab @work to back off */
mark_work_canceling(work);
local_irq_restore(flags);
/*
* This allows canceling during early boot. We know that @work
* isn't executing.
*/
if (wq_online) __flush_work(work, true);
clear_work_data(work);
/*
* Paired with prepare_to_wait() above so that either
* waitqueue_active() is visible here or !work_is_canceling() is
* visible there.
*/
smp_mb();
if (waitqueue_active(&cancel_waitq))
__wake_up(&cancel_waitq, TASK_NORMAL, 1, work); return ret;
}
/**
* cancel_work_sync - cancel a work and wait for it to finish
* @work: the work to cancel
*
* Cancel @work and wait for its execution to finish. This function
* can be used even if the work re-queues itself or migrates to
* another workqueue. On return from this function, @work is
* guaranteed to be not pending or executing on any CPU.
*
* cancel_work_sync(&delayed_work->work) must not be used for
* delayed_work's. Use cancel_delayed_work_sync() instead.
*
* The caller must ensure that the workqueue on which @work was last
* queued can't be destroyed before this function returns.
*
* Return:
* %true if @work was pending, %false otherwise.
*/
bool cancel_work_sync(struct work_struct *work)
{
return __cancel_work_timer(work, false);
}
EXPORT_SYMBOL_GPL(cancel_work_sync);
/**
* flush_delayed_work - wait for a dwork to finish executing the last queueing
* @dwork: the delayed work to flush
*
* Delayed timer is cancelled and the pending work is queued for
* immediate execution. Like flush_work(), this function only
* considers the last queueing instance of @dwork.
*
* Return:
* %true if flush_work() waited for the work to finish execution,
* %false if it was already idle.
*/
bool flush_delayed_work(struct delayed_work *dwork)
{
local_irq_disable();
if (del_timer_sync(&dwork->timer))
__queue_work(dwork->cpu, dwork->wq, &dwork->work);
local_irq_enable();
return flush_work(&dwork->work);
}
EXPORT_SYMBOL(flush_delayed_work);
/**
* flush_rcu_work - wait for a rwork to finish executing the last queueing
* @rwork: the rcu work to flush
*
* Return:
* %true if flush_rcu_work() waited for the work to finish execution,
* %false if it was already idle.
*/
bool flush_rcu_work(struct rcu_work *rwork)
{
if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) {
rcu_barrier();
flush_work(&rwork->work);
return true;
} else {
return flush_work(&rwork->work);
}
}
EXPORT_SYMBOL(flush_rcu_work);
static bool __cancel_work(struct work_struct *work, bool is_dwork)
{
unsigned long flags;
int ret;
do {
ret = try_to_grab_pending(work, is_dwork, &flags);
} while (unlikely(ret == -EAGAIN));
if (unlikely(ret < 0))
return false;
set_work_pool_and_clear_pending(work, get_work_pool_id(work));
local_irq_restore(flags);
return ret;
}
/**
* cancel_delayed_work - cancel a delayed work
* @dwork: delayed_work to cancel
*
* Kill off a pending delayed_work.
*
* Return: %true if @dwork was pending and canceled; %false if it wasn't
* pending.
*
* Note:
* The work callback function may still be running on return, unless
* it returns %true and the work doesn't re-arm itself. Explicitly flush or
* use cancel_delayed_work_sync() to wait on it.
*
* This function is safe to call from any context including IRQ handler.
*/
bool cancel_delayed_work(struct delayed_work *dwork)
{
return __cancel_work(&dwork->work, true);
}
EXPORT_SYMBOL(cancel_delayed_work);
/**
* cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
* @dwork: the delayed work cancel
*
* This is cancel_work_sync() for delayed works.
*
* Return:
* %true if @dwork was pending, %false otherwise.
*/
bool cancel_delayed_work_sync(struct delayed_work *dwork)
{
return __cancel_work_timer(&dwork->work, true);
}
EXPORT_SYMBOL(cancel_delayed_work_sync);
/**
* schedule_on_each_cpu - execute a function synchronously on each online CPU
* @func: the function to call
*
* schedule_on_each_cpu() executes @func on each online CPU using the
* system workqueue and blocks until all CPUs have completed.
* schedule_on_each_cpu() is very slow.
*
* Return:
* 0 on success, -errno on failure.
*/
int schedule_on_each_cpu(work_func_t func)
{
int cpu;
struct work_struct __percpu *works;
works = alloc_percpu(struct work_struct);
if (!works)
return -ENOMEM;
cpus_read_lock();
for_each_online_cpu(cpu) {
struct work_struct *work = per_cpu_ptr(works, cpu);
INIT_WORK(work, func);
schedule_work_on(cpu, work);
}
for_each_online_cpu(cpu)
flush_work(per_cpu_ptr(works, cpu));
cpus_read_unlock();
free_percpu(works);
return 0;
}
/**
* execute_in_process_context - reliably execute the routine with user context
* @fn: the function to execute
* @ew: guaranteed storage for the execute work structure (must
* be available when the work executes)
*
* Executes the function immediately if process context is available,
* otherwise schedules the function for delayed execution.
*
* Return: 0 - function was executed
* 1 - function was scheduled for execution
*/
int execute_in_process_context(work_func_t fn, struct execute_work *ew)
{
if (!in_interrupt()) {
fn(&ew->work);
return 0;
}
INIT_WORK(&ew->work, fn);
schedule_work(&ew->work);
return 1;
}
EXPORT_SYMBOL_GPL(execute_in_process_context);
/**
* free_workqueue_attrs - free a workqueue_attrs
* @attrs: workqueue_attrs to free
*
* Undo alloc_workqueue_attrs().
*/
void free_workqueue_attrs(struct workqueue_attrs *attrs)
{
if (attrs) {
free_cpumask_var(attrs->cpumask);
kfree(attrs);
}
}
/**
* alloc_workqueue_attrs - allocate a workqueue_attrs
*
* Allocate a new workqueue_attrs, initialize with default settings and
* return it.
*
* Return: The allocated new workqueue_attr on success. %NULL on failure.
*/
struct workqueue_attrs *alloc_workqueue_attrs(void)
{
struct workqueue_attrs *attrs;
attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
if (!attrs)
goto fail;
if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
goto fail;
cpumask_copy(attrs->cpumask, cpu_possible_mask);
return attrs;
fail:
free_workqueue_attrs(attrs);
return NULL;
}
static void copy_workqueue_attrs(struct workqueue_attrs *to,
const struct workqueue_attrs *from)
{
to->nice = from->nice;
cpumask_copy(to->cpumask, from->cpumask);
/*
* Unlike hash and equality test, this function doesn't ignore
* ->no_numa as it is used for both pool and wq attrs. Instead,
* get_unbound_pool() explicitly clears ->no_numa after copying.
*/
to->no_numa = from->no_numa;
}
/* hash value of the content of @attr */
static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
{
u32 hash = 0;
hash = jhash_1word(attrs->nice, hash);
hash = jhash(cpumask_bits(attrs->cpumask),
BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
return hash;
}
/* content equality test */
static bool wqattrs_equal(const struct workqueue_attrs *a,
const struct workqueue_attrs *b)
{
if (a->nice != b->nice)
return false;
if (!cpumask_equal(a->cpumask, b->cpumask))
return false;
return true;
}
/**
* init_worker_pool - initialize a newly zalloc'd worker_pool
* @pool: worker_pool to initialize
*
* Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.
*
* Return: 0 on success, -errno on failure. Even on failure, all fields
* inside @pool proper are initialized and put_unbound_pool() can be called
* on @pool safely to release it.
*/
static int init_worker_pool(struct worker_pool *pool)
{
raw_spin_lock_init(&pool->lock);
pool->id = -1;
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
pool->flags |= POOL_DISASSOCIATED;
pool->watchdog_ts = jiffies;
INIT_LIST_HEAD(&pool->worklist);
INIT_LIST_HEAD(&pool->idle_list);
hash_init(pool->busy_hash);
timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
INIT_LIST_HEAD(&pool->workers);
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
pool->refcnt = 1;
/* shouldn't fail above this point */
pool->attrs = alloc_workqueue_attrs();
if (!pool->attrs)
return -ENOMEM;
return 0;
}
#ifdef CONFIG_LOCKDEP
static void wq_init_lockdep(struct workqueue_struct *wq)
{
char *lock_name;
lockdep_register_key(&wq->key);
lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
if (!lock_name)
lock_name = wq->name;
wq->lock_name = lock_name;
lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0);
}
static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
lockdep_unregister_key(&wq->key);
}
static void wq_free_lockdep(struct workqueue_struct *wq)
{
if (wq->lock_name != wq->name)
kfree(wq->lock_name);
}
#else
static void wq_init_lockdep(struct workqueue_struct *wq)
{
}
static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
}
static void wq_free_lockdep(struct workqueue_struct *wq)
{
}
#endif
static void rcu_free_wq(struct rcu_head *rcu)
{
struct workqueue_struct *wq =
container_of(rcu, struct workqueue_struct, rcu);
wq_free_lockdep(wq);
if (!(wq->flags & WQ_UNBOUND))
free_percpu(wq->cpu_pwqs);
else
free_workqueue_attrs(wq->unbound_attrs);
kfree(wq);
}
static void rcu_free_pool(struct rcu_head *rcu)
{
struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
ida_destroy(&pool->worker_ida);
free_workqueue_attrs(pool->attrs);
kfree(pool);
}
/* This returns with the lock held on success (pool manager is inactive). */
static bool wq_manager_inactive(struct worker_pool *pool)
{
raw_spin_lock_irq(&pool->lock);
if (pool->flags & POOL_MANAGER_ACTIVE) {
raw_spin_unlock_irq(&pool->lock);
return false;
}
return true;
}
/**
* put_unbound_pool - put a worker_pool
* @pool: worker_pool to put
*
* Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
* safe manner. get_unbound_pool() calls this function on its failure path
* and this function should be able to release pools which went through,
* successfully or not, init_worker_pool().
*
* Should be called with wq_pool_mutex held.
*/
static void put_unbound_pool(struct worker_pool *pool)
{
DECLARE_COMPLETION_ONSTACK(detach_completion);
struct worker *worker;
lockdep_assert_held(&wq_pool_mutex);
if (--pool->refcnt)
return;
/* sanity checks */
if (WARN_ON(!(pool->cpu < 0)) ||
WARN_ON(!list_empty(&pool->worklist)))
return;
/* release id and unhash */
if (pool->id >= 0)
idr_remove(&worker_pool_idr, pool->id);
hash_del(&pool->hash_node);
/*
* Become the manager and destroy all workers. This prevents
* @pool's workers from blocking on attach_mutex. We're the last
* manager and @pool gets freed with the flag set.
* Because of how wq_manager_inactive() works, we will hold the
* spinlock after a successful wait.
*/
rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool),
TASK_UNINTERRUPTIBLE);
pool->flags |= POOL_MANAGER_ACTIVE;
while ((worker = first_idle_worker(pool)))
destroy_worker(worker);
WARN_ON(pool->nr_workers || pool->nr_idle);
raw_spin_unlock_irq(&pool->lock);
mutex_lock(&wq_pool_attach_mutex);
if (!list_empty(&pool->workers))
pool->detach_completion = &detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
if (pool->detach_completion)
wait_for_completion(pool->detach_completion);
/* shut down the timers */
del_timer_sync(&pool->idle_timer);
del_timer_sync(&pool->mayday_timer);
/* RCU protected to allow dereferences from get_work_pool() */
call_rcu(&pool->rcu, rcu_free_pool);
}
/**
* get_unbound_pool - get a worker_pool with the specified attributes
* @attrs: the attributes of the worker_pool to get
*
* Obtain a worker_pool which has the same attributes as @attrs, bump the
* reference count and return it. If there already is a matching
* worker_pool, it will be used; otherwise, this function attempts to
* create a new one.
*
* Should be called with wq_pool_mutex held.
*
* Return: On success, a worker_pool with the same attributes as @attrs.
* On failure, %NULL.
*/
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
u32 hash = wqattrs_hash(attrs);
struct worker_pool *pool;
int node;
int target_node = NUMA_NO_NODE;
lockdep_assert_held(&wq_pool_mutex);
/* do we already have a matching pool? */
hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { if (wqattrs_equal(pool->attrs, attrs)) { pool->refcnt++;
return pool;
}
}
/* if cpumask is contained inside a NUMA node, we belong to that node */
if (wq_numa_enabled) {
for_each_node(node) {
if (cpumask_subset(attrs->cpumask, wq_numa_possible_cpumask[node])) {
target_node = node;
break;
}
}
}
/* nope, create a new one */
pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
if (!pool || init_worker_pool(pool) < 0)
goto fail;
lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
copy_workqueue_attrs(pool->attrs, attrs);
pool->node = target_node;
/*
* no_numa isn't a worker_pool attribute, always clear it. See
* 'struct workqueue_attrs' comments for detail.
*/
pool->attrs->no_numa = false;
if (worker_pool_assign_id(pool) < 0)
goto fail;
/* create and start the initial worker */
if (wq_online && !create_worker(pool))
goto fail;
/* install */
hash_add(unbound_pool_hash, &pool->hash_node, hash);
return pool;
fail:
if (pool)
put_unbound_pool(pool);
return NULL;
}
static void rcu_free_pwq(struct rcu_head *rcu)
{
kmem_cache_free(pwq_cache,
container_of(rcu, struct pool_workqueue, rcu));
}
/*
* Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
* and needs to be destroyed.
*/
static void pwq_unbound_release_workfn(struct work_struct *work)
{
struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
unbound_release_work);
struct workqueue_struct *wq = pwq->wq;
struct worker_pool *pool = pwq->pool;
bool is_last = false;
/*
* when @pwq is not linked, it doesn't hold any reference to the
* @wq, and @wq is invalid to access.
*/
if (!list_empty(&pwq->pwqs_node)) {
if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
return;
mutex_lock(&wq->mutex);
list_del_rcu(&pwq->pwqs_node);
is_last = list_empty(&wq->pwqs);
mutex_unlock(&wq->mutex);
}
mutex_lock(&wq_pool_mutex);
put_unbound_pool(pool);
mutex_unlock(&wq_pool_mutex);
call_rcu(&pwq->rcu, rcu_free_pwq);
/*
* If we're the last pwq going away, @wq is already dead and no one
* is gonna access it anymore. Schedule RCU free.
*/
if (is_last) {
wq_unregister_lockdep(wq);
call_rcu(&wq->rcu, rcu_free_wq);
}
}
/**
* pwq_adjust_max_active - update a pwq's max_active to the current setting
* @pwq: target pool_workqueue
*
* If @pwq isn't freezing, set @pwq->max_active to the associated
* workqueue's saved_max_active and activate inactive work items
* accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
*/
static void pwq_adjust_max_active(struct pool_workqueue *pwq)
{
struct workqueue_struct *wq = pwq->wq;
bool freezable = wq->flags & WQ_FREEZABLE;
unsigned long flags;
/* for @wq->saved_max_active */
lockdep_assert_held(&wq->mutex);
/* fast exit for non-freezable wqs */
if (!freezable && pwq->max_active == wq->saved_max_active)
return;
/* this function can be called during early boot w/ irq disabled */
raw_spin_lock_irqsave(&pwq->pool->lock, flags);
/*
* During [un]freezing, the caller is responsible for ensuring that
* this function is called at least once after @workqueue_freezing
* is updated and visible.
*/
if (!freezable || !workqueue_freezing) {
bool kick = false;
pwq->max_active = wq->saved_max_active; while (!list_empty(&pwq->inactive_works) && pwq->nr_active < pwq->max_active) { pwq_activate_first_inactive(pwq);
kick = true;
}
/*
* Need to kick a worker after thawed or an unbound wq's
* max_active is bumped. In realtime scenarios, always kicking a
* worker will cause interference on the isolated cpu cores, so
* let's kick iff work items were activated.
*/
if (kick)
wake_up_worker(pwq->pool);
} else {
pwq->max_active = 0;
}
raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
}
/* initialize newly allocated @pwq which is associated with @wq and @pool */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
struct worker_pool *pool)
{
BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); memset(pwq, 0, sizeof(*pwq));
pwq->pool = pool;
pwq->wq = wq;
pwq->flush_color = -1;
pwq->refcnt = 1;
INIT_LIST_HEAD(&pwq->inactive_works);
INIT_LIST_HEAD(&pwq->pwqs_node);
INIT_LIST_HEAD(&pwq->mayday_node);
INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
}
/* sync @pwq with the current state of its associated wq and link it */
static void link_pwq(struct pool_workqueue *pwq)
{
struct workqueue_struct *wq = pwq->wq;
lockdep_assert_held(&wq->mutex);
/* may be called multiple times, ignore if already linked */
if (!list_empty(&pwq->pwqs_node))
return;
/* set the matching work_color */
pwq->work_color = wq->work_color;
/* sync max_active to the current setting */
pwq_adjust_max_active(pwq);
/* link in @pwq */
list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
}
/* obtain a pool matching @attr and create a pwq associating the pool and @wq */
static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
struct worker_pool *pool;
struct pool_workqueue *pwq;
lockdep_assert_held(&wq_pool_mutex);
pool = get_unbound_pool(attrs);
if (!pool)
return NULL;
pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
if (!pwq) {
put_unbound_pool(pool);
return NULL;
}
init_pwq(pwq, wq, pool);
return pwq;
}
/**
* wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
* @attrs: the wq_attrs of the default pwq of the target workqueue
* @node: the target NUMA node
* @cpu_going_down: if >= 0, the CPU to consider as offline
* @cpumask: outarg, the resulting cpumask
*
* Calculate the cpumask a workqueue with @attrs should use on @node. If
* @cpu_going_down is >= 0, that cpu is considered offline during
* calculation. The result is stored in @cpumask.
*
* If NUMA affinity is not enabled, @attrs->cpumask is always used. If
* enabled and @node has online CPUs requested by @attrs, the returned
* cpumask is the intersection of the possible CPUs of @node and
* @attrs->cpumask.
*
* The caller is responsible for ensuring that the cpumask of @node stays
* stable.
*
* Return: %true if the resulting @cpumask is different from @attrs->cpumask,
* %false if equal.
*/
static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
int cpu_going_down, cpumask_t *cpumask)
{
if (!wq_numa_enabled || attrs->no_numa)
goto use_dfl;
/* does @node have any online CPUs @attrs wants? */
cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
if (cpu_going_down >= 0)
cpumask_clear_cpu(cpu_going_down, cpumask);
if (cpumask_empty(cpumask))
goto use_dfl;
/* yeap, return possible CPUs in @node that @attrs wants */
cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
if (cpumask_empty(cpumask)) {
pr_warn_once("WARNING: workqueue cpumask: online intersect > "
"possible intersect\n");
return false;
}
return !cpumask_equal(cpumask, attrs->cpumask);
use_dfl:
cpumask_copy(cpumask, attrs->cpumask);
return false;
}
/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
int node,
struct pool_workqueue *pwq)
{
struct pool_workqueue *old_pwq;
lockdep_assert_held(&wq_pool_mutex);
lockdep_assert_held(&wq->mutex);
/* link_pwq() can handle duplicate calls */
link_pwq(pwq);
old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
return old_pwq;
}
/* context to store the prepared attrs & pwqs before applying */
struct apply_wqattrs_ctx {
struct workqueue_struct *wq; /* target workqueue */
struct workqueue_attrs *attrs; /* attrs to apply */
struct list_head list; /* queued for batching commit */
struct pool_workqueue *dfl_pwq;
struct pool_workqueue *pwq_tbl[];
};
/* free the resources after success or abort */
static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
{
if (ctx) {
int node;
for_each_node(node)
put_pwq_unlocked(ctx->pwq_tbl[node]); put_pwq_unlocked(ctx->dfl_pwq); free_workqueue_attrs(ctx->attrs); kfree(ctx);
}
}
/* allocate the attrs and pwqs for later installation */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
struct apply_wqattrs_ctx *ctx;
struct workqueue_attrs *new_attrs, *tmp_attrs;
int node;
lockdep_assert_held(&wq_pool_mutex);
ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs();
tmp_attrs = alloc_workqueue_attrs();
if (!ctx || !new_attrs || !tmp_attrs)
goto out_free;
/*
* Calculate the attrs of the default pwq.
* If the user configured cpumask doesn't overlap with the
* wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
*/
copy_workqueue_attrs(new_attrs, attrs);
cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
if (unlikely(cpumask_empty(new_attrs->cpumask)))
cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
/*
* We may create multiple pwqs with differing cpumasks. Make a
* copy of @new_attrs which will be modified and used to obtain
* pools.
*/
copy_workqueue_attrs(tmp_attrs, new_attrs);
/*
* If something goes wrong during CPU up/down, we'll fall back to
* the default pwq covering whole @attrs->cpumask. Always create
* it even if we don't use it immediately.
*/
ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
if (!ctx->dfl_pwq)
goto out_free;
for_each_node(node) { if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) { ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
if (!ctx->pwq_tbl[node])
goto out_free;
} else {
ctx->dfl_pwq->refcnt++;
ctx->pwq_tbl[node] = ctx->dfl_pwq;
}
}
/* save the user configured attrs and sanitize it. */
copy_workqueue_attrs(new_attrs, attrs);
cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
ctx->attrs = new_attrs;
ctx->wq = wq;
free_workqueue_attrs(tmp_attrs);
return ctx;
out_free:
free_workqueue_attrs(tmp_attrs);
free_workqueue_attrs(new_attrs);
apply_wqattrs_cleanup(ctx);
return NULL;
}
/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
{
int node;
/* all pwqs have been created successfully, let's install'em */
mutex_lock(&ctx->wq->mutex);
copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
/* save the previous pwq and install the new one */
for_each_node(node)
ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
ctx->pwq_tbl[node]);
/* @dfl_pwq might not have been used, ensure it's linked */
link_pwq(ctx->dfl_pwq);
swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
mutex_unlock(&ctx->wq->mutex);
}
static void apply_wqattrs_lock(void)
{
/* CPUs should stay stable across pwq creations and installations */
cpus_read_lock();
mutex_lock(&wq_pool_mutex);
}
static void apply_wqattrs_unlock(void)
{
mutex_unlock(&wq_pool_mutex);
cpus_read_unlock();
}
static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
struct apply_wqattrs_ctx *ctx;
/* only unbound workqueues can change attributes */
if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
return -EINVAL;
/* creating multiple pwqs breaks ordering guarantee */
if (!list_empty(&wq->pwqs)) { if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return -EINVAL;
wq->flags &= ~__WQ_ORDERED;
}
ctx = apply_wqattrs_prepare(wq, attrs);
if (!ctx)
return -ENOMEM;
/* the ctx has been prepared successfully, let's commit it */
apply_wqattrs_commit(ctx);
apply_wqattrs_cleanup(ctx);
return 0;
}
/**
* apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
* @wq: the target workqueue
* @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
*
* Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
* machines, this function maps a separate pwq to each NUMA node with
* possibles CPUs in @attrs->cpumask so that work items are affine to the
* NUMA node it was issued on. Older pwqs are released as in-flight work
* items finish. Note that a work item which repeatedly requeues itself
* back-to-back will stay on its current pwq.
*
* Performs GFP_KERNEL allocations.
*
* Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock().
*
* Return: 0 on success and -errno on failure.
*/
int apply_workqueue_attrs(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
int ret;
lockdep_assert_cpus_held();
mutex_lock(&wq_pool_mutex);
ret = apply_workqueue_attrs_locked(wq, attrs);
mutex_unlock(&wq_pool_mutex);
return ret;
}
/**
* wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
* @wq: the target workqueue
* @cpu: the CPU coming up or going down
* @online: whether @cpu is coming up or going down
*
* This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
* %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
* @wq accordingly.
*
* If NUMA affinity can't be adjusted due to memory allocation failure, it
* falls back to @wq->dfl_pwq which may not be optimal but is always
* correct.
*
* Note that when the last allowed CPU of a NUMA node goes offline for a
* workqueue with a cpumask spanning multiple nodes, the workers which were
* already executing the work items for the workqueue will lose their CPU
* affinity and may execute on any CPU. This is similar to how per-cpu
* workqueues behave on CPU_DOWN. If a workqueue user wants strict
* affinity, it's the user's responsibility to flush the work item from
* CPU_DOWN_PREPARE.
*/
static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
bool online)
{
int node = cpu_to_node(cpu);
int cpu_off = online ? -1 : cpu;
struct pool_workqueue *old_pwq = NULL, *pwq;
struct workqueue_attrs *target_attrs;
cpumask_t *cpumask;
lockdep_assert_held(&wq_pool_mutex);
if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) ||
wq->unbound_attrs->no_numa)
return;
/*
* We don't wanna alloc/free wq_attrs for each wq for each CPU.
* Let's use a preallocated one. The following buf is protected by
* CPU hotplug exclusion.
*/
target_attrs = wq_update_unbound_numa_attrs_buf;
cpumask = target_attrs->cpumask;
copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
pwq = unbound_pwq_by_node(wq, node);
/*
* Let's determine what needs to be done. If the target cpumask is
* different from the default pwq's, we need to compare it to @pwq's
* and create a new one if they don't match. If the target cpumask
* equals the default pwq's, the default pwq should be used.
*/
if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
return;
} else {
goto use_dfl_pwq;
}
/* create a new pwq */
pwq = alloc_unbound_pwq(wq, target_attrs);
if (!pwq) {
pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
wq->name);
goto use_dfl_pwq;
}
/* Install the new pwq. */
mutex_lock(&wq->mutex);
old_pwq = numa_pwq_tbl_install(wq, node, pwq);
goto out_unlock;
use_dfl_pwq:
mutex_lock(&wq->mutex);
raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
get_pwq(wq->dfl_pwq);
raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
out_unlock:
mutex_unlock(&wq->mutex);
put_pwq_unlocked(old_pwq);
}
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
bool highpri = wq->flags & WQ_HIGHPRI;
int cpu, ret;
if (!(wq->flags & WQ_UNBOUND)) {
wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
if (!wq->cpu_pwqs)
return -ENOMEM;
for_each_possible_cpu(cpu) {
struct pool_workqueue *pwq =
per_cpu_ptr(wq->cpu_pwqs, cpu);
struct worker_pool *cpu_pools =
per_cpu(cpu_worker_pools, cpu);
init_pwq(pwq, wq, &cpu_pools[highpri]);
mutex_lock(&wq->mutex);
link_pwq(pwq);
mutex_unlock(&wq->mutex);
}
return 0;
}
cpus_read_lock();
if (wq->flags & __WQ_ORDERED) {
ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
/* there should only be single pwq for ordering guarantee */
WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
} else {
ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
}
cpus_read_unlock(); return ret;
}
static int wq_clamp_max_active(int max_active, unsigned int flags,
const char *name)
{
int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; if (max_active < 1 || max_active > lim) pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
max_active, name, 1, lim);
return clamp_val(max_active, 1, lim);
}
/*
* Workqueues which may be used during memory reclaim should have a rescuer
* to guarantee forward progress.
*/
static int init_rescuer(struct workqueue_struct *wq)
{
struct worker *rescuer;
int ret;
if (!(wq->flags & WQ_MEM_RECLAIM))
return 0;
rescuer = alloc_worker(NUMA_NO_NODE);
if (!rescuer)
return -ENOMEM;
rescuer->rescue_wq = wq;
rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
if (IS_ERR(rescuer->task)) {
ret = PTR_ERR(rescuer->task);
kfree(rescuer);
return ret;
}
wq->rescuer = rescuer;
kthread_bind_mask(rescuer->task, cpu_possible_mask);
wake_up_process(rescuer->task);
return 0;
}
__printf(1, 4)
struct workqueue_struct *alloc_workqueue(const char *fmt,
unsigned int flags,
int max_active, ...)
{
size_t tbl_size = 0;
va_list args;
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
/*
* Unbound && max_active == 1 used to imply ordered, which is no
* longer the case on NUMA machines due to per-node pools. While
* alloc_ordered_workqueue() is the right way to create an ordered
* workqueue, keep the previous behavior to avoid subtle breakages
* on NUMA.
*/
if ((flags & WQ_UNBOUND) && max_active == 1) flags |= __WQ_ORDERED;
/* see the comment above the definition of WQ_POWER_EFFICIENT */
if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND;
/* allocate wq and format name */
if (flags & WQ_UNBOUND) tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
if (!wq)
return NULL;
if (flags & WQ_UNBOUND) { wq->unbound_attrs = alloc_workqueue_attrs();
if (!wq->unbound_attrs)
goto err_free_wq;
}
va_start(args, max_active);
vsnprintf(wq->name, sizeof(wq->name), fmt, args);
va_end(args);
max_active = max_active ?: WQ_DFL_ACTIVE;
max_active = wq_clamp_max_active(max_active, flags, wq->name);
/* init wq */
wq->flags = flags;
wq->saved_max_active = max_active;
mutex_init(&wq->mutex);
atomic_set(&wq->nr_pwqs_to_flush, 0);
INIT_LIST_HEAD(&wq->pwqs);
INIT_LIST_HEAD(&wq->flusher_queue);
INIT_LIST_HEAD(&wq->flusher_overflow);
INIT_LIST_HEAD(&wq->maydays);
wq_init_lockdep(wq);
INIT_LIST_HEAD(&wq->list);
if (alloc_and_link_pwqs(wq) < 0)
goto err_unreg_lockdep;
if (wq_online && init_rescuer(wq) < 0)
goto err_destroy;
if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
goto err_destroy;
/*
* wq_pool_mutex protects global freeze state and workqueues list.
* Grab it, adjust max_active and add the new @wq to workqueues
* list.
*/
mutex_lock(&wq_pool_mutex);
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq)
pwq_adjust_max_active(pwq); mutex_unlock(&wq->mutex);
list_add_tail_rcu(&wq->list, &workqueues);
mutex_unlock(&wq_pool_mutex);
return wq;
err_unreg_lockdep:
wq_unregister_lockdep(wq);
wq_free_lockdep(wq);
err_free_wq:
free_workqueue_attrs(wq->unbound_attrs); kfree(wq);
return NULL;
err_destroy:
destroy_workqueue(wq);
return NULL;
}
EXPORT_SYMBOL_GPL(alloc_workqueue);
static bool pwq_busy(struct pool_workqueue *pwq)
{
int i;
for (i = 0; i < WORK_NR_COLORS; i++) if (pwq->nr_in_flight[i])
return true;
if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1))
return true;
if (pwq->nr_active || !list_empty(&pwq->inactive_works))
return true;
return false;
}
/**
* destroy_workqueue - safely terminate a workqueue
* @wq: target workqueue
*
* Safely destroy a workqueue. All work currently pending will be done first.
*/
void destroy_workqueue(struct workqueue_struct *wq)
{
struct pool_workqueue *pwq;
int node;
/*
* Remove it from sysfs first so that sanity check failure doesn't
* lead to sysfs name conflicts.
*/
workqueue_sysfs_unregister(wq);
/* drain it before proceeding with destruction */
drain_workqueue(wq);
/* kill rescuer, if sanity checks fail, leave it w/o rescuer */
if (wq->rescuer) {
struct worker *rescuer = wq->rescuer;
/* this prevents new queueing */
raw_spin_lock_irq(&wq_mayday_lock);
wq->rescuer = NULL;
raw_spin_unlock_irq(&wq_mayday_lock);
/* rescuer will empty maydays list before exiting */
kthread_stop(rescuer->task);
kfree(rescuer);
}
/*
* Sanity checks - grab all the locks so that we wait for all
* in-flight operations which may do put_pwq().
*/
mutex_lock(&wq_pool_mutex);
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq) {
raw_spin_lock_irq(&pwq->pool->lock);
if (WARN_ON(pwq_busy(pwq))) {
pr_warn("%s: %s has the following busy pwq\n",
__func__, wq->name);
show_pwq(pwq);
raw_spin_unlock_irq(&pwq->pool->lock);
mutex_unlock(&wq->mutex);
mutex_unlock(&wq_pool_mutex);
show_workqueue_state();
return;
}
raw_spin_unlock_irq(&pwq->pool->lock);
}
mutex_unlock(&wq->mutex);
/*
* wq list is used to freeze wq, remove from list after
* flushing is complete in case freeze races us.
*/
list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
if (!(wq->flags & WQ_UNBOUND)) {
wq_unregister_lockdep(wq);
/*
* The base ref is never dropped on per-cpu pwqs. Directly
* schedule RCU free.
*/
call_rcu(&wq->rcu, rcu_free_wq);
} else {
/*
* We're the sole accessor of @wq at this point. Directly
* access numa_pwq_tbl[] and dfl_pwq to put the base refs.
* @wq will be freed when the last pwq is released.
*/
for_each_node(node) {
pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
put_pwq_unlocked(pwq);
}
/*
* Put dfl_pwq. @wq may be freed any time after dfl_pwq is
* put. Don't access it afterwards.
*/
pwq = wq->dfl_pwq;
wq->dfl_pwq = NULL;
put_pwq_unlocked(pwq);
}
}
EXPORT_SYMBOL_GPL(destroy_workqueue);
/**
* workqueue_set_max_active - adjust max_active of a workqueue
* @wq: target workqueue
* @max_active: new max_active value.
*
* Set max_active of @wq to @max_active.
*
* CONTEXT:
* Don't call from IRQ context.
*/
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
struct pool_workqueue *pwq;
/* disallow meddling with max_active for ordered workqueues */
if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return;
max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
mutex_lock(&wq->mutex);
wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
for_each_pwq(pwq, wq)
pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(workqueue_set_max_active);
/**
* current_work - retrieve %current task's work struct
*
* Determine if %current task is a workqueue worker and what it's working on.
* Useful to find out the context that the %current task is running in.
*
* Return: work struct if %current task is a workqueue worker, %NULL otherwise.
*/
struct work_struct *current_work(void)
{
struct worker *worker = current_wq_worker();
return worker ? worker->current_work : NULL;
}
EXPORT_SYMBOL(current_work);
/**
* current_is_workqueue_rescuer - is %current workqueue rescuer?
*
* Determine whether %current is a workqueue rescuer. Can be used from
* work functions to determine whether it's being run off the rescuer task.
*
* Return: %true if %current is a workqueue rescuer. %false otherwise.
*/
bool current_is_workqueue_rescuer(void)
{
struct worker *worker = current_wq_worker();
return worker && worker->rescue_wq;
}
/**
* workqueue_congested - test whether a workqueue is congested
* @cpu: CPU in question
* @wq: target workqueue
*
* Test whether @wq's cpu workqueue for @cpu is congested. There is
* no synchronization around this function and the test result is
* unreliable and only useful as advisory hints or for debugging.
*
* If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
* Note that both per-cpu and unbound workqueues may be associated with
* multiple pool_workqueues which have separate congested states. A
* workqueue being congested on one CPU doesn't mean the workqueue is also
* contested on other CPUs / NUMA nodes.
*
* Return:
* %true if congested, %false otherwise.
*/
bool workqueue_congested(int cpu, struct workqueue_struct *wq)
{
struct pool_workqueue *pwq;
bool ret;
rcu_read_lock();
preempt_disable();
if (cpu == WORK_CPU_UNBOUND)
cpu = smp_processor_id();
if (!(wq->flags & WQ_UNBOUND))
pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
else
pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
ret = !list_empty(&pwq->inactive_works);
preempt_enable();
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(workqueue_congested);
/**
* work_busy - test whether a work is currently pending or running
* @work: the work to be tested
*
* Test whether @work is currently pending or running. There is no
* synchronization around this function and the test result is
* unreliable and only useful as advisory hints or for debugging.
*
* Return:
* OR'd bitmask of WORK_BUSY_* bits.
*/
unsigned int work_busy(struct work_struct *work)
{
struct worker_pool *pool;
unsigned long flags;
unsigned int ret = 0;
if (work_pending(work))
ret |= WORK_BUSY_PENDING;
rcu_read_lock();
pool = get_work_pool(work);
if (pool) {
raw_spin_lock_irqsave(&pool->lock, flags);
if (find_worker_executing_work(pool, work))
ret |= WORK_BUSY_RUNNING; raw_spin_unlock_irqrestore(&pool->lock, flags);
}
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(work_busy);
/**
* set_worker_desc - set description for the current work item
* @fmt: printf-style format string
* @...: arguments for the format string
*
* This function can be called by a running work function to describe what
* the work item is about. If the worker task gets dumped, this
* information will be printed out together to help debugging. The
* description can be at most WORKER_DESC_LEN including the trailing '\0'.
*/
void set_worker_desc(const char *fmt, ...)
{
struct worker *worker = current_wq_worker();
va_list args;
if (worker) {
va_start(args, fmt);
vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
va_end(args);
}
}
EXPORT_SYMBOL_GPL(set_worker_desc);
/**
* print_worker_info - print out worker information and description
* @log_lvl: the log level to use when printing
* @task: target task
*
* If @task is a worker and currently executing a work item, print out the
* name of the workqueue being serviced and worker description set with
* set_worker_desc() by the currently executing work item.
*
* This function can be safely called on any task as long as the
* task_struct itself is accessible. While safe, this function isn't
* synchronized and may print out mixups or garbages of limited length.
*/
void print_worker_info(const char *log_lvl, struct task_struct *task)
{
work_func_t *fn = NULL;
char name[WQ_NAME_LEN] = { };
char desc[WORKER_DESC_LEN] = { };
struct pool_workqueue *pwq = NULL;
struct workqueue_struct *wq = NULL;
struct worker *worker;
if (!(task->flags & PF_WQ_WORKER))
return;
/*
* This function is called without any synchronization and @task
* could be in any state. Be careful with dereferences.
*/
worker = kthread_probe_data(task);
/*
* Carefully copy the associated workqueue's workfn, name and desc.
* Keep the original last '\0' in case the original is garbage.
*/
copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn));
copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq));
copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq));
copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1);
copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1);
if (fn || name[0] || desc[0]) {
printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
if (strcmp(name, desc))
pr_cont(" (%s)", desc);
pr_cont("\n");
}
}
static void pr_cont_pool_info(struct worker_pool *pool)
{
pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
if (pool->node != NUMA_NO_NODE)
pr_cont(" node=%d", pool->node);
pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
}
static void pr_cont_work(bool comma, struct work_struct *work)
{
if (work->func == wq_barrier_func) {
struct wq_barrier *barr;
barr = container_of(work, struct wq_barrier, work);
pr_cont("%s BAR(%d)", comma ? "," : "",
task_pid_nr(barr->task));
} else {
pr_cont("%s %ps", comma ? "," : "", work->func);
}
}
static void show_pwq(struct pool_workqueue *pwq)
{
struct worker_pool *pool = pwq->pool;
struct work_struct *work;
struct worker *worker;
bool has_in_flight = false, has_pending = false;
int bkt;
pr_info(" pwq %d:", pool->id);
pr_cont_pool_info(pool);
pr_cont(" active=%d/%d refcnt=%d%s\n",
pwq->nr_active, pwq->max_active, pwq->refcnt,
!list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
if (worker->current_pwq == pwq) {
has_in_flight = true;
break;
}
}
if (has_in_flight) {
bool comma = false;
pr_info(" in-flight:");
hash_for_each(pool->busy_hash, bkt, worker, hentry) {
if (worker->current_pwq != pwq)
continue;
pr_cont("%s %d%s:%ps", comma ? "," : "",
task_pid_nr(worker->task),
worker->rescue_wq ? "(RESCUER)" : "",
worker->current_func);
list_for_each_entry(work, &worker->scheduled, entry)
pr_cont_work(false, work);
comma = true;
}
pr_cont("\n");
}
list_for_each_entry(work, &pool->worklist, entry) {
if (get_work_pwq(work) == pwq) {
has_pending = true;
break;
}
}
if (has_pending) {
bool comma = false;
pr_info(" pending:");
list_for_each_entry(work, &pool->worklist, entry) {
if (get_work_pwq(work) != pwq)
continue;
pr_cont_work(comma, work);
comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
}
pr_cont("\n");
}
if (!list_empty(&pwq->inactive_works)) {
bool comma = false;
pr_info(" inactive:");
list_for_each_entry(work, &pwq->inactive_works, entry) {
pr_cont_work(comma, work);
comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
}
pr_cont("\n");
}
}
/**
* show_workqueue_state - dump workqueue state
*
* Called from a sysrq handler or try_to_freeze_tasks() and prints out
* all busy workqueues and pools.
*/
void show_workqueue_state(void)
{
struct workqueue_struct *wq;
struct worker_pool *pool;
unsigned long flags;
int pi;
rcu_read_lock();
pr_info("Showing busy workqueues and worker pools:\n");
list_for_each_entry_rcu(wq, &workqueues, list) {
struct pool_workqueue *pwq;
bool idle = true;
for_each_pwq(pwq, wq) {
if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
idle = false;
break;
}
}
if (idle)
continue;
pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
for_each_pwq(pwq, wq) {
raw_spin_lock_irqsave(&pwq->pool->lock, flags);
if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
/*
* Defer printing to avoid deadlocks in console
* drivers that queue work while holding locks
* also taken in their write paths.
*/
printk_deferred_enter();
show_pwq(pwq);
printk_deferred_exit();
}
raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
* sysrq-t -> show_workqueue_state(). Avoid triggering
* hard lockup.
*/
touch_nmi_watchdog();
}
}
for_each_pool(pool, pi) {
struct worker *worker;
bool first = true;
raw_spin_lock_irqsave(&pool->lock, flags);
if (pool->nr_workers == pool->nr_idle)
goto next_pool;
/*
* Defer printing to avoid deadlocks in console drivers that
* queue work while holding locks also taken in their write
* paths.
*/
printk_deferred_enter();
pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool);
pr_cont(" hung=%us workers=%d",
jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
pool->nr_workers);
if (pool->manager)
pr_cont(" manager: %d",
task_pid_nr(pool->manager->task));
list_for_each_entry(worker, &pool->idle_list, entry) {
pr_cont(" %s%d", first ? "idle: " : "",
task_pid_nr(worker->task));
first = false;
}
pr_cont("\n");
printk_deferred_exit();
next_pool:
raw_spin_unlock_irqrestore(&pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
* sysrq-t -> show_workqueue_state(). Avoid triggering
* hard lockup.
*/
touch_nmi_watchdog();
}
rcu_read_unlock();
}
/* used to show worker information through /proc/PID/{comm,stat,status} */
void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
{
int off;
/* always show the actual comm */
off = strscpy(buf, task->comm, size);
if (off < 0)
return;
/* stabilize PF_WQ_WORKER and worker pool association */
mutex_lock(&wq_pool_attach_mutex);
if (task->flags & PF_WQ_WORKER) {
struct worker *worker = kthread_data(task);
struct worker_pool *pool = worker->pool;
if (pool) {
raw_spin_lock_irq(&pool->lock);
/*
* ->desc tracks information (wq name or
* set_worker_desc()) for the latest execution. If
* current, prepend '+', otherwise '-'.
*/
if (worker->desc[0] != '\0') {
if (worker->current_work)
scnprintf(buf + off, size - off, "+%s",
worker->desc);
else
scnprintf(buf + off, size - off, "-%s",
worker->desc);
}
raw_spin_unlock_irq(&pool->lock);
}
}
mutex_unlock(&wq_pool_attach_mutex);
}
#ifdef CONFIG_SMP
/*
* CPU hotplug.
*
* There are two challenges in supporting CPU hotplug. Firstly, there
* are a lot of assumptions on strong associations among work, pwq and
* pool which make migrating pending and scheduled works very
* difficult to implement without impacting hot paths. Secondly,
* worker pools serve mix of short, long and very long running works making
* blocked draining impractical.
*
* This is solved by allowing the pools to be disassociated from the CPU
* running as an unbound one and allowing it to be reattached later if the
* cpu comes back online.
*/
static void unbind_workers(int cpu)
{
struct worker_pool *pool;
struct worker *worker;
for_each_cpu_worker_pool(pool, cpu) {
mutex_lock(&wq_pool_attach_mutex);
raw_spin_lock_irq(&pool->lock);
/*
* We've blocked all attach/detach operations. Make all workers
* unbound and set DISASSOCIATED. Before this, all workers
* except for the ones which are still executing works from
* before the last CPU down must be on the cpu. After
* this, they may become diasporas.
*/
for_each_pool_worker(worker, pool)
worker->flags |= WORKER_UNBOUND;
pool->flags |= POOL_DISASSOCIATED;
raw_spin_unlock_irq(&pool->lock);
for_each_pool_worker(worker, pool) {
kthread_set_per_cpu(worker->task, -1);
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
}
mutex_unlock(&wq_pool_attach_mutex);
/*
* Call schedule() so that we cross rq->lock and thus can
* guarantee sched callbacks see the %WORKER_UNBOUND flag.
* This is necessary as scheduler callbacks may be invoked
* from other cpus.
*/
schedule();
/*
* Sched callbacks are disabled now. Zap nr_running.
* After this, nr_running stays zero and need_more_worker()
* and keep_working() are always true as long as the
* worklist is not empty. This pool now behaves as an
* unbound (in terms of concurrency management) pool which
* are served by workers tied to the pool.
*/
atomic_set(&pool->nr_running, 0);
/*
* With concurrency management just turned off, a busy
* worker blocking could lead to lengthy stalls. Kick off
* unbound chain execution of currently pending work items.
*/
raw_spin_lock_irq(&pool->lock);
wake_up_worker(pool);
raw_spin_unlock_irq(&pool->lock);
}
}
/**
* rebind_workers - rebind all workers of a pool to the associated CPU
* @pool: pool of interest
*
* @pool->cpu is coming online. Rebind all workers to the CPU.
*/
static void rebind_workers(struct worker_pool *pool)
{
struct worker *worker;
lockdep_assert_held(&wq_pool_attach_mutex);
/*
* Restore CPU affinity of all workers. As all idle workers should
* be on the run-queue of the associated CPU before any local
* wake-ups for concurrency management happen, restore CPU affinity
* of all workers first and then clear UNBOUND. As we're called
* from CPU_ONLINE, the following shouldn't fail.
*/
for_each_pool_worker(worker, pool) {
kthread_set_per_cpu(worker->task, pool->cpu);
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
pool->attrs->cpumask) < 0);
}
raw_spin_lock_irq(&pool->lock);
pool->flags &= ~POOL_DISASSOCIATED;
for_each_pool_worker(worker, pool) {
unsigned int worker_flags = worker->flags;
/*
* A bound idle worker should actually be on the runqueue
* of the associated CPU for local wake-ups targeting it to
* work. Kick all idle workers so that they migrate to the
* associated CPU. Doing this in the same loop as
* replacing UNBOUND with REBOUND is safe as no worker will
* be bound before @pool->lock is released.
*/
if (worker_flags & WORKER_IDLE)
wake_up_process(worker->task);
/*
* We want to clear UNBOUND but can't directly call
* worker_clr_flags() or adjust nr_running. Atomically
* replace UNBOUND with another NOT_RUNNING flag REBOUND.
* @worker will clear REBOUND using worker_clr_flags() when
* it initiates the next execution cycle thus restoring
* concurrency management. Note that when or whether
* @worker clears REBOUND doesn't affect correctness.
*
* WRITE_ONCE() is necessary because @worker->flags may be
* tested without holding any lock in
* wq_worker_running(). Without it, NOT_RUNNING test may
* fail incorrectly leading to premature concurrency
* management operations.
*/
WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
worker_flags |= WORKER_REBOUND;
worker_flags &= ~WORKER_UNBOUND;
WRITE_ONCE(worker->flags, worker_flags);
}
raw_spin_unlock_irq(&pool->lock);
}
/**
* restore_unbound_workers_cpumask - restore cpumask of unbound workers
* @pool: unbound pool of interest
* @cpu: the CPU which is coming up
*
* An unbound pool may end up with a cpumask which doesn't have any online
* CPUs. When a worker of such pool get scheduled, the scheduler resets
* its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any
* online CPU before, cpus_allowed of all its workers should be restored.
*/
static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
{
static cpumask_t cpumask;
struct worker *worker;
lockdep_assert_held(&wq_pool_attach_mutex);
/* is @cpu allowed for @pool? */
if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
return;
cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
/* as we're called from CPU_ONLINE, the following shouldn't fail */
for_each_pool_worker(worker, pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
}
int workqueue_prepare_cpu(unsigned int cpu)
{
struct worker_pool *pool;
for_each_cpu_worker_pool(pool, cpu) {
if (pool->nr_workers)
continue;
if (!create_worker(pool))
return -ENOMEM;
}
return 0;
}
int workqueue_online_cpu(unsigned int cpu)
{
struct worker_pool *pool;
struct workqueue_struct *wq;
int pi;
mutex_lock(&wq_pool_mutex);
for_each_pool(pool, pi) {
mutex_lock(&wq_pool_attach_mutex);
if (pool->cpu == cpu)
rebind_workers(pool);
else if (pool->cpu < 0)
restore_unbound_workers_cpumask(pool, cpu);
mutex_unlock(&wq_pool_attach_mutex);
}
/* update NUMA affinity of unbound workqueues */
list_for_each_entry(wq, &workqueues, list)
wq_update_unbound_numa(wq, cpu, true);
mutex_unlock(&wq_pool_mutex);
return 0;
}
int workqueue_offline_cpu(unsigned int cpu)
{
struct workqueue_struct *wq;
/* unbinding per-cpu workers should happen on the local CPU */
if (WARN_ON(cpu != smp_processor_id()))
return -1;
unbind_workers(cpu);
/* update NUMA affinity of unbound workqueues */
mutex_lock(&wq_pool_mutex);
list_for_each_entry(wq, &workqueues, list)
wq_update_unbound_numa(wq, cpu, false);
mutex_unlock(&wq_pool_mutex);
return 0;
}
struct work_for_cpu {
struct work_struct work;
long (*fn)(void *);
void *arg;
long ret;
};
static void work_for_cpu_fn(struct work_struct *work)
{
struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
wfc->ret = wfc->fn(wfc->arg);
}
/**
* work_on_cpu - run a function in thread context on a particular cpu
* @cpu: the cpu to run on
* @fn: the function to run
* @arg: the function arg
*
* It is up to the caller to ensure that the cpu doesn't go offline.
* The caller must not hold any locks which would prevent @fn from completing.
*
* Return: The value @fn returns.
*/
long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
struct work_for_cpu wfc = { .fn = fn, .arg = arg };
INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
schedule_work_on(cpu, &wfc.work);
flush_work(&wfc.work);
destroy_work_on_stack(&wfc.work);
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);
/**
* work_on_cpu_safe - run a function in thread context on a particular cpu
* @cpu: the cpu to run on
* @fn: the function to run
* @arg: the function argument
*
* Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
* any locks which would prevent @fn from completing.
*
* Return: The value @fn returns.
*/
long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
{
long ret = -ENODEV;
cpus_read_lock();
if (cpu_online(cpu))
ret = work_on_cpu(cpu, fn, arg);
cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_safe);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER
/**
* freeze_workqueues_begin - begin freezing workqueues
*
* Start freezing workqueues. After this function returns, all freezable
* workqueues will queue new works to their inactive_works list instead of
* pool->worklist.
*
* CONTEXT:
* Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
*/
void freeze_workqueues_begin(void)
{
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
mutex_lock(&wq_pool_mutex);
WARN_ON_ONCE(workqueue_freezing);
workqueue_freezing = true;
list_for_each_entry(wq, &workqueues, list) {
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq)
pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
}
mutex_unlock(&wq_pool_mutex);
}
/**
* freeze_workqueues_busy - are freezable workqueues still busy?
*
* Check whether freezing is complete. This function must be called
* between freeze_workqueues_begin() and thaw_workqueues().
*
* CONTEXT:
* Grabs and releases wq_pool_mutex.
*
* Return:
* %true if some freezable workqueues are still busy. %false if freezing
* is complete.
*/
bool freeze_workqueues_busy(void)
{
bool busy = false;
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
mutex_lock(&wq_pool_mutex);
WARN_ON_ONCE(!workqueue_freezing);
list_for_each_entry(wq, &workqueues, list) {
if (!(wq->flags & WQ_FREEZABLE))
continue;
/*
* nr_active is monotonically decreasing. It's safe
* to peek without lock.
*/
rcu_read_lock();
for_each_pwq(pwq, wq) {
WARN_ON_ONCE(pwq->nr_active < 0);
if (pwq->nr_active) {
busy = true;
rcu_read_unlock();
goto out_unlock;
}
}
rcu_read_unlock();
}
out_unlock:
mutex_unlock(&wq_pool_mutex);
return busy;
}
/**
* thaw_workqueues - thaw workqueues
*
* Thaw workqueues. Normal queueing is restored and all collected
* frozen works are transferred to their respective pool worklists.
*
* CONTEXT:
* Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
*/
void thaw_workqueues(void)
{
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
mutex_lock(&wq_pool_mutex);
if (!workqueue_freezing)
goto out_unlock;
workqueue_freezing = false;
/* restore max_active and repopulate worklist */
list_for_each_entry(wq, &workqueues, list) {
mutex_lock(&wq->mutex);
for_each_pwq(pwq, wq)
pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
}
out_unlock:
mutex_unlock(&wq_pool_mutex);
}
#endif /* CONFIG_FREEZER */
static int workqueue_apply_unbound_cpumask(void)
{
LIST_HEAD(ctxs);
int ret = 0;
struct workqueue_struct *wq;
struct apply_wqattrs_ctx *ctx, *n;
lockdep_assert_held(&wq_pool_mutex);
list_for_each_entry(wq, &workqueues, list) {
if (!(wq->flags & WQ_UNBOUND))
continue;
/* creating multiple pwqs breaks ordering guarantee */
if (wq->flags & __WQ_ORDERED)
continue;
ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
if (!ctx) {
ret = -ENOMEM;
break;
}
list_add_tail(&ctx->list, &ctxs);
}
list_for_each_entry_safe(ctx, n, &ctxs, list) {
if (!ret)
apply_wqattrs_commit(ctx);
apply_wqattrs_cleanup(ctx);
}
return ret;
}
/**
* workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
* @cpumask: the cpumask to set
*
* The low-level workqueues cpumask is a global cpumask that limits
* the affinity of all unbound workqueues. This function check the @cpumask
* and apply it to all unbound workqueues and updates all pwqs of them.
*
* Return: 0 - Success
* -EINVAL - Invalid @cpumask
* -ENOMEM - Failed to allocate memory for attrs or pwqs.
*/
int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
{
int ret = -EINVAL;
cpumask_var_t saved_cpumask;
/*
* Not excluding isolated cpus on purpose.
* If the user wishes to include them, we allow that.
*/
cpumask_and(cpumask, cpumask, cpu_possible_mask);
if (!cpumask_empty(cpumask)) {
apply_wqattrs_lock();
if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
ret = 0;
goto out_unlock;
}
if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) {
ret = -ENOMEM;
goto out_unlock;
}
/* save the old wq_unbound_cpumask. */
cpumask_copy(saved_cpumask, wq_unbound_cpumask);
/* update wq_unbound_cpumask at first and apply it to wqs. */
cpumask_copy(wq_unbound_cpumask, cpumask);
ret = workqueue_apply_unbound_cpumask();
/* restore the wq_unbound_cpumask when failed. */
if (ret < 0)
cpumask_copy(wq_unbound_cpumask, saved_cpumask);
free_cpumask_var(saved_cpumask);
out_unlock:
apply_wqattrs_unlock();
}
return ret;
}
#ifdef CONFIG_SYSFS
/*
* Workqueues with WQ_SYSFS flag set is visible to userland via
* /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
* following attributes.
*
* per_cpu RO bool : whether the workqueue is per-cpu or unbound
* max_active RW int : maximum number of in-flight work items
*
* Unbound workqueues have the following extra attributes.
*
* pool_ids RO int : the associated pool IDs for each node
* nice RW int : nice value of the workers
* cpumask RW mask : bitmask of allowed CPUs for the workers
* numa RW bool : whether enable NUMA affinity
*/
struct wq_device {
struct workqueue_struct *wq;
struct device dev;
};
static struct workqueue_struct *dev_to_wq(struct device *dev)
{
struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
return wq_dev->wq;
}
static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
}
static DEVICE_ATTR_RO(per_cpu);
static ssize_t max_active_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
}
static ssize_t max_active_store(struct device *dev,
struct device_attribute *attr, const char *buf,
size_t count)
{
struct workqueue_struct *wq = dev_to_wq(dev);
int val;
if (sscanf(buf, "%d", &val) != 1 || val <= 0)
return -EINVAL;
workqueue_set_max_active(wq, val);
return count;
}
static DEVICE_ATTR_RW(max_active);
static struct attribute *wq_sysfs_attrs[] = {
&dev_attr_per_cpu.attr,
&dev_attr_max_active.attr,
NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs);
static ssize_t wq_pool_ids_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
const char *delim = "";
int node, written = 0;
cpus_read_lock();
rcu_read_lock();
for_each_node(node) {
written += scnprintf(buf + written, PAGE_SIZE - written,
"%s%d:%d", delim, node,
unbound_pwq_by_node(wq, node)->pool->id);
delim = " ";
}
written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
rcu_read_unlock();
cpus_read_unlock();
return written;
}
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
int written;
mutex_lock(&wq->mutex);
written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
mutex_unlock(&wq->mutex);
return written;
}
/* prepare workqueue_attrs for sysfs store operations */
static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
{
struct workqueue_attrs *attrs;
lockdep_assert_held(&wq_pool_mutex);
attrs = alloc_workqueue_attrs();
if (!attrs)
return NULL;
copy_workqueue_attrs(attrs, wq->unbound_attrs);
return attrs;
}
static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct workqueue_struct *wq = dev_to_wq(dev);
struct workqueue_attrs *attrs;
int ret = -ENOMEM;
apply_wqattrs_lock();
attrs = wq_sysfs_prep_attrs(wq);
if (!attrs)
goto out_unlock;
if (sscanf(buf, "%d", &attrs->nice) == 1 &&
attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
ret = apply_workqueue_attrs_locked(wq, attrs);
else
ret = -EINVAL;
out_unlock:
apply_wqattrs_unlock();
free_workqueue_attrs(attrs);
return ret ?: count;
}
static ssize_t wq_cpumask_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
int written;
mutex_lock(&wq->mutex);
written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
cpumask_pr_args(wq->unbound_attrs->cpumask));
mutex_unlock(&wq->mutex);
return written;
}
static ssize_t wq_cpumask_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct workqueue_struct *wq = dev_to_wq(dev);
struct workqueue_attrs *attrs;
int ret = -ENOMEM;
apply_wqattrs_lock();
attrs = wq_sysfs_prep_attrs(wq);
if (!attrs)
goto out_unlock;
ret = cpumask_parse(buf, attrs->cpumask);
if (!ret)
ret = apply_workqueue_attrs_locked(wq, attrs);
out_unlock:
apply_wqattrs_unlock();
free_workqueue_attrs(attrs);
return ret ?: count;
}
static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
int written;
mutex_lock(&wq->mutex);
written = scnprintf(buf, PAGE_SIZE, "%d\n",
!wq->unbound_attrs->no_numa);
mutex_unlock(&wq->mutex);
return written;
}
static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct workqueue_struct *wq = dev_to_wq(dev);
struct workqueue_attrs *attrs;
int v, ret = -ENOMEM;
apply_wqattrs_lock();
attrs = wq_sysfs_prep_attrs(wq);
if (!attrs)
goto out_unlock;
ret = -EINVAL;
if (sscanf(buf, "%d", &v) == 1) {
attrs->no_numa = !v;
ret = apply_workqueue_attrs_locked(wq, attrs);
}
out_unlock:
apply_wqattrs_unlock();
free_workqueue_attrs(attrs);
return ret ?: count;
}
static struct device_attribute wq_sysfs_unbound_attrs[] = {
__ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
__ATTR(numa, 0644, wq_numa_show, wq_numa_store),
__ATTR_NULL,
};
static struct bus_type wq_subsys = {
.name = "workqueue",
.dev_groups = wq_sysfs_groups,
};
static ssize_t wq_unbound_cpumask_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
int written;
mutex_lock(&wq_pool_mutex);
written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
cpumask_pr_args(wq_unbound_cpumask));
mutex_unlock(&wq_pool_mutex);
return written;
}
static ssize_t wq_unbound_cpumask_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
cpumask_var_t cpumask;
int ret;
if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
return -ENOMEM;
ret = cpumask_parse(buf, cpumask);
if (!ret)
ret = workqueue_set_unbound_cpumask(cpumask);
free_cpumask_var(cpumask);
return ret ? ret : count;
}
static struct device_attribute wq_sysfs_cpumask_attr =
__ATTR(cpumask, 0644, wq_unbound_cpumask_show,
wq_unbound_cpumask_store);
static int __init wq_sysfs_init(void)
{
int err;
err = subsys_virtual_register(&wq_subsys, NULL);
if (err)
return err;
return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
}
core_initcall(wq_sysfs_init);
static void wq_device_release(struct device *dev)
{
struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
kfree(wq_dev);
}
/**
* workqueue_sysfs_register - make a workqueue visible in sysfs
* @wq: the workqueue to register
*
* Expose @wq in sysfs under /sys/bus/workqueue/devices.
* alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
* which is the preferred method.
*
* Workqueue user should use this function directly iff it wants to apply
* workqueue_attrs before making the workqueue visible in sysfs; otherwise,
* apply_workqueue_attrs() may race against userland updating the
* attributes.
*
* Return: 0 on success, -errno on failure.
*/
int workqueue_sysfs_register(struct workqueue_struct *wq)
{
struct wq_device *wq_dev;
int ret;
/*
* Adjusting max_active or creating new pwqs by applying
* attributes breaks ordering guarantee. Disallow exposing ordered
* workqueues.
*/
if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return -EINVAL;
wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
if (!wq_dev)
return -ENOMEM;
wq_dev->wq = wq;
wq_dev->dev.bus = &wq_subsys;
wq_dev->dev.release = wq_device_release;
dev_set_name(&wq_dev->dev, "%s", wq->name);
/*
* unbound_attrs are created separately. Suppress uevent until
* everything is ready.
*/
dev_set_uevent_suppress(&wq_dev->dev, true);
ret = device_register(&wq_dev->dev);
if (ret) {
put_device(&wq_dev->dev);
wq->wq_dev = NULL;
return ret;
}
if (wq->flags & WQ_UNBOUND) {
struct device_attribute *attr;
for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
ret = device_create_file(&wq_dev->dev, attr);
if (ret) {
device_unregister(&wq_dev->dev);
wq->wq_dev = NULL;
return ret;
}
}
}
dev_set_uevent_suppress(&wq_dev->dev, false);
kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
return 0;
}
/**
* workqueue_sysfs_unregister - undo workqueue_sysfs_register()
* @wq: the workqueue to unregister
*
* If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
*/
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
{
struct wq_device *wq_dev = wq->wq_dev;
if (!wq->wq_dev)
return;
wq->wq_dev = NULL;
device_unregister(&wq_dev->dev);
}
#else /* CONFIG_SYSFS */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
#endif /* CONFIG_SYSFS */
/*
* Workqueue watchdog.
*
* Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
* flush dependency, a concurrency managed work item which stays RUNNING
* indefinitely. Workqueue stalls can be very difficult to debug as the
* usual warning mechanisms don't trigger and internal workqueue state is
* largely opaque.
*
* Workqueue watchdog monitors all worker pools periodically and dumps
* state if some pools failed to make forward progress for a while where
* forward progress is defined as the first item on ->worklist changing.
*
* This mechanism is controlled through the kernel parameter
* "workqueue.watchdog_thresh" which can be updated at runtime through the
* corresponding sysfs parameter file.
*/
#ifdef CONFIG_WQ_WATCHDOG
static unsigned long wq_watchdog_thresh = 30;
static struct timer_list wq_watchdog_timer;
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
static void wq_watchdog_reset_touched(void)
{
int cpu;
wq_watchdog_touched = jiffies;
for_each_possible_cpu(cpu)
per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
}
static void wq_watchdog_timer_fn(struct timer_list *unused)
{
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
bool lockup_detected = false;
unsigned long now = jiffies;
struct worker_pool *pool;
int pi;
if (!thresh)
return;
rcu_read_lock();
for_each_pool(pool, pi) {
unsigned long pool_ts, touched, ts;
if (list_empty(&pool->worklist))
continue;
/*
* If a virtual machine is stopped by the host it can look to
* the watchdog like a stall.
*/
kvm_check_and_clear_guest_paused();
/* get the latest of pool and touched timestamps */
if (pool->cpu >= 0)
touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
else
touched = READ_ONCE(wq_watchdog_touched);
pool_ts = READ_ONCE(pool->watchdog_ts);
if (time_after(pool_ts, touched))
ts = pool_ts;
else
ts = touched;
/* did we stall? */
if (time_after(now, ts + thresh)) {
lockup_detected = true;
pr_emerg("BUG: workqueue lockup - pool");
pr_cont_pool_info(pool);
pr_cont(" stuck for %us!\n",
jiffies_to_msecs(now - pool_ts) / 1000);
}
}
rcu_read_unlock();
if (lockup_detected)
show_workqueue_state();
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh);
}
notrace void wq_watchdog_touch(int cpu)
{
if (cpu >= 0)
per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
wq_watchdog_touched = jiffies;
}
static void wq_watchdog_set_thresh(unsigned long thresh)
{
wq_watchdog_thresh = 0;
del_timer_sync(&wq_watchdog_timer);
if (thresh) {
wq_watchdog_thresh = thresh;
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
}
}
static int wq_watchdog_param_set_thresh(const char *val,
const struct kernel_param *kp)
{
unsigned long thresh;
int ret;
ret = kstrtoul(val, 0, &thresh);
if (ret)
return ret;
if (system_wq)
wq_watchdog_set_thresh(thresh);
else
wq_watchdog_thresh = thresh;
return 0;
}
static const struct kernel_param_ops wq_watchdog_thresh_ops = {
.set = wq_watchdog_param_set_thresh,
.get = param_get_ulong,
};
module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
0644);
static void wq_watchdog_init(void)
{
timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
wq_watchdog_set_thresh(wq_watchdog_thresh);
}
#else /* CONFIG_WQ_WATCHDOG */
static inline void wq_watchdog_init(void) { }
#endif /* CONFIG_WQ_WATCHDOG */
static void __init wq_numa_init(void)
{
cpumask_var_t *tbl;
int node, cpu;
if (num_possible_nodes() <= 1)
return;
if (wq_disable_numa) {
pr_info("workqueue: NUMA affinity support disabled\n");
return;
}
for_each_possible_cpu(cpu) {
if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) {
pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
return;
}
}
wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs();
BUG_ON(!wq_update_unbound_numa_attrs_buf);
/*
* We want masks of possible CPUs of each node which isn't readily
* available. Build one from cpu_to_node() which should have been
* fully initialized by now.
*/
tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL);
BUG_ON(!tbl);
for_each_node(node)
BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
node_online(node) ? node : NUMA_NO_NODE));
for_each_possible_cpu(cpu) {
node = cpu_to_node(cpu);
cpumask_set_cpu(cpu, tbl[node]);
}
wq_numa_possible_cpumask = tbl;
wq_numa_enabled = true;
}
/**
* workqueue_init_early - early init for workqueue subsystem
*
* This is the first half of two-staged workqueue subsystem initialization
* and invoked as soon as the bare basics - memory allocation, cpumasks and
* idr are up. It sets up all the data structures and system workqueues
* and allows early boot code to create workqueues and queue/cancel work
* items. Actual work item execution starts only after kthreads can be
* created and scheduled right before early initcalls.
*/
void __init workqueue_init_early(void)
{
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
int i, cpu;
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
/* initialize CPU pools */
for_each_possible_cpu(cpu) {
struct worker_pool *pool;
i = 0;
for_each_cpu_worker_pool(pool, cpu) {
BUG_ON(init_worker_pool(pool));
pool->cpu = cpu;
cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
pool->attrs->nice = std_nice[i++];
pool->node = cpu_to_node(cpu);
/* alloc pool ID */
mutex_lock(&wq_pool_mutex);
BUG_ON(worker_pool_assign_id(pool));
mutex_unlock(&wq_pool_mutex);
}
}
/* create default unbound and ordered wq attrs */
for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
struct workqueue_attrs *attrs;
BUG_ON(!(attrs = alloc_workqueue_attrs()));
attrs->nice = std_nice[i];
unbound_std_wq_attrs[i] = attrs;
/*
* An ordered wq should have only one pwq as ordering is
* guaranteed by max_active which is enforced by pwqs.
* Turn off NUMA so that dfl_pwq is used for all nodes.
*/
BUG_ON(!(attrs = alloc_workqueue_attrs()));
attrs->nice = std_nice[i];
attrs->no_numa = true;
ordered_wq_attrs[i] = attrs;
}
system_wq = alloc_workqueue("events", 0, 0);
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
WQ_UNBOUND_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
WQ_POWER_EFFICIENT, 0);
system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
WQ_FREEZABLE | WQ_POWER_EFFICIENT,
0);
BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
!system_unbound_wq || !system_freezable_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq);
}
/**
* workqueue_init - bring workqueue subsystem fully online
*
* This is the latter half of two-staged workqueue subsystem initialization
* and invoked as soon as kthreads can be created and scheduled.
* Workqueues have been created and work items queued on them, but there
* are no kworkers executing the work items yet. Populate the worker pools
* with the initial workers and enable future kworker creations.
*/
void __init workqueue_init(void)
{
struct workqueue_struct *wq;
struct worker_pool *pool;
int cpu, bkt;
/*
* It'd be simpler to initialize NUMA in workqueue_init_early() but
* CPU to node mapping may not be available that early on some
* archs such as power and arm64. As per-cpu pools created
* previously could be missing node hint and unbound pools NUMA
* affinity, fix them up.
*
* Also, while iterating workqueues, create rescuers if requested.
*/
wq_numa_init();
mutex_lock(&wq_pool_mutex);
for_each_possible_cpu(cpu) {
for_each_cpu_worker_pool(pool, cpu) {
pool->node = cpu_to_node(cpu);
}
}
list_for_each_entry(wq, &workqueues, list) {
wq_update_unbound_numa(wq, smp_processor_id(), true);
WARN(init_rescuer(wq),
"workqueue: failed to create early rescuer for %s",
wq->name);
}
mutex_unlock(&wq_pool_mutex);
/* create the initial workers */
for_each_online_cpu(cpu) {
for_each_cpu_worker_pool(pool, cpu) {
pool->flags &= ~POOL_DISASSOCIATED;
BUG_ON(!create_worker(pool));
}
}
hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
BUG_ON(!create_worker(pool));
wq_online = true;
wq_watchdog_init();
}
// SPDX-License-Identifier: GPL-2.0-only
/*
* Access kernel or user memory without faulting.
*/
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
size_t size)
{
return true;
}
#ifdef HAVE_GET_KERNEL_NOFAULT
#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \
while (len >= sizeof(type)) { \
__get_kernel_nofault(dst, src, type, err_label); \
dst += sizeof(type); \
src += sizeof(type); \
len -= sizeof(type); \
}
long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
{
unsigned long align = 0;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
align = (unsigned long)dst | (unsigned long)src;
if (!copy_from_kernel_nofault_allowed(src, size))
return -ERANGE;
pagefault_disable();
if (!(align & 7))
copy_from_kernel_nofault_loop(dst, src, size, u64, Efault);
if (!(align & 3))
copy_from_kernel_nofault_loop(dst, src, size, u32, Efault);
if (!(align & 1))
copy_from_kernel_nofault_loop(dst, src, size, u16, Efault); copy_from_kernel_nofault_loop(dst, src, size, u8, Efault);
pagefault_enable();
return 0;
Efault:
pagefault_enable();
return -EFAULT;
}
EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);
#define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \
while (len >= sizeof(type)) { \
__put_kernel_nofault(dst, src, type, err_label); \
dst += sizeof(type); \
src += sizeof(type); \
len -= sizeof(type); \
}
long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
{
unsigned long align = 0;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
align = (unsigned long)dst | (unsigned long)src;
pagefault_disable();
if (!(align & 7))
copy_to_kernel_nofault_loop(dst, src, size, u64, Efault);
if (!(align & 3))
copy_to_kernel_nofault_loop(dst, src, size, u32, Efault);
if (!(align & 1))
copy_to_kernel_nofault_loop(dst, src, size, u16, Efault);
copy_to_kernel_nofault_loop(dst, src, size, u8, Efault);
pagefault_enable();
return 0;
Efault:
pagefault_enable();
return -EFAULT;
}
long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
const void *src = unsafe_addr;
if (unlikely(count <= 0))
return 0;
if (!copy_from_kernel_nofault_allowed(unsafe_addr, count))
return -ERANGE;
pagefault_disable();
do {
__get_kernel_nofault(dst, src, u8, Efault);
dst++;
src++;
} while (dst[-1] && src - unsafe_addr < count);
pagefault_enable();
dst[-1] = '\0';
return src - unsafe_addr;
Efault:
pagefault_enable();
dst[-1] = '\0';
return -EFAULT;
}
#else /* HAVE_GET_KERNEL_NOFAULT */
/**
* copy_from_kernel_nofault(): safely attempt to read from kernel-space
* @dst: pointer to the buffer that shall take the data
* @src: address to read from
* @size: size of the data chunk
*
* Safely read from kernel address @src to the buffer at @dst. If a kernel
* fault happens, handle that and return -EFAULT. If @src is not a valid kernel
* address, return -ERANGE.
*
* We ensure that the copy_from_user is executed in atomic context so that
* do_page_fault() doesn't attempt to take mmap_lock. This makes
* copy_from_kernel_nofault() suitable for use within regions where the caller
* already holds mmap_lock, or other locks which nest inside mmap_lock.
*/
long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
if (!copy_from_kernel_nofault_allowed(src, size))
return -ERANGE;
set_fs(KERNEL_DS);
pagefault_disable();
ret = __copy_from_user_inatomic(dst, (__force const void __user *)src,
size);
pagefault_enable();
set_fs(old_fs);
if (ret)
return -EFAULT;
return 0;
}
EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);
/**
* copy_to_kernel_nofault(): safely attempt to write to a location
* @dst: address to write to
* @src: pointer to the data that shall be written
* @size: size of the data chunk
*
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
pagefault_disable();
ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
pagefault_enable();
set_fs(old_fs);
if (ret)
return -EFAULT;
return 0;
}
/**
* strncpy_from_kernel_nofault: - Copy a NUL terminated string from unsafe
* address.
* @dst: Destination address, in kernel space. This buffer must be at
* least @count bytes long.
* @unsafe_addr: Unsafe address.
* @count: Maximum number of bytes to copy, including the trailing NUL.
*
* Copies a NUL-terminated string from unsafe address to kernel buffer.
*
* On success, returns the length of the string INCLUDING the trailing NUL.
*
* If access fails, returns -EFAULT (some data may have been copied and the
* trailing NUL added). If @unsafe_addr is not a valid kernel address, return
* -ERANGE.
*
* If @count is smaller than the length of the string, copies @count-1 bytes,
* sets the last byte of @dst buffer to NUL and returns @count.
*/
long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
mm_segment_t old_fs = get_fs();
const void *src = unsafe_addr;
long ret;
if (unlikely(count <= 0))
return 0;
if (!copy_from_kernel_nofault_allowed(unsafe_addr, count))
return -ERANGE;
set_fs(KERNEL_DS);
pagefault_disable();
do {
ret = __get_user(*dst++, (const char __user __force *)src++);
} while (dst[-1] && ret == 0 && src - unsafe_addr < count);
dst[-1] = '\0';
pagefault_enable();
set_fs(old_fs);
return ret ? -EFAULT : src - unsafe_addr;
}
#endif /* HAVE_GET_KERNEL_NOFAULT */
/**
* copy_from_user_nofault(): safely attempt to read from a user-space location
* @dst: pointer to the buffer that shall take the data
* @src: address to read from. This must be a user address.
* @size: size of the data chunk
*
* Safely read from user address @src to the buffer at @dst. If a kernel fault
* happens, handle that and return -EFAULT.
*/
long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
{
long ret = -EFAULT;
mm_segment_t old_fs = force_uaccess_begin();
if (access_ok(src, size)) {
pagefault_disable();
ret = __copy_from_user_inatomic(dst, src, size);
pagefault_enable();
}
force_uaccess_end(old_fs);
if (ret)
return -EFAULT;
return 0;
}
EXPORT_SYMBOL_GPL(copy_from_user_nofault);
/**
* copy_to_user_nofault(): safely attempt to write to a user-space location
* @dst: address to write to
* @src: pointer to the data that shall be written
* @size: size of the data chunk
*
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
long copy_to_user_nofault(void __user *dst, const void *src, size_t size)
{
long ret = -EFAULT;
mm_segment_t old_fs = force_uaccess_begin();
if (access_ok(dst, size)) {
pagefault_disable();
ret = __copy_to_user_inatomic(dst, src, size);
pagefault_enable();
}
force_uaccess_end(old_fs);
if (ret)
return -EFAULT;
return 0;
}
EXPORT_SYMBOL_GPL(copy_to_user_nofault);
/**
* strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user
* address.
* @dst: Destination address, in kernel space. This buffer must be at
* least @count bytes long.
* @unsafe_addr: Unsafe user address.
* @count: Maximum number of bytes to copy, including the trailing NUL.
*
* Copies a NUL-terminated string from unsafe user address to kernel buffer.
*
* On success, returns the length of the string INCLUDING the trailing NUL.
*
* If access fails, returns -EFAULT (some data may have been copied
* and the trailing NUL added).
*
* If @count is smaller than the length of the string, copies @count-1 bytes,
* sets the last byte of @dst buffer to NUL and returns @count.
*/
long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
long count)
{
mm_segment_t old_fs;
long ret;
if (unlikely(count <= 0))
return 0;
old_fs = force_uaccess_begin();
pagefault_disable();
ret = strncpy_from_user(dst, unsafe_addr, count);
pagefault_enable();
force_uaccess_end(old_fs);
if (ret >= count) {
ret = count;
dst[ret - 1] = '\0';
} else if (ret > 0) {
ret++;
}
return ret;
}
/**
* strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL.
* @unsafe_addr: The string to measure.
* @count: Maximum count (including NUL)
*
* Get the size of a NUL-terminated string in user space without pagefault.
*
* Returns the size of the string INCLUDING the terminating NUL.
*
* If the string is too long, returns a number larger than @count. User
* has to check the return value against "> count".
* On exception (or invalid count), returns 0.
*
* Unlike strnlen_user, this can be used from IRQ handler etc. because
* it disables pagefaults.
*/
long strnlen_user_nofault(const void __user *unsafe_addr, long count)
{
mm_segment_t old_fs;
int ret;
old_fs = force_uaccess_begin();
pagefault_disable();
ret = strnlen_user(unsafe_addr, count);
pagefault_enable();
force_uaccess_end(old_fs);
return ret;
}
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
*/
/*
* Basic idea behind the notification queue: An fsnotify group (like inotify)
* sends the userspace notification about events asynchronously some time after
* the event happened. When inotify gets an event it will need to add that
* event to the group notify queue. Since a single event might need to be on
* multiple group's notification queues we can't add the event directly to each
* queue and instead add a small "event_holder" to each queue. This event_holder
* has a pointer back to the original event. Since the majority of events are
* going to end up on one, and only one, notification queue we embed one
* event_holder into each event. This means we have a single allocation instead
* of always needing two. If the embedded event_holder is already in use by
* another group a new event_holder (from fsnotify_event_holder_cachep) will be
* allocated and used.
*/
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/path.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
/**
* fsnotify_get_cookie - return a unique cookie for use in synchronizing events.
* Called from fsnotify_move, which is inlined into filesystem modules.
*/
u32 fsnotify_get_cookie(void)
{
return atomic_inc_return(&fsnotify_sync_cookie);
}
EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
void fsnotify_destroy_event(struct fsnotify_group *group,
struct fsnotify_event *event)
{
/* Overflow events are per-group and we don't want to free them */
if (!event || event == group->overflow_event)
return;
/*
* If the event is still queued, we have a problem... Do an unreliable
* lockless check first to avoid locking in the common case. The
* locking may be necessary for permission events which got removed
* from the list by a different CPU than the one freeing the event.
*/
if (!list_empty(&event->list)) {
spin_lock(&group->notification_lock);
WARN_ON(!list_empty(&event->list));
spin_unlock(&group->notification_lock);
}
group->ops->free_event(event);
}
/*
* Try to add an event to the notification queue.
* The group can later pull this event off the queue to deal with.
* The group can use the @merge hook to merge the event with a queued event.
* The group can use the @insert hook to insert the event into hash table.
* The function returns:
* 0 if the event was added to a queue
* 1 if the event was merged with some other queued event
* 2 if the event was not queued - either the queue of events has overflown
* or the group is shutting down.
*/
int fsnotify_add_event(struct fsnotify_group *group,
struct fsnotify_event *event,
int (*merge)(struct fsnotify_group *,
struct fsnotify_event *),
void (*insert)(struct fsnotify_group *,
struct fsnotify_event *))
{
int ret = 0;
struct list_head *list = &group->notification_list;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
spin_lock(&group->notification_lock);
if (group->shutdown) {
spin_unlock(&group->notification_lock);
return 2;
}
if (event == group->overflow_event || group->q_len >= group->max_events) {
ret = 2;
/* Queue overflow event only if it isn't already queued */
if (!list_empty(&group->overflow_event->list)) {
spin_unlock(&group->notification_lock);
return ret;
}
event = group->overflow_event;
goto queue;
}
if (!list_empty(list) && merge) { ret = merge(group, event); if (ret) {
spin_unlock(&group->notification_lock);
return ret;
}
}
queue:
group->q_len++;
list_add_tail(&event->list, list);
if (insert)
insert(group, event);
spin_unlock(&group->notification_lock);
wake_up(&group->notification_waitq);
kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);
return ret;
}
void fsnotify_remove_queued_event(struct fsnotify_group *group,
struct fsnotify_event *event)
{
assert_spin_locked(&group->notification_lock);
/*
* We need to init list head for the case of overflow event so that
* check in fsnotify_add_event() works
*/
list_del_init(&event->list);
group->q_len--;
}
/*
* Return the first event on the notification list without removing it.
* Returns NULL if the list is empty.
*/
struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
{
assert_spin_locked(&group->notification_lock);
if (fsnotify_notify_queue_is_empty(group))
return NULL;
return list_first_entry(&group->notification_list,
struct fsnotify_event, list);
}
/*
* Remove and return the first event from the notification list. It is the
* responsibility of the caller to destroy the obtained event
*/
struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
{
struct fsnotify_event *event = fsnotify_peek_first_event(group);
if (!event)
return NULL;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
fsnotify_remove_queued_event(group, event);
return event;
}
/*
* Called when a group is being torn down to clean up any outstanding
* event notifications.
*/
void fsnotify_flush_notify(struct fsnotify_group *group)
{
struct fsnotify_event *event;
spin_lock(&group->notification_lock);
while (!fsnotify_notify_queue_is_empty(group)) {
event = fsnotify_remove_first_event(group);
spin_unlock(&group->notification_lock);
fsnotify_destroy_event(group, event);
spin_lock(&group->notification_lock);
}
spin_unlock(&group->notification_lock);
}
// SPDX-License-Identifier: GPL-2.0
/*
* Implementations of the security context functions.
*
* Author: Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2020 Red Hat, Inc.
*/
#include <linux/jhash.h>
#include "context.h"
#include "mls.h"
u32 context_compute_hash(const struct context *c)
{
u32 hash = 0;
/*
* If a context is invalid, it will always be represented by a
* context struct with only the len & str set (and vice versa)
* under a given policy. Since context structs from different
* policies should never meet, it is safe to hash valid and
* invalid contexts differently. The context_cmp() function
* already operates under the same assumption.
*/
if (c->len) return full_name_hash(NULL, c->str, c->len); hash = jhash_3words(c->user, c->role, c->type, hash);
hash = mls_range_hash(&c->range, hash);
return hash;
}
// SPDX-License-Identifier: GPL-2.0
/*
* FPU signal frame handling routines.
*/
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/pagemap.h>
#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/xstate.h>
#include <asm/sigframe.h>
#include <asm/trace/fpu.h>
static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init;
static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init;
/*
* Check for the presence of extended state information in the
* user fpstate pointer in the sigcontext.
*/
static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
struct _fpx_sw_bytes *fx_sw)
{
int min_xstate_size = sizeof(struct fxregs_state) +
sizeof(struct xstate_header);
void __user *fpstate = fxbuf;
unsigned int magic2;
if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw)))
return -EFAULT;
/* Check for the first magic field and other error scenarios. */
if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
fx_sw->xstate_size < min_xstate_size ||
fx_sw->xstate_size > fpu_user_xstate_size ||
fx_sw->xstate_size > fx_sw->extended_size)
goto setfx;
/*
* Check for the presence of second magic word at the end of memory
* layout. This detects the case where the user just copied the legacy
* fpstate layout with out copying the extended state information
* in the memory layout.
*/
if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)))
return -EFAULT;
if (likely(magic2 == FP_XSTATE_MAGIC2))
return 0;
setfx:
trace_x86_fpu_xstate_check_failed(¤t->thread.fpu);
/* Set the parameters for fx only state */
fx_sw->magic1 = 0;
fx_sw->xstate_size = sizeof(struct fxregs_state);
fx_sw->xfeatures = XFEATURE_MASK_FPSSE;
return 0;
}
/*
* Signal frame handlers.
*/
static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
{
if (use_fxsr()) {
struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
struct user_i387_ia32_struct env;
struct _fpstate_32 __user *fp = buf;
fpregs_lock();
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
fxsave(&tsk->thread.fpu.state.fxsave);
fpregs_unlock();
convert_from_fxsr(&env, tsk);
if (__copy_to_user(buf, &env, sizeof(env)) ||
__put_user(xsave->i387.swd, &fp->status) ||
__put_user(X86_FXSR_MAGIC, &fp->magic))
return -1;
} else {
struct fregs_state __user *fp = buf;
u32 swd;
if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
return -1;
}
return 0;
}
static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
{
struct xregs_state __user *x = buf;
struct _fpx_sw_bytes *sw_bytes;
u32 xfeatures;
int err;
/* Setup the bytes not touched by the [f]xsave and reserved for SW. */
sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
if (!use_xsave())
return err;
err |= __put_user(FP_XSTATE_MAGIC2,
(__u32 __user *)(buf + fpu_user_xstate_size));
/*
* Read the xfeatures which we copied (directly from the cpu or
* from the state in task struct) to the user buffers.
*/
err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
/*
* For legacy compatible, we always set FP/SSE bits in the bit
* vector while saving the state to the user context. This will
* enable us capturing any changes(during sigreturn) to
* the FP/SSE bits by the legacy applications which don't touch
* xfeatures in the xsave header.
*
* xsave aware apps can change the xfeatures in the xsave
* header as well as change any contents in the memory layout.
* xrestore as part of sigreturn will capture all the changes.
*/
xfeatures |= XFEATURE_MASK_FPSSE;
err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures);
return err;
}
static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
{
int err;
if (use_xsave())
err = xsave_to_user_sigframe(buf);
else if (use_fxsr())
err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
else
err = fnsave_to_user_sigframe((struct fregs_state __user *) buf);
if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size))
err = -EFAULT;
return err;
}
/*
* Save the fpu, extended register state to the user signal frame.
*
* 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
* state is copied.
* 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
*
* buf == buf_fx for 64-bit frames and 32-bit fsave frame.
* buf != buf_fx for 32-bit frames with fxstate.
*
* Try to save it directly to the user frame with disabled page fault handler.
* If this fails then do the slow path where the FPU state is first saved to
* task's fpu->state and then copy it to the user frame pointed to by the
* aligned pointer 'buf_fx'.
*
* If this is a 32-bit frame with fxstate, put a fsave header before
* the aligned state at 'buf_fx'.
*
* For [f]xsave state, update the SW reserved fields in the [f]xsave frame
* indicating the absence/presence of the extended state to the user.
*/
int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
{
struct task_struct *tsk = current;
int ia32_fxstate = (buf != buf_fx);
int ret;
ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
IS_ENABLED(CONFIG_IA32_EMULATION));
if (!static_cpu_has(X86_FEATURE_FPU)) {
struct user_i387_ia32_struct fp;
fpregs_soft_get(current, NULL, (struct membuf){.p = &fp,
.left = sizeof(fp)});
return copy_to_user(buf, &fp, sizeof(fp)) ? -EFAULT : 0;
}
if (!access_ok(buf, size))
return -EACCES;
retry:
/*
* Load the FPU registers if they are not valid for the current task.
* With a valid FPU state we can attempt to save the state directly to
* userland's stack frame which will likely succeed. If it does not,
* resolve the fault in the user memory and try again.
*/
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
fpregs_restore_userregs();
pagefault_disable();
ret = copy_fpregs_to_sigframe(buf_fx);
pagefault_enable();
fpregs_unlock();
if (ret) {
if (!fault_in_writeable(buf_fx, fpu_user_xstate_size))
goto retry;
return -EFAULT;
}
/* Save the fsave header for the 32-bit frames. */
if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
return -1;
if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
return -1;
return 0;
}
static int __restore_fpregs_from_user(void __user *buf, u64 xrestore,
bool fx_only)
{
if (use_xsave()) {
u64 init_bv = xfeatures_mask_uabi() & ~xrestore;
int ret;
if (likely(!fx_only))
ret = xrstor_from_user_sigframe(buf, xrestore);
else
ret = fxrstor_from_user_sigframe(buf);
if (!ret && unlikely(init_bv))
os_xrstor(&init_fpstate.xsave, init_bv);
return ret;
} else if (use_fxsr()) {
return fxrstor_from_user_sigframe(buf);
} else {
return frstor_from_user_sigframe(buf);
}
}
/*
* Attempt to restore the FPU registers directly from user memory.
* Pagefaults are handled and any errors returned are fatal.
*/
static int restore_fpregs_from_user(void __user *buf, u64 xrestore,
bool fx_only, unsigned int size)
{
struct fpu *fpu = ¤t->thread.fpu;
int ret;
retry:
fpregs_lock();
pagefault_disable();
ret = __restore_fpregs_from_user(buf, xrestore, fx_only);
pagefault_enable();
if (unlikely(ret)) {
/*
* The above did an FPU restore operation, restricted to
* the user portion of the registers, and failed, but the
* microcode might have modified the FPU registers
* nevertheless.
*
* If the FPU registers do not belong to current, then
* invalidate the FPU register state otherwise the task
* might preempt current and return to user space with
* corrupted FPU registers.
*/
if (test_thread_flag(TIF_NEED_FPU_LOAD))
__cpu_invalidate_fpregs_state();
fpregs_unlock();
/* Try to handle #PF, but anything else is fatal. */
if (ret != -EFAULT)
return -EINVAL;
if (!fault_in_readable(buf, size))
goto retry;
return -EFAULT;
}
/*
* Restore supervisor states: previous context switch etc has done
* XSAVES and saved the supervisor states in the kernel buffer from
* which they can be restored now.
*
* It would be optimal to handle this with a single XRSTORS, but
* this does not work because the rest of the FPU registers have
* been restored from a user buffer directly.
*/
if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor())
os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor());
fpregs_mark_activate();
fpregs_unlock();
return 0;
}
static int __fpu_restore_sig(void __user *buf, void __user *buf_fx,
bool ia32_fxstate)
{
int state_size = fpu_kernel_xstate_size;
struct task_struct *tsk = current;
struct fpu *fpu = &tsk->thread.fpu;
struct user_i387_ia32_struct env;
u64 user_xfeatures = 0;
bool fx_only = false;
int ret;
if (use_xsave()) {
struct _fpx_sw_bytes fx_sw_user;
ret = check_xstate_in_sigframe(buf_fx, &fx_sw_user);
if (unlikely(ret))
return ret;
fx_only = !fx_sw_user.magic1;
state_size = fx_sw_user.xstate_size;
user_xfeatures = fx_sw_user.xfeatures;
} else {
user_xfeatures = XFEATURE_MASK_FPSSE;
}
if (likely(!ia32_fxstate)) {
/*
* Attempt to restore the FPU registers directly from user
* memory. For that to succeed, the user access cannot cause page
* faults. If it does, fall back to the slow path below, going
* through the kernel buffer with the enabled pagefault handler.
*/
return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only,
state_size);
}
/*
* Copy the legacy state because the FP portion of the FX frame has
* to be ignored for histerical raisins. The legacy state is folded
* in once the larger state has been copied.
*/
ret = __copy_from_user(&env, buf, sizeof(env));
if (ret)
return ret;
/*
* By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is
* not modified on context switch and that the xstate is considered
* to be loaded again on return to userland (overriding last_cpu avoids
* the optimisation).
*/
fpregs_lock();
if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
/*
* If supervisor states are available then save the
* hardware state in current's fpstate so that the
* supervisor state is preserved. Save the full state for
* simplicity. There is no point in optimizing this by only
* saving the supervisor states and then shuffle them to
* the right place in memory. It's ia32 mode. Shrug.
*/
if (xfeatures_mask_supervisor())
os_xsave(&fpu->state.xsave);
set_thread_flag(TIF_NEED_FPU_LOAD);
}
__fpu_invalidate_fpregs_state(fpu);
__cpu_invalidate_fpregs_state();
fpregs_unlock();
if (use_xsave() && !fx_only) {
ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx);
if (ret)
return ret;
} else {
if (__copy_from_user(&fpu->state.fxsave, buf_fx,
sizeof(fpu->state.fxsave)))
return -EFAULT;
if (IS_ENABLED(CONFIG_X86_64)) {
/* Reject invalid MXCSR values. */
if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
} else {
/* Mask invalid bits out for historical reasons (broken hardware). */
fpu->state.fxsave.mxcsr &= mxcsr_feature_mask;
}
/* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */
if (use_xsave())
fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
}
/* Fold the legacy FP storage */
convert_to_fxsr(&fpu->state.fxsave, &env);
fpregs_lock();
if (use_xsave()) {
/*
* Remove all UABI feature bits not set in user_xfeatures
* from the memory xstate header which makes the full
* restore below bring them into init state. This works for
* fx_only mode as well because that has only FP and SSE
* set in user_xfeatures.
*
* Preserve supervisor states!
*/
u64 mask = user_xfeatures | xfeatures_mask_supervisor();
fpu->state.xsave.header.xfeatures &= mask;
ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all);
} else {
ret = fxrstor_safe(&fpu->state.fxsave);
}
if (likely(!ret))
fpregs_mark_activate();
fpregs_unlock();
return ret;
}
static inline int xstate_sigframe_size(void)
{
return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE :
fpu_user_xstate_size;
}
/*
* Restore FPU state from a sigframe:
*/
int fpu__restore_sig(void __user *buf, int ia32_frame)
{
unsigned int size = xstate_sigframe_size();
struct fpu *fpu = ¤t->thread.fpu;
void __user *buf_fx = buf;
bool ia32_fxstate = false;
int ret;
if (unlikely(!buf)) {
fpu__clear_user_states(fpu);
return 0;
}
ia32_frame &= (IS_ENABLED(CONFIG_X86_32) ||
IS_ENABLED(CONFIG_IA32_EMULATION));
/*
* Only FXSR enabled systems need the FX state quirk.
* FRSTOR does not need it and can use the fast path.
*/
if (ia32_frame && use_fxsr()) {
buf_fx = buf + sizeof(struct fregs_state);
size += sizeof(struct fregs_state);
ia32_fxstate = true;
}
if (!access_ok(buf, size)) {
ret = -EACCES;
goto out;
}
if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) {
ret = fpregs_soft_set(current, NULL, 0,
sizeof(struct user_i387_ia32_struct),
NULL, buf);
} else {
ret = __fpu_restore_sig(buf, buf_fx, ia32_fxstate);
}
out:
if (unlikely(ret))
fpu__clear_user_states(fpu);
return ret;
}
unsigned long
fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long *buf_fx, unsigned long *size)
{
unsigned long frame_size = xstate_sigframe_size();
*buf_fx = sp = round_down(sp - frame_size, 64);
if (ia32_frame && use_fxsr()) {
frame_size += sizeof(struct fregs_state);
sp -= sizeof(struct fregs_state);
}
*size = frame_size;
return sp;
}
unsigned long fpu__get_fpstate_size(void)
{
unsigned long ret = xstate_sigframe_size();
/*
* This space is needed on (most) 32-bit kernels, or when a 32-bit
* app is running on a 64-bit kernel. To keep things simple, just
* assume the worst case and always include space for 'freg_state',
* even for 64-bit apps on 64-bit kernels. This wastes a bit of
* space, but keeps the code simple.
*/
if ((IS_ENABLED(CONFIG_IA32_EMULATION) ||
IS_ENABLED(CONFIG_X86_32)) && use_fxsr())
ret += sizeof(struct fregs_state);
return ret;
}
/*
* Prepare the SW reserved portion of the fxsave memory layout, indicating
* the presence of the extended state information in the memory layout
* pointed by the fpstate pointer in the sigcontext.
* This will be saved when ever the FP and extended state context is
* saved on the user stack during the signal handler delivery to the user.
*/
void fpu__init_prepare_fx_sw_frame(void)
{
int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE;
fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
fx_sw_reserved.extended_size = size;
fx_sw_reserved.xfeatures = xfeatures_mask_uabi();
fx_sw_reserved.xstate_size = fpu_user_xstate_size;
if (IS_ENABLED(CONFIG_IA32_EMULATION) ||
IS_ENABLED(CONFIG_X86_32)) {
int fsave_header_size = sizeof(struct fregs_state);
fx_sw_reserved_ia32 = fx_sw_reserved;
fx_sw_reserved_ia32.extended_size = size + fsave_header_size;
}
}
// SPDX-License-Identifier: GPL-2.0-only
/*
* umh - the kernel usermode helper
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/binfmts.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fs_struct.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/resource.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/rwsem.h>
#include <linux/ptrace.h>
#include <linux/async.h>
#include <linux/uaccess.h>
#include <linux/initrd.h>
#include <trace/events/module.h>
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
static DEFINE_SPINLOCK(umh_sysctl_lock);
static DECLARE_RWSEM(umhelper_sem);
static void call_usermodehelper_freeinfo(struct subprocess_info *info)
{
if (info->cleanup) (*info->cleanup)(info); kfree(info);
}
static void umh_complete(struct subprocess_info *sub_info)
{
struct completion *comp = xchg(&sub_info->complete, NULL);
/*
* See call_usermodehelper_exec(). If xchg() returns NULL
* we own sub_info, the UMH_KILLABLE caller has gone away
* or the caller used UMH_NO_WAIT.
*/
if (comp)
complete(comp);
else
call_usermodehelper_freeinfo(sub_info);
}
/*
* This is the task which runs the usermode application
*/
static int call_usermodehelper_exec_async(void *data)
{
struct subprocess_info *sub_info = data;
struct cred *new;
int retval;
spin_lock_irq(¤t->sighand->siglock);
flush_signal_handlers(current, 1);
spin_unlock_irq(¤t->sighand->siglock);
/*
* Initial kernel threads share ther FS with init, in order to
* get the init root directory. But we've now created a new
* thread that is going to execve a user process and has its own
* 'struct fs_struct'. Reset umask to the default.
*/
current->fs->umask = 0022;
/*
* Our parent (unbound workqueue) runs with elevated scheduling
* priority. Avoid propagating that into the userspace child.
*/
set_user_nice(current, 0);
retval = -ENOMEM;
new = prepare_kernel_cred(current);
if (!new)
goto out;
spin_lock(&umh_sysctl_lock);
new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
new->cap_inheritable);
spin_unlock(&umh_sysctl_lock);
if (sub_info->init) {
retval = sub_info->init(sub_info, new);
if (retval) {
abort_creds(new);
goto out;
}
}
commit_creds(new);
wait_for_initramfs();
retval = kernel_execve(sub_info->path,
(const char *const *)sub_info->argv,
(const char *const *)sub_info->envp);
out:
sub_info->retval = retval;
/*
* call_usermodehelper_exec_sync() will call umh_complete
* if UHM_WAIT_PROC.
*/
if (!(sub_info->wait & UMH_WAIT_PROC))
umh_complete(sub_info);
if (!retval)
return 0;
do_exit(0);
}
/* Handles UMH_WAIT_PROC. */
static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
{
pid_t pid;
/* If SIGCLD is ignored do_wait won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
if (pid < 0)
sub_info->retval = pid;
else
kernel_wait(pid, &sub_info->retval);
/* Restore default kernel sig handler */
kernel_sigaction(SIGCHLD, SIG_IGN);
umh_complete(sub_info);
}
/*
* We need to create the usermodehelper kernel thread from a task that is affine
* to an optimized set of CPUs (or nohz housekeeping ones) such that they
* inherit a widest affinity irrespective of call_usermodehelper() callers with
* possibly reduced affinity (eg: per-cpu workqueues). We don't want
* usermodehelper targets to contend a busy CPU.
*
* Unbound workqueues provide such wide affinity and allow to block on
* UMH_WAIT_PROC requests without blocking pending request (up to some limit).
*
* Besides, workqueues provide the privilege level that caller might not have
* to perform the usermodehelper request.
*
*/
static void call_usermodehelper_exec_work(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
if (sub_info->wait & UMH_WAIT_PROC) {
call_usermodehelper_exec_sync(sub_info);
} else {
pid_t pid;
/*
* Use CLONE_PARENT to reparent it to kthreadd; we do not
* want to pollute current->children, and we need a parent
* that always ignores SIGCHLD to ensure auto-reaping.
*/
pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
CLONE_PARENT | SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
umh_complete(sub_info);
}
}
}
/*
* If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
* (used for preventing user land processes from being created after the user
* land has been frozen during a system-wide hibernation or suspend operation).
* Should always be manipulated under umhelper_sem acquired for write.
*/
static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
/* Number of helpers running */
static atomic_t running_helpers = ATOMIC_INIT(0);
/*
* Wait queue head used by usermodehelper_disable() to wait for all running
* helpers to finish.
*/
static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
/*
* Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
* to become 'false'.
*/
static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
/*
* Time to wait for running_helpers to become zero before the setting of
* usermodehelper_disabled in usermodehelper_disable() fails
*/
#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
int usermodehelper_read_trylock(void)
{
DEFINE_WAIT(wait);
int ret = 0;
down_read(&umhelper_sem);
for (;;) {
prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
TASK_INTERRUPTIBLE);
if (!usermodehelper_disabled)
break;
if (usermodehelper_disabled == UMH_DISABLED)
ret = -EAGAIN;
up_read(&umhelper_sem);
if (ret)
break;
schedule();
try_to_freeze();
down_read(&umhelper_sem);
}
finish_wait(&usermodehelper_disabled_waitq, &wait);
return ret;
}
EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
long usermodehelper_read_lock_wait(long timeout)
{
DEFINE_WAIT(wait);
if (timeout < 0)
return -EINVAL;
down_read(&umhelper_sem);
for (;;) {
prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
TASK_UNINTERRUPTIBLE);
if (!usermodehelper_disabled)
break;
up_read(&umhelper_sem);
timeout = schedule_timeout(timeout);
if (!timeout)
break;
down_read(&umhelper_sem);
}
finish_wait(&usermodehelper_disabled_waitq, &wait);
return timeout;
}
EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
void usermodehelper_read_unlock(void)
{
up_read(&umhelper_sem);
}
EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
/**
* __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
* @depth: New value to assign to usermodehelper_disabled.
*
* Change the value of usermodehelper_disabled (under umhelper_sem locked for
* writing) and wakeup tasks waiting for it to change.
*/
void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
{
down_write(&umhelper_sem);
usermodehelper_disabled = depth;
wake_up(&usermodehelper_disabled_waitq);
up_write(&umhelper_sem);
}
/**
* __usermodehelper_disable - Prevent new helpers from being started.
* @depth: New value to assign to usermodehelper_disabled.
*
* Set usermodehelper_disabled to @depth and wait for running helpers to exit.
*/
int __usermodehelper_disable(enum umh_disable_depth depth)
{
long retval;
if (!depth)
return -EINVAL;
down_write(&umhelper_sem);
usermodehelper_disabled = depth;
up_write(&umhelper_sem);
/*
* From now on call_usermodehelper_exec() won't start any new
* helpers, so it is sufficient if running_helpers turns out to
* be zero at one point (it may be increased later, but that
* doesn't matter).
*/
retval = wait_event_timeout(running_helpers_waitq,
atomic_read(&running_helpers) == 0,
RUNNING_HELPERS_TIMEOUT);
if (retval)
return 0;
__usermodehelper_set_disable_depth(UMH_ENABLED);
return -EAGAIN;
}
static void helper_lock(void)
{
atomic_inc(&running_helpers);
smp_mb__after_atomic();
}
static void helper_unlock(void)
{
if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq);
}
/**
* call_usermodehelper_setup - prepare to call a usermode helper
* @path: path to usermode executable
* @argv: arg vector for process
* @envp: environment for process
* @gfp_mask: gfp mask for memory allocation
* @init: an init function
* @cleanup: a cleanup function
* @data: arbitrary context sensitive data
*
* Returns either %NULL on allocation failure, or a subprocess_info
* structure. This should be passed to call_usermodehelper_exec to
* exec the process and free the structure.
*
* The init function is used to customize the helper process prior to
* exec. A non-zero return code causes the process to error out, exit,
* and return the failure to the calling process
*
* The cleanup function is just before the subprocess_info is about to
* be freed. This can be used for freeing the argv and envp. The
* Function must be runnable in either a process context or the
* context in which call_usermodehelper_exec is called.
*/
struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
char **envp, gfp_t gfp_mask,
int (*init)(struct subprocess_info *info, struct cred *new),
void (*cleanup)(struct subprocess_info *info),
void *data)
{
struct subprocess_info *sub_info;
sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
if (!sub_info)
goto out;
INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
#ifdef CONFIG_STATIC_USERMODEHELPER
sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
#else
sub_info->path = path;
#endif
sub_info->argv = argv;
sub_info->envp = envp;
sub_info->cleanup = cleanup;
sub_info->init = init;
sub_info->data = data;
out:
return sub_info;
}
EXPORT_SYMBOL(call_usermodehelper_setup);
/**
* call_usermodehelper_exec - start a usermode application
* @sub_info: information about the subprocess
* @wait: wait for the application to finish and return status.
* when UMH_NO_WAIT don't wait at all, but you get no useful error back
* when the program couldn't be exec'ed. This makes it safe to call
* from interrupt context.
*
* Runs a user-space application. The application is started
* asynchronously if wait is not set, and runs as a child of system workqueues.
* (ie. it runs with full root capabilities and optimized affinity).
*
* Note: successful return value does not guarantee the helper was called at
* all. You can't rely on sub_info->{init,cleanup} being called even for
* UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers
* into a successful no-op.
*/
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
DECLARE_COMPLETION_ONSTACK(done);
int retval = 0;
if (!sub_info->path) {
call_usermodehelper_freeinfo(sub_info);
return -EINVAL;
}
helper_lock();
if (usermodehelper_disabled) {
retval = -EBUSY;
goto out;
}
/*
* If there is no binary for us to call, then just return and get out of
* here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and
* disable all call_usermodehelper() calls.
*/
if (strlen(sub_info->path) == 0)
goto out;
/*
* Set the completion pointer only if there is a waiter.
* This makes it possible to use umh_complete to free
* the data structure in case of UMH_NO_WAIT.
*/
sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
sub_info->wait = wait;
queue_work(system_unbound_wq, &sub_info->work);
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
if (wait & UMH_KILLABLE) {
retval = wait_for_completion_killable(&done);
if (!retval)
goto wait_done;
/* umh_complete() will see NULL and free sub_info */
if (xchg(&sub_info->complete, NULL))
goto unlock;
/* fallthrough, umh_complete() was already called */
}
wait_for_completion(&done);
wait_done:
retval = sub_info->retval;
out:
call_usermodehelper_freeinfo(sub_info);
unlock:
helper_unlock();
return retval;
}
EXPORT_SYMBOL(call_usermodehelper_exec);
/**
* call_usermodehelper() - prepare and start a usermode application
* @path: path to usermode executable
* @argv: arg vector for process
* @envp: environment for process
* @wait: wait for the application to finish and return status.
* when UMH_NO_WAIT don't wait at all, but you get no useful error back
* when the program couldn't be exec'ed. This makes it safe to call
* from interrupt context.
*
* This function is the equivalent to use call_usermodehelper_setup() and
* call_usermodehelper_exec().
*/
int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
{
struct subprocess_info *info;
gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
NULL, NULL, NULL);
if (info == NULL)
return -ENOMEM;
return call_usermodehelper_exec(info, wait);
}
EXPORT_SYMBOL(call_usermodehelper);
static int proc_cap_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
kernel_cap_t new_cap;
int err, i;
if (write && (!capable(CAP_SETPCAP) ||
!capable(CAP_SYS_MODULE)))
return -EPERM;
/*
* convert from the global kernel_cap_t to the ulong array to print to
* userspace if this is a read.
*/
spin_lock(&umh_sysctl_lock);
for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
if (table->data == CAP_BSET)
cap_array[i] = usermodehelper_bset.cap[i];
else if (table->data == CAP_PI)
cap_array[i] = usermodehelper_inheritable.cap[i];
else
BUG();
}
spin_unlock(&umh_sysctl_lock);
t = *table;
t.data = &cap_array;
/*
* actually read or write and array of ulongs from userspace. Remember
* these are least significant 32 bits first
*/
err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
/*
* convert from the sysctl array of ulongs to the kernel_cap_t
* internal representation
*/
for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
new_cap.cap[i] = cap_array[i];
/*
* Drop everything not in the new_cap (but don't add things)
*/
if (write) {
spin_lock(&umh_sysctl_lock);
if (table->data == CAP_BSET)
usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
if (table->data == CAP_PI)
usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
spin_unlock(&umh_sysctl_lock);
}
return 0;
}
struct ctl_table usermodehelper_table[] = {
{
.procname = "bset",
.data = CAP_BSET,
.maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
.mode = 0600,
.proc_handler = proc_cap_handler,
},
{
.procname = "inheritable",
.data = CAP_PI,
.maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
.mode = 0600,
.proc_handler = proc_cap_handler,
},
{ }
};
/*
* include/net/tipc.h: Include file for TIPC message header routines
*
* Copyright (c) 2017 Ericsson AB
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the names of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* Alternatively, this software may be distributed under the terms of the
* GNU General Public License ("GPL") version 2 as published by the Free
* Software Foundation.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _TIPC_HDR_H
#define _TIPC_HDR_H
#include <linux/random.h>
#define KEEPALIVE_MSG_MASK 0x0e080000 /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */
struct tipc_basic_hdr {
__be32 w[4];
};
static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr)
{
u32 w0 = ntohl(hdr->w[0]);
bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK;
__be32 key;
/* Return source node identity as key */
if (likely(!keepalive_msg))
return hdr->w[3];
/* Spread PROBE/PROBE_REPLY messages across the cores */
get_random_bytes(&key, sizeof(key));
return key;
}
#endif
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/printk.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* Modified to make sys_syslog() more flexible: added commands to
* return the last 4k of kernel messages, regardless of whether
* they've been read or not. Added option to suppress kernel printk's
* to the console. Added hook for sending the console messages
* elsewhere, in preparation for a serial line console (someday).
* Ted Ts'o, 2/11/93.
* Modified for sysctl support, 1/8/97, Chris Horn.
* Fixed SMP synchronization, 08/08/99, Manfred Spraul
* manfred@colorfullife.com
* Rewrote bits to get rid of console_lock
* 01Mar01 Andrew Morton
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/console.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/crash_core.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
#include <linux/cpu.h>
#include <linux/rculist.h>
#include <linux/poll.h>
#include <linux/irq_work.h>
#include <linux/ctype.h>
#include <linux/uio.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
#include <linux/uaccess.h>
#include <asm/sections.h>
#include <trace/events/initcall.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
#include "printk_ringbuffer.h"
#include "console_cmdline.h"
#include "braille.h"
#include "internal.h"
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
};
EXPORT_SYMBOL_GPL(console_printk);
atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
EXPORT_SYMBOL(ignore_console_lock_warning);
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
*/
int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);
/*
* console_sem protects the console_drivers list, and also
* provides serialisation for access to the entire console
* driver system.
*/
static DEFINE_SEMAPHORE(console_sem);
struct console *console_drivers;
EXPORT_SYMBOL_GPL(console_drivers);
/*
* System may need to suppress printk message under certain
* circumstances, like after kernel panic happens.
*/
int __read_mostly suppress_printk;
#ifdef CONFIG_LOCKDEP
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
};
#endif
enum devkmsg_log_bits {
__DEVKMSG_LOG_BIT_ON = 0,
__DEVKMSG_LOG_BIT_OFF,
__DEVKMSG_LOG_BIT_LOCK,
};
enum devkmsg_log_masks {
DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON),
DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF),
DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK),
};
/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */
#define DEVKMSG_LOG_MASK_DEFAULT 0
static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
static int __control_devkmsg(char *str)
{
size_t len;
if (!str)
return -EINVAL;
len = str_has_prefix(str, "on");
if (len) {
devkmsg_log = DEVKMSG_LOG_MASK_ON;
return len;
}
len = str_has_prefix(str, "off");
if (len) {
devkmsg_log = DEVKMSG_LOG_MASK_OFF;
return len;
}
len = str_has_prefix(str, "ratelimit");
if (len) {
devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
return len;
}
return -EINVAL;
}
static int __init control_devkmsg(char *str)
{
if (__control_devkmsg(str) < 0) {
pr_warn("printk.devkmsg: bad option string '%s'\n", str);
return 1;
}
/*
* Set sysctl string accordingly:
*/
if (devkmsg_log == DEVKMSG_LOG_MASK_ON)
strcpy(devkmsg_log_str, "on");
else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF)
strcpy(devkmsg_log_str, "off");
/* else "ratelimit" which is set by default. */
/*
* Sysctl cannot change it anymore. The kernel command line setting of
* this parameter is to force the setting to be permanent throughout the
* runtime of the system. This is a precation measure against userspace
* trying to be a smarta** and attempting to change it up on us.
*/
devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
return 1;
}
__setup("printk.devkmsg=", control_devkmsg);
char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
char old_str[DEVKMSG_STR_MAX_SIZE];
unsigned int old;
int err;
if (write) {
if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK)
return -EINVAL;
old = devkmsg_log;
strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE);
}
err = proc_dostring(table, write, buffer, lenp, ppos);
if (err)
return err;
if (write) {
err = __control_devkmsg(devkmsg_log_str);
/*
* Do not accept an unknown string OR a known string with
* trailing crap...
*/
if (err < 0 || (err + 1 != *lenp)) {
/* ... and restore old setting. */
devkmsg_log = old;
strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE);
return -EINVAL;
}
}
return 0;
}
/* Number of registered extended console drivers. */
static int nr_ext_console_drivers;
/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
* macros instead of functions so that _RET_IP_ contains useful information.
*/
#define down_console_sem() do { \
down(&console_sem);\
mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\
} while (0)
static int __down_trylock_console_sem(unsigned long ip)
{
int lock_failed;
unsigned long flags;
/*
* Here and in __up_console_sem() we need to be in safe mode,
* because spindump/WARN/etc from under console ->lock will
* deadlock in printk()->down_trylock_console_sem() otherwise.
*/
printk_safe_enter_irqsave(flags);
lock_failed = down_trylock(&console_sem);
printk_safe_exit_irqrestore(flags);
if (lock_failed)
return 1;
mutex_acquire(&console_lock_dep_map, 0, 1, ip);
return 0;
}
#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
static void __up_console_sem(unsigned long ip)
{
unsigned long flags;
mutex_release(&console_lock_dep_map, ip);
printk_safe_enter_irqsave(flags);
up(&console_sem);
printk_safe_exit_irqrestore(flags);
}
#define up_console_sem() __up_console_sem(_RET_IP_)
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
* definitely not the perfect debug tool (we don't know if _WE_
* hold it and are racing, but it helps tracking those weird code
* paths in the console code where we end up in places I want
* locked without the console semaphore held).
*/
static int console_locked, console_suspended;
/*
* If exclusive_console is non-NULL then only this console is to be printed to.
*/
static struct console *exclusive_console;
/*
* Array of consoles built from command line options (console=)
*/
#define MAX_CMDLINECONSOLES 8
static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
static int preferred_console = -1;
static bool has_preferred_console;
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);
/* Flag: console code may call schedule() */
static int console_may_schedule;
enum con_msg_format_flags {
MSG_FORMAT_DEFAULT = 0,
MSG_FORMAT_SYSLOG = (1 << 0),
};
static int console_msg_format = MSG_FORMAT_DEFAULT;
/*
* The printk log buffer consists of a sequenced collection of records, each
* containing variable length message text. Every record also contains its
* own meta-data (@info).
*
* Every record meta-data carries the timestamp in microseconds, as well as
* the standard userspace syslog level and syslog facility. The usual kernel
* messages use LOG_KERN; userspace-injected messages always carry a matching
* syslog facility, by default LOG_USER. The origin of every message can be
* reliably determined that way.
*
* The human readable log message of a record is available in @text, the
* length of the message text in @text_len. The stored message is not
* terminated.
*
* Optionally, a record can carry a dictionary of properties (key/value
* pairs), to provide userspace with a machine-readable message context.
*
* Examples for well-defined, commonly used property names are:
* DEVICE=b12:8 device identifier
* b12:8 block dev_t
* c127:3 char dev_t
* n8 netdev ifindex
* +sound:card0 subsystem:devname
* SUBSYSTEM=pci driver-core subsystem name
*
* Valid characters in property names are [a-zA-Z0-9.-_]. Property names
* and values are terminated by a '\0' character.
*
* Example of record values:
* record.text_buf = "it's a line" (unterminated)
* record.info.seq = 56
* record.info.ts_nsec = 36863
* record.info.text_len = 11
* record.info.facility = 0 (LOG_KERN)
* record.info.flags = 0
* record.info.level = 3 (LOG_ERR)
* record.info.caller_id = 299 (task 299)
* record.info.dev_info.subsystem = "pci" (terminated)
* record.info.dev_info.device = "+pci:0000:00:01.0" (terminated)
*
* The 'struct printk_info' buffer must never be directly exported to
* userspace, it is a kernel-private implementation detail that might
* need to be changed in the future, when the requirements change.
*
* /dev/kmsg exports the structured data in the following line format:
* "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n"
*
* Users of the export format should ignore possible additional values
* separated by ',', and find the message after the ';' character.
*
* The optional key/value pairs are attached as continuation lines starting
* with a space character and terminated by a newline. All possible
* non-prinatable characters are escaped in the "\xff" notation.
*/
/* syslog_lock protects syslog_* variables and write access to clear_seq. */
static DEFINE_MUTEX(syslog_lock);
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
/* All 3 protected by @console_sem. */
/* the next printk record to write to the console */
static u64 console_seq;
static u64 exclusive_console_stop_seq;
static unsigned long console_dropped;
struct latched_seq {
seqcount_latch_t latch;
u64 val[2];
};
/*
* The next printk record to read after the last 'clear' command. There are
* two copies (updated with seqcount_latch) so that reads can locklessly
* access a valid value. Writers are synchronized by @syslog_lock.
*/
static struct latched_seq clear_seq = {
.latch = SEQCNT_LATCH_ZERO(clear_seq.latch),
.val[0] = 0,
.val[1] = 0,
};
#ifdef CONFIG_PRINTK_CALLER
#define PREFIX_MAX 48
#else
#define PREFIX_MAX 32
#endif
/* the maximum size of a formatted record (i.e. with prefix added per line) */
#define CONSOLE_LOG_MAX 1024
/* the maximum size allowed to be reserved for a record */
#define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX)
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
/* record buffer */
#define LOG_ALIGN __alignof__(unsigned long)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
#define LOG_BUF_LEN_MAX (u32)(1 << 31)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
/*
* Define the average message size. This only affects the number of
* descriptors that will be available. Underestimating is better than
* overestimating (too many available descriptors is better than not enough).
*/
#define PRB_AVGBITS 5 /* 32 character average length */
#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS
#error CONFIG_LOG_BUF_SHIFT value too small.
#endif
_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
PRB_AVGBITS, &__log_buf[0]);
static struct printk_ringbuffer printk_rb_dynamic;
static struct printk_ringbuffer *prb = &printk_rb_static;
/*
* We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
* per_cpu_areas are initialised. This variable is set to true when
* it's safe to access per-CPU data.
*/
static bool __printk_percpu_data_ready __read_mostly;
bool printk_percpu_data_ready(void)
{
return __printk_percpu_data_ready;
}
/* Must be called under syslog_lock. */
static void latched_seq_write(struct latched_seq *ls, u64 val)
{
raw_write_seqcount_latch(&ls->latch);
ls->val[0] = val;
raw_write_seqcount_latch(&ls->latch);
ls->val[1] = val;
}
/* Can be called from any context. */
static u64 latched_seq_read_nolock(struct latched_seq *ls)
{
unsigned int seq;
unsigned int idx;
u64 val;
do {
seq = raw_read_seqcount_latch(&ls->latch);
idx = seq & 0x1;
val = ls->val[idx];
} while (read_seqcount_latch_retry(&ls->latch, seq));
return val;
}
/* Return log buffer address */
char *log_buf_addr_get(void)
{
return log_buf;
}
/* Return log buffer size */
u32 log_buf_len_get(void)
{
return log_buf_len;
}
/*
* Define how much of the log buffer we could take at maximum. The value
* must be greater than two. Note that only half of the buffer is available
* when the index points to the middle.
*/
#define MAX_LOG_TAKE_PART 4
static const char trunc_msg[] = "<truncated>";
static void truncate_msg(u16 *text_len, u16 *trunc_msg_len)
{
/*
* The message should not take the whole buffer. Otherwise, it might
* get removed too soon.
*/
u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
if (*text_len > max_text_len)
*text_len = max_text_len;
/* enable the warning message (if there is room) */
*trunc_msg_len = strlen(trunc_msg);
if (*text_len >= *trunc_msg_len) *text_len -= *trunc_msg_len;
else
*trunc_msg_len = 0;
}
int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
static int syslog_action_restricted(int type)
{
if (dmesg_restrict)
return 1;
/*
* Unless restricted, we allow "read all" and "get buffer size"
* for everybody.
*/
return type != SYSLOG_ACTION_READ_ALL &&
type != SYSLOG_ACTION_SIZE_BUFFER;
}
static int check_syslog_permissions(int type, int source)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
* already done the capabilities checks at open time.
*/
if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN)
goto ok;
if (syslog_action_restricted(type)) {
if (capable(CAP_SYSLOG))
goto ok;
/*
* For historical reasons, accept CAP_SYS_ADMIN too, with
* a warning.
*/
if (capable(CAP_SYS_ADMIN)) {
pr_warn_once("%s (%d): Attempt to access syslog with "
"CAP_SYS_ADMIN but no CAP_SYSLOG "
"(deprecated).\n",
current->comm, task_pid_nr(current));
goto ok;
}
return -EPERM;
}
ok:
return security_syslog(type);
}
static void append_char(char **pp, char *e, char c)
{
if (*pp < e)
*(*pp)++ = c;
}
static ssize_t info_print_ext_header(char *buf, size_t size,
struct printk_info *info)
{
u64 ts_usec = info->ts_nsec;
char caller[20];
#ifdef CONFIG_PRINTK_CALLER
u32 id = info->caller_id;
snprintf(caller, sizeof(caller), ",caller=%c%u",
id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
#else
caller[0] = '\0';
#endif
do_div(ts_usec, 1000);
return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
(info->facility << 3) | info->level, info->seq,
ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller);
}
static ssize_t msg_add_ext_text(char *buf, size_t size,
const char *text, size_t text_len,
unsigned char endc)
{
char *p = buf, *e = buf + size;
size_t i;
/* escape non-printable characters */
for (i = 0; i < text_len; i++) {
unsigned char c = text[i];
if (c < ' ' || c >= 127 || c == '\\')
p += scnprintf(p, e - p, "\\x%02x", c);
else
append_char(&p, e, c);
}
append_char(&p, e, endc);
return p - buf;
}
static ssize_t msg_add_dict_text(char *buf, size_t size,
const char *key, const char *val)
{
size_t val_len = strlen(val);
ssize_t len;
if (!val_len)
return 0;
len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */
len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '=');
len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n');
return len;
}
static ssize_t msg_print_ext_body(char *buf, size_t size,
char *text, size_t text_len,
struct dev_printk_info *dev_info)
{
ssize_t len;
len = msg_add_ext_text(buf, size, text, text_len, '\n');
if (!dev_info)
goto out;
len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM",
dev_info->subsystem);
len += msg_add_dict_text(buf + len, size - len, "DEVICE",
dev_info->device);
out:
return len;
}
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
atomic64_t seq;
struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
struct printk_info info;
char text_buf[CONSOLE_EXT_LOG_MAX];
struct printk_record record;
};
static __printf(3, 4) __cold
int devkmsg_emit(int facility, int level, const char *fmt, ...)
{
va_list args;
int r;
va_start(args, fmt);
r = vprintk_emit(facility, level, NULL, fmt, args);
va_end(args);
return r;
}
static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
{
char *buf, *line;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
struct file *file = iocb->ki_filp;
struct devkmsg_user *user = file->private_data;
size_t len = iov_iter_count(from);
ssize_t ret = len;
if (!user || len > LOG_LINE_MAX)
return -EINVAL;
/* Ignore when user logging is disabled. */
if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
return len;
/* Ratelimit when not explicitly enabled. */
if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) {
if (!___ratelimit(&user->rs, current->comm))
return ret;
}
buf = kmalloc(len+1, GFP_KERNEL);
if (buf == NULL)
return -ENOMEM;
buf[len] = '\0';
if (!copy_from_iter_full(buf, len, from)) {
kfree(buf);
return -EFAULT;
}
/*
* Extract and skip the syslog prefix <[0-9]*>. Coming from userspace
* the decimal value represents 32bit, the lower 3 bit are the log
* level, the rest are the log facility.
*
* If no prefix or no userspace facility is specified, we
* enforce LOG_USER, to be able to reliably distinguish
* kernel-generated messages from userspace-injected ones.
*/
line = buf;
if (line[0] == '<') {
char *endp = NULL;
unsigned int u;
u = simple_strtoul(line + 1, &endp, 10);
if (endp && endp[0] == '>') {
level = LOG_LEVEL(u);
if (LOG_FACILITY(u) != 0)
facility = LOG_FACILITY(u);
endp++;
line = endp;
}
}
devkmsg_emit(facility, level, "%s", line);
kfree(buf);
return ret;
}
static ssize_t devkmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct devkmsg_user *user = file->private_data;
struct printk_record *r = &user->record;
size_t len;
ssize_t ret;
if (!user)
return -EBADF;
ret = mutex_lock_interruptible(&user->lock);
if (ret)
return ret;
if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
goto out;
}
ret = wait_event_interruptible(log_wait,
prb_read_valid(prb, atomic64_read(&user->seq), r));
if (ret)
goto out;
}
if (r->info->seq != atomic64_read(&user->seq)) {
/* our last seen message is gone, return error and reset */
atomic64_set(&user->seq, r->info->seq);
ret = -EPIPE;
goto out;
}
len = info_print_ext_header(user->buf, sizeof(user->buf), r->info);
len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
&r->text_buf[0], r->info->text_len,
&r->info->dev_info);
atomic64_set(&user->seq, r->info->seq + 1);
if (len > count) {
ret = -EINVAL;
goto out;
}
if (copy_to_user(buf, user->buf, len)) {
ret = -EFAULT;
goto out;
}
ret = len;
out:
mutex_unlock(&user->lock);
return ret;
}
/*
* Be careful when modifying this function!!!
*
* Only few operations are supported because the device works only with the
* entire variable length messages (records). Non-standard values are
* returned in the other cases and has been this way for quite some time.
* User space applications might depend on this behavior.
*/
static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
{
struct devkmsg_user *user = file->private_data;
loff_t ret = 0;
if (!user)
return -EBADF;
if (offset)
return -ESPIPE;
switch (whence) {
case SEEK_SET:
/* the first record */
atomic64_set(&user->seq, prb_first_valid_seq(prb));
break;
case SEEK_DATA:
/*
* The first record after the last SYSLOG_ACTION_CLEAR,
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq));
break;
case SEEK_END:
/* after the last record */
atomic64_set(&user->seq, prb_next_seq(prb));
break;
default:
ret = -EINVAL;
}
return ret;
}
static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
{
struct devkmsg_user *user = file->private_data;
struct printk_info info;
__poll_t ret = 0;
if (!user)
return EPOLLERR|EPOLLNVAL;
poll_wait(file, &log_wait, wait);
if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
/* return error when data has vanished underneath us */
if (info.seq != atomic64_read(&user->seq))
ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
ret = EPOLLIN|EPOLLRDNORM;
}
return ret;
}
static int devkmsg_open(struct inode *inode, struct file *file)
{
struct devkmsg_user *user;
int err;
if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
return -EPERM;
/* write-only does not need any file context */
if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
SYSLOG_FROM_READER);
if (err)
return err;
}
user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
if (!user)
return -ENOMEM;
ratelimit_default_init(&user->rs);
ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE);
mutex_init(&user->lock);
prb_rec_init_rd(&user->record, &user->info,
&user->text_buf[0], sizeof(user->text_buf));
atomic64_set(&user->seq, prb_first_valid_seq(prb));
file->private_data = user;
return 0;
}
static int devkmsg_release(struct inode *inode, struct file *file)
{
struct devkmsg_user *user = file->private_data;
if (!user)
return 0;
ratelimit_state_exit(&user->rs);
mutex_destroy(&user->lock);
kfree(user);
return 0;
}
const struct file_operations kmsg_fops = {
.open = devkmsg_open,
.read = devkmsg_read,
.write_iter = devkmsg_write,
.llseek = devkmsg_llseek,
.poll = devkmsg_poll,
.release = devkmsg_release,
};
#ifdef CONFIG_CRASH_CORE
/*
* This appends the listed symbols to /proc/vmcore
*
* /proc/vmcore is used by various utilities, like crash and makedumpfile to
* obtain access to symbols that are otherwise very difficult to locate. These
* symbols are specifically used so that utilities can access and extract the
* dmesg log from a vmcore file after a crash.
*/
void log_buf_vmcoreinfo_setup(void)
{
struct dev_printk_info *dev_info = NULL;
VMCOREINFO_SYMBOL(prb);
VMCOREINFO_SYMBOL(printk_rb_static);
VMCOREINFO_SYMBOL(clear_seq);
/*
* Export struct size and field offsets. User space tools can
* parse it and detect any changes to structure down the line.
*/
VMCOREINFO_STRUCT_SIZE(printk_ringbuffer);
VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring);
VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring);
VMCOREINFO_OFFSET(printk_ringbuffer, fail);
VMCOREINFO_STRUCT_SIZE(prb_desc_ring);
VMCOREINFO_OFFSET(prb_desc_ring, count_bits);
VMCOREINFO_OFFSET(prb_desc_ring, descs);
VMCOREINFO_OFFSET(prb_desc_ring, infos);
VMCOREINFO_OFFSET(prb_desc_ring, head_id);
VMCOREINFO_OFFSET(prb_desc_ring, tail_id);
VMCOREINFO_STRUCT_SIZE(prb_desc);
VMCOREINFO_OFFSET(prb_desc, state_var);
VMCOREINFO_OFFSET(prb_desc, text_blk_lpos);
VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos);
VMCOREINFO_OFFSET(prb_data_blk_lpos, begin);
VMCOREINFO_OFFSET(prb_data_blk_lpos, next);
VMCOREINFO_STRUCT_SIZE(printk_info);
VMCOREINFO_OFFSET(printk_info, seq);
VMCOREINFO_OFFSET(printk_info, ts_nsec);
VMCOREINFO_OFFSET(printk_info, text_len);
VMCOREINFO_OFFSET(printk_info, caller_id);
VMCOREINFO_OFFSET(printk_info, dev_info);
VMCOREINFO_STRUCT_SIZE(dev_printk_info);
VMCOREINFO_OFFSET(dev_printk_info, subsystem);
VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem));
VMCOREINFO_OFFSET(dev_printk_info, device);
VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device));
VMCOREINFO_STRUCT_SIZE(prb_data_ring);
VMCOREINFO_OFFSET(prb_data_ring, size_bits);
VMCOREINFO_OFFSET(prb_data_ring, data);
VMCOREINFO_OFFSET(prb_data_ring, head_lpos);
VMCOREINFO_OFFSET(prb_data_ring, tail_lpos);
VMCOREINFO_SIZE(atomic_long_t);
VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);
VMCOREINFO_STRUCT_SIZE(latched_seq);
VMCOREINFO_OFFSET(latched_seq, val);
}
#endif
/* requested log_buf_len from kernel cmdline */
static unsigned long __initdata new_log_buf_len;
/* we practice scaling the ring buffer by powers of 2 */
static void __init log_buf_len_update(u64 size)
{
if (size > (u64)LOG_BUF_LEN_MAX) {
size = (u64)LOG_BUF_LEN_MAX;
pr_err("log_buf over 2G is not supported.\n");
}
if (size)
size = roundup_pow_of_two(size);
if (size > log_buf_len)
new_log_buf_len = (unsigned long)size;
}
/* save requested log_buf_len since it's too early to process it */
static int __init log_buf_len_setup(char *str)
{
u64 size;
if (!str)
return -EINVAL;
size = memparse(str, &str);
log_buf_len_update(size);
return 0;
}
early_param("log_buf_len", log_buf_len_setup);
#ifdef CONFIG_SMP
#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
static void __init log_buf_add_cpu(void)
{
unsigned int cpu_extra;
/*
* archs should set up cpu_possible_bits properly with
* set_cpu_possible() after setup_arch() but just in
* case lets ensure this is valid.
*/
if (num_possible_cpus() == 1)
return;
cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;
/* by default this will only continue through for large > 64 CPUs */
if (cpu_extra <= __LOG_BUF_LEN / 2)
return;
pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
__LOG_CPU_MAX_BUF_LEN);
pr_info("log_buf_len total cpu_extra contributions: %d bytes\n",
cpu_extra);
pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);
log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
}
#else /* !CONFIG_SMP */
static inline void log_buf_add_cpu(void) {}
#endif /* CONFIG_SMP */
static void __init set_percpu_data_ready(void)
{
__printk_percpu_data_ready = true;
}
static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
struct printk_record *r)
{
struct prb_reserved_entry e;
struct printk_record dest_r;
prb_rec_init_wr(&dest_r, r->info->text_len);
if (!prb_reserve(&e, rb, &dest_r))
return 0;
memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len);
dest_r.info->text_len = r->info->text_len;
dest_r.info->facility = r->info->facility;
dest_r.info->level = r->info->level;
dest_r.info->flags = r->info->flags;
dest_r.info->ts_nsec = r->info->ts_nsec;
dest_r.info->caller_id = r->info->caller_id;
memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info));
prb_final_commit(&e);
return prb_record_text_space(&e);
}
static char setup_text_buf[LOG_LINE_MAX] __initdata;
void __init setup_log_buf(int early)
{
struct printk_info *new_infos;
unsigned int new_descs_count;
struct prb_desc *new_descs;
struct printk_info info;
struct printk_record r;
unsigned int text_size;
size_t new_descs_size;
size_t new_infos_size;
unsigned long flags;
char *new_log_buf;
unsigned int free;
u64 seq;
/*
* Some archs call setup_log_buf() multiple times - first is very
* early, e.g. from setup_arch(), and second - when percpu_areas
* are initialised.
*/
if (!early)
set_percpu_data_ready();
if (log_buf != __log_buf)
return;
if (!early && !new_log_buf_len)
log_buf_add_cpu();
if (!new_log_buf_len)
return;
new_descs_count = new_log_buf_len >> PRB_AVGBITS;
if (new_descs_count == 0) {
pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len);
return;
}
new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
if (unlikely(!new_log_buf)) {
pr_err("log_buf_len: %lu text bytes not available\n",
new_log_buf_len);
return;
}
new_descs_size = new_descs_count * sizeof(struct prb_desc);
new_descs = memblock_alloc(new_descs_size, LOG_ALIGN);
if (unlikely(!new_descs)) {
pr_err("log_buf_len: %zu desc bytes not available\n",
new_descs_size);
goto err_free_log_buf;
}
new_infos_size = new_descs_count * sizeof(struct printk_info);
new_infos = memblock_alloc(new_infos_size, LOG_ALIGN);
if (unlikely(!new_infos)) {
pr_err("log_buf_len: %zu info bytes not available\n",
new_infos_size);
goto err_free_descs;
}
prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf));
prb_init(&printk_rb_dynamic,
new_log_buf, ilog2(new_log_buf_len),
new_descs, ilog2(new_descs_count),
new_infos);
local_irq_save(flags);
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
free = __LOG_BUF_LEN;
prb_for_each_record(0, &printk_rb_static, seq, &r) {
text_size = add_to_rb(&printk_rb_dynamic, &r);
if (text_size > free)
free = 0;
else
free -= text_size;
}
prb = &printk_rb_dynamic;
local_irq_restore(flags);
/*
* Copy any remaining messages that might have appeared from
* NMI context after copying but before switching to the
* dynamic buffer.
*/
prb_for_each_record(seq, &printk_rb_static, seq, &r) {
text_size = add_to_rb(&printk_rb_dynamic, &r);
if (text_size > free)
free = 0;
else
free -= text_size;
}
if (seq != prb_next_seq(&printk_rb_static)) {
pr_err("dropped %llu messages\n",
prb_next_seq(&printk_rb_static) - seq);
}
pr_info("log_buf_len: %u bytes\n", log_buf_len);
pr_info("early log buf free: %u(%u%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
return;
err_free_descs:
memblock_free_ptr(new_descs, new_descs_size);
err_free_log_buf:
memblock_free_ptr(new_log_buf, new_log_buf_len);
}
static bool __read_mostly ignore_loglevel;
static int __init ignore_loglevel_setup(char *str)
{
ignore_loglevel = true;
pr_info("debug: ignoring loglevel setting.\n");
return 0;
}
early_param("ignore_loglevel", ignore_loglevel_setup);
module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(ignore_loglevel,
"ignore loglevel setting (prints all kernel messages to the console)");
static bool suppress_message_printing(int level)
{
return (level >= console_loglevel && !ignore_loglevel);
}
#ifdef CONFIG_BOOT_PRINTK_DELAY
static int boot_delay; /* msecs delay after each printk during bootup */
static unsigned long long loops_per_msec; /* based on boot_delay */
static int __init boot_delay_setup(char *str)
{
unsigned long lpj;
lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
get_option(&str, &boot_delay);
if (boot_delay > 10 * 1000)
boot_delay = 0;
pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
"HZ: %d, loops_per_msec: %llu\n",
boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
return 0;
}
early_param("boot_delay", boot_delay_setup);
static void boot_delay_msec(int level)
{
unsigned long long k;
unsigned long timeout;
if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING)
|| suppress_message_printing(level)) {
return;
}
k = (unsigned long long)loops_per_msec * boot_delay;
timeout = jiffies + msecs_to_jiffies(boot_delay);
while (k) {
k--;
cpu_relax();
/*
* use (volatile) jiffies to prevent
* compiler reduction; loop termination via jiffies
* is secondary and may or may not happen.
*/
if (time_after(jiffies, timeout))
break;
touch_nmi_watchdog();
}
}
#else
static inline void boot_delay_msec(int level)
{
}
#endif
static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
static size_t print_syslog(unsigned int level, char *buf)
{
return sprintf(buf, "<%u>", level);
}
static size_t print_time(u64 ts, char *buf)
{
unsigned long rem_nsec = do_div(ts, 1000000000);
return sprintf(buf, "[%5lu.%06lu]",
(unsigned long)ts, rem_nsec / 1000);
}
#ifdef CONFIG_PRINTK_CALLER
static size_t print_caller(u32 id, char *buf)
{
char caller[12];
snprintf(caller, sizeof(caller), "%c%u",
id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
return sprintf(buf, "[%6s]", caller);
}
#else
#define print_caller(id, buf) 0
#endif
static size_t info_print_prefix(const struct printk_info *info, bool syslog,
bool time, char *buf)
{
size_t len = 0;
if (syslog) len = print_syslog((info->facility << 3) | info->level, buf); if (time) len += print_time(info->ts_nsec, buf + len);
len += print_caller(info->caller_id, buf + len);
if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) {
buf[len++] = ' ';
buf[len] = '\0';
}
return len;
}
/*
* Prepare the record for printing. The text is shifted within the given
* buffer to avoid a need for another one. The following operations are
* done:
*
* - Add prefix for each line.
* - Drop truncated lines that no longer fit into the buffer.
* - Add the trailing newline that has been removed in vprintk_store().
* - Add a string terminator.
*
* Since the produced string is always terminated, the maximum possible
* return value is @r->text_buf_size - 1;
*
* Return: The length of the updated/prepared text, including the added
* prefixes and the newline. The terminator is not counted. The dropped
* line(s) are not counted.
*/
static size_t record_print_text(struct printk_record *r, bool syslog,
bool time)
{
size_t text_len = r->info->text_len;
size_t buf_size = r->text_buf_size;
char *text = r->text_buf;
char prefix[PREFIX_MAX];
bool truncated = false;
size_t prefix_len;
size_t line_len;
size_t len = 0;
char *next;
/*
* If the message was truncated because the buffer was not large
* enough, treat the available text as if it were the full text.
*/
if (text_len > buf_size)
text_len = buf_size;
prefix_len = info_print_prefix(r->info, syslog, time, prefix);
/*
* @text_len: bytes of unprocessed text
* @line_len: bytes of current line _without_ newline
* @text: pointer to beginning of current line
* @len: number of bytes prepared in r->text_buf
*/
for (;;) {
next = memchr(text, '\n', text_len);
if (next) {
line_len = next - text;
} else {
/* Drop truncated line(s). */
if (truncated)
break;
line_len = text_len;
}
/*
* Truncate the text if there is not enough space to add the
* prefix and a trailing newline and a terminator.
*/
if (len + prefix_len + text_len + 1 + 1 > buf_size) {
/* Drop even the current line if no space. */
if (len + prefix_len + line_len + 1 + 1 > buf_size)
break;
text_len = buf_size - len - prefix_len - 1 - 1;
truncated = true;
}
memmove(text + prefix_len, text, text_len);
memcpy(text, prefix, prefix_len);
/*
* Increment the prepared length to include the text and
* prefix that were just moved+copied. Also increment for the
* newline at the end of this line. If this is the last line,
* there is no newline, but it will be added immediately below.
*/
len += prefix_len + line_len + 1;
if (text_len == line_len) {
/*
* This is the last line. Add the trailing newline
* removed in vprintk_store().
*/
text[prefix_len + line_len] = '\n';
break;
}
/*
* Advance beyond the added prefix and the related line with
* its newline.
*/
text += prefix_len + line_len + 1;
/*
* The remaining text has only decreased by the line with its
* newline.
*
* Note that @text_len can become zero. It happens when @text
* ended with a newline (either due to truncation or the
* original string ending with "\n\n"). The loop is correctly
* repeated and (if not truncated) an empty line with a prefix
* will be prepared.
*/
text_len -= line_len + 1;
}
/*
* If a buffer was provided, it will be terminated. Space for the
* string terminator is guaranteed to be available. The terminator is
* not counted in the return value.
*/
if (buf_size > 0) r->text_buf[len] = 0; return len;
}
static size_t get_record_print_text_size(struct printk_info *info,
unsigned int line_count,
bool syslog, bool time)
{
char prefix[PREFIX_MAX];
size_t prefix_len;
prefix_len = info_print_prefix(info, syslog, time, prefix);
/*
* Each line will be preceded with a prefix. The intermediate
* newlines are already within the text, but a final trailing
* newline will be added.
*/
return ((prefix_len * line_count) + info->text_len + 1);
}
/*
* Beginning with @start_seq, find the first record where it and all following
* records up to (but not including) @max_seq fit into @size.
*
* @max_seq is simply an upper bound and does not need to exist. If the caller
* does not require an upper bound, -1 can be used for @max_seq.
*/
static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size,
bool syslog, bool time)
{
struct printk_info info;
unsigned int line_count;
size_t len = 0;
u64 seq;
/* Determine the size of the records up to @max_seq. */
prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
if (info.seq >= max_seq)
break;
len += get_record_print_text_size(&info, line_count, syslog, time);
}
/*
* Adjust the upper bound for the next loop to avoid subtracting
* lengths that were never added.
*/
if (seq < max_seq)
max_seq = seq;
/*
* Move first record forward until length fits into the buffer. Ignore
* newest messages that were not counted in the above cycle. Messages
* might appear and get lost in the meantime. This is a best effort
* that prevents an infinite loop that could occur with a retry.
*/
prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
if (len <= size || info.seq >= max_seq)
break;
len -= get_record_print_text_size(&info, line_count, syslog, time);
}
return seq;
}
/* The caller is responsible for making sure @size is greater than 0. */
static int syslog_print(char __user *buf, int size)
{
struct printk_info info;
struct printk_record r;
char *text;
int len = 0;
u64 seq;
text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
mutex_lock(&syslog_lock);
/*
* Wait for the @syslog_seq record to be available. @syslog_seq may
* change while waiting.
*/
do {
seq = syslog_seq;
mutex_unlock(&syslog_lock);
len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL));
mutex_lock(&syslog_lock);
if (len)
goto out;
} while (syslog_seq != seq);
/*
* Copy records that fit into the buffer. The above cycle makes sure
* that the first record is always available.
*/
do {
size_t n;
size_t skip;
int err;
if (!prb_read_valid(prb, syslog_seq, &r))
break;
if (r.info->seq != syslog_seq) {
/* message is gone, move to next valid one */
syslog_seq = r.info->seq;
syslog_partial = 0;
}
/*
* To keep reading/counting partial line consistent,
* use printk_time value as of the beginning of a line.
*/
if (!syslog_partial)
syslog_time = printk_time;
skip = syslog_partial;
n = record_print_text(&r, true, syslog_time);
if (n - syslog_partial <= size) {
/* message fits into buffer, move forward */
syslog_seq = r.info->seq + 1;
n -= syslog_partial;
syslog_partial = 0;
} else if (!len){
/* partial read(), remember position */
n = size;
syslog_partial += n;
} else
n = 0;
if (!n)
break;
mutex_unlock(&syslog_lock);
err = copy_to_user(buf, text + skip, n);
mutex_lock(&syslog_lock);
if (err) {
if (!len)
len = -EFAULT;
break;
}
len += n;
size -= n;
buf += n;
} while (size);
out:
mutex_unlock(&syslog_lock);
kfree(text);
return len;
}
static int syslog_print_all(char __user *buf, int size, bool clear)
{
struct printk_info info;
struct printk_record r;
char *text;
int len = 0;
u64 seq;
bool time;
text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
time = printk_time;
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
*/
seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
size, true, time);
prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
len = 0;
prb_for_each_record(seq, prb, seq, &r) {
int textlen;
textlen = record_print_text(&r, true, time);
if (len + textlen > size) {
seq--;
break;
}
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
if (len < 0)
break;
}
if (clear) {
mutex_lock(&syslog_lock);
latched_seq_write(&clear_seq, seq);
mutex_unlock(&syslog_lock);
}
kfree(text);
return len;
}
static void syslog_clear(void)
{
mutex_lock(&syslog_lock);
latched_seq_write(&clear_seq, prb_next_seq(prb));
mutex_unlock(&syslog_lock);
}
int do_syslog(int type, char __user *buf, int len, int source)
{
struct printk_info info;
bool clear = false;
static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
error = check_syslog_permissions(type, source);
if (error)
return error;
switch (type) {
case SYSLOG_ACTION_CLOSE: /* Close log */
break;
case SYSLOG_ACTION_OPEN: /* Open log */
break;
case SYSLOG_ACTION_READ: /* Read from log */
if (!buf || len < 0)
return -EINVAL;
if (!len)
return 0;
if (!access_ok(buf, len))
return -EFAULT;
error = syslog_print(buf, len);
break;
/* Read/clear last kernel messages */
case SYSLOG_ACTION_READ_CLEAR:
clear = true;
fallthrough;
/* Read last kernel messages */
case SYSLOG_ACTION_READ_ALL:
if (!buf || len < 0)
return -EINVAL;
if (!len)
return 0;
if (!access_ok(buf, len))
return -EFAULT;
error = syslog_print_all(buf, len, clear);
break;
/* Clear ring buffer */
case SYSLOG_ACTION_CLEAR:
syslog_clear();
break;
/* Disable logging to console */
case SYSLOG_ACTION_CONSOLE_OFF:
if (saved_console_loglevel == LOGLEVEL_DEFAULT)
saved_console_loglevel = console_loglevel;
console_loglevel = minimum_console_loglevel;
break;
/* Enable logging to console */
case SYSLOG_ACTION_CONSOLE_ON:
if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
console_loglevel = saved_console_loglevel;
saved_console_loglevel = LOGLEVEL_DEFAULT;
}
break;
/* Set level of messages printed to console */
case SYSLOG_ACTION_CONSOLE_LEVEL:
if (len < 1 || len > 8)
return -EINVAL;
if (len < minimum_console_loglevel)
len = minimum_console_loglevel;
console_loglevel = len;
/* Implicitly re-enable logging to console */
saved_console_loglevel = LOGLEVEL_DEFAULT;
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
mutex_lock(&syslog_lock);
if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
/* No unread messages. */
mutex_unlock(&syslog_lock);
return 0;
}
if (info.seq != syslog_seq) {
/* messages are gone, move to first one */
syslog_seq = info.seq;
syslog_partial = 0;
}
if (source == SYSLOG_FROM_PROC) {
/*
* Short-cut for poll(/"proc/kmsg") which simply checks
* for pending data, not the size; return the count of
* records, not the length.
*/
error = prb_next_seq(prb) - syslog_seq;
} else {
bool time = syslog_partial ? syslog_time : printk_time;
unsigned int line_count;
u64 seq;
prb_for_each_info(syslog_seq, prb, seq, &info,
&line_count) {
error += get_record_print_text_size(&info, line_count,
true, time);
time = printk_time;
}
error -= syslog_partial;
}
mutex_unlock(&syslog_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
error = log_buf_len;
break;
default:
error = -EINVAL;
break;
}
return error;
}
SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
{
return do_syslog(type, buf, len, SYSLOG_FROM_READER);
}
/*
* Special console_lock variants that help to reduce the risk of soft-lockups.
* They allow to pass console_lock to another printk() call using a busy wait.
*/
#ifdef CONFIG_LOCKDEP
static struct lockdep_map console_owner_dep_map = {
.name = "console_owner"
};
#endif
static DEFINE_RAW_SPINLOCK(console_owner_lock);
static struct task_struct *console_owner;
static bool console_waiter;
/**
* console_lock_spinning_enable - mark beginning of code where another
* thread might safely busy wait
*
* This basically converts console_lock into a spinlock. This marks
* the section where the console_lock owner can not sleep, because
* there may be a waiter spinning (like a spinlock). Also it must be
* ready to hand over the lock at the end of the section.
*/
static void console_lock_spinning_enable(void)
{
raw_spin_lock(&console_owner_lock);
console_owner = current;
raw_spin_unlock(&console_owner_lock);
/* The waiter may spin on us after setting console_owner */
spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
}
/**
* console_lock_spinning_disable_and_check - mark end of code where another
* thread was able to busy wait and check if there is a waiter
*
* This is called at the end of the section where spinning is allowed.
* It has two functions. First, it is a signal that it is no longer
* safe to start busy waiting for the lock. Second, it checks if
* there is a busy waiter and passes the lock rights to her.
*
* Important: Callers lose the lock if there was a busy waiter.
* They must not touch items synchronized by console_lock
* in this case.
*
* Return: 1 if the lock rights were passed, 0 otherwise.
*/
static int console_lock_spinning_disable_and_check(void)
{
int waiter;
raw_spin_lock(&console_owner_lock);
waiter = READ_ONCE(console_waiter);
console_owner = NULL;
raw_spin_unlock(&console_owner_lock);
if (!waiter) {
spin_release(&console_owner_dep_map, _THIS_IP_);
return 0;
}
/* The waiter is now free to continue */
WRITE_ONCE(console_waiter, false);
spin_release(&console_owner_dep_map, _THIS_IP_);
/*
* Hand off console_lock to waiter. The waiter will perform
* the up(). After this, the waiter is the console_lock owner.
*/
mutex_release(&console_lock_dep_map, _THIS_IP_);
return 1;
}
/**
* console_trylock_spinning - try to get console_lock by busy waiting
*
* This allows to busy wait for the console_lock when the current
* owner is running in specially marked sections. It means that
* the current owner is running and cannot reschedule until it
* is ready to lose the lock.
*
* Return: 1 if we got the lock, 0 othrewise
*/
static int console_trylock_spinning(void)
{
struct task_struct *owner = NULL;
bool waiter;
bool spin = false;
unsigned long flags;
if (console_trylock())
return 1;
printk_safe_enter_irqsave(flags);
raw_spin_lock(&console_owner_lock);
owner = READ_ONCE(console_owner);
waiter = READ_ONCE(console_waiter);
if (!waiter && owner && owner != current) { WRITE_ONCE(console_waiter, true);
spin = true;
}
raw_spin_unlock(&console_owner_lock);
/*
* If there is an active printk() writing to the
* consoles, instead of having it write our data too,
* see if we can offload that load from the active
* printer, and do some printing ourselves.
* Go into a spin only if there isn't already a waiter
* spinning, and there is an active printer, and
* that active printer isn't us (recursive printk?).
*/
if (!spin) {
printk_safe_exit_irqrestore(flags);
return 0;
}
/* We spin waiting for the owner to release us */
spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
/* Owner will clear console_waiter on hand off */
while (READ_ONCE(console_waiter))
cpu_relax();
spin_release(&console_owner_dep_map, _THIS_IP_);
printk_safe_exit_irqrestore(flags);
/*
* The owner passed the console lock to us.
* Since we did not spin on console lock, annotate
* this as a trylock. Otherwise lockdep will
* complain.
*/
mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
return 1;
}
/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
*/
static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len)
{
static char dropped_text[64];
size_t dropped_len = 0;
struct console *con;
trace_console_rcuidle(text, len);
if (!console_drivers)
return;
if (console_dropped) { dropped_len = snprintf(dropped_text, sizeof(dropped_text),
"** %lu printk messages dropped **\n",
console_dropped);
console_dropped = 0;
}
for_each_console(con) { if (exclusive_console && con != exclusive_console)
continue;
if (!(con->flags & CON_ENABLED))
continue;
if (!con->write)
continue;
if (!cpu_online(smp_processor_id()) && !(con->flags & CON_ANYTIME))
continue;
if (con->flags & CON_EXTENDED) con->write(con, ext_text, ext_len);
else {
if (dropped_len) con->write(con, dropped_text, dropped_len); con->write(con, text, len);
}
}
}
/*
* Recursion is tracked separately on each CPU. If NMIs are supported, an
* additional NMI context per CPU is also separately tracked. Until per-CPU
* is available, a separate "early tracking" is performed.
*/
static DEFINE_PER_CPU(u8, printk_count);
static u8 printk_count_early;
#ifdef CONFIG_HAVE_NMI
static DEFINE_PER_CPU(u8, printk_count_nmi);
static u8 printk_count_nmi_early;
#endif
/*
* Recursion is limited to keep the output sane. printk() should not require
* more than 1 level of recursion (allowing, for example, printk() to trigger
* a WARN), but a higher value is used in case some printk-internal errors
* exist, such as the ringbuffer validation checks failing.
*/
#define PRINTK_MAX_RECURSION 3
/*
* Return a pointer to the dedicated counter for the CPU+context of the
* caller.
*/
static u8 *__printk_recursion_counter(void)
{
#ifdef CONFIG_HAVE_NMI
if (in_nmi()) {
if (printk_percpu_data_ready())
return this_cpu_ptr(&printk_count_nmi);
return &printk_count_nmi_early;
}
#endif
if (printk_percpu_data_ready())
return this_cpu_ptr(&printk_count);
return &printk_count_early;
}
/*
* Enter recursion tracking. Interrupts are disabled to simplify tracking.
* The caller must check the boolean return value to see if the recursion is
* allowed. On failure, interrupts are not disabled.
*
* @recursion_ptr must be a variable of type (u8 *) and is the same variable
* that is passed to printk_exit_irqrestore().
*/
#define printk_enter_irqsave(recursion_ptr, flags) \
({ \
bool success = true; \
\
typecheck(u8 *, recursion_ptr); \
local_irq_save(flags); \
(recursion_ptr) = __printk_recursion_counter(); \
if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \
local_irq_restore(flags); \
success = false; \
} else { \
(*(recursion_ptr))++; \
} \
success; \
})
/* Exit recursion tracking, restoring interrupts. */
#define printk_exit_irqrestore(recursion_ptr, flags) \
do { \
typecheck(u8 *, recursion_ptr); \
(*(recursion_ptr))--; \
local_irq_restore(flags); \
} while (0)
int printk_delay_msec __read_mostly;
static inline void printk_delay(void)
{
if (unlikely(printk_delay_msec)) {
int m = printk_delay_msec;
while (m--) { mdelay(1);
touch_nmi_watchdog();
}
}
}
static inline u32 printk_caller_id(void)
{
return in_task() ? task_pid_nr(current) :
0x80000000 + raw_smp_processor_id();
}
/**
* printk_parse_prefix - Parse level and control flags.
*
* @text: The terminated text message.
* @level: A pointer to the current level value, will be updated.
* @flags: A pointer to the current printk_info flags, will be updated.
*
* @level may be NULL if the caller is not interested in the parsed value.
* Otherwise the variable pointed to by @level must be set to
* LOGLEVEL_DEFAULT in order to be updated with the parsed value.
*
* @flags may be NULL if the caller is not interested in the parsed value.
* Otherwise the variable pointed to by @flags will be OR'd with the parsed
* value.
*
* Return: The length of the parsed level and control flags.
*/
u16 printk_parse_prefix(const char *text, int *level,
enum printk_info_flags *flags)
{
u16 prefix_len = 0;
int kern_level;
while (*text) {
kern_level = printk_get_level(text);
if (!kern_level)
break;
switch (kern_level) {
case '0' ... '7':
if (level && *level == LOGLEVEL_DEFAULT) *level = kern_level - '0';
break;
case 'c': /* KERN_CONT */
if (flags) *flags |= LOG_CONT;
}
prefix_len += 2;
text += 2;
}
return prefix_len;
}
static u16 printk_sprint(char *text, u16 size, int facility,
enum printk_info_flags *flags, const char *fmt,
va_list args)
{
u16 text_len;
text_len = vscnprintf(text, size, fmt, args);
/* Mark and strip a trailing newline. */
if (text_len && text[text_len - 1] == '\n') { text_len--;
*flags |= LOG_NEWLINE;
}
/* Strip log level and control flags. */
if (facility == 0) {
u16 prefix_len;
prefix_len = printk_parse_prefix(text, NULL, NULL);
if (prefix_len) {
text_len -= prefix_len;
memmove(text, text + prefix_len, text_len);
}
}
return text_len;
}
__printf(4, 0)
int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
const u32 caller_id = printk_caller_id();
struct prb_reserved_entry e;
enum printk_info_flags flags = 0;
struct printk_record r;
unsigned long irqflags;
u16 trunc_msg_len = 0;
char prefix_buf[8];
u8 *recursion_ptr;
u16 reserve_size;
va_list args2;
u16 text_len;
int ret = 0;
u64 ts_nsec;
/*
* Since the duration of printk() can vary depending on the message
* and state of the ringbuffer, grab the timestamp now so that it is
* close to the call of printk(). This provides a more deterministic
* timestamp with respect to the caller.
*/
ts_nsec = local_clock();
if (!printk_enter_irqsave(recursion_ptr, irqflags))
return 0;
/*
* The sprintf needs to come first since the syslog prefix might be
* passed in as a parameter. An extra byte must be reserved so that
* later the vscnprintf() into the reserved buffer has room for the
* terminating '\0', which is not counted by vsnprintf().
*/
va_copy(args2, args);
reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;
va_end(args2);
if (reserve_size > LOG_LINE_MAX)
reserve_size = LOG_LINE_MAX;
/* Extract log level or control flags. */
if (facility == 0)
printk_parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dev_info) flags |= LOG_NEWLINE; if (flags & LOG_CONT) {
prb_rec_init_wr(&r, reserve_size);
if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
facility, &flags, fmt, args);
r.info->text_len += text_len;
if (flags & LOG_NEWLINE) {
r.info->flags |= LOG_NEWLINE;
prb_final_commit(&e);
} else {
prb_commit(&e);
}
ret = text_len;
goto out;
}
}
/*
* Explicitly initialize the record before every prb_reserve() call.
* prb_reserve_in_last() and prb_reserve() purposely invalidate the
* structure when they fail.
*/
prb_rec_init_wr(&r, reserve_size);
if (!prb_reserve(&e, prb, &r)) {
/* truncate the message if it is too long for empty buffer */
truncate_msg(&reserve_size, &trunc_msg_len);
prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
if (!prb_reserve(&e, prb, &r))
goto out;
}
/* fill message */
text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args);
if (trunc_msg_len)
memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len;
r.info->facility = facility;
r.info->level = level & 7;
r.info->flags = flags & 0x1f;
r.info->ts_nsec = ts_nsec;
r.info->caller_id = caller_id;
if (dev_info)
memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
/* A message without a trailing newline can be continued. */
if (!(flags & LOG_NEWLINE)) prb_commit(&e);
else
prb_final_commit(&e); ret = text_len + trunc_msg_len;
out:
printk_exit_irqrestore(recursion_ptr, irqflags);
return ret;
}
asmlinkage int vprintk_emit(int facility, int level,
const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
int printed_len;
bool in_sched = false;
/* Suppress unimportant messages after panic happens */
if (unlikely(suppress_printk))
return 0;
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
in_sched = true;
}
boot_delay_msec(level);
printk_delay();
printed_len = vprintk_store(facility, level, dev_info, fmt, args);
/* If called from the scheduler, we can not call up(). */
if (!in_sched) {
/*
* Disable preemption to avoid being preempted while holding
* console_sem which would prevent anyone from printing to
* console
*/
preempt_disable();
/*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers and wake up
* /dev/kmsg and syslog() users.
*/
if (console_trylock_spinning())
console_unlock(); preempt_enable();
}
wake_up_klogd();
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
int vprintk_default(const char *fmt, va_list args)
{
return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);
asmlinkage __visible int _printk(const char *fmt, ...)
{
va_list args;
int r;
va_start(args, fmt);
r = vprintk(fmt, args);
va_end(args);
return r;
}
EXPORT_SYMBOL(_printk);
#else /* CONFIG_PRINTK */
#define CONSOLE_LOG_MAX 0
#define printk_time false
#define prb_read_valid(rb, seq, r) false
#define prb_first_valid_seq(rb) 0
static u64 syslog_seq;
static u64 console_seq;
static u64 exclusive_console_stop_seq;
static unsigned long console_dropped;
static size_t record_print_text(const struct printk_record *r,
bool syslog, bool time)
{
return 0;
}
static ssize_t info_print_ext_header(char *buf, size_t size,
struct printk_info *info)
{
return 0;
}
static ssize_t msg_print_ext_body(char *buf, size_t size,
char *text, size_t text_len,
struct dev_printk_info *dev_info) { return 0; }
static void console_lock_spinning_enable(void) { }
static int console_lock_spinning_disable_and_check(void) { return 0; }
static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
static bool suppress_message_printing(int level) { return false; }
#endif /* CONFIG_PRINTK */
#ifdef CONFIG_EARLY_PRINTK
struct console *early_console;
asmlinkage __visible void early_printk(const char *fmt, ...)
{
va_list ap;
char buf[512];
int n;
if (!early_console)
return;
va_start(ap, fmt);
n = vscnprintf(buf, sizeof(buf), fmt, ap);
va_end(ap);
early_console->write(early_console, buf, n);
}
#endif
static int __add_preferred_console(char *name, int idx, char *options,
char *brl_options, bool user_specified)
{
struct console_cmdline *c;
int i;
/*
* See if this tty is not yet registered, and
* if we have a slot free.
*/
for (i = 0, c = console_cmdline;
i < MAX_CMDLINECONSOLES && c->name[0];
i++, c++) {
if (strcmp(c->name, name) == 0 && c->index == idx) {
if (!brl_options)
preferred_console = i;
if (user_specified)
c->user_specified = true;
return 0;
}
}
if (i == MAX_CMDLINECONSOLES)
return -E2BIG;
if (!brl_options)
preferred_console = i;
strlcpy(c->name, name, sizeof(c->name));
c->options = options;
c->user_specified = user_specified;
braille_set_options(c, brl_options);
c->index = idx;
return 0;
}
static int __init console_msg_format_setup(char *str)
{
if (!strcmp(str, "syslog"))
console_msg_format = MSG_FORMAT_SYSLOG;
if (!strcmp(str, "default"))
console_msg_format = MSG_FORMAT_DEFAULT;
return 1;
}
__setup("console_msg_format=", console_msg_format_setup);
/*
* Set up a console. Called via do_early_param() in init/main.c
* for each "console=" parameter in the boot command line.
*/
static int __init console_setup(char *str)
{
char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
char *s, *options, *brl_options = NULL;
int idx;
/*
* console="" or console=null have been suggested as a way to
* disable console output. Use ttynull that has been created
* for exactly this purpose.
*/
if (str[0] == 0 || strcmp(str, "null") == 0) {
__add_preferred_console("ttynull", 0, NULL, NULL, true);
return 1;
}
if (_braille_console_setup(&str, &brl_options))
return 1;
/*
* Decode str into name, index, options.
*/
if (str[0] >= '0' && str[0] <= '9') {
strcpy(buf, "ttyS");
strncpy(buf + 4, str, sizeof(buf) - 5);
} else {
strncpy(buf, str, sizeof(buf) - 1);
}
buf[sizeof(buf) - 1] = 0;
options = strchr(str, ',');
if (options)
*(options++) = 0;
#ifdef __sparc__
if (!strcmp(str, "ttya"))
strcpy(buf, "ttyS0");
if (!strcmp(str, "ttyb"))
strcpy(buf, "ttyS1");
#endif
for (s = buf; *s; s++)
if (isdigit(*s) || *s == ',')
break;
idx = simple_strtoul(s, NULL, 10);
*s = 0;
__add_preferred_console(buf, idx, options, brl_options, true);
console_set_on_cmdline = 1;
return 1;
}
__setup("console=", console_setup);
/**
* add_preferred_console - add a device to the list of preferred consoles.
* @name: device name
* @idx: device index
* @options: options for this console
*
* The last preferred console added will be used for kernel messages
* and stdin/out/err for init. Normally this is used by console_setup
* above to handle user-supplied console arguments; however it can also
* be used by arch-specific code either to override the user or more
* commonly to provide a default console (ie from PROM variables) when
* the user has not supplied one.
*/
int add_preferred_console(char *name, int idx, char *options)
{
return __add_preferred_console(name, idx, options, NULL, false);
}
bool console_suspend_enabled = true;
EXPORT_SYMBOL(console_suspend_enabled);
static int __init console_suspend_disable(char *str)
{
console_suspend_enabled = false;
return 1;
}
__setup("no_console_suspend", console_suspend_disable);
module_param_named(console_suspend, console_suspend_enabled,
bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
" and hibernate operations");
static bool printk_console_no_auto_verbose;
void console_verbose(void)
{
if (console_loglevel && !printk_console_no_auto_verbose)
console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
}
EXPORT_SYMBOL_GPL(console_verbose);
module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644);
MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc");
/**
* suspend_console - suspend the console subsystem
*
* This disables printk() while we go into suspend states
*/
void suspend_console(void)
{
if (!console_suspend_enabled)
return;
pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
console_lock();
console_suspended = 1;
up_console_sem();
}
void resume_console(void)
{
if (!console_suspend_enabled)
return;
down_console_sem();
console_suspended = 0;
console_unlock();
}
/**
* console_cpu_notify - print deferred console messages after CPU hotplug
* @cpu: unused
*
* If printk() is called from a CPU that is not online yet, the messages
* will be printed on the console only if there are CON_ANYTIME consoles.
* This function is called when a new CPU comes online (or fails to come
* up) or goes offline.
*/
static int console_cpu_notify(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
/* If trylock fails, someone else is doing the printing */
if (console_trylock())
console_unlock();
}
return 0;
}
/**
* console_lock - lock the console system for exclusive use.
*
* Acquires a lock which guarantees that the caller has
* exclusive access to the console system and the console_drivers list.
*
* Can sleep, returns nothing.
*/
void console_lock(void)
{
might_sleep();
down_console_sem();
if (console_suspended)
return;
console_locked = 1;
console_may_schedule = 1;
}
EXPORT_SYMBOL(console_lock);
/**
* console_trylock - try to lock the console system for exclusive use.
*
* Try to acquire a lock which guarantees that the caller has exclusive
* access to the console system and the console_drivers list.
*
* returns 1 on success, and 0 on failure to acquire the lock.
*/
int console_trylock(void)
{
if (down_trylock_console_sem())
return 0;
if (console_suspended) { up_console_sem(); return 0;
}
console_locked = 1;
console_may_schedule = 0;
return 1;
}
EXPORT_SYMBOL(console_trylock);
int is_console_locked(void)
{
return console_locked;
}
EXPORT_SYMBOL(is_console_locked);
/*
* Check if we have any console that is capable of printing while cpu is
* booting or shutting down. Requires console_sem.
*/
static int have_callable_console(void)
{
struct console *con;
for_each_console(con) if ((con->flags & CON_ENABLED) &&
(con->flags & CON_ANYTIME))
return 1;
return 0;
}
/*
* Can we actually use the console at this time on this cpu?
*
* Console drivers may assume that per-cpu resources have been allocated. So
* unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
* call them until this CPU is officially up.
*/
static inline int can_use_console(void)
{
return cpu_online(raw_smp_processor_id()) || have_callable_console();
}
/**
* console_unlock - unlock the console system
*
* Releases the console_lock which the caller holds on the console system
* and the console driver list.
*
* While the console_lock was held, console output may have been buffered
* by printk(). If this is the case, console_unlock(); emits
* the output prior to releasing the lock.
*
* If there is output waiting, we wake /dev/kmsg and syslog() users.
*
* console_unlock(); may be called from any context.
*/
void console_unlock(void)
{
static char ext_text[CONSOLE_EXT_LOG_MAX];
static char text[CONSOLE_LOG_MAX];
unsigned long flags;
bool do_cond_resched, retry;
struct printk_info info;
struct printk_record r;
u64 __maybe_unused next_seq;
if (console_suspended) { up_console_sem();
return;
}
prb_rec_init_rd(&r, &info, text, sizeof(text));
/*
* Console drivers are called with interrupts disabled, so
* @console_may_schedule should be cleared before; however, we may
* end up dumping a lot of lines, for example, if called from
* console registration path, and should invoke cond_resched()
* between lines if allowable. Not doing so can cause a very long
* scheduling stall on a slow console leading to RCU stall and
* softlockup warnings which exacerbate the issue with more
* messages practically incapacitating the system.
*
* console_trylock() is not able to detect the preemptive
* context reliably. Therefore the value must be stored before
* and cleared after the "again" goto label.
*/
do_cond_resched = console_may_schedule;
again:
console_may_schedule = 0;
/*
* We released the console_sem lock, so we need to recheck if
* cpu is online and (if not) is there at least one CON_ANYTIME
* console.
*/
if (!can_use_console()) {
console_locked = 0;
up_console_sem();
return;
}
for (;;) {
size_t ext_len = 0;
int handover;
size_t len;
skip:
if (!prb_read_valid(prb, console_seq, &r))
break;
if (console_seq != r.info->seq) { console_dropped += r.info->seq - console_seq;
console_seq = r.info->seq;
}
if (suppress_message_printing(r.info->level)) {
/*
* Skip record we have buffered and already printed
* directly to the console when we received it, and
* record that has level above the console loglevel.
*/
console_seq++;
goto skip;
}
/* Output to all consoles once old messages replayed. */
if (unlikely(exclusive_console &&
console_seq >= exclusive_console_stop_seq)) {
exclusive_console = NULL;
}
/*
* Handle extended console text first because later
* record_print_text() will modify the record buffer in-place.
*/
if (nr_ext_console_drivers) { ext_len = info_print_ext_header(ext_text,
sizeof(ext_text),
r.info);
ext_len += msg_print_ext_body(ext_text + ext_len,
sizeof(ext_text) - ext_len,
&r.text_buf[0],
r.info->text_len,
&r.info->dev_info);
}
len = record_print_text(&r,
console_msg_format & MSG_FORMAT_SYSLOG,
printk_time);
console_seq++;
/*
* While actively printing out messages, if another printk()
* were to occur on another CPU, it may wait for this one to
* finish. This task can not be preempted if there is a
* waiter waiting to take over.
*
* Interrupts are disabled because the hand over to a waiter
* must not be interrupted until the hand over is completed
* (@console_waiter is cleared).
*/
printk_safe_enter_irqsave(flags);
console_lock_spinning_enable();
stop_critical_timings(); /* don't trace print latency */
call_console_drivers(ext_text, ext_len, text, len);
start_critical_timings();
handover = console_lock_spinning_disable_and_check();
printk_safe_exit_irqrestore(flags);
if (handover)
return;
if (do_cond_resched)
cond_resched();
}
/* Get consistent value of the next-to-be-used sequence number. */
next_seq = console_seq;
console_locked = 0;
up_console_sem();
/*
* Someone could have filled up the buffer again, so re-check if there's
* something to flush. In case we cannot trylock the console_sem again,
* there's a new owner and the console_unlock() from them will do the
* flush, no worries.
*/
retry = prb_read_valid(prb, next_seq, NULL);
if (retry && console_trylock())
goto again;
}
EXPORT_SYMBOL(console_unlock);
/**
* console_conditional_schedule - yield the CPU if required
*
* If the console code is currently allowed to sleep, and
* if this CPU should yield the CPU to another task, do
* so here.
*
* Must be called within console_lock();.
*/
void __sched console_conditional_schedule(void)
{
if (console_may_schedule)
cond_resched();
}
EXPORT_SYMBOL(console_conditional_schedule);
void console_unblank(void)
{
struct console *c;
/*
* console_unblank can no longer be called in interrupt context unless
* oops_in_progress is set to 1..
*/
if (oops_in_progress) {
if (down_trylock_console_sem() != 0)
return;
} else
console_lock();
console_locked = 1;
console_may_schedule = 0;
for_each_console(c)
if ((c->flags & CON_ENABLED) && c->unblank)
c->unblank();
console_unlock();
}
/**
* console_flush_on_panic - flush console content on panic
* @mode: flush all messages in buffer or just the pending ones
*
* Immediately output all pending messages no matter what.
*/
void console_flush_on_panic(enum con_flush_mode mode)
{
/*
* If someone else is holding the console lock, trylock will fail
* and may_schedule may be set. Ignore and proceed to unlock so
* that messages are flushed out. As this can be called from any
* context and we don't want to get preempted while flushing,
* ensure may_schedule is cleared.
*/
console_trylock();
console_may_schedule = 0;
if (mode == CONSOLE_REPLAY_ALL)
console_seq = prb_first_valid_seq(prb);
console_unlock();
}
/*
* Return the console tty driver structure and its associated index
*/
struct tty_driver *console_device(int *index)
{
struct console *c;
struct tty_driver *driver = NULL;
console_lock();
for_each_console(c) {
if (!c->device)
continue;
driver = c->device(c, index);
if (driver)
break;
}
console_unlock();
return driver;
}
/*
* Prevent further output on the passed console device so that (for example)
* serial drivers can disable console output before suspending a port, and can
* re-enable output afterwards.
*/
void console_stop(struct console *console)
{
console_lock();
console->flags &= ~CON_ENABLED;
console_unlock();
}
EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
console_lock();
console->flags |= CON_ENABLED;
console_unlock();
}
EXPORT_SYMBOL(console_start);
static int __read_mostly keep_bootcon;
static int __init keep_bootcon_setup(char *str)
{
keep_bootcon = 1;
pr_info("debug: skip boot console de-registration.\n");
return 0;
}
early_param("keep_bootcon", keep_bootcon_setup);
/*
* This is called by register_console() to try to match
* the newly registered console with any of the ones selected
* by either the command line or add_preferred_console() and
* setup/enable it.
*
* Care need to be taken with consoles that are statically
* enabled such as netconsole
*/
static int try_enable_new_console(struct console *newcon, bool user_specified)
{
struct console_cmdline *c;
int i, err;
for (i = 0, c = console_cmdline;
i < MAX_CMDLINECONSOLES && c->name[0];
i++, c++) {
if (c->user_specified != user_specified)
continue;
if (!newcon->match ||
newcon->match(newcon, c->name, c->index, c->options) != 0) {
/* default matching */
BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
if (strcmp(c->name, newcon->name) != 0)
continue;
if (newcon->index >= 0 &&
newcon->index != c->index)
continue;
if (newcon->index < 0)
newcon->index = c->index;
if (_braille_register_console(newcon, c))
return 0;
if (newcon->setup &&
(err = newcon->setup(newcon, c->options)) != 0)
return err;
}
newcon->flags |= CON_ENABLED;
if (i == preferred_console) {
newcon->flags |= CON_CONSDEV;
has_preferred_console = true;
}
return 0;
}
/*
* Some consoles, such as pstore and netconsole, can be enabled even
* without matching. Accept the pre-enabled consoles only when match()
* and setup() had a chance to be called.
*/
if (newcon->flags & CON_ENABLED && c->user_specified == user_specified)
return 0;
return -ENOENT;
}
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
* print any messages that were printed by the kernel before the
* console driver was initialized.
*
* This can happen pretty early during the boot process (because of
* early_printk) - sometimes before setup_arch() completes - be careful
* of what kernel features are used - they may not be initialised yet.
*
* There are two types of consoles - bootconsoles (early_printk) and
* "real" consoles (everything which is not a bootconsole) which are
* handled differently.
* - Any number of bootconsoles can be registered at any time.
* - As soon as a "real" console is registered, all bootconsoles
* will be unregistered automatically.
* - Once a "real" console is registered, any attempt to register a
* bootconsoles will be rejected
*/
void register_console(struct console *newcon)
{
struct console *bcon = NULL;
int err;
for_each_console(bcon) {
if (WARN(bcon == newcon, "console '%s%d' already registered\n",
bcon->name, bcon->index))
return;
}
/*
* before we register a new CON_BOOT console, make sure we don't
* already have a valid console
*/
if (newcon->flags & CON_BOOT) {
for_each_console(bcon) {
if (!(bcon->flags & CON_BOOT)) {
pr_info("Too late to register bootconsole %s%d\n",
newcon->name, newcon->index);
return;
}
}
}
if (console_drivers && console_drivers->flags & CON_BOOT)
bcon = console_drivers;
if (!has_preferred_console || bcon || !console_drivers)
has_preferred_console = preferred_console >= 0;
/*
* See if we want to use this console driver. If we
* didn't select a console we take the first one
* that registers here.
*/
if (!has_preferred_console) {
if (newcon->index < 0)
newcon->index = 0;
if (newcon->setup == NULL ||
newcon->setup(newcon, NULL) == 0) {
newcon->flags |= CON_ENABLED;
if (newcon->device) {
newcon->flags |= CON_CONSDEV;
has_preferred_console = true;
}
}
}
/* See if this console matches one we selected on the command line */
err = try_enable_new_console(newcon, true);
/* If not, try to match against the platform default(s) */
if (err == -ENOENT)
err = try_enable_new_console(newcon, false);
/* printk() messages are not printed to the Braille console. */
if (err || newcon->flags & CON_BRL)
return;
/*
* If we have a bootconsole, and are switching to a real console,
* don't print everything out again, since when the boot console, and
* the real console are the same physical device, it's annoying to
* see the beginning boot messages twice
*/
if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
newcon->flags &= ~CON_PRINTBUFFER;
/*
* Put this console in the list - keep the
* preferred driver at the head of the list.
*/
console_lock();
if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
newcon->next = console_drivers;
console_drivers = newcon;
if (newcon->next)
newcon->next->flags &= ~CON_CONSDEV;
/* Ensure this flag is always set for the head of the list */
newcon->flags |= CON_CONSDEV;
} else {
newcon->next = console_drivers->next;
console_drivers->next = newcon;
}
if (newcon->flags & CON_EXTENDED)
nr_ext_console_drivers++;
if (newcon->flags & CON_PRINTBUFFER) {
/*
* console_unlock(); will print out the buffered messages
* for us.
*
* We're about to replay the log buffer. Only do this to the
* just-registered console to avoid excessive message spam to
* the already-registered consoles.
*
* Set exclusive_console with disabled interrupts to reduce
* race window with eventual console_flush_on_panic() that
* ignores console_lock.
*/
exclusive_console = newcon;
exclusive_console_stop_seq = console_seq;
/* Get a consistent copy of @syslog_seq. */
mutex_lock(&syslog_lock);
console_seq = syslog_seq;
mutex_unlock(&syslog_lock);
}
console_unlock();
console_sysfs_notify();
/*
* By unregistering the bootconsoles after we enable the real console
* we get the "console xxx enabled" message on all the consoles -
* boot consoles, real consoles, etc - this is to ensure that end
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
*/
pr_info("%sconsole [%s%d] enabled\n",
(newcon->flags & CON_BOOT) ? "boot" : "" ,
newcon->name, newcon->index);
if (bcon &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
!keep_bootcon) {
/* We need to iterate through all boot consoles, to make
* sure we print everything out, before we unregister them.
*/
for_each_console(bcon)
if (bcon->flags & CON_BOOT)
unregister_console(bcon);
}
}
EXPORT_SYMBOL(register_console);
int unregister_console(struct console *console)
{
struct console *con;
int res;
pr_info("%sconsole [%s%d] disabled\n",
(console->flags & CON_BOOT) ? "boot" : "" ,
console->name, console->index);
res = _braille_unregister_console(console);
if (res < 0)
return res;
if (res > 0)
return 0;
res = -ENODEV;
console_lock();
if (console_drivers == console) {
console_drivers=console->next;
res = 0;
} else {
for_each_console(con) {
if (con->next == console) {
con->next = console->next;
res = 0;
break;
}
}
}
if (res)
goto out_disable_unlock;
if (console->flags & CON_EXTENDED)
nr_ext_console_drivers--;
/*
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
*/
if (console_drivers != NULL && console->flags & CON_CONSDEV)
console_drivers->flags |= CON_CONSDEV;
console->flags &= ~CON_ENABLED;
console_unlock();
console_sysfs_notify();
if (console->exit)
res = console->exit(console);
return res;
out_disable_unlock:
console->flags &= ~CON_ENABLED;
console_unlock();
return res;
}
EXPORT_SYMBOL(unregister_console);
/*
* Initialize the console device. This is called *early*, so
* we can't necessarily depend on lots of kernel help here.
* Just do some early initializations, and do the complex setup
* later.
*/
void __init console_init(void)
{
int ret;
initcall_t call;
initcall_entry_t *ce;
/* Setup the default TTY line discipline. */
n_tty_init();
/*
* set up the console device so that later boot sequences can
* inform about problems etc..
*/
ce = __con_initcall_start;
trace_initcall_level("console");
while (ce < __con_initcall_end) {
call = initcall_from_entry(ce);
trace_initcall_start(call);
ret = call();
trace_initcall_finish(call, ret);
ce++;
}
}
/*
* Some boot consoles access data that is in the init section and which will
* be discarded after the initcalls have been run. To make sure that no code
* will access this data, unregister the boot consoles in a late initcall.
*
* If for some reason, such as deferred probe or the driver being a loadable
* module, the real console hasn't registered yet at this point, there will
* be a brief interval in which no messages are logged to the console, which
* makes it difficult to diagnose problems that occur during this time.
*
* To mitigate this problem somewhat, only unregister consoles whose memory
* intersects with the init section. Note that all other boot consoles will
* get unregistered when the real preferred console is registered.
*/
static int __init printk_late_init(void)
{
struct console *con;
int ret;
for_each_console(con) {
if (!(con->flags & CON_BOOT))
continue;
/* Check addresses that might be used for enabled consoles. */
if (init_section_intersects(con, sizeof(*con)) ||
init_section_contains(con->write, 0) ||
init_section_contains(con->read, 0) ||
init_section_contains(con->device, 0) ||
init_section_contains(con->unblank, 0) ||
init_section_contains(con->data, 0)) {
/*
* Please, consider moving the reported consoles out
* of the init section.
*/
pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
con->name, con->index);
unregister_console(con);
}
}
ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
console_cpu_notify);
WARN_ON(ret < 0);
ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online",
console_cpu_notify, NULL);
WARN_ON(ret < 0);
return 0;
}
late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
/*
* Delayed printk version, for scheduler-internal messages:
*/
#define PRINTK_PENDING_WAKEUP 0x01
#define PRINTK_PENDING_OUTPUT 0x02
static DEFINE_PER_CPU(int, printk_pending);
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
int pending = __this_cpu_xchg(printk_pending, 0);
if (pending & PRINTK_PENDING_OUTPUT) {
/* If trylock fails, someone else is doing the printing */
if (console_trylock())
console_unlock();
}
if (pending & PRINTK_PENDING_WAKEUP)
wake_up_interruptible(&log_wait);
}
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);
void wake_up_klogd(void)
{
if (!printk_percpu_data_ready())
return;
preempt_disable();
if (waitqueue_active(&log_wait)) {
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
}
preempt_enable();
}
void defer_console_output(void)
{
if (!printk_percpu_data_ready())
return;
preempt_disable();
__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
preempt_enable();
}
void printk_trigger_flush(void)
{
defer_console_output();
}
int vprintk_deferred(const char *fmt, va_list args)
{
int r;
r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
defer_console_output();
return r;
}
int _printk_deferred(const char *fmt, ...)
{
va_list args;
int r;
va_start(args, fmt);
r = vprintk_deferred(fmt, args);
va_end(args);
return r;
}
/*
* printk rate limiting, lifted from the networking subsystem.
*
* This enforces a rate limit: not more than 10 kernel messages
* every 5s to make a denial-of-service attack impossible.
*/
DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
int __printk_ratelimit(const char *func)
{
return ___ratelimit(&printk_ratelimit_state, func);
}
EXPORT_SYMBOL(__printk_ratelimit);
/**
* printk_timed_ratelimit - caller-controlled printk ratelimiting
* @caller_jiffies: pointer to caller's state
* @interval_msecs: minimum interval between prints
*
* printk_timed_ratelimit() returns true if more than @interval_msecs
* milliseconds have elapsed since the last time printk_timed_ratelimit()
* returned true.
*/
bool printk_timed_ratelimit(unsigned long *caller_jiffies,
unsigned int interval_msecs)
{
unsigned long elapsed = jiffies - *caller_jiffies;
if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs))
return false;
*caller_jiffies = jiffies;
return true;
}
EXPORT_SYMBOL(printk_timed_ratelimit);
static DEFINE_SPINLOCK(dump_list_lock);
static LIST_HEAD(dump_list);
/**
* kmsg_dump_register - register a kernel log dumper.
* @dumper: pointer to the kmsg_dumper structure
*
* Adds a kernel log dumper to the system. The dump callback in the
* structure will be called when the kernel oopses or panics and must be
* set. Returns zero on success and %-EINVAL or %-EBUSY otherwise.
*/
int kmsg_dump_register(struct kmsg_dumper *dumper)
{
unsigned long flags;
int err = -EBUSY;
/* The dump callback needs to be set */
if (!dumper->dump)
return -EINVAL;
spin_lock_irqsave(&dump_list_lock, flags);
/* Don't allow registering multiple times */
if (!dumper->registered) {
dumper->registered = 1;
list_add_tail_rcu(&dumper->list, &dump_list);
err = 0;
}
spin_unlock_irqrestore(&dump_list_lock, flags);
return err;
}
EXPORT_SYMBOL_GPL(kmsg_dump_register);
/**
* kmsg_dump_unregister - unregister a kmsg dumper.
* @dumper: pointer to the kmsg_dumper structure
*
* Removes a dump device from the system. Returns zero on success and
* %-EINVAL otherwise.
*/
int kmsg_dump_unregister(struct kmsg_dumper *dumper)
{
unsigned long flags;
int err = -EINVAL;
spin_lock_irqsave(&dump_list_lock, flags);
if (dumper->registered) {
dumper->registered = 0;
list_del_rcu(&dumper->list);
err = 0;
}
spin_unlock_irqrestore(&dump_list_lock, flags);
synchronize_rcu();
return err;
}
EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
static bool always_kmsg_dump;
module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason)
{
switch (reason) {
case KMSG_DUMP_PANIC:
return "Panic";
case KMSG_DUMP_OOPS:
return "Oops";
case KMSG_DUMP_EMERG:
return "Emergency";
case KMSG_DUMP_SHUTDOWN:
return "Shutdown";
default:
return "Unknown";
}
}
EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
/**
* kmsg_dump - dump kernel log to kernel message dumpers.
* @reason: the reason (oops, panic etc) for dumping
*
* Call each of the registered dumper's dump() callback, which can
* retrieve the kmsg records with kmsg_dump_get_line() or
* kmsg_dump_get_buffer().
*/
void kmsg_dump(enum kmsg_dump_reason reason)
{
struct kmsg_dumper *dumper;
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list) {
enum kmsg_dump_reason max_reason = dumper->max_reason;
/*
* If client has not provided a specific max_reason, default
* to KMSG_DUMP_OOPS, unless always_kmsg_dump was set.
*/
if (max_reason == KMSG_DUMP_UNDEF) {
max_reason = always_kmsg_dump ? KMSG_DUMP_MAX :
KMSG_DUMP_OOPS;
}
if (reason > max_reason)
continue;
/* invoke dumper which will iterate over records */
dumper->dump(dumper, reason);
}
rcu_read_unlock();
}
/**
* kmsg_dump_get_line - retrieve one kmsg log line
* @iter: kmsg dump iterator
* @syslog: include the "<4>" prefixes
* @line: buffer to copy the line to
* @size: maximum size of the buffer
* @len: length of line placed into buffer
*
* Start at the beginning of the kmsg buffer, with the oldest kmsg
* record, and copy one record into the provided buffer.
*
* Consecutive calls will return the next available record moving
* towards the end of the buffer with the youngest messages.
*
* A return value of FALSE indicates that there are no more records to
* read.
*/
bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
char *line, size_t size, size_t *len)
{
u64 min_seq = latched_seq_read_nolock(&clear_seq);
struct printk_info info;
unsigned int line_count;
struct printk_record r;
size_t l = 0;
bool ret = false;
if (iter->cur_seq < min_seq)
iter->cur_seq = min_seq;
prb_rec_init_rd(&r, &info, line, size);
/* Read text or count text lines? */
if (line) {
if (!prb_read_valid(prb, iter->cur_seq, &r))
goto out;
l = record_print_text(&r, syslog, printk_time);
} else {
if (!prb_read_valid_info(prb, iter->cur_seq,
&info, &line_count)) {
goto out;
}
l = get_record_print_text_size(&info, line_count, syslog,
printk_time);
}
iter->cur_seq = r.info->seq + 1;
ret = true;
out:
if (len)
*len = l;
return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
/**
* kmsg_dump_get_buffer - copy kmsg log lines
* @iter: kmsg dump iterator
* @syslog: include the "<4>" prefixes
* @buf: buffer to copy the line to
* @size: maximum size of the buffer
* @len_out: length of line placed into buffer
*
* Start at the end of the kmsg buffer and fill the provided buffer
* with as many of the *youngest* kmsg records that fit into it.
* If the buffer is large enough, all available kmsg records will be
* copied with a single call.
*
* Consecutive calls will fill the buffer with the next block of
* available older records, not including the earlier retrieved ones.
*
* A return value of FALSE indicates that there are no more records to
* read.
*/
bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
char *buf, size_t size, size_t *len_out)
{
u64 min_seq = latched_seq_read_nolock(&clear_seq);
struct printk_info info;
struct printk_record r;
u64 seq;
u64 next_seq;
size_t len = 0;
bool ret = false;
bool time = printk_time;
if (!buf || !size)
goto out;
if (iter->cur_seq < min_seq)
iter->cur_seq = min_seq;
if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
if (info.seq != iter->cur_seq) {
/* messages are gone, move to first available one */
iter->cur_seq = info.seq;
}
}
/* last entry */
if (iter->cur_seq >= iter->next_seq)
goto out;
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump. Pass in size-1
* because this function (by way of record_print_text()) will
* not write more than size-1 bytes of text into @buf.
*/
seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq,
size - 1, syslog, time);
/*
* Next kmsg_dump_get_buffer() invocation will dump block of
* older records stored right before this one.
*/
next_seq = seq;
prb_rec_init_rd(&r, &info, buf, size);
len = 0;
prb_for_each_record(seq, prb, seq, &r) {
if (r.info->seq >= iter->next_seq)
break;
len += record_print_text(&r, syslog, time);
/* Adjust record to store to remaining buffer space. */
prb_rec_init_rd(&r, &info, buf + len, size - len);
}
iter->next_seq = next_seq;
ret = true;
out:
if (len_out)
*len_out = len;
return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
/**
* kmsg_dump_rewind - reset the iterator
* @iter: kmsg dump iterator
*
* Reset the dumper's iterator so that kmsg_dump_get_line() and
* kmsg_dump_get_buffer() can be called again and used multiple
* times within the same dumper.dump() callback.
*/
void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
{
iter->cur_seq = latched_seq_read_nolock(&clear_seq);
iter->next_seq = prb_next_seq(prb);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
#endif
#ifdef CONFIG_SMP
static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1);
static atomic_t printk_cpulock_nested = ATOMIC_INIT(0);
/**
* __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant
* spinning lock is not owned by any CPU.
*
* Context: Any context.
*/
void __printk_wait_on_cpu_lock(void)
{
do {
cpu_relax();
} while (atomic_read(&printk_cpulock_owner) != -1);
}
EXPORT_SYMBOL(__printk_wait_on_cpu_lock);
/**
* __printk_cpu_trylock() - Try to acquire the printk cpu-reentrant
* spinning lock.
*
* If no processor has the lock, the calling processor takes the lock and
* becomes the owner. If the calling processor is already the owner of the
* lock, this function succeeds immediately.
*
* Context: Any context. Expects interrupts to be disabled.
* Return: 1 on success, otherwise 0.
*/
int __printk_cpu_trylock(void)
{
int cpu;
int old;
cpu = smp_processor_id();
/*
* Guarantee loads and stores from this CPU when it is the lock owner
* are _not_ visible to the previous lock owner. This pairs with
* __printk_cpu_unlock:B.
*
* Memory barrier involvement:
*
* If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B, then
* __printk_cpu_unlock:A can never read from __printk_cpu_trylock:B.
*
* Relies on:
*
* RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B
* of the previous CPU
* matching
* ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B
* of this CPU
*/
old = atomic_cmpxchg_acquire(&printk_cpulock_owner, -1,
cpu); /* LMM(__printk_cpu_trylock:A) */
if (old == -1) {
/*
* This CPU is now the owner and begins loading/storing
* data: LMM(__printk_cpu_trylock:B)
*/
return 1;
} else if (old == cpu) {
/* This CPU is already the owner. */
atomic_inc(&printk_cpulock_nested);
return 1;
}
return 0;
}
EXPORT_SYMBOL(__printk_cpu_trylock);
/**
* __printk_cpu_unlock() - Release the printk cpu-reentrant spinning lock.
*
* The calling processor must be the owner of the lock.
*
* Context: Any context. Expects interrupts to be disabled.
*/
void __printk_cpu_unlock(void)
{
if (atomic_read(&printk_cpulock_nested)) {
atomic_dec(&printk_cpulock_nested);
return;
}
/*
* This CPU is finished loading/storing data:
* LMM(__printk_cpu_unlock:A)
*/
/*
* Guarantee loads and stores from this CPU when it was the
* lock owner are visible to the next lock owner. This pairs
* with __printk_cpu_trylock:A.
*
* Memory barrier involvement:
*
* If __printk_cpu_trylock:A reads from __printk_cpu_unlock:B,
* then __printk_cpu_trylock:B reads from __printk_cpu_unlock:A.
*
* Relies on:
*
* RELEASE from __printk_cpu_unlock:A to __printk_cpu_unlock:B
* of this CPU
* matching
* ACQUIRE from __printk_cpu_trylock:A to __printk_cpu_trylock:B
* of the next CPU
*/
atomic_set_release(&printk_cpulock_owner,
-1); /* LMM(__printk_cpu_unlock:B) */
}
EXPORT_SYMBOL(__printk_cpu_unlock);
#endif /* CONFIG_SMP */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BITMAP_H
#define __LINUX_BITMAP_H
#ifndef __ASSEMBLY__
#include <linux/align.h>
#include <linux/bitops.h>
#include <linux/limits.h>
#include <linux/string.h>
#include <linux/types.h>
struct device;
/*
* bitmaps provide bit arrays that consume one or more unsigned
* longs. The bitmap interface and available operations are listed
* here, in bitmap.h
*
* Function implementations generic to all architectures are in
* lib/bitmap.c. Functions implementations that are architecture
* specific are in various include/asm-<arch>/bitops.h headers
* and other arch/<arch> specific files.
*
* See lib/bitmap.c for more details.
*/
/**
* DOC: bitmap overview
*
* The available bitmap operations and their rough meaning in the
* case that the bitmap is a single unsigned long are thus:
*
* The generated code is more efficient when nbits is known at
* compile-time and at most BITS_PER_LONG.
*
* ::
*
* bitmap_zero(dst, nbits) *dst = 0UL
* bitmap_fill(dst, nbits) *dst = ~0UL
* bitmap_copy(dst, src, nbits) *dst = *src
* bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2
* bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2
* bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2
* bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2)
* bitmap_complement(dst, src, nbits) *dst = ~(*src)
* bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal?
* bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap?
* bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2?
* bitmap_empty(src, nbits) Are all bits zero in *src?
* bitmap_full(src, nbits) Are all bits set in *src?
* bitmap_weight(src, nbits) Hamming Weight: number set bits
* bitmap_set(dst, pos, nbits) Set specified bit area
* bitmap_clear(dst, pos, nbits) Clear specified bit area
* bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area
* bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off) as above
* bitmap_next_clear_region(map, &start, &end, nbits) Find next clear region
* bitmap_next_set_region(map, &start, &end, nbits) Find next set region
* bitmap_for_each_clear_region(map, rs, re, start, end)
* Iterate over all clear regions
* bitmap_for_each_set_region(map, rs, re, start, end)
* Iterate over all set regions
* bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n
* bitmap_shift_left(dst, src, n, nbits) *dst = *src << n
* bitmap_cut(dst, src, first, n, nbits) Cut n bits from first, copy rest
* bitmap_replace(dst, old, new, mask, nbits) *dst = (*old & ~(*mask)) | (*new & *mask)
* bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
* bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit)
* bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap
* bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz
* bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf
* bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf
* bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf
* bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf
* bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region
* bitmap_release_region(bitmap, pos, order) Free specified bit region
* bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region
* bitmap_from_arr32(dst, buf, nbits) Copy nbits from u32[] buf to dst
* bitmap_to_arr32(buf, src, nbits) Copy nbits from buf to u32[] dst
* bitmap_get_value8(map, start) Get 8bit value from map at start
* bitmap_set_value8(map, value, start) Set 8bit value to map at start
*
* Note, bitmap_zero() and bitmap_fill() operate over the region of
* unsigned longs, that is, bits behind bitmap till the unsigned long
* boundary will be zeroed or filled as well. Consider to use
* bitmap_clear() or bitmap_set() to make explicit zeroing or filling
* respectively.
*/
/**
* DOC: bitmap bitops
*
* Also the following operations in asm/bitops.h apply to bitmaps.::
*
* set_bit(bit, addr) *addr |= bit
* clear_bit(bit, addr) *addr &= ~bit
* change_bit(bit, addr) *addr ^= bit
* test_bit(bit, addr) Is bit set in *addr?
* test_and_set_bit(bit, addr) Set bit and return old value
* test_and_clear_bit(bit, addr) Clear bit and return old value
* test_and_change_bit(bit, addr) Change bit and return old value
* find_first_zero_bit(addr, nbits) Position first zero bit in *addr
* find_first_bit(addr, nbits) Position first set bit in *addr
* find_next_zero_bit(addr, nbits, bit)
* Position next zero bit in *addr >= bit
* find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit
* find_next_and_bit(addr1, addr2, nbits, bit)
* Same as find_next_bit, but in
* (*addr1 & *addr2)
*
*/
/**
* DOC: declare bitmap
* The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used
* to declare an array named 'name' of just enough unsigned longs to
* contain all bit positions from 0 to 'bits' - 1.
*/
/*
* Allocation and deallocation of bitmap.
* Provided in lib/bitmap.c to avoid circular dependency.
*/
unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags);
unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags);
void bitmap_free(const unsigned long *bitmap);
/* Managed variants of the above. */
unsigned long *devm_bitmap_alloc(struct device *dev,
unsigned int nbits, gfp_t flags);
unsigned long *devm_bitmap_zalloc(struct device *dev,
unsigned int nbits, gfp_t flags);
/*
* lib/bitmap.c provides these functions:
*/
int __bitmap_equal(const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
bool __pure __bitmap_or_equal(const unsigned long *src1,
const unsigned long *src2,
const unsigned long *src3,
unsigned int nbits);
void __bitmap_complement(unsigned long *dst, const unsigned long *src,
unsigned int nbits);
void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
unsigned int shift, unsigned int nbits);
void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
unsigned int shift, unsigned int nbits);
void bitmap_cut(unsigned long *dst, const unsigned long *src,
unsigned int first, unsigned int cut, unsigned int nbits);
int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_replace(unsigned long *dst,
const unsigned long *old, const unsigned long *new,
const unsigned long *mask, unsigned int nbits);
int __bitmap_intersects(const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
int __bitmap_subset(const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
void __bitmap_set(unsigned long *map, unsigned int start, int len);
void __bitmap_clear(unsigned long *map, unsigned int start, int len);
unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
unsigned long size,
unsigned long start,
unsigned int nr,
unsigned long align_mask,
unsigned long align_offset);
/**
* bitmap_find_next_zero_area - find a contiguous aligned zero area
* @map: The address to base the search on
* @size: The bitmap size in bits
* @start: The bitnumber to start searching at
* @nr: The number of zeroed bits we're looking for
* @align_mask: Alignment mask for zero area
*
* The @align_mask should be one less than a power of 2; the effect is that
* the bit offset of all zero areas this function finds is multiples of that
* power of 2. A @align_mask of 0 means no alignment is required.
*/
static inline unsigned long
bitmap_find_next_zero_area(unsigned long *map,
unsigned long size,
unsigned long start,
unsigned int nr,
unsigned long align_mask)
{
return bitmap_find_next_zero_area_off(map, size, start, nr,
align_mask, 0);
}
int bitmap_parse(const char *buf, unsigned int buflen,
unsigned long *dst, int nbits);
int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
unsigned long *dst, int nbits);
int bitmap_parselist(const char *buf, unsigned long *maskp,
int nmaskbits);
int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
unsigned long *dst, int nbits);
void bitmap_remap(unsigned long *dst, const unsigned long *src,
const unsigned long *old, const unsigned long *new, unsigned int nbits);
int bitmap_bitremap(int oldbit,
const unsigned long *old, const unsigned long *new, int bits);
void bitmap_onto(unsigned long *dst, const unsigned long *orig,
const unsigned long *relmap, unsigned int bits);
void bitmap_fold(unsigned long *dst, const unsigned long *orig,
unsigned int sz, unsigned int nbits);
int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order);
void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order);
int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order);
#ifdef __BIG_ENDIAN
void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits);
#else
#define bitmap_copy_le bitmap_copy
#endif
unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits);
int bitmap_print_to_pagebuf(bool list, char *buf,
const unsigned long *maskp, int nmaskbits);
extern int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp,
int nmaskbits, loff_t off, size_t count);
extern int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp,
int nmaskbits, loff_t off, size_t count);
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
{
unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
memset(dst, 0, len);
}
static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
{
unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
memset(dst, 0xff, len);
}
static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
unsigned int nbits)
{
unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
memcpy(dst, src, len);
}
/*
* Copy bitmap and clear tail bits in last word.
*/
static inline void bitmap_copy_clear_tail(unsigned long *dst,
const unsigned long *src, unsigned int nbits)
{
bitmap_copy(dst, src, nbits);
if (nbits % BITS_PER_LONG)
dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits);
}
/*
* On 32-bit systems bitmaps are represented as u32 arrays internally, and
* therefore conversion is not needed when copying data from/to arrays of u32.
*/
#if BITS_PER_LONG == 64
void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf,
unsigned int nbits);
void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap,
unsigned int nbits);
#else
#define bitmap_from_arr32(bitmap, buf, nbits) \
bitmap_copy_clear_tail((unsigned long *) (bitmap), \
(const unsigned long *) (buf), (nbits))
#define bitmap_to_arr32(buf, bitmap, nbits) \
bitmap_copy_clear_tail((unsigned long *) (buf), \
(const unsigned long *) (bitmap), (nbits))
#endif
static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, unsigned int nbits)
{
if (small_const_nbits(nbits))
return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0;
return __bitmap_and(dst, src1, src2, nbits);
}
static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, unsigned int nbits)
{
if (small_const_nbits(nbits))
*dst = *src1 | *src2;
else
__bitmap_or(dst, src1, src2, nbits);
}
static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, unsigned int nbits)
{
if (small_const_nbits(nbits))
*dst = *src1 ^ *src2;
else
__bitmap_xor(dst, src1, src2, nbits);
}
static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, unsigned int nbits)
{
if (small_const_nbits(nbits))
return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
return __bitmap_andnot(dst, src1, src2, nbits);
}
static inline void bitmap_complement(unsigned long *dst, const unsigned long *src,
unsigned int nbits)
{
if (small_const_nbits(nbits))
*dst = ~(*src);
else
__bitmap_complement(dst, src, nbits);
}
#ifdef __LITTLE_ENDIAN
#define BITMAP_MEM_ALIGNMENT 8
#else
#define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long))
#endif
#define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1)
static inline int bitmap_equal(const unsigned long *src1,
const unsigned long *src2, unsigned int nbits)
{
if (small_const_nbits(nbits))
return !((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits));
if (__builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT))
return !memcmp(src1, src2, nbits / 8);
return __bitmap_equal(src1, src2, nbits);
}
/**
* bitmap_or_equal - Check whether the or of two bitmaps is equal to a third
* @src1: Pointer to bitmap 1
* @src2: Pointer to bitmap 2 will be or'ed with bitmap 1
* @src3: Pointer to bitmap 3. Compare to the result of *@src1 | *@src2
* @nbits: number of bits in each of these bitmaps
*
* Returns: True if (*@src1 | *@src2) == *@src3, false otherwise
*/
static inline bool bitmap_or_equal(const unsigned long *src1,
const unsigned long *src2,
const unsigned long *src3,
unsigned int nbits)
{
if (!small_const_nbits(nbits))
return __bitmap_or_equal(src1, src2, src3, nbits);
return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits));
}
static inline int bitmap_intersects(const unsigned long *src1,
const unsigned long *src2, unsigned int nbits)
{
if (small_const_nbits(nbits))
return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
else
return __bitmap_intersects(src1, src2, nbits);
}
static inline int bitmap_subset(const unsigned long *src1,
const unsigned long *src2, unsigned int nbits)
{
if (small_const_nbits(nbits))
return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_subset(src1, src2, nbits);
}
static inline bool bitmap_empty(const unsigned long *src, unsigned nbits)
{
if (small_const_nbits(nbits))
return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
return find_first_bit(src, nbits) == nbits;
}
static inline bool bitmap_full(const unsigned long *src, unsigned int nbits)
{
if (small_const_nbits(nbits))
return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits));
return find_first_zero_bit(src, nbits) == nbits;
}
static __always_inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
{
if (small_const_nbits(nbits))
return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
return __bitmap_weight(src, nbits);
}
static __always_inline void bitmap_set(unsigned long *map, unsigned int start,
unsigned int nbits)
{
if (__builtin_constant_p(nbits) && nbits == 1)
__set_bit(start, map);
else if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
__builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT))
memset((char *)map + start / 8, 0xff, nbits / 8);
else
__bitmap_set(map, start, nbits);
}
static __always_inline void bitmap_clear(unsigned long *map, unsigned int start,
unsigned int nbits)
{
if (__builtin_constant_p(nbits) && nbits == 1)
__clear_bit(start, map);
else if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
__builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT))
memset((char *)map + start / 8, 0, nbits / 8);
else
__bitmap_clear(map, start, nbits);
}
static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src,
unsigned int shift, unsigned int nbits)
{
if (small_const_nbits(nbits))
*dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift;
else
__bitmap_shift_right(dst, src, shift, nbits);
}
static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src,
unsigned int shift, unsigned int nbits)
{
if (small_const_nbits(nbits))
*dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits);
else
__bitmap_shift_left(dst, src, shift, nbits);
}
static inline void bitmap_replace(unsigned long *dst,
const unsigned long *old,
const unsigned long *new,
const unsigned long *mask,
unsigned int nbits)
{
if (small_const_nbits(nbits))
*dst = (*old & ~(*mask)) | (*new & *mask);
else
__bitmap_replace(dst, old, new, mask, nbits);
}
static inline void bitmap_next_clear_region(unsigned long *bitmap,
unsigned int *rs, unsigned int *re,
unsigned int end)
{
*rs = find_next_zero_bit(bitmap, end, *rs);
*re = find_next_bit(bitmap, end, *rs + 1);
}
static inline void bitmap_next_set_region(unsigned long *bitmap,
unsigned int *rs, unsigned int *re,
unsigned int end)
{
*rs = find_next_bit(bitmap, end, *rs);
*re = find_next_zero_bit(bitmap, end, *rs + 1);
}
/*
* Bitmap region iterators. Iterates over the bitmap between [@start, @end).
* @rs and @re should be integer variables and will be set to start and end
* index of the current clear or set region.
*/
#define bitmap_for_each_clear_region(bitmap, rs, re, start, end) \
for ((rs) = (start), \
bitmap_next_clear_region((bitmap), &(rs), &(re), (end)); \
(rs) < (re); \
(rs) = (re) + 1, \
bitmap_next_clear_region((bitmap), &(rs), &(re), (end)))
#define bitmap_for_each_set_region(bitmap, rs, re, start, end) \
for ((rs) = (start), \
bitmap_next_set_region((bitmap), &(rs), &(re), (end)); \
(rs) < (re); \
(rs) = (re) + 1, \
bitmap_next_set_region((bitmap), &(rs), &(re), (end)))
/**
* BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap.
* @n: u64 value
*
* Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit
* integers in 32-bit environment, and 64-bit integers in 64-bit one.
*
* There are four combinations of endianness and length of the word in linux
* ABIs: LE64, BE64, LE32 and BE32.
*
* On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in
* bitmaps and therefore don't require any special handling.
*
* On 32-bit kernels 32-bit LE ABI orders lo word of 64-bit number in memory
* prior to hi, and 32-bit BE orders hi word prior to lo. The bitmap on the
* other hand is represented as an array of 32-bit words and the position of
* bit N may therefore be calculated as: word #(N/32) and bit #(N%32) in that
* word. For example, bit #42 is located at 10th position of 2nd word.
* It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit
* values in memory as it usually does. But for BE we need to swap hi and lo
* words manually.
*
* With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and
* lo parts of u64. For LE32 it does nothing, and for BE environment it swaps
* hi and lo words, as is expected by bitmap.
*/
#if __BITS_PER_LONG == 64
#define BITMAP_FROM_U64(n) (n)
#else
#define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \
((unsigned long) ((u64)(n) >> 32))
#endif
/**
* bitmap_from_u64 - Check and swap words within u64.
* @mask: source bitmap
* @dst: destination bitmap
*
* In 32-bit Big Endian kernel, when using ``(u32 *)(&val)[*]``
* to read u64 mask, we will get the wrong word.
* That is ``(u32 *)(&val)[0]`` gets the upper 32 bits,
* but we expect the lower 32-bits of u64.
*/
static inline void bitmap_from_u64(unsigned long *dst, u64 mask)
{
dst[0] = mask & ULONG_MAX;
if (sizeof(mask) > sizeof(unsigned long))
dst[1] = mask >> 32;
}
/**
* bitmap_get_value8 - get an 8-bit value within a memory region
* @map: address to the bitmap memory region
* @start: bit offset of the 8-bit value; must be a multiple of 8
*
* Returns the 8-bit value located at the @start bit offset within the @src
* memory region.
*/
static inline unsigned long bitmap_get_value8(const unsigned long *map,
unsigned long start)
{
const size_t index = BIT_WORD(start);
const unsigned long offset = start % BITS_PER_LONG;
return (map[index] >> offset) & 0xFF;
}
/**
* bitmap_set_value8 - set an 8-bit value within a memory region
* @map: address to the bitmap memory region
* @value: the 8-bit value; values wider than 8 bits may clobber bitmap
* @start: bit offset of the 8-bit value; must be a multiple of 8
*/
static inline void bitmap_set_value8(unsigned long *map, unsigned long value,
unsigned long start)
{
const size_t index = BIT_WORD(start);
const unsigned long offset = start % BITS_PER_LONG;
map[index] &= ~(0xFFUL << offset);
map[index] |= value << offset;
}
#endif /* __ASSEMBLY__ */
#endif /* __LINUX_BITMAP_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2009 Oracle. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
/*
* Relocation overview
*
* [What does relocation do]
*
* The objective of relocation is to relocate all extents of the target block
* group to other block groups.
* This is utilized by resize (shrink only), profile converting, compacting
* space, or balance routine to spread chunks over devices.
*
* Before | After
* ------------------------------------------------------------------
* BG A: 10 data extents | BG A: deleted
* BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated)
* BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated)
*
* [How does relocation work]
*
* 1. Mark the target block group read-only
* New extents won't be allocated from the target block group.
*
* 2.1 Record each extent in the target block group
* To build a proper map of extents to be relocated.
*
* 2.2 Build data reloc tree and reloc trees
* Data reloc tree will contain an inode, recording all newly relocated
* data extents.
* There will be only one data reloc tree for one data block group.
*
* Reloc tree will be a special snapshot of its source tree, containing
* relocated tree blocks.
* Each tree referring to a tree block in target block group will get its
* reloc tree built.
*
* 2.3 Swap source tree with its corresponding reloc tree
* Each involved tree only refers to new extents after swap.
*
* 3. Cleanup reloc trees and data reloc tree.
* As old extents in the target block group are still referenced by reloc
* trees, we need to clean them up before really freeing the target block
* group.
*
* The main complexity is in steps 2.2 and 2.3.
*
* The entry point of relocation is relocate_block_group() function.
*/
#define RELOCATION_RESERVED_NODES 256
/*
* map address of tree root to tree
*/
struct mapping_node {
struct {
struct rb_node rb_node;
u64 bytenr;
}; /* Use rb_simle_node for search/insert */
void *data;
};
struct mapping_tree {
struct rb_root rb_root;
spinlock_t lock;
};
/*
* present a tree block to process
*/
struct tree_block {
struct {
struct rb_node rb_node;
u64 bytenr;
}; /* Use rb_simple_node for search/insert */
u64 owner;
struct btrfs_key key;
unsigned int level:8;
unsigned int key_ready:1;
};
#define MAX_EXTENTS 128
struct file_extent_cluster {
u64 start;
u64 end;
u64 boundary[MAX_EXTENTS];
unsigned int nr;
};
struct reloc_control {
/* block group to relocate */
struct btrfs_block_group *block_group;
/* extent tree */
struct btrfs_root *extent_root;
/* inode for moving data */
struct inode *data_inode;
struct btrfs_block_rsv *block_rsv;
struct btrfs_backref_cache backref_cache;
struct file_extent_cluster cluster;
/* tree blocks have been processed */
struct extent_io_tree processed_blocks;
/* map start of tree root to corresponding reloc tree */
struct mapping_tree reloc_root_tree;
/* list of reloc trees */
struct list_head reloc_roots;
/* list of subvolume trees that get relocated */
struct list_head dirty_subvol_roots;
/* size of metadata reservation for merging reloc trees */
u64 merging_rsv_size;
/* size of relocated tree nodes */
u64 nodes_relocated;
/* reserved size for block group relocation*/
u64 reserved_bytes;
u64 search_start;
u64 extents_found;
unsigned int stage:8;
unsigned int create_reloc_tree:1;
unsigned int merge_reloc_tree:1;
unsigned int found_file_extent:1;
};
/* stages of data relocation */
#define MOVE_DATA_EXTENTS 0
#define UPDATE_DATA_PTRS 1
static void mark_block_processed(struct reloc_control *rc,
struct btrfs_backref_node *node)
{
u32 blocksize;
if (node->level == 0 ||
in_range(node->bytenr, rc->block_group->start,
rc->block_group->length)) {
blocksize = rc->extent_root->fs_info->nodesize;
set_extent_bits(&rc->processed_blocks, node->bytenr,
node->bytenr + blocksize - 1, EXTENT_DIRTY);
}
node->processed = 1;
}
static void mapping_tree_init(struct mapping_tree *tree)
{
tree->rb_root = RB_ROOT;
spin_lock_init(&tree->lock);
}
/*
* walk up backref nodes until reach node presents tree root
*/
static struct btrfs_backref_node *walk_up_backref(
struct btrfs_backref_node *node,
struct btrfs_backref_edge *edges[], int *index)
{
struct btrfs_backref_edge *edge;
int idx = *index;
while (!list_empty(&node->upper)) {
edge = list_entry(node->upper.next,
struct btrfs_backref_edge, list[LOWER]);
edges[idx++] = edge;
node = edge->node[UPPER];
}
BUG_ON(node->detached);
*index = idx;
return node;
}
/*
* walk down backref nodes to find start of next reference path
*/
static struct btrfs_backref_node *walk_down_backref(
struct btrfs_backref_edge *edges[], int *index)
{
struct btrfs_backref_edge *edge;
struct btrfs_backref_node *lower;
int idx = *index;
while (idx > 0) {
edge = edges[idx - 1];
lower = edge->node[LOWER];
if (list_is_last(&edge->list[LOWER], &lower->upper)) {
idx--;
continue;
}
edge = list_entry(edge->list[LOWER].next,
struct btrfs_backref_edge, list[LOWER]);
edges[idx - 1] = edge;
*index = idx;
return edge->node[UPPER];
}
*index = 0;
return NULL;
}
static void update_backref_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node, u64 bytenr)
{
struct rb_node *rb_node;
rb_erase(&node->rb_node, &cache->rb_root);
node->bytenr = bytenr;
rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
if (rb_node)
btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
}
/*
* update backref cache after a transaction commit
*/
static int update_backref_cache(struct btrfs_trans_handle *trans,
struct btrfs_backref_cache *cache)
{
struct btrfs_backref_node *node;
int level = 0;
if (cache->last_trans == 0) {
cache->last_trans = trans->transid;
return 0;
}
if (cache->last_trans == trans->transid)
return 0;
/*
* detached nodes are used to avoid unnecessary backref
* lookup. transaction commit changes the extent tree.
* so the detached nodes are no longer useful.
*/
while (!list_empty(&cache->detached)) {
node = list_entry(cache->detached.next,
struct btrfs_backref_node, list);
btrfs_backref_cleanup_node(cache, node);
}
while (!list_empty(&cache->changed)) {
node = list_entry(cache->changed.next,
struct btrfs_backref_node, list);
list_del_init(&node->list);
BUG_ON(node->pending);
update_backref_node(cache, node, node->new_bytenr);
}
/*
* some nodes can be left in the pending list if there were
* errors during processing the pending nodes.
*/
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
list_for_each_entry(node, &cache->pending[level], list) {
BUG_ON(!node->pending);
if (node->bytenr == node->new_bytenr)
continue;
update_backref_node(cache, node, node->new_bytenr);
}
}
cache->last_trans = 0;
return 1;
}
static bool reloc_root_is_dead(struct btrfs_root *root)
{
/*
* Pair with set_bit/clear_bit in clean_dirty_subvols and
* btrfs_update_reloc_root. We need to see the updated bit before
* trying to access reloc_root
*/
smp_rmb();
if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
return true;
return false;
}
/*
* Check if this subvolume tree has valid reloc tree.
*
* Reloc tree after swap is considered dead, thus not considered as valid.
* This is enough for most callers, as they don't distinguish dead reloc root
* from no reloc root. But btrfs_should_ignore_reloc_root() below is a
* special case.
*/
static bool have_reloc_root(struct btrfs_root *root)
{
if (reloc_root_is_dead(root))
return false;
if (!root->reloc_root)
return false;
return true;
}
int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
{
struct btrfs_root *reloc_root;
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return 0;
/* This root has been merged with its reloc tree, we can ignore it */
if (reloc_root_is_dead(root))
return 1;
reloc_root = root->reloc_root;
if (!reloc_root)
return 0;
if (btrfs_header_generation(reloc_root->commit_root) ==
root->fs_info->running_transaction->transid)
return 0;
/*
* if there is reloc tree and it was created in previous
* transaction backref lookup can find the reloc tree,
* so backref node for the fs tree root is useless for
* relocation.
*/
return 1;
}
/*
* find reloc tree by address of tree root
*/
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct reloc_control *rc = fs_info->reloc_ctl;
struct rb_node *rb_node;
struct mapping_node *node;
struct btrfs_root *root = NULL;
ASSERT(rc);
spin_lock(&rc->reloc_root_tree.lock);
rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
root = (struct btrfs_root *)node->data;
}
spin_unlock(&rc->reloc_root_tree.lock);
return btrfs_grab_root(root);
}
/*
* For useless nodes, do two major clean ups:
*
* - Cleanup the children edges and nodes
* If child node is also orphan (no parent) during cleanup, then the child
* node will also be cleaned up.
*
* - Freeing up leaves (level 0), keeps nodes detached
* For nodes, the node is still cached as "detached"
*
* Return false if @node is not in the @useless_nodes list.
* Return true if @node is in the @useless_nodes list.
*/
static bool handle_useless_nodes(struct reloc_control *rc,
struct btrfs_backref_node *node)
{
struct btrfs_backref_cache *cache = &rc->backref_cache;
struct list_head *useless_node = &cache->useless_node;
bool ret = false;
while (!list_empty(useless_node)) {
struct btrfs_backref_node *cur;
cur = list_first_entry(useless_node, struct btrfs_backref_node,
list);
list_del_init(&cur->list);
/* Only tree root nodes can be added to @useless_nodes */
ASSERT(list_empty(&cur->upper));
if (cur == node)
ret = true;
/* The node is the lowest node */
if (cur->lowest) {
list_del_init(&cur->lower);
cur->lowest = 0;
}
/* Cleanup the lower edges */
while (!list_empty(&cur->lower)) {
struct btrfs_backref_edge *edge;
struct btrfs_backref_node *lower;
edge = list_entry(cur->lower.next,
struct btrfs_backref_edge, list[UPPER]);
list_del(&edge->list[UPPER]);
list_del(&edge->list[LOWER]);
lower = edge->node[LOWER];
btrfs_backref_free_edge(cache, edge);
/* Child node is also orphan, queue for cleanup */
if (list_empty(&lower->upper))
list_add(&lower->list, useless_node);
}
/* Mark this block processed for relocation */
mark_block_processed(rc, cur);
/*
* Backref nodes for tree leaves are deleted from the cache.
* Backref nodes for upper level tree blocks are left in the
* cache to avoid unnecessary backref lookup.
*/
if (cur->level > 0) {
list_add(&cur->list, &cache->detached);
cur->detached = 1;
} else {
rb_erase(&cur->rb_node, &cache->rb_root);
btrfs_backref_free_node(cache, cur);
}
}
return ret;
}
/*
* Build backref tree for a given tree block. Root of the backref tree
* corresponds the tree block, leaves of the backref tree correspond roots of
* b-trees that reference the tree block.
*
* The basic idea of this function is check backrefs of a given block to find
* upper level blocks that reference the block, and then check backrefs of
* these upper level blocks recursively. The recursion stops when tree root is
* reached or backrefs for the block is cached.
*
* NOTE: if we find that backrefs for a block are cached, we know backrefs for
* all upper level blocks that directly/indirectly reference the block are also
* cached.
*/
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
struct reloc_control *rc, struct btrfs_key *node_key,
int level, u64 bytenr)
{
struct btrfs_backref_iter *iter;
struct btrfs_backref_cache *cache = &rc->backref_cache;
/* For searching parent of TREE_BLOCK_REF */
struct btrfs_path *path;
struct btrfs_backref_node *cur;
struct btrfs_backref_node *node = NULL;
struct btrfs_backref_edge *edge;
int ret;
int err = 0;
iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
if (!iter)
return ERR_PTR(-ENOMEM);
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
goto out;
}
node = btrfs_backref_alloc_node(cache, bytenr, level);
if (!node) {
err = -ENOMEM;
goto out;
}
node->lowest = 1;
cur = node;
/* Breadth-first search to build backref cache */
do {
ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
cur);
if (ret < 0) {
err = ret;
goto out;
}
edge = list_first_entry_or_null(&cache->pending_edge,
struct btrfs_backref_edge, list[UPPER]);
/*
* The pending list isn't empty, take the first block to
* process
*/
if (edge) {
list_del_init(&edge->list[UPPER]);
cur = edge->node[UPPER];
}
} while (edge);
/* Finish the upper linkage of newly added edges/nodes */
ret = btrfs_backref_finish_upper_links(cache, node);
if (ret < 0) {
err = ret;
goto out;
}
if (handle_useless_nodes(rc, node))
node = NULL;
out:
btrfs_backref_iter_free(iter);
btrfs_free_path(path);
if (err) {
btrfs_backref_error_cleanup(cache, node);
return ERR_PTR(err);
}
ASSERT(!node || !node->detached);
ASSERT(list_empty(&cache->useless_node) &&
list_empty(&cache->pending_edge));
return node;
}
/*
* helper to add backref node for the newly created snapshot.
* the backref node is created by cloning backref node that
* corresponds to root of source tree
*/
static int clone_backref_node(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_root *src,
struct btrfs_root *dest)
{
struct btrfs_root *reloc_root = src->reloc_root;
struct btrfs_backref_cache *cache = &rc->backref_cache;
struct btrfs_backref_node *node = NULL;
struct btrfs_backref_node *new_node;
struct btrfs_backref_edge *edge;
struct btrfs_backref_edge *new_edge;
struct rb_node *rb_node;
if (cache->last_trans > 0)
update_backref_cache(trans, cache);
rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
if (node->detached)
node = NULL;
else
BUG_ON(node->new_bytenr != reloc_root->node->start);
}
if (!node) {
rb_node = rb_simple_search(&cache->rb_root,
reloc_root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct btrfs_backref_node,
rb_node);
BUG_ON(node->detached);
}
}
if (!node)
return 0;
new_node = btrfs_backref_alloc_node(cache, dest->node->start,
node->level);
if (!new_node)
return -ENOMEM;
new_node->lowest = node->lowest;
new_node->checked = 1;
new_node->root = btrfs_grab_root(dest);
ASSERT(new_node->root);
if (!node->lowest) {
list_for_each_entry(edge, &node->lower, list[UPPER]) {
new_edge = btrfs_backref_alloc_edge(cache);
if (!new_edge)
goto fail;
btrfs_backref_link_edge(new_edge, edge->node[LOWER],
new_node, LINK_UPPER);
}
} else {
list_add_tail(&new_node->lower, &cache->leaves);
}
rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
&new_node->rb_node);
if (rb_node)
btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);
if (!new_node->lowest) {
list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
list_add_tail(&new_edge->list[LOWER],
&new_edge->node[LOWER]->upper);
}
}
return 0;
fail:
while (!list_empty(&new_node->lower)) {
new_edge = list_entry(new_node->lower.next,
struct btrfs_backref_edge, list[UPPER]);
list_del(&new_edge->list[UPPER]);
btrfs_backref_free_edge(cache, new_edge);
}
btrfs_backref_free_node(cache, new_node);
return -ENOMEM;
}
/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
static int __must_check __add_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
struct mapping_node *node;
struct reloc_control *rc = fs_info->reloc_ctl;
node = kmalloc(sizeof(*node), GFP_NOFS);
if (!node)
return -ENOMEM;
node->bytenr = root->commit_root->start;
node->data = root;
spin_lock(&rc->reloc_root_tree.lock);
rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
btrfs_err(fs_info,
"Duplicate root found for start=%llu while inserting into relocation tree",
node->bytenr);
return -EEXIST;
}
list_add_tail(&root->root_list, &rc->reloc_roots);
return 0;
}
/*
* helper to delete the 'address of tree root -> reloc tree'
* mapping
*/
static void __del_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
struct mapping_node *node = NULL;
struct reloc_control *rc = fs_info->reloc_ctl;
bool put_ref = false;
if (rc && root->node) {
spin_lock(&rc->reloc_root_tree.lock);
rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
RB_CLEAR_NODE(&node->rb_node);
}
spin_unlock(&rc->reloc_root_tree.lock);
ASSERT(!node || (struct btrfs_root *)node->data == root);
}
/*
* We only put the reloc root here if it's on the list. There's a lot
* of places where the pattern is to splice the rc->reloc_roots, process
* the reloc roots, and then add the reloc root back onto
* rc->reloc_roots. If we call __del_reloc_root while it's off of the
* list we don't want the reference being dropped, because the guy
* messing with the list is in charge of the reference.
*/
spin_lock(&fs_info->trans_lock);
if (!list_empty(&root->root_list)) {
put_ref = true;
list_del_init(&root->root_list);
}
spin_unlock(&fs_info->trans_lock);
if (put_ref)
btrfs_put_root(root);
kfree(node);
}
/*
* helper to update the 'address of tree root -> reloc tree'
* mapping
*/
static int __update_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
struct mapping_node *node = NULL;
struct reloc_control *rc = fs_info->reloc_ctl;
spin_lock(&rc->reloc_root_tree.lock);
rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
}
spin_unlock(&rc->reloc_root_tree.lock);
if (!node)
return 0;
BUG_ON((struct btrfs_root *)node->data != root);
spin_lock(&rc->reloc_root_tree.lock);
node->bytenr = root->node->start;
rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node)
btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
return 0;
}
static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *reloc_root;
struct extent_buffer *eb;
struct btrfs_root_item *root_item;
struct btrfs_key root_key;
int ret = 0;
bool must_abort = false;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
if (!root_item)
return ERR_PTR(-ENOMEM);
root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = objectid;
if (root->root_key.objectid == objectid) {
u64 commit_root_gen;
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
if (ret)
goto fail;
/*
* Set the last_snapshot field to the generation of the commit
* root - like this ctree.c:btrfs_block_can_be_shared() behaves
* correctly (returns true) when the relocation root is created
* either inside the critical section of a transaction commit
* (through transaction.c:qgroup_account_snapshot()) and when
* it's created before the transaction commit is started.
*/
commit_root_gen = btrfs_header_generation(root->commit_root);
btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
} else {
/*
* called by btrfs_reloc_post_snapshot_hook.
* the source tree is a reloc tree, all tree blocks
* modified after it was created have RELOC flag
* set in their headers. so it's OK to not update
* the 'last_snapshot'.
*/
ret = btrfs_copy_root(trans, root, root->node, &eb,
BTRFS_TREE_RELOC_OBJECTID);
if (ret)
goto fail;
}
/*
* We have changed references at this point, we must abort the
* transaction if anything fails.
*/
must_abort = true;
memcpy(root_item, &root->root_item, sizeof(*root_item));
btrfs_set_root_bytenr(root_item, eb->start);
btrfs_set_root_level(root_item, btrfs_header_level(eb));
btrfs_set_root_generation(root_item, trans->transid);
if (root->root_key.objectid == objectid) {
btrfs_set_root_refs(root_item, 0);
memset(&root_item->drop_progress, 0,
sizeof(struct btrfs_disk_key));
btrfs_set_root_drop_level(root_item, 0);
}
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
ret = btrfs_insert_root(trans, fs_info->tree_root,
&root_key, root_item);
if (ret)
goto fail;
kfree(root_item);
reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
if (IS_ERR(reloc_root)) {
ret = PTR_ERR(reloc_root);
goto abort;
}
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
reloc_root->last_trans = trans->transid;
return reloc_root;
fail:
kfree(root_item);
abort:
if (must_abort)
btrfs_abort_transaction(trans, ret);
return ERR_PTR(ret);
}
/*
* create reloc tree for a given fs tree. reloc tree is just a
* snapshot of the fs tree with special root objectid.
*
* The reloc_root comes out of here with two references, one for
* root->reloc_root, and another for being on the rc->reloc_roots list.
*/
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *reloc_root;
struct reloc_control *rc = fs_info->reloc_ctl;
struct btrfs_block_rsv *rsv;
int clear_rsv = 0;
int ret;
if (!rc)
return 0;
/*
* The subvolume has reloc tree but the swap is finished, no need to
* create/update the dead reloc tree
*/
if (reloc_root_is_dead(root))
return 0;
/*
* This is subtle but important. We do not do
* record_root_in_transaction for reloc roots, instead we record their
* corresponding fs root, and then here we update the last trans for the
* reloc root. This means that we have to do this for the entire life
* of the reloc root, regardless of which stage of the relocation we are
* in.
*/
if (root->reloc_root) {
reloc_root = root->reloc_root;
reloc_root->last_trans = trans->transid; return 0;
}
/*
* We are merging reloc roots, we do not need new reloc trees. Also
* reloc trees never need their own reloc tree.
*/
if (!rc->create_reloc_tree || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
return 0;
if (!trans->reloc_reserved) { rsv = trans->block_rsv;
trans->block_rsv = rc->block_rsv;
clear_rsv = 1;
}
reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
if (clear_rsv)
trans->block_rsv = rsv;
if (IS_ERR(reloc_root))
return PTR_ERR(reloc_root);
ret = __add_reloc_root(reloc_root);
ASSERT(ret != -EEXIST);
if (ret) {
/* Pairs with create_reloc_root */
btrfs_put_root(reloc_root);
return ret;
}
root->reloc_root = btrfs_grab_root(reloc_root);
return 0;
}
/*
* update root item of reloc tree
*/
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *reloc_root;
struct btrfs_root_item *root_item;
int ret;
if (!have_reloc_root(root))
return 0;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
/*
* We are probably ok here, but __del_reloc_root() will drop its ref of
* the root. We have the ref for root->reloc_root, but just in case
* hold it while we update the reloc root.
*/
btrfs_grab_root(reloc_root);
/* root->reloc_root will stay until current relocation finished */
if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
/*
* Mark the tree as dead before we change reloc_root so
* have_reloc_root will not touch it from now on.
*/
smp_wmb();
__del_reloc_root(reloc_root);
}
if (reloc_root->commit_root != reloc_root->node) {
__update_reloc_root(reloc_root);
btrfs_set_root_node(root_item, reloc_root->node);
free_extent_buffer(reloc_root->commit_root);
reloc_root->commit_root = btrfs_root_node(reloc_root);
}
ret = btrfs_update_root(trans, fs_info->tree_root,
&reloc_root->root_key, root_item);
btrfs_put_root(reloc_root);
return ret;
}
/*
* helper to find first cached inode with inode number >= objectid
* in a subvolume
*/
static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
{
struct rb_node *node;
struct rb_node *prev;
struct btrfs_inode *entry;
struct inode *inode;
spin_lock(&root->inode_lock);
again:
node = root->inode_tree.rb_node;
prev = NULL;
while (node) {
prev = node;
entry = rb_entry(node, struct btrfs_inode, rb_node);
if (objectid < btrfs_ino(entry))
node = node->rb_left;
else if (objectid > btrfs_ino(entry))
node = node->rb_right;
else
break;
}
if (!node) {
while (prev) {
entry = rb_entry(prev, struct btrfs_inode, rb_node);
if (objectid <= btrfs_ino(entry)) {
node = prev;
break;
}
prev = rb_next(prev);
}
}
while (node) {
entry = rb_entry(node, struct btrfs_inode, rb_node);
inode = igrab(&entry->vfs_inode);
if (inode) {
spin_unlock(&root->inode_lock);
return inode;
}
objectid = btrfs_ino(entry) + 1;
if (cond_resched_lock(&root->inode_lock))
goto again;
node = rb_next(node);
}
spin_unlock(&root->inode_lock);
return NULL;
}
/*
* get new location of data
*/
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
u64 bytenr, u64 num_bytes)
{
struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
struct btrfs_path *path;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
bytenr -= BTRFS_I(reloc_inode)->index_cnt;
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi));
if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
ret = -EINVAL;
goto out;
}
*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
/*
* update file extent items in the tree leaf to point to
* the new locations.
*/
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_root *root,
struct extent_buffer *leaf)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
struct inode *inode = NULL;
u64 parent;
u64 bytenr;
u64 new_bytenr = 0;
u64 num_bytes;
u64 end;
u32 nritems;
u32 i;
int ret = 0;
int first = 1;
int dirty = 0;
if (rc->stage != UPDATE_DATA_PTRS)
return 0;
/* reloc trees always use full backref */
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
parent = leaf->start;
else
parent = 0;
nritems = btrfs_header_nritems(leaf);
for (i = 0; i < nritems; i++) {
struct btrfs_ref ref = { 0 };
cond_resched();
btrfs_item_key_to_cpu(leaf, &key, i);
if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) ==
BTRFS_FILE_EXTENT_INLINE)
continue;
bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
if (bytenr == 0)
continue;
if (!in_range(bytenr, rc->block_group->start,
rc->block_group->length))
continue;
/*
* if we are modifying block in fs tree, wait for readpage
* to complete and drop the extent cache
*/
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
if (first) {
inode = find_next_inode(root, key.objectid);
first = 0;
} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
btrfs_add_delayed_iput(inode);
inode = find_next_inode(root, key.objectid);
}
if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
end = key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
WARN_ON(!IS_ALIGNED(key.offset,
fs_info->sectorsize));
WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
end--;
ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
key.offset, end);
if (!ret)
continue;
btrfs_drop_extent_cache(BTRFS_I(inode),
key.offset, end, 1);
unlock_extent(&BTRFS_I(inode)->io_tree,
key.offset, end);
}
}
ret = get_new_location(rc->data_inode, &new_bytenr,
bytenr, num_bytes);
if (ret) {
/*
* Don't have to abort since we've not changed anything
* in the file extent yet.
*/
break;
}
btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
dirty = 1;
key.offset -= btrfs_file_extent_offset(leaf, fi);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
num_bytes, parent);
ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
key.objectid, key.offset);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, parent);
ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
key.objectid, key.offset);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
}
if (dirty)
btrfs_mark_buffer_dirty(leaf);
if (inode)
btrfs_add_delayed_iput(inode);
return ret;
}
static noinline_for_stack
int memcmp_node_keys(struct extent_buffer *eb, int slot,
struct btrfs_path *path, int level)
{
struct btrfs_disk_key key1;
struct btrfs_disk_key key2;
btrfs_node_key(eb, &key1, slot);
btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
return memcmp(&key1, &key2, sizeof(key1));
}
/*
* try to replace tree blocks in fs tree with the new blocks
* in reloc tree. tree blocks haven't been modified since the
* reloc tree was create can be replaced.
*
* if a block was replaced, level of the block + 1 is returned.
* if no block got replaced, 0 is returned. if there are other
* errors, a negative error number is returned.
*/
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
struct btrfs_root *dest, struct btrfs_root *src,
struct btrfs_path *path, struct btrfs_key *next_key,
int lowest_level, int max_level)
{
struct btrfs_fs_info *fs_info = dest->fs_info;
struct extent_buffer *eb;
struct extent_buffer *parent;
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
u64 old_bytenr;
u64 new_bytenr;
u64 old_ptr_gen;
u64 new_ptr_gen;
u64 last_snapshot;
u32 blocksize;
int cow = 0;
int level;
int ret;
int slot;
ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
slot = path->slots[lowest_level];
btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
eb = btrfs_lock_root_node(dest);
level = btrfs_header_level(eb);
if (level < lowest_level) {
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
return 0;
}
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
return ret;
}
}
if (next_key) {
next_key->objectid = (u64)-1;
next_key->type = (u8)-1;
next_key->offset = (u64)-1;
}
parent = eb;
while (1) {
level = btrfs_header_level(parent);
ASSERT(level >= lowest_level);
ret = btrfs_bin_search(parent, &key, &slot);
if (ret < 0)
break;
if (ret && slot > 0)
slot--;
if (next_key && slot + 1 < btrfs_header_nritems(parent))
btrfs_node_key_to_cpu(parent, next_key, slot + 1);
old_bytenr = btrfs_node_blockptr(parent, slot);
blocksize = fs_info->nodesize;
old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
if (level <= max_level) {
eb = path->nodes[level];
new_bytenr = btrfs_node_blockptr(eb,
path->slots[level]);
new_ptr_gen = btrfs_node_ptr_generation(eb,
path->slots[level]);
} else {
new_bytenr = 0;
new_ptr_gen = 0;
}
if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
ret = level;
break;
}
if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
memcmp_node_keys(parent, slot, path, level)) {
if (level <= lowest_level) {
ret = 0;
break;
}
eb = btrfs_read_node_slot(parent, slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
}
btrfs_tree_lock(eb);
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, parent,
slot, &eb,
BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
break;
}
}
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
parent = eb;
continue;
}
if (!cow) {
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
cow = 1;
goto again;
}
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
btrfs_release_path(path);
path->lowest_level = level;
ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
path->lowest_level = 0;
if (ret) {
if (ret > 0)
ret = -ENOENT;
break;
}
/*
* Info qgroup to trace both subtrees.
*
* We must trace both trees.
* 1) Tree reloc subtree
* If not traced, we will leak data numbers
* 2) Fs subtree
* If not traced, we will double count old data
*
* We don't scan the subtree right now, but only record
* the swapped tree blocks.
* The real subtree rescan is delayed until we have new
* CoW on the subtree root node before transaction commit.
*/
ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
rc->block_group, parent, slot,
path->nodes[level], path->slots[level],
last_snapshot);
if (ret < 0)
break;
/*
* swap blocks in fs tree and reloc tree.
*/
btrfs_set_node_blockptr(parent, slot, new_bytenr);
btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
btrfs_mark_buffer_dirty(parent);
btrfs_set_node_blockptr(path->nodes[level],
path->slots[level], old_bytenr);
btrfs_set_node_ptr_generation(path->nodes[level],
path->slots[level], old_ptr_gen);
btrfs_mark_buffer_dirty(path->nodes[level]);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
blocksize, path->nodes[level]->start);
ref.skip_qgroup = true;
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
blocksize, 0);
ref.skip_qgroup = true;
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
blocksize, path->nodes[level]->start);
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
ref.skip_qgroup = true;
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
blocksize, 0);
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
ref.skip_qgroup = true;
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
btrfs_unlock_up_safe(path, 0);
ret = level;
break;
}
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
return ret;
}
/*
* helper to find next relocated block in reloc tree
*/
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
int *level)
{
struct extent_buffer *eb;
int i;
u64 last_snapshot;
u32 nritems;
last_snapshot = btrfs_root_last_snapshot(&root->root_item);
for (i = 0; i < *level; i++) {
free_extent_buffer(path->nodes[i]);
path->nodes[i] = NULL;
}
for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
eb = path->nodes[i];
nritems = btrfs_header_nritems(eb);
while (path->slots[i] + 1 < nritems) {
path->slots[i]++;
if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
last_snapshot)
continue;
*level = i;
return 0;
}
free_extent_buffer(path->nodes[i]);
path->nodes[i] = NULL;
}
return 1;
}
/*
* walk down reloc tree to find relocated block of lowest level
*/
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
int *level)
{
struct extent_buffer *eb = NULL;
int i;
u64 ptr_gen = 0;
u64 last_snapshot;
u32 nritems;
last_snapshot = btrfs_root_last_snapshot(&root->root_item);
for (i = *level; i > 0; i--) {
eb = path->nodes[i];
nritems = btrfs_header_nritems(eb);
while (path->slots[i] < nritems) {
ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
if (ptr_gen > last_snapshot)
break;
path->slots[i]++;
}
if (path->slots[i] >= nritems) {
if (i == *level)
break;
*level = i + 1;
return 0;
}
if (i == 1) {
*level = i;
return 0;
}
eb = btrfs_read_node_slot(eb, path->slots[i]);
if (IS_ERR(eb))
return PTR_ERR(eb);
BUG_ON(btrfs_header_level(eb) != i - 1);
path->nodes[i - 1] = eb;
path->slots[i - 1] = 0;
}
return 1;
}
/*
* invalidate extent cache for file extents whose key in range of
* [min_key, max_key)
*/
static int invalidate_extent_cache(struct btrfs_root *root,
struct btrfs_key *min_key,
struct btrfs_key *max_key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode = NULL;
u64 objectid;
u64 start, end;
u64 ino;
objectid = min_key->objectid;
while (1) {
cond_resched();
iput(inode);
if (objectid > max_key->objectid)
break;
inode = find_next_inode(root, objectid);
if (!inode)
break;
ino = btrfs_ino(BTRFS_I(inode));
if (ino > max_key->objectid) {
iput(inode);
break;
}
objectid = ino + 1;
if (!S_ISREG(inode->i_mode))
continue;
if (unlikely(min_key->objectid == ino)) {
if (min_key->type > BTRFS_EXTENT_DATA_KEY)
continue;
if (min_key->type < BTRFS_EXTENT_DATA_KEY)
start = 0;
else {
start = min_key->offset;
WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
}
} else {
start = 0;
}
if (unlikely(max_key->objectid == ino)) {
if (max_key->type < BTRFS_EXTENT_DATA_KEY)
continue;
if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
end = (u64)-1;
} else {
if (max_key->offset == 0)
continue;
end = max_key->offset;
WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
end--;
}
} else {
end = (u64)-1;
}
/* the lock_extent waits for readpage to complete */
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
}
return 0;
}
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key)
{
while (level < BTRFS_MAX_LEVEL) {
if (!path->nodes[level])
break;
if (path->slots[level] + 1 <
btrfs_header_nritems(path->nodes[level])) {
btrfs_node_key_to_cpu(path->nodes[level], key,
path->slots[level] + 1);
return 0;
}
level++;
}
return 1;
}
/*
* Insert current subvolume into reloc_control::dirty_subvol_roots
*/
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_root *root)
{
struct btrfs_root *reloc_root = root->reloc_root;
struct btrfs_root_item *reloc_root_item;
int ret;
/* @root must be a subvolume tree root with a valid reloc tree */
ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
ASSERT(reloc_root);
reloc_root_item = &reloc_root->root_item;
memset(&reloc_root_item->drop_progress, 0,
sizeof(reloc_root_item->drop_progress));
btrfs_set_root_drop_level(reloc_root_item, 0);
btrfs_set_root_refs(reloc_root_item, 0);
ret = btrfs_update_reloc_root(trans, root);
if (ret)
return ret;
if (list_empty(&root->reloc_dirty_list)) {
btrfs_grab_root(root);
list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
}
return 0;
}
static int clean_dirty_subvols(struct reloc_control *rc)
{
struct btrfs_root *root;
struct btrfs_root *next;
int ret = 0;
int ret2;
list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
reloc_dirty_list) {
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
/* Merged subvolume, cleanup its reloc root */
struct btrfs_root *reloc_root = root->reloc_root;
list_del_init(&root->reloc_dirty_list);
root->reloc_root = NULL;
/*
* Need barrier to ensure clear_bit() only happens after
* root->reloc_root = NULL. Pairs with have_reloc_root.
*/
smp_wmb();
clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
if (reloc_root) {
/*
* btrfs_drop_snapshot drops our ref we hold for
* ->reloc_root. If it fails however we must
* drop the ref ourselves.
*/
ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
if (ret2 < 0) {
btrfs_put_root(reloc_root);
if (!ret)
ret = ret2;
}
}
btrfs_put_root(root);
} else {
/* Orphan reloc tree, just clean it up */
ret2 = btrfs_drop_snapshot(root, 0, 1);
if (ret2 < 0) {
btrfs_put_root(root);
if (!ret)
ret = ret2;
}
}
}
return ret;
}
/*
* merge the relocated tree blocks in reloc tree with corresponding
* fs tree.
*/
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_key key;
struct btrfs_key next_key;
struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *reloc_root;
struct btrfs_root_item *root_item;
struct btrfs_path *path;
struct extent_buffer *leaf;
int reserve_level;
int level;
int max_level;
int replaced = 0;
int ret = 0;
u32 min_reserved;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = READA_FORWARD;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_root_level(root_item);
atomic_inc(&reloc_root->node->refs);
path->nodes[level] = reloc_root->node;
path->slots[level] = 0;
} else {
btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
level = btrfs_root_drop_level(root_item);
BUG_ON(level == 0);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
path->lowest_level = 0;
if (ret < 0) {
btrfs_free_path(path);
return ret;
}
btrfs_node_key_to_cpu(path->nodes[level], &next_key,
path->slots[level]);
WARN_ON(memcmp(&key, &next_key, sizeof(key)));
btrfs_unlock_up_safe(path, 0);
}
/*
* In merge_reloc_root(), we modify the upper level pointer to swap the
* tree blocks between reloc tree and subvolume tree. Thus for tree
* block COW, we COW at most from level 1 to root level for each tree.
*
* Thus the needed metadata size is at most root_level * nodesize,
* and * 2 since we have two trees to COW.
*/
reserve_level = max_t(int, 1, btrfs_root_level(root_item));
min_reserved = fs_info->nodesize * reserve_level * 2;
memset(&next_key, 0, sizeof(next_key));
while (1) {
ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret)
goto out;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
/*
* At this point we no longer have a reloc_control, so we can't
* depend on btrfs_init_reloc_root to update our last_trans.
*
* But that's ok, we started the trans handle on our
* corresponding fs_root, which means it's been added to the
* dirty list. At commit time we'll still call
* btrfs_update_reloc_root() and update our root item
* appropriately.
*/
reloc_root->last_trans = trans->transid;
trans->block_rsv = rc->block_rsv;
replaced = 0;
max_level = level;
ret = walk_down_reloc_tree(reloc_root, path, &level);
if (ret < 0)
goto out;
if (ret > 0)
break;
if (!find_next_key(path, level, &key) &&
btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
ret = 0;
} else {
ret = replace_path(trans, rc, root, reloc_root, path,
&next_key, level, max_level);
}
if (ret < 0)
goto out;
if (ret > 0) {
level = ret;
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
replaced = 1;
}
ret = walk_up_reloc_tree(reloc_root, path, &level);
if (ret > 0)
break;
BUG_ON(level == 0);
/*
* save the merging progress in the drop_progress.
* this is OK since root refs == 1 in this case.
*/
btrfs_node_key(path->nodes[level], &root_item->drop_progress,
path->slots[level]);
btrfs_set_root_drop_level(root_item, level);
btrfs_end_transaction_throttle(trans);
trans = NULL;
btrfs_btree_balance_dirty(fs_info);
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
}
/*
* handle the case only one block in the fs tree need to be
* relocated and the block is tree root.
*/
leaf = btrfs_lock_root_node(root);
ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
BTRFS_NESTING_COW);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
out:
btrfs_free_path(path);
if (ret == 0) {
ret = insert_dirty_subvol(trans, rc, root);
if (ret)
btrfs_abort_transaction(trans, ret);
}
if (trans)
btrfs_end_transaction_throttle(trans);
btrfs_btree_balance_dirty(fs_info);
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
return ret;
}
static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
struct btrfs_root *root = rc->extent_root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *reloc_root;
struct btrfs_trans_handle *trans;
LIST_HEAD(reloc_roots);
u64 num_bytes = 0;
int ret;
mutex_lock(&fs_info->reloc_mutex);
rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
rc->merging_rsv_size += rc->nodes_relocated * 2;
mutex_unlock(&fs_info->reloc_mutex);
again:
if (!err) {
num_bytes = rc->merging_rsv_size;
ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
BTRFS_RESERVE_FLUSH_ALL);
if (ret)
err = ret;
}
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
if (!err)
btrfs_block_rsv_release(fs_info, rc->block_rsv,
num_bytes, NULL);
return PTR_ERR(trans);
}
if (!err) {
if (num_bytes != rc->merging_rsv_size) {
btrfs_end_transaction(trans);
btrfs_block_rsv_release(fs_info, rc->block_rsv,
num_bytes, NULL);
goto again;
}
}
rc->merge_reloc_tree = 1;
while (!list_empty(&rc->reloc_roots)) {
reloc_root = list_entry(rc->reloc_roots.next,
struct btrfs_root, root_list);
list_del_init(&reloc_root->root_list);
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
if (IS_ERR(root)) {
/*
* Even if we have an error we need this reloc root
* back on our list so we can clean up properly.
*/
list_add(&reloc_root->root_list, &reloc_roots);
btrfs_abort_transaction(trans, (int)PTR_ERR(root));
if (!err)
err = PTR_ERR(root);
break;
}
ASSERT(root->reloc_root == reloc_root);
/*
* set reference count to 1, so btrfs_recover_relocation
* knows it should resumes merging
*/
if (!err)
btrfs_set_root_refs(&reloc_root->root_item, 1);
ret = btrfs_update_reloc_root(trans, root);
/*
* Even if we have an error we need this reloc root back on our
* list so we can clean up properly.
*/
list_add(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(root);
if (ret) {
btrfs_abort_transaction(trans, ret);
if (!err)
err = ret;
break;
}
}
list_splice(&reloc_roots, &rc->reloc_roots);
if (!err)
err = btrfs_commit_transaction(trans);
else
btrfs_end_transaction(trans);
return err;
}
static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
struct btrfs_root *reloc_root, *tmp;
list_for_each_entry_safe(reloc_root, tmp, list, root_list) __del_reloc_root(reloc_root);
}
static noinline_for_stack
void merge_reloc_roots(struct reloc_control *rc)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_root *root;
struct btrfs_root *reloc_root;
LIST_HEAD(reloc_roots);
int found = 0;
int ret = 0;
again:
root = rc->extent_root;
/*
* this serializes us with btrfs_record_root_in_transaction,
* we have to make sure nobody is in the middle of
* adding their roots to the list while we are
* doing this splice
*/
mutex_lock(&fs_info->reloc_mutex);
list_splice_init(&rc->reloc_roots, &reloc_roots);
mutex_unlock(&fs_info->reloc_mutex);
while (!list_empty(&reloc_roots)) {
found = 1;
reloc_root = list_entry(reloc_roots.next,
struct btrfs_root, root_list);
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
if (IS_ERR(root)) {
/*
* For recovery we read the fs roots on mount,
* and if we didn't find the root then we marked
* the reloc root as a garbage root. For normal
* relocation obviously the root should exist in
* memory. However there's no reason we can't
* handle the error properly here just in case.
*/
ASSERT(0);
ret = PTR_ERR(root);
goto out;
}
if (root->reloc_root != reloc_root) {
/*
* This is actually impossible without something
* going really wrong (like weird race condition
* or cosmic rays).
*/
ASSERT(0);
ret = -EINVAL;
goto out;
}
ret = merge_reloc_root(rc, root);
btrfs_put_root(root);
if (ret) {
if (list_empty(&reloc_root->root_list))
list_add_tail(&reloc_root->root_list,
&reloc_roots);
goto out;
}
} else {
if (!IS_ERR(root)) {
if (root->reloc_root == reloc_root) {
root->reloc_root = NULL;
btrfs_put_root(reloc_root);
}
clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
&root->state);
btrfs_put_root(root);
}
list_del_init(&reloc_root->root_list);
/* Don't forget to queue this reloc root for cleanup */
list_add_tail(&reloc_root->reloc_dirty_list,
&rc->dirty_subvol_roots);
}
}
if (found) {
found = 0;
goto again;
}
out:
if (ret) {
btrfs_handle_fs_error(fs_info, ret, NULL);
free_reloc_roots(&reloc_roots);
/* new reloc root may be added */
mutex_lock(&fs_info->reloc_mutex);
list_splice_init(&rc->reloc_roots, &reloc_roots);
mutex_unlock(&fs_info->reloc_mutex);
free_reloc_roots(&reloc_roots);
}
/*
* We used to have
*
* BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
*
* here, but it's wrong. If we fail to start the transaction in
* prepare_to_merge() we will have only 0 ref reloc roots, none of which
* have actually been removed from the reloc_root_tree rb tree. This is
* fine because we're bailing here, and we hold a reference on the root
* for the list that holds it, so these roots will be cleaned up when we
* do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
* will be cleaned up on unmount.
*
* The remaining nodes will be cleaned up by free_reloc_control.
*/
}
static void free_block_list(struct rb_root *blocks)
{
struct tree_block *block;
struct rb_node *rb_node;
while ((rb_node = rb_first(blocks))) {
block = rb_entry(rb_node, struct tree_block, rb_node);
rb_erase(rb_node, blocks);
kfree(block);
}
}
static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *reloc_root)
{
struct btrfs_fs_info *fs_info = reloc_root->fs_info;
struct btrfs_root *root;
int ret;
if (reloc_root->last_trans == trans->transid)
return 0;
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
/*
* This should succeed, since we can't have a reloc root without having
* already looked up the actual root and created the reloc root for this
* root.
*
* However if there's some sort of corruption where we have a ref to a
* reloc root without a corresponding root this could return ENOENT.
*/
if (IS_ERR(root)) {
ASSERT(0);
return PTR_ERR(root);
}
if (root->reloc_root != reloc_root) {
ASSERT(0);
btrfs_err(fs_info,
"root %llu has two reloc roots associated with it",
reloc_root->root_key.offset);
btrfs_put_root(root);
return -EUCLEAN;
}
ret = btrfs_record_root_in_trans(trans, root);
btrfs_put_root(root);
return ret;
}
static noinline_for_stack
struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_backref_node *node,
struct btrfs_backref_edge *edges[])
{
struct btrfs_backref_node *next;
struct btrfs_root *root;
int index = 0;
int ret;
next = node;
while (1) {
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
/*
* If there is no root, then our references for this block are
* incomplete, as we should be able to walk all the way up to a
* block that is owned by a root.
*
* This path is only for SHAREABLE roots, so if we come upon a
* non-SHAREABLE root then we have backrefs that resolve
* improperly.
*
* Both of these cases indicate file system corruption, or a bug
* in the backref walking code.
*/
if (!root) {
ASSERT(0);
btrfs_err(trans->fs_info,
"bytenr %llu doesn't have a backref path ending in a root",
node->bytenr);
return ERR_PTR(-EUCLEAN);
}
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ASSERT(0);
btrfs_err(trans->fs_info,
"bytenr %llu has multiple refs with one ending in a non-shareable root",
node->bytenr);
return ERR_PTR(-EUCLEAN);
}
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
ret = record_reloc_root_in_trans(trans, root);
if (ret)
return ERR_PTR(ret);
break;
}
ret = btrfs_record_root_in_trans(trans, root);
if (ret)
return ERR_PTR(ret);
root = root->reloc_root;
/*
* We could have raced with another thread which failed, so
* root->reloc_root may not be set, return ENOENT in this case.
*/
if (!root)
return ERR_PTR(-ENOENT);
if (next->new_bytenr != root->node->start) {
/*
* We just created the reloc root, so we shouldn't have
* ->new_bytenr set and this shouldn't be in the changed
* list. If it is then we have multiple roots pointing
* at the same bytenr which indicates corruption, or
* we've made a mistake in the backref walking code.
*/
ASSERT(next->new_bytenr == 0);
ASSERT(list_empty(&next->list));
if (next->new_bytenr || !list_empty(&next->list)) {
btrfs_err(trans->fs_info,
"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
node->bytenr, next->bytenr);
return ERR_PTR(-EUCLEAN);
}
next->new_bytenr = root->node->start;
btrfs_put_root(next->root);
next->root = btrfs_grab_root(root);
ASSERT(next->root);
list_add_tail(&next->list,
&rc->backref_cache.changed);
mark_block_processed(rc, next);
break;
}
WARN_ON(1);
root = NULL;
next = walk_down_backref(edges, &index);
if (!next || next->level <= node->level)
break;
}
if (!root) {
/*
* This can happen if there's fs corruption or if there's a bug
* in the backref lookup code.
*/
ASSERT(0);
return ERR_PTR(-ENOENT);
}
next = node;
/* setup backref node path for btrfs_reloc_cow_block */
while (1) {
rc->backref_cache.path[next->level] = next;
if (--index < 0)
break;
next = edges[index]->node[UPPER];
}
return root;
}
/*
* Select a tree root for relocation.
*
* Return NULL if the block is not shareable. We should use do_relocation() in
* this case.
*
* Return a tree root pointer if the block is shareable.
* Return -ENOENT if the block is root of reloc tree.
*/
static noinline_for_stack
struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
{
struct btrfs_backref_node *next;
struct btrfs_root *root;
struct btrfs_root *fs_root = NULL;
struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
int index = 0;
next = node;
while (1) {
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
/*
* This can occur if we have incomplete extent refs leading all
* the way up a particular path, in this case return -EUCLEAN.
*/
if (!root)
return ERR_PTR(-EUCLEAN);
/* No other choice for non-shareable tree */
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return root;
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
fs_root = root;
if (next != node)
return NULL;
next = walk_down_backref(edges, &index);
if (!next || next->level <= node->level)
break;
}
if (!fs_root)
return ERR_PTR(-ENOENT);
return fs_root;
}
static noinline_for_stack
u64 calcu_metadata_size(struct reloc_control *rc,
struct btrfs_backref_node *node, int reserve)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_backref_node *next = node;
struct btrfs_backref_edge *edge;
struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
u64 num_bytes = 0;
int index = 0;
BUG_ON(reserve && node->processed);
while (next) {
cond_resched();
while (1) {
if (next->processed && (reserve || next != node))
break;
num_bytes += fs_info->nodesize;
if (list_empty(&next->upper))
break;
edge = list_entry(next->upper.next,
struct btrfs_backref_edge, list[LOWER]);
edges[index++] = edge;
next = edge->node[UPPER];
}
next = walk_down_backref(edges, &index);
}
return num_bytes;
}
static int reserve_metadata_space(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_backref_node *node)
{
struct btrfs_root *root = rc->extent_root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 num_bytes;
int ret;
u64 tmp;
num_bytes = calcu_metadata_size(rc, node, 1) * 2;
trans->block_rsv = rc->block_rsv;
rc->reserved_bytes += num_bytes;
/*
* We are under a transaction here so we can only do limited flushing.
* If we get an enospc just kick back -EAGAIN so we know to drop the
* transaction and try to refill when we can flush all the things.
*/
ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
while (tmp <= rc->reserved_bytes)
tmp <<= 1;
/*
* only one thread can access block_rsv at this point,
* so we don't need hold lock to protect block_rsv.
* we expand more reservation size here to allow enough
* space for relocation and we will return earlier in
* enospc case.
*/
rc->block_rsv->size = tmp + fs_info->nodesize *
RELOCATION_RESERVED_NODES;
return -EAGAIN;
}
return 0;
}
/*
* relocate a block tree, and then update pointers in upper level
* blocks that reference the block to point to the new location.
*
* if called by link_to_upper, the block has already been relocated.
* in that case this function just updates pointers.
*/
static int do_relocation(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_backref_node *node,
struct btrfs_key *key,
struct btrfs_path *path, int lowest)
{
struct btrfs_backref_node *upper;
struct btrfs_backref_edge *edge;
struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
struct btrfs_root *root;
struct extent_buffer *eb;
u32 blocksize;
u64 bytenr;
int slot;
int ret = 0;
/*
* If we are lowest then this is the first time we're processing this
* block, and thus shouldn't have an eb associated with it yet.
*/
ASSERT(!lowest || !node->eb);
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
struct btrfs_ref ref = { 0 };
cond_resched();
upper = edge->node[UPPER];
root = select_reloc_root(trans, rc, upper, edges);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto next;
}
if (upper->eb && !upper->locked) {
if (!lowest) {
ret = btrfs_bin_search(upper->eb, key, &slot);
if (ret < 0)
goto next;
BUG_ON(ret);
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (node->eb->start == bytenr)
goto next;
}
btrfs_backref_drop_node_buffer(upper);
}
if (!upper->eb) {
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
btrfs_release_path(path);
break;
}
if (!upper->eb) {
upper->eb = path->nodes[upper->level];
path->nodes[upper->level] = NULL;
} else {
BUG_ON(upper->eb != path->nodes[upper->level]);
}
upper->locked = 1;
path->locks[upper->level] = 0;
slot = path->slots[upper->level];
btrfs_release_path(path);
} else {
ret = btrfs_bin_search(upper->eb, key, &slot);
if (ret < 0)
goto next;
BUG_ON(ret);
}
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (lowest) {
if (bytenr != node->bytenr) {
btrfs_err(root->fs_info,
"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
bytenr, node->bytenr, slot,
upper->eb->start);
ret = -EIO;
goto next;
}
} else {
if (node->eb->start == bytenr)
goto next;
}
blocksize = root->fs_info->nodesize;
eb = btrfs_read_node_slot(upper->eb, slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto next;
}
btrfs_tree_lock(eb);
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
slot, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
if (ret < 0)
goto next;
/*
* We've just COWed this block, it should have updated
* the correct backref node entry.
*/
ASSERT(node->eb == eb);
} else {
btrfs_set_node_blockptr(upper->eb, slot,
node->eb->start);
btrfs_set_node_ptr_generation(upper->eb, slot,
trans->transid);
btrfs_mark_buffer_dirty(upper->eb);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
node->eb->start, blocksize,
upper->eb->start);
ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&ref, node->level,
btrfs_header_owner(upper->eb));
ret = btrfs_inc_extent_ref(trans, &ref);
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
upper->eb);
if (ret)
btrfs_abort_transaction(trans, ret);
}
next:
if (!upper->pending)
btrfs_backref_drop_node_buffer(upper);
else
btrfs_backref_unlock_node_buffer(upper);
if (ret)
break;
}
if (!ret && node->pending) {
btrfs_backref_drop_node_buffer(node);
list_move_tail(&node->list, &rc->backref_cache.changed);
node->pending = 0;
}
path->lowest_level = 0;
/*
* We should have allocated all of our space in the block rsv and thus
* shouldn't ENOSPC.
*/
ASSERT(ret != -ENOSPC);
return ret;
}
static int link_to_upper(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_backref_node *node,
struct btrfs_path *path)
{
struct btrfs_key key;
btrfs_node_key_to_cpu(node->eb, &key, 0);
return do_relocation(trans, rc, node, &key, path, 0);
}
static int finish_pending_nodes(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_path *path, int err)
{
LIST_HEAD(list);
struct btrfs_backref_cache *cache = &rc->backref_cache;
struct btrfs_backref_node *node;
int level;
int ret;
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
while (!list_empty(&cache->pending[level])) {
node = list_entry(cache->pending[level].next,
struct btrfs_backref_node, list);
list_move_tail(&node->list, &list);
BUG_ON(!node->pending);
if (!err) {
ret = link_to_upper(trans, rc, node, path);
if (ret < 0)
err = ret;
}
}
list_splice_init(&list, &cache->pending[level]);
}
return err;
}
/*
* mark a block and all blocks directly/indirectly reference the block
* as processed.
*/
static void update_processed_blocks(struct reloc_control *rc,
struct btrfs_backref_node *node)
{
struct btrfs_backref_node *next = node;
struct btrfs_backref_edge *edge;
struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
int index = 0;
while (next) {
cond_resched();
while (1) {
if (next->processed)
break;
mark_block_processed(rc, next);
if (list_empty(&next->upper))
break;
edge = list_entry(next->upper.next,
struct btrfs_backref_edge, list[LOWER]);
edges[index++] = edge;
next = edge->node[UPPER];
}
next = walk_down_backref(edges, &index);
}
}
static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
{
u32 blocksize = rc->extent_root->fs_info->nodesize;
if (test_range_bit(&rc->processed_blocks, bytenr,
bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
return 1;
return 0;
}
static int get_tree_block_key(struct btrfs_fs_info *fs_info,
struct tree_block *block)
{
struct extent_buffer *eb;
eb = read_tree_block(fs_info, block->bytenr, block->owner,
block->key.offset, block->level, NULL);
if (IS_ERR(eb)) {
return PTR_ERR(eb);
} else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
if (block->level == 0)
btrfs_item_key_to_cpu(eb, &block->key, 0);
else
btrfs_node_key_to_cpu(eb, &block->key, 0);
free_extent_buffer(eb);
block->key_ready = 1;
return 0;
}
/*
* helper function to relocate a tree block
*/
static int relocate_tree_block(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_backref_node *node,
struct btrfs_key *key,
struct btrfs_path *path)
{
struct btrfs_root *root;
int ret = 0;
if (!node)
return 0;
/*
* If we fail here we want to drop our backref_node because we are going
* to start over and regenerate the tree for it.
*/
ret = reserve_metadata_space(trans, rc, node);
if (ret)
goto out;
BUG_ON(node->processed);
root = select_one_root(node);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
/* See explanation in select_one_root for the -EUCLEAN case. */
ASSERT(ret == -ENOENT);
if (ret == -ENOENT) {
ret = 0;
update_processed_blocks(rc, node);
}
goto out;
}
if (root) {
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
/*
* This block was the root block of a root, and this is
* the first time we're processing the block and thus it
* should not have had the ->new_bytenr modified and
* should have not been included on the changed list.
*
* However in the case of corruption we could have
* multiple refs pointing to the same block improperly,
* and thus we would trip over these checks. ASSERT()
* for the developer case, because it could indicate a
* bug in the backref code, however error out for a
* normal user in the case of corruption.
*/
ASSERT(node->new_bytenr == 0);
ASSERT(list_empty(&node->list));
if (node->new_bytenr || !list_empty(&node->list)) {
btrfs_err(root->fs_info,
"bytenr %llu has improper references to it",
node->bytenr);
ret = -EUCLEAN;
goto out;
}
ret = btrfs_record_root_in_trans(trans, root);
if (ret)
goto out;
/*
* Another thread could have failed, need to check if we
* have reloc_root actually set.
*/
if (!root->reloc_root) {
ret = -ENOENT;
goto out;
}
root = root->reloc_root;
node->new_bytenr = root->node->start;
btrfs_put_root(node->root);
node->root = btrfs_grab_root(root);
ASSERT(node->root);
list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
path->lowest_level = node->level;
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
btrfs_release_path(path);
if (ret > 0)
ret = 0;
}
if (!ret)
update_processed_blocks(rc, node);
} else {
ret = do_relocation(trans, rc, node, key, path, 1);
}
out:
if (ret || node->level == 0 || node->cowonly)
btrfs_backref_cleanup_node(&rc->backref_cache, node);
return ret;
}
/*
* relocate a list of blocks
*/
static noinline_for_stack
int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct reloc_control *rc, struct rb_root *blocks)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_backref_node *node;
struct btrfs_path *path;
struct tree_block *block;
struct tree_block *next;
int ret;
int err = 0;
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
goto out_free_blocks;
}
/* Kick in readahead for tree blocks with missing keys */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready)
btrfs_readahead_tree_block(fs_info, block->bytenr,
block->owner, 0,
block->level);
}
/* Get first keys */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready) {
err = get_tree_block_key(fs_info, block);
if (err)
goto out_free_path;
}
}
/* Do tree relocation */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
node = build_backref_tree(rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
err = PTR_ERR(node);
goto out;
}
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
if (ret < 0) {
err = ret;
break;
}
}
out:
err = finish_pending_nodes(trans, rc, path, err);
out_free_path:
btrfs_free_path(path);
out_free_blocks:
free_block_list(blocks);
return err;
}
static noinline_for_stack int prealloc_file_extent_cluster(
struct btrfs_inode *inode,
struct file_extent_cluster *cluster)
{
u64 alloc_hint = 0;
u64 start;
u64 end;
u64 offset = inode->index_cnt;
u64 num_bytes;
int nr;
int ret = 0;
u64 i_size = i_size_read(&inode->vfs_inode);
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
u64 cur_offset = prealloc_start;
/*
* For subpage case, previous i_size may not be aligned to PAGE_SIZE.
* This means the range [i_size, PAGE_END + 1) is filled with zeros by
* btrfs_do_readpage() call of previously relocated file cluster.
*
* If the current cluster starts in the above range, btrfs_do_readpage()
* will skip the read, and relocate_one_page() will later writeback
* the padding zeros as new data, causing data corruption.
*
* Here we have to manually invalidate the range (i_size, PAGE_END + 1).
*/
if (!IS_ALIGNED(i_size, PAGE_SIZE)) {
struct address_space *mapping = inode->vfs_inode.i_mapping;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
struct page *page;
ASSERT(sectorsize < PAGE_SIZE);
ASSERT(IS_ALIGNED(i_size, sectorsize));
/*
* Subpage can't handle page with DIRTY but without UPTODATE
* bit as it can lead to the following deadlock:
*
* btrfs_readpage()
* | Page already *locked*
* |- btrfs_lock_and_flush_ordered_range()
* |- btrfs_start_ordered_extent()
* |- extent_write_cache_pages()
* |- lock_page()
* We try to lock the page we already hold.
*
* Here we just writeback the whole data reloc inode, so that
* we will be ensured to have no dirty range in the page, and
* are safe to clear the uptodate bits.
*
* This shouldn't cause too much overhead, as we need to write
* the data back anyway.
*/
ret = filemap_write_and_wait(mapping);
if (ret < 0)
return ret;
clear_extent_bits(&inode->io_tree, i_size,
round_up(i_size, PAGE_SIZE) - 1,
EXTENT_UPTODATE);
page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
/*
* If page is freed we don't need to do anything then, as we
* will re-read the whole page anyway.
*/
if (page) {
btrfs_subpage_clear_uptodate(fs_info, page, i_size,
round_up(i_size, PAGE_SIZE) - i_size);
unlock_page(page);
put_page(page);
}
}
BUG_ON(cluster->start != cluster->boundary[0]);
ret = btrfs_alloc_data_chunk_ondemand(inode,
prealloc_end + 1 - prealloc_start);
if (ret)
return ret;
btrfs_inode_lock(&inode->vfs_inode, 0);
for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
end = cluster->boundary[nr + 1] - 1 - offset;
else
end = cluster->end - offset;
lock_extent(&inode->io_tree, start, end);
num_bytes = end + 1 - start;
ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
unlock_extent(&inode->io_tree, start, end);
if (ret)
break;
}
btrfs_inode_unlock(&inode->vfs_inode, 0);
if (cur_offset < prealloc_end)
btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
prealloc_end + 1 - cur_offset);
return ret;
}
static noinline_for_stack
int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
u64 block_start)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
int ret = 0;
em = alloc_extent_map();
if (!em)
return -ENOMEM;
em->start = start;
em->len = end + 1 - start;
em->block_len = em->len;
em->block_start = block_start;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
while (1) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
}
btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
}
unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
return ret;
}
/*
* Allow error injection to test balance/relocation cancellation
*/
noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
{
return atomic_read(&fs_info->balance_cancel_req) ||
atomic_read(&fs_info->reloc_cancel_req) ||
fatal_signal_pending(current);
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
int cluster_nr)
{
/* Last extent, use cluster end directly */
if (cluster_nr >= cluster->nr - 1)
return cluster->end;
/* Use next boundary start*/
return cluster->boundary[cluster_nr + 1] - 1;
}
static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
struct file_extent_cluster *cluster,
int *cluster_nr, unsigned long page_index)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 offset = BTRFS_I(inode)->index_cnt;
const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
struct page *page;
u64 page_start;
u64 page_end;
u64 cur;
int ret;
ASSERT(page_index <= last_index);
page = find_lock_page(inode->i_mapping, page_index);
if (!page) {
page_cache_sync_readahead(inode->i_mapping, ra, NULL,
page_index, last_index + 1 - page_index);
page = find_or_create_page(inode->i_mapping, page_index, mask);
if (!page)
return -ENOMEM;
}
ret = set_page_extent_mapped(page);
if (ret < 0)
goto release_page;
if (PageReadahead(page))
page_cache_async_readahead(inode->i_mapping, ra, NULL, page,
page_index, last_index + 1 - page_index);
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
if (!PageUptodate(page)) {
ret = -EIO;
goto release_page;
}
}
page_start = page_offset(page);
page_end = page_start + PAGE_SIZE - 1;
/*
* Start from the cluster, as for subpage case, the cluster can start
* inside the page.
*/
cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
while (cur <= page_end) {
u64 extent_start = cluster->boundary[*cluster_nr] - offset;
u64 extent_end = get_cluster_boundary_end(cluster,
*cluster_nr) - offset;
u64 clamped_start = max(page_start, extent_start);
u64 clamped_end = min(page_end, extent_end);
u32 clamped_len = clamped_end + 1 - clamped_start;
/* Reserve metadata for this range */
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
clamped_len);
if (ret)
goto release_page;
/* Mark the range delalloc and dirty for later writeback */
lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
clamped_end, 0, NULL);
if (ret) {
clear_extent_bits(&BTRFS_I(inode)->io_tree,
clamped_start, clamped_end,
EXTENT_LOCKED | EXTENT_BOUNDARY);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
clamped_len, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
clamped_len);
goto release_page;
}
btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
/*
* Set the boundary if it's inside the page.
* Data relocation requires the destination extents to have the
* same size as the source.
* EXTENT_BOUNDARY bit prevents current extent from being merged
* with previous extent.
*/
if (in_range(cluster->boundary[*cluster_nr] - offset,
page_start, PAGE_SIZE)) {
u64 boundary_start = cluster->boundary[*cluster_nr] -
offset;
u64 boundary_end = boundary_start +
fs_info->sectorsize - 1;
set_extent_bits(&BTRFS_I(inode)->io_tree,
boundary_start, boundary_end,
EXTENT_BOUNDARY);
}
unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
cur += clamped_len;
/* Crossed extent end, go to next extent */
if (cur >= extent_end) {
(*cluster_nr)++;
/* Just finished the last extent of the cluster, exit. */
if (*cluster_nr >= cluster->nr)
break;
}
}
unlock_page(page);
put_page(page);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
if (btrfs_should_cancel_balance(fs_info))
ret = -ECANCELED;
return ret;
release_page:
unlock_page(page);
put_page(page);
return ret;
}
static int relocate_file_extent_cluster(struct inode *inode,
struct file_extent_cluster *cluster)
{
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
struct file_ra_state *ra;
int cluster_nr = 0;
int ret = 0;
if (!cluster->nr)
return 0;
ra = kzalloc(sizeof(*ra), GFP_NOFS);
if (!ra)
return -ENOMEM;
ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
if (ret)
goto out;
file_ra_state_init(ra, inode->i_mapping);
ret = setup_extent_mapping(inode, cluster->start - offset,
cluster->end - offset, cluster->start);
if (ret)
goto out;
last_index = (cluster->end - offset) >> PAGE_SHIFT;
for (index = (cluster->start - offset) >> PAGE_SHIFT;
index <= last_index && !ret; index++)
ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
kfree(ra);
return ret;
}
static noinline_for_stack
int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
struct file_extent_cluster *cluster)
{
int ret;
if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
ret = relocate_file_extent_cluster(inode, cluster);
if (ret)
return ret;
cluster->nr = 0;
}
if (!cluster->nr)
cluster->start = extent_key->objectid;
else
BUG_ON(cluster->nr >= MAX_EXTENTS);
cluster->end = extent_key->objectid + extent_key->offset - 1;
cluster->boundary[cluster->nr] = extent_key->objectid;
cluster->nr++;
if (cluster->nr >= MAX_EXTENTS) {
ret = relocate_file_extent_cluster(inode, cluster);
if (ret)
return ret;
cluster->nr = 0;
}
return 0;
}
/*
* helper to add a tree block to the list.
* the major work is getting the generation and level of the block
*/
static int add_tree_block(struct reloc_control *rc,
struct btrfs_key *extent_key,
struct btrfs_path *path,
struct rb_root *blocks)
{
struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct btrfs_tree_block_info *bi;
struct tree_block *block;
struct rb_node *rb_node;
u32 item_size;
int level = -1;
u64 generation;
u64 owner = 0;
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, path->slots[0]);
if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
item_size >= sizeof(*ei) + sizeof(*bi)) {
unsigned long ptr = 0, end;
ei = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_extent_item);
end = (unsigned long)ei + item_size;
if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
bi = (struct btrfs_tree_block_info *)(ei + 1);
level = btrfs_tree_block_level(eb, bi);
ptr = (unsigned long)(bi + 1);
} else {
level = (int)extent_key->offset;
ptr = (unsigned long)(ei + 1);
}
generation = btrfs_extent_generation(eb, ei);
/*
* We're reading random blocks without knowing their owner ahead
* of time. This is ok most of the time, as all reloc roots and
* fs roots have the same lock type. However normal trees do
* not, and the only way to know ahead of time is to read the
* inline ref offset. We know it's an fs root if
*
* 1. There's more than one ref.
* 2. There's a SHARED_DATA_REF_KEY set.
* 3. FULL_BACKREF is set on the flags.
*
* Otherwise it's safe to assume that the ref offset == the
* owner of this block, so we can use that when calling
* read_tree_block.
*/
if (btrfs_extent_refs(eb, ei) == 1 &&
!(btrfs_extent_flags(eb, ei) &
BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
ptr < end) {
struct btrfs_extent_inline_ref *iref;
int type;
iref = (struct btrfs_extent_inline_ref *)ptr;
type = btrfs_get_extent_inline_ref_type(eb, iref,
BTRFS_REF_TYPE_BLOCK);
if (type == BTRFS_REF_TYPE_INVALID)
return -EINVAL;
if (type == BTRFS_TREE_BLOCK_REF_KEY)
owner = btrfs_extent_inline_ref_offset(eb, iref);
}
} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
btrfs_print_v0_err(eb->fs_info);
btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
return -EINVAL;
} else {
BUG();
}
btrfs_release_path(path);
BUG_ON(level == -1);
block = kmalloc(sizeof(*block), GFP_NOFS);
if (!block)
return -ENOMEM;
block->bytenr = extent_key->objectid;
block->key.objectid = rc->extent_root->fs_info->nodesize;
block->key.offset = generation;
block->level = level;
block->key_ready = 0;
block->owner = owner;
rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
if (rb_node)
btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
-EEXIST);
return 0;
}
/*
* helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
*/
static int __add_tree_block(struct reloc_control *rc,
u64 bytenr, u32 blocksize,
struct rb_root *blocks)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_path *path;
struct btrfs_key key;
int ret;
bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
if (tree_block_processed(bytenr, rc))
return 0;
if (rb_simple_search(blocks, bytenr))
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
again:
key.objectid = bytenr;
if (skinny) {
key.type = BTRFS_METADATA_ITEM_KEY;
key.offset = (u64)-1;
} else {
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = blocksize;
}
path->search_commit_root = 1;
path->skip_locking = 1;
ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0 && skinny) {
if (path->slots[0]) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
if (key.objectid == bytenr &&
(key.type == BTRFS_METADATA_ITEM_KEY ||
(key.type == BTRFS_EXTENT_ITEM_KEY &&
key.offset == blocksize)))
ret = 0;
}
if (ret) {
skinny = false;
btrfs_release_path(path);
goto again;
}
}
if (ret) {
ASSERT(ret == 1);
btrfs_print_leaf(path->nodes[0]);
btrfs_err(fs_info,
"tree block extent item (%llu) is not found in extent tree",
bytenr);
WARN_ON(1);
ret = -EINVAL;
goto out;
}
ret = add_tree_block(rc, &key, path, blocks);
out:
btrfs_free_path(path);
return ret;
}
static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
struct btrfs_block_group *block_group,
struct inode *inode,
u64 ino)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
int ret = 0;
if (inode)
goto truncate;
inode = btrfs_iget(fs_info->sb, ino, root);
if (IS_ERR(inode))
return -ENOENT;
truncate:
ret = btrfs_check_trunc_cache_free_space(fs_info,
&fs_info->global_block_rsv);
if (ret)
goto out;
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
iput(inode);
return ret;
}
/*
* Locate the free space cache EXTENT_DATA in root tree leaf and delete the
* cache inode, to avoid free space cache data extent blocking data relocation.
*/
static int delete_v1_space_cache(struct extent_buffer *leaf,
struct btrfs_block_group *block_group,
u64 data_bytenr)
{
u64 space_cache_ino;
struct btrfs_file_extent_item *ei;
struct btrfs_key key;
bool found = false;
int i;
int ret;
if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
return 0;
for (i = 0; i < btrfs_header_nritems(leaf); i++) {
u8 type;
btrfs_item_key_to_cpu(leaf, &key, i);
if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
type = btrfs_file_extent_type(leaf, ei);
if ((type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) &&
btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
found = true;
space_cache_ino = key.objectid;
break;
}
}
if (!found)
return -ENOENT;
ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
space_cache_ino);
return ret;
}
/*
* helper to find all tree blocks that reference a given data extent
*/
static noinline_for_stack
int add_data_references(struct reloc_control *rc,
struct btrfs_key *extent_key,
struct btrfs_path *path,
struct rb_root *blocks)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct ulist *leaves = NULL;
struct ulist_iterator leaf_uiter;
struct ulist_node *ref_node = NULL;
const u32 blocksize = fs_info->nodesize;
int ret = 0;
btrfs_release_path(path);
ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
0, &leaves, NULL, true);
if (ret < 0)
return ret;
ULIST_ITER_INIT(&leaf_uiter);
while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
struct extent_buffer *eb;
eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
}
ret = delete_v1_space_cache(eb, rc->block_group,
extent_key->objectid);
free_extent_buffer(eb);
if (ret < 0)
break;
ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
if (ret < 0)
break;
}
if (ret < 0)
free_block_list(blocks);
ulist_free(leaves);
return ret;
}
/*
* helper to find next unprocessed extent
*/
static noinline_for_stack
int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
struct btrfs_key *extent_key)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_key key;
struct extent_buffer *leaf;
u64 start, end, last;
int ret;
last = rc->block_group->start + rc->block_group->length;
while (1) {
cond_resched();
if (rc->search_start >= last) {
ret = 1;
break;
}
key.objectid = rc->search_start;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = 0;
path->search_commit_root = 1;
path->skip_locking = 1;
ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
0, 0);
if (ret < 0)
break;
next:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(rc->extent_root, path);
if (ret != 0)
break;
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid >= last) {
ret = 1;
break;
}
if (key.type != BTRFS_EXTENT_ITEM_KEY &&
key.type != BTRFS_METADATA_ITEM_KEY) {
path->slots[0]++;
goto next;
}
if (key.type == BTRFS_EXTENT_ITEM_KEY &&
key.objectid + key.offset <= rc->search_start) {
path->slots[0]++;
goto next;
}
if (key.type == BTRFS_METADATA_ITEM_KEY &&
key.objectid + fs_info->nodesize <=
rc->search_start) {
path->slots[0]++;
goto next;
}
ret = find_first_extent_bit(&rc->processed_blocks,
key.objectid, &start, &end,
EXTENT_DIRTY, NULL);
if (ret == 0 && start <= key.objectid) {
btrfs_release_path(path);
rc->search_start = end + 1;
} else {
if (key.type == BTRFS_EXTENT_ITEM_KEY)
rc->search_start = key.objectid + key.offset;
else
rc->search_start = key.objectid +
fs_info->nodesize;
memcpy(extent_key, &key, sizeof(key));
return 0;
}
}
btrfs_release_path(path);
return ret;
}
static void set_reloc_control(struct reloc_control *rc)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
mutex_lock(&fs_info->reloc_mutex);
fs_info->reloc_ctl = rc;
mutex_unlock(&fs_info->reloc_mutex);
}
static void unset_reloc_control(struct reloc_control *rc)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
mutex_lock(&fs_info->reloc_mutex);
fs_info->reloc_ctl = NULL;
mutex_unlock(&fs_info->reloc_mutex);
}
static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
struct btrfs_trans_handle *trans;
int ret;
rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
BTRFS_BLOCK_RSV_TEMP);
if (!rc->block_rsv)
return -ENOMEM;
memset(&rc->cluster, 0, sizeof(rc->cluster));
rc->search_start = rc->block_group->start;
rc->extents_found = 0;
rc->nodes_relocated = 0;
rc->merging_rsv_size = 0;
rc->reserved_bytes = 0;
rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
RELOCATION_RESERVED_NODES;
ret = btrfs_block_rsv_refill(rc->extent_root,
rc->block_rsv, rc->block_rsv->size,
BTRFS_RESERVE_FLUSH_ALL);
if (ret)
return ret;
rc->create_reloc_tree = 1;
set_reloc_control(rc);
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
unset_reloc_control(rc);
/*
* extent tree is not a ref_cow tree and has no reloc_root to
* cleanup. And callers are responsible to free the above
* block rsv.
*/
return PTR_ERR(trans);
}
return btrfs_commit_transaction(trans);
}
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
u64 flags;
int ret;
int err = 0;
int progress = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = READA_FORWARD;
ret = prepare_to_relocate(rc);
if (ret) {
err = ret;
goto out_free;
}
while (1) {
rc->reserved_bytes = 0;
ret = btrfs_block_rsv_refill(rc->extent_root,
rc->block_rsv, rc->block_rsv->size,
BTRFS_RESERVE_FLUSH_ALL);
if (ret) {
err = ret;
break;
}
progress++;
trans = btrfs_start_transaction(rc->extent_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
trans = NULL;
break;
}
restart:
if (update_backref_cache(trans, &rc->backref_cache)) {
btrfs_end_transaction(trans);
trans = NULL;
continue;
}
ret = find_next_extent(rc, path, &key);
if (ret < 0)
err = ret;
if (ret != 0)
break;
rc->extents_found++;
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
flags = btrfs_extent_flags(path->nodes[0], ei);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
} else if (rc->stage == UPDATE_DATA_PTRS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
ret = add_data_references(rc, &key, path, &blocks);
} else {
btrfs_release_path(path);
ret = 0;
}
if (ret < 0) {
err = ret;
break;
}
if (!RB_EMPTY_ROOT(&blocks)) {
ret = relocate_tree_blocks(trans, rc, &blocks);
if (ret < 0) {
if (ret != -EAGAIN) {
err = ret;
break;
}
rc->extents_found--;
rc->search_start = key.objectid;
}
}
btrfs_end_transaction_throttle(trans);
btrfs_btree_balance_dirty(fs_info);
trans = NULL;
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
rc->found_file_extent = 1;
ret = relocate_data_extent(rc->data_inode,
&key, &rc->cluster);
if (ret < 0) {
err = ret;
break;
}
}
if (btrfs_should_cancel_balance(fs_info)) {
err = -ECANCELED;
break;
}
}
if (trans && progress && err == -ENOSPC) {
ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
if (ret == 1) {
err = 0;
progress = 0;
goto restart;
}
}
btrfs_release_path(path);
clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
if (trans) {
btrfs_end_transaction_throttle(trans);
btrfs_btree_balance_dirty(fs_info);
}
if (!err) {
ret = relocate_file_extent_cluster(rc->data_inode,
&rc->cluster);
if (ret < 0)
err = ret;
}
rc->create_reloc_tree = 0;
set_reloc_control(rc);
btrfs_backref_release_cache(&rc->backref_cache);
btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
/*
* Even in the case when the relocation is cancelled, we should all go
* through prepare_to_merge() and merge_reloc_roots().
*
* For error (including cancelled balance), prepare_to_merge() will
* mark all reloc trees orphan, then queue them for cleanup in
* merge_reloc_roots()
*/
err = prepare_to_merge(rc, err);
merge_reloc_roots(rc);
rc->merge_reloc_tree = 0;
unset_reloc_control(rc);
btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
/* get rid of pinned extents */
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
}
ret = btrfs_commit_transaction(trans);
if (ret && !err)
err = ret;
out_free:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
err = ret;
btrfs_free_block_rsv(fs_info, rc->block_rsv);
btrfs_free_path(path);
return err;
}
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
struct btrfs_path *path;
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_insert_empty_inode(trans, root, path, objectid);
if (ret)
goto out;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
return ret;
}
static void delete_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
struct btrfs_path *path;
struct btrfs_key key;
int ret = 0;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
key.objectid = objectid;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
out:
if (ret)
btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
}
/*
* helper to create inode for data relocation.
* the inode is in data relocation tree and its link count is 0
*/
static noinline_for_stack
struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
struct btrfs_block_group *group)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
u64 objectid;
int err = 0;
root = btrfs_grab_root(fs_info->data_reloc_root);
trans = btrfs_start_transaction(root, 6);
if (IS_ERR(trans)) {
btrfs_put_root(root);
return ERR_CAST(trans);
}
err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out;
err = __insert_orphan_inode(trans, root, objectid);
if (err)
goto out;
inode = btrfs_iget(fs_info->sb, objectid, root);
if (IS_ERR(inode)) {
delete_orphan_inode(trans, root, objectid);
err = PTR_ERR(inode);
inode = NULL;
goto out;
}
BTRFS_I(inode)->index_cnt = group->start;
err = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
btrfs_put_root(root);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (err) {
if (inode)
iput(inode);
inode = ERR_PTR(err);
}
return inode;
}
/*
* Mark start of chunk relocation that is cancellable. Check if the cancellation
* has been requested meanwhile and don't start in that case.
*
* Return:
* 0 success
* -EINPROGRESS operation is already in progress, that's probably a bug
* -ECANCELED cancellation request was set before the operation started
*/
static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
{
if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
/* This should not happen */
btrfs_err(fs_info, "reloc already running, cannot start");
return -EINPROGRESS;
}
if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
btrfs_info(fs_info, "chunk relocation canceled on start");
/*
* On cancel, clear all requests but let the caller mark
* the end after cleanup operations.
*/
atomic_set(&fs_info->reloc_cancel_req, 0);
return -ECANCELED;
}
return 0;
}
/*
* Mark end of chunk relocation that is cancellable and wake any waiters.
*/
static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
{
/* Requested after start, clear bit first so any waiters can continue */
if (atomic_read(&fs_info->reloc_cancel_req) > 0)
btrfs_info(fs_info, "chunk relocation canceled during operation");
clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
atomic_set(&fs_info->reloc_cancel_req, 0);
}
static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
{
struct reloc_control *rc;
rc = kzalloc(sizeof(*rc), GFP_NOFS);
if (!rc)
return NULL;
INIT_LIST_HEAD(&rc->reloc_roots);
INIT_LIST_HEAD(&rc->dirty_subvol_roots);
btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
mapping_tree_init(&rc->reloc_root_tree);
extent_io_tree_init(fs_info, &rc->processed_blocks,
IO_TREE_RELOC_BLOCKS, NULL);
return rc;
}
static void free_reloc_control(struct reloc_control *rc)
{
struct mapping_node *node, *tmp;
free_reloc_roots(&rc->reloc_roots);
rbtree_postorder_for_each_entry_safe(node, tmp,
&rc->reloc_root_tree.rb_root, rb_node)
kfree(node);
kfree(rc);
}
/*
* Print the block group being relocated
*/
static void describe_relocation(struct btrfs_fs_info *fs_info,
struct btrfs_block_group *block_group)
{
char buf[128] = {'\0'};
btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
btrfs_info(fs_info,
"relocating block group %llu flags %s",
block_group->start, buf);
}
static const char *stage_to_string(int stage)
{
if (stage == MOVE_DATA_EXTENTS)
return "move data extents";
if (stage == UPDATE_DATA_PTRS)
return "update data pointers";
return "unknown";
}
/*
* function to relocate all extents in a block group.
*/
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
struct btrfs_block_group *bg;
struct btrfs_root *extent_root = fs_info->extent_root;
struct reloc_control *rc;
struct inode *inode;
struct btrfs_path *path;
int ret;
int rw = 0;
int err = 0;
/*
* This only gets set if we had a half-deleted snapshot on mount. We
* cannot allow relocation to start while we're still trying to clean up
* these pending deletions.
*/
ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
if (ret)
return ret;
/* We may have been woken up by close_ctree, so bail if we're closing. */
if (btrfs_fs_closing(fs_info))
return -EINTR;
bg = btrfs_lookup_block_group(fs_info, group_start);
if (!bg)
return -ENOENT;
if (btrfs_pinned_by_swapfile(fs_info, bg)) {
btrfs_put_block_group(bg);
return -ETXTBSY;
}
rc = alloc_reloc_control(fs_info);
if (!rc) {
btrfs_put_block_group(bg);
return -ENOMEM;
}
ret = reloc_chunk_start(fs_info);
if (ret < 0) {
err = ret;
goto out_put_bg;
}
rc->extent_root = extent_root;
rc->block_group = bg;
ret = btrfs_inc_block_group_ro(rc->block_group, true);
if (ret) {
err = ret;
goto out;
}
rw = 1;
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
goto out;
}
inode = lookup_free_space_inode(rc->block_group, path);
btrfs_free_path(path);
if (!IS_ERR(inode))
ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
else
ret = PTR_ERR(inode);
if (ret && ret != -ENOENT) {
err = ret;
goto out;
}
rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
if (IS_ERR(rc->data_inode)) {
err = PTR_ERR(rc->data_inode);
rc->data_inode = NULL;
goto out;
}
describe_relocation(fs_info, rc->block_group);
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
btrfs_wait_ordered_roots(fs_info, U64_MAX,
rc->block_group->start,
rc->block_group->length);
while (1) {
int finishes_stage;
mutex_lock(&fs_info->cleaner_mutex);
ret = relocate_block_group(rc);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0)
err = ret;
finishes_stage = rc->stage;
/*
* We may have gotten ENOSPC after we already dirtied some
* extents. If writeout happens while we're relocating a
* different block group we could end up hitting the
* BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
* btrfs_reloc_cow_block. Make sure we write everything out
* properly so we don't trip over this problem, and then break
* out of the loop if we hit an error.
*/
if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
ret = btrfs_wait_ordered_range(rc->data_inode, 0,
(u64)-1);
if (ret)
err = ret;
invalidate_mapping_pages(rc->data_inode->i_mapping,
0, -1);
rc->stage = UPDATE_DATA_PTRS;
}
if (err < 0)
goto out;
if (rc->extents_found == 0)
break;
btrfs_info(fs_info, "found %llu extents, stage: %s",
rc->extents_found, stage_to_string(finishes_stage));
}
WARN_ON(rc->block_group->pinned > 0);
WARN_ON(rc->block_group->reserved > 0);
WARN_ON(rc->block_group->used > 0);
out:
if (err && rw)
btrfs_dec_block_group_ro(rc->block_group);
iput(rc->data_inode);
out_put_bg:
btrfs_put_block_group(bg);
reloc_chunk_end(fs_info);
free_reloc_control(rc);
return err;
}
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
int ret, err;
trans = btrfs_start_transaction(fs_info->tree_root, 0);
if (IS_ERR(trans))
return PTR_ERR(trans);
memset(&root->root_item.drop_progress, 0,
sizeof(root->root_item.drop_progress));
btrfs_set_root_drop_level(&root->root_item, 0);
btrfs_set_root_refs(&root->root_item, 0);
ret = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key, &root->root_item);
err = btrfs_end_transaction(trans);
if (err)
return err;
return ret;
}
/*
* recover relocation interrupted by system crash.
*
* this function resumes merging reloc trees with corresponding fs trees.
* this is important for keeping the sharing of tree blocks
*/
int btrfs_recover_relocation(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
LIST_HEAD(reloc_roots);
struct btrfs_key key;
struct btrfs_root *fs_root;
struct btrfs_root *reloc_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct reloc_control *rc = NULL;
struct btrfs_trans_handle *trans;
int ret;
int err = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = READA_BACK;
key.objectid = BTRFS_TREE_RELOC_OBJECTID;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
path, 0, 0);
if (ret < 0) {
err = ret;
goto out;
}
if (ret > 0) { if (path->slots[0] == 0)
break;
path->slots[0]--;
}
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
key.type != BTRFS_ROOT_ITEM_KEY)
break;
reloc_root = btrfs_read_tree_root(root, &key);
if (IS_ERR(reloc_root)) {
err = PTR_ERR(reloc_root);
goto out;
}
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
list_add(&reloc_root->root_list, &reloc_roots);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
fs_root = btrfs_get_fs_root(fs_info,
reloc_root->root_key.offset, false);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
if (ret != -ENOENT) {
err = ret;
goto out;
}
ret = mark_garbage_root(reloc_root);
if (ret < 0) {
err = ret;
goto out;
}
} else {
btrfs_put_root(fs_root);
}
}
if (key.offset == 0)
break;
key.offset--;
}
btrfs_release_path(path);
if (list_empty(&reloc_roots))
goto out;
rc = alloc_reloc_control(fs_info);
if (!rc) {
err = -ENOMEM;
goto out;
}
ret = reloc_chunk_start(fs_info);
if (ret < 0) {
err = ret;
goto out_end;
}
rc->extent_root = fs_info->extent_root;
set_reloc_control(rc);
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_unset;
}
rc->merge_reloc_tree = 1;
while (!list_empty(&reloc_roots)) {
reloc_root = list_entry(reloc_roots.next,
struct btrfs_root, root_list);
list_del(&reloc_root->root_list);
if (btrfs_root_refs(&reloc_root->root_item) == 0) {
list_add_tail(&reloc_root->root_list,
&rc->reloc_roots);
continue;
}
fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
if (IS_ERR(fs_root)) {
err = PTR_ERR(fs_root);
list_add_tail(&reloc_root->root_list, &reloc_roots);
btrfs_end_transaction(trans);
goto out_unset;
}
err = __add_reloc_root(reloc_root);
ASSERT(err != -EEXIST);
if (err) {
list_add_tail(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(fs_root);
btrfs_end_transaction(trans);
goto out_unset;
}
fs_root->reloc_root = btrfs_grab_root(reloc_root);
btrfs_put_root(fs_root);
}
err = btrfs_commit_transaction(trans);
if (err)
goto out_unset;
merge_reloc_roots(rc);
unset_reloc_control(rc);
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_clean;
}
err = btrfs_commit_transaction(trans);
out_clean:
ret = clean_dirty_subvols(rc); if (ret < 0 && !err)
err = ret;
out_unset:
unset_reloc_control(rc);
out_end:
reloc_chunk_end(fs_info);
free_reloc_control(rc);
out:
free_reloc_roots(&reloc_roots);
btrfs_free_path(path);
if (err == 0) {
/* cleanup orphan inode in data relocation tree */
fs_root = btrfs_grab_root(fs_info->data_reloc_root);
ASSERT(fs_root);
err = btrfs_orphan_cleanup(fs_root); btrfs_put_root(fs_root);
}
return err;
}
/*
* helper to add ordered checksum for data relocation.
*
* cloning checksum properly handles the nodatasum extents.
* it also saves CPU time to re-calculate the checksum.
*/
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered;
int ret;
u64 disk_bytenr;
u64 new_bytenr;
LIST_HEAD(list);
ordered = btrfs_lookup_ordered_extent(inode, file_pos);
BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
disk_bytenr = file_pos + inode->index_cnt;
ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
disk_bytenr + len - 1, &list, 0);
if (ret)
goto out;
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
list_del_init(&sums->list);
/*
* We need to offset the new_bytenr based on where the csum is.
* We need to do this because we will read in entire prealloc
* extents but we may have written to say the middle of the
* prealloc extent, so we need to make sure the csum goes with
* the right disk offset.
*
* We can do this because the data reloc inode refers strictly
* to the on disk bytes, so we don't have to worry about
* disk_len vs real len like with real inodes since it's all
* disk length.
*/
new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
sums->bytenr = new_bytenr;
btrfs_add_ordered_sum(ordered, sums);
}
out:
btrfs_put_ordered_extent(ordered);
return ret;
}
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *cow)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct reloc_control *rc;
struct btrfs_backref_node *node;
int first_cow = 0;
int level;
int ret = 0;
rc = fs_info->reloc_ctl;
if (!rc)
return 0;
BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root)); level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item))
first_cow = 1;
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
rc->create_reloc_tree) {
WARN_ON(!first_cow && level == 0); node = rc->backref_cache.path[level]; BUG_ON(node->bytenr != buf->start &&
node->new_bytenr != buf->start);
btrfs_backref_drop_node_buffer(node);
atomic_inc(&cow->refs);
node->eb = cow;
node->new_bytenr = cow->start;
if (!node->pending) {
list_move_tail(&node->list,
&rc->backref_cache.pending[level]);
node->pending = 1;
}
if (first_cow) mark_block_processed(rc, node);
if (first_cow && level > 0)
rc->nodes_relocated += buf->len;
}
if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) ret = replace_file_extents(trans, rc, root, cow);
return ret;
}
/*
* called before creating snapshot. it calculates metadata reservation
* required for relocating tree blocks in the snapshot
*/
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve)
{
struct btrfs_root *root = pending->root;
struct reloc_control *rc = root->fs_info->reloc_ctl;
if (!rc || !have_reloc_root(root))
return;
if (!rc->merge_reloc_tree)
return;
root = root->reloc_root;
BUG_ON(btrfs_root_refs(&root->root_item) == 0);
/*
* relocation is in the stage of merging trees. the space
* used by merging a reloc tree is twice the size of
* relocated tree nodes in the worst case. half for cowing
* the reloc tree, half for cowing the fs tree. the space
* used by cowing the reloc tree will be freed after the
* tree is dropped. if we create snapshot, cowing the fs
* tree may use more space than it frees. so we need
* reserve extra space.
*/
*bytes_to_reserve += rc->nodes_relocated;
}
/*
* called after snapshot is created. migrate block reservation
* and create reloc root for the newly created snapshot
*
* This is similar to btrfs_init_reloc_root(), we come out of here with two
* references held on the reloc_root, one for root->reloc_root and one for
* rc->reloc_roots.
*/
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending)
{
struct btrfs_root *root = pending->root;
struct btrfs_root *reloc_root;
struct btrfs_root *new_root;
struct reloc_control *rc = root->fs_info->reloc_ctl;
int ret;
if (!rc || !have_reloc_root(root))
return 0;
rc = root->fs_info->reloc_ctl;
rc->merging_rsv_size += rc->nodes_relocated;
if (rc->merge_reloc_tree) {
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
rc->block_rsv,
rc->nodes_relocated, true);
if (ret)
return ret;
}
new_root = pending->snap;
reloc_root = create_reloc_root(trans, root->reloc_root,
new_root->root_key.objectid);
if (IS_ERR(reloc_root))
return PTR_ERR(reloc_root);
ret = __add_reloc_root(reloc_root);
ASSERT(ret != -EEXIST);
if (ret) {
/* Pairs with create_reloc_root */
btrfs_put_root(reloc_root);
return ret;
}
new_root->reloc_root = btrfs_grab_root(reloc_root);
if (rc->create_reloc_tree)
ret = clone_backref_node(trans, rc, root, reloc_root);
return ret;
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BITOPS_H
#define _ASM_X86_BITOPS_H
/*
* Copyright 1992, Linus Torvalds.
*
* Note: inlines with more than a single statement should be marked
* __always_inline to avoid problems with older gcc's inlining heuristics.
*/
#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
#endif
#include <linux/compiler.h>
#include <asm/alternative.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>
#if BITS_PER_LONG == 32
# define _BITOPS_LONG_SHIFT 5
#elif BITS_PER_LONG == 64
# define _BITOPS_LONG_SHIFT 6
#else
# error "Unexpected BITS_PER_LONG"
#endif
#define BIT_64(n) (U64_C(1) << (n))
/*
* These have to be done with inline assembly: that way the bit-setting
* is guaranteed to be atomic. All bit operations return 0 if the bit
* was cleared before the operation and != 0 if it was not.
*
* bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
*/
#define RLONG_ADDR(x) "m" (*(volatile long *) (x))
#define WBYTE_ADDR(x) "+m" (*(volatile char *) (x))
#define ADDR RLONG_ADDR(addr)
/*
* We do the locked ops that don't return the old value as
* a mask operation on a byte.
*/
#define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3))
#define CONST_MASK(nr) (1 << ((nr) & 7))
static __always_inline void
arch_set_bit(long nr, volatile unsigned long *addr)
{
if (__builtin_constant_p(nr)) {
asm volatile(LOCK_PREFIX "orb %b1,%0"
: CONST_MASK_ADDR(nr, addr)
: "iq" (CONST_MASK(nr))
: "memory");
} else {
asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
static __always_inline void
arch___set_bit(long nr, volatile unsigned long *addr)
{
asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}
static __always_inline void
arch_clear_bit(long nr, volatile unsigned long *addr)
{
if (__builtin_constant_p(nr)) {
asm volatile(LOCK_PREFIX "andb %b1,%0"
: CONST_MASK_ADDR(nr, addr)
: "iq" (~CONST_MASK(nr)));
} else {
asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
static __always_inline void
arch_clear_bit_unlock(long nr, volatile unsigned long *addr)
{
barrier();
arch_clear_bit(nr, addr);
}
static __always_inline void
arch___clear_bit(long nr, volatile unsigned long *addr)
{
asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}
static __always_inline bool
arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
{
bool negative;
asm volatile(LOCK_PREFIX "andb %2,%1"
CC_SET(s)
: CC_OUT(s) (negative), WBYTE_ADDR(addr)
: "ir" ((char) ~(1 << nr)) : "memory");
return negative;
}
#define arch_clear_bit_unlock_is_negative_byte \
arch_clear_bit_unlock_is_negative_byte
static __always_inline void
arch___clear_bit_unlock(long nr, volatile unsigned long *addr)
{
arch___clear_bit(nr, addr);
}
static __always_inline void
arch___change_bit(long nr, volatile unsigned long *addr)
{
asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
}
static __always_inline void
arch_change_bit(long nr, volatile unsigned long *addr)
{
if (__builtin_constant_p(nr)) {
asm volatile(LOCK_PREFIX "xorb %b1,%0"
: CONST_MASK_ADDR(nr, addr)
: "iq" (CONST_MASK(nr)));
} else {
asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
: : RLONG_ADDR(addr), "Ir" (nr) : "memory");
}
}
static __always_inline bool
arch_test_and_set_bit(long nr, volatile unsigned long *addr)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr);
}
static __always_inline bool
arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr)
{
return arch_test_and_set_bit(nr, addr);
}
static __always_inline bool
arch___test_and_set_bit(long nr, volatile unsigned long *addr)
{
bool oldbit;
asm(__ASM_SIZE(bts) " %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
: ADDR, "Ir" (nr) : "memory");
return oldbit;
}
static __always_inline bool
arch_test_and_clear_bit(long nr, volatile unsigned long *addr)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr);
}
/*
* Note: the operation is performed atomically with respect to
* the local CPU, but not other CPUs. Portable code should not
* rely on this behaviour.
* KVM relies on this behaviour on x86 for modifying memory that is also
* accessed from a hypervisor on the same CPU if running in a VM: don't change
* this without also updating arch/x86/kernel/kvm.c
*/
static __always_inline bool
arch___test_and_clear_bit(long nr, volatile unsigned long *addr)
{
bool oldbit;
asm volatile(__ASM_SIZE(btr) " %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
: ADDR, "Ir" (nr) : "memory");
return oldbit;
}
static __always_inline bool
arch___test_and_change_bit(long nr, volatile unsigned long *addr)
{
bool oldbit;
asm volatile(__ASM_SIZE(btc) " %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
: ADDR, "Ir" (nr) : "memory");
return oldbit;
}
static __always_inline bool
arch_test_and_change_bit(long nr, volatile unsigned long *addr)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr);
}
static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
{
return ((1UL << (nr & (BITS_PER_LONG-1))) &
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
{
bool oldbit;
asm volatile(__ASM_SIZE(bt) " %2,%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
: "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
return oldbit;
}
#define arch_test_bit(nr, addr) \
(__builtin_constant_p((nr)) \
? constant_test_bit((nr), (addr)) \
: variable_test_bit((nr), (addr)))
/**
* __ffs - find first set bit in word
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static __always_inline unsigned long __ffs(unsigned long word)
{
asm("rep; bsf %1,%0"
: "=r" (word)
: "rm" (word));
return word;
}
/**
* ffz - find first zero bit in word
* @word: The word to search
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
static __always_inline unsigned long ffz(unsigned long word)
{
asm("rep; bsf %1,%0"
: "=r" (word)
: "r" (~word));
return word;
}
/*
* __fls: find last set bit in word
* @word: The word to search
*
* Undefined if no set bit exists, so code should check against 0 first.
*/
static __always_inline unsigned long __fls(unsigned long word)
{
asm("bsr %1,%0"
: "=r" (word)
: "rm" (word));
return word;
}
#undef ADDR
#ifdef __KERNEL__
/**
* ffs - find first set bit in word
* @x: the word to search
*
* This is defined the same way as the libc and compiler builtin ffs
* routines, therefore differs in spirit from the other bitops.
*
* ffs(value) returns 0 if value is 0 or the position of the first
* set bit if value is nonzero. The first (least significant) bit
* is at position 1.
*/
static __always_inline int ffs(int x)
{
int r;
#ifdef CONFIG_X86_64
/*
* AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
* dest reg is undefined if x==0, but their CPU architect says its
* value is written to set it to the same as before, except that the
* top 32 bits will be cleared.
*
* We cannot do this on 32 bits because at the very least some
* 486 CPUs did not behave this way.
*/
asm("bsfl %1,%0"
: "=r" (r)
: "rm" (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
asm("bsfl %1,%0\n\t"
"cmovzl %2,%0"
: "=&r" (r) : "rm" (x), "r" (-1));
#else
asm("bsfl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
#endif
return r + 1;
}
/**
* fls - find last set bit in word
* @x: the word to search
*
* This is defined in a similar way as the libc and compiler builtin
* ffs, but returns the position of the most significant set bit.
*
* fls(value) returns 0 if value is 0 or the position of the last
* set bit if value is nonzero. The last (most significant) bit is
* at position 32.
*/
static __always_inline int fls(unsigned int x)
{
int r;
#ifdef CONFIG_X86_64
/*
* AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
* dest reg is undefined if x==0, but their CPU architect says its
* value is written to set it to the same as before, except that the
* top 32 bits will be cleared.
*
* We cannot do this on 32 bits because at the very least some
* 486 CPUs did not behave this way.
*/
asm("bsrl %1,%0"
: "=r" (r)
: "rm" (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
asm("bsrl %1,%0\n\t"
"cmovzl %2,%0"
: "=&r" (r) : "rm" (x), "rm" (-1));
#else
asm("bsrl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
#endif
return r + 1;
}
/**
* fls64 - find last set bit in a 64-bit word
* @x: the word to search
*
* This is defined in a similar way as the libc and compiler builtin
* ffsll, but returns the position of the most significant set bit.
*
* fls64(value) returns 0 if value is 0 or the position of the last
* set bit if value is nonzero. The last (most significant) bit is
* at position 64.
*/
#ifdef CONFIG_X86_64
static __always_inline int fls64(__u64 x)
{
int bitpos = -1;
/*
* AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
* dest reg is undefined if x==0, but their CPU architect says its
* value is written to set it to the same as before.
*/
asm("bsrq %1,%q0"
: "+r" (bitpos)
: "rm" (x));
return bitpos + 1;
}
#else
#include <asm-generic/bitops/fls64.h>
#endif
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/sched.h>
#include <asm/arch_hweight.h>
#include <asm-generic/bitops/const_hweight.h>
#include <asm-generic/bitops/instrumented-atomic.h>
#include <asm-generic/bitops/instrumented-non-atomic.h>
#include <asm-generic/bitops/instrumented-lock.h>
#include <asm-generic/bitops/le.h>
#include <asm-generic/bitops/ext2-atomic-setbit.h>
#endif /* __KERNEL__ */
#endif /* _ASM_X86_BITOPS_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_SCSI_DEVICE_H
#define _SCSI_SCSI_DEVICE_H
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/blkdev.h>
#include <scsi/scsi.h>
#include <linux/atomic.h>
#include <linux/sbitmap.h>
struct bsg_device;
struct device;
struct request_queue;
struct scsi_cmnd;
struct scsi_lun;
struct scsi_sense_hdr;
typedef __u64 __bitwise blist_flags_t;
#define SCSI_SENSE_BUFFERSIZE 96
struct scsi_mode_data {
__u32 length;
__u16 block_descriptor_length;
__u8 medium_type;
__u8 device_specific;
__u8 header_length;
__u8 longlba:1;
};
/*
* sdev state: If you alter this, you also need to alter scsi_sysfs.c
* (for the ascii descriptions) and the state model enforcer:
* scsi_lib:scsi_device_set_state().
*/
enum scsi_device_state {
SDEV_CREATED = 1, /* device created but not added to sysfs
* Only internal commands allowed (for inq) */
SDEV_RUNNING, /* device properly configured
* All commands allowed */
SDEV_CANCEL, /* beginning to delete device
* Only error handler commands allowed */
SDEV_DEL, /* device deleted
* no commands allowed */
SDEV_QUIESCE, /* Device quiescent. No block commands
* will be accepted, only specials (which
* originate in the mid-layer) */
SDEV_OFFLINE, /* Device offlined (by error handling or
* user request */
SDEV_TRANSPORT_OFFLINE, /* Offlined by transport class error handler */
SDEV_BLOCK, /* Device blocked by scsi lld. No
* scsi commands from user or midlayer
* should be issued to the scsi
* lld. */
SDEV_CREATED_BLOCK, /* same as above but for created devices */
};
enum scsi_scan_mode {
SCSI_SCAN_INITIAL = 0,
SCSI_SCAN_RESCAN,
SCSI_SCAN_MANUAL,
};
enum scsi_device_event {
SDEV_EVT_MEDIA_CHANGE = 1, /* media has changed */
SDEV_EVT_INQUIRY_CHANGE_REPORTED, /* 3F 03 UA reported */
SDEV_EVT_CAPACITY_CHANGE_REPORTED, /* 2A 09 UA reported */
SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED, /* 38 07 UA reported */
SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED, /* 2A 01 UA reported */
SDEV_EVT_LUN_CHANGE_REPORTED, /* 3F 0E UA reported */
SDEV_EVT_ALUA_STATE_CHANGE_REPORTED, /* 2A 06 UA reported */
SDEV_EVT_POWER_ON_RESET_OCCURRED, /* 29 00 UA reported */
SDEV_EVT_FIRST = SDEV_EVT_MEDIA_CHANGE,
SDEV_EVT_LAST = SDEV_EVT_POWER_ON_RESET_OCCURRED,
SDEV_EVT_MAXBITS = SDEV_EVT_LAST + 1
};
struct scsi_event {
enum scsi_device_event evt_type;
struct list_head node;
/* put union of data structures, for non-simple event types,
* here
*/
};
/**
* struct scsi_vpd - SCSI Vital Product Data
* @rcu: For kfree_rcu().
* @len: Length in bytes of @data.
* @data: VPD data as defined in various T10 SCSI standard documents.
*/
struct scsi_vpd {
struct rcu_head rcu;
int len;
unsigned char data[];
};
struct scsi_device {
struct Scsi_Host *host;
struct request_queue *request_queue;
/* the next two are protected by the host->host_lock */
struct list_head siblings; /* list of all devices on this host */
struct list_head same_target_siblings; /* just the devices sharing same target id */
struct sbitmap budget_map;
atomic_t device_blocked; /* Device returned QUEUE_FULL. */
atomic_t restarts;
spinlock_t list_lock;
struct list_head starved_entry;
unsigned short queue_depth; /* How deep of a queue we want */
unsigned short max_queue_depth; /* max queue depth */
unsigned short last_queue_full_depth; /* These two are used by */
unsigned short last_queue_full_count; /* scsi_track_queue_full() */
unsigned long last_queue_full_time; /* last queue full time */
unsigned long queue_ramp_up_period; /* ramp up period in jiffies */
#define SCSI_DEFAULT_RAMP_UP_PERIOD (120 * HZ)
unsigned long last_queue_ramp_up; /* last queue ramp up time */
unsigned int id, channel;
u64 lun;
unsigned int manufacturer; /* Manufacturer of device, for using
* vendor-specific cmd's */
unsigned sector_size; /* size in bytes */
void *hostdata; /* available to low-level driver */
unsigned char type;
char scsi_level;
char inq_periph_qual; /* PQ from INQUIRY data */
struct mutex inquiry_mutex;
unsigned char inquiry_len; /* valid bytes in 'inquiry' */
unsigned char * inquiry; /* INQUIRY response data */
const char * vendor; /* [back_compat] point into 'inquiry' ... */
const char * model; /* ... after scan; point to static string */
const char * rev; /* ... "nullnullnullnull" before scan */
#define SCSI_VPD_PG_LEN 255
struct scsi_vpd __rcu *vpd_pg0;
struct scsi_vpd __rcu *vpd_pg83;
struct scsi_vpd __rcu *vpd_pg80;
struct scsi_vpd __rcu *vpd_pg89;
struct scsi_target *sdev_target;
blist_flags_t sdev_bflags; /* black/white flags as also found in
* scsi_devinfo.[hc]. For now used only to
* pass settings from slave_alloc to scsi
* core. */
unsigned int eh_timeout; /* Error handling timeout */
unsigned removable:1;
unsigned changed:1; /* Data invalid due to media change */
unsigned busy:1; /* Used to prevent races */
unsigned lockable:1; /* Able to prevent media removal */
unsigned locked:1; /* Media removal disabled */
unsigned borken:1; /* Tell the Seagate driver to be
* painfully slow on this device */
unsigned disconnect:1; /* can disconnect */
unsigned soft_reset:1; /* Uses soft reset option */
unsigned sdtr:1; /* Device supports SDTR messages */
unsigned wdtr:1; /* Device supports WDTR messages */
unsigned ppr:1; /* Device supports PPR messages */
unsigned tagged_supported:1; /* Supports SCSI-II tagged queuing */
unsigned simple_tags:1; /* simple queue tag messages are enabled */
unsigned was_reset:1; /* There was a bus reset on the bus for
* this device */
unsigned expecting_cc_ua:1; /* Expecting a CHECK_CONDITION/UNIT_ATTN
* because we did a bus reset. */
unsigned use_10_for_rw:1; /* first try 10-byte read / write */
unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */
unsigned set_dbd_for_ms:1; /* Set "DBD" field in mode sense */
unsigned no_report_opcodes:1; /* no REPORT SUPPORTED OPERATION CODES */
unsigned no_write_same:1; /* no WRITE SAME command */
unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */
unsigned skip_ms_page_8:1; /* do not use MODE SENSE page 0x08 */
unsigned skip_ms_page_3f:1; /* do not use MODE SENSE page 0x3f */
unsigned skip_vpd_pages:1; /* do not read VPD pages */
unsigned try_vpd_pages:1; /* attempt to read VPD pages */
unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */
unsigned no_start_on_add:1; /* do not issue start on add */
unsigned allow_restart:1; /* issue START_UNIT in error handler */
unsigned manage_start_stop:1; /* Let HLD (sd) manage start/stop */
unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */
unsigned no_uld_attach:1; /* disable connecting to upper level drivers */
unsigned select_no_atn:1;
unsigned fix_capacity:1; /* READ_CAPACITY is too high by 1 */
unsigned guess_capacity:1; /* READ_CAPACITY might be too high by 1 */
unsigned retry_hwerror:1; /* Retry HARDWARE_ERROR */
unsigned last_sector_bug:1; /* do not use multisector accesses on
SD_LAST_BUGGY_SECTORS */
unsigned no_read_disc_info:1; /* Avoid READ_DISC_INFO cmds */
unsigned no_read_capacity_16:1; /* Avoid READ_CAPACITY_16 cmds */
unsigned try_rc_10_first:1; /* Try READ_CAPACACITY_10 first */
unsigned security_supported:1; /* Supports Security Protocols */
unsigned is_visible:1; /* is the device visible in sysfs */
unsigned wce_default_on:1; /* Cache is ON by default */
unsigned no_dif:1; /* T10 PI (DIF) should be disabled */
unsigned broken_fua:1; /* Don't set FUA bit */
unsigned lun_in_cdb:1; /* Store LUN bits in CDB[1] */
unsigned unmap_limit_for_ws:1; /* Use the UNMAP limit for WRITE SAME */
unsigned rpm_autosuspend:1; /* Enable runtime autosuspend at device
* creation time */
unsigned ignore_media_change:1; /* Ignore MEDIA CHANGE on resume */
unsigned silence_suspend:1; /* Do not print runtime PM related messages */
bool offline_already; /* Device offline message logged */
atomic_t disk_events_disable_depth; /* disable depth for disk events */
DECLARE_BITMAP(supported_events, SDEV_EVT_MAXBITS); /* supported events */
DECLARE_BITMAP(pending_events, SDEV_EVT_MAXBITS); /* pending events */
struct list_head event_list; /* asserted events */
struct work_struct event_work;
unsigned int max_device_blocked; /* what device_blocked counts down from */
#define SCSI_DEFAULT_DEVICE_BLOCKED 3
atomic_t iorequest_cnt;
atomic_t iodone_cnt;
atomic_t ioerr_cnt;
struct device sdev_gendev,
sdev_dev;
struct execute_work ew; /* used to get process context on put */
struct work_struct requeue_work;
struct scsi_device_handler *handler;
void *handler_data;
size_t dma_drain_len;
void *dma_drain_buf;
unsigned int sg_timeout;
unsigned int sg_reserved_size;
struct bsg_device *bsg_dev;
unsigned char access_state;
struct mutex state_mutex;
enum scsi_device_state sdev_state;
struct task_struct *quiesced_by;
unsigned long sdev_data[];
} __attribute__((aligned(sizeof(unsigned long))));
#define to_scsi_device(d) \
container_of(d, struct scsi_device, sdev_gendev)
#define class_to_sdev(d) \
container_of(d, struct scsi_device, sdev_dev)
#define transport_class_to_sdev(class_dev) \
to_scsi_device(class_dev->parent)
#define sdev_dbg(sdev, fmt, a...) \
dev_dbg(&(sdev)->sdev_gendev, fmt, ##a)
/*
* like scmd_printk, but the device name is passed in
* as a string pointer
*/
__printf(4, 5) void
sdev_prefix_printk(const char *, const struct scsi_device *, const char *,
const char *, ...);
#define sdev_printk(l, sdev, fmt, a...) \
sdev_prefix_printk(l, sdev, NULL, fmt, ##a)
__printf(3, 4) void
scmd_printk(const char *, const struct scsi_cmnd *, const char *, ...);
#define scmd_dbg(scmd, fmt, a...) \
do { \
struct request *__rq = scsi_cmd_to_rq((scmd)); \
\
if (__rq->rq_disk) \
sdev_dbg((scmd)->device, "[%s] " fmt, \
__rq->rq_disk->disk_name, ##a); \
else \
sdev_dbg((scmd)->device, fmt, ##a); \
} while (0)
enum scsi_target_state {
STARGET_CREATED = 1,
STARGET_RUNNING,
STARGET_REMOVE,
STARGET_CREATED_REMOVE,
STARGET_DEL,
};
/*
* scsi_target: representation of a scsi target, for now, this is only
* used for single_lun devices. If no one has active IO to the target,
* starget_sdev_user is NULL, else it points to the active sdev.
*/
struct scsi_target {
struct scsi_device *starget_sdev_user;
struct list_head siblings;
struct list_head devices;
struct device dev;
struct kref reap_ref; /* last put renders target invisible */
unsigned int channel;
unsigned int id; /* target id ... replace
* scsi_device.id eventually */
unsigned int create:1; /* signal that it needs to be added */
unsigned int single_lun:1; /* Indicates we should only
* allow I/O to one of the luns
* for the device at a time. */
unsigned int pdt_1f_for_no_lun:1; /* PDT = 0x1f
* means no lun present. */
unsigned int no_report_luns:1; /* Don't use
* REPORT LUNS for scanning. */
unsigned int expecting_lun_change:1; /* A device has reported
* a 3F/0E UA, other devices on
* the same target will also. */
/* commands actually active on LLD. */
atomic_t target_busy;
atomic_t target_blocked;
/*
* LLDs should set this in the slave_alloc host template callout.
* If set to zero then there is not limit.
*/
unsigned int can_queue;
unsigned int max_target_blocked;
#define SCSI_DEFAULT_TARGET_BLOCKED 3
char scsi_level;
enum scsi_target_state state;
void *hostdata; /* available to low-level driver */
unsigned long starget_data[]; /* for the transport */
/* starget_data must be the last element!!!! */
} __attribute__((aligned(sizeof(unsigned long))));
#define to_scsi_target(d) container_of(d, struct scsi_target, dev)
static inline struct scsi_target *scsi_target(struct scsi_device *sdev)
{
return to_scsi_target(sdev->sdev_gendev.parent);
}
#define transport_class_to_starget(class_dev) \
to_scsi_target(class_dev->parent)
#define starget_printk(prefix, starget, fmt, a...) \
dev_printk(prefix, &(starget)->dev, fmt, ##a)
extern struct scsi_device *__scsi_add_device(struct Scsi_Host *,
uint, uint, u64, void *hostdata);
extern int scsi_add_device(struct Scsi_Host *host, uint channel,
uint target, u64 lun);
extern int scsi_register_device_handler(struct scsi_device_handler *scsi_dh);
extern void scsi_remove_device(struct scsi_device *);
extern int scsi_unregister_device_handler(struct scsi_device_handler *scsi_dh);
void scsi_attach_vpd(struct scsi_device *sdev);
extern struct scsi_device *scsi_device_from_queue(struct request_queue *q);
extern int __must_check scsi_device_get(struct scsi_device *);
extern void scsi_device_put(struct scsi_device *);
extern struct scsi_device *scsi_device_lookup(struct Scsi_Host *,
uint, uint, u64);
extern struct scsi_device *__scsi_device_lookup(struct Scsi_Host *,
uint, uint, u64);
extern struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *,
u64);
extern struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *,
u64);
extern void starget_for_each_device(struct scsi_target *, void *,
void (*fn)(struct scsi_device *, void *));
extern void __starget_for_each_device(struct scsi_target *, void *,
void (*fn)(struct scsi_device *,
void *));
/* only exposed to implement shost_for_each_device */
extern struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *,
struct scsi_device *);
/**
* shost_for_each_device - iterate over all devices of a host
* @sdev: the &struct scsi_device to use as a cursor
* @shost: the &struct scsi_host to iterate over
*
* Iterator that returns each device attached to @shost. This loop
* takes a reference on each device and releases it at the end. If
* you break out of the loop, you must call scsi_device_put(sdev).
*/
#define shost_for_each_device(sdev, shost) \
for ((sdev) = __scsi_iterate_devices((shost), NULL); \
(sdev); \
(sdev) = __scsi_iterate_devices((shost), (sdev)))
/**
* __shost_for_each_device - iterate over all devices of a host (UNLOCKED)
* @sdev: the &struct scsi_device to use as a cursor
* @shost: the &struct scsi_host to iterate over
*
* Iterator that returns each device attached to @shost. It does _not_
* take a reference on the scsi_device, so the whole loop must be
* protected by shost->host_lock.
*
* Note: The only reason to use this is because you need to access the
* device list in interrupt context. Otherwise you really want to use
* shost_for_each_device instead.
*/
#define __shost_for_each_device(sdev, shost) \
list_for_each_entry((sdev), &((shost)->__devices), siblings)
extern int scsi_change_queue_depth(struct scsi_device *, int);
extern int scsi_track_queue_full(struct scsi_device *, int);
extern int scsi_set_medium_removal(struct scsi_device *, char);
extern int scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage,
unsigned char *buffer, int len, int timeout,
int retries, struct scsi_mode_data *data,
struct scsi_sense_hdr *);
extern int scsi_mode_select(struct scsi_device *sdev, int pf, int sp,
int modepage, unsigned char *buffer, int len,
int timeout, int retries,
struct scsi_mode_data *data,
struct scsi_sense_hdr *);
extern int scsi_test_unit_ready(struct scsi_device *sdev, int timeout,
int retries, struct scsi_sense_hdr *sshdr);
extern int scsi_get_vpd_page(struct scsi_device *, u8 page, unsigned char *buf,
int buf_len);
extern int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer,
unsigned int len, unsigned char opcode);
extern int scsi_device_set_state(struct scsi_device *sdev,
enum scsi_device_state state);
extern struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type,
gfp_t gfpflags);
extern void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt);
extern void sdev_evt_send_simple(struct scsi_device *sdev,
enum scsi_device_event evt_type, gfp_t gfpflags);
extern int scsi_device_quiesce(struct scsi_device *sdev);
extern void scsi_device_resume(struct scsi_device *sdev);
extern void scsi_target_quiesce(struct scsi_target *);
extern void scsi_target_resume(struct scsi_target *);
extern void scsi_scan_target(struct device *parent, unsigned int channel,
unsigned int id, u64 lun,
enum scsi_scan_mode rescan);
extern void scsi_target_reap(struct scsi_target *);
extern void scsi_target_block(struct device *);
extern void scsi_target_unblock(struct device *, enum scsi_device_state);
extern void scsi_remove_target(struct device *);
extern const char *scsi_device_state_name(enum scsi_device_state);
extern int scsi_is_sdev_device(const struct device *);
extern int scsi_is_target_device(const struct device *);
extern void scsi_sanitize_inquiry_string(unsigned char *s, int len);
extern int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
int data_direction, void *buffer, unsigned bufflen,
unsigned char *sense, struct scsi_sense_hdr *sshdr,
int timeout, int retries, u64 flags,
req_flags_t rq_flags, int *resid);
/* Make sure any sense buffer is the correct size. */
#define scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense, \
sshdr, timeout, retries, flags, rq_flags, resid) \
({ \
BUILD_BUG_ON((sense) != NULL && \
sizeof(sense) != SCSI_SENSE_BUFFERSIZE); \
__scsi_execute(sdev, cmd, data_direction, buffer, bufflen, \
sense, sshdr, timeout, retries, flags, rq_flags, \
resid); \
})
static inline int scsi_execute_req(struct scsi_device *sdev,
const unsigned char *cmd, int data_direction, void *buffer,
unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout,
int retries, int *resid)
{
return scsi_execute(sdev, cmd, data_direction, buffer,
bufflen, NULL, sshdr, timeout, retries, 0, 0, resid);
}
extern void sdev_disable_disk_events(struct scsi_device *sdev);
extern void sdev_enable_disk_events(struct scsi_device *sdev);
extern int scsi_vpd_lun_id(struct scsi_device *, char *, size_t);
extern int scsi_vpd_tpg_id(struct scsi_device *, int *);
#ifdef CONFIG_PM
extern int scsi_autopm_get_device(struct scsi_device *);
extern void scsi_autopm_put_device(struct scsi_device *);
#else
static inline int scsi_autopm_get_device(struct scsi_device *d) { return 0; }
static inline void scsi_autopm_put_device(struct scsi_device *d) {}
#endif /* CONFIG_PM */
static inline int __must_check scsi_device_reprobe(struct scsi_device *sdev)
{
return device_reprobe(&sdev->sdev_gendev);
}
static inline unsigned int sdev_channel(struct scsi_device *sdev)
{
return sdev->channel;
}
static inline unsigned int sdev_id(struct scsi_device *sdev)
{
return sdev->id;
}
#define scmd_id(scmd) sdev_id((scmd)->device)
#define scmd_channel(scmd) sdev_channel((scmd)->device)
/*
* checks for positions of the SCSI state machine
*/
static inline int scsi_device_online(struct scsi_device *sdev)
{
return (sdev->sdev_state != SDEV_OFFLINE && sdev->sdev_state != SDEV_TRANSPORT_OFFLINE &&
sdev->sdev_state != SDEV_DEL);
}
static inline int scsi_device_blocked(struct scsi_device *sdev)
{
return sdev->sdev_state == SDEV_BLOCK ||
sdev->sdev_state == SDEV_CREATED_BLOCK;
}
static inline int scsi_device_created(struct scsi_device *sdev)
{
return sdev->sdev_state == SDEV_CREATED ||
sdev->sdev_state == SDEV_CREATED_BLOCK;
}
int scsi_internal_device_block_nowait(struct scsi_device *sdev);
int scsi_internal_device_unblock_nowait(struct scsi_device *sdev,
enum scsi_device_state new_state);
/* accessor functions for the SCSI parameters */
static inline int scsi_device_sync(struct scsi_device *sdev)
{
return sdev->sdtr;
}
static inline int scsi_device_wide(struct scsi_device *sdev)
{
return sdev->wdtr;
}
static inline int scsi_device_dt(struct scsi_device *sdev)
{
return sdev->ppr;
}
static inline int scsi_device_dt_only(struct scsi_device *sdev)
{
if (sdev->inquiry_len < 57)
return 0;
return (sdev->inquiry[56] & 0x0c) == 0x04;
}
static inline int scsi_device_ius(struct scsi_device *sdev)
{
if (sdev->inquiry_len < 57)
return 0;
return sdev->inquiry[56] & 0x01;
}
static inline int scsi_device_qas(struct scsi_device *sdev)
{
if (sdev->inquiry_len < 57)
return 0;
return sdev->inquiry[56] & 0x02;
}
static inline int scsi_device_enclosure(struct scsi_device *sdev)
{
return sdev->inquiry ? (sdev->inquiry[6] & (1<<6)) : 1;
}
static inline int scsi_device_protection(struct scsi_device *sdev)
{
if (sdev->no_dif)
return 0;
return sdev->scsi_level > SCSI_2 && sdev->inquiry[5] & (1<<0);
}
static inline int scsi_device_tpgs(struct scsi_device *sdev)
{
return sdev->inquiry ? (sdev->inquiry[5] >> 4) & 0x3 : 0;
}
/**
* scsi_device_supports_vpd - test if a device supports VPD pages
* @sdev: the &struct scsi_device to test
*
* If the 'try_vpd_pages' flag is set it takes precedence.
* Otherwise we will assume VPD pages are supported if the
* SCSI level is at least SPC-3 and 'skip_vpd_pages' is not set.
*/
static inline int scsi_device_supports_vpd(struct scsi_device *sdev)
{
/* Attempt VPD inquiry if the device blacklist explicitly calls
* for it.
*/
if (sdev->try_vpd_pages)
return 1;
/*
* Although VPD inquiries can go to SCSI-2 type devices,
* some USB ones crash on receiving them, and the pages
* we currently ask for are mandatory for SPC-2 and beyond
*/
if (sdev->scsi_level >= SCSI_SPC_2 && !sdev->skip_vpd_pages)
return 1;
return 0;
}
static inline int scsi_device_busy(struct scsi_device *sdev)
{
return sbitmap_weight(&sdev->budget_map);
}
#define MODULE_ALIAS_SCSI_DEVICE(type) \
MODULE_ALIAS("scsi:t-" __stringify(type) "*")
#define SCSI_DEVICE_MODALIAS_FMT "scsi:t-0x%02x"
#endif /* _SCSI_SCSI_DEVICE_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGTABLE_H
#define _LINUX_PGTABLE_H
#include <linux/pfn.h>
#include <asm/pgtable.h>
#ifndef __ASSEMBLY__
#ifdef CONFIG_MMU
#include <linux/mm_types.h>
#include <linux/bug.h>
#include <linux/errno.h>
#include <asm-generic/pgtable_uffd.h>
#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
#endif
/*
* On almost all architectures and configurations, 0 can be used as the
* upper ceiling to free_pgtables(): on many architectures it has the same
* effect as using TASK_SIZE. However, there is one configuration which
* must impose a more careful limit, to avoid freeing kernel pgtables.
*/
#ifndef USER_PGTABLES_CEILING
#define USER_PGTABLES_CEILING 0UL
#endif
/*
* This defines the first usable user address. Platforms
* can override its value with custom FIRST_USER_ADDRESS
* defined in their respective <asm/pgtable.h>.
*/
#ifndef FIRST_USER_ADDRESS
#define FIRST_USER_ADDRESS 0UL
#endif
/*
* This defines the generic helper for accessing PMD page
* table page. Although platforms can still override this
* via their respective <asm/pgtable.h>.
*/
#ifndef pmd_pgtable
#define pmd_pgtable(pmd) pmd_page(pmd)
#endif
/*
* A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD]
*
* The pXx_index() functions return the index of the entry in the page
* table page which would control the given virtual address
*
* As these functions may be used by the same code for different levels of
* the page table folding, they are always available, regardless of
* CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0
* because in such cases PTRS_PER_PxD equals 1.
*/
static inline unsigned long pte_index(unsigned long address)
{
return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}
#define pte_index pte_index
#ifndef pmd_index
static inline unsigned long pmd_index(unsigned long address)
{
return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
}
#define pmd_index pmd_index
#endif
#ifndef pud_index
static inline unsigned long pud_index(unsigned long address)
{
return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
}
#define pud_index pud_index
#endif
#ifndef pgd_index
/* Must be a compile-time constant, so implement it as a macro */
#define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#endif
#ifndef pte_offset_kernel
static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
{
return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
}
#define pte_offset_kernel pte_offset_kernel
#endif
#if defined(CONFIG_HIGHPTE)
#define pte_offset_map(dir, address) \
((pte_t *)kmap_atomic(pmd_page(*(dir))) + \
pte_index((address)))
#define pte_unmap(pte) kunmap_atomic((pte))
#else
#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
#define pte_unmap(pte) ((void)(pte)) /* NOP */
#endif
/* Find an entry in the second-level page table.. */
#ifndef pmd_offset
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
{
return pud_pgtable(*pud) + pmd_index(address);
}
#define pmd_offset pmd_offset
#endif
#ifndef pud_offset
static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
return p4d_pgtable(*p4d) + pud_index(address);
}
#define pud_offset pud_offset
#endif
static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
{
return (pgd + pgd_index(address));
};
/*
* a shortcut to get a pgd_t in a given mm
*/
#ifndef pgd_offset
#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
#endif
/*
* a shortcut which implies the use of the kernel's pgd, instead
* of a process's
*/
#ifndef pgd_offset_k
#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
#endif
/*
* In many cases it is known that a virtual address is mapped at PMD or PTE
* level, so instead of traversing all the page table levels, we can get a
* pointer to the PMD entry in user or kernel page table or translate a virtual
* address to the pointer in the PTE in the kernel page tables with simple
* helpers.
*/
static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va)
{
return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va);
}
static inline pmd_t *pmd_off_k(unsigned long va)
{
return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
}
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
pmd_t *pmd = pmd_off_k(vaddr);
return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr);
}
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty);
#endif
#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
pud_t entry, int dirty);
#else
static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
pmd_t entry, int dirty)
{
BUILD_BUG();
return 0;
}
static inline int pudp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
pud_t entry, int dirty)
{
BUILD_BUG();
return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pte_t *ptep)
{
pte_t pte = *ptep;
int r = 1;
if (!pte_young(pte))
r = 0;
else
set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte));
return r;
}
#endif
#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
{
pmd_t pmd = *pmdp;
int r = 1;
if (!pmd_young(pmd))
r = 0;
else
set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd));
return r;
}
#else
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
{
BUILD_BUG();
return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep);
#endif
#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
#else
/*
* Despite relevant to THP only, this API is called from generic rmap code
* under PageTransHuge(), hence needs a dummy implementation for !THP
*/
static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
BUILD_BUG();
return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address,
pte_t *ptep)
{
pte_t pte = *ptep;
pte_clear(mm, address, ptep);
return pte;
}
#endif
#ifndef __HAVE_ARCH_PTEP_GET
static inline pte_t ptep_get(pte_t *ptep)
{
return READ_ONCE(*ptep);
}
#endif
#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
/*
* WARNING: only to be used in the get_user_pages_fast() implementation.
*
* With get_user_pages_fast(), we walk down the pagetables without taking any
* locks. For this we would like to load the pointers atomically, but sometimes
* that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
* we do have is the guarantee that a PTE will only either go from not present
* to present, or present to not present or both -- it will not switch to a
* completely different present page without a TLB flush in between; something
* that we are blocking by holding interrupts off.
*
* Setting ptes from not present to present goes:
*
* ptep->pte_high = h;
* smp_wmb();
* ptep->pte_low = l;
*
* And present to not present goes:
*
* ptep->pte_low = 0;
* smp_wmb();
* ptep->pte_high = 0;
*
* We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
* We load pte_high *after* loading pte_low, which ensures we don't see an older
* value of pte_high. *Then* we recheck pte_low, which ensures that we haven't
* picked up a changed pte high. We might have gotten rubbish values from
* pte_low and pte_high, but we are guaranteed that pte_low will not have the
* present bit set *unless* it is 'l'. Because get_user_pages_fast() only
* operates on present ptes we're safe.
*/
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
pte_t pte;
do {
pte.pte_low = ptep->pte_low;
smp_rmb();
pte.pte_high = ptep->pte_high;
smp_rmb();
} while (unlikely(pte.pte_low != ptep->pte_low));
return pte;
}
#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
/*
* We require that the PTE can be read atomically.
*/
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
return ptep_get(ptep);
}
#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
unsigned long address,
pmd_t *pmdp)
{
pmd_t pmd = *pmdp;
pmd_clear(pmdp);
return pmd;
}
#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
unsigned long address,
pud_t *pudp)
{
pud_t pud = *pudp;
pud_clear(pudp);
return pud;
}
#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
int full)
{
return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
}
#endif
#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm,
unsigned long address, pud_t *pudp,
int full)
{
return pudp_huge_get_and_clear(mm, address, pudp);
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
unsigned long address, pte_t *ptep,
int full)
{
pte_t pte;
pte = ptep_get_and_clear(mm, address, ptep);
return pte;
}
#endif
/*
* If two threads concurrently fault at the same page, the thread that
* won the race updates the PTE and its local TLB/Cache. The other thread
* gives up, simply does nothing, and continues; on architectures where
* software can update TLB, local TLB can be updated here to avoid next page
* fault. This function updates TLB only, do nothing with cache or others.
* It is the difference with function update_mmu_cache.
*/
#ifndef __HAVE_ARCH_UPDATE_MMU_TLB
static inline void update_mmu_tlb(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
}
#define __HAVE_ARCH_UPDATE_MMU_TLB
#endif
/*
* Some architectures may be able to avoid expensive synchronization
* primitives when modifications are made to PTE's which are already
* not present, or in the process of an address space destruction.
*/
#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
static inline void pte_clear_not_present_full(struct mm_struct *mm,
unsigned long address,
pte_t *ptep,
int full)
{
pte_clear(mm, address, ptep);
}
#endif
#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
unsigned long address,
pte_t *ptep);
#endif
#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp);
extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
unsigned long address,
pud_t *pudp);
#endif
#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
struct mm_struct;
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
pte_t old_pte = *ptep;
set_pte_at(mm, address, ptep, pte_wrprotect(old_pte));
}
#endif
/*
* On some architectures hardware does not set page access bit when accessing
* memory page, it is responsibility of software setting this bit. It brings
* out extra page fault penalty to track page access bit. For optimization page
* access bit can be set during all page fault flow on these arches.
* To be differentiate with macro pte_mkyoung, this macro is used on platforms
* where software maintains page access bit.
*/
#ifndef pte_sw_mkyoung
static inline pte_t pte_sw_mkyoung(pte_t pte)
{
return pte;
}
#define pte_sw_mkyoung pte_sw_mkyoung
#endif
#ifndef pte_savedwrite
#define pte_savedwrite pte_write
#endif
#ifndef pte_mk_savedwrite
#define pte_mk_savedwrite pte_mkwrite
#endif
#ifndef pte_clear_savedwrite
#define pte_clear_savedwrite pte_wrprotect
#endif
#ifndef pmd_savedwrite
#define pmd_savedwrite pmd_write
#endif
#ifndef pmd_mk_savedwrite
#define pmd_mk_savedwrite pmd_mkwrite
#endif
#ifndef pmd_clear_savedwrite
#define pmd_clear_savedwrite pmd_wrprotect
#endif
#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long address, pmd_t *pmdp)
{
pmd_t old_pmd = *pmdp;
set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd));
}
#else
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long address, pmd_t *pmdp)
{
BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static inline void pudp_set_wrprotect(struct mm_struct *mm,
unsigned long address, pud_t *pudp)
{
pud_t old_pud = *pudp;
set_pud_at(mm, address, pudp, pud_wrprotect(old_pud));
}
#else
static inline void pudp_set_wrprotect(struct mm_struct *mm,
unsigned long address, pud_t *pudp)
{
BUILD_BUG();
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#endif
#ifndef pmdp_collapse_flush
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
#else
static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
{
BUILD_BUG();
return *pmdp;
}
#define pmdp_collapse_flush pmdp_collapse_flush
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable);
#endif
#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* This is an implementation of pmdp_establish() that is only suitable for an
* architecture that doesn't have hardware dirty/accessed bits. In this case we
* can't race with CPU which sets these bits and non-atomic approach is fine.
*/
static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
pmd_t old_pmd = *pmdp;
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
return old_pmd;
}
#endif
#ifndef __HAVE_ARCH_PMDP_INVALIDATE
extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp);
#endif
#ifndef __HAVE_ARCH_PTE_SAME
static inline int pte_same(pte_t pte_a, pte_t pte_b)
{
return pte_val(pte_a) == pte_val(pte_b);
}
#endif
#ifndef __HAVE_ARCH_PTE_UNUSED
/*
* Some architectures provide facilities to virtualization guests
* so that they can flag allocated pages as unused. This allows the
* host to transparently reclaim unused pages. This function returns
* whether the pte's page is unused.
*/
static inline int pte_unused(pte_t pte)
{
return 0;
}
#endif
#ifndef pte_access_permitted
#define pte_access_permitted(pte, write) \
(pte_present(pte) && (!(write) || pte_write(pte)))
#endif
#ifndef pmd_access_permitted
#define pmd_access_permitted(pmd, write) \
(pmd_present(pmd) && (!(write) || pmd_write(pmd)))
#endif
#ifndef pud_access_permitted
#define pud_access_permitted(pud, write) \
(pud_present(pud) && (!(write) || pud_write(pud)))
#endif
#ifndef p4d_access_permitted
#define p4d_access_permitted(p4d, write) \
(p4d_present(p4d) && (!(write) || p4d_write(p4d)))
#endif
#ifndef pgd_access_permitted
#define pgd_access_permitted(pgd, write) \
(pgd_present(pgd) && (!(write) || pgd_write(pgd)))
#endif
#ifndef __HAVE_ARCH_PMD_SAME
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
return pmd_val(pmd_a) == pmd_val(pmd_b);
}
static inline int pud_same(pud_t pud_a, pud_t pud_b)
{
return pud_val(pud_a) == pud_val(pud_b);
}
#endif
#ifndef __HAVE_ARCH_P4D_SAME
static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b)
{
return p4d_val(p4d_a) == p4d_val(p4d_b);
}
#endif
#ifndef __HAVE_ARCH_PGD_SAME
static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
{
return pgd_val(pgd_a) == pgd_val(pgd_b);
}
#endif
/*
* Use set_p*_safe(), and elide TLB flushing, when confident that *no*
* TLB flush will be required as a result of the "set". For example, use
* in scenarios where it is known ahead of time that the routine is
* setting non-present entries, or re-setting an existing entry to the
* same value. Otherwise, use the typical "set" helpers and flush the
* TLB.
*/
#define set_pte_safe(ptep, pte) \
({ \
WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \
set_pte(ptep, pte); \
})
#define set_pmd_safe(pmdp, pmd) \
({ \
WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \
set_pmd(pmdp, pmd); \
})
#define set_pud_safe(pudp, pud) \
({ \
WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \
set_pud(pudp, pud); \
})
#define set_p4d_safe(p4dp, p4d) \
({ \
WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \
set_p4d(p4dp, p4d); \
})
#define set_pgd_safe(pgdp, pgd) \
({ \
WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
set_pgd(pgdp, pgd); \
})
#ifndef __HAVE_ARCH_DO_SWAP_PAGE
/*
* Some architectures support metadata associated with a page. When a
* page is being swapped out, this metadata must be saved so it can be
* restored when the page is swapped back in. SPARC M7 and newer
* processors support an ADI (Application Data Integrity) tag for the
* page as metadata for the page. arch_do_swap_page() can restore this
* metadata when a page is swapped back in.
*/
static inline void arch_do_swap_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long addr,
pte_t pte, pte_t oldpte)
{
}
#endif
#ifndef __HAVE_ARCH_UNMAP_ONE
/*
* Some architectures support metadata associated with a page. When a
* page is being swapped out, this metadata must be saved so it can be
* restored when the page is swapped back in. SPARC M7 and newer
* processors support an ADI (Application Data Integrity) tag for the
* page as metadata for the page. arch_unmap_one() can save this
* metadata on a swap-out of a page.
*/
static inline int arch_unmap_one(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long addr,
pte_t orig_pte)
{
return 0;
}
#endif
/*
* Allow architectures to preserve additional metadata associated with
* swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function
* prototypes must be defined in the arch-specific asm/pgtable.h file.
*/
#ifndef __HAVE_ARCH_PREPARE_TO_SWAP
static inline int arch_prepare_to_swap(struct page *page)
{
return 0;
}
#endif
#ifndef __HAVE_ARCH_SWAP_INVALIDATE
static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
{
}
static inline void arch_swap_invalidate_area(int type)
{
}
#endif
#ifndef __HAVE_ARCH_SWAP_RESTORE
static inline void arch_swap_restore(swp_entry_t entry, struct page *page)
{
}
#endif
#ifndef __HAVE_ARCH_PGD_OFFSET_GATE
#define pgd_offset_gate(mm, addr) pgd_offset(mm, addr)
#endif
#ifndef __HAVE_ARCH_MOVE_PTE
#define move_pte(pte, prot, old_addr, new_addr) (pte)
#endif
#ifndef pte_accessible
# define pte_accessible(mm, pte) ((void)(pte), 1)
#endif
#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
#endif
/*
* When walking page tables, get the address of the next boundary,
* or the end address of the range if that comes earlier. Although no
* vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
*/
#define pgd_addr_end(addr, end) \
({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \
(__boundary - 1 < (end) - 1)? __boundary: (end); \
})
#ifndef p4d_addr_end
#define p4d_addr_end(addr, end) \
({ unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK; \
(__boundary - 1 < (end) - 1)? __boundary: (end); \
})
#endif
#ifndef pud_addr_end
#define pud_addr_end(addr, end) \
({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \
(__boundary - 1 < (end) - 1)? __boundary: (end); \
})
#endif
#ifndef pmd_addr_end
#define pmd_addr_end(addr, end) \
({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \
(__boundary - 1 < (end) - 1)? __boundary: (end); \
})
#endif
/*
* When walking page tables, we usually want to skip any p?d_none entries;
* and any p?d_bad entries - reporting the error before resetting to none.
* Do the tests inline, but report and clear the bad entry in mm/memory.c.
*/
void pgd_clear_bad(pgd_t *);
#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *);
#else
#define p4d_clear_bad(p4d) do { } while (0)
#endif
#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *);
#else
#define pud_clear_bad(p4d) do { } while (0)
#endif
void pmd_clear_bad(pmd_t *);
static inline int pgd_none_or_clear_bad(pgd_t *pgd)
{
if (pgd_none(*pgd))
return 1;
if (unlikely(pgd_bad(*pgd))) {
pgd_clear_bad(pgd);
return 1;
}
return 0;
}
static inline int p4d_none_or_clear_bad(p4d_t *p4d)
{
if (p4d_none(*p4d))
return 1;
if (unlikely(p4d_bad(*p4d))) {
p4d_clear_bad(p4d);
return 1;
}
return 0;
}
static inline int pud_none_or_clear_bad(pud_t *pud)
{
if (pud_none(*pud))
return 1;
if (unlikely(pud_bad(*pud))) {
pud_clear_bad(pud);
return 1;
}
return 0;
}
static inline int pmd_none_or_clear_bad(pmd_t *pmd)
{
if (pmd_none(*pmd))
return 1;
if (unlikely(pmd_bad(*pmd))) {
pmd_clear_bad(pmd);
return 1;
}
return 0;
}
static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep)
{
/*
* Get the current pte state, but zero it out to make it
* non-present, preventing the hardware from asynchronously
* updating it.
*/
return ptep_get_and_clear(vma->vm_mm, addr, ptep);
}
static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep, pte_t pte)
{
/*
* The pte is non-present, so there's no hardware state to
* preserve.
*/
set_pte_at(vma->vm_mm, addr, ptep, pte);
}
#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
/*
* Start a pte protection read-modify-write transaction, which
* protects against asynchronous hardware modifications to the pte.
* The intention is not to prevent the hardware from making pte
* updates, but to prevent any updates it may make from being lost.
*
* This does not protect against other software modifications of the
* pte; the appropriate pte lock must be held over the transaction.
*
* Note that this interface is intended to be batchable, meaning that
* ptep_modify_prot_commit may not actually update the pte, but merely
* queue the update to be done at some later time. The update must be
* actually committed before the pte lock is released, however.
*/
static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep)
{
return __ptep_modify_prot_start(vma, addr, ptep);
}
/*
* Commit an update to a pte, leaving any hardware-controlled bits in
* the PTE unmodified.
*/
static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep, pte_t old_pte, pte_t pte)
{
__ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
#endif /* CONFIG_MMU */
/*
* No-op macros that just return the current protection value. Defined here
* because these macros can be used even if CONFIG_MMU is not defined.
*/
#ifndef pgprot_nx
#define pgprot_nx(prot) (prot)
#endif
#ifndef pgprot_noncached
#define pgprot_noncached(prot) (prot)
#endif
#ifndef pgprot_writecombine
#define pgprot_writecombine pgprot_noncached
#endif
#ifndef pgprot_writethrough
#define pgprot_writethrough pgprot_noncached
#endif
#ifndef pgprot_device
#define pgprot_device pgprot_noncached
#endif
#ifndef pgprot_mhp
#define pgprot_mhp(prot) (prot)
#endif
#ifdef CONFIG_MMU
#ifndef pgprot_modify
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot)))
newprot = pgprot_noncached(newprot);
if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot)))
newprot = pgprot_writecombine(newprot);
if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot)))
newprot = pgprot_device(newprot);
return newprot;
}
#endif
#endif /* CONFIG_MMU */
#ifndef pgprot_encrypted
#define pgprot_encrypted(prot) (prot)
#endif
#ifndef pgprot_decrypted
#define pgprot_decrypted(prot) (prot)
#endif
/*
* A facility to provide lazy MMU batching. This allows PTE updates and
* page invalidations to be delayed until a call to leave lazy MMU mode
* is issued. Some architectures may benefit from doing this, and it is
* beneficial for both shadow and direct mode hypervisors, which may batch
* the PTE updates which happen during this window. Note that using this
* interface requires that read hazards be removed from the code. A read
* hazard could result in the direct mode hypervisor case, since the actual
* write to the page tables may not yet have taken place, so reads though
* a raw PTE pointer after it has been modified are not guaranteed to be
* up to date. This mode can only be entered and left under the protection of
* the page table locks for all page tables which may be modified. In the UP
* case, this is required so that preemption is disabled, and in the SMP case,
* it must synchronize the delayed page table writes properly on other CPUs.
*/
#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
#define arch_enter_lazy_mmu_mode() do {} while (0)
#define arch_leave_lazy_mmu_mode() do {} while (0)
#define arch_flush_lazy_mmu_mode() do {} while (0)
#endif
/*
* A facility to provide batching of the reload of page tables and
* other process state with the actual context switch code for
* paravirtualized guests. By convention, only one of the batched
* update (lazy) modes (CPU, MMU) should be active at any given time,
* entry should never be nested, and entry and exits should always be
* paired. This is for sanity of maintaining and reasoning about the
* kernel code. In this case, the exit (end of the context switch) is
* in architecture-specific code, and so doesn't need a generic
* definition.
*/
#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
#define arch_start_context_switch(prev) do {} while (0)
#endif
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
return pmd;
}
static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
return 0;
}
static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
return pmd;
}
#endif
#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
static inline int pte_soft_dirty(pte_t pte)
{
return 0;
}
static inline int pmd_soft_dirty(pmd_t pmd)
{
return 0;
}
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
return pte;
}
static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
return pmd;
}
static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
return pte;
}
static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
return pmd;
}
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
return pte;
}
static inline int pte_swp_soft_dirty(pte_t pte)
{
return 0;
}
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
return pte;
}
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
return pmd;
}
static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
return 0;
}
static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
return pmd;
}
#endif
#ifndef __HAVE_PFNMAP_TRACKING
/*
* Interfaces that can be used by architecture code to keep track of
* memory type of pfn mappings specified by the remap_pfn_range,
* vmf_insert_pfn.
*/
/*
* track_pfn_remap is called when a _new_ pfn mapping is being established
* by remap_pfn_range() for physical range indicated by pfn and size.
*/
static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn, unsigned long addr,
unsigned long size)
{
return 0;
}
/*
* track_pfn_insert is called when a _new_ single pfn is established
* by vmf_insert_pfn().
*/
static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
pfn_t pfn)
{
}
/*
* track_pfn_copy is called when vma that is covering the pfnmap gets
* copied through copy_page_range().
*/
static inline int track_pfn_copy(struct vm_area_struct *vma)
{
return 0;
}
/*
* untrack_pfn is called while unmapping a pfnmap for a region.
* untrack can be called for a specific region indicated by pfn and size or
* can be for the entire vma (in which case pfn, size are zero).
*/
static inline void untrack_pfn(struct vm_area_struct *vma,
unsigned long pfn, unsigned long size)
{
}
/*
* untrack_pfn_moved is called while mremapping a pfnmap for a new region.
*/
static inline void untrack_pfn_moved(struct vm_area_struct *vma)
{
}
#else
extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn, unsigned long addr,
unsigned long size);
extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
pfn_t pfn);
extern int track_pfn_copy(struct vm_area_struct *vma);
extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size);
extern void untrack_pfn_moved(struct vm_area_struct *vma);
#endif
#ifdef CONFIG_MMU
#ifdef __HAVE_COLOR_ZERO_PAGE
static inline int is_zero_pfn(unsigned long pfn)
{
extern unsigned long zero_pfn;
unsigned long offset_from_zero_pfn = pfn - zero_pfn;
return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
}
#define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
#else
static inline int is_zero_pfn(unsigned long pfn)
{
extern unsigned long zero_pfn;
return pfn == zero_pfn;
}
static inline unsigned long my_zero_pfn(unsigned long addr)
{
extern unsigned long zero_pfn;
return zero_pfn;
}
#endif
#else
static inline int is_zero_pfn(unsigned long pfn)
{
return 0;
}
static inline unsigned long my_zero_pfn(unsigned long addr)
{
return 0;
}
#endif /* CONFIG_MMU */
#ifdef CONFIG_MMU
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
static inline int pmd_trans_huge(pmd_t pmd)
{
return 0;
}
#ifndef pmd_write
static inline int pmd_write(pmd_t pmd)
{
BUG();
return 0;
}
#endif /* pmd_write */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifndef pud_write
static inline int pud_write(pud_t pud)
{
BUG();
return 0;
}
#endif /* pud_write */
#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline int pmd_devmap(pmd_t pmd)
{
return 0;
}
static inline int pud_devmap(pud_t pud)
{
return 0;
}
static inline int pgd_devmap(pgd_t pgd)
{
return 0;
}
#endif
#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
(defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
!defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
static inline int pud_trans_huge(pud_t pud)
{
return 0;
}
#endif
/* See pmd_none_or_trans_huge_or_clear_bad for discussion. */
static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud)
{
pud_t pudval = READ_ONCE(*pud);
if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval))
return 1;
if (unlikely(pud_bad(pudval))) {
pud_clear_bad(pud);
return 1;
}
return 0;
}
/* See pmd_trans_unstable for discussion. */
static inline int pud_trans_unstable(pud_t *pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
return pud_none_or_trans_huge_or_dev_or_clear_bad(pud);
#else
return 0;
#endif
}
#ifndef pmd_read_atomic
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
/*
* Depend on compiler for an atomic pmd read. NOTE: this is
* only going to work, if the pmdval_t isn't larger than
* an unsigned long.
*/
return *pmdp;
}
#endif
#ifndef arch_needs_pgtable_deposit
#define arch_needs_pgtable_deposit() (false)
#endif
/*
* This function is meant to be used by sites walking pagetables with
* the mmap_lock held in read mode to protect against MADV_DONTNEED and
* transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
* into a null pmd and the transhuge page fault can convert a null pmd
* into an hugepmd or into a regular pmd (if the hugepage allocation
* fails). While holding the mmap_lock in read mode the pmd becomes
* stable and stops changing under us only if it's not null and not a
* transhuge pmd. When those races occurs and this function makes a
* difference vs the standard pmd_none_or_clear_bad, the result is
* undefined so behaving like if the pmd was none is safe (because it
* can return none anyway). The compiler level barrier() is critically
* important to compute the two checks atomically on the same pmdval.
*
* For 32bit kernels with a 64bit large pmd_t this automatically takes
* care of reading the pmd atomically to avoid SMP race conditions
* against pmd_populate() when the mmap_lock is hold for reading by the
* caller (a special atomic read not done by "gcc" as in the generic
* version above, is also needed when THP is disabled because the page
* fault can populate the pmd from under us).
*/
static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
{
pmd_t pmdval = pmd_read_atomic(pmd);
/*
* The barrier will stabilize the pmdval in a register or on
* the stack so that it will stop changing under the code.
*
* When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE,
* pmd_read_atomic is allowed to return a not atomic pmdval
* (for example pointing to an hugepage that has never been
* mapped in the pmd). The below checks will only care about
* the low part of the pmd with 32bit PAE x86 anyway, with the
* exception of pmd_none(). So the important thing is that if
* the low part of the pmd is found null, the high part will
* be also null or the pmd_none() check below would be
* confused.
*/
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
barrier();
#endif
/*
* !pmd_present() checks for pmd migration entries
*
* The complete check uses is_pmd_migration_entry() in linux/swapops.h
* But using that requires moving current function and pmd_trans_unstable()
* to linux/swapops.h to resolve dependency, which is too much code move.
*
* !pmd_present() is equivalent to is_pmd_migration_entry() currently,
* because !pmd_present() pages can only be under migration not swapped
* out.
*
* pmd_none() is preserved for future condition checks on pmd migration
* entries and not confusing with this function name, although it is
* redundant with !pmd_present().
*/
if (pmd_none(pmdval) || pmd_trans_huge(pmdval) ||
(IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval)))
return 1;
if (unlikely(pmd_bad(pmdval))) {
pmd_clear_bad(pmd);
return 1;
}
return 0;
}
/*
* This is a noop if Transparent Hugepage Support is not built into
* the kernel. Otherwise it is equivalent to
* pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
* places that already verified the pmd is not none and they want to
* walk ptes while holding the mmap sem in read mode (write mode don't
* need this). If THP is not enabled, the pmd can't go away under the
* code even if MADV_DONTNEED runs, but if THP is enabled we need to
* run a pmd_trans_unstable before walking the ptes after
* split_huge_pmd returns (because it may have run when the pmd become
* null, but then a page fault can map in a THP and not a regular page).
*/
static inline int pmd_trans_unstable(pmd_t *pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
return pmd_none_or_trans_huge_or_clear_bad(pmd);
#else
return 0;
#endif
}
/*
* the ordering of these checks is important for pmds with _page_devmap set.
* if we check pmd_trans_unstable() first we will trip the bad_pmd() check
* inside of pmd_none_or_trans_huge_or_clear_bad(). this will end up correctly
* returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
*/
static inline int pmd_devmap_trans_unstable(pmd_t *pmd)
{
return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
}
#ifndef CONFIG_NUMA_BALANCING
/*
* Technically a PTE can be PROTNONE even when not doing NUMA balancing but
* the only case the kernel cares is for NUMA balancing and is only ever set
* when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked
* _PAGE_PROTNONE so by default, implement the helper as "always no". It
* is the responsibility of the caller to distinguish between PROT_NONE
* protections and NUMA hinting fault protections.
*/
static inline int pte_protnone(pte_t pte)
{
return 0;
}
static inline int pmd_protnone(pmd_t pmd)
{
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_MMU */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#ifndef __PAGETABLE_P4D_FOLDED
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot);
int p4d_clear_huge(p4d_t *p4d);
#else
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
return 0;
}
static inline int p4d_clear_huge(p4d_t *p4d)
{
return 0;
}
#endif /* !__PAGETABLE_P4D_FOLDED */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pud_clear_huge(pud_t *pud);
int pmd_clear_huge(pmd_t *pmd);
int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
int pud_free_pmd_page(pud_t *pud, unsigned long addr);
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
return 0;
}
static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
return 0;
}
static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
return 0;
}
static inline int p4d_clear_huge(p4d_t *p4d)
{
return 0;
}
static inline int pud_clear_huge(pud_t *pud)
{
return 0;
}
static inline int pmd_clear_huge(pmd_t *pmd)
{
return 0;
}
static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
return 0;
}
static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
return 0;
}
static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
return 0;
}
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* ARCHes with special requirements for evicting THP backing TLB entries can
* implement this. Otherwise also, it can help optimize normal TLB flush in
* THP regime. Stock flush_tlb_range() typically has optimization to nuke the
* entire TLB if flush span is greater than a threshold, which will
* likely be true for a single huge page. Thus a single THP flush will
* invalidate the entire TLB which is not desirable.
* e.g. see arch/arc: flush_pmd_tlb_range
*/
#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
#define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
#else
#define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG()
#define flush_pud_tlb_range(vma, addr, end) BUILD_BUG()
#endif
#endif
struct file;
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t *vma_prot);
#ifndef CONFIG_X86_ESPFIX64
static inline void init_espfix_bsp(void) { }
#endif
extern void __init pgtable_cache_init(void);
#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
return true;
}
static inline bool arch_has_pfn_modify_check(void)
{
return false;
}
#endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */
/*
* Architecture PAGE_KERNEL_* fallbacks
*
* Some architectures don't define certain PAGE_KERNEL_* flags. This is either
* because they really don't support them, or the port needs to be updated to
* reflect the required functionality. Below are a set of relatively safe
* fallbacks, as best effort, which we can count on in lieu of the architectures
* not defining them on their own yet.
*/
#ifndef PAGE_KERNEL_RO
# define PAGE_KERNEL_RO PAGE_KERNEL
#endif
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
/*
* Page Table Modification bits for pgtbl_mod_mask.
*
* These are used by the p?d_alloc_track*() set of functions an in the generic
* vmalloc/ioremap code to track at which page-table levels entries have been
* modified. Based on that the code can better decide when vmalloc and ioremap
* mapping changes need to be synchronized to other page-tables in the system.
*/
#define __PGTBL_PGD_MODIFIED 0
#define __PGTBL_P4D_MODIFIED 1
#define __PGTBL_PUD_MODIFIED 2
#define __PGTBL_PMD_MODIFIED 3
#define __PGTBL_PTE_MODIFIED 4
#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED)
#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED)
#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED)
#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED)
#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED)
/* Page-Table Modification Mask */
typedef unsigned int pgtbl_mod_mask;
#endif /* !__ASSEMBLY__ */
#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
#ifdef CONFIG_PHYS_ADDR_T_64BIT
/*
* ZSMALLOC needs to know the highest PFN on 32-bit architectures
* with physical address space extension, but falls back to
* BITS_PER_LONG otherwise.
*/
#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition
#else
#define MAX_POSSIBLE_PHYSMEM_BITS 32
#endif
#endif
#ifndef has_transparent_hugepage
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define has_transparent_hugepage() 1
#else
#define has_transparent_hugepage() 0
#endif
#endif
/*
* On some architectures it depends on the mm if the p4d/pud or pmd
* layer of the page table hierarchy is folded or not.
*/
#ifndef mm_p4d_folded
#define mm_p4d_folded(mm) __is_defined(__PAGETABLE_P4D_FOLDED)
#endif
#ifndef mm_pud_folded
#define mm_pud_folded(mm) __is_defined(__PAGETABLE_PUD_FOLDED)
#endif
#ifndef mm_pmd_folded
#define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED)
#endif
#ifndef p4d_offset_lockless
#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
#endif
#ifndef pud_offset_lockless
#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
#endif
#ifndef pmd_offset_lockless
#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
#endif
/*
* p?d_leaf() - true if this entry is a final mapping to a physical address.
* This differs from p?d_huge() by the fact that they are always available (if
* the architecture supports large pages at the appropriate level) even
* if CONFIG_HUGETLB_PAGE is not defined.
* Only meaningful when called on a valid entry.
*/
#ifndef pgd_leaf
#define pgd_leaf(x) 0
#endif
#ifndef p4d_leaf
#define p4d_leaf(x) 0
#endif
#ifndef pud_leaf
#define pud_leaf(x) 0
#endif
#ifndef pmd_leaf
#define pmd_leaf(x) 0
#endif
#ifndef pgd_leaf_size
#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT)
#endif
#ifndef p4d_leaf_size
#define p4d_leaf_size(x) P4D_SIZE
#endif
#ifndef pud_leaf_size
#define pud_leaf_size(x) PUD_SIZE
#endif
#ifndef pmd_leaf_size
#define pmd_leaf_size(x) PMD_SIZE
#endif
#ifndef pte_leaf_size
#define pte_leaf_size(x) PAGE_SIZE
#endif
/*
* Some architectures have MMUs that are configurable or selectable at boot
* time. These lead to variable PTRS_PER_x. For statically allocated arrays it
* helps to have a static maximum value.
*/
#ifndef MAX_PTRS_PER_PTE
#define MAX_PTRS_PER_PTE PTRS_PER_PTE
#endif
#ifndef MAX_PTRS_PER_PMD
#define MAX_PTRS_PER_PMD PTRS_PER_PMD
#endif
#ifndef MAX_PTRS_PER_PUD
#define MAX_PTRS_PER_PUD PTRS_PER_PUD
#endif
#ifndef MAX_PTRS_PER_P4D
#define MAX_PTRS_PER_P4D PTRS_PER_P4D
#endif
#endif /* _LINUX_PGTABLE_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_PREEMPT_H
#define __ASM_PREEMPT_H
#include <asm/rmwcc.h>
#include <asm/percpu.h>
#include <linux/thread_info.h>
#include <linux/static_call_types.h>
DECLARE_PER_CPU(int, __preempt_count);
/* We use the MSB mostly because its available */
#define PREEMPT_NEED_RESCHED 0x80000000
/*
* We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
* that a decrement hitting 0 means we can and should reschedule.
*/
#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED)
/*
* We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
* that think a non-zero value indicates we cannot preempt.
*/
static __always_inline int preempt_count(void)
{
return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
}
static __always_inline void preempt_count_set(int pc)
{
int old, new;
do {
old = raw_cpu_read_4(__preempt_count);
new = (old & PREEMPT_NEED_RESCHED) |
(pc & ~PREEMPT_NEED_RESCHED);
} while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old);
}
/*
* must be macros to avoid header recursion hell
*/
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)
/*
* We fold the NEED_RESCHED bit into the preempt count such that
* preempt_enable() can decrement and test for needing to reschedule with a
* single instruction.
*
* We invert the actual bit, so that when the decrement hits 0 we know we both
* need to resched (the bit is cleared) and can resched (no preempt count).
*/
static __always_inline void set_preempt_need_resched(void)
{
raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
}
static __always_inline void clear_preempt_need_resched(void)
{
raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
}
static __always_inline bool test_preempt_need_resched(void)
{
return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
}
/*
* The various preempt_count add/sub methods
*/
static __always_inline void __preempt_count_add(int val)
{
raw_cpu_add_4(__preempt_count, val);
}
static __always_inline void __preempt_count_sub(int val)
{
raw_cpu_add_4(__preempt_count, -val);
}
/*
* Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
* a decrement which hits zero means we have no preempt_count and should
* reschedule.
*/
static __always_inline bool __preempt_count_dec_and_test(void)
{
return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var]));
}
/*
* Returns true when we need to resched and can (barring IRQ state).
*/
static __always_inline bool should_resched(int preempt_offset)
{
return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
}
#ifdef CONFIG_PREEMPTION
extern asmlinkage void preempt_schedule(void);
extern asmlinkage void preempt_schedule_thunk(void);
#define __preempt_schedule_func preempt_schedule_thunk
extern asmlinkage void preempt_schedule_notrace(void);
extern asmlinkage void preempt_schedule_notrace_thunk(void);
#define __preempt_schedule_notrace_func preempt_schedule_notrace_thunk
#ifdef CONFIG_PREEMPT_DYNAMIC
DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
#define __preempt_schedule() \
do { \
__STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \
asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \
} while (0)
DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
#define __preempt_schedule_notrace() \
do { \
__STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \
asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \
} while (0)
#else /* PREEMPT_DYNAMIC */
#define __preempt_schedule() \
asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT);
#define __preempt_schedule_notrace() \
asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT);
#endif /* PREEMPT_DYNAMIC */
#endif /* PREEMPTION */
#endif /* __ASM_PREEMPT_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_H
#define _LINUX_RCULIST_H
#ifdef __KERNEL__
/*
* RCU-protected list version
*/
#include <linux/list.h>
#include <linux/rcupdate.h>
/*
* INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers
* @list: list to be initialized
*
* You should instead use INIT_LIST_HEAD() for normal initialization and
* cleanup tasks, when readers have no access to the list being initialized.
* However, if the list being initialized is visible to readers, you
* need to keep the compiler from being too mischievous.
*/
static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
{
WRITE_ONCE(list->next, list);
WRITE_ONCE(list->prev, list);
}
/*
* return the ->next pointer of a list_head in an rcu safe
* way, we must not access it directly
*/
#define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next)))
/**
* list_tail_rcu - returns the prev pointer of the head of the list
* @head: the head of the list
*
* Note: This should only be used with the list header, and even then
* only if list_del() and similar primitives are not also used on the
* list header.
*/
#define list_tail_rcu(head) (*((struct list_head __rcu **)(&(head)->prev)))
/*
* Check during list traversal that we are within an RCU reader
*/
#define check_arg_count_one(dummy)
#ifdef CONFIG_PROVE_RCU_LIST
#define __list_check_rcu(dummy, cond, extra...) \
({ \
check_arg_count_one(extra); \
RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(), \
"RCU-list traversed in non-reader section!"); \
})
#define __list_check_srcu(cond) \
({ \
RCU_LOCKDEP_WARN(!(cond), \
"RCU-list traversed without holding the required lock!");\
})
#else
#define __list_check_rcu(dummy, cond, extra...) \
({ check_arg_count_one(extra); })
#define __list_check_srcu(cond) ({ })
#endif
/*
* Insert a new entry between two known consecutive entries.
*
* This is only for internal list manipulation where we know
* the prev/next entries already!
*/
static inline void __list_add_rcu(struct list_head *new,
struct list_head *prev, struct list_head *next)
{
if (!__list_add_valid(new, prev, next))
return;
new->next = next;
new->prev = prev;
rcu_assign_pointer(list_next_rcu(prev), new);
next->prev = new;
}
/**
* list_add_rcu - add a new entry to rcu-protected list
* @new: new entry to be added
* @head: list head to add it after
*
* Insert a new entry after the specified head.
* This is good for implementing stacks.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as list_add_rcu()
* or list_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* list_for_each_entry_rcu().
*/
static inline void list_add_rcu(struct list_head *new, struct list_head *head)
{
__list_add_rcu(new, head, head->next);
}
/**
* list_add_tail_rcu - add a new entry to rcu-protected list
* @new: new entry to be added
* @head: list head to add it before
*
* Insert a new entry before the specified head.
* This is useful for implementing queues.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as list_add_tail_rcu()
* or list_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* list_for_each_entry_rcu().
*/
static inline void list_add_tail_rcu(struct list_head *new,
struct list_head *head)
{
__list_add_rcu(new, head->prev, head);
}
/**
* list_del_rcu - deletes entry from list without re-initialization
* @entry: the element to delete from the list.
*
* Note: list_empty() on entry does not return true after this,
* the entry is in an undefined state. It is useful for RCU based
* lockfree traversal.
*
* In particular, it means that we can not poison the forward
* pointers that may still be used for walking the list.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as list_del_rcu()
* or list_add_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* list_for_each_entry_rcu().
*
* Note that the caller is not permitted to immediately free
* the newly deleted entry. Instead, either synchronize_rcu()
* or call_rcu() must be used to defer freeing until an RCU
* grace period has elapsed.
*/
static inline void list_del_rcu(struct list_head *entry)
{
__list_del_entry(entry);
entry->prev = LIST_POISON2;
}
/**
* hlist_del_init_rcu - deletes entry from hash list with re-initialization
* @n: the element to delete from the hash list.
*
* Note: list_unhashed() on the node return true after this. It is
* useful for RCU based read lockfree traversal if the writer side
* must know if the list entry is still hashed or already unhashed.
*
* In particular, it means that we can not poison the forward pointers
* that may still be used for walking the hash list and we can only
* zero the pprev pointer so list_unhashed() will return true after
* this.
*
* The caller must take whatever precautions are necessary (such as
* holding appropriate locks) to avoid racing with another
* list-mutation primitive, such as hlist_add_head_rcu() or
* hlist_del_rcu(), running on this same list. However, it is
* perfectly legal to run concurrently with the _rcu list-traversal
* primitives, such as hlist_for_each_entry_rcu().
*/
static inline void hlist_del_init_rcu(struct hlist_node *n)
{
if (!hlist_unhashed(n)) {
__hlist_del(n); WRITE_ONCE(n->pprev, NULL);
}
}
/**
* list_replace_rcu - replace old entry by new one
* @old : the element to be replaced
* @new : the new element to insert
*
* The @old entry will be replaced with the @new entry atomically.
* Note: @old should not be empty.
*/
static inline void list_replace_rcu(struct list_head *old,
struct list_head *new)
{
new->next = old->next;
new->prev = old->prev;
rcu_assign_pointer(list_next_rcu(new->prev), new);
new->next->prev = new;
old->prev = LIST_POISON2;
}
/**
* __list_splice_init_rcu - join an RCU-protected list into an existing list.
* @list: the RCU-protected list to splice
* @prev: points to the last element of the existing list
* @next: points to the first element of the existing list
* @sync: synchronize_rcu, synchronize_rcu_expedited, ...
*
* The list pointed to by @prev and @next can be RCU-read traversed
* concurrently with this function.
*
* Note that this function blocks.
*
* Important note: the caller must take whatever action is necessary to prevent
* any other updates to the existing list. In principle, it is possible to
* modify the list as soon as sync() begins execution. If this sort of thing
* becomes necessary, an alternative version based on call_rcu() could be
* created. But only if -really- needed -- there is no shortage of RCU API
* members.
*/
static inline void __list_splice_init_rcu(struct list_head *list,
struct list_head *prev,
struct list_head *next,
void (*sync)(void))
{
struct list_head *first = list->next;
struct list_head *last = list->prev;
/*
* "first" and "last" tracking list, so initialize it. RCU readers
* have access to this list, so we must use INIT_LIST_HEAD_RCU()
* instead of INIT_LIST_HEAD().
*/
INIT_LIST_HEAD_RCU(list);
/*
* At this point, the list body still points to the source list.
* Wait for any readers to finish using the list before splicing
* the list body into the new list. Any new readers will see
* an empty list.
*/
sync();
ASSERT_EXCLUSIVE_ACCESS(*first);
ASSERT_EXCLUSIVE_ACCESS(*last);
/*
* Readers are finished with the source list, so perform splice.
* The order is important if the new list is global and accessible
* to concurrent RCU readers. Note that RCU readers are not
* permitted to traverse the prev pointers without excluding
* this function.
*/
last->next = next;
rcu_assign_pointer(list_next_rcu(prev), first);
first->prev = prev;
next->prev = last;
}
/**
* list_splice_init_rcu - splice an RCU-protected list into an existing list,
* designed for stacks.
* @list: the RCU-protected list to splice
* @head: the place in the existing list to splice the first list into
* @sync: synchronize_rcu, synchronize_rcu_expedited, ...
*/
static inline void list_splice_init_rcu(struct list_head *list,
struct list_head *head,
void (*sync)(void))
{
if (!list_empty(list))
__list_splice_init_rcu(list, head, head->next, sync);
}
/**
* list_splice_tail_init_rcu - splice an RCU-protected list into an existing
* list, designed for queues.
* @list: the RCU-protected list to splice
* @head: the place in the existing list to splice the first list into
* @sync: synchronize_rcu, synchronize_rcu_expedited, ...
*/
static inline void list_splice_tail_init_rcu(struct list_head *list,
struct list_head *head,
void (*sync)(void))
{
if (!list_empty(list))
__list_splice_init_rcu(list, head->prev, head, sync);
}
/**
* list_entry_rcu - get the struct for this entry
* @ptr: the &struct list_head pointer.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_head within the struct.
*
* This primitive may safely run concurrently with the _rcu list-mutation
* primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
*/
#define list_entry_rcu(ptr, type, member) \
container_of(READ_ONCE(ptr), type, member)
/*
* Where are list_empty_rcu() and list_first_entry_rcu()?
*
* They do not exist because they would lead to subtle race conditions:
*
* if (!list_empty_rcu(mylist)) {
* struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member);
* do_something(bar);
* }
*
* The list might be non-empty when list_empty_rcu() checks it, but it
* might have become empty by the time that list_first_entry_rcu() rereads
* the ->next pointer, which would result in a SEGV.
*
* When not using RCU, it is OK for list_first_entry() to re-read that
* pointer because both functions should be protected by some lock that
* blocks writers.
*
* When using RCU, list_empty() uses READ_ONCE() to fetch the
* RCU-protected ->next pointer and then compares it to the address of the
* list head. However, it neither dereferences this pointer nor provides
* this pointer to its caller. Thus, READ_ONCE() suffices (that is,
* rcu_dereference() is not needed), which means that list_empty() can be
* used anywhere you would want to use list_empty_rcu(). Just don't
* expect anything useful to happen if you do a subsequent lockless
* call to list_first_entry_rcu()!!!
*
* See list_first_or_null_rcu for an alternative.
*/
/**
* list_first_or_null_rcu - get the first element from a list
* @ptr: the list head to take the element from.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_head within the struct.
*
* Note that if the list is empty, it returns NULL.
*
* This primitive may safely run concurrently with the _rcu list-mutation
* primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
*/
#define list_first_or_null_rcu(ptr, type, member) \
({ \
struct list_head *__ptr = (ptr); \
struct list_head *__next = READ_ONCE(__ptr->next); \
likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
})
/**
* list_next_or_null_rcu - get the first element from a list
* @head: the head for the list.
* @ptr: the list head to take the next element from.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_head within the struct.
*
* Note that if the ptr is at the end of the list, NULL is returned.
*
* This primitive may safely run concurrently with the _rcu list-mutation
* primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
*/
#define list_next_or_null_rcu(head, ptr, type, member) \
({ \
struct list_head *__head = (head); \
struct list_head *__ptr = (ptr); \
struct list_head *__next = READ_ONCE(__ptr->next); \
likely(__next != __head) ? list_entry_rcu(__next, type, \
member) : NULL; \
})
/**
* list_for_each_entry_rcu - iterate over rcu list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the list_head within the struct.
* @cond: optional lockdep expression if called from non-RCU protection.
*
* This list-traversal primitive may safely run concurrently with
* the _rcu list-mutation primitives such as list_add_rcu()
* as long as the traversal is guarded by rcu_read_lock().
*/
#define list_for_each_entry_rcu(pos, head, member, cond...) \
for (__list_check_rcu(dummy, ## cond, 0), \
pos = list_entry_rcu((head)->next, typeof(*pos), member); \
&pos->member != (head); \
pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
/**
* list_for_each_entry_srcu - iterate over rcu list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the list_head within the struct.
* @cond: lockdep expression for the lock required to traverse the list.
*
* This list-traversal primitive may safely run concurrently with
* the _rcu list-mutation primitives such as list_add_rcu()
* as long as the traversal is guarded by srcu_read_lock().
* The lockdep expression srcu_read_lock_held() can be passed as the
* cond argument from read side.
*/
#define list_for_each_entry_srcu(pos, head, member, cond) \
for (__list_check_srcu(cond), \
pos = list_entry_rcu((head)->next, typeof(*pos), member); \
&pos->member != (head); \
pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
/**
* list_entry_lockless - get the struct for this entry
* @ptr: the &struct list_head pointer.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_head within the struct.
*
* This primitive may safely run concurrently with the _rcu
* list-mutation primitives such as list_add_rcu(), but requires some
* implicit RCU read-side guarding. One example is running within a special
* exception-time environment where preemption is disabled and where lockdep
* cannot be invoked. Another example is when items are added to the list,
* but never deleted.
*/
#define list_entry_lockless(ptr, type, member) \
container_of((typeof(ptr))READ_ONCE(ptr), type, member)
/**
* list_for_each_entry_lockless - iterate over rcu list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the list_struct within the struct.
*
* This primitive may safely run concurrently with the _rcu
* list-mutation primitives such as list_add_rcu(), but requires some
* implicit RCU read-side guarding. One example is running within a special
* exception-time environment where preemption is disabled and where lockdep
* cannot be invoked. Another example is when items are added to the list,
* but never deleted.
*/
#define list_for_each_entry_lockless(pos, head, member) \
for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \
&pos->member != (head); \
pos = list_entry_lockless(pos->member.next, typeof(*pos), member))
/**
* list_for_each_entry_continue_rcu - continue iteration over list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*
* Continue to iterate over list of given type, continuing after
* the current position which must have been in the list when the RCU read
* lock was taken.
* This would typically require either that you obtained the node from a
* previous walk of the list in the same RCU read-side critical section, or
* that you held some sort of non-RCU reference (such as a reference count)
* to keep the node alive *and* in the list.
*
* This iterator is similar to list_for_each_entry_from_rcu() except
* this starts after the given position and that one starts at the given
* position.
*/
#define list_for_each_entry_continue_rcu(pos, head, member) \
for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \
&pos->member != (head); \
pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
/**
* list_for_each_entry_from_rcu - iterate over a list from current point
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the list_node within the struct.
*
* Iterate over the tail of a list starting from a given position,
* which must have been in the list when the RCU read lock was taken.
* This would typically require either that you obtained the node from a
* previous walk of the list in the same RCU read-side critical section, or
* that you held some sort of non-RCU reference (such as a reference count)
* to keep the node alive *and* in the list.
*
* This iterator is similar to list_for_each_entry_continue_rcu() except
* this starts from the given position and that one starts from the position
* after the given position.
*/
#define list_for_each_entry_from_rcu(pos, head, member) \
for (; &(pos)->member != (head); \
pos = list_entry_rcu(pos->member.next, typeof(*(pos)), member))
/**
* hlist_del_rcu - deletes entry from hash list without re-initialization
* @n: the element to delete from the hash list.
*
* Note: list_unhashed() on entry does not return true after this,
* the entry is in an undefined state. It is useful for RCU based
* lockfree traversal.
*
* In particular, it means that we can not poison the forward
* pointers that may still be used for walking the hash list.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as hlist_add_head_rcu()
* or hlist_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* hlist_for_each_entry().
*/
static inline void hlist_del_rcu(struct hlist_node *n)
{
__hlist_del(n);
WRITE_ONCE(n->pprev, LIST_POISON2);
}
/**
* hlist_replace_rcu - replace old entry by new one
* @old : the element to be replaced
* @new : the new element to insert
*
* The @old entry will be replaced with the @new entry atomically.
*/
static inline void hlist_replace_rcu(struct hlist_node *old,
struct hlist_node *new)
{
struct hlist_node *next = old->next;
new->next = next;
WRITE_ONCE(new->pprev, old->pprev);
rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
if (next)
WRITE_ONCE(new->next->pprev, &new->next);
WRITE_ONCE(old->pprev, LIST_POISON2);
}
/**
* hlists_swap_heads_rcu - swap the lists the hlist heads point to
* @left: The hlist head on the left
* @right: The hlist head on the right
*
* The lists start out as [@left ][node1 ... ] and
* [@right ][node2 ... ]
* The lists end up as [@left ][node2 ... ]
* [@right ][node1 ... ]
*/
static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right)
{
struct hlist_node *node1 = left->first;
struct hlist_node *node2 = right->first;
rcu_assign_pointer(left->first, node2);
rcu_assign_pointer(right->first, node1);
WRITE_ONCE(node2->pprev, &left->first);
WRITE_ONCE(node1->pprev, &right->first);
}
/*
* return the first or the next element in an RCU protected hlist
*/
#define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first)))
#define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next)))
#define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev)))
/**
* hlist_add_head_rcu
* @n: the element to add to the hash list.
* @h: the list to add to.
*
* Description:
* Adds the specified element to the specified hlist,
* while permitting racing traversals.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as hlist_add_head_rcu()
* or hlist_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* hlist_for_each_entry_rcu(), used to prevent memory-consistency
* problems on Alpha CPUs. Regardless of the type of CPU, the
* list-traversal primitive must be guarded by rcu_read_lock().
*/
static inline void hlist_add_head_rcu(struct hlist_node *n,
struct hlist_head *h)
{
struct hlist_node *first = h->first; n->next = first;
WRITE_ONCE(n->pprev, &h->first);
rcu_assign_pointer(hlist_first_rcu(h), n);
if (first)
WRITE_ONCE(first->pprev, &n->next);
}
/**
* hlist_add_tail_rcu
* @n: the element to add to the hash list.
* @h: the list to add to.
*
* Description:
* Adds the specified element to the specified hlist,
* while permitting racing traversals.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as hlist_add_head_rcu()
* or hlist_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* hlist_for_each_entry_rcu(), used to prevent memory-consistency
* problems on Alpha CPUs. Regardless of the type of CPU, the
* list-traversal primitive must be guarded by rcu_read_lock().
*/
static inline void hlist_add_tail_rcu(struct hlist_node *n,
struct hlist_head *h)
{
struct hlist_node *i, *last = NULL;
/* Note: write side code, so rcu accessors are not needed. */
for (i = h->first; i; i = i->next)
last = i;
if (last) { n->next = last->next;
WRITE_ONCE(n->pprev, &last->next);
rcu_assign_pointer(hlist_next_rcu(last), n);
} else {
hlist_add_head_rcu(n, h);
}
}
/**
* hlist_add_before_rcu
* @n: the new element to add to the hash list.
* @next: the existing element to add the new element before.
*
* Description:
* Adds the specified element to the specified hlist
* before the specified node while permitting racing traversals.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as hlist_add_head_rcu()
* or hlist_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* hlist_for_each_entry_rcu(), used to prevent memory-consistency
* problems on Alpha CPUs.
*/
static inline void hlist_add_before_rcu(struct hlist_node *n,
struct hlist_node *next)
{
WRITE_ONCE(n->pprev, next->pprev);
n->next = next;
rcu_assign_pointer(hlist_pprev_rcu(n), n);
WRITE_ONCE(next->pprev, &n->next);
}
/**
* hlist_add_behind_rcu
* @n: the new element to add to the hash list.
* @prev: the existing element to add the new element after.
*
* Description:
* Adds the specified element to the specified hlist
* after the specified node while permitting racing traversals.
*
* The caller must take whatever precautions are necessary
* (such as holding appropriate locks) to avoid racing
* with another list-mutation primitive, such as hlist_add_head_rcu()
* or hlist_del_rcu(), running on this same list.
* However, it is perfectly legal to run concurrently with
* the _rcu list-traversal primitives, such as
* hlist_for_each_entry_rcu(), used to prevent memory-consistency
* problems on Alpha CPUs.
*/
static inline void hlist_add_behind_rcu(struct hlist_node *n,
struct hlist_node *prev)
{
n->next = prev->next;
WRITE_ONCE(n->pprev, &prev->next);
rcu_assign_pointer(hlist_next_rcu(prev), n);
if (n->next)
WRITE_ONCE(n->next->pprev, &n->next);
}
#define __hlist_for_each_rcu(pos, head) \
for (pos = rcu_dereference(hlist_first_rcu(head)); \
pos; \
pos = rcu_dereference(hlist_next_rcu(pos)))
/**
* hlist_for_each_entry_rcu - iterate over rcu list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the hlist_node within the struct.
* @cond: optional lockdep expression if called from non-RCU protection.
*
* This list-traversal primitive may safely run concurrently with
* the _rcu list-mutation primitives such as hlist_add_head_rcu()
* as long as the traversal is guarded by rcu_read_lock().
*/
#define hlist_for_each_entry_rcu(pos, head, member, cond...) \
for (__list_check_rcu(dummy, ## cond, 0), \
pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
typeof(*(pos)), member); \
pos; \
pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
&(pos)->member)), typeof(*(pos)), member))
/**
* hlist_for_each_entry_srcu - iterate over rcu list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the hlist_node within the struct.
* @cond: lockdep expression for the lock required to traverse the list.
*
* This list-traversal primitive may safely run concurrently with
* the _rcu list-mutation primitives such as hlist_add_head_rcu()
* as long as the traversal is guarded by srcu_read_lock().
* The lockdep expression srcu_read_lock_held() can be passed as the
* cond argument from read side.
*/
#define hlist_for_each_entry_srcu(pos, head, member, cond) \
for (__list_check_srcu(cond), \
pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
typeof(*(pos)), member); \
pos; \
pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
&(pos)->member)), typeof(*(pos)), member))
/**
* hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing)
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the hlist_node within the struct.
*
* This list-traversal primitive may safely run concurrently with
* the _rcu list-mutation primitives such as hlist_add_head_rcu()
* as long as the traversal is guarded by rcu_read_lock().
*
* This is the same as hlist_for_each_entry_rcu() except that it does
* not do any RCU debugging or tracing.
*/
#define hlist_for_each_entry_rcu_notrace(pos, head, member) \
for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
typeof(*(pos)), member); \
pos; \
pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
&(pos)->member)), typeof(*(pos)), member))
/**
* hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the hlist_node within the struct.
*
* This list-traversal primitive may safely run concurrently with
* the _rcu list-mutation primitives such as hlist_add_head_rcu()
* as long as the traversal is guarded by rcu_read_lock().
*/
#define hlist_for_each_entry_rcu_bh(pos, head, member) \
for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\
typeof(*(pos)), member); \
pos; \
pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\
&(pos)->member)), typeof(*(pos)), member))
/**
* hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point
* @pos: the type * to use as a loop cursor.
* @member: the name of the hlist_node within the struct.
*/
#define hlist_for_each_entry_continue_rcu(pos, member) \
for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
&(pos)->member)), typeof(*(pos)), member); \
pos; \
pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
&(pos)->member)), typeof(*(pos)), member))
/**
* hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point
* @pos: the type * to use as a loop cursor.
* @member: the name of the hlist_node within the struct.
*/
#define hlist_for_each_entry_continue_rcu_bh(pos, member) \
for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \
&(pos)->member)), typeof(*(pos)), member); \
pos; \
pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \
&(pos)->member)), typeof(*(pos)), member))
/**
* hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
* @pos: the type * to use as a loop cursor.
* @member: the name of the hlist_node within the struct.
*/
#define hlist_for_each_entry_from_rcu(pos, member) \
for (; pos; \
pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
&(pos)->member)), typeof(*(pos)), member))
#endif /* __KERNEL__ */
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/poison.h>
#include <linux/const.h>
#include <linux/kernel.h>
/*
* Circular doubly linked list implementation.
*
* Some of the internal functions ("__xxx") are useful when
* manipulating whole lists rather than single entries, as
* sometimes we already know the next/prev entries and we can
* generate better code by using them directly rather than
* using the generic single-entry routines.
*/
#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define LIST_HEAD(name) \
struct list_head name = LIST_HEAD_INIT(name)
/**
* INIT_LIST_HEAD - Initialize a list_head structure
* @list: list_head structure to be initialized.
*
* Initializes the list_head to point to itself. If it is a list header,
* the result is an empty list.
*/
static inline void INIT_LIST_HEAD(struct list_head *list)
{
WRITE_ONCE(list->next, list);
list->prev = list;
}
#ifdef CONFIG_DEBUG_LIST
extern bool __list_add_valid(struct list_head *new,
struct list_head *prev,
struct list_head *next);
extern bool __list_del_entry_valid(struct list_head *entry);
#else
static inline bool __list_add_valid(struct list_head *new,
struct list_head *prev,
struct list_head *next)
{
return true;
}
static inline bool __list_del_entry_valid(struct list_head *entry)
{
return true;
}
#endif
/*
* Insert a new entry between two known consecutive entries.
*
* This is only for internal list manipulation where we know
* the prev/next entries already!
*/
static inline void __list_add(struct list_head *new,
struct list_head *prev,
struct list_head *next)
{
if (!__list_add_valid(new, prev, next))
return;
next->prev = new;
new->next = next;
new->prev = prev;
WRITE_ONCE(prev->next, new);
}
/**
* list_add - add a new entry
* @new: new entry to be added
* @head: list head to add it after
*
* Insert a new entry after the specified head.
* This is good for implementing stacks.
*/
static inline void list_add(struct list_head *new, struct list_head *head)
{
__list_add(new, head, head->next);
}
/**
* list_add_tail - add a new entry
* @new: new entry to be added
* @head: list head to add it before
*
* Insert a new entry before the specified head.
* This is useful for implementing queues.
*/
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
__list_add(new, head->prev, head);}
/*
* Delete a list entry by making the prev/next entries
* point to each other.
*
* This is only for internal list manipulation where we know
* the prev/next entries already!
*/
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
next->prev = prev; WRITE_ONCE(prev->next, next);
}
/*
* Delete a list entry and clear the 'prev' pointer.
*
* This is a special-purpose list clearing method used in the networking code
* for lists allocated as per-cpu, where we don't want to incur the extra
* WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this
* needs to check the node 'prev' pointer instead of calling list_empty().
*/
static inline void __list_del_clearprev(struct list_head *entry)
{
__list_del(entry->prev, entry->next);
entry->prev = NULL;
}
static inline void __list_del_entry(struct list_head *entry)
{
if (!__list_del_entry_valid(entry))
return;
__list_del(entry->prev, entry->next);
}
/**
* list_del - deletes entry from list.
* @entry: the element to delete from the list.
* Note: list_empty() on entry does not return true after this, the entry is
* in an undefined state.
*/
static inline void list_del(struct list_head *entry)
{
__list_del_entry(entry);
entry->next = LIST_POISON1;
entry->prev = LIST_POISON2;
}
/**
* list_replace - replace old entry by new one
* @old : the element to be replaced
* @new : the new element to insert
*
* If @old was empty, it will be overwritten.
*/
static inline void list_replace(struct list_head *old,
struct list_head *new)
{
new->next = old->next;
new->next->prev = new;
new->prev = old->prev;
new->prev->next = new;
}
/**
* list_replace_init - replace old entry by new one and initialize the old one
* @old : the element to be replaced
* @new : the new element to insert
*
* If @old was empty, it will be overwritten.
*/
static inline void list_replace_init(struct list_head *old,
struct list_head *new)
{
list_replace(old, new);
INIT_LIST_HEAD(old);
}
/**
* list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position
* @entry1: the location to place entry2
* @entry2: the location to place entry1
*/
static inline void list_swap(struct list_head *entry1,
struct list_head *entry2)
{
struct list_head *pos = entry2->prev;
list_del(entry2);
list_replace(entry1, entry2);
if (pos == entry1)
pos = entry2;
list_add(entry1, pos);
}
/**
* list_del_init - deletes entry from list and reinitialize it.
* @entry: the element to delete from the list.
*/
static inline void list_del_init(struct list_head *entry)
{
__list_del_entry(entry);
INIT_LIST_HEAD(entry);
}
/**
* list_move - delete from one list and add as another's head
* @list: the entry to move
* @head: the head that will precede our entry
*/
static inline void list_move(struct list_head *list, struct list_head *head)
{
__list_del_entry(list);
list_add(list, head);
}
/**
* list_move_tail - delete from one list and add as another's tail
* @list: the entry to move
* @head: the head that will follow our entry
*/
static inline void list_move_tail(struct list_head *list,
struct list_head *head)
{
__list_del_entry(list);
list_add_tail(list, head);
}
/**
* list_bulk_move_tail - move a subsection of a list to its tail
* @head: the head that will follow our entry
* @first: first entry to move
* @last: last entry to move, can be the same as first
*
* Move all entries between @first and including @last before @head.
* All three entries must belong to the same linked list.
*/
static inline void list_bulk_move_tail(struct list_head *head,
struct list_head *first,
struct list_head *last)
{
first->prev->next = last->next;
last->next->prev = first->prev;
head->prev->next = first;
first->prev = head->prev;
last->next = head;
head->prev = last;
}
/**
* list_is_first -- tests whether @list is the first entry in list @head
* @list: the entry to test
* @head: the head of the list
*/
static inline int list_is_first(const struct list_head *list,
const struct list_head *head)
{
return list->prev == head;
}
/**
* list_is_last - tests whether @list is the last entry in list @head
* @list: the entry to test
* @head: the head of the list
*/
static inline int list_is_last(const struct list_head *list,
const struct list_head *head)
{
return list->next == head;
}
/**
* list_empty - tests whether a list is empty
* @head: the list to test.
*/
static inline int list_empty(const struct list_head *head)
{
return READ_ONCE(head->next) == head;
}
/**
* list_del_init_careful - deletes entry from list and reinitialize it.
* @entry: the element to delete from the list.
*
* This is the same as list_del_init(), except designed to be used
* together with list_empty_careful() in a way to guarantee ordering
* of other memory operations.
*
* Any memory operations done before a list_del_init_careful() are
* guaranteed to be visible after a list_empty_careful() test.
*/
static inline void list_del_init_careful(struct list_head *entry)
{
__list_del_entry(entry);
entry->prev = entry;
smp_store_release(&entry->next, entry);
}
/**
* list_empty_careful - tests whether a list is empty and not being modified
* @head: the list to test
*
* Description:
* tests whether a list is empty _and_ checks that no other CPU might be
* in the process of modifying either member (next or prev)
*
* NOTE: using list_empty_careful() without synchronization
* can only be safe if the only activity that can happen
* to the list entry is list_del_init(). Eg. it cannot be used
* if another CPU could re-list_add() it.
*/
static inline int list_empty_careful(const struct list_head *head)
{
struct list_head *next = smp_load_acquire(&head->next);
return (next == head) && (next == head->prev);
}
/**
* list_rotate_left - rotate the list to the left
* @head: the head of the list
*/
static inline void list_rotate_left(struct list_head *head)
{
struct list_head *first;
if (!list_empty(head)) {
first = head->next;
list_move_tail(first, head);
}
}
/**
* list_rotate_to_front() - Rotate list to specific item.
* @list: The desired new front of the list.
* @head: The head of the list.
*
* Rotates list so that @list becomes the new front of the list.
*/
static inline void list_rotate_to_front(struct list_head *list,
struct list_head *head)
{
/*
* Deletes the list head from the list denoted by @head and
* places it as the tail of @list, this effectively rotates the
* list so that @list is at the front.
*/
list_move_tail(head, list);
}
/**
* list_is_singular - tests whether a list has just one entry.
* @head: the list to test.
*/
static inline int list_is_singular(const struct list_head *head)
{
return !list_empty(head) && (head->next == head->prev);
}
static inline void __list_cut_position(struct list_head *list,
struct list_head *head, struct list_head *entry)
{
struct list_head *new_first = entry->next;
list->next = head->next;
list->next->prev = list;
list->prev = entry;
entry->next = list;
head->next = new_first;
new_first->prev = head;
}
/**
* list_cut_position - cut a list into two
* @list: a new list to add all removed entries
* @head: a list with entries
* @entry: an entry within head, could be the head itself
* and if so we won't cut the list
*
* This helper moves the initial part of @head, up to and
* including @entry, from @head to @list. You should
* pass on @entry an element you know is on @head. @list
* should be an empty list or a list you do not care about
* losing its data.
*
*/
static inline void list_cut_position(struct list_head *list,
struct list_head *head, struct list_head *entry)
{
if (list_empty(head))
return;
if (list_is_singular(head) &&
(head->next != entry && head != entry))
return;
if (entry == head)
INIT_LIST_HEAD(list);
else
__list_cut_position(list, head, entry);
}
/**
* list_cut_before - cut a list into two, before given entry
* @list: a new list to add all removed entries
* @head: a list with entries
* @entry: an entry within head, could be the head itself
*
* This helper moves the initial part of @head, up to but
* excluding @entry, from @head to @list. You should pass
* in @entry an element you know is on @head. @list should
* be an empty list or a list you do not care about losing
* its data.
* If @entry == @head, all entries on @head are moved to
* @list.
*/
static inline void list_cut_before(struct list_head *list,
struct list_head *head,
struct list_head *entry)
{
if (head->next == entry) {
INIT_LIST_HEAD(list);
return;
}
list->next = head->next;
list->next->prev = list;
list->prev = entry->prev;
list->prev->next = list;
head->next = entry;
entry->prev = head;
}
static inline void __list_splice(const struct list_head *list,
struct list_head *prev,
struct list_head *next)
{
struct list_head *first = list->next;
struct list_head *last = list->prev;
first->prev = prev;
prev->next = first;
last->next = next;
next->prev = last;
}
/**
* list_splice - join two lists, this is designed for stacks
* @list: the new list to add.
* @head: the place to add it in the first list.
*/
static inline void list_splice(const struct list_head *list,
struct list_head *head)
{
if (!list_empty(list))
__list_splice(list, head, head->next);
}
/**
* list_splice_tail - join two lists, each list being a queue
* @list: the new list to add.
* @head: the place to add it in the first list.
*/
static inline void list_splice_tail(struct list_head *list,
struct list_head *head)
{
if (!list_empty(list))
__list_splice(list, head->prev, head);
}
/**
* list_splice_init - join two lists and reinitialise the emptied list.
* @list: the new list to add.
* @head: the place to add it in the first list.
*
* The list at @list is reinitialised
*/
static inline void list_splice_init(struct list_head *list,
struct list_head *head)
{
if (!list_empty(list)) {
__list_splice(list, head, head->next);
INIT_LIST_HEAD(list);
}
}
/**
* list_splice_tail_init - join two lists and reinitialise the emptied list
* @list: the new list to add.
* @head: the place to add it in the first list.
*
* Each of the lists is a queue.
* The list at @list is reinitialised
*/
static inline void list_splice_tail_init(struct list_head *list,
struct list_head *head)
{
if (!list_empty(list)) {
__list_splice(list, head->prev, head);
INIT_LIST_HEAD(list);
}
}
/**
* list_entry - get the struct for this entry
* @ptr: the &struct list_head pointer.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_head within the struct.
*/
#define list_entry(ptr, type, member) \
container_of(ptr, type, member)
/**
* list_first_entry - get the first element from a list
* @ptr: the list head to take the element from.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_head within the struct.
*
* Note, that list is expected to be not empty.
*/
#define list_first_entry(ptr, type, member) \
list_entry((ptr)->next, type, member)
/**
* list_last_entry - get the last element from a list
* @ptr: the list head to take the element from.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_head within the struct.
*
* Note, that list is expected to be not empty.
*/
#define list_last_entry(ptr, type, member) \
list_entry((ptr)->prev, type, member)
/**
* list_first_entry_or_null - get the first element from a list
* @ptr: the list head to take the element from.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_head within the struct.
*
* Note that if the list is empty, it returns NULL.
*/
#define list_first_entry_or_null(ptr, type, member) ({ \
struct list_head *head__ = (ptr); \
struct list_head *pos__ = READ_ONCE(head__->next); \
pos__ != head__ ? list_entry(pos__, type, member) : NULL; \
})
/**
* list_next_entry - get the next element in list
* @pos: the type * to cursor
* @member: the name of the list_head within the struct.
*/
#define list_next_entry(pos, member) \
list_entry((pos)->member.next, typeof(*(pos)), member)
/**
* list_prev_entry - get the prev element in list
* @pos: the type * to cursor
* @member: the name of the list_head within the struct.
*/
#define list_prev_entry(pos, member) \
list_entry((pos)->member.prev, typeof(*(pos)), member)
/**
* list_for_each - iterate over a list
* @pos: the &struct list_head to use as a loop cursor.
* @head: the head for your list.
*/
#define list_for_each(pos, head) \
for (pos = (head)->next; pos != (head); pos = pos->next)
/**
* list_for_each_continue - continue iteration over a list
* @pos: the &struct list_head to use as a loop cursor.
* @head: the head for your list.
*
* Continue to iterate over a list, continuing after the current position.
*/
#define list_for_each_continue(pos, head) \
for (pos = pos->next; pos != (head); pos = pos->next)
/**
* list_for_each_prev - iterate over a list backwards
* @pos: the &struct list_head to use as a loop cursor.
* @head: the head for your list.
*/
#define list_for_each_prev(pos, head) \
for (pos = (head)->prev; pos != (head); pos = pos->prev)
/**
* list_for_each_safe - iterate over a list safe against removal of list entry
* @pos: the &struct list_head to use as a loop cursor.
* @n: another &struct list_head to use as temporary storage
* @head: the head for your list.
*/
#define list_for_each_safe(pos, n, head) \
for (pos = (head)->next, n = pos->next; pos != (head); \
pos = n, n = pos->next)
/**
* list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
* @pos: the &struct list_head to use as a loop cursor.
* @n: another &struct list_head to use as temporary storage
* @head: the head for your list.
*/
#define list_for_each_prev_safe(pos, n, head) \
for (pos = (head)->prev, n = pos->prev; \
pos != (head); \
pos = n, n = pos->prev)
/**
* list_entry_is_head - test if the entry points to the head of the list
* @pos: the type * to cursor
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*/
#define list_entry_is_head(pos, head, member) \
(&pos->member == (head))
/**
* list_for_each_entry - iterate over list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*/
#define list_for_each_entry(pos, head, member) \
for (pos = list_first_entry(head, typeof(*pos), member); \
!list_entry_is_head(pos, head, member); \
pos = list_next_entry(pos, member))
/**
* list_for_each_entry_reverse - iterate backwards over list of given type.
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*/
#define list_for_each_entry_reverse(pos, head, member) \
for (pos = list_last_entry(head, typeof(*pos), member); \
!list_entry_is_head(pos, head, member); \
pos = list_prev_entry(pos, member))
/**
* list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue()
* @pos: the type * to use as a start point
* @head: the head of the list
* @member: the name of the list_head within the struct.
*
* Prepares a pos entry for use as a start point in list_for_each_entry_continue().
*/
#define list_prepare_entry(pos, head, member) \
((pos) ? : list_entry(head, typeof(*pos), member))
/**
* list_for_each_entry_continue - continue iteration over list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*
* Continue to iterate over list of given type, continuing after
* the current position.
*/
#define list_for_each_entry_continue(pos, head, member) \
for (pos = list_next_entry(pos, member); \
!list_entry_is_head(pos, head, member); \
pos = list_next_entry(pos, member))
/**
* list_for_each_entry_continue_reverse - iterate backwards from the given point
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*
* Start to iterate over list of given type backwards, continuing after
* the current position.
*/
#define list_for_each_entry_continue_reverse(pos, head, member) \
for (pos = list_prev_entry(pos, member); \
!list_entry_is_head(pos, head, member); \
pos = list_prev_entry(pos, member))
/**
* list_for_each_entry_from - iterate over list of given type from the current point
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*
* Iterate over list of given type, continuing from current position.
*/
#define list_for_each_entry_from(pos, head, member) \
for (; !list_entry_is_head(pos, head, member); \
pos = list_next_entry(pos, member))
/**
* list_for_each_entry_from_reverse - iterate backwards over list of given type
* from the current point
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*
* Iterate backwards over list of given type, continuing from current position.
*/
#define list_for_each_entry_from_reverse(pos, head, member) \
for (; !list_entry_is_head(pos, head, member); \
pos = list_prev_entry(pos, member))
/**
* list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
* @pos: the type * to use as a loop cursor.
* @n: another type * to use as temporary storage
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*/
#define list_for_each_entry_safe(pos, n, head, member) \
for (pos = list_first_entry(head, typeof(*pos), member), \
n = list_next_entry(pos, member); \
!list_entry_is_head(pos, head, member); \
pos = n, n = list_next_entry(n, member))
/**
* list_for_each_entry_safe_continue - continue list iteration safe against removal
* @pos: the type * to use as a loop cursor.
* @n: another type * to use as temporary storage
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*
* Iterate over list of given type, continuing after current point,
* safe against removal of list entry.
*/
#define list_for_each_entry_safe_continue(pos, n, head, member) \
for (pos = list_next_entry(pos, member), \
n = list_next_entry(pos, member); \
!list_entry_is_head(pos, head, member); \
pos = n, n = list_next_entry(n, member))
/**
* list_for_each_entry_safe_from - iterate over list from current point safe against removal
* @pos: the type * to use as a loop cursor.
* @n: another type * to use as temporary storage
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*
* Iterate over list of given type from current point, safe against
* removal of list entry.
*/
#define list_for_each_entry_safe_from(pos, n, head, member) \
for (n = list_next_entry(pos, member); \
!list_entry_is_head(pos, head, member); \
pos = n, n = list_next_entry(n, member))
/**
* list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
* @pos: the type * to use as a loop cursor.
* @n: another type * to use as temporary storage
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*
* Iterate backwards over list of given type, safe against removal
* of list entry.
*/
#define list_for_each_entry_safe_reverse(pos, n, head, member) \
for (pos = list_last_entry(head, typeof(*pos), member), \
n = list_prev_entry(pos, member); \
!list_entry_is_head(pos, head, member); \
pos = n, n = list_prev_entry(n, member))
/**
* list_safe_reset_next - reset a stale list_for_each_entry_safe loop
* @pos: the loop cursor used in the list_for_each_entry_safe loop
* @n: temporary storage used in list_for_each_entry_safe
* @member: the name of the list_head within the struct.
*
* list_safe_reset_next is not safe to use in general if the list may be
* modified concurrently (eg. the lock is dropped in the loop body). An
* exception to this is if the cursor element (pos) is pinned in the list,
* and list_safe_reset_next is called after re-taking the lock and before
* completing the current iteration of the loop body.
*/
#define list_safe_reset_next(pos, n, member) \
n = list_next_entry(pos, member)
/*
* Double linked lists with a single pointer list head.
* Mostly useful for hash tables where the two pointer list head is
* too wasteful.
* You lose the ability to access the tail in O(1).
*/
#define HLIST_HEAD_INIT { .first = NULL }
#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL }
#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
static inline void INIT_HLIST_NODE(struct hlist_node *h)
{
h->next = NULL;
h->pprev = NULL;
}
/**
* hlist_unhashed - Has node been removed from list and reinitialized?
* @h: Node to be checked
*
* Not that not all removal functions will leave a node in unhashed
* state. For example, hlist_nulls_del_init_rcu() does leave the
* node in unhashed state, but hlist_nulls_del() does not.
*/
static inline int hlist_unhashed(const struct hlist_node *h)
{
return !h->pprev;
}
/**
* hlist_unhashed_lockless - Version of hlist_unhashed for lockless use
* @h: Node to be checked
*
* This variant of hlist_unhashed() must be used in lockless contexts
* to avoid potential load-tearing. The READ_ONCE() is paired with the
* various WRITE_ONCE() in hlist helpers that are defined below.
*/
static inline int hlist_unhashed_lockless(const struct hlist_node *h)
{
return !READ_ONCE(h->pprev);
}
/**
* hlist_empty - Is the specified hlist_head structure an empty hlist?
* @h: Structure to check.
*/
static inline int hlist_empty(const struct hlist_head *h)
{
return !READ_ONCE(h->first);
}
static inline void __hlist_del(struct hlist_node *n)
{
struct hlist_node *next = n->next;
struct hlist_node **pprev = n->pprev;
WRITE_ONCE(*pprev, next);
if (next)
WRITE_ONCE(next->pprev, pprev);
}
/**
* hlist_del - Delete the specified hlist_node from its list
* @n: Node to delete.
*
* Note that this function leaves the node in hashed state. Use
* hlist_del_init() or similar instead to unhash @n.
*/
static inline void hlist_del(struct hlist_node *n)
{
__hlist_del(n);
n->next = LIST_POISON1;
n->pprev = LIST_POISON2;
}
/**
* hlist_del_init - Delete the specified hlist_node from its list and initialize
* @n: Node to delete.
*
* Note that this function leaves the node in unhashed state.
*/
static inline void hlist_del_init(struct hlist_node *n)
{
if (!hlist_unhashed(n)) { __hlist_del(n);
INIT_HLIST_NODE(n);
}
}
/**
* hlist_add_head - add a new entry at the beginning of the hlist
* @n: new entry to be added
* @h: hlist head to add it after
*
* Insert a new entry after the specified head.
* This is good for implementing stacks.
*/
static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
struct hlist_node *first = h->first;
WRITE_ONCE(n->next, first);
if (first)
WRITE_ONCE(first->pprev, &n->next); WRITE_ONCE(h->first, n);
WRITE_ONCE(n->pprev, &h->first);
}
/**
* hlist_add_before - add a new entry before the one specified
* @n: new entry to be added
* @next: hlist node to add it before, which must be non-NULL
*/
static inline void hlist_add_before(struct hlist_node *n,
struct hlist_node *next)
{
WRITE_ONCE(n->pprev, next->pprev);
WRITE_ONCE(n->next, next);
WRITE_ONCE(next->pprev, &n->next);
WRITE_ONCE(*(n->pprev), n);
}
/**
* hlist_add_behind - add a new entry after the one specified
* @n: new entry to be added
* @prev: hlist node to add it after, which must be non-NULL
*/
static inline void hlist_add_behind(struct hlist_node *n,
struct hlist_node *prev)
{
WRITE_ONCE(n->next, prev->next);
WRITE_ONCE(prev->next, n);
WRITE_ONCE(n->pprev, &prev->next);
if (n->next)
WRITE_ONCE(n->next->pprev, &n->next);
}
/**
* hlist_add_fake - create a fake hlist consisting of a single headless node
* @n: Node to make a fake list out of
*
* This makes @n appear to be its own predecessor on a headless hlist.
* The point of this is to allow things like hlist_del() to work correctly
* in cases where there is no list.
*/
static inline void hlist_add_fake(struct hlist_node *n)
{
n->pprev = &n->next;
}
/**
* hlist_fake: Is this node a fake hlist?
* @h: Node to check for being a self-referential fake hlist.
*/
static inline bool hlist_fake(struct hlist_node *h)
{
return h->pprev == &h->next;
}
/**
* hlist_is_singular_node - is node the only element of the specified hlist?
* @n: Node to check for singularity.
* @h: Header for potentially singular list.
*
* Check whether the node is the only node of the head without
* accessing head, thus avoiding unnecessary cache misses.
*/
static inline bool
hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h)
{
return !n->next && n->pprev == &h->first;
}
/**
* hlist_move_list - Move an hlist
* @old: hlist_head for old list.
* @new: hlist_head for new list.
*
* Move a list from one list head to another. Fixup the pprev
* reference of the first entry if it exists.
*/
static inline void hlist_move_list(struct hlist_head *old,
struct hlist_head *new)
{
new->first = old->first;
if (new->first)
new->first->pprev = &new->first; old->first = NULL;
}
#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
#define hlist_for_each(pos, head) \
for (pos = (head)->first; pos ; pos = pos->next)
#define hlist_for_each_safe(pos, n, head) \
for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
pos = n)
#define hlist_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \
____ptr ? hlist_entry(____ptr, type, member) : NULL; \
})
/**
* hlist_for_each_entry - iterate over list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the hlist_node within the struct.
*/
#define hlist_for_each_entry(pos, head, member) \
for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
pos; \
pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
/**
* hlist_for_each_entry_continue - iterate over a hlist continuing after current point
* @pos: the type * to use as a loop cursor.
* @member: the name of the hlist_node within the struct.
*/
#define hlist_for_each_entry_continue(pos, member) \
for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\
pos; \
pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
/**
* hlist_for_each_entry_from - iterate over a hlist continuing from current point
* @pos: the type * to use as a loop cursor.
* @member: the name of the hlist_node within the struct.
*/
#define hlist_for_each_entry_from(pos, member) \
for (; pos; \
pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
/**
* hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
* @pos: the type * to use as a loop cursor.
* @n: a &struct hlist_node to use as temporary storage
* @head: the head for your list.
* @member: the name of the hlist_node within the struct.
*/
#define hlist_for_each_entry_safe(pos, n, head, member) \
for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\
pos && ({ n = pos->member.next; 1; }); \
pos = hlist_entry_safe(n, typeof(*pos), member))
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SMP_H
#define __LINUX_SMP_H
/*
* Generic SMP support
* Alan Cox. <alan@redhat.com>
*/
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/smp_types.h>
typedef void (*smp_call_func_t)(void *info);
typedef bool (*smp_cond_func_t)(int cpu, void *info);
/*
* structure shares (partial) layout with struct irq_work
*/
struct __call_single_data {
struct __call_single_node node;
smp_call_func_t func;
void *info;
};
#define CSD_INIT(_func, _info) \
(struct __call_single_data){ .func = (_func), .info = (_info), }
/* Use __aligned() to avoid to use 2 cache lines for 1 csd */
typedef struct __call_single_data call_single_data_t
__aligned(sizeof(struct __call_single_data));
#define INIT_CSD(_csd, _func, _info) \
do { \
*(_csd) = CSD_INIT((_func), (_info)); \
} while (0)
/*
* Enqueue a llist_node on the call_single_queue; be very careful, read
* flush_smp_call_function_queue() in detail.
*/
extern void __smp_call_single_queue(int cpu, struct llist_node *node);
/* total number of cpus in this system (may exceed NR_CPUS) */
extern unsigned int total_cpus;
int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
int wait);
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
void *info, bool wait, const struct cpumask *mask);
int smp_call_function_single_async(int cpu, struct __call_single_data *csd);
/*
* Cpus stopping functions in panic. All have default weak definitions.
* Architecture-dependent code may override them.
*/
void panic_smp_self_stop(void);
void nmi_panic_self_stop(struct pt_regs *regs);
void crash_smp_send_stop(void);
/*
* Call a function on all processors
*/
static inline void on_each_cpu(smp_call_func_t func, void *info, int wait)
{
on_each_cpu_cond_mask(NULL, func, info, wait, cpu_online_mask);
}
/**
* on_each_cpu_mask(): Run a function on processors specified by
* cpumask, which may include the local processor.
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: If true, wait (atomically) until function has completed
* on other CPUs.
*
* If @wait is true, then returns once @func has returned.
*
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler. The
* exception is that it may be used during early boot while
* early_boot_irqs_disabled is set.
*/
static inline void on_each_cpu_mask(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
{
on_each_cpu_cond_mask(NULL, func, info, wait, mask);
}
/*
* Call a function on each processor for which the supplied function
* cond_func returns a positive value. This may include the local
* processor. May be used during early boot while early_boot_irqs_disabled is
* set. Use local_irq_save/restore() instead of local_irq_disable/enable().
*/
static inline void on_each_cpu_cond(smp_cond_func_t cond_func,
smp_call_func_t func, void *info, bool wait)
{
on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask);
}
#ifdef CONFIG_SMP
#include <linux/preempt.h>
#include <linux/kernel.h>
#include <linux/compiler.h>
#include <linux/thread_info.h>
#include <asm/smp.h>
/*
* main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc.
* (defined in asm header):
*/
/*
* stops all CPUs but the current one:
*/
extern void smp_send_stop(void);
/*
* sends a 'reschedule' event to another CPU:
*/
extern void smp_send_reschedule(int cpu);
/*
* Prepare machine for booting other CPUs.
*/
extern void smp_prepare_cpus(unsigned int max_cpus);
/*
* Bring a CPU up
*/
extern int __cpu_up(unsigned int cpunum, struct task_struct *tidle);
/*
* Final polishing of CPUs
*/
extern void smp_cpus_done(unsigned int max_cpus);
/*
* Call a function on all other processors
*/
void smp_call_function(smp_call_func_t func, void *info, int wait);
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait);
int smp_call_function_any(const struct cpumask *mask,
smp_call_func_t func, void *info, int wait);
void kick_all_cpus_sync(void);
void wake_up_all_idle_cpus(void);
/*
* Generic and arch helpers
*/
void __init call_function_init(void);
void generic_smp_call_function_single_interrupt(void);
#define generic_smp_call_function_interrupt \
generic_smp_call_function_single_interrupt
/*
* Mark the boot cpu "online" so that it can call console drivers in
* printk() and can access its per-cpu storage.
*/
void smp_prepare_boot_cpu(void);
extern unsigned int setup_max_cpus;
extern void __init setup_nr_cpu_ids(void);
extern void __init smp_init(void);
extern int __boot_cpu_id;
static inline int get_boot_cpu_id(void)
{
return __boot_cpu_id;
}
#else /* !SMP */
static inline void smp_send_stop(void) { }
/*
* These macros fold the SMP functionality into a single CPU system
*/
#define raw_smp_processor_id() 0
static inline void up_smp_call_function(smp_call_func_t func, void *info)
{
}
#define smp_call_function(func, info, wait) \
(up_smp_call_function(func, info))
static inline void smp_send_reschedule(int cpu) { }
#define smp_prepare_boot_cpu() do {} while (0)
#define smp_call_function_many(mask, func, info, wait) \
(up_smp_call_function(func, info))
static inline void call_function_init(void) { }
static inline int
smp_call_function_any(const struct cpumask *mask, smp_call_func_t func,
void *info, int wait)
{
return smp_call_function_single(0, func, info, wait);
}
static inline void kick_all_cpus_sync(void) { }
static inline void wake_up_all_idle_cpus(void) { }
#ifdef CONFIG_UP_LATE_INIT
extern void __init up_late_init(void);
static inline void smp_init(void) { up_late_init(); }
#else
static inline void smp_init(void) { }
#endif
static inline int get_boot_cpu_id(void)
{
return 0;
}
#endif /* !SMP */
/**
* raw_processor_id() - get the current (unstable) CPU id
*
* For then you know what you are doing and need an unstable
* CPU id.
*/
/**
* smp_processor_id() - get the current (stable) CPU id
*
* This is the normal accessor to the CPU id and should be used
* whenever possible.
*
* The CPU id is stable when:
*
* - IRQs are disabled;
* - preemption is disabled;
* - the task is CPU affine.
*
* When CONFIG_DEBUG_PREEMPT; we verify these assumption and WARN
* when smp_processor_id() is used when the CPU id is not stable.
*/
/*
* Allow the architecture to differentiate between a stable and unstable read.
* For example, x86 uses an IRQ-safe asm-volatile read for the unstable but a
* regular asm read for the stable.
*/
#ifndef __smp_processor_id
#define __smp_processor_id(x) raw_smp_processor_id(x)
#endif
#ifdef CONFIG_DEBUG_PREEMPT
extern unsigned int debug_smp_processor_id(void);
# define smp_processor_id() debug_smp_processor_id()
#else
# define smp_processor_id() __smp_processor_id()
#endif
#define get_cpu() ({ preempt_disable(); __smp_processor_id(); })
#define put_cpu() preempt_enable()
/*
* Callback to arch code if there's nosmp or maxcpus=0 on the
* boot command line:
*/
extern void arch_disable_smp_support(void);
extern void arch_thaw_secondary_cpus_begin(void);
extern void arch_thaw_secondary_cpus_end(void);
void smp_setup_processor_id(void);
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par,
bool phys);
/* SMP core functions */
int smpcfd_prepare_cpu(unsigned int cpu);
int smpcfd_dead_cpu(unsigned int cpu);
int smpcfd_dying_cpu(unsigned int cpu);
#endif /* __LINUX_SMP_H */
// SPDX-License-Identifier: GPL-2.0
/*
* hrtimers - High-resolution kernel timers
*
* Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
* Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
*
* data type definitions, declarations, prototypes
*
* Started by: Thomas Gleixner and Ingo Molnar
*/
#ifndef _LINUX_HRTIMER_H
#define _LINUX_HRTIMER_H
#include <linux/hrtimer_defs.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/seqlock.h>
#include <linux/timer.h>
#include <linux/timerqueue.h>
struct hrtimer_clock_base;
struct hrtimer_cpu_base;
/*
* Mode arguments of xxx_hrtimer functions:
*
* HRTIMER_MODE_ABS - Time value is absolute
* HRTIMER_MODE_REL - Time value is relative to now
* HRTIMER_MODE_PINNED - Timer is bound to CPU (is only considered
* when starting the timer)
* HRTIMER_MODE_SOFT - Timer callback function will be executed in
* soft irq context
* HRTIMER_MODE_HARD - Timer callback function will be executed in
* hard irq context even on PREEMPT_RT.
*/
enum hrtimer_mode {
HRTIMER_MODE_ABS = 0x00,
HRTIMER_MODE_REL = 0x01,
HRTIMER_MODE_PINNED = 0x02,
HRTIMER_MODE_SOFT = 0x04,
HRTIMER_MODE_HARD = 0x08,
HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
HRTIMER_MODE_ABS_SOFT = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT,
HRTIMER_MODE_REL_SOFT = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT,
HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT,
HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT,
HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD,
HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD,
HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD,
HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD,
};
/*
* Return values for the callback function
*/
enum hrtimer_restart {
HRTIMER_NORESTART, /* Timer is not restarted */
HRTIMER_RESTART, /* Timer must be restarted */
};
/*
* Values to track state of the timer
*
* Possible states:
*
* 0x00 inactive
* 0x01 enqueued into rbtree
*
* The callback state is not part of the timer->state because clearing it would
* mean touching the timer after the callback, this makes it impossible to free
* the timer from the callback function.
*
* Therefore we track the callback state in:
*
* timer->base->cpu_base->running == timer
*
* On SMP it is possible to have a "callback function running and enqueued"
* status. It happens for example when a posix timer expired and the callback
* queued a signal. Between dropping the lock which protects the posix timer
* and reacquiring the base lock of the hrtimer, another CPU can deliver the
* signal and rearm the timer.
*
* All state transitions are protected by cpu_base->lock.
*/
#define HRTIMER_STATE_INACTIVE 0x00
#define HRTIMER_STATE_ENQUEUED 0x01
/**
* struct hrtimer - the basic hrtimer structure
* @node: timerqueue node, which also manages node.expires,
* the absolute expiry time in the hrtimers internal
* representation. The time is related to the clock on
* which the timer is based. Is setup by adding
* slack to the _softexpires value. For non range timers
* identical to _softexpires.
* @_softexpires: the absolute earliest expiry time of the hrtimer.
* The time which was given as expiry time when the timer
* was armed.
* @function: timer expiry callback function
* @base: pointer to the timer base (per cpu and per clock)
* @state: state information (See bit values above)
* @is_rel: Set if the timer was armed relative
* @is_soft: Set if hrtimer will be expired in soft interrupt context.
* @is_hard: Set if hrtimer will be expired in hard interrupt context
* even on RT.
*
* The hrtimer structure must be initialized by hrtimer_init()
*/
struct hrtimer {
struct timerqueue_node node;
ktime_t _softexpires;
enum hrtimer_restart (*function)(struct hrtimer *);
struct hrtimer_clock_base *base;
u8 state;
u8 is_rel;
u8 is_soft;
u8 is_hard;
};
/**
* struct hrtimer_sleeper - simple sleeper structure
* @timer: embedded timer structure
* @task: task to wake up
*
* task is set to NULL, when the timer expires.
*/
struct hrtimer_sleeper {
struct hrtimer timer;
struct task_struct *task;
};
#ifdef CONFIG_64BIT
# define __hrtimer_clock_base_align ____cacheline_aligned
#else
# define __hrtimer_clock_base_align
#endif
/**
* struct hrtimer_clock_base - the timer base for a specific clock
* @cpu_base: per cpu clock base
* @index: clock type index for per_cpu support when moving a
* timer to a base on another cpu.
* @clockid: clock id for per_cpu support
* @seq: seqcount around __run_hrtimer
* @running: pointer to the currently running hrtimer
* @active: red black tree root node for the active timers
* @get_time: function to retrieve the current time of the clock
* @offset: offset of this clock to the monotonic base
*/
struct hrtimer_clock_base {
struct hrtimer_cpu_base *cpu_base;
unsigned int index;
clockid_t clockid;
seqcount_raw_spinlock_t seq;
struct hrtimer *running;
struct timerqueue_head active;
ktime_t (*get_time)(void);
ktime_t offset;
} __hrtimer_clock_base_align;
enum hrtimer_base_type {
HRTIMER_BASE_MONOTONIC,
HRTIMER_BASE_REALTIME,
HRTIMER_BASE_BOOTTIME,
HRTIMER_BASE_TAI,
HRTIMER_BASE_MONOTONIC_SOFT,
HRTIMER_BASE_REALTIME_SOFT,
HRTIMER_BASE_BOOTTIME_SOFT,
HRTIMER_BASE_TAI_SOFT,
HRTIMER_MAX_CLOCK_BASES,
};
/**
* struct hrtimer_cpu_base - the per cpu clock bases
* @lock: lock protecting the base and associated clock bases
* and timers
* @cpu: cpu number
* @active_bases: Bitfield to mark bases with active timers
* @clock_was_set_seq: Sequence counter of clock was set events
* @hres_active: State of high resolution mode
* @in_hrtirq: hrtimer_interrupt() is currently executing
* @hang_detected: The last hrtimer interrupt detected a hang
* @softirq_activated: displays, if the softirq is raised - update of softirq
* related settings is not required then.
* @nr_events: Total number of hrtimer interrupt events
* @nr_retries: Total number of hrtimer interrupt retries
* @nr_hangs: Total number of hrtimer interrupt hangs
* @max_hang_time: Maximum time spent in hrtimer_interrupt
* @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are
* expired
* @timer_waiters: A hrtimer_cancel() invocation waits for the timer
* callback to finish.
* @expires_next: absolute time of the next event, is required for remote
* hrtimer enqueue; it is the total first expiry time (hard
* and soft hrtimer are taken into account)
* @next_timer: Pointer to the first expiring timer
* @softirq_expires_next: Time to check, if soft queues needs also to be expired
* @softirq_next_timer: Pointer to the first expiring softirq based timer
* @clock_base: array of clock bases for this cpu
*
* Note: next_timer is just an optimization for __remove_hrtimer().
* Do not dereference the pointer because it is not reliable on
* cross cpu removals.
*/
struct hrtimer_cpu_base {
raw_spinlock_t lock;
unsigned int cpu;
unsigned int active_bases;
unsigned int clock_was_set_seq;
unsigned int hres_active : 1,
in_hrtirq : 1,
hang_detected : 1,
softirq_activated : 1;
#ifdef CONFIG_HIGH_RES_TIMERS
unsigned int nr_events;
unsigned short nr_retries;
unsigned short nr_hangs;
unsigned int max_hang_time;
#endif
#ifdef CONFIG_PREEMPT_RT
spinlock_t softirq_expiry_lock;
atomic_t timer_waiters;
#endif
ktime_t expires_next;
struct hrtimer *next_timer;
ktime_t softirq_expires_next;
struct hrtimer *softirq_next_timer;
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
} ____cacheline_aligned;
static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
{
timer->node.expires = time;
timer->_softexpires = time;
}
static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
{
timer->_softexpires = time;
timer->node.expires = ktime_add_safe(time, delta);
}
static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta)
{
timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta));
}
static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
{
timer->node.expires = tv64;
timer->_softexpires = tv64;
}
static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
{
timer->node.expires = ktime_add_safe(timer->node.expires, time);
timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
}
static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns)
{
timer->node.expires = ktime_add_ns(timer->node.expires, ns);
timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
}
static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
{
return timer->node.expires;
}
static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
{
return timer->_softexpires;
}
static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
{
return timer->node.expires;
}
static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
{
return timer->_softexpires;
}
static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
{
return ktime_to_ns(timer->node.expires);
}
static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
{
return ktime_sub(timer->node.expires, timer->base->get_time());
}
static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
{
return timer->base->get_time();
}
static inline int hrtimer_is_hres_active(struct hrtimer *timer)
{
return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
timer->base->cpu_base->hres_active : 0;
}
#ifdef CONFIG_HIGH_RES_TIMERS
struct clock_event_device;
extern void hrtimer_interrupt(struct clock_event_device *dev);
extern unsigned int hrtimer_resolution;
#else
#define hrtimer_resolution (unsigned int)LOW_RES_NSEC
#endif
static inline ktime_t
__hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now)
{
ktime_t rem = ktime_sub(timer->node.expires, now);
/*
* Adjust relative timers for the extra we added in
* hrtimer_start_range_ns() to prevent short timeouts.
*/
if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel)
rem -= hrtimer_resolution;
return rem;
}
static inline ktime_t
hrtimer_expires_remaining_adjusted(const struct hrtimer *timer)
{
return __hrtimer_expires_remaining_adjusted(timer,
timer->base->get_time());
}
#ifdef CONFIG_TIMERFD
extern void timerfd_clock_was_set(void);
extern void timerfd_resume(void);
#else
static inline void timerfd_clock_was_set(void) { }
static inline void timerfd_resume(void) { }
#endif
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
#ifdef CONFIG_PREEMPT_RT
void hrtimer_cancel_wait_running(const struct hrtimer *timer);
#else
static inline void hrtimer_cancel_wait_running(struct hrtimer *timer)
{
cpu_relax();
}
#endif
/* Exported timer functions: */
/* Initialize timers: */
extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
enum hrtimer_mode mode);
extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
enum hrtimer_mode mode);
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock,
enum hrtimer_mode mode);
extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
clockid_t clock_id,
enum hrtimer_mode mode);
extern void destroy_hrtimer_on_stack(struct hrtimer *timer);
#else
static inline void hrtimer_init_on_stack(struct hrtimer *timer,
clockid_t which_clock,
enum hrtimer_mode mode)
{
hrtimer_init(timer, which_clock, mode);
}
static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
clockid_t clock_id,
enum hrtimer_mode mode)
{
hrtimer_init_sleeper(sl, clock_id, mode);
}
static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
#endif
/* Basic timer operations: */
extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 range_ns, const enum hrtimer_mode mode);
/**
* hrtimer_start - (re)start an hrtimer
* @timer: the timer to be added
* @tim: expiry time
* @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
* relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
* softirq based mode is considered for debug purpose only!
*/
static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
const enum hrtimer_mode mode)
{
hrtimer_start_range_ns(timer, tim, 0, mode);
}
extern int hrtimer_cancel(struct hrtimer *timer);
extern int hrtimer_try_to_cancel(struct hrtimer *timer);
static inline void hrtimer_start_expires(struct hrtimer *timer,
enum hrtimer_mode mode)
{
u64 delta;
ktime_t soft, hard;
soft = hrtimer_get_softexpires(timer);
hard = hrtimer_get_expires(timer);
delta = ktime_to_ns(ktime_sub(hard, soft));
hrtimer_start_range_ns(timer, soft, delta, mode);
}
void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
enum hrtimer_mode mode);
static inline void hrtimer_restart(struct hrtimer *timer)
{
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
/* Query timers: */
extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
/**
* hrtimer_get_remaining - get remaining time for the timer
* @timer: the timer to read
*/
static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
{
return __hrtimer_get_remaining(timer, false);
}
extern u64 hrtimer_get_next_event(void);
extern u64 hrtimer_next_event_without(const struct hrtimer *exclude);
extern bool hrtimer_active(const struct hrtimer *timer);
/**
* hrtimer_is_queued - check, whether the timer is on one of the queues
* @timer: Timer to check
*
* Returns: True if the timer is queued, false otherwise
*
* The function can be used lockless, but it gives only a current snapshot.
*/
static inline bool hrtimer_is_queued(struct hrtimer *timer)
{
/* The READ_ONCE pairs with the update functions of timer->state */
return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED);
}
/*
* Helper function to check, whether the timer is running the callback
* function
*/
static inline int hrtimer_callback_running(struct hrtimer *timer)
{
return timer->base->running == timer;
}
/* Forward a hrtimer so it expires after now: */
extern u64
hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);
/**
* hrtimer_forward_now - forward the timer expiry so it expires after now
* @timer: hrtimer to forward
* @interval: the interval to forward
*
* Forward the timer expiry so it will expire after the current time
* of the hrtimer clock base. Returns the number of overruns.
*
* Can be safely called from the callback function of @timer. If
* called from other contexts @timer must neither be enqueued nor
* running the callback and the caller needs to take care of
* serialization.
*
* Note: This only updates the timer expiry value and does not requeue
* the timer.
*/
static inline u64 hrtimer_forward_now(struct hrtimer *timer,
ktime_t interval)
{
return hrtimer_forward(timer, timer->base->get_time(), interval);
}
/* Precise sleep: */
extern int nanosleep_copyout(struct restart_block *, struct timespec64 *);
extern long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
const clockid_t clockid);
extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode);
extern int schedule_hrtimeout_range_clock(ktime_t *expires,
u64 delta,
const enum hrtimer_mode mode,
clockid_t clock_id);
extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
/* Soft interrupt function to run the hrtimer queues: */
extern void hrtimer_run_queues(void);
/* Bootup initialization: */
extern void __init hrtimers_init(void);
/* Show pending timers: */
extern void sysrq_timer_list_show(void);
int hrtimers_prepare_cpu(unsigned int cpu);
#ifdef CONFIG_HOTPLUG_CPU
int hrtimers_dead_cpu(unsigned int cpu);
#else
#define hrtimers_dead_cpu NULL
#endif
#endif
// SPDX-License-Identifier: GPL-2.0-or-later
/*
*
* Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
* & Swedish University of Agricultural Sciences.
*
* Jens Laas <jens.laas@data.slu.se> Swedish University of
* Agricultural Sciences.
*
* Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
*
* This work is based on the LPC-trie which is originally described in:
*
* An experimental study of compression methods for dynamic tries
* Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
* https://www.csc.kth.se/~snilsson/software/dyntrie2/
*
* IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
* IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
*
* Code from fib_hash has been reused which includes the following header:
*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* IPv4 FIB: lookup engine and maintenance routines.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Substantial contributions to this work comes from:
*
* David S. Miller, <davem@davemloft.net>
* Stephen Hemminger <shemminger@osdl.org>
* Paul E. McKenney <paulmck@us.ibm.com>
* Patrick McHardy <kaber@trash.net>
*/
#include <linux/cache.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/notifier.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/fib_notifier.h>
#include <trace/events/fib.h>
#include "fib_lookup.h"
static int call_fib_entry_notifier(struct notifier_block *nb,
enum fib_event_type event_type, u32 dst,
int dst_len, struct fib_alias *fa,
struct netlink_ext_ack *extack)
{
struct fib_entry_notifier_info info = {
.info.extack = extack,
.dst = dst,
.dst_len = dst_len,
.fi = fa->fa_info,
.tos = fa->fa_tos,
.type = fa->fa_type,
.tb_id = fa->tb_id,
};
return call_fib4_notifier(nb, event_type, &info.info);
}
static int call_fib_entry_notifiers(struct net *net,
enum fib_event_type event_type, u32 dst,
int dst_len, struct fib_alias *fa,
struct netlink_ext_ack *extack)
{
struct fib_entry_notifier_info info = {
.info.extack = extack,
.dst = dst,
.dst_len = dst_len,
.fi = fa->fa_info,
.tos = fa->fa_tos,
.type = fa->fa_type,
.tb_id = fa->tb_id,
};
return call_fib4_notifiers(net, event_type, &info.info);
}
#define MAX_STAT_DEPTH 32
#define KEYLENGTH (8*sizeof(t_key))
#define KEY_MAX ((t_key)~0)
typedef unsigned int t_key;
#define IS_TRIE(n) ((n)->pos >= KEYLENGTH)
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)
struct key_vector {
t_key key;
unsigned char pos; /* 2log(KEYLENGTH) bits needed */
unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char slen;
union {
/* This list pointer if valid if (pos | bits) == 0 (LEAF) */
struct hlist_head leaf;
/* This array is valid if (pos | bits) > 0 (TNODE) */
struct key_vector __rcu *tnode[0];
};
};
struct tnode {
struct rcu_head rcu;
t_key empty_children; /* KEYLENGTH bits needed */
t_key full_children; /* KEYLENGTH bits needed */
struct key_vector __rcu *parent;
struct key_vector kv[1];
#define tn_bits kv[0].bits
};
#define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n])
#define LEAF_SIZE TNODE_SIZE(1)
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
unsigned int gets;
unsigned int backtrack;
unsigned int semantic_match_passed;
unsigned int semantic_match_miss;
unsigned int null_node_hit;
unsigned int resize_node_skipped;
};
#endif
struct trie_stat {
unsigned int totdepth;
unsigned int maxdepth;
unsigned int tnodes;
unsigned int leaves;
unsigned int nullpointers;
unsigned int prefixes;
unsigned int nodesizes[MAX_STAT_DEPTH];
};
struct trie {
struct key_vector kv[1];
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats;
#endif
};
static struct key_vector *resize(struct trie *t, struct key_vector *tn);
static unsigned int tnode_free_size;
/*
* synchronize_rcu after call_rcu for outstanding dirty memory; it should be
* especially useful before resizing the root node with PREEMPT_NONE configs;
* the value was obtained experimentally, aiming to avoid visible slowdown.
*/
unsigned int sysctl_fib_sync_mem = 512 * 1024;
unsigned int sysctl_fib_sync_mem_min = 64 * 1024;
unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024;
static struct kmem_cache *fn_alias_kmem __ro_after_init;
static struct kmem_cache *trie_leaf_kmem __ro_after_init;
static inline struct tnode *tn_info(struct key_vector *kv)
{
return container_of(kv, struct tnode, kv[0]);
}
/* caller must hold RTNL */
#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
/* caller must hold RCU read lock or RTNL */
#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
/* wrapper for rcu_assign_pointer */
static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
{
if (n)
rcu_assign_pointer(tn_info(n)->parent, tp);
}
#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
/* This provides us with the number of children in this node, in the case of a
* leaf this will return 0 meaning none of the children are accessible.
*/
static inline unsigned long child_length(const struct key_vector *tn)
{
return (1ul << tn->bits) & ~(1ul);
}
#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
static inline unsigned long get_index(t_key key, struct key_vector *kv)
{
unsigned long index = key ^ kv->key;
if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
return 0;
return index >> kv->pos;
}
/* To understand this stuff, an understanding of keys and all their bits is
* necessary. Every node in the trie has a key associated with it, but not
* all of the bits in that key are significant.
*
* Consider a node 'n' and its parent 'tp'.
*
* If n is a leaf, every bit in its key is significant. Its presence is
* necessitated by path compression, since during a tree traversal (when
* searching for a leaf - unless we are doing an insertion) we will completely
* ignore all skipped bits we encounter. Thus we need to verify, at the end of
* a potentially successful search, that we have indeed been walking the
* correct key path.
*
* Note that we can never "miss" the correct key in the tree if present by
* following the wrong path. Path compression ensures that segments of the key
* that are the same for all keys with a given prefix are skipped, but the
* skipped part *is* identical for each node in the subtrie below the skipped
* bit! trie_insert() in this implementation takes care of that.
*
* if n is an internal node - a 'tnode' here, the various parts of its key
* have many different meanings.
*
* Example:
* _________________________________________________________________
* | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
* -----------------------------------------------------------------
* 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16
*
* _________________________________________________________________
* | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
* -----------------------------------------------------------------
* 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
*
* tp->pos = 22
* tp->bits = 3
* n->pos = 13
* n->bits = 4
*
* First, let's just ignore the bits that come before the parent tp, that is
* the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
* point we do not use them for anything.
*
* The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
* index into the parent's child array. That is, they will be used to find
* 'n' among tp's children.
*
* The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
* for the node n.
*
* All the bits we have seen so far are significant to the node n. The rest
* of the bits are really not needed or indeed known in n->key.
*
* The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
* n's child array, and will of course be different for each child.
*
* The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown
* at this point.
*/
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;
static void __alias_free_mem(struct rcu_head *head)
{
struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
kmem_cache_free(fn_alias_kmem, fa);
}
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
call_rcu(&fa->rcu, __alias_free_mem);
}
#define TNODE_VMALLOC_MAX \
ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
static void __node_free_rcu(struct rcu_head *head)
{
struct tnode *n = container_of(head, struct tnode, rcu);
if (!n->tn_bits)
kmem_cache_free(trie_leaf_kmem, n);
else
kvfree(n);
}
#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
static struct tnode *tnode_alloc(int bits)
{
size_t size;
/* verify bits is within bounds */
if (bits > TNODE_VMALLOC_MAX)
return NULL;
/* determine size and verify it is non-zero and didn't overflow */
size = TNODE_SIZE(1ul << bits);
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
else
return vzalloc(size);
}
static inline void empty_child_inc(struct key_vector *n)
{
tn_info(n)->empty_children++;
if (!tn_info(n)->empty_children)
tn_info(n)->full_children++;
}
static inline void empty_child_dec(struct key_vector *n)
{
if (!tn_info(n)->empty_children)
tn_info(n)->full_children--;
tn_info(n)->empty_children--;
}
static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
{
struct key_vector *l;
struct tnode *kv;
kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
if (!kv)
return NULL;
/* initialize key vector */
l = kv->kv;
l->key = key;
l->pos = 0;
l->bits = 0;
l->slen = fa->fa_slen;
/* link leaf to fib alias */
INIT_HLIST_HEAD(&l->leaf);
hlist_add_head(&fa->fa_list, &l->leaf);
return l;
}
static struct key_vector *tnode_new(t_key key, int pos, int bits)
{
unsigned int shift = pos + bits;
struct key_vector *tn;
struct tnode *tnode;
/* verify bits and pos their msb bits clear and values are valid */
BUG_ON(!bits || (shift > KEYLENGTH));
tnode = tnode_alloc(bits);
if (!tnode)
return NULL;
pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
sizeof(struct key_vector *) << bits);
if (bits == KEYLENGTH)
tnode->full_children = 1;
else
tnode->empty_children = 1ul << bits;
tn = tnode->kv;
tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
tn->pos = pos;
tn->bits = bits;
tn->slen = pos;
return tn;
}
/* Check whether a tnode 'n' is "full", i.e. it is an internal node
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
{
return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
}
/* Add a child at position i overwriting the old value.
* Update the value of full_children and empty_children.
*/
static void put_child(struct key_vector *tn, unsigned long i,
struct key_vector *n)
{
struct key_vector *chi = get_child(tn, i);
int isfull, wasfull;
BUG_ON(i >= child_length(tn));
/* update emptyChildren, overflow into fullChildren */
if (!n && chi)
empty_child_inc(tn);
if (n && !chi)
empty_child_dec(tn);
/* update fullChildren */
wasfull = tnode_full(tn, chi);
isfull = tnode_full(tn, n);
if (wasfull && !isfull)
tn_info(tn)->full_children--;
else if (!wasfull && isfull)
tn_info(tn)->full_children++;
if (n && (tn->slen < n->slen))
tn->slen = n->slen;
rcu_assign_pointer(tn->tnode[i], n);
}
static void update_children(struct key_vector *tn)
{
unsigned long i;
/* update all of the child parent pointers */
for (i = child_length(tn); i;) {
struct key_vector *inode = get_child(tn, --i);
if (!inode)
continue;
/* Either update the children of a tnode that
* already belongs to us or update the child
* to point to ourselves.
*/
if (node_parent(inode) == tn)
update_children(inode);
else
node_set_parent(inode, tn);
}
}
static inline void put_child_root(struct key_vector *tp, t_key key,
struct key_vector *n)
{
if (IS_TRIE(tp))
rcu_assign_pointer(tp->tnode[0], n);
else
put_child(tp, get_index(key, tp), n);
}
static inline void tnode_free_init(struct key_vector *tn)
{
tn_info(tn)->rcu.next = NULL;
}
static inline void tnode_free_append(struct key_vector *tn,
struct key_vector *n)
{
tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
tn_info(tn)->rcu.next = &tn_info(n)->rcu;
}
static void tnode_free(struct key_vector *tn)
{
struct callback_head *head = &tn_info(tn)->rcu;
while (head) {
head = head->next;
tnode_free_size += TNODE_SIZE(1ul << tn->bits);
node_free(tn);
tn = container_of(head, struct tnode, rcu)->kv;
}
if (tnode_free_size >= sysctl_fib_sync_mem) {
tnode_free_size = 0;
synchronize_rcu();
}
}
static struct key_vector *replace(struct trie *t,
struct key_vector *oldtnode,
struct key_vector *tn)
{
struct key_vector *tp = node_parent(oldtnode);
unsigned long i;
/* setup the parent pointer out of and back into this node */
NODE_INIT_PARENT(tn, tp);
put_child_root(tp, tn->key, tn);
/* update all of the child parent pointers */
update_children(tn);
/* all pointers should be clean so we are done */
tnode_free(oldtnode);
/* resize children now that oldtnode is freed */
for (i = child_length(tn); i;) {
struct key_vector *inode = get_child(tn, --i);
/* resize child node */
if (tnode_full(tn, inode))
tn = resize(t, inode);
}
return tp;
}
static struct key_vector *inflate(struct trie *t,
struct key_vector *oldtnode)
{
struct key_vector *tn;
unsigned long i;
t_key m;
pr_debug("In inflate\n");
tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
if (!tn)
goto notnode;
/* prepare oldtnode to be freed */
tnode_free_init(oldtnode);
/* Assemble all of the pointers in our cluster, in this case that
* represents all of the pointers out of our allocated nodes that
* point to existing tnodes and the links between our allocated
* nodes.
*/
for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
struct key_vector *inode = get_child(oldtnode, --i);
struct key_vector *node0, *node1;
unsigned long j, k;
/* An empty child */
if (!inode)
continue;
/* A leaf or an internal node with skipped bits */
if (!tnode_full(oldtnode, inode)) {
put_child(tn, get_index(inode->key, tn), inode);
continue;
}
/* drop the node in the old tnode free list */
tnode_free_append(oldtnode, inode);
/* An internal node with two children */
if (inode->bits == 1) {
put_child(tn, 2 * i + 1, get_child(inode, 1));
put_child(tn, 2 * i, get_child(inode, 0));
continue;
}
/* We will replace this node 'inode' with two new
* ones, 'node0' and 'node1', each with half of the
* original children. The two new nodes will have
* a position one bit further down the key and this
* means that the "significant" part of their keys
* (see the discussion near the top of this file)
* will differ by one bit, which will be "0" in
* node0's key and "1" in node1's key. Since we are
* moving the key position by one step, the bit that
* we are moving away from - the bit at position
* (tn->pos) - is the one that will differ between
* node0 and node1. So... we synthesize that bit in the
* two new keys.
*/
node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
if (!node1)
goto nomem;
node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
tnode_free_append(tn, node1);
if (!node0)
goto nomem;
tnode_free_append(tn, node0);
/* populate child pointers in new nodes */
for (k = child_length(inode), j = k / 2; j;) {
put_child(node1, --j, get_child(inode, --k));
put_child(node0, j, get_child(inode, j));
put_child(node1, --j, get_child(inode, --k));
put_child(node0, j, get_child(inode, j));
}
/* link new nodes to parent */
NODE_INIT_PARENT(node1, tn);
NODE_INIT_PARENT(node0, tn);
/* link parent to nodes */
put_child(tn, 2 * i + 1, node1);
put_child(tn, 2 * i, node0);
}
/* setup the parent pointers into and out of this node */
return replace(t, oldtnode, tn);
nomem:
/* all pointers should be clean so we are done */
tnode_free(tn);
notnode:
return NULL;
}
static struct key_vector *halve(struct trie *t,
struct key_vector *oldtnode)
{
struct key_vector *tn;
unsigned long i;
pr_debug("In halve\n");
tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
if (!tn)
goto notnode;
/* prepare oldtnode to be freed */
tnode_free_init(oldtnode);
/* Assemble all of the pointers in our cluster, in this case that
* represents all of the pointers out of our allocated nodes that
* point to existing tnodes and the links between our allocated
* nodes.
*/
for (i = child_length(oldtnode); i;) {
struct key_vector *node1 = get_child(oldtnode, --i);
struct key_vector *node0 = get_child(oldtnode, --i);
struct key_vector *inode;
/* At least one of the children is empty */
if (!node1 || !node0) {
put_child(tn, i / 2, node1 ? : node0);
continue;
}
/* Two nonempty children */
inode = tnode_new(node0->key, oldtnode->pos, 1);
if (!inode)
goto nomem;
tnode_free_append(tn, inode);
/* initialize pointers out of node */
put_child(inode, 1, node1);
put_child(inode, 0, node0);
NODE_INIT_PARENT(inode, tn);
/* link parent to node */
put_child(tn, i / 2, inode);
}
/* setup the parent pointers into and out of this node */
return replace(t, oldtnode, tn);
nomem:
/* all pointers should be clean so we are done */
tnode_free(tn);
notnode:
return NULL;
}
static struct key_vector *collapse(struct trie *t,
struct key_vector *oldtnode)
{
struct key_vector *n, *tp;
unsigned long i;
/* scan the tnode looking for that one child that might still exist */
for (n = NULL, i = child_length(oldtnode); !n && i;)
n = get_child(oldtnode, --i);
/* compress one level */
tp = node_parent(oldtnode);
put_child_root(tp, oldtnode->key, n);
node_set_parent(n, tp);
/* drop dead node */
node_free(oldtnode);
return tp;
}
static unsigned char update_suffix(struct key_vector *tn)
{
unsigned char slen = tn->pos;
unsigned long stride, i;
unsigned char slen_max;
/* only vector 0 can have a suffix length greater than or equal to
* tn->pos + tn->bits, the second highest node will have a suffix
* length at most of tn->pos + tn->bits - 1
*/
slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);
/* search though the list of children looking for nodes that might
* have a suffix greater than the one we currently have. This is
* why we start with a stride of 2 since a stride of 1 would
* represent the nodes with suffix length equal to tn->pos
*/
for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
struct key_vector *n = get_child(tn, i);
if (!n || (n->slen <= slen))
continue;
/* update stride and slen based on new value */
stride <<= (n->slen - slen);
slen = n->slen;
i &= ~(stride - 1);
/* stop searching if we have hit the maximum possible value */
if (slen >= slen_max)
break;
}
tn->slen = slen;
return slen;
}
/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
* the Helsinki University of Technology and Matti Tikkanen of Nokia
* Telecommunications, page 6:
* "A node is doubled if the ratio of non-empty children to all
* children in the *doubled* node is at least 'high'."
*
* 'high' in this instance is the variable 'inflate_threshold'. It
* is expressed as a percentage, so we multiply it with
* child_length() and instead of multiplying by 2 (since the
* child array will be doubled by inflate()) and multiplying
* the left-hand side by 100 (to handle the percentage thing) we
* multiply the left-hand side by 50.
*
* The left-hand side may look a bit weird: child_length(tn)
* - tn->empty_children is of course the number of non-null children
* in the current node. tn->full_children is the number of "full"
* children, that is non-null tnodes with a skip value of 0.
* All of those will be doubled in the resulting inflated tnode, so
* we just count them one extra time here.
*
* A clearer way to write this would be:
*
* to_be_doubled = tn->full_children;
* not_to_be_doubled = child_length(tn) - tn->empty_children -
* tn->full_children;
*
* new_child_length = child_length(tn) * 2;
*
* new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
* new_child_length;
* if (new_fill_factor >= inflate_threshold)
*
* ...and so on, tho it would mess up the while () loop.
*
* anyway,
* 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
* inflate_threshold
*
* avoid a division:
* 100 * (not_to_be_doubled + 2*to_be_doubled) >=
* inflate_threshold * new_child_length
*
* expand not_to_be_doubled and to_be_doubled, and shorten:
* 100 * (child_length(tn) - tn->empty_children +
* tn->full_children) >= inflate_threshold * new_child_length
*
* expand new_child_length:
* 100 * (child_length(tn) - tn->empty_children +
* tn->full_children) >=
* inflate_threshold * child_length(tn) * 2
*
* shorten again:
* 50 * (tn->full_children + child_length(tn) -
* tn->empty_children) >= inflate_threshold *
* child_length(tn)
*
*/
static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
{
unsigned long used = child_length(tn);
unsigned long threshold = used;
/* Keep root node larger */
threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
used -= tn_info(tn)->empty_children;
used += tn_info(tn)->full_children;
/* if bits == KEYLENGTH then pos = 0, and will fail below */
return (used > 1) && tn->pos && ((50 * used) >= threshold);
}
static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
{
unsigned long used = child_length(tn);
unsigned long threshold = used;
/* Keep root node larger */
threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
used -= tn_info(tn)->empty_children;
/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
}
static inline bool should_collapse(struct key_vector *tn)
{
unsigned long used = child_length(tn);
used -= tn_info(tn)->empty_children;
/* account for bits == KEYLENGTH case */
if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
used -= KEY_MAX;
/* One child or none, time to drop us from the trie */
return used < 2;
}
#define MAX_WORK 10
static struct key_vector *resize(struct trie *t, struct key_vector *tn)
{
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats = t->stats;
#endif
struct key_vector *tp = node_parent(tn);
unsigned long cindex = get_index(tn->key, tp);
int max_work = MAX_WORK;
pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
tn, inflate_threshold, halve_threshold);
/* track the tnode via the pointer from the parent instead of
* doing it ourselves. This way we can let RCU fully do its
* thing without us interfering
*/
BUG_ON(tn != get_child(tp, cindex));
/* Double as long as the resulting node has a number of
* nonempty nodes that are above the threshold.
*/
while (should_inflate(tp, tn) && max_work) {
tp = inflate(t, tn);
if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->resize_node_skipped);
#endif
break;
}
max_work--;
tn = get_child(tp, cindex);
}
/* update parent in case inflate failed */
tp = node_parent(tn);
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
return tp;
/* Halve as long as the number of empty children in this
* node is above threshold.
*/
while (should_halve(tp, tn) && max_work) {
tp = halve(t, tn);
if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->resize_node_skipped);
#endif
break;
}
max_work--;
tn = get_child(tp, cindex);
}
/* Only one child remains */
if (should_collapse(tn))
return collapse(t, tn);
/* update parent in case halve failed */
return node_parent(tn);
}
static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
{
unsigned char node_slen = tn->slen;
while ((node_slen > tn->pos) && (node_slen > slen)) {
slen = update_suffix(tn);
if (node_slen == slen)
break;
tn = node_parent(tn);
node_slen = tn->slen;
}
}
static void node_push_suffix(struct key_vector *tn, unsigned char slen)
{
while (tn->slen < slen) {
tn->slen = slen;
tn = node_parent(tn);
}
}
/* rcu_read_lock needs to be hold by caller from readside */
static struct key_vector *fib_find_node(struct trie *t,
struct key_vector **tp, u32 key)
{
struct key_vector *pn, *n = t->kv;
unsigned long index = 0;
do {
pn = n;
n = get_child_rcu(n, index);
if (!n)
break;
index = get_cindex(key, n);
/* This bit of code is a bit tricky but it combines multiple
* checks into a single check. The prefix consists of the
* prefix plus zeros for the bits in the cindex. The index
* is the difference between the key and this value. From
* this we can actually derive several pieces of data.
* if (index >= (1ul << bits))
* we have a mismatch in skip bits and failed
* else
* we know the value is cindex
*
* This check is safe even if bits == KEYLENGTH due to the
* fact that we can only allocate a node with 32 bits if a
* long is greater than 32 bits.
*/
if (index >= (1ul << n->bits)) {
n = NULL;
break;
}
/* keep searching until we find a perfect match leaf or NULL */
} while (IS_TNODE(n));
*tp = pn;
return n;
}
/* Return the first fib alias matching TOS with
* priority less than or equal to PRIO.
* If 'find_first' is set, return the first matching
* fib alias, regardless of TOS and priority.
*/
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
u8 tos, u32 prio, u32 tb_id,
bool find_first)
{
struct fib_alias *fa;
if (!fah)
return NULL;
hlist_for_each_entry(fa, fah, fa_list) {
if (fa->fa_slen < slen)
continue;
if (fa->fa_slen != slen)
break;
if (fa->tb_id > tb_id)
continue;
if (fa->tb_id != tb_id)
break;
if (find_first)
return fa;
if (fa->fa_tos > tos)
continue;
if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
return fa;
}
return NULL;
}
static struct fib_alias *
fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
{
u8 slen = KEYLENGTH - fri->dst_len;
struct key_vector *l, *tp;
struct fib_table *tb;
struct fib_alias *fa;
struct trie *t;
tb = fib_get_table(net, fri->tb_id);
if (!tb)
return NULL;
t = (struct trie *)tb->tb_data;
l = fib_find_node(t, &tp, be32_to_cpu(fri->dst));
if (!l)
return NULL;
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
fa->fa_tos == fri->tos && fa->fa_info == fri->fi &&
fa->fa_type == fri->type)
return fa;
}
return NULL;
}
void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
{
struct fib_alias *fa_match;
struct sk_buff *skb;
int err;
rcu_read_lock();
fa_match = fib_find_matching_alias(net, fri);
if (!fa_match)
goto out;
/* These are paired with the WRITE_ONCE() happening in this function.
* The reason is that we are only protected by RCU at this point.
*/
if (READ_ONCE(fa_match->offload) == fri->offload &&
READ_ONCE(fa_match->trap) == fri->trap &&
READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
goto out;
WRITE_ONCE(fa_match->offload, fri->offload);
WRITE_ONCE(fa_match->trap, fri->trap);
/* 2 means send notifications only if offload_failed was changed. */
if (net->ipv4.sysctl_fib_notify_on_flag_change == 2 &&
READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
goto out;
WRITE_ONCE(fa_match->offload_failed, fri->offload_failed);
if (!net->ipv4.sysctl_fib_notify_on_flag_change)
goto out;
skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC);
if (!skb) {
err = -ENOBUFS;
goto errout;
}
err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0);
if (err < 0) {
/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
goto errout;
}
rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC);
goto out;
errout:
rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err);
out:
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set);
static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
while (!IS_TRIE(tn))
tn = resize(t, tn);
}
static int fib_insert_node(struct trie *t, struct key_vector *tp,
struct fib_alias *new, t_key key)
{
struct key_vector *n, *l;
l = leaf_new(key, new);
if (!l)
goto noleaf;
/* retrieve child from parent node */
n = get_child(tp, get_index(key, tp));
/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
*
* Add a new tnode here
* first tnode need some special handling
* leaves us in position for handling as case 3
*/
if (n) {
struct key_vector *tn;
tn = tnode_new(key, __fls(key ^ n->key), 1);
if (!tn)
goto notnode;
/* initialize routes out of node */
NODE_INIT_PARENT(tn, tp);
put_child(tn, get_index(key, tn) ^ 1, n);
/* start adding routes into the node */
put_child_root(tp, key, tn);
node_set_parent(n, tn);
/* parent now has a NULL spot where the leaf can go */
tp = tn;
}
/* Case 3: n is NULL, and will just insert a new leaf */
node_push_suffix(tp, new->fa_slen);
NODE_INIT_PARENT(l, tp);
put_child_root(tp, key, l);
trie_rebalance(t, tp);
return 0;
notnode:
node_free(l);
noleaf:
return -ENOMEM;
}
static int fib_insert_alias(struct trie *t, struct key_vector *tp,
struct key_vector *l, struct fib_alias *new,
struct fib_alias *fa, t_key key)
{
if (!l)
return fib_insert_node(t, tp, new, key);
if (fa) {
hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
} else {
struct fib_alias *last;
hlist_for_each_entry(last, &l->leaf, fa_list) {
if (new->fa_slen < last->fa_slen)
break;
if ((new->fa_slen == last->fa_slen) &&
(new->tb_id > last->tb_id))
break;
fa = last;
}
if (fa)
hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
else
hlist_add_head_rcu(&new->fa_list, &l->leaf);
}
/* if we added to the tail node then we need to update slen */
if (l->slen < new->fa_slen) {
l->slen = new->fa_slen;
node_push_suffix(tp, new->fa_slen);
}
return 0;
}
static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
{
if (plen > KEYLENGTH) {
NL_SET_ERR_MSG(extack, "Invalid prefix length");
return false;
}
if ((plen < KEYLENGTH) && (key << plen)) {
NL_SET_ERR_MSG(extack,
"Invalid prefix for given prefix length");
return false;
}
return true;
}
static void fib_remove_alias(struct trie *t, struct key_vector *tp,
struct key_vector *l, struct fib_alias *old);
/* Caller must hold RTNL. */
int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_config *cfg, struct netlink_ext_ack *extack)
{
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
u16 nlflags = NLM_F_EXCL;
struct fib_info *fi;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
u8 tos = cfg->fc_tos;
u32 key;
int err;
key = ntohl(cfg->fc_dst);
if (!fib_valid_key_len(key, plen, extack))
return -EINVAL;
pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
fi = fib_create_info(cfg, extack);
if (IS_ERR(fi)) {
err = PTR_ERR(fi);
goto err;
}
l = fib_find_node(t, &tp, key);
fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
tb->tb_id, false) : NULL;
/* Now fa, if non-NULL, points to the first fib alias
* with the same keys [prefix,tos,priority], if such key already
* exists or to the node before which we will insert new one.
*
* If fa is NULL, we will need to allocate a new one and
* insert to the tail of the section matching the suffix length
* of the new alias.
*/
if (fa && fa->fa_tos == tos &&
fa->fa_info->fib_priority == fi->fib_priority) {
struct fib_alias *fa_first, *fa_match;
err = -EEXIST;
if (cfg->fc_nlflags & NLM_F_EXCL)
goto out;
nlflags &= ~NLM_F_EXCL;
/* We have 2 goals:
* 1. Find exact match for type, scope, fib_info to avoid
* duplicate routes
* 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
*/
fa_match = NULL;
fa_first = fa;
hlist_for_each_entry_from(fa, fa_list) {
if ((fa->fa_slen != slen) ||
(fa->tb_id != tb->tb_id) ||
(fa->fa_tos != tos))
break;
if (fa->fa_info->fib_priority != fi->fib_priority)
break;
if (fa->fa_type == cfg->fc_type &&
fa->fa_info == fi) {
fa_match = fa;
break;
}
}
if (cfg->fc_nlflags & NLM_F_REPLACE) {
struct fib_info *fi_drop;
u8 state;
nlflags |= NLM_F_REPLACE;
fa = fa_first;
if (fa_match) {
if (fa == fa_match)
err = 0;
goto out;
}
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (!new_fa)
goto out;
fi_drop = fa->fa_info;
new_fa->fa_tos = fa->fa_tos;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
state = fa->fa_state;
new_fa->fa_state = state & ~FA_S_ACCESSED;
new_fa->fa_slen = fa->fa_slen;
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
new_fa->offload = 0;
new_fa->trap = 0;
new_fa->offload_failed = 0;
hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
tb->tb_id, true) == new_fa) {
enum fib_event_type fib_event;
fib_event = FIB_EVENT_ENTRY_REPLACE;
err = call_fib_entry_notifiers(net, fib_event,
key, plen,
new_fa, extack);
if (err) {
hlist_replace_rcu(&new_fa->fa_list,
&fa->fa_list);
goto out_free_new_fa;
}
}
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, nlflags);
alias_free_mem_rcu(fa);
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
goto succeeded;
}
/* Error if we find a perfect match which
* uses the same scope, type, and nexthop
* information.
*/
if (fa_match)
goto out;
if (cfg->fc_nlflags & NLM_F_APPEND)
nlflags |= NLM_F_APPEND;
else
fa = fa_first;
}
err = -ENOENT;
if (!(cfg->fc_nlflags & NLM_F_CREATE))
goto out;
nlflags |= NLM_F_CREATE;
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (!new_fa)
goto out;
new_fa->fa_info = fi;
new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type;
new_fa->fa_state = 0;
new_fa->fa_slen = slen;
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
new_fa->offload = 0;
new_fa->trap = 0;
new_fa->offload_failed = 0;
/* Insert new entry to the list. */
err = fib_insert_alias(t, tp, l, new_fa, fa, key);
if (err)
goto out_free_new_fa;
/* The alias was already inserted, so the node must exist. */
l = l ? l : fib_find_node(t, &tp, key);
if (WARN_ON_ONCE(!l))
goto out_free_new_fa;
if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
new_fa) {
enum fib_event_type fib_event;
fib_event = FIB_EVENT_ENTRY_REPLACE;
err = call_fib_entry_notifiers(net, fib_event, key, plen,
new_fa, extack);
if (err)
goto out_remove_new_fa;
}
if (!plen)
tb->tb_num_default++;
rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, nlflags);
succeeded:
return 0;
out_remove_new_fa:
fib_remove_alias(t, tp, l, new_fa);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
fib_release_info(fi);
err:
return err;
}
static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
{
t_key prefix = n->key;
return (key ^ prefix) & (prefix | -prefix);
}
bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
const struct flowi4 *flp)
{
if (nhc->nhc_flags & RTNH_F_DEAD)
return false;
if (ip_ignore_linkdown(nhc->nhc_dev) &&
nhc->nhc_flags & RTNH_F_LINKDOWN &&
!(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
return false;
if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif)
return false;
}
return true;
}
/* should be called with rcu_read_lock */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
struct trie *t = (struct trie *) tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats = t->stats;
#endif
const t_key key = ntohl(flp->daddr);
struct key_vector *n, *pn;
struct fib_alias *fa;
unsigned long index;
t_key cindex;
pn = t->kv;
cindex = 0;
n = get_child_rcu(pn, cindex);
if (!n) {
trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
return -EAGAIN;
}
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->gets);
#endif
/* Step 1: Travel to the longest prefix match in the trie */
for (;;) {
index = get_cindex(key, n);
/* This bit of code is a bit tricky but it combines multiple
* checks into a single check. The prefix consists of the
* prefix plus zeros for the "bits" in the prefix. The index
* is the difference between the key and this value. From
* this we can actually derive several pieces of data.
* if (index >= (1ul << bits))
* we have a mismatch in skip bits and failed
* else
* we know the value is cindex
*
* This check is safe even if bits == KEYLENGTH due to the
* fact that we can only allocate a node with 32 bits if a
* long is greater than 32 bits.
*/
if (index >= (1ul << n->bits))
break;
/* we have found a leaf. Prefixes have already been compared */
if (IS_LEAF(n))
goto found;
/* only record pn and cindex if we are going to be chopping
* bits later. Otherwise we are just wasting cycles.
*/
if (n->slen > n->pos) {
pn = n;
cindex = index;
}
n = get_child_rcu(n, index);
if (unlikely(!n))
goto backtrace;
}
/* Step 2: Sort out leaves and begin backtracing for longest prefix */
for (;;) {
/* record the pointer where our next node pointer is stored */
struct key_vector __rcu **cptr = n->tnode;
/* This test verifies that none of the bits that differ
* between the key and the prefix exist in the region of
* the lsb and higher in the prefix.
*/
if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
goto backtrace;
/* exit out and process leaf */
if (unlikely(IS_LEAF(n)))
break;
/* Don't bother recording parent info. Since we are in
* prefix match mode we will have to come back to wherever
* we started this traversal anyway
*/
while ((n = rcu_dereference(*cptr)) == NULL) {
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
if (!n)
this_cpu_inc(stats->null_node_hit);
#endif
/* If we are at cindex 0 there are no more bits for
* us to strip at this level so we must ascend back
* up one level to see if there are any more bits to
* be stripped there.
*/
while (!cindex) {
t_key pkey = pn->key;
/* If we don't have a parent then there is
* nothing for us to do as we do not have any
* further nodes to parse.
*/
if (IS_TRIE(pn)) { trace_fib_table_lookup(tb->tb_id, flp,
NULL, -EAGAIN);
return -EAGAIN;
}
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->backtrack);
#endif
/* Get Child's index */
pn = node_parent_rcu(pn);
cindex = get_index(pkey, pn);
}
/* strip the least significant bit from the cindex */
cindex &= cindex - 1;
/* grab pointer for next child node */
cptr = &pn->tnode[cindex];
}
}
found:
/* this line carries forward the xor from earlier in the function */
index = key ^ n->key;
/* Step 3: Process the leaf, if that fails fall back to backtracing */
hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info;
struct fib_nh_common *nhc;
int nhsel, err;
if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
if (index >= (1ul << fa->fa_slen))
continue;
}
if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
continue;
if (fi->fib_dead)
continue;
if (fa->fa_info->fib_scope < flp->flowi4_scope)
continue;
fib_alias_accessed(fa);
err = fib_props[fa->fa_type].error;
if (unlikely(err < 0)) {
out_reject:
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
return err;
}
if (fi->fib_flags & RTNH_F_DEAD)
continue;
if (unlikely(fi->nh)) {
if (nexthop_is_blackhole(fi->nh)) {
err = fib_props[RTN_BLACKHOLE].error;
goto out_reject;
}
nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp,
&nhsel);
if (nhc)
goto set_result;
goto miss;
}
for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
nhc = fib_info_nhc(fi, nhsel);
if (!fib_lookup_good_nhc(nhc, fib_flags, flp))
continue;
set_result:
if (!(fib_flags & FIB_LOOKUP_NOREF)) refcount_inc(&fi->fib_clntref); res->prefix = htonl(n->key);
res->prefixlen = KEYLENGTH - fa->fa_slen;
res->nh_sel = nhsel;
res->nhc = nhc;
res->type = fa->fa_type;
res->scope = fi->fib_scope;
res->fi = fi;
res->table = tb;
res->fa_head = &n->leaf;
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
trace_fib_table_lookup(tb->tb_id, flp, nhc, err);
return err;
}
}
miss:
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_miss);
#endif
goto backtrace;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);
static void fib_remove_alias(struct trie *t, struct key_vector *tp,
struct key_vector *l, struct fib_alias *old)
{
/* record the location of the previous list_info entry */
struct hlist_node **pprev = old->fa_list.pprev;
struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
/* remove the fib_alias from the list */
hlist_del_rcu(&old->fa_list);
/* if we emptied the list this leaf will be freed and we can sort
* out parent suffix lengths as a part of trie_rebalance
*/
if (hlist_empty(&l->leaf)) {
if (tp->slen == l->slen)
node_pull_suffix(tp, tp->pos);
put_child_root(tp, l->key, NULL);
node_free(l);
trie_rebalance(t, tp);
return;
}
/* only access fa if it is pointing at the last valid hlist_node */
if (*pprev)
return;
/* update the trie with the latest suffix length */
l->slen = fa->fa_slen;
node_pull_suffix(tp, fa->fa_slen);
}
static void fib_notify_alias_delete(struct net *net, u32 key,
struct hlist_head *fah,
struct fib_alias *fa_to_delete,
struct netlink_ext_ack *extack)
{
struct fib_alias *fa_next, *fa_to_notify;
u32 tb_id = fa_to_delete->tb_id;
u8 slen = fa_to_delete->fa_slen;
enum fib_event_type fib_event;
/* Do not notify if we do not care about the route. */
if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
return;
/* Determine if the route should be replaced by the next route in the
* list.
*/
fa_next = hlist_entry_safe(fa_to_delete->fa_list.next,
struct fib_alias, fa_list);
if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) {
fib_event = FIB_EVENT_ENTRY_REPLACE;
fa_to_notify = fa_next;
} else {
fib_event = FIB_EVENT_ENTRY_DEL;
fa_to_notify = fa_to_delete;
}
call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen,
fa_to_notify, extack);
}
/* Caller must hold RTNL. */
int fib_table_delete(struct net *net, struct fib_table *tb,
struct fib_config *cfg, struct netlink_ext_ack *extack)
{
struct trie *t = (struct trie *) tb->tb_data;
struct fib_alias *fa, *fa_to_delete;
struct key_vector *l, *tp;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
u8 tos = cfg->fc_tos;
u32 key;
key = ntohl(cfg->fc_dst);
if (!fib_valid_key_len(key, plen, extack))
return -EINVAL;
l = fib_find_node(t, &tp, key);
if (!l)
return -ESRCH;
fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false);
if (!fa)
return -ESRCH;
pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
fa_to_delete = NULL;
hlist_for_each_entry_from(fa, fa_list) {
struct fib_info *fi = fa->fa_info;
if ((fa->fa_slen != slen) ||
(fa->tb_id != tb->tb_id) ||
(fa->fa_tos != tos))
break;
if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
(cfg->fc_scope == RT_SCOPE_NOWHERE ||
fa->fa_info->fib_scope == cfg->fc_scope) &&
(!cfg->fc_prefsrc ||
fi->fib_prefsrc == cfg->fc_prefsrc) &&
(!cfg->fc_protocol ||
fi->fib_protocol == cfg->fc_protocol) &&
fib_nh_match(net, cfg, fi, extack) == 0 &&
fib_metrics_match(cfg, fi)) {
fa_to_delete = fa;
break;
}
}
if (!fa_to_delete)
return -ESRCH;
fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
if (!plen)
tb->tb_num_default--;
fib_remove_alias(t, tp, l, fa_to_delete);
if (fa_to_delete->fa_state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
fib_release_info(fa_to_delete->fa_info);
alias_free_mem_rcu(fa_to_delete);
return 0;
}
/* Scan for the next leaf starting at the provided key value */
static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
{
struct key_vector *pn, *n = *tn;
unsigned long cindex;
/* this loop is meant to try and find the key in the trie */
do {
/* record parent and next child index */
pn = n;
cindex = (key > pn->key) ? get_index(key, pn) : 0;
if (cindex >> pn->bits)
break;
/* descend into the next child */
n = get_child_rcu(pn, cindex++);
if (!n)
break;
/* guarantee forward progress on the keys */
if (IS_LEAF(n) && (n->key >= key))
goto found;
} while (IS_TNODE(n));
/* this loop will search for the next leaf with a greater key */
while (!IS_TRIE(pn)) {
/* if we exhausted the parent node we will need to climb */
if (cindex >= (1ul << pn->bits)) {
t_key pkey = pn->key;
pn = node_parent_rcu(pn);
cindex = get_index(pkey, pn) + 1;
continue;
}
/* grab the next available node */
n = get_child_rcu(pn, cindex++);
if (!n)
continue;
/* no need to compare keys since we bumped the index */
if (IS_LEAF(n))
goto found;
/* Rescan start scanning in new node */
pn = n;
cindex = 0;
}
*tn = pn;
return NULL; /* Root of trie */
found:
/* if we are at the limit for keys just return NULL for the tnode */
*tn = pn;
return n;
}
static void fib_trie_free(struct fib_table *tb)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *pn = t->kv;
unsigned long cindex = 1;
struct hlist_node *tmp;
struct fib_alias *fa;
/* walk trie in reverse order and free everything */
for (;;) {
struct key_vector *n;
if (!(cindex--)) {
t_key pkey = pn->key;
if (IS_TRIE(pn))
break;
n = pn;
pn = node_parent(pn);
/* drop emptied tnode */
put_child_root(pn, n->key, NULL);
node_free(n);
cindex = get_index(pkey, pn);
continue;
}
/* grab the next available node */
n = get_child(pn, cindex);
if (!n)
continue;
if (IS_TNODE(n)) {
/* record pn and cindex for leaf walking */
pn = n;
cindex = 1ul << n->bits;
continue;
}
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
hlist_del_rcu(&fa->fa_list);
alias_free_mem_rcu(fa);
}
put_child_root(pn, n->key, NULL);
node_free(n);
}
#ifdef CONFIG_IP_FIB_TRIE_STATS
free_percpu(t->stats);
#endif
kfree(tb);
}
struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
{
struct trie *ot = (struct trie *)oldtb->tb_data;
struct key_vector *l, *tp = ot->kv;
struct fib_table *local_tb;
struct fib_alias *fa;
struct trie *lt;
t_key key = 0;
if (oldtb->tb_data == oldtb->__data)
return oldtb;
local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
if (!local_tb)
return NULL;
lt = (struct trie *)local_tb->tb_data;
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
struct key_vector *local_l = NULL, *local_tp;
hlist_for_each_entry(fa, &l->leaf, fa_list) {
struct fib_alias *new_fa;
if (local_tb->tb_id != fa->tb_id)
continue;
/* clone fa for new local table */
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (!new_fa)
goto out;
memcpy(new_fa, fa, sizeof(*fa));
/* insert clone into table */
if (!local_l)
local_l = fib_find_node(lt, &local_tp, l->key);
if (fib_insert_alias(lt, local_tp, local_l, new_fa,
NULL, l->key)) {
kmem_cache_free(fn_alias_kmem, new_fa);
goto out;
}
}
/* stop loop if key wrapped back to 0 */
key = l->key + 1;
if (key < l->key)
break;
}
return local_tb;
out:
fib_trie_free(local_tb);
return NULL;
}
/* Caller must hold RTNL */
void fib_table_flush_external(struct fib_table *tb)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *pn = t->kv;
unsigned long cindex = 1;
struct hlist_node *tmp;
struct fib_alias *fa;
/* walk trie in reverse order */
for (;;) {
unsigned char slen = 0;
struct key_vector *n;
if (!(cindex--)) {
t_key pkey = pn->key;
/* cannot resize the trie vector */
if (IS_TRIE(pn))
break;
/* update the suffix to address pulled leaves */
if (pn->slen > pn->pos)
update_suffix(pn);
/* resize completed node */
pn = resize(t, pn);
cindex = get_index(pkey, pn);
continue;
}
/* grab the next available node */
n = get_child(pn, cindex);
if (!n)
continue;
if (IS_TNODE(n)) {
/* record pn and cindex for leaf walking */
pn = n;
cindex = 1ul << n->bits;
continue;
}
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
/* if alias was cloned to local then we just
* need to remove the local copy from main
*/
if (tb->tb_id != fa->tb_id) {
hlist_del_rcu(&fa->fa_list);
alias_free_mem_rcu(fa);
continue;
}
/* record local slen */
slen = fa->fa_slen;
}
/* update leaf slen */
n->slen = slen;
if (hlist_empty(&n->leaf)) {
put_child_root(pn, n->key, NULL);
node_free(n);
}
}
}
/* Caller must hold RTNL. */
int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *pn = t->kv;
unsigned long cindex = 1;
struct hlist_node *tmp;
struct fib_alias *fa;
int found = 0;
/* walk trie in reverse order */
for (;;) {
unsigned char slen = 0;
struct key_vector *n;
if (!(cindex--)) {
t_key pkey = pn->key;
/* cannot resize the trie vector */
if (IS_TRIE(pn))
break;
/* update the suffix to address pulled leaves */
if (pn->slen > pn->pos)
update_suffix(pn);
/* resize completed node */
pn = resize(t, pn);
cindex = get_index(pkey, pn);
continue;
}
/* grab the next available node */
n = get_child(pn, cindex);
if (!n)
continue;
if (IS_TNODE(n)) {
/* record pn and cindex for leaf walking */
pn = n;
cindex = 1ul << n->bits;
continue;
}
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
if (!fi || tb->tb_id != fa->tb_id ||
(!(fi->fib_flags & RTNH_F_DEAD) &&
!fib_props[fa->fa_type].error)) {
slen = fa->fa_slen;
continue;
}
/* Do not flush error routes if network namespace is
* not being dismantled
*/
if (!flush_all && fib_props[fa->fa_type].error) {
slen = fa->fa_slen;
continue;
}
fib_notify_alias_delete(net, n->key, &n->leaf, fa,
NULL);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
found++;
}
/* update leaf slen */
n->slen = slen;
if (hlist_empty(&n->leaf)) {
put_child_root(pn, n->key, NULL);
node_free(n);
}
}
pr_debug("trie_flush found=%d\n", found);
return found;
}
/* derived from fib_trie_free */
static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
struct nl_info *info)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *pn = t->kv;
unsigned long cindex = 1;
struct fib_alias *fa;
for (;;) {
struct key_vector *n;
if (!(cindex--)) {
t_key pkey = pn->key;
if (IS_TRIE(pn))
break;
pn = node_parent(pn);
cindex = get_index(pkey, pn);
continue;
}
/* grab the next available node */
n = get_child(pn, cindex);
if (!n)
continue;
if (IS_TNODE(n)) {
/* record pn and cindex for leaf walking */
pn = n;
cindex = 1ul << n->bits;
continue;
}
hlist_for_each_entry(fa, &n->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id)
continue;
rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
KEYLENGTH - fa->fa_slen, tb->tb_id,
info, NLM_F_REPLACE);
}
}
}
void fib_info_notify_update(struct net *net, struct nl_info *info)
{
unsigned int h;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, head, tb_hlist,
lockdep_rtnl_is_held())
__fib_info_notify_update(net, tb, info);
}
}
static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
struct notifier_block *nb,
struct netlink_ext_ack *extack)
{
struct fib_alias *fa;
int last_slen = -1;
int err;
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
if (!fi)
continue;
/* local and main table can share the same trie,
* so don't notify twice for the same entry.
*/
if (tb->tb_id != fa->tb_id)
continue;
if (fa->fa_slen == last_slen)
continue;
last_slen = fa->fa_slen;
err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE,
l->key, KEYLENGTH - fa->fa_slen,
fa, extack);
if (err)
return err;
}
return 0;
}
static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb,
struct netlink_ext_ack *extack)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *l, *tp = t->kv;
t_key key = 0;
int err;
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
err = fib_leaf_notify(l, tb, nb, extack);
if (err)
return err;
key = l->key + 1;
/* stop in case of wrap around */
if (key < l->key)
break;
}
return 0;
}
int fib_notify(struct net *net, struct notifier_block *nb,
struct netlink_ext_ack *extack)
{
unsigned int h;
int err;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, head, tb_hlist) {
err = fib_table_notify(tb, nb, extack);
if (err)
return err;
}
}
return 0;
}
static void __trie_free_rcu(struct rcu_head *head)
{
struct fib_table *tb = container_of(head, struct fib_table, rcu);
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie *t = (struct trie *)tb->tb_data;
if (tb->tb_data == tb->__data)
free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
kfree(tb);
}
void fib_free_table(struct fib_table *tb)
{
call_rcu(&tb->rcu, __trie_free_rcu);
}
static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb,
struct fib_dump_filter *filter)
{
unsigned int flags = NLM_F_MULTI;
__be32 xkey = htonl(l->key);
int i, s_i, i_fa, s_fa, err;
struct fib_alias *fa;
if (filter->filter_set ||
!filter->dump_exceptions || !filter->dump_routes)
flags |= NLM_F_DUMP_FILTERED;
s_i = cb->args[4];
s_fa = cb->args[5];
i = 0;
/* rcu_read_lock is hold by caller */
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
if (i < s_i)
goto next;
i_fa = 0;
if (tb->tb_id != fa->tb_id)
goto next;
if (filter->filter_set) {
if (filter->rt_type && fa->fa_type != filter->rt_type)
goto next;
if ((filter->protocol &&
fi->fib_protocol != filter->protocol))
goto next;
if (filter->dev &&
!fib_info_nh_uses_dev(fi, filter->dev))
goto next;
}
if (filter->dump_routes) {
if (!s_fa) {
struct fib_rt_info fri;
fri.fi = fi;
fri.tb_id = tb->tb_id;
fri.dst = xkey;
fri.dst_len = KEYLENGTH - fa->fa_slen;
fri.tos = fa->fa_tos;
fri.type = fa->fa_type;
fri.offload = READ_ONCE(fa->offload);
fri.trap = READ_ONCE(fa->trap);
fri.offload_failed = READ_ONCE(fa->offload_failed);
err = fib_dump_info(skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWROUTE, &fri, flags);
if (err < 0)
goto stop;
}
i_fa++;
}
if (filter->dump_exceptions) {
err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
&i_fa, s_fa, flags);
if (err < 0)
goto stop;
}
next:
i++;
}
cb->args[4] = i;
return skb->len;
stop:
cb->args[4] = i;
cb->args[5] = i_fa;
return err;
}
/* rcu_read_lock needs to be hold by caller from readside */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
struct netlink_callback *cb, struct fib_dump_filter *filter)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *l, *tp = t->kv;
/* Dump starting at last key.
* Note: 0.0.0.0/0 (ie default) is first key.
*/
int count = cb->args[2];
t_key key = cb->args[3];
/* First time here, count and key are both always 0. Count > 0
* and key == 0 means the dump has wrapped around and we are done.
*/
if (count && !key)
return skb->len;
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
int err;
err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
if (err < 0) {
cb->args[3] = key;
cb->args[2] = count;
return err;
}
++count;
key = l->key + 1;
memset(&cb->args[4], 0,
sizeof(cb->args) - 4*sizeof(cb->args[0]));
/* stop loop if key wrapped back to 0 */
if (key < l->key)
break;
}
cb->args[3] = key;
cb->args[2] = count;
return skb->len;
}
void __init fib_trie_init(void)
{
fn_alias_kmem = kmem_cache_create("ip_fib_alias",
sizeof(struct fib_alias),
0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
LEAF_SIZE,
0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
}
struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
{
struct fib_table *tb;
struct trie *t;
size_t sz = sizeof(*tb);
if (!alias)
sz += sizeof(struct trie);
tb = kzalloc(sz, GFP_KERNEL);
if (!tb)
return NULL;
tb->tb_id = id;
tb->tb_num_default = 0;
tb->tb_data = (alias ? alias->__data : tb->__data);
if (alias)
return tb;
t = (struct trie *) tb->tb_data;
t->kv[0].pos = KEYLENGTH;
t->kv[0].slen = KEYLENGTH;
#ifdef CONFIG_IP_FIB_TRIE_STATS
t->stats = alloc_percpu(struct trie_use_stats);
if (!t->stats) {
kfree(tb);
tb = NULL;
}
#endif
return tb;
}
#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
struct seq_net_private p;
struct fib_table *tb;
struct key_vector *tnode;
unsigned int index;
unsigned int depth;
};
static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
{
unsigned long cindex = iter->index;
struct key_vector *pn = iter->tnode;
t_key pkey;
pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
iter->tnode, iter->index, iter->depth);
while (!IS_TRIE(pn)) {
while (cindex < child_length(pn)) {
struct key_vector *n = get_child_rcu(pn, cindex++);
if (!n)
continue;
if (IS_LEAF(n)) {
iter->tnode = pn;
iter->index = cindex;
} else {
/* push down one level */
iter->tnode = n;
iter->index = 0;
++iter->depth;
}
return n;
}
/* Current node exhausted, pop back up */
pkey = pn->key;
pn = node_parent_rcu(pn);
cindex = get_index(pkey, pn) + 1;
--iter->depth;
}
/* record root node so further searches know we are done */
iter->tnode = pn;
iter->index = 0;
return NULL;
}
static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
struct trie *t)
{
struct key_vector *n, *pn;
if (!t)
return NULL;
pn = t->kv;
n = rcu_dereference(pn->tnode[0]);
if (!n)
return NULL;
if (IS_TNODE(n)) {
iter->tnode = n;
iter->index = 0;
iter->depth = 1;
} else {
iter->tnode = pn;
iter->index = 0;
iter->depth = 0;
}
return n;
}
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
struct key_vector *n;
struct fib_trie_iter iter;
memset(s, 0, sizeof(*s));
rcu_read_lock();
for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
if (IS_LEAF(n)) {
struct fib_alias *fa;
s->leaves++;
s->totdepth += iter.depth;
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
++s->prefixes;
} else {
s->tnodes++;
if (n->bits < MAX_STAT_DEPTH)
s->nodesizes[n->bits]++;
s->nullpointers += tn_info(n)->empty_children;
}
}
rcu_read_unlock();
}
/*
* This outputs /proc/net/fib_triestats
*/
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
unsigned int i, max, pointers, bytes, avdepth;
if (stat->leaves)
avdepth = stat->totdepth*100 / stat->leaves;
else
avdepth = 0;
seq_printf(seq, "\tAver depth: %u.%02d\n",
avdepth / 100, avdepth % 100);
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
bytes = LEAF_SIZE * stat->leaves;
seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
bytes += sizeof(struct fib_alias) * stat->prefixes;
seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
bytes += TNODE_SIZE(0) * stat->tnodes;
max = MAX_STAT_DEPTH;
while (max > 0 && stat->nodesizes[max-1] == 0)
max--;
pointers = 0;
for (i = 1; i < max; i++)
if (stat->nodesizes[i] != 0) {
seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
pointers += (1<<i) * stat->nodesizes[i];
}
seq_putc(seq, '\n');
seq_printf(seq, "\tPointers: %u\n", pointers);
bytes += sizeof(struct key_vector *) * pointers;
seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}
#ifdef CONFIG_IP_FIB_TRIE_STATS
static void trie_show_usage(struct seq_file *seq,
const struct trie_use_stats __percpu *stats)
{
struct trie_use_stats s = { 0 };
int cpu;
/* loop through all of the CPUs and gather up the stats */
for_each_possible_cpu(cpu) {
const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
s.gets += pcpu->gets;
s.backtrack += pcpu->backtrack;
s.semantic_match_passed += pcpu->semantic_match_passed;
s.semantic_match_miss += pcpu->semantic_match_miss;
s.null_node_hit += pcpu->null_node_hit;
s.resize_node_skipped += pcpu->resize_node_skipped;
}
seq_printf(seq, "\nCounters:\n---------\n");
seq_printf(seq, "gets = %u\n", s.gets);
seq_printf(seq, "backtracks = %u\n", s.backtrack);
seq_printf(seq, "semantic match passed = %u\n",
s.semantic_match_passed);
seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
}
#endif /* CONFIG_IP_FIB_TRIE_STATS */
static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
{
if (tb->tb_id == RT_TABLE_LOCAL)
seq_puts(seq, "Local:\n");
else if (tb->tb_id == RT_TABLE_MAIN)
seq_puts(seq, "Main:\n");
else
seq_printf(seq, "Id %d:\n", tb->tb_id);
}
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
struct net *net = (struct net *)seq->private;
unsigned int h;
seq_printf(seq,
"Basic info: size of leaf:"
" %zd bytes, size of tnode: %zd bytes.\n",
LEAF_SIZE, TNODE_SIZE(0));
rcu_read_lock();
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, head, tb_hlist) {
struct trie *t = (struct trie *) tb->tb_data;
struct trie_stat stat;
if (!t)
continue;
fib_table_print(seq, tb);
trie_collect_stats(t, &stat);
trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
trie_show_usage(seq, t->stats);
#endif
}
cond_resched_rcu();
}
rcu_read_unlock();
return 0;
}
static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
struct fib_trie_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
loff_t idx = 0;
unsigned int h;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, head, tb_hlist) {
struct key_vector *n;
for (n = fib_trie_get_first(iter,
(struct trie *) tb->tb_data);
n; n = fib_trie_get_next(iter))
if (pos == idx++) {
iter->tb = tb;
return n;
}
}
}
return NULL;
}
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
rcu_read_lock();
return fib_trie_get_idx(seq, *pos);
}
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct fib_trie_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
struct fib_table *tb = iter->tb;
struct hlist_node *tb_node;
unsigned int h;
struct key_vector *n;
++*pos;
/* next node in same table */
n = fib_trie_get_next(iter);
if (n)
return n;
/* walk rest of this hash chain */
h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
if (n)
goto found;
}
/* new hash chain */
while (++h < FIB_TABLE_HASHSZ) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
hlist_for_each_entry_rcu(tb, head, tb_hlist) {
n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
if (n)
goto found;
}
}
return NULL;
found:
iter->tb = tb;
return n;
}
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
__releases(RCU)
{
rcu_read_unlock();
}
static void seq_indent(struct seq_file *seq, int n)
{
while (n-- > 0)
seq_puts(seq, " ");
}
static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
{
switch (s) {
case RT_SCOPE_UNIVERSE: return "universe";
case RT_SCOPE_SITE: return "site";
case RT_SCOPE_LINK: return "link";
case RT_SCOPE_HOST: return "host";
case RT_SCOPE_NOWHERE: return "nowhere";
default:
snprintf(buf, len, "scope=%d", s);
return buf;
}
}
static const char *const rtn_type_names[__RTN_MAX] = {
[RTN_UNSPEC] = "UNSPEC",
[RTN_UNICAST] = "UNICAST",
[RTN_LOCAL] = "LOCAL",
[RTN_BROADCAST] = "BROADCAST",
[RTN_ANYCAST] = "ANYCAST",
[RTN_MULTICAST] = "MULTICAST",
[RTN_BLACKHOLE] = "BLACKHOLE",
[RTN_UNREACHABLE] = "UNREACHABLE",
[RTN_PROHIBIT] = "PROHIBIT",
[RTN_THROW] = "THROW",
[RTN_NAT] = "NAT",
[RTN_XRESOLVE] = "XRESOLVE",
};
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
{
if (t < __RTN_MAX && rtn_type_names[t])
return rtn_type_names[t];
snprintf(buf, len, "type %u", t);
return buf;
}
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
struct key_vector *n = v;
if (IS_TRIE(node_parent_rcu(n)))
fib_table_print(seq, iter->tb);
if (IS_TNODE(n)) {
__be32 prf = htonl(n->key);
seq_indent(seq, iter->depth-1);
seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
&prf, KEYLENGTH - n->pos - n->bits, n->bits,
tn_info(n)->full_children,
tn_info(n)->empty_children);
} else {
__be32 val = htonl(n->key);
struct fib_alias *fa;
seq_indent(seq, iter->depth);
seq_printf(seq, " |-- %pI4\n", &val);
hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
char buf1[32], buf2[32];
seq_indent(seq, iter->depth + 1);
seq_printf(seq, " /%zu %s %s",
KEYLENGTH - fa->fa_slen,
rtn_scope(buf1, sizeof(buf1),
fa->fa_info->fib_scope),
rtn_type(buf2, sizeof(buf2),
fa->fa_type));
if (fa->fa_tos)
seq_printf(seq, " tos=%d", fa->fa_tos);
seq_putc(seq, '\n');
}
}
return 0;
}
static const struct seq_operations fib_trie_seq_ops = {
.start = fib_trie_seq_start,
.next = fib_trie_seq_next,
.stop = fib_trie_seq_stop,
.show = fib_trie_seq_show,
};
struct fib_route_iter {
struct seq_net_private p;
struct fib_table *main_tb;
struct key_vector *tnode;
loff_t pos;
t_key key;
};
static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
loff_t pos)
{
struct key_vector *l, **tp = &iter->tnode;
t_key key;
/* use cached location of previously found key */
if (iter->pos > 0 && pos >= iter->pos) {
key = iter->key;
} else {
iter->pos = 1;
key = 0;
}
pos -= iter->pos;
while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
key = l->key + 1;
iter->pos++;
l = NULL;
/* handle unlikely case of a key wrap */
if (!key)
break;
}
if (l)
iter->key = l->key; /* remember it */
else
iter->pos = 0; /* forget it */
return l;
}
static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
struct fib_route_iter *iter = seq->private;
struct fib_table *tb;
struct trie *t;
rcu_read_lock();
tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
if (!tb)
return NULL;
iter->main_tb = tb;
t = (struct trie *)tb->tb_data;
iter->tnode = t->kv;
if (*pos != 0)
return fib_route_get_idx(iter, *pos);
iter->pos = 0;
iter->key = KEY_MAX;
return SEQ_START_TOKEN;
}
static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct fib_route_iter *iter = seq->private;
struct key_vector *l = NULL;
t_key key = iter->key + 1;
++*pos;
/* only allow key of 0 for start of sequence */
if ((v == SEQ_START_TOKEN) || key)
l = leaf_walk_rcu(&iter->tnode, key);
if (l) {
iter->key = l->key;
iter->pos++;
} else {
iter->pos = 0;
}
return l;
}
static void fib_route_seq_stop(struct seq_file *seq, void *v)
__releases(RCU)
{
rcu_read_unlock();
}
static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
{
unsigned int flags = 0;
if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
flags = RTF_REJECT;
if (fi) {
const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
if (nhc->nhc_gw.ipv4)
flags |= RTF_GATEWAY;
}
if (mask == htonl(0xFFFFFFFF))
flags |= RTF_HOST;
flags |= RTF_UP;
return flags;
}
/*
* This outputs /proc/net/route.
* The format of the file is not supposed to be changed
* and needs to be same as fib_hash output to avoid breaking
* legacy utilities
*/
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
struct fib_route_iter *iter = seq->private;
struct fib_table *tb = iter->main_tb;
struct fib_alias *fa;
struct key_vector *l = v;
__be32 prefix;
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
"\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
"\tWindow\tIRTT");
return 0;
}
prefix = htonl(l->key);
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
if ((fa->fa_type == RTN_BROADCAST) ||
(fa->fa_type == RTN_MULTICAST))
continue;
if (fa->tb_id != tb->tb_id)
continue;
seq_setwidth(seq, 127);
if (fi) {
struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
__be32 gw = 0;
if (nhc->nhc_gw_family == AF_INET)
gw = nhc->nhc_gw.ipv4;
seq_printf(seq,
"%s\t%08X\t%08X\t%04X\t%d\t%u\t"
"%d\t%08X\t%d\t%u\t%u",
nhc->nhc_dev ? nhc->nhc_dev->name : "*",
prefix, gw, flags, 0, 0,
fi->fib_priority,
mask,
(fi->fib_advmss ?
fi->fib_advmss + 40 : 0),
fi->fib_window,
fi->fib_rtt >> 3);
} else {
seq_printf(seq,
"*\t%08X\t%08X\t%04X\t%d\t%u\t"
"%d\t%08X\t%d\t%u\t%u",
prefix, 0, flags, 0, 0, 0,
mask, 0, 0, 0);
}
seq_pad(seq, '\n');
}
return 0;
}
static const struct seq_operations fib_route_seq_ops = {
.start = fib_route_seq_start,
.next = fib_route_seq_next,
.stop = fib_route_seq_stop,
.show = fib_route_seq_show,
};
int __net_init fib_proc_init(struct net *net)
{
if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops,
sizeof(struct fib_trie_iter)))
goto out1;
if (!proc_create_net_single("fib_triestat", 0444, net->proc_net,
fib_triestat_seq_show, NULL))
goto out2;
if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops,
sizeof(struct fib_route_iter)))
goto out3;
return 0;
out3:
remove_proc_entry("fib_triestat", net->proc_net);
out2:
remove_proc_entry("fib_trie", net->proc_net);
out1:
return -ENOMEM;
}
void __net_exit fib_proc_exit(struct net *net)
{
remove_proc_entry("fib_trie", net->proc_net);
remove_proc_entry("fib_triestat", net->proc_net);
remove_proc_entry("route", net->proc_net);
}
#endif /* CONFIG_PROC_FS */
// SPDX-License-Identifier: GPL-2.0
/*
* kobject.h - generic kernel object infrastructure.
*
* Copyright (c) 2002-2003 Patrick Mochel
* Copyright (c) 2002-2003 Open Source Development Labs
* Copyright (c) 2006-2008 Greg Kroah-Hartman <greg@kroah.com>
* Copyright (c) 2006-2008 Novell Inc.
*
* Please read Documentation/core-api/kobject.rst before using the kobject
* interface, ESPECIALLY the parts about reference counts and object
* destructors.
*/
#ifndef _KOBJECT_H_
#define _KOBJECT_H_
#include <linux/types.h>
#include <linux/list.h>
#include <linux/sysfs.h>
#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/kref.h>
#include <linux/kobject_ns.h>
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/workqueue.h>
#include <linux/uidgid.h>
#define UEVENT_HELPER_PATH_LEN 256
#define UEVENT_NUM_ENVP 64 /* number of env pointers */
#define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */
#ifdef CONFIG_UEVENT_HELPER
/* path to the userspace helper executed on an event */
extern char uevent_helper[];
#endif
/* counter to tag the uevent, read only except for the kobject core */
extern u64 uevent_seqnum;
/*
* The actions here must match the index to the string array
* in lib/kobject_uevent.c
*
* Do not add new actions here without checking with the driver-core
* maintainers. Action strings are not meant to express subsystem
* or device specific properties. In most cases you want to send a
* kobject_uevent_env(kobj, KOBJ_CHANGE, env) with additional event
* specific variables added to the event environment.
*/
enum kobject_action {
KOBJ_ADD,
KOBJ_REMOVE,
KOBJ_CHANGE,
KOBJ_MOVE,
KOBJ_ONLINE,
KOBJ_OFFLINE,
KOBJ_BIND,
KOBJ_UNBIND,
};
struct kobject {
const char *name;
struct list_head entry;
struct kobject *parent;
struct kset *kset;
struct kobj_type *ktype;
struct kernfs_node *sd; /* sysfs directory entry */
struct kref kref;
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
struct delayed_work release;
#endif
unsigned int state_initialized:1;
unsigned int state_in_sysfs:1;
unsigned int state_add_uevent_sent:1;
unsigned int state_remove_uevent_sent:1;
unsigned int uevent_suppress:1;
};
extern __printf(2, 3)
int kobject_set_name(struct kobject *kobj, const char *name, ...);
extern __printf(2, 0)
int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
va_list vargs);
static inline const char *kobject_name(const struct kobject *kobj)
{
return kobj->name;
}
extern void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
extern __printf(3, 4) __must_check
int kobject_add(struct kobject *kobj, struct kobject *parent,
const char *fmt, ...);
extern __printf(4, 5) __must_check
int kobject_init_and_add(struct kobject *kobj,
struct kobj_type *ktype, struct kobject *parent,
const char *fmt, ...);
extern void kobject_del(struct kobject *kobj);
extern struct kobject * __must_check kobject_create(void);
extern struct kobject * __must_check kobject_create_and_add(const char *name,
struct kobject *parent);
extern int __must_check kobject_rename(struct kobject *, const char *new_name);
extern int __must_check kobject_move(struct kobject *, struct kobject *);
extern struct kobject *kobject_get(struct kobject *kobj);
extern struct kobject * __must_check kobject_get_unless_zero(
struct kobject *kobj);
extern void kobject_put(struct kobject *kobj);
extern const void *kobject_namespace(struct kobject *kobj);
extern void kobject_get_ownership(struct kobject *kobj,
kuid_t *uid, kgid_t *gid);
extern char *kobject_get_path(struct kobject *kobj, gfp_t flag);
/**
* kobject_has_children - Returns whether a kobject has children.
* @kobj: the object to test
*
* This will return whether a kobject has other kobjects as children.
*
* It does NOT account for the presence of attribute files, only sub
* directories. It also assumes there is no concurrent addition or
* removal of such children, and thus relies on external locking.
*/
static inline bool kobject_has_children(struct kobject *kobj)
{
WARN_ON_ONCE(kref_read(&kobj->kref) == 0); return kobj->sd && kobj->sd->dir.subdirs;
}
struct kobj_type {
void (*release)(struct kobject *kobj);
const struct sysfs_ops *sysfs_ops;
struct attribute **default_attrs; /* use default_groups instead */
const struct attribute_group **default_groups;
const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
const void *(*namespace)(struct kobject *kobj);
void (*get_ownership)(struct kobject *kobj, kuid_t *uid, kgid_t *gid);
};
struct kobj_uevent_env {
char *argv[3];
char *envp[UEVENT_NUM_ENVP];
int envp_idx;
char buf[UEVENT_BUFFER_SIZE];
int buflen;
};
struct kset_uevent_ops {
int (* const filter)(struct kset *kset, struct kobject *kobj);
const char *(* const name)(struct kset *kset, struct kobject *kobj);
int (* const uevent)(struct kset *kset, struct kobject *kobj,
struct kobj_uevent_env *env);
};
struct kobj_attribute {
struct attribute attr;
ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,
char *buf);
ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count);
};
extern const struct sysfs_ops kobj_sysfs_ops;
struct sock;
/**
* struct kset - a set of kobjects of a specific type, belonging to a specific subsystem.
*
* A kset defines a group of kobjects. They can be individually
* different "types" but overall these kobjects all want to be grouped
* together and operated on in the same manner. ksets are used to
* define the attribute callbacks and other common events that happen to
* a kobject.
*
* @list: the list of all kobjects for this kset
* @list_lock: a lock for iterating over the kobjects
* @kobj: the embedded kobject for this kset (recursion, isn't it fun...)
* @uevent_ops: the set of uevent operations for this kset. These are
* called whenever a kobject has something happen to it so that the kset
* can add new environment variables, or filter out the uevents if so
* desired.
*/
struct kset {
struct list_head list;
spinlock_t list_lock;
struct kobject kobj;
const struct kset_uevent_ops *uevent_ops;
} __randomize_layout;
extern void kset_init(struct kset *kset);
extern int __must_check kset_register(struct kset *kset);
extern void kset_unregister(struct kset *kset);
extern struct kset * __must_check kset_create_and_add(const char *name,
const struct kset_uevent_ops *u,
struct kobject *parent_kobj);
static inline struct kset *to_kset(struct kobject *kobj)
{
return kobj ? container_of(kobj, struct kset, kobj) : NULL;
}
static inline struct kset *kset_get(struct kset *k)
{
return k ? to_kset(kobject_get(&k->kobj)) : NULL;
}
static inline void kset_put(struct kset *k)
{
kobject_put(&k->kobj);
}
static inline struct kobj_type *get_ktype(struct kobject *kobj)
{
return kobj->ktype;
}
extern struct kobject *kset_find_obj(struct kset *, const char *);
/* The global /sys/kernel/ kobject for people to chain off of */
extern struct kobject *kernel_kobj;
/* The global /sys/kernel/mm/ kobject for people to chain off of */
extern struct kobject *mm_kobj;
/* The global /sys/hypervisor/ kobject for people to chain off of */
extern struct kobject *hypervisor_kobj;
/* The global /sys/power/ kobject for people to chain off of */
extern struct kobject *power_kobj;
/* The global /sys/firmware/ kobject for people to chain off of */
extern struct kobject *firmware_kobj;
int kobject_uevent(struct kobject *kobj, enum kobject_action action);
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
char *envp[]);
int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count);
__printf(2, 3)
int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...);
#endif /* _KOBJECT_H_ */
/*
* include/linux/topology.h
*
* Written by: Matthew Dobson, IBM Corporation
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send feedback to <colpatch@us.ibm.com>
*/
#ifndef _LINUX_TOPOLOGY_H
#define _LINUX_TOPOLOGY_H
#include <linux/arch_topology.h>
#include <linux/cpumask.h>
#include <linux/bitops.h>
#include <linux/mmzone.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <asm/topology.h>
#ifndef nr_cpus_node
#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node))
#endif
#define for_each_node_with_cpus(node) \
for_each_online_node(node) \
if (nr_cpus_node(node))
int arch_update_cpu_topology(void);
/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE 10
#define REMOTE_DISTANCE 20
#define DISTANCE_BITS 8
#ifndef node_distance
#define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
#ifndef RECLAIM_DISTANCE
/*
* If the distance between nodes in a system is larger than RECLAIM_DISTANCE
* (in whatever arch specific measurement units returned by node_distance())
* and node_reclaim_mode is enabled then the VM will only call node_reclaim()
* on nodes within this distance.
*/
#define RECLAIM_DISTANCE 30
#endif
/*
* The following tunable allows platforms to override the default node
* reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
* sufficiently fast that the default value actually hurts
* performance.
*
* AMD EPYC machines use this because even though the 2-hop distance
* is 32 (3.2x slower than a local memory access) performance actually
* *improves* if allowed to reclaim memory and load balance tasks
* between NUMA nodes 2-hops apart.
*/
extern int __read_mostly node_reclaim_distance;
#ifndef PENALTY_FOR_NODE_WITH_CPUS
#define PENALTY_FOR_NODE_WITH_CPUS (1)
#endif
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DECLARE_PER_CPU(int, numa_node);
#ifndef numa_node_id
/* Returns the number of the current Node. */
static inline int numa_node_id(void)
{
return raw_cpu_read(numa_node);
}
#endif
#ifndef cpu_to_node
static inline int cpu_to_node(int cpu)
{
return per_cpu(numa_node, cpu);
}
#endif
#ifndef set_numa_node
static inline void set_numa_node(int node)
{
this_cpu_write(numa_node, node);
}
#endif
#ifndef set_cpu_numa_node
static inline void set_cpu_numa_node(int cpu, int node)
{
per_cpu(numa_node, cpu) = node;
}
#endif
#else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */
/* Returns the number of the current Node. */
#ifndef numa_node_id
static inline int numa_node_id(void)
{
return cpu_to_node(raw_smp_processor_id());
}
#endif
#endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
* It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
* Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
*/
DECLARE_PER_CPU(int, _numa_mem_);
#ifndef set_numa_mem
static inline void set_numa_mem(int node)
{
this_cpu_write(_numa_mem_, node);
}
#endif
#ifndef numa_mem_id
/* Returns the number of the nearest Node with memory */
static inline int numa_mem_id(void)
{
return raw_cpu_read(_numa_mem_);
}
#endif
#ifndef cpu_to_mem
static inline int cpu_to_mem(int cpu)
{
return per_cpu(_numa_mem_, cpu);
}
#endif
#ifndef set_cpu_numa_mem
static inline void set_cpu_numa_mem(int cpu, int node)
{
per_cpu(_numa_mem_, cpu) = node;
}
#endif
#else /* !CONFIG_HAVE_MEMORYLESS_NODES */
#ifndef numa_mem_id
/* Returns the number of the nearest Node with memory */
static inline int numa_mem_id(void)
{
return numa_node_id();
}
#endif
#ifndef cpu_to_mem
static inline int cpu_to_mem(int cpu)
{
return cpu_to_node(cpu);
}
#endif
#endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */
#ifndef topology_physical_package_id
#define topology_physical_package_id(cpu) ((void)(cpu), -1)
#endif
#ifndef topology_die_id
#define topology_die_id(cpu) ((void)(cpu), -1)
#endif
#ifndef topology_core_id
#define topology_core_id(cpu) ((void)(cpu), 0)
#endif
#ifndef topology_sibling_cpumask
#define topology_sibling_cpumask(cpu) cpumask_of(cpu)
#endif
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu) cpumask_of(cpu)
#endif
#ifndef topology_die_cpumask
#define topology_die_cpumask(cpu) cpumask_of(cpu)
#endif
#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
static inline const struct cpumask *cpu_smt_mask(int cpu)
{
return topology_sibling_cpumask(cpu);
}
#endif
static inline const struct cpumask *cpu_cpu_mask(int cpu)
{
return cpumask_of_node(cpu_to_node(cpu));
}
#endif /* _LINUX_TOPOLOGY_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Block data types and constants. Directly include this file only to
* break include dependency loop.
*/
#ifndef __LINUX_BLK_TYPES_H
#define __LINUX_BLK_TYPES_H
#include <linux/types.h>
#include <linux/bvec.h>
#include <linux/device.h>
#include <linux/ktime.h>
struct bio_set;
struct bio;
struct bio_integrity_payload;
struct page;
struct io_context;
struct cgroup_subsys_state;
typedef void (bio_end_io_t) (struct bio *);
struct bio_crypt_ctx;
struct block_device {
sector_t bd_start_sect;
struct disk_stats __percpu *bd_stats;
unsigned long bd_stamp;
bool bd_read_only; /* read-only policy */
dev_t bd_dev;
int bd_openers;
struct inode * bd_inode; /* will die */
struct super_block * bd_super;
void * bd_claiming;
struct device bd_device;
void * bd_holder;
int bd_holders;
bool bd_write_holder;
struct kobject *bd_holder_dir;
u8 bd_partno;
spinlock_t bd_size_lock; /* for bd_inode->i_size updates */
struct gendisk * bd_disk;
/* The counter of freeze processes */
int bd_fsfreeze_count;
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
struct super_block *bd_fsfreeze_sb;
struct partition_meta_info *bd_meta_info;
#ifdef CONFIG_FAIL_MAKE_REQUEST
bool bd_make_it_fail;
#endif
} __randomize_layout;
#define bdev_whole(_bdev) \
((_bdev)->bd_disk->part0)
#define dev_to_bdev(device) \
container_of((device), struct block_device, bd_device)
#define bdev_kobj(_bdev) \
(&((_bdev)->bd_device.kobj))
/*
* Block error status values. See block/blk-core:blk_errors for the details.
* Alpha cannot write a byte atomically, so we need to use 32-bit value.
*/
#if defined(CONFIG_ALPHA) && !defined(__alpha_bwx__)
typedef u32 __bitwise blk_status_t;
#else
typedef u8 __bitwise blk_status_t;
#endif
#define BLK_STS_OK 0
#define BLK_STS_NOTSUPP ((__force blk_status_t)1)
#define BLK_STS_TIMEOUT ((__force blk_status_t)2)
#define BLK_STS_NOSPC ((__force blk_status_t)3)
#define BLK_STS_TRANSPORT ((__force blk_status_t)4)
#define BLK_STS_TARGET ((__force blk_status_t)5)
#define BLK_STS_NEXUS ((__force blk_status_t)6)
#define BLK_STS_MEDIUM ((__force blk_status_t)7)
#define BLK_STS_PROTECTION ((__force blk_status_t)8)
#define BLK_STS_RESOURCE ((__force blk_status_t)9)
#define BLK_STS_IOERR ((__force blk_status_t)10)
/* hack for device mapper, don't use elsewhere: */
#define BLK_STS_DM_REQUEUE ((__force blk_status_t)11)
#define BLK_STS_AGAIN ((__force blk_status_t)12)
/*
* BLK_STS_DEV_RESOURCE is returned from the driver to the block layer if
* device related resources are unavailable, but the driver can guarantee
* that the queue will be rerun in the future once resources become
* available again. This is typically the case for device specific
* resources that are consumed for IO. If the driver fails allocating these
* resources, we know that inflight (or pending) IO will free these
* resource upon completion.
*
* This is different from BLK_STS_RESOURCE in that it explicitly references
* a device specific resource. For resources of wider scope, allocation
* failure can happen without having pending IO. This means that we can't
* rely on request completions freeing these resources, as IO may not be in
* flight. Examples of that are kernel memory allocations, DMA mappings, or
* any other system wide resources.
*/
#define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13)
/*
* BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone
* related resources are unavailable, but the driver can guarantee the queue
* will be rerun in the future once the resources become available again.
*
* This is different from BLK_STS_DEV_RESOURCE in that it explicitly references
* a zone specific resource and IO to a different zone on the same device could
* still be served. Examples of that are zones that are write-locked, but a read
* to the same zone could be served.
*/
#define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14)
/*
* BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion
* path if the device returns a status indicating that too many zone resources
* are currently open. The same command should be successful if resubmitted
* after the number of open zones decreases below the device's limits, which is
* reported in the request_queue's max_open_zones.
*/
#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)15)
/*
* BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion
* path if the device returns a status indicating that too many zone resources
* are currently active. The same command should be successful if resubmitted
* after the number of active zones decreases below the device's limits, which
* is reported in the request_queue's max_active_zones.
*/
#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16)
/**
* blk_path_error - returns true if error may be path related
* @error: status the request was completed with
*
* Description:
* This classifies block error status into non-retryable errors and ones
* that may be successful if retried on a failover path.
*
* Return:
* %false - retrying failover path will not help
* %true - may succeed if retried
*/
static inline bool blk_path_error(blk_status_t error)
{
switch (error) {
case BLK_STS_NOTSUPP:
case BLK_STS_NOSPC:
case BLK_STS_TARGET:
case BLK_STS_NEXUS:
case BLK_STS_MEDIUM:
case BLK_STS_PROTECTION:
return false;
}
/* Anything else could be a path failure, so should be retried */
return true;
}
/*
* From most significant bit:
* 1 bit: reserved for other usage, see below
* 12 bits: original size of bio
* 51 bits: issue time of bio
*/
#define BIO_ISSUE_RES_BITS 1
#define BIO_ISSUE_SIZE_BITS 12
#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS)
#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
#define BIO_ISSUE_SIZE_MASK \
(((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))
/* Reserved bit for blk-throtl */
#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)
struct bio_issue {
u64 value;
};
static inline u64 __bio_issue_time(u64 time)
{
return time & BIO_ISSUE_TIME_MASK;
}
static inline u64 bio_issue_time(struct bio_issue *issue)
{
return __bio_issue_time(issue->value);
}
static inline sector_t bio_issue_size(struct bio_issue *issue)
{
return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
}
static inline void bio_issue_init(struct bio_issue *issue,
sector_t size)
{
size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;
issue->value = ((issue->value & BIO_ISSUE_RES_MASK) |
(ktime_get_ns() & BIO_ISSUE_TIME_MASK) |
((u64)size << BIO_ISSUE_SIZE_SHIFT));
}
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
* stacking drivers)
*/
struct bio {
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
unsigned int bi_opf; /* bottom bits req flags,
* top bits REQ_OP. Use
* accessors.
*/
unsigned short bi_flags; /* BIO_* below */
unsigned short bi_ioprio;
unsigned short bi_write_hint;
blk_status_t bi_status;
atomic_t __bi_remaining;
struct bvec_iter bi_iter;
bio_end_io_t *bi_end_io;
void *bi_private;
#ifdef CONFIG_BLK_CGROUP
/*
* Represents the association of the css and request_queue for the bio.
* If a bio goes direct to device, it will not have a blkg as it will
* not have a request_queue associated with it. The reference is put
* on release of the bio.
*/
struct blkcg_gq *bi_blkg;
struct bio_issue bi_issue;
#ifdef CONFIG_BLK_CGROUP_IOCOST
u64 bi_iocost_cost;
#endif
#endif
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
struct bio_crypt_ctx *bi_crypt_context;
#endif
union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
};
unsigned short bi_vcnt; /* how many bio_vec's */
/*
* Everything starting with bi_max_vecs will be preserved by bio_reset()
*/
unsigned short bi_max_vecs; /* max bvl_vecs we can hold */
atomic_t __bi_cnt; /* pin count */
struct bio_vec *bi_io_vec; /* the actual vec list */
struct bio_set *bi_pool;
/*
* We can inline a number of vecs at the end of the bio, to avoid
* double allocations for a small number of bio_vecs. This member
* MUST obviously be kept at the very end of the bio.
*/
struct bio_vec bi_inline_vecs[];
};
#define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs)
#define BIO_MAX_SECTORS (UINT_MAX >> SECTOR_SHIFT)
/*
* bio flags
*/
enum {
BIO_NO_PAGE_REF, /* don't put release vec pages */
BIO_CLONED, /* doesn't own data */
BIO_BOUNCED, /* bio is a bounce bio */
BIO_WORKINGSET, /* contains userspace workingset pages */
BIO_QUIET, /* Make BIO Quiet */
BIO_CHAIN, /* chained bio, ->bi_remaining in effect */
BIO_REFFED, /* bio has elevated ->bi_cnt */
BIO_THROTTLED, /* This bio has already been subjected to
* throttling rules. Don't do it again. */
BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion
* of this bio. */
BIO_CGROUP_ACCT, /* has been accounted to a cgroup */
BIO_TRACKED, /* set if bio goes through the rq_qos path */
BIO_REMAPPED,
BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
BIO_PERCPU_CACHE, /* can participate in per-cpu alloc cache */
BIO_FLAG_LAST
};
typedef __u32 __bitwise blk_mq_req_flags_t;
/*
* Operations and flags common to the bio and request structures.
* We use 8 bits for encoding the operation, and the remaining 24 for flags.
*
* The least significant bit of the operation number indicates the data
* transfer direction:
*
* - if the least significant bit is set transfers are TO the device
* - if the least significant bit is not set transfers are FROM the device
*
* If a operation does not transfer data the least significant bit has no
* meaning.
*/
#define REQ_OP_BITS 8
#define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1)
#define REQ_FLAG_BITS 24
enum req_opf {
/* read sectors from the device */
REQ_OP_READ = 0,
/* write sectors to the device */
REQ_OP_WRITE = 1,
/* flush the volatile write cache */
REQ_OP_FLUSH = 2,
/* discard sectors */
REQ_OP_DISCARD = 3,
/* securely erase sectors */
REQ_OP_SECURE_ERASE = 5,
/* write the same sector many times */
REQ_OP_WRITE_SAME = 7,
/* write the zero filled sector many times */
REQ_OP_WRITE_ZEROES = 9,
/* Open a zone */
REQ_OP_ZONE_OPEN = 10,
/* Close a zone */
REQ_OP_ZONE_CLOSE = 11,
/* Transition a zone to full */
REQ_OP_ZONE_FINISH = 12,
/* write data at the current zone write pointer */
REQ_OP_ZONE_APPEND = 13,
/* reset a zone write pointer */
REQ_OP_ZONE_RESET = 15,
/* reset all the zone present on the device */
REQ_OP_ZONE_RESET_ALL = 17,
/* Driver private requests */
REQ_OP_DRV_IN = 34,
REQ_OP_DRV_OUT = 35,
REQ_OP_LAST,
};
enum req_flag_bits {
__REQ_FAILFAST_DEV = /* no driver retries of device errors */
REQ_OP_BITS,
__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
__REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */
__REQ_SYNC, /* request is sync (sync write or read) */
__REQ_META, /* metadata io request */
__REQ_PRIO, /* boost priority in cfq */
__REQ_NOMERGE, /* don't touch this for merging */
__REQ_IDLE, /* anticipate more IO after this one */
__REQ_INTEGRITY, /* I/O includes block integrity payload */
__REQ_FUA, /* forced unit access */
__REQ_PREFLUSH, /* request for cache flush */
__REQ_RAHEAD, /* read ahead, can fail anytime */
__REQ_BACKGROUND, /* background IO */
__REQ_NOWAIT, /* Don't wait if request will block */
/*
* When a shared kthread needs to issue a bio for a cgroup, doing
* so synchronously can lead to priority inversions as the kthread
* can be trapped waiting for that cgroup. CGROUP_PUNT flag makes
* submit_bio() punt the actual issuing to a dedicated per-blkcg
* work item to avoid such priority inversions.
*/
__REQ_CGROUP_PUNT,
/* command specific flags for REQ_OP_WRITE_ZEROES: */
__REQ_NOUNMAP, /* do not free blocks when zeroing */
__REQ_HIPRI,
/* for driver use */
__REQ_DRV,
__REQ_SWAP, /* swapping request. */
__REQ_NR_BITS, /* stops here */
};
#define REQ_FAILFAST_DEV (1ULL << __REQ_FAILFAST_DEV)
#define REQ_FAILFAST_TRANSPORT (1ULL << __REQ_FAILFAST_TRANSPORT)
#define REQ_FAILFAST_DRIVER (1ULL << __REQ_FAILFAST_DRIVER)
#define REQ_SYNC (1ULL << __REQ_SYNC)
#define REQ_META (1ULL << __REQ_META)
#define REQ_PRIO (1ULL << __REQ_PRIO)
#define REQ_NOMERGE (1ULL << __REQ_NOMERGE)
#define REQ_IDLE (1ULL << __REQ_IDLE)
#define REQ_INTEGRITY (1ULL << __REQ_INTEGRITY)
#define REQ_FUA (1ULL << __REQ_FUA)
#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH)
#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
#define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
#define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT)
#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
#define REQ_HIPRI (1ULL << __REQ_HIPRI)
#define REQ_DRV (1ULL << __REQ_DRV)
#define REQ_SWAP (1ULL << __REQ_SWAP)
#define REQ_FAILFAST_MASK \
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
#define REQ_NOMERGE_FLAGS \
(REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)
enum stat_group {
STAT_READ,
STAT_WRITE,
STAT_DISCARD,
STAT_FLUSH,
NR_STAT_GROUPS
};
#define bio_op(bio) \
((bio)->bi_opf & REQ_OP_MASK)
#define req_op(req) \
((req)->cmd_flags & REQ_OP_MASK)
/* obsolete, don't use in new code */
static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
unsigned op_flags)
{
bio->bi_opf = op | op_flags;
}
static inline bool op_is_write(unsigned int op)
{
return (op & 1);
}
/*
* Check if the bio or request is one that needs special treatment in the
* flush state machine.
*/
static inline bool op_is_flush(unsigned int op)
{
return op & (REQ_FUA | REQ_PREFLUSH);
}
/*
* Reads are always treated as synchronous, as are requests with the FUA or
* PREFLUSH flag. Other operations may be marked as synchronous using the
* REQ_SYNC flag.
*/
static inline bool op_is_sync(unsigned int op)
{
return (op & REQ_OP_MASK) == REQ_OP_READ ||
(op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH));
}
static inline bool op_is_discard(unsigned int op)
{
return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
}
/*
* Check if a bio or request operation is a zone management operation, with
* the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
* due to its different handling in the block layer and device response in
* case of command failure.
*/
static inline bool op_is_zone_mgmt(enum req_opf op)
{
switch (op & REQ_OP_MASK) {
case REQ_OP_ZONE_RESET:
case REQ_OP_ZONE_OPEN:
case REQ_OP_ZONE_CLOSE:
case REQ_OP_ZONE_FINISH:
return true;
default:
return false;
}
}
static inline int op_stat_group(unsigned int op)
{
if (op_is_discard(op))
return STAT_DISCARD;
return op_is_write(op);
}
typedef unsigned int blk_qc_t;
#define BLK_QC_T_NONE -1U
#define BLK_QC_T_SHIFT 16
#define BLK_QC_T_INTERNAL (1U << 31)
static inline bool blk_qc_t_valid(blk_qc_t cookie)
{
return cookie != BLK_QC_T_NONE;
}
static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
{
return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
}
static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
{
return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
}
static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
{
return (cookie & BLK_QC_T_INTERNAL) != 0;
}
struct blk_rq_stat {
u64 mean;
u64 min;
u64 max;
u32 nr_samples;
u64 batch;
};
#endif /* __LINUX_BLK_TYPES_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Implementation of the Transmission Control Protocol(TCP).
*
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
* Corey Minyard <wf-rch!minyard@relay.EU.net>
* Florian La Roche, <flla@stud.uni-sb.de>
* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
* Linus Torvalds, <torvalds@cs.helsinki.fi>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
* Matthew Dillon, <dillon@apollo.west.oic.com>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Jorge Cwik, <jorge@laser.satlink.net>
*/
/*
* Changes: Pedro Roque : Retransmit queue handled by TCP.
* : Fragmentation on mtu decrease
* : Segment collapse on retransmit
* : AF independence
*
* Linus Torvalds : send_delayed_ack
* David S. Miller : Charge memory using the right skb
* during syn/ack processing.
* David S. Miller : Output engine completely rewritten.
* Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
* Cacophonix Gaul : draft-minshall-nagle-01
* J Hadi Salim : ECN support
*
*/
#define pr_fmt(fmt) "TCP: " fmt
#include <net/tcp.h>
#include <net/mptcp.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/static_key.h>
#include <trace/events/tcp.h>
/* Refresh clocks of a TCP socket,
* ensuring monotically increasing values.
*/
void tcp_mstamp_refresh(struct tcp_sock *tp)
{
u64 val = tcp_clock_ns();
tp->tcp_clock_cache = val;
tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
}
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned int prior_packets = tp->packets_out;
WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
__skb_unlink(skb, &sk->sk_write_queue);
tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
if (tp->highest_sack == NULL)
tp->highest_sack = skb;
tp->packets_out += tcp_skb_pcount(skb);
if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
tcp_skb_pcount(skb));
tcp_check_space(sk);
}
/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
* window scaling factor due to loss of precision.
* If window has been shrunk, what should we make? It is not clear at all.
* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
* invalid. OK, let's make this for now:
*/
static inline __u32 tcp_acceptable_seq(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
(tp->rx_opt.wscale_ok &&
((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
return tp->snd_nxt;
else
return tcp_wnd_end(tp);
}
/* Calculate mss to advertise in SYN segment.
* RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
*
* 1. It is independent of path mtu.
* 2. Ideally, it is maximal possible segment size i.e. 65535-40.
* 3. For IPv4 it is reasonable to calculate it from maximal MTU of
* attached devices, because some buggy hosts are confused by
* large MSS.
* 4. We do not make 3, we advertise MSS, calculated from first
* hop device mtu, but allow to raise it to ip_rt_min_advmss.
* This may be overridden via information stored in routing table.
* 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
* probably even Jumbo".
*/
static __u16 tcp_advertise_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct dst_entry *dst = __sk_dst_get(sk);
int mss = tp->advmss;
if (dst) {
unsigned int metric = dst_metric_advmss(dst);
if (metric < mss) { mss = metric;
tp->advmss = mss;
}
}
return (__u16)mss;
}
/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
* This is the first part of cwnd validation mechanism.
*/
void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
u32 cwnd = tp->snd_cwnd;
tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
tp->snd_ssthresh = tcp_current_ssthresh(sk);
restart_cwnd = min(restart_cwnd, cwnd);
while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
cwnd >>= 1;
tp->snd_cwnd = max(cwnd, restart_cwnd);
tp->snd_cwnd_stamp = tcp_jiffies32;
tp->snd_cwnd_used = 0;
}
/* Congestion state accounting after a packet has been sent. */
static void tcp_event_data_sent(struct tcp_sock *tp,
struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const u32 now = tcp_jiffies32;
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
/* If this is the first data packet sent in response to the
* previous received data,
* and it is a reply for ato after last received packet,
* increase pingpong count.
*/
if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) && (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
inet_csk_inc_pingpong_cnt(sk);
tp->lsndtime = now;
}
/* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
u32 rcv_nxt)
{
struct tcp_sock *tp = tcp_sk(sk);
if (unlikely(tp->compressed_ack)) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
tp->compressed_ack);
tp->compressed_ack = 0;
if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
__sock_put(sk);
}
if (unlikely(rcv_nxt != tp->rcv_nxt))
return; /* Special ACK sent by DCTCP to reflect ECN */
tcp_dec_quickack_mode(sk, pkts);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
/* Determine a window scaling and initial window to offer.
* Based on the assumption that the given amount of space
* will be offered. Store the results in the tp structure.
* NOTE: for smooth operation initial space offering should
* be a multiple of mss if possible. We assume here that mss >= 1.
* This MUST be enforced by all callers.
*/
void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
__u32 *rcv_wnd, __u32 *window_clamp,
int wscale_ok, __u8 *rcv_wscale,
__u32 init_rcv_wnd)
{
unsigned int space = (__space < 0 ? 0 : __space);
/* If no clamp set the clamp to the max possible scaled window */
if (*window_clamp == 0)
(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE); space = min(*window_clamp, space);
/* Quantize space offering to a multiple of mss if possible. */
if (space > mss)
space = rounddown(space, mss);
/* NOTE: offering an initial window larger than 32767
* will break some buggy TCP stacks. If the admin tells us
* it is likely we could be speaking with such a buggy stack
* we will truncate our initial window offering to 32K-1
* unless the remote has sent us a window scaling option,
* which we interpret as a sign the remote TCP is not
* misinterpreting the window field as a signed quantity.
*/
if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
else
(*rcv_wnd) = min_t(u32, space, U16_MAX); if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); *rcv_wscale = 0;
if (wscale_ok) {
/* Set window scaling on max possible window */
space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
space = max_t(u32, space, sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
*rcv_wscale = clamp_t(int, ilog2(space) - 15,
0, TCP_MAX_WSCALE);
}
/* Set the clamp no higher than max representable value */
(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
}
EXPORT_SYMBOL(tcp_select_initial_window);
/* Chose a new window to advertise, update state in tcp_sock for the
* socket, and return result with RFC1323 scaling applied. The return
* value can be stuffed directly into th->window for an outgoing
* frame.
*/
static u16 tcp_select_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 old_win = tp->rcv_wnd;
u32 cur_win = tcp_receive_window(tp);
u32 new_win = __tcp_select_window(sk);
/* Never shrink the offered window */
if (new_win < cur_win) {
/* Danger Will Robinson!
* Don't update rcv_wup/rcv_wnd here or else
* we will not be able to advertise a zero
* window in time. --DaveM
*
* Relax Will Robinson.
*/
if (new_win == 0)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPWANTZEROWINDOWADV);
new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
}
tp->rcv_wnd = new_win;
tp->rcv_wup = tp->rcv_nxt;
/* Make sure we do not exceed the maximum possible
* scaled window.
*/
if (!tp->rx_opt.rcv_wscale &&
sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
new_win = min(new_win, MAX_TCP_WINDOW);
else
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
/* RFC1323 scaling applied */
new_win >>= tp->rx_opt.rcv_wscale;
/* If we advertise zero window, disable fast path. */
if (new_win == 0) {
tp->pred_flags = 0;
if (old_win)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPTOZEROWINDOWADV);
} else if (old_win == 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
}
return new_win;
}
/* Packet ECN state for a SYN-ACK */
static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
if (!(tp->ecn_flags & TCP_ECN_OK))
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
else if (tcp_ca_needs_ecn(sk) ||
tcp_bpf_ca_needs_ecn(sk))
INET_ECN_xmit(sk);
}
/* Packet ECN state for a SYN. */
static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
if (!use_ecn) {
const struct dst_entry *dst = __sk_dst_get(sk);
if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
use_ecn = true;
}
tp->ecn_flags = 0;
if (use_ecn) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
INET_ECN_xmit(sk);
}
}
static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
{
if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
/* tp->ecn_flags are cleared at a later point in time when
* SYN ACK is ultimatively being received.
*/
TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
}
static void
tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
{
if (inet_rsk(req)->ecn_ok)
th->ece = 1;
}
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
* be sent.
*/
static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, int tcp_header_len)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tp->ecn_flags & TCP_ECN_OK) {
/* Not-retransmitted data segment: set ECT and inject CWR. */
if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
INET_ECN_xmit(sk);
if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
th->cwr = 1;
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
}
} else if (!tcp_ca_needs_ecn(sk)) {
/* ACK or retransmitted segment: clear ECT|CE */
INET_ECN_dontxmit(sk);
}
if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) th->ece = 1;
}
}
/* Constructs common control bits of non-data skb. If SYN/FIN is present,
* auto increment end seqno.
*/
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
skb->ip_summed = CHECKSUM_PARTIAL;
TCP_SKB_CB(skb)->tcp_flags = flags;
TCP_SKB_CB(skb)->sacked = 0;
tcp_skb_pcount_set(skb, 1);
TCP_SKB_CB(skb)->seq = seq;
if (flags & (TCPHDR_SYN | TCPHDR_FIN))
seq++;
TCP_SKB_CB(skb)->end_seq = seq;
}
static inline bool tcp_urg_mode(const struct tcp_sock *tp)
{
return tp->snd_una != tp->snd_up;
}
#define OPTION_SACK_ADVERTISE (1 << 0)
#define OPTION_TS (1 << 1)
#define OPTION_MD5 (1 << 2)
#define OPTION_WSCALE (1 << 3)
#define OPTION_FAST_OPEN_COOKIE (1 << 8)
#define OPTION_SMC (1 << 9)
#define OPTION_MPTCP (1 << 10)
static void smc_options_write(__be32 *ptr, u16 *options)
{
#if IS_ENABLED(CONFIG_SMC)
if (static_branch_unlikely(&tcp_have_smc)) {
if (unlikely(OPTION_SMC & *options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_EXP << 8) |
(TCPOLEN_EXP_SMC_BASE));
*ptr++ = htonl(TCPOPT_SMC_MAGIC);
}
}
#endif
}
struct tcp_out_options {
u16 options; /* bit field of OPTION_* */
u16 mss; /* 0 to disable */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
u8 hash_size; /* bytes in hash_location */
u8 bpf_opt_len; /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
struct mptcp_out_options mptcp;
};
static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp,
struct tcp_out_options *opts)
{
#if IS_ENABLED(CONFIG_MPTCP)
if (unlikely(OPTION_MPTCP & opts->options))
mptcp_write_options(ptr, tp, &opts->mptcp);
#endif
}
#ifdef CONFIG_CGROUP_BPF
static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
enum tcp_synack_type synack_type)
{
if (unlikely(!skb))
return BPF_WRITE_HDR_TCP_CURRENT_MSS;
if (unlikely(synack_type == TCP_SYNACK_COOKIE))
return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
return 0;
}
/* req, syn_skb and synack_type are used when writing synack */
static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct sk_buff *syn_skb,
enum tcp_synack_type synack_type,
struct tcp_out_options *opts,
unsigned int *remaining)
{
struct bpf_sock_ops_kern sock_ops;
int err;
if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
!*remaining)
return;
/* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
/* init sock_ops */
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
if (req) {
/* The listen "sk" cannot be passed here because
* it is not locked. It would not make too much
* sense to do bpf_setsockopt(listen_sk) based
* on individual connection request also.
*
* Thus, "req" is passed here and the cgroup-bpf-progs
* of the listen "sk" will be run.
*
* "req" is also used here for fastopen even the "sk" here is
* a fullsock "child" sk. It is to keep the behavior
* consistent between fastopen and non-fastopen on
* the bpf programming side.
*/
sock_ops.sk = (struct sock *)req;
sock_ops.syn_skb = syn_skb;
} else {
sock_owned_by_me(sk);
sock_ops.is_fullsock = 1;
sock_ops.sk = sk;
}
sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
sock_ops.remaining_opt_len = *remaining;
/* tcp_current_mss() does not pass a skb */
if (skb)
bpf_skops_init_skb(&sock_ops, skb, 0);
err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
if (err || sock_ops.remaining_opt_len == *remaining)
return;
opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
/* round up to 4 bytes */
opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
*remaining -= opts->bpf_opt_len;
}
static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct sk_buff *syn_skb,
enum tcp_synack_type synack_type,
struct tcp_out_options *opts)
{
u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
struct bpf_sock_ops_kern sock_ops;
int err;
if (likely(!max_opt_len))
return;
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
if (req) {
sock_ops.sk = (struct sock *)req;
sock_ops.syn_skb = syn_skb;
} else {
sock_owned_by_me(sk);
sock_ops.is_fullsock = 1;
sock_ops.sk = sk;
}
sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
sock_ops.remaining_opt_len = max_opt_len;
first_opt_off = tcp_hdrlen(skb) - max_opt_len;
bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
if (err)
nr_written = 0;
else
nr_written = max_opt_len - sock_ops.remaining_opt_len;
if (nr_written < max_opt_len)
memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
max_opt_len - nr_written);
}
#else
static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct sk_buff *syn_skb,
enum tcp_synack_type synack_type,
struct tcp_out_options *opts,
unsigned int *remaining)
{
}
static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct sk_buff *syn_skb,
enum tcp_synack_type synack_type,
struct tcp_out_options *opts)
{
}
#endif
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
* TCP options, we learned this through the hard way, so be careful here.
* Luckily we can at least blame others for their non-compliance but from
* inter-operability perspective it seems that we're somewhat stuck with
* the ordering which we have been using if we want to keep working with
* those broken things (not that it currently hurts anybody as there isn't
* particular reason why the ordering would need to be changed).
*
* At least SACK_PERM as the first option is known to lead to a disaster
* (but it may well be that other scenarios fail similarly).
*/
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
struct tcp_out_options *opts)
{
u16 options = opts->options; /* mungable copy */
if (unlikely(OPTION_MD5 & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
(TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
/* overload cookie hash location */
opts->hash_location = (__u8 *)ptr;
ptr += 4;
}
if (unlikely(opts->mss)) { *ptr++ = htonl((TCPOPT_MSS << 24) |
(TCPOLEN_MSS << 16) |
opts->mss);
}
if (likely(OPTION_TS & options)) { if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
(TCPOLEN_SACK_PERM << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
options &= ~OPTION_SACK_ADVERTISE;
} else {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
}
*ptr++ = htonl(opts->tsval);
*ptr++ = htonl(opts->tsecr);
}
if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
}
if (unlikely(OPTION_WSCALE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
opts->ws);
}
if (unlikely(opts->num_sack_blocks)) { struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
int this_sack;
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK << 8) |
(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
TCPOLEN_SACK_PERBLOCK)));
for (this_sack = 0; this_sack < opts->num_sack_blocks;
++this_sack) {
*ptr++ = htonl(sp[this_sack].start_seq);
*ptr++ = htonl(sp[this_sack].end_seq);
}
tp->rx_opt.dsack = 0;
}
if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
u8 *p = (u8 *)ptr;
u32 len; /* Fast Open option length */
if (foc->exp) {
len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
*ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
TCPOPT_FASTOPEN_MAGIC);
p += TCPOLEN_EXP_FASTOPEN_BASE;
} else {
len = TCPOLEN_FASTOPEN_BASE + foc->len;
*p++ = TCPOPT_FASTOPEN;
*p++ = len;
}
memcpy(p, foc->val, foc->len);
if ((len & 3) == 2) {
p[foc->len] = TCPOPT_NOP;
p[foc->len + 1] = TCPOPT_NOP;
}
ptr += (len + 3) >> 2;
}
smc_options_write(ptr, &options);
mptcp_options_write(ptr, tp, opts);
}
static void smc_set_option(const struct tcp_sock *tp,
struct tcp_out_options *opts,
unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
if (static_branch_unlikely(&tcp_have_smc)) {
if (tp->syn_smc) {
if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
opts->options |= OPTION_SMC;
*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
}
}
}
#endif
}
static void smc_set_option_cond(const struct tcp_sock *tp,
const struct inet_request_sock *ireq,
struct tcp_out_options *opts,
unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
if (static_branch_unlikely(&tcp_have_smc)) {
if (tp->syn_smc && ireq->smc_ok) {
if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
opts->options |= OPTION_SMC;
*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
}
}
}
#endif
}
static void mptcp_set_option_cond(const struct request_sock *req,
struct tcp_out_options *opts,
unsigned int *remaining)
{
if (rsk_is_mptcp(req)) {
unsigned int size;
if (mptcp_synack_options(req, &size, &opts->mptcp)) {
if (*remaining >= size) {
opts->options |= OPTION_MPTCP;
*remaining -= size;
}
}
}
}
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
struct tcp_fastopen_request *fastopen = tp->fastopen_req;
*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
if (static_branch_unlikely(&tcp_md5_needed) &&
rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
remaining -= TCPOLEN_MD5SIG_ALIGNED;
}
}
#endif
/* We always get an MSS option. The option bytes which will be seen in
* normal data packets should timestamps be used, must be in the MSS
* advertised. But we subtract them from tp->mss_cache so that
* calculations in tcp_sendmsg are simpler etc. So account for this
* fact here if necessary. If we don't do this correctly, as a
* receiver we won't recognize data packets as being full sized when we
* should, and thus we won't abide by the delayed ACK rules correctly.
* SACKs don't matter, we never delay an ACK when we have any of those
* going out. */
opts->mss = tcp_advertise_mss(sk);
remaining -= TCPOLEN_MSS_ALIGNED;
if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) { opts->options |= OPTION_TS;
opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) { opts->ws = tp->rx_opt.rcv_wscale;
opts->options |= OPTION_WSCALE;
remaining -= TCPOLEN_WSCALE_ALIGNED;
}
if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) { opts->options |= OPTION_SACK_ADVERTISE;
if (unlikely(!(OPTION_TS & opts->options)))
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
if (fastopen && fastopen->cookie.len >= 0) { u32 need = fastopen->cookie.len; need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
TCPOLEN_FASTOPEN_BASE;
need = (need + 3) & ~3U; /* Align to 32 bits */
if (remaining >= need) {
opts->options |= OPTION_FAST_OPEN_COOKIE;
opts->fastopen_cookie = &fastopen->cookie;
remaining -= need;
tp->syn_fastopen = 1;
tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
}
}
smc_set_option(tp, opts, &remaining);
if (sk_is_mptcp(sk)) {
unsigned int size;
if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
opts->options |= OPTION_MPTCP;
remaining -= size;
}
}
bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
return MAX_TCP_OPTION_SPACE - remaining;
}
/* Set up TCP options for SYN-ACKs. */
static unsigned int tcp_synack_options(const struct sock *sk,
struct request_sock *req,
unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
const struct tcp_md5sig_key *md5,
struct tcp_fastopen_cookie *foc,
enum tcp_synack_type synack_type,
struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
#ifdef CONFIG_TCP_MD5SIG
if (md5) {
opts->options |= OPTION_MD5;
remaining -= TCPOLEN_MD5SIG_ALIGNED;
/* We can't fit any SACK blocks in a packet with MD5 + TS
* options. There was discussion about disabling SACK
* rather than TS in order to fit in better with old,
* buggy kernels, but that was deemed to be unnecessary.
*/
if (synack_type != TCP_SYNACK_COOKIE)
ireq->tstamp_ok &= !ireq->sack_ok;
}
#endif
/* We always send an MSS option. */
opts->mss = mss;
remaining -= TCPOLEN_MSS_ALIGNED;
if (likely(ireq->wscale_ok)) {
opts->ws = ireq->rcv_wscale;
opts->options |= OPTION_WSCALE;
remaining -= TCPOLEN_WSCALE_ALIGNED;
}
if (likely(ireq->tstamp_ok)) {
opts->options |= OPTION_TS;
opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
opts->tsecr = req->ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
if (likely(ireq->sack_ok)) {
opts->options |= OPTION_SACK_ADVERTISE;
if (unlikely(!ireq->tstamp_ok))
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
if (foc != NULL && foc->len >= 0) {
u32 need = foc->len;
need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
TCPOLEN_FASTOPEN_BASE;
need = (need + 3) & ~3U; /* Align to 32 bits */
if (remaining >= need) {
opts->options |= OPTION_FAST_OPEN_COOKIE;
opts->fastopen_cookie = foc;
remaining -= need;
}
}
mptcp_set_option_cond(req, opts, &remaining);
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
synack_type, opts, &remaining);
return MAX_TCP_OPTION_SPACE - remaining;
}
/* Compute TCP options for ESTABLISHED sockets. This is not the
* final wire format yet.
*/
static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned int size = 0;
unsigned int eff_sacks;
opts->options = 0;
*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
if (static_branch_unlikely(&tcp_md5_needed) &&
rcu_access_pointer(tp->md5sig_info)) {
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
size += TCPOLEN_MD5SIG_ALIGNED;
}
}
#endif
if (likely(tp->rx_opt.tstamp_ok)) {
opts->options |= OPTION_TS;
opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
opts->tsecr = tp->rx_opt.ts_recent;
size += TCPOLEN_TSTAMP_ALIGNED;
}
/* MPTCP options have precedence over SACK for the limited TCP
* option space because a MPTCP connection would be forced to
* fall back to regular TCP if a required multipath option is
* missing. SACK still gets a chance to use whatever space is
* left.
*/
if (sk_is_mptcp(sk)) {
unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
unsigned int opt_size = 0;
if (mptcp_established_options(sk, skb, &opt_size, remaining,
&opts->mptcp)) {
opts->options |= OPTION_MPTCP;
size += opt_size;
}
}
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
TCPOLEN_SACK_PERBLOCK))
return size;
opts->num_sack_blocks =
min_t(unsigned int, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
size += TCPOLEN_SACK_BASE_ALIGNED +
opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
}
if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
size = MAX_TCP_OPTION_SPACE - remaining;
}
return size;
}
/* TCP SMALL QUEUES (TSQ)
*
* TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
* to reduce RTT and bufferbloat.
* We do this using a special skb destructor (tcp_wfree).
*
* Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
* needs to be reallocated in a driver.
* The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
*
* Since transmit from skb destructor is forbidden, we use a tasklet
* to process all sockets that eventually need to send more skbs.
* We use one tasklet per cpu, with its own queue of sockets.
*/
struct tsq_tasklet {
struct tasklet_struct tasklet;
struct list_head head; /* queue of tcp sockets */
};
static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
static void tcp_tsq_write(struct sock *sk)
{
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
struct tcp_sock *tp = tcp_sk(sk);
if (tp->lost_out > tp->retrans_out &&
tp->snd_cwnd > tcp_packets_in_flight(tp)) {
tcp_mstamp_refresh(tp);
tcp_xmit_retransmit_queue(sk);
}
tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
0, GFP_ATOMIC);
}
}
static void tcp_tsq_handler(struct sock *sk)
{
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
tcp_tsq_write(sk);
else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
bh_unlock_sock(sk);
}
/*
* One tasklet per cpu tries to send more skbs.
* We run in tasklet context but need to disable irqs when
* transferring tsq->head because tcp_wfree() might
* interrupt us (non NAPI drivers)
*/
static void tcp_tasklet_func(struct tasklet_struct *t)
{
struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet);
LIST_HEAD(list);
unsigned long flags;
struct list_head *q, *n;
struct tcp_sock *tp;
struct sock *sk;
local_irq_save(flags);
list_splice_init(&tsq->head, &list);
local_irq_restore(flags);
list_for_each_safe(q, n, &list) {
tp = list_entry(q, struct tcp_sock, tsq_node);
list_del(&tp->tsq_node);
sk = (struct sock *)tp;
smp_mb__before_atomic();
clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
tcp_tsq_handler(sk);
sk_free(sk);
}
}
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
TCPF_WRITE_TIMER_DEFERRED | \
TCPF_DELACK_TIMER_DEFERRED | \
TCPF_MTU_REDUCED_DEFERRED)
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
*
* called from release_sock() to perform protocol dependent
* actions before socket release.
*/
void tcp_release_cb(struct sock *sk)
{
unsigned long flags, nflags;
/* perform an atomic operation only if at least one flag is set */
do {
flags = sk->sk_tsq_flags;
if (!(flags & TCP_DEFERRED_ALL))
return;
nflags = flags & ~TCP_DEFERRED_ALL;
} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
if (flags & TCPF_TSQ_DEFERRED) {
tcp_tsq_write(sk);
__sock_put(sk);
}
/* Here begins the tricky part :
* We are called from release_sock() with :
* 1) BH disabled
* 2) sk_lock.slock spinlock held
* 3) socket owned by us (sk->sk_lock.owned == 1)
*
* But following code is meant to be called from BH handlers,
* so we should keep BH disabled, but early release socket ownership
*/
sock_release_ownership(sk);
if (flags & TCPF_WRITE_TIMER_DEFERRED) { tcp_write_timer_handler(sk);
__sock_put(sk);
}
if (flags & TCPF_DELACK_TIMER_DEFERRED) { tcp_delack_timer_handler(sk);
__sock_put(sk);
}
if (flags & TCPF_MTU_REDUCED_DEFERRED) { inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
}
EXPORT_SYMBOL(tcp_release_cb);
void __init tcp_tasklet_init(void)
{
int i;
for_each_possible_cpu(i) {
struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
INIT_LIST_HEAD(&tsq->head);
tasklet_setup(&tsq->tasklet, tcp_tasklet_func);
}
}
/*
* Write buffer destructor automatically called from kfree_skb.
* We can't xmit new skbs from this context, as we might already
* hold qdisc lock.
*/
void tcp_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nval, oval;
/* Keep one reference on sk_wmem_alloc.
* Will be released by sk_free() from here or tcp_tasklet_func()
*/
WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
/* If this softirq is serviced by ksoftirqd, we are likely under stress.
* Wait until our queues (qdisc + devices) are drained.
* This gives :
* - less callbacks to tcp_write_xmit(), reducing stress (batches)
* - chance for incoming ACK (processed by another cpu maybe)
* to migrate this flow (skb->ooo_okay will be eventually set)
*/
if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;
for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
struct tsq_tasklet *tsq;
bool empty;
if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
goto out;
nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
if (nval != oval)
continue;
/* queue this socket to tasklet queue */
local_irq_save(flags);
tsq = this_cpu_ptr(&tsq_tasklet);
empty = list_empty(&tsq->head);
list_add(&tp->tsq_node, &tsq->head);
if (empty)
tasklet_schedule(&tsq->tasklet);
local_irq_restore(flags);
return;
}
out:
sk_free(sk);
}
/* Note: Called under soft irq.
* We can call TCP stack right away, unless socket is owned by user.
*/
enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
{
struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
struct sock *sk = (struct sock *)tp;
tcp_tsq_handler(sk);
sock_put(sk);
return HRTIMER_NORESTART;
}
static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
u64 prior_wstamp)
{
struct tcp_sock *tp = tcp_sk(sk);
if (sk->sk_pacing_status != SK_PACING_NONE) { unsigned long rate = sk->sk_pacing_rate;
/* Original sch_fq does not pace first 10 MSS
* Note that tp->data_segs_out overflows after 2^32 packets,
* this is a minor annoyance.
*/
if (rate != ~0UL && rate && tp->data_segs_out >= 10) { u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
/* take into account OS jitter */
len_ns -= min_t(u64, len_ns / 2, credit);
tp->tcp_wstamp_ns += len_ns;
}
}
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}
INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
* All SKB's seen here are completely headerless. It is our
* job to build the TCP header, and pass the packet down to
* IP so it can do the same plus pass the packet off to the
* device.
*
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet;
struct tcp_sock *tp;
struct tcp_skb_cb *tcb;
struct tcp_out_options opts;
unsigned int tcp_options_size, tcp_header_size;
struct sk_buff *oskb = NULL;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
u64 prior_wstamp;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk);
prior_wstamp = tp->tcp_wstamp_ns;
tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
if (clone_it) {
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
oskb = skb;
tcp_skb_tsorted_save(oskb) {
if (unlikely(skb_cloned(oskb)))
skb = pskb_copy(oskb, gfp_mask);
else
skb = skb_clone(oskb, gfp_mask); } tcp_skb_tsorted_restore(oskb);
if (unlikely(!skb))
return -ENOBUFS;
/* retransmit skbs might have a non zero value in skb->dev
* because skb->dev is aliased with skb->rbnode.rb_left
*/
skb->dev = NULL;
}
inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
} else {
tcp_options_size = tcp_established_options(sk, skb, &opts,
&md5);
/* Force a PSH flag on all (GSO) packets to expedite GRO flush
* at receiver : This slightly improve GRO performance.
* Note that we do not force the PSH flag for non GSO packets,
* because they might be sent under high congestion events,
* and in this case it is better to delay the delivery of 1-MSS
* packets and thus the corresponding ACK packet that would
* release the following packet.
*/
if (tcp_skb_pcount(skb) > 1)
tcb->tcp_flags |= TCPHDR_PSH;
}
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
/* if no packet is in qdisc/device queue, then allow XPS to select
* another queue. We can be called from tcp_tsq_handler()
* which holds one reference to sk.
*
* TODO: Ideally, in-flight pure ACK packets should not matter here.
* One way to get this would be to set skb->truesize = 2 on them.
*/
skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
/* If we had to use memory reserve to allocate this skb,
* this might cause drops if packet is looped back :
* Other socket might not have SOCK_MEMALLOC.
* Packets not looped back do not care about pfmemalloc.
*/
skb->pfmemalloc = 0;
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
skb_orphan(skb);
skb->sk = sk; skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
/* Build TCP header and checksum it. */
th = (struct tcphdr *)skb->data;
th->source = inet->inet_sport;
th->dest = inet->inet_dport;
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(rcv_nxt);
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->tcp_flags);
th->check = 0;
th->urg_ptr = 0;
/* The urg_mode check is necessary during a below snd_una win probe */
if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { if (before(tp->snd_up, tcb->seq + 0x10000)) { th->urg_ptr = htons(tp->snd_up - tcb->seq);
th->urg = 1;
} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { th->urg_ptr = htons(0xFFFF);
th->urg = 1;
}
}
skb_shinfo(skb)->gso_type = sk->sk_gso_type;
if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
th->window = htons(tcp_select_window(sk));
tcp_ecn_send(sk, skb, th, tcp_header_size);
} else {
/* RFC1323: The window in SYN & SYN/ACK segments
* is never scaled.
*/
th->window = htons(min(tp->rcv_wnd, 65535U));
}
tcp_options_write((__be32 *)(th + 1), tp, &opts);
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
sk_nocaps_add(sk, NETIF_F_GSO_MASK);
tp->af_specific->calc_md5_hash(opts.hash_location,
md5, sk, skb);
}
#endif
/* BPF prog is the last one writing header option */
bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
tcp_v6_send_check, tcp_v4_send_check,
sk, skb);
if (likely(tcb->tcp_flags & TCPHDR_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
tp->data_segs_out += tcp_skb_pcount(skb);
tp->bytes_sent += skb->len - tcp_header_size;
}
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
tcp_skb_pcount(skb));
tp->segs_out += tcp_skb_pcount(skb);
skb_set_hash_from_sk(skb, sk);
/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
sizeof(struct inet6_skb_parm)));
tcp_add_tx_delay(skb, tp);
err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
inet6_csk_xmit, ip_queue_xmit,
sk, skb, &inet->cork.fl);
if (unlikely(err > 0)) { tcp_enter_cwr(sk);
err = net_xmit_eval(err);
}
if (!err && oskb) { tcp_update_skb_after_send(sk, oskb, prior_wstamp); tcp_rate_skb_sent(sk, oskb);
}
return err;
}
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask)
{
return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
tcp_sk(sk)->rcv_nxt);
}
/* This routine just queues the buffer for sending.
*
* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
* otherwise socket can stall.
*/
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Advance write_seq and place onto the write_queue. */
WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
__skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
}
/* Initialize TSO segments for a packet. */
static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
if (skb->len <= mss_now) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
tcp_skb_pcount_set(skb, 1);
TCP_SKB_CB(skb)->tcp_gso_size = 0;
} else {
tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
}
}
/* Pcount in the middle of the write queue got changed, we need to do various
* tweaks to fix counters
*/
static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
{
struct tcp_sock *tp = tcp_sk(sk);
tp->packets_out -= decr;
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= decr;
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= decr;
if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
tp->lost_out -= decr;
/* Reno case is special. Sigh... */
if (tcp_is_reno(tp) && decr > 0)
tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
if (tp->lost_skb_hint &&
before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
tp->lost_cnt_hint -= decr;
tcp_verify_left_out(tp);
}
static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
{
return TCP_SKB_CB(skb)->txstamp_ack ||
(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
}
static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
if (unlikely(tcp_has_tx_tstamp(skb)) &&
!before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
shinfo->tx_flags &= ~tsflags;
shinfo2->tx_flags |= tsflags;
swap(shinfo->tskey, shinfo2->tskey);
TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
TCP_SKB_CB(skb)->txstamp_ack = 0;
}
}
static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
{
TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
TCP_SKB_CB(skb)->eor = 0;
}
/* Insert buff after skb on the write or rtx queue of sk. */
static void tcp_insert_write_queue_after(struct sk_buff *skb,
struct sk_buff *buff,
struct sock *sk,
enum tcp_queue tcp_queue)
{
if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
__skb_queue_after(&sk->sk_write_queue, skb, buff);
else
tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
}
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
* Remember, these are still headerless SKBs at this point.
*/
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
struct sk_buff *skb, u32 len,
unsigned int mss_now, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
int nsize, old_factor;
long limit;
int nlen;
u8 flags;
if (WARN_ON(len > skb->len))
return -EINVAL;
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
/* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
* We need some allowance to not penalize applications setting small
* SO_SNDBUF values.
* Also allow first and last skb in retransmit queue to be split.
*/
limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE);
if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
skb != tcp_rtx_queue_head(sk) &&
skb != tcp_rtx_queue_tail(sk))) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
return -ENOMEM;
}
if (skb_unclone_keeptruesize(skb, gfp))
return -ENOMEM;
/* Get a new skb... force flag on. */
buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
if (!buff)
return -ENOMEM; /* We'll just try again later. */
skb_copy_decrypted(buff, skb);
mptcp_skb_ext_copy(buff, skb);
sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
nlen = skb->len - len - nsize;
buff->truesize += nlen;
skb->truesize -= nlen;
/* Correct the sequence numbers. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->tcp_flags;
TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->tcp_flags = flags;
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
tcp_skb_fragment_eor(skb, buff);
skb_split(skb, buff, len);
buff->ip_summed = CHECKSUM_PARTIAL;
buff->tstamp = skb->tstamp;
tcp_fragment_tstamp(skb, buff);
old_factor = tcp_skb_pcount(skb);
/* Fix up tso_factor for both original and new SKB. */
tcp_set_skb_tso_segs(skb, mss_now);
tcp_set_skb_tso_segs(buff, mss_now);
/* Update delivered info for the new segment */
TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
/* If this packet has been sent out already, we must
* adjust the various packet counters.
*/
if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
int diff = old_factor - tcp_skb_pcount(skb) -
tcp_skb_pcount(buff);
if (diff)
tcp_adjust_pcount(sk, skb, diff);
}
/* Link BUFF into the send queue. */
__skb_header_release(buff);
tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
return 0;
}
/* This is similar to __pskb_pull_tail(). The difference is that pulled
* data is not copied, but immediately discarded.
*/
static int __pskb_trim_head(struct sk_buff *skb, int len)
{
struct skb_shared_info *shinfo;
int i, k, eat;
eat = min_t(int, len, skb_headlen(skb));
if (eat) {
__skb_pull(skb, eat);
len -= eat;
if (!len)
return 0;
}
eat = len;
k = 0;
shinfo = skb_shinfo(skb);
for (i = 0; i < shinfo->nr_frags; i++) {
int size = skb_frag_size(&shinfo->frags[i]);
if (size <= eat) {
skb_frag_unref(skb, i);
eat -= size;
} else {
shinfo->frags[k] = shinfo->frags[i];
if (eat) {
skb_frag_off_add(&shinfo->frags[k], eat);
skb_frag_size_sub(&shinfo->frags[k], eat);
eat = 0;
}
k++;
}
}
shinfo->nr_frags = k;
skb->data_len -= len;
skb->len = skb->data_len;
return len;
}
/* Remove acked data from a packet in the transmit queue. */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
u32 delta_truesize;
if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
return -ENOMEM;
delta_truesize = __pskb_trim_head(skb, len);
TCP_SKB_CB(skb)->seq += len;
skb->ip_summed = CHECKSUM_PARTIAL;
if (delta_truesize) {
skb->truesize -= delta_truesize;
sk_wmem_queued_add(sk, -delta_truesize);
sk_mem_uncharge(sk, delta_truesize);
}
/* Any change of skb->len requires recalculation of tso factor. */
if (tcp_skb_pcount(skb) > 1)
tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
return 0;
}
/* Calculate MSS not accounting any TCP options. */
static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
int mss_now;
/* Calculate base mss without TCP options:
It is MMS_S - sizeof(tcphdr) of rfc1122
*/
mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
if (icsk->icsk_af_ops->net_frag_header_len) {
const struct dst_entry *dst = __sk_dst_get(sk);
if (dst && dst_allfrag(dst))
mss_now -= icsk->icsk_af_ops->net_frag_header_len;
}
/* Clamp it (mss_clamp does not include tcp options) */
if (mss_now > tp->rx_opt.mss_clamp)
mss_now = tp->rx_opt.mss_clamp;
/* Now subtract optional transport overhead */
mss_now -= icsk->icsk_ext_hdr_len;
/* Then reserve room for full set of TCP options and 8 bytes of data */
mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
return mss_now;
}
/* Calculate MSS. Not accounting for SACKs here. */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
/* Subtract TCP options size, not including SACKs */
return __tcp_mtu_to_mss(sk, pmtu) -
(tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
}
EXPORT_SYMBOL(tcp_mtu_to_mss);
/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
int mtu;
mtu = mss +
tp->tcp_header_len +
icsk->icsk_ext_hdr_len +
icsk->icsk_af_ops->net_header_len;
/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
if (icsk->icsk_af_ops->net_frag_header_len) {
const struct dst_entry *dst = __sk_dst_get(sk);
if (dst && dst_allfrag(dst))
mtu += icsk->icsk_af_ops->net_frag_header_len;
}
return mtu;
}
EXPORT_SYMBOL(tcp_mss_to_mtu);
/* MTU probing init per socket */
void tcp_mtup_init(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct net *net = sock_net(sk);
icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
icsk->icsk_af_ops->net_header_len;
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0;
if (icsk->icsk_mtup.enabled)
icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
EXPORT_SYMBOL(tcp_mtup_init);
/* This function synchronize snd mss to current pmtu/exthdr set.
tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
for TCP options, but includes only bare TCP header.
tp->rx_opt.mss_clamp is mss negotiated at connection setup.
It is minimum of user_mss and mss received with SYN.
It also does not include TCP options.
inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
tp->mss_cache is current effective sending mss, including
all tcp options except for SACKs. It is evaluated,
taking into account current pmtu, but never exceeds
tp->rx_opt.mss_clamp.
NOTE1. rfc1122 clearly states that advertised MSS
DOES NOT include either tcp or ip options.
NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
are READ ONLY outside this function. --ANK (980731)
*/
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int mss_now;
if (icsk->icsk_mtup.search_high > pmtu) icsk->icsk_mtup.search_high = pmtu;
mss_now = tcp_mtu_to_mss(sk, pmtu);
mss_now = tcp_bound_to_half_wnd(tp, mss_now);
/* And store cached results */
icsk->icsk_pmtu_cookie = pmtu;
if (icsk->icsk_mtup.enabled)
mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); tp->mss_cache = mss_now;
return mss_now;
}
EXPORT_SYMBOL(tcp_sync_mss);
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
*/
unsigned int tcp_current_mss(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct dst_entry *dst = __sk_dst_get(sk);
u32 mss_now;
unsigned int header_len;
struct tcp_out_options opts;
struct tcp_md5sig_key *md5;
mss_now = tp->mss_cache;
if (dst) {
u32 mtu = dst_mtu(dst);
if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
mss_now = tcp_sync_mss(sk, mtu);
}
header_len = tcp_established_options(sk, NULL, &opts, &md5) +
sizeof(struct tcphdr);
/* The mss_cache is sized based on tp->tcp_header_len, which assumes
* some common options. If this is an odd packet (because we have SACK
* blocks etc) then our calculated header_len will be different, and
* we have to adjust mss_now correspondingly */
if (header_len != tp->tcp_header_len) {
int delta = (int) header_len - tp->tcp_header_len;
mss_now -= delta;
}
return mss_now;
}
/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
* As additional protections, we do not touch cwnd in retransmission phases,
* and if application hit its sndbuf limit recently.
*/
static void tcp_cwnd_application_limited(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
/* Limited by application or receiver window. */
u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
u32 win_used = max(tp->snd_cwnd_used, init_win);
if (win_used < tp->snd_cwnd) {
tp->snd_ssthresh = tcp_current_ssthresh(sk);
tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
}
tp->snd_cwnd_used = 0;
}
tp->snd_cwnd_stamp = tcp_jiffies32;
}
static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
{
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
struct tcp_sock *tp = tcp_sk(sk);
/* Track the maximum number of outstanding packets in each
* window, and remember whether we were cwnd-limited then.
*/
if (!before(tp->snd_una, tp->max_packets_seq) ||
tp->packets_out > tp->max_packets_out ||
is_cwnd_limited) {
tp->max_packets_out = tp->packets_out;
tp->max_packets_seq = tp->snd_nxt;
tp->is_cwnd_limited = is_cwnd_limited;
}
if (tcp_is_cwnd_limited(sk)) {
/* Network is feed fully. */
tp->snd_cwnd_used = 0;
tp->snd_cwnd_stamp = tcp_jiffies32;
} else {
/* Network starves. */
if (tp->packets_out > tp->snd_cwnd_used)
tp->snd_cwnd_used = tp->packets_out;
if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
(s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
!ca_ops->cong_control)
tcp_cwnd_application_limited(sk);
/* The following conditions together indicate the starvation
* is caused by insufficient sender buffer:
* 1) just sent some data (see tcp_write_xmit)
* 2) not cwnd limited (this else condition)
* 3) no more data to send (tcp_write_queue_empty())
* 4) application is hitting buffer limit (SOCK_NOSPACE)
*/
if (tcp_write_queue_empty(sk) && sk->sk_socket &&
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
(1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
/* Minshall's variant of the Nagle send check. */
static bool tcp_minshall_check(const struct tcp_sock *tp)
{
return after(tp->snd_sml, tp->snd_una) &&
!after(tp->snd_sml, tp->snd_nxt);
}
/* Update snd_sml if this skb is under mss
* Note that a TSO packet might end with a sub-mss segment
* The test is really :
* if ((skb->len % mss) != 0)
* tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
* But we can avoid doing the divide again given we already have
* skb_pcount = skb->len / mss_now
*/
static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
const struct sk_buff *skb)
{
if (skb->len < tcp_skb_pcount(skb) * mss_now)
tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
}
/* Return false, if packet can be sent now without violation Nagle's rules:
* 1. It is full sized. (provided by caller in %partial bool)
* 2. Or it contains FIN. (already checked by caller)
* 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
* 4. Or TCP_CORK is not set, and all sent packets are ACKed.
* With Minshall's modification: all sent small packets are ACKed.
*/
static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
int nonagle)
{
return partial &&
((nonagle & TCP_NAGLE_CORK) ||
(!nonagle && tp->packets_out && tcp_minshall_check(tp)));
}
/* Return how many segs we'd like on a TSO packet,
* to send one TSO packet per ms
*/
static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
int min_tso_segs)
{
u32 bytes, segs;
bytes = min_t(unsigned long,
sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
/* Goal is to send at least one packet per ms,
* not one big TSO packet every 100 ms.
* This preserves ACK clocking and is consistent
* with tcp_tso_should_defer() heuristic.
*/
segs = max_t(u32, bytes / mss_now, min_tso_segs);
return segs;
}
/* Return the number of segments we want in the skb we are transmitting.
* See if congestion control module wants to decide; otherwise, autosize.
*/
static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
{
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
u32 min_tso, tso_segs;
min_tso = ca_ops->min_tso_segs ?
ca_ops->min_tso_segs(sk) :
sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
return min_t(u32, tso_segs, sk->sk_gso_max_segs);
}
/* Returns the portion of skb which can be sent right away */
static unsigned int tcp_mss_split_point(const struct sock *sk,
const struct sk_buff *skb,
unsigned int mss_now,
unsigned int max_segs,
int nonagle)
{
const struct tcp_sock *tp = tcp_sk(sk);
u32 partial, needed, window, max_len;
window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
max_len = mss_now * max_segs;
if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
return max_len;
needed = min(skb->len, window);
if (max_len <= needed)
return max_len;
partial = needed % mss_now;
/* If last segment is not a full MSS, check if Nagle rules allow us
* to include this last segment in this skb.
* Otherwise, we'll split the skb at last MSS boundary
*/
if (tcp_nagle_check(partial != 0, tp, nonagle))
return needed - partial;
return needed;
}
/* Can at least one segment of SKB be sent right now, according to the
* congestion window rules? If so, return how many segments are allowed.
*/
static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
const struct sk_buff *skb)
{
u32 in_flight, cwnd, halfcwnd;
/* Don't be strict about the congestion window for the final FIN. */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
tcp_skb_pcount(skb) == 1)
return 1;
in_flight = tcp_packets_in_flight(tp);
cwnd = tp->snd_cwnd;
if (in_flight >= cwnd)
return 0;
/* For better scheduling, ensure we have at least
* 2 GSO packets in flight.
*/
halfcwnd = max(cwnd >> 1, 1U);
return min(halfcwnd, cwnd - in_flight);
}
/* Initialize TSO state of a skb.
* This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
tcp_set_skb_tso_segs(skb, mss_now);
tso_segs = tcp_skb_pcount(skb);
}
return tso_segs;
}
/* Return true if the Nagle test allows this packet to be
* sent now.
*/
static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
unsigned int cur_mss, int nonagle)
{
/* Nagle rule does not apply to frames, which sit in the middle of the
* write_queue (they have no chances to get new data).
*
* This is implemented in the callers, where they modify the 'nonagle'
* argument based upon the location of SKB in the send queue.
*/
if (nonagle & TCP_NAGLE_PUSH)
return true;
/* Don't use the nagle rule for urgent data (or for the final FIN). */
if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
return true;
if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
return true;
return false;
}
/* Does at least the first segment of SKB fit into the send window? */
static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
const struct sk_buff *skb,
unsigned int cur_mss)
{
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (skb->len > cur_mss)
end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
return !after(end_seq, tcp_wnd_end(tp));
}
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
* which is put after SKB on the list. It is very much like
* tcp_fragment() except that it may make several kinds of assumptions
* in order to speed up the splitting operation. In particular, we
* know that all the data is in scatter-gather pages, and that the
* packet has never been sent out before (and thus is not cloned).
*/
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
unsigned int mss_now, gfp_t gfp)
{
int nlen = skb->len - len;
struct sk_buff *buff;
u8 flags;
/* All of a TSO frame must be composed of paged data. */
if (skb->len != skb->data_len)
return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
skb, len, mss_now, gfp);
buff = sk_stream_alloc_skb(sk, 0, gfp, true);
if (unlikely(!buff))
return -ENOMEM;
skb_copy_decrypted(buff, skb);
mptcp_skb_ext_copy(buff, skb);
sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
buff->truesize += nlen;
skb->truesize -= nlen;
/* Correct the sequence numbers. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->tcp_flags;
TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->tcp_flags = flags;
/* This packet was never sent out yet, so no SACK bits. */
TCP_SKB_CB(buff)->sacked = 0;
tcp_skb_fragment_eor(skb, buff);
buff->ip_summed = CHECKSUM_PARTIAL;
skb_split(skb, buff, len);
tcp_fragment_tstamp(skb, buff);
/* Fix up tso_factor for both original and new SKB. */
tcp_set_skb_tso_segs(skb, mss_now);
tcp_set_skb_tso_segs(buff, mss_now);
/* Link BUFF into the send queue. */
__skb_header_release(buff);
tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
return 0;
}
/* Try to defer sending, if possible, in order to minimize the amount
* of TSO splitting we do. View it as a kind of TSO Nagle test.
*
* This algorithm is from John Heffner.
*/
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
bool *is_cwnd_limited,
bool *is_rwnd_limited,
u32 max_segs)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 send_win, cong_win, limit, in_flight;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *head;
int win_divisor;
s64 delta;
if (icsk->icsk_ca_state >= TCP_CA_Recovery)
goto send_now;
/* Avoid bursty behavior by allowing defer
* only if the last write was recent (1 ms).
* Note that tp->tcp_wstamp_ns can be in the future if we have
* packets waiting in a qdisc or device for EDT delivery.
*/
delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
if (delta > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
BUG_ON(tcp_skb_pcount(skb) <= 1);
BUG_ON(tp->snd_cwnd <= in_flight);
send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
/* From in_flight test above, we know that cwnd > in_flight. */
cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
limit = min(send_win, cong_win);
/* If a full-sized TSO skb can be sent, do it. */
if (limit >= max_segs * tp->mss_cache)
goto send_now;
/* Middle in queue won't get any more data, full sendable already? */
if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
goto send_now;
win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
if (win_divisor) {
u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
/* If at least some fraction of a window is available,
* just use it.
*/
chunk /= win_divisor;
if (limit >= chunk)
goto send_now;
} else {
/* Different approach, try not to defer past a single
* ACK. Receiver should ACK every other full sized
* frame, so if we have space for more than 3 frames
* then send now.
*/
if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
goto send_now;
}
/* TODO : use tsorted_sent_queue ? */
head = tcp_rtx_queue_head(sk);
if (!head)
goto send_now;
delta = tp->tcp_clock_cache - head->tstamp;
/* If next ACK is likely to come too late (half srtt), do not defer */
if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
goto send_now;
/* Ok, it looks like it is advisable to defer.
* Three cases are tracked :
* 1) We are cwnd-limited
* 2) We are rwnd-limited
* 3) We are application limited.
*/
if (cong_win < send_win) {
if (cong_win <= skb->len) {
*is_cwnd_limited = true;
return true;
}
} else {
if (send_win <= skb->len) {
*is_rwnd_limited = true;
return true;
}
}
/* If this packet won't get more data, do not wait. */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
TCP_SKB_CB(skb)->eor)
goto send_now;
return true;
send_now:
return false;
}
static inline void tcp_mtu_check_reprobe(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
u32 interval;
s32 delta;
interval = net->ipv4.sysctl_tcp_probe_interval;
delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
if (unlikely(delta >= interval * HZ)) {
int mss = tcp_current_mss(sk);
/* Update current search range */
icsk->icsk_mtup.probe_size = 0;
icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
sizeof(struct tcphdr) +
icsk->icsk_af_ops->net_header_len;
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
/* Update probe time stamp */
icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
}
static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
{
struct sk_buff *skb, *next;
skb = tcp_send_head(sk);
tcp_for_write_queue_from_safe(skb, next, sk) {
if (len <= skb->len)
break;
if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
return false;
len -= skb->len;
}
return true;
}
/* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing
* changes resulting in larger path MTUs.
*
* Returns 0 if we should wait to probe (no cwnd available),
* 1 if a probe was sent,
* -1 otherwise
*/
static int tcp_mtu_probe(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *nskb, *next;
struct net *net = sock_net(sk);
int probe_size;
int size_needed;
int copy, len;
int mss_now;
int interval;
/* Not currently probing/verifying,
* not in recovery,
* have enough cwnd, and
* not SACKing (the variable headers throw things off)
*/
if (likely(!icsk->icsk_mtup.enabled ||
icsk->icsk_mtup.probe_size ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
tp->snd_cwnd < 11 ||
tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;
/* Use binary search for probe_size between tcp_mss_base,
* and current mss_clamp. if (search_high - search_low)
* smaller than a threshold, backoff from probing.
*/
mss_now = tcp_current_mss(sk);
probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
icsk->icsk_mtup.search_low) >> 1);
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
/* When misfortune happens, we are reprobing actively,
* and then reprobe timer has expired. We stick with current
* probing process by not resetting search range to its orignal.
*/
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
interval < net->ipv4.sysctl_tcp_probe_threshold) {
/* Check whether enough time has elaplased for
* another round of probing.
*/
tcp_mtu_check_reprobe(sk);
return -1;
}
/* Have enough data in the send queue to probe? */
if (tp->write_seq - tp->snd_nxt < size_needed)
return -1;
if (tp->snd_wnd < size_needed)
return -1;
if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
return 0;
/* Do we need to wait to drain cwnd? With none in flight, don't stall */
if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
if (!tcp_packets_in_flight(tp))
return -1;
else
return 0;
}
if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
return -1;
/* We're allowed to probe. Build it now. */
nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
if (!nskb)
return -1;
sk_wmem_queued_add(sk, nskb->truesize);
sk_mem_charge(sk, nskb->truesize);
skb = tcp_send_head(sk);
skb_copy_decrypted(nskb, skb);
mptcp_skb_ext_copy(nskb, skb);
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
TCP_SKB_CB(nskb)->sacked = 0;
nskb->csum = 0;
nskb->ip_summed = CHECKSUM_PARTIAL;
tcp_insert_write_queue_before(nskb, skb, sk);
tcp_highest_sack_replace(sk, skb, nskb);
len = 0;
tcp_for_write_queue_from_safe(skb, next, sk) {
copy = min_t(int, skb->len, probe_size - len);
skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
if (skb->len <= copy) {
/* We've eaten all the data from this skb.
* Throw it away. */
TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
/* If this is the last SKB we copy and eor is set
* we need to propagate it to the new skb.
*/
TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
tcp_skb_collapse_tstamp(nskb, skb);
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
} else {
TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
~(TCPHDR_FIN|TCPHDR_PSH);
if (!skb_shinfo(skb)->nr_frags) {
skb_pull(skb, copy);
} else {
__pskb_trim_head(skb, copy);
tcp_set_skb_tso_segs(skb, mss_now);
}
TCP_SKB_CB(skb)->seq += copy;
}
len += copy;
if (len >= probe_size)
break;
}
tcp_init_tso_segs(nskb, nskb->len);
/* We're ready to send. If this fails, the probe will
* be resegmented into mss-sized pieces by tcp_write_xmit().
*/
if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
/* Decrement cwnd here because we are sending
* effectively two packets. */
tp->snd_cwnd--;
tcp_event_new_data_sent(sk, nskb);
icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
return 1;
}
return -1;
}
static bool tcp_pacing_check(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (!tcp_needs_internal_pacing(sk))
return false;
if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
return false;
if (!hrtimer_is_queued(&tp->pacing_timer)) {
hrtimer_start(&tp->pacing_timer,
ns_to_ktime(tp->tcp_wstamp_ns),
HRTIMER_MODE_ABS_PINNED_SOFT);
sock_hold(sk);
}
return true;
}
/* TCP Small Queues :
* Control number of packets in qdisc/devices to two packets / or ~1 ms.
* (These limits are doubled for retransmits)
* This allows for :
* - better RTT estimation and ACK scheduling
* - faster recovery
* - high rates
* Alas, some drivers / subsystems require a fair amount
* of queued bytes to ensure line rate.
* One example is wifi aggregation (802.11 AMPDU)
*/
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
unsigned int factor)
{
unsigned long limit;
limit = max_t(unsigned long,
2 * skb->truesize,
sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
if (sk->sk_pacing_status == SK_PACING_NONE)
limit = min_t(unsigned long, limit,
sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
tcp_sk(sk)->tcp_tx_delay) {
u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
* approximate our needs assuming an ~100% skb->truesize overhead.
* USEC_PER_SEC is approximated by 2^20.
* do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
*/
extra_bytes >>= (20 - 1);
limit += extra_bytes;
}
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
/* Always send skb if rtx queue is empty.
* No need to wait for TX completion to call us back,
* after softirq/tasklet schedule.
* This helps when TX completions are delayed too much.
*/
if (tcp_rtx_queue_empty(sk))
return false;
set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED, so we must
* test again the condition.
*/
smp_mb__after_atomic();
if (refcount_read(&sk->sk_wmem_alloc) > limit)
return true;
}
return false;
}
static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
{
const u32 now = tcp_jiffies32;
enum tcp_chrono old = tp->chrono_type;
if (old > TCP_CHRONO_UNSPEC)
tp->chrono_stat[old - 1] += now - tp->chrono_start; tp->chrono_start = now;
tp->chrono_type = new;
}
void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
{
struct tcp_sock *tp = tcp_sk(sk);
/* If there are multiple conditions worthy of tracking in a
* chronograph then the highest priority enum takes precedence
* over the other conditions. So that if something "more interesting"
* starts happening, stop the previous chrono and start a new one.
*/
if (type > tp->chrono_type)
tcp_chrono_set(tp, type);
}
void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
{
struct tcp_sock *tp = tcp_sk(sk);
/* There are multiple conditions worthy of tracking in a
* chronograph, so that the highest priority enum takes
* precedence over the other conditions (see tcp_chrono_start).
* If a condition stops, we only stop chrono tracking if
* it's the "most interesting" or current chrono we are
* tracking and starts busy chrono if we have pending data.
*/
if (tcp_rtx_and_write_queues_empty(sk))
tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
else if (type == tp->chrono_type)
tcp_chrono_set(tp, TCP_CHRONO_BUSY);
}
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
*
* LARGESEND note: !tcp_urg_mode is overkill, only frames between
* snd_up-64k-mss .. snd_up cannot be large. However, taking into
* account rare use of URG, this is not a big flaw.
*
* Send at most one packet when push_one > 0. Temporarily ignore
* cwnd limit to force at most one packet out when push_one == 2.
* Returns true, if no segments are in flight and we have queued segments,
* but cannot send anything now because of SWS or another problem.
*/
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
bool is_cwnd_limited = false, is_rwnd_limited = false;
u32 max_segs;
sent_pkts = 0;
tcp_mstamp_refresh(tp);
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
if (!result) {
return false;
} else if (result > 0) {
sent_pkts = 1;
}
}
max_segs = tcp_tso_segs(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
tcp_init_tso_segs(skb, mss_now);
goto repair; /* Skip network transmission */
}
if (tcp_pacing_check(sk))
break;
tso_segs = tcp_init_tso_segs(skb, mss_now);
BUG_ON(!tso_segs);
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
if (push_one == 2)
/* Force out a loss probe pkt. */
cwnd_quota = 1;
else
break;
}
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
is_rwnd_limited = true;
break;
}
if (tso_segs == 1) {
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
nonagle : TCP_NAGLE_PUSH))))
break;
} else {
if (!push_one &&
tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
&is_rwnd_limited, max_segs))
break;
}
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
min_t(unsigned int,
cwnd_quota,
max_segs),
nonagle);
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
if (tcp_small_queue_check(sk, skb, 0))
break;
/* Argh, we hit an empty skb(), presumably a thread
* is sleeping in sendmsg()/sk_stream_wait_memory().
* We do not want to send a pure-ack packet and have
* a strange looking rtx queue with empty packet(s).
*/
if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
break;
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
repair:
/* Advance the send_head. This one is sent out.
* This call will increment packets_out.
*/
tcp_event_new_data_sent(sk, skb);
tcp_minshall_update(tp, mss_now, skb);
sent_pkts += tcp_skb_pcount(skb);
if (push_one)
break;
}
if (is_rwnd_limited)
tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
else
tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
if (likely(sent_pkts || is_cwnd_limited))
tcp_cwnd_validate(sk, is_cwnd_limited);
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
tcp_schedule_loss_probe(sk, false);
return false;
}
return !tp->packets_out && !tcp_write_queue_empty(sk);
}
bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout, rto_delta_us;
int early_retrans;
/* Don't do any loss probe on a Fast Open connection before 3WHS
* finishes.
*/
if (rcu_access_pointer(tp->fastopen_rsk))
return false;
early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
/* Schedule a loss probe in 2*RTT for SACK capable connections
* not in loss recovery, that are either limited by cwnd or application.
*/
if ((early_retrans != 3 && early_retrans != 4) ||
!tp->packets_out || !tcp_is_sack(tp) ||
(icsk->icsk_ca_state != TCP_CA_Open &&
icsk->icsk_ca_state != TCP_CA_CWR))
return false;
/* Probe timeout is 2*rtt. Add minimum RTO to account
* for delayed ack when there's one outstanding packet. If no RTT
* sample is available then probe after TCP_TIMEOUT_INIT.
*/
if (tp->srtt_us) {
timeout = usecs_to_jiffies(tp->srtt_us >> 2);
if (tp->packets_out == 1)
timeout += TCP_RTO_MIN;
else
timeout += TCP_TIMEOUT_MIN;
} else {
timeout = TCP_TIMEOUT_INIT;
}
/* If the RTO formula yields an earlier time, then use that time. */
rto_delta_us = advancing_rto ?
jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
tcp_rto_delta_us(sk); /* How far in future is RTO? */
if (rto_delta_us > 0)
timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
return true;
}
/* Thanks to skb fast clones, we can detect if a prior transmit of
* a packet is still in a qdisc or driver queue.
* In this case, there is very little point doing a retransmit !
*/
static bool skb_still_in_host_queue(struct sock *sk,
const struct sk_buff *skb)
{
if (unlikely(skb_fclone_busy(sk, skb))) {
set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
smp_mb__after_atomic();
if (skb_fclone_busy(sk, skb)) {
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
return true;
}
}
return false;
}
/* When probe timeout (PTO) fires, try send a new segment if possible, else
* retransmit the last segment.
*/
void tcp_send_loss_probe(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int pcount;
int mss = tcp_current_mss(sk);
/* At most one outstanding TLP */
if (tp->tlp_high_seq)
goto rearm_timer;
tp->tlp_retrans = 0;
skb = tcp_send_head(sk);
if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
pcount = tp->packets_out;
tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
if (tp->packets_out > pcount)
goto probe_sent;
goto rearm_timer;
}
skb = skb_rb_last(&sk->tcp_rtx_queue);
if (unlikely(!skb)) {
WARN_ONCE(tp->packets_out,
"invalid inflight: %u state %u cwnd %u mss %d\n",
tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
inet_csk(sk)->icsk_pending = 0;
return;
}
if (skb_still_in_host_queue(sk, skb))
goto rearm_timer;
pcount = tcp_skb_pcount(skb);
if (WARN_ON(!pcount))
goto rearm_timer;
if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
(pcount - 1) * mss, mss,
GFP_ATOMIC)))
goto rearm_timer;
skb = skb_rb_next(skb);
}
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
goto rearm_timer;
if (__tcp_retransmit_skb(sk, skb, 1))
goto rearm_timer;
tp->tlp_retrans = 1;
probe_sent:
/* Record snd_nxt for loss detection. */
tp->tlp_high_seq = tp->snd_nxt;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
/* Reset s.t. tcp_rearm_rto will restart timer from now */
inet_csk(sk)->icsk_pending = 0;
rearm_timer:
tcp_rearm_rto(sk);
}
/* Push out any pending frames which were held back due to
* TCP_CORK or attempt at coalescing tiny packets.
* The socket must be locked by the caller.
*/
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
int nonagle)
{
/* If we are closed, the bytes will have to remain here.
* In time closedown will finish, we empty the write queue and
* all will be happy.
*/
if (unlikely(sk->sk_state == TCP_CLOSE))
return;
if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
sk_gfp_mask(sk, GFP_ATOMIC)))
tcp_check_probe_timer(sk);
}
/* Send _single_ skb sitting at the send head. This function requires
* true push pending frames to setup probe timer etc.
*/
void tcp_push_one(struct sock *sk, unsigned int mss_now)
{
struct sk_buff *skb = tcp_send_head(sk);
BUG_ON(!skb || skb->len < mss_now);
tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
}
/* This function returns the amount that we can raise the
* usable window based on the following constraints
*
* 1. The window can never be shrunk once it is offered (RFC 793)
* 2. We limit memory per socket
*
* RFC 1122:
* "the suggested [SWS] avoidance algorithm for the receiver is to keep
* RECV.NEXT + RCV.WIN fixed until:
* RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
*
* i.e. don't raise the right edge of the window until you can raise
* it at least MSS bytes.
*
* Unfortunately, the recommended algorithm breaks header prediction,
* since header prediction assumes th->window stays fixed.
*
* Strictly speaking, keeping th->window fixed violates the receiver
* side SWS prevention criteria. The problem is that under this rule
* a stream of single byte packets will cause the right side of the
* window to always advance by a single byte.
*
* Of course, if the sender implements sender side SWS prevention
* then this will not be a problem.
*
* BSD seems to make the following compromise:
*
* If the free space is less than the 1/4 of the maximum
* space available and the free space is less than 1/2 mss,
* then set the window to 0.
* [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
* Otherwise, just prevent the window from shrinking
* and from being larger than the largest representable value.
*
* This prevents incremental opening of the window in the regime
* where TCP is limited by the speed of the reader side taking
* data out of the TCP receive queue. It does nothing about
* those cases where the window is constrained on the sender side
* because the pipeline is full.
*
* BSD also seems to "accidentally" limit itself to windows that are a
* multiple of MSS, at least until the free space gets quite small.
* This would appear to be a side effect of the mbuf implementation.
* Combining these two algorithms results in the observed behavior
* of having a fixed window size at almost all times.
*
* Below we obtain similar behavior by forcing the offered window to
* a multiple of the mss when it is feasible to do so.
*
* Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
* Regular options like TIMESTAMP are taken into account.
*/
u32 __tcp_select_window(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* MSS for the peer's data. Previous versions used mss_clamp
* here. I don't know if the value based on our guesses
* of peer's MSS is better for the performance. It's more correct
* but may be worse for the performance because of rcv_mss
* fluctuations. --SAW 1998/11/1
*/
int mss = icsk->icsk_ack.rcv_mss;
int free_space = tcp_space(sk);
int allowed_space = tcp_full_space(sk);
int full_space, window;
if (sk_is_mptcp(sk))
mptcp_space(sk, &free_space, &allowed_space);
full_space = min_t(int, tp->window_clamp, allowed_space);
if (unlikely(mss > full_space)) {
mss = full_space;
if (mss <= 0)
return 0;
}
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh,
4U * tp->advmss);
/* free_space might become our new window, make sure we don't
* increase it due to wscale.
*/
free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
/* if free space is less than mss estimate, or is below 1/16th
* of the maximum allowed, try to move to zero-window, else
* tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
* new incoming data is dropped due to memory limits.
* With large window, mss test triggers way too late in order
* to announce zero window in time before rmem limit kicks in.
*/
if (free_space < (allowed_space >> 4) || free_space < mss)
return 0;
}
if (free_space > tp->rcv_ssthresh)
free_space = tp->rcv_ssthresh;
/* Don't do rounding if we are using window scaling, since the
* scaled window will not line up with the MSS boundary anyway.
*/
if (tp->rx_opt.rcv_wscale) {
window = free_space;
/* Advertise enough space so that it won't get scaled away.
* Import case: prevent zero window announcement if
* 1<<rcv_wscale > mss.
*/
window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
} else {
window = tp->rcv_wnd;
/* Get the largest window that is a nice multiple of mss.
* Window clamp already applied above.
* If our current window offering is within 1 mss of the
* free space we just keep it. This prevents the divide
* and multiply from happening most of the time.
* We also don't do any window rounding when the free space
* is too small.
*/
if (window <= free_space - mss || window > free_space)
window = rounddown(free_space, mss);
else if (mss == full_space &&
free_space > window + (full_space >> 1))
window = free_space;
}
return window;
}
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
const struct sk_buff *next_skb)
{
if (unlikely(tcp_has_tx_tstamp(next_skb))) {
const struct skb_shared_info *next_shinfo =
skb_shinfo(next_skb);
struct skb_shared_info *shinfo = skb_shinfo(skb);
shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
shinfo->tskey = next_shinfo->tskey;
TCP_SKB_CB(skb)->txstamp_ack |=
TCP_SKB_CB(next_skb)->txstamp_ack;
}
}
/* Collapses two adjacent SKB's during retransmission. */
static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = skb_rb_next(skb);
int next_skb_size;
next_skb_size = next_skb->len;
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
if (next_skb_size) {
if (next_skb_size <= skb_availroom(skb))
skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
next_skb_size);
else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
return false;
}
tcp_highest_sack_replace(sk, next_skb, skb);
/* Update sequence range on original skb. */
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
/* Merge over control information. This moves PSH/FIN etc. over */
TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
/* All done, get rid of second SKB and account for it so
* packet counting does not break.
*/
TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
/* changed transmit queue under us so clear hints */
tcp_clear_retrans_hints_partial(tp);
if (next_skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = skb;
tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
tcp_skb_collapse_tstamp(skb, next_skb);
tcp_rtx_queue_unlink_and_free(next_skb, sk);
return true;
}
/* Check if coalescing SKBs is legal. */
static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
{
if (tcp_skb_pcount(skb) > 1)
return false;
if (skb_cloned(skb))
return false;
/* Some heuristics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
return false;
return true;
}
/* Collapse packets in the retransmit queue to make to create
* less packets on the wire. This is only done on retransmission.
*/
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
int space)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = to, *tmp;
bool first = true;
if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
return;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
return;
skb_rbtree_walk_from_safe(skb, tmp) {
if (!tcp_can_collapse(sk, skb))
break;
if (!tcp_skb_can_collapse(to, skb))
break;
space -= skb->len;
if (first) {
first = false;
continue;
}
if (space < 0)
break;
if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
break;
if (!tcp_collapse_retrans(sk, to))
break;
}
}
/* This retransmits one SKB. Policy decisions and retransmit queue
* state updates are done by the caller. Returns non-zero if an
* error occurred which prevented the send.
*/
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned int cur_mss;
int diff, len, err;
/* Inconclusive MTU probe */
if (icsk->icsk_mtup.probe_size)
icsk->icsk_mtup.probe_size = 0;
if (skb_still_in_host_queue(sk, skb))
return -EBUSY;
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
WARN_ON_ONCE(1);
return -EINVAL;
}
if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
return -ENOMEM;
}
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
cur_mss = tcp_current_mss(sk);
/* If receiver has shrunk his window, and skb is out of
* new window, do not retransmit it. The exception is the
* case, when window is shrunk to zero. In this case
* our retransmit serves as a zero window probe.
*/
if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
TCP_SKB_CB(skb)->seq != tp->snd_una)
return -EAGAIN;
len = cur_mss * segs;
if (skb->len > len) {
if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
cur_mss, GFP_ATOMIC))
return -ENOMEM; /* We'll try again later. */
} else {
if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
return -ENOMEM;
diff = tcp_skb_pcount(skb);
tcp_set_skb_tso_segs(skb, cur_mss);
diff -= tcp_skb_pcount(skb);
if (diff)
tcp_adjust_pcount(sk, skb, diff);
if (skb->len < cur_mss)
tcp_retrans_try_collapse(sk, skb, cur_mss);
}
/* RFC3168, section 6.1.1.1. ECN fallback */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
tcp_ecn_clear_syn(sk, skb);
/* Update global and local TCP statistics. */
segs = tcp_skb_pcount(skb);
TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
tp->total_retrans += segs;
tp->bytes_retrans += skb->len;
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
* beyond what csum_start can cover.
*/
if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
skb_headroom(skb) >= 0xFFFF)) {
struct sk_buff *nskb;
tcp_skb_tsorted_save(skb) {
nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
if (nskb) {
nskb->dev = NULL;
err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
} else {
err = -ENOBUFS;
}
} tcp_skb_tsorted_restore(skb);
if (!err) {
tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
tcp_rate_skb_sent(sk, skb);
}
} else {
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
/* To avoid taking spuriously low RTT samples based on a timestamp
* for a transmit that never happened, always mark EVER_RETRANS
*/
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
TCP_SKB_CB(skb)->seq, segs, err);
if (likely(!err)) {
trace_tcp_retransmit_skb(sk, skb);
} else if (err != -EBUSY) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
}
return err;
}
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
struct tcp_sock *tp = tcp_sk(sk);
int err = __tcp_retransmit_skb(sk, skb, segs);
if (err == 0) {
#if FASTRETRANS_DEBUG > 0
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
net_dbg_ratelimited("retrans_out leaked\n");
}
#endif
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb);
}
/* Save stamp of the first (attempted) retransmit. */
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb);
if (tp->undo_retrans < 0)
tp->undo_retrans = 0;
tp->undo_retrans += tcp_skb_pcount(skb);
return err;
}
/* This gets called after a retransmit timeout, and the initially
* retransmitted data is acknowledged. It tries to continue
* resending the rest of the retransmit queue, until either
* we've sent it all or the congestion window limit is reached.
*/
void tcp_xmit_retransmit_queue(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb, *rtx_head, *hole = NULL;
struct tcp_sock *tp = tcp_sk(sk);
bool rearm_timer = false;
u32 max_segs;
int mib_idx;
if (!tp->packets_out)
return;
rtx_head = tcp_rtx_queue_head(sk);
skb = tp->retransmit_skb_hint ?: rtx_head;
max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
skb_rbtree_walk_from(skb) {
__u8 sacked;
int segs;
if (tcp_pacing_check(sk))
break;
/* we could do better than to assign each time */
if (!hole)
tp->retransmit_skb_hint = skb;
segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
if (segs <= 0)
break;
sacked = TCP_SKB_CB(skb)->sacked;
/* In case tcp_shift_skb_data() have aggregated large skbs,
* we need to make sure not sending too bigs TSO packets
*/
segs = min_t(int, segs, max_segs);
if (tp->retrans_out >= tp->lost_out) {
break;
} else if (!(sacked & TCPCB_LOST)) {
if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb;
continue;
} else {
if (icsk->icsk_ca_state != TCP_CA_Loss)
mib_idx = LINUX_MIB_TCPFASTRETRANS;
else
mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
}
if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
continue;
if (tcp_small_queue_check(sk, skb, 1))
break;
if (tcp_retransmit_skb(sk, skb, segs))
break;
NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
if (skb == rtx_head &&
icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
rearm_timer = true;
}
if (rearm_timer)
tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
TCP_RTO_MAX);
}
/* We allow to exceed memory limits for FIN packets to expedite
* connection tear down and (memory) recovery.
* Otherwise tcp_send_fin() could be tempted to either delay FIN
* or even be forced to close flow without any FIN.
* In general, we want to allow one skb per socket to avoid hangs
* with edge trigger epoll()
*/
void sk_forced_mem_schedule(struct sock *sk, int size)
{
int amt;
if (size <= sk->sk_forward_alloc)
return;
amt = sk_mem_pages(size);
sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
sk_memory_allocated_add(sk, amt);
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
mem_cgroup_charge_skmem(sk->sk_memcg, amt,
gfp_memcg_charge() | __GFP_NOFAIL);
}
/* Send a FIN. The caller locks the socket for us.
* We should try to send a FIN packet really hard, but eventually give up.
*/
void tcp_send_fin(struct sock *sk)
{
struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* Optimization, tack on the FIN if we have one skb in write queue and
* this skb was not yet sent, or we are under memory pressure.
* Note: in the latter case, FIN packet will be sent after a timeout,
* as TCP stack thinks it has already been transmitted.
*/
tskb = tail;
if (!tskb && tcp_under_memory_pressure(sk))
tskb = skb_rb_last(&sk->tcp_rtx_queue);
if (tskb) {
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
if (!tail) {
/* This means tskb was already sent.
* Pretend we included the FIN on previous transmit.
* We need to set tp->snd_nxt to the value it would have
* if FIN had been sent. This is because retransmit path
* does not change tp->snd_nxt.
*/
WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
return;
}
} else {
skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
if (unlikely(!skb))
return;
INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
skb_reserve(skb, MAX_TCP_HEADER);
sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
/* We get here when a process closes a file descriptor (either due to
* an explicit close() or as a byproduct of exit()'ing) and there
* was unread data in the receive queue. This behavior is recommended
* by RFC 2525, section 2.17. -DaveM
*/
void tcp_send_active_reset(struct sock *sk, gfp_t priority)
{
struct sk_buff *skb;
TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
/* NOTE: No TCP options attached and we never retransmit this. */
skb = alloc_skb(MAX_TCP_HEADER, priority);
if (!skb) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
return;
}
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
TCPHDR_ACK | TCPHDR_RST);
tcp_mstamp_refresh(tcp_sk(sk));
/* Send it off. */
if (tcp_transmit_skb(sk, skb, 0, priority))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
* skb here is different to the troublesome skb, so use NULL
*/
trace_tcp_send_reset(sk, NULL);
}
/* Send a crossed SYN-ACK during socket establishment.
* WARNING: This routine must only be called when we have already sent
* a SYN packet that crossed the incoming SYN that caused this routine
* to get called. If this assumption fails then the initial rcv_wnd
* and rcv_wscale values will not be correct.
*/
int tcp_send_synack(struct sock *sk)
{
struct sk_buff *skb;
skb = tcp_rtx_queue_head(sk);
if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
pr_err("%s: wrong queue state\n", __func__);
return -EFAULT;
}
if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
if (skb_cloned(skb)) {
struct sk_buff *nskb;
tcp_skb_tsorted_save(skb) {
nskb = skb_copy(skb, GFP_ATOMIC);
} tcp_skb_tsorted_restore(skb);
if (!nskb)
return -ENOMEM;
INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
tcp_highest_sack_replace(sk, skb, nskb);
tcp_rtx_queue_unlink_and_free(skb, sk);
__skb_header_release(nskb);
tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
sk_wmem_queued_add(sk, nskb->truesize);
sk_mem_charge(sk, nskb->truesize);
skb = nskb;
}
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
tcp_ecn_send_synack(sk, skb);
}
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
/**
* tcp_make_synack - Allocate one skb and build a SYNACK packet.
* @sk: listener socket
* @dst: dst entry attached to the SYNACK. It is consumed and caller
* should not use it again.
* @req: request_sock pointer
* @foc: cookie for tcp fast open
* @synack_type: Type of synack to prepare
* @syn_skb: SYN packet just received. It could be NULL for rtx case.
*/
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
enum tcp_synack_type synack_type,
struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *md5 = NULL;
struct tcp_out_options opts;
struct sk_buff *skb;
int tcp_header_size;
struct tcphdr *th;
int mss;
u64 now;
skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (unlikely(!skb)) {
dst_release(dst);
return NULL;
}
/* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER);
switch (synack_type) {
case TCP_SYNACK_NORMAL:
skb_set_owner_w(skb, req_to_sk(req));
break;
case TCP_SYNACK_COOKIE:
/* Under synflood, we do not attach skb to a socket,
* to avoid false sharing.
*/
break;
case TCP_SYNACK_FASTOPEN:
/* sk is a const pointer, because we want to express multiple
* cpu might call us concurrently.
* sk->sk_wmem_alloc in an atomic, we can promote to rw.
*/
skb_set_owner_w(skb, (struct sock *)sk);
break;
}
skb_dst_set(skb, dst);
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
memset(&opts, 0, sizeof(opts));
now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES
if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
else
#endif
{
skb->skb_mstamp_ns = now;
if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
}
#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
/* bpf program will be interested in the tcp_flags */
TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
foc, synack_type,
syn_skb) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
th = (struct tcphdr *)skb->data;
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
tcp_ecn_make_synack(req, th);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
skb->mark = ireq->ir_mark;
skb->ip_summed = CHECKSUM_PARTIAL;
th->seq = htonl(tcp_rsk(req)->snt_isn);
/* XXX data is queued and acked as is. No buffer/window check */
th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(min(req->rsk_rcv_wnd, 65535U));
tcp_options_write((__be32 *)(th + 1), NULL, &opts);
th->doff = (tcp_header_size >> 2);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
if (md5)
tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
md5, req_to_sk(req), skb);
rcu_read_unlock();
#endif
bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
synack_type, &opts);
skb->skb_mstamp_ns = now;
tcp_add_tx_delay(skb, tp);
return skb;
}
EXPORT_SYMBOL(tcp_make_synack);
static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_congestion_ops *ca;
u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
if (ca_key == TCP_CA_UNSPEC)
return;
rcu_read_lock();
ca = tcp_ca_find_key(ca_key);
if (likely(ca && bpf_try_module_get(ca, ca->owner))) { bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
icsk->icsk_ca_ops = ca;
}
rcu_read_unlock();
}
/* Do all connect socket setups that can be done AF independent. */
static void tcp_connect_init(struct sock *sk)
{
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
u32 rcv_wnd;
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
*/
tp->tcp_header_len = sizeof(struct tcphdr);
if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
#ifdef CONFIG_TCP_MD5SIG
if (tp->af_specific->md5_lookup(sk, sk)) tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
/* If user gave his TCP_MAXSEG, record it to clamp */
if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; tp->max_window = 0;
tcp_mtup_init(sk);
tcp_sync_mss(sk, dst_mtu(dst));
tcp_ca_dst_init(sk, dst);
if (!tp->window_clamp)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
tcp_initialize_rcv_mss(sk);
/* limit the window selection if the user enforce a smaller rx buffer */
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) tp->window_clamp = tcp_full_space(sk);
rcv_wnd = tcp_rwnd_init_bpf(sk);
if (rcv_wnd == 0)
rcv_wnd = dst_metric(dst, RTAX_INITRWND);
tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
&tp->window_clamp,
sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
&rcv_wscale,
rcv_wnd);
tp->rx_opt.rcv_wscale = rcv_wscale;
tp->rcv_ssthresh = tp->rcv_wnd;
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->snd_wnd = 0;
tcp_init_wl(tp, 0);
tcp_write_queue_purge(sk);
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
WRITE_ONCE(tp->snd_nxt, tp->write_seq);
if (likely(!tp->repair))
tp->rcv_nxt = 0;
else
tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt;
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
inet_csk(sk)->icsk_retransmits = 0;
tcp_clear_retrans(tp);
}
static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
tcb->end_seq += skb->len;
__skb_header_release(skb);
sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
WRITE_ONCE(tp->write_seq, tcb->end_seq);
tp->packets_out += tcp_skb_pcount(skb);
}
/* Build and send a SYN with data and (cached) Fast Open cookie. However,
* queue a data-only packet after the regular SYN, such that regular SYNs
* are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
* only the SYN sequence, the data are retransmitted in the first ACK.
* If cookie is not cached or other error occurs, falls back to send a
* regular SYN with Fast Open cookie request option.
*/
static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
int space, err = 0;
struct sk_buff *syn_data;
tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
goto fallback;
/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
* user-MSS. Reserve maximum option space for middleboxes that add
* private TCP options. The cost is reduced data space in SYN :(
*/
tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
/* Sync mss_cache after updating the mss_clamp */
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
MAX_TCP_OPTION_SPACE;
space = min_t(size_t, space, fo->size);
/* limit to order-0 allocations */
space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
if (!syn_data)
goto fallback;
syn_data->ip_summed = CHECKSUM_PARTIAL;
memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
if (space) {
int copied = copy_from_iter(skb_put(syn_data, space), space,
&fo->data->msg_iter);
if (unlikely(!copied)) {
tcp_skb_tsorted_anchor_cleanup(syn_data);
kfree_skb(syn_data);
goto fallback;
}
if (copied != space) { skb_trim(syn_data, copied);
space = copied;
}
skb_zcopy_set(syn_data, fo->uarg, NULL);
}
/* No more data pending in inet_wait_for_connect() */
if (space == fo->size) fo->data = NULL; fo->copied = space;
tcp_connect_queue_skb(sk, syn_data);
if (syn_data->len)
tcp_chrono_start(sk, TCP_CHRONO_BUSY);
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
/* Now full SYN+DATA was cloned and sent (or not),
* remove the SYN from the original skb (syn_data)
* we keep in write queue in case of a retransmit, as we
* also have the SYN packet (with no data) in the same queue.
*/
TCP_SKB_CB(syn_data)->seq++;
TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
if (!err) {
tp->syn_data = (fo->copied > 0);
tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
goto done;
}
/* data was not sent, put it in write_queue */
__skb_queue_tail(&sk->sk_write_queue, syn_data);
tp->packets_out -= tcp_skb_pcount(syn_data);
fallback:
/* Send a regular SYN with Fast Open cookie request option */
if (fo->cookie.len > 0) fo->cookie.len = 0; err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
if (err)
tp->syn_fastopen = 0;
done:
fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
return err;
}
/* Build a SYN and send it off. */
int tcp_connect(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
int err;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
tcp_connect_init(sk);
if (unlikely(tp->repair)) {
tcp_finish_connect(sk, NULL);
return 0;
}
buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
if (unlikely(!buff))
return -ENOBUFS;
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
tcp_mstamp_refresh(tp);
tp->retrans_stamp = tcp_time_stamp(tp);
tcp_connect_queue_skb(sk, buff);
tcp_ecn_send_syn(sk, buff);
tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
/* Send off SYN; include data in Fast Open. */
err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); if (err == -ECONNREFUSED)
return err;
/* We change tp->snd_nxt after the tcp_transmit_skb() call
* in order to make this packet get counted in tcpOutSegs.
*/
WRITE_ONCE(tp->snd_nxt, tp->write_seq);
tp->pushed_seq = tp->write_seq;
buff = tcp_send_head(sk);
if (unlikely(buff)) { WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
tp->pushed_seq = TCP_SKB_CB(buff)->seq;
}
TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
return 0;
}
EXPORT_SYMBOL(tcp_connect);
/* Send out a delayed ack, the caller does the policy checking
* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
* for details.
*/
void tcp_send_delayed_ack(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int ato = icsk->icsk_ack.ato;
unsigned long timeout;
if (ato > TCP_DELACK_MIN) {
const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ / 2;
if (inet_csk_in_pingpong_mode(sk) ||
(icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
max_ato = TCP_DELACK_MAX;
/* Slow path, intersegment interval is "high". */
/* If some rtt estimate is known, use it to bound delayed ack.
* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
* directly.
*/
if (tp->srtt_us) {
int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
TCP_DELACK_MIN);
if (rtt < max_ato)
max_ato = rtt;
}
ato = min(ato, max_ato);
}
ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
/* Stay within the limit we were given */
timeout = jiffies + ato;
/* Use new timeout only if there wasn't a older one earlier. */
if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
/* If delack timer is about to expire, send ACK now. */
if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
tcp_send_ack(sk);
return;
}
if (!time_before(timeout, icsk->icsk_ack.timeout))
timeout = icsk->icsk_ack.timeout;
}
icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
icsk->icsk_ack.timeout = timeout;
sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
/* This routine sends an ack and also updates the window. */
void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
{
struct sk_buff *buff;
/* If we have been reset, we may not send again. */
if (sk->sk_state == TCP_CLOSE)
return;
/* We are not putting this on the write queue, so
* tcp_transmit_skb() will set the ownership to this
* sock.
*/
buff = alloc_skb(MAX_TCP_HEADER,
sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
if (unlikely(!buff)) {
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned long delay;
delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
if (delay < TCP_RTO_MAX)
icsk->icsk_ack.retry++;
inet_csk_schedule_ack(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
return;
}
/* Reserve space for headers and prepare control bits. */
skb_reserve(buff, MAX_TCP_HEADER);
tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
*/
skb_set_tcp_pure_ack(buff);
/* Send it off, this clears delayed acks for us. */
__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
}
EXPORT_SYMBOL_GPL(__tcp_send_ack);
void tcp_send_ack(struct sock *sk)
{
__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
}
/* This routine sends a packet with an out of date sequence
* number. It assumes the other end will try to ack it.
*
* Question: what should we make while urgent mode?
* 4.4BSD forces sending single byte of data. We cannot send
* out of window data, because we have SND.NXT==SND.MAX...
*
* Current solution: to send TWO zero-length segments in urgent mode:
* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
* out-of-date with SND.UNA-1 to probe window.
*/
static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
/* We don't queue it, tcp_transmit_skb() sets ownership. */
skb = alloc_skb(MAX_TCP_HEADER,
sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
if (!skb)
return -1;
/* Reserve space for headers and set control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
/* Use a previous sequence. This should cause the other
* end to send an ack. Don't queue or clone SKB, just
* send it.
*/
tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
NET_INC_STATS(sock_net(sk), mib);
return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}
/* Called from setsockopt( ... TCP_REPAIR ) */
void tcp_send_window_probe(struct sock *sk)
{
if (sk->sk_state == TCP_ESTABLISHED) {
tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
tcp_mstamp_refresh(tcp_sk(sk));
tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
}
}
/* Initiate keepalive or window probe from timer. */
int tcp_write_wakeup(struct sock *sk, int mib)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
if (sk->sk_state == TCP_CLOSE)
return -1;
skb = tcp_send_head(sk);
if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
int err;
unsigned int mss = tcp_current_mss(sk);
unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
/* We are probing the opening of a window
* but the window size is != 0
* must have been a result SWS avoidance ( sender )
*/
if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
skb->len > mss) {
seg_size = min(seg_size, mss);
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
skb, seg_size, mss, GFP_ATOMIC))
return -1;
} else if (!tcp_skb_pcount(skb))
tcp_set_skb_tso_segs(skb, mss);
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
if (!err)
tcp_event_new_data_sent(sk, skb);
return err;
} else {
if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
tcp_xmit_probe_skb(sk, 1, mib);
return tcp_xmit_probe_skb(sk, 0, mib);
}
}
/* A window probe timeout has occurred. If window is not closed send
* a partial packet else a zero probe.
*/
void tcp_send_probe0(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
unsigned long timeout;
int err;
err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
if (tp->packets_out || tcp_write_queue_empty(sk)) {
/* Cancel probe timer, if it is not required. */
icsk->icsk_probes_out = 0;
icsk->icsk_backoff = 0;
icsk->icsk_probes_tstamp = 0;
return;
}
icsk->icsk_probes_out++;
if (err <= 0) {
if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
icsk->icsk_backoff++;
timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
} else {
/* If packet was not sent due to local congestion,
* Let senders fight for local resources conservatively.
*/
timeout = TCP_RESOURCE_PROBE_INTERVAL;
}
timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
}
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
{
const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
struct flowi fl;
int res;
tcp_rsk(req)->txhash = net_tx_rndhash();
res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
NULL);
if (!res) {
__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
if (unlikely(tcp_passive_fastopen(sk)))
tcp_sk(sk)->total_retrans++;
trace_tcp_retransmit_synack(sk, req);
}
return res;
}
EXPORT_SYMBOL(tcp_rtx_synack);
/* SPDX-License-Identifier: GPL-2.0 */
/* rwsem.h: R/W semaphores, public interface
*
* Written by David Howells (dhowells@redhat.com).
* Derived from asm-i386/semaphore.h
*/
#ifndef _LINUX_RWSEM_H
#define _LINUX_RWSEM_H
#include <linux/linkage.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/err.h>
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define __RWSEM_DEP_MAP_INIT(lockname) \
.dep_map = { \
.name = #lockname, \
.wait_type_inner = LD_WAIT_SLEEP, \
},
#else
# define __RWSEM_DEP_MAP_INIT(lockname)
#endif
#ifndef CONFIG_PREEMPT_RT
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#include <linux/osq_lock.h>
#endif
/*
* For an uncontended rwsem, count and owner are the only fields a task
* needs to touch when acquiring the rwsem. So they are put next to each
* other to increase the chance that they will share the same cacheline.
*
* In a contended rwsem, the owner is likely the most frequently accessed
* field in the structure as the optimistic waiter that holds the osq lock
* will spin on owner. For an embedded rwsem, other hot fields in the
* containing structure should be moved further away from the rwsem to
* reduce the chance that they will share the same cacheline causing
* cacheline bouncing problem.
*/
struct rw_semaphore {
atomic_long_t count;
/*
* Write owner or one of the read owners as well flags regarding
* the current state of the rwsem. Can be used as a speculative
* check to see if the write owner is running on the cpu.
*/
atomic_long_t owner;
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
raw_spinlock_t wait_lock;
struct list_head wait_list;
#ifdef CONFIG_DEBUG_RWSEMS
void *magic;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
};
/* In all implementations count != 0 means locked */
static inline int rwsem_is_locked(struct rw_semaphore *sem)
{
return atomic_long_read(&sem->count) != 0;
}
#define RWSEM_UNLOCKED_VALUE 0L
#define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE)
/* Common initializer macros and functions */
#ifdef CONFIG_DEBUG_RWSEMS
# define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname,
#else
# define __RWSEM_DEBUG_INIT(lockname)
#endif
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED,
#else
#define __RWSEM_OPT_INIT(lockname)
#endif
#define __RWSEM_INITIALIZER(name) \
{ __RWSEM_COUNT_INIT(name), \
.owner = ATOMIC_LONG_INIT(0), \
__RWSEM_OPT_INIT(name) \
.wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\
.wait_list = LIST_HEAD_INIT((name).wait_list), \
__RWSEM_DEBUG_INIT(name) \
__RWSEM_DEP_MAP_INIT(name) }
#define DECLARE_RWSEM(name) \
struct rw_semaphore name = __RWSEM_INITIALIZER(name)
extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
struct lock_class_key *key);
#define init_rwsem(sem) \
do { \
static struct lock_class_key __key; \
\
__init_rwsem((sem), #sem, &__key); \
} while (0)
/*
* This is the same regardless of which rwsem implementation that is being used.
* It is just a heuristic meant to be called by somebody already holding the
* rwsem to see if somebody from an incompatible type is wanting access to the
* lock.
*/
static inline int rwsem_is_contended(struct rw_semaphore *sem)
{
return !list_empty(&sem->wait_list);
}
#else /* !CONFIG_PREEMPT_RT */
#include <linux/rwbase_rt.h>
struct rw_semaphore {
struct rwbase_rt rwbase;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
};
#define __RWSEM_INITIALIZER(name) \
{ \
.rwbase = __RWBASE_INITIALIZER(name), \
__RWSEM_DEP_MAP_INIT(name) \
}
#define DECLARE_RWSEM(lockname) \
struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
extern void __init_rwsem(struct rw_semaphore *rwsem, const char *name,
struct lock_class_key *key);
#define init_rwsem(sem) \
do { \
static struct lock_class_key __key; \
\
__init_rwsem((sem), #sem, &__key); \
} while (0)
static __always_inline int rwsem_is_locked(struct rw_semaphore *sem)
{
return rw_base_is_locked(&sem->rwbase);
}
static __always_inline int rwsem_is_contended(struct rw_semaphore *sem)
{
return rw_base_is_contended(&sem->rwbase);
}
#endif /* CONFIG_PREEMPT_RT */
/*
* The functions below are the same for all rwsem implementations including
* the RT specific variant.
*/
/*
* lock for reading
*/
extern void down_read(struct rw_semaphore *sem);
extern int __must_check down_read_interruptible(struct rw_semaphore *sem);
extern int __must_check down_read_killable(struct rw_semaphore *sem);
/*
* trylock for reading -- returns 1 if successful, 0 if contention
*/
extern int down_read_trylock(struct rw_semaphore *sem);
/*
* lock for writing
*/
extern void down_write(struct rw_semaphore *sem);
extern int __must_check down_write_killable(struct rw_semaphore *sem);
/*
* trylock for writing -- returns 1 if successful, 0 if contention
*/
extern int down_write_trylock(struct rw_semaphore *sem);
/*
* release a read lock
*/
extern void up_read(struct rw_semaphore *sem);
/*
* release a write lock
*/
extern void up_write(struct rw_semaphore *sem);
/*
* downgrade write lock to read lock
*/
extern void downgrade_write(struct rw_semaphore *sem);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* nested locking. NOTE: rwsems are not allowed to recurse
* (which occurs if the same task tries to acquire the same
* lock instance multiple times), but multiple locks of the
* same lock class might be taken, if the order of the locks
* is always the same. This ordering rule can be expressed
* to lockdep via the _nested() APIs, but enumerating the
* subclasses that are used. (If the nesting relationship is
* static then another method for expressing nested locking is
* the explicit definition of lock class keys and the use of
* lockdep_set_class() at lock initialization time.
* See Documentation/locking/lockdep-design.rst for more details.)
*/
extern void down_read_nested(struct rw_semaphore *sem, int subclass);
extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass);
extern void down_write_nested(struct rw_semaphore *sem, int subclass);
extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass);
extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock);
# define down_write_nest_lock(sem, nest_lock) \
do { \
typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
_down_write_nest_lock(sem, &(nest_lock)->dep_map); \
} while (0);
/*
* Take/release a lock when not the owner will release it.
*
* [ This API should be avoided as much as possible - the
* proper abstraction for this case is completions. ]
*/
extern void down_read_non_owner(struct rw_semaphore *sem);
extern void up_read_non_owner(struct rw_semaphore *sem);
#else
# define down_read_nested(sem, subclass) down_read(sem)
# define down_read_killable_nested(sem, subclass) down_read_killable(sem)
# define down_write_nest_lock(sem, nest_lock) down_write(sem)
# define down_write_nested(sem, subclass) down_write(sem)
# define down_write_killable_nested(sem, subclass) down_write_killable(sem)
# define down_read_non_owner(sem) down_read(sem)
# define up_read_non_owner(sem) up_read(sem)
#endif
#endif /* _LINUX_RWSEM_H */
// SPDX-License-Identifier: GPL-2.0
/*
* linux/mm/mempool.c
*
* memory buffer pool support. Such pools are mostly used
* for guaranteed, deadlock-free memory allocations during
* extreme VM load.
*
* started by Ingo Molnar, Copyright (C) 2001
* debugging by David Rientjes, Copyright (C) 2015
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kasan.h>
#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include "slab.h"
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
static void poison_error(mempool_t *pool, void *element, size_t size,
size_t byte)
{
const int nr = pool->curr_nr;
const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
int i;
pr_err("BUG: mempool element poison mismatch\n");
pr_err("Mempool %p size %zu\n", pool, size);
pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
for (i = start; i < end; i++)
pr_cont("%x ", *(u8 *)(element + i));
pr_cont("%s\n", end < size ? "..." : "");
dump_stack();
}
static void __check_element(mempool_t *pool, void *element, size_t size)
{
u8 *obj = element;
size_t i;
for (i = 0; i < size; i++) {
u8 exp = (i < size - 1) ? POISON_FREE : POISON_END;
if (obj[i] != exp) {
poison_error(pool, element, size, i);
return;
}
}
memset(obj, POISON_INUSE, size);
}
static void check_element(mempool_t *pool, void *element)
{
/* Mempools backed by slab allocator */
if (pool->free == mempool_free_slab || pool->free == mempool_kfree) {
__check_element(pool, element, ksize(element));
} else if (pool->free == mempool_free_pages) {
/* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
void *addr = kmap_atomic((struct page *)element);
__check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
kunmap_atomic(addr);
}
}
static void __poison_element(void *element, size_t size)
{
u8 *obj = element;
memset(obj, POISON_FREE, size - 1);
obj[size - 1] = POISON_END;
}
static void poison_element(mempool_t *pool, void *element)
{
/* Mempools backed by slab allocator */
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) {
__poison_element(element, ksize(element));
} else if (pool->alloc == mempool_alloc_pages) {
/* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
void *addr = kmap_atomic((struct page *)element);
__poison_element(addr, 1UL << (PAGE_SHIFT + order));
kunmap_atomic(addr);
}
}
#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
static inline void check_element(mempool_t *pool, void *element)
{
}
static inline void poison_element(mempool_t *pool, void *element)
{
}
#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_slab_free_mempool(element);
else if (pool->alloc == mempool_alloc_pages) kasan_poison_pages(element, (unsigned long)pool->pool_data,
false);
}
static void kasan_unpoison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_unpoison_range(element, __ksize(element));
else if (pool->alloc == mempool_alloc_pages)
kasan_unpoison_pages(element, (unsigned long)pool->pool_data,
false);
}
static __always_inline void add_element(mempool_t *pool, void *element)
{
BUG_ON(pool->curr_nr >= pool->min_nr);
poison_element(pool, element);
kasan_poison_element(pool, element);
pool->elements[pool->curr_nr++] = element;
}
static void *remove_element(mempool_t *pool)
{
void *element = pool->elements[--pool->curr_nr];
BUG_ON(pool->curr_nr < 0);
kasan_unpoison_element(pool, element);
check_element(pool, element);
return element;
}
/**
* mempool_exit - exit a mempool initialized with mempool_init()
* @pool: pointer to the memory pool which was initialized with
* mempool_init().
*
* Free all reserved elements in @pool and @pool itself. This function
* only sleeps if the free_fn() function sleeps.
*
* May be called on a zeroed but uninitialized mempool (i.e. allocated with
* kzalloc()).
*/
void mempool_exit(mempool_t *pool)
{
while (pool->curr_nr) {
void *element = remove_element(pool);
pool->free(element, pool->pool_data);
}
kfree(pool->elements);
pool->elements = NULL;
}
EXPORT_SYMBOL(mempool_exit);
/**
* mempool_destroy - deallocate a memory pool
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
*
* Free all reserved elements in @pool and @pool itself. This function
* only sleeps if the free_fn() function sleeps.
*/
void mempool_destroy(mempool_t *pool)
{
if (unlikely(!pool))
return;
mempool_exit(pool);
kfree(pool);
}
EXPORT_SYMBOL(mempool_destroy);
int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data,
gfp_t gfp_mask, int node_id)
{
spin_lock_init(&pool->lock);
pool->min_nr = min_nr;
pool->pool_data = pool_data;
pool->alloc = alloc_fn;
pool->free = free_fn;
init_waitqueue_head(&pool->wait);
pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
gfp_mask, node_id);
if (!pool->elements)
return -ENOMEM;
/*
* First pre-allocate the guaranteed number of buffers.
*/
while (pool->curr_nr < pool->min_nr) {
void *element;
element = pool->alloc(gfp_mask, pool->pool_data);
if (unlikely(!element)) {
mempool_exit(pool);
return -ENOMEM;
}
add_element(pool, element);
}
return 0;
}
EXPORT_SYMBOL(mempool_init_node);
/**
* mempool_init - initialize a memory pool
* @pool: pointer to the memory pool that should be initialized
* @min_nr: the minimum number of elements guaranteed to be
* allocated for this pool.
* @alloc_fn: user-defined element-allocation function.
* @free_fn: user-defined element-freeing function.
* @pool_data: optional private data available to the user-defined functions.
*
* Like mempool_create(), but initializes the pool in (i.e. embedded in another
* structure).
*
* Return: %0 on success, negative error code otherwise.
*/
int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
{
return mempool_init_node(pool, min_nr, alloc_fn, free_fn,
pool_data, GFP_KERNEL, NUMA_NO_NODE);
}
EXPORT_SYMBOL(mempool_init);
/**
* mempool_create - create a memory pool
* @min_nr: the minimum number of elements guaranteed to be
* allocated for this pool.
* @alloc_fn: user-defined element-allocation function.
* @free_fn: user-defined element-freeing function.
* @pool_data: optional private data available to the user-defined functions.
*
* this function creates and allocates a guaranteed size, preallocated
* memory pool. The pool can be used from the mempool_alloc() and mempool_free()
* functions. This function might sleep. Both the alloc_fn() and the free_fn()
* functions might sleep - as long as the mempool_alloc() function is not called
* from IRQ contexts.
*
* Return: pointer to the created memory pool object or %NULL on error.
*/
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
{
return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data,
GFP_KERNEL, NUMA_NO_NODE);
}
EXPORT_SYMBOL(mempool_create);
mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data,
gfp_t gfp_mask, int node_id)
{
mempool_t *pool;
pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
if (!pool)
return NULL;
if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data,
gfp_mask, node_id)) {
kfree(pool);
return NULL;
}
return pool;
}
EXPORT_SYMBOL(mempool_create_node);
/**
* mempool_resize - resize an existing memory pool
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
* @new_min_nr: the new minimum number of elements guaranteed to be
* allocated for this pool.
*
* This function shrinks/grows the pool. In the case of growing,
* it cannot be guaranteed that the pool will be grown to the new
* size immediately, but new mempool_free() calls will refill it.
* This function may sleep.
*
* Note, the caller must guarantee that no mempool_destroy is called
* while this function is running. mempool_alloc() & mempool_free()
* might be called (eg. from IRQ contexts) while this function executes.
*
* Return: %0 on success, negative error code otherwise.
*/
int mempool_resize(mempool_t *pool, int new_min_nr)
{
void *element;
void **new_elements;
unsigned long flags;
BUG_ON(new_min_nr <= 0);
might_sleep();
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
while (new_min_nr < pool->curr_nr) {
element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
pool->free(element, pool->pool_data);
spin_lock_irqsave(&pool->lock, flags);
}
pool->min_nr = new_min_nr;
goto out_unlock;
}
spin_unlock_irqrestore(&pool->lock, flags);
/* Grow the pool */
new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
GFP_KERNEL);
if (!new_elements)
return -ENOMEM;
spin_lock_irqsave(&pool->lock, flags);
if (unlikely(new_min_nr <= pool->min_nr)) {
/* Raced, other resize will do our work */
spin_unlock_irqrestore(&pool->lock, flags);
kfree(new_elements);
goto out;
}
memcpy(new_elements, pool->elements,
pool->curr_nr * sizeof(*new_elements));
kfree(pool->elements);
pool->elements = new_elements;
pool->min_nr = new_min_nr;
while (pool->curr_nr < pool->min_nr) {
spin_unlock_irqrestore(&pool->lock, flags);
element = pool->alloc(GFP_KERNEL, pool->pool_data);
if (!element)
goto out;
spin_lock_irqsave(&pool->lock, flags);
if (pool->curr_nr < pool->min_nr) {
add_element(pool, element);
} else {
spin_unlock_irqrestore(&pool->lock, flags);
pool->free(element, pool->pool_data); /* Raced */
goto out;
}
}
out_unlock:
spin_unlock_irqrestore(&pool->lock, flags);
out:
return 0;
}
EXPORT_SYMBOL(mempool_resize);
/**
* mempool_alloc - allocate an element from a specific memory pool
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
* @gfp_mask: the usual allocation bitmask.
*
* this function only sleeps if the alloc_fn() function sleeps or
* returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
* Note: using __GFP_ZERO is not supported.
*
* Return: pointer to the allocated element or %NULL on error.
*/
void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
void *element;
unsigned long flags;
wait_queue_entry_t wait;
gfp_t gfp_temp;
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */
gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
repeat_alloc:
element = pool->alloc(gfp_temp, pool->pool_data); if (likely(element != NULL))
return element;
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) {
element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
/* paired with rmb in mempool_free(), read comment there */
smp_wmb();
/*
* Update the allocation stack trace as this is more useful
* for debugging.
*/
kmemleak_update_trace(element);
return element;
}
/*
* We use gfp mask w/o direct reclaim or IO for the first round. If
* alloc failed with that and @pool was empty, retry immediately.
*/
if (gfp_temp != gfp_mask) {
spin_unlock_irqrestore(&pool->lock, flags);
gfp_temp = gfp_mask;
goto repeat_alloc;
}
/* We must not sleep if !__GFP_DIRECT_RECLAIM */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
spin_unlock_irqrestore(&pool->lock, flags);
return NULL;
}
/* Let's wait for someone else to return an element to @pool */
init_wait(&wait);
prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock_irqrestore(&pool->lock, flags);
/*
* FIXME: this should be io_schedule(). The timeout is there as a
* workaround for some DM problems in 2.6.18.
*/
io_schedule_timeout(5*HZ);
finish_wait(&pool->wait, &wait);
goto repeat_alloc;
}
EXPORT_SYMBOL(mempool_alloc);
/**
* mempool_free - return an element to the pool.
* @element: pool element pointer.
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
*
* this function only sleeps if the free_fn() function sleeps.
*/
void mempool_free(void *element, mempool_t *pool)
{
unsigned long flags;
if (unlikely(element == NULL))
return;
/*
* Paired with the wmb in mempool_alloc(). The preceding read is
* for @element and the following @pool->curr_nr. This ensures
* that the visible value of @pool->curr_nr is from after the
* allocation of @element. This is necessary for fringe cases
* where @element was passed to this task without going through
* barriers.
*
* For example, assume @p is %NULL at the beginning and one task
* performs "p = mempool_alloc(...);" while another task is doing
* "while (!p) cpu_relax(); mempool_free(p, ...);". This function
* may end up using curr_nr value which is from before allocation
* of @p without the following rmb.
*/
smp_rmb();
/*
* For correctness, we need a test which is guaranteed to trigger
* if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr
* without locking achieves that and refilling as soon as possible
* is desirable.
*
* Because curr_nr visible here is always a value after the
* allocation of @element, any task which decremented curr_nr below
* min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets
* incremented to min_nr afterwards. If curr_nr gets incremented
* to min_nr after the allocation of @element, the elements
* allocated after that are subject to the same guarantee.
*
* Waiters happen iff curr_nr is 0 and the above guarantee also
* ensures that there will be frees which return elements to the
* pool waking up the waiters.
*/
if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr < pool->min_nr)) {
add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
wake_up(&pool->wait);
return;
}
spin_unlock_irqrestore(&pool->lock, flags);
}
pool->free(element, pool->pool_data);
}
EXPORT_SYMBOL(mempool_free);
/*
* A commonly used alloc and free fn.
*/
void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
{
struct kmem_cache *mem = pool_data;
VM_BUG_ON(mem->ctor);
return kmem_cache_alloc(mem, gfp_mask);
}
EXPORT_SYMBOL(mempool_alloc_slab);
void mempool_free_slab(void *element, void *pool_data)
{
struct kmem_cache *mem = pool_data;
kmem_cache_free(mem, element);
}
EXPORT_SYMBOL(mempool_free_slab);
/*
* A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
* specified by pool_data
*/
void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t)pool_data;
return kmalloc(size, gfp_mask);
}
EXPORT_SYMBOL(mempool_kmalloc);
void mempool_kfree(void *element, void *pool_data)
{
kfree(element);
}
EXPORT_SYMBOL(mempool_kfree);
/*
* A simple mempool-backed page allocator that allocates pages
* of the order specified by pool_data.
*/
void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
{
int order = (int)(long)pool_data;
return alloc_pages(gfp_mask, order);
}
EXPORT_SYMBOL(mempool_alloc_pages);
void mempool_free_pages(void *element, void *pool_data)
{
int order = (int)(long)pool_data;
__free_pages(element, order);
}
EXPORT_SYMBOL(mempool_free_pages);
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright (C) 2001 Momchil Velikov
* Portions Copyright (C) 2001 Christoph Hellwig
* Copyright (C) 2006 Nick Piggin
* Copyright (C) 2012 Konstantin Khlebnikov
*/
#ifndef _LINUX_RADIX_TREE_H
#define _LINUX_RADIX_TREE_H
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/xarray.h>
#include <linux/local_lock.h>
/* Keep unconverted code working */
#define radix_tree_root xarray
#define radix_tree_node xa_node
struct radix_tree_preload {
local_lock_t lock;
unsigned nr;
/* nodes->parent points to next preallocated node */
struct radix_tree_node *nodes;
};
DECLARE_PER_CPU(struct radix_tree_preload, radix_tree_preloads);
/*
* The bottom two bits of the slot determine how the remaining bits in the
* slot are interpreted:
*
* 00 - data pointer
* 10 - internal entry
* x1 - value entry
*
* The internal entry may be a pointer to the next level in the tree, a
* sibling entry, or an indicator that the entry in this slot has been moved
* to another location in the tree and the lookup should be restarted. While
* NULL fits the 'data pointer' pattern, it means that there is no entry in
* the tree for this index (no matter what level of the tree it is found at).
* This means that storing a NULL entry in the tree is the same as deleting
* the entry from the tree.
*/
#define RADIX_TREE_ENTRY_MASK 3UL
#define RADIX_TREE_INTERNAL_NODE 2UL
static inline bool radix_tree_is_internal_node(void *ptr)
{
return ((unsigned long)ptr & RADIX_TREE_ENTRY_MASK) ==
RADIX_TREE_INTERNAL_NODE;
}
/*** radix-tree API starts here ***/
#define RADIX_TREE_MAP_SHIFT XA_CHUNK_SHIFT
#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
#define RADIX_TREE_MAX_TAGS XA_MAX_MARKS
#define RADIX_TREE_TAG_LONGS XA_MARK_LONGS
#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
RADIX_TREE_MAP_SHIFT))
/* The IDR tag is stored in the low bits of xa_flags */
#define ROOT_IS_IDR ((__force gfp_t)4)
/* The top bits of xa_flags are used to store the root tags */
#define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT)
#define RADIX_TREE_INIT(name, mask) XARRAY_INIT(name, mask)
#define RADIX_TREE(name, mask) \
struct radix_tree_root name = RADIX_TREE_INIT(name, mask)
#define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask)
static inline bool radix_tree_empty(const struct radix_tree_root *root)
{
return root->xa_head == NULL;
}
/**
* struct radix_tree_iter - radix tree iterator state
*
* @index: index of current slot
* @next_index: one beyond the last index for this chunk
* @tags: bit-mask for tag-iterating
* @node: node that contains current slot
*
* This radix tree iterator works in terms of "chunks" of slots. A chunk is a
* subinterval of slots contained within one radix tree leaf node. It is
* described by a pointer to its first slot and a struct radix_tree_iter
* which holds the chunk's position in the tree and its size. For tagged
* iteration radix_tree_iter also holds the slots' bit-mask for one chosen
* radix tree tag.
*/
struct radix_tree_iter {
unsigned long index;
unsigned long next_index;
unsigned long tags;
struct radix_tree_node *node;
};
/**
* Radix-tree synchronization
*
* The radix-tree API requires that users provide all synchronisation (with
* specific exceptions, noted below).
*
* Synchronization of access to the data items being stored in the tree, and
* management of their lifetimes must be completely managed by API users.
*
* For API usage, in general,
* - any function _modifying_ the tree or tags (inserting or deleting
* items, setting or clearing tags) must exclude other modifications, and
* exclude any functions reading the tree.
* - any function _reading_ the tree or tags (looking up items or tags,
* gang lookups) must exclude modifications to the tree, but may occur
* concurrently with other readers.
*
* The notable exceptions to this rule are the following functions:
* __radix_tree_lookup
* radix_tree_lookup
* radix_tree_lookup_slot
* radix_tree_tag_get
* radix_tree_gang_lookup
* radix_tree_gang_lookup_tag
* radix_tree_gang_lookup_tag_slot
* radix_tree_tagged
*
* The first 7 functions are able to be called locklessly, using RCU. The
* caller must ensure calls to these functions are made within rcu_read_lock()
* regions. Other readers (lock-free or otherwise) and modifications may be
* running concurrently.
*
* It is still required that the caller manage the synchronization and lifetimes
* of the items. So if RCU lock-free lookups are used, typically this would mean
* that the items have their own locks, or are amenable to lock-free access; and
* that the items are freed by RCU (or only freed after having been deleted from
* the radix tree *and* a synchronize_rcu() grace period).
*
* (Note, rcu_assign_pointer and rcu_dereference are not needed to control
* access to data items when inserting into or looking up from the radix tree)
*
* Note that the value returned by radix_tree_tag_get() may not be relied upon
* if only the RCU read lock is held. Functions to set/clear tags and to
* delete nodes running concurrently with it may affect its result such that
* two consecutive reads in the same locked section may return different
* values. If reliability is required, modification functions must also be
* excluded from concurrency.
*
* radix_tree_tagged is able to be called without locking or RCU.
*/
/**
* radix_tree_deref_slot - dereference a slot
* @slot: slot pointer, returned by radix_tree_lookup_slot
*
* For use with radix_tree_lookup_slot(). Caller must hold tree at least read
* locked across slot lookup and dereference. Not required if write lock is
* held (ie. items cannot be concurrently inserted).
*
* radix_tree_deref_retry must be used to confirm validity of the pointer if
* only the read lock is held.
*
* Return: entry stored in that slot.
*/
static inline void *radix_tree_deref_slot(void __rcu **slot)
{
return rcu_dereference(*slot);
}
/**
* radix_tree_deref_slot_protected - dereference a slot with tree lock held
* @slot: slot pointer, returned by radix_tree_lookup_slot
*
* Similar to radix_tree_deref_slot. The caller does not hold the RCU read
* lock but it must hold the tree lock to prevent parallel updates.
*
* Return: entry stored in that slot.
*/
static inline void *radix_tree_deref_slot_protected(void __rcu **slot,
spinlock_t *treelock)
{
return rcu_dereference_protected(*slot, lockdep_is_held(treelock));
}
/**
* radix_tree_deref_retry - check radix_tree_deref_slot
* @arg: pointer returned by radix_tree_deref_slot
* Returns: 0 if retry is not required, otherwise retry is required
*
* radix_tree_deref_retry must be used with radix_tree_deref_slot.
*/
static inline int radix_tree_deref_retry(void *arg)
{
return unlikely(radix_tree_is_internal_node(arg));
}
/**
* radix_tree_exception - radix_tree_deref_slot returned either exception?
* @arg: value returned by radix_tree_deref_slot
* Returns: 0 if well-aligned pointer, non-0 if either kind of exception.
*/
static inline int radix_tree_exception(void *arg)
{
return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK);
}
int radix_tree_insert(struct radix_tree_root *, unsigned long index,
void *);
void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
struct radix_tree_node **nodep, void __rcu ***slotp);
void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
unsigned long index);
void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
void __rcu **slot, void *entry);
void radix_tree_iter_replace(struct radix_tree_root *,
const struct radix_tree_iter *, void __rcu **slot, void *entry);
void radix_tree_replace_slot(struct radix_tree_root *,
void __rcu **slot, void *entry);
void radix_tree_iter_delete(struct radix_tree_root *,
struct radix_tree_iter *iter, void __rcu **slot);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
unsigned int radix_tree_gang_lookup(const struct radix_tree_root *,
void **results, unsigned long first_index,
unsigned int max_items);
int radix_tree_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload(gfp_t gfp_mask);
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *,
unsigned long index, unsigned int tag);
void *radix_tree_tag_clear(struct radix_tree_root *,
unsigned long index, unsigned int tag);
int radix_tree_tag_get(const struct radix_tree_root *,
unsigned long index, unsigned int tag);
void radix_tree_iter_tag_clear(struct radix_tree_root *,
const struct radix_tree_iter *iter, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *,
void **results, unsigned long first_index,
unsigned int max_items, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *,
void __rcu ***results, unsigned long first_index,
unsigned int max_items, unsigned int tag);
int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);
static inline void radix_tree_preload_end(void)
{
local_unlock(&radix_tree_preloads.lock);
}
void __rcu **idr_get_free(struct radix_tree_root *root,
struct radix_tree_iter *iter, gfp_t gfp,
unsigned long max);
enum {
RADIX_TREE_ITER_TAG_MASK = 0x0f, /* tag index in lower nybble */
RADIX_TREE_ITER_TAGGED = 0x10, /* lookup tagged slots */
RADIX_TREE_ITER_CONTIG = 0x20, /* stop at first hole */
};
/**
* radix_tree_iter_init - initialize radix tree iterator
*
* @iter: pointer to iterator state
* @start: iteration starting index
* Returns: NULL
*/
static __always_inline void __rcu **
radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
{
/*
* Leave iter->tags uninitialized. radix_tree_next_chunk() will fill it
* in the case of a successful tagged chunk lookup. If the lookup was
* unsuccessful or non-tagged then nobody cares about ->tags.
*
* Set index to zero to bypass next_index overflow protection.
* See the comment in radix_tree_next_chunk() for details.
*/
iter->index = 0;
iter->next_index = start;
return NULL;
}
/**
* radix_tree_next_chunk - find next chunk of slots for iteration
*
* @root: radix tree root
* @iter: iterator state
* @flags: RADIX_TREE_ITER_* flags and tag index
* Returns: pointer to chunk first slot, or NULL if there no more left
*
* This function looks up the next chunk in the radix tree starting from
* @iter->next_index. It returns a pointer to the chunk's first slot.
* Also it fills @iter with data about chunk: position in the tree (index),
* its end (next_index), and constructs a bit mask for tagged iterating (tags).
*/
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *,
struct radix_tree_iter *iter, unsigned flags);
/**
* radix_tree_iter_lookup - look up an index in the radix tree
* @root: radix tree root
* @iter: iterator state
* @index: key to look up
*
* If @index is present in the radix tree, this function returns the slot
* containing it and updates @iter to describe the entry. If @index is not
* present, it returns NULL.
*/
static inline void __rcu **
radix_tree_iter_lookup(const struct radix_tree_root *root,
struct radix_tree_iter *iter, unsigned long index)
{
radix_tree_iter_init(iter, index);
return radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG);
}
/**
* radix_tree_iter_retry - retry this chunk of the iteration
* @iter: iterator state
*
* If we iterate over a tree protected only by the RCU lock, a race
* against deletion or creation may result in seeing a slot for which
* radix_tree_deref_retry() returns true. If so, call this function
* and continue the iteration.
*/
static inline __must_check
void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter)
{
iter->next_index = iter->index;
iter->tags = 0;
return NULL;
}
static inline unsigned long
__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
{
return iter->index + slots;
}
/**
* radix_tree_iter_resume - resume iterating when the chunk may be invalid
* @slot: pointer to current slot
* @iter: iterator state
* Returns: New slot pointer
*
* If the iterator needs to release then reacquire a lock, the chunk may
* have been invalidated by an insertion or deletion. Call this function
* before releasing the lock to continue the iteration from the next index.
*/
void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot,
struct radix_tree_iter *iter);
/**
* radix_tree_chunk_size - get current chunk size
*
* @iter: pointer to radix tree iterator
* Returns: current chunk size
*/
static __always_inline long
radix_tree_chunk_size(struct radix_tree_iter *iter)
{
return iter->next_index - iter->index;
}
/**
* radix_tree_next_slot - find next slot in chunk
*
* @slot: pointer to current slot
* @iter: pointer to iterator state
* @flags: RADIX_TREE_ITER_*, should be constant
* Returns: pointer to next slot, or NULL if there no more left
*
* This function updates @iter->index in the case of a successful lookup.
* For tagged lookup it also eats @iter->tags.
*
* There are several cases where 'slot' can be passed in as NULL to this
* function. These cases result from the use of radix_tree_iter_resume() or
* radix_tree_iter_retry(). In these cases we don't end up dereferencing
* 'slot' because either:
* a) we are doing tagged iteration and iter->tags has been set to 0, or
* b) we are doing non-tagged iteration, and iter->index and iter->next_index
* have been set up so that radix_tree_chunk_size() returns 1 or 0.
*/
static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
struct radix_tree_iter *iter, unsigned flags)
{
if (flags & RADIX_TREE_ITER_TAGGED) {
iter->tags >>= 1;
if (unlikely(!iter->tags))
return NULL;
if (likely(iter->tags & 1ul)) { iter->index = __radix_tree_iter_add(iter, 1);
slot++;
goto found;
}
if (!(flags & RADIX_TREE_ITER_CONTIG)) {
unsigned offset = __ffs(iter->tags);
iter->tags >>= offset++;
iter->index = __radix_tree_iter_add(iter, offset);
slot += offset;
goto found;
}
} else {
long count = radix_tree_chunk_size(iter);
while (--count > 0) { slot++;
iter->index = __radix_tree_iter_add(iter, 1);
if (likely(*slot))
goto found;
if (flags & RADIX_TREE_ITER_CONTIG) {
/* forbid switching to the next chunk */
iter->next_index = 0;
break;
}
}
}
return NULL;
found:
return slot;
}
/**
* radix_tree_for_each_slot - iterate over non-empty slots
*
* @slot: the void** variable for pointer to slot
* @root: the struct radix_tree_root pointer
* @iter: the struct radix_tree_iter pointer
* @start: iteration starting index
*
* @slot points to radix tree slot, @iter->index contains its index.
*/
#define radix_tree_for_each_slot(slot, root, iter, start) \
for (slot = radix_tree_iter_init(iter, start) ; \
slot || (slot = radix_tree_next_chunk(root, iter, 0)) ; \
slot = radix_tree_next_slot(slot, iter, 0))
/**
* radix_tree_for_each_tagged - iterate over tagged slots
*
* @slot: the void** variable for pointer to slot
* @root: the struct radix_tree_root pointer
* @iter: the struct radix_tree_iter pointer
* @start: iteration starting index
* @tag: tag index
*
* @slot points to radix tree slot, @iter->index contains its index.
*/
#define radix_tree_for_each_tagged(slot, root, iter, start, tag) \
for (slot = radix_tree_iter_init(iter, start) ; \
slot || (slot = radix_tree_next_chunk(root, iter, \
RADIX_TREE_ITER_TAGGED | tag)) ; \
slot = radix_tree_next_slot(slot, iter, \
RADIX_TREE_ITER_TAGGED | tag))
#endif /* _LINUX_RADIX_TREE_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* fs/fs-writeback.c
*
* Copyright (C) 2002, Linus Torvalds.
*
* Contains all the functions related to writing back and waiting
* upon dirty inodes against superblocks, and writing back dirty
* pages against inodes. ie: data writeback. Writeout of the
* inode itself is not handled here.
*
* 10Apr2002 Andrew Morton
* Split out of fs/inode.c
* Additions for address_space-based writeback
*/
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"
/*
* 4MB minimal write chunk size
*/
#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
struct wb_writeback_work {
long nr_pages;
struct super_block *sb;
enum writeback_sync_modes sync_mode;
unsigned int tagged_writepages:1;
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
unsigned int auto_free:1; /* free on completion */
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
struct wb_completion *done; /* set if the caller waits */
};
/*
* If an inode is constantly having its pages dirtied, but then the
* updates stop dirtytime_expire_interval seconds in the past, it's
* possible for the worst case time between when an inode has its
* timestamps updated and when they finally get written out to be two
* dirtytime_expire_intervals. We set the default to 12 hours (in
* seconds), which means most of the time inodes will have their
* timestamps written to disk after 12 hours, but in the worst case a
* few inodes might not their timestamps updated for 24 hours.
*/
unsigned int dirtytime_expire_interval = 12 * 60 * 60;
static inline struct inode *wb_inode(struct list_head *head)
{
return list_entry(head, struct inode, i_io_list);
}
/*
* Include the creation of the trace points after defining the
* wb_writeback_work structure and inline functions so that the definition
* remains local to this file.
*/
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
if (wb_has_dirty_io(wb)) {
return false;
} else {
set_bit(WB_has_dirty_io, &wb->state);
WARN_ON_ONCE(!wb->avg_write_bandwidth);
atomic_long_add(wb->avg_write_bandwidth,
&wb->bdi->tot_write_bandwidth); return true;
}
}
static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
clear_bit(WB_has_dirty_io, &wb->state);
WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
&wb->bdi->tot_write_bandwidth) < 0);
}
}
/**
* inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
* @inode: inode to be moved
* @wb: target bdi_writeback
* @head: one of @wb->b_{dirty|io|more_io|dirty_time}
*
* Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
* Returns %true if @inode is the first occupant of the !dirty_time IO
* lists; otherwise, %false.
*/
static bool inode_io_list_move_locked(struct inode *inode,
struct bdi_writeback *wb,
struct list_head *head)
{
assert_spin_locked(&wb->list_lock); list_move(&inode->i_io_list, head);
/* dirty_time doesn't count as dirty_io until expiration */
if (head != &wb->b_dirty_time)
return wb_io_lists_populated(wb); wb_io_lists_depopulated(wb);
return false;
}
static void wb_wakeup(struct bdi_writeback *wb)
{
spin_lock_bh(&wb->work_lock);
if (test_bit(WB_registered, &wb->state))
mod_delayed_work(bdi_wq, &wb->dwork, 0);
spin_unlock_bh(&wb->work_lock);
}
static void finish_writeback_work(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
struct wb_completion *done = work->done;
if (work->auto_free)
kfree(work);
if (done) {
wait_queue_head_t *waitq = done->waitq;
/* @done can't be accessed after the following dec */
if (atomic_dec_and_test(&done->cnt))
wake_up_all(waitq);
}
}
static void wb_queue_work(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
trace_writeback_queue(wb, work);
if (work->done) atomic_inc(&work->done->cnt);
spin_lock_bh(&wb->work_lock);
if (test_bit(WB_registered, &wb->state)) {
list_add_tail(&work->list, &wb->work_list);
mod_delayed_work(bdi_wq, &wb->dwork, 0);
} else
finish_writeback_work(wb, work);
spin_unlock_bh(&wb->work_lock);
}
/**
* wb_wait_for_completion - wait for completion of bdi_writeback_works
* @done: target wb_completion
*
* Wait for one or more work items issued to @bdi with their ->done field
* set to @done, which should have been initialized with
* DEFINE_WB_COMPLETION(). This function returns after all such work items
* are completed. Work items which are waited upon aren't freed
* automatically on completion.
*/
void wb_wait_for_completion(struct wb_completion *done)
{
atomic_dec(&done->cnt); /* put down the initial count */ wait_event(*done->waitq, !atomic_read(&done->cnt));}
#ifdef CONFIG_CGROUP_WRITEBACK
/*
* Parameters for foreign inode detection, see wbc_detach_inode() to see
* how they're used.
*
* These paramters are inherently heuristical as the detection target
* itself is fuzzy. All we want to do is detaching an inode from the
* current owner if it's being written to by some other cgroups too much.
*
* The current cgroup writeback is built on the assumption that multiple
* cgroups writing to the same inode concurrently is very rare and a mode
* of operation which isn't well supported. As such, the goal is not
* taking too long when a different cgroup takes over an inode while
* avoiding too aggressive flip-flops from occasional foreign writes.
*
* We record, very roughly, 2s worth of IO time history and if more than
* half of that is foreign, trigger the switch. The recording is quantized
* to 16 slots. To avoid tiny writes from swinging the decision too much,
* writes smaller than 1/8 of avg size are ignored.
*/
#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV 8 /* ignore rounds < avg / 8 */
#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */
#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
/* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2)
/* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
/* one round can affect upto 5 slots */
#define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */
/*
* Maximum inodes per isw. A specific value has been chosen to make
* struct inode_switch_wbs_context fit into 1024 bytes kmalloc.
*/
#define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \
/ sizeof(struct inode *))
static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;
void __inode_attach_wb(struct inode *inode, struct page *page)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct bdi_writeback *wb = NULL;
if (inode_cgwb_enabled(inode)) {
struct cgroup_subsys_state *memcg_css;
if (page) {
memcg_css = mem_cgroup_css_from_page(page);
wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
} else {
/* must pin memcg_css, see wb_get_create() */
memcg_css = task_get_css(current, memory_cgrp_id);
wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
css_put(memcg_css);
}
}
if (!wb)
wb = &bdi->wb;
/*
* There may be multiple instances of this function racing to
* update the same inode. Use cmpxchg() to tell the winner.
*/
if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
wb_put(wb);
}
EXPORT_SYMBOL_GPL(__inode_attach_wb);
/**
* inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
* @inode: inode of interest with i_lock held
* @wb: target bdi_writeback
*
* Remove the inode from wb's io lists and if necessarily put onto b_attached
* list. Only inodes attached to cgwb's are kept on this list.
*/
static void inode_cgwb_move_to_attached(struct inode *inode,
struct bdi_writeback *wb)
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
inode->i_state &= ~I_SYNC_QUEUED;
if (wb != &wb->bdi->wb)
list_move(&inode->i_io_list, &wb->b_attached);
else
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
}
/**
* locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
* @inode: inode of interest with i_lock held
*
* Returns @inode's wb with its list_lock held. @inode->i_lock must be
* held on entry and is released on return. The returned wb is guaranteed
* to stay @inode's associated wb until its list_lock is released.
*/
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
__releases(&inode->i_lock)
__acquires(&wb->list_lock)
{
while (true) {
struct bdi_writeback *wb = inode_to_wb(inode);
/*
* inode_to_wb() association is protected by both
* @inode->i_lock and @wb->list_lock but list_lock nests
* outside i_lock. Drop i_lock and verify that the
* association hasn't changed after acquiring list_lock.
*/
wb_get(wb);
spin_unlock(&inode->i_lock);
spin_lock(&wb->list_lock);
/* i_wb may have changed inbetween, can't use inode_to_wb() */
if (likely(wb == inode->i_wb)) {
wb_put(wb); /* @inode already has ref */
return wb;
}
spin_unlock(&wb->list_lock);
wb_put(wb);
cpu_relax();
spin_lock(&inode->i_lock);
}
}
/**
* inode_to_wb_and_lock_list - determine an inode's wb and lock it
* @inode: inode of interest
*
* Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
* on entry.
*/
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
__acquires(&wb->list_lock)
{
spin_lock(&inode->i_lock);
return locked_inode_to_wb_and_lock_list(inode);
}
struct inode_switch_wbs_context {
struct rcu_work work;
/*
* Multiple inodes can be switched at once. The switching procedure
* consists of two parts, separated by a RCU grace period. To make
* sure that the second part is executed for each inode gone through
* the first part, all inode pointers are placed into a NULL-terminated
* array embedded into struct inode_switch_wbs_context. Otherwise
* an inode could be left in a non-consistent state.
*/
struct bdi_writeback *new_wb;
struct inode *inodes[];
};
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
down_write(&bdi->wb_switch_rwsem);
}
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
up_write(&bdi->wb_switch_rwsem);
}
static bool inode_do_switch_wbs(struct inode *inode,
struct bdi_writeback *old_wb,
struct bdi_writeback *new_wb)
{
struct address_space *mapping = inode->i_mapping;
XA_STATE(xas, &mapping->i_pages, 0);
struct page *page;
bool switched = false;
spin_lock(&inode->i_lock);
xa_lock_irq(&mapping->i_pages);
/*
* Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction
* path owns the inode and we shouldn't modify ->i_io_list.
*/
if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
goto skip_switch;
trace_inode_switch_wbs(inode, old_wb, new_wb);
/*
* Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
* to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
* pages actually under writeback.
*/
xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
if (PageDirty(page)) {
dec_wb_stat(old_wb, WB_RECLAIMABLE);
inc_wb_stat(new_wb, WB_RECLAIMABLE);
}
}
xas_set(&xas, 0);
xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
WARN_ON_ONCE(!PageWriteback(page));
dec_wb_stat(old_wb, WB_WRITEBACK);
inc_wb_stat(new_wb, WB_WRITEBACK);
}
if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
atomic_dec(&old_wb->writeback_inodes);
atomic_inc(&new_wb->writeback_inodes);
}
wb_get(new_wb);
/*
* Transfer to @new_wb's IO list if necessary. If the @inode is dirty,
* the specific list @inode was on is ignored and the @inode is put on
* ->b_dirty which is always correct including from ->b_dirty_time.
* The transfer preserves @inode->dirtied_when ordering. If the @inode
* was clean, it means it was on the b_attached list, so move it onto
* the b_attached list of @new_wb.
*/
if (!list_empty(&inode->i_io_list)) {
inode->i_wb = new_wb;
if (inode->i_state & I_DIRTY_ALL) {
struct inode *pos;
list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
if (time_after_eq(inode->dirtied_when,
pos->dirtied_when))
break;
inode_io_list_move_locked(inode, new_wb,
pos->i_io_list.prev);
} else {
inode_cgwb_move_to_attached(inode, new_wb);
}
} else {
inode->i_wb = new_wb;
}
/* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
inode->i_wb_frn_winner = 0;
inode->i_wb_frn_avg_time = 0;
inode->i_wb_frn_history = 0;
switched = true;
skip_switch:
/*
* Paired with load_acquire in unlocked_inode_to_wb_begin() and
* ensures that the new wb is visible if they see !I_WB_SWITCH.
*/
smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
xa_unlock_irq(&mapping->i_pages);
spin_unlock(&inode->i_lock);
return switched;
}
static void inode_switch_wbs_work_fn(struct work_struct *work)
{
struct inode_switch_wbs_context *isw =
container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
struct bdi_writeback *new_wb = isw->new_wb;
unsigned long nr_switched = 0;
struct inode **inodep;
/*
* If @inode switches cgwb membership while sync_inodes_sb() is
* being issued, sync_inodes_sb() might miss it. Synchronize.
*/
down_read(&bdi->wb_switch_rwsem);
/*
* By the time control reaches here, RCU grace period has passed
* since I_WB_SWITCH assertion and all wb stat update transactions
* between unlocked_inode_to_wb_begin/end() are guaranteed to be
* synchronizing against the i_pages lock.
*
* Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
* gives us exclusion against all wb related operations on @inode
* including IO list manipulations and stat updates.
*/
if (old_wb < new_wb) {
spin_lock(&old_wb->list_lock);
spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
} else {
spin_lock(&new_wb->list_lock);
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
}
for (inodep = isw->inodes; *inodep; inodep++) {
WARN_ON_ONCE((*inodep)->i_wb != old_wb);
if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
nr_switched++;
}
spin_unlock(&new_wb->list_lock);
spin_unlock(&old_wb->list_lock);
up_read(&bdi->wb_switch_rwsem);
if (nr_switched) {
wb_wakeup(new_wb);
wb_put_many(old_wb, nr_switched);
}
for (inodep = isw->inodes; *inodep; inodep++)
iput(*inodep);
wb_put(new_wb);
kfree(isw);
atomic_dec(&isw_nr_in_flight);
}
static bool inode_prepare_wbs_switch(struct inode *inode,
struct bdi_writeback *new_wb)
{
/*
* Paired with smp_mb() in cgroup_writeback_umount().
* isw_nr_in_flight must be increased before checking SB_ACTIVE and
* grabbing an inode, otherwise isw_nr_in_flight can be observed as 0
* in cgroup_writeback_umount() and the isw_wq will be not flushed.
*/
smp_mb();
if (IS_DAX(inode))
return false;
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
inode_to_wb(inode) == new_wb) {
spin_unlock(&inode->i_lock);
return false;
}
inode->i_state |= I_WB_SWITCH;
__iget(inode);
spin_unlock(&inode->i_lock);
return true;
}
/**
* inode_switch_wbs - change the wb association of an inode
* @inode: target inode
* @new_wb_id: ID of the new wb
*
* Switch @inode's wb association to the wb identified by @new_wb_id. The
* switching is performed asynchronously and may fail silently.
*/
static void inode_switch_wbs(struct inode *inode, int new_wb_id)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
/* noop if seems to be already in progress */
if (inode->i_state & I_WB_SWITCH)
return;
/* avoid queueing a new switch if too many are already in flight */
if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
return;
isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
if (!isw)
return;
atomic_inc(&isw_nr_in_flight);
/* find and pin the new wb */
rcu_read_lock();
memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
if (memcg_css && !css_tryget(memcg_css))
memcg_css = NULL;
rcu_read_unlock();
if (!memcg_css)
goto out_free;
isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
css_put(memcg_css);
if (!isw->new_wb)
goto out_free;
if (!inode_prepare_wbs_switch(inode, isw->new_wb))
goto out_free;
isw->inodes[0] = inode;
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the i_page
* lock so that stat transfer can synchronize against them.
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
queue_rcu_work(isw_wq, &isw->work);
return;
out_free:
atomic_dec(&isw_nr_in_flight);
if (isw->new_wb)
wb_put(isw->new_wb);
kfree(isw);
}
/**
* cleanup_offline_cgwb - detach associated inodes
* @wb: target wb
*
* Switch all inodes attached to @wb to a nearest living ancestor's wb in order
* to eventually release the dying @wb. Returns %true if not all inodes were
* switched and the function has to be restarted.
*/
bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
struct inode *inode;
int nr;
bool restart = false;
isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW *
sizeof(struct inode *), GFP_KERNEL);
if (!isw)
return restart;
atomic_inc(&isw_nr_in_flight);
for (memcg_css = wb->memcg_css->parent; memcg_css;
memcg_css = memcg_css->parent) {
isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
if (isw->new_wb)
break;
}
if (unlikely(!isw->new_wb))
isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
nr = 0;
spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_attached, i_io_list) {
if (!inode_prepare_wbs_switch(inode, isw->new_wb))
continue;
isw->inodes[nr++] = inode;
if (nr >= WB_MAX_INODES_PER_ISW - 1) {
restart = true;
break;
}
}
spin_unlock(&wb->list_lock);
/* no attached inodes? bail out */
if (nr == 0) {
atomic_dec(&isw_nr_in_flight);
wb_put(isw->new_wb);
kfree(isw);
return restart;
}
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the i_page
* lock so that stat transfer can synchronize against them.
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
queue_rcu_work(isw_wq, &isw->work);
return restart;
}
/**
* wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
* @wbc: writeback_control of interest
* @inode: target inode
*
* @inode is locked and about to be written back under the control of @wbc.
* Record @inode's writeback context into @wbc and unlock the i_lock. On
* writeback completion, wbc_detach_inode() should be called. This is used
* to track the cgroup writeback context.
*/
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
struct inode *inode)
{
if (!inode_cgwb_enabled(inode)) {
spin_unlock(&inode->i_lock);
return;
}
wbc->wb = inode_to_wb(inode);
wbc->inode = inode;
wbc->wb_id = wbc->wb->memcg_css->id;
wbc->wb_lcand_id = inode->i_wb_frn_winner;
wbc->wb_tcand_id = 0;
wbc->wb_bytes = 0;
wbc->wb_lcand_bytes = 0;
wbc->wb_tcand_bytes = 0;
wb_get(wbc->wb);
spin_unlock(&inode->i_lock);
/*
* A dying wb indicates that either the blkcg associated with the
* memcg changed or the associated memcg is dying. In the first
* case, a replacement wb should already be available and we should
* refresh the wb immediately. In the second case, trying to
* refresh will keep failing.
*/
if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
inode_switch_wbs(inode, wbc->wb_id);
}
EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
/**
* wbc_detach_inode - disassociate wbc from inode and perform foreign detection
* @wbc: writeback_control of the just finished writeback
*
* To be called after a writeback attempt of an inode finishes and undoes
* wbc_attach_and_unlock_inode(). Can be called under any context.
*
* As concurrent write sharing of an inode is expected to be very rare and
* memcg only tracks page ownership on first-use basis severely confining
* the usefulness of such sharing, cgroup writeback tracks ownership
* per-inode. While the support for concurrent write sharing of an inode
* is deemed unnecessary, an inode being written to by different cgroups at
* different points in time is a lot more common, and, more importantly,
* charging only by first-use can too readily lead to grossly incorrect
* behaviors (single foreign page can lead to gigabytes of writeback to be
* incorrectly attributed).
*
* To resolve this issue, cgroup writeback detects the majority dirtier of
* an inode and transfers the ownership to it. To avoid unnnecessary
* oscillation, the detection mechanism keeps track of history and gives
* out the switch verdict only if the foreign usage pattern is stable over
* a certain amount of time and/or writeback attempts.
*
* On each writeback attempt, @wbc tries to detect the majority writer
* using Boyer-Moore majority vote algorithm. In addition to the byte
* count from the majority voting, it also counts the bytes written for the
* current wb and the last round's winner wb (max of last round's current
* wb, the winner from two rounds ago, and the last round's majority
* candidate). Keeping track of the historical winner helps the algorithm
* to semi-reliably detect the most active writer even when it's not the
* absolute majority.
*
* Once the winner of the round is determined, whether the winner is
* foreign or not and how much IO time the round consumed is recorded in
* inode->i_wb_frn_history. If the amount of recorded foreign IO time is
* over a certain threshold, the switch verdict is given.
*/
void wbc_detach_inode(struct writeback_control *wbc)
{
struct bdi_writeback *wb = wbc->wb;
struct inode *inode = wbc->inode;
unsigned long avg_time, max_bytes, max_time;
u16 history;
int max_id;
if (!wb)
return;
history = inode->i_wb_frn_history;
avg_time = inode->i_wb_frn_avg_time;
/* pick the winner of this round */
if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
wbc->wb_bytes >= wbc->wb_tcand_bytes) {
max_id = wbc->wb_id;
max_bytes = wbc->wb_bytes;
} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
max_id = wbc->wb_lcand_id;
max_bytes = wbc->wb_lcand_bytes;
} else {
max_id = wbc->wb_tcand_id;
max_bytes = wbc->wb_tcand_bytes;
}
/*
* Calculate the amount of IO time the winner consumed and fold it
* into the running average kept per inode. If the consumed IO
* time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for
* deciding whether to switch or not. This is to prevent one-off
* small dirtiers from skewing the verdict.
*/
max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
wb->avg_write_bandwidth);
if (avg_time)
avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
(avg_time >> WB_FRN_TIME_AVG_SHIFT);
else
avg_time = max_time; /* immediate catch up on first run */
if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
int slots;
/*
* The switch verdict is reached if foreign wb's consume
* more than a certain proportion of IO time in a
* WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot
* history mask where each bit represents one sixteenth of
* the period. Determine the number of slots to shift into
* history from @max_time.
*/
slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
(unsigned long)WB_FRN_HIST_MAX_SLOTS);
history <<= slots;
if (wbc->wb_id != max_id)
history |= (1U << slots) - 1;
if (history)
trace_inode_foreign_history(inode, wbc, history);
/*
* Switch if the current wb isn't the consistent winner.
* If there are multiple closely competing dirtiers, the
* inode may switch across them repeatedly over time, which
* is okay. The main goal is avoiding keeping an inode on
* the wrong wb for an extended period of time.
*/
if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
inode_switch_wbs(inode, max_id);
}
/*
* Multiple instances of this function may race to update the
* following fields but we don't mind occassional inaccuracies.
*/
inode->i_wb_frn_winner = max_id;
inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
inode->i_wb_frn_history = history;
wb_put(wbc->wb);
wbc->wb = NULL;
}
EXPORT_SYMBOL_GPL(wbc_detach_inode);
/**
* wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
* @wbc: writeback_control of the writeback in progress
* @page: page being written out
* @bytes: number of bytes being written out
*
* @bytes from @page are about to written out during the writeback
* controlled by @wbc. Keep the book for foreign inode detection. See
* wbc_detach_inode().
*/
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
size_t bytes)
{
struct cgroup_subsys_state *css;
int id;
/*
* pageout() path doesn't attach @wbc to the inode being written
* out. This is intentional as we don't want the function to block
* behind a slow cgroup. Ultimately, we want pageout() to kick off
* regular writeback instead of writing things out itself.
*/
if (!wbc->wb || wbc->no_cgroup_owner)
return;
css = mem_cgroup_css_from_page(page);
/* dead cgroups shouldn't contribute to inode ownership arbitration */
if (!(css->flags & CSS_ONLINE))
return;
id = css->id;
if (id == wbc->wb_id) {
wbc->wb_bytes += bytes;
return;
}
if (id == wbc->wb_lcand_id)
wbc->wb_lcand_bytes += bytes;
/* Boyer-Moore majority vote algorithm */
if (!wbc->wb_tcand_bytes)
wbc->wb_tcand_id = id;
if (id == wbc->wb_tcand_id)
wbc->wb_tcand_bytes += bytes;
else
wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
}
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
/**
* inode_congested - test whether an inode is congested
* @inode: inode to test for congestion (may be NULL)
* @cong_bits: mask of WB_[a]sync_congested bits to test
*
* Tests whether @inode is congested. @cong_bits is the mask of congestion
* bits to test and the return value is the mask of set bits.
*
* If cgroup writeback is enabled for @inode, the congestion state is
* determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
* associated with @inode is congested; otherwise, the root wb's congestion
* state is used.
*
* @inode is allowed to be NULL as this function is often called on
* mapping->host which is NULL for the swapper space.
*/
int inode_congested(struct inode *inode, int cong_bits)
{
/*
* Once set, ->i_wb never becomes NULL while the inode is alive.
* Start transaction iff ->i_wb is visible.
*/
if (inode && inode_to_wb_is_valid(inode)) {
struct bdi_writeback *wb;
struct wb_lock_cookie lock_cookie = {};
bool congested;
wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
congested = wb_congested(wb, cong_bits);
unlocked_inode_to_wb_end(inode, &lock_cookie);
return congested;
}
return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
}
EXPORT_SYMBOL_GPL(inode_congested);
/**
* wb_split_bdi_pages - split nr_pages to write according to bandwidth
* @wb: target bdi_writeback to split @nr_pages to
* @nr_pages: number of pages to write for the whole bdi
*
* Split @wb's portion of @nr_pages according to @wb's write bandwidth in
* relation to the total write bandwidth of all wb's w/ dirty inodes on
* @wb->bdi.
*/
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
unsigned long this_bw = wb->avg_write_bandwidth;
unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
if (nr_pages == LONG_MAX)
return LONG_MAX;
/*
* This may be called on clean wb's and proportional distribution
* may not make sense, just use the original @nr_pages in those
* cases. In general, we wanna err on the side of writing more.
*/
if (!tot_bw || this_bw >= tot_bw)
return nr_pages;
else
return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
}
/**
* bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
* @bdi: target backing_dev_info
* @base_work: wb_writeback_work to issue
* @skip_if_busy: skip wb's which already have writeback in progress
*
* Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
* have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's
* distributed to the busy wbs according to each wb's proportion in the
* total active write bandwidth of @bdi.
*/
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
struct wb_writeback_work *base_work,
bool skip_if_busy)
{
struct bdi_writeback *last_wb = NULL;
struct bdi_writeback *wb = list_entry(&bdi->wb_list,
struct bdi_writeback, bdi_node);
might_sleep();
restart:
rcu_read_lock();
list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
DEFINE_WB_COMPLETION(fallback_work_done, bdi);
struct wb_writeback_work fallback_work;
struct wb_writeback_work *work;
long nr_pages;
if (last_wb) {
wb_put(last_wb);
last_wb = NULL;
}
/* SYNC_ALL writes out I_DIRTY_TIME too */
if (!wb_has_dirty_io(wb) &&
(base_work->sync_mode == WB_SYNC_NONE ||
list_empty(&wb->b_dirty_time)))
continue;
if (skip_if_busy && writeback_in_progress(wb))
continue;
nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (work) {
*work = *base_work;
work->nr_pages = nr_pages;
work->auto_free = 1;
wb_queue_work(wb, work);
continue;
}
/* alloc failed, execute synchronously using on-stack fallback */
work = &fallback_work;
*work = *base_work;
work->nr_pages = nr_pages;
work->auto_free = 0;
work->done = &fallback_work_done;
wb_queue_work(wb, work);
/*
* Pin @wb so that it stays on @bdi->wb_list. This allows
* continuing iteration from @wb after dropping and
* regrabbing rcu read lock.
*/
wb_get(wb);
last_wb = wb;
rcu_read_unlock();
wb_wait_for_completion(&fallback_work_done);
goto restart;
}
rcu_read_unlock();
if (last_wb)
wb_put(last_wb);
}
/**
* cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
* @bdi_id: target bdi id
* @memcg_id: target memcg css id
* @reason: reason why some writeback work initiated
* @done: target wb_completion
*
* Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
* with the specified parameters.
*/
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
enum wb_reason reason, struct wb_completion *done)
{
struct backing_dev_info *bdi;
struct cgroup_subsys_state *memcg_css;
struct bdi_writeback *wb;
struct wb_writeback_work *work;
unsigned long dirty;
int ret;
/* lookup bdi and memcg */
bdi = bdi_get_by_id(bdi_id);
if (!bdi)
return -ENOENT;
rcu_read_lock();
memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
if (memcg_css && !css_tryget(memcg_css))
memcg_css = NULL;
rcu_read_unlock();
if (!memcg_css) {
ret = -ENOENT;
goto out_bdi_put;
}
/*
* And find the associated wb. If the wb isn't there already
* there's nothing to flush, don't create one.
*/
wb = wb_get_lookup(bdi, memcg_css);
if (!wb) {
ret = -ENOENT;
goto out_css_put;
}
/*
* The caller is attempting to write out most of
* the currently dirty pages. Let's take the current dirty page
* count and inflate it by 25% which should be large enough to
* flush out most dirty pages while avoiding getting livelocked by
* concurrent dirtiers.
*
* BTW the memcg stats are flushed periodically and this is best-effort
* estimation, so some potential error is ok.
*/
dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY);
dirty = dirty * 10 / 8;
/* issue the writeback work */
work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
if (work) {
work->nr_pages = dirty;
work->sync_mode = WB_SYNC_NONE;
work->range_cyclic = 1;
work->reason = reason;
work->done = done;
work->auto_free = 1;
wb_queue_work(wb, work);
ret = 0;
} else {
ret = -ENOMEM;
}
wb_put(wb);
out_css_put:
css_put(memcg_css);
out_bdi_put:
bdi_put(bdi);
return ret;
}
/**
* cgroup_writeback_umount - flush inode wb switches for umount
*
* This function is called when a super_block is about to be destroyed and
* flushes in-flight inode wb switches. An inode wb switch goes through
* RCU and then workqueue, so the two need to be flushed in order to ensure
* that all previously scheduled switches are finished. As wb switches are
* rare occurrences and synchronize_rcu() can take a while, perform
* flushing iff wb switches are in flight.
*/
void cgroup_writeback_umount(void)
{
/*
* SB_ACTIVE should be reliably cleared before checking
* isw_nr_in_flight, see generic_shutdown_super().
*/
smp_mb();
if (atomic_read(&isw_nr_in_flight)) {
/*
* Use rcu_barrier() to wait for all pending callbacks to
* ensure that all in-flight wb switches are in the workqueue.
*/
rcu_barrier();
flush_workqueue(isw_wq);
}
}
static int __init cgroup_writeback_init(void)
{
isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
if (!isw_wq)
return -ENOMEM;
return 0;
}
fs_initcall(cgroup_writeback_init);
#else /* CONFIG_CGROUP_WRITEBACK */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
static void inode_cgwb_move_to_attached(struct inode *inode,
struct bdi_writeback *wb)
{
assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); inode->i_state &= ~I_SYNC_QUEUED;
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
}
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
__releases(&inode->i_lock)
__acquires(&wb->list_lock)
{
struct bdi_writeback *wb = inode_to_wb(inode);
spin_unlock(&inode->i_lock);
spin_lock(&wb->list_lock);
return wb;
}
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
__acquires(&wb->list_lock)
{
struct bdi_writeback *wb = inode_to_wb(inode);
spin_lock(&wb->list_lock);
return wb;
}
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
return nr_pages;
}
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
struct wb_writeback_work *base_work,
bool skip_if_busy)
{
might_sleep();
if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
base_work->auto_free = 0;
wb_queue_work(&bdi->wb, base_work);
}
}
#endif /* CONFIG_CGROUP_WRITEBACK */
/*
* Add in the number of potentially dirty inodes, because each inode
* write can dirty pagecache in the underlying blockdev.
*/
static unsigned long get_nr_dirty_pages(void)
{
return global_node_page_state(NR_FILE_DIRTY) +
get_nr_dirty_inodes();
}
static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
{
if (!wb_has_dirty_io(wb))
return;
/*
* All callers of this function want to start writeback of all
* dirty pages. Places like vmscan can call this at a very
* high frequency, causing pointless allocations of tons of
* work items and keeping the flusher threads busy retrieving
* that work. Ensure that we only allow one of them pending and
* inflight at the time.
*/
if (test_bit(WB_start_all, &wb->state) ||
test_and_set_bit(WB_start_all, &wb->state))
return;
wb->start_all_reason = reason;
wb_wakeup(wb);
}
/**
* wb_start_background_writeback - start background writeback
* @wb: bdi_writback to write from
*
* Description:
* This makes sure WB_SYNC_NONE background writeback happens. When
* this function returns, it is only guaranteed that for given wb
* some IO is happening if we are over background dirty threshold.
* Caller need not hold sb s_umount semaphore.
*/
void wb_start_background_writeback(struct bdi_writeback *wb)
{
/*
* We just wake up the flusher thread. It will perform background
* writeback as soon as there is no other work to do.
*/
trace_writeback_wake_background(wb);
wb_wakeup(wb);
}
/*
* Remove the inode from the writeback list it is on.
*/
void inode_io_list_del(struct inode *inode)
{
struct bdi_writeback *wb;
wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
inode->i_state &= ~I_SYNC_QUEUED;
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
spin_unlock(&inode->i_lock);
spin_unlock(&wb->list_lock);
}
EXPORT_SYMBOL(inode_io_list_del);
/*
* mark an inode as under writeback on the sb
*/
void sb_mark_inode_writeback(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
unsigned long flags;
if (list_empty(&inode->i_wb_list)) {
spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
if (list_empty(&inode->i_wb_list)) {
list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
trace_sb_mark_inode_writeback(inode);
}
spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
}
}
/*
* clear an inode as under writeback on the sb
*/
void sb_clear_inode_writeback(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
unsigned long flags;
if (!list_empty(&inode->i_wb_list)) {
spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
if (!list_empty(&inode->i_wb_list)) {
list_del_init(&inode->i_wb_list);
trace_sb_clear_inode_writeback(inode);
}
spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
}
}
/*
* Redirty an inode: set its when-it-was dirtied timestamp and move it to the
* furthest end of its superblock's dirty-inode list.
*
* Before stamping the inode's ->dirtied_when, we check to see whether it is
* already the most-recently-dirtied inode on the b_dirty list. If that is
* the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when.
*/
static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
assert_spin_locked(&inode->i_lock);
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
tail = wb_inode(wb->b_dirty.next);
if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
inode_io_list_move_locked(inode, wb, &wb->b_dirty);
inode->i_state &= ~I_SYNC_QUEUED;
}
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
spin_lock(&inode->i_lock);
redirty_tail_locked(inode, wb);
spin_unlock(&inode->i_lock);
}
/*
* requeue inode for re-scanning after bdi->b_io list is exhausted.
*/
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
{
inode->i_state &= ~I_SYNC;
/* If inode is clean an unused, put it into LRU now... */
inode_add_lru(inode);
/* Waiters must see I_SYNC cleared before being woken up */
smp_mb();
wake_up_bit(&inode->i_state, __I_SYNC);
}
static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
/*
* For inodes being constantly redirtied, dirtied_when can get stuck.
* It _appears_ to be in the future, but is actually in distant past.
* This test is necessary to prevent such wrapped-around relative times
* from permanently stopping the whole bdi writeback.
*/
ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
return ret;
}
#define EXPIRE_DIRTY_ATIME 0x0001
/*
* Move expired (dirtied before dirtied_before) dirty inodes from
* @delaying_queue to @dispatch_queue.
*/
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
unsigned long dirtied_before)
{
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
struct inode *inode;
int do_sb_sort = 0;
int moved = 0;
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
if (inode_dirtied_after(inode, dirtied_before))
break;
list_move(&inode->i_io_list, &tmp);
moved++;
spin_lock(&inode->i_lock);
inode->i_state |= I_SYNC_QUEUED;
spin_unlock(&inode->i_lock);
if (sb_is_blkdev_sb(inode->i_sb))
continue;
if (sb && sb != inode->i_sb)
do_sb_sort = 1;
sb = inode->i_sb;
}
/* just one sb in list, splice to dispatch_queue and we're done */
if (!do_sb_sort) {
list_splice(&tmp, dispatch_queue);
goto out;
}
/* Move inodes from one superblock together */
while (!list_empty(&tmp)) {
sb = wb_inode(tmp.prev)->i_sb;
list_for_each_prev_safe(pos, node, &tmp) {
inode = wb_inode(pos);
if (inode->i_sb == sb)
list_move(&inode->i_io_list, dispatch_queue);
}
}
out:
return moved;
}
/*
* Queue all expired dirty inodes for io, eldest first.
* Before
* newly dirtied b_dirty b_io b_more_io
* =============> gf edc BA
* After
* newly dirtied b_dirty b_io b_more_io
* =============> g fBAedc
* |
* +--> dequeue for IO
*/
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
unsigned long dirtied_before)
{
int moved;
unsigned long time_expire_jif = dirtied_before;
assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
if (!work->for_sync)
time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
time_expire_jif);
if (moved)
wb_io_lists_populated(wb);
trace_writeback_queue_io(wb, work, dirtied_before, moved);
}
static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
int ret;
if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
trace_writeback_write_inode_start(inode, wbc);
ret = inode->i_sb->s_op->write_inode(inode, wbc);
trace_writeback_write_inode(inode, wbc);
return ret;
}
return 0;
}
/*
* Wait for writeback on an inode to complete. Called with i_lock held.
* Caller must make sure inode cannot go away when we drop i_lock.
*/
static void __inode_wait_for_writeback(struct inode *inode)
__releases(inode->i_lock)
__acquires(inode->i_lock)
{
DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
wait_queue_head_t *wqh;
wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
while (inode->i_state & I_SYNC) {
spin_unlock(&inode->i_lock);
__wait_on_bit(wqh, &wq, bit_wait,
TASK_UNINTERRUPTIBLE);
spin_lock(&inode->i_lock);
}
}
/*
* Wait for writeback on an inode to complete. Caller must have inode pinned.
*/
void inode_wait_for_writeback(struct inode *inode)
{
spin_lock(&inode->i_lock);
__inode_wait_for_writeback(inode);
spin_unlock(&inode->i_lock);
}
/*
* Sleep until I_SYNC is cleared. This function must be called with i_lock
* held and drops it. It is aimed for callers not holding any inode reference
* so once i_lock is dropped, inode can go away.
*/
static void inode_sleep_on_writeback(struct inode *inode)
__releases(inode->i_lock)
{
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
int sleep;
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
sleep = inode->i_state & I_SYNC;
spin_unlock(&inode->i_lock);
if (sleep)
schedule();
finish_wait(wqh, &wait);
}
/*
* Find proper writeback list for the inode depending on its current state and
* possibly also change of its state while we were doing writeback. Here we
* handle things such as livelock prevention or fairness of writeback among
* inodes. This function can be called only by flusher thread - noone else
* processes all inodes in writeback lists and requeueing inodes behind flusher
* thread's back can have unexpected consequences.
*/
static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
struct writeback_control *wbc)
{
if (inode->i_state & I_FREEING)
return;
/*
* Sync livelock prevention. Each inode is tagged and synced in one
* shot. If still dirty, it will be redirty_tail()'ed below. Update
* the dirty time to prevent enqueue and sync it again.
*/
if ((inode->i_state & I_DIRTY) &&
(wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
inode->dirtied_when = jiffies;
if (wbc->pages_skipped) {
/*
* writeback is not making progress due to locked
* buffers. Skip this inode for now.
*/
redirty_tail_locked(inode, wb);
return;
}
if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
/*
* We didn't write back all the pages. nfs_writepages()
* sometimes bales out without doing anything.
*/
if (wbc->nr_to_write <= 0) {
/* Slice used up. Queue for next turn. */
requeue_io(inode, wb);
} else {
/*
* Writeback blocked by something other than
* congestion. Delay the inode for some time to
* avoid spinning on the CPU (100% iowait)
* retrying writeback of the dirty page/inode
* that cannot be performed immediately.
*/
redirty_tail_locked(inode, wb);
}
} else if (inode->i_state & I_DIRTY) {
/*
* Filesystems can dirty the inode during writeback operations,
* such as delayed allocation during submission or metadata
* updates after data IO completion.
*/
redirty_tail_locked(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
inode->i_state &= ~I_SYNC_QUEUED;
} else {
/* The inode is clean. Remove from writeback lists. */
inode_cgwb_move_to_attached(inode, wb);
}
}
/*
* Write out an inode and its dirty pages (or some of its dirty pages, depending
* on @wbc->nr_to_write), and clear the relevant dirty flags from i_state.
*
* This doesn't remove the inode from the writeback list it is on, except
* potentially to move it from b_dirty_time to b_dirty due to timestamp
* expiration. The caller is otherwise responsible for writeback list handling.
*
* The caller is also responsible for setting the I_SYNC flag beforehand and
* calling inode_sync_complete() to clear it afterwards.
*/
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
struct address_space *mapping = inode->i_mapping;
long nr_to_write = wbc->nr_to_write;
unsigned dirty;
int ret;
WARN_ON(!(inode->i_state & I_SYNC)); trace_writeback_single_inode_start(inode, wbc, nr_to_write); ret = do_writepages(mapping, wbc);
/*
* Make sure to wait on the data before writing out the metadata.
* This is important for filesystems that modify metadata on data
* I/O completion. We don't do it for sync(2) writeback because it has a
* separate, external IO completion path and ->sync_fs for guaranteeing
* inode metadata is written back correctly.
*/
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
}
/*
* If the inode has dirty timestamps and we need to write them, call
* mark_inode_dirty_sync() to notify the filesystem about it and to
* change I_DIRTY_TIME into I_DIRTY_SYNC.
*/
if ((inode->i_state & I_DIRTY_TIME) && (wbc->sync_mode == WB_SYNC_ALL || time_after(jiffies, inode->dirtied_time_when +
dirtytime_expire_interval * HZ))) {
trace_writeback_lazytime(inode);
mark_inode_dirty_sync(inode);
}
/*
* Get and clear the dirty flags from i_state. This needs to be done
* after calling writepages because some filesystems may redirty the
* inode during writepages due to delalloc. It also needs to be done
* after handling timestamp expiration, as that may dirty the inode too.
*/
spin_lock(&inode->i_lock);
dirty = inode->i_state & I_DIRTY;
inode->i_state &= ~dirty;
/*
* Paired with smp_mb() in __mark_inode_dirty(). This allows
* __mark_inode_dirty() to test i_state without grabbing i_lock -
* either they see the I_DIRTY bits cleared or we see the dirtied
* inode.
*
* I_DIRTY_PAGES is always cleared together above even if @mapping
* still has dirty pages. The flag is reinstated after smp_mb() if
* necessary. This guarantees that either __mark_inode_dirty()
* sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
*/
smp_mb();
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state |= I_DIRTY_PAGES;
spin_unlock(&inode->i_lock);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & ~I_DIRTY_PAGES) {
int err = write_inode(inode, wbc);
if (ret == 0)
ret = err;
}
trace_writeback_single_inode(inode, wbc, nr_to_write);
return ret;
}
/*
* Write out an inode's dirty data and metadata on-demand, i.e. separately from
* the regular batched writeback done by the flusher threads in
* writeback_sb_inodes(). @wbc controls various aspects of the write, such as
* whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE).
*
* To prevent the inode from going away, either the caller must have a reference
* to the inode, or the inode must have I_WILL_FREE or I_FREEING set.
*/
static int writeback_single_inode(struct inode *inode,
struct writeback_control *wbc)
{
struct bdi_writeback *wb;
int ret = 0;
spin_lock(&inode->i_lock);
if (!atomic_read(&inode->i_count))
WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
else
WARN_ON(inode->i_state & I_WILL_FREE); if (inode->i_state & I_SYNC) {
/*
* Writeback is already running on the inode. For WB_SYNC_NONE,
* that's enough and we can just return. For WB_SYNC_ALL, we
* must wait for the existing writeback to complete, then do
* writeback again if there's anything left.
*/
if (wbc->sync_mode != WB_SYNC_ALL)
goto out;
__inode_wait_for_writeback(inode);
}
WARN_ON(inode->i_state & I_SYNC);
/*
* If the inode is already fully clean, then there's nothing to do.
*
* For data-integrity syncs we also need to check whether any pages are
* still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If
* there are any such pages, we'll need to wait for them.
*/
if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
goto out;
inode->i_state |= I_SYNC;
wbc_attach_and_unlock_inode(wbc, inode);
ret = __writeback_single_inode(inode, wbc);
wbc_detach_inode(wbc);
wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
/*
* If the inode is now fully clean, then it can be safely removed from
* its writeback list (if any). Otherwise the flusher threads are
* responsible for the writeback lists.
*/
if (!(inode->i_state & I_DIRTY_ALL))
inode_cgwb_move_to_attached(inode, wb);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
spin_unlock(&inode->i_lock);
return ret;
}
static long writeback_chunk_size(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
long pages;
/*
* WB_SYNC_ALL mode does livelock avoidance by syncing dirty
* inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
* here avoids calling into writeback_inodes_wb() more than once.
*
* The intended call sequence for WB_SYNC_ALL writeback is:
*
* wb_writeback()
* writeback_sb_inodes() <== called only once
* write_cache_pages() <== called once for each inode
* (quickly) tag currently dirty pages
* (maybe slowly) sync all tagged pages
*/
if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
pages = LONG_MAX;
else {
pages = min(wb->avg_write_bandwidth / 2,
global_wb_domain.dirty_limit / DIRTY_SCOPE);
pages = min(pages, work->nr_pages);
pages = round_down(pages + MIN_WRITEBACK_PAGES,
MIN_WRITEBACK_PAGES);
}
return pages;
}
/*
* Write a portion of b_io inodes which belong to @sb.
*
* Return the number of pages and/or inodes written.
*
* NOTE! This is called with wb->list_lock held, and will
* unlock and relock that for each inode it ends up doing
* IO for.
*/
static long writeback_sb_inodes(struct super_block *sb,
struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
struct writeback_control wbc = {
.sync_mode = work->sync_mode,
.tagged_writepages = work->tagged_writepages,
.for_kupdate = work->for_kupdate,
.for_background = work->for_background,
.for_sync = work->for_sync,
.range_cyclic = work->range_cyclic,
.range_start = 0,
.range_end = LLONG_MAX,
};
unsigned long start_time = jiffies;
long write_chunk;
long wrote = 0; /* count both pages and inodes */
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
struct bdi_writeback *tmp_wb;
if (inode->i_sb != sb) {
if (work->sb) {
/*
* We only want to write back data for this
* superblock, move all inodes not belonging
* to it back onto the dirty list.
*/
redirty_tail(inode, wb);
continue;
}
/*
* The inode belongs to a different superblock.
* Bounce back to the caller to unpin this and
* pin the next superblock.
*/
break;
}
/*
* Don't bother with new inodes or inodes being freed, first
* kind does not need periodic writeout yet, and for the latter
* kind writeout is handled by the freer.
*/
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
redirty_tail_locked(inode, wb);
spin_unlock(&inode->i_lock);
continue;
}
if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
/*
* If this inode is locked for writeback and we are not
* doing writeback-for-data-integrity, move it to
* b_more_io so that writeback can proceed with the
* other inodes on s_io.
*
* We'll have another go at writing back this inode
* when we completed a full scan of b_io.
*/
spin_unlock(&inode->i_lock);
requeue_io(inode, wb);
trace_writeback_sb_inodes_requeue(inode);
continue;
}
spin_unlock(&wb->list_lock);
/*
* We already requeued the inode if it had I_SYNC set and we
* are doing WB_SYNC_NONE writeback. So this catches only the
* WB_SYNC_ALL case.
*/
if (inode->i_state & I_SYNC) {
/* Wait for I_SYNC. This function drops i_lock... */
inode_sleep_on_writeback(inode);
/* Inode may be gone, start again */
spin_lock(&wb->list_lock);
continue;
}
inode->i_state |= I_SYNC;
wbc_attach_and_unlock_inode(&wbc, inode);
write_chunk = writeback_chunk_size(wb, work);
wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
/*
* We use I_SYNC to pin the inode in memory. While it is set
* evict_inode() will wait so the inode cannot be freed.
*/
__writeback_single_inode(inode, &wbc);
wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote += write_chunk - wbc.nr_to_write;
if (need_resched()) {
/*
* We're trying to balance between building up a nice
* long list of IOs to improve our merge rate, and
* getting those IOs out quickly for anyone throttling
* in balance_dirty_pages(). cond_resched() doesn't
* unplug, so get our IOs out the door before we
* give up the CPU.
*/
blk_flush_plug(current);
cond_resched();
}
/*
* Requeue @inode if still dirty. Be careful as @inode may
* have been switched to another wb in the meantime.
*/
tmp_wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
wrote++;
requeue_inode(inode, tmp_wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
if (unlikely(tmp_wb != wb)) {
spin_unlock(&tmp_wb->list_lock);
spin_lock(&wb->list_lock);
}
/*
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
*/
if (wrote) {
if (time_is_before_jiffies(start_time + HZ / 10UL))
break;
if (work->nr_pages <= 0)
break;
}
}
return wrote;
}
static long __writeback_inodes_wb(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
unsigned long start_time = jiffies;
long wrote = 0;
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
struct super_block *sb = inode->i_sb;
if (!trylock_super(sb)) {
/*
* trylock_super() may fail consistently due to
* s_umount being grabbed by someone else. Don't use
* requeue_io() to avoid busy retrying the inode/sb.
*/
redirty_tail(inode, wb);
continue;
}
wrote += writeback_sb_inodes(sb, wb, work);
up_read(&sb->s_umount);
/* refer to the same tests at the end of writeback_sb_inodes */
if (wrote) {
if (time_is_before_jiffies(start_time + HZ / 10UL))
break;
if (work->nr_pages <= 0)
break;
}
}
/* Leave any unwritten inodes on b_io */
return wrote;
}
static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
enum wb_reason reason)
{
struct wb_writeback_work work = {
.nr_pages = nr_pages,
.sync_mode = WB_SYNC_NONE,
.range_cyclic = 1,
.reason = reason,
};
struct blk_plug plug;
blk_start_plug(&plug);
spin_lock(&wb->list_lock);
if (list_empty(&wb->b_io))
queue_io(wb, &work, jiffies);
__writeback_inodes_wb(wb, &work);
spin_unlock(&wb->list_lock);
blk_finish_plug(&plug);
return nr_pages - work.nr_pages;
}
/*
* Explicit flushing or periodic writeback of "old" data.
*
* Define "old": the first time one of an inode's pages is dirtied, we mark the
* dirtying-time in the inode's address_space. So this periodic writeback code
* just walks the superblock inode list, writing back any inodes which are
* older than a specific point in time.
*
* Try to run once per dirty_writeback_interval. But if a writeback event
* takes longer than a dirty_writeback_interval interval, then leave a
* one-second gap.
*
* dirtied_before takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*/
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
long nr_pages = work->nr_pages;
unsigned long dirtied_before = jiffies;
struct inode *inode;
long progress;
struct blk_plug plug;
blk_start_plug(&plug);
spin_lock(&wb->list_lock);
for (;;) {
/*
* Stop writeback when nr_pages has been consumed
*/
if (work->nr_pages <= 0)
break;
/*
* Background writeout and kupdate-style writeback may
* run forever. Stop them if there is other work to do
* so that e.g. sync can proceed. They'll be restarted
* after the other works are all done.
*/
if ((work->for_background || work->for_kupdate) &&
!list_empty(&wb->work_list))
break;
/*
* For background writeout, stop when we are below the
* background dirty threshold
*/
if (work->for_background && !wb_over_bg_thresh(wb))
break;
/*
* Kupdate and background works are special and we want to
* include all inodes that need writing. Livelock avoidance is
* handled by these works yielding to any other work so we are
* safe.
*/
if (work->for_kupdate) {
dirtied_before = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background)
dirtied_before = jiffies;
trace_writeback_start(wb, work);
if (list_empty(&wb->b_io))
queue_io(wb, work, dirtied_before);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
progress = __writeback_inodes_wb(wb, work);
trace_writeback_written(wb, work);
/*
* Did we write something? Try for more
*
* Dirty inodes are moved to b_io for writeback in batches.
* The completion of the current batch does not necessarily
* mean the overall work is done. So we keep looping as long
* as made some progress on cleaning pages or inodes.
*/
if (progress)
continue;
/*
* No more inodes for IO, bail
*/
if (list_empty(&wb->b_more_io))
break;
/*
* Nothing written. Wait for some inode to
* become available for writeback. Otherwise
* we'll just busyloop.
*/
trace_writeback_wait(wb, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
/* This function drops i_lock... */
inode_sleep_on_writeback(inode);
spin_lock(&wb->list_lock);
}
spin_unlock(&wb->list_lock);
blk_finish_plug(&plug);
return nr_pages - work->nr_pages;
}
/*
* Return the next wb_writeback_work struct that hasn't been processed yet.
*/
static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
{
struct wb_writeback_work *work = NULL;
spin_lock_bh(&wb->work_lock);
if (!list_empty(&wb->work_list)) {
work = list_entry(wb->work_list.next,
struct wb_writeback_work, list);
list_del_init(&work->list);
}
spin_unlock_bh(&wb->work_lock);
return work;
}
static long wb_check_background_flush(struct bdi_writeback *wb)
{
if (wb_over_bg_thresh(wb)) {
struct wb_writeback_work work = {
.nr_pages = LONG_MAX,
.sync_mode = WB_SYNC_NONE,
.for_background = 1,
.range_cyclic = 1,
.reason = WB_REASON_BACKGROUND,
};
return wb_writeback(wb, &work);
}
return 0;
}
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
unsigned long expired;
long nr_pages;
/*
* When set to zero, disable periodic writeback
*/
if (!dirty_writeback_interval)
return 0;
expired = wb->last_old_flush +
msecs_to_jiffies(dirty_writeback_interval * 10);
if (time_before(jiffies, expired))
return 0;
wb->last_old_flush = jiffies;
nr_pages = get_nr_dirty_pages();
if (nr_pages) {
struct wb_writeback_work work = {
.nr_pages = nr_pages,
.sync_mode = WB_SYNC_NONE,
.for_kupdate = 1,
.range_cyclic = 1,
.reason = WB_REASON_PERIODIC,
};
return wb_writeback(wb, &work);
}
return 0;
}
static long wb_check_start_all(struct bdi_writeback *wb)
{
long nr_pages;
if (!test_bit(WB_start_all, &wb->state))
return 0;
nr_pages = get_nr_dirty_pages();
if (nr_pages) {
struct wb_writeback_work work = {
.nr_pages = wb_split_bdi_pages(wb, nr_pages),
.sync_mode = WB_SYNC_NONE,
.range_cyclic = 1,
.reason = wb->start_all_reason,
};
nr_pages = wb_writeback(wb, &work);
}
clear_bit(WB_start_all, &wb->state);
return nr_pages;
}
/*
* Retrieve work items and do the writeback they describe
*/
static long wb_do_writeback(struct bdi_writeback *wb)
{
struct wb_writeback_work *work;
long wrote = 0;
set_bit(WB_writeback_running, &wb->state);
while ((work = get_next_work_item(wb)) != NULL) {
trace_writeback_exec(wb, work);
wrote += wb_writeback(wb, work);
finish_writeback_work(wb, work);
}
/*
* Check for a flush-everything request
*/
wrote += wb_check_start_all(wb);
/*
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
wrote += wb_check_background_flush(wb);
clear_bit(WB_writeback_running, &wb->state);
return wrote;
}
/*
* Handle writeback of dirty data for the device backed by this bdi. Also
* reschedules periodically and does kupdated style flushing.
*/
void wb_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(to_delayed_work(work),
struct bdi_writeback, dwork);
long pages_written;
set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
current->flags |= PF_SWAPWRITE;
if (likely(!current_is_workqueue_rescuer() ||
!test_bit(WB_registered, &wb->state))) {
/*
* The normal path. Keep writing back @wb until its
* work_list is empty. Note that this path is also taken
* if @wb is shutting down even when we're running off the
* rescuer as work_list needs to be drained.
*/
do {
pages_written = wb_do_writeback(wb);
trace_writeback_pages_written(pages_written);
} while (!list_empty(&wb->work_list));
} else {
/*
* bdi_wq can't get enough workers and we're running off
* the emergency worker. Don't hog it. Hopefully, 1024 is
* enough for efficient IO.
*/
pages_written = writeback_inodes_wb(wb, 1024,
WB_REASON_FORKER_THREAD);
trace_writeback_pages_written(pages_written);
}
if (!list_empty(&wb->work_list))
wb_wakeup(wb);
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
wb_wakeup_delayed(wb);
current->flags &= ~PF_SWAPWRITE;
}
/*
* Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero,
* write back the whole world.
*/
static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
enum wb_reason reason)
{
struct bdi_writeback *wb;
if (!bdi_has_dirty_io(bdi))
return;
list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
wb_start_writeback(wb, reason);
}
void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
enum wb_reason reason)
{
rcu_read_lock();
__wakeup_flusher_threads_bdi(bdi, reason);
rcu_read_unlock();
}
/*
* Wakeup the flusher threads to start writeback of all currently dirty pages
*/
void wakeup_flusher_threads(enum wb_reason reason)
{
struct backing_dev_info *bdi;
/*
* If we are expecting writeback progress we must submit plugged IO.
*/
if (blk_needs_flush_plug(current))
blk_schedule_flush_plug(current);
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
__wakeup_flusher_threads_bdi(bdi, reason);
rcu_read_unlock();
}
/*
* Wake up bdi's periodically to make sure dirtytime inodes gets
* written back periodically. We deliberately do *not* check the
* b_dirtytime list in wb_has_dirty_io(), since this would cause the
* kernel to be constantly waking up once there are any dirtytime
* inodes on the system. So instead we define a separate delayed work
* function which gets called much more rarely. (By default, only
* once every 12 hours.)
*
* If there is any other write activity going on in the file system,
* this function won't be necessary. But if the only thing that has
* happened on the file system is a dirtytime inode caused by an atime
* update, we need this infrastructure below to make sure that inode
* eventually gets pushed out to disk.
*/
static void wakeup_dirtytime_writeback(struct work_struct *w);
static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
static void wakeup_dirtytime_writeback(struct work_struct *w)
{
struct backing_dev_info *bdi;
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
struct bdi_writeback *wb;
list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
if (!list_empty(&wb->b_dirty_time))
wb_wakeup(wb);
}
rcu_read_unlock();
schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
}
static int __init start_dirtytime_writeback(void)
{
schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
return 0;
}
__initcall(start_dirtytime_writeback);
int dirtytime_interval_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
mod_delayed_work(system_wq, &dirtytime_work, 0);
return ret;
}
/**
* __mark_inode_dirty - internal function to mark an inode dirty
*
* @inode: inode to mark
* @flags: what kind of dirty, e.g. I_DIRTY_SYNC. This can be a combination of
* multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined
* with I_DIRTY_PAGES.
*
* Mark an inode as dirty. We notify the filesystem, then update the inode's
* dirty flags. Then, if needed we add the inode to the appropriate dirty list.
*
* Most callers should use mark_inode_dirty() or mark_inode_dirty_sync()
* instead of calling this directly.
*
* CAREFUL! We only add the inode to the dirty list if it is hashed or if it
* refers to a blockdev. Unhashed inodes will never be added to the dirty list
* even if they are later hashed, as they will have been marked dirty already.
*
* In short, ensure you hash any inodes _before_ you start marking them dirty.
*
* Note that for blockdevs, inode->dirtied_when represents the dirtying time of
* the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
* the kernel-internal blockdev inode represents the dirtying time of the
* blockdev's pages. This is why for I_DIRTY_PAGES we always use
* page->mapping->host, so the page-dirtying time is recorded in the internal
* blockdev inode.
*/
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
int dirtytime = 0;
trace_writeback_mark_inode_dirty(inode, flags);
if (flags & I_DIRTY_INODE) {
/*
* Notify the filesystem about the inode being dirtied, so that
* (if needed) it can update on-disk fields and journal the
* inode. This is only needed when the inode itself is being
* dirtied now. I.e. it's only needed for I_DIRTY_INODE, not
* for just I_DIRTY_PAGES or I_DIRTY_TIME.
*/
trace_writeback_dirty_inode_start(inode, flags);
if (sb->s_op->dirty_inode) sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE);
trace_writeback_dirty_inode(inode, flags);
/* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
flags &= ~I_DIRTY_TIME;
} else {
/*
* Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing.
* (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME
* in one call to __mark_inode_dirty().)
*/
dirtytime = flags & I_DIRTY_TIME; WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME);
}
/*
* Paired with smp_mb() in __writeback_single_inode() for the
* following lockless i_state test. See there for details.
*/
smp_mb(); if (((inode->i_state & flags) == flags) || (dirtytime && (inode->i_state & I_DIRTY_INODE)))
return;
spin_lock(&inode->i_lock);
if (dirtytime && (inode->i_state & I_DIRTY_INODE))
goto out_unlock_inode;
if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY;
inode_attach_wb(inode, NULL);
/* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
if (flags & I_DIRTY_INODE)
inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags;
/*
* If the inode is queued for writeback by flush worker, just
* update its dirty state. Once the flush worker is done with
* the inode it will place it on the appropriate superblock
* list, based upon its state.
*/
if (inode->i_state & I_SYNC_QUEUED)
goto out_unlock_inode;
/*
* Only add valid (hashed) inodes to the superblock's
* dirty list. Add blockdev inodes as well.
*/
if (!S_ISBLK(inode->i_mode)) {
if (inode_unhashed(inode))
goto out_unlock_inode;
}
if (inode->i_state & I_FREEING)
goto out_unlock_inode;
/*
* If the inode was already on b_dirty/b_io/b_more_io, don't
* reposition it (that would break b_dirty time-ordering).
*/
if (!was_dirty) {
struct bdi_writeback *wb;
struct list_head *dirty_list;
bool wakeup_bdi = false;
wb = locked_inode_to_wb_and_lock_list(inode);
inode->dirtied_when = jiffies;
if (dirtytime)
inode->dirtied_time_when = jiffies; if (inode->i_state & I_DIRTY) dirty_list = &wb->b_dirty;
else
dirty_list = &wb->b_dirty_time; wakeup_bdi = inode_io_list_move_locked(inode, wb,
dirty_list);
spin_unlock(&wb->list_lock);
trace_writeback_dirty_inode_enqueue(inode);
/*
* If this is the first dirty inode for this bdi,
* we have to wake-up the corresponding bdi thread
* to make sure background write-back happens
* later.
*/
if (wakeup_bdi && (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) wb_wakeup_delayed(wb);
return;
}
}
out_unlock_inode:
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);
/*
* The @s_sync_lock is used to serialise concurrent sync operations
* to avoid lock contention problems with concurrent wait_sb_inodes() calls.
* Concurrent callers will block on the s_sync_lock rather than doing contending
* walks. The queueing maintains sync(2) required behaviour as all the IO that
* has been issued up to the time this function is enter is guaranteed to be
* completed by the time we have gained the lock and waited for all IO that is
* in progress regardless of the order callers are granted the lock.
*/
static void wait_sb_inodes(struct super_block *sb)
{
LIST_HEAD(sync_list);
/*
* We need to be protected against the filesystem going from
* r/o to r/w or vice versa.
*/
WARN_ON(!rwsem_is_locked(&sb->s_umount)); mutex_lock(&sb->s_sync_lock);
/*
* Splice the writeback list onto a temporary list to avoid waiting on
* inodes that have started writeback after this point.
*
* Use rcu_read_lock() to keep the inodes around until we have a
* reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as
* the local list because inodes can be dropped from either by writeback
* completion.
*/
rcu_read_lock();
spin_lock_irq(&sb->s_inode_wblist_lock);
list_splice_init(&sb->s_inodes_wb, &sync_list);
/*
* Data integrity sync. Must wait for all pages under writeback, because
* there may have been pages dirtied before our sync call, but which had
* writeout started before we write it out. In which case, the inode
* may not be on the dirty list, but we still have to wait for that
* writeout.
*/
while (!list_empty(&sync_list)) {
struct inode *inode = list_first_entry(&sync_list, struct inode,
i_wb_list);
struct address_space *mapping = inode->i_mapping;
/*
* Move each inode back to the wb list before we drop the lock
* to preserve consistency between i_wb_list and the mapping
* writeback tag. Writeback completion is responsible to remove
* the inode from either list once the writeback tag is cleared.
*/
list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
/*
* The mapping can appear untagged while still on-list since we
* do not have the mapping lock. Skip it here, wb completion
* will remove it.
*/
if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
continue;
spin_unlock_irq(&sb->s_inode_wblist_lock);
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
spin_unlock(&inode->i_lock);
spin_lock_irq(&sb->s_inode_wblist_lock);
continue;
}
__iget(inode);
spin_unlock(&inode->i_lock);
rcu_read_unlock();
/*
* We keep the error status of individual mapping so that
* applications can catch the writeback error using fsync(2).
* See filemap_fdatawait_keep_errors() for details.
*/
filemap_fdatawait_keep_errors(mapping);
cond_resched();
iput(inode);
rcu_read_lock();
spin_lock_irq(&sb->s_inode_wblist_lock);
}
spin_unlock_irq(&sb->s_inode_wblist_lock);
rcu_read_unlock();
mutex_unlock(&sb->s_sync_lock);
}
static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
enum wb_reason reason, bool skip_if_busy)
{
struct backing_dev_info *bdi = sb->s_bdi;
DEFINE_WB_COMPLETION(done, bdi);
struct wb_writeback_work work = {
.sb = sb,
.sync_mode = WB_SYNC_NONE,
.tagged_writepages = 1,
.done = &done,
.nr_pages = nr,
.reason = reason,
};
if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); wb_wait_for_completion(&done);
}
/**
* writeback_inodes_sb_nr - writeback dirty inodes from given super_block
* @sb: the superblock
* @nr: the number of pages to write
* @reason: reason why some writeback work initiated
*
* Start writeback on some inodes on this super_block. No guarantees are made
* on how many (if any) will be written, and this function does not wait
* for IO completion of submitted IO.
*/
void writeback_inodes_sb_nr(struct super_block *sb,
unsigned long nr,
enum wb_reason reason)
{
__writeback_inodes_sb_nr(sb, nr, reason, false);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);
/**
* writeback_inodes_sb - writeback dirty inodes from given super_block
* @sb: the superblock
* @reason: reason why some writeback work was initiated
*
* Start writeback on some inodes on this super_block. No guarantees are made
* on how many (if any) will be written, and this function does not wait
* for IO completion of submitted IO.
*/
void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
EXPORT_SYMBOL(writeback_inodes_sb);
/**
* try_to_writeback_inodes_sb - try to start writeback if none underway
* @sb: the superblock
* @reason: reason why some writeback work was initiated
*
* Invoke __writeback_inodes_sb_nr if no writeback is currently underway.
*/
void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
if (!down_read_trylock(&sb->s_umount))
return;
__writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
up_read(&sb->s_umount);
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb);
/**
* sync_inodes_sb - sync sb inode pages
* @sb: the superblock
*
* This function writes and waits on any dirty inode belonging to this
* super_block.
*/
void sync_inodes_sb(struct super_block *sb)
{
struct backing_dev_info *bdi = sb->s_bdi;
DEFINE_WB_COMPLETION(done, bdi);
struct wb_writeback_work work = {
.sb = sb,
.sync_mode = WB_SYNC_ALL,
.nr_pages = LONG_MAX,
.range_cyclic = 0,
.done = &done,
.reason = WB_REASON_SYNC,
.for_sync = 1,
};
/*
* Can't skip on !bdi_has_dirty() because we should wait for !dirty
* inodes under writeback and I_DIRTY_TIME inodes ignored by
* bdi_has_dirty() need to be written out too.
*/
if (bdi == &noop_backing_dev_info)
return; WARN_ON(!rwsem_is_locked(&sb->s_umount));
/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
bdi_down_write_wb_switch_rwsem(bdi);
bdi_split_work_to_wbs(bdi, &work, false);
wb_wait_for_completion(&done);
bdi_up_write_wb_switch_rwsem(bdi);
wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);
/**
* write_inode_now - write an inode to disk
* @inode: inode to write to disk
* @sync: whether the write should be synchronous or not
*
* This function commits an inode to disk immediately if it is dirty. This is
* primarily needed by knfsd.
*
* The caller must either have a ref on the inode or must have set I_WILL_FREE.
*/
int write_inode_now(struct inode *inode, int sync)
{
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
.range_start = 0,
.range_end = LLONG_MAX,
};
if (!mapping_can_writeback(inode->i_mapping))
wbc.nr_to_write = 0; might_sleep();
return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(write_inode_now);
/**
* sync_inode_metadata - write an inode to disk
* @inode: the inode to sync
* @wait: wait for I/O to complete.
*
* Write an inode to disk and adjust its dirty state after completion.
*
* Note: only writes the actual inode, no associated data or other metadata.
*/
int sync_inode_metadata(struct inode *inode, int wait)
{
struct writeback_control wbc = {
.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
.nr_to_write = 0, /* metadata-only */
};
return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);
// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/mmap.c
*
* Written by obz.
*
* Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/profile.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/mmdebug.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#define CREATE_TRACE_POINTS
#include <trace/events/mmap.h>
#include "internal.h"
#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags) (0)
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
/* description of effects of mapping type and prot in current implementation.
* this is due to the limited x86 page protection hardware. The expected
* behavior is in parens:
*
* map_type prot
* PROT_NONE PROT_READ PROT_WRITE PROT_EXEC
* MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (yes) yes w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
*
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
*
* On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
* MAP_PRIVATE (with Enhanced PAN supported):
* r: (no) no
* w: (no) no
* x: (yes) yes
*/
pgprot_t protection_map[16] __ro_after_init = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT
static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
{
return prot;
}
#endif
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
pgprot_val(arch_vm_get_page_prot(vm_flags)));
return arch_filter_pgprot(ret);
}
EXPORT_SYMBOL(vm_get_page_prot);
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
}
/* Update vma->vm_page_prot to reflect vma->vm_flags. */
void vma_set_page_prot(struct vm_area_struct *vma)
{
unsigned long vm_flags = vma->vm_flags;
pgprot_t vm_page_prot;
vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
if (vma_wants_writenotify(vma, vm_page_prot)) {
vm_flags &= ~VM_SHARED;
vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
}
/* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
}
/*
* Requires inode->i_mapping->i_mmap_rwsem
*/
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
{
if (vma->vm_flags & VM_SHARED)
mapping_unmap_writable(mapping);
flush_dcache_mmap_lock(mapping);
vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
/*
* Unlink a file-based vm structure from its interval tree, to hide
* vma from rmap and vmtruncate before freeing its page tables.
*/
void unlink_file_vma(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
if (file) {
struct address_space *mapping = file->f_mapping;
i_mmap_lock_write(mapping);
__remove_shared_vm_struct(vma, file, mapping);
i_mmap_unlock_write(mapping);
}
}
/*
* Close a vm structure and free it, returning the next.
*/
static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
{
struct vm_area_struct *next = vma->vm_next;
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
vm_area_free(vma);
return next;
}
static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
struct list_head *uf);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
struct vm_area_struct *next;
unsigned long min_brk;
bool populate;
bool downgraded = false;
LIST_HEAD(uf);
if (mmap_write_lock_killable(mm))
return -EINTR;
origbrk = mm->brk;
#ifdef CONFIG_COMPAT_BRK
/*
* CONFIG_COMPAT_BRK can still be overridden by setting
* randomize_va_space to 2, which will still cause mm->start_brk
* to be arbitrarily shifted
*/
if (current->brk_randomized)
min_brk = mm->start_brk;
else
min_brk = mm->end_data;
#else
min_brk = mm->start_brk;
#endif
if (brk < min_brk)
goto out;
/*
* Check against rlimit here. If this check is done later after the test
* of oldbrk with newbrk then it can escape the test and let the data
* segment grow beyond its set limit the in case where the limit is
* not page aligned -Ram Gupta
*/
if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
mm->end_data, mm->start_data))
goto out;
newbrk = PAGE_ALIGN(brk);
oldbrk = PAGE_ALIGN(mm->brk);
if (oldbrk == newbrk) {
mm->brk = brk;
goto success;
}
/*
* Always allow shrinking brk.
* __do_munmap() may downgrade mmap_lock to read.
*/
if (brk <= mm->brk) {
int ret;
/*
* mm->brk must to be protected by write mmap_lock so update it
* before downgrading mmap_lock. When __do_munmap() fails,
* mm->brk will be restored from origbrk.
*/
mm->brk = brk;
ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
if (ret < 0) {
mm->brk = origbrk;
goto out;
} else if (ret == 1) {
downgraded = true;
}
goto success;
}
/* Check against existing mmap mappings. */
next = find_vma(mm, oldbrk);
if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;
/* Ok, looks good - let it rip. */
if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
goto out;
mm->brk = brk;
success:
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
if (downgraded)
mmap_read_unlock(mm);
else
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
return brk;
out:
mmap_write_unlock(mm);
return origbrk;
}
static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
{
unsigned long gap, prev_end;
/*
* Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
* allow two stack_guard_gaps between them here, and when choosing
* an unmapped area; whereas when expanding we only require one.
* That's a little inconsistent, but keeps the code here simpler.
*/
gap = vm_start_gap(vma); if (vma->vm_prev) { prev_end = vm_end_gap(vma->vm_prev);
if (gap > prev_end)
gap -= prev_end;
else
gap = 0;
}
return gap;
}
#ifdef CONFIG_DEBUG_VM_RB
static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
{
unsigned long max = vma_compute_gap(vma), subtree_gap;
if (vma->vm_rb.rb_left) {
subtree_gap = rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb)->rb_subtree_gap;
if (subtree_gap > max)
max = subtree_gap;
}
if (vma->vm_rb.rb_right) {
subtree_gap = rb_entry(vma->vm_rb.rb_right,
struct vm_area_struct, vm_rb)->rb_subtree_gap;
if (subtree_gap > max)
max = subtree_gap;
}
return max;
}
static int browse_rb(struct mm_struct *mm)
{
struct rb_root *root = &mm->mm_rb;
int i = 0, j, bug = 0;
struct rb_node *nd, *pn = NULL;
unsigned long prev = 0, pend = 0;
for (nd = rb_first(root); nd; nd = rb_next(nd)) {
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
if (vma->vm_start < prev) {
pr_emerg("vm_start %lx < prev %lx\n",
vma->vm_start, prev);
bug = 1;
}
if (vma->vm_start < pend) {
pr_emerg("vm_start %lx < pend %lx\n",
vma->vm_start, pend);
bug = 1;
}
if (vma->vm_start > vma->vm_end) {
pr_emerg("vm_start %lx > vm_end %lx\n",
vma->vm_start, vma->vm_end);
bug = 1;
}
spin_lock(&mm->page_table_lock);
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
pr_emerg("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
}
spin_unlock(&mm->page_table_lock);
i++;
pn = nd;
prev = vma->vm_start;
pend = vma->vm_end;
}
j = 0;
for (nd = pn; nd; nd = rb_prev(nd))
j++;
if (i != j) {
pr_emerg("backwards %d, forwards %d\n", j, i);
bug = 1;
}
return bug ? -1 : i;
}
static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
{
struct rb_node *nd;
for (nd = rb_first(root); nd; nd = rb_next(nd)) {
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
VM_BUG_ON_VMA(vma != ignore &&
vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
vma);
}
}
static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
unsigned long highest_address = 0;
struct vm_area_struct *vma = mm->mmap;
while (vma) {
struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
if (anon_vma) {
anon_vma_lock_read(anon_vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_verify(avc);
anon_vma_unlock_read(anon_vma);
}
highest_address = vm_end_gap(vma);
vma = vma->vm_next;
i++;
}
if (i != mm->map_count) {
pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
bug = 1;
}
if (highest_address != mm->highest_vm_end) {
pr_emerg("mm->highest_vm_end %lx, found %lx\n",
mm->highest_vm_end, highest_address);
bug = 1;
}
i = browse_rb(mm);
if (i != mm->map_count) {
if (i != -1)
pr_emerg("map_count %d rb %d\n", mm->map_count, i);
bug = 1;
}
VM_BUG_ON_MM(bug, mm);
}
#else
#define validate_mm_rb(root, ignore) do { } while (0)
#define validate_mm(mm) do { } while (0)
#endif
RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
struct vm_area_struct, vm_rb,
unsigned long, rb_subtree_gap, vma_compute_gap)
/*
* Update augmented rbtree rb_subtree_gap values after vma->vm_start or
* vma->vm_prev->vm_end values changed, without modifying the vma's position
* in the rbtree.
*/
static void vma_gap_update(struct vm_area_struct *vma)
{
/*
* As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
* a callback function that does exactly what we want.
*/
vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
}
static inline void vma_rb_insert(struct vm_area_struct *vma,
struct rb_root *root)
{
/* All rb_subtree_gap values must be consistent prior to insertion */
validate_mm_rb(root, NULL);
rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}
static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
{
/*
* Note rb_erase_augmented is a fairly large inline function,
* so make sure we instantiate it only once with our desired
* augmented rbtree callbacks.
*/
rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}
static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
struct rb_root *root,
struct vm_area_struct *ignore)
{
/*
* All rb_subtree_gap values must be consistent prior to erase,
* with the possible exception of
*
* a. the "next" vma being erased if next->vm_start was reduced in
* __vma_adjust() -> __vma_unlink()
* b. the vma being erased in detach_vmas_to_be_unmapped() ->
* vma_rb_erase()
*/
validate_mm_rb(root, ignore);
__vma_rb_erase(vma, root);
}
static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
struct rb_root *root)
{
vma_rb_erase_ignore(vma, root, vma);
}
/*
* vma has some anon_vma assigned, and is already inserted on that
* anon_vma's interval trees.
*
* Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
* vma must be removed from the anon_vma's interval trees using
* anon_vma_interval_tree_pre_update_vma().
*
* After the update, the vma will be reinserted using
* anon_vma_interval_tree_post_update_vma().
*
* The entire update must be protected by exclusive mmap_lock and by
* the root anon_vma's mutex.
*/
static inline void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc;
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}
static inline void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc;
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}
static int find_vma_links(struct mm_struct *mm, unsigned long addr,
unsigned long end, struct vm_area_struct **pprev,
struct rb_node ***rb_link, struct rb_node **rb_parent)
{
struct rb_node **__rb_link, *__rb_parent, *rb_prev;
mmap_assert_locked(mm);
__rb_link = &mm->mm_rb.rb_node;
rb_prev = __rb_parent = NULL;
while (*__rb_link) {
struct vm_area_struct *vma_tmp;
__rb_parent = *__rb_link;
vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
if (vma_tmp->vm_end > addr) {
/* Fail if an existing vma overlaps the area */
if (vma_tmp->vm_start < end)
return -ENOMEM;
__rb_link = &__rb_parent->rb_left;
} else {
rb_prev = __rb_parent;
__rb_link = &__rb_parent->rb_right;
}
}
*pprev = NULL;
if (rb_prev)
*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
*rb_link = __rb_link;
*rb_parent = __rb_parent;
return 0;
}
/*
* vma_next() - Get the next VMA.
* @mm: The mm_struct.
* @vma: The current vma.
*
* If @vma is NULL, return the first vma in the mm.
*
* Returns: The next VMA after @vma.
*/
static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
struct vm_area_struct *vma)
{
if (!vma)
return mm->mmap;
return vma->vm_next;
}
/*
* munmap_vma_range() - munmap VMAs that overlap a range.
* @mm: The mm struct
* @start: The start of the range.
* @len: The length of the range.
* @pprev: pointer to the pointer that will be set to previous vm_area_struct
* @rb_link: the rb_node
* @rb_parent: the parent rb_node
*
* Find all the vm_area_struct that overlap from @start to
* @end and munmap them. Set @pprev to the previous vm_area_struct.
*
* Returns: -ENOMEM on munmap failure or 0 on success.
*/
static inline int
munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
struct vm_area_struct **pprev, struct rb_node ***link,
struct rb_node **parent, struct list_head *uf)
{
while (find_vma_links(mm, start, start + len, pprev, link, parent))
if (do_munmap(mm, start, len, uf))
return -ENOMEM;
return 0;
}
static unsigned long count_vma_pages_range(struct mm_struct *mm,
unsigned long addr, unsigned long end)
{
unsigned long nr_pages = 0;
struct vm_area_struct *vma;
/* Find first overlapping mapping */
vma = find_vma_intersection(mm, addr, end);
if (!vma)
return 0;
nr_pages = (min(end, vma->vm_end) -
max(addr, vma->vm_start)) >> PAGE_SHIFT;
/* Iterate over the rest of the overlaps */
for (vma = vma->vm_next; vma; vma = vma->vm_next) {
unsigned long overlap_len;
if (vma->vm_start > end)
break;
overlap_len = min(end, vma->vm_end) - vma->vm_start;
nr_pages += overlap_len >> PAGE_SHIFT;
}
return nr_pages;
}
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
/* Update tracking information for the gap following the new vma. */
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
mm->highest_vm_end = vm_end_gap(vma);
/*
* vma->vm_prev wasn't known when we followed the rbtree to find the
* correct insertion point for that vma. As a result, we could not
* update the vma vm_rb parents rb_subtree_gap values on the way down.
* So, we first insert the vma with a zero rb_subtree_gap value
* (to be consistent with what we did on the way down), and then
* immediately update the gap to the correct value. Finally we
* rebalance the rbtree after all augmented values have been set.
*/
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
vma->rb_subtree_gap = 0;
vma_gap_update(vma);
vma_rb_insert(vma, &mm->mm_rb);
}
static void __vma_link_file(struct vm_area_struct *vma)
{
struct file *file;
file = vma->vm_file;
if (file) {
struct address_space *mapping = file->f_mapping;
if (vma->vm_flags & VM_SHARED)
mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
}
static void
__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
__vma_link_list(mm, vma, prev);
__vma_link_rb(mm, vma, rb_link, rb_parent);
}
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
struct address_space *mapping = NULL;
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
}
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
if (mapping)
i_mmap_unlock_write(mapping);
mm->map_count++;
validate_mm(mm);
}
/*
* Helper for vma_adjust() in the split_vma insert case: insert a vma into the
* mm's list and rbtree. It has already been inserted into the interval tree.
*/
static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
struct vm_area_struct *prev;
struct rb_node **rb_link, *rb_parent;
if (find_vma_links(mm, vma->vm_start, vma->vm_end,
&prev, &rb_link, &rb_parent))
BUG();
__vma_link(mm, vma, prev, rb_link, rb_parent);
mm->map_count++;
}
static __always_inline void __vma_unlink(struct mm_struct *mm,
struct vm_area_struct *vma,
struct vm_area_struct *ignore)
{
vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
__vma_unlink_list(mm, vma);
/* Kill the cache */
vmacache_invalidate(mm);
}
/*
* We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
* is already present in an i_mmap tree without adjusting the tree.
* The following helper function should be used when such adjustments
* are necessary. The "insert" vma (if any) is to be inserted
* before we drop the necessary locks.
*/
int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
struct vm_area_struct *expand)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
struct address_space *mapping = NULL;
struct rb_root_cached *root = NULL;
struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
bool start_changed = false, end_changed = false;
long adjust_next = 0;
int remove_next = 0;
if (next && !insert) {
struct vm_area_struct *exporter = NULL, *importer = NULL;
if (end >= next->vm_end) {
/*
* vma expands, overlapping all the next, and
* perhaps the one after too (mprotect case 6).
* The only other cases that gets here are
* case 1, case 7 and case 8.
*/
if (next == expand) {
/*
* The only case where we don't expand "vma"
* and we expand "next" instead is case 8.
*/
VM_WARN_ON(end != next->vm_end);
/*
* remove_next == 3 means we're
* removing "vma" and that to do so we
* swapped "vma" and "next".
*/
remove_next = 3;
VM_WARN_ON(file != next->vm_file);
swap(vma, next);
} else {
VM_WARN_ON(expand != vma);
/*
* case 1, 6, 7, remove_next == 2 is case 6,
* remove_next == 1 is case 1 or 7.
*/
remove_next = 1 + (end > next->vm_end);
VM_WARN_ON(remove_next == 2 &&
end != next->vm_next->vm_end);
/* trim end to next, for case 6 first pass */
end = next->vm_end;
}
exporter = next;
importer = vma;
/*
* If next doesn't have anon_vma, import from vma after
* next, if the vma overlaps with it.
*/
if (remove_next == 2 && !next->anon_vma)
exporter = next->vm_next;
} else if (end > next->vm_start) {
/*
* vma expands, overlapping part of the next:
* mprotect case 5 shifting the boundary up.
*/
adjust_next = (end - next->vm_start);
exporter = next;
importer = vma;
VM_WARN_ON(expand != importer);
} else if (end < vma->vm_end) {
/*
* vma shrinks, and !insert tells it's not
* split_vma inserting another: so it must be
* mprotect case 4 shifting the boundary down.
*/
adjust_next = -(vma->vm_end - end);
exporter = vma;
importer = next;
VM_WARN_ON(expand != importer);
}
/*
* Easily overlooked: when mprotect shifts the boundary,
* make sure the expanding vma has anon_vma set if the
* shrinking vma had, to cover any anon pages imported.
*/
if (exporter && exporter->anon_vma && !importer->anon_vma) {
int error;
importer->anon_vma = exporter->anon_vma;
error = anon_vma_clone(importer, exporter);
if (error)
return error;
}
}
again:
vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
if (file) {
mapping = file->f_mapping;
root = &mapping->i_mmap;
uprobe_munmap(vma, vma->vm_start, vma->vm_end);
if (adjust_next)
uprobe_munmap(next, next->vm_start, next->vm_end);
i_mmap_lock_write(mapping);
if (insert) {
/*
* Put into interval tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
* throughout; but we cannot insert into address
* space until vma start or end is updated.
*/
__vma_link_file(insert);
}
}
anon_vma = vma->anon_vma;
if (!anon_vma && adjust_next)
anon_vma = next->anon_vma;
if (anon_vma) {
VM_WARN_ON(adjust_next && next->anon_vma &&
anon_vma != next->anon_vma);
anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next)
anon_vma_interval_tree_pre_update_vma(next);
}
if (file) {
flush_dcache_mmap_lock(mapping);
vma_interval_tree_remove(vma, root);
if (adjust_next)
vma_interval_tree_remove(next, root);
}
if (start != vma->vm_start) {
vma->vm_start = start;
start_changed = true;
}
if (end != vma->vm_end) {
vma->vm_end = end;
end_changed = true;
}
vma->vm_pgoff = pgoff;
if (adjust_next) {
next->vm_start += adjust_next;
next->vm_pgoff += adjust_next >> PAGE_SHIFT;
}
if (file) {
if (adjust_next)
vma_interval_tree_insert(next, root);
vma_interval_tree_insert(vma, root);
flush_dcache_mmap_unlock(mapping);
}
if (remove_next) {
/*
* vma_merge has merged next into vma, and needs
* us to remove next before dropping the locks.
*/
if (remove_next != 3)
__vma_unlink(mm, next, next);
else
/*
* vma is not before next if they've been
* swapped.
*
* pre-swap() next->vm_start was reduced so
* tell validate_mm_rb to ignore pre-swap()
* "next" (which is stored in post-swap()
* "vma").
*/
__vma_unlink(mm, next, vma);
if (file)
__remove_shared_vm_struct(next, file, mapping);
} else if (insert) {
/*
* split_vma has split insert from vma, and needs
* us to insert it before dropping the locks
* (it may either follow vma or precede it).
*/
__insert_vm_struct(mm, insert);
} else {
if (start_changed)
vma_gap_update(vma);
if (end_changed) {
if (!next)
mm->highest_vm_end = vm_end_gap(vma);
else if (!adjust_next)
vma_gap_update(next);
}
}
if (anon_vma) {
anon_vma_interval_tree_post_update_vma(vma);
if (adjust_next)
anon_vma_interval_tree_post_update_vma(next);
anon_vma_unlock_write(anon_vma);
}
if (file) {
i_mmap_unlock_write(mapping);
uprobe_mmap(vma);
if (adjust_next)
uprobe_mmap(next);
}
if (remove_next) {
if (file) {
uprobe_munmap(next, next->vm_start, next->vm_end);
fput(file);
}
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
mpol_put(vma_policy(next));
vm_area_free(next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
* up the code too much to do both in one go.
*/
if (remove_next != 3) {
/*
* If "next" was removed and vma->vm_end was
* expanded (up) over it, in turn
* "next->vm_prev->vm_end" changed and the
* "vma->vm_next" gap must be updated.
*/
next = vma->vm_next;
} else {
/*
* For the scope of the comment "next" and
* "vma" considered pre-swap(): if "vma" was
* removed, next->vm_start was expanded (down)
* over it and the "next" gap must be updated.
* Because of the swap() the post-swap() "vma"
* actually points to pre-swap() "next"
* (post-swap() "next" as opposed is now a
* dangling pointer).
*/
next = vma;
}
if (remove_next == 2) {
remove_next = 1;
end = next->vm_end;
goto again;
}
else if (next)
vma_gap_update(next);
else {
/*
* If remove_next == 2 we obviously can't
* reach this path.
*
* If remove_next == 3 we can't reach this
* path because pre-swap() next is always not
* NULL. pre-swap() "next" is not being
* removed and its next->vm_end is not altered
* (and furthermore "end" already matches
* next->vm_end in remove_next == 3).
*
* We reach this only in the remove_next == 1
* case if the "next" vma that was removed was
* the highest vma of the mm. However in such
* case next->vm_end == "end" and the extended
* "vma" has vma->vm_end == next->vm_end so
* mm->highest_vm_end doesn't need any update
* in remove_next == 1 case.
*/
VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
}
}
if (insert && file)
uprobe_mmap(insert);
validate_mm(mm);
return 0;
}
/*
* If the vma has a ->close operation then the driver probably needs to release
* per-vma resources, so we don't attempt to merge those.
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
* match the flags but dirty bit -- the caller should mark
* merged VMA as dirty. If dirty bit won't be excluded from
* comparison, we increase pressure on the memory system forcing
* the kernel to generate new VMAs when old one could be
* extended instead.
*/
if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
return 0;
if (vma->vm_file != file)
return 0;
if (vma->vm_ops && vma->vm_ops->close)
return 0;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
return 0;
return 1;
}
static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
struct anon_vma *anon_vma2,
struct vm_area_struct *vma)
{
/*
* The list_is_singular() test is to avoid merging VMA cloned from
* parents. This can improve scalability caused by anon_vma lock.
*/
if ((!anon_vma1 || !anon_vma2) && (!vma ||
list_is_singular(&vma->anon_vma_chain)))
return 1;
return anon_vma1 == anon_vma2;
}
/*
* Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
* in front of (at a lower virtual address and file offset than) the vma.
*
* We cannot merge two vmas if they have differently assigned (non-NULL)
* anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
*
* We don't check here for the merged mmap wrapping around the end of pagecache
* indices (16TB on ia32) because do_mmap() does not permit mmap's which
* wrap, nor mmaps which cover the final page at index -1UL.
*/
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
}
return 0;
}
/*
* Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
* beyond (at a higher virtual address and file offset than) the vma.
*
* We cannot merge two vmas if they have differently assigned (non-NULL)
* anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
*/
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
if (vma->vm_pgoff + vm_pglen == vm_pgoff)
return 1;
}
return 0;
}
/*
* Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
* whether that can be merged with its predecessor or its successor.
* Or both (it neatly fills a hole).
*
* In most cases - when called for mmap, brk or mremap - [addr,end) is
* certain not to be mapped by the time vma_merge is called; but when
* called for mprotect, it is certain to be already mapped (either at
* an offset within prev, or at the start of next), and the flags of
* this area are about to be changed to vm_flags - and the no-change
* case has already been eliminated.
*
* The following mprotect cases have to be considered, where AAAA is
* the area passed down from mprotect_fixup, never extending beyond one
* vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
*
* AAAA AAAA AAAA
* PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN
* cannot merge might become might become
* PPNNNNNNNNNN PPPPPPPPPPNN
* mmap, brk or case 4 below case 5 below
* mremap move:
* AAAA AAAA
* PPPP NNNN PPPPNNNNXXXX
* might become might become
* PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
* PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or
* PPPPNNNNNNNN 3 PPPPXXXXXXXX 8
*
* It is important for case 8 that the vma NNNN overlapping the
* region AAAA is never going to extended over XXXX. Instead XXXX must
* be extended in region AAAA and NNNN must be removed. This way in
* all cases where vma_merge succeeds, the moment vma_adjust drops the
* rmap_locks, the properties of the merged vma will be already
* correct for the whole merged range. Some of those properties like
* vm_page_prot/vm_flags may be accessed by rmap_walks and they must
* be correct for the whole merged range immediately after the
* rmap_locks are released. Otherwise if XXXX would be removed and
* NNNN would be extended over the XXXX range, remove_migration_ptes
* or other rmap walkers (if working on addresses beyond the "end"
* parameter) may establish ptes with the wrong permissions of NNNN
* instead of the right permissions of XXXX.
*/
struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
int err;
/*
* We later require that vma->vm_flags == vm_flags,
* so this tests vma->vm_flags & VM_SPECIAL, too.
*/
if (vm_flags & VM_SPECIAL)
return NULL;
next = vma_next(mm, prev);
area = next;
if (area && area->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
/* verify some invariant that must be enforced by the caller */
VM_WARN_ON(prev && addr <= prev->vm_start);
VM_WARN_ON(area && end > area->vm_end);
VM_WARN_ON(addr >= end);
/*
* Can it merge with the predecessor?
*/
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
vm_userfaultfd_ctx)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file,
pgoff+pglen,
vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
err = __vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL,
prev);
} else /* cases 2, 5, 7 */
err = __vma_adjust(prev, prev->vm_start,
end, prev->vm_pgoff, NULL, prev);
if (err)
return NULL;
khugepaged_enter_vma_merge(prev, vm_flags);
return prev;
}
/*
* Can this new request be merged in front of next?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
vm_userfaultfd_ctx)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
else { /* cases 3, 8 */
err = __vma_adjust(area, addr, next->vm_end,
next->vm_pgoff - pglen, NULL, next);
/*
* In case 3 area is already equal to next and
* this is a noop, but in case 8 "area" has
* been removed and next was expanded over it.
*/
area = next;
}
if (err)
return NULL;
khugepaged_enter_vma_merge(area, vm_flags);
return area;
}
return NULL;
}
/*
* Rough compatibility check to quickly see if it's even worth looking
* at sharing an anon_vma.
*
* They need to have the same vm_file, and the flags can only differ
* in things that mprotect may change.
*
* NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
* we can merge the two vma's. For example, we refuse to merge a vma if
* there is a vm_ops->close() function, because that indicates that the
* driver is doing some kind of reference counting. But that doesn't
* really matter for the anon_vma sharing case.
*/
static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
{
return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}
/*
* Do some basic sanity checking to see if we can re-use the anon_vma
* from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
* the same as 'old', the other will be the new one that is trying
* to share the anon_vma.
*
* NOTE! This runs with mm_sem held for reading, so it is possible that
* the anon_vma of 'old' is concurrently in the process of being set up
* by another page fault trying to merge _that_. But that's ok: if it
* is being set up, that automatically means that it will be a singleton
* acceptable for merging, so we can do all of this optimistically. But
* we do that READ_ONCE() to make sure that we never re-load the pointer.
*
* IOW: that the "list_is_singular()" test on the anon_vma_chain only
* matters for the 'stable anon_vma' case (ie the thing we want to avoid
* is to return an anon_vma that is "complex" due to having gone through
* a fork).
*
* We also make sure that the two vma's are compatible (adjacent,
* and with the same memory policies). That's all stable, even with just
* a read lock on the mm_sem.
*/
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
{
if (anon_vma_compatible(a, b)) { struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); if (anon_vma && list_is_singular(&old->anon_vma_chain))
return anon_vma;
}
return NULL;
}
/*
* find_mergeable_anon_vma is used by anon_vma_prepare, to check
* neighbouring vmas for a suitable anon_vma, before it goes off
* to allocate a new anon_vma. It checks because a repetitive
* sequence of mprotects and faults may otherwise lead to distinct
* anon_vmas being allocated, preventing vma merge in subsequent
* mprotect.
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = NULL;
/* Try next first. */
if (vma->vm_next) {
anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
if (anon_vma)
return anon_vma;
}
/* Try prev next. */
if (vma->vm_prev)
anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
/*
* We might reach here with anon_vma == NULL if we can't find
* any reusable anon_vma.
* There's no absolute need to look only at touching neighbours:
* we could search further afield for "compatible" anon_vmas.
* But it would probably just be a waste of time searching,
* or lead to too many vmas hanging off the same anon_vma.
* We're trying to allow mprotect remerging later on,
* not trying to minimize memory used for anon_vmas.
*/
return anon_vma;
}
/*
* If a hint addr is less than mmap_min_addr change hint to be as
* low as possible but still greater than mmap_min_addr
*/
static inline unsigned long round_hint_to_min(unsigned long hint)
{
hint &= PAGE_MASK;
if (((void *)hint != NULL) &&
(hint < mmap_min_addr))
return PAGE_ALIGN(mmap_min_addr);
return hint;
}
int mlock_future_check(struct mm_struct *mm, unsigned long flags,
unsigned long len)
{
unsigned long locked, lock_limit;
/* mlock MCL_FUTURE? */
if (flags & VM_LOCKED) {
locked = len >> PAGE_SHIFT;
locked += mm->locked_vm;
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
return -EAGAIN;
}
return 0;
}
static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
{
if (S_ISREG(inode->i_mode))
return MAX_LFS_FILESIZE;
if (S_ISBLK(inode->i_mode))
return MAX_LFS_FILESIZE;
if (S_ISSOCK(inode->i_mode))
return MAX_LFS_FILESIZE;
/* Special "we do even unsigned file positions" case */
if (file->f_mode & FMODE_UNSIGNED_OFFSET)
return 0;
/* Yes, random drivers might want more. But I'm tired of buggy drivers */
return ULONG_MAX;
}
static inline bool file_mmap_ok(struct file *file, struct inode *inode,
unsigned long pgoff, unsigned long len)
{
u64 maxsize = file_mmap_size_max(file, inode);
if (maxsize && len > maxsize)
return false;
maxsize -= len;
if (pgoff > maxsize >> PAGE_SHIFT)
return false;
return true;
}
/*
* The caller must write-lock current->mm->mmap_lock.
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff,
unsigned long *populate, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
vm_flags_t vm_flags;
int pkey = 0;
*populate = 0;
if (!len)
return -EINVAL;
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
* (the exception is when the underlying filesystem is noexec
* mounted, in which case we dont add PROT_EXEC.)
*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
if (!(file && path_noexec(&file->f_path)))
prot |= PROT_EXEC;
/* force arch specific MAP_FIXED handling in get_unmapped_area */
if (flags & MAP_FIXED_NOREPLACE)
flags |= MAP_FIXED;
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
/* Careful about overflows.. */
len = PAGE_ALIGN(len);
if (!len)
return -ENOMEM;
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
return -EOVERFLOW;
/* Too many mappings? */
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
return addr;
if (flags & MAP_FIXED_NOREPLACE) {
if (find_vma_intersection(mm, addr, addr + len))
return -EEXIST;
}
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(mm);
if (pkey < 0)
pkey = 0;
}
/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
if (mlock_future_check(mm, vm_flags, len))
return -EAGAIN;
if (file) {
struct inode *inode = file_inode(file);
unsigned long flags_mask;
if (!file_mmap_ok(file, inode, pgoff, len))
return -EOVERFLOW;
flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
switch (flags & MAP_TYPE) {
case MAP_SHARED:
/*
* Force use of MAP_SHARED_VALIDATE with non-legacy
* flags. E.g. MAP_SYNC is dangerous to use with
* MAP_SHARED as you don't know which consistency model
* you will get. We silently ignore unsupported flags
* with MAP_SHARED to preserve backward compatibility.
*/
flags &= LEGACY_MAP_MASK;
fallthrough;
case MAP_SHARED_VALIDATE:
if (flags & ~flags_mask)
return -EOPNOTSUPP;
if (prot & PROT_WRITE) {
if (!(file->f_mode & FMODE_WRITE))
return -EACCES;
if (IS_SWAPFILE(file->f_mapping->host))
return -ETXTBSY;
}
/*
* Make sure we don't allow writing to an append-only
* file..
*/
if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
return -EACCES;
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
fallthrough;
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
if (path_noexec(&file->f_path)) {
if (vm_flags & VM_EXEC)
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
}
if (!file->f_op->mmap)
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
break;
default:
return -EINVAL;
}
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
/*
* Ignore pgoff.
*/
pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_PRIVATE:
/*
* Set pgoff according to addr for anon_vma.
*/
pgoff = addr >> PAGE_SHIFT;
break;
default:
return -EINVAL;
}
}
/*
* Set 'VM_NORESERVE' if we should not account for the
* memory use of this mapping.
*/
if (flags & MAP_NORESERVE) {
/* We honor MAP_NORESERVE if allowed to overcommit */
if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
vm_flags |= VM_NORESERVE;
/* hugetlb applies strict overcommit unless MAP_NORESERVE */
if (file && is_file_hugepages(file))
vm_flags |= VM_NORESERVE;
}
addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
if (!IS_ERR_VALUE(addr) &&
((vm_flags & VM_LOCKED) ||
(flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
*populate = len;
return addr;
}
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
struct file *file = NULL;
unsigned long retval;
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
if (!file)
return -EBADF;
if (is_file_hugepages(file)) {
len = ALIGN(len, huge_page_size(hstate_file(file)));
} else if (unlikely(flags & MAP_HUGETLB)) {
retval = -EINVAL;
goto out_fput;
}
} else if (flags & MAP_HUGETLB) {
struct ucounts *ucounts = NULL;
struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (!hs)
return -EINVAL;
len = ALIGN(len, huge_page_size(hs));
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
&ucounts, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
if (file)
fput(file);
return retval;
}
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, pgoff)
{
return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}
#ifdef __ARCH_WANT_SYS_OLD_MMAP
struct mmap_arg_struct {
unsigned long addr;
unsigned long len;
unsigned long prot;
unsigned long flags;
unsigned long fd;
unsigned long offset;
};
SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
{
struct mmap_arg_struct a;
if (copy_from_user(&a, arg, sizeof(a)))
return -EFAULT;
if (offset_in_page(a.offset))
return -EINVAL;
return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
a.offset >> PAGE_SHIFT);
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
/*
* Some shared mappings will want the pages marked read-only
* to track write events. If so, we'll downgrade vm_page_prot
* to the private version (using protection_map[] without the
* VM_SHARED bit).
*/
int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
vm_flags_t vm_flags = vma->vm_flags;
const struct vm_operations_struct *vm_ops = vma->vm_ops;
/* If it was private or non-writable, the write bit is already clear */
if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
return 0;
/* The backer wishes to know when pages are first written to? */
if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
return 1;
/* The open routine did something to the protections that pgprot_modify
* won't preserve? */
if (pgprot_val(vm_page_prot) !=
pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
return 0;
/* Do we need to track softdirty? */
if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
return 1;
/* Specialty mapping? */
if (vm_flags & VM_PFNMAP)
return 0;
/* Can the mapping track the dirty pages? */
return vma->vm_file && vma->vm_file->f_mapping &&
mapping_can_writeback(vma->vm_file->f_mapping);
}
/*
* We account for memory if it's a private writeable mapping,
* not hugepages and VM_NORESERVE wasn't set.
*/
static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
{
/*
* hugetlb has its own accounting separate from the core VM
* VM_HUGETLB may not be set yet so we cannot check for that flag.
*/
if (file && is_file_hugepages(file))
return 0;
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev, *merge;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
/* Check against address space limit. */
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
unsigned long nr_pages;
/*
* MAP_FIXED may remove pages of mappings that intersects with
* requested mapping. Account for the pages it would unmap.
*/
nr_pages = count_vma_pages_range(mm, addr, addr + len);
if (!may_expand_vm(mm, vm_flags,
(len >> PAGE_SHIFT) - nr_pages))
return -ENOMEM;
}
/* Clear old maps, set up prev, rb_link, rb_parent, and uf */
if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
return -ENOMEM;
/*
* Private writable mapping: check memory availability
*/
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
/*
* Can we just expand an old mapping?
*/
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
vma = vm_area_alloc(mm);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
if (file) {
if (vm_flags & VM_SHARED) {
error = mapping_map_writable(file->f_mapping);
if (error)
goto free_vma;
}
vma->vm_file = get_file(file);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
* Bug: If addr is changed, prev, rb_link, rb_parent should
* be updated for vma_link()
*/
WARN_ON_ONCE(addr != vma->vm_start);
addr = vma->vm_start;
/* If vm_flags changed after call_mmap(), we should try merge vma again
* as we may succeed this time.
*/
if (unlikely(vm_flags != vma->vm_flags && prev)) {
merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
if (merge) {
/* ->mmap() can change vma->vm_file and fput the original file. So
* fput the vma->vm_file here or we would add an extra fput for file
* and cause general protection fault ultimately.
*/
fput(vma->vm_file);
vm_area_free(vma);
vma = merge;
/* Update vm_flags to pick up the change. */
vm_flags = vma->vm_flags;
goto unmap_writable;
}
}
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
} else {
vma_set_anonymous(vma);
}
/* Allow architectures to sanity-check the vm_flags */
if (!arch_validate_flags(vma->vm_flags)) {
error = -EINVAL;
if (file)
goto unmap_and_free_vma;
else
goto free_vma;
}
vma_link(mm, vma, prev, rb_link, rb_parent);
/* Once vma denies write, undo our temporary denial count */
unmap_writable:
if (file && vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
file = vma->vm_file;
out:
perf_event_mmap(vma);
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
else
mm->locked_vm += (len >> PAGE_SHIFT);
}
if (file)
uprobe_mmap(vma);
/*
* New (or expanded) vma always get soft dirty status.
* Otherwise user-space soft-dirty page tracker won't
* be able to distinguish situation when vma area unmapped,
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
vma->vm_flags |= VM_SOFTDIRTY;
vma_set_page_prot(vma);
return addr;
unmap_and_free_vma:
fput(vma->vm_file);
vma->vm_file = NULL;
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
free_vma:
vm_area_free(vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
return error;
}
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
* We implement the search by looking for an rbtree node that
* immediately follows a suitable gap. That is,
* - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
* - gap_end = vma->vm_start >= info->low_limit + length;
* - gap_end - gap_start >= length
*/
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
/* Adjust search limits by the desired length */
if (info->high_limit < length)
return -ENOMEM;
high_limit = info->high_limit - length;
if (info->low_limit > high_limit)
return -ENOMEM;
low_limit = info->low_limit + length;
/* Check if rbtree root looks promising */
if (RB_EMPTY_ROOT(&mm->mm_rb))
goto check_highest;
vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
if (vma->rb_subtree_gap < length)
goto check_highest;
while (true) {
/* Visit left subtree if it looks promising */
gap_end = vm_start_gap(vma);
if (gap_end >= low_limit && vma->vm_rb.rb_left) {
struct vm_area_struct *left =
rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb);
if (left->rb_subtree_gap >= length) {
vma = left;
continue;
}
}
gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
return -ENOMEM;
if (gap_end >= low_limit &&
gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit right subtree if it looks promising */
if (vma->vm_rb.rb_right) {
struct vm_area_struct *right =
rb_entry(vma->vm_rb.rb_right,
struct vm_area_struct, vm_rb);
if (right->rb_subtree_gap >= length) {
vma = right;
continue;
}
}
/* Go back up the rbtree to find next candidate node */
while (true) {
struct rb_node *prev = &vma->vm_rb;
if (!rb_parent(prev))
goto check_highest;
vma = rb_entry(rb_parent(prev),
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
gap_start = vm_end_gap(vma->vm_prev);
gap_end = vm_start_gap(vma);
goto check_current;
}
}
}
check_highest:
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
if (gap_start > high_limit)
return -ENOMEM;
found:
/* We found a suitable gap. Clip it with the original low_limit. */
if (gap_start < info->low_limit)
gap_start = info->low_limit;
/* Adjust gap address to the desired alignment */
gap_start += (info->align_offset - gap_start) & info->align_mask;
VM_BUG_ON(gap_start + info->length > info->high_limit);
VM_BUG_ON(gap_start + info->length > gap_end);
return gap_start;
}
static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
/*
* Adjust search limits by the desired length.
* See implementation comment at top of unmapped_area().
*/
gap_end = info->high_limit;
if (gap_end < length)
return -ENOMEM;
high_limit = gap_end - length;
if (info->low_limit > high_limit)
return -ENOMEM;
low_limit = info->low_limit + length;
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
if (gap_start <= high_limit)
goto found_highest;
/* Check if rbtree root looks promising */
if (RB_EMPTY_ROOT(&mm->mm_rb))
return -ENOMEM;
vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
if (vma->rb_subtree_gap < length)
return -ENOMEM;
while (true) {
/* Visit right subtree if it looks promising */
gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
if (gap_start <= high_limit && vma->vm_rb.rb_right) {
struct vm_area_struct *right =
rb_entry(vma->vm_rb.rb_right,
struct vm_area_struct, vm_rb);
if (right->rb_subtree_gap >= length) {
vma = right;
continue;
}
}
check_current:
/* Check if current node has a suitable gap */
gap_end = vm_start_gap(vma);
if (gap_end < low_limit)
return -ENOMEM;
if (gap_start <= high_limit &&
gap_end > gap_start && gap_end - gap_start >= length)
goto found;
/* Visit left subtree if it looks promising */
if (vma->vm_rb.rb_left) {
struct vm_area_struct *left =
rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb);
if (left->rb_subtree_gap >= length) {
vma = left;
continue;
}
}
/* Go back up the rbtree to find next candidate node */
while (true) {
struct rb_node *prev = &vma->vm_rb;
if (!rb_parent(prev))
return -ENOMEM;
vma = rb_entry(rb_parent(prev),
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_right) {
gap_start = vma->vm_prev ?
vm_end_gap(vma->vm_prev) : 0;
goto check_current;
}
}
}
found:
/* We found a suitable gap. Clip it with the original high_limit. */
if (gap_end > info->high_limit)
gap_end = info->high_limit;
found_highest:
/* Compute highest gap address at the desired alignment */
gap_end -= info->length;
gap_end -= (gap_end - info->align_offset) & info->align_mask;
VM_BUG_ON(gap_end < info->low_limit);
VM_BUG_ON(gap_end < gap_start);
return gap_end;
}
/*
* Search for an unmapped address range.
*
* We are looking for a range that:
* - does not intersect with any VMA;
* - is contained within the [low_limit, high_limit) interval;
* - is at least the desired size.
* - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
*/
unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
{
unsigned long addr;
if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
addr = unmapped_area_topdown(info);
else
addr = unmapped_area(info);
trace_vm_unmapped_area(addr, info);
return addr;
}
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
* Ugly calling convention alert:
* Return value with the low bits set means error value,
* ie
* if (ret & ~PAGE_MASK)
* error = ret;
*
* This function "knows" that -ENOMEM has the bits set.
*/
#ifndef HAVE_ARCH_UNMAPPED_AREA
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
struct vm_unmapped_area_info info;
const unsigned long mmap_end = arch_get_mmap_end(addr);
if (len > mmap_end - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
return addr;
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma_prev(mm, addr, &prev);
if (mmap_end - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vm_start_gap(vma)) &&
(!prev || addr >= vm_end_gap(prev)))
return addr;
}
info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
info.high_limit = mmap_end;
info.align_mask = 0;
info.align_offset = 0;
return vm_unmapped_area(&info);
}
#endif
/*
* This mmap-allocator allocates new areas top-down from below the
* stack's low limit (the base):
*/
#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
struct vm_unmapped_area_info info;
const unsigned long mmap_end = arch_get_mmap_end(addr);
/* requested length too big for entire address space */
if (len > mmap_end - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
return addr;
/* requesting a specific address */
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma_prev(mm, addr, &prev);
if (mmap_end - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vm_start_gap(vma)) &&
(!prev || addr >= vm_end_gap(prev)))
return addr;
}
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
info.align_mask = 0;
info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
if (offset_in_page(addr)) {
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = mmap_end;
addr = vm_unmapped_area(&info);
}
return addr;
}
#endif
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
unsigned long (*get_area)(struct file *, unsigned long,
unsigned long, unsigned long, unsigned long);
unsigned long error = arch_mmap_check(addr, len, flags);
if (error)
return error;
/* Careful about overflows.. */
if (len > TASK_SIZE)
return -ENOMEM;
get_area = current->mm->get_unmapped_area;
if (file) {
if (file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
} else if (flags & MAP_SHARED) {
/*
* mmap_region() will call shmem_zero_setup() to create a file,
* so use shmem's get_unmapped_area in case it can be huge.
* do_mmap() will clear pgoff, so match alignment.
*/
pgoff = 0;
get_area = shmem_get_unmapped_area;
}
addr = get_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
return addr;
if (addr > TASK_SIZE - len)
return -ENOMEM;
if (offset_in_page(addr))
return -EINVAL;
error = security_mmap_addr(addr);
return error ? error : addr;
}
EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct rb_node *rb_node;
struct vm_area_struct *vma;
mmap_assert_locked(mm);
/* Check the cache first. */
vma = vmacache_find(mm, addr); if (likely(vma))
return vma;
rb_node = mm->mm_rb.rb_node; while (rb_node) {
struct vm_area_struct *tmp;
tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); if (tmp->vm_end > addr) {
vma = tmp;
if (tmp->vm_start <= addr)
break;
rb_node = rb_node->rb_left;
} else
rb_node = rb_node->rb_right;
}
if (vma) vmacache_update(addr, vma);
return vma;
}
EXPORT_SYMBOL(find_vma);
/*
* Same as find_vma, but also return a pointer to the previous VMA in *pprev.
*/
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma;
vma = find_vma(mm, addr);
if (vma) {
*pprev = vma->vm_prev;
} else {
struct rb_node *rb_node = rb_last(&mm->mm_rb);
*pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
}
return vma;
}
/*
* Verify that the stack growth is acceptable and
* update accounting. This is shared with both the
* grow-up and grow-down cases.
*/
static int acct_stack_growth(struct vm_area_struct *vma,
unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long new_start;
/* address space limit tests */
if (!may_expand_vm(mm, vma->vm_flags, grow))
return -ENOMEM;
/* Stack limit test */
if (size > rlimit(RLIMIT_STACK))
return -ENOMEM;
/* mlock limit tests */
if (vma->vm_flags & VM_LOCKED) {
unsigned long locked;
unsigned long limit;
locked = mm->locked_vm + grow;
limit = rlimit(RLIMIT_MEMLOCK);
limit >>= PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
return -ENOMEM;
}
/* Check to ensure the stack will not grow into a hugetlb-only region */
new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
vma->vm_end - size;
if (is_hugepage_only_range(vma->vm_mm, new_start, size))
return -EFAULT;
/*
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
if (security_vm_enough_memory_mm(mm, grow))
return -ENOMEM;
return 0;
}
#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
/*
* PA-RISC uses this for its stack; IA64 for its Register Backing Store.
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next;
unsigned long gap_addr;
int error = 0;
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
/* Guard against exceeding limits of the address space. */
address &= PAGE_MASK;
if (address >= (TASK_SIZE & PAGE_MASK))
return -ENOMEM;
address += PAGE_SIZE;
/* Enforce stack_guard_gap */
gap_addr = address + stack_guard_gap;
/* Guard against overflow */
if (gap_addr < address || gap_addr > TASK_SIZE)
gap_addr = TASK_SIZE;
next = vma->vm_next;
if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
}
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_lock in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address > vma->vm_end) {
unsigned long size, grow;
size = address - vma->vm_start;
grow = (address - vma->vm_end) >> PAGE_SHIFT;
error = -ENOMEM;
if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
* vma_gap_update() doesn't support concurrent
* updates, but we only hold a shared mmap_lock
* lock here, so we need to protect against
* concurrent vma expansions.
* anon_vma_lock_write() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
* against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
anon_vma_interval_tree_post_update_vma(vma);
if (vma->vm_next)
vma_gap_update(vma->vm_next);
else
mm->highest_vm_end = vm_end_gap(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
}
}
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(mm);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
int expand_downwards(struct vm_area_struct *vma,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *prev;
int error = 0;
address &= PAGE_MASK;
if (address < mmap_min_addr)
return -EPERM;
/* Enforce stack_guard_gap */
prev = vma->vm_prev;
/* Check that both stack segments have the same anon_vma? */
if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
vma_is_accessible(prev)) {
if (address - prev->vm_end < stack_guard_gap)
return -ENOMEM;
}
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_lock in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address < vma->vm_start) {
unsigned long size, grow;
size = vma->vm_end - address; grow = (vma->vm_start - address) >> PAGE_SHIFT;
error = -ENOMEM;
if (grow <= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
* vma_gap_update() doesn't support concurrent
* updates, but we only hold a shared mmap_lock
* lock here, so we need to protect against
* concurrent vma expansions.
* anon_vma_lock_write() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
* against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
anon_vma_interval_tree_post_update_vma(vma);
vma_gap_update(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
}
}
}
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(mm);
return error;
}
/* enforced gap between the expanding stack and other mappings. */
unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
static int __init cmdline_parse_stack_guard_gap(char *p)
{
unsigned long val;
char *endptr;
val = simple_strtoul(p, &endptr, 10);
if (!*endptr)
stack_guard_gap = val << PAGE_SHIFT;
return 1;
}
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
#ifdef CONFIG_STACK_GROWSUP
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
return expand_upwards(vma, address);
}
struct vm_area_struct *
find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma, *prev;
addr &= PAGE_MASK;
vma = find_vma_prev(mm, addr, &prev);
if (vma && (vma->vm_start <= addr))
return vma;
/* don't alter vm_end if the coredump is running */
if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
return prev;
}
#else
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
return expand_downwards(vma, address);
}
struct vm_area_struct *
find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma;
unsigned long start;
addr &= PAGE_MASK;
vma = find_vma(mm, addr);
if (!vma)
return NULL;
if (vma->vm_start <= addr)
return vma;
if (!(vma->vm_flags & VM_GROWSDOWN))
return NULL;
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED) populate_vma_page_range(vma, addr, start, NULL);
return vma;
}
#endif
EXPORT_SYMBOL_GPL(find_extend_vma);
/*
* Ok - we have the memory areas we should free on the vma list,
* so release them, and do the vma updates.
*
* Called with the mm semaphore held.
*/
static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
unsigned long nr_accounted = 0;
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
do {
long nrpages = vma_pages(vma);
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
vm_stat_account(mm, vma->vm_flags, -nrpages);
vma = remove_vma(vma);
} while (vma);
vm_unacct_memory(nr_accounted);
validate_mm(mm);
}
/*
* Get rid of page table information in the indicated region.
*
* Called with the mm semaphore held.
*/
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
struct vm_area_struct *next = vma_next(mm, prev);
struct mmu_gather tlb;
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
}
/*
* Create a list of vma's touched by the unmap, removing them from the mm's
* vma list as we go..
*/
static bool
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, unsigned long end)
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
vma_rb_erase(vma, &mm->mm_rb);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;
if (vma) {
vma->vm_prev = prev;
vma_gap_update(vma);
} else
mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
tail_vma->vm_next = NULL;
/* Kill the cache */
vmacache_invalidate(mm);
/*
* Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
* VM_GROWSUP VMA. Such VMAs can change their size under
* down_read(mmap_lock) and collide with the VMA we are about to unmap.
*/
if (vma && (vma->vm_flags & VM_GROWSDOWN))
return false;
if (prev && (prev->vm_flags & VM_GROWSUP))
return false;
return true;
}
/*
* __split_vma() bypasses sysctl_max_map_count checking. We use this where it
* has already been checked or doesn't make sense to fail.
*/
int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
struct vm_area_struct *new;
int err;
if (vma->vm_ops && vma->vm_ops->may_split) {
err = vma->vm_ops->may_split(vma, addr);
if (err)
return err;
}
new = vm_area_dup(vma);
if (!new)
return -ENOMEM;
if (new_below)
new->vm_end = addr;
else {
new->vm_start = addr;
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
err = vma_dup_policy(vma, new);
if (err)
goto out_free_vma;
err = anon_vma_clone(new, vma);
if (err)
goto out_free_mpol;
if (new->vm_file)
get_file(new->vm_file);
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
if (new_below)
err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
((addr - new->vm_start) >> PAGE_SHIFT), new);
else
err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
/* Success. */
if (!err)
return 0;
/* Clean everything up if vma_adjust failed. */
if (new->vm_ops && new->vm_ops->close)
new->vm_ops->close(new);
if (new->vm_file)
fput(new->vm_file);
unlink_anon_vmas(new);
out_free_mpol:
mpol_put(vma_policy(new));
out_free_vma:
vm_area_free(new);
return err;
}
/*
* Split a vma into two pieces at address 'addr', a new vma is allocated
* either for the first part or the tail.
*/
int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
if (mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
return __split_vma(mm, vma, addr, new_below);
}
static inline void
unlock_range(struct vm_area_struct *start, unsigned long limit)
{
struct mm_struct *mm = start->vm_mm;
struct vm_area_struct *tmp = start;
while (tmp && tmp->vm_start < limit) {
if (tmp->vm_flags & VM_LOCKED) {
mm->locked_vm -= vma_pages(tmp);
munlock_vma_pages_all(tmp);
}
tmp = tmp->vm_next;
}
}
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
* Jeremy Fitzhardinge <jeremy@goop.org>
*/
int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf, bool downgrade)
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
len = PAGE_ALIGN(len);
end = start + len;
if (len == 0)
return -EINVAL;
/*
* arch_unmap() might do unmaps itself. It must be called
* and finish any rbtree manipulation before this code
* runs and also starts to manipulate the rbtree.
*/
arch_unmap(mm, start, end);
/* Find the first overlapping VMA where start < vma->vm_end */
vma = find_vma_intersection(mm, start, end);
if (!vma)
return 0;
prev = vma->vm_prev;
/*
* If we need to split any vma, do it now to save pain later.
*
* Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
* unmapped vm_area_struct will remain in use: so lower split_vma
* places tmp vma above, and higher split_vma places tmp vma below.
*/
if (start > vma->vm_start) {
int error;
/*
* Make sure that map_count on return from munmap() will
* not exceed its limit; but let map_count go just above
* its limit temporarily, to help free resources as expected.
*/
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
error = __split_vma(mm, vma, start, 0);
if (error)
return error;
prev = vma;
}
/* Does it split the last one? */
last = find_vma(mm, end);
if (last && end > last->vm_start) {
int error = __split_vma(mm, last, end, 1);
if (error)
return error;
}
vma = vma_next(mm, prev);
if (unlikely(uf)) {
/*
* If userfaultfd_unmap_prep returns an error the vmas
* will remain split, but userland will get a
* highly unexpected error anyway. This is no
* different than the case where the first of the two
* __split_vma fails, but we don't undo the first
* split, despite we could. This is unlikely enough
* failure that it's not worth optimizing it for.
*/
int error = userfaultfd_unmap_prep(vma, start, end, uf);
if (error)
return error;
}
/*
* unlock any mlock()ed ranges before detaching vmas
*/
if (mm->locked_vm)
unlock_range(vma, end);
/* Detach vmas from rbtree */
if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
downgrade = false;
if (downgrade)
mmap_write_downgrade(mm);
unmap_region(mm, vma, prev, start, end);
/* Fix up all other VM information */
remove_vma_list(mm, vma);
return downgrade ? 1 : 0;
}
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf)
{
return __do_munmap(mm, start, len, uf, false);
}
static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
{
int ret;
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);
if (mmap_write_lock_killable(mm))
return -EINTR;
ret = __do_munmap(mm, start, len, &uf, downgrade);
/*
* Returning 1 indicates mmap_lock is downgraded.
* But 1 is not legal return value of vm_munmap() and munmap(), reset
* it to 0 before return.
*/
if (ret == 1) {
mmap_read_unlock(mm);
ret = 0;
} else
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
return ret;
}
int vm_munmap(unsigned long start, size_t len)
{
return __vm_munmap(start, len, false);
}
EXPORT_SYMBOL(vm_munmap);
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
addr = untagged_addr(addr);
profile_munmap(addr);
return __vm_munmap(addr, len, true);
}
/*
* Emulation of deprecated remap_file_pages() syscall.
*/
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long populate = 0;
unsigned long ret = -EINVAL;
struct file *file;
pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
current->comm, current->pid);
if (prot)
return ret;
start = start & PAGE_MASK;
size = size & PAGE_MASK;
if (start + size <= start)
return ret;
/* Does pgoff wrap? */
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
return ret;
if (mmap_write_lock_killable(mm))
return -EINTR;
vma = vma_lookup(mm, start);
if (!vma || !(vma->vm_flags & VM_SHARED))
goto out;
if (start + size > vma->vm_end) {
struct vm_area_struct *next;
for (next = vma->vm_next; next; next = next->vm_next) {
/* hole between vmas ? */
if (next->vm_start != next->vm_prev->vm_end)
goto out;
if (next->vm_file != vma->vm_file)
goto out;
if (next->vm_flags != vma->vm_flags)
goto out;
if (start + size <= next->vm_end)
break;
}
if (!next)
goto out;
}
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
file = get_file(vma->vm_file);
ret = do_mmap(vma->vm_file, start, size,
prot, flags, pgoff, &populate, NULL);
fput(file);
out:
mmap_write_unlock(mm);
if (populate)
mm_populate(ret, populate);
if (!IS_ERR_VALUE(ret))
ret = 0;
return ret;
}
/*
* this is really a simplified "do_mmap". it only handles
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
unsigned long mapped_addr;
/* Until we need other flags, refuse anything except VM_EXEC. */
if ((flags & (~VM_EXEC)) != 0)
return -EINVAL;
flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
if (IS_ERR_VALUE(mapped_addr))
return mapped_addr;
error = mlock_future_check(mm, mm->def_flags, len);
if (error)
return error;
/* Clear old maps, set up prev, rb_link, rb_parent, and uf */
if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
return -ENOMEM;
/* Check against address space limits *after* clearing old maps... */
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
/*
* create a vma struct for an anonymous mapping
*/
vma = vm_area_alloc(mm);
if (!vma) {
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
}
vma_set_anonymous(vma);
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_pgoff = pgoff;
vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags);
vma_link(mm, vma, prev, rb_link, rb_parent);
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
mm->data_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
return 0;
}
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
unsigned long len;
int ret;
bool populate;
LIST_HEAD(uf);
len = PAGE_ALIGN(request);
if (len < request)
return -ENOMEM;
if (!len)
return 0;
if (mmap_write_lock_killable(mm))
return -EINTR;
ret = do_brk_flags(addr, len, flags, &uf);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
return ret;
}
EXPORT_SYMBOL(vm_brk_flags);
int vm_brk(unsigned long addr, unsigned long len)
{
return vm_brk_flags(addr, len, 0);
}
EXPORT_SYMBOL(vm_brk);
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
if (unlikely(mm_is_oom_victim(mm))) {
/*
* Manually reap the mm to free as much memory as possible.
* Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
* this mm from further consideration. Taking mm->mmap_lock for
* write after setting MMF_OOM_SKIP will guarantee that the oom
* reaper will not run on this mm again after mmap_lock is
* dropped.
*
* Nothing can be holding mm->mmap_lock here and the above call
* to mmu_notifier_release(mm) ensures mmu notifier callbacks in
* __oom_reap_task_mm() will not block.
*
* This needs to be done before calling munlock_vma_pages_all(),
* which clears VM_LOCKED, otherwise the oom reaper cannot
* reliably test it.
*/
(void)__oom_reap_task_mm(mm);
set_bit(MMF_OOM_SKIP, &mm->flags);
mmap_write_lock(mm);
mmap_write_unlock(mm);
}
if (mm->locked_vm)
unlock_range(mm->mmap, ULONG_MAX);
arch_exit_mmap(mm);
vma = mm->mmap;
if (!vma) /* Can happen if dup_mmap() received an OOM */
return;
lru_add_drain();
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
/*
* Walk the list again, actually closing and freeing it,
* with preemption enabled, without holding any MM locks.
*/
while (vma) {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
vma = remove_vma(vma);
cond_resched();
}
vm_unacct_memory(nr_accounted);
}
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
* then i_mmap_rwsem is taken here.
*/
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
struct vm_area_struct *prev;
struct rb_node **rb_link, *rb_parent;
if (find_vma_links(mm, vma->vm_start, vma->vm_end,
&prev, &rb_link, &rb_parent))
return -ENOMEM;
if ((vma->vm_flags & VM_ACCOUNT) &&
security_vm_enough_memory_mm(mm, vma_pages(vma)))
return -ENOMEM;
/*
* The vm_pgoff of a purely anonymous vma should be irrelevant
* until its first write fault, when page's anon_vma and index
* are set. But now set the vm_pgoff it will almost certainly
* end up with (unless mremap moves it elsewhere before that
* first wfault), so /proc/pid/maps tells a consistent story.
*
* By setting it to reflect the virtual start address of the
* vma, merges and splits can happen in a seamless way, just
* using the existing file pgoff checks and manipulations.
* Similarly in do_mmap and in do_brk_flags.
*/
if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
vma_link(mm, vma, prev, rb_link, rb_parent);
return 0;
}
/*
* Copy the vma structure to a new location in the same mm,
* prior to moving page table entries, to effect an mremap move.
*/
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
unsigned long addr, unsigned long len, pgoff_t pgoff,
bool *need_rmap_locks)
{
struct vm_area_struct *vma = *vmap;
unsigned long vma_start = vma->vm_start;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
struct rb_node **rb_link, *rb_parent;
bool faulted_in_anon_vma = true;
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
*/
if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
pgoff = addr >> PAGE_SHIFT;
faulted_in_anon_vma = false;
}
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx);
if (new_vma) {
/*
* Source vma may have been merged into new_vma
*/
if (unlikely(vma_start >= new_vma->vm_start &&
vma_start < new_vma->vm_end)) {
/*
* The only way we can get a vma_merge with
* self during an mremap is if the vma hasn't
* been faulted in yet and we were allowed to
* reset the dst vma->vm_pgoff to the
* destination address of the mremap to allow
* the merge to happen. mremap must change the
* vm_pgoff linearity between src and dst vmas
* (in turn preventing a vma_merge) to be
* safe. It is only safe to keep the vm_pgoff
* linear if there are no pages mapped yet.
*/
VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
*vmap = vma = new_vma;
}
*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
} else {
new_vma = vm_area_dup(vma);
if (!new_vma)
goto out;
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
if (vma_dup_policy(vma, new_vma))
goto out_free_vma;
if (anon_vma_clone(new_vma, vma))
goto out_free_mempol;
if (new_vma->vm_file)
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
*need_rmap_locks = false;
}
return new_vma;
out_free_mempol:
mpol_put(vma_policy(new_vma));
out_free_vma:
vm_area_free(new_vma);
out:
return NULL;
}
/*
* Return true if the calling process may expand its vm space by the passed
* number of pages
*/
bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
{
if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
return false;
if (is_data_mapping(flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
/* Workaround for Valgrind */
if (rlimit(RLIMIT_DATA) == 0 &&
mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
return true;
pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
current->comm, current->pid,
(mm->data_vm + npages) << PAGE_SHIFT,
rlimit(RLIMIT_DATA),
ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");
if (!ignore_rlimit_data)
return false;
}
return true;
}
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
mm->total_vm += npages;
if (is_exec_mapping(flags))
mm->exec_vm += npages;
else if (is_stack_mapping(flags))
mm->stack_vm += npages;
else if (is_data_mapping(flags))
mm->data_vm += npages;
}
static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
/*
* Having a close hook prevents vma merging regardless of flags.
*/
static void special_mapping_close(struct vm_area_struct *vma)
{
}
static const char *special_mapping_name(struct vm_area_struct *vma)
{
return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}
static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
struct vm_special_mapping *sm = new_vma->vm_private_data;
if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
return -EFAULT;
if (sm->mremap)
return sm->mremap(sm, new_vma);
return 0;
}
static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr)
{
/*
* Forbid splitting special mappings - kernel has expectations over
* the number of pages in mapping. Together with VM_DONTEXPAND
* the size of vma should stay the same over the special mapping's
* lifetime.
*/
return -EINVAL;
}
static const struct vm_operations_struct special_mapping_vmops = {
.close = special_mapping_close,
.fault = special_mapping_fault,
.mremap = special_mapping_mremap,
.name = special_mapping_name,
/* vDSO code relies that VVAR can't be accessed remotely */
.access = NULL,
.may_split = special_mapping_split,
};
static const struct vm_operations_struct legacy_special_mapping_vmops = {
.close = special_mapping_close,
.fault = special_mapping_fault,
};
static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
pgoff_t pgoff;
struct page **pages;
if (vma->vm_ops == &legacy_special_mapping_vmops) {
pages = vma->vm_private_data;
} else {
struct vm_special_mapping *sm = vma->vm_private_data;
if (sm->fault)
return sm->fault(sm, vmf->vma, vmf);
pages = sm->pages;
}
for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
pgoff--;
if (*pages) {
struct page *page = *pages;
get_page(page);
vmf->page = page;
return 0;
}
return VM_FAULT_SIGBUS;
}
static struct vm_area_struct *__install_special_mapping(
struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long vm_flags, void *priv,
const struct vm_operations_struct *ops)
{
int ret;
struct vm_area_struct *vma;
vma = vm_area_alloc(mm);
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
vma->vm_ops = ops;
vma->vm_private_data = priv;
ret = insert_vm_struct(mm, vma);
if (ret)
goto out;
vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
perf_event_mmap(vma);
return vma;
out:
vm_area_free(vma);
return ERR_PTR(ret);
}
bool vma_is_special_mapping(const struct vm_area_struct *vma,
const struct vm_special_mapping *sm)
{
return vma->vm_private_data == sm &&
(vma->vm_ops == &special_mapping_vmops ||
vma->vm_ops == &legacy_special_mapping_vmops);
}
/*
* Called with mm->mmap_lock held for writing.
* Insert a new vma covering the given region, with the given flags.
* Its pages are supplied by the given array of struct page *.
* The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
* The region past the last page supplied will always produce SIGBUS.
* The array pointer and the pages it points to are assumed to stay alive
* for as long as this mapping might exist.
*/
struct vm_area_struct *_install_special_mapping(
struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long vm_flags, const struct vm_special_mapping *spec)
{
return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
&special_mapping_vmops);
}
int install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long vm_flags, struct page **pages)
{
struct vm_area_struct *vma = __install_special_mapping(
mm, addr, len, vm_flags, (void *)pages,
&legacy_special_mapping_vmops);
return PTR_ERR_OR_ZERO(vma);
}
static DEFINE_MUTEX(mm_all_locks_mutex);
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
/*
* We can safely modify head.next after taking the
* anon_vma->root->rwsem. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
* anon_vma->root->rwsem.
*/
if (__test_and_set_bit(0, (unsigned long *)
&anon_vma->root->rb_root.rb_root.rb_node))
BUG();
}
}
static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
/*
* AS_MM_ALL_LOCKS can't change from under us because
* we hold the mm_all_locks_mutex.
*
* Operations on ->flags have to be atomic because
* even if AS_MM_ALL_LOCKS is stable thanks to the
* mm_all_locks_mutex, there may be other cpus
* changing other bitflags in parallel to us.
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
}
}
/*
* This operation locks against the VM for all pte/vma/mm related
* operations that could ever happen on a certain mm. This includes
* vmtruncate, try_to_unmap, and all page faults.
*
* The caller must take the mmap_lock in write mode before calling
* mm_take_all_locks(). The caller isn't allowed to release the
* mmap_lock until mm_drop_all_locks() returns.
*
* mmap_lock in write mode is required in order to block all operations
* that could modify pagetables and free pages without need of
* altering the vma layout. It's also needed in write mode to avoid new
* anon_vmas to be associated with existing vmas.
*
* A single task can't take more than one mm_take_all_locks() in a row
* or it would deadlock.
*
* The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
* mapping->flags avoid to take the same lock twice, if more than one
* vma in this mm is backed by the same anon_vma or address_space.
*
* We take locks in following order, accordingly to comment at beginning
* of mm/rmap.c:
* - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
* hugetlb mapping);
* - all i_mmap_rwsem locks;
* - all anon_vma->rwseml
*
* We can take all locks within these types randomly because the VM code
* doesn't nest them and we protected from parallel mm_take_all_locks() by
* mm_all_locks_mutex.
*
* mm_take_all_locks() and mm_drop_all_locks are expensive operations
* that may have to take thousand of locks.
*
* mm_take_all_locks() can fail if it's interrupted by signals.
*/
int mm_take_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
BUG_ON(mmap_read_trylock(mm));
mutex_lock(&mm_all_locks_mutex);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
is_vm_hugetlb_page(vma))
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
!is_vm_hugetlb_page(vma))
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (signal_pending(current))
goto out_unlock;
if (vma->anon_vma)
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
vm_lock_anon_vma(mm, avc->anon_vma);
}
return 0;
out_unlock:
mm_drop_all_locks(mm);
return -EINTR;
}
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
/*
* The LSB of head.next can't change to 0 from under
* us because we hold the mm_all_locks_mutex.
*
* We must however clear the bitflag before unlocking
* the vma so the users using the anon_vma->rb_root will
* never see our bitflag.
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
* anon_vma->root->rwsem.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
&anon_vma->root->rb_root.rb_root.rb_node))
BUG();
anon_vma_unlock_write(anon_vma);
}
}
static void vm_unlock_mapping(struct address_space *mapping)
{
if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
/*
* AS_MM_ALL_LOCKS can't change to 0 from under us
* because we hold the mm_all_locks_mutex.
*/
i_mmap_unlock_write(mapping);
if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
&mapping->flags))
BUG();
}
}
/*
* The mmap_lock cannot be released by the caller until
* mm_drop_all_locks() returns.
*/
void mm_drop_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
BUG_ON(mmap_read_trylock(mm));
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma)
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
vm_unlock_anon_vma(avc->anon_vma);
if (vma->vm_file && vma->vm_file->f_mapping)
vm_unlock_mapping(vma->vm_file->f_mapping);
}
mutex_unlock(&mm_all_locks_mutex);
}
/*
* initialise the percpu counter for VM
*/
void __init mmap_init(void)
{
int ret;
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
}
/*
* Initialise sysctl_user_reserve_kbytes.
*
* This is intended to prevent a user from starting a single memory hogging
* process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
* mode.
*
* The default value is min(3% of free memory, 128MB)
* 128MB is enough to recover with sshd/login, bash, and top/kill.
*/
static int init_user_reserve(void)
{
unsigned long free_kbytes;
free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
}
subsys_initcall(init_user_reserve);
/*
* Initialise sysctl_admin_reserve_kbytes.
*
* The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
* to log in and kill a memory hogging process.
*
* Systems with more than 256MB will reserve 8MB, enough to recover
* with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
* only reserve 3% of free pages by default.
*/
static int init_admin_reserve(void)
{
unsigned long free_kbytes;
free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
}
subsys_initcall(init_admin_reserve);
/*
* Reinititalise user and admin reserves if memory is added or removed.
*
* The default user reserve max is 128MB, and the default max for the
* admin reserve is 8MB. These are usually, but not always, enough to
* enable recovery from a memory hogging process using login/sshd, a shell,
* and tools like top. It may make sense to increase or even disable the
* reserve depending on the existence of swap or variations in the recovery
* tools. So, the admin may have changed them.
*
* If memory is added and the reserves have been eliminated or increased above
* the default max, then we'll trust the admin.
*
* If memory is removed and there isn't enough free memory, then we
* need to reset the reserves.
*
* Otherwise keep the reserve set by the admin.
*/
static int reserve_mem_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
unsigned long tmp, free_kbytes;
switch (action) {
case MEM_ONLINE:
/* Default max is 128MB. Leave alone if modified by operator. */
tmp = sysctl_user_reserve_kbytes;
if (0 < tmp && tmp < (1UL << 17))
init_user_reserve();
/* Default max is 8MB. Leave alone if modified by operator. */
tmp = sysctl_admin_reserve_kbytes;
if (0 < tmp && tmp < (1UL << 13))
init_admin_reserve();
break;
case MEM_OFFLINE:
free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
if (sysctl_user_reserve_kbytes > free_kbytes) {
init_user_reserve();
pr_info("vm.user_reserve_kbytes reset to %lu\n",
sysctl_user_reserve_kbytes);
}
if (sysctl_admin_reserve_kbytes > free_kbytes) {
init_admin_reserve();
pr_info("vm.admin_reserve_kbytes reset to %lu\n",
sysctl_admin_reserve_kbytes);
}
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block reserve_mem_nb = {
.notifier_call = reserve_mem_notifier,
};
static int __meminit init_reserve_notifier(void)
{
if (register_hotmemory_notifier(&reserve_mem_nb))
pr_err("Failed registering memory add/remove notifier for admin reserve\n");
return 0;
}
subsys_initcall(init_reserve_notifier);
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
* Authors: David Chinner and Glauber Costa
*
* Generic LRU infrastructure
*/
#ifndef _LRU_LIST_H
#define _LRU_LIST_H
#include <linux/list.h>
#include <linux/nodemask.h>
#include <linux/shrinker.h>
struct mem_cgroup;
/* list_lru_walk_cb has to always return one of those */
enum lru_status {
LRU_REMOVED, /* item removed from list */
LRU_REMOVED_RETRY, /* item removed, but lock has been
dropped and reacquired */
LRU_ROTATE, /* item referenced, give another pass */
LRU_SKIP, /* item cannot be locked, skip */
LRU_RETRY, /* item not freeable. May drop the lock
internally, but has to return locked. */
};
struct list_lru_one {
struct list_head list;
/* may become negative during memcg reparenting */
long nr_items;
};
struct list_lru_memcg {
struct rcu_head rcu;
/* array of per cgroup lists, indexed by memcg_cache_id */
struct list_lru_one *lru[];
};
struct list_lru_node {
/* protects all lists on the node, including per cgroup */
spinlock_t lock;
/* global list, used for the root cgroup in cgroup aware lrus */
struct list_lru_one lru;
#ifdef CONFIG_MEMCG_KMEM
/* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
struct list_lru_memcg __rcu *memcg_lrus;
#endif
long nr_items;
} ____cacheline_aligned_in_smp;
struct list_lru {
struct list_lru_node *node;
#ifdef CONFIG_MEMCG_KMEM
struct list_head list;
int shrinker_id;
bool memcg_aware;
#endif
};
void list_lru_destroy(struct list_lru *lru);
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
struct lock_class_key *key, struct shrinker *shrinker);
#define list_lru_init(lru) \
__list_lru_init((lru), false, NULL, NULL)
#define list_lru_init_key(lru, key) \
__list_lru_init((lru), false, (key), NULL)
#define list_lru_init_memcg(lru, shrinker) \
__list_lru_init((lru), true, NULL, shrinker)
int memcg_update_all_list_lrus(int num_memcgs);
void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg);
/**
* list_lru_add: add an element to the lru list's tail
* @list_lru: the lru pointer
* @item: the item to be added.
*
* If the element is already part of a list, this function returns doing
* nothing. Therefore the caller does not need to keep state about whether or
* not the element already belongs in the list and is allowed to lazy update
* it. Note however that this is valid for *a* list, not *this* list. If
* the caller organize itself in a way that elements can be in more than
* one type of list, it is up to the caller to fully remove the item from
* the previous list (with list_lru_del() for instance) before moving it
* to @list_lru
*
* Return value: true if the list was updated, false otherwise
*/
bool list_lru_add(struct list_lru *lru, struct list_head *item);
/**
* list_lru_del: delete an element to the lru list
* @list_lru: the lru pointer
* @item: the item to be deleted.
*
* This function works analogously as list_lru_add in terms of list
* manipulation. The comments about an element already pertaining to
* a list are also valid for list_lru_del.
*
* Return value: true if the list was updated, false otherwise
*/
bool list_lru_del(struct list_lru *lru, struct list_head *item);
/**
* list_lru_count_one: return the number of objects currently held by @lru
* @lru: the lru pointer.
* @nid: the node id to count from.
* @memcg: the cgroup to count from.
*
* Always return a non-negative number, 0 for empty lists. There is no
* guarantee that the list is not updated while the count is being computed.
* Callers that want such a guarantee need to provide an outer lock.
*/
unsigned long list_lru_count_one(struct list_lru *lru,
int nid, struct mem_cgroup *memcg);
unsigned long list_lru_count_node(struct list_lru *lru, int nid);
static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
struct shrink_control *sc)
{
return list_lru_count_one(lru, sc->nid, sc->memcg);
}
static inline unsigned long list_lru_count(struct list_lru *lru)
{
long count = 0;
int nid;
for_each_node_state(nid, N_NORMAL_MEMORY)
count += list_lru_count_node(lru, nid);
return count;
}
void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
struct list_head *head);
typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
/**
* list_lru_walk_one: walk a list_lru, isolating and disposing freeable items.
* @lru: the lru pointer.
* @nid: the node id to scan from.
* @memcg: the cgroup to scan from.
* @isolate: callback function that is responsible for deciding what to do with
* the item currently being scanned
* @cb_arg: opaque type that will be passed to @isolate
* @nr_to_walk: how many items to scan.
*
* This function will scan all elements in a particular list_lru, calling the
* @isolate callback for each of those items, along with the current list
* spinlock and a caller-provided opaque. The @isolate callback can choose to
* drop the lock internally, but *must* return with the lock held. The callback
* will return an enum lru_status telling the list_lru infrastructure what to
* do with the object being scanned.
*
* Please note that nr_to_walk does not mean how many objects will be freed,
* just how many objects will be scanned.
*
* Return value: the number of objects effectively removed from the LRU.
*/
unsigned long list_lru_walk_one(struct list_lru *lru,
int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk);
/**
* list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items.
* @lru: the lru pointer.
* @nid: the node id to scan from.
* @memcg: the cgroup to scan from.
* @isolate: callback function that is responsible for deciding what to do with
* the item currently being scanned
* @cb_arg: opaque type that will be passed to @isolate
* @nr_to_walk: how many items to scan.
*
* Same as @list_lru_walk_one except that the spinlock is acquired with
* spin_lock_irq().
*/
unsigned long list_lru_walk_one_irq(struct list_lru *lru,
int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk);
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk);
static inline unsigned long
list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
list_lru_walk_cb isolate, void *cb_arg)
{
return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg,
&sc->nr_to_scan);
}
static inline unsigned long
list_lru_shrink_walk_irq(struct list_lru *lru, struct shrink_control *sc,
list_lru_walk_cb isolate, void *cb_arg)
{
return list_lru_walk_one_irq(lru, sc->nid, sc->memcg, isolate, cb_arg,
&sc->nr_to_scan);
}
static inline unsigned long
list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
void *cb_arg, unsigned long nr_to_walk)
{
long isolated = 0;
int nid;
for_each_node_state(nid, N_NORMAL_MEMORY) {
isolated += list_lru_walk_node(lru, nid, isolate,
cb_arg, &nr_to_walk);
if (nr_to_walk <= 0)
break;
}
return isolated;
}
#endif /* _LRU_LIST_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"
#include "volumes.h"
#include "dev-replace.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#define BTRFS_ROOT_TRANS_TAG 0
/*
* Transaction states and transitions
*
* No running transaction (fs tree blocks are not modified)
* |
* | To next stage:
* | Call start_transaction() variants. Except btrfs_join_transaction_nostart().
* V
* Transaction N [[TRANS_STATE_RUNNING]]
* |
* | New trans handles can be attached to transaction N by calling all
* | start_transaction() variants.
* |
* | To next stage:
* | Call btrfs_commit_transaction() on any trans handle attached to
* | transaction N
* V
* Transaction N [[TRANS_STATE_COMMIT_START]]
* |
* | Will wait for previous running transaction to completely finish if there
* | is one
* |
* | Then one of the following happes:
* | - Wait for all other trans handle holders to release.
* | The btrfs_commit_transaction() caller will do the commit work.
* | - Wait for current transaction to be committed by others.
* | Other btrfs_commit_transaction() caller will do the commit work.
* |
* | At this stage, only btrfs_join_transaction*() variants can attach
* | to this running transaction.
* | All other variants will wait for current one to finish and attach to
* | transaction N+1.
* |
* | To next stage:
* | Caller is chosen to commit transaction N, and all other trans handle
* | haven been released.
* V
* Transaction N [[TRANS_STATE_COMMIT_DOING]]
* |
* | The heavy lifting transaction work is started.
* | From running delayed refs (modifying extent tree) to creating pending
* | snapshots, running qgroups.
* | In short, modify supporting trees to reflect modifications of subvolume
* | trees.
* |
* | At this stage, all start_transaction() calls will wait for this
* | transaction to finish and attach to transaction N+1.
* |
* | To next stage:
* | Until all supporting trees are updated.
* V
* Transaction N [[TRANS_STATE_UNBLOCKED]]
* | Transaction N+1
* | All needed trees are modified, thus we only [[TRANS_STATE_RUNNING]]
* | need to write them back to disk and update |
* | super blocks. |
* | |
* | At this stage, new transaction is allowed to |
* | start. |
* | All new start_transaction() calls will be |
* | attached to transid N+1. |
* | |
* | To next stage: |
* | Until all tree blocks are super blocks are |
* | written to block devices |
* V |
* Transaction N [[TRANS_STATE_COMPLETED]] V
* All tree blocks and super blocks are written. Transaction N+1
* This transaction is finished and all its [[TRANS_STATE_COMMIT_START]]
* data structures will be cleaned up. | Life goes on
*/
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING] = 0U,
[TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH),
[TRANS_STATE_COMMIT_DOING] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
__TRANS_JOIN_NOSTART),
[TRANS_STATE_UNBLOCKED] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
__TRANS_JOIN_NOLOCK |
__TRANS_JOIN_NOSTART),
[TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
__TRANS_JOIN_NOLOCK |
__TRANS_JOIN_NOSTART),
[TRANS_STATE_COMPLETED] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
__TRANS_JOIN_NOLOCK |
__TRANS_JOIN_NOSTART),
};
void btrfs_put_transaction(struct btrfs_transaction *transaction)
{
WARN_ON(refcount_read(&transaction->use_count) == 0);
if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT(
&transaction->delayed_refs.href_root.rb_root));
WARN_ON(!RB_EMPTY_ROOT(
&transaction->delayed_refs.dirty_extent_root));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
"pending csums is %llu",
transaction->delayed_refs.pending_csums);
/*
* If any block groups are found in ->deleted_bgs then it's
* because the transaction was aborted and a commit did not
* happen (things failed before writing the new superblock
* and calling btrfs_finish_extent_commit()), so we can not
* discard the physical locations of the block groups.
*/
while (!list_empty(&transaction->deleted_bgs)) {
struct btrfs_block_group *cache;
cache = list_first_entry(&transaction->deleted_bgs,
struct btrfs_block_group,
bg_list);
list_del_init(&cache->bg_list);
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
}
WARN_ON(!list_empty(&transaction->dev_update_list)); kfree(transaction);
}
}
static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root, *tmp;
struct btrfs_caching_control *caching_ctl, *next;
down_write(&fs_info->commit_root_sem);
if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
fs_info->last_reloc_trans = trans->transid; list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
dirty_list) {
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
root->commit_root = btrfs_root_node(root);
extent_io_tree_release(&root->dirty_log_pages);
btrfs_qgroup_clean_swapped_blocks(root);
}
/* We can free old roots now. */
spin_lock(&cur_trans->dropped_roots_lock);
while (!list_empty(&cur_trans->dropped_roots)) {
root = list_first_entry(&cur_trans->dropped_roots,
struct btrfs_root, root_list);
list_del_init(&root->root_list);
spin_unlock(&cur_trans->dropped_roots_lock);
btrfs_free_log(trans, root);
btrfs_drop_and_free_fs_root(fs_info, root);
spin_lock(&cur_trans->dropped_roots_lock);
}
spin_unlock(&cur_trans->dropped_roots_lock);
/*
* We have to update the last_byte_to_unpin under the commit_root_sem,
* at the same time we swap out the commit roots.
*
* This is because we must have a real view of the last spot the caching
* kthreads were while caching. Consider the following views of the
* extent tree for a block group
*
* commit root
* +----+----+----+----+----+----+----+
* |\\\\| |\\\\|\\\\| |\\\\|\\\\|
* +----+----+----+----+----+----+----+
* 0 1 2 3 4 5 6 7
*
* new commit root
* +----+----+----+----+----+----+----+
* | | | |\\\\| | |\\\\|
* +----+----+----+----+----+----+----+
* 0 1 2 3 4 5 6 7
*
* If the cache_ctl->progress was at 3, then we are only allowed to
* unpin [0,1) and [2,3], because the caching thread has already
* processed those extents. We are not allowed to unpin [5,6), because
* the caching thread will re-start it's search from 3, and thus find
* the hole from [4,6) to add to the free space cache.
*/
spin_lock(&fs_info->block_group_cache_lock);
list_for_each_entry_safe(caching_ctl, next,
&fs_info->caching_block_groups, list) {
struct btrfs_block_group *cache = caching_ctl->block_group;
if (btrfs_block_group_done(cache)) {
cache->last_byte_to_unpin = (u64)-1;
list_del_init(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
} else {
cache->last_byte_to_unpin = caching_ctl->progress;
}
}
spin_unlock(&fs_info->block_group_cache_lock);
up_write(&fs_info->commit_root_sem);
}
static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
unsigned int type)
{
if (type & TRANS_EXTWRITERS)
atomic_inc(&trans->num_extwriters);
}
static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
unsigned int type)
{
if (type & TRANS_EXTWRITERS)
atomic_dec(&trans->num_extwriters);
}
static inline void extwriter_counter_init(struct btrfs_transaction *trans,
unsigned int type)
{
atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
}
static inline int extwriter_counter_read(struct btrfs_transaction *trans)
{
return atomic_read(&trans->num_extwriters);
}
/*
* To be called after doing the chunk btree updates right after allocating a new
* chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
* chunk after all chunk btree updates and after finishing the second phase of
* chunk allocation (btrfs_create_pending_block_groups()) in case some block
* group had its chunk item insertion delayed to the second phase.
*/
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->chunk_bytes_reserved)
return;
btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
trans->chunk_bytes_reserved, NULL);
trans->chunk_bytes_reserved = 0;
}
/*
* either allocate a new transaction or hop into the existing one
*/
static noinline int join_transaction(struct btrfs_fs_info *fs_info,
unsigned int type)
{
struct btrfs_transaction *cur_trans;
spin_lock(&fs_info->trans_lock);
loop:
/* The file system has been taken offline. No new transactions. */
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
spin_unlock(&fs_info->trans_lock);
return -EROFS;
}
cur_trans = fs_info->running_transaction;
if (cur_trans) {
if (TRANS_ABORTED(cur_trans)) {
spin_unlock(&fs_info->trans_lock);
return cur_trans->aborted;
}
if (btrfs_blocked_trans_types[cur_trans->state] & type) {
spin_unlock(&fs_info->trans_lock);
return -EBUSY;
}
refcount_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers);
extwriter_counter_inc(cur_trans, type);
spin_unlock(&fs_info->trans_lock);
return 0;
}
spin_unlock(&fs_info->trans_lock);
/*
* If we are ATTACH, we just want to catch the current transaction,
* and commit it. If there is no transaction, just return ENOENT.
*/
if (type == TRANS_ATTACH)
return -ENOENT;
/*
* JOIN_NOLOCK only happens during the transaction commit, so
* it is impossible that ->running_transaction is NULL
*/
BUG_ON(type == TRANS_JOIN_NOLOCK);
cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
if (!cur_trans)
return -ENOMEM;
spin_lock(&fs_info->trans_lock);
if (fs_info->running_transaction) {
/*
* someone started a transaction after we unlocked. Make sure
* to redo the checks above
*/
kfree(cur_trans);
goto loop;
} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
spin_unlock(&fs_info->trans_lock);
kfree(cur_trans);
return -EROFS;
}
cur_trans->fs_info = fs_info;
atomic_set(&cur_trans->pending_ordered, 0);
init_waitqueue_head(&cur_trans->pending_wait);
atomic_set(&cur_trans->num_writers, 1);
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
cur_trans->state = TRANS_STATE_RUNNING;
/*
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
refcount_set(&cur_trans->use_count, 2);
cur_trans->flags = 0;
cur_trans->start_time = ktime_get_seconds();
memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
/*
* although the tree mod log is per file system and not per transaction,
* the log must never go across transaction boundaries.
*/
smp_mb();
if (!list_empty(&fs_info->tree_mod_seq_list))
WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n"); if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n"); atomic64_set(&fs_info->tree_mod_seq, 0);
spin_lock_init(&cur_trans->delayed_refs.lock);
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->dev_update_list);
INIT_LIST_HEAD(&cur_trans->switch_commits);
INIT_LIST_HEAD(&cur_trans->dirty_bgs);
INIT_LIST_HEAD(&cur_trans->io_bgs);
INIT_LIST_HEAD(&cur_trans->dropped_roots);
mutex_init(&cur_trans->cache_write_mutex);
spin_lock_init(&cur_trans->dirty_bgs_lock);
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
spin_lock_init(&cur_trans->dropped_roots_lock);
INIT_LIST_HEAD(&cur_trans->releasing_ebs);
spin_lock_init(&cur_trans->releasing_ebs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
IO_TREE_FS_PINNED_EXTENTS, NULL);
fs_info->generation++;
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
cur_trans->aborted = 0;
spin_unlock(&fs_info->trans_lock);
return 0;
}
/*
* This does all the record keeping required to make sure that a shareable root
* is properly recorded in a given transaction. This is required to make sure
* the old root from before we joined the transaction is deleted when the
* transaction commits.
*/
static int record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
int force)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
root->last_trans < trans->transid) || force) { WARN_ON(root == fs_info->extent_root); WARN_ON(!force && root->commit_root != root->node);
/*
* see below for IN_TRANS_SETUP usage rules
* we have the reloc mutex held now, so there
* is only one writer in this function
*/
set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
/* make sure readers find IN_TRANS_SETUP before
* they find our root->last_trans update
*/
smp_wmb();
spin_lock(&fs_info->fs_roots_radix_lock);
if (root->last_trans == trans->transid && !force) {
spin_unlock(&fs_info->fs_roots_radix_lock);
return 0;
}
radix_tree_tag_set(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
root->last_trans = trans->transid;
/* this is pretty tricky. We don't want to
* take the relocation lock in btrfs_record_root_in_trans
* unless we're really doing the first setup for this root in
* this transaction.
*
* Normally we'd use root->last_trans as a flag to decide
* if we want to take the expensive mutex.
*
* But, we have to set root->last_trans before we
* init the relocation root, otherwise, we trip over warnings
* in ctree.c. The solution used here is to flag ourselves
* with root IN_TRANS_SETUP. When this is 1, we're still
* fixing up the reloc trees and everyone must wait.
*
* When this is zero, they can trust root->last_trans and fly
* through btrfs_record_root_in_trans without having to take the
* lock. smp_wmb() makes sure that all the writes above are
* done before we pop in the zero below
*/
ret = btrfs_init_reloc_root(trans, root);
smp_mb__before_atomic();
clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
}
return ret;
}
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
/* Add ourselves to the transaction dropped list */
spin_lock(&cur_trans->dropped_roots_lock);
list_add_tail(&root->root_list, &cur_trans->dropped_roots);
spin_unlock(&cur_trans->dropped_roots_lock);
/* Make sure we don't try to update the root at commit time */
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
}
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return 0;
/*
* see record_root_in_trans for comments about IN_TRANS_SETUP usage
* and barriers
*/
smp_rmb();
if (root->last_trans == trans->transid &&
!test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
return 0;
mutex_lock(&fs_info->reloc_mutex);
ret = record_root_in_trans(trans, root, 0);
mutex_unlock(&fs_info->reloc_mutex);
return ret;
}
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
return (trans->state >= TRANS_STATE_COMMIT_START &&
trans->state < TRANS_STATE_UNBLOCKED &&
!TRANS_ABORTED(trans));
}
/* wait for commit against the current transaction to become unblocked
* when this is done, it is safe to start a new transaction, but the current
* transaction might not be fully on disk.
*/
static void wait_current_trans(struct btrfs_fs_info *fs_info)
{
struct btrfs_transaction *cur_trans;
spin_lock(&fs_info->trans_lock);
cur_trans = fs_info->running_transaction;
if (cur_trans && is_transaction_blocked(cur_trans)) {
refcount_inc(&cur_trans->use_count);
spin_unlock(&fs_info->trans_lock);
wait_event(fs_info->transaction_wait,
cur_trans->state >= TRANS_STATE_UNBLOCKED ||
TRANS_ABORTED(cur_trans));
btrfs_put_transaction(cur_trans);
} else {
spin_unlock(&fs_info->trans_lock);
}
}
static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
{
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
return 0;
if (type == TRANS_START)
return 1;
return 0;
}
static inline bool need_reserve_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
if (!fs_info->reloc_ctl ||
!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || root->reloc_root)
return false;
return true;
}
static struct btrfs_trans_handle *
start_transaction(struct btrfs_root *root, unsigned int num_items,
unsigned int type, enum btrfs_reserve_flush_enum flush,
bool enforce_qgroups)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
u64 num_bytes = 0;
u64 qgroup_reserved = 0;
bool reloc_reserved = false;
bool do_chunk_alloc = false;
int ret;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return ERR_PTR(-EROFS);
if (current->journal_info) {
WARN_ON(type & TRANS_EXTWRITERS);
h = current->journal_info;
refcount_inc(&h->use_count);
WARN_ON(refcount_read(&h->use_count) > 2); h->orig_rsv = h->block_rsv;
h->block_rsv = NULL;
goto got_it;
}
/*
* Do the reservation before we join the transaction so we can do all
* the appropriate flushing if need be.
*/
if (num_items && root != fs_info->chunk_root) { struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
u64 delayed_refs_bytes = 0;
qgroup_reserved = num_items * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
enforce_qgroups);
if (ret)
return ERR_PTR(ret);
/*
* We want to reserve all the bytes we may need all at once, so
* we only do 1 enospc flushing cycle per transaction start. We
* accomplish this by simply assuming we'll do 2 x num_items
* worth of delayed refs updates in this trans handle, and
* refill that amount for whatever is missing in the reserve.
*/
num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
if (flush == BTRFS_RESERVE_FLUSH_ALL &&
delayed_refs_rsv->full == 0) {
delayed_refs_bytes = num_bytes;
num_bytes <<= 1;
}
/*
* Do the reservation for the relocation root creation
*/
if (need_reserve_reloc_root(root)) {
num_bytes += fs_info->nodesize;
reloc_reserved = true;
}
ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
if (ret)
goto reserve_fail;
if (delayed_refs_bytes) { btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv,
delayed_refs_bytes);
num_bytes -= delayed_refs_bytes;
}
if (rsv->space_info->force_alloc)
do_chunk_alloc = true;
} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && !delayed_refs_rsv->full) {
/*
* Some people call with btrfs_start_transaction(root, 0)
* because they can be throttled, but have some other mechanism
* for reserving space. We still want these guys to refill the
* delayed block_rsv so just add 1 items worth of reservation
* here.
*/
ret = btrfs_delayed_refs_rsv_refill(fs_info, flush); if (ret)
goto reserve_fail;
}
again:
h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
if (!h) {
ret = -ENOMEM;
goto alloc_fail;
}
/*
* If we are JOIN_NOLOCK we're already committing a transaction and
* waiting on this guy, so we don't need to do the sb_start_intwrite
* because we're already holding a ref. We need this because we could
* have raced in and did an fsync() on a file which can kick a commit
* and then we deadlock with somebody doing a freeze.
*
* If we are ATTACH, it means we just want to catch the current
* transaction and commit it, so we needn't do sb_start_intwrite().
*/
if (type & __TRANS_FREEZABLE) sb_start_intwrite(fs_info->sb);
if (may_wait_transaction(fs_info, type))
wait_current_trans(fs_info);
do {
ret = join_transaction(fs_info, type);
if (ret == -EBUSY) {
wait_current_trans(fs_info); if (unlikely(type == TRANS_ATTACH ||
type == TRANS_JOIN_NOSTART))
ret = -ENOENT;
}
} while (ret == -EBUSY);
if (ret < 0)
goto join_fail;
cur_trans = fs_info->running_transaction;
h->transid = cur_trans->transid;
h->transaction = cur_trans;
h->root = root;
refcount_set(&h->use_count, 1);
h->fs_info = root->fs_info;
h->type = type;
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
may_wait_transaction(fs_info, type)) {
current->journal_info = h;
btrfs_commit_transaction(h);
goto again;
}
if (num_bytes) { trace_btrfs_space_reservation(fs_info, "transaction",
h->transid, num_bytes, 1);
h->block_rsv = &fs_info->trans_block_rsv;
h->bytes_reserved = num_bytes;
h->reloc_reserved = reloc_reserved;
}
got_it:
if (!current->journal_info)
current->journal_info = h;
/*
* If the space_info is marked ALLOC_FORCE then we'll get upgraded to
* ALLOC_FORCE the first run through, and then we won't allocate for
* anybody else who races in later. We don't care about the return
* value here.
*/
if (do_chunk_alloc && num_bytes) { u64 flags = h->block_rsv->space_info->flags;
btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
CHUNK_ALLOC_NO_FORCE);
}
/*
* btrfs_record_root_in_trans() needs to alloc new extents, and may
* call btrfs_join_transaction() while we're also starting a
* transaction.
*
* Thus it need to be called after current->journal_info initialized,
* or we can deadlock.
*/
ret = btrfs_record_root_in_trans(h, root); if (ret) {
/*
* The transaction handle is fully initialized and linked with
* other structures so it needs to be ended in case of errors,
* not just freed.
*/
btrfs_end_transaction(h);
return ERR_PTR(ret);
}
return h;
join_fail:
if (type & __TRANS_FREEZABLE) sb_end_intwrite(fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
if (num_bytes) btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
num_bytes, NULL);
reserve_fail:
btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);
return ERR_PTR(ret);
}
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
unsigned int num_items)
{
return start_transaction(root, num_items, TRANS_START,
BTRFS_RESERVE_FLUSH_ALL, true);
}
struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
struct btrfs_root *root,
unsigned int num_items)
{
return start_transaction(root, num_items, TRANS_START,
BTRFS_RESERVE_FLUSH_ALL_STEAL, false);
}
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH,
true);
}
struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
BTRFS_RESERVE_NO_FLUSH, true);
}
/*
* Similar to regular join but it never starts a transaction when none is
* running or after waiting for the current one to finish.
*/
struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_JOIN_NOSTART,
BTRFS_RESERVE_NO_FLUSH, true);
}
/*
* btrfs_attach_transaction() - catch the running transaction
*
* It is used when we want to commit the current the transaction, but
* don't want to start a new one.
*
* Note: If this function return -ENOENT, it just means there is no
* running transaction. But it is possible that the inactive transaction
* is still in the memory, not fully on disk. If you hope there is no
* inactive transaction in the fs when -ENOENT is returned, you should
* invoke
* btrfs_attach_transaction_barrier()
*/
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_ATTACH,
BTRFS_RESERVE_NO_FLUSH, true);
}
/*
* btrfs_attach_transaction_barrier() - catch the running transaction
*
* It is similar to the above function, the difference is this one
* will wait for all the inactive transactions until they fully
* complete.
*/
struct btrfs_trans_handle *
btrfs_attach_transaction_barrier(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
trans = start_transaction(root, 0, TRANS_ATTACH,
BTRFS_RESERVE_NO_FLUSH, true);
if (trans == ERR_PTR(-ENOENT))
btrfs_wait_for_commit(root->fs_info, 0); return trans;
}
/* Wait for a transaction commit to reach at least the given state. */
static noinline void wait_for_commit(struct btrfs_transaction *commit,
const enum btrfs_trans_state min_state)
{
struct btrfs_fs_info *fs_info = commit->fs_info;
u64 transid = commit->transid;
bool put = false;
while (1) {
wait_event(commit->commit_wait, commit->state >= min_state); if (put) btrfs_put_transaction(commit); if (min_state < TRANS_STATE_COMPLETED)
break;
/*
* A transaction isn't really completed until all of the
* previous transactions are completed, but with fsync we can
* end up with SUPER_COMMITTED transactions before a COMPLETED
* transaction. Wait for those.
*/
spin_lock(&fs_info->trans_lock);
commit = list_first_entry_or_null(&fs_info->trans_list,
struct btrfs_transaction,
list);
if (!commit || commit->transid > transid) {
spin_unlock(&fs_info->trans_lock);
break;
}
refcount_inc(&commit->use_count);
put = true;
spin_unlock(&fs_info->trans_lock);
}
}
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
{
struct btrfs_transaction *cur_trans = NULL, *t;
int ret = 0;
if (transid) { if (transid <= fs_info->last_trans_committed)
goto out;
/* find specified transaction */
spin_lock(&fs_info->trans_lock);
list_for_each_entry(t, &fs_info->trans_list, list) { if (t->transid == transid) {
cur_trans = t;
refcount_inc(&cur_trans->use_count);
ret = 0;
break;
}
if (t->transid > transid) {
ret = 0;
break;
}
}
spin_unlock(&fs_info->trans_lock);
/*
* The specified transaction doesn't exist, or we
* raced with btrfs_commit_transaction
*/
if (!cur_trans) {
if (transid > fs_info->last_trans_committed)
ret = -EINVAL;
goto out;
}
} else {
/* find newest transaction that is committing | committed */
spin_lock(&fs_info->trans_lock);
list_for_each_entry_reverse(t, &fs_info->trans_list,
list) {
if (t->state >= TRANS_STATE_COMMIT_START) { if (t->state == TRANS_STATE_COMPLETED)
break;
cur_trans = t;
refcount_inc(&cur_trans->use_count);
break;
}
}
spin_unlock(&fs_info->trans_lock);
if (!cur_trans)
goto out; /* nothing committing|committed */
}
wait_for_commit(cur_trans, TRANS_STATE_COMPLETED);
btrfs_put_transaction(cur_trans);
out:
return ret;
}
void btrfs_throttle(struct btrfs_fs_info *fs_info)
{
wait_current_trans(fs_info);
}
static bool should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
if (btrfs_check_space_for_delayed_refs(fs_info))
return true;
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
}
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction; if (cur_trans->state >= TRANS_STATE_COMMIT_START || test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
return true;
return should_end_transaction(trans);
}
static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->block_rsv) {
ASSERT(!trans->bytes_reserved);
return;
}
if (!trans->bytes_reserved)
return;
ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid, trans->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, trans->block_rsv,
trans->bytes_reserved, NULL);
trans->bytes_reserved = 0;
}
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
int throttle)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
int err = 0;
if (refcount_read(&trans->use_count) > 1) {
refcount_dec(&trans->use_count);
trans->block_rsv = trans->orig_rsv;
return 0;
}
btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
btrfs_create_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(info->sb); WARN_ON(cur_trans != info->running_transaction); WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
atomic_dec(&cur_trans->num_writers);
extwriter_counter_dec(cur_trans, trans->type);
cond_wake_up(&cur_trans->writer_wait); btrfs_put_transaction(cur_trans);
if (current->journal_info == trans)
current->journal_info = NULL;
if (throttle) btrfs_run_delayed_iputs(info); if (TRANS_ABORTED(trans) || test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { wake_up_process(info->transaction_kthread);
if (TRANS_ABORTED(trans))
err = trans->aborted;
else
err = -EROFS;
}
kmem_cache_free(btrfs_trans_handle_cachep, trans); return err;
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans)
{
return __btrfs_end_transaction(trans, 0);
}
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
{
return __btrfs_end_transaction(trans, 1);
}
/*
* when btree blocks are allocated, they have some corresponding bits set for
* them in one of two extent_io trees. This is used to make sure all of
* those extents are sent to disk but does not wait on them
*/
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark)
{
int err = 0;
int werr = 0;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers);
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
mark, &cached_state)) {
bool wait_writeback = false;
err = convert_extent_bit(dirty_pages, start, end,
EXTENT_NEED_WAIT,
mark, &cached_state);
/*
* convert_extent_bit can return -ENOMEM, which is most of the
* time a temporary error. So when it happens, ignore the error
* and wait for writeback of this range to finish - because we
* failed to set the bit EXTENT_NEED_WAIT for the range, a call
* to __btrfs_wait_marked_extents() would not know that
* writeback for this range started and therefore wouldn't
* wait for it to finish - we don't want to commit a
* superblock that points to btree nodes/leafs for which
* writeback hasn't finished yet (and without errors).
* We cleanup any entries left in the io tree when committing
* the transaction (through extent_io_tree_release()).
*/
if (err == -ENOMEM) {
err = 0;
wait_writeback = true;
}
if (!err) err = filemap_fdatawrite_range(mapping, start, end);
if (err)
werr = err;
else if (wait_writeback)
werr = filemap_fdatawait_range(mapping, start, end); free_extent_state(cached_state);
cached_state = NULL;
cond_resched();
start = end + 1;
}
atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers);
return werr;
}
/*
* when btree blocks are allocated, they have some corresponding bits set for
* them in one of two extent_io trees. This is used to make sure all of
* those extents are on disk for transaction or log commit. We wait
* on all the pages and clear them from the dirty pages state tree
*/
static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
int err = 0;
int werr = 0;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
EXTENT_NEED_WAIT, &cached_state)) {
/*
* Ignore -ENOMEM errors returned by clear_extent_bit().
* When committing the transaction, we'll remove any entries
* left in the io tree. For a log commit, we don't remove them
* after committing the log because the tree can be accessed
* concurrently - we do it only at transaction commit time when
* it's safe to do it (through extent_io_tree_release()).
*/
err = clear_extent_bit(dirty_pages, start, end,
EXTENT_NEED_WAIT, 0, 0, &cached_state);
if (err == -ENOMEM)
err = 0;
if (!err) err = filemap_fdatawait_range(mapping, start, end);
if (err)
werr = err;
free_extent_state(cached_state);
cached_state = NULL;
cond_resched();
start = end + 1;
}
if (err)
werr = err;
return werr;
}
static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
bool errors = false;
int err;
err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
errors = true;
if (errors && !err)
err = -EIO;
return err;
}
int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
{
struct btrfs_fs_info *fs_info = log_root->fs_info;
struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
bool errors = false;
int err;
ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
if ((mark & EXTENT_DIRTY) &&
test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
errors = true;
if ((mark & EXTENT_NEW) && test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
errors = true;
if (errors && !err)
err = -EIO;
return err;
}
/*
* When btree blocks are allocated the corresponding extents are marked dirty.
* This function ensures such extents are persisted on disk for transaction or
* log commit.
*
* @trans: transaction whose dirty pages we'd like to write
*/
static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
{
int ret;
int ret2;
struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct blk_plug plug;
blk_start_plug(&plug);
ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY);
blk_finish_plug(&plug);
ret2 = btrfs_wait_extents(fs_info, dirty_pages);
extent_io_tree_release(&trans->transaction->dirty_pages); if (ret)
return ret;
else if (ret2)
return ret2;
else
return 0;
}
/*
* this is used to update the root pointer in the tree of tree roots.
*
* But, in the case of the extent allocation tree, updating the root
* pointer may allocate blocks which may change the root of the extent
* allocation tree.
*
* So, this loops and repeats and makes sure the cowonly root didn't
* change while the root pointer was being updated in the metadata.
*/
static int update_cowonly_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
int ret;
u64 old_root_bytenr;
u64 old_root_used;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *tree_root = fs_info->tree_root;
old_root_used = btrfs_root_used(&root->root_item);
while (1) {
old_root_bytenr = btrfs_root_bytenr(&root->root_item);
if (old_root_bytenr == root->node->start &&
old_root_used == btrfs_root_used(&root->root_item))
break;
btrfs_set_root_node(&root->root_item, root->node);
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
&root->root_item);
if (ret)
return ret;
old_root_used = btrfs_root_used(&root->root_item);
}
return 0;
}
/*
* update all the cowonly tree roots on disk
*
* The error handling in this function may not be obvious. Any of the
* failures will cause the file system to go offline. We still need
* to clean up the delayed refs.
*/
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
struct list_head *io_bgs = &trans->transaction->io_bgs;
struct list_head *next;
struct extent_buffer *eb;
int ret;
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
0, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
if (ret)
return ret;
ret = btrfs_run_dev_stats(trans);
if (ret)
return ret;
ret = btrfs_run_dev_replace(trans);
if (ret)
return ret;
ret = btrfs_run_qgroups(trans);
if (ret)
return ret;
ret = btrfs_setup_space_cache(trans);
if (ret)
return ret;
again:
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
struct btrfs_root *root;
next = fs_info->dirty_cowonly_roots.next;
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
clear_bit(BTRFS_ROOT_DIRTY, &root->state);
if (root != fs_info->extent_root)
list_add_tail(&root->dirty_list,
&trans->transaction->switch_commits);
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
}
/* Now flush any delayed refs generated by updating all of the roots */
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
ret = btrfs_write_dirty_block_groups(trans);
if (ret)
return ret;
/*
* We're writing the dirty block groups, which could generate
* delayed refs, which could generate more dirty block groups,
* so we want to keep this flushing in this loop to make sure
* everything gets run.
*/
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
}
if (!list_empty(&fs_info->dirty_cowonly_roots))
goto again;
list_add_tail(&fs_info->extent_root->dirty_list,
&trans->transaction->switch_commits);
/* Update dev-replace pointer once everything is committed */
fs_info->dev_replace.committed_cursor_left =
fs_info->dev_replace.cursor_left_last_write_of_item;
return 0;
}
/*
* If we had a pending drop we need to see if there are any others left in our
* dead roots list, and if not clear our bit and wake any waiters.
*/
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
/*
* We put the drop in progress roots at the front of the list, so if the
* first entry doesn't have UNFINISHED_DROP set we can wake everybody
* up.
*/
spin_lock(&fs_info->trans_lock);
if (!list_empty(&fs_info->dead_roots)) {
struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
struct btrfs_root,
root_list);
if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
spin_unlock(&fs_info->trans_lock);
return;
}
}
spin_unlock(&fs_info->trans_lock);
btrfs_wake_unfinished_drop(fs_info);
}
/*
* dead roots are old snapshots that need to be deleted. This allocates
* a dirty root struct and adds it into the list of dead roots that need to
* be deleted
*/
void btrfs_add_dead_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->trans_lock);
if (list_empty(&root->root_list)) {
btrfs_grab_root(root);
/* We want to process the partially complete drops first. */
if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
list_add(&root->root_list, &fs_info->dead_roots);
else
list_add_tail(&root->root_list, &fs_info->dead_roots);
}
spin_unlock(&fs_info->trans_lock);
}
/*
* update all the cowonly tree roots on disk
*/
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *gang[8];
int i;
int ret;
spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
(void **)gang, 0,
ARRAY_SIZE(gang),
BTRFS_ROOT_TRANS_TAG);
if (ret == 0)
break;
for (i = 0; i < ret; i++) { struct btrfs_root *root = gang[i];
int ret2;
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log(trans, root);
ret2 = btrfs_update_reloc_root(trans, root);
if (ret2)
return ret2;
/* see comments in should_cow_block() */
clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_mb__after_atomic();
if (root->commit_root != root->node) {
list_add_tail(&root->dirty_list,
&trans->transaction->switch_commits);
btrfs_set_root_node(&root->root_item,
root->node);
}
ret2 = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key,
&root->root_item);
if (ret2)
return ret2;
spin_lock(&fs_info->fs_roots_radix_lock);
btrfs_qgroup_free_meta_all_pertrans(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
return 0;
}
/*
* defrag a given btree.
* Every leaf in the btree is read and defragged.
*/
int btrfs_defrag_root(struct btrfs_root *root)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_trans_handle *trans;
int ret;
if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
return 0;
while (1) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
break;
}
ret = btrfs_defrag_leaves(trans, root);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(info);
cond_resched();
if (btrfs_fs_closing(info) || ret != -EAGAIN)
break;
if (btrfs_defrag_cancelled(info)) {
btrfs_debug(info, "defrag_root cancelled");
ret = -EAGAIN;
break;
}
}
clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
return ret;
}
/*
* Do all special snapshot related qgroup dirty hack.
*
* Will do all needed qgroup inherit and dirty hack like switch commit
* roots inside one transaction and write all btree into disk, to make
* qgroup works.
*/
static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *src,
struct btrfs_root *parent,
struct btrfs_qgroup_inherit *inherit,
u64 dst_objectid)
{
struct btrfs_fs_info *fs_info = src->fs_info;
int ret;
/*
* Save some performance in the case that qgroups are not
* enabled. If this check races with the ioctl, rescan will
* kick in anyway.
*/
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
/*
* Ensure dirty @src will be committed. Or, after coming
* commit_fs_roots() and switch_commit_roots(), any dirty but not
* recorded root will never be updated again, causing an outdated root
* item.
*/
ret = record_root_in_trans(trans, src, 1);
if (ret)
return ret;
/*
* btrfs_qgroup_inherit relies on a consistent view of the usage for the
* src root, so we must run the delayed refs here.
*
* However this isn't particularly fool proof, because there's no
* synchronization keeping us from changing the tree after this point
* before we do the qgroup_inherit, or even from making changes while
* we're doing the qgroup_inherit. But that's a problem for the future,
* for now flush the delayed refs to narrow the race window where the
* qgroup counters could end up wrong.
*/
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
}
/*
* We are going to commit transaction, see btrfs_commit_transaction()
* comment for reason locking tree_log_mutex
*/
mutex_lock(&fs_info->tree_log_mutex);
ret = commit_fs_roots(trans);
if (ret)
goto out;
ret = btrfs_qgroup_account_extents(trans);
if (ret < 0)
goto out;
/* Now qgroup are all updated, we can inherit it to new qgroups */
ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
inherit);
if (ret < 0)
goto out;
/*
* Now we do a simplified commit transaction, which will:
* 1) commit all subvolume and extent tree
* To ensure all subvolume and extent tree have a valid
* commit_root to accounting later insert_dir_item()
* 2) write all btree blocks onto disk
* This is to make sure later btree modification will be cowed
* Or commit_root can be populated and cause wrong qgroup numbers
* In this simplified commit, we don't really care about other trees
* like chunk and root tree, as they won't affect qgroup.
* And we don't write super to avoid half committed status.
*/
ret = commit_cowonly_roots(trans);
if (ret)
goto out;
switch_commit_roots(trans);
ret = btrfs_write_and_wait_transaction(trans);
if (ret)
btrfs_handle_fs_error(fs_info, ret,
"Error while writing out transaction for qgroup");
out:
mutex_unlock(&fs_info->tree_log_mutex);
/*
* Force parent root to be updated, as we recorded it before so its
* last_trans == cur_transid.
* Or it won't be committed again onto disk after later
* insert_dir_item()
*/
if (!ret)
ret = record_root_in_trans(trans, parent, 1);
return ret;
}
/*
* new snapshots need to be created at a very specific time in the
* transaction commit. This does the actual creation.
*
* Note:
* If the error which may affect the commitment of the current transaction
* happens, we should return the error number. If the error which just affect
* the creation of the pending snapshots, just return 0.
*/
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_key key;
struct btrfs_root_item *new_root_item;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
struct inode *parent_inode;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct dentry *dentry;
struct extent_buffer *tmp;
struct extent_buffer *old;
struct timespec64 cur_time;
int ret = 0;
u64 to_reserve = 0;
u64 index = 0;
u64 objectid;
u64 root_flags;
ASSERT(pending->path);
path = pending->path;
ASSERT(pending->root_item);
new_root_item = pending->root_item;
pending->error = btrfs_get_free_objectid(tree_root, &objectid);
if (pending->error)
goto no_free_objectid;
/*
* Make qgroup to skip current new snapshot's qgroupid, as it is
* accounted by later btrfs_qgroup_inherit().
*/
btrfs_set_skip_qgroup(trans, objectid);
btrfs_reloc_pre_snapshot(pending, &to_reserve);
if (to_reserve > 0) {
pending->error = btrfs_block_rsv_add(root,
&pending->block_rsv,
to_reserve,
BTRFS_RESERVE_NO_FLUSH);
if (pending->error)
goto clear_skip_qgroup;
}
key.objectid = objectid;
key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
trans->bytes_reserved = trans->block_rsv->reserved;
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid,
trans->bytes_reserved, 1);
dentry = pending->dentry;
parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
ret = record_root_in_trans(trans, parent_root, 0);
if (ret)
goto fail;
cur_time = current_time(parent_inode);
/*
* insert the directory item
*/
ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
BUG_ON(ret); /* -ENOMEM */
/* check if there is a file/dir which has the same name. */
dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
btrfs_ino(BTRFS_I(parent_inode)),
dentry->d_name.name,
dentry->d_name.len, 0);
if (dir_item != NULL && !IS_ERR(dir_item)) {
pending->error = -EEXIST;
goto dir_item_existed;
} else if (IS_ERR(dir_item)) {
ret = PTR_ERR(dir_item);
btrfs_abort_transaction(trans, ret);
goto fail;
}
btrfs_release_path(path);
/*
* pull in the delayed directory update
* and the delayed inode item
* otherwise we corrupt the FS during
* snapshot
*/
ret = btrfs_run_delayed_items(trans);
if (ret) { /* Transaction aborted */
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = record_root_in_trans(trans, root, 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
btrfs_check_and_init_root_item(new_root_item);
root_flags = btrfs_root_flags(new_root_item);
if (pending->readonly)
root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
else
root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
btrfs_set_root_flags(new_root_item, root_flags);
btrfs_set_root_generation_v2(new_root_item,
trans->transid);
generate_random_guid(new_root_item->uuid);
memcpy(new_root_item->parent_uuid, root->root_item.uuid,
BTRFS_UUID_SIZE);
if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
memset(new_root_item->received_uuid, 0,
sizeof(new_root_item->received_uuid));
memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
btrfs_set_root_stransid(new_root_item, 0);
btrfs_set_root_rtransid(new_root_item, 0);
}
btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
btrfs_set_root_otransid(new_root_item, trans->transid);
old = btrfs_lock_root_node(root);
ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
/* clean up in any case */
btrfs_tree_unlock(old);
free_extent_buffer(old);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
/* see comments in should_cow_block() */
set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_wmb();
btrfs_set_root_node(new_root_item, tmp);
/* record when the snapshot was created in key.offset */
key.offset = trans->transid;
ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
/*
* insert root back/forward references
*/
ret = btrfs_add_root_ref(trans, objectid,
parent_root->root_key.objectid,
btrfs_ino(BTRFS_I(parent_inode)), index,
dentry->d_name.name, dentry->d_name.len);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
key.offset = (u64)-1;
pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
pending->snap = NULL;
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_reloc_post_snapshot(trans, pending);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
/*
* Do special qgroup accounting for snapshot, as we do some qgroup
* snapshot hack to do fast snapshot.
* To co-operate with that hack, we do hack again.
* Or snapshot will be greatly slowed down by a subtree qgroup rescan
*/
ret = qgroup_account_snapshot(trans, root, parent_root,
pending->inherit, objectid);
if (ret < 0)
goto fail;
ret = btrfs_insert_dir_item(trans, dentry->d_name.name,
dentry->d_name.len, BTRFS_I(parent_inode),
&key, BTRFS_FT_DIR, index);
/* We have check then name at the beginning, so it is impossible. */
BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
dentry->d_name.len * 2);
parent_inode->i_mtime = parent_inode->i_ctime =
current_time(parent_inode);
ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
BTRFS_UUID_KEY_SUBVOL,
objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
objectid);
if (ret && ret != -EEXIST) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
}
fail:
pending->error = ret;
dir_item_existed:
trans->block_rsv = rsv;
trans->bytes_reserved = 0;
clear_skip_qgroup:
btrfs_clear_skip_qgroup(trans);
no_free_objectid:
kfree(new_root_item);
pending->root_item = NULL;
btrfs_free_path(path);
pending->path = NULL;
return ret;
}
/*
* create all the snapshots we've scheduled for creation
*/
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans)
{
struct btrfs_pending_snapshot *pending, *next;
struct list_head *head = &trans->transaction->pending_snapshots;
int ret = 0;
list_for_each_entry_safe(pending, next, head, list) {
list_del(&pending->list);
ret = create_pending_snapshot(trans, pending);
if (ret)
break;
}
return ret;
}
static void update_super_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root_item *root_item;
struct btrfs_super_block *super;
super = fs_info->super_copy;
root_item = &fs_info->chunk_root->root_item;
super->chunk_root = root_item->bytenr;
super->chunk_root_generation = root_item->generation;
super->chunk_root_level = root_item->level;
root_item = &fs_info->tree_root->root_item;
super->root = root_item->bytenr;
super->generation = root_item->generation;
super->root_level = root_item->level;
if (btrfs_test_opt(fs_info, SPACE_CACHE))
super->cache_generation = root_item->generation;
else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags))
super->cache_generation = 0;
if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
}
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
struct btrfs_transaction *trans;
int ret = 0;
spin_lock(&info->trans_lock);
trans = info->running_transaction;
if (trans)
ret = (trans->state >= TRANS_STATE_COMMIT_START);
spin_unlock(&info->trans_lock);
return ret;
}
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
struct btrfs_transaction *trans;
int ret = 0;
spin_lock(&info->trans_lock);
trans = info->running_transaction;
if (trans)
ret = is_transaction_blocked(trans);
spin_unlock(&info->trans_lock);
return ret;
}
/*
* commit transactions asynchronously. once btrfs_commit_transaction_async
* returns, any subsequent transaction will not be allowed to join.
*/
struct btrfs_async_commit {
struct btrfs_trans_handle *newtrans;
struct work_struct work;
};
static void do_async_commit(struct work_struct *work)
{
struct btrfs_async_commit *ac =
container_of(work, struct btrfs_async_commit, work);
/*
* We've got freeze protection passed with the transaction.
* Tell lockdep about it.
*/
if (ac->newtrans->type & __TRANS_FREEZABLE)
__sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS);
current->journal_info = ac->newtrans;
btrfs_commit_transaction(ac->newtrans);
kfree(ac);
}
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_async_commit *ac;
struct btrfs_transaction *cur_trans;
ac = kmalloc(sizeof(*ac), GFP_NOFS);
if (!ac)
return -ENOMEM;
INIT_WORK(&ac->work, do_async_commit);
ac->newtrans = btrfs_join_transaction(trans->root);
if (IS_ERR(ac->newtrans)) {
int err = PTR_ERR(ac->newtrans);
kfree(ac);
return err;
}
/* take transaction reference */
cur_trans = trans->transaction;
refcount_inc(&cur_trans->use_count);
btrfs_end_transaction(trans);
/*
* Tell lockdep we've released the freeze rwsem, since the
* async commit thread will be the one to unlock it.
*/
if (ac->newtrans->type & __TRANS_FREEZABLE)
__sb_writers_release(fs_info->sb, SB_FREEZE_FS);
schedule_work(&ac->work);
/*
* Wait for the current transaction commit to start and block
* subsequent transaction joins
*/
wait_event(fs_info->transaction_blocked_wait,
cur_trans->state >= TRANS_STATE_COMMIT_START ||
TRANS_ABORTED(cur_trans));
if (current->journal_info == trans)
current->journal_info = NULL;
btrfs_put_transaction(cur_trans);
return 0;
}
static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
WARN_ON(refcount_read(&trans->use_count) > 1);
btrfs_abort_transaction(trans, err);
spin_lock(&fs_info->trans_lock);
/*
* If the transaction is removed from the list, it means this
* transaction has been committed successfully, so it is impossible
* to call the cleanup function.
*/
BUG_ON(list_empty(&cur_trans->list));
if (cur_trans == fs_info->running_transaction) {
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
spin_lock(&fs_info->trans_lock);
}
/*
* Now that we know no one else is still using the transaction we can
* remove the transaction from the list of transactions. This avoids
* the transaction kthread from cleaning up the transaction while some
* other task is still using it, which could result in a use-after-free
* on things like log trees, as it forces the transaction kthread to
* wait for this transaction to be cleaned up by us.
*/
list_del_init(&cur_trans->list);
spin_unlock(&fs_info->trans_lock);
btrfs_cleanup_one_transaction(trans->transaction, fs_info);
spin_lock(&fs_info->trans_lock);
if (cur_trans == fs_info->running_transaction)
fs_info->running_transaction = NULL;
spin_unlock(&fs_info->trans_lock);
if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(fs_info->sb);
btrfs_put_transaction(cur_trans);
btrfs_put_transaction(cur_trans);
trace_btrfs_transaction_commit(trans->root);
if (current->journal_info == trans)
current->journal_info = NULL;
btrfs_scrub_cancel(fs_info);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
/*
* Release reserved delayed ref space of all pending block groups of the
* transaction and remove them from the list
*/
static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *block_group, *tmp;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { btrfs_delayed_refs_rsv_release(fs_info, 1);
list_del_init(&block_group->bg_list);
}
}
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
/*
* We use try_to_writeback_inodes_sb() here because if we used
* btrfs_start_delalloc_roots we would deadlock with fs freeze.
* Currently are holding the fs freeze lock, if we do an async flush
* we'll do btrfs_join_transaction() and deadlock because we need to
* wait for the fs freeze lock. Using the direct flushing we benefit
* from already being in a transaction and our join_transaction doesn't
* have to re-take the fs freeze lock.
*
* Note that try_to_writeback_inodes_sb() will only trigger writeback
* if it can read lock sb->s_umount. It will always be able to lock it,
* except when the filesystem is being unmounted or being frozen, but in
* those cases sync_filesystem() is called, which results in calling
* writeback_inodes_sb() while holding a write lock on sb->s_umount.
* Note that we don't call writeback_inodes_sb() directly, because it
* will emit a warning if sb->s_umount is not locked.
*/
if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
return 0;
}
static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}
/*
* Add a pending snapshot associated with the given transaction handle to the
* respective handle. This must be called after the transaction commit started
* and while holding fs_info->trans_lock.
* This serves to guarantee a caller of btrfs_commit_transaction() that it can
* safely free the pending snapshot pointer in case btrfs_commit_transaction()
* returns an error.
*/
static void add_pending_snapshot(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
if (!trans->pending_snapshot)
return;
lockdep_assert_held(&trans->fs_info->trans_lock);
ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START);
list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
int ret;
ASSERT(refcount_read(&trans->use_count) == 1);
/* Stop the commit early if ->aborted is set */
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
btrfs_end_transaction(trans);
return ret;
}
btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
/*
* We only want one transaction commit doing the flushing so we do not
* waste a bunch of time on lock contention on the extent root node.
*/
if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING,
&cur_trans->delayed_refs.flags)) {
/*
* Make a pass through all the delayed refs we have so far.
* Any running threads may add more while we are here.
*/
ret = btrfs_run_delayed_refs(trans, 0);
if (ret) {
btrfs_end_transaction(trans);
return ret;
}
}
btrfs_create_pending_block_groups(trans);
if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
int run_it = 0;
/* this mutex is also taken before trying to set
* block groups readonly. We need to make sure
* that nobody has set a block group readonly
* after a extents from that block group have been
* allocated for cache files. btrfs_set_block_group_ro
* will wait for the transaction to commit if it
* finds BTRFS_TRANS_DIRTY_BG_RUN set.
*
* The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
* only one process starts all the block group IO. It wouldn't
* hurt to have more than one go through, but there's no
* real advantage to it either.
*/
mutex_lock(&fs_info->ro_block_group_mutex);
if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
&cur_trans->flags))
run_it = 1;
mutex_unlock(&fs_info->ro_block_group_mutex);
if (run_it) {
ret = btrfs_start_dirty_block_groups(trans);
if (ret) {
btrfs_end_transaction(trans);
return ret;
}
}
}
spin_lock(&fs_info->trans_lock);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
add_pending_snapshot(trans);
spin_unlock(&fs_info->trans_lock);
refcount_inc(&cur_trans->use_count);
if (trans->in_fsync)
want_state = TRANS_STATE_SUPER_COMMITTED;
ret = btrfs_end_transaction(trans);
wait_for_commit(cur_trans, want_state);
if (TRANS_ABORTED(cur_trans))
ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); return ret;
}
cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&fs_info->transaction_blocked_wait);
if (cur_trans->list.prev != &fs_info->trans_list) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
if (trans->in_fsync)
want_state = TRANS_STATE_SUPER_COMMITTED;
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
if (prev_trans->state < want_state) { refcount_inc(&prev_trans->use_count);
spin_unlock(&fs_info->trans_lock);
wait_for_commit(prev_trans, want_state);
ret = READ_ONCE(prev_trans->aborted);
btrfs_put_transaction(prev_trans);
if (ret)
goto cleanup_transaction;
} else {
spin_unlock(&fs_info->trans_lock);
}
} else {
spin_unlock(&fs_info->trans_lock);
/*
* The previous transaction was aborted and was already removed
* from the list of transactions at fs_info->trans_list. So we
* abort to prevent writing a new superblock that reflects a
* corrupt state (pointing to trees with unwritten nodes/leafs).
*/
if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
ret = -EROFS;
goto cleanup_transaction;
}
}
extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info);
if (ret)
goto cleanup_transaction;
ret = btrfs_run_delayed_items(trans);
if (ret)
goto cleanup_transaction;
wait_event(cur_trans->writer_wait,
extwriter_counter_read(cur_trans) == 0);
/* some pending stuffs might be added after the previous flush. */
ret = btrfs_run_delayed_items(trans);
if (ret)
goto cleanup_transaction;
btrfs_wait_delalloc_flush(fs_info);
/*
* Wait for all ordered extents started by a fast fsync that joined this
* transaction. Otherwise if this transaction commits before the ordered
* extents complete we lose logged data after a power failure.
*/
wait_event(cur_trans->pending_wait,
atomic_read(&cur_trans->pending_ordered) == 0);
btrfs_scrub_pause(fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
* commit the transaction. We could have started a join before setting
* COMMIT_DOING so make sure to wait for num_writers to == 1 again.
*/
spin_lock(&fs_info->trans_lock);
add_pending_snapshot(trans); cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted;
goto scrub_continue;
}
/*
* the reloc mutex makes sure that we stop
* the balancing code from coming in and moving
* extents around in the middle of the commit
*/
mutex_lock(&fs_info->reloc_mutex);
/*
* We needn't worry about the delayed items because we will
* deal with them in create_pending_snapshot(), which is the
* core function of the snapshot creation.
*/
ret = create_pending_snapshots(trans);
if (ret)
goto unlock_reloc;
/*
* We insert the dir indexes of the snapshots and update the inode
* of the snapshots' parents after the snapshot creation, so there
* are some delayed items which are not dealt with. Now deal with
* them.
*
* We needn't worry that this operation will corrupt the snapshots,
* because all the tree which are snapshoted will be forced to COW
* the nodes and leaves.
*/
ret = btrfs_run_delayed_items(trans);
if (ret)
goto unlock_reloc;
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
goto unlock_reloc;
/*
* make sure none of the code above managed to slip in a
* delayed item
*/
btrfs_assert_delayed_root_empty(fs_info); WARN_ON(cur_trans != trans->transaction);
/* btrfs_commit_tree_roots is responsible for getting the
* various roots consistent with each other. Every pointer
* in the tree of tree roots has to point to the most up to date
* root for every subvolume and other tree. So, we have to keep
* the tree logging code from jumping in and changing any
* of the trees.
*
* At this point in the commit, there can't be any tree-log
* writers, but a little lower down we drop the trans mutex
* and let new people in. By holding the tree_log_mutex
* from now until after the super is written, we avoid races
* with the tree-log code.
*/
mutex_lock(&fs_info->tree_log_mutex);
ret = commit_fs_roots(trans);
if (ret)
goto unlock_tree_log;
/*
* Since the transaction is done, we can apply the pending changes
* before the next transaction.
*/
btrfs_apply_pending_changes(fs_info);
/* commit_fs_roots gets rid of all the tree log roots, it is now
* safe to free the root of tree log roots
*/
btrfs_free_log_root_tree(trans, fs_info);
/*
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
ret = btrfs_qgroup_account_extents(trans);
if (ret < 0)
goto unlock_tree_log;
ret = commit_cowonly_roots(trans);
if (ret)
goto unlock_tree_log;
/*
* The tasks which save the space cache and inode cache may also
* update ->aborted, check it.
*/
if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted;
goto unlock_tree_log;
}
cur_trans = fs_info->running_transaction;
btrfs_set_root_node(&fs_info->tree_root->root_item,
fs_info->tree_root->node);
list_add_tail(&fs_info->tree_root->dirty_list,
&cur_trans->switch_commits);
btrfs_set_root_node(&fs_info->chunk_root->root_item,
fs_info->chunk_root->node);
list_add_tail(&fs_info->chunk_root->dirty_list,
&cur_trans->switch_commits);
switch_commit_roots(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
ASSERT(list_empty(&cur_trans->io_bgs));
update_super_roots(fs_info);
btrfs_set_super_log_root(fs_info->super_copy, 0);
btrfs_set_super_log_root_level(fs_info->super_copy, 0);
memcpy(fs_info->super_for_commit, fs_info->super_copy,
sizeof(*fs_info->super_copy));
btrfs_commit_device_sizes(cur_trans);
clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
btrfs_trans_release_chunk_metadata(trans);
spin_lock(&fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
fs_info->running_transaction = NULL;
spin_unlock(&fs_info->trans_lock);
mutex_unlock(&fs_info->reloc_mutex);
wake_up(&fs_info->transaction_wait);
ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Error while writing out transaction");
/*
* reloc_mutex has been unlocked, tree_log_mutex is still held
* but we can't jump to unlock_tree_log causing double unlock
*/
mutex_unlock(&fs_info->tree_log_mutex);
goto scrub_continue;
}
/*
* At this point, we should have written all the tree blocks allocated
* in this transaction. So it's now safe to free the redirtyied extent
* buffers.
*/
btrfs_free_redirty_list(cur_trans);
ret = write_all_supers(fs_info, 0);
/*
* the super is written, we can safely allow the tree-loggers
* to go about their business
*/
mutex_unlock(&fs_info->tree_log_mutex);
if (ret)
goto scrub_continue;
/*
* We needn't acquire the lock here because there is no other task
* which can change it.
*/
cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
wake_up(&cur_trans->commit_wait);
btrfs_finish_extent_commit(trans);
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(fs_info); fs_info->last_trans_committed = cur_trans->transid;
/*
* We needn't acquire the lock here because there is no other task
* which can change it.
*/
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
spin_lock(&fs_info->trans_lock);
list_del_init(&cur_trans->list);
spin_unlock(&fs_info->trans_lock);
btrfs_put_transaction(cur_trans);
btrfs_put_transaction(cur_trans);
if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(fs_info->sb); trace_btrfs_transaction_commit(trans->root); btrfs_scrub_continue(fs_info);
if (current->journal_info == trans)
current->journal_info = NULL;
kmem_cache_free(btrfs_trans_handle_cachep, trans);
return ret;
unlock_tree_log:
mutex_unlock(&fs_info->tree_log_mutex);
unlock_reloc:
mutex_unlock(&fs_info->reloc_mutex);
scrub_continue:
btrfs_scrub_continue(fs_info);
cleanup_transaction:
btrfs_trans_release_metadata(trans);
btrfs_cleanup_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
if (current->journal_info == trans)
current->journal_info = NULL;
cleanup_transaction(trans, ret);
return ret;
}
/*
* return < 0 if error
* 0 if there are no more dead_roots at the time of call
* 1 there are more to be processed, call me again
*
* The return value indicates there are certainly more snapshots to delete, but
* if there comes a new one during processing, it may return 0. We don't mind,
* because btrfs_commit_super will poke cleaner thread and it will process it a
* few seconds later.
*/
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->trans_lock);
if (list_empty(&fs_info->dead_roots)) {
spin_unlock(&fs_info->trans_lock);
return 0;
}
root = list_first_entry(&fs_info->dead_roots,
struct btrfs_root, root_list);
list_del_init(&root->root_list);
spin_unlock(&fs_info->trans_lock);
btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
btrfs_kill_all_delayed_nodes(root);
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
ret = btrfs_drop_snapshot(root, 0, 0);
else
ret = btrfs_drop_snapshot(root, 1, 0);
btrfs_put_root(root);
return (ret < 0) ? 0 : 1;
}
void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
{
unsigned long prev;
unsigned long bit;
prev = xchg(&fs_info->pending_changes, 0);
if (!prev)
return;
bit = 1 << BTRFS_PENDING_COMMIT;
if (prev & bit)
btrfs_debug(fs_info, "pending commit done");
prev &= ~bit;
if (prev)
btrfs_warn(fs_info,
"unknown pending changes left 0x%lx, ignoring", prev);
}
// SPDX-License-Identifier: GPL-2.0
/*
* drivers/base/core.c - core driver model code (device registration, etc)
*
* Copyright (c) 2002-3 Patrick Mochel
* Copyright (c) 2002-3 Open Source Development Labs
* Copyright (c) 2006 Greg Kroah-Hartman <gregkh@suse.de>
* Copyright (c) 2006 Novell, Inc.
*/
#include <linux/acpi.h>
#include <linux/cpufreq.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/fwnode.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/kdev_t.h>
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/genhd.h>
#include <linux/mutex.h>
#include <linux/pm_runtime.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/swiotlb.h>
#include <linux/sysfs.h>
#include <linux/dma-map-ops.h> /* for dma_default_coherent */
#include "base.h"
#include "power/power.h"
#ifdef CONFIG_SYSFS_DEPRECATED
#ifdef CONFIG_SYSFS_DEPRECATED_V2
long sysfs_deprecated = 1;
#else
long sysfs_deprecated = 0;
#endif
static int __init sysfs_deprecated_setup(char *arg)
{
return kstrtol(arg, 10, &sysfs_deprecated);
}
early_param("sysfs.deprecated", sysfs_deprecated_setup);
#endif
/* Device links support. */
static LIST_HEAD(deferred_sync);
static unsigned int defer_sync_state_count = 1;
static DEFINE_MUTEX(fwnode_link_lock);
static bool fw_devlink_is_permissive(void);
static bool fw_devlink_drv_reg_done;
/**
* fwnode_link_add - Create a link between two fwnode_handles.
* @con: Consumer end of the link.
* @sup: Supplier end of the link.
*
* Create a fwnode link between fwnode handles @con and @sup. The fwnode link
* represents the detail that the firmware lists @sup fwnode as supplying a
* resource to @con.
*
* The driver core will use the fwnode link to create a device link between the
* two device objects corresponding to @con and @sup when they are created. The
* driver core will automatically delete the fwnode link between @con and @sup
* after doing that.
*
* Attempts to create duplicate links between the same pair of fwnode handles
* are ignored and there is no reference counting.
*/
int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup)
{
struct fwnode_link *link;
int ret = 0;
mutex_lock(&fwnode_link_lock);
list_for_each_entry(link, &sup->consumers, s_hook)
if (link->consumer == con)
goto out;
link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
ret = -ENOMEM;
goto out;
}
link->supplier = sup;
INIT_LIST_HEAD(&link->s_hook);
link->consumer = con;
INIT_LIST_HEAD(&link->c_hook);
list_add(&link->s_hook, &sup->consumers);
list_add(&link->c_hook, &con->suppliers);
pr_debug("%pfwP Linked as a fwnode consumer to %pfwP\n",
con, sup);
out:
mutex_unlock(&fwnode_link_lock);
return ret;
}
/**
* __fwnode_link_del - Delete a link between two fwnode_handles.
* @link: the fwnode_link to be deleted
*
* The fwnode_link_lock needs to be held when this function is called.
*/
static void __fwnode_link_del(struct fwnode_link *link)
{
pr_debug("%pfwP Dropping the fwnode link to %pfwP\n",
link->consumer, link->supplier);
list_del(&link->s_hook);
list_del(&link->c_hook);
kfree(link);
}
/**
* fwnode_links_purge_suppliers - Delete all supplier links of fwnode_handle.
* @fwnode: fwnode whose supplier links need to be deleted
*
* Deletes all supplier links connecting directly to @fwnode.
*/
static void fwnode_links_purge_suppliers(struct fwnode_handle *fwnode)
{
struct fwnode_link *link, *tmp;
mutex_lock(&fwnode_link_lock);
list_for_each_entry_safe(link, tmp, &fwnode->suppliers, c_hook)
__fwnode_link_del(link);
mutex_unlock(&fwnode_link_lock);
}
/**
* fwnode_links_purge_consumers - Delete all consumer links of fwnode_handle.
* @fwnode: fwnode whose consumer links need to be deleted
*
* Deletes all consumer links connecting directly to @fwnode.
*/
static void fwnode_links_purge_consumers(struct fwnode_handle *fwnode)
{
struct fwnode_link *link, *tmp;
mutex_lock(&fwnode_link_lock);
list_for_each_entry_safe(link, tmp, &fwnode->consumers, s_hook)
__fwnode_link_del(link);
mutex_unlock(&fwnode_link_lock);
}
/**
* fwnode_links_purge - Delete all links connected to a fwnode_handle.
* @fwnode: fwnode whose links needs to be deleted
*
* Deletes all links connecting directly to a fwnode.
*/
void fwnode_links_purge(struct fwnode_handle *fwnode)
{
fwnode_links_purge_suppliers(fwnode);
fwnode_links_purge_consumers(fwnode);
}
void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode)
{
struct fwnode_handle *child;
/* Don't purge consumer links of an added child */
if (fwnode->dev)
return;
fwnode->flags |= FWNODE_FLAG_NOT_DEVICE;
fwnode_links_purge_consumers(fwnode);
fwnode_for_each_available_child_node(fwnode, child)
fw_devlink_purge_absent_suppliers(child);
}
EXPORT_SYMBOL_GPL(fw_devlink_purge_absent_suppliers);
#ifdef CONFIG_SRCU
static DEFINE_MUTEX(device_links_lock);
DEFINE_STATIC_SRCU(device_links_srcu);
static inline void device_links_write_lock(void)
{
mutex_lock(&device_links_lock);
}
static inline void device_links_write_unlock(void)
{
mutex_unlock(&device_links_lock);
}
int device_links_read_lock(void) __acquires(&device_links_srcu)
{
return srcu_read_lock(&device_links_srcu);
}
void device_links_read_unlock(int idx) __releases(&device_links_srcu)
{
srcu_read_unlock(&device_links_srcu, idx);
}
int device_links_read_lock_held(void)
{
return srcu_read_lock_held(&device_links_srcu);
}
static void device_link_synchronize_removal(void)
{
synchronize_srcu(&device_links_srcu);
}
static void device_link_remove_from_lists(struct device_link *link)
{
list_del_rcu(&link->s_node);
list_del_rcu(&link->c_node);
}
#else /* !CONFIG_SRCU */
static DECLARE_RWSEM(device_links_lock);
static inline void device_links_write_lock(void)
{
down_write(&device_links_lock);
}
static inline void device_links_write_unlock(void)
{
up_write(&device_links_lock);
}
int device_links_read_lock(void)
{
down_read(&device_links_lock);
return 0;
}
void device_links_read_unlock(int not_used)
{
up_read(&device_links_lock);
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
int device_links_read_lock_held(void)
{
return lockdep_is_held(&device_links_lock);
}
#endif
static inline void device_link_synchronize_removal(void)
{
}
static void device_link_remove_from_lists(struct device_link *link)
{
list_del(&link->s_node);
list_del(&link->c_node);
}
#endif /* !CONFIG_SRCU */
static bool device_is_ancestor(struct device *dev, struct device *target)
{
while (target->parent) {
target = target->parent;
if (dev == target)
return true;
}
return false;
}
/**
* device_is_dependent - Check if one device depends on another one
* @dev: Device to check dependencies for.
* @target: Device to check against.
*
* Check if @target depends on @dev or any device dependent on it (its child or
* its consumer etc). Return 1 if that is the case or 0 otherwise.
*/
int device_is_dependent(struct device *dev, void *target)
{
struct device_link *link;
int ret;
/*
* The "ancestors" check is needed to catch the case when the target
* device has not been completely initialized yet and it is still
* missing from the list of children of its parent device.
*/
if (dev == target || device_is_ancestor(dev, target))
return 1;
ret = device_for_each_child(dev, target, device_is_dependent);
if (ret)
return ret;
list_for_each_entry(link, &dev->links.consumers, s_node) {
if ((link->flags & ~DL_FLAG_INFERRED) ==
(DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED))
continue;
if (link->consumer == target)
return 1;
ret = device_is_dependent(link->consumer, target);
if (ret)
break;
}
return ret;
}
static void device_link_init_status(struct device_link *link,
struct device *consumer,
struct device *supplier)
{
switch (supplier->links.status) {
case DL_DEV_PROBING:
switch (consumer->links.status) {
case DL_DEV_PROBING:
/*
* A consumer driver can create a link to a supplier
* that has not completed its probing yet as long as it
* knows that the supplier is already functional (for
* example, it has just acquired some resources from the
* supplier).
*/
link->status = DL_STATE_CONSUMER_PROBE;
break;
default:
link->status = DL_STATE_DORMANT;
break;
}
break;
case DL_DEV_DRIVER_BOUND:
switch (consumer->links.status) {
case DL_DEV_PROBING:
link->status = DL_STATE_CONSUMER_PROBE;
break;
case DL_DEV_DRIVER_BOUND:
link->status = DL_STATE_ACTIVE;
break;
default:
link->status = DL_STATE_AVAILABLE;
break;
}
break;
case DL_DEV_UNBINDING:
link->status = DL_STATE_SUPPLIER_UNBIND;
break;
default:
link->status = DL_STATE_DORMANT;
break;
}
}
static int device_reorder_to_tail(struct device *dev, void *not_used)
{
struct device_link *link;
/*
* Devices that have not been registered yet will be put to the ends
* of the lists during the registration, so skip them here.
*/
if (device_is_registered(dev))
devices_kset_move_last(dev);
if (device_pm_initialized(dev))
device_pm_move_last(dev);
device_for_each_child(dev, NULL, device_reorder_to_tail);
list_for_each_entry(link, &dev->links.consumers, s_node) {
if ((link->flags & ~DL_FLAG_INFERRED) ==
(DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED))
continue;
device_reorder_to_tail(link->consumer, NULL);
}
return 0;
}
/**
* device_pm_move_to_tail - Move set of devices to the end of device lists
* @dev: Device to move
*
* This is a device_reorder_to_tail() wrapper taking the requisite locks.
*
* It moves the @dev along with all of its children and all of its consumers
* to the ends of the device_kset and dpm_list, recursively.
*/
void device_pm_move_to_tail(struct device *dev)
{
int idx;
idx = device_links_read_lock();
device_pm_lock();
device_reorder_to_tail(dev, NULL);
device_pm_unlock();
device_links_read_unlock(idx);
}
#define to_devlink(dev) container_of((dev), struct device_link, link_dev)
static ssize_t status_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
const char *output;
switch (to_devlink(dev)->status) {
case DL_STATE_NONE:
output = "not tracked";
break;
case DL_STATE_DORMANT:
output = "dormant";
break;
case DL_STATE_AVAILABLE:
output = "available";
break;
case DL_STATE_CONSUMER_PROBE:
output = "consumer probing";
break;
case DL_STATE_ACTIVE:
output = "active";
break;
case DL_STATE_SUPPLIER_UNBIND:
output = "supplier unbinding";
break;
default:
output = "unknown";
break;
}
return sysfs_emit(buf, "%s\n", output);
}
static DEVICE_ATTR_RO(status);
static ssize_t auto_remove_on_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct device_link *link = to_devlink(dev);
const char *output;
if (link->flags & DL_FLAG_AUTOREMOVE_SUPPLIER)
output = "supplier unbind";
else if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER)
output = "consumer unbind";
else
output = "never";
return sysfs_emit(buf, "%s\n", output);
}
static DEVICE_ATTR_RO(auto_remove_on);
static ssize_t runtime_pm_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct device_link *link = to_devlink(dev);
return sysfs_emit(buf, "%d\n", !!(link->flags & DL_FLAG_PM_RUNTIME));
}
static DEVICE_ATTR_RO(runtime_pm);
static ssize_t sync_state_only_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct device_link *link = to_devlink(dev);
return sysfs_emit(buf, "%d\n",
!!(link->flags & DL_FLAG_SYNC_STATE_ONLY));
}
static DEVICE_ATTR_RO(sync_state_only);
static struct attribute *devlink_attrs[] = {
&dev_attr_status.attr,
&dev_attr_auto_remove_on.attr,
&dev_attr_runtime_pm.attr,
&dev_attr_sync_state_only.attr,
NULL,
};
ATTRIBUTE_GROUPS(devlink);
static void device_link_release_fn(struct work_struct *work)
{
struct device_link *link = container_of(work, struct device_link, rm_work);
/* Ensure that all references to the link object have been dropped. */
device_link_synchronize_removal();
pm_runtime_release_supplier(link, true);
put_device(link->consumer);
put_device(link->supplier);
kfree(link);
}
static void devlink_dev_release(struct device *dev)
{
struct device_link *link = to_devlink(dev);
INIT_WORK(&link->rm_work, device_link_release_fn);
/*
* It may take a while to complete this work because of the SRCU
* synchronization in device_link_release_fn() and if the consumer or
* supplier devices get deleted when it runs, so put it into the "long"
* workqueue.
*/
queue_work(system_long_wq, &link->rm_work);
}
static struct class devlink_class = {
.name = "devlink",
.owner = THIS_MODULE,
.dev_groups = devlink_groups,
.dev_release = devlink_dev_release,
};
static int devlink_add_symlinks(struct device *dev,
struct class_interface *class_intf)
{
int ret;
size_t len;
struct device_link *link = to_devlink(dev);
struct device *sup = link->supplier;
struct device *con = link->consumer;
char *buf;
len = max(strlen(dev_bus_name(sup)) + strlen(dev_name(sup)),
strlen(dev_bus_name(con)) + strlen(dev_name(con)));
len += strlen(":");
len += strlen("supplier:") + 1;
buf = kzalloc(len, GFP_KERNEL);
if (!buf)
return -ENOMEM;
ret = sysfs_create_link(&link->link_dev.kobj, &sup->kobj, "supplier");
if (ret)
goto out;
ret = sysfs_create_link(&link->link_dev.kobj, &con->kobj, "consumer");
if (ret)
goto err_con;
snprintf(buf, len, "consumer:%s:%s", dev_bus_name(con), dev_name(con));
ret = sysfs_create_link(&sup->kobj, &link->link_dev.kobj, buf);
if (ret)
goto err_con_dev;
snprintf(buf, len, "supplier:%s:%s", dev_bus_name(sup), dev_name(sup));
ret = sysfs_create_link(&con->kobj, &link->link_dev.kobj, buf);
if (ret)
goto err_sup_dev;
goto out;
err_sup_dev:
snprintf(buf, len, "consumer:%s:%s", dev_bus_name(con), dev_name(con));
sysfs_remove_link(&sup->kobj, buf);
err_con_dev:
sysfs_remove_link(&link->link_dev.kobj, "consumer");
err_con:
sysfs_remove_link(&link->link_dev.kobj, "supplier");
out:
kfree(buf);
return ret;
}
static void devlink_remove_symlinks(struct device *dev,
struct class_interface *class_intf)
{
struct device_link *link = to_devlink(dev);
size_t len;
struct device *sup = link->supplier;
struct device *con = link->consumer;
char *buf;
sysfs_remove_link(&link->link_dev.kobj, "consumer");
sysfs_remove_link(&link->link_dev.kobj, "supplier");
len = max(strlen(dev_bus_name(sup)) + strlen(dev_name(sup)),
strlen(dev_bus_name(con)) + strlen(dev_name(con)));
len += strlen(":");
len += strlen("supplier:") + 1;
buf = kzalloc(len, GFP_KERNEL);
if (!buf) {
WARN(1, "Unable to properly free device link symlinks!\n");
return;
}
if (device_is_registered(con)) {
snprintf(buf, len, "supplier:%s:%s", dev_bus_name(sup), dev_name(sup));
sysfs_remove_link(&con->kobj, buf);
}
snprintf(buf, len, "consumer:%s:%s", dev_bus_name(con), dev_name(con));
sysfs_remove_link(&sup->kobj, buf);
kfree(buf);
}
static struct class_interface devlink_class_intf = {
.class = &devlink_class,
.add_dev = devlink_add_symlinks,
.remove_dev = devlink_remove_symlinks,
};
static int __init devlink_class_init(void)
{
int ret;
ret = class_register(&devlink_class);
if (ret)
return ret;
ret = class_interface_register(&devlink_class_intf);
if (ret)
class_unregister(&devlink_class);
return ret;
}
postcore_initcall(devlink_class_init);
#define DL_MANAGED_LINK_FLAGS (DL_FLAG_AUTOREMOVE_CONSUMER | \
DL_FLAG_AUTOREMOVE_SUPPLIER | \
DL_FLAG_AUTOPROBE_CONSUMER | \
DL_FLAG_SYNC_STATE_ONLY | \
DL_FLAG_INFERRED)
#define DL_ADD_VALID_FLAGS (DL_MANAGED_LINK_FLAGS | DL_FLAG_STATELESS | \
DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE)
/**
* device_link_add - Create a link between two devices.
* @consumer: Consumer end of the link.
* @supplier: Supplier end of the link.
* @flags: Link flags.
*
* The caller is responsible for the proper synchronization of the link creation
* with runtime PM. First, setting the DL_FLAG_PM_RUNTIME flag will cause the
* runtime PM framework to take the link into account. Second, if the
* DL_FLAG_RPM_ACTIVE flag is set in addition to it, the supplier devices will
* be forced into the active meta state and reference-counted upon the creation
* of the link. If DL_FLAG_PM_RUNTIME is not set, DL_FLAG_RPM_ACTIVE will be
* ignored.
*
* If DL_FLAG_STATELESS is set in @flags, the caller of this function is
* expected to release the link returned by it directly with the help of either
* device_link_del() or device_link_remove().
*
* If that flag is not set, however, the caller of this function is handing the
* management of the link over to the driver core entirely and its return value
* can only be used to check whether or not the link is present. In that case,
* the DL_FLAG_AUTOREMOVE_CONSUMER and DL_FLAG_AUTOREMOVE_SUPPLIER device link
* flags can be used to indicate to the driver core when the link can be safely
* deleted. Namely, setting one of them in @flags indicates to the driver core
* that the link is not going to be used (by the given caller of this function)
* after unbinding the consumer or supplier driver, respectively, from its
* device, so the link can be deleted at that point. If none of them is set,
* the link will be maintained until one of the devices pointed to by it (either
* the consumer or the supplier) is unregistered.
*
* Also, if DL_FLAG_STATELESS, DL_FLAG_AUTOREMOVE_CONSUMER and
* DL_FLAG_AUTOREMOVE_SUPPLIER are not set in @flags (that is, a persistent
* managed device link is being added), the DL_FLAG_AUTOPROBE_CONSUMER flag can
* be used to request the driver core to automatically probe for a consumer
* driver after successfully binding a driver to the supplier device.
*
* The combination of DL_FLAG_STATELESS and one of DL_FLAG_AUTOREMOVE_CONSUMER,
* DL_FLAG_AUTOREMOVE_SUPPLIER, or DL_FLAG_AUTOPROBE_CONSUMER set in @flags at
* the same time is invalid and will cause NULL to be returned upfront.
* However, if a device link between the given @consumer and @supplier pair
* exists already when this function is called for them, the existing link will
* be returned regardless of its current type and status (the link's flags may
* be modified then). The caller of this function is then expected to treat
* the link as though it has just been created, so (in particular) if
* DL_FLAG_STATELESS was passed in @flags, the link needs to be released
* explicitly when not needed any more (as stated above).
*
* A side effect of the link creation is re-ordering of dpm_list and the
* devices_kset list by moving the consumer device and all devices depending
* on it to the ends of these lists (that does not happen to devices that have
* not been registered when this function is called).
*
* The supplier device is required to be registered when this function is called
* and NULL will be returned if that is not the case. The consumer device need
* not be registered, however.
*/
struct device_link *device_link_add(struct device *consumer,
struct device *supplier, u32 flags)
{
struct device_link *link;
if (!consumer || !supplier || consumer == supplier ||
flags & ~DL_ADD_VALID_FLAGS ||
(flags & DL_FLAG_STATELESS && flags & DL_MANAGED_LINK_FLAGS) ||
(flags & DL_FLAG_SYNC_STATE_ONLY &&
(flags & ~DL_FLAG_INFERRED) != DL_FLAG_SYNC_STATE_ONLY) ||
(flags & DL_FLAG_AUTOPROBE_CONSUMER &&
flags & (DL_FLAG_AUTOREMOVE_CONSUMER |
DL_FLAG_AUTOREMOVE_SUPPLIER)))
return NULL;
if (flags & DL_FLAG_PM_RUNTIME && flags & DL_FLAG_RPM_ACTIVE) {
if (pm_runtime_get_sync(supplier) < 0) {
pm_runtime_put_noidle(supplier);
return NULL;
}
}
if (!(flags & DL_FLAG_STATELESS))
flags |= DL_FLAG_MANAGED;
device_links_write_lock();
device_pm_lock();
/*
* If the supplier has not been fully registered yet or there is a
* reverse (non-SYNC_STATE_ONLY) dependency between the consumer and
* the supplier already in the graph, return NULL. If the link is a
* SYNC_STATE_ONLY link, we don't check for reverse dependencies
* because it only affects sync_state() callbacks.
*/
if (!device_pm_initialized(supplier)
|| (!(flags & DL_FLAG_SYNC_STATE_ONLY) &&
device_is_dependent(consumer, supplier))) {
link = NULL;
goto out;
}
/*
* SYNC_STATE_ONLY links are useless once a consumer device has probed.
* So, only create it if the consumer hasn't probed yet.
*/
if (flags & DL_FLAG_SYNC_STATE_ONLY &&
consumer->links.status != DL_DEV_NO_DRIVER &&
consumer->links.status != DL_DEV_PROBING) {
link = NULL;
goto out;
}
/*
* DL_FLAG_AUTOREMOVE_SUPPLIER indicates that the link will be needed
* longer than for DL_FLAG_AUTOREMOVE_CONSUMER and setting them both
* together doesn't make sense, so prefer DL_FLAG_AUTOREMOVE_SUPPLIER.
*/
if (flags & DL_FLAG_AUTOREMOVE_SUPPLIER)
flags &= ~DL_FLAG_AUTOREMOVE_CONSUMER;
list_for_each_entry(link, &supplier->links.consumers, s_node) {
if (link->consumer != consumer)
continue;
if (link->flags & DL_FLAG_INFERRED &&
!(flags & DL_FLAG_INFERRED))
link->flags &= ~DL_FLAG_INFERRED;
if (flags & DL_FLAG_PM_RUNTIME) {
if (!(link->flags & DL_FLAG_PM_RUNTIME)) {
pm_runtime_new_link(consumer);
link->flags |= DL_FLAG_PM_RUNTIME;
}
if (flags & DL_FLAG_RPM_ACTIVE)
refcount_inc(&link->rpm_active);
}
if (flags & DL_FLAG_STATELESS) {
kref_get(&link->kref);
if (link->flags & DL_FLAG_SYNC_STATE_ONLY &&
!(link->flags & DL_FLAG_STATELESS)) {
link->flags |= DL_FLAG_STATELESS;
goto reorder;
} else {
link->flags |= DL_FLAG_STATELESS;
goto out;
}
}
/*
* If the life time of the link following from the new flags is
* longer than indicated by the flags of the existing link,
* update the existing link to stay around longer.
*/
if (flags & DL_FLAG_AUTOREMOVE_SUPPLIER) {
if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER) {
link->flags &= ~DL_FLAG_AUTOREMOVE_CONSUMER;
link->flags |= DL_FLAG_AUTOREMOVE_SUPPLIER;
}
} else if (!(flags & DL_FLAG_AUTOREMOVE_CONSUMER)) {
link->flags &= ~(DL_FLAG_AUTOREMOVE_CONSUMER |
DL_FLAG_AUTOREMOVE_SUPPLIER);
}
if (!(link->flags & DL_FLAG_MANAGED)) {
kref_get(&link->kref);
link->flags |= DL_FLAG_MANAGED;
device_link_init_status(link, consumer, supplier);
}
if (link->flags & DL_FLAG_SYNC_STATE_ONLY &&
!(flags & DL_FLAG_SYNC_STATE_ONLY)) {
link->flags &= ~DL_FLAG_SYNC_STATE_ONLY;
goto reorder;
}
goto out;
}
link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link)
goto out;
refcount_set(&link->rpm_active, 1);
get_device(supplier);
link->supplier = supplier;
INIT_LIST_HEAD(&link->s_node);
get_device(consumer);
link->consumer = consumer;
INIT_LIST_HEAD(&link->c_node);
link->flags = flags;
kref_init(&link->kref);
link->link_dev.class = &devlink_class;
device_set_pm_not_required(&link->link_dev);
dev_set_name(&link->link_dev, "%s:%s--%s:%s",
dev_bus_name(supplier), dev_name(supplier),
dev_bus_name(consumer), dev_name(consumer));
if (device_register(&link->link_dev)) {
put_device(&link->link_dev);
link = NULL;
goto out;
}
if (flags & DL_FLAG_PM_RUNTIME) {
if (flags & DL_FLAG_RPM_ACTIVE)
refcount_inc(&link->rpm_active);
pm_runtime_new_link(consumer);
}
/* Determine the initial link state. */
if (flags & DL_FLAG_STATELESS)
link->status = DL_STATE_NONE;
else
device_link_init_status(link, consumer, supplier);
/*
* Some callers expect the link creation during consumer driver probe to
* resume the supplier even without DL_FLAG_RPM_ACTIVE.
*/
if (link->status == DL_STATE_CONSUMER_PROBE &&
flags & DL_FLAG_PM_RUNTIME)
pm_runtime_resume(supplier);
list_add_tail_rcu(&link->s_node, &supplier->links.consumers);
list_add_tail_rcu(&link->c_node, &consumer->links.suppliers);
if (flags & DL_FLAG_SYNC_STATE_ONLY) {
dev_dbg(consumer,
"Linked as a sync state only consumer to %s\n",
dev_name(supplier));
goto out;
}
reorder:
/*
* Move the consumer and all of the devices depending on it to the end
* of dpm_list and the devices_kset list.
*
* It is necessary to hold dpm_list locked throughout all that or else
* we may end up suspending with a wrong ordering of it.
*/
device_reorder_to_tail(consumer, NULL);
dev_dbg(consumer, "Linked as a consumer to %s\n", dev_name(supplier));
out:
device_pm_unlock();
device_links_write_unlock();
if ((flags & DL_FLAG_PM_RUNTIME && flags & DL_FLAG_RPM_ACTIVE) && !link)
pm_runtime_put(supplier);
return link;
}
EXPORT_SYMBOL_GPL(device_link_add);
static void __device_link_del(struct kref *kref)
{
struct device_link *link = container_of(kref, struct device_link, kref);
dev_dbg(link->consumer, "Dropping the link to %s\n",
dev_name(link->supplier));
pm_runtime_drop_link(link);
device_link_remove_from_lists(link);
device_unregister(&link->link_dev);
}
static void device_link_put_kref(struct device_link *link)
{
if (link->flags & DL_FLAG_STATELESS)
kref_put(&link->kref, __device_link_del);
else if (!device_is_registered(link->consumer))
__device_link_del(&link->kref);
else
WARN(1, "Unable to drop a managed device link reference\n");
}
/**
* device_link_del - Delete a stateless link between two devices.
* @link: Device link to delete.
*
* The caller must ensure proper synchronization of this function with runtime
* PM. If the link was added multiple times, it needs to be deleted as often.
* Care is required for hotplugged devices: Their links are purged on removal
* and calling device_link_del() is then no longer allowed.
*/
void device_link_del(struct device_link *link)
{
device_links_write_lock();
device_link_put_kref(link);
device_links_write_unlock();
}
EXPORT_SYMBOL_GPL(device_link_del);
/**
* device_link_remove - Delete a stateless link between two devices.
* @consumer: Consumer end of the link.
* @supplier: Supplier end of the link.
*
* The caller must ensure proper synchronization of this function with runtime
* PM.
*/
void device_link_remove(void *consumer, struct device *supplier)
{
struct device_link *link;
if (WARN_ON(consumer == supplier))
return;
device_links_write_lock();
list_for_each_entry(link, &supplier->links.consumers, s_node) {
if (link->consumer == consumer) {
device_link_put_kref(link);
break;
}
}
device_links_write_unlock();
}
EXPORT_SYMBOL_GPL(device_link_remove);
static void device_links_missing_supplier(struct device *dev)
{
struct device_link *link;
list_for_each_entry(link, &dev->links.suppliers, c_node) {
if (link->status != DL_STATE_CONSUMER_PROBE)
continue;
if (link->supplier->links.status == DL_DEV_DRIVER_BOUND) {
WRITE_ONCE(link->status, DL_STATE_AVAILABLE);
} else {
WARN_ON(!(link->flags & DL_FLAG_SYNC_STATE_ONLY));
WRITE_ONCE(link->status, DL_STATE_DORMANT);
}
}
}
/**
* device_links_check_suppliers - Check presence of supplier drivers.
* @dev: Consumer device.
*
* Check links from this device to any suppliers. Walk the list of the device's
* links to suppliers and see if all of them are available. If not, simply
* return -EPROBE_DEFER.
*
* We need to guarantee that the supplier will not go away after the check has
* been positive here. It only can go away in __device_release_driver() and
* that function checks the device's links to consumers. This means we need to
* mark the link as "consumer probe in progress" to make the supplier removal
* wait for us to complete (or bad things may happen).
*
* Links without the DL_FLAG_MANAGED flag set are ignored.
*/
int device_links_check_suppliers(struct device *dev)
{
struct device_link *link;
int ret = 0;
struct fwnode_handle *sup_fw;
/*
* Device waiting for supplier to become available is not allowed to
* probe.
*/
mutex_lock(&fwnode_link_lock);
if (dev->fwnode && !list_empty(&dev->fwnode->suppliers) &&
!fw_devlink_is_permissive()) {
sup_fw = list_first_entry(&dev->fwnode->suppliers,
struct fwnode_link,
c_hook)->supplier;
dev_err_probe(dev, -EPROBE_DEFER, "wait for supplier %pfwP\n",
sup_fw);
mutex_unlock(&fwnode_link_lock);
return -EPROBE_DEFER;
}
mutex_unlock(&fwnode_link_lock);
device_links_write_lock();
list_for_each_entry(link, &dev->links.suppliers, c_node) {
if (!(link->flags & DL_FLAG_MANAGED))
continue;
if (link->status != DL_STATE_AVAILABLE &&
!(link->flags & DL_FLAG_SYNC_STATE_ONLY)) {
device_links_missing_supplier(dev);
dev_err_probe(dev, -EPROBE_DEFER,
"supplier %s not ready\n",
dev_name(link->supplier));
ret = -EPROBE_DEFER;
break;
}
WRITE_ONCE(link->status, DL_STATE_CONSUMER_PROBE);
}
dev->links.status = DL_DEV_PROBING;
device_links_write_unlock();
return ret;
}
/**
* __device_links_queue_sync_state - Queue a device for sync_state() callback
* @dev: Device to call sync_state() on
* @list: List head to queue the @dev on
*
* Queues a device for a sync_state() callback when the device links write lock
* isn't held. This allows the sync_state() execution flow to use device links
* APIs. The caller must ensure this function is called with
* device_links_write_lock() held.
*
* This function does a get_device() to make sure the device is not freed while
* on this list.
*
* So the caller must also ensure that device_links_flush_sync_list() is called
* as soon as the caller releases device_links_write_lock(). This is necessary
* to make sure the sync_state() is called in a timely fashion and the
* put_device() is called on this device.
*/
static void __device_links_queue_sync_state(struct device *dev,
struct list_head *list)
{
struct device_link *link;
if (!dev_has_sync_state(dev))
return;
if (dev->state_synced)
return;
list_for_each_entry(link, &dev->links.consumers, s_node) {
if (!(link->flags & DL_FLAG_MANAGED))
continue;
if (link->status != DL_STATE_ACTIVE)
return;
}
/*
* Set the flag here to avoid adding the same device to a list more
* than once. This can happen if new consumers get added to the device
* and probed before the list is flushed.
*/
dev->state_synced = true;
if (WARN_ON(!list_empty(&dev->links.defer_sync)))
return;
get_device(dev);
list_add_tail(&dev->links.defer_sync, list);
}
/**
* device_links_flush_sync_list - Call sync_state() on a list of devices
* @list: List of devices to call sync_state() on
* @dont_lock_dev: Device for which lock is already held by the caller
*
* Calls sync_state() on all the devices that have been queued for it. This
* function is used in conjunction with __device_links_queue_sync_state(). The
* @dont_lock_dev parameter is useful when this function is called from a
* context where a device lock is already held.
*/
static void device_links_flush_sync_list(struct list_head *list,
struct device *dont_lock_dev)
{
struct device *dev, *tmp;
list_for_each_entry_safe(dev, tmp, list, links.defer_sync) {
list_del_init(&dev->links.defer_sync);
if (dev != dont_lock_dev)
device_lock(dev);
if (dev->bus->sync_state)
dev->bus->sync_state(dev);
else if (dev->driver && dev->driver->sync_state)
dev->driver->sync_state(dev);
if (dev != dont_lock_dev)
device_unlock(dev);
put_device(dev);
}
}
void device_links_supplier_sync_state_pause(void)
{
device_links_write_lock();
defer_sync_state_count++;
device_links_write_unlock();
}
void device_links_supplier_sync_state_resume(void)
{
struct device *dev, *tmp;
LIST_HEAD(sync_list);
device_links_write_lock();
if (!defer_sync_state_count) {
WARN(true, "Unmatched sync_state pause/resume!");
goto out;
}
defer_sync_state_count--;
if (defer_sync_state_count)
goto out;
list_for_each_entry_safe(dev, tmp, &deferred_sync, links.defer_sync) {
/*
* Delete from deferred_sync list before queuing it to
* sync_list because defer_sync is used for both lists.
*/
list_del_init(&dev->links.defer_sync);
__device_links_queue_sync_state(dev, &sync_list);
}
out:
device_links_write_unlock();
device_links_flush_sync_list(&sync_list, NULL);
}
static int sync_state_resume_initcall(void)
{
device_links_supplier_sync_state_resume();
return 0;
}
late_initcall(sync_state_resume_initcall);
static void __device_links_supplier_defer_sync(struct device *sup)
{
if (list_empty(&sup->links.defer_sync) && dev_has_sync_state(sup))
list_add_tail(&sup->links.defer_sync, &deferred_sync);
}
static void device_link_drop_managed(struct device_link *link)
{
link->flags &= ~DL_FLAG_MANAGED;
WRITE_ONCE(link->status, DL_STATE_NONE);
kref_put(&link->kref, __device_link_del);
}
static ssize_t waiting_for_supplier_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
bool val;
device_lock(dev);
val = !list_empty(&dev->fwnode->suppliers);
device_unlock(dev);
return sysfs_emit(buf, "%u\n", val);
}
static DEVICE_ATTR_RO(waiting_for_supplier);
/**
* device_links_force_bind - Prepares device to be force bound
* @dev: Consumer device.
*
* device_bind_driver() force binds a device to a driver without calling any
* driver probe functions. So the consumer really isn't going to wait for any
* supplier before it's bound to the driver. We still want the device link
* states to be sensible when this happens.
*
* In preparation for device_bind_driver(), this function goes through each
* supplier device links and checks if the supplier is bound. If it is, then
* the device link status is set to CONSUMER_PROBE. Otherwise, the device link
* is dropped. Links without the DL_FLAG_MANAGED flag set are ignored.
*/
void device_links_force_bind(struct device *dev)
{
struct device_link *link, *ln;
device_links_write_lock();
list_for_each_entry_safe(link, ln, &dev->links.suppliers, c_node) {
if (!(link->flags & DL_FLAG_MANAGED))
continue;
if (link->status != DL_STATE_AVAILABLE) {
device_link_drop_managed(link);
continue;
}
WRITE_ONCE(link->status, DL_STATE_CONSUMER_PROBE);
}
dev->links.status = DL_DEV_PROBING;
device_links_write_unlock();
}
/**
* device_links_driver_bound - Update device links after probing its driver.
* @dev: Device to update the links for.
*
* The probe has been successful, so update links from this device to any
* consumers by changing their status to "available".
*
* Also change the status of @dev's links to suppliers to "active".
*
* Links without the DL_FLAG_MANAGED flag set are ignored.
*/
void device_links_driver_bound(struct device *dev)
{
struct device_link *link, *ln;
LIST_HEAD(sync_list);
/*
* If a device binds successfully, it's expected to have created all
* the device links it needs to or make new device links as it needs
* them. So, fw_devlink no longer needs to create device links to any
* of the device's suppliers.
*
* Also, if a child firmware node of this bound device is not added as
* a device by now, assume it is never going to be added and make sure
* other devices don't defer probe indefinitely by waiting for such a
* child device.
*/
if (dev->fwnode && dev->fwnode->dev == dev) {
struct fwnode_handle *child;
fwnode_links_purge_suppliers(dev->fwnode);
fwnode_for_each_available_child_node(dev->fwnode, child)
fw_devlink_purge_absent_suppliers(child);
}
device_remove_file(dev, &dev_attr_waiting_for_supplier);
device_links_write_lock();
list_for_each_entry(link, &dev->links.consumers, s_node) {
if (!(link->flags & DL_FLAG_MANAGED))
continue;
/*
* Links created during consumer probe may be in the "consumer
* probe" state to start with if the supplier is still probing
* when they are created and they may become "active" if the
* consumer probe returns first. Skip them here.
*/
if (link->status == DL_STATE_CONSUMER_PROBE ||
link->status == DL_STATE_ACTIVE)
continue;
WARN_ON(link->status != DL_STATE_DORMANT);
WRITE_ONCE(link->status, DL_STATE_AVAILABLE);
if (link->flags & DL_FLAG_AUTOPROBE_CONSUMER)
driver_deferred_probe_add(link->consumer);
}
if (defer_sync_state_count)
__device_links_supplier_defer_sync(dev);
else
__device_links_queue_sync_state(dev, &sync_list);
list_for_each_entry_safe(link, ln, &dev->links.suppliers, c_node) {
struct device *supplier;
if (!(link->flags & DL_FLAG_MANAGED))
continue;
supplier = link->supplier;
if (link->flags & DL_FLAG_SYNC_STATE_ONLY) {
/*
* When DL_FLAG_SYNC_STATE_ONLY is set, it means no
* other DL_MANAGED_LINK_FLAGS have been set. So, it's
* save to drop the managed link completely.
*/
device_link_drop_managed(link);
} else {
WARN_ON(link->status != DL_STATE_CONSUMER_PROBE);
WRITE_ONCE(link->status, DL_STATE_ACTIVE);
}
/*
* This needs to be done even for the deleted
* DL_FLAG_SYNC_STATE_ONLY device link in case it was the last
* device link that was preventing the supplier from getting a
* sync_state() call.
*/
if (defer_sync_state_count)
__device_links_supplier_defer_sync(supplier);
else
__device_links_queue_sync_state(supplier, &sync_list);
}
dev->links.status = DL_DEV_DRIVER_BOUND;
device_links_write_unlock();
device_links_flush_sync_list(&sync_list, dev);
}
/**
* __device_links_no_driver - Update links of a device without a driver.
* @dev: Device without a drvier.
*
* Delete all non-persistent links from this device to any suppliers.
*
* Persistent links stay around, but their status is changed to "available",
* unless they already are in the "supplier unbind in progress" state in which
* case they need not be updated.
*
* Links without the DL_FLAG_MANAGED flag set are ignored.
*/
static void __device_links_no_driver(struct device *dev)
{
struct device_link *link, *ln;
list_for_each_entry_safe_reverse(link, ln, &dev->links.suppliers, c_node) {
if (!(link->flags & DL_FLAG_MANAGED))
continue;
if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER) {
device_link_drop_managed(link);
continue;
}
if (link->status != DL_STATE_CONSUMER_PROBE &&
link->status != DL_STATE_ACTIVE)
continue;
if (link->supplier->links.status == DL_DEV_DRIVER_BOUND) {
WRITE_ONCE(link->status, DL_STATE_AVAILABLE);
} else {
WARN_ON(!(link->flags & DL_FLAG_SYNC_STATE_ONLY));
WRITE_ONCE(link->status, DL_STATE_DORMANT);
}
}
dev->links.status = DL_DEV_NO_DRIVER;
}
/**
* device_links_no_driver - Update links after failing driver probe.
* @dev: Device whose driver has just failed to probe.
*
* Clean up leftover links to consumers for @dev and invoke
* %__device_links_no_driver() to update links to suppliers for it as
* appropriate.
*
* Links without the DL_FLAG_MANAGED flag set are ignored.
*/
void device_links_no_driver(struct device *dev)
{
struct device_link *link;
device_links_write_lock();
list_for_each_entry(link, &dev->links.consumers, s_node) {
if (!(link->flags & DL_FLAG_MANAGED))
continue;
/*
* The probe has failed, so if the status of the link is
* "consumer probe" or "active", it must have been added by
* a probing consumer while this device was still probing.
* Change its state to "dormant", as it represents a valid
* relationship, but it is not functionally meaningful.
*/
if (link->status == DL_STATE_CONSUMER_PROBE ||
link->status == DL_STATE_ACTIVE)
WRITE_ONCE(link->status, DL_STATE_DORMANT);
}
__device_links_no_driver(dev);
device_links_write_unlock();
}
/**
* device_links_driver_cleanup - Update links after driver removal.
* @dev: Device whose driver has just gone away.
*
* Update links to consumers for @dev by changing their status to "dormant" and
* invoke %__device_links_no_driver() to update links to suppliers for it as
* appropriate.
*
* Links without the DL_FLAG_MANAGED flag set are ignored.
*/
void device_links_driver_cleanup(struct device *dev)
{
struct device_link *link, *ln;
device_links_write_lock();
list_for_each_entry_safe(link, ln, &dev->links.consumers, s_node) {
if (!(link->flags & DL_FLAG_MANAGED))
continue;
WARN_ON(link->flags & DL_FLAG_AUTOREMOVE_CONSUMER);
WARN_ON(link->status != DL_STATE_SUPPLIER_UNBIND);
/*
* autoremove the links between this @dev and its consumer
* devices that are not active, i.e. where the link state
* has moved to DL_STATE_SUPPLIER_UNBIND.
*/
if (link->status == DL_STATE_SUPPLIER_UNBIND &&
link->flags & DL_FLAG_AUTOREMOVE_SUPPLIER)
device_link_drop_managed(link);
WRITE_ONCE(link->status, DL_STATE_DORMANT);
}
list_del_init(&dev->links.defer_sync);
__device_links_no_driver(dev);
device_links_write_unlock();
}
/**
* device_links_busy - Check if there are any busy links to consumers.
* @dev: Device to check.
*
* Check each consumer of the device and return 'true' if its link's status
* is one of "consumer probe" or "active" (meaning that the given consumer is
* probing right now or its driver is present). Otherwise, change the link
* state to "supplier unbind" to prevent the consumer from being probed
* successfully going forward.
*
* Return 'false' if there are no probing or active consumers.
*
* Links without the DL_FLAG_MANAGED flag set are ignored.
*/
bool device_links_busy(struct device *dev)
{
struct device_link *link;
bool ret = false;
device_links_write_lock();
list_for_each_entry(link, &dev->links.consumers, s_node) {
if (!(link->flags & DL_FLAG_MANAGED))
continue;
if (link->status == DL_STATE_CONSUMER_PROBE
|| link->status == DL_STATE_ACTIVE) {
ret = true;
break;
}
WRITE_ONCE(link->status, DL_STATE_SUPPLIER_UNBIND);
}
dev->links.status = DL_DEV_UNBINDING;
device_links_write_unlock();
return ret;
}
/**
* device_links_unbind_consumers - Force unbind consumers of the given device.
* @dev: Device to unbind the consumers of.
*
* Walk the list of links to consumers for @dev and if any of them is in the
* "consumer probe" state, wait for all device probes in progress to complete
* and start over.
*
* If that's not the case, change the status of the link to "supplier unbind"
* and check if the link was in the "active" state. If so, force the consumer
* driver to unbind and start over (the consumer will not re-probe as we have
* changed the state of the link already).
*
* Links without the DL_FLAG_MANAGED flag set are ignored.
*/
void device_links_unbind_consumers(struct device *dev)
{
struct device_link *link;
start:
device_links_write_lock();
list_for_each_entry(link, &dev->links.consumers, s_node) {
enum device_link_state status;
if (!(link->flags & DL_FLAG_MANAGED) ||
link->flags & DL_FLAG_SYNC_STATE_ONLY)
continue;
status = link->status;
if (status == DL_STATE_CONSUMER_PROBE) {
device_links_write_unlock();
wait_for_device_probe();
goto start;
}
WRITE_ONCE(link->status, DL_STATE_SUPPLIER_UNBIND);
if (status == DL_STATE_ACTIVE) {
struct device *consumer = link->consumer;
get_device(consumer);
device_links_write_unlock();
device_release_driver_internal(consumer, NULL,
consumer->parent);
put_device(consumer);
goto start;
}
}
device_links_write_unlock();
}
/**
* device_links_purge - Delete existing links to other devices.
* @dev: Target device.
*/
static void device_links_purge(struct device *dev)
{
struct device_link *link, *ln;
if (dev->class == &devlink_class)
return;
/*
* Delete all of the remaining links from this device to any other
* devices (either consumers or suppliers).
*/
device_links_write_lock();
list_for_each_entry_safe_reverse(link, ln, &dev->links.suppliers, c_node) {
WARN_ON(link->status == DL_STATE_ACTIVE); __device_link_del(&link->kref);
}
list_for_each_entry_safe_reverse(link, ln, &dev->links.consumers, s_node) { WARN_ON(link->status != DL_STATE_DORMANT &&
link->status != DL_STATE_NONE);
__device_link_del(&link->kref);
}
device_links_write_unlock();
}
#define FW_DEVLINK_FLAGS_PERMISSIVE (DL_FLAG_INFERRED | \
DL_FLAG_SYNC_STATE_ONLY)
#define FW_DEVLINK_FLAGS_ON (DL_FLAG_INFERRED | \
DL_FLAG_AUTOPROBE_CONSUMER)
#define FW_DEVLINK_FLAGS_RPM (FW_DEVLINK_FLAGS_ON | \
DL_FLAG_PM_RUNTIME)
static u32 fw_devlink_flags = FW_DEVLINK_FLAGS_ON;
static int __init fw_devlink_setup(char *arg)
{
if (!arg)
return -EINVAL;
if (strcmp(arg, "off") == 0) {
fw_devlink_flags = 0;
} else if (strcmp(arg, "permissive") == 0) {
fw_devlink_flags = FW_DEVLINK_FLAGS_PERMISSIVE;
} else if (strcmp(arg, "on") == 0) {
fw_devlink_flags = FW_DEVLINK_FLAGS_ON;
} else if (strcmp(arg, "rpm") == 0) {
fw_devlink_flags = FW_DEVLINK_FLAGS_RPM;
}
return 0;
}
early_param("fw_devlink", fw_devlink_setup);
static bool fw_devlink_strict;
static int __init fw_devlink_strict_setup(char *arg)
{
return strtobool(arg, &fw_devlink_strict);
}
early_param("fw_devlink.strict", fw_devlink_strict_setup);
u32 fw_devlink_get_flags(void)
{
return fw_devlink_flags;
}
static bool fw_devlink_is_permissive(void)
{
return fw_devlink_flags == FW_DEVLINK_FLAGS_PERMISSIVE;
}
bool fw_devlink_is_strict(void)
{
return fw_devlink_strict && !fw_devlink_is_permissive();
}
static void fw_devlink_parse_fwnode(struct fwnode_handle *fwnode)
{
if (fwnode->flags & FWNODE_FLAG_LINKS_ADDED)
return;
fwnode_call_int_op(fwnode, add_links);
fwnode->flags |= FWNODE_FLAG_LINKS_ADDED;
}
static void fw_devlink_parse_fwtree(struct fwnode_handle *fwnode)
{
struct fwnode_handle *child = NULL;
fw_devlink_parse_fwnode(fwnode);
while ((child = fwnode_get_next_available_child_node(fwnode, child)))
fw_devlink_parse_fwtree(child);
}
static void fw_devlink_relax_link(struct device_link *link)
{
if (!(link->flags & DL_FLAG_INFERRED))
return;
if (link->flags == (DL_FLAG_MANAGED | FW_DEVLINK_FLAGS_PERMISSIVE))
return;
pm_runtime_drop_link(link);
link->flags = DL_FLAG_MANAGED | FW_DEVLINK_FLAGS_PERMISSIVE;
dev_dbg(link->consumer, "Relaxing link with %s\n",
dev_name(link->supplier));
}
static int fw_devlink_no_driver(struct device *dev, void *data)
{
struct device_link *link = to_devlink(dev);
if (!link->supplier->can_match)
fw_devlink_relax_link(link);
return 0;
}
void fw_devlink_drivers_done(void)
{
fw_devlink_drv_reg_done = true;
device_links_write_lock();
class_for_each_device(&devlink_class, NULL, NULL,
fw_devlink_no_driver);
device_links_write_unlock();
}
static void fw_devlink_unblock_consumers(struct device *dev)
{
struct device_link *link;
if (!fw_devlink_flags || fw_devlink_is_permissive())
return;
device_links_write_lock();
list_for_each_entry(link, &dev->links.consumers, s_node)
fw_devlink_relax_link(link);
device_links_write_unlock();
}
/**
* fw_devlink_relax_cycle - Convert cyclic links to SYNC_STATE_ONLY links
* @con: Device to check dependencies for.
* @sup: Device to check against.
*
* Check if @sup depends on @con or any device dependent on it (its child or
* its consumer etc). When such a cyclic dependency is found, convert all
* device links created solely by fw_devlink into SYNC_STATE_ONLY device links.
* This is the equivalent of doing fw_devlink=permissive just between the
* devices in the cycle. We need to do this because, at this point, fw_devlink
* can't tell which of these dependencies is not a real dependency.
*
* Return 1 if a cycle is found. Otherwise, return 0.
*/
static int fw_devlink_relax_cycle(struct device *con, void *sup)
{
struct device_link *link;
int ret;
if (con == sup)
return 1;
ret = device_for_each_child(con, sup, fw_devlink_relax_cycle);
if (ret)
return ret;
list_for_each_entry(link, &con->links.consumers, s_node) {
if ((link->flags & ~DL_FLAG_INFERRED) ==
(DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED))
continue;
if (!fw_devlink_relax_cycle(link->consumer, sup))
continue;
ret = 1;
fw_devlink_relax_link(link);
}
return ret;
}
/**
* fw_devlink_create_devlink - Create a device link from a consumer to fwnode
* @con: consumer device for the device link
* @sup_handle: fwnode handle of supplier
* @flags: devlink flags
*
* This function will try to create a device link between the consumer device
* @con and the supplier device represented by @sup_handle.
*
* The supplier has to be provided as a fwnode because incorrect cycles in
* fwnode links can sometimes cause the supplier device to never be created.
* This function detects such cases and returns an error if it cannot create a
* device link from the consumer to a missing supplier.
*
* Returns,
* 0 on successfully creating a device link
* -EINVAL if the device link cannot be created as expected
* -EAGAIN if the device link cannot be created right now, but it may be
* possible to do that in the future
*/
static int fw_devlink_create_devlink(struct device *con,
struct fwnode_handle *sup_handle, u32 flags)
{
struct device *sup_dev;
int ret = 0;
/*
* In some cases, a device P might also be a supplier to its child node
* C. However, this would defer the probe of C until the probe of P
* completes successfully. This is perfectly fine in the device driver
* model. device_add() doesn't guarantee probe completion of the device
* by the time it returns.
*
* However, there are a few drivers that assume C will finish probing
* as soon as it's added and before P finishes probing. So, we provide
* a flag to let fw_devlink know not to delay the probe of C until the
* probe of P completes successfully.
*
* When such a flag is set, we can't create device links where P is the
* supplier of C as that would delay the probe of C.
*/
if (sup_handle->flags & FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD &&
fwnode_is_ancestor_of(sup_handle, con->fwnode))
return -EINVAL;
sup_dev = get_dev_from_fwnode(sup_handle);
if (sup_dev) {
/*
* If it's one of those drivers that don't actually bind to
* their device using driver core, then don't wait on this
* supplier device indefinitely.
*/
if (sup_dev->links.status == DL_DEV_NO_DRIVER &&
sup_handle->flags & FWNODE_FLAG_INITIALIZED) {
ret = -EINVAL;
goto out;
}
/*
* If this fails, it is due to cycles in device links. Just
* give up on this link and treat it as invalid.
*/
if (!device_link_add(con, sup_dev, flags) &&
!(flags & DL_FLAG_SYNC_STATE_ONLY)) {
dev_info(con, "Fixing up cyclic dependency with %s\n",
dev_name(sup_dev));
device_links_write_lock();
fw_devlink_relax_cycle(con, sup_dev);
device_links_write_unlock();
device_link_add(con, sup_dev,
FW_DEVLINK_FLAGS_PERMISSIVE);
ret = -EINVAL;
}
goto out;
}
/* Supplier that's already initialized without a struct device. */
if (sup_handle->flags & FWNODE_FLAG_INITIALIZED)
return -EINVAL;
/*
* DL_FLAG_SYNC_STATE_ONLY doesn't block probing and supports
* cycles. So cycle detection isn't necessary and shouldn't be
* done.
*/
if (flags & DL_FLAG_SYNC_STATE_ONLY)
return -EAGAIN;
/*
* If we can't find the supplier device from its fwnode, it might be
* due to a cyclic dependency between fwnodes. Some of these cycles can
* be broken by applying logic. Check for these types of cycles and
* break them so that devices in the cycle probe properly.
*
* If the supplier's parent is dependent on the consumer, then the
* consumer and supplier have a cyclic dependency. Since fw_devlink
* can't tell which of the inferred dependencies are incorrect, don't
* enforce probe ordering between any of the devices in this cyclic
* dependency. Do this by relaxing all the fw_devlink device links in
* this cycle and by treating the fwnode link between the consumer and
* the supplier as an invalid dependency.
*/
sup_dev = fwnode_get_next_parent_dev(sup_handle);
if (sup_dev && device_is_dependent(con, sup_dev)) {
dev_info(con, "Fixing up cyclic dependency with %pfwP (%s)\n",
sup_handle, dev_name(sup_dev));
device_links_write_lock();
fw_devlink_relax_cycle(con, sup_dev);
device_links_write_unlock();
ret = -EINVAL;
} else {
/*
* Can't check for cycles or no cycles. So let's try
* again later.
*/
ret = -EAGAIN;
}
out:
put_device(sup_dev);
return ret;
}
/**
* __fw_devlink_link_to_consumers - Create device links to consumers of a device
* @dev: Device that needs to be linked to its consumers
*
* This function looks at all the consumer fwnodes of @dev and creates device
* links between the consumer device and @dev (supplier).
*
* If the consumer device has not been added yet, then this function creates a
* SYNC_STATE_ONLY link between @dev (supplier) and the closest ancestor device
* of the consumer fwnode. This is necessary to make sure @dev doesn't get a
* sync_state() callback before the real consumer device gets to be added and
* then probed.
*
* Once device links are created from the real consumer to @dev (supplier), the
* fwnode links are deleted.
*/
static void __fw_devlink_link_to_consumers(struct device *dev)
{
struct fwnode_handle *fwnode = dev->fwnode;
struct fwnode_link *link, *tmp;
list_for_each_entry_safe(link, tmp, &fwnode->consumers, s_hook) {
u32 dl_flags = fw_devlink_get_flags();
struct device *con_dev;
bool own_link = true;
int ret;
con_dev = get_dev_from_fwnode(link->consumer);
/*
* If consumer device is not available yet, make a "proxy"
* SYNC_STATE_ONLY link from the consumer's parent device to
* the supplier device. This is necessary to make sure the
* supplier doesn't get a sync_state() callback before the real
* consumer can create a device link to the supplier.
*
* This proxy link step is needed to handle the case where the
* consumer's parent device is added before the supplier.
*/
if (!con_dev) {
con_dev = fwnode_get_next_parent_dev(link->consumer);
/*
* However, if the consumer's parent device is also the
* parent of the supplier, don't create a
* consumer-supplier link from the parent to its child
* device. Such a dependency is impossible.
*/
if (con_dev &&
fwnode_is_ancestor_of(con_dev->fwnode, fwnode)) {
put_device(con_dev);
con_dev = NULL;
} else {
own_link = false;
dl_flags = FW_DEVLINK_FLAGS_PERMISSIVE;
}
}
if (!con_dev)
continue;
ret = fw_devlink_create_devlink(con_dev, fwnode, dl_flags);
put_device(con_dev);
if (!own_link || ret == -EAGAIN)
continue;
__fwnode_link_del(link);
}
}
/**
* __fw_devlink_link_to_suppliers - Create device links to suppliers of a device
* @dev: The consumer device that needs to be linked to its suppliers
* @fwnode: Root of the fwnode tree that is used to create device links
*
* This function looks at all the supplier fwnodes of fwnode tree rooted at
* @fwnode and creates device links between @dev (consumer) and all the
* supplier devices of the entire fwnode tree at @fwnode.
*
* The function creates normal (non-SYNC_STATE_ONLY) device links between @dev
* and the real suppliers of @dev. Once these device links are created, the
* fwnode links are deleted. When such device links are successfully created,
* this function is called recursively on those supplier devices. This is
* needed to detect and break some invalid cycles in fwnode links. See
* fw_devlink_create_devlink() for more details.
*
* In addition, it also looks at all the suppliers of the entire fwnode tree
* because some of the child devices of @dev that have not been added yet
* (because @dev hasn't probed) might already have their suppliers added to
* driver core. So, this function creates SYNC_STATE_ONLY device links between
* @dev (consumer) and these suppliers to make sure they don't execute their
* sync_state() callbacks before these child devices have a chance to create
* their device links. The fwnode links that correspond to the child devices
* aren't delete because they are needed later to create the device links
* between the real consumer and supplier devices.
*/
static void __fw_devlink_link_to_suppliers(struct device *dev,
struct fwnode_handle *fwnode)
{
bool own_link = (dev->fwnode == fwnode);
struct fwnode_link *link, *tmp;
struct fwnode_handle *child = NULL;
u32 dl_flags;
if (own_link)
dl_flags = fw_devlink_get_flags();
else
dl_flags = FW_DEVLINK_FLAGS_PERMISSIVE;
list_for_each_entry_safe(link, tmp, &fwnode->suppliers, c_hook) {
int ret;
struct device *sup_dev;
struct fwnode_handle *sup = link->supplier;
ret = fw_devlink_create_devlink(dev, sup, dl_flags);
if (!own_link || ret == -EAGAIN)
continue;
__fwnode_link_del(link);
/* If no device link was created, nothing more to do. */
if (ret)
continue;
/*
* If a device link was successfully created to a supplier, we
* now need to try and link the supplier to all its suppliers.
*
* This is needed to detect and delete false dependencies in
* fwnode links that haven't been converted to a device link
* yet. See comments in fw_devlink_create_devlink() for more
* details on the false dependency.
*
* Without deleting these false dependencies, some devices will
* never probe because they'll keep waiting for their false
* dependency fwnode links to be converted to device links.
*/
sup_dev = get_dev_from_fwnode(sup);
__fw_devlink_link_to_suppliers(sup_dev, sup_dev->fwnode);
put_device(sup_dev);
}
/*
* Make "proxy" SYNC_STATE_ONLY device links to represent the needs of
* all the descendants. This proxy link step is needed to handle the
* case where the supplier is added before the consumer's parent device
* (@dev).
*/
while ((child = fwnode_get_next_available_child_node(fwnode, child)))
__fw_devlink_link_to_suppliers(dev, child);
}
static void fw_devlink_link_device(struct device *dev)
{
struct fwnode_handle *fwnode = dev->fwnode;
if (!fw_devlink_flags)
return;
fw_devlink_parse_fwtree(fwnode);
mutex_lock(&fwnode_link_lock);
__fw_devlink_link_to_consumers(dev);
__fw_devlink_link_to_suppliers(dev, fwnode);
mutex_unlock(&fwnode_link_lock);
}
/* Device links support end. */
int (*platform_notify)(struct device *dev) = NULL;
int (*platform_notify_remove)(struct device *dev) = NULL;
static struct kobject *dev_kobj;
struct kobject *sysfs_dev_char_kobj;
struct kobject *sysfs_dev_block_kobj;
static DEFINE_MUTEX(device_hotplug_lock);
void lock_device_hotplug(void)
{
mutex_lock(&device_hotplug_lock);
}
void unlock_device_hotplug(void)
{
mutex_unlock(&device_hotplug_lock);
}
int lock_device_hotplug_sysfs(void)
{
if (mutex_trylock(&device_hotplug_lock))
return 0;
/* Avoid busy looping (5 ms of sleep should do). */
msleep(5);
return restart_syscall();
}
#ifdef CONFIG_BLOCK
static inline int device_is_not_partition(struct device *dev)
{
return !(dev->type == &part_type);
}
#else
static inline int device_is_not_partition(struct device *dev)
{
return 1;
}
#endif
static void device_platform_notify(struct device *dev)
{
acpi_device_notify(dev);
software_node_notify(dev);
if (platform_notify)
platform_notify(dev);
}
static void device_platform_notify_remove(struct device *dev)
{
acpi_device_notify_remove(dev);
software_node_notify_remove(dev);
if (platform_notify_remove)
platform_notify_remove(dev);
}
/**
* dev_driver_string - Return a device's driver name, if at all possible
* @dev: struct device to get the name of
*
* Will return the device's driver's name if it is bound to a device. If
* the device is not bound to a driver, it will return the name of the bus
* it is attached to. If it is not attached to a bus either, an empty
* string will be returned.
*/
const char *dev_driver_string(const struct device *dev)
{
struct device_driver *drv;
/* dev->driver can change to NULL underneath us because of unbinding,
* so be careful about accessing it. dev->bus and dev->class should
* never change once they are set, so they don't need special care.
*/
drv = READ_ONCE(dev->driver);
return drv ? drv->name : dev_bus_name(dev);
}
EXPORT_SYMBOL(dev_driver_string);
#define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr)
static ssize_t dev_attr_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
struct device_attribute *dev_attr = to_dev_attr(attr);
struct device *dev = kobj_to_dev(kobj);
ssize_t ret = -EIO;
if (dev_attr->show)
ret = dev_attr->show(dev, dev_attr, buf);
if (ret >= (ssize_t)PAGE_SIZE) {
printk("dev_attr_show: %pS returned bad count\n",
dev_attr->show);
}
return ret;
}
static ssize_t dev_attr_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
struct device_attribute *dev_attr = to_dev_attr(attr);
struct device *dev = kobj_to_dev(kobj);
ssize_t ret = -EIO;
if (dev_attr->store)
ret = dev_attr->store(dev, dev_attr, buf, count);
return ret;
}
static const struct sysfs_ops dev_sysfs_ops = {
.show = dev_attr_show,
.store = dev_attr_store,
};
#define to_ext_attr(x) container_of(x, struct dev_ext_attribute, attr)
ssize_t device_store_ulong(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t size)
{
struct dev_ext_attribute *ea = to_ext_attr(attr);
int ret;
unsigned long new;
ret = kstrtoul(buf, 0, &new);
if (ret)
return ret;
*(unsigned long *)(ea->var) = new;
/* Always return full write size even if we didn't consume all */
return size;
}
EXPORT_SYMBOL_GPL(device_store_ulong);
ssize_t device_show_ulong(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct dev_ext_attribute *ea = to_ext_attr(attr);
return sysfs_emit(buf, "%lx\n", *(unsigned long *)(ea->var));
}
EXPORT_SYMBOL_GPL(device_show_ulong);
ssize_t device_store_int(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t size)
{
struct dev_ext_attribute *ea = to_ext_attr(attr);
int ret;
long new;
ret = kstrtol(buf, 0, &new);
if (ret)
return ret;
if (new > INT_MAX || new < INT_MIN)
return -EINVAL;
*(int *)(ea->var) = new;
/* Always return full write size even if we didn't consume all */
return size;
}
EXPORT_SYMBOL_GPL(device_store_int);
ssize_t device_show_int(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct dev_ext_attribute *ea = to_ext_attr(attr);
return sysfs_emit(buf, "%d\n", *(int *)(ea->var));
}
EXPORT_SYMBOL_GPL(device_show_int);
ssize_t device_store_bool(struct device *dev, struct device_attribute *attr,
const char *buf, size_t size)
{
struct dev_ext_attribute *ea = to_ext_attr(attr);
if (strtobool(buf, ea->var) < 0)
return -EINVAL;
return size;
}
EXPORT_SYMBOL_GPL(device_store_bool);
ssize_t device_show_bool(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct dev_ext_attribute *ea = to_ext_attr(attr);
return sysfs_emit(buf, "%d\n", *(bool *)(ea->var));
}
EXPORT_SYMBOL_GPL(device_show_bool);
/**
* device_release - free device structure.
* @kobj: device's kobject.
*
* This is called once the reference count for the object
* reaches 0. We forward the call to the device's release
* method, which should handle actually freeing the structure.
*/
static void device_release(struct kobject *kobj)
{
struct device *dev = kobj_to_dev(kobj);
struct device_private *p = dev->p;
/*
* Some platform devices are driven without driver attached
* and managed resources may have been acquired. Make sure
* all resources are released.
*
* Drivers still can add resources into device after device
* is deleted but alive, so release devres here to avoid
* possible memory leak.
*/
devres_release_all(dev);
kfree(dev->dma_range_map);
if (dev->release)
dev->release(dev);
else if (dev->type && dev->type->release)
dev->type->release(dev);
else if (dev->class && dev->class->dev_release) dev->class->dev_release(dev);
else
WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n",
dev_name(dev));
kfree(p);
}
static const void *device_namespace(struct kobject *kobj)
{
struct device *dev = kobj_to_dev(kobj);
const void *ns = NULL;
if (dev->class && dev->class->ns_type)
ns = dev->class->namespace(dev);
return ns;
}
static void device_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
struct device *dev = kobj_to_dev(kobj);
if (dev->class && dev->class->get_ownership) dev->class->get_ownership(dev, uid, gid);
}
static struct kobj_type device_ktype = {
.release = device_release,
.sysfs_ops = &dev_sysfs_ops,
.namespace = device_namespace,
.get_ownership = device_get_ownership,
};
static int dev_uevent_filter(struct kset *kset, struct kobject *kobj)
{
struct kobj_type *ktype = get_ktype(kobj); if (ktype == &device_ktype) {
struct device *dev = kobj_to_dev(kobj);
if (dev->bus)
return 1;
if (dev->class)
return 1;
}
return 0;
}
static const char *dev_uevent_name(struct kset *kset, struct kobject *kobj)
{
struct device *dev = kobj_to_dev(kobj);
if (dev->bus) return dev->bus->name; if (dev->class) return dev->class->name;
return NULL;
}
static int dev_uevent(struct kset *kset, struct kobject *kobj,
struct kobj_uevent_env *env)
{
struct device *dev = kobj_to_dev(kobj);
int retval = 0;
/* add device node properties if present */
if (MAJOR(dev->devt)) {
const char *tmp;
const char *name;
umode_t mode = 0;
kuid_t uid = GLOBAL_ROOT_UID;
kgid_t gid = GLOBAL_ROOT_GID;
add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt));
add_uevent_var(env, "MINOR=%u", MINOR(dev->devt));
name = device_get_devnode(dev, &mode, &uid, &gid, &tmp);
if (name) {
add_uevent_var(env, "DEVNAME=%s", name);
if (mode)
add_uevent_var(env, "DEVMODE=%#o", mode & 0777); if (!uid_eq(uid, GLOBAL_ROOT_UID)) add_uevent_var(env, "DEVUID=%u", from_kuid(&init_user_ns, uid)); if (!gid_eq(gid, GLOBAL_ROOT_GID)) add_uevent_var(env, "DEVGID=%u", from_kgid(&init_user_ns, gid)); kfree(tmp);
}
}
if (dev->type && dev->type->name) add_uevent_var(env, "DEVTYPE=%s", dev->type->name); if (dev->driver) add_uevent_var(env, "DRIVER=%s", dev->driver->name);
/* Add common DT information about the device */
of_device_uevent(dev, env);
/* have the bus specific function add its stuff */
if (dev->bus && dev->bus->uevent) { retval = dev->bus->uevent(dev, env);
if (retval)
pr_debug("device: '%s': %s: bus uevent() returned %d\n",
dev_name(dev), __func__, retval);
}
/* have the class specific function add its stuff */
if (dev->class && dev->class->dev_uevent) { retval = dev->class->dev_uevent(dev, env);
if (retval)
pr_debug("device: '%s': %s: class uevent() "
"returned %d\n", dev_name(dev),
__func__, retval);
}
/* have the device type specific function add its stuff */
if (dev->type && dev->type->uevent) { retval = dev->type->uevent(dev, env);
if (retval)
pr_debug("device: '%s': %s: dev_type uevent() "
"returned %d\n", dev_name(dev),
__func__, retval);
}
return retval;
}
static const struct kset_uevent_ops device_uevent_ops = {
.filter = dev_uevent_filter,
.name = dev_uevent_name,
.uevent = dev_uevent,
};
static ssize_t uevent_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct kobject *top_kobj;
struct kset *kset;
struct kobj_uevent_env *env = NULL;
int i;
int len = 0;
int retval;
/* search the kset, the device belongs to */
top_kobj = &dev->kobj;
while (!top_kobj->kset && top_kobj->parent)
top_kobj = top_kobj->parent;
if (!top_kobj->kset)
goto out;
kset = top_kobj->kset;
if (!kset->uevent_ops || !kset->uevent_ops->uevent)
goto out;
/* respect filter */
if (kset->uevent_ops && kset->uevent_ops->filter)
if (!kset->uevent_ops->filter(kset, &dev->kobj))
goto out;
env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
/* let the kset specific function add its keys */
retval = kset->uevent_ops->uevent(kset, &dev->kobj, env);
if (retval)
goto out;
/* copy keys to file */
for (i = 0; i < env->envp_idx; i++)
len += sysfs_emit_at(buf, len, "%s\n", env->envp[i]);
out:
kfree(env);
return len;
}
static ssize_t uevent_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
int rc;
rc = kobject_synth_uevent(&dev->kobj, buf, count);
if (rc) {
dev_err(dev, "uevent: failed to send synthetic uevent\n");
return rc;
}
return count;
}
static DEVICE_ATTR_RW(uevent);
static ssize_t online_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
bool val;
device_lock(dev);
val = !dev->offline;
device_unlock(dev);
return sysfs_emit(buf, "%u\n", val);
}
static ssize_t online_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
bool val;
int ret;
ret = strtobool(buf, &val);
if (ret < 0)
return ret;
ret = lock_device_hotplug_sysfs();
if (ret)
return ret;
ret = val ? device_online(dev) : device_offline(dev);
unlock_device_hotplug();
return ret < 0 ? ret : count;
}
static DEVICE_ATTR_RW(online);
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
const char *loc;
switch (dev->removable) {
case DEVICE_REMOVABLE:
loc = "removable";
break;
case DEVICE_FIXED:
loc = "fixed";
break;
default:
loc = "unknown";
}
return sysfs_emit(buf, "%s\n", loc);
}
static DEVICE_ATTR_RO(removable);
int device_add_groups(struct device *dev, const struct attribute_group **groups)
{
return sysfs_create_groups(&dev->kobj, groups);
}
EXPORT_SYMBOL_GPL(device_add_groups);
void device_remove_groups(struct device *dev,
const struct attribute_group **groups)
{
sysfs_remove_groups(&dev->kobj, groups);
}
EXPORT_SYMBOL_GPL(device_remove_groups);
union device_attr_group_devres {
const struct attribute_group *group;
const struct attribute_group **groups;
};
static int devm_attr_group_match(struct device *dev, void *res, void *data)
{
return ((union device_attr_group_devres *)res)->group == data;
}
static void devm_attr_group_remove(struct device *dev, void *res)
{
union device_attr_group_devres *devres = res;
const struct attribute_group *group = devres->group;
dev_dbg(dev, "%s: removing group %p\n", __func__, group);
sysfs_remove_group(&dev->kobj, group);
}
static void devm_attr_groups_remove(struct device *dev, void *res)
{
union device_attr_group_devres *devres = res;
const struct attribute_group **groups = devres->groups;
dev_dbg(dev, "%s: removing groups %p\n", __func__, groups);
sysfs_remove_groups(&dev->kobj, groups);
}
/**
* devm_device_add_group - given a device, create a managed attribute group
* @dev: The device to create the group for
* @grp: The attribute group to create
*
* This function creates a group for the first time. It will explicitly
* warn and error if any of the attribute files being created already exist.
*
* Returns 0 on success or error code on failure.
*/
int devm_device_add_group(struct device *dev, const struct attribute_group *grp)
{
union device_attr_group_devres *devres;
int error;
devres = devres_alloc(devm_attr_group_remove,
sizeof(*devres), GFP_KERNEL);
if (!devres)
return -ENOMEM;
error = sysfs_create_group(&dev->kobj, grp);
if (error) {
devres_free(devres);
return error;
}
devres->group = grp;
devres_add(dev, devres);
return 0;
}
EXPORT_SYMBOL_GPL(devm_device_add_group);
/**
* devm_device_remove_group: remove a managed group from a device
* @dev: device to remove the group from
* @grp: group to remove
*
* This function removes a group of attributes from a device. The attributes
* previously have to have been created for this group, otherwise it will fail.
*/
void devm_device_remove_group(struct device *dev,
const struct attribute_group *grp)
{
WARN_ON(devres_release(dev, devm_attr_group_remove,
devm_attr_group_match,
/* cast away const */ (void *)grp));
}
EXPORT_SYMBOL_GPL(devm_device_remove_group);
/**
* devm_device_add_groups - create a bunch of managed attribute groups
* @dev: The device to create the group for
* @groups: The attribute groups to create, NULL terminated
*
* This function creates a bunch of managed attribute groups. If an error
* occurs when creating a group, all previously created groups will be
* removed, unwinding everything back to the original state when this
* function was called. It will explicitly warn and error if any of the
* attribute files being created already exist.
*
* Returns 0 on success or error code from sysfs_create_group on failure.
*/
int devm_device_add_groups(struct device *dev,
const struct attribute_group **groups)
{
union device_attr_group_devres *devres;
int error;
devres = devres_alloc(devm_attr_groups_remove,
sizeof(*devres), GFP_KERNEL);
if (!devres)
return -ENOMEM;
error = sysfs_create_groups(&dev->kobj, groups);
if (error) {
devres_free(devres);
return error;
}
devres->groups = groups;
devres_add(dev, devres);
return 0;
}
EXPORT_SYMBOL_GPL(devm_device_add_groups);
/**
* devm_device_remove_groups - remove a list of managed groups
*
* @dev: The device for the groups to be removed from
* @groups: NULL terminated list of groups to be removed
*
* If groups is not NULL, remove the specified groups from the device.
*/
void devm_device_remove_groups(struct device *dev,
const struct attribute_group **groups)
{
WARN_ON(devres_release(dev, devm_attr_groups_remove,
devm_attr_group_match,
/* cast away const */ (void *)groups));
}
EXPORT_SYMBOL_GPL(devm_device_remove_groups);
static int device_add_attrs(struct device *dev)
{
struct class *class = dev->class; const struct device_type *type = dev->type;
int error;
if (class) {
error = device_add_groups(dev, class->dev_groups);
if (error)
return error;
}
if (type) { error = device_add_groups(dev, type->groups);
if (error)
goto err_remove_class_groups;
}
error = device_add_groups(dev, dev->groups);
if (error)
goto err_remove_type_groups;
if (device_supports_offline(dev) && !dev->offline_disabled) { error = device_create_file(dev, &dev_attr_online);
if (error)
goto err_remove_dev_groups;
}
if (fw_devlink_flags && !fw_devlink_is_permissive() && dev->fwnode) { error = device_create_file(dev, &dev_attr_waiting_for_supplier);
if (error)
goto err_remove_dev_online;
}
if (dev_removable_is_valid(dev)) { error = device_create_file(dev, &dev_attr_removable);
if (error)
goto err_remove_dev_waiting_for_supplier;
}
return 0;
err_remove_dev_waiting_for_supplier:
device_remove_file(dev, &dev_attr_waiting_for_supplier);
err_remove_dev_online:
device_remove_file(dev, &dev_attr_online);
err_remove_dev_groups:
device_remove_groups(dev, dev->groups);
err_remove_type_groups:
if (type)
device_remove_groups(dev, type->groups);
err_remove_class_groups:
if (class) device_remove_groups(dev, class->dev_groups);
return error;
}
static void device_remove_attrs(struct device *dev)
{
struct class *class = dev->class;
const struct device_type *type = dev->type;
device_remove_file(dev, &dev_attr_removable);
device_remove_file(dev, &dev_attr_waiting_for_supplier);
device_remove_file(dev, &dev_attr_online);
device_remove_groups(dev, dev->groups);
if (type)
device_remove_groups(dev, type->groups); if (class) device_remove_groups(dev, class->dev_groups);
}
static ssize_t dev_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
return print_dev_t(buf, dev->devt);
}
static DEVICE_ATTR_RO(dev);
/* /sys/devices/ */
struct kset *devices_kset;
/**
* devices_kset_move_before - Move device in the devices_kset's list.
* @deva: Device to move.
* @devb: Device @deva should come before.
*/
static void devices_kset_move_before(struct device *deva, struct device *devb)
{
if (!devices_kset)
return;
pr_debug("devices_kset: Moving %s before %s\n",
dev_name(deva), dev_name(devb));
spin_lock(&devices_kset->list_lock);
list_move_tail(&deva->kobj.entry, &devb->kobj.entry);
spin_unlock(&devices_kset->list_lock);
}
/**
* devices_kset_move_after - Move device in the devices_kset's list.
* @deva: Device to move
* @devb: Device @deva should come after.
*/
static void devices_kset_move_after(struct device *deva, struct device *devb)
{
if (!devices_kset)
return;
pr_debug("devices_kset: Moving %s after %s\n",
dev_name(deva), dev_name(devb));
spin_lock(&devices_kset->list_lock);
list_move(&deva->kobj.entry, &devb->kobj.entry);
spin_unlock(&devices_kset->list_lock);
}
/**
* devices_kset_move_last - move the device to the end of devices_kset's list.
* @dev: device to move
*/
void devices_kset_move_last(struct device *dev)
{
if (!devices_kset)
return;
pr_debug("devices_kset: Moving %s to end of list\n", dev_name(dev));
spin_lock(&devices_kset->list_lock);
list_move_tail(&dev->kobj.entry, &devices_kset->list);
spin_unlock(&devices_kset->list_lock);
}
/**
* device_create_file - create sysfs attribute file for device.
* @dev: device.
* @attr: device attribute descriptor.
*/
int device_create_file(struct device *dev,
const struct device_attribute *attr)
{
int error = 0;
if (dev) { WARN(((attr->attr.mode & S_IWUGO) && !attr->store),
"Attribute %s: write permission without 'store'\n",
attr->attr.name);
WARN(((attr->attr.mode & S_IRUGO) && !attr->show),
"Attribute %s: read permission without 'show'\n",
attr->attr.name);
error = sysfs_create_file(&dev->kobj, &attr->attr);
}
return error;
}
EXPORT_SYMBOL_GPL(device_create_file);
/**
* device_remove_file - remove sysfs attribute file.
* @dev: device.
* @attr: device attribute descriptor.
*/
void device_remove_file(struct device *dev,
const struct device_attribute *attr)
{
if (dev)
sysfs_remove_file(&dev->kobj, &attr->attr);
}
EXPORT_SYMBOL_GPL(device_remove_file);
/**
* device_remove_file_self - remove sysfs attribute file from its own method.
* @dev: device.
* @attr: device attribute descriptor.
*
* See kernfs_remove_self() for details.
*/
bool device_remove_file_self(struct device *dev,
const struct device_attribute *attr)
{
if (dev)
return sysfs_remove_file_self(&dev->kobj, &attr->attr);
else
return false;
}
EXPORT_SYMBOL_GPL(device_remove_file_self);
/**
* device_create_bin_file - create sysfs binary attribute file for device.
* @dev: device.
* @attr: device binary attribute descriptor.
*/
int device_create_bin_file(struct device *dev,
const struct bin_attribute *attr)
{
int error = -EINVAL;
if (dev)
error = sysfs_create_bin_file(&dev->kobj, attr);
return error;
}
EXPORT_SYMBOL_GPL(device_create_bin_file);
/**
* device_remove_bin_file - remove sysfs binary attribute file
* @dev: device.
* @attr: device binary attribute descriptor.
*/
void device_remove_bin_file(struct device *dev,
const struct bin_attribute *attr)
{
if (dev)
sysfs_remove_bin_file(&dev->kobj, attr);
}
EXPORT_SYMBOL_GPL(device_remove_bin_file);
static void klist_children_get(struct klist_node *n)
{
struct device_private *p = to_device_private_parent(n);
struct device *dev = p->device;
get_device(dev);
}
static void klist_children_put(struct klist_node *n)
{
struct device_private *p = to_device_private_parent(n);
struct device *dev = p->device;
put_device(dev);
}
/**
* device_initialize - init device structure.
* @dev: device.
*
* This prepares the device for use by other layers by initializing
* its fields.
* It is the first half of device_register(), if called by
* that function, though it can also be called separately, so one
* may use @dev's fields. In particular, get_device()/put_device()
* may be used for reference counting of @dev after calling this
* function.
*
* All fields in @dev must be initialized by the caller to 0, except
* for those explicitly set to some other value. The simplest
* approach is to use kzalloc() to allocate the structure containing
* @dev.
*
* NOTE: Use put_device() to give up your reference instead of freeing
* @dev directly once you have called this function.
*/
void device_initialize(struct device *dev)
{
dev->kobj.kset = devices_kset;
kobject_init(&dev->kobj, &device_ktype);
INIT_LIST_HEAD(&dev->dma_pools);
mutex_init(&dev->mutex);
#ifdef CONFIG_PROVE_LOCKING
mutex_init(&dev->lockdep_mutex);
#endif
lockdep_set_novalidate_class(&dev->mutex);
spin_lock_init(&dev->devres_lock);
INIT_LIST_HEAD(&dev->devres_head);
device_pm_init(dev);
set_dev_node(dev, -1);
#ifdef CONFIG_GENERIC_MSI_IRQ
raw_spin_lock_init(&dev->msi_lock);
INIT_LIST_HEAD(&dev->msi_list);
#endif
INIT_LIST_HEAD(&dev->links.consumers);
INIT_LIST_HEAD(&dev->links.suppliers);
INIT_LIST_HEAD(&dev->links.defer_sync);
dev->links.status = DL_DEV_NO_DRIVER;
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
dev->dma_coherent = dma_default_coherent;
#endif
#ifdef CONFIG_SWIOTLB
dev->dma_io_tlb_mem = &io_tlb_default_mem;
#endif
}
EXPORT_SYMBOL_GPL(device_initialize);
struct kobject *virtual_device_parent(struct device *dev)
{
static struct kobject *virtual_dir = NULL;
if (!virtual_dir)
virtual_dir = kobject_create_and_add("virtual",
&devices_kset->kobj);
return virtual_dir;
}
struct class_dir {
struct kobject kobj;
struct class *class;
};
#define to_class_dir(obj) container_of(obj, struct class_dir, kobj)
static void class_dir_release(struct kobject *kobj)
{
struct class_dir *dir = to_class_dir(kobj);
kfree(dir);
}
static const
struct kobj_ns_type_operations *class_dir_child_ns_type(struct kobject *kobj)
{
struct class_dir *dir = to_class_dir(kobj);
return dir->class->ns_type;
}
static struct kobj_type class_dir_ktype = {
.release = class_dir_release,
.sysfs_ops = &kobj_sysfs_ops,
.child_ns_type = class_dir_child_ns_type
};
static struct kobject *
class_dir_create_and_add(struct class *class, struct kobject *parent_kobj)
{
struct class_dir *dir;
int retval;
dir = kzalloc(sizeof(*dir), GFP_KERNEL);
if (!dir)
return ERR_PTR(-ENOMEM);
dir->class = class;
kobject_init(&dir->kobj, &class_dir_ktype);
dir->kobj.kset = &class->p->glue_dirs;
retval = kobject_add(&dir->kobj, parent_kobj, "%s", class->name);
if (retval < 0) {
kobject_put(&dir->kobj);
return ERR_PTR(retval);
}
return &dir->kobj;
}
static DEFINE_MUTEX(gdp_mutex);
static struct kobject *get_device_parent(struct device *dev,
struct device *parent)
{
if (dev->class) {
struct kobject *kobj = NULL;
struct kobject *parent_kobj;
struct kobject *k;
#ifdef CONFIG_BLOCK
/* block disks show up in /sys/block */
if (sysfs_deprecated && dev->class == &block_class) {
if (parent && parent->class == &block_class)
return &parent->kobj;
return &block_class.p->subsys.kobj;
}
#endif
/*
* If we have no parent, we live in "virtual".
* Class-devices with a non class-device as parent, live
* in a "glue" directory to prevent namespace collisions.
*/
if (parent == NULL)
parent_kobj = virtual_device_parent(dev);
else if (parent->class && !dev->class->ns_type)
return &parent->kobj;
else
parent_kobj = &parent->kobj;
mutex_lock(&gdp_mutex);
/* find our class-directory at the parent and reference it */
spin_lock(&dev->class->p->glue_dirs.list_lock);
list_for_each_entry(k, &dev->class->p->glue_dirs.list, entry) if (k->parent == parent_kobj) { kobj = kobject_get(k);
break;
}
spin_unlock(&dev->class->p->glue_dirs.list_lock);
if (kobj) {
mutex_unlock(&gdp_mutex);
return kobj;
}
/* or create a new class-directory at the parent device */
k = class_dir_create_and_add(dev->class, parent_kobj);
/* do not emit an uevent for this simple "glue" directory */
mutex_unlock(&gdp_mutex); return k;
}
/* subsystems can specify a default root directory for their devices */
if (!parent && dev->bus && dev->bus->dev_root)
return &dev->bus->dev_root->kobj;
if (parent)
return &parent->kobj;
return NULL;
}
static inline bool live_in_glue_dir(struct kobject *kobj,
struct device *dev)
{
if (!kobj || !dev->class || kobj->kset != &dev->class->p->glue_dirs)
return false;
return true;
}
static inline struct kobject *get_glue_dir(struct device *dev)
{
return dev->kobj.parent;
}
/*
* make sure cleaning up dir as the last step, we need to make
* sure .release handler of kobject is run with holding the
* global lock
*/
static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
{
unsigned int ref;
/* see if we live in a "glue" directory */
if (!live_in_glue_dir(glue_dir, dev))
return;
mutex_lock(&gdp_mutex);
/**
* There is a race condition between removing glue directory
* and adding a new device under the glue directory.
*
* CPU1: CPU2:
*
* device_add()
* get_device_parent()
* class_dir_create_and_add()
* kobject_add_internal()
* create_dir() // create glue_dir
*
* device_add()
* get_device_parent()
* kobject_get() // get glue_dir
*
* device_del()
* cleanup_glue_dir()
* kobject_del(glue_dir)
*
* kobject_add()
* kobject_add_internal()
* create_dir() // in glue_dir
* sysfs_create_dir_ns()
* kernfs_create_dir_ns(sd)
*
* sysfs_remove_dir() // glue_dir->sd=NULL
* sysfs_put() // free glue_dir->sd
*
* // sd is freed
* kernfs_new_node(sd)
* kernfs_get(glue_dir)
* kernfs_add_one()
* kernfs_put()
*
* Before CPU1 remove last child device under glue dir, if CPU2 add
* a new device under glue dir, the glue_dir kobject reference count
* will be increase to 2 in kobject_get(k). And CPU2 has been called
* kernfs_create_dir_ns(). Meanwhile, CPU1 call sysfs_remove_dir()
* and sysfs_put(). This result in glue_dir->sd is freed.
*
* Then the CPU2 will see a stale "empty" but still potentially used
* glue dir around in kernfs_new_node().
*
* In order to avoid this happening, we also should make sure that
* kernfs_node for glue_dir is released in CPU1 only when refcount
* for glue_dir kobj is 1.
*/
ref = kref_read(&glue_dir->kref);
if (!kobject_has_children(glue_dir) && !--ref) kobject_del(glue_dir); kobject_put(glue_dir);
mutex_unlock(&gdp_mutex);
}
static int device_add_class_symlinks(struct device *dev)
{
struct device_node *of_node = dev_of_node(dev);
int error;
if (of_node) {
error = sysfs_create_link(&dev->kobj, of_node_kobj(of_node), "of_node");
if (error)
dev_warn(dev, "Error %d creating of_node link\n",error);
/* An error here doesn't warrant bringing down the device */
}
if (!dev->class)
return 0;
error = sysfs_create_link(&dev->kobj,
&dev->class->p->subsys.kobj,
"subsystem");
if (error)
goto out_devnode;
if (dev->parent && device_is_not_partition(dev)) { error = sysfs_create_link(&dev->kobj, &dev->parent->kobj,
"device");
if (error)
goto out_subsys;
}
#ifdef CONFIG_BLOCK
/* /sys/block has directories and does not need symlinks */
if (sysfs_deprecated && dev->class == &block_class)
return 0;
#endif
/* link in the class directory pointing to the device */
error = sysfs_create_link(&dev->class->p->subsys.kobj,
&dev->kobj, dev_name(dev));
if (error)
goto out_device;
return 0;
out_device:
sysfs_remove_link(&dev->kobj, "device");
out_subsys:
sysfs_remove_link(&dev->kobj, "subsystem");
out_devnode:
sysfs_remove_link(&dev->kobj, "of_node");
return error;
}
static void device_remove_class_symlinks(struct device *dev)
{
if (dev_of_node(dev))
sysfs_remove_link(&dev->kobj, "of_node");
if (!dev->class)
return;
if (dev->parent && device_is_not_partition(dev)) sysfs_remove_link(&dev->kobj, "device"); sysfs_remove_link(&dev->kobj, "subsystem");
#ifdef CONFIG_BLOCK
if (sysfs_deprecated && dev->class == &block_class)
return;
#endif
sysfs_delete_link(&dev->class->p->subsys.kobj, &dev->kobj, dev_name(dev));
}
/**
* dev_set_name - set a device name
* @dev: device
* @fmt: format string for the device's name
*/
int dev_set_name(struct device *dev, const char *fmt, ...)
{
va_list vargs;
int err;
va_start(vargs, fmt);
err = kobject_set_name_vargs(&dev->kobj, fmt, vargs);
va_end(vargs);
return err;
}
EXPORT_SYMBOL_GPL(dev_set_name);
/**
* device_to_dev_kobj - select a /sys/dev/ directory for the device
* @dev: device
*
* By default we select char/ for new entries. Setting class->dev_obj
* to NULL prevents an entry from being created. class->dev_kobj must
* be set (or cleared) before any devices are registered to the class
* otherwise device_create_sys_dev_entry() and
* device_remove_sys_dev_entry() will disagree about the presence of
* the link.
*/
static struct kobject *device_to_dev_kobj(struct device *dev)
{
struct kobject *kobj;
if (dev->class)
kobj = dev->class->dev_kobj;
else
kobj = sysfs_dev_char_kobj;
return kobj;
}
static int device_create_sys_dev_entry(struct device *dev)
{
struct kobject *kobj = device_to_dev_kobj(dev);
int error = 0;
char devt_str[15];
if (kobj) { format_dev_t(devt_str, dev->devt);
error = sysfs_create_link(kobj, &dev->kobj, devt_str);
}
return error;
}
static void device_remove_sys_dev_entry(struct device *dev)
{
struct kobject *kobj = device_to_dev_kobj(dev);
char devt_str[15];
if (kobj) { format_dev_t(devt_str, dev->devt); sysfs_remove_link(kobj, devt_str);
}
}
static int device_private_init(struct device *dev)
{
dev->p = kzalloc(sizeof(*dev->p), GFP_KERNEL);
if (!dev->p)
return -ENOMEM;
dev->p->device = dev;
klist_init(&dev->p->klist_children, klist_children_get,
klist_children_put);
INIT_LIST_HEAD(&dev->p->deferred_probe);
return 0;
}
/**
* device_add - add device to device hierarchy.
* @dev: device.
*
* This is part 2 of device_register(), though may be called
* separately _iff_ device_initialize() has been called separately.
*
* This adds @dev to the kobject hierarchy via kobject_add(), adds it
* to the global and sibling lists for the device, then
* adds it to the other relevant subsystems of the driver model.
*
* Do not call this routine or device_register() more than once for
* any device structure. The driver model core is not designed to work
* with devices that get unregistered and then spring back to life.
* (Among other things, it's very hard to guarantee that all references
* to the previous incarnation of @dev have been dropped.) Allocate
* and register a fresh new struct device instead.
*
* NOTE: _Never_ directly free @dev after calling this function, even
* if it returned an error! Always use put_device() to give up your
* reference instead.
*
* Rule of thumb is: if device_add() succeeds, you should call
* device_del() when you want to get rid of it. If device_add() has
* *not* succeeded, use *only* put_device() to drop the reference
* count.
*/
int device_add(struct device *dev)
{
struct device *parent;
struct kobject *kobj;
struct class_interface *class_intf;
int error = -EINVAL;
struct kobject *glue_dir = NULL;
dev = get_device(dev);
if (!dev)
goto done;
if (!dev->p) {
error = device_private_init(dev);
if (error)
goto done;
}
/*
* for statically allocated devices, which should all be converted
* some day, we need to initialize the name. We prevent reading back
* the name, and force the use of dev_name()
*/
if (dev->init_name) { dev_set_name(dev, "%s", dev->init_name);
dev->init_name = NULL;
}
/* subsystems can specify simple device enumeration */
if (!dev_name(dev) && dev->bus && dev->bus->dev_name) dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id); if (!dev_name(dev)) {
error = -EINVAL;
goto name_error;
}
pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
parent = get_device(dev->parent); kobj = get_device_parent(dev, parent);
if (IS_ERR(kobj)) {
error = PTR_ERR(kobj);
goto parent_error;
}
if (kobj) dev->kobj.parent = kobj;
/* use parent numa_node */
if (parent && (dev_to_node(dev) == NUMA_NO_NODE)) set_dev_node(dev, dev_to_node(parent));
/* first, register with generic layer. */
/* we require the name to be set before, and pass NULL */
error = kobject_add(&dev->kobj, dev->kobj.parent, NULL);
if (error) {
glue_dir = get_glue_dir(dev);
goto Error;
}
/* notify platform of device entry */
device_platform_notify(dev);
error = device_create_file(dev, &dev_attr_uevent);
if (error)
goto attrError;
error = device_add_class_symlinks(dev);
if (error)
goto SymlinkError;
error = device_add_attrs(dev);
if (error)
goto AttrsError;
error = bus_add_device(dev);
if (error)
goto BusError;
error = dpm_sysfs_add(dev);
if (error)
goto DPMError;
device_pm_add(dev);
if (MAJOR(dev->devt)) {
error = device_create_file(dev, &dev_attr_dev);
if (error)
goto DevAttrError;
error = device_create_sys_dev_entry(dev);
if (error)
goto SysEntryError;
devtmpfs_create_node(dev);
}
/* Notify clients of device addition. This call must come
* after dpm_sysfs_add() and before kobject_uevent().
*/
if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
BUS_NOTIFY_ADD_DEVICE, dev);
kobject_uevent(&dev->kobj, KOBJ_ADD);
/*
* Check if any of the other devices (consumers) have been waiting for
* this device (supplier) to be added so that they can create a device
* link to it.
*
* This needs to happen after device_pm_add() because device_link_add()
* requires the supplier be registered before it's called.
*
* But this also needs to happen before bus_probe_device() to make sure
* waiting consumers can link to it before the driver is bound to the
* device and the driver sync_state callback is called for this device.
*/
if (dev->fwnode && !dev->fwnode->dev) { dev->fwnode->dev = dev;
fw_devlink_link_device(dev);
}
bus_probe_device(dev);
/*
* If all driver registration is done and a newly added device doesn't
* match with any driver, don't block its consumers from probing in
* case the consumer device is able to operate without this supplier.
*/
if (dev->fwnode && fw_devlink_drv_reg_done && !dev->can_match)
fw_devlink_unblock_consumers(dev);
if (parent)
klist_add_tail(&dev->p->knode_parent,
&parent->p->klist_children); if (dev->class) { mutex_lock(&dev->class->p->mutex);
/* tie the class to the device */
klist_add_tail(&dev->p->knode_class,
&dev->class->p->klist_devices);
/* notify any interfaces that the device is here */
list_for_each_entry(class_intf,
&dev->class->p->interfaces, node)
if (class_intf->add_dev) class_intf->add_dev(dev, class_intf); mutex_unlock(&dev->class->p->mutex);
}
done:
put_device(dev);
return error;
SysEntryError:
if (MAJOR(dev->devt))
device_remove_file(dev, &dev_attr_dev);
DevAttrError:
device_pm_remove(dev);
dpm_sysfs_remove(dev);
DPMError:
bus_remove_device(dev);
BusError:
device_remove_attrs(dev);
AttrsError:
device_remove_class_symlinks(dev);
SymlinkError:
device_remove_file(dev, &dev_attr_uevent);
attrError:
device_platform_notify_remove(dev);
kobject_uevent(&dev->kobj, KOBJ_REMOVE);
glue_dir = get_glue_dir(dev);
kobject_del(&dev->kobj);
Error:
cleanup_glue_dir(dev, glue_dir);
parent_error:
put_device(parent);
name_error:
kfree(dev->p);
dev->p = NULL;
goto done;
}
EXPORT_SYMBOL_GPL(device_add);
/**
* device_register - register a device with the system.
* @dev: pointer to the device structure
*
* This happens in two clean steps - initialize the device
* and add it to the system. The two steps can be called
* separately, but this is the easiest and most common.
* I.e. you should only call the two helpers separately if
* have a clearly defined need to use and refcount the device
* before it is added to the hierarchy.
*
* For more information, see the kerneldoc for device_initialize()
* and device_add().
*
* NOTE: _Never_ directly free @dev after calling this function, even
* if it returned an error! Always use put_device() to give up the
* reference initialized in this function instead.
*/
int device_register(struct device *dev)
{
device_initialize(dev);
return device_add(dev);
}
EXPORT_SYMBOL_GPL(device_register);
/**
* get_device - increment reference count for device.
* @dev: device.
*
* This simply forwards the call to kobject_get(), though
* we do take care to provide for the case that we get a NULL
* pointer passed in.
*/
struct device *get_device(struct device *dev)
{
return dev ? kobj_to_dev(kobject_get(&dev->kobj)) : NULL;
}
EXPORT_SYMBOL_GPL(get_device);
/**
* put_device - decrement reference count.
* @dev: device in question.
*/
void put_device(struct device *dev)
{
/* might_sleep(); */
if (dev) kobject_put(&dev->kobj);
}
EXPORT_SYMBOL_GPL(put_device);
bool kill_device(struct device *dev)
{
/*
* Require the device lock and set the "dead" flag to guarantee that
* the update behavior is consistent with the other bitfields near
* it and that we cannot have an asynchronous probe routine trying
* to run while we are tearing out the bus/class/sysfs from
* underneath the device.
*/
device_lock_assert(dev);
if (dev->p->dead)
return false;
dev->p->dead = true;
return true;
}
EXPORT_SYMBOL_GPL(kill_device);
/**
* device_del - delete device from system.
* @dev: device.
*
* This is the first part of the device unregistration
* sequence. This removes the device from the lists we control
* from here, has it removed from the other driver model
* subsystems it was added to in device_add(), and removes it
* from the kobject hierarchy.
*
* NOTE: this should be called manually _iff_ device_add() was
* also called manually.
*/
void device_del(struct device *dev)
{
struct device *parent = dev->parent;
struct kobject *glue_dir = NULL;
struct class_interface *class_intf;
unsigned int noio_flag;
device_lock(dev);
kill_device(dev);
device_unlock(dev);
if (dev->fwnode && dev->fwnode->dev == dev) dev->fwnode->dev = NULL;
/* Notify clients of device removal. This call must come
* before dpm_sysfs_remove().
*/
noio_flag = memalloc_noio_save();
if (dev->bus)
blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
BUS_NOTIFY_DEL_DEVICE, dev);
dpm_sysfs_remove(dev);
if (parent)
klist_del(&dev->p->knode_parent); if (MAJOR(dev->devt)) { devtmpfs_delete_node(dev);
device_remove_sys_dev_entry(dev);
device_remove_file(dev, &dev_attr_dev);
}
if (dev->class) { device_remove_class_symlinks(dev);
mutex_lock(&dev->class->p->mutex);
/* notify any interfaces that the device is now gone */
list_for_each_entry(class_intf,
&dev->class->p->interfaces, node)
if (class_intf->remove_dev) class_intf->remove_dev(dev, class_intf);
/* remove the device from the class list */
klist_del(&dev->p->knode_class);
mutex_unlock(&dev->class->p->mutex);
}
device_remove_file(dev, &dev_attr_uevent);
device_remove_attrs(dev);
bus_remove_device(dev);
device_pm_remove(dev);
driver_deferred_probe_del(dev);
device_platform_notify_remove(dev);
device_remove_properties(dev);
device_links_purge(dev);
if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
BUS_NOTIFY_REMOVED_DEVICE, dev);
kobject_uevent(&dev->kobj, KOBJ_REMOVE);
glue_dir = get_glue_dir(dev);
kobject_del(&dev->kobj);
cleanup_glue_dir(dev, glue_dir);
memalloc_noio_restore(noio_flag);
put_device(parent);
}
EXPORT_SYMBOL_GPL(device_del);
/**
* device_unregister - unregister device from system.
* @dev: device going away.
*
* We do this in two parts, like we do device_register(). First,
* we remove it from all the subsystems with device_del(), then
* we decrement the reference count via put_device(). If that
* is the final reference count, the device will be cleaned up
* via device_release() above. Otherwise, the structure will
* stick around until the final reference to the device is dropped.
*/
void device_unregister(struct device *dev)
{
pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
device_del(dev);
put_device(dev);
}
EXPORT_SYMBOL_GPL(device_unregister);
static struct device *prev_device(struct klist_iter *i)
{
struct klist_node *n = klist_prev(i);
struct device *dev = NULL;
struct device_private *p;
if (n) {
p = to_device_private_parent(n);
dev = p->device;
}
return dev;
}
static struct device *next_device(struct klist_iter *i)
{
struct klist_node *n = klist_next(i);
struct device *dev = NULL;
struct device_private *p;
if (n) {
p = to_device_private_parent(n);
dev = p->device;
}
return dev;
}
/**
* device_get_devnode - path of device node file
* @dev: device
* @mode: returned file access mode
* @uid: returned file owner
* @gid: returned file group
* @tmp: possibly allocated string
*
* Return the relative path of a possible device node.
* Non-default names may need to allocate a memory to compose
* a name. This memory is returned in tmp and needs to be
* freed by the caller.
*/
const char *device_get_devnode(struct device *dev,
umode_t *mode, kuid_t *uid, kgid_t *gid,
const char **tmp)
{
char *s;
*tmp = NULL;
/* the device type may provide a specific name */
if (dev->type && dev->type->devnode) *tmp = dev->type->devnode(dev, mode, uid, gid); if (*tmp)
return *tmp;
/* the class may provide a specific name */
if (dev->class && dev->class->devnode) *tmp = dev->class->devnode(dev, mode);
if (*tmp)
return *tmp;
/* return name without allocation, tmp == NULL */
if (strchr(dev_name(dev), '!') == NULL)
return dev_name(dev);
/* replace '!' in the name with '/' */
s = kstrdup(dev_name(dev), GFP_KERNEL);
if (!s)
return NULL;
strreplace(s, '!', '/');
return *tmp = s;
}
/**
* device_for_each_child - device child iterator.
* @parent: parent struct device.
* @fn: function to be called for each device.
* @data: data for the callback.
*
* Iterate over @parent's child devices, and call @fn for each,
* passing it @data.
*
* We check the return of @fn each time. If it returns anything
* other than 0, we break out and return that value.
*/
int device_for_each_child(struct device *parent, void *data,
int (*fn)(struct device *dev, void *data))
{
struct klist_iter i;
struct device *child;
int error = 0;
if (!parent->p)
return 0;
klist_iter_init(&parent->p->klist_children, &i);
while (!error && (child = next_device(&i)))
error = fn(child, data);
klist_iter_exit(&i);
return error;
}
EXPORT_SYMBOL_GPL(device_for_each_child);
/**
* device_for_each_child_reverse - device child iterator in reversed order.
* @parent: parent struct device.
* @fn: function to be called for each device.
* @data: data for the callback.
*
* Iterate over @parent's child devices, and call @fn for each,
* passing it @data.
*
* We check the return of @fn each time. If it returns anything
* other than 0, we break out and return that value.
*/
int device_for_each_child_reverse(struct device *parent, void *data,
int (*fn)(struct device *dev, void *data))
{
struct klist_iter i;
struct device *child;
int error = 0;
if (!parent->p)
return 0;
klist_iter_init(&parent->p->klist_children, &i);
while ((child = prev_device(&i)) && !error)
error = fn(child, data);
klist_iter_exit(&i);
return error;
}
EXPORT_SYMBOL_GPL(device_for_each_child_reverse);
/**
* device_find_child - device iterator for locating a particular device.
* @parent: parent struct device
* @match: Callback function to check device
* @data: Data to pass to match function
*
* This is similar to the device_for_each_child() function above, but it
* returns a reference to a device that is 'found' for later use, as
* determined by the @match callback.
*
* The callback should return 0 if the device doesn't match and non-zero
* if it does. If the callback returns non-zero and a reference to the
* current device can be obtained, this function will return to the caller
* and not iterate over any more devices.
*
* NOTE: you will need to drop the reference with put_device() after use.
*/
struct device *device_find_child(struct device *parent, void *data,
int (*match)(struct device *dev, void *data))
{
struct klist_iter i;
struct device *child;
if (!parent)
return NULL;
klist_iter_init(&parent->p->klist_children, &i);
while ((child = next_device(&i)))
if (match(child, data) && get_device(child))
break;
klist_iter_exit(&i);
return child;
}
EXPORT_SYMBOL_GPL(device_find_child);
/**
* device_find_child_by_name - device iterator for locating a child device.
* @parent: parent struct device
* @name: name of the child device
*
* This is similar to the device_find_child() function above, but it
* returns a reference to a device that has the name @name.
*
* NOTE: you will need to drop the reference with put_device() after use.
*/
struct device *device_find_child_by_name(struct device *parent,
const char *name)
{
struct klist_iter i;
struct device *child;
if (!parent)
return NULL;
klist_iter_init(&parent->p->klist_children, &i);
while ((child = next_device(&i)))
if (sysfs_streq(dev_name(child), name) && get_device(child))
break;
klist_iter_exit(&i);
return child;
}
EXPORT_SYMBOL_GPL(device_find_child_by_name);
int __init devices_init(void)
{
devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
if (!devices_kset)
return -ENOMEM;
dev_kobj = kobject_create_and_add("dev", NULL);
if (!dev_kobj)
goto dev_kobj_err;
sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj);
if (!sysfs_dev_block_kobj)
goto block_kobj_err;
sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj);
if (!sysfs_dev_char_kobj)
goto char_kobj_err;
return 0;
char_kobj_err:
kobject_put(sysfs_dev_block_kobj);
block_kobj_err:
kobject_put(dev_kobj);
dev_kobj_err:
kset_unregister(devices_kset);
return -ENOMEM;
}
static int device_check_offline(struct device *dev, void *not_used)
{
int ret;
ret = device_for_each_child(dev, NULL, device_check_offline);
if (ret)
return ret;
return device_supports_offline(dev) && !dev->offline ? -EBUSY : 0;
}
/**
* device_offline - Prepare the device for hot-removal.
* @dev: Device to be put offline.
*
* Execute the device bus type's .offline() callback, if present, to prepare
* the device for a subsequent hot-removal. If that succeeds, the device must
* not be used until either it is removed or its bus type's .online() callback
* is executed.
*
* Call under device_hotplug_lock.
*/
int device_offline(struct device *dev)
{
int ret;
if (dev->offline_disabled)
return -EPERM;
ret = device_for_each_child(dev, NULL, device_check_offline);
if (ret)
return ret;
device_lock(dev);
if (device_supports_offline(dev)) {
if (dev->offline) {
ret = 1;
} else {
ret = dev->bus->offline(dev);
if (!ret) {
kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
dev->offline = true;
}
}
}
device_unlock(dev);
return ret;
}
/**
* device_online - Put the device back online after successful device_offline().
* @dev: Device to be put back online.
*
* If device_offline() has been successfully executed for @dev, but the device
* has not been removed subsequently, execute its bus type's .online() callback
* to indicate that the device can be used again.
*
* Call under device_hotplug_lock.
*/
int device_online(struct device *dev)
{
int ret = 0;
device_lock(dev);
if (device_supports_offline(dev)) {
if (dev->offline) {
ret = dev->bus->online(dev);
if (!ret) {
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
dev->offline = false;
}
} else {
ret = 1;
}
}
device_unlock(dev);
return ret;
}
struct root_device {
struct device dev;
struct module *owner;
};
static inline struct root_device *to_root_device(struct device *d)
{
return container_of(d, struct root_device, dev);
}
static void root_device_release(struct device *dev)
{
kfree(to_root_device(dev));
}
/**
* __root_device_register - allocate and register a root device
* @name: root device name
* @owner: owner module of the root device, usually THIS_MODULE
*
* This function allocates a root device and registers it
* using device_register(). In order to free the returned
* device, use root_device_unregister().
*
* Root devices are dummy devices which allow other devices
* to be grouped under /sys/devices. Use this function to
* allocate a root device and then use it as the parent of
* any device which should appear under /sys/devices/{name}
*
* The /sys/devices/{name} directory will also contain a
* 'module' symlink which points to the @owner directory
* in sysfs.
*
* Returns &struct device pointer on success, or ERR_PTR() on error.
*
* Note: You probably want to use root_device_register().
*/
struct device *__root_device_register(const char *name, struct module *owner)
{
struct root_device *root;
int err = -ENOMEM;
root = kzalloc(sizeof(struct root_device), GFP_KERNEL);
if (!root)
return ERR_PTR(err);
err = dev_set_name(&root->dev, "%s", name);
if (err) {
kfree(root);
return ERR_PTR(err);
}
root->dev.release = root_device_release;
err = device_register(&root->dev);
if (err) {
put_device(&root->dev);
return ERR_PTR(err);
}
#ifdef CONFIG_MODULES /* gotta find a "cleaner" way to do this */
if (owner) {
struct module_kobject *mk = &owner->mkobj;
err = sysfs_create_link(&root->dev.kobj, &mk->kobj, "module");
if (err) {
device_unregister(&root->dev);
return ERR_PTR(err);
}
root->owner = owner;
}
#endif
return &root->dev;
}
EXPORT_SYMBOL_GPL(__root_device_register);
/**
* root_device_unregister - unregister and free a root device
* @dev: device going away
*
* This function unregisters and cleans up a device that was created by
* root_device_register().
*/
void root_device_unregister(struct device *dev)
{
struct root_device *root = to_root_device(dev);
if (root->owner)
sysfs_remove_link(&root->dev.kobj, "module");
device_unregister(dev);
}
EXPORT_SYMBOL_GPL(root_device_unregister);
static void device_create_release(struct device *dev)
{
pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
kfree(dev);
}
static __printf(6, 0) struct device *
device_create_groups_vargs(struct class *class, struct device *parent,
dev_t devt, void *drvdata,
const struct attribute_group **groups,
const char *fmt, va_list args)
{
struct device *dev = NULL;
int retval = -ENODEV;
if (class == NULL || IS_ERR(class))
goto error;
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (!dev) {
retval = -ENOMEM;
goto error;
}
device_initialize(dev);
dev->devt = devt;
dev->class = class;
dev->parent = parent;
dev->groups = groups;
dev->release = device_create_release;
dev_set_drvdata(dev, drvdata);
retval = kobject_set_name_vargs(&dev->kobj, fmt, args);
if (retval)
goto error;
retval = device_add(dev);
if (retval)
goto error;
return dev;
error:
put_device(dev);
return ERR_PTR(retval);
}
/**
* device_create - creates a device and registers it with sysfs
* @class: pointer to the struct class that this device should be registered to
* @parent: pointer to the parent struct device of this new device, if any
* @devt: the dev_t for the char device to be added
* @drvdata: the data to be added to the device for callbacks
* @fmt: string for the device's name
*
* This function can be used by char device classes. A struct device
* will be created in sysfs, registered to the specified class.
*
* A "dev" file will be created, showing the dev_t for the device, if
* the dev_t is not 0,0.
* If a pointer to a parent struct device is passed in, the newly created
* struct device will be a child of that device in sysfs.
* The pointer to the struct device will be returned from the call.
* Any further sysfs files that might be required can be created using this
* pointer.
*
* Returns &struct device pointer on success, or ERR_PTR() on error.
*
* Note: the struct class passed to this function must have previously
* been created with a call to class_create().
*/
struct device *device_create(struct class *class, struct device *parent,
dev_t devt, void *drvdata, const char *fmt, ...)
{
va_list vargs;
struct device *dev;
va_start(vargs, fmt);
dev = device_create_groups_vargs(class, parent, devt, drvdata, NULL,
fmt, vargs);
va_end(vargs);
return dev;
}
EXPORT_SYMBOL_GPL(device_create);
/**
* device_create_with_groups - creates a device and registers it with sysfs
* @class: pointer to the struct class that this device should be registered to
* @parent: pointer to the parent struct device of this new device, if any
* @devt: the dev_t for the char device to be added
* @drvdata: the data to be added to the device for callbacks
* @groups: NULL-terminated list of attribute groups to be created
* @fmt: string for the device's name
*
* This function can be used by char device classes. A struct device
* will be created in sysfs, registered to the specified class.
* Additional attributes specified in the groups parameter will also
* be created automatically.
*
* A "dev" file will be created, showing the dev_t for the device, if
* the dev_t is not 0,0.
* If a pointer to a parent struct device is passed in, the newly created
* struct device will be a child of that device in sysfs.
* The pointer to the struct device will be returned from the call.
* Any further sysfs files that might be required can be created using this
* pointer.
*
* Returns &struct device pointer on success, or ERR_PTR() on error.
*
* Note: the struct class passed to this function must have previously
* been created with a call to class_create().
*/
struct device *device_create_with_groups(struct class *class,
struct device *parent, dev_t devt,
void *drvdata,
const struct attribute_group **groups,
const char *fmt, ...)
{
va_list vargs;
struct device *dev;
va_start(vargs, fmt);
dev = device_create_groups_vargs(class, parent, devt, drvdata, groups,
fmt, vargs);
va_end(vargs);
return dev;
}
EXPORT_SYMBOL_GPL(device_create_with_groups);
/**
* device_destroy - removes a device that was created with device_create()
* @class: pointer to the struct class that this device was registered with
* @devt: the dev_t of the device that was previously registered
*
* This call unregisters and cleans up a device that was created with a
* call to device_create().
*/
void device_destroy(struct class *class, dev_t devt)
{
struct device *dev;
dev = class_find_device_by_devt(class, devt);
if (dev) {
put_device(dev);
device_unregister(dev);
}
}
EXPORT_SYMBOL_GPL(device_destroy);
/**
* device_rename - renames a device
* @dev: the pointer to the struct device to be renamed
* @new_name: the new name of the device
*
* It is the responsibility of the caller to provide mutual
* exclusion between two different calls of device_rename
* on the same device to ensure that new_name is valid and
* won't conflict with other devices.
*
* Note: Don't call this function. Currently, the networking layer calls this
* function, but that will change. The following text from Kay Sievers offers
* some insight:
*
* Renaming devices is racy at many levels, symlinks and other stuff are not
* replaced atomically, and you get a "move" uevent, but it's not easy to
* connect the event to the old and new device. Device nodes are not renamed at
* all, there isn't even support for that in the kernel now.
*
* In the meantime, during renaming, your target name might be taken by another
* driver, creating conflicts. Or the old name is taken directly after you
* renamed it -- then you get events for the same DEVPATH, before you even see
* the "move" event. It's just a mess, and nothing new should ever rely on
* kernel device renaming. Besides that, it's not even implemented now for
* other things than (driver-core wise very simple) network devices.
*
* We are currently about to change network renaming in udev to completely
* disallow renaming of devices in the same namespace as the kernel uses,
* because we can't solve the problems properly, that arise with swapping names
* of multiple interfaces without races. Means, renaming of eth[0-9]* will only
* be allowed to some other name than eth[0-9]*, for the aforementioned
* reasons.
*
* Make up a "real" name in the driver before you register anything, or add
* some other attributes for userspace to find the device, or use udev to add
* symlinks -- but never rename kernel devices later, it's a complete mess. We
* don't even want to get into that and try to implement the missing pieces in
* the core. We really have other pieces to fix in the driver core mess. :)
*/
int device_rename(struct device *dev, const char *new_name)
{
struct kobject *kobj = &dev->kobj;
char *old_device_name = NULL;
int error;
dev = get_device(dev);
if (!dev)
return -EINVAL;
dev_dbg(dev, "renaming to %s\n", new_name);
old_device_name = kstrdup(dev_name(dev), GFP_KERNEL);
if (!old_device_name) {
error = -ENOMEM;
goto out;
}
if (dev->class) {
error = sysfs_rename_link_ns(&dev->class->p->subsys.kobj,
kobj, old_device_name,
new_name, kobject_namespace(kobj));
if (error)
goto out;
}
error = kobject_rename(kobj, new_name);
if (error)
goto out;
out:
put_device(dev);
kfree(old_device_name);
return error;
}
EXPORT_SYMBOL_GPL(device_rename);
static int device_move_class_links(struct device *dev,
struct device *old_parent,
struct device *new_parent)
{
int error = 0;
if (old_parent)
sysfs_remove_link(&dev->kobj, "device");
if (new_parent)
error = sysfs_create_link(&dev->kobj, &new_parent->kobj,
"device");
return error;
}
/**
* device_move - moves a device to a new parent
* @dev: the pointer to the struct device to be moved
* @new_parent: the new parent of the device (can be NULL)
* @dpm_order: how to reorder the dpm_list
*/
int device_move(struct device *dev, struct device *new_parent,
enum dpm_order dpm_order)
{
int error;
struct device *old_parent;
struct kobject *new_parent_kobj;
dev = get_device(dev);
if (!dev)
return -EINVAL;
device_pm_lock();
new_parent = get_device(new_parent);
new_parent_kobj = get_device_parent(dev, new_parent);
if (IS_ERR(new_parent_kobj)) {
error = PTR_ERR(new_parent_kobj);
put_device(new_parent);
goto out;
}
pr_debug("device: '%s': %s: moving to '%s'\n", dev_name(dev),
__func__, new_parent ? dev_name(new_parent) : "<NULL>");
error = kobject_move(&dev->kobj, new_parent_kobj);
if (error) {
cleanup_glue_dir(dev, new_parent_kobj);
put_device(new_parent);
goto out;
}
old_parent = dev->parent;
dev->parent = new_parent;
if (old_parent)
klist_remove(&dev->p->knode_parent);
if (new_parent) {
klist_add_tail(&dev->p->knode_parent,
&new_parent->p->klist_children);
set_dev_node(dev, dev_to_node(new_parent));
}
if (dev->class) {
error = device_move_class_links(dev, old_parent, new_parent);
if (error) {
/* We ignore errors on cleanup since we're hosed anyway... */
device_move_class_links(dev, new_parent, old_parent);
if (!kobject_move(&dev->kobj, &old_parent->kobj)) {
if (new_parent)
klist_remove(&dev->p->knode_parent);
dev->parent = old_parent;
if (old_parent) {
klist_add_tail(&dev->p->knode_parent,
&old_parent->p->klist_children);
set_dev_node(dev, dev_to_node(old_parent));
}
}
cleanup_glue_dir(dev, new_parent_kobj);
put_device(new_parent);
goto out;
}
}
switch (dpm_order) {
case DPM_ORDER_NONE:
break;
case DPM_ORDER_DEV_AFTER_PARENT:
device_pm_move_after(dev, new_parent);
devices_kset_move_after(dev, new_parent);
break;
case DPM_ORDER_PARENT_BEFORE_DEV:
device_pm_move_before(new_parent, dev);
devices_kset_move_before(new_parent, dev);
break;
case DPM_ORDER_DEV_LAST:
device_pm_move_last(dev);
devices_kset_move_last(dev);
break;
}
put_device(old_parent);
out:
device_pm_unlock();
put_device(dev);
return error;
}
EXPORT_SYMBOL_GPL(device_move);
static int device_attrs_change_owner(struct device *dev, kuid_t kuid,
kgid_t kgid)
{
struct kobject *kobj = &dev->kobj;
struct class *class = dev->class;
const struct device_type *type = dev->type;
int error;
if (class) {
/*
* Change the device groups of the device class for @dev to
* @kuid/@kgid.
*/
error = sysfs_groups_change_owner(kobj, class->dev_groups, kuid,
kgid);
if (error)
return error;
}
if (type) {
/*
* Change the device groups of the device type for @dev to
* @kuid/@kgid.
*/
error = sysfs_groups_change_owner(kobj, type->groups, kuid,
kgid);
if (error)
return error;
}
/* Change the device groups of @dev to @kuid/@kgid. */
error = sysfs_groups_change_owner(kobj, dev->groups, kuid, kgid);
if (error)
return error;
if (device_supports_offline(dev) && !dev->offline_disabled) {
/* Change online device attributes of @dev to @kuid/@kgid. */
error = sysfs_file_change_owner(kobj, dev_attr_online.attr.name,
kuid, kgid);
if (error)
return error;
}
return 0;
}
/**
* device_change_owner - change the owner of an existing device.
* @dev: device.
* @kuid: new owner's kuid
* @kgid: new owner's kgid
*
* This changes the owner of @dev and its corresponding sysfs entries to
* @kuid/@kgid. This function closely mirrors how @dev was added via driver
* core.
*
* Returns 0 on success or error code on failure.
*/
int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid)
{
int error;
struct kobject *kobj = &dev->kobj;
dev = get_device(dev);
if (!dev)
return -EINVAL;
/*
* Change the kobject and the default attributes and groups of the
* ktype associated with it to @kuid/@kgid.
*/
error = sysfs_change_owner(kobj, kuid, kgid);
if (error)
goto out;
/*
* Change the uevent file for @dev to the new owner. The uevent file
* was created in a separate step when @dev got added and we mirror
* that step here.
*/
error = sysfs_file_change_owner(kobj, dev_attr_uevent.attr.name, kuid,
kgid);
if (error)
goto out;
/*
* Change the device groups, the device groups associated with the
* device class, and the groups associated with the device type of @dev
* to @kuid/@kgid.
*/
error = device_attrs_change_owner(dev, kuid, kgid);
if (error)
goto out;
error = dpm_sysfs_change_owner(dev, kuid, kgid);
if (error)
goto out;
#ifdef CONFIG_BLOCK
if (sysfs_deprecated && dev->class == &block_class)
goto out;
#endif
/*
* Change the owner of the symlink located in the class directory of
* the device class associated with @dev which points to the actual
* directory entry for @dev to @kuid/@kgid. This ensures that the
* symlink shows the same permissions as its target.
*/
error = sysfs_link_change_owner(&dev->class->p->subsys.kobj, &dev->kobj,
dev_name(dev), kuid, kgid);
if (error)
goto out;
out:
put_device(dev);
return error;
}
EXPORT_SYMBOL_GPL(device_change_owner);
/**
* device_shutdown - call ->shutdown() on each device to shutdown.
*/
void device_shutdown(void)
{
struct device *dev, *parent;
wait_for_device_probe();
device_block_probing();
cpufreq_suspend();
spin_lock(&devices_kset->list_lock);
/*
* Walk the devices list backward, shutting down each in turn.
* Beware that device unplug events may also start pulling
* devices offline, even as the system is shutting down.
*/
while (!list_empty(&devices_kset->list)) {
dev = list_entry(devices_kset->list.prev, struct device,
kobj.entry);
/*
* hold reference count of device's parent to
* prevent it from being freed because parent's
* lock is to be held
*/
parent = get_device(dev->parent);
get_device(dev);
/*
* Make sure the device is off the kset list, in the
* event that dev->*->shutdown() doesn't remove it.
*/
list_del_init(&dev->kobj.entry);
spin_unlock(&devices_kset->list_lock);
/* hold lock to avoid race with probe/release */
if (parent)
device_lock(parent);
device_lock(dev);
/* Don't allow any more runtime suspends */
pm_runtime_get_noresume(dev);
pm_runtime_barrier(dev);
if (dev->class && dev->class->shutdown_pre) {
if (initcall_debug)
dev_info(dev, "shutdown_pre\n");
dev->class->shutdown_pre(dev);
}
if (dev->bus && dev->bus->shutdown) {
if (initcall_debug)
dev_info(dev, "shutdown\n");
dev->bus->shutdown(dev);
} else if (dev->driver && dev->driver->shutdown) {
if (initcall_debug)
dev_info(dev, "shutdown\n");
dev->driver->shutdown(dev);
}
device_unlock(dev);
if (parent)
device_unlock(parent);
put_device(dev);
put_device(parent);
spin_lock(&devices_kset->list_lock);
}
spin_unlock(&devices_kset->list_lock);
}
/*
* Device logging functions
*/
#ifdef CONFIG_PRINTK
static void
set_dev_info(const struct device *dev, struct dev_printk_info *dev_info)
{
const char *subsys;
memset(dev_info, 0, sizeof(*dev_info));
if (dev->class)
subsys = dev->class->name;
else if (dev->bus)
subsys = dev->bus->name;
else
return;
strscpy(dev_info->subsystem, subsys, sizeof(dev_info->subsystem));
/*
* Add device identifier DEVICE=:
* b12:8 block dev_t
* c127:3 char dev_t
* n8 netdev ifindex
* +sound:card0 subsystem:devname
*/
if (MAJOR(dev->devt)) {
char c;
if (strcmp(subsys, "block") == 0)
c = 'b';
else
c = 'c';
snprintf(dev_info->device, sizeof(dev_info->device),
"%c%u:%u", c, MAJOR(dev->devt), MINOR(dev->devt));
} else if (strcmp(subsys, "net") == 0) {
struct net_device *net = to_net_dev(dev);
snprintf(dev_info->device, sizeof(dev_info->device),
"n%u", net->ifindex);
} else {
snprintf(dev_info->device, sizeof(dev_info->device),
"+%s:%s", subsys, dev_name(dev));
}
}
int dev_vprintk_emit(int level, const struct device *dev,
const char *fmt, va_list args)
{
struct dev_printk_info dev_info;
set_dev_info(dev, &dev_info);
return vprintk_emit(0, level, &dev_info, fmt, args);
}
EXPORT_SYMBOL(dev_vprintk_emit);
int dev_printk_emit(int level, const struct device *dev, const char *fmt, ...)
{
va_list args;
int r;
va_start(args, fmt);
r = dev_vprintk_emit(level, dev, fmt, args);
va_end(args);
return r;
}
EXPORT_SYMBOL(dev_printk_emit);
static void __dev_printk(const char *level, const struct device *dev,
struct va_format *vaf)
{
if (dev)
dev_printk_emit(level[1] - '0', dev, "%s %s: %pV",
dev_driver_string(dev), dev_name(dev), vaf);
else
printk("%s(NULL device *): %pV", level, vaf);
}
void _dev_printk(const char *level, const struct device *dev,
const char *fmt, ...)
{
struct va_format vaf;
va_list args;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
__dev_printk(level, dev, &vaf);
va_end(args);
}
EXPORT_SYMBOL(_dev_printk);
#define define_dev_printk_level(func, kern_level) \
void func(const struct device *dev, const char *fmt, ...) \
{ \
struct va_format vaf; \
va_list args; \
\
va_start(args, fmt); \
\
vaf.fmt = fmt; \
vaf.va = &args; \
\
__dev_printk(kern_level, dev, &vaf); \
\
va_end(args); \
} \
EXPORT_SYMBOL(func);
define_dev_printk_level(_dev_emerg, KERN_EMERG);
define_dev_printk_level(_dev_alert, KERN_ALERT);
define_dev_printk_level(_dev_crit, KERN_CRIT);
define_dev_printk_level(_dev_err, KERN_ERR);
define_dev_printk_level(_dev_warn, KERN_WARNING);
define_dev_printk_level(_dev_notice, KERN_NOTICE);
define_dev_printk_level(_dev_info, KERN_INFO);
#endif
/**
* dev_err_probe - probe error check and log helper
* @dev: the pointer to the struct device
* @err: error value to test
* @fmt: printf-style format string
* @...: arguments as specified in the format string
*
* This helper implements common pattern present in probe functions for error
* checking: print debug or error message depending if the error value is
* -EPROBE_DEFER and propagate error upwards.
* In case of -EPROBE_DEFER it sets also defer probe reason, which can be
* checked later by reading devices_deferred debugfs attribute.
* It replaces code sequence::
*
* if (err != -EPROBE_DEFER)
* dev_err(dev, ...);
* else
* dev_dbg(dev, ...);
* return err;
*
* with::
*
* return dev_err_probe(dev, err, ...);
*
* Returns @err.
*
*/
int dev_err_probe(const struct device *dev, int err, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
if (err != -EPROBE_DEFER) {
dev_err(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
} else {
device_set_deferred_probe_reason(dev, &vaf);
dev_dbg(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
}
va_end(args);
return err;
}
EXPORT_SYMBOL_GPL(dev_err_probe);
static inline bool fwnode_is_primary(struct fwnode_handle *fwnode)
{
return fwnode && !IS_ERR(fwnode->secondary);
}
/**
* set_primary_fwnode - Change the primary firmware node of a given device.
* @dev: Device to handle.
* @fwnode: New primary firmware node of the device.
*
* Set the device's firmware node pointer to @fwnode, but if a secondary
* firmware node of the device is present, preserve it.
*
* Valid fwnode cases are:
* - primary --> secondary --> -ENODEV
* - primary --> NULL
* - secondary --> -ENODEV
* - NULL
*/
void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode)
{
struct device *parent = dev->parent;
struct fwnode_handle *fn = dev->fwnode;
if (fwnode) {
if (fwnode_is_primary(fn))
fn = fn->secondary;
if (fn) {
WARN_ON(fwnode->secondary);
fwnode->secondary = fn;
}
dev->fwnode = fwnode;
} else {
if (fwnode_is_primary(fn)) {
dev->fwnode = fn->secondary;
/* Set fn->secondary = NULL, so fn remains the primary fwnode */
if (!(parent && fn == parent->fwnode))
fn->secondary = NULL;
} else {
dev->fwnode = NULL;
}
}
}
EXPORT_SYMBOL_GPL(set_primary_fwnode);
/**
* set_secondary_fwnode - Change the secondary firmware node of a given device.
* @dev: Device to handle.
* @fwnode: New secondary firmware node of the device.
*
* If a primary firmware node of the device is present, set its secondary
* pointer to @fwnode. Otherwise, set the device's firmware node pointer to
* @fwnode.
*/
void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode)
{
if (fwnode)
fwnode->secondary = ERR_PTR(-ENODEV);
if (fwnode_is_primary(dev->fwnode))
dev->fwnode->secondary = fwnode;
else
dev->fwnode = fwnode;
}
EXPORT_SYMBOL_GPL(set_secondary_fwnode);
/**
* device_set_of_node_from_dev - reuse device-tree node of another device
* @dev: device whose device-tree node is being set
* @dev2: device whose device-tree node is being reused
*
* Takes another reference to the new device-tree node after first dropping
* any reference held to the old node.
*/
void device_set_of_node_from_dev(struct device *dev, const struct device *dev2)
{
of_node_put(dev->of_node);
dev->of_node = of_node_get(dev2->of_node);
dev->of_node_reused = true;
}
EXPORT_SYMBOL_GPL(device_set_of_node_from_dev);
void device_set_node(struct device *dev, struct fwnode_handle *fwnode)
{
dev->fwnode = fwnode;
dev->of_node = to_of_node(fwnode);
}
EXPORT_SYMBOL_GPL(device_set_node);
int device_match_name(struct device *dev, const void *name)
{
return sysfs_streq(dev_name(dev), name);
}
EXPORT_SYMBOL_GPL(device_match_name);
int device_match_of_node(struct device *dev, const void *np)
{
return dev->of_node == np;
}
EXPORT_SYMBOL_GPL(device_match_of_node);
int device_match_fwnode(struct device *dev, const void *fwnode)
{
return dev_fwnode(dev) == fwnode;
}
EXPORT_SYMBOL_GPL(device_match_fwnode);
int device_match_devt(struct device *dev, const void *pdevt)
{
return dev->devt == *(dev_t *)pdevt;
}
EXPORT_SYMBOL_GPL(device_match_devt);
int device_match_acpi_dev(struct device *dev, const void *adev)
{
return ACPI_COMPANION(dev) == adev;
}
EXPORT_SYMBOL(device_match_acpi_dev);
int device_match_any(struct device *dev, const void *unused)
{
return 1;
}
EXPORT_SYMBOL_GPL(device_match_any);
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/lib/cmdline.c
* Helper functions generally used for parsing kernel command line
* and module options.
*
* Code and copyrights come from init/main.c and arch/i386/kernel/setup.c.
*
* GNU Indent formatting options for this file: -kr -i8 -npsl -pcs
*/
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/ctype.h>
/*
* If a hyphen was found in get_option, this will handle the
* range of numbers, M-N. This will expand the range and insert
* the values[M, M+1, ..., N] into the ints array in get_options.
*/
static int get_range(char **str, int *pint, int n)
{
int x, inc_counter, upper_range;
(*str)++;
upper_range = simple_strtol((*str), NULL, 0);
inc_counter = upper_range - *pint;
for (x = *pint; n && x < upper_range; x++, n--)
*pint++ = x;
return inc_counter;
}
/**
* get_option - Parse integer from an option string
* @str: option string
* @pint: (optional output) integer value parsed from @str
*
* Read an int from an option string; if available accept a subsequent
* comma as well.
*
* When @pint is NULL the function can be used as a validator of
* the current option in the string.
*
* Return values:
* 0 - no int in string
* 1 - int found, no subsequent comma
* 2 - int found including a subsequent comma
* 3 - hyphen found to denote a range
*
* Leading hyphen without integer is no integer case, but we consume it
* for the sake of simplification.
*/
int get_option(char **str, int *pint)
{
char *cur = *str;
int value;
if (!cur || !(*cur))
return 0;
if (*cur == '-')
value = -simple_strtoull(++cur, str, 0);
else
value = simple_strtoull(cur, str, 0);
if (pint)
*pint = value;
if (cur == *str)
return 0;
if (**str == ',') {
(*str)++;
return 2;
}
if (**str == '-')
return 3;
return 1;
}
EXPORT_SYMBOL(get_option);
/**
* get_options - Parse a string into a list of integers
* @str: String to be parsed
* @nints: size of integer array
* @ints: integer array (must have room for at least one element)
*
* This function parses a string containing a comma-separated
* list of integers, a hyphen-separated range of _positive_ integers,
* or a combination of both. The parse halts when the array is
* full, or when no more numbers can be retrieved from the
* string.
*
* When @nints is 0, the function just validates the given @str and
* returns the amount of parseable integers as described below.
*
* Returns:
*
* The first element is filled by the number of collected integers
* in the range. The rest is what was parsed from the @str.
*
* Return value is the character in the string which caused
* the parse to end (typically a null terminator, if @str is
* completely parseable).
*/
char *get_options(const char *str, int nints, int *ints)
{
bool validate = (nints == 0);
int res, i = 1;
while (i < nints || validate) {
int *pint = validate ? ints : ints + i;
res = get_option((char **)&str, pint);
if (res == 0)
break;
if (res == 3) {
int n = validate ? 0 : nints - i;
int range_nums;
range_nums = get_range((char **)&str, pint, n);
if (range_nums < 0)
break;
/*
* Decrement the result by one to leave out the
* last number in the range. The next iteration
* will handle the upper number in the range
*/
i += (range_nums - 1);
}
i++;
if (res == 1)
break;
}
ints[0] = i - 1;
return (char *)str;
}
EXPORT_SYMBOL(get_options);
/**
* memparse - parse a string with mem suffixes into a number
* @ptr: Where parse begins
* @retptr: (output) Optional pointer to next char after parse completes
*
* Parses a string into a number. The number stored at @ptr is
* potentially suffixed with K, M, G, T, P, E.
*/
unsigned long long memparse(const char *ptr, char **retptr)
{
char *endptr; /* local pointer to end of parsed string */
unsigned long long ret = simple_strtoull(ptr, &endptr, 0);
switch (*endptr) {
case 'E':
case 'e':
ret <<= 10;
fallthrough;
case 'P':
case 'p':
ret <<= 10;
fallthrough;
case 'T':
case 't':
ret <<= 10;
fallthrough;
case 'G':
case 'g':
ret <<= 10;
fallthrough;
case 'M':
case 'm':
ret <<= 10;
fallthrough;
case 'K':
case 'k':
ret <<= 10;
endptr++;
fallthrough;
default:
break;
}
if (retptr) *retptr = endptr; return ret;
}
EXPORT_SYMBOL(memparse);
/**
* parse_option_str - Parse a string and check an option is set or not
* @str: String to be parsed
* @option: option name
*
* This function parses a string containing a comma-separated list of
* strings like a=b,c.
*
* Return true if there's such option in the string, or return false.
*/
bool parse_option_str(const char *str, const char *option)
{
while (*str) {
if (!strncmp(str, option, strlen(option))) {
str += strlen(option);
if (!*str || *str == ',')
return true;
}
while (*str && *str != ',')
str++;
if (*str == ',')
str++;
}
return false;
}
/*
* Parse a string to get a param value pair.
* You can use " around spaces, but can't escape ".
* Hyphens and underscores equivalent in parameter names.
*/
char *next_arg(char *args, char **param, char **val)
{
unsigned int i, equals = 0;
int in_quote = 0, quoted = 0;
if (*args == '"') {
args++;
in_quote = 1;
quoted = 1;
}
for (i = 0; args[i]; i++) {
if (isspace(args[i]) && !in_quote)
break;
if (equals == 0) {
if (args[i] == '=')
equals = i;
}
if (args[i] == '"')
in_quote = !in_quote;
}
*param = args;
if (!equals)
*val = NULL;
else {
args[equals] = '\0';
*val = args + equals + 1;
/* Don't include quotes in value. */
if (**val == '"') {
(*val)++;
if (args[i-1] == '"')
args[i-1] = '\0';
}
}
if (quoted && args[i-1] == '"')
args[i-1] = '\0';
if (args[i]) {
args[i] = '\0';
args += i + 1;
} else
args += i;
/* Chew up trailing spaces. */
return skip_spaces(args);
}
EXPORT_SYMBOL(next_arg);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BITOPS_H
#define _LINUX_BITOPS_H
#include <asm/types.h>
#include <linux/bits.h>
#include <linux/typecheck.h>
#include <uapi/linux/kernel.h>
/* Set bits in the first 'n' bytes when loaded from memory */
#ifdef __LITTLE_ENDIAN
# define aligned_byte_mask(n) ((1UL << 8*(n))-1)
#else
# define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n)))
#endif
#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
#define BITS_TO_LONGS(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
#define BITS_TO_U64(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64))
#define BITS_TO_U32(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32))
#define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char))
extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned long __sw_hweight64(__u64 w);
/*
* Include this here because some architectures need generic_ffs/fls in
* scope
*/
#include <asm/bitops.h>
#define for_each_set_bit(bit, addr, size) \
for ((bit) = find_first_bit((addr), (size)); \
(bit) < (size); \
(bit) = find_next_bit((addr), (size), (bit) + 1))
/* same as for_each_set_bit() but use bit as value to start with */
#define for_each_set_bit_from(bit, addr, size) \
for ((bit) = find_next_bit((addr), (size), (bit)); \
(bit) < (size); \
(bit) = find_next_bit((addr), (size), (bit) + 1))
#define for_each_clear_bit(bit, addr, size) \
for ((bit) = find_first_zero_bit((addr), (size)); \
(bit) < (size); \
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
/* same as for_each_clear_bit() but use bit as value to start with */
#define for_each_clear_bit_from(bit, addr, size) \
for ((bit) = find_next_zero_bit((addr), (size), (bit)); \
(bit) < (size); \
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
/**
* for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
* @start: bit offset to start search and to store the current iteration offset
* @clump: location to store copy of current 8-bit clump
* @bits: bitmap address to base the search on
* @size: bitmap size in number of bits
*/
#define for_each_set_clump8(start, clump, bits, size) \
for ((start) = find_first_clump8(&(clump), (bits), (size)); \
(start) < (size); \
(start) = find_next_clump8(&(clump), (bits), (size), (start) + 8))
static inline int get_bitmask_order(unsigned int count)
{
int order;
order = fls(count);
return order; /* We could be slightly more clever with -1 here... */
}
static __always_inline unsigned long hweight_long(unsigned long w)
{
return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w);
}
/**
* rol64 - rotate a 64-bit value left
* @word: value to rotate
* @shift: bits to roll
*/
static inline __u64 rol64(__u64 word, unsigned int shift)
{
return (word << (shift & 63)) | (word >> ((-shift) & 63));
}
/**
* ror64 - rotate a 64-bit value right
* @word: value to rotate
* @shift: bits to roll
*/
static inline __u64 ror64(__u64 word, unsigned int shift)
{
return (word >> (shift & 63)) | (word << ((-shift) & 63));
}
/**
* rol32 - rotate a 32-bit value left
* @word: value to rotate
* @shift: bits to roll
*/
static inline __u32 rol32(__u32 word, unsigned int shift)
{
return (word << (shift & 31)) | (word >> ((-shift) & 31));
}
/**
* ror32 - rotate a 32-bit value right
* @word: value to rotate
* @shift: bits to roll
*/
static inline __u32 ror32(__u32 word, unsigned int shift)
{
return (word >> (shift & 31)) | (word << ((-shift) & 31));
}
/**
* rol16 - rotate a 16-bit value left
* @word: value to rotate
* @shift: bits to roll
*/
static inline __u16 rol16(__u16 word, unsigned int shift)
{
return (word << (shift & 15)) | (word >> ((-shift) & 15));
}
/**
* ror16 - rotate a 16-bit value right
* @word: value to rotate
* @shift: bits to roll
*/
static inline __u16 ror16(__u16 word, unsigned int shift)
{
return (word >> (shift & 15)) | (word << ((-shift) & 15));
}
/**
* rol8 - rotate an 8-bit value left
* @word: value to rotate
* @shift: bits to roll
*/
static inline __u8 rol8(__u8 word, unsigned int shift)
{
return (word << (shift & 7)) | (word >> ((-shift) & 7));
}
/**
* ror8 - rotate an 8-bit value right
* @word: value to rotate
* @shift: bits to roll
*/
static inline __u8 ror8(__u8 word, unsigned int shift)
{
return (word >> (shift & 7)) | (word << ((-shift) & 7));
}
/**
* sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit
* @value: value to sign extend
* @index: 0 based bit index (0<=index<32) to sign bit
*
* This is safe to use for 16- and 8-bit types as well.
*/
static __always_inline __s32 sign_extend32(__u32 value, int index)
{
__u8 shift = 31 - index;
return (__s32)(value << shift) >> shift;
}
/**
* sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit
* @value: value to sign extend
* @index: 0 based bit index (0<=index<64) to sign bit
*/
static __always_inline __s64 sign_extend64(__u64 value, int index)
{
__u8 shift = 63 - index;
return (__s64)(value << shift) >> shift;
}
static inline unsigned fls_long(unsigned long l)
{
if (sizeof(l) == 4)
return fls(l);
return fls64(l);
}
static inline int get_count_order(unsigned int count)
{
if (count == 0)
return -1;
return fls(--count);
}
/**
* get_count_order_long - get order after rounding @l up to power of 2
* @l: parameter
*
* it is same as get_count_order() but with long type parameter
*/
static inline int get_count_order_long(unsigned long l)
{
if (l == 0UL)
return -1;
return (int)fls_long(--l);
}
/**
* __ffs64 - find first set bit in a 64 bit word
* @word: The 64 bit word
*
* On 64 bit arches this is a synonym for __ffs
* The result is not defined if no bits are set, so check that @word
* is non-zero before calling this.
*/
static inline unsigned long __ffs64(u64 word)
{
#if BITS_PER_LONG == 32
if (((u32)word) == 0UL)
return __ffs((u32)(word >> 32)) + 32;
#elif BITS_PER_LONG != 64
#error BITS_PER_LONG not 32 or 64
#endif
return __ffs((unsigned long)word);
}
/**
* assign_bit - Assign value to a bit in memory
* @nr: the bit to set
* @addr: the address to start counting from
* @value: the value to assign
*/
static __always_inline void assign_bit(long nr, volatile unsigned long *addr,
bool value)
{
if (value)
set_bit(nr, addr);
else
clear_bit(nr, addr);
}
static __always_inline void __assign_bit(long nr, volatile unsigned long *addr,
bool value)
{
if (value)
__set_bit(nr, addr);
else
__clear_bit(nr, addr);
}
/**
* __ptr_set_bit - Set bit in a pointer's value
* @nr: the bit to set
* @addr: the address of the pointer variable
*
* Example:
* void *p = foo();
* __ptr_set_bit(bit, &p);
*/
#define __ptr_set_bit(nr, addr) \
({ \
typecheck_pointer(*(addr)); \
__set_bit(nr, (unsigned long *)(addr)); \
})
/**
* __ptr_clear_bit - Clear bit in a pointer's value
* @nr: the bit to clear
* @addr: the address of the pointer variable
*
* Example:
* void *p = foo();
* __ptr_clear_bit(bit, &p);
*/
#define __ptr_clear_bit(nr, addr) \
({ \
typecheck_pointer(*(addr)); \
__clear_bit(nr, (unsigned long *)(addr)); \
})
/**
* __ptr_test_bit - Test bit in a pointer's value
* @nr: the bit to test
* @addr: the address of the pointer variable
*
* Example:
* void *p = foo();
* if (__ptr_test_bit(bit, &p)) {
* ...
* } else {
* ...
* }
*/
#define __ptr_test_bit(nr, addr) \
({ \
typecheck_pointer(*(addr)); \
test_bit(nr, (unsigned long *)(addr)); \
})
#ifdef __KERNEL__
#ifndef set_mask_bits
#define set_mask_bits(ptr, mask, bits) \
({ \
const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \
typeof(*(ptr)) old__, new__; \
\
do { \
old__ = READ_ONCE(*(ptr)); \
new__ = (old__ & ~mask__) | bits__; \
} while (cmpxchg(ptr, old__, new__) != old__); \
\
old__; \
})
#endif
#ifndef bit_clear_unless
#define bit_clear_unless(ptr, clear, test) \
({ \
const typeof(*(ptr)) clear__ = (clear), test__ = (test);\
typeof(*(ptr)) old__, new__; \
\
do { \
old__ = READ_ONCE(*(ptr)); \
new__ = old__ & ~clear__; \
} while (!(old__ & test__) && \
cmpxchg(ptr, old__, new__) != old__); \
\
!(old__ & test__); \
})
#endif
#endif /* __KERNEL__ */
#endif
/* SPDX-License-Identifier: GPL-2.0 */
/*
* A hash table (hashtab) maintains associations between
* key values and datum values. The type of the key values
* and the type of the datum values is arbitrary. The
* functions for hash computation and key comparison are
* provided by the creator of the table.
*
* Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#ifndef _SS_HASHTAB_H_
#define _SS_HASHTAB_H_
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/sched.h>
#define HASHTAB_MAX_NODES U32_MAX
struct hashtab_key_params {
u32 (*hash)(const void *key); /* hash function */
int (*cmp)(const void *key1, const void *key2);
/* key comparison function */
};
struct hashtab_node {
void *key;
void *datum;
struct hashtab_node *next;
};
struct hashtab {
struct hashtab_node **htable; /* hash table */
u32 size; /* number of slots in hash table */
u32 nel; /* number of elements in hash table */
};
struct hashtab_info {
u32 slots_used;
u32 max_chain_len;
};
/*
* Initializes a new hash table with the specified characteristics.
*
* Returns -ENOMEM if insufficient space is available or 0 otherwise.
*/
int hashtab_init(struct hashtab *h, u32 nel_hint);
int __hashtab_insert(struct hashtab *h, struct hashtab_node **dst,
void *key, void *datum);
/*
* Inserts the specified (key, datum) pair into the specified hash table.
*
* Returns -ENOMEM on memory allocation error,
* -EEXIST if there is already an entry with the same key,
* -EINVAL for general errors or
0 otherwise.
*/
static inline int hashtab_insert(struct hashtab *h, void *key, void *datum,
struct hashtab_key_params key_params)
{
u32 hvalue;
struct hashtab_node *prev, *cur;
cond_resched();
if (!h->size || h->nel == HASHTAB_MAX_NODES)
return -EINVAL;
hvalue = key_params.hash(key) & (h->size - 1);
prev = NULL;
cur = h->htable[hvalue];
while (cur) {
int cmp = key_params.cmp(key, cur->key);
if (cmp == 0)
return -EEXIST;
if (cmp < 0)
break;
prev = cur;
cur = cur->next;
}
return __hashtab_insert(h, prev ? &prev->next : &h->htable[hvalue],
key, datum);
}
/*
* Searches for the entry with the specified key in the hash table.
*
* Returns NULL if no entry has the specified key or
* the datum of the entry otherwise.
*/
static inline void *hashtab_search(struct hashtab *h, const void *key,
struct hashtab_key_params key_params)
{
u32 hvalue;
struct hashtab_node *cur;
if (!h->size)
return NULL;
hvalue = key_params.hash(key) & (h->size - 1);
cur = h->htable[hvalue];
while (cur) {
int cmp = key_params.cmp(key, cur->key);
if (cmp == 0)
return cur->datum; if (cmp < 0)
break;
cur = cur->next;
}
return NULL;
}
/*
* Destroys the specified hash table.
*/
void hashtab_destroy(struct hashtab *h);
/*
* Applies the specified apply function to (key,datum,args)
* for each entry in the specified hash table.
*
* The order in which the function is applied to the entries
* is dependent upon the internal structure of the hash table.
*
* If apply returns a non-zero status, then hashtab_map will cease
* iterating through the hash table and will propagate the error
* return to its caller.
*/
int hashtab_map(struct hashtab *h,
int (*apply)(void *k, void *d, void *args),
void *args);
int hashtab_duplicate(struct hashtab *new, struct hashtab *orig,
int (*copy)(struct hashtab_node *new,
struct hashtab_node *orig, void *args),
int (*destroy)(void *k, void *d, void *args),
void *args);
/* Fill info with some hash table statistics */
void hashtab_stat(struct hashtab *h, struct hashtab_info *info);
#endif /* _SS_HASHTAB_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2008 Oracle. All rights reserved.
*/
#ifndef BTRFS_DELAYED_REF_H
#define BTRFS_DELAYED_REF_H
#include <linux/refcount.h>
/* these are the possible values of struct btrfs_delayed_ref_node->action */
#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
struct btrfs_delayed_ref_node {
struct rb_node ref_node;
/*
* If action is BTRFS_ADD_DELAYED_REF, also link this node to
* ref_head->ref_add_list, then we do not need to iterate the
* whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes.
*/
struct list_head add_list;
/* the starting bytenr of the extent */
u64 bytenr;
/* the size of the extent */
u64 num_bytes;
/* seq number to keep track of insertion order */
u64 seq;
/* ref count on this data structure */
refcount_t refs;
/*
* how many refs is this entry adding or deleting. For
* head refs, this may be a negative number because it is keeping
* track of the total mods done to the reference count.
* For individual refs, this will always be a positive number
*
* It may be more than one, since it is possible for a single
* parent to have more than one ref on an extent
*/
int ref_mod;
unsigned int action:8;
unsigned int type:8;
/* is this node still in the rbtree? */
unsigned int is_head:1;
unsigned int in_tree:1;
};
struct btrfs_delayed_extent_op {
struct btrfs_disk_key key;
u8 level;
bool update_key;
bool update_flags;
bool is_data;
u64 flags_to_set;
};
/*
* the head refs are used to hold a lock on a given extent, which allows us
* to make sure that only one process is running the delayed refs
* at a time for a single extent. They also store the sum of all the
* reference count modifications we've queued up.
*/
struct btrfs_delayed_ref_head {
u64 bytenr;
u64 num_bytes;
refcount_t refs;
/*
* the mutex is held while running the refs, and it is also
* held when checking the sum of reference modifications.
*/
struct mutex mutex;
spinlock_t lock;
struct rb_root_cached ref_tree;
/* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
struct list_head ref_add_list;
struct rb_node href_node;
struct btrfs_delayed_extent_op *extent_op;
/*
* This is used to track the final ref_mod from all the refs associated
* with this head ref, this is not adjusted as delayed refs are run,
* this is meant to track if we need to do the csum accounting or not.
*/
int total_ref_mod;
/*
* This is the current outstanding mod references for this bytenr. This
* is used with lookup_extent_info to get an accurate reference count
* for a bytenr, so it is adjusted as delayed refs are run so that any
* on disk reference count + ref_mod is accurate.
*/
int ref_mod;
/*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
* used to flag a delayed ref so the accounting can be updated
* when a full insert is done.
*
* It is possible the extent will be freed before it is ever
* inserted into the extent allocation tree. In this case
* we need to update the in ram accounting to properly reflect
* the free has happened.
*/
unsigned int must_insert_reserved:1;
unsigned int is_data:1;
unsigned int is_system:1;
unsigned int processing:1;
};
struct btrfs_delayed_tree_ref {
struct btrfs_delayed_ref_node node;
u64 root;
u64 parent;
int level;
};
struct btrfs_delayed_data_ref {
struct btrfs_delayed_ref_node node;
u64 root;
u64 parent;
u64 objectid;
u64 offset;
};
enum btrfs_delayed_ref_flags {
/* Indicate that we are flushing delayed refs for the commit */
BTRFS_DELAYED_REFS_FLUSHING,
};
struct btrfs_delayed_ref_root {
/* head ref rbtree */
struct rb_root_cached href_root;
/* dirty extent records */
struct rb_root dirty_extent_root;
/* this spin lock protects the rbtree and the entries inside */
spinlock_t lock;
/* how many delayed ref updates we've queued, used by the
* throttling code
*/
atomic_t num_entries;
/* total number of head nodes in tree */
unsigned long num_heads;
/* total number of head nodes ready for processing */
unsigned long num_heads_ready;
u64 pending_csums;
unsigned long flags;
u64 run_delayed_start;
/*
* To make qgroup to skip given root.
* This is for snapshot, as btrfs_qgroup_inherit() will manually
* modify counters for snapshot and its source, so we should skip
* the snapshot in new_root/old_roots or it will get calculated twice
*/
u64 qgroup_to_skip;
};
enum btrfs_ref_type {
BTRFS_REF_NOT_SET,
BTRFS_REF_DATA,
BTRFS_REF_METADATA,
BTRFS_REF_LAST,
};
struct btrfs_data_ref {
/* For EXTENT_DATA_REF */
/* Root which refers to this data extent */
u64 ref_root;
/* Inode which refers to this data extent */
u64 ino;
/*
* file_offset - extent_offset
*
* file_offset is the key.offset of the EXTENT_DATA key.
* extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data.
*/
u64 offset;
};
struct btrfs_tree_ref {
/*
* Level of this tree block
*
* Shared for skinny (TREE_BLOCK_REF) and normal tree ref.
*/
int level;
/*
* Root which refers to this tree block.
*
* For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
*/
u64 root;
/* For non-skinny metadata, no special member needed */
};
struct btrfs_ref {
enum btrfs_ref_type type;
int action;
/*
* Whether this extent should go through qgroup record.
*
* Normally false, but for certain cases like delayed subtree scan,
* setting this flag can hugely reduce qgroup overhead.
*/
bool skip_qgroup;
/*
* Optional. For which root is this modification.
* Mostly used for qgroup optimization.
*
* When unset, data/tree ref init code will populate it.
* In certain cases, we're modifying reference for a different root.
* E.g. COW fs tree blocks for balance.
* In that case, tree_ref::root will be fs tree, but we're doing this
* for reloc tree, then we should set @real_root to reloc tree.
*/
u64 real_root;
u64 bytenr;
u64 len;
/* Bytenr of the parent tree block */
u64 parent;
union {
struct btrfs_data_ref data_ref;
struct btrfs_tree_ref tree_ref;
};
};
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
int __init btrfs_delayed_ref_init(void);
void __cold btrfs_delayed_ref_exit(void);
static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
int action, u64 bytenr, u64 len, u64 parent)
{
generic_ref->action = action;
generic_ref->bytenr = bytenr;
generic_ref->len = len;
generic_ref->parent = parent;
}
static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
int level, u64 root)
{
/* If @real_root not set, use @root as fallback */
if (!generic_ref->real_root)
generic_ref->real_root = root; generic_ref->tree_ref.level = level;
generic_ref->tree_ref.root = root;
generic_ref->type = BTRFS_REF_METADATA;
}
static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
u64 ref_root, u64 ino, u64 offset)
{
/* If @real_root not set, use @root as fallback */
if (!generic_ref->real_root)
generic_ref->real_root = ref_root; generic_ref->data_ref.ref_root = ref_root;
generic_ref->data_ref.ino = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
}
static inline struct btrfs_delayed_extent_op *
btrfs_alloc_delayed_extent_op(void)
{
return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
}
static inline void
btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
{
if (op) kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
}
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
WARN_ON(refcount_read(&ref->refs) == 0);
if (refcount_dec_and_test(&ref->refs)) {
WARN_ON(ref->in_tree); switch (ref->type) {
case BTRFS_TREE_BLOCK_REF_KEY:
case BTRFS_SHARED_BLOCK_REF_KEY:
kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
break;
case BTRFS_EXTENT_DATA_REF_KEY:
case BTRFS_SHARED_DATA_REF_KEY:
kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
break;
default:
BUG();
}
}
}
static inline u64 btrfs_ref_head_to_space_flags(
struct btrfs_delayed_ref_head *head_ref)
{
if (head_ref->is_data)
return BTRFS_BLOCK_GROUP_DATA;
else if (head_ref->is_system)
return BTRFS_BLOCK_GROUP_SYSTEM;
return BTRFS_BLOCK_GROUP_METADATA;
}
static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head)
{
if (refcount_dec_and_test(&head->refs))
kmem_cache_free(btrfs_delayed_ref_head_cachep, head);
}
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
struct btrfs_delayed_extent_op *extent_op);
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
u64 reserved);
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
u64 bytenr);
int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
{
mutex_unlock(&head->mutex);
}
void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
struct btrfs_delayed_ref_root *delayed_refs);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *src,
u64 num_bytes);
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
/*
* helper functions to cast a node into its container
*/
static inline struct btrfs_delayed_tree_ref *
btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
{
return container_of(node, struct btrfs_delayed_tree_ref, node);
}
static inline struct btrfs_delayed_data_ref *
btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
{
return container_of(node, struct btrfs_delayed_data_ref, node);
}
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_H
#define _LINUX_MM_H
#include <linux/errno.h>
#ifdef __KERNEL__
#include <linux/mmdebug.h>
#include <linux/gfp.h>
#include <linux/bug.h>
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/debug_locks.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/range.h>
#include <linux/pfn.h>
#include <linux/percpu-refcount.h>
#include <linux/bit_spinlock.h>
#include <linux/shrinker.h>
#include <linux/resource.h>
#include <linux/page_ext.h>
#include <linux/err.h>
#include <linux/page-flags.h>
#include <linux/page_ref.h>
#include <linux/memremap.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/kasan.h>
struct mempolicy;
struct anon_vma;
struct anon_vma_chain;
struct file_ra_state;
struct user_struct;
struct writeback_control;
struct bdi_writeback;
struct pt_regs;
extern int sysctl_page_lock_unfairness;
void init_mm_internals(void);
#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;
static inline void set_max_mapnr(unsigned long limit)
{
max_mapnr = limit;
}
#else
static inline void set_max_mapnr(unsigned long limit) { }
#endif
extern atomic_long_t _totalram_pages;
static inline unsigned long totalram_pages(void)
{
return (unsigned long)atomic_long_read(&_totalram_pages);
}
static inline void totalram_pages_inc(void)
{
atomic_long_inc(&_totalram_pages);
}
static inline void totalram_pages_dec(void)
{
atomic_long_dec(&_totalram_pages);
}
static inline void totalram_pages_add(long count)
{
atomic_long_add(count, &_totalram_pages);
}
extern void * high_memory;
extern int page_cluster;
#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
#define sysctl_legacy_va_layout 0
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
extern const int mmap_rnd_bits_min;
extern const int mmap_rnd_bits_max;
extern int mmap_rnd_bits __read_mostly;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
extern const int mmap_rnd_compat_bits_min;
extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif
#include <asm/page.h>
#include <asm/processor.h>
/*
* Architectures that support memory tagging (assigning tags to memory regions,
* embedding these tags into addresses that point to these memory regions, and
* checking that the memory and the pointer tags match on memory accesses)
* redefine this macro to strip tags from pointers.
* It's defined as noop for architectures that don't support memory tagging.
*/
#ifndef untagged_addr
#define untagged_addr(addr) (addr)
#endif
#ifndef __pa_symbol
#define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0))
#endif
#ifndef page_to_virt
#define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x)))
#endif
#ifndef lm_alias
#define lm_alias(x) __va(__pa_symbol(x))
#endif
/*
* To prevent common memory management code establishing
* a zero page mapping on a read fault.
* This macro should be defined within <asm/pgtable.h>.
* s390 does this to prevent multiplexing of hardware bits
* related to the physical page in case of virtualization.
*/
#ifndef mm_forbids_zeropage
#define mm_forbids_zeropage(X) (0)
#endif
/*
* On some architectures it is expensive to call memset() for small sizes.
* If an architecture decides to implement their own version of
* mm_zero_struct_page they should wrap the defines below in a #ifndef and
* define their own version of this macro in <asm/pgtable.h>
*/
#if BITS_PER_LONG == 64
/* This function must be updated when the size of struct page grows above 80
* or reduces below 56. The idea that compiler optimizes out switch()
* statement, and only leaves move/store instructions. Also the compiler can
* combine write statements if they are both assignments and can be reordered,
* this can result in several of the writes here being dropped.
*/
#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
static inline void __mm_zero_struct_page(struct page *page)
{
unsigned long *_pp = (void *)page;
/* Check that struct page is either 56, 64, 72, or 80 bytes */
BUILD_BUG_ON(sizeof(struct page) & 7);
BUILD_BUG_ON(sizeof(struct page) < 56);
BUILD_BUG_ON(sizeof(struct page) > 80);
switch (sizeof(struct page)) {
case 80:
_pp[9] = 0;
fallthrough;
case 72:
_pp[8] = 0;
fallthrough;
case 64:
_pp[7] = 0;
fallthrough;
case 56:
_pp[6] = 0;
_pp[5] = 0;
_pp[4] = 0;
_pp[3] = 0;
_pp[2] = 0;
_pp[1] = 0;
_pp[0] = 0;
}
}
#else
#define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page)))
#endif
/*
* Default maximum number of active map areas, this limits the number of vmas
* per mm struct. Users can overwrite this number by sysctl but there is a
* problem.
*
* When a program's coredump is generated as ELF format, a section is created
* per a vma. In ELF, the number of sections is represented in unsigned short.
* This means the number of sections should be smaller than 65535 at coredump.
* Because the kernel adds some informative sections to a image of program at
* generating coredump, we need some margin. The number of extra sections is
* 1-3 now and depends on arch. We use "5" as safe margin, here.
*
* ELF extended numbering allows more than 65535 sections, so 16-bit bound is
* not a hard limit any more. Although some userspace tools can be surprised by
* that.
*/
#define MAPCOUNT_ELF_CORE_MARGIN (5)
#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
extern int sysctl_max_map_count;
extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;
int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
loff_t *);
int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
loff_t *);
int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
loff_t *);
/*
* Any attempt to mark this function as static leads to build failure
* when CONFIG_DEBUG_INFO_BTF is enabled because __add_to_page_cache_locked()
* is referred to by BPF code. This must be visible for error injection.
*/
int __add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp, void **shadowp);
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
#else
#define nth_page(page,n) ((page) + (n))
#endif
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
void setup_initial_init_mm(void *start_code, void *end_code,
void *end_data, void *brk);
/*
* Linux kernel virtual memory manager primitives.
* The idea being to have a "virtual" mm in the same way
* we have a virtual fs - giving a cleaner interface to the
* mm details, and allowing different kinds of memory mappings
* (from shared memory to executable loading to arbitrary
* mmap() functions).
*/
struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);
#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
extern struct rw_semaphore nommu_region_sem;
extern unsigned int kobjsize(const void *objp);
#endif
/*
* vm_flags in vm_area_struct, see mm_types.h.
* When changing, update also include/trace/events/mmflags.h
*/
#define VM_NONE 0x00000000
#define VM_READ 0x00000001 /* currently active flags */
#define VM_WRITE 0x00000002
#define VM_EXEC 0x00000004
#define VM_SHARED 0x00000008
/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */
#define VM_MAYWRITE 0x00000020
#define VM_MAYEXEC 0x00000040
#define VM_MAYSHARE 0x00000080
#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
#define VM_LOCKED 0x00002000
#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
/* Used by sys_madvise() */
#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */
#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */
#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
#define VM_SYNC 0x00800000 /* Synchronous page faults */
#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */
#else
# define VM_SOFTDIRTY 0
#endif
#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */
#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */
#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_ARCH_HAS_PKEYS
# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */
# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 /* on x86 and 5-bit value on ppc64 */
# define VM_PKEY_BIT2 VM_HIGH_ARCH_2
# define VM_PKEY_BIT3 VM_HIGH_ARCH_3
#ifdef CONFIG_PPC
# define VM_PKEY_BIT4 VM_HIGH_ARCH_4
#else
# define VM_PKEY_BIT4 0
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
#if defined(CONFIG_X86)
# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
#elif defined(CONFIG_PPC)
# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP VM_ARCH_1
#elif defined(CONFIG_IA64)
# define VM_GROWSUP VM_ARCH_1
#elif defined(CONFIG_SPARC64)
# define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */
# define VM_ARCH_CLEAR VM_SPARC_ADI
#elif defined(CONFIG_ARM64)
# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */
# define VM_ARCH_CLEAR VM_ARM64_BTI
#elif !defined(CONFIG_MMU)
# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */
#endif
#if defined(CONFIG_ARM64_MTE)
# define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */
# define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */
#else
# define VM_MTE VM_NONE
# define VM_MTE_ALLOWED VM_NONE
#endif
#ifndef VM_GROWSUP
# define VM_GROWSUP VM_NONE
#endif
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
# define VM_UFFD_MINOR_BIT 37
# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */
#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
# define VM_UFFD_MINOR VM_NONE
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
/* Common data flag combinations */
#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \
VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \
VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \
VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */
#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC
#endif
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif
#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK VM_GROWSUP
#else
#define VM_STACK VM_GROWSDOWN
#endif
#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
/* VMA basic access permission flags */
#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
/*
* Special vmas that are non-mergable, non-mlock()able.
*/
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
/* This mask prevents VMA from being scanned with khugepaged */
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
/* This mask defines which mm->def_flags a process can inherit its parent */
#define VM_INIT_DEF_MASK VM_NOHUGEPAGE
/* This mask is used to clear all the VMA flags used by mlock */
#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT))
/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR VM_NONE
#endif
#define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)
/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
*/
extern pgprot_t protection_map[16];
/**
* enum fault_flag - Fault flag definitions.
* @FAULT_FLAG_WRITE: Fault was a write fault.
* @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
* @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
* @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying.
* @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
* @FAULT_FLAG_TRIED: The fault has been tried once.
* @FAULT_FLAG_USER: The fault originated in userspace.
* @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
* @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
* @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
*
* About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
* whether we would allow page faults to retry by specifying these two
* fault flags correctly. Currently there can be three legal combinations:
*
* (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and
* this is the first try
*
* (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and
* we've already tried at least once
*
* (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
*
* The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
* be used. Note that page faults can be allowed to retry for multiple times,
* in which case we'll have an initial fault with flags (a) then later on
* continuous faults with flags (b). We should always try to detect pending
* signals before a retry to make sure the continuous page faults can still be
* interrupted if necessary.
*/
enum fault_flag {
FAULT_FLAG_WRITE = 1 << 0,
FAULT_FLAG_MKWRITE = 1 << 1,
FAULT_FLAG_ALLOW_RETRY = 1 << 2,
FAULT_FLAG_RETRY_NOWAIT = 1 << 3,
FAULT_FLAG_KILLABLE = 1 << 4,
FAULT_FLAG_TRIED = 1 << 5,
FAULT_FLAG_USER = 1 << 6,
FAULT_FLAG_REMOTE = 1 << 7,
FAULT_FLAG_INSTRUCTION = 1 << 8,
FAULT_FLAG_INTERRUPTIBLE = 1 << 9,
};
/*
* The default fault flags that should be used by most of the
* arch-specific page fault handlers.
*/
#define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \
FAULT_FLAG_KILLABLE | \
FAULT_FLAG_INTERRUPTIBLE)
/**
* fault_flag_allow_retry_first - check ALLOW_RETRY the first time
* @flags: Fault flags.
*
* This is mostly used for places where we want to try to avoid taking
* the mmap_lock for too long a time when waiting for another condition
* to change, in which case we can try to be polite to release the
* mmap_lock in the first round to avoid potential starvation of other
* processes that would also want the mmap_lock.
*
* Return: true if the page fault allows retry and this is the first
* attempt of the fault handling; false otherwise.
*/
static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
{
return (flags & FAULT_FLAG_ALLOW_RETRY) &&
(!(flags & FAULT_FLAG_TRIED));
}
#define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
{ FAULT_FLAG_MKWRITE, "MKWRITE" }, \
{ FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \
{ FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \
{ FAULT_FLAG_KILLABLE, "KILLABLE" }, \
{ FAULT_FLAG_TRIED, "TRIED" }, \
{ FAULT_FLAG_USER, "USER" }, \
{ FAULT_FLAG_REMOTE, "REMOTE" }, \
{ FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \
{ FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }
/*
* vm_fault is filled by the pagefault handler and passed to the vma's
* ->fault function. The vma's ->fault is responsible for returning a bitmask
* of VM_FAULT_xxx flags that give details about how the fault was handled.
*
* MM layer fills up gfp_mask for page allocations but fault handler might
* alter it if its implementation requires a different allocation context.
*
* pgoff should be used in favour of virtual_address, if possible.
*/
struct vm_fault {
const struct {
struct vm_area_struct *vma; /* Target VMA */
gfp_t gfp_mask; /* gfp mask to be used for allocations */
pgoff_t pgoff; /* Logical page offset based on vma */
unsigned long address; /* Faulting virtual address */
};
enum fault_flag flags; /* FAULT_FLAG_xxx flags
* XXX: should really be 'const' */
pmd_t *pmd; /* Pointer to pmd entry matching
* the 'address' */
pud_t *pud; /* Pointer to pud entry matching
* the 'address'
*/
union {
pte_t orig_pte; /* Value of PTE at the time of fault */
pmd_t orig_pmd; /* Value of PMD at the time of fault,
* used by PMD fault only.
*/
};
struct page *cow_page; /* Page handler may use for COW fault */
struct page *page; /* ->fault handlers should return a
* page here, unless VM_FAULT_NOPAGE
* is set (which is also implied by
* VM_FAULT_ERROR).
*/
/* These three entries are valid only while holding ptl lock */
pte_t *pte; /* Pointer to pte entry matching
* the 'address'. NULL if the page
* table hasn't been allocated.
*/
spinlock_t *ptl; /* Page table lock.
* Protects pte page table if 'pte'
* is not NULL, otherwise pmd.
*/
pgtable_t prealloc_pte; /* Pre-allocated pte page table.
* vm_ops->map_pages() sets up a page
* table from atomic context.
* do_fault_around() pre-allocates
* page table to avoid allocation from
* atomic context.
*/
};
/* page entry size for vm->huge_fault() */
enum page_entry_size {
PE_SIZE_PTE = 0,
PE_SIZE_PMD,
PE_SIZE_PUD,
};
/*
* These are the virtual MM functions - opening of an area, closing and
* unmapping it (needed to keep files on disk up-to-date etc), pointer
* to the functions called when a no-page or a wp-page exception occurs.
*/
struct vm_operations_struct {
void (*open)(struct vm_area_struct * area);
void (*close)(struct vm_area_struct * area);
/* Called any time before splitting to check if it's allowed */
int (*may_split)(struct vm_area_struct *area, unsigned long addr);
int (*mremap)(struct vm_area_struct *area);
/*
* Called by mprotect() to make driver-specific permission
* checks before mprotect() is finalised. The VMA must not
* be modified. Returns 0 if eprotect() can proceed.
*/
int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
unsigned long end, unsigned long newflags);
vm_fault_t (*fault)(struct vm_fault *vmf);
vm_fault_t (*huge_fault)(struct vm_fault *vmf,
enum page_entry_size pe_size);
vm_fault_t (*map_pages)(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
unsigned long (*pagesize)(struct vm_area_struct * area);
/* notification that a previously read-only page is about to become
* writable, if an error is returned it will cause a SIGBUS */
vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);
/* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);
/* called by access_process_vm when get_user_pages() fails, typically
* for use by special VMAs. See also generic_access_phys() for a generic
* implementation useful for any iomem mapping.
*/
int (*access)(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write);
/* Called by the /proc/PID/maps code to ask the vma whether it
* has a special name. Returning non-NULL will also cause this
* vma to be dumped unconditionally. */
const char *(*name)(struct vm_area_struct *vma);
#ifdef CONFIG_NUMA
/*
* set_policy() op must add a reference to any non-NULL @new mempolicy
* to hold the policy upon return. Caller should pass NULL @new to
* remove a policy and fall back to surrounding context--i.e. do not
* install a MPOL_DEFAULT policy, nor the task or system default
* mempolicy.
*/
int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
/*
* get_policy() op must add reference [mpol_get()] to any policy at
* (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure
* in mm/mempolicy.c will do this automatically.
* get_policy() must NOT add a ref if the policy at (vma,addr) is not
* marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
* If no [shared/vma] mempolicy exists at the addr, get_policy() op
* must return NULL--i.e., do not "fallback" to task or system default
* policy.
*/
struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
unsigned long addr);
#endif
/*
* Called by vm_normal_page() for special PTEs to find the
* page for @addr. This is useful if the default behavior
* (using pte_page()) would not find the correct page.
*/
struct page *(*find_special_page)(struct vm_area_struct *vma,
unsigned long addr);
};
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
static const struct vm_operations_struct dummy_vm_ops = {};
memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
}
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
vma->vm_ops = NULL;
}
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
return !vma->vm_ops;
}
static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
if (!maybe_stack)
return false;
if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
VM_STACK_INCOMPLETE_SETUP)
return true;
return false;
}
static inline bool vma_is_foreign(struct vm_area_struct *vma)
{
if (!current->mm)
return true;
if (current->mm != vma->vm_mm)
return true;
return false;
}
static inline bool vma_is_accessible(struct vm_area_struct *vma)
{
return vma->vm_flags & VM_ACCESS_FLAGS;
}
#ifdef CONFIG_SHMEM
/*
* The vma_is_shmem is not inline because it is used only by slow
* paths in userfault.
*/
bool vma_is_shmem(struct vm_area_struct *vma);
#else
static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
#endif
int vma_is_stack_for_current(struct vm_area_struct *vma);
/* flush_tlb_range() takes a vma, not a mm, and can care about flags */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }
struct mmu_gather;
struct inode;
#include <linux/huge_mm.h>
/*
* Methods to modify the page usage count.
*
* What counts for a page usage:
* - cache mapping (page->mapping)
* - private data (page->private)
* - page mapped in a task's page tables, each mapping
* is counted separately
*
* Also, many kernel routines increase the page count before a critical
* routine so they can be sure the page doesn't go away from under them.
*/
/*
* Drop a ref, return true if the refcount fell to zero (the page has no users)
*/
static inline int put_page_testzero(struct page *page)
{
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
return page_ref_dec_and_test(page);
}
/*
* Try to grab a ref unless the page has a refcount of zero, return false if
* that is the case.
* This can be called when MMU is off so it must not access
* any of the virtual mappings.
*/
static inline int get_page_unless_zero(struct page *page)
{
return page_ref_add_unless(page, 1, 0);
}
extern int page_is_ram(unsigned long pfn);
enum {
REGION_INTERSECTS,
REGION_DISJOINT,
REGION_MIXED,
};
int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
unsigned long desc);
/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
unsigned long vmalloc_to_pfn(const void *addr);
/*
* Determine if an address is within the vmalloc range
*
* On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
* is no special casing required.
*/
#ifndef is_ioremap_addr
#define is_ioremap_addr(x) is_vmalloc_addr(x)
#endif
#ifdef CONFIG_MMU
extern bool is_vmalloc_addr(const void *x);
extern int is_vmalloc_or_module_addr(const void *x);
#else
static inline bool is_vmalloc_addr(const void *x)
{
return false;
}
static inline int is_vmalloc_or_module_addr(const void *x)
{
return 0;
}
#endif
extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
static inline void *kvmalloc(size_t size, gfp_t flags)
{
return kvmalloc_node(size, flags, NUMA_NO_NODE);
}
static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
{
return kvmalloc_node(size, flags | __GFP_ZERO, node);
}
static inline void *kvzalloc(size_t size, gfp_t flags)
{
return kvmalloc(size, flags | __GFP_ZERO);
}
static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
{
size_t bytes;
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
return kvmalloc(bytes, flags);
}
static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
{
return kvmalloc_array(n, size, flags | __GFP_ZERO);
}
extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize,
gfp_t flags);
extern void kvfree(const void *addr);
extern void kvfree_sensitive(const void *addr, size_t len);
static inline int head_compound_mapcount(struct page *head)
{
return atomic_read(compound_mapcount_ptr(head)) + 1;
}
/*
* Mapcount of compound page as a whole, does not include mapped sub-pages.
*
* Must be called only for compound pages or any their tail sub-pages.
*/
static inline int compound_mapcount(struct page *page)
{
VM_BUG_ON_PAGE(!PageCompound(page), page);
page = compound_head(page);
return head_compound_mapcount(page);
}
/*
* The atomic page->_mapcount, starts from -1: so that transitions
* both from it and to it can be tracked, using atomic_inc_and_test
* and atomic_add_negative(-1).
*/
static inline void page_mapcount_reset(struct page *page)
{
atomic_set(&(page)->_mapcount, -1);
}
int __page_mapcount(struct page *page);
/*
* Mapcount of 0-order page; when compound sub-page, includes
* compound_mapcount().
*
* Result is undefined for pages which cannot be mapped into userspace.
* For example SLAB or special types of pages. See function page_has_type().
* They use this place in struct page differently.
*/
static inline int page_mapcount(struct page *page)
{
if (unlikely(PageCompound(page)))
return __page_mapcount(page); return atomic_read(&page->_mapcount) + 1;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int total_mapcount(struct page *page);
int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
#else
static inline int total_mapcount(struct page *page)
{
return page_mapcount(page);
}
static inline int page_trans_huge_mapcount(struct page *page,
int *total_mapcount)
{
int mapcount = page_mapcount(page);
if (total_mapcount)
*total_mapcount = mapcount;
return mapcount;
}
#endif
static inline struct page *virt_to_head_page(const void *x)
{
struct page *page = virt_to_page(x);
return compound_head(page);
}
void __put_page(struct page *page);
void put_pages_list(struct list_head *pages);
void split_page(struct page *page, unsigned int order);
void copy_huge_page(struct page *dst, struct page *src);
/*
* Compound pages have a destructor function. Provide a
* prototype for that function and accessor functions.
* These are _only_ valid on the head of a compound page.
*/
typedef void compound_page_dtor(struct page *);
/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */
enum compound_dtor_id {
NULL_COMPOUND_DTOR,
COMPOUND_PAGE_DTOR,
#ifdef CONFIG_HUGETLB_PAGE
HUGETLB_PAGE_DTOR,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
TRANSHUGE_PAGE_DTOR,
#endif
NR_COMPOUND_DTORS,
};
extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS];
static inline void set_compound_page_dtor(struct page *page,
enum compound_dtor_id compound_dtor)
{
VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page);
page[1].compound_dtor = compound_dtor;
}
static inline void destroy_compound_page(struct page *page)
{
VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
compound_page_dtors[page[1].compound_dtor](page);
}
static inline unsigned int compound_order(struct page *page)
{
if (!PageHead(page))
return 0;
return page[1].compound_order;
}
static inline bool hpage_pincount_available(struct page *page)
{
/*
* Can the page->hpage_pinned_refcount field be used? That field is in
* the 3rd page of the compound page, so the smallest (2-page) compound
* pages cannot support it.
*/
page = compound_head(page);
return PageCompound(page) && compound_order(page) > 1;
}
static inline int head_compound_pincount(struct page *head)
{
return atomic_read(compound_pincount_ptr(head));
}
static inline int compound_pincount(struct page *page)
{
VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
page = compound_head(page);
return head_compound_pincount(page);
}
static inline void set_compound_order(struct page *page, unsigned int order)
{
page[1].compound_order = order;
page[1].compound_nr = 1U << order;
}
/* Returns the number of pages in this potentially compound page. */
static inline unsigned long compound_nr(struct page *page)
{
if (!PageHead(page))
return 1;
return page[1].compound_nr;
}
/* Returns the number of bytes in this potentially compound page. */
static inline unsigned long page_size(struct page *page)
{
return PAGE_SIZE << compound_order(page);
}
/* Returns the number of bits needed for the number of bytes in a page */
static inline unsigned int page_shift(struct page *page)
{
return PAGE_SHIFT + compound_order(page);
}
void free_compound_page(struct page *page);
#ifdef CONFIG_MMU
/*
* Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
* servicing faults for write access. In the normal case, do always want
* pte_mkwrite. But get_user_pages can cause write faults for mappings
* that do not have writing enabled, when used by access_process_vm.
*/
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
pte = pte_mkwrite(pte);
return pte;
}
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
vm_fault_t finish_fault(struct vm_fault *vmf);
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#endif
/*
* Multiple processes may "see" the same page. E.g. for untouched
* mappings of /dev/null, all processes see the same page full of
* zeroes, and text pages of executables and shared libraries have
* only one copy in memory, at most, normally.
*
* For the non-reserved pages, page_count(page) denotes a reference count.
* page_count() == 0 means the page is free. page->lru is then used for
* freelist management in the buddy allocator.
* page_count() > 0 means the page has been allocated.
*
* Pages are allocated by the slab allocator in order to provide memory
* to kmalloc and kmem_cache_alloc. In this case, the management of the
* page, and the fields in 'struct page' are the responsibility of mm/slab.c
* unless a particular usage is carefully commented. (the responsibility of
* freeing the kmalloc memory is the caller's, of course).
*
* A page may be used by anyone else who does a __get_free_page().
* In this case, page_count still tracks the references, and should only
* be used through the normal accessor functions. The top bits of page->flags
* and page->virtual store page management information, but all other fields
* are unused and could be used privately, carefully. The management of this
* page is the responsibility of the one who allocated it, and those who have
* subsequently been given references to it.
*
* The other pages (we may call them "pagecache pages") are completely
* managed by the Linux memory manager: I/O, buffers, swapping etc.
* The following discussion applies only to them.
*
* A pagecache page contains an opaque `private' member, which belongs to the
* page's address_space. Usually, this is the address of a circular list of
* the page's disk buffers. PG_private must be set to tell the VM to call
* into the filesystem to release these pages.
*
* A page may belong to an inode's memory mapping. In this case, page->mapping
* is the pointer to the inode, and page->index is the file offset of the page,
* in units of PAGE_SIZE.
*
* If pagecache pages are not associated with an inode, they are said to be
* anonymous pages. These may become associated with the swapcache, and in that
* case PG_swapcache is set, and page->private is an offset into the swapcache.
*
* In either case (swapcache or inode backed), the pagecache itself holds one
* reference to the page. Setting PG_private should also increment the
* refcount. The each user mapping also has a reference to the page.
*
* The pagecache pages are stored in a per-mapping radix tree, which is
* rooted at mapping->i_pages, and indexed by offset.
* Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
* lists, we instead now tag pages as dirty/writeback in the radix tree.
*
* All pagecache pages may be subject to I/O:
* - inode pages may need to be read from disk,
* - inode pages which have been modified and are MAP_SHARED may need
* to be written back to the inode on disk,
* - anonymous pages (including MAP_PRIVATE file mappings) which have been
* modified may need to be swapped out to swap space and (later) to be read
* back into memory.
*/
/*
* The zone field is never updated after free_area_init_core()
* sets it, so none of the operations on it need to be atomic.
*/
/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
/*
* Define the bit shifts to access each section. For non-existent
* sections we define the shift as 0; that plus a 0 mask ensures
* the compiler will optimise away reference to them.
*/
#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
#define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))
/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
#ifdef NODE_NOT_IN_PAGE_FLAGS
#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF)? \
SECTIONS_PGOFF : ZONES_PGOFF)
#else
#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF)? \
NODES_PGOFF : ZONES_PGOFF)
#endif
#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0))
#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK ((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1)
#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1)
#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
static inline enum zone_type page_zonenum(const struct page *page)
{
ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}
#ifdef CONFIG_ZONE_DEVICE
static inline bool is_zone_device_page(const struct page *page)
{
return page_zonenum(page) == ZONE_DEVICE;
}
extern void memmap_init_zone_device(struct zone *, unsigned long,
unsigned long, struct dev_pagemap *);
#else
static inline bool is_zone_device_page(const struct page *page)
{
return false;
}
#endif
static inline bool is_zone_movable_page(const struct page *page)
{
return page_zonenum(page) == ZONE_MOVABLE;
}
#ifdef CONFIG_DEV_PAGEMAP_OPS
void free_devmap_managed_page(struct page *page);
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
static inline bool page_is_devmap_managed(struct page *page)
{
if (!static_branch_unlikely(&devmap_managed_key))
return false;
if (!is_zone_device_page(page))
return false;
switch (page->pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
case MEMORY_DEVICE_FS_DAX:
return true;
default:
break;
}
return false;
}
void put_devmap_managed_page(struct page *page);
#else /* CONFIG_DEV_PAGEMAP_OPS */
static inline bool page_is_devmap_managed(struct page *page)
{
return false;
}
static inline void put_devmap_managed_page(struct page *page)
{
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */
static inline bool is_device_private_page(const struct page *page)
{
return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}
static inline bool is_pci_p2pdma_page(const struct page *page)
{
return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
IS_ENABLED(CONFIG_PCI_P2PDMA) &&
is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
}
/* 127: arbitrary random number, small enough to assemble well */
#define page_ref_zero_or_close_to_overflow(page) \
((unsigned int) page_ref_count(page) + 127u <= 127u)
static inline void get_page(struct page *page)
{
page = compound_head(page);
/*
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_refcount.
*/
VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
page_ref_inc(page);
}
bool __must_check try_grab_page(struct page *page, unsigned int flags);
struct page *try_grab_compound_head(struct page *page, int refs,
unsigned int flags);
static inline __must_check bool try_get_page(struct page *page)
{
page = compound_head(page);
if (WARN_ON_ONCE(page_ref_count(page) <= 0))
return false;
page_ref_inc(page);
return true;
}
static inline void put_page(struct page *page)
{
page = compound_head(page);
/*
* For devmap managed pages we need to catch refcount transition from
* 2 to 1, when refcount reach one it means the page is free and we
* need to inform the device driver through callback. See
* include/linux/memremap.h and HMM for details.
*/
if (page_is_devmap_managed(page)) {
put_devmap_managed_page(page);
return;
}
if (put_page_testzero(page)) __put_page(page);
}
/*
* GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
* the page's refcount so that two separate items are tracked: the original page
* reference count, and also a new count of how many pin_user_pages() calls were
* made against the page. ("gup-pinned" is another term for the latter).
*
* With this scheme, pin_user_pages() becomes special: such pages are marked as
* distinct from normal pages. As such, the unpin_user_page() call (and its
* variants) must be used in order to release gup-pinned pages.
*
* Choice of value:
*
* By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
* counts with respect to pin_user_pages() and unpin_user_page() becomes
* simpler, due to the fact that adding an even power of two to the page
* refcount has the effect of using only the upper N bits, for the code that
* counts up using the bias value. This means that the lower bits are left for
* the exclusive use of the original code that increments and decrements by one
* (or at least, by much smaller values than the bias value).
*
* Of course, once the lower bits overflow into the upper bits (and this is
* OK, because subtraction recovers the original values), then visual inspection
* no longer suffices to directly view the separate counts. However, for normal
* applications that don't have huge page reference counts, this won't be an
* issue.
*
* Locking: the lockless algorithm described in page_cache_get_speculative()
* and page_cache_gup_pin_speculative() provides safe operation for
* get_user_pages and page_mkclean and other calls that race to set up page
* table entries.
*/
#define GUP_PIN_COUNTING_BIAS (1U << 10)
void unpin_user_page(struct page *page);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty);
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
bool make_dirty);
void unpin_user_pages(struct page **pages, unsigned long npages);
/**
* page_maybe_dma_pinned - Report if a page is pinned for DMA.
* @page: The page.
*
* This function checks if a page has been pinned via a call to
* a function in the pin_user_pages() family.
*
* For non-huge pages, the return value is partially fuzzy: false is not fuzzy,
* because it means "definitely not pinned for DMA", but true means "probably
* pinned for DMA, but possibly a false positive due to having at least
* GUP_PIN_COUNTING_BIAS worth of normal page references".
*
* False positives are OK, because: a) it's unlikely for a page to get that many
* refcounts, and b) all the callers of this routine are expected to be able to
* deal gracefully with a false positive.
*
* For huge pages, the result will be exactly correct. That's because we have
* more tracking data available: the 3rd struct page in the compound page is
* used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS
* scheme).
*
* For more information, please see Documentation/core-api/pin_user_pages.rst.
*
* Return: True, if it is likely that the page has been "dma-pinned".
* False, if the page is definitely not dma-pinned.
*/
static inline bool page_maybe_dma_pinned(struct page *page)
{
if (hpage_pincount_available(page))
return compound_pincount(page) > 0;
/*
* page_ref_count() is signed. If that refcount overflows, then
* page_ref_count() returns a negative value, and callers will avoid
* further incrementing the refcount.
*
* Here, for that overflow case, use the signed bit to count a little
* bit higher via unsigned math, and thus still get an accurate result.
*/
return ((unsigned int)page_ref_count(compound_head(page))) >=
GUP_PIN_COUNTING_BIAS;
}
static inline bool is_cow_mapping(vm_flags_t flags)
{
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
/*
* This should most likely only be called during fork() to see whether we
* should break the cow immediately for a page on the src mm.
*/
static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
struct page *page)
{
if (!is_cow_mapping(vma->vm_flags))
return false;
if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
return false;
return page_maybe_dma_pinned(page);
}
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif
/*
* The identification function is mainly used by the buddy allocator for
* determining if two pages could be buddies. We are not really identifying
* the zone since we could be using the section number id if we do not have
* node id available in page flags.
* We only guarantee that it will return the same value for two combinable
* pages in a zone.
*/
static inline int page_zone_id(struct page *page)
{
return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
}
#ifdef NODE_NOT_IN_PAGE_FLAGS
extern int page_to_nid(const struct page *page);
#else
static inline int page_to_nid(const struct page *page)
{
struct page *p = (struct page *)page;
return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif
#ifdef CONFIG_NUMA_BALANCING
static inline int cpu_pid_to_cpupid(int cpu, int pid)
{
return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
}
static inline int cpupid_to_pid(int cpupid)
{
return cpupid & LAST__PID_MASK;
}
static inline int cpupid_to_cpu(int cpupid)
{
return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK;
}
static inline int cpupid_to_nid(int cpupid)
{
return cpu_to_node(cpupid_to_cpu(cpupid));
}
static inline bool cpupid_pid_unset(int cpupid)
{
return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK);
}
static inline bool cpupid_cpu_unset(int cpupid)
{
return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK);
}
static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
{
return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
}
#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
}
static inline int page_cpupid_last(struct page *page)
{
return page->_last_cpupid;
}
static inline void page_cpupid_reset_last(struct page *page)
{
page->_last_cpupid = -1 & LAST_CPUPID_MASK;
}
#else
static inline int page_cpupid_last(struct page *page)
{
return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
}
extern int page_cpupid_xchg_last(struct page *page, int cpupid);
static inline void page_cpupid_reset_last(struct page *page)
{
page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
#else /* !CONFIG_NUMA_BALANCING */
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
return page_to_nid(page); /* XXX */
}
static inline int page_cpupid_last(struct page *page)
{
return page_to_nid(page); /* XXX */
}
static inline int cpupid_to_nid(int cpupid)
{
return -1;
}
static inline int cpupid_to_pid(int cpupid)
{
return -1;
}
static inline int cpupid_to_cpu(int cpupid)
{
return -1;
}
static inline int cpu_pid_to_cpupid(int nid, int pid)
{
return -1;
}
static inline bool cpupid_pid_unset(int cpupid)
{
return true;
}
static inline void page_cpupid_reset_last(struct page *page)
{
}
static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
{
return false;
}
#endif /* CONFIG_NUMA_BALANCING */
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
/*
* KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid
* setting tags for all pages to native kernel tag value 0xff, as the default
* value 0x00 maps to 0xff.
*/
static inline u8 page_kasan_tag(const struct page *page)
{
u8 tag = 0xff;
if (kasan_enabled()) {
tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
tag ^= 0xff;
}
return tag;
}
static inline void page_kasan_tag_set(struct page *page, u8 tag)
{
unsigned long old_flags, flags;
if (!kasan_enabled())
return;
tag ^= 0xff;
old_flags = READ_ONCE(page->flags);
do {
flags = old_flags;
flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
} while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
}
static inline void page_kasan_tag_reset(struct page *page)
{
if (kasan_enabled())
page_kasan_tag_set(page, 0xff);
}
#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
static inline u8 page_kasan_tag(const struct page *page)
{
return 0xff;
}
static inline void page_kasan_tag_set(struct page *page, u8 tag) { }
static inline void page_kasan_tag_reset(struct page *page) { }
#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
static inline struct zone *page_zone(const struct page *page)
{
return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}
static inline pg_data_t *page_pgdat(const struct page *page)
{
return NODE_DATA(page_to_nid(page));
}
#ifdef SECTION_IN_PAGE_FLAGS
static inline void set_page_section(struct page *page, unsigned long section)
{
page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
}
static inline unsigned long page_to_section(const struct page *page)
{
return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
}
#endif
/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
#ifdef CONFIG_MIGRATION
static inline bool is_pinnable_page(struct page *page)
{
return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) ||
is_zero_pfn(page_to_pfn(page));
}
#else
static inline bool is_pinnable_page(struct page *page)
{
return true;
}
#endif
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}
static inline void set_page_node(struct page *page, unsigned long node)
{
page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}
static inline void set_page_links(struct page *page, enum zone_type zone,
unsigned long node, unsigned long pfn)
{
set_page_zone(page, zone);
set_page_node(page, node);
#ifdef SECTION_IN_PAGE_FLAGS
set_page_section(page, pfn_to_section_nr(pfn));
#endif
}
/*
* Some inline functions in vmstat.h depend on page_zone()
*/
#include <linux/vmstat.h>
static __always_inline void *lowmem_page_address(const struct page *page)
{
return page_to_virt(page);
}
#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif
#if defined(WANT_PAGE_VIRTUAL)
static inline void *page_address(const struct page *page)
{
return page->virtual;
}
static inline void set_page_address(struct page *page, void *address)
{
page->virtual = address;
}
#define page_address_init() do { } while(0)
#endif
#if defined(HASHED_PAGE_VIRTUAL)
void *page_address(const struct page *page);
void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif
#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
#define set_page_address(page, address) do { } while(0)
#define page_address_init() do { } while(0)
#endif
extern void *page_rmapping(struct page *page);
extern struct anon_vma *page_anon_vma(struct page *page);
extern struct address_space *page_mapping(struct page *page);
extern struct address_space *__page_file_mapping(struct page *);
static inline
struct address_space *page_file_mapping(struct page *page)
{
if (unlikely(PageSwapCache(page)))
return __page_file_mapping(page);
return page->mapping;
}
extern pgoff_t __page_file_index(struct page *page);
/*
* Return the pagecache index of the passed page. Regular pagecache pages
* use ->index whereas swapcache pages use swp_offset(->private)
*/
static inline pgoff_t page_index(struct page *page)
{
if (unlikely(PageSwapCache(page)))
return __page_file_index(page); return page->index;
}
bool page_mapped(struct page *page);
struct address_space *page_mapping(struct page *page);
/*
* Return true only if the page has been allocated with
* ALLOC_NO_WATERMARKS and the low watermark was not
* met implying that the system is under some pressure.
*/
static inline bool page_is_pfmemalloc(const struct page *page)
{
/*
* lru.next has bit 1 set if the page is allocated from the
* pfmemalloc reserves. Callers may simply overwrite it if
* they do not need to preserve that information.
*/
return (uintptr_t)page->lru.next & BIT(1);
}
/*
* Only to be called by the page allocator on a freshly allocated
* page.
*/
static inline void set_page_pfmemalloc(struct page *page)
{
page->lru.next = (void *)BIT(1);
}
static inline void clear_page_pfmemalloc(struct page *page)
{
page->lru.next = NULL;
}
/*
* Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
*/
extern void pagefault_out_of_memory(void);
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
#define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1))
/*
* Flags passed to show_mem() and show_free_areas() to suppress output in
* various contexts.
*/
#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
static inline bool can_do_mlock(void) { return false; }
#endif
extern int user_shm_lock(size_t, struct ucounts *);
extern void user_shm_unlock(size_t, struct ucounts *);
/*
* Parameter block passed down to zap_pte_range in exceptional cases.
*/
struct zap_details {
struct address_space *check_mapping; /* Check page->mapping if set */
pgoff_t first_index; /* Lowest page->index to unmap */
pgoff_t last_index; /* Highest page->index to unmap */
struct page *single_page; /* Locked page to be unmapped */
};
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
void zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long start, unsigned long end);
struct mmu_notifier_range;
void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
struct mmu_notifier_range *range, pte_t **ptepp,
pmd_t **pmdpp, spinlock_t **ptlp);
int follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp);
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn);
int follow_phys(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, unsigned long *prot, resource_size_t *phys);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write);
extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
int truncate_inode_page(struct address_space *mapping, struct page *page);
int generic_error_remove_page(struct address_space *mapping, struct page *page);
int invalidate_inode_page(struct page *page);
#ifdef CONFIG_MMU
extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
struct pt_regs *regs);
extern int fixup_user_fault(struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
void unmap_mapping_page(struct page *page);
void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
#else
static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
struct pt_regs *regs)
{
/* should never happen if there's no MMU */
BUG();
return VM_FAULT_SIGBUS;
}
static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
unsigned int fault_flags, bool *unlocked)
{
/* should never happen if there's no MMU */
BUG();
return -EFAULT;
}
static inline void unmap_mapping_page(struct page *page) { }
static inline void unmap_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows) { }
#endif
static inline void unmap_shared_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen)
{
unmap_mapping_range(mapping, holebegin, holelen, 0);
}
extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked);
long pin_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked);
long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas);
long pin_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas);
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages, int *locked);
long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages, int *locked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
struct task_struct *task, bool bypass_rlim);
struct kvec;
int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
struct page **pages);
struct page *get_dump_page(unsigned long addr);
extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
extern void do_invalidatepage(struct page *page, unsigned int offset,
unsigned int length);
int redirty_page_for_writepage(struct writeback_control *wbc,
struct page *page);
void account_page_cleaned(struct page *page, struct address_space *mapping,
struct bdi_writeback *wb);
int set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);
void __cancel_dirty_page(struct page *page);
static inline void cancel_dirty_page(struct page *page)
{
/* Avoid atomic ops, locking, etc. when not actually needed. */
if (PageDirty(page))
__cancel_dirty_page(page);
}
int clear_page_dirty_for_io(struct page *page);
int get_cmdline(struct task_struct *task, char *buffer, int buflen);
extern unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
bool need_rmap_locks);
/*
* Flags used by change_protection(). For now we make it a bitmap so
* that we can pass in multiple flags just like parameters. However
* for now all the callers are only use one of the flags at the same
* time.
*/
/* Whether we should allow dirty bit accounting */
#define MM_CP_DIRTY_ACCT (1UL << 0)
/* Whether this protection change is for NUMA hints */
#define MM_CP_PROT_NUMA (1UL << 1)
/* Whether this change is for write protecting */
#define MM_CP_UFFD_WP (1UL << 2) /* do wp */
#define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */
#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
MM_CP_UFFD_WP_RESOLVE)
extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
extern int mprotect_fixup(struct vm_area_struct *vma,
struct vm_area_struct **pprev, unsigned long start,
unsigned long end, unsigned long newflags);
/*
* doesn't attempt to fault and will return short.
*/
int get_user_pages_fast_only(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
int pin_user_pages_fast_only(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages);
static inline bool get_user_page_fast_only(unsigned long addr,
unsigned int gup_flags, struct page **pagep)
{
return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1;
}
/*
* per-process(per-mm_struct) statistics.
*/
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
long val = atomic_long_read(&mm->rss_stat.count[member]);
#ifdef SPLIT_RSS_COUNTING
/*
* counter is updated in asynchronous manner and may go to minus.
* But it's never be expected number for users.
*/
if (val < 0)
val = 0;
#endif
return (unsigned long)val;
}
void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
mm_trace_rss_stat(mm, member, count);
}
static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
mm_trace_rss_stat(mm, member, count);
}
static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
mm_trace_rss_stat(mm, member, count);
}
/* Optimized variant when page is already known not to be PageAnon */
static inline int mm_counter_file(struct page *page)
{
if (PageSwapBacked(page))
return MM_SHMEMPAGES;
return MM_FILEPAGES;
}
static inline int mm_counter(struct page *page)
{
if (PageAnon(page))
return MM_ANONPAGES;
return mm_counter_file(page);
}
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
return get_mm_counter(mm, MM_FILEPAGES) +
get_mm_counter(mm, MM_ANONPAGES) +
get_mm_counter(mm, MM_SHMEMPAGES);
}
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
return max(mm->hiwater_rss, get_mm_rss(mm));
}
static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
{
return max(mm->hiwater_vm, mm->total_vm);
}
static inline void update_hiwater_rss(struct mm_struct *mm)
{
unsigned long _rss = get_mm_rss(mm);
if ((mm)->hiwater_rss < _rss)
(mm)->hiwater_rss = _rss;
}
static inline void update_hiwater_vm(struct mm_struct *mm)
{
if (mm->hiwater_vm < mm->total_vm)
mm->hiwater_vm = mm->total_vm;
}
static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
{
mm->hiwater_rss = get_mm_rss(mm);
}
static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
struct mm_struct *mm)
{
unsigned long hiwater_rss = get_mm_hiwater_rss(mm);
if (*maxrss < hiwater_rss)
*maxrss = hiwater_rss;
}
#if defined(SPLIT_RSS_COUNTING)
void sync_mm_rss(struct mm_struct *mm);
#else
static inline void sync_mm_rss(struct mm_struct *mm)
{
}
#endif
#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
static inline int pte_special(pte_t pte)
{
return 0;
}
static inline pte_t pte_mkspecial(pte_t pte)
{
return pte;
}
#endif
#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
static inline int pte_devmap(pte_t pte)
{
return 0;
}
#endif
int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl);
static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
{
pte_t *ptep;
__cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl));
return ptep;
}
#ifdef __PAGETABLE_P4D_FOLDED
static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
unsigned long address)
{
return 0;
}
#else
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
#endif
#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
unsigned long address)
{
return 0;
}
static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
#else
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
static inline void mm_inc_nr_puds(struct mm_struct *mm)
{
if (mm_pud_folded(mm))
return;
atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
}
static inline void mm_dec_nr_puds(struct mm_struct *mm)
{
if (mm_pud_folded(mm))
return;
atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
}
#endif
#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
unsigned long address)
{
return 0;
}
static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
#else
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
if (mm_pmd_folded(mm))
return;
atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
}
static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
if (mm_pmd_folded(mm))
return;
atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
}
#endif
#ifdef CONFIG_MMU
static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
{
atomic_long_set(&mm->pgtables_bytes, 0);
}
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
return atomic_long_read(&mm->pgtables_bytes);
}
static inline void mm_inc_nr_ptes(struct mm_struct *mm)
{
atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
}
static inline void mm_dec_nr_ptes(struct mm_struct *mm)
{
atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
}
#else
static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
return 0;
}
static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
#endif
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
int __pte_alloc_kernel(pmd_t *pmd);
#if defined(CONFIG_MMU)
static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
unsigned long address)
{
return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ?
NULL : p4d_offset(pgd, address);
}
static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
unsigned long address)
{
return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ?
NULL : pud_offset(p4d, address);
}
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
NULL: pmd_offset(pud, address);
}
#endif /* CONFIG_MMU */
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
extern bool ptlock_alloc(struct page *page);
extern void ptlock_free(struct page *page);
static inline spinlock_t *ptlock_ptr(struct page *page)
{
return page->ptl;
}
#else /* ALLOC_SPLIT_PTLOCKS */
static inline void ptlock_cache_init(void)
{
}
static inline bool ptlock_alloc(struct page *page)
{
return true;
}
static inline void ptlock_free(struct page *page)
{
}
static inline spinlock_t *ptlock_ptr(struct page *page)
{
return &page->ptl;
}
#endif /* ALLOC_SPLIT_PTLOCKS */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
return ptlock_ptr(pmd_page(*pmd));
}
static inline bool ptlock_init(struct page *page)
{
/*
* prep_new_page() initialize page->private (and therefore page->ptl)
* with 0. Make sure nobody took it in use in between.
*
* It can happen if arch try to use slab for page table allocation:
* slab code uses page->slab_cache, which share storage with page->ptl.
*/
VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
if (!ptlock_alloc(page))
return false;
spin_lock_init(ptlock_ptr(page));
return true;
}
#else /* !USE_SPLIT_PTE_PTLOCKS */
/*
* We use mm->page_table_lock to guard all pagetable pages of the mm.
*/
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
return &mm->page_table_lock;
}
static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct page *page) { return true; }
static inline void ptlock_free(struct page *page) {}
#endif /* USE_SPLIT_PTE_PTLOCKS */
static inline void pgtable_init(void)
{
ptlock_cache_init();
pgtable_cache_init();
}
static inline bool pgtable_pte_page_ctor(struct page *page)
{
if (!ptlock_init(page))
return false;
__SetPageTable(page);
inc_lruvec_page_state(page, NR_PAGETABLE);
return true;
}
static inline void pgtable_pte_page_dtor(struct page *page)
{
ptlock_free(page);
__ClearPageTable(page);
dec_lruvec_page_state(page, NR_PAGETABLE);
}
#define pte_offset_map_lock(mm, pmd, address, ptlp) \
({ \
spinlock_t *__ptl = pte_lockptr(mm, pmd); \
pte_t *__pte = pte_offset_map(pmd, address); \
*(ptlp) = __ptl; \
spin_lock(__ptl); \
__pte; \
})
#define pte_unmap_unlock(pte, ptl) do { \
spin_unlock(ptl); \
pte_unmap(pte); \
} while (0)
#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))
#define pte_alloc_map(mm, pmd, address) \
(pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))
#define pte_alloc_map_lock(mm, pmd, address, ptlp) \
(pte_alloc(mm, pmd) ? \
NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
#define pte_alloc_kernel(pmd, address) \
((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
NULL: pte_offset_kernel(pmd, address))
#if USE_SPLIT_PMD_PTLOCKS
static struct page *pmd_to_page(pmd_t *pmd)
{
unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
return virt_to_page((void *)((unsigned long) pmd & mask));
}
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
return ptlock_ptr(pmd_to_page(pmd));
}
static inline bool pmd_ptlock_init(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
page->pmd_huge_pte = NULL;
#endif
return ptlock_init(page);
}
static inline void pmd_ptlock_free(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
#endif
ptlock_free(page);
}
#define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte)
#else
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
return &mm->page_table_lock;
}
static inline bool pmd_ptlock_init(struct page *page) { return true; }
static inline void pmd_ptlock_free(struct page *page) {}
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
#endif
static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl = pmd_lockptr(mm, pmd);
spin_lock(ptl);
return ptl;
}
static inline bool pgtable_pmd_page_ctor(struct page *page)
{
if (!pmd_ptlock_init(page))
return false;
__SetPageTable(page);
inc_lruvec_page_state(page, NR_PAGETABLE);
return true;
}
static inline void pgtable_pmd_page_dtor(struct page *page)
{
pmd_ptlock_free(page);
__ClearPageTable(page);
dec_lruvec_page_state(page, NR_PAGETABLE);
}
/*
* No scalability reason to split PUD locks yet, but follow the same pattern
* as the PMD locks to make it easier if we decide to. The VM should not be
* considered ready to switch to split PUD locks yet; there may be places
* which need to be converted from page_table_lock.
*/
static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
{
return &mm->page_table_lock;
}
static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
{
spinlock_t *ptl = pud_lockptr(mm, pud);
spin_lock(ptl);
return ptl;
}
extern void __init pagecache_init(void);
extern void __init free_area_init_memoryless_node(int nid);
extern void free_initmem(void);
/*
* Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
* into the buddy system. The freed pages will be poisoned with pattern
* "poison" if it's within range [0, UCHAR_MAX].
* Return pages freed into the buddy system.
*/
extern unsigned long free_reserved_area(void *start, void *end,
int poison, const char *s);
extern void adjust_managed_page_count(struct page *page, long count);
extern void mem_init_print_info(void);
extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
/* Free the reserved page into the buddy system, so it gets managed. */
static inline void free_reserved_page(struct page *page)
{
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
adjust_managed_page_count(page, 1);
}
#define free_highmem_page(page) free_reserved_page(page)
static inline void mark_page_reserved(struct page *page)
{
SetPageReserved(page);
adjust_managed_page_count(page, -1);
}
/*
* Default method to free all the __init memory into the buddy system.
* The freed pages will be poisoned with pattern "poison" if it's within
* range [0, UCHAR_MAX].
* Return pages freed into the buddy system.
*/
static inline unsigned long free_initmem_default(int poison)
{
extern char __init_begin[], __init_end[];
return free_reserved_area(&__init_begin, &__init_end,
poison, "unused kernel image (initmem)");
}
static inline unsigned long get_num_physpages(void)
{
int nid;
unsigned long phys_pages = 0;
for_each_online_node(nid)
phys_pages += node_present_pages(nid);
return phys_pages;
}
/*
* Using memblock node mappings, an architecture may initialise its
* zones, allocate the backing mem_map and account for memory holes in an
* architecture independent manner.
*
* An architecture is expected to register range of page frames backed by
* physical memory with memblock_add[_node]() before calling
* free_area_init() passing in the PFN each zone ends at. At a basic
* usage, an architecture is expected to do something like
*
* unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
* max_highmem_pfn};
* for_each_valid_physical_page_range()
* memblock_add_node(base, size, nid)
* free_area_init(max_zone_pfns);
*/
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
unsigned long end_pfn);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn);
extern unsigned long find_min_pfn_with_active_regions(void);
#ifndef CONFIG_NUMA
static inline int early_pfn_to_nid(unsigned long pfn)
{
return 0;
}
#else
/* please see mm/page_alloc.c */
extern int __meminit early_pfn_to_nid(unsigned long pfn);
#endif
extern void set_dma_reserve(unsigned long new_dma_reserve);
extern void memmap_init_range(unsigned long, int, unsigned long,
unsigned long, unsigned long, enum meminit_context,
struct vmem_altmap *, int migratetype);
extern void setup_per_zone_wmarks(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
extern void __init mmap_init(void);
extern void show_mem(unsigned int flags, nodemask_t *nodemask);
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
extern unsigned long arch_reserved_kernel_pages(void);
#endif
extern __printf(3, 4)
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
extern void setup_per_cpu_pageset(void);
/* page_alloc.c */
extern int min_free_kbytes;
extern int watermark_boost_factor;
extern int watermark_scale_factor;
extern bool arch_has_descending_max_zone_pfns(void);
/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
/* interval_tree.c */
void vma_interval_tree_insert(struct vm_area_struct *node,
struct rb_root_cached *root);
void vma_interval_tree_insert_after(struct vm_area_struct *node,
struct vm_area_struct *prev,
struct rb_root_cached *root);
void vma_interval_tree_remove(struct vm_area_struct *node,
struct rb_root_cached *root);
struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
unsigned long start, unsigned long last);
#define vma_interval_tree_foreach(vma, root, start, last) \
for (vma = vma_interval_tree_iter_first(root, start, last); \
vma; vma = vma_interval_tree_iter_next(vma, start, last))
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
struct rb_root_cached *root);
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
struct rb_root_cached *root);
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
unsigned long start, unsigned long last);
struct anon_vma_chain *anon_vma_interval_tree_iter_next(
struct anon_vma_chain *node, unsigned long start, unsigned long last);
#ifdef CONFIG_DEBUG_VM_RB
void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
#endif
#define anon_vma_interval_tree_foreach(avc, root, start, last) \
for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))
/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
struct vm_area_struct *expand);
static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
return __vma_adjust(vma, start, end, pgoff, insert, NULL);
}
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
struct mempolicy *, struct vm_userfaultfd_ctx);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
extern int split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
struct rb_node **, struct rb_node *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
unsigned long addr, unsigned long len, pgoff_t pgoff,
bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
unsigned long start,
unsigned long end_data,
unsigned long start_data)
{
if (rlim < RLIM_INFINITY) {
if (((new - start) + (end_data - start_data)) > rlim)
return -ENOSPC;
}
return 0;
}
extern int mm_take_all_locks(struct mm_struct *mm);
extern void mm_drop_all_locks(struct mm_struct *mm);
extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern struct file *get_mm_exe_file(struct mm_struct *mm);
extern struct file *get_task_exe_file(struct task_struct *task);
extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);
extern bool vma_is_special_mapping(const struct vm_area_struct *vma,
const struct vm_special_mapping *sm);
extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long flags,
const struct vm_special_mapping *spec);
/* This is an obsolete alternative to _install_special_mapping. */
extern int install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long flags, struct page **pages);
unsigned long randomize_stack_top(unsigned long stack_top);
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
extern unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf);
extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
unsigned long pgoff, unsigned long *populate, struct list_head *uf);
extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
#ifdef CONFIG_MMU
extern int __mm_populate(unsigned long addr, unsigned long len,
int ignore_errors);
static inline void mm_populate(unsigned long addr, unsigned long len)
{
/* Ignore errors */
(void) __mm_populate(addr, len, 1);
}
#else
static inline void mm_populate(unsigned long addr, unsigned long len) {}
#endif
/* These take the mm semaphore themselves */
extern int __must_check vm_brk(unsigned long, unsigned long);
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
extern int vm_munmap(unsigned long, size_t);
extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
struct vm_unmapped_area_info {
#define VM_UNMAPPED_AREA_TOPDOWN 1
unsigned long flags;
unsigned long length;
unsigned long low_limit;
unsigned long high_limit;
unsigned long align_mask;
unsigned long align_offset;
};
extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
/* truncate.c */
extern void truncate_inode_pages(struct address_space *, loff_t);
extern void truncate_inode_pages_range(struct address_space *,
loff_t lstart, loff_t lend);
extern void truncate_inode_pages_final(struct address_space *);
/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
/* mm/page-writeback.c */
int __must_check write_one_page(struct page *page);
void task_dirty_inc(struct task_struct *tsk);
extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
extern int expand_downwards(struct vm_area_struct *vma,
unsigned long address);
#if VM_GROWSUP
extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
#else
#define expand_upwards(vma, address) (0)
#endif
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
struct vm_area_struct **pprev);
/**
* find_vma_intersection() - Look up the first VMA which intersects the interval
* @mm: The process address space.
* @start_addr: The inclusive start user address.
* @end_addr: The exclusive end user address.
*
* Returns: The first VMA within the provided range, %NULL otherwise. Assumes
* start_addr < end_addr.
*/
static inline
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
unsigned long start_addr,
unsigned long end_addr)
{
struct vm_area_struct *vma = find_vma(mm, start_addr);
if (vma && end_addr <= vma->vm_start)
vma = NULL;
return vma;
}
/**
* vma_lookup() - Find a VMA at a specific address
* @mm: The process address space.
* @addr: The user address.
*
* Return: The vm_area_struct at the given address, %NULL otherwise.
*/
static inline
struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = find_vma(mm, addr);
if (vma && addr < vma->vm_start)
vma = NULL;
return vma;
}
static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
{
unsigned long vm_start = vma->vm_start;
if (vma->vm_flags & VM_GROWSDOWN) {
vm_start -= stack_guard_gap;
if (vm_start > vma->vm_start)
vm_start = 0;
}
return vm_start;
}
static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
{
unsigned long vm_end = vma->vm_end;
if (vma->vm_flags & VM_GROWSUP) {
vm_end += stack_guard_gap;
if (vm_end < vma->vm_end)
vm_end = -PAGE_SIZE;
}
return vm_end;
}
static inline unsigned long vma_pages(struct vm_area_struct *vma)
{
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}
/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
unsigned long vm_start, unsigned long vm_end)
{
struct vm_area_struct *vma = find_vma(mm, vm_start);
if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
vma = NULL;
return vma;
}
static inline bool range_in_vma(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
return (vma && vma->vm_start <= start && end <= vma->vm_end);
}
#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
void vma_set_page_prot(struct vm_area_struct *vma);
#else
static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
return __pgprot(0);
}
static inline void vma_set_page_prot(struct vm_area_struct *vma)
{
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
}
#endif
void vma_set_file(struct vm_area_struct *vma, struct file *file);
#ifdef CONFIG_NUMA_BALANCING
unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
#endif
struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num);
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn);
vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
unsigned long addr, struct page *page)
{
int err = vm_insert_page(vma, addr, page);
if (err == -ENOMEM)
return VM_FAULT_OOM;
if (err < 0 && err != -EBUSY)
return VM_FAULT_SIGBUS;
return VM_FAULT_NOPAGE;
}
#ifndef io_remap_pfn_range
static inline int io_remap_pfn_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot));
}
#endif
static inline vm_fault_t vmf_error(int err)
{
if (err == -ENOMEM)
return VM_FAULT_OOM;
return VM_FAULT_SIGBUS;
}
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
unsigned int foll_flags);
#define FOLL_WRITE 0x01 /* check pte is writable */
#define FOLL_TOUCH 0x02 /* mark page accessed */
#define FOLL_GET 0x04 /* do get_page on page */
#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */
#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
* and return without waiting upon it */
#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */
#define FOLL_NOFAULT 0x80 /* do not fault in pages */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
#define FOLL_MLOCK 0x1000 /* lock present pages */
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */
#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
/*
* FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
* other. Here is what they mean, and how to use them:
*
* FOLL_LONGTERM indicates that the page will be held for an indefinite time
* period _often_ under userspace control. This is in contrast to
* iov_iter_get_pages(), whose usages are transient.
*
* FIXME: For pages which are part of a filesystem, mappings are subject to the
* lifetime enforced by the filesystem and we need guarantees that longterm
* users like RDMA and V4L2 only establish mappings which coordinate usage with
* the filesystem. Ideas for this coordination include revoking the longterm
* pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was
* added after the problem with filesystems was found FS DAX VMAs are
* specifically failed. Filesystem pages are still subject to bugs and use of
* FOLL_LONGTERM should be avoided on those pages.
*
* FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call.
* Currently only get_user_pages() and get_user_pages_fast() support this flag
* and calls to get_user_pages_[un]locked are specifically not allowed. This
* is due to an incompatibility with the FS DAX check and
* FAULT_FLAG_ALLOW_RETRY.
*
* In the CMA case: long term pins in a CMA region would unnecessarily fragment
* that region. And so, CMA attempts to migrate the page before pinning, when
* FOLL_LONGTERM is specified.
*
* FOLL_PIN indicates that a special kind of tracking (not just page->_refcount,
* but an additional pin counting system) will be invoked. This is intended for
* anything that gets a page reference and then touches page data (for example,
* Direct IO). This lets the filesystem know that some non-file-system entity is
* potentially changing the pages' data. In contrast to FOLL_GET (whose pages
* are released via put_page()), FOLL_PIN pages must be released, ultimately, by
* a call to unpin_user_page().
*
* FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different
* and separate refcounting mechanisms, however, and that means that each has
* its own acquire and release mechanisms:
*
* FOLL_GET: get_user_pages*() to acquire, and put_page() to release.
*
* FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release.
*
* FOLL_PIN and FOLL_GET are mutually exclusive for a given function call.
* (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based
* calls applied to them, and that's perfectly OK. This is a constraint on the
* callers, not on the pages.)
*
* FOLL_PIN should be set internally by the pin_user_pages*() APIs, never
* directly by the caller. That's in order to help avoid mismatches when
* releasing pages: get_user_pages*() pages must be released via put_page(),
* while pin_user_pages*() pages must be released via unpin_user_page().
*
* Please see Documentation/core-api/pin_user_pages.rst for more information.
*/
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
if (vm_fault & VM_FAULT_OOM)
return -ENOMEM;
if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT;
if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
return -EFAULT;
return 0;
}
typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
unsigned long size, pte_fn_t fn, void *data);
extern int apply_to_existing_page_range(struct mm_struct *mm,
unsigned long address, unsigned long size,
pte_fn_t fn, void *data);
extern void init_mem_debugging_and_hardening(void);
#ifdef CONFIG_PAGE_POISONING
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
extern bool _page_poisoning_enabled_early;
DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled);
static inline bool page_poisoning_enabled(void)
{
return _page_poisoning_enabled_early;
}
/*
* For use in fast paths after init_mem_debugging() has run, or when a
* false negative result is not harmful when called too early.
*/
static inline bool page_poisoning_enabled_static(void)
{
return static_branch_unlikely(&_page_poisoning_enabled);
}
static inline void kernel_poison_pages(struct page *page, int numpages)
{
if (page_poisoning_enabled_static())
__kernel_poison_pages(page, numpages);
}
static inline void kernel_unpoison_pages(struct page *page, int numpages)
{
if (page_poisoning_enabled_static())
__kernel_unpoison_pages(page, numpages);
}
#else
static inline bool page_poisoning_enabled(void) { return false; }
static inline bool page_poisoning_enabled_static(void) { return false; }
static inline void __kernel_poison_pages(struct page *page, int nunmpages) { }
static inline void kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_unpoison_pages(struct page *page, int numpages) { }
#endif
DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
static inline bool want_init_on_alloc(gfp_t flags)
{
if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
&init_on_alloc))
return true;
return flags & __GFP_ZERO;
}
DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
static inline bool want_init_on_free(void)
{
return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
&init_on_free);
}
extern bool _debug_pagealloc_enabled_early;
DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
static inline bool debug_pagealloc_enabled(void)
{
return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
_debug_pagealloc_enabled_early;
}
/*
* For use in fast paths after init_debug_pagealloc() has run, or when a
* false negative result is not harmful when called too early.
*/
static inline bool debug_pagealloc_enabled_static(void)
{
if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC))
return false;
return static_branch_unlikely(&_debug_pagealloc_enabled);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
/*
* To support DEBUG_PAGEALLOC architecture must ensure that
* __kernel_map_pages() never fails
*/
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
if (debug_pagealloc_enabled_static())
__kernel_map_pages(page, numpages, 1);
}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
if (debug_pagealloc_enabled_static())
__kernel_map_pages(page, numpages, 0);
}
#else /* CONFIG_DEBUG_PAGEALLOC */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {}
#endif /* CONFIG_DEBUG_PAGEALLOC */
#ifdef __HAVE_ARCH_GATE_AREA
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
extern int in_gate_area_no_mm(unsigned long addr);
extern int in_gate_area(struct mm_struct *mm, unsigned long addr);
#else
static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
return NULL;
}
static inline int in_gate_area_no_mm(unsigned long addr) { return 0; }
static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
return 0;
}
#endif /* __HAVE_ARCH_GATE_AREA */
extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
#ifdef CONFIG_SYSCTL
extern int sysctl_drop_caches;
int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *,
loff_t *);
#endif
void drop_slab(void);
void drop_slab_node(int nid);
#ifndef CONFIG_MMU
#define randomize_va_space 0
#else
extern int randomize_va_space;
#endif
const char * arch_vma_name(struct vm_area_struct *vma);
#ifdef CONFIG_MMU
void print_vma_addr(char *prefix, unsigned long rip);
#else
static inline void print_vma_addr(char *prefix, unsigned long rip)
{
}
#endif
int vmemmap_remap_free(unsigned long start, unsigned long end,
unsigned long reuse);
int vmemmap_remap_alloc(unsigned long start, unsigned long end,
unsigned long reuse, gfp_t gfp_mask);
void *sparse_buffer_alloc(unsigned long size);
struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
struct vmem_altmap *altmap);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
struct vmem_altmap *altmap);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
int vmemmap_populate_basepages(unsigned long start, unsigned long end,
int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap);
#endif
void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
unsigned long nr_pages);
enum mf_flags {
MF_COUNT_INCREASED = 1 << 0,
MF_ACTION_REQUIRED = 1 << 1,
MF_MUST_KILL = 1 << 2,
MF_SOFT_OFFLINE = 1 << 3,
};
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
/*
* Error handlers for various types of pages.
*/
enum mf_result {
MF_IGNORED, /* Error: cannot be handled */
MF_FAILED, /* Error: handling failed */
MF_DELAYED, /* Will be handled later */
MF_RECOVERED, /* Successfully recovered */
};
enum mf_action_page_type {
MF_MSG_KERNEL,
MF_MSG_KERNEL_HIGH_ORDER,
MF_MSG_SLAB,
MF_MSG_DIFFERENT_COMPOUND,
MF_MSG_POISONED_HUGE,
MF_MSG_HUGE,
MF_MSG_FREE_HUGE,
MF_MSG_NON_PMD_HUGE,
MF_MSG_UNMAP_FAILED,
MF_MSG_DIRTY_SWAPCACHE,
MF_MSG_CLEAN_SWAPCACHE,
MF_MSG_DIRTY_MLOCKED_LRU,
MF_MSG_CLEAN_MLOCKED_LRU,
MF_MSG_DIRTY_UNEVICTABLE_LRU,
MF_MSG_CLEAN_UNEVICTABLE_LRU,
MF_MSG_DIRTY_LRU,
MF_MSG_CLEAN_LRU,
MF_MSG_TRUNCATED_LRU,
MF_MSG_BUDDY,
MF_MSG_BUDDY_2ND,
MF_MSG_DAX,
MF_MSG_UNSPLIT_THP,
MF_MSG_UNKNOWN,
};
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
extern void clear_huge_page(struct page *page,
unsigned long addr_hint,
unsigned int pages_per_huge_page);
extern void copy_user_huge_page(struct page *dst, struct page *src,
unsigned long addr_hint,
struct vm_area_struct *vma,
unsigned int pages_per_huge_page);
extern long copy_huge_page_from_user(struct page *dst_page,
const void __user *usr_src,
unsigned int pages_per_huge_page,
bool allow_pagefault);
/**
* vma_is_special_huge - Are transhuge page-table entries considered special?
* @vma: Pointer to the struct vm_area_struct to consider
*
* Whether transhuge page-table entries are considered "special" following
* the definition in vm_normal_page().
*
* Return: true if transhuge page-table entries should be considered special,
* false otherwise.
*/
static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
{
return vma_is_dax(vma) || (vma->vm_file &&
(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
#ifdef CONFIG_DEBUG_PAGEALLOC
extern unsigned int _debug_guardpage_minorder;
DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
static inline unsigned int debug_guardpage_minorder(void)
{
return _debug_guardpage_minorder;
}
static inline bool debug_guardpage_enabled(void)
{
return static_branch_unlikely(&_debug_guardpage_enabled);
}
static inline bool page_is_guard(struct page *page)
{
if (!debug_guardpage_enabled())
return false;
return PageGuard(page);
}
#else
static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
static inline bool page_is_guard(struct page *page) { return false; }
#endif /* CONFIG_DEBUG_PAGEALLOC */
#if MAX_NUMNODES > 1
void __init setup_nr_node_ids(void);
#else
static inline void setup_nr_node_ids(void) {}
#endif
extern int memcmp_pages(struct page *page1, struct page *page2);
static inline int pages_identical(struct page *page1, struct page *page2)
{
return !memcmp_pages(page1, page2);
}
#ifdef CONFIG_MAPPING_DIRTY_HELPERS
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
pgoff_t first_index, pgoff_t nr,
pgoff_t bitmap_pgoff,
unsigned long *bitmap,
pgoff_t *start,
pgoff_t *end);
unsigned long wp_shared_mapping_range(struct address_space *mapping,
pgoff_t first_index, pgoff_t nr);
#endif
extern int sysctl_nr_trim_pages;
#ifdef CONFIG_PRINTK
void mem_dump_obj(void *object);
#else
static inline void mem_dump_obj(void *object) {}
#endif
/**
* seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
* @seals: the seals to check
* @vma: the vma to operate on
*
* Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
* the vma flags. Return 0 if check pass, or <0 for errors.
*/
static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
{
if (seals & F_SEAL_FUTURE_WRITE) {
/*
* New PROT_WRITE and MAP_SHARED mmaps are not allowed when
* "future write" seal active.
*/
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
return -EPERM;
/*
* Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
* MAP_SHARED and read-only, take care to not allow mprotect to
* revert protections on such mappings. Do this only for shared
* mappings. For private mappings, don't need to mask
* VM_MAYWRITE as we still want them to be COW-writable.
*/
if (vma->vm_flags & VM_SHARED)
vma->vm_flags &= ~(VM_MAYWRITE);
}
return 0;
}
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2001 Momchil Velikov
* Portions Copyright (C) 2001 Christoph Hellwig
* Copyright (C) 2005 SGI, Christoph Lameter
* Copyright (C) 2006 Nick Piggin
* Copyright (C) 2012 Konstantin Khlebnikov
* Copyright (C) 2016 Intel, Matthew Wilcox
* Copyright (C) 2016 Intel, Ross Zwisler
*/
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmemleak.h>
#include <linux/percpu.h>
#include <linux/preempt.h> /* in_interrupt() */
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/xarray.h>
/*
* Radix tree node cache.
*/
struct kmem_cache *radix_tree_node_cachep;
/*
* The radix tree is variable-height, so an insert operation not only has
* to build the branch to its corresponding item, it also has to build the
* branch to existing items if the size has to be increased (by
* radix_tree_extend).
*
* The worst case is a zero height tree with just a single item at index 0,
* and then inserting an item at index ULONG_MAX. This requires 2 new branches
* of RADIX_TREE_MAX_PATH size to be created, with only the root node shared.
* Hence:
*/
#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)
/*
* The IDR does not have to be as high as the radix tree since it uses
* signed integers, not unsigned longs.
*/
#define IDR_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(int) - 1)
#define IDR_MAX_PATH (DIV_ROUND_UP(IDR_INDEX_BITS, \
RADIX_TREE_MAP_SHIFT))
#define IDR_PRELOAD_SIZE (IDR_MAX_PATH * 2 - 1)
/*
* Per-cpu pool of preloaded nodes
*/
DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = {
.lock = INIT_LOCAL_LOCK(lock),
};
EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads);
static inline struct radix_tree_node *entry_to_node(void *ptr)
{
return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
}
static inline void *node_to_entry(void *ptr)
{
return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
}
#define RADIX_TREE_RETRY XA_RETRY_ENTRY
static inline unsigned long
get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot)
{
return parent ? slot - parent->slots : 0;
}
static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
struct radix_tree_node **nodep, unsigned long index)
{
unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
void __rcu **entry = rcu_dereference_raw(parent->slots[offset]);
*nodep = (void *)entry;
return offset;
}
static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
{
return root->xa_flags & (__GFP_BITS_MASK & ~GFP_ZONEMASK);
}
static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
int offset)
{
__set_bit(offset, node->tags[tag]);
}
static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
int offset)
{
__clear_bit(offset, node->tags[tag]);
}
static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
int offset)
{
return test_bit(offset, node->tags[tag]);
}
static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
{
root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT));
}
static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
{
root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
}
static inline void root_tag_clear_all(struct radix_tree_root *root)
{
root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1);
}
static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
{
return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT));
}
static inline unsigned root_tags_get(const struct radix_tree_root *root)
{
return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT;
}
static inline bool is_idr(const struct radix_tree_root *root)
{
return !!(root->xa_flags & ROOT_IS_IDR);
}
/*
* Returns 1 if any slot in the node has this tag set.
* Otherwise returns 0.
*/
static inline int any_tag_set(const struct radix_tree_node *node,
unsigned int tag)
{
unsigned idx;
for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
if (node->tags[tag][idx])
return 1;
}
return 0;
}
static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag)
{
bitmap_fill(node->tags[tag], RADIX_TREE_MAP_SIZE);
}
/**
* radix_tree_find_next_bit - find the next set bit in a memory region
*
* @node: where to begin the search
* @tag: the tag index
* @offset: the bitnumber to start searching at
*
* Unrollable variant of find_next_bit() for constant size arrays.
* Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero.
* Returns next bit offset, or size if nothing found.
*/
static __always_inline unsigned long
radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
unsigned long offset)
{
const unsigned long *addr = node->tags[tag];
if (offset < RADIX_TREE_MAP_SIZE) {
unsigned long tmp;
addr += offset / BITS_PER_LONG;
tmp = *addr >> (offset % BITS_PER_LONG);
if (tmp)
return __ffs(tmp) + offset;
offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
while (offset < RADIX_TREE_MAP_SIZE) {
tmp = *++addr;
if (tmp)
return __ffs(tmp) + offset;
offset += BITS_PER_LONG;
}
}
return RADIX_TREE_MAP_SIZE;
}
static unsigned int iter_offset(const struct radix_tree_iter *iter)
{
return iter->index & RADIX_TREE_MAP_MASK;
}
/*
* The maximum index which can be stored in a radix tree
*/
static inline unsigned long shift_maxindex(unsigned int shift)
{
return (RADIX_TREE_MAP_SIZE << shift) - 1;
}
static inline unsigned long node_maxindex(const struct radix_tree_node *node)
{
return shift_maxindex(node->shift);
}
static unsigned long next_index(unsigned long index,
const struct radix_tree_node *node,
unsigned long offset)
{
return (index & ~node_maxindex(node)) + (offset << node->shift);
}
/*
* This assumes that the caller has performed appropriate preallocation, and
* that the caller has pinned this thread of control to the current CPU.
*/
static struct radix_tree_node *
radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
struct radix_tree_root *root,
unsigned int shift, unsigned int offset,
unsigned int count, unsigned int nr_values)
{
struct radix_tree_node *ret = NULL;
/*
* Preload code isn't irq safe and it doesn't make sense to use
* preloading during an interrupt anyway as all the allocations have
* to be atomic. So just do normal allocation when in interrupt.
*/
if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
struct radix_tree_preload *rtp;
/*
* Even if the caller has preloaded, try to allocate from the
* cache first for the new node to get accounted to the memory
* cgroup.
*/
ret = kmem_cache_alloc(radix_tree_node_cachep,
gfp_mask | __GFP_NOWARN);
if (ret)
goto out;
/*
* Provided the caller has preloaded here, we will always
* succeed in getting a node here (and never reach
* kmem_cache_alloc)
*/
rtp = this_cpu_ptr(&radix_tree_preloads);
if (rtp->nr) {
ret = rtp->nodes;
rtp->nodes = ret->parent;
rtp->nr--;
}
/*
* Update the allocation stack trace as this is more useful
* for debugging.
*/
kmemleak_update_trace(ret);
goto out;
}
ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
out:
BUG_ON(radix_tree_is_internal_node(ret)); if (ret) { ret->shift = shift;
ret->offset = offset;
ret->count = count;
ret->nr_values = nr_values;
ret->parent = parent;
ret->array = root;
}
return ret;
}
void radix_tree_node_rcu_free(struct rcu_head *head)
{
struct radix_tree_node *node =
container_of(head, struct radix_tree_node, rcu_head);
/*
* Must only free zeroed nodes into the slab. We can be left with
* non-NULL entries by radix_tree_free_nodes, so clear the entries
* and tags here.
*/
memset(node->slots, 0, sizeof(node->slots));
memset(node->tags, 0, sizeof(node->tags));
INIT_LIST_HEAD(&node->private_list);
kmem_cache_free(radix_tree_node_cachep, node);
}
static inline void
radix_tree_node_free(struct radix_tree_node *node)
{
call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}
/*
* Load up this CPU's radix_tree_node buffer with sufficient objects to
* ensure that the addition of a single element in the tree cannot fail. On
* success, return zero, with preemption disabled. On error, return -ENOMEM
* with preemption not disabled.
*
* To make use of this facility, the radix tree must be initialised without
* __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
*/
static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
{
struct radix_tree_preload *rtp;
struct radix_tree_node *node;
int ret = -ENOMEM;
/*
* Nodes preloaded by one cgroup can be used by another cgroup, so
* they should never be accounted to any particular memory cgroup.
*/
gfp_mask &= ~__GFP_ACCOUNT;
local_lock(&radix_tree_preloads.lock);
rtp = this_cpu_ptr(&radix_tree_preloads);
while (rtp->nr < nr) { local_unlock(&radix_tree_preloads.lock);
node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
if (node == NULL)
goto out;
local_lock(&radix_tree_preloads.lock);
rtp = this_cpu_ptr(&radix_tree_preloads);
if (rtp->nr < nr) {
node->parent = rtp->nodes;
rtp->nodes = node;
rtp->nr++;
} else {
kmem_cache_free(radix_tree_node_cachep, node);
}
}
ret = 0;
out:
return ret;
}
/*
* Load up this CPU's radix_tree_node buffer with sufficient objects to
* ensure that the addition of a single element in the tree cannot fail. On
* success, return zero, with preemption disabled. On error, return -ENOMEM
* with preemption not disabled.
*
* To make use of this facility, the radix tree must be initialised without
* __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
*/
int radix_tree_preload(gfp_t gfp_mask)
{
/* Warn on non-sensical use... */
WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}
EXPORT_SYMBOL(radix_tree_preload);
/*
* The same as above function, except we don't guarantee preloading happens.
* We do it, if we decide it helps. On success, return zero with preemption
* disabled. On error, return -ENOMEM with preemption not disabled.
*/
int radix_tree_maybe_preload(gfp_t gfp_mask)
{
if (gfpflags_allow_blocking(gfp_mask))
return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
/* Preloading doesn't help anything with this gfp mask, skip it */
local_lock(&radix_tree_preloads.lock);
return 0;
}
EXPORT_SYMBOL(radix_tree_maybe_preload);
static unsigned radix_tree_load_root(const struct radix_tree_root *root,
struct radix_tree_node **nodep, unsigned long *maxindex)
{
struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
*nodep = node;
if (likely(radix_tree_is_internal_node(node))) {
node = entry_to_node(node);
*maxindex = node_maxindex(node);
return node->shift + RADIX_TREE_MAP_SHIFT;
}
*maxindex = 0;
return 0;
}
/*
* Extend a radix tree so it can store key @index.
*/
static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
unsigned long index, unsigned int shift)
{
void *entry;
unsigned int maxshift;
int tag;
/* Figure out what the shift should be. */
maxshift = shift;
while (index > shift_maxindex(maxshift))
maxshift += RADIX_TREE_MAP_SHIFT; entry = rcu_dereference_raw(root->xa_head); if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
goto out;
do {
struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL,
root, shift, 0, 1, 0);
if (!node)
return -ENOMEM;
if (is_idr(root)) {
all_tag_set(node, IDR_FREE);
if (!root_tag_get(root, IDR_FREE)) {
tag_clear(node, IDR_FREE, 0);
root_tag_set(root, IDR_FREE);
}
} else {
/* Propagate the aggregated tag info to the new child */
for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { if (root_tag_get(root, tag))
tag_set(node, tag, 0);
}
}
BUG_ON(shift > BITS_PER_LONG);
if (radix_tree_is_internal_node(entry)) {
entry_to_node(entry)->parent = node;
} else if (xa_is_value(entry)) {
/* Moving a value entry root->xa_head to a node */
node->nr_values = 1;
}
/*
* entry was already in the radix tree, so we do not need
* rcu_assign_pointer here
*/
node->slots[0] = (void __rcu *)entry;
entry = node_to_entry(node);
rcu_assign_pointer(root->xa_head, entry);
shift += RADIX_TREE_MAP_SHIFT;
} while (shift <= maxshift);
out:
return maxshift + RADIX_TREE_MAP_SHIFT;
}
/**
* radix_tree_shrink - shrink radix tree to minimum height
* @root: radix tree root
*/
static inline bool radix_tree_shrink(struct radix_tree_root *root)
{
bool shrunk = false;
for (;;) {
struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
struct radix_tree_node *child;
if (!radix_tree_is_internal_node(node))
break;
node = entry_to_node(node);
/*
* The candidate node has more than one child, or its child
* is not at the leftmost slot, we cannot shrink.
*/
if (node->count != 1)
break;
child = rcu_dereference_raw(node->slots[0]);
if (!child)
break;
/*
* For an IDR, we must not shrink entry 0 into the root in
* case somebody calls idr_replace() with a pointer that
* appears to be an internal entry
*/
if (!node->shift && is_idr(root))
break;
if (radix_tree_is_internal_node(child))
entry_to_node(child)->parent = NULL;
/*
* We don't need rcu_assign_pointer(), since we are simply
* moving the node from one part of the tree to another: if it
* was safe to dereference the old pointer to it
* (node->slots[0]), it will be safe to dereference the new
* one (root->xa_head) as far as dependent read barriers go.
*/
root->xa_head = (void __rcu *)child;
if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
root_tag_clear(root, IDR_FREE);
/*
* We have a dilemma here. The node's slot[0] must not be
* NULLed in case there are concurrent lookups expecting to
* find the item. However if this was a bottom-level node,
* then it may be subject to the slot pointer being visible
* to callers dereferencing it. If item corresponding to
* slot[0] is subsequently deleted, these callers would expect
* their slot to become empty sooner or later.
*
* For example, lockless pagecache will look up a slot, deref
* the page pointer, and if the page has 0 refcount it means it
* was concurrently deleted from pagecache so try the deref
* again. Fortunately there is already a requirement for logic
* to retry the entire slot lookup -- the indirect pointer
* problem (replacing direct root node with an indirect pointer
* also results in a stale slot). So tag the slot as indirect
* to force callers to retry.
*/
node->count = 0;
if (!radix_tree_is_internal_node(child)) {
node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
}
WARN_ON_ONCE(!list_empty(&node->private_list));
radix_tree_node_free(node);
shrunk = true;
}
return shrunk;
}
static bool delete_node(struct radix_tree_root *root,
struct radix_tree_node *node)
{
bool deleted = false;
do {
struct radix_tree_node *parent;
if (node->count) {
if (node_to_entry(node) ==
rcu_dereference_raw(root->xa_head))
deleted |= radix_tree_shrink(root);
return deleted;
}
parent = node->parent;
if (parent) {
parent->slots[node->offset] = NULL;
parent->count--;
} else {
/*
* Shouldn't the tags already have all been cleared
* by the caller?
*/
if (!is_idr(root))
root_tag_clear_all(root);
root->xa_head = NULL;
}
WARN_ON_ONCE(!list_empty(&node->private_list));
radix_tree_node_free(node);
deleted = true;
node = parent;
} while (node);
return deleted;
}
/**
* __radix_tree_create - create a slot in a radix tree
* @root: radix tree root
* @index: index key
* @nodep: returns node
* @slotp: returns slot
*
* Create, if necessary, and return the node and slot for an item
* at position @index in the radix tree @root.
*
* Until there is more than one item in the tree, no nodes are
* allocated and @root->xa_head is used as a direct slot instead of
* pointing to a node, in which case *@nodep will be NULL.
*
* Returns -ENOMEM, or 0 for success.
*/
static int __radix_tree_create(struct radix_tree_root *root,
unsigned long index, struct radix_tree_node **nodep,
void __rcu ***slotp)
{
struct radix_tree_node *node = NULL, *child;
void __rcu **slot = (void __rcu **)&root->xa_head;
unsigned long maxindex;
unsigned int shift, offset = 0;
unsigned long max = index;
gfp_t gfp = root_gfp_mask(root);
shift = radix_tree_load_root(root, &child, &maxindex);
/* Make sure the tree is high enough. */
if (max > maxindex) { int error = radix_tree_extend(root, gfp, max, shift);
if (error < 0)
return error;
shift = error;
child = rcu_dereference_raw(root->xa_head);
}
while (shift > 0) {
shift -= RADIX_TREE_MAP_SHIFT;
if (child == NULL) {
/* Have to add a child node. */
child = radix_tree_node_alloc(gfp, node, root, shift,
offset, 0, 0);
if (!child)
return -ENOMEM;
rcu_assign_pointer(*slot, node_to_entry(child));
if (node)
node->count++;
} else if (!radix_tree_is_internal_node(child))
break;
/* Go a level down */
node = entry_to_node(child);
offset = radix_tree_descend(node, &child, index);
slot = &node->slots[offset];
}
if (nodep)
*nodep = node;
if (slotp)
*slotp = slot;
return 0;
}
/*
* Free any nodes below this node. The tree is presumed to not need
* shrinking, and any user data in the tree is presumed to not need a
* destructor called on it. If we need to add a destructor, we can
* add that functionality later. Note that we may not clear tags or
* slots from the tree as an RCU walker may still have a pointer into
* this subtree. We could replace the entries with RADIX_TREE_RETRY,
* but we'll still have to clear those in rcu_free.
*/
static void radix_tree_free_nodes(struct radix_tree_node *node)
{
unsigned offset = 0;
struct radix_tree_node *child = entry_to_node(node);
for (;;) {
void *entry = rcu_dereference_raw(child->slots[offset]);
if (xa_is_node(entry) && child->shift) {
child = entry_to_node(entry);
offset = 0;
continue;
}
offset++;
while (offset == RADIX_TREE_MAP_SIZE) {
struct radix_tree_node *old = child;
offset = child->offset + 1;
child = child->parent;
WARN_ON_ONCE(!list_empty(&old->private_list));
radix_tree_node_free(old);
if (old == entry_to_node(node))
return;
}
}
}
static inline int insert_entries(struct radix_tree_node *node,
void __rcu **slot, void *item, bool replace)
{
if (*slot)
return -EEXIST;
rcu_assign_pointer(*slot, item);
if (node) {
node->count++;
if (xa_is_value(item))
node->nr_values++;
}
return 1;
}
/**
* radix_tree_insert - insert into a radix tree
* @root: radix tree root
* @index: index key
* @item: item to insert
*
* Insert an item into the radix tree at position @index.
*/
int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
void *item)
{
struct radix_tree_node *node;
void __rcu **slot;
int error;
BUG_ON(radix_tree_is_internal_node(item));
error = __radix_tree_create(root, index, &node, &slot);
if (error)
return error;
error = insert_entries(node, slot, item, false);
if (error < 0)
return error;
if (node) {
unsigned offset = get_slot_offset(node, slot);
BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset));
} else {
BUG_ON(root_tags_get(root));
}
return 0;
}
EXPORT_SYMBOL(radix_tree_insert);
/**
* __radix_tree_lookup - lookup an item in a radix tree
* @root: radix tree root
* @index: index key
* @nodep: returns node
* @slotp: returns slot
*
* Lookup and return the item at position @index in the radix
* tree @root.
*
* Until there is more than one item in the tree, no nodes are
* allocated and @root->xa_head is used as a direct slot instead of
* pointing to a node, in which case *@nodep will be NULL.
*/
void *__radix_tree_lookup(const struct radix_tree_root *root,
unsigned long index, struct radix_tree_node **nodep,
void __rcu ***slotp)
{
struct radix_tree_node *node, *parent;
unsigned long maxindex;
void __rcu **slot;
restart:
parent = NULL;
slot = (void __rcu **)&root->xa_head;
radix_tree_load_root(root, &node, &maxindex);
if (index > maxindex)
return NULL;
while (radix_tree_is_internal_node(node)) {
unsigned offset;
parent = entry_to_node(node);
offset = radix_tree_descend(parent, &node, index);
slot = parent->slots + offset;
if (node == RADIX_TREE_RETRY)
goto restart;
if (parent->shift == 0)
break;
}
if (nodep) *nodep = parent; if (slotp) *slotp = slot;
return node;
}
/**
* radix_tree_lookup_slot - lookup a slot in a radix tree
* @root: radix tree root
* @index: index key
*
* Returns: the slot corresponding to the position @index in the
* radix tree @root. This is useful for update-if-exists operations.
*
* This function can be called under rcu_read_lock iff the slot is not
* modified by radix_tree_replace_slot, otherwise it must be called
* exclusive from other writers. Any dereference of the slot must be done
* using radix_tree_deref_slot.
*/
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root,
unsigned long index)
{
void __rcu **slot;
if (!__radix_tree_lookup(root, index, NULL, &slot))
return NULL;
return slot;
}
EXPORT_SYMBOL(radix_tree_lookup_slot);
/**
* radix_tree_lookup - perform lookup operation on a radix tree
* @root: radix tree root
* @index: index key
*
* Lookup the item at the position @index in the radix tree @root.
*
* This function can be called under rcu_read_lock, however the caller
* must manage lifetimes of leaf nodes (eg. RCU may also be used to free
* them safely). No RCU barriers are required to access or modify the
* returned item, however.
*/
void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
{
return __radix_tree_lookup(root, index, NULL, NULL);
}
EXPORT_SYMBOL(radix_tree_lookup);
static void replace_slot(void __rcu **slot, void *item,
struct radix_tree_node *node, int count, int values)
{
if (node && (count || values)) { node->count += count;
node->nr_values += values;
}
rcu_assign_pointer(*slot, item);
}
static bool node_tag_get(const struct radix_tree_root *root,
const struct radix_tree_node *node,
unsigned int tag, unsigned int offset)
{
if (node)
return tag_get(node, tag, offset);
return root_tag_get(root, tag);
}
/*
* IDR users want to be able to store NULL in the tree, so if the slot isn't
* free, don't adjust the count, even if it's transitioning between NULL and
* non-NULL. For the IDA, we mark slots as being IDR_FREE while they still
* have empty bits, but it only stores NULL in slots when they're being
* deleted.
*/
static int calculate_count(struct radix_tree_root *root,
struct radix_tree_node *node, void __rcu **slot,
void *item, void *old)
{
if (is_idr(root)) {
unsigned offset = get_slot_offset(node, slot);
bool free = node_tag_get(root, node, IDR_FREE, offset);
if (!free)
return 0;
if (!old)
return 1;
}
return !!item - !!old;
}
/**
* __radix_tree_replace - replace item in a slot
* @root: radix tree root
* @node: pointer to tree node
* @slot: pointer to slot in @node
* @item: new item to store in the slot.
*
* For use with __radix_tree_lookup(). Caller must hold tree write locked
* across slot lookup and replacement.
*/
void __radix_tree_replace(struct radix_tree_root *root,
struct radix_tree_node *node,
void __rcu **slot, void *item)
{
void *old = rcu_dereference_raw(*slot);
int values = !!xa_is_value(item) - !!xa_is_value(old);
int count = calculate_count(root, node, slot, item, old);
/*
* This function supports replacing value entries and
* deleting entries, but that needs accounting against the
* node unless the slot is root->xa_head.
*/
WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) &&
(count || values));
replace_slot(slot, item, node, count, values);
if (!node)
return;
delete_node(root, node);
}
/**
* radix_tree_replace_slot - replace item in a slot
* @root: radix tree root
* @slot: pointer to slot
* @item: new item to store in the slot.
*
* For use with radix_tree_lookup_slot() and
* radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked
* across slot lookup and replacement.
*
* NOTE: This cannot be used to switch between non-entries (empty slots),
* regular entries, and value entries, as that requires accounting
* inside the radix tree node. When switching from one type of entry or
* deleting, use __radix_tree_lookup() and __radix_tree_replace() or
* radix_tree_iter_replace().
*/
void radix_tree_replace_slot(struct radix_tree_root *root,
void __rcu **slot, void *item)
{
__radix_tree_replace(root, NULL, slot, item);
}
EXPORT_SYMBOL(radix_tree_replace_slot);
/**
* radix_tree_iter_replace - replace item in a slot
* @root: radix tree root
* @iter: iterator state
* @slot: pointer to slot
* @item: new item to store in the slot.
*
* For use with radix_tree_for_each_slot().
* Caller must hold tree write locked.
*/
void radix_tree_iter_replace(struct radix_tree_root *root,
const struct radix_tree_iter *iter,
void __rcu **slot, void *item)
{
__radix_tree_replace(root, iter->node, slot, item);
}
static void node_tag_set(struct radix_tree_root *root,
struct radix_tree_node *node,
unsigned int tag, unsigned int offset)
{
while (node) {
if (tag_get(node, tag, offset))
return;
tag_set(node, tag, offset);
offset = node->offset;
node = node->parent;
}
if (!root_tag_get(root, tag))
root_tag_set(root, tag);
}
/**
* radix_tree_tag_set - set a tag on a radix tree node
* @root: radix tree root
* @index: index key
* @tag: tag index
*
* Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
* corresponding to @index in the radix tree. From
* the root all the way down to the leaf node.
*
* Returns the address of the tagged item. Setting a tag on a not-present
* item is a bug.
*/
void *radix_tree_tag_set(struct radix_tree_root *root,
unsigned long index, unsigned int tag)
{
struct radix_tree_node *node, *parent;
unsigned long maxindex;
radix_tree_load_root(root, &node, &maxindex);
BUG_ON(index > maxindex); while (radix_tree_is_internal_node(node)) {
unsigned offset;
parent = entry_to_node(node);
offset = radix_tree_descend(parent, &node, index);
BUG_ON(!node);
if (!tag_get(parent, tag, offset))
tag_set(parent, tag, offset);
}
/* set the root's tag bit */
if (!root_tag_get(root, tag))
root_tag_set(root, tag);
return node;
}
EXPORT_SYMBOL(radix_tree_tag_set);
static void node_tag_clear(struct radix_tree_root *root,
struct radix_tree_node *node,
unsigned int tag, unsigned int offset)
{
while (node) { if (!tag_get(node, tag, offset))
return;
tag_clear(node, tag, offset);
if (any_tag_set(node, tag))
return;
offset = node->offset;
node = node->parent;
}
/* clear the root's tag bit */
if (root_tag_get(root, tag))
root_tag_clear(root, tag);
}
/**
* radix_tree_tag_clear - clear a tag on a radix tree node
* @root: radix tree root
* @index: index key
* @tag: tag index
*
* Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
* corresponding to @index in the radix tree. If this causes
* the leaf node to have no tags set then clear the tag in the
* next-to-leaf node, etc.
*
* Returns the address of the tagged item on success, else NULL. ie:
* has the same return value and semantics as radix_tree_lookup().
*/
void *radix_tree_tag_clear(struct radix_tree_root *root,
unsigned long index, unsigned int tag)
{
struct radix_tree_node *node, *parent;
unsigned long maxindex;
int offset;
radix_tree_load_root(root, &node, &maxindex);
if (index > maxindex)
return NULL;
parent = NULL;
while (radix_tree_is_internal_node(node)) {
parent = entry_to_node(node);
offset = radix_tree_descend(parent, &node, index);
}
if (node) node_tag_clear(root, parent, tag, offset);
return node;
}
EXPORT_SYMBOL(radix_tree_tag_clear);
/**
* radix_tree_iter_tag_clear - clear a tag on the current iterator entry
* @root: radix tree root
* @iter: iterator state
* @tag: tag to clear
*/
void radix_tree_iter_tag_clear(struct radix_tree_root *root,
const struct radix_tree_iter *iter, unsigned int tag)
{
node_tag_clear(root, iter->node, tag, iter_offset(iter));
}
/**
* radix_tree_tag_get - get a tag on a radix tree node
* @root: radix tree root
* @index: index key
* @tag: tag index (< RADIX_TREE_MAX_TAGS)
*
* Return values:
*
* 0: tag not present or not set
* 1: tag set
*
* Note that the return value of this function may not be relied on, even if
* the RCU lock is held, unless tag modification and node deletion are excluded
* from concurrency.
*/
int radix_tree_tag_get(const struct radix_tree_root *root,
unsigned long index, unsigned int tag)
{
struct radix_tree_node *node, *parent;
unsigned long maxindex;
if (!root_tag_get(root, tag))
return 0;
radix_tree_load_root(root, &node, &maxindex);
if (index > maxindex)
return 0;
while (radix_tree_is_internal_node(node)) {
unsigned offset;
parent = entry_to_node(node);
offset = radix_tree_descend(parent, &node, index);
if (!tag_get(parent, tag, offset))
return 0;
if (node == RADIX_TREE_RETRY)
break;
}
return 1;
}
EXPORT_SYMBOL(radix_tree_tag_get);
/* Construct iter->tags bit-mask from node->tags[tag] array */
static void set_iter_tags(struct radix_tree_iter *iter,
struct radix_tree_node *node, unsigned offset,
unsigned tag)
{
unsigned tag_long = offset / BITS_PER_LONG;
unsigned tag_bit = offset % BITS_PER_LONG;
if (!node) {
iter->tags = 1;
return;
}
iter->tags = node->tags[tag][tag_long] >> tag_bit;
/* This never happens if RADIX_TREE_TAG_LONGS == 1 */
if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
/* Pick tags from next element */
if (tag_bit)
iter->tags |= node->tags[tag][tag_long + 1] <<
(BITS_PER_LONG - tag_bit);
/* Clip chunk size, here only BITS_PER_LONG tags */
iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
}
}
void __rcu **radix_tree_iter_resume(void __rcu **slot,
struct radix_tree_iter *iter)
{
slot++;
iter->index = __radix_tree_iter_add(iter, 1);
iter->next_index = iter->index;
iter->tags = 0;
return NULL;
}
EXPORT_SYMBOL(radix_tree_iter_resume);
/**
* radix_tree_next_chunk - find next chunk of slots for iteration
*
* @root: radix tree root
* @iter: iterator state
* @flags: RADIX_TREE_ITER_* flags and tag index
* Returns: pointer to chunk first slot, or NULL if iteration is over
*/
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
struct radix_tree_iter *iter, unsigned flags)
{
unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
struct radix_tree_node *node, *child;
unsigned long index, offset, maxindex;
if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL;
/*
* Catch next_index overflow after ~0UL. iter->index never overflows
* during iterating; it can be zero only at the beginning.
* And we cannot overflow iter->next_index in a single step,
* because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
*
* This condition also used by radix_tree_next_slot() to stop
* contiguous iterating, and forbid switching to the next chunk.
*/
index = iter->next_index; if (!index && iter->index)
return NULL;
restart:
radix_tree_load_root(root, &child, &maxindex);
if (index > maxindex)
return NULL;
if (!child)
return NULL;
if (!radix_tree_is_internal_node(child)) {
/* Single-slot tree */
iter->index = index;
iter->next_index = maxindex + 1;
iter->tags = 1;
iter->node = NULL;
return (void __rcu **)&root->xa_head;
}
do {
node = entry_to_node(child);
offset = radix_tree_descend(node, &child, index);
if ((flags & RADIX_TREE_ITER_TAGGED) ?
!tag_get(node, tag, offset) : !child) {
/* Hole detected */
if (flags & RADIX_TREE_ITER_CONTIG)
return NULL;
if (flags & RADIX_TREE_ITER_TAGGED)
offset = radix_tree_find_next_bit(node, tag,
offset + 1);
else
while (++offset < RADIX_TREE_MAP_SIZE) { void *slot = rcu_dereference_raw(
node->slots[offset]);
if (slot)
break;
}
index &= ~node_maxindex(node);
index += offset << node->shift;
/* Overflow after ~0UL */
if (!index)
return NULL;
if (offset == RADIX_TREE_MAP_SIZE)
goto restart;
child = rcu_dereference_raw(node->slots[offset]);
}
if (!child)
goto restart;
if (child == RADIX_TREE_RETRY)
break;
} while (node->shift && radix_tree_is_internal_node(child));
/* Update the iterator state */
iter->index = (index &~ node_maxindex(node)) | offset;
iter->next_index = (index | node_maxindex(node)) + 1;
iter->node = node;
if (flags & RADIX_TREE_ITER_TAGGED)
set_iter_tags(iter, node, offset, tag); return node->slots + offset;
}
EXPORT_SYMBOL(radix_tree_next_chunk);
/**
* radix_tree_gang_lookup - perform multiple lookup on a radix tree
* @root: radix tree root
* @results: where the results of the lookup are placed
* @first_index: start the lookup from this key
* @max_items: place up to this many items at *results
*
* Performs an index-ascending scan of the tree for present items. Places
* them at *@results and returns the number of items which were placed at
* *@results.
*
* The implementation is naive.
*
* Like radix_tree_lookup, radix_tree_gang_lookup may be called under
* rcu_read_lock. In this case, rather than the returned results being
* an atomic snapshot of the tree at a single point in time, the
* semantics of an RCU protected gang lookup are as though multiple
* radix_tree_lookups have been issued in individual locks, and results
* stored in 'results'.
*/
unsigned int
radix_tree_gang_lookup(const struct radix_tree_root *root, void **results,
unsigned long first_index, unsigned int max_items)
{
struct radix_tree_iter iter;
void __rcu **slot;
unsigned int ret = 0;
if (unlikely(!max_items))
return 0;
radix_tree_for_each_slot(slot, root, &iter, first_index) { results[ret] = rcu_dereference_raw(*slot);
if (!results[ret])
continue;
if (radix_tree_is_internal_node(results[ret])) {
slot = radix_tree_iter_retry(&iter);
continue;
}
if (++ret == max_items)
break;
}
return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup);
/**
* radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
* based on a tag
* @root: radix tree root
* @results: where the results of the lookup are placed
* @first_index: start the lookup from this key
* @max_items: place up to this many items at *results
* @tag: the tag index (< RADIX_TREE_MAX_TAGS)
*
* Performs an index-ascending scan of the tree for present items which
* have the tag indexed by @tag set. Places the items at *@results and
* returns the number of items which were placed at *@results.
*/
unsigned int
radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results,
unsigned long first_index, unsigned int max_items,
unsigned int tag)
{
struct radix_tree_iter iter;
void __rcu **slot;
unsigned int ret = 0;
if (unlikely(!max_items))
return 0;
radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = rcu_dereference_raw(*slot);
if (!results[ret])
continue;
if (radix_tree_is_internal_node(results[ret])) {
slot = radix_tree_iter_retry(&iter);
continue;
}
if (++ret == max_items)
break;
}
return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
/**
* radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a
* radix tree based on a tag
* @root: radix tree root
* @results: where the results of the lookup are placed
* @first_index: start the lookup from this key
* @max_items: place up to this many items at *results
* @tag: the tag index (< RADIX_TREE_MAX_TAGS)
*
* Performs an index-ascending scan of the tree for present items which
* have the tag indexed by @tag set. Places the slots at *@results and
* returns the number of slots which were placed at *@results.
*/
unsigned int
radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root,
void __rcu ***results, unsigned long first_index,
unsigned int max_items, unsigned int tag)
{
struct radix_tree_iter iter;
void __rcu **slot;
unsigned int ret = 0;
if (unlikely(!max_items))
return 0;
radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
results[ret] = slot;
if (++ret == max_items)
break;
}
return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
static bool __radix_tree_delete(struct radix_tree_root *root,
struct radix_tree_node *node, void __rcu **slot)
{
void *old = rcu_dereference_raw(*slot);
int values = xa_is_value(old) ? -1 : 0;
unsigned offset = get_slot_offset(node, slot);
int tag;
if (is_idr(root))
node_tag_set(root, node, IDR_FREE, offset);
else
for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
node_tag_clear(root, node, tag, offset);
replace_slot(slot, NULL, node, -1, values);
return node && delete_node(root, node);
}
/**
* radix_tree_iter_delete - delete the entry at this iterator position
* @root: radix tree root
* @iter: iterator state
* @slot: pointer to slot
*
* Delete the entry at the position currently pointed to by the iterator.
* This may result in the current node being freed; if it is, the iterator
* is advanced so that it will not reference the freed memory. This
* function may be called without any locking if there are no other threads
* which can access this tree.
*/
void radix_tree_iter_delete(struct radix_tree_root *root,
struct radix_tree_iter *iter, void __rcu **slot)
{
if (__radix_tree_delete(root, iter->node, slot))
iter->index = iter->next_index;
}
EXPORT_SYMBOL(radix_tree_iter_delete);
/**
* radix_tree_delete_item - delete an item from a radix tree
* @root: radix tree root
* @index: index key
* @item: expected item
*
* Remove @item at @index from the radix tree rooted at @root.
*
* Return: the deleted entry, or %NULL if it was not present
* or the entry at the given @index was not @item.
*/
void *radix_tree_delete_item(struct radix_tree_root *root,
unsigned long index, void *item)
{
struct radix_tree_node *node = NULL;
void __rcu **slot = NULL;
void *entry;
entry = __radix_tree_lookup(root, index, &node, &slot);
if (!slot)
return NULL;
if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE, get_slot_offset(node, slot))))
return NULL;
if (item && entry != item)
return NULL;
__radix_tree_delete(root, node, slot); return entry;
}
EXPORT_SYMBOL(radix_tree_delete_item);
/**
* radix_tree_delete - delete an entry from a radix tree
* @root: radix tree root
* @index: index key
*
* Remove the entry at @index from the radix tree rooted at @root.
*
* Return: The deleted entry, or %NULL if it was not present.
*/
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
return radix_tree_delete_item(root, index, NULL);
}
EXPORT_SYMBOL(radix_tree_delete);
/**
* radix_tree_tagged - test whether any items in the tree are tagged
* @root: radix tree root
* @tag: tag to test
*/
int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag)
{
return root_tag_get(root, tag);
}
EXPORT_SYMBOL(radix_tree_tagged);
/**
* idr_preload - preload for idr_alloc()
* @gfp_mask: allocation mask to use for preloading
*
* Preallocate memory to use for the next call to idr_alloc(). This function
* returns with preemption disabled. It will be enabled by idr_preload_end().
*/
void idr_preload(gfp_t gfp_mask)
{
if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE)) local_lock(&radix_tree_preloads.lock);
}
EXPORT_SYMBOL(idr_preload);
void __rcu **idr_get_free(struct radix_tree_root *root,
struct radix_tree_iter *iter, gfp_t gfp,
unsigned long max)
{
struct radix_tree_node *node = NULL, *child;
void __rcu **slot = (void __rcu **)&root->xa_head;
unsigned long maxindex, start = iter->next_index;
unsigned int shift, offset = 0;
grow:
shift = radix_tree_load_root(root, &child, &maxindex);
if (!radix_tree_tagged(root, IDR_FREE))
start = max(start, maxindex + 1); if (start > max)
return ERR_PTR(-ENOSPC);
if (start > maxindex) { int error = radix_tree_extend(root, gfp, start, shift);
if (error < 0)
return ERR_PTR(error); shift = error;
child = rcu_dereference_raw(root->xa_head);
}
if (start == 0 && shift == 0) shift = RADIX_TREE_MAP_SHIFT; while (shift) { shift -= RADIX_TREE_MAP_SHIFT;
if (child == NULL) {
/* Have to add a child node. */
child = radix_tree_node_alloc(gfp, node, root, shift,
offset, 0, 0);
if (!child)
return ERR_PTR(-ENOMEM);
all_tag_set(child, IDR_FREE);
rcu_assign_pointer(*slot, node_to_entry(child));
if (node)
node->count++;
} else if (!radix_tree_is_internal_node(child))
break;
node = entry_to_node(child);
offset = radix_tree_descend(node, &child, start);
if (!tag_get(node, IDR_FREE, offset)) {
offset = radix_tree_find_next_bit(node, IDR_FREE,
offset + 1);
start = next_index(start, node, offset);
if (start > max || start == 0)
return ERR_PTR(-ENOSPC);
while (offset == RADIX_TREE_MAP_SIZE) { offset = node->offset + 1;
node = node->parent;
if (!node)
goto grow;
shift = node->shift;
}
child = rcu_dereference_raw(node->slots[offset]);
}
slot = &node->slots[offset];
}
iter->index = start;
if (node)
iter->next_index = 1 + min(max, (start | node_maxindex(node)));
else
iter->next_index = 1;
iter->node = node;
set_iter_tags(iter, node, offset, IDR_FREE);
return slot;
}
/**
* idr_destroy - release all internal memory from an IDR
* @idr: idr handle
*
* After this function is called, the IDR is empty, and may be reused or
* the data structure containing it may be freed.
*
* A typical clean-up sequence for objects stored in an idr tree will use
* idr_for_each() to free all objects, if necessary, then idr_destroy() to
* free the memory used to keep track of those objects.
*/
void idr_destroy(struct idr *idr)
{
struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head);
if (radix_tree_is_internal_node(node))
radix_tree_free_nodes(node);
idr->idr_rt.xa_head = NULL;
root_tag_set(&idr->idr_rt, IDR_FREE);
}
EXPORT_SYMBOL(idr_destroy);
static void
radix_tree_node_ctor(void *arg)
{
struct radix_tree_node *node = arg;
memset(node, 0, sizeof(*node));
INIT_LIST_HEAD(&node->private_list);
}
static int radix_tree_cpu_dead(unsigned int cpu)
{
struct radix_tree_preload *rtp;
struct radix_tree_node *node;
/* Free per-cpu pool of preloaded nodes */
rtp = &per_cpu(radix_tree_preloads, cpu);
while (rtp->nr) {
node = rtp->nodes;
rtp->nodes = node->parent;
kmem_cache_free(radix_tree_node_cachep, node);
rtp->nr--;
}
return 0;
}
void __init radix_tree_init(void)
{
int ret;
BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK);
BUILD_BUG_ON(XA_CHUNK_SIZE > 255);
radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
sizeof(struct radix_tree_node), 0,
SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
radix_tree_node_ctor);
ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
NULL, radix_tree_cpu_dead);
WARN_ON(ret < 0);
}
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* VLAN An implementation of 802.1Q VLAN tagging.
*
* Authors: Ben Greear <greearb@candelatech.com>
*/
#ifndef _LINUX_IF_VLAN_H_
#define _LINUX_IF_VLAN_H_
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/bug.h>
#include <uapi/linux/if_vlan.h>
#define VLAN_HLEN 4 /* The additional bytes required by VLAN
* (in addition to the Ethernet header)
*/
#define VLAN_ETH_HLEN 18 /* Total octets in header. */
#define VLAN_ETH_ZLEN 64 /* Min. octets in frame sans FCS */
/*
* According to 802.3ac, the packet can be 4 bytes longer. --Klika Jan
*/
#define VLAN_ETH_DATA_LEN 1500 /* Max. octets in payload */
#define VLAN_ETH_FRAME_LEN 1518 /* Max. octets in frame sans FCS */
#define VLAN_MAX_DEPTH 8 /* Max. number of nested VLAN tags parsed */
/*
* struct vlan_hdr - vlan header
* @h_vlan_TCI: priority and VLAN ID
* @h_vlan_encapsulated_proto: packet type ID or len
*/
struct vlan_hdr {
__be16 h_vlan_TCI;
__be16 h_vlan_encapsulated_proto;
};
/**
* struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr)
* @h_dest: destination ethernet address
* @h_source: source ethernet address
* @h_vlan_proto: ethernet protocol
* @h_vlan_TCI: priority and VLAN ID
* @h_vlan_encapsulated_proto: packet type ID or len
*/
struct vlan_ethhdr {
unsigned char h_dest[ETH_ALEN];
unsigned char h_source[ETH_ALEN];
__be16 h_vlan_proto;
__be16 h_vlan_TCI;
__be16 h_vlan_encapsulated_proto;
};
#include <linux/skbuff.h>
static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb)
{
return (struct vlan_ethhdr *)skb_mac_header(skb);
}
#define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */
#define VLAN_PRIO_SHIFT 13
#define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */
#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */
#define VLAN_N_VID 4096
/* found in socket.c */
extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *));
static inline bool is_vlan_dev(const struct net_device *dev)
{
return dev->priv_flags & IFF_802_1Q_VLAN;
}
#define skb_vlan_tag_present(__skb) ((__skb)->vlan_present)
#define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci)
#define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK)
#define skb_vlan_tag_get_cfi(__skb) (!!((__skb)->vlan_tci & VLAN_CFI_MASK))
#define skb_vlan_tag_get_prio(__skb) (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT)
static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev)
{
ASSERT_RTNL();
return notifier_to_errno(call_netdevice_notifiers(NETDEV_CVLAN_FILTER_PUSH_INFO, dev));
}
static inline void vlan_drop_rx_ctag_filter_info(struct net_device *dev)
{
ASSERT_RTNL();
call_netdevice_notifiers(NETDEV_CVLAN_FILTER_DROP_INFO, dev);
}
static inline int vlan_get_rx_stag_filter_info(struct net_device *dev)
{
ASSERT_RTNL();
return notifier_to_errno(call_netdevice_notifiers(NETDEV_SVLAN_FILTER_PUSH_INFO, dev));
}
static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev)
{
ASSERT_RTNL();
call_netdevice_notifiers(NETDEV_SVLAN_FILTER_DROP_INFO, dev);
}
/**
* struct vlan_pcpu_stats - VLAN percpu rx/tx stats
* @rx_packets: number of received packets
* @rx_bytes: number of received bytes
* @rx_multicast: number of received multicast packets
* @tx_packets: number of transmitted packets
* @tx_bytes: number of transmitted bytes
* @syncp: synchronization point for 64bit counters
* @rx_errors: number of rx errors
* @tx_dropped: number of tx drops
*/
struct vlan_pcpu_stats {
u64 rx_packets;
u64 rx_bytes;
u64 rx_multicast;
u64 tx_packets;
u64 tx_bytes;
struct u64_stats_sync syncp;
u32 rx_errors;
u32 tx_dropped;
};
#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev,
__be16 vlan_proto, u16 vlan_id);
extern int vlan_for_each(struct net_device *dev,
int (*action)(struct net_device *dev, int vid,
void *arg), void *arg);
extern struct net_device *vlan_dev_real_dev(const struct net_device *dev);
extern u16 vlan_dev_vlan_id(const struct net_device *dev);
extern __be16 vlan_dev_vlan_proto(const struct net_device *dev);
/**
* struct vlan_priority_tci_mapping - vlan egress priority mappings
* @priority: skb priority
* @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000
* @next: pointer to next struct
*/
struct vlan_priority_tci_mapping {
u32 priority;
u16 vlan_qos;
struct vlan_priority_tci_mapping *next;
};
struct proc_dir_entry;
struct netpoll;
/**
* struct vlan_dev_priv - VLAN private device data
* @nr_ingress_mappings: number of ingress priority mappings
* @ingress_priority_map: ingress priority mappings
* @nr_egress_mappings: number of egress priority mappings
* @egress_priority_map: hash of egress priority mappings
* @vlan_proto: VLAN encapsulation protocol
* @vlan_id: VLAN identifier
* @flags: device flags
* @real_dev: underlying netdevice
* @real_dev_addr: address of underlying netdevice
* @dent: proc dir entry
* @vlan_pcpu_stats: ptr to percpu rx stats
*/
struct vlan_dev_priv {
unsigned int nr_ingress_mappings;
u32 ingress_priority_map[8];
unsigned int nr_egress_mappings;
struct vlan_priority_tci_mapping *egress_priority_map[16];
__be16 vlan_proto;
u16 vlan_id;
u16 flags;
struct net_device *real_dev;
unsigned char real_dev_addr[ETH_ALEN];
struct proc_dir_entry *dent;
struct vlan_pcpu_stats __percpu *vlan_pcpu_stats;
#ifdef CONFIG_NET_POLL_CONTROLLER
struct netpoll *netpoll;
#endif
};
static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev)
{
return netdev_priv(dev);
}
static inline u16
vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio)
{
struct vlan_priority_tci_mapping *mp;
smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */
mp = vlan_dev_priv(dev)->egress_priority_map[(skprio & 0xF)];
while (mp) {
if (mp->priority == skprio) {
return mp->vlan_qos; /* This should already be shifted
* to mask correctly with the
* VLAN's TCI */
}
mp = mp->next;
}
return 0;
}
extern bool vlan_do_receive(struct sk_buff **skb);
extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid);
extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid);
extern int vlan_vids_add_by_dev(struct net_device *dev,
const struct net_device *by_dev);
extern void vlan_vids_del_by_dev(struct net_device *dev,
const struct net_device *by_dev);
extern bool vlan_uses_dev(const struct net_device *dev);
#else
static inline struct net_device *
__vlan_find_dev_deep_rcu(struct net_device *real_dev,
__be16 vlan_proto, u16 vlan_id)
{
return NULL;
}
static inline int
vlan_for_each(struct net_device *dev,
int (*action)(struct net_device *dev, int vid, void *arg),
void *arg)
{
return 0;
}
static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev)
{
BUG();
return NULL;
}
static inline u16 vlan_dev_vlan_id(const struct net_device *dev)
{
BUG();
return 0;
}
static inline __be16 vlan_dev_vlan_proto(const struct net_device *dev)
{
BUG();
return 0;
}
static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev,
u32 skprio)
{
return 0;
}
static inline bool vlan_do_receive(struct sk_buff **skb)
{
return false;
}
static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid)
{
return 0;
}
static inline void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid)
{
}
static inline int vlan_vids_add_by_dev(struct net_device *dev,
const struct net_device *by_dev)
{
return 0;
}
static inline void vlan_vids_del_by_dev(struct net_device *dev,
const struct net_device *by_dev)
{
}
static inline bool vlan_uses_dev(const struct net_device *dev)
{
return false;
}
#endif
/**
* eth_type_vlan - check for valid vlan ether type.
* @ethertype: ether type to check
*
* Returns true if the ether type is a vlan ether type.
*/
static inline bool eth_type_vlan(__be16 ethertype)
{
switch (ethertype) {
case htons(ETH_P_8021Q):
case htons(ETH_P_8021AD):
return true;
default:
return false;
}
}
static inline bool vlan_hw_offload_capable(netdev_features_t features,
__be16 proto)
{
if (proto == htons(ETH_P_8021Q) && features & NETIF_F_HW_VLAN_CTAG_TX)
return true;
if (proto == htons(ETH_P_8021AD) && features & NETIF_F_HW_VLAN_STAG_TX)
return true;
return false;
}
/**
* __vlan_insert_inner_tag - inner VLAN tag inserting
* @skb: skbuff to tag
* @vlan_proto: VLAN encapsulation protocol
* @vlan_tci: VLAN TCI to insert
* @mac_len: MAC header length including outer vlan headers
*
* Inserts the VLAN tag into @skb as part of the payload at offset mac_len
* Returns error if skb_cow_head fails.
*
* Does not change skb->protocol so this function can be used during receive.
*/
static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
__be16 vlan_proto, u16 vlan_tci,
unsigned int mac_len)
{
struct vlan_ethhdr *veth;
if (skb_cow_head(skb, VLAN_HLEN) < 0)
return -ENOMEM;
skb_push(skb, VLAN_HLEN);
/* Move the mac header sans proto to the beginning of the new header. */
if (likely(mac_len > ETH_TLEN))
memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
skb->mac_header -= VLAN_HLEN;
veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN);
/* first, the ethernet type */
if (likely(mac_len >= ETH_TLEN)) {
/* h_vlan_encapsulated_proto should already be populated, and
* skb->data has space for h_vlan_proto
*/
veth->h_vlan_proto = vlan_proto;
} else {
/* h_vlan_encapsulated_proto should not be populated, and
* skb->data has no space for h_vlan_proto
*/
veth->h_vlan_encapsulated_proto = skb->protocol;
}
/* now, the TCI */
veth->h_vlan_TCI = htons(vlan_tci);
return 0;
}
/**
* __vlan_insert_tag - regular VLAN tag inserting
* @skb: skbuff to tag
* @vlan_proto: VLAN encapsulation protocol
* @vlan_tci: VLAN TCI to insert
*
* Inserts the VLAN tag into @skb as part of the payload
* Returns error if skb_cow_head fails.
*
* Does not change skb->protocol so this function can be used during receive.
*/
static inline int __vlan_insert_tag(struct sk_buff *skb,
__be16 vlan_proto, u16 vlan_tci)
{
return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN);
}
/**
* vlan_insert_inner_tag - inner VLAN tag inserting
* @skb: skbuff to tag
* @vlan_proto: VLAN encapsulation protocol
* @vlan_tci: VLAN TCI to insert
* @mac_len: MAC header length including outer vlan headers
*
* Inserts the VLAN tag into @skb as part of the payload at offset mac_len
* Returns a VLAN tagged skb. If a new skb is created, @skb is freed.
*
* Following the skb_unshare() example, in case of error, the calling function
* doesn't have to worry about freeing the original skb.
*
* Does not change skb->protocol so this function can be used during receive.
*/
static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb,
__be16 vlan_proto,
u16 vlan_tci,
unsigned int mac_len)
{
int err;
err = __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len);
if (err) {
dev_kfree_skb_any(skb);
return NULL;
}
return skb;
}
/**
* vlan_insert_tag - regular VLAN tag inserting
* @skb: skbuff to tag
* @vlan_proto: VLAN encapsulation protocol
* @vlan_tci: VLAN TCI to insert
*
* Inserts the VLAN tag into @skb as part of the payload
* Returns a VLAN tagged skb. If a new skb is created, @skb is freed.
*
* Following the skb_unshare() example, in case of error, the calling function
* doesn't have to worry about freeing the original skb.
*
* Does not change skb->protocol so this function can be used during receive.
*/
static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
__be16 vlan_proto, u16 vlan_tci)
{
return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN);
}
/**
* vlan_insert_tag_set_proto - regular VLAN tag inserting
* @skb: skbuff to tag
* @vlan_proto: VLAN encapsulation protocol
* @vlan_tci: VLAN TCI to insert
*
* Inserts the VLAN tag into @skb as part of the payload
* Returns a VLAN tagged skb. If a new skb is created, @skb is freed.
*
* Following the skb_unshare() example, in case of error, the calling function
* doesn't have to worry about freeing the original skb.
*/
static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb,
__be16 vlan_proto,
u16 vlan_tci)
{
skb = vlan_insert_tag(skb, vlan_proto, vlan_tci);
if (skb)
skb->protocol = vlan_proto;
return skb;
}
/**
* __vlan_hwaccel_clear_tag - clear hardware accelerated VLAN info
* @skb: skbuff to clear
*
* Clears the VLAN information from @skb
*/
static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb)
{
skb->vlan_present = 0;
}
/**
* __vlan_hwaccel_copy_tag - copy hardware accelerated VLAN info from another skb
* @dst: skbuff to copy to
* @src: skbuff to copy from
*
* Copies VLAN information from @src to @dst (for branchless code)
*/
static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src)
{
dst->vlan_present = src->vlan_present;
dst->vlan_proto = src->vlan_proto;
dst->vlan_tci = src->vlan_tci;
}
/*
* __vlan_hwaccel_push_inside - pushes vlan tag to the payload
* @skb: skbuff to tag
*
* Pushes the VLAN tag from @skb->vlan_tci inside to the payload.
*
* Following the skb_unshare() example, in case of error, the calling function
* doesn't have to worry about freeing the original skb.
*/
static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb)
{
skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
skb_vlan_tag_get(skb));
if (likely(skb))
__vlan_hwaccel_clear_tag(skb);
return skb;
}
/**
* __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting
* @skb: skbuff to tag
* @vlan_proto: VLAN encapsulation protocol
* @vlan_tci: VLAN TCI to insert
*
* Puts the VLAN TCI in @skb->vlan_tci and lets the device do the rest
*/
static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb,
__be16 vlan_proto, u16 vlan_tci)
{
skb->vlan_proto = vlan_proto;
skb->vlan_tci = vlan_tci;
skb->vlan_present = 1;
}
/**
* __vlan_get_tag - get the VLAN ID that is part of the payload
* @skb: skbuff to query
* @vlan_tci: buffer to store value
*
* Returns error if the skb is not of VLAN type
*/
static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
{
struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb->data;
if (!eth_type_vlan(veth->h_vlan_proto))
return -EINVAL;
*vlan_tci = ntohs(veth->h_vlan_TCI);
return 0;
}
/**
* __vlan_hwaccel_get_tag - get the VLAN ID that is in @skb->cb[]
* @skb: skbuff to query
* @vlan_tci: buffer to store value
*
* Returns error if @skb->vlan_tci is not set correctly
*/
static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb,
u16 *vlan_tci)
{
if (skb_vlan_tag_present(skb)) {
*vlan_tci = skb_vlan_tag_get(skb);
return 0;
} else {
*vlan_tci = 0;
return -EINVAL;
}
}
/**
* vlan_get_tag - get the VLAN ID from the skb
* @skb: skbuff to query
* @vlan_tci: buffer to store value
*
* Returns error if the skb is not VLAN tagged
*/
static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
{
if (skb->dev->features & NETIF_F_HW_VLAN_CTAG_TX) {
return __vlan_hwaccel_get_tag(skb, vlan_tci);
} else {
return __vlan_get_tag(skb, vlan_tci);
}
}
/**
* vlan_get_protocol - get protocol EtherType.
* @skb: skbuff to query
* @type: first vlan protocol
* @depth: buffer to store length of eth and vlan tags in bytes
*
* Returns the EtherType of the packet, regardless of whether it is
* vlan encapsulated (normal or hardware accelerated) or not.
*/
static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type,
int *depth)
{
unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH;
/* if type is 802.1Q/AD then the header should already be
* present at mac_len - VLAN_HLEN (if mac_len > 0), or at
* ETH_HLEN otherwise
*/
if (eth_type_vlan(type)) {
if (vlan_depth) { if (WARN_ON(vlan_depth < VLAN_HLEN))
return 0;
vlan_depth -= VLAN_HLEN;
} else {
vlan_depth = ETH_HLEN;
}
do {
struct vlan_hdr vhdr, *vh;
vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; type = vh->h_vlan_encapsulated_proto;
vlan_depth += VLAN_HLEN;
} while (eth_type_vlan(type));
}
if (depth) *depth = vlan_depth;
return type;
}
/**
* vlan_get_protocol - get protocol EtherType.
* @skb: skbuff to query
*
* Returns the EtherType of the packet, regardless of whether it is
* vlan encapsulated (normal or hardware accelerated) or not.
*/
static inline __be16 vlan_get_protocol(const struct sk_buff *skb)
{
return __vlan_get_protocol(skb, skb->protocol, NULL);
}
/* A getter for the SKB protocol field which will handle VLAN tags consistently
* whether VLAN acceleration is enabled or not.
*/
static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan)
{
if (!skip_vlan)
/* VLAN acceleration strips the VLAN header from the skb and
* moves it to skb->vlan_proto
*/
return skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol;
return vlan_get_protocol(skb);
}
static inline void vlan_set_encap_proto(struct sk_buff *skb,
struct vlan_hdr *vhdr)
{
__be16 proto;
unsigned short *rawp;
/*
* Was a VLAN packet, grab the encapsulated protocol, which the layer
* three protocols care about.
*/
proto = vhdr->h_vlan_encapsulated_proto;
if (eth_proto_is_802_3(proto)) {
skb->protocol = proto;
return;
}
rawp = (unsigned short *)(vhdr + 1);
if (*rawp == 0xFFFF)
/*
* This is a magic hack to spot IPX packets. Older Novell
* breaks the protocol design and runs IPX over 802.3 without
* an 802.2 LLC layer. We look for FFFF which isn't a used
* 802.2 SSAP/DSAP. This won't work for fault tolerant netware
* but does for the rest.
*/
skb->protocol = htons(ETH_P_802_3);
else
/*
* Real 802.2 LLC
*/
skb->protocol = htons(ETH_P_802_2);
}
/**
* skb_vlan_tagged - check if skb is vlan tagged.
* @skb: skbuff to query
*
* Returns true if the skb is tagged, regardless of whether it is hardware
* accelerated or not.
*/
static inline bool skb_vlan_tagged(const struct sk_buff *skb)
{
if (!skb_vlan_tag_present(skb) && likely(!eth_type_vlan(skb->protocol)))
return false;
return true;
}
/**
* skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers.
* @skb: skbuff to query
*
* Returns true if the skb is tagged with multiple vlan headers, regardless
* of whether it is hardware accelerated or not.
*/
static inline bool skb_vlan_tagged_multi(struct sk_buff *skb)
{
__be16 protocol = skb->protocol;
if (!skb_vlan_tag_present(skb)) {
struct vlan_ethhdr *veh;
if (likely(!eth_type_vlan(protocol)))
return false;
if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
return false;
veh = (struct vlan_ethhdr *)skb->data;
protocol = veh->h_vlan_encapsulated_proto;
}
if (!eth_type_vlan(protocol))
return false;
return true;
}
/**
* vlan_features_check - drop unsafe features for skb with multiple tags.
* @skb: skbuff to query
* @features: features to be checked
*
* Returns features without unsafe ones if the skb has multiple tags.
*/
static inline netdev_features_t vlan_features_check(struct sk_buff *skb,
netdev_features_t features)
{
if (skb_vlan_tagged_multi(skb)) {
/* In the case of multi-tagged packets, use a direct mask
* instead of using netdev_interesect_features(), to make
* sure that only devices supporting NETIF_F_HW_CSUM will
* have checksum offloading support.
*/
features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM |
NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX;
}
return features;
}
/**
* compare_vlan_header - Compare two vlan headers
* @h1: Pointer to vlan header
* @h2: Pointer to vlan header
*
* Compare two vlan headers, returns 0 if equal.
*
* Please note that alignment of h1 & h2 are only guaranteed to be 16 bits.
*/
static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1,
const struct vlan_hdr *h2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
return *(u32 *)h1 ^ *(u32 *)h2;
#else
return ((__force u32)h1->h_vlan_TCI ^ (__force u32)h2->h_vlan_TCI) |
((__force u32)h1->h_vlan_encapsulated_proto ^
(__force u32)h2->h_vlan_encapsulated_proto);
#endif
}
#endif /* !(_LINUX_IF_VLAN_H_) */
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>
struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
static const char *bdi_unknown_name = "(unknown)";
/*
* bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
* reader side locking.
*/
DEFINE_SPINLOCK(bdi_lock);
static u64 bdi_id_cursor;
static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;
#define K(x) ((x) << (PAGE_SHIFT - 10))
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
static struct dentry *bdi_debug_root;
static void bdi_debug_init(void)
{
bdi_debug_root = debugfs_create_dir("bdi", NULL);
}
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
struct backing_dev_info *bdi = m->private;
struct bdi_writeback *wb = &bdi->wb;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long wb_thresh;
unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
struct inode *inode;
nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_io_list)
nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_io_list)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_io_list)
nr_more_io++;
list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
if (inode->i_state & I_DIRTY_TIME)
nr_dirty_time++;
spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
wb_thresh = wb_calc_thresh(wb, dirty_thresh);
seq_printf(m,
"BdiWriteback: %10lu kB\n"
"BdiReclaimable: %10lu kB\n"
"BdiDirtyThresh: %10lu kB\n"
"DirtyThresh: %10lu kB\n"
"BackgroundThresh: %10lu kB\n"
"BdiDirtied: %10lu kB\n"
"BdiWritten: %10lu kB\n"
"BdiWriteBandwidth: %10lu kBps\n"
"b_dirty: %10lu\n"
"b_io: %10lu\n"
"b_more_io: %10lu\n"
"b_dirty_time: %10lu\n"
"bdi_list: %10u\n"
"state: %10lx\n",
(unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
(unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
K(wb_thresh),
K(dirty_thresh),
K(background_thresh),
(unsigned long) K(wb_stat(wb, WB_DIRTIED)),
(unsigned long) K(wb_stat(wb, WB_WRITTEN)),
(unsigned long) K(wb->write_bandwidth),
nr_dirty,
nr_io,
nr_more_io,
nr_dirty_time,
!list_empty(&bdi->bdi_list), bdi->wb.state);
return 0;
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
&bdi_debug_stats_fops);
}
static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
debugfs_remove_recursive(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif
static ssize_t read_ahead_kb_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
unsigned long read_ahead_kb;
ssize_t ret;
ret = kstrtoul(buf, 10, &read_ahead_kb);
if (ret < 0)
return ret;
bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
return count;
}
#define BDI_SHOW(name, expr) \
static ssize_t name##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
{ \
struct backing_dev_info *bdi = dev_get_drvdata(dev); \
\
return sysfs_emit(buf, "%lld\n", (long long)expr); \
} \
static DEVICE_ATTR_RW(name);
BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
static ssize_t min_ratio_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
unsigned int ratio;
ssize_t ret;
ret = kstrtouint(buf, 10, &ratio);
if (ret < 0)
return ret;
ret = bdi_set_min_ratio(bdi, ratio);
if (!ret)
ret = count;
return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)
static ssize_t max_ratio_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
unsigned int ratio;
ssize_t ret;
ret = kstrtouint(buf, 10, &ratio);
if (ret < 0)
return ret;
ret = bdi_set_max_ratio(bdi, ratio);
if (!ret)
ret = count;
return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)
static ssize_t stable_pages_required_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
dev_warn_once(dev,
"the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);
static struct attribute *bdi_dev_attrs[] = {
&dev_attr_read_ahead_kb.attr,
&dev_attr_min_ratio.attr,
&dev_attr_max_ratio.attr,
&dev_attr_stable_pages_required.attr,
NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);
static __init int bdi_class_init(void)
{
bdi_class = class_create(THIS_MODULE, "bdi");
if (IS_ERR(bdi_class))
return PTR_ERR(bdi_class);
bdi_class->dev_groups = bdi_dev_groups;
bdi_debug_init();
return 0;
}
postcore_initcall(bdi_class_init);
static int bdi_init(struct backing_dev_info *bdi);
static int __init default_bdi_init(void)
{
int err;
bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
WQ_SYSFS, 0);
if (!bdi_wq)
return -ENOMEM;
err = bdi_init(&noop_backing_dev_info);
return err;
}
subsys_initcall(default_bdi_init);
/*
* This function is used when the first inode for this wb is marked dirty. It
* wakes-up the corresponding bdi thread which should then take care of the
* periodic background write-out of dirty inodes. Since the write-out would
* starts only 'dirty_writeback_interval' centisecs from now anyway, we just
* set up a timer which wakes the bdi thread up later.
*
* Note, we wouldn't bother setting up the timer, but this function is on the
* fast-path (used by '__mark_inode_dirty()'), so we save few context switches
* by delaying the wake-up.
*
* We have to be careful not to postpone flush work if it is scheduled for
* earlier. Thus we use queue_delayed_work().
*/
void wb_wakeup_delayed(struct bdi_writeback *wb)
{
unsigned long timeout;
timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
spin_lock_bh(&wb->work_lock);
if (test_bit(WB_registered, &wb->state))
queue_delayed_work(bdi_wq, &wb->dwork, timeout);
spin_unlock_bh(&wb->work_lock);
}
static void wb_update_bandwidth_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(to_delayed_work(work),
struct bdi_writeback, bw_dwork);
wb_update_bandwidth(wb);
}
/*
* Initial write bandwidth: 100 MB/s
*/
#define INIT_BW (100 << (20 - PAGE_SHIFT))
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
gfp_t gfp)
{
int i, err;
memset(wb, 0, sizeof(*wb));
if (wb != &bdi->wb)
bdi_get(bdi);
wb->bdi = bdi;
wb->last_old_flush = jiffies;
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
INIT_LIST_HEAD(&wb->b_dirty_time);
spin_lock_init(&wb->list_lock);
atomic_set(&wb->writeback_inodes, 0);
wb->bw_time_stamp = jiffies;
wb->balanced_dirty_ratelimit = INIT_BW;
wb->dirty_ratelimit = INIT_BW;
wb->write_bandwidth = INIT_BW;
wb->avg_write_bandwidth = INIT_BW;
spin_lock_init(&wb->work_lock);
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
goto out_put_bdi;
for (i = 0; i < NR_WB_STAT_ITEMS; i++) { err = percpu_counter_init(&wb->stat[i], 0, gfp);
if (err)
goto out_destroy_stat;
}
return 0;
out_destroy_stat:
while (i--) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions);
out_put_bdi:
if (wb != &bdi->wb)
bdi_put(bdi);
return err;
}
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
/*
* Remove bdi from the global list and shutdown any threads we have running
*/
static void wb_shutdown(struct bdi_writeback *wb)
{
/* Make sure nobody queues further work */
spin_lock_bh(&wb->work_lock);
if (!test_and_clear_bit(WB_registered, &wb->state)) {
spin_unlock_bh(&wb->work_lock);
return;
}
spin_unlock_bh(&wb->work_lock);
cgwb_remove_from_bdi_list(wb);
/*
* Drain work list and shutdown the delayed_work. !WB_registered
* tells wb_workfn() that @wb is dying and its work_list needs to
* be drained no matter what.
*/
mod_delayed_work(bdi_wq, &wb->dwork, 0);
flush_delayed_work(&wb->dwork);
WARN_ON(!list_empty(&wb->work_list)); flush_delayed_work(&wb->bw_dwork);
}
static void wb_exit(struct bdi_writeback *wb)
{
int i;
WARN_ON(delayed_work_pending(&wb->dwork));
for (i = 0; i < NR_WB_STAT_ITEMS; i++)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
if (wb != &wb->bdi->wb)
bdi_put(wb->bdi);
}
#ifdef CONFIG_CGROUP_WRITEBACK
#include <linux/memcontrol.h>
/*
* cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
* memcg->cgwb_list. bdi->cgwb_tree is also RCU protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;
static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
mutex_lock(&wb->bdi->cgwb_release_mutex);
wb_shutdown(wb);
css_put(wb->memcg_css);
css_put(wb->blkcg_css);
mutex_unlock(&wb->bdi->cgwb_release_mutex);
/* triggers blkg destruction if no online users left */
blkcg_unpin_online(blkcg);
fprop_local_destroy_percpu(&wb->memcg_completions);
spin_lock_irq(&cgwb_lock);
list_del(&wb->offline_node);
spin_unlock_irq(&cgwb_lock);
percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
WARN_ON_ONCE(!list_empty(&wb->b_attached));
kfree_rcu(wb, rcu);
}
static void cgwb_release(struct percpu_ref *refcnt)
{
struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
refcnt);
queue_work(cgwb_release_wq, &wb->release_work);
}
static void cgwb_kill(struct bdi_writeback *wb)
{
lockdep_assert_held(&cgwb_lock);
WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
list_del(&wb->memcg_node);
list_del(&wb->blkcg_node);
list_add(&wb->offline_node, &offline_cgwbs);
percpu_ref_kill(&wb->refcnt);
}
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
spin_lock_irq(&cgwb_lock);
list_del_rcu(&wb->bdi_node);
spin_unlock_irq(&cgwb_lock);
}
static int cgwb_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
struct mem_cgroup *memcg;
struct cgroup_subsys_state *blkcg_css;
struct blkcg *blkcg;
struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
struct bdi_writeback *wb;
unsigned long flags;
int ret = 0;
memcg = mem_cgroup_from_css(memcg_css);
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
blkcg = css_to_blkcg(blkcg_css);
memcg_cgwb_list = &memcg->cgwb_list;
blkcg_cgwb_list = &blkcg->cgwb_list;
/* look up again under lock and discard on blkcg mismatch */
spin_lock_irqsave(&cgwb_lock, flags);
wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
if (wb && wb->blkcg_css != blkcg_css) {
cgwb_kill(wb);
wb = NULL;
}
spin_unlock_irqrestore(&cgwb_lock, flags);
if (wb)
goto out_put;
/* need to create a new one */
wb = kmalloc(sizeof(*wb), gfp);
if (!wb) {
ret = -ENOMEM;
goto out_put;
}
ret = wb_init(wb, bdi, gfp);
if (ret)
goto err_free;
ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
if (ret)
goto err_wb_exit;
ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
if (ret)
goto err_ref_exit;
wb->memcg_css = memcg_css;
wb->blkcg_css = blkcg_css;
INIT_LIST_HEAD(&wb->b_attached);
INIT_WORK(&wb->release_work, cgwb_release_workfn);
set_bit(WB_registered, &wb->state);
/*
* The root wb determines the registered state of the whole bdi and
* memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
* whether they're still online. Don't link @wb if any is dead.
* See wb_memcg_offline() and wb_blkcg_offline().
*/
ret = -ENODEV;
spin_lock_irqsave(&cgwb_lock, flags);
if (test_bit(WB_registered, &bdi->wb.state) &&
blkcg_cgwb_list->next && memcg_cgwb_list->next) {
/* we might have raced another instance of this function */
ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
if (!ret) {
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
blkcg_pin_online(blkcg);
css_get(memcg_css);
css_get(blkcg_css);
}
}
spin_unlock_irqrestore(&cgwb_lock, flags);
if (ret) {
if (ret == -EEXIST)
ret = 0;
goto err_fprop_exit;
}
goto out_put;
err_fprop_exit:
fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
percpu_ref_exit(&wb->refcnt);
err_wb_exit:
wb_exit(wb);
err_free:
kfree(wb);
out_put:
css_put(blkcg_css);
return ret;
}
/**
* wb_get_lookup - get wb for a given memcg
* @bdi: target bdi
* @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
*
* Try to get the wb for @memcg_css on @bdi. The returned wb has its
* refcount incremented.
*
* This function uses css_get() on @memcg_css and thus expects its refcnt
* to be positive on invocation. IOW, rcu_read_lock() protection on
* @memcg_css isn't enough. try_get it before calling this function.
*
* A wb is keyed by its associated memcg. As blkcg implicitly enables
* memcg on the default hierarchy, memcg association is guaranteed to be
* more specific (equal or descendant to the associated blkcg) and thus can
* identify both the memcg and blkcg associations.
*
* Because the blkcg associated with a memcg may change as blkcg is enabled
* and disabled closer to root in the hierarchy, each wb keeps track of
* both the memcg and blkcg associated with it and verifies the blkcg on
* each lookup. On mismatch, the existing wb is discarded and a new one is
* created.
*/
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css)
{
struct bdi_writeback *wb;
if (!memcg_css->parent)
return &bdi->wb;
rcu_read_lock();
wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
if (wb) {
struct cgroup_subsys_state *blkcg_css;
/* see whether the blkcg association has changed */
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
wb = NULL;
css_put(blkcg_css);
}
rcu_read_unlock();
return wb;
}
/**
* wb_get_create - get wb for a given memcg, create if necessary
* @bdi: target bdi
* @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
* @gfp: allocation mask to use
*
* Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
* create one. See wb_get_lookup() for more details.
*/
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css,
gfp_t gfp)
{
struct bdi_writeback *wb;
might_alloc(gfp);
if (!memcg_css->parent)
return &bdi->wb;
do {
wb = wb_get_lookup(bdi, memcg_css);
} while (!wb && !cgwb_create(bdi, memcg_css, gfp));
return wb;
}
static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
int ret;
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
mutex_init(&bdi->cgwb_release_mutex);
init_rwsem(&bdi->wb_switch_rwsem);
ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
if (!ret) {
bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
}
return ret;
}
static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
struct radix_tree_iter iter;
void **slot;
struct bdi_writeback *wb;
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
spin_lock_irq(&cgwb_lock);
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot);
spin_unlock_irq(&cgwb_lock);
mutex_lock(&bdi->cgwb_release_mutex);
spin_lock_irq(&cgwb_lock);
while (!list_empty(&bdi->wb_list)) {
wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
bdi_node);
spin_unlock_irq(&cgwb_lock);
wb_shutdown(wb);
spin_lock_irq(&cgwb_lock);
}
spin_unlock_irq(&cgwb_lock);
mutex_unlock(&bdi->cgwb_release_mutex);
}
/*
* cleanup_offline_cgwbs_workfn - try to release dying cgwbs
*
* Try to release dying cgwbs by switching attached inodes to the nearest
* living ancestor's writeback. Processed wbs are placed at the end
* of the list to guarantee the forward progress.
*/
static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
{
struct bdi_writeback *wb;
LIST_HEAD(processed);
spin_lock_irq(&cgwb_lock);
while (!list_empty(&offline_cgwbs)) {
wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
offline_node);
list_move(&wb->offline_node, &processed);
/*
* If wb is dirty, cleaning up the writeback by switching
* attached inodes will result in an effective removal of any
* bandwidth restrictions, which isn't the goal. Instead,
* it can be postponed until the next time, when all io
* will be likely completed. If in the meantime some inodes
* will get re-dirtied, they should be eventually switched to
* a new cgwb.
*/
if (wb_has_dirty_io(wb))
continue;
if (!wb_tryget(wb))
continue;
spin_unlock_irq(&cgwb_lock);
while (cleanup_offline_cgwb(wb))
cond_resched();
spin_lock_irq(&cgwb_lock);
wb_put(wb);
}
if (!list_empty(&processed))
list_splice_tail(&processed, &offline_cgwbs);
spin_unlock_irq(&cgwb_lock);
}
/**
* wb_memcg_offline - kill all wb's associated with a memcg being offlined
* @memcg: memcg being offlined
*
* Also prevents creation of any new wb's associated with @memcg.
*/
void wb_memcg_offline(struct mem_cgroup *memcg)
{
struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
struct bdi_writeback *wb, *next;
spin_lock_irq(&cgwb_lock);
list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
cgwb_kill(wb);
memcg_cgwb_list->next = NULL; /* prevent new wb's */
spin_unlock_irq(&cgwb_lock);
queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
}
/**
* wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
* @blkcg: blkcg being offlined
*
* Also prevents creation of any new wb's associated with @blkcg.
*/
void wb_blkcg_offline(struct blkcg *blkcg)
{
struct bdi_writeback *wb, *next;
spin_lock_irq(&cgwb_lock);
list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
cgwb_kill(wb);
blkcg->cgwb_list.next = NULL; /* prevent new wb's */
spin_unlock_irq(&cgwb_lock);
}
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
spin_lock_irq(&cgwb_lock);
list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
spin_unlock_irq(&cgwb_lock);
}
static int __init cgwb_init(void)
{
/*
* There can be many concurrent release work items overwhelming
* system_wq. Put them in a separate wq and limit concurrency.
* There's no point in executing many of these in parallel.
*/
cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
if (!cgwb_release_wq)
return -ENOMEM;
return 0;
}
subsys_initcall(cgwb_init);
#else /* CONFIG_CGROUP_WRITEBACK */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}
static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
}
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
list_del_rcu(&wb->bdi_node);
}
#endif /* CONFIG_CGROUP_WRITEBACK */
static int bdi_init(struct backing_dev_info *bdi)
{
int ret;
bdi->dev = NULL;
kref_init(&bdi->refcnt);
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = FPROP_FRAC_BASE;
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
ret = cgwb_bdi_init(bdi);
return ret;
}
struct backing_dev_info *bdi_alloc(int node_id)
{
struct backing_dev_info *bdi;
bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
if (!bdi)
return NULL;
if (bdi_init(bdi)) { kfree(bdi); return NULL;
}
bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
bdi->ra_pages = VM_READAHEAD_PAGES;
bdi->io_pages = VM_READAHEAD_PAGES;
timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
return bdi;
}
EXPORT_SYMBOL(bdi_alloc);
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
struct rb_node **p = &bdi_tree.rb_node;
struct rb_node *parent = NULL;
struct backing_dev_info *bdi;
lockdep_assert_held(&bdi_lock);
while (*p) {
parent = *p;
bdi = rb_entry(parent, struct backing_dev_info, rb_node);
if (bdi->id > id) p = &(*p)->rb_left; else if (bdi->id < id) p = &(*p)->rb_right;
else
break;
}
if (parentp)
*parentp = parent;
return p;
}
/**
* bdi_get_by_id - lookup and get bdi from its id
* @id: bdi id to lookup
*
* Find bdi matching @id and get it. Returns NULL if the matching bdi
* doesn't exist or is already unregistered.
*/
struct backing_dev_info *bdi_get_by_id(u64 id)
{
struct backing_dev_info *bdi = NULL;
struct rb_node **p;
spin_lock_bh(&bdi_lock);
p = bdi_lookup_rb_node(id, NULL);
if (*p) {
bdi = rb_entry(*p, struct backing_dev_info, rb_node);
bdi_get(bdi);
}
spin_unlock_bh(&bdi_lock);
return bdi;
}
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
struct device *dev;
struct rb_node *parent, **p;
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
if (IS_ERR(dev))
return PTR_ERR(dev);
cgwb_bdi_register(bdi);
bdi->dev = dev;
bdi_debug_register(bdi, dev_name(dev));
set_bit(WB_registered, &bdi->wb.state);
spin_lock_bh(&bdi_lock);
bdi->id = ++bdi_id_cursor;
p = bdi_lookup_rb_node(bdi->id, &parent);
rb_link_node(&bdi->rb_node, parent, p);
rb_insert_color(&bdi->rb_node, &bdi_tree);
list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
spin_unlock_bh(&bdi_lock);
trace_writeback_bdi_register(bdi);
return 0;}
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
va_list args;
int ret;
va_start(args, fmt);
ret = bdi_register_va(bdi, fmt, args);
va_end(args);
return ret;
}
EXPORT_SYMBOL(bdi_register);
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
WARN_ON_ONCE(bdi->owner);
bdi->owner = owner;
get_device(owner);
}
/*
* Remove bdi from bdi_list, and ensure that it is no longer visible
*/
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
spin_lock_bh(&bdi_lock);
rb_erase(&bdi->rb_node, &bdi_tree);
list_del_rcu(&bdi->bdi_list);
spin_unlock_bh(&bdi_lock);
synchronize_rcu_expedited();
}
void bdi_unregister(struct backing_dev_info *bdi)
{
del_timer_sync(&bdi->laptop_mode_wb_timer);
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
cgwb_bdi_unregister(bdi);
/*
* If this BDI's min ratio has been set, use bdi_set_min_ratio() to
* update the global bdi_min_ratio.
*/
if (bdi->min_ratio) bdi_set_min_ratio(bdi, 0); if (bdi->dev) { bdi_debug_unregister(bdi);
device_unregister(bdi->dev);
bdi->dev = NULL;
}
if (bdi->owner) { put_device(bdi->owner);
bdi->owner = NULL;
}
}
static void release_bdi(struct kref *ref)
{
struct backing_dev_info *bdi =
container_of(ref, struct backing_dev_info, refcnt);
if (test_bit(WB_registered, &bdi->wb.state))
bdi_unregister(bdi); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); kfree(bdi);
}
void bdi_put(struct backing_dev_info *bdi)
{
kref_put(&bdi->refcnt, release_bdi);
}
EXPORT_SYMBOL(bdi_put);
const char *bdi_dev_name(struct backing_dev_info *bdi)
{
if (!bdi || !bdi->dev)
return bdi_unknown_name;
return bdi->dev_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
static atomic_t nr_wb_congested[2];
void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
wait_queue_head_t *wqh = &congestion_wqh[sync];
enum wb_congested_state bit;
bit = sync ? WB_sync_congested : WB_async_congested;
if (test_and_clear_bit(bit, &bdi->wb.congested))
atomic_dec(&nr_wb_congested[sync]);
smp_mb__after_atomic();
if (waitqueue_active(wqh))
wake_up(wqh);
}
EXPORT_SYMBOL(clear_bdi_congested);
void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
enum wb_congested_state bit;
bit = sync ? WB_sync_congested : WB_async_congested;
if (!test_and_set_bit(bit, &bdi->wb.congested))
atomic_inc(&nr_wb_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);
/**
* congestion_wait - wait for a backing_dev to become uncongested
* @sync: SYNC or ASYNC IO
* @timeout: timeout in jiffies
*
* Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
* write congestion. If no backing_devs are congested then just wait for the
* next write to be completed.
*/
long congestion_wait(int sync, long timeout)
{
long ret;
unsigned long start = jiffies;
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[sync];
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
jiffies_to_usecs(jiffies - start));
return ret;
}
EXPORT_SYMBOL(congestion_wait);
/**
* wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
* @sync: SYNC or ASYNC IO
* @timeout: timeout in jiffies
*
* In the event of a congested backing_dev (any backing_dev) this waits
* for up to @timeout jiffies for either a BDI to exit congestion of the
* given @sync queue or a write to complete.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
* it is the number of jiffies that were still remaining when the function
* returned. return_value == timeout implies the function did not sleep.
*/
long wait_iff_congested(int sync, long timeout)
{
long ret;
unsigned long start = jiffies;
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[sync];
/*
* If there is no congestion, yield if necessary instead
* of sleeping on the congestion queue
*/
if (atomic_read(&nr_wb_congested[sync]) == 0) {
cond_resched();
/* In case we scheduled, work out time remaining */
ret = timeout - (jiffies - start);
if (ret < 0)
ret = 0;
goto out;
}
/* Sleep until uncongested or a write happens */
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
out:
trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
jiffies_to_usecs(jiffies - start));
return ret;
}
EXPORT_SYMBOL(wait_iff_congested);
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Oracle. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
* LOG_INODE_ALL means to log everything
* LOG_INODE_EXISTS means to log just enough to recreate the inode
* during log replay
*/
enum {
LOG_INODE_ALL,
LOG_INODE_EXISTS,
LOG_OTHER_INODE,
LOG_OTHER_INODE_ALL,
};
/*
* directory trouble cases
*
* 1) on rename or unlink, if the inode being unlinked isn't in the fsync
* log, we must force a full commit before doing an fsync of the directory
* where the unlink was done.
* ---> record transid of last unlink/rename per directory
*
* mkdir foo/some_dir
* normal commit
* rename foo/some_dir foo2/some_dir
* mkdir foo/some_dir
* fsync foo/some_dir/some_file
*
* The fsync above will unlink the original some_dir without recording
* it in its new location (foo2). After a crash, some_dir will be gone
* unless the fsync of some_file forces a full commit
*
* 2) we must log any new names for any file or dir that is in the fsync
* log. ---> check inode while renaming/linking.
*
* 2a) we must log any new names for any file or dir during rename
* when the directory they are being removed from was logged.
* ---> check inode and old parent dir during rename
*
* 2a is actually the more important variant. With the extra logging
* a crash might unlink the old name without recreating the new one
*
* 3) after a crash, we must go through any directories with a link count
* of zero and redo the rm -rf
*
* mkdir f1/foo
* normal commit
* rm -rf f1/foo
* fsync(f1)
*
* The directory f1 was fully removed from the FS, but fsync was never
* called on f1, only its parent dir. After a crash the rm -rf must
* be replayed. This must be able to recurse down the entire
* directory tree. The inode link count fixup code takes care of the
* ugly details.
*/
/*
* stages for the tree walking. The first
* stage (0) is to only pin down the blocks we find
* the second stage (1) is to make sure that all the inodes
* we find in the log are created in the subvolume.
*
* The last stage is to deal with directories and links and extents
* and all the other fun semantics
*/
enum {
LOG_WALK_PIN_ONLY,
LOG_WALK_REPLAY_INODES,
LOG_WALK_REPLAY_DIR_INDEX,
LOG_WALK_REPLAY_ALL,
};
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);
/*
* tree logging is a special write ahead log used to make sure that
* fsyncs and O_SYNCs can happen without doing full tree commits.
*
* Full tree commits are expensive because they require commonly
* modified blocks to be recowed, creating many dirty pages in the
* extent tree an 4x-6x higher write load than ext3.
*
* Instead of doing a tree commit on every fsync, we use the
* key ranges and transaction ids to find items for a given file or directory
* that have changed in this transaction. Those items are copied into
* a special tree (one per subvolume root), that tree is written to disk
* and then the fsync is considered complete.
*
* After a crash, items are copied out of the log-tree back into the
* subvolume tree. Any file data extents found are recorded in the extent
* allocation tree, and the log-tree freed.
*
* The log tree is read three times, once to pin down all the extents it is
* using in ram and once, once to create all the inodes logged in the tree
* and once to do all the other items.
*/
/*
* start a sub transaction and setup the log tree
* this increments the log tree writer count to make the people
* syncing the tree wait for us to finish
*/
static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *tree_root = fs_info->tree_root;
const bool zoned = btrfs_is_zoned(fs_info);
int ret = 0;
bool created = false;
/*
* First check if the log root tree was already created. If not, create
* it before locking the root's log_mutex, just to keep lockdep happy.
*/
if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
mutex_lock(&tree_root->log_mutex);
if (!fs_info->log_root_tree) {
ret = btrfs_init_log_root_tree(trans, fs_info);
if (!ret) {
set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
created = true;
}
}
mutex_unlock(&tree_root->log_mutex);
if (ret)
return ret;
}
mutex_lock(&root->log_mutex);
again:
if (root->log_root) {
int index = (root->log_transid + 1) % 2;
if (btrfs_need_log_full_commit(trans)) {
ret = -EAGAIN;
goto out;
}
if (zoned && atomic_read(&root->log_commit[index])) { wait_log_commit(root, root->log_transid - 1);
goto again;
}
if (!root->log_start_pid) { clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
} else if (root->log_start_pid != current->pid) {
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
} else {
/*
* This means fs_info->log_root_tree was already created
* for some other FS trees. Do the full commit not to mix
* nodes from multiple log transactions to do sequential
* writing.
*/
if (zoned && !created) {
ret = -EAGAIN;
goto out;
}
ret = btrfs_add_log_tree(trans, root);
if (ret)
goto out;
set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
}
atomic_inc(&root->log_writers); if (ctx && !ctx->logging_new_name) { int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
}
out:
mutex_unlock(&root->log_mutex);
return ret;
}
/*
* returns 0 if there was a log transaction running and we were able
* to join, or returns -ENOENT if there were not transactions
* in progress
*/
static int join_running_log_trans(struct btrfs_root *root)
{
const bool zoned = btrfs_is_zoned(root->fs_info);
int ret = -ENOENT;
if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
return ret;
mutex_lock(&root->log_mutex);
again:
if (root->log_root) {
int index = (root->log_transid + 1) % 2;
ret = 0;
if (zoned && atomic_read(&root->log_commit[index])) { wait_log_commit(root, root->log_transid - 1);
goto again;
}
atomic_inc(&root->log_writers);
}
mutex_unlock(&root->log_mutex); return ret;
}
/*
* This either makes the current running log transaction wait
* until you call btrfs_end_log_trans() or it makes any future
* log transactions wait until you call btrfs_end_log_trans()
*/
void btrfs_pin_log_trans(struct btrfs_root *root)
{
atomic_inc(&root->log_writers);
}
/*
* indicate we're done making changes to the log tree
* and wake up anyone waiting to do a sync
*/
void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
/* atomic_dec_and_test implies a barrier */
cond_wake_up_nomb(&root->log_writer_wait);
}
}
static int btrfs_write_tree_block(struct extent_buffer *buf)
{
return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
buf->start + buf->len - 1);
}
static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
filemap_fdatawait_range(buf->pages[0]->mapping,
buf->start, buf->start + buf->len - 1);
}
/*
* the walk control struct is used to pass state down the chain when
* processing the log tree. The stage field tells us which part
* of the log tree processing we are currently doing. The others
* are state fields used for that specific part
*/
struct walk_control {
/* should we free the extent on disk when done? This is used
* at transaction commit time while freeing a log tree
*/
int free;
/* should we write out the extent buffer? This is used
* while flushing the log tree to disk during a sync
*/
int write;
/* should we wait for the extent buffer io to finish? Also used
* while flushing the log tree to disk for a sync
*/
int wait;
/* pin only walk, we record which extents on disk belong to the
* log trees
*/
int pin;
/* what stage of the replay code we're currently in */
int stage;
/*
* Ignore any items from the inode currently being processed. Needs
* to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
* the LOG_WALK_REPLAY_INODES stage.
*/
bool ignore_cur_inode;
/* the root we are currently replaying */
struct btrfs_root *replay_dest;
/* the trans handle for the current replay */
struct btrfs_trans_handle *trans;
/* the function that gets used to process blocks we find in the
* tree. Note the extent_buffer might not be up to date when it is
* passed in, and it must be checked or read if you need the data
* inside it
*/
int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level);
};
/*
* process_func used to pin down extents, write them or wait on them
*/
static int process_one_buffer(struct btrfs_root *log,
struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
/*
* If this fs is mixed then we need to be able to process the leaves to
* pin down any logged extents, so we have to read the block.
*/
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
ret = btrfs_read_buffer(eb, gen, level, NULL);
if (ret)
return ret;
}
if (wc->pin)
ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
eb->len); if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { if (wc->pin && btrfs_header_level(eb) == 0) ret = btrfs_exclude_logged_extents(eb); if (wc->write)
btrfs_write_tree_block(eb);
if (wc->wait)
btrfs_wait_tree_block_writeback(eb);
}
return ret;
}
/*
* Item overwrite used by replay and tree logging. eb, slot and key all refer
* to the src data we are copying out.
*
* root is the tree we are copying into, and path is a scratch
* path for use in this function (it should be released on entry and
* will be released on exit).
*
* If the key is already in the destination tree the existing item is
* overwritten. If the existing item isn't big enough, it is extended.
* If it is too large, it is truncated.
*
* If the key isn't in the destination yet, a new item is inserted.
*/
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
int ret;
u32 item_size;
u64 saved_i_size = 0;
int save_old_i_size = 0;
unsigned long src_ptr;
unsigned long dst_ptr;
int overwrite_root = 0;
bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
overwrite_root = 1;
item_size = btrfs_item_size_nr(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
/* look for the key in the destination tree */
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
if (ret < 0)
return ret;
if (ret == 0) {
char *src_copy;
char *dst_copy;
u32 dst_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
if (dst_size != item_size)
goto insert;
if (item_size == 0) { btrfs_release_path(path);
return 0;
}
dst_copy = kmalloc(item_size, GFP_NOFS);
src_copy = kmalloc(item_size, GFP_NOFS);
if (!dst_copy || !src_copy) { btrfs_release_path(path);
kfree(dst_copy);
kfree(src_copy);
return -ENOMEM;
}
read_extent_buffer(eb, src_copy, src_ptr, item_size);
dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
item_size);
ret = memcmp(dst_copy, src_copy, item_size);
kfree(dst_copy);
kfree(src_copy);
/*
* they have the same contents, just return, this saves
* us from cowing blocks in the destination tree and doing
* extra writes that may not have been done by a previous
* sync
*/
if (ret == 0) {
btrfs_release_path(path);
return 0;
}
/*
* We need to load the old nbytes into the inode so when we
* replay the extents we've logged we get the right nbytes.
*/
if (inode_item) {
struct btrfs_inode_item *item;
u64 nbytes;
u32 mode;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
nbytes = btrfs_inode_nbytes(path->nodes[0], item);
item = btrfs_item_ptr(eb, slot,
struct btrfs_inode_item);
btrfs_set_inode_nbytes(eb, item, nbytes);
/*
* If this is a directory we need to reset the i_size to
* 0 so that we can set it up properly when replaying
* the rest of the items in this log.
*/
mode = btrfs_inode_mode(eb, item);
if (S_ISDIR(mode))
btrfs_set_inode_size(eb, item, 0);
}
} else if (inode_item) {
struct btrfs_inode_item *item;
u32 mode;
/*
* New inode, set nbytes to 0 so that the nbytes comes out
* properly when we replay the extents.
*/
item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
btrfs_set_inode_nbytes(eb, item, 0);
/*
* If this is a directory we need to reset the i_size to 0 so
* that we can set it up properly when replaying the rest of
* the items in this log.
*/
mode = btrfs_inode_mode(eb, item);
if (S_ISDIR(mode))
btrfs_set_inode_size(eb, item, 0);
}
insert:
btrfs_release_path(path);
/* try to insert the key into the destination tree */
path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path,
key, item_size);
path->skip_release_on_error = 0;
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
u32 found_size;
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
if (found_size > item_size)
btrfs_truncate_item(path, item_size, 1); else if (found_size < item_size) btrfs_extend_item(path, item_size - found_size); } else if (ret) {
return ret;
}
dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
/* don't overwrite an existing inode if the generation number
* was logged as zero. This is done when the tree logging code
* is just logging an inode to make sure it exists after recovery.
*
* Also, don't overwrite i_size on directories during replay.
* log replay inserts and removes directory items based on the
* state of the tree found in the subvolume, and i_size is modified
* as it goes
*/
if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
struct btrfs_inode_item *src_item;
struct btrfs_inode_item *dst_item;
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
if (btrfs_inode_generation(eb, src_item) == 0) {
struct extent_buffer *dst_eb = path->nodes[0];
const u64 ino_size = btrfs_inode_size(eb, src_item);
/*
* For regular files an ino_size == 0 is used only when
* logging that an inode exists, as part of a directory
* fsync, and the inode wasn't fsynced before. In this
* case don't set the size of the inode in the fs/subvol
* tree, otherwise we would be throwing valid data away.
*/
if (S_ISREG(btrfs_inode_mode(eb, src_item)) && S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
ino_size != 0)
btrfs_set_inode_size(dst_eb, dst_item, ino_size);
goto no_copy;
}
if (overwrite_root &&
S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
save_old_i_size = 1;
saved_i_size = btrfs_inode_size(path->nodes[0],
dst_item);
}
}
copy_extent_buffer(path->nodes[0], eb, dst_ptr,
src_ptr, item_size);
if (save_old_i_size) {
struct btrfs_inode_item *dst_item;
dst_item = (struct btrfs_inode_item *)dst_ptr;
btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
}
/* make sure the generation is filled in */
if (key->type == BTRFS_INODE_ITEM_KEY) {
struct btrfs_inode_item *dst_item;
dst_item = (struct btrfs_inode_item *)dst_ptr; if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { btrfs_set_inode_generation(path->nodes[0], dst_item,
trans->transid);
}
}
no_copy:
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(path);
return 0;
}
/*
* simple helper to read an inode off the disk from a given root
* This can only be called for subvolume roots and not for the log
*/
static noinline struct inode *read_one_inode(struct btrfs_root *root,
u64 objectid)
{
struct inode *inode;
inode = btrfs_iget(root->fs_info->sb, objectid, root);
if (IS_ERR(inode))
inode = NULL;
return inode;
}
/* replays a single extent in 'eb' at 'slot' with 'key' into the
* subvolume 'root'. path is released on entry and should be released
* on exit.
*
* extents in the log tree have not been allocated out of the extent
* tree yet. So, this completes the allocation, taking a reference
* as required if the extent already exists or creating a new extent
* if it isn't in the extent allocation tree yet.
*
* The extent is inserted into the file, dropping any existing extents
* from the file that overlap the new one.
*/
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = root->fs_info;
int found_type;
u64 extent_end;
u64 start = key->offset;
u64 nbytes = 0;
struct btrfs_file_extent_item *item;
struct inode *inode = NULL;
unsigned long size;
int ret = 0;
item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(eb, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
nbytes = btrfs_file_extent_num_bytes(eb, item);
extent_end = start + nbytes;
/*
* We don't add to the inodes nbytes if we are prealloc or a
* hole.
*/
if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
nbytes = 0;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size = btrfs_file_extent_ram_bytes(eb, item);
nbytes = btrfs_file_extent_ram_bytes(eb, item);
extent_end = ALIGN(start + size,
fs_info->sectorsize);
} else {
ret = 0;
goto out;
}
inode = read_one_inode(root, key->objectid);
if (!inode) {
ret = -EIO;
goto out;
}
/*
* first check to see if we already have this extent in the
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
ret = btrfs_lookup_file_extent(trans, root, path,
btrfs_ino(BTRFS_I(inode)), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
struct btrfs_file_extent_item cmp1;
struct btrfs_file_extent_item cmp2;
struct btrfs_file_extent_item *existing;
struct extent_buffer *leaf;
leaf = path->nodes[0];
existing = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
read_extent_buffer(eb, &cmp1, (unsigned long)item,
sizeof(cmp1));
read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
sizeof(cmp2));
/*
* we already have a pointer to this exact extent,
* we don't have to do anything
*/
if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
btrfs_release_path(path);
goto out;
}
}
btrfs_release_path(path);
/* drop any overlapping extents */
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
if (ret)
goto out;
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 offset;
unsigned long dest_offset;
struct btrfs_key ins;
if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
btrfs_fs_incompat(fs_info, NO_HOLES))
goto update_inode;
ret = btrfs_insert_empty_item(trans, root, path, key,
sizeof(*item));
if (ret)
goto out;
dest_offset = btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
copy_extent_buffer(path->nodes[0], eb, dest_offset,
(unsigned long)item, sizeof(*item));
ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
ins.type = BTRFS_EXTENT_ITEM_KEY;
offset = key->offset - btrfs_file_extent_offset(eb, item);
/*
* Manually record dirty extent, as here we did a shallow
* file extent item copy and skip normal backref update,
* but modifying extent tree all by ourselves.
* So need to manually record dirty extent for qgroup,
* as the owner of the file extent changed from log tree
* (doesn't affect qgroup) to fs/file tree(affects qgroup)
*/
ret = btrfs_qgroup_trace_extent(trans,
btrfs_file_extent_disk_bytenr(eb, item),
btrfs_file_extent_disk_num_bytes(eb, item),
GFP_NOFS);
if (ret < 0)
goto out;
if (ins.objectid > 0) {
struct btrfs_ref ref = { 0 };
u64 csum_start;
u64 csum_end;
LIST_HEAD(ordered_sums);
/*
* is this extent already allocated in the extent
* allocation tree? If so, just add a reference
*/
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset);
if (ret < 0) {
goto out;
} else if (ret == 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
ins.objectid, ins.offset, 0);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key->objectid, offset);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
} else {
/*
* insert the extent pointer in the extent
* allocation tree
*/
ret = btrfs_alloc_logged_file_extent(trans,
root->root_key.objectid,
key->objectid, offset, &ins);
if (ret)
goto out;
}
btrfs_release_path(path);
if (btrfs_file_extent_compression(eb, item)) {
csum_start = ins.objectid;
csum_end = csum_start + ins.offset;
} else {
csum_start = ins.objectid +
btrfs_file_extent_offset(eb, item);
csum_end = csum_start +
btrfs_file_extent_num_bytes(eb, item);
}
ret = btrfs_lookup_csums_range(root->log_root,
csum_start, csum_end - 1,
&ordered_sums, 0);
if (ret)
goto out;
/*
* Now delete all existing cums in the csum root that
* cover our range. We do this because we can have an
* extent that is completely referenced by one file
* extent item and partially referenced by another
* file extent item (like after using the clone or
* extent_same ioctls). In this case if we end up doing
* the replay of the one that partially references the
* extent first, and we do not do the csum deletion
* below, we can get 2 csum items in the csum tree that
* overlap each other. For example, imagine our log has
* the two following file extent items:
*
* key (257 EXTENT_DATA 409600)
* extent data disk byte 12845056 nr 102400
* extent data offset 20480 nr 20480 ram 102400
*
* key (257 EXTENT_DATA 819200)
* extent data disk byte 12845056 nr 102400
* extent data offset 0 nr 102400 ram 102400
*
* Where the second one fully references the 100K extent
* that starts at disk byte 12845056, and the log tree
* has a single csum item that covers the entire range
* of the extent:
*
* key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
*
* After the first file extent item is replayed, the
* csum tree gets the following csum item:
*
* key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
*
* Which covers the 20K sub-range starting at offset 20K
* of our extent. Now when we replay the second file
* extent item, if we do not delete existing csum items
* that cover any of its blocks, we end up getting two
* csum items in our csum tree that overlap each other:
*
* key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
* key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
*
* Which is a problem, because after this anyone trying
* to lookup up for the checksum of any block of our
* extent starting at an offset of 40K or higher, will
* end up looking at the second csum item only, which
* does not contain the checksum for any block starting
* at offset 40K or higher of our extent.
*/
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums;
sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
if (!ret)
ret = btrfs_del_csums(trans,
fs_info->csum_root,
sums->bytenr,
sums->len);
if (!ret)
ret = btrfs_csum_file_blocks(trans,
fs_info->csum_root, sums);
list_del(&sums->list);
kfree(sums);
}
if (ret)
goto out;
} else {
btrfs_release_path(path);
}
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
/* inline extents are easy, we just overwrite them */
ret = overwrite_item(trans, root, path, eb, slot, key);
if (ret)
goto out;
}
ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
extent_end - start);
if (ret)
goto out;
update_inode:
btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
if (inode)
iput(inode);
return ret;
}
/*
* when cleaning up conflicts between the directory names in the
* subvolume, directory names in the log and directory names in the
* inode back references, we may have to unlink inodes from directories.
*
* This is a helper function to do the unlink of a specific directory
* item
*/
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
struct inode *inode;
char *name;
int name_len;
struct extent_buffer *leaf;
struct btrfs_key location;
int ret;
leaf = path->nodes[0];
btrfs_dir_item_key_to_cpu(leaf, di, &location);
name_len = btrfs_dir_name_len(leaf, di);
name = kmalloc(name_len, GFP_NOFS);
if (!name)
return -ENOMEM;
read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
btrfs_release_path(path);
inode = read_one_inode(root, location.objectid);
if (!inode) {
ret = -EIO;
goto out;
}
ret = link_to_fixup_dir(trans, root, path, location.objectid);
if (ret)
goto out;
ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
name_len);
if (ret)
goto out;
else
ret = btrfs_run_delayed_items(trans);
out:
kfree(name);
iput(inode);
return ret;
}
/*
* See if a given name and sequence number found in an inode back reference are
* already in a directory and correctly point to this inode.
*
* Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
* exists.
*/
static noinline int inode_in_dir(struct btrfs_root *root,
struct btrfs_path *path,
u64 dirid, u64 objectid, u64 index,
const char *name, int name_len)
{
struct btrfs_dir_item *di;
struct btrfs_key location;
int ret = 0;
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
index, name, name_len, 0);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
} else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
if (location.objectid != objectid)
goto out;
} else {
goto out;
}
btrfs_release_path(path);
di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
} else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
if (location.objectid == objectid)
ret = 1;
}
out:
btrfs_release_path(path);
return ret;
}
/*
* helper function to check a log tree for a named back reference in
* an inode. This is used to decide if a back reference that is
* found in the subvolume conflicts with what we find in the log.
*
* inode backreferences may have multiple refs in a single item,
* during replay we process one reference at a time, and we don't
* want to delete valid links to a file from the subvolume if that
* link is also in the log.
*/
static noinline int backref_in_log(struct btrfs_root *log,
struct btrfs_key *key,
u64 ref_objectid,
const char *name, int namelen)
{
struct btrfs_path *path;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (ret == 1) {
ret = 0;
goto out;
}
if (key->type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
path->slots[0],
ref_objectid,
name, namelen);
else
ret = !!btrfs_find_name_in_backref(path->nodes[0],
path->slots[0],
name, namelen);
out:
btrfs_free_path(path);
return ret;
}
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_root *log_root,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
u64 inode_objectid, u64 parent_objectid,
u64 ref_index, char *name, int namelen,
int *search_done)
{
int ret;
char *victim_name;
int victim_name_len;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key search_key;
struct btrfs_inode_extref *extref;
again:
/* Search old style refs */
search_key.objectid = inode_objectid;
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = parent_objectid;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret == 0) {
struct btrfs_inode_ref *victim_ref;
unsigned long ptr;
unsigned long ptr_end;
leaf = path->nodes[0];
/* are we trying to overwrite a back ref for the root directory
* if so, just jump out, we're done
*/
if (search_key.objectid == search_key.offset)
return 1;
/* check all the names in this back reference to see
* if they are in the log. if so, we allow them to stay
* otherwise they must be unlinked as a conflict
*/
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
while (ptr < ptr_end) {
victim_ref = (struct btrfs_inode_ref *)ptr;
victim_name_len = btrfs_inode_ref_name_len(leaf,
victim_ref);
victim_name = kmalloc(victim_name_len, GFP_NOFS);
if (!victim_name)
return -ENOMEM;
read_extent_buffer(leaf, victim_name,
(unsigned long)(victim_ref + 1),
victim_name_len);
ret = backref_in_log(log_root, &search_key,
parent_objectid, victim_name,
victim_name_len);
if (ret < 0) {
kfree(victim_name);
return ret;
} else if (!ret) {
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
ret = btrfs_unlink_inode(trans, root, dir, inode,
victim_name, victim_name_len);
kfree(victim_name);
if (ret)
return ret;
ret = btrfs_run_delayed_items(trans);
if (ret)
return ret;
*search_done = 1;
goto again;
}
kfree(victim_name);
ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
}
/*
* NOTE: we have searched root tree and checked the
* corresponding ref, it does not need to check again.
*/
*search_done = 1;
}
btrfs_release_path(path);
/* Same search but for extended refs */
extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
inode_objectid, parent_objectid, 0,
0);
if (!IS_ERR_OR_NULL(extref)) {
u32 item_size;
u32 cur_offset = 0;
unsigned long base;
struct inode *victim_parent;
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
base = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
extref = (struct btrfs_inode_extref *)(base + cur_offset);
victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
goto next;
victim_name = kmalloc(victim_name_len, GFP_NOFS);
if (!victim_name)
return -ENOMEM;
read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
victim_name_len);
search_key.objectid = inode_objectid;
search_key.type = BTRFS_INODE_EXTREF_KEY;
search_key.offset = btrfs_extref_hash(parent_objectid,
victim_name,
victim_name_len);
ret = backref_in_log(log_root, &search_key,
parent_objectid, victim_name,
victim_name_len);
if (ret < 0) {
kfree(victim_name);
return ret;
} else if (!ret) {
ret = -ENOENT;
victim_parent = read_one_inode(root,
parent_objectid);
if (victim_parent) {
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
ret = btrfs_unlink_inode(trans, root,
BTRFS_I(victim_parent),
inode,
victim_name,
victim_name_len);
if (!ret)
ret = btrfs_run_delayed_items(
trans);
}
iput(victim_parent);
kfree(victim_name);
if (ret)
return ret;
*search_done = 1;
goto again;
}
kfree(victim_name);
next:
cur_offset += victim_name_len + sizeof(*extref);
}
*search_done = 1;
}
btrfs_release_path(path);
/* look for a conflicting sequence number */
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
ref_index, name, namelen, 0);
if (IS_ERR(di)) {
return PTR_ERR(di);
} else if (di) {
ret = drop_one_dir_item(trans, root, path, dir, di);
if (ret)
return ret;
}
btrfs_release_path(path);
/* look for a conflicting name */
di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
if (IS_ERR(di)) {
return PTR_ERR(di);
} else if (di) {
ret = drop_one_dir_item(trans, root, path, dir, di);
if (ret)
return ret;
}
btrfs_release_path(path);
return 0;
}
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
u32 *namelen, char **name, u64 *index,
u64 *parent_objectid)
{
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)ref_ptr;
*namelen = btrfs_inode_extref_name_len(eb, extref);
*name = kmalloc(*namelen, GFP_NOFS);
if (*name == NULL)
return -ENOMEM;
read_extent_buffer(eb, *name, (unsigned long)&extref->name,
*namelen);
if (index)
*index = btrfs_inode_extref_index(eb, extref);
if (parent_objectid)
*parent_objectid = btrfs_inode_extref_parent(eb, extref);
return 0;
}
static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
u32 *namelen, char **name, u64 *index)
{
struct btrfs_inode_ref *ref;
ref = (struct btrfs_inode_ref *)ref_ptr;
*namelen = btrfs_inode_ref_name_len(eb, ref);
*name = kmalloc(*namelen, GFP_NOFS);
if (*name == NULL)
return -ENOMEM;
read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
if (index)
*index = btrfs_inode_ref_index(eb, ref);
return 0;
}
/*
* Take an inode reference item from the log tree and iterate all names from the
* inode reference item in the subvolume tree with the same key (if it exists).
* For any name that is not in the inode reference item from the log tree, do a
* proper unlink of that name (that is, remove its entry from the inode
* reference item and both dir index keys).
*/
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_inode *inode,
struct extent_buffer *log_eb,
int log_slot,
struct btrfs_key *key)
{
int ret;
unsigned long ref_ptr;
unsigned long ref_end;
struct extent_buffer *eb;
again:
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
if (ret > 0) {
ret = 0;
goto out;
}
if (ret < 0)
goto out;
eb = path->nodes[0];
ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
while (ref_ptr < ref_end) {
char *name = NULL;
int namelen;
u64 parent_id;
if (key->type == BTRFS_INODE_EXTREF_KEY) {
ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
NULL, &parent_id);
} else {
parent_id = key->offset;
ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
NULL);
}
if (ret)
goto out;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
parent_id, name,
namelen);
else
ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
name, namelen);
if (!ret) {
struct inode *dir;
btrfs_release_path(path);
dir = read_one_inode(root, parent_id);
if (!dir) {
ret = -ENOENT;
kfree(name);
goto out;
}
ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
inode, name, namelen);
kfree(name);
iput(dir);
/*
* Whenever we need to check if a name exists or not, we
* check the subvolume tree. So after an unlink we must
* run delayed items, so that future checks for a name
* during log replay see that the name does not exists
* anymore.
*/
if (!ret)
ret = btrfs_run_delayed_items(trans);
if (ret)
goto out;
goto again;
}
kfree(name);
ref_ptr += namelen;
if (key->type == BTRFS_INODE_EXTREF_KEY)
ref_ptr += sizeof(struct btrfs_inode_extref);
else
ref_ptr += sizeof(struct btrfs_inode_ref);
}
ret = 0;
out:
btrfs_release_path(path);
return ret;
}
static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
const u8 ref_type, const char *name,
const int namelen)
{
struct btrfs_key key;
struct btrfs_path *path;
const u64 parent_id = btrfs_ino(BTRFS_I(dir));
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = btrfs_ino(BTRFS_I(inode));
key.type = ref_type;
if (key.type == BTRFS_INODE_REF_KEY)
key.offset = parent_id;
else
key.offset = btrfs_extref_hash(parent_id, name, namelen);
ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
ret = 0;
goto out;
}
if (key.type == BTRFS_INODE_EXTREF_KEY)
ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
path->slots[0], parent_id, name, namelen);
else
ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
name, namelen);
out:
btrfs_free_path(path);
return ret;
}
static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct inode *dir, struct inode *inode, const char *name,
int namelen, u64 ref_index)
{
struct btrfs_dir_item *dir_item;
struct btrfs_key key;
struct btrfs_path *path;
struct inode *other_inode = NULL;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
dir_item = btrfs_lookup_dir_item(NULL, root, path,
btrfs_ino(BTRFS_I(dir)),
name, namelen, 0);
if (!dir_item) {
btrfs_release_path(path);
goto add_link;
} else if (IS_ERR(dir_item)) {
ret = PTR_ERR(dir_item);
goto out;
}
/*
* Our inode's dentry collides with the dentry of another inode which is
* in the log but not yet processed since it has a higher inode number.
* So delete that other dentry.
*/
btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
btrfs_release_path(path);
other_inode = read_one_inode(root, key.objectid);
if (!other_inode) {
ret = -ENOENT;
goto out;
}
ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
name, namelen);
if (ret)
goto out;
/*
* If we dropped the link count to 0, bump it so that later the iput()
* on the inode will not free it. We will fixup the link count later.
*/
if (other_inode->i_nlink == 0)
inc_nlink(other_inode);
ret = btrfs_run_delayed_items(trans);
if (ret)
goto out;
add_link:
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
name, namelen, 0, ref_index);
out:
iput(other_inode);
btrfs_free_path(path);
return ret;
}
/*
* replay one inode back reference item found in the log tree.
* eb, slot and key refer to the buffer and key found in the log tree.
* root is the destination we are replaying into, and path is for temp
* use by this function. (it should be released on return).
*/
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
struct inode *dir = NULL;
struct inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
char *name = NULL;
int namelen;
int ret;
int search_done = 0;
int log_ref_ver = 0;
u64 parent_objectid;
u64 inode_objectid;
u64 ref_index = 0;
int ref_struct_size;
ref_ptr = btrfs_item_ptr_offset(eb, slot);
ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
if (key->type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *r;
ref_struct_size = sizeof(struct btrfs_inode_extref);
log_ref_ver = 1;
r = (struct btrfs_inode_extref *)ref_ptr;
parent_objectid = btrfs_inode_extref_parent(eb, r);
} else {
ref_struct_size = sizeof(struct btrfs_inode_ref);
parent_objectid = key->offset;
}
inode_objectid = key->objectid;
/*
* it is possible that we didn't log all the parent directories
* for a given inode. If we don't find the dir, just don't
* copy the back ref in. The link count fixup code will take
* care of the rest
*/
dir = read_one_inode(root, parent_objectid);
if (!dir) {
ret = -ENOENT;
goto out;
}
inode = read_one_inode(root, inode_objectid);
if (!inode) {
ret = -EIO;
goto out;
}
while (ref_ptr < ref_end) {
if (log_ref_ver) {
ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
&ref_index, &parent_objectid);
/*
* parent object can change from one array
* item to another.
*/
if (!dir)
dir = read_one_inode(root, parent_objectid);
if (!dir) {
ret = -ENOENT;
goto out;
}
} else {
ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
&ref_index);
}
if (ret)
goto out;
ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
btrfs_ino(BTRFS_I(inode)), ref_index,
name, namelen);
if (ret < 0) {
goto out;
} else if (ret == 0) {
/*
* look for a conflicting back reference in the
* metadata. if we find one we have to unlink that name
* of the file before we add our new link. Later on, we
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
if (!search_done) {
ret = __add_inode_ref(trans, root, path, log,
BTRFS_I(dir),
BTRFS_I(inode),
inode_objectid,
parent_objectid,
ref_index, name, namelen,
&search_done);
if (ret) {
if (ret == 1)
ret = 0;
goto out;
}
}
/*
* If a reference item already exists for this inode
* with the same parent and name, but different index,
* drop it and the corresponding directory index entries
* from the parent before adding the new reference item
* and dir index entries, otherwise we would fail with
* -EEXIST returned from btrfs_add_link() below.
*/
ret = btrfs_inode_ref_exists(inode, dir, key->type,
name, namelen);
if (ret > 0) {
ret = btrfs_unlink_inode(trans, root,
BTRFS_I(dir),
BTRFS_I(inode),
name, namelen);
/*
* If we dropped the link count to 0, bump it so
* that later the iput() on the inode will not
* free it. We will fixup the link count later.
*/
if (!ret && inode->i_nlink == 0)
inc_nlink(inode);
/*
* Whenever we need to check if a name exists or
* not, we check the subvolume tree. So after an
* unlink we must run delayed items, so that future
* checks for a name during log replay see that the
* name does not exists anymore.
*/
if (!ret)
ret = btrfs_run_delayed_items(trans);
}
if (ret < 0)
goto out;
/* insert our name */
ret = add_link(trans, root, dir, inode, name, namelen,
ref_index);
if (ret)
goto out;
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
goto out;
}
/* Else, ret == 1, we already have a perfect match, we're done. */
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
kfree(name);
name = NULL;
if (log_ref_ver) {
iput(dir);
dir = NULL;
}
}
/*
* Before we overwrite the inode reference item in the subvolume tree
* with the item from the log tree, we must unlink all names from the
* parent directory that are in the subvolume's tree inode reference
* item, otherwise we end up with an inconsistent subvolume tree where
* dir index entries exist for a name but there is no inode reference
* item with the same name.
*/
ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
key);
if (ret)
goto out;
/* finally write the back reference in the inode */
ret = overwrite_item(trans, root, path, eb, slot, key);
out:
btrfs_release_path(path);
kfree(name);
iput(dir);
iput(inode);
return ret;
}
static int count_inode_extrefs(struct btrfs_root *root,
struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret = 0;
int name_len;
unsigned int nlink = 0;
u32 item_size;
u32 cur_offset = 0;
u64 inode_objectid = btrfs_ino(inode);
u64 offset = 0;
unsigned long ptr;
struct btrfs_inode_extref *extref;
struct extent_buffer *leaf;
while (1) {
ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
&extref, &offset);
if (ret)
break;
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
cur_offset = 0;
while (cur_offset < item_size) {
extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
name_len = btrfs_inode_extref_name_len(leaf, extref);
nlink++;
cur_offset += name_len + sizeof(*extref);
}
offset++;
btrfs_release_path(path);
}
btrfs_release_path(path);
if (ret < 0 && ret != -ENOENT)
return ret;
return nlink;
}
static int count_inode_refs(struct btrfs_root *root,
struct btrfs_inode *inode, struct btrfs_path *path)
{
int ret;
struct btrfs_key key;
unsigned int nlink = 0;
unsigned long ptr;
unsigned long ptr_end;
int name_len;
u64 ino = btrfs_ino(inode);
key.objectid = ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
break;
if (ret > 0) {
if (path->slots[0] == 0)
break;
path->slots[0]--;
}
process_slot:
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
if (key.objectid != ino ||
key.type != BTRFS_INODE_REF_KEY)
break;
ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
while (ptr < ptr_end) {
struct btrfs_inode_ref *ref;
ref = (struct btrfs_inode_ref *)ptr;
name_len = btrfs_inode_ref_name_len(path->nodes[0],
ref);
ptr = (unsigned long)(ref + 1) + name_len;
nlink++;
}
if (key.offset == 0)
break;
if (path->slots[0] > 0) {
path->slots[0]--;
goto process_slot;
}
key.offset--;
btrfs_release_path(path);
}
btrfs_release_path(path);
return nlink;
}
/*
* There are a few corners where the link count of the file can't
* be properly maintained during replay. So, instead of adding
* lots of complexity to the log code, we just scan the backrefs
* for any file that has been through replay.
*
* The scan will update the link count on the inode to reflect the
* number of back refs found. If it goes down to zero, the iput
* will free the inode.
*/
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode)
{
struct btrfs_path *path;
int ret;
u64 nlink = 0;
u64 ino = btrfs_ino(BTRFS_I(inode));
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = count_inode_refs(root, BTRFS_I(inode), path);
if (ret < 0)
goto out;
nlink = ret;
ret = count_inode_extrefs(root, BTRFS_I(inode), path);
if (ret < 0)
goto out;
nlink += ret;
ret = 0;
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
goto out;
}
BTRFS_I(inode)->index_cnt = (u64)-1;
if (inode->i_nlink == 0) {
if (S_ISDIR(inode->i_mode)) {
ret = replay_dir_deletes(trans, root, NULL, path,
ino, 1);
if (ret)
goto out;
}
ret = btrfs_insert_orphan_item(trans, root, ino);
if (ret == -EEXIST)
ret = 0;
}
out:
btrfs_free_path(path);
return ret;
}
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path)
{
int ret;
struct btrfs_key key;
struct inode *inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
break;
if (ret == 1) {
ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
ret = btrfs_del_item(trans, root, path);
if (ret)
break;
btrfs_release_path(path);
inode = read_one_inode(root, key.offset);
if (!inode) {
ret = -EIO;
break;
}
ret = fixup_inode_link_count(trans, root, inode);
iput(inode);
if (ret)
break;
/*
* fixup on a directory may create new entries,
* make sure we always look for the highset possible
* offset
*/
key.offset = (u64)-1;
}
btrfs_release_path(path);
return ret;
}
/*
* record a given inode in the fixup dir so we can check its link
* count when replay is done. The link count is incremented here
* so the inode won't go away until we check it
*/
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
u64 objectid)
{
struct btrfs_key key;
int ret = 0;
struct inode *inode;
inode = read_one_inode(root, objectid);
if (!inode)
return -EIO;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
btrfs_release_path(path);
if (ret == 0) {
if (!inode->i_nlink)
set_nlink(inode, 1);
else
inc_nlink(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
} else if (ret == -EEXIST) {
ret = 0;
}
iput(inode);
return ret;
}
/*
* when replaying the log for a directory, we only insert names
* for inodes that actually exist. This means an fsync on a directory
* does not implicitly fsync all the new files in it
*/
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 dirid, u64 index,
char *name, int name_len,
struct btrfs_key *location)
{
struct inode *inode;
struct inode *dir;
int ret;
inode = read_one_inode(root, location->objectid);
if (!inode)
return -ENOENT;
dir = read_one_inode(root, dirid);
if (!dir) {
iput(inode);
return -EIO;
}
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
name_len, 1, index);
/* FIXME, put inode into FIXUP list */
iput(inode);
iput(dir);
return ret;
}
/*
* take a single entry in a log directory item and replay it into
* the subvolume.
*
* if a conflicting item exists in the subdirectory already,
* the inode it points to is unlinked and put into the link count
* fix up tree.
*
* If a name from the log points to a file or directory that does
* not exist in the FS, it is skipped. fsyncs on directories
* do not force down inodes inside that directory, just changes to the
* names or unlinks in a directory.
*
* Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
* non-existing inode) and 1 if the name was replayed.
*/
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct extent_buffer *eb,
struct btrfs_dir_item *di,
struct btrfs_key *key)
{
char *name;
int name_len;
struct btrfs_dir_item *dst_di;
struct btrfs_key found_key;
struct btrfs_key log_key;
struct inode *dir;
u8 log_type;
bool exists;
int ret;
bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
bool name_added = false;
dir = read_one_inode(root, key->objectid);
if (!dir)
return -EIO;
name_len = btrfs_dir_name_len(eb, di);
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
goto out;
}
log_type = btrfs_dir_type(eb, di);
read_extent_buffer(eb, name, (unsigned long)(di + 1),
name_len);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path);
if (ret < 0)
goto out;
exists = (ret == 0);
ret = 0;
if (key->type == BTRFS_DIR_ITEM_KEY) {
dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
name, name_len, 1);
} else if (key->type == BTRFS_DIR_INDEX_KEY) {
dst_di = btrfs_lookup_dir_index_item(trans, root, path,
key->objectid,
key->offset, name,
name_len, 1);
} else {
/* Corruption */
ret = -EINVAL;
goto out;
}
if (IS_ERR(dst_di)) {
ret = PTR_ERR(dst_di);
goto out;
} else if (!dst_di) {
/* we need a sequence number to insert, so we only
* do inserts for the BTRFS_DIR_INDEX_KEY types
*/
if (key->type != BTRFS_DIR_INDEX_KEY)
goto out;
goto insert;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
/* the existing item matches the logged item */
if (found_key.objectid == log_key.objectid &&
found_key.type == log_key.type &&
found_key.offset == log_key.offset &&
btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
update_size = false;
goto out;
}
/*
* don't drop the conflicting directory entry if the inode
* for the new entry doesn't exist
*/
if (!exists)
goto out;
ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
if (ret)
goto out;
if (key->type == BTRFS_DIR_INDEX_KEY)
goto insert;
out:
btrfs_release_path(path);
if (!ret && update_size) {
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
}
kfree(name);
iput(dir);
if (!ret && name_added)
ret = 1;
return ret;
insert:
/*
* Check if the inode reference exists in the log for the given name,
* inode and parent inode
*/
found_key.objectid = log_key.objectid;
found_key.type = BTRFS_INODE_REF_KEY;
found_key.offset = key->objectid;
ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
if (ret < 0) {
goto out;
} else if (ret) {
/* The dentry will be added later. */
ret = 0;
update_size = false;
goto out;
}
found_key.objectid = log_key.objectid;
found_key.type = BTRFS_INODE_EXTREF_KEY;
found_key.offset = key->objectid;
ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
name_len);
if (ret < 0) {
goto out;
} else if (ret) {
/* The dentry will be added later. */
ret = 0;
update_size = false;
goto out;
}
btrfs_release_path(path);
ret = insert_one_name(trans, root, key->objectid, key->offset,
name, name_len, &log_key);
if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
if (!ret)
name_added = true;
update_size = false;
ret = 0;
goto out;
}
/*
* find all the names in a directory item and reconcile them into
* the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
* one name in a directory item, but the same code gets used for
* both directory index types
*/
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
int ret = 0;
u32 item_size = btrfs_item_size_nr(eb, slot);
struct btrfs_dir_item *di;
int name_len;
unsigned long ptr;
unsigned long ptr_end;
struct btrfs_path *fixup_path = NULL;
ptr = btrfs_item_ptr_offset(eb, slot);
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
name_len = btrfs_dir_name_len(eb, di);
ret = replay_one_name(trans, root, path, eb, di, key);
if (ret < 0)
break;
ptr = (unsigned long)(di + 1);
ptr += name_len;
/*
* If this entry refers to a non-directory (directories can not
* have a link count > 1) and it was added in the transaction
* that was not committed, make sure we fixup the link count of
* the inode it the entry points to. Otherwise something like
* the following would result in a directory pointing to an
* inode with a wrong link that does not account for this dir
* entry:
*
* mkdir testdir
* touch testdir/foo
* touch testdir/bar
* sync
*
* ln testdir/bar testdir/bar_link
* ln testdir/foo testdir/foo_link
* xfs_io -c "fsync" testdir/bar
*
* <power failure>
*
* mount fs, log replay happens
*
* File foo would remain with a link count of 1 when it has two
* entries pointing to it in the directory testdir. This would
* make it impossible to ever delete the parent directory has
* it would result in stale dentries that can never be deleted.
*/
if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
struct btrfs_key di_key;
if (!fixup_path) {
fixup_path = btrfs_alloc_path();
if (!fixup_path) {
ret = -ENOMEM;
break;
}
}
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
ret = link_to_fixup_dir(trans, root, fixup_path,
di_key.objectid);
if (ret)
break;
}
ret = 0;
}
btrfs_free_path(fixup_path);
return ret;
}
/*
* directory replay has two parts. There are the standard directory
* items in the log copied from the subvolume, and range items
* created in the log while the subvolume was logged.
*
* The range items tell us which parts of the key space the log
* is authoritative for. During replay, if a key in the subvolume
* directory is in a logged range item, but not actually in the log
* that means it was deleted from the directory before the fsync
* and should be removed.
*/
static noinline int find_dir_range(struct btrfs_root *root,
struct btrfs_path *path,
u64 dirid, int key_type,
u64 *start_ret, u64 *end_ret)
{
struct btrfs_key key;
u64 found_end;
struct btrfs_dir_log_item *item;
int ret;
int nritems;
if (*start_ret == (u64)-1)
return 1;
key.objectid = dirid;
key.type = key_type;
key.offset = *start_ret;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
if (path->slots[0] == 0)
goto out;
path->slots[0]--;
}
if (ret != 0)
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type != key_type || key.objectid != dirid) {
ret = 1;
goto next;
}
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
found_end = btrfs_dir_log_end(path->nodes[0], item);
if (*start_ret >= key.offset && *start_ret <= found_end) {
ret = 0;
*start_ret = key.offset;
*end_ret = found_end;
goto out;
}
ret = 1;
next:
/* check the next slot in the tree to see if it is a valid item */
nritems = btrfs_header_nritems(path->nodes[0]);
path->slots[0]++;
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret)
goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type != key_type || key.objectid != dirid) {
ret = 1;
goto out;
}
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
found_end = btrfs_dir_log_end(path->nodes[0], item);
*start_ret = key.offset;
*end_ret = found_end;
ret = 0;
out:
btrfs_release_path(path);
return ret;
}
/*
* this looks for a given directory item in the log. If the directory
* item is not in the log, the item is removed and the inode it points
* to is unlinked
*/
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
struct inode *dir,
struct btrfs_key *dir_key)
{
int ret;
struct extent_buffer *eb;
int slot;
u32 item_size;
struct btrfs_dir_item *di;
struct btrfs_dir_item *log_di;
int name_len;
unsigned long ptr;
unsigned long ptr_end;
char *name;
struct inode *inode;
struct btrfs_key location;
again:
eb = path->nodes[0];
slot = path->slots[0];
item_size = btrfs_item_size_nr(eb, slot);
ptr = btrfs_item_ptr_offset(eb, slot);
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
name_len = btrfs_dir_name_len(eb, di);
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
goto out;
}
read_extent_buffer(eb, name, (unsigned long)(di + 1),
name_len);
log_di = NULL;
if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
log_di = btrfs_lookup_dir_item(trans, log, log_path,
dir_key->objectid,
name, name_len, 0);
} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
log_di = btrfs_lookup_dir_index_item(trans, log,
log_path,
dir_key->objectid,
dir_key->offset,
name, name_len, 0);
}
if (!log_di) {
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
inode = read_one_inode(root, location.objectid);
if (!inode) {
kfree(name);
return -EIO;
}
ret = link_to_fixup_dir(trans, root,
path, location.objectid);
if (ret) {
kfree(name);
iput(inode);
goto out;
}
inc_nlink(inode);
ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
BTRFS_I(inode), name, name_len);
if (!ret)
ret = btrfs_run_delayed_items(trans);
kfree(name);
iput(inode);
if (ret)
goto out;
/* there might still be more names under this key
* check and repeat if required
*/
ret = btrfs_search_slot(NULL, root, dir_key, path,
0, 0);
if (ret == 0)
goto again;
ret = 0;
goto out;
} else if (IS_ERR(log_di)) {
kfree(name);
return PTR_ERR(log_di);
}
btrfs_release_path(log_path);
kfree(name);
ptr = (unsigned long)(di + 1);
ptr += name_len;
}
ret = 0;
out:
btrfs_release_path(path);
btrfs_release_path(log_path);
return ret;
}
static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
const u64 ino)
{
struct btrfs_key search_key;
struct btrfs_path *log_path;
int i;
int nritems;
int ret;
log_path = btrfs_alloc_path();
if (!log_path)
return -ENOMEM;
search_key.objectid = ino;
search_key.type = BTRFS_XATTR_ITEM_KEY;
search_key.offset = 0;
again:
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto out;
process_leaf:
nritems = btrfs_header_nritems(path->nodes[0]);
for (i = path->slots[0]; i < nritems; i++) {
struct btrfs_key key;
struct btrfs_dir_item *di;
struct btrfs_dir_item *log_di;
u32 total_size;
u32 cur;
btrfs_item_key_to_cpu(path->nodes[0], &key, i);
if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
ret = 0;
goto out;
}
di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
total_size = btrfs_item_size_nr(path->nodes[0], i);
cur = 0;
while (cur < total_size) {
u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
u32 this_len = sizeof(*di) + name_len + data_len;
char *name;
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
ret = -ENOMEM;
goto out;
}
read_extent_buffer(path->nodes[0], name,
(unsigned long)(di + 1), name_len);
log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
name, name_len, 0);
btrfs_release_path(log_path);
if (!log_di) {
/* Doesn't exist in log tree, so delete it. */
btrfs_release_path(path);
di = btrfs_lookup_xattr(trans, root, path, ino,
name, name_len, -1);
kfree(name);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
}
ASSERT(di);
ret = btrfs_delete_one_dir_name(trans, root,
path, di);
if (ret)
goto out;
btrfs_release_path(path);
search_key = key;
goto again;
}
kfree(name);
if (IS_ERR(log_di)) {
ret = PTR_ERR(log_di);
goto out;
}
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
}
ret = btrfs_next_leaf(root, path);
if (ret > 0)
ret = 0;
else if (ret == 0)
goto process_leaf;
out:
btrfs_free_path(log_path);
btrfs_release_path(path);
return ret;
}
/*
* deletion replay happens before we copy any new directory items
* out of the log or out of backreferences from inodes. It
* scans the log to find ranges of keys that log is authoritative for,
* and then scans the directory to find items in those ranges that are
* not present in the log.
*
* Anything we don't find in the log is unlinked and removed from the
* directory.
*/
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
u64 dirid, int del_all)
{
u64 range_start;
u64 range_end;
int key_type = BTRFS_DIR_LOG_ITEM_KEY;
int ret = 0;
struct btrfs_key dir_key;
struct btrfs_key found_key;
struct btrfs_path *log_path;
struct inode *dir;
dir_key.objectid = dirid;
dir_key.type = BTRFS_DIR_ITEM_KEY;
log_path = btrfs_alloc_path();
if (!log_path)
return -ENOMEM;
dir = read_one_inode(root, dirid);
/* it isn't an error if the inode isn't there, that can happen
* because we replay the deletes before we copy in the inode item
* from the log
*/
if (!dir) {
btrfs_free_path(log_path);
return 0;
}
again:
range_start = 0;
range_end = 0;
while (1) {
if (del_all)
range_end = (u64)-1;
else {
ret = find_dir_range(log, path, dirid, key_type,
&range_start, &range_end);
if (ret < 0)
goto out;
else if (ret > 0)
break;
}
dir_key.offset = range_start;
while (1) {
int nritems;
ret = btrfs_search_slot(NULL, root, &dir_key, path,
0, 0);
if (ret < 0)
goto out;
nritems = btrfs_header_nritems(path->nodes[0]);
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret == 1)
break;
else if (ret < 0)
goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
if (found_key.objectid != dirid ||
found_key.type != dir_key.type)
goto next_type;
if (found_key.offset > range_end)
break;
ret = check_item_in_log(trans, root, log, path,
log_path, dir,
&found_key);
if (ret)
goto out;
if (found_key.offset == (u64)-1)
break;
dir_key.offset = found_key.offset + 1;
}
btrfs_release_path(path);
if (range_end == (u64)-1)
break;
range_start = range_end + 1;
}
next_type:
ret = 0;
if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
key_type = BTRFS_DIR_LOG_INDEX_KEY;
dir_key.type = BTRFS_DIR_INDEX_KEY;
btrfs_release_path(path);
goto again;
}
out:
btrfs_release_path(path);
btrfs_free_path(log_path);
iput(dir);
return ret;
}
/*
* the process_func used to replay items from the log tree. This
* gets called in two different stages. The first stage just looks
* for inodes and makes sure they are all copied into the subvolume.
*
* The second stage copies all the other item types from the log into
* the subvolume. The two stage approach is slower, but gets rid of
* lots of complexity around inodes referencing other inodes that exist
* only in the log (references come from either directory items or inode
* back refs).
*/
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
struct walk_control *wc, u64 gen, int level)
{
int nritems;
struct btrfs_path *path;
struct btrfs_root *root = wc->replay_dest;
struct btrfs_key key;
int i;
int ret;
ret = btrfs_read_buffer(eb, gen, level, NULL);
if (ret)
return ret;
level = btrfs_header_level(eb);
if (level != 0)
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
nritems = btrfs_header_nritems(eb);
for (i = 0; i < nritems; i++) {
btrfs_item_key_to_cpu(eb, &key, i);
/* inode keys are done during the first stage */
if (key.type == BTRFS_INODE_ITEM_KEY &&
wc->stage == LOG_WALK_REPLAY_INODES) {
struct btrfs_inode_item *inode_item;
u32 mode;
inode_item = btrfs_item_ptr(eb, i,
struct btrfs_inode_item);
/*
* If we have a tmpfile (O_TMPFILE) that got fsync'ed
* and never got linked before the fsync, skip it, as
* replaying it is pointless since it would be deleted
* later. We skip logging tmpfiles, but it's always
* possible we are replaying a log created with a kernel
* that used to log tmpfiles.
*/
if (btrfs_inode_nlink(eb, inode_item) == 0) {
wc->ignore_cur_inode = true;
continue;
} else {
wc->ignore_cur_inode = false;
}
ret = replay_xattr_deletes(wc->trans, root, log,
path, key.objectid);
if (ret)
break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
ret = replay_dir_deletes(wc->trans,
root, log, path, key.objectid, 0);
if (ret)
break;
}
ret = overwrite_item(wc->trans, root, path,
eb, i, &key);
if (ret)
break;
/*
* Before replaying extents, truncate the inode to its
* size. We need to do it now and not after log replay
* because before an fsync we can have prealloc extents
* added beyond the inode's i_size. If we did it after,
* through orphan cleanup for example, we would drop
* those prealloc extents just after replaying them.
*/
if (S_ISREG(mode)) {
struct btrfs_drop_extents_args drop_args = { 0 };
struct inode *inode;
u64 from;
inode = read_one_inode(root, key.objectid);
if (!inode) {
ret = -EIO;
break;
}
from = ALIGN(i_size_read(inode),
root->fs_info->sectorsize);
drop_args.start = from;
drop_args.end = (u64)-1;
drop_args.drop_cache = true;
ret = btrfs_drop_extents(wc->trans, root,
BTRFS_I(inode),
&drop_args);
if (!ret) {
inode_sub_bytes(inode,
drop_args.bytes_found);
/* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
root, BTRFS_I(inode));
}
iput(inode);
if (ret)
break;
}
ret = link_to_fixup_dir(wc->trans, root,
path, key.objectid);
if (ret)
break;
}
if (wc->ignore_cur_inode)
continue;
if (key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
ret = replay_one_dir_item(wc->trans, root, path,
eb, i, &key);
if (ret)
break;
}
if (wc->stage < LOG_WALK_REPLAY_ALL)
continue;
/* these keys are simply copied */
if (key.type == BTRFS_XATTR_ITEM_KEY) {
ret = overwrite_item(wc->trans, root, path,
eb, i, &key);
if (ret)
break;
} else if (key.type == BTRFS_INODE_REF_KEY ||
key.type == BTRFS_INODE_EXTREF_KEY) {
ret = add_inode_ref(wc->trans, root, log, path,
eb, i, &key);
if (ret && ret != -ENOENT)
break;
ret = 0;
} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
ret = replay_one_extent(wc->trans, root, path,
eb, i, &key);
if (ret)
break;
} else if (key.type == BTRFS_DIR_ITEM_KEY) {
ret = replay_one_dir_item(wc->trans, root, path,
eb, i, &key);
if (ret)
break;
}
}
btrfs_free_path(path);
return ret;
}
/*
* Correctly adjust the reserved bytes occupied by a log tree extent buffer
*/
static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
{
struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
if (!cache) {
btrfs_err(fs_info, "unable to find block group for %llu", start);
return;
}
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->reserved -= fs_info->nodesize;
cache->space_info->bytes_reserved -= fs_info->nodesize;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
btrfs_put_block_group(cache);
}
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int *level,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
struct extent_buffer *cur;
u32 blocksize;
int ret = 0;
while (*level > 0) {
struct btrfs_key first_key;
cur = path->nodes[*level];
WARN_ON(btrfs_header_level(cur) != *level); if (path->slots[*level] >=
btrfs_header_nritems(cur))
break;
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
next = btrfs_find_create_tree_block(fs_info, bytenr,
btrfs_header_owner(cur),
*level - 1);
if (IS_ERR(next))
return PTR_ERR(next); if (*level == 1) { ret = wc->process_func(root, next, wc, ptr_gen,
*level - 1);
if (ret) {
free_extent_buffer(next);
return ret;
}
path->slots[*level]++;
if (wc->free) {
ret = btrfs_read_buffer(next, ptr_gen,
*level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;
}
if (trans) { btrfs_tree_lock(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
ret = btrfs_pin_reserved_extent(trans,
bytenr, blocksize);
if (ret) {
free_extent_buffer(next);
return ret;
}
btrfs_redirty_list_add(
trans->transaction, next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); unaccount_log_buffer(fs_info, bytenr);
}
}
free_extent_buffer(next);
continue;
}
ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;
}
if (path->nodes[*level-1]) free_extent_buffer(path->nodes[*level-1]); path->nodes[*level-1] = next;
*level = btrfs_header_level(next);
path->slots[*level] = 0;
cond_resched();
}
path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
cond_resched();
return 0;
}
static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int *level,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int i;
int slot;
int ret;
for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { slot = path->slots[i];
if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
path->slots[i]++;
*level = i;
WARN_ON(*level == 0);
return 0;
} else {
ret = wc->process_func(root, path->nodes[*level], wc,
btrfs_header_generation(path->nodes[*level]),
*level);
if (ret)
return ret;
if (wc->free) {
struct extent_buffer *next;
next = path->nodes[*level];
if (trans) { btrfs_tree_lock(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
ret = btrfs_pin_reserved_extent(trans,
path->nodes[*level]->start,
path->nodes[*level]->len);
if (ret)
return ret;
btrfs_redirty_list_add(trans->transaction,
next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next);
unaccount_log_buffer(fs_info,
path->nodes[*level]->start);
}
}
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
*level = i + 1;
}
}
return 1;
}
/*
* drop the reference count on the tree rooted at 'snap'. This traverses
* the tree freeing any blocks that have a ref count of zero after being
* decremented.
*/
static int walk_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *log, struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
int wret;
int level;
struct btrfs_path *path;
int orig_level;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
level = btrfs_header_level(log->node);
orig_level = level;
path->nodes[level] = log->node;
atomic_inc(&log->node->refs);
path->slots[level] = 0;
while (1) {
wret = walk_down_log_tree(trans, log, path, &level, wc);
if (wret > 0)
break;
if (wret < 0) {
ret = wret;
goto out;
}
wret = walk_up_log_tree(trans, log, path, &level, wc);
if (wret > 0)
break;
if (wret < 0) {
ret = wret;
goto out;
}
}
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) { ret = wc->process_func(log, path->nodes[orig_level], wc,
btrfs_header_generation(path->nodes[orig_level]),
orig_level);
if (ret)
goto out;
if (wc->free) {
struct extent_buffer *next;
next = path->nodes[orig_level];
if (trans) {
btrfs_tree_lock(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
ret = btrfs_pin_reserved_extent(trans,
next->start, next->len);
if (ret)
goto out;
btrfs_redirty_list_add(trans->transaction, next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); unaccount_log_buffer(fs_info, next->start);
}
}
}
out:
btrfs_free_path(path); return ret;
}
/*
* helper function to update the item for a given subvolumes log root
* in the tree of log roots
*/
static int update_log_root(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_root_item *root_item)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret;
if (log->log_transid == 1) {
/* insert root item on the first sync */
ret = btrfs_insert_root(trans, fs_info->log_root_tree,
&log->root_key, root_item);
} else {
ret = btrfs_update_root(trans, fs_info->log_root_tree,
&log->root_key, root_item);
}
return ret;
}
static void wait_log_commit(struct btrfs_root *root, int transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
/*
* we only allow two pending log transactions at a time,
* so we know that if ours is more than 2 older than the
* current transaction, we're done
*/
for (;;) {
prepare_to_wait(&root->log_commit_wait[index],
&wait, TASK_UNINTERRUPTIBLE);
if (!(root->log_transid_committed < transid &&
atomic_read(&root->log_commit[index])))
break;
mutex_unlock(&root->log_mutex);
schedule();
mutex_lock(&root->log_mutex);
}
finish_wait(&root->log_commit_wait[index], &wait);
}
static void wait_for_writer(struct btrfs_root *root)
{
DEFINE_WAIT(wait);
for (;;) {
prepare_to_wait(&root->log_writer_wait, &wait,
TASK_UNINTERRUPTIBLE);
if (!atomic_read(&root->log_writers))
break;
mutex_unlock(&root->log_mutex);
schedule();
mutex_lock(&root->log_mutex);
}
finish_wait(&root->log_writer_wait, &wait);
}
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
if (!ctx)
return;
mutex_lock(&root->log_mutex);
list_del_init(&ctx->list);
mutex_unlock(&root->log_mutex);
}
/*
* Invoked in log mutex context, or be sure there is no other task which
* can access the list.
*/
static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
int index, int error)
{
struct btrfs_log_ctx *ctx;
struct btrfs_log_ctx *safe;
list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
list_del_init(&ctx->list);
ctx->log_ret = error;
}
}
/*
* btrfs_sync_log does sends a given tree log down to the disk and
* updates the super blocks to record it. When this call is done,
* you know that any inodes previously logged are safely on disk only
* if it returns 0.
*
* Any other return value means you need to call btrfs_commit_transaction.
* Some of the edge cases for fsyncing directories that have had unlinks
* or renames done in the past mean that sometimes the only safe
* fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
* that has happened.
*/
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
int index1;
int index2;
int mark;
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = fs_info->log_root_tree;
struct btrfs_root_item new_root_item;
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
u64 log_root_start;
u64 log_root_level;
mutex_lock(&root->log_mutex);
log_transid = ctx->log_transid;
if (root->log_transid_committed >= log_transid) {
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
wait_log_commit(root, log_transid);
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
ASSERT(log_transid == root->log_transid);
atomic_set(&root->log_commit[index1], 1);
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(root, log_transid - 1);
while (1) {
int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
if (!btrfs_test_opt(fs_info, SSD) &&
test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
}
wait_for_writer(root);
if (batch == atomic_read(&root->log_batch))
break;
}
/* bail out if we need to do a full commit */
if (btrfs_need_log_full_commit(trans)) {
ret = -EAGAIN;
mutex_unlock(&root->log_mutex);
goto out;
}
if (log_transid % 2 == 0)
mark = EXTENT_DIRTY;
else
mark = EXTENT_NEW;
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
*/
blk_start_plug(&plug);
ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
/*
* -EAGAIN happens when someone, e.g., a concurrent transaction
* commit, writes a dirty extent in this tree-log commit. This
* concurrent write will create a hole writing out the extents,
* and we cannot proceed on a zoned filesystem, requiring
* sequential writing. While we can bail out to a full commit
* here, but we can continue hoping the concurrent writing fills
* the hole.
*/
if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
ret = 0;
if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret);
btrfs_set_log_full_commit(trans);
mutex_unlock(&root->log_mutex);
goto out;
}
/*
* We _must_ update under the root->log_mutex in order to make sure we
* have a consistent view of the log root we are trying to commit at
* this moment.
*
* We _must_ copy this into a local copy, because we are not holding the
* log_root_tree->log_mutex yet. This is important because when we
* commit the log_root_tree we must have a consistent view of the
* log_root_tree when we update the super block to point at the
* log_root_tree bytenr. If we update the log_root_tree here we'll race
* with the commit and possibly point at the new block which we may not
* have written out.
*/
btrfs_set_root_node(&log->root_item, log->node);
memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
* new positions. so it's safe to allow log writers to go in.
*/
mutex_unlock(&root->log_mutex);
if (btrfs_is_zoned(fs_info)) {
mutex_lock(&fs_info->tree_root->log_mutex);
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
mutex_unlock(&fs_info->tree_root->log_mutex);
blk_finish_plug(&plug);
goto out;
}
}
mutex_unlock(&fs_info->tree_root->log_mutex);
}
btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit
* open until we drop the log_mutex.
*/
ret = update_log_root(trans, log, &new_root_item);
if (ret) {
if (!list_empty(&root_log_ctx.list))
list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
btrfs_set_log_full_commit(trans);
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out;
}
if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { blk_finish_plug(&plug);
list_del_init(&root_log_ctx.list);
mutex_unlock(&log_root_tree->log_mutex);
ret = root_log_ctx.log_ret;
goto out;
}
index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
ret = btrfs_wait_tree_log_extents(log, mark);
wait_log_commit(log_root_tree,
root_log_ctx.log_transid);
mutex_unlock(&log_root_tree->log_mutex);
if (!ret)
ret = root_log_ctx.log_ret;
goto out;
}
ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
wait_log_commit(log_root_tree,
root_log_ctx.log_transid - 1);
}
/*
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
*/
if (btrfs_need_log_full_commit(trans)) { blk_finish_plug(&plug);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
ret = -EAGAIN;
goto out_wake_log_root;
}
ret = btrfs_write_marked_extents(fs_info,
&log_root_tree->dirty_log_pages,
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
/*
* As described above, -EAGAIN indicates a hole in the extents. We
* cannot wait for these write outs since the waiting cause a
* deadlock. Bail out to the full commit instead.
*/
if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
btrfs_set_log_full_commit(trans);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
} else if (ret) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
ret = btrfs_wait_tree_log_extents(log, mark);
if (!ret)
ret = btrfs_wait_tree_log_extents(log_root_tree,
EXTENT_NEW | EXTENT_DIRTY);
if (ret) {
btrfs_set_log_full_commit(trans);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
log_root_start = log_root_tree->node->start;
log_root_level = btrfs_header_level(log_root_tree->node);
log_root_tree->log_transid++;
mutex_unlock(&log_root_tree->log_mutex);
/*
* Here we are guaranteed that nobody is going to write the superblock
* for the current transaction before us and that neither we do write
* our superblock before the previous transaction finishes its commit
* and writes its superblock, because:
*
* 1) We are holding a handle on the current transaction, so no body
* can commit it until we release the handle;
*
* 2) Before writing our superblock we acquire the tree_log_mutex, so
* if the previous transaction is still committing, and hasn't yet
* written its superblock, we wait for it to do it, because a
* transaction commit acquires the tree_log_mutex when the commit
* begins and releases it only after writing its superblock.
*/
mutex_lock(&fs_info->tree_log_mutex);
/*
* The previous transaction writeout phase could have failed, and thus
* marked the fs in an error state. We must not commit here, as we
* could have updated our generation in the super_for_commit and
* writing the super here would result in transid mismatches. If there
* is an error here just bail.
*/
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
ret = -EIO;
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
mutex_unlock(&fs_info->tree_log_mutex);
goto out_wake_log_root;
}
btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
ret = write_all_supers(fs_info, 1);
mutex_unlock(&fs_info->tree_log_mutex);
if (ret) {
btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
}
/*
* We know there can only be one task here, since we have not yet set
* root->log_commit[index1] to 0 and any task attempting to sync the
* log must wait for the previous log transaction to commit if it's
* still in progress or wait for the current log transaction commit if
* someone else already started it. We use <= and not < because the
* first log transaction has an ID of 0.
*/
ASSERT(root->last_log_commit <= log_transid);
root->last_log_commit = log_transid;
out_wake_log_root:
mutex_lock(&log_root_tree->log_mutex);
btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
log_root_tree->log_transid_committed++;
atomic_set(&log_root_tree->log_commit[index2], 0);
mutex_unlock(&log_root_tree->log_mutex);
/*
* The barrier before waitqueue_active (in cond_wake_up) is needed so
* all the updates above are seen by the woken threads. It might not be
* necessary, but proving that seems to be hard.
*/
cond_wake_up(&log_root_tree->log_commit_wait[index2]);
out:
mutex_lock(&root->log_mutex);
btrfs_remove_all_log_ctxs(root, index1, ret);
root->log_transid_committed++;
atomic_set(&root->log_commit[index1], 0);
mutex_unlock(&root->log_mutex);
/*
* The barrier before waitqueue_active (in cond_wake_up) is needed so
* all the updates above are seen by the woken threads. It might not be
* necessary, but proving that seems to be hard.
*/
cond_wake_up(&root->log_commit_wait[index1]);
return ret;
}
static void free_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *log)
{
int ret;
struct walk_control wc = {
.free = 1,
.process_func = process_one_buffer
};
if (log->node) {
ret = walk_log_tree(trans, log, &wc);
if (ret) {
/*
* We weren't able to traverse the entire log tree, the
* typical scenario is getting an -EIO when reading an
* extent buffer of the tree, due to a previous writeback
* failure of it.
*/
set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
&log->fs_info->fs_state);
/*
* Some extent buffers of the log tree may still be dirty
* and not yet written back to storage, because we may
* have updates to a log tree without syncing a log tree,
* such as during rename and link operations. So flush
* them out and wait for their writeback to complete, so
* that we properly cleanup their state and pages.
*/
btrfs_write_marked_extents(log->fs_info,
&log->dirty_log_pages,
EXTENT_DIRTY | EXTENT_NEW);
btrfs_wait_tree_log_extents(log,
EXTENT_DIRTY | EXTENT_NEW);
if (trans)
btrfs_abort_transaction(trans, ret);
else
btrfs_handle_fs_error(log->fs_info, ret, NULL);
}
}
clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
extent_io_tree_release(&log->log_csum_range);
btrfs_put_root(log);
}
/*
* free all the extents used by the tree log. This should be called
* at commit time of the full transaction
*/
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
{
if (root->log_root) { free_log_tree(trans, root->log_root);
root->log_root = NULL;
clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
}
return 0;
}
int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
if (fs_info->log_root_tree) { free_log_tree(trans, fs_info->log_root_tree);
fs_info->log_root_tree = NULL;
clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
}
return 0;
}
/*
* Check if an inode was logged in the current transaction. This may often
* return some false positives, because logged_trans is an in memory only field,
* not persisted anywhere. This is meant to be used in contexts where a false
* positive has no functional consequences.
*/
static bool inode_logged(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
if (inode->logged_trans == trans->transid)
return true;
/*
* The inode's logged_trans is always 0 when we load it (because it is
* not persisted in the inode item or elsewhere). So if it is 0, the
* inode was last modified in the current transaction then the inode may
* have been logged before in the current transaction, then evicted and
* loaded again in the current transaction - or may have never been logged
* in the current transaction, but since we can not be sure, we have to
* assume it was, otherwise our callers can leave an inconsistent log.
*/
if (inode->logged_trans == 0 &&
inode->last_trans == trans->transid &&
!test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
return true;
return false;
}
/*
* If both a file and directory are logged, and unlinks or renames are
* mixed in, we have a few interesting corners:
*
* create file X in dir Y
* link file X to X.link in dir Y
* fsync file X
* unlink file X but leave X.link
* fsync dir Y
*
* After a crash we would expect only X.link to exist. But file X
* didn't get fsync'd again so the log has back refs for X and X.link.
*
* We solve this by removing directory entries and inode backrefs from the
* log when a file that was logged in the current transaction is
* unlinked. Any later fsync will include the updated log entries, and
* we'll be able to reconstruct the proper directory items from backrefs.
*
* This optimizations allows us to avoid relogging the entire inode
* or the entire directory.
*/
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct btrfs_inode *dir, u64 index)
{
struct btrfs_root *log;
struct btrfs_dir_item *di;
struct btrfs_path *path;
int ret;
int err = 0;
u64 dir_ino = btrfs_ino(dir);
if (!inode_logged(trans, dir)) return 0; ret = join_running_log_trans(root);
if (ret)
return 0;
mutex_lock(&dir->log_mutex);
log = root->log_root;
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
goto out_unlock;
}
di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
name, name_len, -1);
if (IS_ERR(di)) {
err = PTR_ERR(di);
goto fail;
}
if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di);
if (ret) {
err = ret;
goto fail;
}
}
btrfs_release_path(path);
di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
index, name, name_len, -1);
if (IS_ERR(di)) {
err = PTR_ERR(di);
goto fail;
}
if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di);
if (ret) {
err = ret;
goto fail;
}
}
/*
* We do not need to update the size field of the directory's inode item
* because on log replay we update the field to reflect all existing
* entries in the directory (see overwrite_item()).
*/
fail:
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
if (err == -ENOSPC) {
btrfs_set_log_full_commit(trans);
err = 0;
} else if (err < 0) { btrfs_abort_transaction(trans, err);
}
btrfs_end_log_trans(root);
return err;
}
/* see comments for btrfs_del_dir_entries_in_log */
int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct btrfs_inode *inode, u64 dirid)
{
struct btrfs_root *log;
u64 index;
int ret;
if (!inode_logged(trans, inode)) return 0; ret = join_running_log_trans(root);
if (ret)
return 0;
log = root->log_root;
mutex_lock(&inode->log_mutex);
ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
dirid, &index);
mutex_unlock(&inode->log_mutex);
if (ret == -ENOSPC) {
btrfs_set_log_full_commit(trans);
ret = 0;
} else if (ret < 0 && ret != -ENOENT) btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root);
return ret;
}
/*
* creates a range item in the log for 'dirid'. first_offset and
* last_offset tell us which parts of the key space the log should
* be considered authoritative for.
*/
static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
int key_type, u64 dirid,
u64 first_offset, u64 last_offset)
{
int ret;
struct btrfs_key key;
struct btrfs_dir_log_item *item;
key.objectid = dirid;
key.offset = first_offset;
if (key_type == BTRFS_DIR_ITEM_KEY)
key.type = BTRFS_DIR_LOG_ITEM_KEY;
else
key.type = BTRFS_DIR_LOG_INDEX_KEY;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
if (ret)
return ret;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(path);
return 0;
}
/*
* log all the items included in the current transaction for a given
* directory. This also creates the range items in the log tree required
* to replay anything deleted before the fsync
*/
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path, int key_type,
struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src;
int err = 0;
int ret;
int i;
int nritems;
u64 first_offset = min_offset;
u64 last_offset = (u64)-1;
u64 ino = btrfs_ino(inode);
log = root->log_root;
min_key.objectid = ino;
min_key.type = key_type;
min_key.offset = min_offset;
ret = btrfs_search_forward(root, &min_key, path, trans->transid);
/*
* we didn't find anything from this transaction, see if there
* is anything at all
*/
if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { min_key.objectid = ino;
min_key.type = key_type;
min_key.offset = (u64)-1;
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
if (ret < 0) {
btrfs_release_path(path);
return ret;
}
ret = btrfs_previous_item(root, path, ino, key_type);
/* if ret == 0 there are items for this type,
* create a range to tell us the last key of this type.
* otherwise, there are no items in this directory after
* *min_offset, and we create a range to indicate that.
*/
if (ret == 0) {
struct btrfs_key tmp;
btrfs_item_key_to_cpu(path->nodes[0], &tmp,
path->slots[0]);
if (key_type == tmp.type)
first_offset = max(min_offset, tmp.offset) + 1;
}
goto done;
}
/* go backward to find any previous key */
ret = btrfs_previous_item(root, path, ino, key_type);
if (ret == 0) {
struct btrfs_key tmp;
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); if (key_type == tmp.type) { first_offset = tmp.offset;
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
&tmp);
if (ret) {
err = ret;
goto done;
}
}
}
btrfs_release_path(path);
/*
* Find the first key from this transaction again. See the note for
* log_new_dir_dentries, if we're logging a directory recursively we
* won't be holding its i_mutex, which means we can modify the directory
* while we're logging it. If we remove an entry between our first
* search and this search we'll not find the key again and can just
* bail.
*/
search:
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); if (ret != 0)
goto done;
/*
* we have a block from this transaction, log every item in it
* from our directory
*/
while (1) {
struct btrfs_key tmp;
src = path->nodes[0];
nritems = btrfs_header_nritems(src); for (i = path->slots[0]; i < nritems; i++) {
struct btrfs_dir_item *di;
btrfs_item_key_to_cpu(src, &min_key, i);
if (min_key.objectid != ino || min_key.type != key_type) goto done;
if (need_resched()) {
btrfs_release_path(path);
cond_resched();
goto search;
}
ret = overwrite_item(trans, log, dst_path, src, i,
&min_key);
if (ret) {
err = ret;
goto done;
}
/*
* We must make sure that when we log a directory entry,
* the corresponding inode, after log replay, has a
* matching link count. For example:
*
* touch foo
* mkdir mydir
* sync
* ln foo mydir/bar
* xfs_io -c "fsync" mydir
* <crash>
* <mount fs and log replay>
*
* Would result in a fsync log that when replayed, our
* file inode would have a link count of 1, but we get
* two directory entries pointing to the same inode.
* After removing one of the names, it would not be
* possible to remove the other name, which resulted
* always in stale file handle errors, and would not
* be possible to rmdir the parent directory, since
* its i_size could never decrement to the value
* BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
*/
di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(src, di, &tmp);
if (ctx &&
(btrfs_dir_transid(src, di) == trans->transid ||
btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
tmp.type != BTRFS_ROOT_ITEM_KEY) ctx->log_new_dentries = true;
}
path->slots[0] = nritems;
/*
* look ahead to the next item and see if it is also
* from this directory and from this transaction
*/
ret = btrfs_next_leaf(root, path);
if (ret) {
if (ret == 1)
last_offset = (u64)-1;
else
err = ret;
goto done;
}
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); if (tmp.objectid != ino || tmp.type != key_type) {
last_offset = (u64)-1;
goto done;
}
if (btrfs_header_generation(path->nodes[0]) != trans->transid) { ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
&tmp);
if (ret)
err = ret;
else
last_offset = tmp.offset;
goto done;
}
}
done:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (err == 0) {
*last_offset_ret = last_offset;
/*
* insert the log range keys to indicate where the log
* is valid
*/
ret = insert_dir_log_key(trans, log, path, key_type,
ino, first_offset, last_offset);
if (ret)
err = ret;
}
return err;
}
/*
* logging directories is very similar to logging inodes, We find all the items
* from the current transaction and write them to the log.
*
* The recovery code scans the directory in the subvolume, and if it finds a
* key in the range logged that is not present in the log tree, then it means
* that dir entry was unlinked during the transaction.
*
* In order for that scan to work, we must include one key smaller than
* the smallest logged by this transaction and one key larger than the largest
* key logged by this transaction.
*/
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path,
struct btrfs_log_ctx *ctx)
{
u64 min_key;
u64 max_key;
int ret;
int key_type = BTRFS_DIR_ITEM_KEY;
again:
min_key = 0;
max_key = 0;
while (1) {
ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
ctx, min_key, &max_key);
if (ret)
return ret;
if (max_key == (u64)-1)
break;
min_key = max_key + 1;
}
if (key_type == BTRFS_DIR_ITEM_KEY) {
key_type = BTRFS_DIR_INDEX_KEY;
goto again;
}
return 0;
}
/*
* a helper function to drop items from the log before we relog an
* inode. max_key_type indicates the highest item type to remove.
* This cannot be run for file data extents because it does not
* free the extents they point to.
*/
static int drop_objectid_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
u64 objectid, int max_key_type)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
int start_slot;
key.objectid = objectid;
key.type = max_key_type;
key.offset = (u64)-1;
while (1) {
ret = btrfs_search_slot(trans, log, &key, path, -1, 1); BUG_ON(ret == 0); /* Logic error */ if (ret < 0)
break;
if (path->slots[0] == 0)
break;
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
if (found_key.objectid != objectid)
break;
found_key.offset = 0;
found_key.type = 0;
ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
if (ret < 0)
break;
ret = btrfs_del_items(trans, log, path, start_slot,
path->slots[0] - start_slot + 1);
/*
* If start slot isn't 0 then we don't need to re-search, we've
* found the last guy with the objectid in this tree.
*/
if (ret || start_slot != 0)
break;
btrfs_release_path(path);
}
btrfs_release_path(path);
if (ret > 0)
ret = 0;
return ret;
}
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
struct inode *inode, int log_inode_only,
u64 logged_isize)
{
struct btrfs_map_token token;
u64 flags;
btrfs_init_map_token(&token, leaf);
if (log_inode_only) {
/* set the generation to zero so the recover code
* can tell the difference between an logging
* just to say 'this inode exists' and a logging
* to say 'update this inode with these values'
*/
btrfs_set_token_inode_generation(&token, item, 0);
btrfs_set_token_inode_size(&token, item, logged_isize);
} else {
btrfs_set_token_inode_generation(&token, item,
BTRFS_I(inode)->generation);
btrfs_set_token_inode_size(&token, item, inode->i_size);
}
btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
btrfs_set_token_inode_mode(&token, item, inode->i_mode);
btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
btrfs_set_token_timespec_sec(&token, &item->atime,
inode->i_atime.tv_sec);
btrfs_set_token_timespec_nsec(&token, &item->atime,
inode->i_atime.tv_nsec);
btrfs_set_token_timespec_sec(&token, &item->mtime,
inode->i_mtime.tv_sec);
btrfs_set_token_timespec_nsec(&token, &item->mtime,
inode->i_mtime.tv_nsec);
btrfs_set_token_timespec_sec(&token, &item->ctime,
inode->i_ctime.tv_sec);
btrfs_set_token_timespec_nsec(&token, &item->ctime,
inode->i_ctime.tv_nsec);
/*
* We do not need to set the nbytes field, in fact during a fast fsync
* its value may not even be correct, since a fast fsync does not wait
* for ordered extent completion, which is where we update nbytes, it
* only waits for writeback to complete. During log replay as we find
* file extent items and replay them, we adjust the nbytes field of the
* inode item in subvolume tree as needed (see overwrite_item()).
*/
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
btrfs_set_token_inode_transid(&token, item, trans->transid);
btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
BTRFS_I(inode)->ro_flags);
btrfs_set_token_inode_flags(&token, item, flags);
btrfs_set_token_inode_block_group(&token, item, 0);
}
static int log_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_root *log, struct btrfs_path *path,
struct btrfs_inode *inode, bool inode_item_dropped)
{
struct btrfs_inode_item *inode_item;
int ret;
/*
* If we are doing a fast fsync and the inode was logged before in the
* current transaction, then we know the inode was previously logged and
* it exists in the log tree. For performance reasons, in this case use
* btrfs_search_slot() directly with ins_len set to 0 so that we never
* attempt a write lock on the leaf's parent, which adds unnecessary lock
* contention in case there are concurrent fsyncs for other inodes of the
* same subvolume. Using btrfs_insert_empty_item() when the inode item
* already exists can also result in unnecessarily splitting a leaf.
*/
if (!inode_item_dropped && inode->logged_trans == trans->transid) {
ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
ASSERT(ret <= 0);
if (ret > 0)
ret = -ENOENT;
} else {
/*
* This means it is the first fsync in the current transaction,
* so the inode item is not in the log and we need to insert it.
* We can never get -EEXIST because we are only called for a fast
* fsync and in case an inode eviction happens after the inode was
* logged before in the current transaction, when we load again
* the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
* flags and set ->logged_trans to 0.
*/
ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
sizeof(*inode_item));
ASSERT(ret != -EEXIST);
}
if (ret)
return ret;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
0, 0);
btrfs_release_path(path);
return 0;
}
static int log_csums(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_root *log_root,
struct btrfs_ordered_sum *sums)
{
const u64 lock_end = sums->bytenr + sums->len - 1;
struct extent_state *cached_state = NULL;
int ret;
/*
* If this inode was not used for reflink operations in the current
* transaction with new extents, then do the fast path, no need to
* worry about logging checksum items with overlapping ranges.
*/
if (inode->last_reflink_trans < trans->transid) return btrfs_csum_file_blocks(trans, log_root, sums);
/*
* Serialize logging for checksums. This is to avoid racing with the
* same checksum being logged by another task that is logging another
* file which happens to refer to the same extent as well. Such races
* can leave checksum items in the log with overlapping ranges.
*/
ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
lock_end, &cached_state);
if (ret)
return ret;
/*
* Due to extent cloning, we might have logged a csum item that covers a
* subrange of a cloned extent, and later we can end up logging a csum
* item for a larger subrange of the same extent or the entire range.
* This would leave csum items in the log tree that cover the same range
* and break the searches for checksums in the log tree, resulting in
* some checksums missing in the fs/subvolume tree. So just delete (or
* trim and adjust) any existing csum items in the log for this range.
*/
ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
if (!ret)
ret = btrfs_csum_file_blocks(trans, log_root, sums); unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
&cached_state);
return ret;
}
static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *dst_path,
struct btrfs_path *src_path,
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
unsigned long src_offset;
unsigned long dst_offset;
struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
struct btrfs_inode_item *inode_item;
struct extent_buffer *src = src_path->nodes[0];
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
char *ins_data;
int i;
struct list_head ordered_sums;
int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
INIT_LIST_HEAD(&ordered_sums);
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
nr * sizeof(u32), GFP_NOFS);
if (!ins_data)
return -ENOMEM;
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
for (i = 0; i < nr; i++) {
ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
}
ret = btrfs_insert_empty_items(trans, log, dst_path,
ins_keys, ins_sizes, nr);
if (ret) { kfree(ins_data); return ret;
}
for (i = 0; i < nr; i++, dst_path->slots[0]++) { dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
dst_path->slots[0]);
src_offset = btrfs_item_ptr_offset(src, start_slot + i);
if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
inode_item = btrfs_item_ptr(dst_path->nodes[0],
dst_path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, dst_path->nodes[0], inode_item,
&inode->vfs_inode,
inode_only == LOG_INODE_EXISTS,
logged_isize);
} else {
copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
src_offset, ins_sizes[i]);
}
/* take a reference on file data extents so that truncates
* or deletes of this inode don't have to relog the inode
* again
*/
if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
!skip_csum) {
int found_type;
extent = btrfs_item_ptr(src, start_slot + i,
struct btrfs_file_extent_item);
if (btrfs_file_extent_generation(src, extent) < trans->transid)
continue;
found_type = btrfs_file_extent_type(src, extent);
if (found_type == BTRFS_FILE_EXTENT_REG) {
u64 ds, dl, cs, cl;
ds = btrfs_file_extent_disk_bytenr(src,
extent);
/* ds == 0 is a hole */
if (ds == 0)
continue;
dl = btrfs_file_extent_disk_num_bytes(src,
extent);
cs = btrfs_file_extent_offset(src, extent);
cl = btrfs_file_extent_num_bytes(src,
extent);
if (btrfs_file_extent_compression(src,
extent)) {
cs = 0;
cl = dl;
}
ret = btrfs_lookup_csums_range(
fs_info->csum_root,
ds + cs, ds + cs + cl - 1,
&ordered_sums, 0);
if (ret)
break;
}
}
}
btrfs_mark_buffer_dirty(dst_path->nodes[0]);
btrfs_release_path(dst_path);
kfree(ins_data);
/*
* we have to do this after the loop above to avoid changing the
* log tree while trying to change the log tree.
*/
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
if (!ret)
ret = log_csums(trans, inode, log, sums);
list_del(&sums->list);
kfree(sums);
}
return ret;
}
static int extent_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
const struct extent_map *em1, *em2;
em1 = list_entry(a, struct extent_map, list);
em2 = list_entry(b, struct extent_map, list);
if (em1->start < em2->start)
return -1;
else if (em1->start > em2->start)
return 1;
return 0;
}
static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_root *log_root,
const struct extent_map *em,
struct btrfs_log_ctx *ctx)
{
struct btrfs_ordered_extent *ordered;
u64 csum_offset;
u64 csum_len;
u64 mod_start = em->mod_start;
u64 mod_len = em->mod_len;
LIST_HEAD(ordered_sums);
int ret = 0;
if (inode->flags & BTRFS_INODE_NODATASUM || test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || em->block_start == EXTENT_MAP_HOLE)
return 0;
list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
const u64 mod_end = mod_start + mod_len;
struct btrfs_ordered_sum *sums;
if (mod_len == 0)
break;
if (ordered_end <= mod_start)
continue;
if (mod_end <= ordered->file_offset)
break;
/*
* We are going to copy all the csums on this ordered extent, so
* go ahead and adjust mod_start and mod_len in case this ordered
* extent has already been logged.
*/
if (ordered->file_offset > mod_start) { if (ordered_end >= mod_end) mod_len = ordered->file_offset - mod_start;
/*
* If we have this case
*
* |--------- logged extent ---------|
* |----- ordered extent ----|
*
* Just don't mess with mod_start and mod_len, we'll
* just end up logging more csums than we need and it
* will be ok.
*/
} else {
if (ordered_end < mod_end) { mod_len = mod_end - ordered_end;
mod_start = ordered_end;
} else {
mod_len = 0;
}
}
/*
* To keep us from looping for the above case of an ordered
* extent that falls inside of the logged extent.
*/
if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
continue;
list_for_each_entry(sums, &ordered->list, list) { ret = log_csums(trans, inode, log_root, sums); if (ret)
return ret;
}
}
/* We're done, found all csums in the ordered extents. */
if (mod_len == 0)
return 0;
/* If we're compressed we have to save the entire range of csums. */
if (em->compress_type) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
csum_offset = mod_start - em->start; csum_len = mod_len;
}
/* block start is already adjusted for the file extent offset. */
ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
em->block_start + csum_offset,
em->block_start + csum_offset +
csum_len - 1, &ordered_sums, 0);
if (ret)
return ret;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
if (!ret)
ret = log_csums(trans, inode, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
return ret;
}
static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_root *root,
const struct extent_map *em,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *log = root->log_root;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
struct btrfs_map_token token;
struct btrfs_key key;
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
drop_args.path = path;
drop_args.start = em->start;
drop_args.end = em->start + em->len;
drop_args.replace_extent = true;
drop_args.extent_item_size = sizeof(*fi);
ret = btrfs_drop_extents(trans, log, inode, &drop_args);
if (ret)
return ret;
if (!drop_args.extent_inserted) { key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = em->start;
ret = btrfs_insert_empty_item(trans, log, path, &key,
sizeof(*fi));
if (ret)
return ret;
}
leaf = path->nodes[0];
btrfs_init_map_token(&token, leaf);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
btrfs_set_token_file_extent_type(&token, fi,
BTRFS_FILE_EXTENT_PREALLOC);
else
btrfs_set_token_file_extent_type(&token, fi,
BTRFS_FILE_EXTENT_REG);
block_len = max(em->block_len, em->orig_block_len);
if (em->compress_type != BTRFS_COMPRESS_NONE) {
btrfs_set_token_file_extent_disk_bytenr(&token, fi,
em->block_start);
btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
} else if (em->block_start < EXTENT_MAP_LAST_BYTE) { btrfs_set_token_file_extent_disk_bytenr(&token, fi,
em->block_start -
extent_offset);
btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
} else {
btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
}
btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
btrfs_set_token_file_extent_encryption(&token, fi, 0);
btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
return ret;
}
/*
* Log all prealloc extents beyond the inode's i_size to make sure we do not
* lose them after doing a full/fast fsync and replaying the log. We scan the
* subvolume's root instead of iterating the inode's extent map tree because
* otherwise we can log incorrect extent items based on extent map conversion.
* That can happen due to the fact that extent maps are merged when they
* are not in the extent map tree's list of modified extents.
*/
static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path)
{
struct btrfs_root *root = inode->root;
struct btrfs_key key;
const u64 i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
struct btrfs_path *dst_path = NULL;
bool dropped_extents = false;
u64 truncate_offset = i_size;
struct extent_buffer *leaf;
int slot;
int ins_nr = 0;
int start_slot;
int ret;
if (!(inode->flags & BTRFS_INODE_PREALLOC))
return 0;
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = i_size;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
/*
* We must check if there is a prealloc extent that starts before the
* i_size and crosses the i_size boundary. This is to ensure later we
* truncate down to the end of that extent and not to the i_size, as
* otherwise we end up losing part of the prealloc extent after a log
* replay and with an implicit hole if there is another prealloc extent
* that starts at an offset beyond i_size.
*/
ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
if (ret < 0)
goto out;
if (ret == 0) {
struct btrfs_file_extent_item *ei;
leaf = path->nodes[0];
slot = path->slots[0];
ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, ei) ==
BTRFS_FILE_EXTENT_PREALLOC) {
u64 extent_end;
btrfs_item_key_to_cpu(leaf, &key, slot);
extent_end = key.offset +
btrfs_file_extent_num_bytes(leaf, ei);
if (extent_end > i_size)
truncate_offset = extent_end;
}
} else {
ret = 0;
}
while (true) {
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path,
start_slot, ins_nr, 1, 0);
if (ret < 0)
goto out;
ins_nr = 0;
}
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
if (ret > 0) {
ret = 0;
break;
}
continue;
}
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid > ino)
break;
if (WARN_ON_ONCE(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY || key.offset < i_size) { path->slots[0]++;
continue;
}
if (!dropped_extents) {
/*
* Avoid logging extent items logged in past fsync calls
* and leading to duplicate keys in the log tree.
*/
do {
ret = btrfs_truncate_inode_items(trans,
root->log_root,
inode, truncate_offset,
BTRFS_EXTENT_DATA_KEY,
NULL);
} while (ret == -EAGAIN);
if (ret)
goto out;
dropped_extents = true;
}
if (ins_nr == 0)
start_slot = slot;
ins_nr++;
path->slots[0]++;
if (!dst_path) {
dst_path = btrfs_alloc_path(); if (!dst_path) {
ret = -ENOMEM;
goto out;
}
}
}
if (ins_nr > 0) ret = copy_items(trans, inode, dst_path, path,
start_slot, ins_nr, 1, 0);
out:
btrfs_release_path(path);
btrfs_free_path(dst_path);
return ret;
}
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
{
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_extent *tmp;
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
int ret = 0;
int num = 0;
INIT_LIST_HEAD(&extents);
write_lock(&tree->lock);
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
list_del_init(&em->list);
/*
* Just an arbitrary number, this can be really CPU intensive
* once we start getting a lot of extents, and really once we
* have a bunch of extents we just want to commit since it will
* be faster.
*/
if (++num > 32768) {
list_del_init(&tree->modified_extents);
ret = -EFBIG;
goto process;
}
if (em->generation < trans->transid)
continue;
/* We log prealloc extents beyond eof later. */
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && em->start >= i_size_read(&inode->vfs_inode))
continue;
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(&em->refs);
set_bit(EXTENT_FLAG_LOGGING, &em->flags);
list_add_tail(&em->list, &extents);
num++;
}
list_sort(NULL, &extents, extent_cmp);
process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
list_del_init(&em->list);
/*
* If we had an error we just need to delete everybody from our
* private list.
*/
if (ret) {
clear_em_logging(tree, em);
free_extent_map(em);
continue;
}
write_unlock(&tree->lock);
ret = log_one_extent(trans, inode, root, em, path, ctx);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
btrfs_release_path(path);
if (!ret)
ret = btrfs_log_prealloc_extents(trans, inode, path);
if (ret)
return ret;
/*
* We have logged all extents successfully, now make sure the commit of
* the current transaction waits for the ordered extents to complete
* before it commits and wipes out the log trees, otherwise we would
* lose data if an ordered extents completes after the transaction
* commits and a power failure happens after the transaction commit.
*/
list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
list_del_init(&ordered->log_list);
set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
spin_lock_irq(&inode->ordered_tree.lock);
if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
atomic_inc(&trans->transaction->pending_ordered);
}
spin_unlock_irq(&inode->ordered_tree.lock);
}
btrfs_put_ordered_extent(ordered);
}
return 0;
}
static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
struct btrfs_path *path, u64 *size_ret)
{
struct btrfs_key key;
int ret;
key.objectid = btrfs_ino(inode);
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
if (ret < 0) {
return ret;
} else if (ret > 0) {
*size_ret = 0;
} else {
struct btrfs_inode_item *item;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
*size_ret = btrfs_inode_size(path->nodes[0], item);
/*
* If the in-memory inode's i_size is smaller then the inode
* size stored in the btree, return the inode's i_size, so
* that we get a correct inode size after replaying the log
* when before a power failure we had a shrinking truncate
* followed by addition of a new name (rename / new hard link).
* Otherwise return the inode size from the btree, to avoid
* data loss when replaying a log due to previously doing a
* write that expands the inode's size and logging a new name
* immediately after.
*/
if (*size_ret > inode->vfs_inode.i_size)
*size_ret = inode->vfs_inode.i_size;
}
btrfs_release_path(path);
return 0;
}
/*
* At the moment we always log all xattrs. This is to figure out at log replay
* time which xattrs must have their deletion replayed. If a xattr is missing
* in the log tree and exists in the fs/subvol tree, we delete it. This is
* because if a xattr is deleted, the inode is fsynced and a power failure
* happens, causing the log to be replayed the next time the fs is mounted,
* we want the xattr to not exist anymore (same behaviour as other filesystems
* with a journal, ext3/4, xfs, f2fs, etc).
*/
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path)
{
int ret;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
int ins_nr = 0;
int start_slot = 0;
bool found_xattrs = false;
if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
return 0;
key.objectid = ino;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ret;
while (true) {
int slot = path->slots[0];
struct extent_buffer *leaf = path->nodes[0];
int nritems = btrfs_header_nritems(leaf);
if (slot >= nritems) {
if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path,
start_slot, ins_nr, 1, 0);
if (ret < 0)
return ret;
ins_nr = 0;
}
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ret;
else if (ret > 0)
break;
continue;
}
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
break;
if (ins_nr == 0)
start_slot = slot;
ins_nr++;
path->slots[0]++;
found_xattrs = true;
cond_resched();
}
if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path,
start_slot, ins_nr, 1, 0);
if (ret < 0)
return ret;
}
if (!found_xattrs)
set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
return 0;
}
/*
* When using the NO_HOLES feature if we punched a hole that causes the
* deletion of entire leafs or all the extent items of the first leaf (the one
* that contains the inode item and references) we may end up not processing
* any extents, because there are no leafs with a generation matching the
* current transaction that have extent items for our inode. So we need to find
* if any holes exist and then log them. We also need to log holes after any
* truncate operation that changes the inode's size.
*/
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
const u64 i_size = i_size_read(&inode->vfs_inode);
u64 prev_extent_end = 0;
int ret;
if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
return 0;
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ret;
while (true) {
struct extent_buffer *leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ret;
if (ret > 0) {
ret = 0;
break;
}
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
break;
/* We have a hole, log it. */
if (prev_extent_end < key.offset) { const u64 hole_len = key.offset - prev_extent_end;
/*
* Release the path to avoid deadlocks with other code
* paths that search the root while holding locks on
* leafs from the log root.
*/
btrfs_release_path(path);
ret = btrfs_insert_file_extent(trans, root->log_root,
ino, prev_extent_end, 0,
0, hole_len, 0, hole_len,
0, 0, 0);
if (ret < 0)
return ret;
/*
* Search for the same key again in the root. Since it's
* an extent item and we are holding the inode lock, the
* key must still exist. If it doesn't just emit warning
* and return an error to fall back to a transaction
* commit.
*/
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ret;
if (WARN_ON(ret > 0))
return -ENOENT;
leaf = path->nodes[0];
}
prev_extent_end = btrfs_file_extent_end(path);
path->slots[0]++;
cond_resched();
}
if (prev_extent_end < i_size) {
u64 hole_len;
btrfs_release_path(path);
hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
ret = btrfs_insert_file_extent(trans, root->log_root,
ino, prev_extent_end, 0, 0,
hole_len, 0, hole_len,
0, 0, 0);
if (ret < 0)
return ret;
}
return 0;
}
/*
* When we are logging a new inode X, check if it doesn't have a reference that
* matches the reference from some other inode Y created in a past transaction
* and that was renamed in the current transaction. If we don't do this, then at
* log replay time we can lose inode Y (and all its files if it's a directory):
*
* mkdir /mnt/x
* echo "hello world" > /mnt/x/foobar
* sync
* mv /mnt/x /mnt/y
* mkdir /mnt/x # or touch /mnt/x
* xfs_io -c fsync /mnt/x
* <power fail>
* mount fs, trigger log replay
*
* After the log replay procedure, we would lose the first directory and all its
* files (file foobar).
* For the case where inode Y is not a directory we simply end up losing it:
*
* echo "123" > /mnt/foo
* sync
* mv /mnt/foo /mnt/bar
* echo "abc" > /mnt/foo
* xfs_io -c fsync /mnt/foo
* <power fail>
*
* We also need this for cases where a snapshot entry is replaced by some other
* entry (file or directory) otherwise we end up with an unreplayable log due to
* attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
* if it were a regular entry:
*
* mkdir /mnt/x
* btrfs subvolume snapshot /mnt /mnt/x/snap
* btrfs subvolume delete /mnt/x/snap
* rmdir /mnt/x
* mkdir /mnt/x
* fsync /mnt/x or fsync some new file inside it
* <power fail>
*
* The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
* the same transaction.
*/
static int btrfs_check_ref_name_override(struct extent_buffer *eb,
const int slot,
const struct btrfs_key *key,
struct btrfs_inode *inode,
u64 *other_ino, u64 *other_parent)
{
int ret;
struct btrfs_path *search_path;
char *name = NULL;
u32 name_len = 0;
u32 item_size = btrfs_item_size_nr(eb, slot);
u32 cur_offset = 0;
unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
search_path = btrfs_alloc_path();
if (!search_path)
return -ENOMEM;
search_path->search_commit_root = 1;
search_path->skip_locking = 1;
while (cur_offset < item_size) {
u64 parent;
u32 this_name_len;
u32 this_len;
unsigned long name_ptr;
struct btrfs_dir_item *di;
if (key->type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *iref;
iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
parent = key->offset;
this_name_len = btrfs_inode_ref_name_len(eb, iref);
name_ptr = (unsigned long)(iref + 1);
this_len = sizeof(*iref) + this_name_len;
} else {
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)(ptr +
cur_offset);
parent = btrfs_inode_extref_parent(eb, extref);
this_name_len = btrfs_inode_extref_name_len(eb, extref);
name_ptr = (unsigned long)&extref->name;
this_len = sizeof(*extref) + this_name_len;
}
if (this_name_len > name_len) {
char *new_name;
new_name = krealloc(name, this_name_len, GFP_NOFS);
if (!new_name) {
ret = -ENOMEM;
goto out;
}
name_len = this_name_len;
name = new_name;
}
read_extent_buffer(eb, name, name_ptr, this_name_len);
di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
parent, name, this_name_len, 0);
if (di && !IS_ERR(di)) {
struct btrfs_key di_key;
btrfs_dir_item_key_to_cpu(search_path->nodes[0],
di, &di_key);
if (di_key.type == BTRFS_INODE_ITEM_KEY) {
if (di_key.objectid != key->objectid) {
ret = 1;
*other_ino = di_key.objectid;
*other_parent = parent;
} else {
ret = 0;
}
} else {
ret = -EAGAIN;
}
goto out;
} else if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
}
btrfs_release_path(search_path);
cur_offset += this_len;
}
ret = 0;
out:
btrfs_free_path(search_path);
kfree(name);
return ret;
}
struct btrfs_ino_list {
u64 ino;
u64 parent;
struct list_head list;
};
static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx,
u64 ino, u64 parent)
{
struct btrfs_ino_list *ino_elem;
LIST_HEAD(inode_list);
int ret = 0;
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
return -ENOMEM; ino_elem->ino = ino;
ino_elem->parent = parent;
list_add_tail(&ino_elem->list, &inode_list);
while (!list_empty(&inode_list)) {
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct inode *inode;
ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
list);
ino = ino_elem->ino;
parent = ino_elem->parent;
list_del(&ino_elem->list);
kfree(ino_elem);
if (ret)
continue; btrfs_release_path(path);
inode = btrfs_iget(fs_info->sb, ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
* directory.
*/
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
if (ret == -ENOENT) {
inode = btrfs_iget(fs_info->sb, parent, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
} else {
ret = btrfs_log_inode(trans, root,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
ctx);
btrfs_add_delayed_iput(inode);
}
}
continue;
}
/*
* If the inode was already logged skip it - otherwise we can
* hit an infinite loop. Example:
*
* From the commit root (previous transaction) we have the
* following inodes:
*
* inode 257 a directory
* inode 258 with references "zz" and "zz_link" on inode 257
* inode 259 with reference "a" on inode 257
*
* And in the current (uncommitted) transaction we have:
*
* inode 257 a directory, unchanged
* inode 258 with references "a" and "a2" on inode 257
* inode 259 with reference "zz_link" on inode 257
* inode 261 with reference "zz" on inode 257
*
* When logging inode 261 the following infinite loop could
* happen if we don't skip already logged inodes:
*
* - we detect inode 258 as a conflicting inode, with inode 261
* on reference "zz", and log it;
*
* - we detect inode 259 as a conflicting inode, with inode 258
* on reference "a", and log it;
*
* - we detect inode 258 as a conflicting inode, with inode 259
* on reference "zz_link", and log it - again! After this we
* repeat the above steps forever.
*/
spin_lock(&BTRFS_I(inode)->lock);
/*
* Check the inode's logged_trans only instead of
* btrfs_inode_in_log(). This is because the last_log_commit of
* the inode is not updated when we only log that it exists (see
* btrfs_log_inode()).
*/
if (BTRFS_I(inode)->logged_trans == trans->transid) {
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_add_delayed_iput(inode);
continue;
}
spin_unlock(&BTRFS_I(inode)->lock);
/*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
* are safe against concurrent renames of the other inode as
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
LOG_OTHER_INODE, ctx);
if (ret) {
btrfs_add_delayed_iput(inode);
continue;
}
key.objectid = ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) { btrfs_add_delayed_iput(inode);
continue;
}
while (true) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
u64 other_ino = 0;
u64 other_parent = 0;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
break;
} else if (ret > 0) {
ret = 0;
break;
}
continue;
}
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != ino ||
(key.type != BTRFS_INODE_REF_KEY &&
key.type != BTRFS_INODE_EXTREF_KEY)) {
ret = 0;
break;
}
ret = btrfs_check_ref_name_override(leaf, slot, &key,
BTRFS_I(inode), &other_ino,
&other_parent);
if (ret < 0)
break;
if (ret > 0) {
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem) {
ret = -ENOMEM;
break;
}
ino_elem->ino = other_ino;
ino_elem->parent = other_parent;
list_add_tail(&ino_elem->list, &inode_list);
ret = 0;
}
path->slots[0]++;
}
btrfs_add_delayed_iput(inode);
}
return ret;
}
static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_key *min_key,
const struct btrfs_key *max_key,
struct btrfs_path *path,
struct btrfs_path *dst_path,
const u64 logged_isize,
const bool recursive_logging,
const int inode_only,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
const u64 i_size = i_size_read(&inode->vfs_inode);
struct btrfs_root *root = inode->root;
int ins_start_slot = 0;
int ins_nr = 0;
int ret;
while (1) {
ret = btrfs_search_forward(root, min_key, path, trans->transid);
if (ret < 0)
return ret;
if (ret > 0) {
ret = 0;
break;
}
again:
/* Note, ins_nr might be > 0 here, cleanup outside the loop */
if (min_key->objectid != max_key->objectid)
break;
if (min_key->type > max_key->type)
break;
if (min_key->type == BTRFS_INODE_ITEM_KEY) {
*need_log_inode_item = false;
} else if (min_key->type == BTRFS_EXTENT_DATA_KEY && min_key->offset >= i_size) {
/*
* Extents at and beyond eof are logged with
* btrfs_log_prealloc_extents().
* Only regular files have BTRFS_EXTENT_DATA_KEY keys,
* and no keys greater than that, so bail out.
*/
break;
} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
min_key->type == BTRFS_INODE_EXTREF_KEY) &&
inode->generation == trans->transid &&
!recursive_logging) {
u64 other_ino = 0;
u64 other_parent = 0;
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0], min_key, inode,
&other_ino, &other_parent);
if (ret < 0) {
return ret; } else if (ret > 0 && ctx && other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { if (ins_nr > 0) { ins_nr++;
} else {
ins_nr = 1;
ins_start_slot = path->slots[0];
}
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr,
inode_only, logged_isize);
if (ret < 0)
return ret;
ins_nr = 0;
ret = log_conflicting_inodes(trans, root, path,
ctx, other_ino, other_parent);
if (ret)
return ret;
btrfs_release_path(path);
goto next_key;
}
} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
/* Skip xattrs, logged later with btrfs_log_all_xattrs() */
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret < 0)
return ret;
ins_nr = 0;
goto next_slot;
}
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++;
goto next_slot;
} else if (!ins_nr) {
ins_start_slot = path->slots[0];
ins_nr = 1;
goto next_slot;
}
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret < 0)
return ret;
ins_nr = 1;
ins_start_slot = path->slots[0];
next_slot:
path->slots[0]++;
if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
btrfs_item_key_to_cpu(path->nodes[0], min_key,
path->slots[0]);
goto again;
}
if (ins_nr) { ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
logged_isize);
if (ret < 0)
return ret;
ins_nr = 0;
}
btrfs_release_path(path);
next_key:
if (min_key->offset < (u64)-1) { min_key->offset++; } else if (min_key->type < max_key->type) { min_key->type++;
min_key->offset = 0;
} else {
break;
}
}
if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret)
return ret;
}
if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
/*
* Release the path because otherwise we might attempt to double
* lock the same leaf with btrfs_log_prealloc_extents() below.
*/
btrfs_release_path(path);
ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
}
return ret;
}
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
*
* Any items from this inode changed by the current transaction are copied
* to the log tree. An extra reference is taken on any extents in this
* file, allowing us to avoid a whole pile of corner cases around logging
* blocks that have been removed from the tree.
*
* See LOG_INODE_ALL and related defines for a description of what inode_only
* does.
*
* This handles both files and directories.
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
int err = 0;
int ret = 0;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &inode->extent_tree;
u64 logged_isize = 0;
bool need_log_inode_item = true;
bool xattrs_logged = false;
bool recursive_logging = false;
bool inode_item_dropped = true;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
dst_path = btrfs_alloc_path();
if (!dst_path) {
btrfs_free_path(path); return -ENOMEM;
}
min_key.objectid = ino;
min_key.type = BTRFS_INODE_ITEM_KEY;
min_key.offset = 0;
max_key.objectid = ino;
/* today the code can only do partial logging of directories */
if (S_ISDIR(inode->vfs_inode.i_mode) ||
(!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags) &&
inode_only >= LOG_INODE_EXISTS))
max_key.type = BTRFS_XATTR_ITEM_KEY;
else
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
/*
* Only run delayed items if we are a directory. We want to make sure
* all directory indexes hit the fs/subvolume tree so we can find them
* and figure out which index ranges have to be logged.
*
* Otherwise commit the delayed inode only if the full sync flag is set,
* as we want to make sure an up to date version is in the subvolume
* tree so copy_inode_items_to_log() / copy_items() can find it and copy
* it to the log tree. For a non full sync, we always log the inode item
* based on the in-memory struct btrfs_inode which is always up to date.
*/
if (S_ISDIR(inode->vfs_inode.i_mode)) ret = btrfs_commit_inode_delayed_items(trans, inode);
else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
ret = btrfs_commit_inode_delayed_inode(inode); if (ret) { btrfs_free_path(path);
btrfs_free_path(dst_path);
return ret;
}
if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
recursive_logging = true;
if (inode_only == LOG_OTHER_INODE)
inode_only = LOG_INODE_EXISTS;
else
inode_only = LOG_INODE_ALL;
mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
} else {
mutex_lock(&inode->log_mutex);
}
/*
* This is for cases where logging a directory could result in losing a
* a file after replaying the log. For example, if we move a file from a
* directory A to a directory B, then fsync directory A, we have no way
* to known the file was moved from A to B, so logging just A would
* result in losing the file after a log replay.
*/
if (S_ISDIR(inode->vfs_inode.i_mode) &&
inode_only == LOG_INODE_ALL &&
inode->last_unlink_trans >= trans->transid) { btrfs_set_log_full_commit(trans);
err = 1;
goto out_unlock;
}
/*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
*/
if (S_ISDIR(inode->vfs_inode.i_mode)) {
int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino, max_key_type);
} else {
if (inode_only == LOG_INODE_EXISTS) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
* This is necessary to prevent data loss after log
* replay, and also to prevent doing a wrong expanding
* truncate - for e.g. create file, write 4K into offset
* 0, fsync, write 4K into offset 4096, add hard link,
* fsync some other file (to sync log), power fail - if
* we use the inode's current i_size, after log replay
* we get a 8Kb file, with the last 4Kb extent as a hole
* (zeroes), as if an expanding truncate happened,
* instead of getting a file of 4Kb only.
*/
err = logged_inode_size(log, inode, path, &logged_isize);
if (err)
goto out_unlock;
}
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
max_key.type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino,
max_key.type);
} else {
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags);
while(1) {
ret = btrfs_truncate_inode_items(trans,
log, inode, 0, 0, NULL);
if (ret != -EAGAIN)
break;
}
}
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags) ||
inode_only == LOG_INODE_EXISTS) {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino,
max_key.type);
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
inode_item_dropped = false;
goto log_extents;
}
}
if (ret) {
err = ret;
goto out_unlock;
}
err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
recursive_logging, inode_only, ctx,
&need_log_inode_item);
if (err)
goto out_unlock;
btrfs_release_path(path);
btrfs_release_path(dst_path);
err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
if (err)
goto out_unlock;
xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path);
btrfs_release_path(dst_path);
err = btrfs_log_holes(trans, root, inode, path);
if (err)
goto out_unlock;
}
log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (need_log_inode_item) {
err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
if (err)
goto out_unlock;
/*
* If we are doing a fast fsync and the inode was logged before
* in this transaction, we don't need to log the xattrs because
* they were logged before. If xattrs were added, changed or
* deleted since the last time we logged the inode, then we have
* already logged them because the inode had the runtime flag
* BTRFS_INODE_COPY_EVERYTHING set.
*/
if (!xattrs_logged && inode->logged_trans < trans->transid) { err = btrfs_log_all_xattrs(trans, root, inode, path,
dst_path);
if (err)
goto out_unlock;
btrfs_release_path(path);
}
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
ctx);
if (ret) {
err = ret;
goto out_unlock;
}
} else if (inode_only == LOG_INODE_ALL) {
struct extent_map *em, *n;
write_lock(&em_tree->lock);
list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
list_del_init(&em->list);
write_unlock(&em_tree->lock);
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { ret = log_directory_changes(trans, root, inode, path, dst_path,
ctx);
if (ret) {
err = ret;
goto out_unlock;
}
}
/*
* If we are logging that an ancestor inode exists as part of logging a
* new name from a link or rename operation, don't mark the inode as
* logged - otherwise if an explicit fsync is made against an ancestor,
* the fsync considers the inode in the log and doesn't sync the log,
* resulting in the ancestor missing after a power failure unless the
* log was synced as part of an fsync against any other unrelated inode.
* So keep it simple for this case and just don't flag the ancestors as
* logged.
*/
if (!ctx || !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && &inode->vfs_inode != ctx->inode)) {
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
/*
* Don't update last_log_commit if we logged that an inode exists.
* We do this for two reasons:
*
* 1) We might have had buffered writes to this inode that were
* flushed and had their ordered extents completed in this
* transaction, but we did not previously log the inode with
* LOG_INODE_ALL. Later the inode was evicted and after that
* it was loaded again and this LOG_INODE_EXISTS log operation
* happened. We must make sure that if an explicit fsync against
* the inode is performed later, it logs the new extents, an
* updated inode item, etc, and syncs the log. The same logic
* applies to direct IO writes instead of buffered writes.
*
* 2) When we log the inode with LOG_INODE_EXISTS, its inode item
* is logged with an i_size of 0 or whatever value was logged
* before. If later the i_size of the inode is increased by a
* truncate operation, the log is synced through an fsync of
* some other inode and then finally an explicit fsync against
* this inode is made, we must make sure this fsync logs the
* inode with the new i_size, the hole between old i_size and
* the new i_size, and syncs the log.
*/
if (inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
}
out_unlock:
mutex_unlock(&inode->log_mutex);
btrfs_free_path(path);
btrfs_free_path(dst_path);
return err;
}
/*
* Check if we need to log an inode. This is used in contexts where while
* logging an inode we need to log another inode (either that it exists or in
* full mode). This is used instead of btrfs_inode_in_log() because the later
* requires the inode to be in the log and have the log transaction committed,
* while here we do not care if the log transaction was already committed - our
* caller will commit the log later - and we want to avoid logging an inode
* multiple times when multiple tasks have joined the same log transaction.
*/
static bool need_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
/*
* If a directory was not modified, no dentries added or removed, we can
* and should avoid logging it.
*/
if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
return false;
/*
* If this inode does not have new/updated/deleted xattrs since the last
* time it was logged and is flagged as logged in the current transaction,
* we can skip logging it. As for new/deleted names, those are updated in
* the log by link/unlink/rename operations.
* In case the inode was logged and then evicted and reloaded, its
* logged_trans will be 0, in which case we have to fully log it since
* logged_trans is a transient field, not persisted.
*/
if (inode->logged_trans == trans->transid && !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
return false;
return true;
}
struct btrfs_dir_list {
u64 ino;
struct list_head list;
};
/*
* Log the inodes of the new dentries of a directory. See log_dir_items() for
* details about the why it is needed.
* This is a recursive operation - if an existing dentry corresponds to a
* directory, that directory's new entries are logged too (same behaviour as
* ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
* the dentries point to we do not lock their i_mutex, otherwise lockdep
* complains about the following circular lock dependency / possible deadlock:
*
* CPU0 CPU1
* ---- ----
* lock(&type->i_mutex_dir_key#3/2);
* lock(sb_internal#2);
* lock(&type->i_mutex_dir_key#3/2);
* lock(&sb->s_type->i_mutex_key#14);
*
* Where sb_internal is the lock (a counter that works as a lock) acquired by
* sb_start_intwrite() in btrfs_start_transaction().
* Not locking i_mutex of the inodes is still safe because:
*
* 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
* that while logging the inode new references (names) are added or removed
* from the inode, leaving the logged inode item with a link count that does
* not match the number of logged inode reference items. This is fine because
* at log replay time we compute the real number of links and correct the
* link count in the inode item (see replay_one_buffer() and
* link_to_fixup_dir());
*
* 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
* while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
* BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
* has a size that doesn't match the sum of the lengths of all the logged
* names. This does not result in a problem because if a dir_item key is
* logged but its matching dir_index key is not logged, at log replay time we
* don't use it to replay the respective name (see replay_one_name()). On the
* other hand if only the dir_index key ends up being logged, the respective
* name is added to the fs/subvol tree with both the dir_item and dir_index
* keys created (see replay_one_name()).
* The directory's inode item with a wrong i_size is not a problem as well,
* since we don't use it at log replay time to set the i_size in the inode
* item of the fs/subvol tree (see overwrite_item()).
*/
static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *start_inode,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
struct btrfs_path *path;
LIST_HEAD(dir_list);
struct btrfs_dir_list *dir_elem;
int ret = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
if (!dir_elem) {
btrfs_free_path(path);
return -ENOMEM;
}
dir_elem->ino = btrfs_ino(start_inode);
list_add_tail(&dir_elem->list, &dir_list);
while (!list_empty(&dir_list)) {
struct extent_buffer *leaf;
struct btrfs_key min_key;
int nritems;
int i;
dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
list);
if (ret)
goto next_dir_inode;
min_key.objectid = dir_elem->ino;
min_key.type = BTRFS_DIR_ITEM_KEY;
min_key.offset = 0;
again:
btrfs_release_path(path);
ret = btrfs_search_forward(log, &min_key, path, trans->transid);
if (ret < 0) {
goto next_dir_inode;
} else if (ret > 0) {
ret = 0;
goto next_dir_inode;
}
process_leaf:
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
for (i = path->slots[0]; i < nritems; i++) {
struct btrfs_dir_item *di;
struct btrfs_key di_key;
struct inode *di_inode;
struct btrfs_dir_list *new_dir_elem;
int log_mode = LOG_INODE_EXISTS;
int type;
btrfs_item_key_to_cpu(leaf, &min_key, i);
if (min_key.objectid != dir_elem->ino ||
min_key.type != BTRFS_DIR_ITEM_KEY) goto next_dir_inode;
di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
type = btrfs_dir_type(leaf, di);
if (btrfs_dir_transid(leaf, di) < trans->transid &&
type != BTRFS_FT_DIR)
continue;
btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
if (di_key.type == BTRFS_ROOT_ITEM_KEY)
continue;
btrfs_release_path(path);
di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
goto next_dir_inode;
}
if (!need_log_inode(trans, BTRFS_I(di_inode))) {
btrfs_add_delayed_iput(di_inode); break;
}
ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
log_mode, ctx);
btrfs_add_delayed_iput(di_inode);
if (ret)
goto next_dir_inode;
if (ctx->log_new_dentries) {
new_dir_elem = kmalloc(sizeof(*new_dir_elem),
GFP_NOFS);
if (!new_dir_elem) {
ret = -ENOMEM;
goto next_dir_inode;
}
new_dir_elem->ino = di_key.objectid;
list_add_tail(&new_dir_elem->list, &dir_list);
}
break;
}
if (i == nritems) {
ret = btrfs_next_leaf(log, path);
if (ret < 0) {
goto next_dir_inode;
} else if (ret > 0) {
ret = 0;
goto next_dir_inode;
}
goto process_leaf;
}
if (min_key.offset < (u64)-1) { min_key.offset++;
goto again;
}
next_dir_inode:
list_del(&dir_elem->list);
kfree(dir_elem);
}
btrfs_free_path(path); return ret;
}
static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->skip_locking = 1;
path->search_commit_root = 1;
key.objectid = ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
while (true) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
u32 cur_offset = 0;
u32 item_size;
unsigned long ptr;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
else if (ret > 0)
break;
continue;
}
btrfs_item_key_to_cpu(leaf, &key, slot);
/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
break;
item_size = btrfs_item_size_nr(leaf, slot);
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
struct btrfs_key inode_key;
struct inode *dir_inode;
inode_key.type = BTRFS_INODE_ITEM_KEY;
inode_key.offset = 0;
if (key.type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)
(ptr + cur_offset);
inode_key.objectid = btrfs_inode_extref_parent(
leaf, extref);
cur_offset += sizeof(*extref);
cur_offset += btrfs_inode_extref_name_len(leaf,
extref);
} else {
inode_key.objectid = key.offset;
cur_offset = item_size;
}
dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
root);
/*
* If the parent inode was deleted, return an error to
* fallback to a transaction commit. This is to prevent
* getting an inode that was moved from one parent A to
* a parent B, got its former parent A deleted and then
* it got fsync'ed, from existing at both parents after
* a log replay (and the old parent still existing).
* Example:
*
* mkdir /mnt/A
* mkdir /mnt/B
* touch /mnt/B/bar
* sync
* mv /mnt/B/bar /mnt/A/bar
* mv -T /mnt/A /mnt/B
* fsync /mnt/B/bar
* <power fail>
*
* If we ignore the old parent B which got deleted,
* after a log replay we would have file bar linked
* at both parents and the old parent B would still
* exist.
*/
if (IS_ERR(dir_inode)) {
ret = PTR_ERR(dir_inode);
goto out;
}
if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
btrfs_add_delayed_iput(dir_inode);
continue;
}
if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
LOG_INODE_ALL, ctx);
if (!ret && ctx && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
btrfs_add_delayed_iput(dir_inode);
if (ret)
goto out;
}
path->slots[0]++;
}
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
static int log_new_ancestors(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
{
struct btrfs_key found_key;
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
while (true) {
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
struct btrfs_key search_key;
struct inode *inode;
u64 ino;
int ret = 0;
btrfs_release_path(path);
ino = found_key.offset;
search_key.objectid = found_key.offset;
search_key.type = BTRFS_INODE_ITEM_KEY;
search_key.offset = 0;
inode = btrfs_iget(fs_info->sb, ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode); if (BTRFS_I(inode)->generation >= trans->transid &&
need_log_inode(trans, BTRFS_I(inode)))
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
return ret;
if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
break;
search_key.type = BTRFS_INODE_REF_KEY;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
return ret;
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ret;
else if (ret > 0)
return -ENOENT;
leaf = path->nodes[0];
slot = path->slots[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != search_key.objectid || found_key.type != BTRFS_INODE_REF_KEY)
return -ENOENT;
}
return 0;
}
static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
struct dentry *old_parent = NULL;
struct super_block *sb = inode->vfs_inode.i_sb;
int ret = 0;
while (true) {
if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
break;
inode = BTRFS_I(d_inode(parent));
if (root != inode->root)
break;
if (inode->generation >= trans->transid &&
need_log_inode(trans, inode)) {
ret = btrfs_log_inode(trans, root, inode,
LOG_INODE_EXISTS, ctx);
if (ret)
break;
}
if (IS_ROOT(parent))
break;
parent = dget_parent(parent);
dput(old_parent);
old_parent = parent;
}
dput(old_parent);
return ret;
}
static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
const u64 ino = btrfs_ino(inode);
struct btrfs_path *path;
struct btrfs_key search_key;
int ret;
/*
* For a single hard link case, go through a fast path that does not
* need to iterate the fs/subvolume tree.
*/
if (inode->vfs_inode.i_nlink < 2) return log_new_ancestors_fast(trans, inode, parent, ctx); path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
search_key.objectid = ino;
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = 0;
again:
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto out;
if (ret == 0) path->slots[0]++;
while (true) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
struct btrfs_key found_key;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out; else if (ret > 0)
break;
continue;
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != ino ||
found_key.type > BTRFS_INODE_EXTREF_KEY)
break;
/*
* Don't deal with extended references because they are rare
* cases and too complex to deal with (we would need to keep
* track of which subitem we are processing for each item in
* this loop, etc). So just return some error to fallback to
* a transaction commit.
*/
if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
ret = -EMLINK;
goto out;
}
/*
* Logging ancestors needs to do more searches on the fs/subvol
* tree, so it releases the path as needed to avoid deadlocks.
* Keep track of the last inode ref key and resume from that key
* after logging all new ancestors for the current hard link.
*/
memcpy(&search_key, &found_key, sizeof(search_key));
ret = log_new_ancestors(trans, root, path, ctx);
if (ret)
goto out;
btrfs_release_path(path);
goto again;
}
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
* only logging is done of any parent directories that are older than
* the last committed transaction
*/
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
int inode_only,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
bool log_dentries = false;
if (btrfs_test_opt(fs_info, NOTREELOG)) {
ret = 1;
goto end_no_trans;
}
if (btrfs_root_refs(&root->root_item) == 0) {
ret = 1;
goto end_no_trans;
}
/*
* Skip already logged inodes or inodes corresponding to tmpfiles
* (since logging them is pointless, a link count of 0 means they
* will never be accessible).
*/
if ((btrfs_inode_in_log(inode, trans->transid) &&
list_empty(&ctx->ordered_extents)) ||
inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
}
ret = start_log_trans(trans, root, ctx);
if (ret)
goto end_no_trans;
ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
if (ret)
goto end_trans;
/*
* for regular files, if its inode is already on disk, we don't
* have to worry about the parents at all. This is because
* we can use the last_unlink_trans field to record renames
* and other fun in this file.
*/
if (S_ISREG(inode->vfs_inode.i_mode) && inode->generation < trans->transid &&
inode->last_unlink_trans < trans->transid) {
ret = 0;
goto end_trans;
}
if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
log_dentries = true;
/*
* On unlink we must make sure all our current and old parent directory
* inodes are fully logged. This is to prevent leaving dangling
* directory index entries in directories that were our parents but are
* not anymore. Not doing this results in old parent directory being
* impossible to delete after log replay (rmdir will always fail with
* error -ENOTEMPTY).
*
* Example 1:
*
* mkdir testdir
* touch testdir/foo
* ln testdir/foo testdir/bar
* sync
* unlink testdir/bar
* xfs_io -c fsync testdir/foo
* <power failure>
* mount fs, triggers log replay
*
* If we don't log the parent directory (testdir), after log replay the
* directory still has an entry pointing to the file inode using the bar
* name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
* the file inode has a link count of 1.
*
* Example 2:
*
* mkdir testdir
* touch foo
* ln foo testdir/foo2
* ln foo testdir/foo3
* sync
* unlink testdir/foo3
* xfs_io -c fsync foo
* <power failure>
* mount fs, triggers log replay
*
* Similar as the first example, after log replay the parent directory
* testdir still has an entry pointing to the inode file with name foo3
* but the file inode does not have a matching BTRFS_INODE_REF_KEY item
* and has a link count of 2.
*/
if (inode->last_unlink_trans >= trans->transid) {
ret = btrfs_log_all_parents(trans, inode, ctx);
if (ret)
goto end_trans;
}
ret = log_all_new_ancestors(trans, inode, parent, ctx);
if (ret)
goto end_trans;
if (log_dentries) ret = log_new_dir_dentries(trans, root, inode, ctx);
else
ret = 0;
end_trans:
if (ret < 0) { btrfs_set_log_full_commit(trans);
ret = 1;
}
if (ret)
btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
end_no_trans:
return ret;
}
/*
* it is not safe to log dentry if the chunk root has added new
* chunks. This returns 0 if the dentry was logged, and 1 otherwise.
* If this returns 1, you must commit the transaction to safely get your
* data on disk.
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
LOG_INODE_ALL, ctx);
dput(parent);
return ret;
}
/*
* should be called during mount to recover any replay any log trees
* from the FS
*/
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
int ret;
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
struct walk_control wc = {
.process_func = process_one_buffer,
.stage = LOG_WALK_PIN_ONLY,
};
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
trans = btrfs_start_transaction(fs_info->tree_root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto error;
}
wc.trans = trans;
wc.pin = 1;
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Failed to pin buffers while recovering log root tree.");
goto error;
}
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
while (1) {
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
btrfs_handle_fs_error(fs_info, ret,
"Couldn't find tree log root.");
goto error;
}
if (ret > 0) {
if (path->slots[0] == 0)
break;
path->slots[0]--;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
btrfs_release_path(path);
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
log = btrfs_read_tree_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
btrfs_handle_fs_error(fs_info, ret,
"Couldn't read tree log root.");
goto error;
}
wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
true);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
/*
* We didn't find the subvol, likely because it was
* deleted. This is ok, simply skip this log and go to
* the next one.
*
* We need to exclude the root because we can't have
* other log replays overwriting this log as we'll read
* it back in a few more times. This will keep our
* block from being modified, and we'll just bail for
* each subsequent pass.
*/
if (ret == -ENOENT)
ret = btrfs_pin_extent_for_log_replay(trans,
log->node->start,
log->node->len);
btrfs_put_root(log);
if (!ret)
goto next;
btrfs_handle_fs_error(fs_info, ret,
"Couldn't read target root for tree log recovery.");
goto error;
}
wc.replay_dest->log_root = log;
ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
if (ret)
/* The loop needs to continue due to the root refs */
btrfs_handle_fs_error(fs_info, ret,
"failed to record the log root in transaction");
else
ret = walk_log_tree(trans, log, &wc);
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
path);
}
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
struct btrfs_root *root = wc.replay_dest;
btrfs_release_path(path);
/*
* We have just replayed everything, and the highest
* objectid of fs roots probably has changed in case
* some inode_item's got replayed.
*
* root->objectid_mutex is not acquired as log replay
* could only happen during mount.
*/
ret = btrfs_init_root_free_objectid(root);
}
wc.replay_dest->log_root = NULL;
btrfs_put_root(wc.replay_dest);
btrfs_put_root(log);
if (ret)
goto error;
next:
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
}
btrfs_release_path(path);
/* step one is to pin it all, step two is to replay just inodes */
if (wc.pin) {
wc.pin = 0;
wc.process_func = replay_one_buffer;
wc.stage = LOG_WALK_REPLAY_INODES;
goto again;
}
/* step three is to replay everything */
if (wc.stage < LOG_WALK_REPLAY_ALL) {
wc.stage++;
goto again;
}
btrfs_free_path(path);
/* step 4: commit the transaction, which also unpins the blocks */
ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
log_root_tree->log_root = NULL;
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_put_root(log_root_tree);
return 0;
error:
if (wc.trans)
btrfs_end_transaction(wc.trans);
clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
}
/*
* there are some corner cases where we want to force a full
* commit instead of allowing a directory to be logged.
*
* They revolve around files there were unlinked from the directory, and
* this function updates the parent directory so that a full commit is
* properly done if it is fsync'd later after the unlinks are done.
*
* Must be called before the unlink operations (updates to the subvolume tree,
* inodes, etc) are done.
*/
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct btrfs_inode *inode,
int for_rename)
{
/*
* when we're logging a file, if it hasn't been renamed
* or unlinked, and its inode is fully committed on disk,
* we don't have to worry about walking up the directory chain
* to log its parents.
*
* So, we use the last_unlink_trans field to put this transid
* into the file. When the file is logged we check it and
* don't log the parents if the file is fully on disk.
*/
mutex_lock(&inode->log_mutex);
inode->last_unlink_trans = trans->transid;
mutex_unlock(&inode->log_mutex);
/*
* if this directory was already logged any new
* names for this file/dir will get recorded
*/
if (dir->logged_trans == trans->transid)
return;
/*
* if the inode we're about to unlink was logged,
* the log will be properly updated for any new names
*/
if (inode->logged_trans == trans->transid)
return;
/*
* when renaming files across directories, if the directory
* there we're unlinking from gets fsync'd later on, there's
* no way to find the destination directory later and fsync it
* properly. So, we have to be conservative and force commits
* so the new name gets discovered.
*/
if (for_rename)
goto record;
/* we can safely do the unlink without any special recording */
return;
record:
mutex_lock(&dir->log_mutex);
dir->last_unlink_trans = trans->transid;
mutex_unlock(&dir->log_mutex);
}
/*
* Make sure that if someone attempts to fsync the parent directory of a deleted
* snapshot, it ends up triggering a transaction commit. This is to guarantee
* that after replaying the log tree of the parent directory's root we will not
* see the snapshot anymore and at log replay time we will not see any log tree
* corresponding to the deleted snapshot's root, which could lead to replaying
* it after replaying the log tree of the parent directory (which would replay
* the snapshot delete operation).
*
* Must be called before the actual snapshot destroy operation (updates to the
* parent root and tree of tree roots trees, etc) are done.
*/
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir)
{
mutex_lock(&dir->log_mutex);
dir->last_unlink_trans = trans->transid;
mutex_unlock(&dir->log_mutex);
}
/*
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
*/
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
struct dentry *parent)
{
struct btrfs_log_ctx ctx;
/*
* this will force the logging code to walk the dentry chain
* up for the file
*/
if (!S_ISDIR(inode->vfs_inode.i_mode)) inode->last_unlink_trans = trans->transid;
/*
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
if (!inode_logged(trans, inode) && (!old_dir || !inode_logged(trans, old_dir))) return;
/*
* If we are doing a rename (old_dir is not NULL) from a directory that
* was previously logged, make sure the next log attempt on the directory
* is not skipped and logs the inode again. This is because the log may
* not currently be authoritative for a range including the old
* BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make
* sure after a log replay we do not end up with both the new and old
* dentries around (in case the inode is a directory we would have a
* directory with two hard links and 2 inode references for different
* parents). The next log attempt of old_dir will happen at
* btrfs_log_all_parents(), called through btrfs_log_inode_parent()
* below, because we have previously set inode->last_unlink_trans to the
* current transaction ID, either here or at btrfs_record_unlink_dir() in
* case inode is a directory.
*/
if (old_dir) old_dir->logged_trans = 0; btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
ctx.logging_new_name = true;
/*
* We don't care about the return value. If we fail to log the new name
* then we know the next attempt to sync the log will fallback to a full
* transaction commit (due to a call to btrfs_set_log_full_commit()), so
* we don't need to worry about getting a log committed that has an
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
}
// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
* Copyright (C) 2009 Red Hat, Inc.
*
* Creation is done via kthreadd, so that we get a clean environment
* even if we're invoked from userspace (think modprobe, hotplug cpu,
* etc.).
*/
#include <uapi/linux/sched/types.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
#include <linux/sched/isolation.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
int node;
/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
struct completion *done;
struct list_head list;
};
struct kthread {
unsigned long flags;
unsigned int cpu;
int (*threadfn)(void *);
void *data;
mm_segment_t oldfs;
struct completion parked;
struct completion exited;
#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *blkcg_css;
#endif
};
enum KTHREAD_BITS {
KTHREAD_IS_PER_CPU = 0,
KTHREAD_SHOULD_STOP,
KTHREAD_SHOULD_PARK,
};
static inline struct kthread *to_kthread(struct task_struct *k)
{
WARN_ON(!(k->flags & PF_KTHREAD)); return (__force void *)k->set_child_tid;
}
/*
* Variant of to_kthread() that doesn't assume @p is a kthread.
*
* Per construction; when:
*
* (p->flags & PF_KTHREAD) && p->set_child_tid
*
* the task is both a kthread and struct kthread is persistent. However
* PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
* begin_new_exec()).
*/
static inline struct kthread *__to_kthread(struct task_struct *p)
{
void *kthread = (__force void *)p->set_child_tid;
if (kthread && !(p->flags & PF_KTHREAD))
kthread = NULL;
return kthread;
}
void set_kthread_struct(struct task_struct *p)
{
struct kthread *kthread;
if (__to_kthread(p))
return;
kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
/*
* We abuse ->set_child_tid to avoid the new member and because it
* can't be wrongly copied by copy_process(). We also rely on fact
* that the caller can't exec, so PF_KTHREAD can't be cleared.
*/
p->set_child_tid = (__force void __user *)kthread;
}
void free_kthread_struct(struct task_struct *k)
{
struct kthread *kthread;
/*
* Can be NULL if this kthread was created by kernel_thread()
* or if kmalloc() in kthread() failed.
*/
kthread = to_kthread(k);
#ifdef CONFIG_BLK_CGROUP
WARN_ON_ONCE(kthread && kthread->blkcg_css);
#endif
kfree(kthread);
}
/**
* kthread_should_stop - should this kthread return now?
*
* When someone calls kthread_stop() on your kthread, it will be woken
* and this will return true. You should then return, and your return
* value will be passed through to kthread_stop().
*/
bool kthread_should_stop(void)
{
return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
}
EXPORT_SYMBOL(kthread_should_stop);
bool __kthread_should_park(struct task_struct *k)
{
return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
}
EXPORT_SYMBOL_GPL(__kthread_should_park);
/**
* kthread_should_park - should this kthread park now?
*
* When someone calls kthread_park() on your kthread, it will be woken
* and this will return true. You should then do the necessary
* cleanup and call kthread_parkme()
*
* Similar to kthread_should_stop(), but this keeps the thread alive
* and in a park position. kthread_unpark() "restarts" the thread and
* calls the thread function again.
*/
bool kthread_should_park(void)
{
return __kthread_should_park(current);
}
EXPORT_SYMBOL_GPL(kthread_should_park);
/**
* kthread_freezable_should_stop - should this freezable kthread return now?
* @was_frozen: optional out parameter, indicates whether %current was frozen
*
* kthread_should_stop() for freezable kthreads, which will enter
* refrigerator if necessary. This function is safe from kthread_stop() /
* freezer deadlock and freezable kthreads should use this function instead
* of calling try_to_freeze() directly.
*/
bool kthread_freezable_should_stop(bool *was_frozen)
{
bool frozen = false;
might_sleep();
if (unlikely(freezing(current)))
frozen = __refrigerator(true);
if (was_frozen)
*was_frozen = frozen;
return kthread_should_stop();
}
EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
/**
* kthread_func - return the function specified on kthread creation
* @task: kthread task in question
*
* Returns NULL if the task is not a kthread.
*/
void *kthread_func(struct task_struct *task)
{
struct kthread *kthread = __to_kthread(task);
if (kthread)
return kthread->threadfn;
return NULL;
}
EXPORT_SYMBOL_GPL(kthread_func);
/**
* kthread_data - return data value specified on kthread creation
* @task: kthread task in question
*
* Return the data value specified when kthread @task was created.
* The caller is responsible for ensuring the validity of @task when
* calling this function.
*/
void *kthread_data(struct task_struct *task)
{
return to_kthread(task)->data;
}
EXPORT_SYMBOL_GPL(kthread_data);
/**
* kthread_probe_data - speculative version of kthread_data()
* @task: possible kthread task in question
*
* @task could be a kthread task. Return the data value specified when it
* was created if accessible. If @task isn't a kthread task or its data is
* inaccessible for any reason, %NULL is returned. This function requires
* that @task itself is safe to dereference.
*/
void *kthread_probe_data(struct task_struct *task)
{
struct kthread *kthread = __to_kthread(task);
void *data = NULL;
if (kthread)
copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
return data;
}
static void __kthread_parkme(struct kthread *self)
{
for (;;) {
/*
* TASK_PARKED is a special state; we must serialize against
* possible pending wakeups to avoid store-store collisions on
* task->state.
*
* Such a collision might possibly result in the task state
* changin from TASK_PARKED and us failing the
* wait_task_inactive() in kthread_park().
*/
set_special_state(TASK_PARKED);
if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
break;
/*
* Thread is going to call schedule(), do not preempt it,
* or the caller of kthread_park() may spend more time in
* wait_task_inactive().
*/
preempt_disable();
complete(&self->parked);
schedule_preempt_disabled();
preempt_enable();
}
__set_current_state(TASK_RUNNING);
}
void kthread_parkme(void)
{
__kthread_parkme(to_kthread(current));
}
EXPORT_SYMBOL_GPL(kthread_parkme);
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
void *data = create->data;
struct completion *done;
struct kthread *self;
int ret;
set_kthread_struct(current);
self = to_kthread(current);
/* If user was SIGKILLed, I release the structure. */
done = xchg(&create->done, NULL);
if (!done) {
kfree(create);
do_exit(-EINTR);
}
if (!self) {
create->result = ERR_PTR(-ENOMEM);
complete(done);
do_exit(-ENOMEM);
}
self->threadfn = threadfn;
self->data = data;
init_completion(&self->exited);
init_completion(&self->parked);
current->vfork_done = &self->exited;
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
/*
* Thread is going to call schedule(), do not preempt it,
* or the creator may spend more time in wait_task_inactive().
*/
preempt_disable();
complete(done);
schedule_preempt_disabled();
preempt_enable();
ret = -EINTR;
if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
cgroup_kthread_ready();
__kthread_parkme(self);
ret = threadfn(data);
}
do_exit(ret);
}
/* called from kernel_clone() to get node information for about to be created task */
int tsk_fork_get_node(struct task_struct *tsk)
{
#ifdef CONFIG_NUMA
if (tsk == kthreadd_task)
return tsk->pref_node_fork;
#endif
return NUMA_NO_NODE;
}
static void create_kthread(struct kthread_create_info *create)
{
int pid;
#ifdef CONFIG_NUMA
current->pref_node_fork = create->node;
#endif
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
/* If user was SIGKILLed, I release the structure. */
struct completion *done = xchg(&create->done, NULL);
if (!done) {
kfree(create);
return;
}
create->result = ERR_PTR(pid);
complete(done);
}
}
static __printf(4, 0)
struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
void *data, int node,
const char namefmt[],
va_list args)
{
DECLARE_COMPLETION_ONSTACK(done);
struct task_struct *task;
struct kthread_create_info *create = kmalloc(sizeof(*create),
GFP_KERNEL);
if (!create)
return ERR_PTR(-ENOMEM);
create->threadfn = threadfn;
create->data = data;
create->node = node;
create->done = &done;
spin_lock(&kthread_create_lock);
list_add_tail(&create->list, &kthread_create_list);
spin_unlock(&kthread_create_lock);
wake_up_process(kthreadd_task);
/*
* Wait for completion in killable state, for I might be chosen by
* the OOM killer while kthreadd is trying to allocate memory for
* new kernel thread.
*/
if (unlikely(wait_for_completion_killable(&done))) {
/*
* If I was SIGKILLed before kthreadd (or new kernel thread)
* calls complete(), leave the cleanup of this structure to
* that thread.
*/
if (xchg(&create->done, NULL))
return ERR_PTR(-EINTR);
/*
* kthreadd (or new kernel thread) will call complete()
* shortly.
*/
wait_for_completion(&done);
}
task = create->result;
if (!IS_ERR(task)) {
static const struct sched_param param = { .sched_priority = 0 };
char name[TASK_COMM_LEN];
/*
* task is already visible to other tasks, so updating
* COMM must be protected.
*/
vsnprintf(name, sizeof(name), namefmt, args);
set_task_comm(task, name);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
*/
sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m);
set_cpus_allowed_ptr(task,
housekeeping_cpumask(HK_FLAG_KTHREAD));
}
kfree(create); return task;
}
/**
* kthread_create_on_node - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
* @node: task and thread structures for the thread are allocated on this node
* @namefmt: printf-style name for the thread.
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
* it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
* is affine to all CPUs.
*
* If thread is going to be bound on a particular cpu, give its node
* in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
* standalone thread for which no one will call kthread_stop(), or
* return when 'kthread_should_stop()' is true (which means
* kthread_stop() has been called). The return value should be zero
* or a negative error number; it will be passed to kthread_stop().
*
* Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
*/
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
void *data, int node,
const char namefmt[],
...)
{
struct task_struct *task;
va_list args;
va_start(args, namefmt);
task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
va_end(args);
return task;
}
EXPORT_SYMBOL(kthread_create_on_node);
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
unsigned long flags;
if (!wait_task_inactive(p, state)) { WARN_ON(1);
return;
}
/* It's safe because the task is inactive. */
raw_spin_lock_irqsave(&p->pi_lock, flags);
do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
{
__kthread_bind_mask(p, cpumask_of(cpu), state);
}
void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
{
__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
}
/**
* kthread_bind - bind a just-created kthread to a cpu.
* @p: thread created by kthread_create().
* @cpu: cpu (might not be online, must be possible) for @k to run on.
*
* Description: This function is equivalent to set_cpus_allowed(),
* except that @cpu doesn't need to be online, and the thread must be
* stopped (i.e., just returned from kthread_create()).
*/
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
__kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(kthread_bind);
/**
* kthread_create_on_cpu - Create a cpu bound kthread
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
* @cpu: The cpu on which the thread should be bound,
* @namefmt: printf-style name for the thread. Format is restricted
* to "name.*%u". Code fills in cpu number.
*
* Description: This helper function creates and names a kernel thread
*/
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
void *data, unsigned int cpu,
const char *namefmt)
{
struct task_struct *p;
p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
cpu);
if (IS_ERR(p))
return p;
kthread_bind(p, cpu);
/* CPU hotplug need to bind once again when unparking the thread. */
to_kthread(p)->cpu = cpu;
return p;
}
void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
struct kthread *kthread = to_kthread(k);
if (!kthread)
return;
WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));
if (cpu < 0) {
clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
return;
}
kthread->cpu = cpu;
set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
}
bool kthread_is_per_cpu(struct task_struct *p)
{
struct kthread *kthread = __to_kthread(p);
if (!kthread)
return false;
return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags);}
/**
* kthread_unpark - unpark a thread created by kthread_create().
* @k: thread created by kthread_create().
*
* Sets kthread_should_park() for @k to return false, wakes it, and
* waits for it to return. If the thread is marked percpu then its
* bound to the cpu again.
*/
void kthread_unpark(struct task_struct *k)
{
struct kthread *kthread = to_kthread(k);
/*
* Newly created kthread was parked when the CPU was offline.
* The binding was lost and we need to set it again.
*/
if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
__kthread_bind(k, kthread->cpu, TASK_PARKED);
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
/*
* __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
*/
wake_up_state(k, TASK_PARKED);
}
EXPORT_SYMBOL_GPL(kthread_unpark);
/**
* kthread_park - park a thread created by kthread_create().
* @k: thread created by kthread_create().
*
* Sets kthread_should_park() for @k to return true, wakes it, and
* waits for it to return. This can also be called after kthread_create()
* instead of calling wake_up_process(): the thread will park without
* calling threadfn().
*
* Returns 0 if the thread is parked, -ENOSYS if the thread exited.
* If called by the kthread itself just the park bit is set.
*/
int kthread_park(struct task_struct *k)
{
struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING))
return -ENOSYS;
if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
return -EBUSY;
set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
if (k != current) {
wake_up_process(k);
/*
* Wait for __kthread_parkme() to complete(), this means we
* _will_ have TASK_PARKED and are about to call schedule().
*/
wait_for_completion(&kthread->parked);
/*
* Now wait for that schedule() to complete and the task to
* get scheduled out.
*/
WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
}
return 0;
}
EXPORT_SYMBOL_GPL(kthread_park);
/**
* kthread_stop - stop a thread created by kthread_create().
* @k: thread created by kthread_create().
*
* Sets kthread_should_stop() for @k to return true, wakes it, and
* waits for it to exit. This can also be called after kthread_create()
* instead of calling wake_up_process(): the thread will exit without
* calling threadfn().
*
* If threadfn() may call do_exit() itself, the caller must ensure
* task_struct can't go away.
*
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
* was never called.
*/
int kthread_stop(struct task_struct *k)
{
struct kthread *kthread;
int ret;
trace_sched_kthread_stop(k);
get_task_struct(k);
kthread = to_kthread(k);
set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
kthread_unpark(k);
wake_up_process(k);
wait_for_completion(&kthread->exited);
ret = k->exit_code;
put_task_struct(k);
trace_sched_kthread_stop_ret(ret);
return ret;
}
EXPORT_SYMBOL(kthread_stop);
int kthreadd(void *unused)
{
struct task_struct *tsk = current;
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
cgroup_init_kthreadd();
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
if (list_empty(&kthread_create_list))
schedule();
__set_current_state(TASK_RUNNING);
spin_lock(&kthread_create_lock);
while (!list_empty(&kthread_create_list)) {
struct kthread_create_info *create;
create = list_entry(kthread_create_list.next,
struct kthread_create_info, list);
list_del_init(&create->list);
spin_unlock(&kthread_create_lock);
create_kthread(create);
spin_lock(&kthread_create_lock);
}
spin_unlock(&kthread_create_lock);
}
return 0;
}
void __kthread_init_worker(struct kthread_worker *worker,
const char *name,
struct lock_class_key *key)
{
memset(worker, 0, sizeof(struct kthread_worker));
raw_spin_lock_init(&worker->lock);
lockdep_set_class_and_name(&worker->lock, key, name);
INIT_LIST_HEAD(&worker->work_list);
INIT_LIST_HEAD(&worker->delayed_work_list);
}
EXPORT_SYMBOL_GPL(__kthread_init_worker);
/**
* kthread_worker_fn - kthread function to process kthread_worker
* @worker_ptr: pointer to initialized kthread_worker
*
* This function implements the main cycle of kthread worker. It processes
* work_list until it is stopped with kthread_stop(). It sleeps when the queue
* is empty.
*
* The works are not allowed to keep any locks, disable preemption or interrupts
* when they finish. There is defined a safe point for freezing when one work
* finishes and before a new one is started.
*
* Also the works must not be handled by more than one worker at the same time,
* see also kthread_queue_work().
*/
int kthread_worker_fn(void *worker_ptr)
{
struct kthread_worker *worker = worker_ptr;
struct kthread_work *work;
/*
* FIXME: Update the check and remove the assignment when all kthread
* worker users are created using kthread_create_worker*() functions.
*/
WARN_ON(worker->task && worker->task != current);
worker->task = current;
if (worker->flags & KTW_FREEZABLE)
set_freezable();
repeat:
set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
raw_spin_lock_irq(&worker->lock);
worker->task = NULL;
raw_spin_unlock_irq(&worker->lock);
return 0;
}
work = NULL;
raw_spin_lock_irq(&worker->lock);
if (!list_empty(&worker->work_list)) {
work = list_first_entry(&worker->work_list,
struct kthread_work, node);
list_del_init(&work->node);
}
worker->current_work = work;
raw_spin_unlock_irq(&worker->lock);
if (work) {
kthread_work_func_t func = work->func;
__set_current_state(TASK_RUNNING);
trace_sched_kthread_work_execute_start(work);
work->func(work);
/*
* Avoid dereferencing work after this point. The trace
* event only cares about the address.
*/
trace_sched_kthread_work_execute_end(work, func);
} else if (!freezing(current))
schedule();
try_to_freeze();
cond_resched();
goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
static __printf(3, 0) struct kthread_worker *
__kthread_create_worker(int cpu, unsigned int flags,
const char namefmt[], va_list args)
{
struct kthread_worker *worker;
struct task_struct *task;
int node = NUMA_NO_NODE;
worker = kzalloc(sizeof(*worker), GFP_KERNEL);
if (!worker)
return ERR_PTR(-ENOMEM);
kthread_init_worker(worker);
if (cpu >= 0)
node = cpu_to_node(cpu);
task = __kthread_create_on_node(kthread_worker_fn, worker,
node, namefmt, args);
if (IS_ERR(task))
goto fail_task;
if (cpu >= 0)
kthread_bind(task, cpu);
worker->flags = flags;
worker->task = task;
wake_up_process(task);
return worker;
fail_task:
kfree(worker);
return ERR_CAST(task);
}
/**
* kthread_create_worker - create a kthread worker
* @flags: flags modifying the default behavior of the worker
* @namefmt: printf-style name for the kthread worker (task).
*
* Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
* when the needed structures could not get allocated, and ERR_PTR(-EINTR)
* when the worker was SIGKILLed.
*/
struct kthread_worker *
kthread_create_worker(unsigned int flags, const char namefmt[], ...)
{
struct kthread_worker *worker;
va_list args;
va_start(args, namefmt);
worker = __kthread_create_worker(-1, flags, namefmt, args);
va_end(args);
return worker;
}
EXPORT_SYMBOL(kthread_create_worker);
/**
* kthread_create_worker_on_cpu - create a kthread worker and bind it
* to a given CPU and the associated NUMA node.
* @cpu: CPU number
* @flags: flags modifying the default behavior of the worker
* @namefmt: printf-style name for the kthread worker (task).
*
* Use a valid CPU number if you want to bind the kthread worker
* to the given CPU and the associated NUMA node.
*
* A good practice is to add the cpu number also into the worker name.
* For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu).
*
* CPU hotplug:
* The kthread worker API is simple and generic. It just provides a way
* to create, use, and destroy workers.
*
* It is up to the API user how to handle CPU hotplug. They have to decide
* how to handle pending work items, prevent queuing new ones, and
* restore the functionality when the CPU goes off and on. There are a
* few catches:
*
* - CPU affinity gets lost when it is scheduled on an offline CPU.
*
* - The worker might not exist when the CPU was off when the user
* created the workers.
*
* Good practice is to implement two CPU hotplug callbacks and to
* destroy/create the worker when the CPU goes down/up.
*
* Return:
* The pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
* when the needed structures could not get allocated, and ERR_PTR(-EINTR)
* when the worker was SIGKILLed.
*/
struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
const char namefmt[], ...)
{
struct kthread_worker *worker;
va_list args;
va_start(args, namefmt);
worker = __kthread_create_worker(cpu, flags, namefmt, args);
va_end(args);
return worker;
}
EXPORT_SYMBOL(kthread_create_worker_on_cpu);
/*
* Returns true when the work could not be queued at the moment.
* It happens when it is already pending in a worker list
* or when it is being cancelled.
*/
static inline bool queuing_blocked(struct kthread_worker *worker,
struct kthread_work *work)
{
lockdep_assert_held(&worker->lock);
return !list_empty(&work->node) || work->canceling;
}
static void kthread_insert_work_sanity_check(struct kthread_worker *worker,
struct kthread_work *work)
{
lockdep_assert_held(&worker->lock);
WARN_ON_ONCE(!list_empty(&work->node));
/* Do not use a work with >1 worker, see kthread_queue_work() */
WARN_ON_ONCE(work->worker && work->worker != worker);
}
/* insert @work before @pos in @worker */
static void kthread_insert_work(struct kthread_worker *worker,
struct kthread_work *work,
struct list_head *pos)
{
kthread_insert_work_sanity_check(worker, work);
trace_sched_kthread_work_queue_work(worker, work);
list_add_tail(&work->node, pos);
work->worker = worker;
if (!worker->current_work && likely(worker->task))
wake_up_process(worker->task);
}
/**
* kthread_queue_work - queue a kthread_work
* @worker: target kthread_worker
* @work: kthread_work to queue
*
* Queue @work to work processor @task for async execution. @task
* must have been created with kthread_worker_create(). Returns %true
* if @work was successfully queued, %false if it was already pending.
*
* Reinitialize the work if it needs to be used by another worker.
* For example, when the worker was stopped and started again.
*/
bool kthread_queue_work(struct kthread_worker *worker,
struct kthread_work *work)
{
bool ret = false;
unsigned long flags;
raw_spin_lock_irqsave(&worker->lock, flags);
if (!queuing_blocked(worker, work)) {
kthread_insert_work(worker, work, &worker->work_list);
ret = true;
}
raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_queue_work);
/**
* kthread_delayed_work_timer_fn - callback that queues the associated kthread
* delayed work when the timer expires.
* @t: pointer to the expired timer
*
* The format of the function is defined by struct timer_list.
* It should have been called from irqsafe timer with irq already off.
*/
void kthread_delayed_work_timer_fn(struct timer_list *t)
{
struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
struct kthread_work *work = &dwork->work;
struct kthread_worker *worker = work->worker;
unsigned long flags;
/*
* This might happen when a pending work is reinitialized.
* It means that it is used a wrong way.
*/
if (WARN_ON_ONCE(!worker))
return;
raw_spin_lock_irqsave(&worker->lock, flags);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
/* Move the work from worker->delayed_work_list. */
WARN_ON_ONCE(list_empty(&work->node));
list_del_init(&work->node);
if (!work->canceling)
kthread_insert_work(worker, work, &worker->work_list);
raw_spin_unlock_irqrestore(&worker->lock, flags);
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);
static void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct kthread_delayed_work *dwork,
unsigned long delay)
{
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
WARN_ON_FUNCTION_MISMATCH(timer->function,
kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
* both optimization and correctness. The earliest @timer can
* expire is on the closest next tick and delayed_work users depend
* on that there's no such delay when @delay is 0.
*/
if (!delay) {
kthread_insert_work(worker, work, &worker->work_list);
return;
}
/* Be paranoid and try to detect possible races already now. */
kthread_insert_work_sanity_check(worker, work);
list_add(&work->node, &worker->delayed_work_list);
work->worker = worker;
timer->expires = jiffies + delay;
add_timer(timer);
}
/**
* kthread_queue_delayed_work - queue the associated kthread work
* after a delay.
* @worker: target kthread_worker
* @dwork: kthread_delayed_work to queue
* @delay: number of jiffies to wait before queuing
*
* If the work has not been pending it starts a timer that will queue
* the work after the given @delay. If @delay is zero, it queues the
* work immediately.
*
* Return: %false if the @work has already been pending. It means that
* either the timer was running or the work was queued. It returns %true
* otherwise.
*/
bool kthread_queue_delayed_work(struct kthread_worker *worker,
struct kthread_delayed_work *dwork,
unsigned long delay)
{
struct kthread_work *work = &dwork->work;
unsigned long flags;
bool ret = false;
raw_spin_lock_irqsave(&worker->lock, flags);
if (!queuing_blocked(worker, work)) {
__kthread_queue_delayed_work(worker, dwork, delay);
ret = true;
}
raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);
struct kthread_flush_work {
struct kthread_work work;
struct completion done;
};
static void kthread_flush_work_fn(struct kthread_work *work)
{
struct kthread_flush_work *fwork =
container_of(work, struct kthread_flush_work, work);
complete(&fwork->done);
}
/**
* kthread_flush_work - flush a kthread_work
* @work: work to flush
*
* If @work is queued or executing, wait for it to finish execution.
*/
void kthread_flush_work(struct kthread_work *work)
{
struct kthread_flush_work fwork = {
KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
COMPLETION_INITIALIZER_ONSTACK(fwork.done),
};
struct kthread_worker *worker;
bool noop = false;
worker = work->worker;
if (!worker)
return;
raw_spin_lock_irq(&worker->lock);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
if (!list_empty(&work->node))
kthread_insert_work(worker, &fwork.work, work->node.next);
else if (worker->current_work == work)
kthread_insert_work(worker, &fwork.work,
worker->work_list.next);
else
noop = true;
raw_spin_unlock_irq(&worker->lock);
if (!noop)
wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_work);
/*
* Make sure that the timer is neither set nor running and could
* not manipulate the work list_head any longer.
*
* The function is called under worker->lock. The lock is temporary
* released but the timer can't be set again in the meantime.
*/
static void kthread_cancel_delayed_work_timer(struct kthread_work *work,
unsigned long *flags)
{
struct kthread_delayed_work *dwork =
container_of(work, struct kthread_delayed_work, work);
struct kthread_worker *worker = work->worker;
/*
* del_timer_sync() must be called to make sure that the timer
* callback is not running. The lock must be temporary released
* to avoid a deadlock with the callback. In the meantime,
* any queuing is blocked by setting the canceling counter.
*/
work->canceling++;
raw_spin_unlock_irqrestore(&worker->lock, *flags);
del_timer_sync(&dwork->timer);
raw_spin_lock_irqsave(&worker->lock, *flags);
work->canceling--;
}
/*
* This function removes the work from the worker queue.
*
* It is called under worker->lock. The caller must make sure that
* the timer used by delayed work is not running, e.g. by calling
* kthread_cancel_delayed_work_timer().
*
* The work might still be in use when this function finishes. See the
* current_work proceed by the worker.
*
* Return: %true if @work was pending and successfully canceled,
* %false if @work was not pending
*/
static bool __kthread_cancel_work(struct kthread_work *work)
{
/*
* Try to remove the work from a worker list. It might either
* be from worker->work_list or from worker->delayed_work_list.
*/
if (!list_empty(&work->node)) {
list_del_init(&work->node);
return true;
}
return false;
}
/**
* kthread_mod_delayed_work - modify delay of or queue a kthread delayed work
* @worker: kthread worker to use
* @dwork: kthread delayed work to queue
* @delay: number of jiffies to wait before queuing
*
* If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise,
* modify @dwork's timer so that it expires after @delay. If @delay is zero,
* @work is guaranteed to be queued immediately.
*
* Return: %false if @dwork was idle and queued, %true otherwise.
*
* A special case is when the work is being canceled in parallel.
* It might be caused either by the real kthread_cancel_delayed_work_sync()
* or yet another kthread_mod_delayed_work() call. We let the other command
* win and return %true here. The return value can be used for reference
* counting and the number of queued works stays the same. Anyway, the caller
* is supposed to synchronize these operations a reasonable way.
*
* This function is safe to call from any context including IRQ handler.
* See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
* for details.
*/
bool kthread_mod_delayed_work(struct kthread_worker *worker,
struct kthread_delayed_work *dwork,
unsigned long delay)
{
struct kthread_work *work = &dwork->work;
unsigned long flags;
int ret;
raw_spin_lock_irqsave(&worker->lock, flags);
/* Do not bother with canceling when never queued. */
if (!work->worker) {
ret = false;
goto fast_queue;
}
/* Work must not be used with >1 worker, see kthread_queue_work() */
WARN_ON_ONCE(work->worker != worker);
/*
* Temporary cancel the work but do not fight with another command
* that is canceling the work as well.
*
* It is a bit tricky because of possible races with another
* mod_delayed_work() and cancel_delayed_work() callers.
*
* The timer must be canceled first because worker->lock is released
* when doing so. But the work can be removed from the queue (list)
* only when it can be queued again so that the return value can
* be used for reference counting.
*/
kthread_cancel_delayed_work_timer(work, &flags);
if (work->canceling) {
/* The number of works in the queue does not change. */
ret = true;
goto out;
}
ret = __kthread_cancel_work(work);
fast_queue:
__kthread_queue_delayed_work(worker, dwork, delay);
out:
raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);
static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
{
struct kthread_worker *worker = work->worker;
unsigned long flags;
int ret = false;
if (!worker)
goto out;
raw_spin_lock_irqsave(&worker->lock, flags);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
if (is_dwork)
kthread_cancel_delayed_work_timer(work, &flags);
ret = __kthread_cancel_work(work);
if (worker->current_work != work)
goto out_fast;
/*
* The work is in progress and we need to wait with the lock released.
* In the meantime, block any queuing by setting the canceling counter.
*/
work->canceling++;
raw_spin_unlock_irqrestore(&worker->lock, flags);
kthread_flush_work(work);
raw_spin_lock_irqsave(&worker->lock, flags);
work->canceling--;
out_fast:
raw_spin_unlock_irqrestore(&worker->lock, flags);
out:
return ret;
}
/**
* kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
* @work: the kthread work to cancel
*
* Cancel @work and wait for its execution to finish. This function
* can be used even if the work re-queues itself. On return from this
* function, @work is guaranteed to be not pending or executing on any CPU.
*
* kthread_cancel_work_sync(&delayed_work->work) must not be used for
* delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
*
* The caller must ensure that the worker on which @work was last
* queued can't be destroyed before this function returns.
*
* Return: %true if @work was pending, %false otherwise.
*/
bool kthread_cancel_work_sync(struct kthread_work *work)
{
return __kthread_cancel_work_sync(work, false);
}
EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);
/**
* kthread_cancel_delayed_work_sync - cancel a kthread delayed work and
* wait for it to finish.
* @dwork: the kthread delayed work to cancel
*
* This is kthread_cancel_work_sync() for delayed works.
*
* Return: %true if @dwork was pending, %false otherwise.
*/
bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork)
{
return __kthread_cancel_work_sync(&dwork->work, true);
}
EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync);
/**
* kthread_flush_worker - flush all current works on a kthread_worker
* @worker: worker to flush
*
* Wait until all currently executing or pending works on @worker are
* finished.
*/
void kthread_flush_worker(struct kthread_worker *worker)
{
struct kthread_flush_work fwork = {
KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
COMPLETION_INITIALIZER_ONSTACK(fwork.done),
};
kthread_queue_work(worker, &fwork.work);
wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_worker);
/**
* kthread_destroy_worker - destroy a kthread worker
* @worker: worker to be destroyed
*
* Flush and destroy @worker. The simple flush is enough because the kthread
* worker API is used only in trivial scenarios. There are no multi-step state
* machines needed.
*/
void kthread_destroy_worker(struct kthread_worker *worker)
{
struct task_struct *task;
task = worker->task;
if (WARN_ON(!task))
return;
kthread_flush_worker(worker);
kthread_stop(task);
WARN_ON(!list_empty(&worker->work_list));
kfree(worker);
}
EXPORT_SYMBOL(kthread_destroy_worker);
/**
* kthread_use_mm - make the calling kthread operate on an address space
* @mm: address space to operate on
*/
void kthread_use_mm(struct mm_struct *mm)
{
struct mm_struct *active_mm;
struct task_struct *tsk = current;
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
task_lock(tsk);
/* Hold off tlb flush IPIs while switching mm's */
local_irq_disable();
active_mm = tsk->active_mm;
if (active_mm != mm) {
mmgrab(mm);
tsk->active_mm = mm;
}
tsk->mm = mm;
membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
local_irq_enable();
task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
finish_arch_post_lock_switch();
#endif
/*
* When a kthread starts operating on an address space, the loop
* in membarrier_{private,global}_expedited() may not observe
* that tsk->mm, and not issue an IPI. Membarrier requires a
* memory barrier after storing to tsk->mm, before accessing
* user-space memory. A full memory barrier for membarrier
* {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
* mmdrop(), or explicitly with smp_mb().
*/
if (active_mm != mm)
mmdrop(active_mm);
else
smp_mb();
to_kthread(tsk)->oldfs = force_uaccess_begin();
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
/**
* kthread_unuse_mm - reverse the effect of kthread_use_mm()
* @mm: address space to operate on
*/
void kthread_unuse_mm(struct mm_struct *mm)
{
struct task_struct *tsk = current;
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(!tsk->mm);
force_uaccess_end(to_kthread(tsk)->oldfs);
task_lock(tsk);
/*
* When a kthread stops operating on an address space, the loop
* in membarrier_{private,global}_expedited() may not observe
* that tsk->mm, and not issue an IPI. Membarrier requires a
* memory barrier after accessing user-space memory, before
* clearing tsk->mm.
*/
smp_mb__after_spinlock();
sync_mm_rss(mm);
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
local_irq_enable();
task_unlock(tsk);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
#ifdef CONFIG_BLK_CGROUP
/**
* kthread_associate_blkcg - associate blkcg to current kthread
* @css: the cgroup info
*
* Current thread must be a kthread. The thread is running jobs on behalf of
* other threads. In some cases, we expect the jobs attach cgroup info of
* original threads instead of that of current thread. This function stores
* original thread's cgroup info in current kthread context for later
* retrieval.
*/
void kthread_associate_blkcg(struct cgroup_subsys_state *css)
{
struct kthread *kthread;
if (!(current->flags & PF_KTHREAD))
return;
kthread = to_kthread(current);
if (!kthread)
return;
if (kthread->blkcg_css) {
css_put(kthread->blkcg_css);
kthread->blkcg_css = NULL;
}
if (css) {
css_get(css);
kthread->blkcg_css = css;
}
}
EXPORT_SYMBOL(kthread_associate_blkcg);
/**
* kthread_blkcg - get associated blkcg css of current kthread
*
* Current thread must be a kthread.
*/
struct cgroup_subsys_state *kthread_blkcg(void)
{
struct kthread *kthread;
if (current->flags & PF_KTHREAD) {
kthread = to_kthread(current);
if (kthread)
return kthread->blkcg_css;
}
return NULL;
}
EXPORT_SYMBOL(kthread_blkcg);
#endif
// SPDX-License-Identifier: GPL-2.0
/*
* fs/sysfs/group.c - Operations for adding/removing multiple files at once.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
* Copyright (c) 2013 Greg Kroah-Hartman
* Copyright (c) 2013 The Linux Foundation
*/
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/err.h>
#include <linux/fs.h>
#include "sysfs.h"
static void remove_files(struct kernfs_node *parent,
const struct attribute_group *grp)
{
struct attribute *const *attr;
struct bin_attribute *const *bin_attr;
if (grp->attrs) for (attr = grp->attrs; *attr; attr++) kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) kernfs_remove_by_name(parent, (*bin_attr)->attr.name);
}
static int create_files(struct kernfs_node *parent, struct kobject *kobj,
kuid_t uid, kgid_t gid,
const struct attribute_group *grp, int update)
{
struct attribute *const *attr;
struct bin_attribute *const *bin_attr;
int error = 0, i;
if (grp->attrs) {
for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { umode_t mode = (*attr)->mode;
/*
* In update mode, we're changing the permissions or
* visibility. Do this by first removing then
* re-adding (if required) the file.
*/
if (update)
kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (!mode)
continue;
}
WARN(mode & ~(SYSFS_PREALLOC | 0664),
"Attribute %s: Invalid permissions 0%o\n",
(*attr)->name, mode);
mode &= SYSFS_PREALLOC | 0664;
error = sysfs_add_file_mode_ns(parent, *attr, false,
mode, uid, gid, NULL);
if (unlikely(error))
break;
}
if (error) {
remove_files(parent, grp);
goto exit;
}
}
if (grp->bin_attrs) { for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { umode_t mode = (*bin_attr)->attr.mode;
if (update)
kernfs_remove_by_name(parent,
(*bin_attr)->attr.name);
if (grp->is_bin_visible) { mode = grp->is_bin_visible(kobj, *bin_attr, i); if (!mode)
continue;
}
WARN(mode & ~(SYSFS_PREALLOC | 0664),
"Attribute %s: Invalid permissions 0%o\n",
(*bin_attr)->attr.name, mode);
mode &= SYSFS_PREALLOC | 0664;
error = sysfs_add_file_mode_ns(parent,
&(*bin_attr)->attr, true,
mode,
uid, gid, NULL);
if (error)
break;
}
if (error)
remove_files(parent, grp);
}
exit:
return error;
}
static int internal_create_group(struct kobject *kobj, int update,
const struct attribute_group *grp)
{
struct kernfs_node *kn;
kuid_t uid;
kgid_t gid;
int error;
if (WARN_ON(!kobj || (!update && !kobj->sd)))
return -EINVAL;
/* Updates may happen before the object has been instantiated */
if (unlikely(update && !kobj->sd))
return -EINVAL;
if (!grp->attrs && !grp->bin_attrs) { WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
kobj->name, grp->name ?: "");
return -EINVAL;
}
kobject_get_ownership(kobj, &uid, &gid);
if (grp->name) {
if (update) {
kn = kernfs_find_and_get(kobj->sd, grp->name);
if (!kn) {
pr_warn("Can't update unknown attr grp name: %s/%s\n",
kobj->name, grp->name);
return -EINVAL;
}
} else {
kn = kernfs_create_dir_ns(kobj->sd, grp->name,
S_IRWXU | S_IRUGO | S_IXUGO,
uid, gid, kobj, NULL);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
sysfs_warn_dup(kobj->sd, grp->name); return PTR_ERR(kn);
}
}
} else
kn = kobj->sd; kernfs_get(kn); error = create_files(kn, kobj, uid, gid, grp, update);
if (error) {
if (grp->name)
kernfs_remove(kn);
}
kernfs_put(kn); if (grp->name && update) kernfs_put(kn);
return error;
}
/**
* sysfs_create_group - given a directory kobject, create an attribute group
* @kobj: The kobject to create the group on
* @grp: The attribute group to create
*
* This function creates a group for the first time. It will explicitly
* warn and error if any of the attribute files being created already exist.
*
* Returns 0 on success or error code on failure.
*/
int sysfs_create_group(struct kobject *kobj,
const struct attribute_group *grp)
{
return internal_create_group(kobj, 0, grp);
}
EXPORT_SYMBOL_GPL(sysfs_create_group);
static int internal_create_groups(struct kobject *kobj, int update,
const struct attribute_group **groups)
{
int error = 0;
int i;
if (!groups)
return 0;
for (i = 0; groups[i]; i++) { error = internal_create_group(kobj, update, groups[i]);
if (error) {
while (--i >= 0) sysfs_remove_group(kobj, groups[i]);
break;
}
}
return error;
}
/**
* sysfs_create_groups - given a directory kobject, create a bunch of attribute groups
* @kobj: The kobject to create the group on
* @groups: The attribute groups to create, NULL terminated
*
* This function creates a bunch of attribute groups. If an error occurs when
* creating a group, all previously created groups will be removed, unwinding
* everything back to the original state when this function was called.
* It will explicitly warn and error if any of the attribute files being
* created already exist.
*
* Returns 0 on success or error code from sysfs_create_group on failure.
*/
int sysfs_create_groups(struct kobject *kobj,
const struct attribute_group **groups)
{
return internal_create_groups(kobj, 0, groups);
}
EXPORT_SYMBOL_GPL(sysfs_create_groups);
/**
* sysfs_update_groups - given a directory kobject, create a bunch of attribute groups
* @kobj: The kobject to update the group on
* @groups: The attribute groups to update, NULL terminated
*
* This function update a bunch of attribute groups. If an error occurs when
* updating a group, all previously updated groups will be removed together
* with already existing (not updated) attributes.
*
* Returns 0 on success or error code from sysfs_update_group on failure.
*/
int sysfs_update_groups(struct kobject *kobj,
const struct attribute_group **groups)
{
return internal_create_groups(kobj, 1, groups);
}
EXPORT_SYMBOL_GPL(sysfs_update_groups);
/**
* sysfs_update_group - given a directory kobject, update an attribute group
* @kobj: The kobject to update the group on
* @grp: The attribute group to update
*
* This function updates an attribute group. Unlike
* sysfs_create_group(), it will explicitly not warn or error if any
* of the attribute files being created already exist. Furthermore,
* if the visibility of the files has changed through the is_visible()
* callback, it will update the permissions and add or remove the
* relevant files. Changing a group's name (subdirectory name under
* kobj's directory in sysfs) is not allowed.
*
* The primary use for this function is to call it after making a change
* that affects group visibility.
*
* Returns 0 on success or error code on failure.
*/
int sysfs_update_group(struct kobject *kobj,
const struct attribute_group *grp)
{
return internal_create_group(kobj, 1, grp);
}
EXPORT_SYMBOL_GPL(sysfs_update_group);
/**
* sysfs_remove_group: remove a group from a kobject
* @kobj: kobject to remove the group from
* @grp: group to remove
*
* This function removes a group of attributes from a kobject. The attributes
* previously have to have been created for this group, otherwise it will fail.
*/
void sysfs_remove_group(struct kobject *kobj,
const struct attribute_group *grp)
{
struct kernfs_node *parent = kobj->sd;
struct kernfs_node *kn;
if (grp->name) {
kn = kernfs_find_and_get(parent, grp->name);
if (!kn) {
WARN(!kn, KERN_WARNING
"sysfs group '%s' not found for kobject '%s'\n",
grp->name, kobject_name(kobj));
return;
}
} else {
kn = parent;
kernfs_get(kn);
}
remove_files(kn, grp);
if (grp->name)
kernfs_remove(kn); kernfs_put(kn);
}
EXPORT_SYMBOL_GPL(sysfs_remove_group);
/**
* sysfs_remove_groups - remove a list of groups
*
* @kobj: The kobject for the groups to be removed from
* @groups: NULL terminated list of groups to be removed
*
* If groups is not NULL, remove the specified groups from the kobject.
*/
void sysfs_remove_groups(struct kobject *kobj,
const struct attribute_group **groups)
{
int i;
if (!groups)
return;
for (i = 0; groups[i]; i++) sysfs_remove_group(kobj, groups[i]);
}
EXPORT_SYMBOL_GPL(sysfs_remove_groups);
/**
* sysfs_merge_group - merge files into a pre-existing attribute group.
* @kobj: The kobject containing the group.
* @grp: The files to create and the attribute group they belong to.
*
* This function returns an error if the group doesn't exist or any of the
* files already exist in that group, in which case none of the new files
* are created.
*/
int sysfs_merge_group(struct kobject *kobj,
const struct attribute_group *grp)
{
struct kernfs_node *parent;
kuid_t uid;
kgid_t gid;
int error = 0;
struct attribute *const *attr;
int i;
parent = kernfs_find_and_get(kobj->sd, grp->name);
if (!parent)
return -ENOENT;
kobject_get_ownership(kobj, &uid, &gid); for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
error = sysfs_add_file_mode_ns(parent, *attr, false,
(*attr)->mode, uid, gid, NULL); if (error) { while (--i >= 0) kernfs_remove_by_name(parent, (*--attr)->name);
}
kernfs_put(parent); return error;
}
EXPORT_SYMBOL_GPL(sysfs_merge_group);
/**
* sysfs_unmerge_group - remove files from a pre-existing attribute group.
* @kobj: The kobject containing the group.
* @grp: The files to remove and the attribute group they belong to.
*/
void sysfs_unmerge_group(struct kobject *kobj,
const struct attribute_group *grp)
{
struct kernfs_node *parent;
struct attribute *const *attr;
parent = kernfs_find_and_get(kobj->sd, grp->name);
if (parent) {
for (attr = grp->attrs; *attr; ++attr) kernfs_remove_by_name(parent, (*attr)->name); kernfs_put(parent);
}
}
EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
/**
* sysfs_add_link_to_group - add a symlink to an attribute group.
* @kobj: The kobject containing the group.
* @group_name: The name of the group.
* @target: The target kobject of the symlink to create.
* @link_name: The name of the symlink to create.
*/
int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
struct kobject *target, const char *link_name)
{
struct kernfs_node *parent;
int error = 0;
parent = kernfs_find_and_get(kobj->sd, group_name);
if (!parent)
return -ENOENT;
error = sysfs_create_link_sd(parent, target, link_name);
kernfs_put(parent);
return error;
}
EXPORT_SYMBOL_GPL(sysfs_add_link_to_group);
/**
* sysfs_remove_link_from_group - remove a symlink from an attribute group.
* @kobj: The kobject containing the group.
* @group_name: The name of the group.
* @link_name: The name of the symlink to remove.
*/
void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
const char *link_name)
{
struct kernfs_node *parent;
parent = kernfs_find_and_get(kobj->sd, group_name);
if (parent) {
kernfs_remove_by_name(parent, link_name);
kernfs_put(parent);
}
}
EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
/**
* compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing
* to a group or an attribute
* @kobj: The kobject containing the group.
* @target_kobj: The target kobject.
* @target_name: The name of the target group or attribute.
* @symlink_name: The name of the symlink file (target_name will be
* considered if symlink_name is NULL).
*/
int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
struct kobject *target_kobj,
const char *target_name,
const char *symlink_name)
{
struct kernfs_node *target;
struct kernfs_node *entry;
struct kernfs_node *link;
/*
* We don't own @target_kobj and it may be removed at any time.
* Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir()
* for details.
*/
spin_lock(&sysfs_symlink_target_lock);
target = target_kobj->sd;
if (target)
kernfs_get(target);
spin_unlock(&sysfs_symlink_target_lock);
if (!target)
return -ENOENT;
entry = kernfs_find_and_get(target, target_name);
if (!entry) {
kernfs_put(target);
return -ENOENT;
}
if (!symlink_name)
symlink_name = target_name;
link = kernfs_create_link(kobj->sd, symlink_name, entry);
if (PTR_ERR(link) == -EEXIST)
sysfs_warn_dup(kobj->sd, symlink_name);
kernfs_put(entry);
kernfs_put(target);
return PTR_ERR_OR_ZERO(link);
}
EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj);
static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn,
const struct attribute_group *grp,
struct iattr *newattrs)
{
struct kernfs_node *kn;
int error;
if (grp->attrs) {
struct attribute *const *attr;
for (attr = grp->attrs; *attr; attr++) {
kn = kernfs_find_and_get(grp_kn, (*attr)->name);
if (!kn)
return -ENOENT;
error = kernfs_setattr(kn, newattrs);
kernfs_put(kn);
if (error)
return error;
}
}
if (grp->bin_attrs) {
struct bin_attribute *const *bin_attr;
for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name);
if (!kn)
return -ENOENT;
error = kernfs_setattr(kn, newattrs);
kernfs_put(kn);
if (error)
return error;
}
}
return 0;
}
/**
* sysfs_group_change_owner - change owner of an attribute group.
* @kobj: The kobject containing the group.
* @grp: The attribute group.
* @kuid: new owner's kuid
* @kgid: new owner's kgid
*
* Returns 0 on success or error code on failure.
*/
int sysfs_group_change_owner(struct kobject *kobj,
const struct attribute_group *grp, kuid_t kuid,
kgid_t kgid)
{
struct kernfs_node *grp_kn;
int error;
struct iattr newattrs = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = kuid,
.ia_gid = kgid,
};
if (!kobj->state_in_sysfs)
return -EINVAL;
if (grp->name) {
grp_kn = kernfs_find_and_get(kobj->sd, grp->name);
} else {
kernfs_get(kobj->sd);
grp_kn = kobj->sd;
}
if (!grp_kn)
return -ENOENT;
error = kernfs_setattr(grp_kn, &newattrs);
if (!error)
error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs);
kernfs_put(grp_kn);
return error;
}
EXPORT_SYMBOL_GPL(sysfs_group_change_owner);
/**
* sysfs_groups_change_owner - change owner of a set of attribute groups.
* @kobj: The kobject containing the groups.
* @groups: The attribute groups.
* @kuid: new owner's kuid
* @kgid: new owner's kgid
*
* Returns 0 on success or error code on failure.
*/
int sysfs_groups_change_owner(struct kobject *kobj,
const struct attribute_group **groups,
kuid_t kuid, kgid_t kgid)
{
int error = 0, i;
if (!kobj->state_in_sysfs)
return -EINVAL;
if (!groups)
return 0;
for (i = 0; groups[i]; i++) {
error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid);
if (error)
break;
}
return error;
}
EXPORT_SYMBOL_GPL(sysfs_groups_change_owner);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CLEANCACHE_H
#define _LINUX_CLEANCACHE_H
#include <linux/fs.h>
#include <linux/exportfs.h>
#include <linux/mm.h>
#define CLEANCACHE_NO_POOL -1
#define CLEANCACHE_NO_BACKEND -2
#define CLEANCACHE_NO_BACKEND_SHARED -3
#define CLEANCACHE_KEY_MAX 6
/*
* cleancache requires every file with a page in cleancache to have a
* unique key unless/until the file is removed/truncated. For some
* filesystems, the inode number is unique, but for "modern" filesystems
* an exportable filehandle is required (see exportfs.h)
*/
struct cleancache_filekey {
union {
ino_t ino;
__u32 fh[CLEANCACHE_KEY_MAX];
u32 key[CLEANCACHE_KEY_MAX];
} u;
};
struct cleancache_ops {
int (*init_fs)(size_t);
int (*init_shared_fs)(uuid_t *uuid, size_t);
int (*get_page)(int, struct cleancache_filekey,
pgoff_t, struct page *);
void (*put_page)(int, struct cleancache_filekey,
pgoff_t, struct page *);
void (*invalidate_page)(int, struct cleancache_filekey, pgoff_t);
void (*invalidate_inode)(int, struct cleancache_filekey);
void (*invalidate_fs)(int);
};
extern int cleancache_register_ops(const struct cleancache_ops *ops);
extern void __cleancache_init_fs(struct super_block *);
extern void __cleancache_init_shared_fs(struct super_block *);
extern int __cleancache_get_page(struct page *);
extern void __cleancache_put_page(struct page *);
extern void __cleancache_invalidate_page(struct address_space *, struct page *);
extern void __cleancache_invalidate_inode(struct address_space *);
extern void __cleancache_invalidate_fs(struct super_block *);
#ifdef CONFIG_CLEANCACHE
#define cleancache_enabled (1)
static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping)
{
return mapping->host->i_sb->cleancache_poolid >= 0;
}
static inline bool cleancache_fs_enabled(struct page *page)
{
return cleancache_fs_enabled_mapping(page->mapping);
}
#else
#define cleancache_enabled (0)
#define cleancache_fs_enabled(_page) (0)
#define cleancache_fs_enabled_mapping(_page) (0)
#endif
/*
* The shim layer provided by these inline functions allows the compiler
* to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE
* is disabled, to a single global variable check if CONFIG_CLEANCACHE
* is enabled but no cleancache "backend" has dynamically enabled it,
* and, for the most frequent cleancache ops, to a single global variable
* check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled
* and a cleancache backend has dynamically enabled cleancache, but the
* filesystem referenced by that cleancache op has not enabled cleancache.
* As a result, CONFIG_CLEANCACHE can be enabled by default with essentially
* no measurable performance impact.
*/
static inline void cleancache_init_fs(struct super_block *sb)
{
if (cleancache_enabled)
__cleancache_init_fs(sb);
}
static inline void cleancache_init_shared_fs(struct super_block *sb)
{
if (cleancache_enabled)
__cleancache_init_shared_fs(sb);
}
static inline int cleancache_get_page(struct page *page)
{
if (cleancache_enabled && cleancache_fs_enabled(page))
return __cleancache_get_page(page);
return -1;
}
static inline void cleancache_put_page(struct page *page)
{
if (cleancache_enabled && cleancache_fs_enabled(page))
__cleancache_put_page(page);
}
static inline void cleancache_invalidate_page(struct address_space *mapping,
struct page *page)
{
/* careful... page->mapping is NULL sometimes when this is called */
if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
__cleancache_invalidate_page(mapping, page);
}
static inline void cleancache_invalidate_inode(struct address_space *mapping)
{
if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping))
__cleancache_invalidate_inode(mapping);
}
static inline void cleancache_invalidate_fs(struct super_block *sb)
{
if (cleancache_enabled)
__cleancache_invalidate_fs(sb);
}
#endif /* _LINUX_CLEANCACHE_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
struct btrfs_iget_args {
u64 ino;
struct btrfs_root *root;
};
struct btrfs_dio_data {
ssize_t submitted;
struct extent_changeset *data_reserved;
};
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
struct kmem_cache *btrfs_free_space_bitmap_cachep;
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type);
static void __endio_write_update_ordered(struct btrfs_inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate);
/*
* btrfs_inode_lock - lock inode i_rwsem based on arguments passed
*
* ilock_flags can have the following bit set:
*
* BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
* BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
* return -EAGAIN
* BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
*/
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
{
if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) {
if (!inode_trylock_shared(inode))
return -EAGAIN;
else
return 0;
}
inode_lock_shared(inode);
} else {
if (ilock_flags & BTRFS_ILOCK_TRY) {
if (!inode_trylock(inode))
return -EAGAIN;
else
return 0;
}
inode_lock(inode);
}
if (ilock_flags & BTRFS_ILOCK_MMAP) down_write(&BTRFS_I(inode)->i_mmap_lock); return 0;
}
/*
* btrfs_inode_unlock - unock inode i_rwsem
*
* ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
* to decide whether the lock acquired is shared or exclusive.
*/
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
{
if (ilock_flags & BTRFS_ILOCK_MMAP) up_write(&BTRFS_I(inode)->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED)
inode_unlock_shared(inode);
else
inode_unlock(inode);
}
/*
* Cleanup all submitted ordered extents in specified range to handle errors
* from the btrfs_run_delalloc_range() callback.
*
* NOTE: caller must ensure that when an error happens, it can not call
* extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
* and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
* to be released, which we want to happen only when finishing the ordered
* extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
struct page *locked_page,
u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
u64 page_start = page_offset(locked_page); u64 page_end = page_start + PAGE_SIZE - 1;
struct page *page;
while (index <= end_index) {
/*
* For locked page, we will call end_extent_writepage() on it
* in run_delalloc_range() for the error handling. That
* end_extent_writepage() function will call
* btrfs_mark_ordered_io_finished() to clear page Ordered and
* run the ordered extent accounting.
*
* Here we can't just clear the Ordered bit, or
* btrfs_mark_ordered_io_finished() would skip the accounting
* for the page range, and the ordered extent will never finish.
*/
if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
index++;
continue;
}
page = find_get_page(inode->vfs_inode.i_mapping, index);
index++;
if (!page)
continue;
/*
* Here we just clear all Ordered bits for every page in the
* range, then __endio_write_update_ordered() will handle
* the ordered extent accounting for the range.
*/
btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
offset, bytes);
put_page(page);
}
/* The locked page covers the full range, nothing needs to be done */
if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
return;
/*
* In case this page belongs to the delalloc range being instantiated
* then skip it, since the first page of a range is going to be
* properly cleaned up by the caller of run_delalloc_range
*/
if (page_start >= offset && page_end <= (offset + bytes - 1)) {
bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
offset = page_offset(locked_page) + PAGE_SIZE;
}
return __endio_write_update_ordered(inode, offset, bytes, false);
}
static int btrfs_dirty_inode(struct inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr)
{
int err;
err = btrfs_init_acl(trans, inode, dir);
if (!err)
err = btrfs_xattr_security_init(trans, inode, dir, qstr); return err;
}
/*
* this does all the hard work for inserting an inline extent into
* the btree. The caller should have done a btrfs_drop_extents so that
* no overlapping inline items exist in the btree
*/
static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path, bool extent_inserted,
struct btrfs_root *root, struct inode *inode,
u64 start, size_t size, size_t compressed_size,
int compress_type,
struct page **compressed_pages)
{
struct extent_buffer *leaf;
struct page *page = NULL;
char *kaddr;
unsigned long ptr;
struct btrfs_file_extent_item *ei;
int ret;
size_t cur_size = size;
unsigned long offset;
ASSERT((compressed_size > 0 && compressed_pages) ||
(compressed_size == 0 && !compressed_pages));
if (compressed_size && compressed_pages)
cur_size = compressed_size;
if (!extent_inserted) {
struct btrfs_key key;
size_t datasize;
key.objectid = btrfs_ino(BTRFS_I(inode));
key.offset = start;
key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
ret = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
if (ret) goto fail;
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, ei, trans->transid);
btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
btrfs_set_file_extent_encryption(leaf, ei, 0);
btrfs_set_file_extent_other_encoding(leaf, ei, 0);
btrfs_set_file_extent_ram_bytes(leaf, ei, size);
ptr = btrfs_file_extent_inline_start(ei);
if (compress_type != BTRFS_COMPRESS_NONE) {
struct page *cpage;
int i = 0;
while (compressed_size > 0) { cpage = compressed_pages[i];
cur_size = min_t(unsigned long, compressed_size,
PAGE_SIZE);
kaddr = kmap_atomic(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
kunmap_atomic(kaddr);
i++;
ptr += cur_size;
compressed_size -= cur_size;
}
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
page = find_get_page(inode->i_mapping,
start >> PAGE_SHIFT);
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_atomic(page);
offset = offset_in_page(start);
write_extent_buffer(leaf, kaddr + offset, ptr, size);
kunmap_atomic(kaddr);
put_page(page);
}
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
/*
* We align size to sectorsize for inline extents just for simplicity
* sake.
*/
size = ALIGN(size, root->fs_info->sectorsize);
ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
if (ret)
goto fail;
/*
* we're an inline extent, so nobody can
* extend the file past i_size without locking
* a page we already have locked.
*
* We must do any isize and inode updates
* before we unlock the pages. Otherwise we
* could end up racing with unlink.
*/
BTRFS_I(inode)->disk_i_size = inode->i_size;
fail:
return ret;
}
/*
* conditionally insert an inline extent into the file. This
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
u64 end, size_t compressed_size,
int compress_type,
struct page **compressed_pages)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
u64 isize = i_size_read(&inode->vfs_inode);
u64 actual_end = min(end + 1, isize);
u64 inline_len = actual_end - start;
u64 aligned_end = ALIGN(end, fs_info->sectorsize);
u64 data_len = inline_len;
int ret;
struct btrfs_path *path;
if (compressed_size)
data_len = compressed_size;
if (start > 0 ||
actual_end > fs_info->sectorsize ||
data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
(!compressed_size &&
(actual_end & (fs_info->sectorsize - 1)) == 0) || end + 1 < isize || data_len > fs_info->max_inline) {
return 1;
}
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
trans->block_rsv = &inode->block_rsv;
drop_args.path = path;
drop_args.start = start;
drop_args.end = aligned_end;
drop_args.drop_cache = true;
drop_args.replace_extent = true;
if (compressed_size && compressed_pages) drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
compressed_size);
else
drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
inline_len);
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
if (isize > actual_end)
inline_len = min_t(u64, isize, actual_end); ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
root, &inode->vfs_inode, start,
inline_len, compressed_size,
compress_type, compressed_pages);
if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret);
goto out;
} else if (ret == -ENOSPC) {
ret = 1;
goto out;
}
btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
ret = btrfs_update_inode(trans, root, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret);
goto out;
} else if (ret == -ENOSPC) {
ret = 1;
goto out;
}
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
out:
/*
* Don't forget to free the reserved space, as for inlined extent
* it won't count as data extent, free them directly here.
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
btrfs_free_path(path);
btrfs_end_transaction(trans);
return ret;
}
struct async_extent {
u64 start;
u64 ram_size;
u64 compressed_size;
struct page **pages;
unsigned long nr_pages;
int compress_type;
struct list_head list;
};
struct async_chunk {
struct inode *inode;
struct page *locked_page;
u64 start;
u64 end;
unsigned int write_flags;
struct list_head extents;
struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
atomic_t *pending;
};
struct async_cow {
/* Number of chunks in flight; must be first in the structure */
atomic_t num_chunks;
struct async_chunk chunks[];
};
static noinline int add_async_extent(struct async_chunk *cow,
u64 start, u64 ram_size,
u64 compressed_size,
struct page **pages,
unsigned long nr_pages,
int compress_type)
{
struct async_extent *async_extent;
async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
BUG_ON(!async_extent); /* -ENOMEM */
async_extent->start = start;
async_extent->ram_size = ram_size;
async_extent->compressed_size = compressed_size;
async_extent->pages = pages;
async_extent->nr_pages = nr_pages;
async_extent->compress_type = compress_type;
list_add_tail(&async_extent->list, &cow->extents);
return 0;
}
/*
* Check if the inode has flags compatible with compression
*/
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
/* Subpage doesn't support compression yet */
if (inode->root->fs_info->sectorsize < PAGE_SIZE)
return false;
if (inode->flags & BTRFS_INODE_NODATACOW ||
inode->flags & BTRFS_INODE_NODATASUM)
return false;
return true;
}
/*
* Check if the inode needs to be submitted to compression, based on mount
* options, defragmentation, properties or heuristics.
*/
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
u64 end)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (!inode_can_compress(inode)) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
btrfs_ino(inode));
return 0;
}
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
/* defrag ioctl */
if (inode->defrag_compress)
return 1;
/* bad compression ratios */
if (inode->flags & BTRFS_INODE_NOCOMPRESS)
return 0;
if (btrfs_test_opt(fs_info, COMPRESS) || inode->flags & BTRFS_INODE_COMPRESS || inode->prop_compress) return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
return 0;
}
static inline void inode_should_defrag(struct btrfs_inode *inode,
u64 start, u64 end, u64 num_bytes, u64 small_write)
{
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) btrfs_add_inode_defrag(NULL, inode);
}
/*
* we create compressed extents in two phases. The first
* phase compresses a range of pages that have already been
* locked (both pages and state bits are locked).
*
* This is done inside an ordered work queue, and the compression
* is spread across many cpus. The actual IO submission is step
* two, and the ordered work queue takes care of making sure that
* happens in the same order things were put onto the queue by
* writepages and friends.
*
* If this code finds it can't get good compression, it puts an
* entry onto the work queue to write the uncompressed bytes. This
* makes sure that both compressed inodes and uncompressed inodes
* are written in the same order that the flusher thread sent them
* down.
*/
static noinline int compress_file_range(struct async_chunk *async_chunk)
{
struct inode *inode = async_chunk->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 blocksize = fs_info->sectorsize;
u64 start = async_chunk->start;
u64 end = async_chunk->end;
u64 actual_end;
u64 i_size;
int ret = 0;
struct page **pages = NULL;
unsigned long nr_pages;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
int i;
int will_compress;
int compress_type = fs_info->compress_type;
int compressed_extents = 0;
int redirty = 0;
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
SZ_16K);
/*
* We need to save i_size before now because it could change in between
* us evaluating the size and assigning it. This is because we lock and
* unlock the page in truncate and fallocate, and then modify the i_size
* later on.
*
* The barriers are to emulate READ_ONCE, remove that once i_size_read
* does that for us.
*/
barrier();
i_size = i_size_read(inode);
barrier();
actual_end = min_t(u64, i_size, end + 1);
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
nr_pages = min_t(unsigned long, nr_pages,
BTRFS_MAX_COMPRESSED / PAGE_SIZE);
/*
* we don't want to send crud past the end of i_size through
* compression, that's just a waste of CPU time. So, if the
* end of the file is before the start of our current
* requested range of bytes, we bail out to the uncompressed
* cleanup code that can deal with all of this.
*
* It isn't really the fastest way to fix things, but this is a
* very uncommon corner.
*/
if (actual_end <= start)
goto cleanup_and_bail_uncompressed;
total_compressed = actual_end - start;
/*
* skip compression for a small file range(<=blocksize) that
* isn't an inline extent, since it doesn't save disk space at all.
*/
if (total_compressed <= blocksize &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
goto cleanup_and_bail_uncompressed;
total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED);
total_in = 0;
ret = 0;
/*
* we do compression for mount -o compress and when the
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
if (inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
/* just bail out to the uncompressed code */
nr_pages = 0;
goto cont;
}
if (BTRFS_I(inode)->defrag_compress)
compress_type = BTRFS_I(inode)->defrag_compress;
else if (BTRFS_I(inode)->prop_compress)
compress_type = BTRFS_I(inode)->prop_compress;
/*
* we need to call clear_page_dirty_for_io on each
* page in the range. Otherwise applications with the file
* mmap'd can wander in and change the page contents while
* we are compressing them.
*
* If the compression fails for any reason, we set the pages
* dirty again later on.
*
* Note that the remaining part is redirtied, the start pointer
* has moved, the end is the original one.
*/
if (!redirty) {
extent_range_clear_dirty_for_io(inode, start, end);
redirty = 1;
}
/* Compression level is applied here and only here */
ret = btrfs_compress_pages(
compress_type | (fs_info->compress_level << 4),
inode->i_mapping, start,
pages,
&nr_pages,
&total_in,
&total_compressed);
if (!ret) {
unsigned long offset = offset_in_page(total_compressed);
struct page *page = pages[nr_pages - 1];
/* zero the tail end of the last page, we might be
* sending it down to disk
*/
if (offset)
memzero_page(page, offset, PAGE_SIZE - offset);
will_compress = 1;
}
}
cont:
/*
* Check cow_file_range() for why we don't even try to create inline
* extent for subpage case.
*/
if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
if (ret || total_in < actual_end) {
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
ret = cow_file_range_inline(BTRFS_I(inode), start, end,
0, BTRFS_COMPRESS_NONE,
NULL);
} else {
/* try making a compressed inline extent */
ret = cow_file_range_inline(BTRFS_I(inode), start, end,
total_compressed,
compress_type, pages);
}
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING;
unsigned long page_error_op;
page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
/*
* inline extent creation worked or returned error,
* we don't need to create any more async work items.
* Unlock and free up our temp pages.
*
* We use DO_ACCOUNTING here because we need the
* delalloc_release_metadata to be done _after_ we drop
* our outstanding extent for clearing delalloc for this
* range.
*/
extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
NULL,
clear_flags,
PAGE_UNLOCK |
PAGE_START_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
/*
* Ensure we only free the compressed pages if we have
* them allocated, as we can still reach here with
* inode_need_compress() == false.
*/
if (pages) {
for (i = 0; i < nr_pages; i++) {
WARN_ON(pages[i]->mapping);
put_page(pages[i]);
}
kfree(pages);
}
return 0;
}
}
if (will_compress) {
/*
* we aren't doing an inline extent round the compressed size
* up to a block size boundary so the allocator does sane
* things
*/
total_compressed = ALIGN(total_compressed, blocksize);
/*
* one last check to make sure the compression is really a
* win, compare the page count read with the blocks on disk,
* compression must free at least one sector size
*/
total_in = ALIGN(total_in, PAGE_SIZE);
if (total_compressed + blocksize <= total_in) {
compressed_extents++;
/*
* The async work queues will take care of doing actual
* allocation on disk for these compressed pages, and
* will submit them to the elevator.
*/
add_async_extent(async_chunk, start, total_in,
total_compressed, pages, nr_pages,
compress_type);
if (start + total_in < end) {
start += total_in;
pages = NULL;
cond_resched();
goto again;
}
return compressed_extents;
}
}
if (pages) {
/*
* the compression code ran but failed to make things smaller,
* free any pages it allocated and our page pointer array
*/
for (i = 0; i < nr_pages; i++) {
WARN_ON(pages[i]->mapping);
put_page(pages[i]);
}
kfree(pages);
pages = NULL;
total_compressed = 0;
nr_pages = 0;
/* flag the file so we don't compress in the future */
if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
!(BTRFS_I(inode)->prop_compress)) {
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
}
}
cleanup_and_bail_uncompressed:
/*
* No compression, but we still need to write the pages in the file
* we've been given so far. redirty the locked page if it corresponds
* to our extent and set things up for the async work queue to run
* cow_file_range to do the normal delalloc dance.
*/
if (async_chunk->locked_page &&
(page_offset(async_chunk->locked_page) >= start &&
page_offset(async_chunk->locked_page)) <= end) {
__set_page_dirty_nobuffers(async_chunk->locked_page);
/* unlocked later on in the async handlers */
}
if (redirty)
extent_range_redirty_for_io(inode, start, end);
add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
BTRFS_COMPRESS_NONE);
compressed_extents++;
return compressed_extents;
}
static void free_async_extent_pages(struct async_extent *async_extent)
{
int i;
if (!async_extent->pages)
return;
for (i = 0; i < async_extent->nr_pages; i++) {
WARN_ON(async_extent->pages[i]->mapping);
put_page(async_extent->pages[i]);
}
kfree(async_extent->pages);
async_extent->nr_pages = 0;
async_extent->pages = NULL;
}
/*
* phase two of compressed writeback. This is the ordered portion
* of the code, which only gets called in the order the work was
* queued. We walk all the async extents created by compress_file_range
* and send them down to the disk.
*/
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct async_extent *async_extent;
u64 alloc_hint = 0;
struct btrfs_key ins;
struct extent_map *em;
struct btrfs_root *root = inode->root;
struct extent_io_tree *io_tree = &inode->io_tree;
int ret = 0;
again:
while (!list_empty(&async_chunk->extents)) {
async_extent = list_entry(async_chunk->extents.next,
struct async_extent, list);
list_del(&async_extent->list);
retry:
lock_extent(io_tree, async_extent->start,
async_extent->start + async_extent->ram_size - 1);
/* did the compression code fall back to uncompressed IO? */
if (!async_extent->pages) {
int page_started = 0;
unsigned long nr_written = 0;
/* allocate blocks */
ret = cow_file_range(inode, async_chunk->locked_page,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
&page_started, &nr_written, 0);
/* JDM XXX */
/*
* if page_started, cow_file_range inserted an
* inline extent and took care of all the unlocking
* and IO for us. Otherwise, we need to submit
* all those pages down to the drive.
*/
if (!page_started && !ret)
extent_write_locked_range(&inode->vfs_inode,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
WB_SYNC_ALL);
else if (ret && async_chunk->locked_page)
unlock_page(async_chunk->locked_page);
kfree(async_extent);
cond_resched();
continue;
}
ret = btrfs_reserve_extent(root, async_extent->ram_size,
async_extent->compressed_size,
async_extent->compressed_size,
0, alloc_hint, &ins, 1, 1);
if (ret) {
free_async_extent_pages(async_extent);
if (ret == -ENOSPC) {
unlock_extent(io_tree, async_extent->start,
async_extent->start +
async_extent->ram_size - 1);
/*
* we need to redirty the pages if we decide to
* fallback to uncompressed IO, otherwise we
* will not submit these pages down to lower
* layers.
*/
extent_range_redirty_for_io(&inode->vfs_inode,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1);
goto retry;
}
goto out_free;
}
/*
* here we're doing allocation and writeback of the
* compressed pages
*/
em = create_io_em(inode, async_extent->start,
async_extent->ram_size, /* len */
async_extent->start, /* orig_start */
ins.objectid, /* block_start */
ins.offset, /* block_len */
ins.offset, /* orig_block_len */
async_extent->ram_size, /* ram_bytes */
async_extent->compress_type,
BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(em))
/* ret value is not necessary due to void function */
goto out_free_reserve;
free_extent_map(em);
ret = btrfs_add_ordered_extent_compress(inode,
async_extent->start,
ins.objectid,
async_extent->ram_size,
ins.offset,
async_extent->compress_type);
if (ret) {
btrfs_drop_extent_cache(inode, async_extent->start,
async_extent->start +
async_extent->ram_size - 1, 0);
goto out_free_reserve;
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
/*
* clear dirty, set writeback and unlock the pages.
*/
extent_clear_unlock_delalloc(inode, async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
PAGE_UNLOCK | PAGE_START_WRITEBACK);
if (btrfs_submit_compressed_write(inode, async_extent->start,
async_extent->ram_size,
ins.objectid,
ins.offset, async_extent->pages,
async_extent->nr_pages,
async_chunk->write_flags,
async_chunk->blkcg_css)) {
struct page *p = async_extent->pages[0];
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
p->mapping = inode->vfs_inode.i_mapping;
btrfs_writepage_endio_finish_ordered(inode, p, start,
end, false);
p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
PAGE_END_WRITEBACK |
PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
}
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
cond_resched();
}
return;
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
extent_clear_unlock_delalloc(inode, async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK | PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
kfree(async_extent);
goto again;
}
static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
u64 num_bytes)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 alloc_hint = 0;
read_lock(&em_tree->lock);
em = search_extent_mapping(em_tree, start, num_bytes);
if (em) {
/*
* if block start isn't an actual block number then find the
* first block in this inode and use that as a hint. If that
* block is also bogus then just don't worry about it.
*/
if (em->block_start >= EXTENT_MAP_LAST_BYTE) { free_extent_map(em);
em = search_extent_mapping(em_tree, 0, 0);
if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
alloc_hint = em->block_start;
if (em)
free_extent_map(em);
} else {
alloc_hint = em->block_start;
free_extent_map(em);
}
}
read_unlock(&em_tree->lock);
return alloc_hint;
}
/*
* when extent_io.c finds a delayed allocation range in the file,
* the call backs end up in this code. The basic idea is to
* allocate extents on disk for the range, and create ordered data structs
* in ram to track those extents.
*
* locked_page is the page that writepage had locked already. We use
* it to make sure we don't do extra locks or unlocks.
*
* *page_started is set to one if we unlock locked_page and do everything
* required to start IO on it. It may be clean and already done with
* IO when we return.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, int unlock)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 alloc_hint = 0;
u64 num_bytes;
unsigned long ram_size;
u64 cur_alloc_size = 0;
u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
struct extent_map *em;
unsigned clear_bits;
unsigned long page_ops;
bool extent_reserved = false;
int ret = 0;
if (btrfs_is_free_space_inode(inode)) {
ret = -EINVAL;
goto out_unlock;
}
num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize, num_bytes);
ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
/*
* Due to the page size limit, for subpage we can only trigger the
* writeback for the dirty sectors of page, that means data writeback
* is doing more writeback than what we want.
*
* This is especially unexpected for some call sites like fallocate,
* where we only increase i_size after everything is done.
* This means we can trigger inline extent even if we didn't want to.
* So here we skip inline extent creation completely.
*/
if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(inode, start, end, 0,
BTRFS_COMPRESS_NONE, NULL);
if (ret == 0) {
/*
* We use DO_ACCOUNTING here because we need the
* delalloc_release_metadata to be run _after_ we drop
* our outstanding extent for clearing delalloc for this
* range.
*/
extent_clear_unlock_delalloc(inode, start, end,
locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
*nr_written = *nr_written +
(end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
/*
* locked_page is locked by the caller of
* writepage_delalloc(), not locked by
* __process_pages_contig().
*
* We can't let __process_pages_contig() to unlock it,
* as it doesn't have any subpage::writers recorded.
*
* Here we manually unlock the page, since the caller
* can't use page_started to determine if it's an
* inline extent or a compressed extent.
*/
unlock_page(locked_page);
goto out;
} else if (ret < 0) {
goto out_unlock;
}
}
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
/*
* Relocation relies on the relocated extents to have exactly the same
* size as the original extents. Normally writeback for relocation data
* extents follows a NOCOW path because relocation preallocates the
* extents. However, due to an operation such as scrub turning a block
* group to RO mode, it may fallback to COW mode, so we must make sure
* an extent allocated during COW has exactly the requested size and can
* not be split into smaller extents, otherwise relocation breaks and
* fails during the stage where it updates the bytenr of file extent
* items.
*/
if (btrfs_is_data_reloc_root(root))
min_alloc_size = num_bytes;
else
min_alloc_size = fs_info->sectorsize; while (num_bytes > 0) {
cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
cur_alloc_size = ins.offset;
extent_reserved = true;
ram_size = ins.offset;
em = create_io_em(inode, start, ins.offset, /* len */
start, /* orig_start */
ins.objectid, /* block_start */
ins.offset, /* block_len */
ins.offset, /* orig_block_len */
ram_size, /* ram_bytes */
BTRFS_COMPRESS_NONE, /* compress_type */
BTRFS_ORDERED_REGULAR /* type */);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_reserve;
}
free_extent_map(em);
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
ram_size, cur_alloc_size,
BTRFS_ORDERED_REGULAR);
if (ret)
goto out_drop_extent_cache;
if (btrfs_is_data_reloc_root(root)) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
/*
* Only drop cache here, and process as normal.
*
* We must not allow extent_clear_unlock_delalloc()
* at out_unlock label to free meta of this ordered
* extent, as its meta should be freed by
* btrfs_finish_ordered_io().
*
* So we must continue until @start is increased to
* skip current ordered extent.
*/
if (ret)
btrfs_drop_extent_cache(inode, start,
start + ram_size - 1, 0);
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
/*
* We're not doing compressed IO, don't unlock the first page
* (which the caller expects to stay locked), don't clear any
* dirty bits and don't set any writeback bits
*
* Do set the Ordered (Private2) bit so we know this page was
* properly setup for writepage.
*/
page_ops = unlock ? PAGE_UNLOCK : 0;
page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
if (num_bytes < cur_alloc_size)
num_bytes = 0;
else
num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
extent_reserved = false;
/*
* btrfs_reloc_clone_csums() error, since start is increased
* extent_clear_unlock_delalloc() at out_unlock label won't
* free metadata of current ordered extent, we're OK to exit.
*/
if (ret)
goto out_unlock;
}
out:
return ret;
out_drop_extent_cache:
btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
/*
* If we reserved an extent for our delalloc range (or a subrange) and
* failed to create the respective ordered extent, then it means that
* when we reserved the extent we decremented the extent's size from
* the data space_info's bytes_may_use counter and incremented the
* space_info's bytes_reserved counter by the same amount. We must make
* sure extent_clear_unlock_delalloc() does not try to decrement again
* the data space_info's bytes_may_use counter, therefore we do not pass
* it the flag EXTENT_CLEAR_DATA_RESV.
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
locked_page,
clear_bits,
page_ops);
start += cur_alloc_size;
if (start >= end)
goto out;
}
extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits | EXTENT_CLEAR_DATA_RESV,
page_ops);
goto out;
}
/*
* work queue call back to started compression on a file and pages
*/
static noinline void async_cow_start(struct btrfs_work *work)
{
struct async_chunk *async_chunk;
int compressed_extents;
async_chunk = container_of(work, struct async_chunk, work);
compressed_extents = compress_file_range(async_chunk);
if (compressed_extents == 0) {
btrfs_add_delayed_iput(async_chunk->inode);
async_chunk->inode = NULL;
}
}
/*
* work queue call back to submit previously compressed pages
*/
static noinline void async_cow_submit(struct btrfs_work *work)
{
struct async_chunk *async_chunk = container_of(work, struct async_chunk,
work);
struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
unsigned long nr_pages;
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
* always adjust ->async_delalloc_pages as its paired with the init
* happening in cow_file_range_async
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
/* atomic_sub_return implies a barrier */
if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
5 * SZ_1M)
cond_wake_up_nomb(&fs_info->async_submit_wait);
}
static noinline void async_cow_free(struct btrfs_work *work)
{
struct async_chunk *async_chunk;
async_chunk = container_of(work, struct async_chunk, work);
if (async_chunk->inode)
btrfs_add_delayed_iput(async_chunk->inode);
if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
/*
* Since the pointer to 'pending' is at the beginning of the array of
* async_chunk's, freeing it ensures the whole array has been freed.
*/
if (atomic_dec_and_test(async_chunk->pending))
kvfree(async_chunk->pending);
}
static int cow_file_range_async(struct btrfs_inode *inode,
struct writeback_control *wbc,
struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
struct async_cow *ctx;
struct async_chunk *async_chunk;
unsigned long nr_pages;
u64 cur_end;
u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
int i;
bool should_compress;
unsigned nofs_flag;
const unsigned int write_flags = wbc_to_write_flags(wbc);
unlock_extent(&inode->io_tree, start, end);
if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
!btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
num_chunks = 1;
should_compress = false;
} else {
should_compress = true;
}
nofs_flag = memalloc_nofs_save();
ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
if (!ctx) {
unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING;
unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK | PAGE_SET_ERROR;
extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits, page_ops);
return -ENOMEM;
}
async_chunk = ctx->chunks;
atomic_set(&ctx->num_chunks, num_chunks);
for (i = 0; i < num_chunks; i++) { if (should_compress) cur_end = min(end, start + SZ_512K - 1);
else
cur_end = end;
/*
* igrab is called higher up in the call chain, take only the
* lightweight reference for the callback lifetime
*/
ihold(&inode->vfs_inode);
async_chunk[i].pending = &ctx->num_chunks;
async_chunk[i].inode = &inode->vfs_inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
async_chunk[i].write_flags = write_flags;
INIT_LIST_HEAD(&async_chunk[i].extents);
/*
* The locked_page comes all the way from writepage and its
* the original page we were actually given. As we spread
* this large delalloc region across multiple async_chunk
* structs, only the first struct needs a pointer to locked_page
*
* This way we don't need racey decisions about who is supposed
* to unlock it.
*/
if (locked_page) {
/*
* Depending on the compressibility, the pages might or
* might not go through async. We want all of them to
* be accounted against wbc once. Let's do it here
* before the paths diverge. wbc accounting is used
* only for foreign writeback detection and doesn't
* need full accuracy. Just account the whole thing
* against the first page.
*/
wbc_account_cgroup_owner(wbc, locked_page,
cur_end - start);
async_chunk[i].locked_page = locked_page;
locked_page = NULL;
} else {
async_chunk[i].locked_page = NULL;
}
if (blkcg_css != blkcg_root_css) {
css_get(blkcg_css);
async_chunk[i].blkcg_css = blkcg_css;
} else {
async_chunk[i].blkcg_css = NULL;
}
btrfs_init_work(&async_chunk[i].work, async_cow_start,
async_cow_submit, async_cow_free);
nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
atomic_add(nr_pages, &fs_info->async_delalloc_pages);
btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
*nr_written += nr_pages;
start = cur_end + 1;
}
*page_started = 1;
return 0;
}
static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
struct page *locked_page, u64 start,
u64 end, int *page_started,
unsigned long *nr_written)
{
int ret;
ret = cow_file_range(inode, locked_page, start, end, page_started,
nr_written, 0);
if (ret)
return ret;
if (*page_started)
return 0;
__set_page_dirty_nobuffers(locked_page);
account_page_redirty(locked_page);
extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
*page_started = 1;
return 0;
}
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes)
{
int ret;
struct btrfs_ordered_sum *sums;
LIST_HEAD(list);
ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
bytenr + num_bytes - 1, &list, 0);
if (ret == 0 && list_empty(&list))
return 0;
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
list_del(&sums->list);
kfree(sums);
}
if (ret < 0)
return ret;
return 1;
}
static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
const u64 start, const u64 end,
int *page_started, unsigned long *nr_written)
{
const bool is_space_ino = btrfs_is_free_space_inode(inode);
const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &inode->io_tree;
u64 range_start = start;
u64 count;
/*
* If EXTENT_NORESERVE is set it means that when the buffered write was
* made we had not enough available data space and therefore we did not
* reserve data space for it, since we though we could do NOCOW for the
* respective file range (either there is prealloc extent or the inode
* has the NOCOW bit set).
*
* However when we need to fallback to COW mode (because for example the
* block group for the corresponding extent was turned to RO mode by a
* scrub or relocation) we need to do the following:
*
* 1) We increment the bytes_may_use counter of the data space info.
* If COW succeeds, it allocates a new data extent and after doing
* that it decrements the space info's bytes_may_use counter and
* increments its bytes_reserved counter by the same amount (we do
* this at btrfs_add_reserved_bytes()). So we need to increment the
* bytes_may_use counter to compensate (when space is reserved at
* buffered write time, the bytes_may_use counter is incremented);
*
* 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
* that if the COW path fails for any reason, it decrements (through
* extent_clear_unlock_delalloc()) the bytes_may_use counter of the
* data space info, which we incremented in the step above.
*
* If we need to fallback to cow and the inode corresponds to a free
* space cache inode or an inode of the data relocation tree, we must
* also increment bytes_may_use of the data space_info for the same
* reason. Space caches and relocated data extents always get a prealloc
* extent for them, however scrub or balance may have set the block
* group that contains that extent to RO mode and therefore force COW
* when starting writeback.
*/
count = count_range_bits(io_tree, &range_start, end, range_bytes,
EXTENT_NORESERVE, 0);
if (count > 0 || is_space_ino || is_reloc_ino) {
u64 bytes = count;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_space_info *sinfo = fs_info->data_sinfo;
if (is_space_ino || is_reloc_ino)
bytes = range_bytes;
spin_lock(&sinfo->lock);
btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
spin_unlock(&sinfo->lock);
if (count > 0)
clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
0, 0, NULL);
}
return cow_file_range(inode, locked_page, start, end, page_started,
nr_written, 1);
}
/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
*
* If no cow copies or snapshots exist, we write directly to the existing
* blocks on disk
*/
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct page *locked_page,
const u64 start, const u64 end,
int *page_started,
unsigned long *nr_written)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
struct btrfs_path *path;
u64 cow_start = (u64)-1;
u64 cur_offset = start;
int ret;
bool check_prev = true;
const bool freespace_inode = btrfs_is_free_space_inode(inode);
u64 ino = btrfs_ino(inode);
bool nocow = false;
u64 disk_bytenr = 0;
const bool force = inode->flags & BTRFS_INODE_NODATACOW;
path = btrfs_alloc_path();
if (!path) {
extent_clear_unlock_delalloc(inode, start, end, locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
return -ENOMEM;
}
while (1) {
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
u64 extent_end;
u64 extent_offset;
u64 num_bytes = 0;
u64 disk_num_bytes;
u64 ram_bytes;
int extent_type;
nocow = false;
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0);
if (ret < 0)
goto error;
/*
* If there is no extent for our range when doing the initial
* search, then go back to the previous slot as it will be the
* one containing the search offset
*/
if (ret > 0 && path->slots[0] > 0 && check_prev) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key,
path->slots[0] - 1);
if (found_key.objectid == ino && found_key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--;
}
check_prev = false;
next_slot:
/* Go to next leaf if we have exhausted the current one */
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
if (cow_start != (u64)-1)
cur_offset = cow_start;
goto error;
}
if (ret > 0)
break;
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
/* Didn't find anything for our INO */
if (found_key.objectid > ino)
break;
/*
* Keep searching until we find an EXTENT_ITEM or there are no
* more extents for this inode
*/
if (WARN_ON_ONCE(found_key.objectid < ino) || found_key.type < BTRFS_EXTENT_DATA_KEY) { path->slots[0]++;
goto next_slot;
}
/* Found key is not EXTENT_DATA_KEY or starts after req range */
if (found_key.type > BTRFS_EXTENT_DATA_KEY || found_key.offset > end)
break;
/*
* If the found extent starts after requested offset, then
* adjust extent_end to be right before this extent begins
*/
if (found_key.offset > cur_offset) {
extent_end = found_key.offset;
extent_type = 0;
goto out_check;
}
/*
* Found extent which begins before our range and potentially
* intersect it
*/
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
extent_offset = btrfs_file_extent_offset(leaf, fi);
extent_end = found_key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
disk_num_bytes =
btrfs_file_extent_disk_num_bytes(leaf, fi);
/*
* If the extent we got ends before our current offset,
* skip to the next extent.
*/
if (extent_end <= cur_offset) {
path->slots[0]++;
goto next_slot;
}
/* Skip holes */
if (disk_bytenr == 0)
goto out_check;
/* Skip compressed/encrypted/encoded extents */
if (btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi))
goto out_check;
/*
* If extent is created before the last volume's snapshot
* this implies the extent is shared, hence we can't do
* nocow. This is the same check as in
* btrfs_cross_ref_exist but without calling
* btrfs_search_slot.
*/
if (!freespace_inode &&
btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item))
goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
goto out_check;
/*
* The following checks can be expensive, as they need to
* take other locks and do btree or rbtree searches, so
* release the path to avoid blocking other tasks for too
* long.
*/
btrfs_release_path(path);
ret = btrfs_cross_ref_exist(root, ino,
found_key.offset -
extent_offset, disk_bytenr, false);
if (ret) {
/*
* ret could be -EIO if the above fails to read
* metadata.
*/
if (ret < 0) {
if (cow_start != (u64)-1)
cur_offset = cow_start;
goto error;
}
WARN_ON_ONCE(freespace_inode);
goto out_check;
}
disk_bytenr += extent_offset;
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
/*
* If there are pending snapshots for this root, we
* fall into common COW way
*/
if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
goto out_check;
/*
* force cow if csum exists in the range.
* this ensure that csum for a given extent are
* either valid or do not exist.
*/
ret = csum_exist_in_range(fs_info, disk_bytenr,
num_bytes);
if (ret) {
/*
* ret could be -EIO if the above fails to read
* metadata.
*/
if (ret < 0) { if (cow_start != (u64)-1)
cur_offset = cow_start;
goto error;
}
WARN_ON_ONCE(freespace_inode);
goto out_check;
}
/* If the extent's block group is RO, we must COW */
if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
goto out_check;
nocow = true;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + ram_bytes;
extent_end = ALIGN(extent_end, fs_info->sectorsize);
/* Skip extents outside of our requested range */
if (extent_end <= start) {
path->slots[0]++; goto next_slot;
}
} else {
/* If this triggers then we have a memory corruption */
BUG();
}
out_check:
/*
* If nocow is false then record the beginning of the range
* that needs to be COWed
*/
if (!nocow) {
if (cow_start == (u64)-1)
cow_start = cur_offset;
cur_offset = extent_end;
if (cur_offset > end)
break;
if (!path->nodes[0]) continue; path->slots[0]++;
goto next_slot;
}
/*
* COW range from cow_start to found_key.offset - 1. As the key
* will contain the beginning of the first extent that can be
* NOCOW, following one which needs to be COW'ed
*/
if (cow_start != (u64)-1) {
ret = fallback_to_cow(inode, locked_page,
cow_start, found_key.offset - 1,
page_started, nr_written);
if (ret)
goto error;
cow_start = (u64)-1;
}
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 orig_start = found_key.offset - extent_offset;
struct extent_map *em;
em = create_io_em(inode, cur_offset, num_bytes,
orig_start,
disk_bytenr, /* block_start */
num_bytes, /* block_len */
disk_num_bytes, /* orig_block_len */
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto error;
}
free_extent_map(em);
ret = btrfs_add_ordered_extent(inode, cur_offset,
disk_bytenr, num_bytes,
num_bytes,
BTRFS_ORDERED_PREALLOC);
if (ret) {
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + num_bytes - 1,
0);
goto error;
}
} else {
ret = btrfs_add_ordered_extent(inode, cur_offset,
disk_bytenr, num_bytes,
num_bytes,
BTRFS_ORDERED_NOCOW);
if (ret)
goto error;
}
if (nocow)
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
nocow = false;
if (btrfs_is_data_reloc_root(root))
/*
* Error handled later, as we must prevent
* extent_clear_unlock_delalloc() in error handler
* from freeing metadata of created ordered extent.
*/
ret = btrfs_reloc_clone_csums(inode, cur_offset,
num_bytes);
extent_clear_unlock_delalloc(inode, cur_offset,
cur_offset + num_bytes - 1,
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
PAGE_UNLOCK | PAGE_SET_ORDERED);
cur_offset = extent_end;
/*
* btrfs_reloc_clone_csums() error, now we're OK to call error
* handler, as metadata for created ordered extent will only
* be freed by btrfs_finish_ordered_io().
*/
if (ret)
goto error;
if (cur_offset > end)
break;
}
btrfs_release_path(path);
if (cur_offset <= end && cow_start == (u64)-1)
cow_start = cur_offset;
if (cow_start != (u64)-1) {
cur_offset = end;
ret = fallback_to_cow(inode, locked_page, cow_start, end,
page_started, nr_written);
if (ret)
goto error;
}
error:
if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); if (ret && cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end,
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
btrfs_free_path(path); return ret;
}
static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
if (inode->defrag_bytes && test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
0, NULL))
return false;
return true;
}
return false;
}
/*
* Function to process delayed allocation (create CoW) for ranges which are
* being touched for the first time.
*/
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc)
{
int ret;
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
if (should_nocow(inode, start, end)) {
/*
* Normally on a zoned device we're only doing COW writes, but
* in case of relocation on a zoned filesystem we have taken
* precaution, that we're only writing sequentially. It's safe
* to use run_delalloc_nocow() here, like for regular
* preallocated inodes.
*/
ASSERT(!zoned ||
(zoned && btrfs_is_data_reloc_root(inode->root)));
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, nr_written);
} else if (!inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
if (zoned) ret = run_delalloc_zoned(inode, locked_page, start, end,
page_started, nr_written);
else
ret = cow_file_range(inode, locked_page, start, end,
page_started, nr_written, 1);
} else {
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
page_started, nr_written);
}
ASSERT(ret <= 0);
if (ret) btrfs_cleanup_ordered_extents(inode, locked_page, start,
end - start + 1);
return ret;
}
void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split)
{
u64 size;
/* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
return;
size = orig->end - orig->start + 1;
if (size > BTRFS_MAX_EXTENT_SIZE) {
u32 num_extents;
u64 new_size;
/*
* See the explanation in btrfs_merge_delalloc_extent, the same
* applies here, just in reverse.
*/
new_size = orig->end - split + 1;
num_extents = count_max_extents(new_size);
new_size = split - orig->start;
num_extents += count_max_extents(new_size);
if (count_max_extents(size) >= num_extents)
return;
}
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
}
/*
* Handle merged delayed allocation extents so we can keep track of new extents
* that are just merged onto old extents, such as when we are doing sequential
* writes, so we can properly account for the metadata space we'll need.
*/
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
struct extent_state *other)
{
u64 new_size, old_size;
u32 num_extents;
/* not delalloc, ignore it */
if (!(other->state & EXTENT_DELALLOC))
return;
if (new->start > other->start) new_size = new->end - other->start + 1;
else
new_size = other->end - new->start + 1;
/* we're not bigger than the max, unreserve the space and go */
if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
spin_unlock(&BTRFS_I(inode)->lock);
return;
}
/*
* We have to add up either side to figure out how many extents were
* accounted for before we merged into one big extent. If the number of
* extents we accounted for is <= the amount we need for the new range
* then we can return, otherwise drop. Think of it like this
*
* [ 4k][MAX_SIZE]
*
* So we've grown the extent by a MAX_SIZE extent, this would mean we
* need 2 outstanding extents, on one side we have 1 and the other side
* we have 1 so they are == and we can return. But in this case
*
* [MAX_SIZE+4k][MAX_SIZE+4k]
*
* Each range on their own accounts for 2 extents, but merged together
* they are only 3 extents worth of accounting, so we need to drop in
* this case.
*/
old_size = other->end - other->start + 1;
num_extents = count_max_extents(old_size);
old_size = new->end - new->start + 1;
num_extents += count_max_extents(old_size);
if (count_max_extents(new_size) >= num_extents)
return;
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
spin_unlock(&BTRFS_I(inode)->lock);
}
static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
struct inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
spin_lock(&root->delalloc_lock);
if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
&root->delalloc_inodes);
set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&BTRFS_I(inode)->runtime_flags);
root->nr_delalloc_inodes++;
if (root->nr_delalloc_inodes == 1) {
spin_lock(&fs_info->delalloc_root_lock);
BUG_ON(!list_empty(&root->delalloc_root)); list_add_tail(&root->delalloc_root,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
}
}
spin_unlock(&root->delalloc_lock);
}
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = root->fs_info; if (!list_empty(&inode->delalloc_inodes)) {
list_del_init(&inode->delalloc_inodes);
clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&inode->runtime_flags);
root->nr_delalloc_inodes--;
if (!root->nr_delalloc_inodes) {
ASSERT(list_empty(&root->delalloc_inodes));
spin_lock(&fs_info->delalloc_root_lock);
BUG_ON(list_empty(&root->delalloc_root));
list_del_init(&root->delalloc_root);
spin_unlock(&fs_info->delalloc_root_lock);
}
}
}
static void btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode)
{
spin_lock(&root->delalloc_lock);
__btrfs_del_delalloc_inode(root, inode);
spin_unlock(&root->delalloc_lock);
}
/*
* Properly track delayed allocation bytes in the inode and to maintain the
* list of inodes that have pending delalloc work to be done.
*/
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
unsigned *bits)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) WARN_ON(1);
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(len);
bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
spin_unlock(&BTRFS_I(inode)->lock);
/* For sanity tests */
if (btrfs_is_testing(fs_info))
return;
percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->delalloc_bytes += len;
if (*bits & EXTENT_DEFRAG)
BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags))
btrfs_add_delalloc_inodes(root, inode);
spin_unlock(&BTRFS_I(inode)->lock);
}
if (!(state->state & EXTENT_DELALLOC_NEW) && (*bits & EXTENT_DELALLOC_NEW)) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
state->start;
spin_unlock(&BTRFS_I(inode)->lock);
}
}
/*
* Once a range is no longer delalloc this function ensures that proper
* accounting happens.
*/
void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
struct extent_state *state, unsigned *bits)
{
struct btrfs_inode *inode = BTRFS_I(vfs_inode);
struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(len);
if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
spin_lock(&inode->lock);
inode->defrag_bytes -= len;
spin_unlock(&inode->lock);
}
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root;
bool do_list = !btrfs_is_free_space_inode(inode);
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, -num_extents);
spin_unlock(&inode->lock);
/*
* We don't reserve metadata space for space cache inodes so we
* don't need to call delalloc_release_metadata if there is an
* error.
*/
if (*bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len, false);
/* For sanity tests. */
if (btrfs_is_testing(fs_info))
return;
if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && (*bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
fs_info->delalloc_batch);
spin_lock(&inode->lock);
inode->delalloc_bytes -= len;
if (do_list && inode->delalloc_bytes == 0 &&
test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
&inode->runtime_flags))
btrfs_del_delalloc_inode(root, inode);
spin_unlock(&inode->lock);
}
if ((state->state & EXTENT_DELALLOC_NEW) && (*bits & EXTENT_DELALLOC_NEW)) {
spin_lock(&inode->lock);
ASSERT(inode->new_delalloc_bytes >= len);
inode->new_delalloc_bytes -= len;
if (*bits & EXTENT_ADD_INODE_BYTES)
inode_add_bytes(&inode->vfs_inode, len);
spin_unlock(&inode->lock);
}
}
/*
* btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
* in a chunk's stripe. This function ensures that bios do not span a
* stripe/chunk
*
* @page - The page we are about to add to the bio
* @size - size we want to add to the bio
* @bio - bio we want to ensure is smaller than a stripe
* @bio_flags - flags of the bio
*
* return 1 if page cannot be added to the bio
* return 0 if page can be added to the bio
* return error otherwise
*/
int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
unsigned long bio_flags)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 logical = bio->bi_iter.bi_sector << 9;
u32 bio_len = bio->bi_iter.bi_size;
struct extent_map *em;
int ret = 0;
struct btrfs_io_geometry geom;
if (bio_flags & EXTENT_BIO_COMPRESSED)
return 0;
em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
if (ret < 0)
goto out;
if (geom.len < bio_len + size)
ret = 1;
out:
free_extent_map(em);
return ret;
}
/*
* in order to insert checksums into the metadata in large chunks,
* we wait until bio submission time. All the pages in the bio are
* checksummed and sums are attached onto the ordered extent record.
*
* At IO completion time the cums attached on the ordered extent record
* are inserted into the btree
*/
static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
u64 dio_file_offset)
{
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
/*
* Split an extent_map at [start, start + len]
*
* This function is intended to be used only for extract_ordered_extent().
*/
static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
u64 pre, u64 post)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
struct extent_map *split_pre = NULL;
struct extent_map *split_mid = NULL;
struct extent_map *split_post = NULL;
int ret = 0;
unsigned long flags;
/* Sanity check */
if (pre == 0 && post == 0)
return 0;
split_pre = alloc_extent_map();
if (pre)
split_mid = alloc_extent_map();
if (post)
split_post = alloc_extent_map();
if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
ret = -ENOMEM;
goto out;
}
ASSERT(pre + post < len);
lock_extent(&inode->io_tree, start, start + len - 1);
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (!em) {
ret = -EIO;
goto out_unlock;
}
ASSERT(em->len == len);
ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
ASSERT(!list_empty(&em->list));
flags = em->flags;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
split_pre->len = (pre ? pre : em->len - post);
split_pre->orig_start = split_pre->start;
split_pre->block_start = em->block_start;
split_pre->block_len = split_pre->len;
split_pre->orig_block_len = split_pre->block_len;
split_pre->ram_bytes = split_pre->len;
split_pre->flags = flags;
split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;
replace_extent_mapping(em_tree, em, split_pre, 1);
/*
* Now we only have an extent_map at:
* [em->start, em->start + pre] if pre != 0
* [em->start, em->start + em->len - post] if pre == 0
*/
if (pre) {
/* Insert the middle extent_map */
split_mid->start = em->start + pre;
split_mid->len = em->len - pre - post;
split_mid->orig_start = split_mid->start;
split_mid->block_start = em->block_start + pre;
split_mid->block_len = split_mid->len;
split_mid->orig_block_len = split_mid->block_len;
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
add_extent_mapping(em_tree, split_mid, 1);
}
if (post) {
split_post->start = em->start + em->len - post;
split_post->len = post;
split_post->orig_start = split_post->start;
split_post->block_start = em->block_start + em->len - post;
split_post->block_len = split_post->len;
split_post->orig_block_len = split_post->block_len;
split_post->ram_bytes = split_post->len;
split_post->flags = flags;
split_post->compress_type = em->compress_type;
split_post->generation = em->generation;
add_extent_mapping(em_tree, split_post, 1);
}
/* Once for us */
free_extent_map(em);
/* Once for the tree */
free_extent_map(em);
out_unlock:
write_unlock(&em_tree->lock);
unlock_extent(&inode->io_tree, start, start + len - 1);
out:
free_extent_map(split_pre);
free_extent_map(split_mid);
free_extent_map(split_post);
return ret;
}
static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
struct bio *bio, loff_t file_offset)
{
struct btrfs_ordered_extent *ordered;
u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 file_len;
u64 len = bio->bi_iter.bi_size;
u64 end = start + len;
u64 ordered_end;
u64 pre, post;
int ret = 0;
ordered = btrfs_lookup_ordered_extent(inode, file_offset);
if (WARN_ON_ONCE(!ordered))
return BLK_STS_IOERR;
/* No need to split */
if (ordered->disk_num_bytes == len)
goto out;
/* We cannot split once end_bio'd ordered extent */
if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
ret = -EINVAL;
goto out;
}
/* We cannot split a compressed ordered extent */
if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
ret = -EINVAL;
goto out;
}
ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
/* bio must be in one ordered extent */
if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
ret = -EINVAL;
goto out;
}
/* Checksum list should be empty */
if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
ret = -EINVAL;
goto out;
}
file_len = ordered->num_bytes;
pre = start - ordered->disk_bytenr;
post = ordered_end - end;
ret = btrfs_split_ordered_extent(ordered, pre, post);
if (ret)
goto out;
ret = split_zoned_em(inode, file_offset, file_len, pre, post);
out:
btrfs_put_ordered_extent(ordered);
return errno_to_blk_status(ret);
}
/*
* extent_io.c submission hook. This does the right thing for csum calculation
* on write, or reading the csums from the tree before a read.
*
* Rules about async/sync submit,
* a) read: sync submit
*
* b) write without checksum: sync submit
*
* c) write with checksum:
* c-1) if bio is issued by fsync: sync submit
* (sync_writers != 0)
*
* c-2) if root is reloc root: sync submit
* (only in case of buffered IO)
*
* c-3) otherwise: async submit
*/
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
blk_status_t ret = 0;
int skip_sum;
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
!fs_info->csum_root;
if (btrfs_is_free_space_inode(BTRFS_I(inode)))
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
struct page *page = bio_first_bvec_all(bio)->bv_page;
loff_t file_offset = page_offset(page);
ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
if (ret)
goto out;
}
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
if (ret)
goto out;
if (bio_flags & EXTENT_BIO_COMPRESSED) { ret = btrfs_submit_compressed_read(inode, bio,
mirror_num,
bio_flags);
goto out;
} else {
/*
* Lookup bio sums does extra checks around whether we
* need to csum or not, which is why we ignore skip_sum
* here.
*/
ret = btrfs_lookup_bio_sums(inode, bio, NULL);
if (ret)
goto out;
}
goto mapit;
} else if (async && !skip_sum) {
/* csum items have already been cloned */
if (btrfs_is_data_reloc_root(root))
goto mapit;
/* we're doing a write, do the async checksumming */
ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
0, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
if (ret)
goto out;
}
mapit:
ret = btrfs_map_bio(fs_info, bio, mirror_num);
out:
if (ret) { bio->bi_status = ret;
bio_endio(bio);
}
return ret;
}
/*
* given a list of ordered sums record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
*/
static int add_pending_csums(struct btrfs_trans_handle *trans,
struct list_head *list)
{
struct btrfs_ordered_sum *sum;
int ret;
list_for_each_entry(sum, list, list) { trans->adding_csums = true;
ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
trans->adding_csums = false;
if (ret)
return ret;
}
return 0;
}
static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
const u64 start,
const u64 len,
struct extent_state **cached_state)
{
u64 search_start = start;
const u64 end = start + len - 1;
while (search_start < end) { const u64 search_len = end - search_start + 1;
struct extent_map *em;
u64 em_len;
int ret = 0;
em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
if (IS_ERR(em))
return PTR_ERR(em);
if (em->block_start != EXTENT_MAP_HOLE)
goto next;
em_len = em->len;
if (em->start < search_start) em_len -= search_start - em->start; if (em_len > search_len)
em_len = search_len;
ret = set_extent_bit(&inode->io_tree, search_start,
search_start + em_len - 1,
EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
GFP_NOFS, NULL);
next:
search_start = extent_map_end(em);
free_extent_map(em); if (ret)
return ret;
}
return 0;
}
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state)
{
WARN_ON(PAGE_ALIGNED(end)); if (start >= i_size_read(&inode->vfs_inode) && !(inode->flags & BTRFS_INODE_PREALLOC)) {
/*
* There can't be any extents following eof in this case so just
* set the delalloc new bit for the range directly.
*/
extra_bits |= EXTENT_DELALLOC_NEW;
} else {
int ret;
ret = btrfs_find_new_delalloc_bytes(inode, start,
end + 1 - start,
cached_state);
if (ret)
return ret;
}
return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
struct page *page;
struct inode *inode;
struct btrfs_work work;
};
static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
struct btrfs_writepage_fixup *fixup;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
struct page *page;
struct btrfs_inode *inode;
u64 page_start;
u64 page_end;
int ret = 0;
bool free_delalloc_space = true;
fixup = container_of(work, struct btrfs_writepage_fixup, work);
page = fixup->page;
inode = BTRFS_I(fixup->inode);
page_start = page_offset(page);
page_end = page_offset(page) + PAGE_SIZE - 1;
/*
* This is similar to page_mkwrite, we need to reserve the space before
* we take the page lock.
*/
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
PAGE_SIZE);
again:
lock_page(page);
/*
* Before we queued this fixup, we took a reference on the page.
* page->mapping may go NULL, but it shouldn't be moved to a different
* address space.
*/
if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
/*
* Unfortunately this is a little tricky, either
*
* 1) We got here and our page had already been dealt with and
* we reserved our space, thus ret == 0, so we need to just
* drop our space reservation and bail. This can happen the
* first time we come into the fixup worker, or could happen
* while waiting for the ordered extent.
* 2) Our page was already dealt with, but we happened to get an
* ENOSPC above from the btrfs_delalloc_reserve_space. In
* this case we obviously don't have anything to release, but
* because the page was already dealt with we don't want to
* mark the page with an error, so make sure we're resetting
* ret to 0. This is why we have this check _before_ the ret
* check, because we do not want to have a surprise ENOSPC
* when the page was already properly dealt with.
*/
if (!ret) {
btrfs_delalloc_release_extents(inode, PAGE_SIZE);
btrfs_delalloc_release_space(inode, data_reserved,
page_start, PAGE_SIZE,
true);
}
ret = 0;
goto out_page;
}
/*
* We can't mess with the page state unless it is locked, so now that
* it is locked bail if we failed to make our space reservation.
*/
if (ret)
goto out_page;
lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
if (PageOrdered(page))
goto out_reserved;
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
unlock_extent_cached(&inode->io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
&cached_state);
if (ret)
goto out_reserved;
/*
* Everything went as planned, we're now the owner of a dirty page with
* delayed allocation bits set and space reserved for our COW
* destination.
*
* The page was dirty when we started, nothing should have cleaned it.
*/
BUG_ON(!PageDirty(page));
free_delalloc_space = false;
out_reserved:
btrfs_delalloc_release_extents(inode, PAGE_SIZE);
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
unlock_extent_cached(&inode->io_tree, page_start, page_end,
&cached_state);
out_page:
if (ret) {
/*
* We hit ENOSPC or other errors. Update the mapping and page
* to reflect the errors and clean the page.
*/
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
clear_page_dirty_for_io(page);
SetPageError(page);
}
ClearPageChecked(page);
unlock_page(page);
put_page(page);
kfree(fixup);
extent_changeset_free(data_reserved);
/*
* As a precaution, do a delayed iput in case it would be the last iput
* that could need flushing space. Recursing back to fixup worker would
* deadlock.
*/
btrfs_add_delayed_iput(&inode->vfs_inode);
}
/*
* There are a few paths in the higher layers of the kernel that directly
* set the page dirty bit without asking the filesystem if it is a
* good idea. This causes problems because we want to make sure COW
* properly happens and the data=ordered rules are followed.
*
* In our case any range that doesn't have the ORDERED bit set
* hasn't been properly setup for IO. We kick off an async process
* to fix it up. The async helper will wait for ordered extents, set
* the delalloc bit and make it safe to write the page.
*/
int btrfs_writepage_cow_fixup(struct page *page)
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_writepage_fixup *fixup;
/* This page has ordered extent covering it already */
if (PageOrdered(page))
return 0;
/*
* PageChecked is set below when we create a fixup worker for this page,
* don't try to create another one if we're already PageChecked()
*
* The extent_io writepage code will redirty the page if we send back
* EAGAIN.
*/
if (PageChecked(page))
return -EAGAIN;
fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
if (!fixup)
return -EAGAIN;
/*
* We are already holding a reference to this inode from
* write_cache_pages. We need to hold it because the space reservation
* takes place outside of the page lock, and we can't trust
* page->mapping outside of the page lock.
*/
ihold(inode);
SetPageChecked(page);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
fixup->inode = inode;
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
return -EAGAIN;
}
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 file_pos,
struct btrfs_file_extent_item *stack_fi,
const bool update_inode_bytes,
u64 qgroup_reserved)
{
struct btrfs_root *root = inode->root; const u64 sectorsize = root->fs_info->sectorsize;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
struct btrfs_drop_extents_args drop_args = { 0 };
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/*
* we may be replacing one extent in the tree with another.
* The new extent is pinned in the extent map, and we don't want
* to drop it from the cache until it is completely in the btree.
*
* So, tell btrfs_drop_extents to leave this extent in the cache.
* the caller is expected to unpin it and allow it to be merged
* with the others.
*/
drop_args.path = path;
drop_args.start = file_pos;
drop_args.end = file_pos + num_bytes;
drop_args.replace_extent = true;
drop_args.extent_item_size = sizeof(*stack_fi);
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret)
goto out;
if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode);
ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
sizeof(*stack_fi));
if (ret)
goto out;
}
leaf = path->nodes[0];
btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
write_extent_buffer(leaf, stack_fi,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(struct btrfs_file_extent_item));
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
/*
* If we dropped an inline extent here, we know the range where it is
* was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
* number of bytes only for that range containing the inline extent.
* The remaining of the range will be processed when clearning the
* EXTENT_DELALLOC_BIT bit through the ordered extent completion.
*/
if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
inline_size = drop_args.bytes_found - inline_size;
btrfs_update_inode_bytes(inode, sectorsize, inline_size);
drop_args.bytes_found -= inline_size;
num_bytes -= sectorsize;
}
if (update_inode_bytes) btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
if (ret)
goto out;
ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
file_pos, qgroup_reserved, &ins);
out:
btrfs_free_path(path); return ret;
}
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
u64 start, u64 len)
{
struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
spin_lock(&cache->lock);
cache->delalloc_bytes -= len;
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
}
static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_extent *oe)
{
struct btrfs_file_extent_item stack_fi;
u64 logical_len;
bool update_inode_bytes;
memset(&stack_fi, 0, sizeof(stack_fi));
btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
logical_len = oe->truncated_len;
else
logical_len = oe->num_bytes;
btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
/* Encryption and other encoding is reserved and all 0 */
/*
* For delalloc, when completing an ordered extent we update the inode's
* bytes when clearing the range in the inode's io tree, so pass false
* as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
* except if the ordered extent was truncated.
*/
update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
oe->file_offset, &stack_fi,
update_inode_bytes, oe->qgroup_rsv);
}
/*
* As ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
*/
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans = NULL;
struct extent_io_tree *io_tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
u64 start, end;
int compress_type = 0;
int ret = 0;
u64 logical_len = ordered_extent->num_bytes;
bool freespace_inode;
bool truncated = false;
bool clear_reserved_extent = true;
unsigned int clear_bits = EXTENT_DEFRAG;
start = ordered_extent->file_offset;
end = start + ordered_extent->num_bytes - 1;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
clear_bits |= EXTENT_DELALLOC_NEW;
freespace_inode = btrfs_is_free_space_inode(inode);
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO;
goto out;
}
if (ordered_extent->bdev)
btrfs_rewrite_logical_zoned(ordered_extent);
btrfs_free_io_failure_record(inode, start, end);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
logical_len = ordered_extent->truncated_len;
/* Truncated the entire extent, don't bother adding */
if (!logical_len)
goto out;
}
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ btrfs_inode_safe_disk_i_size_write(inode, 0);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
trans->block_rsv = &inode->block_rsv;
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
}
clear_bits |= EXTENT_LOCKED;
lock_extent_bits(io_tree, start, end, &cached_state);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
trans->block_rsv = &inode->block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
BUG_ON(compress_type);
ret = btrfs_mark_extent_written(trans, inode,
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
} else {
BUG_ON(root == fs_info->tree_root);
ret = insert_ordered_extent_file_extent(trans, ordered_extent);
if (!ret) {
clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes);
}
}
unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = add_pending_csums(trans, &ordered_extent->list);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
/*
* If this is a new delalloc range, clear its new delalloc flag to
* update the inode's number of bytes. This needs to be done first
* before updating the inode item.
*/
if ((clear_bits & EXTENT_DELALLOC_NEW) &&
!test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
clear_extent_bit(&inode->io_tree, start, end,
EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
0, 0, &cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = 0;
out:
clear_extent_bit(&inode->io_tree, start, end, clear_bits,
(clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
&cached_state);
if (trans)
btrfs_end_transaction(trans); if (ret || truncated) {
u64 unwritten_start = start;
/*
* If we failed to finish this ordered extent for any reason we
* need to make sure BTRFS_ORDERED_IOERR is set on the ordered
* extent, and mark the inode with the error if it wasn't
* already set. Any error during writeback would have already
* set the mapping error, so we need to set it if we're the ones
* marking this ordered extent as failed.
*/
if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
&ordered_extent->flags))
mapping_set_error(ordered_extent->inode->i_mapping, -EIO); if (truncated) unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
/* Drop the cache for the part of the extent we didn't write. */
btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
/*
* If the ordered extent had an IOERR or something else went
* wrong we need to return the space for this ordered extent
* back to the allocator. We only free the extent in the
* truncated case if we didn't write out the extent at all.
*
* If we made it past insert_reserved_file_extent before we
* errored out then we don't need to do this as the accounting
* has already been done.
*/
if ((ret || !logical_len) &&
clear_reserved_extent &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
/*
* Discard the range before returning it back to the
* free space pool
*/
if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) btrfs_discard_extent(fs_info,
ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes,
NULL);
btrfs_free_reserved_extent(fs_info,
ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes, 1);
}
}
/*
* This needs to be done to make sure anybody waiting knows we are done
* updating everything for this ordered extent.
*/
btrfs_remove_ordered_extent(inode, ordered_extent);
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
/* once for the tree */
btrfs_put_ordered_extent(ordered_extent);
return ret;
}
static void finish_ordered_fn(struct btrfs_work *work)
{
struct btrfs_ordered_extent *ordered_extent;
ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
btrfs_finish_ordered_io(ordered_extent);
}
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate)
{
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
finish_ordered_fn, uptodate);
}
/*
* check_data_csum - verify checksum of one sector of uncompressed data
* @inode: inode
* @io_bio: btrfs_io_bio which contains the csum
* @bio_offset: offset to the beginning of the bio (in bytes)
* @page: page where is the data to be verified
* @pgoff: offset inside the page
* @start: logical offset in the file
*
* The length of such check is always one sector size.
*/
static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
u32 bio_offset, struct page *page, u32 pgoff,
u64 start)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
u32 len = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
unsigned int offset_sectors;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
ASSERT(pgoff + len <= PAGE_SIZE);
offset_sectors = bio_offset >> fs_info->sectorsize_bits;
csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size;
kaddr = kmap_atomic(page);
shash->tfm = fs_info->csum_shash;
crypto_shash_digest(shash, kaddr + pgoff, len, csum);
if (memcmp(csum, csum_expected, csum_size))
goto zeroit;
kunmap_atomic(kaddr);
return 0;
zeroit:
btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
io_bio->mirror_num);
if (io_bio->device)
btrfs_dev_stat_inc_and_print(io_bio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
kunmap_atomic(kaddr);
return -EIO;
}
/*
* When reads are done, we need to check csums to verify the data is correct.
* if there's a match, we allow the bio to finish. If not, the code in
* extent_io.c will try to find good copies for us.
*
* @bio_offset: offset to the beginning of the bio (in bytes)
* @start: file offset of the range start
* @end: file offset of the range end (inclusive)
*
* Return a bitmap where bit set means a csum mismatch, and bit not set means
* csum match.
*/
unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
struct page *page, u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
const u32 sectorsize = root->fs_info->sectorsize;
u32 pg_off;
unsigned int result = 0;
if (PageChecked(page)) {
ClearPageChecked(page);
return 0;
}
/*
* For subpage case, above PageChecked is not safe as it's not subpage
* compatible.
* But for now only cow fixup and compressed read utilize PageChecked
* flag, while in this context we can easily use io_bio->csum to
* determine if we really need to do csum verification.
*
* So for now, just exit if io_bio->csum is NULL, as it means it's
* compressed read, and its compressed data csum has already been
* verified.
*/
if (io_bio->csum == NULL)
return 0;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;
if (!root->fs_info->csum_root)
return 0;
ASSERT(page_offset(page) <= start &&
end <= page_offset(page) + PAGE_SIZE - 1);
for (pg_off = offset_in_page(start);
pg_off < offset_in_page(end);
pg_off += sectorsize, bio_offset += sectorsize) {
u64 file_offset = pg_off + page_offset(page);
int ret;
if (btrfs_is_data_reloc_root(root) &&
test_range_bit(io_tree, file_offset,
file_offset + sectorsize - 1,
EXTENT_NODATASUM, 1, NULL)) {
/* Skip the range without csum for data reloc inode */
clear_extent_bits(io_tree, file_offset,
file_offset + sectorsize - 1,
EXTENT_NODATASUM);
continue;
}
ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
if (ret < 0) {
const int nr_bit = (pg_off - offset_in_page(start)) >>
root->fs_info->sectorsize_bits;
result |= (1U << nr_bit);
}
}
return result;
}
/*
* btrfs_add_delayed_iput - perform a delayed iput on @inode
*
* @inode: The inode we want to perform iput on
*
* This function uses the generic vfs_inode::i_count to track whether we should
* just decrement it (in case it's > 1) or if this is the last iput then link
* the inode to the delayed iput machinery. Delayed iputs are processed at
* transaction commit time/superblock commit/cleaner kthread.
*/
void btrfs_add_delayed_iput(struct inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_inode *binode = BTRFS_I(inode);
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
atomic_inc(&fs_info->nr_delayed_iputs);
spin_lock(&fs_info->delayed_iput_lock);
ASSERT(list_empty(&binode->delayed_iput));
list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
spin_unlock(&fs_info->delayed_iput_lock);
if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
}
static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
struct btrfs_inode *inode)
{
list_del_init(&inode->delayed_iput);
spin_unlock(&fs_info->delayed_iput_lock);
iput(&inode->vfs_inode);
if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
wake_up(&fs_info->delayed_iputs_wait);
spin_lock(&fs_info->delayed_iput_lock);
}
static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
struct btrfs_inode *inode)
{
if (!list_empty(&inode->delayed_iput)) {
spin_lock(&fs_info->delayed_iput_lock);
if (!list_empty(&inode->delayed_iput))
run_delayed_iput_locked(fs_info, inode);
spin_unlock(&fs_info->delayed_iput_lock);
}
}
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
{
spin_lock(&fs_info->delayed_iput_lock);
while (!list_empty(&fs_info->delayed_iputs)) {
struct btrfs_inode *inode;
inode = list_first_entry(&fs_info->delayed_iputs,
struct btrfs_inode, delayed_iput);
run_delayed_iput_locked(fs_info, inode);
cond_resched_lock(&fs_info->delayed_iput_lock);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
/**
* Wait for flushing all delayed iputs
*
* @fs_info: the filesystem
*
* This will wait on any delayed iputs that are currently running with KILLABLE
* set. Once they are all done running we will return, unless we are killed in
* which case we return EINTR. This helps in user operations like fallocate etc
* that might get blocked on the iputs.
*
* Return EINTR if we were killed, 0 if nothing's pending
*/
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
int ret = wait_event_killable(fs_info->delayed_iputs_wait,
atomic_read(&fs_info->nr_delayed_iputs) == 0);
if (ret)
return -EINTR;
return 0;
}
/*
* This creates an orphan entry for the given inode in case something goes wrong
* in the middle of an unlink.
*/
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
int ret;
ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); if (ret && ret != -EEXIST) { btrfs_abort_transaction(trans, ret);
return ret;
}
return 0;
}
/*
* We have done the delete so we can go ahead and remove the orphan item for
* this particular inode.
*/
static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
}
/*
* this cleans up any orphans that may be left on the list from the last use
* of this root.
*/
int btrfs_orphan_cleanup(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key, found_key;
struct btrfs_trans_handle *trans;
struct inode *inode;
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
return 0; path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
path->reada = READA_BACK;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
/*
* if ret == 0 means we found what we were searching for, which
* is weird, but possible, so only screw with path if we didn't
* find the key and see if we have stuff that matches
*/
if (ret > 0) {
ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
}
/* pull out the item */
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
/* make sure the item matches what we want */
if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
break;
if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
/* release the path since we're done with it */
btrfs_release_path(path);
/*
* this is where we are basically btrfs_lookup, without the
* crossing root thing. we store the inode number in the
* offset of the orphan item.
*/
if (found_key.offset == last_objectid) {
btrfs_err(fs_info,
"Error removing orphan entry, stopping orphan cleanup");
ret = -EINVAL;
goto out;
}
last_objectid = found_key.offset;
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
inode = btrfs_iget(fs_info->sb, last_objectid, root);
ret = PTR_ERR_OR_ZERO(inode);
if (ret && ret != -ENOENT)
goto out;
if (ret == -ENOENT && root == fs_info->tree_root) {
struct btrfs_root *dead_root;
int is_dead_root = 0;
/*
* This is an orphan in the tree root. Currently these
* could come from 2 sources:
* a) a root (snapshot/subvolume) deletion in progress
* b) a free space cache inode
* We need to distinguish those two, as the orphan item
* for a root must not get deleted before the deletion
* of the snapshot/subvolume's tree completes.
*
* btrfs_find_orphan_roots() ran before us, which has
* found all deleted roots and loaded them into
* fs_info->fs_roots_radix. So here we can find if an
* orphan item corresponds to a deleted root by looking
* up the root from that radix tree.
*/
spin_lock(&fs_info->fs_roots_radix_lock);
dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
(unsigned long)found_key.objectid);
if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
is_dead_root = 1;
spin_unlock(&fs_info->fs_roots_radix_lock);
if (is_dead_root) {
/* prevent this orphan from being found again */
key.offset = found_key.objectid - 1;
continue;
}
}
/*
* If we have an inode with links, there are a couple of
* possibilities:
*
* 1. We were halfway through creating fsverity metadata for the
* file. In that case, the orphan item represents incomplete
* fsverity metadata which must be cleaned up with
* btrfs_drop_verity_items and deleting the orphan item.
* 2. Old kernels (before v3.12) used to create an
* orphan item for truncate indicating that there were possibly
* extent items past i_size that needed to be deleted. In v3.12,
* truncate was changed to update i_size in sync with the extent
* items, but the (useless) orphan item was still created. Since
* v4.18, we don't create the orphan item for truncate at all.
*
* So, this item could mean that we need to do a truncate, but
* only if this filesystem was last used on a pre-v3.12 kernel
* and was not cleanly unmounted. The odds of that are quite
* slim, and it's a pain to do the truncate now, so just delete
* the orphan item.
*
* It's also possible that this orphan item was supposed to be
* deleted but wasn't. The inode number may have been reused,
* but either way, we can delete the orphan item.
*/
if (ret == -ENOENT || inode->i_nlink) {
if (!ret) {
ret = btrfs_drop_verity_items(BTRFS_I(inode));
iput(inode);
if (ret)
goto out;
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
btrfs_debug(fs_info, "auto deleting %Lu",
found_key.objectid);
ret = btrfs_del_orphan_item(trans, root,
found_key.objectid);
btrfs_end_transaction(trans);
if (ret)
goto out;
continue;
}
nr_unlink++;
/* this will do delete_inode and everything for us */
iput(inode);
}
/* release the path since we're done with it */
btrfs_release_path(path);
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
trans = btrfs_join_transaction(root);
if (!IS_ERR(trans))
btrfs_end_transaction(trans);
}
if (nr_unlink)
btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
out:
if (ret)
btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
btrfs_free_path(path);
return ret;
}
/*
* very simple check to peek ahead in the leaf looking for xattrs. If we
* don't find any xattrs, we know there can't be any acls.
*
* slot is the slot the inode is in, objectid is the objectid of the inode
*/
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
int slot, u64 objectid,
int *first_xattr_slot)
{
u32 nritems = btrfs_header_nritems(leaf);
struct btrfs_key found_key;
static u64 xattr_access = 0;
static u64 xattr_default = 0;
int scanned = 0;
if (!xattr_access) {
xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
strlen(XATTR_NAME_POSIX_ACL_ACCESS));
xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
}
slot++;
*first_xattr_slot = -1;
while (slot < nritems) {
btrfs_item_key_to_cpu(leaf, &found_key, slot);
/* we found a different objectid, there must not be acls */
if (found_key.objectid != objectid)
return 0;
/* we found an xattr, assume we've got an acl */
if (found_key.type == BTRFS_XATTR_ITEM_KEY) { if (*first_xattr_slot == -1) *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default)
return 1;
}
/*
* we found a key greater than an xattr key, there can't
* be any acls later on
*/
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
return 0;
slot++;
scanned++;
/*
* it goes inode, inode backrefs, xattrs, extents,
* so if there are a ton of hard links to an inode there can
* be a lot of backrefs. Don't waste time searching too hard,
* this is just an optimization
*/
if (scanned >= 8)
break;
}
/* we hit the end of the leaf before we found an xattr or
* something larger than an xattr. We have to assume the inode
* has acls
*/
if (*first_xattr_slot == -1) *first_xattr_slot = slot; return 1;
}
/*
* read an inode from the btree into the in-memory inode
*/
static int btrfs_read_locked_inode(struct inode *inode,
struct btrfs_path *in_path)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_path *path = in_path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key location;
unsigned long ptr;
int maybe_acls;
u32 rdev;
int ret;
bool filled = false;
int first_xattr_slot;
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
if (!path) {
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
}
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
if (path != in_path)
btrfs_free_path(path);
return ret;
}
leaf = path->nodes[0];
if (filled)
goto cache_index;
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
inode->i_mode = btrfs_inode_mode(leaf, inode_item);
set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
round_up(i_size_read(inode), fs_info->sectorsize));
inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
BTRFS_I(inode)->i_otime.tv_sec =
btrfs_timespec_sec(leaf, &inode_item->otime);
BTRFS_I(inode)->i_otime.tv_nsec =
btrfs_timespec_nsec(leaf, &inode_item->otime);
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
inode_set_iversion_queried(inode,
btrfs_inode_sequence(leaf, inode_item));
inode->i_generation = BTRFS_I(inode)->generation;
inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
BTRFS_I(inode)->index_cnt = (u64)-1;
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
cache_index:
/*
* If we were modified in the current generation and evicted from memory
* and then re-read we need to do a full sync since we don't have any
* idea about which extents were modified before we were evicted from
* cache.
*
* This is required for both inode re-read from disk and delayed inode
* in delayed_nodes_tree.
*/
if (BTRFS_I(inode)->last_trans == fs_info->generation)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
/*
* We don't persist the id of the transaction where an unlink operation
* against the inode was last made. So here we assume the inode might
* have been evicted, and therefore the exact value of last_unlink_trans
* lost, and set it to last_trans to avoid metadata inconsistencies
* between the inode and its parent if the inode is fsync'ed and the log
* replayed. For example, in the scenario:
*
* touch mydir/foo
* ln mydir/foo mydir/bar
* sync
* unlink mydir/bar
* echo 2 > /proc/sys/vm/drop_caches # evicts inode
* xfs_io -c fsync mydir/foo
* <power failure>
* mount fs, triggers fsync log replay
*
* We must make sure that when we fsync our inode foo we also log its
* parent inode, otherwise after log replay the parent still has the
* dentry with the "bar" name but our inode foo has a link count of 1
* and doesn't have an inode ref with the name "bar" anymore.
*
* Setting last_unlink_trans to last_trans is a pessimistic approach,
* but it guarantees correctness at the expense of occasional full
* transaction commits on fsync if our inode is a directory, or if our
* inode is not a directory, logging its parent unnecessarily.
*/
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
/*
* Same logic as for last_unlink_trans. We don't persist the generation
* of the last transaction where this inode was used for a reflink
* operation, so after eviction and reloading the inode we must be
* pessimistic and assume the last transaction that modified the inode.
*/
BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
path->slots[0]++;
if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf))
goto cache_acl;
btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
if (location.objectid != btrfs_ino(BTRFS_I(inode)))
goto cache_acl;
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
if (location.type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *ref;
ref = (struct btrfs_inode_ref *)ptr;
BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)ptr;
BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
extref);
}
cache_acl:
/*
* try to precache a NULL acl entry for files that don't have
* any xattrs or acls
*/
maybe_acls = acls_after_inode_item(leaf, path->slots[0],
btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
if (first_xattr_slot != -1) {
path->slots[0] = first_xattr_slot;
ret = btrfs_load_inode_props(inode, path);
if (ret)
btrfs_err(fs_info,
"error loading props for ino %llu (root %llu): %d",
btrfs_ino(BTRFS_I(inode)),
root->root_key.objectid, ret);
}
if (path != in_path) btrfs_free_path(path); if (!maybe_acls)
cache_no_acl(inode);
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
break;
case S_IFDIR:
inode->i_fop = &btrfs_dir_file_operations;
inode->i_op = &btrfs_dir_inode_operations;
break;
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_aops;
break;
default:
inode->i_op = &btrfs_special_inode_operations;
init_special_inode(inode, inode->i_mode, rdev);
break;
}
btrfs_sync_inode_flags_to_i_flags(inode);
return 0;
}
/*
* given a leaf and an inode, copy the inode fields into the leaf
*/
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
struct inode *inode)
{
struct btrfs_map_token token;
u64 flags;
btrfs_init_map_token(&token, leaf);
btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
btrfs_set_token_inode_mode(&token, item, inode->i_mode);
btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
btrfs_set_token_timespec_sec(&token, &item->atime,
inode->i_atime.tv_sec);
btrfs_set_token_timespec_nsec(&token, &item->atime,
inode->i_atime.tv_nsec);
btrfs_set_token_timespec_sec(&token, &item->mtime,
inode->i_mtime.tv_sec);
btrfs_set_token_timespec_nsec(&token, &item->mtime,
inode->i_mtime.tv_nsec);
btrfs_set_token_timespec_sec(&token, &item->ctime,
inode->i_ctime.tv_sec);
btrfs_set_token_timespec_nsec(&token, &item->ctime,
inode->i_ctime.tv_nsec);
btrfs_set_token_timespec_sec(&token, &item->otime,
BTRFS_I(inode)->i_otime.tv_sec);
btrfs_set_token_timespec_nsec(&token, &item->otime,
BTRFS_I(inode)->i_otime.tv_nsec);
btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
btrfs_set_token_inode_generation(&token, item,
BTRFS_I(inode)->generation);
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
btrfs_set_token_inode_transid(&token, item, trans->transid);
btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
BTRFS_I(inode)->ro_flags);
btrfs_set_token_inode_flags(&token, item, flags);
btrfs_set_token_inode_block_group(&token, item, 0);
}
/*
* copy everything in the in-memory inode into the btree.
*/
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode)
{
struct btrfs_inode_item *inode_item;
struct btrfs_path *path;
struct extent_buffer *leaf;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
goto failed;
}
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
btrfs_mark_buffer_dirty(leaf);
btrfs_set_inode_last_trans(trans, inode);
ret = 0;
failed:
btrfs_free_path(path); return ret;
}
/*
* copy everything in the in-memory inode into the btree.
*/
noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
/*
* If the inode is a free space inode, we can deadlock during commit
* if we put it into the delayed code.
*
* The data relocation inode should also be directly updated
* without delay
*/
if (!btrfs_is_free_space_inode(inode)
&& !btrfs_is_data_reloc_root(root)
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
ret = btrfs_delayed_update_inode(trans, root, inode);
if (!ret)
btrfs_set_inode_last_trans(trans, inode);
return ret;
}
return btrfs_update_inode_item(trans, root, inode);
}
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode)
{
int ret;
ret = btrfs_update_inode(trans, root, inode); if (ret == -ENOSPC) return btrfs_update_inode_item(trans, root, inode);
return ret;
}
/*
* unlink helper that gets used here in inode.c and in the tree logging
* recovery code. It remove a link in a directory with a given name, and
* also drops the back refs in the inode to the directory
*/
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
const char *name, int name_len)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
int ret = 0;
struct btrfs_dir_item *di;
u64 index;
u64 ino = btrfs_ino(inode);
u64 dir_ino = btrfs_ino(dir);
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto err;
}
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret)
goto err;
btrfs_release_path(path);
/*
* If we don't have dir index, we have to get it by looking up
* the inode ref, since we get the inode ref, remove it directly,
* it is unnecessary to do delayed deletion.
*
* But if we have dir index, needn't search inode ref to get it.
* Since the inode ref is close to the inode item, it is better
* that we delay to delete it, and just do this deletion when
* we update the inode item.
*/
if (inode->dir_index) {
ret = btrfs_delayed_delete_inode_ref(inode);
if (!ret) {
index = inode->dir_index;
goto skip_backref;
}
}
ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
dir_ino, &index);
if (ret) {
btrfs_info(fs_info,
"failed to delete reference to %.*s, inode %llu parent %llu",
name_len, name, ino, dir_ino);
btrfs_abort_transaction(trans, ret);
goto err;
}
skip_backref:
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto err;
}
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
dir_ino);
if (ret != 0 && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto err;
}
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
index);
if (ret == -ENOENT)
ret = 0;
else if (ret)
btrfs_abort_transaction(trans, ret);
/*
* If we have a pending delayed iput we could end up with the final iput
* being run in btrfs-cleaner context. If we have enough of these built
* up we can end up burning a lot of time in btrfs-cleaner without any
* way to throttle the unlinks. Since we're currently holding a ref on
* the inode we can run the delayed iput here without any issues as the
* final iput won't be done until after we drop the ref we're currently
* holding.
*/
btrfs_run_delayed_iput(fs_info, inode);
err:
btrfs_free_path(path);
if (ret)
goto out;
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
}
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len)
{
int ret;
ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
if (!ret) {
drop_nlink(&inode->vfs_inode);
ret = btrfs_update_inode(trans, root, inode);
}
return ret;
}
/*
* helper to start transaction for unlink and rmdir.
*
* unlink and rmdir are special in btrfs, they do not always free space, so
* if we cannot make our reservations the normal way try and see if there is
* plenty of slack room in the global reserve to migrate, otherwise we cannot
* allow the unlink to occur.
*/
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
{
struct btrfs_root *root = BTRFS_I(dir)->root;
/*
* 1 for the possible orphan item
* 1 for the dir item
* 1 for the dir index
* 1 for the inode ref
* 1 for the inode
*/
return btrfs_start_transaction_fallback_global_rsv(root, 5);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
struct inode *inode = d_inode(dentry);
int ret;
trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
return PTR_ERR(trans);
btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
0);
ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (ret)
goto out;
if (inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret)
goto out;
}
out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(root->fs_info);
return ret;
}
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct inode *dir, struct dentry *dentry)
{
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
const char *name = dentry->d_name.name;
int name_len = dentry->d_name.len;
u64 index;
int ret;
u64 objectid;
u64 dir_ino = btrfs_ino(BTRFS_I(dir));
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
objectid = inode->root->root_key.objectid;
} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
objectid = inode->location.objectid;
} else {
WARN_ON(1);
return -EINVAL;
}
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
leaf = path->nodes[0];
btrfs_dir_item_key_to_cpu(leaf, di, &key);
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
/*
* This is a placeholder inode for a subvolume we didn't have a
* reference to at the time of the snapshot creation. In the meantime
* we could have renamed the real subvol link into our snapshot, so
* depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
* Instead simply lookup the dir_index_item for this entry so we can
* remove it. Otherwise we know we have a ref to the root and we can
* call btrfs_del_root_ref, and it _shouldn't_ fail.
*/
if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
di = btrfs_search_dir_index_item(root, path, dir_ino,
name, name_len);
if (IS_ERR_OR_NULL(di)) {
if (!di)
ret = -ENOENT;
else
ret = PTR_ERR(di);
btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
index = key.offset;
btrfs_release_path(path);
} else {
ret = btrfs_del_root_ref(trans, objectid,
root->root_key.objectid, dir_ino,
&index, name, name_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_ctime = current_time(dir);
ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
if (ret)
btrfs_abort_transaction(trans, ret);
out:
btrfs_free_path(path);
return ret;
}
/*
* Helper to check if the subvolume references other subvolumes or if it's
* default.
*/
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_dir_item *di;
struct btrfs_key key;
u64 dir_id;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/* Make sure this root isn't set as the default subvol */
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
dir_id, "default", 7, 0);
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
if (key.objectid == root->root_key.objectid) {
ret = -EPERM;
btrfs_err(fs_info,
"deleting default subvolume %llu is not allowed",
key.objectid);
goto out;
}
btrfs_release_path(path);
}
key.objectid = root->root_key.objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
goto out;
BUG_ON(ret == 0);
ret = 0;
if (path->slots[0] > 0) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.objectid == root->root_key.objectid &&
key.type == BTRFS_ROOT_REF_KEY)
ret = -ENOTEMPTY;
}
out:
btrfs_free_path(path);
return ret;
}
/* Delete all dentries for inodes belonging to the root */
static void btrfs_prune_dentries(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
struct rb_node *prev;
struct btrfs_inode *entry;
struct inode *inode;
u64 objectid = 0;
if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
again:
node = root->inode_tree.rb_node;
prev = NULL;
while (node) {
prev = node;
entry = rb_entry(node, struct btrfs_inode, rb_node);
if (objectid < btrfs_ino(entry))
node = node->rb_left;
else if (objectid > btrfs_ino(entry))
node = node->rb_right;
else
break;
}
if (!node) {
while (prev) {
entry = rb_entry(prev, struct btrfs_inode, rb_node);
if (objectid <= btrfs_ino(entry)) {
node = prev;
break;
}
prev = rb_next(prev);
}
}
while (node) {
entry = rb_entry(node, struct btrfs_inode, rb_node);
objectid = btrfs_ino(entry) + 1;
inode = igrab(&entry->vfs_inode);
if (inode) {
spin_unlock(&root->inode_lock);
if (atomic_read(&inode->i_count) > 1)
d_prune_aliases(inode);
/*
* btrfs_drop_inode will have it removed from the inode
* cache when its usage count hits zero.
*/
iput(inode);
cond_resched();
spin_lock(&root->inode_lock);
goto again;
}
if (cond_resched_lock(&root->inode_lock))
goto again;
node = rb_next(node);
}
spin_unlock(&root->inode_lock);
}
int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(dentry);
struct btrfs_root *dest = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_block_rsv block_rsv;
u64 root_flags;
int ret;
/*
* Don't allow to delete a subvolume with send in progress. This is
* inside the inode lock so the error handling that has to drop the bit
* again is not run concurrently.
*/
spin_lock(&dest->root_item_lock);
if (dest->send_in_progress) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
dest->root_key.objectid);
return -EPERM;
}
if (atomic_read(&dest->nr_swapfiles)) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu with active swapfile",
root->root_key.objectid);
return -EPERM;
}
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
down_write(&fs_info->subvol_sem);
ret = may_destroy_subvol(dest);
if (ret)
goto out_up_write;
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
* One for dir inode,
* two for dir entries,
* two for root ref/backref.
*/
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
if (ret)
goto out_up_write;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_release;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
ret = btrfs_unlink_subvol(trans, dir, dentry);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
ret = btrfs_record_root_in_trans(trans, dest);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
memset(&dest->root_item.drop_progress, 0,
sizeof(dest->root_item.drop_progress));
btrfs_set_root_drop_level(&dest->root_item, 0);
btrfs_set_root_refs(&dest->root_item, 0);
if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
ret = btrfs_insert_orphan_item(trans,
fs_info->tree_root,
dest->root_key.objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
}
ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
ret = btrfs_uuid_tree_remove(trans,
dest->root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
}
free_anon_bdev(dest->anon_dev);
dest->anon_dev = 0;
out_end_trans:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
ret = btrfs_end_transaction(trans);
inode->i_flags |= S_DEAD;
out_release:
btrfs_subvolume_release_metadata(root, &block_rsv);
out_up_write:
up_write(&fs_info->subvol_sem);
if (ret) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
} else {
d_invalidate(dentry);
btrfs_prune_dentries(dest);
ASSERT(dest->send_in_progress == 0);
}
return ret;
}
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
int err = 0;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) return btrfs_delete_subvolume(dir, dentry);
trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
return PTR_ERR(trans);
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { err = btrfs_unlink_subvol(trans, dir, dentry);
goto out;
}
err = btrfs_orphan_add(trans, BTRFS_I(inode));
if (err)
goto out;
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (!err) {
btrfs_i_size_write(BTRFS_I(inode), 0);
/*
* Propagate the last_unlink_trans value of the deleted dir to
* its parent directory. This is to prevent an unrecoverable
* log tree in the case we do something like this:
* 1) create dir foo
* 2) create snapshot under dir foo
* 3) delete the snapshot
* 4) rmdir foo
* 5) mkdir foo
* 6) fsync foo or some file inside foo
*/
if (last_unlink_trans >= trans->transid)
BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
}
out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(root->fs_info);
return err;
}
/*
* Return this if we need to call truncate_block for the last bit of the
* truncate.
*/
#define NEED_TRUNCATE_BLOCK 1
/*
* Remove inode items from a given root.
*
* @trans: A transaction handle.
* @root: The root from which to remove items.
* @inode: The inode whose items we want to remove.
* @new_size: The new i_size for the inode. This is only applicable when
* @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise.
* @min_type: The minimum key type to remove. All keys with a type
* greater than this value are removed and all keys with
* this type are removed only if their offset is >= @new_size.
* @extents_found: Output parameter that will contain the number of file
* extent items that were removed or adjusted to the new
* inode i_size. The caller is responsible for initializing
* the counter. Also, it can be NULL if the caller does not
* need this counter.
*
* Remove all keys associated with the inode from the given root that have a key
* with a type greater than or equals to @min_type. When @min_type has a value of
* BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
* greater than or equals to @new_size. If a file extent item that starts before
* @new_size and ends after it is found, its length is adjusted.
*
* Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is
* BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
*/
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
u64 new_size, u32 min_type,
u64 *extents_found)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
struct btrfs_key found_key;
u64 extent_start = 0;
u64 extent_num_bytes = 0;
u64 extent_offset = 0;
u64 item_end = 0;
u64 last_size = new_size;
u32 found_type = (u8)-1;
int found_extent;
int del_item;
int pending_del_nr = 0;
int pending_del_slot = 0;
int extent_type = -1;
int ret;
u64 ino = btrfs_ino(inode);
u64 bytes_deleted = 0;
bool be_nice = false;
bool should_throttle = false;
const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
struct extent_state *cached_state = NULL;
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
/*
* For non-free space inodes and non-shareable roots, we want to back
* off from time to time. This means all inodes in subvolume roots,
* reloc roots, and data reloc roots.
*/
if (!btrfs_is_free_space_inode(inode) &&
test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
be_nice = true;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = READA_BACK;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
lock_extent_bits(&inode->io_tree, lock_start, (u64)-1,
&cached_state);
/*
* We want to drop from the next block forward in case this
* new size is not block aligned since we will be keeping the
* last block of the extent just the way it is.
*/
btrfs_drop_extent_cache(inode, ALIGN(new_size,
fs_info->sectorsize),
(u64)-1, 0);
}
/*
* This function is also used to drop the items in the log tree before
* we relog the inode, so if root != BTRFS_I(inode)->root, it means
* it is used to drop the logged items. So we shouldn't kill the delayed
* items.
*/
if (min_type == 0 && root == inode->root) btrfs_kill_delayed_inode_items(inode); key.objectid = ino;
key.offset = (u64)-1;
key.type = (u8)-1;
search_again:
/*
* with a 16K leaf size and 128MB extents, you can actually queue
* up a huge file in a single leaf. Most of the time that
* bytes_deleted is > 0, it will be huge by the time we get here
*/
if (be_nice && bytes_deleted > SZ_32M && btrfs_should_end_transaction(trans)) {
ret = -EAGAIN;
goto out;
}
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = 0;
/* there are no items in the tree for us to truncate, we're
* done
*/
if (path->slots[0] == 0)
goto out;
path->slots[0]--;
}
while (1) {
u64 clear_start = 0, clear_len = 0;
fi = NULL;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
found_type = found_key.type;
if (found_key.objectid != ino)
break;
if (found_type < min_type)
break;
item_end = found_key.offset;
if (found_type == BTRFS_EXTENT_DATA_KEY) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
item_end +=
btrfs_file_extent_num_bytes(leaf, fi);
trace_btrfs_truncate_show_fi_regular(
inode, leaf, fi, found_key.offset);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
item_end += btrfs_file_extent_ram_bytes(leaf,
fi);
trace_btrfs_truncate_show_fi_inline(
inode, leaf, fi, path->slots[0],
found_key.offset);
}
item_end--;
}
if (found_type > min_type) {
del_item = 1;
} else {
if (item_end < new_size)
break;
if (found_key.offset >= new_size)
del_item = 1;
else
del_item = 0;
}
found_extent = 0;
/* FIXME, shrink the extent if the ref count is only 1 */
if (found_type != BTRFS_EXTENT_DATA_KEY)
goto delete;
if (extents_found != NULL) (*extents_found)++; if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
clear_start = found_key.offset;
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
if (!del_item) {
u64 orig_num_bytes =
btrfs_file_extent_num_bytes(leaf, fi);
extent_num_bytes = ALIGN(new_size -
found_key.offset,
fs_info->sectorsize);
clear_start = ALIGN(new_size, fs_info->sectorsize);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_num_bytes);
num_dec = (orig_num_bytes -
extent_num_bytes);
if (test_bit(BTRFS_ROOT_SHAREABLE,
&root->state) &&
extent_start != 0)
inode_sub_bytes(&inode->vfs_inode,
num_dec);
btrfs_mark_buffer_dirty(leaf);
} else {
extent_num_bytes =
btrfs_file_extent_disk_num_bytes(leaf,
fi);
extent_offset = found_key.offset -
btrfs_file_extent_offset(leaf, fi);
/* FIXME blocksize != 4096 */
num_dec = btrfs_file_extent_num_bytes(leaf, fi);
if (extent_start != 0) {
found_extent = 1;
if (test_bit(BTRFS_ROOT_SHAREABLE,
&root->state)) inode_sub_bytes(&inode->vfs_inode,
num_dec);
}
}
clear_len = num_dec;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
/*
* we can't truncate inline items that have had
* special encodings
*/
if (!del_item &&
btrfs_file_extent_encryption(leaf, fi) == 0 &&
btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
btrfs_file_extent_compression(leaf, fi) == 0) {
u32 size = (u32)(new_size - found_key.offset);
btrfs_set_file_extent_ram_bytes(leaf, fi, size);
size = btrfs_file_extent_calc_inline_size(size);
btrfs_truncate_item(path, size, 1);
} else if (!del_item) {
/*
* We have to bail so the last_size is set to
* just before this extent.
*/
ret = NEED_TRUNCATE_BLOCK;
break;
} else {
/*
* Inline extents are special, we just treat
* them as a full sector worth in the file
* extent tree just for simplicity sake.
*/
clear_len = fs_info->sectorsize;
}
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) inode_sub_bytes(&inode->vfs_inode,
item_end + 1 - new_size);
}
delete:
/*
* We use btrfs_truncate_inode_items() to clean up log trees for
* multiple fsyncs, and in this case we don't want to clear the
* file extent range because it's just the log.
*/
if (root == inode->root) { ret = btrfs_inode_clear_file_extent_range(inode,
clear_start, clear_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
}
if (del_item) last_size = found_key.offset;
else
last_size = new_size;
if (del_item) {
if (!pending_del_nr) {
/* no pending yet, add ourselves */
pending_del_slot = path->slots[0];
pending_del_nr = 1;
} else if (pending_del_nr &&
path->slots[0] + 1 == pending_del_slot) {
/* hop on the pending chunk */
pending_del_nr++;
pending_del_slot = path->slots[0];
} else {
BUG();
}
} else {
break;
}
should_throttle = false;
if (found_extent && root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_ref ref = { 0 };
bytes_deleted += extent_num_bytes;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
extent_start, extent_num_bytes, 0);
ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
ino, extent_offset);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
if (be_nice) { if (btrfs_should_throttle_delayed_refs(trans))
should_throttle = true;
}
}
if (found_type == BTRFS_INODE_ITEM_KEY)
break;
if (path->slots[0] == 0 || path->slots[0] != pending_del_slot ||
should_throttle) {
if (pending_del_nr) { ret = btrfs_del_items(trans, root, path,
pending_del_slot,
pending_del_nr);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
pending_del_nr = 0;
}
btrfs_release_path(path);
/*
* We can generate a lot of delayed refs, so we need to
* throttle every once and a while and make sure we're
* adding enough space to keep up with the work we are
* generating. Since we hold a transaction here we
* can't flush, and we don't want to FLUSH_LIMIT because
* we could have generated too many delayed refs to
* actually allocate, so just bail if we're short and
* let the normal reservation dance happen higher up.
*/
if (should_throttle) {
ret = btrfs_delayed_refs_rsv_refill(fs_info,
BTRFS_RESERVE_NO_FLUSH);
if (ret) {
ret = -EAGAIN;
break;
}
}
goto search_again;
} else {
path->slots[0]--;
}
}
out:
if (ret >= 0 && pending_del_nr) {
int err;
err = btrfs_del_items(trans, root, path, pending_del_slot,
pending_del_nr);
if (err) {
btrfs_abort_transaction(trans, err);
ret = err;
}
}
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ASSERT(last_size >= new_size);
if (!ret && last_size > new_size)
last_size = new_size;
btrfs_inode_safe_disk_i_size_write(inode, last_size);
unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1,
&cached_state);
}
btrfs_free_path(path); return ret;
}
/*
* btrfs_truncate_block - read, zero a chunk and write a block
* @inode - inode that we're zeroing
* @from - the offset to start zeroing
* @len - the length to zero, 0 to zero the entire range respective to the
* offset
* @front - zero up to the offset instead of from the offset on
*
* This will find the block for the "from" offset and cow the block and zero the
* part we want to zero. This is used with truncate and hole punching.
*/
int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
int front)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct address_space *mapping = inode->vfs_inode.i_mapping;
struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
bool only_release_metadata = false;
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (blocksize - 1);
struct page *page;
gfp_t mask = btrfs_alloc_write_mask(mapping);
size_t write_bytes = blocksize;
int ret = 0;
u64 block_start;
u64 block_end;
if (IS_ALIGNED(offset, blocksize) && (!len || IS_ALIGNED(len, blocksize)))
goto out;
block_start = round_down(from, blocksize);
block_end = block_start + blocksize - 1;
ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
blocksize);
if (ret < 0) {
if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
/* For nocow case, no need to reserve data space */
only_release_metadata = true;
} else {
goto out;
}
}
ret = btrfs_delalloc_reserve_metadata(inode, blocksize);
if (ret < 0) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode, data_reserved,
block_start, blocksize);
goto out;
}
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
btrfs_delalloc_release_space(inode, data_reserved, block_start,
blocksize, true);
btrfs_delalloc_release_extents(inode, blocksize);
ret = -ENOMEM;
goto out;
}
ret = set_page_extent_mapped(page);
if (ret < 0)
goto out_unlock;
if (!PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
lock_page(page);
if (page->mapping != mapping) { unlock_page(page);
put_page(page);
goto again;
}
if (!PageUptodate(page)) {
ret = -EIO;
goto out_unlock;
}
}
wait_on_page_writeback(page);
lock_extent_bits(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
unlock_extent_cached(io_tree, block_start, block_end,
&cached_state);
unlock_page(page);
put_page(page);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
clear_extent_bit(&inode->io_tree, block_start, block_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state);
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state);
if (ret) {
unlock_extent_cached(io_tree, block_start, block_end,
&cached_state);
goto out_unlock;
}
if (offset != blocksize) { if (!len) len = blocksize - offset; if (front) memzero_page(page, (block_start - page_offset(page)),
offset);
else
memzero_page(page, (block_start - page_offset(page)) + offset,
len);
flush_dcache_page(page);
}
ClearPageChecked(page);
btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
set_extent_bit(&inode->io_tree, block_start, block_end,
EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
out_unlock:
if (ret) {
if (only_release_metadata) btrfs_delalloc_release_metadata(inode, blocksize, true);
else
btrfs_delalloc_release_space(inode, data_reserved,
block_start, blocksize, true);
}
btrfs_delalloc_release_extents(inode, blocksize);
unlock_page(page);
put_page(page);
out:
if (only_release_metadata) btrfs_check_nocow_unlock(inode); extent_changeset_free(data_reserved); return ret;
}
static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
u64 offset, u64 len)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_drop_extents_args drop_args = { 0 };
int ret;
/*
* If NO_HOLES is enabled, we don't need to do anything.
* Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
* or btrfs_update_inode() will be called, which guarantee that the next
* fsync will know this inode was changed and needs to be logged.
*/
if (btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
/*
* 1 - for the one we're dropping
* 1 - for the one we're adding
* 1 - for updating the inode.
*/
trans = btrfs_start_transaction(root, 3);
if (IS_ERR(trans))
return PTR_ERR(trans);
drop_args.start = offset;
drop_args.end = offset + len;
drop_args.drop_cache = true;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
offset, 0, 0, len, 0, len, 0, 0, 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
} else {
btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); btrfs_update_inode(trans, root, inode);
}
btrfs_end_transaction(trans); return ret;
}
/*
* This function puts in dummy file extents for the area we're creating a hole
* for. So if we are truncating this file to a larger size we need to insert
* these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
* the range between oldsize and size
*/
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree; u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
u64 block_end = ALIGN(size, fs_info->sectorsize);
u64 last_byte;
u64 cur_offset;
u64 hole_size;
int err = 0;
/*
* If our size started in the middle of a block we need to zero out the
* rest of the block before we expand the i_size, otherwise we could
* expose stale data.
*/
err = btrfs_truncate_block(inode, oldsize, 0, 0);
if (err)
return err;
if (size <= hole_start)
return 0;
btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
&cached_state);
cur_offset = hole_start;
while (1) {
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
block_end - cur_offset);
if (IS_ERR(em)) {
err = PTR_ERR(em);
em = NULL;
break;
}
last_byte = min(extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
struct extent_map *hole_em;
err = maybe_insert_hole(root, inode, cur_offset,
hole_size);
if (err)
break;
err = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
if (err)
break;
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
if (!hole_em) {
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags);
goto next;
}
hole_em->start = cur_offset;
hole_em->len = hole_size;
hole_em->orig_start = cur_offset;
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = fs_info->generation;
while (1) {
write_lock(&em_tree->lock);
err = add_extent_mapping(em_tree, hole_em, 1);
write_unlock(&em_tree->lock);
if (err != -EEXIST)
break;
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset +
hole_size - 1, 0);
}
free_extent_map(hole_em);
} else {
err = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
if (err)
break;
}
next:
free_extent_map(em);
em = NULL;
cur_offset = last_byte;
if (cur_offset >= block_end)
break;
}
free_extent_map(em);
unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
return err;
}
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
loff_t oldsize = i_size_read(inode);
loff_t newsize = attr->ia_size;
int mask = attr->ia_valid;
int ret;
/*
* The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
* special case where we need to update the times despite not having
* these flags set. For all other operations the VFS set these flags
* explicitly if it wants a timestamp update.
*/
if (newsize != oldsize) {
inode_inc_iversion(inode);
if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
inode->i_ctime = inode->i_mtime =
current_time(inode);
}
if (newsize > oldsize) {
/*
* Don't do an expanding truncate while snapshotting is ongoing.
* This is to ensure the snapshot captures a fully consistent
* state of this file - if the snapshot captures this expanding
* truncation, it must capture all writes that happened before
* this truncation.
*/
btrfs_drew_write_lock(&root->snapshot_lock);
ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
if (ret) {
btrfs_drew_write_unlock(&root->snapshot_lock); return ret;
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
btrfs_drew_write_unlock(&root->snapshot_lock);
return PTR_ERR(trans);
}
i_size_write(inode, newsize);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
pagecache_isize_extended(inode, oldsize, newsize);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
if (btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode,
ALIGN(newsize, fs_info->sectorsize),
(u64)-1);
if (ret)
return ret;
}
/*
* We're truncating a file that used to have good data down to
* zero. Make sure any new writes to the file get on disk
* on close.
*/
if (newsize == 0)
set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags); truncate_setsize(inode, newsize);
inode_dio_wait(inode);
ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
int err;
/*
* Truncate failed, so fix up the in-memory size. We
* adjusted disk_i_size down as we removed extents, so
* wait for disk_i_size to be stable and then update the
* in-memory size to match.
*/
err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (err)
return err;
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
}
}
return ret;
}
static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
int err;
if (btrfs_root_readonly(root))
return -EROFS;
err = setattr_prepare(mnt_userns, dentry, attr);
if (err)
return err;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { err = btrfs_setsize(inode, attr); if (err)
return err;
}
if (attr->ia_valid) { setattr_copy(mnt_userns, inode, attr);
inode_inc_iversion(inode);
err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
}
return err;
}
/*
* While truncating the inode pages during eviction, we get the VFS calling
* btrfs_invalidatepage() against each page of the inode. This is slow because
* the calls to btrfs_invalidatepage() result in a huge amount of calls to
* lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
* extent_state structures over and over, wasting lots of time.
*
* Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
* those expensive operations on a per page basis and do only the ordered io
* finishing, while we release here the extent_map and extent_state structures,
* without the excessive merging and splitting.
*/
static void evict_inode_truncate_pages(struct inode *inode)
{
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
struct rb_node *node;
ASSERT(inode->i_state & I_FREEING);
truncate_inode_pages_final(&inode->i_data);
write_lock(&map_tree->lock);
while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
struct extent_map *em;
node = rb_first_cached(&map_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
remove_extent_mapping(map_tree, em);
free_extent_map(em);
if (need_resched()) {
write_unlock(&map_tree->lock);
cond_resched();
write_lock(&map_tree->lock);
}
}
write_unlock(&map_tree->lock);
/*
* Keep looping until we have no more ranges in the io tree.
* We can have ongoing bios started by readahead that have
* their endio callback (extent_io.c:end_bio_extent_readpage)
* still in progress (unlocked the pages in the bio but did not yet
* unlocked the ranges in the io tree). Therefore this means some
* ranges can still be locked and eviction started because before
* submitting those bios, which are executed by a separate task (work
* queue kthread), inode references (inode->i_count) were not taken
* (which would be dropped in the end io callback of each bio).
* Therefore here we effectively end up waiting for those bios and
* anyone else holding locked ranges without having bumped the inode's
* reference count - if we don't do it, when they access the inode's
* io_tree to unlock a range it may be too late, leading to an
* use-after-free issue.
*/
spin_lock(&io_tree->lock);
while (!RB_EMPTY_ROOT(&io_tree->state)) {
struct extent_state *state;
struct extent_state *cached_state = NULL;
u64 start;
u64 end;
unsigned state_flags;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
start = state->start;
end = state->end;
state_flags = state->state;
spin_unlock(&io_tree->lock);
lock_extent_bits(io_tree, start, end, &cached_state);
/*
* If still has DELALLOC flag, the extent didn't reach disk,
* and its reserved space won't be freed by delayed_ref.
* So we need to free its reserved space here.
* (Refer to comment in btrfs_invalidatepage, case 2)
*
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
if (state_flags & EXTENT_DELALLOC)
btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
end - start + 1);
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
&cached_state);
cond_resched();
spin_lock(&io_tree->lock);
}
spin_unlock(&io_tree->lock);
}
static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_trans_handle *trans;
u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
int ret;
/*
* Eviction should be taking place at some place safe because of our
* delayed iputs. However the normal flushing code will run delayed
* iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
*
* We reserve the delayed_refs_extra here again because we can't use
* btrfs_start_transaction(root, 0) for the same deadlocky reason as
* above. We reserve our extra bit here because we generate a ton of
* delayed refs activity by truncating.
*
* If we cannot make our reservation we'll attempt to steal from the
* global reserve, because we really want to be able to free up space.
*/
ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
BTRFS_RESERVE_FLUSH_EVICT);
if (ret) {
/*
* Try to steal from the global reserve if there is space for
* it.
*/
if (btrfs_check_space_for_delayed_refs(fs_info) || btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
btrfs_warn(fs_info,
"could not allocate space for delete; will truncate on mount");
return ERR_PTR(-ENOSPC);
}
delayed_refs_extra = 0;
}
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return trans;
if (delayed_refs_extra) { trans->block_rsv = &fs_info->trans_block_rsv;
trans->bytes_reserved = delayed_refs_extra;
btrfs_block_rsv_migrate(rsv, trans->block_rsv,
delayed_refs_extra, 1);
}
return trans;
}
void btrfs_evict_inode(struct inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
int ret;
trace_btrfs_inode_evict(inode);
if (!root) {
fsverity_cleanup_inode(inode);
clear_inode(inode);
return;
}
evict_inode_truncate_pages(inode);
if (inode->i_nlink &&
((btrfs_root_refs(&root->root_item) != 0 &&
root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
btrfs_is_free_space_inode(BTRFS_I(inode))))
goto no_delete;
if (is_bad_inode(inode))
goto no_delete;
btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
goto no_delete;
if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
goto no_delete;
}
ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
if (ret)
goto no_delete;
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
goto no_delete;
rsv->size = btrfs_calc_metadata_size(fs_info, 1);
rsv->failfast = 1;
btrfs_i_size_write(BTRFS_I(inode), 0);
while (1) {
trans = evict_refill_and_join(root, rsv);
if (IS_ERR(trans))
goto free_rsv;
trans->block_rsv = rsv;
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
0, 0, NULL);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (ret && ret != -ENOSPC && ret != -EAGAIN)
goto free_rsv;
else if (!ret)
break;
}
/*
* Errors here aren't a big deal, it just means we leave orphan items in
* the tree. They will be cleaned up on the next mount. If the inode
* number gets reused, cleanup deletes the orphan item without doing
* anything, and unlink reuses the existing orphan item.
*
* If it turns out that we are dropping too many of these, we might want
* to add a mechanism for retrying these after a commit.
*/
trans = evict_refill_and_join(root, rsv);
if (!IS_ERR(trans)) {
trans->block_rsv = rsv;
btrfs_orphan_del(trans, BTRFS_I(inode));
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
}
free_rsv:
btrfs_free_block_rsv(fs_info, rsv);
no_delete:
/*
* If we didn't successfully delete, the orphan item will still be in
* the tree and we'll retry on the next mount. Again, we might also want
* to retry these periodically in the future.
*/
btrfs_remove_delayed_node(BTRFS_I(inode));
fsverity_cleanup_inode(inode);
clear_inode(inode);
}
/*
* Return the key found in the dir entry in the location pointer, fill @type
* with BTRFS_FT_*, and return 0.
*
* If no dir entries were found, returns -ENOENT.
* If found a corrupted location in dir entry, returns -EUCLEAN.
*/
static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
struct btrfs_key *location, u8 *type)
{
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
struct btrfs_dir_item *di;
struct btrfs_path *path;
struct btrfs_root *root = BTRFS_I(dir)->root;
int ret = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
name, namelen, 0);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); if (location->type != BTRFS_INODE_ITEM_KEY &&
location->type != BTRFS_ROOT_ITEM_KEY) {
ret = -EUCLEAN;
btrfs_warn(root->fs_info,
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
__func__, name, btrfs_ino(BTRFS_I(dir)),
location->objectid, location->type, location->offset);
}
if (!ret)
*type = btrfs_dir_type(path->nodes[0], di);
out:
btrfs_free_path(path);
return ret;
}
/*
* when we hit a tree root in a directory, the btrfs part of the inode
* needs to be changed to reflect the root directory of the tree root. This
* is kind of like crossing a mount point.
*/
static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
struct inode *dir,
struct dentry *dentry,
struct btrfs_key *location,
struct btrfs_root **sub_root)
{
struct btrfs_path *path;
struct btrfs_root *new_root;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
struct btrfs_key key;
int ret;
int err = 0;
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
goto out;
}
err = -ENOENT;
key.objectid = BTRFS_I(dir)->root->root_key.objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = location->objectid;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret) {
if (ret < 0)
err = ret;
goto out;
}
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
goto out;
ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
(unsigned long)(ref + 1),
dentry->d_name.len);
if (ret)
goto out;
btrfs_release_path(path);
new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
if (IS_ERR(new_root)) {
err = PTR_ERR(new_root);
goto out;
}
*sub_root = new_root;
location->objectid = btrfs_root_dirid(&new_root->root_item);
location->type = BTRFS_INODE_ITEM_KEY;
location->offset = 0;
err = 0;
out:
btrfs_free_path(path);
return err;
}
static void inode_tree_add(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_inode *entry;
struct rb_node **p;
struct rb_node *parent;
struct rb_node *new = &BTRFS_I(inode)->rb_node;
u64 ino = btrfs_ino(BTRFS_I(inode));
if (inode_unhashed(inode))
return;
parent = NULL;
spin_lock(&root->inode_lock);
p = &root->inode_tree.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct btrfs_inode, rb_node);
if (ino < btrfs_ino(entry)) p = &parent->rb_left; else if (ino > btrfs_ino(entry)) p = &parent->rb_right;
else {
WARN_ON(!(entry->vfs_inode.i_state &
(I_WILL_FREE | I_FREEING)));
rb_replace_node(parent, new, &root->inode_tree);
RB_CLEAR_NODE(parent);
spin_unlock(&root->inode_lock);
return;
}
}
rb_link_node(new, parent, p);
rb_insert_color(new, &root->inode_tree);
spin_unlock(&root->inode_lock);
}
static void inode_tree_del(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
int empty = 0;
spin_lock(&root->inode_lock);
if (!RB_EMPTY_NODE(&inode->rb_node)) {
rb_erase(&inode->rb_node, &root->inode_tree);
RB_CLEAR_NODE(&inode->rb_node);
empty = RB_EMPTY_ROOT(&root->inode_tree);
}
spin_unlock(&root->inode_lock);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
spin_lock(&root->inode_lock);
empty = RB_EMPTY_ROOT(&root->inode_tree);
spin_unlock(&root->inode_lock);
if (empty)
btrfs_add_dead_root(root);
}
}
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
inode->i_ino = args->ino;
BTRFS_I(inode)->location.objectid = args->ino;
BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
BTRFS_I(inode)->location.offset = 0;
BTRFS_I(inode)->root = btrfs_grab_root(args->root); BUG_ON(args->root && !BTRFS_I(inode)->root); return 0;
}
static int btrfs_find_actor(struct inode *inode, void *opaque)
{
struct btrfs_iget_args *args = opaque;
return args->ino == BTRFS_I(inode)->location.objectid && args->root == BTRFS_I(inode)->root;
}
static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
unsigned long hashval = btrfs_inode_hash(ino, root);
args.ino = ino;
args.root = root;
inode = iget5_locked(s, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
return inode;
}
/*
* Get an inode object given its inode number and corresponding root.
* Path can be preallocated to prevent recursing back to iget through
* allocator. NULL is also valid but may require an additional allocation
* later.
*/
struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
struct btrfs_root *root, struct btrfs_path *path)
{
struct inode *inode;
inode = btrfs_iget_locked(s, ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
int ret;
ret = btrfs_read_locked_inode(inode, path);
if (!ret) {
inode_tree_add(inode);
unlock_new_inode(inode);
} else {
iget_failed(inode);
/*
* ret > 0 can come from btrfs_search_slot called by
* btrfs_read_locked_inode, this means the inode item
* was not found.
*/
if (ret > 0)
ret = -ENOENT;
inode = ERR_PTR(ret);
}
}
return inode;
}
struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
{
return btrfs_iget_path(s, ino, root, NULL);
}
static struct inode *new_simple_dir(struct super_block *s,
struct btrfs_key *key,
struct btrfs_root *root)
{
struct inode *inode = new_inode(s);
if (!inode)
return ERR_PTR(-ENOMEM);
BTRFS_I(inode)->root = btrfs_grab_root(root);
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
/*
* We only need lookup, the rest is read-only and there's no inode
* associated with the dentry
*/
inode->i_op = &simple_dir_inode_operations;
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
inode->i_mtime = current_time(inode);
inode->i_atime = inode->i_mtime;
inode->i_ctime = inode->i_mtime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
return inode;
}
static inline u8 btrfs_inode_type(struct inode *inode)
{
/*
* Compile-time asserts that generic FT_* types still match
* BTRFS_FT_* types
*/
BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
return fs_umode_to_ftype(inode->i_mode);
}
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
struct btrfs_key location;
u8 di_type = 0;
int ret = 0;
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
if (ret < 0)
return ERR_PTR(ret);
if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, location.objectid, root);
if (IS_ERR(inode))
return inode;
/* Do extra check against inode mode with di_type */
if (btrfs_inode_type(inode) != di_type) { btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
inode->i_mode, btrfs_inode_type(inode),
di_type);
iput(inode);
return ERR_PTR(-EUCLEAN);
}
return inode;
}
ret = fixup_tree_root_location(fs_info, dir, dentry,
&location, &sub_root);
if (ret < 0) {
if (ret != -ENOENT)
inode = ERR_PTR(ret);
else
inode = new_simple_dir(dir->i_sb, &location, sub_root);
} else {
inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
}
if (root != sub_root)
btrfs_put_root(sub_root);
if (!IS_ERR(inode) && root != sub_root) {
down_read(&fs_info->cleanup_work_sem);
if (!sb_rdonly(inode->i_sb))
ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem);
if (ret) {
iput(inode);
inode = ERR_PTR(ret);
}
}
return inode;
}
static int btrfs_dentry_delete(const struct dentry *dentry)
{
struct btrfs_root *root;
struct inode *inode = d_inode(dentry); if (!inode && !IS_ROOT(dentry)) inode = d_inode(dentry->d_parent); if (inode) { root = BTRFS_I(inode)->root;
if (btrfs_root_refs(&root->root_item) == 0)
return 1;
if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return 1;
}
return 0;
}
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct inode *inode = btrfs_lookup_dentry(dir, dentry);
if (inode == ERR_PTR(-ENOENT))
inode = NULL;
return d_splice_alias(inode, dentry);
}
/*
* All this infrastructure exists because dir_emit can fault, and we are holding
* the tree lock when doing readdir. For now just allocate a buffer and copy
* our information into that, and then dir_emit from the buffer. This is
* similar to what NFS does, only we don't keep the buffer around in pagecache
* because I'm afraid I'll mess that up. Long term we need to make filldir do
* copy_to_user_inatomic so we don't have to worry about page faulting under the
* tree lock.
*/
static int btrfs_opendir(struct inode *inode, struct file *file)
{
struct btrfs_file_private *private;
private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
if (!private)
return -ENOMEM;
private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!private->filldir_buf) {
kfree(private);
return -ENOMEM;
}
file->private_data = private; return 0;
}
struct dir_entry {
u64 ino;
u64 offset;
unsigned type;
int name_len;
};
static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
{
while (entries--) {
struct dir_entry *entry = addr;
char *name = (char *)(entry + 1);
ctx->pos = get_unaligned(&entry->offset);
if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
get_unaligned(&entry->ino),
get_unaligned(&entry->type)))
return 1;
addr += sizeof(struct dir_entry) +
get_unaligned(&entry->name_len);
ctx->pos++;
}
return 0;
}
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_file_private *private = file->private_data;
struct btrfs_dir_item *di;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_path *path;
void *addr;
struct list_head ins_list;
struct list_head del_list;
int ret;
struct extent_buffer *leaf;
int slot;
char *name_ptr;
int name_len;
int entries = 0;
int total_len = 0;
bool put = false;
struct btrfs_key location;
if (!dir_emit_dots(file, ctx))
return 0; path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
addr = private->filldir_buf;
path->reada = READA_FORWARD;
INIT_LIST_HEAD(&ins_list);
INIT_LIST_HEAD(&del_list);
put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
again:
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = ctx->pos;
key.objectid = btrfs_ino(BTRFS_I(inode));
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto err;
while (1) {
struct dir_entry *entry;
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto err;
else if (ret > 0)
break;
continue;
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid)
break;
if (found_key.type != BTRFS_DIR_INDEX_KEY)
break;
if (found_key.offset < ctx->pos)
goto next;
if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
goto next;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
name_len = btrfs_dir_name_len(leaf, di);
if ((total_len + sizeof(struct dir_entry) + name_len) >=
PAGE_SIZE) {
btrfs_release_path(path);
ret = btrfs_filldir(private->filldir_buf, entries, ctx);
if (ret)
goto nopos;
addr = private->filldir_buf;
entries = 0;
total_len = 0;
goto again;
}
entry = addr;
put_unaligned(name_len, &entry->name_len);
name_ptr = (char *)(entry + 1);
read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
name_len);
put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
&entry->type);
btrfs_dir_item_key_to_cpu(leaf, di, &location);
put_unaligned(location.objectid, &entry->ino);
put_unaligned(found_key.offset, &entry->offset);
entries++;
addr += sizeof(struct dir_entry) + name_len;
total_len += sizeof(struct dir_entry) + name_len;
next:
path->slots[0]++;
}
btrfs_release_path(path);
ret = btrfs_filldir(private->filldir_buf, entries, ctx);
if (ret)
goto nopos;
ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
if (ret)
goto nopos;
/*
* Stop new entries from being returned after we return the last
* entry.
*
* New directory entries are assigned a strictly increasing
* offset. This means that new entries created during readdir
* are *guaranteed* to be seen in the future by that readdir.
* This has broken buggy programs which operate on names as
* they're returned by readdir. Until we re-use freed offsets
* we have this hack to stop new entries from being returned
* under the assumption that they'll never reach this huge
* offset.
*
* This is being careful not to overflow 32bit loff_t unless the
* last entry requires it because doing so has broken 32bit apps
* in the past.
*/
if (ctx->pos >= INT_MAX) ctx->pos = LLONG_MAX;
else
ctx->pos = INT_MAX;
nopos:
ret = 0;
err:
if (put) btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); btrfs_free_path(path);
return ret;
}
/*
* This is somewhat expensive, updating the tree every time the
* inode changes. But, it is most likely to find the inode in cache.
* FIXME, needs more benchmarking...there are no reasons other than performance
* to keep or drop this code.
*/
static int btrfs_dirty_inode(struct inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
int ret;
if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
return 0;
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
}
btrfs_end_transaction(trans);
if (BTRFS_I(inode)->delayed_node)
btrfs_balance_delayed_items(fs_info);
return ret;
}
/*
* This is a copy of file_update_time. We need this so we can return error on
* ENOSPC for updating the inode in the case of file write and mmap writes.
*/
static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
bool dirty = flags & ~S_VERSION;
if (btrfs_root_readonly(root))
return -EROFS;
if (flags & S_VERSION)
dirty |= inode_maybe_inc_iversion(inode, dirty);
if (flags & S_CTIME) inode->i_ctime = *now; if (flags & S_MTIME) inode->i_mtime = *now; if (flags & S_ATIME) inode->i_atime = *now; return dirty ? btrfs_dirty_inode(inode) : 0;
}
/*
* find the highest existing sequence number in a directory
* and then set the in-memory index_cnt variable to reflect
* free sequence numbers
*/
static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
struct btrfs_key key, found_key;
struct btrfs_path *path;
struct extent_buffer *leaf;
int ret;
key.objectid = btrfs_ino(inode);
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = (u64)-1;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
/* FIXME: we should be able to handle this */
if (ret == 0)
goto out;
ret = 0;
/*
* MAGIC NUMBER EXPLANATION:
* since we search a directory based on f_pos we have to start at 2
* since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
* else has to start at 2
*/
if (path->slots[0] == 0) { inode->index_cnt = 2;
goto out;
}
path->slots[0]--;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { inode->index_cnt = 2;
goto out;
}
inode->index_cnt = found_key.offset + 1;
out:
btrfs_free_path(path);
return ret;
}
/*
* helper to find a free sequence number in a given directory. This current
* code is very simple, later versions will do smarter things in the btree
*/
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
{
int ret = 0;
if (dir->index_cnt == (u64)-1) { ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) {
ret = btrfs_set_inode_index_count(dir);
if (ret)
return ret;
}
}
*index = dir->index_cnt;
dir->index_cnt++;
return ret;
}
static int btrfs_insert_inode_locked(struct inode *inode)
{
struct btrfs_iget_args args;
args.ino = BTRFS_I(inode)->location.objectid;
args.root = BTRFS_I(inode)->root;
return insert_inode_locked4(inode,
btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
btrfs_find_actor, &args);
}
/*
* Inherit flags from the parent inode.
*
* Currently only the compression flags and the cow flags are inherited.
*/
static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
unsigned int flags;
if (!dir)
return;
flags = BTRFS_I(dir)->flags;
if (flags & BTRFS_INODE_NOCOMPRESS) {
BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
} else if (flags & BTRFS_INODE_COMPRESS) { BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
}
if (flags & BTRFS_INODE_NODATACOW) { BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
if (S_ISREG(inode->i_mode))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
}
btrfs_sync_inode_flags_to_i_flags(inode);
}
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct user_namespace *mnt_userns,
struct inode *dir,
const char *name, int name_len,
u64 ref_objectid, u64 objectid,
umode_t mode, u64 *index)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode;
struct btrfs_inode_item *inode_item;
struct btrfs_key *location;
struct btrfs_path *path;
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
int nitems = name ? 2 : 1;
unsigned long ptr;
unsigned int nofs_flag;
int ret;
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
nofs_flag = memalloc_nofs_save();
inode = new_inode(fs_info->sb);
memalloc_nofs_restore(nofs_flag);
if (!inode) {
btrfs_free_path(path);
return ERR_PTR(-ENOMEM);
}
/*
* O_TMPFILE, set link count to 0, so that after this point,
* we fill in an inode item with the correct link count.
*/
if (!name) set_nlink(inode, 0);
/*
* we have to initialize this early, so we can reclaim the inode
* number if we fail afterwards in this function.
*/
inode->i_ino = objectid;
if (dir && name) {
trace_btrfs_inode_request(dir);
ret = btrfs_set_inode_index(BTRFS_I(dir), index);
if (ret) {
btrfs_free_path(path);
iput(inode);
return ERR_PTR(ret);
}
} else if (dir) {
*index = 0;
}
/*
* index_cnt is ignored for everything but a dir,
* btrfs_set_inode_index_count has an explanation for the magic
* number
*/
BTRFS_I(inode)->index_cnt = 2;
BTRFS_I(inode)->dir_index = *index;
BTRFS_I(inode)->root = btrfs_grab_root(root);
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
/*
* We could have gotten an inode number from somebody who was fsynced
* and then removed in this same transaction, so let's just set full
* sync since it will be a full sync anyway and this will blow away the
* old info in the log.
*/
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
key[0].objectid = objectid;
key[0].type = BTRFS_INODE_ITEM_KEY;
key[0].offset = 0;
sizes[0] = sizeof(struct btrfs_inode_item);
if (name) {
/*
* Start new inodes with an inode_ref. This is slightly more
* efficient for small numbers of hard links since they will
* be packed into one item. Extended refs will kick in if we
* add more hard links than can fit in the ref item.
*/
key[1].objectid = objectid;
key[1].type = BTRFS_INODE_REF_KEY;
key[1].offset = ref_objectid;
sizes[1] = name_len + sizeof(*ref);
}
location = &BTRFS_I(inode)->location;
location->objectid = objectid;
location->offset = 0;
location->type = BTRFS_INODE_ITEM_KEY;
ret = btrfs_insert_inode_locked(inode);
if (ret < 0) {
iput(inode);
goto fail;
}
ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
if (ret != 0)
goto fail_unlock;
inode_init_owner(mnt_userns, inode, dir, mode);
inode_set_bytes(inode, 0);
inode->i_mtime = current_time(inode);
inode->i_atime = inode->i_mtime;
inode->i_ctime = inode->i_mtime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
sizeof(*inode_item));
fill_inode_item(trans, path->nodes[0], inode_item, inode);
if (name) {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
struct btrfs_inode_ref);
btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
ptr = (unsigned long)(ref + 1);
write_extent_buffer(path->nodes[0], name, ptr, name_len);
}
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
btrfs_inherit_iflags(inode, dir);
if (S_ISREG(mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
}
inode_tree_add(inode);
trace_btrfs_inode_new(inode);
btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
btrfs_update_root_times(trans, root);
ret = btrfs_inode_inherit_props(trans, inode, dir);
if (ret)
btrfs_err(fs_info,
"error inheriting props for ino %llu (root %llu): %d",
btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
return inode;
fail_unlock:
discard_new_inode(inode);
fail:
if (dir && name) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path);
return ERR_PTR(ret);
}
/*
* utility function to add 'inode' into 'parent_inode' with
* a give name and a given sequence number.
* if 'add_backref' is true, also insert a backref from the
* inode to the parent directory.
*/
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
const char *name, int name_len, int add_backref, u64 index)
{
int ret = 0;
struct btrfs_key key;
struct btrfs_root *root = parent_inode->root;
u64 ino = btrfs_ino(inode);
u64 parent_ino = btrfs_ino(parent_inode);
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { memcpy(&key, &inode->root->root_key, sizeof(key));
} else {
key.objectid = ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
}
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_add_root_ref(trans, key.objectid,
root->root_key.objectid, parent_ino,
index, name, name_len);
} else if (add_backref) {
ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
parent_ino, index);
}
/* Nothing to clean up yet */
if (ret)
return ret;
ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
else if (ret) { btrfs_abort_transaction(trans, ret);
return ret;
}
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
name_len * 2);
inode_inc_iversion(&parent_inode->vfs_inode);
/*
* If we are replaying a log tree, we do not want to update the mtime
* and ctime of the parent directory with the current time, since the
* log replay procedure is responsible for setting them to their correct
* values (the ones it had when the fsync was done).
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
struct timespec64 now = current_time(&parent_inode->vfs_inode);
parent_inode->vfs_inode.i_mtime = now;
parent_inode->vfs_inode.i_ctime = now;
}
ret = btrfs_update_inode(trans, root, parent_inode); if (ret) btrfs_abort_transaction(trans, ret);
return ret;
fail_dir_item:
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
u64 local_index;
int err;
err = btrfs_del_root_ref(trans, key.objectid,
root->root_key.objectid, parent_ino,
&local_index, name, name_len);
if (err)
btrfs_abort_transaction(trans, err); } else if (add_backref) {
u64 local_index;
int err;
err = btrfs_del_inode_ref(trans, root, name, name_len,
ino, parent_ino, &local_index);
if (err)
btrfs_abort_transaction(trans, err);
}
/* Return the original error code */
return ret;
}
static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct dentry *dentry,
struct btrfs_inode *inode, int backref, u64 index)
{
int err = btrfs_add_link(trans, dir, inode,
dentry->d_name.name, dentry->d_name.len,
backref, index);
if (err > 0)
err = -EEXIST;
return err;
}
static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t rdev)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = NULL;
int err;
u64 objectid;
u64 index = 0;
/*
* 2 for inode item and ref
* 2 for dir items
* 1 for xattr if selinux is on
*/
trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans))
return PTR_ERR(trans);
err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_unlock;
inode = btrfs_new_inode(trans, root, mnt_userns, dir,
dentry->d_name.name, dentry->d_name.len,
btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
goto out_unlock;
}
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
* if the filesystem supports xattrs by looking at the
* ops vector.
*/
inode->i_op = &btrfs_special_inode_operations;
init_special_inode(inode, inode->i_mode, rdev);
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
goto out_unlock;
err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
0, index);
if (err)
goto out_unlock;
btrfs_update_inode(trans, root, BTRFS_I(inode));
d_instantiate_new(dentry, inode);
out_unlock:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (err && inode) {
inode_dec_link_count(inode);
discard_new_inode(inode);
}
return err;
}
static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = NULL;
int err;
u64 objectid;
u64 index = 0;
/*
* 2 for inode item and ref
* 2 for dir items
* 1 for xattr if selinux is on
*/
trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans))
return PTR_ERR(trans);
err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_unlock;
inode = btrfs_new_inode(trans, root, mnt_userns, dir, dentry->d_name.name, dentry->d_name.len,
btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
goto out_unlock;
}
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
* if the filesystem supports xattrs by looking at the
* ops vector.
*/
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
goto out_unlock;
err = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (err)
goto out_unlock;
err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
0, index);
if (err)
goto out_unlock;
d_instantiate_new(dentry, inode);
out_unlock:
btrfs_end_transaction(trans); if (err && inode) {
inode_dec_link_count(inode);
discard_new_inode(inode);
}
btrfs_btree_balance_dirty(fs_info); return err;
}
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 index;
int err;
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
return -EXDEV;
if (inode->i_nlink >= BTRFS_LINK_MAX)
return -EMLINK;
err = btrfs_set_inode_index(BTRFS_I(dir), &index);
if (err)
goto fail;
/*
* 2 items for inode and inode ref
* 2 items for dir items
* 1 item for parent inode
* 1 item for orphan item deletion if O_TMPFILE
*/
trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
trans = NULL;
goto fail;
}
/* There are several dir indexes for this inode, clear the cache. */
BTRFS_I(inode)->dir_index = 0ULL;
inc_nlink(inode);
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
1, index);
if (err) {
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
err = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (err)
goto fail;
if (inode->i_nlink == 1) {
/*
* If new hard link count is 1, it's a file created
* with open(2) O_TMPFILE flag.
*/
err = btrfs_orphan_del(trans, BTRFS_I(inode));
if (err)
goto fail;
}
d_instantiate(dentry, inode);
btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
}
fail:
if (trans) btrfs_end_transaction(trans);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
btrfs_btree_balance_dirty(fs_info); return err;
}
static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
int err = 0;
u64 objectid = 0;
u64 index = 0;
/*
* 2 items for inode and ref
* 2 items for dir items
* 1 for xattr if selinux is on
*/
trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans))
return PTR_ERR(trans);
err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_fail;
inode = btrfs_new_inode(trans, root, mnt_userns, dir, dentry->d_name.name, dentry->d_name.len,
btrfs_ino(BTRFS_I(dir)), objectid,
S_IFDIR | mode, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
goto out_fail;
}
/* these must be set before we unlock the inode */
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
goto out_fail;
btrfs_i_size_write(BTRFS_I(inode), 0);
err = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (err)
goto out_fail;
err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
dentry->d_name.name,
dentry->d_name.len, 0, index);
if (err)
goto out_fail;
d_instantiate_new(dentry, inode);
out_fail:
btrfs_end_transaction(trans);
if (err && inode) {
inode_dec_link_count(inode);
discard_new_inode(inode);
}
btrfs_btree_balance_dirty(fs_info); return err;
}
static noinline int uncompress_inline(struct btrfs_path *path,
struct page *page,
size_t pg_offset, u64 extent_offset,
struct btrfs_file_extent_item *item)
{
int ret;
struct extent_buffer *leaf = path->nodes[0];
char *tmp;
size_t max_size;
unsigned long inline_size;
unsigned long ptr;
int compress_type;
WARN_ON(pg_offset != 0);
compress_type = btrfs_file_extent_compression(leaf, item);
max_size = btrfs_file_extent_ram_bytes(leaf, item);
inline_size = btrfs_file_extent_inline_item_len(leaf,
btrfs_item_nr(path->slots[0]));
tmp = kmalloc(inline_size, GFP_NOFS);
if (!tmp)
return -ENOMEM;
ptr = btrfs_file_extent_inline_start(item);
read_extent_buffer(leaf, tmp, ptr, inline_size);
max_size = min_t(unsigned long, PAGE_SIZE, max_size);
ret = btrfs_decompress(compress_type, tmp, page,
extent_offset, inline_size, max_size);
/*
* decompression code contains a memset to fill in any space between the end
* of the uncompressed data and the end of max_size in case the decompressed
* data ends up shorter than ram_bytes. That doesn't cover the hole between
* the end of an inline extent and the beginning of the next block, so we
* cover that region here.
*/
if (max_size + pg_offset < PAGE_SIZE)
memzero_page(page, pg_offset + max_size,
PAGE_SIZE - max_size - pg_offset);
kfree(tmp); return ret;
}
/**
* btrfs_get_extent - Lookup the first extent overlapping a range in a file.
* @inode: file to search in
* @page: page to read extent data into if the extent is inline
* @pg_offset: offset into @page to copy to
* @start: file offset
* @len: length of range starting at @start
*
* This returns the first &struct extent_map which overlaps with the given
* range, reading it from the B-tree and caching it if necessary. Note that
* there may be more extents which overlap the given range after the returned
* extent_map.
*
* If @page is not NULL and the extent is inline, this also reads the extent
* data directly into the page and marks the extent up to date in the io_tree.
*
* Return: ERR_PTR on error, non-NULL extent_map on success.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
int extent_type = -1;
struct btrfs_path *path = NULL;
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *item;
struct extent_buffer *leaf;
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_io_tree *io_tree = &inode->io_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
read_unlock(&em_tree->lock);
if (em) {
if (em->start > start || em->start + em->len <= start)
free_extent_map(em);
else if (em->block_start == EXTENT_MAP_INLINE && page) free_extent_map(em);
else
goto out;
}
em = alloc_extent_map();
if (!em) {
ret = -ENOMEM;
goto out;
}
em->start = EXTENT_MAP_HOLE;
em->orig_start = EXTENT_MAP_HOLE;
em->len = (u64)-1;
em->block_len = (u64)-1;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
/* Chances are we'll be called again, so go ahead and do readahead */
path->reada = READA_FORWARD;
/*
* The same explanation in load_free_space_cache applies here as well,
* we only read when we're loading the free space cache, and at that
* point the commit_root has everything we need.
*/
if (btrfs_is_free_space_inode(inode)) {
path->search_commit_root = 1;
path->skip_locking = 1;
}
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
goto out;
} else if (ret > 0) { if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
ret = 0;
}
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != objectid ||
found_key.type != BTRFS_EXTENT_DATA_KEY) {
/*
* If we backup past the first extent we want to move forward
* and see if there is an extent in front of us, otherwise we'll
* say there is a hole for our whole search range which can
* cause problems.
*/
extent_end = start;
goto next;
}
extent_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
extent_end = btrfs_file_extent_end(path);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
if (!S_ISREG(inode->vfs_inode.i_mode)) {
ret = -EUCLEAN;
btrfs_crit(fs_info,
"regular/prealloc extent found for non-regular inode %llu",
btrfs_ino(inode));
goto out;
}
trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
extent_start);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
path->slots[0],
extent_start);
}
next:
if (start >= extent_end) { path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
else if (ret > 0)
goto not_found;
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != objectid ||
found_key.type != BTRFS_EXTENT_DATA_KEY)
goto not_found;
if (start + len <= found_key.offset)
goto not_found;
if (start > found_key.offset)
goto next;
/* New extent overlaps with existing one */
em->start = start;
em->orig_start = start;
em->len = found_key.offset - start;
em->block_start = EXTENT_MAP_HOLE;
goto insert;
}
btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
goto insert;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
unsigned long ptr;
char *map;
size_t size;
size_t extent_offset;
size_t copy_size;
if (!page)
goto out;
size = btrfs_file_extent_ram_bytes(leaf, item);
extent_offset = page_offset(page) + pg_offset - extent_start;
copy_size = min_t(u64, PAGE_SIZE - pg_offset,
size - extent_offset);
em->start = extent_start + extent_offset;
em->len = ALIGN(copy_size, fs_info->sectorsize);
em->orig_block_len = em->len;
em->orig_start = em->start;
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
if (!PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
if (ret)
goto out;
} else {
map = kmap_local_page(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
copy_size);
if (pg_offset + copy_size < PAGE_SIZE) {
memset(map + pg_offset + copy_size, 0,
PAGE_SIZE - pg_offset -
copy_size);
}
kunmap_local(map);
}
flush_dcache_page(page);
}
set_extent_uptodate(io_tree, em->start,
extent_map_end(em) - 1, NULL, GFP_NOFS);
goto insert;
}
not_found:
em->start = start;
em->orig_start = start;
em->len = len;
em->block_start = EXTENT_MAP_HOLE;
insert:
ret = 0;
btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
ret = -EIO;
goto out;
}
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
trace_btrfs_get_extent(root, inode, em);
if (ret) { free_extent_map(em); return ERR_PTR(ret);
}
return em;
}
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len)
{
struct extent_map *em;
struct extent_map *hole_em = NULL;
u64 delalloc_start = start;
u64 end;
u64 delalloc_len;
u64 delalloc_end;
int err = 0;
em = btrfs_get_extent(inode, NULL, 0, start, len);
if (IS_ERR(em))
return em;
/*
* If our em maps to:
* - a hole or
* - a pre-alloc extent,
* there might actually be delalloc bytes behind it.
*/
if (em->block_start != EXTENT_MAP_HOLE && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
return em;
else
hole_em = em;
/* check to see if we've wrapped (len == -1 or similar) */
end = start + len;
if (end < start)
end = (u64)-1;
else
end -= 1;
em = NULL;
/* ok, we didn't find anything, lets look for delalloc */
delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
end, len, EXTENT_DELALLOC, 1);
delalloc_end = delalloc_start + delalloc_len;
if (delalloc_end < delalloc_start)
delalloc_end = (u64)-1;
/*
* We didn't find anything useful, return the original results from
* get_extent()
*/
if (delalloc_start > end || delalloc_end <= start) {
em = hole_em;
hole_em = NULL;
goto out;
}
/*
* Adjust the delalloc_start to make sure it doesn't go backwards from
* the start they passed in
*/
delalloc_start = max(start, delalloc_start);
delalloc_len = delalloc_end - delalloc_start;
if (delalloc_len > 0) {
u64 hole_start;
u64 hole_len;
const u64 hole_end = extent_map_end(hole_em); em = alloc_extent_map();
if (!em) {
err = -ENOMEM;
goto out;
}
ASSERT(hole_em);
/*
* When btrfs_get_extent can't find anything it returns one
* huge hole
*
* Make sure what it found really fits our range, and adjust to
* make sure it is based on the start from the caller
*/
if (hole_end <= start || hole_em->start > end) { free_extent_map(hole_em);
hole_em = NULL;
} else {
hole_start = max(hole_em->start, start); hole_len = hole_end - hole_start;
}
if (hole_em && delalloc_start > hole_start) {
/*
* Our hole starts before our delalloc, so we have to
* return just the parts of the hole that go until the
* delalloc starts
*/
em->len = min(hole_len, delalloc_start - hole_start);
em->start = hole_start;
em->orig_start = hole_start;
/*
* Don't adjust block start at all, it is fixed at
* EXTENT_MAP_HOLE
*/
em->block_start = hole_em->block_start;
em->block_len = hole_len;
if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
} else {
/*
* Hole is out of passed range or it starts after
* delalloc range
*/
em->start = delalloc_start;
em->len = delalloc_len;
em->orig_start = delalloc_start;
em->block_start = EXTENT_MAP_DELALLOC;
em->block_len = delalloc_len;
}
} else {
return hole_em;
}
out:
free_extent_map(hole_em); if (err) {
free_extent_map(em);
return ERR_PTR(err);
}
return em;
}
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
const u64 start,
const u64 len,
const u64 orig_start,
const u64 block_start,
const u64 block_len,
const u64 orig_block_len,
const u64 ram_bytes,
const int type)
{
struct extent_map *em = NULL;
int ret;
if (type != BTRFS_ORDERED_NOCOW) { em = create_io_em(inode, start, len, orig_start, block_start,
block_len, orig_block_len, ram_bytes,
BTRFS_COMPRESS_NONE, /* compress_type */
type);
if (IS_ERR(em))
goto out;
}
ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len,
block_len, type);
if (ret) {
if (em) { free_extent_map(em);
btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
}
em = ERR_PTR(ret);
}
out:
return em;
}
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
u64 start, u64 len)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_map *em;
struct btrfs_key ins;
u64 alloc_hint;
int ret;
alloc_hint = get_extent_allocation_hint(inode, start, len);
ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
0, alloc_hint, &ins, 1, 1);
if (ret)
return ERR_PTR(ret); em = btrfs_create_dio_extent(inode, start, ins.offset, start,
ins.objectid, ins.offset, ins.offset,
ins.offset, BTRFS_ORDERED_REGULAR);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(em)) btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
1);
return em;
}
static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct btrfs_block_group *block_group;
bool readonly = false;
block_group = btrfs_lookup_block_group(fs_info, bytenr); if (!block_group || block_group->ro)
readonly = true;
if (block_group)
btrfs_put_block_group(block_group);
return readonly;
}
/*
* Check if we can do nocow write into the range [@offset, @offset + @len)
*
* @offset: File offset
* @len: The length to write, will be updated to the nocow writeable
* range
* @orig_start: (optional) Return the original file offset of the file extent
* @orig_len: (optional) Return the original on-disk length of the file extent
* @ram_bytes: (optional) Return the ram_bytes of the file extent
* @strict: if true, omit optimizations that might force us into unnecessary
* cow. e.g., don't trust generation number.
*
* Return:
* >0 and update @len if we can do nocow write
* 0 if we can't do nocow write
* <0 if error happened
*
* NOTE: This only checks the file extents, caller is responsible to wait for
* any ordered extents.
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes, bool strict)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_path *path;
int ret;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 disk_bytenr;
u64 backref_offset;
u64 extent_end;
u64 num_bytes;
int slot;
int found_type;
bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(inode)), offset, 0);
if (ret < 0)
goto out;
slot = path->slots[0];
if (ret == 1) {
if (slot == 0) {
/* can't find the item, must cow */
ret = 0;
goto out;
}
slot--;
}
ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != btrfs_ino(BTRFS_I(inode)) || key.type != BTRFS_EXTENT_DATA_KEY) {
/* not our file or wrong item type, must cow */
goto out;
}
if (key.offset > offset) {
/* Wrong offset, must cow */
goto out;
}
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(leaf, fi);
if (found_type != BTRFS_FILE_EXTENT_REG &&
found_type != BTRFS_FILE_EXTENT_PREALLOC) {
/* not a regular extent, must cow */
goto out;
}
if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
goto out;
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
if (extent_end <= offset)
goto out;
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
if (disk_bytenr == 0)
goto out;
if (btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi))
goto out;
/*
* Do the same check as in btrfs_cross_ref_exist but without the
* unnecessary search.
*/
if (!strict &&
(btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item)))
goto out;
backref_offset = btrfs_file_extent_offset(leaf, fi);
if (orig_start) {
*orig_start = key.offset - backref_offset;
*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
}
if (btrfs_extent_readonly(fs_info, disk_bytenr))
goto out;
num_bytes = min(offset + *len, extent_end) - offset; if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
range_end = round_up(offset + num_bytes,
root->fs_info->sectorsize) - 1;
ret = test_range_bit(io_tree, offset, range_end,
EXTENT_DELALLOC, 0, NULL);
if (ret) {
ret = -EAGAIN;
goto out;
}
}
btrfs_release_path(path);
/*
* look for other files referencing this extent, if we
* find any we must cow
*/
ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
key.offset - backref_offset, disk_bytenr,
strict);
if (ret) {
ret = 0;
goto out;
}
/*
* adjust disk_bytenr and num_bytes to cover just the bytes
* in this extent we are about to write. If there
* are any csums in that range we have to cow in order
* to keep the csums correct
*/
disk_bytenr += backref_offset;
disk_bytenr += offset - key.offset;
if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
goto out;
/*
* all of the above have passed, it is safe to overwrite this extent
* without cow
*/
*len = num_bytes;
ret = 1;
out:
btrfs_free_path(path); return ret;
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
struct extent_state **cached_state, bool writing)
{
struct btrfs_ordered_extent *ordered;
int ret = 0;
while (1) {
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure there's no ordered
* extents in this range.
*/
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
lockend - lockstart + 1);
/*
* We need to make sure there are no buffered pages in this
* range either, we could have raced between the invalidate in
* generic_file_direct_write and locking the extent. The
* invalidate needs to happen so that reads after a write do not
* get stale data.
*/
if (!ordered && (!writing || !filemap_range_has_page(inode->i_mapping,
lockstart, lockend)))
break;
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
if (ordered) {
/*
* If we are doing a DIO read and the ordered extent we
* found is for a buffered write, we can not wait for it
* to complete and retry, because if we do so we can
* deadlock with concurrent buffered writes on page
* locks. This happens only if our DIO read covers more
* than one extent map, if at this point has already
* created an ordered extent for a previous extent map
* and locked its range in the inode's io tree, and a
* concurrent write against that previous extent map's
* range and this range started (we unlock the ranges
* in the io tree only when the bios complete and
* buffered writes always lock pages before attempting
* to lock range in the io tree).
*/
if (writing ||
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) btrfs_start_ordered_extent(ordered, 1);
else
ret = -ENOTBLK;
btrfs_put_ordered_extent(ordered);
} else {
/*
* We could trigger writeback for this range (and wait
* for it to complete) and then invalidate the pages for
* this range (through invalidate_inode_pages2_range()),
* but that can lead us to a deadlock with a concurrent
* call to readahead (a buffered read or a defrag call
* triggered a readahead) on a page lock due to an
* ordered dio extent we created before but did not have
* yet a corresponding bio submitted (whence it can not
* complete), which makes readahead wait for that
* ordered extent to complete while holding a lock on
* that page.
*/
ret = -ENOTBLK;
}
if (ret)
break;
cond_resched();
}
return ret;
}
/* The callers of this must take lock_extent() */
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
u64 len, u64 orig_start, u64 block_start,
u64 block_len, u64 orig_block_len,
u64 ram_bytes, int compress_type,
int type)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
int ret;
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
type == BTRFS_ORDERED_COMPRESSED ||
type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_REGULAR);
em_tree = &inode->extent_tree; em = alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
em->start = start;
em->orig_start = orig_start;
em->len = len;
em->block_len = block_len;
em->block_start = block_start;
em->orig_block_len = orig_block_len;
em->ram_bytes = ram_bytes;
em->generation = -1;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
if (type == BTRFS_ORDERED_PREALLOC) {
set_bit(EXTENT_FLAG_FILLING, &em->flags);
} else if (type == BTRFS_ORDERED_COMPRESSED) {
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
em->compress_type = compress_type;
}
do {
btrfs_drop_extent_cache(inode, em->start,
em->start + em->len - 1, 0);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
/*
* The caller has taken lock_extent(), who could race with us
* to add em?
*/
} while (ret == -EEXIST);
if (ret) { free_extent_map(em);
return ERR_PTR(ret);
}
/* em got 2 refs now, callers needs to do free_extent_map once. */
return em;
}
static int btrfs_get_blocks_direct_write(struct extent_map **map,
struct inode *inode,
struct btrfs_dio_data *dio_data,
u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em = *map;
int type;
u64 block_start, orig_start, orig_block_len, ram_bytes;
bool can_nocow = false;
bool space_reserved = false;
u64 prev_len;
int ret = 0;
/*
* We don't allocate a new extent in the following cases
*
* 1) The inode is marked as NODATACOW. In this case we'll just use the
* existing extent.
* 2) The extent is marked as PREALLOC. We're good to go here and can
* just use the extent.
*
*/
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) {
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
type = BTRFS_ORDERED_PREALLOC;
else
type = BTRFS_ORDERED_NOCOW;
len = min(len, em->len - (start - em->start));
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
&orig_block_len, &ram_bytes, false) == 1 &&
btrfs_inc_nocow_writers(fs_info, block_start))
can_nocow = true;
}
prev_len = len;
if (can_nocow) {
struct extent_map *em2;
/* We can NOCOW, so only need to reserve metadata space. */
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
if (ret < 0) {
/* Our caller expects us to free the input extent map. */
free_extent_map(em);
*map = NULL;
btrfs_dec_nocow_writers(fs_info, block_start);
goto out;
}
space_reserved = true;
em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
orig_start, block_start,
len, orig_block_len,
ram_bytes, type);
btrfs_dec_nocow_writers(fs_info, block_start);
if (type == BTRFS_ORDERED_PREALLOC) {
free_extent_map(em);
*map = em = em2;
}
if (IS_ERR(em2)) {
ret = PTR_ERR(em2);
goto out;
}
} else {
/* Our caller expects us to free the input extent map. */
free_extent_map(em);
*map = NULL;
/* We have to COW, so need to reserve metadata and data space. */
ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
&dio_data->data_reserved,
start, len);
if (ret < 0)
goto out;
space_reserved = true;
em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
}
*map = em;
len = min(len, em->len - (start - em->start));
if (len < prev_len)
btrfs_delalloc_release_space(BTRFS_I(inode),
dio_data->data_reserved,
start + len, prev_len - len,
true);
}
/*
* We have created our ordered extent, so we can now release our reservation
* for an outstanding extent.
*/
btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
out:
if (ret && space_reserved) { btrfs_delalloc_release_extents(BTRFS_I(inode), len);
if (can_nocow) {
btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
} else {
btrfs_delalloc_release_space(BTRFS_I(inode),
dio_data->data_reserved,
start, len, true);
extent_changeset_free(dio_data->data_reserved);
dio_data->data_reserved = NULL;
}
}
return ret;
}
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
loff_t length, unsigned int flags, struct iomap *iomap,
struct iomap *srcmap)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = NULL;
u64 lockstart, lockend;
const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
u64 len = length;
bool unlock_extents = false;
if (!write)
len = min_t(u64, len, fs_info->sectorsize); lockstart = start;
lockend = start + len - 1;
/*
* The generic stuff only does filemap_write_and_wait_range, which
* isn't enough if we've written compressed pages to this area, so we
* need to flush the dirty pages again to make absolutely sure that any
* outstanding dirty pages are on disk.
*/
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags)) {
ret = filemap_fdatawrite_range(inode->i_mapping, start,
start + length - 1);
if (ret)
return ret;
}
dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
if (!dio_data)
return -ENOMEM;
iomap->private = dio_data;
/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
ret = -ENOTBLK;
goto err;
}
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto unlock_err;
}
/*
* Ok for INLINE and COMPRESSED extents we need to fallback on buffered
* io. INLINE is special, and we could probably kludge it in here, but
* it's still buffered so for safety lets just fall back to the generic
* buffered path.
*
* For COMPRESSED we _have_ to read the entire extent in so we can
* decompress it, so there will be buffering required no matter what we
* do, so go ahead and fallback to buffered.
*
* We return -ENOTBLK because that's what makes DIO go ahead and go back
* to buffered IO. Don't blame me, this is the price we pay for using
* the generic code.
*/
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em);
ret = -ENOTBLK;
goto unlock_err;
}
len = min(len, em->len - (start - em->start));
/*
* If we have a NOWAIT request and the range contains multiple extents
* (or a mix of extents and holes), then we return -EAGAIN to make the
* caller fallback to a context where it can do a blocking (without
* NOWAIT) request. This way we avoid doing partial IO and returning
* success to the caller, which is not optimal for writes and for reads
* it can result in unexpected behaviour for an application.
*
* When doing a read, because we use IOMAP_DIO_PARTIAL when calling
* iomap_dio_rw(), we can end up returning less data then what the caller
* asked for, resulting in an unexpected, and incorrect, short read.
* That is, the caller asked to read N bytes and we return less than that,
* which is wrong unless we are crossing EOF. This happens if we get a
* page fault error when trying to fault in pages for the buffer that is
* associated to the struct iov_iter passed to iomap_dio_rw(), and we
* have previously submitted bios for other extents in the range, in
* which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
* those bios have completed by the time we get the page fault error,
* which we return back to our caller - we should only return EIOCBQUEUED
* after we have submitted bios for all the extents in the range.
*/
if ((flags & IOMAP_NOWAIT) && len < length) { free_extent_map(em);
ret = -EAGAIN;
goto unlock_err;
}
if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
start, len);
if (ret < 0)
goto unlock_err;
unlock_extents = true;
/* Recalc len in case the new em is smaller than requested */
len = min(len, em->len - (start - em->start));
} else {
/*
* We need to unlock only the end area that we aren't using.
* The rest is going to be unlocked by the endio routine.
*/
lockstart = start + len;
if (lockstart < lockend)
unlock_extents = true;
}
if (unlock_extents)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
lockstart, lockend, &cached_state);
else
free_extent_state(cached_state);
/*
* Translate extent map information to iomap.
* We trim the extents (and move the addr) even though iomap code does
* that, since we have locked only the parts we are performing I/O in.
*/
if ((em->block_start == EXTENT_MAP_HOLE) || (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else {
iomap->addr = em->block_start + (start - em->start);
iomap->type = IOMAP_MAPPED;
}
iomap->offset = start;
iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
iomap->length = len;
if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
iomap->flags |= IOMAP_F_ZONE_APPEND;
free_extent_map(em);
return 0;
unlock_err:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
kfree(dio_data); return ret;
}
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
ssize_t written, unsigned int flags, struct iomap *iomap)
{
int ret = 0;
struct btrfs_dio_data *dio_data = iomap->private;
size_t submitted = dio_data->submitted;
const bool write = !!(flags & IOMAP_WRITE);
if (!write && (iomap->type == IOMAP_HOLE)) {
/* If reading from a hole, unlock and return */
unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
goto out;
}
if (submitted < length) { pos += submitted;
length -= submitted;
if (write)
__endio_write_update_ordered(BTRFS_I(inode), pos,
length, false);
else
unlock_extent(&BTRFS_I(inode)->io_tree, pos,
pos + length - 1);
ret = -ENOTBLK;
}
if (write)
extent_changeset_free(dio_data->data_reserved);
out:
kfree(dio_data);
iomap->private = NULL;
return ret;
}
static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
{
/*
* This implies a barrier so that stores to dio_bio->bi_status before
* this and loads of dio_bio->bi_status after this are fully ordered.
*/
if (!refcount_dec_and_test(&dip->refs))
return;
if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
__endio_write_update_ordered(BTRFS_I(dip->inode),
dip->logical_offset,
dip->bytes,
!dip->dio_bio->bi_status);
} else {
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
dip->logical_offset,
dip->logical_offset + dip->bytes - 1);
}
bio_endio(dip->dio_bio);
kfree(dip);
}
static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
int mirror_num,
unsigned long bio_flags)
{
struct btrfs_dio_private *dip = bio->bi_private;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
blk_status_t ret;
BUG_ON(bio_op(bio) == REQ_OP_WRITE);
ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
if (ret)
return ret;
refcount_inc(&dip->refs);
ret = btrfs_map_bio(fs_info, bio, mirror_num);
if (ret)
refcount_dec(&dip->refs);
return ret;
}
static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
struct btrfs_io_bio *io_bio,
const bool uptodate)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct bio_vec bvec;
struct bvec_iter iter;
u64 start = io_bio->logical;
u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
__bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
unsigned int i, nr_sectors, pgoff;
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
pgoff = bvec.bv_offset;
for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
(!csum || !check_data_csum(inode, io_bio,
bio_offset, bvec.bv_page,
pgoff, start))) {
clean_io_failure(fs_info, failure_tree, io_tree,
start, bvec.bv_page,
btrfs_ino(BTRFS_I(inode)),
pgoff);
} else {
int ret;
ASSERT((start - io_bio->logical) < UINT_MAX);
ret = btrfs_repair_one_sector(inode,
&io_bio->bio,
start - io_bio->logical,
bvec.bv_page, pgoff,
start, io_bio->mirror_num,
submit_dio_repair_bio);
if (ret)
err = errno_to_blk_status(ret);
}
start += sectorsize;
ASSERT(bio_offset + sectorsize > bio_offset);
bio_offset += sectorsize;
pgoff += sectorsize;
}
}
return err;
}
static void __endio_write_update_ordered(struct btrfs_inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate)
{
btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
finish_ordered_fn, uptodate);
}
static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
struct bio *bio,
u64 dio_file_offset)
{
return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1);
}
static void btrfs_end_dio_bio(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
blk_status_t err = bio->bi_status;
if (err)
btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
"direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
bio->bi_opf, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
if (bio_op(bio) == REQ_OP_READ) {
err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio),
!err);
}
if (err)
dip->dio_bio->bi_status = err;
btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
}
static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
struct inode *inode, u64 file_offset, int async_submit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_private *dip = bio->bi_private;
bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
blk_status_t ret;
/* Check btrfs_submit_bio_hook() for rules about async submit. */
if (async_submit) async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); if (!write) { ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
if (ret)
goto err;
}
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
goto map;
if (write && async_submit) { ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
btrfs_submit_bio_start_direct_io);
goto err;
} else if (write) {
/*
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1);
if (ret)
goto err;
} else {
u64 csum_offset;
csum_offset = file_offset - dip->logical_offset;
csum_offset >>= fs_info->sectorsize_bits;
csum_offset *= fs_info->csum_size;
btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
}
map:
ret = btrfs_map_bio(fs_info, bio, 0);
err:
return ret;
}
/*
* If this succeeds, the btrfs_dio_private is responsible for cleaning up locked
* or ordered extents whether or not we submit any bios.
*/
static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
struct inode *inode,
loff_t file_offset)
{
const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
size_t dip_size;
struct btrfs_dio_private *dip;
dip_size = sizeof(*dip);
if (!write && csum) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
size_t nblocks;
nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits;
dip_size += fs_info->csum_size * nblocks;
}
dip = kzalloc(dip_size, GFP_NOFS);
if (!dip)
return NULL;
dip->inode = inode;
dip->logical_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
refcount_set(&dip->refs, 1);
return dip;
}
static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter,
struct bio *dio_bio, loff_t file_offset)
{
struct inode *inode = iter->inode;
const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
BTRFS_BLOCK_GROUP_RAID56_MASK);
struct btrfs_dio_private *dip;
struct bio *bio;
u64 start_sector;
int async_submit = 0;
u64 submit_len;
u64 clone_offset = 0;
u64 clone_len;
u64 logical;
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
struct btrfs_dio_data *dio_data = iter->iomap.private;
struct extent_map *em = NULL;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
if (!dip) {
if (!write) {
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
file_offset + dio_bio->bi_iter.bi_size - 1);
}
dio_bio->bi_status = BLK_STS_RESOURCE;
bio_endio(dio_bio);
return BLK_QC_T_NONE;
}
if (!write) {
/*
* Load the csums up front to reduce csum tree searches and
* contention when submitting bios.
*
* If we have csums disabled this will do nothing.
*/
status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
if (status != BLK_STS_OK)
goto out_err;
}
start_sector = dio_bio->bi_iter.bi_sector;
submit_len = dio_bio->bi_iter.bi_size;
do {
logical = start_sector << 9;
em = btrfs_get_chunk_map(fs_info, logical, submit_len);
if (IS_ERR(em)) {
status = errno_to_blk_status(PTR_ERR(em));
em = NULL;
goto out_err_em;
}
ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
logical, &geom);
if (ret) {
status = errno_to_blk_status(ret);
goto out_err_em;
}
clone_len = min(submit_len, geom.len);
ASSERT(clone_len <= UINT_MAX);
/*
* This will never fail as it's passing GPF_NOFS and
* the allocation is backed by btrfs_bioset.
*/
bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
file_offset);
if (status) {
bio_put(bio);
goto out_err;
}
}
ASSERT(submit_len >= clone_len);
submit_len -= clone_len;
/*
* Increase the count before we submit the bio so we know
* the end IO handler won't happen before we increase the
* count. Otherwise, the dip might get freed before we're
* done setting it up.
*
* We transfer the initial reference to the last bio, so we
* don't need to increment the reference count for the last one.
*/
if (submit_len > 0) {
refcount_inc(&dip->refs);
/*
* If we are submitting more than one bio, submit them
* all asynchronously. The exception is RAID 5 or 6, as
* asynchronous checksums make it difficult to collect
* full stripe writes.
*/
if (!raid56)
async_submit = 1;
}
status = btrfs_submit_dio_bio(bio, inode, file_offset,
async_submit);
if (status) { bio_put(bio);
if (submit_len > 0)
refcount_dec(&dip->refs);
goto out_err_em;
}
dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
free_extent_map(em);
} while (submit_len > 0);
return BLK_QC_T_NONE;
out_err_em:
free_extent_map(em);
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
return BLK_QC_T_NONE;
}
const struct iomap_ops btrfs_dio_iomap_ops = {
.iomap_begin = btrfs_dio_iomap_begin,
.iomap_end = btrfs_dio_iomap_end,
};
const struct iomap_dio_ops btrfs_dio_ops = {
.submit_io = btrfs_submit_direct,
};
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int ret;
ret = fiemap_prep(inode, fieinfo, start, &len, 0);
if (ret)
return ret;
return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
int btrfs_readpage(struct file *file, struct page *page)
{
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
struct btrfs_bio_ctrl bio_ctrl = { 0 };
int ret;
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
if (bio_ctrl.bio)
ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); return ret;
}
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
int ret;
if (current->flags & PF_MEMALLOC) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
/*
* If we are under memory pressure we will call this directly from the
* VM, we need to make sure we have the inode referenced for the ordered
* extent. If not just return like we didn't do anything.
*/
if (!igrab(inode)) {
redirty_page_for_writepage(wbc, page);
return AOP_WRITEPAGE_ACTIVATE;
}
ret = extent_write_full_page(page, wbc);
btrfs_add_delayed_iput(inode);
return ret;
}
static int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
return extent_writepages(mapping, wbc);
}
static void btrfs_readahead(struct readahead_control *rac)
{
extent_readahead(rac);
}
/*
* For releasepage() and invalidatepage() we have a race window where
* end_page_writeback() is called but the subpage spinlock is not yet released.
* If we continue to release/invalidate the page, we could cause use-after-free
* for subpage spinlock. So this function is to spin and wait for subpage
* spinlock.
*/
static void wait_subpage_spinlock(struct page *page)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
struct btrfs_subpage *subpage;
if (fs_info->sectorsize == PAGE_SIZE)
return;
ASSERT(PagePrivate(page) && page->private);
subpage = (struct btrfs_subpage *)page->private;
/*
* This may look insane as we just acquire the spinlock and release it,
* without doing anything. But we just want to make sure no one is
* still holding the subpage spinlock.
* And since the page is not dirty nor writeback, and we have page
* locked, the only possible way to hold a spinlock is from the endio
* function to clear page writeback.
*
* Here we just acquire the spinlock so that all existing callers
* should exit and we're safe to release/invalidate the page.
*/
spin_lock_irq(&subpage->lock);
spin_unlock_irq(&subpage->lock);
}
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
int ret = try_release_extent_mapping(page, gfp_flags); if (ret == 1) {
wait_subpage_spinlock(page);
clear_page_extent_mapped(page);
}
return ret;
}
static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
if (PageWriteback(page) || PageDirty(page))
return 0;
return __btrfs_releasepage(page, gfp_flags);
}
#ifdef CONFIG_MIGRATION
static int btrfs_migratepage(struct address_space *mapping,
struct page *newpage, struct page *page,
enum migrate_mode mode)
{
int ret;
ret = migrate_page_move_mapping(mapping, newpage, page, 0);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
if (page_has_private(page))
attach_page_private(newpage, detach_page_private(page));
if (PageOrdered(page)) {
ClearPageOrdered(page);
SetPageOrdered(newpage);
}
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
#endif
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_SIZE - 1;
u64 cur;
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
/*
* We have page locked so no new ordered extent can be created on this
* page, nor bio can be submitted for this page.
*
* But already submitted bio can still be finished on this page.
* Furthermore, endio function won't skip page which has Ordered
* (Private2) already cleared, so it's possible for endio and
* invalidatepage to do the same ordered extent accounting twice
* on one page.
*
* So here we wait for any submitted bios to finish, so that we won't
* do double ordered extent accounting on the same page.
*/
wait_on_page_writeback(page);
wait_subpage_spinlock(page);
/*
* For subpage case, we have call sites like
* btrfs_punch_hole_lock_range() which passes range not aligned to
* sectorsize.
* If the range doesn't cover the full page, we don't need to and
* shouldn't clear page extent mapped, as page->private can still
* record subpage dirty bits for other part of the range.
*
* For cases that can invalidate the full even the range doesn't
* cover the full page, like invalidating the last page, we're
* still safe to wait for ordered extent to finish.
*/
if (!(offset == 0 && length == PAGE_SIZE)) { btrfs_releasepage(page, GFP_NOFS); return;
}
if (!inode_evicting)
lock_extent_bits(tree, page_start, page_end, &cached_state);
cur = page_start;
while (cur < page_end) {
struct btrfs_ordered_extent *ordered;
bool delete_states;
u64 range_end;
u32 range_len;
ordered = btrfs_lookup_first_ordered_range(inode, cur,
page_end + 1 - cur);
if (!ordered) {
range_end = page_end;
/*
* No ordered extent covering this range, we are safe
* to delete all extent states in the range.
*/
delete_states = true;
goto next;
}
if (ordered->file_offset > cur) {
/*
* There is a range between [cur, oe->file_offset) not
* covered by any ordered extent.
* We are safe to delete all extent states, and handle
* the ordered extent in the next iteration.
*/
range_end = ordered->file_offset - 1;
delete_states = true;
goto next;
}
range_end = min(ordered->file_offset + ordered->num_bytes - 1,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) {
/*
* If Ordered (Private2) is cleared, it means endio has
* already been executed for the range.
* We can't delete the extent states as
* btrfs_finish_ordered_io() may still use some of them.
*/
delete_states = false;
goto next;
}
btrfs_page_clear_ordered(fs_info, page, cur, range_len);
/*
* IO on this page will never be started, so we need to account
* for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
* here, must leave that up for the ordered extent completion.
*
* This will also unlock the range for incoming
* btrfs_finish_ordered_io().
*/
if (!inode_evicting)
clear_extent_bit(tree, cur, range_end,
EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state);
spin_lock_irq(&inode->ordered_tree.lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
ordered->truncated_len = min(ordered->truncated_len,
cur - ordered->file_offset);
spin_unlock_irq(&inode->ordered_tree.lock);
if (btrfs_dec_test_ordered_pending(inode, &ordered,
cur, range_end + 1 - cur)) {
btrfs_finish_ordered_io(ordered);
/*
* The ordered extent has finished, now we're again
* safe to delete all extent states of the range.
*/
delete_states = true;
} else {
/*
* btrfs_finish_ordered_io() will get executed by endio
* of other pages, thus we can't delete extent states
* anymore
*/
delete_states = false;
}
next:
if (ordered) btrfs_put_ordered_extent(ordered);
/*
* Qgroup reserved space handler
* Sector(s) here will be either:
*
* 1) Already written to disk or bio already finished
* Then its QGROUP_RESERVED bit in io_tree is already cleared.
* Qgroup will be handled by its qgroup_record then.
* btrfs_qgroup_free_data() call will do nothing here.
*
* 2) Not written to disk yet
* Then btrfs_qgroup_free_data() call will clear the
* QGROUP_RESERVED bit of its io_tree, and free the qgroup
* reserved data space.
* Since the IO will never happen for this page.
*/
btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
if (!inode_evicting) {
clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_UPTODATE |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
delete_states, &cached_state);
}
cur = range_end + 1;
}
/*
* We have iterated through all ordered extents of the page, the page
* should not have Ordered (Private2) anymore, or the above iteration
* did something wrong.
*/
ASSERT(!PageOrdered(page));
if (!inode_evicting)
__btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
clear_page_extent_mapped(page);
}
/*
* btrfs_page_mkwrite() is not allowed to change the file size as it gets
* called from a page fault handler when a page is first dirtied. Hence we must
* be careful to check for EOF conditions here. We set the page up correctly
* for a written page which means we get ENOSPC checking when writing into
* holes and correct delalloc and unwritten extent mapping on filesystems that
* support these features.
*
* We are not allowed to take the i_mutex here so we have to play games to
* protect against truncate races as the page could now be beyond EOF. Because
* truncate_setsize() writes the inode size before removing pages, once we have
* the page lock we can determine safely if the page is beyond EOF. If it is not
* beyond EOF, then the page is guaranteed safe against truncation until we
* unlock the page.
*/
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
unsigned long zero_start;
loff_t size;
vm_fault_t ret;
int ret2;
int reserved = 0;
u64 reserved_space;
u64 page_start;
u64 page_end;
u64 end;
reserved_space = PAGE_SIZE;
sb_start_pagefault(inode->i_sb);
page_start = page_offset(page);
page_end = page_start + PAGE_SIZE - 1;
end = page_end;
/*
* Reserving delalloc space after obtaining the page lock can lead to
* deadlock. For example, if a dirty page is locked by this function
* and the call to btrfs_delalloc_reserve_space() ends up triggering
* dirty page write out, then the btrfs_writepage() function could
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
page_start, reserved_space);
if (!ret2) {
ret2 = file_update_time(vmf->vma->vm_file);
reserved = 1;
}
if (ret2) {
ret = vmf_error(ret2);
if (reserved)
goto out;
goto out_noreserve;
}
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
down_read(&BTRFS_I(inode)->i_mmap_lock);
lock_page(page);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
(page_start >= size)) {
/* page got truncated out from underneath us */
goto out_unlock;
}
wait_on_page_writeback(page);
lock_extent_bits(io_tree, page_start, page_end, &cached_state);
ret2 = set_page_extent_mapped(page);
if (ret2 < 0) {
ret = vmf_error(ret2);
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
goto out_unlock;
}
/*
* we can't set the delalloc bits if there are pending ordered
* extents. Drop our locks and wait for them to finish
*/
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
PAGE_SIZE);
if (ordered) {
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
if (page->index == ((size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start,
fs_info->sectorsize);
if (reserved_space < PAGE_SIZE) {
end = page_start + reserved_space - 1;
btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved, page_start,
PAGE_SIZE - reserved_space, true);
}
}
/*
* page_mkwrite gets called when the page is firstly dirtied after it's
* faulted in, but write(2) could also dirty a page and set delalloc
* bits, thus in this case for space account reason, we still need to
* clear any delalloc bits within this page range since we have to
* reserve data&meta space before lock_page() (see above comments).
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 0, 0, &cached_state);
ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
&cached_state);
if (ret2) {
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state);
ret = VM_FAULT_SIGBUS;
goto out_unlock;
}
/* page is wholly or partially inside EOF */
if (page_start + PAGE_SIZE > size)
zero_start = offset_in_page(size);
else
zero_start = PAGE_SIZE;
if (zero_start != PAGE_SIZE) {
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
flush_dcache_page(page);
}
ClearPageChecked(page);
btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
out_unlock:
unlock_page(page);
up_read(&BTRFS_I(inode)->i_mmap_lock);
out:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
reserved_space, (ret != 0));
out_noreserve:
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return ret;
}
static int btrfs_truncate(struct inode *inode, bool skip_writeback)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
int ret;
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
u64 extents_found = 0;
if (!skip_writeback) {
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
(u64)-1);
if (ret)
return ret;
}
/*
* Yes ladies and gentlemen, this is indeed ugly. We have a couple of
* things going on here:
*
* 1) We need to reserve space to update our inode.
*
* 2) We need to have something to cache all the space that is going to
* be free'd up by the truncate operation, but also have some slack
* space reserved in case it uses space during the truncate (thank you
* very much snapshotting).
*
* And we need these to be separate. The fact is we can use a lot of
* space doing the truncate, and we have no earthly idea how much space
* we will use, so we need the truncate reservation to be separate so it
* doesn't end up using space reserved for updating the inode. We also
* need to be able to stop the transaction and start a new one, which
* means we need to be able to update the inode several times, and we
* have no idea of knowing how many times that will be, so we can't just
* reserve 1 item for the entirety of the operation, so that has to be
* done separately as well.
*
* So that leaves us with
*
* 1) rsv - for the truncate reservation, which we will steal from the
* transaction reservation.
* 2) fs_info->trans_block_rsv - this will have 1 items worth left for
* updating the inode.
*/
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
return -ENOMEM;
rsv->size = min_size;
rsv->failfast = 1;
/*
* 1 for the truncate slack space
* 1 for updating the inode.
*/
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
/* Migrate the slack space for the truncate to our reserve */
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
min_size, false);
BUG_ON(ret); trans->block_rsv = rsv;
while (1) {
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
inode->i_size,
BTRFS_EXTENT_DATA_KEY,
&extents_found);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
break;
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
break;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
break;
}
btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv;
}
/*
* We can't call btrfs_truncate_block inside a trans handle as we could
* deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know
* we've truncated everything except the last little bit, and can do
* btrfs_truncate_block and then update the disk_i_size.
*/
if (ret == NEED_TRUNCATE_BLOCK) { btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
if (ret)
goto out;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
if (trans) {
int ret2;
trans->block_rsv = &fs_info->trans_block_rsv;
ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret2 && !ret)
ret = ret2;
ret2 = btrfs_end_transaction(trans); if (ret2 && !ret)
ret = ret2;
btrfs_btree_balance_dirty(fs_info);
}
out:
btrfs_free_block_rsv(fs_info, rsv);
/*
* So if we truncate and then write and fsync we normally would just
* write the extents that changed, which is a problem if we need to
* first truncate that entire inode. So set this flag so we write out
* all of the extents in the inode to the sync log so we're completely
* safe.
*
* If no extents were dropped or trimmed we don't need to force the next
* fsync to truncate all the inode's items from the log and re-log them
* all. This means the truncate operation did not change the file size,
* or changed it to a smaller size but there was only an implicit hole
* between the old i_size and the new i_size, and there were no prealloc
* extents beyond i_size to drop.
*/
if (extents_found > 0) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
return ret;
}
/*
* create a new subvolume directory/inode (helper for the ioctl).
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
struct btrfs_root *parent_root,
struct user_namespace *mnt_userns)
{
struct inode *inode;
int err;
u64 index = 0;
u64 ino;
err = btrfs_get_free_objectid(new_root, &ino);
if (err < 0)
return err;
inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
ino, ino,
S_IFDIR | (~current_umask() & S_IRWXUGO),
&index);
if (IS_ERR(inode))
return PTR_ERR(inode);
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
set_nlink(inode, 1);
btrfs_i_size_write(BTRFS_I(inode), 0);
unlock_new_inode(inode);
err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
if (err)
btrfs_err(new_root->fs_info,
"error inheriting subvolume %llu properties: %d",
new_root->root_key.objectid, err);
err = btrfs_update_inode(trans, new_root, BTRFS_I(inode));
iput(inode);
return err;
}
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_inode *ei;
struct inode *inode;
ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->root = NULL;
ei->generation = 0;
ei->last_trans = 0;
ei->last_sub_trans = 0;
ei->logged_trans = 0;
ei->delalloc_bytes = 0;
ei->new_delalloc_bytes = 0;
ei->defrag_bytes = 0;
ei->disk_i_size = 0;
ei->flags = 0;
ei->ro_flags = 0;
ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_reflink_trans = 0;
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
ei->outstanding_extents = 0;
if (sb->s_magic != BTRFS_TEST_MAGIC)
btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
BTRFS_BLOCK_RSV_DELALLOC);
ei->runtime_flags = 0;
ei->prop_compress = BTRFS_COMPRESS_NONE;
ei->defrag_compress = BTRFS_COMPRESS_NONE;
ei->delayed_node = NULL;
ei->i_otime.tv_sec = 0;
ei->i_otime.tv_nsec = 0;
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
extent_io_tree_init(fs_info, &ei->io_failure_tree,
IO_TREE_INODE_IO_FAILURE, inode);
extent_io_tree_init(fs_info, &ei->file_extent_tree,
IO_TREE_INODE_FILE_EXTENT, inode);
ei->io_tree.track_uptodate = true;
ei->io_failure_tree.track_uptodate = true;
atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
init_rwsem(&ei->i_mmap_lock);
return inode;
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode)
{
btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
void btrfs_free_inode(struct inode *inode)
{
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
void btrfs_destroy_inode(struct inode *vfs_inode)
{
struct btrfs_ordered_extent *ordered;
struct btrfs_inode *inode = BTRFS_I(vfs_inode);
struct btrfs_root *root = inode->root;
WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); WARN_ON(vfs_inode->i_data.nrpages); WARN_ON(inode->block_rsv.reserved); WARN_ON(inode->block_rsv.size); WARN_ON(inode->outstanding_extents); WARN_ON(inode->delalloc_bytes); WARN_ON(inode->new_delalloc_bytes); WARN_ON(inode->csum_bytes); WARN_ON(inode->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
* created the same inode and we need to destroy the one we already
* created.
*/
if (!root)
return;
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
if (!ordered)
break;
else {
btrfs_err(root->fs_info,
"found ordered extent %llu %llu on inode cleanup",
ordered->file_offset, ordered->num_bytes);
btrfs_remove_ordered_extent(inode, ordered);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
}
btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
btrfs_put_root(inode->root);
}
int btrfs_drop_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
if (root == NULL)
return 1;
/* the snap/subvol tree is on deleting */
if (btrfs_root_refs(&root->root_item) == 0)
return 1;
else
return generic_drop_inode(inode);
}
static void init_once(void *foo)
{
struct btrfs_inode *ei = (struct btrfs_inode *) foo;
inode_init_once(&ei->vfs_inode);
}
void __cold btrfs_destroy_cachep(void)
{
/*
* Make sure all delayed rcu free inodes are flushed before we
* destroy cache.
*/
rcu_barrier();
kmem_cache_destroy(btrfs_inode_cachep);
kmem_cache_destroy(btrfs_trans_handle_cachep);
kmem_cache_destroy(btrfs_path_cachep);
kmem_cache_destroy(btrfs_free_space_cachep);
kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}
int __init btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
init_once);
if (!btrfs_inode_cachep)
goto fail;
btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
sizeof(struct btrfs_trans_handle), 0,
SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
if (!btrfs_trans_handle_cachep)
goto fail;
btrfs_path_cachep = kmem_cache_create("btrfs_path",
sizeof(struct btrfs_path), 0,
SLAB_MEM_SPREAD, NULL);
if (!btrfs_path_cachep)
goto fail;
btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
sizeof(struct btrfs_free_space), 0,
SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_cachep)
goto fail;
btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
PAGE_SIZE, PAGE_SIZE,
SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_bitmap_cachep)
goto fail;
return 0;
fail:
btrfs_destroy_cachep();
return -ENOMEM;
}
static int btrfs_getattr(struct user_namespace *mnt_userns,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
{
u64 delalloc_bytes;
u64 inode_bytes;
struct inode *inode = d_inode(path->dentry);
u32 blocksize = inode->i_sb->s_blocksize;
u32 bi_flags = BTRFS_I(inode)->flags;
u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
if (bi_flags & BTRFS_INODE_APPEND)
stat->attributes |= STATX_ATTR_APPEND; if (bi_flags & BTRFS_INODE_COMPRESS) stat->attributes |= STATX_ATTR_COMPRESSED; if (bi_flags & BTRFS_INODE_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (bi_flags & BTRFS_INODE_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; if (bi_ro_flags & BTRFS_INODE_RO_VERITY) stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
generic_fillattr(mnt_userns, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
inode_bytes = inode_get_bytes(inode);
spin_unlock(&BTRFS_I(inode)->lock);
stat->blocks = (ALIGN(inode_bytes, blocksize) +
ALIGN(delalloc_bytes, blocksize)) >> 9;
return 0;
}
static int btrfs_rename_exchange(struct inode *old_dir,
struct dentry *old_dentry,
struct inode *new_dir,
struct dentry *new_dentry)
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
bool need_abort = false;
/*
* For non-subvolumes allow exchange only within one subvolume, in the
* same inode namespace. Two subvolumes (represented as directory) can
* be exchanged as they're a logical link and have a fixed inode number.
*/
if (root != dest && (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
new_ino != BTRFS_FIRST_FREE_OBJECTID))
return -EXDEV;
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
new_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&fs_info->subvol_sem);
/*
* We want to reserve the absolute worst case amount of items. So if
* both inodes are subvols and we need to unlink them then that would
* require 4 item modifications, but if they are both normal inodes it
* would require 5 item modifications, so we'll assume their normal
* inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items
* should cover the worst case number of items we'll modify.
*/
trans = btrfs_start_transaction(root, 12);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_notrans;
}
if (dest != root) { ret = btrfs_record_root_in_trans(trans, dest);
if (ret)
goto out_fail;
}
/*
* We need to find a free sequence number both in the source and
* in the destination directory for the exchange.
*/
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
if (ret)
goto out_fail;
ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
if (ret)
goto out_fail;
BTRFS_I(old_inode)->dir_index = 0ULL;
BTRFS_I(new_inode)->dir_index = 0ULL;
/* Reference for the source. */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
old_ino,
btrfs_ino(BTRFS_I(new_dir)),
old_idx);
if (ret)
goto out_fail;
need_abort = true;
}
/* And now for the dest. */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
ret = btrfs_insert_inode_ref(trans, root,
old_dentry->d_name.name,
old_dentry->d_name.len,
new_ino,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
if (ret) {
if (need_abort) btrfs_abort_transaction(trans, ret);
goto out_fail;
}
}
/* Update inode version and ctime/mtime. */
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
inode_inc_iversion(new_inode);
old_dir->i_ctime = old_dir->i_mtime = ctime;
new_dir->i_ctime = new_dir->i_mtime = ctime;
old_inode->i_ctime = ctime;
new_inode->i_ctime = ctime;
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), 1);
btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
BTRFS_I(new_inode), 1);
}
/*
* Now pin the logs of the roots. We do it to ensure that no other task
* can sync the logs while we are in progress with the rename, because
* that could result in an inconsistency in case any of the inodes that
* are part of this rename operation were logged before.
*
* We pin the logs even if at this precise moment none of the inodes was
* logged before. This is because right after we checked for that, some
* other task fsyncing some other inode not involved with this rename
* operation could log that one of our inodes exists.
*
* We don't need to pin the logs before the above calls to
* btrfs_insert_inode_ref(), since those don't ever need to change a log.
*/
if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { btrfs_pin_log_trans(root);
root_log_pinned = true;
}
if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { btrfs_pin_log_trans(dest);
dest_log_pinned = true;
}
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else { /* src is an inode */
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_dentry->d_name.name,
old_dentry->d_name.len);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
if (ret) { btrfs_abort_transaction(trans, ret);
goto out_fail;
}
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
} else { /* dest is an inode */
ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_dentry->d_name.name, new_dentry->d_name.len);
if (!ret)
ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
}
if (ret) { btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
new_dentry->d_name.name,
new_dentry->d_name.len, 0, old_idx);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
old_dentry->d_name.name,
old_dentry->d_name.len, 0, new_idx);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = old_idx; if (new_inode->i_nlink == 1) BTRFS_I(new_inode)->dir_index = new_idx; if (root_log_pinned) { btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
new_dentry->d_parent);
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) { btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
old_dentry->d_parent);
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
out_fail:
/*
* If we have pinned a log and an error happened, we unpin tasks
* trying to sync the log and force them to fallback to a transaction
* commit if the log currently contains any of the inodes involved in
* this rename operation (to ensure we do not persist a log with an
* inconsistent state for any of these inodes or leading to any
* inconsistencies when replayed). If the transaction was aborted, the
* abortion reason is propagated to userspace when attempting to commit
* the transaction. If the log does not contain any of these inodes, we
* allow the tasks to sync it.
*/
if (ret && (root_log_pinned || dest_log_pinned)) {
if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
btrfs_set_log_full_commit(trans);
if (root_log_pinned) {
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
}
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
return ret;
}
static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct user_namespace *mnt_userns,
struct inode *dir,
struct dentry *dentry)
{
int ret;
struct inode *inode;
u64 objectid;
u64 index;
ret = btrfs_get_free_objectid(root, &objectid);
if (ret)
return ret;
inode = btrfs_new_inode(trans, root, mnt_userns, dir,
dentry->d_name.name,
dentry->d_name.len,
btrfs_ino(BTRFS_I(dir)),
objectid,
S_IFCHR | WHITEOUT_MODE,
&index);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
return ret;
}
inode->i_op = &btrfs_special_inode_operations;
init_special_inode(inode, inode->i_mode,
WHITEOUT_DEV);
ret = btrfs_init_inode_security(trans, inode, dir,
&dentry->d_name);
if (ret)
goto out;
ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
BTRFS_I(inode), 0, index);
if (ret)
goto out;
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
unlock_new_inode(inode);
if (ret)
inode_dec_link_count(inode);
iput(inode);
return ret;
}
static int btrfs_rename(struct user_namespace *mnt_userns,
struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
struct btrfs_trans_handle *trans;
unsigned int trans_num_items;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = d_inode(new_dentry);
struct inode *old_inode = d_inode(old_dentry);
u64 index = 0;
int ret;
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
return -ENOTEMPTY;
if (S_ISDIR(old_inode->i_mode) && new_inode && new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
/* check for collisions, even if the name isn't there */
ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
new_dentry->d_name.name,
new_dentry->d_name.len);
if (ret) {
if (ret == -EEXIST) {
/* we shouldn't get
* eexist without a new_inode */
if (WARN_ON(!new_inode)) {
return ret;
}
} else {
/* maybe -EOVERFLOW */
return ret;
}
}
ret = 0;
/*
* we're using rename to replace one file with another. Start IO on it
* now so we don't add too much work to the end of the transaction
*/
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) filemap_flush(old_inode->i_mapping);
/* close the racy window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&fs_info->subvol_sem);
/*
* We want to reserve the absolute worst case amount of items. So if
* both inodes are subvols and we need to unlink them then that would
* require 4 item modifications, but if they are both normal inodes it
* would require 5 item modifications, so we'll assume they are normal
* inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
* should cover the worst case number of items we'll modify.
* If our rename has the whiteout flag, we need more 5 units for the
* new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
* when selinux is enabled).
*/
trans_num_items = 11;
if (flags & RENAME_WHITEOUT)
trans_num_items += 5;
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_notrans;
}
if (dest != root) { ret = btrfs_record_root_in_trans(trans, dest);
if (ret)
goto out_fail;
}
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
if (ret)
goto out_fail;
BTRFS_I(old_inode)->dir_index = 0ULL;
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
old_ino,
btrfs_ino(BTRFS_I(new_dir)), index);
if (ret)
goto out_fail;
}
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
old_dir->i_ctime = old_dir->i_mtime =
new_dir->i_ctime = new_dir->i_mtime =
old_inode->i_ctime = current_time(old_dir);
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), 1);
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
/*
* Now pin the log. We do it to ensure that no other task can
* sync the log while we are in progress with the rename, as
* that could result in an inconsistency in case any of the
* inodes that are part of this rename operation were logged
* before.
*
* We pin the log even if at this precise moment none of the
* inodes was logged before. This is because right after we
* checked for that, some other task fsyncing some other inode
* not involved with this rename operation could log that one of
* our inodes exists.
*
* We don't need to pin the logs before the above call to
* btrfs_insert_inode_ref(), since that does not need to change
* a log.
*/
btrfs_pin_log_trans(root);
log_pinned = true;
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
old_dentry->d_name.len);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
if (ret) { btrfs_abort_transaction(trans, ret);
goto out_fail;
}
if (new_inode) {
inode_inc_iversion(new_inode);
new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
new_dentry->d_name.name,
new_dentry->d_name.len);
}
if (!ret && new_inode->i_nlink == 0)
ret = btrfs_orphan_add(trans,
BTRFS_I(d_inode(new_dentry)));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
}
ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
new_dentry->d_name.name,
new_dentry->d_name.len, 0, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; if (log_pinned) { btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
new_dentry->d_parent);
btrfs_end_log_trans(root);
log_pinned = false;
}
if (flags & RENAME_WHITEOUT) {
ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
old_dir, old_dentry);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_fail;
}
}
out_fail:
/*
* If we have pinned the log and an error happened, we unpin tasks
* trying to sync the log and force them to fallback to a transaction
* commit if the log currently contains any of the inodes involved in
* this rename operation (to ensure we do not persist a log with an
* inconsistent state for any of these inodes or leading to any
* inconsistencies when replayed). If the transaction was aborted, the
* abortion reason is propagated to userspace when attempting to commit
* the transaction. If the log does not contain any of these inodes, we
* allow the tasks to sync it.
*/
if (ret && log_pinned) {
if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
(new_inode &&
btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
btrfs_set_log_full_commit(trans);
btrfs_end_log_trans(root);
log_pinned = false;
}
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem);
return ret;
}
static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags)
{
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE)
return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
new_dentry);
return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
new_dentry, flags);
}
struct btrfs_delalloc_work {
struct inode *inode;
struct completion completion;
struct list_head list;
struct btrfs_work work;
};
static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
struct btrfs_delalloc_work *delalloc_work;
struct inode *inode;
delalloc_work = container_of(work, struct btrfs_delalloc_work,
work);
inode = delalloc_work->inode;
filemap_flush(inode->i_mapping);
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
iput(inode);
complete(&delalloc_work->completion);
}
static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
{
struct btrfs_delalloc_work *work;
work = kmalloc(sizeof(*work), GFP_NOFS);
if (!work)
return NULL;
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
return work;
}
/*
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
static int start_delalloc_inodes(struct btrfs_root *root,
struct writeback_control *wbc, bool snapshot,
bool in_reclaim_context)
{
struct btrfs_inode *binode;
struct inode *inode;
struct btrfs_delalloc_work *work, *next;
struct list_head works;
struct list_head splice;
int ret = 0;
bool full_flush = wbc->nr_to_write == LONG_MAX;
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
mutex_lock(&root->delalloc_mutex);
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
binode = list_entry(splice.next, struct btrfs_inode,
delalloc_inodes);
list_move_tail(&binode->delalloc_inodes,
&root->delalloc_inodes);
if (in_reclaim_context &&
test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
continue;
inode = igrab(&binode->vfs_inode);
if (!inode) {
cond_resched_lock(&root->delalloc_lock);
continue;
}
spin_unlock(&root->delalloc_lock);
if (snapshot)
set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
&binode->runtime_flags);
if (full_flush) {
work = btrfs_alloc_delalloc_work(inode);
if (!work) {
iput(inode);
ret = -ENOMEM;
goto out;
}
list_add_tail(&work->list, &works);
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
btrfs_add_delayed_iput(inode);
if (ret || wbc->nr_to_write <= 0)
goto out;
}
cond_resched();
spin_lock(&root->delalloc_lock);
}
spin_unlock(&root->delalloc_lock);
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
wait_for_completion(&work->completion);
kfree(work);
}
if (!list_empty(&splice)) {
spin_lock(&root->delalloc_lock);
list_splice_tail(&splice, &root->delalloc_inodes);
spin_unlock(&root->delalloc_lock);
}
mutex_unlock(&root->delalloc_mutex);
return ret;
}
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
.sync_mode = WB_SYNC_NONE,
.range_start = 0,
.range_end = LLONG_MAX,
};
struct btrfs_fs_info *fs_info = root->fs_info;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
}
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context)
{
struct writeback_control wbc = {
.nr_to_write = nr,
.sync_mode = WB_SYNC_NONE,
.range_start = 0,
.range_end = LLONG_MAX,
};
struct btrfs_root *root;
struct list_head splice;
int ret;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
INIT_LIST_HEAD(&splice);
mutex_lock(&fs_info->delalloc_root_mutex);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
while (!list_empty(&splice)) {
/*
* Reset nr_to_write here so we know that we're doing a full
* flush.
*/
if (nr == LONG_MAX)
wbc.nr_to_write = LONG_MAX;
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_root(root);
BUG_ON(!root);
list_move_tail(&root->delalloc_root,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
btrfs_put_root(root);
if (ret < 0 || wbc.nr_to_write <= 0)
goto out;
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
ret = 0;
out:
if (!list_empty(&splice)) {
spin_lock(&fs_info->delalloc_root_lock);
list_splice_tail(&splice, &fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
}
mutex_unlock(&fs_info->delalloc_root_mutex);
return ret;
}
static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, const char *symname)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
struct btrfs_key key;
struct inode *inode = NULL;
int err;
u64 objectid;
u64 index = 0;
int name_len;
int datasize;
unsigned long ptr;
struct btrfs_file_extent_item *ei;
struct extent_buffer *leaf;
name_len = strlen(symname);
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
return -ENAMETOOLONG;
/*
* 2 items for inode item and ref
* 2 items for dir items
* 1 item for updating parent inode item
* 1 item for the inline extent item
* 1 item for xattr if selinux is on
*/
trans = btrfs_start_transaction(root, 7);
if (IS_ERR(trans))
return PTR_ERR(trans);
err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_unlock;
inode = btrfs_new_inode(trans, root, mnt_userns, dir, dentry->d_name.name, dentry->d_name.len,
btrfs_ino(BTRFS_I(dir)), objectid,
S_IFLNK | S_IRWXUGO, &index);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
inode = NULL;
goto out_unlock;
}
/*
* If the active LSM wants to access the inode during
* d_instantiate it needs these. Smack checks to see
* if the filesystem supports xattrs by looking at the
* ops vector.
*/
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
goto out_unlock;
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
goto out_unlock;
}
key.objectid = btrfs_ino(BTRFS_I(inode));
key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(name_len);
err = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
if (err) {
btrfs_free_path(path);
goto out_unlock;
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, ei, trans->transid);
btrfs_set_file_extent_type(leaf, ei,
BTRFS_FILE_EXTENT_INLINE);
btrfs_set_file_extent_encryption(leaf, ei, 0);
btrfs_set_file_extent_compression(leaf, ei, 0);
btrfs_set_file_extent_other_encoding(leaf, ei, 0);
btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
ptr = btrfs_file_extent_inline_start(ei);
write_extent_buffer(leaf, symname, ptr, name_len);
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
inode->i_op = &btrfs_symlink_inode_operations;
inode_nohighmem(inode);
inode_set_bytes(inode, name_len);
btrfs_i_size_write(BTRFS_I(inode), name_len);
err = btrfs_update_inode(trans, root, BTRFS_I(inode));
/*
* Last step, add directory indexes for our symlink inode. This is the
* last step to avoid extra cleanup of these indexes if an error happens
* elsewhere above.
*/
if (!err)
err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
BTRFS_I(inode), 0, index);
if (err)
goto out_unlock;
d_instantiate_new(dentry, inode);
out_unlock:
btrfs_end_transaction(trans); if (err && inode) {
inode_dec_link_count(inode);
discard_new_inode(inode);
}
btrfs_btree_balance_dirty(fs_info); return err;
}
static struct btrfs_trans_handle *insert_prealloc_file_extent(
struct btrfs_trans_handle *trans_in,
struct btrfs_inode *inode,
struct btrfs_key *ins,
u64 file_offset)
{
struct btrfs_file_extent_item stack_fi;
struct btrfs_replace_extent_info extent_info;
struct btrfs_trans_handle *trans = trans_in;
struct btrfs_path *path;
u64 start = ins->objectid;
u64 len = ins->offset;
int qgroup_released;
int ret;
memset(&stack_fi, 0, sizeof(stack_fi));
btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
/* Encryption and other encoding is reserved and all 0 */
qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
if (qgroup_released < 0)
return ERR_PTR(qgroup_released); if (trans) { ret = insert_reserved_file_extent(trans, inode,
file_offset, &stack_fi,
true, qgroup_released);
if (ret)
goto free_qgroup;
return trans;
}
extent_info.disk_offset = start;
extent_info.disk_len = len;
extent_info.data_offset = 0;
extent_info.data_len = len;
extent_info.file_offset = file_offset;
extent_info.extent_buf = (char *)&stack_fi;
extent_info.is_new_extent = true;
extent_info.qgroup_reserved = qgroup_released;
extent_info.insertions = 0;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto free_qgroup;
}
ret = btrfs_replace_file_extents(inode, path, file_offset,
file_offset + len - 1, &extent_info,
&trans);
btrfs_free_path(path);
if (ret)
goto free_qgroup;
return trans;
free_qgroup:
/*
* We have released qgroup data range at the beginning of the function,
* and normally qgroup_released bytes will be freed when committing
* transaction.
* But if we error out early, we have to free what we have released
* or we leak qgroup data reservation.
*/
btrfs_qgroup_free_refroot(inode->root->fs_info,
inode->root->root_key.objectid, qgroup_released,
BTRFS_QGROUP_RSV_DATA);
return ERR_PTR(ret);
}
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint,
struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 cur_offset = start;
u64 clear_offset = start;
u64 i_size;
u64 cur_bytes;
u64 last_alloc = (u64)-1;
int ret = 0;
bool own_trans = true;
u64 end = start + num_bytes - 1;
if (trans)
own_trans = false;
while (num_bytes > 0) { cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
/*
* If we are severely fragmented we could end up with really
* small allocations, so if the allocator is returning small
* chunks lets make its job easier by only searching for those
* sized chunks.
*/
cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
min_size, 0, *alloc_hint, &ins, 1, 0);
if (ret)
break;
/*
* We've reserved this space, and thus converted it from
* ->bytes_may_use to ->bytes_reserved. Any error that happens
* from here on out we will only need to clear our reservation
* for the remaining unreserved area, so advance our
* clear_offset by our extent size.
*/
clear_offset += ins.offset;
last_alloc = ins.offset;
trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
&ins, cur_offset);
/*
* Now that we inserted the prealloc extent we can finally
* decrement the number of reservations in the block group.
* If we did it before, we could race with relocation and have
* relocation miss the reserved extent, making it fail later.
*/
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
ins.offset, 0);
break;
}
btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
cur_offset + ins.offset -1, 0);
em = alloc_extent_map();
if (!em) {
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
goto next;
}
em->start = cur_offset;
em->orig_start = cur_offset;
em->len = ins.offset;
em->block_start = ins.objectid;
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ins.offset;
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
em->generation = trans->transid;
while (1) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 1);
write_unlock(&em_tree->lock);
if (ret != -EEXIST)
break;
btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
cur_offset + ins.offset - 1,
0);
}
free_extent_map(em);
next:
num_bytes -= ins.offset;
cur_offset += ins.offset;
*alloc_hint = ins.objectid + ins.offset;
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(actual_len > inode->i_size) && (cur_offset > inode->i_size)) { if (cur_offset > actual_len)
i_size = actual_len;
else
i_size = cur_offset;
i_size_write(inode, i_size);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
if (own_trans)
btrfs_end_transaction(trans);
break;
}
if (own_trans) { btrfs_end_transaction(trans);
trans = NULL;
}
}
if (clear_offset < end) btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
end - clear_offset + 1);
return ret;
}
int btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint)
{
return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
min_size, actual_len, alloc_hint,
NULL);
}
int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint)
{
return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
min_size, actual_len, alloc_hint, trans);
}
static int btrfs_set_page_dirty(struct page *page)
{
return __set_page_dirty_nobuffers(page);
}
static int btrfs_permission(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
struct btrfs_root *root = BTRFS_I(inode)->root; umode_t mode = inode->i_mode; if (mask & MAY_WRITE &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
if (btrfs_root_readonly(root))
return -EROFS;
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
return -EACCES;
}
return generic_permission(mnt_userns, inode, mask);
}
static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = NULL;
u64 objectid;
u64 index;
int ret = 0;
/*
* 5 units required for adding orphan entry
*/
trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_get_free_objectid(root, &objectid);
if (ret)
goto out;
inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
inode = NULL;
goto out;
}
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
ret = btrfs_init_inode_security(trans, inode, dir, NULL);
if (ret)
goto out;
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
goto out;
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret)
goto out;
/*
* We set number of links to 0 in btrfs_new_inode(), and here we set
* it to 1 because d_tmpfile() will issue a warning if the count is 0,
* through:
*
* d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
*/
set_nlink(inode, 1);
d_tmpfile(dentry, inode);
unlock_new_inode(inode);
mark_inode_dirty(inode);
out:
btrfs_end_transaction(trans);
if (ret && inode)
discard_new_inode(inode); btrfs_btree_balance_dirty(fs_info); return ret;
}
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
u32 len;
ASSERT(end + 1 - start <= U32_MAX);
len = end + 1 - start;
while (index <= end_index) {
page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
btrfs_page_set_writeback(fs_info, page, start, len);
put_page(page);
index++;
}
}
#ifdef CONFIG_SWAP
/*
* Add an entry indicating a block group or device which is pinned by a
* swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
* negative errno on failure.
*/
static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
bool is_block_group)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct btrfs_swapfile_pin *sp, *entry;
struct rb_node **p;
struct rb_node *parent = NULL;
sp = kmalloc(sizeof(*sp), GFP_NOFS);
if (!sp)
return -ENOMEM;
sp->ptr = ptr;
sp->inode = inode;
sp->is_block_group = is_block_group;
sp->bg_extent_count = 1;
spin_lock(&fs_info->swapfile_pins_lock);
p = &fs_info->swapfile_pins.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
if (sp->ptr < entry->ptr ||
(sp->ptr == entry->ptr && sp->inode < entry->inode)) {
p = &(*p)->rb_left;
} else if (sp->ptr > entry->ptr ||
(sp->ptr == entry->ptr && sp->inode > entry->inode)) {
p = &(*p)->rb_right;
} else {
if (is_block_group)
entry->bg_extent_count++;
spin_unlock(&fs_info->swapfile_pins_lock);
kfree(sp);
return 1;
}
}
rb_link_node(&sp->node, parent, p);
rb_insert_color(&sp->node, &fs_info->swapfile_pins);
spin_unlock(&fs_info->swapfile_pins_lock);
return 0;
}
/* Free all of the entries pinned by this swapfile. */
static void btrfs_free_swapfile_pins(struct inode *inode)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct btrfs_swapfile_pin *sp;
struct rb_node *node, *next;
spin_lock(&fs_info->swapfile_pins_lock);
node = rb_first(&fs_info->swapfile_pins);
while (node) {
next = rb_next(node);
sp = rb_entry(node, struct btrfs_swapfile_pin, node);
if (sp->inode == inode) {
rb_erase(&sp->node, &fs_info->swapfile_pins);
if (sp->is_block_group) {
btrfs_dec_block_group_swap_extents(sp->ptr,
sp->bg_extent_count);
btrfs_put_block_group(sp->ptr);
}
kfree(sp);
}
node = next;
}
spin_unlock(&fs_info->swapfile_pins_lock);
}
struct btrfs_swap_info {
u64 start;
u64 block_start;
u64 block_len;
u64 lowest_ppage;
u64 highest_ppage;
unsigned long nr_pages;
int nr_extents;
};
static int btrfs_add_swap_extent(struct swap_info_struct *sis,
struct btrfs_swap_info *bsi)
{
unsigned long nr_pages;
unsigned long max_pages;
u64 first_ppage, first_ppage_reported, next_ppage;
int ret;
/*
* Our swapfile may have had its size extended after the swap header was
* written. In that case activating the swapfile should not go beyond
* the max size set in the swap header.
*/
if (bsi->nr_pages >= sis->max)
return 0;
max_pages = sis->max - bsi->nr_pages;
first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
PAGE_SIZE) >> PAGE_SHIFT;
if (first_ppage >= next_ppage)
return 0;
nr_pages = next_ppage - first_ppage;
nr_pages = min(nr_pages, max_pages);
first_ppage_reported = first_ppage;
if (bsi->start == 0)
first_ppage_reported++;
if (bsi->lowest_ppage > first_ppage_reported)
bsi->lowest_ppage = first_ppage_reported;
if (bsi->highest_ppage < (next_ppage - 1))
bsi->highest_ppage = next_ppage - 1;
ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
if (ret < 0)
return ret;
bsi->nr_extents += ret;
bsi->nr_pages += nr_pages;
return 0;
}
static void btrfs_swap_deactivate(struct file *file)
{
struct inode *inode = file_inode(file);
btrfs_free_swapfile_pins(inode);
atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
}
static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
sector_t *span)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
struct extent_map *em = NULL;
struct btrfs_device *device = NULL;
struct btrfs_swap_info bsi = {
.lowest_ppage = (sector_t)-1ULL,
};
int ret = 0;
u64 isize;
u64 start;
/*
* If the swap file was just created, make sure delalloc is done. If the
* file changes again after this, the user is doing something stupid and
* we don't really care.
*/
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret)
return ret;
/*
* The inode is locked, so these flags won't change after we check them.
*/
if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
btrfs_warn(fs_info, "swapfile must not be compressed");
return -EINVAL;
}
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
btrfs_warn(fs_info, "swapfile must not be copy-on-write");
return -EINVAL;
}
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
btrfs_warn(fs_info, "swapfile must not be checksummed");
return -EINVAL;
}
/*
* Balance or device remove/replace/resize can move stuff around from
* under us. The exclop protection makes sure they aren't running/won't
* run concurrently while we are mapping the swap extents, and
* fs_info->swapfile_pins prevents them from running while the swap
* file is active and moving the extents. Note that this also prevents
* a concurrent device add which isn't actually necessary, but it's not
* really worth the trouble to allow it.
*/
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
return -EBUSY;
}
/*
* Prevent snapshot creation while we are activating the swap file.
* We do not want to race with snapshot creation. If snapshot creation
* already started before we bumped nr_swapfiles from 0 to 1 and
* completes before the first write into the swap file after it is
* activated, than that write would fallback to COW.
*/
if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because snapshot creation is in progress");
return -EINVAL;
}
/*
* Snapshots can create extents which require COW even if NODATACOW is
* set. We use this counter to prevent snapshots. We must increment it
* before walking the extents because we don't want a concurrent
* snapshot to run after we've already checked the extents.
*
* It is possible that subvolume is marked for deletion but still not
* removed yet. To prevent this race, we check the root status before
* activating the swapfile.
*/
spin_lock(&root->root_item_lock);
if (btrfs_root_dead(root)) {
spin_unlock(&root->root_item_lock);
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because subvolume %llu is being deleted",
root->root_key.objectid);
return -EPERM;
}
atomic_inc(&root->nr_swapfiles);
spin_unlock(&root->root_item_lock);
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
start = 0;
while (start < isize) {
u64 logical_block_start, physical_block_start;
struct btrfs_block_group *bg;
u64 len = isize - start;
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
}
if (em->block_start == EXTENT_MAP_HOLE) {
btrfs_warn(fs_info, "swapfile must not have holes");
ret = -EINVAL;
goto out;
}
if (em->block_start == EXTENT_MAP_INLINE) {
/*
* It's unlikely we'll ever actually find ourselves
* here, as a file small enough to fit inline won't be
* big enough to store more than the swap header, but in
* case something changes in the future, let's catch it
* here rather than later.
*/
btrfs_warn(fs_info, "swapfile must not be inline");
ret = -EINVAL;
goto out;
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL;
goto out;
}
logical_block_start = em->block_start + (start - em->start);
len = min(len, em->len - (start - em->start));
free_extent_map(em);
em = NULL;
ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
if (ret < 0) {
goto out;
} else if (ret) {
ret = 0;
} else {
btrfs_warn(fs_info,
"swapfile must not be copy-on-write");
ret = -EINVAL;
goto out;
}
em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
}
if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
btrfs_warn(fs_info,
"swapfile must have single data profile");
ret = -EINVAL;
goto out;
}
if (device == NULL) {
device = em->map_lookup->stripes[0].dev;
ret = btrfs_add_swapfile_pin(inode, device, false);
if (ret == 1)
ret = 0;
else if (ret)
goto out;
} else if (device != em->map_lookup->stripes[0].dev) {
btrfs_warn(fs_info, "swapfile must be on one device");
ret = -EINVAL;
goto out;
}
physical_block_start = (em->map_lookup->stripes[0].physical +
(logical_block_start - em->start));
len = min(len, em->len - (logical_block_start - em->start));
free_extent_map(em);
em = NULL;
bg = btrfs_lookup_block_group(fs_info, logical_block_start);
if (!bg) {
btrfs_warn(fs_info,
"could not find block group containing swapfile");
ret = -EINVAL;
goto out;
}
if (!btrfs_inc_block_group_swap_extents(bg)) {
btrfs_warn(fs_info,
"block group for swapfile at %llu is read-only%s",
bg->start,
atomic_read(&fs_info->scrubs_running) ?
" (scrub running)" : "");
btrfs_put_block_group(bg);
ret = -EINVAL;
goto out;
}
ret = btrfs_add_swapfile_pin(inode, bg, true);
if (ret) {
btrfs_put_block_group(bg);
if (ret == 1)
ret = 0;
else
goto out;
}
if (bsi.block_len &&
bsi.block_start + bsi.block_len == physical_block_start) {
bsi.block_len += len;
} else {
if (bsi.block_len) {
ret = btrfs_add_swap_extent(sis, &bsi);
if (ret)
goto out;
}
bsi.start = start;
bsi.block_start = physical_block_start;
bsi.block_len = len;
}
start += len;
}
if (bsi.block_len)
ret = btrfs_add_swap_extent(sis, &bsi);
out:
if (!IS_ERR_OR_NULL(em))
free_extent_map(em);
unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
if (ret)
btrfs_swap_deactivate(file);
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_exclop_finish(fs_info);
if (ret)
return ret;
if (device)
sis->bdev = device->bdev;
*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
sis->max = bsi.nr_pages;
sis->pages = bsi.nr_pages - 1;
sis->highest_bit = bsi.nr_pages - 1;
return bsi.nr_extents;
}
#else
static void btrfs_swap_deactivate(struct file *file)
{
}
static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
sector_t *span)
{
return -EOPNOTSUPP;
}
#endif
/*
* Update the number of bytes used in the VFS' inode. When we replace extents in
* a range (clone, dedupe, fallocate's zero range), we must update the number of
* bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
* always get a correct value.
*/
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
const u64 add_bytes,
const u64 del_bytes)
{
if (add_bytes == del_bytes)
return;
spin_lock(&inode->lock);
if (del_bytes > 0)
inode_sub_bytes(&inode->vfs_inode, del_bytes); if (add_bytes > 0) inode_add_bytes(&inode->vfs_inode, add_bytes);
spin_unlock(&inode->lock);
}
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
.create = btrfs_create,
.unlink = btrfs_unlink,
.link = btrfs_link,
.mkdir = btrfs_mkdir,
.rmdir = btrfs_rmdir,
.rename = btrfs_rename2,
.symlink = btrfs_symlink,
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,
.listxattr = btrfs_listxattr,
.permission = btrfs_permission,
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.tmpfile = btrfs_tmpfile,
.fileattr_get = btrfs_fileattr_get,
.fileattr_set = btrfs_fileattr_set,
};
static const struct file_operations btrfs_dir_file_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.iterate_shared = btrfs_real_readdir,
.open = btrfs_opendir,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_compat_ioctl,
#endif
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
};
/*
* btrfs doesn't support the bmap operation because swapfiles
* use bmap to make a mapping of extents in the file. They assume
* these extents won't change over the life of the file and they
* use the bmap result to do IO directly to the drive.
*
* the btrfs bmap call would return logical addresses that aren't
* suitable for IO and they also will change frequently as COW
* operations happen. So, swapfile + btrfs == corruption.
*
* For now we're avoiding this by dropping bmap.
*/
static const struct address_space_operations btrfs_aops = {
.readpage = btrfs_readpage,
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.direct_IO = noop_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
.migratepage = btrfs_migratepage,
#endif
.set_page_dirty = btrfs_set_page_dirty,
.error_remove_page = generic_error_remove_page,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,
};
static const struct inode_operations btrfs_file_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.listxattr = btrfs_listxattr,
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.fileattr_get = btrfs_fileattr_get,
.fileattr_set = btrfs_fileattr_set,
};
static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.listxattr = btrfs_listxattr,
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
.get_link = page_get_link,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.listxattr = btrfs_listxattr,
.update_time = btrfs_update_time,
};
const struct dentry_operations btrfs_dentry_operations = {
.d_delete = btrfs_dentry_delete,
};
// SPDX-License-Identifier: GPL-2.0
#include <linux/err.h>
#include <linux/bug.h>
#include <linux/atomic.h>
#include <linux/errseq.h>
#include <linux/log2.h>
/*
* An errseq_t is a way of recording errors in one place, and allowing any
* number of "subscribers" to tell whether it has changed since a previous
* point where it was sampled.
*
* It's implemented as an unsigned 32-bit value. The low order bits are
* designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits
* are used as a counter. This is done with atomics instead of locking so that
* these functions can be called from any context.
*
* The general idea is for consumers to sample an errseq_t value. That value
* can later be used to tell whether any new errors have occurred since that
* sampling was done.
*
* Note that there is a risk of collisions if new errors are being recorded
* frequently, since we have so few bits to use as a counter.
*
* To mitigate this, one bit is used as a flag to tell whether the value has
* been sampled since a new value was recorded. That allows us to avoid bumping
* the counter if no one has sampled it since the last time an error was
* recorded.
*
* A new errseq_t should always be zeroed out. A errseq_t value of all zeroes
* is the special (but common) case where there has never been an error. An all
* zero value thus serves as the "epoch" if one wishes to know whether there
* has ever been an error set since it was first initialized.
*/
/* The low bits are designated for error code (max of MAX_ERRNO) */
#define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1)
/* This bit is used as a flag to indicate whether the value has been seen */
#define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT)
/* The lowest bit of the counter */
#define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1))
/**
* errseq_set - set a errseq_t for later reporting
* @eseq: errseq_t field that should be set
* @err: error to set (must be between -1 and -MAX_ERRNO)
*
* This function sets the error in @eseq, and increments the sequence counter
* if the last sequence was sampled at some point in the past.
*
* Any error set will always overwrite an existing error.
*
* Return: The previous value, primarily for debugging purposes. The
* return value should not be used as a previously sampled value in later
* calls as it will not have the SEEN flag set.
*/
errseq_t errseq_set(errseq_t *eseq, int err)
{
errseq_t cur, old;
/* MAX_ERRNO must be able to serve as a mask */
BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1);
/*
* Ensure the error code actually fits where we want it to go. If it
* doesn't then just throw a warning and don't record anything. We
* also don't accept zero here as that would effectively clear a
* previous error.
*/
old = READ_ONCE(*eseq);
if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO),
"err = %d\n", err))
return old;
for (;;) {
errseq_t new;
/* Clear out error bits and set new error */
new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err;
/* Only increment if someone has looked at it */
if (old & ERRSEQ_SEEN)
new += ERRSEQ_CTR_INC;
/* If there would be no change, then call it done */
if (new == old) {
cur = new;
break;
}
/* Try to swap the new value into place */
cur = cmpxchg(eseq, old, new);
/*
* Call it success if we did the swap or someone else beat us
* to it for the same value.
*/
if (likely(cur == old || cur == new))
break;
/* Raced with an update, try again */
old = cur;
}
return cur;
}
EXPORT_SYMBOL(errseq_set);
/**
* errseq_sample() - Grab current errseq_t value.
* @eseq: Pointer to errseq_t to be sampled.
*
* This function allows callers to initialise their errseq_t variable.
* If the error has been "seen", new callers will not see an old error.
* If there is an unseen error in @eseq, the caller of this function will
* see it the next time it checks for an error.
*
* Context: Any context.
* Return: The current errseq value.
*/
errseq_t errseq_sample(errseq_t *eseq)
{
errseq_t old = READ_ONCE(*eseq);
/* If nobody has seen this error yet, then we can be the first. */
if (!(old & ERRSEQ_SEEN))
old = 0;
return old;
}
EXPORT_SYMBOL(errseq_sample);
/**
* errseq_check() - Has an error occurred since a particular sample point?
* @eseq: Pointer to errseq_t value to be checked.
* @since: Previously-sampled errseq_t from which to check.
*
* Grab the value that eseq points to, and see if it has changed @since
* the given value was sampled. The @since value is not advanced, so there
* is no need to mark the value as seen.
*
* Return: The latest error set in the errseq_t or 0 if it hasn't changed.
*/
int errseq_check(errseq_t *eseq, errseq_t since)
{
errseq_t cur = READ_ONCE(*eseq); if (likely(cur == since))
return 0;
return -(cur & MAX_ERRNO);
}
EXPORT_SYMBOL(errseq_check);
/**
* errseq_check_and_advance() - Check an errseq_t and advance to current value.
* @eseq: Pointer to value being checked and reported.
* @since: Pointer to previously-sampled errseq_t to check against and advance.
*
* Grab the eseq value, and see whether it matches the value that @since
* points to. If it does, then just return 0.
*
* If it doesn't, then the value has changed. Set the "seen" flag, and try to
* swap it into place as the new eseq value. Then, set that value as the new
* "since" value, and return whatever the error portion is set to.
*
* Note that no locking is provided here for concurrent updates to the "since"
* value. The caller must provide that if necessary. Because of this, callers
* may want to do a lockless errseq_check before taking the lock and calling
* this.
*
* Return: Negative errno if one has been stored, or 0 if no new error has
* occurred.
*/
int errseq_check_and_advance(errseq_t *eseq, errseq_t *since)
{
int err = 0;
errseq_t old, new;
/*
* Most callers will want to use the inline wrapper to check this,
* so that the common case of no error is handled without needing
* to take the lock that protects the "since" value.
*/
old = READ_ONCE(*eseq);
if (old != *since) {
/*
* Set the flag and try to swap it into place if it has
* changed.
*
* We don't care about the outcome of the swap here. If the
* swap doesn't occur, then it has either been updated by a
* writer who is altering the value in some way (updating
* counter or resetting the error), or another reader who is
* just setting the "seen" flag. Either outcome is OK, and we
* can advance "since" and return an error based on what we
* have.
*/
new = old | ERRSEQ_SEEN;
if (new != old)
cmpxchg(eseq, old, new);
*since = new;
err = -(new & MAX_ERRNO);
}
return err;
}
EXPORT_SYMBOL(errseq_check_and_advance);
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 1994 Linus Torvalds
*
* Pentium III FXSR, SSE support
* General FPU state handling cleanups
* Gareth Hughes <gareth@valinux.com>, May 2000
*/
#include <asm/fpu/internal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
#include <asm/irq_regs.h>
#include <linux/hardirq.h>
#include <linux/pkeys.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>
/*
* Represents the initial FPU state. It's mostly (but not completely) zeroes,
* depending on the FPU hardware format:
*/
union fpregs_state init_fpstate __ro_after_init;
/*
* Track whether the kernel is using the FPU state
* currently.
*
* This flag is used:
*
* - by IRQ context code to potentially use the FPU
* if it's unused.
*
* - to debug kernel_fpu_begin()/end() correctness
*/
static DEFINE_PER_CPU(bool, in_kernel_fpu);
/*
* Track which context is using the FPU on the CPU:
*/
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
static bool kernel_fpu_disabled(void)
{
return this_cpu_read(in_kernel_fpu);
}
static bool interrupted_kernel_fpu_idle(void)
{
return !kernel_fpu_disabled();
}
/*
* Were we in user mode (or vm86 mode) when we were
* interrupted?
*
* Doing kernel_fpu_begin/end() is ok if we are running
* in an interrupt context from user mode - we'll just
* save the FPU state as required.
*/
static bool interrupted_user_mode(void)
{
struct pt_regs *regs = get_irq_regs();
return regs && user_mode(regs);
}
/*
* Can we use the FPU in kernel mode with the
* whole "kernel_fpu_begin/end()" sequence?
*
* It's always ok in process context (ie "not interrupt")
* but it is sometimes ok even from an irq.
*/
bool irq_fpu_usable(void)
{
return !in_interrupt() ||
interrupted_user_mode() ||
interrupted_kernel_fpu_idle();
}
EXPORT_SYMBOL(irq_fpu_usable);
/*
* Save the FPU register state in fpu->state. The register state is
* preserved.
*
* Must be called with fpregs_lock() held.
*
* The legacy FNSAVE instruction clears all FPU state unconditionally, so
* register state has to be reloaded. That might be a pointless exercise
* when the FPU is going to be used by another task right after that. But
* this only affects 20+ years old 32bit systems and avoids conditionals all
* over the place.
*
* FXSAVE and all XSAVE variants preserve the FPU register state.
*/
void save_fpregs_to_fpstate(struct fpu *fpu)
{
if (likely(use_xsave())) {
os_xsave(&fpu->state.xsave);
/*
* AVX512 state is tracked here because its use is
* known to slow the max clock speed of the core.
*/
if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
fpu->avx512_timestamp = jiffies;
return;
}
if (likely(use_fxsr())) {
fxsave(&fpu->state.fxsave);
return;
}
/*
* Legacy FPU register saving, FNSAVE always clears FPU registers,
* so we have to reload them from the memory state.
*/
asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
frstor(&fpu->state.fsave);
}
EXPORT_SYMBOL(save_fpregs_to_fpstate);
void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask)
{
/*
* AMD K7/K8 and later CPUs up to Zen don't save/restore
* FDP/FIP/FOP unless an exception is pending. Clear the x87 state
* here by setting it to fixed values. "m" is a random variable
* that should be in L1.
*/
if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
asm volatile(
"fnclex\n\t"
"emms\n\t"
"fildl %P[addr]" /* set F?P to defined value */
: : [addr] "m" (fpstate));
}
if (use_xsave()) {
os_xrstor(&fpstate->xsave, mask);
} else {
if (use_fxsr())
fxrstor(&fpstate->fxsave);
else
frstor(&fpstate->fsave);
}
}
EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate);
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
preempt_disable();
WARN_ON_FPU(!irq_fpu_usable());
WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
this_cpu_write(in_kernel_fpu, true);
if (!(current->flags & PF_KTHREAD) &&
!test_thread_flag(TIF_NEED_FPU_LOAD)) {
set_thread_flag(TIF_NEED_FPU_LOAD);
save_fpregs_to_fpstate(¤t->thread.fpu);
}
__cpu_invalidate_fpregs_state();
/* Put sane initial values into the control registers. */
if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
ldmxcsr(MXCSR_DEFAULT);
if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
asm volatile ("fninit");
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);
void kernel_fpu_end(void)
{
WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
this_cpu_write(in_kernel_fpu, false);
preempt_enable();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);
/*
* Sync the FPU register state to current's memory register state when the
* current task owns the FPU. The hardware register state is preserved.
*/
void fpu_sync_fpstate(struct fpu *fpu)
{
WARN_ON_FPU(fpu != ¤t->thread.fpu);
fpregs_lock();
trace_x86_fpu_before_save(fpu);
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
save_fpregs_to_fpstate(fpu);
trace_x86_fpu_after_save(fpu);
fpregs_unlock();
}
static inline void fpstate_init_xstate(struct xregs_state *xsave)
{
/*
* XRSTORS requires these bits set in xcomp_bv, or it will
* trigger #GP:
*/
xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
}
static inline void fpstate_init_fxstate(struct fxregs_state *fx)
{
fx->cwd = 0x37f;
fx->mxcsr = MXCSR_DEFAULT;
}
/*
* Legacy x87 fpstate state init:
*/
static inline void fpstate_init_fstate(struct fregs_state *fp)
{
fp->cwd = 0xffff037fu;
fp->swd = 0xffff0000u;
fp->twd = 0xffffffffu;
fp->fos = 0xffff0000u;
}
void fpstate_init(union fpregs_state *state)
{
if (!static_cpu_has(X86_FEATURE_FPU)) {
fpstate_init_soft(&state->soft);
return;
}
memset(state, 0, fpu_kernel_xstate_size);
if (static_cpu_has(X86_FEATURE_XSAVES))
fpstate_init_xstate(&state->xsave);
if (static_cpu_has(X86_FEATURE_FXSR))
fpstate_init_fxstate(&state->fxsave);
else
fpstate_init_fstate(&state->fsave);
}
EXPORT_SYMBOL_GPL(fpstate_init);
/* Clone current's FPU state on fork */
int fpu_clone(struct task_struct *dst)
{
struct fpu *src_fpu = ¤t->thread.fpu;
struct fpu *dst_fpu = &dst->thread.fpu;
/* The new task's FPU state cannot be valid in the hardware. */
dst_fpu->last_cpu = -1;
if (!cpu_feature_enabled(X86_FEATURE_FPU))
return 0;
/*
* Don't let 'init optimized' areas of the XSAVE area
* leak into the child task:
*/
memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
/*
* If the FPU registers are not owned by current just memcpy() the
* state. Otherwise save the FPU registers directly into the
* child's FPU context, without any memory-to-memory copying.
*/
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size);
else
save_fpregs_to_fpstate(dst_fpu);
fpregs_unlock();
set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
trace_x86_fpu_copy_src(src_fpu);
trace_x86_fpu_copy_dst(dst_fpu);
return 0;
}
/*
* Drops current FPU state: deactivates the fpregs and
* the fpstate. NOTE: it still leaves previous contents
* in the fpregs in the eager-FPU case.
*
* This function can be used in cases where we know that
* a state-restore is coming: either an explicit one,
* or a reschedule.
*/
void fpu__drop(struct fpu *fpu)
{
preempt_disable();
if (fpu == ¤t->thread.fpu) {
/* Ignore delayed exceptions from user space */
asm volatile("1: fwait\n"
"2:\n"
_ASM_EXTABLE(1b, 2b));
fpregs_deactivate(fpu);
}
trace_x86_fpu_dropped(fpu);
preempt_enable();
}
/*
* Clear FPU registers by setting them up from the init fpstate.
* Caller must do fpregs_[un]lock() around it.
*/
static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
{
if (use_xsave())
os_xrstor(&init_fpstate.xsave, features_mask);
else if (use_fxsr())
fxrstor(&init_fpstate.fxsave);
else
frstor(&init_fpstate.fsave);
pkru_write_default();
}
static inline unsigned int init_fpstate_copy_size(void)
{
if (!use_xsave())
return fpu_kernel_xstate_size;
/* XSAVE(S) just needs the legacy and the xstate header part */
return sizeof(init_fpstate.xsave);
}
/*
* Reset current->fpu memory state to the init values.
*/
static void fpu_reset_fpstate(void)
{
struct fpu *fpu = ¤t->thread.fpu;
fpregs_lock();
fpu__drop(fpu);
/*
* This does not change the actual hardware registers. It just
* resets the memory image and sets TIF_NEED_FPU_LOAD so a
* subsequent return to usermode will reload the registers from the
* task's memory image.
*
* Do not use fpstate_init() here. Just copy init_fpstate which has
* the correct content already except for PKRU.
*
* PKRU handling does not rely on the xstate when restoring for
* user space as PKRU is eagerly written in switch_to() and
* flush_thread().
*/
memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size());
set_thread_flag(TIF_NEED_FPU_LOAD);
fpregs_unlock();
}
/*
* Reset current's user FPU states to the init states. current's
* supervisor states, if any, are not modified by this function. The
* caller guarantees that the XSTATE header in memory is intact.
*/
void fpu__clear_user_states(struct fpu *fpu)
{
WARN_ON_FPU(fpu != ¤t->thread.fpu);
fpregs_lock();
if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
fpu_reset_fpstate();
fpregs_unlock();
return;
}
/*
* Ensure that current's supervisor states are loaded into their
* corresponding registers.
*/
if (xfeatures_mask_supervisor() &&
!fpregs_state_valid(fpu, smp_processor_id())) { os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor());
}
/* Reset user states in registers. */
restore_fpregs_from_init_fpstate(xfeatures_mask_restore_user());
/*
* Now all FPU registers have their desired values. Inform the FPU
* state machine that current's FPU registers are in the hardware
* registers. The memory image does not need to be updated because
* any operation relying on it has to save the registers first when
* current's FPU is marked active.
*/
fpregs_mark_activate();
fpregs_unlock();
}
void fpu_flush_thread(void)
{
fpu_reset_fpstate();
}
/*
* Load FPU context before returning to userspace.
*/
void switch_fpu_return(void)
{
if (!static_cpu_has(X86_FEATURE_FPU))
return;
fpregs_restore_userregs();
}
EXPORT_SYMBOL_GPL(switch_fpu_return);
#ifdef CONFIG_X86_DEBUG_FPU
/*
* If current FPU state according to its tracking (loaded FPU context on this
* CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is
* loaded on return to userland.
*/
void fpregs_assert_state_consistent(void)
{
struct fpu *fpu = ¤t->thread.fpu;
if (test_thread_flag(TIF_NEED_FPU_LOAD))
return;
WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
}
EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
#endif
void fpregs_mark_activate(void)
{
struct fpu *fpu = ¤t->thread.fpu;
fpregs_activate(fpu);
fpu->last_cpu = smp_processor_id();
clear_thread_flag(TIF_NEED_FPU_LOAD);
}
EXPORT_SYMBOL_GPL(fpregs_mark_activate);
/*
* x87 math exception handling:
*/
int fpu__exception_code(struct fpu *fpu, int trap_nr)
{
int err;
if (trap_nr == X86_TRAP_MF) {
unsigned short cwd, swd;
/*
* (~cwd & swd) will mask out exceptions that are not set to unmasked
* status. 0x3f is the exception bits in these regs, 0x200 is the
* C1 reg you need in case of a stack fault, 0x040 is the stack
* fault bit. We should only be taking one exception at a time,
* so if this combination doesn't produce any single exception,
* then we have a bad program that isn't synchronizing its FPU usage
* and it will suffer the consequences since we won't be able to
* fully reproduce the context of the exception.
*/
if (boot_cpu_has(X86_FEATURE_FXSR)) {
cwd = fpu->state.fxsave.cwd;
swd = fpu->state.fxsave.swd;
} else {
cwd = (unsigned short)fpu->state.fsave.cwd;
swd = (unsigned short)fpu->state.fsave.swd;
}
err = swd & ~cwd;
} else {
/*
* The SIMD FPU exceptions are handled a little differently, as there
* is only a single status/control register. Thus, to determine which
* unmasked exception was caught we must mask the exception mask bits
* at 0x1f80, and then use these to mask the exception bits at 0x3f.
*/
unsigned short mxcsr = MXCSR_DEFAULT;
if (boot_cpu_has(X86_FEATURE_XMM))
mxcsr = fpu->state.fxsave.mxcsr;
err = ~(mxcsr >> 7) & mxcsr;
}
if (err & 0x001) { /* Invalid op */
/*
* swd & 0x240 == 0x040: Stack Underflow
* swd & 0x240 == 0x240: Stack Overflow
* User must clear the SF bit (0x40) if set
*/
return FPE_FLTINV;
} else if (err & 0x004) { /* Divide by Zero */
return FPE_FLTDIV;
} else if (err & 0x008) { /* Overflow */
return FPE_FLTOVF;
} else if (err & 0x012) { /* Denormal, Underflow */
return FPE_FLTUND;
} else if (err & 0x020) { /* Precision */
return FPE_FLTRES;
}
/*
* If we're using IRQ 13, or supposedly even some trap
* X86_TRAP_MF implementations, it's possible
* we get a spurious trap, which is not an error.
*/
return 0;
}
/* SPDX-License-Identifier: GPL-2.0+ */
#ifndef _LINUX_XARRAY_H
#define _LINUX_XARRAY_H
/*
* eXtensible Arrays
* Copyright (c) 2017 Microsoft Corporation
* Author: Matthew Wilcox <willy@infradead.org>
*
* See Documentation/core-api/xarray.rst for how to use the XArray.
*/
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/kconfig.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>
/*
* The bottom two bits of the entry determine how the XArray interprets
* the contents:
*
* 00: Pointer entry
* 10: Internal entry
* x1: Value entry or tagged pointer
*
* Attempting to store internal entries in the XArray is a bug.
*
* Most internal entries are pointers to the next node in the tree.
* The following internal entries have a special meaning:
*
* 0-62: Sibling entries
* 256: Retry entry
* 257: Zero entry
*
* Errors are also represented as internal entries, but use the negative
* space (-4094 to -2). They're never stored in the slots array; only
* returned by the normal API.
*/
#define BITS_PER_XA_VALUE (BITS_PER_LONG - 1)
/**
* xa_mk_value() - Create an XArray entry from an integer.
* @v: Value to store in XArray.
*
* Context: Any context.
* Return: An entry suitable for storing in the XArray.
*/
static inline void *xa_mk_value(unsigned long v)
{
WARN_ON((long)v < 0);
return (void *)((v << 1) | 1);
}
/**
* xa_to_value() - Get value stored in an XArray entry.
* @entry: XArray entry.
*
* Context: Any context.
* Return: The value stored in the XArray entry.
*/
static inline unsigned long xa_to_value(const void *entry)
{
return (unsigned long)entry >> 1;
}
/**
* xa_is_value() - Determine if an entry is a value.
* @entry: XArray entry.
*
* Context: Any context.
* Return: True if the entry is a value, false if it is a pointer.
*/
static inline bool xa_is_value(const void *entry)
{
return (unsigned long)entry & 1;
}
/**
* xa_tag_pointer() - Create an XArray entry for a tagged pointer.
* @p: Plain pointer.
* @tag: Tag value (0, 1 or 3).
*
* If the user of the XArray prefers, they can tag their pointers instead
* of storing value entries. Three tags are available (0, 1 and 3).
* These are distinct from the xa_mark_t as they are not replicated up
* through the array and cannot be searched for.
*
* Context: Any context.
* Return: An XArray entry.
*/
static inline void *xa_tag_pointer(void *p, unsigned long tag)
{
return (void *)((unsigned long)p | tag);
}
/**
* xa_untag_pointer() - Turn an XArray entry into a plain pointer.
* @entry: XArray entry.
*
* If you have stored a tagged pointer in the XArray, call this function
* to get the untagged version of the pointer.
*
* Context: Any context.
* Return: A pointer.
*/
static inline void *xa_untag_pointer(void *entry)
{
return (void *)((unsigned long)entry & ~3UL);
}
/**
* xa_pointer_tag() - Get the tag stored in an XArray entry.
* @entry: XArray entry.
*
* If you have stored a tagged pointer in the XArray, call this function
* to get the tag of that pointer.
*
* Context: Any context.
* Return: A tag.
*/
static inline unsigned int xa_pointer_tag(void *entry)
{
return (unsigned long)entry & 3UL;
}
/*
* xa_mk_internal() - Create an internal entry.
* @v: Value to turn into an internal entry.
*
* Internal entries are used for a number of purposes. Entries 0-255 are
* used for sibling entries (only 0-62 are used by the current code). 256
* is used for the retry entry. 257 is used for the reserved / zero entry.
* Negative internal entries are used to represent errnos. Node pointers
* are also tagged as internal entries in some situations.
*
* Context: Any context.
* Return: An XArray internal entry corresponding to this value.
*/
static inline void *xa_mk_internal(unsigned long v)
{
return (void *)((v << 2) | 2);
}
/*
* xa_to_internal() - Extract the value from an internal entry.
* @entry: XArray entry.
*
* Context: Any context.
* Return: The value which was stored in the internal entry.
*/
static inline unsigned long xa_to_internal(const void *entry)
{
return (unsigned long)entry >> 2;
}
/*
* xa_is_internal() - Is the entry an internal entry?
* @entry: XArray entry.
*
* Context: Any context.
* Return: %true if the entry is an internal entry.
*/
static inline bool xa_is_internal(const void *entry)
{
return ((unsigned long)entry & 3) == 2;
}
#define XA_ZERO_ENTRY xa_mk_internal(257)
/**
* xa_is_zero() - Is the entry a zero entry?
* @entry: Entry retrieved from the XArray
*
* The normal API will return NULL as the contents of a slot containing
* a zero entry. You can only see zero entries by using the advanced API.
*
* Return: %true if the entry is a zero entry.
*/
static inline bool xa_is_zero(const void *entry)
{
return unlikely(entry == XA_ZERO_ENTRY);
}
/**
* xa_is_err() - Report whether an XArray operation returned an error
* @entry: Result from calling an XArray function
*
* If an XArray operation cannot complete an operation, it will return
* a special value indicating an error. This function tells you
* whether an error occurred; xa_err() tells you which error occurred.
*
* Context: Any context.
* Return: %true if the entry indicates an error.
*/
static inline bool xa_is_err(const void *entry)
{
return unlikely(xa_is_internal(entry) &&
entry >= xa_mk_internal(-MAX_ERRNO));
}
/**
* xa_err() - Turn an XArray result into an errno.
* @entry: Result from calling an XArray function.
*
* If an XArray operation cannot complete an operation, it will return
* a special pointer value which encodes an errno. This function extracts
* the errno from the pointer value, or returns 0 if the pointer does not
* represent an errno.
*
* Context: Any context.
* Return: A negative errno or 0.
*/
static inline int xa_err(void *entry)
{
/* xa_to_internal() would not do sign extension. */
if (xa_is_err(entry))
return (long)entry >> 2;
return 0;
}
/**
* struct xa_limit - Represents a range of IDs.
* @min: The lowest ID to allocate (inclusive).
* @max: The maximum ID to allocate (inclusive).
*
* This structure is used either directly or via the XA_LIMIT() macro
* to communicate the range of IDs that are valid for allocation.
* Three common ranges are predefined for you:
* * xa_limit_32b - [0 - UINT_MAX]
* * xa_limit_31b - [0 - INT_MAX]
* * xa_limit_16b - [0 - USHRT_MAX]
*/
struct xa_limit {
u32 max;
u32 min;
};
#define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max }
#define xa_limit_32b XA_LIMIT(0, UINT_MAX)
#define xa_limit_31b XA_LIMIT(0, INT_MAX)
#define xa_limit_16b XA_LIMIT(0, USHRT_MAX)
typedef unsigned __bitwise xa_mark_t;
#define XA_MARK_0 ((__force xa_mark_t)0U)
#define XA_MARK_1 ((__force xa_mark_t)1U)
#define XA_MARK_2 ((__force xa_mark_t)2U)
#define XA_PRESENT ((__force xa_mark_t)8U)
#define XA_MARK_MAX XA_MARK_2
#define XA_FREE_MARK XA_MARK_0
enum xa_lock_type {
XA_LOCK_IRQ = 1,
XA_LOCK_BH = 2,
};
/*
* Values for xa_flags. The radix tree stores its GFP flags in the xa_flags,
* and we remain compatible with that.
*/
#define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ)
#define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH)
#define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U)
#define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U)
#define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U)
#define XA_FLAGS_ACCOUNT ((__force gfp_t)32U)
#define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
(__force unsigned)(mark)))
/* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */
#define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK))
#define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY)
/**
* struct xarray - The anchor of the XArray.
* @xa_lock: Lock that protects the contents of the XArray.
*
* To use the xarray, define it statically or embed it in your data structure.
* It is a very small data structure, so it does not usually make sense to
* allocate it separately and keep a pointer to it in your data structure.
*
* You may use the xa_lock to protect your own data structures as well.
*/
/*
* If all of the entries in the array are NULL, @xa_head is a NULL pointer.
* If the only non-NULL entry in the array is at index 0, @xa_head is that
* entry. If any other entry in the array is non-NULL, @xa_head points
* to an @xa_node.
*/
struct xarray {
spinlock_t xa_lock;
/* private: The rest of the data structure is not to be used directly. */
gfp_t xa_flags;
void __rcu * xa_head;
};
#define XARRAY_INIT(name, flags) { \
.xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \
.xa_flags = flags, \
.xa_head = NULL, \
}
/**
* DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags.
* @name: A string that names your XArray.
* @flags: XA_FLAG values.
*
* This is intended for file scope definitions of XArrays. It declares
* and initialises an empty XArray with the chosen name and flags. It is
* equivalent to calling xa_init_flags() on the array, but it does the
* initialisation at compiletime instead of runtime.
*/
#define DEFINE_XARRAY_FLAGS(name, flags) \
struct xarray name = XARRAY_INIT(name, flags)
/**
* DEFINE_XARRAY() - Define an XArray.
* @name: A string that names your XArray.
*
* This is intended for file scope definitions of XArrays. It declares
* and initialises an empty XArray with the chosen name. It is equivalent
* to calling xa_init() on the array, but it does the initialisation at
* compiletime instead of runtime.
*/
#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0)
/**
* DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0.
* @name: A string that names your XArray.
*
* This is intended for file scope definitions of allocating XArrays.
* See also DEFINE_XARRAY().
*/
#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC)
/**
* DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1.
* @name: A string that names your XArray.
*
* This is intended for file scope definitions of allocating XArrays.
* See also DEFINE_XARRAY().
*/
#define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1)
void *xa_load(struct xarray *, unsigned long index);
void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *xa_erase(struct xarray *, unsigned long index);
void *xa_store_range(struct xarray *, unsigned long first, unsigned long last,
void *entry, gfp_t);
bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
void *xa_find(struct xarray *xa, unsigned long *index,
unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
void *xa_find_after(struct xarray *xa, unsigned long *index,
unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
unsigned int xa_extract(struct xarray *, void **dst, unsigned long start,
unsigned long max, unsigned int n, xa_mark_t);
void xa_destroy(struct xarray *);
/**
* xa_init_flags() - Initialise an empty XArray with flags.
* @xa: XArray.
* @flags: XA_FLAG values.
*
* If you need to initialise an XArray with special flags (eg you need
* to take the lock from interrupt context), use this function instead
* of xa_init().
*
* Context: Any context.
*/
static inline void xa_init_flags(struct xarray *xa, gfp_t flags)
{
spin_lock_init(&xa->xa_lock);
xa->xa_flags = flags;
xa->xa_head = NULL;
}
/**
* xa_init() - Initialise an empty XArray.
* @xa: XArray.
*
* An empty XArray is full of NULL entries.
*
* Context: Any context.
*/
static inline void xa_init(struct xarray *xa)
{
xa_init_flags(xa, 0);
}
/**
* xa_empty() - Determine if an array has any present entries.
* @xa: XArray.
*
* Context: Any context.
* Return: %true if the array contains only NULL pointers.
*/
static inline bool xa_empty(const struct xarray *xa)
{
return xa->xa_head == NULL;
}
/**
* xa_marked() - Inquire whether any entry in this array has a mark set
* @xa: Array
* @mark: Mark value
*
* Context: Any context.
* Return: %true if any entry has this mark set.
*/
static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark)
{
return xa->xa_flags & XA_FLAGS_MARK(mark);
}
/**
* xa_for_each_range() - Iterate over a portion of an XArray.
* @xa: XArray.
* @index: Index of @entry.
* @entry: Entry retrieved from array.
* @start: First index to retrieve from array.
* @last: Last index to retrieve from array.
*
* During the iteration, @entry will have the value of the entry stored
* in @xa at @index. You may modify @index during the iteration if you
* want to skip or reprocess indices. It is safe to modify the array
* during the iteration. At the end of the iteration, @entry will be set
* to NULL and @index will have a value less than or equal to max.
*
* xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have
* to handle your own locking with xas_for_each(), and if you have to unlock
* after each iteration, it will also end up being O(n.log(n)).
* xa_for_each_range() will spin if it hits a retry entry; if you intend to
* see retry entries, you should use the xas_for_each() iterator instead.
* The xas_for_each() iterator will expand into more inline code than
* xa_for_each_range().
*
* Context: Any context. Takes and releases the RCU lock.
*/
#define xa_for_each_range(xa, index, entry, start, last) \
for (index = start, \
entry = xa_find(xa, &index, last, XA_PRESENT); \
entry; \
entry = xa_find_after(xa, &index, last, XA_PRESENT))
/**
* xa_for_each_start() - Iterate over a portion of an XArray.
* @xa: XArray.
* @index: Index of @entry.
* @entry: Entry retrieved from array.
* @start: First index to retrieve from array.
*
* During the iteration, @entry will have the value of the entry stored
* in @xa at @index. You may modify @index during the iteration if you
* want to skip or reprocess indices. It is safe to modify the array
* during the iteration. At the end of the iteration, @entry will be set
* to NULL and @index will have a value less than or equal to max.
*
* xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have
* to handle your own locking with xas_for_each(), and if you have to unlock
* after each iteration, it will also end up being O(n.log(n)).
* xa_for_each_start() will spin if it hits a retry entry; if you intend to
* see retry entries, you should use the xas_for_each() iterator instead.
* The xas_for_each() iterator will expand into more inline code than
* xa_for_each_start().
*
* Context: Any context. Takes and releases the RCU lock.
*/
#define xa_for_each_start(xa, index, entry, start) \
xa_for_each_range(xa, index, entry, start, ULONG_MAX)
/**
* xa_for_each() - Iterate over present entries in an XArray.
* @xa: XArray.
* @index: Index of @entry.
* @entry: Entry retrieved from array.
*
* During the iteration, @entry will have the value of the entry stored
* in @xa at @index. You may modify @index during the iteration if you want
* to skip or reprocess indices. It is safe to modify the array during the
* iteration. At the end of the iteration, @entry will be set to NULL and
* @index will have a value less than or equal to max.
*
* xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have
* to handle your own locking with xas_for_each(), and if you have to unlock
* after each iteration, it will also end up being O(n.log(n)). xa_for_each()
* will spin if it hits a retry entry; if you intend to see retry entries,
* you should use the xas_for_each() iterator instead. The xas_for_each()
* iterator will expand into more inline code than xa_for_each().
*
* Context: Any context. Takes and releases the RCU lock.
*/
#define xa_for_each(xa, index, entry) \
xa_for_each_start(xa, index, entry, 0)
/**
* xa_for_each_marked() - Iterate over marked entries in an XArray.
* @xa: XArray.
* @index: Index of @entry.
* @entry: Entry retrieved from array.
* @filter: Selection criterion.
*
* During the iteration, @entry will have the value of the entry stored
* in @xa at @index. The iteration will skip all entries in the array
* which do not match @filter. You may modify @index during the iteration
* if you want to skip or reprocess indices. It is safe to modify the array
* during the iteration. At the end of the iteration, @entry will be set to
* NULL and @index will have a value less than or equal to max.
*
* xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n).
* You have to handle your own locking with xas_for_each(), and if you have
* to unlock after each iteration, it will also end up being O(n.log(n)).
* xa_for_each_marked() will spin if it hits a retry entry; if you intend to
* see retry entries, you should use the xas_for_each_marked() iterator
* instead. The xas_for_each_marked() iterator will expand into more inline
* code than xa_for_each_marked().
*
* Context: Any context. Takes and releases the RCU lock.
*/
#define xa_for_each_marked(xa, index, entry, filter) \
for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \
entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter))
#define xa_trylock(xa) spin_trylock(&(xa)->xa_lock)
#define xa_lock(xa) spin_lock(&(xa)->xa_lock)
#define xa_unlock(xa) spin_unlock(&(xa)->xa_lock)
#define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock)
#define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock)
#define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock)
#define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock)
#define xa_lock_irqsave(xa, flags) \
spin_lock_irqsave(&(xa)->xa_lock, flags)
#define xa_unlock_irqrestore(xa, flags) \
spin_unlock_irqrestore(&(xa)->xa_lock, flags)
#define xa_lock_nested(xa, subclass) \
spin_lock_nested(&(xa)->xa_lock, subclass)
#define xa_lock_bh_nested(xa, subclass) \
spin_lock_bh_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irq_nested(xa, subclass) \
spin_lock_irq_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irqsave_nested(xa, flags, subclass) \
spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass)
/*
* Versions of the normal API which require the caller to hold the
* xa_lock. If the GFP flags allow it, they will drop the lock to
* allocate memory, then reacquire it afterwards. These functions
* may also re-enable interrupts if the XArray flags indicate the
* locking should be interrupt safe.
*/
void *__xa_erase(struct xarray *, unsigned long index);
void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old,
void *entry, gfp_t);
int __must_check __xa_insert(struct xarray *, unsigned long index,
void *entry, gfp_t);
int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry,
struct xa_limit, gfp_t);
int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry,
struct xa_limit, u32 *next, gfp_t);
void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
/**
* xa_store_bh() - Store this entry in the XArray.
* @xa: XArray.
* @index: Index into array.
* @entry: New entry.
* @gfp: Memory allocation flags.
*
* This function is like calling xa_store() except it disables softirqs
* while holding the array lock.
*
* Context: Any context. Takes and releases the xa_lock while
* disabling softirqs.
* Return: The old entry at this index or xa_err() if an error happened.
*/
static inline void *xa_store_bh(struct xarray *xa, unsigned long index,
void *entry, gfp_t gfp)
{
void *curr;
xa_lock_bh(xa);
curr = __xa_store(xa, index, entry, gfp);
xa_unlock_bh(xa);
return curr;
}
/**
* xa_store_irq() - Store this entry in the XArray.
* @xa: XArray.
* @index: Index into array.
* @entry: New entry.
* @gfp: Memory allocation flags.
*
* This function is like calling xa_store() except it disables interrupts
* while holding the array lock.
*
* Context: Process context. Takes and releases the xa_lock while
* disabling interrupts.
* Return: The old entry at this index or xa_err() if an error happened.
*/
static inline void *xa_store_irq(struct xarray *xa, unsigned long index,
void *entry, gfp_t gfp)
{
void *curr;
xa_lock_irq(xa);
curr = __xa_store(xa, index, entry, gfp);
xa_unlock_irq(xa);
return curr;
}
/**
* xa_erase_bh() - Erase this entry from the XArray.
* @xa: XArray.
* @index: Index of entry.
*
* After this function returns, loading from @index will return %NULL.
* If the index is part of a multi-index entry, all indices will be erased
* and none of the entries will be part of a multi-index entry.
*
* Context: Any context. Takes and releases the xa_lock while
* disabling softirqs.
* Return: The entry which used to be at this index.
*/
static inline void *xa_erase_bh(struct xarray *xa, unsigned long index)
{
void *entry;
xa_lock_bh(xa);
entry = __xa_erase(xa, index);
xa_unlock_bh(xa);
return entry;
}
/**
* xa_erase_irq() - Erase this entry from the XArray.
* @xa: XArray.
* @index: Index of entry.
*
* After this function returns, loading from @index will return %NULL.
* If the index is part of a multi-index entry, all indices will be erased
* and none of the entries will be part of a multi-index entry.
*
* Context: Process context. Takes and releases the xa_lock while
* disabling interrupts.
* Return: The entry which used to be at this index.
*/
static inline void *xa_erase_irq(struct xarray *xa, unsigned long index)
{
void *entry;
xa_lock_irq(xa);
entry = __xa_erase(xa, index);
xa_unlock_irq(xa);
return entry;
}
/**
* xa_cmpxchg() - Conditionally replace an entry in the XArray.
* @xa: XArray.
* @index: Index into array.
* @old: Old value to test against.
* @entry: New value to place in array.
* @gfp: Memory allocation flags.
*
* If the entry at @index is the same as @old, replace it with @entry.
* If the return value is equal to @old, then the exchange was successful.
*
* Context: Any context. Takes and releases the xa_lock. May sleep
* if the @gfp flags permit.
* Return: The old value at this index or xa_err() if an error happened.
*/
static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index,
void *old, void *entry, gfp_t gfp)
{
void *curr;
xa_lock(xa);
curr = __xa_cmpxchg(xa, index, old, entry, gfp);
xa_unlock(xa);
return curr;
}
/**
* xa_cmpxchg_bh() - Conditionally replace an entry in the XArray.
* @xa: XArray.
* @index: Index into array.
* @old: Old value to test against.
* @entry: New value to place in array.
* @gfp: Memory allocation flags.
*
* This function is like calling xa_cmpxchg() except it disables softirqs
* while holding the array lock.
*
* Context: Any context. Takes and releases the xa_lock while
* disabling softirqs. May sleep if the @gfp flags permit.
* Return: The old value at this index or xa_err() if an error happened.
*/
static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index,
void *old, void *entry, gfp_t gfp)
{
void *curr;
xa_lock_bh(xa);
curr = __xa_cmpxchg(xa, index, old, entry, gfp);
xa_unlock_bh(xa);
return curr;
}
/**
* xa_cmpxchg_irq() - Conditionally replace an entry in the XArray.
* @xa: XArray.
* @index: Index into array.
* @old: Old value to test against.
* @entry: New value to place in array.
* @gfp: Memory allocation flags.
*
* This function is like calling xa_cmpxchg() except it disables interrupts
* while holding the array lock.
*
* Context: Process context. Takes and releases the xa_lock while
* disabling interrupts. May sleep if the @gfp flags permit.
* Return: The old value at this index or xa_err() if an error happened.
*/
static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index,
void *old, void *entry, gfp_t gfp)
{
void *curr;
xa_lock_irq(xa);
curr = __xa_cmpxchg(xa, index, old, entry, gfp);
xa_unlock_irq(xa);
return curr;
}
/**
* xa_insert() - Store this entry in the XArray unless another entry is
* already present.
* @xa: XArray.
* @index: Index into array.
* @entry: New entry.
* @gfp: Memory allocation flags.
*
* Inserting a NULL entry will store a reserved entry (like xa_reserve())
* if no entry is present. Inserting will fail if a reserved entry is
* present, even though loading from this index will return NULL.
*
* Context: Any context. Takes and releases the xa_lock. May sleep if
* the @gfp flags permit.
* Return: 0 if the store succeeded. -EBUSY if another entry was present.
* -ENOMEM if memory could not be allocated.
*/
static inline int __must_check xa_insert(struct xarray *xa,
unsigned long index, void *entry, gfp_t gfp)
{
int err;
xa_lock(xa);
err = __xa_insert(xa, index, entry, gfp);
xa_unlock(xa);
return err;
}
/**
* xa_insert_bh() - Store this entry in the XArray unless another entry is
* already present.
* @xa: XArray.
* @index: Index into array.
* @entry: New entry.
* @gfp: Memory allocation flags.
*
* Inserting a NULL entry will store a reserved entry (like xa_reserve())
* if no entry is present. Inserting will fail if a reserved entry is
* present, even though loading from this index will return NULL.
*
* Context: Any context. Takes and releases the xa_lock while
* disabling softirqs. May sleep if the @gfp flags permit.
* Return: 0 if the store succeeded. -EBUSY if another entry was present.
* -ENOMEM if memory could not be allocated.
*/
static inline int __must_check xa_insert_bh(struct xarray *xa,
unsigned long index, void *entry, gfp_t gfp)
{
int err;
xa_lock_bh(xa);
err = __xa_insert(xa, index, entry, gfp);
xa_unlock_bh(xa);
return err;
}
/**
* xa_insert_irq() - Store this entry in the XArray unless another entry is
* already present.
* @xa: XArray.
* @index: Index into array.
* @entry: New entry.
* @gfp: Memory allocation flags.
*
* Inserting a NULL entry will store a reserved entry (like xa_reserve())
* if no entry is present. Inserting will fail if a reserved entry is
* present, even though loading from this index will return NULL.
*
* Context: Process context. Takes and releases the xa_lock while
* disabling interrupts. May sleep if the @gfp flags permit.
* Return: 0 if the store succeeded. -EBUSY if another entry was present.
* -ENOMEM if memory could not be allocated.
*/
static inline int __must_check xa_insert_irq(struct xarray *xa,
unsigned long index, void *entry, gfp_t gfp)
{
int err;
xa_lock_irq(xa);
err = __xa_insert(xa, index, entry, gfp);
xa_unlock_irq(xa);
return err;
}
/**
* xa_alloc() - Find somewhere to store this entry in the XArray.
* @xa: XArray.
* @id: Pointer to ID.
* @entry: New entry.
* @limit: Range of ID to allocate.
* @gfp: Memory allocation flags.
*
* Finds an empty entry in @xa between @limit.min and @limit.max,
* stores the index into the @id pointer, then stores the entry at
* that index. A concurrent lookup will not see an uninitialised @id.
*
* Context: Any context. Takes and releases the xa_lock. May sleep if
* the @gfp flags permit.
* Return: 0 on success, -ENOMEM if memory could not be allocated or
* -EBUSY if there are no free entries in @limit.
*/
static inline __must_check int xa_alloc(struct xarray *xa, u32 *id,
void *entry, struct xa_limit limit, gfp_t gfp)
{
int err;
xa_lock(xa);
err = __xa_alloc(xa, id, entry, limit, gfp);
xa_unlock(xa);
return err;
}
/**
* xa_alloc_bh() - Find somewhere to store this entry in the XArray.
* @xa: XArray.
* @id: Pointer to ID.
* @entry: New entry.
* @limit: Range of ID to allocate.
* @gfp: Memory allocation flags.
*
* Finds an empty entry in @xa between @limit.min and @limit.max,
* stores the index into the @id pointer, then stores the entry at
* that index. A concurrent lookup will not see an uninitialised @id.
*
* Context: Any context. Takes and releases the xa_lock while
* disabling softirqs. May sleep if the @gfp flags permit.
* Return: 0 on success, -ENOMEM if memory could not be allocated or
* -EBUSY if there are no free entries in @limit.
*/
static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id,
void *entry, struct xa_limit limit, gfp_t gfp)
{
int err;
xa_lock_bh(xa);
err = __xa_alloc(xa, id, entry, limit, gfp);
xa_unlock_bh(xa);
return err;
}
/**
* xa_alloc_irq() - Find somewhere to store this entry in the XArray.
* @xa: XArray.
* @id: Pointer to ID.
* @entry: New entry.
* @limit: Range of ID to allocate.
* @gfp: Memory allocation flags.
*
* Finds an empty entry in @xa between @limit.min and @limit.max,
* stores the index into the @id pointer, then stores the entry at
* that index. A concurrent lookup will not see an uninitialised @id.
*
* Context: Process context. Takes and releases the xa_lock while
* disabling interrupts. May sleep if the @gfp flags permit.
* Return: 0 on success, -ENOMEM if memory could not be allocated or
* -EBUSY if there are no free entries in @limit.
*/
static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
void *entry, struct xa_limit limit, gfp_t gfp)
{
int err;
xa_lock_irq(xa);
err = __xa_alloc(xa, id, entry, limit, gfp);
xa_unlock_irq(xa);
return err;
}
/**
* xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
* @xa: XArray.
* @id: Pointer to ID.
* @entry: New entry.
* @limit: Range of allocated ID.
* @next: Pointer to next ID to allocate.
* @gfp: Memory allocation flags.
*
* Finds an empty entry in @xa between @limit.min and @limit.max,
* stores the index into the @id pointer, then stores the entry at
* that index. A concurrent lookup will not see an uninitialised @id.
* The search for an empty entry will start at @next and will wrap
* around if necessary.
*
* Context: Any context. Takes and releases the xa_lock. May sleep if
* the @gfp flags permit.
* Return: 0 if the allocation succeeded without wrapping. 1 if the
* allocation succeeded after wrapping, -ENOMEM if memory could not be
* allocated or -EBUSY if there are no free entries in @limit.
*/
static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
struct xa_limit limit, u32 *next, gfp_t gfp)
{
int err;
xa_lock(xa);
err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
xa_unlock(xa);
return err;
}
/**
* xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray.
* @xa: XArray.
* @id: Pointer to ID.
* @entry: New entry.
* @limit: Range of allocated ID.
* @next: Pointer to next ID to allocate.
* @gfp: Memory allocation flags.
*
* Finds an empty entry in @xa between @limit.min and @limit.max,
* stores the index into the @id pointer, then stores the entry at
* that index. A concurrent lookup will not see an uninitialised @id.
* The search for an empty entry will start at @next and will wrap
* around if necessary.
*
* Context: Any context. Takes and releases the xa_lock while
* disabling softirqs. May sleep if the @gfp flags permit.
* Return: 0 if the allocation succeeded without wrapping. 1 if the
* allocation succeeded after wrapping, -ENOMEM if memory could not be
* allocated or -EBUSY if there are no free entries in @limit.
*/
static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
struct xa_limit limit, u32 *next, gfp_t gfp)
{
int err;
xa_lock_bh(xa);
err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
xa_unlock_bh(xa);
return err;
}
/**
* xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray.
* @xa: XArray.
* @id: Pointer to ID.
* @entry: New entry.
* @limit: Range of allocated ID.
* @next: Pointer to next ID to allocate.
* @gfp: Memory allocation flags.
*
* Finds an empty entry in @xa between @limit.min and @limit.max,
* stores the index into the @id pointer, then stores the entry at
* that index. A concurrent lookup will not see an uninitialised @id.
* The search for an empty entry will start at @next and will wrap
* around if necessary.
*
* Context: Process context. Takes and releases the xa_lock while
* disabling interrupts. May sleep if the @gfp flags permit.
* Return: 0 if the allocation succeeded without wrapping. 1 if the
* allocation succeeded after wrapping, -ENOMEM if memory could not be
* allocated or -EBUSY if there are no free entries in @limit.
*/
static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
struct xa_limit limit, u32 *next, gfp_t gfp)
{
int err;
xa_lock_irq(xa);
err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
xa_unlock_irq(xa);
return err;
}
/**
* xa_reserve() - Reserve this index in the XArray.
* @xa: XArray.
* @index: Index into array.
* @gfp: Memory allocation flags.
*
* Ensures there is somewhere to store an entry at @index in the array.
* If there is already something stored at @index, this function does
* nothing. If there was nothing there, the entry is marked as reserved.
* Loading from a reserved entry returns a %NULL pointer.
*
* If you do not use the entry that you have reserved, call xa_release()
* or xa_erase() to free any unnecessary memory.
*
* Context: Any context. Takes and releases the xa_lock.
* May sleep if the @gfp flags permit.
* Return: 0 if the reservation succeeded or -ENOMEM if it failed.
*/
static inline __must_check
int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
{
return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp));
}
/**
* xa_reserve_bh() - Reserve this index in the XArray.
* @xa: XArray.
* @index: Index into array.
* @gfp: Memory allocation flags.
*
* A softirq-disabling version of xa_reserve().
*
* Context: Any context. Takes and releases the xa_lock while
* disabling softirqs.
* Return: 0 if the reservation succeeded or -ENOMEM if it failed.
*/
static inline __must_check
int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp)
{
return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp));
}
/**
* xa_reserve_irq() - Reserve this index in the XArray.
* @xa: XArray.
* @index: Index into array.
* @gfp: Memory allocation flags.
*
* An interrupt-disabling version of xa_reserve().
*
* Context: Process context. Takes and releases the xa_lock while
* disabling interrupts.
* Return: 0 if the reservation succeeded or -ENOMEM if it failed.
*/
static inline __must_check
int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp)
{
return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp));
}
/**
* xa_release() - Release a reserved entry.
* @xa: XArray.
* @index: Index of entry.
*
* After calling xa_reserve(), you can call this function to release the
* reservation. If the entry at @index has been stored to, this function
* will do nothing.
*/
static inline void xa_release(struct xarray *xa, unsigned long index)
{
xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0);
}
/* Everything below here is the Advanced API. Proceed with caution. */
/*
* The xarray is constructed out of a set of 'chunks' of pointers. Choosing
* the best chunk size requires some tradeoffs. A power of two recommends
* itself so that we can walk the tree based purely on shifts and masks.
* Generally, the larger the better; as the number of slots per level of the
* tree increases, the less tall the tree needs to be. But that needs to be
* balanced against the memory consumption of each node. On a 64-bit system,
* xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we
* doubled the number of slots per node, we'd get only 3 nodes per 4kB page.
*/
#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
#endif
#define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT)
#define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1)
#define XA_MAX_MARKS 3
#define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG)
/*
* @count is the count of every non-NULL element in the ->slots array
* whether that is a value entry, a retry entry, a user pointer,
* a sibling entry or a pointer to the next level of the tree.
* @nr_values is the count of every element in ->slots which is
* either a value entry or a sibling of a value entry.
*/
struct xa_node {
unsigned char shift; /* Bits remaining in each slot */
unsigned char offset; /* Slot offset in parent */
unsigned char count; /* Total entry count */
unsigned char nr_values; /* Value entry count */
struct xa_node __rcu *parent; /* NULL at top of tree */
struct xarray *array; /* The array we belong to */
union {
struct list_head private_list; /* For tree user */
struct rcu_head rcu_head; /* Used when freeing node */
};
void __rcu *slots[XA_CHUNK_SIZE];
union {
unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS];
unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS];
};
};
void xa_dump(const struct xarray *);
void xa_dump_node(const struct xa_node *);
#ifdef XA_DEBUG
#define XA_BUG_ON(xa, x) do { \
if (x) { \
xa_dump(xa); \
BUG(); \
} \
} while (0)
#define XA_NODE_BUG_ON(node, x) do { \
if (x) { \
if (node) xa_dump_node(node); \
BUG(); \
} \
} while (0)
#else
#define XA_BUG_ON(xa, x) do { } while (0)
#define XA_NODE_BUG_ON(node, x) do { } while (0)
#endif
/* Private */
static inline void *xa_head(const struct xarray *xa)
{
return rcu_dereference_check(xa->xa_head,
lockdep_is_held(&xa->xa_lock));
}
/* Private */
static inline void *xa_head_locked(const struct xarray *xa)
{
return rcu_dereference_protected(xa->xa_head,
lockdep_is_held(&xa->xa_lock));
}
/* Private */
static inline void *xa_entry(const struct xarray *xa,
const struct xa_node *node, unsigned int offset)
{
XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
return rcu_dereference_check(node->slots[offset],
lockdep_is_held(&xa->xa_lock));
}
/* Private */
static inline void *xa_entry_locked(const struct xarray *xa,
const struct xa_node *node, unsigned int offset)
{
XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
return rcu_dereference_protected(node->slots[offset],
lockdep_is_held(&xa->xa_lock));
}
/* Private */
static inline struct xa_node *xa_parent(const struct xarray *xa,
const struct xa_node *node)
{
return rcu_dereference_check(node->parent,
lockdep_is_held(&xa->xa_lock));
}
/* Private */
static inline struct xa_node *xa_parent_locked(const struct xarray *xa,
const struct xa_node *node)
{
return rcu_dereference_protected(node->parent,
lockdep_is_held(&xa->xa_lock));
}
/* Private */
static inline void *xa_mk_node(const struct xa_node *node)
{
return (void *)((unsigned long)node | 2);
}
/* Private */
static inline struct xa_node *xa_to_node(const void *entry)
{
return (struct xa_node *)((unsigned long)entry - 2);
}
/* Private */
static inline bool xa_is_node(const void *entry)
{
return xa_is_internal(entry) && (unsigned long)entry > 4096;
}
/* Private */
static inline void *xa_mk_sibling(unsigned int offset)
{
return xa_mk_internal(offset);
}
/* Private */
static inline unsigned long xa_to_sibling(const void *entry)
{
return xa_to_internal(entry);
}
/**
* xa_is_sibling() - Is the entry a sibling entry?
* @entry: Entry retrieved from the XArray
*
* Return: %true if the entry is a sibling entry.
*/
static inline bool xa_is_sibling(const void *entry)
{
return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) &&
(entry < xa_mk_sibling(XA_CHUNK_SIZE - 1));
}
#define XA_RETRY_ENTRY xa_mk_internal(256)
/**
* xa_is_retry() - Is the entry a retry entry?
* @entry: Entry retrieved from the XArray
*
* Return: %true if the entry is a retry entry.
*/
static inline bool xa_is_retry(const void *entry)
{
return unlikely(entry == XA_RETRY_ENTRY);
}
/**
* xa_is_advanced() - Is the entry only permitted for the advanced API?
* @entry: Entry to be stored in the XArray.
*
* Return: %true if the entry cannot be stored by the normal API.
*/
static inline bool xa_is_advanced(const void *entry)
{
return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY);
}
/**
* typedef xa_update_node_t - A callback function from the XArray.
* @node: The node which is being processed
*
* This function is called every time the XArray updates the count of
* present and value entries in a node. It allows advanced users to
* maintain the private_list in the node.
*
* Context: The xa_lock is held and interrupts may be disabled.
* Implementations should not drop the xa_lock, nor re-enable
* interrupts.
*/
typedef void (*xa_update_node_t)(struct xa_node *node);
void xa_delete_node(struct xa_node *, xa_update_node_t);
/*
* The xa_state is opaque to its users. It contains various different pieces
* of state involved in the current operation on the XArray. It should be
* declared on the stack and passed between the various internal routines.
* The various elements in it should not be accessed directly, but only
* through the provided accessor functions. The below documentation is for
* the benefit of those working on the code, not for users of the XArray.
*
* @xa_node usually points to the xa_node containing the slot we're operating
* on (and @xa_offset is the offset in the slots array). If there is a
* single entry in the array at index 0, there are no allocated xa_nodes to
* point to, and so we store %NULL in @xa_node. @xa_node is set to
* the value %XAS_RESTART if the xa_state is not walked to the correct
* position in the tree of nodes for this operation. If an error occurs
* during an operation, it is set to an %XAS_ERROR value. If we run off the
* end of the allocated nodes, it is set to %XAS_BOUNDS.
*/
struct xa_state {
struct xarray *xa;
unsigned long xa_index;
unsigned char xa_shift;
unsigned char xa_sibs;
unsigned char xa_offset;
unsigned char xa_pad; /* Helps gcc generate better code */
struct xa_node *xa_node;
struct xa_node *xa_alloc;
xa_update_node_t xa_update;
};
/*
* We encode errnos in the xas->xa_node. If an error has happened, we need to
* drop the lock to fix it, and once we've done so the xa_state is invalid.
*/
#define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL))
#define XAS_BOUNDS ((struct xa_node *)1UL)
#define XAS_RESTART ((struct xa_node *)3UL)
#define __XA_STATE(array, index, shift, sibs) { \
.xa = array, \
.xa_index = index, \
.xa_shift = shift, \
.xa_sibs = sibs, \
.xa_offset = 0, \
.xa_pad = 0, \
.xa_node = XAS_RESTART, \
.xa_alloc = NULL, \
.xa_update = NULL \
}
/**
* XA_STATE() - Declare an XArray operation state.
* @name: Name of this operation state (usually xas).
* @array: Array to operate on.
* @index: Initial index of interest.
*
* Declare and initialise an xa_state on the stack.
*/
#define XA_STATE(name, array, index) \
struct xa_state name = __XA_STATE(array, index, 0, 0)
/**
* XA_STATE_ORDER() - Declare an XArray operation state.
* @name: Name of this operation state (usually xas).
* @array: Array to operate on.
* @index: Initial index of interest.
* @order: Order of entry.
*
* Declare and initialise an xa_state on the stack. This variant of
* XA_STATE() allows you to specify the 'order' of the element you
* want to operate on.`
*/
#define XA_STATE_ORDER(name, array, index, order) \
struct xa_state name = __XA_STATE(array, \
(index >> order) << order, \
order - (order % XA_CHUNK_SHIFT), \
(1U << (order % XA_CHUNK_SHIFT)) - 1)
#define xas_marked(xas, mark) xa_marked((xas)->xa, (mark))
#define xas_trylock(xas) xa_trylock((xas)->xa)
#define xas_lock(xas) xa_lock((xas)->xa)
#define xas_unlock(xas) xa_unlock((xas)->xa)
#define xas_lock_bh(xas) xa_lock_bh((xas)->xa)
#define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa)
#define xas_lock_irq(xas) xa_lock_irq((xas)->xa)
#define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa)
#define xas_lock_irqsave(xas, flags) \
xa_lock_irqsave((xas)->xa, flags)
#define xas_unlock_irqrestore(xas, flags) \
xa_unlock_irqrestore((xas)->xa, flags)
/**
* xas_error() - Return an errno stored in the xa_state.
* @xas: XArray operation state.
*
* Return: 0 if no error has been noted. A negative errno if one has.
*/
static inline int xas_error(const struct xa_state *xas)
{
return xa_err(xas->xa_node);
}
/**
* xas_set_err() - Note an error in the xa_state.
* @xas: XArray operation state.
* @err: Negative error number.
*
* Only call this function with a negative @err; zero or positive errors
* will probably not behave the way you think they should. If you want
* to clear the error from an xa_state, use xas_reset().
*/
static inline void xas_set_err(struct xa_state *xas, long err)
{
xas->xa_node = XA_ERROR(err);
}
/**
* xas_invalid() - Is the xas in a retry or error state?
* @xas: XArray operation state.
*
* Return: %true if the xas cannot be used for operations.
*/
static inline bool xas_invalid(const struct xa_state *xas)
{
return (unsigned long)xas->xa_node & 3;
}
/**
* xas_valid() - Is the xas a valid cursor into the array?
* @xas: XArray operation state.
*
* Return: %true if the xas can be used for operations.
*/
static inline bool xas_valid(const struct xa_state *xas)
{
return !xas_invalid(xas);
}
/**
* xas_is_node() - Does the xas point to a node?
* @xas: XArray operation state.
*
* Return: %true if the xas currently references a node.
*/
static inline bool xas_is_node(const struct xa_state *xas)
{
return xas_valid(xas) && xas->xa_node;
}
/* True if the pointer is something other than a node */
static inline bool xas_not_node(struct xa_node *node)
{
return ((unsigned long)node & 3) || !node;
}
/* True if the node represents RESTART or an error */
static inline bool xas_frozen(struct xa_node *node)
{
return (unsigned long)node & 2;
}
/* True if the node represents head-of-tree, RESTART or BOUNDS */
static inline bool xas_top(struct xa_node *node)
{
return node <= XAS_RESTART;
}
/**
* xas_reset() - Reset an XArray operation state.
* @xas: XArray operation state.
*
* Resets the error or walk state of the @xas so future walks of the
* array will start from the root. Use this if you have dropped the
* xarray lock and want to reuse the xa_state.
*
* Context: Any context.
*/
static inline void xas_reset(struct xa_state *xas)
{
xas->xa_node = XAS_RESTART;
}
/**
* xas_retry() - Retry the operation if appropriate.
* @xas: XArray operation state.
* @entry: Entry from xarray.
*
* The advanced functions may sometimes return an internal entry, such as
* a retry entry or a zero entry. This function sets up the @xas to restart
* the walk from the head of the array if needed.
*
* Context: Any context.
* Return: true if the operation needs to be retried.
*/
static inline bool xas_retry(struct xa_state *xas, const void *entry)
{
if (xa_is_zero(entry))
return true;
if (!xa_is_retry(entry))
return false;
xas_reset(xas);
return true;
}
void *xas_load(struct xa_state *);
void *xas_store(struct xa_state *, void *entry);
void *xas_find(struct xa_state *, unsigned long max);
void *xas_find_conflict(struct xa_state *);
bool xas_get_mark(const struct xa_state *, xa_mark_t);
void xas_set_mark(const struct xa_state *, xa_mark_t);
void xas_clear_mark(const struct xa_state *, xa_mark_t);
void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t);
void xas_init_marks(const struct xa_state *);
bool xas_nomem(struct xa_state *, gfp_t);
void xas_pause(struct xa_state *);
void xas_create_range(struct xa_state *);
#ifdef CONFIG_XARRAY_MULTI
int xa_get_order(struct xarray *, unsigned long index);
void xas_split(struct xa_state *, void *entry, unsigned int order);
void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
#else
static inline int xa_get_order(struct xarray *xa, unsigned long index)
{
return 0;
}
static inline void xas_split(struct xa_state *xas, void *entry,
unsigned int order)
{
xas_store(xas, entry);
}
static inline void xas_split_alloc(struct xa_state *xas, void *entry,
unsigned int order, gfp_t gfp)
{
}
#endif
/**
* xas_reload() - Refetch an entry from the xarray.
* @xas: XArray operation state.
*
* Use this function to check that a previously loaded entry still has
* the same value. This is useful for the lockless pagecache lookup where
* we walk the array with only the RCU lock to protect us, lock the page,
* then check that the page hasn't moved since we looked it up.
*
* The caller guarantees that @xas is still valid. If it may be in an
* error or restart state, call xas_load() instead.
*
* Return: The entry at this location in the xarray.
*/
static inline void *xas_reload(struct xa_state *xas)
{
struct xa_node *node = xas->xa_node;
void *entry;
char offset;
if (!node) return xa_head(xas->xa);
if (IS_ENABLED(CONFIG_XARRAY_MULTI)) {
offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK;
entry = xa_entry(xas->xa, node, offset);
if (!xa_is_sibling(entry))
return entry;
offset = xa_to_sibling(entry);
} else {
offset = xas->xa_offset;
}
return xa_entry(xas->xa, node, offset);
}
/**
* xas_set() - Set up XArray operation state for a different index.
* @xas: XArray operation state.
* @index: New index into the XArray.
*
* Move the operation state to refer to a different index. This will
* have the effect of starting a walk from the top; see xas_next()
* to move to an adjacent index.
*/
static inline void xas_set(struct xa_state *xas, unsigned long index)
{
xas->xa_index = index;
xas->xa_node = XAS_RESTART;
}
/**
* xas_set_order() - Set up XArray operation state for a multislot entry.
* @xas: XArray operation state.
* @index: Target of the operation.
* @order: Entry occupies 2^@order indices.
*/
static inline void xas_set_order(struct xa_state *xas, unsigned long index,
unsigned int order)
{
#ifdef CONFIG_XARRAY_MULTI
xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0;
xas->xa_shift = order - (order % XA_CHUNK_SHIFT);
xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
xas->xa_node = XAS_RESTART;
#else
BUG_ON(order > 0);
xas_set(xas, index);
#endif
}
/**
* xas_set_update() - Set up XArray operation state for a callback.
* @xas: XArray operation state.
* @update: Function to call when updating a node.
*
* The XArray can notify a caller after it has updated an xa_node.
* This is advanced functionality and is only needed by the page cache.
*/
static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update)
{
xas->xa_update = update;
}
/**
* xas_next_entry() - Advance iterator to next present entry.
* @xas: XArray operation state.
* @max: Highest index to return.
*
* xas_next_entry() is an inline function to optimise xarray traversal for
* speed. It is equivalent to calling xas_find(), and will call xas_find()
* for all the hard cases.
*
* Return: The next present entry after the one currently referred to by @xas.
*/
static inline void *xas_next_entry(struct xa_state *xas, unsigned long max)
{
struct xa_node *node = xas->xa_node;
void *entry;
if (unlikely(xas_not_node(node) || node->shift ||
xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)))
return xas_find(xas, max);
do {
if (unlikely(xas->xa_index >= max)) return xas_find(xas, max); if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) return xas_find(xas, max); entry = xa_entry(xas->xa, node, xas->xa_offset + 1);
if (unlikely(xa_is_internal(entry)))
return xas_find(xas, max); xas->xa_offset++;
xas->xa_index++;
} while (!entry);
return entry;
}
/* Private */
static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance,
xa_mark_t mark)
{
unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark];
unsigned int offset = xas->xa_offset;
if (advance)
offset++;
if (XA_CHUNK_SIZE == BITS_PER_LONG) {
if (offset < XA_CHUNK_SIZE) { unsigned long data = *addr & (~0UL << offset);
if (data)
return __ffs(data);
}
return XA_CHUNK_SIZE;
}
return find_next_bit(addr, XA_CHUNK_SIZE, offset);
}
/**
* xas_next_marked() - Advance iterator to next marked entry.
* @xas: XArray operation state.
* @max: Highest index to return.
* @mark: Mark to search for.
*
* xas_next_marked() is an inline function to optimise xarray traversal for
* speed. It is equivalent to calling xas_find_marked(), and will call
* xas_find_marked() for all the hard cases.
*
* Return: The next marked entry after the one currently referred to by @xas.
*/
static inline void *xas_next_marked(struct xa_state *xas, unsigned long max,
xa_mark_t mark)
{
struct xa_node *node = xas->xa_node;
void *entry;
unsigned int offset;
if (unlikely(xas_not_node(node) || node->shift))
return xas_find_marked(xas, max, mark);
offset = xas_find_chunk(xas, true, mark);
xas->xa_offset = offset;
xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset;
if (xas->xa_index > max)
return NULL;
if (offset == XA_CHUNK_SIZE)
return xas_find_marked(xas, max, mark);
entry = xa_entry(xas->xa, node, offset);
if (!entry)
return xas_find_marked(xas, max, mark);
return entry;
}
/*
* If iterating while holding a lock, drop the lock and reschedule
* every %XA_CHECK_SCHED loops.
*/
enum {
XA_CHECK_SCHED = 4096,
};
/**
* xas_for_each() - Iterate over a range of an XArray.
* @xas: XArray operation state.
* @entry: Entry retrieved from the array.
* @max: Maximum index to retrieve from array.
*
* The loop body will be executed for each entry present in the xarray
* between the current xas position and @max. @entry will be set to
* the entry retrieved from the xarray. It is safe to delete entries
* from the array in the loop body. You should hold either the RCU lock
* or the xa_lock while iterating. If you need to drop the lock, call
* xas_pause() first.
*/
#define xas_for_each(xas, entry, max) \
for (entry = xas_find(xas, max); entry; \
entry = xas_next_entry(xas, max))
/**
* xas_for_each_marked() - Iterate over a range of an XArray.
* @xas: XArray operation state.
* @entry: Entry retrieved from the array.
* @max: Maximum index to retrieve from array.
* @mark: Mark to search for.
*
* The loop body will be executed for each marked entry in the xarray
* between the current xas position and @max. @entry will be set to
* the entry retrieved from the xarray. It is safe to delete entries
* from the array in the loop body. You should hold either the RCU lock
* or the xa_lock while iterating. If you need to drop the lock, call
* xas_pause() first.
*/
#define xas_for_each_marked(xas, entry, max, mark) \
for (entry = xas_find_marked(xas, max, mark); entry; \
entry = xas_next_marked(xas, max, mark))
/**
* xas_for_each_conflict() - Iterate over a range of an XArray.
* @xas: XArray operation state.
* @entry: Entry retrieved from the array.
*
* The loop body will be executed for each entry in the XArray that
* lies within the range specified by @xas. If the loop terminates
* normally, @entry will be %NULL. The user may break out of the loop,
* which will leave @entry set to the conflicting entry. The caller
* may also call xa_set_err() to exit the loop while setting an error
* to record the reason.
*/
#define xas_for_each_conflict(xas, entry) \
while ((entry = xas_find_conflict(xas)))
void *__xas_next(struct xa_state *);
void *__xas_prev(struct xa_state *);
/**
* xas_prev() - Move iterator to previous index.
* @xas: XArray operation state.
*
* If the @xas was in an error state, it will remain in an error state
* and this function will return %NULL. If the @xas has never been walked,
* it will have the effect of calling xas_load(). Otherwise one will be
* subtracted from the index and the state will be walked to the correct
* location in the array for the next operation.
*
* If the iterator was referencing index 0, this function wraps
* around to %ULONG_MAX.
*
* Return: The entry at the new index. This may be %NULL or an internal
* entry.
*/
static inline void *xas_prev(struct xa_state *xas)
{
struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift ||
xas->xa_offset == 0))
return __xas_prev(xas); xas->xa_index--;
xas->xa_offset--;
return xa_entry(xas->xa, node, xas->xa_offset);
}
/**
* xas_next() - Move state to next index.
* @xas: XArray operation state.
*
* If the @xas was in an error state, it will remain in an error state
* and this function will return %NULL. If the @xas has never been walked,
* it will have the effect of calling xas_load(). Otherwise one will be
* added to the index and the state will be walked to the correct
* location in the array for the next operation.
*
* If the iterator was referencing index %ULONG_MAX, this function wraps
* around to 0.
*
* Return: The entry at the new index. This may be %NULL or an internal
* entry.
*/
static inline void *xas_next(struct xa_state *xas)
{
struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift ||
xas->xa_offset == XA_CHUNK_MASK))
return __xas_next(xas); xas->xa_index++;
xas->xa_offset++;
return xa_entry(xas->xa, node, xas->xa_offset);
}
#endif /* _LINUX_XARRAY_H */
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/string.h>
#include <linux/list_sort.h>
#include <linux/list.h>
/*
* Returns a list organized in an intermediate format suited
* to chaining of merge() calls: null-terminated, no reserved or
* sentinel head node, "prev" links not maintained.
*/
__attribute__((nonnull(2,3,4)))
static struct list_head *merge(void *priv, list_cmp_func_t cmp,
struct list_head *a, struct list_head *b)
{
struct list_head *head, **tail = &head;
for (;;) {
/* if equal, take 'a' -- important for sort stability */
if (cmp(priv, a, b) <= 0) { *tail = a;
tail = &a->next;
a = a->next;
if (!a) {
*tail = b;
break;
}
} else {
*tail = b;
tail = &b->next;
b = b->next;
if (!b) {
*tail = a;
break;
}
}
}
return head;
}
/*
* Combine final list merge with restoration of standard doubly-linked
* list structure. This approach duplicates code from merge(), but
* runs faster than the tidier alternatives of either a separate final
* prev-link restoration pass, or maintaining the prev links
* throughout.
*/
__attribute__((nonnull(2,3,4,5)))
static void merge_final(void *priv, list_cmp_func_t cmp, struct list_head *head,
struct list_head *a, struct list_head *b)
{
struct list_head *tail = head;
u8 count = 0;
for (;;) {
/* if equal, take 'a' -- important for sort stability */
if (cmp(priv, a, b) <= 0) { tail->next = a;
a->prev = tail;
tail = a;
a = a->next;
if (!a)
break;
} else {
tail->next = b;
b->prev = tail;
tail = b;
b = b->next;
if (!b) {
b = a;
break;
}
}
}
/* Finish linking remainder of list b on to tail */
tail->next = b;
do {
/*
* If the merge is highly unbalanced (e.g. the input is
* already sorted), this loop may run many iterations.
* Continue callbacks to the client even though no
* element comparison is needed, so the client's cmp()
* routine can invoke cond_resched() periodically.
*/
if (unlikely(!++count)) cmp(priv, b, b); b->prev = tail;
tail = b;
b = b->next;
} while (b);
/* And the final links to make a circular doubly-linked list */
tail->next = head; head->prev = tail;
}
/**
* list_sort - sort a list
* @priv: private data, opaque to list_sort(), passed to @cmp
* @head: the list to sort
* @cmp: the elements comparison function
*
* The comparison function @cmp must return > 0 if @a should sort after
* @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should
* sort before @b *or* their original order should be preserved. It is
* always called with the element that came first in the input in @a,
* and list_sort is a stable sort, so it is not necessary to distinguish
* the @a < @b and @a == @b cases.
*
* This is compatible with two styles of @cmp function:
* - The traditional style which returns <0 / =0 / >0, or
* - Returning a boolean 0/1.
* The latter offers a chance to save a few cycles in the comparison
* (which is used by e.g. plug_ctx_cmp() in block/blk-mq.c).
*
* A good way to write a multi-word comparison is::
*
* if (a->high != b->high)
* return a->high > b->high;
* if (a->middle != b->middle)
* return a->middle > b->middle;
* return a->low > b->low;
*
*
* This mergesort is as eager as possible while always performing at least
* 2:1 balanced merges. Given two pending sublists of size 2^k, they are
* merged to a size-2^(k+1) list as soon as we have 2^k following elements.
*
* Thus, it will avoid cache thrashing as long as 3*2^k elements can
* fit into the cache. Not quite as good as a fully-eager bottom-up
* mergesort, but it does use 0.2*n fewer comparisons, so is faster in
* the common case that everything fits into L1.
*
*
* The merging is controlled by "count", the number of elements in the
* pending lists. This is beautifully simple code, but rather subtle.
*
* Each time we increment "count", we set one bit (bit k) and clear
* bits k-1 .. 0. Each time this happens (except the very first time
* for each bit, when count increments to 2^k), we merge two lists of
* size 2^k into one list of size 2^(k+1).
*
* This merge happens exactly when the count reaches an odd multiple of
* 2^k, which is when we have 2^k elements pending in smaller lists,
* so it's safe to merge away two lists of size 2^k.
*
* After this happens twice, we have created two lists of size 2^(k+1),
* which will be merged into a list of size 2^(k+2) before we create
* a third list of size 2^(k+1), so there are never more than two pending.
*
* The number of pending lists of size 2^k is determined by the
* state of bit k of "count" plus two extra pieces of information:
*
* - The state of bit k-1 (when k == 0, consider bit -1 always set), and
* - Whether the higher-order bits are zero or non-zero (i.e.
* is count >= 2^(k+1)).
*
* There are six states we distinguish. "x" represents some arbitrary
* bits, and "y" represents some arbitrary non-zero bits:
* 0: 00x: 0 pending of size 2^k; x pending of sizes < 2^k
* 1: 01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
* 2: x10x: 0 pending of size 2^k; 2^k + x pending of sizes < 2^k
* 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
* 4: y00x: 1 pending of size 2^k; 2^k + x pending of sizes < 2^k
* 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
* (merge and loop back to state 2)
*
* We gain lists of size 2^k in the 2->3 and 4->5 transitions (because
* bit k-1 is set while the more significant bits are non-zero) and
* merge them away in the 5->2 transition. Note in particular that just
* before the 5->2 transition, all lower-order bits are 11 (state 3),
* so there is one list of each smaller size.
*
* When we reach the end of the input, we merge all the pending
* lists, from smallest to largest. If you work through cases 2 to
* 5 above, you can see that the number of elements we merge with a list
* of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to
* 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1).
*/
__attribute__((nonnull(2,3)))
void list_sort(void *priv, struct list_head *head, list_cmp_func_t cmp)
{
struct list_head *list = head->next, *pending = NULL;
size_t count = 0; /* Count of pending */
if (list == head->prev) /* Zero or one elements */
return;
/* Convert to a null-terminated singly-linked list. */
head->prev->next = NULL;
/*
* Data structure invariants:
* - All lists are singly linked and null-terminated; prev
* pointers are not maintained.
* - pending is a prev-linked "list of lists" of sorted
* sublists awaiting further merging.
* - Each of the sorted sublists is power-of-two in size.
* - Sublists are sorted by size and age, smallest & newest at front.
* - There are zero to two sublists of each size.
* - A pair of pending sublists are merged as soon as the number
* of following pending elements equals their size (i.e.
* each time count reaches an odd multiple of that size).
* That ensures each later final merge will be at worst 2:1.
* - Each round consists of:
* - Merging the two sublists selected by the highest bit
* which flips when count is incremented, and
* - Adding an element from the input as a size-1 sublist.
*/
do {
size_t bits;
struct list_head **tail = &pending;
/* Find the least-significant clear bit in count */
for (bits = count; bits & 1; bits >>= 1) tail = &(*tail)->prev;
/* Do the indicated merge */
if (likely(bits)) { struct list_head *a = *tail, *b = a->prev;
a = merge(priv, cmp, b, a);
/* Install the merged result in place of the inputs */
a->prev = b->prev;
*tail = a;
}
/* Move one element from input list to pending */
list->prev = pending;
pending = list;
list = list->next;
pending->next = NULL;
count++;
} while (list);
/* End of input; merge together all the pending lists. */
list = pending;
pending = pending->prev;
for (;;) {
struct list_head *next = pending->prev;
if (!next)
break;
list = merge(priv, cmp, pending, list);
pending = next;
}
/* The final merge, rebuilding prev links */
merge_final(priv, cmp, head, pending, list);
}
EXPORT_SYMBOL(list_sort);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PKEYS_H
#define _ASM_X86_PKEYS_H
/*
* If more than 16 keys are ever supported, a thorough audit
* will be necessary to ensure that the types that store key
* numbers and masks have sufficient capacity.
*/
#define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1)
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
static inline bool arch_pkeys_enabled(void)
{
return cpu_feature_enabled(X86_FEATURE_OSPKE);
}
/*
* Try to dedicate one of the protection keys to be used as an
* execute-only protection key.
*/
extern int __execute_only_pkey(struct mm_struct *mm);
static inline int execute_only_pkey(struct mm_struct *mm)
{
if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return ARCH_DEFAULT_PKEY;
return __execute_only_pkey(mm);
}
extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
int prot, int pkey);
static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
int prot, int pkey)
{
if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return 0;
return __arch_override_mprotect_pkey(vma, prot, pkey);
}
extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map)
#define mm_set_pkey_allocated(mm, pkey) do { \
mm_pkey_allocation_map(mm) |= (1U << pkey); \
} while (0)
#define mm_set_pkey_free(mm, pkey) do { \
mm_pkey_allocation_map(mm) &= ~(1U << pkey); \
} while (0)
static inline
bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
{
/*
* "Allocated" pkeys are those that have been returned
* from pkey_alloc() or pkey 0 which is allocated
* implicitly when the mm is created.
*/
if (pkey < 0)
return false;
if (pkey >= arch_max_pkey())
return false;
/*
* The exec-only pkey is set in the allocation map, but
* is not available to any of the user interfaces like
* mprotect_pkey().
*/
if (pkey == mm->context.execute_only_pkey)
return false;
return mm_pkey_allocation_map(mm) & (1U << pkey);
}
/*
* Returns a positive, 4-bit key on success, or -1 on failure.
*/
static inline
int mm_pkey_alloc(struct mm_struct *mm)
{
/*
* Note: this is the one and only place we make sure
* that the pkey is valid as far as the hardware is
* concerned. The rest of the kernel trusts that
* only good, valid pkeys come out of here.
*/
u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);
int ret;
/*
* Are we out of pkeys? We must handle this specially
* because ffz() behavior is undefined if there are no
* zeros.
*/
if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
return -1;
ret = ffz(mm_pkey_allocation_map(mm));
mm_set_pkey_allocated(mm, ret);
return ret;
}
static inline
int mm_pkey_free(struct mm_struct *mm, int pkey)
{
if (!mm_pkey_is_allocated(mm, pkey))
return -EINVAL;
mm_set_pkey_free(mm, pkey);
return 0;
}
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
static inline int vma_pkey(struct vm_area_struct *vma)
{
unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
VM_PKEY_BIT2 | VM_PKEY_BIT3;
return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
}
#endif /*_ASM_X86_PKEYS_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Filesystem access notification for Linux
*
* Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
*/
#ifndef __LINUX_FSNOTIFY_BACKEND_H
#define __LINUX_FSNOTIFY_BACKEND_H
#ifdef __KERNEL__
#include <linux/idr.h> /* inotify uses this */
#include <linux/fs.h> /* struct inode */
#include <linux/list.h>
#include <linux/path.h> /* struct path */
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
/*
* IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
* convert between them. dnotify only needs conversion at watch creation
* so no perf loss there. fanotify isn't defined yet, so it can use the
* wholes if it needs more events.
*/
#define FS_ACCESS 0x00000001 /* File was accessed */
#define FS_MODIFY 0x00000002 /* File was modified */
#define FS_ATTRIB 0x00000004 /* Metadata changed */
#define FS_CLOSE_WRITE 0x00000008 /* Writtable file was closed */
#define FS_CLOSE_NOWRITE 0x00000010 /* Unwrittable file closed */
#define FS_OPEN 0x00000020 /* File was opened */
#define FS_MOVED_FROM 0x00000040 /* File was moved from X */
#define FS_MOVED_TO 0x00000080 /* File was moved to Y */
#define FS_CREATE 0x00000100 /* Subfile was created */
#define FS_DELETE 0x00000200 /* Subfile was deleted */
#define FS_DELETE_SELF 0x00000400 /* Self was deleted */
#define FS_MOVE_SELF 0x00000800 /* Self was moved */
#define FS_OPEN_EXEC 0x00001000 /* File was opened for exec */
#define FS_UNMOUNT 0x00002000 /* inode on umount fs */
#define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */
#define FS_IN_IGNORED 0x00008000 /* last inotify event here */
#define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */
#define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */
#define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */
#define FS_EXCL_UNLINK 0x04000000 /* do not send events if object is unlinked */
/*
* Set on inode mark that cares about things that happen to its children.
* Always set for dnotify and inotify.
* Set on inode/sb/mount marks that care about parent/name info.
*/
#define FS_EVENT_ON_CHILD 0x08000000
#define FS_DN_RENAME 0x10000000 /* file renamed */
#define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */
#define FS_ISDIR 0x40000000 /* event occurred against dir */
#define FS_IN_ONESHOT 0x80000000 /* only send event once */
#define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO)
/*
* Directory entry modification events - reported only to directory
* where entry is modified and not to a watching parent.
* The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event
* when a directory entry inside a child subdir changes.
*/
#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE)
#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
FS_OPEN_EXEC_PERM)
/*
* This is a list of all events that may get sent to a parent that is watching
* with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory.
*/
#define FS_EVENTS_POSS_ON_CHILD (ALL_FSNOTIFY_PERM_EVENTS | \
FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \
FS_OPEN | FS_OPEN_EXEC)
/*
* This is a list of all events that may get sent with the parent inode as the
* @to_tell argument of fsnotify().
* It may include events that can be sent to an inode/sb/mount mark, but cannot
* be sent to a parent watching children.
*/
#define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD)
/* Events that can be reported to backends */
#define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
FS_EVENTS_POSS_ON_CHILD | \
FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \
FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED)
/* Extra flags that may be reported with event or control handling of events */
#define ALL_FSNOTIFY_FLAGS (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \
FS_DN_MULTISHOT | FS_EVENT_ON_CHILD)
#define ALL_FSNOTIFY_BITS (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS)
struct fsnotify_group;
struct fsnotify_event;
struct fsnotify_mark;
struct fsnotify_event_private_data;
struct fsnotify_fname;
struct fsnotify_iter_info;
struct mem_cgroup;
/*
* Each group much define these ops. The fsnotify infrastructure will call
* these operations for each relevant group.
*
* handle_event - main call for a group to handle an fs event
* @group: group to notify
* @mask: event type and flags
* @data: object that event happened on
* @data_type: type of object for fanotify_data_XXX() accessors
* @dir: optional directory associated with event -
* if @file_name is not NULL, this is the directory that
* @file_name is relative to
* @file_name: optional file name associated with event
* @cookie: inotify rename cookie
* @iter_info: array of marks from this group that are interested in the event
*
* handle_inode_event - simple variant of handle_event() for groups that only
* have inode marks and don't have ignore mask
* @mark: mark to notify
* @mask: event type and flags
* @inode: inode that event happened on
* @dir: optional directory associated with event -
* if @file_name is not NULL, this is the directory that
* @file_name is relative to.
* @file_name: optional file name associated with event
* @cookie: inotify rename cookie
*
* free_group_priv - called when a group refcnt hits 0 to clean up the private union
* freeing_mark - called when a mark is being destroyed for some reason. The group
* MUST be holding a reference on each mark and that reference must be
* dropped in this function. inotify uses this function to send
* userspace messages that marks have been removed.
*/
struct fsnotify_ops {
int (*handle_event)(struct fsnotify_group *group, u32 mask,
const void *data, int data_type, struct inode *dir,
const struct qstr *file_name, u32 cookie,
struct fsnotify_iter_info *iter_info);
int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask,
struct inode *inode, struct inode *dir,
const struct qstr *file_name, u32 cookie);
void (*free_group_priv)(struct fsnotify_group *group);
void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
void (*free_event)(struct fsnotify_event *event);
/* called on final put+free to free memory */
void (*free_mark)(struct fsnotify_mark *mark);
};
/*
* all of the information about the original object we want to now send to
* a group. If you want to carry more info from the accessing task to the
* listener this structure is where you need to be adding fields.
*/
struct fsnotify_event {
struct list_head list;
};
/*
* A group is a "thing" that wants to receive notification about filesystem
* events. The mask holds the subset of event types this group cares about.
* refcnt on a group is up to the implementor and at any moment if it goes 0
* everything will be cleaned up.
*/
struct fsnotify_group {
const struct fsnotify_ops *ops; /* how this group handles things */
/*
* How the refcnt is used is up to each group. When the refcnt hits 0
* fsnotify will clean up all of the resources associated with this group.
* As an example, the dnotify group will always have a refcnt=1 and that
* will never change. Inotify, on the other hand, has a group per
* inotify_init() and the refcnt will hit 0 only when that fd has been
* closed.
*/
refcount_t refcnt; /* things with interest in this group */
/* needed to send notification to userspace */
spinlock_t notification_lock; /* protect the notification_list */
struct list_head notification_list; /* list of event_holder this group needs to send to userspace */
wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */
unsigned int q_len; /* events on the queue */
unsigned int max_events; /* maximum events allowed on the list */
/*
* Valid fsnotify group priorities. Events are send in order from highest
* priority to lowest priority. We default to the lowest priority.
*/
#define FS_PRIO_0 0 /* normal notifiers, no permissions */
#define FS_PRIO_1 1 /* fanotify content based access control */
#define FS_PRIO_2 2 /* fanotify pre-content access */
unsigned int priority;
bool shutdown; /* group is being shut down, don't queue more events */
/* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
struct mutex mark_mutex; /* protect marks_list */
atomic_t user_waits; /* Number of tasks waiting for user
* response */
struct list_head marks_list; /* all inode marks for this group */
struct fasync_struct *fsn_fa; /* async notification */
struct fsnotify_event *overflow_event; /* Event we queue when the
* notification list is too
* full */
struct mem_cgroup *memcg; /* memcg to charge allocations */
/* groups can define private fields here or use the void *private */
union {
void *private;
#ifdef CONFIG_INOTIFY_USER
struct inotify_group_private_data {
spinlock_t idr_lock;
struct idr idr;
struct ucounts *ucounts;
} inotify_data;
#endif
#ifdef CONFIG_FANOTIFY
struct fanotify_group_private_data {
/* Hash table of events for merge */
struct hlist_head *merge_hash;
/* allows a group to block waiting for a userspace response */
struct list_head access_list;
wait_queue_head_t access_waitq;
int flags; /* flags from fanotify_init() */
int f_flags; /* event_f_flags from fanotify_init() */
struct ucounts *ucounts;
} fanotify_data;
#endif /* CONFIG_FANOTIFY */
};
};
/* When calling fsnotify tell it if the data is a path or inode */
enum fsnotify_data_type {
FSNOTIFY_EVENT_NONE,
FSNOTIFY_EVENT_PATH,
FSNOTIFY_EVENT_INODE,
};
static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
{
switch (data_type) {
case FSNOTIFY_EVENT_INODE:
return (struct inode *)data;
case FSNOTIFY_EVENT_PATH:
return d_inode(((const struct path *)data)->dentry);
default:
return NULL;
}
}
static inline const struct path *fsnotify_data_path(const void *data,
int data_type)
{
switch (data_type) {
case FSNOTIFY_EVENT_PATH:
return data;
default:
return NULL;
}
}
enum fsnotify_obj_type {
FSNOTIFY_OBJ_TYPE_INODE,
FSNOTIFY_OBJ_TYPE_PARENT,
FSNOTIFY_OBJ_TYPE_VFSMOUNT,
FSNOTIFY_OBJ_TYPE_SB,
FSNOTIFY_OBJ_TYPE_COUNT,
FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT
};
#define FSNOTIFY_OBJ_TYPE_INODE_FL (1U << FSNOTIFY_OBJ_TYPE_INODE)
#define FSNOTIFY_OBJ_TYPE_PARENT_FL (1U << FSNOTIFY_OBJ_TYPE_PARENT)
#define FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL (1U << FSNOTIFY_OBJ_TYPE_VFSMOUNT)
#define FSNOTIFY_OBJ_TYPE_SB_FL (1U << FSNOTIFY_OBJ_TYPE_SB)
#define FSNOTIFY_OBJ_ALL_TYPES_MASK ((1U << FSNOTIFY_OBJ_TYPE_COUNT) - 1)
static inline bool fsnotify_valid_obj_type(unsigned int type)
{
return (type < FSNOTIFY_OBJ_TYPE_COUNT);
}
struct fsnotify_iter_info {
struct fsnotify_mark *marks[FSNOTIFY_OBJ_TYPE_COUNT];
unsigned int report_mask;
int srcu_idx;
};
static inline bool fsnotify_iter_should_report_type(
struct fsnotify_iter_info *iter_info, int type)
{
return (iter_info->report_mask & (1U << type));
}
static inline void fsnotify_iter_set_report_type(
struct fsnotify_iter_info *iter_info, int type)
{
iter_info->report_mask |= (1U << type);
}
static inline void fsnotify_iter_set_report_type_mark(
struct fsnotify_iter_info *iter_info, int type,
struct fsnotify_mark *mark)
{
iter_info->marks[type] = mark;
iter_info->report_mask |= (1U << type);
}
#define FSNOTIFY_ITER_FUNCS(name, NAME) \
static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \
struct fsnotify_iter_info *iter_info) \
{ \
return (iter_info->report_mask & FSNOTIFY_OBJ_TYPE_##NAME##_FL) ? \
iter_info->marks[FSNOTIFY_OBJ_TYPE_##NAME] : NULL; \
}
FSNOTIFY_ITER_FUNCS(inode, INODE)
FSNOTIFY_ITER_FUNCS(parent, PARENT)
FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT)FSNOTIFY_ITER_FUNCS(sb, SB)
#define fsnotify_foreach_obj_type(type) \
for (type = 0; type < FSNOTIFY_OBJ_TYPE_COUNT; type++)
/*
* fsnotify_connp_t is what we embed in objects which connector can be attached
* to. fsnotify_connp_t * is how we refer from connector back to object.
*/
struct fsnotify_mark_connector;
typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;
/*
* Inode/vfsmount/sb point to this structure which tracks all marks attached to
* the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this
* structure. We destroy this structure when there are no more marks attached
* to it. The structure is protected by fsnotify_mark_srcu.
*/
struct fsnotify_mark_connector {
spinlock_t lock;
unsigned short type; /* Type of object [lock] */
#define FSNOTIFY_CONN_FLAG_HAS_FSID 0x01
unsigned short flags; /* flags [lock] */
__kernel_fsid_t fsid; /* fsid of filesystem containing object */
union {
/* Object pointer [lock] */
fsnotify_connp_t *obj;
/* Used listing heads to free after srcu period expires */
struct fsnotify_mark_connector *destroy_next;
};
struct hlist_head list;
};
/*
* A mark is simply an object attached to an in core inode which allows an
* fsnotify listener to indicate they are either no longer interested in events
* of a type matching mask or only interested in those events.
*
* These are flushed when an inode is evicted from core and may be flushed
* when the inode is modified (as seen by fsnotify_access). Some fsnotify
* users (such as dnotify) will flush these when the open fd is closed and not
* at inode eviction or modification.
*
* Text in brackets is showing the lock(s) protecting modifications of a
* particular entry. obj_lock means either inode->i_lock or
* mnt->mnt_root->d_lock depending on the mark type.
*/
struct fsnotify_mark {
/* Mask this mark is for [mark->lock, group->mark_mutex] */
__u32 mask;
/* We hold one for presence in g_list. Also one ref for each 'thing'
* in kernel that found and may be using this mark. */
refcount_t refcnt;
/* Group this mark is for. Set on mark creation, stable until last ref
* is dropped */
struct fsnotify_group *group;
/* List of marks by group->marks_list. Also reused for queueing
* mark into destroy_list when it's waiting for the end of SRCU period
* before it can be freed. [group->mark_mutex] */
struct list_head g_list;
/* Protects inode / mnt pointers, flags, masks */
spinlock_t lock;
/* List of marks for inode / vfsmount [connector->lock, mark ref] */
struct hlist_node obj_list;
/* Head of list of marks for an object [mark ref] */
struct fsnotify_mark_connector *connector;
/* Events types to ignore [mark->lock, group->mark_mutex] */
__u32 ignored_mask;
#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x01
#define FSNOTIFY_MARK_FLAG_ALIVE 0x02
#define FSNOTIFY_MARK_FLAG_ATTACHED 0x04
unsigned int flags; /* flags [mark->lock] */
};
#ifdef CONFIG_FSNOTIFY
/* called from the vfs helpers */
/* main fsnotify call to send events */
extern int fsnotify(__u32 mask, const void *data, int data_type,
struct inode *dir, const struct qstr *name,
struct inode *inode, u32 cookie);
extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
int data_type);
extern void __fsnotify_inode_delete(struct inode *inode);
extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
extern void fsnotify_sb_delete(struct super_block *sb);
extern u32 fsnotify_get_cookie(void);
static inline __u32 fsnotify_parent_needed_mask(__u32 mask)
{
/* FS_EVENT_ON_CHILD is set on marks that want parent/name info */
if (!(mask & FS_EVENT_ON_CHILD))
return 0;
/*
* This object might be watched by a mark that cares about parent/name
* info, does it care about the specific set of events that can be
* reported with parent/name info?
*/
return mask & FS_EVENTS_POSS_TO_PARENT;
}
static inline int fsnotify_inode_watches_children(struct inode *inode)
{
/* FS_EVENT_ON_CHILD is set if the inode may care */
if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD))
return 0;
/* this inode might care about child events, does it care about the
* specific set of events that can happen on a child? */
return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD;
}
/*
* Update the dentry with a flag indicating the interest of its parent to receive
* filesystem events when those events happens to this dentry->d_inode.
*/
static inline void fsnotify_update_flags(struct dentry *dentry)
{
assert_spin_locked(&dentry->d_lock);
/*
* Serialisation of setting PARENT_WATCHED on the dentries is provided
* by d_lock. If inotify_inode_watched changes after we have taken
* d_lock, the following __fsnotify_update_child_dentry_flags call will
* find our entry, so it will spin until we complete here, and update
* us with the new state.
*/
if (fsnotify_inode_watches_children(dentry->d_parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
else
dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
}
/* called from fsnotify listeners, such as fanotify or dnotify */
/* create a new group */
extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops);
extern struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops);
/* get reference to a group */
extern void fsnotify_get_group(struct fsnotify_group *group);
/* drop reference on a group from fsnotify_alloc_group */
extern void fsnotify_put_group(struct fsnotify_group *group);
/* group destruction begins, stop queuing new events */
extern void fsnotify_group_stop_queueing(struct fsnotify_group *group);
/* destroy group */
extern void fsnotify_destroy_group(struct fsnotify_group *group);
/* fasync handler function */
extern int fsnotify_fasync(int fd, struct file *file, int on);
/* Free event from memory */
extern void fsnotify_destroy_event(struct fsnotify_group *group,
struct fsnotify_event *event);
/* attach the event to the group notification queue */
extern int fsnotify_add_event(struct fsnotify_group *group,
struct fsnotify_event *event,
int (*merge)(struct fsnotify_group *,
struct fsnotify_event *),
void (*insert)(struct fsnotify_group *,
struct fsnotify_event *));
/* Queue overflow event to a notification group */
static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
{
fsnotify_add_event(group, group->overflow_event, NULL, NULL);
}
static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
{
assert_spin_locked(&group->notification_lock);
return list_empty(&group->notification_list);
}
extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
/* return, but do not dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
/* return AND dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group);
/* Remove event queued in the notification list */
extern void fsnotify_remove_queued_event(struct fsnotify_group *group,
struct fsnotify_event *event);
/* functions used to manipulate the marks attached to inodes */
/* Get mask of events for a list of marks */
extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn);
/* Calculate mask of events for a list of marks */
extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn);
extern void fsnotify_init_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group);
/* Find mark belonging to given group in the list of marks */
extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
struct fsnotify_group *group);
/* Get cached fsid of filesystem containing object */
extern int fsnotify_get_conn_fsid(const struct fsnotify_mark_connector *conn,
__kernel_fsid_t *fsid);
/* attach the mark to the object */
extern int fsnotify_add_mark(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int type,
int allow_dups, __kernel_fsid_t *fsid);
extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_connp_t *connp,
unsigned int type, int allow_dups,
__kernel_fsid_t *fsid);
/* attach the mark to the inode */
static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
struct inode *inode,
int allow_dups)
{
return fsnotify_add_mark(mark, &inode->i_fsnotify_marks,
FSNOTIFY_OBJ_TYPE_INODE, allow_dups, NULL);
}
static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
struct inode *inode,
int allow_dups)
{
return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks,
FSNOTIFY_OBJ_TYPE_INODE, allow_dups,
NULL);
}
/* given a group and a mark, flag mark to be freed when all references are dropped */
extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group);
/* detach mark from inode / mount list, group list, drop inode reference */
extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
/* free mark */
extern void fsnotify_free_mark(struct fsnotify_mark *mark);
/* Wait until all marks queued for destruction are destroyed */
extern void fsnotify_wait_marks_destroyed(void);
/* run all the marks in a group, and clear all of the marks attached to given object type */
extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type);
/* run all the marks in a group, and clear all of the vfsmount marks */
static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
{
fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL);
}
/* run all the marks in a group, and clear all of the inode marks */
static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
{
fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE_FL);
}
/* run all the marks in a group, and clear all of the sn marks */
static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group)
{
fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB_FL);
}
extern void fsnotify_get_mark(struct fsnotify_mark *mark);
extern void fsnotify_put_mark(struct fsnotify_mark *mark);
extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);
static inline void fsnotify_init_event(struct fsnotify_event *event)
{
INIT_LIST_HEAD(&event->list);
}
#else
static inline int fsnotify(__u32 mask, const void *data, int data_type,
struct inode *dir, const struct qstr *name,
struct inode *inode, u32 cookie)
{
return 0;
}
static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask,
const void *data, int data_type)
{
return 0;
}
static inline void __fsnotify_inode_delete(struct inode *inode)
{}
static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{}
static inline void fsnotify_sb_delete(struct super_block *sb)
{}
static inline void fsnotify_update_flags(struct dentry *dentry)
{}
static inline u32 fsnotify_get_cookie(void)
{
return 0;
}
static inline void fsnotify_unmount_inodes(struct super_block *sb)
{}
#endif /* CONFIG_FSNOTIFY */
#endif /* __KERNEL __ */
#endif /* __LINUX_FSNOTIFY_BACKEND_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/memory.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*/
/*
* demand-loading started 01.12.91 - seems it is high on the list of
* things wanted, and it should be easy to implement. - Linus
*/
/*
* Ok, demand-loading was easy, shared pages a little bit tricker. Shared
* pages started 02.12.91, seems to work. - Linus.
*
* Tested sharing by executing about 30 /bin/sh: under the old kernel it
* would have taken more than the 6M I have free, but it worked well as
* far as I could see.
*
* Also corrected some "invalidate()"s - I wasn't doing enough of them.
*/
/*
* Real VM (paging to/from disk) started 18.12.91. Much more work and
* thought has to go into this. Oh, well..
* 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
* Found it. Everything seems to work now.
* 20.12.91 - Ok, making the swap-device changeable like the root.
*/
/*
* 05.04.94 - Multi-page memory management added for v1.1.
* Idea by Alex Bligh (alex@cconcepts.co.uk)
*
* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
* (Gerhard.Wichert@pdb.siemens.de)
*
* Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
*/
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <trace/events/kmem.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include "pgalloc-track.h"
#include "internal.h"
#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
#ifndef CONFIG_NUMA
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);
struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, then end
* of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
* highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
* and ZONE_HIGHMEM.
*/
void *high_memory;
EXPORT_SYMBOL(high_memory);
/*
* Randomize the address space (stacks, mmaps, brk, etc.).
*
* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
* as ancient (libc5 based) binaries can segfault. )
*/
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
1;
#else
2;
#endif
#ifndef arch_faults_on_old_pte
static inline bool arch_faults_on_old_pte(void)
{
/*
* Those arches which don't have hw access flag feature need to
* implement their own helper. By default, "true" means pagefault
* will be hit on old pte.
*/
return true;
}
#endif
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
/*
* Transitioning a PTE from 'old' to 'young' can be expensive on
* some architectures, even if it's performed in hardware. By
* default, "false" means prefaulted entries will be 'young'.
*/
return false;
}
#endif
static int __init disable_randmaps(char *s)
{
randomize_va_space = 0;
return 1;
}
__setup("norandmaps", disable_randmaps);
unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);
unsigned long highest_memmap_pfn __read_mostly;
/*
* CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
*/
static int __init init_zero_pfn(void)
{
zero_pfn = page_to_pfn(ZERO_PAGE(0));
return 0;
}
early_initcall(init_zero_pfn);
void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
{
trace_rss_stat(mm, member, count);
}
#if defined(SPLIT_RSS_COUNTING)
void sync_mm_rss(struct mm_struct *mm)
{
int i;
for (i = 0; i < NR_MM_COUNTERS; i++) {
if (current->rss_stat.count[i]) {
add_mm_counter(mm, i, current->rss_stat.count[i]);
current->rss_stat.count[i] = 0;
}
}
current->rss_stat.events = 0;
}
static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
struct task_struct *task = current;
if (likely(task->mm == mm))
task->rss_stat.count[member] += val;
else
add_mm_counter(mm, member, val);
}
#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
/* sync counter once per 64 page faults */
#define TASK_RSS_EVENTS_THRESH (64)
static void check_sync_rss_stat(struct task_struct *task)
{
if (unlikely(task != current))
return;
if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) sync_mm_rss(task->mm);
}
#else /* SPLIT_RSS_COUNTING */
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
static void check_sync_rss_stat(struct task_struct *task)
{
}
#endif /* SPLIT_RSS_COUNTING */
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
unsigned long addr)
{
pgtable_t token = pmd_pgtable(*pmd);
pmd_clear(pmd);
pte_free_tlb(tlb, token, addr);
mm_dec_nr_ptes(tlb->mm);
}
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
pmd_t *pmd;
unsigned long next;
unsigned long start;
start = addr;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
free_pte_range(tlb, pmd, addr);
} while (pmd++, addr = next, addr != end);
start &= PUD_MASK;
if (start < floor)
return;
if (ceiling) {
ceiling &= PUD_MASK;
if (!ceiling)
return;
}
if (end - 1 > ceiling - 1)
return;
pmd = pmd_offset(pud, start);
pud_clear(pud);
pmd_free_tlb(tlb, pmd, start);
mm_dec_nr_pmds(tlb->mm);
}
static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
pud_t *pud;
unsigned long next;
unsigned long start;
start = addr;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
free_pmd_range(tlb, pud, addr, next, floor, ceiling);
} while (pud++, addr = next, addr != end);
start &= P4D_MASK;
if (start < floor)
return;
if (ceiling) {
ceiling &= P4D_MASK;
if (!ceiling)
return;
}
if (end - 1 > ceiling - 1)
return;
pud = pud_offset(p4d, start);
p4d_clear(p4d);
pud_free_tlb(tlb, pud, start);
mm_dec_nr_puds(tlb->mm);
}
static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
p4d_t *p4d;
unsigned long next;
unsigned long start;
start = addr;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d))
continue;
free_pud_range(tlb, p4d, addr, next, floor, ceiling);
} while (p4d++, addr = next, addr != end);
start &= PGDIR_MASK;
if (start < floor)
return;
if (ceiling) {
ceiling &= PGDIR_MASK;
if (!ceiling)
return;
}
if (end - 1 > ceiling - 1)
return;
p4d = p4d_offset(pgd, start);
pgd_clear(pgd);
p4d_free_tlb(tlb, p4d, start);
}
/*
* This function frees user-level page tables of a process.
*/
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
pgd_t *pgd;
unsigned long next;
/*
* The next few lines have given us lots of grief...
*
* Why are we testing PMD* at this top level? Because often
* there will be no work to do at all, and we'd prefer not to
* go all the way down to the bottom just to discover that.
*
* Why all these "- 1"s? Because 0 represents both the bottom
* of the address space and the top of it (using -1 for the
* top wouldn't help much: the masks would do the wrong thing).
* The rule is that addr 0 and floor 0 refer to the bottom of
* the address space, but end 0 and ceiling 0 refer to the top
* Comparisons need to use "end - 1" and "ceiling - 1" (though
* that end 0 case should be mythical).
*
* Wherever addr is brought up or ceiling brought down, we must
* be careful to reject "the opposite 0" before it confuses the
* subsequent tests. But what about where end is brought down
* by PMD_SIZE below? no, end can't go down to 0 there.
*
* Whereas we round start (addr) and ceiling down, by different
* masks at different levels, in order to test whether a table
* now has no other vmas using it, so can be freed, we don't
* bother to round floor or end up - the tests don't need that.
*/
addr &= PMD_MASK;
if (addr < floor) {
addr += PMD_SIZE;
if (!addr)
return;
}
if (ceiling) {
ceiling &= PMD_MASK;
if (!ceiling)
return;
}
if (end - 1 > ceiling - 1)
end -= PMD_SIZE;
if (addr > end - 1)
return;
/*
* We add page table cache pages with PAGE_SIZE,
* (see pte_free_tlb()), flush the tlb if we need
*/
tlb_change_page_size(tlb, PAGE_SIZE);
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);
}
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long floor, unsigned long ceiling)
{
while (vma) {
struct vm_area_struct *next = vma->vm_next;
unsigned long addr = vma->vm_start;
/*
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
*/
unlink_anon_vmas(vma);
unlink_file_vma(vma);
if (is_vm_hugetlb_page(vma)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next ? next->vm_start : ceiling);
} else {
/*
* Optimization: gather nearby vmas into one call down
*/
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
floor, next ? next->vm_start : ceiling);
}
vma = next;
}
}
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm);
if (!new)
return -ENOMEM;
/*
* Ensure all pte setup (eg. pte page lock and page clearing) are
* visible before the pte is made visible to other CPUs by being
* put into page tables.
*
* The other side of the story is the pointer chasing in the page
* table walking code (when walking the page table without locking;
* ie. most of the time). Fortunately, these data accesses consist
* of a chain of data-dependent loads, meaning most CPUs (alpha
* being the notable exception) will already guarantee loads are
* seen in-order. See the alpha page table accessors for the
* smp_rmb() barriers in page table walking code.
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
ptl = pmd_lock(mm, pmd);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm_inc_nr_ptes(mm);
pmd_populate(mm, pmd, new);
new = NULL;
}
spin_unlock(ptl);
if (new)
pte_free(mm, new);
return 0;
}
int __pte_alloc_kernel(pmd_t *pmd)
{
pte_t *new = pte_alloc_one_kernel(&init_mm);
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&init_mm.page_table_lock);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
}
spin_unlock(&init_mm.page_table_lock);
if (new)
pte_free_kernel(&init_mm, new);
return 0;
}
static inline void init_rss_vec(int *rss)
{
memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
}
static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
int i;
if (current->mm == mm)
sync_mm_rss(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
if (rss[i])
add_mm_counter(mm, i, rss[i]);
}
/*
* This function is called to print an error when a bad pte
* is found. For example, we might have a PFN-mapped pte in
* a region that doesn't allow it.
*
* The calling function must still handle the error.
*/
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, struct page *page)
{
pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
pmd_t *pmd = pmd_offset(pud, addr);
struct address_space *mapping;
pgoff_t index;
static unsigned long resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;
/*
* Allow a burst of 60 reports, then keep quiet for that minute;
* or allow a steady drip of one report per second.
*/
if (nr_shown == 60) {
if (time_before(jiffies, resume)) {
nr_unshown++;
return;
}
if (nr_unshown) {
pr_alert("BUG: Bad page map: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
}
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
index = linear_page_index(vma, addr);
pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
current->comm,
(long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
dump_page(page, "bad pte");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
mapping ? mapping->a_ops->readpage : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*
* "Special" mappings do not wish to be associated with a "struct page" (either
* it doesn't exist, or it exists but they don't want to touch it). In this
* case, NULL is returned here. "Normal" mappings do have a struct page.
*
* There are 2 broad cases. Firstly, an architecture may define a pte_special()
* pte bit, in which case this function is trivial. Secondly, an architecture
* may not have a spare pte bit, which requires a more complicated scheme,
* described below.
*
* A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
* special mapping (even if there are underlying and valid "struct pages").
* COWed pages of a VM_PFNMAP are always normal.
*
* The way we recognize COWed pages within VM_PFNMAP mappings is through the
* rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
* set, and the vm_pgoff will point to the first PFN mapped: thus every special
* mapping will always honor the rule
*
* pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
*
* And for normal mappings this is false.
*
* This restricts such mappings to be a linear translation from virtual address
* to pfn. To get around this restriction, we allow arbitrary mappings so long
* as the vma is not a COW mapping; in that case, we know that all ptes are
* special (because none can have been COWed).
*
*
* In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
*
* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
* page" backing, however the difference is that _all_ pages with a struct
* page (that is, those where pfn_valid is true) are refcounted and considered
* normal pages by the VM. The disadvantage is that pages are refcounted
* (which can be slower and simply not an option for some PFNMAP users). The
* advantage is that we don't have to follow the strict linearity rule of
* PFNMAP mappings in order to support COWable mappings.
*
*/
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
if (likely(!pte_special(pte)))
goto check_pfn;
if (vma->vm_ops && vma->vm_ops->find_special_page) return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (is_zero_pfn(pfn))
return NULL;
if (pte_devmap(pte))
return NULL;
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
goto out;
} else {
unsigned long off;
off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
if (!is_cow_mapping(vma->vm_flags))
return NULL;
}
}
if (is_zero_pfn(pfn))
return NULL;
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
/*
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
*/
out:
return pfn_to_page(pfn);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
{
unsigned long pfn = pmd_pfn(pmd);
/*
* There is no pmd_special() but there may be special pmds, e.g.
* in a direct-access (dax) mapping, so let's just replicate the
* !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
*/
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
goto out;
} else {
unsigned long off;
off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
if (!is_cow_mapping(vma->vm_flags))
return NULL;
}
}
if (pmd_devmap(pmd))
return NULL;
if (is_huge_zero_pmd(pmd))
return NULL;
if (unlikely(pfn > highest_memmap_pfn))
return NULL;
/*
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
*/
out:
return pfn_to_page(pfn);
}
#endif
static void restore_exclusive_pte(struct vm_area_struct *vma,
struct page *page, unsigned long address,
pte_t *ptep)
{
pte_t pte;
swp_entry_t entry;
pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
if (pte_swp_soft_dirty(*ptep))
pte = pte_mksoft_dirty(pte);
entry = pte_to_swp_entry(*ptep);
if (pte_swp_uffd_wp(*ptep))
pte = pte_mkuffd_wp(pte);
else if (is_writable_device_exclusive_entry(entry))
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
set_pte_at(vma->vm_mm, address, ptep, pte);
/*
* No need to take a page reference as one was already
* created when the swap entry was made.
*/
if (PageAnon(page))
page_add_anon_rmap(page, vma, address, false);
else
/*
* Currently device exclusive access only supports anonymous
* memory so the entry shouldn't point to a filebacked page.
*/
WARN_ON_ONCE(!PageAnon(page));
if (vma->vm_flags & VM_LOCKED)
mlock_vma_page(page);
/*
* No need to invalidate - it was non-present before. However
* secondary CPUs may have mappings that need invalidating.
*/
update_mmu_cache(vma, address, ptep);
}
/*
* Tries to restore an exclusive pte if the page lock can be acquired without
* sleeping.
*/
static int
try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr)
{
swp_entry_t entry = pte_to_swp_entry(*src_pte);
struct page *page = pfn_swap_entry_to_page(entry);
if (trylock_page(page)) {
restore_exclusive_pte(vma, page, addr, src_pte);
unlock_page(page);
return 0;
}
return -EBUSY;
}
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
*/
static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
unsigned long vm_flags = dst_vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
swp_entry_t entry = pte_to_swp_entry(pte);
if (likely(!non_swap_entry(entry))) {
if (swap_duplicate(entry) < 0)
return -EIO;
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
spin_lock(&mmlist_lock);
if (list_empty(&dst_mm->mmlist))
list_add(&dst_mm->mmlist,
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
rss[MM_SWAPENTS]++;
} else if (is_migration_entry(entry)) {
page = pfn_swap_entry_to_page(entry);
rss[mm_counter(page)]++;
if (is_writable_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
entry = make_readable_migration_entry(
swp_offset(entry));
pte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(*src_pte))
pte = pte_swp_mksoft_dirty(pte);
if (pte_swp_uffd_wp(*src_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
} else if (is_device_private_entry(entry)) {
page = pfn_swap_entry_to_page(entry);
/*
* Update rss count even for unaddressable pages, as
* they should treated just like normal pages in this
* respect.
*
* We will likely want to have some new rss counters
* for unaddressable pages, at some point. But for now
* keep things as they are.
*/
get_page(page);
rss[mm_counter(page)]++;
page_dup_rmap(page, false);
/*
* We do not preserve soft-dirty information, because so
* far, checkpoint/restore is the only feature that
* requires that. And checkpoint/restore does not work
* when a device driver is involved (you cannot easily
* save and restore device driver state).
*/
if (is_writable_device_private_entry(entry) &&
is_cow_mapping(vm_flags)) {
entry = make_readable_device_private_entry(
swp_offset(entry));
pte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(*src_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
} else if (is_device_exclusive_entry(entry)) {
/*
* Make device exclusive entries present by restoring the
* original entry then copying as for a present pte. Device
* exclusive entries currently only support private writable
* (ie. COW) mappings.
*/
VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
if (try_restore_exclusive_pte(src_pte, src_vma, addr))
return -EBUSY;
return -ENOENT;
}
if (!userfaultfd_wp(dst_vma))
pte = pte_swp_clear_uffd_wp(pte);
set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
/*
* Copy a present and normal page if necessary.
*
* NOTE! The usual case is that this doesn't need to do
* anything, and can just return a positive value. That
* will let the caller know that it can just increase
* the page refcount and re-use the pte the traditional
* way.
*
* But _if_ we need to copy it because it needs to be
* pinned in the parent (and the child should get its own
* copy rather than just a reference to the same page),
* we'll do that here and return zero to let the caller
* know we're done.
*
* And if we need a pre-allocated page but don't yet have
* one, return a negative error to let the preallocation
* code know so that it can do so outside the page table
* lock.
*/
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
struct page **prealloc, pte_t pte, struct page *page)
{
struct page *new_page;
/*
* What we want to do is to check whether this page may
* have been pinned by the parent process. If so,
* instead of wrprotect the pte on both sides, we copy
* the page immediately so that we'll always guarantee
* the pinned page won't be randomly replaced in the
* future.
*
* The page pinning checks are just "has this mm ever
* seen pinning", along with the (inexact) check of
* the page count. That might give false positives for
* for pinning, but it will work correctly.
*/
if (likely(!page_needs_cow_for_dma(src_vma, page)))
return 1;
new_page = *prealloc;
if (!new_page)
return -EAGAIN;
/*
* We have a prealloc page, all good! Take it
* over and copy the page & arm it.
*/
*prealloc = NULL;
copy_user_highpage(new_page, page, addr, src_vma);
__SetPageUptodate(new_page);
page_add_new_anon_rmap(new_page, dst_vma, addr, false);
lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
rss[mm_counter(new_page)]++;
/* All done, just insert the new page copy in the child */
pte = mk_pte(new_page, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
if (userfaultfd_pte_wp(dst_vma, *src_pte))
/* Uffd-wp needs to be delivered to dest pte as well */
pte = pte_wrprotect(pte_mkuffd_wp(pte));
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
/*
* Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
* is required to copy this pte.
*/
static inline int
copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
struct page **prealloc)
{
struct mm_struct *src_mm = src_vma->vm_mm;
unsigned long vm_flags = src_vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
page = vm_normal_page(src_vma, addr, pte);
if (page) {
int retval;
retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
addr, rss, prealloc, pte, page);
if (retval <= 0)
return retval;
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
}
/*
* If it's a COW mapping, write protect it both
* in the parent and the child
*/
if (is_cow_mapping(vm_flags) && pte_write(pte)) {
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
/*
* If it's a shared mapping, mark it clean in
* the child
*/
if (vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
if (!userfaultfd_wp(dst_vma))
pte = pte_clear_uffd_wp(pte);
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
static inline struct page *
page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
unsigned long addr)
{
struct page *new_page;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
if (!new_page)
return NULL;
if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
put_page(new_page);
return NULL;
}
cgroup_throttle_swaprate(new_page, GFP_KERNEL);
return new_page;
}
static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
unsigned long end)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress, ret = 0;
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
struct page *prealloc = NULL;
again:
progress = 0;
init_rss_vec(rss);
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte) {
ret = -ENOMEM;
goto out;
}
src_pte = pte_offset_map(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
arch_enter_lazy_mmu_mode();
do {
/*
* We are holding two locks at this point - either of them
* could generate latencies in another task on another CPU.
*/
if (progress >= 32) {
progress = 0;
if (need_resched() ||
spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
break;
}
if (pte_none(*src_pte)) {
progress++;
continue;
}
if (unlikely(!pte_present(*src_pte))) {
ret = copy_nonpresent_pte(dst_mm, src_mm,
dst_pte, src_pte,
dst_vma, src_vma,
addr, rss);
if (ret == -EIO) {
entry = pte_to_swp_entry(*src_pte);
break;
} else if (ret == -EBUSY) {
break;
} else if (!ret) {
progress += 8;
continue;
}
/*
* Device exclusive entry restored, continue by copying
* the now present pte.
*/
WARN_ON_ONCE(ret != -ENOENT);
}
/* copy_present_pte() will clear `*prealloc' if consumed */
ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
addr, rss, &prealloc);
/*
* If we need a pre-allocated page for this pte, drop the
* locks, allocate, and try again.
*/
if (unlikely(ret == -EAGAIN))
break;
if (unlikely(prealloc)) {
/*
* pre-alloc page cannot be reused by next time so as
* to strictly follow mempolicy (e.g., alloc_page_vma()
* will allocate page according to address). This
* could only happen if one pinned pte changed.
*/
put_page(prealloc);
prealloc = NULL;
}
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap(orig_src_pte);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
if (ret == -EIO) {
VM_WARN_ON_ONCE(!entry.val);
if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
ret = -ENOMEM;
goto out;
}
entry.val = 0;
} else if (ret == -EBUSY) {
goto out;
} else if (ret == -EAGAIN) {
prealloc = page_copy_prealloc(src_mm, src_vma, addr);
if (!prealloc)
return -ENOMEM;
} else if (ret) {
VM_WARN_ON_ONCE(1);
}
/* We've captured and resolved the error. Reset, try again. */
ret = 0;
if (addr != end)
goto again;
out:
if (unlikely(prealloc))
put_page(prealloc);
return ret;
}
static inline int
copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
unsigned long end)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
pmd_t *src_pmd, *dst_pmd;
unsigned long next;
dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
if (!dst_pmd)
return -ENOMEM;
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
|| pmd_devmap(*src_pmd)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
addr, dst_vma, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
continue;
/* fall through */
}
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
addr, next))
return -ENOMEM;
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return 0;
}
static inline int
copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
unsigned long end)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
pud_t *src_pud, *dst_pud;
unsigned long next;
dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
if (!dst_pud)
return -ENOMEM;
src_pud = pud_offset(src_p4d, addr);
do {
next = pud_addr_end(addr, end);
if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
err = copy_huge_pud(dst_mm, src_mm,
dst_pud, src_pud, addr, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
continue;
/* fall through */
}
if (pud_none_or_clear_bad(src_pud))
continue;
if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
addr, next))
return -ENOMEM;
} while (dst_pud++, src_pud++, addr = next, addr != end);
return 0;
}
static inline int
copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
unsigned long end)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
p4d_t *src_p4d, *dst_p4d;
unsigned long next;
dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
if (!dst_p4d)
return -ENOMEM;
src_p4d = p4d_offset(src_pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(src_p4d))
continue;
if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
addr, next))
return -ENOMEM;
} while (dst_p4d++, src_p4d++, addr = next, addr != end);
return 0;
}
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
pgd_t *src_pgd, *dst_pgd;
unsigned long next;
unsigned long addr = src_vma->vm_start;
unsigned long end = src_vma->vm_end;
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
bool is_cow;
int ret;
/*
* Don't copy ptes where a page fault will fill them correctly.
* Fork becomes much lighter when there are big shared or private
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
!src_vma->anon_vma)
return 0;
if (is_vm_hugetlb_page(src_vma))
return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
/*
* We do not free on error cases below as remove_vma
* gets called on error from higher level routine
*/
ret = track_pfn_copy(src_vma);
if (ret)
return ret;
}
/*
* We need to invalidate the secondary MMU mappings only when
* there could be a permission downgrade on the ptes of the
* parent mm. And a permission downgrade will only happen if
* is_cow_mapping() returns true.
*/
is_cow = is_cow_mapping(src_vma->vm_flags);
if (is_cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, src_vma, src_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
/*
* Disabling preemption is not needed for the write side, as
* the read side doesn't spin, but goes to the mmap_lock.
*
* Use the raw variant of the seqcount_t write API to avoid
* lockdep complaining about preemptibility.
*/
mmap_assert_write_locked(src_mm);
raw_write_seqcount_begin(&src_mm->write_protect_seq);
}
ret = 0;
dst_pgd = pgd_offset(dst_mm, addr);
src_pgd = pgd_offset(src_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(src_pgd))
continue;
if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
addr, next))) {
ret = -ENOMEM;
break;
}
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
if (is_cow) {
raw_write_seqcount_end(&src_mm->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
}
return ret;
}
/* Whether we should zap all COWed (private) pages too */
static inline bool should_zap_cows(struct zap_details *details)
{
/* By default, zap all pages */
if (!details)
return true;
/* Or, we zap COWed pages only if the caller wants to */
return !details->check_mapping;
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
struct mm_struct *mm = tlb->mm;
int force_flush = 0;
int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
swp_entry_t entry;
tlb_change_page_size(tlb, PAGE_SIZE);
again:
init_rss_vec(rss);
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte = start_pte;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
if (pte_none(ptent))
continue;
if (need_resched())
break;
if (pte_present(ptent)) {
struct page *page;
page = vm_normal_page(vma, addr, ptent);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
* invalidate cache without truncating:
* unmap shared but keep private pages.
*/
if (details->check_mapping &&
details->check_mapping != page_rmapping(page))
continue;
}
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
force_flush = 1;
set_page_dirty(page);
}
if (pte_young(ptent) &&
likely(!(vma->vm_flags & VM_SEQ_READ)))
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
page_remove_rmap(page, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
if (unlikely(__tlb_remove_page(tlb, page))) {
force_flush = 1;
addr += PAGE_SIZE;
break;
}
continue;
}
entry = pte_to_swp_entry(ptent);
if (is_device_private_entry(entry) ||
is_device_exclusive_entry(entry)) {
struct page *page = pfn_swap_entry_to_page(entry);
if (unlikely(details && details->check_mapping)) {
/*
* unmap_shared_mapping_pages() wants to
* invalidate cache without truncating:
* unmap shared but keep private pages.
*/
if (details->check_mapping !=
page_rmapping(page))
continue;
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
if (is_device_private_entry(entry))
page_remove_rmap(page, false);
put_page(page);
continue;
}
if (!non_swap_entry(entry)) {
/* Genuine swap entry, hence a private anon page */
if (!should_zap_cows(details))
continue;
rss[MM_SWAPENTS]--;
} else if (is_migration_entry(entry)) {
struct page *page;
page = pfn_swap_entry_to_page(entry);
if (details && details->check_mapping &&
details->check_mapping != page_rmapping(page))
continue;
rss[mm_counter(page)]--;
}
if (unlikely(!free_swap_and_cache(entry)))
print_bad_pte(vma, addr, ptent, NULL);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, addr != end);
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
/* Do the actual TLB flush before dropping ptl */
if (force_flush)
tlb_flush_mmu_tlbonly(tlb);
pte_unmap_unlock(start_pte, ptl);
/*
* If we forced a TLB flush (either due to running out of
* batch buffers or because we needed to flush dirty TLB
* entries before releasing the ptl), free the batched
* memory too. Restart if we didn't do everything.
*/
if (force_flush) {
force_flush = 0;
tlb_flush_mmu(tlb);
}
if (addr != end) {
cond_resched();
goto again;
}
return addr;
}
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
pmd_t *pmd;
unsigned long next;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
__split_huge_pmd(vma, pmd, addr, false, NULL);
else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
} else if (details && details->single_page &&
PageTransCompound(details->single_page) &&
next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
/*
* Take and drop THP pmd lock so that we cannot return
* prematurely, while zap_huge_pmd() has cleared *pmd,
* but not yet decremented compound_mapcount().
*/
spin_unlock(ptl);
}
/*
* Here there can be other concurrent MADV_DONTNEED or
* trans huge page faults running, and if the pmd is
* none or trans huge it can change under us. This is
* because MADV_DONTNEED holds the mmap_lock in read
* mode.
*/
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
goto next;
next = zap_pte_range(tlb, vma, pmd, addr, next, details);
next:
cond_resched();
} while (pmd++, addr = next, addr != end);
return addr;
}
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, p4d_t *p4d,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
pud_t *pud;
unsigned long next;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
if (next - addr != HPAGE_PUD_SIZE) {
mmap_assert_locked(tlb->mm);
split_huge_pud(vma, pud, addr);
} else if (zap_huge_pud(tlb, vma, pud, addr))
goto next;
/* fall through */
}
if (pud_none_or_clear_bad(pud))
continue;
next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
cond_resched();
} while (pud++, addr = next, addr != end);
return addr;
}
static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
p4d_t *p4d;
unsigned long next;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d))
continue;
next = zap_pud_range(tlb, vma, p4d, addr, next, details);
} while (p4d++, addr = next, addr != end);
return addr;
}
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
BUG_ON(addr >= end);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
}
static void unmap_single_vma(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr,
struct zap_details *details)
{
unsigned long start = max(vma->vm_start, start_addr);
unsigned long end;
if (start >= vma->vm_end)
return;
end = min(vma->vm_end, end_addr);
if (end <= vma->vm_start)
return;
if (vma->vm_file)
uprobe_munmap(vma, start, end);
if (unlikely(vma->vm_flags & VM_PFNMAP))
untrack_pfn(vma, 0, 0);
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
* It is undesirable to test vma->vm_file as it
* should be non-null for valid hugetlb area.
* However, vm_file will be NULL in the error
* cleanup path of mmap_region. When
* hugetlbfs ->mmap method fails,
* mmap_region() nullifies vma->vm_file
* before calling this function to clean up.
* Since no pte has actually been setup, it is
* safe to do nothing in this case.
*/
if (vma->vm_file) {
i_mmap_lock_write(vma->vm_file->f_mapping);
__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
i_mmap_unlock_write(vma->vm_file->f_mapping);
}
} else
unmap_page_range(tlb, vma, start, end, details);
}
}
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
*
* Unmap all pages in the vma list.
*
* Only addresses between `start' and `end' will be unmapped.
*
* The VMA list must be sorted in ascending virtual address order.
*
* unmap_vmas() assumes that the caller will flush the whole unmapped address
* range after unmap_vmas() returns. So the only responsibility here is to
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
void unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr)
{
struct mmu_notifier_range range;
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
mmu_notifier_invalidate_range_end(&range);
}
/**
* zap_page_range - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
* @start: starting address of pages to zap
* @size: number of bytes to zap
*
* Caller must protect the VMA list
*/
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long size)
{
struct mmu_notifier_range range;
struct mmu_gather tlb;
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
start, start + size);
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
unmap_single_vma(&tlb, vma, start, range.end, NULL);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
/**
* zap_page_range_single - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
* @address: starting address of pages to zap
* @size: number of bytes to zap
* @details: details of shared cache invalidation
*
* The range must fit into one VMA.
*/
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
struct mmu_notifier_range range;
struct mmu_gather tlb;
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address, address + size);
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
unmap_single_vma(&tlb, vma, address, range.end, details);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
/**
* zap_vma_ptes - remove ptes mapping the vma
* @vma: vm_area_struct holding ptes to be zapped
* @address: starting address of pages to zap
* @size: number of bytes to zap
*
* This function only unmaps ptes assigned to VM_PFNMAP vmas.
*
* The entire address range must be fully contained within the vma.
*
*/
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size)
{
if (address < vma->vm_start || address + size > vma->vm_end ||
!(vma->vm_flags & VM_PFNMAP))
return;
zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
p4d = p4d_alloc(mm, pgd, addr);
if (!p4d)
return NULL;
pud = pud_alloc(mm, p4d, addr);
if (!pud)
return NULL;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return NULL;
VM_BUG_ON(pmd_trans_huge(*pmd));
return pmd;
}
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
{
pmd_t *pmd = walk_to_pmd(mm, addr);
if (!pmd)
return NULL;
return pte_alloc_map_lock(mm, pmd, addr, ptl);
}
static int validate_page_before_insert(struct page *page)
{
if (PageAnon(page) || PageSlab(page) || page_has_type(page))
return -EINVAL;
flush_dcache_page(page);
return 0;
}
static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
if (!pte_none(*pte))
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
inc_mm_counter_fast(mm, mm_counter_file(page));
page_add_file_rmap(page, false);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
return 0;
}
/*
* This is the old fallback for page remapping.
*
* For historical reasons, it only allows reserved pages. Only
* old drivers should use this, and they needed to mark their
* pages reserved for the old functions anyway.
*/
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page, pgprot_t prot)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
pte_t *pte;
spinlock_t *ptl;
retval = validate_page_before_insert(page);
if (retval)
goto out;
retval = -ENOMEM;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
goto out;
retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
out:
return retval;
}
#ifdef pte_index
static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
int err;
if (!page_count(page))
return -EINVAL;
err = validate_page_before_insert(page);
if (err)
return err;
return insert_page_into_pte_locked(mm, pte, addr, page, prot);
}
/* insert_pages() amortizes the cost of spinlock operations
* when inserting pages in a loop. Arch *must* define pte_index.
*/
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num, pgprot_t prot)
{
pmd_t *pmd = NULL;
pte_t *start_pte, *pte;
spinlock_t *pte_lock;
struct mm_struct *const mm = vma->vm_mm;
unsigned long curr_page_idx = 0;
unsigned long remaining_pages_total = *num;
unsigned long pages_to_write_in_pmd;
int ret;
more:
ret = -EFAULT;
pmd = walk_to_pmd(mm, addr);
if (!pmd)
goto out;
pages_to_write_in_pmd = min_t(unsigned long,
remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
/* Allocate the PTE if necessary; takes PMD lock once only. */
ret = -ENOMEM;
if (pte_alloc(mm, pmd))
goto out;
while (pages_to_write_in_pmd) {
int pte_idx = 0;
const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
int err = insert_page_in_batch_locked(mm, pte,
addr, pages[curr_page_idx], prot);
if (unlikely(err)) {
pte_unmap_unlock(start_pte, pte_lock);
ret = err;
remaining_pages_total -= pte_idx;
goto out;
}
addr += PAGE_SIZE;
++curr_page_idx;
}
pte_unmap_unlock(start_pte, pte_lock);
pages_to_write_in_pmd -= batch_size;
remaining_pages_total -= batch_size;
}
if (remaining_pages_total)
goto more;
ret = 0;
out:
*num = remaining_pages_total;
return ret;
}
#endif /* ifdef pte_index */
/**
* vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
* @vma: user vma to map to
* @addr: target start user address of these pages
* @pages: source kernel pages
* @num: in: number of pages to map. out: number of pages that were *not*
* mapped. (0 means all pages were successfully mapped).
*
* Preferred over vm_insert_page() when inserting multiple pages.
*
* In case of error, we may have mapped a subset of the provided
* pages. It is the caller's responsibility to account for this case.
*
* The same restrictions apply as in vm_insert_page().
*/
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num)
{
#ifdef pte_index
const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
if (addr < vma->vm_start || end_addr >= vma->vm_end)
return -EFAULT;
if (!(vma->vm_flags & VM_MIXEDMAP)) {
BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
vma->vm_flags |= VM_MIXEDMAP;
}
/* Defer page refcount checking till we're about to map that page. */
return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
#else
unsigned long idx = 0, pgcount = *num;
int err = -EINVAL;
for (; idx < pgcount; ++idx) {
err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
if (err)
break;
}
*num = pgcount - idx;
return err;
#endif /* ifdef pte_index */
}
EXPORT_SYMBOL(vm_insert_pages);
/**
* vm_insert_page - insert single page into user vma
* @vma: user vma to map to
* @addr: target user address of this page
* @page: source kernel page
*
* This allows drivers to insert individual pages they've allocated
* into a user vma.
*
* The page has to be a nice clean _individual_ kernel allocation.
* If you allocate a compound page, you need to have marked it as
* such (__GFP_COMP), or manually just split the page up yourself
* (see split_page()).
*
* NOTE! Traditionally this was done with "remap_pfn_range()" which
* took an arbitrary page protection parameter. This doesn't allow
* that. Your vma protection will have to be set up correctly, which
* means that if you want a shared writable mapping, you'd better
* ask for a shared writable mapping!
*
* The page does not need to be reserved.
*
* Usually this function is called from f_op->mmap() handler
* under mm->mmap_lock write-lock, so it can change vma->vm_flags.
* Caller must set VM_MIXEDMAP on vma if it wants to call this
* function from other places, for example from page-fault handler.
*
* Return: %0 on success, negative error code otherwise.
*/
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
{
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
if (!page_count(page))
return -EINVAL;
if (!(vma->vm_flags & VM_MIXEDMAP)) {
BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
vma->vm_flags |= VM_MIXEDMAP;
}
return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
/*
* __vm_map_pages - maps range of kernel pages into user vma
* @vma: user vma to map to
* @pages: pointer to array of source kernel pages
* @num: number of pages in page array
* @offset: user's requested vm_pgoff
*
* This allows drivers to map range of kernel pages into a user vma.
*
* Return: 0 on success and error code otherwise.
*/
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num, unsigned long offset)
{
unsigned long count = vma_pages(vma);
unsigned long uaddr = vma->vm_start;
int ret, i;
/* Fail if the user requested offset is beyond the end of the object */
if (offset >= num)
return -ENXIO;
/* Fail if the user requested size exceeds available object size */
if (count > num - offset)
return -ENXIO;
for (i = 0; i < count; i++) {
ret = vm_insert_page(vma, uaddr, pages[offset + i]);
if (ret < 0)
return ret;
uaddr += PAGE_SIZE;
}
return 0;
}
/**
* vm_map_pages - maps range of kernel pages starts with non zero offset
* @vma: user vma to map to
* @pages: pointer to array of source kernel pages
* @num: number of pages in page array
*
* Maps an object consisting of @num pages, catering for the user's
* requested vm_pgoff
*
* If we fail to insert any page into the vma, the function will return
* immediately leaving any previously inserted pages present. Callers
* from the mmap handler may immediately return the error as their caller
* will destroy the vma, removing any successfully inserted pages. Other
* callers should make their own arrangements for calling unmap_region().
*
* Context: Process context. Called by mmap handlers.
* Return: 0 on success and error code otherwise.
*/
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num)
{
return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
}
EXPORT_SYMBOL(vm_map_pages);
/**
* vm_map_pages_zero - map range of kernel pages starts with zero offset
* @vma: user vma to map to
* @pages: pointer to array of source kernel pages
* @num: number of pages in page array
*
* Similar to vm_map_pages(), except that it explicitly sets the offset
* to 0. This function is intended for the drivers that did not consider
* vm_pgoff.
*
* Context: Process context. Called by mmap handlers.
* Return: 0 on success and error code otherwise.
*/
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
unsigned long num)
{
return __vm_map_pages(vma, pages, num, 0);
}
EXPORT_SYMBOL(vm_map_pages_zero);
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte, entry;
spinlock_t *ptl;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
return VM_FAULT_OOM;
if (!pte_none(*pte)) {
if (mkwrite) {
/*
* For read faults on private mappings the PFN passed
* in may not match the PFN we have mapped if the
* mapped PFN is a writeable COW page. In the mkwrite
* case we are creating a writable PTE for a shared
* mapping and we expect the PFNs to match. If they
* don't match, we are likely racing with block
* allocation and mapping invalidation so just skip the
* update.
*/
if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
goto out_unlock;
}
entry = pte_mkyoung(*pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, addr, pte, entry, 1))
update_mmu_cache(vma, addr, pte);
}
goto out_unlock;
}
/* Ok, finally just insert the thing.. */
if (pfn_t_devmap(pfn))
entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
else
entry = pte_mkspecial(pfn_t_pte(pfn, prot));
if (mkwrite) {
entry = pte_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
}
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
out_unlock:
pte_unmap_unlock(pte, ptl);
return VM_FAULT_NOPAGE;
}
/**
* vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
* This is exactly like vmf_insert_pfn(), except that it allows drivers
* to override pgprot on a per-page basis.
*
* This only makes sense for IO mappings, and it makes no sense for
* COW mappings. In general, using multiple vmas is preferable;
* vmf_insert_pfn_prot should only be used if using multiple VMAs is
* impractical.
*
* See vmf_insert_mixed_prot() for a discussion of the implication of using
* a value of @pgprot different from that of @vma->vm_page_prot.
*
* Context: Process context. May allocate using %GFP_KERNEL.
* Return: vm_fault_t value.
*/
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot)
{
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
* consistency in testing and feature parity among all, so we should
* try to keep these invariants in place for everybody.
*/
BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
if (!pfn_modify_allowed(pfn, pgprot))
return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);
/**
* vmf_insert_pfn - insert single pfn into user vma
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
*
* Similar to vm_insert_page, this allows drivers to insert individual pages
* they've allocated into a user vma. Same comments apply.
*
* This function should only be called from a vm_ops->fault handler, and
* in that case the handler should return the result of this function.
*
* vma cannot be a COW mapping.
*
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
* Context: Process context. May allocate using %GFP_KERNEL.
* Return: vm_fault_t value.
*/
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn)
{
return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
/* these checks mirror the abort conditions in vm_normal_page */
if (vma->vm_flags & VM_MIXEDMAP)
return true;
if (pfn_t_devmap(pfn))
return true;
if (pfn_t_special(pfn))
return true;
if (is_zero_pfn(pfn_t_to_pfn(pfn)))
return true;
return false;
}
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn, pgprot_t pgprot,
bool mkwrite)
{
int err;
BUG_ON(!vm_mixed_ok(vma, pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, pfn);
if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
return VM_FAULT_SIGBUS;
/*
* If we don't have pte special, then we have to use the pfn_valid()
* based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
* refcount the page if pfn_valid is true (hence insert_page rather
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
!pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
struct page *page;
/*
* At this point we are committed to insert_page()
* regardless of whether the caller specified flags that
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
err = insert_page(vma, addr, page, pgprot);
} else {
return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
if (err == -ENOMEM)
return VM_FAULT_OOM;
if (err < 0 && err != -EBUSY)
return VM_FAULT_SIGBUS;
return VM_FAULT_NOPAGE;
}
/**
* vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
* This is exactly like vmf_insert_mixed(), except that it allows drivers
* to override pgprot on a per-page basis.
*
* Typically this function should be used by drivers to set caching- and
* encryption bits different than those of @vma->vm_page_prot, because
* the caching- or encryption mode may not be known at mmap() time.
* This is ok as long as @vma->vm_page_prot is not used by the core vm
* to set caching and encryption bits for those vmas (except for COW pages).
* This is ensured by core vm only modifying these page table entries using
* functions that don't touch caching- or encryption bits, using pte_modify()
* if needed. (See for example mprotect()).
* Also when new page-table entries are created, this is only done using the
* fault() callback, and never using the value of vma->vm_page_prot,
* except for page-table entries that point to anonymous pages as the result
* of COW.
*
* Context: Process context. May allocate using %GFP_KERNEL.
* Return: vm_fault_t value.
*/
vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t pgprot)
{
return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed_prot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
{
return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);
/*
* If the insertion of PTE failed because someone else already added a
* different entry in the mean time, we treat that as success as we assume
* the same entry was actually inserted.
*/
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn)
{
return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
* in null mappings (currently treated as "copy-on-access")
*/
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
pte_t *pte, *mapped_pte;
spinlock_t *ptl;
int err = 0;
mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
arch_enter_lazy_mmu_mode();
do {
BUG_ON(!pte_none(*pte));
if (!pfn_modify_allowed(pfn, prot)) {
err = -EACCES;
break;
}
set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(mapped_pte, ptl);
return err;
}
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
pmd_t *pmd;
unsigned long next;
int err;
pfn -= addr >> PAGE_SHIFT;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return -ENOMEM;
VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
err = remap_pte_range(mm, pmd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
return err;
} while (pmd++, addr = next, addr != end);
return 0;
}
static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
pud_t *pud;
unsigned long next;
int err;
pfn -= addr >> PAGE_SHIFT;
pud = pud_alloc(mm, p4d, addr);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
err = remap_pmd_range(mm, pud, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
return err;
} while (pud++, addr = next, addr != end);
return 0;
}
static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
p4d_t *p4d;
unsigned long next;
int err;
pfn -= addr >> PAGE_SHIFT;
p4d = p4d_alloc(mm, pgd, addr);
if (!p4d)
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
err = remap_pud_range(mm, p4d, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
return err;
} while (p4d++, addr = next, addr != end);
return 0;
}
/*
* Variant of remap_pfn_range that does not call track_pfn_remap. The caller
* must have pre-validated the caching bits of the pgprot_t.
*/
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
pgd_t *pgd;
unsigned long next;
unsigned long end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
int err;
if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
return -EINVAL;
/*
* Physically remapped pages are special. Tell the
* rest of the world about it:
* VM_IO tells people not to look at these pages
* (accesses can have side effects).
* VM_PFNMAP tells the core MM that the base pages are just
* raw PFN mappings, and do not have a "struct page" associated
* with them.
* VM_DONTEXPAND
* Disable vma merging and expanding with mremap().
* VM_DONTDUMP
* Omit vma from core dump, even when VM_IO turned off.
*
* There's a horrible special case to handle copy-on-write
* behaviour that some programs depend on. We mark the "original"
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
* See vm_normal_page() for details.
*/
if (is_cow_mapping(vma->vm_flags)) {
if (addr != vma->vm_start || end != vma->vm_end)
return -EINVAL;
vma->vm_pgoff = pfn;
}
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
do {
next = pgd_addr_end(addr, end);
err = remap_p4d_range(mm, pgd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
return 0;
}
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
* @addr: target page aligned user address to start at
* @pfn: page frame number of kernel physical memory address
* @size: size of mapping area
* @prot: page protection flags for this mapping
*
* Note: this is only safe if the mm semaphore is held when called.
*
* Return: %0 on success, negative error code otherwise.
*/
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
int err;
err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
if (err)
return -EINVAL;
err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
if (err)
untrack_pfn(vma, pfn, PAGE_ALIGN(size));
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
/**
* vm_iomap_memory - remap memory to userspace
* @vma: user vma to map to
* @start: start of the physical memory to be mapped
* @len: size of area
*
* This is a simplified io_remap_pfn_range() for common driver use. The
* driver just needs to give us the physical memory range to be mapped,
* we'll figure out the rest from the vma information.
*
* NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
* whatever write-combining details or similar.
*
* Return: %0 on success, negative error code otherwise.
*/
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
unsigned long vm_len, pfn, pages;
/* Check that the physical memory area passed in looks valid */
if (start + len < start)
return -EINVAL;
/*
* You *really* shouldn't map things that aren't page-aligned,
* but we've historically allowed it because IO memory might
* just have smaller alignment.
*/
len += start & ~PAGE_MASK;
pfn = start >> PAGE_SHIFT;
pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
if (pfn + pages < pfn)
return -EINVAL;
/* We start the mapping 'vm_pgoff' pages into the area */
if (vma->vm_pgoff > pages)
return -EINVAL;
pfn += vma->vm_pgoff;
pages -= vma->vm_pgoff;
/* Can we fit all of the mapping? */
vm_len = vma->vm_end - vma->vm_start;
if (vm_len >> PAGE_SHIFT > pages)
return -EINVAL;
/* Ok, let it rip */
return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
pte_t *pte, *mapped_pte;
int err = 0;
spinlock_t *ptl;
if (create) {
mapped_pte = pte = (mm == &init_mm) ?
pte_alloc_kernel_track(pmd, addr, mask) :
pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
} else {
mapped_pte = pte = (mm == &init_mm) ?
pte_offset_kernel(pmd, addr) :
pte_offset_map_lock(mm, pmd, addr, &ptl);
}
BUG_ON(pmd_huge(*pmd));
arch_enter_lazy_mmu_mode();
if (fn) {
do {
if (create || !pte_none(*pte)) {
err = fn(pte++, addr, data);
if (err)
break;
}
} while (addr += PAGE_SIZE, addr != end);
}
*mask |= PGTBL_PTE_MODIFIED;
arch_leave_lazy_mmu_mode();
if (mm != &init_mm)
pte_unmap_unlock(mapped_pte, ptl);
return err;
}
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
int err = 0;
BUG_ON(pud_huge(*pud));
if (create) {
pmd = pmd_alloc_track(mm, pud, addr, mask);
if (!pmd)
return -ENOMEM;
} else {
pmd = pmd_offset(pud, addr);
}
do {
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd) && !create)
continue;
if (WARN_ON_ONCE(pmd_leaf(*pmd)))
return -EINVAL;
if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
if (!create)
continue;
pmd_clear_bad(pmd);
}
err = apply_to_pte_range(mm, pmd, addr, next,
fn, data, create, mask);
if (err)
break;
} while (pmd++, addr = next, addr != end);
return err;
}
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
int err = 0;
if (create) {
pud = pud_alloc_track(mm, p4d, addr, mask);
if (!pud)
return -ENOMEM;
} else {
pud = pud_offset(p4d, addr);
}
do {
next = pud_addr_end(addr, end);
if (pud_none(*pud) && !create)
continue;
if (WARN_ON_ONCE(pud_leaf(*pud)))
return -EINVAL;
if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
if (!create)
continue;
pud_clear_bad(pud);
}
err = apply_to_pmd_range(mm, pud, addr, next,
fn, data, create, mask);
if (err)
break;
} while (pud++, addr = next, addr != end);
return err;
}
static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
int err = 0;
if (create) {
p4d = p4d_alloc_track(mm, pgd, addr, mask);
if (!p4d)
return -ENOMEM;
} else {
p4d = p4d_offset(pgd, addr);
}
do {
next = p4d_addr_end(addr, end);
if (p4d_none(*p4d) && !create)
continue;
if (WARN_ON_ONCE(p4d_leaf(*p4d)))
return -EINVAL;
if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
if (!create)
continue;
p4d_clear_bad(p4d);
}
err = apply_to_pud_range(mm, p4d, addr, next,
fn, data, create, mask);
if (err)
break;
} while (p4d++, addr = next, addr != end);
return err;
}
static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
unsigned long size, pte_fn_t fn,
void *data, bool create)
{
pgd_t *pgd;
unsigned long start = addr, next;
unsigned long end = addr + size;
pgtbl_mod_mask mask = 0;
int err = 0;
if (WARN_ON(addr >= end))
return -EINVAL;
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none(*pgd) && !create)
continue;
if (WARN_ON_ONCE(pgd_leaf(*pgd)))
return -EINVAL;
if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
if (!create)
continue;
pgd_clear_bad(pgd);
}
err = apply_to_p4d_range(mm, pgd, addr, next,
fn, data, create, &mask);
if (err)
break;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, start + size);
return err;
}
/*
* Scan a region of virtual memory, filling in page tables as necessary
* and calling a provided function on each leaf page table.
*/
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
unsigned long size, pte_fn_t fn, void *data)
{
return __apply_to_page_range(mm, addr, size, fn, data, true);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
/*
* Scan a region of virtual memory, calling a provided function on
* each leaf page table where it exists.
*
* Unlike apply_to_page_range, this does _not_ fill in page tables
* where they are absent.
*/
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
unsigned long size, pte_fn_t fn, void *data)
{
return __apply_to_page_range(mm, addr, size, fn, data, false);
}
EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
/*
* handle_pte_fault chooses page fault handler according to an entry which was
* read non-atomically. Before making any commitment, on those architectures
* or configurations (e.g. i386 with PAE) which might give a mix of unmatched
* parts, do_swap_page must check under lock before unmapping the pte and
* proceeding (but do_wp_page is only called after already making such a check;
* and do_anonymous_page can safely check later on).
*/
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
pte_t *page_table, pte_t orig_pte)
{
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
if (sizeof(pte_t) > sizeof(unsigned long)) {
spinlock_t *ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
same = pte_same(*page_table, orig_pte);
spin_unlock(ptl);
}
#endif
pte_unmap(page_table);
return same;
}
static inline bool cow_user_page(struct page *dst, struct page *src,
struct vm_fault *vmf)
{
bool ret;
void *kaddr;
void __user *uaddr;
bool locked = false;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = vmf->address; if (likely(src)) {
copy_user_highpage(dst, src, addr, vma);
return true;
}
/*
* If the source page was a PFN mapping, we don't have
* a "struct page" for it. We do a best-effort copy by
* just copying from the original user address. If that
* fails, we just zero-fill it. Live with it.
*/
kaddr = kmap_atomic(dst);
uaddr = (void __user *)(addr & PAGE_MASK);
/*
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
locked = true;
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
/*
* Other thread has already handled the fault
* and update local tlb only
*/
update_mmu_tlb(vma, addr, vmf->pte);
ret = false;
goto pte_unlock;
}
entry = pte_mkyoung(vmf->orig_pte);
if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
update_mmu_cache(vma, addr, vmf->pte);
}
/*
* This really shouldn't fail, because the page is there
* in the page tables. But it might just be unreadable,
* in which case we just give up and fill the result with
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
if (locked)
goto warn;
/* Re-validate under PTL if the page is still mapped */
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
locked = true;
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
/* The PTE changed under us, update local tlb */
update_mmu_tlb(vma, addr, vmf->pte);
ret = false;
goto pte_unlock;
}
/*
* The same page can be mapped back since last copy attempt.
* Try to copy again under PTL.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
/*
* Give a warn in case there can be some obscure
* use-case
*/
warn:
WARN_ON_ONCE(1);
clear_page(kaddr);
}
}
ret = true;
pte_unlock:
if (locked)
pte_unmap_unlock(vmf->pte, vmf->ptl);
kunmap_atomic(kaddr);
flush_dcache_page(dst);
return ret;
}
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
struct file *vm_file = vma->vm_file;
if (vm_file)
return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
/*
* Special mappings (e.g. VDSO) do not have any file so fake
* a default GFP_KERNEL for them.
*/
return GFP_KERNEL;
}
/*
* Notify the address space that the page is about to become writable so that
* it can prohibit this or wait for the page to get into an appropriate state.
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
{
vm_fault_t ret;
struct page *page = vmf->page;
unsigned int old_flags = vmf->flags;
vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
if (vmf->vma->vm_file &&
IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
return VM_FAULT_SIGBUS;
ret = vmf->vma->vm_ops->page_mkwrite(vmf);
/* Restore original flags so that caller is not surprised */
vmf->flags = old_flags;
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
lock_page(page);
if (!page->mapping) {
unlock_page(page);
return 0; /* retry */
}
ret |= VM_FAULT_LOCKED;
} else
VM_BUG_ON_PAGE(!PageLocked(page), page);
return ret;
}
/*
* Handle dirtying of a page in shared file mapping on a write fault.
*
* The function expects the page to be locked and unlocks it.
*/
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping;
struct page *page = vmf->page;
bool dirtied;
bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
dirtied = set_page_dirty(page);
VM_BUG_ON_PAGE(PageAnon(page), page);
/*
* Take a local copy of the address_space - page.mapping may be zeroed
* by truncate after unlock_page(). The address_space itself remains
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
mapping = page_rmapping(page);
unlock_page(page);
if (!page_mkwrite)
file_update_time(vma->vm_file);
/*
* Throttle page dirtying rate down to writeback speed.
*
* mapping may be NULL here because some device drivers do not
* set page.mapping but still dirty their pages
*
* Drop the mmap_lock before waiting on IO, if we can. The file
* is pinning the mapping, as per above.
*/
if ((dirtied || page_mkwrite) && mapping) {
struct file *fpin;
fpin = maybe_unlock_mmap_for_io(vmf, NULL);
balance_dirty_pages_ratelimited(mapping);
if (fpin) {
fput(fpin);
return VM_FAULT_RETRY;
}
}
return 0;
}
/*
* Handle write page faults for pages that can be reused in the current vma
*
* This can happen either due to the mapping being with the VM_SHARED flag,
* or due to us being the last reference standing to the page. In either
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
static inline void wp_page_reuse(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
pte_t entry;
/*
* Clear the pages cpupid information as the existing
* information potentially belongs to a now completely
* unrelated process.
*/
if (page)
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
count_vm_event(PGREUSE);
}
/*
* Handle the case of a page which we actually need to copy to a new page.
*
* Called with mmap_lock locked and the old page referenced, but
* without the ptl held.
*
* High level logic flow:
*
* - Allocate a page, copy the content of the old page to the new one.
* - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
* - Take the PTL. If the pte changed, bail out and release the allocated page
* - If the pte is still the way we remember it, update the page table and all
* relevant references. This includes dropping the reference the page-table
* held to the old page, as well as updating the rmap.
* - In any case, unlock the PTL and drop the reference we took to the old page.
*/
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
struct page *old_page = vmf->page;
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
struct mmu_notifier_range range;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
new_page = alloc_zeroed_user_highpage_movable(vma,
vmf->address);
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
if (!new_page)
goto oom;
if (!cow_user_page(new_page, old_page, vmf)) {
/*
* COW failed, if the fault was solved by other,
* it's fine. If not, userspace would re-fault on
* the same address and we will handle the fault
* from the second attempt.
*/
put_page(new_page);
if (old_page)
put_page(old_page);
return 0;
}
}
if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
cgroup_throttle_swaprate(new_page, GFP_KERNEL);
__SetPageUptodate(new_page);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
/*
* Re-check the pte - we dropped the lock
*/
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm,
mm_counter_file(old_page));
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
} else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
* Clear the pte entry and flush it first, before updating the
* pte with the new entry, to keep TLBs on different CPUs in
* sync. This code used to set the new PTE then flush TLBs, but
* that left a window where the new PTE could be loaded into
* some TLBs while the old PTE remains in others.
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
lru_cache_add_inactive_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
update_mmu_cache(vma, vmf->address, vmf->pte);
if (old_page) {
/*
* Only after switching the pte to the new page may
* we remove the mapcount here. Otherwise another
* process may come and find the rmap count decremented
* before the pte is switched to the new page, and
* "reuse" the old page writing into it while our pte
* here still points into it and can be read by other
* threads.
*
* The critical issue is to order this
* page_remove_rmap with the ptp_clear_flush above.
* Those stores are ordered by (if nothing else,)
* the barrier present in the atomic_add_negative
* in page_remove_rmap.
*
* Then the TLB flush in ptep_clear_flush ensures that
* no process can access the old page before the
* decremented mapcount is visible. And the old page
* cannot be reused until after the decremented
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
page_remove_rmap(old_page, false);
}
/* Free the old page.. */
new_page = old_page;
page_copied = 1;
} else {
update_mmu_tlb(vma, vmf->address, vmf->pte);
}
if (new_page)
put_page(new_page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
/*
* No need to double call mmu_notifier->invalidate_range() callback as
* the above ptep_clear_flush_notify() did already call it.
*/
mmu_notifier_invalidate_range_only_end(&range);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if (page_copied && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
if (PageMlocked(old_page))
munlock_vma_page(old_page); unlock_page(old_page);
}
if (page_copied)
free_swap_cache(old_page);
put_page(old_page);
}
return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
put_page(new_page);
oom:
if (old_page)
put_page(old_page);
return VM_FAULT_OOM;
}
/**
* finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
* writeable once the page is prepared
*
* @vmf: structure describing the fault
*
* This function handles all that is needed to finish a write page fault in a
* shared mapping due to PTE being read-only once the mapped page is prepared.
* It handles locking of PTE and modifying it.
*
* The function expects the page to be locked or other protection against
* concurrent faults / writeback (such as DAX radix tree locks).
*
* Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
* we acquired PTE lock.
*/
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
/*
* We might have raced with another page fault while we released the
* pte_offset_map_lock.
*/
if (!pte_same(*vmf->pte, vmf->orig_pte)) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return VM_FAULT_NOPAGE;
}
wp_page_reuse(vmf);
return 0;
}
/*
* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
* mapping
*/
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
vm_fault_t ret;
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vmf);
if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
return finish_mkwrite_fault(vmf);
}
wp_page_reuse(vmf);
return VM_FAULT_WRITE;
}
static vm_fault_t wp_page_shared(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret = VM_FAULT_WRITE;
get_page(vmf->page);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
vm_fault_t tmp;
pte_unmap_unlock(vmf->pte, vmf->ptl);
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
put_page(vmf->page);
return tmp;
}
tmp = finish_mkwrite_fault(vmf);
if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
unlock_page(vmf->page);
put_page(vmf->page);
return tmp;
}
} else {
wp_page_reuse(vmf);
lock_page(vmf->page);
}
ret |= fault_dirty_shared_page(vmf);
put_page(vmf->page);
return ret;
}
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
*
* Note that this routine assumes that the protection checks have been
* done by the caller (the low-level page fault routine in most cases).
* Thus we can safely just mark it writable once we've done any necessary
* COW.
*
* We also mark the page dirty at this point even though the page will
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_lock still held, but pte unmapped and unlocked.
*/
static vm_fault_t do_wp_page(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
if (userfaultfd_pte_wp(vma, *vmf->pte)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_WP);
}
/*
* Userfaultfd write-protect can defer flushes. Ensure the TLB
* is flushed in this case before copying.
*/
if (unlikely(userfaultfd_wp(vmf->vma) &&
mm_tlb_flush_pending(vmf->vma->vm_mm)))
flush_tlb_page(vmf->vma, vmf->address);
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
if (!vmf->page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
return wp_pfn_shared(vmf);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return wp_page_copy(vmf);
}
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
if (PageAnon(vmf->page)) {
struct page *page = vmf->page;
/* PageKsm() doesn't necessarily raise the page refcount */
if (PageKsm(page) || page_count(page) != 1)
goto copy;
if (!trylock_page(page))
goto copy;
if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) { unlock_page(page);
goto copy;
}
/*
* Ok, we've got the only map reference, and the only
* page count reference, and the page is locked,
* it's dark out, and we're wearing sunglasses. Hit it.
*/
unlock_page(page);
wp_page_reuse(vmf);
return VM_FAULT_WRITE;
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
}
copy:
/*
* Ok, we need to copy. Oh, well..
*/
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return wp_page_copy(vmf);
}
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
{
zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
struct zap_details *details)
{
struct vm_area_struct *vma;
pgoff_t vba, vea, zba, zea;
vma_interval_tree_foreach(vma, root,
details->first_index, details->last_index) {
vba = vma->vm_pgoff;
vea = vba + vma_pages(vma) - 1;
zba = details->first_index;
if (zba < vba)
zba = vba;
zea = details->last_index;
if (zea > vea)
zea = vea;
unmap_mapping_range_vma(vma,
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
details);
}
}
/**
* unmap_mapping_page() - Unmap single page from processes.
* @page: The locked page to be unmapped.
*
* Unmap this page from any userspace process which still has it mmaped.
* Typically, for efficiency, the range of nearby pages has already been
* unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
* truncation or invalidation holds the lock on a page, it may find that
* the page has been remapped again: and then uses unmap_mapping_page()
* to unmap it finally.
*/
void unmap_mapping_page(struct page *page)
{
struct address_space *mapping = page->mapping;
struct zap_details details = { };
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageTail(page));
details.check_mapping = mapping;
details.first_index = page->index;
details.last_index = page->index + thp_nr_pages(page) - 1;
details.single_page = page;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
i_mmap_unlock_write(mapping);
}
/**
* unmap_mapping_pages() - Unmap pages from processes.
* @mapping: The address space containing pages to be unmapped.
* @start: Index of first page to be unmapped.
* @nr: Number of pages to be unmapped. 0 to unmap to end of file.
* @even_cows: Whether to unmap even private COWed pages.
*
* Unmap the pages in this address space from any userspace process which
* has them mmaped. Generally, you want to remove COWed pages as well when
* a file is being truncated, but not when invalidating pages from the page
* cache.
*/
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
pgoff_t nr, bool even_cows)
{
struct zap_details details = { }; details.check_mapping = even_cows ? NULL : mapping;
details.first_index = start;
details.last_index = start + nr - 1;
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
i_mmap_unlock_write(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);
/**
* unmap_mapping_range - unmap the portion of all mmaps in the specified
* address_space corresponding to the specified byte range in the underlying
* file.
*
* @mapping: the address space containing mmaps to be unmapped.
* @holebegin: byte in first page to unmap, relative to the start of
* the underlying file. This will be rounded down to a PAGE_SIZE
* boundary. Note that this is different from truncate_pagecache(), which
* must keep the partial page. In contrast, we must get rid of
* partial pages.
* @holelen: size of prospective hole in bytes. This will be rounded
* up to a PAGE_SIZE boundary. A holelen of zero truncates to the
* end of the file.
* @even_cows: 1 when truncating a file, unmap even private COWed pages;
* but 0 when invalidating pagecache, don't throw away private data.
*/
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows)
{
pgoff_t hba = holebegin >> PAGE_SHIFT;
pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
/* Check for overflow. */
if (sizeof(holelen) > sizeof(hlen)) {
long long holeend =
(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (holeend & ~(long long)ULONG_MAX)
hlen = ULONG_MAX - hba + 1;
}
unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);
/*
* Restore a potential device exclusive pte to a working pte entry
*/
static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct vm_area_struct *vma = vmf->vma;
struct mmu_notifier_range range;
if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
return VM_FAULT_RETRY;
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
vma->vm_mm, vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
mmu_notifier_invalidate_range_start(&range);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
unlock_page(page);
mmu_notifier_invalidate_range_end(&range);
return 0;
}
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
*
* We return with the mmap_lock locked or unlocked in the same cases
* as does filemap_fault().
*/
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
struct swap_info_struct *si = NULL;
swp_entry_t entry;
pte_t pte;
int locked;
int exclusive = 0;
vm_fault_t ret = 0;
void *shadow = NULL;
if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
} else if (is_device_exclusive_entry(entry)) {
vmf->page = pfn_swap_entry_to_page(entry);
ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
vmf->page = pfn_swap_entry_to_page(entry);
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
}
goto out;
}
/* Prevent swapoff from happening to us. */
si = get_swap_device(entry);
if (unlikely(!si))
goto out;
delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry, vma, vmf->address);
swapcache = page;
if (!page) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
/* skip swapcache */
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
if (page) {
__SetPageLocked(page);
__SetPageSwapBacked(page);
if (mem_cgroup_swapin_charge_page(page,
vma->vm_mm, GFP_KERNEL, entry)) {
ret = VM_FAULT_OOM;
goto out_page;
}
mem_cgroup_swapin_uncharge_swap(entry);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
workingset_refault(page, shadow);
lru_cache_add(page);
/* To provide entry to swap_readpage() */
set_page_private(page, entry.val);
swap_readpage(page, true);
set_page_private(page, 0);
}
} else {
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
vmf);
swapcache = page;
}
if (!page) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
*/
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
goto unlock;
}
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
} else if (PageHWPoison(page)) {
/*
* hwpoisoned dirty swapcache pages are kept for killing
* owner processes (which may be unknown at hwpoison time)
*/
ret = VM_FAULT_HWPOISON;
delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
goto out_release;
}
locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
if (!locked) {
ret |= VM_FAULT_RETRY;
goto out_release;
}
/*
* Make sure try_to_free_swap or reuse_swap_page or swapoff did not
* release the swapcache from under us. The page pin, and pte_same
* test below, are not enough to exclude that. Even if it is still
* swapcache, we need to check that the page's swap has not changed.
*/
if (unlikely((!PageSwapCache(page) ||
page_private(page) != entry.val)) && swapcache)
goto out_page;
page = ksm_might_need_to_copy(page, vma, vmf->address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
page = swapcache;
goto out_page;
}
cgroup_throttle_swaprate(page, GFP_KERNEL);
/*
* Back out if somebody else already faulted in this pte.
*/
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
goto out_nomap;
if (unlikely(!PageUptodate(page))) {
ret = VM_FAULT_SIGBUS;
goto out_nomap;
}
/*
* The page isn't present yet, go ahead with the fault.
*
* Be careful about the sequence of operations here.
* To get its accounting right, reuse_swap_page() must be called
* while the page is counted on swap but not yet in mapcount i.e.
* before page_add_anon_rmap() and swap_free(); try_to_free_swap()
* must be called after the swap_free(), or it will never succeed.
*/
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
if (pte_swp_uffd_wp(vmf->orig_pte)) {
pte = pte_mkuffd_wp(pte);
pte = pte_wrprotect(pte);
}
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
}
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
(vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (page != swapcache && swapcache) {
/*
* Hold the lock to avoid the swap entry to be reused
* until we take the PT lock for the pte_same() check
* (to avoid false positives from pte_same). For
* further safety release the lock after the swap_free
* so that the swap count won't change under a
* parallel locked swapcache.
*/
unlock_page(swapcache);
put_page(swapcache);
}
if (vmf->flags & FAULT_FLAG_WRITE) {
ret |= do_wp_page(vmf);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
}
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
if (si)
put_swap_device(si);
return ret;
out_nomap:
pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
unlock_page(page);
out_release:
put_page(page);
if (page != swapcache && swapcache) {
unlock_page(swapcache);
put_page(swapcache);
}
if (si)
put_swap_device(si);
return ret;
}
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_lock still held, but pte unmapped and unlocked.
*/
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page;
vm_fault_t ret = 0;
pte_t entry;
/* File mapping without ->vm_ops ? */
if (vma->vm_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
/*
* Use pte_alloc() instead of pte_alloc_map(). We can't run
* pte_offset_map() on pmds where a huge pmd might be created
* from a different thread.
*
* pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
* parallel threads are excluded by other means.
*
* Here we only have mmap_read_lock(mm).
*/
if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
/* See comment in handle_pte_fault() */
if (unlikely(pmd_trans_unstable(vmf->pmd)))
return 0;
/* Use the zero-page for reads */
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
vma->vm_page_prot));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
if (!pte_none(*vmf->pte)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
}
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
goto setpte;
}
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (!page)
goto oom;
if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
cgroup_throttle_swaprate(page, GFP_KERNEL);
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
__SetPageUptodate(page);
entry = mk_pte(page, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (!pte_none(*vmf->pte)) {
update_mmu_cache(vma, vmf->address, vmf->pte);
goto release;
}
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
put_page(page);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false);
lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
release:
put_page(page);
goto unlock;
oom_free_page:
put_page(page);
oom:
return VM_FAULT_OOM;
}
/*
* The mmap_lock must have been held on entry, and may have been
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
/*
* Preallocate pte before we take page_lock because this might lead to
* deadlocks for memcg reclaim which waits for pages under writeback:
* lock_page(A)
* SetPageWriteback(A)
* unlock_page(A)
* lock_page(B)
* lock_page(B)
* pte_alloc_one
* shrink_page_list
* wait_on_page_writeback(A)
* SetPageWriteback(B)
* unlock_page(B)
* # flush A, B to clear the writeback
*/
if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
return ret;
if (unlikely(PageHWPoison(vmf->page))) {
struct page *page = vmf->page;
vm_fault_t poisonret = VM_FAULT_HWPOISON;
if (ret & VM_FAULT_LOCKED) {
if (page_mapped(page))
unmap_mapping_pages(page_mapping(page),
page->index, 1, false);
/* Retry if a clean page was removed from the cache. */
if (invalidate_inode_page(page))
poisonret = VM_FAULT_NOPAGE;
unlock_page(page);
}
put_page(page);
vmf->page = NULL;
return poisonret;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf->page);
else
VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
return ret;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
/*
* We are going to consume the prealloc table,
* count that as nr_ptes.
*/
mm_inc_nr_ptes(vma->vm_mm);
vmf->prealloc_pte = NULL;
}
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i;
vm_fault_t ret = VM_FAULT_FALLBACK;
if (!transhuge_vma_suitable(vma, haddr))
return ret;
page = compound_head(page);
if (compound_order(page) != HPAGE_PMD_ORDER)
return ret;
/*
* Just backoff if any subpage of a THP is corrupted otherwise
* the corrupted page may mapped by PMD silently to escape the
* check. This kind of THP just can be PTE mapped. Access to
* the corrupted subpage should trigger SIGBUS as expected.
*/
if (unlikely(PageHasHWPoisoned(page)))
return ret;
/*
* Archs like ppc64 need additional space to store information
* related to pte entry. Use the preallocated table for that.
*/
if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd)))
goto out;
for (i = 0; i < HPAGE_PMD_NR; i++)
flush_icache_page(vma, page + i);
entry = mk_huge_pmd(page, vma->vm_page_prot);
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
page_add_file_rmap(page, true);
/*
* deposit and withdraw with pmd lock held
*/
if (arch_needs_pgtable_deposit())
deposit_prealloc_pte(vmf);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
update_mmu_cache_pmd(vma, haddr, vmf->pmd);
/* fault is handled */
ret = 0;
count_vm_event(THP_FILE_MAPPED);
out:
spin_unlock(vmf->ptl);
return ret;
}
#else
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
return VM_FAULT_FALLBACK;
}
#endif
void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool prefault = vmf->address != addr;
pte_t entry;
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
if (prefault && arch_wants_old_prefaulted_pte())
entry = pte_mkold(entry);
else
entry = pte_sw_mkyoung(entry);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false);
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false);
}
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
}
/**
* finish_fault - finish page fault once we have prepared the page to fault
*
* @vmf: structure describing the fault
*
* This function handles all that is needed to finish a page fault once the
* page to fault in is prepared. It handles locking of PTEs, inserts PTE for
* given page, adds reverse page mapping, handles memcg charges and LRU
* addition.
*
* The function expects the page to be locked and on success it consumes a
* reference of a page being mapped (for the PTE which maps it).
*
* Return: %0 on success, %VM_FAULT_ code in case of error.
*/
vm_fault_t finish_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page;
vm_fault_t ret;
/* Did we COW the page? */
if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
page = vmf->cow_page;
else
page = vmf->page;
/*
* check even for read faults because we might have lost our CoWed
* page
*/
if (!(vma->vm_flags & VM_SHARED)) {
ret = check_stable_address_space(vma->vm_mm);
if (ret)
return ret;
}
if (pmd_none(*vmf->pmd)) {
if (PageTransCompound(page)) {
ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
}
if (vmf->prealloc_pte) {
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (likely(pmd_none(*vmf->pmd))) {
mm_inc_nr_ptes(vma->vm_mm);
pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
spin_unlock(vmf->ptl);
} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
return VM_FAULT_OOM;
}
}
/* See comment in handle_pte_fault() */
if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
ret = 0;
/* Re-check under ptl */
if (likely(pte_none(*vmf->pte)))
do_set_pte(vmf, page, vmf->address);
else
ret = VM_FAULT_NOPAGE;
update_mmu_tlb(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
static unsigned long fault_around_bytes __read_mostly =
rounddown_pow_of_two(65536);
#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
{
*val = fault_around_bytes;
return 0;
}
/*
* fault_around_bytes must be rounded down to the nearest page order as it's
* what do_fault_around() expects to see.
*/
static int fault_around_bytes_set(void *data, u64 val)
{
if (val / PAGE_SIZE > PTRS_PER_PTE)
return -EINVAL;
if (val > PAGE_SIZE)
fault_around_bytes = rounddown_pow_of_two(val);
else
fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
static int __init fault_around_debugfs(void)
{
debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
&fault_around_bytes_fops);
return 0;
}
late_initcall(fault_around_debugfs);
#endif
/*
* do_fault_around() tries to map few pages around the fault address. The hope
* is that the pages will be needed soon and this will lower the number of
* faults to handle.
*
* It uses vm_ops->map_pages() to map the pages, which skips the page if it's
* not ready to be mapped: not up-to-date, locked, etc.
*
* This function is called with the page table lock taken. In the split ptlock
* case the page table lock only protects only those entries which belong to
* the page table corresponding to the fault address.
*
* This function doesn't cross the VMA boundaries, in order to call map_pages()
* only once.
*
* fault_around_bytes defines how many bytes we'll try to map.
* do_fault_around() expects it to be set to a power of two less than or equal
* to PTRS_PER_PTE.
*
* The virtual address of the area that we map is naturally aligned to
* fault_around_bytes rounded down to the machine page size
* (and therefore to page order). This way it's easier to guarantee
* that we don't cross page table boundaries.
*/
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
unsigned long address = vmf->address, nr_pages, mask;
pgoff_t start_pgoff = vmf->pgoff;
pgoff_t end_pgoff;
int off;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
address = max(address & mask, vmf->vma->vm_start);
off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
start_pgoff -= off;
/*
* end_pgoff is either the end of the page table, the end of
* the vma or nr_pages from start_pgoff, depending what is nearest.
*/
end_pgoff = start_pgoff -
((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
start_pgoff + nr_pages - 1);
if (pmd_none(*vmf->pmd)) {
vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
}
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret = 0;
/*
* Let's call ->map_pages() first and use ->fault() as fallback
* if page by the offset is not ready to be mapped (cold cache or
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
if (likely(!userfaultfd_minor(vmf->vma))) {
ret = do_fault_around(vmf);
if (ret)
return ret;
}
}
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
ret |= finish_fault(vmf);
unlock_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
put_page(vmf->page);
return ret;
}
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
if (!vmf->cow_page)
return VM_FAULT_OOM;
if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
if (ret & VM_FAULT_DONE_COW)
return ret;
copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
__SetPageUptodate(vmf->cow_page);
ret |= finish_fault(vmf);
unlock_page(vmf->page);
put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
return ret;
uncharge_out:
put_page(vmf->cow_page);
return ret;
}
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret, tmp;
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
/*
* Check if the backing address space wants to know that the page is
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) { unlock_page(vmf->page);
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
put_page(vmf->page);
return tmp;
}
}
ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
unlock_page(vmf->page);
put_page(vmf->page);
return ret;
}
ret |= fault_dirty_shared_page(vmf);
return ret;
}
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults).
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
* If mmap_lock is released, vma may become invalid (for example
* by other thread calling munmap()).
*/
static vm_fault_t do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *vm_mm = vma->vm_mm;
vm_fault_t ret;
/*
* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
*/
if (!vma->vm_ops->fault) {
/*
* If we find a migration pmd entry or a none pmd entry, which
* should never happen, return SIGBUS
*/
if (unlikely(!pmd_present(*vmf->pmd)))
ret = VM_FAULT_SIGBUS;
else {
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
vmf->pmd,
vmf->address,
&vmf->ptl);
/*
* Make sure this is not a temporary clearing of pte
* by holding ptl and checking again. A R/M/W update
* of pte involves: take ptl, clearing the pte so that
* we don't have concurrent modification by hardware
* followed by an update.
*/
if (unlikely(pte_none(*vmf->pte)))
ret = VM_FAULT_SIGBUS;
else
ret = VM_FAULT_NOPAGE;
pte_unmap_unlock(vmf->pte, vmf->ptl);
}
} else if (!(vmf->flags & FAULT_FLAG_WRITE))
ret = do_read_fault(vmf);
else if (!(vma->vm_flags & VM_SHARED))
ret = do_cow_fault(vmf);
else
ret = do_shared_fault(vmf);
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
return ret;
}
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags)
{
get_page(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
if (page_nid == numa_node_id()) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
*flags |= TNF_FAULT_LOCAL;
}
return mpol_misplaced(page, vma, addr);
}
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
int page_nid = NUMA_NO_NODE;
int last_cpupid;
int target_nid;
pte_t pte, old_pte;
bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
/*
* The "pte" at this point cannot be used safely without
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be screwed if the read is non atomic.
*/
vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
/* Get the normal PTE */
old_pte = ptep_get(vmf->pte);
pte = pte_modify(old_pte, vma->vm_page_prot);
page = vm_normal_page(vma, vmf->address, pte);
if (!page)
goto out_map;
/* TODO: handle PTE-mapped THP */
if (PageCompound(page))
goto out_map;
/*
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
* much anyway since they can be in shared cache state. This misses
* the case where a mapping is writable but the process never writes
* to it but pte_write gets cleared during protection updates and
* pte_dirty has unpredictable behaviour between PTE scan updates,
* background writeback, dirty balancing and application behaviour.
*/
if (!was_writable)
flags |= TNF_NO_GROUP;
/*
* Flag if the page is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
flags |= TNF_SHARED;
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
if (target_nid == NUMA_NO_NODE) {
put_page(page);
goto out_map;
}
pte_unmap_unlock(vmf->pte, vmf->ptl);
/* Migrate to the requested node */
if (migrate_misplaced_page(page, vma, target_nid)) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
} else {
flags |= TNF_MIGRATE_FAIL;
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
spin_lock(vmf->ptl);
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
goto out_map;
}
out:
if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
out_map:
/*
* Make it present again, depending on how arch implements
* non-accessible ptes, some can allow access by kernel mode.
*/
old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
pte = pte_modify(old_pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_anonymous_page(vmf);
if (vmf->vma->vm_ops->huge_fault)
return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
return VM_FAULT_FALLBACK;
}
/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
if (vma_is_anonymous(vmf->vma)) {
if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf);
}
if (vmf->vma->vm_ops->huge_fault) {
vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
}
/* COW or write-notify handled on pte level: split pmd. */
__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
goto split;
if (vmf->vma->vm_ops->huge_fault) {
vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
}
split:
/* COW or write-notify not handled on PUD level: split pud.*/
__split_huge_pud(vmf->vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return VM_FAULT_FALLBACK;
}
static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
return VM_FAULT_FALLBACK;
if (vmf->vma->vm_ops->huge_fault)
return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return VM_FAULT_FALLBACK;
}
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
*
* There is also a hook called "update_mmu_cache()" that architectures
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
* We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
* concurrent faults).
*
* The mmap_lock may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
*/
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
if (unlikely(pmd_none(*vmf->pmd))) {
/*
* Leave __pte_alloc() until later: because vm_ops->fault may
* want to allocate huge page, and if we expose page table
* for an instant, it will be difficult to retract from
* concurrent faults and from rmap lookups.
*/
vmf->pte = NULL;
} else {
/*
* If a huge pmd materialized under us just retry later. Use
* pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
* of pmd_trans_huge() to ensure the pmd didn't become
* pmd_trans_huge under us and then back to pmd_none, as a
* result of MADV_DONTNEED running immediately after a huge pmd
* fault in a different thread of this mm, in turn leading to a
* misleading pmd_trans_huge() retval. All we have to ensure is
* that it is a regular pmd that we can walk with
* pte_offset_map() and we can do that through an atomic read
* in C, which is what pmd_trans_unstable() provides.
*/
if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
* pmd from under us anymore at this point because we hold the
* mmap_lock read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
*/
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
vmf->orig_pte = *vmf->pte;
/*
* some architectures can have larger ptes than wordsize,
* e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
* CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
* accesses. The code below just needs a consistent view
* for the ifs and we later double check anyway with the
* ptl lock held. So here a barrier will do.
*/
barrier();
if (pte_none(vmf->orig_pte)) {
pte_unmap(vmf->pte);
vmf->pte = NULL;
}
}
if (!vmf->pte) { if (vma_is_anonymous(vmf->vma))
return do_anonymous_page(vmf);
else
return do_fault(vmf);
}
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
entry = vmf->orig_pte;
if (unlikely(!pte_same(*vmf->pte, entry))) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
goto unlock;
}
if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(vmf);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
vmf->flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
} else {
/* Skip spurious TLB flush for retried page fault */
if (vmf->flags & FAULT_FLAG_TRIED)
goto unlock;
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (vmf->flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
}
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
/*
* By the time we get here, we already hold the mm semaphore
*
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
struct vm_fault vmf = {
.vma = vma,
.address = address & PAGE_MASK,
.flags = flags,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
};
unsigned int dirty = flags & FAULT_FLAG_WRITE;
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
vm_fault_t ret;
pgd = pgd_offset(mm, address);
p4d = p4d_alloc(mm, pgd, address);
if (!p4d) return VM_FAULT_OOM;
vmf.pud = pud_alloc(mm, p4d, address);
if (!vmf.pud)
return VM_FAULT_OOM;
retry_pud:
if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
pud_t orig_pud = *vmf.pud;
barrier();
if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
/* NUMA case for anonymous PUDs would go here */
if (dirty && !pud_write(orig_pud)) {
ret = wp_huge_pud(&vmf, orig_pud);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pud_set_accessed(&vmf, orig_pud);
return 0;
}
}
}
vmf.pmd = pmd_alloc(mm, vmf.pud, address);
if (!vmf.pmd)
return VM_FAULT_OOM;
/* Huge pud page fault raced with pmd_alloc? */
if (pud_trans_unstable(vmf.pud))
goto retry_pud;
if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
vmf.orig_pmd = *vmf.pmd;
barrier();
if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
!is_pmd_migration_entry(vmf.orig_pmd));
if (is_pmd_migration_entry(vmf.orig_pmd))
pmd_migration_entry_wait(mm, vmf.pmd);
return 0;
}
if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf);
if (dirty && !pmd_write(vmf.orig_pmd)) {
ret = wp_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pmd_set_accessed(&vmf);
return 0;
}
}
}
return handle_pte_fault(&vmf);
}
/**
* mm_account_fault - Do page fault accounting
*
* @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
* of perf event counters, but we'll still do the per-task accounting to
* the task who triggered this page fault.
* @address: the faulted address.
* @flags: the fault flags.
* @ret: the fault retcode.
*
* This will take care of most of the page fault accounting. Meanwhile, it
* will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
* updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
* still be in per-arch page fault handlers at the entry of page fault.
*/
static inline void mm_account_fault(struct pt_regs *regs,
unsigned long address, unsigned int flags,
vm_fault_t ret)
{
bool major;
/*
* We don't do accounting for some specific faults:
*
* - Unsuccessful faults (e.g. when the address wasn't valid). That
* includes arch_vma_access_permitted() failing before reaching here.
* So this is not a "this many hardware page faults" counter. We
* should use the hw profiling for that.
*
* - Incomplete faults (VM_FAULT_RETRY). They will only be counted
* once they're completed.
*/
if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
return;
/*
* We define the fault as a major fault when the final successful fault
* is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
* handle it immediately previously).
*/
major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
if (major)
current->maj_flt++;
else
current->min_flt++;
/*
* If the fault is done for GUP, regs will be NULL. We only do the
* accounting for the per thread fault counters who triggered the
* fault, and we skip the perf event updates.
*/
if (!regs)
return;
if (major)
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
else
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
/*
* By the time we get here, we already hold the mm semaphore
*
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
count_memcg_event_mm(vma->vm_mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
return VM_FAULT_SIGSEGV;
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
* VM_FAULT_OOM), there is no need to kill anything.
* Just clean up the OOM state peacefully.
*/
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
}
mm_account_fault(regs, address, flags, ret);
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
#ifndef __PAGETABLE_P4D_FOLDED
/*
* Allocate p4d page table.
* We've already handled the fast-path in-line.
*/
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
p4d_t *new = p4d_alloc_one(mm, address);
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&mm->page_table_lock);
if (pgd_present(*pgd)) /* Another has populated it */
p4d_free(mm, new);
else
pgd_populate(mm, pgd, new);
spin_unlock(&mm->page_table_lock);
return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
* We've already handled the fast-path in-line.
*/
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
pud_t *new = pud_alloc_one(mm, address);
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&mm->page_table_lock);
if (!p4d_present(*p4d)) {
mm_inc_nr_puds(mm);
p4d_populate(mm, p4d, new);
} else /* Another has populated it */
pud_free(mm, new);
spin_unlock(&mm->page_table_lock);
return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_PMD_FOLDED
/*
* Allocate page middle directory.
* We've already handled the fast-path in-line.
*/
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
spinlock_t *ptl;
pmd_t *new = pmd_alloc_one(mm, address);
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
ptl = pud_lock(mm, pud);
if (!pud_present(*pud)) {
mm_inc_nr_pmds(mm);
pud_populate(mm, pud, new);
} else /* Another has populated it */
pmd_free(mm, new);
spin_unlock(ptl);
return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */
int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
struct mmu_notifier_range *range, pte_t **ptepp,
pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
goto out;
p4d = p4d_offset(pgd, address);
if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
goto out;
pud = pud_offset(p4d, address);
if (pud_none(*pud) || unlikely(pud_bad(*pud)))
goto out;
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
if (pmd_huge(*pmd)) {
if (!pmdpp)
goto out;
if (range) {
mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
NULL, mm, address & PMD_MASK,
(address & PMD_MASK) + PMD_SIZE);
mmu_notifier_invalidate_range_start(range);
}
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
*pmdpp = pmd;
return 0;
}
spin_unlock(*ptlp);
if (range)
mmu_notifier_invalidate_range_end(range);
}
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
if (range) {
mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
address & PAGE_MASK,
(address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(range);
}
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
*ptepp = ptep;
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
if (range)
mmu_notifier_invalidate_range_end(range);
out:
return -EINVAL;
}
/**
* follow_pte - look up PTE at a user virtual address
* @mm: the mm_struct of the target address space
* @address: user virtual address
* @ptepp: location to store found PTE
* @ptlp: location to store the lock for the PTE
*
* On a successful return, the pointer to the PTE is stored in @ptepp;
* the corresponding lock is taken and its location is stored in @ptlp.
* The contents of the PTE are only stable until @ptlp is released;
* any further use, if any, must be protected against invalidation
* with MMU notifiers.
*
* Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
* should be taken for read.
*
* KVM uses this function. While it is arguably less bad than ``follow_pfn``,
* it is not a good general-purpose API.
*
* Return: zero on success, -ve otherwise.
*/
int follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
}
EXPORT_SYMBOL_GPL(follow_pte);
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
* @address: user virtual address
* @pfn: location to store found PFN
*
* Only IO mappings and raw PFN mappings are allowed.
*
* This function does not allow the caller to read the permissions
* of the PTE. Do not use it.
*
* Return: zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn)
{
int ret = -EINVAL;
spinlock_t *ptl;
pte_t *ptep;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
return ret;
ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
if (ret)
return ret;
*pfn = pte_pfn(*ptep);
pte_unmap_unlock(ptep, ptl);
return 0;
}
EXPORT_SYMBOL(follow_pfn);
#ifdef CONFIG_HAVE_IOREMAP_PROT
int follow_phys(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned long *prot, resource_size_t *phys)
{
int ret = -EINVAL;
pte_t *ptep, pte;
spinlock_t *ptl;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
goto out;
pte = *ptep;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
*prot = pgprot_val(pte_pgprot(pte));
*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
ret = 0;
unlock:
pte_unmap_unlock(ptep, ptl);
out:
return ret;
}
/**
* generic_access_phys - generic implementation for iomem mmap access
* @vma: the vma to access
* @addr: userspace address, not relative offset within @vma
* @buf: buffer to read/write
* @len: length of transfer
* @write: set to FOLL_WRITE when writing, otherwise reading
*
* This is a generic implementation for &vm_operations_struct.access for an
* iomem mapping. This callback is used by access_process_vm() when the @vma is
* not page based.
*/
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
resource_size_t phys_addr;
unsigned long prot = 0;
void __iomem *maddr;
pte_t *ptep, pte;
spinlock_t *ptl;
int offset = offset_in_page(addr);
int ret = -EINVAL;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
return -EINVAL;
retry:
if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
return -EINVAL;
pte = *ptep;
pte_unmap_unlock(ptep, ptl);
prot = pgprot_val(pte_pgprot(pte));
phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
if ((write & FOLL_WRITE) && !pte_write(pte))
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
if (!maddr)
return -ENOMEM;
if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
goto out_unmap;
if (!pte_same(pte, *ptep)) {
pte_unmap_unlock(ptep, ptl);
iounmap(maddr);
goto retry;
}
if (write)
memcpy_toio(maddr + offset, buf, len);
else
memcpy_fromio(buf, maddr + offset, len);
ret = len;
pte_unmap_unlock(ptep, ptl);
out_unmap:
iounmap(maddr);
return ret;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
/*
* Access another process' address space as given in mm.
*/
int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
if (mmap_read_lock_killable(mm))
return 0;
/* ignore errors, just check how much was successfully transferred */
while (len) {
int bytes, ret, offset;
void *maddr;
struct page *page = NULL;
ret = get_user_pages_remote(mm, addr, 1,
gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
#else
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
vma = vma_lookup(mm, addr);
if (!vma)
break;
if (vma->vm_ops && vma->vm_ops->access)
ret = vma->vm_ops->access(vma, addr, buf,
len, write);
if (ret <= 0)
break;
bytes = ret;
#endif
} else {
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
maddr = kmap(page);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes);
set_page_dirty_lock(page);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
kunmap(page);
put_page(page);
}
len -= bytes;
buf += bytes;
addr += bytes;
}
mmap_read_unlock(mm);
return buf - old_buf;
}
/**
* access_remote_vm - access another process' address space
* @mm: the mm_struct of the target address space
* @addr: start address to access
* @buf: source or destination buffer
* @len: number of bytes to transfer
* @gup_flags: flags modifying lookup behaviour
*
* The caller must hold a reference on @mm.
*
* Return: number of bytes copied from source to destination.
*/
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
{
return __access_remote_vm(mm, addr, buf, len, gup_flags);
}
/*
* Access another process' address space.
* Source/target buffer must be kernel space,
* Do not walk the page table directly, use get_user_pages
*/
int access_process_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
{
struct mm_struct *mm;
int ret;
mm = get_task_mm(tsk);
if (!mm)
return 0;
ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
mmput(mm);
return ret;
}
EXPORT_SYMBOL_GPL(access_process_vm);
/*
* Print the name of a VMA.
*/
void print_vma_addr(char *prefix, unsigned long ip)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
/*
* we might be running from an atomic context so we cannot sleep
*/
if (!mmap_read_trylock(mm))
return;
vma = find_vma(mm, ip);
if (vma && vma->vm_file) {
struct file *f = vma->vm_file;
char *buf = (char *)__get_free_page(GFP_NOWAIT);
if (buf) {
char *p;
p = file_path(f, buf, PAGE_SIZE);
if (IS_ERR(p))
p = "?";
printk("%s%s[%lx+%lx]", prefix, kbasename(p),
vma->vm_start,
vma->vm_end - vma->vm_start);
free_page((unsigned long)buf);
}
}
mmap_read_unlock(mm);
}
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
/*
* Some code (nfs/sunrpc) uses socket ops on kernel memory while
* holding the mmap_lock, this is safe because kernel memory doesn't
* get paged out, therefore we'll never actually fault, and the
* below annotations will generate false positives.
*/
if (uaccess_kernel())
return;
if (pagefault_disabled())
return;
__might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(¤t->mm->mmap_lock);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
* Process all subpages of the specified huge page with the specified
* operation. The target subpage will be processed last to keep its
* cache lines hot.
*/
static inline void process_huge_page(
unsigned long addr_hint, unsigned int pages_per_huge_page,
void (*process_subpage)(unsigned long addr, int idx, void *arg),
void *arg)
{
int i, n, base, l;
unsigned long addr = addr_hint &
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
/* Process target subpage last to keep its cache lines hot */
might_sleep();
n = (addr_hint - addr) / PAGE_SIZE;
if (2 * n <= pages_per_huge_page) {
/* If target subpage in first half of huge page */
base = 0;
l = n;
/* Process subpages at the end of huge page */
for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
cond_resched();
process_subpage(addr + i * PAGE_SIZE, i, arg);
}
} else {
/* If target subpage in second half of huge page */
base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
l = pages_per_huge_page - n;
/* Process subpages at the begin of huge page */
for (i = 0; i < base; i++) {
cond_resched();
process_subpage(addr + i * PAGE_SIZE, i, arg);
}
}
/*
* Process remaining subpages in left-right-left-right pattern
* towards the target subpage
*/
for (i = 0; i < l; i++) {
int left_idx = base + i;
int right_idx = base + 2 * l - 1 - i;
cond_resched();
process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
cond_resched();
process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
}
}
static void clear_gigantic_page(struct page *page,
unsigned long addr,
unsigned int pages_per_huge_page)
{
int i;
struct page *p = page;
might_sleep();
for (i = 0; i < pages_per_huge_page;
i++, p = mem_map_next(p, page, i)) {
cond_resched();
clear_user_highpage(p, addr + i * PAGE_SIZE);
}
}
static void clear_subpage(unsigned long addr, int idx, void *arg)
{
struct page *page = arg;
clear_user_highpage(page + idx, addr);
}
void clear_huge_page(struct page *page,
unsigned long addr_hint, unsigned int pages_per_huge_page)
{
unsigned long addr = addr_hint &
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, pages_per_huge_page);
return;
}
process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
}
static void copy_user_gigantic_page(struct page *dst, struct page *src,
unsigned long addr,
struct vm_area_struct *vma,
unsigned int pages_per_huge_page)
{
int i;
struct page *dst_base = dst;
struct page *src_base = src;
for (i = 0; i < pages_per_huge_page; ) {
cond_resched();
copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
i++;
dst = mem_map_next(dst, dst_base, i);
src = mem_map_next(src, src_base, i);
}
}
struct copy_subpage_arg {
struct page *dst;
struct page *src;
struct vm_area_struct *vma;
};
static void copy_subpage(unsigned long addr, int idx, void *arg)
{
struct copy_subpage_arg *copy_arg = arg;
copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
addr, copy_arg->vma);
}
void copy_user_huge_page(struct page *dst, struct page *src,
unsigned long addr_hint, struct vm_area_struct *vma,
unsigned int pages_per_huge_page)
{
unsigned long addr = addr_hint &
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
struct copy_subpage_arg arg = {
.dst = dst,
.src = src,
.vma = vma,
};
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
copy_user_gigantic_page(dst, src, addr, vma,
pages_per_huge_page);
return;
}
process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}
long copy_huge_page_from_user(struct page *dst_page,
const void __user *usr_src,
unsigned int pages_per_huge_page,
bool allow_pagefault)
{
void *src = (void *)usr_src;
void *page_kaddr;
unsigned long i, rc = 0;
unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
struct page *subpage = dst_page;
for (i = 0; i < pages_per_huge_page;
i++, subpage = mem_map_next(subpage, dst_page, i)) {
if (allow_pagefault)
page_kaddr = kmap(subpage);
else
page_kaddr = kmap_atomic(subpage);
rc = copy_from_user(page_kaddr,
(const void __user *)(src + i * PAGE_SIZE),
PAGE_SIZE);
if (allow_pagefault)
kunmap(subpage);
else
kunmap_atomic(page_kaddr);
ret_val -= (PAGE_SIZE - rc);
if (rc)
break;
cond_resched();
}
return ret_val;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
static struct kmem_cache *page_ptl_cachep;
void __init ptlock_cache_init(void)
{
page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
SLAB_PANIC, NULL);
}
bool ptlock_alloc(struct page *page)
{
spinlock_t *ptl;
ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
if (!ptl)
return false;
page->ptl = ptl;
return true;
}
void ptlock_free(struct page *page)
{
kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif
/*
* Utility functions for x86 operand and address decoding
*
* Copyright (C) Intel Corporation 2017
*/
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/ratelimit.h>
#include <linux/mmu_context.h>
#include <asm/desc_defs.h>
#include <asm/desc.h>
#include <asm/inat.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/ldt.h>
#include <asm/vm86.h>
#undef pr_fmt
#define pr_fmt(fmt) "insn: " fmt
enum reg_type {
REG_TYPE_RM = 0,
REG_TYPE_REG,
REG_TYPE_INDEX,
REG_TYPE_BASE,
};
/**
* is_string_insn() - Determine if instruction is a string instruction
* @insn: Instruction containing the opcode to inspect
*
* Returns:
*
* true if the instruction, determined by the opcode, is any of the
* string instructions as defined in the Intel Software Development manual.
* False otherwise.
*/
static bool is_string_insn(struct insn *insn)
{
insn_get_opcode(insn);
/* All string instructions have a 1-byte opcode. */
if (insn->opcode.nbytes != 1)
return false;
switch (insn->opcode.bytes[0]) {
case 0x6c ... 0x6f: /* INS, OUTS */
case 0xa4 ... 0xa7: /* MOVS, CMPS */
case 0xaa ... 0xaf: /* STOS, LODS, SCAS */
return true;
default:
return false;
}
}
/**
* insn_has_rep_prefix() - Determine if instruction has a REP prefix
* @insn: Instruction containing the prefix to inspect
*
* Returns:
*
* true if the instruction has a REP prefix, false if not.
*/
bool insn_has_rep_prefix(struct insn *insn)
{
insn_byte_t p;
int i;
insn_get_prefixes(insn);
for_each_insn_prefix(insn, i, p) {
if (p == 0xf2 || p == 0xf3)
return true;
}
return false;
}
/**
* get_seg_reg_override_idx() - obtain segment register override index
* @insn: Valid instruction with segment override prefixes
*
* Inspect the instruction prefixes in @insn and find segment overrides, if any.
*
* Returns:
*
* A constant identifying the segment register to use, among CS, SS, DS,
* ES, FS, or GS. INAT_SEG_REG_DEFAULT is returned if no segment override
* prefixes were found.
*
* -EINVAL in case of error.
*/
static int get_seg_reg_override_idx(struct insn *insn)
{
int idx = INAT_SEG_REG_DEFAULT;
int num_overrides = 0, i;
insn_byte_t p;
insn_get_prefixes(insn);
/* Look for any segment override prefixes. */
for_each_insn_prefix(insn, i, p) {
insn_attr_t attr;
attr = inat_get_opcode_attribute(p);
switch (attr) {
case INAT_MAKE_PREFIX(INAT_PFX_CS):
idx = INAT_SEG_REG_CS;
num_overrides++;
break;
case INAT_MAKE_PREFIX(INAT_PFX_SS):
idx = INAT_SEG_REG_SS;
num_overrides++;
break;
case INAT_MAKE_PREFIX(INAT_PFX_DS):
idx = INAT_SEG_REG_DS;
num_overrides++;
break;
case INAT_MAKE_PREFIX(INAT_PFX_ES):
idx = INAT_SEG_REG_ES;
num_overrides++;
break;
case INAT_MAKE_PREFIX(INAT_PFX_FS):
idx = INAT_SEG_REG_FS;
num_overrides++;
break;
case INAT_MAKE_PREFIX(INAT_PFX_GS):
idx = INAT_SEG_REG_GS;
num_overrides++;
break;
/* No default action needed. */
}
}
/* More than one segment override prefix leads to undefined behavior. */
if (num_overrides > 1)
return -EINVAL;
return idx;
}
/**
* check_seg_overrides() - check if segment override prefixes are allowed
* @insn: Valid instruction with segment override prefixes
* @regoff: Operand offset, in pt_regs, for which the check is performed
*
* For a particular register used in register-indirect addressing, determine if
* segment override prefixes can be used. Specifically, no overrides are allowed
* for rDI if used with a string instruction.
*
* Returns:
*
* True if segment override prefixes can be used with the register indicated
* in @regoff. False if otherwise.
*/
static bool check_seg_overrides(struct insn *insn, int regoff)
{
if (regoff == offsetof(struct pt_regs, di) && is_string_insn(insn))
return false;
return true;
}
/**
* resolve_default_seg() - resolve default segment register index for an operand
* @insn: Instruction with opcode and address size. Must be valid.
* @regs: Register values as seen when entering kernel mode
* @off: Operand offset, in pt_regs, for which resolution is needed
*
* Resolve the default segment register index associated with the instruction
* operand register indicated by @off. Such index is resolved based on defaults
* described in the Intel Software Development Manual.
*
* Returns:
*
* If in protected mode, a constant identifying the segment register to use,
* among CS, SS, ES or DS. If in long mode, INAT_SEG_REG_IGNORE.
*
* -EINVAL in case of error.
*/
static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off)
{
if (any_64bit_mode(regs))
return INAT_SEG_REG_IGNORE;
/*
* Resolve the default segment register as described in Section 3.7.4
* of the Intel Software Development Manual Vol. 1:
*
* + DS for all references involving r[ABCD]X, and rSI.
* + If used in a string instruction, ES for rDI. Otherwise, DS.
* + AX, CX and DX are not valid register operands in 16-bit address
* encodings but are valid for 32-bit and 64-bit encodings.
* + -EDOM is reserved to identify for cases in which no register
* is used (i.e., displacement-only addressing). Use DS.
* + SS for rSP or rBP.
* + CS for rIP.
*/
switch (off) {
case offsetof(struct pt_regs, ax):
case offsetof(struct pt_regs, cx):
case offsetof(struct pt_regs, dx):
/* Need insn to verify address size. */
if (insn->addr_bytes == 2)
return -EINVAL;
fallthrough;
case -EDOM:
case offsetof(struct pt_regs, bx):
case offsetof(struct pt_regs, si):
return INAT_SEG_REG_DS;
case offsetof(struct pt_regs, di):
if (is_string_insn(insn))
return INAT_SEG_REG_ES;
return INAT_SEG_REG_DS;
case offsetof(struct pt_regs, bp):
case offsetof(struct pt_regs, sp):
return INAT_SEG_REG_SS;
case offsetof(struct pt_regs, ip):
return INAT_SEG_REG_CS;
default:
return -EINVAL;
}
}
/**
* resolve_seg_reg() - obtain segment register index
* @insn: Instruction with operands
* @regs: Register values as seen when entering kernel mode
* @regoff: Operand offset, in pt_regs, used to determine segment register
*
* Determine the segment register associated with the operands and, if
* applicable, prefixes and the instruction pointed by @insn.
*
* The segment register associated to an operand used in register-indirect
* addressing depends on:
*
* a) Whether running in long mode (in such a case segments are ignored, except
* if FS or GS are used).
*
* b) Whether segment override prefixes can be used. Certain instructions and
* registers do not allow override prefixes.
*
* c) Whether segment overrides prefixes are found in the instruction prefixes.
*
* d) If there are not segment override prefixes or they cannot be used, the
* default segment register associated with the operand register is used.
*
* The function checks first if segment override prefixes can be used with the
* operand indicated by @regoff. If allowed, obtain such overridden segment
* register index. Lastly, if not prefixes were found or cannot be used, resolve
* the segment register index to use based on the defaults described in the
* Intel documentation. In long mode, all segment register indexes will be
* ignored, except if overrides were found for FS or GS. All these operations
* are done using helper functions.
*
* The operand register, @regoff, is represented as the offset from the base of
* pt_regs.
*
* As stated, the main use of this function is to determine the segment register
* index based on the instruction, its operands and prefixes. Hence, @insn
* must be valid. However, if @regoff indicates rIP, we don't need to inspect
* @insn at all as in this case CS is used in all cases. This case is checked
* before proceeding further.
*
* Please note that this function does not return the value in the segment
* register (i.e., the segment selector) but our defined index. The segment
* selector needs to be obtained using get_segment_selector() and passing the
* segment register index resolved by this function.
*
* Returns:
*
* An index identifying the segment register to use, among CS, SS, DS,
* ES, FS, or GS. INAT_SEG_REG_IGNORE is returned if running in long mode.
*
* -EINVAL in case of error.
*/
static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff)
{
int idx;
/*
* In the unlikely event of having to resolve the segment register
* index for rIP, do it first. Segment override prefixes should not
* be used. Hence, it is not necessary to inspect the instruction,
* which may be invalid at this point.
*/
if (regoff == offsetof(struct pt_regs, ip)) {
if (any_64bit_mode(regs))
return INAT_SEG_REG_IGNORE;
else
return INAT_SEG_REG_CS;
}
if (!insn)
return -EINVAL;
if (!check_seg_overrides(insn, regoff))
return resolve_default_seg(insn, regs, regoff);
idx = get_seg_reg_override_idx(insn);
if (idx < 0)
return idx;
if (idx == INAT_SEG_REG_DEFAULT)
return resolve_default_seg(insn, regs, regoff);
/*
* In long mode, segment override prefixes are ignored, except for
* overrides for FS and GS.
*/
if (any_64bit_mode(regs)) {
if (idx != INAT_SEG_REG_FS &&
idx != INAT_SEG_REG_GS)
idx = INAT_SEG_REG_IGNORE;
}
return idx;
}
/**
* get_segment_selector() - obtain segment selector
* @regs: Register values as seen when entering kernel mode
* @seg_reg_idx: Segment register index to use
*
* Obtain the segment selector from any of the CS, SS, DS, ES, FS, GS segment
* registers. In CONFIG_X86_32, the segment is obtained from either pt_regs or
* kernel_vm86_regs as applicable. In CONFIG_X86_64, CS and SS are obtained
* from pt_regs. DS, ES, FS and GS are obtained by reading the actual CPU
* registers. This done for only for completeness as in CONFIG_X86_64 segment
* registers are ignored.
*
* Returns:
*
* Value of the segment selector, including null when running in
* long mode.
*
* -EINVAL on error.
*/
static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
{
#ifdef CONFIG_X86_64
unsigned short sel;
switch (seg_reg_idx) {
case INAT_SEG_REG_IGNORE:
return 0;
case INAT_SEG_REG_CS:
return (unsigned short)(regs->cs & 0xffff);
case INAT_SEG_REG_SS:
return (unsigned short)(regs->ss & 0xffff);
case INAT_SEG_REG_DS:
savesegment(ds, sel);
return sel;
case INAT_SEG_REG_ES:
savesegment(es, sel);
return sel;
case INAT_SEG_REG_FS:
savesegment(fs, sel);
return sel;
case INAT_SEG_REG_GS:
savesegment(gs, sel);
return sel;
default:
return -EINVAL;
}
#else /* CONFIG_X86_32 */
struct kernel_vm86_regs *vm86regs = (struct kernel_vm86_regs *)regs;
if (v8086_mode(regs)) {
switch (seg_reg_idx) {
case INAT_SEG_REG_CS:
return (unsigned short)(regs->cs & 0xffff);
case INAT_SEG_REG_SS:
return (unsigned short)(regs->ss & 0xffff);
case INAT_SEG_REG_DS:
return vm86regs->ds;
case INAT_SEG_REG_ES:
return vm86regs->es;
case INAT_SEG_REG_FS:
return vm86regs->fs;
case INAT_SEG_REG_GS:
return vm86regs->gs;
case INAT_SEG_REG_IGNORE:
default:
return -EINVAL;
}
}
switch (seg_reg_idx) {
case INAT_SEG_REG_CS:
return (unsigned short)(regs->cs & 0xffff);
case INAT_SEG_REG_SS:
return (unsigned short)(regs->ss & 0xffff);
case INAT_SEG_REG_DS:
return (unsigned short)(regs->ds & 0xffff);
case INAT_SEG_REG_ES:
return (unsigned short)(regs->es & 0xffff);
case INAT_SEG_REG_FS:
return (unsigned short)(regs->fs & 0xffff);
case INAT_SEG_REG_GS:
return get_user_gs(regs);
case INAT_SEG_REG_IGNORE:
default:
return -EINVAL;
}
#endif /* CONFIG_X86_64 */
}
static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
enum reg_type type)
{
int regno = 0;
static const int regoff[] = {
offsetof(struct pt_regs, ax),
offsetof(struct pt_regs, cx),
offsetof(struct pt_regs, dx),
offsetof(struct pt_regs, bx),
offsetof(struct pt_regs, sp),
offsetof(struct pt_regs, bp),
offsetof(struct pt_regs, si),
offsetof(struct pt_regs, di),
#ifdef CONFIG_X86_64
offsetof(struct pt_regs, r8),
offsetof(struct pt_regs, r9),
offsetof(struct pt_regs, r10),
offsetof(struct pt_regs, r11),
offsetof(struct pt_regs, r12),
offsetof(struct pt_regs, r13),
offsetof(struct pt_regs, r14),
offsetof(struct pt_regs, r15),
#endif
};
int nr_registers = ARRAY_SIZE(regoff);
/*
* Don't possibly decode a 32-bit instructions as
* reading a 64-bit-only register.
*/
if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
nr_registers -= 8;
switch (type) {
case REG_TYPE_RM:
regno = X86_MODRM_RM(insn->modrm.value);
/*
* ModRM.mod == 0 and ModRM.rm == 5 means a 32-bit displacement
* follows the ModRM byte.
*/
if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5)
return -EDOM;
if (X86_REX_B(insn->rex_prefix.value))
regno += 8;
break;
case REG_TYPE_REG:
regno = X86_MODRM_REG(insn->modrm.value);
if (X86_REX_R(insn->rex_prefix.value))
regno += 8;
break;
case REG_TYPE_INDEX:
regno = X86_SIB_INDEX(insn->sib.value);
if (X86_REX_X(insn->rex_prefix.value))
regno += 8;
/*
* If ModRM.mod != 3 and SIB.index = 4 the scale*index
* portion of the address computation is null. This is
* true only if REX.X is 0. In such a case, the SIB index
* is used in the address computation.
*/
if (X86_MODRM_MOD(insn->modrm.value) != 3 && regno == 4)
return -EDOM;
break;
case REG_TYPE_BASE:
regno = X86_SIB_BASE(insn->sib.value);
/*
* If ModRM.mod is 0 and SIB.base == 5, the base of the
* register-indirect addressing is 0. In this case, a
* 32-bit displacement follows the SIB byte.
*/
if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5)
return -EDOM;
if (X86_REX_B(insn->rex_prefix.value))
regno += 8;
break;
default:
pr_err_ratelimited("invalid register type: %d\n", type);
return -EINVAL;
}
if (regno >= nr_registers) {
WARN_ONCE(1, "decoded an instruction with an invalid register");
return -EINVAL;
}
return regoff[regno];
}
/**
* get_reg_offset_16() - Obtain offset of register indicated by instruction
* @insn: Instruction containing ModRM byte
* @regs: Register values as seen when entering kernel mode
* @offs1: Offset of the first operand register
* @offs2: Offset of the second operand register, if applicable
*
* Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte
* in @insn. This function is to be used with 16-bit address encodings. The
* @offs1 and @offs2 will be written with the offset of the two registers
* indicated by the instruction. In cases where any of the registers is not
* referenced by the instruction, the value will be set to -EDOM.
*
* Returns:
*
* 0 on success, -EINVAL on error.
*/
static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs,
int *offs1, int *offs2)
{
/*
* 16-bit addressing can use one or two registers. Specifics of
* encodings are given in Table 2-1. "16-Bit Addressing Forms with the
* ModR/M Byte" of the Intel Software Development Manual.
*/
static const int regoff1[] = {
offsetof(struct pt_regs, bx),
offsetof(struct pt_regs, bx),
offsetof(struct pt_regs, bp),
offsetof(struct pt_regs, bp),
offsetof(struct pt_regs, si),
offsetof(struct pt_regs, di),
offsetof(struct pt_regs, bp),
offsetof(struct pt_regs, bx),
};
static const int regoff2[] = {
offsetof(struct pt_regs, si),
offsetof(struct pt_regs, di),
offsetof(struct pt_regs, si),
offsetof(struct pt_regs, di),
-EDOM,
-EDOM,
-EDOM,
-EDOM,
};
if (!offs1 || !offs2)
return -EINVAL;
/* Operand is a register, use the generic function. */
if (X86_MODRM_MOD(insn->modrm.value) == 3) {
*offs1 = insn_get_modrm_rm_off(insn, regs);
*offs2 = -EDOM;
return 0;
}
*offs1 = regoff1[X86_MODRM_RM(insn->modrm.value)];
*offs2 = regoff2[X86_MODRM_RM(insn->modrm.value)];
/*
* If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement-
* only addressing. This means that no registers are involved in
* computing the effective address. Thus, ensure that the first
* register offset is invalid. The second register offset is already
* invalid under the aforementioned conditions.
*/
if ((X86_MODRM_MOD(insn->modrm.value) == 0) &&
(X86_MODRM_RM(insn->modrm.value) == 6))
*offs1 = -EDOM;
return 0;
}
/**
* get_desc() - Obtain contents of a segment descriptor
* @out: Segment descriptor contents on success
* @sel: Segment selector
*
* Given a segment selector, obtain a pointer to the segment descriptor.
* Both global and local descriptor tables are supported.
*
* Returns:
*
* True on success, false on failure.
*
* NULL on error.
*/
static bool get_desc(struct desc_struct *out, unsigned short sel)
{
struct desc_ptr gdt_desc = {0, 0};
unsigned long desc_base;
#ifdef CONFIG_MODIFY_LDT_SYSCALL
if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) {
bool success = false;
struct ldt_struct *ldt;
/* Bits [15:3] contain the index of the desired entry. */
sel >>= 3;
mutex_lock(¤t->active_mm->context.lock);
ldt = current->active_mm->context.ldt;
if (ldt && sel < ldt->nr_entries) {
*out = ldt->entries[sel];
success = true;
}
mutex_unlock(¤t->active_mm->context.lock);
return success;
}
#endif
native_store_gdt(&gdt_desc);
/*
* Segment descriptors have a size of 8 bytes. Thus, the index is
* multiplied by 8 to obtain the memory offset of the desired descriptor
* from the base of the GDT. As bits [15:3] of the segment selector
* contain the index, it can be regarded as multiplied by 8 already.
* All that remains is to clear bits [2:0].
*/
desc_base = sel & ~(SEGMENT_RPL_MASK | SEGMENT_TI_MASK);
if (desc_base > gdt_desc.size)
return false;
*out = *(struct desc_struct *)(gdt_desc.address + desc_base); return true;
}
/**
* insn_get_seg_base() - Obtain base address of segment descriptor.
* @regs: Register values as seen when entering kernel mode
* @seg_reg_idx: Index of the segment register pointing to seg descriptor
*
* Obtain the base address of the segment as indicated by the segment descriptor
* pointed by the segment selector. The segment selector is obtained from the
* input segment register index @seg_reg_idx.
*
* Returns:
*
* In protected mode, base address of the segment. Zero in long mode,
* except when FS or GS are used. In virtual-8086 mode, the segment
* selector shifted 4 bits to the right.
*
* -1L in case of error.
*/
unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
{
struct desc_struct desc;
short sel;
sel = get_segment_selector(regs, seg_reg_idx);
if (sel < 0)
return -1L;
if (v8086_mode(regs))
/*
* Base is simply the segment selector shifted 4
* bits to the right.
*/
return (unsigned long)(sel << 4);
if (any_64bit_mode(regs)) {
/*
* Only FS or GS will have a base address, the rest of
* the segments' bases are forced to 0.
*/
unsigned long base;
if (seg_reg_idx == INAT_SEG_REG_FS) {
rdmsrl(MSR_FS_BASE, base);
} else if (seg_reg_idx == INAT_SEG_REG_GS) {
/*
* swapgs was called at the kernel entry point. Thus,
* MSR_KERNEL_GS_BASE will have the user-space GS base.
*/
if (user_mode(regs))
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
rdmsrl(MSR_GS_BASE, base);
} else {
base = 0;
}
return base;
}
/* In protected mode the segment selector cannot be null. */
if (!sel)
return -1L;
if (!get_desc(&desc, sel))
return -1L;
return get_desc_base(&desc);
}
/**
* get_seg_limit() - Obtain the limit of a segment descriptor
* @regs: Register values as seen when entering kernel mode
* @seg_reg_idx: Index of the segment register pointing to seg descriptor
*
* Obtain the limit of the segment as indicated by the segment descriptor
* pointed by the segment selector. The segment selector is obtained from the
* input segment register index @seg_reg_idx.
*
* Returns:
*
* In protected mode, the limit of the segment descriptor in bytes.
* In long mode and virtual-8086 mode, segment limits are not enforced. Thus,
* limit is returned as -1L to imply a limit-less segment.
*
* Zero is returned on error.
*/
static unsigned long get_seg_limit(struct pt_regs *regs, int seg_reg_idx)
{
struct desc_struct desc;
unsigned long limit;
short sel;
sel = get_segment_selector(regs, seg_reg_idx);
if (sel < 0)
return 0;
if (any_64bit_mode(regs) || v8086_mode(regs))
return -1L;
if (!sel)
return 0;
if (!get_desc(&desc, sel))
return 0;
/*
* If the granularity bit is set, the limit is given in multiples
* of 4096. This also means that the 12 least significant bits are
* not tested when checking the segment limits. In practice,
* this means that the segment ends in (limit << 12) + 0xfff.
*/
limit = get_desc_limit(&desc);
if (desc.g)
limit = (limit << 12) + 0xfff;
return limit;
}
/**
* insn_get_code_seg_params() - Obtain code segment parameters
* @regs: Structure with register values as seen when entering kernel mode
*
* Obtain address and operand sizes of the code segment. It is obtained from the
* selector contained in the CS register in regs. In protected mode, the default
* address is determined by inspecting the L and D bits of the segment
* descriptor. In virtual-8086 mode, the default is always two bytes for both
* address and operand sizes.
*
* Returns:
*
* An int containing ORed-in default parameters on success.
*
* -EINVAL on error.
*/
int insn_get_code_seg_params(struct pt_regs *regs)
{
struct desc_struct desc;
short sel;
if (v8086_mode(regs))
/* Address and operand size are both 16-bit. */
return INSN_CODE_SEG_PARAMS(2, 2);
sel = get_segment_selector(regs, INAT_SEG_REG_CS);
if (sel < 0)
return sel; if (!get_desc(&desc, sel)) return -EINVAL;
/*
* The most significant byte of the Type field of the segment descriptor
* determines whether a segment contains data or code. If this is a data
* segment, return error.
*/
if (!(desc.type & BIT(3)))
return -EINVAL;
switch ((desc.l << 1) | desc.d) {
case 0: /*
* Legacy mode. CS.L=0, CS.D=0. Address and operand size are
* both 16-bit.
*/
return INSN_CODE_SEG_PARAMS(2, 2);
case 1: /*
* Legacy mode. CS.L=0, CS.D=1. Address and operand size are
* both 32-bit.
*/
return INSN_CODE_SEG_PARAMS(4, 4);
case 2: /*
* IA-32e 64-bit mode. CS.L=1, CS.D=0. Address size is 64-bit;
* operand size is 32-bit.
*/
return INSN_CODE_SEG_PARAMS(4, 8);
case 3: /* Invalid setting. CS.L=1, CS.D=1 */
fallthrough;
default:
return -EINVAL;
}
}
/**
* insn_get_modrm_rm_off() - Obtain register in r/m part of the ModRM byte
* @insn: Instruction containing the ModRM byte
* @regs: Register values as seen when entering kernel mode
*
* Returns:
*
* The register indicated by the r/m part of the ModRM byte. The
* register is obtained as an offset from the base of pt_regs. In specific
* cases, the returned value can be -EDOM to indicate that the particular value
* of ModRM does not refer to a register and shall be ignored.
*/
int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs)
{
return get_reg_offset(insn, regs, REG_TYPE_RM);
}
/**
* insn_get_modrm_reg_off() - Obtain register in reg part of the ModRM byte
* @insn: Instruction containing the ModRM byte
* @regs: Register values as seen when entering kernel mode
*
* Returns:
*
* The register indicated by the reg part of the ModRM byte. The
* register is obtained as an offset from the base of pt_regs.
*/
int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs)
{
return get_reg_offset(insn, regs, REG_TYPE_REG);
}
/**
* get_seg_base_limit() - obtain base address and limit of a segment
* @insn: Instruction. Must be valid.
* @regs: Register values as seen when entering kernel mode
* @regoff: Operand offset, in pt_regs, used to resolve segment descriptor
* @base: Obtained segment base
* @limit: Obtained segment limit
*
* Obtain the base address and limit of the segment associated with the operand
* @regoff and, if any or allowed, override prefixes in @insn. This function is
* different from insn_get_seg_base() as the latter does not resolve the segment
* associated with the instruction operand. If a limit is not needed (e.g.,
* when running in long mode), @limit can be NULL.
*
* Returns:
*
* 0 on success. @base and @limit will contain the base address and of the
* resolved segment, respectively.
*
* -EINVAL on error.
*/
static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs,
int regoff, unsigned long *base,
unsigned long *limit)
{
int seg_reg_idx;
if (!base)
return -EINVAL;
seg_reg_idx = resolve_seg_reg(insn, regs, regoff);
if (seg_reg_idx < 0)
return seg_reg_idx;
*base = insn_get_seg_base(regs, seg_reg_idx);
if (*base == -1L)
return -EINVAL;
if (!limit)
return 0;
*limit = get_seg_limit(regs, seg_reg_idx);
if (!(*limit))
return -EINVAL;
return 0;
}
/**
* get_eff_addr_reg() - Obtain effective address from register operand
* @insn: Instruction. Must be valid.
* @regs: Register values as seen when entering kernel mode
* @regoff: Obtained operand offset, in pt_regs, with the effective address
* @eff_addr: Obtained effective address
*
* Obtain the effective address stored in the register operand as indicated by
* the ModRM byte. This function is to be used only with register addressing
* (i.e., ModRM.mod is 3). The effective address is saved in @eff_addr. The
* register operand, as an offset from the base of pt_regs, is saved in @regoff;
* such offset can then be used to resolve the segment associated with the
* operand. This function can be used with any of the supported address sizes
* in x86.
*
* Returns:
*
* 0 on success. @eff_addr will have the effective address stored in the
* operand indicated by ModRM. @regoff will have such operand as an offset from
* the base of pt_regs.
*
* -EINVAL on error.
*/
static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs,
int *regoff, long *eff_addr)
{
int ret;
ret = insn_get_modrm(insn);
if (ret)
return ret;
if (X86_MODRM_MOD(insn->modrm.value) != 3)
return -EINVAL;
*regoff = get_reg_offset(insn, regs, REG_TYPE_RM);
if (*regoff < 0)
return -EINVAL;
/* Ignore bytes that are outside the address size. */
if (insn->addr_bytes == 2)
*eff_addr = regs_get_register(regs, *regoff) & 0xffff;
else if (insn->addr_bytes == 4)
*eff_addr = regs_get_register(regs, *regoff) & 0xffffffff;
else /* 64-bit address */
*eff_addr = regs_get_register(regs, *regoff);
return 0;
}
/**
* get_eff_addr_modrm() - Obtain referenced effective address via ModRM
* @insn: Instruction. Must be valid.
* @regs: Register values as seen when entering kernel mode
* @regoff: Obtained operand offset, in pt_regs, associated with segment
* @eff_addr: Obtained effective address
*
* Obtain the effective address referenced by the ModRM byte of @insn. After
* identifying the registers involved in the register-indirect memory reference,
* its value is obtained from the operands in @regs. The computed address is
* stored @eff_addr. Also, the register operand that indicates the associated
* segment is stored in @regoff, this parameter can later be used to determine
* such segment.
*
* Returns:
*
* 0 on success. @eff_addr will have the referenced effective address. @regoff
* will have a register, as an offset from the base of pt_regs, that can be used
* to resolve the associated segment.
*
* -EINVAL on error.
*/
static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs,
int *regoff, long *eff_addr)
{
long tmp;
int ret;
if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
return -EINVAL;
ret = insn_get_modrm(insn);
if (ret)
return ret;
if (X86_MODRM_MOD(insn->modrm.value) > 2)
return -EINVAL;
*regoff = get_reg_offset(insn, regs, REG_TYPE_RM);
/*
* -EDOM means that we must ignore the address_offset. In such a case,
* in 64-bit mode the effective address relative to the rIP of the
* following instruction.
*/
if (*regoff == -EDOM) {
if (any_64bit_mode(regs))
tmp = regs->ip + insn->length;
else
tmp = 0;
} else if (*regoff < 0) {
return -EINVAL;
} else {
tmp = regs_get_register(regs, *regoff);
}
if (insn->addr_bytes == 4) {
int addr32 = (int)(tmp & 0xffffffff) + insn->displacement.value;
*eff_addr = addr32 & 0xffffffff;
} else {
*eff_addr = tmp + insn->displacement.value;
}
return 0;
}
/**
* get_eff_addr_modrm_16() - Obtain referenced effective address via ModRM
* @insn: Instruction. Must be valid.
* @regs: Register values as seen when entering kernel mode
* @regoff: Obtained operand offset, in pt_regs, associated with segment
* @eff_addr: Obtained effective address
*
* Obtain the 16-bit effective address referenced by the ModRM byte of @insn.
* After identifying the registers involved in the register-indirect memory
* reference, its value is obtained from the operands in @regs. The computed
* address is stored @eff_addr. Also, the register operand that indicates
* the associated segment is stored in @regoff, this parameter can later be used
* to determine such segment.
*
* Returns:
*
* 0 on success. @eff_addr will have the referenced effective address. @regoff
* will have a register, as an offset from the base of pt_regs, that can be used
* to resolve the associated segment.
*
* -EINVAL on error.
*/
static int get_eff_addr_modrm_16(struct insn *insn, struct pt_regs *regs,
int *regoff, short *eff_addr)
{
int addr_offset1, addr_offset2, ret;
short addr1 = 0, addr2 = 0, displacement;
if (insn->addr_bytes != 2)
return -EINVAL;
insn_get_modrm(insn);
if (!insn->modrm.nbytes)
return -EINVAL;
if (X86_MODRM_MOD(insn->modrm.value) > 2)
return -EINVAL;
ret = get_reg_offset_16(insn, regs, &addr_offset1, &addr_offset2);
if (ret < 0)
return -EINVAL;
/*
* Don't fail on invalid offset values. They might be invalid because
* they cannot be used for this particular value of ModRM. Instead, use
* them in the computation only if they contain a valid value.
*/
if (addr_offset1 != -EDOM)
addr1 = regs_get_register(regs, addr_offset1) & 0xffff;
if (addr_offset2 != -EDOM)
addr2 = regs_get_register(regs, addr_offset2) & 0xffff;
displacement = insn->displacement.value & 0xffff;
*eff_addr = addr1 + addr2 + displacement;
/*
* The first operand register could indicate to use of either SS or DS
* registers to obtain the segment selector. The second operand
* register can only indicate the use of DS. Thus, the first operand
* will be used to obtain the segment selector.
*/
*regoff = addr_offset1;
return 0;
}
/**
* get_eff_addr_sib() - Obtain referenced effective address via SIB
* @insn: Instruction. Must be valid.
* @regs: Register values as seen when entering kernel mode
* @regoff: Obtained operand offset, in pt_regs, associated with segment
* @eff_addr: Obtained effective address
*
* Obtain the effective address referenced by the SIB byte of @insn. After
* identifying the registers involved in the indexed, register-indirect memory
* reference, its value is obtained from the operands in @regs. The computed
* address is stored @eff_addr. Also, the register operand that indicates the
* associated segment is stored in @regoff, this parameter can later be used to
* determine such segment.
*
* Returns:
*
* 0 on success. @eff_addr will have the referenced effective address.
* @base_offset will have a register, as an offset from the base of pt_regs,
* that can be used to resolve the associated segment.
*
* Negative value on error.
*/
static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs,
int *base_offset, long *eff_addr)
{
long base, indx;
int indx_offset;
int ret;
if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
return -EINVAL;
ret = insn_get_modrm(insn);
if (ret)
return ret;
if (!insn->modrm.nbytes)
return -EINVAL;
if (X86_MODRM_MOD(insn->modrm.value) > 2)
return -EINVAL;
ret = insn_get_sib(insn);
if (ret)
return ret;
if (!insn->sib.nbytes)
return -EINVAL;
*base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
/*
* Negative values in the base and index offset means an error when
* decoding the SIB byte. Except -EDOM, which means that the registers
* should not be used in the address computation.
*/
if (*base_offset == -EDOM)
base = 0;
else if (*base_offset < 0)
return -EINVAL;
else
base = regs_get_register(regs, *base_offset);
if (indx_offset == -EDOM)
indx = 0;
else if (indx_offset < 0)
return -EINVAL;
else
indx = regs_get_register(regs, indx_offset);
if (insn->addr_bytes == 4) {
int addr32, base32, idx32;
base32 = base & 0xffffffff;
idx32 = indx & 0xffffffff;
addr32 = base32 + idx32 * (1 << X86_SIB_SCALE(insn->sib.value));
addr32 += insn->displacement.value;
*eff_addr = addr32 & 0xffffffff;
} else {
*eff_addr = base + indx * (1 << X86_SIB_SCALE(insn->sib.value));
*eff_addr += insn->displacement.value;
}
return 0;
}
/**
* get_addr_ref_16() - Obtain the 16-bit address referred by instruction
* @insn: Instruction containing ModRM byte and displacement
* @regs: Register values as seen when entering kernel mode
*
* This function is to be used with 16-bit address encodings. Obtain the memory
* address referred by the instruction's ModRM and displacement bytes. Also, the
* segment used as base is determined by either any segment override prefixes in
* @insn or the default segment of the registers involved in the address
* computation. In protected mode, segment limits are enforced.
*
* Returns:
*
* Linear address referenced by the instruction operands on success.
*
* -1L on error.
*/
static void __user *get_addr_ref_16(struct insn *insn, struct pt_regs *regs)
{
unsigned long linear_addr = -1L, seg_base, seg_limit;
int ret, regoff;
short eff_addr;
long tmp;
if (insn_get_displacement(insn))
goto out;
if (insn->addr_bytes != 2)
goto out;
if (X86_MODRM_MOD(insn->modrm.value) == 3) {
ret = get_eff_addr_reg(insn, regs, ®off, &tmp);
if (ret)
goto out;
eff_addr = tmp;
} else {
ret = get_eff_addr_modrm_16(insn, regs, ®off, &eff_addr);
if (ret)
goto out;
}
ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit);
if (ret)
goto out;
/*
* Before computing the linear address, make sure the effective address
* is within the limits of the segment. In virtual-8086 mode, segment
* limits are not enforced. In such a case, the segment limit is -1L to
* reflect this fact.
*/
if ((unsigned long)(eff_addr & 0xffff) > seg_limit)
goto out;
linear_addr = (unsigned long)(eff_addr & 0xffff) + seg_base;
/* Limit linear address to 20 bits */
if (v8086_mode(regs))
linear_addr &= 0xfffff;
out:
return (void __user *)linear_addr;
}
/**
* get_addr_ref_32() - Obtain a 32-bit linear address
* @insn: Instruction with ModRM, SIB bytes and displacement
* @regs: Register values as seen when entering kernel mode
*
* This function is to be used with 32-bit address encodings to obtain the
* linear memory address referred by the instruction's ModRM, SIB,
* displacement bytes and segment base address, as applicable. If in protected
* mode, segment limits are enforced.
*
* Returns:
*
* Linear address referenced by instruction and registers on success.
*
* -1L on error.
*/
static void __user *get_addr_ref_32(struct insn *insn, struct pt_regs *regs)
{
unsigned long linear_addr = -1L, seg_base, seg_limit;
int eff_addr, regoff;
long tmp;
int ret;
if (insn->addr_bytes != 4)
goto out;
if (X86_MODRM_MOD(insn->modrm.value) == 3) {
ret = get_eff_addr_reg(insn, regs, ®off, &tmp);
if (ret)
goto out;
eff_addr = tmp;
} else {
if (insn->sib.nbytes) {
ret = get_eff_addr_sib(insn, regs, ®off, &tmp);
if (ret)
goto out;
eff_addr = tmp;
} else {
ret = get_eff_addr_modrm(insn, regs, ®off, &tmp);
if (ret)
goto out;
eff_addr = tmp;
}
}
ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit);
if (ret)
goto out;
/*
* In protected mode, before computing the linear address, make sure
* the effective address is within the limits of the segment.
* 32-bit addresses can be used in long and virtual-8086 modes if an
* address override prefix is used. In such cases, segment limits are
* not enforced. When in virtual-8086 mode, the segment limit is -1L
* to reflect this situation.
*
* After computed, the effective address is treated as an unsigned
* quantity.
*/
if (!any_64bit_mode(regs) && ((unsigned int)eff_addr > seg_limit))
goto out;
/*
* Even though 32-bit address encodings are allowed in virtual-8086
* mode, the address range is still limited to [0x-0xffff].
*/
if (v8086_mode(regs) && (eff_addr & ~0xffff))
goto out;
/*
* Data type long could be 64 bits in size. Ensure that our 32-bit
* effective address is not sign-extended when computing the linear
* address.
*/
linear_addr = (unsigned long)(eff_addr & 0xffffffff) + seg_base;
/* Limit linear address to 20 bits */
if (v8086_mode(regs))
linear_addr &= 0xfffff;
out:
return (void __user *)linear_addr;
}
/**
* get_addr_ref_64() - Obtain a 64-bit linear address
* @insn: Instruction struct with ModRM and SIB bytes and displacement
* @regs: Structure with register values as seen when entering kernel mode
*
* This function is to be used with 64-bit address encodings to obtain the
* linear memory address referred by the instruction's ModRM, SIB,
* displacement bytes and segment base address, as applicable.
*
* Returns:
*
* Linear address referenced by instruction and registers on success.
*
* -1L on error.
*/
#ifndef CONFIG_X86_64
static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs)
{
return (void __user *)-1L;
}
#else
static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs)
{
unsigned long linear_addr = -1L, seg_base;
int regoff, ret;
long eff_addr;
if (insn->addr_bytes != 8)
goto out;
if (X86_MODRM_MOD(insn->modrm.value) == 3) {
ret = get_eff_addr_reg(insn, regs, ®off, &eff_addr);
if (ret)
goto out;
} else {
if (insn->sib.nbytes) {
ret = get_eff_addr_sib(insn, regs, ®off, &eff_addr);
if (ret)
goto out;
} else {
ret = get_eff_addr_modrm(insn, regs, ®off, &eff_addr);
if (ret)
goto out;
}
}
ret = get_seg_base_limit(insn, regs, regoff, &seg_base, NULL);
if (ret)
goto out;
linear_addr = (unsigned long)eff_addr + seg_base;
out:
return (void __user *)linear_addr;
}
#endif /* CONFIG_X86_64 */
/**
* insn_get_addr_ref() - Obtain the linear address referred by instruction
* @insn: Instruction structure containing ModRM byte and displacement
* @regs: Structure with register values as seen when entering kernel mode
*
* Obtain the linear address referred by the instruction's ModRM, SIB and
* displacement bytes, and segment base, as applicable. In protected mode,
* segment limits are enforced.
*
* Returns:
*
* Linear address referenced by instruction and registers on success.
*
* -1L on error.
*/
void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
{
if (!insn || !regs)
return (void __user *)-1L;
switch (insn->addr_bytes) {
case 2:
return get_addr_ref_16(insn, regs);
case 4:
return get_addr_ref_32(insn, regs);
case 8:
return get_addr_ref_64(insn, regs);
default:
return (void __user *)-1L;
}
}
int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip)
{
unsigned long seg_base = 0;
/*
* If not in user-space long mode, a custom code segment could be in
* use. This is true in protected mode (if the process defined a local
* descriptor table), or virtual-8086 mode. In most of the cases
* seg_base will be zero as in USER_CS.
*/
if (!user_64bit_mode(regs)) { seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
if (seg_base == -1L)
return -EINVAL;
}
*ip = seg_base + regs->ip;
return 0;
}
/**
* insn_fetch_from_user() - Copy instruction bytes from user-space memory
* @regs: Structure with register values as seen when entering kernel mode
* @buf: Array to store the fetched instruction
*
* Gets the linear address of the instruction and copies the instruction bytes
* to the buf.
*
* Returns:
*
* - number of instruction bytes copied.
* - 0 if nothing was copied.
* - -EINVAL if the linear address of the instruction could not be calculated
*/
int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
{
unsigned long ip;
int not_copied;
if (insn_get_effective_ip(regs, &ip))
return -EINVAL;
not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE);
return MAX_INSN_SIZE - not_copied;
}
/**
* insn_fetch_from_user_inatomic() - Copy instruction bytes from user-space memory
* while in atomic code
* @regs: Structure with register values as seen when entering kernel mode
* @buf: Array to store the fetched instruction
*
* Gets the linear address of the instruction and copies the instruction bytes
* to the buf. This function must be used in atomic context.
*
* Returns:
*
* - number of instruction bytes copied.
* - 0 if nothing was copied.
* - -EINVAL if the linear address of the instruction could not be calculated.
*/
int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
{
unsigned long ip;
int not_copied;
if (insn_get_effective_ip(regs, &ip))
return -EINVAL;
not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE);
return MAX_INSN_SIZE - not_copied;
}
/**
* insn_decode_from_regs() - Decode an instruction
* @insn: Structure to store decoded instruction
* @regs: Structure with register values as seen when entering kernel mode
* @buf: Buffer containing the instruction bytes
* @buf_size: Number of instruction bytes available in buf
*
* Decodes the instruction provided in buf and stores the decoding results in
* insn. Also determines the correct address and operand sizes.
*
* Returns:
*
* True if instruction was decoded, False otherwise.
*/
bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE], int buf_size)
{
int seg_defs;
insn_init(insn, buf, buf_size, user_64bit_mode(regs));
/*
* Override the default operand and address sizes with what is specified
* in the code segment descriptor. The instruction decoder only sets
* the address size it to either 4 or 8 address bytes and does nothing
* for the operand bytes. This OK for most of the cases, but we could
* have special cases where, for instance, a 16-bit code segment
* descriptor is used.
* If there is an address override prefix, the instruction decoder
* correctly updates these values, even for 16-bit defaults.
*/
seg_defs = insn_get_code_seg_params(regs);
if (seg_defs == -EINVAL)
return false;
insn->addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
insn->opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
if (insn_get_length(insn))
return false;
if (buf_size < insn->length)
return false;
return true;
}
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2014 Facebook. All rights reserved.
*/
#ifndef BTRFS_QGROUP_H
#define BTRFS_QGROUP_H
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/kobject.h>
#include "ulist.h"
#include "delayed-ref.h"
/*
* Btrfs qgroup overview
*
* Btrfs qgroup splits into 3 main part:
* 1) Reserve
* Reserve metadata/data space for incoming operations
* Affect how qgroup limit works
*
* 2) Trace
* Tell btrfs qgroup to trace dirty extents.
*
* Dirty extents including:
* - Newly allocated extents
* - Extents going to be deleted (in this trans)
* - Extents whose owner is going to be modified
*
* This is the main part affects whether qgroup numbers will stay
* consistent.
* Btrfs qgroup can trace clean extents and won't cause any problem,
* but it will consume extra CPU time, it should be avoided if possible.
*
* 3) Account
* Btrfs qgroup will updates its numbers, based on dirty extents traced
* in previous step.
*
* Normally at qgroup rescan and transaction commit time.
*/
/*
* Special performance optimization for balance.
*
* For balance, we need to swap subtree of subvolume and reloc trees.
* In theory, we need to trace all subtree blocks of both subvolume and reloc
* trees, since their owner has changed during such swap.
*
* However since balance has ensured that both subtrees are containing the
* same contents and have the same tree structures, such swap won't cause
* qgroup number change.
*
* But there is a race window between subtree swap and transaction commit,
* during that window, if we increase/decrease tree level or merge/split tree
* blocks, we still need to trace the original subtrees.
*
* So for balance, we use a delayed subtree tracing, whose workflow is:
*
* 1) Record the subtree root block get swapped.
*
* During subtree swap:
* O = Old tree blocks
* N = New tree blocks
* reloc tree subvolume tree X
* Root Root
* / \ / \
* NA OB OA OB
* / | | \ / | | \
* NC ND OE OF OC OD OE OF
*
* In this case, NA and OA are going to be swapped, record (NA, OA) into
* subvolume tree X.
*
* 2) After subtree swap.
* reloc tree subvolume tree X
* Root Root
* / \ / \
* OA OB NA OB
* / | | \ / | | \
* OC OD OE OF NC ND OE OF
*
* 3a) COW happens for OB
* If we are going to COW tree block OB, we check OB's bytenr against
* tree X's swapped_blocks structure.
* If it doesn't fit any, nothing will happen.
*
* 3b) COW happens for NA
* Check NA's bytenr against tree X's swapped_blocks, and get a hit.
* Then we do subtree scan on both subtrees OA and NA.
* Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
*
* Then no matter what we do to subvolume tree X, qgroup numbers will
* still be correct.
* Then NA's record gets removed from X's swapped_blocks.
*
* 4) Transaction commit
* Any record in X's swapped_blocks gets removed, since there is no
* modification to the swapped subtrees, no need to trigger heavy qgroup
* subtree rescan for them.
*/
/*
* Record a dirty extent, and info qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
*/
struct btrfs_qgroup_extent_record {
struct rb_node node;
u64 bytenr;
u64 num_bytes;
/*
* For qgroup reserved data space freeing.
*
* @data_rsv_refroot and @data_rsv will be recorded after
* BTRFS_ADD_DELAYED_EXTENT is called.
* And will be used to free reserved qgroup space at
* transaction commit time.
*/
u32 data_rsv; /* reserved data space needs to be freed */
u64 data_rsv_refroot; /* which root the reserved data belongs to */
struct ulist *old_roots;
};
struct btrfs_qgroup_swapped_block {
struct rb_node node;
int level;
bool trace_leaf;
/* bytenr/generation of the tree block in subvolume tree after swap */
u64 subvol_bytenr;
u64 subvol_generation;
/* bytenr/generation of the tree block in reloc tree after swap */
u64 reloc_bytenr;
u64 reloc_generation;
u64 last_snapshot;
struct btrfs_key first_key;
};
/*
* Qgroup reservation types:
*
* DATA:
* space reserved for data
*
* META_PERTRANS:
* Space reserved for metadata (per-transaction)
* Due to the fact that qgroup data is only updated at transaction commit
* time, reserved space for metadata must be kept until transaction
* commits.
* Any metadata reserved that are used in btrfs_start_transaction() should
* be of this type.
*
* META_PREALLOC:
* There are cases where metadata space is reserved before starting
* transaction, and then btrfs_join_transaction() to get a trans handle.
* Any metadata reserved for such usage should be of this type.
* And after join_transaction() part (or all) of such reservation should
* be converted into META_PERTRANS.
*/
enum btrfs_qgroup_rsv_type {
BTRFS_QGROUP_RSV_DATA,
BTRFS_QGROUP_RSV_META_PERTRANS,
BTRFS_QGROUP_RSV_META_PREALLOC,
BTRFS_QGROUP_RSV_LAST,
};
/*
* Represents how many bytes we have reserved for this qgroup.
*
* Each type should have different reservation behavior.
* E.g, data follows its io_tree flag modification, while
* *currently* meta is just reserve-and-clear during transaction.
*
* TODO: Add new type for reservation which can survive transaction commit.
* Current metadata reservation behavior is not suitable for such case.
*/
struct btrfs_qgroup_rsv {
u64 values[BTRFS_QGROUP_RSV_LAST];
};
/*
* one struct for each qgroup, organized in fs_info->qgroup_tree.
*/
struct btrfs_qgroup {
u64 qgroupid;
/*
* state
*/
u64 rfer; /* referenced */
u64 rfer_cmpr; /* referenced compressed */
u64 excl; /* exclusive */
u64 excl_cmpr; /* exclusive compressed */
/*
* limits
*/
u64 lim_flags; /* which limits are set */
u64 max_rfer;
u64 max_excl;
u64 rsv_rfer;
u64 rsv_excl;
/*
* reservation tracking
*/
struct btrfs_qgroup_rsv rsv;
/*
* lists
*/
struct list_head groups; /* groups this group is member of */
struct list_head members; /* groups that are members of this group */
struct list_head dirty; /* dirty groups */
struct rb_node node; /* tree of qgroups */
/*
* temp variables for accounting operations
* Refer to qgroup_shared_accounting() for details.
*/
u64 old_refcnt;
u64 new_refcnt;
/*
* Sysfs kobjectid
*/
struct kobject kobj;
};
static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
{
return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
}
/*
* For qgroup event trace points only
*/
#define QGROUP_RESERVE (1<<0)
#define QGROUP_RELEASE (1<<1)
#define QGROUP_FREE (1<<2)
int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
bool interruptible);
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst);
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit);
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
/*
* Inform qgroup to trace one dirty extent, its info is recorded in @record.
* So qgroup can account it at transaction committing time.
*
* No lock version, caller must acquire delayed ref lock and allocated memory,
* then call btrfs_qgroup_trace_extent_post() after exiting lock context.
*
* Return 0 for success insert
* Return >0 for existing record, caller can free @record safely.
* Error is not possible
*/
int btrfs_qgroup_trace_extent_nolock(
struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record);
/*
* Post handler after qgroup_trace_extent_nolock().
*
* NOTE: Current qgroup does the expensive backref walk at transaction
* committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
* new transaction.
* This is designed to allow btrfs_find_all_roots() to get correct new_roots
* result.
*
* However for old_roots there is no need to do backref walk at that time,
* since we search commit roots to walk backref and result will always be
* correct.
*
* Due to the nature of no lock version, we can't do backref there.
* So we must call btrfs_qgroup_trace_extent_post() after exiting
* spinlock context.
*
* TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
* using current root, then we can move all expensive backref walk out of
* transaction committing, but not now as qgroup accounting will be wrong again.
*/
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord);
/*
* Inform qgroup to trace one dirty extent, specified by @bytenr and
* @num_bytes.
* So qgroup can account it at commit trans time.
*
* Better encapsulated version, with memory allocation and backref walk for
* commit roots.
* So this can sleep.
*
* Return 0 if the operation is done.
* Return <0 for error, like memory allocation failure or invalid parameter
* (NULL trans)
*/
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, gfp_t gfp_flag);
/*
* Inform qgroup to trace all leaf items of data
*
* Return 0 for success
* Return <0 for error(ENOMEM)
*/
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
struct extent_buffer *eb);
/*
* Inform qgroup to trace a whole subtree, including all its child tree
* blocks and data.
* The root tree block is specified by @root_eb.
*
* Normally used by relocation(tree block swap) and subvolume deletion.
*
* Return 0 for success
* Return <0 for error(ENOMEM or tree search error)
*/
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, struct ulist *old_roots,
struct ulist *new_roots);
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
u64 objectid, struct btrfs_qgroup_inherit *inherit);
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
u64 rfer, u64 excl);
#endif
/* New io_tree based accurate qgroup reserve API */
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len);
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start,
u64 len);
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
/* Reserve metadata space for pertrans and prealloc type */
static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
int num_bytes, bool enforce)
{
return __btrfs_qgroup_reserve_meta(root, num_bytes,
BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
}
static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
int num_bytes, bool enforce)
{
return __btrfs_qgroup_reserve_meta(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
}
void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type);
/* Free per-transaction meta reservation for error handling */
static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root,
int num_bytes)
{
__btrfs_qgroup_free_meta(root, num_bytes,
BTRFS_QGROUP_RSV_META_PERTRANS);
}
/* Pre-allocated meta reservation can be freed at need */
static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
int num_bytes)
{
__btrfs_qgroup_free_meta(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
}
/*
* Per-transaction meta reservation should be all freed at transaction commit
* time
*/
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
/*
* Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
*
* This is called when preallocated meta reservation needs to be used.
* Normally after btrfs_join_transaction() call.
*/
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
/* btrfs_qgroup_swapped_blocks related functions */
void btrfs_qgroup_init_swapped_blocks(
struct btrfs_qgroup_swapped_blocks *swapped_blocks);
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *subvol_root,
struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
u64 last_snapshot);
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
#endif
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/irqflags.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/bug.h>
#include "printk_ringbuffer.h"
/**
* DOC: printk_ringbuffer overview
*
* Data Structure
* --------------
* The printk_ringbuffer is made up of 3 internal ringbuffers:
*
* desc_ring
* A ring of descriptors and their meta data (such as sequence number,
* timestamp, loglevel, etc.) as well as internal state information about
* the record and logical positions specifying where in the other
* ringbuffer the text strings are located.
*
* text_data_ring
* A ring of data blocks. A data block consists of an unsigned long
* integer (ID) that maps to a desc_ring index followed by the text
* string of the record.
*
* The internal state information of a descriptor is the key element to allow
* readers and writers to locklessly synchronize access to the data.
*
* Implementation
* --------------
*
* Descriptor Ring
* ~~~~~~~~~~~~~~~
* The descriptor ring is an array of descriptors. A descriptor contains
* essential meta data to track the data of a printk record using
* blk_lpos structs pointing to associated text data blocks (see
* "Data Rings" below). Each descriptor is assigned an ID that maps
* directly to index values of the descriptor array and has a state. The ID
* and the state are bitwise combined into a single descriptor field named
* @state_var, allowing ID and state to be synchronously and atomically
* updated.
*
* Descriptors have four states:
*
* reserved
* A writer is modifying the record.
*
* committed
* The record and all its data are written. A writer can reopen the
* descriptor (transitioning it back to reserved), but in the committed
* state the data is consistent.
*
* finalized
* The record and all its data are complete and available for reading. A
* writer cannot reopen the descriptor.
*
* reusable
* The record exists, but its text and/or meta data may no longer be
* available.
*
* Querying the @state_var of a record requires providing the ID of the
* descriptor to query. This can yield a possible fifth (pseudo) state:
*
* miss
* The descriptor being queried has an unexpected ID.
*
* The descriptor ring has a @tail_id that contains the ID of the oldest
* descriptor and @head_id that contains the ID of the newest descriptor.
*
* When a new descriptor should be created (and the ring is full), the tail
* descriptor is invalidated by first transitioning to the reusable state and
* then invalidating all tail data blocks up to and including the data blocks
* associated with the tail descriptor (for the text ring). Then
* @tail_id is advanced, followed by advancing @head_id. And finally the
* @state_var of the new descriptor is initialized to the new ID and reserved
* state.
*
* The @tail_id can only be advanced if the new @tail_id would be in the
* committed or reusable queried state. This makes it possible that a valid
* sequence number of the tail is always available.
*
* Descriptor Finalization
* ~~~~~~~~~~~~~~~~~~~~~~~
* When a writer calls the commit function prb_commit(), record data is
* fully stored and is consistent within the ringbuffer. However, a writer can
* reopen that record, claiming exclusive access (as with prb_reserve()), and
* modify that record. When finished, the writer must again commit the record.
*
* In order for a record to be made available to readers (and also become
* recyclable for writers), it must be finalized. A finalized record cannot be
* reopened and can never become "unfinalized". Record finalization can occur
* in three different scenarios:
*
* 1) A writer can simultaneously commit and finalize its record by calling
* prb_final_commit() instead of prb_commit().
*
* 2) When a new record is reserved and the previous record has been
* committed via prb_commit(), that previous record is automatically
* finalized.
*
* 3) When a record is committed via prb_commit() and a newer record
* already exists, the record being committed is automatically finalized.
*
* Data Ring
* ~~~~~~~~~
* The text data ring is a byte array composed of data blocks. Data blocks are
* referenced by blk_lpos structs that point to the logical position of the
* beginning of a data block and the beginning of the next adjacent data
* block. Logical positions are mapped directly to index values of the byte
* array ringbuffer.
*
* Each data block consists of an ID followed by the writer data. The ID is
* the identifier of a descriptor that is associated with the data block. A
* given data block is considered valid if all of the following conditions
* are met:
*
* 1) The descriptor associated with the data block is in the committed
* or finalized queried state.
*
* 2) The blk_lpos struct within the descriptor associated with the data
* block references back to the same data block.
*
* 3) The data block is within the head/tail logical position range.
*
* If the writer data of a data block would extend beyond the end of the
* byte array, only the ID of the data block is stored at the logical
* position and the full data block (ID and writer data) is stored at the
* beginning of the byte array. The referencing blk_lpos will point to the
* ID before the wrap and the next data block will be at the logical
* position adjacent the full data block after the wrap.
*
* Data rings have a @tail_lpos that points to the beginning of the oldest
* data block and a @head_lpos that points to the logical position of the
* next (not yet existing) data block.
*
* When a new data block should be created (and the ring is full), tail data
* blocks will first be invalidated by putting their associated descriptors
* into the reusable state and then pushing the @tail_lpos forward beyond
* them. Then the @head_lpos is pushed forward and is associated with a new
* descriptor. If a data block is not valid, the @tail_lpos cannot be
* advanced beyond it.
*
* Info Array
* ~~~~~~~~~~
* The general meta data of printk records are stored in printk_info structs,
* stored in an array with the same number of elements as the descriptor ring.
* Each info corresponds to the descriptor of the same index in the
* descriptor ring. Info validity is confirmed by evaluating the corresponding
* descriptor before and after loading the info.
*
* Usage
* -----
* Here are some simple examples demonstrating writers and readers. For the
* examples a global ringbuffer (test_rb) is available (which is not the
* actual ringbuffer used by printk)::
*
* DEFINE_PRINTKRB(test_rb, 15, 5);
*
* This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of
* 1 MiB (2 ^ (15 + 5)) for text data.
*
* Sample writer code::
*
* const char *textstr = "message text";
* struct prb_reserved_entry e;
* struct printk_record r;
*
* // specify how much to allocate
* prb_rec_init_wr(&r, strlen(textstr) + 1);
*
* if (prb_reserve(&e, &test_rb, &r)) {
* snprintf(r.text_buf, r.text_buf_size, "%s", textstr);
*
* r.info->text_len = strlen(textstr);
* r.info->ts_nsec = local_clock();
* r.info->caller_id = printk_caller_id();
*
* // commit and finalize the record
* prb_final_commit(&e);
* }
*
* Note that additional writer functions are available to extend a record
* after it has been committed but not yet finalized. This can be done as
* long as no new records have been reserved and the caller is the same.
*
* Sample writer code (record extending)::
*
* // alternate rest of previous example
*
* r.info->text_len = strlen(textstr);
* r.info->ts_nsec = local_clock();
* r.info->caller_id = printk_caller_id();
*
* // commit the record (but do not finalize yet)
* prb_commit(&e);
* }
*
* ...
*
* // specify additional 5 bytes text space to extend
* prb_rec_init_wr(&r, 5);
*
* // try to extend, but only if it does not exceed 32 bytes
* if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id()), 32) {
* snprintf(&r.text_buf[r.info->text_len],
* r.text_buf_size - r.info->text_len, "hello");
*
* r.info->text_len += 5;
*
* // commit and finalize the record
* prb_final_commit(&e);
* }
*
* Sample reader code::
*
* struct printk_info info;
* struct printk_record r;
* char text_buf[32];
* u64 seq;
*
* prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf));
*
* prb_for_each_record(0, &test_rb, &seq, &r) {
* if (info.seq != seq)
* pr_warn("lost %llu records\n", info.seq - seq);
*
* if (info.text_len > r.text_buf_size) {
* pr_warn("record %llu text truncated\n", info.seq);
* text_buf[r.text_buf_size - 1] = 0;
* }
*
* pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec,
* &text_buf[0]);
* }
*
* Note that additional less convenient reader functions are available to
* allow complex record access.
*
* ABA Issues
* ~~~~~~~~~~
* To help avoid ABA issues, descriptors are referenced by IDs (array index
* values combined with tagged bits counting array wraps) and data blocks are
* referenced by logical positions (array index values combined with tagged
* bits counting array wraps). However, on 32-bit systems the number of
* tagged bits is relatively small such that an ABA incident is (at least
* theoretically) possible. For example, if 4 million maximally sized (1KiB)
* printk messages were to occur in NMI context on a 32-bit system, the
* interrupted context would not be able to recognize that the 32-bit integer
* completely wrapped and thus represents a different data block than the one
* the interrupted context expects.
*
* To help combat this possibility, additional state checking is performed
* (such as using cmpxchg() even though set() would suffice). These extra
* checks are commented as such and will hopefully catch any ABA issue that
* a 32-bit system might experience.
*
* Memory Barriers
* ~~~~~~~~~~~~~~~
* Multiple memory barriers are used. To simplify proving correctness and
* generating litmus tests, lines of code related to memory barriers
* (loads, stores, and the associated memory barriers) are labeled::
*
* LMM(function:letter)
*
* Comments reference the labels using only the "function:letter" part.
*
* The memory barrier pairs and their ordering are:
*
* desc_reserve:D / desc_reserve:B
* push descriptor tail (id), then push descriptor head (id)
*
* desc_reserve:D / data_push_tail:B
* push data tail (lpos), then set new descriptor reserved (state)
*
* desc_reserve:D / desc_push_tail:C
* push descriptor tail (id), then set new descriptor reserved (state)
*
* desc_reserve:D / prb_first_seq:C
* push descriptor tail (id), then set new descriptor reserved (state)
*
* desc_reserve:F / desc_read:D
* set new descriptor id and reserved (state), then allow writer changes
*
* data_alloc:A (or data_realloc:A) / desc_read:D
* set old descriptor reusable (state), then modify new data block area
*
* data_alloc:A (or data_realloc:A) / data_push_tail:B
* push data tail (lpos), then modify new data block area
*
* _prb_commit:B / desc_read:B
* store writer changes, then set new descriptor committed (state)
*
* desc_reopen_last:A / _prb_commit:B
* set descriptor reserved (state), then read descriptor data
*
* _prb_commit:B / desc_reserve:D
* set new descriptor committed (state), then check descriptor head (id)
*
* data_push_tail:D / data_push_tail:A
* set descriptor reusable (state), then push data tail (lpos)
*
* desc_push_tail:B / desc_reserve:D
* set descriptor reusable (state), then push descriptor tail (id)
*/
#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits)
#define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1)
#define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits)
#define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1)
/* Determine the data array index from a logical position. */
#define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring))
/* Determine the desc array index from an ID or sequence number. */
#define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring))
/* Determine how many times the data array has wrapped. */
#define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits)
/* Determine if a logical position refers to a data-less block. */
#define LPOS_DATALESS(lpos) ((lpos) & 1UL)
#define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \
LPOS_DATALESS((blk)->next))
/* Get the logical position at index 0 of the current wrap. */
#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \
((lpos) & ~DATA_SIZE_MASK(data_ring))
/* Get the ID for the same index of the previous wrap as the given ID. */
#define DESC_ID_PREV_WRAP(desc_ring, id) \
DESC_ID((id) - DESCS_COUNT(desc_ring))
/*
* A data block: mapped directly to the beginning of the data block area
* specified as a logical position within the data ring.
*
* @id: the ID of the associated descriptor
* @data: the writer data
*
* Note that the size of a data block is only known by its associated
* descriptor.
*/
struct prb_data_block {
unsigned long id;
char data[];
};
/*
* Return the descriptor associated with @n. @n can be either a
* descriptor ID or a sequence number.
*/
static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n)
{
return &desc_ring->descs[DESC_INDEX(desc_ring, n)];
}
/*
* Return the printk_info associated with @n. @n can be either a
* descriptor ID or a sequence number.
*/
static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n)
{
return &desc_ring->infos[DESC_INDEX(desc_ring, n)];
}
static struct prb_data_block *to_block(struct prb_data_ring *data_ring,
unsigned long begin_lpos)
{
return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)];
}
/*
* Increase the data size to account for data block meta data plus any
* padding so that the adjacent data block is aligned on the ID size.
*/
static unsigned int to_blk_size(unsigned int size)
{
struct prb_data_block *db = NULL;
size += sizeof(*db);
size = ALIGN(size, sizeof(db->id));
return size;
}
/*
* Sanity checker for reserve size. The ringbuffer code assumes that a data
* block does not exceed the maximum possible size that could fit within the
* ringbuffer. This function provides that basic size check so that the
* assumption is safe.
*/
static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size)
{
struct prb_data_block *db = NULL;
if (size == 0)
return true;
/*
* Ensure the alignment padded size could possibly fit in the data
* array. The largest possible data block must still leave room for
* at least the ID of the next block.
*/
size = to_blk_size(size);
if (size > DATA_SIZE(data_ring) - sizeof(db->id))
return false;
return true;
}
/* Query the state of a descriptor. */
static enum desc_state get_desc_state(unsigned long id,
unsigned long state_val)
{
if (id != DESC_ID(state_val))
return desc_miss;
return DESC_STATE(state_val);
}
/*
* Get a copy of a specified descriptor and return its queried state. If the
* descriptor is in an inconsistent state (miss or reserved), the caller can
* only expect the descriptor's @state_var field to be valid.
*
* The sequence number and caller_id can be optionally retrieved. Like all
* non-state_var data, they are only valid if the descriptor is in a
* consistent state.
*/
static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
unsigned long id, struct prb_desc *desc_out,
u64 *seq_out, u32 *caller_id_out)
{
struct printk_info *info = to_info(desc_ring, id);
struct prb_desc *desc = to_desc(desc_ring, id);
atomic_long_t *state_var = &desc->state_var;
enum desc_state d_state;
unsigned long state_val;
/* Check the descriptor state. */
state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */
d_state = get_desc_state(id, state_val);
if (d_state == desc_miss || d_state == desc_reserved) {
/*
* The descriptor is in an inconsistent state. Set at least
* @state_var so that the caller can see the details of
* the inconsistent state.
*/
goto out;
}
/*
* Guarantee the state is loaded before copying the descriptor
* content. This avoids copying obsolete descriptor content that might
* not apply to the descriptor state. This pairs with _prb_commit:B.
*
* Memory barrier involvement:
*
* If desc_read:A reads from _prb_commit:B, then desc_read:C reads
* from _prb_commit:A.
*
* Relies on:
*
* WMB from _prb_commit:A to _prb_commit:B
* matching
* RMB from desc_read:A to desc_read:C
*/
smp_rmb(); /* LMM(desc_read:B) */
/*
* Copy the descriptor data. The data is not valid until the
* state has been re-checked. A memcpy() for all of @desc
* cannot be used because of the atomic_t @state_var field.
*/
memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
if (seq_out)
*seq_out = info->seq; /* also part of desc_read:C */ if (caller_id_out) *caller_id_out = info->caller_id; /* also part of desc_read:C */
/*
* 1. Guarantee the descriptor content is loaded before re-checking
* the state. This avoids reading an obsolete descriptor state
* that may not apply to the copied content. This pairs with
* desc_reserve:F.
*
* Memory barrier involvement:
*
* If desc_read:C reads from desc_reserve:G, then desc_read:E
* reads from desc_reserve:F.
*
* Relies on:
*
* WMB from desc_reserve:F to desc_reserve:G
* matching
* RMB from desc_read:C to desc_read:E
*
* 2. Guarantee the record data is loaded before re-checking the
* state. This avoids reading an obsolete descriptor state that may
* not apply to the copied data. This pairs with data_alloc:A and
* data_realloc:A.
*
* Memory barrier involvement:
*
* If copy_data:A reads from data_alloc:B, then desc_read:E
* reads from desc_make_reusable:A.
*
* Relies on:
*
* MB from desc_make_reusable:A to data_alloc:B
* matching
* RMB from desc_read:C to desc_read:E
*
* Note: desc_make_reusable:A and data_alloc:B can be different
* CPUs. However, the data_alloc:B CPU (which performs the
* full memory barrier) must have previously seen
* desc_make_reusable:A.
*/
smp_rmb(); /* LMM(desc_read:D) */
/*
* The data has been copied. Return the current descriptor state,
* which may have changed since the load above.
*/
state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
d_state = get_desc_state(id, state_val);
out:
atomic_long_set(&desc_out->state_var, state_val);
return d_state;
}
/*
* Take a specified descriptor out of the finalized state by attempting
* the transition from finalized to reusable. Either this context or some
* other context will have been successful.
*/
static void desc_make_reusable(struct prb_desc_ring *desc_ring,
unsigned long id)
{
unsigned long val_finalized = DESC_SV(id, desc_finalized);
unsigned long val_reusable = DESC_SV(id, desc_reusable);
struct prb_desc *desc = to_desc(desc_ring, id);
atomic_long_t *state_var = &desc->state_var;
atomic_long_cmpxchg_relaxed(state_var, val_finalized,
val_reusable); /* LMM(desc_make_reusable:A) */
}
/*
* Given the text data ring, put the associated descriptor of each
* data block from @lpos_begin until @lpos_end into the reusable state.
*
* If there is any problem making the associated descriptor reusable, either
* the descriptor has not yet been finalized or another writer context has
* already pushed the tail lpos past the problematic data block. Regardless,
* on error the caller can re-load the tail lpos to determine the situation.
*/
static bool data_make_reusable(struct printk_ringbuffer *rb,
unsigned long lpos_begin,
unsigned long lpos_end,
unsigned long *lpos_out)
{
struct prb_data_ring *data_ring = &rb->text_data_ring;
struct prb_desc_ring *desc_ring = &rb->desc_ring;
struct prb_data_block *blk;
enum desc_state d_state;
struct prb_desc desc;
struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos;
unsigned long id;
/* Loop until @lpos_begin has advanced to or beyond @lpos_end. */
while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { blk = to_block(data_ring, lpos_begin);
/*
* Load the block ID from the data block. This is a data race
* against a writer that may have newly reserved this data
* area. If the loaded value matches a valid descriptor ID,
* the blk_lpos of that descriptor will be checked to make
* sure it points back to this data block. If the check fails,
* the data area has been recycled by another writer.
*/
id = blk->id; /* LMM(data_make_reusable:A) */
d_state = desc_read(desc_ring, id, &desc,
NULL, NULL); /* LMM(data_make_reusable:B) */
switch (d_state) {
case desc_miss:
case desc_reserved:
case desc_committed:
return false;
case desc_finalized:
/*
* This data block is invalid if the descriptor
* does not point back to it.
*/
if (blk_lpos->begin != lpos_begin)
return false;
desc_make_reusable(desc_ring, id);
break;
case desc_reusable:
/*
* This data block is invalid if the descriptor
* does not point back to it.
*/
if (blk_lpos->begin != lpos_begin)
return false;
break;
}
/* Advance @lpos_begin to the next data block. */
lpos_begin = blk_lpos->next;
}
*lpos_out = lpos_begin;
return true;
}
/*
* Advance the data ring tail to at least @lpos. This function puts
* descriptors into the reusable state if the tail is pushed beyond
* their associated data block.
*/
static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos)
{
struct prb_data_ring *data_ring = &rb->text_data_ring;
unsigned long tail_lpos_new;
unsigned long tail_lpos;
unsigned long next_lpos;
/* If @lpos is from a data-less block, there is nothing to do. */
if (LPOS_DATALESS(lpos))
return true;
/*
* Any descriptor states that have transitioned to reusable due to the
* data tail being pushed to this loaded value will be visible to this
* CPU. This pairs with data_push_tail:D.
*
* Memory barrier involvement:
*
* If data_push_tail:A reads from data_push_tail:D, then this CPU can
* see desc_make_reusable:A.
*
* Relies on:
*
* MB from desc_make_reusable:A to data_push_tail:D
* matches
* READFROM from data_push_tail:D to data_push_tail:A
* thus
* READFROM from desc_make_reusable:A to this CPU
*/
tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */
/*
* Loop until the tail lpos is at or beyond @lpos. This condition
* may already be satisfied, resulting in no full memory barrier
* from data_push_tail:D being performed. However, since this CPU
* sees the new tail lpos, any descriptor states that transitioned to
* the reusable state must already be visible.
*/
while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) {
/*
* Make all descriptors reusable that are associated with
* data blocks before @lpos.
*/
if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) {
/*
* 1. Guarantee the block ID loaded in
* data_make_reusable() is performed before
* reloading the tail lpos. The failed
* data_make_reusable() may be due to a newly
* recycled data area causing the tail lpos to
* have been previously pushed. This pairs with
* data_alloc:A and data_realloc:A.
*
* Memory barrier involvement:
*
* If data_make_reusable:A reads from data_alloc:B,
* then data_push_tail:C reads from
* data_push_tail:D.
*
* Relies on:
*
* MB from data_push_tail:D to data_alloc:B
* matching
* RMB from data_make_reusable:A to
* data_push_tail:C
*
* Note: data_push_tail:D and data_alloc:B can be
* different CPUs. However, the data_alloc:B
* CPU (which performs the full memory
* barrier) must have previously seen
* data_push_tail:D.
*
* 2. Guarantee the descriptor state loaded in
* data_make_reusable() is performed before
* reloading the tail lpos. The failed
* data_make_reusable() may be due to a newly
* recycled descriptor causing the tail lpos to
* have been previously pushed. This pairs with
* desc_reserve:D.
*
* Memory barrier involvement:
*
* If data_make_reusable:B reads from
* desc_reserve:F, then data_push_tail:C reads
* from data_push_tail:D.
*
* Relies on:
*
* MB from data_push_tail:D to desc_reserve:F
* matching
* RMB from data_make_reusable:B to
* data_push_tail:C
*
* Note: data_push_tail:D and desc_reserve:F can
* be different CPUs. However, the
* desc_reserve:F CPU (which performs the
* full memory barrier) must have previously
* seen data_push_tail:D.
*/
smp_rmb(); /* LMM(data_push_tail:B) */
tail_lpos_new = atomic_long_read(&data_ring->tail_lpos
); /* LMM(data_push_tail:C) */
if (tail_lpos_new == tail_lpos)
return false;
/* Another CPU pushed the tail. Try again. */
tail_lpos = tail_lpos_new;
continue;
}
/*
* Guarantee any descriptor states that have transitioned to
* reusable are stored before pushing the tail lpos. A full
* memory barrier is needed since other CPUs may have made
* the descriptor states reusable. This pairs with
* data_push_tail:A.
*/
if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos,
next_lpos)) { /* LMM(data_push_tail:D) */
break;
}
}
return true;
}
/*
* Advance the desc ring tail. This function advances the tail by one
* descriptor, thus invalidating the oldest descriptor. Before advancing
* the tail, the tail descriptor is made reusable and all data blocks up to
* and including the descriptor's data block are invalidated (i.e. the data
* ring tail is pushed past the data block of the descriptor being made
* reusable).
*/
static bool desc_push_tail(struct printk_ringbuffer *rb,
unsigned long tail_id)
{
struct prb_desc_ring *desc_ring = &rb->desc_ring;
enum desc_state d_state;
struct prb_desc desc;
d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL);
switch (d_state) {
case desc_miss:
/*
* If the ID is exactly 1 wrap behind the expected, it is
* in the process of being reserved by another writer and
* must be considered reserved.
*/
if (DESC_ID(atomic_long_read(&desc.state_var)) ==
DESC_ID_PREV_WRAP(desc_ring, tail_id)) {
return false;
}
/*
* The ID has changed. Another writer must have pushed the
* tail and recycled the descriptor already. Success is
* returned because the caller is only interested in the
* specified tail being pushed, which it was.
*/
return true;
case desc_reserved:
case desc_committed:
return false;
case desc_finalized:
desc_make_reusable(desc_ring, tail_id);
break;
case desc_reusable:
break;
}
/*
* Data blocks must be invalidated before their associated
* descriptor can be made available for recycling. Invalidating
* them later is not possible because there is no way to trust
* data blocks once their associated descriptor is gone.
*/
if (!data_push_tail(rb, desc.text_blk_lpos.next))
return false;
/*
* Check the next descriptor after @tail_id before pushing the tail
* to it because the tail must always be in a finalized or reusable
* state. The implementation of prb_first_seq() relies on this.
*
* A successful read implies that the next descriptor is less than or
* equal to @head_id so there is no risk of pushing the tail past the
* head.
*/
d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc,
NULL, NULL); /* LMM(desc_push_tail:A) */
if (d_state == desc_finalized || d_state == desc_reusable) {
/*
* Guarantee any descriptor states that have transitioned to
* reusable are stored before pushing the tail ID. This allows
* verifying the recycled descriptor state. A full memory
* barrier is needed since other CPUs may have made the
* descriptor states reusable. This pairs with desc_reserve:D.
*/
atomic_long_cmpxchg(&desc_ring->tail_id, tail_id,
DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */
} else {
/*
* Guarantee the last state load from desc_read() is before
* reloading @tail_id in order to see a new tail ID in the
* case that the descriptor has been recycled. This pairs
* with desc_reserve:D.
*
* Memory barrier involvement:
*
* If desc_push_tail:A reads from desc_reserve:F, then
* desc_push_tail:D reads from desc_push_tail:B.
*
* Relies on:
*
* MB from desc_push_tail:B to desc_reserve:F
* matching
* RMB from desc_push_tail:A to desc_push_tail:D
*
* Note: desc_push_tail:B and desc_reserve:F can be different
* CPUs. However, the desc_reserve:F CPU (which performs
* the full memory barrier) must have previously seen
* desc_push_tail:B.
*/
smp_rmb(); /* LMM(desc_push_tail:C) */
/*
* Re-check the tail ID. The descriptor following @tail_id is
* not in an allowed tail state. But if the tail has since
* been moved by another CPU, then it does not matter.
*/
if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */
return false;
}
return true;
}
/* Reserve a new descriptor, invalidating the oldest if necessary. */
static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out)
{
struct prb_desc_ring *desc_ring = &rb->desc_ring;
unsigned long prev_state_val;
unsigned long id_prev_wrap;
struct prb_desc *desc;
unsigned long head_id;
unsigned long id;
head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */
do {
id = DESC_ID(head_id + 1);
id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id);
/*
* Guarantee the head ID is read before reading the tail ID.
* Since the tail ID is updated before the head ID, this
* guarantees that @id_prev_wrap is never ahead of the tail
* ID. This pairs with desc_reserve:D.
*
* Memory barrier involvement:
*
* If desc_reserve:A reads from desc_reserve:D, then
* desc_reserve:C reads from desc_push_tail:B.
*
* Relies on:
*
* MB from desc_push_tail:B to desc_reserve:D
* matching
* RMB from desc_reserve:A to desc_reserve:C
*
* Note: desc_push_tail:B and desc_reserve:D can be different
* CPUs. However, the desc_reserve:D CPU (which performs
* the full memory barrier) must have previously seen
* desc_push_tail:B.
*/
smp_rmb(); /* LMM(desc_reserve:B) */
if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id
)) { /* LMM(desc_reserve:C) */
/*
* Make space for the new descriptor by
* advancing the tail.
*/
if (!desc_push_tail(rb, id_prev_wrap))
return false;
}
/*
* 1. Guarantee the tail ID is read before validating the
* recycled descriptor state. A read memory barrier is
* sufficient for this. This pairs with desc_push_tail:B.
*
* Memory barrier involvement:
*
* If desc_reserve:C reads from desc_push_tail:B, then
* desc_reserve:E reads from desc_make_reusable:A.
*
* Relies on:
*
* MB from desc_make_reusable:A to desc_push_tail:B
* matching
* RMB from desc_reserve:C to desc_reserve:E
*
* Note: desc_make_reusable:A and desc_push_tail:B can be
* different CPUs. However, the desc_push_tail:B CPU
* (which performs the full memory barrier) must have
* previously seen desc_make_reusable:A.
*
* 2. Guarantee the tail ID is stored before storing the head
* ID. This pairs with desc_reserve:B.
*
* 3. Guarantee any data ring tail changes are stored before
* recycling the descriptor. Data ring tail changes can
* happen via desc_push_tail()->data_push_tail(). A full
* memory barrier is needed since another CPU may have
* pushed the data ring tails. This pairs with
* data_push_tail:B.
*
* 4. Guarantee a new tail ID is stored before recycling the
* descriptor. A full memory barrier is needed since
* another CPU may have pushed the tail ID. This pairs
* with desc_push_tail:C and this also pairs with
* prb_first_seq:C.
*
* 5. Guarantee the head ID is stored before trying to
* finalize the previous descriptor. This pairs with
* _prb_commit:B.
*/
} while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id,
id)); /* LMM(desc_reserve:D) */
desc = to_desc(desc_ring, id);
/*
* If the descriptor has been recycled, verify the old state val.
* See "ABA Issues" about why this verification is performed.
*/
prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */
if (prev_state_val &&
get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) {
WARN_ON_ONCE(1);
return false;
}
/*
* Assign the descriptor a new ID and set its state to reserved.
* See "ABA Issues" about why cmpxchg() instead of set() is used.
*
* Guarantee the new descriptor ID and state is stored before making
* any other changes. A write memory barrier is sufficient for this.
* This pairs with desc_read:D.
*/
if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val,
DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */
WARN_ON_ONCE(1);
return false;
}
/* Now data in @desc can be modified: LMM(desc_reserve:G) */
*id_out = id;
return true;
}
/* Determine the end of a data block. */
static unsigned long get_next_lpos(struct prb_data_ring *data_ring,
unsigned long lpos, unsigned int size)
{
unsigned long begin_lpos;
unsigned long next_lpos;
begin_lpos = lpos;
next_lpos = lpos + size;
/* First check if the data block does not wrap. */
if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos))
return next_lpos;
/* Wrapping data blocks store their data at the beginning. */
return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size);
}
/*
* Allocate a new data block, invalidating the oldest data block(s)
* if necessary. This function also associates the data block with
* a specified descriptor.
*/
static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size,
struct prb_data_blk_lpos *blk_lpos, unsigned long id)
{
struct prb_data_ring *data_ring = &rb->text_data_ring;
struct prb_data_block *blk;
unsigned long begin_lpos;
unsigned long next_lpos;
if (size == 0) {
/* Specify a data-less block. */
blk_lpos->begin = NO_LPOS;
blk_lpos->next = NO_LPOS;
return NULL;
}
size = to_blk_size(size);
begin_lpos = atomic_long_read(&data_ring->head_lpos);
do {
next_lpos = get_next_lpos(data_ring, begin_lpos, size); if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) {
/* Failed to allocate, specify a data-less block. */
blk_lpos->begin = FAILED_LPOS;
blk_lpos->next = FAILED_LPOS;
return NULL;
}
/*
* 1. Guarantee any descriptor states that have transitioned
* to reusable are stored before modifying the newly
* allocated data area. A full memory barrier is needed
* since other CPUs may have made the descriptor states
* reusable. See data_push_tail:A about why the reusable
* states are visible. This pairs with desc_read:D.
*
* 2. Guarantee any updated tail lpos is stored before
* modifying the newly allocated data area. Another CPU may
* be in data_make_reusable() and is reading a block ID
* from this area. data_make_reusable() can handle reading
* a garbage block ID value, but then it must be able to
* load a new tail lpos. A full memory barrier is needed
* since other CPUs may have updated the tail lpos. This
* pairs with data_push_tail:B.
*/
} while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos,
next_lpos)); /* LMM(data_alloc:A) */
blk = to_block(data_ring, begin_lpos);
blk->id = id; /* LMM(data_alloc:B) */
if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) {
/* Wrapping data blocks store their data at the beginning. */
blk = to_block(data_ring, 0);
/*
* Store the ID on the wrapped block for consistency.
* The printk_ringbuffer does not actually use it.
*/
blk->id = id;
}
blk_lpos->begin = begin_lpos;
blk_lpos->next = next_lpos;
return &blk->data[0];}
/*
* Try to resize an existing data block associated with the descriptor
* specified by @id. If the resized data block should become wrapped, it
* copies the old data to the new data block. If @size yields a data block
* with the same or less size, the data block is left as is.
*
* Fail if this is not the last allocated data block or if there is not
* enough space or it is not possible make enough space.
*
* Return a pointer to the beginning of the entire data buffer or NULL on
* failure.
*/
static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size,
struct prb_data_blk_lpos *blk_lpos, unsigned long id)
{
struct prb_data_ring *data_ring = &rb->text_data_ring;
struct prb_data_block *blk;
unsigned long head_lpos;
unsigned long next_lpos;
bool wrapped;
/* Reallocation only works if @blk_lpos is the newest data block. */
head_lpos = atomic_long_read(&data_ring->head_lpos);
if (head_lpos != blk_lpos->next)
return NULL;
/* Keep track if @blk_lpos was a wrapping data block. */
wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next));
size = to_blk_size(size);
next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);
/* If the data block does not increase, there is nothing to do. */
if (head_lpos - next_lpos < DATA_SIZE(data_ring)) {
if (wrapped)
blk = to_block(data_ring, 0);
else
blk = to_block(data_ring, blk_lpos->begin);
return &blk->data[0];
}
if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring)))
return NULL;
/* The memory barrier involvement is the same as data_alloc:A. */
if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos,
next_lpos)) { /* LMM(data_realloc:A) */
return NULL;
}
blk = to_block(data_ring, blk_lpos->begin);
if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) {
struct prb_data_block *old_blk = blk;
/* Wrapping data blocks store their data at the beginning. */
blk = to_block(data_ring, 0);
/*
* Store the ID on the wrapped block for consistency.
* The printk_ringbuffer does not actually use it.
*/
blk->id = id;
if (!wrapped) {
/*
* Since the allocated space is now in the newly
* created wrapping data block, copy the content
* from the old data block.
*/
memcpy(&blk->data[0], &old_blk->data[0],
(blk_lpos->next - blk_lpos->begin) - sizeof(blk->id));
}
}
blk_lpos->next = next_lpos;
return &blk->data[0];
}
/* Return the number of bytes used by a data block. */
static unsigned int space_used(struct prb_data_ring *data_ring,
struct prb_data_blk_lpos *blk_lpos)
{
/* Data-less blocks take no space. */
if (BLK_DATALESS(blk_lpos))
return 0;
if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) {
/* Data block does not wrap. */
return (DATA_INDEX(data_ring, blk_lpos->next) -
DATA_INDEX(data_ring, blk_lpos->begin));
}
/*
* For wrapping data blocks, the trailing (wasted) space is
* also counted.
*/
return (DATA_INDEX(data_ring, blk_lpos->next) +
DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin));
}
/*
* Given @blk_lpos, return a pointer to the writer data from the data block
* and calculate the size of the data part. A NULL pointer is returned if
* @blk_lpos specifies values that could never be legal.
*
* This function (used by readers) performs strict validation on the lpos
* values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
* triggered if an internal error is detected.
*/
static const char *get_data(struct prb_data_ring *data_ring,
struct prb_data_blk_lpos *blk_lpos,
unsigned int *data_size)
{
struct prb_data_block *db;
/* Data-less data block description. */
if (BLK_DATALESS(blk_lpos)) { if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { *data_size = 0;
return "";
}
return NULL;
}
/* Regular data block: @begin less than @next and in same wrap. */
if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) &&
blk_lpos->begin < blk_lpos->next) {
db = to_block(data_ring, blk_lpos->begin);
*data_size = blk_lpos->next - blk_lpos->begin;
/* Wrapping data block: @begin is one wrap behind @next. */
} else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) ==
DATA_WRAPS(data_ring, blk_lpos->next)) {
db = to_block(data_ring, 0);
*data_size = DATA_INDEX(data_ring, blk_lpos->next);
/* Illegal block description. */
} else {
WARN_ON_ONCE(1);
return NULL;
}
/* A valid data block will always be aligned to the ID size. */
if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) {
return NULL;
}
/* A valid data block will always have at least an ID. */
if (WARN_ON_ONCE(*data_size < sizeof(db->id)))
return NULL;
/* Subtract block ID space from size to reflect data size. */
*data_size -= sizeof(db->id); return &db->data[0];
}
/*
* Attempt to transition the newest descriptor from committed back to reserved
* so that the record can be modified by a writer again. This is only possible
* if the descriptor is not yet finalized and the provided @caller_id matches.
*/
static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring,
u32 caller_id, unsigned long *id_out)
{
unsigned long prev_state_val;
enum desc_state d_state;
struct prb_desc desc;
struct prb_desc *d;
unsigned long id;
u32 cid;
id = atomic_long_read(&desc_ring->head_id);
/*
* To reduce unnecessarily reopening, first check if the descriptor
* state and caller ID are correct.
*/
d_state = desc_read(desc_ring, id, &desc, NULL, &cid);
if (d_state != desc_committed || cid != caller_id)
return NULL;
d = to_desc(desc_ring, id);
prev_state_val = DESC_SV(id, desc_committed);
/*
* Guarantee the reserved state is stored before reading any
* record data. A full memory barrier is needed because @state_var
* modification is followed by reading. This pairs with _prb_commit:B.
*
* Memory barrier involvement:
*
* If desc_reopen_last:A reads from _prb_commit:B, then
* prb_reserve_in_last:A reads from _prb_commit:A.
*
* Relies on:
*
* WMB from _prb_commit:A to _prb_commit:B
* matching
* MB If desc_reopen_last:A to prb_reserve_in_last:A
*/
if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */
return NULL;
}
*id_out = id;
return d;
}
/**
* prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer
* used by the newest record.
*
* @e: The entry structure to setup.
* @rb: The ringbuffer to re-reserve and extend data in.
* @r: The record structure to allocate buffers for.
* @caller_id: The caller ID of the caller (reserving writer).
* @max_size: Fail if the extended size would be greater than this.
*
* This is the public function available to writers to re-reserve and extend
* data.
*
* The writer specifies the text size to extend (not the new total size) by
* setting the @text_buf_size field of @r. To ensure proper initialization
* of @r, prb_rec_init_wr() should be used.
*
* This function will fail if @caller_id does not match the caller ID of the
* newest record. In that case the caller must reserve new data using
* prb_reserve().
*
* Context: Any context. Disables local interrupts on success.
* Return: true if text data could be extended, otherwise false.
*
* On success:
*
* - @r->text_buf points to the beginning of the entire text buffer.
*
* - @r->text_buf_size is set to the new total size of the buffer.
*
* - @r->info is not touched so that @r->info->text_len could be used
* to append the text.
*
* - prb_record_text_space() can be used on @e to query the new
* actually used space.
*
* Important: All @r->info fields will already be set with the current values
* for the record. I.e. @r->info->text_len will be less than
* @text_buf_size. Writers can use @r->info->text_len to know
* where concatenation begins and writers should update
* @r->info->text_len after concatenating.
*/
bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
struct printk_record *r, u32 caller_id, unsigned int max_size)
{
struct prb_desc_ring *desc_ring = &rb->desc_ring;
struct printk_info *info;
unsigned int data_size;
struct prb_desc *d;
unsigned long id;
local_irq_save(e->irqflags);
/* Transition the newest descriptor back to the reserved state. */
d = desc_reopen_last(desc_ring, caller_id, &id);
if (!d) {
local_irq_restore(e->irqflags);
goto fail_reopen;
}
/* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */
info = to_info(desc_ring, id);
/*
* Set the @e fields here so that prb_commit() can be used if
* anything fails from now on.
*/
e->rb = rb;
e->id = id;
/*
* desc_reopen_last() checked the caller_id, but there was no
* exclusive access at that point. The descriptor may have
* changed since then.
*/
if (caller_id != info->caller_id)
goto fail;
if (BLK_DATALESS(&d->text_blk_lpos)) {
if (WARN_ON_ONCE(info->text_len != 0)) {
pr_warn_once("wrong text_len value (%hu, expecting 0)\n",
info->text_len);
info->text_len = 0;
}
if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
goto fail;
if (r->text_buf_size > max_size)
goto fail;
r->text_buf = data_alloc(rb, r->text_buf_size,
&d->text_blk_lpos, id);
} else {
if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size))
goto fail;
/*
* Increase the buffer size to include the original size. If
* the meta data (@text_len) is not sane, use the full data
* block size.
*/
if (WARN_ON_ONCE(info->text_len > data_size)) {
pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n",
info->text_len, data_size);
info->text_len = data_size;
}
r->text_buf_size += info->text_len;
if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
goto fail;
if (r->text_buf_size > max_size)
goto fail;
r->text_buf = data_realloc(rb, r->text_buf_size,
&d->text_blk_lpos, id);
}
if (r->text_buf_size && !r->text_buf)
goto fail;
r->info = info;
e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);
return true;
fail:
prb_commit(e);
/* prb_commit() re-enabled interrupts. */
fail_reopen:
/* Make it clear to the caller that the re-reserve failed. */
memset(r, 0, sizeof(*r));
return false;
}
/*
* Attempt to finalize a specified descriptor. If this fails, the descriptor
* is either already final or it will finalize itself when the writer commits.
*/
static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id)
{
unsigned long prev_state_val = DESC_SV(id, desc_committed);
struct prb_desc *d = to_desc(desc_ring, id);
atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val,
DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */
}
/**
* prb_reserve() - Reserve space in the ringbuffer.
*
* @e: The entry structure to setup.
* @rb: The ringbuffer to reserve data in.
* @r: The record structure to allocate buffers for.
*
* This is the public function available to writers to reserve data.
*
* The writer specifies the text size to reserve by setting the
* @text_buf_size field of @r. To ensure proper initialization of @r,
* prb_rec_init_wr() should be used.
*
* Context: Any context. Disables local interrupts on success.
* Return: true if at least text data could be allocated, otherwise false.
*
* On success, the fields @info and @text_buf of @r will be set by this
* function and should be filled in by the writer before committing. Also
* on success, prb_record_text_space() can be used on @e to query the actual
* space used for the text data block.
*
* Important: @info->text_len needs to be set correctly by the writer in
* order for data to be readable and/or extended. Its value
* is initialized to 0.
*/
bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
struct printk_record *r)
{
struct prb_desc_ring *desc_ring = &rb->desc_ring;
struct printk_info *info;
struct prb_desc *d;
unsigned long id;
u64 seq;
if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
goto fail;
/*
* Descriptors in the reserved state act as blockers to all further
* reservations once the desc_ring has fully wrapped. Disable
* interrupts during the reserve/commit window in order to minimize
* the likelihood of this happening.
*/
local_irq_save(e->irqflags);
if (!desc_reserve(rb, &id)) {
/* Descriptor reservation failures are tracked. */
atomic_long_inc(&rb->fail);
local_irq_restore(e->irqflags);
goto fail;
}
d = to_desc(desc_ring, id);
info = to_info(desc_ring, id);
/*
* All @info fields (except @seq) are cleared and must be filled in
* by the writer. Save @seq before clearing because it is used to
* determine the new sequence number.
*/
seq = info->seq;
memset(info, 0, sizeof(*info));
/*
* Set the @e fields here so that prb_commit() can be used if
* text data allocation fails.
*/
e->rb = rb;
e->id = id;
/*
* Initialize the sequence number if it has "never been set".
* Otherwise just increment it by a full wrap.
*
* @seq is considered "never been set" if it has a value of 0,
* _except_ for @infos[0], which was specially setup by the ringbuffer
* initializer and therefore is always considered as set.
*
* See the "Bootstrap" comment block in printk_ringbuffer.h for
* details about how the initializer bootstraps the descriptors.
*/
if (seq == 0 && DESC_INDEX(desc_ring, id) != 0)
info->seq = DESC_INDEX(desc_ring, id);
else
info->seq = seq + DESCS_COUNT(desc_ring);
/*
* New data is about to be reserved. Once that happens, previous
* descriptors are no longer able to be extended. Finalize the
* previous descriptor now so that it can be made available to
* readers. (For seq==0 there is no previous descriptor.)
*/
if (info->seq > 0)
desc_make_final(desc_ring, DESC_ID(id - 1)); r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id);
/* If text data allocation fails, a data-less record is committed. */
if (r->text_buf_size && !r->text_buf) { prb_commit(e);
/* prb_commit() re-enabled interrupts. */
goto fail;
}
r->info = info;
/* Record full text space used by record. */
e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);
return true;
fail:
/* Make it clear to the caller that the reserve failed. */
memset(r, 0, sizeof(*r)); return false;
}
/* Commit the data (possibly finalizing it) and restore interrupts. */
static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val)
{
struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
struct prb_desc *d = to_desc(desc_ring, e->id);
unsigned long prev_state_val = DESC_SV(e->id, desc_reserved);
/* Now the writer has finished all writing: LMM(_prb_commit:A) */
/*
* Set the descriptor as committed. See "ABA Issues" about why
* cmpxchg() instead of set() is used.
*
* 1 Guarantee all record data is stored before the descriptor state
* is stored as committed. A write memory barrier is sufficient
* for this. This pairs with desc_read:B and desc_reopen_last:A.
*
* 2. Guarantee the descriptor state is stored as committed before
* re-checking the head ID in order to possibly finalize this
* descriptor. This pairs with desc_reserve:D.
*
* Memory barrier involvement:
*
* If prb_commit:A reads from desc_reserve:D, then
* desc_make_final:A reads from _prb_commit:B.
*
* Relies on:
*
* MB _prb_commit:B to prb_commit:A
* matching
* MB desc_reserve:D to desc_make_final:A
*/
if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */
WARN_ON_ONCE(1);
}
/* Restore interrupts, the reserve/commit window is finished. */
local_irq_restore(e->irqflags);
}
/**
* prb_commit() - Commit (previously reserved) data to the ringbuffer.
*
* @e: The entry containing the reserved data information.
*
* This is the public function available to writers to commit data.
*
* Note that the data is not yet available to readers until it is finalized.
* Finalizing happens automatically when space for the next record is
* reserved.
*
* See prb_final_commit() for a version of this function that finalizes
* immediately.
*
* Context: Any context. Enables local interrupts.
*/
void prb_commit(struct prb_reserved_entry *e)
{
struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
unsigned long head_id;
_prb_commit(e, desc_committed);
/*
* If this descriptor is no longer the head (i.e. a new record has
* been allocated), extending the data for this record is no longer
* allowed and therefore it must be finalized.
*/
head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */
if (head_id != e->id)
desc_make_final(desc_ring, e->id);
}
/**
* prb_final_commit() - Commit and finalize (previously reserved) data to
* the ringbuffer.
*
* @e: The entry containing the reserved data information.
*
* This is the public function available to writers to commit+finalize data.
*
* By finalizing, the data is made immediately available to readers.
*
* This function should only be used if there are no intentions of extending
* this data using prb_reserve_in_last().
*
* Context: Any context. Enables local interrupts.
*/
void prb_final_commit(struct prb_reserved_entry *e)
{
_prb_commit(e, desc_finalized);
}
/*
* Count the number of lines in provided text. All text has at least 1 line
* (even if @text_size is 0). Each '\n' processed is counted as an additional
* line.
*/
static unsigned int count_lines(const char *text, unsigned int text_size)
{
unsigned int next_size = text_size;
unsigned int line_count = 1;
const char *next = text;
while (next_size) { next = memchr(next, '\n', next_size);
if (!next)
break;
line_count++;
next++;
next_size = text_size - (next - text);
}
return line_count;
}
/*
* Given @blk_lpos, copy an expected @len of data into the provided buffer.
* If @line_count is provided, count the number of lines in the data.
*
* This function (used by readers) performs strict validation on the data
* size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
* triggered if an internal error is detected.
*/
static bool copy_data(struct prb_data_ring *data_ring,
struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf,
unsigned int buf_size, unsigned int *line_count)
{
unsigned int data_size;
const char *data;
/* Caller might not want any data. */
if ((!buf || !buf_size) && !line_count)
return true;
data = get_data(data_ring, blk_lpos, &data_size);
if (!data)
return false;
/*
* Actual cannot be less than expected. It can be more than expected
* because of the trailing alignment padding.
*
* Note that invalid @len values can occur because the caller loads
* the value during an allowed data race.
*/
if (data_size < (unsigned int)len)
return false;
/* Caller interested in the line count? */
if (line_count) *line_count = count_lines(data, len);
/* Caller interested in the data content? */
if (!buf || !buf_size)
return true;
data_size = min_t(u16, buf_size, len);
memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */
return true;
}
/*
* This is an extended version of desc_read(). It gets a copy of a specified
* descriptor. However, it also verifies that the record is finalized and has
* the sequence number @seq. On success, 0 is returned.
*
* Error return values:
* -EINVAL: A finalized record with sequence number @seq does not exist.
* -ENOENT: A finalized record with sequence number @seq exists, but its data
* is not available. This is a valid record, so readers should
* continue with the next record.
*/
static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring,
unsigned long id, u64 seq,
struct prb_desc *desc_out)
{
struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos;
enum desc_state d_state;
u64 s;
d_state = desc_read(desc_ring, id, desc_out, &s, NULL);
/*
* An unexpected @id (desc_miss) or @seq mismatch means the record
* does not exist. A descriptor in the reserved or committed state
* means the record does not yet exist for the reader.
*/
if (d_state == desc_miss ||
d_state == desc_reserved ||
d_state == desc_committed ||
s != seq) {
return -EINVAL;
}
/*
* A descriptor in the reusable state may no longer have its data
* available; report it as existing but with lost data. Or the record
* may actually be a record with lost data.
*/
if (d_state == desc_reusable || (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) {
return -ENOENT;
}
return 0;
}
/*
* Copy the ringbuffer data from the record with @seq to the provided
* @r buffer. On success, 0 is returned.
*
* See desc_read_finalized_seq() for error return values.
*/
static int prb_read(struct printk_ringbuffer *rb, u64 seq,
struct printk_record *r, unsigned int *line_count)
{
struct prb_desc_ring *desc_ring = &rb->desc_ring;
struct printk_info *info = to_info(desc_ring, seq);
struct prb_desc *rdesc = to_desc(desc_ring, seq);
atomic_long_t *state_var = &rdesc->state_var;
struct prb_desc desc;
unsigned long id;
int err;
/* Extract the ID, used to specify the descriptor to read. */
id = DESC_ID(atomic_long_read(state_var));
/* Get a local copy of the correct descriptor (if available). */
err = desc_read_finalized_seq(desc_ring, id, seq, &desc);
/*
* If @r is NULL, the caller is only interested in the availability
* of the record.
*/
if (err || !r)
return err;
/* If requested, copy meta data. */
if (r->info)
memcpy(r->info, info, sizeof(*(r->info)));
/* Copy text data. If it fails, this is a data-less record. */
if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len,
r->text_buf, r->text_buf_size, line_count)) {
return -ENOENT;
}
/* Ensure the record is still finalized and has the same @seq. */
return desc_read_finalized_seq(desc_ring, id, seq, &desc);
}
/* Get the sequence number of the tail descriptor. */
static u64 prb_first_seq(struct printk_ringbuffer *rb)
{
struct prb_desc_ring *desc_ring = &rb->desc_ring;
enum desc_state d_state;
struct prb_desc desc;
unsigned long id;
u64 seq;
for (;;) {
id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */
d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */
/*
* This loop will not be infinite because the tail is
* _always_ in the finalized or reusable state.
*/
if (d_state == desc_finalized || d_state == desc_reusable)
break;
/*
* Guarantee the last state load from desc_read() is before
* reloading @tail_id in order to see a new tail in the case
* that the descriptor has been recycled. This pairs with
* desc_reserve:D.
*
* Memory barrier involvement:
*
* If prb_first_seq:B reads from desc_reserve:F, then
* prb_first_seq:A reads from desc_push_tail:B.
*
* Relies on:
*
* MB from desc_push_tail:B to desc_reserve:F
* matching
* RMB prb_first_seq:B to prb_first_seq:A
*/
smp_rmb(); /* LMM(prb_first_seq:C) */
}
return seq;
}
/*
* Non-blocking read of a record. Updates @seq to the last finalized record
* (which may have no data available).
*
* See the description of prb_read_valid() and prb_read_valid_info()
* for details.
*/
static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
struct printk_record *r, unsigned int *line_count)
{
u64 tail_seq;
int err;
while ((err = prb_read(rb, *seq, r, line_count))) {
tail_seq = prb_first_seq(rb);
if (*seq < tail_seq) {
/*
* Behind the tail. Catch up and try again. This
* can happen for -ENOENT and -EINVAL cases.
*/
*seq = tail_seq; } else if (err == -ENOENT) {
/* Record exists, but no data available. Skip. */
(*seq)++;
} else {
/* Non-existent/non-finalized record. Must stop. */
return false;
}
}
return true;
}
/**
* prb_read_valid() - Non-blocking read of a requested record or (if gone)
* the next available record.
*
* @rb: The ringbuffer to read from.
* @seq: The sequence number of the record to read.
* @r: A record data buffer to store the read record to.
*
* This is the public function available to readers to read a record.
*
* The reader provides the @info and @text_buf buffers of @r to be
* filled in. Any of the buffer pointers can be set to NULL if the reader
* is not interested in that data. To ensure proper initialization of @r,
* prb_rec_init_rd() should be used.
*
* Context: Any context.
* Return: true if a record was read, otherwise false.
*
* On success, the reader must check r->info.seq to see which record was
* actually read. This allows the reader to detect dropped records.
*
* Failure means @seq refers to a not yet written record.
*/
bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
struct printk_record *r)
{
return _prb_read_valid(rb, &seq, r, NULL);
}
/**
* prb_read_valid_info() - Non-blocking read of meta data for a requested
* record or (if gone) the next available record.
*
* @rb: The ringbuffer to read from.
* @seq: The sequence number of the record to read.
* @info: A buffer to store the read record meta data to.
* @line_count: A buffer to store the number of lines in the record text.
*
* This is the public function available to readers to read only the
* meta data of a record.
*
* The reader provides the @info, @line_count buffers to be filled in.
* Either of the buffer pointers can be set to NULL if the reader is not
* interested in that data.
*
* Context: Any context.
* Return: true if a record's meta data was read, otherwise false.
*
* On success, the reader must check info->seq to see which record meta data
* was actually read. This allows the reader to detect dropped records.
*
* Failure means @seq refers to a not yet written record.
*/
bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
struct printk_info *info, unsigned int *line_count)
{
struct printk_record r;
prb_rec_init_rd(&r, info, NULL, 0);
return _prb_read_valid(rb, &seq, &r, line_count);
}
/**
* prb_first_valid_seq() - Get the sequence number of the oldest available
* record.
*
* @rb: The ringbuffer to get the sequence number from.
*
* This is the public function available to readers to see what the
* first/oldest valid sequence number is.
*
* This provides readers a starting point to begin iterating the ringbuffer.
*
* Context: Any context.
* Return: The sequence number of the first/oldest record or, if the
* ringbuffer is empty, 0 is returned.
*/
u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
{
u64 seq = 0;
if (!_prb_read_valid(rb, &seq, NULL, NULL))
return 0;
return seq;
}
/**
* prb_next_seq() - Get the sequence number after the last available record.
*
* @rb: The ringbuffer to get the sequence number from.
*
* This is the public function available to readers to see what the next
* newest sequence number available to readers will be.
*
* This provides readers a sequence number to jump to if all currently
* available records should be skipped.
*
* Context: Any context.
* Return: The sequence number of the next newest (not yet available) record
* for readers.
*/
u64 prb_next_seq(struct printk_ringbuffer *rb)
{
u64 seq = 0;
/* Search forward from the oldest descriptor. */
while (_prb_read_valid(rb, &seq, NULL, NULL))
seq++;
return seq;
}
/**
* prb_init() - Initialize a ringbuffer to use provided external buffers.
*
* @rb: The ringbuffer to initialize.
* @text_buf: The data buffer for text data.
* @textbits: The size of @text_buf as a power-of-2 value.
* @descs: The descriptor buffer for ringbuffer records.
* @descbits: The count of @descs items as a power-of-2 value.
* @infos: The printk_info buffer for ringbuffer records.
*
* This is the public function available to writers to setup a ringbuffer
* during runtime using provided buffers.
*
* This must match the initialization of DEFINE_PRINTKRB().
*
* Context: Any context.
*/
void prb_init(struct printk_ringbuffer *rb,
char *text_buf, unsigned int textbits,
struct prb_desc *descs, unsigned int descbits,
struct printk_info *infos)
{
memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0]));
memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0]));
rb->desc_ring.count_bits = descbits;
rb->desc_ring.descs = descs;
rb->desc_ring.infos = infos;
atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
rb->text_data_ring.size_bits = textbits;
rb->text_data_ring.data = text_buf;
atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits));
atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits));
atomic_long_set(&rb->fail, 0);
atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits));
descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS;
descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS;
infos[0].seq = -(u64)_DESCS_COUNT(descbits);
infos[_DESCS_COUNT(descbits) - 1].seq = 0;
}
/**
* prb_record_text_space() - Query the full actual used ringbuffer space for
* the text data of a reserved entry.
*
* @e: The successfully reserved entry to query.
*
* This is the public function available to writers to see how much actual
* space is used in the ringbuffer to store the text data of the specified
* entry.
*
* This function is only valid if @e has been successfully reserved using
* prb_reserve().
*
* Context: Any context.
* Return: The size in bytes used by the text data of the associated record.
*/
unsigned int prb_record_text_space(struct prb_reserved_entry *e)
{
return e->text_space;
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HIGHMEM_H
#define _LINUX_HIGHMEM_H
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <asm/cacheflush.h>
#include "highmem-internal.h"
/**
* kmap - Map a page for long term usage
* @page: Pointer to the page to be mapped
*
* Returns: The virtual address of the mapping
*
* Can only be invoked from preemptible task context because on 32bit
* systems with CONFIG_HIGHMEM enabled this function might sleep.
*
* For systems with CONFIG_HIGHMEM=n and for pages in the low memory area
* this returns the virtual address of the direct kernel mapping.
*
* The returned virtual address is globally visible and valid up to the
* point where it is unmapped via kunmap(). The pointer can be handed to
* other contexts.
*
* For highmem pages on 32bit systems this can be slow as the mapping space
* is limited and protected by a global lock. In case that there is no
* mapping slot available the function blocks until a slot is released via
* kunmap().
*/
static inline void *kmap(struct page *page);
/**
* kunmap - Unmap the virtual address mapped by kmap()
* @addr: Virtual address to be unmapped
*
* Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of
* pages in the low memory area.
*/
static inline void kunmap(struct page *page);
/**
* kmap_to_page - Get the page for a kmap'ed address
* @addr: The address to look up
*
* Returns: The page which is mapped to @addr.
*/
static inline struct page *kmap_to_page(void *addr);
/**
* kmap_flush_unused - Flush all unused kmap mappings in order to
* remove stray mappings
*/
static inline void kmap_flush_unused(void);
/**
* kmap_local_page - Map a page for temporary usage
* @page: Pointer to the page to be mapped
*
* Returns: The virtual address of the mapping
*
* Can be invoked from any context.
*
* Requires careful handling when nesting multiple mappings because the map
* management is stack based. The unmap has to be in the reverse order of
* the map operation:
*
* addr1 = kmap_local_page(page1);
* addr2 = kmap_local_page(page2);
* ...
* kunmap_local(addr2);
* kunmap_local(addr1);
*
* Unmapping addr1 before addr2 is invalid and causes malfunction.
*
* Contrary to kmap() mappings the mapping is only valid in the context of
* the caller and cannot be handed to other contexts.
*
* On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the
* virtual address of the direct mapping. Only real highmem pages are
* temporarily mapped.
*
* While it is significantly faster than kmap() for the higmem case it
* comes with restrictions about the pointer validity. Only use when really
* necessary.
*
* On HIGHMEM enabled systems mapping a highmem page has the side effect of
* disabling migration in order to keep the virtual address stable across
* preemption. No caller of kmap_local_page() can rely on this side effect.
*/
static inline void *kmap_local_page(struct page *page);
/**
* kmap_atomic - Atomically map a page for temporary usage - Deprecated!
* @page: Pointer to the page to be mapped
*
* Returns: The virtual address of the mapping
*
* Effectively a wrapper around kmap_local_page() which disables pagefaults
* and preemption.
*
* Do not use in new code. Use kmap_local_page() instead.
*/
static inline void *kmap_atomic(struct page *page);
/**
* kunmap_atomic - Unmap the virtual address mapped by kmap_atomic()
* @addr: Virtual address to be unmapped
*
* Counterpart to kmap_atomic().
*
* Effectively a wrapper around kunmap_local() which additionally undoes
* the side effects of kmap_atomic(), i.e. reenabling pagefaults and
* preemption.
*/
/* Highmem related interfaces for management code */
static inline unsigned int nr_free_highpages(void);
static inline unsigned long totalhigh_pages(void);
#ifndef ARCH_HAS_FLUSH_ANON_PAGE
static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr)
{
}
#endif
#ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE
static inline void flush_kernel_vmap_range(void *vaddr, int size)
{
}
static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
{
}
#endif
/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */
#ifndef clear_user_highpage
static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
{
void *addr = kmap_atomic(page);
clear_user_page(addr, vaddr, page);
kunmap_atomic(addr);
}
#endif
#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
/**
* alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move
* @vma: The VMA the page is to be allocated for
* @vaddr: The virtual address the page will be inserted into
*
* This function will allocate a page for a VMA that the caller knows will
* be able to migrate in the future using move_pages() or reclaimed
*
* An architecture may override this function by defining
* __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE and providing their own
* implementation.
*/
static inline struct page *
alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
unsigned long vaddr)
{
struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
if (page)
clear_user_highpage(page, vaddr);
return page;
}
#endif
static inline void clear_highpage(struct page *page)
{
void *kaddr = kmap_atomic(page);
clear_page(kaddr);
kunmap_atomic(kaddr);
}
#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
static inline void tag_clear_highpage(struct page *page)
{
}
#endif
/*
* If we pass in a base or tail page, we can zero up to PAGE_SIZE.
* If we pass in a head page, we can zero up to the size of the compound page.
*/
#if defined(CONFIG_HIGHMEM) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
unsigned start2, unsigned end2);
#else /* !HIGHMEM || !TRANSPARENT_HUGEPAGE */
static inline void zero_user_segments(struct page *page,
unsigned start1, unsigned end1,
unsigned start2, unsigned end2)
{
void *kaddr = kmap_atomic(page);
unsigned int i;
BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); if (end2 > start2) memset(kaddr + start2, 0, end2 - start2);
kunmap_atomic(kaddr);
for (i = 0; i < compound_nr(page); i++)
flush_dcache_page(page + i);
}
#endif /* !HIGHMEM || !TRANSPARENT_HUGEPAGE */
static inline void zero_user_segment(struct page *page,
unsigned start, unsigned end)
{
zero_user_segments(page, start, end, 0, 0);
}
static inline void zero_user(struct page *page,
unsigned start, unsigned size)
{
zero_user_segments(page, start, start + size, 0, 0);
}
#ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE
static inline void copy_user_highpage(struct page *to, struct page *from,
unsigned long vaddr, struct vm_area_struct *vma)
{
char *vfrom, *vto;
vfrom = kmap_atomic(from);
vto = kmap_atomic(to);
copy_user_page(vto, vfrom, vaddr, to);
kunmap_atomic(vto);
kunmap_atomic(vfrom);
}
#endif
#ifndef __HAVE_ARCH_COPY_HIGHPAGE
static inline void copy_highpage(struct page *to, struct page *from)
{
char *vfrom, *vto;
vfrom = kmap_atomic(from);
vto = kmap_atomic(to);
copy_page(vto, vfrom);
kunmap_atomic(vto);
kunmap_atomic(vfrom);
}
#endif
static inline void memcpy_page(struct page *dst_page, size_t dst_off,
struct page *src_page, size_t src_off,
size_t len)
{
char *dst = kmap_local_page(dst_page);
char *src = kmap_local_page(src_page);
VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE);
memcpy(dst + dst_off, src + src_off, len);
kunmap_local(src);
kunmap_local(dst);
}
static inline void memmove_page(struct page *dst_page, size_t dst_off,
struct page *src_page, size_t src_off,
size_t len)
{
char *dst = kmap_local_page(dst_page);
char *src = kmap_local_page(src_page);
VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE);
memmove(dst + dst_off, src + src_off, len);
kunmap_local(src);
kunmap_local(dst);
}
static inline void memset_page(struct page *page, size_t offset, int val,
size_t len)
{
char *addr = kmap_local_page(page);
VM_BUG_ON(offset + len > PAGE_SIZE);
memset(addr + offset, val, len);
kunmap_local(addr);
}
static inline void memcpy_from_page(char *to, struct page *page,
size_t offset, size_t len)
{
char *from = kmap_local_page(page);
VM_BUG_ON(offset + len > PAGE_SIZE);
memcpy(to, from + offset, len);
kunmap_local(from);
}
static inline void memcpy_to_page(struct page *page, size_t offset,
const char *from, size_t len)
{
char *to = kmap_local_page(page);
VM_BUG_ON(offset + len > PAGE_SIZE);
memcpy(to + offset, from, len);
flush_dcache_page(page);
kunmap_local(to);
}
static inline void memzero_page(struct page *page, size_t offset, size_t len)
{
char *addr = kmap_local_page(page);
memset(addr + offset, 0, len);
flush_dcache_page(page);
kunmap_local(addr);
}
#endif /* _LINUX_HIGHMEM_H */
// SPDX-License-Identifier: GPL-2.0-or-later
/* Task credentials management - see Documentation/security/credentials.rst
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/export.h>
#include <linux/cred.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/coredump.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/init_task.h>
#include <linux/security.h>
#include <linux/binfmts.h>
#include <linux/cn_proc.h>
#include <linux/uidgid.h>
#if 0
#define kdebug(FMT, ...) \
printk("[%-5.5s%5u] " FMT "\n", \
current->comm, current->pid, ##__VA_ARGS__)
#else
#define kdebug(FMT, ...) \
do { \
if (0) \
no_printk("[%-5.5s%5u] " FMT "\n", \
current->comm, current->pid, ##__VA_ARGS__); \
} while (0)
#endif
static struct kmem_cache *cred_jar;
/* init to 2 - one for init_task, one to ensure it is never freed */
static struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
/*
* The initial credentials for the initial task
*/
struct cred init_cred = {
.usage = ATOMIC_INIT(4),
#ifdef CONFIG_DEBUG_CREDENTIALS
.subscribers = ATOMIC_INIT(2),
.magic = CRED_MAGIC,
#endif
.uid = GLOBAL_ROOT_UID,
.gid = GLOBAL_ROOT_GID,
.suid = GLOBAL_ROOT_UID,
.sgid = GLOBAL_ROOT_GID,
.euid = GLOBAL_ROOT_UID,
.egid = GLOBAL_ROOT_GID,
.fsuid = GLOBAL_ROOT_UID,
.fsgid = GLOBAL_ROOT_GID,
.securebits = SECUREBITS_DEFAULT,
.cap_inheritable = CAP_EMPTY_SET,
.cap_permitted = CAP_FULL_SET,
.cap_effective = CAP_FULL_SET,
.cap_bset = CAP_FULL_SET,
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
.ucounts = &init_ucounts,
};
static inline void set_cred_subscribers(struct cred *cred, int n)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
atomic_set(&cred->subscribers, n);
#endif
}
static inline int read_cred_subscribers(const struct cred *cred)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
return atomic_read(&cred->subscribers);
#else
return 0;
#endif
}
static inline void alter_cred_subscribers(const struct cred *_cred, int n)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
struct cred *cred = (struct cred *) _cred;
atomic_add(n, &cred->subscribers);
#endif
}
/*
* The RCU callback to actually dispose of a set of credentials
*/
static void put_cred_rcu(struct rcu_head *rcu)
{
struct cred *cred = container_of(rcu, struct cred, rcu);
kdebug("put_cred_rcu(%p)", cred);
#ifdef CONFIG_DEBUG_CREDENTIALS
if (cred->magic != CRED_MAGIC_DEAD ||
atomic_read(&cred->usage) != 0 ||
read_cred_subscribers(cred) != 0)
panic("CRED: put_cred_rcu() sees %p with"
" mag %x, put %p, usage %d, subscr %d\n",
cred, cred->magic, cred->put_addr,
atomic_read(&cred->usage),
read_cred_subscribers(cred));
#else
if (atomic_read(&cred->usage) != 0)
panic("CRED: put_cred_rcu() sees %p with usage %d\n",
cred, atomic_read(&cred->usage));
#endif
security_cred_free(cred);
key_put(cred->session_keyring);
key_put(cred->process_keyring);
key_put(cred->thread_keyring);
key_put(cred->request_key_auth);
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
if (cred->ucounts)
put_ucounts(cred->ucounts);
put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
/**
* __put_cred - Destroy a set of credentials
* @cred: The record to release
*
* Destroy a set of credentials on which no references remain.
*/
void __put_cred(struct cred *cred)
{
kdebug("__put_cred(%p{%d,%d})", cred,
atomic_read(&cred->usage),
read_cred_subscribers(cred));
BUG_ON(atomic_read(&cred->usage) != 0);
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(cred) != 0);
cred->magic = CRED_MAGIC_DEAD;
cred->put_addr = __builtin_return_address(0);
#endif
BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); if (cred->non_rcu) put_cred_rcu(&cred->rcu);
else
call_rcu(&cred->rcu, put_cred_rcu);
}
EXPORT_SYMBOL(__put_cred);
/*
* Clean up a task's credentials when it exits
*/
void exit_creds(struct task_struct *tsk)
{
struct cred *cred;
kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
atomic_read(&tsk->cred->usage),
read_cred_subscribers(tsk->cred));
cred = (struct cred *) tsk->real_cred;
tsk->real_cred = NULL;
validate_creds(cred);
alter_cred_subscribers(cred, -1);
put_cred(cred);
cred = (struct cred *) tsk->cred;
tsk->cred = NULL;
validate_creds(cred);
alter_cred_subscribers(cred, -1);
put_cred(cred);
#ifdef CONFIG_KEYS_REQUEST_CACHE
key_put(tsk->cached_requested_key);
tsk->cached_requested_key = NULL;
#endif
}
/**
* get_task_cred - Get another task's objective credentials
* @task: The task to query
*
* Get the objective credentials of a task, pinning them so that they can't go
* away. Accessing a task's credentials directly is not permitted.
*
* The caller must also make sure task doesn't get deleted, either by holding a
* ref on task or by holding tasklist_lock to prevent it from being unlinked.
*/
const struct cred *get_task_cred(struct task_struct *task)
{
const struct cred *cred;
rcu_read_lock();
do {
cred = __task_cred((task));
BUG_ON(!cred);
} while (!get_cred_rcu(cred));
rcu_read_unlock();
return cred;
}
EXPORT_SYMBOL(get_task_cred);
/*
* Allocate blank credentials, such that the credentials can be filled in at a
* later date without risk of ENOMEM.
*/
struct cred *cred_alloc_blank(void)
{
struct cred *new;
new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
atomic_set(&new->usage, 1);
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
#endif
if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
return new;
error:
abort_creds(new);
return NULL;
}
/**
* prepare_creds - Prepare a new set of credentials for modification
*
* Prepare a new set of task credentials for modification. A task's creds
* shouldn't generally be modified directly, therefore this function is used to
* prepare a new copy, which the caller then modifies and then commits by
* calling commit_creds().
*
* Preparation involves making a copy of the objective creds for modification.
*
* Returns a pointer to the new creds-to-be if successful, NULL otherwise.
*
* Call commit_creds() or abort_creds() to clean up.
*/
struct cred *prepare_creds(void)
{
struct task_struct *task = current;
const struct cred *old;
struct cred *new;
validate_process_creds();
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_creds() alloc %p", new);
old = task->cred;
memcpy(new, old, sizeof(struct cred));
new->non_rcu = 0;
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_group_info(new->group_info);
get_uid(new->user);
get_user_ns(new->user_ns);
#ifdef CONFIG_KEYS
key_get(new->session_keyring);
key_get(new->process_keyring);
key_get(new->thread_keyring);
key_get(new->request_key_auth);
#endif
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
new->ucounts = get_ucounts(new->ucounts);
if (!new->ucounts)
goto error;
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
validate_creds(new);
return new;
error:
abort_creds(new);
return NULL;
}
EXPORT_SYMBOL(prepare_creds);
/*
* Prepare credentials for current to perform an execve()
* - The caller must hold ->cred_guard_mutex
*/
struct cred *prepare_exec_creds(void)
{
struct cred *new;
new = prepare_creds();
if (!new)
return new;
#ifdef CONFIG_KEYS
/* newly exec'd tasks don't get a thread keyring */
key_put(new->thread_keyring);
new->thread_keyring = NULL;
/* inherit the session keyring; new process keyring */
key_put(new->process_keyring);
new->process_keyring = NULL;
#endif
new->suid = new->fsuid = new->euid;
new->sgid = new->fsgid = new->egid;
return new;
}
/*
* Copy credentials for the new process created by fork()
*
* We share if we can, but under some circumstances we have to generate a new
* set.
*
* The new process gets the current process's subjective credentials as its
* objective and subjective credentials
*/
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
struct cred *new;
int ret;
#ifdef CONFIG_KEYS_REQUEST_CACHE
p->cached_requested_key = NULL;
#endif
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
#endif
clone_flags & CLONE_THREAD
) {
p->real_cred = get_cred(p->cred);
get_cred(p->cred);
alter_cred_subscribers(p->cred, 2);
kdebug("share_creds(%p{%d,%d})",
p->cred, atomic_read(&p->cred->usage),
read_cred_subscribers(p->cred));
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
return 0;
}
new = prepare_creds();
if (!new)
return -ENOMEM;
if (clone_flags & CLONE_NEWUSER) {
ret = create_user_ns(new);
if (ret < 0)
goto error_put;
ret = set_cred_ucounts(new);
if (ret < 0)
goto error_put;
}
#ifdef CONFIG_KEYS
/* new threads get their own thread keyrings if their parent already
* had one */
if (new->thread_keyring) {
key_put(new->thread_keyring);
new->thread_keyring = NULL;
if (clone_flags & CLONE_THREAD)
install_thread_keyring_to_cred(new);
}
/* The process keyring is only shared between the threads in a process;
* anything outside of those threads doesn't inherit.
*/
if (!(clone_flags & CLONE_THREAD)) {
key_put(new->process_keyring);
new->process_keyring = NULL;
}
#endif
p->cred = p->real_cred = get_cred(new);
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(new, 2);
validate_creds(new);
return 0;
error_put:
put_cred(new);
return ret;
}
static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
{
const struct user_namespace *set_ns = set->user_ns;
const struct user_namespace *subset_ns = subset->user_ns;
/* If the two credentials are in the same user namespace see if
* the capabilities of subset are a subset of set.
*/
if (set_ns == subset_ns)
return cap_issubset(subset->cap_permitted, set->cap_permitted);
/* The credentials are in a different user namespaces
* therefore one is a subset of the other only if a set is an
* ancestor of subset and set->euid is owner of subset or one
* of subsets ancestors.
*/
for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
if ((set_ns == subset_ns->parent) &&
uid_eq(subset_ns->owner, set->euid))
return true;
}
return false;
}
/**
* commit_creds - Install new credentials upon the current task
* @new: The credentials to be assigned
*
* Install a new set of credentials to the current task, using RCU to replace
* the old set. Both the objective and the subjective credentials pointers are
* updated. This function may not be called if the subjective credentials are
* in an overridden state.
*
* This function eats the caller's reference to the new credentials.
*
* Always returns 0 thus allowing this function to be tail-called at the end
* of, say, sys_setgid().
*/
int commit_creds(struct cred *new)
{
struct task_struct *task = current;
const struct cred *old = task->real_cred;
kdebug("commit_creds(%p{%d,%d})", new,
atomic_read(&new->usage),
read_cred_subscribers(new));
BUG_ON(task->cred != old);
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(old) < 2);
validate_creds(old);
validate_creds(new);
#endif
BUG_ON(atomic_read(&new->usage) < 1);
get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */
if (!uid_eq(old->euid, new->euid) ||
!gid_eq(old->egid, new->egid) ||
!uid_eq(old->fsuid, new->fsuid) ||
!gid_eq(old->fsgid, new->fsgid) ||
!cred_cap_issubset(old, new)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
/*
* If a task drops privileges and becomes nondumpable,
* the dumpability change must become visible before
* the credential change; otherwise, a __ptrace_may_access()
* racing with this change may be able to attach to a task it
* shouldn't be able to attach to (as if the task had dropped
* privileges without becoming nondumpable).
* Pairs with a read barrier in __ptrace_may_access().
*/
smp_wmb();
}
/* alter the thread keyring */
if (!uid_eq(new->fsuid, old->fsuid))
key_fsuid_changed(new);
if (!gid_eq(new->fsgid, old->fsgid))
key_fsgid_changed(new);
/* do it
* RLIMIT_NPROC limits on user->processes have already been checked
* in set_user().
*/
alter_cred_subscribers(new, 2);
if (new->user != old->user || new->user_ns != old->user_ns)
inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user || new->user_ns != old->user_ns)
dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(old, -2);
/* send notifications */
if (!uid_eq(new->uid, old->uid) ||
!uid_eq(new->euid, old->euid) ||
!uid_eq(new->suid, old->suid) ||
!uid_eq(new->fsuid, old->fsuid))
proc_id_connector(task, PROC_EVENT_UID);
if (!gid_eq(new->gid, old->gid) ||
!gid_eq(new->egid, old->egid) ||
!gid_eq(new->sgid, old->sgid) ||
!gid_eq(new->fsgid, old->fsgid))
proc_id_connector(task, PROC_EVENT_GID);
/* release the old obj and subj refs both */
put_cred(old);
put_cred(old);
return 0;
}
EXPORT_SYMBOL(commit_creds);
/**
* abort_creds - Discard a set of credentials and unlock the current task
* @new: The credentials that were going to be applied
*
* Discard a set of credentials that were under construction and unlock the
* current task.
*/
void abort_creds(struct cred *new)
{
kdebug("abort_creds(%p{%d,%d})", new,
atomic_read(&new->usage),
read_cred_subscribers(new));
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(new) != 0);
#endif
BUG_ON(atomic_read(&new->usage) < 1);
put_cred(new);
}
EXPORT_SYMBOL(abort_creds);
/**
* override_creds - Override the current process's subjective credentials
* @new: The credentials to be assigned
*
* Install a set of temporary override subjective credentials on the current
* process, returning the old set for later reversion.
*/
const struct cred *override_creds(const struct cred *new)
{
const struct cred *old = current->cred;
kdebug("override_creds(%p{%d,%d})", new,
atomic_read(&new->usage),
read_cred_subscribers(new));
validate_creds(old);
validate_creds(new);
/*
* NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
*
* That means that we do not clear the 'non_rcu' flag, since
* we are only installing the cred into the thread-synchronous
* '->cred' pointer, not the '->real_cred' pointer that is
* visible to other threads under RCU.
*
* Also note that we did validate_creds() manually, not depending
* on the validation in 'get_cred()'.
*/
get_new_cred((struct cred *)new);
alter_cred_subscribers(new, 1);
rcu_assign_pointer(current->cred, new);
alter_cred_subscribers(old, -1);
kdebug("override_creds() = %p{%d,%d}", old,
atomic_read(&old->usage),
read_cred_subscribers(old));
return old;
}
EXPORT_SYMBOL(override_creds);
/**
* revert_creds - Revert a temporary subjective credentials override
* @old: The credentials to be restored
*
* Revert a temporary set of override subjective credentials to an old set,
* discarding the override set.
*/
void revert_creds(const struct cred *old)
{
const struct cred *override = current->cred;
kdebug("revert_creds(%p{%d,%d})", old,
atomic_read(&old->usage),
read_cred_subscribers(old));
validate_creds(old);
validate_creds(override);
alter_cred_subscribers(old, 1);
rcu_assign_pointer(current->cred, old);
alter_cred_subscribers(override, -1);
put_cred(override);
}
EXPORT_SYMBOL(revert_creds);
/**
* cred_fscmp - Compare two credentials with respect to filesystem access.
* @a: The first credential
* @b: The second credential
*
* cred_cmp() will return zero if both credentials have the same
* fsuid, fsgid, and supplementary groups. That is, if they will both
* provide the same access to files based on mode/uid/gid.
* If the credentials are different, then either -1 or 1 will
* be returned depending on whether @a comes before or after @b
* respectively in an arbitrary, but stable, ordering of credentials.
*
* Return: -1, 0, or 1 depending on comparison
*/
int cred_fscmp(const struct cred *a, const struct cred *b)
{
struct group_info *ga, *gb;
int g;
if (a == b)
return 0;
if (uid_lt(a->fsuid, b->fsuid))
return -1;
if (uid_gt(a->fsuid, b->fsuid))
return 1;
if (gid_lt(a->fsgid, b->fsgid))
return -1;
if (gid_gt(a->fsgid, b->fsgid))
return 1;
ga = a->group_info;
gb = b->group_info;
if (ga == gb)
return 0;
if (ga == NULL)
return -1;
if (gb == NULL)
return 1;
if (ga->ngroups < gb->ngroups)
return -1;
if (ga->ngroups > gb->ngroups)
return 1;
for (g = 0; g < ga->ngroups; g++) {
if (gid_lt(ga->gid[g], gb->gid[g]))
return -1;
if (gid_gt(ga->gid[g], gb->gid[g]))
return 1;
}
return 0;
}
EXPORT_SYMBOL(cred_fscmp);
int set_cred_ucounts(struct cred *new)
{
struct ucounts *new_ucounts, *old_ucounts = new->ucounts;
/*
* This optimization is needed because alloc_ucounts() uses locks
* for table lookups.
*/
if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid))
return 0;
if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid)))
return -EAGAIN;
new->ucounts = new_ucounts;
put_ucounts(old_ucounts);
return 0;
}
/*
* initialise the credentials stuff
*/
void __init cred_init(void)
{
/* allocate a slab in which we can store credentials */
cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
}
/**
* prepare_kernel_cred - Prepare a set of credentials for a kernel service
* @daemon: A userspace daemon to be used as a reference
*
* Prepare a set of credentials for a kernel service. This can then be used to
* override a task's own credentials so that work can be done on behalf of that
* task that requires a different subjective context.
*
* @daemon is used to provide a base for the security record, but can be NULL.
* If @daemon is supplied, then the security data will be derived from that;
* otherwise they'll be set to 0 and no groups, full capabilities and no keys.
*
* The caller may change these controls afterwards if desired.
*
* Returns the new credentials or NULL if out of memory.
*/
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
const struct cred *old;
struct cred *new;
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_kernel_cred() alloc %p", new);
if (daemon)
old = get_task_cred(daemon);
else
old = get_cred(&init_cred);
validate_creds(old);
*new = *old;
new->non_rcu = 0;
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_uid(new->user);
get_user_ns(new->user_ns);
get_group_info(new->group_info);
#ifdef CONFIG_KEYS
new->session_keyring = NULL;
new->process_keyring = NULL;
new->thread_keyring = NULL;
new->request_key_auth = NULL;
new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
#endif
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
new->ucounts = get_ucounts(new->ucounts);
if (!new->ucounts)
goto error;
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
put_cred(old);
validate_creds(new);
return new;
error:
put_cred(new);
put_cred(old);
return NULL;
}
EXPORT_SYMBOL(prepare_kernel_cred);
/**
* set_security_override - Set the security ID in a set of credentials
* @new: The credentials to alter
* @secid: The LSM security ID to set
*
* Set the LSM security ID in a set of credentials so that the subjective
* security is overridden when an alternative set of credentials is used.
*/
int set_security_override(struct cred *new, u32 secid)
{
return security_kernel_act_as(new, secid);
}
EXPORT_SYMBOL(set_security_override);
/**
* set_security_override_from_ctx - Set the security ID in a set of credentials
* @new: The credentials to alter
* @secctx: The LSM security context to generate the security ID from.
*
* Set the LSM security ID in a set of credentials so that the subjective
* security is overridden when an alternative set of credentials is used. The
* security ID is specified in string form as a security context to be
* interpreted by the LSM.
*/
int set_security_override_from_ctx(struct cred *new, const char *secctx)
{
u32 secid;
int ret;
ret = security_secctx_to_secid(secctx, strlen(secctx), &secid);
if (ret < 0)
return ret;
return set_security_override(new, secid);
}
EXPORT_SYMBOL(set_security_override_from_ctx);
/**
* set_create_files_as - Set the LSM file create context in a set of credentials
* @new: The credentials to alter
* @inode: The inode to take the context from
*
* Change the LSM file creation context in a set of credentials to be the same
* as the object context of the specified inode, so that the new inodes have
* the same MAC context as that inode.
*/
int set_create_files_as(struct cred *new, struct inode *inode)
{
if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
return -EINVAL;
new->fsuid = inode->i_uid;
new->fsgid = inode->i_gid;
return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);
#ifdef CONFIG_DEBUG_CREDENTIALS
bool creds_are_invalid(const struct cred *cred)
{
if (cred->magic != CRED_MAGIC)
return true;
return false;
}
EXPORT_SYMBOL(creds_are_invalid);
/*
* dump invalid credentials
*/
static void dump_invalid_creds(const struct cred *cred, const char *label,
const struct task_struct *tsk)
{
printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
label, cred,
cred == &init_cred ? "[init]" : "",
cred == tsk->real_cred ? "[real]" : "",
cred == tsk->cred ? "[eff]" : "");
printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
cred->magic, cred->put_addr);
printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
atomic_read(&cred->usage),
read_cred_subscribers(cred));
printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
from_kuid_munged(&init_user_ns, cred->uid),
from_kuid_munged(&init_user_ns, cred->euid),
from_kuid_munged(&init_user_ns, cred->suid),
from_kuid_munged(&init_user_ns, cred->fsuid));
printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
from_kgid_munged(&init_user_ns, cred->gid),
from_kgid_munged(&init_user_ns, cred->egid),
from_kgid_munged(&init_user_ns, cred->sgid),
from_kgid_munged(&init_user_ns, cred->fsgid));
#ifdef CONFIG_SECURITY
printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
if ((unsigned long) cred->security >= PAGE_SIZE &&
(((unsigned long) cred->security & 0xffffff00) !=
(POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
printk(KERN_ERR "CRED: ->security {%x, %x}\n",
((u32*)cred->security)[0],
((u32*)cred->security)[1]);
#endif
}
/*
* report use of invalid credentials
*/
void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
{
printk(KERN_ERR "CRED: Invalid credentials\n");
printk(KERN_ERR "CRED: At %s:%u\n", file, line);
dump_invalid_creds(cred, "Specified", current);
BUG();
}
EXPORT_SYMBOL(__invalid_creds);
/*
* check the credentials on a process
*/
void __validate_process_creds(struct task_struct *tsk,
const char *file, unsigned line)
{
if (tsk->cred == tsk->real_cred) {
if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
creds_are_invalid(tsk->cred)))
goto invalid_creds;
} else {
if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
read_cred_subscribers(tsk->cred) < 1 ||
creds_are_invalid(tsk->real_cred) ||
creds_are_invalid(tsk->cred)))
goto invalid_creds;
}
return;
invalid_creds:
printk(KERN_ERR "CRED: Invalid process credentials\n");
printk(KERN_ERR "CRED: At %s:%u\n", file, line);
dump_invalid_creds(tsk->real_cred, "Real", tsk);
if (tsk->cred != tsk->real_cred)
dump_invalid_creds(tsk->cred, "Effective", tsk);
else
printk(KERN_ERR "CRED: Effective creds == Real creds\n");
BUG();
}
EXPORT_SYMBOL(__validate_process_creds);
/*
* check creds for do_exit()
*/
void validate_creds_for_do_exit(struct task_struct *tsk)
{
kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
tsk->real_cred, tsk->cred,
atomic_read(&tsk->cred->usage),
read_cred_subscribers(tsk->cred));
__validate_process_creds(tsk, __FILE__, __LINE__);
}
#endif /* CONFIG_DEBUG_CREDENTIALS */
// SPDX-License-Identifier: GPL-2.0-only
/*
* ACPI device specific properties support.
*
* Copyright (C) 2014, Intel Corporation
* All rights reserved.
*
* Authors: Mika Westerberg <mika.westerberg@linux.intel.com>
* Darren Hart <dvhart@linux.intel.com>
* Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/export.h>
#include "internal.h"
static int acpi_data_get_property_array(const struct acpi_device_data *data,
const char *name,
acpi_object_type type,
const union acpi_object **obj);
/*
* The GUIDs here are made equivalent to each other in order to avoid extra
* complexity in the properties handling code, with the caveat that the
* kernel will accept certain combinations of GUID and properties that are
* not defined without a warning. For instance if any of the properties
* from different GUID appear in a property list of another, it will be
* accepted by the kernel. Firmware validation tools should catch these.
*/
static const guid_t prp_guids[] = {
/* ACPI _DSD device properties GUID: daffd814-6eba-4d8c-8a91-bc9bbf4aa301 */
GUID_INIT(0xdaffd814, 0x6eba, 0x4d8c,
0x8a, 0x91, 0xbc, 0x9b, 0xbf, 0x4a, 0xa3, 0x01),
/* Hotplug in D3 GUID: 6211e2c0-58a3-4af3-90e1-927a4e0c55a4 */
GUID_INIT(0x6211e2c0, 0x58a3, 0x4af3,
0x90, 0xe1, 0x92, 0x7a, 0x4e, 0x0c, 0x55, 0xa4),
/* External facing port GUID: efcc06cc-73ac-4bc3-bff0-76143807c389 */
GUID_INIT(0xefcc06cc, 0x73ac, 0x4bc3,
0xbf, 0xf0, 0x76, 0x14, 0x38, 0x07, 0xc3, 0x89),
/* Thunderbolt GUID for IMR_VALID: c44d002f-69f9-4e7d-a904-a7baabdf43f7 */
GUID_INIT(0xc44d002f, 0x69f9, 0x4e7d,
0xa9, 0x04, 0xa7, 0xba, 0xab, 0xdf, 0x43, 0xf7),
/* Thunderbolt GUID for WAKE_SUPPORTED: 6c501103-c189-4296-ba72-9bf5a26ebe5d */
GUID_INIT(0x6c501103, 0xc189, 0x4296,
0xba, 0x72, 0x9b, 0xf5, 0xa2, 0x6e, 0xbe, 0x5d),
/* Storage device needs D3 GUID: 5025030f-842f-4ab4-a561-99a5189762d0 */
GUID_INIT(0x5025030f, 0x842f, 0x4ab4,
0xa5, 0x61, 0x99, 0xa5, 0x18, 0x97, 0x62, 0xd0),
};
/* ACPI _DSD data subnodes GUID: dbb8e3e6-5886-4ba6-8795-1319f52a966b */
static const guid_t ads_guid =
GUID_INIT(0xdbb8e3e6, 0x5886, 0x4ba6,
0x87, 0x95, 0x13, 0x19, 0xf5, 0x2a, 0x96, 0x6b);
static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
const union acpi_object *desc,
struct acpi_device_data *data,
struct fwnode_handle *parent);
static bool acpi_extract_properties(const union acpi_object *desc,
struct acpi_device_data *data);
static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
acpi_handle handle,
const union acpi_object *link,
struct list_head *list,
struct fwnode_handle *parent)
{
struct acpi_data_node *dn;
bool result;
dn = kzalloc(sizeof(*dn), GFP_KERNEL);
if (!dn)
return false;
dn->name = link->package.elements[0].string.pointer;
fwnode_init(&dn->fwnode, &acpi_data_fwnode_ops);
dn->parent = parent;
INIT_LIST_HEAD(&dn->data.properties);
INIT_LIST_HEAD(&dn->data.subnodes);
result = acpi_extract_properties(desc, &dn->data);
if (handle) {
acpi_handle scope;
acpi_status status;
/*
* The scope for the subnode object lookup is the one of the
* namespace node (device) containing the object that has
* returned the package. That is, it's the scope of that
* object's parent.
*/
status = acpi_get_parent(handle, &scope);
if (ACPI_SUCCESS(status)
&& acpi_enumerate_nondev_subnodes(scope, desc, &dn->data,
&dn->fwnode))
result = true;
} else if (acpi_enumerate_nondev_subnodes(NULL, desc, &dn->data,
&dn->fwnode)) {
result = true;
}
if (result) {
dn->handle = handle;
dn->data.pointer = desc;
list_add_tail(&dn->sibling, list);
return true;
}
kfree(dn);
acpi_handle_debug(handle, "Invalid properties/subnodes data, skipping\n");
return false;
}
static bool acpi_nondev_subnode_data_ok(acpi_handle handle,
const union acpi_object *link,
struct list_head *list,
struct fwnode_handle *parent)
{
struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER };
acpi_status status;
status = acpi_evaluate_object_typed(handle, NULL, NULL, &buf,
ACPI_TYPE_PACKAGE);
if (ACPI_FAILURE(status))
return false;
if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list,
parent))
return true;
ACPI_FREE(buf.pointer);
return false;
}
static bool acpi_nondev_subnode_ok(acpi_handle scope,
const union acpi_object *link,
struct list_head *list,
struct fwnode_handle *parent)
{
acpi_handle handle;
acpi_status status;
if (!scope)
return false;
status = acpi_get_handle(scope, link->package.elements[1].string.pointer,
&handle);
if (ACPI_FAILURE(status))
return false;
return acpi_nondev_subnode_data_ok(handle, link, list, parent);
}
static int acpi_add_nondev_subnodes(acpi_handle scope,
const union acpi_object *links,
struct list_head *list,
struct fwnode_handle *parent)
{
bool ret = false;
int i;
for (i = 0; i < links->package.count; i++) {
const union acpi_object *link, *desc;
acpi_handle handle;
bool result;
link = &links->package.elements[i];
/* Only two elements allowed. */
if (link->package.count != 2)
continue;
/* The first one must be a string. */
if (link->package.elements[0].type != ACPI_TYPE_STRING)
continue;
/* The second one may be a string, a reference or a package. */
switch (link->package.elements[1].type) {
case ACPI_TYPE_STRING:
result = acpi_nondev_subnode_ok(scope, link, list,
parent);
break;
case ACPI_TYPE_LOCAL_REFERENCE:
handle = link->package.elements[1].reference.handle;
result = acpi_nondev_subnode_data_ok(handle, link, list,
parent);
break;
case ACPI_TYPE_PACKAGE:
desc = &link->package.elements[1];
result = acpi_nondev_subnode_extract(desc, NULL, link,
list, parent);
break;
default:
result = false;
break;
}
ret = ret || result;
}
return ret;
}
static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
const union acpi_object *desc,
struct acpi_device_data *data,
struct fwnode_handle *parent)
{
int i;
/* Look for the ACPI data subnodes GUID. */
for (i = 0; i < desc->package.count; i += 2) {
const union acpi_object *guid, *links;
guid = &desc->package.elements[i];
links = &desc->package.elements[i + 1];
/*
* The first element must be a GUID and the second one must be
* a package.
*/
if (guid->type != ACPI_TYPE_BUFFER ||
guid->buffer.length != 16 ||
links->type != ACPI_TYPE_PACKAGE)
break;
if (!guid_equal((guid_t *)guid->buffer.pointer, &ads_guid))
continue;
return acpi_add_nondev_subnodes(scope, links, &data->subnodes,
parent);
}
return false;
}
static bool acpi_property_value_ok(const union acpi_object *value)
{
int j;
/*
* The value must be an integer, a string, a reference, or a package
* whose every element must be an integer, a string, or a reference.
*/
switch (value->type) {
case ACPI_TYPE_INTEGER:
case ACPI_TYPE_STRING:
case ACPI_TYPE_LOCAL_REFERENCE:
return true;
case ACPI_TYPE_PACKAGE:
for (j = 0; j < value->package.count; j++)
switch (value->package.elements[j].type) {
case ACPI_TYPE_INTEGER:
case ACPI_TYPE_STRING:
case ACPI_TYPE_LOCAL_REFERENCE:
continue;
default:
return false;
}
return true;
}
return false;
}
static bool acpi_properties_format_valid(const union acpi_object *properties)
{
int i;
for (i = 0; i < properties->package.count; i++) {
const union acpi_object *property;
property = &properties->package.elements[i];
/*
* Only two elements allowed, the first one must be a string and
* the second one has to satisfy certain conditions.
*/
if (property->package.count != 2
|| property->package.elements[0].type != ACPI_TYPE_STRING
|| !acpi_property_value_ok(&property->package.elements[1]))
return false;
}
return true;
}
static void acpi_init_of_compatible(struct acpi_device *adev)
{
const union acpi_object *of_compatible;
int ret;
ret = acpi_data_get_property_array(&adev->data, "compatible",
ACPI_TYPE_STRING, &of_compatible);
if (ret) {
ret = acpi_dev_get_property(adev, "compatible",
ACPI_TYPE_STRING, &of_compatible);
if (ret) {
if (adev->parent
&& adev->parent->flags.of_compatible_ok)
goto out;
return;
}
}
adev->data.of_compatible = of_compatible;
out:
adev->flags.of_compatible_ok = 1;
}
static bool acpi_is_property_guid(const guid_t *guid)
{
int i;
for (i = 0; i < ARRAY_SIZE(prp_guids); i++) {
if (guid_equal(guid, &prp_guids[i]))
return true;
}
return false;
}
struct acpi_device_properties *
acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid,
const union acpi_object *properties)
{
struct acpi_device_properties *props;
props = kzalloc(sizeof(*props), GFP_KERNEL);
if (props) {
INIT_LIST_HEAD(&props->list);
props->guid = guid;
props->properties = properties;
list_add_tail(&props->list, &data->properties);
}
return props;
}
static bool acpi_extract_properties(const union acpi_object *desc,
struct acpi_device_data *data)
{
int i;
if (desc->package.count % 2)
return false;
/* Look for the device properties GUID. */
for (i = 0; i < desc->package.count; i += 2) {
const union acpi_object *guid, *properties;
guid = &desc->package.elements[i];
properties = &desc->package.elements[i + 1];
/*
* The first element must be a GUID and the second one must be
* a package.
*/
if (guid->type != ACPI_TYPE_BUFFER ||
guid->buffer.length != 16 ||
properties->type != ACPI_TYPE_PACKAGE)
break;
if (!acpi_is_property_guid((guid_t *)guid->buffer.pointer))
continue;
/*
* We found the matching GUID. Now validate the format of the
* package immediately following it.
*/
if (!acpi_properties_format_valid(properties))
continue;
acpi_data_add_props(data, (const guid_t *)guid->buffer.pointer,
properties);
}
return !list_empty(&data->properties);
}
void acpi_init_properties(struct acpi_device *adev)
{
struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER };
struct acpi_hardware_id *hwid;
acpi_status status;
bool acpi_of = false;
INIT_LIST_HEAD(&adev->data.properties);
INIT_LIST_HEAD(&adev->data.subnodes);
if (!adev->handle)
return;
/*
* Check if ACPI_DT_NAMESPACE_HID is present and inthat case we fill in
* Device Tree compatible properties for this device.
*/
list_for_each_entry(hwid, &adev->pnp.ids, list) {
if (!strcmp(hwid->id, ACPI_DT_NAMESPACE_HID)) {
acpi_of = true;
break;
}
}
status = acpi_evaluate_object_typed(adev->handle, "_DSD", NULL, &buf,
ACPI_TYPE_PACKAGE);
if (ACPI_FAILURE(status))
goto out;
if (acpi_extract_properties(buf.pointer, &adev->data)) {
adev->data.pointer = buf.pointer;
if (acpi_of)
acpi_init_of_compatible(adev);
}
if (acpi_enumerate_nondev_subnodes(adev->handle, buf.pointer,
&adev->data, acpi_fwnode_handle(adev)))
adev->data.pointer = buf.pointer;
if (!adev->data.pointer) {
acpi_handle_debug(adev->handle, "Invalid _DSD data, skipping\n");
ACPI_FREE(buf.pointer);
}
out:
if (acpi_of && !adev->flags.of_compatible_ok)
acpi_handle_info(adev->handle,
ACPI_DT_NAMESPACE_HID " requires 'compatible' property\n");
if (!adev->data.pointer)
acpi_extract_apple_properties(adev);
}
static void acpi_destroy_nondev_subnodes(struct list_head *list)
{
struct acpi_data_node *dn, *next;
if (list_empty(list))
return;
list_for_each_entry_safe_reverse(dn, next, list, sibling) {
acpi_destroy_nondev_subnodes(&dn->data.subnodes);
wait_for_completion(&dn->kobj_done);
list_del(&dn->sibling);
ACPI_FREE((void *)dn->data.pointer);
kfree(dn);
}
}
void acpi_free_properties(struct acpi_device *adev)
{
struct acpi_device_properties *props, *tmp;
acpi_destroy_nondev_subnodes(&adev->data.subnodes);
ACPI_FREE((void *)adev->data.pointer);
adev->data.of_compatible = NULL;
adev->data.pointer = NULL;
list_for_each_entry_safe(props, tmp, &adev->data.properties, list) {
list_del(&props->list);
kfree(props);
}
}
/**
* acpi_data_get_property - return an ACPI property with given name
* @data: ACPI device deta object to get the property from
* @name: Name of the property
* @type: Expected property type
* @obj: Location to store the property value (if not %NULL)
*
* Look up a property with @name and store a pointer to the resulting ACPI
* object at the location pointed to by @obj if found.
*
* Callers must not attempt to free the returned objects. These objects will be
* freed by the ACPI core automatically during the removal of @data.
*
* Return: %0 if property with @name has been found (success),
* %-EINVAL if the arguments are invalid,
* %-EINVAL if the property doesn't exist,
* %-EPROTO if the property value type doesn't match @type.
*/
static int acpi_data_get_property(const struct acpi_device_data *data,
const char *name, acpi_object_type type,
const union acpi_object **obj)
{
const struct acpi_device_properties *props;
if (!data || !name)
return -EINVAL;
if (!data->pointer || list_empty(&data->properties))
return -EINVAL;
list_for_each_entry(props, &data->properties, list) {
const union acpi_object *properties;
unsigned int i;
properties = props->properties;
for (i = 0; i < properties->package.count; i++) {
const union acpi_object *propname, *propvalue;
const union acpi_object *property;
property = &properties->package.elements[i];
propname = &property->package.elements[0];
propvalue = &property->package.elements[1];
if (!strcmp(name, propname->string.pointer)) {
if (type != ACPI_TYPE_ANY &&
propvalue->type != type)
return -EPROTO;
if (obj)
*obj = propvalue;
return 0;
}
}
}
return -EINVAL;
}
/**
* acpi_dev_get_property - return an ACPI property with given name.
* @adev: ACPI device to get the property from.
* @name: Name of the property.
* @type: Expected property type.
* @obj: Location to store the property value (if not %NULL).
*/
int acpi_dev_get_property(const struct acpi_device *adev, const char *name,
acpi_object_type type, const union acpi_object **obj)
{
return adev ? acpi_data_get_property(&adev->data, name, type, obj) : -EINVAL;
}
EXPORT_SYMBOL_GPL(acpi_dev_get_property);
static const struct acpi_device_data *
acpi_device_data_of_node(const struct fwnode_handle *fwnode)
{
if (is_acpi_device_node(fwnode)) {
const struct acpi_device *adev = to_acpi_device_node(fwnode);
return &adev->data;
} else if (is_acpi_data_node(fwnode)) {
const struct acpi_data_node *dn = to_acpi_data_node(fwnode);
return &dn->data;
}
return NULL;
}
/**
* acpi_node_prop_get - return an ACPI property with given name.
* @fwnode: Firmware node to get the property from.
* @propname: Name of the property.
* @valptr: Location to store a pointer to the property value (if not %NULL).
*/
int acpi_node_prop_get(const struct fwnode_handle *fwnode,
const char *propname, void **valptr)
{
return acpi_data_get_property(acpi_device_data_of_node(fwnode),
propname, ACPI_TYPE_ANY,
(const union acpi_object **)valptr);
}
/**
* acpi_data_get_property_array - return an ACPI array property with given name
* @data: ACPI data object to get the property from
* @name: Name of the property
* @type: Expected type of array elements
* @obj: Location to store a pointer to the property value (if not NULL)
*
* Look up an array property with @name and store a pointer to the resulting
* ACPI object at the location pointed to by @obj if found.
*
* Callers must not attempt to free the returned objects. Those objects will be
* freed by the ACPI core automatically during the removal of @data.
*
* Return: %0 if array property (package) with @name has been found (success),
* %-EINVAL if the arguments are invalid,
* %-EINVAL if the property doesn't exist,
* %-EPROTO if the property is not a package or the type of its elements
* doesn't match @type.
*/
static int acpi_data_get_property_array(const struct acpi_device_data *data,
const char *name,
acpi_object_type type,
const union acpi_object **obj)
{
const union acpi_object *prop;
int ret, i;
ret = acpi_data_get_property(data, name, ACPI_TYPE_PACKAGE, &prop);
if (ret)
return ret;
if (type != ACPI_TYPE_ANY) {
/* Check that all elements are of correct type. */
for (i = 0; i < prop->package.count; i++)
if (prop->package.elements[i].type != type)
return -EPROTO;
}
if (obj)
*obj = prop;
return 0;
}
static struct fwnode_handle *
acpi_fwnode_get_named_child_node(const struct fwnode_handle *fwnode,
const char *childname)
{
struct fwnode_handle *child;
fwnode_for_each_child_node(fwnode, child) {
if (is_acpi_data_node(child)) {
if (acpi_data_node_match(child, childname))
return child;
continue;
}
if (!strncmp(acpi_device_bid(to_acpi_device_node(child)),
childname, ACPI_NAMESEG_SIZE))
return child;
}
return NULL;
}
/**
* __acpi_node_get_property_reference - returns handle to the referenced object
* @fwnode: Firmware node to get the property from
* @propname: Name of the property
* @index: Index of the reference to return
* @num_args: Maximum number of arguments after each reference
* @args: Location to store the returned reference with optional arguments
*
* Find property with @name, verifify that it is a package containing at least
* one object reference and if so, store the ACPI device object pointer to the
* target object in @args->adev. If the reference includes arguments, store
* them in the @args->args[] array.
*
* If there's more than one reference in the property value package, @index is
* used to select the one to return.
*
* It is possible to leave holes in the property value set like in the
* example below:
*
* Package () {
* "cs-gpios",
* Package () {
* ^GPIO, 19, 0, 0,
* ^GPIO, 20, 0, 0,
* 0,
* ^GPIO, 21, 0, 0,
* }
* }
*
* Calling this function with index %2 or index %3 return %-ENOENT. If the
* property does not contain any more values %-ENOENT is returned. The NULL
* entry must be single integer and preferably contain value %0.
*
* Return: %0 on success, negative error code on failure.
*/
int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode,
const char *propname, size_t index, size_t num_args,
struct fwnode_reference_args *args)
{
const union acpi_object *element, *end;
const union acpi_object *obj;
const struct acpi_device_data *data;
struct acpi_device *device;
int ret, idx = 0;
data = acpi_device_data_of_node(fwnode);
if (!data)
return -ENOENT;
ret = acpi_data_get_property(data, propname, ACPI_TYPE_ANY, &obj);
if (ret)
return ret == -EINVAL ? -ENOENT : -EINVAL;
/*
* The simplest case is when the value is a single reference. Just
* return that reference then.
*/
if (obj->type == ACPI_TYPE_LOCAL_REFERENCE) {
if (index)
return -ENOENT;
ret = acpi_bus_get_device(obj->reference.handle, &device);
if (ret)
return ret == -ENODEV ? -EINVAL : ret;
args->fwnode = acpi_fwnode_handle(device);
args->nargs = 0;
return 0;
}
/*
* If it is not a single reference, then it is a package of
* references followed by number of ints as follows:
*
* Package () { REF, INT, REF, INT, INT }
*
* The index argument is then used to determine which reference
* the caller wants (along with the arguments).
*/
if (obj->type != ACPI_TYPE_PACKAGE)
return -EINVAL;
if (index >= obj->package.count)
return -ENOENT;
element = obj->package.elements;
end = element + obj->package.count;
while (element < end) {
u32 nargs, i;
if (element->type == ACPI_TYPE_LOCAL_REFERENCE) {
struct fwnode_handle *ref_fwnode;
ret = acpi_bus_get_device(element->reference.handle,
&device);
if (ret)
return -EINVAL;
nargs = 0;
element++;
/*
* Find the referred data extension node under the
* referred device node.
*/
for (ref_fwnode = acpi_fwnode_handle(device);
element < end && element->type == ACPI_TYPE_STRING;
element++) {
ref_fwnode = acpi_fwnode_get_named_child_node(
ref_fwnode, element->string.pointer);
if (!ref_fwnode)
return -EINVAL;
}
/* assume following integer elements are all args */
for (i = 0; element + i < end && i < num_args; i++) {
int type = element[i].type;
if (type == ACPI_TYPE_INTEGER)
nargs++;
else if (type == ACPI_TYPE_LOCAL_REFERENCE)
break;
else
return -EINVAL;
}
if (nargs > NR_FWNODE_REFERENCE_ARGS)
return -EINVAL;
if (idx == index) {
args->fwnode = ref_fwnode;
args->nargs = nargs;
for (i = 0; i < nargs; i++)
args->args[i] = element[i].integer.value;
return 0;
}
element += nargs;
} else if (element->type == ACPI_TYPE_INTEGER) {
if (idx == index)
return -ENOENT;
element++;
} else {
return -EINVAL;
}
idx++;
}
return -ENOENT;
}
EXPORT_SYMBOL_GPL(__acpi_node_get_property_reference);
static int acpi_data_prop_read_single(const struct acpi_device_data *data,
const char *propname,
enum dev_prop_type proptype, void *val)
{
const union acpi_object *obj;
int ret;
if (proptype >= DEV_PROP_U8 && proptype <= DEV_PROP_U64) {
ret = acpi_data_get_property(data, propname, ACPI_TYPE_INTEGER, &obj);
if (ret)
return ret;
switch (proptype) {
case DEV_PROP_U8:
if (obj->integer.value > U8_MAX)
return -EOVERFLOW;
if (val)
*(u8 *)val = obj->integer.value;
break;
case DEV_PROP_U16:
if (obj->integer.value > U16_MAX)
return -EOVERFLOW;
if (val)
*(u16 *)val = obj->integer.value;
break;
case DEV_PROP_U32:
if (obj->integer.value > U32_MAX)
return -EOVERFLOW;
if (val)
*(u32 *)val = obj->integer.value;
break;
default:
if (val)
*(u64 *)val = obj->integer.value;
break;
}
if (!val)
return 1;
} else if (proptype == DEV_PROP_STRING) {
ret = acpi_data_get_property(data, propname, ACPI_TYPE_STRING, &obj);
if (ret)
return ret;
if (val)
*(char **)val = obj->string.pointer;
return 1;
} else {
ret = -EINVAL;
}
return ret;
}
static int acpi_copy_property_array_u8(const union acpi_object *items, u8 *val,
size_t nval)
{
int i;
for (i = 0; i < nval; i++) {
if (items[i].type != ACPI_TYPE_INTEGER)
return -EPROTO;
if (items[i].integer.value > U8_MAX)
return -EOVERFLOW;
val[i] = items[i].integer.value;
}
return 0;
}
static int acpi_copy_property_array_u16(const union acpi_object *items,
u16 *val, size_t nval)
{
int i;
for (i = 0; i < nval; i++) {
if (items[i].type != ACPI_TYPE_INTEGER)
return -EPROTO;
if (items[i].integer.value > U16_MAX)
return -EOVERFLOW;
val[i] = items[i].integer.value;
}
return 0;
}
static int acpi_copy_property_array_u32(const union acpi_object *items,
u32 *val, size_t nval)
{
int i;
for (i = 0; i < nval; i++) {
if (items[i].type != ACPI_TYPE_INTEGER)
return -EPROTO;
if (items[i].integer.value > U32_MAX)
return -EOVERFLOW;
val[i] = items[i].integer.value;
}
return 0;
}
static int acpi_copy_property_array_u64(const union acpi_object *items,
u64 *val, size_t nval)
{
int i;
for (i = 0; i < nval; i++) {
if (items[i].type != ACPI_TYPE_INTEGER)
return -EPROTO;
val[i] = items[i].integer.value;
}
return 0;
}
static int acpi_copy_property_array_string(const union acpi_object *items,
char **val, size_t nval)
{
int i;
for (i = 0; i < nval; i++) {
if (items[i].type != ACPI_TYPE_STRING)
return -EPROTO;
val[i] = items[i].string.pointer;
}
return nval;
}
static int acpi_data_prop_read(const struct acpi_device_data *data,
const char *propname,
enum dev_prop_type proptype,
void *val, size_t nval)
{
const union acpi_object *obj;
const union acpi_object *items;
int ret;
if (nval == 1 || !val) {
ret = acpi_data_prop_read_single(data, propname, proptype, val);
/*
* The overflow error means that the property is there and it is
* single-value, but its type does not match, so return.
*/
if (ret >= 0 || ret == -EOVERFLOW)
return ret;
/*
* Reading this property as a single-value one failed, but its
* value may still be represented as one-element array, so
* continue.
*/
}
ret = acpi_data_get_property_array(data, propname, ACPI_TYPE_ANY, &obj);
if (ret)
return ret;
if (!val)
return obj->package.count;
if (proptype != DEV_PROP_STRING && nval > obj->package.count)
return -EOVERFLOW;
else if (nval <= 0)
return -EINVAL;
items = obj->package.elements;
switch (proptype) {
case DEV_PROP_U8:
ret = acpi_copy_property_array_u8(items, (u8 *)val, nval);
break;
case DEV_PROP_U16:
ret = acpi_copy_property_array_u16(items, (u16 *)val, nval);
break;
case DEV_PROP_U32:
ret = acpi_copy_property_array_u32(items, (u32 *)val, nval);
break;
case DEV_PROP_U64:
ret = acpi_copy_property_array_u64(items, (u64 *)val, nval);
break;
case DEV_PROP_STRING:
ret = acpi_copy_property_array_string(
items, (char **)val,
min_t(u32, nval, obj->package.count));
break;
default:
ret = -EINVAL;
break;
}
return ret;
}
/**
* acpi_node_prop_read - retrieve the value of an ACPI property with given name.
* @fwnode: Firmware node to get the property from.
* @propname: Name of the property.
* @proptype: Expected property type.
* @val: Location to store the property value (if not %NULL).
* @nval: Size of the array pointed to by @val.
*
* If @val is %NULL, return the number of array elements comprising the value
* of the property. Otherwise, read at most @nval values to the array at the
* location pointed to by @val.
*/
static int acpi_node_prop_read(const struct fwnode_handle *fwnode,
const char *propname, enum dev_prop_type proptype,
void *val, size_t nval)
{
return acpi_data_prop_read(acpi_device_data_of_node(fwnode),
propname, proptype, val, nval);
}
/**
* acpi_get_next_subnode - Return the next child node handle for a fwnode
* @fwnode: Firmware node to find the next child node for.
* @child: Handle to one of the device's child nodes or a null handle.
*/
struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode,
struct fwnode_handle *child)
{
const struct acpi_device *adev = to_acpi_device_node(fwnode);
const struct list_head *head;
struct list_head *next;
if (!child || is_acpi_device_node(child)) {
struct acpi_device *child_adev;
if (adev)
head = &adev->children;
else
goto nondev;
if (list_empty(head))
goto nondev;
if (child) {
adev = to_acpi_device_node(child);
next = adev->node.next;
if (next == head) {
child = NULL;
goto nondev;
}
child_adev = list_entry(next, struct acpi_device, node);
} else {
child_adev = list_first_entry(head, struct acpi_device,
node);
}
return acpi_fwnode_handle(child_adev);
}
nondev:
if (!child || is_acpi_data_node(child)) {
const struct acpi_data_node *data = to_acpi_data_node(fwnode);
struct acpi_data_node *dn;
/*
* We can have a combination of device and data nodes, e.g. with
* hierarchical _DSD properties. Make sure the adev pointer is
* restored before going through data nodes, otherwise we will
* be looking for data_nodes below the last device found instead
* of the common fwnode shared by device_nodes and data_nodes.
*/
adev = to_acpi_device_node(fwnode);
if (adev)
head = &adev->data.subnodes;
else if (data)
head = &data->data.subnodes;
else
return NULL;
if (list_empty(head))
return NULL;
if (child) {
dn = to_acpi_data_node(child);
next = dn->sibling.next;
if (next == head)
return NULL;
dn = list_entry(next, struct acpi_data_node, sibling);
} else {
dn = list_first_entry(head, struct acpi_data_node, sibling);
}
return &dn->fwnode;
}
return NULL;
}
/**
* acpi_node_get_parent - Return parent fwnode of this fwnode
* @fwnode: Firmware node whose parent to get
*
* Returns parent node of an ACPI device or data firmware node or %NULL if
* not available.
*/
struct fwnode_handle *acpi_node_get_parent(const struct fwnode_handle *fwnode)
{
if (is_acpi_data_node(fwnode)) {
/* All data nodes have parent pointer so just return that */
return to_acpi_data_node(fwnode)->parent;
} else if (is_acpi_device_node(fwnode)) {
struct device *dev = to_acpi_device_node(fwnode)->dev.parent;
if (dev)
return acpi_fwnode_handle(to_acpi_device(dev));
}
return NULL;
}
/*
* Return true if the node is an ACPI graph node. Called on either ports
* or endpoints.
*/
static bool is_acpi_graph_node(struct fwnode_handle *fwnode,
const char *str)
{
unsigned int len = strlen(str);
const char *name;
if (!len || !is_acpi_data_node(fwnode))
return false;
name = to_acpi_data_node(fwnode)->name;
return (fwnode_property_present(fwnode, "reg") &&
!strncmp(name, str, len) && name[len] == '@') ||
fwnode_property_present(fwnode, str);
}
/**
* acpi_graph_get_next_endpoint - Get next endpoint ACPI firmware node
* @fwnode: Pointer to the parent firmware node
* @prev: Previous endpoint node or %NULL to get the first
*
* Looks up next endpoint ACPI firmware node below a given @fwnode. Returns
* %NULL if there is no next endpoint or in case of error. In case of success
* the next endpoint is returned.
*/
static struct fwnode_handle *acpi_graph_get_next_endpoint(
const struct fwnode_handle *fwnode, struct fwnode_handle *prev)
{
struct fwnode_handle *port = NULL;
struct fwnode_handle *endpoint;
if (!prev) {
do {
port = fwnode_get_next_child_node(fwnode, port);
/*
* The names of the port nodes begin with "port@"
* followed by the number of the port node and they also
* have a "reg" property that also has the number of the
* port node. For compatibility reasons a node is also
* recognised as a port node from the "port" property.
*/
if (is_acpi_graph_node(port, "port"))
break;
} while (port);
} else {
port = fwnode_get_parent(prev);
}
if (!port)
return NULL;
endpoint = fwnode_get_next_child_node(port, prev);
while (!endpoint) {
port = fwnode_get_next_child_node(fwnode, port);
if (!port)
break;
if (is_acpi_graph_node(port, "port"))
endpoint = fwnode_get_next_child_node(port, NULL);
}
/*
* The names of the endpoint nodes begin with "endpoint@" followed by
* the number of the endpoint node and they also have a "reg" property
* that also has the number of the endpoint node. For compatibility
* reasons a node is also recognised as an endpoint node from the
* "endpoint" property.
*/
if (!is_acpi_graph_node(endpoint, "endpoint"))
return NULL;
return endpoint;
}
/**
* acpi_graph_get_child_prop_value - Return a child with a given property value
* @fwnode: device fwnode
* @prop_name: The name of the property to look for
* @val: the desired property value
*
* Return the port node corresponding to a given port number. Returns
* the child node on success, NULL otherwise.
*/
static struct fwnode_handle *acpi_graph_get_child_prop_value(
const struct fwnode_handle *fwnode, const char *prop_name,
unsigned int val)
{
struct fwnode_handle *child;
fwnode_for_each_child_node(fwnode, child) {
u32 nr;
if (fwnode_property_read_u32(child, prop_name, &nr))
continue;
if (val == nr)
return child;
}
return NULL;
}
/**
* acpi_graph_get_remote_endpoint - Parses and returns remote end of an endpoint
* @__fwnode: Endpoint firmware node pointing to a remote device
*
* Returns the remote endpoint corresponding to @__fwnode. NULL on error.
*/
static struct fwnode_handle *
acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode)
{
struct fwnode_handle *fwnode;
unsigned int port_nr, endpoint_nr;
struct fwnode_reference_args args;
int ret;
memset(&args, 0, sizeof(args));
ret = acpi_node_get_property_reference(__fwnode, "remote-endpoint", 0,
&args);
if (ret)
return NULL;
/* Direct endpoint reference? */
if (!is_acpi_device_node(args.fwnode))
return args.nargs ? NULL : args.fwnode;
/*
* Always require two arguments with the reference: port and
* endpoint indices.
*/
if (args.nargs != 2)
return NULL;
fwnode = args.fwnode;
port_nr = args.args[0];
endpoint_nr = args.args[1];
fwnode = acpi_graph_get_child_prop_value(fwnode, "port", port_nr);
return acpi_graph_get_child_prop_value(fwnode, "endpoint", endpoint_nr);
}
static bool acpi_fwnode_device_is_available(const struct fwnode_handle *fwnode)
{
if (!is_acpi_device_node(fwnode))
return false;
return acpi_device_is_present(to_acpi_device_node(fwnode));
}
static bool acpi_fwnode_property_present(const struct fwnode_handle *fwnode,
const char *propname)
{
return !acpi_node_prop_get(fwnode, propname, NULL);
}
static int
acpi_fwnode_property_read_int_array(const struct fwnode_handle *fwnode,
const char *propname,
unsigned int elem_size, void *val,
size_t nval)
{
enum dev_prop_type type;
switch (elem_size) {
case sizeof(u8):
type = DEV_PROP_U8;
break;
case sizeof(u16):
type = DEV_PROP_U16;
break;
case sizeof(u32):
type = DEV_PROP_U32;
break;
case sizeof(u64):
type = DEV_PROP_U64;
break;
default:
return -ENXIO;
}
return acpi_node_prop_read(fwnode, propname, type, val, nval);
}
static int
acpi_fwnode_property_read_string_array(const struct fwnode_handle *fwnode,
const char *propname, const char **val,
size_t nval)
{
return acpi_node_prop_read(fwnode, propname, DEV_PROP_STRING,
val, nval);
}
static int
acpi_fwnode_get_reference_args(const struct fwnode_handle *fwnode,
const char *prop, const char *nargs_prop,
unsigned int args_count, unsigned int index,
struct fwnode_reference_args *args)
{
return __acpi_node_get_property_reference(fwnode, prop, index,
args_count, args);
}
static const char *acpi_fwnode_get_name(const struct fwnode_handle *fwnode)
{
const struct acpi_device *adev;
struct fwnode_handle *parent;
/* Is this the root node? */
parent = fwnode_get_parent(fwnode);
if (!parent)
return "\\";
fwnode_handle_put(parent);
if (is_acpi_data_node(fwnode)) {
const struct acpi_data_node *dn = to_acpi_data_node(fwnode);
return dn->name;
}
adev = to_acpi_device_node(fwnode);
if (WARN_ON(!adev))
return NULL;
return acpi_device_bid(adev);
}
static const char *
acpi_fwnode_get_name_prefix(const struct fwnode_handle *fwnode)
{
struct fwnode_handle *parent;
/* Is this the root node? */
parent = fwnode_get_parent(fwnode);
if (!parent)
return "";
/* Is this 2nd node from the root? */
parent = fwnode_get_next_parent(parent);
if (!parent)
return "";
fwnode_handle_put(parent);
/* ACPI device or data node. */
return ".";
}
static struct fwnode_handle *
acpi_fwnode_get_parent(struct fwnode_handle *fwnode)
{
return acpi_node_get_parent(fwnode);
}
static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode,
struct fwnode_endpoint *endpoint)
{
struct fwnode_handle *port_fwnode = fwnode_get_parent(fwnode);
endpoint->local_fwnode = fwnode;
if (fwnode_property_read_u32(port_fwnode, "reg", &endpoint->port))
fwnode_property_read_u32(port_fwnode, "port", &endpoint->port);
if (fwnode_property_read_u32(fwnode, "reg", &endpoint->id))
fwnode_property_read_u32(fwnode, "endpoint", &endpoint->id);
return 0;
}
static const void *
acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode,
const struct device *dev)
{
return acpi_device_get_match_data(dev);
}
#define DECLARE_ACPI_FWNODE_OPS(ops) \
const struct fwnode_operations ops = { \
.device_is_available = acpi_fwnode_device_is_available, \
.device_get_match_data = acpi_fwnode_device_get_match_data, \
.property_present = acpi_fwnode_property_present, \
.property_read_int_array = \
acpi_fwnode_property_read_int_array, \
.property_read_string_array = \
acpi_fwnode_property_read_string_array, \
.get_parent = acpi_node_get_parent, \
.get_next_child_node = acpi_get_next_subnode, \
.get_named_child_node = acpi_fwnode_get_named_child_node, \
.get_name = acpi_fwnode_get_name, \
.get_name_prefix = acpi_fwnode_get_name_prefix, \
.get_reference_args = acpi_fwnode_get_reference_args, \
.graph_get_next_endpoint = \
acpi_graph_get_next_endpoint, \
.graph_get_remote_endpoint = \
acpi_graph_get_remote_endpoint, \
.graph_get_port_parent = acpi_fwnode_get_parent, \
.graph_parse_endpoint = acpi_fwnode_graph_parse_endpoint, \
}; \
EXPORT_SYMBOL_GPL(ops)
DECLARE_ACPI_FWNODE_OPS(acpi_device_fwnode_ops);
DECLARE_ACPI_FWNODE_OPS(acpi_data_fwnode_ops);
const struct fwnode_operations acpi_static_fwnode_ops;
bool is_acpi_device_node(const struct fwnode_handle *fwnode)
{
return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_device_fwnode_ops;
}
EXPORT_SYMBOL(is_acpi_device_node);
bool is_acpi_data_node(const struct fwnode_handle *fwnode)
{
return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops;
}
EXPORT_SYMBOL(is_acpi_data_node);
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* pm_runtime.h - Device run-time power management helper functions.
*
* Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>
*/
#ifndef _LINUX_PM_RUNTIME_H
#define _LINUX_PM_RUNTIME_H
#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/pm.h>
#include <linux/jiffies.h>
/* Runtime PM flag argument bits */
#define RPM_ASYNC 0x01 /* Request is asynchronous */
#define RPM_NOWAIT 0x02 /* Don't wait for concurrent
state change */
#define RPM_GET_PUT 0x04 /* Increment/decrement the
usage_count */
#define RPM_AUTO 0x08 /* Use autosuspend_delay */
#ifdef CONFIG_PM
extern struct workqueue_struct *pm_wq;
static inline bool queue_pm_work(struct work_struct *work)
{
return queue_work(pm_wq, work);
}
extern int pm_generic_runtime_suspend(struct device *dev);
extern int pm_generic_runtime_resume(struct device *dev);
extern int pm_runtime_force_suspend(struct device *dev);
extern int pm_runtime_force_resume(struct device *dev);
extern int __pm_runtime_idle(struct device *dev, int rpmflags);
extern int __pm_runtime_suspend(struct device *dev, int rpmflags);
extern int __pm_runtime_resume(struct device *dev, int rpmflags);
extern int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count);
extern int pm_schedule_suspend(struct device *dev, unsigned int delay);
extern int __pm_runtime_set_status(struct device *dev, unsigned int status);
extern int pm_runtime_barrier(struct device *dev);
extern void pm_runtime_enable(struct device *dev);
extern void __pm_runtime_disable(struct device *dev, bool check_resume);
extern void pm_runtime_allow(struct device *dev);
extern void pm_runtime_forbid(struct device *dev);
extern void pm_runtime_no_callbacks(struct device *dev);
extern void pm_runtime_irq_safe(struct device *dev);
extern void __pm_runtime_use_autosuspend(struct device *dev, bool use);
extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
extern u64 pm_runtime_autosuspend_expiration(struct device *dev);
extern void pm_runtime_update_max_time_suspended(struct device *dev,
s64 delta_ns);
extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable);
extern void pm_runtime_get_suppliers(struct device *dev);
extern void pm_runtime_put_suppliers(struct device *dev);
extern void pm_runtime_new_link(struct device *dev);
extern void pm_runtime_drop_link(struct device_link *link);
extern void pm_runtime_release_supplier(struct device_link *link, bool check_idle);
extern int devm_pm_runtime_enable(struct device *dev);
/**
* pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter.
* @dev: Target device.
*
* Increment the runtime PM usage counter of @dev if its runtime PM status is
* %RPM_ACTIVE and its runtime PM usage counter is greater than 0.
*/
static inline int pm_runtime_get_if_in_use(struct device *dev)
{
return pm_runtime_get_if_active(dev, false);
}
/**
* pm_suspend_ignore_children - Set runtime PM behavior regarding children.
* @dev: Target device.
* @enable: Whether or not to ignore possible dependencies on children.
*
* The dependencies of @dev on its children will not be taken into account by
* the runtime PM framework going forward if @enable is %true, or they will
* be taken into account otherwise.
*/
static inline void pm_suspend_ignore_children(struct device *dev, bool enable)
{
dev->power.ignore_children = enable;
}
/**
* pm_runtime_get_noresume - Bump up runtime PM usage counter of a device.
* @dev: Target device.
*/
static inline void pm_runtime_get_noresume(struct device *dev)
{
atomic_inc(&dev->power.usage_count);
}
/**
* pm_runtime_put_noidle - Drop runtime PM usage counter of a device.
* @dev: Target device.
*
* Decrement the runtime PM usage counter of @dev unless it is 0 already.
*/
static inline void pm_runtime_put_noidle(struct device *dev)
{
atomic_add_unless(&dev->power.usage_count, -1, 0);
}
/**
* pm_runtime_suspended - Check whether or not a device is runtime-suspended.
* @dev: Target device.
*
* Return %true if runtime PM is enabled for @dev and its runtime PM status is
* %RPM_SUSPENDED, or %false otherwise.
*
* Note that the return value of this function can only be trusted if it is
* called under the runtime PM lock of @dev or under conditions in which
* runtime PM cannot be either disabled or enabled for @dev and its runtime PM
* status cannot change.
*/
static inline bool pm_runtime_suspended(struct device *dev)
{
return dev->power.runtime_status == RPM_SUSPENDED
&& !dev->power.disable_depth;
}
/**
* pm_runtime_active - Check whether or not a device is runtime-active.
* @dev: Target device.
*
* Return %true if runtime PM is disabled for @dev or its runtime PM status is
* %RPM_ACTIVE, or %false otherwise.
*
* Note that the return value of this function can only be trusted if it is
* called under the runtime PM lock of @dev or under conditions in which
* runtime PM cannot be either disabled or enabled for @dev and its runtime PM
* status cannot change.
*/
static inline bool pm_runtime_active(struct device *dev)
{
return dev->power.runtime_status == RPM_ACTIVE
|| dev->power.disable_depth;
}
/**
* pm_runtime_status_suspended - Check if runtime PM status is "suspended".
* @dev: Target device.
*
* Return %true if the runtime PM status of @dev is %RPM_SUSPENDED, or %false
* otherwise, regardless of whether or not runtime PM has been enabled for @dev.
*
* Note that the return value of this function can only be trusted if it is
* called under the runtime PM lock of @dev or under conditions in which the
* runtime PM status of @dev cannot change.
*/
static inline bool pm_runtime_status_suspended(struct device *dev)
{
return dev->power.runtime_status == RPM_SUSPENDED;
}
/**
* pm_runtime_enabled - Check if runtime PM is enabled.
* @dev: Target device.
*
* Return %true if runtime PM is enabled for @dev or %false otherwise.
*
* Note that the return value of this function can only be trusted if it is
* called under the runtime PM lock of @dev or under conditions in which
* runtime PM cannot be either disabled or enabled for @dev.
*/
static inline bool pm_runtime_enabled(struct device *dev)
{
return !dev->power.disable_depth;
}
/**
* pm_runtime_has_no_callbacks - Check if runtime PM callbacks may be present.
* @dev: Target device.
*
* Return %true if @dev is a special device without runtime PM callbacks or
* %false otherwise.
*/
static inline bool pm_runtime_has_no_callbacks(struct device *dev)
{
return dev->power.no_callbacks;
}
/**
* pm_runtime_mark_last_busy - Update the last access time of a device.
* @dev: Target device.
*
* Update the last access time of @dev used by the runtime PM autosuspend
* mechanism to the current time as returned by ktime_get_mono_fast_ns().
*/
static inline void pm_runtime_mark_last_busy(struct device *dev)
{
WRITE_ONCE(dev->power.last_busy, ktime_get_mono_fast_ns());
}
/**
* pm_runtime_is_irq_safe - Check if runtime PM can work in interrupt context.
* @dev: Target device.
*
* Return %true if @dev has been marked as an "IRQ-safe" device (with respect
* to runtime PM), in which case its runtime PM callabcks can be expected to
* work correctly when invoked from interrupt handlers.
*/
static inline bool pm_runtime_is_irq_safe(struct device *dev)
{
return dev->power.irq_safe;
}
extern u64 pm_runtime_suspended_time(struct device *dev);
#else /* !CONFIG_PM */
static inline bool queue_pm_work(struct work_struct *work) { return false; }
static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; }
static inline int pm_generic_runtime_resume(struct device *dev) { return 0; }
static inline int pm_runtime_force_suspend(struct device *dev) { return 0; }
static inline int pm_runtime_force_resume(struct device *dev) { return 0; }
static inline int __pm_runtime_idle(struct device *dev, int rpmflags)
{
return -ENOSYS;
}
static inline int __pm_runtime_suspend(struct device *dev, int rpmflags)
{
return -ENOSYS;
}
static inline int __pm_runtime_resume(struct device *dev, int rpmflags)
{
return 1;
}
static inline int pm_schedule_suspend(struct device *dev, unsigned int delay)
{
return -ENOSYS;
}
static inline int pm_runtime_get_if_in_use(struct device *dev)
{
return -EINVAL;
}
static inline int pm_runtime_get_if_active(struct device *dev,
bool ign_usage_count)
{
return -EINVAL;
}
static inline int __pm_runtime_set_status(struct device *dev,
unsigned int status) { return 0; }
static inline int pm_runtime_barrier(struct device *dev) { return 0; }
static inline void pm_runtime_enable(struct device *dev) {}
static inline void __pm_runtime_disable(struct device *dev, bool c) {}
static inline void pm_runtime_allow(struct device *dev) {}
static inline void pm_runtime_forbid(struct device *dev) {}
static inline int devm_pm_runtime_enable(struct device *dev) { return 0; }
static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {}
static inline void pm_runtime_get_noresume(struct device *dev) {}
static inline void pm_runtime_put_noidle(struct device *dev) {}
static inline bool pm_runtime_suspended(struct device *dev) { return false; }
static inline bool pm_runtime_active(struct device *dev) { return true; }
static inline bool pm_runtime_status_suspended(struct device *dev) { return false; }
static inline bool pm_runtime_enabled(struct device *dev) { return false; }
static inline void pm_runtime_no_callbacks(struct device *dev) {}
static inline void pm_runtime_irq_safe(struct device *dev) {}
static inline bool pm_runtime_is_irq_safe(struct device *dev) { return false; }
static inline bool pm_runtime_has_no_callbacks(struct device *dev) { return false; }
static inline void pm_runtime_mark_last_busy(struct device *dev) {}
static inline void __pm_runtime_use_autosuspend(struct device *dev,
bool use) {}
static inline void pm_runtime_set_autosuspend_delay(struct device *dev,
int delay) {}
static inline u64 pm_runtime_autosuspend_expiration(
struct device *dev) { return 0; }
static inline void pm_runtime_set_memalloc_noio(struct device *dev,
bool enable){}
static inline void pm_runtime_get_suppliers(struct device *dev) {}
static inline void pm_runtime_put_suppliers(struct device *dev) {}
static inline void pm_runtime_new_link(struct device *dev) {}
static inline void pm_runtime_drop_link(struct device_link *link) {}
static inline void pm_runtime_release_supplier(struct device_link *link,
bool check_idle) {}
#endif /* !CONFIG_PM */
/**
* pm_runtime_idle - Conditionally set up autosuspend of a device or suspend it.
* @dev: Target device.
*
* Invoke the "idle check" callback of @dev and, depending on its return value,
* set up autosuspend of @dev or suspend it (depending on whether or not
* autosuspend has been enabled for it).
*/
static inline int pm_runtime_idle(struct device *dev)
{
return __pm_runtime_idle(dev, 0);
}
/**
* pm_runtime_suspend - Suspend a device synchronously.
* @dev: Target device.
*/
static inline int pm_runtime_suspend(struct device *dev)
{
return __pm_runtime_suspend(dev, 0);
}
/**
* pm_runtime_autosuspend - Set up autosuspend of a device or suspend it.
* @dev: Target device.
*
* Set up autosuspend of @dev or suspend it (depending on whether or not
* autosuspend is enabled for it) without engaging its "idle check" callback.
*/
static inline int pm_runtime_autosuspend(struct device *dev)
{
return __pm_runtime_suspend(dev, RPM_AUTO);
}
/**
* pm_runtime_resume - Resume a device synchronously.
* @dev: Target device.
*/
static inline int pm_runtime_resume(struct device *dev)
{
return __pm_runtime_resume(dev, 0);
}
/**
* pm_request_idle - Queue up "idle check" execution for a device.
* @dev: Target device.
*
* Queue up a work item to run an equivalent of pm_runtime_idle() for @dev
* asynchronously.
*/
static inline int pm_request_idle(struct device *dev)
{
return __pm_runtime_idle(dev, RPM_ASYNC);
}
/**
* pm_request_resume - Queue up runtime-resume of a device.
* @dev: Target device.
*/
static inline int pm_request_resume(struct device *dev)
{
return __pm_runtime_resume(dev, RPM_ASYNC);
}
/**
* pm_request_autosuspend - Queue up autosuspend of a device.
* @dev: Target device.
*
* Queue up a work item to run an equivalent pm_runtime_autosuspend() for @dev
* asynchronously.
*/
static inline int pm_request_autosuspend(struct device *dev)
{
return __pm_runtime_suspend(dev, RPM_ASYNC | RPM_AUTO);
}
/**
* pm_runtime_get - Bump up usage counter and queue up resume of a device.
* @dev: Target device.
*
* Bump up the runtime PM usage counter of @dev and queue up a work item to
* carry out runtime-resume of it.
*/
static inline int pm_runtime_get(struct device *dev)
{
return __pm_runtime_resume(dev, RPM_GET_PUT | RPM_ASYNC);
}
/**
* pm_runtime_get_sync - Bump up usage counter of a device and resume it.
* @dev: Target device.
*
* Bump up the runtime PM usage counter of @dev and carry out runtime-resume of
* it synchronously.
*
* The possible return values of this function are the same as for
* pm_runtime_resume() and the runtime PM usage counter of @dev remains
* incremented in all cases, even if it returns an error code.
* Consider using pm_runtime_resume_and_get() instead of it, especially
* if its return value is checked by the caller, as this is likely to result
* in cleaner code.
*/
static inline int pm_runtime_get_sync(struct device *dev)
{
return __pm_runtime_resume(dev, RPM_GET_PUT);
}
/**
* pm_runtime_resume_and_get - Bump up usage counter of a device and resume it.
* @dev: Target device.
*
* Resume @dev synchronously and if that is successful, increment its runtime
* PM usage counter. Return 0 if the runtime PM usage counter of @dev has been
* incremented or a negative error code otherwise.
*/
static inline int pm_runtime_resume_and_get(struct device *dev)
{
int ret;
ret = __pm_runtime_resume(dev, RPM_GET_PUT);
if (ret < 0) {
pm_runtime_put_noidle(dev);
return ret;
}
return 0;
}
/**
* pm_runtime_put - Drop device usage counter and queue up "idle check" if 0.
* @dev: Target device.
*
* Decrement the runtime PM usage counter of @dev and if it turns out to be
* equal to 0, queue up a work item for @dev like in pm_request_idle().
*/
static inline int pm_runtime_put(struct device *dev)
{
return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC);
}
/**
* pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0.
* @dev: Target device.
*
* Decrement the runtime PM usage counter of @dev and if it turns out to be
* equal to 0, queue up a work item for @dev like in pm_request_autosuspend().
*/
static inline int pm_runtime_put_autosuspend(struct device *dev)
{
return __pm_runtime_suspend(dev,
RPM_GET_PUT | RPM_ASYNC | RPM_AUTO);
}
/**
* pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0.
* @dev: Target device.
*
* Decrement the runtime PM usage counter of @dev and if it turns out to be
* equal to 0, invoke the "idle check" callback of @dev and, depending on its
* return value, set up autosuspend of @dev or suspend it (depending on whether
* or not autosuspend has been enabled for it).
*
* The possible return values of this function are the same as for
* pm_runtime_idle() and the runtime PM usage counter of @dev remains
* decremented in all cases, even if it returns an error code.
*/
static inline int pm_runtime_put_sync(struct device *dev)
{
return __pm_runtime_idle(dev, RPM_GET_PUT);
}
/**
* pm_runtime_put_sync_suspend - Drop device usage counter and suspend if 0.
* @dev: Target device.
*
* Decrement the runtime PM usage counter of @dev and if it turns out to be
* equal to 0, carry out runtime-suspend of @dev synchronously.
*
* The possible return values of this function are the same as for
* pm_runtime_suspend() and the runtime PM usage counter of @dev remains
* decremented in all cases, even if it returns an error code.
*/
static inline int pm_runtime_put_sync_suspend(struct device *dev)
{
return __pm_runtime_suspend(dev, RPM_GET_PUT);
}
/**
* pm_runtime_put_sync_autosuspend - Drop device usage counter and autosuspend if 0.
* @dev: Target device.
*
* Decrement the runtime PM usage counter of @dev and if it turns out to be
* equal to 0, set up autosuspend of @dev or suspend it synchronously (depending
* on whether or not autosuspend has been enabled for it).
*
* The possible return values of this function are the same as for
* pm_runtime_autosuspend() and the runtime PM usage counter of @dev remains
* decremented in all cases, even if it returns an error code.
*/
static inline int pm_runtime_put_sync_autosuspend(struct device *dev)
{
return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_AUTO);
}
/**
* pm_runtime_set_active - Set runtime PM status to "active".
* @dev: Target device.
*
* Set the runtime PM status of @dev to %RPM_ACTIVE and ensure that dependencies
* of it will be taken into account.
*
* It is not valid to call this function for devices with runtime PM enabled.
*/
static inline int pm_runtime_set_active(struct device *dev)
{
return __pm_runtime_set_status(dev, RPM_ACTIVE);
}
/**
* pm_runtime_set_suspended - Set runtime PM status to "suspended".
* @dev: Target device.
*
* Set the runtime PM status of @dev to %RPM_SUSPENDED and ensure that
* dependencies of it will be taken into account.
*
* It is not valid to call this function for devices with runtime PM enabled.
*/
static inline int pm_runtime_set_suspended(struct device *dev)
{
return __pm_runtime_set_status(dev, RPM_SUSPENDED);
}
/**
* pm_runtime_disable - Disable runtime PM for a device.
* @dev: Target device.
*
* Prevent the runtime PM framework from working with @dev (by incrementing its
* "blocking" counter).
*
* For each invocation of this function for @dev there must be a matching
* pm_runtime_enable() call in order for runtime PM to be enabled for it.
*/
static inline void pm_runtime_disable(struct device *dev)
{
__pm_runtime_disable(dev, true);
}
/**
* pm_runtime_use_autosuspend - Allow autosuspend to be used for a device.
* @dev: Target device.
*
* Allow the runtime PM autosuspend mechanism to be used for @dev whenever
* requested (or "autosuspend" will be handled as direct runtime-suspend for
* it).
*/
static inline void pm_runtime_use_autosuspend(struct device *dev)
{
__pm_runtime_use_autosuspend(dev, true);
}
/**
* pm_runtime_dont_use_autosuspend - Prevent autosuspend from being used.
* @dev: Target device.
*
* Prevent the runtime PM autosuspend mechanism from being used for @dev which
* means that "autosuspend" will be handled as direct runtime-suspend for it
* going forward.
*/
static inline void pm_runtime_dont_use_autosuspend(struct device *dev)
{
__pm_runtime_use_autosuspend(dev, false);
}
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_COMPLETION_H
#define __LINUX_COMPLETION_H
/*
* (C) Copyright 2001 Linus Torvalds
*
* Atomic wait-for-completion handler data structures.
* See kernel/sched/completion.c for details.
*/
#include <linux/swait.h>
/*
* struct completion - structure used to maintain state for a "completion"
*
* This is the opaque structure used to maintain the state for a "completion".
* Completions currently use a FIFO to queue threads that have to wait for
* the "completion" event.
*
* See also: complete(), wait_for_completion() (and friends _timeout,
* _interruptible, _interruptible_timeout, and _killable), init_completion(),
* reinit_completion(), and macros DECLARE_COMPLETION(),
* DECLARE_COMPLETION_ONSTACK().
*/
struct completion {
unsigned int done;
struct swait_queue_head wait;
};
#define init_completion_map(x, m) init_completion(x)
static inline void complete_acquire(struct completion *x) {}
static inline void complete_release(struct completion *x) {}
#define COMPLETION_INITIALIZER(work) \
{ 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
#define COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) \
(*({ init_completion_map(&(work), &(map)); &(work); }))
#define COMPLETION_INITIALIZER_ONSTACK(work) \
(*({ init_completion(&work); &work; }))
/**
* DECLARE_COMPLETION - declare and initialize a completion structure
* @work: identifier for the completion structure
*
* This macro declares and initializes a completion structure. Generally used
* for static declarations. You should use the _ONSTACK variant for automatic
* variables.
*/
#define DECLARE_COMPLETION(work) \
struct completion work = COMPLETION_INITIALIZER(work)
/*
* Lockdep needs to run a non-constant initializer for on-stack
* completions - so we use the _ONSTACK() variant for those that
* are on the kernel stack:
*/
/**
* DECLARE_COMPLETION_ONSTACK - declare and initialize a completion structure
* @work: identifier for the completion structure
*
* This macro declares and initializes a completion structure on the kernel
* stack.
*/
#ifdef CONFIG_LOCKDEP
# define DECLARE_COMPLETION_ONSTACK(work) \
struct completion work = COMPLETION_INITIALIZER_ONSTACK(work)
# define DECLARE_COMPLETION_ONSTACK_MAP(work, map) \
struct completion work = COMPLETION_INITIALIZER_ONSTACK_MAP(work, map)
#else
# define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work)
# define DECLARE_COMPLETION_ONSTACK_MAP(work, map) DECLARE_COMPLETION(work)
#endif
/**
* init_completion - Initialize a dynamically allocated completion
* @x: pointer to completion structure that is to be initialized
*
* This inline function will initialize a dynamically created completion
* structure.
*/
static inline void init_completion(struct completion *x)
{
x->done = 0;
init_swait_queue_head(&x->wait);
}
/**
* reinit_completion - reinitialize a completion structure
* @x: pointer to completion structure that is to be reinitialized
*
* This inline function should be used to reinitialize a completion structure so it can
* be reused. This is especially important after complete_all() is used.
*/
static inline void reinit_completion(struct completion *x)
{
x->done = 0;
}
extern void wait_for_completion(struct completion *);
extern void wait_for_completion_io(struct completion *);
extern int wait_for_completion_interruptible(struct completion *x);
extern int wait_for_completion_killable(struct completion *x);
extern unsigned long wait_for_completion_timeout(struct completion *x,
unsigned long timeout);
extern unsigned long wait_for_completion_io_timeout(struct completion *x,
unsigned long timeout);
extern long wait_for_completion_interruptible_timeout(
struct completion *x, unsigned long timeout);
extern long wait_for_completion_killable_timeout(
struct completion *x, unsigned long timeout);
extern bool try_wait_for_completion(struct completion *x);
extern bool completion_done(struct completion *x);
extern void complete(struct completion *);
extern void complete_all(struct completion *);
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_H
#define _LINUX_PID_H
#include <linux/rculist.h>
#include <linux/wait.h>
#include <linux/refcount.h>
enum pid_type
{
PIDTYPE_PID,
PIDTYPE_TGID,
PIDTYPE_PGID,
PIDTYPE_SID,
PIDTYPE_MAX,
};
/*
* What is struct pid?
*
* A struct pid is the kernel's internal notion of a process identifier.
* It refers to individual tasks, process groups, and sessions. While
* there are processes attached to it the struct pid lives in a hash
* table, so it and then the processes that it refers to can be found
* quickly from the numeric pid value. The attached processes may be
* quickly accessed by following pointers from struct pid.
*
* Storing pid_t values in the kernel and referring to them later has a
* problem. The process originally with that pid may have exited and the
* pid allocator wrapped, and another process could have come along
* and been assigned that pid.
*
* Referring to user space processes by holding a reference to struct
* task_struct has a problem. When the user space process exits
* the now useless task_struct is still kept. A task_struct plus a
* stack consumes around 10K of low kernel memory. More precisely
* this is THREAD_SIZE + sizeof(struct task_struct). By comparison
* a struct pid is about 64 bytes.
*
* Holding a reference to struct pid solves both of these problems.
* It is small so holding a reference does not consume a lot of
* resources, and since a new struct pid is allocated when the numeric pid
* value is reused (when pids wrap around) we don't mistakenly refer to new
* processes.
*/
/*
* struct upid is used to get the id of the struct pid, as it is
* seen in particular namespace. Later the struct pid is found with
* find_pid_ns() using the int nr and struct pid_namespace *ns.
*/
struct upid {
int nr;
struct pid_namespace *ns;
};
struct pid
{
refcount_t count;
unsigned int level;
spinlock_t lock;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes;
/* wait queue for pidfd notifications */
wait_queue_head_t wait_pidfd;
struct rcu_head rcu;
struct upid numbers[1];
};
extern struct pid init_struct_pid;
extern const struct file_operations pidfd_fops;
struct file;
extern struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
int pidfd_create(struct pid *pid, unsigned int flags);
static inline struct pid *get_pid(struct pid *pid)
{
if (pid)
refcount_inc(&pid->count);
return pid;
}
extern void put_pid(struct pid *pid);
extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
static inline bool pid_has_task(struct pid *pid, enum pid_type type)
{
return !hlist_empty(&pid->tasks[type]);
}
extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);
extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
/*
* these helpers must be called with the tasklist_lock write-held.
*/
extern void attach_pid(struct task_struct *task, enum pid_type);
extern void detach_pid(struct task_struct *task, enum pid_type);
extern void change_pid(struct task_struct *task, enum pid_type,
struct pid *pid);
extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type);
struct pid_namespace;
extern struct pid_namespace init_pid_ns;
extern int pid_max;
extern int pid_max_min, pid_max_max;
/*
* look up a PID in the hash table. Must be called with the tasklist_lock
* or rcu_read_lock() held.
*
* find_pid_ns() finds the pid in the namespace specified
* find_vpid() finds the pid by its virtual id, i.e. in the current namespace
*
* see also find_task_by_vpid() set in include/linux/sched.h
*/
extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns);
extern struct pid *find_vpid(int nr);
/*
* Lookup a PID in the hash table, and return with it's count elevated.
*/
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);
/*
* ns_of_pid() returns the pid namespace in which the specified pid was
* allocated.
*
* NOTE:
* ns_of_pid() is expected to be called for a process (task) that has
* an attached 'struct pid' (see attach_pid(), detach_pid()) i.e @pid
* is expected to be non-NULL. If @pid is NULL, caller should handle
* the resulting NULL pid-ns.
*/
static inline struct pid_namespace *ns_of_pid(struct pid *pid)
{
struct pid_namespace *ns = NULL;
if (pid)
ns = pid->numbers[pid->level].ns;
return ns;
}
/*
* is_child_reaper returns true if the pid is the init process
* of the current namespace. As this one could be checked before
* pid_ns->child_reaper is assigned in copy_process, we check
* with the pid number.
*/
static inline bool is_child_reaper(struct pid *pid)
{
return pid->numbers[pid->level].nr == 1;
}
/*
* the helpers to get the pid's id seen from different namespaces
*
* pid_nr() : global id, i.e. the id seen from the init namespace;
* pid_vnr() : virtual id, i.e. the id seen from the pid namespace of
* current.
* pid_nr_ns() : id seen from the ns specified.
*
* see also task_xid_nr() etc in include/linux/sched.h
*/
static inline pid_t pid_nr(struct pid *pid)
{
pid_t nr = 0;
if (pid)
nr = pid->numbers[0].nr;
return nr;
}
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
pid_t pid_vnr(struct pid *pid);
#define do_each_pid_task(pid, type, task) \
do { \
if ((pid) != NULL) \
hlist_for_each_entry_rcu((task), \
&(pid)->tasks[type], pid_links[type]) {
/*
* Both old and new leaders may be attached to
* the same pid in the middle of de_thread().
*/
#define while_each_pid_task(pid, type, task) \
if (type == PIDTYPE_PID) \
break; \
} \
} while (0)
#define do_each_pid_thread(pid, type, task) \
do_each_pid_task(pid, type, task) { \
struct task_struct *tg___ = task; \
for_each_thread(tg___, task) {
#define while_each_pid_thread(pid, type, task) \
} \
task = tg___; \
} while_each_pid_task(pid, type, task)
#endif /* _LINUX_PID_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/fs/exec.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
/*
* #!-checking implemented by tytso.
*/
/*
* Demand-loading implemented 01.12.91 - no need to read anything but
* the header into memory. The inode of the executable is put into
* "current->executable", and page faults do the actual loading. Clean.
*
* Once more I can proudly say that linux stood up to being changed: it
* was less than 2 hours work to get demand-loading completely implemented.
*
* Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
* current->executable is only used by the procfs. This allows a dispatch
* table to check for several different types of binary formats. We keep
* trying until we recognize the file or we run out of supported binary
* formats.
*/
#include <linux/kernel_read_file.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/signal.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
#include <trace/events/task.h>
#include "internal.h"
#include <trace/events/sched.h>
static int bprm_creds_from_file(struct linux_binprm *bprm);
int suid_dumpable = 0;
static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);
void __register_binfmt(struct linux_binfmt * fmt, int insert)
{
write_lock(&binfmt_lock);
insert ? list_add(&fmt->lh, &formats) :
list_add_tail(&fmt->lh, &formats);
write_unlock(&binfmt_lock);
}
EXPORT_SYMBOL(__register_binfmt);
void unregister_binfmt(struct linux_binfmt * fmt)
{
write_lock(&binfmt_lock);
list_del(&fmt->lh);
write_unlock(&binfmt_lock);
}
EXPORT_SYMBOL(unregister_binfmt);
static inline void put_binfmt(struct linux_binfmt * fmt)
{
module_put(fmt->module);
}
bool path_noexec(const struct path *path)
{
return (path->mnt->mnt_flags & MNT_NOEXEC) ||
(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
}
#ifdef CONFIG_USELIB
/*
* Note that a shared library must be both readable and executable due to
* security reasons.
*
* Also note that we take the address to load from from the file itself.
*/
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
struct linux_binfmt *fmt;
struct file *file;
struct filename *tmp = getname(library);
int error = PTR_ERR(tmp);
static const struct open_flags uselib_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
.acc_mode = MAY_READ | MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
};
if (IS_ERR(tmp))
goto out;
file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
putname(tmp);
error = PTR_ERR(file);
if (IS_ERR(file))
goto out;
/*
* may_open() has already checked for this, so it should be
* impossible to trip now. But we need to be extra cautious
* and check again at the very end too.
*/
error = -EACCES;
if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
path_noexec(&file->f_path)))
goto exit;
fsnotify_open(file);
error = -ENOEXEC;
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
if (!fmt->load_shlib)
continue;
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
error = fmt->load_shlib(file);
read_lock(&binfmt_lock);
put_binfmt(fmt);
if (error != -ENOEXEC)
break;
}
read_unlock(&binfmt_lock);
exit:
fput(file);
out:
return error;
}
#endif /* #ifdef CONFIG_USELIB */
#ifdef CONFIG_MMU
/*
* The nascent bprm->mm is not visible until exec_mmap() but it can
* use a lot of memory, account these pages in current->mm temporary
* for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
* change the counter back via acct_arg_size(0).
*/
static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
struct mm_struct *mm = current->mm;
long diff = (long)(pages - bprm->vma_pages);
if (!mm || !diff)
return;
bprm->vma_pages = pages;
add_mm_counter(mm, MM_ANONPAGES, diff);
}
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
int write)
{
struct page *page;
int ret;
unsigned int gup_flags = FOLL_FORCE;
#ifdef CONFIG_STACK_GROWSUP
if (write) {
ret = expand_downwards(bprm->vma, pos);
if (ret < 0)
return NULL;
}
#endif
if (write)
gup_flags |= FOLL_WRITE;
/*
* We are doing an exec(). 'current' is the process
* doing the exec and bprm->mm is the new process's mm.
*/
mmap_read_lock(bprm->mm);
ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
&page, NULL, NULL);
mmap_read_unlock(bprm->mm);
if (ret <= 0)
return NULL;
if (write)
acct_arg_size(bprm, vma_pages(bprm->vma));
return page;
}
static void put_arg_page(struct page *page)
{
put_page(page);
}
static void free_arg_pages(struct linux_binprm *bprm)
{
}
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
struct page *page)
{
flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}
static int __bprm_mm_init(struct linux_binprm *bprm)
{
int err;
struct vm_area_struct *vma = NULL;
struct mm_struct *mm = bprm->mm;
bprm->vma = vma = vm_area_alloc(mm);
if (!vma)
return -ENOMEM;
vma_set_anonymous(vma);
if (mmap_write_lock_killable(mm)) {
err = -EINTR;
goto err_free;
}
/*
* Place the stack at the largest stack address the architecture
* supports. Later, we'll move this to an appropriate place. We don't
* use STACK_TOP because that can depend on attributes which aren't
* configured yet.
*/
BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
vma->vm_end = STACK_TOP_MAX;
vma->vm_start = vma->vm_end - PAGE_SIZE;
vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
err = insert_vm_struct(mm, vma);
if (err)
goto err;
mm->stack_vm = mm->total_vm = 1;
mmap_write_unlock(mm);
bprm->p = vma->vm_end - sizeof(void *);
return 0;
err:
mmap_write_unlock(mm);
err_free:
bprm->vma = NULL;
vm_area_free(vma);
return err;
}
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
return len <= MAX_ARG_STRLEN;
}
#else
static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
}
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
int write)
{
struct page *page;
page = bprm->page[pos / PAGE_SIZE];
if (!page && write) {
page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
if (!page)
return NULL;
bprm->page[pos / PAGE_SIZE] = page;
}
return page;
}
static void put_arg_page(struct page *page)
{
}
static void free_arg_page(struct linux_binprm *bprm, int i)
{
if (bprm->page[i]) {
__free_page(bprm->page[i]);
bprm->page[i] = NULL;
}
}
static void free_arg_pages(struct linux_binprm *bprm)
{
int i;
for (i = 0; i < MAX_ARG_PAGES; i++)
free_arg_page(bprm, i);
}
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
struct page *page)
{
}
static int __bprm_mm_init(struct linux_binprm *bprm)
{
bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
return 0;
}
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
return len <= bprm->p;
}
#endif /* CONFIG_MMU */
/*
* Create a new mm_struct and populate it with a temporary stack
* vm_area_struct. We don't have enough context at this point to set the stack
* flags, permissions, and offset, so we use temporary values. We'll update
* them later in setup_arg_pages().
*/
static int bprm_mm_init(struct linux_binprm *bprm)
{
int err;
struct mm_struct *mm = NULL;
bprm->mm = mm = mm_alloc();
err = -ENOMEM;
if (!mm)
goto err;
/* Save current stack limit for all calculations made during exec. */
task_lock(current->group_leader);
bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
task_unlock(current->group_leader);
err = __bprm_mm_init(bprm);
if (err)
goto err;
return 0;
err:
if (mm) {
bprm->mm = NULL;
mmdrop(mm);
}
return err;
}
struct user_arg_ptr {
#ifdef CONFIG_COMPAT
bool is_compat;
#endif
union {
const char __user *const __user *native;
#ifdef CONFIG_COMPAT
const compat_uptr_t __user *compat;
#endif
} ptr;
};
static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
{
const char __user *native;
#ifdef CONFIG_COMPAT
if (unlikely(argv.is_compat)) {
compat_uptr_t compat;
if (get_user(compat, argv.ptr.compat + nr))
return ERR_PTR(-EFAULT);
return compat_ptr(compat);
}
#endif
if (get_user(native, argv.ptr.native + nr))
return ERR_PTR(-EFAULT);
return native;
}
/*
* count() counts the number of strings in array ARGV.
*/
static int count(struct user_arg_ptr argv, int max)
{
int i = 0;
if (argv.ptr.native != NULL) {
for (;;) {
const char __user *p = get_user_arg_ptr(argv, i);
if (!p)
break;
if (IS_ERR(p))
return -EFAULT;
if (i >= max)
return -E2BIG;
++i;
if (fatal_signal_pending(current))
return -ERESTARTNOHAND;
cond_resched();
}
}
return i;
}
static int count_strings_kernel(const char *const *argv)
{
int i;
if (!argv)
return 0;
for (i = 0; argv[i]; ++i) {
if (i >= MAX_ARG_STRINGS)
return -E2BIG;
if (fatal_signal_pending(current))
return -ERESTARTNOHAND;
cond_resched();
}
return i;
}
static int bprm_stack_limits(struct linux_binprm *bprm)
{
unsigned long limit, ptr_size;
/*
* Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
* (whichever is smaller) for the argv+env strings.
* This ensures that:
* - the remaining binfmt code will not run out of stack space,
* - the program will have a reasonable amount of stack left
* to work from.
*/
limit = _STK_LIM / 4 * 3;
limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
/*
* We've historically supported up to 32 pages (ARG_MAX)
* of argument strings even with small stacks
*/
limit = max_t(unsigned long, limit, ARG_MAX);
/*
* We must account for the size of all the argv and envp pointers to
* the argv and envp strings, since they will also take up space in
* the stack. They aren't stored until much later when we can't
* signal to the parent that the child has run out of stack space.
* Instead, calculate it here so it's possible to fail gracefully.
*
* In the case of argc = 0, make sure there is space for adding a
* empty string (which will bump argc to 1), to ensure confused
* userspace programs don't start processing from argv[1], thinking
* argc can never be 0, to keep them from walking envp by accident.
* See do_execveat_common().
*/
ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
if (limit <= ptr_size)
return -E2BIG;
limit -= ptr_size;
bprm->argmin = bprm->p - limit;
return 0;
}
/*
* 'copy_strings()' copies argument/environment strings from the old
* processes's memory to the new process's stack. The call to get_user_pages()
* ensures the destination page is created and not swapped out.
*/
static int copy_strings(int argc, struct user_arg_ptr argv,
struct linux_binprm *bprm)
{
struct page *kmapped_page = NULL;
char *kaddr = NULL;
unsigned long kpos = 0;
int ret;
while (argc-- > 0) {
const char __user *str;
int len;
unsigned long pos;
ret = -EFAULT;
str = get_user_arg_ptr(argv, argc);
if (IS_ERR(str))
goto out;
len = strnlen_user(str, MAX_ARG_STRLEN);
if (!len)
goto out;
ret = -E2BIG;
if (!valid_arg_len(bprm, len))
goto out;
/* We're going to work our way backwords. */
pos = bprm->p;
str += len;
bprm->p -= len;
#ifdef CONFIG_MMU
if (bprm->p < bprm->argmin)
goto out;
#endif
while (len > 0) {
int offset, bytes_to_copy;
if (fatal_signal_pending(current)) {
ret = -ERESTARTNOHAND;
goto out;
}
cond_resched();
offset = pos % PAGE_SIZE;
if (offset == 0)
offset = PAGE_SIZE;
bytes_to_copy = offset;
if (bytes_to_copy > len)
bytes_to_copy = len;
offset -= bytes_to_copy;
pos -= bytes_to_copy;
str -= bytes_to_copy;
len -= bytes_to_copy;
if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
struct page *page;
page = get_arg_page(bprm, pos, 1);
if (!page) {
ret = -E2BIG;
goto out;
}
if (kmapped_page) {
flush_dcache_page(kmapped_page);
kunmap(kmapped_page);
put_arg_page(kmapped_page);
}
kmapped_page = page;
kaddr = kmap(kmapped_page);
kpos = pos & PAGE_MASK;
flush_arg_page(bprm, kpos, kmapped_page);
}
if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
ret = -EFAULT;
goto out;
}
}
}
ret = 0;
out:
if (kmapped_page) {
flush_dcache_page(kmapped_page);
kunmap(kmapped_page);
put_arg_page(kmapped_page);
}
return ret;
}
/*
* Copy and argument/environment string from the kernel to the processes stack.
*/
int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
{
int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */;
unsigned long pos = bprm->p;
if (len == 0)
return -EFAULT;
if (!valid_arg_len(bprm, len))
return -E2BIG;
/* We're going to work our way backwards. */
arg += len;
bprm->p -= len;
if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
return -E2BIG;
while (len > 0) {
unsigned int bytes_to_copy = min_t(unsigned int, len,
min_not_zero(offset_in_page(pos), PAGE_SIZE));
struct page *page;
char *kaddr;
pos -= bytes_to_copy;
arg -= bytes_to_copy;
len -= bytes_to_copy;
page = get_arg_page(bprm, pos, 1);
if (!page)
return -E2BIG;
kaddr = kmap_atomic(page);
flush_arg_page(bprm, pos & PAGE_MASK, page);
memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
flush_dcache_page(page);
kunmap_atomic(kaddr);
put_arg_page(page);
}
return 0;
}
EXPORT_SYMBOL(copy_string_kernel);
static int copy_strings_kernel(int argc, const char *const *argv,
struct linux_binprm *bprm)
{
while (argc-- > 0) {
int ret = copy_string_kernel(argv[argc], bprm);
if (ret < 0)
return ret;
if (fatal_signal_pending(current))
return -ERESTARTNOHAND;
cond_resched();
}
return 0;
}
#ifdef CONFIG_MMU
/*
* During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once
* the binfmt code determines where the new stack should reside, we shift it to
* its final location. The process proceeds as follows:
*
* 1) Use shift to calculate the new vma endpoints.
* 2) Extend vma to cover both the old and new ranges. This ensures the
* arguments passed to subsequent functions are consistent.
* 3) Move vma's page tables to the new range.
* 4) Free up any cleared pgd range.
* 5) Shrink the vma to cover only the new range.
*/
static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long old_start = vma->vm_start;
unsigned long old_end = vma->vm_end;
unsigned long length = old_end - old_start;
unsigned long new_start = old_start - shift;
unsigned long new_end = old_end - shift;
struct mmu_gather tlb;
BUG_ON(new_start > new_end);
/*
* ensure there are no vmas between where we want to go
* and where we are
*/
if (vma != find_vma(mm, new_start))
return -EFAULT;
/*
* cover the whole range: [new_start, old_end)
*/
if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
return -ENOMEM;
/*
* move the page tables downwards, on failure we rely on
* process cleanup to remove whatever mess we made.
*/
if (length != move_page_tables(vma, old_start,
vma, new_start, length, false))
return -ENOMEM;
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
*/
free_pgd_range(&tlb, new_end, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
} else {
/*
* otherwise, clean from old_start; this is done to not touch
* the address space in [new_end, old_start) some architectures
* have constraints on va-space that make this illegal (IA64) -
* for the others its just a little faster.
*/
free_pgd_range(&tlb, old_start, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
}
tlb_finish_mmu(&tlb);
/*
* Shrink the vma to just the new range. Always succeeds.
*/
vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
return 0;
}
/*
* Finalizes the stack vm_area_struct. The flags and permissions are updated,
* the stack is optionally relocated, and some extra space is added.
*/
int setup_arg_pages(struct linux_binprm *bprm,
unsigned long stack_top,
int executable_stack)
{
unsigned long ret;
unsigned long stack_shift;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = bprm->vma;
struct vm_area_struct *prev = NULL;
unsigned long vm_flags;
unsigned long stack_base;
unsigned long stack_size;
unsigned long stack_expand;
unsigned long rlim_stack;
#ifdef CONFIG_STACK_GROWSUP
/* Limit stack size */
stack_base = bprm->rlim_stack.rlim_max;
stack_base = calc_max_stack_size(stack_base);
/* Add space for stack randomization. */
stack_base += (STACK_RND_MASK << PAGE_SHIFT);
/* Make sure we didn't let the argument array grow too large. */
if (vma->vm_end - vma->vm_start > stack_base)
return -ENOMEM;
stack_base = PAGE_ALIGN(stack_top - stack_base);
stack_shift = vma->vm_start - stack_base;
mm->arg_start = bprm->p - stack_shift;
bprm->p = vma->vm_end - stack_shift;
#else
stack_top = arch_align_stack(stack_top);
stack_top = PAGE_ALIGN(stack_top);
if (unlikely(stack_top < mmap_min_addr) ||
unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
return -ENOMEM;
stack_shift = vma->vm_end - stack_top;
bprm->p -= stack_shift;
mm->arg_start = bprm->p;
#endif
if (bprm->loader)
bprm->loader -= stack_shift;
bprm->exec -= stack_shift;
if (mmap_write_lock_killable(mm))
return -EINTR;
vm_flags = VM_STACK_FLAGS;
/*
* Adjust stack execute permissions; explicitly enable for
* EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
* (arch default) otherwise.
*/
if (unlikely(executable_stack == EXSTACK_ENABLE_X))
vm_flags |= VM_EXEC;
else if (executable_stack == EXSTACK_DISABLE_X)
vm_flags &= ~VM_EXEC;
vm_flags |= mm->def_flags;
vm_flags |= VM_STACK_INCOMPLETE_SETUP;
ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
vm_flags);
if (ret)
goto out_unlock;
BUG_ON(prev != vma);
if (unlikely(vm_flags & VM_EXEC)) {
pr_warn_once("process '%pD4' started with executable stack\n",
bprm->file);
}
/* Move stack pages down in memory. */
if (stack_shift) {
ret = shift_arg_pages(vma, stack_shift);
if (ret)
goto out_unlock;
}
/* mprotect_fixup is overkill to remove the temporary stack flags */
vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
stack_size = vma->vm_end - vma->vm_start;
/*
* Align this down to a page boundary as expand_stack
* will align it up.
*/
rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
#ifdef CONFIG_STACK_GROWSUP
if (stack_size + stack_expand > rlim_stack)
stack_base = vma->vm_start + rlim_stack;
else
stack_base = vma->vm_end + stack_expand;
#else
if (stack_size + stack_expand > rlim_stack)
stack_base = vma->vm_end - rlim_stack;
else
stack_base = vma->vm_start - stack_expand;
#endif
current->mm->start_stack = bprm->p;
ret = expand_stack(vma, stack_base);
if (ret)
ret = -EFAULT;
out_unlock:
mmap_write_unlock(mm);
return ret;
}
EXPORT_SYMBOL(setup_arg_pages);
#else
/*
* Transfer the program arguments and environment from the holding pages
* onto the stack. The provided stack pointer is adjusted accordingly.
*/
int transfer_args_to_stack(struct linux_binprm *bprm,
unsigned long *sp_location)
{
unsigned long index, stop, sp;
int ret = 0;
stop = bprm->p >> PAGE_SHIFT;
sp = *sp_location;
for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
char *src = kmap(bprm->page[index]) + offset;
sp -= PAGE_SIZE - offset;
if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
ret = -EFAULT;
kunmap(bprm->page[index]);
if (ret)
goto out;
}
*sp_location = sp;
out:
return ret;
}
EXPORT_SYMBOL(transfer_args_to_stack);
#endif /* CONFIG_MMU */
static struct file *do_open_execat(int fd, struct filename *name, int flags)
{
struct file *file;
int err;
struct open_flags open_exec_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
.acc_mode = MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
};
if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
return ERR_PTR(-EINVAL);
if (flags & AT_SYMLINK_NOFOLLOW)
open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
if (flags & AT_EMPTY_PATH)
open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
file = do_filp_open(fd, name, &open_exec_flags);
if (IS_ERR(file))
goto out;
/*
* may_open() has already checked for this, so it should be
* impossible to trip now. But we need to be extra cautious
* and check again at the very end too.
*/
err = -EACCES;
if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
path_noexec(&file->f_path)))
goto exit;
err = deny_write_access(file);
if (err)
goto exit;
if (name->name[0] != '\0')
fsnotify_open(file);
out:
return file;
exit:
fput(file);
return ERR_PTR(err);
}
struct file *open_exec(const char *name)
{
struct filename *filename = getname_kernel(name);
struct file *f = ERR_CAST(filename);
if (!IS_ERR(filename)) {
f = do_open_execat(AT_FDCWD, filename, 0);
putname(filename);
}
return f;
}
EXPORT_SYMBOL(open_exec);
#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
defined(CONFIG_BINFMT_ELF_FDPIC)
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
if (res > 0)
flush_icache_user_range(addr, addr + len);
return res;
}
EXPORT_SYMBOL(read_code);
#endif
/*
* Maps the mm_struct mm into the current task struct.
* On success, this function returns with exec_update_lock
* held for writing.
*/
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
struct mm_struct *old_mm, *active_mm;
int ret;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
if (old_mm)
sync_mm_rss(old_mm);
ret = down_write_killable(&tsk->signal->exec_update_lock);
if (ret)
return ret;
if (old_mm) {
/*
* Make sure that if there is a core dump in progress
* for the old mm, we get out and die instead of going
* through with the exec. We must hold mmap_lock around
* checking core_state and changing tsk->mm.
*/
mmap_read_lock(old_mm);
if (unlikely(old_mm->core_state)) {
mmap_read_unlock(old_mm);
up_write(&tsk->signal->exec_update_lock);
return -EINTR;
}
}
task_lock(tsk);
membarrier_exec_mmap(mm);
local_irq_disable();
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
/*
* This prevents preemption while active_mm is being loaded and
* it and mm are being updated, which could cause problems for
* lazy tlb mm refcounting when these are updated by context
* switches. Not all architectures can handle irqs off over
* activate_mm yet.
*/
if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
activate_mm(active_mm, mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
tsk->mm->vmacache_seqnum = 0;
vmacache_flush(tsk);
task_unlock(tsk);
if (old_mm) {
mmap_read_unlock(old_mm);
BUG_ON(active_mm != old_mm);
setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
mm_update_next_owner(old_mm);
mmput(old_mm);
return 0;
}
mmdrop(active_mm);
return 0;
}
static int de_thread(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
struct sighand_struct *oldsighand = tsk->sighand;
spinlock_t *lock = &oldsighand->siglock;
if (thread_group_empty(tsk))
goto no_thread_group;
/*
* Kill all other threads in the thread group.
*/
spin_lock_irq(lock);
if (signal_group_exit(sig)) {
/*
* Another group action in progress, just
* return so that the signal is processed.
*/
spin_unlock_irq(lock);
return -EAGAIN;
}
sig->group_exit_task = tsk;
sig->notify_count = zap_other_threads(tsk);
if (!thread_group_leader(tsk))
sig->notify_count--;
while (sig->notify_count) {
__set_current_state(TASK_KILLABLE);
spin_unlock_irq(lock);
schedule();
if (__fatal_signal_pending(tsk))
goto killed;
spin_lock_irq(lock);
}
spin_unlock_irq(lock);
/*
* At this point all other threads have exited, all we have to
* do is to wait for the thread group leader to become inactive,
* and to assume its PID:
*/
if (!thread_group_leader(tsk)) {
struct task_struct *leader = tsk->group_leader;
for (;;) {
cgroup_threadgroup_change_begin(tsk);
write_lock_irq(&tasklist_lock);
/*
* Do this under tasklist_lock to ensure that
* exit_notify() can't miss ->group_exit_task
*/
sig->notify_count = -1;
if (likely(leader->exit_state))
break;
__set_current_state(TASK_KILLABLE);
write_unlock_irq(&tasklist_lock);
cgroup_threadgroup_change_end(tsk);
schedule();
if (__fatal_signal_pending(tsk))
goto killed;
}
/*
* The only record we have of the real-time age of a
* process, regardless of execs it's done, is start_time.
* All the past CPU time is accumulated in signal_struct
* from sister threads now dead. But in this non-leader
* exec, nothing survives from the original leader thread,
* whose birth marks the true age of this process now.
* When we take on its identity by switching to its PID, we
* also take its birthdate (always earlier than our own).
*/
tsk->start_time = leader->start_time;
tsk->start_boottime = leader->start_boottime;
BUG_ON(!same_thread_group(leader, tsk));
/*
* An exec() starts a new thread group with the
* TGID of the previous thread group. Rehash the
* two threads with a switched PID, and release
* the former thread group leader:
*/
/* Become a process group leader with the old leader's pid.
* The old leader becomes a thread of the this thread group.
*/
exchange_tids(tsk, leader);
transfer_pid(leader, tsk, PIDTYPE_TGID);
transfer_pid(leader, tsk, PIDTYPE_PGID);
transfer_pid(leader, tsk, PIDTYPE_SID);
list_replace_rcu(&leader->tasks, &tsk->tasks);
list_replace_init(&leader->sibling, &tsk->sibling);
tsk->group_leader = tsk;
leader->group_leader = tsk;
tsk->exit_signal = SIGCHLD;
leader->exit_signal = -1;
BUG_ON(leader->exit_state != EXIT_ZOMBIE);
leader->exit_state = EXIT_DEAD;
/*
* We are going to release_task()->ptrace_unlink() silently,
* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
* the tracer wont't block again waiting for this thread.
*/
if (unlikely(leader->ptrace))
__wake_up_parent(leader, leader->parent);
write_unlock_irq(&tasklist_lock);
cgroup_threadgroup_change_end(tsk);
release_task(leader);
}
sig->group_exit_task = NULL;
sig->notify_count = 0;
no_thread_group:
/* we have changed execution domain */
tsk->exit_signal = SIGCHLD;
BUG_ON(!thread_group_leader(tsk));
return 0;
killed:
/* protects against exit_notify() and __exit_signal() */
read_lock(&tasklist_lock);
sig->group_exit_task = NULL;
sig->notify_count = 0;
read_unlock(&tasklist_lock);
return -EAGAIN;
}
/*
* This function makes sure the current process has its own signal table,
* so that flush_signal_handlers can later reset the handlers without
* disturbing other processes. (Other processes might share the signal
* table via the CLONE_SIGHAND option to clone().)
*/
static int unshare_sighand(struct task_struct *me)
{
struct sighand_struct *oldsighand = me->sighand;
if (refcount_read(&oldsighand->count) != 1) {
struct sighand_struct *newsighand;
/*
* This ->sighand is shared with the CLONE_SIGHAND
* but not CLONE_THREAD task, switch to the new one.
*/
newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
if (!newsighand)
return -ENOMEM;
refcount_set(&newsighand->count, 1);
memcpy(newsighand->action, oldsighand->action,
sizeof(newsighand->action));
write_lock_irq(&tasklist_lock);
spin_lock(&oldsighand->siglock);
rcu_assign_pointer(me->sighand, newsighand);
spin_unlock(&oldsighand->siglock);
write_unlock_irq(&tasklist_lock);
__cleanup_sighand(oldsighand);
}
return 0;
}
char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
task_lock(tsk);
strncpy(buf, tsk->comm, buf_size);
task_unlock(tsk);
return buf;
}
EXPORT_SYMBOL_GPL(__get_task_comm);
/*
* These functions flushes out all traces of the currently running executable
* so that a new one can be started
*/
void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
task_lock(tsk);
trace_task_rename(tsk, buf);
strlcpy(tsk->comm, buf, sizeof(tsk->comm));
task_unlock(tsk);
perf_event_comm(tsk, exec);
}
/*
* Calling this is the point of no return. None of the failures will be
* seen by userspace since either the process is already taking a fatal
* signal (via de_thread() or coredump), or will have SEGV raised
* (after exec_mmap()) by search_binary_handler (see below).
*/
int begin_new_exec(struct linux_binprm * bprm)
{
struct task_struct *me = current;
int retval;
/* Once we are committed compute the creds */
retval = bprm_creds_from_file(bprm);
if (retval)
return retval;
/*
* Ensure all future errors are fatal.
*/
bprm->point_of_no_return = true;
/*
* Make this the only thread in the thread group.
*/
retval = de_thread(me);
if (retval)
goto out;
/*
* Cancel any io_uring activity across execve
*/
io_uring_task_cancel();
/* Ensure the files table is not shared. */
retval = unshare_files();
if (retval)
goto out;
/*
* Must be called _before_ exec_mmap() as bprm->mm is
* not visibile until then. This also enables the update
* to be lockless.
*/
retval = set_mm_exe_file(bprm->mm, bprm->file);
if (retval)
goto out;
/* If the binary is not readable then enforce mm->dumpable=0 */
would_dump(bprm, bprm->file);
if (bprm->have_execfd)
would_dump(bprm, bprm->executable);
/*
* Release all of the old mmap stuff
*/
acct_arg_size(bprm, 0);
retval = exec_mmap(bprm->mm);
if (retval)
goto out;
bprm->mm = NULL;
#ifdef CONFIG_POSIX_TIMERS
exit_itimers(me->signal);
flush_itimer_signals();
#endif
/*
* Make the signal table private.
*/
retval = unshare_sighand(me);
if (retval)
goto out_unlock;
/*
* Ensure that the uaccess routines can actually operate on userspace
* pointers:
*/
force_uaccess_begin();
me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
me->personality &= ~bprm->per_clear;
clear_syscall_work_syscall_user_dispatch(me);
/*
* We have to apply CLOEXEC before we change whether the process is
* dumpable (in setup_new_exec) to avoid a race with a process in userspace
* trying to access the should-be-closed file descriptors of a process
* undergoing exec(2).
*/
do_close_on_exec(me->files);
if (bprm->secureexec) {
/* Make sure parent cannot signal privileged process. */
me->pdeath_signal = 0;
/*
* For secureexec, reset the stack limit to sane default to
* avoid bad behavior from the prior rlimits. This has to
* happen before arch_pick_mmap_layout(), which examines
* RLIMIT_STACK, but after the point of no return to avoid
* needing to clean up the change on failure.
*/
if (bprm->rlim_stack.rlim_cur > _STK_LIM)
bprm->rlim_stack.rlim_cur = _STK_LIM;
}
me->sas_ss_sp = me->sas_ss_size = 0;
/*
* Figure out dumpability. Note that this checking only of current
* is wrong, but userspace depends on it. This should be testing
* bprm->secureexec instead.
*/
if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
!(uid_eq(current_euid(), current_uid()) &&
gid_eq(current_egid(), current_gid())))
set_dumpable(current->mm, suid_dumpable);
else
set_dumpable(current->mm, SUID_DUMP_USER);
perf_event_exec();
__set_task_comm(me, kbasename(bprm->filename), true);
/* An exec changes our domain. We are no longer part of the thread
group */
WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
flush_signal_handlers(me, 0);
retval = set_cred_ucounts(bprm->cred);
if (retval < 0)
goto out_unlock;
/*
* install the new credentials for this executable
*/
security_bprm_committing_creds(bprm);
commit_creds(bprm->cred);
bprm->cred = NULL;
/*
* Disable monitoring for regular users
* when executing setuid binaries. Must
* wait until new credentials are committed
* by commit_creds() above
*/
if (get_dumpable(me->mm) != SUID_DUMP_USER)
perf_event_exit_task(me);
/*
* cred_guard_mutex must be held at least to this point to prevent
* ptrace_attach() from altering our determination of the task's
* credentials; any time after this it may be unlocked.
*/
security_bprm_committed_creds(bprm);
/* Pass the opened binary to the interpreter. */
if (bprm->have_execfd) {
retval = get_unused_fd_flags(0);
if (retval < 0)
goto out_unlock;
fd_install(retval, bprm->executable);
bprm->executable = NULL;
bprm->execfd = retval;
}
return 0;
out_unlock:
up_write(&me->signal->exec_update_lock);
out:
return retval;
}
EXPORT_SYMBOL(begin_new_exec);
void would_dump(struct linux_binprm *bprm, struct file *file)
{
struct inode *inode = file_inode(file);
struct user_namespace *mnt_userns = file_mnt_user_ns(file);
if (inode_permission(mnt_userns, inode, MAY_READ) < 0) {
struct user_namespace *old, *user_ns;
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
/* Ensure mm->user_ns contains the executable */
user_ns = old = bprm->mm->user_ns;
while ((user_ns != &init_user_ns) &&
!privileged_wrt_inode_uidgid(user_ns, mnt_userns, inode))
user_ns = user_ns->parent;
if (old != user_ns) {
bprm->mm->user_ns = get_user_ns(user_ns);
put_user_ns(old);
}
}
}
EXPORT_SYMBOL(would_dump);
void setup_new_exec(struct linux_binprm * bprm)
{
/* Setup things that can depend upon the personality */
struct task_struct *me = current;
arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);
arch_setup_new_exec();
/* Set the new mm task size. We have to do that late because it may
* depend on TIF_32BIT which is only updated in flush_thread() on
* some architectures like powerpc
*/
me->mm->task_size = TASK_SIZE;
up_write(&me->signal->exec_update_lock);
mutex_unlock(&me->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(setup_new_exec);
/* Runs immediately before start_thread() takes over. */
void finalize_exec(struct linux_binprm *bprm)
{
/* Store any stack rlimit changes before starting thread. */
task_lock(current->group_leader);
current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
task_unlock(current->group_leader);
}
EXPORT_SYMBOL(finalize_exec);
/*
* Prepare credentials and lock ->cred_guard_mutex.
* setup_new_exec() commits the new creds and drops the lock.
* Or, if exec fails before, free_bprm() should release ->cred
* and unlock.
*/
static int prepare_bprm_creds(struct linux_binprm *bprm)
{
if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex))
return -ERESTARTNOINTR;
bprm->cred = prepare_exec_creds();
if (likely(bprm->cred))
return 0;
mutex_unlock(¤t->signal->cred_guard_mutex);
return -ENOMEM;
}
static void free_bprm(struct linux_binprm *bprm)
{
if (bprm->mm) {
acct_arg_size(bprm, 0);
mmput(bprm->mm);
}
free_arg_pages(bprm);
if (bprm->cred) {
mutex_unlock(¤t->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
if (bprm->file) {
allow_write_access(bprm->file);
fput(bprm->file);
}
if (bprm->executable)
fput(bprm->executable);
/* If a binfmt changed the interp, free it. */
if (bprm->interp != bprm->filename)
kfree(bprm->interp);
kfree(bprm->fdpath);
kfree(bprm);
}
static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
{
struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
int retval = -ENOMEM;
if (!bprm)
goto out;
if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name;
} else {
if (filename->name[0] == '\0')
bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
else
bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
fd, filename->name);
if (!bprm->fdpath)
goto out_free;
bprm->filename = bprm->fdpath;
}
bprm->interp = bprm->filename;
retval = bprm_mm_init(bprm);
if (retval)
goto out_free;
return bprm;
out_free:
free_bprm(bprm);
out:
return ERR_PTR(retval);
}
int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
{
/* If a binfmt changed the interp, free it first. */
if (bprm->interp != bprm->filename)
kfree(bprm->interp);
bprm->interp = kstrdup(interp, GFP_KERNEL);
if (!bprm->interp)
return -ENOMEM;
return 0;
}
EXPORT_SYMBOL(bprm_change_interp);
/*
* determine how safe it is to execute the proposed program
* - the caller must hold ->cred_guard_mutex to protect against
* PTRACE_ATTACH or seccomp thread-sync
*/
static void check_unsafe_exec(struct linux_binprm *bprm)
{
struct task_struct *p = current, *t;
unsigned n_fs;
if (p->ptrace)
bprm->unsafe |= LSM_UNSAFE_PTRACE;
/*
* This isn't strictly necessary, but it makes it harder for LSMs to
* mess up.
*/
if (task_no_new_privs(current))
bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
t = p;
n_fs = 1;
spin_lock(&p->fs->lock);
rcu_read_lock();
while_each_thread(p, t) {
if (t->fs == p->fs)
n_fs++;
}
rcu_read_unlock();
if (p->fs->users > n_fs)
bprm->unsafe |= LSM_UNSAFE_SHARE;
else
p->fs->in_exec = 1;
spin_unlock(&p->fs->lock);
}
static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
{
/* Handle suid and sgid on files */
struct user_namespace *mnt_userns;
struct inode *inode;
unsigned int mode;
kuid_t uid;
kgid_t gid;
if (!mnt_may_suid(file->f_path.mnt))
return;
if (task_no_new_privs(current))
return;
inode = file->f_path.dentry->d_inode;
mode = READ_ONCE(inode->i_mode);
if (!(mode & (S_ISUID|S_ISGID)))
return;
mnt_userns = file_mnt_user_ns(file);
/* Be careful if suid/sgid is set */
inode_lock(inode);
/* reload atomically mode/uid/gid now that lock held */
mode = inode->i_mode;
uid = i_uid_into_mnt(mnt_userns, inode);
gid = i_gid_into_mnt(mnt_userns, inode);
inode_unlock(inode);
/* We ignore suid/sgid if there are no mappings for them in the ns */
if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
!kgid_has_mapping(bprm->cred->user_ns, gid))
return;
if (mode & S_ISUID) {
bprm->per_clear |= PER_CLEAR_ON_SETID;
bprm->cred->euid = uid;
}
if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
bprm->per_clear |= PER_CLEAR_ON_SETID;
bprm->cred->egid = gid;
}
}
/*
* Compute brpm->cred based upon the final binary.
*/
static int bprm_creds_from_file(struct linux_binprm *bprm)
{
/* Compute creds based on which file? */
struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file;
bprm_fill_uid(bprm, file);
return security_bprm_creds_from_file(bprm, file);
}
/*
* Fill the binprm structure from the inode.
* Read the first BINPRM_BUF_SIZE bytes
*
* This may be called multiple times for binary chains (scripts for example).
*/
static int prepare_binprm(struct linux_binprm *bprm)
{
loff_t pos = 0;
memset(bprm->buf, 0, BINPRM_BUF_SIZE);
return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
}
/*
* Arguments are '\0' separated strings found at the location bprm->p
* points to; chop off the first by relocating brpm->p to right after
* the first '\0' encountered.
*/
int remove_arg_zero(struct linux_binprm *bprm)
{
int ret = 0;
unsigned long offset;
char *kaddr;
struct page *page;
if (!bprm->argc)
return 0;
do {
offset = bprm->p & ~PAGE_MASK;
page = get_arg_page(bprm, bprm->p, 0);
if (!page) {
ret = -EFAULT;
goto out;
}
kaddr = kmap_atomic(page);
for (; offset < PAGE_SIZE && kaddr[offset];
offset++, bprm->p++)
;
kunmap_atomic(kaddr);
put_arg_page(page);
} while (offset == PAGE_SIZE);
bprm->p++;
bprm->argc--;
ret = 0;
out:
return ret;
}
EXPORT_SYMBOL(remove_arg_zero);
#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
/*
* cycle the list of binary formats handler, until one recognizes the image
*/
static int search_binary_handler(struct linux_binprm *bprm)
{
bool need_retry = IS_ENABLED(CONFIG_MODULES);
struct linux_binfmt *fmt;
int retval;
retval = prepare_binprm(bprm);
if (retval < 0)
return retval;
retval = security_bprm_check(bprm);
if (retval)
return retval;
retval = -ENOENT;
retry:
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
retval = fmt->load_binary(bprm);
read_lock(&binfmt_lock);
put_binfmt(fmt);
if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
read_unlock(&binfmt_lock);
return retval;
}
}
read_unlock(&binfmt_lock);
if (need_retry) {
if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
printable(bprm->buf[2]) && printable(bprm->buf[3]))
return retval;
if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
return retval;
need_retry = false;
goto retry;
}
return retval;
}
static int exec_binprm(struct linux_binprm *bprm)
{
pid_t old_pid, old_vpid;
int ret, depth;
/* Need to fetch pid before load_binary changes it */
old_pid = current->pid;
rcu_read_lock();
old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
rcu_read_unlock();
/* This allows 4 levels of binfmt rewrites before failing hard. */
for (depth = 0;; depth++) {
struct file *exec;
if (depth > 5)
return -ELOOP;
ret = search_binary_handler(bprm);
if (ret < 0)
return ret;
if (!bprm->interpreter)
break;
exec = bprm->file;
bprm->file = bprm->interpreter;
bprm->interpreter = NULL;
allow_write_access(exec);
if (unlikely(bprm->have_execfd)) {
if (bprm->executable) {
fput(exec);
return -ENOEXEC;
}
bprm->executable = exec;
} else
fput(exec);
}
audit_bprm(bprm);
trace_sched_process_exec(current, old_pid, bprm);
ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
proc_exec_connector(current);
return 0;
}
/*
* sys_execve() executes a new program.
*/
static int bprm_execve(struct linux_binprm *bprm,
int fd, struct filename *filename, int flags)
{
struct file *file;
int retval;
retval = prepare_bprm_creds(bprm);
if (retval)
return retval;
check_unsafe_exec(bprm);
current->in_execve = 1;
file = do_open_execat(fd, filename, flags);
retval = PTR_ERR(file);
if (IS_ERR(file))
goto out_unmark;
sched_exec();
bprm->file = file;
/*
* Record that a name derived from an O_CLOEXEC fd will be
* inaccessible after exec. This allows the code in exec to
* choose to fail when the executable is not mmaped into the
* interpreter and an open file descriptor is not passed to
* the interpreter. This makes for a better user experience
* than having the interpreter start and then immediately fail
* when it finds the executable is inaccessible.
*/
if (bprm->fdpath && get_close_on_exec(fd))
bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
/* Set the unchanging part of bprm->cred */
retval = security_bprm_creds_for_exec(bprm);
if (retval)
goto out;
retval = exec_binprm(bprm);
if (retval < 0)
goto out;
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
rseq_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
return retval;
out:
/*
* If past the point of no return ensure the code never
* returns to the userspace process. Use an existing fatal
* signal if present otherwise terminate the process with
* SIGSEGV.
*/
if (bprm->point_of_no_return && !fatal_signal_pending(current))
force_fatal_sig(SIGSEGV);
out_unmark:
current->fs->in_exec = 0;
current->in_execve = 0;
return retval;
}
static int do_execveat_common(int fd, struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp,
int flags)
{
struct linux_binprm *bprm;
int retval;
if (IS_ERR(filename))
return PTR_ERR(filename);
/*
* We move the actual failure in case of RLIMIT_NPROC excess from
* set*uid() to execve() because too many poorly written programs
* don't check setuid() return code. Here we additionally recheck
* whether NPROC limit is still exceeded.
*/
if ((current->flags & PF_NPROC_EXCEEDED) &&
is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
retval = -EAGAIN;
goto out_ret;
}
/* We're below the limit (still or again), so we don't want to make
* further execve() calls fail. */
current->flags &= ~PF_NPROC_EXCEEDED;
bprm = alloc_bprm(fd, filename);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
}
retval = count(argv, MAX_ARG_STRINGS);
if (retval == 0)
pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
current->comm, bprm->filename);
if (retval < 0)
goto out_free;
bprm->argc = retval;
retval = count(envp, MAX_ARG_STRINGS);
if (retval < 0)
goto out_free;
bprm->envc = retval;
retval = bprm_stack_limits(bprm);
if (retval < 0)
goto out_free;
retval = copy_string_kernel(bprm->filename, bprm);
if (retval < 0)
goto out_free;
bprm->exec = bprm->p;
retval = copy_strings(bprm->envc, envp, bprm);
if (retval < 0)
goto out_free;
retval = copy_strings(bprm->argc, argv, bprm);
if (retval < 0)
goto out_free;
/*
* When argv is empty, add an empty string ("") as argv[0] to
* ensure confused userspace programs that start processing
* from argv[1] won't end up walking envp. See also
* bprm_stack_limits().
*/
if (bprm->argc == 0) {
retval = copy_string_kernel("", bprm);
if (retval < 0)
goto out_free;
bprm->argc = 1;
}
retval = bprm_execve(bprm, fd, filename, flags);
out_free:
free_bprm(bprm);
out_ret:
putname(filename);
return retval;
}
int kernel_execve(const char *kernel_filename,
const char *const *argv, const char *const *envp)
{
struct filename *filename;
struct linux_binprm *bprm;
int fd = AT_FDCWD;
int retval;
filename = getname_kernel(kernel_filename);
if (IS_ERR(filename))
return PTR_ERR(filename);
bprm = alloc_bprm(fd, filename);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
}
retval = count_strings_kernel(argv);
if (WARN_ON_ONCE(retval == 0))
retval = -EINVAL;
if (retval < 0)
goto out_free;
bprm->argc = retval;
retval = count_strings_kernel(envp);
if (retval < 0)
goto out_free;
bprm->envc = retval;
retval = bprm_stack_limits(bprm);
if (retval < 0)
goto out_free;
retval = copy_string_kernel(bprm->filename, bprm);
if (retval < 0)
goto out_free;
bprm->exec = bprm->p;
retval = copy_strings_kernel(bprm->envc, envp, bprm);
if (retval < 0)
goto out_free;
retval = copy_strings_kernel(bprm->argc, argv, bprm);
if (retval < 0)
goto out_free;
retval = bprm_execve(bprm, fd, filename, 0);
out_free:
free_bprm(bprm);
out_ret:
putname(filename);
return retval;
}
static int do_execve(struct filename *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp)
{
struct user_arg_ptr argv = { .ptr.native = __argv };
struct user_arg_ptr envp = { .ptr.native = __envp };
return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}
static int do_execveat(int fd, struct filename *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp,
int flags)
{
struct user_arg_ptr argv = { .ptr.native = __argv };
struct user_arg_ptr envp = { .ptr.native = __envp };
return do_execveat_common(fd, filename, argv, envp, flags);
}
#ifdef CONFIG_COMPAT
static int compat_do_execve(struct filename *filename,
const compat_uptr_t __user *__argv,
const compat_uptr_t __user *__envp)
{
struct user_arg_ptr argv = {
.is_compat = true,
.ptr.compat = __argv,
};
struct user_arg_ptr envp = {
.is_compat = true,
.ptr.compat = __envp,
};
return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}
static int compat_do_execveat(int fd, struct filename *filename,
const compat_uptr_t __user *__argv,
const compat_uptr_t __user *__envp,
int flags)
{
struct user_arg_ptr argv = {
.is_compat = true,
.ptr.compat = __argv,
};
struct user_arg_ptr envp = {
.is_compat = true,
.ptr.compat = __envp,
};
return do_execveat_common(fd, filename, argv, envp, flags);
}
#endif
void set_binfmt(struct linux_binfmt *new)
{
struct mm_struct *mm = current->mm;
if (mm->binfmt)
module_put(mm->binfmt->module);
mm->binfmt = new;
if (new)
__module_get(new->module);
}
EXPORT_SYMBOL(set_binfmt);
/*
* set_dumpable stores three-value SUID_DUMP_* into mm->flags.
*/
void set_dumpable(struct mm_struct *mm, int value)
{
if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
return;
set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
}
SYSCALL_DEFINE3(execve,
const char __user *, filename,
const char __user *const __user *, argv,
const char __user *const __user *, envp)
{
return do_execve(getname(filename), argv, envp);
}
SYSCALL_DEFINE5(execveat,
int, fd, const char __user *, filename,
const char __user *const __user *, argv,
const char __user *const __user *, envp,
int, flags)
{
return do_execveat(fd,
getname_uflags(filename, flags),
argv, envp, flags);
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
const compat_uptr_t __user *, argv,
const compat_uptr_t __user *, envp)
{
return compat_do_execve(getname(filename), argv, envp);
}
COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
const char __user *, filename,
const compat_uptr_t __user *, argv,
const compat_uptr_t __user *, envp,
int, flags)
{
return compat_do_execveat(fd,
getname_uflags(filename, flags),
argv, envp, flags);
}
#endif
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
*
* High-resolution kernel timers
*
* In contrast to the low-resolution timeout API, aka timer wheel,
* hrtimers provide finer resolution and accuracy depending on system
* configuration and capabilities.
*
* Started by: Thomas Gleixner and Ingo Molnar
*
* Credits:
* Based on the original timer wheel code
*
* Help, testing, suggestions, bugfixes, improvements were
* provided by:
*
* George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
* et. al.
*/
#include <linux/cpu.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/interrupt.h>
#include <linux/tick.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/timer.h>
#include <linux/freezer.h>
#include <linux/compat.h>
#include <linux/uaccess.h>
#include <trace/events/timer.h>
#include "tick-internal.h"
/*
* Masks for selecting the soft and hard context timers from
* cpu_base->active
*/
#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT)
#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1)
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
/*
* The timer bases:
*
* There are more clockids than hrtimer bases. Thus, we index
* into the timer bases by the hrtimer_base_type enum. When trying
* to reach a base using a clockid, hrtimer_clockid_to_base()
* is used to convert from clockid to the proper hrtimer_base_type.
*/
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
.clock_base =
{
{
.index = HRTIMER_BASE_MONOTONIC,
.clockid = CLOCK_MONOTONIC,
.get_time = &ktime_get,
},
{
.index = HRTIMER_BASE_REALTIME,
.clockid = CLOCK_REALTIME,
.get_time = &ktime_get_real,
},
{
.index = HRTIMER_BASE_BOOTTIME,
.clockid = CLOCK_BOOTTIME,
.get_time = &ktime_get_boottime,
},
{
.index = HRTIMER_BASE_TAI,
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
{
.index = HRTIMER_BASE_MONOTONIC_SOFT,
.clockid = CLOCK_MONOTONIC,
.get_time = &ktime_get,
},
{
.index = HRTIMER_BASE_REALTIME_SOFT,
.clockid = CLOCK_REALTIME,
.get_time = &ktime_get_real,
},
{
.index = HRTIMER_BASE_BOOTTIME_SOFT,
.clockid = CLOCK_BOOTTIME,
.get_time = &ktime_get_boottime,
},
{
.index = HRTIMER_BASE_TAI_SOFT,
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
}
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
/* Make sure we catch unsupported clockids */
[0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
*/
#ifdef CONFIG_SMP
/*
* We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
* such that hrtimer_callback_running() can unconditionally dereference
* timer->base->cpu_base
*/
static struct hrtimer_cpu_base migration_cpu_base = {
.clock_base = { {
.cpu_base = &migration_cpu_base,
.seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
&migration_cpu_base.lock),
}, },
};
#define migration_base migration_cpu_base.clock_base[0]
static inline bool is_migration_base(struct hrtimer_clock_base *base)
{
return base == &migration_base;
}
/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
* locked, and the base itself is locked too.
*
* So __run_timers/migrate_timers can safely modify all timers which could
* be found on the lists/queues.
*
* When the timer's base is locked, and the timer removed from list, it is
* possible to set timer->base = &migration_base and drop the lock: the timer
* remains locked.
*/
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
unsigned long *flags)
{
struct hrtimer_clock_base *base;
for (;;) {
base = READ_ONCE(timer->base);
if (likely(base != &migration_base)) {
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
if (likely(base == timer->base))
return base;
/* The timer has migrated to another CPU: */
raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
}
cpu_relax();
}
}
/*
* We do not migrate the timer when it is expiring before the next
* event on the target cpu. When high resolution is enabled, we cannot
* reprogram the target cpu hardware and we would cause it to fire
* late. To keep it simple, we handle the high resolution enabled and
* disabled case similar.
*
* Called with cpu_base->lock of target cpu held.
*/
static int
hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
{
ktime_t expires;
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
return expires < new_base->cpu_base->expires_next;
}
static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
int pinned)
{
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target());
#endif
return base;
}
/*
* We switch the timer base to a power-optimized selected CPU target,
* if:
* - NO_HZ_COMMON is enabled
* - timer migration is enabled
* - the timer callback is not running
* - the timer is not the first expiring timer on the new target
*
* If one of the above requirements is not fulfilled we move the timer
* to the current CPU or leave it on the previously assigned CPU if
* the timer callback is currently running.
*/
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
int pinned)
{
struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
struct hrtimer_clock_base *new_base;
int basenum = base->index;
this_cpu_base = this_cpu_ptr(&hrtimer_bases);
new_cpu_base = get_target_base(this_cpu_base, pinned);
again:
new_base = &new_cpu_base->clock_base[basenum];
if (base != new_base) {
/*
* We are trying to move timer to new_base.
* However we can't change timer's base while it is running,
* so we keep it on the same CPU. No hassle vs. reprogramming
* the event source in the high resolution case. The softirq
* code will take care of this when the timer function has
* completed. There is no conflict as we hold the lock until
* the timer is enqueued.
*/
if (unlikely(hrtimer_callback_running(timer)))
return base;
/* See the comment in lock_hrtimer_base() */
WRITE_ONCE(timer->base, &migration_base);
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
if (new_cpu_base != this_cpu_base &&
hrtimer_check_target(timer, new_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
WRITE_ONCE(timer->base, base);
goto again;
}
WRITE_ONCE(timer->base, new_base);
} else {
if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) {
new_cpu_base = this_cpu_base;
goto again;
}
}
return new_base;
}
#else /* CONFIG_SMP */
static inline bool is_migration_base(struct hrtimer_clock_base *base)
{
return false;
}
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
{
struct hrtimer_clock_base *base = timer->base;
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
return base;
}
# define switch_hrtimer_base(t, b, p) (b)
#endif /* !CONFIG_SMP */
/*
* Functions for the union type storage format of ktime_t which are
* too large for inlining:
*/
#if BITS_PER_LONG < 64
/*
* Divide a ktime value by a nanosecond value
*/
s64 __ktime_divns(const ktime_t kt, s64 div)
{
int sft = 0;
s64 dclc;
u64 tmp;
dclc = ktime_to_ns(kt);
tmp = dclc < 0 ? -dclc : dclc;
/* Make sure the divisor is less than 2^32: */
while (div >> 32) {
sft++;
div >>= 1;
}
tmp >>= sft;
do_div(tmp, (u32) div);
return dclc < 0 ? -tmp : tmp;
}
EXPORT_SYMBOL_GPL(__ktime_divns);
#endif /* BITS_PER_LONG >= 64 */
/*
* Add two ktime values and do a safety check for overflow:
*/
ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
{
ktime_t res = ktime_add_unsafe(lhs, rhs);
/*
* We use KTIME_SEC_MAX here, the maximum timeout which we can
* return to user space in a timespec:
*/
if (res < 0 || res < lhs || res < rhs)
res = ktime_set(KTIME_SEC_MAX, 0);
return res;
}
EXPORT_SYMBOL_GPL(ktime_add_safe);
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
static const struct debug_obj_descr hrtimer_debug_descr;
static void *hrtimer_debug_hint(void *addr)
{
return ((struct hrtimer *) addr)->function;
}
/*
* fixup_init is called when:
* - an active object is initialized
*/
static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
{
struct hrtimer *timer = addr;
switch (state) {
case ODEBUG_STATE_ACTIVE:
hrtimer_cancel(timer);
debug_object_init(timer, &hrtimer_debug_descr);
return true;
default:
return false;
}
}
/*
* fixup_activate is called when:
* - an active object is activated
* - an unknown non-static object is activated
*/
static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
{
switch (state) {
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
fallthrough;
default:
return false;
}
}
/*
* fixup_free is called when:
* - an active object is freed
*/
static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
{
struct hrtimer *timer = addr;
switch (state) {
case ODEBUG_STATE_ACTIVE:
hrtimer_cancel(timer);
debug_object_free(timer, &hrtimer_debug_descr);
return true;
default:
return false;
}
}
static const struct debug_obj_descr hrtimer_debug_descr = {
.name = "hrtimer",
.debug_hint = hrtimer_debug_hint,
.fixup_init = hrtimer_fixup_init,
.fixup_activate = hrtimer_fixup_activate,
.fixup_free = hrtimer_fixup_free,
};
static inline void debug_hrtimer_init(struct hrtimer *timer)
{
debug_object_init(timer, &hrtimer_debug_descr);
}
static inline void debug_hrtimer_activate(struct hrtimer *timer,
enum hrtimer_mode mode)
{
debug_object_activate(timer, &hrtimer_debug_descr);
}
static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
{
debug_object_deactivate(timer, &hrtimer_debug_descr);
}
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode);
void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
debug_object_init_on_stack(timer, &hrtimer_debug_descr);
__hrtimer_init(timer, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
clockid_t clock_id, enum hrtimer_mode mode);
void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
clockid_t clock_id, enum hrtimer_mode mode)
{
debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
__hrtimer_init_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
debug_object_free(timer, &hrtimer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
#else
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
static inline void debug_hrtimer_activate(struct hrtimer *timer,
enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
#endif
static inline void
debug_init(struct hrtimer *timer, clockid_t clockid,
enum hrtimer_mode mode)
{
debug_hrtimer_init(timer);
trace_hrtimer_init(timer, clockid, mode);
}
static inline void debug_activate(struct hrtimer *timer,
enum hrtimer_mode mode)
{
debug_hrtimer_activate(timer, mode);
trace_hrtimer_start(timer, mode);
}
static inline void debug_deactivate(struct hrtimer *timer)
{
debug_hrtimer_deactivate(timer);
trace_hrtimer_cancel(timer);
}
static struct hrtimer_clock_base *
__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
{
unsigned int idx;
if (!*active)
return NULL;
idx = __ffs(*active);
*active &= ~(1U << idx);
return &cpu_base->clock_base[idx];
}
#define for_each_active_base(base, cpu_base, active) \
while ((base = __next_base((cpu_base), &(active))))
static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
const struct hrtimer *exclude,
unsigned int active,
ktime_t expires_next)
{
struct hrtimer_clock_base *base;
ktime_t expires;
for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *next;
struct hrtimer *timer;
next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
if (timer == exclude) {
/* Get to the next timer in the queue. */
next = timerqueue_iterate_next(next);
if (!next)
continue;
timer = container_of(next, struct hrtimer, node);
}
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires < expires_next) {
expires_next = expires;
/* Skip cpu_base update if a timer is being excluded. */
if (exclude)
continue;
if (timer->is_soft)
cpu_base->softirq_next_timer = timer;
else
cpu_base->next_timer = timer;
}
}
/*
* clock_was_set() might have changed base->offset of any of
* the clock bases so the result might be negative. Fix it up
* to prevent a false positive in clockevents_program_event().
*/
if (expires_next < 0)
expires_next = 0;
return expires_next;
}
/*
* Recomputes cpu_base::*next_timer and returns the earliest expires_next
* but does not set cpu_base::*expires_next, that is done by
* hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
* cpu_base::*expires_next right away, reprogramming logic would no longer
* work.
*
* When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
* those timers will get run whenever the softirq gets handled, at the end of
* hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
*
* Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
* The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
* softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
*
* @active_mask must be one of:
* - HRTIMER_ACTIVE_ALL,
* - HRTIMER_ACTIVE_SOFT, or
* - HRTIMER_ACTIVE_HARD.
*/
static ktime_t
__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
unsigned int active;
struct hrtimer *next_timer = NULL;
ktime_t expires_next = KTIME_MAX;
if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
cpu_base->softirq_next_timer = NULL;
expires_next = __hrtimer_next_event_base(cpu_base, NULL,
active, KTIME_MAX);
next_timer = cpu_base->softirq_next_timer;
}
if (active_mask & HRTIMER_ACTIVE_HARD) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
cpu_base->next_timer = next_timer;
expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
expires_next);
}
return expires_next;
}
static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
{
ktime_t expires_next, soft = KTIME_MAX;
/*
* If the soft interrupt has already been activated, ignore the
* soft bases. They will be handled in the already raised soft
* interrupt.
*/
if (!cpu_base->softirq_activated) {
soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
/*
* Update the soft expiry time. clock_settime() might have
* affected it.
*/
cpu_base->softirq_expires_next = soft;
}
expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
/*
* If a softirq timer is expiring first, update cpu_base->next_timer
* and program the hardware with the soft expiry time.
*/
if (expires_next > soft) {
cpu_base->next_timer = cpu_base->softirq_next_timer;
expires_next = soft;
}
return expires_next;
}
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
offs_real, offs_boot, offs_tai);
base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
return now;
}
/*
* Is the high resolution mode active ?
*/
static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
cpu_base->hres_active : 0;
}
static inline int hrtimer_hres_active(void)
{
return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
}
static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
struct hrtimer *next_timer,
ktime_t expires_next)
{
cpu_base->expires_next = expires_next;
/*
* If hres is not active, hardware does not have to be
* reprogrammed yet.
*
* If a hang was detected in the last timer interrupt then we
* leave the hang delay active in the hardware. We want the
* system to make progress. That also prevents the following
* scenario:
* T1 expires 50ms from now
* T2 expires 5s from now
*
* T1 is removed, so this code is called and would reprogram
* the hardware to 5s from now. Any hrtimer_start after that
* will not reprogram the hardware due to hang_detected being
* set. So we'd effectively block all timers until the T2 event
* fires.
*/
if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
tick_program_event(expires_next, 1);
}
/*
* Reprogram the event source with checking both queues for the
* next event
* Called with interrupts disabled and base->lock held
*/
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
ktime_t expires_next;
expires_next = hrtimer_update_next_event(cpu_base);
if (skip_equal && expires_next == cpu_base->expires_next)
return;
__hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next);
}
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
/*
* High resolution timer enabled ?
*/
static bool hrtimer_hres_enabled __read_mostly = true;
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);
/*
* Enable / Disable high resolution mode
*/
static int __init setup_hrtimer_hres(char *str)
{
return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}
__setup("highres=", setup_hrtimer_hres);
/*
* hrtimer_high_res_enabled - query, if the highres mode is enabled
*/
static inline int hrtimer_is_hres_enabled(void)
{
return hrtimer_hres_enabled;
}
static void retrigger_next_event(void *arg);
/*
* Switch to high resolution mode
*/
static void hrtimer_switch_to_hres(void)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
if (tick_init_highres()) {
pr_warn("Could not switch to high resolution mode on CPU %u\n",
base->cpu);
return;
}
base->hres_active = 1;
hrtimer_resolution = HIGH_RES_NSEC;
tick_setup_sched_timer();
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
}
#else
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline void hrtimer_switch_to_hres(void) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
/*
* Retrigger next event is called after clock was set with interrupts
* disabled through an SMP function call or directly from low level
* resume code.
*
* This is only invoked when:
* - CONFIG_HIGH_RES_TIMERS is enabled.
* - CONFIG_NOHZ_COMMON is enabled
*
* For the other cases this function is empty and because the call sites
* are optimized out it vanishes as well, i.e. no need for lots of
* #ifdeffery.
*/
static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
/*
* When high resolution mode or nohz is active, then the offsets of
* CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the
* next tick will take care of that.
*
* If high resolution mode is active then the next expiring timer
* must be reevaluated and the clock event device reprogrammed if
* necessary.
*
* In the NOHZ case the update of the offset and the reevaluation
* of the next expiring timer is enough. The return from the SMP
* function call will take care of the reprogramming in case the
* CPU was in a NOHZ idle sleep.
*/
if (!__hrtimer_hres_active(base) && !tick_nohz_active)
return;
raw_spin_lock(&base->lock);
hrtimer_update_base(base);
if (__hrtimer_hres_active(base))
hrtimer_force_reprogram(base, 0);
else
hrtimer_update_next_event(base);
raw_spin_unlock(&base->lock);
}
/*
* When a timer is enqueued and expires earlier than the already enqueued
* timers, we have to check, whether it expires earlier than the timer for
* which the clock event device was armed.
*
* Called with interrupts disabled and base->cpu_base.lock held
*/
static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *base = timer->base;
ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
/*
* CLOCK_REALTIME timer might be requested with an absolute
* expiry time which is less than base->offset. Set it to 0.
*/
if (expires < 0)
expires = 0;
if (timer->is_soft) {
/*
* soft hrtimer could be started on a remote CPU. In this
* case softirq_expires_next needs to be updated on the
* remote CPU. The soft hrtimer will not expire before the
* first hard hrtimer on the remote CPU -
* hrtimer_check_target() prevents this case.
*/
struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;
if (timer_cpu_base->softirq_activated)
return;
if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
return;
timer_cpu_base->softirq_next_timer = timer;
timer_cpu_base->softirq_expires_next = expires;
if (!ktime_before(expires, timer_cpu_base->expires_next) ||
!reprogram)
return;
}
/*
* If the timer is not on the current cpu, we cannot reprogram
* the other cpus clock event device.
*/
if (base->cpu_base != cpu_base)
return;
if (expires >= cpu_base->expires_next)
return;
/*
* If the hrtimer interrupt is running, then it will reevaluate the
* clock bases and reprogram the clock event device.
*/
if (cpu_base->in_hrtirq)
return;
cpu_base->next_timer = timer;
__hrtimer_reprogram(cpu_base, timer, expires);
}
static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
unsigned int active)
{
struct hrtimer_clock_base *base;
unsigned int seq;
ktime_t expires;
/*
* Update the base offsets unconditionally so the following
* checks whether the SMP function call is required works.
*
* The update is safe even when the remote CPU is in the hrtimer
* interrupt or the hrtimer soft interrupt and expiring affected
* bases. Either it will see the update before handling a base or
* it will see it when it finishes the processing and reevaluates
* the next expiring timer.
*/
seq = cpu_base->clock_was_set_seq;
hrtimer_update_base(cpu_base);
/*
* If the sequence did not change over the update then the
* remote CPU already handled it.
*/
if (seq == cpu_base->clock_was_set_seq)
return false;
/*
* If the remote CPU is currently handling an hrtimer interrupt, it
* will reevaluate the first expiring timer of all clock bases
* before reprogramming. Nothing to do here.
*/
if (cpu_base->in_hrtirq)
return false;
/*
* Walk the affected clock bases and check whether the first expiring
* timer in a clock base is moving ahead of the first expiring timer of
* @cpu_base. If so, the IPI must be invoked because per CPU clock
* event devices cannot be remotely reprogrammed.
*/
active &= cpu_base->active_bases;
for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *next;
next = timerqueue_getnext(&base->active);
expires = ktime_sub(next->expires, base->offset);
if (expires < cpu_base->expires_next)
return true;
/* Extra check for softirq clock bases */
if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT)
continue;
if (cpu_base->softirq_activated)
continue;
if (expires < cpu_base->softirq_expires_next)
return true;
}
return false;
}
/*
* Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and
* CLOCK_BOOTTIME (for late sleep time injection).
*
* This requires to update the offsets for these clocks
* vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this
* also requires to eventually reprogram the per CPU clock event devices
* when the change moves an affected timer ahead of the first expiring
* timer on that CPU. Obviously remote per CPU clock event devices cannot
* be reprogrammed. The other reason why an IPI has to be sent is when the
* system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets
* in the tick, which obviously might be stopped, so this has to bring out
* the remote CPU which might sleep in idle to get this sorted.
*/
void clock_was_set(unsigned int bases)
{
struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases);
cpumask_var_t mask;
int cpu;
if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active)
goto out_timerfd;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
on_each_cpu(retrigger_next_event, NULL, 1);
goto out_timerfd;
}
/* Avoid interrupting CPUs if possible */
cpus_read_lock();
for_each_online_cpu(cpu) {
unsigned long flags;
cpu_base = &per_cpu(hrtimer_bases, cpu);
raw_spin_lock_irqsave(&cpu_base->lock, flags);
if (update_needs_ipi(cpu_base, bases))
cpumask_set_cpu(cpu, mask);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
preempt_disable();
smp_call_function_many(mask, retrigger_next_event, NULL, 1);
preempt_enable();
cpus_read_unlock();
free_cpumask_var(mask);
out_timerfd:
timerfd_clock_was_set();
}
static void clock_was_set_work(struct work_struct *work)
{
clock_was_set(CLOCK_SET_WALL);
}
static DECLARE_WORK(hrtimer_work, clock_was_set_work);
/*
* Called from timekeeping code to reprogram the hrtimer interrupt device
* on all cpus and to notify timerfd.
*/
void clock_was_set_delayed(void)
{
schedule_work(&hrtimer_work);
}
/*
* Called during resume either directly from via timekeeping_resume()
* or in the case of s2idle from tick_unfreeze() to ensure that the
* hrtimers are up to date.
*/
void hrtimers_resume_local(void)
{
lockdep_assert_irqs_disabled();
/* Retrigger on the local CPU */
retrigger_next_event(NULL);
}
/*
* Counterpart to lock_hrtimer_base above:
*/
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
{
raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}
/**
* hrtimer_forward - forward the timer expiry
* @timer: hrtimer to forward
* @now: forward past this time
* @interval: the interval to forward
*
* Forward the timer expiry so it will expire in the future.
* Returns the number of overruns.
*
* Can be safely called from the callback function of @timer. If
* called from other contexts @timer must neither be enqueued nor
* running the callback and the caller needs to take care of
* serialization.
*
* Note: This only updates the timer expiry value and does not requeue
* the timer.
*/
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
u64 orun = 1;
ktime_t delta;
delta = ktime_sub(now, hrtimer_get_expires(timer));
if (delta < 0)
return 0;
if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
return 0;
if (interval < hrtimer_resolution)
interval = hrtimer_resolution;
if (unlikely(delta >= interval)) {
s64 incr = ktime_to_ns(interval);
orun = ktime_divns(delta, incr);
hrtimer_add_expires_ns(timer, incr * orun);
if (hrtimer_get_expires_tv64(timer) > now)
return orun;
/*
* This (and the ktime_add() below) is the
* correction for exact:
*/
orun++;
}
hrtimer_add_expires(timer, interval);
return orun;
}
EXPORT_SYMBOL_GPL(hrtimer_forward);
/*
* enqueue_hrtimer - internal function to (re)start a timer
*
* The timer is inserted in expiry order. Insertion into the
* red black tree is O(log(n)). Must hold the base lock.
*
* Returns 1 when the new timer is the leftmost timer in the tree.
*/
static int enqueue_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
enum hrtimer_mode mode)
{
debug_activate(timer, mode);
base->cpu_base->active_bases |= 1 << base->index;
/* Pairs with the lockless read in hrtimer_is_queued() */
WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);
return timerqueue_add(&base->active, &timer->node);
}
/*
* __remove_hrtimer - internal function to remove a timer
*
* Caller must hold the base lock.
*
* High resolution timer mode reprograms the clock event device when the
* timer is the one which expires next. The caller can disable this by setting
* reprogram to zero. This is useful, when the context does a reprogramming
* anyway (e.g. timer interrupt)
*/
static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
u8 newstate, int reprogram)
{
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
u8 state = timer->state;
/* Pairs with the lockless read in hrtimer_is_queued() */
WRITE_ONCE(timer->state, newstate);
if (!(state & HRTIMER_STATE_ENQUEUED))
return;
if (!timerqueue_del(&base->active, &timer->node))
cpu_base->active_bases &= ~(1 << base->index);
/*
* Note: If reprogram is false we do not update
* cpu_base->next_timer. This happens when we remove the first
* timer on a remote cpu. No harm as we never dereference
* cpu_base->next_timer. So the worst thing what can happen is
* an superfluous call to hrtimer_force_reprogram() on the
* remote cpu later on if the same timer gets enqueued again.
*/
if (reprogram && timer == cpu_base->next_timer)
hrtimer_force_reprogram(cpu_base, 1);
}
/*
* remove hrtimer, called with base lock held
*/
static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
bool restart, bool keep_local)
{
u8 state = timer->state;
if (state & HRTIMER_STATE_ENQUEUED) {
bool reprogram;
/*
* Remove the timer and force reprogramming when high
* resolution mode is active and the timer is on the current
* CPU. If we remove a timer on another CPU, reprogramming is
* skipped. The interrupt event on this CPU is fired and
* reprogramming happens in the interrupt handler. This is a
* rare case and less expensive than a smp call.
*/
debug_deactivate(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
/*
* If the timer is not restarted then reprogramming is
* required if the timer is local. If it is local and about
* to be restarted, avoid programming it twice (on removal
* and a moment later when it's requeued).
*/
if (!restart)
state = HRTIMER_STATE_INACTIVE;
else
reprogram &= !keep_local;
__remove_hrtimer(timer, base, state, reprogram);
return 1;
}
return 0;
}
static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
const enum hrtimer_mode mode)
{
#ifdef CONFIG_TIME_LOW_RES
/*
* CONFIG_TIME_LOW_RES indicates that the system has no way to return
* granular time values. For relative timers we add hrtimer_resolution
* (i.e. one jiffie) to prevent short timeouts.
*/
timer->is_rel = mode & HRTIMER_MODE_REL;
if (timer->is_rel)
tim = ktime_add_safe(tim, hrtimer_resolution);
#endif
return tim;
}
static void
hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
ktime_t expires;
/*
* Find the next SOFT expiration.
*/
expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
/*
* reprogramming needs to be triggered, even if the next soft
* hrtimer expires at the same time than the next hard
* hrtimer. cpu_base->softirq_expires_next needs to be updated!
*/
if (expires == KTIME_MAX)
return;
/*
* cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
* cpu_base->*expires_next is only set by hrtimer_reprogram()
*/
hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
}
static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
struct hrtimer_clock_base *new_base;
bool force_local, first;
/*
* If the timer is on the local cpu base and is the first expiring
* timer then this might end up reprogramming the hardware twice
* (on removal and on enqueue). To avoid that by prevent the
* reprogram on removal, keep the timer local to the current CPU
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
force_local &= base->cpu_base->next_timer == timer;
/*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
* remote data correctly.
*
* If it's on the current CPU and the first expiring timer, then
* skip reprogramming, keep the timer local and enforce
* reprogramming later if it was the first expiring timer. This
* avoids programming the underlying clock event twice (once at
* removal and once after enqueue).
*/
remove_hrtimer(timer, base, true, force_local);
if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time());
tim = hrtimer_update_lowres(timer, tim, mode);
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
/* Switch the timer base, if necessary: */
if (!force_local) {
new_base = switch_hrtimer_base(timer, base,
mode & HRTIMER_MODE_PINNED);
} else {
new_base = base;
}
first = enqueue_hrtimer(timer, new_base, mode);
if (!force_local)
return first;
/*
* Timer was forced to stay on the current CPU to avoid
* reprogramming on removal and enqueue. Force reprogram the
* hardware by evaluating the new first expiring timer.
*/
hrtimer_force_reprogram(new_base->cpu_base, 1);
return 0;
}
/**
* hrtimer_start_range_ns - (re)start an hrtimer
* @timer: the timer to be added
* @tim: expiry time
* @delta_ns: "slack" range for the timer
* @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
* relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
* softirq based mode is considered for debug purpose only!
*/
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base;
unsigned long flags;
/*
* Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
* match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
* expiry mode because unmarked timers are moved to softirq expiry.
*/
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
else
WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
base = lock_hrtimer_base(timer, &flags);
if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
hrtimer_reprogram(timer, true); unlock_hrtimer_base(timer, &flags);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
/**
* hrtimer_try_to_cancel - try to deactivate a timer
* @timer: hrtimer to stop
*
* Returns:
*
* * 0 when the timer was not active
* * 1 when the timer was active
* * -1 when the timer is currently executing the callback function and
* cannot be stopped
*/
int hrtimer_try_to_cancel(struct hrtimer *timer)
{
struct hrtimer_clock_base *base;
unsigned long flags;
int ret = -1;
/*
* Check lockless first. If the timer is not active (neither
* enqueued nor running the callback, nothing to do here. The
* base lock does not serialize against a concurrent enqueue,
* so we can avoid taking it.
*/
if (!hrtimer_active(timer))
return 0;
base = lock_hrtimer_base(timer, &flags);
if (!hrtimer_callback_running(timer))
ret = remove_hrtimer(timer, base, false, false);
unlock_hrtimer_base(timer, &flags);
return ret;
}
EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
#ifdef CONFIG_PREEMPT_RT
static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
{
spin_lock_init(&base->softirq_expiry_lock);
}
static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
{
spin_lock(&base->softirq_expiry_lock);
}
static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
{
spin_unlock(&base->softirq_expiry_lock);
}
/*
* The counterpart to hrtimer_cancel_wait_running().
*
* If there is a waiter for cpu_base->expiry_lock, then it was waiting for
* the timer callback to finish. Drop expiry_lock and reacquire it. That
* allows the waiter to acquire the lock and make progress.
*/
static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
unsigned long flags)
{
if (atomic_read(&cpu_base->timer_waiters)) {
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
spin_unlock(&cpu_base->softirq_expiry_lock);
spin_lock(&cpu_base->softirq_expiry_lock);
raw_spin_lock_irq(&cpu_base->lock);
}
}
/*
* This function is called on PREEMPT_RT kernels when the fast path
* deletion of a timer failed because the timer callback function was
* running.
*
* This prevents priority inversion: if the soft irq thread is preempted
* in the middle of a timer callback, then calling del_timer_sync() can
* lead to two issues:
*
* - If the caller is on a remote CPU then it has to spin wait for the timer
* handler to complete. This can result in unbound priority inversion.
*
* - If the caller originates from the task which preempted the timer
* handler on the same CPU, then spin waiting for the timer handler to
* complete is never going to end.
*/
void hrtimer_cancel_wait_running(const struct hrtimer *timer)
{
/* Lockless read. Prevent the compiler from reloading it below */
struct hrtimer_clock_base *base = READ_ONCE(timer->base);
/*
* Just relax if the timer expires in hard interrupt context or if
* it is currently on the migration base.
*/
if (!timer->is_soft || is_migration_base(base)) {
cpu_relax();
return;
}
/*
* Mark the base as contended and grab the expiry lock, which is
* held by the softirq across the timer callback. Drop the lock
* immediately so the softirq can expire the next timer. In theory
* the timer could already be running again, but that's more than
* unlikely and just causes another wait loop.
*/
atomic_inc(&base->cpu_base->timer_waiters);
spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
atomic_dec(&base->cpu_base->timer_waiters);
spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
}
#else
static inline void
hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
unsigned long flags) { }
#endif
/**
* hrtimer_cancel - cancel a timer and wait for the handler to finish.
* @timer: the timer to be cancelled
*
* Returns:
* 0 when the timer was not active
* 1 when the timer was active
*/
int hrtimer_cancel(struct hrtimer *timer)
{
int ret;
do {
ret = hrtimer_try_to_cancel(timer);
if (ret < 0)
hrtimer_cancel_wait_running(timer);
} while (ret < 0);
return ret;
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);
/**
* __hrtimer_get_remaining - get remaining time for the timer
* @timer: the timer to read
* @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y
*/
ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
{
unsigned long flags;
ktime_t rem;
lock_hrtimer_base(timer, &flags);
if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust)
rem = hrtimer_expires_remaining_adjusted(timer);
else
rem = hrtimer_expires_remaining(timer);
unlock_hrtimer_base(timer, &flags);
return rem;
}
EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
#ifdef CONFIG_NO_HZ_COMMON
/**
* hrtimer_get_next_event - get the time until next expiry event
*
* Returns the next expiry time or KTIME_MAX if no timer is pending.
*/
u64 hrtimer_get_next_event(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
u64 expires = KTIME_MAX;
unsigned long flags;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
if (!__hrtimer_hres_active(cpu_base))
expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
return expires;
}
/**
* hrtimer_next_event_without - time until next expiry event w/o one timer
* @exclude: timer to exclude
*
* Returns the next expiry time over all timers except for the @exclude one or
* KTIME_MAX if none of them is pending.
*/
u64 hrtimer_next_event_without(const struct hrtimer *exclude)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
u64 expires = KTIME_MAX;
unsigned long flags;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
if (__hrtimer_hres_active(cpu_base)) {
unsigned int active;
if (!cpu_base->softirq_activated) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
expires = __hrtimer_next_event_base(cpu_base, exclude,
active, KTIME_MAX);
}
active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
expires = __hrtimer_next_event_base(cpu_base, exclude, active,
expires);
}
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
return expires;
}
#endif
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
if (likely(clock_id < MAX_CLOCKS)) { int base = hrtimer_clock_to_base_table[clock_id]; if (likely(base != HRTIMER_MAX_CLOCK_BASES))
return base;
}
WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
return HRTIMER_BASE_MONOTONIC;
}
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
struct hrtimer_cpu_base *cpu_base;
int base;
/*
* On PREEMPT_RT enabled kernels hrtimers which are not explicitly
* marked for hard interrupt expiry mode are moved into soft
* interrupt context for latency reasons and because the callbacks
* can invoke functions which might sleep on RT, e.g. spin_lock().
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD))
softtimer = true;
memset(timer, 0, sizeof(struct hrtimer));
cpu_base = raw_cpu_ptr(&hrtimer_bases);
/*
* POSIX magic: Relative CLOCK_REALTIME timers are not affected by
* clock modifications, so they needs to become CLOCK_MONOTONIC to
* ensure POSIX compliance.
*/
if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
clock_id = CLOCK_MONOTONIC;
base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
base += hrtimer_clockid_to_base(clock_id);
timer->is_soft = softtimer;
timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
}
/**
* hrtimer_init - initialize a timer to the given clock
* @timer: the timer to be initialized
* @clock_id: the clock to be used
* @mode: The modes which are relevant for initialization:
* HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
* HRTIMER_MODE_REL_SOFT
*
* The PINNED variants of the above can be handed in,
* but the PINNED bit is ignored as pinning happens
* when the hrtimer is started
*/
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
debug_init(timer, clock_id, mode);
__hrtimer_init(timer, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init);
/*
* A timer is active, when it is enqueued into the rbtree or the
* callback function is running or it's in the state of being migrated
* to another cpu.
*
* It is important for this function to not return a false negative.
*/
bool hrtimer_active(const struct hrtimer *timer)
{
struct hrtimer_clock_base *base;
unsigned int seq;
do {
base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq);
if (timer->state != HRTIMER_STATE_INACTIVE ||
base->running == timer) return true;
} while (read_seqcount_retry(&base->seq, seq) ||
base != READ_ONCE(timer->base));
return false;
}
EXPORT_SYMBOL_GPL(hrtimer_active);
/*
* The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
* distinct sections:
*
* - queued: the timer is queued
* - callback: the timer is being ran
* - post: the timer is inactive or (re)queued
*
* On the read side we ensure we observe timer->state and cpu_base->running
* from the same section, if anything changed while we looked at it, we retry.
* This includes timer->base changing because sequence numbers alone are
* insufficient for that.
*
* The sequence numbers are required because otherwise we could still observe
* a false negative if the read side got smeared over multiple consecutive
* __run_hrtimer() invocations.
*/
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
struct hrtimer_clock_base *base,
struct hrtimer *timer, ktime_t *now,
unsigned long flags) __must_hold(&cpu_base->lock)
{
enum hrtimer_restart (*fn)(struct hrtimer *);
bool expires_in_hardirq;
int restart;
lockdep_assert_held(&cpu_base->lock);
debug_deactivate(timer);
base->running = timer;
/*
* Separate the ->running assignment from the ->state assignment.
*
* As with a regular write barrier, this ensures the read side in
* hrtimer_active() cannot observe base->running == NULL &&
* timer->state == INACTIVE.
*/
raw_write_seqcount_barrier(&base->seq);
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
fn = timer->function;
/*
* Clear the 'is relative' flag for the TIME_LOW_RES case. If the
* timer is restarted with a period then it becomes an absolute
* timer. If its not restarted it does not matter.
*/
if (IS_ENABLED(CONFIG_TIME_LOW_RES))
timer->is_rel = false;
/*
* The timer is marked as running in the CPU base, so it is
* protected against migration to a different CPU even if the lock
* is dropped.
*/
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
trace_hrtimer_expire_entry(timer, now);
expires_in_hardirq = lockdep_hrtimer_enter(timer);
restart = fn(timer);
lockdep_hrtimer_exit(expires_in_hardirq);
trace_hrtimer_expire_exit(timer);
raw_spin_lock_irq(&cpu_base->lock);
/*
* Note: We clear the running state after enqueue_hrtimer and
* we do not reprogram the event hardware. Happens either in
* hrtimer_start_range_ns() or in hrtimer_interrupt()
*
* Note: Because we dropped the cpu_base->lock above,
* hrtimer_start_range_ns() can have popped in and enqueued the timer
* for us already.
*/
if (restart != HRTIMER_NORESTART &&
!(timer->state & HRTIMER_STATE_ENQUEUED))
enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
/*
* Separate the ->running assignment from the ->state assignment.
*
* As with a regular write barrier, this ensures the read side in
* hrtimer_active() cannot observe base->running.timer == NULL &&
* timer->state == INACTIVE.
*/
raw_write_seqcount_barrier(&base->seq);
WARN_ON_ONCE(base->running != timer);
base->running = NULL;
}
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
unsigned long flags, unsigned int active_mask)
{
struct hrtimer_clock_base *base;
unsigned int active = cpu_base->active_bases & active_mask;
for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *node;
ktime_t basenow;
basenow = ktime_add(now, base->offset);
while ((node = timerqueue_getnext(&base->active))) {
struct hrtimer *timer;
timer = container_of(node, struct hrtimer, node);
/*
* The immediate goal for using the softexpires is
* minimizing wakeups, not running timers at the
* earliest interrupt after their soft expiration.
* This allows us to avoid using a Priority Search
* Tree, which can answer a stabbing query for
* overlapping intervals and instead use the simple
* BST we already have.
* We don't add extra wakeups by delaying timers that
* are right-of a not yet expired timer, because that
* timer will have to trigger a wakeup anyway.
*/
if (basenow < hrtimer_get_softexpires_tv64(timer))
break;
__run_hrtimer(cpu_base, base, timer, &basenow, flags);
if (active_mask == HRTIMER_ACTIVE_SOFT)
hrtimer_sync_wait_running(cpu_base, flags);
}
}
}
static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
unsigned long flags;
ktime_t now;
hrtimer_cpu_base_lock_expiry(cpu_base);
raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
cpu_base->softirq_activated = 0;
hrtimer_update_softirq_timer(cpu_base, true);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
hrtimer_cpu_base_unlock_expiry(cpu_base);
}
#ifdef CONFIG_HIGH_RES_TIMERS
/*
* High resolution timer interrupt
* Called with interrupts disabled
*/
void hrtimer_interrupt(struct clock_event_device *dev)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires_next, now, entry_time, delta;
unsigned long flags;
int retries = 0;
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
dev->next_event = KTIME_MAX;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
entry_time = now = hrtimer_update_base(cpu_base);
retry:
cpu_base->in_hrtirq = 1;
/*
* We set expires_next to KTIME_MAX here with cpu_base->lock
* held to prevent that a timer is enqueued in our queue via
* the migration code. This does not affect enqueueing of
* timers which run their callback and need to be requeued on
* this CPU.
*/
cpu_base->expires_next = KTIME_MAX;
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
cpu_base->softirq_activated = 1;
raise_softirq_irqoff(HRTIMER_SOFTIRQ);
}
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
/* Reevaluate the clock bases for the [soft] next expiry */
expires_next = hrtimer_update_next_event(cpu_base);
/*
* Store the new expiry value so the migration code can verify
* against it.
*/
cpu_base->expires_next = expires_next;
cpu_base->in_hrtirq = 0;
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
/* Reprogramming necessary ? */
if (!tick_program_event(expires_next, 0)) {
cpu_base->hang_detected = 0;
return;
}
/*
* The next timer was already expired due to:
* - tracing
* - long lasting callbacks
* - being scheduled away when running in a VM
*
* We need to prevent that we loop forever in the hrtimer
* interrupt routine. We give it 3 attempts to avoid
* overreacting on some spurious event.
*
* Acquire base lock for updating the offsets and retrieving
* the current time.
*/
raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
cpu_base->nr_retries++;
if (++retries < 3)
goto retry;
/*
* Give the system a chance to do something else than looping
* here. We stored the entry time, so we know exactly how long
* we spent here. We schedule the next event this amount of
* time away.
*/
cpu_base->nr_hangs++;
cpu_base->hang_detected = 1;
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
delta = ktime_sub(now, entry_time);
if ((unsigned int)delta > cpu_base->max_hang_time)
cpu_base->max_hang_time = (unsigned int) delta;
/*
* Limit it to a sensible value as we enforce a longer
* delay. Give the CPU at least 100ms to catch up.
*/
if (delta > 100 * NSEC_PER_MSEC)
expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
else
expires_next = ktime_add(now, delta);
tick_program_event(expires_next, 1);
pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
}
/* called with interrupts disabled */
static inline void __hrtimer_peek_ahead_timers(void)
{
struct tick_device *td;
if (!hrtimer_hres_active())
return;
td = this_cpu_ptr(&tick_cpu_device);
if (td && td->evtdev)
hrtimer_interrupt(td->evtdev);
}
#else /* CONFIG_HIGH_RES_TIMERS */
static inline void __hrtimer_peek_ahead_timers(void) { }
#endif /* !CONFIG_HIGH_RES_TIMERS */
/*
* Called from run_local_timers in hardirq context every jiffy
*/
void hrtimer_run_queues(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
unsigned long flags;
ktime_t now;
if (__hrtimer_hres_active(cpu_base))
return;
/*
* This _is_ ugly: We have to check periodically, whether we
* can switch to highres and / or nohz mode. The clocksource
* switch happens with xtime_lock held. Notification from
* there only sets the check bit in the tick_oneshot code,
* otherwise we might deadlock vs. xtime_lock.
*/
if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
hrtimer_switch_to_hres();
return;
}
raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
cpu_base->softirq_activated = 1;
raise_softirq_irqoff(HRTIMER_SOFTIRQ);
}
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
/*
* Sleep related functions:
*/
static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
{
struct hrtimer_sleeper *t =
container_of(timer, struct hrtimer_sleeper, timer);
struct task_struct *task = t->task;
t->task = NULL;
if (task)
wake_up_process(task);
return HRTIMER_NORESTART;
}
/**
* hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
* @sl: sleeper to be started
* @mode: timer mode abs/rel
*
* Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
* to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
*/
void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
enum hrtimer_mode mode)
{
/*
* Make the enqueue delivery mode check work on RT. If the sleeper
* was initialized for hard interrupt delivery, force the mode bit.
* This is a special case for hrtimer_sleepers because
* hrtimer_init_sleeper() determines the delivery mode on RT so the
* fiddling with this decision is avoided at the call sites.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
mode |= HRTIMER_MODE_HARD;
hrtimer_start_expires(&sl->timer, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
clockid_t clock_id, enum hrtimer_mode mode)
{
/*
* On PREEMPT_RT enabled kernels hrtimers which are not explicitly
* marked for hard interrupt expiry mode are moved into soft
* interrupt context either for latency reasons or because the
* hrtimer callback takes regular spinlocks or invokes other
* functions which are not suitable for hard interrupt context on
* PREEMPT_RT.
*
* The hrtimer_sleeper callback is RT compatible in hard interrupt
* context, but there is a latency concern: Untrusted userspace can
* spawn many threads which arm timers for the same expiry time on
* the same CPU. That causes a latency spike due to the wakeup of
* a gazillion threads.
*
* OTOH, privileged real-time user space applications rely on the
* low latency of hard interrupt wakeups. If the current task is in
* a real-time scheduling class, mark the mode for hard interrupt
* expiry.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT))
mode |= HRTIMER_MODE_HARD;
}
__hrtimer_init(&sl->timer, clock_id, mode);
sl->timer.function = hrtimer_wakeup;
sl->task = current;
}
/**
* hrtimer_init_sleeper - initialize sleeper to the given clock
* @sl: sleeper to be initialized
* @clock_id: the clock to be used
* @mode: timer mode abs/rel
*/
void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
enum hrtimer_mode mode)
{
debug_init(&sl->timer, clock_id, mode);
__hrtimer_init_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
switch(restart->nanosleep.type) {
#ifdef CONFIG_COMPAT_32BIT_TIME
case TT_COMPAT:
if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp))
return -EFAULT;
break;
#endif
case TT_NATIVE:
if (put_timespec64(ts, restart->nanosleep.rmtp))
return -EFAULT;
break;
default:
BUG();
}
return -ERESTART_RESTARTBLOCK;
}
static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
{
struct restart_block *restart;
do {
set_current_state(TASK_INTERRUPTIBLE);
hrtimer_sleeper_start_expires(t, mode);
if (likely(t->task))
freezable_schedule();
hrtimer_cancel(&t->timer);
mode = HRTIMER_MODE_ABS;
} while (t->task && !signal_pending(current));
__set_current_state(TASK_RUNNING);
if (!t->task)
return 0;
restart = ¤t->restart_block;
if (restart->nanosleep.type != TT_NONE) {
ktime_t rem = hrtimer_expires_remaining(&t->timer);
struct timespec64 rmt;
if (rem <= 0)
return 0;
rmt = ktime_to_timespec64(rem);
return nanosleep_copyout(restart, &rmt);
}
return -ERESTART_RESTARTBLOCK;
}
static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
struct hrtimer_sleeper t;
int ret;
hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
HRTIMER_MODE_ABS);
hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
destroy_hrtimer_on_stack(&t.timer);
return ret;
}
long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
const clockid_t clockid)
{
struct restart_block *restart;
struct hrtimer_sleeper t;
int ret = 0;
u64 slack;
slack = current->timer_slack_ns;
if (dl_task(current) || rt_task(current))
slack = 0;
hrtimer_init_sleeper_on_stack(&t, clockid, mode);
hrtimer_set_expires_range_ns(&t.timer, rqtp, slack);
ret = do_nanosleep(&t, mode);
if (ret != -ERESTART_RESTARTBLOCK)
goto out;
/* Absolute timers do not update the rmtp value and restart: */
if (mode == HRTIMER_MODE_ABS) {
ret = -ERESTARTNOHAND;
goto out;
}
restart = ¤t->restart_block;
restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
set_restart_fn(restart, hrtimer_nanosleep_restart);
out:
destroy_hrtimer_on_stack(&t.timer);
return ret;
}
#ifdef CONFIG_64BIT
SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
struct __kernel_timespec __user *, rmtp)
{
struct timespec64 tu;
if (get_timespec64(&tu, rqtp))
return -EFAULT;
if (!timespec64_valid(&tu))
return -EINVAL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
CLOCK_MONOTONIC);
}
#endif
#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
struct old_timespec32 __user *, rmtp)
{
struct timespec64 tu;
if (get_old_timespec32(&tu, rqtp))
return -EFAULT;
if (!timespec64_valid(&tu))
return -EINVAL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
CLOCK_MONOTONIC);
}
#endif
/*
* Functions related to boot-time initialization:
*/
int hrtimers_prepare_cpu(unsigned int cpu)
{
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
clock_b->cpu_base = cpu_base;
seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
timerqueue_init_head(&clock_b->active);
}
cpu_base->cpu = cpu;
cpu_base->active_bases = 0;
cpu_base->hres_active = 0;
cpu_base->hang_detected = 0;
cpu_base->next_timer = NULL;
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
hrtimer_cpu_base_init_expiry_lock(cpu_base);
return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
struct hrtimer_clock_base *new_base)
{
struct hrtimer *timer;
struct timerqueue_node *node;
while ((node = timerqueue_getnext(&old_base->active))) {
timer = container_of(node, struct hrtimer, node);
BUG_ON(hrtimer_callback_running(timer));
debug_deactivate(timer);
/*
* Mark it as ENQUEUED not INACTIVE otherwise the
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
* reprogram the event device in case the timer
* expires before the earliest on this CPU, but we run
* hrtimer_interrupt after we migrated everything to
* sort out already expired timers and reprogram the
* event device.
*/
enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
}
}
int hrtimers_dead_cpu(unsigned int scpu)
{
struct hrtimer_cpu_base *old_base, *new_base;
int i;
BUG_ON(cpu_online(scpu));
tick_cancel_sched_timer(scpu);
/*
* this BH disable ensures that raise_softirq_irqoff() does
* not wakeup ksoftirqd (and acquire the pi-lock) while
* holding the cpu_base lock
*/
local_bh_disable();
local_irq_disable();
old_base = &per_cpu(hrtimer_bases, scpu);
new_base = this_cpu_ptr(&hrtimer_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
*/
raw_spin_lock(&new_base->lock);
raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
migrate_hrtimer_list(&old_base->clock_base[i],
&new_base->clock_base[i]);
}
/*
* The migration might have changed the first expiring softirq
* timer on this CPU. Update it.
*/
hrtimer_update_softirq_timer(new_base, false);
raw_spin_unlock(&old_base->lock);
raw_spin_unlock(&new_base->lock);
/* Check, if we got expired work to do */
__hrtimer_peek_ahead_timers();
local_irq_enable();
local_bh_enable();
return 0;
}
#endif /* CONFIG_HOTPLUG_CPU */
void __init hrtimers_init(void)
{
hrtimers_prepare_cpu(smp_processor_id());
open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}
/**
* schedule_hrtimeout_range_clock - sleep until timeout
* @expires: timeout value (ktime_t)
* @delta: slack in expires timeout (ktime_t)
* @mode: timer mode
* @clock_id: timer clock to be used
*/
int __sched
schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode, clockid_t clock_id)
{
struct hrtimer_sleeper t;
/*
* Optimize when a zero timeout value is given. It does not
* matter whether this is an absolute or a relative time.
*/
if (expires && *expires == 0) {
__set_current_state(TASK_RUNNING);
return 0;
}
/*
* A NULL parameter means "infinite"
*/
if (!expires) {
schedule();
return -EINTR;
}
hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
hrtimer_sleeper_start_expires(&t, mode);
if (likely(t.task))
schedule();
hrtimer_cancel(&t.timer);
destroy_hrtimer_on_stack(&t.timer);
__set_current_state(TASK_RUNNING);
return !t.task ? 0 : -EINTR;
}
/**
* schedule_hrtimeout_range - sleep until timeout
* @expires: timeout value (ktime_t)
* @delta: slack in expires timeout (ktime_t)
* @mode: timer mode
*
* Make the current task sleep until the given expiry time has
* elapsed. The routine will return immediately unless
* the current task state has been set (see set_current_state()).
*
* The @delta argument gives the kernel the freedom to schedule the
* actual wakeup to a time that is both power and performance friendly.
* The kernel give the normal best effort behavior for "@expires+@delta",
* but may decide to fire the timer earlier, but no earlier than @expires.
*
* You can set the task state as follows -
*
* %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
* pass before the routine returns unless the current task is explicitly
* woken up, (e.g. by wake_up_process()).
*
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
* delivered to the current task or the current task is explicitly woken
* up.
*
* The current task state is guaranteed to be TASK_RUNNING when this
* routine returns.
*
* Returns 0 when the timer has expired. If the task was woken before the
* timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
* by an explicit wakeup, it returns -EINTR.
*/
int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode)
{
return schedule_hrtimeout_range_clock(expires, delta, mode,
CLOCK_MONOTONIC);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
/**
* schedule_hrtimeout - sleep until timeout
* @expires: timeout value (ktime_t)
* @mode: timer mode
*
* Make the current task sleep until the given expiry time has
* elapsed. The routine will return immediately unless
* the current task state has been set (see set_current_state()).
*
* You can set the task state as follows -
*
* %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
* pass before the routine returns unless the current task is explicitly
* woken up, (e.g. by wake_up_process()).
*
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
* delivered to the current task or the current task is explicitly woken
* up.
*
* The current task state is guaranteed to be TASK_RUNNING when this
* routine returns.
*
* Returns 0 when the timer has expired. If the task was woken before the
* timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
* by an explicit wakeup, it returns -EINTR.
*/
int __sched schedule_hrtimeout(ktime_t *expires,
const enum hrtimer_mode mode)
{
return schedule_hrtimeout_range(expires, 0, mode);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout);
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/kasan-checks.h>
#include <linux/thread_info.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <asm/byteorder.h>
#include <asm/word-at-a-time.h>
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
#define IS_UNALIGNED(src, dst) 0
#else
#define IS_UNALIGNED(src, dst) \
(((long) dst | (long) src) & (sizeof(long) - 1))
#endif
/*
* Do a strncpy, return length of string without final '\0'.
* 'count' is the user-supplied count (return 'count' if we
* hit it), 'max' is the address space maximum (and we return
* -EFAULT if we hit it).
*/
static inline long do_strncpy_from_user(char *dst, const char __user *src,
unsigned long count, unsigned long max)
{
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
unsigned long res = 0;
if (IS_UNALIGNED(src, dst))
goto byte_at_a_time;
while (max >= sizeof(unsigned long)) {
unsigned long c, data, mask;
/* Fall back to byte-at-a-time if we get a page fault */
unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time);
/*
* Note that we mask out the bytes following the NUL. This is
* important to do because string oblivious code may read past
* the NUL. For those routines, we don't want to give them
* potentially random bytes after the NUL in `src`.
*
* One example of such code is BPF map keys. BPF treats map keys
* as an opaque set of bytes. Without the post-NUL mask, any BPF
* maps keyed by strings returned from strncpy_from_user() may
* have multiple entries for semantically identical strings.
*/
if (has_zero(c, &data, &constants)) {
data = prep_zero_mask(c, data, &constants);
data = create_zero_mask(data);
mask = zero_bytemask(data);
*(unsigned long *)(dst+res) = c & mask;
return res + find_zero(data);
}
*(unsigned long *)(dst+res) = c;
res += sizeof(unsigned long);
max -= sizeof(unsigned long);
}
byte_at_a_time:
while (max) {
char c;
unsafe_get_user(c,src+res, efault); dst[res] = c;
if (!c)
return res;
res++;
max--;
}
/*
* Uhhuh. We hit 'max'. But was that the user-specified maximum
* too? If so, that's ok - we got as much as the user asked for.
*/
if (res >= count) return res;
/*
* Nope: we hit the address space limit, and we still had more
* characters the caller would have wanted. That's an EFAULT.
*/
efault:
return -EFAULT;
}
/**
* strncpy_from_user: - Copy a NUL terminated string from userspace.
* @dst: Destination address, in kernel space. This buffer must be at
* least @count bytes long.
* @src: Source address, in user space.
* @count: Maximum number of bytes to copy, including the trailing NUL.
*
* Copies a NUL-terminated string from userspace to kernel space.
*
* On success, returns the length of the string (not including the trailing
* NUL).
*
* If access to userspace fails, returns -EFAULT (some data may have been
* copied).
*
* If @count is smaller than the length of the string, copies @count bytes
* and returns @count.
*/
long strncpy_from_user(char *dst, const char __user *src, long count)
{
unsigned long max_addr, src_addr;
might_fault();
if (should_fail_usercopy())
return -EFAULT;
if (unlikely(count <= 0))
return 0;
max_addr = user_addr_max();
src_addr = (unsigned long)untagged_addr(src);
if (likely(src_addr < max_addr)) {
unsigned long max = max_addr - src_addr;
long retval;
/*
* Truncate 'max' to the user-specified limit, so that
* we only have one limit we need to check in the loop
*/
if (max > count)
max = count;
kasan_check_write(dst, count);
check_object_size(dst, count, false);
if (user_read_access_begin(src, max)) {
retval = do_strncpy_from_user(dst, src, count, max);
user_read_access_end();
return retval;
}
}
return -EFAULT;
}
EXPORT_SYMBOL(strncpy_from_user);
// SPDX-License-Identifier: GPL-2.0
/*
* Functions related to generic timeout handling of requests.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/fault-inject.h>
#include "blk.h"
#include "blk-mq.h"
#ifdef CONFIG_FAIL_IO_TIMEOUT
static DECLARE_FAULT_ATTR(fail_io_timeout);
static int __init setup_fail_io_timeout(char *str)
{
return setup_fault_attr(&fail_io_timeout, str);
}
__setup("fail_io_timeout=", setup_fail_io_timeout);
bool __blk_should_fake_timeout(struct request_queue *q)
{
return should_fail(&fail_io_timeout, 1);
}
EXPORT_SYMBOL_GPL(__blk_should_fake_timeout);
static int __init fail_io_timeout_debugfs(void)
{
struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout",
NULL, &fail_io_timeout);
return PTR_ERR_OR_ZERO(dir);
}
late_initcall(fail_io_timeout_debugfs);
ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags);
return sprintf(buf, "%d\n", set != 0);
}
ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct gendisk *disk = dev_to_disk(dev);
int val;
if (count) {
struct request_queue *q = disk->queue;
char *p = (char *) buf;
val = simple_strtoul(p, &p, 10);
if (val)
blk_queue_flag_set(QUEUE_FLAG_FAIL_IO, q);
else
blk_queue_flag_clear(QUEUE_FLAG_FAIL_IO, q);
}
return count;
}
#endif /* CONFIG_FAIL_IO_TIMEOUT */
/**
* blk_abort_request - Request recovery for the specified command
* @req: pointer to the request of interest
*
* This function requests that the block layer start recovery for the
* request by deleting the timer and calling the q's timeout function.
* LLDDs who implement their own error recovery MAY ignore the timeout
* event if they generated blk_abort_request.
*/
void blk_abort_request(struct request *req)
{
/*
* All we need to ensure is that timeout scan takes place
* immediately and that scan sees the new timeout value.
* No need for fancy synchronizations.
*/
WRITE_ONCE(req->deadline, jiffies);
kblockd_schedule_work(&req->q->timeout_work);
}
EXPORT_SYMBOL_GPL(blk_abort_request);
static unsigned long blk_timeout_mask __read_mostly;
static int __init blk_timeout_init(void)
{
blk_timeout_mask = roundup_pow_of_two(HZ) - 1;
return 0;
}
late_initcall(blk_timeout_init);
/*
* Just a rough estimate, we don't care about specific values for timeouts.
*/
static inline unsigned long blk_round_jiffies(unsigned long j)
{
return (j + blk_timeout_mask) + 1;
}
unsigned long blk_rq_timeout(unsigned long timeout)
{
unsigned long maxt;
maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT);
if (time_after(timeout, maxt))
timeout = maxt;
return timeout;
}
/**
* blk_add_timer - Start timeout timer for a single request
* @req: request that is about to start running.
*
* Notes:
* Each request has its own timer, and as it is added to the queue, we
* set up the timer. When the request completes, we cancel the timer.
*/
void blk_add_timer(struct request *req)
{
struct request_queue *q = req->q;
unsigned long expiry;
/*
* Some LLDs, like scsi, peek at the timeout to prevent a
* command from being retried forever.
*/
if (!req->timeout)
req->timeout = q->rq_timeout; req->rq_flags &= ~RQF_TIMED_OUT;
expiry = jiffies + req->timeout;
WRITE_ONCE(req->deadline, expiry);
/*
* If the timer isn't already pending or this timeout is earlier
* than an existing one, modify the timer. Round up to next nearest
* second.
*/
expiry = blk_rq_timeout(blk_round_jiffies(expiry));
if (!timer_pending(&q->timeout) ||
time_before(expiry, q->timeout.expires)) { unsigned long diff = q->timeout.expires - expiry;
/*
* Due to added timer slack to group timers, the timer
* will often be a little in front of what we asked for.
* So apply some tolerance here too, otherwise we keep
* modifying the timer because expires for value X
* will be X + something.
*/
if (!timer_pending(&q->timeout) || (diff >= HZ / 2)) mod_timer(&q->timeout, expiry);
}
}
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* include/linux/idr.h
*
* 2002-10-18 written by Jim Houston jim.houston@ccur.com
* Copyright (C) 2002 by Concurrent Computer Corporation
*
* Small id to pointer translation service avoiding fixed sized
* tables.
*/
#ifndef __IDR_H__
#define __IDR_H__
#include <linux/radix-tree.h>
#include <linux/gfp.h>
#include <linux/percpu.h>
struct idr {
struct radix_tree_root idr_rt;
unsigned int idr_base;
unsigned int idr_next;
};
/*
* The IDR API does not expose the tagging functionality of the radix tree
* to users. Use tag 0 to track whether a node has free space below it.
*/
#define IDR_FREE 0
/* Set the IDR flag and the IDR_FREE tag */
#define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \
(1 << (ROOT_TAG_SHIFT + IDR_FREE)))
#define IDR_INIT_BASE(name, base) { \
.idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \
.idr_base = (base), \
.idr_next = 0, \
}
/**
* IDR_INIT() - Initialise an IDR.
* @name: Name of IDR.
*
* A freshly-initialised IDR contains no IDs.
*/
#define IDR_INIT(name) IDR_INIT_BASE(name, 0)
/**
* DEFINE_IDR() - Define a statically-allocated IDR.
* @name: Name of IDR.
*
* An IDR defined using this macro is ready for use with no additional
* initialisation required. It contains no IDs.
*/
#define DEFINE_IDR(name) struct idr name = IDR_INIT(name)
/**
* idr_get_cursor - Return the current position of the cyclic allocator
* @idr: idr handle
*
* The value returned is the value that will be next returned from
* idr_alloc_cyclic() if it is free (otherwise the search will start from
* this position).
*/
static inline unsigned int idr_get_cursor(const struct idr *idr)
{
return READ_ONCE(idr->idr_next);
}
/**
* idr_set_cursor - Set the current position of the cyclic allocator
* @idr: idr handle
* @val: new position
*
* The next call to idr_alloc_cyclic() will return @val if it is free
* (otherwise the search will start from this position).
*/
static inline void idr_set_cursor(struct idr *idr, unsigned int val)
{
WRITE_ONCE(idr->idr_next, val);
}
/**
* DOC: idr sync
* idr synchronization (stolen from radix-tree.h)
*
* idr_find() is able to be called locklessly, using RCU. The caller must
* ensure calls to this function are made within rcu_read_lock() regions.
* Other readers (lock-free or otherwise) and modifications may be running
* concurrently.
*
* It is still required that the caller manage the synchronization and
* lifetimes of the items. So if RCU lock-free lookups are used, typically
* this would mean that the items have their own locks, or are amenable to
* lock-free access; and that the items are freed by RCU (or only freed after
* having been deleted from the idr tree *and* a synchronize_rcu() grace
* period).
*/
#define idr_lock(idr) xa_lock(&(idr)->idr_rt)
#define idr_unlock(idr) xa_unlock(&(idr)->idr_rt)
#define idr_lock_bh(idr) xa_lock_bh(&(idr)->idr_rt)
#define idr_unlock_bh(idr) xa_unlock_bh(&(idr)->idr_rt)
#define idr_lock_irq(idr) xa_lock_irq(&(idr)->idr_rt)
#define idr_unlock_irq(idr) xa_unlock_irq(&(idr)->idr_rt)
#define idr_lock_irqsave(idr, flags) \
xa_lock_irqsave(&(idr)->idr_rt, flags)
#define idr_unlock_irqrestore(idr, flags) \
xa_unlock_irqrestore(&(idr)->idr_rt, flags)
void idr_preload(gfp_t gfp_mask);
int idr_alloc(struct idr *, void *ptr, int start, int end, gfp_t);
int __must_check idr_alloc_u32(struct idr *, void *ptr, u32 *id,
unsigned long max, gfp_t);
int idr_alloc_cyclic(struct idr *, void *ptr, int start, int end, gfp_t);
void *idr_remove(struct idr *, unsigned long id);
void *idr_find(const struct idr *, unsigned long id);
int idr_for_each(const struct idr *,
int (*fn)(int id, void *p, void *data), void *data);
void *idr_get_next(struct idr *, int *nextid);
void *idr_get_next_ul(struct idr *, unsigned long *nextid);
void *idr_replace(struct idr *, void *, unsigned long id);
void idr_destroy(struct idr *);
/**
* idr_init_base() - Initialise an IDR.
* @idr: IDR handle.
* @base: The base value for the IDR.
*
* This variation of idr_init() creates an IDR which will allocate IDs
* starting at %base.
*/
static inline void idr_init_base(struct idr *idr, int base)
{
INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER);
idr->idr_base = base;
idr->idr_next = 0;
}
/**
* idr_init() - Initialise an IDR.
* @idr: IDR handle.
*
* Initialise a dynamically allocated IDR. To initialise a
* statically allocated IDR, use DEFINE_IDR().
*/
static inline void idr_init(struct idr *idr)
{
idr_init_base(idr, 0);
}
/**
* idr_is_empty() - Are there any IDs allocated?
* @idr: IDR handle.
*
* Return: %true if any IDs have been allocated from this IDR.
*/
static inline bool idr_is_empty(const struct idr *idr)
{
return radix_tree_empty(&idr->idr_rt) &&
radix_tree_tagged(&idr->idr_rt, IDR_FREE);
}
/**
* idr_preload_end - end preload section started with idr_preload()
*
* Each idr_preload() should be matched with an invocation of this
* function. See idr_preload() for details.
*/
static inline void idr_preload_end(void)
{
local_unlock(&radix_tree_preloads.lock);
}
/**
* idr_for_each_entry() - Iterate over an IDR's elements of a given type.
* @idr: IDR handle.
* @entry: The type * to use as cursor
* @id: Entry ID.
*
* @entry and @id do not need to be initialized before the loop, and
* after normal termination @entry is left with the value NULL. This
* is convenient for a "not found" value.
*/
#define idr_for_each_entry(idr, entry, id) \
for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; id += 1U)
/**
* idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type.
* @idr: IDR handle.
* @entry: The type * to use as cursor.
* @tmp: A temporary placeholder for ID.
* @id: Entry ID.
*
* @entry and @id do not need to be initialized before the loop, and
* after normal termination @entry is left with the value NULL. This
* is convenient for a "not found" value.
*/
#define idr_for_each_entry_ul(idr, entry, tmp, id) \
for (tmp = 0, id = 0; \
tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \
tmp = id, ++id)
/**
* idr_for_each_entry_continue() - Continue iteration over an IDR's elements of a given type
* @idr: IDR handle.
* @entry: The type * to use as a cursor.
* @id: Entry ID.
*
* Continue to iterate over entries, continuing after the current position.
*/
#define idr_for_each_entry_continue(idr, entry, id) \
for ((entry) = idr_get_next((idr), &(id)); \
entry; \
++id, (entry) = idr_get_next((idr), &(id)))
/**
* idr_for_each_entry_continue_ul() - Continue iteration over an IDR's elements of a given type
* @idr: IDR handle.
* @entry: The type * to use as a cursor.
* @tmp: A temporary placeholder for ID.
* @id: Entry ID.
*
* Continue to iterate over entries, continuing after the current position.
*/
#define idr_for_each_entry_continue_ul(idr, entry, tmp, id) \
for (tmp = id; \
tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \
tmp = id, ++id)
/*
* IDA - ID Allocator, use when translation from id to pointer isn't necessary.
*/
#define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */
#define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long))
#define IDA_BITMAP_BITS (IDA_BITMAP_LONGS * sizeof(long) * 8)
struct ida_bitmap {
unsigned long bitmap[IDA_BITMAP_LONGS];
};
struct ida {
struct xarray xa;
};
#define IDA_INIT_FLAGS (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC)
#define IDA_INIT(name) { \
.xa = XARRAY_INIT(name, IDA_INIT_FLAGS) \
}
#define DEFINE_IDA(name) struct ida name = IDA_INIT(name)
int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t);
void ida_free(struct ida *, unsigned int id);
void ida_destroy(struct ida *ida);
/**
* ida_alloc() - Allocate an unused ID.
* @ida: IDA handle.
* @gfp: Memory allocation flags.
*
* Allocate an ID between 0 and %INT_MAX, inclusive.
*
* Context: Any context. It is safe to call this function without
* locking in your code.
* Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
* or %-ENOSPC if there are no free IDs.
*/
static inline int ida_alloc(struct ida *ida, gfp_t gfp)
{
return ida_alloc_range(ida, 0, ~0, gfp);
}
/**
* ida_alloc_min() - Allocate an unused ID.
* @ida: IDA handle.
* @min: Lowest ID to allocate.
* @gfp: Memory allocation flags.
*
* Allocate an ID between @min and %INT_MAX, inclusive.
*
* Context: Any context. It is safe to call this function without
* locking in your code.
* Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
* or %-ENOSPC if there are no free IDs.
*/
static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp)
{
return ida_alloc_range(ida, min, ~0, gfp);
}
/**
* ida_alloc_max() - Allocate an unused ID.
* @ida: IDA handle.
* @max: Highest ID to allocate.
* @gfp: Memory allocation flags.
*
* Allocate an ID between 0 and @max, inclusive.
*
* Context: Any context. It is safe to call this function without
* locking in your code.
* Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
* or %-ENOSPC if there are no free IDs.
*/
static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp)
{
return ida_alloc_range(ida, 0, max, gfp);
}
static inline void ida_init(struct ida *ida)
{
xa_init_flags(&ida->xa, IDA_INIT_FLAGS);
}
/*
* ida_simple_get() and ida_simple_remove() are deprecated. Use
* ida_alloc() and ida_free() instead respectively.
*/
#define ida_simple_get(ida, start, end, gfp) \
ida_alloc_range(ida, start, (end) - 1, gfp)
#define ida_simple_remove(ida, id) ida_free(ida, id)
static inline bool ida_is_empty(const struct ida *ida)
{
return xa_empty(&ida->xa);
}
#endif /* __IDR_H__ */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Variant of atomic_t specialized for reference counts.
*
* The interface matches the atomic_t interface (to aid in porting) but only
* provides the few functions one should use for reference counting.
*
* Saturation semantics
* ====================
*
* refcount_t differs from atomic_t in that the counter saturates at
* REFCOUNT_SATURATED and will not move once there. This avoids wrapping the
* counter and causing 'spurious' use-after-free issues. In order to avoid the
* cost associated with introducing cmpxchg() loops into all of the saturating
* operations, we temporarily allow the counter to take on an unchecked value
* and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow
* or overflow has occurred. Although this is racy when multiple threads
* access the refcount concurrently, by placing REFCOUNT_SATURATED roughly
* equidistant from 0 and INT_MAX we minimise the scope for error:
*
* INT_MAX REFCOUNT_SATURATED UINT_MAX
* 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff)
* +--------------------------------+----------------+----------------+
* <---------- bad value! ---------->
*
* (in a signed view of the world, the "bad value" range corresponds to
* a negative counter value).
*
* As an example, consider a refcount_inc() operation that causes the counter
* to overflow:
*
* int old = atomic_fetch_add_relaxed(r);
* // old is INT_MAX, refcount now INT_MIN (0x8000_0000)
* if (old < 0)
* atomic_set(r, REFCOUNT_SATURATED);
*
* If another thread also performs a refcount_inc() operation between the two
* atomic operations, then the count will continue to edge closer to 0. If it
* reaches a value of 1 before /any/ of the threads reset it to the saturated
* value, then a concurrent refcount_dec_and_test() may erroneously free the
* underlying object.
* Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
* 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
* With the current PID limit, if no batched refcounting operations are used and
* the attacker can't repeatedly trigger kernel oopses in the middle of refcount
* operations, this makes it impossible for a saturated refcount to leave the
* saturation range, even if it is possible for multiple uses of the same
* refcount to nest in the context of a single task:
*
* (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
* 0x40000000 / 0x400000 = 0x100 = 256
*
* If hundreds of references are added/removed with a single refcounting
* operation, it may potentially be possible to leave the saturation range; but
* given the precise timing details involved with the round-robin scheduling of
* each thread manipulating the refcount and the need to hit the race multiple
* times in succession, there doesn't appear to be a practical avenue of attack
* even if using refcount_add() operations with larger increments.
*
* Memory ordering
* ===============
*
* Memory ordering rules are slightly relaxed wrt regular atomic_t functions
* and provide only what is strictly required for refcounts.
*
* The increments are fully relaxed; these will not provide ordering. The
* rationale is that whatever is used to obtain the object we're increasing the
* reference count on will provide the ordering. For locked data structures,
* its the lock acquire, for RCU/lockless data structures its the dependent
* load.
*
* Do note that inc_not_zero() provides a control dependency which will order
* future stores against the inc, this ensures we'll never modify the object
* if we did not in fact acquire a reference.
*
* The decrements will provide release order, such that all the prior loads and
* stores will be issued before, it also provides a control dependency, which
* will order us against the subsequent free().
*
* The control dependency is against the load of the cmpxchg (ll/sc) that
* succeeded. This means the stores aren't fully ordered, but this is fine
* because the 1->0 transition indicates no concurrency.
*
* Note that the allocator is responsible for ordering things between free()
* and alloc().
*
* The decrements dec_and_test() and sub_and_test() also provide acquire
* ordering on success.
*
*/
#ifndef _LINUX_REFCOUNT_H
#define _LINUX_REFCOUNT_H
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/limits.h>
#include <linux/spinlock_types.h>
struct mutex;
/**
* typedef refcount_t - variant of atomic_t specialized for reference counts
* @refs: atomic_t counter field
*
* The counter saturates at REFCOUNT_SATURATED and will not move once
* there. This avoids wrapping the counter and causing 'spurious'
* use-after-free bugs.
*/
typedef struct refcount_struct {
atomic_t refs;
} refcount_t;
#define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), }
#define REFCOUNT_MAX INT_MAX
#define REFCOUNT_SATURATED (INT_MIN / 2)
enum refcount_saturation_type {
REFCOUNT_ADD_NOT_ZERO_OVF,
REFCOUNT_ADD_OVF,
REFCOUNT_ADD_UAF,
REFCOUNT_SUB_UAF,
REFCOUNT_DEC_LEAK,
};
void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t);
/**
* refcount_set - set a refcount's value
* @r: the refcount
* @n: value to which the refcount will be set
*/
static inline void refcount_set(refcount_t *r, int n)
{
atomic_set(&r->refs, n);
}
/**
* refcount_read - get a refcount's value
* @r: the refcount
*
* Return: the refcount's value
*/
static inline unsigned int refcount_read(const refcount_t *r)
{
return atomic_read(&r->refs);
}
static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
{
int old = refcount_read(r);
do {
if (!old)
break;
} while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i));
if (oldp)
*oldp = old;
if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old;
}
/**
* refcount_add_not_zero - add a value to a refcount unless it is 0
* @i: the value to add to the refcount
* @r: the refcount
*
* Will saturate at REFCOUNT_SATURATED and WARN.
*
* Provides no memory ordering, it is assumed the caller has guaranteed the
* object memory to be stable (RCU, etc.). It does provide a control dependency
* and thereby orders future stores. See the comment on top.
*
* Use of this function is not recommended for the normal reference counting
* use case in which references are taken and released one at a time. In these
* cases, refcount_inc(), or one of its variants, should instead be used to
* increment a reference count.
*
* Return: false if the passed refcount is 0, true otherwise
*/
static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r)
{
return __refcount_add_not_zero(i, r, NULL);
}
static inline void __refcount_add(int i, refcount_t *r, int *oldp)
{
int old = atomic_fetch_add_relaxed(i, &r->refs);
if (oldp)
*oldp = old;
if (unlikely(!old))
refcount_warn_saturate(r, REFCOUNT_ADD_UAF); else if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
}
/**
* refcount_add - add a value to a refcount
* @i: the value to add to the refcount
* @r: the refcount
*
* Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN.
*
* Provides no memory ordering, it is assumed the caller has guaranteed the
* object memory to be stable (RCU, etc.). It does provide a control dependency
* and thereby orders future stores. See the comment on top.
*
* Use of this function is not recommended for the normal reference counting
* use case in which references are taken and released one at a time. In these
* cases, refcount_inc(), or one of its variants, should instead be used to
* increment a reference count.
*/
static inline void refcount_add(int i, refcount_t *r)
{
__refcount_add(i, r, NULL);
}
static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp)
{
return __refcount_add_not_zero(1, r, oldp);
}
/**
* refcount_inc_not_zero - increment a refcount unless it is 0
* @r: the refcount to increment
*
* Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED
* and WARN.
*
* Provides no memory ordering, it is assumed the caller has guaranteed the
* object memory to be stable (RCU, etc.). It does provide a control dependency
* and thereby orders future stores. See the comment on top.
*
* Return: true if the increment was successful, false otherwise
*/
static inline __must_check bool refcount_inc_not_zero(refcount_t *r)
{
return __refcount_inc_not_zero(r, NULL);
}
static inline void __refcount_inc(refcount_t *r, int *oldp)
{
__refcount_add(1, r, oldp);
}
/**
* refcount_inc - increment a refcount
* @r: the refcount to increment
*
* Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN.
*
* Provides no memory ordering, it is assumed the caller already has a
* reference on the object.
*
* Will WARN if the refcount is 0, as this represents a possible use-after-free
* condition.
*/
static inline void refcount_inc(refcount_t *r)
{
__refcount_inc(r, NULL);
}
static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
{
int old = atomic_fetch_sub_release(i, &r->refs);
if (oldp)
*oldp = old;
if (old == i) {
smp_acquire__after_ctrl_dep();
return true;
}
if (unlikely(old < 0 || old - i < 0)) refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
return false;
}
/**
* refcount_sub_and_test - subtract from a refcount and test if it is 0
* @i: amount to subtract from the refcount
* @r: the refcount
*
* Similar to atomic_dec_and_test(), but it will WARN, return false and
* ultimately leak on underflow and will fail to decrement when saturated
* at REFCOUNT_SATURATED.
*
* Provides release memory ordering, such that prior loads and stores are done
* before, and provides an acquire ordering on success such that free()
* must come after.
*
* Use of this function is not recommended for the normal reference counting
* use case in which references are taken and released one at a time. In these
* cases, refcount_dec(), or one of its variants, should instead be used to
* decrement a reference count.
*
* Return: true if the resulting refcount is 0, false otherwise
*/
static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r)
{
return __refcount_sub_and_test(i, r, NULL);
}
static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp)
{
return __refcount_sub_and_test(1, r, oldp);
}
/**
* refcount_dec_and_test - decrement a refcount and test if it is 0
* @r: the refcount
*
* Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
* decrement when saturated at REFCOUNT_SATURATED.
*
* Provides release memory ordering, such that prior loads and stores are done
* before, and provides an acquire ordering on success such that free()
* must come after.
*
* Return: true if the resulting refcount is 0, false otherwise
*/
static inline __must_check bool refcount_dec_and_test(refcount_t *r)
{
return __refcount_dec_and_test(r, NULL);
}
static inline void __refcount_dec(refcount_t *r, int *oldp)
{
int old = atomic_fetch_sub_release(1, &r->refs);
if (oldp)
*oldp = old;
if (unlikely(old <= 1)) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK);
}
/**
* refcount_dec - decrement a refcount
* @r: the refcount
*
* Similar to atomic_dec(), it will WARN on underflow and fail to decrement
* when saturated at REFCOUNT_SATURATED.
*
* Provides release memory ordering, such that prior loads and stores are done
* before.
*/
static inline void refcount_dec(refcount_t *r)
{
__refcount_dec(r, NULL);
}
extern __must_check bool refcount_dec_if_one(refcount_t *r);
extern __must_check bool refcount_dec_not_one(refcount_t *r);
extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock);
extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock);
extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r,
spinlock_t *lock,
unsigned long *flags);
#endif /* _LINUX_REFCOUNT_H */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Credentials management - see Documentation/security/credentials.rst
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#ifndef _LINUX_CRED_H
#define _LINUX_CRED_H
#include <linux/capability.h>
#include <linux/init.h>
#include <linux/key.h>
#include <linux/atomic.h>
#include <linux/uidgid.h>
#include <linux/sched.h>
#include <linux/sched/user.h>
struct cred;
struct inode;
/*
* COW Supplementary groups list
*/
struct group_info {
atomic_t usage;
int ngroups;
kgid_t gid[];
} __randomize_layout;
/**
* get_group_info - Get a reference to a group info structure
* @group_info: The group info to reference
*
* This gets a reference to a set of supplementary groups.
*
* If the caller is accessing a task's credentials, they must hold the RCU read
* lock when reading.
*/
static inline struct group_info *get_group_info(struct group_info *gi)
{
atomic_inc(&gi->usage);
return gi;
}
/**
* put_group_info - Release a reference to a group info structure
* @group_info: The group info to release
*/
#define put_group_info(group_info) \
do { \
if (atomic_dec_and_test(&(group_info)->usage)) \
groups_free(group_info); \
} while (0)
#ifdef CONFIG_MULTIUSER
extern struct group_info *groups_alloc(int);
extern void groups_free(struct group_info *);
extern int in_group_p(kgid_t);
extern int in_egroup_p(kgid_t);
extern int groups_search(const struct group_info *, kgid_t);
extern int set_current_groups(struct group_info *);
extern void set_groups(struct cred *, struct group_info *);
extern bool may_setgroups(void);
extern void groups_sort(struct group_info *);
#else
static inline void groups_free(struct group_info *group_info)
{
}
static inline int in_group_p(kgid_t grp)
{
return 1;
}
static inline int in_egroup_p(kgid_t grp)
{
return 1;
}
static inline int groups_search(const struct group_info *group_info, kgid_t grp)
{
return 1;
}
#endif
/*
* The security context of a task
*
* The parts of the context break down into two categories:
*
* (1) The objective context of a task. These parts are used when some other
* task is attempting to affect this one.
*
* (2) The subjective context. These details are used when the task is acting
* upon another object, be that a file, a task, a key or whatever.
*
* Note that some members of this structure belong to both categories - the
* LSM security pointer for instance.
*
* A task has two security pointers. task->real_cred points to the objective
* context that defines that task's actual details. The objective part of this
* context is used whenever that task is acted upon.
*
* task->cred points to the subjective context that defines the details of how
* that task is going to act upon another object. This may be overridden
* temporarily to point to another security context, but normally points to the
* same context as task->real_cred.
*/
struct cred {
atomic_t usage;
#ifdef CONFIG_DEBUG_CREDENTIALS
atomic_t subscribers; /* number of processes subscribed */
void *put_addr;
unsigned magic;
#define CRED_MAGIC 0x43736564
#define CRED_MAGIC_DEAD 0x44656144
#endif
kuid_t uid; /* real UID of the task */
kgid_t gid; /* real GID of the task */
kuid_t suid; /* saved UID of the task */
kgid_t sgid; /* saved GID of the task */
kuid_t euid; /* effective UID of the task */
kgid_t egid; /* effective GID of the task */
kuid_t fsuid; /* UID for VFS ops */
kgid_t fsgid; /* GID for VFS ops */
unsigned securebits; /* SUID-less security management */
kernel_cap_t cap_inheritable; /* caps our children can inherit */
kernel_cap_t cap_permitted; /* caps we're permitted */
kernel_cap_t cap_effective; /* caps we can actually use */
kernel_cap_t cap_bset; /* capability bounding set */
kernel_cap_t cap_ambient; /* Ambient capability set */
#ifdef CONFIG_KEYS
unsigned char jit_keyring; /* default keyring to attach requested
* keys to */
struct key *session_keyring; /* keyring inherited over fork */
struct key *process_keyring; /* keyring private to this process */
struct key *thread_keyring; /* keyring private to this thread */
struct key *request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
void *security; /* LSM security */
#endif
struct user_struct *user; /* real user ID subscription */
struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
struct ucounts *ucounts;
struct group_info *group_info; /* supplementary groups for euid/fsgid */
/* RCU deletion */
union {
int non_rcu; /* Can we skip RCU deletion? */
struct rcu_head rcu; /* RCU deletion hook */
};
} __randomize_layout;
extern void __put_cred(struct cred *);
extern void exit_creds(struct task_struct *);
extern int copy_creds(struct task_struct *, unsigned long);
extern const struct cred *get_task_cred(struct task_struct *);
extern struct cred *cred_alloc_blank(void);
extern struct cred *prepare_creds(void);
extern struct cred *prepare_exec_creds(void);
extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *);
extern const struct cred *override_creds(const struct cred *);
extern void revert_creds(const struct cred *);
extern struct cred *prepare_kernel_cred(struct task_struct *);
extern int change_create_files_as(struct cred *, struct inode *);
extern int set_security_override(struct cred *, u32);
extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *);
extern int cred_fscmp(const struct cred *, const struct cred *);
extern void __init cred_init(void);
extern int set_cred_ucounts(struct cred *);
/*
* check for validity of credentials
*/
#ifdef CONFIG_DEBUG_CREDENTIALS
extern void __invalid_creds(const struct cred *, const char *, unsigned);
extern void __validate_process_creds(struct task_struct *,
const char *, unsigned);
extern bool creds_are_invalid(const struct cred *cred);
static inline void __validate_creds(const struct cred *cred,
const char *file, unsigned line)
{
if (unlikely(creds_are_invalid(cred)))
__invalid_creds(cred, file, line);
}
#define validate_creds(cred) \
do { \
__validate_creds((cred), __FILE__, __LINE__); \
} while(0)
#define validate_process_creds() \
do { \
__validate_process_creds(current, __FILE__, __LINE__); \
} while(0)
extern void validate_creds_for_do_exit(struct task_struct *);
#else
static inline void validate_creds(const struct cred *cred)
{
}
static inline void validate_creds_for_do_exit(struct task_struct *tsk)
{
}
static inline void validate_process_creds(void)
{
}
#endif
static inline bool cap_ambient_invariant_ok(const struct cred *cred)
{
return cap_issubset(cred->cap_ambient,
cap_intersect(cred->cap_permitted,
cred->cap_inheritable));
}
/**
* get_new_cred - Get a reference on a new set of credentials
* @cred: The new credentials to reference
*
* Get a reference on the specified set of new credentials. The caller must
* release the reference.
*/
static inline struct cred *get_new_cred(struct cred *cred)
{
atomic_inc(&cred->usage);
return cred;
}
/**
* get_cred - Get a reference on a set of credentials
* @cred: The credentials to reference
*
* Get a reference on the specified set of credentials. The caller must
* release the reference. If %NULL is passed, it is returned with no action.
*
* This is used to deal with a committed set of credentials. Although the
* pointer is const, this will temporarily discard the const and increment the
* usage count. The purpose of this is to attempt to catch at compile time the
* accidental alteration of a set of credentials that should be considered
* immutable.
*/
static inline const struct cred *get_cred(const struct cred *cred)
{
struct cred *nonconst_cred = (struct cred *) cred;
if (!cred)
return cred;
validate_creds(cred);
nonconst_cred->non_rcu = 0;
return get_new_cred(nonconst_cred);
}
static inline const struct cred *get_cred_rcu(const struct cred *cred)
{
struct cred *nonconst_cred = (struct cred *) cred;
if (!cred)
return NULL;
if (!atomic_inc_not_zero(&nonconst_cred->usage))
return NULL;
validate_creds(cred);
nonconst_cred->non_rcu = 0;
return cred;
}
/**
* put_cred - Release a reference to a set of credentials
* @cred: The credentials to release
*
* Release a reference to a set of credentials, deleting them when the last ref
* is released. If %NULL is passed, nothing is done.
*
* This takes a const pointer to a set of credentials because the credentials
* on task_struct are attached by const pointers to prevent accidental
* alteration of otherwise immutable credential sets.
*/
static inline void put_cred(const struct cred *_cred)
{
struct cred *cred = (struct cred *) _cred;
if (cred) {
validate_creds(cred);
if (atomic_dec_and_test(&(cred)->usage)) __put_cred(cred);
}
}
/**
* current_cred - Access the current task's subjective credentials
*
* Access the subjective credentials of the current task. RCU-safe,
* since nobody else can modify it.
*/
#define current_cred() \
rcu_dereference_protected(current->cred, 1)
/**
* current_real_cred - Access the current task's objective credentials
*
* Access the objective credentials of the current task. RCU-safe,
* since nobody else can modify it.
*/
#define current_real_cred() \
rcu_dereference_protected(current->real_cred, 1)
/**
* __task_cred - Access a task's objective credentials
* @task: The task to query
*
* Access the objective credentials of a task. The caller must hold the RCU
* readlock.
*
* The result of this function should not be passed directly to get_cred();
* rather get_task_cred() should be used instead.
*/
#define __task_cred(task) \
rcu_dereference((task)->real_cred)
/**
* get_current_cred - Get the current task's subjective credentials
*
* Get the subjective credentials of the current task, pinning them so that
* they can't go away. Accessing the current task's credentials directly is
* not permitted.
*/
#define get_current_cred() \
(get_cred(current_cred()))
/**
* get_current_user - Get the current task's user_struct
*
* Get the user record of the current task, pinning it so that it can't go
* away.
*/
#define get_current_user() \
({ \
struct user_struct *__u; \
const struct cred *__cred; \
__cred = current_cred(); \
__u = get_uid(__cred->user); \
__u; \
})
/**
* get_current_groups - Get the current task's supplementary group list
*
* Get the supplementary group list of the current task, pinning it so that it
* can't go away.
*/
#define get_current_groups() \
({ \
struct group_info *__groups; \
const struct cred *__cred; \
__cred = current_cred(); \
__groups = get_group_info(__cred->group_info); \
__groups; \
})
#define task_cred_xxx(task, xxx) \
({ \
__typeof__(((struct cred *)NULL)->xxx) ___val; \
rcu_read_lock(); \
___val = __task_cred((task))->xxx; \
rcu_read_unlock(); \
___val; \
})
#define task_uid(task) (task_cred_xxx((task), uid))
#define task_euid(task) (task_cred_xxx((task), euid))
#define task_ucounts(task) (task_cred_xxx((task), ucounts))
#define current_cred_xxx(xxx) \
({ \
current_cred()->xxx; \
})
#define current_uid() (current_cred_xxx(uid))
#define current_gid() (current_cred_xxx(gid))
#define current_euid() (current_cred_xxx(euid))
#define current_egid() (current_cred_xxx(egid))
#define current_suid() (current_cred_xxx(suid))
#define current_sgid() (current_cred_xxx(sgid))
#define current_fsuid() (current_cred_xxx(fsuid))
#define current_fsgid() (current_cred_xxx(fsgid))
#define current_cap() (current_cred_xxx(cap_effective))
#define current_user() (current_cred_xxx(user))
#define current_ucounts() (current_cred_xxx(ucounts))
extern struct user_namespace init_user_ns;
#ifdef CONFIG_USER_NS
#define current_user_ns() (current_cred_xxx(user_ns))
#else
static inline struct user_namespace *current_user_ns(void)
{
return &init_user_ns;
}
#endif
#define current_uid_gid(_uid, _gid) \
do { \
const struct cred *__cred; \
__cred = current_cred(); \
*(_uid) = __cred->uid; \
*(_gid) = __cred->gid; \
} while(0)
#define current_euid_egid(_euid, _egid) \
do { \
const struct cred *__cred; \
__cred = current_cred(); \
*(_euid) = __cred->euid; \
*(_egid) = __cred->egid; \
} while(0)
#define current_fsuid_fsgid(_fsuid, _fsgid) \
do { \
const struct cred *__cred; \
__cred = current_cred(); \
*(_fsuid) = __cred->fsuid; \
*(_fsgid) = __cred->fsgid; \
} while(0)
#endif /* _LINUX_CRED_H */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* include/linux/eventpoll.h ( Efficient event polling implementation )
* Copyright (C) 2001,...,2006 Davide Libenzi
*
* Davide Libenzi <davidel@xmailserver.org>
*/
#ifndef _LINUX_EVENTPOLL_H
#define _LINUX_EVENTPOLL_H
#include <uapi/linux/eventpoll.h>
#include <uapi/linux/kcmp.h>
/* Forward declarations to avoid compiler errors */
struct file;
#ifdef CONFIG_EPOLL
#ifdef CONFIG_KCMP
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
#endif
/* Used to release the epoll bits inside the "struct file" */
void eventpoll_release_file(struct file *file);
/*
* This is called from inside fs/file_table.c:__fput() to unlink files
* from the eventpoll interface. We need to have this facility to cleanup
* correctly files that are closed without being removed from the eventpoll
* interface.
*/
static inline void eventpoll_release(struct file *file)
{
/*
* Fast check to avoid the get/release of the semaphore. Since
* we're doing this outside the semaphore lock, it might return
* false negatives, but we don't care. It'll help in 99.99% of cases
* to avoid the semaphore lock. False positives simply cannot happen
* because the file in on the way to be removed and nobody ( but
* eventpoll ) has still a reference to this file.
*/
if (likely(!file->f_ep))
return;
/*
* The file is being closed while it is still linked to an epoll
* descriptor. We need to handle this by correctly unlinking it
* from its containers.
*/
eventpoll_release_file(file);
}
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock);
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
return op != EPOLL_CTL_DEL;
}
#else
static inline void eventpoll_release(struct file *file) {}
#endif
#if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT)
/* ARM OABI has an incompatible struct layout and needs a special handler */
extern struct epoll_event __user *
epoll_put_uevent(__poll_t revents, __u64 data,
struct epoll_event __user *uevent);
#else
static inline struct epoll_event __user *
epoll_put_uevent(__poll_t revents, __u64 data,
struct epoll_event __user *uevent)
{
if (__put_user(revents, &uevent->events) ||
__put_user(data, &uevent->data))
return NULL;
return uevent+1;
}
#endif
#endif /* #ifndef _LINUX_EVENTPOLL_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_TYPES_H
#define _LINUX_MM_TYPES_H
#include <linux/mm_types_task.h>
#include <linux/auxvec.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/uprobes.h>
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
#include <asm/mmu.h>
#ifndef AT_VECTOR_SIZE_ARCH
#define AT_VECTOR_SIZE_ARCH 0
#endif
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
#define INIT_PASID 0
struct address_space;
struct mem_cgroup;
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
* moment. Note that we have no way to track which tasks are using
* a page, though if it is a pagecache page, rmap structures can tell us
* who is mapping it.
*
* If you allocate the page using alloc_pages(), you can use some of the
* space in struct page for your own purposes. The five words in the main
* union are available, except for bit 0 of the first word which must be
* kept clear. Many users use this word to store a pointer to an object
* which is guaranteed to be aligned. If you use the same storage as
* page->mapping, you must restore it to NULL before freeing the page.
*
* If your page will not be mapped to userspace, you can also use the four
* bytes in the mapcount union, but you must call page_mapcount_reset()
* before freeing it.
*
* If you want to use the refcount field, it must be used in such a way
* that other CPUs temporarily incrementing and then decrementing the
* refcount does not cause problems. On receiving the page from
* alloc_pages(), the refcount will be positive.
*
* If you allocate pages of order > 0, you can use some of the fields
* in each subpage, but you may need to restore some of their values
* afterwards.
*
* SLUB uses cmpxchg_double() to atomically update its freelist and
* counters. That requires that freelist & counters be adjacent and
* double-word aligned. We align all struct pages to double-word
* boundaries, and ensure that 'freelist' is aligned within the
* struct.
*/
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
#define _struct_page_alignment __aligned(2 * sizeof(unsigned long))
#else
#define _struct_page_alignment
#endif
struct page {
unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
/*
* Five words (20/40 bytes) are available in this union.
* WARNING: bit 0 of the first word is used for PageTail(). That
* means the other users of this union MUST NOT use the bit to
* avoid collision and false-positive PageTail().
*/
union {
struct { /* Page cache and anonymous pages */
/**
* @lru: Pageout list, eg. active_list protected by
* lruvec->lru_lock. Sometimes used as a generic list
* by the page owner.
*/
struct list_head lru;
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
pgoff_t index; /* Our offset within mapping. */
/**
* @private: Mapping-private opaque data.
* Usually used for buffer_heads if PagePrivate.
* Used for swp_entry_t if PageSwapCache.
* Indicates order in the buddy system if PageBuddy.
*/
unsigned long private;
};
struct { /* page_pool used by netstack */
/**
* @pp_magic: magic value to avoid recycling non
* page_pool allocated pages.
*/
unsigned long pp_magic;
struct page_pool *pp;
unsigned long _pp_mapping_pad;
unsigned long dma_addr;
union {
/**
* dma_addr_upper: might require a 64-bit
* value on 32-bit architectures.
*/
unsigned long dma_addr_upper;
/**
* For frag page support, not supported in
* 32-bit architectures with 64-bit DMA.
*/
atomic_long_t pp_frag_count;
};
};
struct { /* slab, slob and slub */
union {
struct list_head slab_list;
struct { /* Partial pages */
struct page *next;
#ifdef CONFIG_64BIT
int pages; /* Nr of pages left */
int pobjects; /* Approximate count */
#else
short int pages;
short int pobjects;
#endif
};
};
struct kmem_cache *slab_cache; /* not slob */
/* Double-word boundary */
void *freelist; /* first free object */
union {
void *s_mem; /* slab: first object */
unsigned long counters; /* SLUB */
struct { /* SLUB */
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
};
};
struct { /* Tail pages of compound page */
unsigned long compound_head; /* Bit zero is set */
/* First tail page only */
unsigned char compound_dtor;
unsigned char compound_order;
atomic_t compound_mapcount;
unsigned int compound_nr; /* 1 << compound_order */
};
struct { /* Second tail page of compound page */
unsigned long _compound_pad_1; /* compound_head */
atomic_t hpage_pinned_refcount;
/* For both global and memcg */
struct list_head deferred_list;
};
struct { /* Page table pages */
unsigned long _pt_pad_1; /* compound_head */
pgtable_t pmd_huge_pte; /* protected by page->ptl */
unsigned long _pt_pad_2; /* mapping */
union {
struct mm_struct *pt_mm; /* x86 pgds only */
atomic_t pt_frag_refcount; /* powerpc */
};
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
#else
spinlock_t ptl;
#endif
};
struct { /* ZONE_DEVICE pages */
/** @pgmap: Points to the hosting device page map. */
struct dev_pagemap *pgmap;
void *zone_device_data;
/*
* ZONE_DEVICE private pages are counted as being
* mapped so the next 3 words hold the mapping, index,
* and private fields from the source anonymous or
* page cache page while the page is migrated to device
* private memory.
* ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
* use the mapping, index, and private fields when
* pmem backed DAX files are mapped.
*/
};
/** @rcu_head: You can use this to free a page by RCU. */
struct rcu_head rcu_head;
};
union { /* This union is 4 bytes in size. */
/*
* If the page can be mapped to userspace, encodes the number
* of times this page is referenced by a page table.
*/
atomic_t _mapcount;
/*
* If the page is neither PageSlab nor mappable to userspace,
* the value stored here may help determine what this page
* is used for. See page-flags.h for a list of page types
* which are currently stored here.
*/
unsigned int page_type;
unsigned int active; /* SLAB */
int units; /* SLOB */
};
/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
atomic_t _refcount;
#ifdef CONFIG_MEMCG
unsigned long memcg_data;
#endif
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
} _struct_page_alignment;
static inline atomic_t *compound_mapcount_ptr(struct page *page)
{
return &page[1].compound_mapcount;
}
static inline atomic_t *compound_pincount_ptr(struct page *page)
{
return &page[2].hpage_pinned_refcount;
}
/*
* Used for sizing the vmemmap region on some architectures
*/
#define STRUCT_PAGE_MAX_SHIFT (order_base_2(sizeof(struct page)))
#define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK)
#define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE)
#define page_private(page) ((page)->private)
static inline void set_page_private(struct page *page, unsigned long private)
{
page->private = private;
}
struct page_frag_cache {
void * va;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
__u16 offset;
__u16 size;
#else
__u32 offset;
#endif
/* we maintain a pagecount bias, so that we dont dirty cache line
* containing page->_refcount every time we allocate a fragment.
*/
unsigned int pagecnt_bias;
bool pfmemalloc;
};
typedef unsigned long vm_flags_t;
/*
* A region containing a mapping of a non-memory backed file under NOMMU
* conditions. These are held in a global tree and are pinned by the VMAs that
* map parts of them.
*/
struct vm_region {
struct rb_node vm_rb; /* link in global region tree */
vm_flags_t vm_flags; /* VMA vm_flags */
unsigned long vm_start; /* start address of region */
unsigned long vm_end; /* region initialised to here */
unsigned long vm_top; /* region allocated to here */
unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */
struct file *vm_file; /* the backing file or NULL */
int vm_usage; /* region usage count (access under nommu_region_sem) */
bool vm_icache_flushed : 1; /* true if the icache has been flushed for
* this region */
};
#ifdef CONFIG_USERFAULTFD
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
struct vm_userfaultfd_ctx {
struct userfaultfd_ctx *ctx;
};
#else /* CONFIG_USERFAULTFD */
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
struct vm_userfaultfd_ctx {};
#endif /* CONFIG_USERFAULTFD */
/*
* This struct describes a virtual memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
* space that has a special rule for the page-fault handlers (ie a shared
* library, the executable area etc).
*/
struct vm_area_struct {
/* The first cache line has the info for VMA tree walking. */
unsigned long vm_start; /* Our start address within vm_mm. */
unsigned long vm_end; /* The first byte after our end address
within vm_mm. */
/* linked list of VM areas per task, sorted by address */
struct vm_area_struct *vm_next, *vm_prev;
struct rb_node vm_rb;
/*
* Largest free memory gap in bytes to the left of this VMA.
* Either between this VMA and vma->vm_prev, or between one of the
* VMAs below us in the VMA rbtree and its ->vm_prev. This helps
* get_unmapped_area find a free area of the right size.
*/
unsigned long rb_subtree_gap;
/* Second cache line starts here. */
struct mm_struct *vm_mm; /* The address space we belong to. */
/*
* Access permissions of this VMA.
* See vmf_insert_mixed_prot() for discussion.
*/
pgprot_t vm_page_prot;
unsigned long vm_flags; /* Flags, see mm.h. */
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
*/
struct {
struct rb_node rb;
unsigned long rb_subtree_last;
} shared;
/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
* list, after a COW of one of the file pages. A MAP_SHARED vma
* can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
* or brk vma (with NULL file) can only be in an anon_vma list.
*/
struct list_head anon_vma_chain; /* Serialized by mmap_lock &
* page_table_lock */
struct anon_vma *anon_vma; /* Serialized by page_table_lock */
/* Function pointers to deal with this struct. */
const struct vm_operations_struct *vm_ops;
/* Information about our backing store: */
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
units */
struct file * vm_file; /* File we map to (can be NULL). */
void * vm_private_data; /* was vm_pte (shared mem) */
#ifdef CONFIG_SWAP
atomic_long_t swap_readahead_info;
#endif
#ifndef CONFIG_MMU
struct vm_region *vm_region; /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
struct core_thread {
struct task_struct *task;
struct core_thread *next;
};
struct core_state {
atomic_t nr_threads;
struct core_thread dumper;
struct completion startup;
};
struct kioctx_table;
struct mm_struct {
struct {
struct vm_area_struct *mmap; /* list of VMAs */
struct rb_root mm_rb;
u64 vmacache_seqnum; /* per-thread vmacache */
#ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
#endif
unsigned long mmap_base; /* base of mmap area */
unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
/* Base addresses for compatible mmap() */
unsigned long mmap_compat_base;
unsigned long mmap_compat_legacy_base;
#endif
unsigned long task_size; /* size of task vm space */
unsigned long highest_vm_end; /* highest vma end address */
pgd_t * pgd;
#ifdef CONFIG_MEMBARRIER
/**
* @membarrier_state: Flags controlling membarrier behavior.
*
* This field is close to @pgd to hopefully fit in the same
* cache-line, which needs to be touched by switch_mm().
*/
atomic_t membarrier_state;
#endif
/**
* @mm_users: The number of users including userspace.
*
* Use mmget()/mmget_not_zero()/mmput() to modify. When this
* drops to 0 (i.e. when the task exits and there are no other
* temporary reference holders), we also release a reference on
* @mm_count (which may then free the &struct mm_struct if
* @mm_count also drops to 0).
*/
atomic_t mm_users;
/**
* @mm_count: The number of references to &struct mm_struct
* (@mm_users count as 1).
*
* Use mmgrab()/mmdrop() to modify. When this drops to 0, the
* &struct mm_struct is freed.
*/
atomic_t mm_count;
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* PTE page table pages */
#endif
int map_count; /* number of VMAs */
spinlock_t page_table_lock; /* Protects page tables and some
* counters
*/
/*
* With some kernel config, the current mmap_lock's offset
* inside 'mm_struct' is at 0x120, which is very optimal, as
* its two hot fields 'count' and 'owner' sit in 2 different
* cachelines, and when mmap_lock is highly contended, both
* of the 2 fields will be accessed frequently, current layout
* will help to reduce cache bouncing.
*
* So please be careful with adding new fields before
* mmap_lock, which can easily push the 2 fields into one
* cacheline.
*/
struct rw_semaphore mmap_lock;
struct list_head mmlist; /* List of maybe swapped mm's. These
* are globally strung together off
* init_mm.mmlist, and are protected
* by mmlist_lock
*/
unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
unsigned long total_vm; /* Total pages mapped */
unsigned long locked_vm; /* Pages that have PG_mlocked set */
atomic64_t pinned_vm; /* Refcount permanently increased */
unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
unsigned long stack_vm; /* VM_STACK */
unsigned long def_flags;
/**
* @write_protect_seq: Locked when any thread is write
* protecting pages mapped by this mm to enforce a later COW,
* for instance during page table copying for fork().
*/
seqcount_t write_protect_seq;
spinlock_t arg_lock; /* protect the below fields */
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
unsigned long arg_start, arg_end, env_start, env_end;
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
/*
* Special counters, in some configurations protected by the
* page_table_lock, in other configurations by being atomic.
*/
struct mm_rss_stat rss_stat;
struct linux_binfmt *binfmt;
/* Architecture-specific MM context */
mm_context_t context;
unsigned long flags; /* Must use atomic bitops to access */
struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct kioctx_table __rcu *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
/*
* "owner" points to a task that is regarded as the canonical
* user/owner of this mm. All of the following must be true in
* order for it to be changed:
*
* current == mm->owner
* current->mm != mm
* new_owner->mm == mm
* new_owner->alloc_lock is held
*/
struct task_struct __rcu *owner;
#endif
struct user_namespace *user_ns;
/* store ref to file /proc/<pid>/exe symlink points to */
struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_subscriptions *notifier_subscriptions;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
/*
* numa_next_scan is the next time that the PTEs will be marked
* pte_numa. NUMA hinting faults will gather statistics and
* migrate pages to new nodes if necessary.
*/
unsigned long numa_next_scan;
/* Restart point for scanning and setting pte_numa */
unsigned long numa_scan_offset;
/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
#endif
/*
* An operation with batched TLB flushing is going on. Anything
* that can move process memory needs to flush the TLB when
* moving a PROT_NONE or PROT_NUMA mapped page.
*/
atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/* See flush_tlb_batched_pending() */
bool tlb_flush_batched;
#endif
struct uprobes_state uprobes_state;
#ifdef CONFIG_HUGETLB_PAGE
atomic_long_t hugetlb_usage;
#endif
struct work_struct async_put_work;
#ifdef CONFIG_IOMMU_SUPPORT
u32 pasid;
#endif
} __randomize_layout;
/*
* The mm_cpumask needs to be at the end of mm_struct, because it
* is dynamically sized based on nr_cpu_ids.
*/
unsigned long cpu_bitmap[];
};
extern struct mm_struct init_mm;
/* Pointer magic because the dynamic array size confuses some compilers. */
static inline void mm_init_cpumask(struct mm_struct *mm)
{
unsigned long cpu_bitmap = (unsigned long)mm;
cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
cpumask_clear((struct cpumask *)cpu_bitmap);
}
/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
{
return (struct cpumask *)&mm->cpu_bitmap;
}
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_finish_mmu(struct mmu_gather *tlb);
static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
atomic_set(&mm->tlb_flush_pending, 0);
}
static inline void inc_tlb_flush_pending(struct mm_struct *mm)
{
atomic_inc(&mm->tlb_flush_pending);
/*
* The only time this value is relevant is when there are indeed pages
* to flush. And we'll only flush pages after changing them, which
* requires the PTL.
*
* So the ordering here is:
*
* atomic_inc(&mm->tlb_flush_pending);
* spin_lock(&ptl);
* ...
* set_pte_at();
* spin_unlock(&ptl);
*
* spin_lock(&ptl)
* mm_tlb_flush_pending();
* ....
* spin_unlock(&ptl);
*
* flush_tlb_range();
* atomic_dec(&mm->tlb_flush_pending);
*
* Where the increment if constrained by the PTL unlock, it thus
* ensures that the increment is visible if the PTE modification is
* visible. After all, if there is no PTE modification, nobody cares
* about TLB flushes either.
*
* This very much relies on users (mm_tlb_flush_pending() and
* mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
* therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
* locks (PPC) the unlock of one doesn't order against the lock of
* another PTL.
*
* The decrement is ordered by the flush_tlb_range(), such that
* mm_tlb_flush_pending() will not return false unless all flushes have
* completed.
*/
}
static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
/*
* See inc_tlb_flush_pending().
*
* This cannot be smp_mb__before_atomic() because smp_mb() simply does
* not order against TLB invalidate completion, which is what we need.
*
* Therefore we must rely on tlb_flush_*() to guarantee order.
*/
atomic_dec(&mm->tlb_flush_pending);
}
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
/*
* Must be called after having acquired the PTL; orders against that
* PTLs release and therefore ensures that if we observe the modified
* PTE we must also observe the increment from inc_tlb_flush_pending().
*
* That is, it only guarantees to return true if there is a flush
* pending for _this_ PTL.
*/
return atomic_read(&mm->tlb_flush_pending);
}
static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
{
/*
* Similar to mm_tlb_flush_pending(), we must have acquired the PTL
* for which there is a TLB flush pending in order to guarantee
* we've seen both that PTE modification and the increment.
*
* (no requirement on actually still holding the PTL, that is irrelevant)
*/
return atomic_read(&mm->tlb_flush_pending) > 1;
}
struct vm_fault;
/**
* typedef vm_fault_t - Return type for page fault handlers.
*
* Page fault handlers return a bitmask of %VM_FAULT values.
*/
typedef __bitwise unsigned int vm_fault_t;
/**
* enum vm_fault_reason - Page fault handlers return a bitmask of
* these values to tell the core VM what happened when handling the
* fault. Used to decide whether a process gets delivered SIGBUS or
* just gets major/minor fault counters bumped up.
*
* @VM_FAULT_OOM: Out Of Memory
* @VM_FAULT_SIGBUS: Bad access
* @VM_FAULT_MAJOR: Page read from storage
* @VM_FAULT_WRITE: Special case for get_user_pages
* @VM_FAULT_HWPOISON: Hit poisoned small page
* @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded
* in upper bits
* @VM_FAULT_SIGSEGV: segmentation fault
* @VM_FAULT_NOPAGE: ->fault installed the pte, not return page
* @VM_FAULT_LOCKED: ->fault locked the returned page
* @VM_FAULT_RETRY: ->fault blocked, must retry
* @VM_FAULT_FALLBACK: huge page fault failed, fall back to small
* @VM_FAULT_DONE_COW: ->fault has fully handled COW
* @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs
* fsync() to complete (for synchronous page faults
* in DAX)
* @VM_FAULT_HINDEX_MASK: mask HINDEX value
*
*/
enum vm_fault_reason {
VM_FAULT_OOM = (__force vm_fault_t)0x000001,
VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002,
VM_FAULT_MAJOR = (__force vm_fault_t)0x000004,
VM_FAULT_WRITE = (__force vm_fault_t)0x000008,
VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010,
VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020,
VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040,
VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100,
VM_FAULT_LOCKED = (__force vm_fault_t)0x000200,
VM_FAULT_RETRY = (__force vm_fault_t)0x000400,
VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800,
VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
};
/* Encode hstate index for a hwpoisoned large page */
#define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16))
#define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf)
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | \
VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON | \
VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK)
#define VM_FAULT_RESULT_TRACE \
{ VM_FAULT_OOM, "OOM" }, \
{ VM_FAULT_SIGBUS, "SIGBUS" }, \
{ VM_FAULT_MAJOR, "MAJOR" }, \
{ VM_FAULT_WRITE, "WRITE" }, \
{ VM_FAULT_HWPOISON, "HWPOISON" }, \
{ VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \
{ VM_FAULT_SIGSEGV, "SIGSEGV" }, \
{ VM_FAULT_NOPAGE, "NOPAGE" }, \
{ VM_FAULT_LOCKED, "LOCKED" }, \
{ VM_FAULT_RETRY, "RETRY" }, \
{ VM_FAULT_FALLBACK, "FALLBACK" }, \
{ VM_FAULT_DONE_COW, "DONE_COW" }, \
{ VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
struct vm_special_mapping {
const char *name; /* The name, e.g. "[vdso]". */
/*
* If .fault is not provided, this points to a
* NULL-terminated array of pages that back the special mapping.
*
* This must not be NULL unless .fault is provided.
*/
struct page **pages;
/*
* If non-NULL, then this is called to resolve page faults
* on the special mapping. If used, .pages is not checked.
*/
vm_fault_t (*fault)(const struct vm_special_mapping *sm,
struct vm_area_struct *vma,
struct vm_fault *vmf);
int (*mremap)(const struct vm_special_mapping *sm,
struct vm_area_struct *new_vma);
};
enum tlb_flush_reason {
TLB_FLUSH_ON_TASK_SWITCH,
TLB_REMOTE_SHOOTDOWN,
TLB_LOCAL_SHOOTDOWN,
TLB_LOCAL_MM_SHOOTDOWN,
TLB_REMOTE_SEND_IPI,
NR_TLB_FLUSH_REASONS,
};
/*
* A swap entry has to fit into a "unsigned long", as the entry is hidden
* in the "index" field of the swapper address space.
*/
typedef struct {
unsigned long val;
} swp_entry_t;
#endif /* _LINUX_MM_TYPES_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_BITOPS_FIND_H_
#define _ASM_GENERIC_BITOPS_FIND_H_
extern unsigned long _find_next_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long nbits,
unsigned long start, unsigned long invert, unsigned long le);
extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
#ifndef find_next_bit
/**
* find_next_bit - find the next set bit in a memory region
* @addr: The address to base the search on
* @offset: The bitnumber to start searching at
* @size: The bitmap size in bits
*
* Returns the bit number for the next set bit
* If no bits are set, returns @size.
*/
static inline
unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
unsigned long offset)
{
if (small_const_nbits(size)) {
unsigned long val;
if (unlikely(offset >= size))
return size;
val = *addr & GENMASK(size - 1, offset);
return val ? __ffs(val) : size;
}
return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
}
#endif
#ifndef find_next_and_bit
/**
* find_next_and_bit - find the next set bit in both memory regions
* @addr1: The first address to base the search on
* @addr2: The second address to base the search on
* @offset: The bitnumber to start searching at
* @size: The bitmap size in bits
*
* Returns the bit number for the next set bit
* If no bits are set, returns @size.
*/
static inline
unsigned long find_next_and_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long size,
unsigned long offset)
{
if (small_const_nbits(size)) {
unsigned long val;
if (unlikely(offset >= size))
return size;
val = *addr1 & *addr2 & GENMASK(size - 1, offset);
return val ? __ffs(val) : size;
}
return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
}
#endif
#ifndef find_next_zero_bit
/**
* find_next_zero_bit - find the next cleared bit in a memory region
* @addr: The address to base the search on
* @offset: The bitnumber to start searching at
* @size: The bitmap size in bits
*
* Returns the bit number of the next zero bit
* If no bits are zero, returns @size.
*/
static inline
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
unsigned long offset)
{
if (small_const_nbits(size)) {
unsigned long val;
if (unlikely(offset >= size))
return size;
val = *addr | ~GENMASK(size - 1, offset);
return val == ~0UL ? size : ffz(val);
}
return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
}
#endif
#ifdef CONFIG_GENERIC_FIND_FIRST_BIT
#ifndef find_first_bit
/**
* find_first_bit - find the first set bit in a memory region
* @addr: The address to start the search at
* @size: The maximum number of bits to search
*
* Returns the bit number of the first set bit.
* If no bits are set, returns @size.
*/
static inline
unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
{
if (small_const_nbits(size)) {
unsigned long val = *addr & GENMASK(size - 1, 0);
return val ? __ffs(val) : size;
}
return _find_first_bit(addr, size);
}
#endif
#ifndef find_first_zero_bit
/**
* find_first_zero_bit - find the first cleared bit in a memory region
* @addr: The address to start the search at
* @size: The maximum number of bits to search
*
* Returns the bit number of the first cleared bit.
* If no bits are zero, returns @size.
*/
static inline
unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
{
if (small_const_nbits(size)) {
unsigned long val = *addr | ~GENMASK(size - 1, 0);
return val == ~0UL ? size : ffz(val);
}
return _find_first_zero_bit(addr, size);
}
#endif
#else /* CONFIG_GENERIC_FIND_FIRST_BIT */
#ifndef find_first_bit
#define find_first_bit(addr, size) find_next_bit((addr), (size), 0)
#endif
#ifndef find_first_zero_bit
#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0)
#endif
#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
#ifndef find_last_bit
/**
* find_last_bit - find the last set bit in a memory region
* @addr: The address to start the search at
* @size: The number of bits to search
*
* Returns the bit number of the last set bit, or size.
*/
static inline
unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
{
if (small_const_nbits(size)) {
unsigned long val = *addr & GENMASK(size - 1, 0);
return val ? __fls(val) : size;
}
return _find_last_bit(addr, size);
}
#endif
/**
* find_next_clump8 - find next 8-bit clump with set bits in a memory region
* @clump: location to store copy of found clump
* @addr: address to base the search on
* @size: bitmap size in number of bits
* @offset: bit offset at which to start searching
*
* Returns the bit offset for the next set clump; the found clump value is
* copied to the location pointed by @clump. If no bits are set, returns @size.
*/
extern unsigned long find_next_clump8(unsigned long *clump,
const unsigned long *addr,
unsigned long size, unsigned long offset);
#define find_first_clump8(clump, bits, size) \
find_next_clump8((clump), (bits), (size), 0)
#endif /*_ASM_GENERIC_BITOPS_FIND_H_ */
// SPDX-License-Identifier: GPL-2.0
/*
* Functions related to io context handling
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/sched/task.h>
#include "blk.h"
/*
* For io context allocations
*/
static struct kmem_cache *iocontext_cachep;
/**
* get_io_context - increment reference count to io_context
* @ioc: io_context to get
*
* Increment reference count to @ioc.
*/
void get_io_context(struct io_context *ioc)
{
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
atomic_long_inc(&ioc->refcount);
}
static void icq_free_icq_rcu(struct rcu_head *head)
{
struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
kmem_cache_free(icq->__rcu_icq_cache, icq);
}
/*
* Exit an icq. Called with ioc locked for blk-mq, and with both ioc
* and queue locked for legacy.
*/
static void ioc_exit_icq(struct io_cq *icq)
{
struct elevator_type *et = icq->q->elevator->type;
if (icq->flags & ICQ_EXITED)
return;
if (et->ops.exit_icq)
et->ops.exit_icq(icq);
icq->flags |= ICQ_EXITED;
}
/*
* Release an icq. Called with ioc locked for blk-mq, and with both ioc
* and queue locked for legacy.
*/
static void ioc_destroy_icq(struct io_cq *icq)
{
struct io_context *ioc = icq->ioc;
struct request_queue *q = icq->q;
struct elevator_type *et = q->elevator->type;
lockdep_assert_held(&ioc->lock);
radix_tree_delete(&ioc->icq_tree, icq->q->id);
hlist_del_init(&icq->ioc_node);
list_del_init(&icq->q_node);
/*
* Both setting lookup hint to and clearing it from @icq are done
* under queue_lock. If it's not pointing to @icq now, it never
* will. Hint assignment itself can race safely.
*/
if (rcu_access_pointer(ioc->icq_hint) == icq)
rcu_assign_pointer(ioc->icq_hint, NULL);
ioc_exit_icq(icq);
/*
* @icq->q might have gone away by the time RCU callback runs
* making it impossible to determine icq_cache. Record it in @icq.
*/
icq->__rcu_icq_cache = et->icq_cache;
icq->flags |= ICQ_DESTROYED;
call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
}
/*
* Slow path for ioc release in put_io_context(). Performs double-lock
* dancing to unlink all icq's and then frees ioc.
*/
static void ioc_release_fn(struct work_struct *work)
{
struct io_context *ioc = container_of(work, struct io_context,
release_work);
spin_lock_irq(&ioc->lock);
while (!hlist_empty(&ioc->icq_list)) {
struct io_cq *icq = hlist_entry(ioc->icq_list.first,
struct io_cq, ioc_node);
struct request_queue *q = icq->q;
if (spin_trylock(&q->queue_lock)) {
ioc_destroy_icq(icq);
spin_unlock(&q->queue_lock);
} else {
/* Make sure q and icq cannot be freed. */
rcu_read_lock();
/* Re-acquire the locks in the correct order. */
spin_unlock(&ioc->lock);
spin_lock(&q->queue_lock);
spin_lock(&ioc->lock);
/*
* The icq may have been destroyed when the ioc lock
* was released.
*/
if (!(icq->flags & ICQ_DESTROYED))
ioc_destroy_icq(icq);
spin_unlock(&q->queue_lock);
rcu_read_unlock();
}
}
spin_unlock_irq(&ioc->lock);
kmem_cache_free(iocontext_cachep, ioc);
}
/**
* put_io_context - put a reference of io_context
* @ioc: io_context to put
*
* Decrement reference count of @ioc and release it if the count reaches
* zero.
*/
void put_io_context(struct io_context *ioc)
{
unsigned long flags;
bool free_ioc = false;
if (ioc == NULL)
return;
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
/*
* Releasing ioc requires reverse order double locking and we may
* already be holding a queue_lock. Do it asynchronously from wq.
*/
if (atomic_long_dec_and_test(&ioc->refcount)) {
spin_lock_irqsave(&ioc->lock, flags);
if (!hlist_empty(&ioc->icq_list))
queue_work(system_power_efficient_wq,
&ioc->release_work);
else
free_ioc = true;
spin_unlock_irqrestore(&ioc->lock, flags);
}
if (free_ioc)
kmem_cache_free(iocontext_cachep, ioc);
}
/**
* put_io_context_active - put active reference on ioc
* @ioc: ioc of interest
*
* Undo get_io_context_active(). If active reference reaches zero after
* put, @ioc can never issue further IOs and ioscheds are notified.
*/
void put_io_context_active(struct io_context *ioc)
{
struct io_cq *icq;
if (!atomic_dec_and_test(&ioc->active_ref)) {
put_io_context(ioc);
return;
}
spin_lock_irq(&ioc->lock);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
if (icq->flags & ICQ_EXITED)
continue;
ioc_exit_icq(icq);
}
spin_unlock_irq(&ioc->lock);
put_io_context(ioc);
}
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);
atomic_dec(&ioc->nr_tasks);
put_io_context_active(ioc);
}
static void __ioc_clear_queue(struct list_head *icq_list)
{
unsigned long flags;
rcu_read_lock();
while (!list_empty(icq_list)) {
struct io_cq *icq = list_entry(icq_list->next,
struct io_cq, q_node);
struct io_context *ioc = icq->ioc;
spin_lock_irqsave(&ioc->lock, flags);
if (icq->flags & ICQ_DESTROYED) {
spin_unlock_irqrestore(&ioc->lock, flags);
continue;
}
ioc_destroy_icq(icq);
spin_unlock_irqrestore(&ioc->lock, flags);
}
rcu_read_unlock();
}
/**
* ioc_clear_queue - break any ioc association with the specified queue
* @q: request_queue being cleared
*
* Walk @q->icq_list and exit all io_cq's.
*/
void ioc_clear_queue(struct request_queue *q)
{
LIST_HEAD(icq_list);
spin_lock_irq(&q->queue_lock);
list_splice_init(&q->icq_list, &icq_list);
spin_unlock_irq(&q->queue_lock);
__ioc_clear_queue(&icq_list);
}
int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
{
struct io_context *ioc;
int ret;
ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
node);
if (unlikely(!ioc))
return -ENOMEM;
/* initialize */
atomic_long_set(&ioc->refcount, 1);
atomic_set(&ioc->nr_tasks, 1);
atomic_set(&ioc->active_ref, 1);
spin_lock_init(&ioc->lock);
INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn);
/*
* Try to install. ioc shouldn't be installed if someone else
* already did or @task, which isn't %current, is exiting. Note
* that we need to allow ioc creation on exiting %current as exit
* path may issue IOs from e.g. exit_files(). The exit path is
* responsible for not issuing IO after exit_io_context().
*/
task_lock(task);
if (!task->io_context &&
(task == current || !(task->flags & PF_EXITING))) task->io_context = ioc;
else
kmem_cache_free(iocontext_cachep, ioc); ret = task->io_context ? 0 : -EBUSY;
task_unlock(task);
return ret;
}
/**
* get_task_io_context - get io_context of a task
* @task: task of interest
* @gfp_flags: allocation flags, used if allocation is necessary
* @node: allocation node, used if allocation is necessary
*
* Return io_context of @task. If it doesn't exist, it is created with
* @gfp_flags and @node. The returned io_context has its reference count
* incremented.
*
* This function always goes through task_lock() and it's better to use
* %current->io_context + get_io_context() for %current.
*/
struct io_context *get_task_io_context(struct task_struct *task,
gfp_t gfp_flags, int node)
{
struct io_context *ioc;
might_sleep_if(gfpflags_allow_blocking(gfp_flags));
do {
task_lock(task);
ioc = task->io_context;
if (likely(ioc)) {
get_io_context(ioc);
task_unlock(task);
return ioc;
}
task_unlock(task);
} while (!create_task_io_context(task, gfp_flags, node));
return NULL;
}
/**
* ioc_lookup_icq - lookup io_cq from ioc
* @ioc: the associated io_context
* @q: the associated request_queue
*
* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
* with @q->queue_lock held.
*/
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
{
struct io_cq *icq;
lockdep_assert_held(&q->queue_lock);
/*
* icq's are indexed from @ioc using radix tree and hint pointer,
* both of which are protected with RCU. All removals are done
* holding both q and ioc locks, and we're holding q lock - if we
* find a icq which points to us, it's guaranteed to be valid.
*/
rcu_read_lock();
icq = rcu_dereference(ioc->icq_hint);
if (icq && icq->q == q)
goto out;
icq = radix_tree_lookup(&ioc->icq_tree, q->id);
if (icq && icq->q == q)
rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */
else
icq = NULL;
out:
rcu_read_unlock();
return icq;
}
EXPORT_SYMBOL(ioc_lookup_icq);
/**
* ioc_create_icq - create and link io_cq
* @ioc: io_context of interest
* @q: request_queue of interest
* @gfp_mask: allocation mask
*
* Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they
* will be created using @gfp_mask.
*
* The caller is responsible for ensuring @ioc won't go away and @q is
* alive and will stay alive until this function returns.
*/
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
gfp_t gfp_mask)
{
struct elevator_type *et = q->elevator->type;
struct io_cq *icq;
/* allocate stuff */
icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
q->node);
if (!icq)
return NULL;
if (radix_tree_maybe_preload(gfp_mask) < 0) {
kmem_cache_free(et->icq_cache, icq);
return NULL;
}
icq->ioc = ioc;
icq->q = q;
INIT_LIST_HEAD(&icq->q_node);
INIT_HLIST_NODE(&icq->ioc_node);
/* lock both q and ioc and try to link @icq */
spin_lock_irq(&q->queue_lock);
spin_lock(&ioc->lock);
if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
hlist_add_head(&icq->ioc_node, &ioc->icq_list);
list_add(&icq->q_node, &q->icq_list);
if (et->ops.init_icq)
et->ops.init_icq(icq);
} else {
kmem_cache_free(et->icq_cache, icq);
icq = ioc_lookup_icq(ioc, q);
if (!icq)
printk(KERN_ERR "cfq: icq link failed!\n");
}
spin_unlock(&ioc->lock);
spin_unlock_irq(&q->queue_lock);
radix_tree_preload_end();
return icq;
}
static int __init blk_ioc_init(void)
{
iocontext_cachep = kmem_cache_create("blkdev_ioc",
sizeof(struct io_context), 0, SLAB_PANIC, NULL);
return 0;
}
subsys_initcall(blk_ioc_init);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_MM_H
#define _LINUX_SCHED_MM_H
#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/gfp.h>
#include <linux/sync_core.h>
/*
* Routines for handling mm_structs
*/
extern struct mm_struct *mm_alloc(void);
/**
* mmgrab() - Pin a &struct mm_struct.
* @mm: The &struct mm_struct to pin.
*
* Make sure that @mm will not get freed even after the owning task
* exits. This doesn't guarantee that the associated address space
* will still exist later on and mmget_not_zero() has to be used before
* accessing it.
*
* This is a preferred way to pin @mm for a longer/unbounded amount
* of time.
*
* Use mmdrop() to release the reference acquired by mmgrab().
*
* See also <Documentation/vm/active_mm.rst> for an in-depth explanation
* of &mm_struct.mm_count vs &mm_struct.mm_users.
*/
static inline void mmgrab(struct mm_struct *mm)
{
atomic_inc(&mm->mm_count);
}
extern void __mmdrop(struct mm_struct *mm);
static inline void mmdrop(struct mm_struct *mm)
{
/*
* The implicit full barrier implied by atomic_dec_and_test() is
* required by the membarrier system call before returning to
* user-space, after storing to rq->curr.
*/
if (unlikely(atomic_dec_and_test(&mm->mm_count)))
__mmdrop(mm);
}
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
*
* Make sure that the address space of the given &struct mm_struct doesn't
* go away. This does not protect against parts of the address space being
* modified or freed, however.
*
* Never use this function to pin this address space for an
* unbounded/indefinite amount of time.
*
* Use mmput() to release the reference acquired by mmget().
*
* See also <Documentation/vm/active_mm.rst> for an in-depth explanation
* of &mm_struct.mm_count vs &mm_struct.mm_users.
*/
static inline void mmget(struct mm_struct *mm)
{
atomic_inc(&mm->mm_users);
}
static inline bool mmget_not_zero(struct mm_struct *mm)
{
return atomic_inc_not_zero(&mm->mm_users);
}
/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
#ifdef CONFIG_MMU
/* same as above but performs the slow path from the async context. Can
* be called from the atomic context as well
*/
void mmput_async(struct mm_struct *);
#endif
/* Grab a reference to a task's mm, if it is not already going away */
extern struct mm_struct *get_task_mm(struct task_struct *task);
/*
* Grab a reference to a task's mm, if it is not already going away
* and ptrace_may_access with the mode parameter passed to it
* succeeds.
*/
extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
/* Remove the current tasks stale references to the old mm_struct on exit() */
extern void exit_mm_release(struct task_struct *, struct mm_struct *);
/* Remove the current tasks stale references to the old mm_struct on exec() */
extern void exec_mm_release(struct task_struct *, struct mm_struct *);
#ifdef CONFIG_MEMCG
extern void mm_update_next_owner(struct mm_struct *mm);
#else
static inline void mm_update_next_owner(struct mm_struct *mm)
{
}
#endif /* CONFIG_MEMCG */
#ifdef CONFIG_MMU
#ifndef arch_get_mmap_end
#define arch_get_mmap_end(addr) (TASK_SIZE)
#endif
#ifndef arch_get_mmap_base
#define arch_get_mmap_base(addr, base) (base)
#endif
extern void arch_pick_mmap_layout(struct mm_struct *mm,
struct rlimit *rlim_stack);
extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
unsigned long, unsigned long);
extern unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags);
#else
static inline void arch_pick_mmap_layout(struct mm_struct *mm,
struct rlimit *rlim_stack) {}
#endif
static inline bool in_vfork(struct task_struct *tsk)
{
bool ret;
/*
* need RCU to access ->real_parent if CLONE_VM was used along with
* CLONE_PARENT.
*
* We check real_parent->mm == tsk->mm because CLONE_VFORK does not
* imply CLONE_VM
*
* CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
* ->real_parent is not necessarily the task doing vfork(), so in
* theory we can't rely on task_lock() if we want to dereference it.
*
* And in this case we can't trust the real_parent->mm == tsk->mm
* check, it can be false negative. But we do not care, if init or
* another oom-unkillable task does this it should blame itself.
*/
rcu_read_lock();
ret = tsk->vfork_done &&
rcu_dereference(tsk->real_parent)->mm == tsk->mm;
rcu_read_unlock();
return ret;
}
/*
* Applies per-task gfp context to the given allocation flags.
* PF_MEMALLOC_NOIO implies GFP_NOIO
* PF_MEMALLOC_NOFS implies GFP_NOFS
* PF_MEMALLOC_PIN implies !GFP_MOVABLE
*/
static inline gfp_t current_gfp_context(gfp_t flags)
{
unsigned int pflags = READ_ONCE(current->flags);
if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) {
/*
* NOIO implies both NOIO and NOFS and it is a weaker context
* so always make sure it makes precedence
*/
if (pflags & PF_MEMALLOC_NOIO) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; if (pflags & PF_MEMALLOC_PIN) flags &= ~__GFP_MOVABLE;
}
return flags;
}
#ifdef CONFIG_LOCKDEP
extern void __fs_reclaim_acquire(unsigned long ip);
extern void __fs_reclaim_release(unsigned long ip);
extern void fs_reclaim_acquire(gfp_t gfp_mask);
extern void fs_reclaim_release(gfp_t gfp_mask);
#else
static inline void __fs_reclaim_acquire(unsigned long ip) { }
static inline void __fs_reclaim_release(unsigned long ip) { }
static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
static inline void fs_reclaim_release(gfp_t gfp_mask) { }
#endif
/**
* might_alloc - Mark possible allocation sites
* @gfp_mask: gfp_t flags that would be used to allocate
*
* Similar to might_sleep() and other annotations, this can be used in functions
* that might allocate, but often don't. Compiles to nothing without
* CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp allows blocking.
*/
static inline void might_alloc(gfp_t gfp_mask)
{
fs_reclaim_acquire(gfp_mask);
fs_reclaim_release(gfp_mask);
might_sleep_if(gfpflags_allow_blocking(gfp_mask));
}
/**
* memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
*
* This functions marks the beginning of the GFP_NOIO allocation scope.
* All further allocations will implicitly drop __GFP_IO flag and so
* they are safe for the IO critical section from the allocation recursion
* point of view. Use memalloc_noio_restore to end the scope with flags
* returned by this function.
*
* This function is safe to be used from any context.
*/
static inline unsigned int memalloc_noio_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
current->flags |= PF_MEMALLOC_NOIO;
return flags;
}
/**
* memalloc_noio_restore - Ends the implicit GFP_NOIO scope.
* @flags: Flags to restore.
*
* Ends the implicit GFP_NOIO scope started by memalloc_noio_save function.
* Always make sure that the given flags is the return value from the
* pairing memalloc_noio_save call.
*/
static inline void memalloc_noio_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
}
/**
* memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope.
*
* This functions marks the beginning of the GFP_NOFS allocation scope.
* All further allocations will implicitly drop __GFP_FS flag and so
* they are safe for the FS critical section from the allocation recursion
* point of view. Use memalloc_nofs_restore to end the scope with flags
* returned by this function.
*
* This function is safe to be used from any context.
*/
static inline unsigned int memalloc_nofs_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
current->flags |= PF_MEMALLOC_NOFS;
return flags;
}
/**
* memalloc_nofs_restore - Ends the implicit GFP_NOFS scope.
* @flags: Flags to restore.
*
* Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function.
* Always make sure that the given flags is the return value from the
* pairing memalloc_nofs_save call.
*/
static inline void memalloc_nofs_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
}
static inline unsigned int memalloc_noreclaim_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC;
current->flags |= PF_MEMALLOC;
return flags;
}
static inline void memalloc_noreclaim_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC) | flags;
}
static inline unsigned int memalloc_pin_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC_PIN;
current->flags |= PF_MEMALLOC_PIN;
return flags;
}
static inline void memalloc_pin_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags;
}
#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/**
* set_active_memcg - Starts the remote memcg charging scope.
* @memcg: memcg to charge.
*
* This function marks the beginning of the remote memcg charging scope. All the
* __GFP_ACCOUNT allocations till the end of the scope will be charged to the
* given memcg.
*
* NOTE: This function can nest. Users must save the return value and
* reset the previous value after their own charging scope is over.
*/
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
struct mem_cgroup *old;
if (!in_task()) {
old = this_cpu_read(int_active_memcg);
this_cpu_write(int_active_memcg, memcg);
} else {
old = current->active_memcg;
current->active_memcg = memcg;
}
return old;
}
#else
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
return NULL;
}
#endif
#ifdef CONFIG_MEMBARRIER
enum {
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
MEMBARRIER_STATE_PRIVATE_EXPEDITED = (1U << 1),
MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY = (1U << 2),
MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY = (1U << 6),
MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ = (1U << 7),
};
enum {
MEMBARRIER_FLAG_SYNC_CORE = (1U << 0),
MEMBARRIER_FLAG_RSEQ = (1U << 1),
};
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
#include <asm/membarrier.h>
#endif
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
if (current->mm != mm)
return;
if (likely(!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
return;
sync_core_before_usermode();
}
extern void membarrier_exec_mmap(struct mm_struct *mm);
extern void membarrier_update_current_mm(struct mm_struct *next_mm);
#else
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
struct mm_struct *next,
struct task_struct *tsk)
{
}
#endif
static inline void membarrier_exec_mmap(struct mm_struct *mm)
{
}
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
}
static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
{
}
#endif
#endif /* _LINUX_SCHED_MM_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file provides wrappers with sanitizer instrumentation for atomic bit
* operations.
*
* To use this functionality, an arch's bitops.h file needs to define each of
* the below bit operations with an arch_ prefix (e.g. arch_set_bit(),
* arch___set_bit(), etc.).
*/
#ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H
#define _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H
#include <linux/instrumented.h>
/**
* set_bit - Atomically set a bit in memory
* @nr: the bit to set
* @addr: the address to start counting from
*
* This is a relaxed atomic operation (no implied memory barriers).
*
* Note that @nr may be almost arbitrarily large; this function is not
* restricted to acting on a single-word quantity.
*/
static inline void set_bit(long nr, volatile unsigned long *addr)
{
instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long));
arch_set_bit(nr, addr);
}
/**
* clear_bit - Clears a bit in memory
* @nr: Bit to clear
* @addr: Address to start counting from
*
* This is a relaxed atomic operation (no implied memory barriers).
*/
static inline void clear_bit(long nr, volatile unsigned long *addr)
{
instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long));
arch_clear_bit(nr, addr);
}
/**
* change_bit - Toggle a bit in memory
* @nr: Bit to change
* @addr: Address to start counting from
*
* This is a relaxed atomic operation (no implied memory barriers).
*
* Note that @nr may be almost arbitrarily large; this function is not
* restricted to acting on a single-word quantity.
*/
static inline void change_bit(long nr, volatile unsigned long *addr)
{
instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long));
arch_change_bit(nr, addr);
}
/**
* test_and_set_bit - Set a bit and return its old value
* @nr: Bit to set
* @addr: Address to count from
*
* This is an atomic fully-ordered operation (implied full memory barrier).
*/
static inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
{
instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long));
return arch_test_and_set_bit(nr, addr);
}
/**
* test_and_clear_bit - Clear a bit and return its old value
* @nr: Bit to clear
* @addr: Address to count from
*
* This is an atomic fully-ordered operation (implied full memory barrier).
*/
static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
{
instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long));
return arch_test_and_clear_bit(nr, addr);
}
/**
* test_and_change_bit - Change a bit and return its old value
* @nr: Bit to change
* @addr: Address to count from
*
* This is an atomic fully-ordered operation (implied full memory barrier).
*/
static inline bool test_and_change_bit(long nr, volatile unsigned long *addr)
{
instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long));
return arch_test_and_change_bit(nr, addr);
}
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Out-of-line refcount functions.
*/
#include <linux/mutex.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/bug.h>
#define REFCOUNT_WARN(str) WARN_ONCE(1, "refcount_t: " str ".\n")
void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t)
{
refcount_set(r, REFCOUNT_SATURATED);
switch (t) {
case REFCOUNT_ADD_NOT_ZERO_OVF:
REFCOUNT_WARN("saturated; leaking memory");
break;
case REFCOUNT_ADD_OVF:
REFCOUNT_WARN("saturated; leaking memory");
break;
case REFCOUNT_ADD_UAF:
REFCOUNT_WARN("addition on 0; use-after-free");
break;
case REFCOUNT_SUB_UAF:
REFCOUNT_WARN("underflow; use-after-free");
break;
case REFCOUNT_DEC_LEAK:
REFCOUNT_WARN("decrement hit 0; leaking memory");
break;
default:
REFCOUNT_WARN("unknown saturation event!?");
}
}
EXPORT_SYMBOL(refcount_warn_saturate);
/**
* refcount_dec_if_one - decrement a refcount if it is 1
* @r: the refcount
*
* No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
* success thereof.
*
* Like all decrement operations, it provides release memory order and provides
* a control dependency.
*
* It can be used like a try-delete operator; this explicit case is provided
* and not cmpxchg in generic, because that would allow implementing unsafe
* operations.
*
* Return: true if the resulting refcount is 0, false otherwise
*/
bool refcount_dec_if_one(refcount_t *r)
{
int val = 1;
return atomic_try_cmpxchg_release(&r->refs, &val, 0);
}
EXPORT_SYMBOL(refcount_dec_if_one);
/**
* refcount_dec_not_one - decrement a refcount if it is not 1
* @r: the refcount
*
* No atomic_t counterpart, it decrements unless the value is 1, in which case
* it will return false.
*
* Was often done like: atomic_add_unless(&var, -1, 1)
*
* Return: true if the decrement operation was successful, false otherwise
*/
bool refcount_dec_not_one(refcount_t *r)
{
unsigned int new, val = atomic_read(&r->refs);
do {
if (unlikely(val == REFCOUNT_SATURATED))
return true;
if (val == 1)
return false;
new = val - 1;
if (new > val) {
WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n");
return true;
}
} while (!atomic_try_cmpxchg_release(&r->refs, &val, new));
return true;
}
EXPORT_SYMBOL(refcount_dec_not_one);
/**
* refcount_dec_and_mutex_lock - return holding mutex if able to decrement
* refcount to 0
* @r: the refcount
* @lock: the mutex to be locked
*
* Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
* to decrement when saturated at REFCOUNT_SATURATED.
*
* Provides release memory ordering, such that prior loads and stores are done
* before, and provides a control dependency such that free() must come after.
* See the comment on top.
*
* Return: true and hold mutex if able to decrement refcount to 0, false
* otherwise
*/
bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
{
if (refcount_dec_not_one(r))
return false;
mutex_lock(lock);
if (!refcount_dec_and_test(r)) {
mutex_unlock(lock);
return false;
}
return true;
}
EXPORT_SYMBOL(refcount_dec_and_mutex_lock);
/**
* refcount_dec_and_lock - return holding spinlock if able to decrement
* refcount to 0
* @r: the refcount
* @lock: the spinlock to be locked
*
* Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
* decrement when saturated at REFCOUNT_SATURATED.
*
* Provides release memory ordering, such that prior loads and stores are done
* before, and provides a control dependency such that free() must come after.
* See the comment on top.
*
* Return: true and hold spinlock if able to decrement refcount to 0, false
* otherwise
*/
bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
{
if (refcount_dec_not_one(r))
return false;
spin_lock(lock);
if (!refcount_dec_and_test(r)) {
spin_unlock(lock);
return false;
}
return true;
}
EXPORT_SYMBOL(refcount_dec_and_lock);
/**
* refcount_dec_and_lock_irqsave - return holding spinlock with disabled
* interrupts if able to decrement refcount to 0
* @r: the refcount
* @lock: the spinlock to be locked
* @flags: saved IRQ-flags if the is acquired
*
* Same as refcount_dec_and_lock() above except that the spinlock is acquired
* with disabled interrupts.
*
* Return: true and hold spinlock if able to decrement refcount to 0, false
* otherwise
*/
bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock,
unsigned long *flags)
{
if (refcount_dec_not_one(r))
return false;
spin_lock_irqsave(lock, *flags);
if (!refcount_dec_and_test(r)) {
spin_unlock_irqrestore(lock, *flags);
return false;
}
return true;
}
EXPORT_SYMBOL(refcount_dec_and_lock_irqsave);
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Definitions for the TCP protocol.
*
* Version: @(#)tcp.h 1.0.2 04/28/93
*
* Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
*/
#ifndef _LINUX_TCP_H
#define _LINUX_TCP_H
#include <linux/skbuff.h>
#include <linux/win_minmax.h>
#include <net/sock.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
#include <uapi/linux/tcp.h>
static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
{
return (struct tcphdr *)skb_transport_header(skb);
}
static inline unsigned int __tcp_hdrlen(const struct tcphdr *th)
{
return th->doff * 4;
}
static inline unsigned int tcp_hdrlen(const struct sk_buff *skb)
{
return __tcp_hdrlen(tcp_hdr(skb));
}
static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb)
{
return (struct tcphdr *)skb_inner_transport_header(skb);
}
static inline unsigned int inner_tcp_hdrlen(const struct sk_buff *skb)
{
return inner_tcp_hdr(skb)->doff * 4;
}
static inline unsigned int tcp_optlen(const struct sk_buff *skb)
{
return (tcp_hdr(skb)->doff - 5) * 4;
}
/* TCP Fast Open */
#define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */
/* TCP Fast Open Cookie as stored in memory */
struct tcp_fastopen_cookie {
__le64 val[DIV_ROUND_UP(TCP_FASTOPEN_COOKIE_MAX, sizeof(u64))];
s8 len;
bool exp; /* In RFC6994 experimental option format */
};
/* This defines a selective acknowledgement block. */
struct tcp_sack_block_wire {
__be32 start_seq;
__be32 end_seq;
};
struct tcp_sack_block {
u32 start_seq;
u32 end_seq;
};
/*These are used to set the sack_ok field in struct tcp_options_received */
#define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */
#define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/
struct tcp_options_received {
/* PAWS/RTTM data */
int ts_recent_stamp;/* Time we stored ts_recent (for aging) */
u32 ts_recent; /* Time stamp to echo next */
u32 rcv_tsval; /* Time stamp value */
u32 rcv_tsecr; /* Time stamp echo reply */
u16 saw_tstamp : 1, /* Saw TIMESTAMP on last packet */
tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */
dsack : 1, /* D-SACK is scheduled */
wscale_ok : 1, /* Wscale seen on SYN packet */
sack_ok : 3, /* SACK seen on SYN packet */
smc_ok : 1, /* SMC seen on SYN packet */
snd_wscale : 4, /* Window scaling received from sender */
rcv_wscale : 4; /* Window scaling to send to receiver */
u8 saw_unknown:1, /* Received unknown option */
unused:7;
u8 num_sacks; /* Number of SACK blocks */
u16 user_mss; /* mss requested by user in ioctl */
u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
};
static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
{
rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
#if IS_ENABLED(CONFIG_SMC)
rx_opt->smc_ok = 0;
#endif
}
/* This is the max number of SACKS that we'll generate and process. It's safe
* to increase this, although since:
* size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
* only four options will fit in a standard TCP header */
#define TCP_NUM_SACKS 4
struct tcp_request_sock_ops;
struct tcp_request_sock {
struct inet_request_sock req;
const struct tcp_request_sock_ops *af_specific;
u64 snt_synack; /* first SYNACK sent time */
bool tfo_listener;
bool is_mptcp;
#if IS_ENABLED(CONFIG_MPTCP)
bool drop_req;
#endif
u32 txhash;
u32 rcv_isn;
u32 snt_isn;
u32 ts_off;
u32 last_oow_ack_time; /* last SYNACK */
u32 rcv_nxt; /* the ack # by SYNACK. For
* FastOpen it's the seq#
* after data-in-SYN.
*/
u8 syn_tos;
};
static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
{
return (struct tcp_request_sock *)req;
}
struct tcp_sock {
/* inet_connection_sock has to be the first member of tcp_sock */
struct inet_connection_sock inet_conn;
u16 tcp_header_len; /* Bytes of tcp header to send */
u16 gso_segs; /* Max number of segs per GSO packet */
/*
* Header prediction flags
* 0x5?10 << 16 + snd_wnd in net byte order
*/
__be32 pred_flags;
/*
* RFC793 variables by their proper names. This means you can
* read the code and the spec side by side (and laugh ...)
* See RFC793 and RFC1122. The RFC writes these in capitals.
*/
u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived
* sum(delta(rcv_nxt)), or how many bytes
* were acked.
*/
u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn
* total number of segments in.
*/
u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn
* total number of data segments in.
*/
u32 rcv_nxt; /* What we want to receive next */
u32 copied_seq; /* Head of yet unread data */
u32 rcv_wup; /* rcv_nxt on last window update sent */
u32 snd_nxt; /* Next sequence we send */
u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut
* The total number of segments sent.
*/
u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut
* total number of data segments sent.
*/
u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut
* total number of data bytes sent.
*/
u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked
* sum(delta(snd_una)), or how many bytes
* were acked.
*/
u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups
* total number of DSACK blocks received
*/
u32 snd_una; /* First byte we want an ack for */
u32 snd_sml; /* Last byte of the most recently transmitted small packet */
u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */
u32 lsndtime; /* timestamp of last sent data packet (for restart window) */
u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */
u32 compressed_ack_rcv_nxt;
u32 tsoffset; /* timestamp offset */
struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
u32 snd_wl1; /* Sequence for window update */
u32 snd_wnd; /* The window we expect to receive */
u32 max_window; /* Maximal window ever seen from peer */
u32 mss_cache; /* Cached effective mss, not including SACKS */
u32 window_clamp; /* Maximal window to advertise */
u32 rcv_ssthresh; /* Current window clamp */
/* Information of the most recently (s)acked skb */
struct tcp_rack {
u64 mstamp; /* (Re)sent time of the skb */
u32 rtt_us; /* Associated RTT */
u32 end_seq; /* Ending TCP sequence of the skb */
u32 last_delivered; /* tp->delivered at last reo_wnd adj */
u8 reo_wnd_steps; /* Allowed reordering window */
#define TCP_RACK_RECOVERY_THRESH 16
u8 reo_wnd_persist:5, /* No. of recovery since last adj */
dsack_seen:1, /* Whether DSACK seen after last adj */
advanced:1; /* mstamp advanced since last lost marking */
} rack;
u16 advmss; /* Advertised MSS */
u8 compressed_ack;
u8 dup_ack_counter:2,
tlp_retrans:1, /* TLP is a retransmission */
unused:5;
u32 chrono_start; /* Start time in jiffies of a TCP chrono */
u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
u8 chrono_type:2, /* current chronograph type */
rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
is_sack_reneg:1, /* in recovery from loss with SACK reneg? */
fastopen_client_fail:2; /* reason why fastopen failed */
u8 nonagle : 4,/* Disable Nagle algorithm? */
thin_lto : 1,/* Use linear timeouts for thin streams */
recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
repair : 1,
frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */
u8 repair_queue;
u8 save_syn:2, /* Save headers of SYN packet */
syn_data:1, /* SYN includes data */
syn_fastopen:1, /* SYN includes Fast Open option */
syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
syn_fastopen_ch:1, /* Active TFO re-enabling probe */
syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
u32 tlp_high_seq; /* snd_nxt at the time of TLP */
u32 tcp_tx_delay; /* delay (in usec) added to TX packets */
u64 tcp_wstamp_ns; /* departure time for next sent data packet */
u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
/* RTT measurement */
u64 tcp_mstamp; /* most recent packet received/sent */
u32 srtt_us; /* smoothed round trip time << 3 in usecs */
u32 mdev_us; /* medium deviation */
u32 mdev_max_us; /* maximal mdev for the last rtt period */
u32 rttvar_us; /* smoothed mdev_max */
u32 rtt_seq; /* sequence number to update rttvar */
struct minmax rtt_min;
u32 packets_out; /* Packets which are "in flight" */
u32 retrans_out; /* Retransmitted packets out */
u32 max_packets_out; /* max packets_out in last window */
u32 max_packets_seq; /* right edge of max_packets_out flight */
u16 urg_data; /* Saved octet of OOB data and control flags */
u8 ecn_flags; /* ECN status bits. */
u8 keepalive_probes; /* num of allowed keep alive probes */
u32 reordering; /* Packet reordering metric. */
u32 reord_seen; /* number of data packet reordering events */
u32 snd_up; /* Urgent pointer */
/*
* Options received (usually on last packet, some only on SYN packets).
*/
struct tcp_options_received rx_opt;
/*
* Slow start and congestion control (see also Nagle, and Karn & Partridge)
*/
u32 snd_ssthresh; /* Slow start size threshold */
u32 snd_cwnd; /* Sending congestion window */
u32 snd_cwnd_cnt; /* Linear increase counter */
u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
u32 snd_cwnd_used;
u32 snd_cwnd_stamp;
u32 prior_cwnd; /* cwnd right before starting loss recovery */
u32 prr_delivered; /* Number of newly delivered packets to
* receiver in Recovery. */
u32 prr_out; /* Total number of pkts sent during Recovery. */
u32 delivered; /* Total data packets delivered incl. rexmits */
u32 delivered_ce; /* Like the above but only ECE marked packets */
u32 lost; /* Total data packets lost incl. rexmits */
u32 app_limited; /* limited until "delivered" reaches this val */
u64 first_tx_mstamp; /* start of window send phase */
u64 delivered_mstamp; /* time we reached "delivered" */
u32 rate_delivered; /* saved rate sample: packets delivered */
u32 rate_interval_us; /* saved rate sample: time elapsed */
u32 rcv_wnd; /* Current receiver window */
u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */
u32 pushed_seq; /* Last pushed seq, required to talk to windows */
u32 lost_out; /* Lost packets */
u32 sacked_out; /* SACK'd packets */
struct hrtimer pacing_timer;
struct hrtimer compressed_ack_timer;
/* from STCP, retrans queue hinting */
struct sk_buff* lost_skb_hint;
struct sk_buff *retransmit_skb_hint;
/* OOO segments go in this rbtree. Socket lock must be held. */
struct rb_root out_of_order_queue;
struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */
/* SACKs data, these 2 need to be together (see tcp_options_write) */
struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
struct tcp_sack_block recv_sack_cache[4];
struct sk_buff *highest_sack; /* skb just after the highest
* skb with SACKed bit set
* (validity guaranteed only if
* sacked_out > 0)
*/
int lost_cnt_hint;
u32 prior_ssthresh; /* ssthresh saved at recovery start */
u32 high_seq; /* snd_nxt at onset of congestion */
u32 retrans_stamp; /* Timestamp of the last retransmit,
* also used in SYN-SENT to remember stamp of
* the first SYN. */
u32 undo_marker; /* snd_una upon a new recovery episode. */
int undo_retrans; /* number of undoable retransmissions. */
u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans
* Total data bytes retransmitted
*/
u32 total_retrans; /* Total retransmits for entire connection */
u32 urg_seq; /* Seq of received urgent pointer */
unsigned int keepalive_time; /* time before keep alive takes place */
unsigned int keepalive_intvl; /* time interval between keep alive probes */
int linger2;
/* Sock_ops bpf program related variables */
#ifdef CONFIG_BPF
u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs
* values defined in uapi/linux/tcp.h
*/
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
#else
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
#endif
u16 timeout_rehash; /* Timeout-triggered rehash attempts */
u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */
/* Receiver side RTT estimation */
u32 rcv_rtt_last_tsecr;
struct {
u32 rtt_us;
u32 seq;
u64 time;
} rcv_rtt_est;
/* Receiver queue space */
struct {
u32 space;
u32 seq;
u64 time;
} rcvq_space;
/* TCP-specific MTU probe information. */
struct {
u32 probe_seq_start;
u32 probe_seq_end;
} mtu_probe;
u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
* while socket was owned by user.
*/
#if IS_ENABLED(CONFIG_MPTCP)
bool is_mptcp;
#endif
#if IS_ENABLED(CONFIG_SMC)
bool syn_smc; /* SYN includes SMC */
#endif
#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
const struct tcp_sock_af_ops *af_specific;
/* TCP MD5 Signature Option information */
struct tcp_md5sig_info __rcu *md5sig_info;
#endif
/* TCP fastopen related information */
struct tcp_fastopen_request *fastopen_req;
/* fastopen_rsk points to request_sock that resulted in this big
* socket. Used to retransmit SYNACKs etc.
*/
struct request_sock __rcu *fastopen_rsk;
struct saved_syn *saved_syn;
};
enum tsq_enum {
TSQ_THROTTLED,
TSQ_QUEUED,
TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */
TCP_WRITE_TIMER_DEFERRED, /* tcp_write_timer() found socket was owned */
TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */
TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
* tcp_v{4|6}_mtu_reduced()
*/
};
enum tsq_flags {
TSQF_THROTTLED = (1UL << TSQ_THROTTLED),
TSQF_QUEUED = (1UL << TSQ_QUEUED),
TCPF_TSQ_DEFERRED = (1UL << TCP_TSQ_DEFERRED),
TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
};
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
return (struct tcp_sock *)sk;
}
struct tcp_timewait_sock {
struct inet_timewait_sock tw_sk;
#define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt
#define tw_snd_nxt tw_sk.__tw_common.skc_tw_snd_nxt
u32 tw_rcv_wnd;
u32 tw_ts_offset;
u32 tw_ts_recent;
/* The time we sent the last out-of-window ACK: */
u32 tw_last_oow_ack_time;
int tw_ts_recent_stamp;
u32 tw_tx_delay;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *tw_md5_key;
#endif
};
static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
{
return (struct tcp_timewait_sock *)sk;
}
static inline bool tcp_passive_fastopen(const struct sock *sk)
{
return sk->sk_state == TCP_SYN_RECV && rcu_access_pointer(tcp_sk(sk)->fastopen_rsk) != NULL;
}
static inline void fastopen_queue_tune(struct sock *sk, int backlog)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
int somaxconn = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn);
queue->fastopenq.max_qlen = min_t(unsigned int, backlog, somaxconn);
}
static inline void tcp_move_syn(struct tcp_sock *tp,
struct request_sock *req)
{
tp->saved_syn = req->saved_syn;
req->saved_syn = NULL;
}
static inline void tcp_saved_syn_free(struct tcp_sock *tp)
{
kfree(tp->saved_syn);
tp->saved_syn = NULL;
}
static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn)
{
return saved_syn->mac_hdrlen + saved_syn->network_hdrlen +
saved_syn->tcp_hdrlen;
}
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
const struct sk_buff *orig_skb,
const struct sk_buff *ack_skb);
static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss)
{
/* We use READ_ONCE() here because socket might not be locked.
* This happens for listeners.
*/
u16 user_mss = READ_ONCE(tp->rx_opt.user_mss);
return (user_mss && user_mss < mss) ? user_mss : mss;
}
int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
int shiftlen);
void tcp_sock_set_cork(struct sock *sk, bool on);
int tcp_sock_set_keepcnt(struct sock *sk, int val);
int tcp_sock_set_keepidle_locked(struct sock *sk, int val);
int tcp_sock_set_keepidle(struct sock *sk, int val);
int tcp_sock_set_keepintvl(struct sock *sk, int val);
void tcp_sock_set_nodelay(struct sock *sk);
void tcp_sock_set_quickack(struct sock *sk, int val);
int tcp_sock_set_syncnt(struct sock *sk, int val);
void tcp_sock_set_user_timeout(struct sock *sk, u32 val);
#endif /* _LINUX_TCP_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* async.c: Asynchronous function calls for boot performance
*
* (C) Copyright 2009 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*/
/*
Goals and Theory of Operation
The primary goal of this feature is to reduce the kernel boot time,
by doing various independent hardware delays and discovery operations
decoupled and not strictly serialized.
More specifically, the asynchronous function call concept allows
certain operations (primarily during system boot) to happen
asynchronously, out of order, while these operations still
have their externally visible parts happen sequentially and in-order.
(not unlike how out-of-order CPUs retire their instructions in order)
Key to the asynchronous function call implementation is the concept of
a "sequence cookie" (which, although it has an abstracted type, can be
thought of as a monotonically incrementing number).
The async core will assign each scheduled event such a sequence cookie and
pass this to the called functions.
The asynchronously called function should before doing a globally visible
operation, such as registering device numbers, call the
async_synchronize_cookie() function and pass in its own cookie. The
async_synchronize_cookie() function will make sure that all asynchronous
operations that were scheduled prior to the operation corresponding with the
cookie have completed.
Subsystem/driver initialization code that scheduled asynchronous probe
functions, but which shares global resources with other drivers/subsystems
that do not use the asynchronous call feature, need to do a full
synchronization with the async_synchronize_full() function, before returning
from their init function. This is to maintain strict ordering between the
asynchronous and synchronous parts of the kernel.
*/
#include <linux/async.h>
#include <linux/atomic.h>
#include <linux/ktime.h>
#include <linux/export.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include "workqueue_internal.h"
static async_cookie_t next_cookie = 1;
#define MAX_WORK 32768
#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
static LIST_HEAD(async_global_pending); /* pending from all registered doms */
static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
struct async_entry {
struct list_head domain_list;
struct list_head global_list;
struct work_struct work;
async_cookie_t cookie;
async_func_t func;
void *data;
struct async_domain *domain;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
static long long microseconds_since(ktime_t start)
{
ktime_t now = ktime_get();
return ktime_to_ns(ktime_sub(now, start)) >> 10;
}
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
struct async_entry *first = NULL;
async_cookie_t ret = ASYNC_COOKIE_MAX;
unsigned long flags;
spin_lock_irqsave(&async_lock, flags);
if (domain) {
if (!list_empty(&domain->pending))
first = list_first_entry(&domain->pending,
struct async_entry, domain_list);
} else {
if (!list_empty(&async_global_pending))
first = list_first_entry(&async_global_pending,
struct async_entry, global_list);
}
if (first)
ret = first->cookie;
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
/*
* pick the first pending entry and run it
*/
static void async_run_entry_fn(struct work_struct *work)
{
struct async_entry *entry =
container_of(work, struct async_entry, work);
unsigned long flags;
ktime_t calltime;
/* 1) run (and print duration) */
pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
entry->func(entry->data, entry->cookie);
pr_debug("initcall %lli_%pS returned after %lld usecs\n",
(long long)entry->cookie, entry->func,
microseconds_since(calltime));
/* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
list_del_init(&entry->domain_list);
list_del_init(&entry->global_list);
/* 3) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
/* 4) wake up any waiters */
wake_up(&async_done);
}
/**
* async_schedule_node_domain - NUMA specific version of async_schedule_domain
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
* @node: NUMA node that we want to schedule this on or close to
* @domain: the domain
*
* Returns an async_cookie_t that may be used for checkpointing later.
* @domain may be used in the async_synchronize_*_domain() functions to
* wait within a certain synchronization domain rather than globally.
*
* Note: This function may be called from atomic or non-atomic contexts.
*
* The node requested will be honored on a best effort basis. If the node
* has no CPUs associated with it then the work is distributed among all
* available CPUs.
*/
async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
int node, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
async_cookie_t newcookie;
/* allow irq-off callers */
entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
/*
* If we're out of memory or if there's too much work
* pending already, we execute synchronously.
*/
if (!entry || atomic_read(&entry_count) > MAX_WORK) {
kfree(entry);
spin_lock_irqsave(&async_lock, flags);
newcookie = next_cookie++;
spin_unlock_irqrestore(&async_lock, flags);
/* low on memory.. run synchronously */
func(data, newcookie);
return newcookie;
}
INIT_LIST_HEAD(&entry->domain_list);
INIT_LIST_HEAD(&entry->global_list);
INIT_WORK(&entry->work, async_run_entry_fn);
entry->func = func;
entry->data = data;
entry->domain = domain;
spin_lock_irqsave(&async_lock, flags);
/* allocate cookie and queue */
newcookie = entry->cookie = next_cookie++;
list_add_tail(&entry->domain_list, &domain->pending);
if (domain->registered)
list_add_tail(&entry->global_list, &async_global_pending);
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
/* schedule for execution */
queue_work_node(node, system_unbound_wq, &entry->work);
return newcookie;
}
EXPORT_SYMBOL_GPL(async_schedule_node_domain);
/**
* async_schedule_node - NUMA specific version of async_schedule
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
* @node: NUMA node that we want to schedule this on or close to
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
*
* The node requested will be honored on a best effort basis. If the node
* has no CPUs associated with it then the work is distributed among all
* available CPUs.
*/
async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
{
return async_schedule_node_domain(func, data, node, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule_node);
/**
* async_synchronize_full - synchronize all asynchronous function calls
*
* This function waits until all asynchronous function calls have been done.
*/
void async_synchronize_full(void)
{
async_synchronize_full_domain(NULL);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
* @domain: the domain to synchronize
*
* This function waits until all asynchronous function calls for the
* synchronization domain specified by @domain have been done.
*/
void async_synchronize_full_domain(struct async_domain *domain)
{
async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
/**
* async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
* @domain: the domain to synchronize (%NULL for all registered domains)
*
* This function waits until all asynchronous function calls for the
* synchronization domain specified by @domain submitted prior to @cookie
* have been done.
*/
void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
ktime_t starttime;
pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
wait_event(async_done, lowest_in_progress(domain) >= cookie);
pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current),
microseconds_since(starttime));
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
/**
* async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
*
* This function waits until all asynchronous function calls prior to @cookie
* have been done.
*/
void async_synchronize_cookie(async_cookie_t cookie)
{
async_synchronize_cookie_domain(cookie, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
/**
* current_is_async - is %current an async worker task?
*
* Returns %true if %current is an async worker task.
*/
bool current_is_async(void)
{
struct worker *worker = current_wq_worker();
return worker && worker->current_func == async_run_entry_fn;
}
EXPORT_SYMBOL_GPL(current_is_async);
// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/truncate.c - code for taking down pages from address_spaces
*
* Copyright (C) 2002, Linus Torvalds
*
* 10Sep2002 Andrew Morton
* Initial version.
*/
#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
#include <linux/shmem_fs.h>
#include <linux/cleancache.h>
#include <linux/rmap.h>
#include "internal.h"
/*
* Regular page slots are stabilized by the page lock even without the tree
* itself locked. These unlocked entries need verification under the tree
* lock.
*/
static inline void __clear_shadow_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
XA_STATE(xas, &mapping->i_pages, index);
xas_set_update(&xas, workingset_update_node);
if (xas_load(&xas) != entry)
return; xas_store(&xas, NULL);
}
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
void *entry)
{
xa_lock_irq(&mapping->i_pages);
__clear_shadow_entry(mapping, index, entry);
xa_unlock_irq(&mapping->i_pages);
}
/*
* Unconditionally remove exceptional entries. Usually called from truncate
* path. Note that the pagevec may be altered by this function by removing
* exceptional entries similar to what pagevec_remove_exceptionals does.
*/
static void truncate_exceptional_pvec_entries(struct address_space *mapping,
struct pagevec *pvec, pgoff_t *indices)
{
int i, j;
bool dax;
/* Handled by shmem itself */
if (shmem_mapping(mapping))
return;
for (j = 0; j < pagevec_count(pvec); j++) if (xa_is_value(pvec->pages[j]))
break;
if (j == pagevec_count(pvec))
return;
dax = dax_mapping(mapping);
if (!dax)
xa_lock_irq(&mapping->i_pages);
for (i = j; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; pgoff_t index = indices[i];
if (!xa_is_value(page)) {
pvec->pages[j++] = page;
continue;
}
if (unlikely(dax)) {
dax_delete_mapping_entry(mapping, index);
continue;
}
__clear_shadow_entry(mapping, index, page);
}
if (!dax)
xa_unlock_irq(&mapping->i_pages);
pvec->nr = j;
}
/*
* Invalidate exceptional entry if easily possible. This handles exceptional
* entries for invalidate_inode_pages().
*/
static int invalidate_exceptional_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
/* Handled by shmem itself, or for DAX we do nothing. */
if (shmem_mapping(mapping) || dax_mapping(mapping))
return 1;
clear_shadow_entry(mapping, index, entry);
return 1;
}
/*
* Invalidate exceptional entry if clean. This handles exceptional entries for
* invalidate_inode_pages2() so for DAX it evicts only clean entries.
*/
static int invalidate_exceptional_entry2(struct address_space *mapping,
pgoff_t index, void *entry)
{
/* Handled by shmem itself */
if (shmem_mapping(mapping))
return 1;
if (dax_mapping(mapping))
return dax_invalidate_mapping_entry_sync(mapping, index);
clear_shadow_entry(mapping, index, entry);
return 1;
}
/**
* do_invalidatepage - invalidate part or all of a page
* @page: the page which is affected
* @offset: start of the range to invalidate
* @length: length of the range to invalidate
*
* do_invalidatepage() is called when all or part of the page has become
* invalidated by a truncate operation.
*
* do_invalidatepage() does not have to release all buffers, but it must
* ensure that no dirty buffer is left outside @offset and that no I/O
* is underway against any of the blocks which are outside the truncation
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
void do_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
void (*invalidatepage)(struct page *, unsigned int, unsigned int);
invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
if (!invalidatepage)
invalidatepage = block_invalidatepage;
#endif
if (invalidatepage) (*invalidatepage)(page, offset, length);
}
/*
* If truncate cannot remove the fs-private metadata from the page, the page
* becomes orphaned. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_fault().
*
* We need to bail out if page->mapping is no longer equal to the original
* mapping. This happens a) when the VM reclaimed the page while we waited on
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
static void truncate_cleanup_page(struct page *page)
{
if (page_mapped(page)) unmap_mapping_page(page); if (page_has_private(page))
do_invalidatepage(page, 0, thp_size(page));
/*
* Some filesystems seem to re-dirty the page even after
* the VM has canceled the dirty bit (eg ext3 journaling).
* Hence dirty accounting check is placed after invalidation.
*/
cancel_dirty_page(page);
ClearPageMappedToDisk(page);
}
/*
* This is for invalidate_mapping_pages(). That function can be called at
* any time, and is not supposed to throw away dirty pages. But pages can
* be marked dirty at any time too, so use remove_mapping which safely
* discards clean, unused pages.
*
* Returns non-zero if the page was successfully invalidated.
*/
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
int ret;
if (page->mapping != mapping)
return 0;
if (page_has_private(page) && !try_to_release_page(page, 0))
return 0;
ret = remove_mapping(mapping, page);
return ret;
}
int truncate_inode_page(struct address_space *mapping, struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
if (page->mapping != mapping)
return -EIO;
truncate_cleanup_page(page);
delete_from_page_cache(page);
return 0;
}
/*
* Used to get rid of pages on hardware memory corruption.
*/
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
if (!mapping)
return -EINVAL;
/*
* Only punch for normal data pages for now.
* Handling other types like directories would need more auditing.
*/
if (!S_ISREG(mapping->host->i_mode))
return -EIO;
return truncate_inode_page(mapping, page);
}
EXPORT_SYMBOL(generic_error_remove_page);
/*
* Safely invalidate one page from its pagecache mapping.
* It only drops clean, unused pages. The page must be locked.
*
* Returns 1 if the page is successfully invalidated, otherwise 0.
*/
int invalidate_inode_page(struct page *page)
{
struct address_space *mapping = page_mapping(page);
if (!mapping)
return 0;
if (PageDirty(page) || PageWriteback(page))
return 0;
if (page_mapped(page))
return 0;
return invalidate_complete_page(mapping, page);
}
/**
* truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
* @lend: offset to which to truncate (inclusive)
*
* Truncate the page cache, removing the pages that are between
* specified offsets (and zeroing out partial pages
* if lstart or lend + 1 is not page aligned).
*
* Truncate takes two passes - the first pass is nonblocking. It will not
* block on page locks and it will not block on writeback. The second pass
* will wait. This is to prevent as much IO as possible in the affected region.
* The first pass will remove most pages, so the search cost of the second pass
* is low.
*
* We pass down the cache-hot hint to the page freeing code. Even if the
* mapping is large, it is probably the case that the final pages are the most
* recently touched, and freeing happens in ascending file offset order.
*
* Note that since ->invalidatepage() accepts range to invalidate
* truncate_inode_pages_range is able to handle cases where lend + 1 is not
* page aligned properly.
*/
void truncate_inode_pages_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
pgoff_t start; /* inclusive */
pgoff_t end; /* exclusive */
unsigned int partial_start; /* inclusive */
unsigned int partial_end; /* exclusive */
struct pagevec pvec;
pgoff_t indices[PAGEVEC_SIZE];
pgoff_t index;
int i;
if (mapping_empty(mapping))
goto out;
/* Offsets within partial pages */
partial_start = lstart & (PAGE_SIZE - 1);
partial_end = (lend + 1) & (PAGE_SIZE - 1);
/*
* 'start' and 'end' always covers the range of pages to be fully
* truncated. Partial pages are covered with 'partial_start' at the
* start of the range and 'partial_end' at the end of the range.
* Note that 'end' is exclusive while 'lend' is inclusive.
*/
start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (lend == -1)
/*
* lend == -1 indicates end-of-file so we have to set 'end'
* to the highest possible pgoff_t and since the type is
* unsigned we're using -1.
*/
end = -1;
else
end = (lend + 1) >> PAGE_SHIFT;
pagevec_init(&pvec);
index = start;
while (index < end && find_lock_entries(mapping, index, end - 1,
&pvec, indices)) {
index = indices[pagevec_count(&pvec) - 1] + 1;
truncate_exceptional_pvec_entries(mapping, &pvec, indices);
for (i = 0; i < pagevec_count(&pvec); i++) truncate_cleanup_page(pvec.pages[i]); delete_from_page_cache_batch(mapping, &pvec);
for (i = 0; i < pagevec_count(&pvec); i++)
unlock_page(pvec.pages[i]);
pagevec_release(&pvec);
cond_resched();
}
if (partial_start) { struct page *page = find_lock_page(mapping, start - 1);
if (page) {
unsigned int top = PAGE_SIZE;
if (start > end) {
/* Truncation within a single page */
top = partial_end;
partial_end = 0;
}
wait_on_page_writeback(page);
zero_user_segment(page, partial_start, top);
cleancache_invalidate_page(mapping, page);
if (page_has_private(page)) do_invalidatepage(page, partial_start,
top - partial_start);
unlock_page(page);
put_page(page);
}
}
if (partial_end) {
struct page *page = find_lock_page(mapping, end);
if (page) {
wait_on_page_writeback(page);
zero_user_segment(page, 0, partial_end);
cleancache_invalidate_page(mapping, page);
if (page_has_private(page))
do_invalidatepage(page, 0,
partial_end);
unlock_page(page);
put_page(page);
}
}
/*
* If the truncation happened within a single page no pages
* will be released, just zeroed, so we can bail out now.
*/
if (start >= end)
goto out;
index = start;
for ( ; ; ) {
cond_resched();
if (!find_get_entries(mapping, index, end - 1, &pvec,
indices)) {
/* If all gone from start onwards, we're done */
if (index == start)
break;
/* Otherwise restart to make sure all gone */
index = start;
continue;
}
for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
index = indices[i];
if (xa_is_value(page))
continue;
lock_page(page);
WARN_ON(page_to_index(page) != index); wait_on_page_writeback(page);
truncate_inode_page(mapping, page);
unlock_page(page);
}
truncate_exceptional_pvec_entries(mapping, &pvec, indices);
pagevec_release(&pvec);
index++;
}
out:
cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
/**
* truncate_inode_pages - truncate *all* the pages from an offset
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
*
* Called under (and serialised by) inode->i_rwsem and
* mapping->invalidate_lock.
*
* Note: When this function returns, there can be a page in the process of
* deletion (inside __delete_from_page_cache()) in the specified range. Thus
* mapping->nrpages can be non-zero when this function returns even after
* truncation of the whole mapping.
*/
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);
/**
* truncate_inode_pages_final - truncate *all* pages before inode dies
* @mapping: mapping to truncate
*
* Called under (and serialized by) inode->i_rwsem.
*
* Filesystems have to use this in the .evict_inode path to inform the
* VM that this is the final truncate and the inode is going away.
*/
void truncate_inode_pages_final(struct address_space *mapping)
{
/*
* Page reclaim can not participate in regular inode lifetime
* management (can't call iput()) and thus can race with the
* inode teardown. Tell it when the address space is exiting,
* so that it does not install eviction information after the
* final truncate has begun.
*/
mapping_set_exiting(mapping);
if (!mapping_empty(mapping)) {
/*
* As truncation uses a lockless tree lookup, cycle
* the tree lock to make sure any ongoing tree
* modification that does not see AS_EXITING is
* completed before starting the final truncate.
*/
xa_lock_irq(&mapping->i_pages);
xa_unlock_irq(&mapping->i_pages);
}
/*
* Cleancache needs notification even if there are no pages or shadow
* entries.
*/
truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);
static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
pgoff_t index = start;
unsigned long ret;
unsigned long count = 0;
int i;
pagevec_init(&pvec);
while (find_lock_entries(mapping, index, end, &pvec, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
index = indices[i];
if (xa_is_value(page)) {
count += invalidate_exceptional_entry(mapping,
index,
page);
continue;
}
index += thp_nr_pages(page) - 1;
ret = invalidate_inode_page(page);
unlock_page(page);
/*
* Invalidation is a hint that the page is no longer
* of interest and try to speed up its reclaim.
*/
if (!ret) {
deactivate_file_page(page);
/* It is likely on the pagevec of a remote CPU */
if (nr_pagevec)
(*nr_pagevec)++;
}
count += ret;
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
cond_resched();
index++;
}
return count;
}
/**
* invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode
* @mapping: the address_space which holds the cache to invalidate
* @start: the offset 'from' which to invalidate
* @end: the offset 'to' which to invalidate (inclusive)
*
* This function removes pages that are clean, unmapped and unlocked,
* as well as shadow entries. It will not block on IO activity.
*
* If you want to remove all the pages of one inode, regardless of
* their use and writeback state, use truncate_inode_pages().
*
* Return: the number of the cache entries that were invalidated
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
return __invalidate_mapping_pages(mapping, start, end, NULL);
}
EXPORT_SYMBOL(invalidate_mapping_pages);
/**
* invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode
* @mapping: the address_space which holds the pages to invalidate
* @start: the offset 'from' which to invalidate
* @end: the offset 'to' which to invalidate (inclusive)
* @nr_pagevec: invalidate failed page number for caller
*
* This helper is similar to invalidate_mapping_pages(), except that it accounts
* for pages that are likely on a pagevec and counts them in @nr_pagevec, which
* will be used by the caller.
*/
void invalidate_mapping_pagevec(struct address_space *mapping,
pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
__invalidate_mapping_pages(mapping, start, end, nr_pagevec);
}
/*
* This is like invalidate_complete_page(), except it ignores the page's
* refcount. We do this because invalidate_inode_pages2() needs stronger
* invalidation guarantees, and cannot afford to leave pages behind because
* shrink_page_list() has a temp ref on them, or because they're transiently
* sitting in the lru_cache_add() pagevecs.
*/
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
if (page->mapping != mapping)
return 0;
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
xa_lock_irq(&mapping->i_pages);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL);
xa_unlock_irq(&mapping->i_pages);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
put_page(page); /* pagecache ref */
return 1;
failed:
xa_unlock_irq(&mapping->i_pages);
return 0;
}
static int do_launder_page(struct address_space *mapping, struct page *page)
{
if (!PageDirty(page))
return 0;
if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
return 0;
return mapping->a_ops->launder_page(page);
}
/**
* invalidate_inode_pages2_range - remove range of pages from an address_space
* @mapping: the address_space
* @start: the page offset 'from' which to invalidate
* @end: the page offset 'to' which to invalidate (inclusive)
*
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
* Return: -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
pgoff_t index;
int i;
int ret = 0;
int ret2 = 0;
int did_range_unmap = 0;
if (mapping_empty(mapping))
goto out;
pagevec_init(&pvec);
index = start;
while (find_get_entries(mapping, index, end, &pvec, indices)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
index = indices[i];
if (xa_is_value(page)) {
if (!invalidate_exceptional_entry2(mapping,
index, page))
ret = -EBUSY;
continue;
}
if (!did_range_unmap && page_mapped(page)) {
/*
* If page is mapped, before taking its lock,
* zap the rest of the file in one hit.
*/
unmap_mapping_pages(mapping, index,
(1 + end - index), false);
did_range_unmap = 1;
}
lock_page(page);
WARN_ON(page_to_index(page) != index); if (page->mapping != mapping) {
unlock_page(page);
continue;
}
wait_on_page_writeback(page);
if (page_mapped(page))
unmap_mapping_page(page); BUG_ON(page_mapped(page));
ret2 = do_launder_page(mapping, page);
if (ret2 == 0) {
if (!invalidate_complete_page2(mapping, page))
ret2 = -EBUSY;
}
if (ret2 < 0)
ret = ret2;
unlock_page(page);
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
cond_resched();
index++;
}
/*
* For DAX we invalidate page tables after invalidating page cache. We
* could invalidate page tables while invalidating each entry however
* that would be expensive. And doing range unmapping before doesn't
* work as we have no cheap way to find whether page cache entry didn't
* get remapped later.
*/
if (dax_mapping(mapping)) {
unmap_mapping_pages(mapping, start, end - start + 1, false);
}
out:
cleancache_invalidate_inode(mapping);
return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
/**
* invalidate_inode_pages2 - remove all pages from an address_space
* @mapping: the address_space
*
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
* Return: -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2(struct address_space *mapping)
{
return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
/**
* truncate_pagecache - unmap and remove pagecache that has been truncated
* @inode: inode
* @newsize: new file size
*
* inode's new i_size must already be written before truncate_pagecache
* is called.
*
* This function should typically be called before the filesystem
* releases resources associated with the freed range (eg. deallocates
* blocks). This way, pagecache will always stay logically coherent
* with on-disk format, and the filesystem would not have to deal with
* situations such as writepage being called for a page that has already
* had its underlying blocks deallocated.
*/
void truncate_pagecache(struct inode *inode, loff_t newsize)
{
struct address_space *mapping = inode->i_mapping;
loff_t holebegin = round_up(newsize, PAGE_SIZE);
/*
* unmap_mapping_range is called twice, first simply for
* efficiency so that truncate_inode_pages does fewer
* single-page unmaps. However after this first call, and
* before truncate_inode_pages finishes, it is possible for
* private pages to be COWed, which remain after
* truncate_inode_pages finishes, hence the second
* unmap_mapping_range call must be made for correctness.
*/
unmap_mapping_range(mapping, holebegin, 0, 1);
truncate_inode_pages(mapping, newsize);
unmap_mapping_range(mapping, holebegin, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);
/**
* truncate_setsize - update inode and pagecache for a new file size
* @inode: inode
* @newsize: new file size
*
* truncate_setsize updates i_size and performs pagecache truncation (if
* necessary) to @newsize. It will be typically be called from the filesystem's
* setattr function when ATTR_SIZE is passed in.
*
* Must be called with a lock serializing truncates and writes (generally
* i_rwsem but e.g. xfs uses a different lock) and before all filesystem
* specific block truncation has been performed.
*/
void truncate_setsize(struct inode *inode, loff_t newsize)
{
loff_t oldsize = inode->i_size;
i_size_write(inode, newsize);
if (newsize > oldsize)
pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize);
}
EXPORT_SYMBOL(truncate_setsize);
/**
* pagecache_isize_extended - update pagecache after extension of i_size
* @inode: inode for which i_size was extended
* @from: original inode size
* @to: new inode size
*
* Handle extension of inode size either caused by extending truncate or by
* write starting after current i_size. We mark the page straddling current
* i_size RO so that page_mkwrite() is called on the nearest write access to
* the page. This way filesystem can be sure that page_mkwrite() is called on
* the page before user writes to the page via mmap after the i_size has been
* changed.
*
* The function must be called after i_size is updated so that page fault
* coming after we unlock the page will already see the new i_size.
* The function must be called while we still hold i_rwsem - this not only
* makes sure i_size is stable but also that userspace cannot observe new
* i_size value before we are prepared to store mmap writes at new inode size.
*/
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
{
int bsize = i_blocksize(inode);
loff_t rounded_from;
struct page *page;
pgoff_t index;
WARN_ON(to > inode->i_size); if (from >= to || bsize == PAGE_SIZE)
return;
/* Page straddling @from will not have any hole block created? */
rounded_from = round_up(from, bsize); if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
return;
index = from >> PAGE_SHIFT;
page = find_lock_page(inode->i_mapping, index);
/* Page not cached? Nothing to do */
if (!page)
return;
/*
* See clear_page_dirty_for_io() for details why set_page_dirty()
* is needed.
*/
if (page_mkclean(page)) set_page_dirty(page); unlock_page(page);
put_page(page);
}
EXPORT_SYMBOL(pagecache_isize_extended);
/**
* truncate_pagecache_range - unmap and remove pagecache that is hole-punched
* @inode: inode
* @lstart: offset of beginning of hole
* @lend: offset of last byte of hole
*
* This function should typically be called before the filesystem
* releases resources associated with the freed range (eg. deallocates
* blocks). This way, pagecache will always stay logically coherent
* with on-disk format, and the filesystem would not have to deal with
* situations such as writepage being called for a page that has already
* had its underlying blocks deallocated.
*/
void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
{
struct address_space *mapping = inode->i_mapping;
loff_t unmap_start = round_up(lstart, PAGE_SIZE);
loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
/*
* This rounding is currently just for example: unmap_mapping_range
* expands its hole outwards, whereas we want it to contract the hole
* inwards. However, existing callers of truncate_pagecache_range are
* doing their own page rounding first. Note that unmap_mapping_range
* allows holelen 0 for all, and we allow lend -1 for end of file.
*/
/*
* Unlike in truncate_pagecache, unmap_mapping_range is called only
* once (before truncating pagecache), and without "even_cows" flag:
* hole-punching should not remove private COWed pages from the hole.
*/
if ((u64)unmap_end > (u64)unmap_start)
unmap_mapping_range(mapping, unmap_start,
1 + unmap_end - unmap_start, 0);
truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL(truncate_pagecache_range);
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
/*
* This is an implementation of the notion of "decrement a
* reference count, and return locked if it decremented to zero".
*
* NOTE NOTE NOTE! This is _not_ equivalent to
*
* if (atomic_dec_and_test(&atomic)) {
* spin_lock(&lock);
* return 1;
* }
* return 0;
*
* because the spin-lock and the decrement must be
* "atomic".
*/
int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
{
/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
if (atomic_add_unless(atomic, -1, 1))
return 0;
/* Otherwise do it the slow way */
spin_lock(lock);
if (atomic_dec_and_test(atomic))
return 1;
spin_unlock(lock);
return 0;
}
EXPORT_SYMBOL(_atomic_dec_and_lock);
int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
unsigned long *flags)
{
/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
if (atomic_add_unless(atomic, -1, 1))
return 0;
/* Otherwise do it the slow way */
spin_lock_irqsave(lock, *flags);
if (atomic_dec_and_test(atomic))
return 1;
spin_unlock_irqrestore(lock, *flags);
return 0;
}
EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave);
// SPDX-License-Identifier: GPL-2.0
/*
* drivers/base/power/wakeup.c - System wakeup events framework
*
* Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
*/
#define pr_fmt(fmt) "PM: " fmt
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/capability.h>
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pm_wakeirq.h>
#include <trace/events/power.h>
#include "power.h"
#ifndef CONFIG_SUSPEND
suspend_state_t pm_suspend_target_state;
#define pm_suspend_target_state (PM_SUSPEND_ON)
#endif
#define list_for_each_entry_rcu_locked(pos, head, member) \
list_for_each_entry_rcu(pos, head, member, \
srcu_read_lock_held(&wakeup_srcu))
/*
* If set, the suspend/hibernate code will abort transitions to a sleep state
* if wakeup events are registered during or immediately before the transition.
*/
bool events_check_enabled __read_mostly;
/* First wakeup IRQ seen by the kernel in the last cycle. */
static unsigned int wakeup_irq[2] __read_mostly;
static DEFINE_RAW_SPINLOCK(wakeup_irq_lock);
/* If greater than 0 and the system is suspending, terminate the suspend. */
static atomic_t pm_abort_suspend __read_mostly;
/*
* Combined counters of registered wakeup events and wakeup events in progress.
* They need to be modified together atomically, so it's better to use one
* atomic variable to hold them both.
*/
static atomic_t combined_event_count = ATOMIC_INIT(0);
#define IN_PROGRESS_BITS (sizeof(int) * 4)
#define MAX_IN_PROGRESS ((1 << IN_PROGRESS_BITS) - 1)
static void split_counters(unsigned int *cnt, unsigned int *inpr)
{
unsigned int comb = atomic_read(&combined_event_count);
*cnt = (comb >> IN_PROGRESS_BITS);
*inpr = comb & MAX_IN_PROGRESS;
}
/* A preserved old value of the events counter. */
static unsigned int saved_count;
static DEFINE_RAW_SPINLOCK(events_lock);
static void pm_wakeup_timer_fn(struct timer_list *t);
static LIST_HEAD(wakeup_sources);
static DECLARE_WAIT_QUEUE_HEAD(wakeup_count_wait_queue);
DEFINE_STATIC_SRCU(wakeup_srcu);
static struct wakeup_source deleted_ws = {
.name = "deleted",
.lock = __SPIN_LOCK_UNLOCKED(deleted_ws.lock),
};
static DEFINE_IDA(wakeup_ida);
/**
* wakeup_source_create - Create a struct wakeup_source object.
* @name: Name of the new wakeup source.
*/
struct wakeup_source *wakeup_source_create(const char *name)
{
struct wakeup_source *ws;
const char *ws_name;
int id;
ws = kzalloc(sizeof(*ws), GFP_KERNEL);
if (!ws)
goto err_ws;
ws_name = kstrdup_const(name, GFP_KERNEL);
if (!ws_name)
goto err_name;
ws->name = ws_name;
id = ida_alloc(&wakeup_ida, GFP_KERNEL);
if (id < 0)
goto err_id;
ws->id = id;
return ws;
err_id:
kfree_const(ws->name);
err_name:
kfree(ws);
err_ws:
return NULL;
}
EXPORT_SYMBOL_GPL(wakeup_source_create);
/*
* Record wakeup_source statistics being deleted into a dummy wakeup_source.
*/
static void wakeup_source_record(struct wakeup_source *ws)
{
unsigned long flags;
spin_lock_irqsave(&deleted_ws.lock, flags);
if (ws->event_count) {
deleted_ws.total_time =
ktime_add(deleted_ws.total_time, ws->total_time);
deleted_ws.prevent_sleep_time =
ktime_add(deleted_ws.prevent_sleep_time,
ws->prevent_sleep_time);
deleted_ws.max_time =
ktime_compare(deleted_ws.max_time, ws->max_time) > 0 ?
deleted_ws.max_time : ws->max_time;
deleted_ws.event_count += ws->event_count;
deleted_ws.active_count += ws->active_count;
deleted_ws.relax_count += ws->relax_count;
deleted_ws.expire_count += ws->expire_count;
deleted_ws.wakeup_count += ws->wakeup_count;
}
spin_unlock_irqrestore(&deleted_ws.lock, flags);
}
static void wakeup_source_free(struct wakeup_source *ws)
{
ida_free(&wakeup_ida, ws->id);
kfree_const(ws->name);
kfree(ws);
}
/**
* wakeup_source_destroy - Destroy a struct wakeup_source object.
* @ws: Wakeup source to destroy.
*
* Use only for wakeup source objects created with wakeup_source_create().
*/
void wakeup_source_destroy(struct wakeup_source *ws)
{
if (!ws)
return;
__pm_relax(ws);
wakeup_source_record(ws);
wakeup_source_free(ws);
}
EXPORT_SYMBOL_GPL(wakeup_source_destroy);
/**
* wakeup_source_add - Add given object to the list of wakeup sources.
* @ws: Wakeup source object to add to the list.
*/
void wakeup_source_add(struct wakeup_source *ws)
{
unsigned long flags;
if (WARN_ON(!ws))
return;
spin_lock_init(&ws->lock);
timer_setup(&ws->timer, pm_wakeup_timer_fn, 0);
ws->active = false;
raw_spin_lock_irqsave(&events_lock, flags);
list_add_rcu(&ws->entry, &wakeup_sources);
raw_spin_unlock_irqrestore(&events_lock, flags);
}
EXPORT_SYMBOL_GPL(wakeup_source_add);
/**
* wakeup_source_remove - Remove given object from the wakeup sources list.
* @ws: Wakeup source object to remove from the list.
*/
void wakeup_source_remove(struct wakeup_source *ws)
{
unsigned long flags;
if (WARN_ON(!ws))
return;
raw_spin_lock_irqsave(&events_lock, flags);
list_del_rcu(&ws->entry);
raw_spin_unlock_irqrestore(&events_lock, flags);
synchronize_srcu(&wakeup_srcu);
del_timer_sync(&ws->timer);
/*
* Clear timer.function to make wakeup_source_not_registered() treat
* this wakeup source as not registered.
*/
ws->timer.function = NULL;
}
EXPORT_SYMBOL_GPL(wakeup_source_remove);
/**
* wakeup_source_register - Create wakeup source and add it to the list.
* @dev: Device this wakeup source is associated with (or NULL if virtual).
* @name: Name of the wakeup source to register.
*/
struct wakeup_source *wakeup_source_register(struct device *dev,
const char *name)
{
struct wakeup_source *ws;
int ret;
ws = wakeup_source_create(name);
if (ws) {
if (!dev || device_is_registered(dev)) {
ret = wakeup_source_sysfs_add(dev, ws);
if (ret) {
wakeup_source_free(ws);
return NULL;
}
}
wakeup_source_add(ws);
}
return ws;
}
EXPORT_SYMBOL_GPL(wakeup_source_register);
/**
* wakeup_source_unregister - Remove wakeup source from the list and remove it.
* @ws: Wakeup source object to unregister.
*/
void wakeup_source_unregister(struct wakeup_source *ws)
{
if (ws) {
wakeup_source_remove(ws);
if (ws->dev)
wakeup_source_sysfs_remove(ws);
wakeup_source_destroy(ws);
}
}
EXPORT_SYMBOL_GPL(wakeup_source_unregister);
/**
* wakeup_sources_read_lock - Lock wakeup source list for read.
*
* Returns an index of srcu lock for struct wakeup_srcu.
* This index must be passed to the matching wakeup_sources_read_unlock().
*/
int wakeup_sources_read_lock(void)
{
return srcu_read_lock(&wakeup_srcu);
}
EXPORT_SYMBOL_GPL(wakeup_sources_read_lock);
/**
* wakeup_sources_read_unlock - Unlock wakeup source list.
* @idx: return value from corresponding wakeup_sources_read_lock()
*/
void wakeup_sources_read_unlock(int idx)
{
srcu_read_unlock(&wakeup_srcu, idx);
}
EXPORT_SYMBOL_GPL(wakeup_sources_read_unlock);
/**
* wakeup_sources_walk_start - Begin a walk on wakeup source list
*
* Returns first object of the list of wakeup sources.
*
* Note that to be safe, wakeup sources list needs to be locked by calling
* wakeup_source_read_lock() for this.
*/
struct wakeup_source *wakeup_sources_walk_start(void)
{
struct list_head *ws_head = &wakeup_sources;
return list_entry_rcu(ws_head->next, struct wakeup_source, entry);
}
EXPORT_SYMBOL_GPL(wakeup_sources_walk_start);
/**
* wakeup_sources_walk_next - Get next wakeup source from the list
* @ws: Previous wakeup source object
*
* Note that to be safe, wakeup sources list needs to be locked by calling
* wakeup_source_read_lock() for this.
*/
struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws)
{
struct list_head *ws_head = &wakeup_sources;
return list_next_or_null_rcu(ws_head, &ws->entry,
struct wakeup_source, entry);
}
EXPORT_SYMBOL_GPL(wakeup_sources_walk_next);
/**
* device_wakeup_attach - Attach a wakeup source object to a device object.
* @dev: Device to handle.
* @ws: Wakeup source object to attach to @dev.
*
* This causes @dev to be treated as a wakeup device.
*/
static int device_wakeup_attach(struct device *dev, struct wakeup_source *ws)
{
spin_lock_irq(&dev->power.lock);
if (dev->power.wakeup) {
spin_unlock_irq(&dev->power.lock);
return -EEXIST;
}
dev->power.wakeup = ws;
if (dev->power.wakeirq)
device_wakeup_attach_irq(dev, dev->power.wakeirq);
spin_unlock_irq(&dev->power.lock);
return 0;
}
/**
* device_wakeup_enable - Enable given device to be a wakeup source.
* @dev: Device to handle.
*
* Create a wakeup source object, register it and attach it to @dev.
*/
int device_wakeup_enable(struct device *dev)
{
struct wakeup_source *ws;
int ret;
if (!dev || !dev->power.can_wakeup)
return -EINVAL;
if (pm_suspend_target_state != PM_SUSPEND_ON)
dev_dbg(dev, "Suspicious %s() during system transition!\n", __func__);
ws = wakeup_source_register(dev, dev_name(dev));
if (!ws)
return -ENOMEM;
ret = device_wakeup_attach(dev, ws);
if (ret)
wakeup_source_unregister(ws);
return ret;
}
EXPORT_SYMBOL_GPL(device_wakeup_enable);
/**
* device_wakeup_attach_irq - Attach a wakeirq to a wakeup source
* @dev: Device to handle
* @wakeirq: Device specific wakeirq entry
*
* Attach a device wakeirq to the wakeup source so the device
* wake IRQ can be configured automatically for suspend and
* resume.
*
* Call under the device's power.lock lock.
*/
void device_wakeup_attach_irq(struct device *dev,
struct wake_irq *wakeirq)
{
struct wakeup_source *ws;
ws = dev->power.wakeup;
if (!ws)
return;
if (ws->wakeirq)
dev_err(dev, "Leftover wakeup IRQ found, overriding\n");
ws->wakeirq = wakeirq;
}
/**
* device_wakeup_detach_irq - Detach a wakeirq from a wakeup source
* @dev: Device to handle
*
* Removes a device wakeirq from the wakeup source.
*
* Call under the device's power.lock lock.
*/
void device_wakeup_detach_irq(struct device *dev)
{
struct wakeup_source *ws;
ws = dev->power.wakeup;
if (ws)
ws->wakeirq = NULL;
}
/**
* device_wakeup_arm_wake_irqs -
*
* Iterates over the list of device wakeirqs to arm them.
*/
void device_wakeup_arm_wake_irqs(void)
{
struct wakeup_source *ws;
int srcuidx;
srcuidx = srcu_read_lock(&wakeup_srcu);
list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry)
dev_pm_arm_wake_irq(ws->wakeirq);
srcu_read_unlock(&wakeup_srcu, srcuidx);
}
/**
* device_wakeup_disarm_wake_irqs -
*
* Iterates over the list of device wakeirqs to disarm them.
*/
void device_wakeup_disarm_wake_irqs(void)
{
struct wakeup_source *ws;
int srcuidx;
srcuidx = srcu_read_lock(&wakeup_srcu);
list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry)
dev_pm_disarm_wake_irq(ws->wakeirq);
srcu_read_unlock(&wakeup_srcu, srcuidx);
}
/**
* device_wakeup_detach - Detach a device's wakeup source object from it.
* @dev: Device to detach the wakeup source object from.
*
* After it returns, @dev will not be treated as a wakeup device any more.
*/
static struct wakeup_source *device_wakeup_detach(struct device *dev)
{
struct wakeup_source *ws;
spin_lock_irq(&dev->power.lock);
ws = dev->power.wakeup;
dev->power.wakeup = NULL;
spin_unlock_irq(&dev->power.lock);
return ws;
}
/**
* device_wakeup_disable - Do not regard a device as a wakeup source any more.
* @dev: Device to handle.
*
* Detach the @dev's wakeup source object from it, unregister this wakeup source
* object and destroy it.
*/
int device_wakeup_disable(struct device *dev)
{
struct wakeup_source *ws;
if (!dev || !dev->power.can_wakeup)
return -EINVAL;
ws = device_wakeup_detach(dev);
wakeup_source_unregister(ws);
return 0;
}
EXPORT_SYMBOL_GPL(device_wakeup_disable);
/**
* device_set_wakeup_capable - Set/reset device wakeup capability flag.
* @dev: Device to handle.
* @capable: Whether or not @dev is capable of waking up the system from sleep.
*
* If @capable is set, set the @dev's power.can_wakeup flag and add its
* wakeup-related attributes to sysfs. Otherwise, unset the @dev's
* power.can_wakeup flag and remove its wakeup-related attributes from sysfs.
*
* This function may sleep and it can't be called from any context where
* sleeping is not allowed.
*/
void device_set_wakeup_capable(struct device *dev, bool capable)
{
if (!!dev->power.can_wakeup == !!capable)
return;
dev->power.can_wakeup = capable;
if (device_is_registered(dev) && !list_empty(&dev->power.entry)) {
if (capable) {
int ret = wakeup_sysfs_add(dev);
if (ret)
dev_info(dev, "Wakeup sysfs attributes not added\n");
} else {
wakeup_sysfs_remove(dev);
}
}
}
EXPORT_SYMBOL_GPL(device_set_wakeup_capable);
/**
* device_init_wakeup - Device wakeup initialization.
* @dev: Device to handle.
* @enable: Whether or not to enable @dev as a wakeup device.
*
* By default, most devices should leave wakeup disabled. The exceptions are
* devices that everyone expects to be wakeup sources: keyboards, power buttons,
* possibly network interfaces, etc. Also, devices that don't generate their
* own wakeup requests but merely forward requests from one bus to another
* (like PCI bridges) should have wakeup enabled by default.
*/
int device_init_wakeup(struct device *dev, bool enable)
{
int ret = 0;
if (!dev)
return -EINVAL;
if (enable) {
device_set_wakeup_capable(dev, true);
ret = device_wakeup_enable(dev);
} else {
device_wakeup_disable(dev);
device_set_wakeup_capable(dev, false);
}
return ret;
}
EXPORT_SYMBOL_GPL(device_init_wakeup);
/**
* device_set_wakeup_enable - Enable or disable a device to wake up the system.
* @dev: Device to handle.
* @enable: enable/disable flag
*/
int device_set_wakeup_enable(struct device *dev, bool enable)
{
return enable ? device_wakeup_enable(dev) : device_wakeup_disable(dev);
}
EXPORT_SYMBOL_GPL(device_set_wakeup_enable);
/**
* wakeup_source_not_registered - validate the given wakeup source.
* @ws: Wakeup source to be validated.
*/
static bool wakeup_source_not_registered(struct wakeup_source *ws)
{
/*
* Use timer struct to check if the given source is initialized
* by wakeup_source_add.
*/
return ws->timer.function != pm_wakeup_timer_fn;
}
/*
* The functions below use the observation that each wakeup event starts a
* period in which the system should not be suspended. The moment this period
* will end depends on how the wakeup event is going to be processed after being
* detected and all of the possible cases can be divided into two distinct
* groups.
*
* First, a wakeup event may be detected by the same functional unit that will
* carry out the entire processing of it and possibly will pass it to user space
* for further processing. In that case the functional unit that has detected
* the event may later "close" the "no suspend" period associated with it
* directly as soon as it has been dealt with. The pair of pm_stay_awake() and
* pm_relax(), balanced with each other, is supposed to be used in such
* situations.
*
* Second, a wakeup event may be detected by one functional unit and processed
* by another one. In that case the unit that has detected it cannot really
* "close" the "no suspend" period associated with it, unless it knows in
* advance what's going to happen to the event during processing. This
* knowledge, however, may not be available to it, so it can simply specify time
* to wait before the system can be suspended and pass it as the second
* argument of pm_wakeup_event().
*
* It is valid to call pm_relax() after pm_wakeup_event(), in which case the
* "no suspend" period will be ended either by the pm_relax(), or by the timer
* function executed when the timer expires, whichever comes first.
*/
/**
* wakeup_source_activate - Mark given wakeup source as active.
* @ws: Wakeup source to handle.
*
* Update the @ws' statistics and, if @ws has just been activated, notify the PM
* core of the event by incrementing the counter of of wakeup events being
* processed.
*/
static void wakeup_source_activate(struct wakeup_source *ws)
{
unsigned int cec;
if (WARN_ONCE(wakeup_source_not_registered(ws),
"unregistered wakeup source\n"))
return;
ws->active = true;
ws->active_count++;
ws->last_time = ktime_get();
if (ws->autosleep_enabled)
ws->start_prevent_time = ws->last_time;
/* Increment the counter of events in progress. */
cec = atomic_inc_return(&combined_event_count);
trace_wakeup_source_activate(ws->name, cec);
}
/**
* wakeup_source_report_event - Report wakeup event using the given source.
* @ws: Wakeup source to report the event for.
* @hard: If set, abort suspends in progress and wake up from suspend-to-idle.
*/
static void wakeup_source_report_event(struct wakeup_source *ws, bool hard)
{
ws->event_count++;
/* This is racy, but the counter is approximate anyway. */
if (events_check_enabled)
ws->wakeup_count++;
if (!ws->active)
wakeup_source_activate(ws);
if (hard)
pm_system_wakeup();
}
/**
* __pm_stay_awake - Notify the PM core of a wakeup event.
* @ws: Wakeup source object associated with the source of the event.
*
* It is safe to call this function from interrupt context.
*/
void __pm_stay_awake(struct wakeup_source *ws)
{
unsigned long flags;
if (!ws)
return;
spin_lock_irqsave(&ws->lock, flags);
wakeup_source_report_event(ws, false);
del_timer(&ws->timer);
ws->timer_expires = 0;
spin_unlock_irqrestore(&ws->lock, flags);
}
EXPORT_SYMBOL_GPL(__pm_stay_awake);
/**
* pm_stay_awake - Notify the PM core that a wakeup event is being processed.
* @dev: Device the wakeup event is related to.
*
* Notify the PM core of a wakeup event (signaled by @dev) by calling
* __pm_stay_awake for the @dev's wakeup source object.
*
* Call this function after detecting of a wakeup event if pm_relax() is going
* to be called directly after processing the event (and possibly passing it to
* user space for further processing).
*/
void pm_stay_awake(struct device *dev)
{
unsigned long flags;
if (!dev)
return;
spin_lock_irqsave(&dev->power.lock, flags);
__pm_stay_awake(dev->power.wakeup);
spin_unlock_irqrestore(&dev->power.lock, flags);
}
EXPORT_SYMBOL_GPL(pm_stay_awake);
#ifdef CONFIG_PM_AUTOSLEEP
static void update_prevent_sleep_time(struct wakeup_source *ws, ktime_t now)
{
ktime_t delta = ktime_sub(now, ws->start_prevent_time);
ws->prevent_sleep_time = ktime_add(ws->prevent_sleep_time, delta);
}
#else
static inline void update_prevent_sleep_time(struct wakeup_source *ws,
ktime_t now) {}
#endif
/**
* wakeup_source_deactivate - Mark given wakeup source as inactive.
* @ws: Wakeup source to handle.
*
* Update the @ws' statistics and notify the PM core that the wakeup source has
* become inactive by decrementing the counter of wakeup events being processed
* and incrementing the counter of registered wakeup events.
*/
static void wakeup_source_deactivate(struct wakeup_source *ws)
{
unsigned int cnt, inpr, cec;
ktime_t duration;
ktime_t now;
ws->relax_count++;
/*
* __pm_relax() may be called directly or from a timer function.
* If it is called directly right after the timer function has been
* started, but before the timer function calls __pm_relax(), it is
* possible that __pm_stay_awake() will be called in the meantime and
* will set ws->active. Then, ws->active may be cleared immediately
* by the __pm_relax() called from the timer function, but in such a
* case ws->relax_count will be different from ws->active_count.
*/
if (ws->relax_count != ws->active_count) {
ws->relax_count--;
return;
}
ws->active = false;
now = ktime_get();
duration = ktime_sub(now, ws->last_time);
ws->total_time = ktime_add(ws->total_time, duration);
if (ktime_to_ns(duration) > ktime_to_ns(ws->max_time))
ws->max_time = duration;
ws->last_time = now;
del_timer(&ws->timer);
ws->timer_expires = 0;
if (ws->autosleep_enabled)
update_prevent_sleep_time(ws, now);
/*
* Increment the counter of registered wakeup events and decrement the
* couter of wakeup events in progress simultaneously.
*/
cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count);
trace_wakeup_source_deactivate(ws->name, cec);
split_counters(&cnt, &inpr);
if (!inpr && waitqueue_active(&wakeup_count_wait_queue))
wake_up(&wakeup_count_wait_queue);
}
/**
* __pm_relax - Notify the PM core that processing of a wakeup event has ended.
* @ws: Wakeup source object associated with the source of the event.
*
* Call this function for wakeup events whose processing started with calling
* __pm_stay_awake().
*
* It is safe to call it from interrupt context.
*/
void __pm_relax(struct wakeup_source *ws)
{
unsigned long flags;
if (!ws)
return;
spin_lock_irqsave(&ws->lock, flags);
if (ws->active)
wakeup_source_deactivate(ws);
spin_unlock_irqrestore(&ws->lock, flags);
}
EXPORT_SYMBOL_GPL(__pm_relax);
/**
* pm_relax - Notify the PM core that processing of a wakeup event has ended.
* @dev: Device that signaled the event.
*
* Execute __pm_relax() for the @dev's wakeup source object.
*/
void pm_relax(struct device *dev)
{
unsigned long flags;
if (!dev)
return;
spin_lock_irqsave(&dev->power.lock, flags);
__pm_relax(dev->power.wakeup);
spin_unlock_irqrestore(&dev->power.lock, flags);
}
EXPORT_SYMBOL_GPL(pm_relax);
/**
* pm_wakeup_timer_fn - Delayed finalization of a wakeup event.
* @t: timer list
*
* Call wakeup_source_deactivate() for the wakeup source whose address is stored
* in @data if it is currently active and its timer has not been canceled and
* the expiration time of the timer is not in future.
*/
static void pm_wakeup_timer_fn(struct timer_list *t)
{
struct wakeup_source *ws = from_timer(ws, t, timer);
unsigned long flags;
spin_lock_irqsave(&ws->lock, flags);
if (ws->active && ws->timer_expires
&& time_after_eq(jiffies, ws->timer_expires)) {
wakeup_source_deactivate(ws);
ws->expire_count++;
}
spin_unlock_irqrestore(&ws->lock, flags);
}
/**
* pm_wakeup_ws_event - Notify the PM core of a wakeup event.
* @ws: Wakeup source object associated with the event source.
* @msec: Anticipated event processing time (in milliseconds).
* @hard: If set, abort suspends in progress and wake up from suspend-to-idle.
*
* Notify the PM core of a wakeup event whose source is @ws that will take
* approximately @msec milliseconds to be processed by the kernel. If @ws is
* not active, activate it. If @msec is nonzero, set up the @ws' timer to
* execute pm_wakeup_timer_fn() in future.
*
* It is safe to call this function from interrupt context.
*/
void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard)
{
unsigned long flags;
unsigned long expires;
if (!ws)
return;
spin_lock_irqsave(&ws->lock, flags);
wakeup_source_report_event(ws, hard);
if (!msec) {
wakeup_source_deactivate(ws);
goto unlock;
}
expires = jiffies + msecs_to_jiffies(msec);
if (!expires)
expires = 1;
if (!ws->timer_expires || time_after(expires, ws->timer_expires)) {
mod_timer(&ws->timer, expires);
ws->timer_expires = expires;
}
unlock:
spin_unlock_irqrestore(&ws->lock, flags);
}
EXPORT_SYMBOL_GPL(pm_wakeup_ws_event);
/**
* pm_wakeup_dev_event - Notify the PM core of a wakeup event.
* @dev: Device the wakeup event is related to.
* @msec: Anticipated event processing time (in milliseconds).
* @hard: If set, abort suspends in progress and wake up from suspend-to-idle.
*
* Call pm_wakeup_ws_event() for the @dev's wakeup source object.
*/
void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard)
{
unsigned long flags;
if (!dev)
return;
spin_lock_irqsave(&dev->power.lock, flags);
pm_wakeup_ws_event(dev->power.wakeup, msec, hard);
spin_unlock_irqrestore(&dev->power.lock, flags);
}
EXPORT_SYMBOL_GPL(pm_wakeup_dev_event);
void pm_print_active_wakeup_sources(void)
{
struct wakeup_source *ws;
int srcuidx, active = 0;
struct wakeup_source *last_activity_ws = NULL;
srcuidx = srcu_read_lock(&wakeup_srcu);
list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) {
if (ws->active) {
pm_pr_dbg("active wakeup source: %s\n", ws->name);
active = 1;
} else if (!active &&
(!last_activity_ws ||
ktime_to_ns(ws->last_time) >
ktime_to_ns(last_activity_ws->last_time))) {
last_activity_ws = ws;
}
}
if (!active && last_activity_ws)
pm_pr_dbg("last active wakeup source: %s\n",
last_activity_ws->name);
srcu_read_unlock(&wakeup_srcu, srcuidx);
}
EXPORT_SYMBOL_GPL(pm_print_active_wakeup_sources);
/**
* pm_wakeup_pending - Check if power transition in progress should be aborted.
*
* Compare the current number of registered wakeup events with its preserved
* value from the past and return true if new wakeup events have been registered
* since the old value was stored. Also return true if the current number of
* wakeup events being processed is different from zero.
*/
bool pm_wakeup_pending(void)
{
unsigned long flags;
bool ret = false;
raw_spin_lock_irqsave(&events_lock, flags);
if (events_check_enabled) {
unsigned int cnt, inpr;
split_counters(&cnt, &inpr);
ret = (cnt != saved_count || inpr > 0);
events_check_enabled = !ret;
}
raw_spin_unlock_irqrestore(&events_lock, flags);
if (ret) {
pm_pr_dbg("Wakeup pending, aborting suspend\n");
pm_print_active_wakeup_sources();
}
return ret || atomic_read(&pm_abort_suspend) > 0;
}
void pm_system_wakeup(void)
{
atomic_inc(&pm_abort_suspend);
s2idle_wake();
}
EXPORT_SYMBOL_GPL(pm_system_wakeup);
void pm_system_cancel_wakeup(void)
{
atomic_dec_if_positive(&pm_abort_suspend);
}
void pm_wakeup_clear(unsigned int irq_number)
{
raw_spin_lock_irq(&wakeup_irq_lock);
if (irq_number && wakeup_irq[0] == irq_number)
wakeup_irq[0] = wakeup_irq[1];
else
wakeup_irq[0] = 0;
wakeup_irq[1] = 0;
raw_spin_unlock_irq(&wakeup_irq_lock);
if (!irq_number)
atomic_set(&pm_abort_suspend, 0);
}
void pm_system_irq_wakeup(unsigned int irq_number)
{
unsigned long flags;
raw_spin_lock_irqsave(&wakeup_irq_lock, flags);
if (wakeup_irq[0] == 0)
wakeup_irq[0] = irq_number;
else if (wakeup_irq[1] == 0)
wakeup_irq[1] = irq_number;
else
irq_number = 0;
raw_spin_unlock_irqrestore(&wakeup_irq_lock, flags);
if (irq_number)
pm_system_wakeup();
}
unsigned int pm_wakeup_irq(void)
{
return wakeup_irq[0];
}
/**
* pm_get_wakeup_count - Read the number of registered wakeup events.
* @count: Address to store the value at.
* @block: Whether or not to block.
*
* Store the number of registered wakeup events at the address in @count. If
* @block is set, block until the current number of wakeup events being
* processed is zero.
*
* Return 'false' if the current number of wakeup events being processed is
* nonzero. Otherwise return 'true'.
*/
bool pm_get_wakeup_count(unsigned int *count, bool block)
{
unsigned int cnt, inpr;
if (block) {
DEFINE_WAIT(wait);
for (;;) {
prepare_to_wait(&wakeup_count_wait_queue, &wait,
TASK_INTERRUPTIBLE);
split_counters(&cnt, &inpr);
if (inpr == 0 || signal_pending(current))
break;
pm_print_active_wakeup_sources();
schedule();
}
finish_wait(&wakeup_count_wait_queue, &wait);
}
split_counters(&cnt, &inpr);
*count = cnt;
return !inpr;
}
/**
* pm_save_wakeup_count - Save the current number of registered wakeup events.
* @count: Value to compare with the current number of registered wakeup events.
*
* If @count is equal to the current number of registered wakeup events and the
* current number of wakeup events being processed is zero, store @count as the
* old number of registered wakeup events for pm_check_wakeup_events(), enable
* wakeup events detection and return 'true'. Otherwise disable wakeup events
* detection and return 'false'.
*/
bool pm_save_wakeup_count(unsigned int count)
{
unsigned int cnt, inpr;
unsigned long flags;
events_check_enabled = false;
raw_spin_lock_irqsave(&events_lock, flags);
split_counters(&cnt, &inpr);
if (cnt == count && inpr == 0) {
saved_count = count;
events_check_enabled = true;
}
raw_spin_unlock_irqrestore(&events_lock, flags);
return events_check_enabled;
}
#ifdef CONFIG_PM_AUTOSLEEP
/**
* pm_wakep_autosleep_enabled - Modify autosleep_enabled for all wakeup sources.
* @set: Whether to set or to clear the autosleep_enabled flags.
*/
void pm_wakep_autosleep_enabled(bool set)
{
struct wakeup_source *ws;
ktime_t now = ktime_get();
int srcuidx;
srcuidx = srcu_read_lock(&wakeup_srcu);
list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) {
spin_lock_irq(&ws->lock);
if (ws->autosleep_enabled != set) {
ws->autosleep_enabled = set;
if (ws->active) {
if (set)
ws->start_prevent_time = now;
else
update_prevent_sleep_time(ws, now);
}
}
spin_unlock_irq(&ws->lock);
}
srcu_read_unlock(&wakeup_srcu, srcuidx);
}
#endif /* CONFIG_PM_AUTOSLEEP */
/**
* print_wakeup_source_stats - Print wakeup source statistics information.
* @m: seq_file to print the statistics into.
* @ws: Wakeup source object to print the statistics for.
*/
static int print_wakeup_source_stats(struct seq_file *m,
struct wakeup_source *ws)
{
unsigned long flags;
ktime_t total_time;
ktime_t max_time;
unsigned long active_count;
ktime_t active_time;
ktime_t prevent_sleep_time;
spin_lock_irqsave(&ws->lock, flags);
total_time = ws->total_time;
max_time = ws->max_time;
prevent_sleep_time = ws->prevent_sleep_time;
active_count = ws->active_count;
if (ws->active) {
ktime_t now = ktime_get();
active_time = ktime_sub(now, ws->last_time);
total_time = ktime_add(total_time, active_time);
if (active_time > max_time)
max_time = active_time;
if (ws->autosleep_enabled)
prevent_sleep_time = ktime_add(prevent_sleep_time,
ktime_sub(now, ws->start_prevent_time));
} else {
active_time = 0;
}
seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
ws->name, active_count, ws->event_count,
ws->wakeup_count, ws->expire_count,
ktime_to_ms(active_time), ktime_to_ms(total_time),
ktime_to_ms(max_time), ktime_to_ms(ws->last_time),
ktime_to_ms(prevent_sleep_time));
spin_unlock_irqrestore(&ws->lock, flags);
return 0;
}
static void *wakeup_sources_stats_seq_start(struct seq_file *m,
loff_t *pos)
{
struct wakeup_source *ws;
loff_t n = *pos;
int *srcuidx = m->private;
if (n == 0) {
seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
"expire_count\tactive_since\ttotal_time\tmax_time\t"
"last_change\tprevent_suspend_time\n");
}
*srcuidx = srcu_read_lock(&wakeup_srcu);
list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) {
if (n-- <= 0)
return ws;
}
return NULL;
}
static void *wakeup_sources_stats_seq_next(struct seq_file *m,
void *v, loff_t *pos)
{
struct wakeup_source *ws = v;
struct wakeup_source *next_ws = NULL;
++(*pos);
list_for_each_entry_continue_rcu(ws, &wakeup_sources, entry) {
next_ws = ws;
break;
}
if (!next_ws)
print_wakeup_source_stats(m, &deleted_ws);
return next_ws;
}
static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v)
{
int *srcuidx = m->private;
srcu_read_unlock(&wakeup_srcu, *srcuidx);
}
/**
* wakeup_sources_stats_seq_show - Print wakeup sources statistics information.
* @m: seq_file to print the statistics into.
* @v: wakeup_source of each iteration
*/
static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v)
{
struct wakeup_source *ws = v;
print_wakeup_source_stats(m, ws);
return 0;
}
static const struct seq_operations wakeup_sources_stats_seq_ops = {
.start = wakeup_sources_stats_seq_start,
.next = wakeup_sources_stats_seq_next,
.stop = wakeup_sources_stats_seq_stop,
.show = wakeup_sources_stats_seq_show,
};
static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
{
return seq_open_private(file, &wakeup_sources_stats_seq_ops, sizeof(int));
}
static const struct file_operations wakeup_sources_stats_fops = {
.owner = THIS_MODULE,
.open = wakeup_sources_stats_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
};
static int __init wakeup_sources_debugfs_init(void)
{
debugfs_create_file("wakeup_sources", 0444, NULL, NULL,
&wakeup_sources_stats_fops);
return 0;
}
postcore_initcall(wakeup_sources_debugfs_init);
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
.dev_stripes = 1,
.devs_max = 0, /* 0 == as many as possible */
.devs_min = 2,
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
.nparity = 0,
.raid_name = "raid10",
.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
},
[BTRFS_RAID_RAID1] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 2,
.devs_min = 2,
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
.nparity = 0,
.raid_name = "raid1",
.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
},
[BTRFS_RAID_RAID1C3] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 3,
.devs_min = 3,
.tolerated_failures = 2,
.devs_increment = 3,
.ncopies = 3,
.nparity = 0,
.raid_name = "raid1c3",
.bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
.mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
},
[BTRFS_RAID_RAID1C4] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 4,
.devs_min = 4,
.tolerated_failures = 3,
.devs_increment = 4,
.ncopies = 4,
.nparity = 0,
.raid_name = "raid1c4",
.bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
.mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
},
[BTRFS_RAID_DUP] = {
.sub_stripes = 1,
.dev_stripes = 2,
.devs_max = 1,
.devs_min = 1,
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 2,
.nparity = 0,
.raid_name = "dup",
.bg_flag = BTRFS_BLOCK_GROUP_DUP,
.mindev_error = 0,
},
[BTRFS_RAID_RAID0] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
.devs_min = 1,
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
.nparity = 0,
.raid_name = "raid0",
.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
.mindev_error = 0,
},
[BTRFS_RAID_SINGLE] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 1,
.devs_min = 1,
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
.nparity = 0,
.raid_name = "single",
.bg_flag = 0,
.mindev_error = 0,
},
[BTRFS_RAID_RAID5] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
.devs_min = 2,
.tolerated_failures = 1,
.devs_increment = 1,
.ncopies = 1,
.nparity = 1,
.raid_name = "raid5",
.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
},
[BTRFS_RAID_RAID6] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
.devs_min = 3,
.tolerated_failures = 2,
.devs_increment = 1,
.ncopies = 1,
.nparity = 2,
.raid_name = "raid6",
.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
},
};
/*
* Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
* can be used as index to access btrfs_raid_array[].
*/
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
if (flags & BTRFS_BLOCK_GROUP_RAID10)
return BTRFS_RAID_RAID10;
else if (flags & BTRFS_BLOCK_GROUP_RAID1)
return BTRFS_RAID_RAID1;
else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
return BTRFS_RAID_RAID1C3;
else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
return BTRFS_RAID_RAID1C4;
else if (flags & BTRFS_BLOCK_GROUP_DUP)
return BTRFS_RAID_DUP;
else if (flags & BTRFS_BLOCK_GROUP_RAID0)
return BTRFS_RAID_RAID0;
else if (flags & BTRFS_BLOCK_GROUP_RAID5)
return BTRFS_RAID_RAID5;
else if (flags & BTRFS_BLOCK_GROUP_RAID6) return BTRFS_RAID_RAID6;
return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
const int index = btrfs_bg_flags_to_raid_index(flags);
if (index >= BTRFS_NR_RAID_TYPES)
return NULL;
return btrfs_raid_array[index].raid_name;}
/*
* Fill @buf with textual description of @bg_flags, no more than @size_buf
* bytes including terminating null byte.
*/
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
int i;
int ret;
char *bp = buf;
u64 flags = bg_flags;
u32 size_bp = size_buf;
if (!flags) {
strcpy(bp, "NONE");
return;
}
#define DESCRIBE_FLAG(flag, desc) \
do { \
if (flags & (flag)) { \
ret = snprintf(bp, size_bp, "%s|", (desc)); \
if (ret < 0 || ret >= size_bp) \
goto out_overflow; \
size_bp -= ret; \
bp += ret; \
flags &= ~(flag); \
} \
} while (0)
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG
if (flags) {
ret = snprintf(bp, size_bp, "0x%llx|", flags);
size_bp -= ret;
}
if (size_bp < size_buf)
buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
/*
* The text is trimmed, it's up to the caller to provide sufficiently
* large buffer
*/
out_overflow:;
}
static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
int mirror_num, int need_raid_map);
/*
* Device locking
* ==============
*
* There are several mutexes that protect manipulation of devices and low-level
* structures like chunks but not block groups, extents or files
*
* uuid_mutex (global lock)
* ------------------------
* protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
* the SCAN_DEV ioctl registration or from mount either implicitly (the first
* device) or requested by the device= mount option
*
* the mutex can be very coarse and can cover long-running operations
*
* protects: updates to fs_devices counters like missing devices, rw devices,
* seeding, structure cloning, opening/closing devices at mount/umount time
*
* global::fs_devs - add, remove, updates to the global list
*
* does not protect: manipulation of the fs_devices::devices list in general
* but in mount context it could be used to exclude list modifications by eg.
* scan ioctl
*
* btrfs_device::name - renames (write side), read is RCU
*
* fs_devices::device_list_mutex (per-fs, with RCU)
* ------------------------------------------------
* protects updates to fs_devices::devices, ie. adding and deleting
*
* simple list traversal with read-only actions can be done with RCU protection
*
* may be used to exclude some operations from running concurrently without any
* modifications to the list (see write_all_supers)
*
* Is not required at mount and close times, because our device list is
* protected by the uuid_mutex at that point.
*
* balance_mutex
* -------------
* protects balance structures (status, state) and context accessed from
* several places (internally, ioctl)
*
* chunk_mutex
* -----------
* protects chunks, adding or removing during allocation, trim or when a new
* device is added/removed. Additionally it also protects post_commit_list of
* individual devices, since they can be added to the transaction's
* post_commit_list only with chunk_mutex held.
*
* cleaner_mutex
* -------------
* a big lock that is held by the cleaner thread and prevents running subvolume
* cleaning together with relocation or delayed iputs
*
*
* Lock nesting
* ============
*
* uuid_mutex
* device_list_mutex
* chunk_mutex
* balance_mutex
*
*
* Exclusive operations
* ====================
*
* Maintains the exclusivity of the following operations that apply to the
* whole filesystem and cannot run in parallel.
*
* - Balance (*)
* - Device add
* - Device remove
* - Device replace (*)
* - Resize
*
* The device operations (as above) can be in one of the following states:
*
* - Running state
* - Paused state
* - Completed state
*
* Only device operations marked with (*) can go into the Paused state for the
* following reasons:
*
* - ioctl (only Balance can be Paused through ioctl)
* - filesystem remounted as read-only
* - filesystem unmounted and mounted as read-only
* - system power-cycle and filesystem mounted as read-only
* - filesystem or device errors leading to forced read-only
*
* The status of exclusive operation is set and cleared atomically.
* During the course of Paused state, fs_info::exclusive_operation remains set.
* A device operation in Paused or Running state can be canceled or resumed
* either by ioctl (Balance only) or when remounted as read-write.
* The exclusive status is cleared when the device operation is canceled or
* completed.
*/
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
return &fs_uuids;
}
/*
* alloc_fs_devices - allocate struct btrfs_fs_devices
* @fsid: if not NULL, copy the UUID to fs_devices::fsid
* @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
*
* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
* The returned struct is not linked onto any lists and can be destroyed with
* kfree() right away.
*/
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
const u8 *metadata_fsid)
{
struct btrfs_fs_devices *fs_devs;
fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
if (!fs_devs)
return ERR_PTR(-ENOMEM);
mutex_init(&fs_devs->device_list_mutex);
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
INIT_LIST_HEAD(&fs_devs->seed_list);
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); if (metadata_fsid) memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
else if (fsid)
memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
return fs_devs;
}
void btrfs_free_device(struct btrfs_device *device)
{
WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state);
bio_put(device->flush_bio);
btrfs_destroy_dev_zone_info(device);
kfree(device);
}
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { device = list_entry(fs_devices->devices.next,
struct btrfs_device, dev_list);
list_del(&device->dev_list);
btrfs_free_device(device);
}
kfree(fs_devices);
}
void __exit btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
while (!list_empty(&fs_uuids)) {
fs_devices = list_entry(fs_uuids.next,
struct btrfs_fs_devices, fs_list);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
}
static noinline struct btrfs_fs_devices *find_fsid(
const u8 *fsid, const u8 *metadata_fsid)
{
struct btrfs_fs_devices *fs_devices;
ASSERT(fsid);
/* Handle non-split brain cases */
list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (metadata_fsid) { if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 && memcmp(metadata_fsid, fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) == 0)
return fs_devices;
} else {
if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
return fs_devices;
}
}
return NULL;
}
static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
struct btrfs_super_block *disk_super)
{
struct btrfs_fs_devices *fs_devices;
/*
* Handle scanned device having completed its fsid change but
* belonging to a fs_devices that was created by first scanning
* a device which didn't have its fsid/metadata_uuid changed
* at all and the CHANGING_FSID_V2 flag set.
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (fs_devices->fsid_change && memcmp(disk_super->metadata_uuid, fs_devices->fsid,
BTRFS_FSID_SIZE) == 0 &&
memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) == 0) {
return fs_devices;
}
}
/*
* Handle scanned device having completed its fsid change but
* belonging to a fs_devices that was created by a device that
* has an outdated pair of fsid/metadata_uuid and
* CHANGING_FSID_V2 flag set.
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (fs_devices->fsid_change &&
memcmp(fs_devices->metadata_uuid,
fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) == 0) {
return fs_devices;
}
}
return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
int flush, struct block_device **bdev,
struct btrfs_super_block **disk_super)
{
int ret;
*bdev = blkdev_get_by_path(device_path, flags, holder);
if (IS_ERR(*bdev)) {
ret = PTR_ERR(*bdev);
goto error;
}
if (flush) filemap_write_and_wait((*bdev)->bd_inode->i_mapping); ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
blkdev_put(*bdev, flags);
goto error;
}
invalidate_bdev(*bdev);
*disk_super = btrfs_read_dev_super(*bdev);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
blkdev_put(*bdev, flags);
goto error;
}
return 0;
error:
*bdev = NULL;
return ret;
}
/*
* Check if the device in the path matches the device in the given struct device.
*
* Returns:
* true If it is the same device.
* false If it is not the same device or on error.
*/
static bool device_matched(const struct btrfs_device *device, const char *path)
{
char *device_name;
dev_t dev_old;
dev_t dev_new;
int ret;
/*
* If we are looking for a device with the matching dev_t, then skip
* device without a name (a missing device).
*/
if (!device->name)
return false;
device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
if (!device_name)
return false;
rcu_read_lock();
scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
rcu_read_unlock();
ret = lookup_bdev(device_name, &dev_old);
kfree(device_name);
if (ret)
return false;
ret = lookup_bdev(path, &dev_new);
if (ret)
return false;
if (dev_old == dev_new) return true;
return false;
}
/*
* Search and remove all stale (devices which are not mounted) devices.
* When both inputs are NULL, it will search and release all stale devices.
* path: Optional. When provided will it release all unmounted devices
* matching this path only.
* skip_dev: Optional. Will skip this device when searching for the stale
* devices.
* Return: 0 for success or if @path is NULL.
* -EBUSY if @path is a mounted device.
* -ENOENT if @path does not match any device in the list.
*/
static int btrfs_free_stale_devices(const char *path,
struct btrfs_device *skip_device)
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
int ret = 0;
lockdep_assert_held(&uuid_mutex);
if (path)
ret = -ENOENT;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp_device,
&fs_devices->devices, dev_list) {
if (skip_device && skip_device == device)
continue;
if (path && !device_matched(device, path))
continue;
if (fs_devices->opened) {
/* for an already deleted device return 0 */
if (path && ret != 0)
ret = -EBUSY;
break;
}
/* delete the stale device */
fs_devices->num_devices--;
list_del(&device->dev_list);
btrfs_free_device(device);
ret = 0;
}
mutex_unlock(&fs_devices->device_list_mutex);
if (fs_devices->num_devices == 0) {
btrfs_sysfs_remove_fsid(fs_devices);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
}
return ret;
}
/*
* This is only used on mount, and we are protected from competing things
* messing with our fs_devices by the uuid_mutex, thus we do not need the
* fs_devices->device_list_mutex here.
*/
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, fmode_t flags,
void *holder)
{
struct request_queue *q;
struct block_device *bdev;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
if (device->bdev)
return -EINVAL;
if (!device->name)
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
&bdev, &disk_super);
if (ret)
return ret;
devid = btrfs_stack_device_id(&disk_super->dev_item);
if (devid != device->devid)
goto error_free_page;
if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
goto error_free_page;
device->generation = btrfs_super_generation(disk_super);
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
if (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
pr_err(
"BTRFS: Invalid seeding and uuid-changed device detected\n");
goto error_free_page;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = true;
} else {
if (bdev_read_only(bdev))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
else
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
q = bdev_get_queue(bdev);
if (!blk_queue_nonrot(q))
fs_devices->rotating = true; device->bdev = bdev;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
device->mode = flags;
fs_devices->open_devices++;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) { fs_devices->rw_devices++;
list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
}
btrfs_release_disk_super(disk_super);
return 0;
error_free_page:
btrfs_release_disk_super(disk_super);
blkdev_put(bdev, flags);
return -EINVAL;
}
/*
* Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
* being created with a disk that has already completed its fsid change. Such
* disk can belong to an fs which has its FSID changed or to one which doesn't.
* Handle both cases here.
*/
static struct btrfs_fs_devices *find_fsid_inprogress(
struct btrfs_super_block *disk_super)
{
struct btrfs_fs_devices *fs_devices;
list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
BTRFS_FSID_SIZE) != 0 &&
memcmp(fs_devices->metadata_uuid, disk_super->fsid, BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
return fs_devices;
}
}
return find_fsid(disk_super->fsid, NULL);
}
static struct btrfs_fs_devices *find_fsid_changed(
struct btrfs_super_block *disk_super)
{
struct btrfs_fs_devices *fs_devices;
/*
* Handles the case where scanned device is part of an fs that had
* multiple successful changes of FSID but currently device didn't
* observe it. Meaning our fsid will be different than theirs. We need
* to handle two subcases :
* 1 - The fs still continues to have different METADATA/FSID uuids.
* 2 - The fs is switched back to its original FSID (METADATA/FSID
* are equal).
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
/* Changed UUIDs */
if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
BTRFS_FSID_SIZE) != 0 &&
memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
BTRFS_FSID_SIZE) == 0 &&
memcmp(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE) != 0)
return fs_devices;
/* Unchanged UUIDs */
if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
BTRFS_FSID_SIZE) == 0 &&
memcmp(fs_devices->fsid, disk_super->metadata_uuid,
BTRFS_FSID_SIZE) == 0)
return fs_devices;
}
return NULL;
}
static struct btrfs_fs_devices *find_fsid_reverted_metadata(
struct btrfs_super_block *disk_super)
{
struct btrfs_fs_devices *fs_devices;
/*
* Handle the case where the scanned device is part of an fs whose last
* metadata UUID change reverted it to the original FSID. At the same
* time * fs_devices was first created by another constitutent device
* which didn't fully observe the operation. This results in an
* btrfs_fs_devices created with metadata/fsid different AND
* btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
* fs_devices equal to the FSID of the disk.
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) != 0 &&
memcmp(fs_devices->metadata_uuid, disk_super->fsid,
BTRFS_FSID_SIZE) == 0 &&
fs_devices->fsid_change)
return fs_devices;
}
return NULL;
}
/*
* Add new device to list of registered devices
*
* Returns:
* device pointer which was just added or updated when successful
* error pointer when failed
*/
static noinline struct btrfs_device *device_list_add(const char *path,
struct btrfs_super_block *disk_super,
bool *new_device_added)
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = NULL;
struct rcu_string *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
if (fsid_change_in_progress) {
if (!has_metadata_uuid)
fs_devices = find_fsid_inprogress(disk_super);
else
fs_devices = find_fsid_changed(disk_super);
} else if (has_metadata_uuid) {
fs_devices = find_fsid_with_metadata_uuid(disk_super);
} else {
fs_devices = find_fsid_reverted_metadata(disk_super);
if (!fs_devices) fs_devices = find_fsid(disk_super->fsid, NULL);
}
if (!fs_devices) {
if (has_metadata_uuid)
fs_devices = alloc_fs_devices(disk_super->fsid,
disk_super->metadata_uuid);
else
fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
fs_devices->fsid_change = fsid_change_in_progress;
mutex_lock(&fs_devices->device_list_mutex);
list_add(&fs_devices->fs_list, &fs_uuids);
device = NULL;
} else {
mutex_lock(&fs_devices->device_list_mutex);
device = btrfs_find_device(fs_devices, devid,
disk_super->dev_item.uuid, NULL);
/*
* If this disk has been pulled into an fs devices created by
* a device which had the CHANGING_FSID_V2 flag then replace the
* metadata_uuid/fsid values of the fs_devices.
*/
if (fs_devices->fsid_change &&
found_transid > fs_devices->latest_generation) { memcpy(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE);
if (has_metadata_uuid)
memcpy(fs_devices->metadata_uuid,
disk_super->metadata_uuid,
BTRFS_FSID_SIZE);
else
memcpy(fs_devices->metadata_uuid,
disk_super->fsid, BTRFS_FSID_SIZE);
fs_devices->fsid_change = false;
}
}
if (!device) { if (fs_devices->opened) { mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-EBUSY);
}
device = btrfs_alloc_device(NULL, &devid,
disk_super->dev_item.uuid);
if (IS_ERR(device)) {
mutex_unlock(&fs_devices->device_list_mutex);
/* we can safely leave the fs_devices entry around */
return device;
}
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
btrfs_free_device(device);
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-ENOMEM);
}
rcu_assign_pointer(device->name, name);
list_add_rcu(&device->dev_list, &fs_devices->devices);
fs_devices->num_devices++;
device->fs_devices = fs_devices;
*new_device_added = true;
if (disk_super->label[0])
pr_info(
"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
disk_super->label, devid, found_transid, path,
current->comm, task_pid_nr(current));
else
pr_info(
"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
disk_super->fsid, devid, found_transid, path,
current->comm, task_pid_nr(current));
} else if (!device->name || strcmp(device->name->str, path)) {
/*
* When FS is already mounted.
* 1. If you are here and if the device->name is NULL that
* means this device was missing at time of FS mount.
* 2. If you are here and if the device->name is different
* from 'path' that means either
* a. The same device disappeared and reappeared with
* different name. or
* b. The missing-disk-which-was-replaced, has
* reappeared now.
*
* We must allow 1 and 2a above. But 2b would be a spurious
* and unintentional.
*
* Further in case of 1 and 2a above, the disk at 'path'
* would have missed some transaction when it was away and
* in case of 2a the stale bdev has to be updated as well.
* 2b must not be allowed at all time.
*/
/*
* For now, we do allow update to btrfs_fs_device through the
* btrfs dev scan cli after FS has been mounted. We're still
* tracking a problem where systems fail mount by subvolume id
* when we reject replacement on a mounted FS.
*/
if (!fs_devices->opened && found_transid < device->generation) {
/*
* That is if the FS is _not_ mounted and if you
* are here, that means there is more than one
* disk with same uuid and devid.We keep the one
* with larger generation number or the last-in if
* generation are equal.
*/
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-EEXIST);
}
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
*/
if (device->bdev) {
int error;
dev_t path_dev;
error = lookup_bdev(path, &path_dev);
if (error) {
mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(error);
}
if (device->bdev->bd_dev != path_dev) { mutex_unlock(&fs_devices->device_list_mutex);
/*
* device->fs_info may not be reliable here, so
* pass in a NULL instead. This avoids a
* possible use-after-free when the fs_info and
* fs_info->sb are already torn down.
*/
btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
current->comm,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
btrfs_info_in_rcu(device->fs_info,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
path, current->comm,
task_pid_nr(current));
}
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-ENOMEM);
}
rcu_string_free(device->name);
rcu_assign_pointer(device->name, name);
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
fs_devices->missing_devices--;
clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
}
/*
* Unmount does not free the btrfs_device struct but would zero
* generation along with most of the other members. So just update
* it back. We need it to pick the disk with largest generation
* (as above).
*/
if (!fs_devices->opened) { device->generation = found_transid;
fs_devices->latest_generation = max_t(u64, found_transid,
fs_devices->latest_generation);
}
fs_devices->total_devices = btrfs_super_num_devices(disk_super);
mutex_unlock(&fs_devices->device_list_mutex);
return device;
}
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
struct btrfs_fs_devices *fs_devices;
struct btrfs_device *device;
struct btrfs_device *orig_dev;
int ret = 0;
lockdep_assert_held(&uuid_mutex);
fs_devices = alloc_fs_devices(orig->fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
fs_devices->total_devices = orig->total_devices;
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
struct rcu_string *name;
device = btrfs_alloc_device(NULL, &orig_dev->devid,
orig_dev->uuid);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
/*
* This is ok to do without rcu read locked because we hold the
* uuid mutex so nothing we touch in here is going to disappear.
*/
if (orig_dev->name) {
name = rcu_string_strdup(orig_dev->name->str,
GFP_KERNEL);
if (!name) {
btrfs_free_device(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
}
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
return fs_devices;
error:
free_fs_devices(fs_devices);
return ERR_PTR(ret);
}
static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
struct btrfs_device **latest_dev)
{
struct btrfs_device *device, *next;
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
&device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_MISSING,
&device->dev_state) &&
(!*latest_dev || device->generation > (*latest_dev)->generation)) { *latest_dev = device;
}
continue;
}
/*
* We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
* in btrfs_init_dev_replace() so just continue.
*/
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
if (device->bdev) { blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
fs_devices->open_devices--;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
btrfs_free_device(device);
}
}
/*
* After we have read the system tree and know devids belonging to this
* filesystem, remove the device which does not belong there.
*/
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *latest_dev = NULL;
struct btrfs_fs_devices *seed_dev;
mutex_lock(&uuid_mutex);
__btrfs_free_extra_devids(fs_devices, &latest_dev);
list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
__btrfs_free_extra_devids(seed_dev, &latest_dev); fs_devices->latest_dev = latest_dev;
mutex_unlock(&uuid_mutex);
}
static void btrfs_close_bdev(struct btrfs_device *device)
{
if (!device->bdev)
return;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { sync_blockdev(device->bdev);
invalidate_bdev(device->bdev);
}
blkdev_put(device->bdev, device->mode);
}
static void btrfs_close_one_device(struct btrfs_device *device)
{
struct btrfs_fs_devices *fs_devices = device->fs_devices;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) { list_del_init(&device->dev_alloc_list);
fs_devices->rw_devices--;
}
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices--;
}
btrfs_close_bdev(device);
if (device->bdev) {
fs_devices->open_devices--;
device->bdev = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
btrfs_destroy_dev_zone_info(device);
device->fs_info = NULL;
atomic_set(&device->dev_stats_ccnt, 0);
extent_io_tree_release(&device->alloc_state);
/*
* Reset the flush error record. We might have a transient flush error
* in this mount, and if so we aborted the current transaction and set
* the fs to an error state, guaranteeing no super blocks can be further
* committed. However that error might be transient and if we unmount the
* filesystem and mount it again, we should allow the mount to succeed
* (btrfs_check_rw_degradable() should not fail) - if after mounting the
* filesystem again we still get flush errors, then we will again abort
* any transaction and set the error state, guaranteeing no commits of
* unsafe super blocks.
*/
device->last_flush_error = 0;
/* Verify the device is back in a pristine state */
ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
ASSERT(list_empty(&device->dev_alloc_list));
ASSERT(list_empty(&device->post_commit_list));
ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
lockdep_assert_held(&uuid_mutex);
if (--fs_devices->opened > 0)
return;
list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
btrfs_close_one_device(device);
WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0;
fs_devices->seeding = false;
fs_devices->fs_info = NULL;
}
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
LIST_HEAD(list);
struct btrfs_fs_devices *tmp;
mutex_lock(&uuid_mutex);
close_fs_devices(fs_devices);
if (!fs_devices->opened)
list_splice_init(&fs_devices->seed_list, &list); list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
close_fs_devices(fs_devices);
list_del(&fs_devices->seed_list);
free_fs_devices(fs_devices);
}
mutex_unlock(&uuid_mutex);}
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
struct btrfs_device *tmp_device;
flags |= FMODE_EXCL; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
dev_list) {
int ret;
ret = btrfs_open_one_device(fs_devices, device, flags, holder);
if (ret == 0 &&
(!latest_dev || device->generation > latest_dev->generation)) {
latest_dev = device;
} else if (ret == -ENODATA) {
fs_devices->num_devices--;
list_del(&device->dev_list);
btrfs_free_device(device);
}
}
if (fs_devices->open_devices == 0)
return -EINVAL;
fs_devices->opened = 1;
fs_devices->latest_dev = latest_dev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
return 0;
}
static int devid_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
const struct btrfs_device *dev1, *dev2;
dev1 = list_entry(a, struct btrfs_device, dev_list);
dev2 = list_entry(b, struct btrfs_device, dev_list);
if (dev1->devid < dev2->devid)
return -1;
else if (dev1->devid > dev2->devid)
return 1;
return 0;
}
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
int ret;
lockdep_assert_held(&uuid_mutex);
/*
* The device_list_mutex cannot be taken here in case opening the
* underlying device takes further locks like open_mutex.
*
* We also don't need the lock here as this is called during mount and
* exclusion is provided by uuid_mutex
*/
if (fs_devices->opened) { fs_devices->opened++;
ret = 0;
} else {
list_sort(NULL, &fs_devices->devices, devid_cmp);
ret = open_fs_devices(fs_devices, flags, holder);
}
return ret;
}
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
struct page *page = virt_to_page(super);
put_page(page);
}
static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
u64 bytenr, u64 bytenr_orig)
{
struct btrfs_super_block *disk_super;
struct page *page;
void *p;
pgoff_t index;
/* make sure our super fits in the device */
if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
return ERR_PTR(-EINVAL);
/* make sure our super fits in the page */
if (sizeof(*disk_super) > PAGE_SIZE)
return ERR_PTR(-EINVAL);
/* make sure our super doesn't straddle pages on disk */
index = bytenr >> PAGE_SHIFT;
if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
return ERR_PTR(-EINVAL);
/* pull in the page with our super */
page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
if (IS_ERR(page))
return ERR_CAST(page);
p = page_address(page);
/* align our pointer to the offset of the super block */
disk_super = p + offset_in_page(bytenr);
if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
btrfs_release_disk_super(p);
return ERR_PTR(-EINVAL);
}
if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
return disk_super;
}
int btrfs_forget_devices(const char *path)
{
int ret;
mutex_lock(&uuid_mutex);
ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
mutex_unlock(&uuid_mutex);
return ret;
}
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
* is read via pagecache
*/
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
void *holder)
{
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct block_device *bdev;
u64 bytenr, bytenr_orig;
int ret;
lockdep_assert_held(&uuid_mutex);
/*
* we would like to check all the supers, but that would make
* a btrfs mount succeed after a mkfs from a different FS.
* So, we need to add a special mount option to scan for
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
flags |= FMODE_EXCL;
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
bytenr_orig = btrfs_sb_offset(0);
ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
if (ret) {
device = ERR_PTR(ret);
goto error_bdev_put;
}
disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
}
device = device_list_add(path, disk_super, &new_device_added);
if (!IS_ERR(device)) {
if (new_device_added) btrfs_free_stale_devices(path, device);
}
btrfs_release_disk_super(disk_super);
error_bdev_put:
blkdev_put(bdev, flags); return device;
}
/*
* Try to find a chunk that intersects [start, start + len] range and when one
* such is found, record the end of it in *start
*/
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
u64 len)
{
u64 physical_start, physical_end;
lockdep_assert_held(&device->fs_info->chunk_mutex);
if (!find_first_extent_bit(&device->alloc_state, *start,
&physical_start, &physical_end,
CHUNK_ALLOCATED, NULL)) {
if (in_range(physical_start, *start, len) || in_range(*start, physical_start,
physical_end - physical_start)) {
*start = physical_end + 1;
return true;
}
}
return false;
}
static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
switch (device->fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
/*
* We don't want to overwrite the superblock on the drive nor
* any area used by the boot loader (grub for example), so we
* make sure to start at an offset of at least 1MB.
*/
return max_t(u64, start, SZ_1M);
case BTRFS_CHUNK_ALLOC_ZONED:
/*
* We don't care about the starting region like regular
* allocator, because we anyway use/reserve the first two zones
* for superblock logging.
*/
return ALIGN(start, device->zone_info->zone_size);
default:
BUG();
}
}
static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
u64 *hole_start, u64 *hole_size,
u64 num_bytes)
{
u64 zone_size = device->zone_info->zone_size;
u64 pos;
int ret;
bool changed = false;
ASSERT(IS_ALIGNED(*hole_start, zone_size));
while (*hole_size > 0) {
pos = btrfs_find_allocatable_zones(device, *hole_start,
*hole_start + *hole_size,
num_bytes);
if (pos != *hole_start) {
*hole_size = *hole_start + *hole_size - pos;
*hole_start = pos;
changed = true;
if (*hole_size < num_bytes)
break;
}
ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
/* Range is ensured to be empty */
if (!ret)
return changed;
/* Given hole range was invalid (outside of device) */
if (ret == -ERANGE) {
*hole_start += *hole_size;
*hole_size = 0;
return true;
}
*hole_start += zone_size;
*hole_size -= zone_size;
changed = true;
}
return changed;
}
/**
* dev_extent_hole_check - check if specified hole is suitable for allocation
* @device: the device which we have the hole
* @hole_start: starting position of the hole
* @hole_size: the size of the hole
* @num_bytes: the size of the free space that we need
*
* This function may modify @hole_start and @hole_size to reflect the suitable
* position for allocation. Returns 1 if hole position is updated, 0 otherwise.
*/
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
u64 *hole_size, u64 num_bytes)
{
bool changed = false;
u64 hole_end = *hole_start + *hole_size;
for (;;) {
/*
* Check before we set max_hole_start, otherwise we could end up
* sending back this offset anyway.
*/
if (contains_pending_extent(device, hole_start, *hole_size)) {
if (hole_end >= *hole_start)
*hole_size = hole_end - *hole_start;
else
*hole_size = 0;
changed = true;
}
switch (device->fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
/* No extra check */
break;
case BTRFS_CHUNK_ALLOC_ZONED:
if (dev_extent_hole_check_zoned(device, hole_start,
hole_size, num_bytes)) {
changed = true;
/*
* The changed hole can contain pending extent.
* Loop again to check that.
*/
continue;
}
break;
default:
BUG();
}
break;
}
return changed;
}
/*
* find_free_dev_extent_start - find free space in the specified device
* @device: the device which we search the free space in
* @num_bytes: the size of the free space that we need
* @search_start: the position from which to begin the search
* @start: store the start of the free space.
* @len: the size of the free space. that we find, or the size
* of the max free space if we don't find suitable free space
*
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
* of extents
*
* @start is used to store the start of the free space if we find. But if we
* don't find suitable free space, it will be used to store the start position
* of the max free space.
*
* @len is used to store the size of the free space that we find.
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*
* NOTE: This function will search *commit* root of device tree, and does extra
* check to ensure dev extents are not double allocated.
* This makes the function safe to allocate dev extents but may not report
* correct usable device space, as device extent freed in current transaction
* is not reported as available.
*/
static int find_free_dev_extent_start(struct btrfs_device *device,
u64 num_bytes, u64 search_start, u64 *start,
u64 *len)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
u64 hole_size;
u64 max_hole_start;
u64 max_hole_size;
u64 extent_end;
u64 search_end = device->total_bytes;
int ret;
int slot;
struct extent_buffer *l;
search_start = dev_extent_search_start(device, search_start); WARN_ON(device->zone_info &&
!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
max_hole_start = search_start;
max_hole_size = 0;
again:
if (search_start >= search_end ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = -ENOSPC;
goto out;
}
path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
key.objectid = device->devid;
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out;
while (1) {
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto out;
break;
}
btrfs_item_key_to_cpu(l, &key, slot);
if (key.objectid < device->devid)
goto next;
if (key.objectid > device->devid)
break;
if (key.type != BTRFS_DEV_EXTENT_KEY)
goto next;
if (key.offset > search_start) { hole_size = key.offset - search_start;
dev_extent_hole_check(device, &search_start, &hole_size,
num_bytes);
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
}
/*
* If this free space is greater than which we need,
* it must be the max free space that we have found
* until now, so max_hole_start must point to the start
* of this free space and the length of this free space
* is stored in max_hole_size. Thus, we return
* max_hole_start and max_hole_size and go back to the
* caller.
*/
if (hole_size >= num_bytes) {
ret = 0;
goto out;
}
}
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
extent_end = key.offset + btrfs_dev_extent_length(l,
dev_extent);
if (extent_end > search_start)
search_start = extent_end;
next:
path->slots[0]++;
cond_resched();
}
/*
* At this point, search_start should be the end of
* allocated dev extents, and when shrinking the device,
* search_end may be smaller than search_start.
*/
if (search_end > search_start) { hole_size = search_end - search_start;
if (dev_extent_hole_check(device, &search_start, &hole_size,
num_bytes)) {
btrfs_release_path(path);
goto again;
}
if (hole_size > max_hole_size) { max_hole_start = search_start;
max_hole_size = hole_size;
}
}
/* See above. */
if (max_hole_size < num_bytes)
ret = -ENOSPC;
else
ret = 0;
out:
btrfs_free_path(path);
*start = max_hole_start;
if (len)
*len = max_hole_size;
return ret;
}
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
/* FIXME use last free of some kind */
return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 start, u64 *dev_extent_len)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf = NULL;
struct btrfs_dev_extent *extent = NULL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid,
BTRFS_DEV_EXTENT_KEY);
if (ret)
goto out;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
BUG_ON(found_key.offset > start || found_key.offset +
btrfs_dev_extent_length(leaf, extent) < start);
key = found_key;
btrfs_release_path(path);
goto again;
} else if (ret == 0) {
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
goto out;
}
*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
ret = btrfs_del_item(trans, root, path);
if (ret == 0)
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
btrfs_free_path(path);
return ret;
}
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
struct rb_node *n;
u64 ret = 0;
em_tree = &fs_info->mapping_tree;
read_lock(&em_tree->lock);
n = rb_last(&em_tree->map.rb_root);
if (n) {
em = rb_entry(n, struct extent_map, rb_node);
ret = em->start + em->len;
}
read_unlock(&em_tree->lock);
return ret;
}
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
u64 *devid_ret)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_path *path;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
if (ret == 0) {
/* Corruption */
btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
ret = -EUCLEAN;
goto error;
}
ret = btrfs_previous_item(fs_info->chunk_root, path,
BTRFS_DEV_ITEMS_OBJECTID,
BTRFS_DEV_ITEM_KEY);
if (ret) {
*devid_ret = 1;
} else {
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
*devid_ret = found_key.offset + 1;
}
ret = 0;
error:
btrfs_free_path(path);
return ret;
}
/*
* the device information is stored in the chunk root
* the btrfs_device struct should be fully filled in
*/
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
struct btrfs_path *path;
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
struct btrfs_key key;
unsigned long ptr;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
&key, sizeof(*dev_item));
if (ret)
goto out;
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
btrfs_set_device_id(leaf, dev_item, device->devid);
btrfs_set_device_generation(leaf, dev_item, 0);
btrfs_set_device_type(leaf, dev_item, device->type);
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
btrfs_set_device_total_bytes(leaf, dev_item,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
btrfs_set_device_group(leaf, dev_item, 0);
btrfs_set_device_seek_speed(leaf, dev_item, 0);
btrfs_set_device_bandwidth(leaf, dev_item, 0);
btrfs_set_device_start_offset(leaf, dev_item, 0);
ptr = btrfs_device_uuid(dev_item);
write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
ptr = btrfs_device_fsid(dev_item);
write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
ptr, BTRFS_FSID_SIZE);
btrfs_mark_buffer_dirty(leaf);
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
/*
* Function to update ctime/mtime for a given device path.
* Mainly used for ctime/mtime based probe like libblkid.
*
* We don't care about errors here, this is just to be kind to userspace.
*/
static void update_dev_time(const char *device_path)
{
struct path path;
struct timespec64 now;
int ret;
ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
if (ret)
return;
now = current_time(d_inode(path.dentry));
inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
path_put(&path);
}
static int btrfs_rm_dev_item(struct btrfs_device *device)
{
struct btrfs_root *root = device->fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_trans_handle *trans;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
}
out:
btrfs_free_path(path);
if (!ret)
ret = btrfs_commit_transaction(trans);
return ret;
}
/*
* Verify that @num_devices satisfies the RAID profile constraints in the whole
* filesystem. It's up to the caller to adjust that number regarding eg. device
* replace.
*/
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
u64 num_devices)
{
u64 all_avail;
unsigned seq;
int i;
do {
seq = read_seqbegin(&fs_info->profiles_lock);
all_avail = fs_info->avail_data_alloc_bits |
fs_info->avail_system_alloc_bits |
fs_info->avail_metadata_alloc_bits;
} while (read_seqretry(&fs_info->profiles_lock, seq));
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
if (!(all_avail & btrfs_raid_array[i].bg_flag))
continue;
if (num_devices < btrfs_raid_array[i].devs_min)
return btrfs_raid_array[i].mindev_error;
}
return 0;
}
static struct btrfs_device * btrfs_find_next_active_device(
struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
struct btrfs_device *next_device;
list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
if (next_device != device &&
!test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
&& next_device->bdev)
return next_device;
}
return NULL;
}
/*
* Helper function to check if the given device is part of s_bdev / latest_dev
* and replace it with the provided or the next active device, in the context
* where this function called, there should be always be another device (or
* this_dev) which is active.
*/
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *next_device)
{
struct btrfs_fs_info *fs_info = device->fs_info;
if (!next_device)
next_device = btrfs_find_next_active_device(fs_info->fs_devices,
device);
ASSERT(next_device);
if (fs_info->sb->s_bdev &&
(fs_info->sb->s_bdev == device->bdev))
fs_info->sb->s_bdev = next_device->bdev;
if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
fs_info->fs_devices->latest_dev = next_device;
}
/*
* Return btrfs_fs_devices::num_devices excluding the device that's being
* currently replaced.
*/
static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
{
u64 num_devices = fs_info->fs_devices->num_devices;
down_read(&fs_info->dev_replace.rwsem);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
ASSERT(num_devices > 1);
num_devices--;
}
up_read(&fs_info->dev_replace.rwsem);
return num_devices;
}
void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
struct block_device *bdev,
const char *device_path)
{
struct btrfs_super_block *disk_super;
int copy_num;
if (!bdev)
return;
for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
struct page *page;
int ret;
disk_super = btrfs_read_dev_one_super(bdev, copy_num);
if (IS_ERR(disk_super))
continue;
if (bdev_is_zoned(bdev)) {
btrfs_reset_sb_log_zones(bdev, copy_num);
continue;
}
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
page = virt_to_page(disk_super);
set_page_dirty(page);
lock_page(page);
/* write_on_page() unlocks the page */
ret = write_one_page(page);
if (ret)
btrfs_warn(fs_info,
"error clearing superblock number %d (%d)",
copy_num, ret);
btrfs_release_disk_super(disk_super);
}
/* Notify udev that device has changed */
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
update_dev_time(device_path);
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 devid, struct block_device **bdev, fmode_t *mode)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 num_devices;
int ret = 0;
/*
* The device list in fs_devices is accessed without locks (neither
* uuid_mutex nor device_list_mutex) as it won't change on a mounted
* filesystem and another device rm cannot run.
*/
num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
goto out;
device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
if (IS_ERR(device)) {
if (PTR_ERR(device) == -ENOENT &&
device_path && strcmp(device_path, "missing") == 0)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = PTR_ERR(device);
goto out;
}
if (btrfs_pinned_by_swapfile(fs_info, device)) {
btrfs_warn_in_rcu(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
rcu_str_deref(device->name), device->devid);
ret = -ETXTBSY;
goto out;
}
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
goto out;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
fs_info->fs_devices->rw_devices == 1) {
ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto out;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_del_init(&device->dev_alloc_list);
device->fs_devices->rw_devices--;
mutex_unlock(&fs_info->chunk_mutex);
}
ret = btrfs_shrink_device(device, 0);
if (!ret)
btrfs_reada_remove_dev(device);
if (ret)
goto error_undo;
/*
* TODO: the superblock still includes this device in its num_devices
* counter although write_all_supers() is not locked out. This
* could give a filesystem state which requires a degraded mount.
*/
ret = btrfs_rm_dev_item(device);
if (ret)
goto error_undo;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(device);
/*
* the device list mutex makes sure that we don't change
* the device list while someone else is writing out all
* the device supers. Whoever is writing all supers, should
* lock the device list mutex before getting the number of
* devices in the super block (super_copy). Conversely,
* whoever updates the number of devices in the super block
* (super_copy) should hold the device list mutex.
*/
/*
* In normal cases the cur_devices == fs_devices. But in case
* of deleting a seed device, the cur_devices should point to
* its own fs_devices listed under the fs_devices->seed.
*/
cur_devices = device->fs_devices;
mutex_lock(&fs_devices->device_list_mutex);
list_del_rcu(&device->dev_list);
cur_devices->num_devices--;
cur_devices->total_devices--;
/* Update total_devices of the parent fs_devices if it's seed */
if (cur_devices != fs_devices)
fs_devices->total_devices--;
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
cur_devices->missing_devices--;
btrfs_assign_next_active_device(device, NULL);
if (device->bdev) {
cur_devices->open_devices--;
/* remove sysfs entry */
btrfs_sysfs_remove_device(device);
}
num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
mutex_unlock(&fs_devices->device_list_mutex);
/*
* At this point, the device is zero sized and detached from the
* devices list. All that's left is to zero out the old supers and
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
* write lock, and blkdev_put() will pull in the ->open_mutex on the
* block device and it's dependencies. Instead just flush the device
* and let the caller do the final blkdev_put.
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
btrfs_scratch_superblocks(fs_info, device->bdev,
device->name->str);
if (device->bdev) {
sync_blockdev(device->bdev);
invalidate_bdev(device->bdev);
}
}
*bdev = device->bdev;
*mode = device->mode;
synchronize_rcu();
btrfs_free_device(device);
if (cur_devices->open_devices == 0) {
list_del_init(&cur_devices->seed_list);
close_fs_devices(cur_devices);
free_fs_devices(cur_devices);
}
out:
return ret;
error_undo:
btrfs_reada_undo_remove_dev(device);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
&fs_devices->alloc_list);
device->fs_devices->rw_devices++;
mutex_unlock(&fs_info->chunk_mutex);
}
goto out;
}
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
{
struct btrfs_fs_devices *fs_devices;
lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
/*
* in case of fs with no seed, srcdev->fs_devices will point
* to fs_devices of fs_info. However when the dev being replaced is
* a seed dev it will point to the seed's local fs_devices. In short
* srcdev will have its correct fs_devices in both the cases.
*/
fs_devices = srcdev->fs_devices;
list_del_rcu(&srcdev->dev_list);
list_del(&srcdev->dev_alloc_list);
fs_devices->num_devices--;
if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
fs_devices->missing_devices--;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
fs_devices->rw_devices--;
if (srcdev->bdev)
fs_devices->open_devices--;
}
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
mutex_lock(&uuid_mutex);
btrfs_close_bdev(srcdev);
synchronize_rcu();
btrfs_free_device(srcdev);
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
/*
* On a mounted FS, num_devices can't be zero unless it's a
* seed. In case of a seed device being replaced, the replace
* target added to the sprout FS, so there will be no more
* device left under the seed FS.
*/
ASSERT(fs_devices->seeding);
list_del_init(&fs_devices->seed_list);
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
mutex_unlock(&uuid_mutex);
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
mutex_lock(&fs_devices->device_list_mutex);
btrfs_sysfs_remove_device(tgtdev);
if (tgtdev->bdev)
fs_devices->open_devices--;
fs_devices->num_devices--;
btrfs_assign_next_active_device(tgtdev, NULL);
list_del_rcu(&tgtdev->dev_list);
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
tgtdev->name->str);
btrfs_close_bdev(tgtdev);
synchronize_rcu();
btrfs_free_device(tgtdev);
}
static struct btrfs_device *btrfs_find_device_by_path(
struct btrfs_fs_info *fs_info, const char *device_path)
{
int ret = 0;
struct btrfs_super_block *disk_super;
u64 devid;
u8 *dev_uuid;
struct block_device *bdev;
struct btrfs_device *device;
ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
fs_info->bdev_holder, 0, &bdev, &disk_super);
if (ret)
return ERR_PTR(ret);
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
disk_super->metadata_uuid);
else
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
disk_super->fsid);
btrfs_release_disk_super(disk_super);
if (!device)
device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
return device;
}
/*
* Lookup a device given by device id, or the path if the id is 0.
*/
struct btrfs_device *btrfs_find_device_by_devspec(
struct btrfs_fs_info *fs_info, u64 devid,
const char *device_path)
{
struct btrfs_device *device;
if (devid) {
device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
NULL);
if (!device)
return ERR_PTR(-ENOENT);
return device;
}
if (!device_path || !device_path[0])
return ERR_PTR(-EINVAL);
if (strcmp(device_path, "missing") == 0) {
/* Find first missing device */
list_for_each_entry(device, &fs_info->fs_devices->devices,
dev_list) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&device->dev_state) && !device->bdev)
return device;
}
return ERR_PTR(-ENOENT);
}
return btrfs_find_device_by_path(fs_info, device_path);
}
/*
* does all the dirty work required for changing file system's UUID.
*/
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
struct btrfs_super_block *disk_super = fs_info->super_copy;
struct btrfs_device *device;
u64 super_flags;
lockdep_assert_held(&uuid_mutex);
if (!fs_devices->seeding)
return -EINVAL;
/*
* Private copy of the seed devices, anchored at
* fs_info->fs_devices->seed_list
*/
seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
return PTR_ERR(seed_devices);
/*
* It's necessary to retain a copy of the original seed fs_devices in
* fs_uuids so that filesystems which have been seeded can successfully
* reference the seed device from open_seed_devices. This also supports
* multiple fs seed.
*/
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
return PTR_ERR(old_devices);
}
list_add(&old_devices->fs_list, &fs_uuids);
memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
seed_devices->opened = 1;
INIT_LIST_HEAD(&seed_devices->devices);
INIT_LIST_HEAD(&seed_devices->alloc_list);
mutex_init(&seed_devices->device_list_mutex);
mutex_lock(&fs_devices->device_list_mutex);
list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
synchronize_rcu);
list_for_each_entry(device, &seed_devices->devices, dev_list)
device->fs_devices = seed_devices;
fs_devices->seeding = false;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
fs_devices->rotating = false;
list_add(&seed_devices->seed_list, &fs_devices->seed_list);
generate_random_uuid(fs_devices->fsid);
memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
mutex_unlock(&fs_devices->device_list_mutex);
super_flags = btrfs_super_flags(disk_super) &
~BTRFS_SUPER_FLAG_SEEDING;
btrfs_set_super_flags(disk_super, super_flags);
return 0;
}
/*
* Store the expected generation for seed devices in device items.
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dev_item *dev_item;
struct btrfs_device *device;
struct btrfs_key key;
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
u64 devid;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = BTRFS_DEV_ITEM_KEY;
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
goto error;
leaf = path->nodes[0];
next_slot:
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret > 0)
break;
if (ret < 0)
goto error;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
continue;
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
key.type != BTRFS_DEV_ITEM_KEY)
break;
dev_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_item);
devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
fs_uuid);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
btrfs_set_device_generation(leaf, dev_item,
device->generation);
btrfs_mark_buffer_dirty(leaf);
}
path->slots[0]++;
goto next_slot;
}
ret = 0;
error:
btrfs_free_path(path);
return ret;
}
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
struct btrfs_root *root = fs_info->dev_root;
struct request_queue *q;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct block_device *bdev;
struct super_block *sb = fs_info->sb;
struct rcu_string *name;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 orig_super_total_bytes;
u64 orig_super_num_devices;
int seeding_dev = 0;
int ret = 0;
bool locked = false;
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
ret = -EINVAL;
goto error;
}
if (fs_devices->seeding) {
seeding_dev = 1;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
locked = true;
}
sync_blockdev(bdev);
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
rcu_read_unlock();
goto error;
}
}
rcu_read_unlock();
device = btrfs_alloc_device(fs_info, NULL, NULL);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
ret = PTR_ERR(device);
goto error;
}
name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
ret = -ENOMEM;
goto error_free_device;
}
rcu_assign_pointer(device->name, name);
device->fs_info = fs_info;
device->bdev = bdev;
ret = btrfs_get_dev_zone_info(device, false);
if (ret)
goto error_free_device;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto error_free_zone;
}
q = bdev_get_queue(bdev);
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = trans->transid;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
device->total_bytes = round_down(i_size_read(bdev->bd_inode),
fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
btrfs_clear_sb_rdonly(sb);
ret = btrfs_prepare_sprout(fs_info);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_trans;
}
btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
device);
}
device->fs_devices = fs_devices;
mutex_lock(&fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_add_rcu(&device->dev_list, &fs_devices->devices);
list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
fs_devices->num_devices++;
fs_devices->open_devices++;
fs_devices->rw_devices++;
fs_devices->total_devices++;
fs_devices->total_rw_bytes += device->total_bytes;
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
if (!blk_queue_nonrot(q))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
btrfs_set_super_total_bytes(fs_info->super_copy,
round_down(orig_super_total_bytes + device->total_bytes,
fs_info->sectorsize));
orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
btrfs_set_super_num_devices(fs_info->super_copy,
orig_super_num_devices + 1);
/*
* we've got more storage, clear any full flags on the space
* infos
*/
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
/* Add sysfs device entry */
btrfs_sysfs_add_device(device);
mutex_unlock(&fs_devices->device_list_mutex);
if (seeding_dev) {
mutex_lock(&fs_info->chunk_mutex);
ret = init_first_rw_device(trans);
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
}
ret = btrfs_add_dev_item(trans, device);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
if (seeding_dev) {
ret = btrfs_finish_sprout(trans);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
/*
* fs_devices now represents the newly sprouted filesystem and
* its fsid has been changed by btrfs_prepare_sprout
*/
btrfs_sysfs_update_sprout_fsid(fs_devices);
}
ret = btrfs_commit_transaction(trans);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
locked = false;
if (ret) /* transaction commit */
return ret;
ret = btrfs_relocate_sys_chunks(fs_info);
if (ret < 0)
btrfs_handle_fs_error(fs_info, ret,
"Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) == -ENOENT)
return 0;
ret = PTR_ERR(trans);
trans = NULL;
goto error_sysfs;
}
ret = btrfs_commit_transaction(trans);
}
/*
* Now that we have written a new super block to this device, check all
* other fs_devices list if device_path alienates any other scanned
* device.
* We can ignore the return value as it typically returns -EINVAL and
* only succeeds if the device was an alien.
*/
btrfs_forget_devices(device_path);
/* Update ctime/mtime for blkid or udev */
update_dev_time(device_path);
return ret;
error_sysfs:
btrfs_sysfs_remove_device(device);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_del_rcu(&device->dev_list);
list_del(&device->dev_alloc_list);
fs_info->fs_devices->num_devices--;
fs_info->fs_devices->open_devices--;
fs_info->fs_devices->rw_devices--;
fs_info->fs_devices->total_devices--;
fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
btrfs_set_super_total_bytes(fs_info->super_copy,
orig_super_total_bytes);
btrfs_set_super_num_devices(fs_info->super_copy,
orig_super_num_devices);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
if (seeding_dev)
btrfs_set_sb_rdonly(sb);
if (trans)
btrfs_end_transaction(trans);
error_free_zone:
btrfs_destroy_dev_zone_info(device);
error_free_device:
btrfs_free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
return ret;
}
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
struct btrfs_path *path;
struct btrfs_root *root = device->fs_info->chunk_root;
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
struct btrfs_key key;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
btrfs_set_device_id(leaf, dev_item, device->devid);
btrfs_set_device_type(leaf, dev_item, device->type);
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
btrfs_set_device_total_bytes(leaf, dev_item,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path); return ret;
}
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total;
u64 diff;
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
new_size = round_down(new_size, fs_info->sectorsize);
mutex_lock(&fs_info->chunk_mutex);
old_total = btrfs_super_total_bytes(super_copy);
diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
if (new_size <= device->total_bytes ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
mutex_unlock(&fs_info->chunk_mutex);
return -EINVAL;
}
btrfs_set_super_total_bytes(super_copy,
round_down(old_total + diff, fs_info->sectorsize));
device->fs_devices->total_rw_bytes += diff;
btrfs_device_set_total_bytes(device, new_size);
btrfs_device_set_disk_total_bytes(device, new_size);
btrfs_clear_space_info_full(device->fs_info);
if (list_empty(&device->post_commit_list))
list_add_tail(&device->post_commit_list,
&trans->transaction->dev_update_list);
mutex_unlock(&fs_info->chunk_mutex);
return btrfs_update_device(trans, device);
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = chunk_offset;
key.type = BTRFS_CHUNK_ITEM_KEY;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
btrfs_handle_fs_error(fs_info, -ENOENT,
"Failed lookup while freeing chunk.");
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret < 0)
btrfs_handle_fs_error(fs_info, ret,
"Failed to delete chunk item.");
out:
btrfs_free_path(path);
return ret;
}
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *ptr;
int ret = 0;
u32 num_stripes;
u32 array_size;
u32 len = 0;
u32 cur;
struct btrfs_key key;
lockdep_assert_held(&fs_info->chunk_mutex);
array_size = btrfs_super_sys_array_size(super_copy);
ptr = super_copy->sys_chunk_array;
cur = 0;
while (cur < array_size) {
disk_key = (struct btrfs_disk_key *)ptr;
btrfs_disk_key_to_cpu(&key, disk_key);
len = sizeof(*disk_key);
if (key.type == BTRFS_CHUNK_ITEM_KEY) {
chunk = (struct btrfs_chunk *)(ptr + len);
num_stripes = btrfs_stack_chunk_num_stripes(chunk);
len += btrfs_chunk_item_size(num_stripes);
} else {
ret = -EIO;
break;
}
if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
key.offset == chunk_offset) {
memmove(ptr, ptr + len, array_size - (cur + len));
array_size -= len;
btrfs_set_super_sys_array_size(super_copy, array_size);
} else {
ptr += len;
cur += len;
}
}
return ret;
}
/*
* btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
* @logical: Logical block offset in bytes.
* @length: Length of extent in bytes.
*
* Return: Chunk mapping or ERR_PTR.
*/
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
em_tree = &fs_info->mapping_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, length);
read_unlock(&em_tree->lock);
if (!em) {
btrfs_crit(fs_info, "unable to find logical %llu length %llu",
logical, length);
return ERR_PTR(-EINVAL);
}
if (em->start > logical || em->start + em->len < logical) {
btrfs_crit(fs_info,
"found a bad mapping, wanted %llu-%llu, found %llu-%llu",
logical, length, em->start, em->start + em->len);
free_extent_map(em);
return ERR_PTR(-EINVAL);
}
/* callers are responsible for dropping em's ref. */
return em;
}
static int remove_chunk_item(struct btrfs_trans_handle *trans,
struct map_lookup *map, u64 chunk_offset)
{
int i;
/*
* Removing chunk items and updating the device items in the chunks btree
* requires holding the chunk_mutex.
* See the comment at btrfs_chunk_alloc() for the details.
*/
lockdep_assert_held(&trans->fs_info->chunk_mutex);
for (i = 0; i < map->num_stripes; i++) {
int ret;
ret = btrfs_update_device(trans, map->stripes[i].dev);
if (ret)
return ret;
}
return btrfs_free_chunk(trans, chunk_offset);
}
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct extent_map *em;
struct map_lookup *map;
u64 dev_extent_len = 0;
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em)) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
ASSERT(0);
return PTR_ERR(em);
}
map = em->map_lookup;
/*
* First delete the device extent items from the devices btree.
* We take the device_list_mutex to avoid racing with the finishing phase
* of a device replace operation. See the comment below before acquiring
* fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
* because that can result in a deadlock when deleting the device extent
* items from the devices btree - COWing an extent buffer from the btree
* may result in allocating a new metadata chunk, which would attempt to
* lock again fs_info->chunk_mutex.
*/
mutex_lock(&fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
ret = btrfs_free_dev_extent(trans, device,
map->stripes[i].physical,
&dev_extent_len);
if (ret) {
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
goto out;
}
if (device->bytes_used > 0) {
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_bytes_used(device,
device->bytes_used - dev_extent_len);
atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
}
}
mutex_unlock(&fs_devices->device_list_mutex);
/*
* We acquire fs_info->chunk_mutex for 2 reasons:
*
* 1) Just like with the first phase of the chunk allocation, we must
* reserve system space, do all chunk btree updates and deletions, and
* update the system chunk array in the superblock while holding this
* mutex. This is for similar reasons as explained on the comment at
* the top of btrfs_chunk_alloc();
*
* 2) Prevent races with the final phase of a device replace operation
* that replaces the device object associated with the map's stripes,
* because the device object's id can change at any time during that
* final phase of the device replace operation
* (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
* replaced device and then see it with an ID of
* BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
* the device item, which does not exists on the chunk btree.
* The finishing phase of device replace acquires both the
* device_list_mutex and the chunk_mutex, in that order, so we are
* safe by just acquiring the chunk_mutex.
*/
trans->removing_chunk = true;
mutex_lock(&fs_info->chunk_mutex);
check_system_chunk(trans, map->type);
ret = remove_chunk_item(trans, map, chunk_offset);
/*
* Normally we should not get -ENOSPC since we reserved space before
* through the call to check_system_chunk().
*
* Despite our system space_info having enough free space, we may not
* be able to allocate extents from its block groups, because all have
* an incompatible profile, which will force us to allocate a new system
* block group with the right profile, or right after we called
* check_system_space() above, a scrub turned the only system block group
* with enough free space into RO mode.
* This is explained with more detail at do_chunk_alloc().
*
* So if we get -ENOSPC, allocate a new system chunk and retry once.
*/
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *sys_bg;
sys_bg = btrfs_alloc_chunk(trans, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = remove_chunk_item(trans, map, chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
} else if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
mutex_unlock(&fs_info->chunk_mutex);
trans->removing_chunk = false;
/*
* We are done with chunk btree updates and deletions, so release the
* system space we previously reserved (with check_system_chunk()).
*/
btrfs_trans_release_chunk_metadata(trans);
ret = btrfs_remove_block_group(trans, chunk_offset, em);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
out:
if (trans->removing_chunk) {
mutex_unlock(&fs_info->chunk_mutex);
trans->removing_chunk = false;
}
/* once for us */
free_extent_map(em);
return ret;
}
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
struct btrfs_block_group *block_group;
u64 length;
int ret;
/*
* Prevent races with automatic removal of unused block groups.
* After we relocate and before we remove the chunk with offset
* chunk_offset, automatic removal of the block group can kick in,
* resulting in a failure when calling btrfs_remove_chunk() below.
*
* Make sure to acquire this mutex before doing a tree search (dev
* or chunk trees) to find chunks. Otherwise the cleaner kthread might
* call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
* we release the path used to search the chunk/dev tree and before
* the current task acquires this mutex and calls us.
*/
lockdep_assert_held(&fs_info->reclaim_bgs_lock);
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
ret = btrfs_relocate_block_group(fs_info, chunk_offset);
btrfs_scrub_continue(fs_info);
if (ret)
return ret;
block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!block_group)
return -ENOENT;
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
length = block_group->length;
btrfs_put_block_group(block_group);
/*
* On a zoned file system, discard the whole block group, this will
* trigger a REQ_OP_ZONE_RESET operation on the device zone. If
* resetting the zone fails, don't treat it as a fatal problem from the
* filesystem's point of view.
*/
if (btrfs_is_zoned(fs_info)) {
ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
if (ret)
btrfs_info(fs_info,
"failed to reset zone %llu after relocation",
chunk_offset);
}
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_handle_fs_error(root->fs_info, ret, NULL);
return ret;
}
/*
* step two, delete the device extents and the
* chunk tree entries
*/
ret = btrfs_remove_chunk(trans, chunk_offset);
btrfs_end_transaction(trans);
return ret;
}
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_chunk *chunk;
struct btrfs_key key;
struct btrfs_key found_key;
u64 chunk_type;
bool retried = false;
int failed = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
if (ret)
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret < 0)
goto error;
if (ret > 0)
break;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
chunk = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_chunk);
chunk_type = btrfs_chunk_type(leaf, chunk);
btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
if (ret == -ENOSPC)
failed++;
else
BUG_ON(ret);
}
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
}
ret = 0;
if (failed && !retried) {
failed = 0;
retried = true;
goto again;
} else if (WARN_ON(failed && retried)) {
ret = -ENOSPC;
}
error:
btrfs_free_path(path);
return ret;
}
/*
* return 1 : allocate a data chunk successfully,
* return <0: errors during allocating a data chunk,
* return 0 : no need to allocate a data chunk.
*/
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
u64 chunk_offset)
{
struct btrfs_block_group *cache;
u64 bytes_used;
u64 chunk_type;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
ASSERT(cache);
chunk_type = cache->flags;
btrfs_put_block_group(cache);
if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
return 0;
spin_lock(&fs_info->data_sinfo->lock);
bytes_used = fs_info->data_sinfo->bytes_used;
spin_unlock(&fs_info->data_sinfo->lock);
if (!bytes_used) {
struct btrfs_trans_handle *trans;
int ret;
trans = btrfs_join_transaction(fs_info->tree_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
btrfs_end_transaction(trans);
if (ret < 0)
return ret;
return 1;
}
return 0;
}
static int insert_balance_item(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
int ret, err;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_TEMPORARY_ITEM_KEY;
key.offset = 0;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*item));
if (ret)
goto out;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
btrfs_set_balance_data(leaf, item, &disk_bargs);
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
btrfs_set_balance_meta(leaf, item, &disk_bargs);
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
btrfs_set_balance_sys(leaf, item, &disk_bargs);
btrfs_set_balance_flags(leaf, item, bctl->flags);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans);
if (err && !ret)
ret = err;
return ret;
}
static int del_balance_item(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_key key;
int ret, err;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_TEMPORARY_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans);
if (err && !ret)
ret = err;
return ret;
}
/*
* This is a heuristic used to reduce the number of chunks balanced on
* resume after balance was interrupted.
*/
static void update_balance_args(struct btrfs_balance_control *bctl)
{
/*
* Turn on soft mode for chunk types that were being converted.
*/
if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
/*
* Turn on usage filter if is not already used. The idea is
* that chunks that we have already balanced should be
* reasonably full. Don't do it for chunks that are being
* converted - that will keep us from relocating unconverted
* (albeit full) chunks.
*/
if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->data.usage = 90;
}
if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->sys.usage = 90;
}
if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->meta.usage = 90;
}
}
/*
* Clear the balance status in fs_info and delete the balance item from disk.
*/
static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
int ret;
BUG_ON(!fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = NULL;
spin_unlock(&fs_info->balance_lock);
kfree(bctl);
ret = del_balance_item(fs_info);
if (ret)
btrfs_handle_fs_error(fs_info, ret, NULL);
}
/*
* Balance filters. Return 1 if chunk should be filtered out
* (should not be balanced).
*/
static int chunk_profiles_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->profiles & chunk_type)
return 0;
return 1;
}
static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
struct btrfs_block_group *cache;
u64 chunk_used;
u64 user_thresh_min;
u64 user_thresh_max;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = cache->used;
if (bargs->usage_min == 0)
user_thresh_min = 0;
else
user_thresh_min = div_factor_fine(cache->length,
bargs->usage_min);
if (bargs->usage_max == 0)
user_thresh_max = 1;
else if (bargs->usage_max > 100)
user_thresh_max = cache->length;
else
user_thresh_max = div_factor_fine(cache->length,
bargs->usage_max);
if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
ret = 0;
btrfs_put_block_group(cache);
return ret;
}
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
u64 chunk_offset, struct btrfs_balance_args *bargs)
{
struct btrfs_block_group *cache;
u64 chunk_used, user_thresh;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = cache->used;
if (bargs->usage_min == 0)
user_thresh = 1;
else if (bargs->usage > 100)
user_thresh = cache->length;
else
user_thresh = div_factor_fine(cache->length, bargs->usage);
if (chunk_used < user_thresh)
ret = 0;
btrfs_put_block_group(cache);
return ret;
}
static int chunk_devid_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
int i;
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
return 0;
}
return 1;
}
static u64 calc_data_stripes(u64 type, int num_stripes)
{
const int index = btrfs_bg_flags_to_raid_index(type);
const int ncopies = btrfs_raid_array[index].ncopies;
const int nparity = btrfs_raid_array[index].nparity;
return (num_stripes - nparity) / ncopies;
}
/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
u64 stripe_offset;
u64 stripe_length;
u64 type;
int factor;
int i;
if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
return 0;
type = btrfs_chunk_type(leaf, chunk);
factor = calc_data_stripes(type, num_stripes);
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
continue;
stripe_offset = btrfs_stripe_offset(leaf, stripe);
stripe_length = btrfs_chunk_length(leaf, chunk);
stripe_length = div_u64(stripe_length, factor);
if (stripe_offset < bargs->pend &&
stripe_offset + stripe_length > bargs->pstart)
return 0;
}
return 1;
}
/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
if (chunk_offset < bargs->vend &&
chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
/* at least part of the chunk is inside this vrange */
return 0;
return 1;
}
static int chunk_stripes_range_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
struct btrfs_balance_args *bargs)
{
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
if (bargs->stripes_min <= num_stripes
&& num_stripes <= bargs->stripes_max)
return 0;
return 1;
}
static int chunk_soft_convert_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return 0;
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->target == chunk_type)
return 1;
return 0;
}
static int should_balance_chunk(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
struct btrfs_balance_args *bargs = NULL;
u64 chunk_type = btrfs_chunk_type(leaf, chunk);
/* type filter */
if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
return 0;
}
if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
bargs = &bctl->data;
else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
bargs = &bctl->sys;
else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
bargs = &bctl->meta;
/* profiles filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
chunk_profiles_filter(chunk_type, bargs)) {
return 0;
}
/* usage filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
chunk_usage_filter(fs_info, chunk_offset, bargs)) {
return 0;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
return 0;
}
/* devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
chunk_devid_filter(leaf, chunk, bargs)) {
return 0;
}
/* drange filter, makes sense only with devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
chunk_drange_filter(leaf, chunk, bargs)) {
return 0;
}
/* vrange filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
return 0;
}
/* stripes filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
chunk_stripes_range_filter(leaf, chunk, bargs)) {
return 0;
}
/* soft profile changing mode */
if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
chunk_soft_convert_filter(chunk_type, bargs)) {
return 0;
}
/*
* limited by count, must be the last filter
*/
if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
if (bargs->limit == 0)
return 0;
else
bargs->limit--;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
/*
* Same logic as the 'limit' filter; the minimum cannot be
* determined here because we do not have the global information
* about the count of all chunks that satisfy the filters.
*/
if (bargs->limit_max == 0)
return 0;
else
bargs->limit_max--;
}
return 1;
}
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
struct btrfs_root *chunk_root = fs_info->chunk_root;
u64 chunk_type;
struct btrfs_chunk *chunk;
struct btrfs_path *path = NULL;
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf;
int slot;
int ret;
int enospc_errors = 0;
bool counting = true;
/* The single value limit and min/max limits use the same bytes in the */
u64 limit_data = bctl->data.limit;
u64 limit_meta = bctl->meta.limit;
u64 limit_sys = bctl->sys.limit;
u32 count_data = 0;
u32 count_meta = 0;
u32 count_sys = 0;
int chunk_reserved = 0;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto error;
}
/* zero out stat counters */
spin_lock(&fs_info->balance_lock);
memset(&bctl->stat, 0, sizeof(bctl->stat));
spin_unlock(&fs_info->balance_lock);
again:
if (!counting) {
/*
* The single value limit and min/max limits use the same bytes
* in the
*/
bctl->data.limit = limit_data;
bctl->meta.limit = limit_meta;
bctl->sys.limit = limit_sys;
}
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
atomic_read(&fs_info->balance_cancel_req)) {
ret = -ECANCELED;
goto error;
}
mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
/*
* this shouldn't happen, it means the last relocate
* failed
*/
if (ret == 0)
BUG(); /* FIXME break ? */
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
ret = 0;
break;
}
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
break;
}
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
chunk_type = btrfs_chunk_type(leaf, chunk);
if (!counting) {
spin_lock(&fs_info->balance_lock);
bctl->stat.considered++;
spin_unlock(&fs_info->balance_lock);
}
ret = should_balance_chunk(leaf, chunk, found_key.offset);
btrfs_release_path(path);
if (!ret) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto loop;
}
if (counting) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
count_data++;
else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
count_sys++;
else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
count_meta++;
goto loop;
}
/*
* Apply limit_min filter, no need to check if the LIMITS
* filter is used, limit_min is 0 by default
*/
if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
count_data < bctl->data.limit_min)
|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
count_meta < bctl->meta.limit_min)
|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
count_sys < bctl->sys.limit_min)) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto loop;
}
if (!chunk_reserved) {
/*
* We may be relocating the only data chunk we have,
* which could potentially end up with losing data's
* raid profile, so lets allocate an empty one in
* advance.
*/
ret = btrfs_may_alloc_data_chunk(fs_info,
found_key.offset);
if (ret < 0) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
} else if (ret == 1) {
chunk_reserved = 1;
}
}
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
enospc_errors++;
} else if (ret == -ETXTBSY) {
btrfs_info(fs_info,
"skipping relocation of block group %llu due to active swapfile",
found_key.offset);
ret = 0;
} else if (ret) {
goto error;
} else {
spin_lock(&fs_info->balance_lock);
bctl->stat.completed++;
spin_unlock(&fs_info->balance_lock);
}
loop:
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
}
if (counting) {
btrfs_release_path(path);
counting = false;
goto again;
}
error:
btrfs_free_path(path);
if (enospc_errors) {
btrfs_info(fs_info, "%d enospc errors during balance",
enospc_errors);
if (!ret)
ret = -ENOSPC;
}
return ret;
}
/**
* alloc_profile_is_valid - see if a given profile is valid and reduced
* @flags: profile to validate
* @extended: if true @flags is treated as an extended profile
*/
static int alloc_profile_is_valid(u64 flags, int extended)
{
u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
BTRFS_BLOCK_GROUP_PROFILE_MASK);
flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
/* 1) check that all other bits are zeroed */
if (flags & ~mask)
return 0;
/* 2) see if profile is reduced */
if (flags == 0)
return !extended; /* "0" is valid for usual profiles */
return has_single_bit_set(flags);
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
{
/* cancel requested || normal exit path */
return atomic_read(&fs_info->balance_cancel_req) ||
(atomic_read(&fs_info->balance_pause_req) == 0 &&
atomic_read(&fs_info->balance_cancel_req) == 0);
}
/*
* Validate target profile against allowed profiles and return true if it's OK.
* Otherwise print the error message and return false.
*/
static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
const struct btrfs_balance_args *bargs,
u64 allowed, const char *type)
{
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return true;
if (fs_info->sectorsize < PAGE_SIZE &&
bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
btrfs_err(fs_info,
"RAID56 is not yet supported for sectorsize %u with page size %lu",
fs_info->sectorsize, PAGE_SIZE);
return false;
}
/* Profile is valid and does not have bits outside of the allowed set */
if (alloc_profile_is_valid(bargs->target, 1) &&
(bargs->target & ~allowed) == 0)
return true;
btrfs_err(fs_info, "balance: invalid convert %s profile %s",
type, btrfs_bg_type_to_raid_name(bargs->target));
return false;
}
/*
* Fill @buf with textual description of balance filter flags @bargs, up to
* @size_buf including the terminating null. The output may be trimmed if it
* does not fit into the provided buffer.
*/
static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
u32 size_buf)
{
int ret;
u32 size_bp = size_buf;
char *bp = buf;
u64 flags = bargs->flags;
char tmp_buf[128] = {'\0'};
if (!flags)
return;
#define CHECK_APPEND_NOARG(a) \
do { \
ret = snprintf(bp, size_bp, (a)); \
if (ret < 0 || ret >= size_bp) \
goto out_overflow; \
size_bp -= ret; \
bp += ret; \
} while (0)
#define CHECK_APPEND_1ARG(a, v1) \
do { \
ret = snprintf(bp, size_bp, (a), (v1)); \
if (ret < 0 || ret >= size_bp) \
goto out_overflow; \
size_bp -= ret; \
bp += ret; \
} while (0)
#define CHECK_APPEND_2ARG(a, v1, v2) \
do { \
ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
if (ret < 0 || ret >= size_bp) \
goto out_overflow; \
size_bp -= ret; \
bp += ret; \
} while (0)
if (flags & BTRFS_BALANCE_ARGS_CONVERT)
CHECK_APPEND_1ARG("convert=%s,",
btrfs_bg_type_to_raid_name(bargs->target));
if (flags & BTRFS_BALANCE_ARGS_SOFT)
CHECK_APPEND_NOARG("soft,");
if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
btrfs_describe_block_groups(bargs->profiles, tmp_buf,
sizeof(tmp_buf));
CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
}
if (flags & BTRFS_BALANCE_ARGS_USAGE)
CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
CHECK_APPEND_2ARG("usage=%u..%u,",
bargs->usage_min, bargs->usage_max);
if (flags & BTRFS_BALANCE_ARGS_DEVID)
CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
if (flags & BTRFS_BALANCE_ARGS_DRANGE)
CHECK_APPEND_2ARG("drange=%llu..%llu,",
bargs->pstart, bargs->pend);
if (flags & BTRFS_BALANCE_ARGS_VRANGE)
CHECK_APPEND_2ARG("vrange=%llu..%llu,",
bargs->vstart, bargs->vend);
if (flags & BTRFS_BALANCE_ARGS_LIMIT)
CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
CHECK_APPEND_2ARG("limit=%u..%u,",
bargs->limit_min, bargs->limit_max);
if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
CHECK_APPEND_2ARG("stripes=%u..%u,",
bargs->stripes_min, bargs->stripes_max);
#undef CHECK_APPEND_2ARG
#undef CHECK_APPEND_1ARG
#undef CHECK_APPEND_NOARG
out_overflow:
if (size_bp < size_buf)
buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
else
buf[0] = '\0';
}
static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
u32 size_buf = 1024;
char tmp_buf[192] = {'\0'};
char *buf;
char *bp;
u32 size_bp = size_buf;
int ret;
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
buf = kzalloc(size_buf, GFP_KERNEL);
if (!buf)
return;
bp = buf;
#define CHECK_APPEND_1ARG(a, v1) \
do { \
ret = snprintf(bp, size_bp, (a), (v1)); \
if (ret < 0 || ret >= size_bp) \
goto out_overflow; \
size_bp -= ret; \
bp += ret; \
} while (0)
if (bctl->flags & BTRFS_BALANCE_FORCE)
CHECK_APPEND_1ARG("%s", "-f ");
if (bctl->flags & BTRFS_BALANCE_DATA) {
describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
CHECK_APPEND_1ARG("-d%s ", tmp_buf);
}
if (bctl->flags & BTRFS_BALANCE_METADATA) {
describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
CHECK_APPEND_1ARG("-m%s ", tmp_buf);
}
if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
CHECK_APPEND_1ARG("-s%s ", tmp_buf);
}
#undef CHECK_APPEND_1ARG
out_overflow:
if (size_bp < size_buf)
buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
btrfs_info(fs_info, "balance: %s %s",
(bctl->flags & BTRFS_BALANCE_RESUME) ?
"resume" : "start", buf);
kfree(buf);
}
/*
* Should be called with balance mutexe held
*/
int btrfs_balance(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs)
{
u64 meta_target, data_target;
u64 allowed;
int mixed = 0;
int ret;
u64 num_devices;
unsigned seq;
bool reducing_redundancy;
int i;
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
btrfs_should_cancel_balance(fs_info)) {
ret = -EINVAL;
goto out;
}
allowed = btrfs_super_incompat_flags(fs_info->super_copy);
if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
mixed = 1;
/*
* In case of mixed groups both data and meta should be picked,
* and identical options should be given for both of them.
*/
allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
if (mixed && (bctl->flags & allowed)) {
if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
!(bctl->flags & BTRFS_BALANCE_METADATA) ||
memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
btrfs_err(fs_info,
"balance: mixed groups data and metadata options must be the same");
ret = -EINVAL;
goto out;
}
}
/*
* rw_devices will not change at the moment, device add/delete/replace
* are exclusive
*/
num_devices = fs_info->fs_devices->rw_devices;
/*
* SINGLE profile on-disk has no profile bit, but in-memory we have a
* special bit for it, to make it easier to distinguish. Thus we need
* to set it manually, or balance would refuse the profile.
*/
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
if (num_devices >= btrfs_raid_array[i].devs_min)
allowed |= btrfs_raid_array[i].bg_flag;
if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
!validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
!validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) {
ret = -EINVAL;
goto out;
}
/*
* Allow to reduce metadata or system integrity only if force set for
* profiles with redundancy (copies, parity)
*/
allowed = 0;
for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
if (btrfs_raid_array[i].ncopies >= 2 ||
btrfs_raid_array[i].tolerated_failures >= 1)
allowed |= btrfs_raid_array[i].bg_flag;
}
do {
seq = read_seqbegin(&fs_info->profiles_lock);
if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(fs_info->avail_system_alloc_bits & allowed) &&
!(bctl->sys.target & allowed)) ||
((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(fs_info->avail_metadata_alloc_bits & allowed) &&
!(bctl->meta.target & allowed)))
reducing_redundancy = true;
else
reducing_redundancy = false;
/* if we're not converting, the target field is uninitialized */
meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
bctl->meta.target : fs_info->avail_metadata_alloc_bits;
data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
bctl->data.target : fs_info->avail_data_alloc_bits;
} while (read_seqretry(&fs_info->profiles_lock, seq));
if (reducing_redundancy) {
if (bctl->flags & BTRFS_BALANCE_FORCE) {
btrfs_info(fs_info,
"balance: force reducing metadata redundancy");
} else {
btrfs_err(fs_info,
"balance: reduces metadata redundancy, use --force if you want this");
ret = -EINVAL;
goto out;
}
}
if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
btrfs_warn(fs_info,
"balance: metadata profile %s has lower redundancy than data profile %s",
btrfs_bg_type_to_raid_name(meta_target),
btrfs_bg_type_to_raid_name(data_target));
}
ret = insert_balance_item(fs_info, bctl);
if (ret && ret != -EEXIST)
goto out;
if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
BUG_ON(ret == -EEXIST);
BUG_ON(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = bctl;
spin_unlock(&fs_info->balance_lock);
} else {
BUG_ON(ret != -EEXIST);
spin_lock(&fs_info->balance_lock);
update_balance_args(bctl);
spin_unlock(&fs_info->balance_lock);
}
ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
describe_balance_start_or_resume(fs_info);
mutex_unlock(&fs_info->balance_mutex);
ret = __btrfs_balance(fs_info);
mutex_lock(&fs_info->balance_mutex);
if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
btrfs_info(fs_info, "balance: paused");
/*
* Balance can be canceled by:
*
* - Regular cancel request
* Then ret == -ECANCELED and balance_cancel_req > 0
*
* - Fatal signal to "btrfs" process
* Either the signal caught by wait_reserve_ticket() and callers
* got -EINTR, or caught by btrfs_should_cancel_balance() and
* got -ECANCELED.
* Either way, in this case balance_cancel_req = 0, and
* ret == -EINTR or ret == -ECANCELED.
*
* So here we only check the return value to catch canceled balance.
*/
else if (ret == -ECANCELED || ret == -EINTR)
btrfs_info(fs_info, "balance: canceled");
else
btrfs_info(fs_info, "balance: ended with status: %d", ret);
clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
if (bargs) {
memset(bargs, 0, sizeof(*bargs));
btrfs_update_ioctl_balance_args(fs_info, bargs);
}
if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
balance_need_close(fs_info)) {
reset_balance_state(fs_info);
btrfs_exclop_finish(fs_info);
}
wake_up(&fs_info->balance_wait_q);
return ret;
out:
if (bctl->flags & BTRFS_BALANCE_RESUME)
reset_balance_state(fs_info);
else
kfree(bctl);
btrfs_exclop_finish(fs_info);
return ret;
}
static int balance_kthread(void *data)
{
struct btrfs_fs_info *fs_info = data;
int ret = 0;
sb_start_write(fs_info->sb);
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
sb_end_write(fs_info->sb);
return ret;
}
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
struct task_struct *tsk;
mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
mutex_unlock(&fs_info->balance_mutex); return 0;
}
mutex_unlock(&fs_info->balance_mutex);
if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
btrfs_info(fs_info, "balance: resume skipped");
return 0;
}
/*
* A ro->rw remount sequence should continue with the paused balance
* regardless of who pauses it, system or the user as of now, so set
* the resume flag.
*/
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
spin_unlock(&fs_info->balance_lock);
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
return PTR_ERR_OR_ZERO(tsk);
}
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_TEMPORARY_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) { /* ret = -ENOENT; */
ret = 0;
goto out;
}
bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
if (!bctl) {
ret = -ENOMEM;
goto out;
}
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
bctl->flags = btrfs_balance_flags(leaf, item);
bctl->flags |= BTRFS_BALANCE_RESUME;
btrfs_balance_data(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
btrfs_balance_meta(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
/*
* This should never happen, as the paused balance state is recovered
* during mount without any chance of other exclusive ops to collide.
*
* This gives the exclusive op status to balance and keeps in paused
* state until user intervention (cancel or umount). If the ownership
* cannot be assigned, show a message but do not fail. The balance
* is in a paused state and must have fs_info::balance_ctl properly
* set up.
*/
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
btrfs_release_path(path);
mutex_lock(&fs_info->balance_mutex);
BUG_ON(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = bctl;
spin_unlock(&fs_info->balance_lock);
mutex_unlock(&fs_info->balance_mutex);
out:
btrfs_free_path(path); return ret;
}
int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
int ret = 0;
mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
mutex_unlock(&fs_info->balance_mutex);
return -ENOTCONN;
}
if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { atomic_inc(&fs_info->balance_pause_req);
mutex_unlock(&fs_info->balance_mutex);
wait_event(fs_info->balance_wait_q,
!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
mutex_lock(&fs_info->balance_mutex);
/* we are good with balance_ctl ripped off from under us */
BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
atomic_dec(&fs_info->balance_pause_req);
} else {
ret = -ENOTCONN;
}
mutex_unlock(&fs_info->balance_mutex); return ret;
}
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
mutex_unlock(&fs_info->balance_mutex);
return -ENOTCONN;
}
/*
* A paused balance with the item stored on disk can be resumed at
* mount time if the mount is read-write. Otherwise it's still paused
* and we must not allow cancelling as it deletes the item.
*/
if (sb_rdonly(fs_info->sb)) {
mutex_unlock(&fs_info->balance_mutex);
return -EROFS;
}
atomic_inc(&fs_info->balance_cancel_req);
/*
* if we are running just wait and return, balance item is
* deleted in btrfs_balance in this case
*/
if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
mutex_unlock(&fs_info->balance_mutex);
wait_event(fs_info->balance_wait_q,
!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
mutex_lock(&fs_info->balance_mutex);
} else {
mutex_unlock(&fs_info->balance_mutex);
/*
* Lock released to allow other waiters to continue, we'll
* reexamine the status again.
*/
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl) {
reset_balance_state(fs_info);
btrfs_exclop_finish(fs_info);
btrfs_info(fs_info, "balance: canceled");
}
}
BUG_ON(fs_info->balance_ctl ||
test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
atomic_dec(&fs_info->balance_cancel_req);
mutex_unlock(&fs_info->balance_mutex);
return 0;
}
int btrfs_uuid_scan_kthread(void *data)
{
struct btrfs_fs_info *fs_info = data;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_key key;
struct btrfs_path *path = NULL;
int ret = 0;
struct extent_buffer *eb;
int slot;
struct btrfs_root_item root_item;
u32 item_size;
struct btrfs_trans_handle *trans = NULL;
bool closing = false;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
key.objectid = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = 0;
while (1) {
if (btrfs_fs_closing(fs_info)) {
closing = true;
break;
}
ret = btrfs_search_forward(root, &key, path,
BTRFS_OLDEST_GENERATION);
if (ret) {
if (ret > 0)
ret = 0;
break;
}
if (key.type != BTRFS_ROOT_ITEM_KEY ||
(key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
key.objectid != BTRFS_FS_TREE_OBJECTID) ||
key.objectid > BTRFS_LAST_FREE_OBJECTID)
goto skip;
eb = path->nodes[0];
slot = path->slots[0];
item_size = btrfs_item_size_nr(eb, slot);
if (item_size < sizeof(root_item))
goto skip;
read_extent_buffer(eb, &root_item,
btrfs_item_ptr_offset(eb, slot),
(int)sizeof(root_item));
if (btrfs_root_refs(&root_item) == 0)
goto skip;
if (!btrfs_is_empty_uuid(root_item.uuid) ||
!btrfs_is_empty_uuid(root_item.received_uuid)) {
if (trans)
goto update_tree;
btrfs_release_path(path);
/*
* 1 - subvol uuid item
* 1 - received_subvol uuid item
*/
trans = btrfs_start_transaction(fs_info->uuid_root, 2);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
break;
}
continue;
} else {
goto skip;
}
update_tree:
btrfs_release_path(path);
if (!btrfs_is_empty_uuid(root_item.uuid)) {
ret = btrfs_uuid_tree_add(trans, root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
key.objectid);
if (ret < 0) {
btrfs_warn(fs_info, "uuid_tree_add failed %d",
ret);
break;
}
}
if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
ret = btrfs_uuid_tree_add(trans,
root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
key.objectid);
if (ret < 0) {
btrfs_warn(fs_info, "uuid_tree_add failed %d",
ret);
break;
}
}
skip:
btrfs_release_path(path);
if (trans) {
ret = btrfs_end_transaction(trans);
trans = NULL;
if (ret)
break;
}
if (key.offset < (u64)-1) {
key.offset++;
} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
} else if (key.objectid < (u64)-1) {
key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
key.objectid++;
} else {
break;
}
cond_resched();
}
out:
btrfs_free_path(path);
if (trans && !IS_ERR(trans))
btrfs_end_transaction(trans);
if (ret)
btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
else if (!closing)
set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
up(&fs_info->uuid_tree_rescan_sem);
return 0;
}
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *uuid_root;
struct task_struct *task;
int ret;
/*
* 1 - root node
* 1 - root item
*/
trans = btrfs_start_transaction(tree_root, 2);
if (IS_ERR(trans))
return PTR_ERR(trans);
uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
if (IS_ERR(uuid_root)) {
ret = PTR_ERR(uuid_root);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
fs_info->uuid_root = uuid_root;
ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
down(&fs_info->uuid_tree_rescan_sem);
task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
if (IS_ERR(task)) {
/* fs_info->update_uuid_tree_gen remains 0 in all error case */
btrfs_warn(fs_info, "failed to start uuid_scan task");
up(&fs_info->uuid_tree_rescan_sem);
return PTR_ERR(task);
}
return 0;
}
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
* The chunk relocation code actually frees the device extent
*/
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
u64 length;
u64 chunk_offset;
int ret;
int slot;
int failed = 0;
bool retried = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = btrfs_device_get_total_bytes(device);
u64 diff;
u64 start;
new_size = round_down(new_size, fs_info->sectorsize);
start = new_size;
diff = round_down(old_size - new_size, fs_info->sectorsize);
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
return -EINVAL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = READA_BACK;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, new_size);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes -= diff;
atomic64_sub(diff, &fs_info->free_chunk_space);
}
/*
* Once the device's size has been set to the new size, ensure all
* in-memory chunks are synced to disk so that the loop below sees them
* and relocates them accordingly.
*/
if (contains_pending_extent(device, &start, diff)) {
mutex_unlock(&fs_info->chunk_mutex);
ret = btrfs_commit_transaction(trans);
if (ret)
goto done;
} else {
mutex_unlock(&fs_info->chunk_mutex);
btrfs_end_transaction(trans);
}
again:
key.objectid = device->devid;
key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
do {
mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto done;
}
ret = btrfs_previous_item(root, path, 0, key.type);
if (ret) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret < 0)
goto done;
ret = 0;
btrfs_release_path(path);
break;
}
l = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_release_path(path);
break;
}
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_release_path(path);
break;
}
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
/*
* We may be relocating the only data chunk we have,
* which could potentially end up with losing data's
* raid profile, so lets allocate an empty one in
* advance.
*/
ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
if (ret < 0) {
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto done;
}
ret = btrfs_relocate_chunk(fs_info, chunk_offset);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
failed++;
} else if (ret) {
if (ret == -ETXTBSY) {
btrfs_warn(fs_info,
"could not shrink block group %llu due to active swapfile",
chunk_offset);
}
goto done;
}
} while (key.offset-- > 0);
if (failed && !retried) {
failed = 0;
retried = true;
goto again;
} else if (failed && retried) {
ret = -ENOSPC;
goto done;
}
/* Shrinking succeeded, else we would be at "done". */
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto done;
}
mutex_lock(&fs_info->chunk_mutex);
/* Clear all state bits beyond the shrunk device size */
clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
CHUNK_STATE_MASK);
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->post_commit_list))
list_add_tail(&device->post_commit_list,
&trans->transaction->dev_update_list);
WARN_ON(diff > old_total);
btrfs_set_super_total_bytes(super_copy,
round_down(old_total - diff, fs_info->sectorsize));
mutex_unlock(&fs_info->chunk_mutex);
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
} else {
ret = btrfs_commit_transaction(trans);
}
done:
btrfs_free_path(path);
if (ret) {
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, old_size);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
device->fs_devices->total_rw_bytes += diff;
atomic64_add(diff, &fs_info->free_chunk_space);
mutex_unlock(&fs_info->chunk_mutex);
}
return ret;
}
static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct btrfs_disk_key disk_key;
u32 array_size;
u8 *ptr;
lockdep_assert_held(&fs_info->chunk_mutex);
array_size = btrfs_super_sys_array_size(super_copy);
if (array_size + item_size + sizeof(disk_key)
> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
return -EFBIG;
ptr = super_copy->sys_chunk_array + array_size;
btrfs_cpu_key_to_disk(&disk_key, key);
memcpy(ptr, &disk_key, sizeof(disk_key));
ptr += sizeof(disk_key);
memcpy(ptr, chunk, item_size);
item_size += sizeof(disk_key);
btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
return 0;
}
/*
* sort the devices in descending order by max_avail, total_avail
*/
static int btrfs_cmp_device_info(const void *a, const void *b)
{
const struct btrfs_device_info *di_a = a;
const struct btrfs_device_info *di_b = b;
if (di_a->max_avail > di_b->max_avail)
return -1;
if (di_a->max_avail < di_b->max_avail)
return 1;
if (di_a->total_avail > di_b->total_avail)
return -1;
if (di_a->total_avail < di_b->total_avail)
return 1;
return 0;
}
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
return;
btrfs_set_fs_incompat(info, RAID56);
}
static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
return;
btrfs_set_fs_incompat(info, RAID1C34);
}
/*
* Structure used internally for __btrfs_alloc_chunk() function.
* Wraps needed parameters.
*/
struct alloc_chunk_ctl {
u64 start;
u64 type;
/* Total number of stripes to allocate */
int num_stripes;
/* sub_stripes info for map */
int sub_stripes;
/* Stripes per device */
int dev_stripes;
/* Maximum number of devices to use */
int devs_max;
/* Minimum number of devices to use */
int devs_min;
/* ndevs has to be a multiple of this */
int devs_increment;
/* Number of copies */
int ncopies;
/* Number of stripes worth of bytes to store parity information */
int nparity;
u64 max_stripe_size;
u64 max_chunk_size;
u64 dev_extent_min;
u64 stripe_size;
u64 chunk_size;
int ndevs;
};
static void init_alloc_chunk_ctl_policy_regular(
struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl)
{
u64 type = ctl->type;
if (type & BTRFS_BLOCK_GROUP_DATA) {
ctl->max_stripe_size = SZ_1G;
ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* For larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
ctl->max_stripe_size = SZ_1G;
else
ctl->max_stripe_size = SZ_256M;
ctl->max_chunk_size = ctl->max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
ctl->max_stripe_size = SZ_32M;
ctl->max_chunk_size = 2 * ctl->max_stripe_size;
ctl->devs_max = min_t(int, ctl->devs_max,
BTRFS_MAX_DEVS_SYS_CHUNK);
} else {
BUG();
}
/* We don't want a chunk larger than 10% of writable space */
ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
ctl->max_chunk_size);
ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
}
static void init_alloc_chunk_ctl_policy_zoned(
struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl)
{
u64 zone_size = fs_devices->fs_info->zone_size;
u64 limit;
int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
u64 min_chunk_size = min_data_stripes * zone_size;
u64 type = ctl->type;
ctl->max_stripe_size = zone_size;
if (type & BTRFS_BLOCK_GROUP_DATA) {
ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
zone_size);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) { ctl->max_chunk_size = ctl->max_stripe_size; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { ctl->max_chunk_size = 2 * ctl->max_stripe_size;
ctl->devs_max = min_t(int, ctl->devs_max,
BTRFS_MAX_DEVS_SYS_CHUNK);
} else {
BUG();
}
/* We don't want a chunk larger than 10% of writable space */
limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
zone_size),
min_chunk_size);
ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
ctl->dev_extent_min = zone_size * ctl->dev_stripes;
}
static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl)
{
int index = btrfs_bg_flags_to_raid_index(ctl->type);
ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
ctl->devs_max = btrfs_raid_array[index].devs_max;
if (!ctl->devs_max)
ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); ctl->devs_min = btrfs_raid_array[index].devs_min;
ctl->devs_increment = btrfs_raid_array[index].devs_increment;
ctl->ncopies = btrfs_raid_array[index].ncopies;
ctl->nparity = btrfs_raid_array[index].nparity;
ctl->ndevs = 0;
switch (fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
break;
case BTRFS_CHUNK_ALLOC_ZONED:
init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
break;
default:
BUG();
}
}
static int gather_device_info(struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
struct btrfs_fs_info *info = fs_devices->fs_info;
struct btrfs_device *device;
u64 total_avail;
u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
int ret;
int ndevs = 0;
u64 max_avail;
u64 dev_offset;
/*
* in the first pass through the devices list, we gather information
* about the available holes on each device.
*/
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { WARN(1, KERN_ERR
"BTRFS: read-only device in alloc_list\n");
continue;
}
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&device->dev_state) ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
continue;
if (device->total_bytes > device->bytes_used) total_avail = device->total_bytes - device->bytes_used;
else
total_avail = 0;
/* If there is no space on this device, skip it. */
if (total_avail < ctl->dev_extent_min)
continue;
ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
&max_avail);
if (ret && ret != -ENOSPC)
return ret;
if (ret == 0)
max_avail = dev_extent_want; if (max_avail < ctl->dev_extent_min) {
if (btrfs_test_opt(info, ENOSPC_DEBUG))
btrfs_debug(info,
"%s: devid %llu has no free space, have=%llu want=%llu",
__func__, device->devid, max_avail,
ctl->dev_extent_min);
continue;
}
if (ndevs == fs_devices->rw_devices) { WARN(1, "%s: found more than %llu devices\n",
__func__, fs_devices->rw_devices);
break;
}
devices_info[ndevs].dev_offset = dev_offset;
devices_info[ndevs].max_avail = max_avail;
devices_info[ndevs].total_avail = total_avail;
devices_info[ndevs].dev = device;
++ndevs;
}
ctl->ndevs = ndevs;
/*
* now sort the devices by hole size / available space
*/
sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
btrfs_cmp_device_info, NULL);
return 0;
}
static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
/* Number of stripes that count for block group size */
int data_stripes;
/*
* The primary goal is to maximize the number of stripes, so use as
* many devices as possible, even if the stripes are not maximum sized.
*
* The DUP profile stores more than one stripe per device, the
* max_avail is the total size so we have to adjust.
*/
ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
ctl->dev_stripes);
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
/* This will have to be fixed for RAID1 and RAID10 over more drives */
data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
/*
* Use the number of data stripes to figure out how big this chunk is
* really going to be in terms of logical address space, and compare
* that answer with the max chunk size. If it's higher, we try to
* reduce stripe_size.
*/
if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
/*
* Reduce stripe_size, round it up to a 16MB boundary again and
* then use it, unless it ends up being even bigger than the
* previous value we had already.
*/
ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
data_stripes), SZ_16M),
ctl->stripe_size);
}
/* Align to BTRFS_STRIPE_LEN */
ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
ctl->chunk_size = ctl->stripe_size * data_stripes;
return 0;
}
static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
u64 zone_size = devices_info[0].dev->zone_info->zone_size;
/* Number of stripes that count for block group size */
int data_stripes;
/*
* It should hold because:
* dev_extent_min == dev_extent_want == zone_size * dev_stripes
*/
ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
ctl->stripe_size = zone_size;
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
/* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
ctl->stripe_size) + ctl->nparity,
ctl->dev_stripes);
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
}
ctl->chunk_size = ctl->stripe_size * data_stripes;
return 0;
}
static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
struct btrfs_fs_info *info = fs_devices->fs_info;
/*
* Round down to number of usable stripes, devs_increment can be any
* number so we can't use round_down() that requires power of 2, while
* rounddown is safe.
*/
ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
if (ctl->ndevs < ctl->devs_min) {
if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
btrfs_debug(info,
"%s: not enough devices with free space: have=%d minimum required=%d",
__func__, ctl->ndevs, ctl->devs_min);
}
return -ENOSPC;
}
ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
switch (fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
return decide_stripe_size_regular(ctl, devices_info);
case BTRFS_CHUNK_ALLOC_ZONED:
return decide_stripe_size_zoned(ctl, devices_info);
default:
BUG();
}
}
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
struct btrfs_fs_info *info = trans->fs_info;
struct map_lookup *map = NULL;
struct extent_map_tree *em_tree;
struct btrfs_block_group *block_group;
struct extent_map *em;
u64 start = ctl->start;
u64 type = ctl->type;
int ret;
int i;
int j;
map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
if (!map)
return ERR_PTR(-ENOMEM);
map->num_stripes = ctl->num_stripes; for (i = 0; i < ctl->ndevs; ++i) { for (j = 0; j < ctl->dev_stripes; ++j) { int s = i * ctl->dev_stripes + j;
map->stripes[s].dev = devices_info[i].dev;
map->stripes[s].physical = devices_info[i].dev_offset +
j * ctl->stripe_size;
}
}
map->stripe_len = BTRFS_STRIPE_LEN;
map->io_align = BTRFS_STRIPE_LEN;
map->io_width = BTRFS_STRIPE_LEN;
map->type = type;
map->sub_stripes = ctl->sub_stripes;
trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
em = alloc_extent_map();
if (!em) {
kfree(map); return ERR_PTR(-ENOMEM);
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->map_lookup = map;
em->start = start;
em->len = ctl->chunk_size;
em->block_start = 0;
em->block_len = em->len;
em->orig_block_len = ctl->stripe_size;
em_tree = &info->mapping_tree;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
if (ret) {
write_unlock(&em_tree->lock);
free_extent_map(em);
return ERR_PTR(ret);
}
write_unlock(&em_tree->lock);
block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
if (IS_ERR(block_group))
goto error_del_extent;
for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev;
btrfs_device_set_bytes_used(dev,
dev->bytes_used + ctl->stripe_size);
if (list_empty(&dev->post_commit_list))
list_add_tail(&dev->post_commit_list,
&trans->transaction->dev_update_list);
}
atomic64_sub(ctl->stripe_size * map->num_stripes,
&info->free_chunk_space);
free_extent_map(em);
check_raid56_incompat_flag(info, type);
check_raid1c34_incompat_flag(info, type);
return block_group;
error_del_extent:
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* One for our allocation */
free_extent_map(em);
/* One for the tree reference */
free_extent_map(em);
return block_group;
}
struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
struct btrfs_device_info *devices_info = NULL;
struct alloc_chunk_ctl ctl;
struct btrfs_block_group *block_group;
int ret;
lockdep_assert_held(&info->chunk_mutex);
if (!alloc_profile_is_valid(type, 0)) {
ASSERT(0);
return ERR_PTR(-EINVAL);
}
if (list_empty(&fs_devices->alloc_list)) {
if (btrfs_test_opt(info, ENOSPC_DEBUG))
btrfs_debug(info, "%s: no writable device", __func__);
return ERR_PTR(-ENOSPC);
}
if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(info, "invalid chunk type 0x%llx requested", type);
ASSERT(0);
return ERR_PTR(-EINVAL);
}
ctl.start = find_next_chunk(info);
ctl.type = type;
init_alloc_chunk_ctl(fs_devices, &ctl);
devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
GFP_NOFS);
if (!devices_info)
return ERR_PTR(-ENOMEM);
ret = gather_device_info(fs_devices, &ctl, devices_info);
if (ret < 0) {
block_group = ERR_PTR(ret);
goto out;
}
ret = decide_stripe_size(fs_devices, &ctl, devices_info);
if (ret < 0) {
block_group = ERR_PTR(ret);
goto out;
}
block_group = create_chunk(trans, &ctl, devices_info);
out:
kfree(devices_info);
return block_group;
}
/*
* This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
* phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
* chunks.
*
* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
* phases.
*/
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_key key;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
struct extent_map *em;
struct map_lookup *map;
size_t item_size;
int i;
int ret;
/*
* We take the chunk_mutex for 2 reasons:
*
* 1) Updates and insertions in the chunk btree must be done while holding
* the chunk_mutex, as well as updating the system chunk array in the
* superblock. See the comment on top of btrfs_chunk_alloc() for the
* details;
*
* 2) To prevent races with the final phase of a device replace operation
* that replaces the device object associated with the map's stripes,
* because the device object's id can change at any time during that
* final phase of the device replace operation
* (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
* replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
* which would cause a failure when updating the device item, which does
* not exists, or persisting a stripe of the chunk item with such ID.
* Here we can't use the device_list_mutex because our caller already
* has locked the chunk_mutex, and the final phase of device replace
* acquires both mutexes - first the device_list_mutex and then the
* chunk_mutex. Using any of those two mutexes protects us from a
* concurrent device replace.
*/
lockdep_assert_held(&fs_info->chunk_mutex);
em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
btrfs_abort_transaction(trans, ret);
return ret;
}
map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
chunk = kzalloc(item_size, GFP_NOFS);
if (!chunk) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out;
}
for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev;
ret = btrfs_update_device(trans, device);
if (ret)
goto out;
}
stripe = &chunk->stripe;
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
const u64 dev_offset = map->stripes[i].physical;
btrfs_set_stack_stripe_devid(stripe, device->devid);
btrfs_set_stack_stripe_offset(stripe, dev_offset);
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
}
btrfs_set_stack_chunk_length(chunk, bg->length);
btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
btrfs_set_stack_chunk_type(chunk, map->type);
btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.type = BTRFS_CHUNK_ITEM_KEY;
key.offset = bg->start;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
if (ret)
goto out;
bg->chunk_item_inserted = 1;
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
if (ret)
goto out;
}
out:
kfree(chunk);
free_extent_map(em);
return ret;
}
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
u64 alloc_profile;
struct btrfs_block_group *meta_bg;
struct btrfs_block_group *sys_bg;
/*
* When adding a new device for sprouting, the seed device is read-only
* so we must first allocate a metadata and a system chunk. But before
* adding the block group items to the extent, device and chunk btrees,
* we must first:
*
* 1) Create both chunks without doing any changes to the btrees, as
* otherwise we would get -ENOSPC since the block groups from the
* seed device are read-only;
*
* 2) Add the device item for the new sprout device - finishing the setup
* of a new block group requires updating the device item in the chunk
* btree, so it must exist when we attempt to do it. The previous step
* ensures this does not fail with -ENOSPC.
*
* After that we can add the block group items to their btrees:
* update existing device item in the chunk btree, add a new block group
* item to the extent btree, add a new chunk item to the chunk btree and
* finally add the new device extent items to the devices btree.
*/
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
if (IS_ERR(meta_bg))
return PTR_ERR(meta_bg);
alloc_profile = btrfs_system_alloc_profile(fs_info);
sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
if (IS_ERR(sys_bg))
return PTR_ERR(sys_bg);
return 0;
}
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
const int index = btrfs_bg_flags_to_raid_index(map->type);
return btrfs_raid_array[index].tolerated_failures;
}
int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
int readonly = 0;
int miss_ndevs = 0;
int i;
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em))
return 1;
map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) {
if (test_bit(BTRFS_DEV_STATE_MISSING,
&map->stripes[i].dev->dev_state)) { miss_ndevs++;
continue;
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
&map->stripes[i].dev->dev_state)) {
readonly = 1;
goto end;
}
}
/*
* If the number of missing devices is larger than max errors,
* we can not write the data into that chunk successfully, so
* set it readonly.
*/
if (miss_ndevs > btrfs_chunk_max_errors(map))
readonly = 1;
end:
free_extent_map(em); return readonly;
}
void btrfs_mapping_tree_free(struct extent_map_tree *tree)
{
struct extent_map *em;
while (1) {
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, 0, (u64)-1);
if (em)
remove_extent_mapping(tree, em);
write_unlock(&tree->lock);
if (!em)
break;
/* once for us */
free_extent_map(em);
/* once for the tree */
free_extent_map(em);
}
}
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
struct extent_map *em;
struct map_lookup *map;
int ret;
em = btrfs_get_chunk_map(fs_info, logical, len);
if (IS_ERR(em))
/*
* We could return errors for these cases, but that could get
* ugly and we'd probably do the same thing which is just not do
* anything else and exit, so return 1 so the callers don't try
* to use other copies.
*/
return 1;
map = em->map_lookup;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
ret = 2;
else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
/*
* There could be two corrupted data stripes, we need
* to loop retry in order to rebuild the correct data.
*
* Fail a stripe at a time on every retry except the
* stripe under reconstruction.
*/
ret = map->num_stripes;
else
ret = 1;
free_extent_map(em);
down_read(&fs_info->dev_replace.rwsem);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
fs_info->dev_replace.tgtdev) ret++; up_read(&fs_info->dev_replace.rwsem); return ret;
}
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical)
{
struct extent_map *em;
struct map_lookup *map;
unsigned long len = fs_info->sectorsize;
em = btrfs_get_chunk_map(fs_info, logical, len);
if (!WARN_ON(IS_ERR(em))) { map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map); free_extent_map(em);
}
return len;}
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
struct extent_map *em;
struct map_lookup *map;
int ret = 0;
em = btrfs_get_chunk_map(fs_info, logical, len); if(!WARN_ON(IS_ERR(em))) { map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
free_extent_map(em);
}
return ret;
}
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct map_lookup *map, int first,
int dev_replace_is_ongoing)
{
int i;
int num_stripes;
int preferred_mirror;
int tolerance;
struct btrfs_device *srcdev;
ASSERT((map->type &
(BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = map->sub_stripes;
else
num_stripes = map->num_stripes;
switch (fs_info->fs_devices->read_policy) {
default:
/* Shouldn't happen, just warn and use pid instead of failing */
btrfs_warn_rl(fs_info,
"unknown read_policy type %u, reset to pid",
fs_info->fs_devices->read_policy);
fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
fallthrough;
case BTRFS_READ_POLICY_PID:
preferred_mirror = first + (current->pid % num_stripes);
break;
}
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
srcdev = fs_info->dev_replace.srcdev;
else
srcdev = NULL;
/*
* try to avoid the drive that is the source drive for a
* dev-replace procedure, only choose it if no other non-missing
* mirror is available
*/
for (tolerance = 0; tolerance < 2; tolerance++) {
if (map->stripes[preferred_mirror].dev->bdev &&
(tolerance || map->stripes[preferred_mirror].dev != srcdev))
return preferred_mirror;
for (i = first; i < first + num_stripes; i++) {
if (map->stripes[i].dev->bdev &&
(tolerance || map->stripes[i].dev != srcdev))
return i;
}
}
/* we couldn't find one that doesn't fail. Just return something
* and the io error handling code will clean up eventually
*/
return preferred_mirror;
}
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
int i;
int again = 1;
while (again) {
again = 0;
for (i = 0; i < num_stripes - 1; i++) {
/* Swap if parity is on a smaller index */
if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { swap(bbio->stripes[i], bbio->stripes[i + 1]);
swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
again = 1;
}
}
}
}
static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
{
struct btrfs_bio *bbio = kzalloc(
/* the size of the btrfs_bio */
sizeof(struct btrfs_bio) +
/* plus the variable array for the stripes */
sizeof(struct btrfs_bio_stripe) * (total_stripes) +
/* plus the variable array for the tgt dev */
sizeof(int) * (real_stripes) +
/*
* plus the raid_map, which includes both the tgt dev
* and the stripes
*/
sizeof(u64) * (total_stripes),
GFP_NOFS|__GFP_NOFAIL);
atomic_set(&bbio->error, 0);
refcount_set(&bbio->refs, 1);
bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
return bbio;
}
void btrfs_get_bbio(struct btrfs_bio *bbio)
{
WARN_ON(!refcount_read(&bbio->refs));
refcount_inc(&bbio->refs);
}
void btrfs_put_bbio(struct btrfs_bio *bbio)
{
if (!bbio)
return;
if (refcount_dec_and_test(&bbio->refs))
kfree(bbio);
}
/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
* Please note that, discard won't be sent to target device of device
* replace.
*/
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
struct btrfs_bio **bbio_ret)
{
struct extent_map *em;
struct map_lookup *map;
struct btrfs_bio *bbio;
u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
u64 stripe_nr_end;
u64 stripe_end_offset;
u64 stripe_cnt;
u64 stripe_len;
u64 stripe_offset;
u64 num_stripes;
u32 stripe_index;
u32 factor = 0;
u32 sub_stripes = 0;
u64 stripes_per_dev = 0;
u32 remaining_stripes = 0;
u32 last_stripe = 0;
int ret = 0;
int i;
/* discard always return a bbio */
ASSERT(bbio_ret);
em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
return PTR_ERR(em);
map = em->map_lookup;
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
offset = logical - em->start;
length = min_t(u64, em->start + em->len - logical, length);
*length_ret = length;
stripe_len = map->stripe_len;
/*
* stripe_nr counts the total number of stripes we have to stride
* to get to this block
*/
stripe_nr = div64_u64(offset, stripe_len);
/* stripe_offset is the offset of this block in its stripe */
stripe_offset = offset - stripe_nr * stripe_len;
stripe_nr_end = round_up(offset + length, map->stripe_len);
stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
stripe_cnt = stripe_nr_end - stripe_nr;
stripe_end_offset = stripe_nr_end * map->stripe_len -
(offset + length);
/*
* after this, stripe_nr is the number of stripes on this
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
num_stripes = 1;
stripe_index = 0;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
if (map->type & BTRFS_BLOCK_GROUP_RAID0)
sub_stripes = 1;
else
sub_stripes = map->sub_stripes; factor = map->num_stripes / sub_stripes;
num_stripes = min_t(u64, map->num_stripes,
sub_stripes * stripe_cnt);
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= sub_stripes;
stripes_per_dev = div_u64_rem(stripe_cnt, factor,
&remaining_stripes);
div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
last_stripe *= sub_stripes;
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_DUP)) {
num_stripes = map->num_stripes;
} else {
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
}
bbio = alloc_btrfs_bio(num_stripes, 0);
if (!bbio) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) {
bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
bbio->stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
bbio->stripes[i].length = stripes_per_dev *
map->stripe_len;
if (i / sub_stripes < remaining_stripes)
bbio->stripes[i].length +=
map->stripe_len;
/*
* Special for the first stripe and
* the last stripe:
*
* |-------|...|-------|
* |----------|
* off end_off
*/
if (i < sub_stripes) bbio->stripes[i].length -=
stripe_offset;
if (stripe_index >= last_stripe && stripe_index <= (last_stripe +
sub_stripes - 1))
bbio->stripes[i].length -=
stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else {
bbio->stripes[i].length = length;
}
stripe_index++;
if (stripe_index == map->num_stripes) {
stripe_index = 0;
stripe_nr++;
}
}
*bbio_ret = bbio;
bbio->map_type = map->type;
bbio->num_stripes = num_stripes;
out:
free_extent_map(em);
return ret;
}
/*
* In dev-replace case, for repair case (that's the only case where the mirror
* is selected explicitly when calling btrfs_map_block), blocks left of the
* left cursor can also be read from the target drive.
*
* For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
* array of stripes.
* For READ, it also needs to be supported using the same mirror number.
*
* If the requested block is not left of the left cursor, EIO is returned. This
* can happen because btrfs_num_copies() returns one more in the dev-replace
* case.
*/
static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
u64 logical, u64 length,
u64 srcdev_devid, int *mirror_num,
u64 *physical)
{
struct btrfs_bio *bbio = NULL;
int num_stripes;
int index_srcdev = 0;
int found = 0;
u64 physical_of_found = 0;
int i;
int ret = 0;
ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
logical, &length, &bbio, 0, 0);
if (ret) {
ASSERT(bbio == NULL);
return ret;
}
num_stripes = bbio->num_stripes;
if (*mirror_num > num_stripes) {
/*
* BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
* that means that the requested area is not left of the left
* cursor
*/
btrfs_put_bbio(bbio);
return -EIO;
}
/*
* process the rest of the function using the mirror_num of the source
* drive. Therefore look it up first. At the end, patch the device
* pointer to the one of the target drive.
*/
for (i = 0; i < num_stripes; i++) { if (bbio->stripes[i].dev->devid != srcdev_devid)
continue;
/*
* In case of DUP, in order to keep it simple, only add the
* mirror with the lowest physical address
*/
if (found &&
physical_of_found <= bbio->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
physical_of_found = bbio->stripes[i].physical;
}
btrfs_put_bbio(bbio);
ASSERT(found);
if (!found)
return -EIO;
*mirror_num = index_srcdev + 1;
*physical = physical_of_found;
return ret;
}
static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
{
struct btrfs_block_group *cache;
bool ret;
/* Non zoned filesystem does not use "to_copy" flag */
if (!btrfs_is_zoned(fs_info))
return false;
cache = btrfs_lookup_block_group(fs_info, logical);
spin_lock(&cache->lock);
ret = cache->to_copy;
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
return ret;
}
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
struct btrfs_bio **bbio_ret,
struct btrfs_dev_replace *dev_replace,
u64 logical,
int *num_stripes_ret, int *max_errors_ret)
{
struct btrfs_bio *bbio = *bbio_ret;
u64 srcdev_devid = dev_replace->srcdev->devid;
int tgtdev_indexes = 0;
int num_stripes = *num_stripes_ret;
int max_errors = *max_errors_ret;
int i;
if (op == BTRFS_MAP_WRITE) {
int index_where_to_add;
/*
* A block group which have "to_copy" set will eventually
* copied by dev-replace process. We can avoid cloning IO here.
*/
if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
return;
/*
* duplicate the write operations while the dev replace
* procedure is running. Since the copying of the old disk to
* the new disk takes place at run time while the filesystem is
* mounted writable, the regular write operations to the old
* disk have to be duplicated to go to the new disk as well.
*
* Note that device->missing is handled by the caller, and that
* the write to the old disk is already set up in the stripes
* array.
*/
index_where_to_add = num_stripes;
for (i = 0; i < num_stripes; i++) { if (bbio->stripes[i].dev->devid == srcdev_devid) {
/* write to new disk, too */
struct btrfs_bio_stripe *new =
bbio->stripes + index_where_to_add;
struct btrfs_bio_stripe *old =
bbio->stripes + i;
new->physical = old->physical;
new->length = old->length;
new->dev = dev_replace->tgtdev;
bbio->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
max_errors++;
tgtdev_indexes++;
}
}
num_stripes = index_where_to_add;
} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
int index_srcdev = 0;
int found = 0;
u64 physical_of_found = 0;
/*
* During the dev-replace procedure, the target drive can also
* be used to read data in case it is needed to repair a corrupt
* block elsewhere. This is possible if the requested area is
* left of the left cursor. In this area, the target drive is a
* full copy of the source drive.
*/
for (i = 0; i < num_stripes; i++) { if (bbio->stripes[i].dev->devid == srcdev_devid) {
/*
* In case of DUP, in order to keep it simple,
* only add the mirror with the lowest physical
* address
*/
if (found &&
physical_of_found <=
bbio->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
physical_of_found = bbio->stripes[i].physical;
}
}
if (found) {
struct btrfs_bio_stripe *tgtdev_stripe =
bbio->stripes + num_stripes;
tgtdev_stripe->physical = physical_of_found;
tgtdev_stripe->length =
bbio->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
bbio->tgtdev_map[index_srcdev] = num_stripes;
tgtdev_indexes++;
num_stripes++;
}
}
*num_stripes_ret = num_stripes;
*max_errors_ret = max_errors;
bbio->num_tgtdevs = tgtdev_indexes; *bbio_ret = bbio;
}
static bool need_full_stripe(enum btrfs_map_op op)
{
return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
}
/*
* Calculate the geometry of a particular (address, len) tuple. This
* information is used to calculate how big a particular bio can get before it
* straddles a stripe.
*
* @fs_info: the filesystem
* @em: mapping containing the logical extent
* @op: type of operation - write or read
* @logical: address that we want to figure out the geometry of
* @io_geom: pointer used to return values
*
* Returns < 0 in case a chunk for the given logical address cannot be found,
* usually shouldn't happen unless @logical is corrupted, 0 otherwise.
*/
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom)
{
struct map_lookup *map;
u64 len;
u64 offset;
u64 stripe_offset;
u64 stripe_nr;
u64 stripe_len;
u64 raid56_full_stripe_start = (u64)-1;
int data_stripes;
ASSERT(op != BTRFS_MAP_DISCARD);
map = em->map_lookup;
/* Offset of this logical address in the chunk */
offset = logical - em->start;
/* Len of a stripe in a chunk */
stripe_len = map->stripe_len;
/* Stripe where this block falls in */
stripe_nr = div64_u64(offset, stripe_len);
/* Offset of stripe in the chunk */
stripe_offset = stripe_nr * stripe_len;
if (offset < stripe_offset) {
btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
stripe_offset, offset, em->start, logical, stripe_len);
return -EINVAL;
}
/* stripe_offset is the offset of this block in its stripe */
stripe_offset = offset - stripe_offset;
data_stripes = nr_data_stripes(map);
if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { u64 max_len = stripe_len - stripe_offset;
/*
* In case of raid56, we need to know the stripe aligned start
*/
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len = stripe_len * data_stripes;
raid56_full_stripe_start = offset;
/*
* Allow a write of a full stripe, but make sure we
* don't allow straddling of stripes
*/
raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
full_stripe_len);
raid56_full_stripe_start *= full_stripe_len;
/*
* For writes to RAID[56], allow a full stripeset across
* all disks. For other RAID types and for RAID[56]
* reads, just allow a single stripe (on a single disk).
*/
if (op == BTRFS_MAP_WRITE) {
max_len = stripe_len * data_stripes -
(offset - raid56_full_stripe_start);
}
}
len = min_t(u64, em->len - offset, max_len);
} else {
len = em->len - offset;
}
io_geom->len = len;
io_geom->offset = offset;
io_geom->stripe_len = stripe_len;
io_geom->stripe_nr = stripe_nr;
io_geom->stripe_offset = stripe_offset;
io_geom->raid56_stripe_offset = raid56_full_stripe_start;
return 0;
}
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
int mirror_num, int need_raid_map)
{
struct extent_map *em;
struct map_lookup *map;
u64 stripe_offset;
u64 stripe_nr;
u64 stripe_len;
u32 stripe_index;
int data_stripes;
int i;
int ret = 0;
int num_stripes;
int max_errors = 0;
int tgtdev_indexes = 0;
struct btrfs_bio *bbio = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
int num_alloc_stripes;
int patch_the_first_stripe_for_dev_replace = 0;
u64 physical_to_patch_in_first_stripe = 0;
u64 raid56_full_stripe_start = (u64)-1;
struct btrfs_io_geometry geom;
ASSERT(bbio_ret);
ASSERT(op != BTRFS_MAP_DISCARD);
em = btrfs_get_chunk_map(fs_info, logical, *length);
ASSERT(!IS_ERR(em));
ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
if (ret < 0)
return ret;
map = em->map_lookup;
*length = geom.len;
stripe_len = geom.stripe_len;
stripe_nr = geom.stripe_nr;
stripe_offset = geom.stripe_offset;
raid56_full_stripe_start = geom.raid56_stripe_offset;
data_stripes = nr_data_stripes(map);
down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
/*
* Hold the semaphore for read during the whole operation, write is
* requested at commit time but must wait.
*/
if (!dev_replace_is_ongoing)
up_read(&dev_replace->rwsem); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
ret = get_extra_mirror_from_replace(fs_info, logical, *length,
dev_replace->srcdev->devid,
&mirror_num,
&physical_to_patch_in_first_stripe);
if (ret)
goto out;
else
patch_the_first_stripe_for_dev_replace = 1;
} else if (mirror_num > map->num_stripes) {
mirror_num = 0;
}
num_stripes = 1;
stripe_index = 0;
if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
if (!need_full_stripe(op))
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
if (need_full_stripe(op))
num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1;
else {
stripe_index = find_live_mirror(fs_info, map, 0,
dev_replace_is_ongoing);
mirror_num = stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
if (need_full_stripe(op)) {
num_stripes = map->num_stripes;
} else if (mirror_num) { stripe_index = mirror_num - 1;
} else {
mirror_num = 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { u32 factor = map->num_stripes / map->sub_stripes;
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= map->sub_stripes;
if (need_full_stripe(op))
num_stripes = map->sub_stripes;
else if (mirror_num) stripe_index += mirror_num - 1;
else {
int old_stripe_index = stripe_index;
stripe_index = find_live_mirror(fs_info, map,
stripe_index,
dev_replace_is_ongoing);
mirror_num = stripe_index - old_stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div64_u64(raid56_full_stripe_start,
stripe_len * data_stripes);
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
max_errors = nr_parity_stripes(map);
*length = map->stripe_len;
stripe_index = 0;
stripe_offset = 0;
} else {
/*
* Mirror #0 or #1 means the original data block.
* Mirror #2 is RAID5 parity block.
* Mirror #3 is RAID6 Q block.
*/
stripe_nr = div_u64_rem(stripe_nr,
data_stripes, &stripe_index);
if (mirror_num > 1)
stripe_index = data_stripes + mirror_num - 2;
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
&stripe_index);
if (!need_full_stripe(op) && mirror_num <= 1)
mirror_num = 1;
}
} else {
/*
* after this, stripe_nr is the number of stripes on this
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
mirror_num = stripe_index + 1;
}
if (stripe_index >= map->num_stripes) {
btrfs_crit(fs_info,
"stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
stripe_index, map->num_stripes);
ret = -EINVAL;
goto out;
}
num_alloc_stripes = num_stripes;
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { if (op == BTRFS_MAP_WRITE) num_alloc_stripes <<= 1; if (op == BTRFS_MAP_GET_READ_MIRRORS) num_alloc_stripes++;
tgtdev_indexes = num_stripes;
}
bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
if (!bbio) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) { bbio->stripes[i].physical = map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
bbio->stripes[i].dev = map->stripes[stripe_index].dev;
stripe_index++;
}
/* build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
u64 tmp;
unsigned rot;
/* Work out the disk rotation on this stripe-set */
div_u64_rem(stripe_nr, num_stripes, &rot);
/* Fill in the logical address of each stripe */
tmp = stripe_nr * data_stripes;
for (i = 0; i < data_stripes; i++)
bbio->raid_map[(i+rot) % num_stripes] =
em->start + (tmp + i) * map->stripe_len; bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
bbio->raid_map[(i+rot+1) % num_stripes] =
RAID6_Q_STRIPE;
sort_parity_stripes(bbio, num_stripes);
}
if (need_full_stripe(op))
max_errors = btrfs_chunk_max_errors(map);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
&num_stripes, &max_errors);
}
*bbio_ret = bbio;
bbio->map_type = map->type;
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
* mirror_num == num_stripes + 1 && dev_replace target drive is
* available as a mirror
*/
if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { WARN_ON(num_stripes > 1); bbio->stripes[0].dev = dev_replace->tgtdev;
bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
bbio->mirror_num = map->num_stripes + 1;
}
out:
if (dev_replace_is_ongoing) {
lockdep_assert_held(&dev_replace->rwsem);
/* Unlock and let waiting writers proceed */
up_read(&dev_replace->rwsem);
}
free_extent_map(em); return ret;
}
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num)
{
if (op == BTRFS_MAP_DISCARD)
return __btrfs_map_block_for_discard(fs_info, logical,
length, bbio_ret);
return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
mirror_num, 0);
}
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret)
{
return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}
static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
{
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
bio_endio(bio);
btrfs_put_bbio(bbio);
}
static void btrfs_end_bio(struct bio *bio)
{
struct btrfs_bio *bbio = bio->bi_private;
int is_orig_bio = 0;
if (bio->bi_status) {
atomic_inc(&bbio->error); if (bio->bi_status == BLK_STS_IOERR ||
bio->bi_status == BLK_STS_TARGET) {
struct btrfs_device *dev = btrfs_io_bio(bio)->device;
ASSERT(dev->bdev);
if (btrfs_op(bio) == BTRFS_MAP_WRITE)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else if (!(bio->bi_opf & REQ_RAHEAD))
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
}
}
if (bio == bbio->orig_bio)
is_orig_bio = 1;
btrfs_bio_counter_dec(bbio->fs_info);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
if (!is_orig_bio) { bio_put(bio);
bio = bbio->orig_bio;
}
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the btrfs bio
*/
if (atomic_read(&bbio->error) > bbio->max_errors) {
bio->bi_status = BLK_STS_IOERR;
} else {
/*
* this bio is actually up to date, we didn't
* go over the max number of errors
*/
bio->bi_status = BLK_STS_OK;
}
btrfs_end_bbio(bbio, bio);
} else if (!is_orig_bio) { bio_put(bio);
}
}
static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
u64 physical, struct btrfs_device *dev)
{
struct btrfs_fs_info *fs_info = bbio->fs_info;
bio->bi_private = bbio;
btrfs_io_bio(bio)->device = dev;
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
/*
* For zone append writing, bi_sector must point the beginning of the
* zone
*/
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
if (btrfs_dev_is_sequential(dev, physical)) { u64 zone_start = round_down(physical, fs_info->zone_size);
bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
} else {
bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
bio->bi_opf |= REQ_OP_WRITE;
}
}
btrfs_debug_in_rcu(fs_info,
"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
dev->devid, bio->bi_iter.bi_size);
bio_set_dev(bio, dev->bdev);
btrfs_bio_counter_inc_noblocked(fs_info);
btrfsic_submit_bio(bio);
}
static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
atomic_inc(&bbio->error);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
/* Should be the original bio. */
WARN_ON(bio != bbio->orig_bio); btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
if (atomic_read(&bbio->error) > bbio->max_errors)
bio->bi_status = BLK_STS_IOERR;
else
bio->bi_status = BLK_STS_OK;
btrfs_end_bbio(bbio, bio);
}
}
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num)
{
struct btrfs_device *dev;
struct bio *first_bio = bio;
u64 logical = bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
int ret;
int dev_nr;
int total_devs;
struct btrfs_bio *bbio = NULL;
length = bio->bi_iter.bi_size;
map_length = length;
btrfs_bio_counter_inc_blocked(fs_info);
ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
&map_length, &bbio, mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
return errno_to_blk_status(ret);
}
total_devs = bbio->num_stripes;
bbio->orig_bio = first_bio;
bbio->private = first_bio->bi_private;
bbio->end_io = first_bio->bi_end_io;
bbio->fs_info = fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
ret = raid56_parity_write(fs_info, bio, bbio,
map_length);
} else {
ret = raid56_parity_recover(fs_info, bio, bbio,
map_length, mirror_num, 1);
}
btrfs_bio_counter_dec(fs_info);
return errno_to_blk_status(ret);
}
if (map_length < length) {
btrfs_crit(fs_info,
"mapping failed logical %llu bio len %llu len %llu",
logical, length, map_length);
BUG();
}
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
(btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
bbio_error(bbio, first_bio, logical);
continue;
}
if (dev_nr < total_devs - 1) bio = btrfs_bio_clone(first_bio);
else
bio = first_bio;
submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
}
btrfs_bio_counter_dec(fs_info);
return BLK_STS_OK;
}
/*
* Find a device specified by @devid or @uuid in the list of @fs_devices, or
* return NULL.
*
* If devid and uuid are both specified, the match must be exact, otherwise
* only devid is used.
*/
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *uuid, u8 *fsid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *seed_devs;
if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->devid == devid && (!uuid || memcmp(device->uuid, uuid,
BTRFS_UUID_SIZE) == 0))
return device;
}
}
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { if (!fsid || !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { list_for_each_entry(device, &seed_devs->devices,
dev_list) {
if (device->devid == devid && (!uuid || memcmp(device->uuid, uuid,
BTRFS_UUID_SIZE) == 0))
return device;
}
}
}
return NULL;
}
static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
unsigned int nofs_flag;
/*
* We call this under the chunk_mutex, so we want to use NOFS for this
* allocation, however we don't want to change btrfs_alloc_device() to
* always do NOFS because we use it in a lot of other GFP_KERNEL safe
* places.
*/
nofs_flag = memalloc_nofs_save();
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device))
return device;
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
fs_devices->num_devices++;
set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices++;
return device;
}
/**
* btrfs_alloc_device - allocate struct btrfs_device
* @fs_info: used only for generating a new devid, can be NULL if
* devid is provided (i.e. @devid != NULL).
* @devid: a pointer to devid for this device. If NULL a new devid
* is generated.
* @uuid: a pointer to UUID for this device. If NULL a new UUID
* is generated.
*
* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
* on error. Returned struct is not linked onto any lists and must be
* destroyed with btrfs_free_device.
*/
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid)
{
struct btrfs_device *dev;
u64 tmp;
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (!dev)
return ERR_PTR(-ENOMEM);
/*
* Preallocate a bio that's always going to be used for flushing device
* barriers and matches the device lifespan
*/
dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
if (!dev->flush_bio) {
kfree(dev);
return ERR_PTR(-ENOMEM);
}
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->post_commit_list);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
extent_io_tree_init(fs_info, &dev->alloc_state,
IO_TREE_DEVICE_ALLOC_STATE, NULL);
if (devid)
tmp = *devid;
else {
int ret;
ret = find_next_devid(fs_info, &tmp); if (ret) { btrfs_free_device(dev);
return ERR_PTR(ret);
}
}
dev->devid = tmp;
if (uuid)
memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
else
generate_random_uuid(dev->uuid);
return dev;
}
static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
u64 devid, u8 *uuid, bool error)
{
if (error)
btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
devid, uuid);
else
btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
devid, uuid);
}
static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
{
const int data_stripes = calc_data_stripes(type, num_stripes);
return div_u64(chunk_len, data_stripes);
}
#if BITS_PER_LONG == 32
/*
* Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
* can't be accessed on 32bit systems.
*
* This function do mount time check to reject the fs if it already has
* metadata chunk beyond that limit.
*/
static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
u64 logical, u64 length, u64 type)
{
if (!(type & BTRFS_BLOCK_GROUP_METADATA))
return 0;
if (logical + length < MAX_LFS_FILESIZE)
return 0;
btrfs_err_32bit_limit(fs_info);
return -EOVERFLOW;
}
/*
* This is to give early warning for any metadata chunk reaching
* BTRFS_32BIT_EARLY_WARN_THRESHOLD.
* Although we can still access the metadata, it's not going to be possible
* once the limit is reached.
*/
static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
u64 logical, u64 length, u64 type)
{
if (!(type & BTRFS_BLOCK_GROUP_METADATA))
return;
if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
return;
btrfs_warn_32bit_limit(fs_info);
}
#endif
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
struct extent_map *em;
u64 logical;
u64 length;
u64 devid;
u64 type;
u8 uuid[BTRFS_UUID_SIZE];
int num_stripes;
int ret;
int i;
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
type = btrfs_chunk_type(leaf, chunk);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
#if BITS_PER_LONG == 32
ret = check_32bit_meta_chunk(fs_info, logical, length, type);
if (ret < 0)
return ret;
warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif
/*
* Only need to verify chunk item if we're reading from sys chunk array,
* as chunk item in tree block is already verified by tree-checker.
*/
if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
ret = btrfs_check_chunk_valid(leaf, chunk, logical);
if (ret)
return ret;
}
read_lock(&map_tree->lock);
em = lookup_extent_mapping(map_tree, logical, 1);
read_unlock(&map_tree->lock);
/* already mapped? */
if (em && em->start <= logical && em->start + em->len > logical) { free_extent_map(em);
return 0;
} else if (em) {
free_extent_map(em);
}
em = alloc_extent_map();
if (!em)
return -ENOMEM;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
free_extent_map(em);
return -ENOMEM;
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->map_lookup = map;
em->start = logical;
em->len = length;
em->orig_start = 0;
em->block_start = 0;
em->block_len = em->len;
map->num_stripes = num_stripes;
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
map->type = type;
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
map->verified_stripes = 0;
em->orig_block_len = calc_stripe_length(type, em->len,
map->num_stripes);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
devid = btrfs_stripe_devid_nr(leaf, chunk, i);
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
devid, uuid, NULL);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em);
btrfs_report_missing_device(fs_info, devid, uuid, true);
return -ENOENT;
}
if (!map->stripes[i].dev) {
map->stripes[i].dev =
add_missing_dev(fs_info->fs_devices, devid,
uuid);
if (IS_ERR(map->stripes[i].dev)) {
free_extent_map(em);
btrfs_err(fs_info,
"failed to init missing dev %llu: %ld",
devid, PTR_ERR(map->stripes[i].dev));
return PTR_ERR(map->stripes[i].dev);
}
btrfs_report_missing_device(fs_info, devid, uuid, false);
}
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&(map->stripes[i].dev->dev_state));
}
write_lock(&map_tree->lock);
ret = add_extent_mapping(map_tree, em, 0);
write_unlock(&map_tree->lock);
if (ret < 0) {
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
em->start, em->len, ret);
}
free_extent_map(em); return ret;
}
static void fill_device_from_item(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item,
struct btrfs_device *device)
{
unsigned long ptr;
device->devid = btrfs_device_id(leaf, dev_item);
device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
device->total_bytes = device->disk_total_bytes;
device->commit_total_bytes = device->disk_total_bytes;
device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
device->commit_bytes_used = device->bytes_used;
device->type = btrfs_device_type(leaf, dev_item);
device->io_align = btrfs_device_io_align(leaf, dev_item);
device->io_width = btrfs_device_io_width(leaf, dev_item);
device->sector_size = btrfs_device_sector_size(leaf, dev_item);
WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
ptr = btrfs_device_uuid(dev_item);
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
u8 *fsid)
{
struct btrfs_fs_devices *fs_devices;
int ret;
lockdep_assert_held(&uuid_mutex);
ASSERT(fsid);
/* This will match only for multi-device seed fs */
list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
return fs_devices;
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
if (!btrfs_test_opt(fs_info, DEGRADED))
return ERR_PTR(-ENOENT);
fs_devices = alloc_fs_devices(fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
fs_devices->seeding = true;
fs_devices->opened = 1;
return fs_devices;
}
/*
* Upon first call for a seed fs fsid, just create a private copy of the
* respective fs_devices and anchor it at fs_info->fs_devices->seed_list
*/
fs_devices = clone_fs_devices(fs_devices);
if (IS_ERR(fs_devices))
return fs_devices;
ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
return ERR_PTR(ret);
}
if (!fs_devices->seeding) {
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
return ERR_PTR(-EINVAL);
}
list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
return fs_devices;
}
static int read_one_dev(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 devid;
int ret;
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
fs_devices = open_seed_devices(fs_info, fs_uuid);
if (IS_ERR(fs_devices))
return PTR_ERR(fs_devices);
}
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
fs_uuid);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
dev_uuid, true);
return -ENOENT;
}
device = add_missing_dev(fs_devices, devid, dev_uuid);
if (IS_ERR(device)) {
btrfs_err(fs_info,
"failed to add missing dev %llu: %ld",
devid, PTR_ERR(device));
return PTR_ERR(device);
}
btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
} else {
if (!device->bdev) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info,
devid, dev_uuid, true);
return -ENOENT;
}
btrfs_report_missing_device(fs_info, devid,
dev_uuid, false);
}
if (!device->bdev &&
!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
/*
* this happens when a device that was properly setup
* in the device info lists suddenly goes bad.
* device->bdev is NULL, and so we have to set
* device->missing to one here
*/
device->fs_devices->missing_devices++;
set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
/* Move the device to its own fs_devices */
if (device->fs_devices != fs_devices) {
ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
&device->dev_state));
list_move(&device->dev_list, &fs_devices->devices);
device->fs_devices->num_devices--;
fs_devices->num_devices++;
device->fs_devices->missing_devices--;
fs_devices->missing_devices++;
device->fs_devices = fs_devices;
}
}
if (device->fs_devices != fs_info->fs_devices) { BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); if (device->generation !=
btrfs_device_generation(leaf, dev_item))
return -EINVAL;
}
fill_device_from_item(leaf, dev_item, device);
if (device->bdev) {
u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
if (device->total_bytes > max_total_bytes) {
btrfs_err(fs_info,
"device total_bytes should be at most %llu but found %llu",
max_total_bytes, device->total_bytes);
return -EINVAL;
}
}
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
device->fs_devices->total_rw_bytes += device->total_bytes;
atomic64_add(device->total_bytes - device->bytes_used,
&fs_info->free_chunk_space);
}
ret = 0;
return ret;
}
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *array_ptr;
unsigned long sb_array_offset;
int ret = 0;
u32 num_stripes;
u32 array_size;
u32 len = 0;
u32 cur_offset;
u64 type;
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
/*
* This will create extent buffer of nodesize, superblock size is
* fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
* overallocate but we can keep it as-is, only the first page is used.
*/
sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
root->root_key.objectid, 0);
if (IS_ERR(sb))
return PTR_ERR(sb);
set_extent_buffer_uptodate(sb);
/*
* The sb extent buffer is artificial and just used to read the system array.
* set_extent_buffer_uptodate() call does not properly mark all it's
* pages up-to-date when the page is larger: extent does not cover the
* whole page and consequently check_page_uptodate does not find all
* the page's extents up-to-date (the hole beyond sb),
* write_extent_buffer then triggers a WARN_ON.
*
* Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
* but sb spans only this function. Add an explicit SetPageUptodate call
* to silence the warning eg. on PowerPC 64.
*/
if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
array_ptr = super_copy->sys_chunk_array;
sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
cur_offset = 0;
while (cur_offset < array_size) {
disk_key = (struct btrfs_disk_key *)array_ptr;
len = sizeof(*disk_key);
if (cur_offset + len > array_size)
goto out_short_read;
btrfs_disk_key_to_cpu(&key, disk_key);
array_ptr += len;
sb_array_offset += len;
cur_offset += len;
if (key.type != BTRFS_CHUNK_ITEM_KEY) {
btrfs_err(fs_info,
"unexpected item type %u in sys_array at offset %u",
(u32)key.type, cur_offset);
ret = -EIO;
break;
}
chunk = (struct btrfs_chunk *)sb_array_offset;
/*
* At least one btrfs_chunk with one stripe must be present,
* exact stripe count check comes afterwards
*/
len = btrfs_chunk_item_size(1);
if (cur_offset + len > array_size)
goto out_short_read;
num_stripes = btrfs_chunk_num_stripes(sb, chunk);
if (!num_stripes) {
btrfs_err(fs_info,
"invalid number of stripes %u in sys_array at offset %u",
num_stripes, cur_offset);
ret = -EIO;
break;
}
type = btrfs_chunk_type(sb, chunk);
if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
btrfs_err(fs_info,
"invalid chunk type %llu in sys_array at offset %u",
type, cur_offset);
ret = -EIO;
break;
}
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
ret = read_one_chunk(&key, sb, chunk);
if (ret)
break;
array_ptr += len;
sb_array_offset += len;
cur_offset += len;
}
clear_extent_buffer_uptodate(sb);
free_extent_buffer_stale(sb);
return ret;
out_short_read:
btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
len, cur_offset);
clear_extent_buffer_uptodate(sb);
free_extent_buffer_stale(sb);
return -EIO;
}
/*
* Check if all chunks in the fs are OK for read-write degraded mount
*
* If the @failing_dev is specified, it's accounted as missing.
*
* Return true if all chunks meet the minimal RW mount requirements.
* Return false if any chunk doesn't meet the minimal RW mount requirements.
*/
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev)
{
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
u64 next_start = 0;
bool ret = true;
read_lock(&map_tree->lock);
em = lookup_extent_mapping(map_tree, 0, (u64)-1);
read_unlock(&map_tree->lock);
/* No chunk at all? Return false anyway */
if (!em) {
ret = false;
goto out;
}
while (em) {
struct map_lookup *map;
int missing = 0;
int max_tolerated;
int i;
map = em->map_lookup;
max_tolerated =
btrfs_get_num_tolerated_disk_barrier_failures(
map->type);
for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || dev->last_flush_error)
missing++;
else if (failing_dev && failing_dev == dev) missing++;
}
if (missing > max_tolerated) { if (!failing_dev)
btrfs_warn(fs_info,
"chunk %llu missing %d devices, max tolerance is %d for writable mount",
em->start, missing, max_tolerated);
free_extent_map(em);
ret = false;
goto out;
}
next_start = extent_map_end(em); free_extent_map(em);
read_lock(&map_tree->lock);
em = lookup_extent_mapping(map_tree, next_start,
(u64)(-1) - next_start);
read_unlock(&map_tree->lock);
}
out:
return ret;
}
static void readahead_tree_node_children(struct extent_buffer *node)
{
int i;
const int nr_items = btrfs_header_nritems(node); for (i = 0; i < nr_items; i++) btrfs_readahead_node_child(node, i);
}
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
struct btrfs_key found_key;
int ret;
int slot;
u64 total_dev = 0;
u64 last_ra_node = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/*
* uuid_mutex is needed only if we are mounting a sprout FS
* otherwise we don't need it.
*/
mutex_lock(&uuid_mutex);
/*
* It is possible for mount and umount to race in such a way that
* we execute this code path, but open_fs_devices failed to clear
* total_rw_bytes. We certainly want it cleared before reading the
* device items, so clear it here.
*/
fs_info->fs_devices->total_rw_bytes = 0;
/*
* Lockdep complains about possible circular locking dependency between
* a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
* used for freeze procection of a fs (struct super_block.s_writers),
* which we take when starting a transaction, and extent buffers of the
* chunk tree if we call read_one_dev() while holding a lock on an
* extent buffer of the chunk tree. Since we are mounting the filesystem
* and at this point there can't be any concurrent task modifying the
* chunk tree, to keep it simple, just skip locking on the chunk tree.
*/
ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
path->skip_locking = 1;
/*
* Read all device items, and then all the chunk items. All
* device items are found before any chunk item (their object id
* is smaller than the lowest possible object id for a chunk
* item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
*/
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto error;
while (1) {
struct extent_buffer *node;
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto error;
break;
}
node = path->nodes[1];
if (node) {
if (last_ra_node != node->start) {
readahead_tree_node_children(node);
last_ra_node = node->start;
}
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_DEV_ITEM_KEY) {
struct btrfs_dev_item *dev_item;
dev_item = btrfs_item_ptr(leaf, slot,
struct btrfs_dev_item);
ret = read_one_dev(leaf, dev_item);
if (ret)
goto error;
total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
/*
* We are only called at mount time, so no need to take
* fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
* we always lock first fs_info->chunk_mutex before
* acquiring any locks on the chunk tree. This is a
* requirement for chunk allocation, see the comment on
* top of btrfs_chunk_alloc() for details.
*/
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
ret = read_one_chunk(&found_key, leaf, chunk);
if (ret)
goto error;
}
path->slots[0]++;
}
/*
* After loading chunk tree, we've got all device information,
* do another round of validation checks.
*/
if (total_dev != fs_info->fs_devices->total_devices) {
btrfs_err(fs_info,
"super_num_devices %llu mismatch with num_devices %llu found here",
btrfs_super_num_devices(fs_info->super_copy),
total_dev);
ret = -EINVAL;
goto error;
}
if (btrfs_super_total_bytes(fs_info->super_copy) <
fs_info->fs_devices->total_rw_bytes) {
btrfs_err(fs_info,
"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
btrfs_super_total_bytes(fs_info->super_copy),
fs_info->fs_devices->total_rw_bytes);
ret = -EINVAL;
goto error;
}
ret = 0;
error:
mutex_unlock(&uuid_mutex);
btrfs_free_path(path);
return ret;
}
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
fs_devices->fs_info = fs_info;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list)
device->fs_info = fs_info; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed_devs->devices, dev_list) device->fs_info = fs_info; seed_devs->fs_info = fs_info;
}
mutex_unlock(&fs_devices->device_list_mutex);
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
const struct btrfs_dev_stats_item *ptr,
int index)
{
u64 val;
read_extent_buffer(eb, &val,
offsetof(struct btrfs_dev_stats_item, values) +
((unsigned long)ptr) + (index * sizeof(u64)),
sizeof(val));
return val;
}
static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
struct btrfs_dev_stats_item *ptr,
int index, u64 val)
{
write_extent_buffer(eb, &val,
offsetof(struct btrfs_dev_stats_item, values) +
((unsigned long)ptr) + (index * sizeof(u64)),
sizeof(val));
}
static int btrfs_device_init_dev_stats(struct btrfs_device *device,
struct btrfs_path *path)
{
struct btrfs_dev_stats_item *ptr;
struct extent_buffer *eb;
struct btrfs_key key;
int item_size;
int i, ret, slot;
if (!device->fs_info->dev_root)
return 0;
key.objectid = BTRFS_DEV_STATS_OBJECTID;
key.type = BTRFS_PERSISTENT_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
if (ret) {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_dev_stat_set(device, i, 0);
device->dev_stats_valid = 1;
btrfs_release_path(path);
return ret < 0 ? ret : 0;
}
slot = path->slots[0];
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, slot);
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { if (item_size >= (1 + i) * sizeof(__le64))
btrfs_dev_stat_set(device, i,
btrfs_dev_stats_value(eb, ptr, i));
else
btrfs_dev_stat_set(device, i, 0);
}
device->dev_stats_valid = 1;
btrfs_dev_stat_print_on_load(device);
btrfs_release_path(path);
return 0;
}
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
struct btrfs_path *path = NULL;
int ret = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_device_init_dev_stats(device, path);
if (ret)
goto out;
}
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed_devs->devices, dev_list) { ret = btrfs_device_init_dev_stats(device, path);
if (ret)
goto out;
}
}
out:
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_free_path(path);
return ret;
}
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_dev_stats_item *ptr;
int ret;
int i;
key.objectid = BTRFS_DEV_STATS_OBJECTID;
key.type = BTRFS_PERSISTENT_ITEM_KEY;
key.offset = device->devid;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
btrfs_warn_in_rcu(fs_info,
"error %d while searching for dev_stats item for device %s",
ret, rcu_str_deref(device->name));
goto out;
}
if (ret == 0 && btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
btrfs_warn_in_rcu(fs_info,
"delete too small dev_stats item for device %s failed %d",
rcu_str_deref(device->name), ret);
goto out;
}
ret = 1;
}
if (ret == 1) {
/* need to insert a new item */
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
btrfs_warn_in_rcu(fs_info,
"insert dev_stats item for device %s failed %d",
rcu_str_deref(device->name), ret);
goto out;
}
}
eb = path->nodes[0];
ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_set_dev_stats_value(eb, ptr, i,
btrfs_dev_stat_read(device, i));
btrfs_mark_buffer_dirty(eb);
out:
btrfs_free_path(path);
return ret;
}
/*
* called from commit_transaction. Writes all changed device stats to disk.
*/
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
int stats_cnt;
int ret = 0;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) { stats_cnt = atomic_read(&device->dev_stats_ccnt); if (!device->dev_stats_valid || stats_cnt == 0)
continue;
/*
* There is a LOAD-LOAD control dependency between the value of
* dev_stats_ccnt and updating the on-disk values which requires
* reading the in-memory counters. Such control dependencies
* require explicit read memory barriers.
*
* This memory barriers pairs with smp_mb__before_atomic in
* btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
* barrier implied by atomic_xchg in
* btrfs_dev_stats_read_and_reset
*/
smp_rmb();
ret = update_dev_stat_item(trans, device);
if (!ret)
atomic_sub(stats_cnt, &device->dev_stats_ccnt);
}
mutex_unlock(&fs_devices->device_list_mutex);
return ret;
}
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
btrfs_dev_stat_inc(dev, index);
btrfs_dev_stat_print_on_error(dev);
}
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
if (!dev->dev_stats_valid)
return;
btrfs_err_rl_in_rcu(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
int i;
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
if (btrfs_dev_stat_read(dev, i) != 0)
break;
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
btrfs_info_in_rcu(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats)
{
struct btrfs_device *dev;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int i;
mutex_lock(&fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
btrfs_warn(fs_info, "get dev_stats failed, device not found");
return -ENODEV;
} else if (!dev->dev_stats_valid) {
btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
return -ENODEV;
} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
if (stats->nr_items > i)
stats->values[i] =
btrfs_dev_stat_read_and_reset(dev, i);
else
btrfs_dev_stat_set(dev, i, 0);
}
btrfs_info(fs_info, "device stats zeroed by %s (%d)",
current->comm, task_pid_nr(current));
} else {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
if (stats->nr_items > i)
stats->values[i] = btrfs_dev_stat_read(dev, i);
}
if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
return 0;
}
/*
* Update the size and bytes used for each device where it changed. This is
* delayed since we would otherwise get errors while writing out the
* superblocks.
*
* Must be invoked during transaction commit.
*/
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
struct btrfs_device *curr, *next;
ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
if (list_empty(&trans->dev_update_list))
return;
/*
* We don't need the device_list_mutex here. This list is owned by the
* transaction and the transaction must complete before the device is
* released.
*/
mutex_lock(&trans->fs_info->chunk_mutex);
list_for_each_entry_safe(curr, next, &trans->dev_update_list,
post_commit_list) {
list_del_init(&curr->post_commit_list);
curr->commit_total_bytes = curr->disk_total_bytes;
curr->commit_bytes_used = curr->bytes_used;
}
mutex_unlock(&trans->fs_info->chunk_mutex);
}
/*
* Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
*/
int btrfs_bg_type_to_factor(u64 flags)
{
const int index = btrfs_bg_flags_to_raid_index(flags);
return btrfs_raid_array[index].ncopies;
}
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
struct btrfs_device *dev;
u64 stripe_len;
bool found = false;
int ret = 0;
int i;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
if (!em) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
physical_offset, devid);
ret = -EUCLEAN;
goto out;
}
map = em->map_lookup;
stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
if (physical_len != stripe_len) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
physical_offset, devid, em->start, physical_len,
stripe_len);
ret = -EUCLEAN;
goto out;
}
for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->devid == devid && map->stripes[i].physical == physical_offset) {
found = true;
if (map->verified_stripes >= map->num_stripes) {
btrfs_err(fs_info,
"too many dev extents for chunk %llu found",
em->start);
ret = -EUCLEAN;
goto out;
}
map->verified_stripes++;
break;
}
}
if (!found) {
btrfs_err(fs_info,
"dev extent physical offset %llu devid %llu has no corresponding chunk",
physical_offset, devid);
ret = -EUCLEAN;
}
/* Make sure no dev extent is beyond device boundary */
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
if (physical_offset + physical_len > dev->disk_total_bytes) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
devid, physical_offset, physical_len,
dev->disk_total_bytes);
ret = -EUCLEAN;
goto out;
}
if (dev->zone_info) { u64 zone_size = dev->zone_info->zone_size;
if (!IS_ALIGNED(physical_offset, zone_size) ||
!IS_ALIGNED(physical_len, zone_size)) {
btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
devid, physical_offset, physical_len);
ret = -EUCLEAN;
goto out;
}
}
out:
free_extent_map(em);
return ret;
}
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct rb_node *node;
int ret = 0;
read_lock(&em_tree->lock);
for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
em = rb_entry(node, struct extent_map, rb_node);
if (em->map_lookup->num_stripes !=
em->map_lookup->verified_stripes) {
btrfs_err(fs_info,
"chunk %llu has missing dev extent, have %d expect %d",
em->start, em->map_lookup->verified_stripes,
em->map_lookup->num_stripes);
ret = -EUCLEAN;
goto out;
}
}
out:
read_unlock(&em_tree->lock);
return ret;
}
/*
* Ensure that all dev extents are mapped to correct chunk, otherwise
* later chunk allocation/free would cause unexpected behavior.
*
* NOTE: This will iterate through the whole device tree, which should be of
* the same size level as the chunk tree. This slightly increases mount time.
*/
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
struct btrfs_path *path;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
u64 prev_devid = 0;
u64 prev_dev_ext_end = 0;
int ret = 0;
/*
* We don't have a dev_root because we mounted with ignorebadroots and
* failed to load the root, so we want to skip the verification in this
* case for sure.
*
* However if the dev root is fine, but the tree itself is corrupted
* we'd still fail to mount. This verification is only to make sure
* writes can happen safely, so instead just bypass this check
* completely in the case of IGNOREBADROOTS.
*/
if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
return 0;
key.objectid = 1;
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = READA_FORWARD;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
/* No dev extents at all? Not good */
if (ret > 0) {
ret = -EUCLEAN;
goto out;
}
}
while (1) {
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_dev_extent *dext;
int slot = path->slots[0];
u64 chunk_offset;
u64 physical_offset;
u64 physical_len;
u64 devid;
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.type != BTRFS_DEV_EXTENT_KEY)
break;
devid = key.objectid;
physical_offset = key.offset;
dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
physical_len = btrfs_dev_extent_length(leaf, dext);
/* Check if this dev extent overlaps with the previous one */
if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
devid, physical_offset, prev_dev_ext_end);
ret = -EUCLEAN;
goto out;
}
ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
physical_offset, physical_len);
if (ret < 0)
goto out;
prev_devid = devid;
prev_dev_ext_end = physical_offset + physical_len;
ret = btrfs_next_item(root, path);
if (ret < 0)
goto out;
if (ret > 0) {
ret = 0;
break;
}
}
/* Ensure all chunks have corresponding dev extents */
ret = verify_chunk_dev_extent_mapping(fs_info);
out:
btrfs_free_path(path);
return ret;
}
/*
* Check whether the given block group or device is pinned by any inode being
* used as a swapfile.
*/
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
struct btrfs_swapfile_pin *sp;
struct rb_node *node;
spin_lock(&fs_info->swapfile_pins_lock);
node = fs_info->swapfile_pins.rb_node;
while (node) {
sp = rb_entry(node, struct btrfs_swapfile_pin, node);
if (ptr < sp->ptr)
node = node->rb_left;
else if (ptr > sp->ptr)
node = node->rb_right;
else
break;
}
spin_unlock(&fs_info->swapfile_pins_lock);
return node != NULL;
}
static int relocating_repair_kthread(void *data)
{
struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
struct btrfs_fs_info *fs_info = cache->fs_info;
u64 target;
int ret = 0;
target = cache->start;
btrfs_put_block_group(cache);
sb_start_write(fs_info->sb);
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
btrfs_info(fs_info,
"zoned: skip relocating block group %llu to repair: EBUSY",
target);
sb_end_write(fs_info->sb);
return -EBUSY;
}
mutex_lock(&fs_info->reclaim_bgs_lock);
/* Ensure block group still exists */
cache = btrfs_lookup_block_group(fs_info, target);
if (!cache)
goto out;
if (!cache->relocating_repair)
goto out;
ret = btrfs_may_alloc_data_chunk(fs_info, target);
if (ret < 0)
goto out;
btrfs_info(fs_info,
"zoned: relocating block group %llu to repair IO failure",
target);
ret = btrfs_relocate_chunk(fs_info, target);
out:
if (cache)
btrfs_put_block_group(cache);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
sb_end_write(fs_info->sb);
return ret;
}
int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
struct btrfs_block_group *cache;
/* Do not attempt to repair in degraded state */
if (btrfs_test_opt(fs_info, DEGRADED))
return 0;
cache = btrfs_lookup_block_group(fs_info, logical);
if (!cache)
return 0;
spin_lock(&cache->lock);
if (cache->relocating_repair) {
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
return 0;
}
cache->relocating_repair = 1;
spin_unlock(&cache->lock);
kthread_run(relocating_repair_kthread, cache,
"btrfs-relocating-repair");
return 0;
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef INT_BLK_MQ_H
#define INT_BLK_MQ_H
#include "blk-stat.h"
#include "blk-mq-tag.h"
struct blk_mq_tag_set;
struct blk_mq_ctxs {
struct kobject kobj;
struct blk_mq_ctx __percpu *queue_ctx;
};
/**
* struct blk_mq_ctx - State for a software queue facing the submitting CPUs
*/
struct blk_mq_ctx {
struct {
spinlock_t lock;
struct list_head rq_lists[HCTX_MAX_TYPES];
} ____cacheline_aligned_in_smp;
unsigned int cpu;
unsigned short index_hw[HCTX_MAX_TYPES];
struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES];
/* incremented at dispatch time */
unsigned long rq_dispatched[2];
unsigned long rq_merged;
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
struct request_queue *queue;
struct blk_mq_ctxs *ctxs;
struct kobject kobj;
} ____cacheline_aligned_in_smp;
void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
unsigned int);
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
bool kick_requeue_list);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
void blk_mq_put_rq_ref(struct request *rq);
/*
* Internal helpers for allocating/freeing the request map
*/
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx);
void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx,
unsigned int nr_tags,
unsigned int reserved_tags,
unsigned int flags);
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx, unsigned int depth);
/*
* Internal helpers for request insertion into sw queues
*/
void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head);
void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list);
/* Used by blk_insert_cloned_request() to issue request directly */
blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last);
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list);
/*
* CPU -> queue mappings
*/
extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int);
/*
* blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue
* @q: request queue
* @type: the hctx type index
* @cpu: CPU
*/
static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q,
enum hctx_type type,
unsigned int cpu)
{
return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
}
/*
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
* @q: request queue
* @flags: request command flags
* @ctx: software queue cpu ctx
*/
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
unsigned int flags,
struct blk_mq_ctx *ctx)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
/*
* The caller ensure that if REQ_HIPRI, poll must be enabled.
*/
if (flags & REQ_HIPRI)
type = HCTX_TYPE_POLL;
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
return ctx->hctxs[type];
}
/*
* sysfs helpers
*/
extern void blk_mq_sysfs_init(struct request_queue *q);
extern void blk_mq_sysfs_deinit(struct request_queue *q);
extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
void blk_mq_cancel_work_sync(struct request_queue *q);
void blk_mq_release(struct request_queue *q);
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
unsigned int cpu)
{
return per_cpu_ptr(q->queue_ctx, cpu);
}
/*
* This assumes per-cpu software queueing queues. They could be per-node
* as well, for instance. For now this is hardcoded as-is. Note that we don't
* care about preemption, since we know the ctx's are persistent. This does
* mean that we can't rely on ctx always matching the currently running CPU.
*/
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
return __blk_mq_get_ctx(q, raw_smp_processor_id());
}
struct blk_mq_alloc_data {
/* input parameter */
struct request_queue *q;
blk_mq_req_flags_t flags;
unsigned int shallow_depth;
unsigned int cmd_flags;
/* input & output parameter */
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
};
static inline bool blk_mq_is_sbitmap_shared(unsigned int flags)
{
return flags & BLK_MQ_F_TAG_HCTX_SHARED;
}
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
if (data->q->elevator)
return data->hctx->sched_tags; return data->hctx->tags;
}
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
{
return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
{
return hctx->nr_ctx && hctx->tags;
}
unsigned int blk_mq_in_flight(struct request_queue *q,
struct block_device *part);
void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
unsigned int inflight[2]);
static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
int budget_token)
{
if (q->mq_ops->put_budget) q->mq_ops->put_budget(q, budget_token);
}
static inline int blk_mq_get_dispatch_budget(struct request_queue *q)
{
if (q->mq_ops->get_budget) return q->mq_ops->get_budget(q);
return 0;
}
static inline void blk_mq_set_rq_budget_token(struct request *rq, int token)
{
if (token < 0)
return;
if (rq->q->mq_ops->set_rq_budget_token) rq->q->mq_ops->set_rq_budget_token(rq, token);
}
static inline int blk_mq_get_rq_budget_token(struct request *rq)
{
if (rq->q->mq_ops->get_rq_budget_token) return rq->q->mq_ops->get_rq_budget_token(rq);
return -1;
}
static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_sbitmap_shared(hctx->flags))
atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap);
else
atomic_inc(&hctx->nr_active);
}
static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_sbitmap_shared(hctx->flags))
atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
else
atomic_dec(&hctx->nr_active);
}
static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_sbitmap_shared(hctx->flags))
return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap); return atomic_read(&hctx->nr_active);
}
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
rq->tag = BLK_MQ_NO_TAG;
if (rq->rq_flags & RQF_MQ_INFLIGHT) {
rq->rq_flags &= ~RQF_MQ_INFLIGHT;
__blk_mq_dec_active_requests(hctx);
}
}
static inline void blk_mq_put_driver_tag(struct request *rq)
{
if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG)
return;
__blk_mq_put_driver_tag(rq->mq_hctx, rq);
}
bool blk_mq_get_driver_tag(struct request *rq);
static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
{
int cpu;
for_each_possible_cpu(cpu)
qmap->mq_map[cpu] = 0;
}
/*
* blk_mq_plug() - Get caller context plug
* @q: request queue
* @bio : the bio being submitted by the caller context
*
* Plugging, by design, may delay the insertion of BIOs into the elevator in
* order to increase BIO merging opportunities. This however can cause BIO
* insertion order to change from the order in which submit_bio() is being
* executed in the case of multiple contexts concurrently issuing BIOs to a
* device, even if these context are synchronized to tightly control BIO issuing
* order. While this is not a problem with regular block devices, this ordering
* change can cause write BIO failures with zoned block devices as these
* require sequential write patterns to zones. Prevent this from happening by
* ignoring the plug state of a BIO issuing context if the target request queue
* is for a zoned block device and the BIO to plug is a write operation.
*
* Return current->plug if the bio can be plugged and NULL otherwise
*/
static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
struct bio *bio)
{
/*
* For regular block devices or read operations, use the context plug
* which may be NULL if blk_start_plug() was not executed.
*/
if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
return current->plug;
/* Zoned block device write operation case: do not plug the BIO */
return NULL;
}
/* Free all requests on the list */
static inline void blk_mq_free_requests(struct list_head *list)
{
while (!list_empty(list)) {
struct request *rq = list_entry_rq(list->next);
list_del_init(&rq->queuelist);
blk_mq_free_request(rq);
}
}
/*
* For shared tag users, we track the number of currently active users
* and attempt to provide a fair share of the tag depth for each of them.
*/
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
struct sbitmap_queue *bt)
{
unsigned int depth, users;
if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
return true;
/*
* Don't try dividing an ant
*/
if (bt->sb.depth == 1)
return true;
if (blk_mq_is_sbitmap_shared(hctx->flags)) { struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
return true;
users = atomic_read(&set->active_queues_shared_sbitmap);
} else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return true;
users = atomic_read(&hctx->tags->active_queues);
}
if (!users)
return true;
/*
* Allow at least some tags
*/
depth = max((bt->sb.depth + users - 1) / users, 4U); return __blk_mq_active_requests(hctx) < depth;
}
#endif
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Definitions for the 'struct sk_buff' memory handlers.
*
* Authors:
* Alan Cox, <gw4pts@gw4pts.ampr.org>
* Florian La Roche, <rzsfl@rz.uni-sb.de>
*/
#ifndef _LINUX_SKBUFF_H
#define _LINUX_SKBUFF_H
#include <linux/kernel.h>
#include <linux/compiler.h>
#include <linux/time.h>
#include <linux/bug.h>
#include <linux/bvec.h>
#include <linux/cache.h>
#include <linux/rbtree.h>
#include <linux/socket.h>
#include <linux/refcount.h>
#include <linux/atomic.h>
#include <asm/types.h>
#include <linux/spinlock.h>
#include <linux/net.h>
#include <linux/textsearch.h>
#include <net/checksum.h>
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/dma-mapping.h>
#include <linux/netdev_features.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <net/flow_dissector.h>
#include <linux/splice.h>
#include <linux/in6.h>
#include <linux/if_packet.h>
#include <net/flow.h>
#include <net/page_pool.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <linux/netfilter/nf_conntrack_common.h>
#endif
/* The interface for checksum offload between the stack and networking drivers
* is as follows...
*
* A. IP checksum related features
*
* Drivers advertise checksum offload capabilities in the features of a device.
* From the stack's point of view these are capabilities offered by the driver.
* A driver typically only advertises features that it is capable of offloading
* to its device.
*
* The checksum related features are:
*
* NETIF_F_HW_CSUM - The driver (or its device) is able to compute one
* IP (one's complement) checksum for any combination
* of protocols or protocol layering. The checksum is
* computed and set in a packet per the CHECKSUM_PARTIAL
* interface (see below).
*
* NETIF_F_IP_CSUM - Driver (device) is only able to checksum plain
* TCP or UDP packets over IPv4. These are specifically
* unencapsulated packets of the form IPv4|TCP or
* IPv4|UDP where the Protocol field in the IPv4 header
* is TCP or UDP. The IPv4 header may contain IP options.
* This feature cannot be set in features for a device
* with NETIF_F_HW_CSUM also set. This feature is being
* DEPRECATED (see below).
*
* NETIF_F_IPV6_CSUM - Driver (device) is only able to checksum plain
* TCP or UDP packets over IPv6. These are specifically
* unencapsulated packets of the form IPv6|TCP or
* IPv6|UDP where the Next Header field in the IPv6
* header is either TCP or UDP. IPv6 extension headers
* are not supported with this feature. This feature
* cannot be set in features for a device with
* NETIF_F_HW_CSUM also set. This feature is being
* DEPRECATED (see below).
*
* NETIF_F_RXCSUM - Driver (device) performs receive checksum offload.
* This flag is only used to disable the RX checksum
* feature for a device. The stack will accept receive
* checksum indication in packets received on a device
* regardless of whether NETIF_F_RXCSUM is set.
*
* B. Checksumming of received packets by device. Indication of checksum
* verification is set in skb->ip_summed. Possible values are:
*
* CHECKSUM_NONE:
*
* Device did not checksum this packet e.g. due to lack of capabilities.
* The packet contains full (though not verified) checksum in packet but
* not in skb->csum. Thus, skb->csum is undefined in this case.
*
* CHECKSUM_UNNECESSARY:
*
* The hardware you're dealing with doesn't calculate the full checksum
* (as in CHECKSUM_COMPLETE), but it does parse headers and verify checksums
* for specific protocols. For such packets it will set CHECKSUM_UNNECESSARY
* if their checksums are okay. skb->csum is still undefined in this case
* though. A driver or device must never modify the checksum field in the
* packet even if checksum is verified.
*
* CHECKSUM_UNNECESSARY is applicable to following protocols:
* TCP: IPv6 and IPv4.
* UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a
* zero UDP checksum for either IPv4 or IPv6, the networking stack
* may perform further validation in this case.
* GRE: only if the checksum is present in the header.
* SCTP: indicates the CRC in SCTP header has been validated.
* FCOE: indicates the CRC in FC frame has been validated.
*
* skb->csum_level indicates the number of consecutive checksums found in
* the packet minus one that have been verified as CHECKSUM_UNNECESSARY.
* For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet
* and a device is able to verify the checksums for UDP (possibly zero),
* GRE (checksum flag is set) and TCP, skb->csum_level would be set to
* two. If the device were only able to verify the UDP checksum and not
* GRE, either because it doesn't support GRE checksum or because GRE
* checksum is bad, skb->csum_level would be set to zero (TCP checksum is
* not considered in this case).
*
* CHECKSUM_COMPLETE:
*
* This is the most generic way. The device supplied checksum of the _whole_
* packet as seen by netif_rx() and fills in skb->csum. This means the
* hardware doesn't need to parse L3/L4 headers to implement this.
*
* Notes:
* - Even if device supports only some protocols, but is able to produce
* skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY.
* - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols.
*
* CHECKSUM_PARTIAL:
*
* A checksum is set up to be offloaded to a device as described in the
* output description for CHECKSUM_PARTIAL. This may occur on a packet
* received directly from another Linux OS, e.g., a virtualized Linux kernel
* on the same host, or it may be set in the input path in GRO or remote
* checksum offload. For the purposes of checksum verification, the checksum
* referred to by skb->csum_start + skb->csum_offset and any preceding
* checksums in the packet are considered verified. Any checksums in the
* packet that are after the checksum being offloaded are not considered to
* be verified.
*
* C. Checksumming on transmit for non-GSO. The stack requests checksum offload
* in the skb->ip_summed for a packet. Values are:
*
* CHECKSUM_PARTIAL:
*
* The driver is required to checksum the packet as seen by hard_start_xmit()
* from skb->csum_start up to the end, and to record/write the checksum at
* offset skb->csum_start + skb->csum_offset. A driver may verify that the
* csum_start and csum_offset values are valid values given the length and
* offset of the packet, but it should not attempt to validate that the
* checksum refers to a legitimate transport layer checksum -- it is the
* purview of the stack to validate that csum_start and csum_offset are set
* correctly.
*
* When the stack requests checksum offload for a packet, the driver MUST
* ensure that the checksum is set correctly. A driver can either offload the
* checksum calculation to the device, or call skb_checksum_help (in the case
* that the device does not support offload for a particular checksum).
*
* NETIF_F_IP_CSUM and NETIF_F_IPV6_CSUM are being deprecated in favor of
* NETIF_F_HW_CSUM. New devices should use NETIF_F_HW_CSUM to indicate
* checksum offload capability.
* skb_csum_hwoffload_help() can be called to resolve CHECKSUM_PARTIAL based
* on network device checksumming capabilities: if a packet does not match
* them, skb_checksum_help or skb_crc32c_help (depending on the value of
* csum_not_inet, see item D.) is called to resolve the checksum.
*
* CHECKSUM_NONE:
*
* The skb was already checksummed by the protocol, or a checksum is not
* required.
*
* CHECKSUM_UNNECESSARY:
*
* This has the same meaning as CHECKSUM_NONE for checksum offload on
* output.
*
* CHECKSUM_COMPLETE:
* Not used in checksum output. If a driver observes a packet with this value
* set in skbuff, it should treat the packet as if CHECKSUM_NONE were set.
*
* D. Non-IP checksum (CRC) offloads
*
* NETIF_F_SCTP_CRC - This feature indicates that a device is capable of
* offloading the SCTP CRC in a packet. To perform this offload the stack
* will set csum_start and csum_offset accordingly, set ip_summed to
* CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in
* the skbuff that the CHECKSUM_PARTIAL refers to CRC32c.
* A driver that supports both IP checksum offload and SCTP CRC32c offload
* must verify which offload is configured for a packet by testing the
* value of skb->csum_not_inet; skb_crc32c_csum_help is provided to resolve
* CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1.
*
* NETIF_F_FCOE_CRC - This feature indicates that a device is capable of
* offloading the FCOE CRC in a packet. To perform this offload the stack
* will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset
* accordingly. Note that there is no indication in the skbuff that the
* CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports
* both IP checksum offload and FCOE CRC offload must verify which offload
* is configured for a packet, presumably by inspecting packet headers.
*
* E. Checksumming on output with GSO.
*
* In the case of a GSO packet (skb_is_gso(skb) is true), checksum offload
* is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the
* gso_type is SKB_GSO_TCPV4 or SKB_GSO_TCPV6, TCP checksum offload as
* part of the GSO operation is implied. If a checksum is being offloaded
* with GSO then ip_summed is CHECKSUM_PARTIAL, and both csum_start and
* csum_offset are set to refer to the outermost checksum being offloaded
* (two offloaded checksums are possible with UDP encapsulation).
*/
/* Don't change this without changing skb_csum_unnecessary! */
#define CHECKSUM_NONE 0
#define CHECKSUM_UNNECESSARY 1
#define CHECKSUM_COMPLETE 2
#define CHECKSUM_PARTIAL 3
/* Maximum value in skb->csum_level */
#define SKB_MAX_CSUM_LEVEL 3
#define SKB_DATA_ALIGN(X) ALIGN(X, SMP_CACHE_BYTES)
#define SKB_WITH_OVERHEAD(X) \
((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
#define SKB_MAX_ORDER(X, ORDER) \
SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X))
#define SKB_MAX_HEAD(X) (SKB_MAX_ORDER((X), 0))
#define SKB_MAX_ALLOC (SKB_MAX_ORDER(0, 2))
/* return minimum truesize of one skb containing X bytes of data */
#define SKB_TRUESIZE(X) ((X) + \
SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
struct ahash_request;
struct net_device;
struct scatterlist;
struct pipe_inode_info;
struct iov_iter;
struct napi_struct;
struct bpf_prog;
union bpf_attr;
struct skb_ext;
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
struct nf_bridge_info {
enum {
BRNF_PROTO_UNCHANGED,
BRNF_PROTO_8021Q,
BRNF_PROTO_PPPOE
} orig_proto:8;
u8 pkt_otherhost:1;
u8 in_prerouting:1;
u8 bridged_dnat:1;
__u16 frag_max_size;
struct net_device *physindev;
/* always valid & non-NULL from FORWARD on, for physdev match */
struct net_device *physoutdev;
union {
/* prerouting: detect dnat in orig/reply direction */
__be32 ipv4_daddr;
struct in6_addr ipv6_daddr;
/* after prerouting + nat detected: store original source
* mac since neigh resolution overwrites it, only used while
* skb is out in neigh layer.
*/
char neigh_header[8];
};
};
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
/* Chain in tc_skb_ext will be used to share the tc chain with
* ovs recirc_id. It will be set to the current chain by tc
* and read by ovs to recirc_id.
*/
struct tc_skb_ext {
__u32 chain;
__u16 mru;
__u16 zone;
u8 post_ct:1;
u8 post_ct_snat:1;
u8 post_ct_dnat:1;
};
#endif
struct sk_buff_head {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
__u32 qlen;
spinlock_t lock;
};
struct sk_buff;
/* To allow 64K frame to be packed as single skb without frag_list we
* require 64K/PAGE_SIZE pages plus 1 additional page to allow for
* buffers which do not start on a page boundary.
*
* Since GRO uses frags we allocate at least 16 regardless of page
* size.
*/
#if (65536/PAGE_SIZE + 1) < 16
#define MAX_SKB_FRAGS 16UL
#else
#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
#endif
extern int sysctl_max_skb_frags;
/* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to
* segment using its current segmentation instead.
*/
#define GSO_BY_FRAGS 0xFFFF
typedef struct bio_vec skb_frag_t;
/**
* skb_frag_size() - Returns the size of a skb fragment
* @frag: skb fragment
*/
static inline unsigned int skb_frag_size(const skb_frag_t *frag)
{
return frag->bv_len;
}
/**
* skb_frag_size_set() - Sets the size of a skb fragment
* @frag: skb fragment
* @size: size of fragment
*/
static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size)
{
frag->bv_len = size;
}
/**
* skb_frag_size_add() - Increments the size of a skb fragment by @delta
* @frag: skb fragment
* @delta: value to add
*/
static inline void skb_frag_size_add(skb_frag_t *frag, int delta)
{
frag->bv_len += delta;
}
/**
* skb_frag_size_sub() - Decrements the size of a skb fragment by @delta
* @frag: skb fragment
* @delta: value to subtract
*/
static inline void skb_frag_size_sub(skb_frag_t *frag, int delta)
{
frag->bv_len -= delta;
}
/**
* skb_frag_must_loop - Test if %p is a high memory page
* @p: fragment's page
*/
static inline bool skb_frag_must_loop(struct page *p)
{
#if defined(CONFIG_HIGHMEM)
if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p))
return true;
#endif
return false;
}
/**
* skb_frag_foreach_page - loop over pages in a fragment
*
* @f: skb frag to operate on
* @f_off: offset from start of f->bv_page
* @f_len: length from f_off to loop over
* @p: (temp var) current page
* @p_off: (temp var) offset from start of current page,
* non-zero only on first page.
* @p_len: (temp var) length in current page,
* < PAGE_SIZE only on first and last page.
* @copied: (temp var) length so far, excluding current p_len.
*
* A fragment can hold a compound page, in which case per-page
* operations, notably kmap_atomic, must be called for each
* regular page.
*/
#define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied) \
for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT), \
p_off = (f_off) & (PAGE_SIZE - 1), \
p_len = skb_frag_must_loop(p) ? \
min_t(u32, f_len, PAGE_SIZE - p_off) : f_len, \
copied = 0; \
copied < f_len; \
copied += p_len, p++, p_off = 0, \
p_len = min_t(u32, f_len - copied, PAGE_SIZE)) \
#define HAVE_HW_TIME_STAMP
/**
* struct skb_shared_hwtstamps - hardware time stamps
* @hwtstamp: hardware time stamp transformed into duration
* since arbitrary point in time
*
* Software time stamps generated by ktime_get_real() are stored in
* skb->tstamp.
*
* hwtstamps can only be compared against other hwtstamps from
* the same device.
*
* This structure is attached to packets as part of the
* &skb_shared_info. Use skb_hwtstamps() to get a pointer.
*/
struct skb_shared_hwtstamps {
ktime_t hwtstamp;
};
/* Definitions for tx_flags in struct skb_shared_info */
enum {
/* generate hardware time stamp */
SKBTX_HW_TSTAMP = 1 << 0,
/* generate software time stamp when queueing packet to NIC */
SKBTX_SW_TSTAMP = 1 << 1,
/* device driver is going to provide hardware time stamp */
SKBTX_IN_PROGRESS = 1 << 2,
/* generate wifi status information (where possible) */
SKBTX_WIFI_STATUS = 1 << 4,
/* generate software time stamp when entering packet scheduling */
SKBTX_SCHED_TSTAMP = 1 << 6,
};
#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \
SKBTX_SCHED_TSTAMP)
#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)
/* Definitions for flags in struct skb_shared_info */
enum {
/* use zcopy routines */
SKBFL_ZEROCOPY_ENABLE = BIT(0),
/* This indicates at least one fragment might be overwritten
* (as in vmsplice(), sendfile() ...)
* If we need to compute a TX checksum, we'll need to copy
* all frags to avoid possible bad checksum
*/
SKBFL_SHARED_FRAG = BIT(1),
};
#define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
/*
* The callback notifies userspace to release buffers when skb DMA is done in
* lower device, the skb last reference should be 0 when calling this.
* The zerocopy_success argument is true if zero copy transmit occurred,
* false on data copy or out of memory error caused by data copy attempt.
* The ctx field is used to track device context.
* The desc field is used to track userspace buffer index.
*/
struct ubuf_info {
void (*callback)(struct sk_buff *, struct ubuf_info *,
bool zerocopy_success);
union {
struct {
unsigned long desc;
void *ctx;
};
struct {
u32 id;
u16 len;
u16 zerocopy:1;
u32 bytelen;
};
};
refcount_t refcnt;
u8 flags;
struct mmpin {
struct user_struct *user;
unsigned int num_pg;
} mmp;
};
#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);
struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size);
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
struct ubuf_info *uarg);
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
bool success);
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
struct ubuf_info *uarg);
/* This data is invariant across clones and lives at
* the end of the header data, ie. at skb->end.
*/
struct skb_shared_info {
__u8 flags;
__u8 meta_len;
__u8 nr_frags;
__u8 tx_flags;
unsigned short gso_size;
/* Warning: this field is not always filled in (UFO)! */
unsigned short gso_segs;
struct sk_buff *frag_list;
struct skb_shared_hwtstamps hwtstamps;
unsigned int gso_type;
u32 tskey;
/*
* Warning : all fields before dataref are cleared in __alloc_skb()
*/
atomic_t dataref;
/* Intermediate layers must ensure that destructor_arg
* remains valid until skb destructor */
void * destructor_arg;
/* must be last field, see pskb_expand_head() */
skb_frag_t frags[MAX_SKB_FRAGS];
};
/* We divide dataref into two halves. The higher 16 bits hold references
* to the payload part of skb->data. The lower 16 bits hold references to
* the entire skb->data. A clone of a headerless skb holds the length of
* the header in skb->hdr_len.
*
* All users must obey the rule that the skb->data reference count must be
* greater than or equal to the payload reference count.
*
* Holding a reference to the payload part means that the user does not
* care about modifications to the header part of skb->data.
*/
#define SKB_DATAREF_SHIFT 16
#define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1)
enum {
SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache) */
SKB_FCLONE_ORIG, /* orig skb (from fclone_cache) */
SKB_FCLONE_CLONE, /* companion fclone skb (from fclone_cache) */
};
enum {
SKB_GSO_TCPV4 = 1 << 0,
/* This indicates the skb is from an untrusted source. */
SKB_GSO_DODGY = 1 << 1,
/* This indicates the tcp segment has CWR set. */
SKB_GSO_TCP_ECN = 1 << 2,
SKB_GSO_TCP_FIXEDID = 1 << 3,
SKB_GSO_TCPV6 = 1 << 4,
SKB_GSO_FCOE = 1 << 5,
SKB_GSO_GRE = 1 << 6,
SKB_GSO_GRE_CSUM = 1 << 7,
SKB_GSO_IPXIP4 = 1 << 8,
SKB_GSO_IPXIP6 = 1 << 9,
SKB_GSO_UDP_TUNNEL = 1 << 10,
SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,
SKB_GSO_PARTIAL = 1 << 12,
SKB_GSO_TUNNEL_REMCSUM = 1 << 13,
SKB_GSO_SCTP = 1 << 14,
SKB_GSO_ESP = 1 << 15,
SKB_GSO_UDP = 1 << 16,
SKB_GSO_UDP_L4 = 1 << 17,
SKB_GSO_FRAGLIST = 1 << 18,
};
#if BITS_PER_LONG > 32
#define NET_SKBUFF_DATA_USES_OFFSET 1
#endif
#ifdef NET_SKBUFF_DATA_USES_OFFSET
typedef unsigned int sk_buff_data_t;
#else
typedef unsigned char *sk_buff_data_t;
#endif
/**
* struct sk_buff - socket buffer
* @next: Next buffer in list
* @prev: Previous buffer in list
* @tstamp: Time we arrived/left
* @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point
* for retransmit timer
* @rbnode: RB tree node, alternative to next/prev for netem/tcp
* @list: queue head
* @sk: Socket we are owned by
* @ip_defrag_offset: (aka @sk) alternate use of @sk, used in
* fragmentation management
* @dev: Device we arrived on/are leaving by
* @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
* @cb: Control buffer. Free for use by every layer. Put private vars here
* @_skb_refdst: destination entry (with norefcount bit)
* @sp: the security path, used for xfrm
* @len: Length of actual data
* @data_len: Data length
* @mac_len: Length of link layer header
* @hdr_len: writable header length of cloned skb
* @csum: Checksum (must include start/offset pair)
* @csum_start: Offset from skb->head where checksumming should start
* @csum_offset: Offset from csum_start where checksum should be stored
* @priority: Packet queueing priority
* @ignore_df: allow local fragmentation
* @cloned: Head may be cloned (check refcnt to be sure)
* @ip_summed: Driver fed us an IP checksum
* @nohdr: Payload reference only, must not modify header
* @pkt_type: Packet class
* @fclone: skbuff clone status
* @ipvs_property: skbuff is owned by ipvs
* @inner_protocol_type: whether the inner protocol is
* ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO
* @remcsum_offload: remote checksum offload is enabled
* @offload_fwd_mark: Packet was L2-forwarded in hardware
* @offload_l3_fwd_mark: Packet was L3-forwarded in hardware
* @tc_skip_classify: do not classify packet. set by IFB device
* @tc_at_ingress: used within tc_classify to distinguish in/egress
* @redirected: packet was redirected by packet classifier
* @from_ingress: packet was redirected from the ingress path
* @peeked: this packet has been seen already, so stats have been
* done for it, don't do them again
* @nf_trace: netfilter packet trace flag
* @protocol: Packet protocol from driver
* @destructor: Destruct function
* @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
* @_sk_redir: socket redirection information for skmsg
* @_nfct: Associated connection, if any (with nfctinfo bits)
* @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
* @skb_iif: ifindex of device we arrived on
* @tc_index: Traffic control index
* @hash: the packet hash
* @queue_mapping: Queue mapping for multiqueue devices
* @head_frag: skb was allocated from page fragments,
* not allocated by kmalloc() or vmalloc().
* @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
* @pp_recycle: mark the packet for recycling instead of freeing (implies
* page_pool support on driver)
* @active_extensions: active extensions (skb_ext_id types)
* @ndisc_nodetype: router type (from link layer)
* @ooo_okay: allow the mapping of a socket to a queue to be changed
* @l4_hash: indicate hash is a canonical 4-tuple hash over transport
* ports.
* @sw_hash: indicates hash was computed in software stack
* @wifi_acked_valid: wifi_acked was set
* @wifi_acked: whether frame was acked on wifi or not
* @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
* @encapsulation: indicates the inner headers in the skbuff are valid
* @encap_hdr_csum: software checksum is needed
* @csum_valid: checksum is already valid
* @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL
* @csum_complete_sw: checksum was completed by software
* @csum_level: indicates the number of consecutive checksums found in
* the packet minus one that have been verified as
* CHECKSUM_UNNECESSARY (max 3)
* @dst_pending_confirm: need to confirm neighbour
* @decrypted: Decrypted SKB
* @slow_gro: state present at GRO time, slower prepare step required
* @napi_id: id of the NAPI struct this skb came from
* @sender_cpu: (aka @napi_id) source CPU in XPS
* @secmark: security marking
* @mark: Generic packet mark
* @reserved_tailroom: (aka @mark) number of bytes of free space available
* at the tail of an sk_buff
* @vlan_present: VLAN tag is present
* @vlan_proto: vlan encapsulation protocol
* @vlan_tci: vlan tag control information
* @inner_protocol: Protocol (encapsulation)
* @inner_ipproto: (aka @inner_protocol) stores ipproto when
* skb->inner_protocol_type == ENCAP_TYPE_IPPROTO;
* @inner_transport_header: Inner transport layer header (encapsulation)
* @inner_network_header: Network layer header (encapsulation)
* @inner_mac_header: Link layer header (encapsulation)
* @transport_header: Transport layer header
* @network_header: Network layer header
* @mac_header: Link layer header
* @kcov_handle: KCOV remote handle for remote coverage collection
* @tail: Tail pointer
* @end: End pointer
* @head: Head of buffer
* @data: Data head pointer
* @truesize: Buffer size
* @users: User count - see {datagram,tcp}.c
* @extensions: allocated extensions, valid if active_extensions is nonzero
*/
struct sk_buff {
union {
struct {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
union {
struct net_device *dev;
/* Some protocols might use this space to store information,
* while device pointer would be NULL.
* UDP receive path is one user.
*/
unsigned long dev_scratch;
};
};
struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
struct list_head list;
};
union {
struct sock *sk;
int ip_defrag_offset;
};
union {
ktime_t tstamp;
u64 skb_mstamp_ns; /* earliest departure time */
};
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
char cb[48] __aligned(8);
union {
struct {
unsigned long _skb_refdst;
void (*destructor)(struct sk_buff *skb);
};
struct list_head tcp_tsorted_anchor;
#ifdef CONFIG_NET_SOCK_MSG
unsigned long _sk_redir;
#endif
};
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
unsigned long _nfct;
#endif
unsigned int len,
data_len;
__u16 mac_len,
hdr_len;
/* Following fields are _not_ copied in __copy_skb_header()
* Note that queue_mapping is here mostly to fill a hole.
*/
__u16 queue_mapping;
/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK (1 << 7)
#else
#define CLONED_MASK 1
#endif
#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset)
/* private: */
__u8 __cloned_offset[0];
/* public: */
__u8 cloned:1,
nohdr:1,
fclone:2,
peeked:1,
head_frag:1,
pfmemalloc:1,
pp_recycle:1; /* page_pool recycle indicator */
#ifdef CONFIG_SKB_EXTENSIONS
__u8 active_extensions;
#endif
/* fields enclosed in headers_start/headers_end are copied
* using a single memcpy() in __copy_skb_header()
*/
/* private: */
__u32 headers_start[0];
/* public: */
/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX (7 << 5)
#else
#define PKT_TYPE_MAX 7
#endif
#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset)
/* private: */
__u8 __pkt_type_offset[0];
/* public: */
__u8 pkt_type:3;
__u8 ignore_df:1;
__u8 nf_trace:1;
__u8 ip_summed:2;
__u8 ooo_okay:1;
__u8 l4_hash:1;
__u8 sw_hash:1;
__u8 wifi_acked_valid:1;
__u8 wifi_acked:1;
__u8 no_fcs:1;
/* Indicates the inner headers are valid in the skbuff. */
__u8 encapsulation:1;
__u8 encap_hdr_csum:1;
__u8 csum_valid:1;
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_VLAN_PRESENT_BIT 7
#else
#define PKT_VLAN_PRESENT_BIT 0
#endif
#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, __pkt_vlan_present_offset)
/* private: */
__u8 __pkt_vlan_present_offset[0];
/* public: */
__u8 vlan_present:1;
__u8 csum_complete_sw:1;
__u8 csum_level:2;
__u8 csum_not_inet:1;
__u8 dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
__u8 ipvs_property:1;
__u8 inner_protocol_type:1;
__u8 remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV
__u8 offload_fwd_mark:1;
__u8 offload_l3_fwd_mark:1;
#endif
#ifdef CONFIG_NET_CLS_ACT
__u8 tc_skip_classify:1;
__u8 tc_at_ingress:1;
#endif
__u8 redirected:1;
#ifdef CONFIG_NET_REDIRECT
__u8 from_ingress:1;
#endif
#ifdef CONFIG_TLS_DEVICE
__u8 decrypted:1;
#endif
__u8 slow_gro:1;
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#endif
union {
__wsum csum;
struct {
__u16 csum_start;
__u16 csum_offset;
};
};
__u32 priority;
int skb_iif;
__u32 hash;
__be16 vlan_proto;
__u16 vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
union {
unsigned int napi_id;
unsigned int sender_cpu;
};
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
union {
__u32 mark;
__u32 reserved_tailroom;
};
union {
__be16 inner_protocol;
__u8 inner_ipproto;
};
__u16 inner_transport_header;
__u16 inner_network_header;
__u16 inner_mac_header;
__be16 protocol;
__u16 transport_header;
__u16 network_header;
__u16 mac_header;
#ifdef CONFIG_KCOV
u64 kcov_handle;
#endif
/* private: */
__u32 headers_end[0];
/* public: */
/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail;
sk_buff_data_t end;
unsigned char *head,
*data;
unsigned int truesize;
refcount_t users;
#ifdef CONFIG_SKB_EXTENSIONS
/* only useable after checking ->active_extensions != 0 */
struct skb_ext *extensions;
#endif
};
#ifdef __KERNEL__
/*
* Handling routines are only of interest to the kernel
*/
#define SKB_ALLOC_FCLONE 0x01
#define SKB_ALLOC_RX 0x02
#define SKB_ALLOC_NAPI 0x04
/**
* skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves
* @skb: buffer
*/
static inline bool skb_pfmemalloc(const struct sk_buff *skb)
{
return unlikely(skb->pfmemalloc);
}
/*
* skb might have a dst pointer attached, refcounted or not.
* _skb_refdst low order bit is set if refcount was _not_ taken
*/
#define SKB_DST_NOREF 1UL
#define SKB_DST_PTRMASK ~(SKB_DST_NOREF)
/**
* skb_dst - returns skb dst_entry
* @skb: buffer
*
* Returns skb dst_entry, regardless of reference taken or not.
*/
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
/* If refdst was not refcounted, check we still are in a
* rcu_read_lock section
*/
WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) &&
!rcu_read_lock_held() &&
!rcu_read_lock_bh_held());
return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
}
/**
* skb_dst_set - sets skb dst
* @skb: buffer
* @dst: dst entry
*
* Sets skb dst, assuming a reference was taken on dst and should
* be released by skb_dst_drop()
*/
static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
skb->slow_gro |= !!dst;
skb->_skb_refdst = (unsigned long)dst;
}
/**
* skb_dst_set_noref - sets skb dst, hopefully, without taking reference
* @skb: buffer
* @dst: dst entry
*
* Sets skb dst, assuming a reference was not taken on dst.
* If dst entry is cached, we do not take reference and dst_release
* will be avoided by refdst_drop. If dst entry is not cached, we take
* reference, so that last dst_release can destroy the dst immediately.
*/
static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
{
WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
skb->slow_gro |= !!dst;
skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
}
/**
* skb_dst_is_noref - Test if skb dst isn't refcounted
* @skb: buffer
*/
static inline bool skb_dst_is_noref(const struct sk_buff *skb)
{
return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb);
}
/**
* skb_rtable - Returns the skb &rtable
* @skb: buffer
*/
static inline struct rtable *skb_rtable(const struct sk_buff *skb)
{
return (struct rtable *)skb_dst(skb);
}
/* For mangling skb->pkt_type from user space side from applications
* such as nft, tc, etc, we only allow a conservative subset of
* possible pkt_types to be set.
*/
static inline bool skb_pkt_type_ok(u32 ptype)
{
return ptype <= PACKET_OTHERHOST;
}
/**
* skb_napi_id - Returns the skb's NAPI id
* @skb: buffer
*/
static inline unsigned int skb_napi_id(const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
return skb->napi_id;
#else
return 0;
#endif
}
/**
* skb_unref - decrement the skb's reference count
* @skb: buffer
*
* Returns true if we can free the skb.
*/
static inline bool skb_unref(struct sk_buff *skb)
{
if (unlikely(!skb))
return false;
if (likely(refcount_read(&skb->users) == 1))
smp_rmb();
else if (likely(!refcount_dec_and_test(&skb->users)))
return false;
return true;
}
void skb_release_head_state(struct sk_buff *skb);
void kfree_skb(struct sk_buff *skb);
void kfree_skb_list(struct sk_buff *segs);
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt);
void skb_tx_error(struct sk_buff *skb);
#ifdef CONFIG_TRACEPOINTS
void consume_skb(struct sk_buff *skb);
#else
static inline void consume_skb(struct sk_buff *skb)
{
return kfree_skb(skb);
}
#endif
void __consume_stateless_skb(struct sk_buff *skb);
void __kfree_skb(struct sk_buff *skb);
extern struct kmem_cache *skbuff_head_cache;
void kfree_skb_partial(struct sk_buff *skb, bool head_stolen);
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
bool *fragstolen, int *delta_truesize);
struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags,
int node);
struct sk_buff *__build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb_around(struct sk_buff *skb,
void *data, unsigned int frag_size);
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size);
/**
* alloc_skb - allocate a network buffer
* @size: size to allocate
* @priority: allocation mask
*
* This function is a convenient wrapper around __alloc_skb().
*/
static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
{
return __alloc_skb(size, priority, 0, NUMA_NO_NODE);
}
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
unsigned long data_len,
int max_page_order,
int *errcode,
gfp_t gfp_mask);
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first);
/* Layout of fast clones : [skb1][skb2][fclone_ref] */
struct sk_buff_fclones {
struct sk_buff skb1;
struct sk_buff skb2;
refcount_t fclone_ref;
};
/**
* skb_fclone_busy - check if fclone is busy
* @sk: socket
* @skb: buffer
*
* Returns true if skb is a fast clone, and its clone is not freed.
* Some drivers call skb_orphan() in their ndo_start_xmit(),
* so we also check that this didnt happen.
*/
static inline bool skb_fclone_busy(const struct sock *sk,
const struct sk_buff *skb)
{
const struct sk_buff_fclones *fclones;
fclones = container_of(skb, struct sk_buff_fclones, skb1);
return skb->fclone == SKB_FCLONE_ORIG &&
refcount_read(&fclones->fclone_ref) > 1 &&
READ_ONCE(fclones->skb2.sk) == sk;
}
/**
* alloc_skb_fclone - allocate a network buffer from fclone cache
* @size: size to allocate
* @priority: allocation mask
*
* This function is a convenient wrapper around __alloc_skb().
*/
static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
gfp_t priority)
{
return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
}
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
void skb_headers_offset_update(struct sk_buff *skb, int off);
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority);
struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
gfp_t gfp_mask, bool fclone);
static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom,
gfp_t gfp_mask)
{
return __pskb_copy_fclone(skb, headroom, gfp_mask, false);
}
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask);
struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
unsigned int headroom);
struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom);
struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom,
int newtailroom, gfp_t priority);
int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
int offset, int len);
int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg,
int offset, int len);
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error);
/**
* skb_pad - zero pad the tail of an skb
* @skb: buffer to pad
* @pad: space to pad
*
* Ensure that a buffer is followed by a padding area that is zero
* filled. Used by network drivers which may DMA or transfer data
* beyond the buffer end onto the wire.
*
* May return error in out of memory cases. The skb is freed on error.
*/
static inline int skb_pad(struct sk_buff *skb, int pad)
{
return __skb_pad(skb, pad, true);
}
#define dev_kfree_skb(a) consume_skb(a)
int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
int offset, size_t size);
struct skb_seq_state {
__u32 lower_offset;
__u32 upper_offset;
__u32 frag_idx;
__u32 stepped_offset;
struct sk_buff *root_skb;
struct sk_buff *cur_skb;
__u8 *frag_data;
__u32 frag_off;
};
void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
unsigned int to, struct skb_seq_state *st);
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
struct skb_seq_state *st);
void skb_abort_seq_read(struct skb_seq_state *st);
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
unsigned int to, struct ts_config *config);
/*
* Packet hash types specify the type of hash in skb_set_hash.
*
* Hash types refer to the protocol layer addresses which are used to
* construct a packet's hash. The hashes are used to differentiate or identify
* flows of the protocol layer for the hash type. Hash types are either
* layer-2 (L2), layer-3 (L3), or layer-4 (L4).
*
* Properties of hashes:
*
* 1) Two packets in different flows have different hash values
* 2) Two packets in the same flow should have the same hash value
*
* A hash at a higher layer is considered to be more specific. A driver should
* set the most specific hash possible.
*
* A driver cannot indicate a more specific hash than the layer at which a hash
* was computed. For instance an L3 hash cannot be set as an L4 hash.
*
* A driver may indicate a hash level which is less specific than the
* actual layer the hash was computed on. For instance, a hash computed
* at L4 may be considered an L3 hash. This should only be done if the
* driver can't unambiguously determine that the HW computed the hash at
* the higher layer. Note that the "should" in the second property above
* permits this.
*/
enum pkt_hash_types {
PKT_HASH_TYPE_NONE, /* Undefined type */
PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */
PKT_HASH_TYPE_L3, /* Input: src_IP, dst_IP */
PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */
};
static inline void skb_clear_hash(struct sk_buff *skb)
{
skb->hash = 0;
skb->sw_hash = 0;
skb->l4_hash = 0;
}
static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
{
if (!skb->l4_hash)
skb_clear_hash(skb);
}
static inline void
__skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4)
{
skb->l4_hash = is_l4;
skb->sw_hash = is_sw;
skb->hash = hash;
}
static inline void
skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
{
/* Used by drivers to set hash from HW */
__skb_set_hash(skb, hash, false, type == PKT_HASH_TYPE_L4);
}
static inline void
__skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
{
__skb_set_hash(skb, hash, true, is_l4);
}
void __skb_get_hash(struct sk_buff *skb);
u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
u32 skb_get_poff(const struct sk_buff *skb);
u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
const struct flow_keys_basic *keys, int hlen);
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
const void *data, int hlen_proto);
static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
int thoff, u8 ip_proto)
{
return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
}
void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
const struct flow_dissector_key *key,
unsigned int key_count);
struct bpf_flow_dissector;
bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
__be16 proto, int nhoff, int hlen, unsigned int flags);
bool __skb_flow_dissect(const struct net *net,
const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container, const void *data,
__be16 proto, int nhoff, int hlen, unsigned int flags);
static inline bool skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container, unsigned int flags)
{
return __skb_flow_dissect(NULL, skb, flow_dissector,
target_container, NULL, 0, 0, 0, flags);
}
static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
struct flow_keys *flow,
unsigned int flags)
{
memset(flow, 0, sizeof(*flow));
return __skb_flow_dissect(NULL, skb, &flow_keys_dissector,
flow, NULL, 0, 0, 0, flags);
}
static inline bool
skb_flow_dissect_flow_keys_basic(const struct net *net,
const struct sk_buff *skb,
struct flow_keys_basic *flow,
const void *data, __be16 proto,
int nhoff, int hlen, unsigned int flags)
{
memset(flow, 0, sizeof(*flow));
return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow,
data, proto, nhoff, hlen, flags);
}
void skb_flow_dissect_meta(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container);
/* Gets a skb connection tracking info, ctinfo map should be a
* map of mapsize to translate enum ip_conntrack_info states
* to user states.
*/
void
skb_flow_dissect_ct(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container,
u16 *ctinfo_map, size_t mapsize,
bool post_ct, u16 zone);
void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container);
void skb_flow_dissect_hash(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container);
static inline __u32 skb_get_hash(struct sk_buff *skb)
{
if (!skb->l4_hash && !skb->sw_hash) __skb_get_hash(skb); return skb->hash;
}
static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
{
if (!skb->l4_hash && !skb->sw_hash) {
struct flow_keys keys;
__u32 hash = __get_hash_from_flowi6(fl6, &keys);
__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}
return skb->hash;
}
__u32 skb_get_hash_perturb(const struct sk_buff *skb,
const siphash_key_t *perturb);
static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
{
return skb->hash;
}
static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
{
to->hash = from->hash;
to->sw_hash = from->sw_hash;
to->l4_hash = from->l4_hash;
};
static inline void skb_copy_decrypted(struct sk_buff *to,
const struct sk_buff *from)
{
#ifdef CONFIG_TLS_DEVICE
to->decrypted = from->decrypted;
#endif
}
#ifdef NET_SKBUFF_DATA_USES_OFFSET
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
return skb->head + skb->end;
}
static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
return skb->end;
}
static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
{
skb->end = offset;
}
#else
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
return skb->end;
}
static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
return skb->end - skb->head;
}
static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
{
skb->end = skb->head + offset;
}
#endif
/* Internal */
#define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB)))
static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb)
{
return &skb_shinfo(skb)->hwtstamps;
}
static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
{
bool is_zcopy = skb && skb_shinfo(skb)->flags & SKBFL_ZEROCOPY_ENABLE; return is_zcopy ? skb_uarg(skb) : NULL;
}
static inline void net_zcopy_get(struct ubuf_info *uarg)
{
refcount_inc(&uarg->refcnt);
}
static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg)
{
skb_shinfo(skb)->destructor_arg = uarg;
skb_shinfo(skb)->flags |= uarg->flags;
}
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
bool *have_ref)
{
if (skb && uarg && !skb_zcopy(skb)) {
if (unlikely(have_ref && *have_ref))
*have_ref = false;
else
net_zcopy_get(uarg);
skb_zcopy_init(skb, uarg);
}
}
static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
{
skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL);
skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG;
}
static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb)
{
return (uintptr_t) skb_shinfo(skb)->destructor_arg & 0x1UL;
}
static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb)
{
return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL);
}
static inline void net_zcopy_put(struct ubuf_info *uarg)
{
if (uarg) uarg->callback(NULL, uarg, true);
}
static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
if (uarg) { if (uarg->callback == msg_zerocopy_callback) msg_zerocopy_put_abort(uarg, have_uref); else if (have_uref)
net_zcopy_put(uarg);
}
}
/* Release a reference on a zerocopy structure */
static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
{
struct ubuf_info *uarg = skb_zcopy(skb);
if (uarg) {
if (!skb_zcopy_is_nouarg(skb))
uarg->callback(skb, uarg, zerocopy_success); skb_shinfo(skb)->flags &= ~SKBFL_ZEROCOPY_FRAG;
}
}
static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
skb->next = NULL;
}
/* Iterate through singly-linked GSO fragments of an skb. */
#define skb_list_walk_safe(first, skb, next_skb) \
for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb); \
(skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL)
static inline void skb_list_del_init(struct sk_buff *skb)
{
__list_del_entry(&skb->list);
skb_mark_not_on_list(skb);
}
/**
* skb_queue_empty - check if a queue is empty
* @list: queue head
*
* Returns true if the queue is empty, false otherwise.
*/
static inline int skb_queue_empty(const struct sk_buff_head *list)
{
return list->next == (const struct sk_buff *) list;
}
/**
* skb_queue_empty_lockless - check if a queue is empty
* @list: queue head
*
* Returns true if the queue is empty, false otherwise.
* This variant can be used in lockless contexts.
*/
static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list)
{
return READ_ONCE(list->next) == (const struct sk_buff *) list;
}
/**
* skb_queue_is_last - check if skb is the last entry in the queue
* @list: queue head
* @skb: buffer
*
* Returns true if @skb is the last buffer on the list.
*/
static inline bool skb_queue_is_last(const struct sk_buff_head *list,
const struct sk_buff *skb)
{
return skb->next == (const struct sk_buff *) list;
}
/**
* skb_queue_is_first - check if skb is the first entry in the queue
* @list: queue head
* @skb: buffer
*
* Returns true if @skb is the first buffer on the list.
*/
static inline bool skb_queue_is_first(const struct sk_buff_head *list,
const struct sk_buff *skb)
{
return skb->prev == (const struct sk_buff *) list;
}
/**
* skb_queue_next - return the next packet in the queue
* @list: queue head
* @skb: current buffer
*
* Return the next packet in @list after @skb. It is only valid to
* call this if skb_queue_is_last() evaluates to false.
*/
static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list,
const struct sk_buff *skb)
{
/* This BUG_ON may seem severe, but if we just return then we
* are going to dereference garbage.
*/
BUG_ON(skb_queue_is_last(list, skb));
return skb->next;
}
/**
* skb_queue_prev - return the prev packet in the queue
* @list: queue head
* @skb: current buffer
*
* Return the prev packet in @list before @skb. It is only valid to
* call this if skb_queue_is_first() evaluates to false.
*/
static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list,
const struct sk_buff *skb)
{
/* This BUG_ON may seem severe, but if we just return then we
* are going to dereference garbage.
*/
BUG_ON(skb_queue_is_first(list, skb));
return skb->prev;
}
/**
* skb_get - reference buffer
* @skb: buffer to reference
*
* Makes another reference to a socket buffer and returns a pointer
* to the buffer.
*/
static inline struct sk_buff *skb_get(struct sk_buff *skb)
{
refcount_inc(&skb->users);
return skb;
}
/*
* If users == 1, we are the only owner and can avoid redundant atomic changes.
*/
/**
* skb_cloned - is the buffer a clone
* @skb: buffer to check
*
* Returns true if the buffer was generated with skb_clone() and is
* one of multiple shared copies of the buffer. Cloned buffers are
* shared data so must not be written to under normal circumstances.
*/
static inline int skb_cloned(const struct sk_buff *skb)
{
return skb->cloned && (atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK) != 1;
}
static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
{
might_sleep_if(gfpflags_allow_blocking(pri));
if (skb_cloned(skb))
return pskb_expand_head(skb, 0, 0, pri);
return 0;
}
/* This variant of skb_unclone() makes sure skb->truesize
* and skb_end_offset() are not changed, whenever a new skb->head is needed.
*
* Indeed there is no guarantee that ksize(kmalloc(X)) == ksize(kmalloc(X))
* when various debugging features are in place.
*/
int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri);
static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
{
might_sleep_if(gfpflags_allow_blocking(pri));
if (skb_cloned(skb))
return __skb_unclone_keeptruesize(skb, pri);
return 0;
}
/**
* skb_header_cloned - is the header a clone
* @skb: buffer to check
*
* Returns true if modifying the header part of the buffer requires
* the data to be copied.
*/
static inline int skb_header_cloned(const struct sk_buff *skb)
{
int dataref;
if (!skb->cloned)
return 0;
dataref = atomic_read(&skb_shinfo(skb)->dataref);
dataref = (dataref & SKB_DATAREF_MASK) - (dataref >> SKB_DATAREF_SHIFT);
return dataref != 1;
}
static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
{
might_sleep_if(gfpflags_allow_blocking(pri));
if (skb_header_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri);
return 0;
}
/**
* __skb_header_release - release reference to header
* @skb: buffer to operate on
*/
static inline void __skb_header_release(struct sk_buff *skb)
{
skb->nohdr = 1;
atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT));
}
/**
* skb_shared - is the buffer shared
* @skb: buffer to check
*
* Returns true if more than one person has a reference to this
* buffer.
*/
static inline int skb_shared(const struct sk_buff *skb)
{
return refcount_read(&skb->users) != 1;
}
/**
* skb_share_check - check if buffer is shared and if so clone it
* @skb: buffer to check
* @pri: priority for memory allocation
*
* If the buffer is shared the buffer is cloned and the old copy
* drops a reference. A new clone with a single reference is returned.
* If the buffer is not shared the original buffer is returned. When
* being called from interrupt status or with spinlocks held pri must
* be GFP_ATOMIC.
*
* NULL is returned on a memory allocation failure.
*/
static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
{
might_sleep_if(gfpflags_allow_blocking(pri));
if (skb_shared(skb)) {
struct sk_buff *nskb = skb_clone(skb, pri);
if (likely(nskb))
consume_skb(skb);
else
kfree_skb(skb);
skb = nskb;
}
return skb;
}
/*
* Copy shared buffers into a new sk_buff. We effectively do COW on
* packets to handle cases where we have a local reader and forward
* and a couple of other messy ones. The normal one is tcpdumping
* a packet thats being forwarded.
*/
/**
* skb_unshare - make a copy of a shared buffer
* @skb: buffer to check
* @pri: priority for memory allocation
*
* If the socket buffer is a clone then this function creates a new
* copy of the data, drops a reference count on the old copy and returns
* the new copy with the reference count at 1. If the buffer is not a clone
* the original buffer is returned. When called with a spinlock held or
* from interrupt state @pri must be %GFP_ATOMIC
*
* %NULL is returned on a memory allocation failure.
*/
static inline struct sk_buff *skb_unshare(struct sk_buff *skb,
gfp_t pri)
{
might_sleep_if(gfpflags_allow_blocking(pri));
if (skb_cloned(skb)) {
struct sk_buff *nskb = skb_copy(skb, pri);
/* Free our shared copy */
if (likely(nskb))
consume_skb(skb);
else
kfree_skb(skb);
skb = nskb;
}
return skb;
}
/**
* skb_peek - peek at the head of an &sk_buff_head
* @list_: list to peek at
*
* Peek an &sk_buff. Unlike most other operations you _MUST_
* be careful with this one. A peek leaves the buffer on the
* list and someone else may run off with it. You must hold
* the appropriate locks or have a private queue to do this.
*
* Returns %NULL for an empty list or a pointer to the head element.
* The reference count is not incremented and the reference is therefore
* volatile. Use with caution.
*/
static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_)
{
struct sk_buff *skb = list_->next;
if (skb == (struct sk_buff *)list_)
skb = NULL;
return skb;
}
/**
* __skb_peek - peek at the head of a non-empty &sk_buff_head
* @list_: list to peek at
*
* Like skb_peek(), but the caller knows that the list is not empty.
*/
static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_)
{
return list_->next;
}
/**
* skb_peek_next - peek skb following the given one from a queue
* @skb: skb to start from
* @list_: list to peek at
*
* Returns %NULL when the end of the list is met or a pointer to the
* next element. The reference count is not incremented and the
* reference is therefore volatile. Use with caution.
*/
static inline struct sk_buff *skb_peek_next(struct sk_buff *skb,
const struct sk_buff_head *list_)
{
struct sk_buff *next = skb->next;
if (next == (struct sk_buff *)list_)
next = NULL;
return next;
}
/**
* skb_peek_tail - peek at the tail of an &sk_buff_head
* @list_: list to peek at
*
* Peek an &sk_buff. Unlike most other operations you _MUST_
* be careful with this one. A peek leaves the buffer on the
* list and someone else may run off with it. You must hold
* the appropriate locks or have a private queue to do this.
*
* Returns %NULL for an empty list or a pointer to the tail element.
* The reference count is not incremented and the reference is therefore
* volatile. Use with caution.
*/
static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_)
{
struct sk_buff *skb = READ_ONCE(list_->prev);
if (skb == (struct sk_buff *)list_)
skb = NULL;
return skb;
}
/**
* skb_queue_len - get queue length
* @list_: list to measure
*
* Return the length of an &sk_buff queue.
*/
static inline __u32 skb_queue_len(const struct sk_buff_head *list_)
{
return list_->qlen;
}
/**
* skb_queue_len_lockless - get queue length
* @list_: list to measure
*
* Return the length of an &sk_buff queue.
* This variant can be used in lockless contexts.
*/
static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_)
{
return READ_ONCE(list_->qlen);
}
/**
* __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head
* @list: queue to initialize
*
* This initializes only the list and queue length aspects of
* an sk_buff_head object. This allows to initialize the list
* aspects of an sk_buff_head without reinitializing things like
* the spinlock. It can also be used for on-stack sk_buff_head
* objects where the spinlock is known to not be used.
*/
static inline void __skb_queue_head_init(struct sk_buff_head *list)
{
list->prev = list->next = (struct sk_buff *)list;
list->qlen = 0;
}
/*
* This function creates a split out lock class for each invocation;
* this is needed for now since a whole lot of users of the skb-queue
* infrastructure in drivers have different locking usage (in hardirq)
* than the networking core (in softirq only). In the long run either the
* network layer or drivers should need annotation to consolidate the
* main types of usage into 3 classes.
*/
static inline void skb_queue_head_init(struct sk_buff_head *list)
{
spin_lock_init(&list->lock);
__skb_queue_head_init(list);
}
static inline void skb_queue_head_init_class(struct sk_buff_head *list,
struct lock_class_key *class)
{
skb_queue_head_init(list);
lockdep_set_class(&list->lock, class);
}
/*
* Insert an sk_buff on a list.
*
* The "__skb_xxxx()" functions are the non-atomic ones that
* can only be called with interrupts disabled.
*/
static inline void __skb_insert(struct sk_buff *newsk,
struct sk_buff *prev, struct sk_buff *next,
struct sk_buff_head *list)
{
/* See skb_queue_empty_lockless() and skb_peek_tail()
* for the opposite READ_ONCE()
*/
WRITE_ONCE(newsk->next, next);
WRITE_ONCE(newsk->prev, prev);
WRITE_ONCE(next->prev, newsk);
WRITE_ONCE(prev->next, newsk);
WRITE_ONCE(list->qlen, list->qlen + 1);
}
static inline void __skb_queue_splice(const struct sk_buff_head *list,
struct sk_buff *prev,
struct sk_buff *next)
{
struct sk_buff *first = list->next;
struct sk_buff *last = list->prev;
WRITE_ONCE(first->prev, prev);
WRITE_ONCE(prev->next, first);
WRITE_ONCE(last->next, next);
WRITE_ONCE(next->prev, last);
}
/**
* skb_queue_splice - join two skb lists, this is designed for stacks
* @list: the new list to add
* @head: the place to add it in the first list
*/
static inline void skb_queue_splice(const struct sk_buff_head *list,
struct sk_buff_head *head)
{
if (!skb_queue_empty(list)) {
__skb_queue_splice(list, (struct sk_buff *) head, head->next);
head->qlen += list->qlen;
}
}
/**
* skb_queue_splice_init - join two skb lists and reinitialise the emptied list
* @list: the new list to add
* @head: the place to add it in the first list
*
* The list at @list is reinitialised
*/
static inline void skb_queue_splice_init(struct sk_buff_head *list,
struct sk_buff_head *head)
{
if (!skb_queue_empty(list)) {
__skb_queue_splice(list, (struct sk_buff *) head, head->next);
head->qlen += list->qlen;
__skb_queue_head_init(list);
}
}
/**
* skb_queue_splice_tail - join two skb lists, each list being a queue
* @list: the new list to add
* @head: the place to add it in the first list
*/
static inline void skb_queue_splice_tail(const struct sk_buff_head *list,
struct sk_buff_head *head)
{
if (!skb_queue_empty(list)) {
__skb_queue_splice(list, head->prev, (struct sk_buff *) head);
head->qlen += list->qlen;
}
}
/**
* skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list
* @list: the new list to add
* @head: the place to add it in the first list
*
* Each of the lists is a queue.
* The list at @list is reinitialised
*/
static inline void skb_queue_splice_tail_init(struct sk_buff_head *list,
struct sk_buff_head *head)
{
if (!skb_queue_empty(list)) {
__skb_queue_splice(list, head->prev, (struct sk_buff *) head);
head->qlen += list->qlen;
__skb_queue_head_init(list);
}
}
/**
* __skb_queue_after - queue a buffer at the list head
* @list: list to use
* @prev: place after this buffer
* @newsk: buffer to queue
*
* Queue a buffer int the middle of a list. This function takes no locks
* and you must therefore hold required locks before calling it.
*
* A buffer cannot be placed on two lists at the same time.
*/
static inline void __skb_queue_after(struct sk_buff_head *list,
struct sk_buff *prev,
struct sk_buff *newsk)
{
__skb_insert(newsk, prev, prev->next, list);
}
void skb_append(struct sk_buff *old, struct sk_buff *newsk,
struct sk_buff_head *list);
static inline void __skb_queue_before(struct sk_buff_head *list,
struct sk_buff *next,
struct sk_buff *newsk)
{
__skb_insert(newsk, next->prev, next, list);
}
/**
* __skb_queue_head - queue a buffer at the list head
* @list: list to use
* @newsk: buffer to queue
*
* Queue a buffer at the start of a list. This function takes no locks
* and you must therefore hold required locks before calling it.
*
* A buffer cannot be placed on two lists at the same time.
*/
static inline void __skb_queue_head(struct sk_buff_head *list,
struct sk_buff *newsk)
{
__skb_queue_after(list, (struct sk_buff *)list, newsk);
}
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);
/**
* __skb_queue_tail - queue a buffer at the list tail
* @list: list to use
* @newsk: buffer to queue
*
* Queue a buffer at the end of a list. This function takes no locks
* and you must therefore hold required locks before calling it.
*
* A buffer cannot be placed on two lists at the same time.
*/
static inline void __skb_queue_tail(struct sk_buff_head *list,
struct sk_buff *newsk)
{
__skb_queue_before(list, (struct sk_buff *)list, newsk);
}
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk);
/*
* remove sk_buff from list. _Must_ be called atomically, and with
* the list known..
*/
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list);
static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
struct sk_buff *next, *prev;
WRITE_ONCE(list->qlen, list->qlen - 1);
next = skb->next;
prev = skb->prev;
skb->next = skb->prev = NULL;
WRITE_ONCE(next->prev, prev);
WRITE_ONCE(prev->next, next);
}
/**
* __skb_dequeue - remove from the head of the queue
* @list: list to dequeue from
*
* Remove the head of the list. This function does not take any locks
* so must be used with appropriate locks held only. The head item is
* returned or %NULL if the list is empty.
*/
static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
{
struct sk_buff *skb = skb_peek(list);
if (skb)
__skb_unlink(skb, list);
return skb;
}
struct sk_buff *skb_dequeue(struct sk_buff_head *list);
/**
* __skb_dequeue_tail - remove from the tail of the queue
* @list: list to dequeue from
*
* Remove the tail of the list. This function does not take any locks
* so must be used with appropriate locks held only. The tail item is
* returned or %NULL if the list is empty.
*/
static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list)
{
struct sk_buff *skb = skb_peek_tail(list);
if (skb)
__skb_unlink(skb, list);
return skb;
}
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list);
static inline bool skb_is_nonlinear(const struct sk_buff *skb)
{
return skb->data_len;
}
static inline unsigned int skb_headlen(const struct sk_buff *skb)
{
return skb->len - skb->data_len;
}
static inline unsigned int __skb_pagelen(const struct sk_buff *skb)
{
unsigned int i, len = 0;
for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--)
len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
return len;
}
static inline unsigned int skb_pagelen(const struct sk_buff *skb)
{
return skb_headlen(skb) + __skb_pagelen(skb);
}
/**
* __skb_fill_page_desc - initialise a paged fragment in an skb
* @skb: buffer containing fragment to be initialised
* @i: paged fragment index to initialise
* @page: the page to use for this fragment
* @off: the offset to the data with @page
* @size: the length of the data
*
* Initialises the @i'th fragment of @skb to point to &size bytes at
* offset @off within @page.
*
* Does not take any additional reference on the fragment.
*/
static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
struct page *page, int off, int size)
{
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
/*
* Propagate page pfmemalloc to the skb if we can. The problem is
* that not all callers have unique ownership of the page but rely
* on page_is_pfmemalloc doing the right thing(tm).
*/
frag->bv_page = page;
frag->bv_offset = off;
skb_frag_size_set(frag, size);
page = compound_head(page);
if (page_is_pfmemalloc(page)) skb->pfmemalloc = true;
}
/**
* skb_fill_page_desc - initialise a paged fragment in an skb
* @skb: buffer containing fragment to be initialised
* @i: paged fragment index to initialise
* @page: the page to use for this fragment
* @off: the offset to the data with @page
* @size: the length of the data
*
* As per __skb_fill_page_desc() -- initialises the @i'th fragment of
* @skb to point to @size bytes at offset @off within @page. In
* addition updates @skb such that @i is the last fragment.
*
* Does not take any additional reference on the fragment.
*/
static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
struct page *page, int off, int size)
{
__skb_fill_page_desc(skb, i, page, off, size);
skb_shinfo(skb)->nr_frags = i + 1;
}
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size, unsigned int truesize);
void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
unsigned int truesize);
#define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb))
#ifdef NET_SKBUFF_DATA_USES_OFFSET
static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
{
return skb->head + skb->tail;
}
static inline void skb_reset_tail_pointer(struct sk_buff *skb)
{
skb->tail = skb->data - skb->head;
}
static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
{
skb_reset_tail_pointer(skb);
skb->tail += offset;
}
#else /* NET_SKBUFF_DATA_USES_OFFSET */
static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
{
return skb->tail;
}
static inline void skb_reset_tail_pointer(struct sk_buff *skb)
{
skb->tail = skb->data;
}
static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
{
skb->tail = skb->data + offset;
}
#endif /* NET_SKBUFF_DATA_USES_OFFSET */
/*
* Add data to an sk_buff
*/
void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len);
void *skb_put(struct sk_buff *skb, unsigned int len);
static inline void *__skb_put(struct sk_buff *skb, unsigned int len)
{
void *tmp = skb_tail_pointer(skb);
SKB_LINEAR_ASSERT(skb);
skb->tail += len;
skb->len += len;
return tmp;
}
static inline void *__skb_put_zero(struct sk_buff *skb, unsigned int len)
{
void *tmp = __skb_put(skb, len);
memset(tmp, 0, len);
return tmp;
}
static inline void *__skb_put_data(struct sk_buff *skb, const void *data,
unsigned int len)
{
void *tmp = __skb_put(skb, len);
memcpy(tmp, data, len);
return tmp;
}
static inline void __skb_put_u8(struct sk_buff *skb, u8 val)
{
*(u8 *)__skb_put(skb, 1) = val;
}
static inline void *skb_put_zero(struct sk_buff *skb, unsigned int len)
{
void *tmp = skb_put(skb, len);
memset(tmp, 0, len);
return tmp;
}
static inline void *skb_put_data(struct sk_buff *skb, const void *data,
unsigned int len)
{
void *tmp = skb_put(skb, len);
memcpy(tmp, data, len);
return tmp;
}
static inline void skb_put_u8(struct sk_buff *skb, u8 val)
{
*(u8 *)skb_put(skb, 1) = val;
}
void *skb_push(struct sk_buff *skb, unsigned int len);
static inline void *__skb_push(struct sk_buff *skb, unsigned int len)
{
skb->data -= len;
skb->len += len;
return skb->data;
}
void *skb_pull(struct sk_buff *skb, unsigned int len);
static inline void *__skb_pull(struct sk_buff *skb, unsigned int len)
{
skb->len -= len; BUG_ON(skb->len < skb->data_len); return skb->data += len;
}
static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len)
{
return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len);
}
void *__pskb_pull_tail(struct sk_buff *skb, int delta);
static inline void *__pskb_pull(struct sk_buff *skb, unsigned int len)
{
if (len > skb_headlen(skb) &&
!__pskb_pull_tail(skb, len - skb_headlen(skb)))
return NULL;
skb->len -= len;
return skb->data += len;
}
static inline void *pskb_pull(struct sk_buff *skb, unsigned int len)
{
return unlikely(len > skb->len) ? NULL : __pskb_pull(skb, len);
}
static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len)
{
if (likely(len <= skb_headlen(skb)))
return true;
if (unlikely(len > skb->len))
return false;
return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL;
}
void skb_condense(struct sk_buff *skb);
/**
* skb_headroom - bytes at buffer head
* @skb: buffer to check
*
* Return the number of bytes of free space at the head of an &sk_buff.
*/
static inline unsigned int skb_headroom(const struct sk_buff *skb)
{
return skb->data - skb->head;
}
/**
* skb_tailroom - bytes at buffer end
* @skb: buffer to check
*
* Return the number of bytes of free space at the tail of an sk_buff
*/
static inline int skb_tailroom(const struct sk_buff *skb)
{
return skb_is_nonlinear(skb) ? 0 : skb->end - skb->tail;
}
/**
* skb_availroom - bytes at buffer end
* @skb: buffer to check
*
* Return the number of bytes of free space at the tail of an sk_buff
* allocated by sk_stream_alloc()
*/
static inline int skb_availroom(const struct sk_buff *skb)
{
if (skb_is_nonlinear(skb))
return 0;
return skb->end - skb->tail - skb->reserved_tailroom;
}
/**
* skb_reserve - adjust headroom
* @skb: buffer to alter
* @len: bytes to move
*
* Increase the headroom of an empty &sk_buff by reducing the tail
* room. This is only allowed for an empty buffer.
*/
static inline void skb_reserve(struct sk_buff *skb, int len)
{
skb->data += len;
skb->tail += len;
}
/**
* skb_tailroom_reserve - adjust reserved_tailroom
* @skb: buffer to alter
* @mtu: maximum amount of headlen permitted
* @needed_tailroom: minimum amount of reserved_tailroom
*
* Set reserved_tailroom so that headlen can be as large as possible but
* not larger than mtu and tailroom cannot be smaller than
* needed_tailroom.
* The required headroom should already have been reserved before using
* this function.
*/
static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu,
unsigned int needed_tailroom)
{
SKB_LINEAR_ASSERT(skb);
if (mtu < skb_tailroom(skb) - needed_tailroom)
/* use at most mtu */
skb->reserved_tailroom = skb_tailroom(skb) - mtu;
else
/* use up to all available space */
skb->reserved_tailroom = needed_tailroom;
}
#define ENCAP_TYPE_ETHER 0
#define ENCAP_TYPE_IPPROTO 1
static inline void skb_set_inner_protocol(struct sk_buff *skb,
__be16 protocol)
{
skb->inner_protocol = protocol;
skb->inner_protocol_type = ENCAP_TYPE_ETHER;
}
static inline void skb_set_inner_ipproto(struct sk_buff *skb,
__u8 ipproto)
{
skb->inner_ipproto = ipproto;
skb->inner_protocol_type = ENCAP_TYPE_IPPROTO;
}
static inline void skb_reset_inner_headers(struct sk_buff *skb)
{
skb->inner_mac_header = skb->mac_header;
skb->inner_network_header = skb->network_header;
skb->inner_transport_header = skb->transport_header;
}
static inline void skb_reset_mac_len(struct sk_buff *skb)
{
skb->mac_len = skb->network_header - skb->mac_header;
}
static inline unsigned char *skb_inner_transport_header(const struct sk_buff
*skb)
{
return skb->head + skb->inner_transport_header;
}
static inline int skb_inner_transport_offset(const struct sk_buff *skb)
{
return skb_inner_transport_header(skb) - skb->data;
}
static inline void skb_reset_inner_transport_header(struct sk_buff *skb)
{
skb->inner_transport_header = skb->data - skb->head;
}
static inline void skb_set_inner_transport_header(struct sk_buff *skb,
const int offset)
{
skb_reset_inner_transport_header(skb);
skb->inner_transport_header += offset;
}
static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb)
{
return skb->head + skb->inner_network_header;
}
static inline void skb_reset_inner_network_header(struct sk_buff *skb)
{
skb->inner_network_header = skb->data - skb->head;
}
static inline void skb_set_inner_network_header(struct sk_buff *skb,
const int offset)
{
skb_reset_inner_network_header(skb);
skb->inner_network_header += offset;
}
static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb)
{
return skb->head + skb->inner_mac_header;
}
static inline void skb_reset_inner_mac_header(struct sk_buff *skb)
{
skb->inner_mac_header = skb->data - skb->head;
}
static inline void skb_set_inner_mac_header(struct sk_buff *skb,
const int offset)
{
skb_reset_inner_mac_header(skb);
skb->inner_mac_header += offset;
}
static inline bool skb_transport_header_was_set(const struct sk_buff *skb)
{
return skb->transport_header != (typeof(skb->transport_header))~0U;
}
static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
{
return skb->head + skb->transport_header;
}
static inline void skb_reset_transport_header(struct sk_buff *skb)
{
skb->transport_header = skb->data - skb->head;
}
static inline void skb_set_transport_header(struct sk_buff *skb,
const int offset)
{
skb_reset_transport_header(skb);
skb->transport_header += offset;
}
static inline unsigned char *skb_network_header(const struct sk_buff *skb)
{
return skb->head + skb->network_header;
}
static inline void skb_reset_network_header(struct sk_buff *skb)
{
skb->network_header = skb->data - skb->head;
}
static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
{
skb_reset_network_header(skb);
skb->network_header += offset;
}
static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
{
return skb->head + skb->mac_header;
}
static inline int skb_mac_offset(const struct sk_buff *skb)
{
return skb_mac_header(skb) - skb->data;
}
static inline u32 skb_mac_header_len(const struct sk_buff *skb)
{
return skb->network_header - skb->mac_header;
}
static inline int skb_mac_header_was_set(const struct sk_buff *skb)
{
return skb->mac_header != (typeof(skb->mac_header))~0U;
}
static inline void skb_unset_mac_header(struct sk_buff *skb)
{
skb->mac_header = (typeof(skb->mac_header))~0U;
}
static inline void skb_reset_mac_header(struct sk_buff *skb)
{
skb->mac_header = skb->data - skb->head;
}
static inline void skb_set_mac_header(struct sk_buff *skb, const int offset)
{
skb_reset_mac_header(skb);
skb->mac_header += offset;
}
static inline void skb_pop_mac_header(struct sk_buff *skb)
{
skb->mac_header = skb->network_header;
}
static inline void skb_probe_transport_header(struct sk_buff *skb)
{
struct flow_keys_basic keys;
if (skb_transport_header_was_set(skb))
return;
if (skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
NULL, 0, 0, 0, 0))
skb_set_transport_header(skb, keys.control.thoff);
}
static inline void skb_mac_header_rebuild(struct sk_buff *skb)
{
if (skb_mac_header_was_set(skb)) {
const unsigned char *old_mac = skb_mac_header(skb);
skb_set_mac_header(skb, -skb->mac_len);
memmove(skb_mac_header(skb), old_mac, skb->mac_len);
}
}
static inline int skb_checksum_start_offset(const struct sk_buff *skb)
{
return skb->csum_start - skb_headroom(skb);
}
static inline unsigned char *skb_checksum_start(const struct sk_buff *skb)
{
return skb->head + skb->csum_start;
}
static inline int skb_transport_offset(const struct sk_buff *skb)
{
return skb_transport_header(skb) - skb->data;
}
static inline u32 skb_network_header_len(const struct sk_buff *skb)
{
return skb->transport_header - skb->network_header;
}
static inline u32 skb_inner_network_header_len(const struct sk_buff *skb)
{
return skb->inner_transport_header - skb->inner_network_header;
}
static inline int skb_network_offset(const struct sk_buff *skb)
{
return skb_network_header(skb) - skb->data;
}
static inline int skb_inner_network_offset(const struct sk_buff *skb)
{
return skb_inner_network_header(skb) - skb->data;
}
static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len)
{
return pskb_may_pull(skb, skb_network_offset(skb) + len);
}
/*
* CPUs often take a performance hit when accessing unaligned memory
* locations. The actual performance hit varies, it can be small if the
* hardware handles it or large if we have to take an exception and fix it
* in software.
*
* Since an ethernet header is 14 bytes network drivers often end up with
* the IP header at an unaligned offset. The IP header can be aligned by
* shifting the start of the packet by 2 bytes. Drivers should do this
* with:
*
* skb_reserve(skb, NET_IP_ALIGN);
*
* The downside to this alignment of the IP header is that the DMA is now
* unaligned. On some architectures the cost of an unaligned DMA is high
* and this cost outweighs the gains made by aligning the IP header.
*
* Since this trade off varies between architectures, we allow NET_IP_ALIGN
* to be overridden.
*/
#ifndef NET_IP_ALIGN
#define NET_IP_ALIGN 2
#endif
/*
* The networking layer reserves some headroom in skb data (via
* dev_alloc_skb). This is used to avoid having to reallocate skb data when
* the header has to grow. In the default case, if the header has to grow
* 32 bytes or less we avoid the reallocation.
*
* Unfortunately this headroom changes the DMA alignment of the resulting
* network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive
* on some architectures. An architecture can override this value,
* perhaps setting it to a cacheline in size (since that will maintain
* cacheline alignment of the DMA). It must be a power of 2.
*
* Various parts of the networking layer expect at least 32 bytes of
* headroom, you should not reduce this.
*
* Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS)
* to reduce average number of cache lines per packet.
* get_rps_cpu() for example only access one 64 bytes aligned block :
* NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8)
*/
#ifndef NET_SKB_PAD
#define NET_SKB_PAD max(32, L1_CACHE_BYTES)
#endif
int ___pskb_trim(struct sk_buff *skb, unsigned int len);
static inline void __skb_set_length(struct sk_buff *skb, unsigned int len)
{
if (WARN_ON(skb_is_nonlinear(skb)))
return;
skb->len = len;
skb_set_tail_pointer(skb, len);
}
static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
{
__skb_set_length(skb, len);
}
void skb_trim(struct sk_buff *skb, unsigned int len);
static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
{
if (skb->data_len) return ___pskb_trim(skb, len);
__skb_trim(skb, len);
return 0;
}
static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
{
return (len < skb->len) ? __pskb_trim(skb, len) : 0;
}
/**
* pskb_trim_unique - remove end from a paged unique (not cloned) buffer
* @skb: buffer to alter
* @len: new length
*
* This is identical to pskb_trim except that the caller knows that
* the skb is not cloned so we should never get an error due to out-
* of-memory.
*/
static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len)
{
int err = pskb_trim(skb, len);
BUG_ON(err);
}
static inline int __skb_grow(struct sk_buff *skb, unsigned int len)
{
unsigned int diff = len - skb->len;
if (skb_tailroom(skb) < diff) {
int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb),
GFP_ATOMIC);
if (ret)
return ret;
}
__skb_set_length(skb, len);
return 0;
}
/**
* skb_orphan - orphan a buffer
* @skb: buffer to orphan
*
* If a buffer currently has an owner then we call the owner's
* destructor function and make the @skb unowned. The buffer continues
* to exist but is no longer charged to its former owner.
*/
static inline void skb_orphan(struct sk_buff *skb)
{
if (skb->destructor) { skb->destructor(skb);
skb->destructor = NULL;
skb->sk = NULL;
} else {
BUG_ON(skb->sk);
}
}
/**
* skb_orphan_frags - orphan the frags contained in a buffer
* @skb: buffer to orphan frags from
* @gfp_mask: allocation mask for replacement pages
*
* For each frag in the SKB which needs a destructor (i.e. has an
* owner) create a copy of that frag and release the original
* page by calling the destructor.
*/
static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
if (likely(!skb_zcopy(skb)))
return 0;
if (!skb_zcopy_is_nouarg(skb) &&
skb_uarg(skb)->callback == msg_zerocopy_callback)
return 0;
return skb_copy_ubufs(skb, gfp_mask);
}
/* Frags must be orphaned, even if refcounted, if skb might loop to rx path */
static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask)
{
if (likely(!skb_zcopy(skb)))
return 0;
return skb_copy_ubufs(skb, gfp_mask);
}
/**
* __skb_queue_purge - empty a list
* @list: list to empty
*
* Delete all buffers on an &sk_buff list. Each buffer is removed from
* the list and one reference dropped. This function does not take the
* list lock and the caller must hold the relevant locks to use it.
*/
static inline void __skb_queue_purge(struct sk_buff_head *list)
{
struct sk_buff *skb;
while ((skb = __skb_dequeue(list)) != NULL)
kfree_skb(skb);
}
void skb_queue_purge(struct sk_buff_head *list);
unsigned int skb_rbtree_purge(struct rb_root *root);
void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);
/**
* netdev_alloc_frag - allocate a page fragment
* @fragsz: fragment size
*
* Allocates a frag from a page for receive buffer.
* Uses GFP_ATOMIC allocations.
*/
static inline void *netdev_alloc_frag(unsigned int fragsz)
{
return __netdev_alloc_frag_align(fragsz, ~0u);
}
static inline void *netdev_alloc_frag_align(unsigned int fragsz,
unsigned int align)
{
WARN_ON_ONCE(!is_power_of_2(align));
return __netdev_alloc_frag_align(fragsz, -align);
}
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
gfp_t gfp_mask);
/**
* netdev_alloc_skb - allocate an skbuff for rx on a specific device
* @dev: network device to receive on
* @length: length to allocate
*
* Allocate a new &sk_buff and assign it a usage count of one. The
* buffer has unspecified headroom built in. Users should allocate
* the headroom they think they need without accounting for the
* built in space. The built in space is used for optimisations.
*
* %NULL is returned if there is no free memory. Although this function
* allocates memory it can be called from an interrupt.
*/
static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev,
unsigned int length)
{
return __netdev_alloc_skb(dev, length, GFP_ATOMIC);
}
/* legacy helper around __netdev_alloc_skb() */
static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
gfp_t gfp_mask)
{
return __netdev_alloc_skb(NULL, length, gfp_mask);
}
/* legacy helper around netdev_alloc_skb() */
static inline struct sk_buff *dev_alloc_skb(unsigned int length)
{
return netdev_alloc_skb(NULL, length);
}
static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev,
unsigned int length, gfp_t gfp)
{
struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp);
if (NET_IP_ALIGN && skb)
skb_reserve(skb, NET_IP_ALIGN);
return skb;
}
static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
unsigned int length)
{
return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
}
static inline void skb_free_frag(void *addr)
{
page_frag_free(addr);
}
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);
static inline void *napi_alloc_frag(unsigned int fragsz)
{
return __napi_alloc_frag_align(fragsz, ~0u);
}
static inline void *napi_alloc_frag_align(unsigned int fragsz,
unsigned int align)
{
WARN_ON_ONCE(!is_power_of_2(align));
return __napi_alloc_frag_align(fragsz, -align);
}
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
unsigned int length, gfp_t gfp_mask);
static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
unsigned int length)
{
return __napi_alloc_skb(napi, length, GFP_ATOMIC);
}
void napi_consume_skb(struct sk_buff *skb, int budget);
void napi_skb_free_stolen_head(struct sk_buff *skb);
void __kfree_skb_defer(struct sk_buff *skb);
/**
* __dev_alloc_pages - allocate page for network Rx
* @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
* @order: size of the allocation
*
* Allocate a new page.
*
* %NULL is returned if there is no free memory.
*/
static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
unsigned int order)
{
/* This piece of code contains several assumptions.
* 1. This is for device Rx, therefor a cold page is preferred.
* 2. The expectation is the user wants a compound page.
* 3. If requesting a order 0 page it will not be compound
* due to the check to see if order has a value in prep_new_page
* 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to
* code in gfp_to_alloc_flags that should be enforcing this.
*/
gfp_mask |= __GFP_COMP | __GFP_MEMALLOC;
return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
}
static inline struct page *dev_alloc_pages(unsigned int order)
{
return __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, order);
}
/**
* __dev_alloc_page - allocate a page for network Rx
* @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
*
* Allocate a new page.
*
* %NULL is returned if there is no free memory.
*/
static inline struct page *__dev_alloc_page(gfp_t gfp_mask)
{
return __dev_alloc_pages(gfp_mask, 0);
}
static inline struct page *dev_alloc_page(void)
{
return dev_alloc_pages(0);
}
/**
* dev_page_is_reusable - check whether a page can be reused for network Rx
* @page: the page to test
*
* A page shouldn't be considered for reusing/recycling if it was allocated
* under memory pressure or at a distant memory node.
*
* Returns false if this page should be returned to page allocator, true
* otherwise.
*/
static inline bool dev_page_is_reusable(const struct page *page)
{
return likely(page_to_nid(page) == numa_mem_id() &&
!page_is_pfmemalloc(page));
}
/**
* skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page
* @page: The page that was allocated from skb_alloc_page
* @skb: The skb that may need pfmemalloc set
*/
static inline void skb_propagate_pfmemalloc(const struct page *page,
struct sk_buff *skb)
{
if (page_is_pfmemalloc(page))
skb->pfmemalloc = true;
}
/**
* skb_frag_off() - Returns the offset of a skb fragment
* @frag: the paged fragment
*/
static inline unsigned int skb_frag_off(const skb_frag_t *frag)
{
return frag->bv_offset;
}
/**
* skb_frag_off_add() - Increments the offset of a skb fragment by @delta
* @frag: skb fragment
* @delta: value to add
*/
static inline void skb_frag_off_add(skb_frag_t *frag, int delta)
{
frag->bv_offset += delta;
}
/**
* skb_frag_off_set() - Sets the offset of a skb fragment
* @frag: skb fragment
* @offset: offset of fragment
*/
static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset)
{
frag->bv_offset = offset;
}
/**
* skb_frag_off_copy() - Sets the offset of a skb fragment from another fragment
* @fragto: skb fragment where offset is set
* @fragfrom: skb fragment offset is copied from
*/
static inline void skb_frag_off_copy(skb_frag_t *fragto,
const skb_frag_t *fragfrom)
{
fragto->bv_offset = fragfrom->bv_offset;
}
/**
* skb_frag_page - retrieve the page referred to by a paged fragment
* @frag: the paged fragment
*
* Returns the &struct page associated with @frag.
*/
static inline struct page *skb_frag_page(const skb_frag_t *frag)
{
return frag->bv_page;
}
/**
* __skb_frag_ref - take an addition reference on a paged fragment.
* @frag: the paged fragment
*
* Takes an additional reference on the paged fragment @frag.
*/
static inline void __skb_frag_ref(skb_frag_t *frag)
{
get_page(skb_frag_page(frag));
}
/**
* skb_frag_ref - take an addition reference on a paged fragment of an skb.
* @skb: the buffer
* @f: the fragment offset.
*
* Takes an additional reference on the @f'th paged fragment of @skb.
*/
static inline void skb_frag_ref(struct sk_buff *skb, int f)
{
__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
}
/**
* __skb_frag_unref - release a reference on a paged fragment.
* @frag: the paged fragment
* @recycle: recycle the page if allocated via page_pool
*
* Releases a reference on the paged fragment @frag
* or recycles the page via the page_pool API.
*/
static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
{
struct page *page = skb_frag_page(frag);
#ifdef CONFIG_PAGE_POOL
if (recycle && page_pool_return_skb_page(page))
return;
#endif
put_page(page);
}
/**
* skb_frag_unref - release a reference on a paged fragment of an skb.
* @skb: the buffer
* @f: the fragment offset
*
* Releases a reference on the @f'th paged fragment of @skb.
*/
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
__skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
}
/**
* skb_frag_address - gets the address of the data contained in a paged fragment
* @frag: the paged fragment buffer
*
* Returns the address of the data within @frag. The page must already
* be mapped.
*/
static inline void *skb_frag_address(const skb_frag_t *frag)
{
return page_address(skb_frag_page(frag)) + skb_frag_off(frag);
}
/**
* skb_frag_address_safe - gets the address of the data contained in a paged fragment
* @frag: the paged fragment buffer
*
* Returns the address of the data within @frag. Checks that the page
* is mapped and returns %NULL otherwise.
*/
static inline void *skb_frag_address_safe(const skb_frag_t *frag)
{
void *ptr = page_address(skb_frag_page(frag));
if (unlikely(!ptr))
return NULL;
return ptr + skb_frag_off(frag);
}
/**
* skb_frag_page_copy() - sets the page in a fragment from another fragment
* @fragto: skb fragment where page is set
* @fragfrom: skb fragment page is copied from
*/
static inline void skb_frag_page_copy(skb_frag_t *fragto,
const skb_frag_t *fragfrom)
{
fragto->bv_page = fragfrom->bv_page;
}
/**
* __skb_frag_set_page - sets the page contained in a paged fragment
* @frag: the paged fragment
* @page: the page to set
*
* Sets the fragment @frag to contain @page.
*/
static inline void __skb_frag_set_page(skb_frag_t *frag, struct page *page)
{
frag->bv_page = page;
}
/**
* skb_frag_set_page - sets the page contained in a paged fragment of an skb
* @skb: the buffer
* @f: the fragment offset
* @page: the page to set
*
* Sets the @f'th fragment of @skb to contain @page.
*/
static inline void skb_frag_set_page(struct sk_buff *skb, int f,
struct page *page)
{
__skb_frag_set_page(&skb_shinfo(skb)->frags[f], page);
}
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio);
/**
* skb_frag_dma_map - maps a paged fragment via the DMA API
* @dev: the device to map the fragment to
* @frag: the paged fragment to map
* @offset: the offset within the fragment (starting at the
* fragment's own offset)
* @size: the number of bytes to map
* @dir: the direction of the mapping (``PCI_DMA_*``)
*
* Maps the page associated with @frag to @device.
*/
static inline dma_addr_t skb_frag_dma_map(struct device *dev,
const skb_frag_t *frag,
size_t offset, size_t size,
enum dma_data_direction dir)
{
return dma_map_page(dev, skb_frag_page(frag),
skb_frag_off(frag) + offset, size, dir);
}
static inline struct sk_buff *pskb_copy(struct sk_buff *skb,
gfp_t gfp_mask)
{
return __pskb_copy(skb, skb_headroom(skb), gfp_mask);
}
static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb,
gfp_t gfp_mask)
{
return __pskb_copy_fclone(skb, skb_headroom(skb), gfp_mask, true);
}
/**
* skb_clone_writable - is the header of a clone writable
* @skb: buffer to check
* @len: length up to which to write
*
* Returns true if modifying the header part of the cloned buffer
* does not requires the data to be copied.
*/
static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len)
{
return !skb_header_cloned(skb) && skb_headroom(skb) + len <= skb->hdr_len;
}
static inline int skb_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
return skb_cloned(skb) && !skb_clone_writable(skb, write_len) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
}
static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
int cloned)
{
int delta = 0;
if (headroom > skb_headroom(skb)) delta = headroom - skb_headroom(skb); if (delta || cloned) return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0,
GFP_ATOMIC);
return 0;
}
/**
* skb_cow - copy header of skb when it is required
* @skb: buffer to cow
* @headroom: needed headroom
*
* If the skb passed lacks sufficient headroom or its data part
* is shared, data is reallocated. If reallocation fails, an error
* is returned and original skb is not changed.
*
* The result is skb with writable area skb->head...skb->tail
* and at least @headroom of space at head.
*/
static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
{
return __skb_cow(skb, headroom, skb_cloned(skb));
}
/**
* skb_cow_head - skb_cow but only making the head writable
* @skb: buffer to cow
* @headroom: needed headroom
*
* This function is identical to skb_cow except that we replace the
* skb_cloned check by skb_header_cloned. It should be used when
* you only need to push on some header and do not need to modify
* the data.
*/
static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom)
{
return __skb_cow(skb, headroom, skb_header_cloned(skb));
}
/**
* skb_padto - pad an skbuff up to a minimal size
* @skb: buffer to pad
* @len: minimal length
*
* Pads up a buffer to ensure the trailing bytes exist and are
* blanked. If the buffer already contains sufficient data it
* is untouched. Otherwise it is extended. Returns zero on
* success. The skb is freed on error.
*/
static inline int skb_padto(struct sk_buff *skb, unsigned int len)
{
unsigned int size = skb->len;
if (likely(size >= len))
return 0;
return skb_pad(skb, len - size);
}
/**
* __skb_put_padto - increase size and pad an skbuff up to a minimal size
* @skb: buffer to pad
* @len: minimal length
* @free_on_error: free buffer on error
*
* Pads up a buffer to ensure the trailing bytes exist and are
* blanked. If the buffer already contains sufficient data it
* is untouched. Otherwise it is extended. Returns zero on
* success. The skb is freed on error if @free_on_error is true.
*/
static inline int __must_check __skb_put_padto(struct sk_buff *skb,
unsigned int len,
bool free_on_error)
{
unsigned int size = skb->len;
if (unlikely(size < len)) {
len -= size;
if (__skb_pad(skb, len, free_on_error))
return -ENOMEM;
__skb_put(skb, len);
}
return 0;
}
/**
* skb_put_padto - increase size and pad an skbuff up to a minimal size
* @skb: buffer to pad
* @len: minimal length
*
* Pads up a buffer to ensure the trailing bytes exist and are
* blanked. If the buffer already contains sufficient data it
* is untouched. Otherwise it is extended. Returns zero on
* success. The skb is freed on error.
*/
static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len)
{
return __skb_put_padto(skb, len, true);
}
static inline int skb_add_data(struct sk_buff *skb,
struct iov_iter *from, int copy)
{
const int off = skb->len;
if (skb->ip_summed == CHECKSUM_NONE) {
__wsum csum = 0;
if (csum_and_copy_from_iter_full(skb_put(skb, copy), copy,
&csum, from)) {
skb->csum = csum_block_add(skb->csum, csum, off);
return 0;
}
} else if (copy_from_iter_full(skb_put(skb, copy), copy, from))
return 0;
__skb_trim(skb, off);
return -EFAULT;
}
static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
const struct page *page, int off)
{
if (skb_zcopy(skb))
return false;
if (i) { const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
return page == skb_frag_page(frag) &&
off == skb_frag_off(frag) + skb_frag_size(frag);
}
return false;
}
static inline int __skb_linearize(struct sk_buff *skb)
{
return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM;
}
/**
* skb_linearize - convert paged skb to linear one
* @skb: buffer to linarize
*
* If there is no free memory -ENOMEM is returned, otherwise zero
* is returned and the old skb data released.
*/
static inline int skb_linearize(struct sk_buff *skb)
{
return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0;
}
/**
* skb_has_shared_frag - can any frag be overwritten
* @skb: buffer to test
*
* Return true if the skb has at least one frag that might be modified
* by an external entity (as in vmsplice()/sendfile())
*/
static inline bool skb_has_shared_frag(const struct sk_buff *skb)
{
return skb_is_nonlinear(skb) && skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
}
/**
* skb_linearize_cow - make sure skb is linear and writable
* @skb: buffer to process
*
* If there is no free memory -ENOMEM is returned, otherwise zero
* is returned and the old skb data released.
*/
static inline int skb_linearize_cow(struct sk_buff *skb)
{
return skb_is_nonlinear(skb) || skb_cloned(skb) ?
__skb_linearize(skb) : 0;
}
static __always_inline void
__skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
unsigned int off)
{
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->csum = csum_block_sub(skb->csum,
csum_partial(start, len, 0), off);
else if (skb->ip_summed == CHECKSUM_PARTIAL &&
skb_checksum_start_offset(skb) < 0)
skb->ip_summed = CHECKSUM_NONE;
}
/**
* skb_postpull_rcsum - update checksum for received skb after pull
* @skb: buffer to update
* @start: start of data before pull
* @len: length of data pulled
*
* After doing a pull on a received packet, you need to call this to
* update the CHECKSUM_COMPLETE checksum, or set ip_summed to
* CHECKSUM_NONE so that it can be recomputed from scratch.
*/
static inline void skb_postpull_rcsum(struct sk_buff *skb,
const void *start, unsigned int len)
{
__skb_postpull_rcsum(skb, start, len, 0);
}
static __always_inline void
__skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
unsigned int off)
{
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->csum = csum_block_add(skb->csum,
csum_partial(start, len, 0), off);
}
/**
* skb_postpush_rcsum - update checksum for received skb after push
* @skb: buffer to update
* @start: start of data after push
* @len: length of data pushed
*
* After doing a push on a received packet, you need to call this to
* update the CHECKSUM_COMPLETE checksum.
*/
static inline void skb_postpush_rcsum(struct sk_buff *skb,
const void *start, unsigned int len)
{
__skb_postpush_rcsum(skb, start, len, 0);
}
void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len);
/**
* skb_push_rcsum - push skb and update receive checksum
* @skb: buffer to update
* @len: length of data pulled
*
* This function performs an skb_push on the packet and updates
* the CHECKSUM_COMPLETE checksum. It should be used on
* receive path processing instead of skb_push unless you know
* that the checksum difference is zero (e.g., a valid IP header)
* or you are setting ip_summed to CHECKSUM_NONE.
*/
static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len)
{
skb_push(skb, len);
skb_postpush_rcsum(skb, skb->data, len);
return skb->data;
}
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len);
/**
* pskb_trim_rcsum - trim received skb and update checksum
* @skb: buffer to trim
* @len: new length
*
* This is exactly the same as pskb_trim except that it ensures the
* checksum of received packets are still valid after the operation.
* It can change skb pointers.
*/
static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
if (likely(len >= skb->len))
return 0;
return pskb_trim_rcsum_slow(skb, len);
}
static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed = CHECKSUM_NONE;
__skb_trim(skb, len);
return 0;
}
static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
{
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed = CHECKSUM_NONE;
return __skb_grow(skb, len);
}
#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
#define skb_rb_first(root) rb_to_skb(rb_first(root))
#define skb_rb_last(root) rb_to_skb(rb_last(root))
#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode))
#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode))
#define skb_queue_walk(queue, skb) \
for (skb = (queue)->next; \
skb != (struct sk_buff *)(queue); \
skb = skb->next)
#define skb_queue_walk_safe(queue, skb, tmp) \
for (skb = (queue)->next, tmp = skb->next; \
skb != (struct sk_buff *)(queue); \
skb = tmp, tmp = skb->next)
#define skb_queue_walk_from(queue, skb) \
for (; skb != (struct sk_buff *)(queue); \
skb = skb->next)
#define skb_rbtree_walk(skb, root) \
for (skb = skb_rb_first(root); skb != NULL; \
skb = skb_rb_next(skb))
#define skb_rbtree_walk_from(skb) \
for (; skb != NULL; \
skb = skb_rb_next(skb))
#define skb_rbtree_walk_from_safe(skb, tmp) \
for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL); \
skb = tmp)
#define skb_queue_walk_from_safe(queue, skb, tmp) \
for (tmp = skb->next; \
skb != (struct sk_buff *)(queue); \
skb = tmp, tmp = skb->next)
#define skb_queue_reverse_walk(queue, skb) \
for (skb = (queue)->prev; \
skb != (struct sk_buff *)(queue); \
skb = skb->prev)
#define skb_queue_reverse_walk_safe(queue, skb, tmp) \
for (skb = (queue)->prev, tmp = skb->prev; \
skb != (struct sk_buff *)(queue); \
skb = tmp, tmp = skb->prev)
#define skb_queue_reverse_walk_from_safe(queue, skb, tmp) \
for (tmp = skb->prev; \
skb != (struct sk_buff *)(queue); \
skb = tmp, tmp = skb->prev)
static inline bool skb_has_frag_list(const struct sk_buff *skb)
{
return skb_shinfo(skb)->frag_list != NULL;
}
static inline void skb_frag_list_init(struct sk_buff *skb)
{
skb_shinfo(skb)->frag_list = NULL;
}
#define skb_walk_frags(skb, iter) \
for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
int *err, long *timeo_p,
const struct sk_buff *skb);
struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
struct sk_buff_head *queue,
unsigned int flags,
int *off, int *err,
struct sk_buff **last);
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
struct sk_buff_head *queue,
unsigned int flags, int *off, int *err,
struct sk_buff **last);
struct sk_buff *__skb_recv_datagram(struct sock *sk,
struct sk_buff_head *sk_queue,
unsigned int flags, int *off, int *err);
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
int *err);
__poll_t datagram_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
struct iov_iter *to, int size);
static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
struct msghdr *msg, int size)
{
return skb_copy_datagram_iter(from, offset, &msg->msg_iter, size);
}
int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
struct msghdr *msg);
int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len,
struct ahash_request *hash);
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
struct iov_iter *from, int len);
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len);
static inline void skb_free_datagram_locked(struct sock *sk,
struct sk_buff *skb)
{
__skb_free_datagram_locked(sk, skb, 0);
}
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to,
int len);
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int len,
unsigned int flags);
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
int len);
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
int len, int hlen);
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
void skb_scrub_packet(struct sk_buff *skb, bool xnet);
bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu);
bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);
struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features,
unsigned int offset);
struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
int skb_ensure_writable(struct sk_buff *skb, int write_len);
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
int skb_vlan_pop(struct sk_buff *skb);
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
int skb_eth_pop(struct sk_buff *skb);
int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
const unsigned char *src);
int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
int mac_len, bool ethernet);
int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
bool ethernet);
int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse);
int skb_mpls_dec_ttl(struct sk_buff *skb);
struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
gfp_t gfp);
static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
{
return copy_from_iter_full(data, len, &msg->msg_iter) ? 0 : -EFAULT;
}
static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len)
{
return copy_to_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT;
}
struct skb_checksum_ops {
__wsum (*update)(const void *mem, int len, __wsum wsum);
__wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len);
};
extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly;
__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
__wsum csum, const struct skb_checksum_ops *ops);
__wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
__wsum csum);
static inline void * __must_check
__skb_header_pointer(const struct sk_buff *skb, int offset, int len,
const void *data, int hlen, void *buffer)
{
if (likely(hlen - offset >= len)) return (void *)data + offset; if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0))
return NULL;
return buffer;
}
static inline void * __must_check
skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer)
{
return __skb_header_pointer(skb, offset, len, skb->data,
skb_headlen(skb), buffer);
}
/**
* skb_needs_linearize - check if we need to linearize a given skb
* depending on the given device features.
* @skb: socket buffer to check
* @features: net device features
*
* Returns true if either:
* 1. skb has frag_list and the device doesn't support FRAGLIST, or
* 2. skb is fragmented and the device does not support SG.
*/
static inline bool skb_needs_linearize(struct sk_buff *skb,
netdev_features_t features)
{
return skb_is_nonlinear(skb) && ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) || (skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG)));
}
static inline void skb_copy_from_linear_data(const struct sk_buff *skb,
void *to,
const unsigned int len)
{
memcpy(to, skb->data, len);
}
static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb,
const int offset, void *to,
const unsigned int len)
{
memcpy(to, skb->data + offset, len);
}
static inline void skb_copy_to_linear_data(struct sk_buff *skb,
const void *from,
const unsigned int len)
{
memcpy(skb->data, from, len);
}
static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
const int offset,
const void *from,
const unsigned int len)
{
memcpy(skb->data + offset, from, len);
}
void skb_init(void);
static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
{
return skb->tstamp;
}
/**
* skb_get_timestamp - get timestamp from a skb
* @skb: skb to get stamp from
* @stamp: pointer to struct __kernel_old_timeval to store stamp in
*
* Timestamps are stored in the skb as offsets to a base timestamp.
* This function converts the offset back to a struct timeval and stores
* it in stamp.
*/
static inline void skb_get_timestamp(const struct sk_buff *skb,
struct __kernel_old_timeval *stamp)
{
*stamp = ns_to_kernel_old_timeval(skb->tstamp);
}
static inline void skb_get_new_timestamp(const struct sk_buff *skb,
struct __kernel_sock_timeval *stamp)
{
struct timespec64 ts = ktime_to_timespec64(skb->tstamp);
stamp->tv_sec = ts.tv_sec;
stamp->tv_usec = ts.tv_nsec / 1000;
}
static inline void skb_get_timestampns(const struct sk_buff *skb,
struct __kernel_old_timespec *stamp)
{
struct timespec64 ts = ktime_to_timespec64(skb->tstamp);
stamp->tv_sec = ts.tv_sec;
stamp->tv_nsec = ts.tv_nsec;
}
static inline void skb_get_new_timestampns(const struct sk_buff *skb,
struct __kernel_timespec *stamp)
{
struct timespec64 ts = ktime_to_timespec64(skb->tstamp);
stamp->tv_sec = ts.tv_sec;
stamp->tv_nsec = ts.tv_nsec;
}
static inline void __net_timestamp(struct sk_buff *skb)
{
skb->tstamp = ktime_get_real();
}
static inline ktime_t net_timedelta(ktime_t t)
{
return ktime_sub(ktime_get_real(), t);
}
static inline ktime_t net_invalid_timestamp(void)
{
return 0;
}
static inline u8 skb_metadata_len(const struct sk_buff *skb)
{
return skb_shinfo(skb)->meta_len;
}
static inline void *skb_metadata_end(const struct sk_buff *skb)
{
return skb_mac_header(skb);
}
static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
const struct sk_buff *skb_b,
u8 meta_len)
{
const void *a = skb_metadata_end(skb_a);
const void *b = skb_metadata_end(skb_b);
/* Using more efficient varaiant than plain call to memcmp(). */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
u64 diffs = 0;
switch (meta_len) {
#define __it(x, op) (x -= sizeof(u##op))
#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
case 32: diffs |= __it_diff(a, b, 64);
fallthrough;
case 24: diffs |= __it_diff(a, b, 64);
fallthrough;
case 16: diffs |= __it_diff(a, b, 64);
fallthrough;
case 8: diffs |= __it_diff(a, b, 64);
break;
case 28: diffs |= __it_diff(a, b, 64);
fallthrough;
case 20: diffs |= __it_diff(a, b, 64);
fallthrough;
case 12: diffs |= __it_diff(a, b, 64);
fallthrough;
case 4: diffs |= __it_diff(a, b, 32);
break;
}
return diffs;
#else
return memcmp(a - meta_len, b - meta_len, meta_len);
#endif
}
static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
const struct sk_buff *skb_b)
{
u8 len_a = skb_metadata_len(skb_a);
u8 len_b = skb_metadata_len(skb_b);
if (!(len_a | len_b))
return false;
return len_a != len_b ?
true : __skb_metadata_differs(skb_a, skb_b, len_a);
}
static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
{
skb_shinfo(skb)->meta_len = meta_len;
}
static inline void skb_metadata_clear(struct sk_buff *skb)
{
skb_metadata_set(skb, 0);
}
struct sk_buff *skb_clone_sk(struct sk_buff *skb);
#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
void skb_clone_tx_timestamp(struct sk_buff *skb);
bool skb_defer_rx_timestamp(struct sk_buff *skb);
#else /* CONFIG_NETWORK_PHY_TIMESTAMPING */
static inline void skb_clone_tx_timestamp(struct sk_buff *skb)
{
}
static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
return false;
}
#endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */
/**
* skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps
*
* PHY drivers may accept clones of transmitted packets for
* timestamping via their phy_driver.txtstamp method. These drivers
* must call this function to return the skb back to the stack with a
* timestamp.
*
* @skb: clone of the original outgoing packet
* @hwtstamps: hardware time stamps
*
*/
void skb_complete_tx_timestamp(struct sk_buff *skb,
struct skb_shared_hwtstamps *hwtstamps);
void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
struct sock *sk, int tstype);
/**
* skb_tstamp_tx - queue clone of skb with send time stamps
* @orig_skb: the original outgoing packet
* @hwtstamps: hardware time stamps, may be NULL if not available
*
* If the skb has a socket associated, then this function clones the
* skb (thus sharing the actual data and optional structures), stores
* the optional hardware time stamping information (if non NULL) or
* generates a software time stamp (otherwise), then queues the clone
* to the error queue of the socket. Errors are silently ignored.
*/
void skb_tstamp_tx(struct sk_buff *orig_skb,
struct skb_shared_hwtstamps *hwtstamps);
/**
* skb_tx_timestamp() - Driver hook for transmit timestamping
*
* Ethernet MAC Drivers should call this function in their hard_xmit()
* function immediately before giving the sk_buff to the MAC hardware.
*
* Specifically, one should make absolutely sure that this function is
* called before TX completion of this packet can trigger. Otherwise
* the packet could potentially already be freed.
*
* @skb: A socket buffer.
*/
static inline void skb_tx_timestamp(struct sk_buff *skb)
{
skb_clone_tx_timestamp(skb);
if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP) skb_tstamp_tx(skb, NULL);
}
/**
* skb_complete_wifi_ack - deliver skb with wifi status
*
* @skb: the original outgoing packet
* @acked: ack status
*
*/
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked);
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len);
__sum16 __skb_checksum_complete(struct sk_buff *skb);
static inline int skb_csum_unnecessary(const struct sk_buff *skb)
{
return ((skb->ip_summed == CHECKSUM_UNNECESSARY) || skb->csum_valid ||
(skb->ip_summed == CHECKSUM_PARTIAL &&
skb_checksum_start_offset(skb) >= 0));
}
/**
* skb_checksum_complete - Calculate checksum of an entire packet
* @skb: packet to process
*
* This function calculates the checksum over the entire packet plus
* the value of skb->csum. The latter can be used to supply the
* checksum of a pseudo header as used by TCP/UDP. It returns the
* checksum.
*
* For protocols that contain complete checksums such as ICMP/TCP/UDP,
* this function can be used to verify that checksum on received
* packets. In that case the function should return zero if the
* checksum is correct. In particular, this function will return zero
* if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that the
* hardware has already verified the correctness of the checksum.
*/
static inline __sum16 skb_checksum_complete(struct sk_buff *skb)
{
return skb_csum_unnecessary(skb) ?
0 : __skb_checksum_complete(skb);
}
static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb)
{
if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
if (skb->csum_level == 0)
skb->ip_summed = CHECKSUM_NONE;
else
skb->csum_level--;
}
}
static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb)
{
if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
skb->csum_level++;
} else if (skb->ip_summed == CHECKSUM_NONE) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
skb->csum_level = 0;
}
}
static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb)
{
if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
skb->ip_summed = CHECKSUM_NONE;
skb->csum_level = 0;
}
}
/* Check if we need to perform checksum complete validation.
*
* Returns true if checksum complete is needed, false otherwise
* (either checksum is unnecessary or zero checksum is allowed).
*/
static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
bool zero_okay,
__sum16 check)
{
if (skb_csum_unnecessary(skb) || (zero_okay && !check)) {
skb->csum_valid = 1;
__skb_decr_checksum_unnecessary(skb);
return false;
}
return true;
}
/* For small packets <= CHECKSUM_BREAK perform checksum complete directly
* in checksum_init.
*/
#define CHECKSUM_BREAK 76
/* Unset checksum-complete
*
* Unset checksum complete can be done when packet is being modified
* (uncompressed for instance) and checksum-complete value is
* invalidated.
*/
static inline void skb_checksum_complete_unset(struct sk_buff *skb)
{
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed = CHECKSUM_NONE;
}
/* Validate (init) checksum based on checksum complete.
*
* Return values:
* 0: checksum is validated or try to in skb_checksum_complete. In the latter
* case the ip_summed will not be CHECKSUM_UNNECESSARY and the pseudo
* checksum is stored in skb->csum for use in __skb_checksum_complete
* non-zero: value of invalid checksum
*
*/
static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb,
bool complete,
__wsum psum)
{
if (skb->ip_summed == CHECKSUM_COMPLETE) {
if (!csum_fold(csum_add(psum, skb->csum))) {
skb->csum_valid = 1;
return 0;
}
}
skb->csum = psum;
if (complete || skb->len <= CHECKSUM_BREAK) {
__sum16 csum;
csum = __skb_checksum_complete(skb);
skb->csum_valid = !csum;
return csum;
}
return 0;
}
static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto)
{
return 0;
}
/* Perform checksum validate (init). Note that this is a macro since we only
* want to calculate the pseudo header which is an input function if necessary.
* First we try to validate without any computation (checksum unnecessary) and
* then calculate based on checksum complete calling the function to compute
* pseudo header.
*
* Return values:
* 0: checksum is validated or try to in skb_checksum_complete
* non-zero: value of invalid checksum
*/
#define __skb_checksum_validate(skb, proto, complete, \
zero_okay, check, compute_pseudo) \
({ \
__sum16 __ret = 0; \
skb->csum_valid = 0; \
if (__skb_checksum_validate_needed(skb, zero_okay, check)) \
__ret = __skb_checksum_validate_complete(skb, \
complete, compute_pseudo(skb, proto)); \
__ret; \
})
#define skb_checksum_init(skb, proto, compute_pseudo) \
__skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo)
#define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo) \
__skb_checksum_validate(skb, proto, false, true, check, compute_pseudo)
#define skb_checksum_validate(skb, proto, compute_pseudo) \
__skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo)
#define skb_checksum_validate_zero_check(skb, proto, check, \
compute_pseudo) \
__skb_checksum_validate(skb, proto, true, true, check, compute_pseudo)
#define skb_checksum_simple_validate(skb) \
__skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo)
static inline bool __skb_checksum_convert_check(struct sk_buff *skb)
{
return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid);
}
static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo)
{
skb->csum = ~pseudo;
skb->ip_summed = CHECKSUM_COMPLETE;
}
#define skb_checksum_try_convert(skb, proto, compute_pseudo) \
do { \
if (__skb_checksum_convert_check(skb)) \
__skb_checksum_convert(skb, compute_pseudo(skb, proto)); \
} while (0)
static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr,
u16 start, u16 offset)
{
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = ((unsigned char *)ptr + start) - skb->head;
skb->csum_offset = offset - start;
}
/* Update skbuf and packet to reflect the remote checksum offload operation.
* When called, ptr indicates the starting point for skb->csum when
* ip_summed is CHECKSUM_COMPLETE. If we need create checksum complete
* here, skb_postpull_rcsum is done so skb->csum start is ptr.
*/
static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
int start, int offset, bool nopartial)
{
__wsum delta;
if (!nopartial) {
skb_remcsum_adjust_partial(skb, ptr, start, offset);
return;
}
if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
__skb_checksum_complete(skb);
skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data);
}
delta = remcsum_adjust(ptr, skb->csum, start, offset);
/* Adjust skb->csum since we changed the packet */
skb->csum = csum_add(skb->csum, delta);
}
static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
return (void *)(skb->_nfct & NFCT_PTRMASK);
#else
return NULL;
#endif
}
static inline unsigned long skb_get_nfct(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
return skb->_nfct;
#else
return 0UL;
#endif
}
static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
skb->slow_gro |= !!nfct;
skb->_nfct = nfct;
#endif
}
#ifdef CONFIG_SKB_EXTENSIONS
enum skb_ext_id {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
SKB_EXT_BRIDGE_NF,
#endif
#ifdef CONFIG_XFRM
SKB_EXT_SEC_PATH,
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
TC_SKB_EXT,
#endif
#if IS_ENABLED(CONFIG_MPTCP)
SKB_EXT_MPTCP,
#endif
SKB_EXT_NUM, /* must be last */
};
/**
* struct skb_ext - sk_buff extensions
* @refcnt: 1 on allocation, deallocated on 0
* @offset: offset to add to @data to obtain extension address
* @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units
* @data: start of extension data, variable sized
*
* Note: offsets/lengths are stored in chunks of 8 bytes, this allows
* to use 'u8' types while allowing up to 2kb worth of extension data.
*/
struct skb_ext {
refcount_t refcnt;
u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */
u8 chunks; /* same */
char data[] __aligned(8);
};
struct skb_ext *__skb_ext_alloc(gfp_t flags);
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
struct skb_ext *ext);
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_put(struct skb_ext *ext);
static inline void skb_ext_put(struct sk_buff *skb)
{
if (skb->active_extensions)
__skb_ext_put(skb->extensions);
}
static inline void __skb_ext_copy(struct sk_buff *dst,
const struct sk_buff *src)
{
dst->active_extensions = src->active_extensions;
if (src->active_extensions) {
struct skb_ext *ext = src->extensions;
refcount_inc(&ext->refcnt);
dst->extensions = ext;
}
}
static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src)
{
skb_ext_put(dst); __skb_ext_copy(dst, src);
}
static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i)
{
return !!ext->offset[i];
}
static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id)
{
return skb->active_extensions & (1 << id);
}
static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
if (skb_ext_exist(skb, id))
__skb_ext_del(skb, id);
}
static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id)
{
if (skb_ext_exist(skb, id)) {
struct skb_ext *ext = skb->extensions;
return (void *)ext + (ext->offset[id] << 3);
}
return NULL;
}
static inline void skb_ext_reset(struct sk_buff *skb)
{
if (unlikely(skb->active_extensions)) { __skb_ext_put(skb->extensions);
skb->active_extensions = 0;
}
}
static inline bool skb_has_extensions(struct sk_buff *skb)
{
return unlikely(skb->active_extensions);
}
#else
static inline void skb_ext_put(struct sk_buff *skb) {}
static inline void skb_ext_reset(struct sk_buff *skb) {}
static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
static inline bool skb_has_extensions(struct sk_buff *skb) { return false; }
#endif /* CONFIG_SKB_EXTENSIONS */
static inline void nf_reset_ct(struct sk_buff *skb)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
nf_conntrack_put(skb_nfct(skb)); skb->_nfct = 0;
#endif
}
static inline void nf_reset_trace(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
skb->nf_trace = 0;
#endif
}
static inline void ipvs_reset(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_IP_VS)
skb->ipvs_property = 0;
#endif
}
/* Note: This doesn't put any conntrack info in dst. */
static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
bool copy)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
dst->_nfct = src->_nfct;
nf_conntrack_get(skb_nfct(src));
#endif
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
if (copy)
dst->nf_trace = src->nf_trace;
#endif
}
static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
nf_conntrack_put(skb_nfct(dst));
#endif
dst->slow_gro = src->slow_gro;
__nf_copy(dst, src, true);
}
#ifdef CONFIG_NETWORK_SECMARK
static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
{
to->secmark = from->secmark;
}
static inline void skb_init_secmark(struct sk_buff *skb)
{
skb->secmark = 0;
}
#else
static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
{ }
static inline void skb_init_secmark(struct sk_buff *skb)
{ }
#endif
static inline int secpath_exists(const struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
return skb_ext_exist(skb, SKB_EXT_SEC_PATH);
#else
return 0;
#endif
}
static inline bool skb_irq_freeable(const struct sk_buff *skb)
{
return !skb->destructor &&
!secpath_exists(skb) &&
!skb_nfct(skb) &&
!skb->_skb_refdst &&
!skb_has_frag_list(skb);
}
static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
{
skb->queue_mapping = queue_mapping;
}
static inline u16 skb_get_queue_mapping(const struct sk_buff *skb)
{
return skb->queue_mapping;
}
static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from)
{
to->queue_mapping = from->queue_mapping;
}
static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
{
skb->queue_mapping = rx_queue + 1;
}
static inline u16 skb_get_rx_queue(const struct sk_buff *skb)
{
return skb->queue_mapping - 1;
}
static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
{
return skb->queue_mapping != 0;
}
static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val)
{
skb->dst_pending_confirm = val;}
static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb)
{
return skb->dst_pending_confirm != 0;
}
static inline struct sec_path *skb_sec_path(const struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
return skb_ext_find(skb, SKB_EXT_SEC_PATH);
#else
return NULL;
#endif
}
/* Keeps track of mac header offset relative to skb->head.
* It is useful for TSO of Tunneling protocol. e.g. GRE.
* For non-tunnel skb it points to skb_mac_header() and for
* tunnel skb it points to outer mac header.
* Keeps track of level of encapsulation of network headers.
*/
struct skb_gso_cb {
union {
int mac_offset;
int data_offset;
};
int encap_level;
__wsum csum;
__u16 csum_start;
};
#define SKB_GSO_CB_OFFSET 32
#define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET))
static inline int skb_tnl_header_len(const struct sk_buff *inner_skb)
{
return (skb_mac_header(inner_skb) - inner_skb->head) -
SKB_GSO_CB(inner_skb)->mac_offset;
}
static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra)
{
int new_headroom, headroom;
int ret;
headroom = skb_headroom(skb);
ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC);
if (ret)
return ret;
new_headroom = skb_headroom(skb);
SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom);
return 0;
}
static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
{
/* Do not update partial checksums if remote checksum is enabled. */
if (skb->remcsum_offload)
return;
SKB_GSO_CB(skb)->csum = res;
SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head;
}
/* Compute the checksum for a gso segment. First compute the checksum value
* from the start of transport header to SKB_GSO_CB(skb)->csum_start, and
* then add in skb->csum (checksum from csum_start to end of packet).
* skb->csum and csum_start are then updated to reflect the checksum of the
* resultant packet starting from the transport header-- the resultant checksum
* is in the res argument (i.e. normally zero or ~ of checksum of a pseudo
* header.
*/
static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res)
{
unsigned char *csum_start = skb_transport_header(skb);
int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start;
__wsum partial = SKB_GSO_CB(skb)->csum;
SKB_GSO_CB(skb)->csum = res;
SKB_GSO_CB(skb)->csum_start = csum_start - skb->head;
return csum_fold(csum_partial(csum_start, plen, partial));
}
static inline bool skb_is_gso(const struct sk_buff *skb)
{
return skb_shinfo(skb)->gso_size;
}
/* Note: Should be called only if skb_is_gso(skb) is true */
static inline bool skb_is_gso_v6(const struct sk_buff *skb)
{
return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6;
}
/* Note: Should be called only if skb_is_gso(skb) is true */
static inline bool skb_is_gso_sctp(const struct sk_buff *skb)
{
return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP;
}
/* Note: Should be called only if skb_is_gso(skb) is true */
static inline bool skb_is_gso_tcp(const struct sk_buff *skb)
{
return skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6);
}
static inline void skb_gso_reset(struct sk_buff *skb)
{
skb_shinfo(skb)->gso_size = 0;
skb_shinfo(skb)->gso_segs = 0;
skb_shinfo(skb)->gso_type = 0;
}
static inline void skb_increase_gso_size(struct skb_shared_info *shinfo,
u16 increment)
{
if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
return;
shinfo->gso_size += increment;
}
static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo,
u16 decrement)
{
if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
return;
shinfo->gso_size -= decrement;
}
void __skb_warn_lro_forwarding(const struct sk_buff *skb);
static inline bool skb_warn_if_lro(const struct sk_buff *skb)
{
/* LRO sets gso_size but not gso_type, whereas if GSO is really
* wanted then gso_type will be set. */
const struct skb_shared_info *shinfo = skb_shinfo(skb);
if (skb_is_nonlinear(skb) && shinfo->gso_size != 0 &&
unlikely(shinfo->gso_type == 0)) {
__skb_warn_lro_forwarding(skb);
return true;
}
return false;
}
static inline void skb_forward_csum(struct sk_buff *skb)
{
/* Unfortunately we don't support this one. Any brave souls? */
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed = CHECKSUM_NONE;
}
/**
* skb_checksum_none_assert - make sure skb ip_summed is CHECKSUM_NONE
* @skb: skb to check
*
* fresh skbs have their ip_summed set to CHECKSUM_NONE.
* Instead of forcing ip_summed to CHECKSUM_NONE, we can
* use this helper, to document places where we make this assertion.
*/
static inline void skb_checksum_none_assert(const struct sk_buff *skb)
{
#ifdef DEBUG
BUG_ON(skb->ip_summed != CHECKSUM_NONE);
#endif
}
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
unsigned int transport_len,
__sum16(*skb_chkf)(struct sk_buff *skb));
/**
* skb_head_is_locked - Determine if the skb->head is locked down
* @skb: skb to check
*
* The head on skbs build around a head frag can be removed if they are
* not cloned. This function returns true if the skb head is locked down
* due to either being allocated via kmalloc, or by being a clone with
* multiple references to the head.
*/
static inline bool skb_head_is_locked(const struct sk_buff *skb)
{
return !skb->head_frag || skb_cloned(skb);
}
/* Local Checksum Offload.
* Compute outer checksum based on the assumption that the
* inner checksum will be offloaded later.
* See Documentation/networking/checksum-offloads.rst for
* explanation of how this works.
* Fill in outer checksum adjustment (e.g. with sum of outer
* pseudo-header) before calling.
* Also ensure that inner checksum is in linear data area.
*/
static inline __wsum lco_csum(struct sk_buff *skb)
{
unsigned char *csum_start = skb_checksum_start(skb);
unsigned char *l4_hdr = skb_transport_header(skb);
__wsum partial;
/* Start with complement of inner checksum adjustment */
partial = ~csum_unfold(*(__force __sum16 *)(csum_start +
skb->csum_offset));
/* Add in checksum of our headers (incl. outer checksum
* adjustment filled in by caller) and return result.
*/
return csum_partial(l4_hdr, csum_start - l4_hdr, partial);
}
static inline bool skb_is_redirected(const struct sk_buff *skb)
{
return skb->redirected;
}
static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress)
{
skb->redirected = 1;
#ifdef CONFIG_NET_REDIRECT
skb->from_ingress = from_ingress;
if (skb->from_ingress)
skb->tstamp = 0;
#endif
}
static inline void skb_reset_redirect(struct sk_buff *skb)
{
skb->redirected = 0;
}
static inline bool skb_csum_is_sctp(struct sk_buff *skb)
{
return skb->csum_not_inet;
}
static inline void skb_set_kcov_handle(struct sk_buff *skb,
const u64 kcov_handle)
{
#ifdef CONFIG_KCOV
skb->kcov_handle = kcov_handle;
#endif
}
static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
{
#ifdef CONFIG_KCOV
return skb->kcov_handle;
#else
return 0;
#endif
}
#ifdef CONFIG_PAGE_POOL
static inline void skb_mark_for_recycle(struct sk_buff *skb)
{
skb->pp_recycle = 1;
}
#endif
static inline bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
return false;
return page_pool_return_skb_page(virt_to_page(data));
}
#endif /* __KERNEL__ */
#endif /* _LINUX_SKBUFF_H */
#ifndef _LINUX_HASH_H
#define _LINUX_HASH_H
/* Fast hashing routine for ints, longs and pointers.
(C) 2002 Nadia Yvette Chambers, IBM */
#include <asm/types.h>
#include <linux/compiler.h>
/*
* The "GOLDEN_RATIO_PRIME" is used in ifs/btrfs/brtfs_inode.h and
* fs/inode.c. It's not actually prime any more (the previous primes
* were actively bad for hashing), but the name remains.
*/
#if BITS_PER_LONG == 32
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
#define hash_long(val, bits) hash_32(val, bits)
#elif BITS_PER_LONG == 64
#define hash_long(val, bits) hash_64(val, bits)
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64
#else
#error Wordsize not 32 or 64
#endif
/*
* This hash multiplies the input by a large odd number and takes the
* high bits. Since multiplication propagates changes to the most
* significant end only, it is essential that the high bits of the
* product be used for the hash value.
*
* Chuck Lever verified the effectiveness of this technique:
* http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
*
* Although a random odd number will do, it turns out that the golden
* ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
* properties. (See Knuth vol 3, section 6.4, exercise 9.)
*
* These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2,
* which is very slightly easier to multiply by and makes no
* difference to the hash distribution.
*/
#define GOLDEN_RATIO_32 0x61C88647
#define GOLDEN_RATIO_64 0x61C8864680B583EBull
#ifdef CONFIG_HAVE_ARCH_HASH
/* This header may use the GOLDEN_RATIO_xx constants */
#include <asm/hash.h>
#endif
/*
* The _generic versions exist only so lib/test_hash.c can compare
* the arch-optimized versions with the generic.
*
* Note that if you change these, any <asm/hash.h> that aren't updated
* to match need to have their HAVE_ARCH_* define values updated so the
* self-test will not false-positive.
*/
#ifndef HAVE_ARCH__HASH_32
#define __hash_32 __hash_32_generic
#endif
static inline u32 __hash_32_generic(u32 val)
{
return val * GOLDEN_RATIO_32;
}
#ifndef HAVE_ARCH_HASH_32
#define hash_32 hash_32_generic
#endif
static inline u32 hash_32_generic(u32 val, unsigned int bits)
{
/* High bits are more random, so use them. */
return __hash_32(val) >> (32 - bits);
}
#ifndef HAVE_ARCH_HASH_64
#define hash_64 hash_64_generic
#endif
static __always_inline u32 hash_64_generic(u64 val, unsigned int bits)
{
#if BITS_PER_LONG == 64
/* 64x64-bit multiply is efficient on all 64-bit processors */
return val * GOLDEN_RATIO_64 >> (64 - bits);
#else
/* Hash 64 bits using only 32x32-bit multiply. */
return hash_32((u32)val ^ __hash_32(val >> 32), bits);
#endif
}
static inline u32 hash_ptr(const void *ptr, unsigned int bits)
{
return hash_long((unsigned long)ptr, bits);
}
/* This really should be called fold32_ptr; it does no hashing to speak of. */
static inline u32 hash32_ptr(const void *ptr)
{
unsigned long val = (unsigned long)ptr;
#if BITS_PER_LONG == 64
val ^= (val >> 32);
#endif
return (u32)val;
}
#endif /* _LINUX_HASH_H */
/* zutil.h -- internal interface and configuration of the compression library
* Copyright (C) 1995-1998 Jean-loup Gailly.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
/* WARNING: this file should *not* be used by applications. It is
part of the implementation of the compression library and is
subject to change. Applications should only use zlib.h.
*/
/* @(#) $Id: zutil.h,v 1.1 2000/01/01 03:32:23 davem Exp $ */
#ifndef _Z_UTIL_H
#define _Z_UTIL_H
#include <linux/zlib.h>
#include <linux/string.h>
#include <linux/kernel.h>
typedef unsigned char uch;
typedef unsigned short ush;
typedef unsigned long ulg;
/* common constants */
#define STORED_BLOCK 0
#define STATIC_TREES 1
#define DYN_TREES 2
/* The three kinds of block type */
#define MIN_MATCH 3
#define MAX_MATCH 258
/* The minimum and maximum match lengths */
#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */
/* target dependencies */
/* Common defaults */
#ifndef OS_CODE
# define OS_CODE 0x03 /* assume Unix */
#endif
/* functions */
typedef uLong (*check_func) (uLong check, const Byte *buf,
uInt len);
/* checksum functions */
#define BASE 65521L /* largest prime smaller than 65536 */
#define NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define DO1(buf,i) {s1 += buf[i]; s2 += s1;}
#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1);
#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2);
#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
#define DO16(buf) DO8(buf,0); DO8(buf,8);
/* ========================================================================= */
/*
Update a running Adler-32 checksum with the bytes buf[0..len-1] and
return the updated checksum. If buf is NULL, this function returns
the required initial value for the checksum.
An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
much faster. Usage example:
uLong adler = zlib_adler32(0L, NULL, 0);
while (read_buffer(buffer, length) != EOF) {
adler = zlib_adler32(adler, buffer, length);
}
if (adler != original_adler) error();
*/
static inline uLong zlib_adler32(uLong adler,
const Byte *buf,
uInt len)
{
unsigned long s1 = adler & 0xffff;
unsigned long s2 = (adler >> 16) & 0xffff;
int k;
if (buf == NULL) return 1L;
while (len > 0) {
k = len < NMAX ? len : NMAX;
len -= k;
while (k >= 16) {
DO16(buf);
buf += 16;
k -= 16;
}
if (k != 0) do { s1 += *buf++;
s2 += s1;
} while (--k);
s1 %= BASE;
s2 %= BASE;
}
return (s2 << 16) | s1;
}
#endif /* _Z_UTIL_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* ratelimit.c - Do something with rate limit.
*
* Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com>
*
* 2008-05-01 rewrite the function and use a ratelimit_state data struct as
* parameter. Now every user can use their own standalone ratelimit_state.
*/
#include <linux/ratelimit.h>
#include <linux/jiffies.h>
#include <linux/export.h>
/*
* __ratelimit - rate limiting
* @rs: ratelimit_state data
* @func: name of calling function
*
* This enforces a rate limit: not more than @rs->burst callbacks
* in every @rs->interval
*
* RETURNS:
* 0 means callbacks will be suppressed.
* 1 means go ahead and do it.
*/
int ___ratelimit(struct ratelimit_state *rs, const char *func)
{
unsigned long flags;
int ret;
if (!rs->interval)
return 1;
/*
* If we contend on this state's lock then almost
* by definition we are too busy to print a message,
* in addition to the one that will be printed by
* the entity that is holding the lock already:
*/
if (!raw_spin_trylock_irqsave(&rs->lock, flags))
return 0;
if (!rs->begin) rs->begin = jiffies; if (time_is_before_jiffies(rs->begin + rs->interval)) { if (rs->missed) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { printk_deferred(KERN_WARNING
"%s: %d callbacks suppressed\n",
func, rs->missed);
rs->missed = 0;
}
}
rs->begin = jiffies;
rs->printed = 0;
}
if (rs->burst && rs->burst > rs->printed) { rs->printed++;
ret = 1;
} else {
rs->missed++;
ret = 0;
}
raw_spin_unlock_irqrestore(&rs->lock, flags);
return ret;
}
EXPORT_SYMBOL(___ratelimit);
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* CRC32C
*@Article{castagnoli-crc,
* author = { Guy Castagnoli and Stefan Braeuer and Martin Herrman},
* title = {{Optimization of Cyclic Redundancy-Check Codes with 24
* and 32 Parity Bits}},
* journal = IEEE Transactions on Communication,
* year = {1993},
* volume = {41},
* number = {6},
* pages = {},
* month = {June},
*}
* Used by the iSCSI driver, possibly others, and derived from
* the iscsi-crc.c module of the linux-iscsi driver at
* http://linux-iscsi.sourceforge.net.
*
* Following the example of lib/crc32, this function is intended to be
* flexible and useful for all users. Modules that currently have their
* own crc32c, but hopefully may be able to use this one are:
* net/sctp (please add all your doco to here if you change to
* use this one!)
* <endoflist>
*
* Copyright (c) 2004 Cisco Systems, Inc.
*/
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/crc32c.h>
static struct crypto_shash *tfm;
u32 crc32c(u32 crc, const void *address, unsigned int length)
{
SHASH_DESC_ON_STACK(shash, tfm);
u32 ret, *ctx = (u32 *)shash_desc_ctx(shash);
int err;
shash->tfm = tfm;
*ctx = crc;
err = crypto_shash_update(shash, address, length);
BUG_ON(err); ret = *ctx;
barrier_data(ctx);
return ret;
}
EXPORT_SYMBOL(crc32c);
static int __init libcrc32c_mod_init(void)
{
tfm = crypto_alloc_shash("crc32c", 0, 0);
return PTR_ERR_OR_ZERO(tfm);
}
static void __exit libcrc32c_mod_fini(void)
{
crypto_free_shash(tfm);
}
const char *crc32c_impl(void)
{
return crypto_shash_driver_name(tfm);
}
EXPORT_SYMBOL(crc32c_impl);
module_init(libcrc32c_mod_init);
module_exit(libcrc32c_mod_fini);
MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>");
MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
// SPDX-License-Identifier: GPL-2.0-only
/*
* fs/dcache.c
*
* Complete reimplementation
* (C) 1997 Thomas Schoebel-Theuer,
* with heavy changes by Linus Torvalds
*/
/*
* Notes on the allocation strategy:
*
* The dcache is a master of the icache - whenever a dcache entry
* exists, the inode will always exist. "iput()" is done either when
* the dcache entry is deleted or garbage collected.
*/
#include <linux/ratelimit.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/memblock.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
#include "internal.h"
#include "mount.h"
/*
* Usage:
* dcache->d_inode->i_lock protects:
* - i_dentry, d_u.d_alias, d_inode of aliases
* dcache_hash_bucket lock protects:
* - the dcache hash table
* s_roots bl list spinlock protects:
* - the s_roots list (see __d_drop)
* dentry->d_sb->s_dentry_lru_lock protects:
* - the dcache lru lists and counters
* d_lock protects:
* - d_flags
* - d_name
* - d_lru
* - d_count
* - d_unhashed()
* - d_parent and d_subdirs
* - childrens' d_child and d_parent
* - d_u.d_alias, d_inode
*
* Ordering:
* dentry->d_inode->i_lock
* dentry->d_lock
* dentry->d_sb->s_dentry_lru_lock
* dcache_hash_bucket lock
* s_roots lock
*
* If there is an ancestor relationship:
* dentry->d_parent->...->d_parent->d_lock
* ...
* dentry->d_parent->d_lock
* dentry->d_lock
*
* If no ancestor relationship:
* arbitrary, since it's serialized on rename_lock
*/
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);
static struct kmem_cache *dentry_cache __read_mostly;
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
const struct qstr slash_name = QSTR_INIT("/", 1);
EXPORT_SYMBOL(slash_name);
const struct qstr dotdot_name = QSTR_INIT("..", 2);
EXPORT_SYMBOL(dotdot_name);
/*
* This is the single most critical data structure when it comes
* to the dcache: the hashtable for lookups. Somebody should try
* to make this good - I've just made it work.
*
* This hash-function tries to avoid losing too many bits of hash
* information, yet avoid using a prime hash-size or similar.
*/
static unsigned int d_hash_shift __read_mostly;
static struct hlist_bl_head *dentry_hashtable __read_mostly;
static inline struct hlist_bl_head *d_hash(unsigned int hash)
{
return dentry_hashtable + (hash >> d_hash_shift);
}
#define IN_LOOKUP_SHIFT 10
static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT];
static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent,
unsigned int hash)
{
hash += (unsigned long) parent / L1_CACHE_BYTES;
return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT);
}
/* Statistics gathering. */
struct dentry_stat_t dentry_stat = {
.age_limit = 45,
};
static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
static DEFINE_PER_CPU(long, nr_dentry_negative);
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
/*
* Here we resort to our own counters instead of using generic per-cpu counters
* for consistency with what the vfs inode code does. We are expected to harvest
* better code and performance by having our own specialized counters.
*
* Please note that the loop is done over all possible CPUs, not over all online
* CPUs. The reason for this is that we don't want to play games with CPUs going
* on and off. If one of them goes off, we will just keep their counters.
*
* glommer: See cffbc8a for details, and if you ever intend to change this,
* please update all vfs counters to match.
*/
static long get_nr_dentry(void)
{
int i;
long sum = 0;
for_each_possible_cpu(i)
sum += per_cpu(nr_dentry, i);
return sum < 0 ? 0 : sum;
}
static long get_nr_dentry_unused(void)
{
int i;
long sum = 0;
for_each_possible_cpu(i)
sum += per_cpu(nr_dentry_unused, i);
return sum < 0 ? 0 : sum;
}
static long get_nr_dentry_negative(void)
{
int i;
long sum = 0;
for_each_possible_cpu(i)
sum += per_cpu(nr_dentry_negative, i);
return sum < 0 ? 0 : sum;
}
int proc_nr_dentry(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
dentry_stat.nr_dentry = get_nr_dentry();
dentry_stat.nr_unused = get_nr_dentry_unused();
dentry_stat.nr_negative = get_nr_dentry_negative();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif
/*
* Compare 2 name strings, return 0 if they match, otherwise non-zero.
* The strings are both count bytes long, and count is non-zero.
*/
#ifdef CONFIG_DCACHE_WORD_ACCESS
#include <asm/word-at-a-time.h>
/*
* NOTE! 'cs' and 'scount' come from a dentry, so it has a
* aligned allocation for this particular component. We don't
* strictly need the load_unaligned_zeropad() safety, but it
* doesn't hurt either.
*
* In contrast, 'ct' and 'tcount' can be from a pathname, and do
* need the careful unaligned handling.
*/
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
unsigned long a,b,mask;
for (;;) {
a = read_word_at_a_time(cs);
b = load_unaligned_zeropad(ct);
if (tcount < sizeof(unsigned long))
break;
if (unlikely(a != b))
return 1;
cs += sizeof(unsigned long);
ct += sizeof(unsigned long);
tcount -= sizeof(unsigned long);
if (!tcount)
return 0;
}
mask = bytemask_from_count(tcount);
return unlikely(!!((a ^ b) & mask));
}
#else
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
do {
if (*cs != *ct)
return 1;
cs++;
ct++;
tcount--;
} while (tcount);
return 0;
}
#endif
static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
{
/*
* Be careful about RCU walk racing with rename:
* use 'READ_ONCE' to fetch the name pointer.
*
* NOTE! Even if a rename will mean that the length
* was not loaded atomically, we don't care. The
* RCU walk will check the sequence count eventually,
* and catch it. And we won't overrun the buffer,
* because we're reading the name pointer atomically,
* and a dentry name is guaranteed to be properly
* terminated with a NUL byte.
*
* End result: even if 'len' is wrong, we'll exit
* early because the data cannot match (there can
* be no NUL in the ct/tcount data)
*/
const unsigned char *cs = READ_ONCE(dentry->d_name.name);
return dentry_string_cmp(cs, ct, tcount);
}
struct external_name {
union {
atomic_t count;
struct rcu_head head;
} u;
unsigned char name[];
};
static inline struct external_name *external_name(struct dentry *dentry)
{
return container_of(dentry->d_name.name, struct external_name, name[0]);
}
static void __d_free(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
kmem_cache_free(dentry_cache, dentry);
}
static void __d_free_external(struct rcu_head *head)
{
struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
kfree(external_name(dentry));
kmem_cache_free(dentry_cache, dentry);
}
static inline int dname_external(const struct dentry *dentry)
{
return dentry->d_name.name != dentry->d_iname;
}
void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
{
spin_lock(&dentry->d_lock);
name->name = dentry->d_name;
if (unlikely(dname_external(dentry))) {
atomic_inc(&external_name(dentry)->u.count);
} else {
memcpy(name->inline_name, dentry->d_iname,
dentry->d_name.len + 1);
name->name.name = name->inline_name;
}
spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(take_dentry_name_snapshot);
void release_dentry_name_snapshot(struct name_snapshot *name)
{
if (unlikely(name->name.name != name->inline_name)) {
struct external_name *p;
p = container_of(name->name.name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) kfree_rcu(p, u.head);
}
}
EXPORT_SYMBOL(release_dentry_name_snapshot);
static inline void __d_set_inode_and_type(struct dentry *dentry,
struct inode *inode,
unsigned type_flags)
{
unsigned flags;
dentry->d_inode = inode;
flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
flags |= type_flags;
smp_store_release(&dentry->d_flags, flags);
}
static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
unsigned flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
WRITE_ONCE(dentry->d_flags, flags);
dentry->d_inode = NULL;
if (dentry->d_flags & DCACHE_LRU_LIST)
this_cpu_inc(nr_dentry_negative);
}
static void dentry_free(struct dentry *dentry)
{
WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
if (unlikely(dname_external(dentry))) {
struct external_name *p = external_name(dentry);
if (likely(atomic_dec_and_test(&p->u.count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external);
return;
}
}
/* if dentry was never visible to RCU, immediate free is OK */
if (dentry->d_flags & DCACHE_NORCU)
__d_free(&dentry->d_u.d_rcu);
else
call_rcu(&dentry->d_u.d_rcu, __d_free);
}
/*
* Release the dentry's inode, using the filesystem
* d_iput() operation if defined.
*/
static void dentry_unlink_inode(struct dentry * dentry)
__releases(dentry->d_lock)
__releases(dentry->d_inode->i_lock)
{
struct inode *inode = dentry->d_inode;
raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
fsnotify_inoderemove(inode);
if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode);
else
iput(inode);
}
/*
* The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
* is in use - which includes both the "real" per-superblock
* LRU list _and_ the DCACHE_SHRINK_LIST use.
*
* The DCACHE_SHRINK_LIST bit is set whenever the dentry is
* on the shrink list (ie not on the superblock LRU list).
*
* The per-cpu "nr_dentry_unused" counters are updated with
* the DCACHE_LRU_LIST bit.
*
* The per-cpu "nr_dentry_negative" counters are only updated
* when deleted from or added to the per-superblock LRU list, not
* from/to the shrink list. That is to avoid an unneeded dec/inc
* pair when moving from LRU to shrink list in select_collect().
*
* These helper functions make sure we always follow the
* rules. d_lock must be held by the caller.
*/
#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
static void d_lru_add(struct dentry *dentry)
{
D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST;
this_cpu_inc(nr_dentry_unused);
if (d_is_negative(dentry))
this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}
static void d_lru_del(struct dentry *dentry)
{
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST;
this_cpu_dec(nr_dentry_unused);
if (d_is_negative(dentry))
this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}
static void d_shrink_del(struct dentry *dentry)
{
D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru);
dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
this_cpu_dec(nr_dentry_unused);
}
static void d_shrink_add(struct dentry *dentry, struct list_head *list)
{
D_FLAG_VERIFY(dentry, 0); list_add(&dentry->d_lru, list);
dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
this_cpu_inc(nr_dentry_unused);
}
/*
* These can only be called under the global LRU lock, ie during the
* callback for freeing the LRU list. "isolate" removes it from the
* LRU lists entirely, while shrink_move moves it to the indicated
* private list.
*/
static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
{
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
dentry->d_flags &= ~DCACHE_LRU_LIST;
this_cpu_dec(nr_dentry_unused);
if (d_is_negative(dentry))
this_cpu_dec(nr_dentry_negative);
list_lru_isolate(lru, &dentry->d_lru);
}
static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
struct list_head *list)
{
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST;
if (d_is_negative(dentry))
this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list);
}
static void ___d_drop(struct dentry *dentry)
{
struct hlist_bl_head *b;
/*
* Hashed dentries are normally on the dentry hashtable,
* with the exception of those newly allocated by
* d_obtain_root, which are always IS_ROOT:
*/
if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_roots;
else
b = d_hash(dentry->d_name.hash);
hlist_bl_lock(b);
__hlist_bl_del(&dentry->d_hash);
hlist_bl_unlock(b);
}
void __d_drop(struct dentry *dentry)
{
if (!d_unhashed(dentry)) { ___d_drop(dentry);
dentry->d_hash.pprev = NULL;
write_seqcount_invalidate(&dentry->d_seq);
}
}
EXPORT_SYMBOL(__d_drop);
/**
* d_drop - drop a dentry
* @dentry: dentry to drop
*
* d_drop() unhashes the entry from the parent dentry hashes, so that it won't
* be found through a VFS lookup any more. Note that this is different from
* deleting the dentry - d_delete will try to mark the dentry negative if
* possible, giving a successful _negative_ lookup, while d_drop will
* just make the cache lookup fail.
*
* d_drop() is used mainly for stuff that wants to invalidate a dentry for some
* reason (NFS timeouts or autofs deletes).
*
* __d_drop requires dentry->d_lock
*
* ___d_drop doesn't mark dentry as "unhashed"
* (dentry->d_hash.pprev will be LIST_POISON2, not NULL).
*/
void d_drop(struct dentry *dentry)
{
spin_lock(&dentry->d_lock);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(d_drop);
static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
{
struct dentry *next;
/*
* Inform d_walk() and shrink_dentry_list() that we are no longer
* attached to the dentry tree
*/
dentry->d_flags |= DCACHE_DENTRY_KILLED;
if (unlikely(list_empty(&dentry->d_child)))
return;
__list_del_entry(&dentry->d_child);
/*
* Cursors can move around the list of children. While we'd been
* a normal list member, it didn't matter - ->d_child.next would've
* been updated. However, from now on it won't be and for the
* things like d_walk() it might end up with a nasty surprise.
* Normally d_walk() doesn't care about cursors moving around -
* ->d_lock on parent prevents that and since a cursor has no children
* of its own, we get through it without ever unlocking the parent.
* There is one exception, though - if we ascend from a child that
* gets killed as soon as we unlock it, the next sibling is found
* using the value left in its ->d_child.next. And if _that_
* pointed to a cursor, and cursor got moved (e.g. by lseek())
* before d_walk() regains parent->d_lock, we'll end up skipping
* everything the cursor had been moved past.
*
* Solution: make sure that the pointer left behind in ->d_child.next
* points to something that won't be moving around. I.e. skip the
* cursors.
*/
while (dentry->d_child.next != &parent->d_subdirs) {
next = list_entry(dentry->d_child.next, struct dentry, d_child);
if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
break;
dentry->d_child.next = next->d_child.next;
}
}
static void __dentry_kill(struct dentry *dentry)
{
struct dentry *parent = NULL;
bool can_free = true;
if (!IS_ROOT(dentry))
parent = dentry->d_parent;
/*
* The dentry is now unrecoverably dead to the world.
*/
lockref_mark_dead(&dentry->d_lockref);
/*
* inform the fs via d_prune that this dentry is about to be
* unhashed and destroyed.
*/
if (dentry->d_flags & DCACHE_OP_PRUNE)
dentry->d_op->d_prune(dentry); if (dentry->d_flags & DCACHE_LRU_LIST) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) d_lru_del(dentry);
}
/* if it was on the hash then remove it */
__d_drop(dentry);
dentry_unlist(dentry, parent);
if (parent)
spin_unlock(&parent->d_lock);
if (dentry->d_inode) dentry_unlink_inode(dentry);
else
spin_unlock(&dentry->d_lock);
this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry);
spin_lock(&dentry->d_lock);
if (dentry->d_flags & DCACHE_SHRINK_LIST) {
dentry->d_flags |= DCACHE_MAY_FREE;
can_free = false;
}
spin_unlock(&dentry->d_lock);
if (likely(can_free))
dentry_free(dentry);
cond_resched();
}
static struct dentry *__lock_parent(struct dentry *dentry)
{
struct dentry *parent;
rcu_read_lock();
spin_unlock(&dentry->d_lock);
again:
parent = READ_ONCE(dentry->d_parent);
spin_lock(&parent->d_lock);
/*
* We can't blindly lock dentry until we are sure
* that we won't violate the locking order.
* Any changes of dentry->d_parent must have
* been done with parent->d_lock held, so
* spin_lock() above is enough of a barrier
* for checking if it's still our child.
*/
if (unlikely(parent != dentry->d_parent)) {
spin_unlock(&parent->d_lock);
goto again;
}
rcu_read_unlock();
if (parent != dentry)
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
else
parent = NULL;
return parent;
}
static inline struct dentry *lock_parent(struct dentry *dentry)
{
struct dentry *parent = dentry->d_parent;
if (IS_ROOT(dentry))
return NULL;
if (likely(spin_trylock(&parent->d_lock)))
return parent;
return __lock_parent(dentry);
}
static inline bool retain_dentry(struct dentry *dentry)
{
WARN_ON(d_in_lookup(dentry));
/* Unreachable? Get rid of it */
if (unlikely(d_unhashed(dentry)))
return false;
if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
return false;
if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { if (dentry->d_op->d_delete(dentry))
return false;
}
if (unlikely(dentry->d_flags & DCACHE_DONTCACHE))
return false;
/* retain; LRU fodder */
dentry->d_lockref.count--;
if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
d_lru_add(dentry); else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) dentry->d_flags |= DCACHE_REFERENCED;
return true;
}
void d_mark_dontcache(struct inode *inode)
{
struct dentry *de;
spin_lock(&inode->i_lock);
hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) {
spin_lock(&de->d_lock);
de->d_flags |= DCACHE_DONTCACHE;
spin_unlock(&de->d_lock);
}
inode->i_state |= I_DONTCACHE;
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_mark_dontcache);
/*
* Finish off a dentry we've decided to kill.
* dentry->d_lock must be held, returns with it unlocked.
* Returns dentry requiring refcount drop, or NULL if we're done.
*/
static struct dentry *dentry_kill(struct dentry *dentry)
__releases(dentry->d_lock)
{
struct inode *inode = dentry->d_inode;
struct dentry *parent = NULL;
if (inode && unlikely(!spin_trylock(&inode->i_lock)))
goto slow_positive;
if (!IS_ROOT(dentry)) {
parent = dentry->d_parent;
if (unlikely(!spin_trylock(&parent->d_lock))) {
parent = __lock_parent(dentry); if (likely(inode || !dentry->d_inode))
goto got_locks;
/* negative that became positive */
if (parent)
spin_unlock(&parent->d_lock);
inode = dentry->d_inode;
goto slow_positive;
}
}
__dentry_kill(dentry);
return parent;
slow_positive:
spin_unlock(&dentry->d_lock);
spin_lock(&inode->i_lock);
spin_lock(&dentry->d_lock);
parent = lock_parent(dentry);
got_locks:
if (unlikely(dentry->d_lockref.count != 1)) { dentry->d_lockref.count--;
} else if (likely(!retain_dentry(dentry))) {
__dentry_kill(dentry);
return parent;
}
/* we are keeping it, after all */
if (inode)
spin_unlock(&inode->i_lock);
if (parent)
spin_unlock(&parent->d_lock);
spin_unlock(&dentry->d_lock);
return NULL;
}
/*
* Try to do a lockless dput(), and return whether that was successful.
*
* If unsuccessful, we return false, having already taken the dentry lock.
*
* The caller needs to hold the RCU read lock, so that the dentry is
* guaranteed to stay around even if the refcount goes down to zero!
*/
static inline bool fast_dput(struct dentry *dentry)
{
int ret;
unsigned int d_flags;
/*
* If we have a d_op->d_delete() operation, we sould not
* let the dentry count go to zero, so use "put_or_lock".
*/
if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
return lockref_put_or_lock(&dentry->d_lockref);
/*
* .. otherwise, we can try to just decrement the
* lockref optimistically.
*/
ret = lockref_put_return(&dentry->d_lockref);
/*
* If the lockref_put_return() failed due to the lock being held
* by somebody else, the fast path has failed. We will need to
* get the lock, and then check the count again.
*/
if (unlikely(ret < 0)) {
spin_lock(&dentry->d_lock);
if (dentry->d_lockref.count > 1) {
dentry->d_lockref.count--;
spin_unlock(&dentry->d_lock);
return true;
}
return false;
}
/*
* If we weren't the last ref, we're done.
*/
if (ret)
return true;
/*
* Careful, careful. The reference count went down
* to zero, but we don't hold the dentry lock, so
* somebody else could get it again, and do another
* dput(), and we need to not race with that.
*
* However, there is a very special and common case
* where we don't care, because there is nothing to
* do: the dentry is still hashed, it does not have
* a 'delete' op, and it's referenced and already on
* the LRU list.
*
* NOTE! Since we aren't locked, these values are
* not "stable". However, it is sufficient that at
* some point after we dropped the reference the
* dentry was hashed and the flags had the proper
* value. Other dentry users may have re-gotten
* a reference to the dentry and change that, but
* our work is done - we can leave the dentry
* around with a zero refcount.
*
* Nevertheless, there are two cases that we should kill
* the dentry anyway.
* 1. free disconnected dentries as soon as their refcount
* reached zero.
* 2. free dentries if they should not be cached.
*/
smp_rmb();
d_flags = READ_ONCE(dentry->d_flags);
d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
/* Nothing to do? Dropping the reference was all we needed? */
if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
return true;
/*
* Not the fast normal case? Get the lock. We've already decremented
* the refcount, but we'll need to re-check the situation after
* getting the lock.
*/
spin_lock(&dentry->d_lock);
/*
* Did somebody else grab a reference to it in the meantime, and
* we're no longer the last user after all? Alternatively, somebody
* else could have killed it and marked it dead. Either way, we
* don't need to do anything else.
*/
if (dentry->d_lockref.count) {
spin_unlock(&dentry->d_lock);
return true;
}
/*
* Re-get the reference we optimistically dropped. We hold the
* lock, and we just tested that it was zero, so we can just
* set it to 1.
*/
dentry->d_lockref.count = 1;
return false;
}
/*
* This is dput
*
* This is complicated by the fact that we do not want to put
* dentries that are no longer on any hash chain on the unused
* list: we'd much rather just get rid of them immediately.
*
* However, that implies that we have to traverse the dentry
* tree upwards to the parents which might _also_ now be
* scheduled for deletion (it may have been only waiting for
* its last child to go away).
*
* This tail recursion is done by hand as we don't want to depend
* on the compiler to always get this right (gcc generally doesn't).
* Real recursion would eat up our stack space.
*/
/*
* dput - release a dentry
* @dentry: dentry to release
*
* Release a dentry. This will drop the usage count and if appropriate
* call the dentry unlink method as well as removing it from the queues and
* releasing its resources. If the parent dentries were scheduled for release
* they too may now get deleted.
*/
void dput(struct dentry *dentry)
{
while (dentry) { might_sleep();
rcu_read_lock();
if (likely(fast_dput(dentry))) {
rcu_read_unlock();
return;
}
/* Slow case: now with the dentry lock held */
rcu_read_unlock();
if (likely(retain_dentry(dentry))) {
spin_unlock(&dentry->d_lock);
return;
}
dentry = dentry_kill(dentry);
}
}
EXPORT_SYMBOL(dput);
static void __dput_to_list(struct dentry *dentry, struct list_head *list)
__must_hold(&dentry->d_lock)
{
if (dentry->d_flags & DCACHE_SHRINK_LIST) {
/* let the owner of the list it's on deal with it */
--dentry->d_lockref.count;
} else {
if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); if (!--dentry->d_lockref.count)
d_shrink_add(dentry, list);
}
}
void dput_to_list(struct dentry *dentry, struct list_head *list)
{
rcu_read_lock();
if (likely(fast_dput(dentry))) {
rcu_read_unlock();
return;
}
rcu_read_unlock();
if (!retain_dentry(dentry))
__dput_to_list(dentry, list);
spin_unlock(&dentry->d_lock);
}
/* This must be called with d_lock held */
static inline void __dget_dlock(struct dentry *dentry)
{
dentry->d_lockref.count++;
}
static inline void __dget(struct dentry *dentry)
{
lockref_get(&dentry->d_lockref);
}
struct dentry *dget_parent(struct dentry *dentry)
{
int gotref;
struct dentry *ret;
unsigned seq;
/*
* Do optimistic parent lookup without any
* locking.
*/
rcu_read_lock();
seq = raw_seqcount_begin(&dentry->d_seq);
ret = READ_ONCE(dentry->d_parent);
gotref = lockref_get_not_zero(&ret->d_lockref);
rcu_read_unlock();
if (likely(gotref)) {
if (!read_seqcount_retry(&dentry->d_seq, seq))
return ret;
dput(ret);
}
repeat:
/*
* Don't need rcu_dereference because we re-check it was correct under
* the lock.
*/
rcu_read_lock();
ret = dentry->d_parent;
spin_lock(&ret->d_lock);
if (unlikely(ret != dentry->d_parent)) {
spin_unlock(&ret->d_lock);
rcu_read_unlock();
goto repeat;
}
rcu_read_unlock();
BUG_ON(!ret->d_lockref.count); ret->d_lockref.count++;
spin_unlock(&ret->d_lock);
return ret;
}
EXPORT_SYMBOL(dget_parent);
static struct dentry * __d_find_any_alias(struct inode *inode)
{
struct dentry *alias;
if (hlist_empty(&inode->i_dentry))
return NULL;
alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
__dget(alias);
return alias;
}
/**
* d_find_any_alias - find any alias for a given inode
* @inode: inode to find an alias for
*
* If any aliases exist for the given inode, take and return a
* reference for one of them. If no aliases exist, return %NULL.
*/
struct dentry *d_find_any_alias(struct inode *inode)
{
struct dentry *de;
spin_lock(&inode->i_lock);
de = __d_find_any_alias(inode);
spin_unlock(&inode->i_lock);
return de;
}
EXPORT_SYMBOL(d_find_any_alias);
static struct dentry *__d_find_alias(struct inode *inode)
{
struct dentry *alias;
if (S_ISDIR(inode->i_mode))
return __d_find_any_alias(inode);
hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
spin_lock(&alias->d_lock);
if (!d_unhashed(alias)) {
__dget_dlock(alias);
spin_unlock(&alias->d_lock);
return alias;
}
spin_unlock(&alias->d_lock);
}
return NULL;
}
/**
* d_find_alias - grab a hashed alias of inode
* @inode: inode in question
*
* If inode has a hashed alias, or is a directory and has any alias,
* acquire the reference to alias and return it. Otherwise return NULL.
* Notice that if inode is a directory there can be only one alias and
* it can be unhashed only if it has no children, or if it is the root
* of a filesystem, or if the directory was renamed and d_revalidate
* was the first vfs operation to notice.
*
* If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
* any other hashed alias over that one.
*/
struct dentry *d_find_alias(struct inode *inode)
{
struct dentry *de = NULL;
if (!hlist_empty(&inode->i_dentry)) {
spin_lock(&inode->i_lock);
de = __d_find_alias(inode);
spin_unlock(&inode->i_lock);
}
return de;
}
EXPORT_SYMBOL(d_find_alias);
/*
* Caller MUST be holding rcu_read_lock() and be guaranteed
* that inode won't get freed until rcu_read_unlock().
*/
struct dentry *d_find_alias_rcu(struct inode *inode)
{
struct hlist_head *l = &inode->i_dentry;
struct dentry *de = NULL;
spin_lock(&inode->i_lock);
// ->i_dentry and ->i_rcu are colocated, but the latter won't be
// used without having I_FREEING set, which means no aliases left
if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) {
if (S_ISDIR(inode->i_mode)) {
de = hlist_entry(l->first, struct dentry, d_u.d_alias);
} else {
hlist_for_each_entry(de, l, d_u.d_alias)
if (!d_unhashed(de))
break;
}
}
spin_unlock(&inode->i_lock);
return de;
}
/*
* Try to kill dentries associated with this inode.
* WARNING: you must own a reference to inode.
*/
void d_prune_aliases(struct inode *inode)
{
struct dentry *dentry;
restart:
spin_lock(&inode->i_lock);
hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
spin_lock(&dentry->d_lock);
if (!dentry->d_lockref.count) {
struct dentry *parent = lock_parent(dentry);
if (likely(!dentry->d_lockref.count)) {
__dentry_kill(dentry);
dput(parent);
goto restart;
}
if (parent)
spin_unlock(&parent->d_lock);
}
spin_unlock(&dentry->d_lock);
}
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_prune_aliases);
/*
* Lock a dentry from shrink list.
* Called under rcu_read_lock() and dentry->d_lock; the former
* guarantees that nothing we access will be freed under us.
* Note that dentry is *not* protected from concurrent dentry_kill(),
* d_delete(), etc.
*
* Return false if dentry has been disrupted or grabbed, leaving
* the caller to kick it off-list. Otherwise, return true and have
* that dentry's inode and parent both locked.
*/
static bool shrink_lock_dentry(struct dentry *dentry)
{
struct inode *inode;
struct dentry *parent;
if (dentry->d_lockref.count)
return false;
inode = dentry->d_inode;
if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
spin_unlock(&dentry->d_lock);
spin_lock(&inode->i_lock);
spin_lock(&dentry->d_lock);
if (unlikely(dentry->d_lockref.count))
goto out;
/* changed inode means that somebody had grabbed it */
if (unlikely(inode != dentry->d_inode))
goto out;
}
parent = dentry->d_parent;
if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
return true;
spin_unlock(&dentry->d_lock);
spin_lock(&parent->d_lock);
if (unlikely(parent != dentry->d_parent)) {
spin_unlock(&parent->d_lock);
spin_lock(&dentry->d_lock);
goto out;
}
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
if (likely(!dentry->d_lockref.count))
return true;
spin_unlock(&parent->d_lock);
out:
if (inode)
spin_unlock(&inode->i_lock);
return false;
}
void shrink_dentry_list(struct list_head *list)
{
while (!list_empty(list)) {
struct dentry *dentry, *parent;
dentry = list_entry(list->prev, struct dentry, d_lru);
spin_lock(&dentry->d_lock);
rcu_read_lock();
if (!shrink_lock_dentry(dentry)) {
bool can_free = false;
rcu_read_unlock();
d_shrink_del(dentry);
if (dentry->d_lockref.count < 0)
can_free = dentry->d_flags & DCACHE_MAY_FREE;
spin_unlock(&dentry->d_lock);
if (can_free)
dentry_free(dentry);
continue;
}
rcu_read_unlock();
d_shrink_del(dentry);
parent = dentry->d_parent;
if (parent != dentry)
__dput_to_list(parent, list); __dentry_kill(dentry);
}
}
static enum lru_status dentry_lru_isolate(struct list_head *item,
struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
struct list_head *freeable = arg;
struct dentry *dentry = container_of(item, struct dentry, d_lru);
/*
* we are inverting the lru lock/dentry->d_lock here,
* so use a trylock. If we fail to get the lock, just skip
* it
*/
if (!spin_trylock(&dentry->d_lock))
return LRU_SKIP;
/*
* Referenced dentries are still in use. If they have active
* counts, just remove them from the LRU. Otherwise give them
* another pass through the LRU.
*/
if (dentry->d_lockref.count) {
d_lru_isolate(lru, dentry);
spin_unlock(&dentry->d_lock);
return LRU_REMOVED;
}
if (dentry->d_flags & DCACHE_REFERENCED) {
dentry->d_flags &= ~DCACHE_REFERENCED;
spin_unlock(&dentry->d_lock);
/*
* The list move itself will be made by the common LRU code. At
* this point, we've dropped the dentry->d_lock but keep the
* lru lock. This is safe to do, since every list movement is
* protected by the lru lock even if both locks are held.
*
* This is guaranteed by the fact that all LRU management
* functions are intermediated by the LRU API calls like
* list_lru_add and list_lru_del. List movement in this file
* only ever occur through this functions or through callbacks
* like this one, that are called from the LRU API.
*
* The only exceptions to this are functions like
* shrink_dentry_list, and code that first checks for the
* DCACHE_SHRINK_LIST flag. Those are guaranteed to be
* operating only with stack provided lists after they are
* properly isolated from the main list. It is thus, always a
* local access.
*/
return LRU_ROTATE;
}
d_lru_shrink_move(lru, dentry, freeable);
spin_unlock(&dentry->d_lock);
return LRU_REMOVED;
}
/**
* prune_dcache_sb - shrink the dcache
* @sb: superblock
* @sc: shrink control, passed to list_lru_shrink_walk()
*
* Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
* is done when we need more memory and called from the superblock shrinker
* function.
*
* This function may fail to free any resources if all the dentries are in
* use.
*/
long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
{
LIST_HEAD(dispose);
long freed;
freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
dentry_lru_isolate, &dispose);
shrink_dentry_list(&dispose);
return freed;
}
static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
struct list_head *freeable = arg;
struct dentry *dentry = container_of(item, struct dentry, d_lru);
/*
* we are inverting the lru lock/dentry->d_lock here,
* so use a trylock. If we fail to get the lock, just skip
* it
*/
if (!spin_trylock(&dentry->d_lock))
return LRU_SKIP;
d_lru_shrink_move(lru, dentry, freeable);
spin_unlock(&dentry->d_lock);
return LRU_REMOVED;
}
/**
* shrink_dcache_sb - shrink dcache for a superblock
* @sb: superblock
*
* Shrink the dcache for the specified super block. This is used to free
* the dcache before unmounting a file system.
*/
void shrink_dcache_sb(struct super_block *sb)
{
do {
LIST_HEAD(dispose);
list_lru_walk(&sb->s_dentry_lru,
dentry_lru_isolate_shrink, &dispose, 1024);
shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0);}
EXPORT_SYMBOL(shrink_dcache_sb);
/**
* enum d_walk_ret - action to talke during tree walk
* @D_WALK_CONTINUE: contrinue walk
* @D_WALK_QUIT: quit walk
* @D_WALK_NORETRY: quit when retry is needed
* @D_WALK_SKIP: skip this dentry and its children
*/
enum d_walk_ret {
D_WALK_CONTINUE,
D_WALK_QUIT,
D_WALK_NORETRY,
D_WALK_SKIP,
};
/**
* d_walk - walk the dentry tree
* @parent: start of walk
* @data: data passed to @enter() and @finish()
* @enter: callback when first entering the dentry
*
* The @enter() callbacks are called with d_lock held.
*/
static void d_walk(struct dentry *parent, void *data,
enum d_walk_ret (*enter)(void *, struct dentry *))
{
struct dentry *this_parent;
struct list_head *next;
unsigned seq = 0;
enum d_walk_ret ret;
bool retry = true;
again:
read_seqbegin_or_lock(&rename_lock, &seq);
this_parent = parent;
spin_lock(&this_parent->d_lock);
ret = enter(data, this_parent);
switch (ret) {
case D_WALK_CONTINUE:
break;
case D_WALK_QUIT:
case D_WALK_SKIP:
goto out_unlock;
case D_WALK_NORETRY:
retry = false;
break;
}
repeat:
next = this_parent->d_subdirs.next;
resume:
while (next != &this_parent->d_subdirs) {
struct list_head *tmp = next;
struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
next = tmp->next;
if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
continue;
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
ret = enter(data, dentry);
switch (ret) {
case D_WALK_CONTINUE:
break;
case D_WALK_QUIT:
spin_unlock(&dentry->d_lock);
goto out_unlock;
case D_WALK_NORETRY:
retry = false;
break;
case D_WALK_SKIP:
spin_unlock(&dentry->d_lock);
continue;
}
if (!list_empty(&dentry->d_subdirs)) {
spin_unlock(&this_parent->d_lock);
spin_release(&dentry->d_lock.dep_map, _RET_IP_);
this_parent = dentry;
spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
goto repeat;
}
spin_unlock(&dentry->d_lock);
}
/*
* All done at this level ... ascend and resume the search.
*/
rcu_read_lock();
ascend:
if (this_parent != parent) {
struct dentry *child = this_parent;
this_parent = child->d_parent;
spin_unlock(&child->d_lock);
spin_lock(&this_parent->d_lock);
/* might go back up the wrong parent if we have had a rename. */
if (need_seqretry(&rename_lock, seq))
goto rename_retry;
/* go into the first sibling still alive */
do {
next = child->d_child.next;
if (next == &this_parent->d_subdirs)
goto ascend;
child = list_entry(next, struct dentry, d_child);
} while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
rcu_read_unlock();
goto resume;
}
if (need_seqretry(&rename_lock, seq))
goto rename_retry;
rcu_read_unlock();
out_unlock:
spin_unlock(&this_parent->d_lock);
done_seqretry(&rename_lock, seq);
return;
rename_retry:
spin_unlock(&this_parent->d_lock);
rcu_read_unlock();
BUG_ON(seq & 1); if (!retry)
return;
seq = 1;
goto again;
}
struct check_mount {
struct vfsmount *mnt;
unsigned int mounted;
};
static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
struct check_mount *info = data;
struct path path = { .mnt = info->mnt, .dentry = dentry };
if (likely(!d_mountpoint(dentry)))
return D_WALK_CONTINUE;
if (__path_is_mountpoint(&path)) {
info->mounted = 1;
return D_WALK_QUIT;
}
return D_WALK_CONTINUE;
}
/**
* path_has_submounts - check for mounts over a dentry in the
* current namespace.
* @parent: path to check.
*
* Return true if the parent or its subdirectories contain
* a mount point in the current namespace.
*/
int path_has_submounts(const struct path *parent)
{
struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };
read_seqlock_excl(&mount_lock);
d_walk(parent->dentry, &data, path_check_mount);
read_sequnlock_excl(&mount_lock);
return data.mounted;
}
EXPORT_SYMBOL(path_has_submounts);
/*
* Called by mount code to set a mountpoint and check if the mountpoint is
* reachable (e.g. NFS can unhash a directory dentry and then the complete
* subtree can become unreachable).
*
* Only one of d_invalidate() and d_set_mounted() must succeed. For
* this reason take rename_lock and d_lock on dentry and ancestors.
*/
int d_set_mounted(struct dentry *dentry)
{
struct dentry *p;
int ret = -ENOENT;
write_seqlock(&rename_lock);
for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
/* Need exclusion wrt. d_invalidate() */
spin_lock(&p->d_lock);
if (unlikely(d_unhashed(p))) {
spin_unlock(&p->d_lock);
goto out;
}
spin_unlock(&p->d_lock);
}
spin_lock(&dentry->d_lock);
if (!d_unlinked(dentry)) {
ret = -EBUSY;
if (!d_mountpoint(dentry)) { dentry->d_flags |= DCACHE_MOUNTED;
ret = 0;
}
}
spin_unlock(&dentry->d_lock);
out:
write_sequnlock(&rename_lock);
return ret;
}
/*
* Search the dentry child list of the specified parent,
* and move any unused dentries to the end of the unused
* list for prune_dcache(). We descend to the next level
* whenever the d_subdirs list is non-empty and continue
* searching.
*
* It returns zero iff there are no unused children,
* otherwise it returns the number of children moved to
* the end of the unused list. This may not be the total
* number of unused children, because select_parent can
* drop the lock and return early due to latency
* constraints.
*/
struct select_data {
struct dentry *start;
union {
long found;
struct dentry *victim;
};
struct list_head dispose;
};
static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
{
struct select_data *data = _data;
enum d_walk_ret ret = D_WALK_CONTINUE;
if (data->start == dentry)
goto out;
if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++;
} else {
if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); if (!dentry->d_lockref.count) {
d_shrink_add(dentry, &data->dispose);
data->found++;
}
}
/*
* We can return to the caller if we have found some (this
* ensures forward progress). We'll be coming back to find
* the rest.
*/
if (!list_empty(&data->dispose))
ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
out:
return ret;
}
static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
{
struct select_data *data = _data;
enum d_walk_ret ret = D_WALK_CONTINUE;
if (data->start == dentry)
goto out;
if (dentry->d_flags & DCACHE_SHRINK_LIST) {
if (!dentry->d_lockref.count) {
rcu_read_lock();
data->victim = dentry;
return D_WALK_QUIT;
}
} else {
if (dentry->d_flags & DCACHE_LRU_LIST)
d_lru_del(dentry);
if (!dentry->d_lockref.count)
d_shrink_add(dentry, &data->dispose);
}
/*
* We can return to the caller if we have found some (this
* ensures forward progress). We'll be coming back to find
* the rest.
*/
if (!list_empty(&data->dispose))
ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
out:
return ret;
}
/**
* shrink_dcache_parent - prune dcache
* @parent: parent of entries to prune
*
* Prune the dcache to remove unused children of the parent dentry.
*/
void shrink_dcache_parent(struct dentry *parent)
{
for (;;) {
struct select_data data = {.start = parent};
INIT_LIST_HEAD(&data.dispose);
d_walk(parent, &data, select_collect);
if (!list_empty(&data.dispose)) {
shrink_dentry_list(&data.dispose); continue;
}
cond_resched();
if (!data.found)
break;
data.victim = NULL;
d_walk(parent, &data, select_collect2);
if (data.victim) {
struct dentry *parent;
spin_lock(&data.victim->d_lock);
if (!shrink_lock_dentry(data.victim)) {
spin_unlock(&data.victim->d_lock);
rcu_read_unlock();
} else {
rcu_read_unlock();
parent = data.victim->d_parent;
if (parent != data.victim)
__dput_to_list(parent, &data.dispose); __dentry_kill(data.victim);
}
}
if (!list_empty(&data.dispose))
shrink_dentry_list(&data.dispose);
}
}
EXPORT_SYMBOL(shrink_dcache_parent);
static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
{
/* it has busy descendents; complain about those instead */
if (!list_empty(&dentry->d_subdirs))
return D_WALK_CONTINUE;
/* root with refcount 1 is fine */
if (dentry == _data && dentry->d_lockref.count == 1)
return D_WALK_CONTINUE;
printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
" still in use (%d) [unmount of %s %s]\n",
dentry,
dentry->d_inode ?
dentry->d_inode->i_ino : 0UL,
dentry,
dentry->d_lockref.count,
dentry->d_sb->s_type->name,
dentry->d_sb->s_id);
WARN_ON(1);
return D_WALK_CONTINUE;
}
static void do_one_tree(struct dentry *dentry)
{
shrink_dcache_parent(dentry);
d_walk(dentry, dentry, umount_check);
d_drop(dentry);
dput(dentry);
}
/*
* destroy the dentries attached to a superblock on unmounting
*/
void shrink_dcache_for_umount(struct super_block *sb)
{
struct dentry *dentry;
WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); dentry = sb->s_root;
sb->s_root = NULL;
do_one_tree(dentry);
while (!hlist_bl_empty(&sb->s_roots)) {
dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash)); do_one_tree(dentry);
}
}
static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
{
struct dentry **victim = _data;
if (d_mountpoint(dentry)) {
__dget_dlock(dentry);
*victim = dentry;
return D_WALK_QUIT;
}
return D_WALK_CONTINUE;
}
/**
* d_invalidate - detach submounts, prune dcache, and drop
* @dentry: dentry to invalidate (aka detach, prune and drop)
*/
void d_invalidate(struct dentry *dentry)
{
bool had_submounts = false;
spin_lock(&dentry->d_lock);
if (d_unhashed(dentry)) {
spin_unlock(&dentry->d_lock);
return;
}
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
/* Negative dentries can be dropped without further checks */
if (!dentry->d_inode)
return;
shrink_dcache_parent(dentry);
for (;;) {
struct dentry *victim = NULL;
d_walk(dentry, &victim, find_submount);
if (!victim) {
if (had_submounts) shrink_dcache_parent(dentry); return;
}
had_submounts = true;
detach_mounts(victim);
dput(victim);
}
}
EXPORT_SYMBOL(d_invalidate);
/**
* __d_alloc - allocate a dcache entry
* @sb: filesystem it will belong to
* @name: qstr of the name
*
* Allocates a dentry. It returns %NULL if there is insufficient memory
* available. On a success the dentry is returned. The name passed in is
* copied and the copy passed in may be reused after this call.
*/
static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
struct dentry *dentry;
char *dname;
int err;
dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
if (!dentry)
return NULL;
/*
* We guarantee that the inline name is always NUL-terminated.
* This way the memcpy() done by the name switching in rename
* will still always have a NUL at the end, even if we might
* be overwriting an internal NUL character
*/
dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
if (unlikely(!name)) {
name = &slash_name;
dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
struct external_name *p = kmalloc(size + name->len,
GFP_KERNEL_ACCOUNT |
__GFP_RECLAIMABLE);
if (!p) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
atomic_set(&p->u.count, 1);
dname = p->name;
} else {
dname = dentry->d_iname;
}
dentry->d_name.len = name->len;
dentry->d_name.hash = name->hash;
memcpy(dname, name->name, name->len);
dname[name->len] = 0;
/* Make sure we always see the terminating NUL character */
smp_store_release(&dentry->d_name.name, dname); /* ^^^ */
dentry->d_lockref.count = 1;
dentry->d_flags = 0;
spin_lock_init(&dentry->d_lock);
seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock);
dentry->d_inode = NULL;
dentry->d_parent = dentry;
dentry->d_sb = sb;
dentry->d_op = NULL;
dentry->d_fsdata = NULL;
INIT_HLIST_BL_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
INIT_LIST_HEAD(&dentry->d_subdirs);
INIT_HLIST_NODE(&dentry->d_u.d_alias);
INIT_LIST_HEAD(&dentry->d_child);
d_set_d_op(dentry, dentry->d_sb->s_d_op);
if (dentry->d_op && dentry->d_op->d_init) { err = dentry->d_op->d_init(dentry);
if (err) {
if (dname_external(dentry))
kfree(external_name(dentry));
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
}
this_cpu_inc(nr_dentry); return dentry;
}
/**
* d_alloc - allocate a dcache entry
* @parent: parent of entry to allocate
* @name: qstr of the name
*
* Allocates a dentry. It returns %NULL if there is insufficient memory
* available. On a success the dentry is returned. The name passed in is
* copied and the copy passed in may be reused after this call.
*/
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
struct dentry *dentry = __d_alloc(parent->d_sb, name);
if (!dentry)
return NULL;
spin_lock(&parent->d_lock);
/*
* don't need child lock because it is not subject
* to concurrency here
*/
__dget_dlock(parent);
dentry->d_parent = parent;
list_add(&dentry->d_child, &parent->d_subdirs);
spin_unlock(&parent->d_lock);
return dentry;
}
EXPORT_SYMBOL(d_alloc);
struct dentry *d_alloc_anon(struct super_block *sb)
{
return __d_alloc(sb, NULL);
}
EXPORT_SYMBOL(d_alloc_anon);
struct dentry *d_alloc_cursor(struct dentry * parent)
{
struct dentry *dentry = d_alloc_anon(parent->d_sb);
if (dentry) {
dentry->d_flags |= DCACHE_DENTRY_CURSOR;
dentry->d_parent = dget(parent);
}
return dentry;
}
/**
* d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
* @sb: the superblock
* @name: qstr of the name
*
* For a filesystem that just pins its dentries in memory and never
* performs lookups at all, return an unhashed IS_ROOT dentry.
* This is used for pipes, sockets et.al. - the stuff that should
* never be anyone's children or parents. Unlike all other
* dentries, these will not have RCU delay between dropping the
* last reference and freeing them.
*
* The only user is alloc_file_pseudo() and that's what should
* be considered a public interface. Don't use directly.
*/
struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
{
struct dentry *dentry = __d_alloc(sb, name);
if (likely(dentry))
dentry->d_flags |= DCACHE_NORCU; return dentry;
}
struct dentry *d_alloc_name(struct dentry *parent, const char *name)
{
struct qstr q;
q.name = name;
q.hash_len = hashlen_string(parent, name);
return d_alloc(parent, &q);
}
EXPORT_SYMBOL(d_alloc_name);
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
{
WARN_ON_ONCE(dentry->d_op); WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
DCACHE_OP_COMPARE |
DCACHE_OP_REVALIDATE |
DCACHE_OP_WEAK_REVALIDATE |
DCACHE_OP_DELETE |
DCACHE_OP_REAL));
dentry->d_op = op;
if (!op)
return;
if (op->d_hash) dentry->d_flags |= DCACHE_OP_HASH; if (op->d_compare) dentry->d_flags |= DCACHE_OP_COMPARE; if (op->d_revalidate) dentry->d_flags |= DCACHE_OP_REVALIDATE; if (op->d_weak_revalidate) dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; if (op->d_real) dentry->d_flags |= DCACHE_OP_REAL;
}
EXPORT_SYMBOL(d_set_d_op);
/*
* d_set_fallthru - Mark a dentry as falling through to a lower layer
* @dentry - The dentry to mark
*
* Mark a dentry as falling through to the lower layer (as set with
* d_pin_lower()). This flag may be recorded on the medium.
*/
void d_set_fallthru(struct dentry *dentry)
{
spin_lock(&dentry->d_lock);
dentry->d_flags |= DCACHE_FALLTHRU;
spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(d_set_fallthru);
static unsigned d_flags_for_inode(struct inode *inode)
{
unsigned add_flags = DCACHE_REGULAR_TYPE;
if (!inode)
return DCACHE_MISS_TYPE;
if (S_ISDIR(inode->i_mode)) {
add_flags = DCACHE_DIRECTORY_TYPE;
if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { if (unlikely(!inode->i_op->lookup))
add_flags = DCACHE_AUTODIR_TYPE;
else
inode->i_opflags |= IOP_LOOKUP;
}
goto type_determined;
}
if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { if (unlikely(inode->i_op->get_link)) {
add_flags = DCACHE_SYMLINK_TYPE;
goto type_determined;
}
inode->i_opflags |= IOP_NOFOLLOW;
}
if (unlikely(!S_ISREG(inode->i_mode)))
add_flags = DCACHE_SPECIAL_TYPE;
type_determined:
if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT;
return add_flags;
}
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
unsigned add_flags = d_flags_for_inode(inode); WARN_ON(d_in_lookup(dentry));
spin_lock(&dentry->d_lock);
/*
* Decrement negative dentry count if it was in the LRU list.
*/
if (dentry->d_flags & DCACHE_LRU_LIST)
this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
raw_write_seqcount_end(&dentry->d_seq);
fsnotify_update_flags(dentry);
spin_unlock(&dentry->d_lock);
}
/**
* d_instantiate - fill in inode information for a dentry
* @entry: dentry to complete
* @inode: inode to attach to this dentry
*
* Fill in inode information in the entry.
*
* This turns negative dentries into productive full members
* of society.
*
* NOTE! This assumes that the inode count has been incremented
* (or otherwise set) by the caller to indicate that it is now
* in use by the dcache.
*/
void d_instantiate(struct dentry *entry, struct inode * inode)
{
BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) { security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
__d_instantiate(entry, inode);
spin_unlock(&inode->i_lock);
}
}
EXPORT_SYMBOL(d_instantiate);
/*
* This should be equivalent to d_instantiate() + unlock_new_inode(),
* with lockdep-related part of unlock_new_inode() done before
* anything else. Use that instead of open-coding d_instantiate()/
* unlock_new_inode() combinations.
*/
void d_instantiate_new(struct dentry *entry, struct inode *inode)
{
BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); BUG_ON(!inode);
lockdep_annotate_inode_mutex_key(inode);
security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
__d_instantiate(entry, inode);
WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING;
smp_mb();
wake_up_bit(&inode->i_state, __I_NEW);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate_new);
struct dentry *d_make_root(struct inode *root_inode)
{
struct dentry *res = NULL;
if (root_inode) { res = d_alloc_anon(root_inode->i_sb);
if (res)
d_instantiate(res, root_inode);
else
iput(root_inode);
}
return res;
}
EXPORT_SYMBOL(d_make_root);
static struct dentry *__d_instantiate_anon(struct dentry *dentry,
struct inode *inode,
bool disconnected)
{
struct dentry *res;
unsigned add_flags;
security_d_instantiate(dentry, inode);
spin_lock(&inode->i_lock);
res = __d_find_any_alias(inode);
if (res) {
spin_unlock(&inode->i_lock);
dput(dentry);
goto out_iput;
}
/* attach a disconnected dentry */
add_flags = d_flags_for_inode(inode);
if (disconnected)
add_flags |= DCACHE_DISCONNECTED;
spin_lock(&dentry->d_lock);
__d_set_inode_and_type(dentry, inode, add_flags);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
if (!disconnected) {
hlist_bl_lock(&dentry->d_sb->s_roots);
hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots);
hlist_bl_unlock(&dentry->d_sb->s_roots);
}
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
return dentry;
out_iput:
iput(inode);
return res;
}
struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode)
{
return __d_instantiate_anon(dentry, inode, true);
}
EXPORT_SYMBOL(d_instantiate_anon);
static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
{
struct dentry *tmp;
struct dentry *res;
if (!inode)
return ERR_PTR(-ESTALE);
if (IS_ERR(inode))
return ERR_CAST(inode);
res = d_find_any_alias(inode);
if (res)
goto out_iput;
tmp = d_alloc_anon(inode->i_sb);
if (!tmp) {
res = ERR_PTR(-ENOMEM);
goto out_iput;
}
return __d_instantiate_anon(tmp, inode, disconnected);
out_iput:
iput(inode);
return res;
}
/**
* d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
* @inode: inode to allocate the dentry for
*
* Obtain a dentry for an inode resulting from NFS filehandle conversion or
* similar open by handle operations. The returned dentry may be anonymous,
* or may have a full name (if the inode was already in the cache).
*
* When called on a directory inode, we must ensure that the inode only ever
* has one dentry. If a dentry is found, that is returned instead of
* allocating a new one.
*
* On successful return, the reference to the inode has been transferred
* to the dentry. In case of an error the reference on the inode is released.
* To make it easier to use in export operations a %NULL or IS_ERR inode may
* be passed in and the error will be propagated to the return value,
* with a %NULL @inode replaced by ERR_PTR(-ESTALE).
*/
struct dentry *d_obtain_alias(struct inode *inode)
{
return __d_obtain_alias(inode, true);
}
EXPORT_SYMBOL(d_obtain_alias);
/**
* d_obtain_root - find or allocate a dentry for a given inode
* @inode: inode to allocate the dentry for
*
* Obtain an IS_ROOT dentry for the root of a filesystem.
*
* We must ensure that directory inodes only ever have one dentry. If a
* dentry is found, that is returned instead of allocating a new one.
*
* On successful return, the reference to the inode has been transferred
* to the dentry. In case of an error the reference on the inode is
* released. A %NULL or IS_ERR inode may be passed in and will be the
* error will be propagate to the return value, with a %NULL @inode
* replaced by ERR_PTR(-ESTALE).
*/
struct dentry *d_obtain_root(struct inode *inode)
{
return __d_obtain_alias(inode, false);
}
EXPORT_SYMBOL(d_obtain_root);
/**
* d_add_ci - lookup or allocate new dentry with case-exact name
* @inode: the inode case-insensitive lookup has found
* @dentry: the negative dentry that was passed to the parent's lookup func
* @name: the case-exact name to be associated with the returned dentry
*
* This is to avoid filling the dcache with case-insensitive names to the
* same inode, only the actual correct case is stored in the dcache for
* case-insensitive filesystems.
*
* For a case-insensitive lookup match and if the case-exact dentry
* already exists in the dcache, use it and return it.
*
* If no entry exists with the exact case name, allocate new dentry with
* the exact case, and return the spliced entry.
*/
struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
struct qstr *name)
{
struct dentry *found, *res;
/*
* First check if a dentry matching the name already exists,
* if not go ahead and create it now.
*/
found = d_hash_and_lookup(dentry->d_parent, name);
if (found) {
iput(inode);
return found;
}
if (d_in_lookup(dentry)) {
found = d_alloc_parallel(dentry->d_parent, name,
dentry->d_wait);
if (IS_ERR(found) || !d_in_lookup(found)) {
iput(inode);
return found;
}
} else {
found = d_alloc(dentry->d_parent, name);
if (!found) {
iput(inode);
return ERR_PTR(-ENOMEM);
}
}
res = d_splice_alias(inode, found);
if (res) {
dput(found);
return res;
}
return found;
}
EXPORT_SYMBOL(d_add_ci);
static inline bool d_same_name(const struct dentry *dentry,
const struct dentry *parent,
const struct qstr *name)
{
if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) {
if (dentry->d_name.len != name->len)
return false;
return dentry_cmp(dentry, name->name, name->len) == 0;
}
return parent->d_op->d_compare(dentry,
dentry->d_name.len, dentry->d_name.name,
name) == 0;
}
/**
* __d_lookup_rcu - search for a dentry (racy, store-free)
* @parent: parent dentry
* @name: qstr of name we wish to find
* @seqp: returns d_seq value at the point where the dentry was found
* Returns: dentry, or NULL
*
* __d_lookup_rcu is the dcache lookup function for rcu-walk name
* resolution (store-free path walking) design described in
* Documentation/filesystems/path-lookup.txt.
*
* This is not to be used outside core vfs.
*
* __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
* held, and rcu_read_lock held. The returned dentry must not be stored into
* without taking d_lock and checking d_seq sequence count against @seq
* returned here.
*
* A refcount may be taken on the found dentry with the d_rcu_to_refcount
* function.
*
* Alternatively, __d_lookup_rcu may be called again to look up the child of
* the returned dentry, so long as its parent's seqlock is checked after the
* child is looked up. Thus, an interlocking stepping of sequence lock checks
* is formed, giving integrity down the path walk.
*
* NOTE! The caller *has* to check the resulting dentry against the sequence
* number we've returned before using any of the resulting dentry state!
*/
struct dentry *__d_lookup_rcu(const struct dentry *parent,
const struct qstr *name,
unsigned *seqp)
{
u64 hashlen = name->hash_len;
const unsigned char *str = name->name;
struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
struct hlist_bl_node *node;
struct dentry *dentry;
/*
* Note: There is significant duplication with __d_lookup_rcu which is
* required to prevent single threaded performance regressions
* especially on architectures where smp_rmb (in seqcounts) are costly.
* Keep the two functions in sync.
*/
/*
* The hash list is protected using RCU.
*
* Carefully use d_seq when comparing a candidate dentry, to avoid
* races with d_move().
*
* It is possible that concurrent renames can mess up our list
* walk here and result in missing our dentry, resulting in the
* false-negative result. d_lookup() protects against concurrent
* renames using rename_lock seqlock.
*
* See Documentation/filesystems/path-lookup.txt for more details.
*/
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
unsigned seq;
seqretry:
/*
* The dentry sequence count protects us from concurrent
* renames, and thus protects parent and name fields.
*
* The caller must perform a seqcount check in order
* to do anything useful with the returned dentry.
*
* NOTE! We do a "raw" seqcount_begin here. That means that
* we don't wait for the sequence count to stabilize if it
* is in the middle of a sequence change. If we do the slow
* dentry compare, we will do seqretries until it is stable,
* and if we end up with a successful lookup, we actually
* want to exit RCU lookup anyway.
*
* Note that raw_seqcount_begin still *does* smp_rmb(), so
* we are still guaranteed NUL-termination of ->d_name.name.
*/
seq = raw_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
continue;
if (d_unhashed(dentry))
continue;
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
int tlen;
const char *tname;
if (dentry->d_name.hash != hashlen_hash(hashlen))
continue;
tlen = dentry->d_name.len;
tname = dentry->d_name.name;
/* we want a consistent (name,len) pair */
if (read_seqcount_retry(&dentry->d_seq, seq)) {
cpu_relax();
goto seqretry;
}
if (parent->d_op->d_compare(dentry,
tlen, tname, name) != 0)
continue;
} else {
if (dentry->d_name.hash_len != hashlen)
continue;
if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
continue;
}
*seqp = seq; return dentry;
}
return NULL;
}
/**
* d_lookup - search for a dentry
* @parent: parent dentry
* @name: qstr of name we wish to find
* Returns: dentry, or NULL
*
* d_lookup searches the children of the parent dentry for the name in
* question. If the dentry is found its reference count is incremented and the
* dentry is returned. The caller must use dput to free the entry when it has
* finished using it. %NULL is returned if the dentry does not exist.
*/
struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
{
struct dentry *dentry;
unsigned seq;
do {
seq = read_seqbegin(&rename_lock);
dentry = __d_lookup(parent, name);
if (dentry)
break;
} while (read_seqretry(&rename_lock, seq));
return dentry;
}
EXPORT_SYMBOL(d_lookup);
/**
* __d_lookup - search for a dentry (racy)
* @parent: parent dentry
* @name: qstr of name we wish to find
* Returns: dentry, or NULL
*
* __d_lookup is like d_lookup, however it may (rarely) return a
* false-negative result due to unrelated rename activity.
*
* __d_lookup is slightly faster by avoiding rename_lock read seqlock,
* however it must be used carefully, eg. with a following d_lookup in
* the case of failure.
*
* __d_lookup callers must be commented.
*/
struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
{
unsigned int hash = name->hash;
struct hlist_bl_head *b = d_hash(hash);
struct hlist_bl_node *node;
struct dentry *found = NULL;
struct dentry *dentry;
/*
* Note: There is significant duplication with __d_lookup_rcu which is
* required to prevent single threaded performance regressions
* especially on architectures where smp_rmb (in seqcounts) are costly.
* Keep the two functions in sync.
*/
/*
* The hash list is protected using RCU.
*
* Take d_lock when comparing a candidate dentry, to avoid races
* with d_move().
*
* It is possible that concurrent renames can mess up our list
* walk here and result in missing our dentry, resulting in the
* false-negative result. d_lookup() protects against concurrent
* renames using rename_lock seqlock.
*
* See Documentation/filesystems/path-lookup.txt for more details.
*/
rcu_read_lock();
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { if (dentry->d_name.hash != hash)
continue;
spin_lock(&dentry->d_lock);
if (dentry->d_parent != parent)
goto next;
if (d_unhashed(dentry))
goto next;
if (!d_same_name(dentry, parent, name))
goto next;
dentry->d_lockref.count++;
found = dentry;
spin_unlock(&dentry->d_lock);
break;
next:
spin_unlock(&dentry->d_lock);
}
rcu_read_unlock();
return found;
}
/**
* d_hash_and_lookup - hash the qstr then search for a dentry
* @dir: Directory to search in
* @name: qstr of name we wish to find
*
* On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
*/
struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
{
/*
* Check for a fs-specific hash function. Note that we must
* calculate the standard hash first, as the d_op->d_hash()
* routine may choose to leave the hash value unchanged.
*/
name->hash = full_name_hash(dir, name->name, name->len);
if (dir->d_flags & DCACHE_OP_HASH) {
int err = dir->d_op->d_hash(dir, name);
if (unlikely(err < 0))
return ERR_PTR(err);
}
return d_lookup(dir, name);
}
EXPORT_SYMBOL(d_hash_and_lookup);
/*
* When a file is deleted, we have two options:
* - turn this dentry into a negative dentry
* - unhash this dentry and free it.
*
* Usually, we want to just turn this into
* a negative dentry, but if anybody else is
* currently using the dentry or the inode
* we can't do that and we fall back on removing
* it from the hash queues and waiting for
* it to be deleted later when it has no users
*/
/**
* d_delete - delete a dentry
* @dentry: The dentry to delete
*
* Turn the dentry into a negative dentry if possible, otherwise
* remove it from the hash queues so it can be deleted later
*/
void d_delete(struct dentry * dentry)
{
struct inode *inode = dentry->d_inode;
spin_lock(&inode->i_lock);
spin_lock(&dentry->d_lock);
/*
* Are we the only user?
*/
if (dentry->d_lockref.count == 1) {
dentry->d_flags &= ~DCACHE_CANT_MOUNT;
dentry_unlink_inode(dentry);
} else {
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
}
}
EXPORT_SYMBOL(d_delete);
static void __d_rehash(struct dentry *entry)
{
struct hlist_bl_head *b = d_hash(entry->d_name.hash);
hlist_bl_lock(b);
hlist_bl_add_head_rcu(&entry->d_hash, b);
hlist_bl_unlock(b);
}
/**
* d_rehash - add an entry back to the hash
* @entry: dentry to add to the hash
*
* Adds a dentry to the hash according to its name.
*/
void d_rehash(struct dentry * entry)
{
spin_lock(&entry->d_lock);
__d_rehash(entry);
spin_unlock(&entry->d_lock);
}
EXPORT_SYMBOL(d_rehash);
static inline unsigned start_dir_add(struct inode *dir)
{
for (;;) {
unsigned n = dir->i_dir_seq;
if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
return n;
cpu_relax();
}
}
static inline void end_dir_add(struct inode *dir, unsigned n)
{
smp_store_release(&dir->i_dir_seq, n + 2);
}
static void d_wait_lookup(struct dentry *dentry)
{
if (d_in_lookup(dentry)) {
DECLARE_WAITQUEUE(wait, current);
add_wait_queue(dentry->d_wait, &wait);
do {
set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(&dentry->d_lock);
schedule();
spin_lock(&dentry->d_lock);
} while (d_in_lookup(dentry));
}
}
struct dentry *d_alloc_parallel(struct dentry *parent,
const struct qstr *name,
wait_queue_head_t *wq)
{
unsigned int hash = name->hash;
struct hlist_bl_head *b = in_lookup_hash(parent, hash);
struct hlist_bl_node *node;
struct dentry *new = d_alloc(parent, name);
struct dentry *dentry;
unsigned seq, r_seq, d_seq;
if (unlikely(!new))
return ERR_PTR(-ENOMEM);
retry:
rcu_read_lock();
seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
r_seq = read_seqbegin(&rename_lock);
dentry = __d_lookup_rcu(parent, name, &d_seq);
if (unlikely(dentry)) {
if (!lockref_get_not_dead(&dentry->d_lockref)) {
rcu_read_unlock();
goto retry;
}
if (read_seqcount_retry(&dentry->d_seq, d_seq)) {
rcu_read_unlock();
dput(dentry);
goto retry;
}
rcu_read_unlock();
dput(new);
return dentry;
}
if (unlikely(read_seqretry(&rename_lock, r_seq))) {
rcu_read_unlock();
goto retry;
}
if (unlikely(seq & 1)) {
rcu_read_unlock();
goto retry;
}
hlist_bl_lock(b);
if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
hlist_bl_unlock(b);
rcu_read_unlock();
goto retry;
}
/*
* No changes for the parent since the beginning of d_lookup().
* Since all removals from the chain happen with hlist_bl_lock(),
* any potential in-lookup matches are going to stay here until
* we unlock the chain. All fields are stable in everything
* we encounter.
*/
hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { if (dentry->d_name.hash != hash)
continue;
if (dentry->d_parent != parent)
continue;
if (!d_same_name(dentry, parent, name))
continue;
hlist_bl_unlock(b);
/* now we can try to grab a reference */
if (!lockref_get_not_dead(&dentry->d_lockref)) {
rcu_read_unlock();
goto retry;
}
rcu_read_unlock();
/*
* somebody is likely to be still doing lookup for it;
* wait for them to finish
*/
spin_lock(&dentry->d_lock);
d_wait_lookup(dentry);
/*
* it's not in-lookup anymore; in principle we should repeat
* everything from dcache lookup, but it's likely to be what
* d_lookup() would've found anyway. If it is, just return it;
* otherwise we really have to repeat the whole thing.
*/
if (unlikely(dentry->d_name.hash != hash))
goto mismatch;
if (unlikely(dentry->d_parent != parent))
goto mismatch;
if (unlikely(d_unhashed(dentry)))
goto mismatch;
if (unlikely(!d_same_name(dentry, parent, name)))
goto mismatch;
/* OK, it *is* a hashed match; return it */
spin_unlock(&dentry->d_lock);
dput(new);
return dentry;
}
rcu_read_unlock();
/* we can't take ->d_lock here; it's OK, though. */
new->d_flags |= DCACHE_PAR_LOOKUP;
new->d_wait = wq;
hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b);
hlist_bl_unlock(b);
return new;
mismatch:
spin_unlock(&dentry->d_lock);
dput(dentry);
goto retry;
}
EXPORT_SYMBOL(d_alloc_parallel);
void __d_lookup_done(struct dentry *dentry)
{
struct hlist_bl_head *b = in_lookup_hash(dentry->d_parent,
dentry->d_name.hash);
hlist_bl_lock(b);
dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
__hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
wake_up_all(dentry->d_wait);
dentry->d_wait = NULL;
hlist_bl_unlock(b);
INIT_HLIST_NODE(&dentry->d_u.d_alias);
INIT_LIST_HEAD(&dentry->d_lru);
}
EXPORT_SYMBOL(__d_lookup_done);
/* inode->i_lock held if inode is non-NULL */
static inline void __d_add(struct dentry *dentry, struct inode *inode)
{
struct inode *dir = NULL;
unsigned n;
spin_lock(&dentry->d_lock);
if (unlikely(d_in_lookup(dentry))) {
dir = dentry->d_parent->d_inode;
n = start_dir_add(dir);
__d_lookup_done(dentry);
}
if (inode) {
unsigned add_flags = d_flags_for_inode(inode);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
raw_write_seqcount_end(&dentry->d_seq);
fsnotify_update_flags(dentry);
}
__d_rehash(dentry);
if (dir)
end_dir_add(dir, n);
spin_unlock(&dentry->d_lock);
if (inode)
spin_unlock(&inode->i_lock);
}
/**
* d_add - add dentry to hash queues
* @entry: dentry to add
* @inode: The inode to attach to this dentry
*
* This adds the entry to the hash queues and initializes @inode.
* The entry was actually filled in earlier during d_alloc().
*/
void d_add(struct dentry *entry, struct inode *inode)
{
if (inode) {
security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
}
__d_add(entry, inode);
}
EXPORT_SYMBOL(d_add);
/**
* d_exact_alias - find and hash an exact unhashed alias
* @entry: dentry to add
* @inode: The inode to go with this dentry
*
* If an unhashed dentry with the same name/parent and desired
* inode already exists, hash and return it. Otherwise, return
* NULL.
*
* Parent directory should be locked.
*/
struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
{
struct dentry *alias;
unsigned int hash = entry->d_name.hash;
spin_lock(&inode->i_lock);
hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
/*
* Don't need alias->d_lock here, because aliases with
* d_parent == entry->d_parent are not subject to name or
* parent changes, because the parent inode i_mutex is held.
*/
if (alias->d_name.hash != hash)
continue;
if (alias->d_parent != entry->d_parent)
continue;
if (!d_same_name(alias, entry->d_parent, &entry->d_name))
continue;
spin_lock(&alias->d_lock);
if (!d_unhashed(alias)) {
spin_unlock(&alias->d_lock);
alias = NULL;
} else {
__dget_dlock(alias);
__d_rehash(alias);
spin_unlock(&alias->d_lock);
}
spin_unlock(&inode->i_lock);
return alias;
}
spin_unlock(&inode->i_lock);
return NULL;
}
EXPORT_SYMBOL(d_exact_alias);
static void swap_names(struct dentry *dentry, struct dentry *target)
{
if (unlikely(dname_external(target))) {
if (unlikely(dname_external(dentry))) {
/*
* Both external: swap the pointers
*/
swap(target->d_name.name, dentry->d_name.name);
} else {
/*
* dentry:internal, target:external. Steal target's
* storage and make target internal.
*/
memcpy(target->d_iname, dentry->d_name.name,
dentry->d_name.len + 1);
dentry->d_name.name = target->d_name.name;
target->d_name.name = target->d_iname;
}
} else {
if (unlikely(dname_external(dentry))) {
/*
* dentry:external, target:internal. Give dentry's
* storage to target and make dentry internal
*/
memcpy(dentry->d_iname, target->d_name.name,
target->d_name.len + 1);
target->d_name.name = dentry->d_name.name;
dentry->d_name.name = dentry->d_iname;
} else {
/*
* Both are internal.
*/
unsigned int i;
BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
swap(((long *) &dentry->d_iname)[i],
((long *) &target->d_iname)[i]);
}
}
}
swap(dentry->d_name.hash_len, target->d_name.hash_len);
}
static void copy_name(struct dentry *dentry, struct dentry *target)
{
struct external_name *old_name = NULL;
if (unlikely(dname_external(dentry)))
old_name = external_name(dentry);
if (unlikely(dname_external(target))) {
atomic_inc(&external_name(target)->u.count);
dentry->d_name = target->d_name;
} else {
memcpy(dentry->d_iname, target->d_name.name,
target->d_name.len + 1);
dentry->d_name.name = dentry->d_iname;
dentry->d_name.hash_len = target->d_name.hash_len;
}
if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) kfree_rcu(old_name, u.head);
}
/*
* __d_move - move a dentry
* @dentry: entry to move
* @target: new dentry
* @exchange: exchange the two dentries
*
* Update the dcache to reflect the move of a file name. Negative
* dcache entries should not be moved in this way. Caller must hold
* rename_lock, the i_mutex of the source and target directories,
* and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
*/
static void __d_move(struct dentry *dentry, struct dentry *target,
bool exchange)
{
struct dentry *old_parent, *p;
struct inode *dir = NULL;
unsigned n;
WARN_ON(!dentry->d_inode); if (WARN_ON(dentry == target))
return;
BUG_ON(d_ancestor(target, dentry)); old_parent = dentry->d_parent;
p = d_ancestor(old_parent, target);
if (IS_ROOT(dentry)) { BUG_ON(p); spin_lock(&target->d_parent->d_lock); } else if (!p) {
/* target is not a descendent of dentry->d_parent */
spin_lock(&target->d_parent->d_lock);
spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED);
} else {
BUG_ON(p == dentry);
spin_lock(&old_parent->d_lock);
if (p != target)
spin_lock_nested(&target->d_parent->d_lock,
DENTRY_D_LOCK_NESTED);
}
spin_lock_nested(&dentry->d_lock, 2);
spin_lock_nested(&target->d_lock, 3);
if (unlikely(d_in_lookup(target))) {
dir = target->d_parent->d_inode;
n = start_dir_add(dir);
__d_lookup_done(target);
}
write_seqcount_begin(&dentry->d_seq);
write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
/* unhash both */
if (!d_unhashed(dentry))
___d_drop(dentry);
if (!d_unhashed(target))
___d_drop(target);
/* ... and switch them in the tree */
dentry->d_parent = target->d_parent;
if (!exchange) {
copy_name(dentry, target);
target->d_hash.pprev = NULL;
dentry->d_parent->d_lockref.count++;
if (dentry != old_parent) /* wasn't IS_ROOT */
WARN_ON(!--old_parent->d_lockref.count);
} else {
target->d_parent = old_parent;
swap_names(dentry, target);
list_move(&target->d_child, &target->d_parent->d_subdirs);
__d_rehash(target);
fsnotify_update_flags(target);
}
list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
__d_rehash(dentry);
fsnotify_update_flags(dentry);
fscrypt_handle_d_move(dentry);
write_seqcount_end(&target->d_seq);
write_seqcount_end(&dentry->d_seq);
if (dir)
end_dir_add(dir, n);
if (dentry->d_parent != old_parent)
spin_unlock(&dentry->d_parent->d_lock);
if (dentry != old_parent)
spin_unlock(&old_parent->d_lock);
spin_unlock(&target->d_lock);
spin_unlock(&dentry->d_lock);
}
/*
* d_move - move a dentry
* @dentry: entry to move
* @target: new dentry
*
* Update the dcache to reflect the move of a file name. Negative
* dcache entries should not be moved in this way. See the locking
* requirements for __d_move.
*/
void d_move(struct dentry *dentry, struct dentry *target)
{
write_seqlock(&rename_lock);
__d_move(dentry, target, false);
write_sequnlock(&rename_lock);
}
EXPORT_SYMBOL(d_move);
/*
* d_exchange - exchange two dentries
* @dentry1: first dentry
* @dentry2: second dentry
*/
void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
{
write_seqlock(&rename_lock);
WARN_ON(!dentry1->d_inode);
WARN_ON(!dentry2->d_inode);
WARN_ON(IS_ROOT(dentry1));
WARN_ON(IS_ROOT(dentry2));
__d_move(dentry1, dentry2, true);
write_sequnlock(&rename_lock);
}
/**
* d_ancestor - search for an ancestor
* @p1: ancestor dentry
* @p2: child dentry
*
* Returns the ancestor dentry of p2 which is a child of p1, if p1 is
* an ancestor of p2, else NULL.
*/
struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
{
struct dentry *p;
for (p = p2; !IS_ROOT(p); p = p->d_parent) { if (p->d_parent == p1)
return p;
}
return NULL;
}
/*
* This helper attempts to cope with remotely renamed directories
*
* It assumes that the caller is already holding
* dentry->d_parent->d_inode->i_mutex, and rename_lock
*
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
*/
static int __d_unalias(struct inode *inode,
struct dentry *dentry, struct dentry *alias)
{
struct mutex *m1 = NULL;
struct rw_semaphore *m2 = NULL;
int ret = -ESTALE;
/* If alias and dentry share a parent, then no extra locks required */
if (alias->d_parent == dentry->d_parent)
goto out_unalias;
/* See lock_rename() */
if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
goto out_err;
m1 = &dentry->d_sb->s_vfs_rename_mutex;
if (!inode_trylock_shared(alias->d_parent->d_inode))
goto out_err;
m2 = &alias->d_parent->d_inode->i_rwsem;
out_unalias:
__d_move(alias, dentry, false);
ret = 0;
out_err:
if (m2)
up_read(m2); if (m1) mutex_unlock(m1);
return ret;
}
/**
* d_splice_alias - splice a disconnected dentry into the tree if one exists
* @inode: the inode which may have a disconnected dentry
* @dentry: a negative dentry which we want to point to the inode.
*
* If inode is a directory and has an IS_ROOT alias, then d_move that in
* place of the given dentry and return it, else simply d_add the inode
* to the dentry and return NULL.
*
* If a non-IS_ROOT directory is found, the filesystem is corrupt, and
* we should error out: directories can't have multiple aliases.
*
* This is needed in the lookup routine of any filesystem that is exportable
* (via knfsd) so that we can build dcache paths to directories effectively.
*
* If a dentry was found and moved, then it is returned. Otherwise NULL
* is returned. This matches the expected return value of ->lookup.
*
* Cluster filesystems may call this function with a negative, hashed dentry.
* In that case, we know that the inode will be a regular file, and also this
* will only occur during atomic_open. So we need to check for the dentry
* being already hashed only in the final case.
*/
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
if (IS_ERR(inode))
return ERR_CAST(inode);
BUG_ON(!d_unhashed(dentry)); if (!inode)
goto out;
security_d_instantiate(dentry, inode);
spin_lock(&inode->i_lock);
if (S_ISDIR(inode->i_mode)) {
struct dentry *new = __d_find_any_alias(inode);
if (unlikely(new)) {
/* The reference to new ensures it remains an alias */
spin_unlock(&inode->i_lock);
write_seqlock(&rename_lock);
if (unlikely(d_ancestor(new, dentry))) {
write_sequnlock(&rename_lock);
dput(new);
new = ERR_PTR(-ELOOP);
pr_warn_ratelimited(
"VFS: Lookup of '%s' in %s %s"
" would have caused loop\n",
dentry->d_name.name,
inode->i_sb->s_type->name,
inode->i_sb->s_id);
} else if (!IS_ROOT(new)) {
struct dentry *old_parent = dget(new->d_parent);
int err = __d_unalias(inode, dentry, new);
write_sequnlock(&rename_lock);
if (err) {
dput(new);
new = ERR_PTR(err);
}
dput(old_parent);
} else {
__d_move(new, dentry, false);
write_sequnlock(&rename_lock);
}
iput(inode);
return new;
}
}
out:
__d_add(dentry, inode);
return NULL;
}
EXPORT_SYMBOL(d_splice_alias);
/*
* Test whether new_dentry is a subdirectory of old_dentry.
*
* Trivially implemented using the dcache structure
*/
/**
* is_subdir - is new dentry a subdirectory of old_dentry
* @new_dentry: new dentry
* @old_dentry: old dentry
*
* Returns true if new_dentry is a subdirectory of the parent (at any depth).
* Returns false otherwise.
* Caller must ensure that "new_dentry" is pinned before calling is_subdir()
*/
bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
bool result;
unsigned seq;
if (new_dentry == old_dentry)
return true;
do {
/* for restarting inner loop in case of seq retry */
seq = read_seqbegin(&rename_lock);
/*
* Need rcu_readlock to protect against the d_parent trashing
* due to d_move
*/
rcu_read_lock();
if (d_ancestor(old_dentry, new_dentry))
result = true;
else
result = false;
rcu_read_unlock();
} while (read_seqretry(&rename_lock, seq));
return result;
}
EXPORT_SYMBOL(is_subdir);
static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
{
struct dentry *root = data;
if (dentry != root) {
if (d_unhashed(dentry) || !dentry->d_inode)
return D_WALK_SKIP;
if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
dentry->d_flags |= DCACHE_GENOCIDE;
dentry->d_lockref.count--;
}
}
return D_WALK_CONTINUE;
}
void d_genocide(struct dentry *parent)
{
d_walk(parent, parent, d_genocide_kill);
}
EXPORT_SYMBOL(d_genocide);
void d_tmpfile(struct dentry *dentry, struct inode *inode)
{
inode_dec_link_count(inode);
BUG_ON(dentry->d_name.name != dentry->d_iname ||
!hlist_unhashed(&dentry->d_u.d_alias) ||
!d_unlinked(dentry));
spin_lock(&dentry->d_parent->d_lock);
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
(unsigned long long)inode->i_ino);
spin_unlock(&dentry->d_lock);
spin_unlock(&dentry->d_parent->d_lock);
d_instantiate(dentry, inode);
}
EXPORT_SYMBOL(d_tmpfile);
static __initdata unsigned long dhash_entries;
static int __init set_dhash_entries(char *str)
{
if (!str)
return 0;
dhash_entries = simple_strtoul(str, &str, 0);
return 1;
}
__setup("dhash_entries=", set_dhash_entries);
static void __init dcache_init_early(void)
{
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
*/
if (hashdist)
return;
dentry_hashtable =
alloc_large_system_hash("Dentry cache",
sizeof(struct hlist_bl_head),
dhash_entries,
13,
HASH_EARLY | HASH_ZERO,
&d_hash_shift,
NULL,
0,
0);
d_hash_shift = 32 - d_hash_shift;
}
static void __init dcache_init(void)
{
/*
* A constructor could be added for stable state like the lists,
* but it is probably not worth it because of the cache nature
* of the dcache.
*/
dentry_cache = KMEM_CACHE_USERCOPY(dentry,
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
d_iname);
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
return;
dentry_hashtable =
alloc_large_system_hash("Dentry cache",
sizeof(struct hlist_bl_head),
dhash_entries,
13,
HASH_ZERO,
&d_hash_shift,
NULL,
0,
0);
d_hash_shift = 32 - d_hash_shift;
}
/* SLAB cache for __getname() consumers */
struct kmem_cache *names_cachep __read_mostly;
EXPORT_SYMBOL(names_cachep);
void __init vfs_caches_init_early(void)
{
int i;
for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
dcache_init_early();
inode_init_early();
}
void __init vfs_caches_init(void)
{
names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL);
dcache_init();
inode_init();
files_init();
files_maxfiles_init();
mnt_init();
bdev_cache_init();
chrdev_init();
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2012 Fusion-io All rights reserved.
* Copyright (C) 2012 Intel Corp. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT 1
/*
* set when this rbio is sitting in the hash, but it is just a cache
* of past RMW
*/
#define RBIO_CACHE_BIT 2
/*
* set when it is safe to trust the stripe_pages for caching
*/
#define RBIO_CACHE_READY_BIT 3
#define RBIO_CACHE_SIZE 1024
#define BTRFS_STRIPE_HASH_TABLE_BITS 11
/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
struct list_head hash_list;
spinlock_t lock;
};
/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
struct list_head stripe_cache;
spinlock_t cache_lock;
int cache_size;
struct btrfs_stripe_hash table[];
};
enum btrfs_rbio_ops {
BTRFS_RBIO_WRITE,
BTRFS_RBIO_READ_REBUILD,
BTRFS_RBIO_PARITY_SCRUB,
BTRFS_RBIO_REBUILD_MISSING,
};
struct btrfs_raid_bio {
struct btrfs_fs_info *fs_info;
struct btrfs_bio *bbio;
/* while we're doing rmw on a stripe
* we put it into a hash table so we can
* lock the stripe and merge more rbios
* into it.
*/
struct list_head hash_list;
/*
* LRU list for the stripe cache
*/
struct list_head stripe_cache;
/*
* for scheduling work in the helper threads
*/
struct btrfs_work work;
/*
* bio list and bio_list_lock are used
* to add more bios into the stripe
* in hopes of avoiding the full rmw
*/
struct bio_list bio_list;
spinlock_t bio_list_lock;
/* also protected by the bio_list_lock, the
* plug list is used by the plugging code
* to collect partial bios while plugged. The
* stripe locking code also uses it to hand off
* the stripe lock to the next pending IO
*/
struct list_head plug_list;
/*
* flags that tell us if it is safe to
* merge with this bio
*/
unsigned long flags;
/* size of each individual stripe on disk */
int stripe_len;
/* number of data stripes (no p/q) */
int nr_data;
int real_stripes;
int stripe_npages;
/*
* set if we're doing a parity rebuild
* for a read from higher up, which is handled
* differently from a parity rebuild as part of
* rmw
*/
enum btrfs_rbio_ops operation;
/* first bad stripe */
int faila;
/* second bad stripe (for raid6 use) */
int failb;
int scrubp;
/*
* number of pages needed to represent the full
* stripe
*/
int nr_pages;
/*
* size of all the bios in the bio_list. This
* helps us decide if the rbio maps to a full
* stripe or not
*/
int bio_list_bytes;
int generic_bio_cnt;
refcount_t refs;
atomic_t stripes_pending;
atomic_t error;
/*
* these are two arrays of pointers. We allocate the
* rbio big enough to hold them both and setup their
* locations when the rbio is allocated
*/
/* pointers to pages that we allocated for
* reading/writing stripes directly from the disk (including P/Q)
*/
struct page **stripe_pages;
/*
* pointers to the pages in the bio_list. Stored
* here for faster lookup
*/
struct page **bio_pages;
/*
* bitmap to record which horizontal stripe has data
*/
unsigned long *dbitmap;
/* allocated with real_stripes-many pointers for finish_*() calls */
void **finish_pointers;
/* allocated with stripe_npages-many bits for finish_*() calls */
unsigned long *finish_pbitmap;
};
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check);
static void scrub_parity_work(struct btrfs_work *work);
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
btrfs_init_work(&rbio->work, work_func, NULL, NULL);
btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}
/*
* the stripe hash table is used for locking, and to collect
* bios in hopes of making a full stripe
*/
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
struct btrfs_stripe_hash_table *table;
struct btrfs_stripe_hash_table *x;
struct btrfs_stripe_hash *cur;
struct btrfs_stripe_hash *h;
int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
int i;
if (info->stripe_hash_table)
return 0;
/*
* The table is large, starting with order 4 and can go as high as
* order 7 in case lock debugging is turned on.
*
* Try harder to allocate and fallback to vmalloc to lower the chance
* of a failing mount.
*/
table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
if (!table)
return -ENOMEM;
spin_lock_init(&table->cache_lock);
INIT_LIST_HEAD(&table->stripe_cache);
h = table->table;
for (i = 0; i < num_entries; i++) {
cur = h + i;
INIT_LIST_HEAD(&cur->hash_list);
spin_lock_init(&cur->lock);
}
x = cmpxchg(&info->stripe_hash_table, NULL, table);
kvfree(x);
return 0;
}
/*
* caching an rbio means to copy anything from the
* bio_pages array into the stripe_pages array. We
* use the page uptodate bit in the stripe cache array
* to indicate if it has valid data
*
* once the caching is done, we set the cache ready
* bit.
*/
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
int i;
int ret;
ret = alloc_rbio_pages(rbio);
if (ret)
return;
for (i = 0; i < rbio->nr_pages; i++) {
if (!rbio->bio_pages[i])
continue;
copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
SetPageUptodate(rbio->stripe_pages[i]);
}
set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
/*
* we hash on the first logical address of the stripe
*/
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
u64 num = rbio->bbio->raid_map[0];
/*
* we shift down quite a bit. We're using byte
* addressing, and most of the lower bits are zeros.
* This tends to upset hash_64, and it consistently
* returns just one or two different values.
*
* shifting off the lower bits fixes things.
*/
return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
/*
* stealing an rbio means taking all the uptodate pages from the stripe
* array in the source rbio and putting them into the destination rbio
*/
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
int i;
struct page *s;
struct page *d;
if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
return;
for (i = 0; i < dest->nr_pages; i++) {
s = src->stripe_pages[i];
if (!s || !PageUptodate(s)) {
continue;
}
d = dest->stripe_pages[i];
if (d)
__free_page(d);
dest->stripe_pages[i] = s;
src->stripe_pages[i] = NULL;
}
}
/*
* merging means we take the bio_list from the victim and
* splice it into the destination. The victim should
* be discarded afterwards.
*
* must be called with dest->rbio_list_lock held
*/
static void merge_rbio(struct btrfs_raid_bio *dest,
struct btrfs_raid_bio *victim)
{
bio_list_merge(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
dest->generic_bio_cnt += victim->generic_bio_cnt;
bio_list_init(&victim->bio_list);
}
/*
* used to prune items that are in the cache. The caller
* must hold the hash table lock.
*/
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
int bucket = rbio_bucket(rbio);
struct btrfs_stripe_hash_table *table;
struct btrfs_stripe_hash *h;
int freeit = 0;
/*
* check the bit again under the hash table lock.
*/
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
table = rbio->fs_info->stripe_hash_table;
h = table->table + bucket;
/* hold the lock for the bucket because we may be
* removing it from the hash table
*/
spin_lock(&h->lock);
/*
* hold the lock for the bio list because we need
* to make sure the bio list is empty
*/
spin_lock(&rbio->bio_list_lock);
if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
list_del_init(&rbio->stripe_cache);
table->cache_size -= 1;
freeit = 1;
/* if the bio list isn't empty, this rbio is
* still involved in an IO. We take it out
* of the cache list, and drop the ref that
* was held for the list.
*
* If the bio_list was empty, we also remove
* the rbio from the hash_table, and drop
* the corresponding ref
*/
if (bio_list_empty(&rbio->bio_list)) {
if (!list_empty(&rbio->hash_list)) {
list_del_init(&rbio->hash_list);
refcount_dec(&rbio->refs);
BUG_ON(!list_empty(&rbio->plug_list));
}
}
}
spin_unlock(&rbio->bio_list_lock);
spin_unlock(&h->lock);
if (freeit)
__free_raid_bio(rbio);
}
/*
* prune a given rbio from the cache
*/
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
struct btrfs_stripe_hash_table *table;
unsigned long flags;
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
table = rbio->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
__remove_rbio_from_cache(rbio);
spin_unlock_irqrestore(&table->cache_lock, flags);
}
/*
* remove everything in the cache
*/
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
struct btrfs_stripe_hash_table *table;
unsigned long flags;
struct btrfs_raid_bio *rbio;
table = info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
while (!list_empty(&table->stripe_cache)) {
rbio = list_entry(table->stripe_cache.next,
struct btrfs_raid_bio,
stripe_cache);
__remove_rbio_from_cache(rbio);
}
spin_unlock_irqrestore(&table->cache_lock, flags);
}
/*
* remove all cached entries and free the hash table
* used by unmount
*/
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
if (!info->stripe_hash_table)
return;
btrfs_clear_rbio_cache(info);
kvfree(info->stripe_hash_table);
info->stripe_hash_table = NULL;
}
/*
* insert an rbio into the stripe cache. It
* must have already been prepared by calling
* cache_rbio_pages
*
* If this rbio was already cached, it gets
* moved to the front of the lru.
*
* If the size of the rbio cache is too big, we
* prune an item.
*/
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
struct btrfs_stripe_hash_table *table;
unsigned long flags;
if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
return;
table = rbio->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
spin_lock(&rbio->bio_list_lock);
/* bump our ref if we were not in the list before */
if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
refcount_inc(&rbio->refs);
if (!list_empty(&rbio->stripe_cache)){
list_move(&rbio->stripe_cache, &table->stripe_cache);
} else {
list_add(&rbio->stripe_cache, &table->stripe_cache);
table->cache_size += 1;
}
spin_unlock(&rbio->bio_list_lock);
if (table->cache_size > RBIO_CACHE_SIZE) {
struct btrfs_raid_bio *found;
found = list_entry(table->stripe_cache.prev,
struct btrfs_raid_bio,
stripe_cache);
if (found != rbio)
__remove_rbio_from_cache(found);
}
spin_unlock_irqrestore(&table->cache_lock, flags);
}
/*
* helper function to run the xor_blocks api. It is only
* able to do MAX_XOR_BLOCKS at a time, so we need to
* loop through.
*/
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
int src_off = 0;
int xor_src_cnt = 0;
void *dest = pages[src_cnt];
while(src_cnt > 0) {
xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
xor_blocks(xor_src_cnt, len, dest, pages + src_off);
src_cnt -= xor_src_cnt;
src_off += xor_src_cnt;
}
}
/*
* Returns true if the bio list inside this rbio covers an entire stripe (no
* rmw required).
*/
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
unsigned long flags;
unsigned long size = rbio->bio_list_bytes;
int ret = 1;
spin_lock_irqsave(&rbio->bio_list_lock, flags);
if (size != rbio->nr_data * rbio->stripe_len)
ret = 0;
BUG_ON(size > rbio->nr_data * rbio->stripe_len);
spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
return ret;
}
/*
* returns 1 if it is safe to merge two rbios together.
* The merging is safe if the two rbios correspond to
* the same stripe and if they are both going in the same
* direction (read vs write), and if neither one is
* locked for final IO
*
* The caller is responsible for locking such that
* rmw_locked is safe to test
*/
static int rbio_can_merge(struct btrfs_raid_bio *last,
struct btrfs_raid_bio *cur)
{
if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
return 0;
/*
* we can't merge with cached rbios, since the
* idea is that when we merge the destination
* rbio is going to run our IO for us. We can
* steal from cached rbios though, other functions
* handle that.
*/
if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
test_bit(RBIO_CACHE_BIT, &cur->flags))
return 0;
if (last->bbio->raid_map[0] !=
cur->bbio->raid_map[0])
return 0;
/* we can't merge with different operations */
if (last->operation != cur->operation)
return 0;
/*
* We've need read the full stripe from the drive.
* check and repair the parity and write the new results.
*
* We're not allowed to add any new bios to the
* bio list here, anyone else that wants to
* change this stripe needs to do their own rmw.
*/
if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
return 0;
if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
return 0;
if (last->operation == BTRFS_RBIO_READ_REBUILD) {
int fa = last->faila;
int fb = last->failb;
int cur_fa = cur->faila;
int cur_fb = cur->failb;
if (last->faila >= last->failb) {
fa = last->failb;
fb = last->faila;
}
if (cur->faila >= cur->failb) {
cur_fa = cur->failb;
cur_fb = cur->faila;
}
if (fa != cur_fa || fb != cur_fb)
return 0;
}
return 1;
}
static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
int index)
{
return stripe * rbio->stripe_npages + index;
}
/*
* these are just the pages from the rbio array, not from anything
* the FS sent down to us
*/
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
int index)
{
return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
}
/*
* helper to index into the pstripe
*/
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
return rbio_stripe_page(rbio, rbio->nr_data, index);
}
/*
* helper to index into the qstripe, returns null
* if there is no qstripe
*/
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
if (rbio->nr_data + 1 == rbio->real_stripes)
return NULL;
return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
}
/*
* The first stripe in the table for a logical address
* has the lock. rbios are added in one of three ways:
*
* 1) Nobody has the stripe locked yet. The rbio is given
* the lock and 0 is returned. The caller must start the IO
* themselves.
*
* 2) Someone has the stripe locked, but we're able to merge
* with the lock owner. The rbio is freed and the IO will
* start automatically along with the existing rbio. 1 is returned.
*
* 3) Someone has the stripe locked, but we're not able to merge.
* The rbio is added to the lock owner's plug list, or merged into
* an rbio already on the plug list. When the lock owner unlocks,
* the next rbio on the list is run and the IO is started automatically.
* 1 is returned
*
* If we return 0, the caller still owns the rbio and must continue with
* IO submission. If we return 1, the caller must assume the rbio has
* already been freed.
*/
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
struct btrfs_stripe_hash *h;
struct btrfs_raid_bio *cur;
struct btrfs_raid_bio *pending;
unsigned long flags;
struct btrfs_raid_bio *freeit = NULL;
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
continue;
spin_lock(&cur->bio_list_lock);
/* Can we steal this cached rbio's pages? */
if (bio_list_empty(&cur->bio_list) &&
list_empty(&cur->plug_list) &&
test_bit(RBIO_CACHE_BIT, &cur->flags) &&
!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
list_del_init(&cur->hash_list);
refcount_dec(&cur->refs);
steal_rbio(cur, rbio);
cache_drop = cur;
spin_unlock(&cur->bio_list_lock);
goto lockit;
}
/* Can we merge into the lock owner? */
if (rbio_can_merge(cur, rbio)) {
merge_rbio(cur, rbio);
spin_unlock(&cur->bio_list_lock);
freeit = rbio;
ret = 1;
goto out;
}
/*
* We couldn't merge with the running rbio, see if we can merge
* with the pending ones. We don't have to check for rmw_locked
* because there is no way they are inside finish_rmw right now
*/
list_for_each_entry(pending, &cur->plug_list, plug_list) {
if (rbio_can_merge(pending, rbio)) {
merge_rbio(pending, rbio);
spin_unlock(&cur->bio_list_lock);
freeit = rbio;
ret = 1;
goto out;
}
}
/*
* No merging, put us on the tail of the plug list, our rbio
* will be started with the currently running rbio unlocks
*/
list_add_tail(&rbio->plug_list, &cur->plug_list);
spin_unlock(&cur->bio_list_lock);
ret = 1;
goto out;
}
lockit:
refcount_inc(&rbio->refs);
list_add(&rbio->hash_list, &h->hash_list);
out:
spin_unlock_irqrestore(&h->lock, flags);
if (cache_drop)
remove_rbio_from_cache(cache_drop);
if (freeit)
__free_raid_bio(freeit);
return ret;
}
/*
* called as rmw or parity rebuild is completed. If the plug list has more
* rbios waiting for this stripe, the next one on the list will be started
*/
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
int bucket;
struct btrfs_stripe_hash *h;
unsigned long flags;
int keep_cache = 0;
bucket = rbio_bucket(rbio);
h = rbio->fs_info->stripe_hash_table->table + bucket;
if (list_empty(&rbio->plug_list))
cache_rbio(rbio);
spin_lock_irqsave(&h->lock, flags);
spin_lock(&rbio->bio_list_lock);
if (!list_empty(&rbio->hash_list)) {
/*
* if we're still cached and there is no other IO
* to perform, just leave this rbio here for others
* to steal from later
*/
if (list_empty(&rbio->plug_list) &&
test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
keep_cache = 1;
clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
BUG_ON(!bio_list_empty(&rbio->bio_list));
goto done;
}
list_del_init(&rbio->hash_list);
refcount_dec(&rbio->refs);
/*
* we use the plug list to hold all the rbios
* waiting for the chance to lock this stripe.
* hand the lock over to one of them.
*/
if (!list_empty(&rbio->plug_list)) {
struct btrfs_raid_bio *next;
struct list_head *head = rbio->plug_list.next;
next = list_entry(head, struct btrfs_raid_bio,
plug_list);
list_del_init(&rbio->plug_list);
list_add(&next->hash_list, &h->hash_list);
refcount_inc(&next->refs);
spin_unlock(&rbio->bio_list_lock);
spin_unlock_irqrestore(&h->lock, flags);
if (next->operation == BTRFS_RBIO_READ_REBUILD)
start_async_work(next, read_rebuild_work);
else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
steal_rbio(rbio, next);
start_async_work(next, read_rebuild_work);
} else if (next->operation == BTRFS_RBIO_WRITE) {
steal_rbio(rbio, next);
start_async_work(next, rmw_work);
} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
steal_rbio(rbio, next);
start_async_work(next, scrub_parity_work);
}
goto done_nolock;
}
}
done:
spin_unlock(&rbio->bio_list_lock);
spin_unlock_irqrestore(&h->lock, flags);
done_nolock:
if (!keep_cache)
remove_rbio_from_cache(rbio);
}
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
int i;
if (!refcount_dec_and_test(&rbio->refs))
return;
WARN_ON(!list_empty(&rbio->stripe_cache));
WARN_ON(!list_empty(&rbio->hash_list));
WARN_ON(!bio_list_empty(&rbio->bio_list));
for (i = 0; i < rbio->nr_pages; i++) {
if (rbio->stripe_pages[i]) {
__free_page(rbio->stripe_pages[i]);
rbio->stripe_pages[i] = NULL;
}
}
btrfs_put_bbio(rbio->bbio);
kfree(rbio);
}
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
struct bio *next;
while (cur) {
next = cur->bi_next;
cur->bi_next = NULL;
cur->bi_status = err;
bio_endio(cur);
cur = next;
}
}
/*
* this frees the rbio and runs through all the bios in the
* bio_list and calls end_io on them
*/
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *extra;
if (rbio->generic_bio_cnt)
btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
* always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
* hash list, rbio may be merged with others so that rbio->bio_list
* becomes non-empty.
* Once unlock_stripe() is done, rbio->bio_list will not be updated any
* more and we can call bio_endio() on all queued bios.
*/
unlock_stripe(rbio);
extra = bio_list_get(&rbio->bio_list);
__free_raid_bio(rbio);
rbio_endio_bio_list(cur, err);
if (extra)
rbio_endio_bio_list(extra, err);
}
/*
* end io function used by finish_rmw. When we finally
* get here, we've written a full stripe
*/
static void raid_write_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
blk_status_t err = bio->bi_status;
int max_errors;
if (err)
fail_bio_stripe(rbio, bio);
bio_put(bio);
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
err = BLK_STS_OK;
/* OK, we have read all the stripes we need to. */
max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
0 : rbio->bbio->max_errors;
if (atomic_read(&rbio->error) > max_errors)
err = BLK_STS_IOERR;
rbio_orig_end_io(rbio, err);
}
/*
* the read/modify/write code wants to use the original bio for
* any pages it included, and then use the rbio for everything
* else. This function decides if a given index (stripe number)
* and page number in that stripe fall inside the original bio
* or the rbio.
*
* if you set bio_list_only, you'll get a NULL back for any ranges
* that are outside the bio_list
*
* This doesn't take any refs on anything, you get a bare page pointer
* and the caller must bump refs as required.
*
* You must call index_rbio_pages once before you can trust
* the answers from this function.
*/
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
int index, int pagenr, int bio_list_only)
{
int chunk_page;
struct page *p = NULL;
chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
spin_lock_irq(&rbio->bio_list_lock);
p = rbio->bio_pages[chunk_page];
spin_unlock_irq(&rbio->bio_list_lock);
if (p || bio_list_only)
return p;
return rbio->stripe_pages[chunk_page];
}
/*
* number of pages we need for the entire stripe across all the
* drives
*/
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
}
/*
* allocation and initial setup for the btrfs_raid_bio. Not
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
struct btrfs_bio *bbio,
u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
int nr_data = 0;
int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
int num_pages = rbio_nr_pages(stripe_len, real_stripes);
int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
rbio = kzalloc(sizeof(*rbio) +
sizeof(*rbio->stripe_pages) * num_pages +
sizeof(*rbio->bio_pages) * num_pages +
sizeof(*rbio->finish_pointers) * real_stripes +
sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
sizeof(*rbio->finish_pbitmap) *
BITS_TO_LONGS(stripe_npages),
GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
bio_list_init(&rbio->bio_list);
INIT_LIST_HEAD(&rbio->plug_list);
spin_lock_init(&rbio->bio_list_lock);
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
rbio->bbio = bbio;
rbio->fs_info = fs_info;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
rbio->real_stripes = real_stripes;
rbio->stripe_npages = stripe_npages;
rbio->faila = -1;
rbio->failb = -1;
refcount_set(&rbio->refs, 1);
atomic_set(&rbio->error, 0);
atomic_set(&rbio->stripes_pending, 0);
/*
* the stripe_pages, bio_pages, etc arrays point to the extra
* memory we allocated past the end of the rbio
*/
p = rbio + 1;
#define CONSUME_ALLOC(ptr, count) do { \
ptr = p; \
p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
} while (0)
CONSUME_ALLOC(rbio->stripe_pages, num_pages);
CONSUME_ALLOC(rbio->bio_pages, num_pages);
CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC
if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
nr_data = real_stripes - 1;
else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
nr_data = real_stripes - 2;
else
BUG();
rbio->nr_data = nr_data;
return rbio;
}
/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
int i;
struct page *page;
for (i = 0; i < rbio->nr_pages; i++) {
if (rbio->stripe_pages[i])
continue;
page = alloc_page(GFP_NOFS);
if (!page)
return -ENOMEM;
rbio->stripe_pages[i] = page;
}
return 0;
}
/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
int i;
struct page *page;
i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
for (; i < rbio->nr_pages; i++) {
if (rbio->stripe_pages[i])
continue;
page = alloc_page(GFP_NOFS);
if (!page)
return -ENOMEM;
rbio->stripe_pages[i] = page;
}
return 0;
}
/*
* add a single page from a specific stripe into our list of bios for IO
* this will try to merge into existing bios if possible, and returns
* zero if all went well.
*/
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
struct bio_list *bio_list,
struct page *page,
int stripe_nr,
unsigned long page_index,
unsigned long bio_max_len)
{
struct bio *last = bio_list->tail;
int ret;
struct bio *bio;
struct btrfs_bio_stripe *stripe;
u64 disk_start;
stripe = &rbio->bbio->stripes[stripe_nr];
disk_start = stripe->physical + (page_index << PAGE_SHIFT);
/* if the device is missing, just fail this stripe */
if (!stripe->dev->bdev)
return fail_rbio_index(rbio, stripe_nr);
/* see if we can add this page onto our existing bio */
if (last) {
u64 last_end = last->bi_iter.bi_sector << 9;
last_end += last->bi_iter.bi_size;
/*
* we can't merge these if they are from different
* devices or if they are not contiguous
*/
if (last_end == disk_start && !last->bi_status &&
last->bi_bdev == stripe->dev->bdev) {
ret = bio_add_page(last, page, PAGE_SIZE, 0);
if (ret == PAGE_SIZE)
return 0;
}
}
/* put a new bio on the list */
bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
btrfs_io_bio(bio)->device = stripe->dev;
bio->bi_iter.bi_size = 0;
bio_set_dev(bio, stripe->dev->bdev);
bio->bi_iter.bi_sector = disk_start >> 9;
bio_add_page(bio, page, PAGE_SIZE, 0);
bio_list_add(bio_list, bio);
return 0;
}
/*
* while we're doing the read/modify/write cycle, we could
* have errors in reading pages off the disk. This checks
* for errors and if we're not able to read the page it'll
* trigger parity reconstruction. The rmw will be finished
* after we've reconstructed the failed stripes
*/
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
if (rbio->faila >= 0 || rbio->failb >= 0) {
BUG_ON(rbio->faila == rbio->real_stripes - 1);
__raid56_parity_recover(rbio);
} else {
finish_rmw(rbio);
}
}
/*
* helper function to walk our bio list and populate the bio_pages array with
* the result. This seems expensive, but it is faster than constantly
* searching through the bio list as we setup the IO in finish_rmw or stripe
* reconstruction.
*
* This must be called before you trust the answers from page_in_rbio
*/
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
struct bio *bio;
u64 start;
unsigned long stripe_offset;
unsigned long page_index;
spin_lock_irq(&rbio->bio_list_lock);
bio_list_for_each(bio, &rbio->bio_list) {
struct bio_vec bvec;
struct bvec_iter iter;
int i = 0;
start = bio->bi_iter.bi_sector << 9;
stripe_offset = start - rbio->bbio->raid_map[0];
page_index = stripe_offset >> PAGE_SHIFT;
if (bio_flagged(bio, BIO_CLONED))
bio->bi_iter = btrfs_io_bio(bio)->iter;
bio_for_each_segment(bvec, bio, iter) {
rbio->bio_pages[page_index + i] = bvec.bv_page;
i++;
}
}
spin_unlock_irq(&rbio->bio_list_lock);
}
/*
* this is called from one of two situations. We either
* have a full stripe from the higher layers, or we've read all
* the missing bits off disk.
*
* This will calculate the parity and then send down any
* changed blocks.
*/
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
struct btrfs_bio *bbio = rbio->bbio;
void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
bool has_qstripe;
struct bio_list bio_list;
struct bio *bio;
int ret;
bio_list_init(&bio_list);
if (rbio->real_stripes - rbio->nr_data == 1)
has_qstripe = false;
else if (rbio->real_stripes - rbio->nr_data == 2)
has_qstripe = true;
else
BUG();
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
* recalculate the parity and write the new results.
*
* We're not allowed to add any new bios to the
* bio list here, anyone else that wants to
* change this stripe needs to do their own rmw.
*/
spin_lock_irq(&rbio->bio_list_lock);
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
spin_unlock_irq(&rbio->bio_list_lock);
atomic_set(&rbio->error, 0);
/*
* now that we've set rmw_locked, run through the
* bio list one last time and map the page pointers
*
* We don't cache full rbios because we're assuming
* the higher layers are unlikely to use this area of
* the disk again soon. If they do use it again,
* hopefully they will send another full bio.
*/
index_rbio_pages(rbio);
if (!rbio_is_full(rbio))
cache_rbio_pages(rbio);
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *p;
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
p = page_in_rbio(rbio, stripe, pagenr, 0);
pointers[stripe] = kmap_local_page(p);
}
/* then add the parity stripe */
p = rbio_pstripe_page(rbio, pagenr);
SetPageUptodate(p);
pointers[stripe++] = kmap_local_page(p);
if (has_qstripe) {
/*
* raid6, add the qstripe and call the
* library function to fill in our p/q
*/
p = rbio_qstripe_page(rbio, pagenr);
SetPageUptodate(p);
pointers[stripe++] = kmap_local_page(p);
raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
pointers);
} else {
/* raid5 */
copy_page(pointers[nr_data], pointers[0]);
run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
}
for (stripe = stripe - 1; stripe >= 0; stripe--)
kunmap_local(pointers[stripe]);
}
/*
* time to start writing. Make bios for everything from the
* higher layers (the bio_list in our rbio) and our p/q. Ignore
* everything else.
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (!page)
continue;
} else {
page = rbio_stripe_page(rbio, stripe, pagenr);
}
ret = rbio_add_io_page(rbio, &bio_list,
page, stripe, pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
}
}
if (likely(!bbio->num_tgtdevs))
goto write_data;
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
if (!bbio->tgtdev_map[stripe])
continue;
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (!page)
continue;
} else {
page = rbio_stripe_page(rbio, stripe, pagenr);
}
ret = rbio_add_io_page(rbio, &bio_list, page,
rbio->bbio->tgtdev_map[stripe],
pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
}
}
write_data:
atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
bio->bi_opf = REQ_OP_WRITE;
submit_bio(bio);
}
return;
cleanup:
rbio_orig_end_io(rbio, BLK_STS_IOERR);
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
}
/*
* helper to find the stripe number for a given bio. Used to figure out which
* stripe has failed. This expects the bio to correspond to a physical disk,
* so it looks up based on physical sector numbers.
*/
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
u64 physical = bio->bi_iter.bi_sector;
int i;
struct btrfs_bio_stripe *stripe;
physical <<= 9;
for (i = 0; i < rbio->bbio->num_stripes; i++) {
stripe = &rbio->bbio->stripes[i];
if (in_range(physical, stripe->physical, rbio->stripe_len) &&
stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
}
}
return -1;
}
/*
* helper to find the stripe number for a given
* bio (before mapping). Used to figure out which stripe has
* failed. This looks up based on logical block numbers.
*/
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
u64 logical = bio->bi_iter.bi_sector << 9;
int i;
for (i = 0; i < rbio->nr_data; i++) {
u64 stripe_start = rbio->bbio->raid_map[i];
if (in_range(logical, stripe_start, rbio->stripe_len))
return i;
}
return -1;
}
/*
* returns -EIO if we had too many failures
*/
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
unsigned long flags;
int ret = 0;
spin_lock_irqsave(&rbio->bio_list_lock, flags);
/* we already know this stripe is bad, move on */
if (rbio->faila == failed || rbio->failb == failed)
goto out;
if (rbio->faila == -1) {
/* first failure on this rbio */
rbio->faila = failed;
atomic_inc(&rbio->error);
} else if (rbio->failb == -1) {
/* second failure on this rbio */
rbio->failb = failed;
atomic_inc(&rbio->error);
} else {
ret = -EIO;
}
out:
spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
return ret;
}
/*
* helper to fail a stripe based on a physical disk
* bio.
*/
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
int failed = find_bio_stripe(rbio, bio);
if (failed < 0)
return -EIO;
return fail_rbio_index(rbio, failed);
}
/*
* this sets each page in the bio uptodate. It should only be used on private
* rbio pages, nothing that comes in from the higher layers
*/
static void set_bio_pages_uptodate(struct bio *bio)
{
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all)
SetPageUptodate(bvec->bv_page);
}
/*
* end io for the read phase of the rmw cycle. All the bios here are physical
* stripe bios we've read from the disk so we can recalculate the parity of the
* stripe.
*
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
static void raid_rmw_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
bio_put(bio);
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
goto cleanup;
/*
* this will normally call finish_rmw to start our write
* but if there are any failed stripes we'll reconstruct
* from parity first
*/
validate_rbio_for_rmw(rbio);
return;
cleanup:
rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
/*
* the stripe must be locked by the caller. It will
* unlock after all the writes are done
*/
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
int pagenr;
int stripe;
struct bio *bio;
bio_list_init(&bio_list);
ret = alloc_rbio_pages(rbio);
if (ret)
goto cleanup;
index_rbio_pages(rbio);
atomic_set(&rbio->error, 0);
/*
* build a list of bios to read all the missing parts of this
* stripe
*/
for (stripe = 0; stripe < rbio->nr_data; stripe++) {
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
/*
* we want to find all the pages missing from
* the rbio and read them from the disk. If
* page_in_rbio finds a page in the bio list
* we don't need to read it off the stripe.
*/
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (page)
continue;
page = rbio_stripe_page(rbio, stripe, pagenr);
/*
* the bio cache may have handed us an uptodate
* page. If so, be happy and use it
*/
if (PageUptodate(page))
continue;
ret = rbio_add_io_page(rbio, &bio_list, page,
stripe, pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
}
}
bios_to_read = bio_list_size(&bio_list);
if (!bios_to_read) {
/*
* this can happen if others have merged with
* us, it means there is nothing left to read.
* But if there are missing devices it may not be
* safe to do the full stripe write yet.
*/
goto finish;
}
/*
* the bbio may be freed once we submit the last bio. Make sure
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_rmw_end_io;
bio->bi_opf = REQ_OP_READ;
btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
/* the actual write will happen once the reads are done */
return 0;
cleanup:
rbio_orig_end_io(rbio, BLK_STS_IOERR);
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
return -EIO;
finish:
validate_rbio_for_rmw(rbio);
return 0;
}
/*
* if the upper layers pass in a full stripe, we thank them by only allocating
* enough pages to hold the parity, and sending it all down quickly.
*/
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
int ret;
ret = alloc_rbio_parity_pages(rbio);
if (ret) {
__free_raid_bio(rbio);
return ret;
}
ret = lock_stripe_add(rbio);
if (ret == 0)
finish_rmw(rbio);
return 0;
}
/*
* partial stripe writes get handed over to async helpers.
* We're really hoping to merge a few more writes into this
* rbio before calculating new parity
*/
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
int ret;
ret = lock_stripe_add(rbio);
if (ret == 0)
start_async_work(rbio, rmw_work);
return 0;
}
/*
* sometimes while we were reading from the drive to
* recalculate parity, enough new bios come into create
* a full stripe. So we do a check here to see if we can
* go directly to finish_rmw
*/
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
/* head off into rmw land if we don't have a full stripe */
if (!rbio_is_full(rbio))
return partial_stripe_write(rbio);
return full_stripe_write(rbio);
}
/*
* We use plugging call backs to collect full stripes.
* Any time we get a partial stripe write while plugged
* we collect it into a list. When the unplug comes down,
* we sort the list by logical block number and merge
* everything we can into the same rbios
*/
struct btrfs_plug_cb {
struct blk_plug_cb cb;
struct btrfs_fs_info *info;
struct list_head rbio_list;
struct btrfs_work work;
};
/*
* rbios on the plug list are sorted for easier merging.
*/
static int plug_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
plug_list);
const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
plug_list);
u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
if (a_sector < b_sector)
return -1;
if (a_sector > b_sector)
return 1;
return 0;
}
static void run_plug(struct btrfs_plug_cb *plug)
{
struct btrfs_raid_bio *cur;
struct btrfs_raid_bio *last = NULL;
/*
* sort our plug list then try to merge
* everything we can in hopes of creating full
* stripes.
*/
list_sort(NULL, &plug->rbio_list, plug_cmp);
while (!list_empty(&plug->rbio_list)) {
cur = list_entry(plug->rbio_list.next,
struct btrfs_raid_bio, plug_list);
list_del_init(&cur->plug_list);
if (rbio_is_full(cur)) {
int ret;
/* we have a full stripe, send it down */
ret = full_stripe_write(cur);
BUG_ON(ret);
continue;
}
if (last) {
if (rbio_can_merge(last, cur)) {
merge_rbio(last, cur);
__free_raid_bio(cur);
continue;
}
__raid56_parity_write(last);
}
last = cur;
}
if (last) {
__raid56_parity_write(last);
}
kfree(plug);
}
/*
* if the unplug comes from schedule, we have to push the
* work off to a helper thread
*/
static void unplug_work(struct btrfs_work *work)
{
struct btrfs_plug_cb *plug;
plug = container_of(work, struct btrfs_plug_cb, work);
run_plug(plug);
}
static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
struct btrfs_plug_cb *plug;
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
btrfs_queue_work(plug->info->rmw_workers,
&plug->work);
return;
}
run_plug(plug);
}
/*
* our main entry point for writes from the rest of the FS.
*/
int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
int ret;
rbio = alloc_rbio(fs_info, bbio, stripe_len);
if (IS_ERR(rbio)) {
btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
bio_list_add(&rbio->bio_list, bio);
rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->operation = BTRFS_RBIO_WRITE;
btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
/*
* don't plug on full rbios, just get them out the door
* as quickly as we can
*/
if (rbio_is_full(rbio)) {
ret = full_stripe_write(rbio);
if (ret)
btrfs_bio_counter_dec(fs_info);
return ret;
}
cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
if (cb) {
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (!plug->info) {
plug->info = fs_info;
INIT_LIST_HEAD(&plug->rbio_list);
}
list_add_tail(&rbio->plug_list, &plug->rbio_list);
ret = 0;
} else {
ret = __raid56_parity_write(rbio);
if (ret)
btrfs_bio_counter_dec(fs_info);
}
return ret;
}
/*
* all parity reconstruction happens here. We've read in everything
* we can find from the drives and this does the heavy lifting of
* sorting the good from the bad.
*/
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
int pagenr, stripe;
void **pointers;
void **unmap_array;
int faila = -1, failb = -1;
struct page *page;
blk_status_t err;
int i;
pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
if (!pointers) {
err = BLK_STS_RESOURCE;
goto cleanup_io;
}
/*
* Store copy of pointers that does not get reordered during
* reconstruction so that kunmap_local works.
*/
unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
if (!unmap_array) {
err = BLK_STS_RESOURCE;
goto cleanup_pointers;
}
faila = rbio->faila;
failb = rbio->failb;
if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
spin_lock_irq(&rbio->bio_list_lock);
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
spin_unlock_irq(&rbio->bio_list_lock);
}
index_rbio_pages(rbio);
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
/*
* Now we just use bitmap to mark the horizontal stripes in
* which we have data when doing parity scrub.
*/
if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
!test_bit(pagenr, rbio->dbitmap))
continue;
/*
* Setup our array of pointers with pages from each stripe
*
* NOTE: store a duplicate array of pointers to preserve the
* pointer order
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
} else {
page = rbio_stripe_page(rbio, stripe, pagenr);
}
pointers[stripe] = kmap_local_page(page);
unmap_array[stripe] = pointers[stripe];
}
/* all raid6 handling here */
if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
/*
* single failure, rebuild from parity raid5
* style
*/
if (failb < 0) {
if (faila == rbio->nr_data) {
/*
* Just the P stripe has failed, without
* a bad data or Q stripe.
* TODO, we should redo the xor here.
*/
err = BLK_STS_IOERR;
goto cleanup;
}
/*
* a single failure in raid6 is rebuilt
* in the pstripe code below
*/
goto pstripe;
}
/* make sure our ps and qs are in order */
if (faila > failb)
swap(faila, failb);
/* if the q stripe is failed, do a pstripe reconstruction
* from the xors.
* If both the q stripe and the P stripe are failed, we're
* here due to a crc mismatch and we can't give them the
* data they want
*/
if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
if (rbio->bbio->raid_map[faila] ==
RAID5_P_STRIPE) {
err = BLK_STS_IOERR;
goto cleanup;
}
/*
* otherwise we have one bad data stripe and
* a good P stripe. raid5!
*/
goto pstripe;
}
if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
PAGE_SIZE, faila, pointers);
} else {
raid6_2data_recov(rbio->real_stripes,
PAGE_SIZE, faila, failb,
pointers);
}
} else {
void *p;
/* rebuild from P stripe here (raid5 or raid6) */
BUG_ON(failb != -1);
pstripe:
/* Copy parity block into failed block to start with */
copy_page(pointers[faila], pointers[rbio->nr_data]);
/* rearrange the pointer array */
p = pointers[faila];
for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
pointers[stripe] = pointers[stripe + 1];
pointers[rbio->nr_data - 1] = p;
/* xor in the rest */
run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
}
/* if we're doing this rebuild as part of an rmw, go through
* and set all of our private rbio pages in the
* failed stripes as uptodate. This way finish_rmw will
* know they can be trusted. If this was a read reconstruction,
* other endio functions will fiddle the uptodate bits
*/
if (rbio->operation == BTRFS_RBIO_WRITE) {
for (i = 0; i < rbio->stripe_npages; i++) {
if (faila != -1) {
page = rbio_stripe_page(rbio, faila, i);
SetPageUptodate(page);
}
if (failb != -1) {
page = rbio_stripe_page(rbio, failb, i);
SetPageUptodate(page);
}
}
}
for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
kunmap_local(unmap_array[stripe]);
}
err = BLK_STS_OK;
cleanup:
kfree(unmap_array);
cleanup_pointers:
kfree(pointers);
cleanup_io:
/*
* Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
* valid rbio which is consistent with ondisk content, thus such a
* valid rbio can be cached to avoid further disk reads.
*/
if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
/*
* - In case of two failures, where rbio->failb != -1:
*
* Do not cache this rbio since the above read reconstruction
* (raid6_datap_recov() or raid6_2data_recov()) may have
* changed some content of stripes which are not identical to
* on-disk content any more, otherwise, a later write/recover
* may steal stripe_pages from this rbio and end up with
* corruptions or rebuild failures.
*
* - In case of single failure, where rbio->failb == -1:
*
* Cache this rbio iff the above read reconstruction is
* executed without problems.
*/
if (err == BLK_STS_OK && rbio->failb < 0)
cache_rbio_pages(rbio);
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
rbio_orig_end_io(rbio, err);
} else if (err == BLK_STS_OK) {
rbio->faila = -1;
rbio->failb = -1;
if (rbio->operation == BTRFS_RBIO_WRITE)
finish_rmw(rbio);
else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
finish_parity_scrub(rbio, 0);
else
BUG();
} else {
rbio_orig_end_io(rbio, err);
}
}
/*
* This is called only for stripes we've read from disk to
* reconstruct the parity.
*/
static void raid_recover_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
/*
* we only read stripe pages off the disk, set them
* up to date if there were no errors
*/
if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
bio_put(bio);
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
else
__raid_recover_end_io(rbio);
}
/*
* reads everything we need off the disk to reconstruct
* the parity. endio handlers trigger final reconstruction
* when the IO is done.
*
* This is used both for reads from the higher layers and for
* parity construction required to finish a rmw cycle.
*/
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
int pagenr;
int stripe;
struct bio *bio;
bio_list_init(&bio_list);
ret = alloc_rbio_pages(rbio);
if (ret)
goto cleanup;
atomic_set(&rbio->error, 0);
/*
* read everything that hasn't failed. Thanks to the
* stripe cache, it is possible that some or all of these
* pages are going to be uptodate.
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
if (rbio->faila == stripe || rbio->failb == stripe) {
atomic_inc(&rbio->error);
continue;
}
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *p;
/*
* the rmw code may have already read this
* page in
*/
p = rbio_stripe_page(rbio, stripe, pagenr);
if (PageUptodate(p))
continue;
ret = rbio_add_io_page(rbio, &bio_list,
rbio_stripe_page(rbio, stripe, pagenr),
stripe, pagenr, rbio->stripe_len);
if (ret < 0)
goto cleanup;
}
}
bios_to_read = bio_list_size(&bio_list);
if (!bios_to_read) {
/*
* we might have no bios to read just because the pages
* were up to date, or we might have no bios to read because
* the devices were gone.
*/
if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
__raid_recover_end_io(rbio);
return 0;
} else {
goto cleanup;
}
}
/*
* the bbio may be freed once we submit the last bio. Make sure
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_recover_end_io;
bio->bi_opf = REQ_OP_READ;
btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
return 0;
cleanup:
if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
return -EIO;
}
/*
* the main entry point for reads from the higher layers. This
* is really only called when the normal read path had a failure,
* so we assume the bio they send down corresponds to a failed part
* of the drive.
*/
int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len,
int mirror_num, int generic_io)
{
struct btrfs_raid_bio *rbio;
int ret;
if (generic_io) {
ASSERT(bbio->mirror_num == mirror_num);
btrfs_io_bio(bio)->mirror_num = mirror_num;
}
rbio = alloc_rbio(fs_info, bbio, stripe_len);
if (IS_ERR(rbio)) {
if (generic_io)
btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
bio_list_add(&rbio->bio_list, bio);
rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
btrfs_warn(fs_info,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
__func__, bio->bi_iter.bi_sector << 9,
(u64)bio->bi_iter.bi_size, bbio->map_type);
if (generic_io)
btrfs_put_bbio(bbio);
kfree(rbio);
return -EIO;
}
if (generic_io) {
btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
} else {
btrfs_get_bbio(bbio);
}
/*
* Loop retry:
* for 'mirror == 2', reconstruct from all other stripes.
* for 'mirror_num > 2', select a stripe to fail on every retry.
*/
if (mirror_num > 2) {
/*
* 'mirror == 3' is to fail the p stripe and
* reconstruct from the q stripe. 'mirror > 3' is to
* fail a data stripe and reconstruct from p+q stripe.
*/
rbio->failb = rbio->real_stripes - (mirror_num - 1);
ASSERT(rbio->failb > 0);
if (rbio->failb <= rbio->faila)
rbio->failb--;
}
ret = lock_stripe_add(rbio);
/*
* __raid56_parity_recover will end the bio with
* any errors it hits. We don't want to return
* its error value up the stack because our caller
* will end up calling bio_endio with any nonzero
* return
*/
if (ret == 0)
__raid56_parity_recover(rbio);
/*
* our rbio has been added to the list of
* rbios that will be handled after the
* currently lock owner is done
*/
return 0;
}
static void rmw_work(struct btrfs_work *work)
{
struct btrfs_raid_bio *rbio;
rbio = container_of(work, struct btrfs_raid_bio, work);
raid56_rmw_stripe(rbio);
}
static void read_rebuild_work(struct btrfs_work *work)
{
struct btrfs_raid_bio *rbio;
rbio = container_of(work, struct btrfs_raid_bio, work);
__raid56_parity_recover(rbio);
}
/*
* The following code is used to scrub/replace the parity stripe
*
* Caller must have already increased bio_counter for getting @bbio.
*
* Note: We need make sure all the pages that add into the scrub/replace
* raid bio are correct and not be changed during the scrub/replace. That
* is those pages just hold metadata or file data with checksum.
*/
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors)
{
struct btrfs_raid_bio *rbio;
int i;
rbio = alloc_rbio(fs_info, bbio, stripe_len);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
/*
* This is a special bio which is used to hold the completion handler
* and make the scrub rbio is similar to the other types
*/
ASSERT(!bio->bi_iter.bi_size);
rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
/*
* After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
* to the end position, so this search can start from the first parity
* stripe.
*/
for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
if (bbio->stripes[i].dev == scrub_dev) {
rbio->scrubp = i;
break;
}
}
ASSERT(i < rbio->real_stripes);
/* Now we just support the sectorsize equals to page size */
ASSERT(fs_info->sectorsize == PAGE_SIZE);
ASSERT(rbio->stripe_npages == stripe_nsectors);
bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
/*
* We have already increased bio_counter when getting bbio, record it
* so we can free it at rbio_orig_end_io().
*/
rbio->generic_bio_cnt = 1;
return rbio;
}
/* Used for both parity scrub and missing. */
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
u64 logical)
{
int stripe_offset;
int index;
ASSERT(logical >= rbio->bbio->raid_map[0]);
ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
index = stripe_offset >> PAGE_SHIFT;
rbio->bio_pages[index] = page;
}
/*
* We just scrub the parity that we have correct data on the same horizontal,
* so we needn't allocate all pages for all the stripes.
*/
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
int i;
int bit;
int index;
struct page *page;
for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
for (i = 0; i < rbio->real_stripes; i++) {
index = i * rbio->stripe_npages + bit;
if (rbio->stripe_pages[index])
continue;
page = alloc_page(GFP_NOFS);
if (!page)
return -ENOMEM;
rbio->stripe_pages[index] = page;
}
}
return 0;
}
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
struct btrfs_bio *bbio = rbio->bbio;
void **pointers = rbio->finish_pointers;
unsigned long *pbitmap = rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
bool has_qstripe;
struct page *p_page = NULL;
struct page *q_page = NULL;
struct bio_list bio_list;
struct bio *bio;
int is_replace = 0;
int ret;
bio_list_init(&bio_list);
if (rbio->real_stripes - rbio->nr_data == 1)
has_qstripe = false;
else if (rbio->real_stripes - rbio->nr_data == 2)
has_qstripe = true;
else
BUG();
if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
}
/*
* Because the higher layers(scrubber) are unlikely to
* use this area of the disk again soon, so don't cache
* it.
*/
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
if (!need_check)
goto writeback;
p_page = alloc_page(GFP_NOFS);
if (!p_page)
goto cleanup;
SetPageUptodate(p_page);
if (has_qstripe) {
/* RAID6, allocate and map temp space for the Q stripe */
q_page = alloc_page(GFP_NOFS);
if (!q_page) {
__free_page(p_page);
goto cleanup;
}
SetPageUptodate(q_page);
pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
}
atomic_set(&rbio->error, 0);
/* Map the parity stripe just once */
pointers[nr_data] = kmap_local_page(p_page);
for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
struct page *p;
void *parity;
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
p = page_in_rbio(rbio, stripe, pagenr, 0);
pointers[stripe] = kmap_local_page(p);
}
if (has_qstripe) {
/* RAID6, call the library function to fill in our P/Q */
raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
pointers);
} else {
/* raid5 */
copy_page(pointers[nr_data], pointers[0]);
run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
}
/* Check scrubbing parity and repair it */
p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
parity = kmap_local_page(p);
if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
copy_page(parity, pointers[rbio->scrubp]);
else
/* Parity is right, needn't writeback */
bitmap_clear(rbio->dbitmap, pagenr, 1);
kunmap_local(parity);
for (stripe = nr_data - 1; stripe >= 0; stripe--)
kunmap_local(pointers[stripe]);
}
kunmap_local(pointers[nr_data]);
__free_page(p_page);
if (q_page) {
kunmap_local(pointers[rbio->real_stripes - 1]);
__free_page(q_page);
}
writeback:
/*
* time to start writing. Make bios for everything from the
* higher layers (the bio_list in our rbio) and our p/q. Ignore
* everything else.
*/
for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
struct page *page;
page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
ret = rbio_add_io_page(rbio, &bio_list,
page, rbio->scrubp, pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
}
if (!is_replace)
goto submit_write;
for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
struct page *page;
page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
ret = rbio_add_io_page(rbio, &bio_list, page,
bbio->tgtdev_map[rbio->scrubp],
pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
}
submit_write:
nr_data = bio_list_size(&bio_list);
if (!nr_data) {
/* Every parity is right */
rbio_orig_end_io(rbio, BLK_STS_OK);
return;
}
atomic_set(&rbio->stripes_pending, nr_data);
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid_write_end_io;
bio->bi_opf = REQ_OP_WRITE;
submit_bio(bio);
}
return;
cleanup:
rbio_orig_end_io(rbio, BLK_STS_IOERR);
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
}
static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
if (stripe >= 0 && stripe < rbio->nr_data)
return 1;
return 0;
}
/*
* While we're doing the parity check and repair, we could have errors
* in reading pages off the disk. This checks for errors and if we're
* not able to read the page it'll trigger parity reconstruction. The
* parity scrub will be finished after we've reconstructed the failed
* stripes
*/
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
goto cleanup;
if (rbio->faila >= 0 || rbio->failb >= 0) {
int dfail = 0, failp = -1;
if (is_data_stripe(rbio, rbio->faila))
dfail++;
else if (is_parity_stripe(rbio->faila))
failp = rbio->faila;
if (is_data_stripe(rbio, rbio->failb))
dfail++;
else if (is_parity_stripe(rbio->failb))
failp = rbio->failb;
/*
* Because we can not use a scrubbing parity to repair
* the data, so the capability of the repair is declined.
* (In the case of RAID5, we can not repair anything)
*/
if (dfail > rbio->bbio->max_errors - 1)
goto cleanup;
/*
* If all data is good, only parity is correctly, just
* repair the parity.
*/
if (dfail == 0) {
finish_parity_scrub(rbio, 0);
return;
}
/*
* Here means we got one corrupted data stripe and one
* corrupted parity on RAID6, if the corrupted parity
* is scrubbing parity, luckily, use the other one to repair
* the data, or we can not repair the data stripe.
*/
if (failp != rbio->scrubp)
goto cleanup;
__raid_recover_end_io(rbio);
} else {
finish_parity_scrub(rbio, 1);
}
return;
cleanup:
rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
/*
* end io for the read phase of the rmw cycle. All the bios here are physical
* stripe bios we've read from the disk so we can recalculate the parity of the
* stripe.
*
* This will usually kick off finish_rmw once all the bios are read in, but it
* may trigger parity reconstruction if we had any errors along the way
*/
static void raid56_parity_scrub_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
if (bio->bi_status)
fail_bio_stripe(rbio, bio);
else
set_bio_pages_uptodate(bio);
bio_put(bio);
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
/*
* this will normally call finish_rmw to start our write
* but if there are any failed stripes we'll reconstruct
* from parity first
*/
validate_rbio_for_parity_scrub(rbio);
}
static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
{
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
int pagenr;
int stripe;
struct bio *bio;
bio_list_init(&bio_list);
ret = alloc_rbio_essential_pages(rbio);
if (ret)
goto cleanup;
atomic_set(&rbio->error, 0);
/*
* build a list of bios to read all the missing parts of this
* stripe
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
struct page *page;
/*
* we want to find all the pages missing from
* the rbio and read them from the disk. If
* page_in_rbio finds a page in the bio list
* we don't need to read it off the stripe.
*/
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (page)
continue;
page = rbio_stripe_page(rbio, stripe, pagenr);
/*
* the bio cache may have handed us an uptodate
* page. If so, be happy and use it
*/
if (PageUptodate(page))
continue;
ret = rbio_add_io_page(rbio, &bio_list, page,
stripe, pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
}
}
bios_to_read = bio_list_size(&bio_list);
if (!bios_to_read) {
/*
* this can happen if others have merged with
* us, it means there is nothing left to read.
* But if there are missing devices it may not be
* safe to do the full stripe write yet.
*/
goto finish;
}
/*
* the bbio may be freed once we submit the last bio. Make sure
* not to touch it after that
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_private = rbio;
bio->bi_end_io = raid56_parity_scrub_end_io;
bio->bi_opf = REQ_OP_READ;
btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
/* the actual write will happen once the reads are done */
return;
cleanup:
rbio_orig_end_io(rbio, BLK_STS_IOERR);
while ((bio = bio_list_pop(&bio_list)))
bio_put(bio);
return;
finish:
validate_rbio_for_parity_scrub(rbio);
}
static void scrub_parity_work(struct btrfs_work *work)
{
struct btrfs_raid_bio *rbio;
rbio = container_of(work, struct btrfs_raid_bio, work);
raid56_parity_scrub_stripe(rbio);
}
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
if (!lock_stripe_add(rbio))
start_async_work(rbio, scrub_parity_work);
}
/* The following code is used for dev replace of a missing RAID 5/6 device. */
struct btrfs_raid_bio *
raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_bio *bbio, u64 length)
{
struct btrfs_raid_bio *rbio;
rbio = alloc_rbio(fs_info, bbio, length);
if (IS_ERR(rbio))
return NULL;
rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
bio_list_add(&rbio->bio_list, bio);
/*
* This is a special bio which is used to hold the completion handler
* and make the scrub rbio is similar to the other types
*/
ASSERT(!bio->bi_iter.bi_size);
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
BUG();
kfree(rbio);
return NULL;
}
/*
* When we get bbio, we have already increased bio_counter, record it
* so we can free it at rbio_orig_end_io()
*/
rbio->generic_bio_cnt = 1;
return rbio;
}
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
if (!lock_stripe_add(rbio))
start_async_work(rbio, read_rebuild_work);
}
/*
* INETPEER - A storage for permanent information about peers
*
* This source is covered by the GNU GPL, the same as all kernel sources.
*
* Authors: Andrey V. Savochkin <saw@msu.ru>
*/
#include <linux/cache.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/timer.h>
#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
#include <linux/workqueue.h>
#include <net/ip.h>
#include <net/inetpeer.h>
#include <net/secure_seq.h>
/*
* Theory of operations.
* We keep one entry for each peer IP address. The nodes contains long-living
* information about the peer which doesn't depend on routes.
*
* Nodes are removed only when reference counter goes to 0.
* When it's happened the node may be removed when a sufficient amount of
* time has been passed since its last use. The less-recently-used entry can
* also be removed if the pool is overloaded i.e. if the total amount of
* entries is greater-or-equal than the threshold.
*
* Node pool is organised as an RB tree.
* Such an implementation has been chosen not just for fun. It's a way to
* prevent easy and efficient DoS attacks by creating hash collisions. A huge
* amount of long living nodes in a single hash slot would significantly delay
* lookups performed with disabled BHs.
*
* Serialisation issues.
* 1. Nodes may appear in the tree only with the pool lock held.
* 2. Nodes may disappear from the tree only with the pool lock held
* AND reference count being 0.
* 3. Global variable peer_total is modified under the pool lock.
* 4. struct inet_peer fields modification:
* rb_node: pool lock
* refcnt: atomically against modifications on other CPU;
* usually under some other lock to prevent node disappearing
* daddr: unchangeable
*/
static struct kmem_cache *peer_cachep __ro_after_init;
void inet_peer_base_init(struct inet_peer_base *bp)
{
bp->rb_root = RB_ROOT;
seqlock_init(&bp->lock);
bp->total = 0;
}
EXPORT_SYMBOL_GPL(inet_peer_base_init);
#define PEER_MAX_GC 32
/* Exported for sysctl_net_ipv4. */
int inet_peer_threshold __read_mostly; /* start to throw entries more
* aggressively at this stage */
int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
/* Called from ip_output.c:ip_init */
void __init inet_initpeers(void)
{
u64 nr_entries;
/* 1% of physical memory */
nr_entries = div64_ul((u64)totalram_pages() << PAGE_SHIFT,
100 * L1_CACHE_ALIGN(sizeof(struct inet_peer)));
inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128);
peer_cachep = kmem_cache_create("inet_peer_cache",
sizeof(struct inet_peer),
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
}
/* Called with rcu_read_lock() or base->lock held */
static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
struct inet_peer_base *base,
unsigned int seq,
struct inet_peer *gc_stack[],
unsigned int *gc_cnt,
struct rb_node **parent_p,
struct rb_node ***pp_p)
{
struct rb_node **pp, *parent, *next;
struct inet_peer *p;
pp = &base->rb_root.rb_node;
parent = NULL;
while (1) {
int cmp;
next = rcu_dereference_raw(*pp);
if (!next)
break;
parent = next;
p = rb_entry(parent, struct inet_peer, rb_node);
cmp = inetpeer_addr_cmp(daddr, &p->daddr);
if (cmp == 0) {
if (!refcount_inc_not_zero(&p->refcnt))
break;
return p;
}
if (gc_stack) { if (*gc_cnt < PEER_MAX_GC) gc_stack[(*gc_cnt)++] = p;
} else if (unlikely(read_seqretry(&base->lock, seq))) {
break;
}
if (cmp == -1) pp = &next->rb_left;
else
pp = &next->rb_right;
}
*parent_p = parent;
*pp_p = pp;
return NULL;
}
static void inetpeer_free_rcu(struct rcu_head *head)
{
kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
}
/* perform garbage collect on all items stacked during a lookup */
static void inet_peer_gc(struct inet_peer_base *base,
struct inet_peer *gc_stack[],
unsigned int gc_cnt)
{
struct inet_peer *p;
__u32 delta, ttl;
int i;
if (base->total >= inet_peer_threshold)
ttl = 0; /* be aggressive */
else
ttl = inet_peer_maxttl
- (inet_peer_maxttl - inet_peer_minttl) / HZ *
base->total / inet_peer_threshold * HZ;
for (i = 0; i < gc_cnt; i++) { p = gc_stack[i];
/* The READ_ONCE() pairs with the WRITE_ONCE()
* in inet_putpeer()
*/
delta = (__u32)jiffies - READ_ONCE(p->dtime);
if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL;
}
for (i = 0; i < gc_cnt; i++) { p = gc_stack[i];
if (p) {
rb_erase(&p->rb_node, &base->rb_root);
base->total--;
call_rcu(&p->rcu, inetpeer_free_rcu);
}
}
}
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
const struct inetpeer_addr *daddr,
int create)
{
struct inet_peer *p, *gc_stack[PEER_MAX_GC];
struct rb_node **pp, *parent;
unsigned int gc_cnt, seq;
int invalidated;
/* Attempt a lockless lookup first.
* Because of a concurrent writer, we might not find an existing entry.
*/
rcu_read_lock();
seq = read_seqbegin(&base->lock);
p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp);
invalidated = read_seqretry(&base->lock, seq);
rcu_read_unlock();
if (p)
return p;
/* If no writer did a change during our lookup, we can return early. */
if (!create && !invalidated)
return NULL;
/* retry an exact lookup, taking the lock before.
* At least, nodes should be hot in our cache.
*/
parent = NULL;
write_seqlock_bh(&base->lock);
gc_cnt = 0;
p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp);
if (!p && create) { p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
if (p) {
p->daddr = *daddr;
p->dtime = (__u32)jiffies;
refcount_set(&p->refcnt, 2);
atomic_set(&p->rid, 0);
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
p->n_redirects = 0;
/* 60*HZ is arbitrary, but chosen enough high so that the first
* calculation of tokens is at its maximum.
*/
p->rate_last = jiffies - 60*HZ;
rb_link_node(&p->rb_node, parent, pp);
rb_insert_color(&p->rb_node, &base->rb_root);
base->total++;
}
}
if (gc_cnt)
inet_peer_gc(base, gc_stack, gc_cnt);
write_sequnlock_bh(&base->lock);
return p;
}
EXPORT_SYMBOL_GPL(inet_getpeer);
void inet_putpeer(struct inet_peer *p)
{
/* The WRITE_ONCE() pairs with itself (we run lockless)
* and the READ_ONCE() in inet_peer_gc()
*/
WRITE_ONCE(p->dtime, (__u32)jiffies);
if (refcount_dec_and_test(&p->refcnt))
call_rcu(&p->rcu, inetpeer_free_rcu);
}
EXPORT_SYMBOL_GPL(inet_putpeer);
/*
* Check transmit rate limitation for given message.
* The rate information is held in the inet_peer entries now.
* This function is generic and could be used for other purposes
* too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
*
* Note that the same inet_peer fields are modified by functions in
* route.c too, but these work for packet destinations while xrlim_allow
* works for icmp destinations. This means the rate limiting information
* for one "ip object" is shared - and these ICMPs are twice limited:
* by source and by destination.
*
* RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
* SHOULD allow setting of rate limits
*
* Shared between ICMPv4 and ICMPv6.
*/
#define XRLIM_BURST_FACTOR 6
bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
{
unsigned long now, token;
bool rc = false;
if (!peer)
return true;
token = peer->rate_tokens;
now = jiffies;
token += now - peer->rate_last;
peer->rate_last = now;
if (token > XRLIM_BURST_FACTOR * timeout)
token = XRLIM_BURST_FACTOR * timeout;
if (token >= timeout) {
token -= timeout; rc = true;
}
peer->rate_tokens = token; return rc;
}
EXPORT_SYMBOL(inet_peer_xrlim_allow);
void inetpeer_invalidate_tree(struct inet_peer_base *base)
{
struct rb_node *p = rb_first(&base->rb_root);
while (p) {
struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node);
p = rb_next(p);
rb_erase(&peer->rb_node, &base->rb_root);
inet_putpeer(peer);
cond_resched();
}
base->total = 0;
}
EXPORT_SYMBOL(inetpeer_invalidate_tree);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_NULLS_H
#define _LINUX_LIST_NULLS_H
#include <linux/poison.h>
#include <linux/const.h>
/*
* Special version of lists, where end of list is not a NULL pointer,
* but a 'nulls' marker, which can have many different values.
* (up to 2^31 different values guaranteed on all platforms)
*
* In the standard hlist, termination of a list is the NULL pointer.
* In this special 'nulls' variant, we use the fact that objects stored in
* a list are aligned on a word (4 or 8 bytes alignment).
* We therefore use the last significant bit of 'ptr' :
* Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
* Set to 0 : This is a pointer to some object (ptr)
*/
struct hlist_nulls_head {
struct hlist_nulls_node *first;
};
struct hlist_nulls_node {
struct hlist_nulls_node *next, **pprev;
};
#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
#define hlist_nulls_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \
!is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
})
/**
* ptr_is_a_nulls - Test if a ptr is a nulls
* @ptr: ptr to be tested
*
*/
static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
{
return ((unsigned long)ptr & 1);
}
/**
* get_nulls_value - Get the 'nulls' value of the end of chain
* @ptr: end of chain
*
* Should be called only if is_a_nulls(ptr);
*/
static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
{
return ((unsigned long)ptr) >> 1;
}
/**
* hlist_nulls_unhashed - Has node been removed and reinitialized?
* @h: Node to be checked
*
* Not that not all removal functions will leave a node in unhashed state.
* For example, hlist_del_init_rcu() leaves the node in unhashed state,
* but hlist_nulls_del() does not.
*/
static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
{
return !h->pprev;
}
/**
* hlist_nulls_unhashed_lockless - Has node been removed and reinitialized?
* @h: Node to be checked
*
* Not that not all removal functions will leave a node in unhashed state.
* For example, hlist_del_init_rcu() leaves the node in unhashed state,
* but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this
* function may be used locklessly.
*/
static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h)
{
return !READ_ONCE(h->pprev);
}
static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
{
return is_a_nulls(READ_ONCE(h->first));
}
static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
struct hlist_nulls_head *h)
{
struct hlist_nulls_node *first = h->first;
n->next = first;
WRITE_ONCE(n->pprev, &h->first);
h->first = n;
if (!is_a_nulls(first))
WRITE_ONCE(first->pprev, &n->next);
}
static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
{
struct hlist_nulls_node *next = n->next;
struct hlist_nulls_node **pprev = n->pprev;
WRITE_ONCE(*pprev, next);
if (!is_a_nulls(next))
WRITE_ONCE(next->pprev, pprev);
}
static inline void hlist_nulls_del(struct hlist_nulls_node *n)
{
__hlist_nulls_del(n); WRITE_ONCE(n->pprev, LIST_POISON2);
}
/**
* hlist_nulls_for_each_entry - iterate over list of given type
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct hlist_node to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the hlist_node within the struct.
*
*/
#define hlist_nulls_for_each_entry(tpos, pos, head, member) \
for (pos = (head)->first; \
(!is_a_nulls(pos)) && \
({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
pos = pos->next)
/**
* hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct hlist_node to use as a loop cursor.
* @member: the name of the hlist_node within the struct.
*
*/
#define hlist_nulls_for_each_entry_from(tpos, pos, member) \
for (; (!is_a_nulls(pos)) && \
({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
pos = pos->next)
#endif
// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic hugetlb support.
* (C) Nadia Yvette Chambers, April 2004
*/
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
#include <linux/migrate.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
#endif
static unsigned long hugetlb_cma_size __initdata;
/*
* Minimum page order among possible hugepage sizes, set to a proper value
* at boot time.
*/
static unsigned int minimum_order __read_mostly = UINT_MAX;
__initdata LIST_HEAD(huge_boot_pages);
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
* free_huge_pages, and surplus_huge_pages.
*/
DEFINE_SPINLOCK(hugetlb_lock);
/*
* Serializes faults on the same logical page. This is used to
* prevent spurious OOMs when the hugepage pool is fully utilized.
*/
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
if (spool->count)
return false;
if (spool->max_hpages != -1)
return spool->used_hpages == 0;
if (spool->min_hpages != -1)
return spool->rsv_hpages == spool->min_hpages;
return true;
}
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
unsigned long irq_flags)
{
spin_unlock_irqrestore(&spool->lock, irq_flags);
/* If no pages are used, and no other handles to the subpool
* remain, give up any reservations based on minimum size and
* free the subpool */
if (subpool_is_free(spool)) {
if (spool->min_hpages != -1)
hugetlb_acct_memory(spool->hstate,
-spool->min_hpages);
kfree(spool);
}
}
struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
long min_hpages)
{
struct hugepage_subpool *spool;
spool = kzalloc(sizeof(*spool), GFP_KERNEL);
if (!spool)
return NULL;
spin_lock_init(&spool->lock);
spool->count = 1;
spool->max_hpages = max_hpages;
spool->hstate = h;
spool->min_hpages = min_hpages;
if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
kfree(spool);
return NULL;
}
spool->rsv_hpages = min_hpages;
return spool;
}
void hugepage_put_subpool(struct hugepage_subpool *spool)
{
unsigned long flags;
spin_lock_irqsave(&spool->lock, flags);
BUG_ON(!spool->count);
spool->count--;
unlock_or_release_subpool(spool, flags);
}
/*
* Subpool accounting for allocating and reserving pages.
* Return -ENOMEM if there are not enough resources to satisfy the
* request. Otherwise, return the number of pages by which the
* global pools must be adjusted (upward). The returned value may
* only be different than the passed value (delta) in the case where
* a subpool minimum size must be maintained.
*/
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
long delta)
{
long ret = delta;
if (!spool)
return ret;
spin_lock_irq(&spool->lock);
if (spool->max_hpages != -1) { /* maximum size accounting */
if ((spool->used_hpages + delta) <= spool->max_hpages)
spool->used_hpages += delta;
else {
ret = -ENOMEM;
goto unlock_ret;
}
}
/* minimum size accounting */
if (spool->min_hpages != -1 && spool->rsv_hpages) {
if (delta > spool->rsv_hpages) {
/*
* Asking for more reserves than those already taken on
* behalf of subpool. Return difference.
*/
ret = delta - spool->rsv_hpages;
spool->rsv_hpages = 0;
} else {
ret = 0; /* reserves already accounted for */
spool->rsv_hpages -= delta;
}
}
unlock_ret:
spin_unlock_irq(&spool->lock);
return ret;
}
/*
* Subpool accounting for freeing and unreserving pages.
* Return the number of global page reservations that must be dropped.
* The return value may only be different than the passed value (delta)
* in the case where a subpool minimum size must be maintained.
*/
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
long delta)
{
long ret = delta;
unsigned long flags;
if (!spool)
return delta;
spin_lock_irqsave(&spool->lock, flags);
if (spool->max_hpages != -1) /* maximum size accounting */
spool->used_hpages -= delta;
/* minimum size accounting */
if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
if (spool->rsv_hpages + delta <= spool->min_hpages)
ret = 0;
else
ret = spool->rsv_hpages + delta - spool->min_hpages;
spool->rsv_hpages += delta;
if (spool->rsv_hpages > spool->min_hpages)
spool->rsv_hpages = spool->min_hpages;
}
/*
* If hugetlbfs_put_super couldn't free spool due to an outstanding
* quota reference, free it now.
*/
unlock_or_release_subpool(spool, flags);
return ret;
}
static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
return HUGETLBFS_SB(inode->i_sb)->spool;
}
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
return subpool_inode(file_inode(vma->vm_file));
}
/* Helper that removes a struct file_region from the resv_map cache and returns
* it for use.
*/
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
struct file_region *nrg = NULL;
VM_BUG_ON(resv->region_cache_count <= 0);
resv->region_cache_count--;
nrg = list_first_entry(&resv->region_cache, struct file_region, link);
list_del(&nrg->link);
nrg->from = from;
nrg->to = to;
return nrg;
}
static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
nrg->reservation_counter = rg->reservation_counter;
nrg->css = rg->css;
if (rg->css)
css_get(rg->css);
#endif
}
/* Helper that records hugetlb_cgroup uncharge info. */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
struct hstate *h,
struct resv_map *resv,
struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
if (h_cg) {
nrg->reservation_counter =
&h_cg->rsvd_hugepage[hstate_index(h)];
nrg->css = &h_cg->css;
/*
* The caller will hold exactly one h_cg->css reference for the
* whole contiguous reservation region. But this area might be
* scattered when there are already some file_regions reside in
* it. As a result, many file_regions may share only one css
* reference. In order to ensure that one file_region must hold
* exactly one h_cg->css reference, we should do css_get for
* each file_region and leave the reference held by caller
* untouched.
*/
css_get(&h_cg->css);
if (!resv->pages_per_hpage)
resv->pages_per_hpage = pages_per_huge_page(h);
/* pages_per_hpage should be the same for all entries in
* a resv_map.
*/
VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
} else {
nrg->reservation_counter = NULL;
nrg->css = NULL;
}
#endif
}
static void put_uncharge_info(struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
if (rg->css)
css_put(rg->css);
#endif
}
static bool has_same_uncharge_info(struct file_region *rg,
struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
return rg && org &&
rg->reservation_counter == org->reservation_counter &&
rg->css == org->css;
#else
return true;
#endif
}
static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
struct file_region *nrg = NULL, *prg = NULL;
prg = list_prev_entry(rg, link);
if (&prg->link != &resv->regions && prg->to == rg->from &&
has_same_uncharge_info(prg, rg)) {
prg->to = rg->to;
list_del(&rg->link);
put_uncharge_info(rg);
kfree(rg);
rg = prg;
}
nrg = list_next_entry(rg, link);
if (&nrg->link != &resv->regions && nrg->from == rg->to &&
has_same_uncharge_info(nrg, rg)) {
nrg->from = rg->from;
list_del(&rg->link);
put_uncharge_info(rg);
kfree(rg);
}
}
static inline long
hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
long to, struct hstate *h, struct hugetlb_cgroup *cg,
long *regions_needed)
{
struct file_region *nrg;
if (!regions_needed) {
nrg = get_file_region_entry_from_cache(map, from, to);
record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(map, nrg);
} else
*regions_needed += 1;
return to - from;
}
/*
* Must be called with resv->lock held.
*
* Calling this with regions_needed != NULL will count the number of pages
* to be added but will not modify the linked list. And regions_needed will
* indicate the number of file_regions needed in the cache to carry out to add
* the regions for this range.
*/
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
struct hugetlb_cgroup *h_cg,
struct hstate *h, long *regions_needed)
{
long add = 0;
struct list_head *head = &resv->regions;
long last_accounted_offset = f;
struct file_region *rg = NULL, *trg = NULL;
if (regions_needed)
*regions_needed = 0;
/* In this loop, we essentially handle an entry for the range
* [last_accounted_offset, rg->from), at every iteration, with some
* bounds checking.
*/
list_for_each_entry_safe(rg, trg, head, link) {
/* Skip irrelevant regions that start before our range. */
if (rg->from < f) {
/* If this region ends after the last accounted offset,
* then we need to update last_accounted_offset.
*/
if (rg->to > last_accounted_offset)
last_accounted_offset = rg->to;
continue;
}
/* When we find a region that starts beyond our range, we've
* finished.
*/
if (rg->from >= t)
break;
/* Add an entry for last_accounted_offset -> rg->from, and
* update last_accounted_offset.
*/
if (rg->from > last_accounted_offset)
add += hugetlb_resv_map_add(resv, rg,
last_accounted_offset,
rg->from, h, h_cg,
regions_needed);
last_accounted_offset = rg->to;
}
/* Handle the case where our range extends beyond
* last_accounted_offset.
*/
if (last_accounted_offset < t)
add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
t, h, h_cg, regions_needed);
VM_BUG_ON(add < 0);
return add;
}
/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
*/
static int allocate_file_region_entries(struct resv_map *resv,
int regions_needed)
__must_hold(&resv->lock)
{
struct list_head allocated_regions;
int to_allocate = 0, i = 0;
struct file_region *trg = NULL, *rg = NULL;
VM_BUG_ON(regions_needed < 0);
INIT_LIST_HEAD(&allocated_regions);
/*
* Check for sufficient descriptors in the cache to accommodate
* the number of in progress add operations plus regions_needed.
*
* This is a while loop because when we drop the lock, some other call
* to region_add or region_del may have consumed some region_entries,
* so we keep looping here until we finally have enough entries for
* (adds_in_progress + regions_needed).
*/
while (resv->region_cache_count <
(resv->adds_in_progress + regions_needed)) {
to_allocate = resv->adds_in_progress + regions_needed -
resv->region_cache_count;
/* At this point, we should have enough entries in the cache
* for all the existing adds_in_progress. We should only be
* needing to allocate for regions_needed.
*/
VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
spin_unlock(&resv->lock);
for (i = 0; i < to_allocate; i++) {
trg = kmalloc(sizeof(*trg), GFP_KERNEL);
if (!trg)
goto out_of_memory;
list_add(&trg->link, &allocated_regions);
}
spin_lock(&resv->lock);
list_splice(&allocated_regions, &resv->region_cache);
resv->region_cache_count += to_allocate;
}
return 0;
out_of_memory:
list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
list_del(&rg->link);
kfree(rg);
}
return -ENOMEM;
}
/*
* Add the huge page range represented by [f, t) to the reserve
* map. Regions will be taken from the cache to fill in this range.
* Sufficient regions should exist in the cache due to the previous
* call to region_chg with the same range, but in some cases the cache will not
* have sufficient entries due to races with other code doing region_add or
* region_del. The extra needed entries will be allocated.
*
* regions_needed is the out value provided by a previous call to region_chg.
*
* Return the number of new huge pages added to the map. This number is greater
* than or equal to zero. If file_region entries needed to be allocated for
* this operation and we were not able to allocate, it returns -ENOMEM.
* region_add of regions of length 1 never allocate file_regions and cannot
* fail; region_chg will always allocate at least 1 entry and a region_add for
* 1 page will only require at most 1 entry.
*/
static long region_add(struct resv_map *resv, long f, long t,
long in_regions_needed, struct hstate *h,
struct hugetlb_cgroup *h_cg)
{
long add = 0, actual_regions_needed = 0;
spin_lock(&resv->lock);
retry:
/* Count how many regions are actually needed to execute this add. */
add_reservation_in_range(resv, f, t, NULL, NULL,
&actual_regions_needed);
/*
* Check for sufficient descriptors in the cache to accommodate
* this add operation. Note that actual_regions_needed may be greater
* than in_regions_needed, as the resv_map may have been modified since
* the region_chg call. In this case, we need to make sure that we
* allocate extra entries, such that we have enough for all the
* existing adds_in_progress, plus the excess needed for this
* operation.
*/
if (actual_regions_needed > in_regions_needed &&
resv->region_cache_count <
resv->adds_in_progress +
(actual_regions_needed - in_regions_needed)) {
/* region_add operation of range 1 should never need to
* allocate file_region entries.
*/
VM_BUG_ON(t - f <= 1);
if (allocate_file_region_entries(
resv, actual_regions_needed - in_regions_needed)) {
return -ENOMEM;
}
goto retry;
}
add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
resv->adds_in_progress -= in_regions_needed;
spin_unlock(&resv->lock);
return add;
}
/*
* Examine the existing reserve map and determine how many
* huge pages in the specified range [f, t) are NOT currently
* represented. This routine is called before a subsequent
* call to region_add that will actually modify the reserve
* map to add the specified range [f, t). region_chg does
* not change the number of huge pages represented by the
* map. A number of new file_region structures is added to the cache as a
* placeholder, for the subsequent region_add call to use. At least 1
* file_region structure is added.
*
* out_regions_needed is the number of regions added to the
* resv->adds_in_progress. This value needs to be provided to a follow up call
* to region_add or region_abort for proper accounting.
*
* Returns the number of huge pages that need to be added to the existing
* reservation map for the range [f, t). This number is greater or equal to
* zero. -ENOMEM is returned if a new file_region structure or cache entry
* is needed and can not be allocated.
*/
static long region_chg(struct resv_map *resv, long f, long t,
long *out_regions_needed)
{
long chg = 0;
spin_lock(&resv->lock);
/* Count how many hugepages in this range are NOT represented. */
chg = add_reservation_in_range(resv, f, t, NULL, NULL,
out_regions_needed);
if (*out_regions_needed == 0)
*out_regions_needed = 1;
if (allocate_file_region_entries(resv, *out_regions_needed))
return -ENOMEM;
resv->adds_in_progress += *out_regions_needed;
spin_unlock(&resv->lock);
return chg;
}
/*
* Abort the in progress add operation. The adds_in_progress field
* of the resv_map keeps track of the operations in progress between
* calls to region_chg and region_add. Operations are sometimes
* aborted after the call to region_chg. In such cases, region_abort
* is called to decrement the adds_in_progress counter. regions_needed
* is the value returned by the region_chg call, it is used to decrement
* the adds_in_progress counter.
*
* NOTE: The range arguments [f, t) are not needed or used in this
* routine. They are kept to make reading the calling code easier as
* arguments will match the associated region_chg call.
*/
static void region_abort(struct resv_map *resv, long f, long t,
long regions_needed)
{
spin_lock(&resv->lock);
VM_BUG_ON(!resv->region_cache_count);
resv->adds_in_progress -= regions_needed;
spin_unlock(&resv->lock);
}
/*
* Delete the specified range [f, t) from the reserve map. If the
* t parameter is LONG_MAX, this indicates that ALL regions after f
* should be deleted. Locate the regions which intersect [f, t)
* and either trim, delete or split the existing regions.
*
* Returns the number of huge pages deleted from the reserve map.
* In the normal case, the return value is zero or more. In the
* case where a region must be split, a new region descriptor must
* be allocated. If the allocation fails, -ENOMEM will be returned.
* NOTE: If the parameter t == LONG_MAX, then we will never split
* a region and possibly return -ENOMEM. Callers specifying
* t == LONG_MAX do not need to check for -ENOMEM error.
*/
static long region_del(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
struct file_region *rg, *trg;
struct file_region *nrg = NULL;
long del = 0;
retry:
spin_lock(&resv->lock);
list_for_each_entry_safe(rg, trg, head, link) {
/*
* Skip regions before the range to be deleted. file_region
* ranges are normally of the form [from, to). However, there
* may be a "placeholder" entry in the map which is of the form
* (from, to) with from == to. Check for placeholder entries
* at the beginning of the range to be deleted.
*/
if (rg->to <= f && (rg->to != rg->from || rg->to != f))
continue;
if (rg->from >= t)
break;
if (f > rg->from && t < rg->to) { /* Must split region */
/*
* Check for an entry in the cache before dropping
* lock and attempting allocation.
*/
if (!nrg &&
resv->region_cache_count > resv->adds_in_progress) {
nrg = list_first_entry(&resv->region_cache,
struct file_region,
link);
list_del(&nrg->link);
resv->region_cache_count--;
}
if (!nrg) {
spin_unlock(&resv->lock);
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
if (!nrg)
return -ENOMEM;
goto retry;
}
del += t - f;
hugetlb_cgroup_uncharge_file_region(
resv, rg, t - f, false);
/* New entry for end of split region */
nrg->from = t;
nrg->to = rg->to;
copy_hugetlb_cgroup_uncharge_info(nrg, rg);
INIT_LIST_HEAD(&nrg->link);
/* Original entry is trimmed */
rg->to = f;
list_add(&nrg->link, &rg->link);
nrg = NULL;
break;
}
if (f <= rg->from && t >= rg->to) { /* Remove entire region */
del += rg->to - rg->from;
hugetlb_cgroup_uncharge_file_region(resv, rg,
rg->to - rg->from, true);
list_del(&rg->link);
kfree(rg);
continue;
}
if (f <= rg->from) { /* Trim beginning of region */
hugetlb_cgroup_uncharge_file_region(resv, rg,
t - rg->from, false);
del += t - rg->from;
rg->from = t;
} else { /* Trim end of region */
hugetlb_cgroup_uncharge_file_region(resv, rg,
rg->to - f, false);
del += rg->to - f;
rg->to = f;
}
}
spin_unlock(&resv->lock);
kfree(nrg);
return del;
}
/*
* A rare out of memory error was encountered which prevented removal of
* the reserve map region for a page. The huge page itself was free'ed
* and removed from the page cache. This routine will adjust the subpool
* usage count, and the global reserve count if needed. By incrementing
* these counts, the reserve map entry which could not be deleted will
* appear as a "reserved" entry instead of simply dangling with incorrect
* counts.
*/
void hugetlb_fix_reserve_counts(struct inode *inode)
{
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;
bool reserved = false;
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
if (rsv_adjust > 0) {
struct hstate *h = hstate_inode(inode);
if (!hugetlb_acct_memory(h, 1))
reserved = true;
} else if (!rsv_adjust) {
reserved = true;
}
if (!reserved)
pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}
/*
* Count and return the number of huge pages in the reserve map
* that intersect with the range [f, t).
*/
static long region_count(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
struct file_region *rg;
long chg = 0;
spin_lock(&resv->lock);
/* Locate each segment we overlap with, and count that overlap. */
list_for_each_entry(rg, head, link) {
long seg_from;
long seg_to;
if (rg->to <= f)
continue;
if (rg->from >= t)
break;
seg_from = max(rg->from, f);
seg_to = min(rg->to, t);
chg += seg_to - seg_from;
}
spin_unlock(&resv->lock);
return chg;
}
/*
* Convert the address within this vma to the page offset within
* the mapping, in pagecache page units; huge pages here.
*/
static pgoff_t vma_hugecache_offset(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
return ((address - vma->vm_start) >> huge_page_shift(h)) +
(vma->vm_pgoff >> huge_page_order(h));
}
pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
unsigned long address)
{
return vma_hugecache_offset(hstate_vma(vma), vma, address);
}
EXPORT_SYMBOL_GPL(linear_hugepage_index);
/*
* Return the size of the pages allocated when backing a VMA. In the majority
* cases this will be same size as used by the page table entries.
*/
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
if (vma->vm_ops && vma->vm_ops->pagesize)
return vma->vm_ops->pagesize(vma);
return PAGE_SIZE;
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
/*
* Return the page size being used by the MMU to back a VMA. In the majority
* of cases, the page size used by the kernel matches the MMU size. On
* architectures where it differs, an architecture-specific 'strong'
* version of this symbol is required.
*/
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
return vma_kernel_pagesize(vma);
}
/*
* Flags for MAP_PRIVATE reservations. These are stored in the bottom
* bits of the reservation map pointer, which are always clear due to
* alignment.
*/
#define HPAGE_RESV_OWNER (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
/*
* These helpers are used to track how many pages are reserved for
* faults in a MAP_PRIVATE mapping. Only the process that called mmap()
* is guaranteed to have their future faults succeed.
*
* With the exception of reset_vma_resv_huge_pages() which is called at fork(),
* the reserve counters are updated with the hugetlb_lock held. It is safe
* to reset the VMA at fork() time as it is not in use yet and there is no
* chance of the global counters getting corrupted as a result of the values.
*
* The private mapping reservation is represented in a subtly different
* manner to a shared mapping. A shared mapping has a region map associated
* with the underlying file, this region map represents the backing file
* pages which have ever had a reservation assigned which this persists even
* after the page is instantiated. A private mapping has a region map
* associated with the original mmap which is attached to all VMAs which
* reference it, this region map represents those offsets which have consumed
* reservation ie. where pages have been instantiated.
*/
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
return (unsigned long)vma->vm_private_data;
}
static void set_vma_private_data(struct vm_area_struct *vma,
unsigned long value)
{
vma->vm_private_data = (void *)value;
}
static void
resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
struct hugetlb_cgroup *h_cg,
struct hstate *h)
{
#ifdef CONFIG_CGROUP_HUGETLB
if (!h_cg || !h) {
resv_map->reservation_counter = NULL;
resv_map->pages_per_hpage = 0;
resv_map->css = NULL;
} else {
resv_map->reservation_counter =
&h_cg->rsvd_hugepage[hstate_index(h)];
resv_map->pages_per_hpage = pages_per_huge_page(h);
resv_map->css = &h_cg->css;
}
#endif
}
struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
if (!resv_map || !rg) {
kfree(resv_map);
kfree(rg);
return NULL;
}
kref_init(&resv_map->refs);
spin_lock_init(&resv_map->lock);
INIT_LIST_HEAD(&resv_map->regions);
resv_map->adds_in_progress = 0;
/*
* Initialize these to 0. On shared mappings, 0's here indicate these
* fields don't do cgroup accounting. On private mappings, these will be
* re-initialized to the proper values, to indicate that hugetlb cgroup
* reservations are to be un-charged from here.
*/
resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
INIT_LIST_HEAD(&resv_map->region_cache);
list_add(&rg->link, &resv_map->region_cache);
resv_map->region_cache_count = 1;
return resv_map;
}
void resv_map_release(struct kref *ref)
{
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
struct list_head *head = &resv_map->region_cache;
struct file_region *rg, *trg;
/* Clear out any active regions before we release the map. */
region_del(resv_map, 0, LONG_MAX);
/* ... and any entries left in the cache */
list_for_each_entry_safe(rg, trg, head, link) {
list_del(&rg->link);
kfree(rg);
}
VM_BUG_ON(resv_map->adds_in_progress);
kfree(resv_map);
}
static inline struct resv_map *inode_resv_map(struct inode *inode)
{
/*
* At inode evict time, i_mapping may not point to the original
* address space within the inode. This original address space
* contains the pointer to the resv_map. So, always use the
* address space embedded within the inode.
* The VERY common case is inode->mapping == &inode->i_data but,
* this may not be true for device special inodes.
*/
return (struct resv_map *)(&inode->i_data)->private_data;
}
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
if (vma->vm_flags & VM_MAYSHARE) {
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
return inode_resv_map(inode);
} else {
return (struct resv_map *)(get_vma_private_data(vma) &
~HPAGE_RESV_MASK);
}
}
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
set_vma_private_data(vma, (get_vma_private_data(vma) &
HPAGE_RESV_MASK) | (unsigned long)map);
}
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
return (get_vma_private_data(vma) & flag) != 0;
}
/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
if (!(vma->vm_flags & VM_MAYSHARE))
vma->vm_private_data = (void *)0;
}
/* Returns true if the VMA has associated reserve pages */
static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
if (vma->vm_flags & VM_NORESERVE) {
/*
* This address is already reserved by other process(chg == 0),
* so, we should decrement reserved count. Without decrementing,
* reserve count remains after releasing inode, because this
* allocated page will go into page cache and is regarded as
* coming from reserved pool in releasing step. Currently, we
* don't have any other solution to deal with this situation
* properly, so add work-around here.
*/
if (vma->vm_flags & VM_MAYSHARE && chg == 0)
return true;
else
return false;
}
/* Shared mappings always use reserves */
if (vma->vm_flags & VM_MAYSHARE) {
/*
* We know VM_NORESERVE is not set. Therefore, there SHOULD
* be a region map for all pages. The only situation where
* there is no region map is if a hole was punched via
* fallocate. In this case, there really are no reserves to
* use. This situation is indicated if chg != 0.
*/
if (chg)
return false;
else
return true;
}
/*
* Only the process that called mmap() has reserves for
* private mappings.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
/*
* Like the shared case above, a hole punch or truncate
* could have been performed on the private mapping.
* Examine the value of chg to determine if reserves
* actually exist or were previously consumed.
* Very Subtle - The value of chg comes from a previous
* call to vma_needs_reserves(). The reserve map for
* private mappings has different (opposite) semantics
* than that of shared mappings. vma_needs_reserves()
* has already taken this difference in semantics into
* account. Therefore, the meaning of chg is the same
* as in the shared case above. Code could easily be
* combined, but keeping it separate draws attention to
* subtle differences.
*/
if (chg)
return false;
else
return true;
}
return false;
}
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
lockdep_assert_held(&hugetlb_lock);
VM_BUG_ON_PAGE(page_count(page), page);
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
SetHPageFreed(page);
}
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
struct page *page;
bool pin = !!(current->flags & PF_MEMALLOC_PIN);
lockdep_assert_held(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
if (pin && !is_pinnable_page(page))
continue;
if (PageHWPoison(page))
continue;
list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
ClearHPageFreed(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
return page;
}
return NULL;
}
static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
nodemask_t *nmask)
{
unsigned int cpuset_mems_cookie;
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
int node = NUMA_NO_NODE;
zonelist = node_zonelist(nid, gfp_mask);
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
struct page *page;
if (!cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* no need to ask again on the same node. Pool is node rather than
* zone aware
*/
if (zone_to_nid(zone) == node)
continue;
node = zone_to_nid(zone);
page = dequeue_huge_page_node_exact(h, node);
if (page)
return page;
}
if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return NULL;
}
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
long chg)
{
struct page *page = NULL;
struct mempolicy *mpol;
gfp_t gfp_mask;
nodemask_t *nodemask;
int nid;
/*
* A child process with MAP_PRIVATE mappings created by their parent
* have no page reserves. This check ensures that reservations are
* not "stolen". The child may still get SIGKILLed
*/
if (!vma_has_reserves(vma, chg) &&
h->free_huge_pages - h->resv_huge_pages == 0)
goto err;
/* If reserves cannot be used, ensure enough pages are in the pool */
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
goto err;
gfp_mask = htlb_alloc_mask(h);
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
if (mpol_is_preferred_many(mpol)) {
page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
/* Fallback to all nodes if page==NULL */
nodemask = NULL;
}
if (!page)
page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
SetHPageRestoreReserve(page);
h->resv_huge_pages--;
}
mpol_cond_put(mpol);
return page;
err:
return NULL;
}
/*
* common helper functions for hstate_next_node_to_{alloc|free}.
* We may have allocated or freed a huge page based on a different
* nodes_allowed previously, so h->next_node_to_{alloc|free} might
* be outside of *nodes_allowed. Ensure that we use an allowed
* node for alloc or free.
*/
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
nid = next_node_in(nid, *nodes_allowed);
VM_BUG_ON(nid >= MAX_NUMNODES);
return nid;
}
static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
if (!node_isset(nid, *nodes_allowed))
nid = next_node_allowed(nid, nodes_allowed);
return nid;
}
/*
* returns the previously saved node ["this node"] from which to
* allocate a persistent huge page for the pool and advance the
* next node from which to allocate, handling wrap at end of node
* mask.
*/
static int hstate_next_node_to_alloc(struct hstate *h,
nodemask_t *nodes_allowed)
{
int nid;
VM_BUG_ON(!nodes_allowed);
nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
return nid;
}
/*
* helper for remove_pool_huge_page() - return the previously saved
* node ["this node"] from which to free a huge page. Advance the
* next node id whether or not we find a free huge page to free so
* that the next attempt to free addresses the next node.
*/
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
int nid;
VM_BUG_ON(!nodes_allowed);
nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
return nid;
}
#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
for (nr_nodes = nodes_weight(*mask); \
nr_nodes > 0 && \
((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
nr_nodes--)
#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
for (nr_nodes = nodes_weight(*mask); \
nr_nodes > 0 && \
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
static void destroy_compound_gigantic_page(struct page *page,
unsigned int order)
{
int i;
int nr_pages = 1 << order;
struct page *p = page + 1;
atomic_set(compound_mapcount_ptr(page), 0);
atomic_set(compound_pincount_ptr(page), 0);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
clear_compound_head(p);
set_page_refcounted(p);
}
set_compound_order(page, 0);
page[1].compound_nr = 0;
__ClearPageHead(page);
}
static void free_gigantic_page(struct page *page, unsigned int order)
{
/*
* If the page isn't allocated using the cma allocator,
* cma_release() returns false.
*/
#ifdef CONFIG_CMA
if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
return;
#endif
free_contig_range(page_to_pfn(page), 1 << order);
}
#ifdef CONFIG_CONTIG_ALLOC
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
unsigned long nr_pages = pages_per_huge_page(h);
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
#ifdef CONFIG_CMA
{
struct page *page;
int node;
if (hugetlb_cma[nid]) {
page = cma_alloc(hugetlb_cma[nid], nr_pages,
huge_page_order(h), true);
if (page)
return page;
}
if (!(gfp_mask & __GFP_THISNODE)) {
for_each_node_mask(node, *nodemask) {
if (node == nid || !hugetlb_cma[node])
continue;
page = cma_alloc(hugetlb_cma[node], nr_pages,
huge_page_order(h), true);
if (page)
return page;
}
}
}
#endif
return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}
#else /* !CONFIG_CONTIG_ALLOC */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
return NULL;
}
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
#endif
/*
* Remove hugetlb page from lists, and update dtor so that page appears
* as just a compound page. A reference is held on the page.
*
* Must be called with hugetlb lock held.
*/
static void remove_hugetlb_page(struct hstate *h, struct page *page,
bool adjust_surplus)
{
int nid = page_to_nid(page);
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
lockdep_assert_held(&hugetlb_lock);
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
list_del(&page->lru);
if (HPageFreed(page)) {
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
}
if (adjust_surplus) {
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
}
/*
* Very subtle
*
* For non-gigantic pages set the destructor to the normal compound
* page dtor. This is needed in case someone takes an additional
* temporary ref to the page, and freeing is delayed until they drop
* their reference.
*
* For gigantic pages set the destructor to the null dtor. This
* destructor will never be called. Before freeing the gigantic
* page destroy_compound_gigantic_page will turn the compound page
* into a simple group of pages. After this the destructor does not
* apply.
*
* This handles the case where more than one ref is held when and
* after update_and_free_page is called.
*/
set_page_refcounted(page);
if (hstate_is_gigantic(h))
set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
else
set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
h->nr_huge_pages--;
h->nr_huge_pages_node[nid]--;
}
static void add_hugetlb_page(struct hstate *h, struct page *page,
bool adjust_surplus)
{
int zeroed;
int nid = page_to_nid(page);
VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
lockdep_assert_held(&hugetlb_lock);
INIT_LIST_HEAD(&page->lru);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
if (adjust_surplus) {
h->surplus_huge_pages++;
h->surplus_huge_pages_node[nid]++;
}
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
set_page_private(page, 0);
SetHPageVmemmapOptimized(page);
/*
* This page is about to be managed by the hugetlb allocator and
* should have no users. Drop our reference, and check for others
* just in case.
*/
zeroed = put_page_testzero(page);
if (!zeroed)
/*
* It is VERY unlikely soneone else has taken a ref on
* the page. In this case, we simply return as the
* hugetlb destructor (free_huge_page) will be called
* when this other ref is dropped.
*/
return;
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
}
static void __update_and_free_page(struct hstate *h, struct page *page)
{
int i;
struct page *subpage = page;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
if (alloc_huge_page_vmemmap(h, page)) {
spin_lock_irq(&hugetlb_lock);
/*
* If we cannot allocate vmemmap pages, just refuse to free the
* page and put the page back on the hugetlb free list and treat
* as a surplus page.
*/
add_hugetlb_page(h, page, true);
spin_unlock_irq(&hugetlb_lock);
return;
}
for (i = 0; i < pages_per_huge_page(h);
i++, subpage = mem_map_next(subpage, page, i)) {
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
1 << PG_active | 1 << PG_private |
1 << PG_writeback);
}
if (hstate_is_gigantic(h)) {
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
} else {
__free_pages(page, huge_page_order(h));
}
}
/*
* As update_and_free_page() can be called under any context, so we cannot
* use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
* actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
* the vmemmap pages.
*
* free_hpage_workfn() locklessly retrieves the linked list of pages to be
* freed and frees them one-by-one. As the page->mapping pointer is going
* to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
* structure of a lockless linked list of huge pages to be freed.
*/
static LLIST_HEAD(hpage_freelist);
static void free_hpage_workfn(struct work_struct *work)
{
struct llist_node *node;
node = llist_del_all(&hpage_freelist);
while (node) {
struct page *page;
struct hstate *h;
page = container_of((struct address_space **)node,
struct page, mapping);
node = node->next;
page->mapping = NULL;
/*
* The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
* is going to trigger because a previous call to
* remove_hugetlb_page() will set_compound_page_dtor(page,
* NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
*/
h = size_to_hstate(page_size(page));
__update_and_free_page(h, page);
cond_resched();
}
}
static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
static inline void flush_free_hpage_work(struct hstate *h)
{
if (free_vmemmap_pages_per_hpage(h))
flush_work(&free_hpage_work);
}
static void update_and_free_page(struct hstate *h, struct page *page,
bool atomic)
{
if (!HPageVmemmapOptimized(page) || !atomic) {
__update_and_free_page(h, page);
return;
}
/*
* Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
*
* Only call schedule_work() if hpage_freelist is previously
* empty. Otherwise, schedule_work() had been called but the workfn
* hasn't retrieved the list yet.
*/
if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
schedule_work(&free_hpage_work);
}
static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
{
struct page *page, *t_page;
list_for_each_entry_safe(page, t_page, list, lru) {
update_and_free_page(h, page, false);
cond_resched();
}
}
struct hstate *size_to_hstate(unsigned long size)
{
struct hstate *h;
for_each_hstate(h) {
if (huge_page_size(h) == size)
return h;
}
return NULL;
}
void free_huge_page(struct page *page)
{
/*
* Can't pass hstate in here because it is called from the
* compound page destructor.
*/
struct hstate *h = page_hstate(page);
int nid = page_to_nid(page);
struct hugepage_subpool *spool = hugetlb_page_subpool(page);
bool restore_reserve;
unsigned long flags;
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(page_mapcount(page), page);
hugetlb_set_page_subpool(page, NULL);
page->mapping = NULL;
restore_reserve = HPageRestoreReserve(page);
ClearHPageRestoreReserve(page);
/*
* If HPageRestoreReserve was set on page, page allocation consumed a
* reservation. If the page was associated with a subpool, there
* would have been a page reserved in the subpool before allocation
* via hugepage_subpool_get_pages(). Since we are 'restoring' the
* reservation, do not call hugepage_subpool_put_pages() as this will
* remove the reserved page from the subpool.
*/
if (!restore_reserve) {
/*
* A return code of zero implies that the subpool will be
* under its minimum size if the reservation is not restored
* after page is free. Therefore, force restore_reserve
* operation.
*/
if (hugepage_subpool_put_pages(spool, 1) == 0)
restore_reserve = true;
}
spin_lock_irqsave(&hugetlb_lock, flags);
ClearHPageMigratable(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
pages_per_huge_page(h), page);
if (restore_reserve)
h->resv_huge_pages++;
if (HPageTemporary(page)) {
remove_hugetlb_page(h, page, false);
spin_unlock_irqrestore(&hugetlb_lock, flags);
update_and_free_page(h, page, true);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
remove_hugetlb_page(h, page, true);
spin_unlock_irqrestore(&hugetlb_lock, flags);
update_and_free_page(h, page, true);
} else {
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
spin_unlock_irqrestore(&hugetlb_lock, flags);
}
}
/*
* Must be called with the hugetlb lock held
*/
static void __prep_account_new_huge_page(struct hstate *h, int nid)
{
lockdep_assert_held(&hugetlb_lock);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
}
static void __prep_new_huge_page(struct hstate *h, struct page *page)
{
free_huge_page_vmemmap(h, page);
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
hugetlb_set_page_subpool(page, NULL);
set_hugetlb_cgroup(page, NULL);
set_hugetlb_cgroup_rsvd(page, NULL);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
__prep_new_huge_page(h, page);
spin_lock_irq(&hugetlb_lock);
__prep_account_new_huge_page(h, nid);
spin_unlock_irq(&hugetlb_lock);
}
static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
{
int i, j;
int nr_pages = 1 << order;
struct page *p = page + 1;
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
__ClearPageReserved(page);
__SetPageHead(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
/*
* For gigantic hugepages allocated through bootmem at
* boot, it's safer to be consistent with the not-gigantic
* hugepages and clear the PG_reserved bit from all tail pages
* too. Otherwise drivers using get_user_pages() to access tail
* pages may get the reference counting wrong if they see
* PG_reserved set on a tail page (despite the head page not
* having PG_reserved set). Enforcing this consistency between
* head and tail pages allows drivers to optimize away a check
* on the head page when they need know if put_page() is needed
* after get_user_pages().
*/
__ClearPageReserved(p);
/*
* Subtle and very unlikely
*
* Gigantic 'page allocators' such as memblock or cma will
* return a set of pages with each page ref counted. We need
* to turn this set of pages into a compound page with tail
* page ref counts set to zero. Code such as speculative page
* cache adding could take a ref on a 'to be' tail page.
* We need to respect any increased ref count, and only set
* the ref count to zero if count is currently 1. If count
* is not 1, we return an error. An error return indicates
* the set of pages can not be converted to a gigantic page.
* The caller who allocated the pages should then discard the
* pages using the appropriate free interface.
*/
if (!page_ref_freeze(p, 1)) {
pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
goto out_error;
}
set_page_count(p, 0);
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
atomic_set(compound_pincount_ptr(page), 0);
return true;
out_error:
/* undo tail page modifications made above */
p = page + 1;
for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
clear_compound_head(p);
set_page_refcounted(p);
}
/* need to clear PG_reserved on remaining tail pages */
for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
__ClearPageReserved(p);
set_compound_order(page, 0);
page[1].compound_nr = 0;
__ClearPageHead(page);
return false;
}
/*
* PageHuge() only returns true for hugetlbfs pages, but not for normal or
* transparent huge pages. See the PageTransHuge() documentation for more
* details.
*/
int PageHuge(struct page *page)
{
if (!PageCompound(page))
return 0;
page = compound_head(page);
return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHuge);
/*
* PageHeadHuge() only returns true for hugetlbfs head page, but not for
* normal or transparent huge pages.
*/
int PageHeadHuge(struct page *page_head)
{
if (!PageHead(page_head))
return 0;
return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
}
/*
* Find and lock address space (mapping) in write mode.
*
* Upon entry, the page is locked which means that page_mapping() is
* stable. Due to locking order, we can only trylock_write. If we can
* not get the lock, simply return NULL to caller.
*/
struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
{
struct address_space *mapping = page_mapping(hpage);
if (!mapping)
return mapping;
if (i_mmap_trylock_write(mapping))
return mapping;
return NULL;
}
pgoff_t hugetlb_basepage_index(struct page *page)
{
struct page *page_head = compound_head(page);
pgoff_t index = page_index(page_head);
unsigned long compound_idx;
if (compound_order(page_head) >= MAX_ORDER)
compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
else
compound_idx = page - page_head;
return (index << compound_order(page_head)) + compound_idx;
}
static struct page *alloc_buddy_huge_page(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
int order = huge_page_order(h);
struct page *page;
bool alloc_try_hard = true;
/*
* By default we always try hard to allocate the page with
* __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
* a loop (to adjust global huge page counts) and previous allocation
* failed, do not continue to try hard on the same node. Use the
* node_alloc_noretry bitmap to manage this state information.
*/
if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
alloc_try_hard = false;
gfp_mask |= __GFP_COMP|__GFP_NOWARN;
if (alloc_try_hard)
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
page = __alloc_pages(gfp_mask, order, nid, nmask);
if (page)
__count_vm_event(HTLB_BUDDY_PGALLOC);
else
__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
/*
* If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
* indicates an overall state change. Clear bit so that we resume
* normal 'try hard' allocations.
*/
if (node_alloc_noretry && page && !alloc_try_hard)
node_clear(nid, *node_alloc_noretry);
/*
* If we tried hard to get a page but failed, set bit so that
* subsequent attempts will not try as hard until there is an
* overall state change.
*/
if (node_alloc_noretry && !page && alloc_try_hard)
node_set(nid, *node_alloc_noretry);
return page;
}
/*
* Common helper to allocate a fresh hugetlb page. All specific allocators
* should use this function to get new hugetlb pages
*/
static struct page *alloc_fresh_huge_page(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
struct page *page;
bool retry = false;
retry:
if (hstate_is_gigantic(h))
page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
else
page = alloc_buddy_huge_page(h, gfp_mask,
nid, nmask, node_alloc_noretry);
if (!page)
return NULL;
if (hstate_is_gigantic(h)) {
if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
/*
* Rare failure to convert pages to compound page.
* Free pages and try again - ONCE!
*/
free_gigantic_page(page, huge_page_order(h));
if (!retry) {
retry = true;
goto retry;
}
return NULL;
}
}
prep_new_huge_page(h, page, page_to_nid(page));
return page;
}
/*
* Allocates a fresh page to the hugetlb allocator pool in the node interleaved
* manner.
*/
static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
nodemask_t *node_alloc_noretry)
{
struct page *page;
int nr_nodes, node;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
node_alloc_noretry);
if (page)
break;
}
if (!page)
return 0;
put_page(page); /* free it into the hugepage allocator */
return 1;
}
/*
* Remove huge page from pool from next node to free. Attempt to keep
* persistent huge pages more or less balanced over allowed nodes.
* This routine only 'removes' the hugetlb page. The caller must make
* an additional call to free the page to low level allocators.
* Called with hugetlb_lock locked.
*/
static struct page *remove_pool_huge_page(struct hstate *h,
nodemask_t *nodes_allowed,
bool acct_surplus)
{
int nr_nodes, node;
struct page *page = NULL;
lockdep_assert_held(&hugetlb_lock);
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
/*
* If we're returning unused surplus pages, only examine
* nodes with surplus pages.
*/
if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
!list_empty(&h->hugepage_freelists[node])) {
page = list_entry(h->hugepage_freelists[node].next,
struct page, lru);
remove_hugetlb_page(h, page, acct_surplus);
break;
}
}
return page;
}
/*
* Dissolve a given free hugepage into free buddy pages. This function does
* nothing for in-use hugepages and non-hugepages.
* This function returns values like below:
*
* -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
* when the system is under memory pressure and the feature of
* freeing unused vmemmap pages associated with each hugetlb page
* is enabled.
* -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
* (allocated or reserved.)
* 0: successfully dissolved free hugepages or the page is not a
* hugepage (considered as already dissolved)
*/
int dissolve_free_huge_page(struct page *page)
{
int rc = -EBUSY;
retry:
/* Not to disrupt normal path by vainly holding hugetlb_lock */
if (!PageHuge(page))
return 0;
spin_lock_irq(&hugetlb_lock);
if (!PageHuge(page)) {
rc = 0;
goto out;
}
if (!page_count(page)) {
struct page *head = compound_head(page);
struct hstate *h = page_hstate(head);
if (h->free_huge_pages - h->resv_huge_pages == 0)
goto out;
/*
* We should make sure that the page is already on the free list
* when it is dissolved.
*/
if (unlikely(!HPageFreed(head))) {
spin_unlock_irq(&hugetlb_lock);
cond_resched();
/*
* Theoretically, we should return -EBUSY when we
* encounter this race. In fact, we have a chance
* to successfully dissolve the page if we do a
* retry. Because the race window is quite small.
* If we seize this opportunity, it is an optimization
* for increasing the success rate of dissolving page.
*/
goto retry;
}
remove_hugetlb_page(h, head, false);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
/*
* Normally update_and_free_page will allocate required vmemmmap
* before freeing the page. update_and_free_page will fail to
* free the page if it can not allocate required vmemmap. We
* need to adjust max_huge_pages if the page is not freed.
* Attempt to allocate vmemmmap here so that we can take
* appropriate action on failure.
*/
rc = alloc_huge_page_vmemmap(h, head);
if (!rc) {
/*
* Move PageHWPoison flag from head page to the raw
* error page, which makes any subpages rather than
* the error page reusable.
*/
if (PageHWPoison(head) && page != head) {
SetPageHWPoison(page);
ClearPageHWPoison(head);
}
update_and_free_page(h, head, false);
} else {
spin_lock_irq(&hugetlb_lock);
add_hugetlb_page(h, head, false);
h->max_huge_pages++;
spin_unlock_irq(&hugetlb_lock);
}
return rc;
}
out:
spin_unlock_irq(&hugetlb_lock);
return rc;
}
/*
* Dissolve free hugepages in a given pfn range. Used by memory hotplug to
* make specified memory blocks removable from the system.
* Note that this will dissolve a free gigantic hugepage completely, if any
* part of it lies within the given range.
* Also note that if dissolve_free_huge_page() returns with an error, all
* free hugepages that were dissolved before that error are lost.
*/
int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct page *page;
int rc = 0;
if (!hugepages_supported())
return rc;
for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
page = pfn_to_page(pfn);
rc = dissolve_free_huge_page(page);
if (rc)
break;
}
return rc;
}
/*
* Allocates a fresh surplus page from the page allocator.
*/
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask, bool zero_ref)
{
struct page *page = NULL;
bool retry = false;
if (hstate_is_gigantic(h))
return NULL;
spin_lock_irq(&hugetlb_lock);
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
retry:
page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
spin_lock_irq(&hugetlb_lock);
/*
* We could have raced with the pool size change.
* Double check that and simply deallocate the new page
* if we would end up overcommiting the surpluses. Abuse
* temporary page to workaround the nasty free_huge_page
* codeflow
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
SetHPageTemporary(page);
spin_unlock_irq(&hugetlb_lock);
put_page(page);
return NULL;
}
if (zero_ref) {
/*
* Caller requires a page with zero ref count.
* We will drop ref count here. If someone else is holding
* a ref, the page will be freed when they drop it. Abuse
* temporary page flag to accomplish this.
*/
SetHPageTemporary(page);
if (!put_page_testzero(page)) {
/*
* Unexpected inflated ref count on freshly allocated
* huge. Retry once.
*/
pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
spin_unlock_irq(&hugetlb_lock);
if (retry)
return NULL;
retry = true;
goto retry;
}
ClearHPageTemporary(page);
}
h->surplus_huge_pages++;
h->surplus_huge_pages_node[page_to_nid(page)]++;
out_unlock:
spin_unlock_irq(&hugetlb_lock);
return page;
}
static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct page *page;
if (hstate_is_gigantic(h))
return NULL;
page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
/*
* We do not account these pages as surplus because they are only
* temporary and will be released properly on the last reference
*/
SetHPageTemporary(page);
return page;
}
/*
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
static
struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *page = NULL;
struct mempolicy *mpol;
gfp_t gfp_mask = htlb_alloc_mask(h);
int nid;
nodemask_t *nodemask;
nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
if (mpol_is_preferred_many(mpol)) {
gfp_t gfp = gfp_mask | __GFP_NOWARN;
gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
/* Fallback to all nodes if page==NULL */
nodemask = NULL;
}
if (!page)
page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
mpol_cond_put(mpol);
return page;
}
/* page migration callback function */
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask)
{
spin_lock_irq(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0) {
struct page *page;
page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
if (page) {
spin_unlock_irq(&hugetlb_lock);
return page;
}
}
spin_unlock_irq(&hugetlb_lock);
return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
}
/* mempolicy aware migration callback */
struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
unsigned long address)
{
struct mempolicy *mpol;
nodemask_t *nodemask;
struct page *page;
gfp_t gfp_mask;
int node;
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
mpol_cond_put(mpol);
return page;
}
/*
* Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
*/
static int gather_surplus_pages(struct hstate *h, long delta)
__must_hold(&hugetlb_lock)
{
struct list_head surplus_list;
struct page *page, *tmp;
int ret;
long i;
long needed, allocated;
bool alloc_ok = true;
lockdep_assert_held(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
if (needed <= 0) {
h->resv_huge_pages += delta;
return 0;
}
allocated = 0;
INIT_LIST_HEAD(&surplus_list);
ret = -ENOMEM;
retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
NUMA_NO_NODE, NULL, true);
if (!page) {
alloc_ok = false;
break;
}
list_add(&page->lru, &surplus_list);
cond_resched();
}
allocated += i;
/*
* After retaking hugetlb_lock, we need to recalculate 'needed'
* because either resv_huge_pages or free_huge_pages may have changed.
*/
spin_lock_irq(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) -
(h->free_huge_pages + allocated);
if (needed > 0) {
if (alloc_ok)
goto retry;
/*
* We were not able to allocate enough pages to
* satisfy the entire reservation so we free what
* we've allocated so far.
*/
goto free;
}
/*
* The surplus_list now contains _at_least_ the number of extra pages
* needed to accommodate the reservation. Add the appropriate number
* of pages to the hugetlb pool and free the extras back to the buddy
* allocator. Commit the entire reservation here to prevent another
* process from stealing the pages as they are added to the pool but
* before they are reserved.
*/
needed += allocated;
h->resv_huge_pages += delta;
ret = 0;
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
/* Add the page to the hugetlb allocator */
enqueue_huge_page(h, page);
}
free:
spin_unlock_irq(&hugetlb_lock);
/*
* Free unnecessary surplus pages to the buddy allocator.
* Pages have no ref count, call free_huge_page directly.
*/
list_for_each_entry_safe(page, tmp, &surplus_list, lru)
free_huge_page(page);
spin_lock_irq(&hugetlb_lock);
return ret;
}
/*
* This routine has two main purposes:
* 1) Decrement the reservation count (resv_huge_pages) by the value passed
* in unused_resv_pages. This corresponds to the prior adjustments made
* to the associated reservation map.
* 2) Free any unused surplus pages that may have been allocated to satisfy
* the reservation. As many as unused_resv_pages may be freed.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
unsigned long nr_pages;
struct page *page;
LIST_HEAD(page_list);
lockdep_assert_held(&hugetlb_lock);
/* Uncommit the reservation */
h->resv_huge_pages -= unused_resv_pages;
/* Cannot return gigantic pages currently */
if (hstate_is_gigantic(h))
goto out;
/*
* Part (or even all) of the reservation could have been backed
* by pre-allocated pages. Only free surplus pages.
*/
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
/*
* We want to release as many surplus pages as possible, spread
* evenly across all nodes with memory. Iterate across these nodes
* until we can no longer free unreserved surplus pages. This occurs
* when the nodes with surplus pages have no free pages.
* remove_pool_huge_page() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
*/
while (nr_pages--) {
page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
if (!page)
goto out;
list_add(&page->lru, &page_list);
}
out:
spin_unlock_irq(&hugetlb_lock);
update_and_free_pages_bulk(h, &page_list);
spin_lock_irq(&hugetlb_lock);
}
/*
* vma_needs_reservation, vma_commit_reservation and vma_end_reservation
* are used by the huge page allocation routines to manage reservations.
*
* vma_needs_reservation is called to determine if the huge page at addr
* within the vma has an associated reservation. If a reservation is
* needed, the value 1 is returned. The caller is then responsible for
* managing the global reservation and subpool usage counts. After
* the huge page has been allocated, vma_commit_reservation is called
* to add the page to the reservation map. If the page allocation fails,
* the reservation must be ended instead of committed. vma_end_reservation
* is called in such cases.
*
* In the normal case, vma_commit_reservation returns the same value
* as the preceding vma_needs_reservation call. The only time this
* is not the case is if a reserve map was changed between calls. It
* is the responsibility of the caller to notice the difference and
* take appropriate action.
*
* vma_add_reservation is used in error paths where a reservation must
* be restored when a newly allocated huge page must be freed. It is
* to be called after calling vma_needs_reservation to determine if a
* reservation exists.
*
* vma_del_reservation is used in error paths where an entry in the reserve
* map was created during huge page allocation and must be removed. It is to
* be called after calling vma_needs_reservation to determine if a reservation
* exists.
*/
enum vma_resv_mode {
VMA_NEEDS_RESV,
VMA_COMMIT_RESV,
VMA_END_RESV,
VMA_ADD_RESV,
VMA_DEL_RESV,
};
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
enum vma_resv_mode mode)
{
struct resv_map *resv;
pgoff_t idx;
long ret;
long dummy_out_regions_needed;
resv = vma_resv_map(vma);
if (!resv)
return 1;
idx = vma_hugecache_offset(h, vma, addr);
switch (mode) {
case VMA_NEEDS_RESV:
ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
/* We assume that vma_reservation_* routines always operate on
* 1 page, and that adding to resv map a 1 page entry can only
* ever require 1 region.
*/
VM_BUG_ON(dummy_out_regions_needed != 1);
break;
case VMA_COMMIT_RESV:
ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
/* region_add calls of range 1 should never fail. */
VM_BUG_ON(ret < 0);
break;
case VMA_END_RESV:
region_abort(resv, idx, idx + 1, 1);
ret = 0;
break;
case VMA_ADD_RESV:
if (vma->vm_flags & VM_MAYSHARE) {
ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
/* region_add calls of range 1 should never fail. */
VM_BUG_ON(ret < 0);
} else {
region_abort(resv, idx, idx + 1, 1);
ret = region_del(resv, idx, idx + 1);
}
break;
case VMA_DEL_RESV:
if (vma->vm_flags & VM_MAYSHARE) {
region_abort(resv, idx, idx + 1, 1);
ret = region_del(resv, idx, idx + 1);
} else {
ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
/* region_add calls of range 1 should never fail. */
VM_BUG_ON(ret < 0);
}
break;
default:
BUG();
}
if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
return ret;
/*
* We know private mapping must have HPAGE_RESV_OWNER set.
*
* In most cases, reserves always exist for private mappings.
* However, a file associated with mapping could have been
* hole punched or truncated after reserves were consumed.
* As subsequent fault on such a range will not use reserves.
* Subtle - The reserve map for private mappings has the
* opposite meaning than that of shared mappings. If NO
* entry is in the reserve map, it means a reservation exists.
* If an entry exists in the reserve map, it means the
* reservation has already been consumed. As a result, the
* return value of this routine is the opposite of the
* value returned from reserve map manipulation routines above.
*/
if (ret > 0)
return 0;
if (ret == 0)
return 1;
return ret;
}
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
}
static long vma_commit_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
}
static void vma_end_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
(void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
}
static long vma_add_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
}
static long vma_del_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
}
/*
* This routine is called to restore reservation information on error paths.
* It should ONLY be called for pages allocated via alloc_huge_page(), and
* the hugetlb mutex should remain held when calling this routine.
*
* It handles two specific cases:
* 1) A reservation was in place and the page consumed the reservation.
* HPageRestoreReserve is set in the page.
* 2) No reservation was in place for the page, so HPageRestoreReserve is
* not set. However, alloc_huge_page always updates the reserve map.
*
* In case 1, free_huge_page later in the error path will increment the
* global reserve count. But, free_huge_page does not have enough context
* to adjust the reservation map. This case deals primarily with private
* mappings. Adjust the reserve map here to be consistent with global
* reserve count adjustments to be made by free_huge_page. Make sure the
* reserve map indicates there is a reservation present.
*
* In case 2, simply undo reserve map modifications done by alloc_huge_page.
*/
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
unsigned long address, struct page *page)
{
long rc = vma_needs_reservation(h, vma, address);
if (HPageRestoreReserve(page)) {
if (unlikely(rc < 0))
/*
* Rare out of memory condition in reserve map
* manipulation. Clear HPageRestoreReserve so that
* global reserve count will not be incremented
* by free_huge_page. This will make it appear
* as though the reservation for this page was
* consumed. This may prevent the task from
* faulting in the page at a later time. This
* is better than inconsistent global huge page
* accounting of reserve counts.
*/
ClearHPageRestoreReserve(page);
else if (rc)
(void)vma_add_reservation(h, vma, address);
else
vma_end_reservation(h, vma, address);
} else {
if (!rc) {
/*
* This indicates there is an entry in the reserve map
* not added by alloc_huge_page. We know it was added
* before the alloc_huge_page call, otherwise
* HPageRestoreReserve would be set on the page.
* Remove the entry so that a subsequent allocation
* does not consume a reservation.
*/
rc = vma_del_reservation(h, vma, address);
if (rc < 0)
/*
* VERY rare out of memory condition. Since
* we can not delete the entry, set
* HPageRestoreReserve so that the reserve
* count will be incremented when the page
* is freed. This reserve will be consumed
* on a subsequent allocation.
*/
SetHPageRestoreReserve(page);
} else if (rc < 0) {
/*
* Rare out of memory condition from
* vma_needs_reservation call. Memory allocation is
* only attempted if a new entry is needed. Therefore,
* this implies there is not an entry in the
* reserve map.
*
* For shared mappings, no entry in the map indicates
* no reservation. We are done.
*/
if (!(vma->vm_flags & VM_MAYSHARE))
/*
* For private mappings, no entry indicates
* a reservation is present. Since we can
* not add an entry, set SetHPageRestoreReserve
* on the page so reserve count will be
* incremented when freed. This reserve will
* be consumed on a subsequent allocation.
*/
SetHPageRestoreReserve(page);
} else
/*
* No reservation present, do nothing
*/
vma_end_reservation(h, vma, address);
}
}
/*
* alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
* @h: struct hstate old page belongs to
* @old_page: Old page to dissolve
* @list: List to isolate the page in case we need to
* Returns 0 on success, otherwise negated error.
*/
static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
struct list_head *list)
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
int nid = page_to_nid(old_page);
bool alloc_retry = false;
struct page *new_page;
int ret = 0;
/*
* Before dissolving the page, we need to allocate a new one for the
* pool to remain stable. Here, we allocate the page and 'prep' it
* by doing everything but actually updating counters and adding to
* the pool. This simplifies and let us do most of the processing
* under the lock.
*/
alloc_retry:
new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
if (!new_page)
return -ENOMEM;
/*
* If all goes well, this page will be directly added to the free
* list in the pool. For this the ref count needs to be zero.
* Attempt to drop now, and retry once if needed. It is VERY
* unlikely there is another ref on the page.
*
* If someone else has a reference to the page, it will be freed
* when they drop their ref. Abuse temporary page flag to accomplish
* this. Retry once if there is an inflated ref count.
*/
SetHPageTemporary(new_page);
if (!put_page_testzero(new_page)) {
if (alloc_retry)
return -EBUSY;
alloc_retry = true;
goto alloc_retry;
}
ClearHPageTemporary(new_page);
__prep_new_huge_page(h, new_page);
retry:
spin_lock_irq(&hugetlb_lock);
if (!PageHuge(old_page)) {
/*
* Freed from under us. Drop new_page too.
*/
goto free_new;
} else if (page_count(old_page)) {
/*
* Someone has grabbed the page, try to isolate it here.
* Fail with -EBUSY if not possible.
*/
spin_unlock_irq(&hugetlb_lock);
if (!isolate_huge_page(old_page, list))
ret = -EBUSY;
spin_lock_irq(&hugetlb_lock);
goto free_new;
} else if (!HPageFreed(old_page)) {
/*
* Page's refcount is 0 but it has not been enqueued in the
* freelist yet. Race window is small, so we can succeed here if
* we retry.
*/
spin_unlock_irq(&hugetlb_lock);
cond_resched();
goto retry;
} else {
/*
* Ok, old_page is still a genuine free hugepage. Remove it from
* the freelist and decrease the counters. These will be
* incremented again when calling __prep_account_new_huge_page()
* and enqueue_huge_page() for new_page. The counters will remain
* stable since this happens under the lock.
*/
remove_hugetlb_page(h, old_page, false);
/*
* Ref count on new page is already zero as it was dropped
* earlier. It can be directly added to the pool free list.
*/
__prep_account_new_huge_page(h, nid);
enqueue_huge_page(h, new_page);
/*
* Pages have been replaced, we can safely free the old one.
*/
spin_unlock_irq(&hugetlb_lock);
update_and_free_page(h, old_page, false);
}
return ret;
free_new:
spin_unlock_irq(&hugetlb_lock);
/* Page has a zero ref count, but needs a ref to be freed */
set_page_refcounted(new_page);
update_and_free_page(h, new_page, false);
return ret;
}
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
{
struct hstate *h;
struct page *head;
int ret = -EBUSY;
/*
* The page might have been dissolved from under our feet, so make sure
* to carefully check the state under the lock.
* Return success when racing as if we dissolved the page ourselves.
*/
spin_lock_irq(&hugetlb_lock);
if (PageHuge(page)) {
head = compound_head(page);
h = page_hstate(head);
} else {
spin_unlock_irq(&hugetlb_lock);
return 0;
}
spin_unlock_irq(&hugetlb_lock);
/*
* Fence off gigantic pages as there is a cyclic dependency between
* alloc_contig_range and them. Return -ENOMEM as this has the effect
* of bailing out right away without further retrying.
*/
if (hstate_is_gigantic(h))
return -ENOMEM;
if (page_count(head) && isolate_huge_page(head, list))
ret = 0;
else if (!page_count(head))
ret = alloc_and_dissolve_huge_page(h, head, list);
return ret;
}
struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
long map_chg, map_commit;
long gbl_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg;
bool deferred_reserve;
idx = hstate_index(h);
/*
* Examine the region/reserve map to determine if the process
* has a reservation for the page to be allocated. A return
* code of zero indicates a reservation exists (no change).
*/
map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
if (map_chg < 0)
return ERR_PTR(-ENOMEM);
/*
* Processes that did not create the mapping will have no
* reserves as indicated by the region/reserve map. Check
* that the allocation will not exceed the subpool limit.
* Allocations for MAP_NORESERVE mappings also need to be
* checked against any subpool limit.
*/
if (map_chg || avoid_reserve) {
gbl_chg = hugepage_subpool_get_pages(spool, 1);
if (gbl_chg < 0) {
vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
/*
* Even though there was no reservation in the region/reserve
* map, there could be reservations associated with the
* subpool that can be used. This would be indicated if the
* return value of hugepage_subpool_get_pages() is zero.
* However, if avoid_reserve is specified we still avoid even
* the subpool reservations.
*/
if (avoid_reserve)
gbl_chg = 1;
}
/* If this allocation is not consuming a reservation, charge it now.
*/
deferred_reserve = map_chg || avoid_reserve;
if (deferred_reserve) {
ret = hugetlb_cgroup_charge_cgroup_rsvd(
idx, pages_per_huge_page(h), &h_cg);
if (ret)
goto out_subpool_put;
}
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
if (ret)
goto out_uncharge_cgroup_reservation;
spin_lock_irq(&hugetlb_lock);
/*
* glb_chg is passed to indicate whether or not a page must be taken
* from the global free pool (global change). gbl_chg == 0 indicates
* a reservation exists for the allocation.
*/
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock_irq(&hugetlb_lock);
page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
SetHPageRestoreReserve(page);
h->resv_huge_pages--;
}
spin_lock_irq(&hugetlb_lock);
list_add(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
/* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
*/
if (deferred_reserve) {
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
h_cg, page);
}
spin_unlock_irq(&hugetlb_lock);
hugetlb_set_page_subpool(page, spool);
map_commit = vma_commit_reservation(h, vma, addr);
if (unlikely(map_chg > map_commit)) {
/*
* The page was added to the reservation map between
* vma_needs_reservation and vma_commit_reservation.
* This indicates a race with hugetlb_reserve_pages.
* Adjust for the subpool count incremented above AND
* in hugetlb_reserve_pages for the same page. Also,
* the reservation count added in hugetlb_reserve_pages
* no longer applies.
*/
long rsv_adjust;
rsv_adjust = hugepage_subpool_put_pages(spool, 1);
hugetlb_acct_memory(h, -rsv_adjust);
if (deferred_reserve)
hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
pages_per_huge_page(h), page);
}
return page;
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_uncharge_cgroup_reservation:
if (deferred_reserve)
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
h_cg);
out_subpool_put:
if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
int alloc_bootmem_huge_page(struct hstate *h)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
int __alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
int nr_nodes, node;
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
addr = memblock_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
if (addr) {
/*
* Use the beginning of the huge page to store the
* huge_bootmem_page struct (until gather_bootmem
* puts them into the mem_map).
*/
m = addr;
goto found;
}
}
return 0;
found:
BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
/* Put them into a private list first because mem_map is not up yet */
INIT_LIST_HEAD(&m->list);
list_add(&m->list, &huge_boot_pages);
m->hstate = h;
return 1;
}
/*
* Put bootmem huge pages into the standard lists after mem_map is up.
* Note: This only applies to gigantic (order > MAX_ORDER) pages.
*/
static void __init gather_bootmem_prealloc(void)
{
struct huge_bootmem_page *m;
list_for_each_entry(m, &huge_boot_pages, list) {
struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
VM_BUG_ON(!hstate_is_gigantic(h));
WARN_ON(page_count(page) != 1);
if (prep_compound_gigantic_page(page, huge_page_order(h))) {
WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
put_page(page); /* add to the hugepage allocator */
} else {
/* VERY unlikely inflated ref count on a tail page */
free_gigantic_page(page, huge_page_order(h));
}
/*
* We need to restore the 'stolen' pages to totalram_pages
* in order to fix confusing memory reports from free(1) and
* other side-effects, like CommitLimit going negative.
*/
adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
}
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
nodemask_t *node_alloc_noretry;
if (!hstate_is_gigantic(h)) {
/*
* Bit mask controlling how hard we retry per-node allocations.
* Ignore errors as lower level routines can deal with
* node_alloc_noretry == NULL. If this kmalloc fails at boot
* time, we are likely in bigger trouble.
*/
node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
GFP_KERNEL);
} else {
/* allocations done at boot time */
node_alloc_noretry = NULL;
}
/* bit mask controlling how hard we retry per-node allocations */
if (node_alloc_noretry)
nodes_clear(*node_alloc_noretry);
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
if (hugetlb_cma_size) {
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
goto free;
}
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_pool_huge_page(h,
&node_states[N_MEMORY],
node_alloc_noretry))
break;
cond_resched();
}
if (i < h->max_huge_pages) {
char buf[32];
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
free:
kfree(node_alloc_noretry);
}
static void __init hugetlb_init_hstates(void)
{
struct hstate *h;
for_each_hstate(h) {
if (minimum_order > huge_page_order(h))
minimum_order = huge_page_order(h);
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
}
VM_BUG_ON(minimum_order == UINT_MAX);
}
static void __init report_hugepages(void)
{
struct hstate *h;
for_each_hstate(h) {
char buf[32];
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
buf, h->free_huge_pages);
}
}
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(struct hstate *h, unsigned long count,
nodemask_t *nodes_allowed)
{
int i;
LIST_HEAD(page_list);
lockdep_assert_held(&hugetlb_lock);
if (hstate_is_gigantic(h))
return;
/*
* Collect pages to be freed on a list, and free after dropping lock
*/
for_each_node_mask(i, *nodes_allowed) {
struct page *page, *next;
struct list_head *freel = &h->hugepage_freelists[i];
list_for_each_entry_safe(page, next, freel, lru) {
if (count >= h->nr_huge_pages)
goto out;
if (PageHighMem(page))
continue;
remove_hugetlb_page(h, page, false);
list_add(&page->lru, &page_list);
}
}
out:
spin_unlock_irq(&hugetlb_lock);
update_and_free_pages_bulk(h, &page_list);
spin_lock_irq(&hugetlb_lock);
}
#else
static inline void try_to_free_low(struct hstate *h, unsigned long count,
nodemask_t *nodes_allowed)
{
}
#endif
/*
* Increment or decrement surplus_huge_pages. Keep node-specific counters
* balanced by operating on them in a round-robin fashion.
* Returns 1 if an adjustment was made.
*/
static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
int delta)
{
int nr_nodes, node;
lockdep_assert_held(&hugetlb_lock);
VM_BUG_ON(delta != -1 && delta != 1);
if (delta < 0) {
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
if (h->surplus_huge_pages_node[node])
goto found;
}
} else {
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
if (h->surplus_huge_pages_node[node] <
h->nr_huge_pages_node[node])
goto found;
}
}
return 0;
found:
h->surplus_huge_pages += delta;
h->surplus_huge_pages_node[node] += delta;
return 1;
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
struct page *page;
LIST_HEAD(page_list);
NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
/*
* Bit mask controlling how hard we retry per-node allocations.
* If we can not allocate the bit mask, do not attempt to allocate
* the requested huge pages.
*/
if (node_alloc_noretry)
nodes_clear(*node_alloc_noretry);
else
return -ENOMEM;
/*
* resize_lock mutex prevents concurrent adjustments to number of
* pages in hstate via the proc/sysfs interfaces.
*/
mutex_lock(&h->resize_lock);
flush_free_hpage_work(h);
spin_lock_irq(&hugetlb_lock);
/*
* Check for a node specific request.
* Changing node specific huge page count may require a corresponding
* change to the global count. In any case, the passed node mask
* (nodes_allowed) will restrict alloc/free to the specified node.
*/
if (nid != NUMA_NO_NODE) {
unsigned long old_count = count;
count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
/*
* User may have specified a large count value which caused the
* above calculation to overflow. In this case, they wanted
* to allocate as many huge pages as possible. Set count to
* largest possible value to align with their intention.
*/
if (count < old_count)
count = ULONG_MAX;
}
/*
* Gigantic pages runtime allocation depend on the capability for large
* page range allocation.
* If the system does not provide this feature, return an error when
* the user tries to allocate gigantic pages but let the user free the
* boottime allocated gigantic pages.
*/
if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
if (count > persistent_huge_pages(h)) {
spin_unlock_irq(&hugetlb_lock);
mutex_unlock(&h->resize_lock);
NODEMASK_FREE(node_alloc_noretry);
return -EINVAL;
}
/* Fall through to decrease pool */
}
/*
* Increase the pool size
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*
* We might race with alloc_surplus_huge_page() here and be unable
* to convert a surplus huge page to a normal huge page. That is
* not critical, though, it just means the overall size of the
* pool might be one hugepage larger than it needs to be, but
* within all the constraints specified by the sysctls.
*/
while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, -1))
break;
}
while (count > persistent_huge_pages(h)) {
/*
* If this allocation races such that we no longer need the
* page, free_huge_page will handle it by freeing the page
* and reducing the surplus.
*/
spin_unlock_irq(&hugetlb_lock);
/* yield cpu to avoid soft lockup */
cond_resched();
ret = alloc_pool_huge_page(h, nodes_allowed,
node_alloc_noretry);
spin_lock_irq(&hugetlb_lock);
if (!ret)
goto out;
/* Bail for signals. Probably ctrl-c from user */
if (signal_pending(current))
goto out;
}
/*
* Decrease the pool size
* First return free pages to the buddy allocator (being careful
* to keep enough around to satisfy reservations). Then place
* pages into surplus state as needed so the pool will shrink
* to the desired size as pages become free.
*
* By placing pages into the surplus state independent of the
* overcommit value, we are allowing the surplus pool size to
* exceed overcommit. There are few sane options here. Since
* alloc_surplus_huge_page() is checking the global counter,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
*/
min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
min_count = max(count, min_count);
try_to_free_low(h, min_count, nodes_allowed);
/*
* Collect pages to be removed on list without dropping lock
*/
while (min_count < persistent_huge_pages(h)) {
page = remove_pool_huge_page(h, nodes_allowed, 0);
if (!page)
break;
list_add(&page->lru, &page_list);
}
/* free the pages after dropping lock */
spin_unlock_irq(&hugetlb_lock);
update_and_free_pages_bulk(h, &page_list);
flush_free_hpage_work(h);
spin_lock_irq(&hugetlb_lock);
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, 1))
break;
}
out:
h->max_huge_pages = persistent_huge_pages(h);
spin_unlock_irq(&hugetlb_lock);
mutex_unlock(&h->resize_lock);
NODEMASK_FREE(node_alloc_noretry);
return 0;
}
#define HSTATE_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define HSTATE_ATTR(_name) \
static struct kobj_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
static struct kobject *hugepages_kobj;
static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
{
int i;
for (i = 0; i < HUGE_MAX_HSTATE; i++)
if (hstate_kobjs[i] == kobj) {
if (nidp)
*nidp = NUMA_NO_NODE;
return &hstates[i];
}
return kobj_to_node_hstate(kobj, nidp);
}
static ssize_t nr_hugepages_show_common(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h;
unsigned long nr_huge_pages;
int nid;
h = kobj_to_hstate(kobj, &nid);
if (nid == NUMA_NO_NODE)
nr_huge_pages = h->nr_huge_pages;
else
nr_huge_pages = h->nr_huge_pages_node[nid];
return sysfs_emit(buf, "%lu\n", nr_huge_pages);
}
static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
struct hstate *h, int nid,
unsigned long count, size_t len)
{
int err;
nodemask_t nodes_allowed, *n_mask;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return -EINVAL;
if (nid == NUMA_NO_NODE) {
/*
* global hstate attribute
*/
if (!(obey_mempolicy &&
init_nodemask_of_mempolicy(&nodes_allowed)))
n_mask = &node_states[N_MEMORY];
else
n_mask = &nodes_allowed;
} else {
/*
* Node specific request. count adjustment happens in
* set_max_huge_pages() after acquiring hugetlb_lock.
*/
init_nodemask_of_node(&nodes_allowed, nid);
n_mask = &nodes_allowed;
}
err = set_max_huge_pages(h, count, nid, n_mask);
return err ? err : len;
}
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
struct kobject *kobj, const char *buf,
size_t len)
{
struct hstate *h;
unsigned long count;
int nid;
int err;
err = kstrtoul(buf, 10, &count);
if (err)
return err;
h = kobj_to_hstate(kobj, &nid);
return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
}
static ssize_t nr_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return nr_hugepages_show_common(kobj, attr, buf);
}
static ssize_t nr_hugepages_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t len)
{
return nr_hugepages_store_common(false, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages);
#ifdef CONFIG_NUMA
/*
* hstate attribute for optionally mempolicy-based constraint on persistent
* huge page alloc/free.
*/
static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return nr_hugepages_show_common(kobj, attr, buf);
}
static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t len)
{
return nr_hugepages_store_common(true, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages_mempolicy);
#endif
static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h = kobj_to_hstate(kobj, NULL);
return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
}
static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
int err;
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
if (hstate_is_gigantic(h))
return -EINVAL;
err = kstrtoul(buf, 10, &input);
if (err)
return err;
spin_lock_irq(&hugetlb_lock);
h->nr_overcommit_huge_pages = input;
spin_unlock_irq(&hugetlb_lock);
return count;
}
HSTATE_ATTR(nr_overcommit_hugepages);
static ssize_t free_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h;
unsigned long free_huge_pages;
int nid;
h = kobj_to_hstate(kobj, &nid);
if (nid == NUMA_NO_NODE)
free_huge_pages = h->free_huge_pages;
else
free_huge_pages = h->free_huge_pages_node[nid];
return sysfs_emit(buf, "%lu\n", free_huge_pages);
}
HSTATE_ATTR_RO(free_hugepages);
static ssize_t resv_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h = kobj_to_hstate(kobj, NULL);
return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
}
HSTATE_ATTR_RO(resv_hugepages);
static ssize_t surplus_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h;
unsigned long surplus_huge_pages;
int nid;
h = kobj_to_hstate(kobj, &nid);
if (nid == NUMA_NO_NODE)
surplus_huge_pages = h->surplus_huge_pages;
else
surplus_huge_pages = h->surplus_huge_pages_node[nid];
return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
}
HSTATE_ATTR_RO(surplus_hugepages);
static struct attribute *hstate_attrs[] = {
&nr_hugepages_attr.attr,
&nr_overcommit_hugepages_attr.attr,
&free_hugepages_attr.attr,
&resv_hugepages_attr.attr,
&surplus_hugepages_attr.attr,
#ifdef CONFIG_NUMA
&nr_hugepages_mempolicy_attr.attr,
#endif
NULL,
};
static const struct attribute_group hstate_attr_group = {
.attrs = hstate_attrs,
};
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
struct kobject **hstate_kobjs,
const struct attribute_group *hstate_attr_group)
{
int retval;
int hi = hstate_index(h);
hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
if (!hstate_kobjs[hi])
return -ENOMEM;
retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
if (retval) {
kobject_put(hstate_kobjs[hi]);
hstate_kobjs[hi] = NULL;
}
return retval;
}
static void __init hugetlb_sysfs_init(void)
{
struct hstate *h;
int err;
hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
if (!hugepages_kobj)
return;
for_each_hstate(h) {
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
pr_err("HugeTLB: Unable to add hstate %s", h->name);
}
}
#ifdef CONFIG_NUMA
/*
* node_hstate/s - associate per node hstate attributes, via their kobjects,
* with node devices in node_devices[] using a parallel array. The array
* index of a node device or _hstate == node id.
* This is here to avoid any static dependency of the node device driver, in
* the base kernel, on the hugetlb module.
*/
struct node_hstate {
struct kobject *hugepages_kobj;
struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
};
static struct node_hstate node_hstates[MAX_NUMNODES];
/*
* A subset of global hstate attributes for node devices
*/
static struct attribute *per_node_hstate_attrs[] = {
&nr_hugepages_attr.attr,
&free_hugepages_attr.attr,
&surplus_hugepages_attr.attr,
NULL,
};
static const struct attribute_group per_node_hstate_attr_group = {
.attrs = per_node_hstate_attrs,
};
/*
* kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
* Returns node id via non-NULL nidp.
*/
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
{
int nid;
for (nid = 0; nid < nr_node_ids; nid++) {
struct node_hstate *nhs = &node_hstates[nid];
int i;
for (i = 0; i < HUGE_MAX_HSTATE; i++)
if (nhs->hstate_kobjs[i] == kobj) {
if (nidp)
*nidp = nid;
return &hstates[i];
}
}
BUG();
return NULL;
}
/*
* Unregister hstate attributes from a single node device.
* No-op if no hstate attributes attached.
*/
static void hugetlb_unregister_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
if (!nhs->hugepages_kobj)
return; /* no hstate attributes */
for_each_hstate(h) {
int idx = hstate_index(h);
if (nhs->hstate_kobjs[idx]) {
kobject_put(nhs->hstate_kobjs[idx]);
nhs->hstate_kobjs[idx] = NULL;
}
}
kobject_put(nhs->hugepages_kobj);
nhs->hugepages_kobj = NULL;
}
/*
* Register hstate attributes for a single node device.
* No-op if attributes already registered.
*/
static void hugetlb_register_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
int err;
if (nhs->hugepages_kobj)
return; /* already allocated */
nhs->hugepages_kobj = kobject_create_and_add("hugepages",
&node->dev.kobj);
if (!nhs->hugepages_kobj)
return;
for_each_hstate(h) {
err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
nhs->hstate_kobjs,
&per_node_hstate_attr_group);
if (err) {
pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
h->name, node->dev.id);
hugetlb_unregister_node(node);
break;
}
}
}
/*
* hugetlb init time: register hstate attributes for all registered node
* devices of nodes that have memory. All on-line nodes should have
* registered their associated device by this time.
*/
static void __init hugetlb_register_all_nodes(void)
{
int nid;
for_each_node_state(nid, N_MEMORY) {
struct node *node = node_devices[nid];
if (node->dev.id == nid)
hugetlb_register_node(node);
}
/*
* Let the node device driver know we're here so it can
* [un]register hstate attributes on node hotplug.
*/
register_hugetlbfs_with_node(hugetlb_register_node,
hugetlb_unregister_node);
}
#else /* !CONFIG_NUMA */
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
{
BUG();
if (nidp)
*nidp = -1;
return NULL;
}
static void hugetlb_register_all_nodes(void) { }
#endif
static int __init hugetlb_init(void)
{
int i;
BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
__NR_HPAGEFLAGS);
if (!hugepages_supported()) {
if (hugetlb_max_hstate || default_hstate_max_huge_pages)
pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
return 0;
}
/*
* Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
* architectures depend on setup being done here.
*/
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
if (!parsed_default_hugepagesz) {
/*
* If we did not parse a default huge page size, set
* default_hstate_idx to HPAGE_SIZE hstate. And, if the
* number of huge pages for this default size was implicitly
* specified, set that here as well.
* Note that the implicit setting will overwrite an explicit
* setting. A warning will be printed in this case.
*/
default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
if (default_hstate_max_huge_pages) {
if (default_hstate.max_huge_pages) {
char buf[32];
string_get_size(huge_page_size(&default_hstate),
1, STRING_UNITS_2, buf, 32);
pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
default_hstate.max_huge_pages, buf);
pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
default_hstate_max_huge_pages);
}
default_hstate.max_huge_pages =
default_hstate_max_huge_pages;
}
}
hugetlb_cma_check();
hugetlb_init_hstates();
gather_bootmem_prealloc();
report_hugepages();
hugetlb_sysfs_init();
hugetlb_register_all_nodes();
hugetlb_cgroup_file_init();
#ifdef CONFIG_SMP
num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
#else
num_fault_mutexes = 1;
#endif
hugetlb_fault_mutex_table =
kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
GFP_KERNEL);
BUG_ON(!hugetlb_fault_mutex_table);
for (i = 0; i < num_fault_mutexes; i++)
mutex_init(&hugetlb_fault_mutex_table[i]);
return 0;
}
subsys_initcall(hugetlb_init);
/* Overwritten by architectures with more huge page sizes */
bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
{
return size == HPAGE_SIZE;
}
void __init hugetlb_add_hstate(unsigned int order)
{
struct hstate *h;
unsigned long i;
if (size_to_hstate(PAGE_SIZE << order)) {
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
BUG_ON(order == 0);
h = &hstates[hugetlb_max_hstate++];
mutex_init(&h->resize_lock);
h->order = order;
h->mask = ~(huge_page_size(h) - 1);
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
h->next_nid_to_alloc = first_memory_node;
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
hugetlb_vmemmap_init(h);
parsed_hstate = h;
}
/*
* hugepages command line processing
* hugepages normally follows a valid hugepagsz or default_hugepagsz
* specification. If not, ignore the hugepages value. hugepages can also
* be the first huge page command line option in which case it implicitly
* specifies the number of huge pages for the default size.
*/
static int __init hugepages_setup(char *s)
{
unsigned long *mhp;
static unsigned long *last_mhp;
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
return 0;
}
/*
* !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
* yet, so this hugepages= parameter goes to the "default hstate".
* Otherwise, it goes with the previously parsed hugepagesz or
* default_hugepagesz.
*/
else if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
else
mhp = &parsed_hstate->max_huge_pages;
if (mhp == last_mhp) {
pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
return 0;
}
if (sscanf(s, "%lu", mhp) <= 0)
*mhp = 0;
/*
* Global state is always initialized later in hugetlb_init.
* But we need to allocate gigantic hstates here early to still
* use the bootmem allocator.
*/
if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
hugetlb_hstate_alloc_pages(parsed_hstate);
last_mhp = mhp;
return 1;
}
__setup("hugepages=", hugepages_setup);
/*
* hugepagesz command line processing
* A specific huge page size can only be specified once with hugepagesz.
* hugepagesz is followed by hugepages on the command line. The global
* variable 'parsed_valid_hugepagesz' is used to determine if prior
* hugepagesz argument was valid.
*/
static int __init hugepagesz_setup(char *s)
{
unsigned long size;
struct hstate *h;
parsed_valid_hugepagesz = false;
size = (unsigned long)memparse(s, NULL);
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
return 0;
}
h = size_to_hstate(size);
if (h) {
/*
* hstate for this size already exists. This is normally
* an error, but is allowed if the existing hstate is the
* default hstate. More specifically, it is only allowed if
* the number of huge pages for the default hstate was not
* previously specified.
*/
if (!parsed_default_hugepagesz || h != &default_hstate ||
default_hstate.max_huge_pages) {
pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
return 0;
}
/*
* No need to call hugetlb_add_hstate() as hstate already
* exists. But, do set parsed_hstate so that a following
* hugepages= parameter will be applied to this hstate.
*/
parsed_hstate = h;
parsed_valid_hugepagesz = true;
return 1;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
parsed_valid_hugepagesz = true;
return 1;
}
__setup("hugepagesz=", hugepagesz_setup);
/*
* default_hugepagesz command line input
* Only one instance of default_hugepagesz allowed on command line.
*/
static int __init default_hugepagesz_setup(char *s)
{
unsigned long size;
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
return 0;
}
size = (unsigned long)memparse(s, NULL);
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
return 0;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
parsed_valid_hugepagesz = true;
parsed_default_hugepagesz = true;
default_hstate_idx = hstate_index(size_to_hstate(size));
/*
* The number of default huge pages (for this size) could have been
* specified as the first hugetlb parameter: hugepages=X. If so,
* then default_hstate_max_huge_pages is set. If the default huge
* page size is gigantic (>= MAX_ORDER), then the pages must be
* allocated here from bootmem allocator.
*/
if (default_hstate_max_huge_pages) {
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
if (hstate_is_gigantic(&default_hstate))
hugetlb_hstate_alloc_pages(&default_hstate);
default_hstate_max_huge_pages = 0;
}
return 1;
}
__setup("default_hugepagesz=", default_hugepagesz_setup);
static unsigned int allowed_mems_nr(struct hstate *h)
{
int node;
unsigned int nr = 0;
nodemask_t *mpol_allowed;
unsigned int *array = h->free_huge_pages_node;
gfp_t gfp_mask = htlb_alloc_mask(h);
mpol_allowed = policy_nodemask_current(gfp_mask);
for_each_node_mask(node, cpuset_current_mems_allowed) {
if (!mpol_allowed || node_isset(node, *mpol_allowed))
nr += array[node];
}
return nr;
}
#ifdef CONFIG_SYSCTL
static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
void *buffer, size_t *length,
loff_t *ppos, unsigned long *out)
{
struct ctl_table dup_table;
/*
* In order to avoid races with __do_proc_doulongvec_minmax(), we
* can duplicate the @table and alter the duplicate of it.
*/
dup_table = *table;
dup_table.data = out;
return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
}
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp = h->max_huge_pages;
int ret;
if (!hugepages_supported())
return -EOPNOTSUPP;
ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
&tmp);
if (ret)
goto out;
if (write)
ret = __nr_hugepages_store_common(obey_mempolicy, h,
NUMA_NO_NODE, tmp, *length);
out:
return ret;
}
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(false, table, write,
buffer, length, ppos);
}
#ifdef CONFIG_NUMA
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(true, table, write,
buffer, length, ppos);
}
#endif /* CONFIG_NUMA */
int hugetlb_overcommit_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp;
int ret;
if (!hugepages_supported())
return -EOPNOTSUPP;
tmp = h->nr_overcommit_huge_pages;
if (write && hstate_is_gigantic(h))
return -EINVAL;
ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
&tmp);
if (ret)
goto out;
if (write) {
spin_lock_irq(&hugetlb_lock);
h->nr_overcommit_huge_pages = tmp;
spin_unlock_irq(&hugetlb_lock);
}
out:
return ret;
}
#endif /* CONFIG_SYSCTL */
void hugetlb_report_meminfo(struct seq_file *m)
{
struct hstate *h;
unsigned long total = 0;
if (!hugepages_supported())
return;
for_each_hstate(h) {
unsigned long count = h->nr_huge_pages;
total += huge_page_size(h) * count;
if (h == &default_hstate)
seq_printf(m,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
"HugePages_Rsvd: %5lu\n"
"HugePages_Surp: %5lu\n"
"Hugepagesize: %8lu kB\n",
count,
h->free_huge_pages,
h->resv_huge_pages,
h->surplus_huge_pages,
huge_page_size(h) / SZ_1K);
}
seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K);
}
int hugetlb_report_node_meminfo(char *buf, int len, int nid)
{
struct hstate *h = &default_hstate;
if (!hugepages_supported())
return 0;
return sysfs_emit_at(buf, len,
"Node %d HugePages_Total: %5u\n"
"Node %d HugePages_Free: %5u\n"
"Node %d HugePages_Surp: %5u\n",
nid, h->nr_huge_pages_node[nid],
nid, h->free_huge_pages_node[nid],
nid, h->surplus_huge_pages_node[nid]);
}
void hugetlb_show_meminfo(void)
{
struct hstate *h;
int nid;
if (!hugepages_supported())
return;
for_each_node_state(nid, N_MEMORY)
for_each_hstate(h)
pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
nid,
h->nr_huge_pages_node[nid],
h->free_huge_pages_node[nid],
h->surplus_huge_pages_node[nid],
huge_page_size(h) / SZ_1K);
}
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
{
seq_printf(m, "HugetlbPages:\t%8lu kB\n",
atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
}
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
struct hstate *h;
unsigned long nr_total_pages = 0;
for_each_hstate(h)
nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
return nr_total_pages;
}
static int hugetlb_acct_memory(struct hstate *h, long delta)
{
int ret = -ENOMEM;
if (!delta)
return 0;
spin_lock_irq(&hugetlb_lock);
/*
* When cpuset is configured, it breaks the strict hugetlb page
* reservation as the accounting is done on a global variable. Such
* reservation is completely rubbish in the presence of cpuset because
* the reservation is not checked against page availability for the
* current cpuset. Application can still potentially OOM'ed by kernel
* with lack of free htlb page in cpuset that the task is in.
* Attempt to enforce strict accounting with cpuset is almost
* impossible (or too ugly) because cpuset is too fluid that
* task or memory node can be dynamically moved between cpusets.
*
* The change of semantics for shared hugetlb mapping with cpuset is
* undesirable. However, in order to preserve some of the semantics,
* we fall back to check against current free page availability as
* a best attempt and hopefully to minimize the impact of changing
* semantics that cpuset has.
*
* Apart from cpuset, we also have memory policy mechanism that
* also determines from which node the kernel will allocate memory
* in a NUMA system. So similar to cpuset, we also should consider
* the memory policy of the current task. Similar to the description
* above.
*/
if (delta > 0) {
if (gather_surplus_pages(h, delta) < 0)
goto out;
if (delta > allowed_mems_nr(h)) {
return_unused_surplus_pages(h, delta);
goto out;
}
}
ret = 0;
if (delta < 0)
return_unused_surplus_pages(h, (unsigned long) -delta);
out:
spin_unlock_irq(&hugetlb_lock);
return ret;
}
static void hugetlb_vm_op_open(struct vm_area_struct *vma)
{
struct resv_map *resv = vma_resv_map(vma);
/*
* This new VMA should share its siblings reservation map if present.
* The VMA will only ever have a valid reservation map pointer where
* it is being copied for another still existing VMA. As that VMA
* has a reference to the reservation map it cannot disappear until
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
kref_get(&resv->refs);
}
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
struct resv_map *resv = vma_resv_map(vma);
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
long gbl_reserve;
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
start = vma_hugecache_offset(h, vma, vma->vm_start);
end = vma_hugecache_offset(h, vma, vma->vm_end);
reserve = (end - start) - region_count(resv, start, end);
hugetlb_cgroup_uncharge_counter(resv, start, end);
if (reserve) {
/*
* Decrement reserve counts. The global reserve count may be
* adjusted if the subpool has a minimum size.
*/
gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
hugetlb_acct_memory(h, -gbl_reserve);
}
kref_put(&resv->refs, resv_map_release);
}
static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
if (addr & ~(huge_page_mask(hstate_vma(vma))))
return -EINVAL;
return 0;
}
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
{
return huge_page_size(hstate_vma(vma));
}
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
* handle_mm_fault() to try to instantiate regular-sized pages in the
* hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get
* this far.
*/
static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
{
BUG();
return 0;
}
/*
* When a new function is introduced to vm_operations_struct and added
* to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
* This is because under System V memory model, mappings created via
* shmget/shmat with "huge page" specified are backed by hugetlbfs files,
* their original vm_ops are overwritten with shm_vm_ops.
*/
const struct vm_operations_struct hugetlb_vm_ops = {
.fault = hugetlb_vm_op_fault,
.open = hugetlb_vm_op_open,
.close = hugetlb_vm_op_close,
.may_split = hugetlb_vm_op_split,
.pagesize = hugetlb_vm_op_pagesize,
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
int writable)
{
pte_t entry;
unsigned int shift = huge_page_shift(hstate_vma(vma));
if (writable) {
entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
vma->vm_page_prot)));
} else {
entry = huge_pte_wrprotect(mk_huge_pte(page,
vma->vm_page_prot));
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
return entry;
}
static void set_huge_ptep_writable(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
pte_t entry;
entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
update_mmu_cache(vma, address, ptep);
}
bool is_hugetlb_entry_migration(pte_t pte)
{
swp_entry_t swp;
if (huge_pte_none(pte) || pte_present(pte))
return false;
swp = pte_to_swp_entry(pte);
if (is_migration_entry(swp))
return true;
else
return false;
}
static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
{
swp_entry_t swp;
if (huge_pte_none(pte) || pte_present(pte))
return false;
swp = pte_to_swp_entry(pte);
if (is_hwpoison_entry(swp))
return true;
else
return false;
}
static void
hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
struct page *new_page)
{
__SetPageUptodate(new_page);
set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
hugepage_add_new_anon_rmap(new_page, vma, addr);
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
ClearHPageRestoreReserve(new_page);
SetHPageMigratable(new_page);
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
bool cow = is_cow_mapping(vma->vm_flags);
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
unsigned long npages = pages_per_huge_page(h);
struct address_space *mapping = vma->vm_file->f_mapping;
struct mmu_notifier_range range;
int ret = 0;
if (cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
vma->vm_start,
vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
} else {
/*
* For shared mappings i_mmap_rwsem must be held to call
* huge_pte_alloc, otherwise the returned ptep could go
* away if part of a shared pmd and another thread calls
* huge_pmd_unshare.
*/
i_mmap_lock_read(mapping);
}
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
dst_pte = huge_pte_alloc(dst, vma, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
break;
}
/*
* If the pagetables are shared don't copy or take references.
* dst_pte == src_pte is the common case of src/dest sharing.
*
* However, src could have 'unshared' and dst shares with
* another vma. If dst_pte !none, this implies sharing.
* Check here before taking page table lock, and once again
* after taking the lock below.
*/
dst_entry = huge_ptep_get(dst_pte);
if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
continue;
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
dst_entry = huge_ptep_get(dst_pte);
again:
if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
/*
* Skip if src entry none. Also, skip in the
* unlikely case dst entry !none as this implies
* sharing with another vma.
*/
;
} else if (unlikely(is_hugetlb_entry_migration(entry) ||
is_hugetlb_entry_hwpoisoned(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
if (is_writable_migration_entry(swp_entry) && cow) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
swp_entry = make_readable_migration_entry(
swp_offset(swp_entry));
entry = swp_entry_to_pte(swp_entry);
set_huge_swap_pte_at(src, addr, src_pte,
entry, sz);
}
set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
} else {
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
/*
* This is a rare case where we see pinned hugetlb
* pages while they're prone to COW. We need to do the
* COW earlier during fork.
*
* When pre-allocating the page or copying data, we
* need to be without the pgtable locks since we could
* sleep during the process.
*/
if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
pte_t src_pte_old = entry;
struct page *new;
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
new = alloc_huge_page(vma, addr, 1);
if (IS_ERR(new)) {
put_page(ptepage);
ret = PTR_ERR(new);
break;
}
copy_user_huge_page(new, ptepage, addr, vma,
npages);
put_page(ptepage);
/* Install the new huge page if src pte stable */
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
restore_reserve_on_error(h, vma, addr,
new);
put_page(new);
/* dst_entry won't change as in child */
goto again;
}
hugetlb_install_page(vma, dst_pte, addr, new);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
continue;
}
if (cow) {
/*
* No need to notify as we are downgrading page
* table protection not changing it to point
* to a new page.
*
* See Documentation/vm/mmu_notifier.rst
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
entry = huge_pte_wrprotect(entry);
}
page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
hugetlb_count_add(npages, dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
}
if (cow)
mmu_notifier_invalidate_range_end(&range);
else
i_mmap_unlock_read(mapping);
return ret;
}
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct page *ref_page)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *ptep;
pte_t pte;
spinlock_t *ptl;
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mmu_notifier_range range;
bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
/*
* This is a hugetlb vma, all the pte entries should point
* to huge page.
*/
tlb_change_page_size(tlb, sz);
tlb_start_vma(tlb, vma);
/*
* If sharing possible, alert mmu notifiers of worst case.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
mmu_notifier_invalidate_range_start(&range);
address = start;
for (; address < end; address += sz) {
ptep = huge_pte_offset(mm, address, sz);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, vma, &address, ptep)) {
spin_unlock(ptl);
tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
force_flush = true;
continue;
}
pte = huge_ptep_get(ptep);
if (huge_pte_none(pte)) {
spin_unlock(ptl);
continue;
}
/*
* Migrating hugepage or HWPoisoned hugepage is already
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
}
page = pte_page(pte);
/*
* If a reference page is supplied, it is because a specific
* page is being unmapped, not a range. Ensure the page we
* are about to unmap is the actual page of interest.
*/
if (ref_page) {
if (page != ref_page) {
spin_unlock(ptl);
continue;
}
/*
* Mark the VMA as having unmapped its page so that
* future faults in this VMA will fail rather than
* looking like data was lost
*/
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, true);
spin_unlock(ptl);
tlb_remove_page_size(tlb, page, huge_page_size(h));
/*
* Bail out after unmapping reference page if supplied
*/
if (ref_page)
break;
}
mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
/*
* If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
* could defer the flush until now, since by holding i_mmap_rwsem we
* guaranteed that the last refernece would not be dropped. But we must
* do the flushing before we return, as otherwise i_mmap_rwsem will be
* dropped and the last reference to the shared PMDs page might be
* dropped as well.
*
* In theory we could defer the freeing of the PMD pages as well, but
* huge_pmd_unshare() relies on the exact page_count for the PMD page to
* detect sharing, so we cannot defer the release of the page either.
* Instead, do flush now.
*/
if (force_flush)
tlb_flush_mmu_tlbonly(tlb);
}
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
__unmap_hugepage_range(tlb, vma, start, end, ref_page);
/*
* Clear this flag so that x86's huge_pmd_share page_table_shareable
* test will fail on a vma being torn down, and not grab a page table
* on its way out. We're lucky that the flag has such an appropriate
* name, and can in fact be safely cleared here. We could clear it
* before the __unmap_hugepage_range above, but all that's necessary
* is to clear it before releasing the i_mmap_rwsem. This works
* because in the context this is called, the VMA is about to be
* destroyed and the i_mmap_rwsem is held.
*/
vma->vm_flags &= ~VM_MAYSHARE;
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
struct mmu_gather tlb;
tlb_gather_mmu(&tlb, vma->vm_mm);
__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
tlb_finish_mmu(&tlb);
}
/*
* This is called when the original mapper is failing to COW a MAP_PRIVATE
* mapping it owns the reserve page for. The intention is to unmap the page
* from other VMAs and let the children be SIGKILLed if they are faulting the
* same region.
*/
static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page, unsigned long address)
{
struct hstate *h = hstate_vma(vma);
struct vm_area_struct *iter_vma;
struct address_space *mapping;
pgoff_t pgoff;
/*
* vm_pgoff is in PAGE_SIZE units, hence the different calculation
* from page cache lookup which is in HPAGE_SIZE units.
*/
address = address & huge_page_mask(h);
pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
mapping = vma->vm_file->f_mapping;
/*
* Take the mapping lock for the duration of the table walk. As
* this mapping should be shared between all the VMAs,
* __unmap_hugepage_range() is called as the lock is already held
*/
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
/* Do not unmap the current VMA */
if (iter_vma == vma)
continue;
/*
* Shared VMAs have their own reserves and do not affect
* MAP_PRIVATE accounting but it is possible that a shared
* VMA is using the same page so check and skip such VMAs.
*/
if (iter_vma->vm_flags & VM_MAYSHARE)
continue;
/*
* Unmap the page from other VMAs without their own reserves.
* They get marked to be SIGKILLed if they fault in these
* areas. This is because a future no-page fault on this VMA
* could insert a zeroed page instead of the data existing
* from the time of fork. This would look like data corruption
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
unmap_hugepage_range(iter_vma, address,
address + huge_page_size(h), page);
}
i_mmap_unlock_write(mapping);
}
/*
* Hugetlb_cow() should be called with page lock of the original hugepage held.
* Called with hugetlb_instantiation_mutex held and pte_page locked so we
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
struct page *pagecache_page, spinlock_t *ptl)
{
pte_t pte;
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
int outside_reserve = 0;
vm_fault_t ret = 0;
unsigned long haddr = address & huge_page_mask(h);
struct mmu_notifier_range range;
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
page_move_anon_rmap(old_page, vma);
set_huge_ptep_writable(vma, haddr, ptep);
return 0;
}
/*
* If the process that created a MAP_PRIVATE mapping is about to
* perform a COW due to a shared page count, attempt to satisfy
* the allocation without using the existing reserves. The pagecache
* page is used to determine if the reserve at this address was
* consumed or not. If reserves were used, a partial faulted mapping
* at the time of fork() could consume its reserves on COW instead
* of the full address range.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
old_page != pagecache_page)
outside_reserve = 1;
get_page(old_page);
/*
* Drop page table lock as buddy allocator may be called. It will
* be acquired again before returning to the caller, as expected.
*/
spin_unlock(ptl);
new_page = alloc_huge_page(vma, haddr, outside_reserve);
if (IS_ERR(new_page)) {
/*
* If a process owning a MAP_PRIVATE mapping fails to COW,
* it is due to references held by a child and an insufficient
* huge page pool. To guarantee the original mappers
* reliability, unmap the page from child processes. The child
* may get SIGKILLed if it later faults.
*/
if (outside_reserve) {
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx;
u32 hash;
put_page(old_page);
BUG_ON(huge_pte_none(pte));
/*
* Drop hugetlb_fault_mutex and i_mmap_rwsem before
* unmapping. unmapping needs to hold i_mmap_rwsem
* in write mode. Dropping i_mmap_rwsem in read mode
* here is OK as COW mappings do not interact with
* PMD sharing.
*
* Reacquire both after unmap operation.
*/
idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
unmap_ref_private(mm, vma, old_page, haddr);
i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
/*
* race occurs while re-acquiring page table
* lock, and our job is done.
*/
return 0;
}
ret = vmf_error(PTR_ERR(new_page));
goto out_release_old;
}
/*
* When the original hugepage is shared one, it does not have
* anon_vma prepared.
*/
if (unlikely(anon_vma_prepare(vma))) {
ret = VM_FAULT_OOM;
goto out_release_all;
}
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
haddr + huge_page_size(h));
mmu_notifier_invalidate_range_start(&range);
/*
* Retake the page table lock to check for racing updates
* before the page tables are altered
*/
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
ClearHPageRestoreReserve(new_page);
/* Break COW */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
set_huge_pte_at(mm, haddr, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
SetHPageMigratable(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
/* No restore in case of successful pagetable update (Break COW) */
if (new_page != old_page)
restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
out_release_old:
put_page(old_page);
spin_lock(ptl); /* Caller expects lock to be held */
return ret;
}
/* Return the pagecache page at a given address within a VMA */
static struct page *hugetlbfs_pagecache_page(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
struct address_space *mapping;
pgoff_t idx;
mapping = vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, vma, address);
return find_lock_page(mapping, idx);
}
/*
* Return whether there is a pagecache page to back given address within VMA.
* Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
*/
static bool hugetlbfs_pagecache_present(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
struct address_space *mapping;
pgoff_t idx;
struct page *page;
mapping = vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, vma, address);
page = find_get_page(mapping, idx);
if (page)
put_page(page);
return page != NULL;
}
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx)
{
struct inode *inode = mapping->host;
struct hstate *h = hstate_inode(inode);
int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
if (err)
return err;
ClearHPageRestoreReserve(page);
/*
* set page dirty so that it will not be removed from cache/file
* by non-hugetlbfs specific code paths.
*/
set_page_dirty(page);
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
return 0;
}
static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
struct address_space *mapping,
pgoff_t idx,
unsigned int flags,
unsigned long haddr,
unsigned long reason)
{
vm_fault_t ret;
u32 hash;
struct vm_fault vmf = {
.vma = vma,
.address = haddr,
.flags = flags,
/*
* Hard to debug if it ends up being
* used by a callee that assumes
* something about the other
* uninitialized fields... same as in
* memory.c
*/
};
/*
* hugetlb_fault_mutex and i_mmap_rwsem must be
* dropped before handling userfault. Reacquire
* after handling fault to make calling code simpler.
*/
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
ret = handle_userfault(&vmf, reason);
i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
return ret;
}
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
unsigned long address, pte_t *ptep, unsigned int flags)
{
struct hstate *h = hstate_vma(vma);
vm_fault_t ret = VM_FAULT_SIGBUS;
int anon_rmap = 0;
unsigned long size;
struct page *page;
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
bool new_page, new_pagecache_page = false;
/*
* Currently, we are forced to kill the process in the event the
* original mapper has unmapped pages from the child due to a failed
* COW. Warn that such a situation has occurred as it may not be obvious
*/
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
current->pid);
return ret;
}
/*
* We can not race with truncation due to holding i_mmap_rwsem.
* i_size is modified when holding i_mmap_rwsem, so check here
* once for faults beyond end of file.
*/
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto out;
retry:
new_page = false;
page = find_lock_page(mapping, idx);
if (!page) {
/* Check for page in userfault range */
if (userfaultfd_missing(vma)) {
ret = hugetlb_handle_userfault(vma, mapping, idx,
flags, haddr,
VM_UFFD_MISSING);
goto out;
}
page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
/*
* Returning error will result in faulting task being
* sent SIGBUS. The hugetlb fault mutex prevents two
* tasks from racing to fault in the same page which
* could result in false unable to allocate errors.
* Page migration does not take the fault mutex, but
* does a clear then write of pte's under page table
* lock. Page fault code could race with migration,
* notice the clear pte and try to allocate a page
* here. Before returning error, get ptl and make
* sure there really is no pte entry.
*/
ptl = huge_pte_lock(h, mm, ptep);
ret = 0;
if (huge_pte_none(huge_ptep_get(ptep)))
ret = vmf_error(PTR_ERR(page));
spin_unlock(ptl);
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
int err = huge_add_to_page_cache(page, mapping, idx);
if (err) {
put_page(page);
if (err == -EEXIST)
goto retry;
goto out;
}
new_pagecache_page = true;
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
anon_rmap = 1;
}
} else {
/*
* If memory error occurs between mmap() and fault, some process
* don't have hwpoisoned swap entry for errored virtual address.
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
if (unlikely(PageHWPoison(page))) {
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
}
/* Check for page in userfault range. */
if (userfaultfd_minor(vma)) {
unlock_page(page);
put_page(page);
ret = hugetlb_handle_userfault(vma, mapping, idx,
flags, haddr,
VM_UFFD_MINOR);
goto out;
}
}
/*
* If we are going to COW a private mapping later, we examine the
* pending reservations for this page now. This will ensure that
* any allocations necessary to record that reservation occur outside
* the spinlock.
*/
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
if (vma_needs_reservation(h, vma, haddr) < 0) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
}
ptl = huge_pte_lock(h, mm, ptep);
ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
goto backout;
if (anon_rmap) {
ClearHPageRestoreReserve(page);
hugepage_add_new_anon_rmap(page, vma, haddr);
} else
page_dup_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, haddr, ptep, new_pte);
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
}
spin_unlock(ptl);
/*
* Only set HPageMigratable in newly allocated pages. Existing pages
* found in the pagecache may not have HPageMigratableset if they have
* been isolated for migration.
*/
if (new_page)
SetHPageMigratable(page);
unlock_page(page);
out:
return ret;
backout:
spin_unlock(ptl);
backout_unlocked:
unlock_page(page);
/* restore reserve for newly allocated pages not in page cache */
if (new_page && !new_pagecache_page)
restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
goto out;
}
#ifdef CONFIG_SMP
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
unsigned long key[2];
u32 hash;
key[0] = (unsigned long) mapping;
key[1] = idx;
hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
return hash & (num_fault_mutexes - 1);
}
#else
/*
* For uniprocessor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
return 0;
}
#endif
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pte_t *ptep, entry;
spinlock_t *ptl;
vm_fault_t ret;
u32 hash;
pgoff_t idx;
struct page *page = NULL;
struct page *pagecache_page = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
/*
* Since we hold no locks, ptep could be stale. That is
* OK as we are only making decisions based on content and
* not actually modifying content here.
*/
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, mm, ptep);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
}
/*
* Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
* until finished with ptep. This serves two purposes:
* 1) It prevents huge_pmd_unshare from being called elsewhere
* and making the ptep no longer valid.
* 2) It synchronizes us with i_size modifications during truncation.
*
* ptep could have already be assigned via huge_pte_offset. That
* is OK, as huge_pte_alloc will return the same value unless
* something has changed.
*/
mapping = vma->vm_file->f_mapping;
i_mmap_lock_read(mapping);
ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
if (!ptep) {
i_mmap_unlock_read(mapping);
return VM_FAULT_OOM;
}
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
if (huge_pte_none(entry)) {
ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
goto out_mutex;
}
ret = 0;
/*
* entry could be a migration/hwpoison entry at this point, so this
* check prevents the kernel from going below assuming that we have
* an active hugepage in pagecache. This goto expects the 2nd page
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it.
*/
if (!pte_present(entry))
goto out_mutex;
/*
* If we are going to COW the mapping later, we examine the pending
* reservations for this page now. This will ensure that any
* allocations necessary to record that reservation occur outside the
* spinlock. For private mappings, we also lookup the pagecache
* page now as it is used to determine if a reservation has been
* consumed.
*/
if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
if (vma_needs_reservation(h, vma, haddr) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
}
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
if (!(vma->vm_flags & VM_MAYSHARE))
pagecache_page = hugetlbfs_pagecache_page(h,
vma, haddr);
}
ptl = huge_pte_lock(h, mm, ptep);
/* Check for a racing update before calling hugetlb_cow */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
goto out_ptl;
/*
* hugetlb_cow() requires page locks of pte_page(entry) and
* pagecache_page, so here we need take the former one
* when page != pagecache_page or !pagecache_page.
*/
page = pte_page(entry);
if (page != pagecache_page)
if (!trylock_page(page)) {
need_wait_lock = 1;
goto out_ptl;
}
get_page(page);
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
ret = hugetlb_cow(mm, vma, address, ptep,
pagecache_page, ptl);
goto out_put_page;
}
entry = huge_pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, haddr, ptep);
out_put_page:
if (page != pagecache_page)
unlock_page(page);
put_page(page);
out_ptl:
spin_unlock(ptl);
if (pagecache_page) {
unlock_page(pagecache_page);
put_page(pagecache_page);
}
out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
* the page is not used after unlocked before returning from the current
* page fault. So we are safe from accessing freed page, even if we wait
* here without taking refcount.
*/
if (need_wait_lock)
wait_on_page_locked(page);
return ret;
}
#ifdef CONFIG_USERFAULTFD
/*
* Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
* modifications for huge pages.
*/
int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
enum mcopy_atomic_mode mode,
struct page **pagep)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
struct hstate *h = hstate_vma(dst_vma);
struct address_space *mapping = dst_vma->vm_file->f_mapping;
pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
unsigned long size;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
pte_t _dst_pte;
spinlock_t *ptl;
int ret = -ENOMEM;
struct page *page;
int writable;
bool page_in_pagecache = false;
if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, idx);
if (!page)
goto out;
page_in_pagecache = true;
} else if (!*pagep) {
/* If a page already exists, then it's UFFDIO_COPY for
* a non-missing case. Return -EEXIST.
*/
if (vm_shared &&
hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
ret = -EEXIST;
goto out;
}
page = alloc_huge_page(dst_vma, dst_addr, 0);
if (IS_ERR(page)) {
ret = -ENOMEM;
goto out;
}
ret = copy_huge_page_from_user(page,
(const void __user *) src_addr,
pages_per_huge_page(h), false);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
/* Free the allocated page which may have
* consumed a reservation.
*/
restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
/* Allocate a temporary page to hold the copied
* contents.
*/
page = alloc_huge_page_vma(h, dst_vma, dst_addr);
if (!page) {
ret = -ENOMEM;
goto out;
}
*pagep = page;
/* Set the outparam pagep and return to the caller to
* copy the contents outside the lock. Don't free the
* page.
*/
goto out;
}
} else {
if (vm_shared &&
hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
put_page(*pagep);
ret = -EEXIST;
*pagep = NULL;
goto out;
}
page = alloc_huge_page(dst_vma, dst_addr, 0);
if (IS_ERR(page)) {
ret = -ENOMEM;
*pagep = NULL;
goto out;
}
copy_huge_page(page, *pagep);
put_page(*pagep);
*pagep = NULL;
}
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
__SetPageUptodate(page);
/* Add shared, newly allocated pages to the page cache. */
if (vm_shared && !is_continue) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
ret = -EFAULT;
if (idx >= size)
goto out_release_nounlock;
/*
* Serialization between remove_inode_hugepages() and
* huge_add_to_page_cache() below happens through the
* hugetlb_fault_mutex_table that here must be hold by
* the caller.
*/
ret = huge_add_to_page_cache(page, mapping, idx);
if (ret)
goto out_release_nounlock;
page_in_pagecache = true;
}
ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
spin_lock(ptl);
/*
* Recheck the i_size after holding PT lock to make sure not
* to leave any page mapped (as page_mapped()) beyond the end
* of the i_size (remove_inode_hugepages() is strict about
* enforcing that). If we bail out here, we'll also leave a
* page in the radix tree in the vm_shared case beyond the end
* of the i_size, but remove_inode_hugepages() will take care
* of it as soon as we drop the hugetlb_fault_mutex_table.
*/
size = i_size_read(mapping->host) >> huge_page_shift(h);
ret = -EFAULT;
if (idx >= size)
goto out_release_unlock;
ret = -EEXIST;
if (!huge_pte_none(huge_ptep_get(dst_pte)))
goto out_release_unlock;
if (vm_shared) {
page_dup_rmap(page, true);
} else {
ClearHPageRestoreReserve(page);
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
}
/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
if (is_continue && !vm_shared)
writable = 0;
else
writable = dst_vma->vm_flags & VM_WRITE;
_dst_pte = make_huge_pte(dst_vma, page, writable);
if (writable)
_dst_pte = huge_pte_mkdirty(_dst_pte);
_dst_pte = pte_mkyoung(_dst_pte);
set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
dst_vma->vm_flags & VM_WRITE);
hugetlb_count_add(pages_per_huge_page(h), dst_mm);
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
if (!is_continue)
SetHPageMigratable(page);
if (vm_shared || is_continue)
unlock_page(page);
ret = 0;
out:
return ret;
out_release_unlock:
spin_unlock(ptl);
if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
if (!page_in_pagecache)
restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
goto out;
}
#endif /* CONFIG_USERFAULTFD */
static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
int refs, struct page **pages,
struct vm_area_struct **vmas)
{
int nr;
for (nr = 0; nr < refs; nr++) {
if (likely(pages))
pages[nr] = mem_map_offset(page, nr);
if (vmas)
vmas[nr] = vma;
}
}
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
long i, unsigned int flags, int *locked)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
unsigned long remainder = *nr_pages;
struct hstate *h = hstate_vma(vma);
int err = -EFAULT, refs;
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
spinlock_t *ptl = NULL;
int absent;
struct page *page;
/*
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
if (fatal_signal_pending(current)) {
remainder = 0;
break;
}
/*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
*
* Note that page table lock is not held when pte is null.
*/
pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
huge_page_size(h));
if (pte)
ptl = huge_pte_lock(h, mm, pte);
absent = !pte || huge_pte_none(huge_ptep_get(pte));
/*
* When coredumping, it suits get_dump_page if we just return
* an error where there's an empty slot with no huge pagecache
* to back it. This way, we avoid allocating a hugepage, and
* the sparse dumpfile avoids allocating disk blocks, but its
* huge holes still show up with zeroes where they need to be.
*/
if (absent && (flags & FOLL_DUMP) &&
!hugetlbfs_pagecache_present(h, vma, vaddr)) {
if (pte)
spin_unlock(ptl);
remainder = 0;
break;
}
/*
* We need call hugetlb_fault for both hugepages under migration
* (in which case hugetlb_fault waits for the migration,) and
* hwpoisoned hugepages (in which case we need to prevent the
* caller from accessing to them.) In order to do this, we use
* here is_swap_pte instead of is_hugetlb_entry_migration and
* is_hugetlb_entry_hwpoisoned. This is because it simply covers
* both cases, and because we can't follow correct pages
* directly from any kind of swap entries.
*/
if (absent || is_swap_pte(huge_ptep_get(pte)) ||
((flags & FOLL_WRITE) &&
!huge_pte_write(huge_ptep_get(pte)))) {
vm_fault_t ret;
unsigned int fault_flags = 0;
if (pte)
spin_unlock(ptl);
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (locked)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_KILLABLE;
if (flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_RETRY_NOWAIT;
if (flags & FOLL_TRIED) {
/*
* Note: FAULT_FLAG_ALLOW_RETRY and
* FAULT_FLAG_TRIED can co-exist
*/
fault_flags |= FAULT_FLAG_TRIED;
}
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
if (ret & VM_FAULT_ERROR) {
err = vm_fault_to_errno(ret, flags);
remainder = 0;
break;
}
if (ret & VM_FAULT_RETRY) {
if (locked &&
!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*locked = 0;
*nr_pages = 0;
/*
* VM_FAULT_RETRY must not return an
* error, it will return zero
* instead.
*
* No need to update "position" as the
* caller will not check it after
* *nr_pages is set to 0.
*/
return i;
}
continue;
}
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
page = pte_page(huge_ptep_get(pte));
/*
* If subpage information not requested, update counters
* and skip the same_page loop below.
*/
if (!pages && !vmas && !pfn_offset &&
(vaddr + huge_page_size(h) < vma->vm_end) &&
(remainder >= pages_per_huge_page(h))) {
vaddr += huge_page_size(h);
remainder -= pages_per_huge_page(h);
i += pages_per_huge_page(h);
spin_unlock(ptl);
continue;
}
/* vaddr may not be aligned to PAGE_SIZE */
refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
(vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
if (pages || vmas)
record_subpages_vmas(mem_map_offset(page, pfn_offset),
vma, refs,
likely(pages) ? pages + i : NULL,
vmas ? vmas + i : NULL);
if (pages) {
/*
* try_grab_compound_head() should always succeed here,
* because: a) we hold the ptl lock, and b) we've just
* checked that the huge page is present in the page
* tables. If the huge page is present, then the tail
* pages must also be present. The ptl prevents the
* head page and tail pages from being rearranged in
* any way. So this page must be available at this
* point, unless the page refcount overflowed:
*/
if (WARN_ON_ONCE(!try_grab_compound_head(pages[i],
refs,
flags))) {
spin_unlock(ptl);
remainder = 0;
err = -ENOMEM;
break;
}
}
vaddr += (refs << PAGE_SHIFT);
remainder -= refs;
i += refs;
spin_unlock(ptl);
}
*nr_pages = remainder;
/*
* setting position is actually required only if remainder is
* not zero but it's faster not to add a "if (remainder)"
* branch.
*/
*position = vaddr;
return i ? i : err;
}
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long start = address;
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
unsigned long pages = 0;
bool shared_pmd = false;
struct mmu_notifier_range range;
/*
* In the case of shared PMDs, the area to flush could be beyond
* start/end. Set range.start/range.end to cover the maximum possible
* range if PMD sharing is possible.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
0, vma, mm, start, end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
ptep = huge_pte_offset(mm, address, huge_page_size(h));
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, vma, &address, ptep)) {
pages++;
spin_unlock(ptl);
shared_pmd = true;
continue;
}
pte = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
spin_unlock(ptl);
continue;
}
if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
if (is_writable_migration_entry(entry)) {
pte_t newpte;
entry = make_readable_migration_entry(
swp_offset(entry));
newpte = swp_entry_to_pte(entry);
set_huge_swap_pte_at(mm, address, ptep,
newpte, huge_page_size(h));
pages++;
}
spin_unlock(ptl);
continue;
}
if (!huge_pte_none(pte)) {
pte_t old_pte;
unsigned int shift = huge_page_shift(hstate_vma(vma));
old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
}
spin_unlock(ptl);
}
/*
* Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
* may have cleared our pud entry and done put_page on the page table:
* once we release i_mmap_rwsem, another task can do the final put_page
* and that page table be reused and filled with junk. If we actually
* did unshare a page of pmds, flush the range corresponding to the pud.
*/
if (shared_pmd)
flush_hugetlb_tlb_range(vma, range.start, range.end);
else
flush_hugetlb_tlb_range(vma, start, end);
/*
* No need to call mmu_notifier_invalidate_range() we are downgrading
* page table protection not changing it to point to a new page.
*
* See Documentation/vm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
mmu_notifier_invalidate_range_end(&range);
return pages << h->order;
}
/* Return true if reservation was successful, false otherwise. */
bool hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
long chg, add = -1;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
struct hugetlb_cgroup *h_cg = NULL;
long gbl_reserve, regions_needed = 0;
/* This should never happen */
if (from > to) {
VM_WARN(1, "%s called with a negative range\n", __func__);
return false;
}
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
* without using reserves
*/
if (vm_flags & VM_NORESERVE)
return true;
/*
* Shared mappings base their reservation on the number of pages that
* are already allocated on behalf of the file. Private mappings need
* to reserve the full area even if read-only as mprotect() may be
* called to make the mapping read-write. Assume !vma is a shm mapping
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
/*
* resv_map can not be NULL as hugetlb_reserve_pages is only
* called for inodes for which resv_maps were created (see
* hugetlbfs_get_inode).
*/
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to, ®ions_needed);
} else {
/* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
return false;
chg = to - from;
set_vma_resv_map(vma, resv_map);
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
if (chg < 0)
goto out_err;
if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), &h_cg) < 0)
goto out_err;
if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
/* For private mappings, the hugetlb_cgroup uncharge info hangs
* of the resv_map.
*/
resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
}
/*
* There must be enough pages in the subpool for the mapping. If
* the subpool has a minimum size, there may be some global
* reservations already in place (gbl_reserve).
*/
gbl_reserve = hugepage_subpool_get_pages(spool, chg);
if (gbl_reserve < 0)
goto out_uncharge_cgroup;
/*
* Check enough hugepages are available for the reservation.
* Hand the pages back to the subpool if there are not
*/
if (hugetlb_acct_memory(h, gbl_reserve) < 0)
goto out_put_pages;
/*
* Account for the reservations made. Shared mappings record regions
* that have reservations as they are shared by multiple VMAs.
* When the last VMA disappears, the region map says how much
* the reservation was and the page cache tells how much of
* the reservation was consumed. Private mappings are per-VMA and
* only the consumed reservations are tracked. When the VMA
* disappears, the original reservation is the VMA size and the
* consumed reservations are stored in the map. Hence, nothing
* else has to be done for private mappings here
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
add = region_add(resv_map, from, to, regions_needed, h, h_cg);
if (unlikely(add < 0)) {
hugetlb_acct_memory(h, -gbl_reserve);
goto out_put_pages;
} else if (unlikely(chg > add)) {
/*
* pages in this range were added to the reserve
* map between region_chg and region_add. This
* indicates a race with alloc_huge_page. Adjust
* the subpool and reserve counts modified above
* based on the difference.
*/
long rsv_adjust;
/*
* hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
* reference to h_cg->css. See comment below for detail.
*/
hugetlb_cgroup_uncharge_cgroup_rsvd(
hstate_index(h),
(chg - add) * pages_per_huge_page(h), h_cg);
rsv_adjust = hugepage_subpool_put_pages(spool,
chg - add);
hugetlb_acct_memory(h, -rsv_adjust);
} else if (h_cg) {
/*
* The file_regions will hold their own reference to
* h_cg->css. So we should release the reference held
* via hugetlb_cgroup_charge_cgroup_rsvd() when we are
* done.
*/
hugetlb_cgroup_put_rsvd_cgroup(h_cg);
}
}
return true;
out_put_pages:
/* put back original number of pages, chg */
(void)hugepage_subpool_put_pages(spool, chg);
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
if (!vma || vma->vm_flags & VM_MAYSHARE)
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
*/
if (chg >= 0 && add < 0)
region_abort(resv_map, from, to, regions_needed);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
return false;
}
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed)
{
struct hstate *h = hstate_inode(inode);
struct resv_map *resv_map = inode_resv_map(inode);
long chg = 0;
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
/*
* Since this routine can be called in the evict inode path for all
* hugetlbfs inodes, resv_map could be NULL.
*/
if (resv_map) {
chg = region_del(resv_map, start, end);
/*
* region_del() can fail in the rare case where a region
* must be split and another region descriptor can not be
* allocated. If end == LONG_MAX, it will not fail.
*/
if (chg < 0)
return chg;
}
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
/*
* If the subpool has a minimum size, the number of global
* reservations to be released may be adjusted.
*
* Note that !resv_map implies freed == 0. So (chg - freed)
* won't go negative.
*/
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -gbl_reserve);
return 0;
}
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
static unsigned long page_table_shareable(struct vm_area_struct *svma,
struct vm_area_struct *vma,
unsigned long addr, pgoff_t idx)
{
unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
svma->vm_start;
unsigned long sbase = saddr & PUD_MASK;
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
/*
* match the virtual addresses, permission and the alignment of the
* page table page.
*/
if (pmd_index(addr) != pmd_index(saddr) ||
vm_flags != svm_flags ||
!range_in_vma(svma, sbase, s_end))
return 0;
return saddr;
}
static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
unsigned long base = addr & PUD_MASK;
unsigned long end = base + PUD_SIZE;
/*
* check on proper vm_flags and page table alignment
*/
if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
return true;
return false;
}
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
#ifdef CONFIG_USERFAULTFD
if (uffd_disable_huge_pmd_share(vma))
return false;
#endif
return vma_shareable(vma, addr);
}
/*
* Determine if start,end range within vma could be mapped by shared pmd.
* If yes, adjust start and end to cover range associated with possible
* shared pmd mappings.
*/
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
/*
* vma needs to span at least one aligned PUD size, and the range
* must be at least partially within in.
*/
if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
(*end <= v_start) || (*start >= v_end))
return;
/* Extend the range to be PUD aligned for a worst case scenario */
if (*start > v_start)
*start = ALIGN_DOWN(*start, PUD_SIZE);
if (*end < v_end)
*end = ALIGN(*end, PUD_SIZE);
}
/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
* code much cleaner.
*
* This routine must be called with i_mmap_rwsem held in at least read mode if
* sharing is possible. For hugetlbfs, this prevents removal of any page
* table entries associated with the address space. This is important as we
* are setting up sharing based on existing page table entries (mappings).
*
* NOTE: This routine is only called from huge_pte_alloc. Some callers of
* huge_pte_alloc know that sharing is not possible and do not take
* i_mmap_rwsem as a performance optimization. This is handled by the
* if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
* only required for subsequent processing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
struct vm_area_struct *svma;
unsigned long saddr;
pte_t *spte = NULL;
pte_t *pte;
spinlock_t *ptl;
i_mmap_assert_locked(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
spte = huge_pte_offset(svma->vm_mm, saddr,
vma_mmu_pagesize(svma));
if (spte) {
get_page(virt_to_page(spte));
break;
}
}
}
if (!spte)
goto out;
ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
mm_inc_nr_pmds(mm);
} else {
put_page(virt_to_page(spte));
}
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
return pte;
}
/*
* unmap huge page backed by shared pte.
*
* Hugetlb pte page is ref counted at the time of mapping. If pte is shared
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
* Called with page table lock held and i_mmap_rwsem held in write mode.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
*/
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(mm, *addr);
p4d_t *p4d = p4d_offset(pgd, *addr);
pud_t *pud = pud_offset(p4d, *addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
pud_clear(pud);
put_page(virt_to_page(ptep));
mm_dec_nr_pmds(mm);
*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
return 1;
}
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
return NULL;
}
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long *addr, pte_t *ptep)
{
return 0;
}
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
}
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
return false;
}
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
p4d = p4d_alloc(mm, pgd, addr);
if (!p4d)
return NULL;
pud = pud_alloc(mm, p4d, addr);
if (pud) {
if (sz == PUD_SIZE) {
pte = (pte_t *)pud;
} else {
BUG_ON(sz != PMD_SIZE);
if (want_pmd_share(vma, addr) && pud_none(*pud))
pte = huge_pmd_share(mm, vma, addr, pud);
else
pte = (pte_t *)pmd_alloc(mm, pud, addr);
}
}
BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
return pte;
}
/*
* huge_pte_offset() - Walk the page table to resolve the hugepage
* entry at address @addr
*
* Return: Pointer to page table entry (PUD or PMD) for
* address @addr, or NULL if a !p*d_present() entry is encountered and the
* size @sz doesn't match the hugepage size at this level of the page
* table.
*/
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
return NULL;
p4d = p4d_offset(pgd, addr);
if (!p4d_present(*p4d))
return NULL;
pud = pud_offset(p4d, addr);
if (sz == PUD_SIZE)
/* must be pud huge, non-present or none */
return (pte_t *)pud;
if (!pud_present(*pud))
return NULL;
/* must have a valid entry and size to go further */
pmd = pmd_offset(pud, addr);
/* must be pmd huge, non-present or none */
return (pte_t *)pmd;
}
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
/*
* These functions are overwritable if your architecture needs its own
* behavior.
*/
struct page * __weak
follow_huge_addr(struct mm_struct *mm, unsigned long address,
int write)
{
return ERR_PTR(-EINVAL);
}
struct page * __weak
follow_huge_pd(struct vm_area_struct *vma,
unsigned long address, hugepd_t hpd, int flags, int pdshift)
{
WARN(1, "hugepd follow called with no support for hugepage directory format\n");
return NULL;
}
struct page * __weak
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int flags)
{
struct page *page = NULL;
spinlock_t *ptl;
pte_t pte;
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return NULL;
retry:
ptl = pmd_lockptr(mm, pmd);
spin_lock(ptl);
/*
* make sure that the address range covered by this pmd is not
* unmapped from other threads.
*/
if (!pmd_huge(*pmd))
goto out;
pte = huge_ptep_get((pte_t *)pmd);
if (pte_present(pte)) {
page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
/*
* try_grab_page() should always succeed here, because: a) we
* hold the pmd (ptl) lock, and b) we've just checked that the
* huge pmd (head) page is present in the page tables. The ptl
* prevents the head page and tail pages from being rearranged
* in any way. So this page must be available at this point,
* unless the page refcount overflowed:
*/
if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
page = NULL;
goto out;
}
} else {
if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
__migration_entry_wait(mm, (pte_t *)pmd, ptl);
goto retry;
}
/*
* hwpoisoned entry is treated as no_page_table in
* follow_page_mask().
*/
}
out:
spin_unlock(ptl);
return page;
}
struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int flags)
{
if (flags & (FOLL_GET | FOLL_PIN))
return NULL;
return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
}
struct page * __weak
follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
{
if (flags & (FOLL_GET | FOLL_PIN))
return NULL;
return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
}
bool isolate_huge_page(struct page *page, struct list_head *list)
{
bool ret = true;
spin_lock_irq(&hugetlb_lock);
if (!PageHeadHuge(page) ||
!HPageMigratable(page) ||
!get_page_unless_zero(page)) {
ret = false;
goto unlock;
}
ClearHPageMigratable(page);
list_move_tail(&page->lru, list);
unlock:
spin_unlock_irq(&hugetlb_lock);
return ret;
}
int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
{
int ret = 0;
*hugetlb = false;
spin_lock_irq(&hugetlb_lock);
if (PageHeadHuge(page)) {
*hugetlb = true;
if (HPageFreed(page) || HPageMigratable(page))
ret = get_page_unless_zero(page);
else
ret = -EBUSY;
}
spin_unlock_irq(&hugetlb_lock);
return ret;
}
void putback_active_hugepage(struct page *page)
{
spin_lock_irq(&hugetlb_lock);
SetHPageMigratable(page);
list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
spin_unlock_irq(&hugetlb_lock);
put_page(page);
}
void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
{
struct hstate *h = page_hstate(oldpage);
hugetlb_cgroup_migrate(oldpage, newpage);
set_page_owner_migrate_reason(newpage, reason);
/*
* transfer temporary state of the new huge page. This is
* reverse to other transitions because the newpage is going to
* be final while the old one will be freed so it takes over
* the temporary status.
*
* Also note that we have to transfer the per-node surplus state
* here as well otherwise the global surplus count will not match
* the per-node's.
*/
if (HPageTemporary(newpage)) {
int old_nid = page_to_nid(oldpage);
int new_nid = page_to_nid(newpage);
SetHPageTemporary(oldpage);
ClearHPageTemporary(newpage);
/*
* There is no need to transfer the per-node surplus state
* when we do not cross the node.
*/
if (new_nid == old_nid)
return;
spin_lock_irq(&hugetlb_lock);
if (h->surplus_huge_pages_node[old_nid]) {
h->surplus_huge_pages_node[old_nid]--;
h->surplus_huge_pages_node[new_nid]++;
}
spin_unlock_irq(&hugetlb_lock);
}
}
/*
* This function will unconditionally remove all the shared pmd pgtable entries
* within the specific vma for a hugetlbfs memory range.
*/
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
unsigned long address, start, end;
spinlock_t *ptl;
pte_t *ptep;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
start = ALIGN(vma->vm_start, PUD_SIZE);
end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
if (start >= end)
return;
/*
* No need to call adjust_range_if_pmd_sharing_possible(), because
* we have already done the PUD_SIZE alignment.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
unsigned long tmp = address;
ptep = huge_pte_offset(mm, address, sz);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
/* We don't want 'address' to be changed */
huge_pmd_unshare(mm, vma, &tmp, ptep);
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
/*
* No need to call mmu_notifier_invalidate_range(), see
* Documentation/vm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
}
#ifdef CONFIG_CMA
static bool cma_reserve_called __initdata;
static int __init cmdline_parse_hugetlb_cma(char *p)
{
hugetlb_cma_size = memparse(p, &p);
return 0;
}
early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
void __init hugetlb_cma_reserve(int order)
{
unsigned long size, reserved, per_node;
int nid;
cma_reserve_called = true;
if (!hugetlb_cma_size)
return;
if (hugetlb_cma_size < (PAGE_SIZE << order)) {
pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
(PAGE_SIZE << order) / SZ_1M);
return;
}
/*
* If 3 GB area is requested on a machine with 4 numa nodes,
* let's allocate 1 GB on first three nodes and ignore the last one.
*/
per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
reserved = 0;
for_each_node_state(nid, N_ONLINE) {
int res;
char name[CMA_MAX_NAME];
size = min(per_node, hugetlb_cma_size - reserved);
size = round_up(size, PAGE_SIZE << order);
snprintf(name, sizeof(name), "hugetlb%d", nid);
res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
0, false, name,
&hugetlb_cma[nid], nid);
if (res) {
pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
res, nid);
continue;
}
reserved += size;
pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
size / SZ_1M, nid);
if (reserved >= hugetlb_cma_size)
break;
}
}
void __init hugetlb_cma_check(void)
{
if (!hugetlb_cma_size || cma_reserve_called)
return;
pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
}
#endif /* CONFIG_CMA */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_EXTENT_IO_H
#define BTRFS_EXTENT_IO_H
#include <linux/rbtree.h>
#include <linux/refcount.h>
#include <linux/fiemap.h>
#include <linux/btrfs_tree.h>
#include "ulist.h"
/*
* flags for bio submission. The high bits indicate the compression
* type for this bio
*/
#define EXTENT_BIO_COMPRESSED 1
#define EXTENT_BIO_FLAG_SHIFT 16
enum {
EXTENT_BUFFER_UPTODATE,
EXTENT_BUFFER_DIRTY,
EXTENT_BUFFER_CORRUPT,
/* this got triggered by readahead */
EXTENT_BUFFER_READAHEAD,
EXTENT_BUFFER_TREE_REF,
EXTENT_BUFFER_STALE,
EXTENT_BUFFER_WRITEBACK,
/* read IO error */
EXTENT_BUFFER_READ_ERR,
EXTENT_BUFFER_UNMAPPED,
EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
EXTENT_BUFFER_NO_CHECK,
};
/* these are flags for __process_pages_contig */
#define PAGE_UNLOCK (1 << 0)
/* Page starts writeback, clear dirty bit and set writeback bit */
#define PAGE_START_WRITEBACK (1 << 1)
#define PAGE_END_WRITEBACK (1 << 2)
#define PAGE_SET_ORDERED (1 << 3)
#define PAGE_SET_ERROR (1 << 4)
#define PAGE_LOCK (1 << 5)
/*
* page->private values. Every page that is controlled by the extent
* map has page->private set to one.
*/
#define EXTENT_PAGE_PRIVATE 1
/*
* The extent buffer bitmap operations are done with byte granularity instead of
* word granularity for two reasons:
* 1. The bitmaps must be little-endian on disk.
* 2. Bitmap items are not guaranteed to be aligned to a word and therefore a
* single word in a bitmap may straddle two pages in the extent buffer.
*/
#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
#define BITMAP_FIRST_BYTE_MASK(start) \
((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
struct btrfs_root;
struct btrfs_inode;
struct btrfs_io_bio;
struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
int mirror_num,
unsigned long bio_flags);
typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
struct bio *bio, u64 dio_file_offset);
#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
struct extent_buffer {
u64 start;
unsigned long len;
unsigned long bflags;
struct btrfs_fs_info *fs_info;
spinlock_t refs_lock;
atomic_t refs;
atomic_t io_pages;
int read_mirror;
struct rcu_head rcu_head;
pid_t lock_owner;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
s8 log_index;
struct rw_semaphore lock;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
struct list_head release_list;
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
#endif
};
/*
* Structure to record info about the bio being assembled, and other info like
* how many bytes are there before stripe/ordered extent boundary.
*/
struct btrfs_bio_ctrl {
struct bio *bio;
unsigned long bio_flags;
u32 len_to_stripe_boundary;
u32 len_to_oe_boundary;
};
/*
* Structure to record how many bytes and which ranges are set/cleared
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
u64 bytes_changed;
/* Changed ranges */
struct ulist range_changed;
};
static inline void extent_changeset_init(struct extent_changeset *changeset)
{
changeset->bytes_changed = 0;
ulist_init(&changeset->range_changed);
}
static inline struct extent_changeset *extent_changeset_alloc(void)
{
struct extent_changeset *ret;
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
extent_changeset_init(ret);
return ret;
}
static inline void extent_changeset_release(struct extent_changeset *changeset)
{
if (!changeset)
return;
changeset->bytes_changed = 0;
ulist_release(&changeset->range_changed);
}
static inline void extent_changeset_free(struct extent_changeset *changeset)
{
if (!changeset)
return;
extent_changeset_release(changeset);
kfree(changeset);
}
static inline void extent_set_compress_type(unsigned long *bio_flags,
int compress_type)
{
*bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
}
static inline int extent_compress_type(unsigned long bio_flags)
{
return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
}
struct extent_map_tree;
typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 len);
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags);
int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void extent_readahead(struct readahead_control *rac);
int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
int set_page_extent_mapped(struct page *page);
void clear_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level);
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
void free_extent_buffer(struct extent_buffer *eb);
void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_NONE 0
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
int mirror_num);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level);
void btrfs_readahead_node_child(struct extent_buffer *node, int slot);
static inline int num_extent_pages(const struct extent_buffer *eb)
{
/*
* For sectorsize == PAGE_SIZE case, since nodesize is always aligned to
* sectorsize, it's just eb->len >> PAGE_SHIFT.
*
* For sectorsize < PAGE_SIZE case, we could have nodesize < PAGE_SIZE,
* thus have to ensure we get at least one page.
*/
return (eb->len >> PAGE_SHIFT) ?: 1;
}
static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
{
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len);
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
void __user *dst, unsigned long start,
unsigned long len);
void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *src);
void write_extent_buffer(const struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
void copy_extent_buffer_full(const struct extent_buffer *dst,
const struct extent_buffer *src);
void copy_extent_buffer(const struct extent_buffer *dst,
const struct extent_buffer *src,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len);
void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len);
void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len);
void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
unsigned long len);
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long pos);
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len);
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
unsigned long start, unsigned long pos,
unsigned long len);
void clear_extent_buffer_dirty(const struct extent_buffer *eb);
bool set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(const struct extent_buffer *eb);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
struct bio *btrfs_bio_alloc(u64 first_byte);
struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
/*
* When IO fails, either with EIO or csum verification fails, we
* try other mirrors that might have a good copy of the data. This
* io_failure_record is used to record state as we go through all the
* mirrors. If another mirror has good data, the sector is set up to date
* and things continue. If a good mirror can't be found, the original
* bio end_io callback is called to indicate things have failed.
*/
struct io_failure_record {
struct page *page;
u64 start;
u64 len;
u64 logical;
unsigned long bio_flags;
int this_mirror;
int failed_mirror;
};
int btrfs_repair_one_sector(struct inode *inode,
struct bio *failed_bio, u32 bio_offset,
struct page *page, unsigned int pgoff,
u64 start, int failed_mirror,
submit_bio_hook_t *submit_bio_hook);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
u64 *end);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
#ifdef CONFIG_BTRFS_DEBUG
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info);
#else
#define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0)
#endif
#endif
// SPDX-License-Identifier: GPL-2.0
/*
* Functions related to segment and merge handling
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"
static inline bool bio_will_gap(struct request_queue *q,
struct request *prev_rq, struct bio *prev, struct bio *next)
{
struct bio_vec pb, nb;
if (!bio_has_data(prev) || !queue_virt_boundary(q))
return false;
/*
* Don't merge if the 1st bio starts with non-zero offset, otherwise it
* is quite difficult to respect the sg gap limit. We work hard to
* merge a huge number of small single bios in case of mkfs.
*/
if (prev_rq) bio_get_first_bvec(prev_rq->bio, &pb);
else
bio_get_first_bvec(prev, &pb);
if (pb.bv_offset & queue_virt_boundary(q))
return true;
/*
* We don't need to worry about the situation that the merged segment
* ends in unaligned virt boundary:
*
* - if 'pb' ends aligned, the merged segment ends aligned
* - if 'pb' ends unaligned, the next bio must include
* one single bvec of 'nb', otherwise the 'nb' can't
* merge with 'pb'
*/
bio_get_last_bvec(prev, &pb);
bio_get_first_bvec(next, &nb);
if (biovec_phys_mergeable(q, &pb, &nb))
return false;
return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
}
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
{
return bio_will_gap(req->q, req, req->biotail, bio);
}
static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
{
return bio_will_gap(req->q, NULL, bio, req->bio);
}
static struct bio *blk_bio_discard_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
unsigned *nsegs)
{
unsigned int max_discard_sectors, granularity;
int alignment;
sector_t tmp;
unsigned split_sectors;
*nsegs = 1;
/* Zero-sector (unknown) and one-sector granularities are the same. */
granularity = max(q->limits.discard_granularity >> 9, 1U);
max_discard_sectors = min(q->limits.max_discard_sectors,
bio_allowed_max_sectors(q));
max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors)) {
/* XXX: warn */
return NULL;
}
if (bio_sectors(bio) <= max_discard_sectors)
return NULL;
split_sectors = max_discard_sectors;
/*
* If the next starting sector would be misaligned, stop the discard at
* the previous aligned sector.
*/
alignment = (q->limits.discard_alignment >> 9) % granularity;
tmp = bio->bi_iter.bi_sector + split_sectors - alignment;
tmp = sector_div(tmp, granularity);
if (split_sectors > tmp)
split_sectors -= tmp;
return bio_split(bio, split_sectors, GFP_NOIO, bs);
}
static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
struct bio *bio, struct bio_set *bs, unsigned *nsegs)
{
*nsegs = 0;
if (!q->limits.max_write_zeroes_sectors)
return NULL;
if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
return NULL;
return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
}
static struct bio *blk_bio_write_same_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
unsigned *nsegs)
{
*nsegs = 1;
if (!q->limits.max_write_same_sectors)
return NULL;
if (bio_sectors(bio) <= q->limits.max_write_same_sectors)
return NULL;
return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
}
/*
* Return the maximum number of sectors from the start of a bio that may be
* submitted as a single request to a block device. If enough sectors remain,
* align the end to the physical block size. Otherwise align the end to the
* logical block size. This approach minimizes the number of non-aligned
* requests that are submitted to a block device if the start of a bio is not
* aligned to a physical block boundary.
*/
static inline unsigned get_max_io_size(struct request_queue *q,
struct bio *bio)
{
unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0);
unsigned max_sectors = sectors;
unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1);
max_sectors += start_offset;
max_sectors &= ~(pbs - 1);
if (max_sectors > start_offset)
return max_sectors - start_offset; return sectors & ~(lbs - 1);
}
static inline unsigned get_max_segment_size(const struct request_queue *q,
struct page *start_page,
unsigned long offset)
{
unsigned long mask = queue_segment_boundary(q);
offset = mask & (page_to_phys(start_page) + offset);
/*
* overflow may be triggered in case of zero page physical address
* on 32bit arch, use queue's max segment size when that happens.
*/
return min_not_zero(mask - offset + 1,
(unsigned long)queue_max_segment_size(q));
}
/**
* bvec_split_segs - verify whether or not a bvec should be split in the middle
* @q: [in] request queue associated with the bio associated with @bv
* @bv: [in] bvec to examine
* @nsegs: [in,out] Number of segments in the bio being built. Incremented
* by the number of segments from @bv that may be appended to that
* bio without exceeding @max_segs
* @sectors: [in,out] Number of sectors in the bio being built. Incremented
* by the number of sectors from @bv that may be appended to that
* bio without exceeding @max_sectors
* @max_segs: [in] upper bound for *@nsegs
* @max_sectors: [in] upper bound for *@sectors
*
* When splitting a bio, it can happen that a bvec is encountered that is too
* big to fit in a single segment and hence that it has to be split in the
* middle. This function verifies whether or not that should happen. The value
* %true is returned if and only if appending the entire @bv to a bio with
* *@nsegs segments and *@sectors sectors would make that bio unacceptable for
* the block driver.
*/
static bool bvec_split_segs(const struct request_queue *q,
const struct bio_vec *bv, unsigned *nsegs,
unsigned *sectors, unsigned max_segs,
unsigned max_sectors)
{
unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9;
unsigned len = min(bv->bv_len, max_len);
unsigned total_len = 0;
unsigned seg_size = 0;
while (len && *nsegs < max_segs) {
seg_size = get_max_segment_size(q, bv->bv_page,
bv->bv_offset + total_len); seg_size = min(seg_size, len);
(*nsegs)++;
total_len += seg_size;
len -= seg_size;
if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
break;
}
*sectors += total_len >> 9;
/* tell the caller to split the bvec if it is too big to fit */
return len > 0 || bv->bv_len > max_len;
}
/**
* blk_bio_segment_split - split a bio in two bios
* @q: [in] request queue pointer
* @bio: [in] bio to be split
* @bs: [in] bio set to allocate the clone from
* @segs: [out] number of segments in the bio with the first half of the sectors
*
* Clone @bio, update the bi_iter of the clone to represent the first sectors
* of @bio and update @bio->bi_iter to represent the remaining sectors. The
* following is guaranteed for the cloned bio:
* - That it has at most get_max_io_size(@q, @bio) sectors.
* - That it has at most queue_max_segments(@q) segments.
*
* Except for discard requests the cloned bio will point at the bi_io_vec of
* the original bio. It is the responsibility of the caller to ensure that the
* original bio is not freed before the cloned bio. The caller is also
* responsible for ensuring that @bs is only destroyed after processing of the
* split bio has finished.
*/
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
unsigned *segs)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
unsigned nsegs = 0, sectors = 0;
const unsigned max_sectors = get_max_io_size(q, bio);
const unsigned max_segs = queue_max_segments(q); bio_for_each_bvec(bv, bio, iter) {
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
goto split;
if (nsegs < max_segs && sectors + (bv.bv_len >> 9) <= max_sectors && bv.bv_offset + bv.bv_len <= PAGE_SIZE) { nsegs++;
sectors += bv.bv_len >> 9;
} else if (bvec_split_segs(q, &bv, &nsegs, §ors, max_segs,
max_sectors)) {
goto split;
}
bvprv = bv;
bvprvp = &bvprv;
}
*segs = nsegs;
return NULL;
split:
*segs = nsegs;
/*
* Bio splitting may cause subtle trouble such as hang when doing sync
* iopoll in direct IO routine. Given performance gain of iopoll for
* big IO can be trival, disable iopoll when split needed.
*/
bio_clear_hipri(bio);
return bio_split(bio, sectors, GFP_NOIO, bs);
}
/**
* __blk_queue_split - split a bio and submit the second half
* @bio: [in, out] bio to be split
* @nr_segs: [out] number of segments in the first bio
*
* Split a bio into two bios, chain the two bios, submit the second half and
* store a pointer to the first half in *@bio. If the second bio is still too
* big it will be split by a recursive call to this function. Since this
* function may allocate a new bio from q->bio_split, it is the responsibility
* of the caller to ensure that q->bio_split is only released after processing
* of the split bio has finished.
*/
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
{
struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue;
struct bio *split = NULL;
switch (bio_op(*bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
break;
case REQ_OP_WRITE_ZEROES:
split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
nr_segs);
break;
case REQ_OP_WRITE_SAME:
split = blk_bio_write_same_split(q, *bio, &q->bio_split,
nr_segs);
break;
default:
/*
* All drivers must accept single-segments bios that are <=
* PAGE_SIZE. This is a quick and dirty check that relies on
* the fact that bi_io_vec[0] is always valid if a bio has data.
* The check might lead to occasional false negatives when bios
* are cloned, but compared to the performance impact of cloned
* bios themselves the loop below doesn't matter anyway.
*/
if (!q->limits.chunk_sectors && (*bio)->bi_vcnt == 1 && ((*bio)->bi_io_vec[0].bv_len +
(*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) {
*nr_segs = 1;
break;
}
split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); break;
}
if (split) {
/* there isn't chance to merge the splitted bio */
split->bi_opf |= REQ_NOMERGE;
bio_chain(split, *bio);
trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio);
*bio = split;
blk_throtl_charge_bio_split(*bio);
}
}
/**
* blk_queue_split - split a bio and submit the second half
* @bio: [in, out] bio to be split
*
* Split a bio into two bios, chains the two bios, submit the second half and
* store a pointer to the first half in *@bio. Since this function may allocate
* a new bio from q->bio_split, it is the responsibility of the caller to ensure
* that q->bio_split is only released after processing of the split bio has
* finished.
*/
void blk_queue_split(struct bio **bio)
{
unsigned int nr_segs;
__blk_queue_split(bio, &nr_segs);
}
EXPORT_SYMBOL(blk_queue_split);
unsigned int blk_recalc_rq_segments(struct request *rq)
{
unsigned int nr_phys_segs = 0;
unsigned int nr_sectors = 0;
struct req_iterator iter;
struct bio_vec bv;
if (!rq->bio)
return 0;
switch (bio_op(rq->bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
if (queue_max_discard_segments(rq->q) > 1) {
struct bio *bio = rq->bio;
for_each_bio(bio)
nr_phys_segs++;
return nr_phys_segs;
}
return 1;
case REQ_OP_WRITE_ZEROES:
return 0;
case REQ_OP_WRITE_SAME:
return 1;
}
rq_for_each_bvec(bv, rq, iter)
bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors,
UINT_MAX, UINT_MAX);
return nr_phys_segs;
}
static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
struct scatterlist *sglist)
{
if (!*sg)
return sglist;
/*
* If the driver previously mapped a shorter list, we could see a
* termination bit prematurely unless it fully inits the sg table
* on each mapping. We KNOW that there must be more entries here
* or the driver would be buggy, so force clear the termination bit
* to avoid doing a full sg_init_table() in drivers for each command.
*/
sg_unmark_end(*sg);
return sg_next(*sg);
}
static unsigned blk_bvec_map_sg(struct request_queue *q,
struct bio_vec *bvec, struct scatterlist *sglist,
struct scatterlist **sg)
{
unsigned nbytes = bvec->bv_len;
unsigned nsegs = 0, total = 0;
while (nbytes > 0) { unsigned offset = bvec->bv_offset + total; unsigned len = min(get_max_segment_size(q, bvec->bv_page,
offset), nbytes);
struct page *page = bvec->bv_page;
/*
* Unfortunately a fair number of drivers barf on scatterlists
* that have an offset larger than PAGE_SIZE, despite other
* subsystems dealing with that invariant just fine. For now
* stick to the legacy format where we never present those from
* the block layer, but the code below should be removed once
* these offenders (mostly MMC/SD drivers) are fixed.
*/
page += (offset >> PAGE_SHIFT);
offset &= ~PAGE_MASK;
*sg = blk_next_sg(sg, sglist);
sg_set_page(*sg, page, len, offset);
total += len;
nbytes -= len;
nsegs++;
}
return nsegs;
}
static inline int __blk_bvec_map_sg(struct bio_vec bv,
struct scatterlist *sglist, struct scatterlist **sg)
{
*sg = blk_next_sg(sg, sglist);
sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
return 1;
}
/* only try to merge bvecs into one sg if they are from two bios */
static inline bool
__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec,
struct bio_vec *bvprv, struct scatterlist **sg)
{
int nbytes = bvec->bv_len;
if (!*sg)
return false;
if ((*sg)->length + nbytes > queue_max_segment_size(q))
return false;
if (!biovec_phys_mergeable(q, bvprv, bvec))
return false;
(*sg)->length += nbytes;
return true;
}
static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
struct scatterlist *sglist,
struct scatterlist **sg)
{
struct bio_vec bvec, bvprv = { NULL };
struct bvec_iter iter;
int nsegs = 0;
bool new_bio = false;
for_each_bio(bio) { bio_for_each_bvec(bvec, bio, iter) {
/*
* Only try to merge bvecs from two bios given we
* have done bio internal merge when adding pages
* to bio
*/
if (new_bio &&
__blk_segment_map_sg_merge(q, &bvec, &bvprv, sg))
goto next_bvec;
if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE)
nsegs += __blk_bvec_map_sg(bvec, sglist, sg);
else
nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg);
next_bvec:
new_bio = false;
}
if (likely(bio->bi_iter.bi_size)) {
bvprv = bvec;
new_bio = true;
}
}
return nsegs;
}
/*
* map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries
*/
int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
struct scatterlist *sglist, struct scatterlist **last_sg)
{
int nsegs = 0;
if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg); else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg);
else if (rq->bio)
nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg);
if (*last_sg)
sg_mark_end(*last_sg);
/*
* Something must have been wrong if the figured number of
* segment is bigger than number of req's physical segments
*/
WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
static inline unsigned int blk_rq_get_max_segments(struct request *rq)
{
if (req_op(rq) == REQ_OP_DISCARD)
return queue_max_discard_segments(rq->q); return queue_max_segments(rq->q);
}
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs)
{
if (!blk_cgroup_mergeable(req, bio))
goto no_merge;
if (blk_integrity_merge_bio(req->q, req, bio) == false)
goto no_merge;
/* discard request merge won't add new segment */
if (req_op(req) == REQ_OP_DISCARD)
return 1;
if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req))
goto no_merge;
/*
* This will form the start of a new hw segment. Bump both
* counters.
*/
req->nr_phys_segments += nr_phys_segs; return 1;
no_merge:
req_set_nomerge(req->q, req);
return 0;
}
int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
{
if (req_gap_back_merge(req, bio))
return 0;
if (blk_integrity_rq(req) &&
integrity_req_gap_back_merge(req, bio))
return 0;
if (!bio_crypt_ctx_back_mergeable(req, bio))
return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
req_set_nomerge(req->q, req);
return 0;
}
return ll_new_hw_segment(req, bio, nr_segs);
}
static int ll_front_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs)
{
if (req_gap_front_merge(req, bio))
return 0;
if (blk_integrity_rq(req) &&
integrity_req_gap_front_merge(req, bio))
return 0;
if (!bio_crypt_ctx_front_mergeable(req, bio))
return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
req_set_nomerge(req->q, req);
return 0;
}
return ll_new_hw_segment(req, bio, nr_segs);
}
static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
struct request *next)
{
unsigned short segments = blk_rq_nr_discard_segments(req);
if (segments >= queue_max_discard_segments(q))
goto no_merge;
if (blk_rq_sectors(req) + bio_sectors(next->bio) >
blk_rq_get_max_sectors(req, blk_rq_pos(req)))
goto no_merge;
req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next);
return true;
no_merge:
req_set_nomerge(q, req);
return false;
}
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
struct request *next)
{
int total_phys_segments;
if (req_gap_back_merge(req, next->bio))
return 0;
/*
* Will it become too large?
*/
if ((blk_rq_sectors(req) + blk_rq_sectors(next)) >
blk_rq_get_max_sectors(req, blk_rq_pos(req)))
return 0;
total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; if (total_phys_segments > blk_rq_get_max_segments(req))
return 0;
if (!blk_cgroup_mergeable(req, next->bio))
return 0;
if (blk_integrity_merge_rq(q, req, next) == false)
return 0;
if (!bio_crypt_ctx_merge_rq(req, next))
return 0;
/* Merge is OK... */
req->nr_phys_segments = total_phys_segments;
return 1;
}
/**
* blk_rq_set_mixed_merge - mark a request as mixed merge
* @rq: request to mark as mixed merge
*
* Description:
* @rq is about to be mixed merged. Make sure the attributes
* which can be mixed are set in each bio and mark @rq as mixed
* merged.
*/
void blk_rq_set_mixed_merge(struct request *rq)
{
unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
struct bio *bio;
if (rq->rq_flags & RQF_MIXED_MERGE)
return;
/*
* @rq will no longer represent mixable attributes for all the
* contained bios. It will just track those of the first one.
* Distributes the attributs to each bio.
*/
for (bio = rq->bio; bio; bio = bio->bi_next) {
WARN_ON_ONCE((bio->bi_opf & REQ_FAILFAST_MASK) &&
(bio->bi_opf & REQ_FAILFAST_MASK) != ff);
bio->bi_opf |= ff;
}
rq->rq_flags |= RQF_MIXED_MERGE;
}
static void blk_account_io_merge_request(struct request *req)
{
if (blk_do_io_stat(req)) { part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); part_stat_unlock();
}
}
static enum elv_merge blk_try_req_merge(struct request *req,
struct request *next)
{
if (blk_discard_mergable(req))
return ELEVATOR_DISCARD_MERGE;
else if (blk_rq_pos(req) + blk_rq_sectors(req) == blk_rq_pos(next))
return ELEVATOR_BACK_MERGE;
return ELEVATOR_NO_MERGE;
}
/*
* For non-mq, this has to be called with the request spinlock acquired.
* For mq with scheduling, the appropriate queue wide lock should be held.
*/
static struct request *attempt_merge(struct request_queue *q,
struct request *req, struct request *next)
{
if (!rq_mergeable(req) || !rq_mergeable(next))
return NULL;
if (req_op(req) != req_op(next))
return NULL;
if (rq_data_dir(req) != rq_data_dir(next)
|| req->rq_disk != next->rq_disk)
return NULL;
if (req_op(req) == REQ_OP_WRITE_SAME && !blk_write_same_mergeable(req->bio, next->bio))
return NULL;
/*
* Don't allow merge of different write hints, or for a hint with
* non-hint IO.
*/
if (req->write_hint != next->write_hint)
return NULL;
if (req->ioprio != next->ioprio)
return NULL;
/*
* If we are allowed to merge, then append bio list
* from next to rq and release next. merge_requests_fn
* will have updated segment counts, update sector
* counts here. Handle DISCARDs separately, as they
* have separate settings.
*/
switch (blk_try_req_merge(req, next)) {
case ELEVATOR_DISCARD_MERGE:
if (!req_attempt_discard_merge(q, req, next))
return NULL;
break;
case ELEVATOR_BACK_MERGE:
if (!ll_merge_requests_fn(q, req, next))
return NULL;
break;
default:
return NULL;
}
/*
* If failfast settings disagree or any of the two is already
* a mixed merge, mark both as mixed before proceeding. This
* makes sure that all involved bios have mixable attributes
* set properly.
*/
if (((req->rq_flags | next->rq_flags) & RQF_MIXED_MERGE) ||
(req->cmd_flags & REQ_FAILFAST_MASK) !=
(next->cmd_flags & REQ_FAILFAST_MASK)) { blk_rq_set_mixed_merge(req);
blk_rq_set_mixed_merge(next);
}
/*
* At this point we have either done a back merge or front merge. We
* need the smaller start_time_ns of the merged requests to be the
* current request for accounting purposes.
*/
if (next->start_time_ns < req->start_time_ns) req->start_time_ns = next->start_time_ns; req->biotail->bi_next = next->bio;
req->biotail = next->biotail;
req->__data_len += blk_rq_bytes(next);
if (!blk_discard_mergable(req)) elv_merge_requests(q, req, next);
/*
* 'next' is going away, so update stats accordingly
*/
blk_account_io_merge_request(next);
trace_block_rq_merge(next);
/*
* ownership of bio passed from next to req, return 'next' for
* the caller to free
*/
next->bio = NULL;
return next;
}
static struct request *attempt_back_merge(struct request_queue *q,
struct request *rq)
{
struct request *next = elv_latter_request(q, rq);
if (next)
return attempt_merge(q, rq, next);
return NULL;
}
static struct request *attempt_front_merge(struct request_queue *q,
struct request *rq)
{
struct request *prev = elv_former_request(q, rq);
if (prev)
return attempt_merge(q, prev, rq);
return NULL;
}
/*
* Try to merge 'next' into 'rq'. Return true if the merge happened, false
* otherwise. The caller is responsible for freeing 'next' if the merge
* happened.
*/
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
struct request *next)
{
return attempt_merge(q, rq, next);
}
bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
{
if (!rq_mergeable(rq) || !bio_mergeable(bio))
return false;
if (req_op(rq) != bio_op(bio))
return false;
/* different data direction or already started, don't merge */
if (bio_data_dir(bio) != rq_data_dir(rq))
return false;
/* must be same device */
if (rq->rq_disk != bio->bi_bdev->bd_disk)
return false;
/* don't merge across cgroup boundaries */
if (!blk_cgroup_mergeable(rq, bio))
return false;
/* only merge integrity protected bio into ditto rq */
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false;
/* Only merge if the crypt contexts are compatible */
if (!bio_crypt_rq_ctx_compatible(rq, bio))
return false;
/* must be using the same buffer */
if (req_op(rq) == REQ_OP_WRITE_SAME && !blk_write_same_mergeable(rq->bio, bio))
return false;
/*
* Don't allow merge of different write hints, or for a hint with
* non-hint IO.
*/
if (rq->write_hint != bio->bi_write_hint)
return false;
if (rq->ioprio != bio_prio(bio))
return false;
return true;
}
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
{
if (blk_discard_mergable(rq))
return ELEVATOR_DISCARD_MERGE;
else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
return ELEVATOR_BACK_MERGE;
else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
return ELEVATOR_FRONT_MERGE;
return ELEVATOR_NO_MERGE;
}
static void blk_account_io_merge_bio(struct request *req)
{
if (!blk_do_io_stat(req))
return;
part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); part_stat_unlock();
}
enum bio_merge_status {
BIO_MERGE_OK,
BIO_MERGE_NONE,
BIO_MERGE_FAILED,
};
static enum bio_merge_status bio_attempt_back_merge(struct request *req,
struct bio *bio, unsigned int nr_segs)
{
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
if (!ll_back_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
trace_block_bio_backmerge(bio);
rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
bio_crypt_free_ctx(bio);
blk_account_io_merge_bio(req);
return BIO_MERGE_OK;
}
static enum bio_merge_status bio_attempt_front_merge(struct request *req,
struct bio *bio, unsigned int nr_segs)
{
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
if (!ll_front_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
trace_block_bio_frontmerge(bio);
rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); bio->bi_next = req->bio;
req->bio = bio;
req->__sector = bio->bi_iter.bi_sector;
req->__data_len += bio->bi_iter.bi_size;
bio_crypt_do_front_merge(req, bio);
blk_account_io_merge_bio(req);
return BIO_MERGE_OK;
}
static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
struct request *req, struct bio *bio)
{
unsigned short segments = blk_rq_nr_discard_segments(req);
if (segments >= queue_max_discard_segments(q))
goto no_merge;
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req, blk_rq_pos(req)))
goto no_merge;
rq_qos_merge(q, req, bio);
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
req->nr_phys_segments = segments + 1;
blk_account_io_merge_bio(req);
return BIO_MERGE_OK;
no_merge:
req_set_nomerge(q, req);
return BIO_MERGE_FAILED;
}
static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
struct request *rq,
struct bio *bio,
unsigned int nr_segs,
bool sched_allow_merge)
{
if (!blk_rq_merge_ok(rq, bio))
return BIO_MERGE_NONE;
switch (blk_try_merge(rq, bio)) {
case ELEVATOR_BACK_MERGE:
if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio)) return bio_attempt_back_merge(rq, bio, nr_segs);
break;
case ELEVATOR_FRONT_MERGE:
if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio)) return bio_attempt_front_merge(rq, bio, nr_segs);
break;
case ELEVATOR_DISCARD_MERGE:
return bio_attempt_discard_merge(q, rq, bio);
default:
return BIO_MERGE_NONE;
}
return BIO_MERGE_FAILED;
}
/**
* blk_attempt_plug_merge - try to merge with %current's plugged list
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
* @nr_segs: number of segments in @bio
* @same_queue_rq: pointer to &struct request that gets filled in when
* another request associated with @q is found on the plug list
* (optional, may be %NULL)
*
* Determine whether @bio being queued on @q can be merged with a request
* on %current's plugged list. Returns %true if merge was successful,
* otherwise %false.
*
* Plugging coalesces IOs from the same issuer for the same purpose without
* going through @q->queue_lock. As such it's more of an issuing mechanism
* than scheduling, and the request, while may have elvpriv data, is not
* added on the elevator at this point. In addition, we don't have
* reliable access to the elevator outside queue lock. Only check basic
* merging parameters without querying the elevator.
*
* Caller must ensure !blk_queue_nomerges(q) beforehand.
*/
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq)
{
struct blk_plug *plug;
struct request *rq;
struct list_head *plug_list;
plug = blk_mq_plug(q, bio);
if (!plug)
return false;
plug_list = &plug->mq_list; list_for_each_entry_reverse(rq, plug_list, queuelist) { if (rq->q == q && same_queue_rq) {
/*
* Only blk-mq multiple hardware queues case checks the
* rq in the same queue, there should be only one such
* rq in a queue
**/
*same_queue_rq = rq;
}
if (rq->q != q)
continue;
if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
BIO_MERGE_OK)
return true;
}
return false;
}
/*
* Iterate list of requests and see if we can merge this bio with any
* of them.
*/
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
struct bio *bio, unsigned int nr_segs)
{
struct request *rq;
int checked = 8;
list_for_each_entry_reverse(rq, list, queuelist) {
if (!checked--)
break;
switch (blk_attempt_bio_merge(q, rq, bio, nr_segs, true)) {
case BIO_MERGE_NONE:
continue;
case BIO_MERGE_OK:
return true;
case BIO_MERGE_FAILED:
return false;
}
}
return false;
}
EXPORT_SYMBOL_GPL(blk_bio_list_merge);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **merged_request)
{
struct request *rq;
switch (elv_merge(q, &rq, bio)) {
case ELEVATOR_BACK_MERGE:
if (!blk_mq_sched_allow_merge(q, rq, bio)) return false; if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
return false;
*merged_request = attempt_back_merge(q, rq);
if (!*merged_request)
elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
return true;
case ELEVATOR_FRONT_MERGE:
if (!blk_mq_sched_allow_merge(q, rq, bio))
return false;
if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
return false;
*merged_request = attempt_front_merge(q, rq);
if (!*merged_request)
elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
return true;
case ELEVATOR_DISCARD_MERGE:
return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK;
default:
return false;
}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Kernel Probes (KProbes)
* kernel/kprobes.c
*
* Copyright (C) IBM Corporation, 2002, 2004
*
* 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
* Probes initial implementation (includes suggestions from
* Rusty Russell).
* 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
* hlists and exceptions notifier as suggested by Andi Kleen.
* 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
* interface to access function arguments.
* 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
* exceptions notifier to be first on the priority list.
* 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
* <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
* <prasanna@in.ibm.com> added function-return probes.
*/
#include <linux/kprobes.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/export.h>
#include <linux/moduleloader.h>
#include <linux/kallsyms.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sysctl.h>
#include <linux/kdebug.h>
#include <linux/memory.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>
#include <linux/static_call.h>
#include <linux/perf_event.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
#include <linux/uaccess.h>
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
static int kprobes_initialized;
/* kprobe_table can be accessed by
* - Normal hlist traversal and RCU add/del under kprobe_mutex is held.
* Or
* - RCU hlist traversal under disabling preempt (breakpoint handlers)
*/
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
/* NOTE: change this value only with kprobe_mutex held */
static bool kprobes_all_disarmed;
/* This protects kprobe_table and optimizing_list */
static DEFINE_MUTEX(kprobe_mutex);
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
unsigned int __unused)
{
return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
}
/* Blacklist -- list of struct kprobe_blacklist_entry */
static LIST_HEAD(kprobe_blacklist);
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
* kprobe->ainsn.insn points to the copy of the instruction to be
* single-stepped. x86_64, POWER4 and above have no-exec support and
* stepping on the instruction on a vmalloced/kmalloced/data page
* is a recipe for disaster
*/
struct kprobe_insn_page {
struct list_head list;
kprobe_opcode_t *insns; /* Page of instruction slots */
struct kprobe_insn_cache *cache;
int nused;
int ngarbage;
char slot_used[];
};
#define KPROBE_INSN_PAGE_SIZE(slots) \
(offsetof(struct kprobe_insn_page, slot_used) + \
(sizeof(char) * (slots)))
static int slots_per_page(struct kprobe_insn_cache *c)
{
return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
}
enum kprobe_slot_state {
SLOT_CLEAN = 0,
SLOT_DIRTY = 1,
SLOT_USED = 2,
};
void __weak *alloc_insn_page(void)
{
return module_alloc(PAGE_SIZE);
}
static void free_insn_page(void *page)
{
module_memfree(page);
}
struct kprobe_insn_cache kprobe_insn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
.alloc = alloc_insn_page,
.free = free_insn_page,
.sym = KPROBE_INSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
.insn_size = MAX_INSN_SIZE,
.nr_garbage = 0,
};
static int collect_garbage_slots(struct kprobe_insn_cache *c);
/**
* __get_insn_slot() - Find a slot on an executable page for an instruction.
* We allocate an executable page if there's no room on existing ones.
*/
kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip;
kprobe_opcode_t *slot = NULL;
/* Since the slot array is not protected by rcu, we need a mutex */
mutex_lock(&c->mutex);
retry:
rcu_read_lock();
list_for_each_entry_rcu(kip, &c->pages, list) {
if (kip->nused < slots_per_page(c)) {
int i;
for (i = 0; i < slots_per_page(c); i++) {
if (kip->slot_used[i] == SLOT_CLEAN) {
kip->slot_used[i] = SLOT_USED;
kip->nused++;
slot = kip->insns + (i * c->insn_size);
rcu_read_unlock();
goto out;
}
}
/* kip->nused is broken. Fix it. */
kip->nused = slots_per_page(c);
WARN_ON(1);
}
}
rcu_read_unlock();
/* If there are any garbage slots, collect it and try again. */
if (c->nr_garbage && collect_garbage_slots(c) == 0)
goto retry;
/* All out of space. Need to allocate a new page. */
kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
if (!kip)
goto out;
/*
* Use module_alloc so this page is within +/- 2GB of where the
* kernel image and loaded module images reside. This is required
* so x86_64 can correctly handle the %rip-relative fixups.
*/
kip->insns = c->alloc();
if (!kip->insns) {
kfree(kip);
goto out;
}
INIT_LIST_HEAD(&kip->list);
memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
kip->slot_used[0] = SLOT_USED;
kip->nused = 1;
kip->ngarbage = 0;
kip->cache = c;
list_add_rcu(&kip->list, &c->pages);
slot = kip->insns;
/* Record the perf ksymbol register event after adding the page */
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
PAGE_SIZE, false, c->sym);
out:
mutex_unlock(&c->mutex);
return slot;
}
/* Return 1 if all garbages are collected, otherwise 0. */
static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
kip->slot_used[idx] = SLOT_CLEAN;
kip->nused--;
if (kip->nused == 0) {
/*
* Page is no longer in use. Free it unless
* it's the last one. We keep the last one
* so as not to have to set it up again the
* next time somebody inserts a probe.
*/
if (!list_is_singular(&kip->list)) {
/*
* Record perf ksymbol unregister event before removing
* the page.
*/
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
(unsigned long)kip->insns, PAGE_SIZE, true,
kip->cache->sym);
list_del_rcu(&kip->list);
synchronize_rcu();
kip->cache->free(kip->insns);
kfree(kip);
}
return 1;
}
return 0;
}
static int collect_garbage_slots(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip, *next;
/* Ensure no-one is interrupted on the garbages */
synchronize_rcu();
list_for_each_entry_safe(kip, next, &c->pages, list) {
int i;
if (kip->ngarbage == 0)
continue;
kip->ngarbage = 0; /* we will collect all garbages */
for (i = 0; i < slots_per_page(c); i++) {
if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i))
break;
}
}
c->nr_garbage = 0;
return 0;
}
void __free_insn_slot(struct kprobe_insn_cache *c,
kprobe_opcode_t *slot, int dirty)
{
struct kprobe_insn_page *kip;
long idx;
mutex_lock(&c->mutex);
rcu_read_lock();
list_for_each_entry_rcu(kip, &c->pages, list) {
idx = ((long)slot - (long)kip->insns) /
(c->insn_size * sizeof(kprobe_opcode_t));
if (idx >= 0 && idx < slots_per_page(c))
goto out;
}
/* Could not find this slot. */
WARN_ON(1);
kip = NULL;
out:
rcu_read_unlock();
/* Mark and sweep: this may sleep */
if (kip) {
/* Check double free */
WARN_ON(kip->slot_used[idx] != SLOT_USED);
if (dirty) {
kip->slot_used[idx] = SLOT_DIRTY;
kip->ngarbage++;
if (++c->nr_garbage > slots_per_page(c))
collect_garbage_slots(c);
} else {
collect_one_slot(kip, idx);
}
}
mutex_unlock(&c->mutex);
}
/*
* Check given address is on the page of kprobe instruction slots.
* This will be used for checking whether the address on a stack
* is on a text area or not.
*/
bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
{
struct kprobe_insn_page *kip;
bool ret = false;
rcu_read_lock();
list_for_each_entry_rcu(kip, &c->pages, list) { if (addr >= (unsigned long)kip->insns && addr < (unsigned long)kip->insns + PAGE_SIZE) {
ret = true;
break;
}
}
rcu_read_unlock();
return ret;
}
int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
unsigned long *value, char *type, char *sym)
{
struct kprobe_insn_page *kip;
int ret = -ERANGE;
rcu_read_lock();
list_for_each_entry_rcu(kip, &c->pages, list) {
if ((*symnum)--)
continue;
strlcpy(sym, c->sym, KSYM_NAME_LEN);
*type = 't';
*value = (unsigned long)kip->insns;
ret = 0;
break;
}
rcu_read_unlock();
return ret;
}
#ifdef CONFIG_OPTPROBES
void __weak *alloc_optinsn_page(void)
{
return alloc_insn_page();
}
void __weak free_optinsn_page(void *page)
{
free_insn_page(page);
}
/* For optimized_kprobe buffer */
struct kprobe_insn_cache kprobe_optinsn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
.alloc = alloc_optinsn_page,
.free = free_optinsn_page,
.sym = KPROBE_OPTINSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
/* .insn_size is initialized later */
.nr_garbage = 0,
};
#endif
#endif
/* We have preemption disabled.. so it is safe to use __ versions */
static inline void set_kprobe_instance(struct kprobe *kp)
{
__this_cpu_write(kprobe_instance, kp);
}
static inline void reset_kprobe_instance(void)
{
__this_cpu_write(kprobe_instance, NULL);
}
/*
* This routine is called either:
* - under the kprobe_mutex - during kprobe_[un]register()
* OR
* - with preemption disabled - from arch/xxx/kernel/kprobes.c
*/
struct kprobe *get_kprobe(void *addr)
{
struct hlist_head *head;
struct kprobe *p;
head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
hlist_for_each_entry_rcu(p, head, hlist,
lockdep_is_held(&kprobe_mutex)) {
if (p->addr == addr)
return p;
}
return NULL;
}
NOKPROBE_SYMBOL(get_kprobe);
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
/* Return true if the kprobe is an aggregator */
static inline int kprobe_aggrprobe(struct kprobe *p)
{
return p->pre_handler == aggr_pre_handler;
}
/* Return true(!0) if the kprobe is unused */
static inline int kprobe_unused(struct kprobe *p)
{
return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
list_empty(&p->list);
}
/*
* Keep all fields in the kprobe consistent
*/
static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
}
#ifdef CONFIG_OPTPROBES
/* NOTE: change this value only with kprobe_mutex held */
static bool kprobes_allow_optimization;
/*
* Call all pre_handler on the list, but ignores its return value.
* This must be called from arch-dep optimized caller.
*/
void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp;
list_for_each_entry_rcu(kp, &p->list, list) {
if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
set_kprobe_instance(kp);
kp->pre_handler(kp, regs);
}
reset_kprobe_instance();
}
}
NOKPROBE_SYMBOL(opt_pre_handler);
/* Free optimized instructions and optimized_kprobe */
static void free_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
op = container_of(p, struct optimized_kprobe, kp);
arch_remove_optimized_kprobe(op);
arch_remove_kprobe(p);
kfree(op);
}
/* Return true(!0) if the kprobe is ready for optimization. */
static inline int kprobe_optready(struct kprobe *p)
{
struct optimized_kprobe *op;
if (kprobe_aggrprobe(p)) {
op = container_of(p, struct optimized_kprobe, kp);
return arch_prepared_optinsn(&op->optinsn);
}
return 0;
}
/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
static inline int kprobe_disarmed(struct kprobe *p)
{
struct optimized_kprobe *op;
/* If kprobe is not aggr/opt probe, just return kprobe is disabled */
if (!kprobe_aggrprobe(p))
return kprobe_disabled(p);
op = container_of(p, struct optimized_kprobe, kp);
return kprobe_disabled(p) && list_empty(&op->list);
}
/* Return true(!0) if the probe is queued on (un)optimizing lists */
static int kprobe_queued(struct kprobe *p)
{
struct optimized_kprobe *op;
if (kprobe_aggrprobe(p)) {
op = container_of(p, struct optimized_kprobe, kp);
if (!list_empty(&op->list))
return 1;
}
return 0;
}
/*
* Return an optimized kprobe whose optimizing code replaces
* instructions including addr (exclude breakpoint).
*/
static struct kprobe *get_optimized_kprobe(unsigned long addr)
{
int i;
struct kprobe *p = NULL;
struct optimized_kprobe *op;
/* Don't check i == 0, since that is a breakpoint case. */
for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
p = get_kprobe((void *)(addr - i));
if (p && kprobe_optready(p)) {
op = container_of(p, struct optimized_kprobe, kp);
if (arch_within_optimized_kprobe(op, addr))
return p;
}
return NULL;
}
/* Optimization staging list, protected by kprobe_mutex */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
static LIST_HEAD(freeing_list);
static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
#define OPTIMIZE_DELAY 5
/*
* Optimize (replace a breakpoint with a jump) kprobes listed on
* optimizing_list.
*/
static void do_optimize_kprobes(void)
{
lockdep_assert_held(&text_mutex);
/*
* The optimization/unoptimization refers online_cpus via
* stop_machine() and cpu-hotplug modifies online_cpus.
* And same time, text_mutex will be held in cpu-hotplug and here.
* This combination can cause a deadlock (cpu-hotplug try to lock
* text_mutex but stop_machine can not be done because online_cpus
* has been changed)
* To avoid this deadlock, caller must have locked cpu hotplug
* for preventing cpu-hotplug outside of text_mutex locking.
*/
lockdep_assert_cpus_held();
/* Optimization never be done when disarmed */
if (kprobes_all_disarmed || !kprobes_allow_optimization ||
list_empty(&optimizing_list))
return;
arch_optimize_kprobes(&optimizing_list);
}
/*
* Unoptimize (replace a jump with a breakpoint and remove the breakpoint
* if need) kprobes listed on unoptimizing_list.
*/
static void do_unoptimize_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
lockdep_assert_held(&text_mutex);
/* See comment in do_optimize_kprobes() */
lockdep_assert_cpus_held();
/* Unoptimization must be done anytime */
if (list_empty(&unoptimizing_list))
return;
arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
/* Loop free_list for disarming */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
/* Switching from detour code to origin */
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
/* Disarm probes if marked disabled */
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
if (kprobe_unused(&op->kp)) {
/*
* Remove unused probes from hash list. After waiting
* for synchronization, these probes are reclaimed.
* (reclaiming is done by do_free_cleaned_kprobes.)
*/
hlist_del_rcu(&op->kp.hlist);
} else
list_del_init(&op->list);
}
}
/* Reclaim all kprobes on the free_list */
static void do_free_cleaned_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
list_del_init(&op->list);
if (WARN_ON_ONCE(!kprobe_unused(&op->kp))) {
/*
* This must not happen, but if there is a kprobe
* still in use, keep it on kprobes hash list.
*/
continue;
}
free_aggr_kprobe(&op->kp);
}
}
/* Start optimizer after OPTIMIZE_DELAY passed */
static void kick_kprobe_optimizer(void)
{
schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}
/* Kprobe jump optimizer */
static void kprobe_optimizer(struct work_struct *work)
{
mutex_lock(&kprobe_mutex);
cpus_read_lock();
mutex_lock(&text_mutex);
/*
* Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
* kprobes before waiting for quiesence period.
*/
do_unoptimize_kprobes();
/*
* Step 2: Wait for quiesence period to ensure all potentially
* preempted tasks to have normally scheduled. Because optprobe
* may modify multiple instructions, there is a chance that Nth
* instruction is preempted. In that case, such tasks can return
* to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
* Note that on non-preemptive kernel, this is transparently converted
* to synchronoze_sched() to wait for all interrupts to have completed.
*/
synchronize_rcu_tasks();
/* Step 3: Optimize kprobes after quiesence period */
do_optimize_kprobes();
/* Step 4: Free cleaned kprobes after quiesence period */
do_free_cleaned_kprobes();
mutex_unlock(&text_mutex);
cpus_read_unlock();
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
mutex_unlock(&kprobe_mutex);
}
/* Wait for completing optimization and unoptimization */
void wait_for_kprobe_optimizer(void)
{
mutex_lock(&kprobe_mutex);
while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
mutex_unlock(&kprobe_mutex);
/* this will also make optimizing_work execute immmediately */
flush_delayed_work(&optimizing_work);
/* @optimizing_work might not have been queued yet, relax */
cpu_relax();
mutex_lock(&kprobe_mutex);
}
mutex_unlock(&kprobe_mutex);
}
static bool optprobe_queued_unopt(struct optimized_kprobe *op)
{
struct optimized_kprobe *_op;
list_for_each_entry(_op, &unoptimizing_list, list) {
if (op == _op)
return true;
}
return false;
}
/* Optimize kprobe if p is ready to be optimized */
static void optimize_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
/* Check if the kprobe is disabled or not ready for optimization. */
if (!kprobe_optready(p) || !kprobes_allow_optimization ||
(kprobe_disabled(p) || kprobes_all_disarmed))
return;
/* kprobes with post_handler can not be optimized */
if (p->post_handler)
return;
op = container_of(p, struct optimized_kprobe, kp);
/* Check there is no other kprobes at the optimized instructions */
if (arch_check_optimized_kprobe(op) < 0)
return;
/* Check if it is already optimized. */
if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) {
if (optprobe_queued_unopt(op)) {
/* This is under unoptimizing. Just dequeue the probe */
list_del_init(&op->list);
}
return;
}
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
/* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */
if (WARN_ON_ONCE(!list_empty(&op->list)))
return;
list_add(&op->list, &optimizing_list);
kick_kprobe_optimizer();
}
/* Short cut to direct unoptimizing */
static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
lockdep_assert_cpus_held();
arch_unoptimize_kprobe(op);
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
}
/* Unoptimize a kprobe if p is optimized */
static void unoptimize_kprobe(struct kprobe *p, bool force)
{
struct optimized_kprobe *op;
if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
return; /* This is not an optprobe nor optimized */
op = container_of(p, struct optimized_kprobe, kp);
if (!kprobe_optimized(p))
return;
if (!list_empty(&op->list)) {
if (optprobe_queued_unopt(op)) {
/* Queued in unoptimizing queue */
if (force) {
/*
* Forcibly unoptimize the kprobe here, and queue it
* in the freeing list for release afterwards.
*/
force_unoptimize_kprobe(op);
list_move(&op->list, &freeing_list);
}
} else {
/* Dequeue from the optimizing queue */
list_del_init(&op->list);
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
}
return;
}
/* Optimized kprobe case */
if (force) {
/* Forcibly update the code: this is a special case */
force_unoptimize_kprobe(op);
} else {
list_add(&op->list, &unoptimizing_list);
kick_kprobe_optimizer();
}
}
/* Cancel unoptimizing for reusing */
static int reuse_unused_kprobe(struct kprobe *ap)
{
struct optimized_kprobe *op;
/*
* Unused kprobe MUST be on the way of delayed unoptimizing (means
* there is still a relative jump) and disabled.
*/
op = container_of(ap, struct optimized_kprobe, kp);
WARN_ON_ONCE(list_empty(&op->list));
/* Enable the probe again */
ap->flags &= ~KPROBE_FLAG_DISABLED;
/* Optimize it again (remove from op->list) */
if (!kprobe_optready(ap))
return -EINVAL;
optimize_kprobe(ap);
return 0;
}
/* Remove optimized instructions */
static void kill_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
op = container_of(p, struct optimized_kprobe, kp);
if (!list_empty(&op->list))
/* Dequeue from the (un)optimization queue */
list_del_init(&op->list);
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_unused(p)) {
/* Enqueue if it is unused */
list_add(&op->list, &freeing_list);
/*
* Remove unused probes from the hash list. After waiting
* for synchronization, this probe is reclaimed.
* (reclaiming is done by do_free_cleaned_kprobes().)
*/
hlist_del_rcu(&op->kp.hlist);
}
/* Don't touch the code, because it is already freed. */
arch_remove_optimized_kprobe(op);
}
static inline
void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
{
if (!kprobe_ftrace(p))
arch_prepare_optimized_kprobe(op, p);
}
/* Try to prepare optimized instructions */
static void prepare_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
op = container_of(p, struct optimized_kprobe, kp);
__prepare_optimized_kprobe(op, p);
}
/* Allocate new optimized_kprobe and try to prepare optimized instructions */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
if (!op)
return NULL;
INIT_LIST_HEAD(&op->list);
op->kp.addr = p->addr;
__prepare_optimized_kprobe(op, p);
return &op->kp;
}
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
/*
* Prepare an optimized_kprobe and optimize it
* NOTE: p must be a normal registered kprobe
*/
static void try_to_optimize_kprobe(struct kprobe *p)
{
struct kprobe *ap;
struct optimized_kprobe *op;
/* Impossible to optimize ftrace-based kprobe */
if (kprobe_ftrace(p))
return;
/* For preparing optimization, jump_label_text_reserved() is called */
cpus_read_lock();
jump_label_lock();
mutex_lock(&text_mutex);
ap = alloc_aggr_kprobe(p);
if (!ap)
goto out;
op = container_of(ap, struct optimized_kprobe, kp);
if (!arch_prepared_optinsn(&op->optinsn)) {
/* If failed to setup optimizing, fallback to kprobe */
arch_remove_optimized_kprobe(op);
kfree(op);
goto out;
}
init_aggr_kprobe(ap, p);
optimize_kprobe(ap); /* This just kicks optimizer thread */
out:
mutex_unlock(&text_mutex);
jump_label_unlock();
cpus_read_unlock();
}
static void optimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
unsigned int i;
mutex_lock(&kprobe_mutex);
/* If optimization is already allowed, just return */
if (kprobes_allow_optimization)
goto out;
cpus_read_lock();
kprobes_allow_optimization = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry(p, head, hlist)
if (!kprobe_disabled(p))
optimize_kprobe(p);
}
cpus_read_unlock();
printk(KERN_INFO "Kprobes globally optimized\n");
out:
mutex_unlock(&kprobe_mutex);
}
#ifdef CONFIG_SYSCTL
static void unoptimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
unsigned int i;
mutex_lock(&kprobe_mutex);
/* If optimization is already prohibited, just return */
if (!kprobes_allow_optimization) {
mutex_unlock(&kprobe_mutex);
return;
}
cpus_read_lock();
kprobes_allow_optimization = false;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry(p, head, hlist) {
if (!kprobe_disabled(p))
unoptimize_kprobe(p, false);
}
}
cpus_read_unlock();
mutex_unlock(&kprobe_mutex);
/* Wait for unoptimizing completion */
wait_for_kprobe_optimizer();
printk(KERN_INFO "Kprobes globally unoptimized\n");
}
static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
void *buffer, size_t *length,
loff_t *ppos)
{
int ret;
mutex_lock(&kprobe_sysctl_mutex);
sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (sysctl_kprobes_optimization)
optimize_all_kprobes();
else
unoptimize_all_kprobes();
mutex_unlock(&kprobe_sysctl_mutex);
return ret;
}
#endif /* CONFIG_SYSCTL */
/* Put a breakpoint for a probe. Must be called with text_mutex locked */
static void __arm_kprobe(struct kprobe *p)
{
struct kprobe *_p;
/* Check collision with other optimized kprobes */
_p = get_optimized_kprobe((unsigned long)p->addr);
if (unlikely(_p))
/* Fallback to unoptimized kprobe */
unoptimize_kprobe(_p, true);
arch_arm_kprobe(p);
optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
}
/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
struct kprobe *_p;
/* Try to unoptimize */
unoptimize_kprobe(p, kprobes_all_disarmed);
if (!kprobe_queued(p)) {
arch_disarm_kprobe(p);
/* If another kprobe was blocked, optimize it. */
_p = get_optimized_kprobe((unsigned long)p->addr);
if (unlikely(_p) && reopt)
optimize_kprobe(_p);
}
/* TODO: reoptimize others after unoptimized this probe */
}
#else /* !CONFIG_OPTPROBES */
#define optimize_kprobe(p) do {} while (0)
#define unoptimize_kprobe(p, f) do {} while (0)
#define kill_optimized_kprobe(p) do {} while (0)
#define prepare_optimized_kprobe(p) do {} while (0)
#define try_to_optimize_kprobe(p) do {} while (0)
#define __arm_kprobe(p) arch_arm_kprobe(p)
#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
#define kprobe_disarmed(p) kprobe_disabled(p)
#define wait_for_kprobe_optimizer() do {} while (0)
static int reuse_unused_kprobe(struct kprobe *ap)
{
/*
* If the optimized kprobe is NOT supported, the aggr kprobe is
* released at the same time that the last aggregated kprobe is
* unregistered.
* Thus there should be no chance to reuse unused kprobe.
*/
printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
return -EINVAL;
}
static void free_aggr_kprobe(struct kprobe *p)
{
arch_remove_kprobe(p);
kfree(p);
}
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
}
#endif /* CONFIG_OPTPROBES */
#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
.flags = FTRACE_OPS_FL_SAVE_REGS,
};
static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
.func = kprobe_ftrace_handler,
.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};
static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;
/* Must ensure p->addr is really on ftrace */
static int prepare_kprobe(struct kprobe *p)
{
if (!kprobe_ftrace(p))
return arch_prepare_kprobe(p);
return arch_prepare_kprobe_ftrace(p);
}
/* Caller must lock kprobe_mutex */
static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
{
int ret = 0;
ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
if (ret) {
pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n",
p->addr, ret);
return ret;
}
if (*cnt == 0) {
ret = register_ftrace_function(ops);
if (ret) {
pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);
goto err_ftrace;
}
}
(*cnt)++;
return ret;
err_ftrace:
/*
* At this point, sinec ops is not registered, we should be sefe from
* registering empty filter.
*/
ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
return ret;
}
static int arm_kprobe_ftrace(struct kprobe *p)
{
bool ipmodify = (p->post_handler != NULL);
return __arm_kprobe_ftrace(p,
ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
/* Caller must lock kprobe_mutex */
static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int *cnt)
{
int ret = 0;
if (*cnt == 1) {
ret = unregister_ftrace_function(ops);
if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))
return ret;
}
(*cnt)--;
ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n",
p->addr, ret);
return ret;
}
static int disarm_kprobe_ftrace(struct kprobe *p)
{
bool ipmodify = (p->post_handler != NULL);
return __disarm_kprobe_ftrace(p,
ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
#else /* !CONFIG_KPROBES_ON_FTRACE */
static inline int prepare_kprobe(struct kprobe *p)
{
return arch_prepare_kprobe(p);
}
static inline int arm_kprobe_ftrace(struct kprobe *p)
{
return -ENODEV;
}
static inline int disarm_kprobe_ftrace(struct kprobe *p)
{
return -ENODEV;
}
#endif
/* Arm a kprobe with text_mutex */
static int arm_kprobe(struct kprobe *kp)
{
if (unlikely(kprobe_ftrace(kp)))
return arm_kprobe_ftrace(kp);
cpus_read_lock();
mutex_lock(&text_mutex);
__arm_kprobe(kp);
mutex_unlock(&text_mutex);
cpus_read_unlock();
return 0;
}
/* Disarm a kprobe with text_mutex */
static int disarm_kprobe(struct kprobe *kp, bool reopt)
{
if (unlikely(kprobe_ftrace(kp)))
return disarm_kprobe_ftrace(kp);
cpus_read_lock();
mutex_lock(&text_mutex);
__disarm_kprobe(kp, reopt);
mutex_unlock(&text_mutex);
cpus_read_unlock();
return 0;
}
/*
* Aggregate handlers for multiple kprobes support - these handlers
* take care of invoking the individual kprobe handlers on p->list
*/
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp;
list_for_each_entry_rcu(kp, &p->list, list) {
if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
set_kprobe_instance(kp);
if (kp->pre_handler(kp, regs))
return 1;
}
reset_kprobe_instance();
}
return 0;
}
NOKPROBE_SYMBOL(aggr_pre_handler);
static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
struct kprobe *kp;
list_for_each_entry_rcu(kp, &p->list, list) {
if (kp->post_handler && likely(!kprobe_disabled(kp))) {
set_kprobe_instance(kp);
kp->post_handler(kp, regs, flags);
reset_kprobe_instance();
}
}
}
NOKPROBE_SYMBOL(aggr_post_handler);
/* Walks the list and increments nmissed count for multiprobe case */
void kprobes_inc_nmissed_count(struct kprobe *p)
{
struct kprobe *kp;
if (!kprobe_aggrprobe(p)) {
p->nmissed++;
} else {
list_for_each_entry_rcu(kp, &p->list, list)
kp->nmissed++;
}
return;
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
static void free_rp_inst_rcu(struct rcu_head *head)
{
struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
if (refcount_dec_and_test(&ri->rph->ref))
kfree(ri->rph);
kfree(ri);
}
NOKPROBE_SYMBOL(free_rp_inst_rcu);
static void recycle_rp_inst(struct kretprobe_instance *ri)
{
struct kretprobe *rp = get_kretprobe(ri);
if (likely(rp)) {
freelist_add(&ri->freelist, &rp->freelist);
} else
call_rcu(&ri->rcu, free_rp_inst_rcu);
}
NOKPROBE_SYMBOL(recycle_rp_inst);
static struct kprobe kprobe_busy = {
.addr = (void *) get_kprobe,
};
void kprobe_busy_begin(void)
{
struct kprobe_ctlblk *kcb;
preempt_disable();
__this_cpu_write(current_kprobe, &kprobe_busy);
kcb = get_kprobe_ctlblk();
kcb->kprobe_status = KPROBE_HIT_ACTIVE;
}
void kprobe_busy_end(void)
{
__this_cpu_write(current_kprobe, NULL);
preempt_enable();
}
/*
* This function is called from finish_task_switch when task tk becomes dead,
* so that we can recycle any function-return probe instances associated
* with this task. These left over instances represent probed functions
* that have been called but will never return.
*/
void kprobe_flush_task(struct task_struct *tk)
{
struct kretprobe_instance *ri;
struct llist_node *node;
/* Early boot, not yet initialized. */
if (unlikely(!kprobes_initialized))
return;
kprobe_busy_begin();
node = __llist_del_all(&tk->kretprobe_instances);
while (node) {
ri = container_of(node, struct kretprobe_instance, llist);
node = node->next;
recycle_rp_inst(ri);
}
kprobe_busy_end();
}
NOKPROBE_SYMBOL(kprobe_flush_task);
static inline void free_rp_inst(struct kretprobe *rp)
{
struct kretprobe_instance *ri;
struct freelist_node *node;
int count = 0;
node = rp->freelist.head;
while (node) {
ri = container_of(node, struct kretprobe_instance, freelist);
node = node->next;
kfree(ri);
count++;
}
if (refcount_sub_and_test(count, &rp->rph->ref)) {
kfree(rp->rph);
rp->rph = NULL;
}
}
/* Add the new probe to ap->list */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
if (p->post_handler)
unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
list_add_rcu(&p->list, &ap->list);
if (p->post_handler && !ap->post_handler)
ap->post_handler = aggr_post_handler;
return 0;
}
/*
* Fill in the required fields of the "manager kprobe". Replace the
* earlier kprobe in the hlist with the manager kprobe
*/
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
/* Copy p's insn slot to ap */
copy_kprobe(p, ap);
flush_insn_slot(ap);
ap->addr = p->addr;
ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
ap->pre_handler = aggr_pre_handler;
/* We don't care the kprobe which has gone. */
if (p->post_handler && !kprobe_gone(p))
ap->post_handler = aggr_post_handler;
INIT_LIST_HEAD(&ap->list);
INIT_HLIST_NODE(&ap->hlist);
list_add_rcu(&p->list, &ap->list);
hlist_replace_rcu(&p->hlist, &ap->hlist);
}
/*
* This is the second or subsequent kprobe at the address - handle
* the intricacies
*/
static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
{
int ret = 0;
struct kprobe *ap = orig_p;
cpus_read_lock();
/* For preparing optimization, jump_label_text_reserved() is called */
jump_label_lock();
mutex_lock(&text_mutex);
if (!kprobe_aggrprobe(orig_p)) {
/* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
ap = alloc_aggr_kprobe(orig_p);
if (!ap) {
ret = -ENOMEM;
goto out;
}
init_aggr_kprobe(ap, orig_p);
} else if (kprobe_unused(ap)) {
/* This probe is going to die. Rescue it */
ret = reuse_unused_kprobe(ap);
if (ret)
goto out;
}
if (kprobe_gone(ap)) {
/*
* Attempting to insert new probe at the same location that
* had a probe in the module vaddr area which already
* freed. So, the instruction slot has already been
* released. We need a new slot for the new probe.
*/
ret = arch_prepare_kprobe(ap);
if (ret)
/*
* Even if fail to allocate new slot, don't need to
* free aggr_probe. It will be used next time, or
* freed by unregister_kprobe.
*/
goto out;
/* Prepare optimized instructions if possible. */
prepare_optimized_kprobe(ap);
/*
* Clear gone flag to prevent allocating new slot again, and
* set disabled flag because it is not armed yet.
*/
ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
| KPROBE_FLAG_DISABLED;
}
/* Copy ap's insn slot to p */
copy_kprobe(ap, p);
ret = add_new_kprobe(ap, p);
out:
mutex_unlock(&text_mutex);
jump_label_unlock();
cpus_read_unlock();
if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
ap->flags &= ~KPROBE_FLAG_DISABLED;
if (!kprobes_all_disarmed) {
/* Arm the breakpoint again. */
ret = arm_kprobe(ap);
if (ret) {
ap->flags |= KPROBE_FLAG_DISABLED;
list_del_rcu(&p->list);
synchronize_rcu();
}
}
}
return ret;
}
bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
/* The __kprobes marked functions and entry code must not be probed */
return addr >= (unsigned long)__kprobes_text_start &&
addr < (unsigned long)__kprobes_text_end;
}
static bool __within_kprobe_blacklist(unsigned long addr)
{
struct kprobe_blacklist_entry *ent;
if (arch_within_kprobe_blacklist(addr))
return true;
/*
* If there exists a kprobe_blacklist, verify and
* fail any probe registration in the prohibited area
*/
list_for_each_entry(ent, &kprobe_blacklist, list) {
if (addr >= ent->start_addr && addr < ent->end_addr)
return true;
}
return false;
}
bool within_kprobe_blacklist(unsigned long addr)
{
char symname[KSYM_NAME_LEN], *p;
if (__within_kprobe_blacklist(addr))
return true;
/* Check if the address is on a suffixed-symbol */
if (!lookup_symbol_name(addr, symname)) {
p = strchr(symname, '.');
if (!p)
return false;
*p = '\0';
addr = (unsigned long)kprobe_lookup_name(symname, 0);
if (addr)
return __within_kprobe_blacklist(addr);
}
return false;
}
/*
* If we have a symbol_name argument, look it up and add the offset field
* to it. This way, we can specify a relative address to a symbol.
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
const char *symbol_name, unsigned int offset)
{
if ((symbol_name && addr) || (!symbol_name && !addr))
goto invalid;
if (symbol_name) {
addr = kprobe_lookup_name(symbol_name, offset);
if (!addr)
return ERR_PTR(-ENOENT);
}
addr = (kprobe_opcode_t *)(((char *)addr) + offset);
if (addr)
return addr;
invalid:
return ERR_PTR(-EINVAL);
}
static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
return _kprobe_addr(p->addr, p->symbol_name, p->offset);
}
/* Check passed kprobe is valid and return kprobe in kprobe_table. */
static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
struct kprobe *ap, *list_p;
lockdep_assert_held(&kprobe_mutex);
ap = get_kprobe(p->addr);
if (unlikely(!ap))
return NULL;
if (p != ap) {
list_for_each_entry(list_p, &ap->list, list)
if (list_p == p)
/* kprobe p is a valid probe */
goto valid;
return NULL;
}
valid:
return ap;
}
/*
* Warn and return error if the kprobe is being re-registered since
* there must be a software bug.
*/
static inline int warn_kprobe_rereg(struct kprobe *p)
{
int ret = 0;
mutex_lock(&kprobe_mutex);
if (WARN_ON_ONCE(__get_valid_kprobe(p)))
ret = -EINVAL;
mutex_unlock(&kprobe_mutex);
return ret;
}
int __weak arch_check_ftrace_location(struct kprobe *p)
{
unsigned long ftrace_addr;
ftrace_addr = ftrace_location((unsigned long)p->addr);
if (ftrace_addr) {
#ifdef CONFIG_KPROBES_ON_FTRACE
/* Given address is not on the instruction boundary */
if ((unsigned long)p->addr != ftrace_addr)
return -EILSEQ;
p->flags |= KPROBE_FLAG_FTRACE;
#else /* !CONFIG_KPROBES_ON_FTRACE */
return -EINVAL;
#endif
}
return 0;
}
static int check_kprobe_address_safe(struct kprobe *p,
struct module **probed_mod)
{
int ret;
ret = arch_check_ftrace_location(p);
if (ret)
return ret;
jump_label_lock();
preempt_disable();
/* Ensure it is not in reserved area nor out of text */
if (!kernel_text_address((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr) ||
static_call_text_reserved(p->addr, p->addr) ||
find_bug((unsigned long)p->addr)) {
ret = -EINVAL;
goto out;
}
/* Check if are we probing a module */
*probed_mod = __module_text_address((unsigned long) p->addr);
if (*probed_mod) {
/*
* We must hold a refcount of the probed module while updating
* its code to prohibit unexpected unloading.
*/
if (unlikely(!try_module_get(*probed_mod))) {
ret = -ENOENT;
goto out;
}
/*
* If the module freed .init.text, we couldn't insert
* kprobes in there.
*/
if (within_module_init((unsigned long)p->addr, *probed_mod) &&
(*probed_mod)->state != MODULE_STATE_COMING) {
module_put(*probed_mod);
*probed_mod = NULL;
ret = -ENOENT;
}
}
out:
preempt_enable();
jump_label_unlock();
return ret;
}
int register_kprobe(struct kprobe *p)
{
int ret;
struct kprobe *old_p;
struct module *probed_mod;
kprobe_opcode_t *addr;
/* Adjust probe address from symbol */
addr = kprobe_addr(p);
if (IS_ERR(addr))
return PTR_ERR(addr);
p->addr = addr;
ret = warn_kprobe_rereg(p);
if (ret)
return ret;
/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
p->flags &= KPROBE_FLAG_DISABLED;
p->nmissed = 0;
INIT_LIST_HEAD(&p->list);
ret = check_kprobe_address_safe(p, &probed_mod);
if (ret)
return ret;
mutex_lock(&kprobe_mutex);
old_p = get_kprobe(p->addr);
if (old_p) {
/* Since this may unoptimize old_p, locking text_mutex. */
ret = register_aggr_kprobe(old_p, p);
goto out;
}
cpus_read_lock();
/* Prevent text modification */
mutex_lock(&text_mutex);
ret = prepare_kprobe(p);
mutex_unlock(&text_mutex);
cpus_read_unlock();
if (ret)
goto out;
INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
if (!kprobes_all_disarmed && !kprobe_disabled(p)) {
ret = arm_kprobe(p);
if (ret) {
hlist_del_rcu(&p->hlist);
synchronize_rcu();
goto out;
}
}
/* Try to optimize kprobe */
try_to_optimize_kprobe(p);
out:
mutex_unlock(&kprobe_mutex);
if (probed_mod)
module_put(probed_mod);
return ret;
}
EXPORT_SYMBOL_GPL(register_kprobe);
/* Check if all probes on the aggrprobe are disabled */
static int aggr_kprobe_disabled(struct kprobe *ap)
{
struct kprobe *kp;
lockdep_assert_held(&kprobe_mutex);
list_for_each_entry(kp, &ap->list, list)
if (!kprobe_disabled(kp))
/*
* There is an active probe on the list.
* We can't disable this ap.
*/
return 0;
return 1;
}
/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
static struct kprobe *__disable_kprobe(struct kprobe *p)
{
struct kprobe *orig_p;
int ret;
/* Get an original kprobe for return */
orig_p = __get_valid_kprobe(p);
if (unlikely(orig_p == NULL))
return ERR_PTR(-EINVAL);
if (!kprobe_disabled(p)) {
/* Disable probe if it is a child probe */
if (p != orig_p)
p->flags |= KPROBE_FLAG_DISABLED;
/* Try to disarm and disable this/parent probe */
if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
/*
* If kprobes_all_disarmed is set, orig_p
* should have already been disarmed, so
* skip unneed disarming process.
*/
if (!kprobes_all_disarmed) {
ret = disarm_kprobe(orig_p, true);
if (ret) {
p->flags &= ~KPROBE_FLAG_DISABLED;
return ERR_PTR(ret);
}
}
orig_p->flags |= KPROBE_FLAG_DISABLED;
}
}
return orig_p;
}
/*
* Unregister a kprobe without a scheduler synchronization.
*/
static int __unregister_kprobe_top(struct kprobe *p)
{
struct kprobe *ap, *list_p;
/* Disable kprobe. This will disarm it if needed. */
ap = __disable_kprobe(p);
if (IS_ERR(ap))
return PTR_ERR(ap);
if (ap == p)
/*
* This probe is an independent(and non-optimized) kprobe
* (not an aggrprobe). Remove from the hash list.
*/
goto disarmed;
/* Following process expects this probe is an aggrprobe */
WARN_ON(!kprobe_aggrprobe(ap));
if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
/*
* !disarmed could be happen if the probe is under delayed
* unoptimizing.
*/
goto disarmed;
else {
/* If disabling probe has special handlers, update aggrprobe */
if (p->post_handler && !kprobe_gone(p)) {
list_for_each_entry(list_p, &ap->list, list) {
if ((list_p != p) && (list_p->post_handler))
goto noclean;
}
ap->post_handler = NULL;
}
noclean:
/*
* Remove from the aggrprobe: this path will do nothing in
* __unregister_kprobe_bottom().
*/
list_del_rcu(&p->list);
if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
/*
* Try to optimize this probe again, because post
* handler may have been changed.
*/
optimize_kprobe(ap);
}
return 0;
disarmed:
hlist_del_rcu(&ap->hlist);
return 0;
}
static void __unregister_kprobe_bottom(struct kprobe *p)
{
struct kprobe *ap;
if (list_empty(&p->list))
/* This is an independent kprobe */
arch_remove_kprobe(p);
else if (list_is_singular(&p->list)) {
/* This is the last child of an aggrprobe */
ap = list_entry(p->list.next, struct kprobe, list);
list_del(&p->list);
free_aggr_kprobe(ap);
}
/* Otherwise, do nothing. */
}
int register_kprobes(struct kprobe **kps, int num)
{
int i, ret = 0;
if (num <= 0)
return -EINVAL;
for (i = 0; i < num; i++) {
ret = register_kprobe(kps[i]);
if (ret < 0) {
if (i > 0)
unregister_kprobes(kps, i);
break;
}
}
return ret;
}
EXPORT_SYMBOL_GPL(register_kprobes);
void unregister_kprobe(struct kprobe *p)
{
unregister_kprobes(&p, 1);
}
EXPORT_SYMBOL_GPL(unregister_kprobe);
void unregister_kprobes(struct kprobe **kps, int num)
{
int i;
if (num <= 0)
return;
mutex_lock(&kprobe_mutex);
for (i = 0; i < num; i++)
if (__unregister_kprobe_top(kps[i]) < 0)
kps[i]->addr = NULL;
mutex_unlock(&kprobe_mutex);
synchronize_rcu();
for (i = 0; i < num; i++)
if (kps[i]->addr)
__unregister_kprobe_bottom(kps[i]);
}
EXPORT_SYMBOL_GPL(unregister_kprobes);
int __weak kprobe_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data)
{
return NOTIFY_DONE;
}
NOKPROBE_SYMBOL(kprobe_exceptions_notify);
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
.priority = 0x7fffffff /* we need to be notified first */
};
unsigned long __weak arch_deref_entry_point(void *entry)
{
return (unsigned long)entry;
}
#ifdef CONFIG_KRETPROBES
unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
void *trampoline_address,
void *frame_pointer)
{
kprobe_opcode_t *correct_ret_addr = NULL;
struct kretprobe_instance *ri = NULL;
struct llist_node *first, *node;
struct kretprobe *rp;
/* Find all nodes for this frame. */
first = node = current->kretprobe_instances.first;
while (node) {
ri = container_of(node, struct kretprobe_instance, llist);
BUG_ON(ri->fp != frame_pointer);
if (ri->ret_addr != trampoline_address) {
correct_ret_addr = ri->ret_addr;
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
goto found;
}
node = node->next;
}
pr_err("Oops! Kretprobe fails to find correct return address.\n");
BUG_ON(1);
found:
/* Unlink all nodes for this frame. */
current->kretprobe_instances.first = node->next;
node->next = NULL;
/* Run them.. */
while (first) {
ri = container_of(first, struct kretprobe_instance, llist);
first = first->next;
rp = get_kretprobe(ri);
if (rp && rp->handler) {
struct kprobe *prev = kprobe_running();
__this_cpu_write(current_kprobe, &rp->kp);
ri->ret_addr = correct_ret_addr;
rp->handler(ri, regs);
__this_cpu_write(current_kprobe, prev);
}
recycle_rp_inst(ri);
}
return (unsigned long)correct_ret_addr;
}
NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)
/*
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
*/
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
struct kretprobe_instance *ri;
struct freelist_node *fn;
fn = freelist_try_get(&rp->freelist);
if (!fn) {
rp->nmissed++;
return 0;
}
ri = container_of(fn, struct kretprobe_instance, freelist);
if (rp->entry_handler && rp->entry_handler(ri, regs)) {
freelist_add(&ri->freelist, &rp->freelist);
return 0;
}
arch_prepare_kretprobe(ri, regs);
__llist_add(&ri->llist, ¤t->kretprobe_instances);
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
bool __weak arch_kprobe_on_func_entry(unsigned long offset)
{
return !offset;
}
/**
* kprobe_on_func_entry() -- check whether given address is function entry
* @addr: Target address
* @sym: Target symbol name
* @offset: The offset from the symbol or the address
*
* This checks whether the given @addr+@offset or @sym+@offset is on the
* function entry address or not.
* This returns 0 if it is the function entry, or -EINVAL if it is not.
* And also it returns -ENOENT if it fails the symbol or address lookup.
* Caller must pass @addr or @sym (either one must be NULL), or this
* returns -EINVAL.
*/
int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
if (IS_ERR(kp_addr))
return PTR_ERR(kp_addr);
if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset))
return -ENOENT;
if (!arch_kprobe_on_func_entry(offset))
return -EINVAL;
return 0;
}
int register_kretprobe(struct kretprobe *rp)
{
int ret;
struct kretprobe_instance *inst;
int i;
void *addr;
ret = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset);
if (ret)
return ret;
/* If only rp->kp.addr is specified, check reregistering kprobes */
if (rp->kp.addr && warn_kprobe_rereg(&rp->kp))
return -EINVAL;
if (kretprobe_blacklist_size) {
addr = kprobe_addr(&rp->kp);
if (IS_ERR(addr))
return PTR_ERR(addr);
for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
if (kretprobe_blacklist[i].addr == addr)
return -EINVAL;
}
}
if (rp->data_size > KRETPROBE_MAX_DATA_SIZE)
return -E2BIG;
rp->kp.pre_handler = pre_handler_kretprobe;
rp->kp.post_handler = NULL;
/* Pre-allocate memory for max kretprobe instances */
if (rp->maxactive <= 0) {
#ifdef CONFIG_PREEMPTION
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
#else
rp->maxactive = num_possible_cpus();
#endif
}
rp->freelist.head = NULL;
rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
if (!rp->rph)
return -ENOMEM;
rp->rph->rp = rp;
for (i = 0; i < rp->maxactive; i++) {
inst = kzalloc(sizeof(struct kretprobe_instance) +
rp->data_size, GFP_KERNEL);
if (inst == NULL) {
refcount_set(&rp->rph->ref, i);
free_rp_inst(rp);
return -ENOMEM;
}
inst->rph = rp->rph;
freelist_add(&inst->freelist, &rp->freelist);
}
refcount_set(&rp->rph->ref, i);
rp->nmissed = 0;
/* Establish function entry probe point */
ret = register_kprobe(&rp->kp);
if (ret != 0)
free_rp_inst(rp);
return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
int register_kretprobes(struct kretprobe **rps, int num)
{
int ret = 0, i;
if (num <= 0)
return -EINVAL;
for (i = 0; i < num; i++) {
ret = register_kretprobe(rps[i]);
if (ret < 0) {
if (i > 0)
unregister_kretprobes(rps, i);
break;
}
}
return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobes);
void unregister_kretprobe(struct kretprobe *rp)
{
unregister_kretprobes(&rp, 1);
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);
void unregister_kretprobes(struct kretprobe **rps, int num)
{
int i;
if (num <= 0)
return;
mutex_lock(&kprobe_mutex);
for (i = 0; i < num; i++) {
if (__unregister_kprobe_top(&rps[i]->kp) < 0)
rps[i]->kp.addr = NULL;
rps[i]->rph->rp = NULL;
}
mutex_unlock(&kprobe_mutex);
synchronize_rcu();
for (i = 0; i < num; i++) {
if (rps[i]->kp.addr) {
__unregister_kprobe_bottom(&rps[i]->kp);
free_rp_inst(rps[i]);
}
}
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);
#else /* CONFIG_KRETPROBES */
int register_kretprobe(struct kretprobe *rp)
{
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
int register_kretprobes(struct kretprobe **rps, int num)
{
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobes);
void unregister_kretprobe(struct kretprobe *rp)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);
void unregister_kretprobes(struct kretprobe **rps, int num)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
#endif /* CONFIG_KRETPROBES */
/* Set the kprobe gone and remove its instruction buffer. */
static void kill_kprobe(struct kprobe *p)
{
struct kprobe *kp;
lockdep_assert_held(&kprobe_mutex);
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
* If this is an aggr_kprobe, we have to list all the
* chained probes and mark them GONE.
*/
list_for_each_entry(kp, &p->list, list)
kp->flags |= KPROBE_FLAG_GONE;
p->post_handler = NULL;
kill_optimized_kprobe(p);
}
/*
* Here, we can remove insn_slot safely, because no thread calls
* the original probed function (which will be freed soon) any more.
*/
arch_remove_kprobe(p);
/*
* The module is going away. We should disarm the kprobe which
* is using ftrace, because ftrace framework is still available at
* MODULE_STATE_GOING notification.
*/
if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
disarm_kprobe_ftrace(p);
}
/* Disable one kprobe */
int disable_kprobe(struct kprobe *kp)
{
int ret = 0;
struct kprobe *p;
mutex_lock(&kprobe_mutex);
/* Disable this kprobe */
p = __disable_kprobe(kp);
if (IS_ERR(p))
ret = PTR_ERR(p);
mutex_unlock(&kprobe_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(disable_kprobe);
/* Enable one kprobe */
int enable_kprobe(struct kprobe *kp)
{
int ret = 0;
struct kprobe *p;
mutex_lock(&kprobe_mutex);
/* Check whether specified probe is valid. */
p = __get_valid_kprobe(kp);
if (unlikely(p == NULL)) {
ret = -EINVAL;
goto out;
}
if (kprobe_gone(kp)) {
/* This kprobe has gone, we couldn't enable it. */
ret = -EINVAL;
goto out;
}
if (p != kp)
kp->flags &= ~KPROBE_FLAG_DISABLED;
if (!kprobes_all_disarmed && kprobe_disabled(p)) {
p->flags &= ~KPROBE_FLAG_DISABLED;
ret = arm_kprobe(p);
if (ret)
p->flags |= KPROBE_FLAG_DISABLED;
}
out:
mutex_unlock(&kprobe_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(enable_kprobe);
/* Caller must NOT call this in usual path. This is only for critical case */
void dump_kprobe(struct kprobe *kp)
{
pr_err("Dumping kprobe:\n");
pr_err("Name: %s\nOffset: %x\nAddress: %pS\n",
kp->symbol_name, kp->offset, kp->addr);
}
NOKPROBE_SYMBOL(dump_kprobe);
int kprobe_add_ksym_blacklist(unsigned long entry)
{
struct kprobe_blacklist_entry *ent;
unsigned long offset = 0, size = 0;
if (!kernel_text_address(entry) ||
!kallsyms_lookup_size_offset(entry, &size, &offset))
return -EINVAL;
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
return -ENOMEM;
ent->start_addr = entry;
ent->end_addr = entry + size;
INIT_LIST_HEAD(&ent->list);
list_add_tail(&ent->list, &kprobe_blacklist);
return (int)size;
}
/* Add all symbols in given area into kprobe blacklist */
int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
{
unsigned long entry;
int ret = 0;
for (entry = start; entry < end; entry += ret) {
ret = kprobe_add_ksym_blacklist(entry);
if (ret < 0)
return ret;
if (ret == 0) /* In case of alias symbol */
ret = 1;
}
return 0;
}
/* Remove all symbols in given area from kprobe blacklist */
static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
{
struct kprobe_blacklist_entry *ent, *n;
list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) {
if (ent->start_addr < start || ent->start_addr >= end)
continue;
list_del(&ent->list);
kfree(ent);
}
}
static void kprobe_remove_ksym_blacklist(unsigned long entry)
{
kprobe_remove_area_blacklist(entry, entry + 1);
}
int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
char *type, char *sym)
{
return -ERANGE;
}
int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
char *sym)
{
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym))
return 0;
#ifdef CONFIG_OPTPROBES
if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym))
return 0;
#endif
#endif
if (!arch_kprobe_get_kallsym(&symnum, value, type, sym))
return 0;
return -ERANGE;
}
int __init __weak arch_populate_kprobe_blacklist(void)
{
return 0;
}
/*
* Lookup and populate the kprobe_blacklist.
*
* Unlike the kretprobe blacklist, we'll need to determine
* the range of addresses that belong to the said functions,
* since a kprobe need not necessarily be at the beginning
* of a function.
*/
static int __init populate_kprobe_blacklist(unsigned long *start,
unsigned long *end)
{
unsigned long entry;
unsigned long *iter;
int ret;
for (iter = start; iter < end; iter++) {
entry = arch_deref_entry_point((void *)*iter);
ret = kprobe_add_ksym_blacklist(entry);
if (ret == -EINVAL)
continue;
if (ret < 0)
return ret;
}
/* Symbols in __kprobes_text are blacklisted */
ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start,
(unsigned long)__kprobes_text_end);
if (ret)
return ret;
/* Symbols in noinstr section are blacklisted */
ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start,
(unsigned long)__noinstr_text_end);
return ret ? : arch_populate_kprobe_blacklist();
}
static void add_module_kprobe_blacklist(struct module *mod)
{
unsigned long start, end;
int i;
if (mod->kprobe_blacklist) {
for (i = 0; i < mod->num_kprobe_blacklist; i++)
kprobe_add_ksym_blacklist(mod->kprobe_blacklist[i]);
}
start = (unsigned long)mod->kprobes_text_start;
if (start) {
end = start + mod->kprobes_text_size;
kprobe_add_area_blacklist(start, end);
}
start = (unsigned long)mod->noinstr_text_start;
if (start) {
end = start + mod->noinstr_text_size;
kprobe_add_area_blacklist(start, end);
}
}
static void remove_module_kprobe_blacklist(struct module *mod)
{
unsigned long start, end;
int i;
if (mod->kprobe_blacklist) {
for (i = 0; i < mod->num_kprobe_blacklist; i++)
kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[i]);
}
start = (unsigned long)mod->kprobes_text_start;
if (start) {
end = start + mod->kprobes_text_size;
kprobe_remove_area_blacklist(start, end);
}
start = (unsigned long)mod->noinstr_text_start;
if (start) {
end = start + mod->noinstr_text_size;
kprobe_remove_area_blacklist(start, end);
}
}
/* Module notifier call back, checking kprobes on the module */
static int kprobes_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
{
struct module *mod = data;
struct hlist_head *head;
struct kprobe *p;
unsigned int i;
int checkcore = (val == MODULE_STATE_GOING);
if (val == MODULE_STATE_COMING) {
mutex_lock(&kprobe_mutex);
add_module_kprobe_blacklist(mod);
mutex_unlock(&kprobe_mutex);
}
if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
return NOTIFY_DONE;
/*
* When MODULE_STATE_GOING was notified, both of module .text and
* .init.text sections would be freed. When MODULE_STATE_LIVE was
* notified, only .init.text section would be freed. We need to
* disable kprobes which have been inserted in the sections.
*/
mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry(p, head, hlist)
if (within_module_init((unsigned long)p->addr, mod) ||
(checkcore &&
within_module_core((unsigned long)p->addr, mod))) {
/*
* The vaddr this probe is installed will soon
* be vfreed buy not synced to disk. Hence,
* disarming the breakpoint isn't needed.
*
* Note, this will also move any optimized probes
* that are pending to be removed from their
* corresponding lists to the freeing_list and
* will not be touched by the delayed
* kprobe_optimizer work handler.
*/
kill_kprobe(p);
}
}
if (val == MODULE_STATE_GOING)
remove_module_kprobe_blacklist(mod);
mutex_unlock(&kprobe_mutex);
return NOTIFY_DONE;
}
static struct notifier_block kprobe_module_nb = {
.notifier_call = kprobes_module_callback,
.priority = 0
};
/* Markers of _kprobe_blacklist section */
extern unsigned long __start_kprobe_blacklist[];
extern unsigned long __stop_kprobe_blacklist[];
void kprobe_free_init_mem(void)
{
void *start = (void *)(&__init_begin);
void *end = (void *)(&__init_end);
struct hlist_head *head;
struct kprobe *p;
int i;
mutex_lock(&kprobe_mutex);
/* Kill all kprobes on initmem */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry(p, head, hlist) {
if (start <= (void *)p->addr && (void *)p->addr < end)
kill_kprobe(p);
}
}
mutex_unlock(&kprobe_mutex);
}
static int __init init_kprobes(void)
{
int i, err = 0;
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
for (i = 0; i < KPROBE_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&kprobe_table[i]);
err = populate_kprobe_blacklist(__start_kprobe_blacklist,
__stop_kprobe_blacklist);
if (err) {
pr_err("kprobes: failed to populate blacklist: %d\n", err);
pr_err("Please take care of using kprobes.\n");
}
if (kretprobe_blacklist_size) {
/* lookup the function address from its name */
for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
kretprobe_blacklist[i].addr =
kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
if (!kretprobe_blacklist[i].addr)
printk("kretprobe: lookup failed: %s\n",
kretprobe_blacklist[i].name);
}
}
/* By default, kprobes are armed */
kprobes_all_disarmed = false;
#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
/* Init kprobe_optinsn_slots for allocation */
kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
#endif
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
if (!err)
err = register_module_notifier(&kprobe_module_nb);
kprobes_initialized = (err == 0);
if (!err)
init_test_probes();
return err;
}
early_initcall(init_kprobes);
#if defined(CONFIG_OPTPROBES)
static int __init init_optprobes(void)
{
/*
* Enable kprobe optimization - this kicks the optimizer which
* depends on synchronize_rcu_tasks() and ksoftirqd, that is
* not spawned in early initcall. So delay the optimization.
*/
optimize_all_kprobes();
return 0;
}
subsys_initcall(init_optprobes);
#endif
#ifdef CONFIG_DEBUG_FS
static void report_probe(struct seq_file *pi, struct kprobe *p,
const char *sym, int offset, char *modname, struct kprobe *pp)
{
char *kprobe_type;
void *addr = p->addr;
if (p->pre_handler == pre_handler_kretprobe)
kprobe_type = "r";
else
kprobe_type = "k";
if (!kallsyms_show_value(pi->file->f_cred))
addr = NULL;
if (sym)
seq_printf(pi, "%px %s %s+0x%x %s ",
addr, kprobe_type, sym, offset,
(modname ? modname : " "));
else /* try to use %pS */
seq_printf(pi, "%px %s %pS ",
addr, kprobe_type, p->addr);
if (!pp)
pp = p;
seq_printf(pi, "%s%s%s%s\n",
(kprobe_gone(p) ? "[GONE]" : ""),
((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
(kprobe_optimized(pp) ? "[OPTIMIZED]" : ""),
(kprobe_ftrace(pp) ? "[FTRACE]" : ""));
}
static void *kprobe_seq_start(struct seq_file *f, loff_t *pos)
{
return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
}
static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
if (*pos >= KPROBE_TABLE_SIZE)
return NULL;
return pos;
}
static void kprobe_seq_stop(struct seq_file *f, void *v)
{
/* Nothing to do */
}
static int show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
struct kprobe *p, *kp;
const char *sym = NULL;
unsigned int i = *(loff_t *) v;
unsigned long offset = 0;
char *modname, namebuf[KSYM_NAME_LEN];
head = &kprobe_table[i];
preempt_disable();
hlist_for_each_entry_rcu(p, head, hlist) {
sym = kallsyms_lookup((unsigned long)p->addr, NULL,
&offset, &modname, namebuf);
if (kprobe_aggrprobe(p)) {
list_for_each_entry_rcu(kp, &p->list, list)
report_probe(pi, kp, sym, offset, modname, p);
} else
report_probe(pi, p, sym, offset, modname, NULL);
}
preempt_enable();
return 0;
}
static const struct seq_operations kprobes_sops = {
.start = kprobe_seq_start,
.next = kprobe_seq_next,
.stop = kprobe_seq_stop,
.show = show_kprobe_addr
};
DEFINE_SEQ_ATTRIBUTE(kprobes);
/* kprobes/blacklist -- shows which functions can not be probed */
static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&kprobe_mutex);
return seq_list_start(&kprobe_blacklist, *pos);
}
static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
return seq_list_next(v, &kprobe_blacklist, pos);
}
static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
{
struct kprobe_blacklist_entry *ent =
list_entry(v, struct kprobe_blacklist_entry, list);
/*
* If /proc/kallsyms is not showing kernel address, we won't
* show them here either.
*/
if (!kallsyms_show_value(m->file->f_cred))
seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL,
(void *)ent->start_addr);
else
seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr,
(void *)ent->end_addr, (void *)ent->start_addr);
return 0;
}
static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v)
{
mutex_unlock(&kprobe_mutex);
}
static const struct seq_operations kprobe_blacklist_sops = {
.start = kprobe_blacklist_seq_start,
.next = kprobe_blacklist_seq_next,
.stop = kprobe_blacklist_seq_stop,
.show = kprobe_blacklist_seq_show,
};
DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist);
static int arm_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
unsigned int i, total = 0, errors = 0;
int err, ret = 0;
mutex_lock(&kprobe_mutex);
/* If kprobes are armed, just return */
if (!kprobes_all_disarmed)
goto already_enabled;
/*
* optimize_kprobe() called by arm_kprobe() checks
* kprobes_all_disarmed, so set kprobes_all_disarmed before
* arm_kprobe.
*/
kprobes_all_disarmed = false;
/* Arming kprobes doesn't optimize kprobe itself */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
/* Arm all kprobes on a best-effort basis */
hlist_for_each_entry(p, head, hlist) {
if (!kprobe_disabled(p)) {
err = arm_kprobe(p);
if (err) {
errors++;
ret = err;
}
total++;
}
}
}
if (errors)
pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n",
errors, total);
else
pr_info("Kprobes globally enabled\n");
already_enabled:
mutex_unlock(&kprobe_mutex);
return ret;
}
static int disarm_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
unsigned int i, total = 0, errors = 0;
int err, ret = 0;
mutex_lock(&kprobe_mutex);
/* If kprobes are already disarmed, just return */
if (kprobes_all_disarmed) {
mutex_unlock(&kprobe_mutex);
return 0;
}
kprobes_all_disarmed = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
/* Disarm all kprobes on a best-effort basis */
hlist_for_each_entry(p, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) {
err = disarm_kprobe(p, false);
if (err) {
errors++;
ret = err;
}
total++;
}
}
}
if (errors)
pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n",
errors, total);
else
pr_info("Kprobes globally disabled\n");
mutex_unlock(&kprobe_mutex);
/* Wait for disarming all kprobes by optimizer */
wait_for_kprobe_optimizer();
return ret;
}
/*
* XXX: The debugfs bool file interface doesn't allow for callbacks
* when the bool state is switched. We can reuse that facility when
* available
*/
static ssize_t read_enabled_file_bool(struct file *file,
char __user *user_buf, size_t count, loff_t *ppos)
{
char buf[3];
if (!kprobes_all_disarmed)
buf[0] = '1';
else
buf[0] = '0';
buf[1] = '\n';
buf[2] = 0x00;
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}
static ssize_t write_enabled_file_bool(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
char buf[32];
size_t buf_size;
int ret = 0;
buf_size = min(count, (sizeof(buf)-1));
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
buf[buf_size] = '\0';
switch (buf[0]) {
case 'y':
case 'Y':
case '1':
ret = arm_all_kprobes();
break;
case 'n':
case 'N':
case '0':
ret = disarm_all_kprobes();
break;
default:
return -EINVAL;
}
if (ret)
return ret;
return count;
}
static const struct file_operations fops_kp = {
.read = read_enabled_file_bool,
.write = write_enabled_file_bool,
.llseek = default_llseek,
};
static int __init debugfs_kprobe_init(void)
{
struct dentry *dir;
dir = debugfs_create_dir("kprobes", NULL);
debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops);
debugfs_create_file("enabled", 0600, dir, NULL, &fops_kp);
debugfs_create_file("blacklist", 0400, dir, NULL,
&kprobe_blacklist_fops);
return 0;
}
late_initcall(debugfs_kprobe_init);
#endif /* CONFIG_DEBUG_FS */
// SPDX-License-Identifier: GPL-2.0+
/*
* Universal/legacy driver for 8250/16550-type serial ports
*
* Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
*
* Copyright (C) 2001 Russell King.
*
* Supports: ISA-compatible 8250/16550 ports
* PNP 8250/16550 ports
* early_serial_setup() ports
* userspace-configurable "phantom" ports
* "serial8250" platform devices
* serial8250_register_8250_port() ports
*/
#include <linux/acpi.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/sysrq.h>
#include <linux/delay.h>
#include <linux/platform_device.h>
#include <linux/tty.h>
#include <linux/ratelimit.h>
#include <linux/tty_flip.h>
#include <linux/serial.h>
#include <linux/serial_8250.h>
#include <linux/nmi.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
#include <linux/io.h>
#ifdef CONFIG_SPARC
#include <linux/sunserialcore.h>
#endif
#include <asm/irq.h>
#include "8250.h"
/*
* Configuration:
* share_irqs - whether we pass IRQF_SHARED to request_irq(). This option
* is unsafe when used on edge-triggered interrupts.
*/
static unsigned int share_irqs = SERIAL8250_SHARE_IRQS;
static unsigned int nr_uarts = CONFIG_SERIAL_8250_RUNTIME_UARTS;
static struct uart_driver serial8250_reg;
static unsigned int skip_txen_test; /* force skip of txen test at init time */
#define PASS_LIMIT 512
#include <asm/serial.h>
/*
* SERIAL_PORT_DFNS tells us about built-in ports that have no
* standard enumeration mechanism. Platforms that can find all
* serial ports via mechanisms like ACPI or PCI need not supply it.
*/
#ifndef SERIAL_PORT_DFNS
#define SERIAL_PORT_DFNS
#endif
static const struct old_serial_port old_serial_port[] = {
SERIAL_PORT_DFNS /* defined in asm/serial.h */
};
#define UART_NR CONFIG_SERIAL_8250_NR_UARTS
#ifdef CONFIG_SERIAL_8250_RSA
#define PORT_RSA_MAX 4
static unsigned long probe_rsa[PORT_RSA_MAX];
static unsigned int probe_rsa_count;
#endif /* CONFIG_SERIAL_8250_RSA */
struct irq_info {
struct hlist_node node;
int irq;
spinlock_t lock; /* Protects list not the hash */
struct list_head *head;
};
#define NR_IRQ_HASH 32 /* Can be adjusted later */
static struct hlist_head irq_lists[NR_IRQ_HASH];
static DEFINE_MUTEX(hash_mutex); /* Used to walk the hash */
/*
* This is the serial driver's interrupt routine.
*
* Arjan thinks the old way was overly complex, so it got simplified.
* Alan disagrees, saying that need the complexity to handle the weird
* nature of ISA shared interrupts. (This is a special exception.)
*
* In order to handle ISA shared interrupts properly, we need to check
* that all ports have been serviced, and therefore the ISA interrupt
* line has been de-asserted.
*
* This means we need to loop through all ports. checking that they
* don't have an interrupt pending.
*/
static irqreturn_t serial8250_interrupt(int irq, void *dev_id)
{
struct irq_info *i = dev_id;
struct list_head *l, *end = NULL;
int pass_counter = 0, handled = 0;
pr_debug("%s(%d): start\n", __func__, irq);
spin_lock(&i->lock);
l = i->head;
do {
struct uart_8250_port *up;
struct uart_port *port;
up = list_entry(l, struct uart_8250_port, list);
port = &up->port;
if (port->handle_irq(port)) {
handled = 1;
end = NULL;
} else if (end == NULL)
end = l;
l = l->next;
if (l == i->head && pass_counter++ > PASS_LIMIT)
break;
} while (l != end);
spin_unlock(&i->lock);
pr_debug("%s(%d): end\n", __func__, irq);
return IRQ_RETVAL(handled);
}
/*
* To support ISA shared interrupts, we need to have one interrupt
* handler that ensures that the IRQ line has been deasserted
* before returning. Failing to do this will result in the IRQ
* line being stuck active, and, since ISA irqs are edge triggered,
* no more IRQs will be seen.
*/
static void serial_do_unlink(struct irq_info *i, struct uart_8250_port *up)
{
spin_lock_irq(&i->lock);
if (!list_empty(i->head)) {
if (i->head == &up->list)
i->head = i->head->next;
list_del(&up->list);
} else {
BUG_ON(i->head != &up->list);
i->head = NULL;
}
spin_unlock_irq(&i->lock);
/* List empty so throw away the hash node */
if (i->head == NULL) {
hlist_del(&i->node);
kfree(i);
}
}
static int serial_link_irq_chain(struct uart_8250_port *up)
{
struct hlist_head *h;
struct irq_info *i;
int ret;
mutex_lock(&hash_mutex);
h = &irq_lists[up->port.irq % NR_IRQ_HASH];
hlist_for_each_entry(i, h, node)
if (i->irq == up->port.irq)
break;
if (i == NULL) {
i = kzalloc(sizeof(struct irq_info), GFP_KERNEL);
if (i == NULL) {
mutex_unlock(&hash_mutex);
return -ENOMEM;
}
spin_lock_init(&i->lock);
i->irq = up->port.irq;
hlist_add_head(&i->node, h);
}
mutex_unlock(&hash_mutex);
spin_lock_irq(&i->lock);
if (i->head) {
list_add(&up->list, i->head);
spin_unlock_irq(&i->lock);
ret = 0;
} else {
INIT_LIST_HEAD(&up->list);
i->head = &up->list;
spin_unlock_irq(&i->lock);
ret = request_irq(up->port.irq, serial8250_interrupt,
up->port.irqflags, up->port.name, i);
if (ret < 0)
serial_do_unlink(i, up);
}
return ret;
}
static void serial_unlink_irq_chain(struct uart_8250_port *up)
{
struct irq_info *i;
struct hlist_head *h;
mutex_lock(&hash_mutex);
h = &irq_lists[up->port.irq % NR_IRQ_HASH];
hlist_for_each_entry(i, h, node)
if (i->irq == up->port.irq)
break;
BUG_ON(i == NULL);
BUG_ON(i->head == NULL);
if (list_empty(i->head))
free_irq(up->port.irq, i);
serial_do_unlink(i, up);
mutex_unlock(&hash_mutex);
}
/*
* This function is used to handle ports that do not have an
* interrupt. This doesn't work very well for 16450's, but gives
* barely passable results for a 16550A. (Although at the expense
* of much CPU overhead).
*/
static void serial8250_timeout(struct timer_list *t)
{
struct uart_8250_port *up = from_timer(up, t, timer);
up->port.handle_irq(&up->port);
mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port));
}
static void serial8250_backup_timeout(struct timer_list *t)
{
struct uart_8250_port *up = from_timer(up, t, timer);
unsigned int iir, ier = 0, lsr;
unsigned long flags;
spin_lock_irqsave(&up->port.lock, flags);
/*
* Must disable interrupts or else we risk racing with the interrupt
* based handler.
*/
if (up->port.irq) {
ier = serial_in(up, UART_IER);
serial_out(up, UART_IER, 0);
}
iir = serial_in(up, UART_IIR);
/*
* This should be a safe test for anyone who doesn't trust the
* IIR bits on their UART, but it's specifically designed for
* the "Diva" UART used on the management processor on many HP
* ia64 and parisc boxes.
*/
lsr = serial_in(up, UART_LSR);
up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS;
if ((iir & UART_IIR_NO_INT) && (up->ier & UART_IER_THRI) &&
(!uart_circ_empty(&up->port.state->xmit) || up->port.x_char) &&
(lsr & UART_LSR_THRE)) {
iir &= ~(UART_IIR_ID | UART_IIR_NO_INT);
iir |= UART_IIR_THRI;
}
if (!(iir & UART_IIR_NO_INT))
serial8250_tx_chars(up);
if (up->port.irq)
serial_out(up, UART_IER, ier);
spin_unlock_irqrestore(&up->port.lock, flags);
/* Standard timer interval plus 0.2s to keep the port running */
mod_timer(&up->timer,
jiffies + uart_poll_timeout(&up->port) + HZ / 5);
}
static int univ8250_setup_irq(struct uart_8250_port *up)
{
struct uart_port *port = &up->port;
int retval = 0;
/*
* The above check will only give an accurate result the first time
* the port is opened so this value needs to be preserved.
*/
if (up->bugs & UART_BUG_THRE) {
pr_debug("%s - using backup timer\n", port->name);
up->timer.function = serial8250_backup_timeout;
mod_timer(&up->timer, jiffies +
uart_poll_timeout(port) + HZ / 5);
}
/*
* If the "interrupt" for this port doesn't correspond with any
* hardware interrupt, we use a timer-based system. The original
* driver used to do this with IRQ0.
*/
if (!port->irq)
mod_timer(&up->timer, jiffies + uart_poll_timeout(port));
else
retval = serial_link_irq_chain(up);
return retval;
}
static void univ8250_release_irq(struct uart_8250_port *up)
{
struct uart_port *port = &up->port;
del_timer_sync(&up->timer);
up->timer.function = serial8250_timeout;
if (port->irq)
serial_unlink_irq_chain(up);
}
#ifdef CONFIG_SERIAL_8250_RSA
static int serial8250_request_rsa_resource(struct uart_8250_port *up)
{
unsigned long start = UART_RSA_BASE << up->port.regshift;
unsigned int size = 8 << up->port.regshift;
struct uart_port *port = &up->port;
int ret = -EINVAL;
switch (port->iotype) {
case UPIO_HUB6:
case UPIO_PORT:
start += port->iobase;
if (request_region(start, size, "serial-rsa"))
ret = 0;
else
ret = -EBUSY;
break;
}
return ret;
}
static void serial8250_release_rsa_resource(struct uart_8250_port *up)
{
unsigned long offset = UART_RSA_BASE << up->port.regshift;
unsigned int size = 8 << up->port.regshift;
struct uart_port *port = &up->port;
switch (port->iotype) {
case UPIO_HUB6:
case UPIO_PORT:
release_region(port->iobase + offset, size);
break;
}
}
#endif
static const struct uart_ops *base_ops;
static struct uart_ops univ8250_port_ops;
static const struct uart_8250_ops univ8250_driver_ops = {
.setup_irq = univ8250_setup_irq,
.release_irq = univ8250_release_irq,
};
static struct uart_8250_port serial8250_ports[UART_NR];
/**
* serial8250_get_port - retrieve struct uart_8250_port
* @line: serial line number
*
* This function retrieves struct uart_8250_port for the specific line.
* This struct *must* *not* be used to perform a 8250 or serial core operation
* which is not accessible otherwise. Its only purpose is to make the struct
* accessible to the runtime-pm callbacks for context suspend/restore.
* The lock assumption made here is none because runtime-pm suspend/resume
* callbacks should not be invoked if there is any operation performed on the
* port.
*/
struct uart_8250_port *serial8250_get_port(int line)
{
return &serial8250_ports[line];
}
EXPORT_SYMBOL_GPL(serial8250_get_port);
static void (*serial8250_isa_config)(int port, struct uart_port *up,
u32 *capabilities);
void serial8250_set_isa_configurator(
void (*v)(int port, struct uart_port *up, u32 *capabilities))
{
serial8250_isa_config = v;
}
EXPORT_SYMBOL(serial8250_set_isa_configurator);
#ifdef CONFIG_SERIAL_8250_RSA
static void univ8250_config_port(struct uart_port *port, int flags)
{
struct uart_8250_port *up = up_to_u8250p(port);
up->probe &= ~UART_PROBE_RSA;
if (port->type == PORT_RSA) {
if (serial8250_request_rsa_resource(up) == 0)
up->probe |= UART_PROBE_RSA;
} else if (flags & UART_CONFIG_TYPE) {
int i;
for (i = 0; i < probe_rsa_count; i++) {
if (probe_rsa[i] == up->port.iobase) {
if (serial8250_request_rsa_resource(up) == 0)
up->probe |= UART_PROBE_RSA;
break;
}
}
}
base_ops->config_port(port, flags);
if (port->type != PORT_RSA && up->probe & UART_PROBE_RSA)
serial8250_release_rsa_resource(up);
}
static int univ8250_request_port(struct uart_port *port)
{
struct uart_8250_port *up = up_to_u8250p(port);
int ret;
ret = base_ops->request_port(port);
if (ret == 0 && port->type == PORT_RSA) {
ret = serial8250_request_rsa_resource(up);
if (ret < 0)
base_ops->release_port(port);
}
return ret;
}
static void univ8250_release_port(struct uart_port *port)
{
struct uart_8250_port *up = up_to_u8250p(port);
if (port->type == PORT_RSA)
serial8250_release_rsa_resource(up);
base_ops->release_port(port);
}
static void univ8250_rsa_support(struct uart_ops *ops)
{
ops->config_port = univ8250_config_port;
ops->request_port = univ8250_request_port;
ops->release_port = univ8250_release_port;
}
#else
#define univ8250_rsa_support(x) do { } while (0)
#endif /* CONFIG_SERIAL_8250_RSA */
static inline void serial8250_apply_quirks(struct uart_8250_port *up)
{
up->port.quirks |= skip_txen_test ? UPQ_NO_TXEN_TEST : 0;
}
static void __init serial8250_isa_init_ports(void)
{
struct uart_8250_port *up;
static int first = 1;
int i, irqflag = 0;
if (!first)
return;
first = 0;
if (nr_uarts > UART_NR)
nr_uarts = UART_NR;
for (i = 0; i < nr_uarts; i++) {
struct uart_8250_port *up = &serial8250_ports[i];
struct uart_port *port = &up->port;
port->line = i;
serial8250_init_port(up);
if (!base_ops)
base_ops = port->ops;
port->ops = &univ8250_port_ops;
timer_setup(&up->timer, serial8250_timeout, 0);
up->ops = &univ8250_driver_ops;
/*
* ALPHA_KLUDGE_MCR needs to be killed.
*/
up->mcr_mask = ~ALPHA_KLUDGE_MCR;
up->mcr_force = ALPHA_KLUDGE_MCR;
serial8250_set_defaults(up);
}
/* chain base port ops to support Remote Supervisor Adapter */
univ8250_port_ops = *base_ops;
univ8250_rsa_support(&univ8250_port_ops);
if (share_irqs)
irqflag = IRQF_SHARED;
for (i = 0, up = serial8250_ports;
i < ARRAY_SIZE(old_serial_port) && i < nr_uarts;
i++, up++) {
struct uart_port *port = &up->port;
port->iobase = old_serial_port[i].port;
port->irq = irq_canonicalize(old_serial_port[i].irq);
port->irqflags = 0;
port->uartclk = old_serial_port[i].baud_base * 16;
port->flags = old_serial_port[i].flags;
port->hub6 = 0;
port->membase = old_serial_port[i].iomem_base;
port->iotype = old_serial_port[i].io_type;
port->regshift = old_serial_port[i].iomem_reg_shift;
port->irqflags |= irqflag;
if (serial8250_isa_config != NULL)
serial8250_isa_config(i, &up->port, &up->capabilities);
}
}
static void __init
serial8250_register_ports(struct uart_driver *drv, struct device *dev)
{
int i;
for (i = 0; i < nr_uarts; i++) {
struct uart_8250_port *up = &serial8250_ports[i];
if (up->port.type == PORT_8250_CIR)
continue;
if (up->port.dev)
continue;
up->port.dev = dev;
serial8250_apply_quirks(up);
uart_add_one_port(drv, &up->port);
}
}
#ifdef CONFIG_SERIAL_8250_CONSOLE
static void univ8250_console_write(struct console *co, const char *s,
unsigned int count)
{
struct uart_8250_port *up = &serial8250_ports[co->index];
serial8250_console_write(up, s, count);
}
static int univ8250_console_setup(struct console *co, char *options)
{
struct uart_port *port;
int retval;
/*
* Check whether an invalid uart number has been specified, and
* if so, search for the first available port that does have
* console support.
*/
if (co->index >= nr_uarts)
co->index = 0;
port = &serial8250_ports[co->index].port;
/* link port to console */
port->cons = co;
retval = serial8250_console_setup(port, options, false);
if (retval != 0)
port->cons = NULL;
return retval;
}
static int univ8250_console_exit(struct console *co)
{
struct uart_port *port;
port = &serial8250_ports[co->index].port;
return serial8250_console_exit(port);
}
/**
* univ8250_console_match - non-standard console matching
* @co: registering console
* @name: name from console command line
* @idx: index from console command line
* @options: ptr to option string from console command line
*
* Only attempts to match console command lines of the form:
* console=uart[8250],io|mmio|mmio16|mmio32,<addr>[,<options>]
* console=uart[8250],0x<addr>[,<options>]
* This form is used to register an initial earlycon boot console and
* replace it with the serial8250_console at 8250 driver init.
*
* Performs console setup for a match (as required by interface)
* If no <options> are specified, then assume the h/w is already setup.
*
* Returns 0 if console matches; otherwise non-zero to use default matching
*/
static int univ8250_console_match(struct console *co, char *name, int idx,
char *options)
{
char match[] = "uart"; /* 8250-specific earlycon name */
unsigned char iotype;
resource_size_t addr;
int i;
if (strncmp(name, match, 4) != 0)
return -ENODEV;
if (uart_parse_earlycon(options, &iotype, &addr, &options))
return -ENODEV;
/* try to match the port specified on the command line */
for (i = 0; i < nr_uarts; i++) {
struct uart_port *port = &serial8250_ports[i].port;
if (port->iotype != iotype)
continue;
if ((iotype == UPIO_MEM || iotype == UPIO_MEM16 ||
iotype == UPIO_MEM32 || iotype == UPIO_MEM32BE)
&& (port->mapbase != addr))
continue;
if (iotype == UPIO_PORT && port->iobase != addr)
continue;
co->index = i;
port->cons = co;
return serial8250_console_setup(port, options, true);
}
return -ENODEV;
}
static struct console univ8250_console = {
.name = "ttyS",
.write = univ8250_console_write,
.device = uart_console_device,
.setup = univ8250_console_setup,
.exit = univ8250_console_exit,
.match = univ8250_console_match,
.flags = CON_PRINTBUFFER | CON_ANYTIME,
.index = -1,
.data = &serial8250_reg,
};
static int __init univ8250_console_init(void)
{
if (nr_uarts == 0)
return -ENODEV;
serial8250_isa_init_ports();
register_console(&univ8250_console);
return 0;
}
console_initcall(univ8250_console_init);
#define SERIAL8250_CONSOLE (&univ8250_console)
#else
#define SERIAL8250_CONSOLE NULL
#endif
static struct uart_driver serial8250_reg = {
.owner = THIS_MODULE,
.driver_name = "serial",
.dev_name = "ttyS",
.major = TTY_MAJOR,
.minor = 64,
.cons = SERIAL8250_CONSOLE,
};
/*
* early_serial_setup - early registration for 8250 ports
*
* Setup an 8250 port structure prior to console initialisation. Use
* after console initialisation will cause undefined behaviour.
*/
int __init early_serial_setup(struct uart_port *port)
{
struct uart_port *p;
if (port->line >= ARRAY_SIZE(serial8250_ports) || nr_uarts == 0)
return -ENODEV;
serial8250_isa_init_ports();
p = &serial8250_ports[port->line].port;
p->iobase = port->iobase;
p->membase = port->membase;
p->irq = port->irq;
p->irqflags = port->irqflags;
p->uartclk = port->uartclk;
p->fifosize = port->fifosize;
p->regshift = port->regshift;
p->iotype = port->iotype;
p->flags = port->flags;
p->mapbase = port->mapbase;
p->mapsize = port->mapsize;
p->private_data = port->private_data;
p->type = port->type;
p->line = port->line;
serial8250_set_defaults(up_to_u8250p(p));
if (port->serial_in)
p->serial_in = port->serial_in;
if (port->serial_out)
p->serial_out = port->serial_out;
if (port->handle_irq)
p->handle_irq = port->handle_irq;
return 0;
}
/**
* serial8250_suspend_port - suspend one serial port
* @line: serial line number
*
* Suspend one serial port.
*/
void serial8250_suspend_port(int line)
{
struct uart_8250_port *up = &serial8250_ports[line];
struct uart_port *port = &up->port;
if (!console_suspend_enabled && uart_console(port) &&
port->type != PORT_8250) {
unsigned char canary = 0xa5;
serial_out(up, UART_SCR, canary);
if (serial_in(up, UART_SCR) == canary)
up->canary = canary;
}
uart_suspend_port(&serial8250_reg, port);
}
EXPORT_SYMBOL(serial8250_suspend_port);
/**
* serial8250_resume_port - resume one serial port
* @line: serial line number
*
* Resume one serial port.
*/
void serial8250_resume_port(int line)
{
struct uart_8250_port *up = &serial8250_ports[line];
struct uart_port *port = &up->port;
up->canary = 0;
if (up->capabilities & UART_NATSEMI) {
/* Ensure it's still in high speed mode */
serial_port_out(port, UART_LCR, 0xE0);
ns16550a_goto_highspeed(up);
serial_port_out(port, UART_LCR, 0);
port->uartclk = 921600*16;
}
uart_resume_port(&serial8250_reg, port);
}
EXPORT_SYMBOL(serial8250_resume_port);
/*
* Register a set of serial devices attached to a platform device. The
* list is terminated with a zero flags entry, which means we expect
* all entries to have at least UPF_BOOT_AUTOCONF set.
*/
static int serial8250_probe(struct platform_device *dev)
{
struct plat_serial8250_port *p = dev_get_platdata(&dev->dev);
struct uart_8250_port uart;
int ret, i, irqflag = 0;
memset(&uart, 0, sizeof(uart));
if (share_irqs)
irqflag = IRQF_SHARED;
for (i = 0; p && p->flags != 0; p++, i++) {
uart.port.iobase = p->iobase;
uart.port.membase = p->membase;
uart.port.irq = p->irq;
uart.port.irqflags = p->irqflags;
uart.port.uartclk = p->uartclk;
uart.port.regshift = p->regshift;
uart.port.iotype = p->iotype;
uart.port.flags = p->flags;
uart.port.mapbase = p->mapbase;
uart.port.hub6 = p->hub6;
uart.port.has_sysrq = p->has_sysrq;
uart.port.private_data = p->private_data;
uart.port.type = p->type;
uart.port.serial_in = p->serial_in;
uart.port.serial_out = p->serial_out;
uart.port.handle_irq = p->handle_irq;
uart.port.handle_break = p->handle_break;
uart.port.set_termios = p->set_termios;
uart.port.set_ldisc = p->set_ldisc;
uart.port.get_mctrl = p->get_mctrl;
uart.port.pm = p->pm;
uart.port.dev = &dev->dev;
uart.port.irqflags |= irqflag;
ret = serial8250_register_8250_port(&uart);
if (ret < 0) {
dev_err(&dev->dev, "unable to register port at index %d "
"(IO%lx MEM%llx IRQ%d): %d\n", i,
p->iobase, (unsigned long long)p->mapbase,
p->irq, ret);
}
}
return 0;
}
/*
* Remove serial ports registered against a platform device.
*/
static int serial8250_remove(struct platform_device *dev)
{
int i;
for (i = 0; i < nr_uarts; i++) {
struct uart_8250_port *up = &serial8250_ports[i];
if (up->port.dev == &dev->dev)
serial8250_unregister_port(i);
}
return 0;
}
static int serial8250_suspend(struct platform_device *dev, pm_message_t state)
{
int i;
for (i = 0; i < UART_NR; i++) {
struct uart_8250_port *up = &serial8250_ports[i];
if (up->port.type != PORT_UNKNOWN && up->port.dev == &dev->dev)
uart_suspend_port(&serial8250_reg, &up->port);
}
return 0;
}
static int serial8250_resume(struct platform_device *dev)
{
int i;
for (i = 0; i < UART_NR; i++) {
struct uart_8250_port *up = &serial8250_ports[i];
if (up->port.type != PORT_UNKNOWN && up->port.dev == &dev->dev)
serial8250_resume_port(i);
}
return 0;
}
static struct platform_driver serial8250_isa_driver = {
.probe = serial8250_probe,
.remove = serial8250_remove,
.suspend = serial8250_suspend,
.resume = serial8250_resume,
.driver = {
.name = "serial8250",
},
};
/*
* This "device" covers _all_ ISA 8250-compatible serial devices listed
* in the table in include/asm/serial.h
*/
static struct platform_device *serial8250_isa_devs;
/*
* serial8250_register_8250_port and serial8250_unregister_port allows for
* 16x50 serial ports to be configured at run-time, to support PCMCIA
* modems and PCI multiport cards.
*/
static DEFINE_MUTEX(serial_mutex);
static struct uart_8250_port *serial8250_find_match_or_unused(const struct uart_port *port)
{
int i;
/*
* First, find a port entry which matches.
*/
for (i = 0; i < nr_uarts; i++)
if (uart_match_port(&serial8250_ports[i].port, port))
return &serial8250_ports[i];
/* try line number first if still available */
i = port->line;
if (i < nr_uarts && serial8250_ports[i].port.type == PORT_UNKNOWN &&
serial8250_ports[i].port.iobase == 0)
return &serial8250_ports[i];
/*
* We didn't find a matching entry, so look for the first
* free entry. We look for one which hasn't been previously
* used (indicated by zero iobase).
*/
for (i = 0; i < nr_uarts; i++)
if (serial8250_ports[i].port.type == PORT_UNKNOWN &&
serial8250_ports[i].port.iobase == 0)
return &serial8250_ports[i];
/*
* That also failed. Last resort is to find any entry which
* doesn't have a real port associated with it.
*/
for (i = 0; i < nr_uarts; i++)
if (serial8250_ports[i].port.type == PORT_UNKNOWN)
return &serial8250_ports[i];
return NULL;
}
static void serial_8250_overrun_backoff_work(struct work_struct *work)
{
struct uart_8250_port *up =
container_of(to_delayed_work(work), struct uart_8250_port,
overrun_backoff);
struct uart_port *port = &up->port;
unsigned long flags;
spin_lock_irqsave(&port->lock, flags);
up->ier |= UART_IER_RLSI | UART_IER_RDI;
up->port.read_status_mask |= UART_LSR_DR;
serial_out(up, UART_IER, up->ier);
spin_unlock_irqrestore(&port->lock, flags);
}
/**
* serial8250_register_8250_port - register a serial port
* @up: serial port template
*
* Configure the serial port specified by the request. If the
* port exists and is in use, it is hung up and unregistered
* first.
*
* The port is then probed and if necessary the IRQ is autodetected
* If this fails an error is returned.
*
* On success the port is ready to use and the line number is returned.
*/
int serial8250_register_8250_port(const struct uart_8250_port *up)
{
struct uart_8250_port *uart;
int ret = -ENOSPC;
if (up->port.uartclk == 0)
return -EINVAL;
mutex_lock(&serial_mutex);
uart = serial8250_find_match_or_unused(&up->port);
if (uart && uart->port.type != PORT_8250_CIR) {
struct mctrl_gpios *gpios;
if (uart->port.dev)
uart_remove_one_port(&serial8250_reg, &uart->port);
uart->port.iobase = up->port.iobase;
uart->port.membase = up->port.membase;
uart->port.irq = up->port.irq;
uart->port.irqflags = up->port.irqflags;
uart->port.uartclk = up->port.uartclk;
uart->port.fifosize = up->port.fifosize;
uart->port.regshift = up->port.regshift;
uart->port.iotype = up->port.iotype;
uart->port.flags = up->port.flags | UPF_BOOT_AUTOCONF;
uart->bugs = up->bugs;
uart->port.mapbase = up->port.mapbase;
uart->port.mapsize = up->port.mapsize;
uart->port.private_data = up->port.private_data;
uart->tx_loadsz = up->tx_loadsz;
uart->capabilities = up->capabilities;
uart->port.throttle = up->port.throttle;
uart->port.unthrottle = up->port.unthrottle;
uart->port.rs485_config = up->port.rs485_config;
uart->port.rs485 = up->port.rs485;
uart->rs485_start_tx = up->rs485_start_tx;
uart->rs485_stop_tx = up->rs485_stop_tx;
uart->dma = up->dma;
/* Take tx_loadsz from fifosize if it wasn't set separately */
if (uart->port.fifosize && !uart->tx_loadsz)
uart->tx_loadsz = uart->port.fifosize;
if (up->port.dev) {
uart->port.dev = up->port.dev;
ret = uart_get_rs485_mode(&uart->port);
if (ret)
goto err;
}
if (up->port.flags & UPF_FIXED_TYPE)
uart->port.type = up->port.type;
/*
* Only call mctrl_gpio_init(), if the device has no ACPI
* companion device
*/
if (!has_acpi_companion(uart->port.dev)) {
gpios = mctrl_gpio_init(&uart->port, 0);
if (IS_ERR(gpios)) {
ret = PTR_ERR(gpios);
goto err;
} else {
uart->gpios = gpios;
}
}
serial8250_set_defaults(uart);
/* Possibly override default I/O functions. */
if (up->port.serial_in)
uart->port.serial_in = up->port.serial_in;
if (up->port.serial_out)
uart->port.serial_out = up->port.serial_out;
if (up->port.handle_irq)
uart->port.handle_irq = up->port.handle_irq;
/* Possibly override set_termios call */
if (up->port.set_termios)
uart->port.set_termios = up->port.set_termios;
if (up->port.set_ldisc)
uart->port.set_ldisc = up->port.set_ldisc;
if (up->port.get_mctrl)
uart->port.get_mctrl = up->port.get_mctrl;
if (up->port.set_mctrl)
uart->port.set_mctrl = up->port.set_mctrl;
if (up->port.get_divisor)
uart->port.get_divisor = up->port.get_divisor;
if (up->port.set_divisor)
uart->port.set_divisor = up->port.set_divisor;
if (up->port.startup)
uart->port.startup = up->port.startup;
if (up->port.shutdown)
uart->port.shutdown = up->port.shutdown;
if (up->port.pm)
uart->port.pm = up->port.pm;
if (up->port.handle_break)
uart->port.handle_break = up->port.handle_break;
if (up->dl_read)
uart->dl_read = up->dl_read;
if (up->dl_write)
uart->dl_write = up->dl_write;
if (uart->port.type != PORT_8250_CIR) {
if (serial8250_isa_config != NULL)
serial8250_isa_config(0, &uart->port,
&uart->capabilities);
serial8250_apply_quirks(uart);
ret = uart_add_one_port(&serial8250_reg,
&uart->port);
if (ret)
goto err;
ret = uart->port.line;
} else {
dev_info(uart->port.dev,
"skipping CIR port at 0x%lx / 0x%llx, IRQ %d\n",
uart->port.iobase,
(unsigned long long)uart->port.mapbase,
uart->port.irq);
ret = 0;
}
/* Initialise interrupt backoff work if required */
if (up->overrun_backoff_time_ms > 0) {
uart->overrun_backoff_time_ms =
up->overrun_backoff_time_ms;
INIT_DELAYED_WORK(&uart->overrun_backoff,
serial_8250_overrun_backoff_work);
} else {
uart->overrun_backoff_time_ms = 0;
}
}
mutex_unlock(&serial_mutex);
return ret;
err:
uart->port.dev = NULL;
mutex_unlock(&serial_mutex);
return ret;
}
EXPORT_SYMBOL(serial8250_register_8250_port);
/**
* serial8250_unregister_port - remove a 16x50 serial port at runtime
* @line: serial line number
*
* Remove one serial port. This may not be called from interrupt
* context. We hand the port back to the our control.
*/
void serial8250_unregister_port(int line)
{
struct uart_8250_port *uart = &serial8250_ports[line];
mutex_lock(&serial_mutex);
if (uart->em485) {
unsigned long flags;
spin_lock_irqsave(&uart->port.lock, flags);
serial8250_em485_destroy(uart);
spin_unlock_irqrestore(&uart->port.lock, flags);
}
uart_remove_one_port(&serial8250_reg, &uart->port);
if (serial8250_isa_devs) {
uart->port.flags &= ~UPF_BOOT_AUTOCONF;
uart->port.type = PORT_UNKNOWN;
uart->port.dev = &serial8250_isa_devs->dev;
uart->capabilities = 0;
serial8250_apply_quirks(uart);
uart_add_one_port(&serial8250_reg, &uart->port);
} else {
uart->port.dev = NULL;
}
mutex_unlock(&serial_mutex);
}
EXPORT_SYMBOL(serial8250_unregister_port);
static int __init serial8250_init(void)
{
int ret;
if (nr_uarts == 0)
return -ENODEV;
serial8250_isa_init_ports();
pr_info("Serial: 8250/16550 driver, %d ports, IRQ sharing %sabled\n",
nr_uarts, share_irqs ? "en" : "dis");
#ifdef CONFIG_SPARC
ret = sunserial_register_minors(&serial8250_reg, UART_NR);
#else
serial8250_reg.nr = UART_NR;
ret = uart_register_driver(&serial8250_reg);
#endif
if (ret)
goto out;
ret = serial8250_pnp_init();
if (ret)
goto unreg_uart_drv;
serial8250_isa_devs = platform_device_alloc("serial8250",
PLAT8250_DEV_LEGACY);
if (!serial8250_isa_devs) {
ret = -ENOMEM;
goto unreg_pnp;
}
ret = platform_device_add(serial8250_isa_devs);
if (ret)
goto put_dev;
serial8250_register_ports(&serial8250_reg, &serial8250_isa_devs->dev);
ret = platform_driver_register(&serial8250_isa_driver);
if (ret == 0)
goto out;
platform_device_del(serial8250_isa_devs);
put_dev:
platform_device_put(serial8250_isa_devs);
unreg_pnp:
serial8250_pnp_exit();
unreg_uart_drv:
#ifdef CONFIG_SPARC
sunserial_unregister_minors(&serial8250_reg, UART_NR);
#else
uart_unregister_driver(&serial8250_reg);
#endif
out:
return ret;
}
static void __exit serial8250_exit(void)
{
struct platform_device *isa_dev = serial8250_isa_devs;
/*
* This tells serial8250_unregister_port() not to re-register
* the ports (thereby making serial8250_isa_driver permanently
* in use.)
*/
serial8250_isa_devs = NULL;
platform_driver_unregister(&serial8250_isa_driver);
platform_device_unregister(isa_dev);
serial8250_pnp_exit();
#ifdef CONFIG_SPARC
sunserial_unregister_minors(&serial8250_reg, UART_NR);
#else
uart_unregister_driver(&serial8250_reg);
#endif
}
module_init(serial8250_init);
module_exit(serial8250_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Generic 8250/16x50 serial driver");
module_param_hw(share_irqs, uint, other, 0644);
MODULE_PARM_DESC(share_irqs, "Share IRQs with other non-8250/16x50 devices (unsafe)");
module_param(nr_uarts, uint, 0644);
MODULE_PARM_DESC(nr_uarts, "Maximum number of UARTs supported. (1-" __MODULE_STRING(CONFIG_SERIAL_8250_NR_UARTS) ")");
module_param(skip_txen_test, uint, 0644);
MODULE_PARM_DESC(skip_txen_test, "Skip checking for the TXEN bug at init time");
#ifdef CONFIG_SERIAL_8250_RSA
module_param_hw_array(probe_rsa, ulong, ioport, &probe_rsa_count, 0444);
MODULE_PARM_DESC(probe_rsa, "Probe I/O ports for RSA");
#endif
MODULE_ALIAS_CHARDEV_MAJOR(TTY_MAJOR);
#ifdef CONFIG_SERIAL_8250_DEPRECATED_OPTIONS
#ifndef MODULE
/* This module was renamed to 8250_core in 3.7. Keep the old "8250" name
* working as well for the module options so we don't break people. We
* need to keep the names identical and the convenient macros will happily
* refuse to let us do that by failing the build with redefinition errors
* of global variables. So we stick them inside a dummy function to avoid
* those conflicts. The options still get parsed, and the redefined
* MODULE_PARAM_PREFIX lets us keep the "8250." syntax alive.
*
* This is hacky. I'm sorry.
*/
static void __used s8250_options(void)
{
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "8250_core."
module_param_cb(share_irqs, ¶m_ops_uint, &share_irqs, 0644);
module_param_cb(nr_uarts, ¶m_ops_uint, &nr_uarts, 0644);
module_param_cb(skip_txen_test, ¶m_ops_uint, &skip_txen_test, 0644);
#ifdef CONFIG_SERIAL_8250_RSA
__module_param_call(MODULE_PARAM_PREFIX, probe_rsa,
¶m_array_ops, .arr = &__param_arr_probe_rsa,
0444, -1, 0);
#endif
}
#else
MODULE_ALIAS("8250_core");
#endif
#endif
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
*/
#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/srcu.h>
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
/*
* Clear all of the marks on an inode when it is being evicted from core
*/
void __fsnotify_inode_delete(struct inode *inode)
{
fsnotify_clear_marks_by_inode(inode);
}
EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
fsnotify_clear_marks_by_mount(mnt);
}
/**
* fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
* @sb: superblock being unmounted.
*
* Called during unmount with no locks held, so needs to be safe against
* concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
*/
static void fsnotify_unmount_inodes(struct super_block *sb)
{
struct inode *inode, *iput_inode = NULL;
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
/*
* We cannot __iget() an inode in state I_FREEING,
* I_WILL_FREE, or I_NEW which is fine because by that point
* the inode cannot have any associated watches.
*/
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
spin_unlock(&inode->i_lock);
continue;
}
/*
* If i_count is zero, the inode cannot have any watches and
* doing an __iget/iput with SB_ACTIVE clear would actually
* evict all inodes with zero i_count from icache which is
* unnecessarily violent and may in fact be illegal to do.
* However, we should have been called /after/ evict_inodes
* removed all zero refcount inodes, in any case. Test to
* be sure.
*/
if (!atomic_read(&inode->i_count)) {
spin_unlock(&inode->i_lock);
continue;
}
__iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(&sb->s_inode_list_lock);
if (iput_inode)
iput(iput_inode);
/* for each watch, send FS_UNMOUNT and then remove it */
fsnotify_inode(inode, FS_UNMOUNT);
fsnotify_inode_delete(inode);
iput_inode = inode;
cond_resched();
spin_lock(&sb->s_inode_list_lock);
}
spin_unlock(&sb->s_inode_list_lock);
if (iput_inode)
iput(iput_inode);
}
void fsnotify_sb_delete(struct super_block *sb)
{
fsnotify_unmount_inodes(sb);
fsnotify_clear_marks_by_sb(sb);
/* Wait for outstanding object references from connectors */
wait_var_event(&sb->s_fsnotify_connectors,
!atomic_long_read(&sb->s_fsnotify_connectors));
}
/*
* Given an inode, first check if we care what happens to our children. Inotify
* and dnotify both tell their parents about events. If we care about any event
* on a child we run all of our children and set a dentry flag saying that the
* parent cares. Thus when an event happens on a child it can quickly tell if
* if there is a need to find a parent and send the event to the parent.
*/
void __fsnotify_update_child_dentry_flags(struct inode *inode)
{
struct dentry *alias;
int watched;
if (!S_ISDIR(inode->i_mode))
return;
/* determine if the children should tell inode about their events */
watched = fsnotify_inode_watches_children(inode);
spin_lock(&inode->i_lock);
/* run all of the dentries associated with this inode. Since this is a
* directory, there damn well better only be one item on this list */
hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
struct dentry *child;
/* run all of the children of the original inode and fix their
* d_flags to indicate parental interest (their parent is the
* original inode) */
spin_lock(&alias->d_lock);
list_for_each_entry(child, &alias->d_subdirs, d_child) {
if (!child->d_inode)
continue;
spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
if (watched)
child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
else
child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
spin_unlock(&child->d_lock);
}
spin_unlock(&alias->d_lock);
}
spin_unlock(&inode->i_lock);
}
/* Are inode/sb/mount interested in parent and name info with this event? */
static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt,
__u32 mask)
{
__u32 marks_mask = 0;
/* We only send parent/name to inode/sb/mount for events on non-dir */
if (mask & FS_ISDIR)
return false;
/*
* All events that are possible on child can also may be reported with
* parent/name info to inode/sb/mount. Otherwise, a watching parent
* could result in events reported with unexpected name info to sb/mount.
*/
BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT);
/* Did either inode/sb/mount subscribe for events with parent/name? */
marks_mask |= fsnotify_parent_needed_mask(inode->i_fsnotify_mask); marks_mask |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask); if (mnt) marks_mask |= fsnotify_parent_needed_mask(mnt->mnt_fsnotify_mask);
/* Did they subscribe for this event with parent/name info? */
return mask & marks_mask;
}
/*
* Notify this dentry's parent about a child's events with child name info
* if parent is watching or if inode/sb/mount are interested in events with
* parent and name info.
*
* Notify only the child without name info if parent is not watching and
* inode/sb/mount are not interested in events with parent and name info.
*/
int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
int data_type)
{
const struct path *path = fsnotify_data_path(data, data_type);
struct mount *mnt = path ? real_mount(path->mnt) : NULL; struct inode *inode = d_inode(dentry);
struct dentry *parent;
bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED;
bool parent_needed, parent_interested;
__u32 p_mask;
struct inode *p_inode = NULL;
struct name_snapshot name;
struct qstr *file_name = NULL;
int ret = 0;
/*
* Do inode/sb/mount care about parent and name info on non-dir?
* Do they care about any event at all?
*/
if (!inode->i_fsnotify_marks && !inode->i_sb->s_fsnotify_marks && (!mnt || !mnt->mnt_fsnotify_marks) && !parent_watched)
return 0;
parent = NULL;
parent_needed = fsnotify_event_needs_parent(inode, mnt, mask);
if (!parent_watched && !parent_needed)
goto notify;
/* Does parent inode care about events on children? */
parent = dget_parent(dentry);
p_inode = parent->d_inode;
p_mask = fsnotify_inode_watches_children(p_inode);
if (unlikely(parent_watched && !p_mask))
__fsnotify_update_child_dentry_flags(p_inode);
/*
* Include parent/name in notification either if some notification
* groups require parent info or the parent is interested in this event.
*/
parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS;
if (parent_needed || parent_interested) {
/* When notifying parent, child should be passed as data */
WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));
/* Notify both parent and child with child name info */
take_dentry_name_snapshot(&name, dentry);
file_name = &name.name;
if (parent_interested)
mask |= FS_EVENT_ON_CHILD;
}
notify:
ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0);
if (file_name)
release_dentry_name_snapshot(&name);
dput(parent); return ret;
}
EXPORT_SYMBOL_GPL(__fsnotify_parent);
static int fsnotify_handle_inode_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
u32 mask, const void *data, int data_type,
struct inode *dir, const struct qstr *name,
u32 cookie)
{
const struct path *path = fsnotify_data_path(data, data_type);
struct inode *inode = fsnotify_data_inode(data, data_type);
const struct fsnotify_ops *ops = group->ops;
if (WARN_ON_ONCE(!ops->handle_inode_event))
return 0;
if ((inode_mark->mask & FS_EXCL_UNLINK) && path && d_unlinked(path->dentry))
return 0;
/* Check interest of this mark in case event was sent with two marks */
if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS))
return 0;
return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie);
}
static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
const void *data, int data_type,
struct inode *dir, const struct qstr *name,
u32 cookie, struct fsnotify_iter_info *iter_info)
{
struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info);
int ret;
if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info)))
return 0;
if (parent_mark) {
/*
* parent_mark indicates that the parent inode is watching
* children and interested in this event, which is an event
* possible on child. But is *this mark* watching children and
* interested in this event?
*/
if (parent_mark->mask & FS_EVENT_ON_CHILD) { ret = fsnotify_handle_inode_event(group, parent_mark, mask,
data, data_type, dir, name, 0);
if (ret)
return ret;
}
if (!inode_mark)
return 0;
}
if (mask & FS_EVENT_ON_CHILD) {
/*
* Some events can be sent on both parent dir and child marks
* (e.g. FS_ATTRIB). If both parent dir and child are
* watching, report the event once to parent dir with name (if
* interested) and once to child without name (if interested).
* The child watcher is expecting an event without a file name
* and without the FS_EVENT_ON_CHILD flag.
*/
mask &= ~FS_EVENT_ON_CHILD;
dir = NULL;
name = NULL;
}
return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type,
dir, name, cookie);
}
static int send_to_group(__u32 mask, const void *data, int data_type,
struct inode *dir, const struct qstr *file_name,
u32 cookie, struct fsnotify_iter_info *iter_info)
{
struct fsnotify_group *group = NULL;
__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
__u32 marks_mask = 0;
__u32 marks_ignored_mask = 0;
struct fsnotify_mark *mark;
int type;
if (WARN_ON(!iter_info->report_mask))
return 0;
/* clear ignored on inode modification */
if (mask & FS_MODIFY) { fsnotify_foreach_obj_type(type) {
if (!fsnotify_iter_should_report_type(iter_info, type))
continue;
mark = iter_info->marks[type];
if (mark &&
!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mark->ignored_mask = 0;
}
}
fsnotify_foreach_obj_type(type) {
if (!fsnotify_iter_should_report_type(iter_info, type))
continue;
mark = iter_info->marks[type];
/* does the object mark tell us to do something? */
if (mark) {
group = mark->group;
marks_mask |= mark->mask;
marks_ignored_mask |= mark->ignored_mask;
}
}
pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
__func__, group, mask, marks_mask, marks_ignored_mask,
data, data_type, dir, cookie);
if (!(test_mask & marks_mask & ~marks_ignored_mask))
return 0;
if (group->ops->handle_event) { return group->ops->handle_event(group, mask, data, data_type, dir,
file_name, cookie, iter_info);
}
return fsnotify_handle_event(group, mask, data, data_type, dir,
file_name, cookie, iter_info);
}
static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp)
{
struct fsnotify_mark_connector *conn;
struct hlist_node *node = NULL;
conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
if (conn)
node = srcu_dereference(conn->list.first, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list);
}
static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
{
struct hlist_node *node = NULL;
if (mark)
node = srcu_dereference(mark->obj_list.next,
&fsnotify_mark_srcu);
return hlist_entry_safe(node, struct fsnotify_mark, obj_list);
}
/*
* iter_info is a multi head priority queue of marks.
* Pick a subset of marks from queue heads, all with the
* same group and set the report_mask for selected subset.
* Returns the report_mask of the selected subset.
*/
static unsigned int fsnotify_iter_select_report_types(
struct fsnotify_iter_info *iter_info)
{
struct fsnotify_group *max_prio_group = NULL;
struct fsnotify_mark *mark;
int type;
/* Choose max prio group among groups of all queue heads */
fsnotify_foreach_obj_type(type) {
mark = iter_info->marks[type];
if (mark && fsnotify_compare_groups(max_prio_group, mark->group) > 0) max_prio_group = mark->group;
}
if (!max_prio_group)
return 0;
/* Set the report mask for marks from same group as max prio group */
iter_info->report_mask = 0; fsnotify_foreach_obj_type(type) { mark = iter_info->marks[type];
if (mark &&
fsnotify_compare_groups(max_prio_group, mark->group) == 0)
fsnotify_iter_set_report_type(iter_info, type);
}
return iter_info->report_mask;
}
/*
* Pop from iter_info multi head queue, the marks that were iterated in the
* current iteration step.
*/
static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
{
int type;
fsnotify_foreach_obj_type(type) {
if (fsnotify_iter_should_report_type(iter_info, type))
iter_info->marks[type] = fsnotify_next_mark(iter_info->marks[type]);
}
}
/*
* fsnotify - This is the main call to fsnotify.
*
* The VFS calls into hook specific functions in linux/fsnotify.h.
* Those functions then in turn call here. Here will call out to all of the
* registered fsnotify_group. Those groups can then use the notification event
* in whatever means they feel necessary.
*
* @mask: event type and flags
* @data: object that event happened on
* @data_type: type of object for fanotify_data_XXX() accessors
* @dir: optional directory associated with event -
* if @file_name is not NULL, this is the directory that
* @file_name is relative to
* @file_name: optional file name associated with event
* @inode: optional inode associated with event -
* either @dir or @inode must be non-NULL.
* if both are non-NULL event may be reported to both.
* @cookie: inotify rename cookie
*/
int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
const struct qstr *file_name, struct inode *inode, u32 cookie)
{
const struct path *path = fsnotify_data_path(data, data_type);
struct fsnotify_iter_info iter_info = {};
struct super_block *sb;
struct mount *mnt = NULL;
struct inode *parent = NULL;
int ret = 0;
__u32 test_mask, marks_mask;
if (path)
mnt = real_mount(path->mnt); if (!inode) {
/* Dirent event - report on TYPE_INODE to dir */
inode = dir;
} else if (mask & FS_EVENT_ON_CHILD) {
/*
* Event on child - report on TYPE_PARENT to dir if it is
* watching children and on TYPE_INODE to child.
*/
parent = dir;
}
sb = inode->i_sb;
/*
* Optimization: srcu_read_lock() has a memory barrier which can
* be expensive. It protects walking the *_fsnotify_marks lists.
* However, if we do not walk the lists, we do not have to do
* SRCU because we have no references to any objects and do not
* need SRCU to keep them "alive".
*/
if (!sb->s_fsnotify_marks && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!parent || !parent->i_fsnotify_marks))
return 0;
marks_mask = sb->s_fsnotify_mask;
if (mnt)
marks_mask |= mnt->mnt_fsnotify_mask; if (inode) marks_mask |= inode->i_fsnotify_mask; if (parent) marks_mask |= parent->i_fsnotify_mask;
/*
* if this is a modify event we may need to clear the ignored masks
* otherwise return if none of the marks care about this type of event.
*/
test_mask = (mask & ALL_FSNOTIFY_EVENTS);
if (!(mask & FS_MODIFY) && !(test_mask & marks_mask))
return 0;
iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
fsnotify_first_mark(&sb->s_fsnotify_marks);
if (mnt) {
iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
}
if (inode) { iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
fsnotify_first_mark(&inode->i_fsnotify_marks);
}
if (parent) { iter_info.marks[FSNOTIFY_OBJ_TYPE_PARENT] =
fsnotify_first_mark(&parent->i_fsnotify_marks);
}
/*
* We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
* ignore masks are properly reflected for mount/sb mark notifications.
* That's why this traversal is so complicated...
*/
while (fsnotify_iter_select_report_types(&iter_info)) {
ret = send_to_group(mask, data, data_type, dir, file_name,
cookie, &iter_info);
if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
goto out;
fsnotify_iter_next(&iter_info);
}
ret = 0;
out:
srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx); return ret;
}
EXPORT_SYMBOL_GPL(fsnotify);
static __init int fsnotify_init(void)
{
int ret;
BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25);
ret = init_srcu_struct(&fsnotify_mark_srcu);
if (ret)
panic("initializing fsnotify_mark_srcu");
fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector,
SLAB_PANIC);
return 0;
}
core_initcall(fsnotify_init);
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#ifndef BTRFS_INODE_H
#define BTRFS_INODE_H
#include <linux/hash.h>
#include <linux/refcount.h>
#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"
#include "delayed-inode.h"
/*
* ordered_data_close is set by truncate when a file that used
* to have good data has been truncated to zero. When it is set
* the btrfs file release call will add this inode to the
* ordered operations list so that we make sure to flush out any
* new data the application may have written before commit.
*/
enum {
BTRFS_INODE_FLUSH_ON_CLOSE,
BTRFS_INODE_DUMMY,
BTRFS_INODE_IN_DEFRAG,
BTRFS_INODE_HAS_ASYNC_EXTENT,
/*
* Always set under the VFS' inode lock, otherwise it can cause races
* during fsync (we start as a fast fsync and then end up in a full
* fsync racing with ordered extent completion).
*/
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
BTRFS_INODE_IN_DELALLOC_LIST,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
/*
* Set and used when logging an inode and it serves to signal that an
* inode does not have xattrs, so subsequent fsyncs can avoid searching
* for xattrs to log. This bit must be cleared whenever a xattr is added
* to an inode.
*/
BTRFS_INODE_NO_XATTRS,
/*
* Set when we are in a context where we need to start a transaction and
* have dirty pages with the respective file range locked. This is to
* ensure that when reserving space for the transaction, if we are low
* on available space and need to flush delalloc, we will not flush
* delalloc for this inode, because that could result in a deadlock (on
* the file range, inode's io_tree).
*/
BTRFS_INODE_NO_DELALLOC_FLUSH,
/*
* Set when we are working on enabling verity for a file. Computing and
* writing the whole Merkle tree can take a while so we want to prevent
* races where two separate tasks attempt to simultaneously start verity
* on the same file.
*/
BTRFS_INODE_VERITY_IN_PROGRESS,
};
/* in memory btrfs inode */
struct btrfs_inode {
/* which subvolume this inode belongs to */
struct btrfs_root *root;
/* key used to find this inode on disk. This is used by the code
* to read in roots of subvolumes
*/
struct btrfs_key location;
/*
* Lock for counters and all fields used to determine if the inode is in
* the log or not (last_trans, last_sub_trans, last_log_commit,
* logged_trans), to access/update new_delalloc_bytes and to update the
* VFS' inode number of bytes used.
*/
spinlock_t lock;
/* the extent_tree has caches of all the extent mappings to disk */
struct extent_map_tree extent_tree;
/* the io_tree does range state (DIRTY, LOCKED etc) */
struct extent_io_tree io_tree;
/* special utility tree used to record which mirrors have already been
* tried when checksums fail for a given block
*/
struct extent_io_tree io_failure_tree;
/*
* Keep track of where the inode has extent items mapped in order to
* make sure the i_size adjustments are accurate
*/
struct extent_io_tree file_extent_tree;
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
/* list of all the delalloc inodes in the FS. There are times we need
* to write all the delalloc pages to disk, and this list is used
* to walk them all.
*/
struct list_head delalloc_inodes;
/* node for the red-black tree that links inodes in subvolume root */
struct rb_node rb_node;
unsigned long runtime_flags;
/* Keep track of who's O_SYNC/fsyncing currently */
atomic_t sync_writers;
/* full 64 bit generation number, struct vfs_inode doesn't have a big
* enough field for this.
*/
u64 generation;
/*
* transid of the trans_handle that last modified this inode
*/
u64 last_trans;
/*
* transid that last logged this inode
*/
u64 logged_trans;
/*
* log transid when this inode was last modified
*/
int last_sub_trans;
/* a local copy of root's last_log_commit */
int last_log_commit;
/* total number of bytes pending delalloc, used by stat to calc the
* real block usage of the file
*/
u64 delalloc_bytes;
/*
* Total number of bytes pending delalloc that fall within a file
* range that is either a hole or beyond EOF (and no prealloc extent
* exists in the range). This is always <= delalloc_bytes.
*/
u64 new_delalloc_bytes;
/*
* total number of bytes pending defrag, used by stat to check whether
* it needs COW.
*/
u64 defrag_bytes;
/*
* the size of the file stored in the metadata on disk. data=ordered
* means the in-memory i_size might be larger than the size on disk
* because not all the blocks are written yet.
*/
u64 disk_i_size;
/*
* if this is a directory then index_cnt is the counter for the index
* number for new files that are created
*/
u64 index_cnt;
/* Cache the directory index number to speed the dir/file remove */
u64 dir_index;
/* the fsync log has some corner cases that mean we have to check
* directories to see if any unlinks have been done before
* the directory was logged. See tree-log.c for all the
* details
*/
u64 last_unlink_trans;
/*
* The id/generation of the last transaction where this inode was
* either the source or the destination of a clone/dedupe operation.
* Used when logging an inode to know if there are shared extents that
* need special care when logging checksum items, to avoid duplicate
* checksum items in a log (which can lead to a corruption where we end
* up with missing checksum ranges after log replay).
* Protected by the vfs inode lock.
*/
u64 last_reflink_trans;
/*
* Number of bytes outstanding that are going to need csums. This is
* used in ENOSPC accounting.
*/
u64 csum_bytes;
/* Backwards incompatible flags, lower half of inode_item::flags */
u32 flags;
/* Read-only compatibility flags, upper half of inode_item::flags */
u32 ro_flags;
/*
* Counters to keep track of the number of extent item's we may use due
* to delalloc and such. outstanding_extents is the number of extent
* items we think we'll end up using, and reserved_extents is the number
* of extent items we've reserved metadata for.
*/
unsigned outstanding_extents;
struct btrfs_block_rsv block_rsv;
/*
* Cached values of inode properties
*/
unsigned prop_compress; /* per-file compression algorithm */
/*
* Force compression on the file using the defrag ioctl, could be
* different from prop_compress and takes precedence if set
*/
unsigned defrag_compress;
struct btrfs_delayed_node *delayed_node;
/* File creation time. */
struct timespec64 i_otime;
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
struct rw_semaphore i_mmap_lock;
struct inode vfs_inode;
};
static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
{
return inode->root->fs_info->sectorsize;
}
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
}
static inline unsigned long btrfs_inode_hash(u64 objectid,
const struct btrfs_root *root)
{
u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME);
#if BITS_PER_LONG == 32
h = (h >> 32) ^ (h & 0xffffffff);
#endif
return (unsigned long)h;
}
static inline void btrfs_insert_inode_hash(struct inode *inode)
{
unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
__insert_inode_hash(inode, h);
}
static inline u64 btrfs_ino(const struct btrfs_inode *inode)
{
u64 ino = inode->location.objectid;
/*
* !ino: btree_inode
* type == BTRFS_ROOT_ITEM_KEY: subvol dir
*/
if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY) ino = inode->vfs_inode.i_ino;
return ino;
}
static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
{
i_size_write(&inode->vfs_inode, size);
inode->disk_i_size = size;
}
static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root; if (root == root->fs_info->tree_root &&
btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
return true;
if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID)
return true;
return false;
}
static inline bool is_data_inode(struct inode *inode)
{
return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
}
static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
int mod)
{
lockdep_assert_held(&inode->lock);
inode->outstanding_extents += mod;
if (btrfs_is_free_space_inode(inode))
return;
trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode),
mod);
}
/*
* Called every time after doing a buffered, direct IO or memory mapped write.
*
* This is to ensure that if we write to a file that was previously fsynced in
* the current transaction, then try to fsync it again in the same transaction,
* we will know that there were changes in the file and that it needs to be
* logged.
*/
static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
{
spin_lock(&inode->lock);
inode->last_sub_trans = inode->root->log_transid;
spin_unlock(&inode->lock);
}
static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
bool ret = false;
spin_lock(&inode->lock);
if (inode->logged_trans == generation &&
inode->last_sub_trans <= inode->last_log_commit && inode->last_sub_trans <= inode->root->last_log_commit)
ret = true;
spin_unlock(&inode->lock);
return ret;
}
struct btrfs_dio_private {
struct inode *inode;
u64 logical_offset;
u64 disk_bytenr;
/* Used for bio::bi_size */
u32 bytes;
/*
* References to this structure. There is one reference per in-flight
* bio plus one while we're still setting up.
*/
refcount_t refs;
/* dio_bio came from fs/direct-io.c */
struct bio *dio_bio;
/* Array of checksums */
u8 csums[];
};
/*
* btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
* separate u32s. These two functions convert between the two representations.
*/
static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
{
return (flags | ((u64)ro_flags << 32));
}
static inline void btrfs_inode_split_flags(u64 inode_item_flags,
u32 *flags, u32 *ro_flags)
{
*flags = (u32)inode_item_flags;
*ro_flags = (u32)(inode_item_flags >> 32);
}
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
{
struct btrfs_root *root = inode->root;
const u32 csum_size = root->fs_info->csum_size;
/* Output minus objectid, which is more meaningful */
if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
root->root_key.objectid, btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
else
btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
root->root_key.objectid, btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
}
#endif
// SPDX-License-Identifier: GPL-2.0+
/*
* User-space Probes (UProbes)
*
* Copyright (C) IBM Corporation, 2008-2012
* Authors:
* Srikar Dronamraju
* Jim Keniston
* Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
*/
#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
#include <linux/swap.h> /* try_to_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
static struct rb_root uprobes_tree = RB_ROOT;
/*
* allows us to skip the uprobe_mmap if there are no uprobe events active
* at this time. Probably a fine grained per inode count is better?
*/
#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
refcount_t ref;
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
struct uprobe_consumer *consumers;
struct inode *inode; /* Also hold a ref to inode */
loff_t offset;
loff_t ref_ctr_offset;
unsigned long flags;
/*
* The generic code assumes that it has two members of unknown type
* owned by the arch-specific code:
*
* insn - copy_insn() saves the original instruction here for
* arch_uprobe_analyze_insn().
*
* ixol - potentially modified instruction to execute out of
* line, copied to xol_area by xol_get_insn_slot().
*/
struct arch_uprobe arch;
};
struct delayed_uprobe {
struct list_head list;
struct uprobe *uprobe;
struct mm_struct *mm;
};
static DEFINE_MUTEX(delayed_uprobe_lock);
static LIST_HEAD(delayed_uprobe_list);
/*
* Execute out of line area: anonymous executable mapping installed
* by the probed task to execute the copy of the original instruction
* mangled by set_swbp().
*
* On a breakpoint hit, thread contests for a slot. It frees the
* slot after singlestep. Currently a fixed number of slots are
* allocated.
*/
struct xol_area {
wait_queue_head_t wq; /* if all slots are busy */
atomic_t slot_count; /* number of in-use slots */
unsigned long *bitmap; /* 0 = free slot */
struct vm_special_mapping xol_mapping;
struct page *pages[2];
/*
* We keep the vma's vm_start rather than a pointer to the vma
* itself. The probed process or a naughty kernel module could make
* the vma go away, and we must handle that reasonably gracefully.
*/
unsigned long vaddr; /* Page(s) of instruction slots */
};
/*
* valid_vma: Verify if the specified vma is an executable vma
* Relax restrictions while unregistering: vm_flags might have
* changed after breakpoint was inserted.
* - is_register: indicates if we are in register context.
* - Return 1 if the specified virtual address is in an
* executable vma.
*/
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
if (is_register)
flags |= VM_WRITE;
return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
}
static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
{
return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
}
static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
{
return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}
/**
* __replace_page - replace page in vma by new page.
* based on replace_page in mm/ksm.c
*
* @vma: vma that holds the pte pointing to page
* @addr: address the old @page is mapped at
* @old_page: the page we are replacing by new_page
* @new_page: the modified page we replace page by
*
* If @new_page is NULL, only unmap @old_page.
*
* Returns 0 on success, negative error code otherwise.
*/
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
struct page_vma_mapped_walk pvmw = {
.page = compound_head(old_page),
.vma = vma,
.address = addr,
};
int err;
struct mmu_notifier_range range;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
if (new_page) {
err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
if (err)
return err;
}
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(old_page);
mmu_notifier_invalidate_range_start(&range);
err = -EAGAIN;
if (!page_vma_mapped_walk(&pvmw))
goto unlock;
VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
if (new_page) {
get_page(new_page);
page_add_new_anon_rmap(new_page, vma, addr, false);
lru_cache_add_inactive_or_unevictable(new_page, vma);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, mm_counter_file(old_page));
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
ptep_clear_flush_notify(vma, addr, pvmw.pte);
if (new_page)
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));
page_remove_rmap(old_page, false);
if (!page_mapped(old_page))
try_to_free_swap(old_page);
page_vma_mapped_walk_done(&pvmw);
if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
munlock_vma_page(old_page);
put_page(old_page);
err = 0;
unlock:
mmu_notifier_invalidate_range_end(&range);
unlock_page(old_page);
return err;
}
/**
* is_swbp_insn - check if instruction is breakpoint instruction.
* @insn: instruction to be checked.
* Default implementation of is_swbp_insn
* Returns true if @insn is a breakpoint instruction.
*/
bool __weak is_swbp_insn(uprobe_opcode_t *insn)
{
return *insn == UPROBE_SWBP_INSN;
}
/**
* is_trap_insn - check if instruction is breakpoint instruction.
* @insn: instruction to be checked.
* Default implementation of is_trap_insn
* Returns true if @insn is a breakpoint instruction.
*
* This function is needed for the case where an architecture has multiple
* trap instructions (like powerpc).
*/
bool __weak is_trap_insn(uprobe_opcode_t *insn)
{
return is_swbp_insn(insn);
}
static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
kunmap_atomic(kaddr);
}
static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
{
void *kaddr = kmap_atomic(page);
memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
kunmap_atomic(kaddr);
}
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
/*
* Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
* We do not check if it is any other 'trap variant' which could
* be conditional trap instruction such as the one powerpc supports.
*
* The logic is that we do not care if the underlying instruction
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
if (is_swbp_insn(new_opcode)) {
if (is_swbp) /* register: already installed? */
return 0;
} else {
if (!is_swbp) /* unregister: was it changed by us? */
return 0;
}
return 1;
}
static struct delayed_uprobe *
delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
{
struct delayed_uprobe *du;
list_for_each_entry(du, &delayed_uprobe_list, list)
if (du->uprobe == uprobe && du->mm == mm)
return du;
return NULL;
}
static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
{
struct delayed_uprobe *du;
if (delayed_uprobe_check(uprobe, mm))
return 0;
du = kzalloc(sizeof(*du), GFP_KERNEL);
if (!du)
return -ENOMEM;
du->uprobe = uprobe;
du->mm = mm;
list_add(&du->list, &delayed_uprobe_list);
return 0;
}
static void delayed_uprobe_delete(struct delayed_uprobe *du)
{
if (WARN_ON(!du))
return;
list_del(&du->list);
kfree(du);
}
static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
{
struct list_head *pos, *q;
struct delayed_uprobe *du;
if (!uprobe && !mm)
return;
list_for_each_safe(pos, q, &delayed_uprobe_list) {
du = list_entry(pos, struct delayed_uprobe, list);
if (uprobe && du->uprobe != uprobe)
continue;
if (mm && du->mm != mm)
continue;
delayed_uprobe_delete(du);
}
}
static bool valid_ref_ctr_vma(struct uprobe *uprobe,
struct vm_area_struct *vma)
{
unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);
return uprobe->ref_ctr_offset &&
vma->vm_file &&
file_inode(vma->vm_file) == uprobe->inode &&
(vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
vma->vm_start <= vaddr &&
vma->vm_end > vaddr;
}
static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
struct vm_area_struct *tmp;
for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
if (valid_ref_ctr_vma(uprobe, tmp))
return tmp;
return NULL;
}
static int
__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
{
void *kaddr;
struct page *page;
struct vm_area_struct *vma;
int ret;
short *ptr;
if (!vaddr || !d)
return -EINVAL;
ret = get_user_pages_remote(mm, vaddr, 1,
FOLL_WRITE, &page, &vma, NULL);
if (unlikely(ret <= 0)) {
/*
* We are asking for 1 page. If get_user_pages_remote() fails,
* it may return 0, in that case we have to return error.
*/
return ret == 0 ? -EBUSY : ret;
}
kaddr = kmap_atomic(page);
ptr = kaddr + (vaddr & ~PAGE_MASK);
if (unlikely(*ptr + d < 0)) {
pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
"curr val: %d, delta: %d\n", vaddr, *ptr, d);
ret = -EINVAL;
goto out;
}
*ptr += d;
ret = 0;
out:
kunmap_atomic(kaddr);
put_page(page);
return ret;
}
static void update_ref_ctr_warn(struct uprobe *uprobe,
struct mm_struct *mm, short d)
{
pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
"0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
(unsigned long long) uprobe->offset,
(unsigned long long) uprobe->ref_ctr_offset, mm);
}
static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
short d)
{
struct vm_area_struct *rc_vma;
unsigned long rc_vaddr;
int ret = 0;
rc_vma = find_ref_ctr_vma(uprobe, mm);
if (rc_vma) {
rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
ret = __update_ref_ctr(mm, rc_vaddr, d);
if (ret)
update_ref_ctr_warn(uprobe, mm, d);
if (d > 0)
return ret;
}
mutex_lock(&delayed_uprobe_lock);
if (d > 0)
ret = delayed_uprobe_add(uprobe, mm);
else
delayed_uprobe_remove(uprobe, mm);
mutex_unlock(&delayed_uprobe_lock);
return ret;
}
/*
* NOTE:
* Expect the breakpoint instruction to be the smallest size instruction for
* the architecture. If an arch has variable length instruction and the
* breakpoint instruction is not of the smallest length instruction
* supported by that architecture then we need to modify is_trap_at_addr and
* uprobe_write_opcode accordingly. This would never be a problem for archs
* that have fixed length instructions.
*
* uprobe_write_opcode - write the opcode at a given virtual address.
* @auprobe: arch specific probepoint information.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
*
* Called with mm->mmap_lock held for write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
unsigned long vaddr, uprobe_opcode_t opcode)
{
struct uprobe *uprobe;
struct page *old_page, *new_page;
struct vm_area_struct *vma;
int ret, is_register, ref_ctr_updated = 0;
bool orig_page_huge = false;
unsigned int gup_flags = FOLL_FORCE;
is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
retry:
if (is_register)
gup_flags |= FOLL_SPLIT_PMD;
/* Read the page with vaddr into memory */
ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
&old_page, &vma, NULL);
if (ret <= 0)
return ret;
ret = verify_opcode(old_page, vaddr, &opcode);
if (ret <= 0)
goto put_old;
if (WARN(!is_register && PageCompound(old_page),
"uprobe unregister should never work on compound page\n")) {
ret = -EINVAL;
goto put_old;
}
/* We are going to replace instruction, update ref_ctr. */
if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
if (ret)
goto put_old;
ref_ctr_updated = 1;
}
ret = 0;
if (!is_register && !PageAnon(old_page))
goto put_old;
ret = anon_vma_prepare(vma);
if (ret)
goto put_old;
ret = -ENOMEM;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
if (!new_page)
goto put_old;
__SetPageUptodate(new_page);
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
if (!is_register) {
struct page *orig_page;
pgoff_t index;
VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
index);
if (orig_page) {
if (PageUptodate(orig_page) &&
pages_identical(new_page, orig_page)) {
/* let go new_page */
put_page(new_page);
new_page = NULL;
if (PageCompound(orig_page))
orig_page_huge = true;
}
put_page(orig_page);
}
}
ret = __replace_page(vma, vaddr, old_page, new_page);
if (new_page)
put_page(new_page);
put_old:
put_page(old_page);
if (unlikely(ret == -EAGAIN))
goto retry;
/* Revert back reference counter if instruction update failed. */
if (ret && is_register && ref_ctr_updated)
update_ref_ctr(uprobe, mm, -1);
/* try collapse pmd for compound page */
if (!ret && orig_page_huge)
collapse_pte_mapped_thp(mm, vaddr);
return ret;
}
/**
* set_swbp - store breakpoint at a given address.
* @auprobe: arch specific probepoint information.
* @mm: the probed process address space.
* @vaddr: the virtual address to insert the opcode.
*
* For mm @mm, store the breakpoint instruction at @vaddr.
* Return 0 (success) or a negative errno.
*/
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
}
/**
* set_orig_insn - Restore the original instruction.
* @mm: the probed process address space.
* @auprobe: arch specific probepoint information.
* @vaddr: the virtual address to insert the opcode.
*
* For mm @mm, restore the original opcode (opcode) at @vaddr.
* Return 0 (success) or a negative errno.
*/
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, mm, vaddr,
*(uprobe_opcode_t *)&auprobe->insn);
}
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
refcount_inc(&uprobe->ref);
return uprobe;
}
static void put_uprobe(struct uprobe *uprobe)
{
if (refcount_dec_and_test(&uprobe->ref)) {
/*
* If application munmap(exec_vma) before uprobe_unregister()
* gets called, we don't get a chance to remove uprobe from
* delayed_uprobe_list from remove_breakpoint(). Do it here.
*/
mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
kfree(uprobe);
}
}
static __always_inline
int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
const struct uprobe *r)
{
if (l_inode < r->inode)
return -1;
if (l_inode > r->inode)
return 1;
if (l_offset < r->offset)
return -1;
if (l_offset > r->offset)
return 1;
return 0;
}
#define __node_2_uprobe(node) \
rb_entry((node), struct uprobe, rb_node)
struct __uprobe_key {
struct inode *inode;
loff_t offset;
};
static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b)
{
const struct __uprobe_key *a = key;
return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b));
}
static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
{
struct uprobe *u = __node_2_uprobe(a);
return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
}
static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
{
struct __uprobe_key key = {
.inode = inode,
.offset = offset,
};
struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
if (node)
return get_uprobe(__node_2_uprobe(node));
return NULL;
}
/*
* Find a uprobe corresponding to a given inode:offset
* Acquires uprobes_treelock
*/
static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
struct uprobe *uprobe;
spin_lock(&uprobes_treelock);
uprobe = __find_uprobe(inode, offset);
spin_unlock(&uprobes_treelock);
return uprobe;
}
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
struct rb_node *node;
node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
if (node)
return get_uprobe(__node_2_uprobe(node));
/* get access + creation ref */
refcount_set(&uprobe->ref, 2);
return NULL;
}
/*
* Acquire uprobes_treelock.
* Matching uprobe already exists in rbtree;
* increment (access refcount) and return the matching uprobe.
*
* No matching uprobe; insert the uprobe in rb_tree;
* get a double refcount (access + creation) and return NULL.
*/
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
struct uprobe *u;
spin_lock(&uprobes_treelock);
u = __insert_uprobe(uprobe);
spin_unlock(&uprobes_treelock);
return u;
}
static void
ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
{
pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
"ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
(unsigned long long) cur_uprobe->ref_ctr_offset,
(unsigned long long) uprobe->ref_ctr_offset);
}
static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
loff_t ref_ctr_offset)
{
struct uprobe *uprobe, *cur_uprobe;
uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
if (!uprobe)
return NULL;
uprobe->inode = inode;
uprobe->offset = offset;
uprobe->ref_ctr_offset = ref_ctr_offset;
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
/* a uprobe exists for this inode:offset combination */
if (cur_uprobe) {
if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
ref_ctr_mismatch_warn(cur_uprobe, uprobe);
put_uprobe(cur_uprobe);
kfree(uprobe);
return ERR_PTR(-EINVAL);
}
kfree(uprobe);
uprobe = cur_uprobe;
}
return uprobe;
}
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
down_write(&uprobe->consumer_rwsem);
uc->next = uprobe->consumers;
uprobe->consumers = uc;
up_write(&uprobe->consumer_rwsem);
}
/*
* For uprobe @uprobe, delete the consumer @uc.
* Return true if the @uc is deleted successfully
* or return false.
*/
static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
struct uprobe_consumer **con;
bool ret = false;
down_write(&uprobe->consumer_rwsem);
for (con = &uprobe->consumers; *con; con = &(*con)->next) {
if (*con == uc) {
*con = uc->next;
ret = true;
break;
}
}
up_write(&uprobe->consumer_rwsem);
return ret;
}
static int __copy_insn(struct address_space *mapping, struct file *filp,
void *insn, int nbytes, loff_t offset)
{
struct page *page;
/*
* Ensure that the page that has the original instruction is populated
* and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
* see uprobe_register().
*/
if (mapping->a_ops->readpage)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
else
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
if (IS_ERR(page))
return PTR_ERR(page);
copy_from_page(page, offset, insn, nbytes);
put_page(page);
return 0;
}
static int copy_insn(struct uprobe *uprobe, struct file *filp)
{
struct address_space *mapping = uprobe->inode->i_mapping;
loff_t offs = uprobe->offset;
void *insn = &uprobe->arch.insn;
int size = sizeof(uprobe->arch.insn);
int len, err = -EIO;
/* Copy only available bytes, -EIO if nothing was read */
do {
if (offs >= i_size_read(uprobe->inode))
break;
len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
err = __copy_insn(mapping, filp, insn, len, offs);
if (err)
break;
insn += len;
offs += len;
size -= len;
} while (size);
return err;
}
static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
struct mm_struct *mm, unsigned long vaddr)
{
int ret = 0;
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
return ret;
/* TODO: move this into _register, until then we abuse this sem. */
down_write(&uprobe->consumer_rwsem);
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
goto out;
ret = copy_insn(uprobe, file);
if (ret)
goto out;
ret = -ENOTSUPP;
if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
goto out;
ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
if (ret)
goto out;
smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
out:
up_write(&uprobe->consumer_rwsem);
return ret;
}
static inline bool consumer_filter(struct uprobe_consumer *uc,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
return !uc->filter || uc->filter(uc, ctx, mm);
}
static bool filter_chain(struct uprobe *uprobe,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
struct uprobe_consumer *uc;
bool ret = false;
down_read(&uprobe->consumer_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
ret = consumer_filter(uc, ctx, mm);
if (ret)
break;
}
up_read(&uprobe->consumer_rwsem);
return ret;
}
static int
install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long vaddr)
{
bool first_uprobe;
int ret;
ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
if (ret)
return ret;
/*
* set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
* the task can hit this breakpoint right after __replace_page().
*/
first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
if (first_uprobe)
set_bit(MMF_HAS_UPROBES, &mm->flags);
ret = set_swbp(&uprobe->arch, mm, vaddr);
if (!ret)
clear_bit(MMF_RECALC_UPROBES, &mm->flags);
else if (first_uprobe)
clear_bit(MMF_HAS_UPROBES, &mm->flags);
return ret;
}
static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
set_bit(MMF_RECALC_UPROBES, &mm->flags);
return set_orig_insn(&uprobe->arch, mm, vaddr);
}
static inline bool uprobe_is_active(struct uprobe *uprobe)
{
return !RB_EMPTY_NODE(&uprobe->rb_node);
}
/*
* There could be threads that have already hit the breakpoint. They
* will recheck the current insn and restart if find_uprobe() fails.
* See find_active_uprobe().
*/
static void delete_uprobe(struct uprobe *uprobe)
{
if (WARN_ON(!uprobe_is_active(uprobe)))
return;
spin_lock(&uprobes_treelock);
rb_erase(&uprobe->rb_node, &uprobes_tree);
spin_unlock(&uprobes_treelock);
RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
put_uprobe(uprobe);
}
struct map_info {
struct map_info *next;
struct mm_struct *mm;
unsigned long vaddr;
};
static inline struct map_info *free_map_info(struct map_info *info)
{
struct map_info *next = info->next;
kfree(info);
return next;
}
static struct map_info *
build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
{
unsigned long pgoff = offset >> PAGE_SHIFT;
struct vm_area_struct *vma;
struct map_info *curr = NULL;
struct map_info *prev = NULL;
struct map_info *info;
int more = 0;
again:
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (!valid_vma(vma, is_register))
continue;
if (!prev && !more) {
/*
* Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
if (prev)
prev->next = NULL;
}
if (!prev) {
more++;
continue;
}
if (!mmget_not_zero(vma->vm_mm))
continue;
info = prev;
prev = prev->next;
info->next = curr;
curr = info;
info->mm = vma->vm_mm;
info->vaddr = offset_to_vaddr(vma, offset);
}
i_mmap_unlock_read(mapping);
if (!more)
goto out;
prev = curr;
while (curr) {
mmput(curr->mm);
curr = curr->next;
}
do {
info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
if (!info) {
curr = ERR_PTR(-ENOMEM);
goto out;
}
info->next = prev;
prev = info;
} while (--more);
goto again;
out:
while (prev)
prev = free_map_info(prev);
return curr;
}
static int
register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
{
bool is_register = !!new;
struct map_info *info;
int err = 0;
percpu_down_write(&dup_mmap_sem);
info = build_map_info(uprobe->inode->i_mapping,
uprobe->offset, is_register);
if (IS_ERR(info)) {
err = PTR_ERR(info);
goto out;
}
while (info) {
struct mm_struct *mm = info->mm;
struct vm_area_struct *vma;
if (err && is_register)
goto free;
mmap_write_lock(mm);
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
file_inode(vma->vm_file) != uprobe->inode)
goto unlock;
if (vma->vm_start > info->vaddr ||
vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
goto unlock;
if (is_register) {
/* consult only the "caller", new consumer. */
if (consumer_filter(new,
UPROBE_FILTER_REGISTER, mm))
err = install_breakpoint(uprobe, mm, vma, info->vaddr);
} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
if (!filter_chain(uprobe,
UPROBE_FILTER_UNREGISTER, mm))
err |= remove_breakpoint(uprobe, mm, info->vaddr);
}
unlock:
mmap_write_unlock(mm);
free:
mmput(mm);
info = free_map_info(info);
}
out:
percpu_up_write(&dup_mmap_sem);
return err;
}
static void
__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
int err;
if (WARN_ON(!consumer_del(uprobe, uc)))
return;
err = register_for_each_vma(uprobe, NULL);
/* TODO : cant unregister? schedule a worker thread */
if (!uprobe->consumers && !err)
delete_uprobe(uprobe);
}
/*
* uprobe_unregister - unregister an already registered probe.
* @inode: the file in which the probe has to be removed.
* @offset: offset from the start of the file.
* @uc: identify which probe if multiple probes are colocated.
*/
void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
uprobe = find_uprobe(inode, offset);
if (WARN_ON(!uprobe))
return;
down_write(&uprobe->register_rwsem);
__uprobe_unregister(uprobe, uc);
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);
/*
* __uprobe_register - register a probe
* @inode: the file in which the probe has to be placed.
* @offset: offset from the start of the file.
* @uc: information on howto handle the probe..
*
* Apart from the access refcount, __uprobe_register() takes a creation
* refcount (thro alloc_uprobe) if and only if this @uprobe is getting
* inserted into the rbtree (i.e first consumer for a @inode:@offset
* tuple). Creation refcount stops uprobe_unregister from freeing the
* @uprobe even before the register operation is complete. Creation
* refcount is released when the last @uc for the @uprobe
* unregisters. Caller of __uprobe_register() is required to keep @inode
* (and the containing mount) referenced.
*
* Return errno if it cannot successully install probes
* else return 0 (success)
*/
static int __uprobe_register(struct inode *inode, loff_t offset,
loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
int ret;
/* Uprobe must have at least one set consumer */
if (!uc->handler && !uc->ret_handler)
return -EINVAL;
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
/*
* This ensures that copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
return -EINVAL;
if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
return -EINVAL;
retry:
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
if (!uprobe)
return -ENOMEM;
if (IS_ERR(uprobe))
return PTR_ERR(uprobe);
/*
* We can race with uprobe_unregister()->delete_uprobe().
* Check uprobe_is_active() and retry if it is false.
*/
down_write(&uprobe->register_rwsem);
ret = -EAGAIN;
if (likely(uprobe_is_active(uprobe))) {
consumer_add(uprobe, uc);
ret = register_for_each_vma(uprobe, uc);
if (ret)
__uprobe_unregister(uprobe, uc);
}
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
if (unlikely(ret == -EAGAIN))
goto retry;
return ret;
}
int uprobe_register(struct inode *inode, loff_t offset,
struct uprobe_consumer *uc)
{
return __uprobe_register(inode, offset, 0, uc);
}
EXPORT_SYMBOL_GPL(uprobe_register);
int uprobe_register_refctr(struct inode *inode, loff_t offset,
loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
return __uprobe_register(inode, offset, ref_ctr_offset, uc);
}
EXPORT_SYMBOL_GPL(uprobe_register_refctr);
/*
* uprobe_apply - unregister an already registered probe.
* @inode: the file in which the probe has to be removed.
* @offset: offset from the start of the file.
* @uc: consumer which wants to add more or remove some breakpoints
* @add: add or remove the breakpoints
*/
int uprobe_apply(struct inode *inode, loff_t offset,
struct uprobe_consumer *uc, bool add)
{
struct uprobe *uprobe;
struct uprobe_consumer *con;
int ret = -ENOENT;
uprobe = find_uprobe(inode, offset);
if (WARN_ON(!uprobe))
return ret;
down_write(&uprobe->register_rwsem);
for (con = uprobe->consumers; con && con != uc ; con = con->next)
;
if (con)
ret = register_for_each_vma(uprobe, add ? uc : NULL);
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
return ret;
}
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
struct vm_area_struct *vma;
int err = 0;
mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
unsigned long vaddr;
loff_t offset;
if (!valid_vma(vma, false) ||
file_inode(vma->vm_file) != uprobe->inode)
continue;
offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
if (uprobe->offset < offset ||
uprobe->offset >= offset + vma->vm_end - vma->vm_start)
continue;
vaddr = offset_to_vaddr(vma, uprobe->offset);
err |= remove_breakpoint(uprobe, mm, vaddr);
}
mmap_read_unlock(mm);
return err;
}
static struct rb_node *
find_node_in_range(struct inode *inode, loff_t min, loff_t max)
{
struct rb_node *n = uprobes_tree.rb_node;
while (n) {
struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
if (inode < u->inode) {
n = n->rb_left;
} else if (inode > u->inode) {
n = n->rb_right;
} else {
if (max < u->offset)
n = n->rb_left;
else if (min > u->offset)
n = n->rb_right;
else
break;
}
}
return n;
}
/*
* For a given range in vma, build a list of probes that need to be inserted.
*/
static void build_probe_list(struct inode *inode,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct list_head *head)
{
loff_t min, max;
struct rb_node *n, *t;
struct uprobe *u;
INIT_LIST_HEAD(head);
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
spin_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
if (n) {
for (t = n; t; t = rb_prev(t)) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset < min)
break;
list_add(&u->pending_list, head);
get_uprobe(u);
}
for (t = n; (t = rb_next(t)); ) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset > max)
break;
list_add(&u->pending_list, head);
get_uprobe(u);
}
}
spin_unlock(&uprobes_treelock);
}
/* @vma contains reference counter, not the probed instruction. */
static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
{
struct list_head *pos, *q;
struct delayed_uprobe *du;
unsigned long vaddr;
int ret = 0, err = 0;
mutex_lock(&delayed_uprobe_lock);
list_for_each_safe(pos, q, &delayed_uprobe_list) {
du = list_entry(pos, struct delayed_uprobe, list);
if (du->mm != vma->vm_mm ||
!valid_ref_ctr_vma(du->uprobe, vma))
continue;
vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
if (ret) {
update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
if (!err)
err = ret;
}
delayed_uprobe_delete(du);
}
mutex_unlock(&delayed_uprobe_lock);
return err;
}
/*
* Called from mmap_region/vma_adjust with mm->mmap_lock acquired.
*
* Currently we ignore all errors and always return 0, the callers
* can't handle the failure anyway.
*/
int uprobe_mmap(struct vm_area_struct *vma)
{
struct list_head tmp_list;
struct uprobe *uprobe, *u;
struct inode *inode;
if (no_uprobe_events())
return 0;
if (vma->vm_file &&
(vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
delayed_ref_ctr_inc(vma);
if (!valid_vma(vma, true))
return 0;
inode = file_inode(vma->vm_file);
if (!inode)
return 0;
mutex_lock(uprobes_mmap_hash(inode));
build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
/*
* We can race with uprobe_unregister(), this uprobe can be already
* removed. But in this case filter_chain() must return false, all
* consumers have gone away.
*/
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
if (!fatal_signal_pending(current) &&
filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
put_uprobe(uprobe);
}
mutex_unlock(uprobes_mmap_hash(inode));
return 0;
}
static bool
vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
loff_t min, max;
struct inode *inode;
struct rb_node *n;
inode = file_inode(vma->vm_file);
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
spin_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
spin_unlock(&uprobes_treelock);
return !!n;
}
/*
* Called in context of a munmap of a vma.
*/
void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
if (no_uprobe_events() || !valid_vma(vma, false))
return;
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
return;
if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
return;
if (vma_has_uprobes(vma, start, end))
set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}
/* Slot allocation for XOL */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
struct vm_area_struct *vma;
int ret;
if (mmap_write_lock_killable(mm))
return -EINTR;
if (mm->uprobes_state.xol_area) {
ret = -EALREADY;
goto fail;
}
if (!area->vaddr) {
/* Try to map as high as possible, this is only a hint. */
area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
PAGE_SIZE, 0, 0);
if (IS_ERR_VALUE(area->vaddr)) {
ret = area->vaddr;
goto fail;
}
}
vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
&area->xol_mapping);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto fail;
}
ret = 0;
/* pairs with get_xol_area() */
smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
fail:
mmap_write_unlock(mm);
return ret;
}
static struct xol_area *__create_xol_area(unsigned long vaddr)
{
struct mm_struct *mm = current->mm;
uprobe_opcode_t insn = UPROBE_SWBP_INSN;
struct xol_area *area;
area = kmalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
goto out;
area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
GFP_KERNEL);
if (!area->bitmap)
goto free_area;
area->xol_mapping.name = "[uprobes]";
area->xol_mapping.fault = NULL;
area->xol_mapping.pages = area->pages;
area->pages[0] = alloc_page(GFP_HIGHUSER);
if (!area->pages[0])
goto free_bitmap;
area->pages[1] = NULL;
area->vaddr = vaddr;
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
atomic_set(&area->slot_count, 1);
arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
if (!xol_add_vma(mm, area))
return area;
__free_page(area->pages[0]);
free_bitmap:
kfree(area->bitmap);
free_area:
kfree(area);
out:
return NULL;
}
/*
* get_xol_area - Allocate process's xol_area if necessary.
* This area will be used for storing instructions for execution out of line.
*
* Returns the allocated area or NULL.
*/
static struct xol_area *get_xol_area(void)
{
struct mm_struct *mm = current->mm;
struct xol_area *area;
if (!mm->uprobes_state.xol_area)
__create_xol_area(0);
/* Pairs with xol_add_vma() smp_store_release() */
area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */
return area;
}
/*
* uprobe_clear_state - Free the area allocated for slots.
*/
void uprobe_clear_state(struct mm_struct *mm)
{
struct xol_area *area = mm->uprobes_state.xol_area;
mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(NULL, mm);
mutex_unlock(&delayed_uprobe_lock);
if (!area)
return;
put_page(area->pages[0]);
kfree(area->bitmap);
kfree(area);
}
void uprobe_start_dup_mmap(void)
{
percpu_down_read(&dup_mmap_sem);
}
void uprobe_end_dup_mmap(void)
{
percpu_up_read(&dup_mmap_sem);
}
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
set_bit(MMF_HAS_UPROBES, &newmm->flags);
/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
set_bit(MMF_RECALC_UPROBES, &newmm->flags);
}
}
/*
* - search for a free slot.
*/
static unsigned long xol_take_insn_slot(struct xol_area *area)
{
unsigned long slot_addr;
int slot_nr;
do {
slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
if (slot_nr < UINSNS_PER_PAGE) {
if (!test_and_set_bit(slot_nr, area->bitmap))
break;
slot_nr = UINSNS_PER_PAGE;
continue;
}
wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
} while (slot_nr >= UINSNS_PER_PAGE);
slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
atomic_inc(&area->slot_count);
return slot_addr;
}
/*
* xol_get_insn_slot - allocate a slot for xol.
* Returns the allocated slot address or 0.
*/
static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
{
struct xol_area *area;
unsigned long xol_vaddr;
area = get_xol_area();
if (!area)
return 0;
xol_vaddr = xol_take_insn_slot(area);
if (unlikely(!xol_vaddr))
return 0;
arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
&uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
return xol_vaddr;
}
/*
* xol_free_insn_slot - If slot was earlier allocated by
* @xol_get_insn_slot(), make the slot available for
* subsequent requests.
*/
static void xol_free_insn_slot(struct task_struct *tsk)
{
struct xol_area *area;
unsigned long vma_end;
unsigned long slot_addr;
if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
return;
slot_addr = tsk->utask->xol_vaddr;
if (unlikely(!slot_addr))
return;
area = tsk->mm->uprobes_state.xol_area;
vma_end = area->vaddr + PAGE_SIZE;
if (area->vaddr <= slot_addr && slot_addr < vma_end) {
unsigned long offset;
int slot_nr;
offset = slot_addr - area->vaddr;
slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
if (slot_nr >= UINSNS_PER_PAGE)
return;
clear_bit(slot_nr, area->bitmap);
atomic_dec(&area->slot_count);
smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
if (waitqueue_active(&area->wq))
wake_up(&area->wq);
tsk->utask->xol_vaddr = 0;
}
}
void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
void *src, unsigned long len)
{
/* Initialize the slot */
copy_to_page(page, vaddr, src, len);
/*
* We probably need flush_icache_user_page() but it needs vma.
* This should work on most of architectures by default. If
* architecture needs to do something different it can define
* its own version of the function.
*/
flush_dcache_page(page);
}
/**
* uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
* @regs: Reflects the saved state of the task after it has hit a breakpoint
* instruction.
* Return the address of the breakpoint instruction.
*/
unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
{
return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
}
unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
{
struct uprobe_task *utask = current->utask;
if (unlikely(utask && utask->active_uprobe))
return utask->vaddr;
return instruction_pointer(regs);
}
static struct return_instance *free_ret_instance(struct return_instance *ri)
{
struct return_instance *next = ri->next;
put_uprobe(ri->uprobe);
kfree(ri);
return next;
}
/*
* Called with no locks held.
* Called in context of an exiting or an exec-ing thread.
*/
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
struct return_instance *ri;
if (!utask)
return;
if (utask->active_uprobe)
put_uprobe(utask->active_uprobe);
ri = utask->return_instances;
while (ri)
ri = free_ret_instance(ri);
xol_free_insn_slot(t);
kfree(utask);
t->utask = NULL;
}
/*
* Allocate a uprobe_task object for the task if necessary.
* Called when the thread hits a breakpoint.
*
* Returns:
* - pointer to new uprobe_task on success
* - NULL otherwise
*/
static struct uprobe_task *get_utask(void)
{
if (!current->utask)
current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
return current->utask;
}
static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
{
struct uprobe_task *n_utask;
struct return_instance **p, *o, *n;
n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
if (!n_utask)
return -ENOMEM;
t->utask = n_utask;
p = &n_utask->return_instances;
for (o = o_utask->return_instances; o; o = o->next) {
n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
if (!n)
return -ENOMEM;
*n = *o;
get_uprobe(n->uprobe);
n->next = NULL;
*p = n;
p = &n->next;
n_utask->depth++;
}
return 0;
}
static void uprobe_warn(struct task_struct *t, const char *msg)
{
pr_warn("uprobe: %s:%d failed to %s\n",
current->comm, current->pid, msg);
}
static void dup_xol_work(struct callback_head *work)
{
if (current->flags & PF_EXITING)
return;
if (!__create_xol_area(current->utask->dup_xol_addr) &&
!fatal_signal_pending(current))
uprobe_warn(current, "dup xol area");
}
/*
* Called in context of a new clone/fork from copy_process.
*/
void uprobe_copy_process(struct task_struct *t, unsigned long flags)
{
struct uprobe_task *utask = current->utask;
struct mm_struct *mm = current->mm;
struct xol_area *area;
t->utask = NULL;
if (!utask || !utask->return_instances)
return;
if (mm == t->mm && !(flags & CLONE_VFORK))
return;
if (dup_utask(t, utask))
return uprobe_warn(t, "dup ret instances");
/* The task can fork() after dup_xol_work() fails */
area = mm->uprobes_state.xol_area;
if (!area)
return uprobe_warn(t, "dup xol area");
if (mm == t->mm)
return;
t->utask->dup_xol_addr = area->vaddr;
init_task_work(&t->utask->dup_xol_work, dup_xol_work);
task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
}
/*
* Current area->vaddr notion assume the trampoline address is always
* equal area->vaddr.
*
* Returns -1 in case the xol_area is not allocated.
*/
static unsigned long get_trampoline_vaddr(void)
{
struct xol_area *area;
unsigned long trampoline_vaddr = -1;
/* Pairs with xol_add_vma() smp_store_release() */
area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
if (area)
trampoline_vaddr = area->vaddr;
return trampoline_vaddr;
}
static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
struct pt_regs *regs)
{
struct return_instance *ri = utask->return_instances;
enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
ri = free_ret_instance(ri);
utask->depth--;
}
utask->return_instances = ri;
}
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
{
struct return_instance *ri;
struct uprobe_task *utask;
unsigned long orig_ret_vaddr, trampoline_vaddr;
bool chained;
if (!get_xol_area())
return;
utask = get_utask();
if (!utask)
return;
if (utask->depth >= MAX_URETPROBE_DEPTH) {
printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
" nestedness limit pid/tgid=%d/%d\n",
current->pid, current->tgid);
return;
}
ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
if (!ri)
return;
trampoline_vaddr = get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
if (orig_ret_vaddr == -1)
goto fail;
/* drop the entries invalidated by longjmp() */
chained = (orig_ret_vaddr == trampoline_vaddr);
cleanup_return_instances(utask, chained, regs);
/*
* We don't want to keep trampoline address in stack, rather keep the
* original return address of first caller thru all the consequent
* instances. This also makes breakpoint unwrapping easier.
*/
if (chained) {
if (!utask->return_instances) {
/*
* This situation is not possible. Likely we have an
* attack from user-space.
*/
uprobe_warn(current, "handle tail call");
goto fail;
}
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
ri->uprobe = get_uprobe(uprobe);
ri->func = instruction_pointer(regs);
ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
ri->chained = chained;
utask->depth++;
ri->next = utask->return_instances;
utask->return_instances = ri;
return;
fail:
kfree(ri);
}
/* Prepare to single-step probed instruction out of line. */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
struct uprobe_task *utask;
unsigned long xol_vaddr;
int err;
utask = get_utask();
if (!utask)
return -ENOMEM;
xol_vaddr = xol_get_insn_slot(uprobe);
if (!xol_vaddr)
return -ENOMEM;
utask->xol_vaddr = xol_vaddr;
utask->vaddr = bp_vaddr;
err = arch_uprobe_pre_xol(&uprobe->arch, regs);
if (unlikely(err)) {
xol_free_insn_slot(current);
return err;
}
utask->active_uprobe = uprobe;
utask->state = UTASK_SSTEP;
return 0;
}
/*
* If we are singlestepping, then ensure this thread is not connected to
* non-fatal signals until completion of singlestep. When xol insn itself
* triggers the signal, restart the original insn even if the task is
* already SIGKILL'ed (since coredump should report the correct ip). This
* is even more important if the task has a handler for SIGSEGV/etc, The
* _same_ instruction should be repeated again after return from the signal
* handler, and SSTEP can never finish in this case.
*/
bool uprobe_deny_signal(void)
{
struct task_struct *t = current;
struct uprobe_task *utask = t->utask;
if (likely(!utask || !utask->active_uprobe)) return false; WARN_ON_ONCE(utask->state != UTASK_SSTEP);
if (task_sigpending(t)) {
spin_lock_irq(&t->sighand->siglock);
clear_tsk_thread_flag(t, TIF_SIGPENDING);
spin_unlock_irq(&t->sighand->siglock);
if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED;
set_tsk_thread_flag(t, TIF_UPROBE);
}
}
return true;
}
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
struct vm_area_struct *vma;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (!valid_vma(vma, false))
continue;
/*
* This is not strictly accurate, we can race with
* uprobe_unregister() and see the already removed
* uprobe if delete_uprobe() was not yet called.
* Or this uprobe can be filtered out.
*/
if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
return;
}
clear_bit(MMF_HAS_UPROBES, &mm->flags);
}
static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
{
struct page *page;
uprobe_opcode_t opcode;
int result;
if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
return -EINVAL;
pagefault_disable();
result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
pagefault_enable();
if (likely(result == 0))
goto out;
/*
* The NULL 'tsk' here ensures that any faults that occur here
* will not be accounted to the task. 'mm' *is* current->mm,
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
NULL, NULL);
if (result < 0)
return result;
copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
/* This needs to return true for any variant of the trap insn */
return is_trap_insn(&opcode);
}
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
{
struct mm_struct *mm = current->mm;
struct uprobe *uprobe = NULL;
struct vm_area_struct *vma;
mmap_read_lock(mm);
vma = vma_lookup(mm, bp_vaddr);
if (vma) {
if (valid_vma(vma, false)) {
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
uprobe = find_uprobe(inode, offset);
}
if (!uprobe)
*is_swbp = is_trap_at_addr(mm, bp_vaddr);
} else {
*is_swbp = -EFAULT;
}
if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
mmf_recalc_uprobes(mm);
mmap_read_unlock(mm);
return uprobe;
}
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
int remove = UPROBE_HANDLER_REMOVE;
bool need_prep = false; /* prepare return uprobe, when needed */
down_read(&uprobe->register_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
int rc = 0;
if (uc->handler) {
rc = uc->handler(uc, regs);
WARN(rc & ~UPROBE_HANDLER_MASK,
"bad rc=0x%x from %ps()\n", rc, uc->handler);
}
if (uc->ret_handler)
need_prep = true;
remove &= rc;
}
if (need_prep && !remove)
prepare_uretprobe(uprobe, regs); /* put bp at return */
if (remove && uprobe->consumers) {
WARN_ON(!uprobe_is_active(uprobe));
unapply_uprobe(uprobe, current->mm);
}
up_read(&uprobe->register_rwsem);
}
static void
handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
{
struct uprobe *uprobe = ri->uprobe;
struct uprobe_consumer *uc;
down_read(&uprobe->register_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
if (uc->ret_handler)
uc->ret_handler(uc, ri->func, regs);
}
up_read(&uprobe->register_rwsem);
}
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
{
bool chained;
do {
chained = ri->chained;
ri = ri->next; /* can't be NULL if chained */
} while (chained);
return ri;
}
static void handle_trampoline(struct pt_regs *regs)
{
struct uprobe_task *utask;
struct return_instance *ri, *next;
bool valid;
utask = current->utask;
if (!utask)
goto sigill;
ri = utask->return_instances;
if (!ri)
goto sigill;
do {
/*
* We should throw out the frames invalidated by longjmp().
* If this chain is valid, then the next one should be alive
* or NULL; the latter case means that nobody but ri->func
* could hit this trampoline on return. TODO: sigaltstack().
*/
next = find_next_ret_chain(ri);
valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
instruction_pointer_set(regs, ri->orig_ret_vaddr);
do {
if (valid)
handle_uretprobe_chain(ri, regs);
ri = free_ret_instance(ri);
utask->depth--;
} while (ri != next);
} while (!valid);
utask->return_instances = ri;
return;
sigill:
uprobe_warn(current, "handle uretprobe, sending SIGILL.");
force_sig(SIGILL);
}
bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
{
return false;
}
bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
struct pt_regs *regs)
{
return true;
}
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
*/
static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == get_trampoline_vaddr())
return handle_trampoline(regs);
uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
force_sig(SIGTRAP);
} else {
/*
* Either we raced with uprobe_unregister() or we can't
* access this memory. The latter is only possible if
* another thread plays with our ->mm. In both cases
* we can simply restart. If this vma was unmapped we
* can pretend this insn was not executed yet and get
* the (correct) SIGSEGV after restart.
*/
instruction_pointer_set(regs, bp_vaddr);
}
return;
}
/* change it in advance for ->handler() and restart */
instruction_pointer_set(regs, bp_vaddr);
/*
* TODO: move copy_insn/etc into _register and remove this hack.
* After we hit the bp, _unregister + _register can install the
* new and not-yet-analyzed uprobe at the same address, restart.
*/
if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
goto out;
/*
* Pairs with the smp_wmb() in prepare_uprobe().
*
* Guarantees that if we see the UPROBE_COPY_INSN bit set, then
* we must also see the stores to &uprobe->arch performed by the
* prepare_uprobe() call.
*/
smp_rmb();
/* Tracing handlers use ->utask to communicate with fetch methods */
if (!get_utask())
goto out;
if (arch_uprobe_ignore(&uprobe->arch, regs))
goto out;
handler_chain(uprobe, regs);
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
if (!pre_ssout(uprobe, regs, bp_vaddr))
return;
/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
put_uprobe(uprobe);
}
/*
* Perform required fix-ups and disable singlestep.
* Allow pending signals to take effect.
*/
static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
{
struct uprobe *uprobe;
int err = 0;
uprobe = utask->active_uprobe;
if (utask->state == UTASK_SSTEP_ACK)
err = arch_uprobe_post_xol(&uprobe->arch, regs);
else if (utask->state == UTASK_SSTEP_TRAPPED)
arch_uprobe_abort_xol(&uprobe->arch, regs);
else
WARN_ON_ONCE(1);
put_uprobe(uprobe);
utask->active_uprobe = NULL;
utask->state = UTASK_RUNNING;
xol_free_insn_slot(current);
spin_lock_irq(¤t->sighand->siglock);
recalc_sigpending(); /* see uprobe_deny_signal() */
spin_unlock_irq(¤t->sighand->siglock);
if (unlikely(err)) {
uprobe_warn(current, "execute the probed insn, sending SIGILL.");
force_sig(SIGILL);
}
}
/*
* On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
* allows the thread to return from interrupt. After that handle_swbp()
* sets utask->active_uprobe.
*
* On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
* and allows the thread to return from interrupt.
*
* While returning to userspace, thread notices the TIF_UPROBE flag and calls
* uprobe_notify_resume().
*/
void uprobe_notify_resume(struct pt_regs *regs)
{
struct uprobe_task *utask;
clear_thread_flag(TIF_UPROBE);
utask = current->utask;
if (utask && utask->active_uprobe)
handle_singlestep(utask, regs);
else
handle_swbp(regs);
}
/*
* uprobe_pre_sstep_notifier gets called from interrupt context as part of
* notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
*/
int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
if (!current->mm)
return 0;
if (!test_bit(MMF_HAS_UPROBES, ¤t->mm->flags) &&
(!current->utask || !current->utask->return_instances))
return 0;
set_thread_flag(TIF_UPROBE);
return 1;
}
/*
* uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
* mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
*/
int uprobe_post_sstep_notifier(struct pt_regs *regs)
{
struct uprobe_task *utask = current->utask;
if (!current->mm || !utask || !utask->active_uprobe)
/* task is currently not uprobed */
return 0;
utask->state = UTASK_SSTEP_ACK;
set_thread_flag(TIF_UPROBE);
return 1;
}
static struct notifier_block uprobe_exception_nb = {
.notifier_call = arch_uprobe_exception_notify,
.priority = INT_MAX-1, /* notified after kprobes, kgdb */
};
void __init uprobes_init(void)
{
int i;
for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
BUG_ON(register_die_notifier(&uprobe_exception_nb));
}
// SPDX-License-Identifier: GPL-2.0-only
/*
* fs/kernfs/dir.c - kernfs directory implementation
*
* Copyright (c) 2001-3 Patrick Mochel
* Copyright (c) 2007 SUSE Linux Products GmbH
* Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
*/
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/hash.h>
#include "kernfs-internal.h"
DECLARE_RWSEM(kernfs_rwsem);
static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */
static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
static bool kernfs_active(struct kernfs_node *kn)
{
lockdep_assert_held(&kernfs_rwsem);
return atomic_read(&kn->active) >= 0;
}
static bool kernfs_lockdep(struct kernfs_node *kn)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
return kn->flags & KERNFS_LOCKDEP;
#else
return false;
#endif
}
static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
{
if (!kn)
return strlcpy(buf, "(null)", buflen);
return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
}
/* kernfs_node_depth - compute depth from @from to @to */
static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
{
size_t depth = 0;
while (to->parent && to != from) {
depth++;
to = to->parent;
}
return depth;
}
static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
struct kernfs_node *b)
{
size_t da, db;
struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b);
if (ra != rb)
return NULL;
da = kernfs_depth(ra->kn, a);
db = kernfs_depth(rb->kn, b);
while (da > db) {
a = a->parent;
da--;
}
while (db > da) {
b = b->parent;
db--;
}
/* worst case b and a will be the same at root */
while (b != a) {
b = b->parent;
a = a->parent;
}
return a;
}
/**
* kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to,
* where kn_from is treated as root of the path.
* @kn_from: kernfs node which should be treated as root for the path
* @kn_to: kernfs node to which path is needed
* @buf: buffer to copy the path into
* @buflen: size of @buf
*
* We need to handle couple of scenarios here:
* [1] when @kn_from is an ancestor of @kn_to at some level
* kn_from: /n1/n2/n3
* kn_to: /n1/n2/n3/n4/n5
* result: /n4/n5
*
* [2] when @kn_from is on a different hierarchy and we need to find common
* ancestor between @kn_from and @kn_to.
* kn_from: /n1/n2/n3/n4
* kn_to: /n1/n2/n5
* result: /../../n5
* OR
* kn_from: /n1/n2/n3/n4/n5 [depth=5]
* kn_to: /n1/n2/n3 [depth=3]
* result: /../..
*
* [3] when @kn_to is NULL result will be "(null)"
*
* Returns the length of the full path. If the full length is equal to or
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
struct kernfs_node *kn_from,
char *buf, size_t buflen)
{
struct kernfs_node *kn, *common;
const char parent_str[] = "/..";
size_t depth_from, depth_to, len = 0;
int i, j;
if (!kn_to)
return strlcpy(buf, "(null)", buflen);
if (!kn_from)
kn_from = kernfs_root(kn_to)->kn;
if (kn_from == kn_to)
return strlcpy(buf, "/", buflen);
if (!buf)
return -EINVAL;
common = kernfs_common_ancestor(kn_from, kn_to);
if (WARN_ON(!common))
return -EINVAL;
depth_to = kernfs_depth(common, kn_to);
depth_from = kernfs_depth(common, kn_from);
buf[0] = '\0';
for (i = 0; i < depth_from; i++)
len += strlcpy(buf + len, parent_str,
len < buflen ? buflen - len : 0);
/* Calculate how many bytes we need for the rest */
for (i = depth_to - 1; i >= 0; i--) {
for (kn = kn_to, j = 0; j < i; j++)
kn = kn->parent;
len += strlcpy(buf + len, "/",
len < buflen ? buflen - len : 0);
len += strlcpy(buf + len, kn->name,
len < buflen ? buflen - len : 0);
}
return len;
}
/**
* kernfs_name - obtain the name of a given node
* @kn: kernfs_node of interest
* @buf: buffer to copy @kn's name into
* @buflen: size of @buf
*
* Copies the name of @kn into @buf of @buflen bytes. The behavior is
* similar to strlcpy(). It returns the length of @kn's name and if @buf
* isn't long enough, it's filled upto @buflen-1 and nul terminated.
*
* Fills buffer with "(null)" if @kn is NULL.
*
* This function can be called from any context.
*/
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&kernfs_rename_lock, flags);
ret = kernfs_name_locked(kn, buf, buflen);
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
return ret;
}
/**
* kernfs_path_from_node - build path of node @to relative to @from.
* @from: parent kernfs_node relative to which we need to build the path
* @to: kernfs_node of interest
* @buf: buffer to copy @to's path into
* @buflen: size of @buf
*
* Builds @to's path relative to @from in @buf. @from and @to must
* be on the same kernfs-root. If @from is not parent of @to, then a relative
* path (which includes '..'s) as needed to reach from @from to @to is
* returned.
*
* Returns the length of the full path. If the full length is equal to or
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
char *buf, size_t buflen)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&kernfs_rename_lock, flags);
ret = kernfs_path_from_node_locked(to, from, buf, buflen);
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kernfs_path_from_node);
/**
* pr_cont_kernfs_name - pr_cont name of a kernfs_node
* @kn: kernfs_node of interest
*
* This function can be called from any context.
*/
void pr_cont_kernfs_name(struct kernfs_node *kn)
{
unsigned long flags;
spin_lock_irqsave(&kernfs_rename_lock, flags);
kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
pr_cont("%s", kernfs_pr_cont_buf);
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
}
/**
* pr_cont_kernfs_path - pr_cont path of a kernfs_node
* @kn: kernfs_node of interest
*
* This function can be called from any context.
*/
void pr_cont_kernfs_path(struct kernfs_node *kn)
{
unsigned long flags;
int sz;
spin_lock_irqsave(&kernfs_rename_lock, flags);
sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
sizeof(kernfs_pr_cont_buf));
if (sz < 0) {
pr_cont("(error)");
goto out;
}
if (sz >= sizeof(kernfs_pr_cont_buf)) {
pr_cont("(name too long)");
goto out;
}
pr_cont("%s", kernfs_pr_cont_buf);
out:
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
}
/**
* kernfs_get_parent - determine the parent node and pin it
* @kn: kernfs_node of interest
*
* Determines @kn's parent, pins and returns it. This function can be
* called from any context.
*/
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
struct kernfs_node *parent;
unsigned long flags;
spin_lock_irqsave(&kernfs_rename_lock, flags);
parent = kn->parent;
kernfs_get(parent);
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
return parent;
}
/**
* kernfs_name_hash
* @name: Null terminated string to hash
* @ns: Namespace tag to hash
*
* Returns 31 bit hash of ns + name (so it fits in an off_t )
*/
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
unsigned long hash = init_name_hash(ns);
unsigned int len = strlen(name);
while (len--)
hash = partial_name_hash(*name++, hash);
hash = end_name_hash(hash);
hash &= 0x7fffffffU;
/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
if (hash < 2)
hash += 2; if (hash >= INT_MAX)
hash = INT_MAX - 1;
return hash;
}
static int kernfs_name_compare(unsigned int hash, const char *name,
const void *ns, const struct kernfs_node *kn)
{
if (hash < kn->hash)
return -1;
if (hash > kn->hash)
return 1;
if (ns < kn->ns)
return -1;
if (ns > kn->ns)
return 1;
return strcmp(name, kn->name);
}
static int kernfs_sd_compare(const struct kernfs_node *left,
const struct kernfs_node *right)
{
return kernfs_name_compare(left->hash, left->name, left->ns, right);
}
/**
* kernfs_link_sibling - link kernfs_node into sibling rbtree
* @kn: kernfs_node of interest
*
* Link @kn into its sibling rbtree which starts from
* @kn->parent->dir.children.
*
* Locking:
* kernfs_rwsem held exclusive
*
* RETURNS:
* 0 on susccess -EEXIST on failure.
*/
static int kernfs_link_sibling(struct kernfs_node *kn)
{
struct rb_node **node = &kn->parent->dir.children.rb_node;
struct rb_node *parent = NULL;
while (*node) {
struct kernfs_node *pos;
int result;
pos = rb_to_kn(*node);
parent = *node;
result = kernfs_sd_compare(kn, pos);
if (result < 0)
node = &pos->rb.rb_left; else if (result > 0) node = &pos->rb.rb_right;
else
return -EEXIST;
}
/* add new node and rebalance the tree */
rb_link_node(&kn->rb, parent, node);
rb_insert_color(&kn->rb, &kn->parent->dir.children);
/* successfully added, account subdir number */
if (kernfs_type(kn) == KERNFS_DIR)
kn->parent->dir.subdirs++; kernfs_inc_rev(kn->parent);
return 0;
}
/**
* kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
* @kn: kernfs_node of interest
*
* Try to unlink @kn from its sibling rbtree which starts from
* kn->parent->dir.children. Returns %true if @kn was actually
* removed, %false if @kn wasn't on the rbtree.
*
* Locking:
* kernfs_rwsem held exclusive
*/
static bool kernfs_unlink_sibling(struct kernfs_node *kn)
{
if (RB_EMPTY_NODE(&kn->rb))
return false;
if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--;
kernfs_inc_rev(kn->parent);
rb_erase(&kn->rb, &kn->parent->dir.children);
RB_CLEAR_NODE(&kn->rb);
return true;
}
/**
* kernfs_get_active - get an active reference to kernfs_node
* @kn: kernfs_node to get an active reference to
*
* Get an active reference of @kn. This function is noop if @kn
* is NULL.
*
* RETURNS:
* Pointer to @kn on success, NULL on failure.
*/
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
{
if (unlikely(!kn))
return NULL;
if (!atomic_inc_unless_negative(&kn->active))
return NULL;
if (kernfs_lockdep(kn))
rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
return kn;
}
/**
* kernfs_put_active - put an active reference to kernfs_node
* @kn: kernfs_node to put an active reference to
*
* Put an active reference to @kn. This function is noop if @kn
* is NULL.
*/
void kernfs_put_active(struct kernfs_node *kn)
{
int v;
if (unlikely(!kn))
return;
if (kernfs_lockdep(kn))
rwsem_release(&kn->dep_map, _RET_IP_);
v = atomic_dec_return(&kn->active);
if (likely(v != KN_DEACTIVATED_BIAS))
return;
wake_up_all(&kernfs_root(kn)->deactivate_waitq);
}
/**
* kernfs_drain - drain kernfs_node
* @kn: kernfs_node to drain
*
* Drain existing usages and nuke all existing mmaps of @kn. Mutiple
* removers may invoke this function concurrently on @kn and all will
* return after draining is complete.
*/
static void kernfs_drain(struct kernfs_node *kn)
__releases(&kernfs_rwsem) __acquires(&kernfs_rwsem)
{
struct kernfs_root *root = kernfs_root(kn);
lockdep_assert_held_write(&kernfs_rwsem);
WARN_ON_ONCE(kernfs_active(kn)); up_write(&kernfs_rwsem);
if (kernfs_lockdep(kn)) {
rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
lock_contended(&kn->dep_map, _RET_IP_);
}
/* but everyone should wait for draining */
wait_event(root->deactivate_waitq,
atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
if (kernfs_lockdep(kn)) {
lock_acquired(&kn->dep_map, _RET_IP_);
rwsem_release(&kn->dep_map, _RET_IP_);
}
kernfs_drain_open_files(kn);
down_write(&kernfs_rwsem);
}
/**
* kernfs_get - get a reference count on a kernfs_node
* @kn: the target kernfs_node
*/
void kernfs_get(struct kernfs_node *kn)
{
if (kn) { WARN_ON(!atomic_read(&kn->count));
atomic_inc(&kn->count);
}
}
EXPORT_SYMBOL_GPL(kernfs_get);
/**
* kernfs_put - put a reference count on a kernfs_node
* @kn: the target kernfs_node
*
* Put a reference count of @kn and destroy it if it reached zero.
*/
void kernfs_put(struct kernfs_node *kn)
{
struct kernfs_node *parent;
struct kernfs_root *root;
if (!kn || !atomic_dec_and_test(&kn->count))
return;
root = kernfs_root(kn);
repeat:
/*
* Moving/renaming is always done while holding reference.
* kn->parent won't change beneath us.
*/
parent = kn->parent;
WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
"kernfs_put: %s/%s: released with incorrect active_ref %d\n",
parent ? parent->name : "", kn->name, atomic_read(&kn->active));
if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); kfree_const(kn->name);
if (kn->iattr) {
simple_xattrs_free(&kn->iattr->xattrs); kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
}
spin_lock(&kernfs_idr_lock);
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
spin_unlock(&kernfs_idr_lock);
kmem_cache_free(kernfs_node_cache, kn);
kn = parent;
if (kn) {
if (atomic_dec_and_test(&kn->count))
goto repeat;
} else {
/* just released the root kn, free @root too */
idr_destroy(&root->ino_idr);
kfree(root);
}
}
EXPORT_SYMBOL_GPL(kernfs_put);
/**
* kernfs_node_from_dentry - determine kernfs_node associated with a dentry
* @dentry: the dentry in question
*
* Return the kernfs_node associated with @dentry. If @dentry is not a
* kernfs one, %NULL is returned.
*
* While the returned kernfs_node will stay accessible as long as @dentry
* is accessible, the returned node can be in any state and the caller is
* fully responsible for determining what's accessible.
*/
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{
if (dentry->d_sb->s_op == &kernfs_sops)
return kernfs_dentry_node(dentry);
return NULL;
}
static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
struct kernfs_node *parent,
const char *name, umode_t mode,
kuid_t uid, kgid_t gid,
unsigned flags)
{
struct kernfs_node *kn;
u32 id_highbits;
int ret;
name = kstrdup_const(name, GFP_KERNEL);
if (!name)
return NULL;
kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
if (!kn)
goto err_out1;
idr_preload(GFP_KERNEL);
spin_lock(&kernfs_idr_lock);
ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++;
id_highbits = root->id_highbits;
root->last_id_lowbits = ret;
spin_unlock(&kernfs_idr_lock);
idr_preload_end();
if (ret < 0)
goto err_out2;
kn->id = (u64)id_highbits << 32 | ret;
atomic_set(&kn->count, 1);
atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
RB_CLEAR_NODE(&kn->rb);
kn->name = name;
kn->mode = mode;
kn->flags = flags;
if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) {
struct iattr iattr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = uid,
.ia_gid = gid,
};
ret = __kernfs_setattr(kn, &iattr);
if (ret < 0) goto err_out3;
}
if (parent) { ret = security_kernfs_init_security(parent, kn); if (ret)
goto err_out3;
}
return kn;
err_out3:
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
err_out2:
kmem_cache_free(kernfs_node_cache, kn);
err_out1:
kfree_const(name);
return NULL;
}
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
const char *name, umode_t mode,
kuid_t uid, kgid_t gid,
unsigned flags)
{
struct kernfs_node *kn;
kn = __kernfs_new_node(kernfs_root(parent), parent,
name, mode, uid, gid, flags);
if (kn) {
kernfs_get(parent);
kn->parent = parent;
}
return kn;
}
/*
* kernfs_find_and_get_node_by_id - get kernfs_node from node id
* @root: the kernfs root
* @id: the target node id
*
* @id's lower 32bits encode ino and upper gen. If the gen portion is
* zero, all generations are matched.
*
* RETURNS:
* NULL on failure. Return a kernfs node with reference counter incremented
*/
struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
u64 id)
{
struct kernfs_node *kn;
ino_t ino = kernfs_id_ino(id);
u32 gen = kernfs_id_gen(id);
spin_lock(&kernfs_idr_lock);
kn = idr_find(&root->ino_idr, (u32)ino);
if (!kn)
goto err_unlock;
if (sizeof(ino_t) >= sizeof(u64)) {
/* we looked up with the low 32bits, compare the whole */
if (kernfs_ino(kn) != ino)
goto err_unlock;
} else {
/* 0 matches all generations */
if (unlikely(gen && kernfs_gen(kn) != gen))
goto err_unlock;
}
/*
* ACTIVATED is protected with kernfs_mutex but it was clear when
* @kn was added to idr and we just wanna see it set. No need to
* grab kernfs_mutex.
*/
if (unlikely(!(kn->flags & KERNFS_ACTIVATED) ||
!atomic_inc_not_zero(&kn->count)))
goto err_unlock;
spin_unlock(&kernfs_idr_lock);
return kn;
err_unlock:
spin_unlock(&kernfs_idr_lock);
return NULL;
}
/**
* kernfs_add_one - add kernfs_node to parent without warning
* @kn: kernfs_node to be added
*
* The caller must already have initialized @kn->parent. This
* function increments nlink of the parent's inode if @kn is a
* directory and link into the children list of the parent.
*
* RETURNS:
* 0 on success, -EEXIST if entry with the given name already
* exists.
*/
int kernfs_add_one(struct kernfs_node *kn)
{
struct kernfs_node *parent = kn->parent;
struct kernfs_iattrs *ps_iattr;
bool has_ns;
int ret;
down_write(&kernfs_rwsem);
ret = -EINVAL;
has_ns = kernfs_ns_enabled(parent);
if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
has_ns ? "required" : "invalid", parent->name, kn->name))
goto out_unlock;
if (kernfs_type(parent) != KERNFS_DIR)
goto out_unlock;
ret = -ENOENT;
if (parent->flags & KERNFS_EMPTY_DIR)
goto out_unlock;
if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
goto out_unlock;
kn->hash = kernfs_name_hash(kn->name, kn->ns);
ret = kernfs_link_sibling(kn);
if (ret)
goto out_unlock;
/* Update timestamps on the parent */
ps_iattr = parent->iattr;
if (ps_iattr) {
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
up_write(&kernfs_rwsem);
/*
* Activate the new node unless CREATE_DEACTIVATED is requested.
* If not activated here, the kernfs user is responsible for
* activating the node with kernfs_activate(). A node which hasn't
* been activated is not visible to userland and its removal won't
* trigger deactivation.
*/
if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
kernfs_activate(kn);
return 0;
out_unlock:
up_write(&kernfs_rwsem); return ret;
}
/**
* kernfs_find_ns - find kernfs_node with the given name
* @parent: kernfs_node to search under
* @name: name to look for
* @ns: the namespace tag to use
*
* Look for kernfs_node with name @name under @parent. Returns pointer to
* the found kernfs_node on success, %NULL on failure.
*/
static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
const unsigned char *name,
const void *ns)
{
struct rb_node *node = parent->dir.children.rb_node; bool has_ns = kernfs_ns_enabled(parent);
unsigned int hash;
lockdep_assert_held(&kernfs_rwsem);
if (has_ns != (bool)ns) {
WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
has_ns ? "required" : "invalid", parent->name, name);
return NULL;
}
hash = kernfs_name_hash(name, ns);
while (node) {
struct kernfs_node *kn;
int result;
kn = rb_to_kn(node);
result = kernfs_name_compare(hash, name, ns, kn);
if (result < 0)
node = node->rb_left; else if (result > 0) node = node->rb_right;
else
return kn;
}
return NULL;
}
static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
const unsigned char *path,
const void *ns)
{
size_t len;
char *p, *name;
lockdep_assert_held_read(&kernfs_rwsem);
/* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
spin_lock_irq(&kernfs_rename_lock);
len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
if (len >= sizeof(kernfs_pr_cont_buf)) {
spin_unlock_irq(&kernfs_rename_lock);
return NULL;
}
p = kernfs_pr_cont_buf;
while ((name = strsep(&p, "/")) && parent) {
if (*name == '\0')
continue;
parent = kernfs_find_ns(parent, name, ns);
}
spin_unlock_irq(&kernfs_rename_lock);
return parent;
}
/**
* kernfs_find_and_get_ns - find and get kernfs_node with the given name
* @parent: kernfs_node to search under
* @name: name to look for
* @ns: the namespace tag to use
*
* Look for kernfs_node with name @name under @parent and get a reference
* if found. This function may sleep and returns pointer to the found
* kernfs_node on success, %NULL on failure.
*/
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
const char *name, const void *ns)
{
struct kernfs_node *kn;
down_read(&kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns);
kernfs_get(kn);
up_read(&kernfs_rwsem);
return kn;
}
EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
/**
* kernfs_walk_and_get_ns - find and get kernfs_node with the given path
* @parent: kernfs_node to search under
* @path: path to look for
* @ns: the namespace tag to use
*
* Look for kernfs_node with path @path under @parent and get a reference
* if found. This function may sleep and returns pointer to the found
* kernfs_node on success, %NULL on failure.
*/
struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
const char *path, const void *ns)
{
struct kernfs_node *kn;
down_read(&kernfs_rwsem);
kn = kernfs_walk_ns(parent, path, ns);
kernfs_get(kn);
up_read(&kernfs_rwsem);
return kn;
}
/**
* kernfs_create_root - create a new kernfs hierarchy
* @scops: optional syscall operations for the hierarchy
* @flags: KERNFS_ROOT_* flags
* @priv: opaque data associated with the new directory
*
* Returns the root of the new hierarchy on success, ERR_PTR() value on
* failure.
*/
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
unsigned int flags, void *priv)
{
struct kernfs_root *root;
struct kernfs_node *kn;
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);
idr_init(&root->ino_idr);
INIT_LIST_HEAD(&root->supers);
/*
* On 64bit ino setups, id is ino. On 32bit, low 32bits are ino.
* High bits generation. The starting value for both ino and
* genenration is 1. Initialize upper 32bit allocation
* accordingly.
*/
if (sizeof(ino_t) >= sizeof(u64))
root->id_highbits = 0;
else
root->id_highbits = 1;
kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO,
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
KERNFS_DIR);
if (!kn) {
idr_destroy(&root->ino_idr);
kfree(root);
return ERR_PTR(-ENOMEM);
}
kn->priv = priv;
kn->dir.root = root;
root->syscall_ops = scops;
root->flags = flags;
root->kn = kn;
init_waitqueue_head(&root->deactivate_waitq);
if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
kernfs_activate(kn);
return root;
}
/**
* kernfs_destroy_root - destroy a kernfs hierarchy
* @root: root of the hierarchy to destroy
*
* Destroy the hierarchy anchored at @root by removing all existing
* directories and destroying @root.
*/
void kernfs_destroy_root(struct kernfs_root *root)
{
kernfs_remove(root->kn); /* will also free @root */
}
/**
* kernfs_create_dir_ns - create a directory
* @parent: parent in which to create a new directory
* @name: name of the new directory
* @mode: mode of the new directory
* @uid: uid of the new directory
* @gid: gid of the new directory
* @priv: opaque data associated with the new directory
* @ns: optional namespace tag of the directory
*
* Returns the created node on success, ERR_PTR() value on failure.
*/
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
const char *name, umode_t mode,
kuid_t uid, kgid_t gid,
void *priv, const void *ns)
{
struct kernfs_node *kn;
int rc;
/* allocate */
kn = kernfs_new_node(parent, name, mode | S_IFDIR,
uid, gid, KERNFS_DIR);
if (!kn)
return ERR_PTR(-ENOMEM);
kn->dir.root = parent->dir.root;
kn->ns = ns;
kn->priv = priv;
/* link in */
rc = kernfs_add_one(kn);
if (!rc)
return kn;
kernfs_put(kn);
return ERR_PTR(rc);
}
/**
* kernfs_create_empty_dir - create an always empty directory
* @parent: parent in which to create a new directory
* @name: name of the new directory
*
* Returns the created node on success, ERR_PTR() value on failure.
*/
struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
const char *name)
{
struct kernfs_node *kn;
int rc;
/* allocate */
kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR,
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR);
if (!kn)
return ERR_PTR(-ENOMEM);
kn->flags |= KERNFS_EMPTY_DIR;
kn->dir.root = parent->dir.root;
kn->ns = NULL;
kn->priv = NULL;
/* link in */
rc = kernfs_add_one(kn);
if (!rc)
return kn;
kernfs_put(kn);
return ERR_PTR(rc);
}
static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
{
struct kernfs_node *kn;
if (flags & LOOKUP_RCU)
return -ECHILD;
/* Negative hashed dentry? */
if (d_really_is_negative(dentry)) {
struct kernfs_node *parent;
/* If the kernfs parent node has changed discard and
* proceed to ->lookup.
*/
down_read(&kernfs_rwsem);
spin_lock(&dentry->d_lock);
parent = kernfs_dentry_node(dentry->d_parent);
if (parent) {
if (kernfs_dir_changed(parent, dentry)) {
spin_unlock(&dentry->d_lock);
up_read(&kernfs_rwsem);
return 0;
}
}
spin_unlock(&dentry->d_lock);
up_read(&kernfs_rwsem);
/* The kernfs parent node hasn't changed, leave the
* dentry negative and return success.
*/
return 1;
}
kn = kernfs_dentry_node(dentry);
down_read(&kernfs_rwsem);
/* The kernfs node has been deactivated */
if (!kernfs_active(kn))
goto out_bad;
/* The kernfs node has been moved? */
if (kernfs_dentry_node(dentry->d_parent) != kn->parent)
goto out_bad;
/* The kernfs node has been renamed */
if (strcmp(dentry->d_name.name, kn->name) != 0)
goto out_bad;
/* The kernfs node has been moved to a different namespace */
if (kn->parent && kernfs_ns_enabled(kn->parent) &&
kernfs_info(dentry->d_sb)->ns != kn->ns)
goto out_bad;
up_read(&kernfs_rwsem);
return 1;
out_bad:
up_read(&kernfs_rwsem);
return 0;
}
const struct dentry_operations kernfs_dops = {
.d_revalidate = kernfs_dop_revalidate,
};
static struct dentry *kernfs_iop_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
{
struct kernfs_node *parent = dir->i_private;
struct kernfs_node *kn;
struct inode *inode = NULL;
const void *ns = NULL;
down_read(&kernfs_rwsem);
if (kernfs_ns_enabled(parent))
ns = kernfs_info(dir->i_sb)->ns;
kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
/* attach dentry and inode */
if (kn) {
/* Inactive nodes are invisible to the VFS so don't
* create a negative.
*/
if (!kernfs_active(kn)) {
up_read(&kernfs_rwsem);
return NULL;
}
inode = kernfs_get_inode(dir->i_sb, kn);
if (!inode)
inode = ERR_PTR(-ENOMEM);
}
/*
* Needed for negative dentry validation.
* The negative dentry can be created in kernfs_iop_lookup()
* or transforms from positive dentry in dentry_unlink_inode()
* called from vfs_rmdir().
*/
if (!IS_ERR(inode))
kernfs_set_rev(parent, dentry);
up_read(&kernfs_rwsem);
/* instantiate and hash (possibly negative) dentry */
return d_splice_alias(inode, dentry);
}
static int kernfs_iop_mkdir(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *dentry,
umode_t mode)
{
struct kernfs_node *parent = dir->i_private;
struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
int ret;
if (!scops || !scops->mkdir)
return -EPERM;
if (!kernfs_get_active(parent))
return -ENODEV;
ret = scops->mkdir(parent, dentry->d_name.name, mode);
kernfs_put_active(parent);
return ret;
}
static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
{
struct kernfs_node *kn = kernfs_dentry_node(dentry);
struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
int ret;
if (!scops || !scops->rmdir)
return -EPERM;
if (!kernfs_get_active(kn))
return -ENODEV;
ret = scops->rmdir(kn);
kernfs_put_active(kn);
return ret;
}
static int kernfs_iop_rename(struct user_namespace *mnt_userns,
struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
struct kernfs_node *kn = kernfs_dentry_node(old_dentry);
struct kernfs_node *new_parent = new_dir->i_private;
struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
int ret;
if (flags)
return -EINVAL;
if (!scops || !scops->rename)
return -EPERM;
if (!kernfs_get_active(kn))
return -ENODEV;
if (!kernfs_get_active(new_parent)) {
kernfs_put_active(kn);
return -ENODEV;
}
ret = scops->rename(kn, new_parent, new_dentry->d_name.name);
kernfs_put_active(new_parent);
kernfs_put_active(kn);
return ret;
}
const struct inode_operations kernfs_dir_iops = {
.lookup = kernfs_iop_lookup,
.permission = kernfs_iop_permission,
.setattr = kernfs_iop_setattr,
.getattr = kernfs_iop_getattr,
.listxattr = kernfs_iop_listxattr,
.mkdir = kernfs_iop_mkdir,
.rmdir = kernfs_iop_rmdir,
.rename = kernfs_iop_rename,
};
static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
{
struct kernfs_node *last;
while (true) {
struct rb_node *rbn;
last = pos;
if (kernfs_type(pos) != KERNFS_DIR)
break;
rbn = rb_first(&pos->dir.children);
if (!rbn)
break;
pos = rb_to_kn(rbn);
}
return last;
}
/**
* kernfs_next_descendant_post - find the next descendant for post-order walk
* @pos: the current position (%NULL to initiate traversal)
* @root: kernfs_node whose descendants to walk
*
* Find the next descendant to visit for post-order traversal of @root's
* descendants. @root is included in the iteration and the last node to be
* visited.
*/
static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
struct kernfs_node *root)
{
struct rb_node *rbn;
lockdep_assert_held_write(&kernfs_rwsem);
/* if first iteration, visit leftmost descendant which may be root */
if (!pos)
return kernfs_leftmost_descendant(root);
/* if we visited @root, we're done */
if (pos == root)
return NULL;
/* if there's an unvisited sibling, visit its leftmost descendant */
rbn = rb_next(&pos->rb);
if (rbn)
return kernfs_leftmost_descendant(rb_to_kn(rbn));
/* no sibling left, visit parent */
return pos->parent;
}
/**
* kernfs_activate - activate a node which started deactivated
* @kn: kernfs_node whose subtree is to be activated
*
* If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node
* needs to be explicitly activated. A node which hasn't been activated
* isn't visible to userland and deactivation is skipped during its
* removal. This is useful to construct atomic init sequences where
* creation of multiple nodes should either succeed or fail atomically.
*
* The caller is responsible for ensuring that this function is not called
* after kernfs_remove*() is invoked on @kn.
*/
void kernfs_activate(struct kernfs_node *kn)
{
struct kernfs_node *pos;
down_write(&kernfs_rwsem);
pos = NULL;
while ((pos = kernfs_next_descendant_post(pos, kn))) { if (pos->flags & KERNFS_ACTIVATED)
continue;
WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb)); WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);
atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
pos->flags |= KERNFS_ACTIVATED;
}
up_write(&kernfs_rwsem);
}
static void __kernfs_remove(struct kernfs_node *kn)
{
struct kernfs_node *pos;
lockdep_assert_held_write(&kernfs_rwsem);
/*
* Short-circuit if non-root @kn has already finished removal.
* This is for kernfs_remove_self() which plays with active ref
* after removal.
*/
if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
return;
pr_debug("kernfs %s: removing\n", kn->name);
/* prevent any new usage under @kn by deactivating all nodes */
pos = NULL;
while ((pos = kernfs_next_descendant_post(pos, kn)))
if (kernfs_active(pos))
atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
/* deactivate and unlink the subtree node-by-node */
do {
pos = kernfs_leftmost_descendant(kn);
/*
* kernfs_drain() drops kernfs_rwsem temporarily and @pos's
* base ref could have been put by someone else by the time
* the function returns. Make sure it doesn't go away
* underneath us.
*/
kernfs_get(pos);
/*
* Drain iff @kn was activated. This avoids draining and
* its lockdep annotations for nodes which have never been
* activated and allows embedding kernfs_remove() in create
* error paths without worrying about draining.
*/
if (kn->flags & KERNFS_ACTIVATED)
kernfs_drain(pos);
else
WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
/*
* kernfs_unlink_sibling() succeeds once per node. Use it
* to decide who's responsible for cleanups.
*/
if (!pos->parent || kernfs_unlink_sibling(pos)) {
struct kernfs_iattrs *ps_iattr =
pos->parent ? pos->parent->iattr : NULL;
/* update timestamps on the parent */
if (ps_iattr) {
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
kernfs_put(pos);
}
kernfs_put(pos); } while (pos != kn);
}
/**
* kernfs_remove - remove a kernfs_node recursively
* @kn: the kernfs_node to remove
*
* Remove @kn along with all its subdirectories and files.
*/
void kernfs_remove(struct kernfs_node *kn)
{
down_write(&kernfs_rwsem);
__kernfs_remove(kn);
up_write(&kernfs_rwsem);
}
/**
* kernfs_break_active_protection - break out of active protection
* @kn: the self kernfs_node
*
* The caller must be running off of a kernfs operation which is invoked
* with an active reference - e.g. one of kernfs_ops. Each invocation of
* this function must also be matched with an invocation of
* kernfs_unbreak_active_protection().
*
* This function releases the active reference of @kn the caller is
* holding. Once this function is called, @kn may be removed at any point
* and the caller is solely responsible for ensuring that the objects it
* dereferences are accessible.
*/
void kernfs_break_active_protection(struct kernfs_node *kn)
{
/*
* Take out ourself out of the active ref dependency chain. If
* we're called without an active ref, lockdep will complain.
*/
kernfs_put_active(kn);
}
/**
* kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
* @kn: the self kernfs_node
*
* If kernfs_break_active_protection() was called, this function must be
* invoked before finishing the kernfs operation. Note that while this
* function restores the active reference, it doesn't and can't actually
* restore the active protection - @kn may already or be in the process of
* being removed. Once kernfs_break_active_protection() is invoked, that
* protection is irreversibly gone for the kernfs operation instance.
*
* While this function may be called at any point after
* kernfs_break_active_protection() is invoked, its most useful location
* would be right before the enclosing kernfs operation returns.
*/
void kernfs_unbreak_active_protection(struct kernfs_node *kn)
{
/*
* @kn->active could be in any state; however, the increment we do
* here will be undone as soon as the enclosing kernfs operation
* finishes and this temporary bump can't break anything. If @kn
* is alive, nothing changes. If @kn is being deactivated, the
* soon-to-follow put will either finish deactivation or restore
* deactivated state. If @kn is already removed, the temporary
* bump is guaranteed to be gone before @kn is released.
*/
atomic_inc(&kn->active);
if (kernfs_lockdep(kn))
rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
}
/**
* kernfs_remove_self - remove a kernfs_node from its own method
* @kn: the self kernfs_node to remove
*
* The caller must be running off of a kernfs operation which is invoked
* with an active reference - e.g. one of kernfs_ops. This can be used to
* implement a file operation which deletes itself.
*
* For example, the "delete" file for a sysfs device directory can be
* implemented by invoking kernfs_remove_self() on the "delete" file
* itself. This function breaks the circular dependency of trying to
* deactivate self while holding an active ref itself. It isn't necessary
* to modify the usual removal path to use kernfs_remove_self(). The
* "delete" implementation can simply invoke kernfs_remove_self() on self
* before proceeding with the usual removal path. kernfs will ignore later
* kernfs_remove() on self.
*
* kernfs_remove_self() can be called multiple times concurrently on the
* same kernfs_node. Only the first one actually performs removal and
* returns %true. All others will wait until the kernfs operation which
* won self-removal finishes and return %false. Note that the losers wait
* for the completion of not only the winning kernfs_remove_self() but also
* the whole kernfs_ops which won the arbitration. This can be used to
* guarantee, for example, all concurrent writes to a "delete" file to
* finish only after the whole operation is complete.
*/
bool kernfs_remove_self(struct kernfs_node *kn)
{
bool ret;
down_write(&kernfs_rwsem);
kernfs_break_active_protection(kn);
/*
* SUICIDAL is used to arbitrate among competing invocations. Only
* the first one will actually perform removal. When the removal
* is complete, SUICIDED is set and the active ref is restored
* while kernfs_rwsem for held exclusive. The ones which lost
* arbitration waits for SUICIDED && drained which can happen only
* after the enclosing kernfs operation which executed the winning
* instance of kernfs_remove_self() finished.
*/
if (!(kn->flags & KERNFS_SUICIDAL)) {
kn->flags |= KERNFS_SUICIDAL;
__kernfs_remove(kn);
kn->flags |= KERNFS_SUICIDED;
ret = true;
} else {
wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
DEFINE_WAIT(wait);
while (true) {
prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);
if ((kn->flags & KERNFS_SUICIDED) &&
atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
break;
up_write(&kernfs_rwsem);
schedule();
down_write(&kernfs_rwsem);
}
finish_wait(waitq, &wait);
WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
ret = false;
}
/*
* This must be done while kernfs_rwsem held exclusive; otherwise,
* waiting for SUICIDED && deactivated could finish prematurely.
*/
kernfs_unbreak_active_protection(kn);
up_write(&kernfs_rwsem);
return ret;
}
/**
* kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
* @parent: parent of the target
* @name: name of the kernfs_node to remove
* @ns: namespace tag of the kernfs_node to remove
*
* Look for the kernfs_node with @name and @ns under @parent and remove it.
* Returns 0 on success, -ENOENT if such entry doesn't exist.
*/
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
const void *ns)
{
struct kernfs_node *kn;
if (!parent) { WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
name);
return -ENOENT;
}
down_write(&kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns);
if (kn)
__kernfs_remove(kn);
up_write(&kernfs_rwsem); if (kn)
return 0;
else
return -ENOENT;
}
/**
* kernfs_rename_ns - move and rename a kernfs_node
* @kn: target node
* @new_parent: new parent to put @sd under
* @new_name: new name
* @new_ns: new namespace tag
*/
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name, const void *new_ns)
{
struct kernfs_node *old_parent;
const char *old_name = NULL;
int error;
/* can't move or rename root */
if (!kn->parent)
return -EINVAL;
down_write(&kernfs_rwsem);
error = -ENOENT;
if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
(new_parent->flags & KERNFS_EMPTY_DIR))
goto out;
error = 0;
if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
(strcmp(kn->name, new_name) == 0))
goto out; /* nothing to rename */
error = -EEXIST;
if (kernfs_find_ns(new_parent, new_name, new_ns))
goto out;
/* rename kernfs_node */
if (strcmp(kn->name, new_name) != 0) {
error = -ENOMEM;
new_name = kstrdup_const(new_name, GFP_KERNEL);
if (!new_name)
goto out;
} else {
new_name = NULL;
}
/*
* Move to the appropriate place in the appropriate directories rbtree.
*/
kernfs_unlink_sibling(kn);
kernfs_get(new_parent);
/* rename_lock protects ->parent and ->name accessors */
spin_lock_irq(&kernfs_rename_lock);
old_parent = kn->parent;
kn->parent = new_parent;
kn->ns = new_ns;
if (new_name) {
old_name = kn->name;
kn->name = new_name;
}
spin_unlock_irq(&kernfs_rename_lock);
kn->hash = kernfs_name_hash(kn->name, kn->ns);
kernfs_link_sibling(kn);
kernfs_put(old_parent);
kfree_const(old_name);
error = 0;
out:
up_write(&kernfs_rwsem);
return error;
}
/* Relationship between mode and the DT_xxx types */
static inline unsigned char dt_type(struct kernfs_node *kn)
{
return (kn->mode >> 12) & 15;
}
static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
{
kernfs_put(filp->private_data);
return 0;
}
static struct kernfs_node *kernfs_dir_pos(const void *ns,
struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
{
if (pos) {
int valid = kernfs_active(pos) &&
pos->parent == parent && hash == pos->hash;
kernfs_put(pos);
if (!valid)
pos = NULL;
}
if (!pos && (hash > 1) && (hash < INT_MAX)) {
struct rb_node *node = parent->dir.children.rb_node;
while (node) {
pos = rb_to_kn(node);
if (hash < pos->hash)
node = node->rb_left;
else if (hash > pos->hash)
node = node->rb_right;
else
break;
}
}
/* Skip over entries which are dying/dead or in the wrong namespace */
while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
struct rb_node *node = rb_next(&pos->rb);
if (!node)
pos = NULL;
else
pos = rb_to_kn(node);
}
return pos;
}
static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
{
pos = kernfs_dir_pos(ns, parent, ino, pos);
if (pos) {
do {
struct rb_node *node = rb_next(&pos->rb);
if (!node)
pos = NULL;
else
pos = rb_to_kn(node);
} while (pos && (!kernfs_active(pos) || pos->ns != ns));
}
return pos;
}
static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct kernfs_node *parent = kernfs_dentry_node(dentry);
struct kernfs_node *pos = file->private_data;
const void *ns = NULL;
if (!dir_emit_dots(file, ctx))
return 0;
down_read(&kernfs_rwsem);
if (kernfs_ns_enabled(parent))
ns = kernfs_info(dentry->d_sb)->ns;
for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
pos;
pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
const char *name = pos->name;
unsigned int type = dt_type(pos);
int len = strlen(name);
ino_t ino = kernfs_ino(pos);
ctx->pos = pos->hash;
file->private_data = pos;
kernfs_get(pos);
up_read(&kernfs_rwsem);
if (!dir_emit(ctx, name, len, ino, type))
return 0;
down_read(&kernfs_rwsem);
}
up_read(&kernfs_rwsem);
file->private_data = NULL;
ctx->pos = INT_MAX;
return 0;
}
const struct file_operations kernfs_dir_fops = {
.read = generic_read_dir,
.iterate_shared = kernfs_fop_readdir,
.release = kernfs_dir_fop_release,
.llseek = generic_file_llseek,
};
// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/capability.c
*
* Copyright (C) 1997 Andrew Main <zefram@fysh.org>
*
* Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org>
* 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/uaccess.h>
/*
* Leveraged for setting/resetting capabilities
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
EXPORT_SYMBOL(__cap_empty_set);
int file_caps_enabled = 1;
static int __init file_caps_disable(char *str)
{
file_caps_enabled = 0;
return 1;
}
__setup("no_file_caps", file_caps_disable);
#ifdef CONFIG_MULTIUSER
/*
* More recent versions of libcap are available from:
*
* http://www.kernel.org/pub/linux/libs/security/linux-privs/
*/
static void warn_legacy_capability_use(void)
{
char name[sizeof(current->comm)];
pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
get_task_comm(name, current));
}
/*
* Version 2 capabilities worked fine, but the linux/capability.h file
* that accompanied their introduction encouraged their use without
* the necessary user-space source code changes. As such, we have
* created a version 3 with equivalent functionality to version 2, but
* with a header change to protect legacy source code from using
* version 2 when it wanted to use version 1. If your system has code
* that trips the following warning, it is using version 2 specific
* capabilities and may be doing so insecurely.
*
* The remedy is to either upgrade your version of libcap (to 2.10+,
* if the application is linked against it), or recompile your
* application with modern kernel headers and this warning will go
* away.
*/
static void warn_deprecated_v2(void)
{
char name[sizeof(current->comm)];
pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
get_task_comm(name, current));
}
/*
* Version check. Return the number of u32s in each capability flag
* array, or a negative value on error.
*/
static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
{
__u32 version;
if (get_user(version, &header->version))
return -EFAULT;
switch (version) {
case _LINUX_CAPABILITY_VERSION_1:
warn_legacy_capability_use();
*tocopy = _LINUX_CAPABILITY_U32S_1;
break;
case _LINUX_CAPABILITY_VERSION_2:
warn_deprecated_v2();
fallthrough; /* v3 is otherwise equivalent to v2 */
case _LINUX_CAPABILITY_VERSION_3:
*tocopy = _LINUX_CAPABILITY_U32S_3;
break;
default:
if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
return -EFAULT;
return -EINVAL;
}
return 0;
}
/*
* The only thing that can change the capabilities of the current
* process is the current process. As such, we can't be in this code
* at the same time as we are in the process of setting capabilities
* in this process. The net result is that we can limit our use of
* locks to when we are reading the caps of another process.
*/
static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
kernel_cap_t *pIp, kernel_cap_t *pPp)
{
int ret;
if (pid && (pid != task_pid_vnr(current))) {
struct task_struct *target;
rcu_read_lock();
target = find_task_by_vpid(pid);
if (!target)
ret = -ESRCH;
else
ret = security_capget(target, pEp, pIp, pPp);
rcu_read_unlock();
} else
ret = security_capget(current, pEp, pIp, pPp);
return ret;
}
/**
* sys_capget - get the capabilities of a given process.
* @header: pointer to struct that contains capability version and
* target pid data
* @dataptr: pointer to struct that contains the effective, permitted,
* and inheritable capabilities that are returned
*
* Returns 0 on success and < 0 on error.
*/
SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
{
int ret = 0;
pid_t pid;
unsigned tocopy;
kernel_cap_t pE, pI, pP;
ret = cap_validate_magic(header, &tocopy);
if ((dataptr == NULL) || (ret != 0))
return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
if (get_user(pid, &header->pid))
return -EFAULT;
if (pid < 0)
return -EINVAL;
ret = cap_get_target_pid(pid, &pE, &pI, &pP);
if (!ret) {
struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
unsigned i;
for (i = 0; i < tocopy; i++) {
kdata[i].effective = pE.cap[i];
kdata[i].permitted = pP.cap[i];
kdata[i].inheritable = pI.cap[i];
}
/*
* Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
* we silently drop the upper capabilities here. This
* has the effect of making older libcap
* implementations implicitly drop upper capability
* bits when they perform a: capget/modify/capset
* sequence.
*
* This behavior is considered fail-safe
* behavior. Upgrading the application to a newer
* version of libcap will enable access to the newer
* capabilities.
*
* An alternative would be to return an error here
* (-ERANGE), but that causes legacy applications to
* unexpectedly fail; the capget/modify/capset aborts
* before modification is attempted and the application
* fails.
*/
if (copy_to_user(dataptr, kdata, tocopy
* sizeof(struct __user_cap_data_struct))) {
return -EFAULT;
}
}
return ret;
}
/**
* sys_capset - set capabilities for a process or (*) a group of processes
* @header: pointer to struct that contains capability version and
* target pid data
* @data: pointer to struct that contains the effective, permitted,
* and inheritable capabilities
*
* Set capabilities for the current process only. The ability to any other
* process(es) has been deprecated and removed.
*
* The restrictions on setting capabilities are specified as:
*
* I: any raised capabilities must be a subset of the old permitted
* P: any raised capabilities must be a subset of the old permitted
* E: must be set to a subset of new permitted
*
* Returns 0 on success and < 0 on error.
*/
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
unsigned i, tocopy, copybytes;
kernel_cap_t inheritable, permitted, effective;
struct cred *new;
int ret;
pid_t pid;
ret = cap_validate_magic(header, &tocopy);
if (ret != 0)
return ret;
if (get_user(pid, &header->pid))
return -EFAULT;
/* may only affect current now */
if (pid != 0 && pid != task_pid_vnr(current))
return -EPERM;
copybytes = tocopy * sizeof(struct __user_cap_data_struct);
if (copybytes > sizeof(kdata))
return -EFAULT;
if (copy_from_user(&kdata, data, copybytes))
return -EFAULT;
for (i = 0; i < tocopy; i++) {
effective.cap[i] = kdata[i].effective;
permitted.cap[i] = kdata[i].permitted;
inheritable.cap[i] = kdata[i].inheritable;
}
while (i < _KERNEL_CAPABILITY_U32S) {
effective.cap[i] = 0;
permitted.cap[i] = 0;
inheritable.cap[i] = 0;
i++;
}
effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
new = prepare_creds();
if (!new)
return -ENOMEM;
ret = security_capset(new, current_cred(),
&effective, &inheritable, &permitted);
if (ret < 0)
goto error;
audit_log_capset(new, current_cred());
return commit_creds(new);
error:
abort_creds(new);
return ret;
}
/**
* has_ns_capability - Does a task have a capability in a specific user ns
* @t: The task in question
* @ns: target user namespace
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
* currently in effect to the specified user namespace, false if not.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
bool has_ns_capability(struct task_struct *t,
struct user_namespace *ns, int cap)
{
int ret;
rcu_read_lock();
ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE);
rcu_read_unlock();
return (ret == 0);
}
/**
* has_capability - Does a task have a capability in init_user_ns
* @t: The task in question
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
* currently in effect to the initial user namespace, false if not.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
bool has_capability(struct task_struct *t, int cap)
{
return has_ns_capability(t, &init_user_ns, cap);
}
EXPORT_SYMBOL(has_capability);
/**
* has_ns_capability_noaudit - Does a task have a capability (unaudited)
* in a specific user ns.
* @t: The task in question
* @ns: target user namespace
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
* currently in effect to the specified user namespace, false if not.
* Do not write an audit message for the check.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
bool has_ns_capability_noaudit(struct task_struct *t,
struct user_namespace *ns, int cap)
{
int ret;
rcu_read_lock();
ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT);
rcu_read_unlock();
return (ret == 0);
}
/**
* has_capability_noaudit - Does a task have a capability (unaudited) in the
* initial user ns
* @t: The task in question
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
* currently in effect to init_user_ns, false if not. Don't write an
* audit message for the check.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
bool has_capability_noaudit(struct task_struct *t, int cap)
{
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
static bool ns_capable_common(struct user_namespace *ns,
int cap,
unsigned int opts)
{
int capable;
if (unlikely(!cap_valid(cap))) {
pr_crit("capable() called with invalid cap=%u\n", cap);
BUG();
}
capable = security_capable(current_cred(), ns, cap, opts);
if (capable == 0) {
current->flags |= PF_SUPERPRIV;
return true;
}
return false;
}
/**
* ns_capable - Determine if the current task has a superior capability in effect
* @ns: The usernamespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
* available for use, false if not.
*
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
bool ns_capable(struct user_namespace *ns, int cap)
{
return ns_capable_common(ns, cap, CAP_OPT_NONE);
}
EXPORT_SYMBOL(ns_capable);
/**
* ns_capable_noaudit - Determine if the current task has a superior capability
* (unaudited) in effect
* @ns: The usernamespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
* available for use, false if not.
*
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT);
}
EXPORT_SYMBOL(ns_capable_noaudit);
/**
* ns_capable_setid - Determine if the current task has a superior capability
* in effect, while signalling that this check is being done from within a
* setid or setgroups syscall.
* @ns: The usernamespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
* available for use, false if not.
*
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
bool ns_capable_setid(struct user_namespace *ns, int cap)
{
return ns_capable_common(ns, cap, CAP_OPT_INSETID);
}
EXPORT_SYMBOL(ns_capable_setid);
/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
* available for use, false if not.
*
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
bool capable(int cap)
{
return ns_capable(&init_user_ns, cap);
}
EXPORT_SYMBOL(capable);
#endif /* CONFIG_MULTIUSER */
/**
* file_ns_capable - Determine if the file's opener had a capability in effect
* @file: The file we want to check
* @ns: The usernamespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if task that opened the file had a capability in effect
* when the file was opened.
*
* This does not set PF_SUPERPRIV because the caller may not
* actually be privileged.
*/
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
int cap)
{
if (WARN_ON_ONCE(!cap_valid(cap)))
return false;
if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0)
return true;
return false;
}
EXPORT_SYMBOL(file_ns_capable);
/**
* privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
* @ns: The user namespace in question
* @inode: The inode in question
*
* Return true if the inode uid and gid are within the namespace.
*/
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
struct user_namespace *mnt_userns,
const struct inode *inode)
{
return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) &&
kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode));
}
/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
*
* Return true if the current task has the given capability targeted at
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
return ns_capable(ns, cap) &&
privileged_wrt_inode_uidgid(ns, mnt_userns, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
/**
* ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
* @tsk: The task that may be ptraced
* @ns: The user namespace to search for CAP_SYS_PTRACE in
*
* Return true if the task that is ptracing the current task had CAP_SYS_PTRACE
* in the specified user namespace.
*/
bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
{
int ret = 0; /* An absent tracer adds no restrictions */
const struct cred *cred;
rcu_read_lock();
cred = rcu_dereference(tsk->ptracer_cred);
if (cred)
ret = security_capable(cred, ns, CAP_SYS_PTRACE,
CAP_OPT_NOAUDIT);
rcu_read_unlock();
return (ret == 0);
}
/* SPDX-License-Identifier: GPL-2.0 */
/*
* include/linux/writeback.h
*/
#ifndef WRITEBACK_H
#define WRITEBACK_H
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/fs.h>
#include <linux/flex_proportions.h>
#include <linux/backing-dev-defs.h>
#include <linux/blk_types.h>
#include <linux/blk-cgroup.h>
struct bio;
DECLARE_PER_CPU(int, dirty_throttle_leaks);
/*
* The 1/4 region under the global dirty thresh is for smooth dirty throttling:
*
* (thresh - thresh/DIRTY_FULL_SCOPE, thresh)
*
* Further beyond, all dirtier tasks will enter a loop waiting (possibly long
* time) for the dirty pages to drop, unless written enough pages.
*
* The global dirty threshold is normally equal to the global dirty limit,
* except when the system suddenly allocates a lot of anonymous memory and
* knocks down the global dirty threshold quickly, in which case the global
* dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
*/
#define DIRTY_SCOPE 8
#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2)
struct backing_dev_info;
/*
* fs/fs-writeback.c
*/
enum writeback_sync_modes {
WB_SYNC_NONE, /* Don't wait on anything */
WB_SYNC_ALL, /* Wait on every mapping */
};
/*
* A control structure which tells the writeback code what to do. These are
* always on the stack, and hence need no locking. They are always initialised
* in a manner such that unspecified fields are set to zero.
*/
struct writeback_control {
long nr_to_write; /* Write this many pages, and decrement
this for each page written */
long pages_skipped; /* Pages which were not written */
/*
* For a_ops->writepages(): if start or end are non-zero then this is
* a hint that the filesystem need only write out the pages inside that
* byterange. The byte at `end' is included in the writeout request.
*/
loff_t range_start;
loff_t range_end;
enum writeback_sync_modes sync_mode;
unsigned for_kupdate:1; /* A kupdate writeback */
unsigned for_background:1; /* A background writeback */
unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */
unsigned for_reclaim:1; /* Invoked from the page allocator */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
/*
* When writeback IOs are bounced through async layers, only the
* initial synchronous phase should be accounted towards inode
* cgroup ownership arbitration to avoid confusion. Later stages
* can set the following flag to disable the accounting.
*/
unsigned no_cgroup_owner:1;
unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *wb; /* wb this writeback is issued under */
struct inode *inode; /* inode being written out */
/* foreign inode detection, see wbc_detach_inode() */
int wb_id; /* current wb id */
int wb_lcand_id; /* last foreign candidate wb id */
int wb_tcand_id; /* this foreign candidate wb id */
size_t wb_bytes; /* bytes written by current wb */
size_t wb_lcand_bytes; /* bytes written by last candidate */
size_t wb_tcand_bytes; /* bytes written by this candidate */
#endif
};
static inline int wbc_to_write_flags(struct writeback_control *wbc)
{
int flags = 0;
if (wbc->punt_to_cgroup)
flags = REQ_CGROUP_PUNT;
if (wbc->sync_mode == WB_SYNC_ALL) flags |= REQ_SYNC; else if (wbc->for_kupdate || wbc->for_background) flags |= REQ_BACKGROUND;
return flags;
}
static inline struct cgroup_subsys_state *
wbc_blkcg_css(struct writeback_control *wbc)
{
#ifdef CONFIG_CGROUP_WRITEBACK
if (wbc->wb)
return wbc->wb->blkcg_css;
#endif
return blkcg_root_css;
}
/*
* A wb_domain represents a domain that wb's (bdi_writeback's) belong to
* and are measured against each other in. There always is one global
* domain, global_wb_domain, that every wb in the system is a member of.
* This allows measuring the relative bandwidth of each wb to distribute
* dirtyable memory accordingly.
*/
struct wb_domain {
spinlock_t lock;
/*
* Scale the writeback cache size proportional to the relative
* writeout speed.
*
* We do this by keeping a floating proportion between BDIs, based
* on page writeback completions [end_page_writeback()]. Those
* devices that write out pages fastest will get the larger share,
* while the slower will get a smaller share.
*
* We use page writeout completions because we are interested in
* getting rid of dirty pages. Having them written out is the
* primary goal.
*
* We introduce a concept of time, a period over which we measure
* these events, because demand can/will vary over time. The length
* of this period itself is measured in page writeback completions.
*/
struct fprop_global completions;
struct timer_list period_timer; /* timer for aging of completions */
unsigned long period_time;
/*
* The dirtyable memory and dirty threshold could be suddenly
* knocked down by a large amount (eg. on the startup of KVM in a
* swapless system). This may throw the system into deep dirty
* exceeded state and throttle heavy/light dirtiers alike. To
* retain good responsiveness, maintain global_dirty_limit for
* tracking slowly down to the knocked down dirty threshold.
*
* Both fields are protected by ->lock.
*/
unsigned long dirty_limit_tstamp;
unsigned long dirty_limit;
};
/**
* wb_domain_size_changed - memory available to a wb_domain has changed
* @dom: wb_domain of interest
*
* This function should be called when the amount of memory available to
* @dom has changed. It resets @dom's dirty limit parameters to prevent
* the past values which don't match the current configuration from skewing
* dirty throttling. Without this, when memory size of a wb_domain is
* greatly reduced, the dirty throttling logic may allow too many pages to
* be dirtied leading to consecutive unnecessary OOMs and may get stuck in
* that situation.
*/
static inline void wb_domain_size_changed(struct wb_domain *dom)
{
spin_lock(&dom->lock);
dom->dirty_limit_tstamp = jiffies;
dom->dirty_limit = 0;
spin_unlock(&dom->lock);
}
/*
* fs/fs-writeback.c
*/
struct bdi_writeback;
void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
enum wb_reason reason);
void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason);
void sync_inodes_sb(struct super_block *);
void wakeup_flusher_threads(enum wb_reason reason);
void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
enum wb_reason reason);
void inode_wait_for_writeback(struct inode *inode);
void inode_io_list_del(struct inode *inode);
/* writeback.h requires fs.h; it, too, is not included from here. */
static inline void wait_on_inode(struct inode *inode)
{
might_sleep();
wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
}
#ifdef CONFIG_CGROUP_WRITEBACK
#include <linux/cgroup.h>
#include <linux/bio.h>
void __inode_attach_wb(struct inode *inode, struct page *page);
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
struct inode *inode)
__releases(&inode->i_lock);
void wbc_detach_inode(struct writeback_control *wbc);
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
size_t bytes);
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
enum wb_reason reason, struct wb_completion *done);
void cgroup_writeback_umount(void);
bool cleanup_offline_cgwb(struct bdi_writeback *wb);
/**
* inode_attach_wb - associate an inode with its wb
* @inode: inode of interest
* @page: page being dirtied (may be NULL)
*
* If @inode doesn't have its wb, associate it with the wb matching the
* memcg of @page or, if @page is NULL, %current. May be called w/ or w/o
* @inode->i_lock.
*/
static inline void inode_attach_wb(struct inode *inode, struct page *page)
{
if (!inode->i_wb)
__inode_attach_wb(inode, page);
}
/**
* inode_detach_wb - disassociate an inode from its wb
* @inode: inode of interest
*
* @inode is being freed. Detach from its wb.
*/
static inline void inode_detach_wb(struct inode *inode)
{
if (inode->i_wb) {
WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
wb_put(inode->i_wb);
inode->i_wb = NULL;
}
}
/**
* wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
* @wbc: writeback_control of interest
* @inode: target inode
*
* This function is to be used by __filemap_fdatawrite_range(), which is an
* alternative entry point into writeback code, and first ensures @inode is
* associated with a bdi_writeback and attaches it to @wbc.
*/
static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
struct inode *inode)
{
spin_lock(&inode->i_lock);
inode_attach_wb(inode, NULL);
wbc_attach_and_unlock_inode(wbc, inode);
}
/**
* wbc_init_bio - writeback specific initializtion of bio
* @wbc: writeback_control for the writeback in progress
* @bio: bio to be initialized
*
* @bio is a part of the writeback in progress controlled by @wbc. Perform
* writeback specific initialization. This is used to apply the cgroup
* writeback context. Must be called after the bio has been associated with
* a device.
*/
static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
{
/*
* pageout() path doesn't attach @wbc to the inode being written
* out. This is intentional as we don't want the function to block
* behind a slow cgroup. Ultimately, we want pageout() to kick off
* regular writeback instead of writing things out itself.
*/
if (wbc->wb)
bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
}
#else /* CONFIG_CGROUP_WRITEBACK */
static inline void inode_attach_wb(struct inode *inode, struct page *page)
{
}
static inline void inode_detach_wb(struct inode *inode)
{
}
static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
struct inode *inode)
__releases(&inode->i_lock)
{
spin_unlock(&inode->i_lock);
}
static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
struct inode *inode)
{
}
static inline void wbc_detach_inode(struct writeback_control *wbc)
{
}
static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
{
}
static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
struct page *page, size_t bytes)
{
}
static inline void cgroup_writeback_umount(void)
{
}
#endif /* CONFIG_CGROUP_WRITEBACK */
/*
* mm/page-writeback.c
*/
void laptop_io_completion(struct backing_dev_info *info);
void laptop_sync_completion(void);
void laptop_mode_timer_fn(struct timer_list *t);
bool node_dirty_ok(struct pglist_data *pgdat);
int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom);
#endif
extern struct wb_domain global_wb_domain;
/* These are exported to sysctl. */
extern int dirty_background_ratio;
extern unsigned long dirty_background_bytes;
extern int vm_dirty_ratio;
extern unsigned long vm_dirty_bytes;
extern unsigned int dirty_writeback_interval;
extern unsigned int dirty_expire_interval;
extern unsigned int dirtytime_expire_interval;
extern int vm_highmem_is_dirtyable;
extern int laptop_mode;
int dirty_background_ratio_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos);
int dirty_background_bytes_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos);
int dirty_ratio_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos);
int dirty_bytes_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos);
int dirtytime_interval_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos);
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos);
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
void wb_update_bandwidth(struct bdi_writeback *wb);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
bool wb_over_bg_thresh(struct bdi_writeback *wb);
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
void *data);
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc);
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end);
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
void *data);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
void writeback_set_ratelimit(void);
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end);
void account_page_redirty(struct page *page);
void sb_mark_inode_writeback(struct inode *inode);
void sb_clear_inode_writeback(struct inode *inode);
#endif /* WRITEBACK_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2016 Facebook
* Copyright (C) 2013-2014 Jens Axboe
*/
#include <linux/sched.h>
#include <linux/random.h>
#include <linux/sbitmap.h>
#include <linux/seq_file.h>
static int init_alloc_hint(struct sbitmap *sb, gfp_t flags)
{
unsigned depth = sb->depth;
sb->alloc_hint = alloc_percpu_gfp(unsigned int, flags);
if (!sb->alloc_hint)
return -ENOMEM;
if (depth && !sb->round_robin) {
int i;
for_each_possible_cpu(i)
*per_cpu_ptr(sb->alloc_hint, i) = prandom_u32() % depth;
}
return 0;
}
static inline unsigned update_alloc_hint_before_get(struct sbitmap *sb,
unsigned int depth)
{
unsigned hint;
hint = this_cpu_read(*sb->alloc_hint);
if (unlikely(hint >= depth)) {
hint = depth ? prandom_u32() % depth : 0; this_cpu_write(*sb->alloc_hint, hint);
}
return hint;
}
static inline void update_alloc_hint_after_get(struct sbitmap *sb,
unsigned int depth,
unsigned int hint,
unsigned int nr)
{
if (nr == -1) {
/* If the map is full, a hint won't do us much good. */
this_cpu_write(*sb->alloc_hint, 0); } else if (nr == hint || unlikely(sb->round_robin)) {
/* Only update the hint if we used it. */
hint = nr + 1;
if (hint >= depth - 1)
hint = 0;
this_cpu_write(*sb->alloc_hint, hint);
}
}
/*
* See if we have deferred clears that we can batch move
*/
static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
{
unsigned long mask;
if (!READ_ONCE(map->cleared))
return false;
/*
* First get a stable cleared mask, setting the old mask to 0.
*/
mask = xchg(&map->cleared, 0);
/*
* Now clear the masked bits in our free word
*/
atomic_long_andnot(mask, (atomic_long_t *)&map->word);
BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(map->word));
return true;
}
int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
gfp_t flags, int node, bool round_robin,
bool alloc_hint)
{
unsigned int bits_per_word;
unsigned int i;
if (shift < 0)
shift = sbitmap_calculate_shift(depth);
bits_per_word = 1U << shift;
if (bits_per_word > BITS_PER_LONG)
return -EINVAL;
sb->shift = shift;
sb->depth = depth;
sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
sb->round_robin = round_robin;
if (depth == 0) {
sb->map = NULL;
return 0;
}
if (alloc_hint) {
if (init_alloc_hint(sb, flags))
return -ENOMEM;
} else {
sb->alloc_hint = NULL;
}
sb->map = kcalloc_node(sb->map_nr, sizeof(*sb->map), flags, node);
if (!sb->map) {
free_percpu(sb->alloc_hint);
return -ENOMEM;
}
for (i = 0; i < sb->map_nr; i++) {
sb->map[i].depth = min(depth, bits_per_word);
depth -= sb->map[i].depth;
}
return 0;
}
EXPORT_SYMBOL_GPL(sbitmap_init_node);
void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
{
unsigned int bits_per_word = 1U << sb->shift;
unsigned int i;
for (i = 0; i < sb->map_nr; i++)
sbitmap_deferred_clear(&sb->map[i]);
sb->depth = depth;
sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
for (i = 0; i < sb->map_nr; i++) {
sb->map[i].depth = min(depth, bits_per_word);
depth -= sb->map[i].depth;
}
}
EXPORT_SYMBOL_GPL(sbitmap_resize);
static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
unsigned int hint, bool wrap)
{
int nr;
/* don't wrap if starting from 0 */
wrap = wrap && hint;
while (1) {
nr = find_next_zero_bit(word, depth, hint);
if (unlikely(nr >= depth)) {
/*
* We started with an offset, and we didn't reset the
* offset to 0 in a failure case, so start from 0 to
* exhaust the map.
*/
if (hint && wrap) {
hint = 0;
continue;
}
return -1;
}
if (!test_and_set_bit_lock(nr, word))
break;
hint = nr + 1;
if (hint >= depth - 1)
hint = 0;
}
return nr;
}
static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
unsigned int alloc_hint)
{
struct sbitmap_word *map = &sb->map[index];
int nr;
do {
nr = __sbitmap_get_word(&map->word, map->depth, alloc_hint,
!sb->round_robin);
if (nr != -1)
break;
if (!sbitmap_deferred_clear(map))
break;
} while (1);
return nr;
}
static int __sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint)
{
unsigned int i, index;
int nr = -1;
index = SB_NR_TO_INDEX(sb, alloc_hint);
/*
* Unless we're doing round robin tag allocation, just use the
* alloc_hint to find the right word index. No point in looping
* twice in find_next_zero_bit() for that case.
*/
if (sb->round_robin)
alloc_hint = SB_NR_TO_BIT(sb, alloc_hint);
else
alloc_hint = 0;
for (i = 0; i < sb->map_nr; i++) { nr = sbitmap_find_bit_in_index(sb, index, alloc_hint);
if (nr != -1) {
nr += index << sb->shift;
break;
}
/* Jump to next index. */
alloc_hint = 0;
if (++index >= sb->map_nr)
index = 0;
}
return nr;
}
int sbitmap_get(struct sbitmap *sb)
{
int nr;
unsigned int hint, depth;
if (WARN_ON_ONCE(unlikely(!sb->alloc_hint)))
return -1;
depth = READ_ONCE(sb->depth);
hint = update_alloc_hint_before_get(sb, depth);
nr = __sbitmap_get(sb, hint);
update_alloc_hint_after_get(sb, depth, hint, nr);
return nr;
}
EXPORT_SYMBOL_GPL(sbitmap_get);
static int __sbitmap_get_shallow(struct sbitmap *sb,
unsigned int alloc_hint,
unsigned long shallow_depth)
{
unsigned int i, index;
int nr = -1;
index = SB_NR_TO_INDEX(sb, alloc_hint); for (i = 0; i < sb->map_nr; i++) {
again:
nr = __sbitmap_get_word(&sb->map[index].word,
min(sb->map[index].depth, shallow_depth), SB_NR_TO_BIT(sb, alloc_hint), true);
if (nr != -1) {
nr += index << sb->shift;
break;
}
if (sbitmap_deferred_clear(&sb->map[index]))
goto again;
/* Jump to next index. */
index++; alloc_hint = index << sb->shift;
if (index >= sb->map_nr) {
index = 0;
alloc_hint = 0;
}
}
return nr;
}
int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
{
int nr;
unsigned int hint, depth;
if (WARN_ON_ONCE(unlikely(!sb->alloc_hint)))
return -1;
depth = READ_ONCE(sb->depth);
hint = update_alloc_hint_before_get(sb, depth);
nr = __sbitmap_get_shallow(sb, hint, shallow_depth);
update_alloc_hint_after_get(sb, depth, hint, nr);
return nr;
}
EXPORT_SYMBOL_GPL(sbitmap_get_shallow);
bool sbitmap_any_bit_set(const struct sbitmap *sb)
{
unsigned int i;
for (i = 0; i < sb->map_nr; i++) { if (sb->map[i].word & ~sb->map[i].cleared)
return true;
}
return false;
}
EXPORT_SYMBOL_GPL(sbitmap_any_bit_set);
static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
{
unsigned int i, weight = 0;
for (i = 0; i < sb->map_nr; i++) { const struct sbitmap_word *word = &sb->map[i];
if (set)
weight += bitmap_weight(&word->word, word->depth);
else
weight += bitmap_weight(&word->cleared, word->depth);
}
return weight;
}
static unsigned int sbitmap_cleared(const struct sbitmap *sb)
{
return __sbitmap_weight(sb, false);
}
unsigned int sbitmap_weight(const struct sbitmap *sb)
{
return __sbitmap_weight(sb, true) - sbitmap_cleared(sb);
}
EXPORT_SYMBOL_GPL(sbitmap_weight);
void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
{
seq_printf(m, "depth=%u\n", sb->depth);
seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
seq_printf(m, "map_nr=%u\n", sb->map_nr);
}
EXPORT_SYMBOL_GPL(sbitmap_show);
static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte)
{
if ((offset & 0xf) == 0) {
if (offset != 0)
seq_putc(m, '\n');
seq_printf(m, "%08x:", offset);
}
if ((offset & 0x1) == 0)
seq_putc(m, ' ');
seq_printf(m, "%02x", byte);
}
void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m)
{
u8 byte = 0;
unsigned int byte_bits = 0;
unsigned int offset = 0;
int i;
for (i = 0; i < sb->map_nr; i++) {
unsigned long word = READ_ONCE(sb->map[i].word);
unsigned long cleared = READ_ONCE(sb->map[i].cleared);
unsigned int word_bits = READ_ONCE(sb->map[i].depth);
word &= ~cleared;
while (word_bits > 0) {
unsigned int bits = min(8 - byte_bits, word_bits);
byte |= (word & (BIT(bits) - 1)) << byte_bits;
byte_bits += bits;
if (byte_bits == 8) {
emit_byte(m, offset, byte);
byte = 0;
byte_bits = 0;
offset++;
}
word >>= bits;
word_bits -= bits;
}
}
if (byte_bits) {
emit_byte(m, offset, byte);
offset++;
}
if (offset)
seq_putc(m, '\n');
}
EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);
static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq,
unsigned int depth)
{
unsigned int wake_batch;
unsigned int shallow_depth;
/*
* For each batch, we wake up one queue. We need to make sure that our
* batch size is small enough that the full depth of the bitmap,
* potentially limited by a shallow depth, is enough to wake up all of
* the queues.
*
* Each full word of the bitmap has bits_per_word bits, and there might
* be a partial word. There are depth / bits_per_word full words and
* depth % bits_per_word bits left over. In bitwise arithmetic:
*
* bits_per_word = 1 << shift
* depth / bits_per_word = depth >> shift
* depth % bits_per_word = depth & ((1 << shift) - 1)
*
* Each word can be limited to sbq->min_shallow_depth bits.
*/
shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth);
depth = ((depth >> sbq->sb.shift) * shallow_depth +
min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth));
wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1,
SBQ_WAKE_BATCH);
return wake_batch;
}
int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
int shift, bool round_robin, gfp_t flags, int node)
{
int ret;
int i;
ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node,
round_robin, true);
if (ret)
return ret;
sbq->min_shallow_depth = UINT_MAX;
sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
atomic_set(&sbq->wake_index, 0);
atomic_set(&sbq->ws_active, 0);
sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
if (!sbq->ws) {
sbitmap_free(&sbq->sb);
return -ENOMEM;
}
for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
init_waitqueue_head(&sbq->ws[i].wait);
atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch);
}
return 0;
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
unsigned int depth)
{
unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
int i;
if (sbq->wake_batch != wake_batch) {
WRITE_ONCE(sbq->wake_batch, wake_batch);
/*
* Pairs with the memory barrier in sbitmap_queue_wake_up()
* to ensure that the batch size is updated before the wait
* counts.
*/
smp_mb();
for (i = 0; i < SBQ_WAIT_QUEUES; i++)
atomic_set(&sbq->ws[i].wait_cnt, 1);
}
}
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
{
sbitmap_queue_update_wake_batch(sbq, depth);
sbitmap_resize(&sbq->sb, depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
int __sbitmap_queue_get(struct sbitmap_queue *sbq)
{
return sbitmap_get(&sbq->sb);
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
unsigned int shallow_depth)
{
WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth); return sbitmap_get_shallow(&sbq->sb, shallow_depth);
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);
void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
unsigned int min_shallow_depth)
{
sbq->min_shallow_depth = min_shallow_depth;
sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);
static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
{
int i, wake_index;
if (!atomic_read(&sbq->ws_active))
return NULL;
wake_index = atomic_read(&sbq->wake_index);
for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
struct sbq_wait_state *ws = &sbq->ws[wake_index];
if (waitqueue_active(&ws->wait)) {
if (wake_index != atomic_read(&sbq->wake_index))
atomic_set(&sbq->wake_index, wake_index);
return ws;
}
wake_index = sbq_index_inc(wake_index);
}
return NULL;
}
static bool __sbq_wake_up(struct sbitmap_queue *sbq)
{
struct sbq_wait_state *ws;
unsigned int wake_batch;
int wait_cnt;
ws = sbq_wake_ptr(sbq);
if (!ws)
return false;
wait_cnt = atomic_dec_return(&ws->wait_cnt);
if (wait_cnt <= 0) {
int ret;
wake_batch = READ_ONCE(sbq->wake_batch);
/*
* Pairs with the memory barrier in sbitmap_queue_resize() to
* ensure that we see the batch size update before the wait
* count is reset.
*/
smp_mb__before_atomic();
/*
* For concurrent callers of this, the one that failed the
* atomic_cmpxhcg() race should call this function again
* to wakeup a new batch on a different 'ws'.
*/
ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch);
if (ret == wait_cnt) {
sbq_index_atomic_inc(&sbq->wake_index);
wake_up_nr(&ws->wait, wake_batch);
return false;
}
return true;
}
return false;
}
void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
{
while (__sbq_wake_up(sbq))
;
}
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
unsigned int cpu)
{
/*
* Once the clear bit is set, the bit may be allocated out.
*
* Orders READ/WRITE on the associated instance(such as request
* of blk_mq) by this bit for avoiding race with re-allocation,
* and its pair is the memory barrier implied in __sbitmap_get_word.
*
* One invariant is that the clear bit has to be zero when the bit
* is in use.
*/
smp_mb__before_atomic();
sbitmap_deferred_clear_bit(&sbq->sb, nr);
/*
* Pairs with the memory barrier in set_current_state() to ensure the
* proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
* and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
* waiter. See the comment on waitqueue_active().
*/
smp_mb__after_atomic();
sbitmap_queue_wake_up(sbq);
if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth))
*per_cpu_ptr(sbq->sb.alloc_hint, cpu) = nr;
}
EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
{
int i, wake_index;
/*
* Pairs with the memory barrier in set_current_state() like in
* sbitmap_queue_wake_up().
*/
smp_mb();
wake_index = atomic_read(&sbq->wake_index);
for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
struct sbq_wait_state *ws = &sbq->ws[wake_index];
if (waitqueue_active(&ws->wait))
wake_up(&ws->wait);
wake_index = sbq_index_inc(wake_index);
}
}
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);
void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
{
bool first;
int i;
sbitmap_show(&sbq->sb, m);
seq_puts(m, "alloc_hint={");
first = true;
for_each_possible_cpu(i) {
if (!first)
seq_puts(m, ", ");
first = false;
seq_printf(m, "%u", *per_cpu_ptr(sbq->sb.alloc_hint, i));
}
seq_puts(m, "}\n");
seq_printf(m, "wake_batch=%u\n", sbq->wake_batch);
seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index));
seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active));
seq_puts(m, "ws={\n");
for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
struct sbq_wait_state *ws = &sbq->ws[i];
seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n",
atomic_read(&ws->wait_cnt),
waitqueue_active(&ws->wait) ? "active" : "inactive");
}
seq_puts(m, "}\n");
seq_printf(m, "round_robin=%d\n", sbq->sb.round_robin);
seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_show);
void sbitmap_add_wait_queue(struct sbitmap_queue *sbq,
struct sbq_wait_state *ws,
struct sbq_wait *sbq_wait)
{
if (!sbq_wait->sbq) {
sbq_wait->sbq = sbq;
atomic_inc(&sbq->ws_active);
add_wait_queue(&ws->wait, &sbq_wait->wait);
}
}
EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue);
void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait)
{
list_del_init(&sbq_wait->wait.entry);
if (sbq_wait->sbq) {
atomic_dec(&sbq_wait->sbq->ws_active);
sbq_wait->sbq = NULL;
}
}
EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue);
void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
struct sbq_wait_state *ws,
struct sbq_wait *sbq_wait, int state)
{
if (!sbq_wait->sbq) { atomic_inc(&sbq->ws_active);
sbq_wait->sbq = sbq;
}
prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state);
}
EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait);
void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
struct sbq_wait *sbq_wait)
{
finish_wait(&ws->wait, &sbq_wait->wait);
if (sbq_wait->sbq) {
atomic_dec(&sbq->ws_active);
sbq_wait->sbq = NULL;
}
}
EXPORT_SYMBOL_GPL(sbitmap_finish_wait);
// SPDX-License-Identifier: GPL-2.0-only
/*
* Lock-less NULL terminated single linked list
*
* The basic atomic operation of this list is cmpxchg on long. On
* architectures that don't have NMI-safe cmpxchg implementation, the
* list can NOT be used in NMI handlers. So code that uses the list in
* an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
*
* Copyright 2010,2011 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*/
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/llist.h>
/**
* llist_add_batch - add several linked entries in batch
* @new_first: first entry in batch to be added
* @new_last: last entry in batch to be added
* @head: the head for your lock-less list
*
* Return whether list is empty before adding.
*/
bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
struct llist_head *head)
{
struct llist_node *first;
do {
new_last->next = first = READ_ONCE(head->first);
} while (cmpxchg(&head->first, first, new_first) != first);
return !first;
}
EXPORT_SYMBOL_GPL(llist_add_batch);
/**
* llist_del_first - delete the first entry of lock-less list
* @head: the head for your lock-less list
*
* If list is empty, return NULL, otherwise, return the first entry
* deleted, this is the newest added one.
*
* Only one llist_del_first user can be used simultaneously with
* multiple llist_add users without lock. Because otherwise
* llist_del_first, llist_add, llist_add (or llist_del_all, llist_add,
* llist_add) sequence in another user may change @head->first->next,
* but keep @head->first. If multiple consumers are needed, please
* use llist_del_all or use lock between consumers.
*/
struct llist_node *llist_del_first(struct llist_head *head)
{
struct llist_node *entry, *old_entry, *next;
entry = smp_load_acquire(&head->first);
for (;;) {
if (entry == NULL)
return NULL;
old_entry = entry;
next = READ_ONCE(entry->next);
entry = cmpxchg(&head->first, old_entry, next);
if (entry == old_entry)
break;
}
return entry;
}
EXPORT_SYMBOL_GPL(llist_del_first);
/**
* llist_reverse_order - reverse order of a llist chain
* @head: first item of the list to be reversed
*
* Reverse the order of a chain of llist entries and return the
* new first entry.
*/
struct llist_node *llist_reverse_order(struct llist_node *head)
{
struct llist_node *new_head = NULL;
while (head) {
struct llist_node *tmp = head;
head = head->next;
tmp->next = new_head;
new_head = tmp;
}
return new_head;
}
EXPORT_SYMBOL_GPL(llist_reverse_order);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCATTERLIST_H
#define _LINUX_SCATTERLIST_H
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <asm/io.h>
struct scatterlist {
unsigned long page_link;
unsigned int offset;
unsigned int length;
dma_addr_t dma_address;
#ifdef CONFIG_NEED_SG_DMA_LENGTH
unsigned int dma_length;
#endif
};
/*
* These macros should be used after a dma_map_sg call has been done
* to get bus addresses of each of the SG entries and their lengths.
* You should only work with the number of sg entries dma_map_sg
* returns, or alternatively stop on the first sg_dma_len(sg) which
* is 0.
*/
#define sg_dma_address(sg) ((sg)->dma_address)
#ifdef CONFIG_NEED_SG_DMA_LENGTH
#define sg_dma_len(sg) ((sg)->dma_length)
#else
#define sg_dma_len(sg) ((sg)->length)
#endif
struct sg_table {
struct scatterlist *sgl; /* the list */
unsigned int nents; /* number of mapped entries */
unsigned int orig_nents; /* original size of list */
};
struct sg_append_table {
struct sg_table sgt; /* The scatter list table */
struct scatterlist *prv; /* last populated sge in the table */
unsigned int total_nents; /* Total entries in the table */
};
/*
* Notes on SG table design.
*
* We use the unsigned long page_link field in the scatterlist struct to place
* the page pointer AND encode information about the sg table as well. The two
* lower bits are reserved for this information.
*
* If bit 0 is set, then the page_link contains a pointer to the next sg
* table list. Otherwise the next entry is at sg + 1.
*
* If bit 1 is set, then this sg entry is the last element in a list.
*
* See sg_next().
*
*/
#define SG_CHAIN 0x01UL
#define SG_END 0x02UL
/*
* We overload the LSB of the page pointer to indicate whether it's
* a valid sg entry, or whether it points to the start of a new scatterlist.
* Those low bits are there for everyone! (thanks mason :-)
*/
#define sg_is_chain(sg) ((sg)->page_link & SG_CHAIN)
#define sg_is_last(sg) ((sg)->page_link & SG_END)
#define sg_chain_ptr(sg) \
((struct scatterlist *) ((sg)->page_link & ~(SG_CHAIN | SG_END)))
/**
* sg_assign_page - Assign a given page to an SG entry
* @sg: SG entry
* @page: The page
*
* Description:
* Assign page to sg entry. Also see sg_set_page(), the most commonly used
* variant.
*
**/
static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
{
unsigned long page_link = sg->page_link & (SG_CHAIN | SG_END);
/*
* In order for the low bit stealing approach to work, pages
* must be aligned at a 32-bit boundary as a minimum.
*/
BUG_ON((unsigned long) page & (SG_CHAIN | SG_END));
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg_is_chain(sg));
#endif
sg->page_link = page_link | (unsigned long) page;
}
/**
* sg_set_page - Set sg entry to point at given page
* @sg: SG entry
* @page: The page
* @len: Length of data
* @offset: Offset into page
*
* Description:
* Use this function to set an sg entry pointing at a page, never assign
* the page directly. We encode sg table information in the lower bits
* of the page pointer. See sg_page() for looking up the page belonging
* to an sg entry.
*
**/
static inline void sg_set_page(struct scatterlist *sg, struct page *page,
unsigned int len, unsigned int offset)
{
sg_assign_page(sg, page);
sg->offset = offset;
sg->length = len;
}
static inline struct page *sg_page(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg_is_chain(sg));
#endif
return (struct page *)((sg)->page_link & ~(SG_CHAIN | SG_END));
}
/**
* sg_set_buf - Set sg entry to point at given data
* @sg: SG entry
* @buf: Data
* @buflen: Data length
*
**/
static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
unsigned int buflen)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(!virt_addr_valid(buf));
#endif
sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
}
/*
* Loop over each sg element, following the pointer to a new list if necessary
*/
#define for_each_sg(sglist, sg, nr, __i) \
for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))
/*
* Loop over each sg element in the given sg_table object.
*/
#define for_each_sgtable_sg(sgt, sg, i) \
for_each_sg((sgt)->sgl, sg, (sgt)->orig_nents, i)
/*
* Loop over each sg element in the given *DMA mapped* sg_table object.
* Please use sg_dma_address(sg) and sg_dma_len(sg) to extract DMA addresses
* of the each element.
*/
#define for_each_sgtable_dma_sg(sgt, sg, i) \
for_each_sg((sgt)->sgl, sg, (sgt)->nents, i)
static inline void __sg_chain(struct scatterlist *chain_sg,
struct scatterlist *sgl)
{
/*
* offset and length are unused for chain entry. Clear them.
*/
chain_sg->offset = 0;
chain_sg->length = 0;
/*
* Set lowest bit to indicate a link pointer, and make sure to clear
* the termination bit if it happens to be set.
*/
chain_sg->page_link = ((unsigned long) sgl | SG_CHAIN) & ~SG_END;
}
/**
* sg_chain - Chain two sglists together
* @prv: First scatterlist
* @prv_nents: Number of entries in prv
* @sgl: Second scatterlist
*
* Description:
* Links @prv@ and @sgl@ together, to form a longer scatterlist.
*
**/
static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
struct scatterlist *sgl)
{
__sg_chain(&prv[prv_nents - 1], sgl);
}
/**
* sg_mark_end - Mark the end of the scatterlist
* @sg: SG entryScatterlist
*
* Description:
* Marks the passed in sg entry as the termination point for the sg
* table. A call to sg_next() on this entry will return NULL.
*
**/
static inline void sg_mark_end(struct scatterlist *sg)
{
/*
* Set termination bit, clear potential chain bit
*/
sg->page_link |= SG_END;
sg->page_link &= ~SG_CHAIN;
}
/**
* sg_unmark_end - Undo setting the end of the scatterlist
* @sg: SG entryScatterlist
*
* Description:
* Removes the termination marker from the given entry of the scatterlist.
*
**/
static inline void sg_unmark_end(struct scatterlist *sg)
{
sg->page_link &= ~SG_END;
}
/**
* sg_phys - Return physical address of an sg entry
* @sg: SG entry
*
* Description:
* This calls page_to_phys() on the page in this sg entry, and adds the
* sg offset. The caller must know that it is legal to call page_to_phys()
* on the sg page.
*
**/
static inline dma_addr_t sg_phys(struct scatterlist *sg)
{
return page_to_phys(sg_page(sg)) + sg->offset;
}
/**
* sg_virt - Return virtual address of an sg entry
* @sg: SG entry
*
* Description:
* This calls page_address() on the page in this sg entry, and adds the
* sg offset. The caller must know that the sg page has a valid virtual
* mapping.
*
**/
static inline void *sg_virt(struct scatterlist *sg)
{
return page_address(sg_page(sg)) + sg->offset;
}
/**
* sg_init_marker - Initialize markers in sg table
* @sgl: The SG table
* @nents: Number of entries in table
*
**/
static inline void sg_init_marker(struct scatterlist *sgl,
unsigned int nents)
{
sg_mark_end(&sgl[nents - 1]);
}
int sg_nents(struct scatterlist *sg);
int sg_nents_for_len(struct scatterlist *sg, u64 len);
struct scatterlist *sg_next(struct scatterlist *);
struct scatterlist *sg_last(struct scatterlist *s, unsigned int);
void sg_init_table(struct scatterlist *, unsigned int);
void sg_init_one(struct scatterlist *, const void *, unsigned int);
int sg_split(struct scatterlist *in, const int in_mapped_nents,
const off_t skip, const int nb_splits,
const size_t *split_sizes,
struct scatterlist **out, int *out_mapped_nents,
gfp_t gfp_mask);
typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t);
typedef void (sg_free_fn)(struct scatterlist *, unsigned int);
void __sg_free_table(struct sg_table *, unsigned int, unsigned int,
sg_free_fn *, unsigned int);
void sg_free_table(struct sg_table *);
void sg_free_append_table(struct sg_append_table *sgt);
int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int,
struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *);
int sg_alloc_table(struct sg_table *, unsigned int, gfp_t);
int sg_alloc_append_table_from_pages(struct sg_append_table *sgt,
struct page **pages, unsigned int n_pages,
unsigned int offset, unsigned long size,
unsigned int max_segment,
unsigned int left_pages, gfp_t gfp_mask);
int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages,
unsigned int n_pages, unsigned int offset,
unsigned long size,
unsigned int max_segment, gfp_t gfp_mask);
/**
* sg_alloc_table_from_pages - Allocate and initialize an sg table from
* an array of pages
* @sgt: The sg table header to use
* @pages: Pointer to an array of page pointers
* @n_pages: Number of pages in the pages array
* @offset: Offset from start of the first page to the start of a buffer
* @size: Number of valid bytes in the buffer (after offset)
* @gfp_mask: GFP allocation mask
*
* Description:
* Allocate and initialize an sg table from a list of pages. Contiguous
* ranges of the pages are squashed into a single scatterlist node. A user
* may provide an offset at a start and a size of valid data in a buffer
* specified by the page array. The returned sg table is released by
* sg_free_table.
*
* Returns:
* 0 on success, negative error on failure
*/
static inline int sg_alloc_table_from_pages(struct sg_table *sgt,
struct page **pages,
unsigned int n_pages,
unsigned int offset,
unsigned long size, gfp_t gfp_mask)
{
return sg_alloc_table_from_pages_segment(sgt, pages, n_pages, offset,
size, UINT_MAX, gfp_mask);
}
#ifdef CONFIG_SGL_ALLOC
struct scatterlist *sgl_alloc_order(unsigned long long length,
unsigned int order, bool chainable,
gfp_t gfp, unsigned int *nent_p);
struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp,
unsigned int *nent_p);
void sgl_free_n_order(struct scatterlist *sgl, int nents, int order);
void sgl_free_order(struct scatterlist *sgl, int order);
void sgl_free(struct scatterlist *sgl);
#endif /* CONFIG_SGL_ALLOC */
size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
size_t buflen, off_t skip, bool to_buffer);
size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
const void *buf, size_t buflen);
size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents,
void *buf, size_t buflen);
size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents,
const void *buf, size_t buflen, off_t skip);
size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents,
void *buf, size_t buflen, off_t skip);
size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents,
size_t buflen, off_t skip);
/*
* Maximum number of entries that will be allocated in one piece, if
* a list larger than this is required then chaining will be utilized.
*/
#define SG_MAX_SINGLE_ALLOC (PAGE_SIZE / sizeof(struct scatterlist))
/*
* The maximum number of SG segments that we will put inside a
* scatterlist (unless chaining is used). Should ideally fit inside a
* single page, to avoid a higher order allocation. We could define this
* to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The
* minimum value is 32
*/
#define SG_CHUNK_SIZE 128
/*
* Like SG_CHUNK_SIZE, but for archs that have sg chaining. This limit
* is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
*/
#ifdef CONFIG_ARCH_NO_SG_CHAIN
#define SG_MAX_SEGMENTS SG_CHUNK_SIZE
#else
#define SG_MAX_SEGMENTS 2048
#endif
#ifdef CONFIG_SG_POOL
void sg_free_table_chained(struct sg_table *table,
unsigned nents_first_chunk);
int sg_alloc_table_chained(struct sg_table *table, int nents,
struct scatterlist *first_chunk,
unsigned nents_first_chunk);
#endif
/*
* sg page iterator
*
* Iterates over sg entries page-by-page. On each successful iteration, you
* can call sg_page_iter_page(@piter) to get the current page.
* @piter->sg will point to the sg holding this page and @piter->sg_pgoffset to
* the page's page offset within the sg. The iteration will stop either when a
* maximum number of sg entries was reached or a terminating sg
* (sg_last(sg) == true) was reached.
*/
struct sg_page_iter {
struct scatterlist *sg; /* sg holding the page */
unsigned int sg_pgoffset; /* page offset within the sg */
/* these are internal states, keep away */
unsigned int __nents; /* remaining sg entries */
int __pg_advance; /* nr pages to advance at the
* next step */
};
/*
* sg page iterator for DMA addresses
*
* This is the same as sg_page_iter however you can call
* sg_page_iter_dma_address(@dma_iter) to get the page's DMA
* address. sg_page_iter_page() cannot be called on this iterator.
*/
struct sg_dma_page_iter {
struct sg_page_iter base;
};
bool __sg_page_iter_next(struct sg_page_iter *piter);
bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter);
void __sg_page_iter_start(struct sg_page_iter *piter,
struct scatterlist *sglist, unsigned int nents,
unsigned long pgoffset);
/**
* sg_page_iter_page - get the current page held by the page iterator
* @piter: page iterator holding the page
*/
static inline struct page *sg_page_iter_page(struct sg_page_iter *piter)
{
return nth_page(sg_page(piter->sg), piter->sg_pgoffset);
}
/**
* sg_page_iter_dma_address - get the dma address of the current page held by
* the page iterator.
* @dma_iter: page iterator holding the page
*/
static inline dma_addr_t
sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter)
{
return sg_dma_address(dma_iter->base.sg) +
(dma_iter->base.sg_pgoffset << PAGE_SHIFT);
}
/**
* for_each_sg_page - iterate over the pages of the given sg list
* @sglist: sglist to iterate over
* @piter: page iterator to hold current page, sg, sg_pgoffset
* @nents: maximum number of sg entries to iterate over
* @pgoffset: starting page offset (in pages)
*
* Callers may use sg_page_iter_page() to get each page pointer.
* In each loop it operates on PAGE_SIZE unit.
*/
#define for_each_sg_page(sglist, piter, nents, pgoffset) \
for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \
__sg_page_iter_next(piter);)
/**
* for_each_sg_dma_page - iterate over the pages of the given sg list
* @sglist: sglist to iterate over
* @dma_iter: DMA page iterator to hold current page
* @dma_nents: maximum number of sg entries to iterate over, this is the value
* returned from dma_map_sg
* @pgoffset: starting page offset (in pages)
*
* Callers may use sg_page_iter_dma_address() to get each page's DMA address.
* In each loop it operates on PAGE_SIZE unit.
*/
#define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \
for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \
pgoffset); \
__sg_page_iter_dma_next(dma_iter);)
/**
* for_each_sgtable_page - iterate over all pages in the sg_table object
* @sgt: sg_table object to iterate over
* @piter: page iterator to hold current page
* @pgoffset: starting page offset (in pages)
*
* Iterates over the all memory pages in the buffer described by
* a scatterlist stored in the given sg_table object.
* See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit.
*/
#define for_each_sgtable_page(sgt, piter, pgoffset) \
for_each_sg_page((sgt)->sgl, piter, (sgt)->orig_nents, pgoffset)
/**
* for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object
* @sgt: sg_table object to iterate over
* @dma_iter: DMA page iterator to hold current page
* @pgoffset: starting page offset (in pages)
*
* Iterates over the all DMA mapped pages in the buffer described by
* a scatterlist stored in the given sg_table object.
* See also for_each_sg_dma_page(). In each loop it operates on PAGE_SIZE
* unit.
*/
#define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset) \
for_each_sg_dma_page((sgt)->sgl, dma_iter, (sgt)->nents, pgoffset)
/*
* Mapping sg iterator
*
* Iterates over sg entries mapping page-by-page. On each successful
* iteration, @miter->page points to the mapped page and
* @miter->length bytes of data can be accessed at @miter->addr. As
* long as an iteration is enclosed between start and stop, the user
* is free to choose control structure and when to stop.
*
* @miter->consumed is set to @miter->length on each iteration. It
* can be adjusted if the user can't consume all the bytes in one go.
* Also, a stopped iteration can be resumed by calling next on it.
* This is useful when iteration needs to release all resources and
* continue later (e.g. at the next interrupt).
*/
#define SG_MITER_ATOMIC (1 << 0) /* use kmap_atomic */
#define SG_MITER_TO_SG (1 << 1) /* flush back to phys on unmap */
#define SG_MITER_FROM_SG (1 << 2) /* nop */
struct sg_mapping_iter {
/* the following three fields can be accessed directly */
struct page *page; /* currently mapped page */
void *addr; /* pointer to the mapped area */
size_t length; /* length of the mapped area */
size_t consumed; /* number of consumed bytes */
struct sg_page_iter piter; /* page iterator */
/* these are internal states, keep away */
unsigned int __offset; /* offset within page */
unsigned int __remaining; /* remaining bytes on page */
unsigned int __flags;
};
void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl,
unsigned int nents, unsigned int flags);
bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset);
bool sg_miter_next(struct sg_mapping_iter *miter);
void sg_miter_stop(struct sg_mapping_iter *miter);
#endif /* _LINUX_SCATTERLIST_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/fs/pnode.c
*
* (C) Copyright IBM Corporation 2005.
* Author : Ram Pai (linuxram@us.ibm.com)
*/
#include <linux/mnt_namespace.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/nsproxy.h>
#include <uapi/linux/mount.h>
#include "internal.h"
#include "pnode.h"
/* return the next shared peer mount of @p */
static inline struct mount *next_peer(struct mount *p)
{
return list_entry(p->mnt_share.next, struct mount, mnt_share);
}
static inline struct mount *first_slave(struct mount *p)
{
return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
}
static inline struct mount *last_slave(struct mount *p)
{
return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
}
static inline struct mount *next_slave(struct mount *p)
{
return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
}
static struct mount *get_peer_under_root(struct mount *mnt,
struct mnt_namespace *ns,
const struct path *root)
{
struct mount *m = mnt;
do {
/* Check the namespace first for optimization */
if (m->mnt_ns == ns && is_path_reachable(m, m->mnt.mnt_root, root))
return m;
m = next_peer(m);
} while (m != mnt);
return NULL;
}
/*
* Get ID of closest dominating peer group having a representative
* under the given root.
*
* Caller must hold namespace_sem
*/
int get_dominating_id(struct mount *mnt, const struct path *root)
{
struct mount *m;
for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) {
struct mount *d = get_peer_under_root(m, mnt->mnt_ns, root);
if (d)
return d->mnt_group_id;
}
return 0;
}
static int do_make_slave(struct mount *mnt)
{
struct mount *master, *slave_mnt;
if (list_empty(&mnt->mnt_share)) { if (IS_MNT_SHARED(mnt)) { mnt_release_group_id(mnt);
CLEAR_MNT_SHARED(mnt);
}
master = mnt->mnt_master;
if (!master) {
struct list_head *p = &mnt->mnt_slave_list;
while (!list_empty(p)) {
slave_mnt = list_first_entry(p,
struct mount, mnt_slave);
list_del_init(&slave_mnt->mnt_slave);
slave_mnt->mnt_master = NULL;
}
return 0;
}
} else {
struct mount *m;
/*
* slave 'mnt' to a peer mount that has the
* same root dentry. If none is available then
* slave it to anything that is available.
*/
for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) { if (m->mnt.mnt_root == mnt->mnt.mnt_root) {
master = m;
break;
}
}
list_del_init(&mnt->mnt_share);
mnt->mnt_group_id = 0;
CLEAR_MNT_SHARED(mnt);
}
list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) slave_mnt->mnt_master = master; list_move(&mnt->mnt_slave, &master->mnt_slave_list); list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
mnt->mnt_master = master;
return 0;
}
/*
* vfsmount lock must be held for write
*/
void change_mnt_propagation(struct mount *mnt, int type)
{
if (type == MS_SHARED) {
set_mnt_shared(mnt);
return;
}
do_make_slave(mnt);
if (type != MS_SLAVE) { list_del_init(&mnt->mnt_slave);
mnt->mnt_master = NULL;
if (type == MS_UNBINDABLE)
mnt->mnt.mnt_flags |= MNT_UNBINDABLE;
else
mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE;
}
}
/*
* get the next mount in the propagation tree.
* @m: the mount seen last
* @origin: the original mount from where the tree walk initiated
*
* Note that peer groups form contiguous segments of slave lists.
* We rely on that in get_source() to be able to find out if
* vfsmount found while iterating with propagation_next() is
* a peer of one we'd found earlier.
*/
static struct mount *propagation_next(struct mount *m,
struct mount *origin)
{
/* are there any slaves of this mount? */
if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
return first_slave(m);
while (1) {
struct mount *master = m->mnt_master;
if (master == origin->mnt_master) {
struct mount *next = next_peer(m);
return (next == origin) ? NULL : next;
} else if (m->mnt_slave.next != &master->mnt_slave_list)
return next_slave(m);
/* back at master */
m = master;
}
}
static struct mount *skip_propagation_subtree(struct mount *m,
struct mount *origin)
{
/*
* Advance m such that propagation_next will not return
* the slaves of m.
*/
if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
m = last_slave(m);
return m;
}
static struct mount *next_group(struct mount *m, struct mount *origin)
{
while (1) {
while (1) {
struct mount *next;
if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); next = next_peer(m);
if (m->mnt_group_id == origin->mnt_group_id) {
if (next == origin)
return NULL;
} else if (m->mnt_slave.next != &next->mnt_slave)
break;
m = next;
}
/* m is the last peer */
while (1) {
struct mount *master = m->mnt_master;
if (m->mnt_slave.next != &master->mnt_slave_list)
return next_slave(m);
m = next_peer(master);
if (master->mnt_group_id == origin->mnt_group_id)
break;
if (master->mnt_slave.next == &m->mnt_slave)
break;
m = master;
}
if (m == origin)
return NULL;
}
}
/* all accesses are serialized by namespace_sem */
static struct mount *last_dest, *first_source, *last_source, *dest_master;
static struct mountpoint *mp;
static struct hlist_head *list;
static inline bool peers(struct mount *m1, struct mount *m2)
{
return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
}
static int propagate_one(struct mount *m)
{
struct mount *child;
int type;
/* skip ones added by this propagate_mnt() */
if (IS_MNT_NEW(m))
return 0;
/* skip if mountpoint isn't covered by it */
if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
return 0;
if (peers(m, last_dest)) {
type = CL_MAKE_SHARED;
} else {
struct mount *n, *p;
bool done;
for (n = m; ; n = p) {
p = n->mnt_master;
if (p == dest_master || IS_MNT_MARKED(p))
break;
}
do {
struct mount *parent = last_source->mnt_parent;
if (last_source == first_source)
break;
done = parent->mnt_master == p;
if (done && peers(n, parent))
break;
last_source = last_source->mnt_master;
} while (!done);
type = CL_SLAVE;
/* beginning of peer group among the slaves? */
if (IS_MNT_SHARED(m))
type |= CL_MAKE_SHARED;
}
child = copy_tree(last_source, last_source->mnt.mnt_root, type);
if (IS_ERR(child))
return PTR_ERR(child);
read_seqlock_excl(&mount_lock);
mnt_set_mountpoint(m, mp, child);
if (m->mnt_master != dest_master)
SET_MNT_MARK(m->mnt_master);
read_sequnlock_excl(&mount_lock);
last_dest = m;
last_source = child;
hlist_add_head(&child->mnt_hash, list);
return count_mounts(m->mnt_ns, child);
}
/*
* mount 'source_mnt' under the destination 'dest_mnt' at
* dentry 'dest_dentry'. And propagate that mount to
* all the peer and slave mounts of 'dest_mnt'.
* Link all the new mounts into a propagation tree headed at
* source_mnt. Also link all the new mounts using ->mnt_list
* headed at source_mnt's ->mnt_list
*
* @dest_mnt: destination mount.
* @dest_dentry: destination dentry.
* @source_mnt: source mount.
* @tree_list : list of heads of trees to be attached.
*/
int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
struct mount *source_mnt, struct hlist_head *tree_list)
{
struct mount *m, *n;
int ret = 0;
/*
* we don't want to bother passing tons of arguments to
* propagate_one(); everything is serialized by namespace_sem,
* so globals will do just fine.
*/
last_dest = dest_mnt;
first_source = source_mnt;
last_source = source_mnt;
mp = dest_mp;
list = tree_list;
dest_master = dest_mnt->mnt_master;
/* all peers of dest_mnt, except dest_mnt itself */
for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { ret = propagate_one(n);
if (ret)
goto out;
}
/* all slave groups */
for (m = next_group(dest_mnt, dest_mnt); m; m = next_group(m, dest_mnt)) {
/* everything in that slave group */
n = m;
do {
ret = propagate_one(n);
if (ret)
goto out;
n = next_peer(n);
} while (n != m);
}
out:
read_seqlock_excl(&mount_lock);
hlist_for_each_entry(n, tree_list, mnt_hash) { m = n->mnt_parent;
if (m->mnt_master != dest_mnt->mnt_master)
CLEAR_MNT_MARK(m->mnt_master);
}
read_sequnlock_excl(&mount_lock);
return ret;
}
static struct mount *find_topper(struct mount *mnt)
{
/* If there is exactly one mount covering mnt completely return it. */
struct mount *child;
if (!list_is_singular(&mnt->mnt_mounts))
return NULL;
child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
if (child->mnt_mountpoint != mnt->mnt.mnt_root)
return NULL;
return child;
}
/*
* return true if the refcount is greater than count
*/
static inline int do_refcount_check(struct mount *mnt, int count)
{
return mnt_get_count(mnt) > count;
}
/*
* check if the mount 'mnt' can be unmounted successfully.
* @mnt: the mount to be checked for unmount
* NOTE: unmounting 'mnt' would naturally propagate to all
* other mounts its parent propagates to.
* Check if any of these mounts that **do not have submounts**
* have more references than 'refcnt'. If so return busy.
*
* vfsmount lock must be held for write
*/
int propagate_mount_busy(struct mount *mnt, int refcnt)
{
struct mount *m, *child, *topper;
struct mount *parent = mnt->mnt_parent;
if (mnt == parent)
return do_refcount_check(mnt, refcnt);
/*
* quickly check if the current mount can be unmounted.
* If not, we don't have to go checking for all other
* mounts
*/
if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt))
return 1;
for (m = propagation_next(parent, parent); m;
m = propagation_next(m, parent)) {
int count = 1;
child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
if (!child)
continue;
/* Is there exactly one mount on the child that covers
* it completely whose reference should be ignored?
*/
topper = find_topper(child);
if (topper)
count += 1;
else if (!list_empty(&child->mnt_mounts))
continue;
if (do_refcount_check(child, count))
return 1;
}
return 0;
}
/*
* Clear MNT_LOCKED when it can be shown to be safe.
*
* mount_lock lock must be held for write
*/
void propagate_mount_unlock(struct mount *mnt)
{
struct mount *parent = mnt->mnt_parent;
struct mount *m, *child;
BUG_ON(parent == mnt);
for (m = propagation_next(parent, parent); m;
m = propagation_next(m, parent)) {
child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
if (child)
child->mnt.mnt_flags &= ~MNT_LOCKED;
}
}
static void umount_one(struct mount *mnt, struct list_head *to_umount)
{
CLEAR_MNT_MARK(mnt);
mnt->mnt.mnt_flags |= MNT_UMOUNT;
list_del_init(&mnt->mnt_child);
list_del_init(&mnt->mnt_umounting);
list_move_tail(&mnt->mnt_list, to_umount);
}
/*
* NOTE: unmounting 'mnt' naturally propagates to all other mounts its
* parent propagates to.
*/
static bool __propagate_umount(struct mount *mnt,
struct list_head *to_umount,
struct list_head *to_restore)
{
bool progress = false;
struct mount *child;
/*
* The state of the parent won't change if this mount is
* already unmounted or marked as without children.
*/
if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
goto out;
/* Verify topper is the only grandchild that has not been
* speculatively unmounted.
*/
list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
if (child->mnt_mountpoint == mnt->mnt.mnt_root)
continue;
if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
continue;
/* Found a mounted child */
goto children;
}
/* Mark mounts that can be unmounted if not locked */
SET_MNT_MARK(mnt);
progress = true;
/* If a mount is without children and not locked umount it. */
if (!IS_MNT_LOCKED(mnt)) {
umount_one(mnt, to_umount);
} else {
children:
list_move_tail(&mnt->mnt_umounting, to_restore);
}
out:
return progress;
}
static void umount_list(struct list_head *to_umount,
struct list_head *to_restore)
{
struct mount *mnt, *child, *tmp;
list_for_each_entry(mnt, to_umount, mnt_list) {
list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
/* topper? */
if (child->mnt_mountpoint == mnt->mnt.mnt_root)
list_move_tail(&child->mnt_umounting, to_restore);
else
umount_one(child, to_umount);
}
}
}
static void restore_mounts(struct list_head *to_restore)
{
/* Restore mounts to a clean working state */
while (!list_empty(to_restore)) {
struct mount *mnt, *parent;
struct mountpoint *mp;
mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
CLEAR_MNT_MARK(mnt);
list_del_init(&mnt->mnt_umounting);
/* Should this mount be reparented? */
mp = mnt->mnt_mp;
parent = mnt->mnt_parent;
while (parent->mnt.mnt_flags & MNT_UMOUNT) {
mp = parent->mnt_mp;
parent = parent->mnt_parent;
}
if (parent != mnt->mnt_parent)
mnt_change_mountpoint(parent, mp, mnt);
}
}
static void cleanup_umount_visitations(struct list_head *visited)
{
while (!list_empty(visited)) {
struct mount *mnt =
list_first_entry(visited, struct mount, mnt_umounting);
list_del_init(&mnt->mnt_umounting);
}
}
/*
* collect all mounts that receive propagation from the mount in @list,
* and return these additional mounts in the same list.
* @list: the list of mounts to be unmounted.
*
* vfsmount lock must be held for write
*/
int propagate_umount(struct list_head *list)
{
struct mount *mnt;
LIST_HEAD(to_restore);
LIST_HEAD(to_umount);
LIST_HEAD(visited);
/* Find candidates for unmounting */
list_for_each_entry_reverse(mnt, list, mnt_list) {
struct mount *parent = mnt->mnt_parent;
struct mount *m;
/*
* If this mount has already been visited it is known that it's
* entire peer group and all of their slaves in the propagation
* tree for the mountpoint has already been visited and there is
* no need to visit them again.
*/
if (!list_empty(&mnt->mnt_umounting))
continue;
list_add_tail(&mnt->mnt_umounting, &visited);
for (m = propagation_next(parent, parent); m;
m = propagation_next(m, parent)) {
struct mount *child = __lookup_mnt(&m->mnt,
mnt->mnt_mountpoint);
if (!child)
continue;
if (!list_empty(&child->mnt_umounting)) {
/*
* If the child has already been visited it is
* know that it's entire peer group and all of
* their slaves in the propgation tree for the
* mountpoint has already been visited and there
* is no need to visit this subtree again.
*/
m = skip_propagation_subtree(m, parent);
continue;
} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
/*
* We have come accross an partially unmounted
* mount in list that has not been visited yet.
* Remember it has been visited and continue
* about our merry way.
*/
list_add_tail(&child->mnt_umounting, &visited);
continue;
}
/* Check the child and parents while progress is made */
while (__propagate_umount(child,
&to_umount, &to_restore)) {
/* Is the parent a umount candidate? */
child = child->mnt_parent;
if (list_empty(&child->mnt_umounting))
break;
}
}
}
umount_list(&to_umount, &to_restore);
restore_mounts(&to_restore);
cleanup_umount_visitations(&visited);
list_splice_tail(&to_umount, list);
return 0;
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
#include "qgroup.h"
#include "subpage.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
if (entry->file_offset + entry->num_bytes < entry->file_offset)
return (u64)-1;
return entry->file_offset + entry->num_bytes;
}
/* returns NULL if the insertion worked, or it returns the node it did find
* in the tree
*/
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
struct rb_node *node)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct btrfs_ordered_extent *entry;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
if (file_offset < entry->file_offset) p = &(*p)->rb_left; else if (file_offset >= entry_end(entry)) p = &(*p)->rb_right;
else
return parent;
}
rb_link_node(node, parent, p);
rb_insert_color(node, root);
return NULL;
}
/*
* look for a given offset in the tree, and if it can't be found return the
* first lesser offset
*/
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
struct rb_node **prev_ret)
{
struct rb_node *n = root->rb_node;
struct rb_node *prev = NULL;
struct rb_node *test;
struct btrfs_ordered_extent *entry;
struct btrfs_ordered_extent *prev_entry = NULL;
while (n) { entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
prev = n;
prev_entry = entry;
if (file_offset < entry->file_offset) n = n->rb_left; else if (file_offset >= entry_end(entry)) n = n->rb_right;
else
return n;
}
if (!prev_ret)
return NULL;
while (prev && file_offset >= entry_end(prev_entry)) { test = rb_next(prev);
if (!test)
break;
prev_entry = rb_entry(test, struct btrfs_ordered_extent,
rb_node);
if (file_offset < entry_end(prev_entry))
break;
prev = test;
}
if (prev)
prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
rb_node);
while (prev && file_offset < entry_end(prev_entry)) { test = rb_prev(prev);
if (!test)
break;
prev_entry = rb_entry(test, struct btrfs_ordered_extent,
rb_node);
prev = test;
}
*prev_ret = prev; return NULL;
}
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
u64 len)
{
if (file_offset + len <= entry->file_offset ||
entry->file_offset + entry->num_bytes <= file_offset)
return 0;
return 1;
}
/*
* look find the first ordered struct that has this offset, otherwise
* the first one less than this offset
*/
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
u64 file_offset)
{
struct rb_root *root = &tree->tree;
struct rb_node *prev = NULL;
struct rb_node *ret;
struct btrfs_ordered_extent *entry;
if (tree->last) {
entry = rb_entry(tree->last, struct btrfs_ordered_extent,
rb_node);
if (in_range(file_offset, entry->file_offset, entry->num_bytes))
return tree->last;
}
ret = __tree_search(root, file_offset, &prev);
if (!ret)
ret = prev; if (ret) tree->last = ret;
return ret;
}
/*
* Allocate and add a new ordered_extent into the per-inode tree.
*
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type, int dio,
int compress_type)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
int ret;
if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
/* For nocow write, we can release the qgroup rsv right now */
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
if (ret < 0)
return ret;
ret = 0;
} else {
/*
* The ordered extent has reserved qgroup space, release now
* and pass the reserved number for qgroup_record to free.
*/
ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
if (ret < 0)
return ret;
}
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
if (!entry)
return -ENOMEM;
entry->file_offset = file_offset;
entry->disk_bytenr = disk_bytenr;
entry->num_bytes = num_bytes;
entry->disk_num_bytes = disk_num_bytes;
entry->bytes_left = num_bytes;
entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = ret;
entry->physical = (u64)-1;
ASSERT(type == BTRFS_ORDERED_REGULAR ||
type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_PREALLOC ||
type == BTRFS_ORDERED_COMPRESSED);
set_bit(type, &entry->flags);
percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
fs_info->delalloc_batch);
if (dio)
set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
/* one ref for the tree */
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
trace_btrfs_ordered_extent_add(inode, entry);
spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"inconsistency in ordered tree at offset %llu",
file_offset);
spin_unlock_irq(&tree->lock);
spin_lock(&root->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
&root->ordered_extents);
root->nr_ordered_extents++;
if (root->nr_ordered_extents == 1) {
spin_lock(&fs_info->ordered_root_lock);
BUG_ON(!list_empty(&root->ordered_root)); list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
}
spin_unlock(&root->ordered_extent_lock);
/*
* We don't need the count_max_extents here, we can assume that all of
* that work has been done at higher layers, so this is truly the
* smallest the extent is going to get.
*/
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, 1);
spin_unlock(&inode->lock);
return 0;
}
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type)
{
ASSERT(type == BTRFS_ORDERED_REGULAR ||
type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_PREALLOC);
return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
num_bytes, disk_num_bytes, type, 0,
BTRFS_COMPRESS_NONE);
}
int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type)
{
ASSERT(type == BTRFS_ORDERED_REGULAR ||
type == BTRFS_ORDERED_NOCOW ||
type == BTRFS_ORDERED_PREALLOC);
return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
num_bytes, disk_num_bytes, type, 1,
BTRFS_COMPRESS_NONE);
}
int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int compress_type)
{
ASSERT(compress_type != BTRFS_COMPRESS_NONE);
return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
num_bytes, disk_num_bytes,
BTRFS_ORDERED_COMPRESSED, 0,
compress_type);
}
/*
* Add a struct btrfs_ordered_sum into the list of checksums to be inserted
* when an ordered extent is finished. If the list covers more than one
* ordered extent, it is split across multiples.
*/
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum)
{
struct btrfs_ordered_inode_tree *tree;
tree = &BTRFS_I(entry->inode)->ordered_tree;
spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
spin_unlock_irq(&tree->lock);
}
/*
* Mark all ordered extents io inside the specified range finished.
*
* @page: The invovled page for the opeartion.
* For uncompressed buffered IO, the page status also needs to be
* updated to indicate whether the pending ordered io is finished.
* Can be NULL for direct IO and compressed write.
* For these cases, callers are ensured they won't execute the
* endio function twice.
* @finish_func: The function to be executed when all the IO of an ordered
* extent are finished.
*
* This function is called for endio, thus the range must have ordered
* extent(s) coveri it.
*/
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
struct page *page, u64 file_offset,
u64 num_bytes, btrfs_func_t finish_func,
bool uptodate)
{
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_workqueue *wq;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
u64 cur = file_offset;
if (btrfs_is_free_space_inode(inode))
wq = fs_info->endio_freespace_worker;
else
wq = fs_info->endio_write_workers;
if (page)
ASSERT(page->mapping && page_offset(page) <= file_offset &&
file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);
spin_lock_irqsave(&tree->lock, flags); while (cur < file_offset + num_bytes) {
u64 entry_end;
u64 end;
u32 len;
node = tree_search(tree, cur);
/* No ordered extents at all */
if (!node)
break;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
entry_end = entry->file_offset + entry->num_bytes;
/*
* |<-- OE --->| |
* cur
* Go to next OE.
*/
if (cur >= entry_end) {
node = rb_next(node);
/* No more ordered extents, exit */
if (!node)
break;
entry = rb_entry(node, struct btrfs_ordered_extent,
rb_node);
/* Go to next ordered extent and continue */
cur = entry->file_offset;
continue;
}
/*
* | |<--- OE --->|
* cur
* Go to the start of OE.
*/
if (cur < entry->file_offset) {
cur = entry->file_offset;
continue;
}
/*
* Now we are definitely inside one ordered extent.
*
* |<--- OE --->|
* |
* cur
*/
end = min(entry->file_offset + entry->num_bytes,
file_offset + num_bytes) - 1;
ASSERT(end + 1 - cur < U32_MAX);
len = end + 1 - cur;
if (page) {
/*
* Ordered (Private2) bit indicates whether we still
* have pending io unfinished for the ordered extent.
*
* If there's no such bit, we need to skip to next range.
*/
if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
cur += len;
continue;
}
btrfs_page_clear_ordered(fs_info, page, cur, len);
}
/* Now we're fine to update the accounting */
if (unlikely(len > entry->bytes_left)) { WARN_ON(1);
btrfs_crit(fs_info,
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
inode->root->root_key.objectid,
btrfs_ino(inode),
entry->file_offset,
entry->num_bytes,
len, entry->bytes_left);
entry->bytes_left = 0;
} else {
entry->bytes_left -= len;
}
if (!uptodate)
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
/*
* All the IO of the ordered extent is finished, we need to queue
* the finish_func to be executed.
*/
if (entry->bytes_left == 0) { set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); cond_wake_up(&entry->wait); refcount_inc(&entry->refs);
spin_unlock_irqrestore(&tree->lock, flags);
btrfs_init_work(&entry->work, finish_func, NULL, NULL);
btrfs_queue_work(wq, &entry->work);
spin_lock_irqsave(&tree->lock, flags);
}
cur += len;
}
spin_unlock_irqrestore(&tree->lock, flags);
}
/*
* Finish IO for one ordered extent across a given range. The range can only
* contain one ordered extent.
*
* @cached: The cached ordered extent. If not NULL, we can skip the tree
* search and use the ordered extent directly.
* Will be also used to store the finished ordered extent.
* @file_offset: File offset for the finished IO
* @io_size: Length of the finish IO range
*
* Return true if the ordered extent is finished in the range, and update
* @cached.
* Return false otherwise.
*
* NOTE: The range can NOT cross multiple ordered extents.
* Thus caller should ensure the range doesn't cross ordered extents.
*/
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size)
{
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
bool finished = false;
spin_lock_irqsave(&tree->lock, flags);
if (cached && *cached) {
entry = *cached;
goto have_entry;
}
node = tree_search(tree, file_offset);
if (!node)
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
goto out;
if (io_size > entry->bytes_left)
btrfs_crit(inode->root->fs_info,
"bad ordered accounting left %llu size %llu",
entry->bytes_left, io_size);
entry->bytes_left -= io_size;
if (entry->bytes_left == 0) {
/*
* Ensure only one caller can set the flag and finished_ret
* accordingly
*/
finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
/* test_and_set_bit implies a barrier */
cond_wake_up_nomb(&entry->wait);
}
out:
if (finished && cached && entry) { *cached = entry;
refcount_inc(&entry->refs);
}
spin_unlock_irqrestore(&tree->lock, flags);
return finished;
}
/*
* used to drop a reference on an ordered extent. This will free
* the extent if the last reference is dropped
*/
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
struct list_head *cur;
struct btrfs_ordered_sum *sum;
trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry); if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->root_extent_list));
ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
list_del(&sum->list);
kvfree(sum);
}
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
}
/*
* remove an ordered extent from the tree. No references are dropped
* and waiters are woken up.
*/
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = btrfs_inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
bool pending;
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
if (root != fs_info->tree_root)
btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
false);
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
tree = &btrfs_inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
RB_CLEAR_NODE(node);
if (tree->last == node)
tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
spin_unlock_irq(&tree->lock);
/*
* The current running transaction is waiting on us, we need to let it
* know that we're complete and wake it up.
*/
if (pending) {
struct btrfs_transaction *trans;
/*
* The checks for trans are just a formality, it should be set,
* but if it isn't we don't want to deref/assert under the spin
* lock, so be nice and check if trans is set, but ASSERT() so
* if it isn't set a developer will notice.
*/
spin_lock(&fs_info->trans_lock);
trans = fs_info->running_transaction;
if (trans)
refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
ASSERT(trans);
if (trans) {
if (atomic_dec_and_test(&trans->pending_ordered)) wake_up(&trans->pending_wait); btrfs_put_transaction(trans);
}
}
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
if (!root->nr_ordered_extents) {
spin_lock(&fs_info->ordered_root_lock);
BUG_ON(list_empty(&root->ordered_root));
list_del_init(&root->ordered_root);
spin_unlock(&fs_info->ordered_root_lock);
}
spin_unlock(&root->ordered_extent_lock);
wake_up(&entry->wait);
}
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
struct btrfs_ordered_extent *ordered;
ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
btrfs_start_ordered_extent(ordered, 1);
complete(&ordered->completion);
}
/*
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len)
{
struct btrfs_fs_info *fs_info = root->fs_info;
LIST_HEAD(splice);
LIST_HEAD(skipped);
LIST_HEAD(works);
struct btrfs_ordered_extent *ordered, *next;
u64 count = 0;
const u64 range_end = range_start + range_len;
mutex_lock(&root->ordered_extent_mutex);
spin_lock(&root->ordered_extent_lock);
list_splice_init(&root->ordered_extents, &splice);
while (!list_empty(&splice) && nr) {
ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
root_extent_list);
if (range_end <= ordered->disk_bytenr ||
ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
list_move_tail(&ordered->root_extent_list, &skipped);
cond_resched_lock(&root->ordered_extent_lock);
continue;
}
list_move_tail(&ordered->root_extent_list,
&root->ordered_extents);
refcount_inc(&ordered->refs);
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
btrfs_run_ordered_extent_work, NULL, NULL);
list_add_tail(&ordered->work_list, &works);
btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
cond_resched();
spin_lock(&root->ordered_extent_lock);
if (nr != U64_MAX)
nr--;
count++;
}
list_splice_tail(&skipped, &root->ordered_extents);
list_splice_tail(&splice, &root->ordered_extents);
spin_unlock(&root->ordered_extent_lock);
list_for_each_entry_safe(ordered, next, &works, work_list) {
list_del_init(&ordered->work_list);
wait_for_completion(&ordered->completion);
btrfs_put_ordered_extent(ordered);
cond_resched();
}
mutex_unlock(&root->ordered_extent_mutex);
return count;
}
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len)
{
struct btrfs_root *root;
struct list_head splice;
u64 done;
INIT_LIST_HEAD(&splice);
mutex_lock(&fs_info->ordered_operations_mutex);
spin_lock(&fs_info->ordered_root_lock);
list_splice_init(&fs_info->ordered_roots, &splice);
while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root,
ordered_root);
root = btrfs_grab_root(root);
BUG_ON(!root); list_move_tail(&root->ordered_root,
&fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
done = btrfs_wait_ordered_extents(root, nr,
range_start, range_len);
btrfs_put_root(root);
spin_lock(&fs_info->ordered_root_lock);
if (nr != U64_MAX) {
nr -= done;
}
}
list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
mutex_unlock(&fs_info->ordered_operations_mutex);
}
/*
* Used to start IO or wait for a given ordered extent to finish.
*
* If wait is one, this effectively waits on page writeback for all the pages
* in the extent, and it waits on the io completion code to insert
* metadata into the btree corresponding to the extent
*/
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
{
u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1;
struct btrfs_inode *inode = BTRFS_I(entry->inode);
trace_btrfs_ordered_extent_start(inode, entry);
/*
* pages in the range can be dirty, clean or writeback. We
* start IO on any dirty ones so the wait doesn't stall waiting
* for the flusher thread to find them
*/
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
}
}
/*
* Used to wait on ordered extents across a large range of bytes.
*/
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
int ret = 0;
int ret_wb = 0;
u64 end;
u64 orig_end;
struct btrfs_ordered_extent *ordered;
if (start + len < start) {
orig_end = INT_LIMIT(loff_t);
} else {
orig_end = start + len - 1;
if (orig_end > INT_LIMIT(loff_t))
orig_end = INT_LIMIT(loff_t);
}
/* start IO across the range first to instantiate any delalloc
* extents
*/
ret = btrfs_fdatawrite_range(inode, start, orig_end); if (ret)
return ret;
/*
* If we have a writeback error don't return immediately. Wait first
* for any ordered extents that haven't completed yet. This is to make
* sure no one can dirty the same page ranges and call writepages()
* before the ordered extents complete - to avoid failures (-EEXIST)
* when adding the new ordered extents to the ordered tree.
*/
ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
end = orig_end;
while (1) {
ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
if (!ordered)
break;
if (ordered->file_offset > orig_end) {
btrfs_put_ordered_extent(ordered);
break;
}
if (ordered->file_offset + ordered->num_bytes <= start) { btrfs_put_ordered_extent(ordered);
break;
}
btrfs_start_ordered_extent(ordered, 1);
end = ordered->file_offset;
/*
* If the ordered extent had an error save the error but don't
* exit without waiting first for all other ordered extents in
* the range to complete.
*/
if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
ret = -EIO;
btrfs_put_ordered_extent(ordered); if (end == 0 || end == start)
break;
end--;
}
return ret_wb ? ret_wb : ret;
}
/*
* find an ordered extent corresponding to file_offset. return NULL if
* nothing is found, otherwise take a reference on the extent and return it
*/
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
tree = &inode->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, file_offset);
if (!node)
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
entry = NULL;
if (entry)
refcount_inc(&entry->refs);
out:
spin_unlock_irqrestore(&tree->lock, flags);
return entry;
}
/* Since the DIO code tries to lock a wide area we need to look for any ordered
* extents that exist in the range, rather than just the start of the range.
*/
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode, u64 file_offset, u64 len)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node) {
node = tree_search(tree, file_offset + len);
if (!node)
goto out;
}
while (1) {
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); if (range_overlaps(entry, file_offset, len))
break;
if (entry->file_offset >= file_offset + len) {
entry = NULL;
break;
}
entry = NULL;
node = rb_next(node);
if (!node)
break;
}
out:
if (entry)
refcount_inc(&entry->refs);
spin_unlock_irq(&tree->lock);
return entry;
}
/*
* Adds all ordered extents to the given list. The list ends up sorted by the
* file_offset of the ordered extents.
*/
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct list_head *list)
{
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *n;
ASSERT(inode_is_locked(&inode->vfs_inode));
spin_lock_irq(&tree->lock);
for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
struct btrfs_ordered_extent *ordered;
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
continue;
ASSERT(list_empty(&ordered->log_list));
list_add_tail(&ordered->log_list, list);
refcount_inc(&ordered->refs);
}
spin_unlock_irq(&tree->lock);
}
/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
*/
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
refcount_inc(&entry->refs);
out:
spin_unlock_irq(&tree->lock);
return entry;
}
/*
* Lookup the first ordered extent that overlaps the range
* [@file_offset, @file_offset + @len).
*
* The difference between this and btrfs_lookup_first_ordered_extent() is
* that this one won't return any ordered extent that does not overlap the range.
* And the difference against btrfs_lookup_ordered_extent() is, this function
* ensures the first ordered extent gets returned.
*/
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
struct btrfs_inode *inode, u64 file_offset, u64 len)
{
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct rb_node *cur;
struct rb_node *prev;
struct rb_node *next;
struct btrfs_ordered_extent *entry = NULL;
spin_lock_irq(&tree->lock);
node = tree->tree.rb_node;
/*
* Here we don't want to use tree_search() which will use tree->last
* and screw up the search order.
* And __tree_search() can't return the adjacent ordered extents
* either, thus here we do our own search.
*/
while (node) { entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
if (file_offset < entry->file_offset) {
node = node->rb_left; } else if (file_offset >= entry_end(entry)) { node = node->rb_right;
} else {
/*
* Direct hit, got an ordered extent that starts at
* @file_offset
*/
goto out;
}
}
if (!entry) {
/* Empty tree */
goto out;
}
cur = &entry->rb_node;
/* We got an entry around @file_offset, check adjacent entries */
if (entry->file_offset < file_offset) {
prev = cur;
next = rb_next(cur);
} else {
prev = rb_prev(cur);
next = cur;
}
if (prev) {
entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
if (range_overlaps(entry, file_offset, len))
goto out;
}
if (next) { entry = rb_entry(next, struct btrfs_ordered_extent, rb_node); if (range_overlaps(entry, file_offset, len))
goto out;
}
/* No ordered extent in the range */
entry = NULL;
out:
if (entry) refcount_inc(&entry->refs);
spin_unlock_irq(&tree->lock);
return entry;
}
/*
* btrfs_flush_ordered_range - Lock the passed range and ensures all pending
* ordered extents in it are run to completion.
*
* @inode: Inode whose ordered tree is to be searched
* @start: Beginning of range to flush
* @end: Last byte of range to lock
* @cached_state: If passed, will return the extent state responsible for the
* locked range. It's the caller's responsibility to free the cached state.
*
* This function always returns with the given range locked, ensuring after it's
* called no order extent can be pending.
*/
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state)
{
struct btrfs_ordered_extent *ordered;
struct extent_state *cache = NULL;
struct extent_state **cachedp = &cache;
if (cached_state)
cachedp = cached_state;
while (1) {
lock_extent_bits(&inode->io_tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
/*
* If no external cached_state has been passed then
* decrement the extra ref taken for cachedp since we
* aren't exposing it outside of this function
*/
if (!cached_state) refcount_dec(&cache->refs); break;
}
unlock_extent_cached(&inode->io_tree, start, end, cachedp);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
}
}
static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
u64 len)
{
struct inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
u64 file_offset = ordered->file_offset + pos;
u64 disk_bytenr = ordered->disk_bytenr + pos;
u64 num_bytes = len;
u64 disk_num_bytes = len;
int type;
unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT);
int compress_type = ordered->compress_type;
unsigned long weight;
int ret;
weight = hweight_long(flags_masked);
WARN_ON_ONCE(weight > 1);
if (!weight)
type = 0;
else
type = __ffs(flags_masked);
/*
* The splitting extent is already counted and will be added again
* in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid
* double counting.
*/
percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes,
fs_info->delalloc_batch);
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) {
WARN_ON_ONCE(1);
ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode),
file_offset, disk_bytenr, num_bytes,
disk_num_bytes, compress_type);
} else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset,
disk_bytenr, num_bytes, disk_num_bytes, type);
} else {
ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset,
disk_bytenr, num_bytes, disk_num_bytes, type);
}
return ret;
}
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
u64 post)
{
struct inode *inode = ordered->inode;
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
struct rb_node *node;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret = 0;
spin_lock_irq(&tree->lock);
/* Remove from tree once */
node = &ordered->rb_node;
rb_erase(node, &tree->tree);
RB_CLEAR_NODE(node);
if (tree->last == node)
tree->last = NULL;
ordered->file_offset += pre;
ordered->disk_bytenr += pre;
ordered->num_bytes -= (pre + post);
ordered->disk_num_bytes -= (pre + post);
ordered->bytes_left -= (pre + post);
/* Re-insert the node */
node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
if (node)
btrfs_panic(fs_info, -EEXIST,
"zoned: inconsistency in ordered tree at offset %llu",
ordered->file_offset);
spin_unlock_irq(&tree->lock);
if (pre)
ret = clone_ordered_extent(ordered, 0, pre);
if (ret == 0 && post)
ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
post);
return ret;
}
int __init ordered_data_init(void)
{
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
sizeof(struct btrfs_ordered_extent), 0,
SLAB_MEM_SPREAD,
NULL);
if (!btrfs_ordered_extent_cache)
return -ENOMEM;
return 0;
}
void __cold ordered_data_exit(void)
{
kmem_cache_destroy(btrfs_ordered_extent_cache);
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
*
* Based on the original implementation which is:
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright 2003 Andi Kleen, SuSE Labs.
*
* Parts of the original code have been moved to arch/x86/vdso/vma.c
*
* This file implements vsyscall emulation. vsyscalls are a legacy ABI:
* Userspace can request certain kernel services by calling fixed
* addresses. This concept is problematic:
*
* - It interferes with ASLR.
* - It's awkward to write code that lives in kernel addresses but is
* callable by userspace at fixed addresses.
* - The whole concept is impossible for 32-bit compat userspace.
* - UML cannot easily virtualize a vsyscall.
*
* As of mid-2014, I believe that there is no new userspace code that
* will use a vsyscall if the vDSO is present. I hope that there will
* soon be no new userspace code that will ever use a vsyscall.
*
* The code in this file emulates vsyscalls when notified of a page
* fault to a vsyscall address.
*/
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/sched/signal.h>
#include <linux/mm_types.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>
#include <asm/vsyscall.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/traps.h>
#include <asm/paravirt.h>
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
NONE;
#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
XONLY;
#else
EMULATE;
#endif
static int __init vsyscall_setup(char *str)
{
if (str) {
if (!strcmp("emulate", str))
vsyscall_mode = EMULATE;
else if (!strcmp("xonly", str))
vsyscall_mode = XONLY;
else if (!strcmp("none", str))
vsyscall_mode = NONE;
else
return -EINVAL;
return 0;
}
return -EINVAL;
}
early_param("vsyscall", vsyscall_setup);
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
const char *message)
{
if (!show_unhandled_signals)
return;
printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
level, current->comm, task_pid_nr(current),
message, regs->ip, regs->cs,
regs->sp, regs->ax, regs->si, regs->di);
}
static int addr_to_vsyscall_nr(unsigned long addr)
{
int nr;
if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
return -EINVAL;
nr = (addr & 0xC00UL) >> 10;
if (nr >= 3)
return -EINVAL;
return nr;
}
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
/*
* XXX: if access_ok, get_user, and put_user handled
* sig_on_uaccess_err, this could go away.
*/
if (!access_ok((void __user *)ptr, size)) {
struct thread_struct *thread = ¤t->thread;
thread->error_code = X86_PF_USER | X86_PF_WRITE;
thread->cr2 = ptr;
thread->trap_nr = X86_TRAP_PF;
force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);
return false;
} else {
return true;
}
}
bool emulate_vsyscall(unsigned long error_code,
struct pt_regs *regs, unsigned long address)
{
struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
int prev_sig_on_uaccess_err;
long ret;
unsigned long orig_dx;
/* Write faults or kernel-privilege faults never get fixed up. */
if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
return false;
if (!(error_code & X86_PF_INSTR)) {
/* Failed vsyscall read */
if (vsyscall_mode == EMULATE)
return false;
/*
* User code tried and failed to read the vsyscall page.
*/
warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
return false;
}
/*
* No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code.
*/
WARN_ON_ONCE(address != regs->ip); if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall attempted with vsyscall=none");
return false;
}
vsyscall_nr = addr_to_vsyscall_nr(address);
trace_emulate_vsyscall(vsyscall_nr);
if (vsyscall_nr < 0) { warn_bad_vsyscall(KERN_WARNING, regs,
"misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
goto sigsegv;
}
if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { warn_bad_vsyscall(KERN_WARNING, regs,
"vsyscall with bad stack (exploit attempt?)");
goto sigsegv;
}
tsk = current;
/*
* Check for access_ok violations and find the syscall nr.
*
* NULL is a valid user pointer (in the access_ok sense) on 32-bit and
* 64-bit, so we don't need to special-case it here. For all the
* vsyscalls, NULL means "don't write anything" not "write it at
* address 0".
*/
switch (vsyscall_nr) {
case 0:
if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) || !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
ret = -EFAULT;
goto check_fault;
}
syscall_nr = __NR_gettimeofday;
break;
case 1:
if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) {
ret = -EFAULT;
goto check_fault;
}
syscall_nr = __NR_time;
break;
case 2:
if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
!write_ok_or_segv(regs->si, sizeof(unsigned))) {
ret = -EFAULT;
goto check_fault;
}
syscall_nr = __NR_getcpu;
break;
}
/*
* Handle seccomp. regs->ip must be the original value.
* See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
*
* We could optimize the seccomp disabled case, but performance
* here doesn't matter.
*/
regs->orig_ax = syscall_nr;
regs->ax = -ENOSYS;
tmp = secure_computing();
if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs,
"seccomp tried to change syscall nr or ip");
force_exit_sig(SIGSYS);
return true;
}
regs->orig_ax = -1;
if (tmp)
goto do_ret; /* skip requested */
/*
* With a real vsyscall, page faults cause SIGSEGV. We want to
* preserve that behavior to make writing exploits harder.
*/
prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
current->thread.sig_on_uaccess_err = 1;
ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
/* this decodes regs->di and regs->si on its own */
ret = __x64_sys_gettimeofday(regs);
break;
case 1:
/* this decodes regs->di on its own */
ret = __x64_sys_time(regs);
break;
case 2:
/* while we could clobber regs->dx, we didn't in the past... */
orig_dx = regs->dx;
regs->dx = 0;
/* this decodes regs->di, regs->si and regs->dx on its own */
ret = __x64_sys_getcpu(regs);
regs->dx = orig_dx;
break;
}
current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
/*
* If we failed to generate a signal for any reason,
* generate one here. (This should be impossible.)
*/
if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
!sigismember(&tsk->pending.signal, SIGSEGV)))
goto sigsegv;
return true; /* Don't emulate the ret. */
}
regs->ax = ret;
do_ret:
/* Emulate a ret instruction. */
regs->ip = caller;
regs->sp += 8;
return true;
sigsegv:
force_sig(SIGSEGV);
return true;
}
/*
* A pseudo VMA to allow ptrace access for the vsyscall page. This only
* covers the 64bit vsyscall page now. 32bit has a real VMA now and does
* not need special handling anymore:
*/
static const char *gate_vma_name(struct vm_area_struct *vma)
{
return "[vsyscall]";
}
static const struct vm_operations_struct gate_vma_ops = {
.name = gate_vma_name,
};
static struct vm_area_struct gate_vma __ro_after_init = {
.vm_start = VSYSCALL_ADDR,
.vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
.vm_flags = VM_READ | VM_EXEC,
.vm_ops = &gate_vma_ops,
};
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_COMPAT
if (!mm || !(mm->context.flags & MM_CONTEXT_HAS_VSYSCALL))
return NULL;
#endif
if (vsyscall_mode == NONE)
return NULL;
return &gate_vma;
}
int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = get_gate_vma(mm);
if (!vma)
return 0;
return (addr >= vma->vm_start) && (addr < vma->vm_end);
}
/*
* Use this when you have no reliable mm, typically from interrupt
* context. It is less reliable than using a task's mm and may give
* false positives.
*/
int in_gate_area_no_mm(unsigned long addr)
{
return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
}
/*
* The VSYSCALL page is the only user-accessible page in the kernel address
* range. Normally, the kernel page tables can have _PAGE_USER clear, but
* the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
* are enabled.
*
* Some day we may create a "minimal" vsyscall mode in which we emulate
* vsyscalls but leave the page not present. If so, we skip calling
* this.
*/
void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
p4d = p4d_offset(pgd, VSYSCALL_ADDR);
#if CONFIG_PGTABLE_LEVELS >= 5
set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
#endif
pud = pud_offset(p4d, VSYSCALL_ADDR);
set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
pmd = pmd_offset(pud, VSYSCALL_ADDR);
set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
}
void __init map_vsyscall(void)
{
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
/*
* For full emulation, the page needs to exist for real. In
* execute-only mode, there is no PTE at all backing the vsyscall
* page.
*/
if (vsyscall_mode == EMULATE) {
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
PAGE_KERNEL_VVAR);
set_vsyscall_pgtable_user_bits(swapper_pg_dir);
}
if (vsyscall_mode == XONLY)
gate_vma.vm_flags = VM_EXEC;
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
}
// SPDX-License-Identifier: GPL-2.0
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Implementation of the Transmission Control Protocol(TCP).
*
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
* Corey Minyard <wf-rch!minyard@relay.EU.net>
* Florian La Roche, <flla@stud.uni-sb.de>
* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
* Linus Torvalds, <torvalds@cs.helsinki.fi>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
* Matthew Dillon, <dillon@apollo.west.oic.com>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Jorge Cwik, <jorge@laser.satlink.net>
*/
/*
* Changes:
* Pedro Roque : Fast Retransmit/Recovery.
* Two receive queues.
* Retransmit queue handled by TCP.
* Better retransmit timer handling.
* New congestion avoidance.
* Header prediction.
* Variable renaming.
*
* Eric : Fast Retransmit.
* Randy Scott : MSS option defines.
* Eric Schenk : Fixes to slow start algorithm.
* Eric Schenk : Yet another double ACK bug.
* Eric Schenk : Delayed ACK bug fixes.
* Eric Schenk : Floyd style fast retrans war avoidance.
* David S. Miller : Don't allow zero congestion window.
* Eric Schenk : Fix retransmitter so that it sends
* next packet on ack of previous packet.
* Andi Kleen : Moved open_request checking here
* and process RSTs for open_requests.
* Andi Kleen : Better prune_queue, and other fixes.
* Andrey Savochkin: Fix RTT measurements in the presence of
* timestamps.
* Andrey Savochkin: Check sequence numbers correctly when
* removing SACKs due to in sequence incoming
* data segments.
* Andi Kleen: Make sure we never ack data there is not
* enough room for. Also make this condition
* a fatal error if it might still happen.
* Andi Kleen: Add tcp_measure_rcv_mss to make
* connections with MSS<min(MTU,ann. MSS)
* work without delayed acks.
* Andi Kleen: Process packets with PSH set in the
* fast path.
* J Hadi Salim: ECN support
* Andrei Gurtov,
* Pasi Sarolahti,
* Panu Kuhlberg: Experimental audit of TCP (re)transmission
* engine. Lots of bugs are found.
* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
*/
#define pr_fmt(fmt) "TCP: " fmt
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <linux/errqueue.h>
#include <trace/events/tcp.h>
#include <linux/jump_label_ratelimit.h>
#include <net/busy_poll.h>
#include <net/mptcp.h>
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
#define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
#define REXMIT_NONE 0 /* no loss recovery to do */
#define REXMIT_LOST 1 /* retransmit packets marked lost */
#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
#if IS_ENABLED(CONFIG_TLS_DEVICE)
static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
void clean_acked_data_enable(struct inet_connection_sock *icsk,
void (*cad)(struct sock *sk, u32 ack_seq))
{
icsk->icsk_clean_acked = cad;
static_branch_deferred_inc(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_enable);
void clean_acked_data_disable(struct inet_connection_sock *icsk)
{
static_branch_slow_dec_deferred(&clean_acked_data_enabled);
icsk->icsk_clean_acked = NULL;
}
EXPORT_SYMBOL_GPL(clean_acked_data_disable);
void clean_acked_data_flush(void)
{
static_key_deferred_flush(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_flush);
#endif
#ifdef CONFIG_CGROUP_BPF
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
struct bpf_sock_ops_kern sock_ops;
if (likely(!unknown_opt && !parse_all_opt))
return;
/* The skb will be handled in the
* bpf_skops_established() or
* bpf_skops_write_hdr_opt().
*/
switch (sk->sk_state) {
case TCP_SYN_RECV:
case TCP_SYN_SENT:
case TCP_LISTEN:
return;
}
sock_owned_by_me(sk);
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
sock_ops.is_fullsock = 1;
sock_ops.sk = sk;
bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
}
static void bpf_skops_established(struct sock *sk, int bpf_op,
struct sk_buff *skb)
{
struct bpf_sock_ops_kern sock_ops;
sock_owned_by_me(sk);
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
sock_ops.op = bpf_op;
sock_ops.is_fullsock = 1;
sock_ops.sk = sk;
/* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
if (skb)
bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
}
#else
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
}
static void bpf_skops_established(struct sock *sk, int bpf_op,
struct sk_buff *skb)
{
}
#endif
static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
unsigned int len)
{
static bool __once __read_mostly;
if (!__once) {
struct net_device *dev;
__once = true;
rcu_read_lock();
dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
if (!dev || len >= dev->mtu)
pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
dev ? dev->name : "Unknown driver");
rcu_read_unlock();
}
}
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const unsigned int lss = icsk->icsk_ack.last_seg_size;
unsigned int len;
icsk->icsk_ack.last_seg_size = 0;
/* skb->len may jitter because of SACKs, even if peer
* sends good full-sized frames.
*/
len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
tcp_sk(sk)->advmss);
/* Account for possibly-removed options */
if (unlikely(len > icsk->icsk_ack.rcv_mss +
MAX_TCP_OPTION_SPACE))
tcp_gro_dev_warn(sk, skb, len);
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
*
* "len" is invariant segment length, including TCP header.
*/
len += skb->data - skb_transport_header(skb);
if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
/* If PSH is not set, packet should be
* full sized, provided peer TCP is not badly broken.
* This observation (if it is correct 8)) allows
* to handle super-low mtu links fairly.
*/
(len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
!(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
/* Subtract also invariant (if peer is RFC compliant),
* tcp header plus fixed timestamp option length.
* Resulting "len" is MSS free of SACK jitter.
*/
len -= tcp_sk(sk)->tcp_header_len;
icsk->icsk_ack.last_seg_size = len;
if (len == lss) {
icsk->icsk_ack.rcv_mss = len;
return;
}
}
if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
}
}
static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks == 0)
quickacks = 2;
quickacks = min(quickacks, max_quickacks);
if (quickacks > icsk->icsk_ack.quick) icsk->icsk_ack.quick = quickacks;
}
void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_incr_quickack(sk, max_quickacks);
inet_csk_exit_pingpong_mode(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
EXPORT_SYMBOL(tcp_enter_quickack_mode);
/* Send ACKs quickly, if "quick" count is not exhausted
* and the session is not interactive.
*/
static bool tcp_in_quickack_mode(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct dst_entry *dst = __sk_dst_get(sk);
return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
}
static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
if (tp->ecn_flags & TCP_ECN_OK)
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
{
if (tcp_hdr(skb)->cwr) {
tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
/* If the sender is telling us it has entered CWR, then its
* cwnd may be very low (even just 1 packet), so we should ACK
* immediately.
*/
if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
}
static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
}
static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
case INET_ECN_NOT_ECT:
/* Funny extension: if ECT is not set on a segment,
* and we already seen ECT on a previous segment,
* it is probably a retransmit.
*/
if (tp->ecn_flags & TCP_ECN_SEEN)
tcp_enter_quickack_mode(sk, 2);
break;
case INET_ECN_CE:
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
/* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode(sk, 2);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
tp->ecn_flags |= TCP_ECN_SEEN;
break;
}
}
static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
__tcp_ecn_check_ce(sk, skb);
}
static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK;
}
static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK;
}
static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
return true;
return false;
}
/* Buffer size and advertised window tuning.
*
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
*/
static void tcp_sndbuf_expand(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
int sndmem, per_mss;
u32 nr_segs;
/* Worst case is non GSO/TSO : each frame consumes one skb
* and skb->head is kmalloced using power of two area of memory
*/
per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
MAX_TCP_HEADER +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
per_mss = roundup_pow_of_two(per_mss) +
SKB_DATA_ALIGN(sizeof(struct sk_buff));
nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
/* Fast Recovery (RFC 5681 3.2) :
* Cubic needs 1.7 factor, rounded to 2 to include
* extra cushion (application might react slowly to EPOLLOUT)
*/
sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
WRITE_ONCE(sk->sk_sndbuf,
min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
*
* All tcp_full_space() is split to two parts: "network" buffer, allocated
* forward and advertised in receiver window (tp->rcv_wnd) and
* "application buffer", required to isolate scheduling/application
* latencies from network.
* window_clamp is maximal advertised window. It can be less than
* tcp_full_space(), in this case tcp_full_space() - window_clamp
* is reserved for "application" buffer. The less window_clamp is
* the smoother our behaviour from viewpoint of network, but the lower
* throughput and the higher sensitivity of the connection to losses. 8)
*
* rcv_ssthresh is more strict window_clamp used at "slow start"
* phase to predict further behaviour of this connection.
* It is used for two goals:
* - to enforce header prediction at sender, even when application
* requires some significant "application buffer". It is check #1.
* - to prevent pruning of receive queue because of misprediction
* of receiver window. Check #2.
*
* The scheme does not work when sender sends good segments opening
* window and then starts to feed us spaghetti. But it should work
* in common situations. Otherwise, we have to rely on queue collapsing.
*/
/* Slow part of check#2. */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
unsigned int skbtruesize)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Optimize this! */
int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
truesize >>= 1;
window >>= 1;
}
return 0;
}
/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
* can play nice with us, as sk_buff and skb->head might be either
* freed or shared with up to MAX_SKB_FRAGS segments.
* Only give a boost to drivers using page frag(s) to hold the frame(s),
* and if no payload was pulled in skb->head before reaching us.
*/
static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
{
u32 truesize = skb->truesize;
if (adjust && !skb_headlen(skb)) {
truesize -= SKB_TRUESIZE(skb_end_offset(skb));
/* paranoid check, some drivers might be buggy */
if (unlikely((int)truesize < (int)skb->len))
truesize = skb->truesize;
}
return truesize;
}
static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
bool adjust)
{
struct tcp_sock *tp = tcp_sk(sk);
int room;
room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
/* Check #1 */
if (room > 0 && !tcp_under_memory_pressure(sk)) {
unsigned int truesize = truesize_adjust(adjust, skb);
int incr;
/* Check #2. Increase window, if skb with such overhead
* will fit to rcvbuf in future.
*/
if (tcp_win_from_space(sk, truesize) <= skb->len)
incr = 2 * tp->advmss;
else
incr = __tcp_grow_window(sk, skb, truesize);
if (incr) {
incr = max_t(int, incr, 2 * skb->len);
tp->rcv_ssthresh += min(room, incr);
inet_csk(sk)->icsk_ack.quick |= 1;
}
}
}
/* 3. Try to fixup all. It is made immediately after connection enters
* established state.
*/
static void tcp_init_buffer_space(struct sock *sk)
{
int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
struct tcp_sock *tp = tcp_sk(sk);
int maxwin;
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
tcp_sndbuf_expand(sk);
tcp_mstamp_refresh(tp);
tp->rcvq_space.time = tp->tcp_mstamp;
tp->rcvq_space.seq = tp->copied_seq;
maxwin = tcp_full_space(sk);
if (tp->window_clamp >= maxwin) {
tp->window_clamp = maxwin;
if (tcp_app_win && maxwin > 4 * tp->advmss)
tp->window_clamp = max(maxwin -
(maxwin >> tcp_app_win),
4 * tp->advmss);
}
/* Force reservation of one segment. */
if (tcp_app_win &&
tp->window_clamp > 2 * tp->advmss &&
tp->window_clamp + tp->advmss > maxwin)
tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
tp->snd_cwnd_stamp = tcp_jiffies32;
tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
(u32)TCP_INIT_CWND * tp->advmss);
}
/* 4. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct net *net = sock_net(sk);
icsk->icsk_ack.quick = 0;
if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
WRITE_ONCE(sk->sk_rcvbuf,
min(atomic_read(&sk->sk_rmem_alloc),
net->ipv4.sysctl_tcp_rmem[2]));
}
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}
/* Initialize RCV_MSS value.
* RCV_MSS is an our guess about MSS used by the peer.
* We haven't any direct information about the MSS.
* It's better to underestimate the RCV_MSS rather than overestimate.
* Overestimations make us ACKing less frequently than needed.
* Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
*/
void tcp_initialize_rcv_mss(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
hint = min(hint, tp->rcv_wnd / 2);
hint = min(hint, TCP_MSS_DEFAULT);
hint = max(hint, TCP_MIN_MSS);
inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
EXPORT_SYMBOL(tcp_initialize_rcv_mss);
/* Receiver "autotuning" code.
*
* The algorithm for RTT estimation w/o timestamps is based on
* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
* <https://public.lanl.gov/radiant/pubs.html#DRS>
*
* More detail on this code can be found at
* <http://staff.psc.edu/jheffner/>,
* though this reference is out of date. A new paper
* is pending.
*/
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
u32 new_sample = tp->rcv_rtt_est.rtt_us;
long m = sample;
if (new_sample != 0) {
/* If we sample in larger samples in the non-timestamp
* case, we could grossly overestimate the RTT especially
* with chatty applications or bulk transfer apps which
* are stalled on filesystem I/O.
*
* Also, since we are only going for a minimum in the
* non-timestamp case, we do not smooth things out
* else with timestamps disabled convergence takes too
* long.
*/
if (!win_dep) {
m -= (new_sample >> 3);
new_sample += m;
} else {
m <<= 3;
if (m < new_sample)
new_sample = m;
}
} else {
/* No previous measure. */
new_sample = m << 3;
}
tp->rcv_rtt_est.rtt_us = new_sample;
}
static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
u32 delta_us;
if (tp->rcv_rtt_est.time == 0)
goto new_measure;
if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
return;
delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
if (!delta_us)
delta_us = 1;
tcp_rcv_rtt_update(tp, delta_us, 1);
new_measure:
tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
tp->rcv_rtt_est.time = tp->tcp_mstamp;
}
static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
return;
tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
if (TCP_SKB_CB(skb)->end_seq -
TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
u32 delta_us;
if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
if (!delta)
delta = 1;
delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
tcp_rcv_rtt_update(tp, delta_us, 0);
}
}
}
/*
* This function should be called every time data is copied to user space.
* It calculates the appropriate TCP receive buffer space.
*/
void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 copied;
int time;
trace_tcp_rcv_space_adjust(sk);
tcp_mstamp_refresh(tp);
time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
return;
/* Number of bytes copied to user in last RTT */
copied = tp->copied_seq - tp->rcvq_space.seq;
if (copied <= tp->rcvq_space.space)
goto new_measure;
/* A bit of theory :
* copied = bytes received in previous RTT, our base window
* To cope with packet losses, we need a 2x factor
* To cope with slow start, and sender growing its cwin by 100 %
* every RTT, we need a 4x factor, because the ACK we are sending
* now is for the next RTT, not the current one :
* <prev RTT . ><current RTT .. ><next RTT .... >
*/
if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvmem, rcvbuf;
u64 rcvwin, grow;
/* minimal window to cope with packet losses, assuming
* steady state. Add some cushion because of small variations.
*/
rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
/* Accommodate for sender rate increase (eg. slow start) */
grow = rcvwin * (copied - tp->rcvq_space.space);
do_div(grow, tp->rcvq_space.space);
rcvwin += (grow << 1);
rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
rcvmem += 128;
do_div(rcvwin, tp->advmss);
rcvbuf = min_t(u64, rcvwin * rcvmem,
sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
if (rcvbuf > sk->sk_rcvbuf) {
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
/* Make the window clamp follow along. */
tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
}
}
tp->rcvq_space.space = copied;
new_measure:
tp->rcvq_space.seq = tp->copied_seq;
tp->rcvq_space.time = tp->tcp_mstamp;
}
/* There is something which you must keep in mind when you analyze the
* behavior of the tp->ato delayed ack timeout interval. When a
* connection starts up, we want to ack as quickly as possible. The
* problem is that "good" TCP's do slow start at the beginning of data
* transmission. The means that until we send the first few ACK's the
* sender will sit on his end and only queue most of his data, because
* he can only send snd_cwnd unacked packets at any given time. For
* each ACK we send, he increments snd_cwnd and transmits more of his
* queue. -DaveM
*/
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
u32 now;
inet_csk_schedule_ack(sk);
tcp_measure_rcv_mss(sk, skb);
tcp_rcv_rtt_measure(tp);
now = tcp_jiffies32;
if (!icsk->icsk_ack.ato) {
/* The _first_ data packet received, initialize
* delayed ACK engine.
*/
tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
icsk->icsk_ack.ato = TCP_ATO_MIN;
} else {
int m = now - icsk->icsk_ack.lrcvtime;
if (m <= TCP_ATO_MIN / 2) {
/* The fastest case is the first. */
icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
} else if (m < icsk->icsk_ack.ato) {
icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
if (icsk->icsk_ack.ato > icsk->icsk_rto)
icsk->icsk_ack.ato = icsk->icsk_rto;
} else if (m > icsk->icsk_rto) {
/* Too long gap. Apparently sender failed to
* restart window, so that we send ACKs quickly.
*/
tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
sk_mem_reclaim(sk);
}
}
icsk->icsk_ack.lrcvtime = now;
tcp_ecn_check_ce(sk, skb);
if (skb->len >= 128)
tcp_grow_window(sk, skb, true);
}
/* Called to compute a smoothed rtt estimate. The data fed to this
* routine either comes from timestamps, or from segments that were
* known _not_ to have been retransmitted [see Karn/Partridge
* Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
* piece by Van Jacobson.
* NOTE: the next three routines used to be one big routine.
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
{
struct tcp_sock *tp = tcp_sk(sk);
long m = mrtt_us; /* RTT */
u32 srtt = tp->srtt_us;
/* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
* are scaled versions of rtt and mean deviation.
* This is designed to be as fast as possible
* m stands for "measurement".
*
* On a 1990 paper the rto value is changed to:
* RTO = rtt + 4 * mdev
*
* Funny. This algorithm seems to be very broken.
* These formulae increase RTO, when it should be decreased, increase
* too slowly, when it should be increased quickly, decrease too quickly
* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
* does not matter how to _calculate_ it. Seems, it was trap
* that VJ failed to avoid. 8)
*/
if (srtt != 0) {
m -= (srtt >> 3); /* m is now error in rtt est */
srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0) {
m = -m; /* m is now abs(error) */
m -= (tp->mdev_us >> 2); /* similar update on mdev */
/* This is similar to one of Eifel findings.
* Eifel blocks mdev updates when rtt decreases.
* This solution is a bit different: we use finer gain
* for mdev in this case (alpha*beta).
* Like Eifel it also prevents growth of rto,
* but also it limits too fast rto decreases,
* happening in pure Eifel.
*/
if (m > 0)
m >>= 3;
} else {
m -= (tp->mdev_us >> 2); /* similar update on mdev */
}
tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
if (tp->mdev_us > tp->mdev_max_us) {
tp->mdev_max_us = tp->mdev_us;
if (tp->mdev_max_us > tp->rttvar_us)
tp->rttvar_us = tp->mdev_max_us;
}
if (after(tp->snd_una, tp->rtt_seq)) {
if (tp->mdev_max_us < tp->rttvar_us)
tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
tp->rtt_seq = tp->snd_nxt;
tp->mdev_max_us = tcp_rto_min_us(sk);
tcp_bpf_rtt(sk);
}
} else {
/* no previous measure. */
srtt = m << 3; /* take the measured time to be rtt */
tp->mdev_us = m << 1; /* make sure rto = 3*rtt */
tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
tp->mdev_max_us = tp->rttvar_us;
tp->rtt_seq = tp->snd_nxt;
tcp_bpf_rtt(sk);
}
tp->srtt_us = max(1U, srtt);
}
static void tcp_update_pacing_rate(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
u64 rate;
/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
/* current rate is (cwnd * mss) / srtt
* In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
* In Congestion Avoidance phase, set it to 120 % the current rate.
*
* [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
* If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
* end of slow start and should slow down.
*/
if (tp->snd_cwnd < tp->snd_ssthresh / 2)
rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
else
rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
rate *= max(tp->snd_cwnd, tp->packets_out);
if (likely(tp->srtt_us))
do_div(rate, tp->srtt_us);
/* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
* without any lock. We want to make sure compiler wont store
* intermediate values in this location.
*/
WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
sk->sk_max_pacing_rate));
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
static void tcp_set_rto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
*
* More seriously:
* 1. If rtt variance happened to be less 50msec, it is hallucination.
* It cannot be less due to utterly erratic ACK generation made
* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
* to do with delayed acks, because at cwnd>2 true delack timeout
* is invisible. Actually, Linux-2.4 also generates erratic
* ACKs in some circumstances.
*/
inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
* all the algo is pure shit and should be replaced
* with correct one. It is exactly, which we pretend to do.
*/
/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
* guarantees that rto is higher.
*/
tcp_bound_rto(sk);
}
__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
if (!cwnd)
cwnd = TCP_INIT_CWND;
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
struct tcp_sacktag_state {
/* Timestamps for earliest and latest never-retransmitted segment
* that was SACKed. RTO needs the earliest RTT to stay conservative,
* but congestion control should still get an accurate delay signal.
*/
u64 first_sackt;
u64 last_sackt;
u32 reord;
u32 sack_delivered;
int flag;
unsigned int mss_now;
struct rate_sample *rate;
};
/* Take a notice that peer is sending D-SACKs. Skip update of data delivery
* and spurious retransmission information if this DSACK is unlikely caused by
* sender's action:
* - DSACKed sequence range is larger than maximum receiver's window.
* - Total no. of DSACKed segments exceed the total no. of retransmitted segs.
*/
static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
u32 end_seq, struct tcp_sacktag_state *state)
{
u32 seq_len, dup_segs = 1;
if (!before(start_seq, end_seq))
return 0;
seq_len = end_seq - start_seq;
/* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
if (seq_len > tp->max_window)
return 0;
if (seq_len > tp->mss_cache)
dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq)
state->flag |= FLAG_DSACK_TLP;
tp->dsack_dups += dup_segs;
/* Skip the DSACK if dup segs weren't retransmitted by sender */
if (tp->dsack_dups > tp->total_retrans)
return 0;
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
/* We increase the RACK ordering window in rounds where we receive
* DSACKs that may have been due to reordering causing RACK to trigger
* a spurious fast recovery. Thus RACK ignores DSACKs that happen
* without having seen reordering, or that match TLP probes (TLP
* is timer-driven, not triggered by RACK).
*/
if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP))
tp->rack.dsack_seen = 1;
state->flag |= FLAG_DSACKING_ACK;
/* A spurious retransmission is delivered */
state->sack_delivered += dup_segs;
return dup_segs;
}
/* It's reordering when higher sequence was delivered (i.e. sacked) before
* some lower never-retransmitted sequence ("low_seq"). The maximum reordering
* distance is approximated in full-mss packet distance ("reordering").
*/
static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
const int ts)
{
struct tcp_sock *tp = tcp_sk(sk);
const u32 mss = tp->mss_cache;
u32 fack, metric;
fack = tcp_highest_sack_seq(tp);
if (!before(low_seq, fack))
return;
metric = fack - low_seq;
if ((metric > tp->reordering * mss) && mss) {
#if FASTRETRANS_DEBUG > 1
pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
tp->reordering,
0,
tp->sacked_out,
tp->undo_marker ? tp->undo_retrans : 0);
#endif
tp->reordering = min_t(u32, (metric + mss - 1) / mss,
sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
}
/* This exciting event is worth to be remembered. 8) */
tp->reord_seen++;
NET_INC_STATS(sock_net(sk),
ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}
/* This must be called before lost_out or retrans_out are updated
* on a new loss, because we want to know if all skbs previously
* known to be lost have already been retransmitted, indicating
* that this newly lost skb is our next skb to retransmit.
*/
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
(tp->retransmit_skb_hint &&
before(TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
tp->retransmit_skb_hint = skb;
}
/* Sum the number of packets on the wire we have marked as lost, and
* notify the congestion control module that the given skb was marked lost.
*/
static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
{
tp->lost += tcp_skb_pcount(skb);
}
void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
__u8 sacked = TCP_SKB_CB(skb)->sacked;
struct tcp_sock *tp = tcp_sk(sk);
if (sacked & TCPCB_SACKED_ACKED)
return;
tcp_verify_retransmit_hint(tp, skb);
if (sacked & TCPCB_LOST) {
if (sacked & TCPCB_SACKED_RETRANS) {
/* Account for retransmits that are lost again */
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
tcp_skb_pcount(skb));
tcp_notify_skb_loss_event(tp, skb);
}
} else {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tcp_notify_skb_loss_event(tp, skb);
}
}
/* Updates the delivered and delivered_ce counts */
static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
bool ece_ack)
{
tp->delivered += delivered;
if (ece_ack)
tp->delivered_ce += delivered;
}
/* This procedure tags the retransmission queue when SACKs arrive.
*
* We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
* Packets in queue with these bits set are counted in variables
* sacked_out, retrans_out and lost_out, correspondingly.
*
* Valid combinations are:
* Tag InFlight Description
* 0 1 - orig segment is in flight.
* S 0 - nothing flies, orig reached receiver.
* L 0 - nothing flies, orig lost by net.
* R 2 - both orig and retransmit are in flight.
* L|R 1 - orig is lost, retransmit is in flight.
* S|R 1 - orig reached receiver, retrans is still in flight.
* (L|S|R is logically valid, it could occur when L|R is sacked,
* but it is equivalent to plain S and code short-curcuits it to S.
* L|S is logically invalid, it would mean -1 packet in flight 8))
*
* These 6 states form finite state machine, controlled by the following events:
* 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
* 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
* 3. Loss detection event of two flavors:
* A. Scoreboard estimator decided the packet is lost.
* A'. Reno "three dupacks" marks head of queue lost.
* B. SACK arrives sacking SND.NXT at the moment, when the
* segment was retransmitted.
* 4. D-SACK added new rule: D-SACK changes any tag to S.
*
* It is pleasant to note, that state diagram turns out to be commutative,
* so that we are allowed not to be bothered by order of our actions,
* when multiple events arrive simultaneously. (see the function below).
*
* Reordering detection.
* --------------------
* Reordering metric is maximal distance, which a packet can be displaced
* in packet stream. With SACKs we can estimate it:
*
* 1. SACK fills old hole and the corresponding segment was not
* ever retransmitted -> reordering. Alas, we cannot use it
* when segment was retransmitted.
* 2. The last flaw is solved with D-SACK. D-SACK arrives
* for retransmitted and already SACKed segment -> reordering..
* Both of these heuristics are not used in Loss state, when we cannot
* account for retransmits accurately.
*
* SACK block validation.
* ----------------------
*
* SACK block range validation checks that the received SACK block fits to
* the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
* Note that SND.UNA is not included to the range though being valid because
* it means that the receiver is rather inconsistent with itself reporting
* SACK reneging when it should advance SND.UNA. Such SACK block this is
* perfectly valid, however, in light of RFC2018 which explicitly states
* that "SACK block MUST reflect the newest segment. Even if the newest
* segment is going to be discarded ...", not that it looks very clever
* in case of head skb. Due to potentional receiver driven attacks, we
* choose to avoid immediate execution of a walk in write queue due to
* reneging and defer head skb's loss recovery to standard loss recovery
* procedure that will eventually trigger (nothing forbids us doing this).
*
* Implements also blockage to start_seq wrap-around. Problem lies in the
* fact that though start_seq (s) is before end_seq (i.e., not reversed),
* there's no guarantee that it will be before snd_nxt (n). The problem
* happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
* wrap (s_w):
*
* <- outs wnd -> <- wrapzone ->
* u e n u_w e_w s n_w
* | | | | | | |
* |<------------+------+----- TCP seqno space --------------+---------->|
* ...-- <2^31 ->| |<--------...
* ...---- >2^31 ------>| |<--------...
*
* Current code wouldn't be vulnerable but it's better still to discard such
* crazy SACK blocks. Doing this check for start_seq alone closes somewhat
* similar case (end_seq after snd_nxt wrap) as earlier reversed check in
* snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
* equal to the ideal case (infinite seqno space without wrap caused issues).
*
* With D-SACK the lower bound is extended to cover sequence space below
* SND.UNA down to undo_marker, which is the last point of interest. Yet
* again, D-SACK block must not to go across snd_una (for the same reason as
* for the normal SACK blocks, explained above). But there all simplicity
* ends, TCP might receive valid D-SACKs below that. As long as they reside
* fully below undo_marker they do not affect behavior in anyway and can
* therefore be safely ignored. In rare cases (which are more or less
* theoretical ones), the D-SACK will nicely cross that boundary due to skb
* fragmentation and packet reordering past skb's retransmission. To consider
* them correctly, the acceptable range must be extended even more though
* the exact amount is rather hard to quantify. However, tp->max_window can
* be used as an exaggerated estimate.
*/
static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
u32 start_seq, u32 end_seq)
{
/* Too far in future, or reversed (interpretation is ambiguous) */
if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
return false;
/* Nasty start_seq wrap-around check (see comments above) */
if (!before(start_seq, tp->snd_nxt))
return false;
/* In outstanding window? ...This is valid exit for D-SACKs too.
* start_seq == snd_una is non-sensical (see comments above)
*/
if (after(start_seq, tp->snd_una))
return true;
if (!is_dsack || !tp->undo_marker)
return false;
/* ...Then it's D-SACK, and must reside below snd_una completely */
if (after(end_seq, tp->snd_una))
return false;
if (!before(start_seq, tp->undo_marker))
return true;
/* Too old */
if (!after(end_seq, tp->undo_marker))
return false;
/* Undo_marker boundary crossing (overestimates a lot). Known already:
* start_seq < undo_marker and end_seq >= undo_marker.
*/
return !before(start_seq, end_seq - tp->max_window);
}
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp, int num_sacks,
u32 prior_snd_una, struct tcp_sacktag_state *state)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
u32 dup_segs;
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
} else if (num_sacks > 1) {
u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
return false;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
} else {
return false;
}
dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
if (!dup_segs) { /* Skip dubious DSACK */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
return false;
}
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
/* D-SACK for already forgotten data... Do dumb counting. */
if (tp->undo_marker && tp->undo_retrans > 0 &&
!after(end_seq_0, prior_snd_una) &&
after(end_seq_0, tp->undo_marker))
tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
return true;
}
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
* the incoming SACK may not exactly match but we can find smaller MSS
* aligned portion of it that matches. Therefore we might need to fragment
* which may fail and creates some hassle (caller must handle error case
* returns).
*
* FIXME: this could be merged to shift decision code
*/
static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
u32 start_seq, u32 end_seq)
{
int err;
bool in_sack;
unsigned int pkt_len;
unsigned int mss;
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
if (tcp_skb_pcount(skb) > 1 && !in_sack &&
after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
mss = tcp_skb_mss(skb);
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
if (!in_sack) {
pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
if (pkt_len < mss)
pkt_len = mss;
} else {
pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
if (pkt_len < mss)
return -EINVAL;
}
/* Round if necessary so that SACKs cover only full MSSes
* and/or the remaining small portion (if present)
*/
if (pkt_len > mss) {
unsigned int new_len = (pkt_len / mss) * mss;
if (!in_sack && new_len < pkt_len)
new_len += mss;
pkt_len = new_len;
}
if (pkt_len >= skb->len && !in_sack)
return 0;
err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
pkt_len, mss, GFP_ATOMIC);
if (err < 0)
return err;
}
return in_sack;
}
/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
int dup_sack, int pcount,
u64 xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Account D-SACK for retransmitted packet. */
if (dup_sack && (sacked & TCPCB_RETRANS)) {
if (tp->undo_marker && tp->undo_retrans > 0 &&
after(end_seq, tp->undo_marker))
tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount);
if ((sacked & TCPCB_SACKED_ACKED) &&
before(start_seq, state->reord))
state->reord = start_seq;
}
/* Nothing to do; acked frame is about to be dropped (was ACKed). */
if (!after(end_seq, tp->snd_una))
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
tcp_rack_advance(tp, sacked, end_seq, xmit_time);
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
* we do not clear RETRANS, believing
* that retransmission is still in flight.
*/
if (sacked & TCPCB_LOST) {
sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
tp->lost_out -= pcount;
tp->retrans_out -= pcount;
}
} else {
if (!(sacked & TCPCB_RETRANS)) {
/* New sack for not retransmitted frame,
* which was in hole. It is reordering.
*/
if (before(start_seq,
tcp_highest_sack_seq(tp)) &&
before(start_seq, state->reord))
state->reord = start_seq;
if (!after(end_seq, tp->high_seq))
state->flag |= FLAG_ORIG_SACK_ACKED;
if (state->first_sackt == 0)
state->first_sackt = xmit_time;
state->last_sackt = xmit_time;
}
if (sacked & TCPCB_LOST) {
sacked &= ~TCPCB_LOST;
tp->lost_out -= pcount;
}
}
sacked |= TCPCB_SACKED_ACKED;
state->flag |= FLAG_DATA_SACKED;
tp->sacked_out += pcount;
/* Out-of-order packets delivered */
state->sack_delivered += pcount;
/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
if (tp->lost_skb_hint &&
before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
tp->lost_cnt_hint += pcount;
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
* frames and clear it. undo_retrans is decreased above, L|R frames
* are accounted above as well.
*/
if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= pcount;
}
return sacked;
}
/* Shift newly-SACKed bytes from this skb to the immediately previous
* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
*/
static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
struct sk_buff *skb,
struct tcp_sacktag_state *state,
unsigned int pcount, int shifted, int mss,
bool dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
BUG_ON(!pcount);
/* Adjust counters and hints for the newly sacked sequence
* range but discard the return value since prev is already
* marked. We must tag the range first because the seq
* advancement below implicitly advances
* tcp_highest_sack_seq() when skb is highest_sack.
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
TCP_SKB_CB(prev)->end_seq += shifted;
TCP_SKB_CB(skb)->seq += shifted;
tcp_skb_pcount_add(prev, pcount);
WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
tcp_skb_pcount_add(skb, -pcount);
/* When we're adding to gso_segs == 1, gso_size will be zero,
* in theory this shouldn't be necessary but as long as DSACK
* code can come after this skb later on it's better to keep
* setting gso_size to something.
*/
if (!TCP_SKB_CB(prev)->tcp_gso_size)
TCP_SKB_CB(prev)->tcp_gso_size = mss;
/* CHECKME: To clear or not to clear? Mimics normal skb currently */
if (tcp_skb_pcount(skb) <= 1)
TCP_SKB_CB(skb)->tcp_gso_size = 0;
/* Difference in this won't matter, both ACKed by the same cumul. ACK */
TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
if (skb->len > 0) {
BUG_ON(!tcp_skb_pcount(skb));
NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
return false;
}
/* Whole SKB was eaten :-) */
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = prev;
if (skb == tp->lost_skb_hint) {
tp->lost_skb_hint = prev;
tp->lost_cnt_hint -= tcp_skb_pcount(prev);
}
TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
TCP_SKB_CB(prev)->end_seq++;
if (skb == tcp_highest_sack(sk))
tcp_advance_highest_sack(sk, skb);
tcp_skb_collapse_tstamp(prev, skb);
if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
tcp_rtx_queue_unlink_and_free(skb, sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
return true;
}
/* I wish gso_size would have a bit more sane initialization than
* something-or-zero which complicates things
*/
static int tcp_skb_seglen(const struct sk_buff *skb)
{
return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
}
/* Shifting pages past head area doesn't work */
static int skb_can_shift(const struct sk_buff *skb)
{
return !skb_headlen(skb) && skb_is_nonlinear(skb);
}
int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
int pcount, int shiftlen)
{
/* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
* Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
* to make sure not storing more than 65535 * 8 bytes per skb,
* even if current MSS is bigger.
*/
if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
return 0;
if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
return 0;
return skb_shift(to, from, shiftlen);
}
/* Try collapsing SACK blocks spanning across multiple skbs to a single
* skb.
*/
static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
struct tcp_sacktag_state *state,
u32 start_seq, u32 end_seq,
bool dup_sack)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *prev;
int mss;
int pcount = 0;
int len;
int in_sack;
/* Normally R but no L won't result in plain S */
if (!dup_sack &&
(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
goto fallback;
if (!skb_can_shift(skb))
goto fallback;
/* This frame is about to be dropped (was ACKed). */
if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
goto fallback;
/* Can only happen with delayed DSACK + discard craziness */
prev = skb_rb_prev(skb);
if (!prev)
goto fallback;
if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
goto fallback;
if (!tcp_skb_can_collapse(prev, skb))
goto fallback;
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
if (in_sack) {
len = skb->len;
pcount = tcp_skb_pcount(skb);
mss = tcp_skb_seglen(skb);
/* TODO: Fix DSACKs to not fragment already SACKed and we can
* drop this restriction as unnecessary
*/
if (mss != tcp_skb_seglen(prev))
goto fallback;
} else {
if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
goto noop;
/* CHECKME: This is non-MSS split case only?, this will
* cause skipped skbs due to advancing loop btw, original
* has that feature too
*/
if (tcp_skb_pcount(skb) <= 1)
goto noop;
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
if (!in_sack) {
/* TODO: head merge to next could be attempted here
* if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
* though it might not be worth of the additional hassle
*
* ...we can probably just fallback to what was done
* previously. We could try merging non-SACKed ones
* as well but it probably isn't going to buy off
* because later SACKs might again split them, and
* it would make skb timestamp tracking considerably
* harder problem.
*/
goto fallback;
}
len = end_seq - TCP_SKB_CB(skb)->seq;
BUG_ON(len < 0);
BUG_ON(len > skb->len);
/* MSS boundaries should be honoured or else pcount will
* severely break even though it makes things bit trickier.
* Optimize common case to avoid most of the divides
*/
mss = tcp_skb_mss(skb);
/* TODO: Fix DSACKs to not fragment already SACKed and we can
* drop this restriction as unnecessary
*/
if (mss != tcp_skb_seglen(prev))
goto fallback;
if (len == mss) {
pcount = 1;
} else if (len < mss) {
goto noop;
} else {
pcount = len / mss;
len = pcount * mss;
}
}
/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
goto fallback;
if (!tcp_skb_shift(prev, skb, pcount, len))
goto fallback;
if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
goto out;
/* Hole filled allows collapsing with the next as well, this is very
* useful when hole on every nth skb pattern happens
*/
skb = skb_rb_next(prev);
if (!skb)
goto out;
if (!skb_can_shift(skb) ||
((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
(mss != tcp_skb_seglen(skb)))
goto out;
if (!tcp_skb_can_collapse(prev, skb))
goto out;
len = skb->len;
pcount = tcp_skb_pcount(skb);
if (tcp_skb_shift(prev, skb, pcount, len))
tcp_shifted_skb(sk, prev, skb, state, pcount,
len, mss, 0);
out:
return prev;
noop:
return skb;
fallback:
NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
return NULL;
}
static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
struct tcp_sack_block *next_dup,
struct tcp_sacktag_state *state,
u32 start_seq, u32 end_seq,
bool dup_sack_in)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *tmp;
skb_rbtree_walk_from(skb) {
int in_sack = 0;
bool dup_sack = dup_sack_in;
/* queue is in-order => we can short-circuit the walk early */
if (!before(TCP_SKB_CB(skb)->seq, end_seq))
break;
if (next_dup &&
before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
in_sack = tcp_match_skb_to_sack(sk, skb,
next_dup->start_seq,
next_dup->end_seq);
if (in_sack > 0)
dup_sack = true;
}
/* skb reference here is a bit tricky to get right, since
* shifting can eat and free both this skb and the next,
* so not even _safe variant of the loop is enough.
*/
if (in_sack <= 0) {
tmp = tcp_shift_skb_data(sk, skb, state,
start_seq, end_seq, dup_sack);
if (tmp) {
if (tmp != skb) {
skb = tmp;
continue;
}
in_sack = 0;
} else {
in_sack = tcp_match_skb_to_sack(sk, skb,
start_seq,
end_seq);
}
}
if (unlikely(in_sack < 0))
break;
if (in_sack) {
TCP_SKB_CB(skb)->sacked =
tcp_sacktag_one(sk,
state,
TCP_SKB_CB(skb)->sacked,
TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
tcp_skb_pcount(skb),
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
list_del_init(&skb->tcp_tsorted_anchor);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
tcp_advance_highest_sack(sk, skb);
}
}
return skb;
}
static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
{
struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
struct sk_buff *skb;
while (*p) {
parent = *p;
skb = rb_to_skb(parent);
if (before(seq, TCP_SKB_CB(skb)->seq)) {
p = &parent->rb_left;
continue;
}
if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
p = &parent->rb_right;
continue;
}
return skb;
}
return NULL;
}
static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
u32 skip_to_seq)
{
if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
return skb;
return tcp_sacktag_bsearch(sk, skip_to_seq);
}
static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
struct sock *sk,
struct tcp_sack_block *next_dup,
struct tcp_sacktag_state *state,
u32 skip_to_seq)
{
if (!next_dup)
return skb;
if (before(next_dup->start_seq, skip_to_seq)) {
skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
skb = tcp_sacktag_walk(skb, sk, NULL, state,
next_dup->start_seq, next_dup->end_seq,
1);
}
return skb;
}
static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
{
return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
}
static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
u32 prior_snd_una, struct tcp_sacktag_state *state)
{
struct tcp_sock *tp = tcp_sk(sk);
const unsigned char *ptr = (skb_transport_header(ack_skb) +
TCP_SKB_CB(ack_skb)->sacked);
struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
struct tcp_sack_block sp[TCP_NUM_SACKS];
struct tcp_sack_block *cache;
struct sk_buff *skb;
int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
int used_sacks;
bool found_dup_sack = false;
int i, j;
int first_sack_index;
state->flag = 0;
state->reord = tp->snd_nxt;
if (!tp->sacked_out)
tcp_highest_sack_reset(sk);
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
num_sacks, prior_snd_una, state);
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
* contain valid SACK info.
*/
if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
return 0;
if (!tp->packets_out)
goto out;
used_sacks = 0;
first_sack_index = 0;
for (i = 0; i < num_sacks; i++) {
bool dup_sack = !i && found_dup_sack;
sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
if (!tcp_is_sackblock_valid(tp, dup_sack,
sp[used_sacks].start_seq,
sp[used_sacks].end_seq)) {
int mib_idx;
if (dup_sack) {
if (!tp->undo_marker)
mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
else
mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
} else {
/* Don't count olds caused by ACK reordering */
if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
!after(sp[used_sacks].end_seq, tp->snd_una))
continue;
mib_idx = LINUX_MIB_TCPSACKDISCARD;
}
NET_INC_STATS(sock_net(sk), mib_idx);
if (i == 0)
first_sack_index = -1;
continue;
}
/* Ignore very old stuff early */
if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
if (i == 0)
first_sack_index = -1;
continue;
}
used_sacks++;
}
/* order SACK blocks to allow in order walk of the retrans queue */
for (i = used_sacks - 1; i > 0; i--) {
for (j = 0; j < i; j++) {
if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
swap(sp[j], sp[j + 1]);
/* Track where the first SACK block goes to */
if (j == first_sack_index)
first_sack_index = j + 1;
}
}
}
state->mss_now = tcp_current_mss(sk);
skb = NULL;
i = 0;
if (!tp->sacked_out) {
/* It's already past, so skip checking against it */
cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
} else {
cache = tp->recv_sack_cache;
/* Skip empty blocks in at head of the cache */
while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
!cache->end_seq)
cache++;
}
while (i < used_sacks) {
u32 start_seq = sp[i].start_seq;
u32 end_seq = sp[i].end_seq;
bool dup_sack = (found_dup_sack && (i == first_sack_index));
struct tcp_sack_block *next_dup = NULL;
if (found_dup_sack && ((i + 1) == first_sack_index))
next_dup = &sp[i + 1];
/* Skip too early cached blocks */
while (tcp_sack_cache_ok(tp, cache) &&
!before(start_seq, cache->end_seq))
cache++;
/* Can skip some work by looking recv_sack_cache? */
if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
after(end_seq, cache->start_seq)) {
/* Head todo? */
if (before(start_seq, cache->start_seq)) {
skb = tcp_sacktag_skip(skb, sk, start_seq);
skb = tcp_sacktag_walk(skb, sk, next_dup,
state,
start_seq,
cache->start_seq,
dup_sack);
}
/* Rest of the block already fully processed? */
if (!after(end_seq, cache->end_seq))
goto advance_sp;
skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
state,
cache->end_seq);
/* ...tail remains todo... */
if (tcp_highest_sack_seq(tp) == cache->end_seq) {
/* ...but better entrypoint exists! */
skb = tcp_highest_sack(sk);
if (!skb)
break;
cache++;
goto walk;
}
skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
/* Check overlap against next cached too (past this one already) */
cache++;
continue;
}
if (!before(start_seq, tcp_highest_sack_seq(tp))) {
skb = tcp_highest_sack(sk);
if (!skb)
break;
}
skb = tcp_sacktag_skip(skb, sk, start_seq);
walk:
skb = tcp_sacktag_walk(skb, sk, next_dup, state,
start_seq, end_seq, dup_sack);
advance_sp:
i++;
}
/* Clear the head of the cache sack blocks so we can skip it next time */
for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
tp->recv_sack_cache[i].start_seq = 0;
tp->recv_sack_cache[i].end_seq = 0;
}
for (j = 0; j < used_sacks; j++)
tp->recv_sack_cache[i++] = sp[j];
if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker)
tcp_check_sack_reordering(sk, state->reord, 0);
tcp_verify_left_out(tp);
out:
#if FASTRETRANS_DEBUG > 0
WARN_ON((int)tp->sacked_out < 0);
WARN_ON((int)tp->lost_out < 0);
WARN_ON((int)tp->retrans_out < 0);
WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
return state->flag;
}
/* Limits sacked_out so that sum with lost_out isn't ever larger than
* packets_out. Returns false if sacked_out adjustement wasn't necessary.
*/
static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
{
u32 holes;
holes = max(tp->lost_out, 1U);
holes = min(holes, tp->packets_out);
if ((tp->sacked_out + holes) > tp->packets_out) {
tp->sacked_out = tp->packets_out - holes;
return true;
}
return false;
}
/* If we receive more dupacks than we expected counting segments
* in assumption of absent reordering, interpret this as reordering.
* The only another reason could be bug in receiver TCP.
*/
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
struct tcp_sock *tp = tcp_sk(sk);
if (!tcp_limit_reno_sacked(tp))
return;
tp->reordering = min_t(u32, tp->packets_out + addend,
sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
tp->reord_seen++;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
}
/* Emulate SACKs for SACKless connection: account for a new dupack. */
static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
{
if (num_dupack) {
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
s32 delivered;
tp->sacked_out += num_dupack;
tcp_check_reno_reordering(sk, 0);
delivered = tp->sacked_out - prior_sacked;
if (delivered > 0)
tcp_count_delivered(tp, delivered, ece_ack);
tcp_verify_left_out(tp);
}
}
/* Account for ACK, ACKing some data in Reno Recovery phase. */
static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
{
struct tcp_sock *tp = tcp_sk(sk);
if (acked > 0) {
/* One ACK acked hole. The rest eat duplicate ACKs. */
tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
ece_ack);
if (acked - 1 >= tp->sacked_out)
tp->sacked_out = 0;
else
tp->sacked_out -= acked - 1;
}
tcp_check_reno_reordering(sk, acked);
tcp_verify_left_out(tp);
}
static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
{
tp->sacked_out = 0;
}
void tcp_clear_retrans(struct tcp_sock *tp)
{
tp->retrans_out = 0;
tp->lost_out = 0;
tp->undo_marker = 0;
tp->undo_retrans = -1;
tp->sacked_out = 0;
}
static inline void tcp_init_undo(struct tcp_sock *tp)
{
tp->undo_marker = tp->snd_una;
/* Retransmission still in flight may cause DSACKs later. */
tp->undo_retrans = tp->retrans_out ? : -1;
}
static bool tcp_is_rack(const struct sock *sk)
{
return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
}
/* If we detect SACK reneging, forget all SACK information
* and reset tags completely, otherwise preserve SACKs. If receiver
* dropped its ofo queue, we will know this due to reneging detection.
*/
static void tcp_timeout_mark_lost(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb, *head;
bool is_reneg; /* is receiver reneging on SACKs? */
head = tcp_rtx_queue_head(sk);
is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
if (is_reneg) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
tp->sacked_out = 0;
/* Mark SACK reneging until we recover from this loss event. */
tp->is_sack_reneg = 1;
} else if (tcp_is_reno(tp)) {
tcp_reset_reno_sack(tp);
}
skb = head;
skb_rbtree_walk_from(skb) {
if (is_reneg)
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
else if (tcp_is_rack(sk) && skb != head &&
tcp_rack_skb_timeout(tp, skb, 0) > 0)
continue; /* Don't mark recently sent ones lost yet */
tcp_mark_skb_lost(sk, skb);
}
tcp_verify_left_out(tp);
tcp_clear_all_retrans_hints(tp);
}
/* Enter Loss state. */
void tcp_enter_loss(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
tcp_timeout_mark_lost(sk);
/* Reduce ssthresh if it has not yet been made inside this window. */
if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
!after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->prior_cwnd = tp->snd_cwnd;
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
tcp_init_undo(tp);
}
tp->snd_cwnd = tcp_packets_in_flight(tp) + 1;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_jiffies32;
/* Timeout in disordered state after receiving substantial DUPACKs
* suggests that the degree of reordering is over-estimated.
*/
if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
tp->reordering = min_t(unsigned int, tp->reordering,
net->ipv4.sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
tcp_ecn_queue_cwr(tp);
/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
* loss recovery is underway except recurring timeout(s) on
* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
*/
tp->frto = net->ipv4.sysctl_tcp_frto &&
(new_recovery || icsk->icsk_retransmits) &&
!inet_csk(sk)->icsk_mtup.probe_size;
}
/* If ACK arrived pointing to a remembered SACK, it means that our
* remembered SACKs do not reflect real state of receiver i.e.
* receiver _host_ is heavily congested (or buggy).
*
* To avoid big spurious retransmission bursts due to transient SACK
* scoreboard oddities that look like reneging, we give the receiver a
* little time (max(RTT/2, 10ms)) to send us some more ACKs that will
* restore sanity to the SACK scoreboard. If the apparent reneging
* persists until this RTO then we'll clear the SACK scoreboard.
*/
static bool tcp_check_sack_reneging(struct sock *sk, int flag)
{
if (flag & FLAG_SACK_RENEGING) {
struct tcp_sock *tp = tcp_sk(sk);
unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
msecs_to_jiffies(10));
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
delay, TCP_RTO_MAX);
return true;
}
return false;
}
/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
* counter when SACK is enabled (without SACK, sacked_out is used for
* that purpose).
*
* With reordering, holes may still be in flight, so RFC3517 recovery
* uses pure sacked_out (total number of SACKed segments) even though
* it violates the RFC that uses duplicate ACKs, often these are equal
* but when e.g. out-of-window ACKs or packet duplication occurs,
* they differ. Since neither occurs due to loss, TCP should really
* ignore them.
*/
static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
return tp->sacked_out + 1;
}
/* Linux NewReno/SACK/ECN state machine.
* --------------------------------------
*
* "Open" Normal state, no dubious events, fast path.
* "Disorder" In all the respects it is "Open",
* but requires a bit more attention. It is entered when
* we see some SACKs or dupacks. It is split of "Open"
* mainly to move some processing from fast path to slow one.
* "CWR" CWND was reduced due to some Congestion Notification event.
* It can be ECN, ICMP source quench, local device congestion.
* "Recovery" CWND was reduced, we are fast-retransmitting.
* "Loss" CWND was reduced due to RTO timeout or SACK reneging.
*
* tcp_fastretrans_alert() is entered:
* - each incoming ACK, if state is not "Open"
* - when arrived ACK is unusual, namely:
* * SACK
* * Duplicate ACK.
* * ECN ECE.
*
* Counting packets in flight is pretty simple.
*
* in_flight = packets_out - left_out + retrans_out
*
* packets_out is SND.NXT-SND.UNA counted in packets.
*
* retrans_out is number of retransmitted segments.
*
* left_out is number of segments left network, but not ACKed yet.
*
* left_out = sacked_out + lost_out
*
* sacked_out: Packets, which arrived to receiver out of order
* and hence not ACKed. With SACKs this number is simply
* amount of SACKed data. Even without SACKs
* it is easy to give pretty reliable estimate of this number,
* counting duplicate ACKs.
*
* lost_out: Packets lost by network. TCP has no explicit
* "loss notification" feedback from network (for now).
* It means that this number can be only _guessed_.
* Actually, it is the heuristics to predict lossage that
* distinguishes different algorithms.
*
* F.e. after RTO, when all the queue is considered as lost,
* lost_out = packets_out and in_flight = retrans_out.
*
* Essentially, we have now a few algorithms detecting
* lost packets.
*
* If the receiver supports SACK:
*
* RFC6675/3517: It is the conventional algorithm. A packet is
* considered lost if the number of higher sequence packets
* SACKed is greater than or equal the DUPACK thoreshold
* (reordering). This is implemented in tcp_mark_head_lost and
* tcp_update_scoreboard.
*
* RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
* (2017-) that checks timing instead of counting DUPACKs.
* Essentially a packet is considered lost if it's not S/ACKed
* after RTT + reordering_window, where both metrics are
* dynamically measured and adjusted. This is implemented in
* tcp_rack_mark_lost.
*
* If the receiver does not support SACK:
*
* NewReno (RFC6582): in Recovery we assume that one segment
* is lost (classic Reno). While we are in Recovery and
* a partial ACK arrives, we assume that one more packet
* is lost (NewReno). This heuristics are the same in NewReno
* and SACK.
*
* Really tricky (and requiring careful tuning) part of algorithm
* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
* hence, slow down forward transmission. In fact, it determines the moment
* when we decide that hole is caused by loss, rather than by a reorder.
*
* tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
* holes, caused by lost packets.
*
* And the most logically complicated part of algorithm is undo
* heuristics. We detect false retransmits due to both too early
* fast retransmit (reordering) and underestimated RTO, analyzing
* timestamps and D-SACKs. When we detect that some segments were
* retransmitted by mistake and CWND reduction was wrong, we undo
* window reduction and abort recovery phase. This logic is hidden
* inside several functions named tcp_try_undo_<something>.
*/
/* This function decides, when we should leave Disordered state
* and enter Recovery phase, reducing congestion window.
*
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
static bool tcp_time_to_recover(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Trick#1: The loss is proven. */
if (tp->lost_out)
return true;
/* Not-A-Trick#2 : Classic rule... */
if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
return true;
return false;
}
/* Detect loss in event "A" above by marking head of queue up as lost.
* For RFC3517 SACK, a segment is considered lost if it
* has at least tp->reordering SACKed seqments above it; "packets" refers to
* the maximum SACKed segments to pass before reaching this limit.
*/
static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int cnt;
/* Use SACK to deduce losses of new sequences sent during recovery */
const u32 loss_high = tp->snd_nxt;
WARN_ON(packets > tp->packets_out);
skb = tp->lost_skb_hint;
if (skb) {
/* Head already handled? */
if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
return;
cnt = tp->lost_cnt_hint;
} else {
skb = tcp_rtx_queue_head(sk);
cnt = 0;
}
skb_rbtree_walk_from(skb) {
/* TODO: do this better */
/* this is not the most efficient way to do this... */
tp->lost_skb_hint = skb;
tp->lost_cnt_hint = cnt;
if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
break;
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
cnt += tcp_skb_pcount(skb);
if (cnt > packets)
break;
if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
tcp_mark_skb_lost(sk, skb);
if (mark_head)
break;
}
tcp_verify_left_out(tp);
}
/* Account newly detected lost packet(s) */
static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_is_sack(tp)) {
int sacked_upto = tp->sacked_out - tp->reordering;
if (sacked_upto >= 0)
tcp_mark_head_lost(sk, sacked_upto, 0);
else if (fast_rexmit)
tcp_mark_head_lost(sk, 1, 1);
}
}
static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
{
return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
before(tp->rx_opt.rcv_tsecr, when);
}
/* skb is spurious retransmitted if the returned timestamp echo
* reply is prior to the skb transmission time
*/
static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
const struct sk_buff *skb)
{
return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
}
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
return tp->retrans_stamp &&
tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
}
/* Undo procedures. */
/* We can clear retrans_stamp when there are no retransmissions in the
* window. It would seem that it is trivially available for us in
* tp->retrans_out, however, that kind of assumptions doesn't consider
* what will happen if errors occur when sending retransmission for the
* second time. ...It could the that such segment has only
* TCPCB_EVER_RETRANS set at the present time. It seems that checking
* the head skb is enough except for some reneging corner cases that
* are not worth the effort.
*
* Main reason for all this complexity is the fact that connection dying
* time now depends on the validity of the retrans_stamp, in particular,
* that successive retransmissions of a segment must not advance
* retrans_stamp under any conditions.
*/
static bool tcp_any_retrans_done(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
if (tp->retrans_out)
return true;
skb = tcp_rtx_queue_head(sk);
if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
return true;
return false;
}
static void DBGUNDO(struct sock *sk, const char *msg)
{
#if FASTRETRANS_DEBUG > 1
struct tcp_sock *tp = tcp_sk(sk);
struct inet_sock *inet = inet_sk(sk);
if (sk->sk_family == AF_INET) {
pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
msg,
&inet->inet_daddr, ntohs(inet->inet_dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
msg,
&sk->sk_v6_daddr, ntohs(inet->inet_dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
#endif
#endif
}
static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
{
struct tcp_sock *tp = tcp_sk(sk);
if (unmark_loss) {
struct sk_buff *skb;
skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
}
tp->lost_out = 0;
tcp_clear_all_retrans_hints(tp);
}
if (tp->prior_ssthresh) {
const struct inet_connection_sock *icsk = inet_csk(sk);
tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
tcp_ecn_withdraw_cwr(tp);
}
}
tp->snd_cwnd_stamp = tcp_jiffies32;
tp->undo_marker = 0;
tp->rack.advanced = 1; /* Force RACK to re-exam losses */
}
static inline bool tcp_may_undo(const struct tcp_sock *tp)
{
return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
}
/* People celebrate: "We love our President!" */
static bool tcp_try_undo_recovery(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_may_undo(tp)) {
int mib_idx;
/* Happy end! We did not retransmit anything
* or our original transmission succeeded.
*/
DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
tcp_undo_cwnd_reduction(sk, false);
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
mib_idx = LINUX_MIB_TCPLOSSUNDO;
else
mib_idx = LINUX_MIB_TCPFULLUNDO;
NET_INC_STATS(sock_net(sk), mib_idx);
} else if (tp->rack.reo_wnd_persist) {
tp->rack.reo_wnd_persist--;
}
if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
return true;
}
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
return false;
}
/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
static bool tcp_try_undo_dsack(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tp->undo_marker && !tp->undo_retrans) {
tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
tp->rack.reo_wnd_persist + 1);
DBGUNDO(sk, "D-SACK");
tcp_undo_cwnd_reduction(sk, false);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
return true;
}
return false;
}
/* Undo during loss recovery after partial ACK or using F-RTO. */
static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
{
struct tcp_sock *tp = tcp_sk(sk);
if (frto_undo || tcp_may_undo(tp)) {
tcp_undo_cwnd_reduction(sk, true);
DBGUNDO(sk, "partial loss");
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
if (frto_undo)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPSPURIOUSRTOS);
inet_csk(sk)->icsk_retransmits = 0;
if (frto_undo || tcp_is_sack(tp)) {
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
}
return true;
}
return false;
}
/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
* It computes the number of packets to send (sndcnt) based on packets newly
* delivered:
* 1) If the packets in flight is larger than ssthresh, PRR spreads the
* cwnd reductions across a full RTT.
* 2) Otherwise PRR uses packet conservation to send as much as delivered.
* But when SND_UNA is acked without further losses,
* slow starts cwnd up to ssthresh to speed up the recovery.
*/
static void tcp_init_cwnd_reduction(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
tp->high_seq = tp->snd_nxt;
tp->tlp_high_seq = 0;
tp->snd_cwnd_cnt = 0;
tp->prior_cwnd = tp->snd_cwnd;
tp->prr_delivered = 0;
tp->prr_out = 0;
tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
tcp_ecn_queue_cwr(tp);
}
void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
return;
tp->prr_delivered += newly_acked_sacked;
if (delta < 0) {
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
} else if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost) {
sndcnt = min_t(int, delta,
max_t(int, tp->prr_delivered - tp->prr_out,
newly_acked_sacked) + 1);
} else {
sndcnt = min(delta, newly_acked_sacked);
}
/* Force a fast retransmit upon entering fast recovery */
sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
static inline void tcp_end_cwnd_reduction(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (inet_csk(sk)->icsk_ca_ops->cong_control)
return;
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
(inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) {
tp->snd_cwnd = tp->snd_ssthresh;
tp->snd_cwnd_stamp = tcp_jiffies32;
}
tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}
/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
void tcp_enter_cwr(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
tp->prior_ssthresh = 0;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
tp->undo_marker = 0;
tcp_init_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_CWR);
}
}
EXPORT_SYMBOL(tcp_enter_cwr);
static void tcp_try_keep_open(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
int state = TCP_CA_Open;
if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
state = TCP_CA_Disorder;
if (inet_csk(sk)->icsk_ca_state != state) {
tcp_set_ca_state(sk, state);
tp->high_seq = tp->snd_nxt;
}
}
static void tcp_try_to_open(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
tcp_verify_left_out(tp);
if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
if (flag & FLAG_ECE)
tcp_enter_cwr(sk);
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
}
}
static void tcp_mtup_probe_failed(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
icsk->icsk_mtup.probe_size = 0;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
}
static void tcp_mtup_probe_success(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
/* FIXME: breaks with very large cwnd */
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tp->snd_cwnd = tp->snd_cwnd *
tcp_mss_to_mtu(sk, tp->mss_cache) /
icsk->icsk_mtup.probe_size;
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_jiffies32;
tp->snd_ssthresh = tcp_current_ssthresh(sk);
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
/* Do a simple retransmit without using the backoff mechanisms in
* tcp_timer. This is used for path mtu discovery.
* The socket is already locked here.
*/
void tcp_simple_retransmit(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int mss;
/* A fastopen SYN request is stored as two separate packets within
* the retransmit queue, this is done by tcp_send_syn_data().
* As a result simply checking the MSS of the frames in the queue
* will not work for the SYN packet.
*
* Us being here is an indication of a path MTU issue so we can
* assume that the fastopen SYN was lost and just mark all the
* frames in the retransmit queue as lost. We will use an MSS of
* -1 to mark all frames as lost, otherwise compute the current MSS.
*/
if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
mss = -1;
else
mss = tcp_current_mss(sk);
skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
if (tcp_skb_seglen(skb) > mss)
tcp_mark_skb_lost(sk, skb);
}
tcp_clear_retrans_hints_partial(tp);
if (!tp->lost_out)
return;
if (tcp_is_reno(tp))
tcp_limit_reno_sacked(tp);
tcp_verify_left_out(tp);
/* Don't muck with the congestion window here.
* Reason is that we do not increase amount of _data_
* in network, but units changed and effective
* cwnd/ssthresh really reduced now.
*/
if (icsk->icsk_ca_state != TCP_CA_Loss) {
tp->high_seq = tp->snd_nxt;
tp->snd_ssthresh = tcp_current_ssthresh(sk);
tp->prior_ssthresh = 0;
tp->undo_marker = 0;
tcp_set_ca_state(sk, TCP_CA_Loss);
}
tcp_xmit_retransmit_queue(sk);
}
EXPORT_SYMBOL(tcp_simple_retransmit);
void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
struct tcp_sock *tp = tcp_sk(sk);
int mib_idx;
if (tcp_is_reno(tp))
mib_idx = LINUX_MIB_TCPRENORECOVERY;
else
mib_idx = LINUX_MIB_TCPSACKRECOVERY;
NET_INC_STATS(sock_net(sk), mib_idx);
tp->prior_ssthresh = 0;
tcp_init_undo(tp);
if (!tcp_in_cwnd_reduction(sk)) {
if (!ece_ack)
tp->prior_ssthresh = tcp_current_ssthresh(sk);
tcp_init_cwnd_reduction(sk);
}
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
* recovered or spurious. Otherwise retransmits more on partial ACKs.
*/
static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
int *rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
tcp_try_undo_loss(sk, false))
return;
if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
/* Step 3.b. A timeout is spurious if not all data are
* lost, i.e., never-retransmitted data are (s)acked.
*/
if ((flag & FLAG_ORIG_SACK_ACKED) &&
tcp_try_undo_loss(sk, true))
return;
if (after(tp->snd_nxt, tp->high_seq)) {
if (flag & FLAG_DATA_SACKED || num_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
/* Step 2.b. Try send new data (but deferred until cwnd
* is updated in tcp_ack()). Otherwise fall back to
* the conventional recovery.
*/
if (!tcp_write_queue_empty(sk) &&
after(tcp_wnd_end(tp), tp->snd_nxt)) {
*rexmit = REXMIT_NEW;
return;
}
tp->frto = 0;
}
}
if (recovered) {
/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
tcp_try_undo_recovery(sk);
return;
}
if (tcp_is_reno(tp)) {
/* A Reno DUPACK means new data in F-RTO step 2.b above are
* delivered. Lower inflight to clock out (re)tranmissions.
*/
if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
*rexmit = REXMIT_LOST;
}
static bool tcp_force_fast_retransmit(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
return after(tcp_highest_sack_seq(tp),
tp->snd_una + tp->reordering * tp->mss_cache);
}
/* Undo during fast recovery after partial ACK. */
static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
bool *do_lost)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tp->undo_marker && tcp_packet_delayed(tp)) {
/* Plain luck! Hole if filled with delayed
* packet, rather than with a retransmit. Check reordering.
*/
tcp_check_sack_reordering(sk, prior_snd_una, 1);
/* We are getting evidence that the reordering degree is higher
* than we realized. If there are no retransmits out then we
* can undo. Otherwise we clock out new packets but do not
* mark more packets lost or retransmit more.
*/
if (tp->retrans_out)
return true;
if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
DBGUNDO(sk, "partial recovery");
tcp_undo_cwnd_reduction(sk, true);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
tcp_try_keep_open(sk);
} else {
/* Partial ACK arrived. Force fast retransmit. */
*do_lost = tcp_force_fast_retransmit(sk);
}
return false;
}
static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_rtx_queue_empty(sk))
return;
if (unlikely(tcp_is_reno(tp))) {
tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
} else if (tcp_is_rack(sk)) {
u32 prior_retrans = tp->retrans_out;
if (tcp_rack_mark_lost(sk))
*ack_flag &= ~FLAG_SET_XMIT_TIMER;
if (prior_retrans > tp->retrans_out)
*ack_flag |= FLAG_LOST_RETRANS;
}
}
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
* packets lost by network.
*
* Besides that it updates the congestion state when packet loss or ECN
* is detected. But it does not reduce the cwnd, it is done by the
* congestion control later.
*
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
int num_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int fast_rexmit = 0, flag = *ack_flag;
bool ece_ack = flag & FLAG_ECE;
bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
tcp_force_fast_retransmit(sk));
if (!tp->packets_out && tp->sacked_out)
tp->sacked_out = 0;
/* Now state machine starts.
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
if (ece_ack)
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
if (tcp_check_sack_reneging(sk, flag))
return;
/* C. Check consistency of the current state. */
tcp_verify_left_out(tp);
/* D. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
WARN_ON(tp->retrans_out != 0 && !tp->syn_data);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
switch (icsk->icsk_ca_state) {
case TCP_CA_CWR:
/* CWR is to be held something *above* high_seq
* is ACKed for CWR bit to reach receiver. */
if (tp->snd_una != tp->high_seq) {
tcp_end_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
}
break;
case TCP_CA_Recovery:
if (tcp_is_reno(tp))
tcp_reset_reno_sack(tp);
if (tcp_try_undo_recovery(sk))
return;
tcp_end_cwnd_reduction(sk);
break;
}
}
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
if (tcp_is_reno(tp))
tcp_add_reno_sack(sk, num_dupack, ece_ack);
} else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
return;
if (tcp_try_undo_dsack(sk))
tcp_try_keep_open(sk);
tcp_identify_packet_loss(sk, ack_flag);
if (icsk->icsk_ca_state != TCP_CA_Recovery) {
if (!tcp_time_to_recover(sk, flag))
return;
/* Undo reverts the recovery state. If loss is evident,
* starts a new recovery (e.g. reordering then loss);
*/
tcp_enter_recovery(sk, ece_ack);
}
break;
case TCP_CA_Loss:
tcp_process_loss(sk, flag, num_dupack, rexmit);
tcp_identify_packet_loss(sk, ack_flag);
if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
return;
/* Change state if cwnd is undone or retransmits are lost */
fallthrough;
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
tcp_add_reno_sack(sk, num_dupack, ece_ack);
}
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
tcp_identify_packet_loss(sk, ack_flag);
if (!tcp_time_to_recover(sk, flag)) {
tcp_try_to_open(sk, flag);
return;
}
/* MTU probe failure: don't reduce cwnd */
if (icsk->icsk_ca_state < TCP_CA_CWR &&
icsk->icsk_mtup.probe_size &&
tp->snd_una == tp->mtu_probe.probe_seq_start) {
tcp_mtup_probe_failed(sk);
/* Restores the reduction we did in tcp_mtup_probe() */
tp->snd_cwnd++;
tcp_simple_retransmit(sk);
return;
}
/* Otherwise enter Recovery state */
tcp_enter_recovery(sk, ece_ack);
fast_rexmit = 1;
}
if (!tcp_is_rack(sk) && do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
*rexmit = REXMIT_LOST;
}
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
{
u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
struct tcp_sock *tp = tcp_sk(sk);
if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
/* If the remote keeps returning delayed ACKs, eventually
* the min filter would pick it up and overestimate the
* prop. delay when it expires. Skip suspected delayed ACKs.
*/
return;
}
minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
rtt_us ? : jiffies_to_usecs(1));
}
static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
long seq_rtt_us, long sack_rtt_us,
long ca_rtt_us, struct rate_sample *rs)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
* broken middle-boxes or peers may corrupt TS-ECR fields. But
* Karn's algorithm forbids taking RTT if some retransmitted data
* is acked (RFC6298).
*/
if (seq_rtt_us < 0)
seq_rtt_us = sack_rtt_us;
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
* acknowledges some new data, i.e., only if it advances the
* left edge of the send window.
* See draft-ietf-tcplw-high-performance-00, section 3.3.
*/
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
flag & FLAG_ACKED) {
u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
if (!delta)
delta = 1;
seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
ca_rtt_us = seq_rtt_us;
}
}
rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
if (seq_rtt_us < 0)
return false;
/* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
* always taken together with ACK, SACK, or TS-opts. Any negative
* values will be skipped with the seq_rtt_us < 0 check above.
*/
tcp_update_rtt_min(sk, ca_rtt_us, flag);
tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk);
/* RFC6298: only reset backoff on valid RTT measurement. */
inet_csk(sk)->icsk_backoff = 0;
return true;
}
/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
{
struct rate_sample rs;
long rtt_us = -1L;
if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);
tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs);
}
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
}
/* Restart timer after forward progress on connection.
* RFC2988 recommends to restart timer to now+rto.
*/
void tcp_rearm_rto(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* If the retrans timer is currently being used by Fast Open
* for SYN-ACK retrans purpose, stay put.
*/
if (rcu_access_pointer(tp->fastopen_rsk))
return;
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
u32 rto = inet_csk(sk)->icsk_rto;
/* Offset the time elapsed after installing regular RTO */
if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
s64 delta_us = tcp_rto_delta_us(sk);
/* delta_us may not be positive if the socket is locked
* when the retrans timer fires and is rescheduled.
*/
rto = usecs_to_jiffies(max_t(int, delta_us, 1));
}
tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
TCP_RTO_MAX);
}
}
/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
static void tcp_set_xmit_timer(struct sock *sk)
{
if (!tcp_schedule_loss_probe(sk, true))
tcp_rearm_rto(sk);
}
/* If we get here, the whole TSO packet has not been acked. */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 packets_acked;
BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
packets_acked = tcp_skb_pcount(skb);
if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
return 0;
packets_acked -= tcp_skb_pcount(skb);
if (packets_acked) {
BUG_ON(tcp_skb_pcount(skb) == 0);
BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
}
return packets_acked;
}
static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
const struct sk_buff *ack_skb, u32 prior_snd_una)
{
const struct skb_shared_info *shinfo;
/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
return;
shinfo = skb_shinfo(skb);
if (!before(shinfo->tskey, prior_snd_una) &&
before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
tcp_skb_tsorted_save(skb) {
__skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK);
} tcp_skb_tsorted_restore(skb);
}
}
/* Remove acknowledged frames from the retransmission queue. If our packet
* is before the ack sequence we can discard it as it's confirmed to have
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
u32 prior_fack, u32 prior_snd_una,
struct tcp_sacktag_state *sack, bool ece_ack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
u64 first_ackt, last_ackt;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
struct sk_buff *skb, *next;
bool fully_acked = true;
long sack_rtt_us = -1L;
long seq_rtt_us = -1L;
long ca_rtt_us = -1L;
u32 pkts_acked = 0;
u32 last_in_flight = 0;
bool rtt_update;
int flag = 0;
first_ackt = 0;
for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
const u32 start_seq = scb->seq;
u8 sacked = scb->sacked;
u32 acked_pcount;
/* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
if (tcp_skb_pcount(skb) == 1 ||
!after(tp->snd_una, scb->seq))
break;
acked_pcount = tcp_tso_acked(sk, skb);
if (!acked_pcount)
break;
fully_acked = false;
} else {
acked_pcount = tcp_skb_pcount(skb);
}
if (unlikely(sacked & TCPCB_RETRANS)) {
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
} else if (!(sacked & TCPCB_SACKED_ACKED)) {
last_ackt = tcp_skb_timestamp_us(skb);
WARN_ON_ONCE(last_ackt == 0);
if (!first_ackt)
first_ackt = last_ackt;
last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
if (before(start_seq, reord))
reord = start_seq;
if (!after(scb->end_seq, tp->high_seq))
flag |= FLAG_ORIG_SACK_ACKED;
}
if (sacked & TCPCB_SACKED_ACKED) {
tp->sacked_out -= acked_pcount;
} else if (tcp_is_sack(tp)) {
tcp_count_delivered(tp, acked_pcount, ece_ack);
if (!tcp_skb_spurious_retrans(tp, skb))
tcp_rack_advance(tp, sacked, scb->end_seq,
tcp_skb_timestamp_us(skb));
}
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
tp->packets_out -= acked_pcount;
pkts_acked += acked_pcount;
tcp_rate_skb_delivered(sk, skb, sack->rate);
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
* true data, and if we misinform our callers that
* this ACK acks real data, we will erroneously exit
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
flag |= FLAG_DATA_ACKED;
} else {
flag |= FLAG_SYN_ACKED;
tp->retrans_stamp = 0;
}
if (!fully_acked)
break;
tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
next = skb_rb_next(skb);
if (unlikely(skb == tp->retransmit_skb_hint))
tp->retransmit_skb_hint = NULL;
if (unlikely(skb == tp->lost_skb_hint))
tp->lost_skb_hint = NULL;
tcp_highest_sack_replace(sk, skb, next);
tcp_rtx_queue_unlink_and_free(skb, sk);
}
if (!skb)
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
tp->snd_up = tp->snd_una;
if (skb) {
tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
flag |= FLAG_SACK_RENEGING;
}
if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
if (pkts_acked == 1 && last_in_flight < tp->mss_cache &&
last_in_flight && !prior_sacked && fully_acked &&
sack->rate->prior_delivered + 1 == tp->delivered &&
!(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
/* Conservatively mark a delayed ACK. It's typically
* from a lone runt packet over the round trip to
* a receiver w/o out-of-order or CE events.
*/
flag |= FLAG_ACK_MAYBE_DELAYED;
}
}
if (sack->first_sackt) {
sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
}
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
ca_rtt_us, sack->rate);
if (flag & FLAG_ACKED) {
flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
if (unlikely(icsk->icsk_mtup.probe_size &&
!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
tcp_mtup_probe_success(sk);
}
if (tcp_is_reno(tp)) {
tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
/* If any of the cumulatively ACKed segments was
* retransmitted, non-SACK case cannot confirm that
* progress was due to original transmission due to
* lack of TCPCB_SACKED_ACKED bits even if some of
* the packets may have been never retransmitted.
*/
if (flag & FLAG_RETRANS_DATA_ACKED)
flag &= ~FLAG_ORIG_SACK_ACKED;
} else {
int delta;
/* Non-retransmitted hole got filled? That's reordering */
if (before(reord, prior_fack))
tcp_check_sack_reordering(sk, reord, 0);
delta = prior_sacked - tp->sacked_out;
tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
tcp_skb_timestamp_us(skb))) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
*/
flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */
}
if (icsk->icsk_ca_ops->pkts_acked) {
struct ack_sample sample = { .pkts_acked = pkts_acked,
.rtt_us = sack->rate->rtt_us,
.in_flight = last_in_flight };
icsk->icsk_ca_ops->pkts_acked(sk, &sample);
}
#if FASTRETRANS_DEBUG > 0
WARN_ON((int)tp->sacked_out < 0);
WARN_ON((int)tp->lost_out < 0);
WARN_ON((int)tp->retrans_out < 0);
if (!tp->packets_out && tcp_is_sack(tp)) {
icsk = inet_csk(sk);
if (tp->lost_out) {
pr_debug("Leak l=%u %d\n",
tp->lost_out, icsk->icsk_ca_state);
tp->lost_out = 0;
}
if (tp->sacked_out) {
pr_debug("Leak s=%u %d\n",
tp->sacked_out, icsk->icsk_ca_state);
tp->sacked_out = 0;
}
if (tp->retrans_out) {
pr_debug("Leak r=%u %d\n",
tp->retrans_out, icsk->icsk_ca_state);
tp->retrans_out = 0;
}
}
#endif
return flag;
}
static void tcp_ack_probe(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *head = tcp_send_head(sk);
const struct tcp_sock *tp = tcp_sk(sk);
/* Was it a usable window open? */
if (!head)
return;
if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
icsk->icsk_backoff = 0;
icsk->icsk_probes_tstamp = 0;
inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
* This function is not for random using!
*/
} else {
unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
when = tcp_clamp_probe0_to_user_timeout(sk, when);
tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
}
}
static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
}
/* Decide wheather to run the increase function of congestion control. */
static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
/* If reordering is high then always grow cwnd whenever data is
* delivered regardless of its ordering. Otherwise stay conservative
* and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
* new SACK or ECE mark may first advance cwnd here and later reduce
* cwnd in tcp_fastretrans_alert() based on more states.
*/
if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
return flag & FLAG_FORWARD_PROGRESS;
return flag & FLAG_DATA_ACKED;
}
/* The "ultimate" congestion control function that aims to replace the rigid
* cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
* It's called toward the end of processing an ACK with precise rate
* information. All transmission or retransmission are delayed afterwards.
*/
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
int flag, const struct rate_sample *rs)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->cong_control) {
icsk->icsk_ca_ops->cong_control(sk, rs);
return;
}
if (tcp_in_cwnd_reduction(sk)) {
/* Reduce cwnd if state mandates */
tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag);
} else if (tcp_may_raise_cwnd(sk, flag)) {
/* Advance cwnd if state allows */
tcp_cong_avoid(sk, ack, acked_sacked);
}
tcp_update_pacing_rate(sk);
}
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
static inline bool tcp_may_update_window(const struct tcp_sock *tp,
const u32 ack, const u32 ack_seq,
const u32 nwin)
{
return after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
}
/* If we update tp->snd_una, also update tp->bytes_acked */
static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
{
u32 delta = ack - tp->snd_una;
sock_owned_by_me((struct sock *)tp);
tp->bytes_acked += delta;
tp->snd_una = ack;
}
/* If we update tp->rcv_nxt, also update tp->bytes_received */
static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
{
u32 delta = seq - tp->rcv_nxt;
sock_owned_by_me((struct sock *)tp);
tp->bytes_received += delta;
WRITE_ONCE(tp->rcv_nxt, seq);
}
/* Update our send window.
*
* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
* and in FreeBSD. NetBSD's one is even worse.) is wrong.
*/
static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
u32 ack_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
int flag = 0;
u32 nwin = ntohs(tcp_hdr(skb)->window);
if (likely(!tcp_hdr(skb)->syn))
nwin <<= tp->rx_opt.snd_wscale;
if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
flag |= FLAG_WIN_UPDATE;
tcp_update_wl(tp, ack_seq);
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
/* Note, it is the only place, where
* fast path is recovered for sending TCP.
*/
tp->pred_flags = 0;
tcp_fast_path_check(sk);
if (!tcp_write_queue_empty(sk))
tcp_slow_start_after_idle_check(sk);
if (nwin > tp->max_window) {
tp->max_window = nwin;
tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
}
}
}
tcp_snd_una_update(tp, ack);
return flag;
}
static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
u32 *last_oow_ack_time)
{
if (*last_oow_ack_time) {
s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
NET_INC_STATS(net, mib_idx);
return true; /* rate-limited: don't send yet! */
}
}
*last_oow_ack_time = tcp_jiffies32;
return false; /* not rate-limited: go ahead, send dupack now! */
}
/* Return true if we're currently rate-limiting out-of-window ACKs and
* thus shouldn't send a dupack right now. We rate-limit dupacks in
* response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
* attacks that send repeated SYNs or ACKs for the same connection. To
* do this, we do not send a duplicate SYNACK or ACK if the remote
* endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
*/
bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
int mib_idx, u32 *last_oow_ack_time)
{
/* Data packets without SYNs are not likely part of an ACK loop. */
if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
!tcp_hdr(skb)->syn)
return false;
return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
}
/* RFC 5961 7 [ACK Throttling] */
static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
{
/* unprotected vars, we dont care of overwrites */
static u32 challenge_timestamp;
static unsigned int challenge_count;
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
u32 count, now;
/* First check our per-socket dupack rate limit. */
if (__tcp_oow_rate_limited(net,
LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
&tp->last_oow_ack_time))
return;
/* Then check host-wide RFC 5961 rate limit. */
now = jiffies / HZ;
if (now != challenge_timestamp) {
u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
u32 half = (ack_limit + 1) >> 1;
challenge_timestamp = now;
WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
}
count = READ_ONCE(challenge_count);
if (count > 0) {
WRITE_ONCE(challenge_count, count - 1);
NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
tcp_send_ack(sk);
}
}
static void tcp_store_ts_recent(struct tcp_sock *tp)
{
tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
}
static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
{
if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
/* PAWS bug workaround wrt. ACK frames, the PAWS discard
* extra check below makes sure this can only happen
* for pure ACK frames. -DaveM
*
* Not only, also it occurs for expired timestamps.
*/
if (tcp_paws_check(&tp->rx_opt, 0))
tcp_store_ts_recent(tp);
}
}
/* This routine deals with acks during a TLP episode and ends an episode by
* resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
*/
static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
if (before(ack, tp->tlp_high_seq))
return;
if (!tp->tlp_retrans) {
/* TLP of new data has been acknowledged */
tp->tlp_high_seq = 0;
} else if (flag & FLAG_DSACK_TLP) {
/* This DSACK means original and TLP probe arrived; no loss */
tp->tlp_high_seq = 0;
} else if (after(ack, tp->tlp_high_seq)) {
/* ACK advances: there was a loss, so reduce cwnd. Reset
* tlp_high_seq in tcp_init_cwnd_reduction()
*/
tcp_init_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_CWR);
tcp_end_cwnd_reduction(sk);
tcp_try_keep_open(sk);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPLOSSPROBERECOVERY);
} else if (!(flag & (FLAG_SND_UNA_ADVANCED |
FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
/* Pure dupack: original and TLP probe arrived; no loss */
tp->tlp_high_seq = 0;
}
}
static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->in_ack_event)
icsk->icsk_ca_ops->in_ack_event(sk, flags);
}
/* Congestion control has updated the cwnd already. So if we're in
* loss recovery then now we do any new sends (for FRTO) or
* retransmits (for CA_Loss or CA_recovery) that make sense.
*/
static void tcp_xmit_recovery(struct sock *sk, int rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
return;
if (unlikely(rexmit == REXMIT_NEW)) {
__tcp_push_pending_frames(sk, tcp_current_mss(sk),
TCP_NAGLE_OFF);
if (after(tp->snd_nxt, tp->high_seq))
return;
tp->frto = 0;
}
tcp_xmit_retransmit_queue(sk);
}
/* Returns the number of packets newly acked or sacked by the current ACK */
static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
{
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 delivered;
delivered = tp->delivered - prior_delivered;
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
if (flag & FLAG_ECE)
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
return delivered;
}
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sacktag_state sack_state;
struct rate_sample rs = { .prior_delivered = 0 };
u32 prior_snd_una = tp->snd_una;
bool is_sack_reneg = tp->is_sack_reneg;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
int num_dupack = 0;
int prior_packets = tp->packets_out;
u32 delivered = tp->delivered;
u32 lost = tp->lost;
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
u32 prior_fack;
sack_state.first_sackt = 0;
sack_state.rate = &rs;
sack_state.sack_delivered = 0;
/* We very likely will need to access rtx queue. */
prefetch(sk->tcp_rtx_queue.rb_node);
/* If the ack is older than previous acks
* then we can probably ignore it.
*/
if (before(ack, prior_snd_una)) {
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
if (before(ack, prior_snd_una - tp->max_window)) {
if (!(flag & FLAG_NO_CHALLENGE_ACK))
tcp_send_challenge_ack(sk, skb);
return -1;
}
goto old_ack;
}
/* If the ack includes data we haven't sent yet, discard
* this segment (RFC793 Section 3.9).
*/
if (after(ack, tp->snd_nxt))
return -1;
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
icsk->icsk_retransmits = 0;
#if IS_ENABLED(CONFIG_TLS_DEVICE)
if (static_branch_unlikely(&clean_acked_data_enabled.key))
if (icsk->icsk_clean_acked)
icsk->icsk_clean_acked(sk, ack);
#endif
}
prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
rs.prior_in_flight = tcp_packets_in_flight(tp);
/* ts_recent update must be made after we are sure that the packet
* is in window.
*/
if (flag & FLAG_UPDATE_TS_RECENT)
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
FLAG_SND_UNA_ADVANCED) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
*/
tcp_update_wl(tp, ack_seq);
tcp_snd_una_update(tp, ack);
flag |= FLAG_WIN_UPDATE;
tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
u32 ack_ev_flags = CA_ACK_SLOWPATH;
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
flag |= FLAG_ECE;
ack_ev_flags |= CA_ACK_ECE;
}
if (sack_state.sack_delivered)
tcp_count_delivered(tp, sack_state.sack_delivered,
flag & FLAG_ECE);
if (flag & FLAG_WIN_UPDATE)
ack_ev_flags |= CA_ACK_WIN_UPDATE;
tcp_in_ack_event(sk, ack_ev_flags);
}
/* This is a deviation from RFC3168 since it states that:
* "When the TCP data sender is ready to set the CWR bit after reducing
* the congestion window, it SHOULD set the CWR bit only on the first
* new data packet that it transmits."
* We accept CWR on pure ACKs to be more robust
* with widely-deployed TCP implementations that do this.
*/
tcp_ecn_accept_cwr(sk, skb);
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
sk->sk_err_soft = 0;
icsk->icsk_probes_out = 0;
tp->rcv_tstamp = tcp_jiffies32;
if (!prior_packets)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una,
&sack_state, flag & FLAG_ECE);
tcp_rack_update_reo_wnd(sk, &rs);
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
if (tcp_ack_is_dubious(sk, flag)) {
if (!(flag & (FLAG_SND_UNA_ADVANCED |
FLAG_NOT_DUP | FLAG_DSACKING_ACK))) {
num_dupack = 1;
/* Consider if pure acks were aggregated in tcp_add_backlog() */
if (!(flag & FLAG_DATA))
num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
}
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
}
/* If needed, reset TLP/RTO timer when RACK doesn't set. */
if (flag & FLAG_SET_XMIT_TIMER)
tcp_set_xmit_timer(sk);
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
delivered = tcp_newly_delivered(sk, delivered, flag);
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
tcp_xmit_recovery(sk, rexmit);
return 1;
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
tcp_newly_delivered(sk, delivered, flag);
}
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
*/
tcp_ack_probe(sk);
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
return 1;
old_ack:
/* If data was SACKed, tag it and see if we should send more data.
* If data was DSACKed, see if we can undo a cwnd reduction.
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
tcp_newly_delivered(sk, delivered, flag);
tcp_xmit_recovery(sk, rexmit);
}
return 0;
}
static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
bool syn, struct tcp_fastopen_cookie *foc,
bool exp_opt)
{
/* Valid only in SYN or SYN-ACK with an even length. */
if (!foc || !syn || len < 0 || (len & 1))
return;
if (len >= TCP_FASTOPEN_COOKIE_MIN &&
len <= TCP_FASTOPEN_COOKIE_MAX)
memcpy(foc->val, cookie, len);
else if (len != 0)
len = -1;
foc->len = len;
foc->exp = exp_opt;
}
static bool smc_parse_options(const struct tcphdr *th,
struct tcp_options_received *opt_rx,
const unsigned char *ptr,
int opsize)
{
#if IS_ENABLED(CONFIG_SMC)
if (static_branch_unlikely(&tcp_have_smc)) {
if (th->syn && !(opsize & 1) &&
opsize >= TCPOLEN_EXP_SMC_BASE &&
get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
opt_rx->smc_ok = 1;
return true;
}
}
#endif
return false;
}
/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
* value on success.
*/
static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
{
const unsigned char *ptr = (const unsigned char *)(th + 1);
int length = (th->doff * 4) - sizeof(struct tcphdr);
u16 mss = 0;
while (length > 0) {
int opcode = *ptr++;
int opsize;
switch (opcode) {
case TCPOPT_EOL:
return mss;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
continue;
default:
if (length < 2)
return mss;
opsize = *ptr++;
if (opsize < 2) /* "silly options" */
return mss;
if (opsize > length)
return mss; /* fail on partial options */
if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
u16 in_mss = get_unaligned_be16(ptr);
if (in_mss) {
if (user_mss && user_mss < in_mss)
in_mss = user_mss;
mss = in_mss;
}
}
ptr += opsize - 2;
length -= opsize;
}
}
return mss;
}
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
*/
void tcp_parse_options(const struct net *net,
const struct sk_buff *skb,
struct tcp_options_received *opt_rx, int estab,
struct tcp_fastopen_cookie *foc)
{
const unsigned char *ptr;
const struct tcphdr *th = tcp_hdr(skb);
int length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
opt_rx->saw_unknown = 0;
while (length > 0) { int opcode = *ptr++;
int opsize;
switch (opcode) {
case TCPOPT_EOL:
return;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
continue;
default:
if (length < 2)
return;
opsize = *ptr++;
if (opsize < 2) /* "silly options" */
return;
if (opsize > length)
return; /* don't parse partial options */
switch (opcode) {
case TCPOPT_MSS:
if (opsize == TCPOLEN_MSS && th->syn && !estab) {
u16 in_mss = get_unaligned_be16(ptr);
if (in_mss) {
if (opt_rx->user_mss &&
opt_rx->user_mss < in_mss)
in_mss = opt_rx->user_mss;
opt_rx->mss_clamp = in_mss;
}
}
break;
case TCPOPT_WINDOW:
if (opsize == TCPOLEN_WINDOW && th->syn && !estab && net->ipv4.sysctl_tcp_window_scaling) { __u8 snd_wscale = *(__u8 *)ptr;
opt_rx->wscale_ok = 1;
if (snd_wscale > TCP_MAX_WSCALE) { net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
__func__,
snd_wscale,
TCP_MAX_WSCALE);
snd_wscale = TCP_MAX_WSCALE;
}
opt_rx->snd_wscale = snd_wscale;
}
break;
case TCPOPT_TIMESTAMP:
if ((opsize == TCPOLEN_TIMESTAMP) && ((estab && opt_rx->tstamp_ok) || (!estab && net->ipv4.sysctl_tcp_timestamps))) { opt_rx->saw_tstamp = 1;
opt_rx->rcv_tsval = get_unaligned_be32(ptr);
opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
}
break;
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM && th->syn && !estab && net->ipv4.sysctl_tcp_sack) { opt_rx->sack_ok = TCP_SACK_SEEN;
tcp_sack_reset(opt_rx);
}
break;
case TCPOPT_SACK:
if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
opt_rx->sack_ok) {
TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
}
break;
#ifdef CONFIG_TCP_MD5SIG
case TCPOPT_MD5SIG:
/*
* The MD5 Hash has already been
* checked (see tcp_v{4,6}_do_rcv()).
*/
break;
#endif
case TCPOPT_FASTOPEN:
tcp_parse_fastopen_option(
opsize - TCPOLEN_FASTOPEN_BASE,
ptr, th->syn, foc, false);
break;
case TCPOPT_EXP:
/* Fast Open option shares code 254 using a
* 16 bits magic number.
*/
if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
get_unaligned_be16(ptr) ==
TCPOPT_FASTOPEN_MAGIC) {
tcp_parse_fastopen_option(opsize -
TCPOLEN_EXP_FASTOPEN_BASE,
ptr + 2, th->syn, foc, true);
break;
}
if (smc_parse_options(th, opt_rx, ptr, opsize))
break;
opt_rx->saw_unknown = 1;
break;
default:
opt_rx->saw_unknown = 1;
}
ptr += opsize-2;
length -= opsize;
}
}
}
EXPORT_SYMBOL(tcp_parse_options);
static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
{
const __be32 *ptr = (const __be32 *)(th + 1);
if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
tp->rx_opt.saw_tstamp = 1;
++ptr;
tp->rx_opt.rcv_tsval = ntohl(*ptr);
++ptr;
if (*ptr)
tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
else
tp->rx_opt.rcv_tsecr = 0;
return true;
}
return false;
}
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
static bool tcp_fast_parse_options(const struct net *net,
const struct sk_buff *skb,
const struct tcphdr *th, struct tcp_sock *tp)
{
/* In the spirit of fast parsing, compare doff directly to constant
* values. Because equality is used, short doff can be ignored here.
*/
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
return false;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
if (tcp_parse_aligned_timestamp(tp, th))
return true;
}
tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
return true;
}
#ifdef CONFIG_TCP_MD5SIG
/*
* Parse MD5 Signature option
*/
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
{
int length = (th->doff << 2) - sizeof(*th);
const u8 *ptr = (const u8 *)(th + 1);
/* If not enough data remaining, we can short cut */
while (length >= TCPOLEN_MD5SIG) {
int opcode = *ptr++;
int opsize;
switch (opcode) {
case TCPOPT_EOL:
return NULL;
case TCPOPT_NOP:
length--;
continue;
default:
opsize = *ptr++;
if (opsize < 2 || opsize > length)
return NULL;
if (opcode == TCPOPT_MD5SIG)
return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
}
ptr += opsize - 2;
length -= opsize;
}
return NULL;
}
EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
*
* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
* it can pass through stack. So, the following predicate verifies that
* this segment is not used for anything but congestion avoidance or
* fast retransmit. Moreover, we even are able to eliminate most of such
* second order effects, if we apply some small "replay" window (~RTO)
* to timestamp space.
*
* All these measures still do not guarantee that we reject wrapped ACKs
* on networks with high bandwidth, when sequence space is recycled fastly,
* but it guarantees that such events will be very rare and do not affect
* connection seriously. This doesn't look nice, but alas, PAWS is really
* buggy extension.
*
* [ Later note. Even worse! It is buggy for segments _with_ data. RFC
* states that events when retransmit arrives after original data are rare.
* It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
* the biggest problem on large power networks even with minor reordering.
* OK, let's give it small replay window. If peer clock is even 1hz, it is safe
* up to bandwidth of 18Gigabit/sec. 8) ]
*/
static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct tcphdr *th = tcp_hdr(skb);
u32 seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
return (/* 1. Pure ACK with correct sequence number. */
(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
/* 2. ... and duplicate ACK. */
ack == tp->snd_una &&
/* 3. ... and does not update window. */
!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
/* 4. ... and sits in replay window. */
(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
}
static inline bool tcp_paws_discard(const struct sock *sk,
const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
!tcp_disordered_ack(sk, skb);
}
/* Check segment sequence number for validity.
*
* Segment controls are considered valid, if the segment
* fits to the window after truncation to the window. Acceptability
* of data (and SYN, FIN, of course) is checked separately.
* See tcp_data_queue(), for example.
*
* Also, controls (RST is main one) are accepted using RCV.WUP instead
* of RCV.NXT. Peer still did not advance his SND.UNA when we
* delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
* (borrowed from freebsd)
*/
static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
{
return !before(end_seq, tp->rcv_wup) &&
!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
}
/* When we get a reset we do this. */
void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
trace_tcp_receive_reset(sk);
/* mptcp can't tell us to ignore reset pkts,
* so just ignore the return value of mptcp_incoming_options().
*/
if (sk_is_mptcp(sk))
mptcp_incoming_options(sk, skb);
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) {
case TCP_SYN_SENT:
sk->sk_err = ECONNREFUSED;
break;
case TCP_CLOSE_WAIT:
sk->sk_err = EPIPE;
break;
case TCP_CLOSE:
return;
default:
sk->sk_err = ECONNRESET;
}
/* This barrier is coupled with smp_rmb() in tcp_poll() */
smp_wmb();
tcp_write_queue_purge(sk);
tcp_done(sk);
if (!sock_flag(sk, SOCK_DEAD))
sk_error_report(sk);
}
/*
* Process the FIN bit. This now behaves as it is supposed to work
* and the FIN takes effect when it is validly part of sequence
* space. Not before when we get holes.
*
* If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
* (and thence onto LAST-ACK and finally, CLOSE, we never enter
* TIME-WAIT)
*
* If we are in FINWAIT-1, a received FIN indicates simultaneous
* close and we go into CLOSING (and later onto TIME-WAIT)
*
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
inet_csk_schedule_ack(sk);
sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(sk, SOCK_DONE);
switch (sk->sk_state) {
case TCP_SYN_RECV:
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
inet_csk_enter_pingpong_mode(sk);
break;
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
/* Received a retransmission of the FIN, do
* nothing.
*/
break;
case TCP_LAST_ACK:
/* RFC793: Remain in the LAST-ACK state. */
break;
case TCP_FIN_WAIT1:
/* This case occurs when a simultaneous close
* happens, we must ack the received FIN and
* enter the CLOSING state.
*/
tcp_send_ack(sk);
tcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
/* Received a FIN -- send ACK and enter TIME_WAIT. */
tcp_send_ack(sk);
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
break;
default:
/* Only TCP_LISTEN and TCP_CLOSE are left, in these
* cases we should never reach this piece of code.
*/
pr_err("%s: Impossible, sk->sk_state=%d\n",
__func__, sk->sk_state);
break;
}
/* It _is_ possible, that we have something out-of-order _after_ FIN.
* Probably, we should reset in this case. For now drop them.
*/
skb_rbtree_purge(&tp->out_of_order_queue);
if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
sk_mem_reclaim(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
/* Do not send POLL_HUP for half duplex close. */
if (sk->sk_shutdown == SHUTDOWN_MASK ||
sk->sk_state == TCP_CLOSE)
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
else
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
}
}
static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
u32 end_seq)
{
if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
if (before(seq, sp->start_seq))
sp->start_seq = seq;
if (after(end_seq, sp->end_seq))
sp->end_seq = end_seq;
return true;
}
return false;
}
static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
int mib_idx;
if (before(seq, tp->rcv_nxt))
mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
else
mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
NET_INC_STATS(sock_net(sk), mib_idx);
tp->rx_opt.dsack = 1;
tp->duplicate_sack[0].start_seq = seq;
tp->duplicate_sack[0].end_seq = end_seq;
}
}
static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
if (!tp->rx_opt.dsack)
tcp_dsack_set(sk, seq, end_seq);
else
tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}
static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
{
/* When the ACK path fails or drops most ACKs, the sender would
* timeout and spuriously retransmit the same segment repeatedly.
* The receiver remembers and reflects via DSACKs. Leverage the
* DSACK state and change the txhash to re-route speculatively.
*/
if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
sk_rethink_txhash(sk))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
tcp_rcv_spurious_retrans(sk, skb);
if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
end_seq = tp->rcv_nxt;
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
}
}
tcp_send_ack(sk);
}
/* These routines update the SACK block as out-of-order packets arrive or
* in-order packets close up the sequence space.
*/
static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
{
int this_sack;
struct tcp_sack_block *sp = &tp->selective_acks[0];
struct tcp_sack_block *swalk = sp + 1;
/* See if the recent change to the first SACK eats into
* or hits the sequence space of other SACK blocks, if so coalesce.
*/
for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
int i;
/* Zap SWALK, by moving every further SACK up by one slot.
* Decrease num_sacks.
*/
tp->rx_opt.num_sacks--;
for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
sp[i] = sp[i + 1];
continue;
}
this_sack++;
swalk++;
}
}
static void tcp_sack_compress_send_ack(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (!tp->compressed_ack)
return;
if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
__sock_put(sk);
/* Since we have to send one ack finally,
* substract one from tp->compressed_ack to keep
* LINUX_MIB_TCPACKCOMPRESSED accurate.
*/
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
tp->compressed_ack - 1);
tp->compressed_ack = 0;
tcp_send_ack(sk);
}
/* Reasonable amount of sack blocks included in TCP SACK option
* The max is 4, but this becomes 3 if TCP timestamps are there.
* Given that SACK packets might be lost, be conservative and use 2.
*/
#define TCP_SACK_BLOCKS_EXPECTED 2
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_sack_block *sp = &tp->selective_acks[0];
int cur_sacks = tp->rx_opt.num_sacks;
int this_sack;
if (!cur_sacks)
goto new_sack;
for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
if (tcp_sack_extend(sp, seq, end_seq)) {
if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
tcp_sack_compress_send_ack(sk);
/* Rotate this_sack to the first one. */
for (; this_sack > 0; this_sack--, sp--)
swap(*sp, *(sp - 1));
if (cur_sacks > 1)
tcp_sack_maybe_coalesce(tp);
return;
}
}
if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
tcp_sack_compress_send_ack(sk);
/* Could not find an adjacent existing SACK, build a new one,
* put it at the front, and shift everyone else down. We
* always know there is at least one SACK present already here.
*
* If the sack array is full, forget about the last one.
*/
if (this_sack >= TCP_NUM_SACKS) {
this_sack--;
tp->rx_opt.num_sacks--;
sp--;
}
for (; this_sack > 0; this_sack--, sp--)
*sp = *(sp - 1);
new_sack:
/* Build the new head SACK, and we're done. */
sp->start_seq = seq;
sp->end_seq = end_seq;
tp->rx_opt.num_sacks++;
}
/* RCV.NXT advances, some SACKs should be eaten. */
static void tcp_sack_remove(struct tcp_sock *tp)
{
struct tcp_sack_block *sp = &tp->selective_acks[0];
int num_sacks = tp->rx_opt.num_sacks;
int this_sack;
/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tp->rx_opt.num_sacks = 0;
return;
}
for (this_sack = 0; this_sack < num_sacks;) {
/* Check if the start of the sack is covered by RCV.NXT. */
if (!before(tp->rcv_nxt, sp->start_seq)) {
int i;
/* RCV.NXT must cover all the block! */
WARN_ON(before(tp->rcv_nxt, sp->end_seq));
/* Zap this SACK, by moving forward any other SACKS. */
for (i = this_sack+1; i < num_sacks; i++)
tp->selective_acks[i-1] = tp->selective_acks[i];
num_sacks--;
continue;
}
this_sack++;
sp++;
}
tp->rx_opt.num_sacks = num_sacks;
}
/**
* tcp_try_coalesce - try to merge skb to prior one
* @sk: socket
* @to: prior buffer
* @from: buffer to add in queue
* @fragstolen: pointer to boolean
*
* Before queueing skb @from after @to, try to merge them
* to reduce overall memory use and queue lengths, if cost is small.
* Packets in ofo or receive queues can stay a long time.
* Better try to coalesce them right now to avoid future collapses.
* Returns true if caller should free @from instead of queueing it
*/
static bool tcp_try_coalesce(struct sock *sk,
struct sk_buff *to,
struct sk_buff *from,
bool *fragstolen)
{
int delta;
*fragstolen = false;
/* Its possible this segment overlaps with prior segment in queue */
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
if (!mptcp_skb_can_collapse(to, from))
return false;
#ifdef CONFIG_TLS_DEVICE
if (from->decrypted != to->decrypted)
return false;
#endif
if (!skb_try_coalesce(to, from, fragstolen, &delta))
return false;
atomic_add(delta, &sk->sk_rmem_alloc);
sk_mem_charge(sk, delta);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
if (TCP_SKB_CB(from)->has_rxtstamp) {
TCP_SKB_CB(to)->has_rxtstamp = true;
to->tstamp = from->tstamp;
skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp;
}
return true;
}
static bool tcp_ooo_try_coalesce(struct sock *sk,
struct sk_buff *to,
struct sk_buff *from,
bool *fragstolen)
{
bool res = tcp_try_coalesce(sk, to, from, fragstolen);
/* In case tcp_drop() is called later, update to->gso_segs */
if (res) {
u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
max_t(u16, 1, skb_shinfo(from)->gso_segs);
skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
}
return res;
}
static void tcp_drop(struct sock *sk, struct sk_buff *skb)
{
sk_drops_add(sk, skb);
__kfree_skb(skb);
}
/* This one checks to see if we can put data from the
* out_of_order queue into the receive_queue.
*/
static void tcp_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 dsack_high = tp->rcv_nxt;
bool fin, fragstolen, eaten;
struct sk_buff *skb, *tail;
struct rb_node *p;
p = rb_first(&tp->out_of_order_queue);
while (p) {
skb = rb_to_skb(p);
if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
break;
if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
__u32 dsack = dsack_high;
if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
dsack_high = TCP_SKB_CB(skb)->end_seq;
tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
}
p = rb_next(p);
rb_erase(&skb->rbnode, &tp->out_of_order_queue);
if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
tcp_drop(sk, skb);
continue;
}
tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
if (!eaten)
__skb_queue_tail(&sk->sk_receive_queue, skb);
else
kfree_skb_partial(skb, fragstolen);
if (unlikely(fin)) {
tcp_fin(sk);
/* tcp_fin() purges tp->out_of_order_queue,
* so we must end this loop right now.
*/
break;
}
}
}
static bool tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
unsigned int size)
{
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
!sk_rmem_schedule(sk, skb, size)) {
if (tcp_prune_queue(sk) < 0)
return -1;
while (!sk_rmem_schedule(sk, skb, size)) {
if (!tcp_prune_ofo_queue(sk))
return -1;
}
}
return 0;
}
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct rb_node **p, *parent;
struct sk_buff *skb1;
u32 seq, end_seq;
bool fragstolen;
tcp_ecn_check_ce(sk, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
sk->sk_data_ready(sk);
tcp_drop(sk, skb);
return;
}
/* Disable header prediction. */
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
seq = TCP_SKB_CB(skb)->seq;
end_seq = TCP_SKB_CB(skb)->end_seq;
p = &tp->out_of_order_queue.rb_node;
if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
/* Initial out of order segment, build 1 SACK. */
if (tcp_is_sack(tp)) {
tp->rx_opt.num_sacks = 1;
tp->selective_acks[0].start_seq = seq;
tp->selective_acks[0].end_seq = end_seq;
}
rb_link_node(&skb->rbnode, NULL, p);
rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
tp->ooo_last_skb = skb;
goto end;
}
/* In the typical case, we are adding an skb to the end of the list.
* Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
*/
if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
skb, &fragstolen)) {
coalesce_done:
/* For non sack flows, do not grow window to force DUPACK
* and trigger fast retransmit.
*/
if (tcp_is_sack(tp))
tcp_grow_window(sk, skb, true);
kfree_skb_partial(skb, fragstolen);
skb = NULL;
goto add_sack;
}
/* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
parent = &tp->ooo_last_skb->rbnode;
p = &parent->rb_right;
goto insert;
}
/* Find place to insert this segment. Handle overlaps on the way. */
parent = NULL;
while (*p) {
parent = *p;
skb1 = rb_to_skb(parent);
if (before(seq, TCP_SKB_CB(skb1)->seq)) {
p = &parent->rb_left;
continue;
}
if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
tcp_drop(sk, skb);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
}
if (after(seq, TCP_SKB_CB(skb1)->seq)) {
/* Partial overlap. */
tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
} else {
/* skb's seq == skb1's seq and skb covers skb1.
* Replace skb1 with skb.
*/
rb_replace_node(&skb1->rbnode, &skb->rbnode,
&tp->out_of_order_queue);
tcp_dsack_extend(sk,
TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
tcp_drop(sk, skb1);
goto merge_right;
}
} else if (tcp_ooo_try_coalesce(sk, skb1,
skb, &fragstolen)) {
goto coalesce_done;
}
p = &parent->rb_right;
}
insert:
/* Insert segment into RB tree. */
rb_link_node(&skb->rbnode, parent, p);
rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
merge_right:
/* Remove other segments covered by skb. */
while ((skb1 = skb_rb_next(skb)) != NULL) {
if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
break;
if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
end_seq);
break;
}
rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
tcp_drop(sk, skb1);
}
/* If there is no skb after us, we are the last_skb ! */
if (!skb1)
tp->ooo_last_skb = skb;
add_sack:
if (tcp_is_sack(tp))
tcp_sack_new_ofo_skb(sk, seq, end_seq);
end:
if (skb) {
/* For non sack flows, do not grow window to force DUPACK
* and trigger fast retransmit.
*/
if (tcp_is_sack(tp))
tcp_grow_window(sk, skb, false);
skb_condense(skb);
skb_set_owner_r(skb, sk);
}
}
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
bool *fragstolen)
{
int eaten;
struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = (tail &&
tcp_try_coalesce(sk, tail,
skb, fragstolen)) ? 1 : 0;
tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
if (!eaten) {
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
}
return eaten;
}
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
struct sk_buff *skb;
int err = -ENOMEM;
int data_len = 0;
bool fragstolen;
if (size == 0)
return 0;
if (size > PAGE_SIZE) {
int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
data_len = npages << PAGE_SHIFT;
size = data_len + (size & ~PAGE_MASK);
}
skb = alloc_skb_with_frags(size - data_len, data_len,
PAGE_ALLOC_COSTLY_ORDER,
&err, sk->sk_allocation);
if (!skb)
goto err;
skb_put(skb, size - data_len);
skb->data_len = data_len;
skb->len = size;
if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
goto err_free;
}
err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
if (err)
goto err_free;
TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
if (tcp_queue_rcv(sk, skb, &fragstolen)) {
WARN_ON_ONCE(fragstolen); /* should not happen */
__kfree_skb(skb);
}
return size;
err_free:
kfree_skb(skb);
err:
return err;
}
void tcp_data_ready(struct sock *sk)
{
if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE))
sk->sk_data_ready(sk);
}
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
bool fragstolen;
int eaten;
/* If a subflow has been reset, the packet should not continue
* to be processed, drop the packet.
*/
if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
__kfree_skb(skb);
return;
}
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
__kfree_skb(skb);
return;
}
skb_dst_drop(skb);
__skb_pull(skb, tcp_hdr(skb)->doff * 4);
tp->rx_opt.dsack = 0;
/* Queue data for delivery to the user.
* Packets in sequence go to the receive queue.
* Out of sequence packets to the out_of_order_queue.
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
if (tcp_receive_window(tp) == 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
}
/* Ok. In sequence. In window. */
queue_and_out:
if (skb_queue_len(&sk->sk_receive_queue) == 0)
sk_forced_mem_schedule(sk, skb->truesize);
else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
sk->sk_data_ready(sk);
goto drop;
}
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
/* RFC5681. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
if (tp->rx_opt.num_sacks)
tcp_sack_remove(tp);
tcp_fast_path_check(sk);
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
tcp_data_ready(sk);
return;
}
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
tcp_rcv_spurious_retrans(sk, skb);
/* A retransmit, 2nd most common case. Force an immediate ack. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
drop:
tcp_drop(sk, skb);
return;
}
/* Out of window. F.e. zero window probe. */
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
goto out_of_window;
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
/* If window is closed, drop tail of packet. But after
* remembering D-SACK for its head made in previous line.
*/
if (!tcp_receive_window(tp)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
}
goto queue_and_out;
}
tcp_data_queue_ofo(sk, skb);
}
static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
{
if (list)
return !skb_queue_is_last(list, skb) ? skb->next : NULL;
return skb_rb_next(skb);
}
static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
struct sk_buff_head *list,
struct rb_root *root)
{
struct sk_buff *next = tcp_skb_next(skb, list);
if (list)
__skb_unlink(skb, list);
else
rb_erase(&skb->rbnode, root);
__kfree_skb(skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
return next;
}
/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct sk_buff *skb1;
while (*p) {
parent = *p;
skb1 = rb_to_skb(parent);
if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) p = &parent->rb_left;
else
p = &parent->rb_right;
}
rb_link_node(&skb->rbnode, parent, p);
rb_insert_color(&skb->rbnode, root);
}
/* Collapse contiguous sequence of skbs head..tail with
* sequence numbers start..end.
*
* If tail is NULL, this means until the end of the queue.
*
* Segments with FIN/SYN are not collapsed (only because this
* simplifies code)
*/
static void
tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
{
struct sk_buff *skb = head, *n;
struct sk_buff_head tmp;
bool end_of_skbs;
/* First, check that queue is collapsible and find
* the point where collapsing can be useful.
*/
restart:
for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
n = tcp_skb_next(skb, list);
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
skb = tcp_collapse_one(sk, skb, list, root);
if (!skb)
break;
goto restart;
}
/* The first skb to collapse is:
* - not SYN/FIN and
* - bloated or contains data before "start" or
* overlaps to the next one and mptcp allow collapsing.
*/
if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
(tcp_win_from_space(sk, skb->truesize) > skb->len ||
before(TCP_SKB_CB(skb)->seq, start))) {
end_of_skbs = false;
break;
}
if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
end_of_skbs = false;
break;
}
/* Decided to skip this, advance start seq. */
start = TCP_SKB_CB(skb)->end_seq;
}
if (end_of_skbs ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
return;
__skb_queue_head_init(&tmp);
while (before(start, end)) {
int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
struct sk_buff *nskb;
nskb = alloc_skb(copy, GFP_ATOMIC);
if (!nskb)
break;
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
#ifdef CONFIG_TLS_DEVICE
nskb->decrypted = skb->decrypted;
#endif
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
if (list)
__skb_queue_before(list, skb, nskb);
else
__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
skb_set_owner_r(nskb, sk);
mptcp_skb_ext_move(nskb, skb);
/* Copy data, releasing collapsed skbs. */
while (copy > 0) {
int offset = start - TCP_SKB_CB(skb)->seq;
int size = TCP_SKB_CB(skb)->end_seq - start;
BUG_ON(offset < 0);
if (size > 0) {
size = min(copy, size);
if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
BUG();
TCP_SKB_CB(nskb)->end_seq += size;
copy -= size;
start += size;
}
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
skb = tcp_collapse_one(sk, skb, list, root);
if (!skb ||
skb == tail ||
!mptcp_skb_can_collapse(nskb, skb) ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
goto end;
#ifdef CONFIG_TLS_DEVICE
if (skb->decrypted != nskb->decrypted)
goto end;
#endif
}
}
}
end:
skb_queue_walk_safe(&tmp, skb, n)
tcp_rbtree_insert(root, skb);
}
/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
* and tcp_collapse() them until all the queue is collapsed.
*/
static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 range_truesize, sum_tiny = 0;
struct sk_buff *skb, *head;
u32 start, end;
skb = skb_rb_first(&tp->out_of_order_queue);
new_range:
if (!skb) {
tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
return;
}
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
range_truesize = skb->truesize;
for (head = skb;;) {
skb = skb_rb_next(skb);
/* Range is terminated when we see a gap or when
* we are at the queue end.
*/
if (!skb ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) {
/* Do not attempt collapsing tiny skbs */
if (range_truesize != head->truesize ||
end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
tcp_collapse(sk, NULL, &tp->out_of_order_queue,
head, skb, start, end);
} else {
sum_tiny += range_truesize;
if (sum_tiny > sk->sk_rcvbuf >> 3)
return;
}
goto new_range;
}
range_truesize += skb->truesize;
if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
start = TCP_SKB_CB(skb)->seq;
if (after(TCP_SKB_CB(skb)->end_seq, end))
end = TCP_SKB_CB(skb)->end_seq;
}
}
/*
* Clean the out-of-order queue to make room.
* We drop high sequences packets to :
* 1) Let a chance for holes to be filled.
* 2) not add too big latencies if thousands of packets sit there.
* (But if application shrinks SO_RCVBUF, we could still end up
* freeing whole queue here)
* 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
*
* Return true if queue has shrunk.
*/
static bool tcp_prune_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct rb_node *node, *prev;
int goal;
if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
return false;
NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
goal = sk->sk_rcvbuf >> 3;
node = &tp->ooo_last_skb->rbnode;
do {
prev = rb_prev(node);
rb_erase(node, &tp->out_of_order_queue);
goal -= rb_to_skb(node)->truesize;
tcp_drop(sk, rb_to_skb(node));
if (!prev || goal <= 0) {
sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
!tcp_under_memory_pressure(sk))
break;
goal = sk->sk_rcvbuf >> 3;
}
node = prev;
} while (node);
tp->ooo_last_skb = rb_to_skb(prev);
/* Reset SACK state. A conforming SACK implementation will
* do the same at a timeout based retransmit. When a connection
* is in a sad state like this, we care only about integrity
* of the connection not performance.
*/
if (tp->rx_opt.sack_ok)
tcp_sack_reset(&tp->rx_opt);
return true;
}
/* Reduce allocated memory if we can, trying to get
* the socket within its memory limits again.
*
* Return less than zero if we should start dropping frames
* until the socket owning process reads some of the data
* to stabilize the situation.
*/
static int tcp_prune_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
else if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
tcp_collapse_ofo_queue(sk);
if (!skb_queue_empty(&sk->sk_receive_queue))
tcp_collapse(sk, &sk->sk_receive_queue, NULL,
skb_peek(&sk->sk_receive_queue),
NULL,
tp->copied_seq, tp->rcv_nxt);
sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
/* Collapsing did not help, destructive actions follow.
* This must not ever occur. */
tcp_prune_ofo_queue(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
/* If we are really being abused, tell the caller to silently
* drop receive data on the floor. It will get retransmitted
* and hopefully then we'll have sufficient space.
*/
NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */
tp->pred_flags = 0;
return -1;
}
static bool tcp_should_expand_sndbuf(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* If the user specified a specific send buffer setting, do
* not modify it.
*/
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
return false;
/* If we are under global TCP memory pressure, do not expand. */
if (tcp_under_memory_pressure(sk))
return false;
/* If we are under soft global TCP memory pressure, do not expand. */
if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
return false;
/* If we filled the congestion window, do not expand. */
if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
return false;
return true;
}
static void tcp_new_space(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tcp_should_expand_sndbuf(sk)) {
tcp_sndbuf_expand(sk);
tp->snd_cwnd_stamp = tcp_jiffies32;
}
INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
}
/* Caller made space either from:
* 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
* 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
*
* We might be able to generate EPOLLOUT to the application if:
* 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
* 2) notsent amount (tp->write_seq - tp->snd_nxt) became
* small enough that tcp_stream_memory_free() decides it
* is time to generate EPOLLOUT.
*/
void tcp_check_space(struct sock *sk)
{
/* pairs with tcp_poll() */
smp_mb();
if (sk->sk_socket &&
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
tcp_new_space(sk);
if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
static inline void tcp_data_snd_check(struct sock *sk)
{
tcp_push_pending_frames(sk);
tcp_check_space(sk);
}
/*
* Check if sending an ack is needed.
*/
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned long rtt, delay;
/* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
/* ... and right edge of window advances far enough.
* (tcp_recvmsg() will send ACK otherwise).
* If application uses SO_RCVLOWAT, we want send ack now if
* we have not received enough bytes to satisfy the condition.
*/
(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
__tcp_select_window(sk) >= tp->rcv_wnd)) ||
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* Protocol state mandates a one-time immediate ACK */
inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
send_now:
tcp_send_ack(sk);
return;
}
if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_send_delayed_ack(sk);
return;
}
if (!tcp_is_sack(tp) ||
tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
goto send_now;
if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
tp->dup_ack_counter = 0;
}
if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
tp->dup_ack_counter++;
goto send_now;
}
tp->compressed_ack++;
if (hrtimer_is_queued(&tp->compressed_ack_timer))
return;
/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
rtt = tp->rcv_rtt_est.rtt_us;
if (tp->srtt_us && tp->srtt_us < rtt)
rtt = tp->srtt_us;
delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
rtt * (NSEC_PER_USEC >> 3)/20);
sock_hold(sk);
hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns,
HRTIMER_MODE_REL_PINNED_SOFT);
}
static inline void tcp_ack_snd_check(struct sock *sk)
{
if (!inet_csk_ack_scheduled(sk)) {
/* We sent a data segment already. */
return;
}
__tcp_ack_snd_check(sk, 1);
}
/*
* This routine is only called when we have urgent data
* signaled. Its the 'slow' part of tcp_urg. It could be
* moved inline now as tcp_urg is only called from one
* place. We handle URGent data wrong. We have to - as
* BSD still doesn't use the correction from RFC961.
* For 1003.1g we should support a new option TCP_STDURG to permit
* either form (or just set the sysctl tcp_stdurg).
*/
static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 ptr = ntohs(th->urg_ptr);
if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
ptr--;
ptr += ntohl(th->seq);
/* Ignore urgent data that we've already seen and read. */
if (after(tp->copied_seq, ptr))
return;
/* Do not replay urg ptr.
*
* NOTE: interesting situation not covered by specs.
* Misbehaving sender may send urg ptr, pointing to segment,
* which we already have in ofo queue. We are not able to fetch
* such data and will stay in TCP_URG_NOTYET until will be eaten
* by recvmsg(). Seems, we are not obliged to handle such wicked
* situations. But it is worth to think about possibility of some
* DoSes using some hypothetical application level deadlock.
*/
if (before(ptr, tp->rcv_nxt))
return;
/* Do we already have a newer (or duplicate) urgent pointer? */
if (tp->urg_data && !after(ptr, tp->urg_seq))
return;
/* Tell the world about our new urgent pointer. */
sk_send_sigurg(sk);
/* We may be adding urgent data when the last byte read was
* urgent. To do this requires some care. We cannot just ignore
* tp->copied_seq since we would read the last urgent byte again
* as data, nor can we alter copied_seq until this data arrives
* or we break the semantics of SIOCATMARK (and thus sockatmark())
*
* NOTE. Double Dutch. Rendering to plain English: author of comment
* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
* and expect that both A and B disappear from stream. This is _wrong_.
* Though this happens in BSD with high probability, this is occasional.
* Any application relying on this is buggy. Note also, that fix "works"
* only in this artificial test. Insert some normal data between A and B and we will
* decline of BSD again. Verdict: it is better to remove to trap
* buggy users.
*/
if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
!sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
tp->copied_seq++;
if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
__skb_unlink(skb, &sk->sk_receive_queue);
__kfree_skb(skb);
}
}
tp->urg_data = TCP_URG_NOTYET;
WRITE_ONCE(tp->urg_seq, ptr);
/* Disable header prediction. */
tp->pred_flags = 0;
}
/* This is the 'fast' part of urgent handling. */
static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
/* Check if we get a new urgent pointer - normally not. */
if (th->urg)
tcp_check_urg(sk, th);
/* Do we wait for any urgent data? - normally not... */
if (tp->urg_data == TCP_URG_NOTYET) {
u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
th->syn;
/* Is the urgent pointer pointing into this packet? */
if (ptr < skb->len) {
u8 tmp;
if (skb_copy_bits(skb, ptr, &tmp, 1))
BUG();
tp->urg_data = TCP_URG_VALID | tmp;
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk);
}
}
}
/* Accept RST for rcv_nxt - 1 after a FIN.
* When tcp connections are abruptly terminated from Mac OSX (via ^C), a
* FIN is sent followed by a RST packet. The RST is sent with the same
* sequence number as the FIN, and thus according to RFC 5961 a challenge
* ACK should be sent. However, Mac OSX rate limits replies to challenge
* ACKs on the closed socket. In addition middleboxes can drop either the
* challenge ACK or a subsequent RST.
*/
static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
(1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
TCPF_CLOSING));
}
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, int syn_inerr)
{
struct tcp_sock *tp = tcp_sk(sk);
bool rst_seq_match = false;
/* RFC1323: H1. Apply PAWS check first. */
if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDPAWS,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
goto discard;
}
/* Reset is accepted even if it did not pass PAWS. */
}
/* Step 1: check sequence number */
if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
/* RFC793, page 37: "In all states except SYN-SENT, all reset
* (RST) segments are validated by checking their SEQ-fields."
* And page 69: "If an incoming segment is not acceptable,
* an acknowledgment should be sent in reply (unless the RST
* bit is set, if so drop the segment and return)".
*/
if (!th->rst) {
if (th->syn)
goto syn_challenge;
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDSEQ,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
} else if (tcp_reset_check(sk, skb)) {
tcp_reset(sk, skb);
}
goto discard;
}
/* Step 2: check RST bit */
if (th->rst) {
/* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
* FIN and SACK too if available):
* If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
* the right-most SACK block,
* then
* RESET the connection
* else
* Send a challenge ACK
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
tcp_reset_check(sk, skb)) {
rst_seq_match = true;
} else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
struct tcp_sack_block *sp = &tp->selective_acks[0];
int max_sack = sp[0].end_seq;
int this_sack;
for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
++this_sack) {
max_sack = after(sp[this_sack].end_seq,
max_sack) ?
sp[this_sack].end_seq : max_sack;
}
if (TCP_SKB_CB(skb)->seq == max_sack)
rst_seq_match = true;
}
if (rst_seq_match)
tcp_reset(sk, skb);
else {
/* Disable TFO if RST is out-of-order
* and no data has been received
* for current active TFO socket
*/
if (tp->syn_fastopen && !tp->data_segs_in &&
sk->sk_state == TCP_ESTABLISHED)
tcp_fastopen_active_disable(sk);
tcp_send_challenge_ack(sk, skb);
}
goto discard;
}
/* step 3: check security and precedence [ignored] */
/* step 4: Check for a SYN
* RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
syn_challenge:
if (syn_inerr)
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
tcp_send_challenge_ack(sk, skb);
goto discard;
}
bpf_skops_parse_hdr(sk, skb);
return true;
discard:
tcp_drop(sk, skb);
return false;
}
/*
* TCP receive function for the ESTABLISHED state.
*
* It is split into a fast path and a slow path. The fast path is
* disabled when:
* - A zero window was announced from us - zero window probing
* is only handled properly in the slow path.
* - Out of order segments arrived.
* - Urgent data is expected.
* - There is no buffer space left
* - Unexpected TCP flags/window values/header lengths are received
* (detected by checking the TCP header against pred_flags)
* - Data is sent in both directions. Fast path only supports pure senders
* or pure receivers (this means either the sequence number or the ack
* value must stay constant)
* - Unexpected TCP option.
*
* When these conditions are not satisfied it drops into a standard
* receive procedure patterned after RFC793 to handle all cases.
* The first three cases are guaranteed by proper pred_flags setting,
* the rest is checked inline. Fast processing is turned on in
* tcp_data_queue when everything is OK.
*/
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = (const struct tcphdr *)skb->data;
struct tcp_sock *tp = tcp_sk(sk);
unsigned int len = skb->len;
/* TCP congestion window tracking */
trace_tcp_probe(sk, skb);
tcp_mstamp_refresh(tp);
if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*
* Header prediction.
* The code loosely follows the one in the famous
* "30 instruction TCP receive" Van Jacobson mail.
*
* Van's trick is to deposit buffers into socket queue
* on a device interrupt, to call tcp_recv function
* on the receive process context and checksum and copy
* the buffer to user space. smart...
*
* Our current scheme is not silly either but we take the
* extra cost of the net_bh soft interrupt processing...
* We do checksum and copy also but from device to kernel.
*/
tp->rx_opt.saw_tstamp = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_prediction is to be made
* 'S' will always be tp->tcp_header_len >> 2
* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
* turn it off (when there are holes in the receive
* space for instance)
* PSH flag is ignored.
*/
if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
int tcp_header_len = tp->tcp_header_len;
/* Timestamp header prediction: tcp_header_len
* is automatically equal to th->doff*4 due to pred_flags
* match.
*/
/* Check timestamp */
if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
/* No? Slow path! */
if (!tcp_parse_aligned_timestamp(tp, th))
goto slow_path;
/* If PAWS failed, check it more carefully in slow path */
if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
goto slow_path;
/* DO NOT update ts_recent here, if checksum fails
* and timestamp was corrupted part, it will result
* in a hung connection since we will drop all
* future packets due to the PAWS test.
*/
}
if (len <= tcp_header_len) {
/* Bulk data transfer: sender */
if (len == tcp_header_len) {
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
/* We know that such packets are checksummed
* on entry.
*/
tcp_ack(sk, skb, 0);
__kfree_skb(skb);
tcp_data_snd_check(sk);
/* When receiving pure ack in fast path, update
* last ts ecr directly instead of calling
* tcp_rcv_rtt_measure_ts()
*/
tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
return;
} else { /* Header too small */
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
} else {
int eaten = 0;
bool fragstolen = false;
if (tcp_checksum_complete(skb))
goto csum_error;
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
/* Predicted packet is in window by definition.
* seq == rcv_nxt and rcv_wup <= rcv_nxt.
* Hence, check seq<=rcv_wup reduces to:
*/
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
tcp_store_ts_recent(tp);
tcp_rcv_rtt_measure_ts(sk, skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
__skb_pull(skb, tcp_header_len);
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
/* Well, only one small jumplet in fast path... */
tcp_ack(sk, skb, FLAG_DATA);
tcp_data_snd_check(sk);
if (!inet_csk_ack_scheduled(sk))
goto no_ack;
} else {
tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
}
__tcp_ack_snd_check(sk, 0);
no_ack:
if (eaten)
kfree_skb_partial(skb, fragstolen);
tcp_data_ready(sk);
return;
}
}
slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
if (!th->ack && !th->rst && !th->syn)
goto discard;
/*
* Standard slow path.
*/
if (!tcp_validate_incoming(sk, skb, th, 1))
return;
step5:
if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
tcp_urg(sk, skb, th);
/* step 7: process the segment text */
tcp_data_queue(sk, skb);
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
return;
csum_error:
trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
discard:
tcp_drop(sk, skb);
}
EXPORT_SYMBOL(tcp_rcv_established);
void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
tcp_mtup_init(sk);
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_metrics(sk);
/* Initialize the congestion window to start the transfer.
* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
* retransmitted. In light of RFC6298 more aggressive 1sec
* initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
* retransmission has occurred.
*/
if (tp->total_retrans > 1 && tp->undo_marker)
tp->snd_cwnd = 1;
else
tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
tp->snd_cwnd_stamp = tcp_jiffies32;
bpf_skops_established(sk, bpf_op, skb);
/* Initialize congestion control unless BPF initialized it already: */
if (!icsk->icsk_ca_initialized)
tcp_init_congestion_control(sk);
tcp_init_buffer_space(sk);
}
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_set_state(sk, TCP_ESTABLISHED);
icsk->icsk_ack.lrcvtime = tcp_jiffies32;
if (skb) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
sk_mark_napi_id(sk, skb);
}
tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
*/
tp->lsndtime = tcp_jiffies32;
if (sock_flag(sk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
if (!tp->rx_opt.snd_wscale)
__tcp_fast_path_on(tp, tp->snd_wnd);
else
tp->pred_flags = 0;
}
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
struct tcp_fastopen_cookie *cookie)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
bool syn_drop = false;
if (mss == tp->rx_opt.user_mss) {
struct tcp_options_received opt;
/* Get original SYNACK MSS value if user MSS sets mss_clamp */
tcp_clear_options(&opt);
opt.user_mss = opt.mss_clamp = 0;
tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
mss = opt.mss_clamp;
}
if (!tp->syn_fastopen) {
/* Ignore an unsolicited cookie */
cookie->len = -1; } else if (tp->total_retrans) {
/* SYN timed out and the SYN-ACK neither has a cookie nor
* acknowledges data. Presumably the remote received only
* the retransmitted (regular) SYNs: either the original
* SYN-data or the corresponding SYN-ACK was dropped.
*/
syn_drop = (cookie->len < 0 && data); } else if (cookie->len < 0 && !tp->syn_data) {
/* We requested a cookie but didn't get it. If we did not use
* the (old) exp opt format then try so next time (try_exp=1).
* Otherwise we go back to use the RFC7413 opt (try_exp=2).
*/
try_exp = tp->syn_fastopen_exp ? 2 : 1;
}
tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
if (data) { /* Retransmit unacked data in SYN */
if (tp->total_retrans)
tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
else
tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
skb_rbtree_walk_from(data)
tcp_mark_skb_lost(sk, data); tcp_xmit_retransmit_queue(sk);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
return true;
}
tp->syn_data_acked = tp->syn_data;
if (tp->syn_data_acked) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
/* SYN-data is counted as two separate packets in tcp_ack() */
if (tp->delivered > 1)
--tp->delivered;
}
tcp_fastopen_add_skb(sk, synack);
return false;
}
static void smc_check_reset_syn(struct tcp_sock *tp)
{
#if IS_ENABLED(CONFIG_SMC)
if (static_branch_unlikely(&tcp_have_smc)) {
if (tp->syn_smc && !tp->rx_opt.smc_ok)
tp->syn_smc = 0;
}
#endif
}
static void tcp_try_undo_spurious_syn(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 syn_stamp;
/* undo_marker is set when SYN or SYNACK times out. The timeout is
* spurious if the ACK's timestamp option echo value matches the
* original SYN timestamp.
*/
syn_stamp = tp->retrans_stamp; if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && syn_stamp == tp->rx_opt.rcv_tsecr) tp->undo_marker = 0;
}
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
bool fastopen_fail;
tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; if (th->ack) {
/* rfc793:
* "If the state is SYN-SENT then
* first check the ACK bit
* If the ACK bit is set
* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
* a reset (unless the RST bit is set, if so drop
* the segment and return)"
*/
if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
/* Previous FIN/ACK or RST/ACK might be ignored. */
if (icsk->icsk_retransmits == 0)
inet_csk_reset_xmit_timer(sk,
ICSK_TIME_RETRANS,
TCP_TIMEOUT_MIN, TCP_RTO_MAX);
goto reset_and_undo;
}
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
tcp_time_stamp(tp))) {
NET_INC_STATS(sock_net(sk),
LINUX_MIB_PAWSACTIVEREJECTED);
goto reset_and_undo;
}
/* Now ACK is acceptable.
*
* "If the RST bit is set
* If the ACK was acceptable then signal the user "error:
* connection reset", drop the segment, enter CLOSED state,
* delete TCB, and return."
*/
if (th->rst) { tcp_reset(sk, skb);
goto discard;
}
/* rfc793:
* "fifth, if neither of the SYN or RST bits is set then
* drop the segment and return."
*
* See note below!
* --ANK(990513)
*/
if (!th->syn)
goto discard_and_undo;
/* rfc793:
* "If the SYN bit is on ...
* are acceptable then ...
* (our SYN has been ACKed), change the connection
* state to ESTABLISHED..."
*/
tcp_ecn_rcv_synack(tp, th);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_try_undo_spurious_syn(sk);
tcp_ack(sk, skb, FLAG_SLOWPATH);
/* Ok.. it's good. Set up sequence numbers and
* move to established.
*/
WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
* never scaled.
*/
tp->snd_wnd = ntohs(th->window);
if (!tp->rx_opt.wscale_ok) {
tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
tp->window_clamp = min(tp->window_clamp, 65535U);
}
if (tp->rx_opt.saw_tstamp) { tp->rx_opt.tstamp_ok = 1;
tp->tcp_header_len =
sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
tcp_store_ts_recent(tp);
} else {
tp->tcp_header_len = sizeof(struct tcphdr);
}
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
/* Remember, tcp_poll() does not lock socket!
* Change state from SYN-SENT only after copied_seq
* is initialized. */
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
smc_check_reset_syn(tp);
smp_mb();
tcp_finish_connect(sk, skb);
fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
tcp_rcv_fastopen_synack(sk, skb, &foc);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
}
if (fastopen_fail)
return -1;
if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept ||
inet_csk_in_pingpong_mode(sk)) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
*
* It may be deleted, but with this feature tcpdumps
* look so _wonderfully_ clever, that I was not able
* to stand against the temptation 8) --ANK
*/
inet_csk_schedule_ack(sk);
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
discard:
tcp_drop(sk, skb);
return 0;
} else {
tcp_send_ack(sk);
}
return -1;
}
/* No ACK in the segment */
if (th->rst) {
/* rfc793:
* "If the RST bit is set
*
* Otherwise (no ACK) drop the segment and return."
*/
goto discard_and_undo;
}
/* PAWS check. */
if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
tcp_paws_reject(&tp->rx_opt, 0))
goto discard_and_undo;
if (th->syn) {
/* We see SYN without ACK. It is attempt of
* simultaneous connect with crossed SYNs.
* Particularly, it can be connect to self.
*/
tcp_set_state(sk, TCP_SYN_RECV);
if (tp->rx_opt.saw_tstamp) {
tp->rx_opt.tstamp_ok = 1;
tcp_store_ts_recent(tp);
tp->tcp_header_len =
sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
} else {
tp->tcp_header_len = sizeof(struct tcphdr);
}
WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
* never scaled.
*/
tp->snd_wnd = ntohs(th->window);
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->max_window = tp->snd_wnd;
tcp_ecn_rcv_syn(tp, th);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
tcp_send_synack(sk);
#if 0
/* Note, we could accept data and URG from this segment.
* There are no obstacles to make this (except that we must
* either change tcp_recvmsg() to prevent it from returning data
* before 3WHS completes per RFC793, or employ TCP Fast Open).
*
* However, if we ignore data in ACKless segments sometimes,
* we have no reasons to accept it sometimes.
* Also, seems the code doing it in step6 of tcp_rcv_state_process
* is not flawless. So, discard packet for sanity.
* Uncomment this return to process the data.
*/
return -1;
#else
goto discard;
#endif
}
/* "fifth, if neither of the SYN or RST bits is set then
* drop the segment and return."
*/
discard_and_undo:
tcp_clear_options(&tp->rx_opt);
tp->rx_opt.mss_clamp = saved_clamp;
goto discard;
reset_and_undo:
tcp_clear_options(&tp->rx_opt);
tp->rx_opt.mss_clamp = saved_clamp;
return 1;
}
static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
{
struct request_sock *req;
/* If we are still handling the SYNACK RTO, see if timestamp ECR allows
* undo. If peer SACKs triggered fast recovery, we can't undo here.
*/
if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
tcp_try_undo_loss(sk, false);
/* Reset rtx states to prevent spurious retransmits_timed_out() */
tcp_sk(sk)->retrans_stamp = 0;
inet_csk(sk)->icsk_retransmits = 0;
/* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
* we no longer need req so release it.
*/
req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
lockdep_sock_is_held(sk));
reqsk_fastopen_remove(sk, req, false);
/* Re-arm the timer because data may have been sent out.
* This is similar to the regular data transmission case
* when new data has just been ack'ed.
*
* (TFO) - we could try to be more aggressive and
* retransmitting any data sooner based on when they
* are sent out.
*/
tcp_rearm_rto(sk);
}
/*
* This function implements the receiving procedure of RFC 793 for
* all states except ESTABLISHED and TIME_WAIT.
* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
* address independent.
*/
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcphdr *th = tcp_hdr(skb);
struct request_sock *req;
int queued = 0;
bool acceptable;
switch (sk->sk_state) {
case TCP_CLOSE:
goto discard;
case TCP_LISTEN:
if (th->ack)
return 1;
if (th->rst)
goto discard;
if (th->syn) { if (th->fin)
goto discard;
/* It is possible that we process SYN packets from backlog,
* so we need to make sure to disable BH and RCU right there.
*/
rcu_read_lock();
local_bh_disable();
acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
local_bh_enable();
rcu_read_unlock();
if (!acceptable)
return 1;
consume_skb(skb);
return 0;
}
goto discard;
case TCP_SYN_SENT:
tp->rx_opt.saw_tstamp = 0;
tcp_mstamp_refresh(tp);
queued = tcp_rcv_synsent_state_process(sk, skb, th);
if (queued >= 0)
return queued;
/* Do step6 onward by hand. */
tcp_urg(sk, skb, th);
__kfree_skb(skb);
tcp_data_snd_check(sk);
return 0;
}
tcp_mstamp_refresh(tp);
tp->rx_opt.saw_tstamp = 0;
req = rcu_dereference_protected(tp->fastopen_rsk,
lockdep_sock_is_held(sk));
if (req) {
bool req_stolen;
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
if (!tcp_check_req(sk, skb, req, true, &req_stolen)) goto discard;
}
if (!th->ack && !th->rst && !th->syn)
goto discard;
if (!tcp_validate_incoming(sk, skb, th, 0))
return 0;
/* step 5: check the ACK field */
acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
FLAG_UPDATE_TS_RECENT |
FLAG_NO_CHALLENGE_ACK) > 0;
if (!acceptable) {
if (sk->sk_state == TCP_SYN_RECV)
return 1; /* send one RST */
tcp_send_challenge_ack(sk, skb);
goto discard;
}
switch (sk->sk_state) {
case TCP_SYN_RECV:
tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req); if (req) { tcp_rcv_synrecv_state_fastopen(sk);
} else {
tcp_try_undo_spurious_syn(sk);
tp->retrans_stamp = 0;
tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
skb);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
}
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
sk->sk_state_change(sk);
/* Note, that this wakeup is only for marginal crossed SYN case.
* Passively open sockets are not waked up, because
* sk->sk_sleep == NULL and sk->sk_socket == NULL.
*/
if (sk->sk_socket)
sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk);
/* Prevent spurious tcp_cwnd_restart() on first data packet */
tp->lsndtime = tcp_jiffies32;
tcp_initialize_rcv_mss(sk);
tcp_fast_path_on(tp);
break;
case TCP_FIN_WAIT1: {
int tmo;
if (req) tcp_rcv_synrecv_state_fastopen(sk); if (tp->snd_una != tp->write_seq)
break;
tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN;
sk_dst_confirm(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
/* Wake up lingering close() */
sk->sk_state_change(sk);
break;
}
if (tp->linger2 < 0) { tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
}
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
/* Receive out of order FIN after close() */
if (tp->syn_fastopen && th->fin) tcp_fastopen_active_disable(sk); tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
}
tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else if (th->fin || sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing
* and not so rare event. We still can lose it now,
* if it spins in bh_lock_sock(), but it is really
* marginal case.
*/
inet_csk_reset_keepalive_timer(sk, tmo);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto discard;
}
break;
}
case TCP_CLOSING:
if (tp->snd_una == tp->write_seq) { tcp_time_wait(sk, TCP_TIME_WAIT, 0);
goto discard;
}
break;
case TCP_LAST_ACK:
if (tp->snd_una == tp->write_seq) { tcp_update_metrics(sk);
tcp_done(sk);
goto discard;
}
break;
}
/* step 6: check the URG bit */
tcp_urg(sk, skb, th);
/* step 7: process the segment text */
switch (sk->sk_state) {
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* If a subflow has been reset, the packet should not
* continue to be processed, drop the packet.
*/
if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
goto discard;
break;
}
fallthrough;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
* RFC 1122 says we MUST send a reset.
* BSD 4.4 also does reset.
*/
if (sk->sk_shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk, skb);
return 1;
}
}
fallthrough;
case TCP_ESTABLISHED:
tcp_data_queue(sk, skb);
queued = 1;
break;
}
/* tcp_data could move socket to TIME-WAIT */
if (sk->sk_state != TCP_CLOSE) {
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
}
if (!queued) {
discard:
tcp_drop(sk, skb);
}
return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);
static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
{
struct inet_request_sock *ireq = inet_rsk(req);
if (family == AF_INET)
net_dbg_ratelimited("drop open request from %pI4/%u\n",
&ireq->ir_rmt_addr, port);
#if IS_ENABLED(CONFIG_IPV6)
else if (family == AF_INET6)
net_dbg_ratelimited("drop open request from %pI6/%u\n",
&ireq->ir_v6_rmt_addr, port);
#endif
}
/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
*
* If we receive a SYN packet with these bits set, it means a
* network is playing bad games with TOS bits. In order to
* avoid possible false congestion notifications, we disable
* TCP ECN negotiation.
*
* Exception: tcp_ca wants ECN. This is required for DCTCP
* congestion control: Linux DCTCP asserts ECT on all packets,
* including SYN, which is most optimal solution; however,
* others, such as FreeBSD do not.
*
* Exception: At least one of the reserved bits of the TCP header (th->res1) is
* set, indicating the use of a future TCP extension (such as AccECN). See
* RFC8311 §4.3 which updates RFC3168 to allow the development of such
* extensions.
*/
static void tcp_ecn_create_request(struct request_sock *req,
const struct sk_buff *skb,
const struct sock *listen_sk,
const struct dst_entry *dst)
{
const struct tcphdr *th = tcp_hdr(skb);
const struct net *net = sock_net(listen_sk);
bool th_ecn = th->ece && th->cwr;
bool ect, ecn_ok;
u32 ecn_ok_dst;
if (!th_ecn)
return;
ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
(ecn_ok_dst & DST_FEATURE_ECN_CA) ||
tcp_bpf_ca_needs_ecn((struct sock *)req))
inet_rsk(req)->ecn_ok = 1;
}
static void tcp_openreq_init(struct request_sock *req,
const struct tcp_options_received *rx_opt,
struct sk_buff *skb, const struct sock *sk)
{
struct inet_request_sock *ireq = inet_rsk(req);
req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
tcp_rsk(req)->snt_synack = 0;
tcp_rsk(req)->last_oow_ack_time = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
ireq->sack_ok = rx_opt->sack_ok;
ireq->snd_wscale = rx_opt->snd_wscale;
ireq->wscale_ok = rx_opt->wscale_ok;
ireq->acked = 0;
ireq->ecn_ok = 0;
ireq->ir_rmt_port = tcp_hdr(skb)->source;
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
ireq->ir_mark = inet_request_mark(sk, skb);
#if IS_ENABLED(CONFIG_SMC)
ireq->smc_ok = rx_opt->smc_ok;
#endif
}
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
struct sock *sk_listener,
bool attach_listener)
{
struct request_sock *req = reqsk_alloc(ops, sk_listener,
attach_listener);
if (req) {
struct inet_request_sock *ireq = inet_rsk(req);
ireq->ireq_opt = NULL;
#if IS_ENABLED(CONFIG_IPV6)
ireq->pktopts = NULL;
#endif
atomic64_set(&ireq->ir_cookie, 0);
ireq->ireq_state = TCP_NEW_SYN_RECV;
write_pnet(&ireq->ireq_net, sock_net(sk_listener));
ireq->ireq_family = sk_listener->sk_family;
}
return req;
}
EXPORT_SYMBOL(inet_reqsk_alloc);
/*
* Return true if a syncookie should be sent
*/
static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
bool want_cookie = false;
struct net *net = sock_net(sk);
#ifdef CONFIG_SYN_COOKIES
if (net->ipv4.sysctl_tcp_syncookies) {
msg = "Sending cookies";
want_cookie = true;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
} else
#endif
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
if (!queue->synflood_warned &&
net->ipv4.sysctl_tcp_syncookies != 2 &&
xchg(&queue->synflood_warned, 1) == 0)
net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
proto, sk->sk_num, msg);
return want_cookie;
}
static void tcp_reqsk_record_syn(const struct sock *sk,
struct request_sock *req,
const struct sk_buff *skb)
{
if (tcp_sk(sk)->save_syn) {
u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
struct saved_syn *saved_syn;
u32 mac_hdrlen;
void *base;
if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */
base = skb_mac_header(skb);
mac_hdrlen = skb_mac_header_len(skb);
len += mac_hdrlen;
} else {
base = skb_network_header(skb);
mac_hdrlen = 0;
}
saved_syn = kmalloc(struct_size(saved_syn, data, len),
GFP_ATOMIC);
if (saved_syn) {
saved_syn->mac_hdrlen = mac_hdrlen;
saved_syn->network_hdrlen = skb_network_header_len(skb);
saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
memcpy(saved_syn->data, base, len);
req->saved_syn = saved_syn;
}
}
}
/* If a SYN cookie is required and supported, returns a clamped MSS value to be
* used for SYN cookie generation.
*/
u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct tcphdr *th)
{
struct tcp_sock *tp = tcp_sk(sk);
u16 mss;
if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 &&
!inet_csk_reqsk_queue_is_full(sk))
return 0;
if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
return 0;
if (sk_acceptq_is_full(sk)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
return 0;
}
mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
if (!mss)
mss = af_ops->mss_clamp;
return mss;
}
EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
{
struct tcp_fastopen_cookie foc = { .len = -1 };
__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
struct request_sock *req;
bool want_cookie = false;
struct dst_entry *dst;
struct flowi fl;
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
* evidently real one.
*/
if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
if (!want_cookie)
goto drop;
}
if (sk_acceptq_is_full(sk)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
}
req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
if (!req)
goto drop;
req->syncookie = want_cookie;
tcp_rsk(req)->af_specific = af_ops;
tcp_rsk(req)->ts_off = 0;
#if IS_ENABLED(CONFIG_MPTCP)
tcp_rsk(req)->is_mptcp = 0;
#endif
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
tmp_opt.user_mss = tp->rx_opt.user_mss;
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
want_cookie ? NULL : &foc);
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
if (IS_ENABLED(CONFIG_SMC) && want_cookie)
tmp_opt.smc_ok = 0;
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb, sk);
inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
dst = af_ops->route_req(sk, skb, &fl, req);
if (!dst)
goto drop_and_free;
if (tmp_opt.tstamp_ok)
tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
if (!want_cookie && !isn) {
/* Kill the following clause, if you dislike this way. */
if (!net->ipv4.sysctl_tcp_syncookies &&
(net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(net->ipv4.sysctl_max_syn_backlog >> 2)) &&
!tcp_peer_is_proven(req, dst)) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
* proven to be alive.
* It means that we continue to communicate
* to destinations, already remembered
* to the moment of synflood.
*/
pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
rsk_ops->family);
goto drop_and_release;
}
isn = af_ops->init_seq(skb);
}
tcp_ecn_create_request(req, skb, sk, dst);
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
if (!tmp_opt.tstamp_ok)
inet_rsk(req)->ecn_ok = 0;
}
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
tcp_openreq_init_rwin(req, sk, dst);
sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
&foc, TCP_SYNACK_FASTOPEN, skb);
/* Add the child socket directly into the accept queue */
if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
reqsk_fastopen_remove(fastopen_sk, req, false);
bh_unlock_sock(fastopen_sk);
sock_put(fastopen_sk);
goto drop_and_free;
}
sk->sk_data_ready(sk);
bh_unlock_sock(fastopen_sk);
sock_put(fastopen_sk);
} else {
tcp_rsk(req)->tfo_listener = false;
if (!want_cookie)
inet_csk_reqsk_queue_hash_add(sk, req,
tcp_timeout_init((struct sock *)req));
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
TCP_SYNACK_COOKIE,
skb);
if (want_cookie) {
reqsk_free(req);
return 0;
}
}
reqsk_put(req);
return 0;
drop_and_release:
dst_release(dst);
drop_and_free:
__reqsk_free(req);
drop:
tcp_listendrop(sk);
return 0;
}
EXPORT_SYMBOL(tcp_conn_request);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KERNEL_PRINTK__
#define __KERNEL_PRINTK__
#include <linux/stdarg.h>
#include <linux/init.h>
#include <linux/kern_levels.h>
#include <linux/linkage.h>
#include <linux/cache.h>
#include <linux/ratelimit_types.h>
#include <linux/once_lite.h>
extern const char linux_banner[];
extern const char linux_proc_banner[];
extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
#define PRINTK_MAX_SINGLE_HEADER_LEN 2
static inline int printk_get_level(const char *buffer)
{
if (buffer[0] == KERN_SOH_ASCII && buffer[1]) { switch (buffer[1]) {
case '0' ... '7':
case 'c': /* KERN_CONT */
return buffer[1];
}
}
return 0;
}
static inline const char *printk_skip_level(const char *buffer)
{
if (printk_get_level(buffer))
return buffer + 2;
return buffer;
}
static inline const char *printk_skip_headers(const char *buffer)
{
while (printk_get_level(buffer))
buffer = printk_skip_level(buffer);
return buffer;
}
#define CONSOLE_EXT_LOG_MAX 8192
/* printk's without a loglevel use this.. */
#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT
/* We show everything that is MORE important than this.. */
#define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */
#define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */
#define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */
#define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */
/*
* Default used to be hard-coded at 7, quiet used to be hardcoded at 4,
* we're now allowing both to be set from kernel config.
*/
#define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT
#define CONSOLE_LOGLEVEL_QUIET CONFIG_CONSOLE_LOGLEVEL_QUIET
extern int console_printk[];
#define console_loglevel (console_printk[0])
#define default_message_loglevel (console_printk[1])
#define minimum_console_loglevel (console_printk[2])
#define default_console_loglevel (console_printk[3])
extern void console_verbose(void);
/* strlen("ratelimit") + 1 */
#define DEVKMSG_STR_MAX_SIZE 10
extern char devkmsg_log_str[];
struct ctl_table;
extern int suppress_printk;
struct va_format {
const char *fmt;
va_list *va;
};
/*
* FW_BUG
* Add this to a message where you are sure the firmware is buggy or behaves
* really stupid or out of spec. Be aware that the responsible BIOS developer
* should be able to fix this issue or at least get a concrete idea of the
* problem by reading your message without the need of looking at the kernel
* code.
*
* Use it for definite and high priority BIOS bugs.
*
* FW_WARN
* Use it for not that clear (e.g. could the kernel messed up things already?)
* and medium priority BIOS bugs.
*
* FW_INFO
* Use this one if you want to tell the user or vendor about something
* suspicious, but generally harmless related to the firmware.
*
* Use it for information or very low priority BIOS bugs.
*/
#define FW_BUG "[Firmware Bug]: "
#define FW_WARN "[Firmware Warn]: "
#define FW_INFO "[Firmware Info]: "
/*
* HW_ERR
* Add this to a message for hardware errors, so that user can report
* it to hardware vendor instead of LKML or software vendor.
*/
#define HW_ERR "[Hardware Error]: "
/*
* DEPRECATED
* Add this to a message whenever you want to warn user space about the use
* of a deprecated aspect of an API so they can stop using it
*/
#define DEPRECATED "[Deprecated]: "
/*
* Dummy printk for disabled debugging statements to use whilst maintaining
* gcc's format checking.
*/
#define no_printk(fmt, ...) \
({ \
if (0) \
printk(fmt, ##__VA_ARGS__); \
0; \
})
#ifdef CONFIG_EARLY_PRINTK
extern asmlinkage __printf(1, 2)
void early_printk(const char *fmt, ...);
#else
static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif
struct dev_printk_info;
#ifdef CONFIG_PRINTK
asmlinkage __printf(4, 0)
int vprintk_emit(int facility, int level,
const struct dev_printk_info *dev_info,
const char *fmt, va_list args);
asmlinkage __printf(1, 0)
int vprintk(const char *fmt, va_list args);
asmlinkage __printf(1, 2) __cold
int _printk(const char *fmt, ...);
/*
* Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
*/
__printf(1, 2) __cold int _printk_deferred(const char *fmt, ...);
extern void __printk_safe_enter(void);
extern void __printk_safe_exit(void);
/*
* The printk_deferred_enter/exit macros are available only as a hack for
* some code paths that need to defer all printk console printing. Interrupts
* must be disabled for the deferred duration.
*/
#define printk_deferred_enter __printk_safe_enter
#define printk_deferred_exit __printk_safe_exit
/*
* Please don't use printk_ratelimit(), because it shares ratelimiting state
* with all other unrelated printk_ratelimit() callsites. Instead use
* printk_ratelimited() or plain old __ratelimit().
*/
extern int __printk_ratelimit(const char *func);
#define printk_ratelimit() __printk_ratelimit(__func__)
extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
unsigned int interval_msec);
extern int printk_delay_msec;
extern int dmesg_restrict;
extern int
devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf,
size_t *lenp, loff_t *ppos);
extern void wake_up_klogd(void);
char *log_buf_addr_get(void);
u32 log_buf_len_get(void);
void log_buf_vmcoreinfo_setup(void);
void __init setup_log_buf(int early);
__printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
void dump_stack_print_info(const char *log_lvl);
void show_regs_print_info(const char *log_lvl);
extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold;
extern asmlinkage void dump_stack(void) __cold;
void printk_trigger_flush(void);
#else
static inline __printf(1, 0)
int vprintk(const char *s, va_list args)
{
return 0;
}
static inline __printf(1, 2) __cold
int _printk(const char *s, ...)
{
return 0;
}
static inline __printf(1, 2) __cold
int _printk_deferred(const char *s, ...)
{
return 0;
}
static inline void printk_deferred_enter(void)
{
}
static inline void printk_deferred_exit(void)
{
}
static inline int printk_ratelimit(void)
{
return 0;
}
static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies,
unsigned int interval_msec)
{
return false;
}
static inline void wake_up_klogd(void)
{
}
static inline char *log_buf_addr_get(void)
{
return NULL;
}
static inline u32 log_buf_len_get(void)
{
return 0;
}
static inline void log_buf_vmcoreinfo_setup(void)
{
}
static inline void setup_log_buf(int early)
{
}
static inline __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...)
{
}
static inline void dump_stack_print_info(const char *log_lvl)
{
}
static inline void show_regs_print_info(const char *log_lvl)
{
}
static inline void dump_stack_lvl(const char *log_lvl)
{
}
static inline void dump_stack(void)
{
}
static inline void printk_trigger_flush(void)
{
}
#endif
#ifdef CONFIG_SMP
extern int __printk_cpu_trylock(void);
extern void __printk_wait_on_cpu_lock(void);
extern void __printk_cpu_unlock(void);
/**
* printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning
* lock and disable interrupts.
* @flags: Stack-allocated storage for saving local interrupt state,
* to be passed to printk_cpu_unlock_irqrestore().
*
* If the lock is owned by another CPU, spin until it becomes available.
* Interrupts are restored while spinning.
*/
#define printk_cpu_lock_irqsave(flags) \
for (;;) { \
local_irq_save(flags); \
if (__printk_cpu_trylock()) \
break; \
local_irq_restore(flags); \
__printk_wait_on_cpu_lock(); \
}
/**
* printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant spinning
* lock and restore interrupts.
* @flags: Caller's saved interrupt state, from printk_cpu_lock_irqsave().
*/
#define printk_cpu_unlock_irqrestore(flags) \
do { \
__printk_cpu_unlock(); \
local_irq_restore(flags); \
} while (0) \
#else
#define printk_cpu_lock_irqsave(flags) ((void)flags)
#define printk_cpu_unlock_irqrestore(flags) ((void)flags)
#endif /* CONFIG_SMP */
extern int kptr_restrict;
/**
* pr_fmt - used by the pr_*() macros to generate the printk format string
* @fmt: format string passed from a pr_*() macro
*
* This macro can be used to generate a unified format string for pr_*()
* macros. A common use is to prefix all pr_*() messages in a file with a common
* string. For example, defining this at the top of a source file:
*
* #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
*
* would prefix all pr_info, pr_emerg... messages in the file with the module
* name.
*/
#ifndef pr_fmt
#define pr_fmt(fmt) fmt
#endif
struct module;
#ifdef CONFIG_PRINTK_INDEX
struct pi_entry {
const char *fmt;
const char *func;
const char *file;
unsigned int line;
/*
* While printk and pr_* have the level stored in the string at compile
* time, some subsystems dynamically add it at runtime through the
* format string. For these dynamic cases, we allow the subsystem to
* tell us the level at compile time.
*
* NULL indicates that the level, if any, is stored in fmt.
*/
const char *level;
/*
* The format string used by various subsystem specific printk()
* wrappers to prefix the message.
*
* Note that the static prefix defined by the pr_fmt() macro is stored
* directly in the message format (@fmt), not here.
*/
const char *subsys_fmt_prefix;
} __packed;
#define __printk_index_emit(_fmt, _level, _subsys_fmt_prefix) \
do { \
if (__builtin_constant_p(_fmt) && __builtin_constant_p(_level)) { \
/*
* We check __builtin_constant_p multiple times here
* for the same input because GCC will produce an error
* if we try to assign a static variable to fmt if it
* is not a constant, even with the outer if statement.
*/ \
static const struct pi_entry _entry \
__used = { \
.fmt = __builtin_constant_p(_fmt) ? (_fmt) : NULL, \
.func = __func__, \
.file = __FILE__, \
.line = __LINE__, \
.level = __builtin_constant_p(_level) ? (_level) : NULL, \
.subsys_fmt_prefix = _subsys_fmt_prefix,\
}; \
static const struct pi_entry *_entry_ptr \
__used __section(".printk_index") = &_entry; \
} \
} while (0)
#else /* !CONFIG_PRINTK_INDEX */
#define __printk_index_emit(...) do {} while (0)
#endif /* CONFIG_PRINTK_INDEX */
/*
* Some subsystems have their own custom printk that applies a va_format to a
* generic format, for example, to include a device number or other metadata
* alongside the format supplied by the caller.
*
* In order to store these in the way they would be emitted by the printk
* infrastructure, the subsystem provides us with the start, fixed string, and
* any subsequent text in the format string.
*
* We take a variable argument list as pr_fmt/dev_fmt/etc are sometimes passed
* as multiple arguments (eg: `"%s: ", "blah"`), and we must only take the
* first one.
*
* subsys_fmt_prefix must be known at compile time, or compilation will fail
* (since this is a mistake). If fmt or level is not known at compile time, no
* index entry will be made (since this can legitimately happen).
*/
#define printk_index_subsys_emit(subsys_fmt_prefix, level, fmt, ...) \
__printk_index_emit(fmt, level, subsys_fmt_prefix)
#define printk_index_wrap(_p_func, _fmt, ...) \
({ \
__printk_index_emit(_fmt, NULL, NULL); \
_p_func(_fmt, ##__VA_ARGS__); \
})
/**
* printk - print a kernel message
* @fmt: format string
*
* This is printk(). It can be called from any context. We want it to work.
*
* If printk indexing is enabled, _printk() is called from printk_index_wrap.
* Otherwise, printk is simply #defined to _printk.
*
* We try to grab the console_lock. If we succeed, it's easy - we log the
* output and call the console drivers. If we fail to get the semaphore, we
* place the output into the log buffer and return. The current holder of
* the console_sem will notice the new output in console_unlock(); and will
* send it to the consoles before releasing the lock.
*
* One effect of this deferred printing is that code which calls printk() and
* then changes console_loglevel may break. This is because console_loglevel
* is inspected when the actual printing occurs.
*
* See also:
* printf(3)
*
* See the vsnprintf() documentation for format string extensions over C99.
*/
#define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
#define printk_deferred(fmt, ...) \
printk_index_wrap(_printk_deferred, fmt, ##__VA_ARGS__)
/**
* pr_emerg - Print an emergency-level message
* @fmt: format string
* @...: arguments for the format string
*
* This macro expands to a printk with KERN_EMERG loglevel. It uses pr_fmt() to
* generate the format string.
*/
#define pr_emerg(fmt, ...) \
printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
/**
* pr_alert - Print an alert-level message
* @fmt: format string
* @...: arguments for the format string
*
* This macro expands to a printk with KERN_ALERT loglevel. It uses pr_fmt() to
* generate the format string.
*/
#define pr_alert(fmt, ...) \
printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
/**
* pr_crit - Print a critical-level message
* @fmt: format string
* @...: arguments for the format string
*
* This macro expands to a printk with KERN_CRIT loglevel. It uses pr_fmt() to
* generate the format string.
*/
#define pr_crit(fmt, ...) \
printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
/**
* pr_err - Print an error-level message
* @fmt: format string
* @...: arguments for the format string
*
* This macro expands to a printk with KERN_ERR loglevel. It uses pr_fmt() to
* generate the format string.
*/
#define pr_err(fmt, ...) \
printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
/**
* pr_warn - Print a warning-level message
* @fmt: format string
* @...: arguments for the format string
*
* This macro expands to a printk with KERN_WARNING loglevel. It uses pr_fmt()
* to generate the format string.
*/
#define pr_warn(fmt, ...) \
printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
/**
* pr_notice - Print a notice-level message
* @fmt: format string
* @...: arguments for the format string
*
* This macro expands to a printk with KERN_NOTICE loglevel. It uses pr_fmt() to
* generate the format string.
*/
#define pr_notice(fmt, ...) \
printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
/**
* pr_info - Print an info-level message
* @fmt: format string
* @...: arguments for the format string
*
* This macro expands to a printk with KERN_INFO loglevel. It uses pr_fmt() to
* generate the format string.
*/
#define pr_info(fmt, ...) \
printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
/**
* pr_cont - Continues a previous log message in the same line.
* @fmt: format string
* @...: arguments for the format string
*
* This macro expands to a printk with KERN_CONT loglevel. It should only be
* used when continuing a log message with no newline ('\n') enclosed. Otherwise
* it defaults back to KERN_DEFAULT loglevel.
*/
#define pr_cont(fmt, ...) \
printk(KERN_CONT fmt, ##__VA_ARGS__)
/**
* pr_devel - Print a debug-level message conditionally
* @fmt: format string
* @...: arguments for the format string
*
* This macro expands to a printk with KERN_DEBUG loglevel if DEBUG is
* defined. Otherwise it does nothing.
*
* It uses pr_fmt() to generate the format string.
*/
#ifdef DEBUG
#define pr_devel(fmt, ...) \
printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel(fmt, ...) \
no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif
/* If you are writing a driver, please use dev_dbg instead */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#include <linux/dynamic_debug.h>
/**
* pr_debug - Print a debug-level message conditionally
* @fmt: format string
* @...: arguments for the format string
*
* This macro expands to dynamic_pr_debug() if CONFIG_DYNAMIC_DEBUG is
* set. Otherwise, if DEBUG is defined, it's equivalent to a printk with
* KERN_DEBUG loglevel. If DEBUG is not defined it does nothing.
*
* It uses pr_fmt() to generate the format string (dynamic_pr_debug() uses
* pr_fmt() internally).
*/
#define pr_debug(fmt, ...) \
dynamic_pr_debug(fmt, ##__VA_ARGS__)
#elif defined(DEBUG)
#define pr_debug(fmt, ...) \
printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug(fmt, ...) \
no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif
/*
* Print a one-time message (analogous to WARN_ONCE() et al):
*/
#ifdef CONFIG_PRINTK
#define printk_once(fmt, ...) \
DO_ONCE_LITE(printk, fmt, ##__VA_ARGS__)
#define printk_deferred_once(fmt, ...) \
DO_ONCE_LITE(printk_deferred, fmt, ##__VA_ARGS__)
#else
#define printk_once(fmt, ...) \
no_printk(fmt, ##__VA_ARGS__)
#define printk_deferred_once(fmt, ...) \
no_printk(fmt, ##__VA_ARGS__)
#endif
#define pr_emerg_once(fmt, ...) \
printk_once(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
#define pr_alert_once(fmt, ...) \
printk_once(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_crit_once(fmt, ...) \
printk_once(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err_once(fmt, ...) \
printk_once(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn_once(fmt, ...) \
printk_once(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
#define pr_notice_once(fmt, ...) \
printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info_once(fmt, ...) \
printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
/* no pr_cont_once, don't do that... */
#if defined(DEBUG)
#define pr_devel_once(fmt, ...) \
printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel_once(fmt, ...) \
no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif
/* If you are writing a driver, please use dev_dbg instead */
#if defined(DEBUG)
#define pr_debug_once(fmt, ...) \
printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug_once(fmt, ...) \
no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif
/*
* ratelimited messages with local ratelimit_state,
* no local ratelimit_state used in the !PRINTK case
*/
#ifdef CONFIG_PRINTK
#define printk_ratelimited(fmt, ...) \
({ \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
\
if (__ratelimit(&_rs)) \
printk(fmt, ##__VA_ARGS__); \
})
#else
#define printk_ratelimited(fmt, ...) \
no_printk(fmt, ##__VA_ARGS__)
#endif
#define pr_emerg_ratelimited(fmt, ...) \
printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
#define pr_alert_ratelimited(fmt, ...) \
printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_crit_ratelimited(fmt, ...) \
printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err_ratelimited(fmt, ...) \
printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn_ratelimited(fmt, ...) \
printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
#define pr_notice_ratelimited(fmt, ...) \
printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info_ratelimited(fmt, ...) \
printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
/* no pr_cont_ratelimited, don't do that... */
#if defined(DEBUG)
#define pr_devel_ratelimited(fmt, ...) \
printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel_ratelimited(fmt, ...) \
no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif
/* If you are writing a driver, please use dev_dbg instead */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* descriptor check is first to prevent flooding with "callbacks suppressed" */
#define pr_debug_ratelimited(fmt, ...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt)); \
if (DYNAMIC_DEBUG_BRANCH(descriptor) && \
__ratelimit(&_rs)) \
__dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__); \
} while (0)
#elif defined(DEBUG)
#define pr_debug_ratelimited(fmt, ...) \
printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug_ratelimited(fmt, ...) \
no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif
extern const struct file_operations kmsg_fops;
enum {
DUMP_PREFIX_NONE,
DUMP_PREFIX_ADDRESS,
DUMP_PREFIX_OFFSET
};
extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
int groupsize, char *linebuf, size_t linebuflen,
bool ascii);
#ifdef CONFIG_PRINTK
extern void print_hex_dump(const char *level, const char *prefix_str,
int prefix_type, int rowsize, int groupsize,
const void *buf, size_t len, bool ascii);
#else
static inline void print_hex_dump(const char *level, const char *prefix_str,
int prefix_type, int rowsize, int groupsize,
const void *buf, size_t len, bool ascii)
{
}
static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
const void *buf, size_t len)
{
}
#endif
#if defined(CONFIG_DYNAMIC_DEBUG) || \
(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \
groupsize, buf, len, ascii) \
dynamic_hex_dump(prefix_str, prefix_type, rowsize, \
groupsize, buf, len, ascii)
#elif defined(DEBUG)
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \
groupsize, buf, len, ascii) \
print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize, \
groupsize, buf, len, ascii)
#else
static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
int rowsize, int groupsize,
const void *buf, size_t len, bool ascii)
{
}
#endif
/**
* print_hex_dump_bytes - shorthand form of print_hex_dump() with default params
* @prefix_str: string to prefix each line with;
* caller supplies trailing spaces for alignment if desired
* @prefix_type: controls whether prefix of an offset, address, or none
* is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
* @buf: data blob to dump
* @len: number of bytes in the @buf
*
* Calls print_hex_dump(), with log level of KERN_DEBUG,
* rowsize of 16, groupsize of 1, and ASCII output included.
*/
#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \
print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true)
#endif
// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/readahead.c - address_space-level file readahead.
*
* Copyright (C) 2002, Linus Torvalds
*
* 09Apr2002 Andrew Morton
* Initial version.
*/
#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
#include "internal.h"
/*
* Initialise a struct file's readahead state. Assumes that the caller has
* memset *ra to zero.
*/
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
/*
* see if a page needs releasing upon read_cache_pages() failure
* - the caller of read_cache_pages() may have set PG_private or PG_fscache
* before calling, such as the NFS fs marking pages that are cached locally
* on disk, thus we need to give the fs a chance to clean up in the event of
* an error
*/
static void read_cache_pages_invalidate_page(struct address_space *mapping,
struct page *page)
{
if (page_has_private(page)) {
if (!trylock_page(page))
BUG();
page->mapping = mapping;
do_invalidatepage(page, 0, PAGE_SIZE);
page->mapping = NULL;
unlock_page(page);
}
put_page(page);
}
/*
* release a list of pages, invalidating them first if need be
*/
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
struct list_head *pages)
{
struct page *victim;
while (!list_empty(pages)) {
victim = lru_to_page(pages);
list_del(&victim->lru);
read_cache_pages_invalidate_page(mapping, victim);
}
}
/**
* read_cache_pages - populate an address space with some pages & start reads against them
* @mapping: the address_space
* @pages: The address of a list_head which contains the target pages. These
* pages have their ->index populated and are otherwise uninitialised.
* @filler: callback routine for filling a single page.
* @data: private data for the callback routine.
*
* Hides the details of the LRU cache etc from the filesystems.
*
* Returns: %0 on success, error return by @filler otherwise
*/
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
int (*filler)(void *, struct page *), void *data)
{
struct page *page;
int ret = 0;
while (!list_empty(pages)) {
page = lru_to_page(pages);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
readahead_gfp_mask(mapping))) {
read_cache_pages_invalidate_page(mapping, page);
continue;
}
put_page(page);
ret = filler(data, page);
if (unlikely(ret)) {
read_cache_pages_invalidate_pages(mapping, pages);
break;
}
task_io_account_read(PAGE_SIZE);
}
return ret;
}
EXPORT_SYMBOL(read_cache_pages);
static void read_pages(struct readahead_control *rac, struct list_head *pages,
bool skip_page)
{
const struct address_space_operations *aops = rac->mapping->a_ops;
struct page *page;
struct blk_plug plug;
if (!readahead_count(rac))
goto out;
blk_start_plug(&plug);
if (aops->readahead) {
aops->readahead(rac);
/* Clean up the remaining pages */
while ((page = readahead_page(rac))) {
unlock_page(page);
put_page(page);
}
} else if (aops->readpages) { aops->readpages(rac->file, rac->mapping, pages,
readahead_count(rac));
/* Clean up the remaining pages */
put_pages_list(pages);
rac->_index += rac->_nr_pages;
rac->_nr_pages = 0;
} else {
while ((page = readahead_page(rac))) {
aops->readpage(rac->file, page);
put_page(page);
}
}
blk_finish_plug(&plug); BUG_ON(!list_empty(pages)); BUG_ON(readahead_count(rac));
out:
if (skip_page) rac->_index++;
}
/**
* page_cache_ra_unbounded - Start unchecked readahead.
* @ractl: Readahead control.
* @nr_to_read: The number of pages to read.
* @lookahead_size: Where to start the next readahead.
*
* This function is for filesystems to call when they want to start
* readahead beyond a file's stated i_size. This is almost certainly
* not the function you want to call. Use page_cache_async_readahead()
* or page_cache_sync_readahead() instead.
*
* Context: File is referenced by caller. Mutexes may be held by caller.
* May sleep, but will not reenter filesystem to reclaim memory.
*/
void page_cache_ra_unbounded(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct address_space *mapping = ractl->mapping;
unsigned long index = readahead_index(ractl);
LIST_HEAD(page_pool);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
unsigned long i;
/*
* Partway through the readahead operation, we will have added
* locked pages to the page cache, but will not yet have submitted
* them for I/O. Adding another page may need to allocate memory,
* which can trigger memory reclaim. Telling the VM we're in
* the middle of a filesystem operation will cause it to not
* touch file-backed pages, preventing a deadlock. Most (all?)
* filesystems already specify __GFP_NOFS in their mapping's
* gfp_mask, but let's be explicit here.
*/
unsigned int nofs = memalloc_nofs_save();
filemap_invalidate_lock_shared(mapping);
/*
* Preallocate as many pages as we will need.
*/
for (i = 0; i < nr_to_read; i++) { struct page *page = xa_load(&mapping->i_pages, index + i);
if (page && !xa_is_value(page)) {
/*
* Page already present? Kick off the current batch
* of contiguous pages before continuing with the
* next batch. This page may be the one we would
* have intended to mark as Readahead, but we don't
* have a stable reference to this page, and it's
* not worth getting one just for that.
*/
read_pages(ractl, &page_pool, true);
i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
page = __page_cache_alloc(gfp_mask);
if (!page)
break;
if (mapping->a_ops->readpages) { page->index = index + i;
list_add(&page->lru, &page_pool);
} else if (add_to_page_cache_lru(page, mapping, index + i,
gfp_mask) < 0) {
put_page(page);
read_pages(ractl, &page_pool, true);
i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
if (i == nr_to_read - lookahead_size)
SetPageReadahead(page);
ractl->_nr_pages++;
}
/*
* Now start the IO. We ignore I/O errors - if the page is not
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
read_pages(ractl, &page_pool, false);
filemap_invalidate_unlock_shared(mapping);
memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
/*
* do_page_cache_ra() actually reads a chunk of disk. It allocates
* the pages first, then submits them for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*/
void do_page_cache_ra(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct inode *inode = ractl->mapping->host; unsigned long index = readahead_index(ractl);
loff_t isize = i_size_read(inode);
pgoff_t end_index; /* The last page we want to read */
if (isize == 0)
return;
end_index = (isize - 1) >> PAGE_SHIFT;
if (index > end_index)
return;
/* Don't read past the page containing the last byte of the file */
if (nr_to_read > end_index - index) nr_to_read = end_index - index + 1; page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}
/*
* Chunk the readahead into 2 megabyte units, so that we don't pin too much
* memory at once.
*/
void force_page_cache_ra(struct readahead_control *ractl,
unsigned long nr_to_read)
{
struct address_space *mapping = ractl->mapping;
struct file_ra_state *ra = ractl->ra;
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages, index;
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
!mapping->a_ops->readahead))
return;
/*
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
index = readahead_index(ractl);
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
while (nr_to_read) {
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
ractl->_index = index;
do_page_cache_ra(ractl, this_chunk, 0);
index += this_chunk;
nr_to_read -= this_chunk;
}
}
/*
* Set the initial window size, round to next power of 2 and square
* for small size, x 4 for medium, and x 2 for large
* for 128k (32 page) max ra
* 1-8 page = 32k initial, > 8 page = 128k initial
*/
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
unsigned long newsize = roundup_pow_of_two(size);
if (newsize <= max / 32)
newsize = newsize * 4; else if (newsize <= max / 4) newsize = newsize * 2;
else
newsize = max;
return newsize;
}
/*
* Get the previous window size, ramp it up, and
* return it as the new window size.
*/
static unsigned long get_next_ra_size(struct file_ra_state *ra,
unsigned long max)
{
unsigned long cur = ra->size;
if (cur < max / 16)
return 4 * cur; if (cur <= max / 2) return 2 * cur;
return max;
}
/*
* On-demand readahead design.
*
* The fields in struct file_ra_state represent the most-recently-executed
* readahead attempt:
*
* |<----- async_size ---------|
* |------------------- size -------------------->|
* |==================#===========================|
* ^start ^page marked with PG_readahead
*
* To overlap application thinking time and disk I/O time, we do
* `readahead pipelining': Do not wait until the application consumed all
* readahead pages and stalled on the missing page at readahead_index;
* Instead, submit an asynchronous readahead I/O as soon as there are
* only async_size pages left in the readahead window. Normally async_size
* will be equal to size, for maximum pipelining.
*
* In interleaved sequential reads, concurrent streams on the same fd can
* be invalidating each other's readahead state. So we flag the new readahead
* page at (start+size-async_size) with PG_readahead, and use it as readahead
* indicator. The flag won't be set on already cached pages, to avoid the
* readahead-for-nothing fuss, saving pointless page cache lookups.
*
* prev_pos tracks the last visited byte in the _previous_ read request.
* It should be maintained by the caller, and will be used for detecting
* small random reads. Note that the readahead algorithm checks loosely
* for sequential patterns. Hence interleaved reads might be served as
* sequential ones.
*
* There is a special-case: if the first page which the application tries to
* read happens to be the first page of the file, it is assumed that a linear
* read is about to happen and the window is immediately set to the initial size
* based on I/O request size and the max_readahead.
*
* The code ramps up the readahead size aggressively at first, but slow down as
* it approaches max_readhead.
*/
/*
* Count contiguously cached pages from @index-1 to @index-@max,
* this count is a conservative estimation of
* - length of the sequential read sequence, or
* - thrashing threshold in memory tight systems
*/
static pgoff_t count_history_pages(struct address_space *mapping,
pgoff_t index, unsigned long max)
{
pgoff_t head;
rcu_read_lock();
head = page_cache_prev_miss(mapping, index - 1, max);
rcu_read_unlock();
return index - 1 - head;
}
/*
* page cache context based read-ahead
*/
static int try_context_readahead(struct address_space *mapping,
struct file_ra_state *ra,
pgoff_t index,
unsigned long req_size,
unsigned long max)
{
pgoff_t size;
size = count_history_pages(mapping, index, max);
/*
* not enough history pages:
* it could be a random read
*/
if (size <= req_size)
return 0;
/*
* starts from beginning of file:
* it is a strong indication of long-run stream (or whole-file-read)
*/
if (size >= index) size *= 2; ra->start = index;
ra->size = min(size + req_size, max);
ra->async_size = 1;
return 1;
}
/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
static void ondemand_readahead(struct readahead_control *ractl,
bool hit_readahead_marker, unsigned long req_size)
{
struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); struct file_ra_state *ra = ractl->ra;
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
unsigned long index = readahead_index(ractl);
pgoff_t prev_index;
/*
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
if (req_size > max_pages && bdi->io_pages > max_pages) max_pages = min(req_size, bdi->io_pages);
/*
* start of file
*/
if (!index)
goto initial_readahead;
/*
* It's the expected callback index, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
if ((index == (ra->start + ra->size - ra->async_size) ||
index == (ra->start + ra->size))) {
ra->start += ra->size; ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
/*
* Hit a marked page without valid readahead state.
* E.g. interleaved reads.
* Query the pagecache for async_size, which normally equals to
* readahead size. Ramp it up and use it as the new readahead size.
*/
if (hit_readahead_marker) {
pgoff_t start;
rcu_read_lock();
start = page_cache_next_miss(ractl->mapping, index + 1,
max_pages);
rcu_read_unlock();
if (!start || start - index > max_pages)
return;
ra->start = start;
ra->size = start - index; /* old async_size */
ra->size += req_size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
/*
* oversize read
*/
if (req_size > max_pages)
goto initial_readahead;
/*
* sequential cache miss
* trivial case: (index - prev_index) == 1
* unaligned reads: (index - prev_index) == 0
*/
prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
if (index - prev_index <= 1UL)
goto initial_readahead;
/*
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
if (try_context_readahead(ractl->mapping, ra, index, req_size,
max_pages))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
do_page_cache_ra(ractl, req_size, 0);
return;
initial_readahead:
ra->start = index; ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
readit:
/*
* Will this read hit the readahead marker made by itself?
* If so, trigger the readahead marker hit now, and merge
* the resulted next readahead window into the current one.
* Take care of maximum IO pages as above.
*/
if (index == ra->start && ra->size == ra->async_size) {
add_pages = get_next_ra_size(ra, max_pages);
if (ra->size + add_pages <= max_pages) { ra->async_size = add_pages;
ra->size += add_pages;
} else {
ra->size = max_pages;
ra->async_size = max_pages >> 1;
}
}
ractl->_index = ra->start;
do_page_cache_ra(ractl, ra->size, ra->async_size);
}
void page_cache_sync_ra(struct readahead_control *ractl,
unsigned long req_count)
{
bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
/*
* Even if read-ahead is disabled, issue this request as read-ahead
* as we'll need it to satisfy the requested range. The forced
* read-ahead will do the right thing and limit the read to just the
* requested range, which we'll set to 1 page for this case.
*/
if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
if (!ractl->file)
return;
req_count = 1;
do_forced_ra = true;
}
/* be dumb */
if (do_forced_ra) {
force_page_cache_ra(ractl, req_count);
return;
}
/* do read-ahead */
ondemand_readahead(ractl, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
void page_cache_async_ra(struct readahead_control *ractl,
struct page *page, unsigned long req_count)
{
/* no read-ahead */
if (!ractl->ra->ra_pages)
return;
/*
* Same bit is used for PG_readahead and PG_reclaim.
*/
if (PageWriteback(page))
return;
ClearPageReadahead(page);
/*
* Defer asynchronous read-ahead on IO congestion.
*/
if (inode_read_congested(ractl->mapping->host))
return;
if (blk_cgroup_congested())
return;
/* do read-ahead */
ondemand_readahead(ractl, true, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
ssize_t ret;
struct fd f;
ret = -EBADF;
f = fdget(fd);
if (!f.file || !(f.file->f_mode & FMODE_READ))
goto out;
/*
* The readahead() syscall is intended to run only on files
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
ret = -EINVAL;
if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
!S_ISREG(file_inode(f.file)->i_mode))
goto out;
ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
out:
fdput(f);
return ret;
}
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
return ksys_readahead(fd, offset, count);
}
/**
* readahead_expand - Expand a readahead request
* @ractl: The request to be expanded
* @new_start: The revised start
* @new_len: The revised size of the request
*
* Attempt to expand a readahead request outwards from the current size to the
* specified size by inserting locked pages before and after the current window
* to increase the size to the new window. This may involve the insertion of
* THPs, in which case the window may get expanded even beyond what was
* requested.
*
* The algorithm will stop if it encounters a conflicting page already in the
* pagecache and leave a smaller expansion than requested.
*
* The caller must check for this by examining the revised @ractl object for a
* different expansion than was requested.
*/
void readahead_expand(struct readahead_control *ractl,
loff_t new_start, size_t new_len)
{
struct address_space *mapping = ractl->mapping;
struct file_ra_state *ra = ractl->ra;
pgoff_t new_index, new_nr_pages;
gfp_t gfp_mask = readahead_gfp_mask(mapping);
new_index = new_start / PAGE_SIZE;
/* Expand the leading edge downwards */
while (ractl->_index > new_index) {
unsigned long index = ractl->_index - 1;
struct page *page = xa_load(&mapping->i_pages, index);
if (page && !xa_is_value(page))
return; /* Page apparently present */
page = __page_cache_alloc(gfp_mask);
if (!page)
return;
if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
put_page(page);
return;
}
ractl->_nr_pages++;
ractl->_index = page->index;
}
new_len += new_start - readahead_pos(ractl);
new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);
/* Expand the trailing edge upwards */
while (ractl->_nr_pages < new_nr_pages) {
unsigned long index = ractl->_index + ractl->_nr_pages;
struct page *page = xa_load(&mapping->i_pages, index);
if (page && !xa_is_value(page))
return; /* Page apparently present */
page = __page_cache_alloc(gfp_mask);
if (!page)
return;
if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
put_page(page);
return;
}
ractl->_nr_pages++;
if (ra) {
ra->size++;
ra->async_size++;
}
}
}
EXPORT_SYMBOL(readahead_expand);
// SPDX-License-Identifier: GPL-2.0
/*
* property.c - Unified device property interface.
*
* Copyright (C) 2014, Intel Corporation
* Authors: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
* Mika Westerberg <mika.westerberg@linux.intel.com>
*/
#include <linux/acpi.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/of_graph.h>
#include <linux/of_irq.h>
#include <linux/property.h>
#include <linux/etherdevice.h>
#include <linux/phy.h>
struct fwnode_handle *dev_fwnode(struct device *dev)
{
return IS_ENABLED(CONFIG_OF) && dev->of_node ?
of_fwnode_handle(dev->of_node) : dev->fwnode;
}
EXPORT_SYMBOL_GPL(dev_fwnode);
/**
* device_property_present - check if a property of a device is present
* @dev: Device whose property is being checked
* @propname: Name of the property
*
* Check if property @propname is present in the device firmware description.
*/
bool device_property_present(struct device *dev, const char *propname)
{
return fwnode_property_present(dev_fwnode(dev), propname);
}
EXPORT_SYMBOL_GPL(device_property_present);
/**
* fwnode_property_present - check if a property of a firmware node is present
* @fwnode: Firmware node whose property to check
* @propname: Name of the property
*/
bool fwnode_property_present(const struct fwnode_handle *fwnode,
const char *propname)
{
bool ret;
ret = fwnode_call_bool_op(fwnode, property_present, propname);
if (ret == false && !IS_ERR_OR_NULL(fwnode) &&
!IS_ERR_OR_NULL(fwnode->secondary))
ret = fwnode_call_bool_op(fwnode->secondary, property_present,
propname);
return ret;
}
EXPORT_SYMBOL_GPL(fwnode_property_present);
/**
* device_property_read_u8_array - return a u8 array property of a device
* @dev: Device to get the property of
* @propname: Name of the property
* @val: The values are stored here or %NULL to return the number of values
* @nval: Size of the @val array
*
* Function reads an array of u8 properties with @propname from the device
* firmware description and stores them to @val if found.
*
* Return: number of values if @val was %NULL,
* %0 if the property was found (success),
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO if the property is not an array of numbers,
* %-EOVERFLOW if the size of the property is not as expected.
* %-ENXIO if no suitable firmware interface is present.
*/
int device_property_read_u8_array(struct device *dev, const char *propname,
u8 *val, size_t nval)
{
return fwnode_property_read_u8_array(dev_fwnode(dev), propname, val, nval);
}
EXPORT_SYMBOL_GPL(device_property_read_u8_array);
/**
* device_property_read_u16_array - return a u16 array property of a device
* @dev: Device to get the property of
* @propname: Name of the property
* @val: The values are stored here or %NULL to return the number of values
* @nval: Size of the @val array
*
* Function reads an array of u16 properties with @propname from the device
* firmware description and stores them to @val if found.
*
* Return: number of values if @val was %NULL,
* %0 if the property was found (success),
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO if the property is not an array of numbers,
* %-EOVERFLOW if the size of the property is not as expected.
* %-ENXIO if no suitable firmware interface is present.
*/
int device_property_read_u16_array(struct device *dev, const char *propname,
u16 *val, size_t nval)
{
return fwnode_property_read_u16_array(dev_fwnode(dev), propname, val, nval);
}
EXPORT_SYMBOL_GPL(device_property_read_u16_array);
/**
* device_property_read_u32_array - return a u32 array property of a device
* @dev: Device to get the property of
* @propname: Name of the property
* @val: The values are stored here or %NULL to return the number of values
* @nval: Size of the @val array
*
* Function reads an array of u32 properties with @propname from the device
* firmware description and stores them to @val if found.
*
* Return: number of values if @val was %NULL,
* %0 if the property was found (success),
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO if the property is not an array of numbers,
* %-EOVERFLOW if the size of the property is not as expected.
* %-ENXIO if no suitable firmware interface is present.
*/
int device_property_read_u32_array(struct device *dev, const char *propname,
u32 *val, size_t nval)
{
return fwnode_property_read_u32_array(dev_fwnode(dev), propname, val, nval);
}
EXPORT_SYMBOL_GPL(device_property_read_u32_array);
/**
* device_property_read_u64_array - return a u64 array property of a device
* @dev: Device to get the property of
* @propname: Name of the property
* @val: The values are stored here or %NULL to return the number of values
* @nval: Size of the @val array
*
* Function reads an array of u64 properties with @propname from the device
* firmware description and stores them to @val if found.
*
* Return: number of values if @val was %NULL,
* %0 if the property was found (success),
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO if the property is not an array of numbers,
* %-EOVERFLOW if the size of the property is not as expected.
* %-ENXIO if no suitable firmware interface is present.
*/
int device_property_read_u64_array(struct device *dev, const char *propname,
u64 *val, size_t nval)
{
return fwnode_property_read_u64_array(dev_fwnode(dev), propname, val, nval);
}
EXPORT_SYMBOL_GPL(device_property_read_u64_array);
/**
* device_property_read_string_array - return a string array property of device
* @dev: Device to get the property of
* @propname: Name of the property
* @val: The values are stored here or %NULL to return the number of values
* @nval: Size of the @val array
*
* Function reads an array of string properties with @propname from the device
* firmware description and stores them to @val if found.
*
* Return: number of values read on success if @val is non-NULL,
* number of values available on success if @val is NULL,
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO or %-EILSEQ if the property is not an array of strings,
* %-EOVERFLOW if the size of the property is not as expected.
* %-ENXIO if no suitable firmware interface is present.
*/
int device_property_read_string_array(struct device *dev, const char *propname,
const char **val, size_t nval)
{
return fwnode_property_read_string_array(dev_fwnode(dev), propname, val, nval);
}
EXPORT_SYMBOL_GPL(device_property_read_string_array);
/**
* device_property_read_string - return a string property of a device
* @dev: Device to get the property of
* @propname: Name of the property
* @val: The value is stored here
*
* Function reads property @propname from the device firmware description and
* stores the value into @val if found. The value is checked to be a string.
*
* Return: %0 if the property was found (success),
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO or %-EILSEQ if the property type is not a string.
* %-ENXIO if no suitable firmware interface is present.
*/
int device_property_read_string(struct device *dev, const char *propname,
const char **val)
{
return fwnode_property_read_string(dev_fwnode(dev), propname, val);
}
EXPORT_SYMBOL_GPL(device_property_read_string);
/**
* device_property_match_string - find a string in an array and return index
* @dev: Device to get the property of
* @propname: Name of the property holding the array
* @string: String to look for
*
* Find a given string in a string array and if it is found return the
* index back.
*
* Return: %0 if the property was found (success),
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO if the property is not an array of strings,
* %-ENXIO if no suitable firmware interface is present.
*/
int device_property_match_string(struct device *dev, const char *propname,
const char *string)
{
return fwnode_property_match_string(dev_fwnode(dev), propname, string);
}
EXPORT_SYMBOL_GPL(device_property_match_string);
static int fwnode_property_read_int_array(const struct fwnode_handle *fwnode,
const char *propname,
unsigned int elem_size, void *val,
size_t nval)
{
int ret;
ret = fwnode_call_int_op(fwnode, property_read_int_array, propname,
elem_size, val, nval);
if (ret == -EINVAL && !IS_ERR_OR_NULL(fwnode) &&
!IS_ERR_OR_NULL(fwnode->secondary))
ret = fwnode_call_int_op(
fwnode->secondary, property_read_int_array, propname,
elem_size, val, nval);
return ret;
}
/**
* fwnode_property_read_u8_array - return a u8 array property of firmware node
* @fwnode: Firmware node to get the property of
* @propname: Name of the property
* @val: The values are stored here or %NULL to return the number of values
* @nval: Size of the @val array
*
* Read an array of u8 properties with @propname from @fwnode and stores them to
* @val if found.
*
* Return: number of values if @val was %NULL,
* %0 if the property was found (success),
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO if the property is not an array of numbers,
* %-EOVERFLOW if the size of the property is not as expected,
* %-ENXIO if no suitable firmware interface is present.
*/
int fwnode_property_read_u8_array(const struct fwnode_handle *fwnode,
const char *propname, u8 *val, size_t nval)
{
return fwnode_property_read_int_array(fwnode, propname, sizeof(u8),
val, nval);
}
EXPORT_SYMBOL_GPL(fwnode_property_read_u8_array);
/**
* fwnode_property_read_u16_array - return a u16 array property of firmware node
* @fwnode: Firmware node to get the property of
* @propname: Name of the property
* @val: The values are stored here or %NULL to return the number of values
* @nval: Size of the @val array
*
* Read an array of u16 properties with @propname from @fwnode and store them to
* @val if found.
*
* Return: number of values if @val was %NULL,
* %0 if the property was found (success),
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO if the property is not an array of numbers,
* %-EOVERFLOW if the size of the property is not as expected,
* %-ENXIO if no suitable firmware interface is present.
*/
int fwnode_property_read_u16_array(const struct fwnode_handle *fwnode,
const char *propname, u16 *val, size_t nval)
{
return fwnode_property_read_int_array(fwnode, propname, sizeof(u16),
val, nval);
}
EXPORT_SYMBOL_GPL(fwnode_property_read_u16_array);
/**
* fwnode_property_read_u32_array - return a u32 array property of firmware node
* @fwnode: Firmware node to get the property of
* @propname: Name of the property
* @val: The values are stored here or %NULL to return the number of values
* @nval: Size of the @val array
*
* Read an array of u32 properties with @propname from @fwnode store them to
* @val if found.
*
* Return: number of values if @val was %NULL,
* %0 if the property was found (success),
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO if the property is not an array of numbers,
* %-EOVERFLOW if the size of the property is not as expected,
* %-ENXIO if no suitable firmware interface is present.
*/
int fwnode_property_read_u32_array(const struct fwnode_handle *fwnode,
const char *propname, u32 *val, size_t nval)
{
return fwnode_property_read_int_array(fwnode, propname, sizeof(u32),
val, nval);
}
EXPORT_SYMBOL_GPL(fwnode_property_read_u32_array);
/**
* fwnode_property_read_u64_array - return a u64 array property firmware node
* @fwnode: Firmware node to get the property of
* @propname: Name of the property
* @val: The values are stored here or %NULL to return the number of values
* @nval: Size of the @val array
*
* Read an array of u64 properties with @propname from @fwnode and store them to
* @val if found.
*
* Return: number of values if @val was %NULL,
* %0 if the property was found (success),
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO if the property is not an array of numbers,
* %-EOVERFLOW if the size of the property is not as expected,
* %-ENXIO if no suitable firmware interface is present.
*/
int fwnode_property_read_u64_array(const struct fwnode_handle *fwnode,
const char *propname, u64 *val, size_t nval)
{
return fwnode_property_read_int_array(fwnode, propname, sizeof(u64),
val, nval);
}
EXPORT_SYMBOL_GPL(fwnode_property_read_u64_array);
/**
* fwnode_property_read_string_array - return string array property of a node
* @fwnode: Firmware node to get the property of
* @propname: Name of the property
* @val: The values are stored here or %NULL to return the number of values
* @nval: Size of the @val array
*
* Read an string list property @propname from the given firmware node and store
* them to @val if found.
*
* Return: number of values read on success if @val is non-NULL,
* number of values available on success if @val is NULL,
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO or %-EILSEQ if the property is not an array of strings,
* %-EOVERFLOW if the size of the property is not as expected,
* %-ENXIO if no suitable firmware interface is present.
*/
int fwnode_property_read_string_array(const struct fwnode_handle *fwnode,
const char *propname, const char **val,
size_t nval)
{
int ret;
ret = fwnode_call_int_op(fwnode, property_read_string_array, propname,
val, nval);
if (ret == -EINVAL && !IS_ERR_OR_NULL(fwnode) &&
!IS_ERR_OR_NULL(fwnode->secondary))
ret = fwnode_call_int_op(fwnode->secondary,
property_read_string_array, propname,
val, nval);
return ret;
}
EXPORT_SYMBOL_GPL(fwnode_property_read_string_array);
/**
* fwnode_property_read_string - return a string property of a firmware node
* @fwnode: Firmware node to get the property of
* @propname: Name of the property
* @val: The value is stored here
*
* Read property @propname from the given firmware node and store the value into
* @val if found. The value is checked to be a string.
*
* Return: %0 if the property was found (success),
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO or %-EILSEQ if the property is not a string,
* %-ENXIO if no suitable firmware interface is present.
*/
int fwnode_property_read_string(const struct fwnode_handle *fwnode,
const char *propname, const char **val)
{
int ret = fwnode_property_read_string_array(fwnode, propname, val, 1);
return ret < 0 ? ret : 0;
}
EXPORT_SYMBOL_GPL(fwnode_property_read_string);
/**
* fwnode_property_match_string - find a string in an array and return index
* @fwnode: Firmware node to get the property of
* @propname: Name of the property holding the array
* @string: String to look for
*
* Find a given string in a string array and if it is found return the
* index back.
*
* Return: %0 if the property was found (success),
* %-EINVAL if given arguments are not valid,
* %-ENODATA if the property does not have a value,
* %-EPROTO if the property is not an array of strings,
* %-ENXIO if no suitable firmware interface is present.
*/
int fwnode_property_match_string(const struct fwnode_handle *fwnode,
const char *propname, const char *string)
{
const char **values;
int nval, ret;
nval = fwnode_property_read_string_array(fwnode, propname, NULL, 0);
if (nval < 0)
return nval;
if (nval == 0)
return -ENODATA;
values = kcalloc(nval, sizeof(*values), GFP_KERNEL);
if (!values)
return -ENOMEM;
ret = fwnode_property_read_string_array(fwnode, propname, values, nval);
if (ret < 0)
goto out;
ret = match_string(values, nval, string);
if (ret < 0)
ret = -ENODATA;
out:
kfree(values);
return ret;
}
EXPORT_SYMBOL_GPL(fwnode_property_match_string);
/**
* fwnode_property_get_reference_args() - Find a reference with arguments
* @fwnode: Firmware node where to look for the reference
* @prop: The name of the property
* @nargs_prop: The name of the property telling the number of
* arguments in the referred node. NULL if @nargs is known,
* otherwise @nargs is ignored. Only relevant on OF.
* @nargs: Number of arguments. Ignored if @nargs_prop is non-NULL.
* @index: Index of the reference, from zero onwards.
* @args: Result structure with reference and integer arguments.
*
* Obtain a reference based on a named property in an fwnode, with
* integer arguments.
*
* Caller is responsible to call fwnode_handle_put() on the returned
* args->fwnode pointer.
*
* Returns: %0 on success
* %-ENOENT when the index is out of bounds, the index has an empty
* reference or the property was not found
* %-EINVAL on parse error
*/
int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode,
const char *prop, const char *nargs_prop,
unsigned int nargs, unsigned int index,
struct fwnode_reference_args *args)
{
return fwnode_call_int_op(fwnode, get_reference_args, prop, nargs_prop,
nargs, index, args);
}
EXPORT_SYMBOL_GPL(fwnode_property_get_reference_args);
/**
* fwnode_find_reference - Find named reference to a fwnode_handle
* @fwnode: Firmware node where to look for the reference
* @name: The name of the reference
* @index: Index of the reference
*
* @index can be used when the named reference holds a table of references.
*
* Returns pointer to the reference fwnode, or ERR_PTR. Caller is responsible to
* call fwnode_handle_put() on the returned fwnode pointer.
*/
struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode,
const char *name,
unsigned int index)
{
struct fwnode_reference_args args;
int ret;
ret = fwnode_property_get_reference_args(fwnode, name, NULL, 0, index,
&args);
return ret ? ERR_PTR(ret) : args.fwnode;
}
EXPORT_SYMBOL_GPL(fwnode_find_reference);
/**
* device_remove_properties - Remove properties from a device object.
* @dev: Device whose properties to remove.
*
* The function removes properties previously associated to the device
* firmware node with device_add_properties(). Memory allocated to the
* properties will also be released.
*/
void device_remove_properties(struct device *dev)
{
struct fwnode_handle *fwnode = dev_fwnode(dev);
if (!fwnode)
return;
if (is_software_node(fwnode->secondary)) { fwnode_remove_software_node(fwnode->secondary);
set_secondary_fwnode(dev, NULL);
}
}
EXPORT_SYMBOL_GPL(device_remove_properties);
/**
* device_add_properties - Add a collection of properties to a device object.
* @dev: Device to add properties to.
* @properties: Collection of properties to add.
*
* Associate a collection of device properties represented by @properties with
* @dev. The function takes a copy of @properties.
*
* WARNING: The callers should not use this function if it is known that there
* is no real firmware node associated with @dev! In that case the callers
* should create a software node and assign it to @dev directly.
*/
int device_add_properties(struct device *dev,
const struct property_entry *properties)
{
struct fwnode_handle *fwnode;
fwnode = fwnode_create_software_node(properties, NULL);
if (IS_ERR(fwnode))
return PTR_ERR(fwnode);
set_secondary_fwnode(dev, fwnode);
return 0;
}
EXPORT_SYMBOL_GPL(device_add_properties);
/**
* fwnode_get_name - Return the name of a node
* @fwnode: The firmware node
*
* Returns a pointer to the node name.
*/
const char *fwnode_get_name(const struct fwnode_handle *fwnode)
{
return fwnode_call_ptr_op(fwnode, get_name);
}
EXPORT_SYMBOL_GPL(fwnode_get_name);
/**
* fwnode_get_name_prefix - Return the prefix of node for printing purposes
* @fwnode: The firmware node
*
* Returns the prefix of a node, intended to be printed right before the node.
* The prefix works also as a separator between the nodes.
*/
const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode)
{
return fwnode_call_ptr_op(fwnode, get_name_prefix);
}
/**
* fwnode_get_parent - Return parent firwmare node
* @fwnode: Firmware whose parent is retrieved
*
* Return parent firmware node of the given node if possible or %NULL if no
* parent was available.
*/
struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode)
{
return fwnode_call_ptr_op(fwnode, get_parent);
}
EXPORT_SYMBOL_GPL(fwnode_get_parent);
/**
* fwnode_get_next_parent - Iterate to the node's parent
* @fwnode: Firmware whose parent is retrieved
*
* This is like fwnode_get_parent() except that it drops the refcount
* on the passed node, making it suitable for iterating through a
* node's parents.
*
* Returns a node pointer with refcount incremented, use
* fwnode_handle_node() on it when done.
*/
struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode)
{
struct fwnode_handle *parent = fwnode_get_parent(fwnode);
fwnode_handle_put(fwnode);
return parent;
}
EXPORT_SYMBOL_GPL(fwnode_get_next_parent);
/**
* fwnode_get_next_parent_dev - Find device of closest ancestor fwnode
* @fwnode: firmware node
*
* Given a firmware node (@fwnode), this function finds its closest ancestor
* firmware node that has a corresponding struct device and returns that struct
* device.
*
* The caller of this function is expected to call put_device() on the returned
* device when they are done.
*/
struct device *fwnode_get_next_parent_dev(struct fwnode_handle *fwnode)
{
struct device *dev;
fwnode_handle_get(fwnode);
do {
fwnode = fwnode_get_next_parent(fwnode);
if (!fwnode)
return NULL;
dev = get_dev_from_fwnode(fwnode);
} while (!dev);
fwnode_handle_put(fwnode);
return dev;
}
/**
* fwnode_count_parents - Return the number of parents a node has
* @fwnode: The node the parents of which are to be counted
*
* Returns the number of parents a node has.
*/
unsigned int fwnode_count_parents(const struct fwnode_handle *fwnode)
{
struct fwnode_handle *__fwnode;
unsigned int count;
__fwnode = fwnode_get_parent(fwnode);
for (count = 0; __fwnode; count++)
__fwnode = fwnode_get_next_parent(__fwnode);
return count;
}
EXPORT_SYMBOL_GPL(fwnode_count_parents);
/**
* fwnode_get_nth_parent - Return an nth parent of a node
* @fwnode: The node the parent of which is requested
* @depth: Distance of the parent from the node
*
* Returns the nth parent of a node. If there is no parent at the requested
* @depth, %NULL is returned. If @depth is 0, the functionality is equivalent to
* fwnode_handle_get(). For @depth == 1, it is fwnode_get_parent() and so on.
*
* The caller is responsible for calling fwnode_handle_put() for the returned
* node.
*/
struct fwnode_handle *fwnode_get_nth_parent(struct fwnode_handle *fwnode,
unsigned int depth)
{
unsigned int i;
fwnode_handle_get(fwnode);
for (i = 0; i < depth && fwnode; i++)
fwnode = fwnode_get_next_parent(fwnode);
return fwnode;
}
EXPORT_SYMBOL_GPL(fwnode_get_nth_parent);
/**
* fwnode_is_ancestor_of - Test if @test_ancestor is ancestor of @test_child
* @test_ancestor: Firmware which is tested for being an ancestor
* @test_child: Firmware which is tested for being the child
*
* A node is considered an ancestor of itself too.
*
* Returns true if @test_ancestor is an ancestor of @test_child.
* Otherwise, returns false.
*/
bool fwnode_is_ancestor_of(struct fwnode_handle *test_ancestor,
struct fwnode_handle *test_child)
{
if (!test_ancestor)
return false;
fwnode_handle_get(test_child);
while (test_child) {
if (test_child == test_ancestor) {
fwnode_handle_put(test_child);
return true;
}
test_child = fwnode_get_next_parent(test_child);
}
return false;
}
/**
* fwnode_get_next_child_node - Return the next child node handle for a node
* @fwnode: Firmware node to find the next child node for.
* @child: Handle to one of the node's child nodes or a %NULL handle.
*/
struct fwnode_handle *
fwnode_get_next_child_node(const struct fwnode_handle *fwnode,
struct fwnode_handle *child)
{
return fwnode_call_ptr_op(fwnode, get_next_child_node, child);
}
EXPORT_SYMBOL_GPL(fwnode_get_next_child_node);
/**
* fwnode_get_next_available_child_node - Return the next
* available child node handle for a node
* @fwnode: Firmware node to find the next child node for.
* @child: Handle to one of the node's child nodes or a %NULL handle.
*/
struct fwnode_handle *
fwnode_get_next_available_child_node(const struct fwnode_handle *fwnode,
struct fwnode_handle *child)
{
struct fwnode_handle *next_child = child;
if (!fwnode)
return NULL;
do {
next_child = fwnode_get_next_child_node(fwnode, next_child);
if (!next_child)
return NULL;
} while (!fwnode_device_is_available(next_child));
return next_child;
}
EXPORT_SYMBOL_GPL(fwnode_get_next_available_child_node);
/**
* device_get_next_child_node - Return the next child node handle for a device
* @dev: Device to find the next child node for.
* @child: Handle to one of the device's child nodes or a null handle.
*/
struct fwnode_handle *device_get_next_child_node(struct device *dev,
struct fwnode_handle *child)
{
const struct fwnode_handle *fwnode = dev_fwnode(dev);
struct fwnode_handle *next;
/* Try to find a child in primary fwnode */
next = fwnode_get_next_child_node(fwnode, child);
if (next)
return next;
/* When no more children in primary, continue with secondary */
if (fwnode && !IS_ERR_OR_NULL(fwnode->secondary))
next = fwnode_get_next_child_node(fwnode->secondary, child);
return next;
}
EXPORT_SYMBOL_GPL(device_get_next_child_node);
/**
* fwnode_get_named_child_node - Return first matching named child node handle
* @fwnode: Firmware node to find the named child node for.
* @childname: String to match child node name against.
*/
struct fwnode_handle *
fwnode_get_named_child_node(const struct fwnode_handle *fwnode,
const char *childname)
{
return fwnode_call_ptr_op(fwnode, get_named_child_node, childname);
}
EXPORT_SYMBOL_GPL(fwnode_get_named_child_node);
/**
* device_get_named_child_node - Return first matching named child node handle
* @dev: Device to find the named child node for.
* @childname: String to match child node name against.
*/
struct fwnode_handle *device_get_named_child_node(struct device *dev,
const char *childname)
{
return fwnode_get_named_child_node(dev_fwnode(dev), childname);
}
EXPORT_SYMBOL_GPL(device_get_named_child_node);
/**
* fwnode_handle_get - Obtain a reference to a device node
* @fwnode: Pointer to the device node to obtain the reference to.
*
* Returns the fwnode handle.
*/
struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode)
{
if (!fwnode_has_op(fwnode, get))
return fwnode;
return fwnode_call_ptr_op(fwnode, get);
}
EXPORT_SYMBOL_GPL(fwnode_handle_get);
/**
* fwnode_handle_put - Drop reference to a device node
* @fwnode: Pointer to the device node to drop the reference to.
*
* This has to be used when terminating device_for_each_child_node() iteration
* with break or return to prevent stale device node references from being left
* behind.
*/
void fwnode_handle_put(struct fwnode_handle *fwnode)
{
fwnode_call_void_op(fwnode, put);
}
EXPORT_SYMBOL_GPL(fwnode_handle_put);
/**
* fwnode_device_is_available - check if a device is available for use
* @fwnode: Pointer to the fwnode of the device.
*
* For fwnode node types that don't implement the .device_is_available()
* operation, this function returns true.
*/
bool fwnode_device_is_available(const struct fwnode_handle *fwnode)
{
if (!fwnode_has_op(fwnode, device_is_available))
return true;
return fwnode_call_bool_op(fwnode, device_is_available);
}
EXPORT_SYMBOL_GPL(fwnode_device_is_available);
/**
* device_get_child_node_count - return the number of child nodes for device
* @dev: Device to cound the child nodes for
*/
unsigned int device_get_child_node_count(struct device *dev)
{
struct fwnode_handle *child;
unsigned int count = 0;
device_for_each_child_node(dev, child)
count++;
return count;
}
EXPORT_SYMBOL_GPL(device_get_child_node_count);
bool device_dma_supported(struct device *dev)
{
const struct fwnode_handle *fwnode = dev_fwnode(dev);
/* For DT, this is always supported.
* For ACPI, this depends on CCA, which
* is determined by the acpi_dma_supported().
*/
if (is_of_node(fwnode))
return true;
return acpi_dma_supported(to_acpi_device_node(fwnode));
}
EXPORT_SYMBOL_GPL(device_dma_supported);
enum dev_dma_attr device_get_dma_attr(struct device *dev)
{
const struct fwnode_handle *fwnode = dev_fwnode(dev);
enum dev_dma_attr attr = DEV_DMA_NOT_SUPPORTED;
if (is_of_node(fwnode)) {
if (of_dma_is_coherent(to_of_node(fwnode)))
attr = DEV_DMA_COHERENT;
else
attr = DEV_DMA_NON_COHERENT;
} else
attr = acpi_get_dma_attr(to_acpi_device_node(fwnode));
return attr;
}
EXPORT_SYMBOL_GPL(device_get_dma_attr);
/**
* fwnode_get_phy_mode - Get phy mode for given firmware node
* @fwnode: Pointer to the given node
*
* The function gets phy interface string from property 'phy-mode' or
* 'phy-connection-type', and return its index in phy_modes table, or errno in
* error case.
*/
int fwnode_get_phy_mode(struct fwnode_handle *fwnode)
{
const char *pm;
int err, i;
err = fwnode_property_read_string(fwnode, "phy-mode", &pm);
if (err < 0)
err = fwnode_property_read_string(fwnode,
"phy-connection-type", &pm);
if (err < 0)
return err;
for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++)
if (!strcasecmp(pm, phy_modes(i)))
return i;
return -ENODEV;
}
EXPORT_SYMBOL_GPL(fwnode_get_phy_mode);
/**
* device_get_phy_mode - Get phy mode for given device
* @dev: Pointer to the given device
*
* The function gets phy interface string from property 'phy-mode' or
* 'phy-connection-type', and return its index in phy_modes table, or errno in
* error case.
*/
int device_get_phy_mode(struct device *dev)
{
return fwnode_get_phy_mode(dev_fwnode(dev));
}
EXPORT_SYMBOL_GPL(device_get_phy_mode);
static void *fwnode_get_mac_addr(struct fwnode_handle *fwnode,
const char *name, char *addr,
int alen)
{
int ret = fwnode_property_read_u8_array(fwnode, name, addr, alen);
if (ret == 0 && alen == ETH_ALEN && is_valid_ether_addr(addr))
return addr;
return NULL;
}
/**
* fwnode_get_mac_address - Get the MAC from the firmware node
* @fwnode: Pointer to the firmware node
* @addr: Address of buffer to store the MAC in
* @alen: Length of the buffer pointed to by addr, should be ETH_ALEN
*
* Search the firmware node for the best MAC address to use. 'mac-address' is
* checked first, because that is supposed to contain to "most recent" MAC
* address. If that isn't set, then 'local-mac-address' is checked next,
* because that is the default address. If that isn't set, then the obsolete
* 'address' is checked, just in case we're using an old device tree.
*
* Note that the 'address' property is supposed to contain a virtual address of
* the register set, but some DTS files have redefined that property to be the
* MAC address.
*
* All-zero MAC addresses are rejected, because those could be properties that
* exist in the firmware tables, but were not updated by the firmware. For
* example, the DTS could define 'mac-address' and 'local-mac-address', with
* zero MAC addresses. Some older U-Boots only initialized 'local-mac-address'.
* In this case, the real MAC is in 'local-mac-address', and 'mac-address'
* exists but is all zeros.
*/
void *fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr, int alen)
{
char *res;
res = fwnode_get_mac_addr(fwnode, "mac-address", addr, alen);
if (res)
return res;
res = fwnode_get_mac_addr(fwnode, "local-mac-address", addr, alen);
if (res)
return res;
return fwnode_get_mac_addr(fwnode, "address", addr, alen);
}
EXPORT_SYMBOL(fwnode_get_mac_address);
/**
* device_get_mac_address - Get the MAC for a given device
* @dev: Pointer to the device
* @addr: Address of buffer to store the MAC in
* @alen: Length of the buffer pointed to by addr, should be ETH_ALEN
*/
void *device_get_mac_address(struct device *dev, char *addr, int alen)
{
return fwnode_get_mac_address(dev_fwnode(dev), addr, alen);
}
EXPORT_SYMBOL(device_get_mac_address);
/**
* fwnode_irq_get - Get IRQ directly from a fwnode
* @fwnode: Pointer to the firmware node
* @index: Zero-based index of the IRQ
*
* Returns Linux IRQ number on success. Other values are determined
* accordingly to acpi_/of_ irq_get() operation.
*/
int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index)
{
struct resource res;
int ret;
if (is_of_node(fwnode))
return of_irq_get(to_of_node(fwnode), index);
ret = acpi_irq_get(ACPI_HANDLE_FWNODE(fwnode), index, &res);
if (ret)
return ret;
return res.start;
}
EXPORT_SYMBOL(fwnode_irq_get);
/**
* fwnode_graph_get_next_endpoint - Get next endpoint firmware node
* @fwnode: Pointer to the parent firmware node
* @prev: Previous endpoint node or %NULL to get the first
*
* Returns an endpoint firmware node pointer or %NULL if no more endpoints
* are available.
*/
struct fwnode_handle *
fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode,
struct fwnode_handle *prev)
{
const struct fwnode_handle *parent;
struct fwnode_handle *ep;
/*
* If this function is in a loop and the previous iteration returned
* an endpoint from fwnode->secondary, then we need to use the secondary
* as parent rather than @fwnode.
*/
if (prev)
parent = fwnode_graph_get_port_parent(prev);
else
parent = fwnode;
ep = fwnode_call_ptr_op(parent, graph_get_next_endpoint, prev);
if (IS_ERR_OR_NULL(ep) &&
!IS_ERR_OR_NULL(parent) && !IS_ERR_OR_NULL(parent->secondary))
ep = fwnode_graph_get_next_endpoint(parent->secondary, NULL);
return ep;
}
EXPORT_SYMBOL_GPL(fwnode_graph_get_next_endpoint);
/**
* fwnode_graph_get_port_parent - Return the device fwnode of a port endpoint
* @endpoint: Endpoint firmware node of the port
*
* Return: the firmware node of the device the @endpoint belongs to.
*/
struct fwnode_handle *
fwnode_graph_get_port_parent(const struct fwnode_handle *endpoint)
{
struct fwnode_handle *port, *parent;
port = fwnode_get_parent(endpoint);
parent = fwnode_call_ptr_op(port, graph_get_port_parent);
fwnode_handle_put(port);
return parent;
}
EXPORT_SYMBOL_GPL(fwnode_graph_get_port_parent);
/**
* fwnode_graph_get_remote_port_parent - Return fwnode of a remote device
* @fwnode: Endpoint firmware node pointing to the remote endpoint
*
* Extracts firmware node of a remote device the @fwnode points to.
*/
struct fwnode_handle *
fwnode_graph_get_remote_port_parent(const struct fwnode_handle *fwnode)
{
struct fwnode_handle *endpoint, *parent;
endpoint = fwnode_graph_get_remote_endpoint(fwnode);
parent = fwnode_graph_get_port_parent(endpoint);
fwnode_handle_put(endpoint);
return parent;
}
EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port_parent);
/**
* fwnode_graph_get_remote_port - Return fwnode of a remote port
* @fwnode: Endpoint firmware node pointing to the remote endpoint
*
* Extracts firmware node of a remote port the @fwnode points to.
*/
struct fwnode_handle *
fwnode_graph_get_remote_port(const struct fwnode_handle *fwnode)
{
return fwnode_get_next_parent(fwnode_graph_get_remote_endpoint(fwnode));
}
EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port);
/**
* fwnode_graph_get_remote_endpoint - Return fwnode of a remote endpoint
* @fwnode: Endpoint firmware node pointing to the remote endpoint
*
* Extracts firmware node of a remote endpoint the @fwnode points to.
*/
struct fwnode_handle *
fwnode_graph_get_remote_endpoint(const struct fwnode_handle *fwnode)
{
return fwnode_call_ptr_op(fwnode, graph_get_remote_endpoint);
}
EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint);
/**
* fwnode_graph_get_remote_node - get remote parent node for given port/endpoint
* @fwnode: pointer to parent fwnode_handle containing graph port/endpoint
* @port_id: identifier of the parent port node
* @endpoint_id: identifier of the endpoint node
*
* Return: Remote fwnode handle associated with remote endpoint node linked
* to @node. Use fwnode_node_put() on it when done.
*/
struct fwnode_handle *
fwnode_graph_get_remote_node(const struct fwnode_handle *fwnode, u32 port_id,
u32 endpoint_id)
{
struct fwnode_handle *endpoint = NULL;
while ((endpoint = fwnode_graph_get_next_endpoint(fwnode, endpoint))) {
struct fwnode_endpoint fwnode_ep;
struct fwnode_handle *remote;
int ret;
ret = fwnode_graph_parse_endpoint(endpoint, &fwnode_ep);
if (ret < 0)
continue;
if (fwnode_ep.port != port_id || fwnode_ep.id != endpoint_id)
continue;
remote = fwnode_graph_get_remote_port_parent(endpoint);
if (!remote)
return NULL;
return fwnode_device_is_available(remote) ? remote : NULL;
}
return NULL;
}
EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_node);
/**
* fwnode_graph_get_endpoint_by_id - get endpoint by port and endpoint numbers
* @fwnode: parent fwnode_handle containing the graph
* @port: identifier of the port node
* @endpoint: identifier of the endpoint node under the port node
* @flags: fwnode lookup flags
*
* Return the fwnode handle of the local endpoint corresponding the port and
* endpoint IDs or NULL if not found.
*
* If FWNODE_GRAPH_ENDPOINT_NEXT is passed in @flags and the specified endpoint
* has not been found, look for the closest endpoint ID greater than the
* specified one and return the endpoint that corresponds to it, if present.
*
* Do not return endpoints that belong to disabled devices, unless
* FWNODE_GRAPH_DEVICE_DISABLED is passed in @flags.
*
* The returned endpoint needs to be released by calling fwnode_handle_put() on
* it when it is not needed any more.
*/
struct fwnode_handle *
fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode,
u32 port, u32 endpoint, unsigned long flags)
{
struct fwnode_handle *ep = NULL, *best_ep = NULL;
unsigned int best_ep_id = 0;
bool endpoint_next = flags & FWNODE_GRAPH_ENDPOINT_NEXT;
bool enabled_only = !(flags & FWNODE_GRAPH_DEVICE_DISABLED);
while ((ep = fwnode_graph_get_next_endpoint(fwnode, ep))) {
struct fwnode_endpoint fwnode_ep = { 0 };
int ret;
if (enabled_only) {
struct fwnode_handle *dev_node;
bool available;
dev_node = fwnode_graph_get_remote_port_parent(ep);
available = fwnode_device_is_available(dev_node);
fwnode_handle_put(dev_node);
if (!available)
continue;
}
ret = fwnode_graph_parse_endpoint(ep, &fwnode_ep);
if (ret < 0)
continue;
if (fwnode_ep.port != port)
continue;
if (fwnode_ep.id == endpoint)
return ep;
if (!endpoint_next)
continue;
/*
* If the endpoint that has just been found is not the first
* matching one and the ID of the one found previously is closer
* to the requested endpoint ID, skip it.
*/
if (fwnode_ep.id < endpoint ||
(best_ep && best_ep_id < fwnode_ep.id))
continue;
fwnode_handle_put(best_ep);
best_ep = fwnode_handle_get(ep);
best_ep_id = fwnode_ep.id;
}
return best_ep;
}
EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_by_id);
/**
* fwnode_graph_parse_endpoint - parse common endpoint node properties
* @fwnode: pointer to endpoint fwnode_handle
* @endpoint: pointer to the fwnode endpoint data structure
*
* Parse @fwnode representing a graph endpoint node and store the
* information in @endpoint. The caller must hold a reference to
* @fwnode.
*/
int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode,
struct fwnode_endpoint *endpoint)
{
memset(endpoint, 0, sizeof(*endpoint));
return fwnode_call_int_op(fwnode, graph_parse_endpoint, endpoint);
}
EXPORT_SYMBOL(fwnode_graph_parse_endpoint);
const void *device_get_match_data(struct device *dev)
{
return fwnode_call_ptr_op(dev_fwnode(dev), device_get_match_data, dev);
}
EXPORT_SYMBOL_GPL(device_get_match_data);
static void *
fwnode_graph_devcon_match(struct fwnode_handle *fwnode, const char *con_id,
void *data, devcon_match_fn_t match)
{
struct fwnode_handle *node;
struct fwnode_handle *ep;
void *ret;
fwnode_graph_for_each_endpoint(fwnode, ep) {
node = fwnode_graph_get_remote_port_parent(ep);
if (!fwnode_device_is_available(node)) {
fwnode_handle_put(node);
continue;
}
ret = match(node, con_id, data);
fwnode_handle_put(node);
if (ret) {
fwnode_handle_put(ep);
return ret;
}
}
return NULL;
}
static void *
fwnode_devcon_match(struct fwnode_handle *fwnode, const char *con_id,
void *data, devcon_match_fn_t match)
{
struct fwnode_handle *node;
void *ret;
int i;
for (i = 0; ; i++) {
node = fwnode_find_reference(fwnode, con_id, i);
if (IS_ERR(node))
break;
ret = match(node, NULL, data);
fwnode_handle_put(node);
if (ret)
return ret;
}
return NULL;
}
/**
* fwnode_connection_find_match - Find connection from a device node
* @fwnode: Device node with the connection
* @con_id: Identifier for the connection
* @data: Data for the match function
* @match: Function to check and convert the connection description
*
* Find a connection with unique identifier @con_id between @fwnode and another
* device node. @match will be used to convert the connection description to
* data the caller is expecting to be returned.
*/
void *fwnode_connection_find_match(struct fwnode_handle *fwnode,
const char *con_id, void *data,
devcon_match_fn_t match)
{
void *ret;
if (!fwnode || !match)
return NULL;
ret = fwnode_graph_devcon_match(fwnode, con_id, data, match);
if (ret)
return ret;
return fwnode_devcon_match(fwnode, con_id, data, match);
}
EXPORT_SYMBOL_GPL(fwnode_connection_find_match);
// SPDX-License-Identifier: GPL-2.0
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
#include "space-info.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
#include "zoned.h"
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
* is not in progress
*
* Should be called with balance_lock held
*/
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
u64 target = 0;
if (!bctl)
return 0;
if (flags & BTRFS_BLOCK_GROUP_DATA && bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; } else if (flags & BTRFS_BLOCK_GROUP_METADATA && bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
}
return target;
}
/*
* @flags: available profiles in extended format (see ctree.h)
*
* Return reduced profile in chunk format. If profile changing is in progress
* (either running or paused) picks the target profile (if it's already
* available), otherwise falls back to plain reducing.
*/
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
u64 num_devices = fs_info->fs_devices->rw_devices;
u64 target;
u64 raid_type;
u64 allowed = 0;
/*
* See if restripe for this chunk_type is in progress, if so try to
* reduce to the target profile
*/
spin_lock(&fs_info->balance_lock);
target = get_restripe_target(fs_info, flags);
if (target) {
spin_unlock(&fs_info->balance_lock);
return extended_to_chunk(target);
}
spin_unlock(&fs_info->balance_lock);
/* First, mask out the RAID levels which aren't possible */
for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { if (num_devices >= btrfs_raid_array[raid_type].devs_min) allowed |= btrfs_raid_array[raid_type].bg_flag;
}
allowed &= flags;
if (allowed & BTRFS_BLOCK_GROUP_RAID6)
allowed = BTRFS_BLOCK_GROUP_RAID6;
else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
allowed = BTRFS_BLOCK_GROUP_RAID5;
else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
allowed = BTRFS_BLOCK_GROUP_RAID10;
else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
allowed = BTRFS_BLOCK_GROUP_RAID1;
else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
allowed = BTRFS_BLOCK_GROUP_RAID0;
flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
return extended_to_chunk(flags | allowed);
}
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
unsigned seq;
u64 flags;
do {
flags = orig_flags;
seq = read_seqbegin(&fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
flags |= fs_info->avail_data_alloc_bits; else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) flags |= fs_info->avail_system_alloc_bits; else if (flags & BTRFS_BLOCK_GROUP_METADATA) flags |= fs_info->avail_metadata_alloc_bits;
} while (read_seqretry(&fs_info->profiles_lock, seq));
return btrfs_reduce_alloc_profile(fs_info, flags);
}
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
refcount_inc(&cache->refs);
}
void btrfs_put_block_group(struct btrfs_block_group *cache)
{
if (refcount_dec_and_test(&cache->refs)) { WARN_ON(cache->pinned > 0);
/*
* If there was a failure to cleanup a log tree, very likely due
* to an IO failure on a writeback attempt of one or more of its
* extent buffers, we could not do proper (and cheap) unaccounting
* of their reserved space, so don't warn on reserved > 0 in that
* case.
*/
if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) || !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info)) WARN_ON(cache->reserved > 0);
/*
* A block_group shouldn't be on the discard_list anymore.
* Remove the block_group from the discard_list to prevent us
* from causing a panic due to NULL pointer dereference.
*/
if (WARN_ON(!list_empty(&cache->discard_list)))
btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
cache);
/*
* If not empty, someone is still holding mutex of
* full_stripe_lock, which can only be released by caller.
* And it will definitely cause use-after-free when caller
* tries to release full stripe lock.
*
* No better way to resolve, but only to warn.
*/
WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); kfree(cache->free_space_ctl);
kfree(cache);
}
}
/*
* This adds the block group to the fs_info rb tree for the block group cache
*/
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
struct btrfs_block_group *block_group)
{
struct rb_node **p;
struct rb_node *parent = NULL;
struct btrfs_block_group *cache;
ASSERT(block_group->length != 0);
spin_lock(&info->block_group_cache_lock);
p = &info->block_group_cache_tree.rb_node;
while (*p) {
parent = *p;
cache = rb_entry(parent, struct btrfs_block_group, cache_node);
if (block_group->start < cache->start) { p = &(*p)->rb_left; } else if (block_group->start > cache->start) { p = &(*p)->rb_right;
} else {
spin_unlock(&info->block_group_cache_lock);
return -EEXIST;
}
}
rb_link_node(&block_group->cache_node, parent, p);
rb_insert_color(&block_group->cache_node,
&info->block_group_cache_tree);
if (info->first_logical_byte > block_group->start)
info->first_logical_byte = block_group->start;
spin_unlock(&info->block_group_cache_lock);
return 0;
}
/*
* This will return the block group at or after bytenr if contains is 0, else
* it will return the block group that contains the bytenr
*/
static struct btrfs_block_group *block_group_cache_tree_search(
struct btrfs_fs_info *info, u64 bytenr, int contains)
{
struct btrfs_block_group *cache, *ret = NULL;
struct rb_node *n;
u64 end, start;
spin_lock(&info->block_group_cache_lock);
n = info->block_group_cache_tree.rb_node;
while (n) { cache = rb_entry(n, struct btrfs_block_group, cache_node); end = cache->start + cache->length - 1;
start = cache->start;
if (bytenr < start) {
if (!contains && (!ret || start < ret->start))
ret = cache;
n = n->rb_left; } else if (bytenr > start) {
if (contains && bytenr <= end) {
ret = cache;
break;
}
n = n->rb_right;
} else {
ret = cache;
break;
}
}
if (ret) {
btrfs_get_block_group(ret);
if (bytenr == 0 && info->first_logical_byte > ret->start) info->first_logical_byte = ret->start;
}
spin_unlock(&info->block_group_cache_lock);
return ret;
}
/*
* Return the block group that starts at or after bytenr
*/
struct btrfs_block_group *btrfs_lookup_first_block_group(
struct btrfs_fs_info *info, u64 bytenr)
{
return block_group_cache_tree_search(info, bytenr, 0);
}
/*
* Return the block group that contains the given bytenr
*/
struct btrfs_block_group *btrfs_lookup_block_group(
struct btrfs_fs_info *info, u64 bytenr)
{
return block_group_cache_tree_search(info, bytenr, 1);
}
struct btrfs_block_group *btrfs_next_block_group(
struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct rb_node *node;
spin_lock(&fs_info->block_group_cache_lock);
/* If our block group was removed, we need a full search. */
if (RB_EMPTY_NODE(&cache->cache_node)) {
const u64 next_bytenr = cache->start + cache->length;
spin_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
}
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
if (node) {
cache = rb_entry(node, struct btrfs_block_group, cache_node);
btrfs_get_block_group(cache);
} else
cache = NULL;
spin_unlock(&fs_info->block_group_cache_lock);
return cache;
}
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct btrfs_block_group *bg;
bool ret = true;
bg = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg)
return false;
spin_lock(&bg->lock);
if (bg->ro)
ret = false;
else
atomic_inc(&bg->nocow_writers);
spin_unlock(&bg->lock);
/* No put on block group, done by btrfs_dec_nocow_writers */
if (!ret) btrfs_put_block_group(bg);
return ret;
}
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, bytenr);
ASSERT(bg);
if (atomic_dec_and_test(&bg->nocow_writers))
wake_up_var(&bg->nocow_writers);
/*
* Once for our lookup and once for the lookup done by a previous call
* to btrfs_inc_nocow_writers()
*/
btrfs_put_block_group(bg);
btrfs_put_block_group(bg);
}
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
{
wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start)
{
struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, start);
ASSERT(bg);
if (atomic_dec_and_test(&bg->reservations))
wake_up_var(&bg->reservations); btrfs_put_block_group(bg);
}
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
{
struct btrfs_space_info *space_info = bg->space_info;
ASSERT(bg->ro);
if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
return;
/*
* Our block group is read only but before we set it to read only,
* some task might have had allocated an extent from it already, but it
* has not yet created a respective ordered extent (and added it to a
* root's list of ordered extents).
* Therefore wait for any task currently allocating extents, since the
* block group's reservations counter is incremented while a read lock
* on the groups' semaphore is held and decremented after releasing
* the read access on that semaphore and creating the ordered extent.
*/
down_write(&space_info->groups_sem);
up_write(&space_info->groups_sem);
wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}
struct btrfs_caching_control *btrfs_get_caching_control(
struct btrfs_block_group *cache)
{
struct btrfs_caching_control *ctl;
spin_lock(&cache->lock);
if (!cache->caching_ctl) {
spin_unlock(&cache->lock);
return NULL;
}
ctl = cache->caching_ctl;
refcount_inc(&ctl->count);
spin_unlock(&cache->lock);
return ctl;
}
void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
if (refcount_dec_and_test(&ctl->count))
kfree(ctl);
}
/*
* When we wait for progress in the block group caching, its because our
* allocation attempt failed at least once. So, we must sleep and let some
* progress happen before we try again.
*
* This function will sleep at least once waiting for new free space to show
* up, and then it will check the block group free space numbers for our min
* num_bytes. Another option is to have it go ahead and look in the rbtree for
* a free extent of a given size, but this is a good start.
*
* Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
* any of the information in this block group.
*/
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes)
{
struct btrfs_caching_control *caching_ctl;
caching_ctl = btrfs_get_caching_control(cache);
if (!caching_ctl)
return;
wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
(cache->free_space_ctl->free_space >= num_bytes));
btrfs_put_caching_control(caching_ctl);
}
int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
struct btrfs_caching_control *caching_ctl;
int ret = 0;
caching_ctl = btrfs_get_caching_control(cache);
if (!caching_ctl)
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); if (cache->cached == BTRFS_CACHE_ERROR)
ret = -EIO;
btrfs_put_caching_control(caching_ctl); return ret;
}
static bool space_cache_v1_done(struct btrfs_block_group *cache)
{
bool ret;
spin_lock(&cache->lock);
ret = cache->cached != BTRFS_CACHE_FAST;
spin_unlock(&cache->lock);
return ret;
}
void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
struct btrfs_caching_control *caching_ctl)
{
wait_event(caching_ctl->wait, space_cache_v1_done(cache));}
#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
u64 start = block_group->start;
u64 len = block_group->length;
u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
fs_info->nodesize : fs_info->sectorsize;
u64 step = chunk << 1;
while (len > chunk) {
btrfs_remove_free_space(block_group, start, chunk);
start += step;
if (len < step)
len = 0;
else
len -= step;
}
}
#endif
/*
* This is only called by btrfs_cache_block_group, since we could have freed
* extents we need to check the pinned_extents for any extents that can't be
* used yet since their free space will be released as soon as the transaction
* commits.
*/
u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
{
struct btrfs_fs_info *info = block_group->fs_info;
u64 extent_start, extent_end, size, total_added = 0;
int ret;
while (start < end) { ret = find_first_extent_bit(&info->excluded_extents, start,
&extent_start, &extent_end,
EXTENT_DIRTY | EXTENT_UPTODATE,
NULL);
if (ret)
break;
if (extent_start <= start) {
start = extent_end + 1;
} else if (extent_start > start && extent_start < end) { size = extent_start - start;
total_added += size;
ret = btrfs_add_free_space_async_trimmed(block_group,
start, size);
BUG_ON(ret); /* -ENOMEM or logic error */ start = extent_end + 1;
} else {
break;
}
}
if (start < end) {
size = end - start;
total_added += size;
ret = btrfs_add_free_space_async_trimmed(block_group, start,
size);
BUG_ON(ret); /* -ENOMEM or logic error */
}
return total_added;
}
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
u64 total_found = 0;
u64 last = 0;
u32 nritems;
int ret;
bool wakeup = true;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
#ifdef CONFIG_BTRFS_DEBUG
/*
* If we're fragmenting we don't want to make anybody think we can
* allocate from this block group until we've had a chance to fragment
* the free space.
*/
if (btrfs_should_fragment_free_space(block_group))
wakeup = false;
#endif
/*
* We don't want to deadlock with somebody trying to allocate a new
* extent for the extent root while also trying to search the extent
* root to add free space. So we skip locking and search the commit
* root, since its read-only
*/
path->skip_locking = 1;
path->search_commit_root = 1;
path->reada = READA_FORWARD;
key.objectid = last;
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
while (1) {
if (btrfs_fs_closing(fs_info) > 1) {
last = (u64)-1;
break;
}
if (path->slots[0] < nritems) {
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
} else {
ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
if (ret)
break;
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
if (wakeup)
caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
cond_resched();
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
goto next;
}
ret = btrfs_next_leaf(extent_root, path);
if (ret < 0)
goto out;
if (ret)
break;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
continue;
}
if (key.objectid < last) {
key.objectid = last;
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
if (wakeup)
caching_ctl->progress = last;
btrfs_release_path(path);
goto next;
}
if (key.objectid < block_group->start) {
path->slots[0]++;
continue;
}
if (key.objectid >= block_group->start + block_group->length)
break;
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY) {
total_found += add_new_free_space(block_group, last,
key.objectid);
if (key.type == BTRFS_METADATA_ITEM_KEY)
last = key.objectid +
fs_info->nodesize;
else
last = key.objectid + key.offset;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
if (wakeup)
wake_up(&caching_ctl->wait);
}
}
path->slots[0]++;
}
ret = 0;
total_found += add_new_free_space(block_group, last,
block_group->start + block_group->length);
caching_ctl->progress = (u64)-1;
out:
btrfs_free_path(path);
return ret;
}
static noinline void caching_thread(struct btrfs_work *work)
{
struct btrfs_block_group *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_caching_control *caching_ctl;
int ret;
caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
ret = load_free_space_cache(block_group);
if (ret == 1) {
ret = 0;
goto done;
}
/*
* We failed to load the space cache, set ourselves to
* CACHE_STARTED and carry on.
*/
spin_lock(&block_group->lock);
block_group->cached = BTRFS_CACHE_STARTED;
spin_unlock(&block_group->lock);
wake_up(&caching_ctl->wait);
}
/*
* If we are in the transaction that populated the free space tree we
* can't actually cache from the free space tree as our commit root and
* real root are the same, so we could change the contents of the blocks
* while caching. Instead do the slow caching in this case, and after
* the transaction has committed we will be safe.
*/
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
ret = load_free_space_tree(caching_ctl);
else
ret = load_extent_tree_free(caching_ctl);
done:
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
#ifdef CONFIG_BTRFS_DEBUG
if (btrfs_should_fragment_free_space(block_group)) {
u64 bytes_used;
spin_lock(&block_group->space_info->lock);
spin_lock(&block_group->lock);
bytes_used = block_group->length - block_group->used;
block_group->space_info->bytes_used += bytes_used >> 1;
spin_unlock(&block_group->lock);
spin_unlock(&block_group->space_info->lock);
fragment_free_space(block_group);
}
#endif
caching_ctl->progress = (u64)-1;
up_read(&fs_info->commit_root_sem);
btrfs_free_excluded_extents(block_group);
mutex_unlock(&caching_ctl->mutex);
wake_up(&caching_ctl->wait);
btrfs_put_caching_control(caching_ctl);
btrfs_put_block_group(block_group);
}
int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
{
DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl = NULL;
int ret = 0;
/* Allocator for zoned filesystems does not use the cache at all */
if (btrfs_is_zoned(fs_info))
return 0;
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
if (!caching_ctl)
return -ENOMEM;
INIT_LIST_HEAD(&caching_ctl->list);
mutex_init(&caching_ctl->mutex);
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
caching_ctl->progress = cache->start;
refcount_set(&caching_ctl->count, 2);
btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
spin_lock(&cache->lock);
if (cache->cached != BTRFS_CACHE_NO) {
kfree(caching_ctl);
caching_ctl = cache->caching_ctl;
if (caching_ctl)
refcount_inc(&caching_ctl->count);
spin_unlock(&cache->lock);
goto out;
}
WARN_ON(cache->caching_ctl); cache->caching_ctl = caching_ctl;
if (btrfs_test_opt(fs_info, SPACE_CACHE))
cache->cached = BTRFS_CACHE_FAST;
else
cache->cached = BTRFS_CACHE_STARTED;
cache->has_caching_ctl = 1;
spin_unlock(&cache->lock);
spin_lock(&fs_info->block_group_cache_lock);
refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
spin_unlock(&fs_info->block_group_cache_lock);
btrfs_get_block_group(cache);
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
if (load_cache_only && caching_ctl) btrfs_wait_space_cache_v1_finished(cache, caching_ctl); if (caching_ctl) btrfs_put_caching_control(caching_ctl); return ret;
}
static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
u64 extra_flags = chunk_to_extended(flags) &
BTRFS_EXTENDED_PROFILE_MASK;
write_seqlock(&fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
fs_info->avail_data_alloc_bits &= ~extra_flags;
if (flags & BTRFS_BLOCK_GROUP_METADATA)
fs_info->avail_metadata_alloc_bits &= ~extra_flags;
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
fs_info->avail_system_alloc_bits &= ~extra_flags;
write_sequnlock(&fs_info->profiles_lock);
}
/*
* Clear incompat bits for the following feature(s):
*
* - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
* in the whole filesystem
*
* - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
*/
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
bool found_raid56 = false;
bool found_raid1c34 = false;
if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
(flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
(flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
struct list_head *head = &fs_info->space_info;
struct btrfs_space_info *sinfo;
list_for_each_entry_rcu(sinfo, head, list) {
down_read(&sinfo->groups_sem);
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
found_raid56 = true;
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
found_raid56 = true;
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
found_raid1c34 = true;
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
found_raid1c34 = true;
up_read(&sinfo->groups_sem);
}
if (!found_raid56)
btrfs_clear_fs_incompat(fs_info, RAID56);
if (!found_raid1c34)
btrfs_clear_fs_incompat(fs_info, RAID1C34);
}
}
static int remove_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root;
struct btrfs_key key;
int ret;
root = fs_info->extent_root;
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
ret = -ENOENT;
if (ret < 0)
return ret;
ret = btrfs_del_item(trans, root, path);
return ret;
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path;
struct btrfs_block_group *block_group;
struct btrfs_free_cluster *cluster;
struct inode *inode;
struct kobject *kobj = NULL;
int ret;
int index;
int factor;
struct btrfs_caching_control *caching_ctl = NULL;
bool remove_em;
bool remove_rsv = false;
block_group = btrfs_lookup_block_group(fs_info, group_start);
BUG_ON(!block_group);
BUG_ON(!block_group->ro);
trace_btrfs_remove_block_group(block_group);
/*
* Free the reserved super bytes from this block group before
* remove it.
*/
btrfs_free_excluded_extents(block_group);
btrfs_free_ref_tree_range(fs_info, block_group->start,
block_group->length);
index = btrfs_bg_flags_to_raid_index(block_group->flags);
factor = btrfs_bg_type_to_factor(block_group->flags);
/* make sure this block group isn't part of an allocation cluster */
cluster = &fs_info->data_alloc_cluster;
spin_lock(&cluster->refill_lock);
btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&cluster->refill_lock);
/*
* make sure this block group isn't part of a metadata
* allocation cluster
*/
cluster = &fs_info->meta_alloc_cluster;
spin_lock(&cluster->refill_lock);
btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&cluster->refill_lock);
btrfs_clear_treelog_bg(block_group);
btrfs_clear_data_reloc_bg(block_group);
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
/*
* get the inode first so any iput calls done for the io_list
* aren't the final iput (no unlinks allowed now)
*/
inode = lookup_free_space_inode(block_group, path);
mutex_lock(&trans->transaction->cache_write_mutex);
/*
* Make sure our free space cache IO is done before removing the
* free space inode
*/
spin_lock(&trans->transaction->dirty_bgs_lock);
if (!list_empty(&block_group->io_list)) {
list_del_init(&block_group->io_list);
WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
spin_unlock(&trans->transaction->dirty_bgs_lock);
btrfs_wait_cache_io(trans, block_group, path);
btrfs_put_block_group(block_group);
spin_lock(&trans->transaction->dirty_bgs_lock);
}
if (!list_empty(&block_group->dirty_list)) {
list_del_init(&block_group->dirty_list);
remove_rsv = true;
btrfs_put_block_group(block_group);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
mutex_unlock(&trans->transaction->cache_write_mutex);
ret = btrfs_remove_free_space_inode(trans, inode, block_group);
if (ret)
goto out;
spin_lock(&fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
&fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
/* Once for the block groups rbtree */
btrfs_put_block_group(block_group);
if (fs_info->first_logical_byte == block_group->start)
fs_info->first_logical_byte = (u64)-1;
spin_unlock(&fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
/*
* we must use list_del_init so people can check to see if they
* are still on the list after taking the semaphore
*/
list_del_init(&block_group->list);
if (list_empty(&block_group->space_info->block_groups[index])) {
kobj = block_group->space_info->block_group_kobjs[index];
block_group->space_info->block_group_kobjs[index] = NULL;
clear_avail_alloc_bits(fs_info, block_group->flags);
}
up_write(&block_group->space_info->groups_sem);
clear_incompat_bg_bits(fs_info, block_group->flags);
if (kobj) {
kobject_del(kobj);
kobject_put(kobj);
}
if (block_group->has_caching_ctl)
caching_ctl = btrfs_get_caching_control(block_group);
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
if (block_group->has_caching_ctl) {
spin_lock(&fs_info->block_group_cache_lock);
if (!caching_ctl) {
struct btrfs_caching_control *ctl;
list_for_each_entry(ctl,
&fs_info->caching_block_groups, list)
if (ctl->block_group == block_group) {
caching_ctl = ctl;
refcount_inc(&caching_ctl->count);
break;
}
}
if (caching_ctl)
list_del_init(&caching_ctl->list);
spin_unlock(&fs_info->block_group_cache_lock);
if (caching_ctl) {
/* Once for the caching bgs list and once for us. */
btrfs_put_caching_control(caching_ctl);
btrfs_put_caching_control(caching_ctl);
}
}
spin_lock(&trans->transaction->dirty_bgs_lock);
WARN_ON(!list_empty(&block_group->dirty_list));
WARN_ON(!list_empty(&block_group->io_list));
spin_unlock(&trans->transaction->dirty_bgs_lock);
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);
list_del_init(&block_group->ro_list);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
WARN_ON(block_group->space_info->total_bytes
< block_group->length);
WARN_ON(block_group->space_info->bytes_readonly
< block_group->length - block_group->zone_unusable);
WARN_ON(block_group->space_info->bytes_zone_unusable
< block_group->zone_unusable);
WARN_ON(block_group->space_info->disk_total
< block_group->length * factor);
}
block_group->space_info->total_bytes -= block_group->length;
block_group->space_info->bytes_readonly -=
(block_group->length - block_group->zone_unusable);
block_group->space_info->bytes_zone_unusable -=
block_group->zone_unusable;
block_group->space_info->disk_total -= block_group->length * factor;
spin_unlock(&block_group->space_info->lock);
/*
* Remove the free space for the block group from the free space tree
* and the block group's item from the extent tree before marking the
* block group as removed. This is to prevent races with tasks that
* freeze and unfreeze a block group, this task and another task
* allocating a new block group - the unfreeze task ends up removing
* the block group's extent map before the task calling this function
* deletes the block group item from the extent tree, allowing for
* another task to attempt to create another block group with the same
* item key (and failing with -EEXIST and a transaction abort).
*/
ret = remove_block_group_free_space(trans, block_group);
if (ret)
goto out;
ret = remove_block_group_item(trans, path, block_group);
if (ret < 0)
goto out;
spin_lock(&block_group->lock);
block_group->removed = 1;
/*
* At this point trimming or scrub can't start on this block group,
* because we removed the block group from the rbtree
* fs_info->block_group_cache_tree so no one can't find it anymore and
* even if someone already got this block group before we removed it
* from the rbtree, they have already incremented block_group->frozen -
* if they didn't, for the trimming case they won't find any free space
* entries because we already removed them all when we called
* btrfs_remove_free_space_cache().
*
* And we must not remove the extent map from the fs_info->mapping_tree
* to prevent the same logical address range and physical device space
* ranges from being reused for a new block group. This is needed to
* avoid races with trimming and scrub.
*
* An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
* completely transactionless, so while it is trimming a range the
* currently running transaction might finish and a new one start,
* allowing for new block groups to be created that can reuse the same
* physical device locations unless we take this special care.
*
* There may also be an implicit trim operation if the file system
* is mounted with -odiscard. The same protections must remain
* in place until the extents have been discarded completely when
* the transaction commit has completed.
*/
remove_em = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
if (remove_em) {
struct extent_map_tree *em_tree;
em_tree = &fs_info->mapping_tree;
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* once for the tree */
free_extent_map(em);
}
out:
/* Once for the lookup reference */
btrfs_put_block_group(block_group);
if (remove_rsv)
btrfs_delayed_refs_rsv_release(fs_info, 1);
btrfs_free_path(path);
return ret;
}
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
unsigned int num_items;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
ASSERT(em && em->start == chunk_offset);
/*
* We need to reserve 3 + N units from the metadata space info in order
* to remove a block group (done at btrfs_remove_chunk() and at
* btrfs_remove_block_group()), which are used for:
*
* 1 unit for adding the free space inode's orphan (located in the tree
* of tree roots).
* 1 unit for deleting the block group item (located in the extent
* tree).
* 1 unit for deleting the free space item (located in tree of tree
* roots).
* N units for deleting N device extent items corresponding to each
* stripe (located in the device tree).
*
* In order to remove a block group we also need to reserve units in the
* system space info in order to update the chunk tree (update one or
* more device items and remove one chunk item), but this is done at
* btrfs_remove_chunk() through a call to check_system_chunk().
*/
map = em->map_lookup;
num_items = 3 + map->num_stripes;
free_extent_map(em);
return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
num_items);
}
/*
* Mark block group @cache read-only, so later write won't happen to block
* group @cache.
*
* If @force is not set, this function will only mark the block group readonly
* if we have enough free space (1M) in other metadata/system block groups.
* If @force is not set, this function will mark the block group readonly
* without checking free space.
*
* NOTE: This function doesn't care if other block groups can contain all the
* data in this block group. That check should be done by relocation routine,
* not this function.
*/
static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
int ret = -ENOSPC;
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
if (cache->swap_extents) {
ret = -ETXTBSY;
goto out;
}
if (cache->ro) {
cache->ro++;
ret = 0;
goto out;
}
num_bytes = cache->length - cache->reserved - cache->pinned -
cache->bytes_super - cache->zone_unusable - cache->used;
/*
* Data never overcommits, even in mixed mode, so do just the straight
* check of left over space in how much we have allocated.
*/
if (force) {
ret = 0;
} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
u64 sinfo_used = btrfs_space_info_used(sinfo, true);
/*
* Here we make sure if we mark this bg RO, we still have enough
* free space as buffer.
*/
if (sinfo_used + num_bytes <= sinfo->total_bytes)
ret = 0;
} else {
/*
* We overcommit metadata, so we need to do the
* btrfs_can_overcommit check here, and we need to pass in
* BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
* leeway to allow us to mark this block group as read only.
*/
if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
BTRFS_RESERVE_NO_FLUSH))
ret = 0;
}
if (!ret) {
sinfo->bytes_readonly += num_bytes;
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes to readonly */
sinfo->bytes_readonly += cache->zone_unusable;
sinfo->bytes_zone_unusable -= cache->zone_unusable;
cache->zone_unusable = 0;
}
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
}
out:
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
btrfs_info(cache->fs_info,
"unable to make block group %llu ro", cache->start);
btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
}
return ret;
}
static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
struct btrfs_transaction *prev_trans = NULL;
const u64 start = bg->start;
const u64 end = start + bg->length - 1;
int ret;
spin_lock(&fs_info->trans_lock);
if (trans->transaction->list.prev != &fs_info->trans_list) {
prev_trans = list_last_entry(&trans->transaction->list,
struct btrfs_transaction, list);
refcount_inc(&prev_trans->use_count);
}
spin_unlock(&fs_info->trans_lock);
/*
* Hold the unused_bg_unpin_mutex lock to avoid racing with
* btrfs_finish_extent_commit(). If we are at transaction N, another
* task might be running finish_extent_commit() for the previous
* transaction N - 1, and have seen a range belonging to the block
* group in pinned_extents before we were able to clear the whole block
* group range from pinned_extents. This means that task can lookup for
* the block group after we unpinned it from pinned_extents and removed
* it, leading to a BUG_ON() at unpin_extent_range().
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans) {
ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
EXTENT_DIRTY);
if (ret)
goto out;
}
ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
EXTENT_DIRTY);
out:
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans)
btrfs_put_transaction(prev_trans);
return ret == 0;
}
/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
*/
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
int ret = 0;
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
/*
* Long running balances can keep us blocked here for eternity, so
* simply skip deletion if we're unable to get the mutex.
*/
if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
return;
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
struct btrfs_block_group,
bg_list);
list_del_init(&block_group->bg_list);
space_info = block_group->space_info;
if (ret || btrfs_mixed_space_info(space_info)) {
btrfs_put_block_group(block_group);
continue;
}
spin_unlock(&fs_info->unused_bgs_lock);
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
/*
* Async discard moves the final block group discard to be prior
* to the unused_bgs code path. Therefore, if it's not fully
* trimmed, punt it back to the async discard lists.
*/
if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
!btrfs_is_free_space_trimmed(block_group)) {
trace_btrfs_skip_unused_block_group(block_group);
up_write(&space_info->groups_sem);
/* Requeue if we failed because of async discard */
btrfs_discard_queue_work(&fs_info->discard_ctl,
block_group);
goto next;
}
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->pinned || block_group->used || block_group->ro || list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
* the ro check in case balance is currently acting on
* this block group.
*/
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
up_write(&space_info->groups_sem);
goto next;
}
spin_unlock(&block_group->lock);
/* We don't want to force the issue, only flip if it's ok. */
ret = inc_block_group_ro(block_group, 0);
up_write(&space_info->groups_sem);
if (ret < 0) {
ret = 0;
goto next;
}
/*
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
*/
trans = btrfs_start_trans_remove_block_group(fs_info,
block_group->start);
if (IS_ERR(trans)) {
btrfs_dec_block_group_ro(block_group);
ret = PTR_ERR(trans);
goto next;
}
/*
* We could have pending pinned extents for this block group,
* just delete them, we don't care about them anymore.
*/
if (!clean_pinned_extents(trans, block_group)) { btrfs_dec_block_group_ro(block_group);
goto end_trans;
}
/*
* At this point, the block_group is read only and should fail
* new allocations. However, btrfs_finish_extent_commit() can
* cause this block_group to be placed back on the discard
* lists because now the block_group isn't fully discarded.
* Bail here and try again later after discarding everything.
*/
spin_lock(&fs_info->discard_ctl.lock);
if (!list_empty(&block_group->discard_list)) {
spin_unlock(&fs_info->discard_ctl.lock);
btrfs_dec_block_group_ro(block_group);
btrfs_discard_queue_work(&fs_info->discard_ctl,
block_group);
goto end_trans;
}
spin_unlock(&fs_info->discard_ctl.lock);
/* Reset pinned so btrfs_put_block_group doesn't complain */
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
btrfs_space_info_update_bytes_pinned(fs_info, space_info,
-block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
block_group->pinned = 0;
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
/*
* The normal path here is an unused block group is passed here,
* then trimming is handled in the transaction commit path.
* Async discard interposes before this to do the trimming
* before coming down the unused block group path as trimming
* will no longer be done later in the transaction commit path.
*/
if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
goto flip_async;
/*
* DISCARD can flip during remount. On zoned filesystems, we
* need to reset sequential-required zones.
*/
trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) || btrfs_is_zoned(fs_info);
/* Implicit trim during transaction commit. */
if (trimming)
btrfs_freeze_block_group(block_group);
/*
* Btrfs_remove_chunk will abort the transaction if things go
* horribly wrong.
*/
ret = btrfs_remove_chunk(trans, block_group->start);
if (ret) {
if (trimming)
btrfs_unfreeze_block_group(block_group);
goto end_trans;
}
/*
* If we're not mounted with -odiscard, we can just forget
* about this block group. Otherwise we'll need to wait
* until transaction commit to do the actual discard.
*/
if (trimming) {
spin_lock(&fs_info->unused_bgs_lock);
/*
* A concurrent scrub might have added us to the list
* fs_info->unused_bgs, so use a list_move operation
* to add the block group to the deleted_bgs list.
*/
list_move(&block_group->bg_list,
&trans->transaction->deleted_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
btrfs_get_block_group(block_group);
}
end_trans:
btrfs_end_transaction(trans);
next:
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
}
void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
spin_lock(&fs_info->unused_bgs_lock);
if (list_empty(&bg->bg_list)) {
btrfs_get_block_group(bg);
trace_btrfs_add_unused_block_group(bg);
list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
}
spin_unlock(&fs_info->unused_bgs_lock);
}
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
struct btrfs_block_group *bg;
struct btrfs_space_info *space_info;
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
sb_start_write(fs_info->sb);
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
sb_end_write(fs_info->sb);
return;
}
/*
* Long running balances can keep us blocked here for eternity, so
* simply skip reclaim if we're unable to get the mutex.
*/
if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
btrfs_exclop_finish(fs_info);
sb_end_write(fs_info->sb);
return;
}
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->reclaim_bgs)) {
u64 zone_unusable;
int ret = 0;
bg = list_first_entry(&fs_info->reclaim_bgs,
struct btrfs_block_group,
bg_list);
list_del_init(&bg->bg_list);
space_info = bg->space_info;
spin_unlock(&fs_info->unused_bgs_lock);
/* Don't race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&bg->lock);
if (bg->reserved || bg->pinned || bg->ro) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
* the ro check in case balance is currently acting on
* this block group.
*/
spin_unlock(&bg->lock);
up_write(&space_info->groups_sem);
goto next;
}
spin_unlock(&bg->lock);
/* Get out fast, in case we're unmounting the filesystem */
if (btrfs_fs_closing(fs_info)) {
up_write(&space_info->groups_sem);
goto next;
}
/*
* Cache the zone_unusable value before turning the block group
* to read only. As soon as the blog group is read only it's
* zone_unusable value gets moved to the block group's read-only
* bytes and isn't available for calculations anymore.
*/
zone_unusable = bg->zone_unusable;
ret = inc_block_group_ro(bg, 0);
up_write(&space_info->groups_sem);
if (ret < 0)
goto next;
btrfs_info(fs_info,
"reclaiming chunk %llu with %llu%% used %llu%% unusable",
bg->start, div_u64(bg->used * 100, bg->length),
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start);
if (ret)
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
next:
btrfs_put_block_group(bg);
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
sb_end_write(fs_info->sb);
}
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
{
spin_lock(&fs_info->unused_bgs_lock);
if (!list_empty(&fs_info->reclaim_bgs))
queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
spin_unlock(&fs_info->unused_bgs_lock);
}
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
spin_lock(&fs_info->unused_bgs_lock);
if (list_empty(&bg->bg_list)) {
btrfs_get_block_group(bg);
trace_btrfs_add_reclaim_block_group(bg);
list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
}
spin_unlock(&fs_info->unused_bgs_lock);
}
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct btrfs_path *path)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
struct btrfs_block_group_item bg;
struct extent_buffer *leaf;
int slot;
u64 flags;
int ret = 0;
slot = path->slots[0];
leaf = path->nodes[0];
em_tree = &fs_info->mapping_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
read_unlock(&em_tree->lock);
if (!em) {
btrfs_err(fs_info,
"logical %llu len %llu found bg but no related chunk",
key->objectid, key->offset);
return -ENOENT;
}
if (em->start != key->objectid || em->len != key->offset) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
key->objectid, key->offset, em->start, em->len);
ret = -EUCLEAN;
goto out_free_em;
}
read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
sizeof(bg));
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
(BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
ret = -EUCLEAN;
}
out_free_em:
free_extent_map(em);
return ret;
}
static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_key *key)
{
struct btrfs_root *root = fs_info->extent_root;
int ret;
struct btrfs_key found_key;
struct extent_buffer *leaf;
int slot;
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
if (ret < 0)
return ret;
while (1) {
slot = path->slots[0];
leaf = path->nodes[0];
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto out;
break;
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { ret = read_bg_from_eb(fs_info, &found_key, path);
break;
}
path->slots[0]++;
}
out:
return ret;
}
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
u64 extra_flags = chunk_to_extended(flags) &
BTRFS_EXTENDED_PROFILE_MASK;
write_seqlock(&fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
fs_info->avail_data_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) fs_info->avail_system_alloc_bits |= extra_flags;
write_sequnlock(&fs_info->profiles_lock);
}
/**
* Map a physical disk address to a list of logical addresses
*
* @fs_info: the filesystem
* @chunk_start: logical address of block group
* @bdev: physical device to resolve, can be NULL to indicate any device
* @physical: physical address to map to logical addresses
* @logical: return array of logical addresses which map to @physical
* @naddrs: length of @logical
* @stripe_len: size of IO stripe for the given block group
*
* Maps a particular @physical disk address to a list of @logical addresses.
* Used primarily to exclude those portions of a block group that contain super
* block copies.
*/
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
struct block_device *bdev, u64 physical, u64 **logical,
int *naddrs, int *stripe_len)
{
struct extent_map *em;
struct map_lookup *map;
u64 *buf;
u64 bytenr;
u64 data_stripe_length;
u64 io_stripe_size;
int i, nr = 0;
int ret = 0;
em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
if (IS_ERR(em))
return -EIO;
map = em->map_lookup;
data_stripe_length = em->orig_block_len;
io_stripe_size = map->stripe_len;
chunk_start = em->start;
/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
io_stripe_size = map->stripe_len * nr_data_stripes(map); buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < map->num_stripes; i++) {
bool already_inserted = false;
u64 stripe_nr;
u64 offset;
int j;
if (!in_range(physical, map->stripes[i].physical,
data_stripe_length))
continue;
if (bdev && map->stripes[i].dev->bdev != bdev)
continue;
stripe_nr = physical - map->stripes[i].physical;
stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;
stripe_nr = div_u64(stripe_nr, map->sub_stripes);
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = stripe_nr * map->num_stripes + i;
}
/*
* The remaining case would be for RAID56, multiply by
* nr_data_stripes(). Alternatively, just use rmap_len below
* instead of map->stripe_len
*/
bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
/* Ensure we don't add duplicate addresses */
for (j = 0; j < nr; j++) { if (buf[j] == bytenr) {
already_inserted = true;
break;
}
}
if (!already_inserted)
buf[nr++] = bytenr;
}
*logical = buf;
*naddrs = nr;
*stripe_len = io_stripe_size;
out:
free_extent_map(em); return ret;
}
static int exclude_super_stripes(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
const bool zoned = btrfs_is_zoned(fs_info);
u64 bytenr;
u64 *logical;
int stripe_len;
int i, nr, ret;
if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
cache->bytes_super += stripe_len;
ret = btrfs_add_excluded_extent(fs_info, cache->start,
stripe_len);
if (ret)
return ret;
}
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
ret = btrfs_rmap_block(fs_info, cache->start, NULL,
bytenr, &logical, &nr, &stripe_len);
if (ret)
return ret;
/* Shouldn't have super stripes in sequential zones */
if (zoned && nr) {
btrfs_err(fs_info,
"zoned: block group %llu must not contain super block",
cache->start);
return -EUCLEAN;
}
while (nr--) { u64 len = min_t(u64, stripe_len,
cache->start + cache->length - logical[nr]);
cache->bytes_super += len;
ret = btrfs_add_excluded_extent(fs_info, logical[nr],
len);
if (ret) {
kfree(logical);
return ret;
}
}
kfree(logical);
}
return 0;
}
static void link_block_group(struct btrfs_block_group *cache)
{
struct btrfs_space_info *space_info = cache->space_info;
int index = btrfs_bg_flags_to_raid_index(cache->flags);
down_write(&space_info->groups_sem);
list_add_tail(&cache->list, &space_info->block_groups[index]);
up_write(&space_info->groups_sem);
}
static struct btrfs_block_group *btrfs_create_block_group_cache(
struct btrfs_fs_info *fs_info, u64 start)
{
struct btrfs_block_group *cache;
cache = kzalloc(sizeof(*cache), GFP_NOFS);
if (!cache)
return NULL;
cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
GFP_NOFS);
if (!cache->free_space_ctl) {
kfree(cache);
return NULL;
}
cache->start = start;
cache->fs_info = fs_info;
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
refcount_set(&cache->refs, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
return cache;
}
/*
* Iterate all chunks and verify that each of them has the corresponding block
* group
*/
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct btrfs_block_group *bg;
u64 start = 0;
int ret = 0;
while (1) {
read_lock(&map_tree->lock);
/*
* lookup_extent_mapping will return the first extent map
* intersecting the range, so setting @len to 1 is enough to
* get the first chunk.
*/
em = lookup_extent_mapping(map_tree, start, 1);
read_unlock(&map_tree->lock);
if (!em)
break;
bg = btrfs_lookup_block_group(fs_info, em->start);
if (!bg) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
em->start, em->len);
ret = -EUCLEAN;
free_extent_map(em);
break;
}
if (bg->start != em->start || bg->length != em->len || (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
(em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
em->start, em->len,
em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
bg->start, bg->length,
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
ret = -EUCLEAN;
free_extent_map(em);
btrfs_put_block_group(bg);
break;
}
start = em->start + em->len;
free_extent_map(em);
btrfs_put_block_group(bg);
}
return ret;
}
static int read_one_block_group(struct btrfs_fs_info *info,
struct btrfs_block_group_item *bgi,
const struct btrfs_key *key,
int need_clear)
{
struct btrfs_block_group *cache;
struct btrfs_space_info *space_info;
const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
int ret;
ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
cache = btrfs_create_block_group_cache(info, key->objectid);
if (!cache)
return -ENOMEM;
cache->length = key->offset;
cache->used = btrfs_stack_block_group_used(bgi);
cache->flags = btrfs_stack_block_group_flags(bgi);
set_free_space_tree_thresholds(cache);
if (need_clear) {
/*
* When we mount with old space cache, we need to
* set BTRFS_DC_CLEAR and set dirty flag.
*
* a) Setting 'BTRFS_DC_CLEAR' makes sure that we
* truncate the old free space cache inode and
* setup a new one.
* b) Setting 'dirty flag' makes sure that we flush
* the new space cache info onto disk.
*/
if (btrfs_test_opt(info, SPACE_CACHE)) cache->disk_cache_state = BTRFS_DC_CLEAR;
}
if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
(cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
btrfs_err(info,
"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
cache->start);
ret = -EINVAL;
goto error;
}
ret = btrfs_load_block_group_zone_info(cache, false);
if (ret) {
btrfs_err(info, "zoned: failed to load zone info of bg %llu",
cache->start);
goto error;
}
/*
* We need to exclude the super stripes now so that the space info has
* super bytes accounted for, otherwise we'll think we have more space
* than we actually do.
*/
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case. */
btrfs_free_excluded_extents(cache);
goto error;
}
/*
* For zoned filesystem, space after the allocation offset is the only
* free space for a block group. So, we don't need any caching work.
* btrfs_calc_zone_unusable() will set the amount of free space and
* zone_unusable space.
*
* For regular filesystem, check for two cases, either we are full, and
* therefore don't need to bother with the caching work since we won't
* find any space, or we are empty, and we can just add all the space
* in and be done with it. This saves us _a_lot_ of time, particularly
* in the full case.
*/
if (btrfs_is_zoned(info)) {
btrfs_calc_zone_unusable(cache);
} else if (cache->length == cache->used) { cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
add_new_free_space(cache, cache->start,
cache->start + cache->length);
btrfs_free_excluded_extents(cache);
}
ret = btrfs_add_block_group_cache(info, cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
goto error;
}
trace_btrfs_add_block_group(info, cache, 0);
btrfs_update_space_info(info, cache->flags, cache->length,
cache->used, cache->bytes_super,
cache->zone_unusable, &space_info);
cache->space_info = space_info;
link_block_group(cache);
set_avail_alloc_bits(info, cache->flags);
if (btrfs_chunk_readonly(info, cache->start)) {
inc_block_group_ro(cache, 1); } else if (cache->used == 0) {
ASSERT(list_empty(&cache->bg_list));
if (btrfs_test_opt(info, DISCARD_ASYNC))
btrfs_discard_queue_work(&info->discard_ctl, cache);
else
btrfs_mark_bg_unused(cache);
}
return 0;
error:
btrfs_put_block_group(cache);
return ret;
}
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct btrfs_space_info *space_info;
struct rb_node *node;
int ret = 0;
for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
struct extent_map *em;
struct map_lookup *map;
struct btrfs_block_group *bg;
em = rb_entry(node, struct extent_map, rb_node);
map = em->map_lookup;
bg = btrfs_create_block_group_cache(fs_info, em->start);
if (!bg) {
ret = -ENOMEM;
break;
}
/* Fill dummy cache as FULL */
bg->length = em->len;
bg->flags = map->type;
bg->last_byte_to_unpin = (u64)-1;
bg->cached = BTRFS_CACHE_FINISHED;
bg->used = em->len;
bg->flags = map->type;
ret = btrfs_add_block_group_cache(fs_info, bg);
/*
* We may have some valid block group cache added already, in
* that case we skip to the next one.
*/
if (ret == -EEXIST) {
ret = 0;
btrfs_put_block_group(bg);
continue;
}
if (ret) {
btrfs_remove_free_space_cache(bg);
btrfs_put_block_group(bg);
break;
}
btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
0, 0, &space_info);
bg->space_info = space_info;
link_block_group(bg);
set_avail_alloc_bits(fs_info, bg->flags);
}
if (!ret)
btrfs_init_global_block_rsv(fs_info);
return ret;
}
int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_path *path;
int ret;
struct btrfs_block_group *cache;
struct btrfs_space_info *space_info;
struct btrfs_key key;
int need_clear = 0;
u64 cache_gen;
if (!info->extent_root)
return fill_dummy_bgs(info);
key.objectid = 0;
key.offset = 0;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
cache_gen = btrfs_super_cache_generation(info->super_copy); if (btrfs_test_opt(info, SPACE_CACHE) &&
btrfs_super_generation(info->super_copy) != cache_gen)
need_clear = 1;
if (btrfs_test_opt(info, CLEAR_CACHE))
need_clear = 1;
while (1) {
struct btrfs_block_group_item bgi;
struct extent_buffer *leaf;
int slot;
ret = find_first_block_group(info, path, &key);
if (ret > 0)
break;
if (ret != 0)
goto error;
leaf = path->nodes[0];
slot = path->slots[0];
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
sizeof(bgi));
btrfs_item_key_to_cpu(leaf, &key, slot);
btrfs_release_path(path);
ret = read_one_block_group(info, &bgi, &key, need_clear);
if (ret < 0)
goto error;
key.objectid += key.offset;
key.offset = 0;
}
btrfs_release_path(path);
list_for_each_entry(space_info, &info->space_info, list) {
int i;
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { if (list_empty(&space_info->block_groups[i]))
continue;
cache = list_first_entry(&space_info->block_groups[i],
struct btrfs_block_group,
list);
btrfs_sysfs_add_block_group_type(cache);
}
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_RAID56_MASK |
BTRFS_BLOCK_GROUP_DUP)))
continue;
/*
* Avoid allocating from un-mirrored block group if there are
* mirrored block groups.
*/
list_for_each_entry(cache,
&space_info->block_groups[BTRFS_RAID_RAID0],
list)
inc_block_group_ro(cache, 1); list_for_each_entry(cache,
&space_info->block_groups[BTRFS_RAID_SINGLE],
list)
inc_block_group_ro(cache, 1);
}
btrfs_init_global_block_rsv(info);
ret = check_chunk_block_group_mappings(info);
error:
btrfs_free_path(path);
/*
* We've hit some error while reading the extent tree, and have
* rescue=ibadroots mount option.
* Try to fill the tree using dummy block groups so that the user can
* continue to mount and grab their data.
*/
if (ret && btrfs_test_opt(info, IGNOREBADROOTS)) ret = fill_dummy_bgs(info);
return ret;
}
/*
* This function, insert_block_group_item(), belongs to the phase 2 of chunk
* allocation.
*
* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
* phases.
*/
static int insert_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_item bgi;
struct btrfs_root *root;
struct btrfs_key key;
spin_lock(&block_group->lock);
btrfs_set_stack_block_group_used(&bgi, block_group->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
spin_unlock(&block_group->lock);
root = fs_info->extent_root;
return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
}
static int insert_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 chunk_offset,
u64 start, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_path *path;
struct btrfs_dev_extent *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
int ret;
WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = device->devid;
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = start;
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
if (ret)
goto out;
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
btrfs_set_dev_extent_chunk_objectid(leaf, extent,
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
return ret;
}
/*
* This function belongs to phase 2.
*
* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
* phases.
*/
static int insert_dev_extents(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 chunk_size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_device *device;
struct extent_map *em;
struct map_lookup *map;
u64 dev_offset;
u64 stripe_size;
int i;
int ret = 0;
em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
if (IS_ERR(em))
return PTR_ERR(em); map = em->map_lookup;
stripe_size = em->orig_block_len;
/*
* Take the device list mutex to prevent races with the final phase of
* a device replace operation that replaces the device object associated
* with the map's stripes, because the device object's id can change
* at any time during that final phase of the device replace operation
* (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
* replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
* resulting in persisting a device extent item with such ID.
*/
mutex_lock(&fs_info->fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
stripe_size);
if (ret)
break;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
free_extent_map(em);
return ret;
}
/*
* This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
* chunk allocation.
*
* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
* phases.
*/
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *block_group;
int ret = 0;
while (!list_empty(&trans->new_bgs)) {
int index;
block_group = list_first_entry(&trans->new_bgs,
struct btrfs_block_group,
bg_list);
if (ret)
goto next;
index = btrfs_bg_flags_to_raid_index(block_group->flags);
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
if (!block_group->chunk_item_inserted) {
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
btrfs_abort_transaction(trans, ret);
}
ret = insert_dev_extents(trans, block_group->start,
block_group->length);
if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group);
/*
* If we restriped during balance, we may have added a new raid
* type, so now add the sysfs entries when it is safe to do so.
* We don't have to worry about locking here as it's handled in
* btrfs_sysfs_add_block_group_type.
*/
if (block_group->space_info->block_group_kobjs[index] == NULL)
btrfs_sysfs_add_block_group_type(block_group);
/* Already aborted the transaction if it failed. */
next:
btrfs_delayed_refs_rsv_release(fs_info, 1);
list_del_init(&block_group->bg_list);
}
btrfs_trans_release_chunk_metadata(trans);
}
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 bytes_used, u64 type,
u64 chunk_offset, u64 size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache;
int ret;
btrfs_set_log_full_commit(trans);
cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
if (!cache)
return ERR_PTR(-ENOMEM);
cache->length = size;
set_free_space_tree_thresholds(cache);
cache->used = bytes_used;
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
cache->needs_free_space = 1;
ret = btrfs_load_block_group_zone_info(cache, true);
if (ret) {
btrfs_put_block_group(cache);
return ERR_PTR(ret);
}
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
btrfs_free_excluded_extents(cache);
btrfs_put_block_group(cache);
return ERR_PTR(ret);
}
add_new_free_space(cache, chunk_offset, chunk_offset + size);
btrfs_free_excluded_extents(cache);
#ifdef CONFIG_BTRFS_DEBUG
if (btrfs_should_fragment_free_space(cache)) {
u64 new_bytes_used = size - bytes_used;
bytes_used += new_bytes_used >> 1;
fragment_free_space(cache);
}
#endif
/*
* Ensure the corresponding space_info object is created and
* assigned to our block group. We want our bg to be added to the rbtree
* with its ->space_info set.
*/
cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
ASSERT(cache->space_info);
ret = btrfs_add_block_group_cache(fs_info, cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
btrfs_put_block_group(cache);
return ERR_PTR(ret);
}
/*
* Now that our block group has its ->space_info set and is inserted in
* the rbtree, update the space info's counters.
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
cache->bytes_super, 0, &cache->space_info);
btrfs_update_global_block_rsv(fs_info);
link_block_group(cache);
list_add_tail(&cache->bg_list, &trans->new_bgs);
trans->delayed_ref_updates++;
btrfs_update_delayed_refs_rsv(trans);
set_avail_alloc_bits(fs_info, type);
return cache;
}
/*
* Mark one block group RO, can be called several times for the same block
* group.
*
* @cache: the destination block group
* @do_chunk_alloc: whether need to do chunk pre-allocation, this is to
* ensure we still have some free space after marking this
* block group RO.
*/
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_trans_handle *trans;
u64 alloc_flags;
int ret;
bool dirty_bg_running;
/*
* This can only happen when we are doing read-only scrub on read-only
* mount.
* In that case we should not start a new transaction on read-only fs.
* Thus here we skip all chunk allocations.
*/
if (sb_rdonly(fs_info->sb)) {
mutex_lock(&fs_info->ro_block_group_mutex);
ret = inc_block_group_ro(cache, 0);
mutex_unlock(&fs_info->ro_block_group_mutex);
return ret;
}
do {
trans = btrfs_join_transaction(fs_info->extent_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
dirty_bg_running = false;
/*
* We're not allowed to set block groups readonly after the dirty
* block group cache has started writing. If it already started,
* back off and let this transaction commit.
*/
mutex_lock(&fs_info->ro_block_group_mutex);
if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
u64 transid = trans->transid;
mutex_unlock(&fs_info->ro_block_group_mutex);
btrfs_end_transaction(trans);
ret = btrfs_wait_for_commit(fs_info, transid);
if (ret)
return ret;
dirty_bg_running = true;
}
} while (dirty_bg_running);
if (do_chunk_alloc) {
/*
* If we are changing raid levels, try to allocate a
* corresponding block group with the new raid level.
*/
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
if (alloc_flags != cache->flags) {
ret = btrfs_chunk_alloc(trans, alloc_flags,
CHUNK_ALLOC_FORCE);
/*
* ENOSPC is allowed here, we may have enough space
* already allocated at the new raid level to carry on
*/
if (ret == -ENOSPC)
ret = 0;
if (ret < 0)
goto out;
}
}
ret = inc_block_group_ro(cache, 0);
if (!do_chunk_alloc || ret == -ETXTBSY)
goto unlock_out;
if (!ret)
goto out;
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
ret = inc_block_group_ro(cache, 0);
if (ret == -ETXTBSY)
goto unlock_out;
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
mutex_lock(&fs_info->chunk_mutex);
check_system_chunk(trans, alloc_flags);
mutex_unlock(&fs_info->chunk_mutex);
}
unlock_out:
mutex_unlock(&fs_info->ro_block_group_mutex);
btrfs_end_transaction(trans);
return ret;
}
void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
BUG_ON(!cache->ro);
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
if (!--cache->ro) {
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes back */
cache->zone_unusable = cache->alloc_offset - cache->used;
sinfo->bytes_zone_unusable += cache->zone_unusable;
sinfo->bytes_readonly -= cache->zone_unusable;
}
num_bytes = cache->length - cache->reserved -
cache->pinned - cache->bytes_super -
cache->zone_unusable - cache->used;
sinfo->bytes_readonly -= num_bytes;
list_del_init(&cache->ro_list);
}
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
}
static int update_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_root *root = fs_info->extent_root;
unsigned long bi;
struct extent_buffer *leaf;
struct btrfs_block_group_item bgi;
struct btrfs_key key;
key.objectid = cache->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = cache->length;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
goto fail;
}
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
btrfs_set_stack_block_group_used(&bgi, cache->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
btrfs_mark_buffer_dirty(leaf);
fail:
btrfs_release_path(path);
return ret;
}
static int cache_save_setup(struct btrfs_block_group *block_group,
struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root = fs_info->tree_root;
struct inode *inode = NULL;
struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
int dcs = BTRFS_DC_ERROR;
u64 cache_size = 0;
int retries = 0;
int ret = 0;
if (!btrfs_test_opt(fs_info, SPACE_CACHE))
return 0;
/*
* If this block group is smaller than 100 megs don't bother caching the
* block group.
*/
if (block_group->length < (100 * SZ_1M)) {
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
return 0;
}
if (TRANS_ABORTED(trans))
return 0;
again:
inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
ret = PTR_ERR(inode);
btrfs_release_path(path);
goto out;
}
if (IS_ERR(inode)) {
BUG_ON(retries);
retries++;
if (block_group->ro)
goto out_free;
ret = create_free_space_inode(trans, block_group, path);
if (ret)
goto out_free;
goto again;
}
/*
* We want to set the generation to 0, that way if anything goes wrong
* from here on out we know not to trust this cache when we load up next
* time.
*/
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret) {
/*
* So theoretically we could recover from this, simply set the
* super cache generation to 0 so we know to invalidate the
* cache, but then we'd have to keep track of the block groups
* that fail this way so we know we _have_ to reset this cache
* before the next commit or risk reading stale cache. So to
* limit our exposure to horrible edge cases lets just abort the
* transaction, this only happens in really bad situations
* anyway.
*/
btrfs_abort_transaction(trans, ret);
goto out_put;
}
WARN_ON(ret);
/* We've already setup this transaction, go ahead and exit */
if (block_group->cache_generation == trans->transid &&
i_size_read(inode)) {
dcs = BTRFS_DC_SETUP;
goto out_put;
}
if (i_size_read(inode) > 0) { ret = btrfs_check_trunc_cache_free_space(fs_info,
&fs_info->global_block_rsv);
if (ret)
goto out_put;
ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
if (ret)
goto out_put;
}
spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED ||
!btrfs_test_opt(fs_info, SPACE_CACHE)) {
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
* b) we're with nospace_cache mount option,
* c) we're with v2 space_cache (FREE_SPACE_TREE).
*/
dcs = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
goto out_put;
}
spin_unlock(&block_group->lock);
/*
* We hit an ENOSPC when setting up the cache in this transaction, just
* skip doing the setup, we've already cleared the cache so we're safe.
*/
if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
ret = -ENOSPC;
goto out_put;
}
/*
* Try to preallocate enough space based on how big the block group is.
* Keep in mind this has to include any pinned space which could end up
* taking up quite a bit since it's not folded into the other space
* cache.
*/
cache_size = div_u64(block_group->length, SZ_256M);
if (!cache_size)
cache_size = 1;
cache_size *= 16;
cache_size *= fs_info->sectorsize;
ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
cache_size);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
cache_size, cache_size,
&alloc_hint);
/*
* Our cache requires contiguous chunks so that we don't modify a bunch
* of metadata or split extents when writing the cache out, which means
* we can enospc if we are heavily fragmented in addition to just normal
* out of space conditions. So if we hit this just skip setting up any
* other block groups for this transaction, maybe we'll unpin enough
* space the next time around.
*/
if (!ret)
dcs = BTRFS_DC_SETUP;
else if (ret == -ENOSPC) set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
out_put:
iput(inode);
out_free:
btrfs_release_path(path);
out:
spin_lock(&block_group->lock);
if (!ret && dcs == BTRFS_DC_SETUP)
block_group->cache_generation = trans->transid; block_group->disk_cache_state = dcs;
spin_unlock(&block_group->lock);
extent_changeset_free(data_reserved);
return ret;
}
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache, *tmp;
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_path *path;
if (list_empty(&cur_trans->dirty_bgs) ||
!btrfs_test_opt(fs_info, SPACE_CACHE))
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/* Could add new block groups, use _safe just in case */
list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
dirty_list) {
if (cache->disk_cache_state == BTRFS_DC_CLEAR) cache_save_setup(cache, trans, path);
}
btrfs_free_path(path); return 0;
}
/*
* Transaction commit does final block group cache writeback during a critical
* section where nothing is allowed to change the FS. This is required in
* order for the cache to actually match the block group, but can introduce a
* lot of latency into the commit.
*
* So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
* There's a chance we'll have to redo some of it if the block group changes
* again during the commit, but it greatly reduces the commit latency by
* getting rid of the easy block groups while we're still allowing others to
* join the commit.
*/
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
struct btrfs_path *path = NULL;
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
int loops = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
if (list_empty(&cur_trans->dirty_bgs)) {
spin_unlock(&cur_trans->dirty_bgs_lock);
return 0;
}
list_splice_init(&cur_trans->dirty_bgs, &dirty);
spin_unlock(&cur_trans->dirty_bgs_lock);
again:
/* Make sure all the block groups on our dirty list actually exist */
btrfs_create_pending_block_groups(trans);
if (!path) {
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
}
/*
* cache_write_mutex is here only to save us from balance or automatic
* removal of empty block groups deleting this block group while we are
* writing out the cache
*/
mutex_lock(&trans->transaction->cache_write_mutex);
while (!list_empty(&dirty)) {
bool drop_reserve = true;
cache = list_first_entry(&dirty, struct btrfs_block_group,
dirty_list);
/*
* This can happen if something re-dirties a block group that
* is already under IO. Just wait for it to finish and then do
* it all again
*/
if (!list_empty(&cache->io_list)) {
list_del_init(&cache->io_list);
btrfs_wait_cache_io(trans, cache, path);
btrfs_put_block_group(cache);
}
/*
* btrfs_wait_cache_io uses the cache->dirty_list to decide if
* it should update the cache_state. Don't delete until after
* we wait.
*
* Since we're not running in the commit critical section
* we need the dirty_bgs_lock to protect from update_block_group
*/
spin_lock(&cur_trans->dirty_bgs_lock);
list_del_init(&cache->dirty_list);
spin_unlock(&cur_trans->dirty_bgs_lock);
should_put = 1;
cache_save_setup(cache, trans, path);
if (cache->disk_cache_state == BTRFS_DC_SETUP) {
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
should_put = 0;
/*
* The cache_write_mutex is protecting the
* io_list, also refer to the definition of
* btrfs_transaction::io_bgs for more details
*/
list_add_tail(&cache->io_list, io);
} else {
/*
* If we failed to write the cache, the
* generation will be bad and life goes on
*/
ret = 0;
}
}
if (!ret) {
ret = update_block_group_item(trans, path, cache);
/*
* Our block group might still be attached to the list
* of new block groups in the transaction handle of some
* other task (struct btrfs_trans_handle->new_bgs). This
* means its block group item isn't yet in the extent
* tree. If this happens ignore the error, as we will
* try again later in the critical section of the
* transaction commit.
*/
if (ret == -ENOENT) {
ret = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
if (list_empty(&cache->dirty_list)) {
list_add_tail(&cache->dirty_list,
&cur_trans->dirty_bgs);
btrfs_get_block_group(cache);
drop_reserve = false;
}
spin_unlock(&cur_trans->dirty_bgs_lock);
} else if (ret) { btrfs_abort_transaction(trans, ret);
}
}
/* If it's not on the io list, we need to put the block group */
if (should_put) btrfs_put_block_group(cache);
if (drop_reserve)
btrfs_delayed_refs_rsv_release(fs_info, 1);
/*
* Avoid blocking other tasks for too long. It might even save
* us from writing caches for block groups that are going to be
* removed.
*/
mutex_unlock(&trans->transaction->cache_write_mutex);
if (ret)
goto out;
mutex_lock(&trans->transaction->cache_write_mutex);
}
mutex_unlock(&trans->transaction->cache_write_mutex);
/*
* Go through delayed refs for all the stuff we've just kicked off
* and then loop back (just once)
*/
if (!ret)
ret = btrfs_run_delayed_refs(trans, 0);
if (!ret && loops == 0) {
loops++;
spin_lock(&cur_trans->dirty_bgs_lock);
list_splice_init(&cur_trans->dirty_bgs, &dirty);
/*
* dirty_bgs_lock protects us from concurrent block group
* deletes too (not just cache_write_mutex).
*/
if (!list_empty(&dirty)) {
spin_unlock(&cur_trans->dirty_bgs_lock);
goto again;
}
spin_unlock(&cur_trans->dirty_bgs_lock);
}
out:
if (ret < 0) {
spin_lock(&cur_trans->dirty_bgs_lock);
list_splice_init(&dirty, &cur_trans->dirty_bgs);
spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
}
btrfs_free_path(path); return ret;
}
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
struct btrfs_path *path;
struct list_head *io = &cur_trans->io_bgs;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/*
* Even though we are in the critical section of the transaction commit,
* we can still have concurrent tasks adding elements to this
* transaction's list of dirty block groups. These tasks correspond to
* endio free space workers started when writeback finishes for a
* space cache, which run inode.c:btrfs_finish_ordered_io(), and can
* allocate new block groups as a result of COWing nodes of the root
* tree when updating the free space inode. The writeback for the space
* caches is triggered by an earlier call to
* btrfs_start_dirty_block_groups() and iterations of the following
* loop.
* Also we want to do the cache_save_setup first and then run the
* delayed refs to make sure we have the best chance at doing this all
* in one shot.
*/
spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group,
dirty_list);
/*
* This can happen if cache_save_setup re-dirties a block group
* that is already under IO. Just wait for it to finish and
* then do it all again
*/
if (!list_empty(&cache->io_list)) {
spin_unlock(&cur_trans->dirty_bgs_lock);
list_del_init(&cache->io_list);
btrfs_wait_cache_io(trans, cache, path);
btrfs_put_block_group(cache);
spin_lock(&cur_trans->dirty_bgs_lock);
}
/*
* Don't remove from the dirty list until after we've waited on
* any pending IO
*/
list_del_init(&cache->dirty_list);
spin_unlock(&cur_trans->dirty_bgs_lock);
should_put = 1;
cache_save_setup(cache, trans, path);
if (!ret)
ret = btrfs_run_delayed_refs(trans,
(unsigned long) -1);
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
should_put = 0;
list_add_tail(&cache->io_list, io);
} else {
/*
* If we failed to write the cache, the
* generation will be bad and life goes on
*/
ret = 0;
}
}
if (!ret) {
ret = update_block_group_item(trans, path, cache);
/*
* One of the free space endio workers might have
* created a new block group while updating a free space
* cache's inode (at inode.c:btrfs_finish_ordered_io())
* and hasn't released its transaction handle yet, in
* which case the new block group is still attached to
* its transaction handle and its creation has not
* finished yet (no block group item in the extent tree
* yet, etc). If this is the case, wait for all free
* space endio workers to finish and retry. This is a
* very rare case so no need for a more efficient and
* complex approach.
*/
if (ret == -ENOENT) {
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
ret = update_block_group_item(trans, path, cache);
}
if (ret) btrfs_abort_transaction(trans, ret);
}
/* If its not on the io list, we need to put the block group */
if (should_put) btrfs_put_block_group(cache); btrfs_delayed_refs_rsv_release(fs_info, 1);
spin_lock(&cur_trans->dirty_bgs_lock);
}
spin_unlock(&cur_trans->dirty_bgs_lock);
/*
* Refer to the definition of io_bgs member for details why it's safe
* to use it without any locking
*/
while (!list_empty(io)) {
cache = list_first_entry(io, struct btrfs_block_group,
io_list);
list_del_init(&cache->io_list);
btrfs_wait_cache_io(trans, cache, path);
btrfs_put_block_group(cache);
}
btrfs_free_path(path); return ret;
}
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, int alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_block_group *cache = NULL;
u64 total = num_bytes;
u64 old_val;
u64 byte_in_group;
int factor;
int ret = 0;
/* Block accounting for super block */
spin_lock(&info->delalloc_root_lock);
old_val = btrfs_super_bytes_used(info->super_copy);
if (alloc)
old_val += num_bytes;
else
old_val -= num_bytes;
btrfs_set_super_bytes_used(info->super_copy, old_val);
spin_unlock(&info->delalloc_root_lock);
while (total) {
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache) {
ret = -ENOENT;
break;
}
factor = btrfs_bg_type_to_factor(cache->flags);
/*
* If this block group has free space cache written out, we
* need to make sure to load it if we are removing space. This
* is because we need the unpinning stage to actually add the
* space back to the block group, otherwise we will leak space.
*/
if (!alloc && !btrfs_block_group_done(cache))
btrfs_cache_block_group(cache, 1); byte_in_group = bytenr - cache->start; WARN_ON(byte_in_group > cache->length); spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
if (btrfs_test_opt(info, SPACE_CACHE) &&
cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; old_val = cache->used;
num_bytes = min(total, cache->length - byte_in_group);
if (alloc) {
old_val += num_bytes;
cache->used = old_val;
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
cache->space_info->bytes_used += num_bytes;
cache->space_info->disk_used += num_bytes * factor;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
} else {
old_val -= num_bytes;
cache->used = old_val;
cache->pinned += num_bytes;
btrfs_space_info_update_bytes_pinned(info,
cache->space_info, num_bytes);
cache->space_info->bytes_used -= num_bytes;
cache->space_info->disk_used -= num_bytes * factor;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
set_extent_dirty(&trans->transaction->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
}
spin_lock(&trans->transaction->dirty_bgs_lock);
if (list_empty(&cache->dirty_list)) {
list_add_tail(&cache->dirty_list,
&trans->transaction->dirty_bgs);
trans->delayed_ref_updates++;
btrfs_get_block_group(cache);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
/*
* No longer have used bytes in this block group, queue it for
* deletion. We do this after adding the block group to the
* dirty list to avoid races between cleaner kthread and space
* cache writeout.
*/
if (!alloc && old_val == 0) { if (!btrfs_test_opt(info, DISCARD_ASYNC)) btrfs_mark_bg_unused(cache);
}
btrfs_put_block_group(cache);
total -= num_bytes;
bytenr += num_bytes;
}
/* Modified block groups are accounted for in the delayed_refs_rsv. */
btrfs_update_delayed_refs_rsv(trans);
return ret;
}
/**
* btrfs_add_reserved_bytes - update the block_group and space info counters
* @cache: The cache we are manipulating
* @ram_bytes: The number of bytes of file content, and will be same to
* @num_bytes except for the compress path.
* @num_bytes: The number of bytes in question
* @delalloc: The blocks are allocated for the delalloc write
*
* This is called by the allocator when it reserves space. If this is a
* reservation and the block group has become read only we cannot make the
* reservation and return -EAGAIN, otherwise this function always succeeds.
*/
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
if (cache->ro) {
ret = -EAGAIN;
} else {
cache->reserved += num_bytes;
space_info->bytes_reserved += num_bytes;
trace_btrfs_space_reservation(cache->fs_info, "space_info",
space_info->flags, num_bytes, 1);
btrfs_space_info_update_bytes_may_use(cache->fs_info,
space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
/*
* Compression can use less space than we reserved, so wake
* tickets if that happens
*/
if (num_bytes < ram_bytes) btrfs_try_granting_tickets(cache->fs_info, space_info);
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
return ret;
}
/**
* btrfs_free_reserved_bytes - update the block_group and space info counters
* @cache: The cache we are manipulating
* @num_bytes: The number of bytes in question
* @delalloc: The blocks are allocated for the delalloc write
*
* This is called by somebody who is freeing space that was never actually used
* on disk. For example if you reserve some space for a new leaf in transaction
* A and before transaction A commits you free that leaf, you call this with
* reserve set to 0 in order to clear the reservation.
*/
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
if (cache->ro)
space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
space_info->max_extent_size = 0;
if (delalloc)
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
btrfs_try_granting_tickets(cache->fs_info, space_info);
spin_unlock(&space_info->lock);
}
static void force_metadata_allocation(struct btrfs_fs_info *info)
{
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
list_for_each_entry(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_METADATA) found->force_alloc = CHUNK_ALLOC_FORCE;
}
}
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *sinfo, int force)
{
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
if (force == CHUNK_ALLOC_FORCE)
return 1;
/*
* in limited mode, we want to have some free space up to
* about 1% of the FS size.
*/
if (force == CHUNK_ALLOC_LIMITED) {
thresh = btrfs_super_total_bytes(fs_info->super_copy);
thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
if (sinfo->total_bytes - bytes_used < thresh)
return 1;
}
if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
return 0;
return 1;
}
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
{
u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
}
static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
{
struct btrfs_block_group *bg;
int ret;
/*
* Check if we have enough space in the system space info because we
* will need to update device items in the chunk btree and insert a new
* chunk item in the chunk btree as well. This will allocate a new
* system block group if needed.
*/
check_system_chunk(trans, flags);
bg = btrfs_alloc_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
goto out;
}
/*
* If this is a system chunk allocation then stop right here and do not
* add the chunk item to the chunk btree. This is to prevent a deadlock
* because this system chunk allocation can be triggered while COWing
* some extent buffer of the chunk btree and while holding a lock on a
* parent extent buffer, in which case attempting to insert the chunk
* item (or update the device item) would result in a deadlock on that
* parent extent buffer. In this case defer the chunk btree updates to
* the second phase of chunk allocation and keep our reservation until
* the second phase completes.
*
* This is a rare case and can only be triggered by the very few cases
* we have where we need to touch the chunk btree outside chunk allocation
* and chunk removal. These cases are basically adding a device, removing
* a device or resizing a device.
*/
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return 0;
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
/*
* Normally we are not expected to fail with -ENOSPC here, since we have
* previously reserved space in the system space_info and allocated one
* new system chunk if necessary. However there are two exceptions:
*
* 1) We may have enough free space in the system space_info but all the
* existing system block groups have a profile which can not be used
* for extent allocation.
*
* This happens when mounting in degraded mode. For example we have a
* RAID1 filesystem with 2 devices, lose one device and mount the fs
* using the other device in degraded mode. If we then allocate a chunk,
* we may have enough free space in the existing system space_info, but
* none of the block groups can be used for extent allocation since they
* have a RAID1 profile, and because we are in degraded mode with a
* single device, we are forced to allocate a new system chunk with a
* SINGLE profile. Making check_system_chunk() iterate over all system
* block groups and check if they have a usable profile and enough space
* can be slow on very large filesystems, so we tolerate the -ENOSPC and
* try again after forcing allocation of a new system chunk. Like this
* we avoid paying the cost of that search in normal circumstances, when
* we were not mounted in degraded mode;
*
* 2) We had enough free space info the system space_info, and one suitable
* block group to allocate from when we called check_system_chunk()
* above. However right after we called it, the only system block group
* with enough free space got turned into RO mode by a running scrub,
* and in this case we have to allocate a new one and retry. We only
* need do this allocate and retry once, since we have a transaction
* handle and scrub uses the commit root to search for block groups.
*/
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
struct btrfs_block_group *sys_bg;
sys_bg = btrfs_alloc_chunk(trans, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
} else if (ret) { btrfs_abort_transaction(trans, ret);
goto out;
}
out:
btrfs_trans_release_chunk_metadata(trans);
return ret;
}
/*
* Chunk allocation is done in 2 phases:
*
* 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
* the chunk, the chunk mapping, create its block group and add the items
* that belong in the chunk btree to it - more specifically, we need to
* update device items in the chunk btree and add a new chunk item to it.
*
* 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
* group item to the extent btree and the device extent items to the devices
* btree.
*
* This is done to prevent deadlocks. For example when COWing a node from the
* extent btree we are holding a write lock on the node's parent and if we
* trigger chunk allocation and attempted to insert the new block group item
* in the extent btree right way, we could deadlock because the path for the
* insertion can include that parent node. At first glance it seems impossible
* to trigger chunk allocation after starting a transaction since tasks should
* reserve enough transaction units (metadata space), however while that is true
* most of the time, chunk allocation may still be triggered for several reasons:
*
* 1) When reserving metadata, we check if there is enough free space in the
* metadata space_info and therefore don't trigger allocation of a new chunk.
* However later when the task actually tries to COW an extent buffer from
* the extent btree or from the device btree for example, it is forced to
* allocate a new block group (chunk) because the only one that had enough
* free space was just turned to RO mode by a running scrub for example (or
* device replace, block group reclaim thread, etc), so we can not use it
* for allocating an extent and end up being forced to allocate a new one;
*
* 2) Because we only check that the metadata space_info has enough free bytes,
* we end up not allocating a new metadata chunk in that case. However if
* the filesystem was mounted in degraded mode, none of the existing block
* groups might be suitable for extent allocation due to their incompatible
* profile (for e.g. mounting a 2 devices filesystem, where all block groups
* use a RAID1 profile, in degraded mode using a single device). In this case
* when the task attempts to COW some extent buffer of the extent btree for
* example, it will trigger allocation of a new metadata block group with a
* suitable profile (SINGLE profile in the example of the degraded mount of
* the RAID1 filesystem);
*
* 3) The task has reserved enough transaction units / metadata space, but when
* it attempts to COW an extent buffer from the extent or device btree for
* example, it does not find any free extent in any metadata block group,
* therefore forced to try to allocate a new metadata block group.
* This is because some other task allocated all available extents in the
* meanwhile - this typically happens with tasks that don't reserve space
* properly, either intentionally or as a bug. One example where this is
* done intentionally is fsync, as it does not reserve any transaction units
* and ends up allocating a variable number of metadata extents for log
* tree extent buffers.
*
* We also need this 2 phases setup when adding a device to a filesystem with
* a seed device - we must create new metadata and system chunks without adding
* any of the block group items to the chunk, extent and device btrees. If we
* did not do it this way, we would get ENOSPC when attempting to update those
* btrees, since all the chunks from the seed device are read-only.
*
* Phase 1 does the updates and insertions to the chunk btree because if we had
* it done in phase 2 and have a thundering herd of tasks allocating chunks in
* parallel, we risk having too many system chunks allocated by many tasks if
* many tasks reach phase 1 without the previous ones completing phase 2. In the
* extreme case this leads to exhaustion of the system chunk array in the
* superblock. This is easier to trigger if using a btree node/leaf size of 64K
* and with RAID filesystems (so we have more device items in the chunk btree).
* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
* the system chunk array due to concurrent allocations") provides more details.
*
* For allocation of system chunks, we defer the updates and insertions into the
* chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
* if the chunk allocation is triggered while COWing an extent buffer of the
* chunk btree, we are holding a lock on the parent of that extent buffer and
* doing the chunk btree updates and insertions can require locking that parent.
* This is for the very few and rare cases where we update the chunk btree that
* are not chunk allocation or chunk removal: adding a device, removing a device
* or resizing a device.
*
* The reservation of system space, done through check_system_chunk(), as well
* as all the updates and insertions into the chunk btree must be done while
* holding fs_info->chunk_mutex. This is important to guarantee that while COWing
* an extent buffer from the chunks btree we never trigger allocation of a new
* system chunk, which would result in a deadlock (trying to lock twice an
* extent buffer of the chunk btree, first time before triggering the chunk
* allocation and the second time during chunk allocation while attempting to
* update the chunks btree). The system chunk array is also updated while holding
* that mutex. The same logic applies to removing chunks - we must reserve system
* space, update the chunk btree and the system chunk array in the superblock
* while holding fs_info->chunk_mutex.
*
* This function, btrfs_chunk_alloc(), belongs to phase 1.
*
* If @force is CHUNK_ALLOC_FORCE:
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
* If @force is NOT CHUNK_ALLOC_FORCE:
* - return 0 if it doesn't need to allocate a new chunk,
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
*/
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
enum btrfs_chunk_alloc_enum force)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *space_info;
bool wait_for_alloc = false;
bool should_alloc = false;
int ret = 0;
/* Don't re-enter if we're already allocating a chunk */
if (trans->allocating_chunk)
return -ENOSPC;
/*
* If we are removing a chunk, don't re-enter or we would deadlock.
* System space reservation and system chunk allocation is done by the
* chunk remove operation (btrfs_remove_chunk()).
*/
if (trans->removing_chunk)
return -ENOSPC;
space_info = btrfs_find_space_info(fs_info, flags);
ASSERT(space_info);
do {
spin_lock(&space_info->lock);
if (force < space_info->force_alloc)
force = space_info->force_alloc;
should_alloc = should_alloc_chunk(fs_info, space_info, force);
if (space_info->full) {
/* No more free physical space */
if (should_alloc)
ret = -ENOSPC;
else
ret = 0;
spin_unlock(&space_info->lock);
return ret;
} else if (!should_alloc) {
spin_unlock(&space_info->lock);
return 0; } else if (space_info->chunk_alloc) {
/*
* Someone is already allocating, so we need to block
* until this someone is finished and then loop to
* recheck if we should continue with our allocation
* attempt.
*/
wait_for_alloc = true;
spin_unlock(&space_info->lock);
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
} else {
/* Proceed with allocation */
space_info->chunk_alloc = 1;
wait_for_alloc = false;
spin_unlock(&space_info->lock);
}
cond_resched();
} while (wait_for_alloc);
mutex_lock(&fs_info->chunk_mutex);
trans->allocating_chunk = true;
/*
* If we have mixed data/metadata chunks we want to make sure we keep
* allocating mixed chunks instead of individual chunks.
*/
if (btrfs_mixed_space_info(space_info))
flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
/*
* if we're doing a data chunk, go ahead and make sure that
* we keep a reasonable number of metadata chunks allocated in the
* FS as well.
*/
if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { fs_info->data_chunk_allocations++;
if (!(fs_info->data_chunk_allocations %
fs_info->metadata_ratio))
force_metadata_allocation(fs_info);
}
ret = do_chunk_alloc(trans, flags);
trans->allocating_chunk = false;
spin_lock(&space_info->lock);
if (ret < 0) {
if (ret == -ENOSPC) space_info->full = 1;
else
goto out;
} else {
ret = 1;
space_info->max_extent_size = 0;
}
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
{
u64 num_dev;
num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
if (!num_dev)
num_dev = fs_info->fs_devices->rw_devices;
return num_dev;
}
/*
* Reserve space in the system space for allocating or removing a chunk
*/
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
u64 thresh;
int ret = 0;
u64 num_devs;
/*
* Needed because we can end up allocating a system chunk and for an
* atomic and race free space reservation in the chunk block reserve.
*/
lockdep_assert_held(&fs_info->chunk_mutex);
info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
spin_lock(&info->lock);
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
num_devs = get_profile_num_devs(fs_info, type);
/* num_devs device items to update and 1 chunk item to add or remove */
thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
btrfs_calc_insert_metadata_size(fs_info, 1);
if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
left, thresh, type);
btrfs_dump_space_info(fs_info, info, 0, 0);
}
if (left < thresh) {
u64 flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *bg;
/*
* Ignore failure to create system chunk. We might end up not
* needing it, as we might not need to COW all nodes/leafs from
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
*
* Also, if our caller is allocating a system chunk, do not
* attempt to insert the chunk item in the chunk btree, as we
* could deadlock on an extent buffer since our caller may be
* COWing an extent buffer from the chunk btree.
*/
bg = btrfs_alloc_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
} else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
/*
* If we fail to add the chunk item here, we end up
* trying again at phase 2 of chunk allocation, at
* btrfs_create_pending_block_groups(). So ignore
* any error here.
*/
btrfs_chunk_alloc_add_chunk_item(trans, bg);
}
}
if (!ret) {
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
thresh, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
trans->chunk_bytes_reserved += thresh;
}
}
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
u64 last = 0;
while (1) {
struct inode *inode;
block_group = btrfs_lookup_first_block_group(info, last);
while (block_group) {
btrfs_wait_block_group_cache_done(block_group);
spin_lock(&block_group->lock);
if (block_group->iref)
break;
spin_unlock(&block_group->lock);
block_group = btrfs_next_block_group(block_group);
}
if (!block_group) {
if (last == 0)
break;
last = 0;
continue;
}
inode = block_group->inode;
block_group->iref = 0;
block_group->inode = NULL;
spin_unlock(&block_group->lock);
ASSERT(block_group->io_ctl.inode == NULL);
iput(inode);
last = block_group->start + block_group->length;
btrfs_put_block_group(block_group);
}
}
/*
* Must be called only after stopping all workers, since we could have block
* group caching kthreads running, and therefore they could race with us if we
* freed the block groups before stopping them.
*/
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
spin_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
spin_unlock(&info->block_group_cache_lock);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
block_group = list_first_entry(&info->unused_bgs,
struct btrfs_block_group,
bg_list);
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
}
spin_unlock(&info->unused_bgs_lock);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->reclaim_bgs)) {
block_group = list_first_entry(&info->reclaim_bgs,
struct btrfs_block_group,
bg_list);
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
}
spin_unlock(&info->unused_bgs_lock);
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group,
cache_node);
rb_erase(&block_group->cache_node,
&info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
spin_unlock(&info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
list_del(&block_group->list);
up_write(&block_group->space_info->groups_sem);
/*
* We haven't cached this block group, which means we could
* possibly have excluded extents on this block group.
*/
if (block_group->cached == BTRFS_CACHE_NO ||
block_group->cached == BTRFS_CACHE_ERROR)
btrfs_free_excluded_extents(block_group); btrfs_remove_free_space_cache(block_group);
ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
ASSERT(list_empty(&block_group->dirty_list));
ASSERT(list_empty(&block_group->io_list));
ASSERT(list_empty(&block_group->bg_list));
ASSERT(refcount_read(&block_group->refs) == 1);
ASSERT(block_group->swap_extents == 0);
btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock);
}
spin_unlock(&info->block_group_cache_lock);
btrfs_release_global_block_rsv(info);
while (!list_empty(&info->space_info)) {
space_info = list_entry(info->space_info.next,
struct btrfs_space_info,
list);
/*
* Do not hide this behind enospc_debug, this is actually
* important and indicates a real bug if this happens.
*/
if (WARN_ON(space_info->bytes_pinned > 0 ||
space_info->bytes_may_use > 0))
btrfs_dump_space_info(info, space_info, 0, 0);
/*
* If there was a failure to cleanup a log tree, very likely due
* to an IO failure on a writeback attempt of one or more of its
* extent buffers, we could not do proper (and cheap) unaccounting
* of their reserved space, so don't warn on bytes_reserved > 0 in
* that case.
*/
if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { if (WARN_ON(space_info->bytes_reserved > 0))
btrfs_dump_space_info(info, space_info, 0, 0);
}
WARN_ON(space_info->reclaim_size > 0);
list_del(&space_info->list);
btrfs_sysfs_remove_space_info(space_info);
}
return 0;
}
void btrfs_freeze_block_group(struct btrfs_block_group *cache)
{
atomic_inc(&cache->frozen);
}
void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct extent_map_tree *em_tree;
struct extent_map *em;
bool cleanup;
spin_lock(&block_group->lock);
cleanup = (atomic_dec_and_test(&block_group->frozen) &&
block_group->removed);
spin_unlock(&block_group->lock);
if (cleanup) {
em_tree = &fs_info->mapping_tree;
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, block_group->start,
1);
BUG_ON(!em); /* logic error, can't happen */
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* once for us and once for the tree */
free_extent_map(em);
free_extent_map(em);
/*
* We may have left one free space entry and other possible
* tasks trimming this block group have left 1 entry each one.
* Free them if any.
*/
__btrfs_remove_free_space_cache(block_group->free_space_ctl);
}
}
bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
{
bool ret = true;
spin_lock(&bg->lock);
if (bg->ro)
ret = false;
else
bg->swap_extents++;
spin_unlock(&bg->lock);
return ret;
}
void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
{
spin_lock(&bg->lock);
ASSERT(!bg->ro);
ASSERT(bg->swap_extents >= amount);
bg->swap_extents -= amount;
spin_unlock(&bg->lock);
}
// SPDX-License-Identifier: GPL-2.0
/*
* Functions related to generic helpers functions
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
#include "blk.h"
struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
{
struct bio *new = bio_alloc(gfp, nr_pages);
if (bio) {
bio_chain(bio, new);
submit_bio(bio);
}
return new;
}
EXPORT_SYMBOL_GPL(blk_next_bio);
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, int flags,
struct bio **biop)
{
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = *biop;
unsigned int op;
sector_t bs_mask, part_offset = 0;
if (!q)
return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
if (flags & BLKDEV_DISCARD_SECURE) {
if (!blk_queue_secure_erase(q))
return -EOPNOTSUPP;
op = REQ_OP_SECURE_ERASE;
} else {
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
op = REQ_OP_DISCARD;
}
/* In case the discard granularity isn't set by buggy device driver */
if (WARN_ON_ONCE(!q->limits.discard_granularity)) {
char dev_name[BDEVNAME_SIZE];
bdevname(bdev, dev_name);
pr_err_ratelimited("%s: Error: discard_granularity is 0.\n", dev_name);
return -EOPNOTSUPP;
}
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
if (!nr_sects)
return -EINVAL;
/* In case the discard request is in a partition */
if (bdev_is_partition(bdev)) part_offset = bdev->bd_start_sect; while (nr_sects) {
sector_t granularity_aligned_lba, req_sects;
sector_t sector_mapped = sector + part_offset;
granularity_aligned_lba = round_up(sector_mapped,
q->limits.discard_granularity >> SECTOR_SHIFT);
/*
* Check whether the discard bio starts at a discard_granularity
* aligned LBA,
* - If no: set (granularity_aligned_lba - sector_mapped) to
* bi_size of the first split bio, then the second bio will
* start at a discard_granularity aligned LBA on the device.
* - If yes: use bio_aligned_discard_max_sectors() as the max
* possible bi_size of the first split bio. Then when this bio
* is split in device drive, the split ones are very probably
* to be aligned to discard_granularity of the device's queue.
*/
if (granularity_aligned_lba == sector_mapped)
req_sects = min_t(sector_t, nr_sects,
bio_aligned_discard_max_sectors(q));
else
req_sects = min_t(sector_t, nr_sects,
granularity_aligned_lba - sector_mapped);
WARN_ON_ONCE((req_sects << 9) > UINT_MAX); bio = blk_next_bio(bio, 0, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio_set_op_attrs(bio, op, 0);
bio->bi_iter.bi_size = req_sects << 9;
sector += req_sects;
nr_sects -= req_sects;
/*
* We can loop for a long time in here, if someone does
* full device discards (like mkfs). Be nice and allow
* us to schedule out to avoid softlocking if preempt
* is disabled.
*/
cond_resched();
}
*biop = bio; return 0;
}
EXPORT_SYMBOL(__blkdev_issue_discard);
/**
* blkdev_issue_discard - queue a discard
* @bdev: blockdev to issue discard for
* @sector: start sector
* @nr_sects: number of sectors to discard
* @gfp_mask: memory allocation flags (for bio_alloc)
* @flags: BLKDEV_DISCARD_* flags to control behaviour
*
* Description:
* Issue a discard request for the sectors in question.
*/
int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
{
struct bio *bio = NULL;
struct blk_plug plug;
int ret;
blk_start_plug(&plug);
ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags,
&bio);
if (!ret && bio) { ret = submit_bio_wait(bio);
if (ret == -EOPNOTSUPP)
ret = 0;
bio_put(bio);
}
blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL(blkdev_issue_discard);
/**
* __blkdev_issue_write_same - generate number of bios with same page
* @bdev: target blockdev
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
* @page: page containing data to write
* @biop: pointer to anchor bio
*
* Description:
* Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page.
*/
static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct page *page,
struct bio **biop)
{
struct request_queue *q = bdev_get_queue(bdev);
unsigned int max_write_same_sectors;
struct bio *bio = *biop;
sector_t bs_mask;
if (!q)
return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
if (!bdev_write_same(bdev))
return -EOPNOTSUPP;
/* Ensure that max_write_same_sectors doesn't overflow bi_size */
max_write_same_sectors = bio_allowed_max_sectors(q);
while (nr_sects) {
bio = blk_next_bio(bio, 1, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio->bi_vcnt = 1;
bio->bi_io_vec->bv_page = page;
bio->bi_io_vec->bv_offset = 0;
bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0);
if (nr_sects > max_write_same_sectors) {
bio->bi_iter.bi_size = max_write_same_sectors << 9;
nr_sects -= max_write_same_sectors;
sector += max_write_same_sectors;
} else {
bio->bi_iter.bi_size = nr_sects << 9;
nr_sects = 0;
}
cond_resched();
}
*biop = bio;
return 0;
}
/**
* blkdev_issue_write_same - queue a write same operation
* @bdev: target blockdev
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
* @page: page containing data
*
* Description:
* Issue a write same request for the sectors in question.
*/
int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask,
struct page *page)
{
struct bio *bio = NULL;
struct blk_plug plug;
int ret;
blk_start_plug(&plug);
ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page,
&bio);
if (ret == 0 && bio) {
ret = submit_bio_wait(bio);
bio_put(bio);
}
blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL(blkdev_issue_write_same);
static int __blkdev_issue_write_zeroes(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop, unsigned flags)
{
struct bio *bio = *biop;
unsigned int max_write_zeroes_sectors;
struct request_queue *q = bdev_get_queue(bdev);
if (!q)
return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
if (max_write_zeroes_sectors == 0)
return -EOPNOTSUPP;
while (nr_sects) { bio = blk_next_bio(bio, 0, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE_ZEROES;
if (flags & BLKDEV_ZERO_NOUNMAP)
bio->bi_opf |= REQ_NOUNMAP; if (nr_sects > max_write_zeroes_sectors) { bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
nr_sects -= max_write_zeroes_sectors;
sector += max_write_zeroes_sectors;
} else {
bio->bi_iter.bi_size = nr_sects << 9;
nr_sects = 0;
}
cond_resched();
}
*biop = bio; return 0;
}
/*
* Convert a number of 512B sectors to a number of pages.
* The result is limited to a number of pages that can fit into a BIO.
* Also make sure that the result is always at least 1 (page) for the cases
* where nr_sects is lower than the number of sectors in a page.
*/
static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
{
sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512);
return min(pages, (sector_t)BIO_MAX_VECS);
}
static int __blkdev_issue_zero_pages(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop)
{
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = *biop;
int bi_size = 0;
unsigned int sz;
if (!q)
return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
while (nr_sects != 0) {
bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
gfp_mask);
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0);
nr_sects -= bi_size >> 9;
sector += bi_size >> 9;
if (bi_size < sz)
break;
}
cond_resched();
}
*biop = bio; return 0;
}
/**
* __blkdev_issue_zeroout - generate number of zero filed write bios
* @bdev: blockdev to issue
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
* @biop: pointer to anchor bio
* @flags: controls detailed behavior
*
* Description:
* Zero-fill a block range, either using hardware offload or by explicitly
* writing zeroes to the device.
*
* If a device is using logical block provisioning, the underlying space will
* not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
*
* If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return
* -EOPNOTSUPP if no explicit hardware offload for zeroing is provided.
*/
int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
unsigned flags)
{
int ret;
sector_t bs_mask;
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
biop, flags);
if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
return ret;
return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask,
biop);
}
EXPORT_SYMBOL(__blkdev_issue_zeroout);
/**
* blkdev_issue_zeroout - zero-fill a block range
* @bdev: blockdev to write
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
* @flags: controls detailed behavior
*
* Description:
* Zero-fill a block range, either using hardware offload or by explicitly
* writing zeroes to the device. See __blkdev_issue_zeroout() for the
* valid values for %flags.
*/
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
{
int ret = 0;
sector_t bs_mask;
struct bio *bio;
struct blk_plug plug;
bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev);
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
retry:
bio = NULL;
blk_start_plug(&plug);
if (try_write_zeroes) {
ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects,
gfp_mask, &bio, flags);
} else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects,
gfp_mask, &bio);
} else {
/* No zeroing offload support */
ret = -EOPNOTSUPP;
}
if (ret == 0 && bio) { ret = submit_bio_wait(bio);
bio_put(bio);
}
blk_finish_plug(&plug); if (ret && try_write_zeroes) { if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
try_write_zeroes = false;
goto retry;
}
if (!bdev_write_zeroes_sectors(bdev)) {
/*
* Zeroing offload support was indicated, but the
* device reported ILLEGAL REQUEST (for some devices
* there is no non-destructive way to verify whether
* WRITE ZEROES is actually supported).
*/
ret = -EOPNOTSUPP;
}
}
return ret;
}
EXPORT_SYMBOL(blkdev_issue_zeroout);
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2011 STRATO. All rights reserved.
*/
#ifndef BTRFS_BACKREF_H
#define BTRFS_BACKREF_H
#include <linux/btrfs.h>
#include "ulist.h"
#include "disk-io.h"
#include "extent_io.h"
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
struct btrfs_root *fs_root;
struct btrfs_data_container *fspath;
};
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key,
u64 *flags);
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
struct btrfs_key *key, struct btrfs_extent_item *ei,
u32 item_size, u64 *out_root, u8 *out_level);
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
u64 extent_item_objectid,
u64 extent_offset, int search_commit_root,
iterate_extent_inodes_t *iterate, void *ctx,
bool ignore_offset);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
iterate_extent_inodes_t *iterate, void *ctx,
bool ignore_offset);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **leafs,
const u64 *extent_item_pos, bool ignore_offset);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **roots,
bool skip_commit_root_sem);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
char *dest, u32 size);
struct btrfs_data_container *init_data_container(u32 total_bytes);
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path);
void free_ipath(struct inode_fs_paths *ipath);
int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
struct ulist *roots, struct ulist *tmp_ulist);
int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);
struct prelim_ref {
struct rb_node rbnode;
u64 root_id;
struct btrfs_key key_for_search;
int level;
int count;
struct extent_inode_elem *inode_list;
u64 parent;
u64 wanted_disk_byte;
};
/*
* Iterate backrefs of one extent.
*
* Now it only supports iteration of tree block in commit root.
*/
struct btrfs_backref_iter {
u64 bytenr;
struct btrfs_path *path;
struct btrfs_fs_info *fs_info;
struct btrfs_key cur_key;
u32 item_ptr;
u32 cur_ptr;
u32 end_ptr;
};
struct btrfs_backref_iter *btrfs_backref_iter_alloc(
struct btrfs_fs_info *fs_info, gfp_t gfp_flag);
static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter)
{
if (!iter)
return;
btrfs_free_path(iter->path);
kfree(iter);
}
static inline struct extent_buffer *btrfs_backref_get_eb(
struct btrfs_backref_iter *iter)
{
if (!iter)
return NULL;
return iter->path->nodes[0];
}
/*
* For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data
* is btrfs_tree_block_info, without a btrfs_extent_inline_ref header.
*
* This helper determines if that's the case.
*/
static inline bool btrfs_backref_has_tree_block_info(
struct btrfs_backref_iter *iter)
{
if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY &&
iter->cur_ptr - iter->item_ptr == sizeof(struct btrfs_extent_item))
return true;
return false;
}
int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr);
int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
static inline bool btrfs_backref_iter_is_inline_ref(
struct btrfs_backref_iter *iter)
{
if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY ||
iter->cur_key.type == BTRFS_METADATA_ITEM_KEY)
return true;
return false;
}
static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter)
{
iter->bytenr = 0;
iter->item_ptr = 0;
iter->cur_ptr = 0;
iter->end_ptr = 0;
btrfs_release_path(iter->path);
memset(&iter->cur_key, 0, sizeof(iter->cur_key));
}
/*
* Backref cache related structures
*
* The whole objective of backref_cache is to build a bi-directional map
* of tree blocks (represented by backref_node) and all their parents.
*/
/*
* Represent a tree block in the backref cache
*/
struct btrfs_backref_node {
struct {
struct rb_node rb_node;
u64 bytenr;
}; /* Use rb_simple_node for search/insert */
u64 new_bytenr;
/* Objectid of tree block owner, can be not uptodate */
u64 owner;
/* Link to pending, changed or detached list */
struct list_head list;
/* List of upper level edges, which link this node to its parents */
struct list_head upper;
/* List of lower level edges, which link this node to its children */
struct list_head lower;
/* NULL if this node is not tree root */
struct btrfs_root *root;
/* Extent buffer got by COWing the block */
struct extent_buffer *eb;
/* Level of the tree block */
unsigned int level:8;
/* Is the block in a non-shareable tree */
unsigned int cowonly:1;
/* 1 if no child node is in the cache */
unsigned int lowest:1;
/* Is the extent buffer locked */
unsigned int locked:1;
/* Has the block been processed */
unsigned int processed:1;
/* Have backrefs of this block been checked */
unsigned int checked:1;
/*
* 1 if corresponding block has been COWed but some upper level block
* pointers may not point to the new location
*/
unsigned int pending:1;
/* 1 if the backref node isn't connected to any other backref node */
unsigned int detached:1;
/*
* For generic purpose backref cache, where we only care if it's a reloc
* root, doesn't care the source subvolid.
*/
unsigned int is_reloc_root:1;
};
#define LOWER 0
#define UPPER 1
/*
* Represent an edge connecting upper and lower backref nodes.
*/
struct btrfs_backref_edge {
/*
* list[LOWER] is linked to btrfs_backref_node::upper of lower level
* node, and list[UPPER] is linked to btrfs_backref_node::lower of
* upper level node.
*
* Also, build_backref_tree() uses list[UPPER] for pending edges, before
* linking list[UPPER] to its upper level nodes.
*/
struct list_head list[2];
/* Two related nodes */
struct btrfs_backref_node *node[2];
};
struct btrfs_backref_cache {
/* Red black tree of all backref nodes in the cache */
struct rb_root rb_root;
/* For passing backref nodes to btrfs_reloc_cow_block */
struct btrfs_backref_node *path[BTRFS_MAX_LEVEL];
/*
* List of blocks that have been COWed but some block pointers in upper
* level blocks may not reflect the new location
*/
struct list_head pending[BTRFS_MAX_LEVEL];
/* List of backref nodes with no child node */
struct list_head leaves;
/* List of blocks that have been COWed in current transaction */
struct list_head changed;
/* List of detached backref node. */
struct list_head detached;
u64 last_trans;
int nr_nodes;
int nr_edges;
/* List of unchecked backref edges during backref cache build */
struct list_head pending_edge;
/* List of useless backref nodes during backref cache build */
struct list_head useless_node;
struct btrfs_fs_info *fs_info;
/*
* Whether this cache is for relocation
*
* Reloction backref cache require more info for reloc root compared
* to generic backref cache.
*/
unsigned int is_reloc;
};
void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
struct btrfs_backref_cache *cache, int is_reloc);
struct btrfs_backref_node *btrfs_backref_alloc_node(
struct btrfs_backref_cache *cache, u64 bytenr, int level);
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
struct btrfs_backref_cache *cache);
#define LINK_LOWER (1 << 0)
#define LINK_UPPER (1 << 1)
static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
struct btrfs_backref_node *lower,
struct btrfs_backref_node *upper,
int link_which)
{
ASSERT(upper && lower && upper->level == lower->level + 1);
edge->node[LOWER] = lower;
edge->node[UPPER] = upper;
if (link_which & LINK_LOWER)
list_add_tail(&edge->list[LOWER], &lower->upper);
if (link_which & LINK_UPPER)
list_add_tail(&edge->list[UPPER], &upper->lower);
}
static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node)
{
if (node) {
ASSERT(list_empty(&node->list));
ASSERT(list_empty(&node->lower));
ASSERT(node->eb == NULL);
cache->nr_nodes--;
btrfs_put_root(node->root);
kfree(node);
}
}
static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
struct btrfs_backref_edge *edge)
{
if (edge) {
cache->nr_edges--;
kfree(edge);
}
}
static inline void btrfs_backref_unlock_node_buffer(
struct btrfs_backref_node *node)
{
if (node->locked) { btrfs_tree_unlock(node->eb);
node->locked = 0;
}
}
static inline void btrfs_backref_drop_node_buffer(
struct btrfs_backref_node *node)
{
if (node->eb) {
btrfs_backref_unlock_node_buffer(node);
free_extent_buffer(node->eb);
node->eb = NULL;
}
}
/*
* Drop the backref node from cache without cleaning up its children
* edges.
*
* This can only be called on node without parent edges.
* The children edges are still kept as is.
*/
static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
struct btrfs_backref_node *node)
{
ASSERT(list_empty(&node->upper));
btrfs_backref_drop_node_buffer(node);
list_del_init(&node->list);
list_del_init(&node->lower);
if (!RB_EMPTY_NODE(&node->rb_node))
rb_erase(&node->rb_node, &tree->rb_root);
btrfs_backref_free_node(tree, node);
}
void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node);
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache);
static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info,
u64 bytenr, int errno)
{
btrfs_panic(fs_info, errno,
"Inconsistency in backref cache found at offset %llu",
bytenr);
}
int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache,
struct btrfs_path *path,
struct btrfs_backref_iter *iter,
struct btrfs_key *node_key,
struct btrfs_backref_node *cur);
int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *start);
void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node);
#endif
// SPDX-License-Identifier: GPL-2.0
/*
* Functions related to setting various queue properties from drivers
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/pagemap.h>
#include <linux/backing-dev-defs.h>
#include <linux/gcd.h>
#include <linux/lcm.h>
#include <linux/jiffies.h>
#include <linux/gfp.h>
#include <linux/dma-mapping.h>
#include "blk.h"
#include "blk-wbt.h"
void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
{
q->rq_timeout = timeout;
}
EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
/**
* blk_set_default_limits - reset limits to default values
* @lim: the queue_limits structure to reset
*
* Description:
* Returns a queue_limit struct to its default state.
*/
void blk_set_default_limits(struct queue_limits *lim)
{
lim->max_segments = BLK_MAX_SEGMENTS;
lim->max_discard_segments = 1;
lim->max_integrity_segments = 0;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
lim->virt_boundary_mask = 0;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
lim->max_dev_sectors = 0;
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
lim->max_write_zeroes_sectors = 0;
lim->max_zone_append_sectors = 0;
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
lim->discard_granularity = 0;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
lim->bounce = BLK_BOUNCE_NONE;
lim->alignment_offset = 0;
lim->io_opt = 0;
lim->misaligned = 0;
lim->zoned = BLK_ZONED_NONE;
lim->zone_write_granularity = 0;
}
EXPORT_SYMBOL(blk_set_default_limits);
/**
* blk_set_stacking_limits - set default limits for stacking devices
* @lim: the queue_limits structure to reset
*
* Description:
* Returns a queue_limit struct to its default state. Should be used
* by stacking drivers like DM that have no internal limits.
*/
void blk_set_stacking_limits(struct queue_limits *lim)
{
blk_set_default_limits(lim);
/* Inherit limits from component devices */
lim->max_segments = USHRT_MAX;
lim->max_discard_segments = USHRT_MAX;
lim->max_hw_sectors = UINT_MAX;
lim->max_segment_size = UINT_MAX;
lim->max_sectors = UINT_MAX;
lim->max_dev_sectors = UINT_MAX;
lim->max_write_same_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
lim->max_zone_append_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
/**
* blk_queue_bounce_limit - set bounce buffer limit for queue
* @q: the request queue for the device
* @bounce: bounce limit to enforce
*
* Description:
* Force bouncing for ISA DMA ranges or highmem.
*
* DEPRECATED, don't use in new code.
**/
void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce)
{
q->limits.bounce = bounce;
}
EXPORT_SYMBOL(blk_queue_bounce_limit);
/**
* blk_queue_max_hw_sectors - set max sectors for a request for this queue
* @q: the request queue for the device
* @max_hw_sectors: max hardware sectors in the usual 512b unit
*
* Description:
* Enables a low level driver to set a hard upper limit,
* max_hw_sectors, on the size of requests. max_hw_sectors is set by
* the device driver based upon the capabilities of the I/O
* controller.
*
* max_dev_sectors is a hard limit imposed by the storage device for
* READ/WRITE requests. It is set by the disk driver.
*
* max_sectors is a soft limit imposed by the block layer for
* filesystem type requests. This value can be overridden on a
* per-device basis in /sys/block/<device>/queue/max_sectors_kb.
* The soft limit can not exceed max_hw_sectors.
**/
void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
{
struct queue_limits *limits = &q->limits;
unsigned int max_sectors;
if ((max_hw_sectors << 9) < PAGE_SIZE) {
max_hw_sectors = 1 << (PAGE_SHIFT - 9);
printk(KERN_INFO "%s: set to minimum %d\n",
__func__, max_hw_sectors);
}
max_hw_sectors = round_down(max_hw_sectors,
limits->logical_block_size >> SECTOR_SHIFT);
limits->max_hw_sectors = max_hw_sectors;
max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
max_sectors = round_down(max_sectors,
limits->logical_block_size >> SECTOR_SHIFT);
limits->max_sectors = max_sectors;
if (!q->disk)
return;
q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9);
}
EXPORT_SYMBOL(blk_queue_max_hw_sectors);
/**
* blk_queue_chunk_sectors - set size of the chunk for this queue
* @q: the request queue for the device
* @chunk_sectors: chunk sectors in the usual 512b unit
*
* Description:
* If a driver doesn't want IOs to cross a given chunk size, it can set
* this limit and prevent merging across chunks. Note that the block layer
* must accept a page worth of data at any offset. So if the crossing of
* chunks is a hard limitation in the driver, it must still be prepared
* to split single page bios.
**/
void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors)
{
q->limits.chunk_sectors = chunk_sectors;
}
EXPORT_SYMBOL(blk_queue_chunk_sectors);
/**
* blk_queue_max_discard_sectors - set max sectors for a single discard
* @q: the request queue for the device
* @max_discard_sectors: maximum number of sectors to discard
**/
void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors)
{
q->limits.max_hw_discard_sectors = max_discard_sectors;
q->limits.max_discard_sectors = max_discard_sectors;
}
EXPORT_SYMBOL(blk_queue_max_discard_sectors);
/**
* blk_queue_max_write_same_sectors - set max sectors for a single write same
* @q: the request queue for the device
* @max_write_same_sectors: maximum number of sectors to write per command
**/
void blk_queue_max_write_same_sectors(struct request_queue *q,
unsigned int max_write_same_sectors)
{
q->limits.max_write_same_sectors = max_write_same_sectors;
}
EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
/**
* blk_queue_max_write_zeroes_sectors - set max sectors for a single
* write zeroes
* @q: the request queue for the device
* @max_write_zeroes_sectors: maximum number of sectors to write per command
**/
void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
unsigned int max_write_zeroes_sectors)
{
q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors;
}
EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
/**
* blk_queue_max_zone_append_sectors - set max sectors for a single zone append
* @q: the request queue for the device
* @max_zone_append_sectors: maximum number of sectors to write per command
**/
void blk_queue_max_zone_append_sectors(struct request_queue *q,
unsigned int max_zone_append_sectors)
{
unsigned int max_sectors;
if (WARN_ON(!blk_queue_is_zoned(q)))
return;
max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
max_sectors = min(q->limits.chunk_sectors, max_sectors);
/*
* Signal eventual driver bugs resulting in the max_zone_append sectors limit
* being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
* or the max_hw_sectors limit not set.
*/
WARN_ON(!max_sectors);
q->limits.max_zone_append_sectors = max_sectors;
}
EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors);
/**
* blk_queue_max_segments - set max hw segments for a request for this queue
* @q: the request queue for the device
* @max_segments: max number of segments
*
* Description:
* Enables a low level driver to set an upper limit on the number of
* hw data segments in a request.
**/
void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
{
if (!max_segments) {
max_segments = 1;
printk(KERN_INFO "%s: set to minimum %d\n",
__func__, max_segments);
}
q->limits.max_segments = max_segments;
}
EXPORT_SYMBOL(blk_queue_max_segments);
/**
* blk_queue_max_discard_segments - set max segments for discard requests
* @q: the request queue for the device
* @max_segments: max number of segments
*
* Description:
* Enables a low level driver to set an upper limit on the number of
* segments in a discard request.
**/
void blk_queue_max_discard_segments(struct request_queue *q,
unsigned short max_segments)
{
q->limits.max_discard_segments = max_segments;
}
EXPORT_SYMBOL_GPL(blk_queue_max_discard_segments);
/**
* blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
* @q: the request queue for the device
* @max_size: max size of segment in bytes
*
* Description:
* Enables a low level driver to set an upper limit on the size of a
* coalesced segment
**/
void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
{
if (max_size < PAGE_SIZE) {
max_size = PAGE_SIZE;
printk(KERN_INFO "%s: set to minimum %d\n",
__func__, max_size);
}
/* see blk_queue_virt_boundary() for the explanation */
WARN_ON_ONCE(q->limits.virt_boundary_mask);
q->limits.max_segment_size = max_size;
}
EXPORT_SYMBOL(blk_queue_max_segment_size);
/**
* blk_queue_logical_block_size - set logical block size for the queue
* @q: the request queue for the device
* @size: the logical block size, in bytes
*
* Description:
* This should be set to the lowest possible block size that the
* storage device can address. The default of 512 covers most
* hardware.
**/
void blk_queue_logical_block_size(struct request_queue *q, unsigned int size)
{
struct queue_limits *limits = &q->limits;
limits->logical_block_size = size;
if (limits->physical_block_size < size)
limits->physical_block_size = size; if (limits->io_min < limits->physical_block_size) limits->io_min = limits->physical_block_size;
limits->max_hw_sectors =
round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT);
limits->max_sectors =
round_down(limits->max_sectors, size >> SECTOR_SHIFT);
}
EXPORT_SYMBOL(blk_queue_logical_block_size);
/**
* blk_queue_physical_block_size - set physical block size for the queue
* @q: the request queue for the device
* @size: the physical block size, in bytes
*
* Description:
* This should be set to the lowest possible sector size that the
* hardware can operate on without reverting to read-modify-write
* operations.
*/
void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
{
q->limits.physical_block_size = size;
if (q->limits.physical_block_size < q->limits.logical_block_size)
q->limits.physical_block_size = q->limits.logical_block_size; if (q->limits.io_min < q->limits.physical_block_size) q->limits.io_min = q->limits.physical_block_size;
}
EXPORT_SYMBOL(blk_queue_physical_block_size);
/**
* blk_queue_zone_write_granularity - set zone write granularity for the queue
* @q: the request queue for the zoned device
* @size: the zone write granularity size, in bytes
*
* Description:
* This should be set to the lowest possible size allowing to write in
* sequential zones of a zoned block device.
*/
void blk_queue_zone_write_granularity(struct request_queue *q,
unsigned int size)
{
if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
return;
q->limits.zone_write_granularity = size;
if (q->limits.zone_write_granularity < q->limits.logical_block_size)
q->limits.zone_write_granularity = q->limits.logical_block_size;
}
EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity);
/**
* blk_queue_alignment_offset - set physical block alignment offset
* @q: the request queue for the device
* @offset: alignment offset in bytes
*
* Description:
* Some devices are naturally misaligned to compensate for things like
* the legacy DOS partition table 63-sector offset. Low-level drivers
* should call this function for devices whose first sector is not
* naturally aligned.
*/
void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
{
q->limits.alignment_offset =
offset & (q->limits.physical_block_size - 1);
q->limits.misaligned = 0;
}
EXPORT_SYMBOL(blk_queue_alignment_offset);
void disk_update_readahead(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
/*
* For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size.
*/
disk->bdi->ra_pages =
max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9);
}
EXPORT_SYMBOL_GPL(disk_update_readahead);
/**
* blk_limits_io_min - set minimum request size for a device
* @limits: the queue limits
* @min: smallest I/O size in bytes
*
* Description:
* Some devices have an internal block size bigger than the reported
* hardware sector size. This function can be used to signal the
* smallest I/O the device can perform without incurring a performance
* penalty.
*/
void blk_limits_io_min(struct queue_limits *limits, unsigned int min)
{
limits->io_min = min;
if (limits->io_min < limits->logical_block_size)
limits->io_min = limits->logical_block_size; if (limits->io_min < limits->physical_block_size) limits->io_min = limits->physical_block_size;
}
EXPORT_SYMBOL(blk_limits_io_min);
/**
* blk_queue_io_min - set minimum request size for the queue
* @q: the request queue for the device
* @min: smallest I/O size in bytes
*
* Description:
* Storage devices may report a granularity or preferred minimum I/O
* size which is the smallest request the device can perform without
* incurring a performance penalty. For disk drives this is often the
* physical block size. For RAID arrays it is often the stripe chunk
* size. A properly aligned multiple of minimum_io_size is the
* preferred request size for workloads where a high number of I/O
* operations is desired.
*/
void blk_queue_io_min(struct request_queue *q, unsigned int min)
{
blk_limits_io_min(&q->limits, min);
}
EXPORT_SYMBOL(blk_queue_io_min);
/**
* blk_limits_io_opt - set optimal request size for a device
* @limits: the queue limits
* @opt: smallest I/O size in bytes
*
* Description:
* Storage devices may report an optimal I/O size, which is the
* device's preferred unit for sustained I/O. This is rarely reported
* for disk drives. For RAID arrays it is usually the stripe width or
* the internal track size. A properly aligned multiple of
* optimal_io_size is the preferred request size for workloads where
* sustained throughput is desired.
*/
void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt)
{
limits->io_opt = opt;
}
EXPORT_SYMBOL(blk_limits_io_opt);
/**
* blk_queue_io_opt - set optimal request size for the queue
* @q: the request queue for the device
* @opt: optimal request size in bytes
*
* Description:
* Storage devices may report an optimal I/O size, which is the
* device's preferred unit for sustained I/O. This is rarely reported
* for disk drives. For RAID arrays it is usually the stripe width or
* the internal track size. A properly aligned multiple of
* optimal_io_size is the preferred request size for workloads where
* sustained throughput is desired.
*/
void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
{
blk_limits_io_opt(&q->limits, opt);
if (!q->disk)
return;
q->disk->bdi->ra_pages =
max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
}
EXPORT_SYMBOL(blk_queue_io_opt);
static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs)
{
sectors = round_down(sectors, lbs >> SECTOR_SHIFT);
if (sectors < PAGE_SIZE >> SECTOR_SHIFT)
sectors = PAGE_SIZE >> SECTOR_SHIFT;
return sectors;
}
/**
* blk_stack_limits - adjust queue_limits for stacked devices
* @t: the stacking driver limits (top device)
* @b: the underlying queue limits (bottom, component device)
* @start: first data sector within component device
*
* Description:
* This function is used by stacking drivers like MD and DM to ensure
* that all component devices have compatible block sizes and
* alignments. The stacking driver must provide a queue_limits
* struct (top) and then iteratively call the stacking function for
* all component (bottom) devices. The stacking function will
* attempt to combine the values and ensure proper alignment.
*
* Returns 0 if the top and bottom queue_limits are compatible. The
* top device's block sizes and alignment offsets may be adjusted to
* ensure alignment with the bottom device. If no compatible sizes
* and alignments exist, -1 is returned and the resulting top
* queue_limits will have the misaligned flag set to indicate that
* the alignment_offset is undefined.
*/
int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
sector_t start)
{
unsigned int top, bottom, alignment, ret = 0;
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_same_sectors = min(t->max_write_same_sectors,
b->max_write_same_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
t->max_zone_append_sectors = min(t->max_zone_append_sectors,
b->max_zone_append_sectors);
t->bounce = max(t->bounce, b->bounce);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
b->seg_boundary_mask);
t->virt_boundary_mask = min_not_zero(t->virt_boundary_mask,
b->virt_boundary_mask);
t->max_segments = min_not_zero(t->max_segments, b->max_segments);
t->max_discard_segments = min_not_zero(t->max_discard_segments,
b->max_discard_segments);
t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
b->max_integrity_segments);
t->max_segment_size = min_not_zero(t->max_segment_size,
b->max_segment_size);
t->misaligned |= b->misaligned;
alignment = queue_limit_alignment_offset(b, start);
/* Bottom device has different alignment. Check that it is
* compatible with the current top alignment.
*/
if (t->alignment_offset != alignment) {
top = max(t->physical_block_size, t->io_min)
+ t->alignment_offset;
bottom = max(b->physical_block_size, b->io_min) + alignment;
/* Verify that top and bottom intervals line up */
if (max(top, bottom) % min(top, bottom)) {
t->misaligned = 1;
ret = -1;
}
}
t->logical_block_size = max(t->logical_block_size,
b->logical_block_size);
t->physical_block_size = max(t->physical_block_size,
b->physical_block_size);
t->io_min = max(t->io_min, b->io_min);
t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
/* Set non-power-of-2 compatible chunk_sectors boundary */
if (b->chunk_sectors)
t->chunk_sectors = gcd(t->chunk_sectors, b->chunk_sectors);
/* Physical block size a multiple of the logical block size? */
if (t->physical_block_size & (t->logical_block_size - 1)) {
t->physical_block_size = t->logical_block_size;
t->misaligned = 1;
ret = -1;
}
/* Minimum I/O a multiple of the physical block size? */
if (t->io_min & (t->physical_block_size - 1)) {
t->io_min = t->physical_block_size;
t->misaligned = 1;
ret = -1;
}
/* Optimal I/O a multiple of the physical block size? */
if (t->io_opt & (t->physical_block_size - 1)) {
t->io_opt = 0;
t->misaligned = 1;
ret = -1;
}
/* chunk_sectors a multiple of the physical block size? */
if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) {
t->chunk_sectors = 0;
t->misaligned = 1;
ret = -1;
}
t->raid_partial_stripes_expensive =
max(t->raid_partial_stripes_expensive,
b->raid_partial_stripes_expensive);
/* Find lowest common alignment_offset */
t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment)
% max(t->physical_block_size, t->io_min);
/* Verify that new alignment_offset is on a logical block boundary */
if (t->alignment_offset & (t->logical_block_size - 1)) {
t->misaligned = 1;
ret = -1;
}
t->max_sectors = blk_round_down_sectors(t->max_sectors, t->logical_block_size);
t->max_hw_sectors = blk_round_down_sectors(t->max_hw_sectors, t->logical_block_size);
t->max_dev_sectors = blk_round_down_sectors(t->max_dev_sectors, t->logical_block_size);
/* Discard alignment and granularity */
if (b->discard_granularity) {
alignment = queue_limit_discard_alignment(b, start);
if (t->discard_granularity != 0 &&
t->discard_alignment != alignment) {
top = t->discard_granularity + t->discard_alignment;
bottom = b->discard_granularity + alignment;
/* Verify that top and bottom intervals line up */
if ((max(top, bottom) % min(top, bottom)) != 0)
t->discard_misaligned = 1;
}
t->max_discard_sectors = min_not_zero(t->max_discard_sectors,
b->max_discard_sectors);
t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors,
b->max_hw_discard_sectors);
t->discard_granularity = max(t->discard_granularity,
b->discard_granularity);
t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) %
t->discard_granularity;
}
t->zone_write_granularity = max(t->zone_write_granularity,
b->zone_write_granularity);
t->zoned = max(t->zoned, b->zoned);
return ret;
}
EXPORT_SYMBOL(blk_stack_limits);
/**
* disk_stack_limits - adjust queue limits for stacked drivers
* @disk: MD/DM gendisk (top)
* @bdev: the underlying block device (bottom)
* @offset: offset to beginning of data within component device
*
* Description:
* Merges the limits for a top level gendisk and a bottom level
* block_device.
*/
void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
sector_t offset)
{
struct request_queue *t = disk->queue;
if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits,
get_start_sect(bdev) + (offset >> 9)) < 0)
pr_notice("%s: Warning: Device %pg is misaligned\n",
disk->disk_name, bdev);
disk_update_readahead(disk);
}
EXPORT_SYMBOL(disk_stack_limits);
/**
* blk_queue_update_dma_pad - update pad mask
* @q: the request queue for the device
* @mask: pad mask
*
* Update dma pad mask.
*
* Appending pad buffer to a request modifies the last entry of a
* scatter list such that it includes the pad buffer.
**/
void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask)
{
if (mask > q->dma_pad_mask)
q->dma_pad_mask = mask;
}
EXPORT_SYMBOL(blk_queue_update_dma_pad);
/**
* blk_queue_segment_boundary - set boundary rules for segment merging
* @q: the request queue for the device
* @mask: the memory boundary mask
**/
void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
{
if (mask < PAGE_SIZE - 1) {
mask = PAGE_SIZE - 1;
printk(KERN_INFO "%s: set to minimum %lx\n",
__func__, mask);
}
q->limits.seg_boundary_mask = mask;
}
EXPORT_SYMBOL(blk_queue_segment_boundary);
/**
* blk_queue_virt_boundary - set boundary rules for bio merging
* @q: the request queue for the device
* @mask: the memory boundary mask
**/
void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask)
{
q->limits.virt_boundary_mask = mask;
/*
* Devices that require a virtual boundary do not support scatter/gather
* I/O natively, but instead require a descriptor list entry for each
* page (which might not be idential to the Linux PAGE_SIZE). Because
* of that they are not limited by our notion of "segment size".
*/
if (mask)
q->limits.max_segment_size = UINT_MAX;
}
EXPORT_SYMBOL(blk_queue_virt_boundary);
/**
* blk_queue_dma_alignment - set dma length and memory alignment
* @q: the request queue for the device
* @mask: alignment mask
*
* description:
* set required memory and length alignment for direct dma transactions.
* this is used when building direct io requests for the queue.
*
**/
void blk_queue_dma_alignment(struct request_queue *q, int mask)
{
q->dma_alignment = mask;
}
EXPORT_SYMBOL(blk_queue_dma_alignment);
/**
* blk_queue_update_dma_alignment - update dma length and memory alignment
* @q: the request queue for the device
* @mask: alignment mask
*
* description:
* update required memory and length alignment for direct dma transactions.
* If the requested alignment is larger than the current alignment, then
* the current queue alignment is updated to the new value, otherwise it
* is left alone. The design of this is to allow multiple objects
* (driver, device, transport etc) to set their respective
* alignments without having them interfere.
*
**/
void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
{
BUG_ON(mask > PAGE_SIZE);
if (mask > q->dma_alignment)
q->dma_alignment = mask;
}
EXPORT_SYMBOL(blk_queue_update_dma_alignment);
/**
* blk_set_queue_depth - tell the block layer about the device queue depth
* @q: the request queue for the device
* @depth: queue depth
*
*/
void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
{
q->queue_depth = depth;
rq_qos_queue_depth_changed(q);
}
EXPORT_SYMBOL(blk_set_queue_depth);
/**
* blk_queue_write_cache - configure queue's write cache
* @q: the request queue for the device
* @wc: write back cache on or off
* @fua: device supports FUA writes, if true
*
* Tell the block layer about the write cache of @q.
*/
void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
{
if (wc) blk_queue_flag_set(QUEUE_FLAG_WC, q);
else
blk_queue_flag_clear(QUEUE_FLAG_WC, q); if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q);
else
blk_queue_flag_clear(QUEUE_FLAG_FUA, q); wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
/**
* blk_queue_required_elevator_features - Set a queue required elevator features
* @q: the request queue for the target device
* @features: Required elevator features OR'ed together
*
* Tell the block layer that for the device controlled through @q, only the
* only elevators that can be used are those that implement at least the set of
* features specified by @features.
*/
void blk_queue_required_elevator_features(struct request_queue *q,
unsigned int features)
{
q->required_elevator_features = features;
}
EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
/**
* blk_queue_can_use_dma_map_merging - configure queue for merging segments.
* @q: the request queue for the device
* @dev: the device pointer for dma
*
* Tell the block layer about merging the segments by dma map of @q.
*/
bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
struct device *dev)
{
unsigned long boundary = dma_get_merge_boundary(dev);
if (!boundary)
return false;
/* No need to update max_segment_size. see blk_queue_virt_boundary() */
blk_queue_virt_boundary(q, boundary);
return true;
}
EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
static bool disk_has_partitions(struct gendisk *disk)
{
unsigned long idx;
struct block_device *part;
bool ret = false;
rcu_read_lock();
xa_for_each(&disk->part_tbl, idx, part) {
if (bdev_is_partition(part)) {
ret = true;
break;
}
}
rcu_read_unlock();
return ret;
}
/**
* blk_queue_set_zoned - configure a disk queue zoned model.
* @disk: the gendisk of the queue to configure
* @model: the zoned model to set
*
* Set the zoned model of the request queue of @disk according to @model.
* When @model is BLK_ZONED_HM (host managed), this should be called only
* if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option).
* If @model specifies BLK_ZONED_HA (host aware), the effective model used
* depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions
* on the disk.
*/
void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
{
struct request_queue *q = disk->queue;
switch (model) {
case BLK_ZONED_HM:
/*
* Host managed devices are supported only if
* CONFIG_BLK_DEV_ZONED is enabled.
*/
WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
break;
case BLK_ZONED_HA:
/*
* Host aware devices can be treated either as regular block
* devices (similar to drive managed devices) or as zoned block
* devices to take advantage of the zone command set, similarly
* to host managed devices. We try the latter if there are no
* partitions and zoned block device support is enabled, else
* we do nothing special as far as the block layer is concerned.
*/
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) ||
disk_has_partitions(disk))
model = BLK_ZONED_NONE;
break;
case BLK_ZONED_NONE:
default:
if (WARN_ON_ONCE(model != BLK_ZONED_NONE))
model = BLK_ZONED_NONE;
break;
}
q->limits.zoned = model;
if (model != BLK_ZONED_NONE) {
/*
* Set the zone write granularity to the device logical block
* size by default. The driver can change this value if needed.
*/
blk_queue_zone_write_granularity(q,
queue_logical_block_size(q));
} else {
blk_queue_clear_zone_settings(q);
}
}
EXPORT_SYMBOL_GPL(blk_queue_set_zoned);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ERR_H
#define _LINUX_ERR_H
#include <linux/compiler.h>
#include <linux/types.h>
#include <asm/errno.h>
/*
* Kernel pointers have redundant information, so we can use a
* scheme where we can return either an error code or a normal
* pointer with the same return value.
*
* This should be a per-architecture thing, to allow different
* error and pointer decisions.
*/
#define MAX_ERRNO 4095
#ifndef __ASSEMBLY__
#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO)
static inline void * __must_check ERR_PTR(long error)
{
return (void *) error;
}
static inline long __must_check PTR_ERR(__force const void *ptr)
{
return (long) ptr;
}
static inline bool __must_check IS_ERR(__force const void *ptr)
{
return IS_ERR_VALUE((unsigned long)ptr);
}
static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
{
return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr);
}
/**
* ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
* @ptr: The pointer to cast.
*
* Explicitly cast an error-valued pointer to another pointer type in such a
* way as to make it clear that's what's going on.
*/
static inline void * __must_check ERR_CAST(__force const void *ptr)
{
/* cast away the const */
return (void *) ptr;
}
static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
if (IS_ERR(ptr))
return PTR_ERR(ptr);
else
return 0;
}
#endif
#endif /* _LINUX_ERR_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Functions to sequence PREFLUSH and FUA writes.
*
* Copyright (C) 2011 Max Planck Institute for Gravitational Physics
* Copyright (C) 2011 Tejun Heo <tj@kernel.org>
*
* REQ_{PREFLUSH|FUA} requests are decomposed to sequences consisted of three
* optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
* properties and hardware capability.
*
* If a request doesn't have data, only REQ_PREFLUSH makes sense, which
* indicates a simple flush request. If there is data, REQ_PREFLUSH indicates
* that the device cache should be flushed before the data is executed, and
* REQ_FUA means that the data must be on non-volatile media on request
* completion.
*
* If the device doesn't have writeback cache, PREFLUSH and FUA don't make any
* difference. The requests are either completed immediately if there's no data
* or executed as normal requests otherwise.
*
* If the device has writeback cache and supports FUA, REQ_PREFLUSH is
* translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
*
* If the device has writeback cache and doesn't support FUA, REQ_PREFLUSH
* is translated to PREFLUSH and REQ_FUA to POSTFLUSH.
*
* The actual execution of flush is double buffered. Whenever a request
* needs to execute PRE or POSTFLUSH, it queues at
* fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a
* REQ_OP_FLUSH is issued and the pending_idx is toggled. When the flush
* completes, all the requests which were pending are proceeded to the next
* step. This allows arbitrary merging of different types of PREFLUSH/FUA
* requests.
*
* Currently, the following conditions are used to determine when to issue
* flush.
*
* C1. At any given time, only one flush shall be in progress. This makes
* double buffering sufficient.
*
* C2. Flush is deferred if any request is executing DATA of its sequence.
* This avoids issuing separate POSTFLUSHes for requests which shared
* PREFLUSH.
*
* C3. The second condition is ignored if there is a request which has
* waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
* starvation in the unlikely case where there are continuous stream of
* FUA (without PREFLUSH) requests.
*
* For devices which support FUA, it isn't clear whether C2 (and thus C3)
* is beneficial.
*
* Note that a sequenced PREFLUSH/FUA request with DATA is completed twice.
* Once while executing DATA and again after the whole sequence is
* complete. The first completion updates the contained bio but doesn't
* finish it so that the bio submitter is notified only after the whole
* sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in
* req_bio_endio().
*
* The above peculiarity requires that each PREFLUSH/FUA request has only one
* bio attached to it, which is guaranteed as they aren't allowed to be
* merged in the usual way.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
/* PREFLUSH/FUA sequences */
enum {
REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */
REQ_FSEQ_DATA = (1 << 1), /* data write in progress */
REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */
REQ_FSEQ_DONE = (1 << 3),
REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
REQ_FSEQ_POSTFLUSH,
/*
* If flush has been pending longer than the following timeout,
* it's issued even if flush_data requests are still in flight.
*/
FLUSH_PENDING_TIMEOUT = 5 * HZ,
};
static void blk_kick_flush(struct request_queue *q,
struct blk_flush_queue *fq, unsigned int flags);
static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
{
unsigned int policy = 0;
if (blk_rq_sectors(rq))
policy |= REQ_FSEQ_DATA;
if (fflags & (1UL << QUEUE_FLAG_WC)) { if (rq->cmd_flags & REQ_PREFLUSH) policy |= REQ_FSEQ_PREFLUSH; if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
(rq->cmd_flags & REQ_FUA))
policy |= REQ_FSEQ_POSTFLUSH;
}
return policy;
}
static unsigned int blk_flush_cur_seq(struct request *rq)
{
return 1 << ffz(rq->flush.seq);
}
static void blk_flush_restore_request(struct request *rq)
{
/*
* After flush data completion, @rq->bio is %NULL but we need to
* complete the bio again. @rq->biotail is guaranteed to equal the
* original @rq->bio. Restore it.
*/
rq->bio = rq->biotail;
/* make @rq a normal request */
rq->rq_flags &= ~RQF_FLUSH_SEQ;
rq->end_io = rq->flush.saved_end_io;
}
static void blk_flush_queue_rq(struct request *rq, bool add_front)
{
blk_mq_add_to_requeue_list(rq, add_front, true);
}
static void blk_account_io_flush(struct request *rq)
{
struct block_device *part = rq->rq_disk->part0;
part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]);
part_stat_add(part, nsecs[STAT_FLUSH],
ktime_get_ns() - rq->start_time_ns);
part_stat_unlock();
}
/**
* blk_flush_complete_seq - complete flush sequence
* @rq: PREFLUSH/FUA request being sequenced
* @fq: flush queue
* @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
* @error: whether an error occurred
*
* @rq just completed @seq part of its flush sequence, record the
* completion and trigger the next step.
*
* CONTEXT:
* spin_lock_irq(fq->mq_flush_lock)
*/
static void blk_flush_complete_seq(struct request *rq,
struct blk_flush_queue *fq,
unsigned int seq, blk_status_t error)
{
struct request_queue *q = rq->q; struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
unsigned int cmd_flags;
BUG_ON(rq->flush.seq & seq); rq->flush.seq |= seq;
cmd_flags = rq->cmd_flags;
if (likely(!error))
seq = blk_flush_cur_seq(rq);
else
seq = REQ_FSEQ_DONE;
switch (seq) {
case REQ_FSEQ_PREFLUSH:
case REQ_FSEQ_POSTFLUSH:
/* queue for flush */
if (list_empty(pending))
fq->flush_pending_since = jiffies; list_move_tail(&rq->flush.list, pending);
break;
case REQ_FSEQ_DATA:
list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
blk_flush_queue_rq(rq, true);
break;
case REQ_FSEQ_DONE:
/*
* @rq was previously adjusted by blk_insert_flush() for
* flush sequencing and may already have gone through the
* flush data request completion path. Restore @rq for
* normal completion and end it.
*/
BUG_ON(!list_empty(&rq->queuelist)); list_del_init(&rq->flush.list);
blk_flush_restore_request(rq);
blk_mq_end_request(rq, error);
break;
default:
BUG();
}
blk_kick_flush(q, fq, cmd_flags);
}
static void flush_end_io(struct request *flush_rq, blk_status_t error)
{
struct request_queue *q = flush_rq->q;
struct list_head *running;
struct request *rq, *n;
unsigned long flags = 0;
struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
if (!refcount_dec_and_test(&flush_rq->ref)) {
fq->rq_status = error;
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
return;
}
blk_account_io_flush(flush_rq);
/*
* Flush request has to be marked as IDLE when it is really ended
* because its .end_io() is called from timeout code path too for
* avoiding use-after-free.
*/
WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
if (fq->rq_status != BLK_STS_OK) {
error = fq->rq_status;
fq->rq_status = BLK_STS_OK;
}
if (!q->elevator) {
flush_rq->tag = BLK_MQ_NO_TAG;
} else {
blk_mq_put_driver_tag(flush_rq);
flush_rq->internal_tag = BLK_MQ_NO_TAG;
}
running = &fq->flush_queue[fq->flush_running_idx];
BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
/* account completion of the flush request */
fq->flush_running_idx ^= 1;
/* and push the waiting requests to the next stage */
list_for_each_entry_safe(rq, n, running, flush.list) {
unsigned int seq = blk_flush_cur_seq(rq);
BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
blk_flush_complete_seq(rq, fq, seq, error);
}
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
}
bool is_flush_rq(struct request *rq)
{
return rq->end_io == flush_end_io;
}
/**
* blk_kick_flush - consider issuing flush request
* @q: request_queue being kicked
* @fq: flush queue
* @flags: cmd_flags of the original request
*
* Flush related states of @q have changed, consider issuing flush request.
* Please read the comment at the top of this file for more info.
*
* CONTEXT:
* spin_lock_irq(fq->mq_flush_lock)
*
*/
static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
unsigned int flags)
{
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
struct request *first_rq =
list_first_entry(pending, struct request, flush.list);
struct request *flush_rq = fq->flush_rq;
/* C1 described at the top of this file */
if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
return;
/* C2 and C3 */
if (!list_empty(&fq->flush_data_in_flight) && time_before(jiffies,
fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
return;
/*
* Issue flush and toggle pending_idx. This makes pending_idx
* different from running_idx, which means flush is in flight.
*/
fq->flush_pending_idx ^= 1;
blk_rq_init(q, flush_rq);
/*
* In case of none scheduler, borrow tag from the first request
* since they can't be in flight at the same time. And acquire
* the tag's ownership for flush req.
*
* In case of IO scheduler, flush rq need to borrow scheduler tag
* just for cheating put/get driver tag.
*/
flush_rq->mq_ctx = first_rq->mq_ctx;
flush_rq->mq_hctx = first_rq->mq_hctx;
if (!q->elevator) {
flush_rq->tag = first_rq->tag;
/*
* We borrow data request's driver tag, so have to mark
* this flush request as INFLIGHT for avoiding double
* account of this driver tag
*/
flush_rq->rq_flags |= RQF_MQ_INFLIGHT;
} else
flush_rq->internal_tag = first_rq->internal_tag;
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
flush_rq->rq_flags |= RQF_FLUSH_SEQ;
flush_rq->rq_disk = first_rq->rq_disk;
flush_rq->end_io = flush_end_io;
/*
* Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
* implied in refcount_inc_not_zero() called from
* blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref
* and READ flush_rq->end_io
*/
smp_wmb();
refcount_set(&flush_rq->ref, 1);
blk_flush_queue_rq(flush_rq, false);
}
static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
{
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
struct blk_mq_ctx *ctx = rq->mq_ctx;
unsigned long flags;
struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
if (q->elevator) {
WARN_ON(rq->tag < 0);
blk_mq_put_driver_tag(rq);
}
/*
* After populating an empty queue, kick it to avoid stall. Read
* the comment in flush_end_io().
*/
spin_lock_irqsave(&fq->mq_flush_lock, flags);
blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
blk_mq_sched_restart(hctx);
}
/**
* blk_insert_flush - insert a new PREFLUSH/FUA request
* @rq: request to insert
*
* To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
* or __blk_mq_run_hw_queue() to dispatch request.
* @rq is being submitted. Analyze what needs to be done and put it on the
* right queue.
*/
void blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
unsigned long fflags = q->queue_flags; /* may change, cache */
unsigned int policy = blk_flush_policy(fflags, rq);
struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
/*
* @policy now records what operations need to be done. Adjust
* REQ_PREFLUSH and FUA for the driver.
*/
rq->cmd_flags &= ~REQ_PREFLUSH;
if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
rq->cmd_flags &= ~REQ_FUA;
/*
* REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any
* of those flags, we have to set REQ_SYNC to avoid skewing
* the request accounting.
*/
rq->cmd_flags |= REQ_SYNC;
/*
* An empty flush handed down from a stacking driver may
* translate into nothing if the underlying device does not
* advertise a write-back cache. In this case, simply
* complete the request.
*/
if (!policy) {
blk_mq_end_request(rq, 0);
return;
}
BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
/*
* If there's data but flush is not necessary, the request can be
* processed directly without going through flush machinery. Queue
* for normal execution.
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
blk_mq_request_bypass_insert(rq, false, false);
return;
}
/*
* @rq should go through flush machinery. Mark it part of flush
* sequence and submit for further processing.
*/
memset(&rq->flush, 0, sizeof(rq->flush));
INIT_LIST_HEAD(&rq->flush.list);
rq->rq_flags |= RQF_FLUSH_SEQ;
rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
rq->end_io = mq_flush_data_end_io;
spin_lock_irq(&fq->mq_flush_lock);
blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
spin_unlock_irq(&fq->mq_flush_lock);
}
/**
* blkdev_issue_flush - queue a flush
* @bdev: blockdev to issue flush for
*
* Description:
* Issue a flush for the block device in question.
*/
int blkdev_issue_flush(struct block_device *bdev)
{
struct bio bio;
bio_init(&bio, NULL, 0); bio_set_dev(&bio, bdev);
bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
return submit_bio_wait(&bio);
}
EXPORT_SYMBOL(blkdev_issue_flush);
struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
gfp_t flags)
{
struct blk_flush_queue *fq;
int rq_sz = sizeof(struct request);
fq = kzalloc_node(sizeof(*fq), flags, node);
if (!fq)
goto fail;
spin_lock_init(&fq->mq_flush_lock);
rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
fq->flush_rq = kzalloc_node(rq_sz, flags, node);
if (!fq->flush_rq)
goto fail_rq;
INIT_LIST_HEAD(&fq->flush_queue[0]);
INIT_LIST_HEAD(&fq->flush_queue[1]);
INIT_LIST_HEAD(&fq->flush_data_in_flight);
return fq;
fail_rq:
kfree(fq);
fail:
return NULL;
}
void blk_free_flush_queue(struct blk_flush_queue *fq)
{
/* bio based request queue hasn't flush queue */
if (!fq)
return;
kfree(fq->flush_rq);
kfree(fq);
}
/*
* Allow driver to set its own lock class to fq->mq_flush_lock for
* avoiding lockdep complaint.
*
* flush_end_io() may be called recursively from some driver, such as
* nvme-loop, so lockdep may complain 'possible recursive locking' because
* all 'struct blk_flush_queue' instance share same mq_flush_lock lock class
* key. We need to assign different lock class for these driver's
* fq->mq_flush_lock for avoiding the lockdep warning.
*
* Use dynamically allocated lock class key for each 'blk_flush_queue'
* instance is over-kill, and more worse it introduces horrible boot delay
* issue because synchronize_rcu() is implied in lockdep_unregister_key which
* is called for each hctx release. SCSI probing may synchronously create and
* destroy lots of MQ request_queues for non-existent devices, and some robot
* test kernel always enable lockdep option. It is observed that more than half
* an hour is taken during SCSI MQ probe with per-fq lock class.
*/
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
struct lock_class_key *key)
{
lockdep_set_class(&hctx->fq->mq_flush_lock, key);
}
EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
#ifndef __LINUX_OVERFLOW_H
#define __LINUX_OVERFLOW_H
#include <linux/compiler.h>
#include <linux/limits.h>
/*
* We need to compute the minimum and maximum values representable in a given
* type. These macros may also be useful elsewhere. It would seem more obvious
* to do something like:
*
* #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
* #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
*
* Unfortunately, the middle expressions, strictly speaking, have
* undefined behaviour, and at least some versions of gcc warn about
* the type_max expression (but not if -fsanitize=undefined is in
* effect; in that case, the warning is deferred to runtime...).
*
* The slightly excessive casting in type_min is to make sure the
* macros also produce sensible values for the exotic type _Bool. [The
* overflow checkers only almost work for _Bool, but that's
* a-feature-not-a-bug, since people shouldn't be doing arithmetic on
* _Bools. Besides, the gcc builtins don't allow _Bool* as third
* argument.]
*
* Idea stolen from
* https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
* credit to Christian Biere.
*/
#define is_signed_type(type) (((type)(-1)) < (type)1)
#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
#define type_min(T) ((T)((T)-type_max(T)-(T)1))
/*
* Avoids triggering -Wtype-limits compilation warning,
* while using unsigned data types to check a < 0.
*/
#define is_non_negative(a) ((a) > 0 || (a) == 0)
#define is_negative(a) (!(is_non_negative(a)))
/*
* Allows for effectively applying __must_check to a macro so we can have
* both the type-agnostic benefits of the macros while also being able to
* enforce that the return value is, in fact, checked.
*/
static inline bool __must_check __must_check_overflow(bool overflow)
{
return unlikely(overflow);
}
/*
* For simplicity and code hygiene, the fallback code below insists on
* a, b and *d having the same type (similar to the min() and max()
* macros), whereas gcc's type-generic overflow checkers accept
* different types. Hence we don't just make check_add_overflow an
* alias for __builtin_add_overflow, but add type checks similar to
* below.
*/
#define check_add_overflow(a, b, d) __must_check_overflow(({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
__builtin_add_overflow(__a, __b, __d); \
}))
#define check_sub_overflow(a, b, d) __must_check_overflow(({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
__builtin_sub_overflow(__a, __b, __d); \
}))
#define check_mul_overflow(a, b, d) __must_check_overflow(({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
__builtin_mul_overflow(__a, __b, __d); \
}))
/** check_shl_overflow() - Calculate a left-shifted value and check overflow
*
* @a: Value to be shifted
* @s: How many bits left to shift
* @d: Pointer to where to store the result
*
* Computes *@d = (@a << @s)
*
* Returns true if '*d' cannot hold the result or when 'a << s' doesn't
* make sense. Example conditions:
* - 'a << s' causes bits to be lost when stored in *d.
* - 's' is garbage (e.g. negative) or so large that the result of
* 'a << s' is guaranteed to be 0.
* - 'a' is negative.
* - 'a << s' sets the sign bit, if any, in '*d'.
*
* '*d' will hold the results of the attempted shift, but is not
* considered "safe for use" if true is returned.
*/
#define check_shl_overflow(a, s, d) __must_check_overflow(({ \
typeof(a) _a = a; \
typeof(s) _s = s; \
typeof(d) _d = d; \
u64 _a_full = _a; \
unsigned int _to_shift = \
is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \
*_d = (_a_full << _to_shift); \
(_to_shift != _s || is_negative(*_d) || is_negative(_a) || \
(*_d >> _to_shift) != _a); \
}))
/**
* array_size() - Calculate size of 2-dimensional array.
*
* @a: dimension one
* @b: dimension two
*
* Calculates size of 2-dimensional array: @a * @b.
*
* Returns: number of bytes needed to represent the array or SIZE_MAX on
* overflow.
*/
static inline __must_check size_t array_size(size_t a, size_t b)
{
size_t bytes;
if (check_mul_overflow(a, b, &bytes))
return SIZE_MAX;
return bytes;
}
/**
* array3_size() - Calculate size of 3-dimensional array.
*
* @a: dimension one
* @b: dimension two
* @c: dimension three
*
* Calculates size of 3-dimensional array: @a * @b * @c.
*
* Returns: number of bytes needed to represent the array or SIZE_MAX on
* overflow.
*/
static inline __must_check size_t array3_size(size_t a, size_t b, size_t c)
{
size_t bytes;
if (check_mul_overflow(a, b, &bytes))
return SIZE_MAX;
if (check_mul_overflow(bytes, c, &bytes))
return SIZE_MAX;
return bytes;
}
/*
* Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for
* struct_size() below.
*/
static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c)
{
size_t bytes;
if (check_mul_overflow(a, b, &bytes))
return SIZE_MAX;
if (check_add_overflow(bytes, c, &bytes))
return SIZE_MAX;
return bytes;
}
/**
* struct_size() - Calculate size of structure with trailing array.
* @p: Pointer to the structure.
* @member: Name of the array member.
* @count: Number of elements in the array.
*
* Calculates size of memory needed for structure @p followed by an
* array of @count number of @member elements.
*
* Return: number of bytes needed or SIZE_MAX on overflow.
*/
#define struct_size(p, member, count) \
__ab_c_size(count, \
sizeof(*(p)->member) + __must_be_array((p)->member),\
sizeof(*(p)))
/**
* flex_array_size() - Calculate size of a flexible array member
* within an enclosing structure.
*
* @p: Pointer to the structure.
* @member: Name of the flexible array member.
* @count: Number of elements in the array.
*
* Calculates size of a flexible array of @count number of @member
* elements, at the end of structure @p.
*
* Return: number of bytes needed or SIZE_MAX on overflow.
*/
#define flex_array_size(p, member, count) \
array_size(count, \
sizeof(*(p)->member) + __must_be_array((p)->member))
#endif /* __LINUX_OVERFLOW_H */
/*
* memfd_create system call and file sealing support
*
* Code was originally included in shmem.c, and broken out to facilitate
* use by hugetlbfs as well as tmpfs.
*
* This file is released under the GPL.
*/
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/khugepaged.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
#include <uapi/linux/memfd.h>
/*
* We need a tag: a new tag would expand every xa_node by 8 bytes,
* so reuse a tag which we firmly believe is never set or cleared on tmpfs
* or hugetlbfs because they are memory only filesystems.
*/
#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
#define LAST_SCAN 4 /* about 150ms max */
static void memfd_tag_pins(struct xa_state *xas)
{
struct page *page;
int latency = 0;
int cache_count;
lru_add_drain();
xas_lock_irq(xas);
xas_for_each(xas, page, ULONG_MAX) {
cache_count = 1;
if (!xa_is_value(page) &&
PageTransHuge(page) && !PageHuge(page))
cache_count = HPAGE_PMD_NR;
if (!xa_is_value(page) &&
page_count(page) - total_mapcount(page) != cache_count)
xas_set_mark(xas, MEMFD_TAG_PINNED);
if (cache_count != 1)
xas_set(xas, page->index + cache_count);
latency += cache_count;
if (latency < XA_CHECK_SCHED)
continue;
latency = 0;
xas_pause(xas);
xas_unlock_irq(xas);
cond_resched();
xas_lock_irq(xas);
}
xas_unlock_irq(xas);
}
/*
* Setting SEAL_WRITE requires us to verify there's no pending writer. However,
* via get_user_pages(), drivers might have some pending I/O without any active
* user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
* and see whether it has an elevated ref-count. If so, we tag them and wait for
* them to be dropped.
* The caller must guarantee that no new user will acquire writable references
* to those pages to avoid races.
*/
static int memfd_wait_for_pins(struct address_space *mapping)
{
XA_STATE(xas, &mapping->i_pages, 0);
struct page *page;
int error, scan;
memfd_tag_pins(&xas);
error = 0;
for (scan = 0; scan <= LAST_SCAN; scan++) {
int latency = 0;
int cache_count;
if (!xas_marked(&xas, MEMFD_TAG_PINNED))
break;
if (!scan)
lru_add_drain_all();
else if (schedule_timeout_killable((HZ << scan) / 200))
scan = LAST_SCAN;
xas_set(&xas, 0);
xas_lock_irq(&xas);
xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
bool clear = true;
cache_count = 1;
if (!xa_is_value(page) &&
PageTransHuge(page) && !PageHuge(page))
cache_count = HPAGE_PMD_NR;
if (!xa_is_value(page) && cache_count !=
page_count(page) - total_mapcount(page)) {
/*
* On the last scan, we clean up all those tags
* we inserted; but make a note that we still
* found pages pinned.
*/
if (scan == LAST_SCAN)
error = -EBUSY;
else
clear = false;
}
if (clear)
xas_clear_mark(&xas, MEMFD_TAG_PINNED);
latency += cache_count;
if (latency < XA_CHECK_SCHED)
continue;
latency = 0;
xas_pause(&xas);
xas_unlock_irq(&xas);
cond_resched();
xas_lock_irq(&xas);
}
xas_unlock_irq(&xas);
}
return error;
}
static unsigned int *memfd_file_seals_ptr(struct file *file)
{
if (shmem_file(file))
return &SHMEM_I(file_inode(file))->seals;
#ifdef CONFIG_HUGETLBFS
if (is_file_hugepages(file))
return &HUGETLBFS_I(file_inode(file))->seals;
#endif
return NULL;
}
#define F_ALL_SEALS (F_SEAL_SEAL | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
F_SEAL_WRITE | \
F_SEAL_FUTURE_WRITE)
static int memfd_add_seals(struct file *file, unsigned int seals)
{
struct inode *inode = file_inode(file);
unsigned int *file_seals;
int error;
/*
* SEALING
* Sealing allows multiple parties to share a tmpfs or hugetlbfs file
* but restrict access to a specific subset of file operations. Seals
* can only be added, but never removed. This way, mutually untrusted
* parties can share common memory regions with a well-defined policy.
* A malicious peer can thus never perform unwanted operations on a
* shared object.
*
* Seals are only supported on special tmpfs or hugetlbfs files and
* always affect the whole underlying inode. Once a seal is set, it
* may prevent some kinds of access to the file. Currently, the
* following seals are defined:
* SEAL_SEAL: Prevent further seals from being set on this file
* SEAL_SHRINK: Prevent the file from shrinking
* SEAL_GROW: Prevent the file from growing
* SEAL_WRITE: Prevent write access to the file
*
* As we don't require any trust relationship between two parties, we
* must prevent seals from being removed. Therefore, sealing a file
* only adds a given set of seals to the file, it never touches
* existing seals. Furthermore, the "setting seals"-operation can be
* sealed itself, which basically prevents any further seal from being
* added.
*
* Semantics of sealing are only defined on volatile files. Only
* anonymous tmpfs and hugetlbfs files support sealing. More
* importantly, seals are never written to disk. Therefore, there's
* no plan to support it on other file types.
*/
if (!(file->f_mode & FMODE_WRITE))
return -EPERM;
if (seals & ~(unsigned int)F_ALL_SEALS)
return -EINVAL;
inode_lock(inode);
file_seals = memfd_file_seals_ptr(file);
if (!file_seals) {
error = -EINVAL;
goto unlock;
}
if (*file_seals & F_SEAL_SEAL) {
error = -EPERM;
goto unlock;
}
if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
error = mapping_deny_writable(file->f_mapping);
if (error)
goto unlock;
error = memfd_wait_for_pins(file->f_mapping);
if (error) {
mapping_allow_writable(file->f_mapping);
goto unlock;
}
}
*file_seals |= seals;
error = 0;
unlock:
inode_unlock(inode);
return error;
}
static int memfd_get_seals(struct file *file)
{
unsigned int *seals = memfd_file_seals_ptr(file);
return seals ? *seals : -EINVAL;
}
long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
long error;
switch (cmd) {
case F_ADD_SEALS:
/* disallow upper 32bit */
if (arg > UINT_MAX)
return -EINVAL;
error = memfd_add_seals(file, arg);
break;
case F_GET_SEALS:
error = memfd_get_seals(file);
break;
default:
error = -EINVAL;
break;
}
return error;
}
#define MFD_NAME_PREFIX "memfd:"
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
{
unsigned int *file_seals;
struct file *file;
int fd, error;
char *name;
long len;
if (!(flags & MFD_HUGETLB)) {
if (flags & ~(unsigned int)MFD_ALL_FLAGS)
return -EINVAL;
} else {
/* Allow huge page size encoding in flags. */
if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
return -EINVAL;
}
/* length includes terminating zero */
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
if (len <= 0)
return -EFAULT;
if (len > MFD_NAME_MAX_LEN + 1)
return -EINVAL;
name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
if (!name)
return -ENOMEM;
strcpy(name, MFD_NAME_PREFIX);
if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
error = -EFAULT;
goto err_name;
}
/* terminating-zero may have changed after strnlen_user() returned */
if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
error = -EFAULT;
goto err_name;
}
fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
if (fd < 0) {
error = fd;
goto err_name;
}
if (flags & MFD_HUGETLB) { struct ucounts *ucounts = NULL; file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
} else
file = shmem_file_setup(name, 0, VM_NORESERVE);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto err_fd;
}
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
if (flags & MFD_ALLOW_SEALING) {
file_seals = memfd_file_seals_ptr(file);
*file_seals &= ~F_SEAL_SEAL;
}
fd_install(fd, file);
kfree(name);
return fd;
err_fd:
put_unused_fd(fd);
err_name:
kfree(name);
return error;
}
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET3: Garbage Collector For AF_UNIX sockets
*
* Garbage Collector:
* Copyright (C) Barak A. Pearlmutter.
*
* Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem.
* If it doesn't work blame me, it worked when Barak sent it.
*
* Assumptions:
*
* - object w/ a bit
* - free list
*
* Current optimizations:
*
* - explicit stack instead of recursion
* - tail recurse on first born instead of immediate push/pop
* - we gather the stuff that should not be killed into tree
* and stack is just a path from root to the current pointer.
*
* Future optimizations:
*
* - don't just push entire root set; process in place
*
* Fixes:
* Alan Cox 07 Sept 1997 Vmalloc internal stack as needed.
* Cope with changing max_files.
* Al Viro 11 Oct 1998
* Graph may have cycles. That is, we can send the descriptor
* of foo to bar and vice versa. Current code chokes on that.
* Fix: move SCM_RIGHTS ones into the separate list and then
* skb_free() them all instead of doing explicit fput's.
* Another problem: since fput() may block somebody may
* create a new unix_socket when we are in the middle of sweep
* phase. Fix: revert the logic wrt MARKED. Mark everything
* upon the beginning and unmark non-junk ones.
*
* [12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS
* sent to connect()'ed but still not accept()'ed sockets.
* Fixed. Old code had slightly different problem here:
* extra fput() in situation when we passed the descriptor via
* such socket and closed it (descriptor). That would happen on
* each unix_gc() until the accept(). Since the struct file in
* question would go to the free list and might be reused...
* That might be the reason of random oopses on filp_close()
* in unrelated processes.
*
* AV 28 Feb 1999
* Kill the explicit allocation of stack. Now we keep the tree
* with root in dummy + pointer (gc_current) to one of the nodes.
* Stack is represented as path from gc_current to dummy. Unmark
* now means "add to tree". Push == "make it a son of gc_current".
* Pop == "move gc_current to parent". We keep only pointers to
* parents (->gc_tree).
* AV 1 Mar 1999
* Damn. Added missing check for ->dead in listen queues scanning.
*
* Miklos Szeredi 25 Jun 2007
* Reimplement with a cycle collecting algorithm. This should
* solve several problems with the previous code, like being racy
* wrt receive and holding up unrelated socket operations.
*/
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/mutex.h>
#include <linux/wait.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <net/tcp_states.h>
#include "scm.h"
/* Internal data structures and random procedures: */
static LIST_HEAD(gc_candidates);
static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
struct sk_buff_head *hitlist)
{
struct sk_buff *skb;
struct sk_buff *next;
spin_lock(&x->sk_receive_queue.lock);
skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
/* Do we have file descriptors ? */
if (UNIXCB(skb).fp) {
bool hit = false;
/* Process the descriptors of this socket */
int nfd = UNIXCB(skb).fp->count;
struct file **fp = UNIXCB(skb).fp->fp;
while (nfd--) {
/* Get the socket the fd matches if it indeed does so */
struct sock *sk = unix_get_socket(*fp++);
if (sk) {
struct unix_sock *u = unix_sk(sk);
/* Ignore non-candidates, they could
* have been added to the queues after
* starting the garbage collection
*/
if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
hit = true;
func(u);
}
}
}
if (hit && hitlist != NULL) {
__skb_unlink(skb, &x->sk_receive_queue);
__skb_queue_tail(hitlist, skb);
}
}
}
spin_unlock(&x->sk_receive_queue.lock);
}
static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
struct sk_buff_head *hitlist)
{
if (x->sk_state != TCP_LISTEN) {
scan_inflight(x, func, hitlist);
} else {
struct sk_buff *skb;
struct sk_buff *next;
struct unix_sock *u;
LIST_HEAD(embryos);
/* For a listening socket collect the queued embryos
* and perform a scan on them as well.
*/
spin_lock(&x->sk_receive_queue.lock);
skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
u = unix_sk(skb->sk);
/* An embryo cannot be in-flight, so it's safe
* to use the list link.
*/
BUG_ON(!list_empty(&u->link));
list_add_tail(&u->link, &embryos);
}
spin_unlock(&x->sk_receive_queue.lock);
while (!list_empty(&embryos)) {
u = list_entry(embryos.next, struct unix_sock, link);
scan_inflight(&u->sk, func, hitlist);
list_del_init(&u->link);
}
}
}
static void dec_inflight(struct unix_sock *usk)
{
atomic_long_dec(&usk->inflight);
}
static void inc_inflight(struct unix_sock *usk)
{
atomic_long_inc(&usk->inflight);
}
static void inc_inflight_move_tail(struct unix_sock *u)
{
atomic_long_inc(&u->inflight);
/* If this still might be part of a cycle, move it to the end
* of the list, so that it's checked even if it was already
* passed over
*/
if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
list_move_tail(&u->link, &gc_candidates);
}
static bool gc_in_progress;
#define UNIX_INFLIGHT_TRIGGER_GC 16000
void wait_for_unix_gc(void)
{
/* If number of inflight sockets is insane,
* force a garbage collect right now.
* Paired with the WRITE_ONCE() in unix_inflight(),
* unix_notinflight() and gc_in_progress().
*/
if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && !READ_ONCE(gc_in_progress)) unix_gc(); wait_event(unix_gc_wait, gc_in_progress == false);}
/* The external entry point: unix_gc() */
void unix_gc(void)
{
struct unix_sock *u;
struct unix_sock *next;
struct sk_buff_head hitlist;
struct list_head cursor;
LIST_HEAD(not_cycle_list);
spin_lock(&unix_gc_lock);
/* Avoid a recursive GC. */
if (gc_in_progress)
goto out;
/* Paired with READ_ONCE() in wait_for_unix_gc(). */
WRITE_ONCE(gc_in_progress, true);
/* First, select candidates for garbage collection. Only
* in-flight sockets are considered, and from those only ones
* which don't have any external reference.
*
* Holding unix_gc_lock will protect these candidates from
* being detached, and hence from gaining an external
* reference. Since there are no possible receivers, all
* buffers currently on the candidates' queues stay there
* during the garbage collection.
*
* We also know that no new candidate can be added onto the
* receive queues. Other, non candidate sockets _can_ be
* added to queue, so we must make sure only to touch
* candidates.
*/
list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
long total_refs;
long inflight_refs;
total_refs = file_count(u->sk.sk_socket->file);
inflight_refs = atomic_long_read(&u->inflight);
BUG_ON(inflight_refs < 1);
BUG_ON(total_refs < inflight_refs);
if (total_refs == inflight_refs) {
list_move_tail(&u->link, &gc_candidates);
__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
}
}
/* Now remove all internal in-flight reference to children of
* the candidates.
*/
list_for_each_entry(u, &gc_candidates, link)
scan_children(&u->sk, dec_inflight, NULL);
/* Restore the references for children of all candidates,
* which have remaining references. Do this recursively, so
* only those remain, which form cyclic references.
*
* Use a "cursor" link, to make the list traversal safe, even
* though elements might be moved about.
*/
list_add(&cursor, &gc_candidates);
while (cursor.next != &gc_candidates) {
u = list_entry(cursor.next, struct unix_sock, link);
/* Move cursor to after the current position. */
list_move(&cursor, &u->link);
if (atomic_long_read(&u->inflight) > 0) {
list_move_tail(&u->link, ¬_cycle_list);
__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
scan_children(&u->sk, inc_inflight_move_tail, NULL);
}
}
list_del(&cursor);
/* Now gc_candidates contains only garbage. Restore original
* inflight counters for these as well, and remove the skbuffs
* which are creating the cycle(s).
*/
skb_queue_head_init(&hitlist);
list_for_each_entry(u, &gc_candidates, link)
scan_children(&u->sk, inc_inflight, &hitlist);
/* not_cycle_list contains those sockets which do not make up a
* cycle. Restore these to the inflight list.
*/
while (!list_empty(¬_cycle_list)) {
u = list_entry(not_cycle_list.next, struct unix_sock, link);
__clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
list_move_tail(&u->link, &gc_inflight_list);
}
spin_unlock(&unix_gc_lock);
/* Here we are. Hitlist is filled. Die. */
__skb_queue_purge(&hitlist);
spin_lock(&unix_gc_lock);
/* All candidates should have been detached by now. */
BUG_ON(!list_empty(&gc_candidates));
/* Paired with READ_ONCE() in wait_for_unix_gc(). */
WRITE_ONCE(gc_in_progress, false);
wake_up(&unix_gc_wait);
out:
spin_unlock(&unix_gc_lock);
}
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NetLabel Domain Hash Table
*
* This file manages the domain hash table that NetLabel uses to determine
* which network labeling protocol to use for a given domain. The NetLabel
* system manages static and dynamic label mappings for network protocols such
* as CIPSO and RIPSO.
*
* Author: Paul Moore <paul@paul-moore.com>
*/
/*
* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
*/
#include <linux/types.h>
#include <linux/rculist.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
#include <net/calipso.h>
#include <asm/bug.h>
#include "netlabel_mgmt.h"
#include "netlabel_addrlist.h"
#include "netlabel_calipso.h"
#include "netlabel_domainhash.h"
#include "netlabel_user.h"
struct netlbl_domhsh_tbl {
struct list_head *tbl;
u32 size;
};
/* Domain hash table */
/* updates should be so rare that having one spinlock for the entire hash table
* should be okay */
static DEFINE_SPINLOCK(netlbl_domhsh_lock);
#define netlbl_domhsh_rcu_deref(p) \
rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock))
static struct netlbl_domhsh_tbl __rcu *netlbl_domhsh;
static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv4;
static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv6;
/*
* Domain Hash Table Helper Functions
*/
/**
* netlbl_domhsh_free_entry - Frees a domain hash table entry
* @entry: the entry's RCU field
*
* Description:
* This function is designed to be used as a callback to the call_rcu()
* function so that the memory allocated to a hash table entry can be released
* safely.
*
*/
static void netlbl_domhsh_free_entry(struct rcu_head *entry)
{
struct netlbl_dom_map *ptr;
struct netlbl_af4list *iter4;
struct netlbl_af4list *tmp4;
#if IS_ENABLED(CONFIG_IPV6)
struct netlbl_af6list *iter6;
struct netlbl_af6list *tmp6;
#endif /* IPv6 */
ptr = container_of(entry, struct netlbl_dom_map, rcu);
if (ptr->def.type == NETLBL_NLTYPE_ADDRSELECT) {
netlbl_af4list_foreach_safe(iter4, tmp4,
&ptr->def.addrsel->list4) {
netlbl_af4list_remove_entry(iter4);
kfree(netlbl_domhsh_addr4_entry(iter4));
}
#if IS_ENABLED(CONFIG_IPV6)
netlbl_af6list_foreach_safe(iter6, tmp6,
&ptr->def.addrsel->list6) {
netlbl_af6list_remove_entry(iter6);
kfree(netlbl_domhsh_addr6_entry(iter6));
}
#endif /* IPv6 */
kfree(ptr->def.addrsel);
}
kfree(ptr->domain);
kfree(ptr);
}
/**
* netlbl_domhsh_hash - Hashing function for the domain hash table
* @key: the domain name to hash
*
* Description:
* This is the hashing function for the domain hash table, it returns the
* correct bucket number for the domain. The caller is responsible for
* ensuring that the hash table is protected with either a RCU read lock or the
* hash table lock.
*
*/
static u32 netlbl_domhsh_hash(const char *key)
{
u32 iter;
u32 val;
u32 len;
/* This is taken (with slight modification) from
* security/selinux/ss/symtab.c:symhash() */
for (iter = 0, val = 0, len = strlen(key); iter < len; iter++) val = (val << 4 | (val >> (8 * sizeof(u32) - 4))) ^ key[iter]; return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1);
}
static bool netlbl_family_match(u16 f1, u16 f2)
{
return (f1 == f2) || (f1 == AF_UNSPEC) || (f2 == AF_UNSPEC);
}
/**
* netlbl_domhsh_search - Search for a domain entry
* @domain: the domain
* @family: the address family
*
* Description:
* Searches the domain hash table and returns a pointer to the hash table
* entry if found, otherwise NULL is returned. @family may be %AF_UNSPEC
* which matches any address family entries. The caller is responsible for
* ensuring that the hash table is protected with either a RCU read lock or the
* hash table lock.
*
*/
static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain,
u16 family)
{
u32 bkt;
struct list_head *bkt_list;
struct netlbl_dom_map *iter;
if (domain != NULL) { bkt = netlbl_domhsh_hash(domain);
bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt];
list_for_each_entry_rcu(iter, bkt_list, list,
lockdep_is_held(&netlbl_domhsh_lock))
if (iter->valid && netlbl_family_match(iter->family, family) && strcmp(iter->domain, domain) == 0)
return iter;
}
return NULL;
}
/**
* netlbl_domhsh_search_def - Search for a domain entry
* @domain: the domain
* @family: the address family
*
* Description:
* Searches the domain hash table and returns a pointer to the hash table
* entry if an exact match is found, if an exact match is not present in the
* hash table then the default entry is returned if valid otherwise NULL is
* returned. @family may be %AF_UNSPEC which matches any address family
* entries. The caller is responsible ensuring that the hash table is
* protected with either a RCU read lock or the hash table lock.
*
*/
static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain,
u16 family)
{
struct netlbl_dom_map *entry;
entry = netlbl_domhsh_search(domain, family); if (entry != NULL)
return entry;
if (family == AF_INET || family == AF_UNSPEC) { entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv4); if (entry != NULL && entry->valid)
return entry;
}
if (family == AF_INET6 || family == AF_UNSPEC) { entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv6); if (entry != NULL && entry->valid)
return entry;
}
return NULL;
}
/**
* netlbl_domhsh_audit_add - Generate an audit entry for an add event
* @entry: the entry being added
* @addr4: the IPv4 address information
* @addr6: the IPv6 address information
* @result: the result code
* @audit_info: NetLabel audit information
*
* Description:
* Generate an audit record for adding a new NetLabel/LSM mapping entry with
* the given information. Caller is responsible for holding the necessary
* locks.
*
*/
static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
struct netlbl_af4list *addr4,
struct netlbl_af6list *addr6,
int result,
struct netlbl_audit *audit_info)
{
struct audit_buffer *audit_buf;
struct cipso_v4_doi *cipsov4 = NULL;
struct calipso_doi *calipso = NULL;
u32 type;
audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
if (audit_buf != NULL) {
audit_log_format(audit_buf, " nlbl_domain=%s",
entry->domain ? entry->domain : "(default)");
if (addr4 != NULL) {
struct netlbl_domaddr4_map *map4;
map4 = netlbl_domhsh_addr4_entry(addr4);
type = map4->def.type;
cipsov4 = map4->def.cipso;
netlbl_af4list_audit_addr(audit_buf, 0, NULL,
addr4->addr, addr4->mask);
#if IS_ENABLED(CONFIG_IPV6)
} else if (addr6 != NULL) {
struct netlbl_domaddr6_map *map6;
map6 = netlbl_domhsh_addr6_entry(addr6);
type = map6->def.type;
calipso = map6->def.calipso;
netlbl_af6list_audit_addr(audit_buf, 0, NULL,
&addr6->addr, &addr6->mask);
#endif /* IPv6 */
} else {
type = entry->def.type;
cipsov4 = entry->def.cipso;
calipso = entry->def.calipso;
}
switch (type) {
case NETLBL_NLTYPE_UNLABELED:
audit_log_format(audit_buf, " nlbl_protocol=unlbl");
break;
case NETLBL_NLTYPE_CIPSOV4:
BUG_ON(cipsov4 == NULL);
audit_log_format(audit_buf,
" nlbl_protocol=cipsov4 cipso_doi=%u",
cipsov4->doi);
break;
case NETLBL_NLTYPE_CALIPSO:
BUG_ON(calipso == NULL);
audit_log_format(audit_buf,
" nlbl_protocol=calipso calipso_doi=%u",
calipso->doi);
break;
}
audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0);
audit_log_end(audit_buf);
}
}
/**
* netlbl_domhsh_validate - Validate a new domain mapping entry
* @entry: the entry to validate
*
* This function validates the new domain mapping entry to ensure that it is
* a valid entry. Returns zero on success, negative values on failure.
*
*/
static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry)
{
struct netlbl_af4list *iter4;
struct netlbl_domaddr4_map *map4;
#if IS_ENABLED(CONFIG_IPV6)
struct netlbl_af6list *iter6;
struct netlbl_domaddr6_map *map6;
#endif /* IPv6 */
if (entry == NULL)
return -EINVAL;
if (entry->family != AF_INET && entry->family != AF_INET6 &&
(entry->family != AF_UNSPEC ||
entry->def.type != NETLBL_NLTYPE_UNLABELED))
return -EINVAL;
switch (entry->def.type) {
case NETLBL_NLTYPE_UNLABELED:
if (entry->def.cipso != NULL || entry->def.calipso != NULL ||
entry->def.addrsel != NULL)
return -EINVAL;
break;
case NETLBL_NLTYPE_CIPSOV4:
if (entry->family != AF_INET ||
entry->def.cipso == NULL)
return -EINVAL;
break;
case NETLBL_NLTYPE_CALIPSO:
if (entry->family != AF_INET6 ||
entry->def.calipso == NULL)
return -EINVAL;
break;
case NETLBL_NLTYPE_ADDRSELECT:
netlbl_af4list_foreach(iter4, &entry->def.addrsel->list4) {
map4 = netlbl_domhsh_addr4_entry(iter4);
switch (map4->def.type) {
case NETLBL_NLTYPE_UNLABELED:
if (map4->def.cipso != NULL)
return -EINVAL;
break;
case NETLBL_NLTYPE_CIPSOV4:
if (map4->def.cipso == NULL)
return -EINVAL;
break;
default:
return -EINVAL;
}
}
#if IS_ENABLED(CONFIG_IPV6)
netlbl_af6list_foreach(iter6, &entry->def.addrsel->list6) {
map6 = netlbl_domhsh_addr6_entry(iter6);
switch (map6->def.type) {
case NETLBL_NLTYPE_UNLABELED:
if (map6->def.calipso != NULL)
return -EINVAL;
break;
case NETLBL_NLTYPE_CALIPSO:
if (map6->def.calipso == NULL)
return -EINVAL;
break;
default:
return -EINVAL;
}
}
#endif /* IPv6 */
break;
default:
return -EINVAL;
}
return 0;
}
/*
* Domain Hash Table Functions
*/
/**
* netlbl_domhsh_init - Init for the domain hash
* @size: the number of bits to use for the hash buckets
*
* Description:
* Initializes the domain hash table, should be called only by
* netlbl_user_init() during initialization. Returns zero on success, non-zero
* values on error.
*
*/
int __init netlbl_domhsh_init(u32 size)
{
u32 iter;
struct netlbl_domhsh_tbl *hsh_tbl;
if (size == 0)
return -EINVAL;
hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
if (hsh_tbl == NULL)
return -ENOMEM;
hsh_tbl->size = 1 << size;
hsh_tbl->tbl = kcalloc(hsh_tbl->size,
sizeof(struct list_head),
GFP_KERNEL);
if (hsh_tbl->tbl == NULL) {
kfree(hsh_tbl);
return -ENOMEM;
}
for (iter = 0; iter < hsh_tbl->size; iter++)
INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);
spin_lock(&netlbl_domhsh_lock);
rcu_assign_pointer(netlbl_domhsh, hsh_tbl);
spin_unlock(&netlbl_domhsh_lock);
return 0;
}
/**
* netlbl_domhsh_add - Adds a entry to the domain hash table
* @entry: the entry to add
* @audit_info: NetLabel audit information
*
* Description:
* Adds a new entry to the domain hash table and handles any updates to the
* lower level protocol handler (i.e. CIPSO). @entry->family may be set to
* %AF_UNSPEC which will add an entry that matches all address families. This
* is only useful for the unlabelled type and will only succeed if there is no
* existing entry for any address family with the same domain. Returns zero
* on success, negative on failure.
*
*/
int netlbl_domhsh_add(struct netlbl_dom_map *entry,
struct netlbl_audit *audit_info)
{
int ret_val = 0;
struct netlbl_dom_map *entry_old, *entry_b;
struct netlbl_af4list *iter4;
struct netlbl_af4list *tmp4;
#if IS_ENABLED(CONFIG_IPV6)
struct netlbl_af6list *iter6;
struct netlbl_af6list *tmp6;
#endif /* IPv6 */
ret_val = netlbl_domhsh_validate(entry);
if (ret_val != 0)
return ret_val;
/* XXX - we can remove this RCU read lock as the spinlock protects the
* entire function, but before we do we need to fixup the
* netlbl_af[4,6]list RCU functions to do "the right thing" with
* respect to rcu_dereference() when only a spinlock is held. */
rcu_read_lock();
spin_lock(&netlbl_domhsh_lock);
if (entry->domain != NULL)
entry_old = netlbl_domhsh_search(entry->domain, entry->family);
else
entry_old = netlbl_domhsh_search_def(entry->domain,
entry->family);
if (entry_old == NULL) {
entry->valid = 1;
if (entry->domain != NULL) {
u32 bkt = netlbl_domhsh_hash(entry->domain);
list_add_tail_rcu(&entry->list,
&rcu_dereference(netlbl_domhsh)->tbl[bkt]);
} else {
INIT_LIST_HEAD(&entry->list);
switch (entry->family) {
case AF_INET:
rcu_assign_pointer(netlbl_domhsh_def_ipv4,
entry);
break;
case AF_INET6:
rcu_assign_pointer(netlbl_domhsh_def_ipv6,
entry);
break;
case AF_UNSPEC:
if (entry->def.type !=
NETLBL_NLTYPE_UNLABELED) {
ret_val = -EINVAL;
goto add_return;
}
entry_b = kzalloc(sizeof(*entry_b), GFP_ATOMIC);
if (entry_b == NULL) {
ret_val = -ENOMEM;
goto add_return;
}
entry_b->family = AF_INET6;
entry_b->def.type = NETLBL_NLTYPE_UNLABELED;
entry_b->valid = 1;
entry->family = AF_INET;
rcu_assign_pointer(netlbl_domhsh_def_ipv4,
entry);
rcu_assign_pointer(netlbl_domhsh_def_ipv6,
entry_b);
break;
default:
/* Already checked in
* netlbl_domhsh_validate(). */
ret_val = -EINVAL;
goto add_return;
}
}
if (entry->def.type == NETLBL_NLTYPE_ADDRSELECT) {
netlbl_af4list_foreach_rcu(iter4,
&entry->def.addrsel->list4)
netlbl_domhsh_audit_add(entry, iter4, NULL,
ret_val, audit_info);
#if IS_ENABLED(CONFIG_IPV6)
netlbl_af6list_foreach_rcu(iter6,
&entry->def.addrsel->list6)
netlbl_domhsh_audit_add(entry, NULL, iter6,
ret_val, audit_info);
#endif /* IPv6 */
} else
netlbl_domhsh_audit_add(entry, NULL, NULL,
ret_val, audit_info);
} else if (entry_old->def.type == NETLBL_NLTYPE_ADDRSELECT &&
entry->def.type == NETLBL_NLTYPE_ADDRSELECT) {
struct list_head *old_list4;
struct list_head *old_list6;
old_list4 = &entry_old->def.addrsel->list4;
old_list6 = &entry_old->def.addrsel->list6;
/* we only allow the addition of address selectors if all of
* the selectors do not exist in the existing domain map */
netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4)
if (netlbl_af4list_search_exact(iter4->addr,
iter4->mask,
old_list4)) {
ret_val = -EEXIST;
goto add_return;
}
#if IS_ENABLED(CONFIG_IPV6)
netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6)
if (netlbl_af6list_search_exact(&iter6->addr,
&iter6->mask,
old_list6)) {
ret_val = -EEXIST;
goto add_return;
}
#endif /* IPv6 */
netlbl_af4list_foreach_safe(iter4, tmp4,
&entry->def.addrsel->list4) {
netlbl_af4list_remove_entry(iter4);
iter4->valid = 1;
ret_val = netlbl_af4list_add(iter4, old_list4);
netlbl_domhsh_audit_add(entry_old, iter4, NULL,
ret_val, audit_info);
if (ret_val != 0)
goto add_return;
}
#if IS_ENABLED(CONFIG_IPV6)
netlbl_af6list_foreach_safe(iter6, tmp6,
&entry->def.addrsel->list6) {
netlbl_af6list_remove_entry(iter6);
iter6->valid = 1;
ret_val = netlbl_af6list_add(iter6, old_list6);
netlbl_domhsh_audit_add(entry_old, NULL, iter6,
ret_val, audit_info);
if (ret_val != 0)
goto add_return;
}
#endif /* IPv6 */
/* cleanup the new entry since we've moved everything over */
netlbl_domhsh_free_entry(&entry->rcu);
} else
ret_val = -EINVAL;
add_return:
spin_unlock(&netlbl_domhsh_lock);
rcu_read_unlock();
return ret_val;
}
/**
* netlbl_domhsh_add_default - Adds the default entry to the domain hash table
* @entry: the entry to add
* @audit_info: NetLabel audit information
*
* Description:
* Adds a new default entry to the domain hash table and handles any updates
* to the lower level protocol handler (i.e. CIPSO). Returns zero on success,
* negative on failure.
*
*/
int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
struct netlbl_audit *audit_info)
{
return netlbl_domhsh_add(entry, audit_info);
}
/**
* netlbl_domhsh_remove_entry - Removes a given entry from the domain table
* @entry: the entry to remove
* @audit_info: NetLabel audit information
*
* Description:
* Removes an entry from the domain hash table and handles any updates to the
* lower level protocol handler (i.e. CIPSO). Caller is responsible for
* ensuring that the RCU read lock is held. Returns zero on success, negative
* on failure.
*
*/
int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
struct netlbl_audit *audit_info)
{
int ret_val = 0;
struct audit_buffer *audit_buf;
struct netlbl_af4list *iter4;
struct netlbl_domaddr4_map *map4;
#if IS_ENABLED(CONFIG_IPV6)
struct netlbl_af6list *iter6;
struct netlbl_domaddr6_map *map6;
#endif /* IPv6 */
if (entry == NULL)
return -ENOENT;
spin_lock(&netlbl_domhsh_lock);
if (entry->valid) {
entry->valid = 0;
if (entry == rcu_dereference(netlbl_domhsh_def_ipv4))
RCU_INIT_POINTER(netlbl_domhsh_def_ipv4, NULL);
else if (entry == rcu_dereference(netlbl_domhsh_def_ipv6))
RCU_INIT_POINTER(netlbl_domhsh_def_ipv6, NULL);
else
list_del_rcu(&entry->list);
} else
ret_val = -ENOENT;
spin_unlock(&netlbl_domhsh_lock);
if (ret_val)
return ret_val;
audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
if (audit_buf != NULL) {
audit_log_format(audit_buf,
" nlbl_domain=%s res=1",
entry->domain ? entry->domain : "(default)");
audit_log_end(audit_buf);
}
switch (entry->def.type) {
case NETLBL_NLTYPE_ADDRSELECT:
netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4) {
map4 = netlbl_domhsh_addr4_entry(iter4);
cipso_v4_doi_putdef(map4->def.cipso);
}
#if IS_ENABLED(CONFIG_IPV6)
netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6) {
map6 = netlbl_domhsh_addr6_entry(iter6);
calipso_doi_putdef(map6->def.calipso);
}
#endif /* IPv6 */
break;
case NETLBL_NLTYPE_CIPSOV4:
cipso_v4_doi_putdef(entry->def.cipso);
break;
#if IS_ENABLED(CONFIG_IPV6)
case NETLBL_NLTYPE_CALIPSO:
calipso_doi_putdef(entry->def.calipso);
break;
#endif /* IPv6 */
}
call_rcu(&entry->rcu, netlbl_domhsh_free_entry);
return ret_val;
}
/**
* netlbl_domhsh_remove_af4 - Removes an address selector entry
* @domain: the domain
* @addr: IPv4 address
* @mask: IPv4 address mask
* @audit_info: NetLabel audit information
*
* Description:
* Removes an individual address selector from a domain mapping and potentially
* the entire mapping if it is empty. Returns zero on success, negative values
* on failure.
*
*/
int netlbl_domhsh_remove_af4(const char *domain,
const struct in_addr *addr,
const struct in_addr *mask,
struct netlbl_audit *audit_info)
{
struct netlbl_dom_map *entry_map;
struct netlbl_af4list *entry_addr;
struct netlbl_af4list *iter4;
#if IS_ENABLED(CONFIG_IPV6)
struct netlbl_af6list *iter6;
#endif /* IPv6 */
struct netlbl_domaddr4_map *entry;
rcu_read_lock();
if (domain)
entry_map = netlbl_domhsh_search(domain, AF_INET);
else
entry_map = netlbl_domhsh_search_def(domain, AF_INET);
if (entry_map == NULL ||
entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT)
goto remove_af4_failure;
spin_lock(&netlbl_domhsh_lock);
entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
&entry_map->def.addrsel->list4);
spin_unlock(&netlbl_domhsh_lock);
if (entry_addr == NULL)
goto remove_af4_failure;
netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4)
goto remove_af4_single_addr;
#if IS_ENABLED(CONFIG_IPV6)
netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6)
goto remove_af4_single_addr;
#endif /* IPv6 */
/* the domain mapping is empty so remove it from the mapping table */
netlbl_domhsh_remove_entry(entry_map, audit_info);
remove_af4_single_addr:
rcu_read_unlock();
/* yick, we can't use call_rcu here because we don't have a rcu head
* pointer but hopefully this should be a rare case so the pause
* shouldn't be a problem */
synchronize_rcu();
entry = netlbl_domhsh_addr4_entry(entry_addr);
cipso_v4_doi_putdef(entry->def.cipso);
kfree(entry);
return 0;
remove_af4_failure:
rcu_read_unlock();
return -ENOENT;
}
#if IS_ENABLED(CONFIG_IPV6)
/**
* netlbl_domhsh_remove_af6 - Removes an address selector entry
* @domain: the domain
* @addr: IPv6 address
* @mask: IPv6 address mask
* @audit_info: NetLabel audit information
*
* Description:
* Removes an individual address selector from a domain mapping and potentially
* the entire mapping if it is empty. Returns zero on success, negative values
* on failure.
*
*/
int netlbl_domhsh_remove_af6(const char *domain,
const struct in6_addr *addr,
const struct in6_addr *mask,
struct netlbl_audit *audit_info)
{
struct netlbl_dom_map *entry_map;
struct netlbl_af6list *entry_addr;
struct netlbl_af4list *iter4;
struct netlbl_af6list *iter6;
struct netlbl_domaddr6_map *entry;
rcu_read_lock();
if (domain)
entry_map = netlbl_domhsh_search(domain, AF_INET6);
else
entry_map = netlbl_domhsh_search_def(domain, AF_INET6);
if (entry_map == NULL ||
entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT)
goto remove_af6_failure;
spin_lock(&netlbl_domhsh_lock);
entry_addr = netlbl_af6list_remove(addr, mask,
&entry_map->def.addrsel->list6);
spin_unlock(&netlbl_domhsh_lock);
if (entry_addr == NULL)
goto remove_af6_failure;
netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4)
goto remove_af6_single_addr;
netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6)
goto remove_af6_single_addr;
/* the domain mapping is empty so remove it from the mapping table */
netlbl_domhsh_remove_entry(entry_map, audit_info);
remove_af6_single_addr:
rcu_read_unlock();
/* yick, we can't use call_rcu here because we don't have a rcu head
* pointer but hopefully this should be a rare case so the pause
* shouldn't be a problem */
synchronize_rcu();
entry = netlbl_domhsh_addr6_entry(entry_addr);
calipso_doi_putdef(entry->def.calipso);
kfree(entry);
return 0;
remove_af6_failure:
rcu_read_unlock();
return -ENOENT;
}
#endif /* IPv6 */
/**
* netlbl_domhsh_remove - Removes an entry from the domain hash table
* @domain: the domain to remove
* @family: address family
* @audit_info: NetLabel audit information
*
* Description:
* Removes an entry from the domain hash table and handles any updates to the
* lower level protocol handler (i.e. CIPSO). @family may be %AF_UNSPEC which
* removes all address family entries. Returns zero on success, negative on
* failure.
*
*/
int netlbl_domhsh_remove(const char *domain, u16 family,
struct netlbl_audit *audit_info)
{
int ret_val = -EINVAL;
struct netlbl_dom_map *entry;
rcu_read_lock();
if (family == AF_INET || family == AF_UNSPEC) {
if (domain)
entry = netlbl_domhsh_search(domain, AF_INET);
else
entry = netlbl_domhsh_search_def(domain, AF_INET);
ret_val = netlbl_domhsh_remove_entry(entry, audit_info);
if (ret_val && ret_val != -ENOENT)
goto done;
}
if (family == AF_INET6 || family == AF_UNSPEC) {
int ret_val2;
if (domain)
entry = netlbl_domhsh_search(domain, AF_INET6);
else
entry = netlbl_domhsh_search_def(domain, AF_INET6);
ret_val2 = netlbl_domhsh_remove_entry(entry, audit_info);
if (ret_val2 != -ENOENT)
ret_val = ret_val2;
}
done:
rcu_read_unlock();
return ret_val;
}
/**
* netlbl_domhsh_remove_default - Removes the default entry from the table
* @family: address family
* @audit_info: NetLabel audit information
*
* Description:
* Removes/resets the default entry corresponding to @family from the domain
* hash table and handles any updates to the lower level protocol handler
* (i.e. CIPSO). @family may be %AF_UNSPEC which removes all address family
* entries. Returns zero on success, negative on failure.
*
*/
int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info)
{
return netlbl_domhsh_remove(NULL, family, audit_info);
}
/**
* netlbl_domhsh_getentry - Get an entry from the domain hash table
* @domain: the domain name to search for
* @family: address family
*
* Description:
* Look through the domain hash table searching for an entry to match @domain,
* with address family @family, return a pointer to a copy of the entry or
* NULL. The caller is responsible for ensuring that rcu_read_[un]lock() is
* called.
*
*/
struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain, u16 family)
{
if (family == AF_UNSPEC)
return NULL;
return netlbl_domhsh_search_def(domain, family);
}
/**
* netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table
* @domain: the domain name to search for
* @addr: the IP address to search for
*
* Description:
* Look through the domain hash table searching for an entry to match @domain
* and @addr, return a pointer to a copy of the entry or NULL. The caller is
* responsible for ensuring that rcu_read_[un]lock() is called.
*
*/
struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain,
__be32 addr)
{
struct netlbl_dom_map *dom_iter;
struct netlbl_af4list *addr_iter;
dom_iter = netlbl_domhsh_search_def(domain, AF_INET);
if (dom_iter == NULL)
return NULL;
if (dom_iter->def.type != NETLBL_NLTYPE_ADDRSELECT)
return &dom_iter->def;
addr_iter = netlbl_af4list_search(addr, &dom_iter->def.addrsel->list4);
if (addr_iter == NULL)
return NULL;
return &(netlbl_domhsh_addr4_entry(addr_iter)->def);
}
#if IS_ENABLED(CONFIG_IPV6)
/**
* netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table
* @domain: the domain name to search for
* @addr: the IP address to search for
*
* Description:
* Look through the domain hash table searching for an entry to match @domain
* and @addr, return a pointer to a copy of the entry or NULL. The caller is
* responsible for ensuring that rcu_read_[un]lock() is called.
*
*/
struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain,
const struct in6_addr *addr)
{
struct netlbl_dom_map *dom_iter;
struct netlbl_af6list *addr_iter;
dom_iter = netlbl_domhsh_search_def(domain, AF_INET6);
if (dom_iter == NULL)
return NULL;
if (dom_iter->def.type != NETLBL_NLTYPE_ADDRSELECT)
return &dom_iter->def;
addr_iter = netlbl_af6list_search(addr, &dom_iter->def.addrsel->list6);
if (addr_iter == NULL)
return NULL;
return &(netlbl_domhsh_addr6_entry(addr_iter)->def);
}
#endif /* IPv6 */
/**
* netlbl_domhsh_walk - Iterate through the domain mapping hash table
* @skip_bkt: the number of buckets to skip at the start
* @skip_chain: the number of entries to skip in the first iterated bucket
* @callback: callback for each entry
* @cb_arg: argument for the callback function
*
* Description:
* Iterate over the domain mapping hash table, skipping the first @skip_bkt
* buckets and @skip_chain entries. For each entry in the table call
* @callback, if @callback returns a negative value stop 'walking' through the
* table and return. Updates the values in @skip_bkt and @skip_chain on
* return. Returns zero on success, negative values on failure.
*
*/
int netlbl_domhsh_walk(u32 *skip_bkt,
u32 *skip_chain,
int (*callback) (struct netlbl_dom_map *entry, void *arg),
void *cb_arg)
{
int ret_val = -ENOENT;
u32 iter_bkt;
struct list_head *iter_list;
struct netlbl_dom_map *iter_entry;
u32 chain_cnt = 0;
rcu_read_lock();
for (iter_bkt = *skip_bkt;
iter_bkt < rcu_dereference(netlbl_domhsh)->size;
iter_bkt++, chain_cnt = 0) {
iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt];
list_for_each_entry_rcu(iter_entry, iter_list, list)
if (iter_entry->valid) {
if (chain_cnt++ < *skip_chain)
continue;
ret_val = callback(iter_entry, cb_arg);
if (ret_val < 0) {
chain_cnt--;
goto walk_return;
}
}
}
walk_return:
rcu_read_unlock();
*skip_bkt = iter_bkt;
*skip_chain = chain_cnt;
return ret_val;
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2011 STRATO. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
* - reorganize keys
* - compressed
* - sync
* - copy also limits on subvol creation
* - limit
* - caches for ulists
* - performance benchmarks
* - check all ioctl parameters
*/
/*
* Helpers to access qgroup reservation
*
* Callers should ensure the lock context and type are valid
*/
static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
{
u64 ret = 0;
int i;
for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
ret += qgroup->rsv.values[i];
return ret;
}
#ifdef CONFIG_BTRFS_DEBUG
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
if (type == BTRFS_QGROUP_RSV_DATA)
return "data";
if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
return "meta_pertrans";
if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
return "meta_prealloc";
return NULL;
}
#endif
static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
qgroup->rsv.values[type] += num_bytes;
}
static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
if (qgroup->rsv.values[type] >= num_bytes) { qgroup->rsv.values[type] -= num_bytes; return;
}
#ifdef CONFIG_BTRFS_DEBUG
WARN_RATELIMIT(1,
"qgroup %llu %s reserved space underflow, have %llu to free %llu",
qgroup->qgroupid, qgroup_rsv_type_str(type),
qgroup->rsv.values[type], num_bytes);
#endif
qgroup->rsv.values[type] = 0;
}
static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *dest,
struct btrfs_qgroup *src)
{
int i;
for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
}
static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *dest,
struct btrfs_qgroup *src)
{
int i;
for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
}
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
int mod)
{
if (qg->old_refcnt < seq)
qg->old_refcnt = seq;
qg->old_refcnt += mod;
}
static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
int mod)
{
if (qg->new_refcnt < seq)
qg->new_refcnt = seq;
qg->new_refcnt += mod;
}
static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
if (qg->old_refcnt < seq)
return 0;
return qg->old_refcnt - seq;
}
static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
if (qg->new_refcnt < seq)
return 0;
return qg->new_refcnt - seq;
}
/*
* glue structure to represent the relations between qgroups.
*/
struct btrfs_qgroup_list {
struct list_head next_group;
struct list_head next_member;
struct btrfs_qgroup *group;
struct btrfs_qgroup *member;
};
static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
{
return (u64)(uintptr_t)qg;
}
static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
{
return (struct btrfs_qgroup *)(uintptr_t)n->aux;
}
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
u64 qgroupid)
{
struct rb_node *n = fs_info->qgroup_tree.rb_node;
struct btrfs_qgroup *qgroup;
while (n) { qgroup = rb_entry(n, struct btrfs_qgroup, node); if (qgroup->qgroupid < qgroupid) n = n->rb_left; else if (qgroup->qgroupid > qgroupid) n = n->rb_right;
else
return qgroup;
}
return NULL;
}
/* must be called with qgroup_lock held */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
u64 qgroupid)
{
struct rb_node **p = &fs_info->qgroup_tree.rb_node;
struct rb_node *parent = NULL;
struct btrfs_qgroup *qgroup;
while (*p) {
parent = *p;
qgroup = rb_entry(parent, struct btrfs_qgroup, node);
if (qgroup->qgroupid < qgroupid)
p = &(*p)->rb_left;
else if (qgroup->qgroupid > qgroupid)
p = &(*p)->rb_right;
else
return qgroup;
}
qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
if (!qgroup)
return ERR_PTR(-ENOMEM);
qgroup->qgroupid = qgroupid;
INIT_LIST_HEAD(&qgroup->groups);
INIT_LIST_HEAD(&qgroup->members);
INIT_LIST_HEAD(&qgroup->dirty);
rb_link_node(&qgroup->node, parent, p);
rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
return qgroup;
}
static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup)
{
struct btrfs_qgroup_list *list;
list_del(&qgroup->dirty);
while (!list_empty(&qgroup->groups)) {
list = list_first_entry(&qgroup->groups,
struct btrfs_qgroup_list, next_group);
list_del(&list->next_group);
list_del(&list->next_member);
kfree(list);
}
while (!list_empty(&qgroup->members)) {
list = list_first_entry(&qgroup->members,
struct btrfs_qgroup_list, next_member);
list_del(&list->next_group);
list_del(&list->next_member);
kfree(list);
}
}
/* must be called with qgroup_lock held */
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
if (!qgroup)
return -ENOENT;
rb_erase(&qgroup->node, &fs_info->qgroup_tree);
__del_qgroup_rb(fs_info, qgroup);
return 0;
}
/* must be called with qgroup_lock held */
static int add_relation_rb(struct btrfs_fs_info *fs_info,
u64 memberid, u64 parentid)
{
struct btrfs_qgroup *member;
struct btrfs_qgroup *parent;
struct btrfs_qgroup_list *list;
member = find_qgroup_rb(fs_info, memberid);
parent = find_qgroup_rb(fs_info, parentid);
if (!member || !parent)
return -ENOENT;
list = kzalloc(sizeof(*list), GFP_ATOMIC);
if (!list)
return -ENOMEM;
list->group = parent;
list->member = member;
list_add_tail(&list->next_group, &member->groups);
list_add_tail(&list->next_member, &parent->members);
return 0;
}
/* must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
u64 memberid, u64 parentid)
{
struct btrfs_qgroup *member;
struct btrfs_qgroup *parent;
struct btrfs_qgroup_list *list;
member = find_qgroup_rb(fs_info, memberid);
parent = find_qgroup_rb(fs_info, parentid);
if (!member || !parent)
return -ENOENT;
list_for_each_entry(list, &member->groups, next_group) {
if (list->group == parent) {
list_del(&list->next_group);
list_del(&list->next_member);
kfree(list);
return 0;
}
}
return -ENOENT;
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
u64 rfer, u64 excl)
{
struct btrfs_qgroup *qgroup;
qgroup = find_qgroup_rb(fs_info, qgroupid);
if (!qgroup)
return -EINVAL;
if (qgroup->rfer != rfer || qgroup->excl != excl)
return -EINVAL;
return 0;
}
#endif
/*
* The full config is read in one go, only called from open_ctree()
* It doesn't use any locking, as at this point we're still single-threaded
*/
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_root *quota_root = fs_info->quota_root;
struct btrfs_path *path = NULL;
struct extent_buffer *l;
int slot;
int ret = 0;
u64 flags = 0;
u64 rescan_progress = 0;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
if (!fs_info->qgroup_ulist) {
ret = -ENOMEM;
goto out;
}
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
ret = btrfs_sysfs_add_qgroups(fs_info);
if (ret < 0)
goto out;
/* default this to quota off, in case no status key is found */
fs_info->qgroup_flags = 0;
/*
* pass 1: read status, all qgroup infos and limits
*/
key.objectid = 0;
key.type = 0;
key.offset = 0;
ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
if (ret)
goto out;
while (1) {
struct btrfs_qgroup *qgroup;
slot = path->slots[0];
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
struct btrfs_qgroup_status_item *ptr;
ptr = btrfs_item_ptr(l, slot,
struct btrfs_qgroup_status_item);
if (btrfs_qgroup_status_version(l, ptr) !=
BTRFS_QGROUP_STATUS_VERSION) {
btrfs_err(fs_info,
"old qgroup version, quota disabled");
goto out;
}
if (btrfs_qgroup_status_generation(l, ptr) !=
fs_info->generation) {
flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
btrfs_err(fs_info,
"qgroup generation mismatch, marked as inconsistent");
}
fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
ptr);
rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
goto next1;
}
if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
found_key.type != BTRFS_QGROUP_LIMIT_KEY)
goto next1;
qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
btrfs_err(fs_info, "inconsistent qgroup config");
flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
if (!qgroup) {
qgroup = add_qgroup_rb(fs_info, found_key.offset);
if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
goto out;
}
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0)
goto out;
switch (found_key.type) {
case BTRFS_QGROUP_INFO_KEY: {
struct btrfs_qgroup_info_item *ptr;
ptr = btrfs_item_ptr(l, slot,
struct btrfs_qgroup_info_item);
qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
/* generation currently unused */
break;
}
case BTRFS_QGROUP_LIMIT_KEY: {
struct btrfs_qgroup_limit_item *ptr;
ptr = btrfs_item_ptr(l, slot,
struct btrfs_qgroup_limit_item);
qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
break;
}
}
next1:
ret = btrfs_next_item(quota_root, path);
if (ret < 0)
goto out;
if (ret)
break;
}
btrfs_release_path(path);
/*
* pass 2: read all qgroup relations
*/
key.objectid = 0;
key.type = BTRFS_QGROUP_RELATION_KEY;
key.offset = 0;
ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
if (ret)
goto out;
while (1) {
slot = path->slots[0];
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
goto next2;
if (found_key.objectid > found_key.offset) {
/* parent <- member, not needed to build config */
/* FIXME should we omit the key completely? */
goto next2;
}
ret = add_relation_rb(fs_info, found_key.objectid,
found_key.offset);
if (ret == -ENOENT) {
btrfs_warn(fs_info,
"orphan qgroup relation 0x%llx->0x%llx",
found_key.objectid, found_key.offset);
ret = 0; /* ignore the error */
}
if (ret)
goto out;
next2:
ret = btrfs_next_item(quota_root, path);
if (ret < 0)
goto out;
if (ret)
break;
}
out:
btrfs_free_path(path);
fs_info->qgroup_flags |= flags;
if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
ret >= 0)
ret = qgroup_rescan_init(fs_info, rescan_progress, 0); if (ret < 0) { ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
btrfs_sysfs_del_qgroups(fs_info);
}
return ret < 0 ? ret : 0;
}
/*
* Called in close_ctree() when quota is still enabled. This verifies we don't
* leak some reserved space.
*
* Return false if no reserved space is left.
* Return true if some reserved space is leaked.
*/
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
bool ret = false;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return ret;
/*
* Since we're unmounting, there is no race and no need to grab qgroup
* lock. And here we don't go post-order to provide a more user
* friendly sorted result.
*/
for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
struct btrfs_qgroup *qgroup;
int i;
qgroup = rb_entry(node, struct btrfs_qgroup, node);
for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { if (qgroup->rsv.values[i]) {
ret = true;
btrfs_warn(fs_info,
"qgroup %hu/%llu has unreleased space, type %d rsv %llu",
btrfs_qgroup_level(qgroup->qgroupid),
btrfs_qgroup_subvolid(qgroup->qgroupid),
i, qgroup->rsv.values[i]);
}
}
}
return ret;
}
/*
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
* first two are in single-threaded paths.And for the third one, we have set
* quota_root to be null with qgroup_lock held before, so it is safe to clean
* up the in-memory structures without qgroup_lock held.
*/
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
struct rb_node *n;
struct btrfs_qgroup *qgroup;
while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
__del_qgroup_rb(fs_info, qgroup);
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
kfree(qgroup);
}
/*
* We call btrfs_free_qgroup_config() when unmounting
* filesystem and disabling quota, so we set qgroup_ulist
* to be null here to avoid double free.
*/
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
btrfs_sysfs_del_qgroups(fs_info);
}
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
int ret;
struct btrfs_root *quota_root = trans->fs_info->quota_root;
struct btrfs_path *path;
struct btrfs_key key;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = src;
key.type = BTRFS_QGROUP_RELATION_KEY;
key.offset = dst;
ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
return ret;
}
static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
int ret;
struct btrfs_root *quota_root = trans->fs_info->quota_root;
struct btrfs_path *path;
struct btrfs_key key;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = src;
key.type = BTRFS_QGROUP_RELATION_KEY;
key.offset = dst;
ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, quota_root, path);
out:
btrfs_free_path(path);
return ret;
}
static int add_qgroup_item(struct btrfs_trans_handle *trans,
struct btrfs_root *quota_root, u64 qgroupid)
{
int ret;
struct btrfs_path *path;
struct btrfs_qgroup_info_item *qgroup_info;
struct btrfs_qgroup_limit_item *qgroup_limit;
struct extent_buffer *leaf;
struct btrfs_key key;
if (btrfs_is_testing(quota_root->fs_info))
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = 0;
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroupid;
/*
* Avoid a transaction abort by catching -EEXIST here. In that
* case, we proceed by re-initializing the existing structure
* on disk.
*/
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_info));
if (ret && ret != -EEXIST)
goto out;
leaf = path->nodes[0];
qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_qgroup_info_item);
btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
key.type = BTRFS_QGROUP_LIMIT_KEY;
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*qgroup_limit));
if (ret && ret != -EEXIST)
goto out;
leaf = path->nodes[0];
qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_qgroup_limit_item);
btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
btrfs_mark_buffer_dirty(leaf);
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
int ret;
struct btrfs_root *quota_root = trans->fs_info->quota_root;
struct btrfs_path *path;
struct btrfs_key key;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = 0;
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroupid;
ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, quota_root, path);
if (ret)
goto out;
btrfs_release_path(path);
key.type = BTRFS_QGROUP_LIMIT_KEY;
ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, quota_root, path);
out:
btrfs_free_path(path);
return ret;
}
static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
struct btrfs_qgroup *qgroup)
{
struct btrfs_root *quota_root = trans->fs_info->quota_root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *l;
struct btrfs_qgroup_limit_item *qgroup_limit;
int ret;
int slot;
key.objectid = 0;
key.type = BTRFS_QGROUP_LIMIT_KEY;
key.offset = qgroup->qgroupid;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
if (ret > 0)
ret = -ENOENT;
if (ret)
goto out;
l = path->nodes[0];
slot = path->slots[0];
qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
btrfs_mark_buffer_dirty(l);
out:
btrfs_free_path(path);
return ret;
}
static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
struct btrfs_qgroup *qgroup)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root = fs_info->quota_root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *l;
struct btrfs_qgroup_info_item *qgroup_info;
int ret;
int slot;
if (btrfs_is_testing(fs_info))
return 0;
key.objectid = 0;
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroup->qgroupid;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
if (ret > 0)
ret = -ENOENT;
if (ret)
goto out;
l = path->nodes[0];
slot = path->slots[0];
qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
btrfs_mark_buffer_dirty(l);
out:
btrfs_free_path(path);
return ret;
}
static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root = fs_info->quota_root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *l;
struct btrfs_qgroup_status_item *ptr;
int ret;
int slot;
key.objectid = 0;
key.type = BTRFS_QGROUP_STATUS_KEY;
key.offset = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
if (ret > 0)
ret = -ENOENT;
if (ret)
goto out;
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
btrfs_set_qgroup_status_rescan(l, ptr,
fs_info->qgroup_rescan_progress.objectid);
btrfs_mark_buffer_dirty(l);
out:
btrfs_free_path(path);
return ret;
}
/*
* called with qgroup_lock held
*/
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *leaf = NULL;
int ret;
int nr = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = 0;
key.offset = 0;
key.type = 0;
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
leaf = path->nodes[0];
nr = btrfs_header_nritems(leaf);
if (!nr)
break;
/*
* delete the leaf one by one
* since the whole tree is going
* to be deleted.
*/
path->slots[0] = 0;
ret = btrfs_del_items(trans, root, path, 0, nr);
if (ret)
goto out;
btrfs_release_path(path);
}
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *quota_root;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_path *path = NULL;
struct btrfs_qgroup_status_item *ptr;
struct extent_buffer *leaf;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_qgroup *qgroup = NULL;
struct btrfs_trans_handle *trans = NULL;
struct ulist *ulist = NULL;
int ret = 0;
int slot;
/*
* We need to have subvol_sem write locked, to prevent races between
* concurrent tasks trying to enable quotas, because we will unlock
* and relock qgroup_ioctl_lock before setting fs_info->quota_root
* and before setting BTRFS_FS_QUOTA_ENABLED.
*/
lockdep_assert_held_write(&fs_info->subvol_sem);
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (fs_info->quota_root)
goto out;
ulist = ulist_alloc(GFP_KERNEL);
if (!ulist) {
ret = -ENOMEM;
goto out;
}
ret = btrfs_sysfs_add_qgroups(fs_info);
if (ret < 0)
goto out;
/*
* Unlock qgroup_ioctl_lock before starting the transaction. This is to
* avoid lock acquisition inversion problems (reported by lockdep) between
* qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
* start a transaction.
* After we started the transaction lock qgroup_ioctl_lock again and
* check if someone else created the quota root in the meanwhile. If so,
* just return success and release the transaction handle.
*
* Also we don't need to worry about someone else calling
* btrfs_sysfs_add_qgroups() after we unlock and getting an error because
* that function returns 0 (success) when the sysfs entries already exist.
*/
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
*
* Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items
* per subvolume. However those are not currently reserved since it
* would be a lot of overkill.
*/
trans = btrfs_start_transaction(tree_root, 2);
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
if (fs_info->quota_root)
goto out;
fs_info->qgroup_ulist = ulist;
ulist = NULL;
/*
* initially create the quota tree
*/
quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
if (IS_ERR(quota_root)) {
ret = PTR_ERR(quota_root);
btrfs_abort_transaction(trans, ret);
goto out;
}
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
btrfs_abort_transaction(trans, ret);
goto out_free_root;
}
key.objectid = 0;
key.type = BTRFS_QGROUP_STATUS_KEY;
key.offset = 0;
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
sizeof(*ptr));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
leaf = path->nodes[0];
ptr = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_qgroup_status_item);
btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
btrfs_mark_buffer_dirty(leaf);
key.objectid = 0;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = 0;
btrfs_release_path(path);
ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
if (ret > 0)
goto out_add_root;
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
while (1) {
slot = path->slots[0];
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_ROOT_REF_KEY) {
/* Release locks on tree_root before we access quota_root */
btrfs_release_path(path);
ret = add_qgroup_item(trans, quota_root,
found_key.offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
qgroup = add_qgroup_rb(fs_info, found_key.offset);
if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
ret = btrfs_search_slot_for_read(tree_root, &found_key,
path, 1, 0);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
if (ret > 0) {
/*
* Shouldn't happen, but in case it does we
* don't need to do the btrfs_next_item, just
* continue.
*/
continue;
}
}
ret = btrfs_next_item(tree_root, path);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
if (ret)
break;
}
out_add_root:
btrfs_release_path(path);
ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* Commit the transaction while not holding qgroup_ioctl_lock, to avoid
* a deadlock with tasks concurrently doing other qgroup operations, such
* adding/removing qgroups or adding/deleting qgroup relations for example,
* because all qgroup operations first start or join a transaction and then
* lock the qgroup_ioctl_lock mutex.
* We are safe from a concurrent task trying to enable quotas, by calling
* this function, since we are serialized by fs_info->subvol_sem.
*/
ret = btrfs_commit_transaction(trans);
trans = NULL;
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (ret)
goto out_free_path;
/*
* Set quota enabled flag after committing the transaction, to avoid
* deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
* creation.
*/
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
spin_unlock(&fs_info->qgroup_lock);
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
}
out_free_path:
btrfs_free_path(path);
out_free_root:
if (ret)
btrfs_put_root(quota_root);
out:
if (ret) {
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
btrfs_sysfs_del_qgroups(fs_info);
}
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (ret && trans)
btrfs_end_transaction(trans);
else if (trans)
ret = btrfs_end_transaction(trans);
ulist_free(ulist);
return ret;
}
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *quota_root;
struct btrfs_trans_handle *trans = NULL;
int ret = 0;
/*
* We need to have subvol_sem write locked, to prevent races between
* concurrent tasks trying to disable quotas, because we will unlock
* and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
*/
lockdep_assert_held_write(&fs_info->subvol_sem);
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
/*
* Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
* complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
* to lock that mutex while holding a transaction handle and the rescan
* worker needs to commit a transaction.
*/
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
* Request qgroup rescan worker to complete and wait for it. This wait
* must be done before transaction start for quota disable since it may
* deadlock with transaction by the qgroup rescan worker.
*/
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
btrfs_qgroup_wait_for_completion(fs_info, false);
/*
* 1 For the root item
*
* We should also reserve enough items for the quota tree deletion in
* btrfs_clean_quota_tree but this is not done.
*
* Also, we must always start a transaction without holding the mutex
* qgroup_ioctl_lock, see btrfs_quota_enable().
*/
trans = btrfs_start_transaction(fs_info->tree_root, 1);
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
goto out;
}
if (!fs_info->quota_root)
goto out;
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
spin_unlock(&fs_info->qgroup_lock);
btrfs_free_qgroup_config(fs_info);
ret = btrfs_clean_quota_tree(trans, quota_root);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_del_root(trans, "a_root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
list_del("a_root->dirty_list);
btrfs_tree_lock(quota_root->node);
btrfs_clean_tree_block(quota_root->node);
btrfs_tree_unlock(quota_root->node);
btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
btrfs_put_root(quota_root);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (ret && trans)
btrfs_end_transaction(trans);
else if (trans)
ret = btrfs_end_transaction(trans);
return ret;
}
static void qgroup_dirty(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup)
{
if (list_empty(&qgroup->dirty))
list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}
/*
* The easy accounting, we're updating qgroup relationship whose child qgroup
* only has exclusive extents.
*
* In this case, all exclusive extents will also be exclusive for parent, so
* excl/rfer just get added/removed.
*
* So is qgroup reservation space, which should also be added/removed to
* parent.
* Or when child tries to release reservation space, parent will underflow its
* reservation (for relationship adding case).
*
* Caller should hold fs_info->qgroup_lock.
*/
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
struct ulist *tmp, u64 ref_root,
struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
struct btrfs_qgroup_list *glist;
struct ulist_node *unode;
struct ulist_iterator uiter;
u64 num_bytes = src->excl;
int ret = 0;
qgroup = find_qgroup_rb(fs_info, ref_root);
if (!qgroup)
goto out;
qgroup->rfer += sign * num_bytes;
qgroup->rfer_cmpr += sign * num_bytes;
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
qgroup->excl_cmpr += sign * num_bytes;
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
else
qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
qgroup_dirty(fs_info, qgroup);
/* Get all of the parent groups that contain this qgroup */
list_for_each_entry(glist, &qgroup->groups, next_group) {
ret = ulist_add(tmp, glist->group->qgroupid,
qgroup_to_aux(glist->group), GFP_ATOMIC);
if (ret < 0)
goto out;
}
/* Iterate all of the parents and adjust their reference counts */
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(tmp, &uiter))) {
qgroup = unode_aux_to_qgroup(unode);
qgroup->rfer += sign * num_bytes;
qgroup->rfer_cmpr += sign * num_bytes;
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
if (sign > 0)
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
else
qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
qgroup->excl_cmpr += sign * num_bytes;
qgroup_dirty(fs_info, qgroup);
/* Add any parents of the parents */
list_for_each_entry(glist, &qgroup->groups, next_group) {
ret = ulist_add(tmp, glist->group->qgroupid,
qgroup_to_aux(glist->group), GFP_ATOMIC);
if (ret < 0)
goto out;
}
}
ret = 0;
out:
return ret;
}
/*
* Quick path for updating qgroup with only excl refs.
*
* In that case, just update all parent will be enough.
* Or we needs to do a full rescan.
* Caller should also hold fs_info->qgroup_lock.
*
* Return 0 for quick update, return >0 for need to full rescan
* and mark INCONSISTENT flag.
* Return < 0 for other error.
*/
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
struct ulist *tmp, u64 src, u64 dst,
int sign)
{
struct btrfs_qgroup *qgroup;
int ret = 1;
int err = 0;
qgroup = find_qgroup_rb(fs_info, src);
if (!qgroup)
goto out;
if (qgroup->excl == qgroup->rfer) {
ret = 0;
err = __qgroup_excl_accounting(fs_info, tmp, dst,
qgroup, sign);
if (err < 0) {
ret = err;
goto out;
}
}
out:
if (ret)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
return ret;
}
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
struct ulist *tmp;
unsigned int nofs_flag;
int ret = 0;
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
/* We hold a transaction handle open, must do a NOFS allocation. */
nofs_flag = memalloc_nofs_save();
tmp = ulist_alloc(GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
if (!tmp)
return -ENOMEM;
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
}
member = find_qgroup_rb(fs_info, src);
parent = find_qgroup_rb(fs_info, dst);
if (!member || !parent) {
ret = -EINVAL;
goto out;
}
/* check if such qgroup relation exist firstly */
list_for_each_entry(list, &member->groups, next_group) {
if (list->group == parent) {
ret = -EEXIST;
goto out;
}
}
ret = add_qgroup_relation_item(trans, src, dst);
if (ret)
goto out;
ret = add_qgroup_relation_item(trans, dst, src);
if (ret) {
del_qgroup_relation_item(trans, src, dst);
goto out;
}
spin_lock(&fs_info->qgroup_lock);
ret = add_relation_rb(fs_info, src, dst);
if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
goto out;
}
ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
spin_unlock(&fs_info->qgroup_lock);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
ulist_free(tmp);
return ret;
}
static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
struct ulist *tmp;
bool found = false;
unsigned int nofs_flag;
int ret = 0;
int ret2;
/* We hold a transaction handle open, must do a NOFS allocation. */
nofs_flag = memalloc_nofs_save();
tmp = ulist_alloc(GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
if (!tmp)
return -ENOMEM;
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
}
member = find_qgroup_rb(fs_info, src);
parent = find_qgroup_rb(fs_info, dst);
/*
* The parent/member pair doesn't exist, then try to delete the dead
* relation items only.
*/
if (!member || !parent)
goto delete_item;
/* check if such qgroup relation exist firstly */
list_for_each_entry(list, &member->groups, next_group) {
if (list->group == parent) {
found = true;
break;
}
}
delete_item:
ret = del_qgroup_relation_item(trans, src, dst);
if (ret < 0 && ret != -ENOENT)
goto out;
ret2 = del_qgroup_relation_item(trans, dst, src);
if (ret2 < 0 && ret2 != -ENOENT)
goto out;
/* At least one deletion succeeded, return 0 */
if (!ret || !ret2)
ret = 0;
if (found) {
spin_lock(&fs_info->qgroup_lock);
del_relation_rb(fs_info, src, dst);
ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
spin_unlock(&fs_info->qgroup_lock);
}
out:
ulist_free(tmp);
return ret;
}
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
ret = __del_qgroup_relation(trans, src, dst);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
}
quota_root = fs_info->quota_root;
qgroup = find_qgroup_rb(fs_info, qgroupid);
if (qgroup) {
ret = -EEXIST;
goto out;
}
ret = add_qgroup_item(trans, quota_root, qgroupid);
if (ret)
goto out;
spin_lock(&fs_info->qgroup_lock);
qgroup = add_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
if (IS_ERR(qgroup)) {
ret = PTR_ERR(qgroup);
goto out;
}
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup *qgroup;
struct btrfs_qgroup_list *list;
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
}
qgroup = find_qgroup_rb(fs_info, qgroupid);
if (!qgroup) {
ret = -ENOENT;
goto out;
}
/* Check if there are no children of this qgroup */
if (!list_empty(&qgroup->members)) {
ret = -EBUSY;
goto out;
}
ret = del_qgroup_item(trans, qgroupid);
if (ret && ret != -ENOENT)
goto out;
while (!list_empty(&qgroup->groups)) {
list = list_first_entry(&qgroup->groups,
struct btrfs_qgroup_list, next_group);
ret = __del_qgroup_relation(trans, qgroupid,
list->group->qgroupid);
if (ret)
goto out;
}
spin_lock(&fs_info->qgroup_lock);
del_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
/*
* Remove the qgroup from sysfs now without holding the qgroup_lock
* spinlock, since the sysfs_remove_group() function needs to take
* the mutex kernfs_mutex through kernfs_remove_by_name_ns().
*/
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
kfree(qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup *qgroup;
int ret = 0;
/* Sometimes we would want to clear the limit on this qgroup.
* To meet this requirement, we treat the -1 as a special value
* which tell kernel to clear the limit on this qgroup.
*/
const u64 CLEAR_VALUE = -1;
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
}
qgroup = find_qgroup_rb(fs_info, qgroupid);
if (!qgroup) {
ret = -ENOENT;
goto out;
}
spin_lock(&fs_info->qgroup_lock);
if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
if (limit->max_rfer == CLEAR_VALUE) {
qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
qgroup->max_rfer = 0;
} else {
qgroup->max_rfer = limit->max_rfer;
}
}
if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
if (limit->max_excl == CLEAR_VALUE) {
qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
qgroup->max_excl = 0;
} else {
qgroup->max_excl = limit->max_excl;
}
}
if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
if (limit->rsv_rfer == CLEAR_VALUE) {
qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
qgroup->rsv_rfer = 0;
} else {
qgroup->rsv_rfer = limit->rsv_rfer;
}
}
if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
if (limit->rsv_excl == CLEAR_VALUE) {
qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
qgroup->rsv_excl = 0;
} else {
qgroup->rsv_excl = limit->rsv_excl;
}
}
qgroup->lim_flags |= limit->flags;
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_limit_item(trans, qgroup);
if (ret) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
btrfs_info(fs_info, "unable to update quota limit for %llu",
qgroupid);
}
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record)
{
struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
struct rb_node *parent_node = NULL;
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
lockdep_assert_held(&delayed_refs->lock);
trace_btrfs_qgroup_trace_extent(fs_info, record);
while (*p) {
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
node);
if (bytenr < entry->bytenr) {
p = &(*p)->rb_left;
} else if (bytenr > entry->bytenr) {
p = &(*p)->rb_right;
} else {
if (record->data_rsv && !entry->data_rsv) {
entry->data_rsv = record->data_rsv;
entry->data_rsv_refroot =
record->data_rsv_refroot;
}
return 1;
}
}
rb_link_node(&record->node, parent_node, p);
rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
return 0;
}
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord)
{
struct ulist *old_root;
u64 bytenr = qrecord->bytenr;
int ret;
/*
* We are always called in a context where we are already holding a
* transaction handle. Often we are called when adding a data delayed
* reference from btrfs_truncate_inode_items() (truncating or unlinking),
* in which case we will be holding a write lock on extent buffer from a
* subvolume tree. In this case we can't allow btrfs_find_all_roots() to
* acquire fs_info->commit_root_sem, because that is a higher level lock
* that must be acquired before locking any extent buffers.
*
* So we want btrfs_find_all_roots() to not acquire the commit_root_sem
* but we can't pass it a non-NULL transaction handle, because otherwise
* it would not use commit roots and would lock extent buffers, causing
* a deadlock if it ends up trying to read lock the same extent buffer
* that was previously write locked at btrfs_truncate_inode_items().
*
* So pass a NULL transaction handle to btrfs_find_all_roots() and
* explicitly tell it to not acquire the commit_root_sem - if we are
* holding a transaction handle we don't need its protection.
*/
ASSERT(trans != NULL);
ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
true);
if (ret < 0) {
trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
btrfs_warn(trans->fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
ret);
return 0;
}
/*
* Here we don't need to get the lock of
* trans->transaction->delayed_refs, since inserted qrecord won't
* be deleted, only qrecord->node may be modified (new qrecord insert)
*
* So modifying qrecord->old_roots is safe here
*/
qrecord->old_roots = old_root;
return 0;
}
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, gfp_t gfp_flag)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record;
struct btrfs_delayed_ref_root *delayed_refs;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
|| bytenr == 0 || num_bytes == 0)
return 0;
record = kzalloc(sizeof(*record), gfp_flag);
if (!record)
return -ENOMEM;
delayed_refs = &trans->transaction->delayed_refs;
record->bytenr = bytenr;
record->num_bytes = num_bytes;
record->old_roots = NULL;
spin_lock(&delayed_refs->lock);
ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
spin_unlock(&delayed_refs->lock);
if (ret > 0) {
kfree(record);
return 0;
}
return btrfs_qgroup_trace_extent_post(trans, record);
}
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int nr = btrfs_header_nritems(eb);
int i, extent_type, ret;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
/* We can be called directly from walk_up_proc() */
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
for (i = 0; i < nr; i++) {
btrfs_item_key_to_cpu(eb, &key, i);
if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
/* filter out non qgroup-accountable extents */
extent_type = btrfs_file_extent_type(eb, fi);
if (extent_type == BTRFS_FILE_EXTENT_INLINE)
continue;
bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
if (!bytenr)
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
GFP_NOFS);
if (ret)
return ret;
}
cond_resched();
return 0;
}
/*
* Walk up the tree from the bottom, freeing leaves and any interior
* nodes which have had all slots visited. If a node (leaf or
* interior) is freed, the node above it will have it's slot
* incremented. The root node will never be freed.
*
* At the end of this function, we should have a path which has all
* slots incremented to the next position for a search. If we need to
* read a new node it will be NULL and the node above it will have the
* correct slot selected for a later read.
*
* If we increment the root nodes slot counter past the number of
* elements, 1 is returned to signal completion of the search.
*/
static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
{
int level = 0;
int nr, slot;
struct extent_buffer *eb;
if (root_level == 0)
return 1;
while (level <= root_level) {
eb = path->nodes[level];
nr = btrfs_header_nritems(eb);
path->slots[level]++;
slot = path->slots[level];
if (slot >= nr || level == 0) {
/*
* Don't free the root - we will detect this
* condition after our loop and return a
* positive value for caller to stop walking the tree.
*/
if (level != root_level) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
free_extent_buffer(eb);
path->nodes[level] = NULL;
path->slots[level] = 0;
}
} else {
/*
* We have a valid slot to walk back down
* from. Stop here so caller can process these
* new nodes.
*/
break;
}
level++;
}
eb = path->nodes[root_level];
if (path->slots[root_level] >= btrfs_header_nritems(eb))
return 1;
return 0;
}
/*
* Helper function to trace a subtree tree block swap.
*
* The swap will happen in highest tree block, but there may be a lot of
* tree blocks involved.
*
* For example:
* OO = Old tree blocks
* NN = New tree blocks allocated during balance
*
* File tree (257) Reloc tree for 257
* L2 OO NN
* / \ / \
* L1 OO OO (a) OO NN (a)
* / \ / \ / \ / \
* L0 OO OO OO OO OO OO NN NN
* (b) (c) (b) (c)
*
* When calling qgroup_trace_extent_swap(), we will pass:
* @src_eb = OO(a)
* @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
* @dst_level = 0
* @root_level = 1
*
* In that case, qgroup_trace_extent_swap() will search from OO(a) to
* reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
*
* The main work of qgroup_trace_extent_swap() can be split into 3 parts:
*
* 1) Tree search from @src_eb
* It should acts as a simplified btrfs_search_slot().
* The key for search can be extracted from @dst_path->nodes[dst_level]
* (first key).
*
* 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
* NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
* They should be marked during previous (@dst_level = 1) iteration.
*
* 3) Mark file extents in leaves dirty
* We don't have good way to pick out new file extents only.
* So we still follow the old method by scanning all file extents in
* the leave.
*
* This function can free us from keeping two paths, thus later we only need
* to care about how to iterate all new tree blocks in reloc tree.
*/
static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
struct extent_buffer *src_eb,
struct btrfs_path *dst_path,
int dst_level, int root_level,
bool trace_leaf)
{
struct btrfs_key key;
struct btrfs_path *src_path;
struct btrfs_fs_info *fs_info = trans->fs_info;
u32 nodesize = fs_info->nodesize;
int cur_level = root_level;
int ret;
BUG_ON(dst_level > root_level);
/* Level mismatch */
if (btrfs_header_level(src_eb) != root_level)
return -EINVAL;
src_path = btrfs_alloc_path();
if (!src_path) {
ret = -ENOMEM;
goto out;
}
if (dst_level)
btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
else
btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
/* For src_path */
atomic_inc(&src_eb->refs);
src_path->nodes[root_level] = src_eb;
src_path->slots[root_level] = dst_path->slots[root_level];
src_path->locks[root_level] = 0;
/* A simplified version of btrfs_search_slot() */
while (cur_level >= dst_level) {
struct btrfs_key src_key;
struct btrfs_key dst_key;
if (src_path->nodes[cur_level] == NULL) {
struct extent_buffer *eb;
int parent_slot;
eb = src_path->nodes[cur_level + 1];
parent_slot = src_path->slots[cur_level + 1];
eb = btrfs_read_node_slot(eb, parent_slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
}
src_path->nodes[cur_level] = eb;
btrfs_tree_read_lock(eb);
src_path->locks[cur_level] = BTRFS_READ_LOCK;
}
src_path->slots[cur_level] = dst_path->slots[cur_level];
if (cur_level) {
btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
&dst_key, dst_path->slots[cur_level]);
btrfs_node_key_to_cpu(src_path->nodes[cur_level],
&src_key, src_path->slots[cur_level]);
} else {
btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
&dst_key, dst_path->slots[cur_level]);
btrfs_item_key_to_cpu(src_path->nodes[cur_level],
&src_key, src_path->slots[cur_level]);
}
/* Content mismatch, something went wrong */
if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
ret = -ENOENT;
goto out;
}
cur_level--;
}
/*
* Now both @dst_path and @src_path have been populated, record the tree
* blocks for qgroup accounting.
*/
ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
nodesize, GFP_NOFS);
if (ret < 0)
goto out;
ret = btrfs_qgroup_trace_extent(trans,
dst_path->nodes[dst_level]->start,
nodesize, GFP_NOFS);
if (ret < 0)
goto out;
/* Record leaf file extents */
if (dst_level == 0 && trace_leaf) {
ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
if (ret < 0)
goto out;
ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
}
out:
btrfs_free_path(src_path);
return ret;
}
/*
* Helper function to do recursive generation-aware depth-first search, to
* locate all new tree blocks in a subtree of reloc tree.
*
* E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
* reloc tree
* L2 NN (a)
* / \
* L1 OO NN (b)
* / \ / \
* L0 OO OO OO NN
* (c) (d)
* If we pass:
* @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
* @cur_level = 1
* @root_level = 1
*
* We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace
* above tree blocks along with their counter parts in file tree.
* While during search, old tree blocks OO(c) will be skipped as tree block swap
* won't affect OO(c).
*/
static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
struct extent_buffer *src_eb,
struct btrfs_path *dst_path,
int cur_level, int root_level,
u64 last_snapshot, bool trace_leaf)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct extent_buffer *eb;
bool need_cleanup = false;
int ret = 0;
int i;
/* Level sanity check */
if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
root_level < cur_level) {
btrfs_err_rl(fs_info,
"%s: bad levels, cur_level=%d root_level=%d",
__func__, cur_level, root_level);
return -EUCLEAN;
}
/* Read the tree block if needed */
if (dst_path->nodes[cur_level] == NULL) {
int parent_slot;
u64 child_gen;
/*
* dst_path->nodes[root_level] must be initialized before
* calling this function.
*/
if (cur_level == root_level) {
btrfs_err_rl(fs_info,
"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
__func__, root_level, root_level, cur_level);
return -EUCLEAN;
}
/*
* We need to get child blockptr/gen from parent before we can
* read it.
*/
eb = dst_path->nodes[cur_level + 1];
parent_slot = dst_path->slots[cur_level + 1];
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
/* This node is old, no need to trace */
if (child_gen < last_snapshot)
goto out;
eb = btrfs_read_node_slot(eb, parent_slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
}
dst_path->nodes[cur_level] = eb;
dst_path->slots[cur_level] = 0;
btrfs_tree_read_lock(eb);
dst_path->locks[cur_level] = BTRFS_READ_LOCK;
need_cleanup = true;
}
/* Now record this tree block and its counter part for qgroups */
ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
root_level, trace_leaf);
if (ret < 0)
goto cleanup;
eb = dst_path->nodes[cur_level];
if (cur_level > 0) {
/* Iterate all child tree blocks */
for (i = 0; i < btrfs_header_nritems(eb); i++) {
/* Skip old tree blocks as they won't be swapped */
if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
continue;
dst_path->slots[cur_level] = i;
/* Recursive call (at most 7 times) */
ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
dst_path, cur_level - 1, root_level,
last_snapshot, trace_leaf);
if (ret < 0)
goto cleanup;
}
}
cleanup:
if (need_cleanup) {
/* Clean up */
btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
dst_path->locks[cur_level]);
free_extent_buffer(dst_path->nodes[cur_level]);
dst_path->nodes[cur_level] = NULL;
dst_path->slots[cur_level] = 0;
dst_path->locks[cur_level] = 0;
}
out:
return ret;
}
static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
struct extent_buffer *src_eb,
struct extent_buffer *dst_eb,
u64 last_snapshot, bool trace_leaf)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *dst_path = NULL;
int level;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
/* Wrong parameter order */
if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { btrfs_err_rl(fs_info,
"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
btrfs_header_generation(src_eb),
btrfs_header_generation(dst_eb));
return -EUCLEAN;
}
if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
ret = -EIO;
goto out;
}
level = btrfs_header_level(dst_eb);
dst_path = btrfs_alloc_path();
if (!dst_path) {
ret = -ENOMEM;
goto out;
}
/* For dst_path */
atomic_inc(&dst_eb->refs);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
/* Do the generation aware breadth-first search */
ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
level, last_snapshot, trace_leaf);
if (ret < 0)
goto out;
ret = 0;
out:
btrfs_free_path(dst_path);
if (ret < 0)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
return ret;
}
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret = 0;
int level;
struct extent_buffer *eb = root_eb;
struct btrfs_path *path = NULL;
BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
BUG_ON(root_eb == NULL);
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
if (!extent_buffer_uptodate(root_eb)) {
ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
if (ret)
goto out;
}
if (root_level == 0) {
ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
goto out;
}
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/*
* Walk down the tree. Missing extent blocks are filled in as
* we go. Metadata is accounted every time we read a new
* extent block.
*
* When we reach a leaf, we account for file extent items in it,
* walk back up the tree (adjusting slot pointers as we go)
* and restart the search process.
*/
atomic_inc(&root_eb->refs); /* For path */
path->nodes[root_level] = root_eb;
path->slots[root_level] = 0;
path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
walk_down:
level = root_level;
while (level >= 0) {
if (path->nodes[level] == NULL) {
int parent_slot;
u64 child_bytenr;
/*
* We need to get child blockptr from parent before we
* can read it.
*/
eb = path->nodes[level + 1];
parent_slot = path->slots[level + 1];
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
eb = btrfs_read_node_slot(eb, parent_slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
}
path->nodes[level] = eb;
path->slots[level] = 0;
btrfs_tree_read_lock(eb);
path->locks[level] = BTRFS_READ_LOCK;
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
fs_info->nodesize,
GFP_NOFS);
if (ret)
goto out;
}
if (level == 0) {
ret = btrfs_qgroup_trace_leaf_items(trans,
path->nodes[level]);
if (ret)
goto out;
/* Nonzero return here means we completed our search */
ret = adjust_slots_upwards(path, root_level);
if (ret)
break;
/* Restart search with new slots */
goto walk_down;
}
level--;
}
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
#define UPDATE_NEW 0
#define UPDATE_OLD 1
/*
* Walk all of the roots that points to the bytenr and adjust their refcnts.
*/
static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
struct ulist *roots, struct ulist *tmp,
struct ulist *qgroups, u64 seq, int update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
struct ulist_node *tmp_unode;
struct ulist_iterator tmp_uiter;
struct btrfs_qgroup *qg;
int ret = 0;
if (!roots)
return 0;
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(roots, &uiter))) {
qg = find_qgroup_rb(fs_info, unode->val);
if (!qg)
continue;
ulist_reinit(tmp);
ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
GFP_ATOMIC);
if (ret < 0)
return ret;
ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
if (ret < 0)
return ret;
ULIST_ITER_INIT(&tmp_uiter);
while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
struct btrfs_qgroup_list *glist;
qg = unode_aux_to_qgroup(tmp_unode);
if (update_old)
btrfs_qgroup_update_old_refcnt(qg, seq, 1);
else
btrfs_qgroup_update_new_refcnt(qg, seq, 1);
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(qgroups, glist->group->qgroupid,
qgroup_to_aux(glist->group),
GFP_ATOMIC);
if (ret < 0)
return ret;
ret = ulist_add(tmp, glist->group->qgroupid,
qgroup_to_aux(glist->group),
GFP_ATOMIC);
if (ret < 0)
return ret;
}
}
}
return 0;
}
/*
* Update qgroup rfer/excl counters.
* Rfer update is easy, codes can explain themselves.
*
* Excl update is tricky, the update is split into 2 parts.
* Part 1: Possible exclusive <-> sharing detect:
* | A | !A |
* -------------------------------------
* B | * | - |
* -------------------------------------
* !B | + | ** |
* -------------------------------------
*
* Conditions:
* A: cur_old_roots < nr_old_roots (not exclusive before)
* !A: cur_old_roots == nr_old_roots (possible exclusive before)
* B: cur_new_roots < nr_new_roots (not exclusive now)
* !B: cur_new_roots == nr_new_roots (possible exclusive now)
*
* Results:
* +: Possible sharing -> exclusive -: Possible exclusive -> sharing
* *: Definitely not changed. **: Possible unchanged.
*
* For !A and !B condition, the exception is cur_old/new_roots == 0 case.
*
* To make the logic clear, we first use condition A and B to split
* combination into 4 results.
*
* Then, for result "+" and "-", check old/new_roots == 0 case, as in them
* only on variant maybe 0.
*
* Lastly, check result **, since there are 2 variants maybe 0, split them
* again(2x2).
* But this time we don't need to consider other things, the codes and logic
* is easy to understand now.
*/
static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
struct ulist *qgroups,
u64 nr_old_roots,
u64 nr_new_roots,
u64 num_bytes, u64 seq)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
struct btrfs_qgroup *qg;
u64 cur_new_count, cur_old_count;
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(qgroups, &uiter))) {
bool dirty = false;
qg = unode_aux_to_qgroup(unode);
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
trace_qgroup_update_counters(fs_info, qg, cur_old_count,
cur_new_count);
/* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) {
qg->rfer += num_bytes;
qg->rfer_cmpr += num_bytes;
dirty = true;
}
if (cur_old_count > 0 && cur_new_count == 0) {
qg->rfer -= num_bytes;
qg->rfer_cmpr -= num_bytes;
dirty = true;
}
/* Excl update part */
/* Exclusive/none -> shared case */
if (cur_old_count == nr_old_roots &&
cur_new_count < nr_new_roots) {
/* Exclusive -> shared */
if (cur_old_count != 0) {
qg->excl -= num_bytes;
qg->excl_cmpr -= num_bytes;
dirty = true;
}
}
/* Shared -> exclusive/none case */
if (cur_old_count < nr_old_roots &&
cur_new_count == nr_new_roots) {
/* Shared->exclusive */
if (cur_new_count != 0) {
qg->excl += num_bytes;
qg->excl_cmpr += num_bytes;
dirty = true;
}
}
/* Exclusive/none -> exclusive/none case */
if (cur_old_count == nr_old_roots &&
cur_new_count == nr_new_roots) {
if (cur_old_count == 0) {
/* None -> exclusive/none */
if (cur_new_count != 0) {
/* None -> exclusive */
qg->excl += num_bytes;
qg->excl_cmpr += num_bytes;
dirty = true;
}
/* None -> none, nothing changed */
} else {
/* Exclusive -> exclusive/none */
if (cur_new_count == 0) {
/* Exclusive -> none */
qg->excl -= num_bytes;
qg->excl_cmpr -= num_bytes;
dirty = true;
}
/* Exclusive -> exclusive, nothing changed */
}
}
if (dirty)
qgroup_dirty(fs_info, qg);
}
return 0;
}
/*
* Check if the @roots potentially is a list of fs tree roots
*
* Return 0 for definitely not a fs/subvol tree roots ulist
* Return 1 for possible fs/subvol tree roots in the list (considering an empty
* one as well)
*/
static int maybe_fs_roots(struct ulist *roots)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
/* Empty one, still possible for fs roots */
if (!roots || roots->nnodes == 0)
return 1;
ULIST_ITER_INIT(&uiter);
unode = ulist_next(roots, &uiter);
if (!unode)
return 1;
/*
* If it contains fs tree roots, then it must belong to fs/subvol
* trees.
* If it contains a non-fs tree, it won't be shared with fs/subvol trees.
*/
return is_fstree(unode->val);
}
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, struct ulist *old_roots,
struct ulist *new_roots)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct ulist *qgroups = NULL;
struct ulist *tmp = NULL;
u64 seq;
u64 nr_new_roots = 0;
u64 nr_old_roots = 0;
int ret = 0;
/*
* If quotas get disabled meanwhile, the resources need to be freed and
* we can't just exit here.
*/
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
goto out_free;
if (new_roots) {
if (!maybe_fs_roots(new_roots))
goto out_free;
nr_new_roots = new_roots->nnodes;
}
if (old_roots) {
if (!maybe_fs_roots(old_roots))
goto out_free;
nr_old_roots = old_roots->nnodes;
}
/* Quick exit, either not fs tree roots, or won't affect any qgroup */
if (nr_old_roots == 0 && nr_new_roots == 0)
goto out_free;
BUG_ON(!fs_info->quota_root);
trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
num_bytes, nr_old_roots, nr_new_roots);
qgroups = ulist_alloc(GFP_NOFS);
if (!qgroups) {
ret = -ENOMEM;
goto out_free;
}
tmp = ulist_alloc(GFP_NOFS);
if (!tmp) {
ret = -ENOMEM;
goto out_free;
}
mutex_lock(&fs_info->qgroup_rescan_lock);
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
mutex_unlock(&fs_info->qgroup_rescan_lock);
ret = 0;
goto out_free;
}
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
spin_lock(&fs_info->qgroup_lock);
seq = fs_info->qgroup_seq;
/* Update old refcnts using old_roots */
ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
UPDATE_OLD);
if (ret < 0)
goto out;
/* Update new refcnts using new_roots */
ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
UPDATE_NEW);
if (ret < 0)
goto out;
qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
num_bytes, seq);
/*
* Bump qgroup_seq to avoid seq overlap
*/
fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
out:
spin_unlock(&fs_info->qgroup_lock);
out_free:
ulist_free(tmp);
ulist_free(qgroups);
ulist_free(old_roots);
ulist_free(new_roots);
return ret;
}
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record;
struct btrfs_delayed_ref_root *delayed_refs;
struct ulist *new_roots = NULL;
struct rb_node *node;
u64 num_dirty_extents = 0;
u64 qgroup_to_skip;
int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
qgroup_to_skip = delayed_refs->qgroup_to_skip;
while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
record = rb_entry(node, struct btrfs_qgroup_extent_record,
node);
num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
if (!ret) {
/*
* Old roots should be searched when inserting qgroup
* extent record
*/
if (WARN_ON(!record->old_roots)) {
/* Search commit root to find old_roots */
ret = btrfs_find_all_roots(NULL, fs_info,
record->bytenr, 0,
&record->old_roots, false);
if (ret < 0)
goto cleanup;
}
/* Free the reserved data space */
btrfs_qgroup_free_refroot(fs_info,
record->data_rsv_refroot,
record->data_rsv,
BTRFS_QGROUP_RSV_DATA);
/*
* Use BTRFS_SEQ_LAST as time_seq to do special search,
* which doesn't lock tree or delayed_refs and search
* current root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
if (ret < 0)
goto cleanup;
if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0);
ulist_del(record->old_roots, qgroup_to_skip,
0);
}
ret = btrfs_qgroup_account_extent(trans, record->bytenr,
record->num_bytes,
record->old_roots,
new_roots);
record->old_roots = NULL;
new_roots = NULL;
}
cleanup:
ulist_free(record->old_roots);
ulist_free(new_roots);
new_roots = NULL;
rb_erase(node, &delayed_refs->dirty_extent_root);
kfree(record);
}
trace_qgroup_num_dirty_extents(fs_info, trans->transid,
num_dirty_extents);
return ret;
}
/*
* called from commit_transaction. Writes all changed qgroups to disk.
*/
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret = 0;
if (!fs_info->quota_root)
return ret;
spin_lock(&fs_info->qgroup_lock);
while (!list_empty(&fs_info->dirty_qgroups)) {
struct btrfs_qgroup *qgroup;
qgroup = list_first_entry(&fs_info->dirty_qgroups,
struct btrfs_qgroup, dirty);
list_del_init(&qgroup->dirty);
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_info_item(trans, qgroup);
if (ret)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
ret = update_qgroup_limit_item(trans, qgroup);
if (ret)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
spin_lock(&fs_info->qgroup_lock);
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
else
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_status_item(trans);
if (ret)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
return ret;
}
/*
* Copy the accounting information between qgroups. This is necessary
* when a snapshot or a subvolume is created. Throwing an error will
* cause a transaction abort so we take extra care here to only error
* when a readonly fs is a reasonable outcome.
*/
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
u64 objectid, struct btrfs_qgroup_inherit *inherit)
{
int ret = 0;
int i;
u64 *i_qgroups;
bool committing = false;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
bool need_rescan = false;
u32 level_size = 0;
u64 nums;
/*
* There are only two callers of this function.
*
* One in create_subvol() in the ioctl context, which needs to hold
* the qgroup_ioctl_lock.
*
* The other one in create_pending_snapshot() where no other qgroup
* code can modify the fs as they all need to either start a new trans
* or hold a trans handler, thus we don't need to hold
* qgroup_ioctl_lock.
* This would avoid long and complex lock chain and make lockdep happy.
*/
spin_lock(&fs_info->trans_lock);
if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
committing = true;
spin_unlock(&fs_info->trans_lock);
if (!committing)
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
goto out;
quota_root = fs_info->quota_root;
if (!quota_root) {
ret = -EINVAL;
goto out;
}
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2 * inherit->num_excl_copies;
for (i = 0; i < nums; ++i) {
srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
/*
* Zero out invalid groups so we can ignore
* them later.
*/
if (!srcgroup ||
((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
*i_qgroups = 0ULL;
++i_qgroups;
}
}
/*
* create a tracking group for the subvol itself
*/
ret = add_qgroup_item(trans, quota_root, objectid);
if (ret)
goto out;
/*
* add qgroup to all inherited groups
*/
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
if (*i_qgroups == 0)
continue;
ret = add_qgroup_relation_item(trans, objectid,
*i_qgroups);
if (ret && ret != -EEXIST)
goto out;
ret = add_qgroup_relation_item(trans, *i_qgroups,
objectid);
if (ret && ret != -EEXIST)
goto out;
}
ret = 0;
}
spin_lock(&fs_info->qgroup_lock);
dstgroup = add_qgroup_rb(fs_info, objectid);
if (IS_ERR(dstgroup)) {
ret = PTR_ERR(dstgroup);
goto unlock;
}
if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
dstgroup->lim_flags = inherit->lim.flags;
dstgroup->max_rfer = inherit->lim.max_rfer;
dstgroup->max_excl = inherit->lim.max_excl;
dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
dstgroup->rsv_excl = inherit->lim.rsv_excl;
ret = update_qgroup_limit_item(trans, dstgroup);
if (ret) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
btrfs_info(fs_info,
"unable to update quota limit for %llu",
dstgroup->qgroupid);
goto unlock;
}
}
if (srcid) {
srcgroup = find_qgroup_rb(fs_info, srcid);
if (!srcgroup)
goto unlock;
/*
* We call inherit after we clone the root in order to make sure
* our counts don't go crazy, so at this point the only
* difference between the two roots should be the root node.
*/
level_size = fs_info->nodesize;
dstgroup->rfer = srcgroup->rfer;
dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
dstgroup->excl = level_size;
dstgroup->excl_cmpr = level_size;
srcgroup->excl = level_size;
srcgroup->excl_cmpr = level_size;
/* inherit the limit info */
dstgroup->lim_flags = srcgroup->lim_flags;
dstgroup->max_rfer = srcgroup->max_rfer;
dstgroup->max_excl = srcgroup->max_excl;
dstgroup->rsv_rfer = srcgroup->rsv_rfer;
dstgroup->rsv_excl = srcgroup->rsv_excl;
qgroup_dirty(fs_info, dstgroup);
qgroup_dirty(fs_info, srcgroup);
}
if (!inherit)
goto unlock;
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) {
if (*i_qgroups) {
ret = add_relation_rb(fs_info, objectid, *i_qgroups);
if (ret)
goto unlock;
}
++i_qgroups;
/*
* If we're doing a snapshot, and adding the snapshot to a new
* qgroup, the numbers are guaranteed to be incorrect.
*/
if (srcid)
need_rescan = true;
}
for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
if (!i_qgroups[0] || !i_qgroups[1])
continue;
src = find_qgroup_rb(fs_info, i_qgroups[0]);
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
if (!src || !dst) {
ret = -EINVAL;
goto unlock;
}
dst->rfer = src->rfer - level_size;
dst->rfer_cmpr = src->rfer_cmpr - level_size;
/* Manually tweaking numbers certainly needs a rescan */
need_rescan = true;
}
for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
if (!i_qgroups[0] || !i_qgroups[1])
continue;
src = find_qgroup_rb(fs_info, i_qgroups[0]);
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
if (!src || !dst) {
ret = -EINVAL;
goto unlock;
}
dst->excl = src->excl + level_size;
dst->excl_cmpr = src->excl_cmpr + level_size;
need_rescan = true;
}
unlock:
spin_unlock(&fs_info->qgroup_lock);
if (!ret)
ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (need_rescan)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
return ret;
}
static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
{
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
return false;
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
return false;
return true;
}
static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
enum btrfs_qgroup_rsv_type type)
{
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
int ret = 0;
struct ulist_node *unode;
struct ulist_iterator uiter;
if (!is_fstree(ref_root))
return 0;
if (num_bytes == 0)
return 0;
if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
capable(CAP_SYS_RESOURCE))
enforce = false;
spin_lock(&fs_info->qgroup_lock);
if (!fs_info->quota_root)
goto out;
qgroup = find_qgroup_rb(fs_info, ref_root);
if (!qgroup)
goto out;
/*
* in a first step, we check all affected qgroups if any limits would
* be exceeded
*/
ulist_reinit(fs_info->qgroup_ulist);
ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
qgroup_to_aux(qgroup), GFP_ATOMIC);
if (ret < 0)
goto out;
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
qg = unode_aux_to_qgroup(unode);
if (enforce && !qgroup_check_limits(qg, num_bytes)) {
ret = -EDQUOT;
goto out;
}
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(fs_info->qgroup_ulist,
glist->group->qgroupid,
qgroup_to_aux(glist->group), GFP_ATOMIC);
if (ret < 0)
goto out;
}
}
ret = 0;
/*
* no limits exceeded, now record the reservation into all qgroups
*/
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
qg = unode_aux_to_qgroup(unode);
qgroup_rsv_add(fs_info, qg, num_bytes, type);
}
out:
spin_unlock(&fs_info->qgroup_lock);
return ret;
}
/*
* Free @num_bytes of reserved space with @type for qgroup. (Normally level 0
* qgroup).
*
* Will handle all higher level qgroup too.
*
* NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
* This special case is only used for META_PERTRANS type.
*/
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
struct btrfs_qgroup *qgroup;
struct ulist_node *unode;
struct ulist_iterator uiter;
int ret = 0;
if (!is_fstree(ref_root))
return;
if (num_bytes == 0)
return;
if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
WARN(1, "%s: Invalid type to free", __func__);
return;
}
spin_lock(&fs_info->qgroup_lock);
if (!fs_info->quota_root)
goto out;
qgroup = find_qgroup_rb(fs_info, ref_root);
if (!qgroup)
goto out;
if (num_bytes == (u64)-1)
/*
* We're freeing all pertrans rsv, get reserved value from
* level 0 qgroup as real num_bytes to free.
*/
num_bytes = qgroup->rsv.values[type];
ulist_reinit(fs_info->qgroup_ulist);
ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
qgroup_to_aux(qgroup), GFP_ATOMIC);
if (ret < 0)
goto out;
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
qg = unode_aux_to_qgroup(unode);
qgroup_rsv_release(fs_info, qg, num_bytes, type);
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(fs_info->qgroup_ulist,
glist->group->qgroupid,
qgroup_to_aux(glist->group), GFP_ATOMIC);
if (ret < 0)
goto out;
}
}
out:
spin_unlock(&fs_info->qgroup_lock);
}
/*
* Check if the leaf is the last leaf. Which means all node pointers
* are at their last position.
*/
static bool is_last_leaf(struct btrfs_path *path)
{
int i;
for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
return false;
}
return true;
}
/*
* returns < 0 on error, 0 when more leafs are to be scanned.
* returns 1 when done.
*/
static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_key found;
struct extent_buffer *scratch_leaf = NULL;
struct ulist *roots = NULL;
u64 num_bytes;
bool done;
int slot;
int ret;
mutex_lock(&fs_info->qgroup_rescan_lock);
ret = btrfs_search_slot_for_read(fs_info->extent_root,
&fs_info->qgroup_rescan_progress,
path, 1, 0);
btrfs_debug(fs_info,
"current progress key (%llu %u %llu), search_slot ret %d",
fs_info->qgroup_rescan_progress.objectid,
fs_info->qgroup_rescan_progress.type,
fs_info->qgroup_rescan_progress.offset, ret);
if (ret) {
/*
* The rescan is about to end, we will not be scanning any
* further blocks. We cannot unset the RESCAN flag here, because
* we want to commit the transaction if everything went well.
* To make the live accounting work in this phase, we set our
* scan progress pointer such that every real extent objectid
* will be smaller.
*/
fs_info->qgroup_rescan_progress.objectid = (u64)-1;
btrfs_release_path(path);
mutex_unlock(&fs_info->qgroup_rescan_lock);
return ret;
}
done = is_last_leaf(path);
btrfs_item_key_to_cpu(path->nodes[0], &found,
btrfs_header_nritems(path->nodes[0]) - 1);
fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
if (!scratch_leaf) {
ret = -ENOMEM;
mutex_unlock(&fs_info->qgroup_rescan_lock);
goto out;
}
slot = path->slots[0];
btrfs_release_path(path);
mutex_unlock(&fs_info->qgroup_rescan_lock);
for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
if (found.type != BTRFS_EXTENT_ITEM_KEY &&
found.type != BTRFS_METADATA_ITEM_KEY)
continue;
if (found.type == BTRFS_METADATA_ITEM_KEY)
num_bytes = fs_info->nodesize;
else
num_bytes = found.offset;
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
&roots, false);
if (ret < 0)
goto out;
/* For rescan, just pass old_roots as NULL */
ret = btrfs_qgroup_account_extent(trans, found.objectid,
num_bytes, NULL, roots);
if (ret < 0)
goto out;
}
out:
if (scratch_leaf)
free_extent_buffer(scratch_leaf);
if (done && !ret) {
ret = 1;
fs_info->qgroup_rescan_progress.objectid = (u64)-1;
}
return ret;
}
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
return btrfs_fs_closing(fs_info) ||
test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
{
struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
int err = -ENOMEM;
int ret = 0;
bool stopped = false;
path = btrfs_alloc_path();
if (!path)
goto out;
/*
* Rescan should only search for commit root, and any later difference
* should be recorded by qgroup
*/
path->search_commit_root = 1;
path->skip_locking = 1;
err = 0;
while (!err && !(stopped = rescan_should_stop(fs_info))) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
break;
}
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
err = -EINTR;
} else {
err = qgroup_rescan_leaf(trans, path);
}
if (err > 0)
btrfs_commit_transaction(trans);
else
btrfs_end_transaction(trans);
}
out:
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
} else if (err < 0) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
/*
* only update status, since the previous part has already updated the
* qgroup info.
*/
trans = btrfs_start_transaction(fs_info->quota_root, 1);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
trans = NULL;
btrfs_err(fs_info,
"fail to start transaction for status update: %d",
err);
}
mutex_lock(&fs_info->qgroup_rescan_lock);
if (!stopped)
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (trans) {
ret = update_qgroup_status_item(trans);
if (ret < 0) {
err = ret;
btrfs_err(fs_info, "fail to update qgroup status: %d",
err);
}
}
fs_info->qgroup_rescan_running = false;
complete_all(&fs_info->qgroup_rescan_completion);
mutex_unlock(&fs_info->qgroup_rescan_lock);
if (!trans)
return;
btrfs_end_transaction(trans);
if (stopped) {
btrfs_info(fs_info, "qgroup scan paused");
} else if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
err > 0 ? " (inconsistency flag cleared)" : "");
} else {
btrfs_err(fs_info, "qgroup scan failed with %d", err);
}
}
/*
* Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
* memory required for the rescan context.
*/
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags)
{
int ret = 0;
if (!init_flags) {
/* we're resuming qgroup rescan at mount time */
if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup rescan is not queued");
ret = -EINVAL;
} else if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
}
if (ret)
return ret;
}
mutex_lock(&fs_info->qgroup_rescan_lock);
if (init_flags) {
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
btrfs_warn(fs_info,
"qgroup rescan is already in progress");
ret = -EINPROGRESS;
} else if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
} else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
/* Quota disable is in progress */
ret = -EBUSY;
}
if (ret) {
mutex_unlock(&fs_info->qgroup_rescan_lock);
return ret;
}
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
}
memset(&fs_info->qgroup_rescan_progress, 0,
sizeof(fs_info->qgroup_rescan_progress));
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
init_completion(&fs_info->qgroup_rescan_completion);
mutex_unlock(&fs_info->qgroup_rescan_lock);
btrfs_init_work(&fs_info->qgroup_rescan_work,
btrfs_qgroup_rescan_worker, NULL, NULL);
return 0;
}
static void
qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
{
struct rb_node *n;
struct btrfs_qgroup *qgroup;
spin_lock(&fs_info->qgroup_lock);
/* clear all current qgroup tracking information */
for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
qgroup->rfer = 0;
qgroup->rfer_cmpr = 0;
qgroup->excl = 0;
qgroup->excl_cmpr = 0;
qgroup_dirty(fs_info, qgroup);
}
spin_unlock(&fs_info->qgroup_lock);
}
int
btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
{
int ret = 0;
struct btrfs_trans_handle *trans;
ret = qgroup_rescan_init(fs_info, 0, 1);
if (ret)
return ret;
/*
* We have set the rescan_progress to 0, which means no more
* delayed refs will be accounted by btrfs_qgroup_account_ref.
* However, btrfs_qgroup_account_ref may be right after its call
* to btrfs_find_all_roots, in which case it would still do the
* accounting.
* To solve this, we're committing the transaction, which will
* ensure we run all delayed refs and only after that, we are
* going to clear all tracking information for a clean start.
*/
trans = btrfs_join_transaction(fs_info->fs_root);
if (IS_ERR(trans)) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
return PTR_ERR(trans);
}
ret = btrfs_commit_transaction(trans);
if (ret) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
return ret;
}
qgroup_rescan_zero_tracking(fs_info);
mutex_lock(&fs_info->qgroup_rescan_lock);
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
mutex_unlock(&fs_info->qgroup_rescan_lock);
return 0;
}
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
bool interruptible)
{
int running;
int ret = 0;
mutex_lock(&fs_info->qgroup_rescan_lock);
running = fs_info->qgroup_rescan_running;
mutex_unlock(&fs_info->qgroup_rescan_lock);
if (!running)
return 0;
if (interruptible) ret = wait_for_completion_interruptible(
&fs_info->qgroup_rescan_completion);
else
wait_for_completion(&fs_info->qgroup_rescan_completion);
return ret;
}
/*
* this is only called from open_ctree where we're still single threaded, thus
* locking is omitted here.
*/
void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { mutex_lock(&fs_info->qgroup_rescan_lock);
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
mutex_unlock(&fs_info->qgroup_rescan_lock);
}
}
#define rbtree_iterate_from_safe(node, next, start) \
for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
static int qgroup_unreserve_range(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start,
u64 len)
{
struct rb_node *node;
struct rb_node *next;
struct ulist_node *entry;
int ret = 0;
node = reserved->range_changed.root.rb_node;
if (!node)
return 0;
while (node) { entry = rb_entry(node, struct ulist_node, rb_node);
if (entry->val < start)
node = node->rb_right;
else
node = node->rb_left;
}
if (entry->val > start && rb_prev(&entry->rb_node)) entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
rb_node);
rbtree_iterate_from_safe(node, next, &entry->rb_node) {
u64 entry_start;
u64 entry_end;
u64 entry_len;
int clear_ret;
entry = rb_entry(node, struct ulist_node, rb_node);
entry_start = entry->val;
entry_end = entry->aux;
entry_len = entry_end - entry_start + 1;
if (entry_start >= start + len)
break;
if (entry_start + entry_len <= start)
continue;
/*
* Now the entry is in [start, start + len), revert the
* EXTENT_QGROUP_RESERVED bit.
*/
clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
entry_end, EXTENT_QGROUP_RESERVED);
if (!ret && clear_ret < 0)
ret = clear_ret;
ulist_del(&reserved->range_changed, entry->val, entry->aux);
if (likely(reserved->bytes_changed >= entry_len)) {
reserved->bytes_changed -= entry_len;
} else {
WARN_ON(1);
reserved->bytes_changed = 0;
}
}
return ret;
}
/*
* Try to free some space for qgroup.
*
* For qgroup, there are only 3 ways to free qgroup space:
* - Flush nodatacow write
* Any nodatacow write will free its reserved data space at run_delalloc_range().
* In theory, we should only flush nodatacow inodes, but it's not yet
* possible, so we need to flush the whole root.
*
* - Wait for ordered extents
* When ordered extents are finished, their reserved metadata is finally
* converted to per_trans status, which can be freed by later commit
* transaction.
*
* - Commit transaction
* This would free the meta_per_trans space.
* In theory this shouldn't provide much space, but any more qgroup space
* is needed.
*/
static int try_flush_qgroup(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
int ret;
/* Can't hold an open transaction or we run the risk of deadlocking. */
ASSERT(current->journal_info == NULL);
if (WARN_ON(current->journal_info))
return 0;
/*
* We don't want to run flush again and again, so if there is a running
* one, we won't try to start a new flush, but exit directly.
*/
if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
wait_event(root->qgroup_flush_wait,
!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
return 0;
}
ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0)
goto out;
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
ret = btrfs_commit_transaction(trans);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
wake_up(&root->qgroup_flush_wait);
return ret;
}
static int qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved_ret, u64 start,
u64 len)
{
struct btrfs_root *root = inode->root;
struct extent_changeset *reserved;
bool new_reserved = false;
u64 orig_reserved;
u64 to_reserve;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
!is_fstree(root->root_key.objectid) || len == 0)
return 0;
/* @reserved parameter is mandatory for qgroup */
if (WARN_ON(!reserved_ret))
return -EINVAL;
if (!*reserved_ret) {
new_reserved = true;
*reserved_ret = extent_changeset_alloc();
if (!*reserved_ret)
return -ENOMEM;
}
reserved = *reserved_ret;
/* Record already reserved space */
orig_reserved = reserved->bytes_changed;
ret = set_record_extent_bits(&inode->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, reserved);
/* Newly reserved space */
to_reserve = reserved->bytes_changed - orig_reserved;
trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
to_reserve, QGROUP_RESERVE);
if (ret < 0)
goto out;
ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
if (ret < 0)
goto cleanup;
return ret;
cleanup:
qgroup_unreserve_range(inode, reserved, start, len);
out:
if (new_reserved) {
extent_changeset_free(reserved);
*reserved_ret = NULL;
}
return ret;
}
/*
* Reserve qgroup space for range [start, start + len).
*
* This function will either reserve space from related qgroups or do nothing
* if the range is already reserved.
*
* Return 0 for successful reservation
* Return <0 for error (including -EQUOT)
*
* NOTE: This function may sleep for memory allocation, dirty page flushing and
* commit transaction. So caller should not hold any dirty page locked.
*/
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved_ret, u64 start,
u64 len)
{
int ret;
ret = qgroup_reserve_data(inode, reserved_ret, start, len); if (ret <= 0 && ret != -EDQUOT)
return ret;
ret = try_flush_qgroup(inode->root);
if (ret < 0)
return ret;
return qgroup_reserve_data(inode, reserved_ret, start, len);
}
/* Free ranges specified by @reserved, normally in error path */
static int qgroup_free_reserved_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
struct btrfs_root *root = inode->root;
struct ulist_node *unode;
struct ulist_iterator uiter;
struct extent_changeset changeset;
int freed = 0;
int ret;
extent_changeset_init(&changeset);
len = round_up(start + len, root->fs_info->sectorsize);
start = round_down(start, root->fs_info->sectorsize);
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(&reserved->range_changed, &uiter))) { u64 range_start = unode->val;
/* unode->aux is the inclusive end */
u64 range_len = unode->aux - range_start + 1;
u64 free_start;
u64 free_len;
extent_changeset_release(&changeset);
/* Only free range in range [start, start + len) */
if (range_start >= start + len ||
range_start + range_len <= start)
continue;
free_start = max(range_start, start);
free_len = min(start + len, range_start + range_len) -
free_start;
/*
* TODO: To also modify reserved->ranges_reserved to reflect
* the modification.
*
* However as long as we free qgroup reserved according to
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
ret = clear_record_extent_bits(&inode->io_tree, free_start,
free_start + free_len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
freed += changeset.bytes_changed;
}
btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
BTRFS_QGROUP_RSV_DATA);
ret = freed;
out:
extent_changeset_release(&changeset);
return ret;
}
static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len,
int free)
{
struct extent_changeset changeset;
int trace_op = QGROUP_RELEASE;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
return 0;
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
if (free && reserved)
return qgroup_free_reserved_data(inode, reserved, start, len);
extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
if (free)
trace_op = QGROUP_FREE;
trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
changeset.bytes_changed, trace_op);
if (free)
btrfs_qgroup_free_refroot(inode->root->fs_info,
inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
ret = changeset.bytes_changed;
out:
extent_changeset_release(&changeset);
return ret;
}
/*
* Free a reserved space range from io_tree and related qgroups
*
* Should be called when a range of pages get invalidated before reaching disk.
* Or for error cleanup case.
* if @reserved is given, only reserved range in [@start, @start + @len) will
* be freed.
*
* For data written to disk, use btrfs_qgroup_release_data().
*
* NOTE: This function may sleep for memory allocation.
*/
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
}
/*
* Release a reserved space range from io_tree only.
*
* Should be called when a range of pages get written to disk and corresponding
* FILE_EXTENT is inserted into corresponding root.
*
* Since new qgroup accounting framework will only update qgroup numbers at
* commit_transaction() time, its reserved space shouldn't be freed from
* related qgroups.
*
* But we should release the range from io_tree, to allow further write to be
* COWed.
*
* NOTE: This function may sleep for memory allocation.
*/
int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
{
return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
}
static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type)
{
if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
type != BTRFS_QGROUP_RSV_META_PERTRANS)
return;
if (num_bytes == 0)
return;
spin_lock(&root->qgroup_meta_rsv_lock);
if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
root->qgroup_meta_rsv_prealloc += num_bytes;
else
root->qgroup_meta_rsv_pertrans += num_bytes;
spin_unlock(&root->qgroup_meta_rsv_lock);
}
static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type)
{
if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
type != BTRFS_QGROUP_RSV_META_PERTRANS)
return 0;
if (num_bytes == 0)
return 0;
spin_lock(&root->qgroup_meta_rsv_lock);
if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
num_bytes);
root->qgroup_meta_rsv_prealloc -= num_bytes;
} else {
num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
num_bytes);
root->qgroup_meta_rsv_pertrans -= num_bytes;
}
spin_unlock(&root->qgroup_meta_rsv_lock);
return num_bytes;
}
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
!is_fstree(root->root_key.objectid) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
ret = qgroup_reserve(root, num_bytes, enforce, type);
if (ret < 0)
return ret;
/*
* Record what we have reserved into root.
*
* To avoid quota disabled->enabled underflow.
* In that case, we may try to free space we haven't reserved
* (since quota was disabled), so record what we reserved into root.
* And ensure later release won't underflow this number.
*/
add_root_meta_rsv(root, num_bytes, type);
return ret;
}
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce)
{
int ret;
ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); if (ret <= 0 && ret != -EDQUOT)
return ret;
ret = try_flush_qgroup(root);
if (ret < 0)
return ret;
return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
!is_fstree(root->root_key.objectid))
return;
/* TODO: Update trace point to handle such free */
trace_qgroup_meta_free_all_pertrans(root);
/* Special value -1 means to free all reserved space */
btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
BTRFS_QGROUP_RSV_META_PERTRANS);
}
void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type)
{
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
!is_fstree(root->root_key.objectid))
return;
/*
* reservation for META_PREALLOC can happen before quota is enabled,
* which can lead to underflow.
* Here ensure we will only free what we really have reserved.
*/
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
num_bytes, type);
}
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
int num_bytes)
{
struct btrfs_qgroup *qgroup;
struct ulist_node *unode;
struct ulist_iterator uiter;
int ret = 0;
if (num_bytes == 0) return; if (!fs_info->quota_root)
return;
spin_lock(&fs_info->qgroup_lock);
qgroup = find_qgroup_rb(fs_info, ref_root);
if (!qgroup)
goto out;
ulist_reinit(fs_info->qgroup_ulist);
ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
qgroup_to_aux(qgroup), GFP_ATOMIC);
if (ret < 0)
goto out;
ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
qg = unode_aux_to_qgroup(unode);
qgroup_rsv_release(fs_info, qg, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
qgroup_rsv_add(fs_info, qg, num_bytes,
BTRFS_QGROUP_RSV_META_PERTRANS);
list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(fs_info->qgroup_ulist,
glist->group->qgroupid,
qgroup_to_aux(glist->group), GFP_ATOMIC);
if (ret < 0)
goto out;
}
}
out:
spin_unlock(&fs_info->qgroup_lock);
}
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
!is_fstree(root->root_key.objectid))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
trace_qgroup_meta_convert(root, num_bytes);
qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
}
/*
* Check qgroup reserved space leaking, normally at destroy inode
* time
*/
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
struct extent_changeset changeset;
struct ulist_node *unode;
struct ulist_iterator iter;
int ret;
extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
EXTENT_QGROUP_RESERVED, &changeset);
WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) {
ULIST_ITER_INIT(&iter);
while ((unode = ulist_next(&changeset.range_changed, &iter))) {
btrfs_warn(inode->root->fs_info,
"leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
btrfs_ino(inode), unode->val, unode->aux);
}
btrfs_qgroup_free_refroot(inode->root->fs_info,
inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
extent_changeset_release(&changeset);
}
void btrfs_qgroup_init_swapped_blocks(
struct btrfs_qgroup_swapped_blocks *swapped_blocks)
{
int i;
spin_lock_init(&swapped_blocks->lock);
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
swapped_blocks->blocks[i] = RB_ROOT; swapped_blocks->swapped = false;
}
/*
* Delete all swapped blocks record of @root.
* Every record here means we skipped a full subtree scan for qgroup.
*
* Gets called when committing one transaction.
*/
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
{
struct btrfs_qgroup_swapped_blocks *swapped_blocks;
int i;
swapped_blocks = &root->swapped_blocks;
spin_lock(&swapped_blocks->lock);
if (!swapped_blocks->swapped)
goto out;
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
struct rb_root *cur_root = &swapped_blocks->blocks[i];
struct btrfs_qgroup_swapped_block *entry;
struct btrfs_qgroup_swapped_block *next;
rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
node)
kfree(entry); swapped_blocks->blocks[i] = RB_ROOT;
}
swapped_blocks->swapped = false;
out:
spin_unlock(&swapped_blocks->lock);
}
/*
* Add subtree roots record into @subvol_root.
*
* @subvol_root: tree root of the subvolume tree get swapped
* @bg: block group under balance
* @subvol_parent/slot: pointer to the subtree root in subvolume tree
* @reloc_parent/slot: pointer to the subtree root in reloc tree
* BOTH POINTERS ARE BEFORE TREE SWAP
* @last_snapshot: last snapshot generation of the subvolume tree
*/
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *subvol_root,
struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
u64 last_snapshot)
{
struct btrfs_fs_info *fs_info = subvol_root->fs_info;
struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
struct btrfs_qgroup_swapped_block *block;
struct rb_node **cur;
struct rb_node *parent = NULL;
int level = btrfs_header_level(subvol_parent) - 1;
int ret = 0;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
btrfs_err_rl(fs_info,
"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
__func__,
btrfs_node_ptr_generation(subvol_parent, subvol_slot),
btrfs_node_ptr_generation(reloc_parent, reloc_slot));
return -EUCLEAN;
}
block = kmalloc(sizeof(*block), GFP_NOFS);
if (!block) {
ret = -ENOMEM;
goto out;
}
/*
* @reloc_parent/slot is still before swap, while @block is going to
* record the bytenr after swap, so we do the swap here.
*/
block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
reloc_slot);
block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
subvol_slot);
block->last_snapshot = last_snapshot;
block->level = level;
/*
* If we have bg == NULL, we're called from btrfs_recover_relocation(),
* no one else can modify tree blocks thus we qgroup will not change
* no matter the value of trace_leaf.
*/
if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
block->trace_leaf = true;
else
block->trace_leaf = false;
btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
/* Insert @block into @blocks */
spin_lock(&blocks->lock);
cur = &blocks->blocks[level].rb_node;
while (*cur) {
struct btrfs_qgroup_swapped_block *entry;
parent = *cur;
entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
node);
if (entry->subvol_bytenr < block->subvol_bytenr) {
cur = &(*cur)->rb_left;
} else if (entry->subvol_bytenr > block->subvol_bytenr) {
cur = &(*cur)->rb_right;
} else {
if (entry->subvol_generation !=
block->subvol_generation ||
entry->reloc_bytenr != block->reloc_bytenr ||
entry->reloc_generation !=
block->reloc_generation) {
/*
* Duplicated but mismatch entry found.
* Shouldn't happen.
*
* Marking qgroup inconsistent should be enough
* for end users.
*/
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
ret = -EEXIST;
}
kfree(block);
goto out_unlock;
}
}
rb_link_node(&block->node, parent, cur);
rb_insert_color(&block->node, &blocks->blocks[level]);
blocks->swapped = true;
out_unlock:
spin_unlock(&blocks->lock);
out:
if (ret < 0)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
return ret;
}
/*
* Check if the tree block is a subtree root, and if so do the needed
* delayed subtree trace for qgroup.
*
* This is called during btrfs_cow_block().
*/
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *subvol_eb)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
struct btrfs_qgroup_swapped_block *block;
struct extent_buffer *reloc_eb = NULL;
struct rb_node *node;
bool found = false;
bool swapped = false;
int level = btrfs_header_level(subvol_eb);
int ret = 0;
int i;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return 0;
if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
return 0;
spin_lock(&blocks->lock);
if (!blocks->swapped) {
spin_unlock(&blocks->lock);
return 0;
}
node = blocks->blocks[level].rb_node;
while (node) {
block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
if (block->subvol_bytenr < subvol_eb->start) { node = node->rb_left; } else if (block->subvol_bytenr > subvol_eb->start) { node = node->rb_right;
} else {
found = true;
break;
}
}
if (!found) {
spin_unlock(&blocks->lock);
goto out;
}
/* Found one, remove it from @blocks first and update blocks->swapped */
rb_erase(&block->node, &blocks->blocks[level]);
for (i = 0; i < BTRFS_MAX_LEVEL; i++) { if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
swapped = true;
break;
}
}
blocks->swapped = swapped;
spin_unlock(&blocks->lock);
/* Read out reloc subtree root */
reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
block->reloc_generation, block->level,
&block->first_key);
if (IS_ERR(reloc_eb)) {
ret = PTR_ERR(reloc_eb);
reloc_eb = NULL;
goto free_out;
}
if (!extent_buffer_uptodate(reloc_eb)) {
ret = -EIO;
goto free_out;
}
ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
block->last_snapshot, block->trace_leaf);
free_out:
kfree(block);
free_extent_buffer(reloc_eb);
out:
if (ret < 0) {
btrfs_err_rl(fs_info,
"failed to account subtree at bytenr %llu: %d",
subvol_eb->start, ret);
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
return ret;
}
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
{
struct btrfs_qgroup_extent_record *entry;
struct btrfs_qgroup_extent_record *next;
struct rb_root *root;
root = &trans->delayed_refs.dirty_extent_root;
rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
ulist_free(entry->old_roots);
kfree(entry);
}
}
// SPDX-License-Identifier: GPL-2.0
/*
* Disk events - monitor disk events like media change and eject request.
*/
#include <linux/export.h>
#include <linux/moduleparam.h>
#include <linux/genhd.h>
#include "blk.h"
struct disk_events {
struct list_head node; /* all disk_event's */
struct gendisk *disk; /* the associated disk */
spinlock_t lock;
struct mutex block_mutex; /* protects blocking */
int block; /* event blocking depth */
unsigned int pending; /* events already sent out */
unsigned int clearing; /* events being cleared */
long poll_msecs; /* interval, -1 for default */
struct delayed_work dwork;
};
static const char *disk_events_strs[] = {
[ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change",
[ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request",
};
static char *disk_uevents[] = {
[ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1",
[ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1",
};
/* list of all disk_events */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);
/* disable in-kernel polling by default */
static unsigned long disk_events_dfl_poll_msecs;
static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
{
struct disk_events *ev = disk->ev;
long intv_msecs = 0;
/*
* If device-specific poll interval is set, always use it. If
* the default is being used, poll if the POLL flag is set.
*/
if (ev->poll_msecs >= 0)
intv_msecs = ev->poll_msecs;
else if (disk->event_flags & DISK_EVENT_FLAG_POLL)
intv_msecs = disk_events_dfl_poll_msecs;
return msecs_to_jiffies(intv_msecs);
}
/**
* disk_block_events - block and flush disk event checking
* @disk: disk to block events for
*
* On return from this function, it is guaranteed that event checking
* isn't in progress and won't happen until unblocked by
* disk_unblock_events(). Events blocking is counted and the actual
* unblocking happens after the matching number of unblocks are done.
*
* Note that this intentionally does not block event checking from
* disk_clear_events().
*
* CONTEXT:
* Might sleep.
*/
void disk_block_events(struct gendisk *disk)
{
struct disk_events *ev = disk->ev;
unsigned long flags;
bool cancel;
if (!ev)
return;
/*
* Outer mutex ensures that the first blocker completes canceling
* the event work before further blockers are allowed to finish.
*/
mutex_lock(&ev->block_mutex);
spin_lock_irqsave(&ev->lock, flags);
cancel = !ev->block++;
spin_unlock_irqrestore(&ev->lock, flags);
if (cancel)
cancel_delayed_work_sync(&disk->ev->dwork); mutex_unlock(&ev->block_mutex);
}
static void __disk_unblock_events(struct gendisk *disk, bool check_now)
{
struct disk_events *ev = disk->ev;
unsigned long intv;
unsigned long flags;
spin_lock_irqsave(&ev->lock, flags);
if (WARN_ON_ONCE(ev->block <= 0))
goto out_unlock;
if (--ev->block)
goto out_unlock;
intv = disk_events_poll_jiffies(disk);
if (check_now)
queue_delayed_work(system_freezable_power_efficient_wq,
&ev->dwork, 0);
else if (intv)
queue_delayed_work(system_freezable_power_efficient_wq,
&ev->dwork, intv);
out_unlock:
spin_unlock_irqrestore(&ev->lock, flags);
}
/**
* disk_unblock_events - unblock disk event checking
* @disk: disk to unblock events for
*
* Undo disk_block_events(). When the block count reaches zero, it
* starts events polling if configured.
*
* CONTEXT:
* Don't care. Safe to call from irq context.
*/
void disk_unblock_events(struct gendisk *disk)
{
if (disk->ev) __disk_unblock_events(disk, false);
}
/**
* disk_flush_events - schedule immediate event checking and flushing
* @disk: disk to check and flush events for
* @mask: events to flush
*
* Schedule immediate event checking on @disk if not blocked. Events in
* @mask are scheduled to be cleared from the driver. Note that this
* doesn't clear the events from @disk->ev.
*
* CONTEXT:
* If @mask is non-zero must be called with disk->open_mutex held.
*/
void disk_flush_events(struct gendisk *disk, unsigned int mask)
{
struct disk_events *ev = disk->ev;
if (!ev)
return;
spin_lock_irq(&ev->lock);
ev->clearing |= mask;
if (!ev->block)
mod_delayed_work(system_freezable_power_efficient_wq,
&ev->dwork, 0);
spin_unlock_irq(&ev->lock);
}
/*
* Tell userland about new events. Only the events listed in @disk->events are
* reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are
* processed internally but never get reported to userland.
*/
static void disk_event_uevent(struct gendisk *disk, unsigned int events)
{
char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
int nr_events = 0, i;
for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
if (events & disk->events & (1 << i)) envp[nr_events++] = disk_uevents[i];
if (nr_events)
kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
}
static void disk_check_events(struct disk_events *ev,
unsigned int *clearing_ptr)
{
struct gendisk *disk = ev->disk;
unsigned int clearing = *clearing_ptr;
unsigned int events;
unsigned long intv;
/* check events */
events = disk->fops->check_events(disk, clearing);
/* accumulate pending events and schedule next poll if necessary */
spin_lock_irq(&ev->lock);
events &= ~ev->pending;
ev->pending |= events;
*clearing_ptr &= ~clearing;
intv = disk_events_poll_jiffies(disk);
if (!ev->block && intv)
queue_delayed_work(system_freezable_power_efficient_wq,
&ev->dwork, intv);
spin_unlock_irq(&ev->lock);
if (events & DISK_EVENT_MEDIA_CHANGE)
inc_diskseq(disk);
if (disk->event_flags & DISK_EVENT_FLAG_UEVENT)
disk_event_uevent(disk, events);
}
/**
* disk_clear_events - synchronously check, clear and return pending events
* @disk: disk to fetch and clear events from
* @mask: mask of events to be fetched and cleared
*
* Disk events are synchronously checked and pending events in @mask
* are cleared and returned. This ignores the block count.
*
* CONTEXT:
* Might sleep.
*/
static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
{
struct disk_events *ev = disk->ev;
unsigned int pending;
unsigned int clearing = mask;
if (!ev)
return 0;
disk_block_events(disk);
/*
* store the union of mask and ev->clearing on the stack so that the
* race with disk_flush_events does not cause ambiguity (ev->clearing
* can still be modified even if events are blocked).
*/
spin_lock_irq(&ev->lock);
clearing |= ev->clearing;
ev->clearing = 0;
spin_unlock_irq(&ev->lock);
disk_check_events(ev, &clearing);
/*
* if ev->clearing is not 0, the disk_flush_events got called in the
* middle of this function, so we want to run the workfn without delay.
*/
__disk_unblock_events(disk, ev->clearing ? true : false);
/* then, fetch and clear pending events */
spin_lock_irq(&ev->lock);
pending = ev->pending & mask;
ev->pending &= ~mask;
spin_unlock_irq(&ev->lock);
WARN_ON_ONCE(clearing & mask);
return pending;
}
/**
* bdev_check_media_change - check if a removable media has been changed
* @bdev: block device to check
*
* Check whether a removable media has been changed, and attempt to free all
* dentries and inodes and invalidates all block device page cache entries in
* that case.
*
* Returns %true if the block device changed, or %false if not.
*/
bool bdev_check_media_change(struct block_device *bdev)
{
unsigned int events;
events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE |
DISK_EVENT_EJECT_REQUEST);
if (!(events & DISK_EVENT_MEDIA_CHANGE))
return false;
if (__invalidate_device(bdev, true))
pr_warn("VFS: busy inodes on changed media %s\n",
bdev->bd_disk->disk_name);
set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
return true;
}
EXPORT_SYMBOL(bdev_check_media_change);
/**
* disk_force_media_change - force a media change event
* @disk: the disk which will raise the event
* @events: the events to raise
*
* Generate uevents for the disk. If DISK_EVENT_MEDIA_CHANGE is present,
* attempt to free all dentries and inodes and invalidates all block
* device page cache entries in that case.
*
* Returns %true if DISK_EVENT_MEDIA_CHANGE was raised, or %false if not.
*/
bool disk_force_media_change(struct gendisk *disk, unsigned int events)
{
disk_event_uevent(disk, events); if (!(events & DISK_EVENT_MEDIA_CHANGE))
return false;
if (__invalidate_device(disk->part0, true))
pr_warn("VFS: busy inodes on changed media %s\n",
disk->disk_name);
set_bit(GD_NEED_PART_SCAN, &disk->state); return true;
}
EXPORT_SYMBOL_GPL(disk_force_media_change);
/*
* Separate this part out so that a different pointer for clearing_ptr can be
* passed in for disk_clear_events.
*/
static void disk_events_workfn(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
disk_check_events(ev, &ev->clearing);
}
/*
* A disk events enabled device has the following sysfs nodes under
* its /sys/block/X/ directory.
*
* events : list of all supported events
* events_async : list of events which can be detected w/o polling
* (always empty, only for backwards compatibility)
* events_poll_msecs : polling interval, 0: disable, -1: system default
*/
static ssize_t __disk_events_show(unsigned int events, char *buf)
{
const char *delim = "";
ssize_t pos = 0;
int i;
for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
if (events & (1 << i)) {
pos += sprintf(buf + pos, "%s%s",
delim, disk_events_strs[i]);
delim = " ";
}
if (pos)
pos += sprintf(buf + pos, "\n");
return pos;
}
static ssize_t disk_events_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT))
return 0;
return __disk_events_show(disk->events, buf);
}
static ssize_t disk_events_async_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return 0;
}
static ssize_t disk_events_poll_msecs_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
if (!disk->ev)
return sprintf(buf, "-1\n");
return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
}
static ssize_t disk_events_poll_msecs_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct gendisk *disk = dev_to_disk(dev);
long intv;
if (!count || !sscanf(buf, "%ld", &intv))
return -EINVAL;
if (intv < 0 && intv != -1)
return -EINVAL;
if (!disk->ev)
return -ENODEV;
disk_block_events(disk);
disk->ev->poll_msecs = intv;
__disk_unblock_events(disk, true);
return count;
}
DEVICE_ATTR(events, 0444, disk_events_show, NULL);
DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show,
disk_events_poll_msecs_store);
/*
* The default polling interval can be specified by the kernel
* parameter block.events_dfl_poll_msecs which defaults to 0
* (disable). This can also be modified runtime by writing to
* /sys/module/block/parameters/events_dfl_poll_msecs.
*/
static int disk_events_set_dfl_poll_msecs(const char *val,
const struct kernel_param *kp)
{
struct disk_events *ev;
int ret;
ret = param_set_ulong(val, kp);
if (ret < 0)
return ret;
mutex_lock(&disk_events_mutex);
list_for_each_entry(ev, &disk_events, node)
disk_flush_events(ev->disk, 0);
mutex_unlock(&disk_events_mutex);
return 0;
}
static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
.set = disk_events_set_dfl_poll_msecs,
.get = param_get_ulong,
};
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "block."
module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
&disk_events_dfl_poll_msecs, 0644);
/*
* disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
*/
int disk_alloc_events(struct gendisk *disk)
{
struct disk_events *ev;
if (!disk->fops->check_events || !disk->events)
return 0;
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
if (!ev) {
pr_warn("%s: failed to initialize events\n", disk->disk_name);
return -ENOMEM;
}
INIT_LIST_HEAD(&ev->node);
ev->disk = disk;
spin_lock_init(&ev->lock);
mutex_init(&ev->block_mutex);
ev->block = 1;
ev->poll_msecs = -1;
INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
disk->ev = ev;
return 0;
}
void disk_add_events(struct gendisk *disk)
{
if (!disk->ev)
return;
mutex_lock(&disk_events_mutex);
list_add_tail(&disk->ev->node, &disk_events);
mutex_unlock(&disk_events_mutex);
/*
* Block count is initialized to 1 and the following initial
* unblock kicks it into action.
*/
__disk_unblock_events(disk, true);
}
void disk_del_events(struct gendisk *disk)
{
if (disk->ev) {
disk_block_events(disk);
mutex_lock(&disk_events_mutex);
list_del_init(&disk->ev->node);
mutex_unlock(&disk_events_mutex);
}
}
void disk_release_events(struct gendisk *disk)
{
/* the block count should be 1 from disk_del_events() */
WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
kfree(disk->ev);
}
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _BTRFS_CTREE_H_
#define _BTRFS_CTREE_H_
#include <linux/btrfs.h>
#include <linux/types.h>
#ifdef __KERNEL__
#include <linux/stddef.h>
#else
#include <stddef.h>
#endif
/*
* This header contains the structure definitions and constants used
* by file system objects that can be retrieved using
* the BTRFS_IOC_SEARCH_TREE ioctl. That means basically anything that
* is needed to describe a leaf node's key or item contents.
*/
/* holds pointers to all of the tree roots */
#define BTRFS_ROOT_TREE_OBJECTID 1ULL
/* stores information about which extents are in use, and reference counts */
#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
/*
* chunk tree stores translations from logical -> physical block numbering
* the super block points to the chunk tree
*/
#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
/*
* stores information about which areas of a given device are in use.
* one per device. The tree of tree roots points to the device tree
*/
#define BTRFS_DEV_TREE_OBJECTID 4ULL
/* one per subvolume, storing files and directories */
#define BTRFS_FS_TREE_OBJECTID 5ULL
/* directory objectid inside the root tree */
#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL
/* holds quota configuration and tracking */
#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
/* for storing items that use the BTRFS_UUID_KEY* types */
#define BTRFS_UUID_TREE_OBJECTID 9ULL
/* tracks free space in block groups. */
#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
/* device stats in the device tree */
#define BTRFS_DEV_STATS_OBJECTID 0ULL
/* for storing balance parameters in the root tree */
#define BTRFS_BALANCE_OBJECTID -4ULL
/* orphan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
/* does write ahead logging to speed up fsyncs */
#define BTRFS_TREE_LOG_OBJECTID -6ULL
#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
/* for space balancing */
#define BTRFS_TREE_RELOC_OBJECTID -8ULL
#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
/*
* extent checksums all have this objectid
* this allows them to share the logging tree
* for fsyncs
*/
#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
/* For storing free space cache */
#define BTRFS_FREE_SPACE_OBJECTID -11ULL
/*
* The inode number assigned to the special inode for storing
* free ino cache
*/
#define BTRFS_FREE_INO_OBJECTID -12ULL
/* dummy objectid represents multiple objectids */
#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
/*
* All files have objectids in this range.
*/
#define BTRFS_FIRST_FREE_OBJECTID 256ULL
#define BTRFS_LAST_FREE_OBJECTID -256ULL
#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
/*
* the device items go into the chunk tree. The key is in the form
* [ 1 BTRFS_DEV_ITEM_KEY device_id ]
*/
#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
#define BTRFS_BTREE_INODE_OBJECTID 1
#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
#define BTRFS_DEV_REPLACE_DEVID 0ULL
/*
* inode items have the data typically returned from stat and store other
* info about object characteristics. There is one for every file and dir in
* the FS
*/
#define BTRFS_INODE_ITEM_KEY 1
#define BTRFS_INODE_REF_KEY 12
#define BTRFS_INODE_EXTREF_KEY 13
#define BTRFS_XATTR_ITEM_KEY 24
/*
* fs verity items are stored under two different key types on disk.
* The descriptor items:
* [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ]
*
* At offset 0, we store a btrfs_verity_descriptor_item which tracks the size
* of the descriptor item and some extra data for encryption.
* Starting at offset 1, these hold the generic fs verity descriptor. The
* latter are opaque to btrfs, we just read and write them as a blob for the
* higher level verity code. The most common descriptor size is 256 bytes.
*
* The merkle tree items:
* [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ]
*
* These also start at offset 0, and correspond to the merkle tree bytes. When
* fsverity asks for page 0 of the merkle tree, we pull up one page starting at
* offset 0 for this key type. These are also opaque to btrfs, we're blindly
* storing whatever fsverity sends down.
*/
#define BTRFS_VERITY_DESC_ITEM_KEY 36
#define BTRFS_VERITY_MERKLE_ITEM_KEY 37
#define BTRFS_ORPHAN_ITEM_KEY 48
/* reserve 2-15 close to the inode for later flexibility */
/*
* dir items are the name -> inode pointers in a directory. There is one
* for every name in a directory.
*/
#define BTRFS_DIR_LOG_ITEM_KEY 60
#define BTRFS_DIR_LOG_INDEX_KEY 72
#define BTRFS_DIR_ITEM_KEY 84
#define BTRFS_DIR_INDEX_KEY 96
/*
* extent data is for file data
*/
#define BTRFS_EXTENT_DATA_KEY 108
/*
* extent csums are stored in a separate tree and hold csums for
* an entire extent on disk.
*/
#define BTRFS_EXTENT_CSUM_KEY 128
/*
* root items point to tree roots. They are typically in the root
* tree used by the super block to find all the other trees
*/
#define BTRFS_ROOT_ITEM_KEY 132
/*
* root backrefs tie subvols and snapshots to the directory entries that
* reference them
*/
#define BTRFS_ROOT_BACKREF_KEY 144
/*
* root refs make a fast index for listing all of the snapshots and
* subvolumes referenced by a given root. They point directly to the
* directory item in the root that references the subvol
*/
#define BTRFS_ROOT_REF_KEY 156
/*
* extent items are in the extent map tree. These record which blocks
* are used, and how many references there are to each block
*/
#define BTRFS_EXTENT_ITEM_KEY 168
/*
* The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know
* the length, so we save the level in key->offset instead of the length.
*/
#define BTRFS_METADATA_ITEM_KEY 169
#define BTRFS_TREE_BLOCK_REF_KEY 176
#define BTRFS_EXTENT_DATA_REF_KEY 178
#define BTRFS_EXTENT_REF_V0_KEY 180
#define BTRFS_SHARED_BLOCK_REF_KEY 182
#define BTRFS_SHARED_DATA_REF_KEY 184
/*
* block groups give us hints into the extent allocation trees. Which
* blocks are free etc etc
*/
#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
/*
* Every block group is represented in the free space tree by a free space info
* item, which stores some accounting information. It is keyed on
* (block_group_start, FREE_SPACE_INFO, block_group_length).
*/
#define BTRFS_FREE_SPACE_INFO_KEY 198
/*
* A free space extent tracks an extent of space that is free in a block group.
* It is keyed on (start, FREE_SPACE_EXTENT, length).
*/
#define BTRFS_FREE_SPACE_EXTENT_KEY 199
/*
* When a block group becomes very fragmented, we convert it to use bitmaps
* instead of extents. A free space bitmap is keyed on
* (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
* (length / sectorsize) bits.
*/
#define BTRFS_FREE_SPACE_BITMAP_KEY 200
#define BTRFS_DEV_EXTENT_KEY 204
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
/*
* Records the overall state of the qgroups.
* There's only one instance of this key present,
* (0, BTRFS_QGROUP_STATUS_KEY, 0)
*/
#define BTRFS_QGROUP_STATUS_KEY 240
/*
* Records the currently used space of the qgroup.
* One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
*/
#define BTRFS_QGROUP_INFO_KEY 242
/*
* Contains the user configured limits for the qgroup.
* One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
*/
#define BTRFS_QGROUP_LIMIT_KEY 244
/*
* Records the child-parent relationship of qgroups. For
* each relation, 2 keys are present:
* (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
* (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
*/
#define BTRFS_QGROUP_RELATION_KEY 246
/*
* Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
*/
#define BTRFS_BALANCE_ITEM_KEY 248
/*
* The key type for tree items that are stored persistently, but do not need to
* exist for extended period of time. The items can exist in any tree.
*
* [subtype, BTRFS_TEMPORARY_ITEM_KEY, data]
*
* Existing items:
*
* - balance status item
* (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
*/
#define BTRFS_TEMPORARY_ITEM_KEY 248
/*
* Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY
*/
#define BTRFS_DEV_STATS_KEY 249
/*
* The key type for tree items that are stored persistently and usually exist
* for a long period, eg. filesystem lifetime. The item kinds can be status
* information, stats or preference values. The item can exist in any tree.
*
* [subtype, BTRFS_PERSISTENT_ITEM_KEY, data]
*
* Existing items:
*
* - device statistics, store IO stats in the device tree, one key for all
* stats
* (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0)
*/
#define BTRFS_PERSISTENT_ITEM_KEY 249
/*
* Persistently stores the device replace state in the device tree.
* The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
*/
#define BTRFS_DEV_REPLACE_KEY 250
/*
* Stores items that allow to quickly map UUIDs to something else.
* These items are part of the filesystem UUID tree.
* The key is built like this:
* (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits).
*/
#if BTRFS_UUID_SIZE != 16
#error "UUID items require BTRFS_UUID_SIZE == 16!"
#endif
#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */
#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to
* received subvols */
/*
* string items are for debugging. They just store a short string of
* data in the FS
*/
#define BTRFS_STRING_ITEM_KEY 253
/* Maximum metadata block size (nodesize) */
#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
/* 32 bytes in various csum fields */
#define BTRFS_CSUM_SIZE 32
/* csum types */
enum btrfs_csum_type {
BTRFS_CSUM_TYPE_CRC32 = 0,
BTRFS_CSUM_TYPE_XXHASH = 1,
BTRFS_CSUM_TYPE_SHA256 = 2,
BTRFS_CSUM_TYPE_BLAKE2 = 3,
};
/*
* flags definitions for directory entry item type
*
* Used by:
* struct btrfs_dir_item.type
*
* Values 0..7 must match common file type values in fs_types.h.
*/
#define BTRFS_FT_UNKNOWN 0
#define BTRFS_FT_REG_FILE 1
#define BTRFS_FT_DIR 2
#define BTRFS_FT_CHRDEV 3
#define BTRFS_FT_BLKDEV 4
#define BTRFS_FT_FIFO 5
#define BTRFS_FT_SOCK 6
#define BTRFS_FT_SYMLINK 7
#define BTRFS_FT_XATTR 8
#define BTRFS_FT_MAX 9
/*
* The key defines the order in the tree, and so it also defines (optimal)
* block layout.
*
* objectid corresponds to the inode number.
*
* type tells us things about the object, and is a kind of stream selector.
* so for a given inode, keys with type of 1 might refer to the inode data,
* type of 2 may point to file data in the btree and type == 3 may point to
* extents.
*
* offset is the starting byte offset for this key in the stream.
*
* btrfs_disk_key is in disk byte order. struct btrfs_key is always
* in cpu native order. Otherwise they are identical and their sizes
* should be the same (ie both packed)
*/
struct btrfs_disk_key {
__le64 objectid;
__u8 type;
__le64 offset;
} __attribute__ ((__packed__));
struct btrfs_key {
__u64 objectid;
__u8 type;
__u64 offset;
} __attribute__ ((__packed__));
struct btrfs_dev_item {
/* the internal btrfs device id */
__le64 devid;
/* size of the device */
__le64 total_bytes;
/* bytes used */
__le64 bytes_used;
/* optimal io alignment for this device */
__le32 io_align;
/* optimal io width for this device */
__le32 io_width;
/* minimal io size for this device */
__le32 sector_size;
/* type and info about this device */
__le64 type;
/* expected generation for this device */
__le64 generation;
/*
* starting byte of this partition on the device,
* to allow for stripe alignment in the future
*/
__le64 start_offset;
/* grouping information for allocation decisions */
__le32 dev_group;
/* seek speed 0-100 where 100 is fastest */
__u8 seek_speed;
/* bandwidth 0-100 where 100 is fastest */
__u8 bandwidth;
/* btrfs generated uuid for this device */
__u8 uuid[BTRFS_UUID_SIZE];
/* uuid of FS who owns this device */
__u8 fsid[BTRFS_UUID_SIZE];
} __attribute__ ((__packed__));
struct btrfs_stripe {
__le64 devid;
__le64 offset;
__u8 dev_uuid[BTRFS_UUID_SIZE];
} __attribute__ ((__packed__));
struct btrfs_chunk {
/* size of this chunk in bytes */
__le64 length;
/* objectid of the root referencing this chunk */
__le64 owner;
__le64 stripe_len;
__le64 type;
/* optimal io alignment for this chunk */
__le32 io_align;
/* optimal io width for this chunk */
__le32 io_width;
/* minimal io size for this chunk */
__le32 sector_size;
/* 2^16 stripes is quite a lot, a second limit is the size of a single
* item in the btree
*/
__le16 num_stripes;
/* sub stripes only matter for raid10 */
__le16 sub_stripes;
struct btrfs_stripe stripe;
/* additional stripes go here */
} __attribute__ ((__packed__));
#define BTRFS_FREE_SPACE_EXTENT 1
#define BTRFS_FREE_SPACE_BITMAP 2
struct btrfs_free_space_entry {
__le64 offset;
__le64 bytes;
__u8 type;
} __attribute__ ((__packed__));
struct btrfs_free_space_header {
struct btrfs_disk_key location;
__le64 generation;
__le64 num_entries;
__le64 num_bitmaps;
} __attribute__ ((__packed__));
#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
/* Super block flags */
/* Errors detected */
#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
#define BTRFS_SUPER_FLAG_METADUMP_V2 (1ULL << 34)
#define BTRFS_SUPER_FLAG_CHANGING_FSID (1ULL << 35)
#define BTRFS_SUPER_FLAG_CHANGING_FSID_V2 (1ULL << 36)
/*
* items in the extent btree are used to record the objectid of the
* owner of the block and the number of references
*/
struct btrfs_extent_item {
__le64 refs;
__le64 generation;
__le64 flags;
} __attribute__ ((__packed__));
struct btrfs_extent_item_v0 {
__le32 refs;
} __attribute__ ((__packed__));
#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0)
#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1)
/* following flags only apply to tree blocks */
/* use full backrefs for extent pointers in the block */
#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
/*
* this flag is only used internally by scrub and may be changed at any time
* it is only declared here to avoid collisions
*/
#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48)
struct btrfs_tree_block_info {
struct btrfs_disk_key key;
__u8 level;
} __attribute__ ((__packed__));
struct btrfs_extent_data_ref {
__le64 root;
__le64 objectid;
__le64 offset;
__le32 count;
} __attribute__ ((__packed__));
struct btrfs_shared_data_ref {
__le32 count;
} __attribute__ ((__packed__));
struct btrfs_extent_inline_ref {
__u8 type;
__le64 offset;
} __attribute__ ((__packed__));
/* dev extents record free space on individual devices. The owner
* field points back to the chunk allocation mapping tree that allocated
* the extent. The chunk tree uuid field is a way to double check the owner
*/
struct btrfs_dev_extent {
__le64 chunk_tree;
__le64 chunk_objectid;
__le64 chunk_offset;
__le64 length;
__u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
} __attribute__ ((__packed__));
struct btrfs_inode_ref {
__le64 index;
__le16 name_len;
/* name goes here */
} __attribute__ ((__packed__));
struct btrfs_inode_extref {
__le64 parent_objectid;
__le64 index;
__le16 name_len;
__u8 name[0];
/* name goes here */
} __attribute__ ((__packed__));
struct btrfs_timespec {
__le64 sec;
__le32 nsec;
} __attribute__ ((__packed__));
struct btrfs_inode_item {
/* nfs style generation number */
__le64 generation;
/* transid that last touched this inode */
__le64 transid;
__le64 size;
__le64 nbytes;
__le64 block_group;
__le32 nlink;
__le32 uid;
__le32 gid;
__le32 mode;
__le64 rdev;
__le64 flags;
/* modification sequence number for NFS */
__le64 sequence;
/*
* a little future expansion, for more than this we can
* just grow the inode item and version it
*/
__le64 reserved[4];
struct btrfs_timespec atime;
struct btrfs_timespec ctime;
struct btrfs_timespec mtime;
struct btrfs_timespec otime;
} __attribute__ ((__packed__));
struct btrfs_dir_log_item {
__le64 end;
} __attribute__ ((__packed__));
struct btrfs_dir_item {
struct btrfs_disk_key location;
__le64 transid;
__le16 data_len;
__le16 name_len;
__u8 type;
} __attribute__ ((__packed__));
#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
/*
* Internal in-memory flag that a subvolume has been marked for deletion but
* still visible as a directory
*/
#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48)
struct btrfs_root_item {
struct btrfs_inode_item inode;
__le64 generation;
__le64 root_dirid;
__le64 bytenr;
__le64 byte_limit;
__le64 bytes_used;
__le64 last_snapshot;
__le64 flags;
__le32 refs;
struct btrfs_disk_key drop_progress;
__u8 drop_level;
__u8 level;
/*
* The following fields appear after subvol_uuids+subvol_times
* were introduced.
*/
/*
* This generation number is used to test if the new fields are valid
* and up to date while reading the root item. Every time the root item
* is written out, the "generation" field is copied into this field. If
* anyone ever mounted the fs with an older kernel, we will have
* mismatching generation values here and thus must invalidate the
* new fields. See btrfs_update_root and btrfs_find_last_root for
* details.
* the offset of generation_v2 is also used as the start for the memset
* when invalidating the fields.
*/
__le64 generation_v2;
__u8 uuid[BTRFS_UUID_SIZE];
__u8 parent_uuid[BTRFS_UUID_SIZE];
__u8 received_uuid[BTRFS_UUID_SIZE];
__le64 ctransid; /* updated when an inode changes */
__le64 otransid; /* trans when created */
__le64 stransid; /* trans when sent. non-zero for received subvol */
__le64 rtransid; /* trans when received. non-zero for received subvol */
struct btrfs_timespec ctime;
struct btrfs_timespec otime;
struct btrfs_timespec stime;
struct btrfs_timespec rtime;
__le64 reserved[8]; /* for future */
} __attribute__ ((__packed__));
/*
* Btrfs root item used to be smaller than current size. The old format ends
* at where member generation_v2 is.
*/
static inline __u32 btrfs_legacy_root_item_size(void)
{
return offsetof(struct btrfs_root_item, generation_v2);
}
/*
* this is used for both forward and backward root refs
*/
struct btrfs_root_ref {
__le64 dirid;
__le64 sequence;
__le16 name_len;
} __attribute__ ((__packed__));
struct btrfs_disk_balance_args {
/*
* profiles to operate on, single is denoted by
* BTRFS_AVAIL_ALLOC_BIT_SINGLE
*/
__le64 profiles;
/*
* usage filter
* BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
* BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
*/
union {
__le64 usage;
struct {
__le32 usage_min;
__le32 usage_max;
};
};
/* devid filter */
__le64 devid;
/* devid subset filter [pstart..pend) */
__le64 pstart;
__le64 pend;
/* btrfs virtual address space subset filter [vstart..vend) */
__le64 vstart;
__le64 vend;
/*
* profile to convert to, single is denoted by
* BTRFS_AVAIL_ALLOC_BIT_SINGLE
*/
__le64 target;
/* BTRFS_BALANCE_ARGS_* */
__le64 flags;
/*
* BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
* BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum
* and maximum
*/
union {
__le64 limit;
struct {
__le32 limit_min;
__le32 limit_max;
};
};
/*
* Process chunks that cross stripes_min..stripes_max devices,
* BTRFS_BALANCE_ARGS_STRIPES_RANGE
*/
__le32 stripes_min;
__le32 stripes_max;
__le64 unused[6];
} __attribute__ ((__packed__));
/*
* store balance parameters to disk so that balance can be properly
* resumed after crash or unmount
*/
struct btrfs_balance_item {
/* BTRFS_BALANCE_* */
__le64 flags;
struct btrfs_disk_balance_args data;
struct btrfs_disk_balance_args meta;
struct btrfs_disk_balance_args sys;
__le64 unused[4];
} __attribute__ ((__packed__));
enum {
BTRFS_FILE_EXTENT_INLINE = 0,
BTRFS_FILE_EXTENT_REG = 1,
BTRFS_FILE_EXTENT_PREALLOC = 2,
BTRFS_NR_FILE_EXTENT_TYPES = 3,
};
struct btrfs_file_extent_item {
/*
* transaction id that created this extent
*/
__le64 generation;
/*
* max number of bytes to hold this extent in ram
* when we split a compressed extent we can't know how big
* each of the resulting pieces will be. So, this is
* an upper limit on the size of the extent in ram instead of
* an exact limit.
*/
__le64 ram_bytes;
/*
* 32 bits for the various ways we might encode the data,
* including compression and encryption. If any of these
* are set to something a given disk format doesn't understand
* it is treated like an incompat flag for reading and writing,
* but not for stat.
*/
__u8 compression;
__u8 encryption;
__le16 other_encoding; /* spare for later use */
/* are we inline data or a real extent? */
__u8 type;
/*
* disk space consumed by the extent, checksum blocks are included
* in these numbers
*
* At this offset in the structure, the inline extent data start.
*/
__le64 disk_bytenr;
__le64 disk_num_bytes;
/*
* the logical offset in file blocks (no csums)
* this extent record is for. This allows a file extent to point
* into the middle of an existing extent on disk, sharing it
* between two snapshots (useful if some bytes in the middle of the
* extent have changed
*/
__le64 offset;
/*
* the logical number of file blocks (no csums included). This
* always reflects the size uncompressed and without encoding.
*/
__le64 num_bytes;
} __attribute__ ((__packed__));
struct btrfs_csum_item {
__u8 csum;
} __attribute__ ((__packed__));
struct btrfs_dev_stats_item {
/*
* grow this item struct at the end for future enhancements and keep
* the existing values unchanged
*/
__le64 values[BTRFS_DEV_STAT_VALUES_MAX];
} __attribute__ ((__packed__));
#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
struct btrfs_dev_replace_item {
/*
* grow this item struct at the end for future enhancements and keep
* the existing values unchanged
*/
__le64 src_devid;
__le64 cursor_left;
__le64 cursor_right;
__le64 cont_reading_from_srcdev_mode;
__le64 replace_state;
__le64 time_started;
__le64 time_stopped;
__le64 num_write_errors;
__le64 num_uncorrectable_read_errors;
} __attribute__ ((__packed__));
/* different types of block groups (and chunks) */
#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
#define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9)
#define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10)
#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
BTRFS_SPACE_INFO_GLOBAL_RSV)
enum btrfs_raid_types {
BTRFS_RAID_RAID10,
BTRFS_RAID_RAID1,
BTRFS_RAID_DUP,
BTRFS_RAID_RAID0,
BTRFS_RAID_SINGLE,
BTRFS_RAID_RAID5,
BTRFS_RAID_RAID6,
BTRFS_RAID_RAID1C3,
BTRFS_RAID_RAID1C4,
BTRFS_NR_RAID_TYPES
};
#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
BTRFS_BLOCK_GROUP_SYSTEM | \
BTRFS_BLOCK_GROUP_METADATA)
#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID1 | \
BTRFS_BLOCK_GROUP_RAID1C3 | \
BTRFS_BLOCK_GROUP_RAID1C4 | \
BTRFS_BLOCK_GROUP_RAID5 | \
BTRFS_BLOCK_GROUP_RAID6 | \
BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID10)
#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \
BTRFS_BLOCK_GROUP_RAID6)
#define BTRFS_BLOCK_GROUP_RAID1_MASK (BTRFS_BLOCK_GROUP_RAID1 | \
BTRFS_BLOCK_GROUP_RAID1C3 | \
BTRFS_BLOCK_GROUP_RAID1C4)
/*
* We need a bit for restriper to be able to tell when chunks of type
* SINGLE are available. This "extended" profile format is used in
* fs_info->avail_*_alloc_bits (in-memory) and balance item fields
* (on-disk). The corresponding on-disk bit in chunk.type is reserved
* to avoid remappings between two formats in future.
*/
#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
/*
* A fake block group type that is used to communicate global block reserve
* size to userspace via the SPACE_INFO ioctl.
*/
#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)
#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
BTRFS_AVAIL_ALLOC_BIT_SINGLE)
static inline __u64 chunk_to_extended(__u64 flags)
{
if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
return flags;
}
static inline __u64 extended_to_chunk(__u64 flags)
{
return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
}
struct btrfs_block_group_item {
__le64 used;
__le64 chunk_objectid;
__le64 flags;
} __attribute__ ((__packed__));
struct btrfs_free_space_info {
__le32 extent_count;
__le32 flags;
} __attribute__ ((__packed__));
#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
#define BTRFS_QGROUP_LEVEL_SHIFT 48
static inline __u16 btrfs_qgroup_level(__u64 qgroupid)
{
return (__u16)(qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT);
}
/*
* is subvolume quota turned on?
*/
#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0)
/*
* RESCAN is set during the initialization phase
*/
#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1)
/*
* Some qgroup entries are known to be out of date,
* either because the configuration has changed in a way that
* makes a rescan necessary, or because the fs has been mounted
* with a non-qgroup-aware version.
* Turning qouta off and on again makes it inconsistent, too.
*/
#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
#define BTRFS_QGROUP_STATUS_VERSION 1
struct btrfs_qgroup_status_item {
__le64 version;
/*
* the generation is updated during every commit. As older
* versions of btrfs are not aware of qgroups, it will be
* possible to detect inconsistencies by checking the
* generation on mount time
*/
__le64 generation;
/* flag definitions see above */
__le64 flags;
/*
* only used during scanning to record the progress
* of the scan. It contains a logical address
*/
__le64 rescan;
} __attribute__ ((__packed__));
struct btrfs_qgroup_info_item {
__le64 generation;
__le64 rfer;
__le64 rfer_cmpr;
__le64 excl;
__le64 excl_cmpr;
} __attribute__ ((__packed__));
struct btrfs_qgroup_limit_item {
/*
* only updated when any of the other values change
*/
__le64 flags;
__le64 max_rfer;
__le64 max_excl;
__le64 rsv_rfer;
__le64 rsv_excl;
} __attribute__ ((__packed__));
struct btrfs_verity_descriptor_item {
/* Size of the verity descriptor in bytes */
__le64 size;
/*
* When we implement support for fscrypt, we will need to encrypt the
* Merkle tree for encrypted verity files. These 128 bits are for the
* eventual storage of an fscrypt initialization vector.
*/
__le64 reserved[2];
__u8 encryption;
} __attribute__ ((__packed__));
#endif /* _BTRFS_CTREE_H_ */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _KERNEL_PRINTK_RINGBUFFER_H
#define _KERNEL_PRINTK_RINGBUFFER_H
#include <linux/atomic.h>
#include <linux/dev_printk.h>
/*
* Meta information about each stored message.
*
* All fields are set by the printk code except for @seq, which is
* set by the ringbuffer code.
*/
struct printk_info {
u64 seq; /* sequence number */
u64 ts_nsec; /* timestamp in nanoseconds */
u16 text_len; /* length of text message */
u8 facility; /* syslog facility */
u8 flags:5; /* internal record flags */
u8 level:3; /* syslog level */
u32 caller_id; /* thread id or processor id */
struct dev_printk_info dev_info;
};
/*
* A structure providing the buffers, used by writers and readers.
*
* Writers:
* Using prb_rec_init_wr(), a writer sets @text_buf_size before calling
* prb_reserve(). On success, prb_reserve() sets @info and @text_buf to
* buffers reserved for that writer.
*
* Readers:
* Using prb_rec_init_rd(), a reader sets all fields before calling
* prb_read_valid(). Note that the reader provides the @info and @text_buf,
* buffers. On success, the struct pointed to by @info will be filled and
* the char array pointed to by @text_buf will be filled with text data.
*/
struct printk_record {
struct printk_info *info;
char *text_buf;
unsigned int text_buf_size;
};
/* Specifies the logical position and span of a data block. */
struct prb_data_blk_lpos {
unsigned long begin;
unsigned long next;
};
/*
* A descriptor: the complete meta-data for a record.
*
* @state_var: A bitwise combination of descriptor ID and descriptor state.
*/
struct prb_desc {
atomic_long_t state_var;
struct prb_data_blk_lpos text_blk_lpos;
};
/* A ringbuffer of "ID + data" elements. */
struct prb_data_ring {
unsigned int size_bits;
char *data;
atomic_long_t head_lpos;
atomic_long_t tail_lpos;
};
/* A ringbuffer of "struct prb_desc" elements. */
struct prb_desc_ring {
unsigned int count_bits;
struct prb_desc *descs;
struct printk_info *infos;
atomic_long_t head_id;
atomic_long_t tail_id;
};
/*
* The high level structure representing the printk ringbuffer.
*
* @fail: Count of failed prb_reserve() calls where not even a data-less
* record was created.
*/
struct printk_ringbuffer {
struct prb_desc_ring desc_ring;
struct prb_data_ring text_data_ring;
atomic_long_t fail;
};
/*
* Used by writers as a reserve/commit handle.
*
* @rb: Ringbuffer where the entry is reserved.
* @irqflags: Saved irq flags to restore on entry commit.
* @id: ID of the reserved descriptor.
* @text_space: Total occupied buffer space in the text data ring, including
* ID, alignment padding, and wrapping data blocks.
*
* This structure is an opaque handle for writers. Its contents are only
* to be used by the ringbuffer implementation.
*/
struct prb_reserved_entry {
struct printk_ringbuffer *rb;
unsigned long irqflags;
unsigned long id;
unsigned int text_space;
};
/* The possible responses of a descriptor state-query. */
enum desc_state {
desc_miss = -1, /* ID mismatch (pseudo state) */
desc_reserved = 0x0, /* reserved, in use by writer */
desc_committed = 0x1, /* committed by writer, could get reopened */
desc_finalized = 0x2, /* committed, no further modification allowed */
desc_reusable = 0x3, /* free, not yet used by any writer */
};
#define _DATA_SIZE(sz_bits) (1UL << (sz_bits))
#define _DESCS_COUNT(ct_bits) (1U << (ct_bits))
#define DESC_SV_BITS (sizeof(unsigned long) * 8)
#define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2)
#define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT)
#define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT))
#define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id)
#define DESC_ID_MASK (~DESC_FLAGS_MASK)
#define DESC_ID(sv) ((sv) & DESC_ID_MASK)
#define FAILED_LPOS 0x1
#define NO_LPOS 0x3
#define FAILED_BLK_LPOS \
{ \
.begin = FAILED_LPOS, \
.next = FAILED_LPOS, \
}
/*
* Descriptor Bootstrap
*
* The descriptor array is minimally initialized to allow immediate usage
* by readers and writers. The requirements that the descriptor array
* initialization must satisfy:
*
* Req1
* The tail must point to an existing (committed or reusable) descriptor.
* This is required by the implementation of prb_first_seq().
*
* Req2
* Readers must see that the ringbuffer is initially empty.
*
* Req3
* The first record reserved by a writer is assigned sequence number 0.
*
* To satisfy Req1, the tail initially points to a descriptor that is
* minimally initialized (having no data block, i.e. data-less with the
* data block's lpos @begin and @next values set to FAILED_LPOS).
*
* To satisfy Req2, the initial tail descriptor is initialized to the
* reusable state. Readers recognize reusable descriptors as existing
* records, but skip over them.
*
* To satisfy Req3, the last descriptor in the array is used as the initial
* head (and tail) descriptor. This allows the first record reserved by a
* writer (head + 1) to be the first descriptor in the array. (Only the first
* descriptor in the array could have a valid sequence number of 0.)
*
* The first time a descriptor is reserved, it is assigned a sequence number
* with the value of the array index. A "first time reserved" descriptor can
* be recognized because it has a sequence number of 0 but does not have an
* index of 0. (Only the first descriptor in the array could have a valid
* sequence number of 0.) After the first reservation, all future reservations
* (recycling) simply involve incrementing the sequence number by the array
* count.
*
* Hack #1
* Only the first descriptor in the array is allowed to have the sequence
* number 0. In this case it is not possible to recognize if it is being
* reserved the first time (set to index value) or has been reserved
* previously (increment by the array count). This is handled by _always_
* incrementing the sequence number by the array count when reserving the
* first descriptor in the array. In order to satisfy Req3, the sequence
* number of the first descriptor in the array is initialized to minus
* the array count. Then, upon the first reservation, it is incremented
* to 0, thus satisfying Req3.
*
* Hack #2
* prb_first_seq() can be called at any time by readers to retrieve the
* sequence number of the tail descriptor. However, due to Req2 and Req3,
* initially there are no records to report the sequence number of
* (sequence numbers are u64 and there is nothing less than 0). To handle
* this, the sequence number of the initial tail descriptor is initialized
* to 0. Technically this is incorrect, because there is no record with
* sequence number 0 (yet) and the tail descriptor is not the first
* descriptor in the array. But it allows prb_read_valid() to correctly
* report the existence of a record for _any_ given sequence number at all
* times. Bootstrapping is complete when the tail is pushed the first
* time, thus finally pointing to the first descriptor reserved by a
* writer, which has the assigned sequence number 0.
*/
/*
* Initiating Logical Value Overflows
*
* Both logical position (lpos) and ID values can be mapped to array indexes
* but may experience overflows during the lifetime of the system. To ensure
* that printk_ringbuffer can handle the overflows for these types, initial
* values are chosen that map to the correct initial array indexes, but will
* result in overflows soon.
*
* BLK0_LPOS
* The initial @head_lpos and @tail_lpos for data rings. It is at index
* 0 and the lpos value is such that it will overflow on the first wrap.
*
* DESC0_ID
* The initial @head_id and @tail_id for the desc ring. It is at the last
* index of the descriptor array (see Req3 above) and the ID value is such
* that it will overflow on the second wrap.
*/
#define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits)))
#define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1))
#define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable)
/*
* Define a ringbuffer with an external text data buffer. The same as
* DEFINE_PRINTKRB() but requires specifying an external buffer for the
* text data.
*
* Note: The specified external buffer must be of the size:
* 2 ^ (descbits + avgtextbits)
*/
#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \
static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \
/* the initial head and tail */ \
[_DESCS_COUNT(descbits) - 1] = { \
/* reusable */ \
.state_var = ATOMIC_INIT(DESC0_SV(descbits)), \
/* no associated data block */ \
.text_blk_lpos = FAILED_BLK_LPOS, \
}, \
}; \
static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \
/* this will be the first record reserved by a writer */ \
[0] = { \
/* will be incremented to 0 on the first reservation */ \
.seq = -(u64)_DESCS_COUNT(descbits), \
}, \
/* the initial head and tail */ \
[_DESCS_COUNT(descbits) - 1] = { \
/* reports the first seq value during the bootstrap phase */ \
.seq = 0, \
}, \
}; \
static struct printk_ringbuffer name = { \
.desc_ring = { \
.count_bits = descbits, \
.descs = &_##name##_descs[0], \
.infos = &_##name##_infos[0], \
.head_id = ATOMIC_INIT(DESC0_ID(descbits)), \
.tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \
}, \
.text_data_ring = { \
.size_bits = (avgtextbits) + (descbits), \
.data = text_buf, \
.head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \
.tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \
}, \
.fail = ATOMIC_LONG_INIT(0), \
}
/**
* DEFINE_PRINTKRB() - Define a ringbuffer.
*
* @name: The name of the ringbuffer variable.
* @descbits: The number of descriptors as a power-of-2 value.
* @avgtextbits: The average text data size per record as a power-of-2 value.
*
* This is a macro for defining a ringbuffer and all internal structures
* such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a
* variant where the text data buffer can be specified externally.
*/
#define DEFINE_PRINTKRB(name, descbits, avgtextbits) \
static char _##name##_text[1U << ((avgtextbits) + (descbits))] \
__aligned(__alignof__(unsigned long)); \
_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0])
/* Writer Interface */
/**
* prb_rec_init_wr() - Initialize a buffer for writing records.
*
* @r: The record to initialize.
* @text_buf_size: The needed text buffer size.
*/
static inline void prb_rec_init_wr(struct printk_record *r,
unsigned int text_buf_size)
{
r->info = NULL;
r->text_buf = NULL;
r->text_buf_size = text_buf_size;
}
bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
struct printk_record *r);
bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
struct printk_record *r, u32 caller_id, unsigned int max_size);
void prb_commit(struct prb_reserved_entry *e);
void prb_final_commit(struct prb_reserved_entry *e);
void prb_init(struct printk_ringbuffer *rb,
char *text_buf, unsigned int text_buf_size,
struct prb_desc *descs, unsigned int descs_count_bits,
struct printk_info *infos);
unsigned int prb_record_text_space(struct prb_reserved_entry *e);
/* Reader Interface */
/**
* prb_rec_init_rd() - Initialize a buffer for reading records.
*
* @r: The record to initialize.
* @info: A buffer to store record meta-data.
* @text_buf: A buffer to store text data.
* @text_buf_size: The size of @text_buf.
*
* Initialize all the fields that a reader is interested in. All arguments
* (except @r) are optional. Only record data for arguments that are
* non-NULL or non-zero will be read.
*/
static inline void prb_rec_init_rd(struct printk_record *r,
struct printk_info *info,
char *text_buf, unsigned int text_buf_size)
{
r->info = info;
r->text_buf = text_buf;
r->text_buf_size = text_buf_size;
}
/**
* prb_for_each_record() - Iterate over the records of a ringbuffer.
*
* @from: The sequence number to begin with.
* @rb: The ringbuffer to iterate over.
* @s: A u64 to store the sequence number on each iteration.
* @r: A printk_record to store the record on each iteration.
*
* This is a macro for conveniently iterating over a ringbuffer.
* Note that @s may not be the sequence number of the record on each
* iteration. For the sequence number, @r->info->seq should be checked.
*
* Context: Any context.
*/
#define prb_for_each_record(from, rb, s, r) \
for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1)
/**
* prb_for_each_info() - Iterate over the meta data of a ringbuffer.
*
* @from: The sequence number to begin with.
* @rb: The ringbuffer to iterate over.
* @s: A u64 to store the sequence number on each iteration.
* @i: A printk_info to store the record meta data on each iteration.
* @lc: An unsigned int to store the text line count of each record.
*
* This is a macro for conveniently iterating over a ringbuffer.
* Note that @s may not be the sequence number of the record on each
* iteration. For the sequence number, @r->info->seq should be checked.
*
* Context: Any context.
*/
#define prb_for_each_info(from, rb, s, i, lc) \
for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1)
bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
struct printk_record *r);
bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
struct printk_info *info, unsigned int *line_count);
u64 prb_first_valid_seq(struct printk_ringbuffer *rb);
u64 prb_next_seq(struct printk_ringbuffer *rb);
#endif /* _KERNEL_PRINTK_RINGBUFFER_H */
// SPDX-License-Identifier: GPL-2.0
/*
* inode.c - part of debugfs, a tiny little debug file system
*
* Copyright (C) 2004,2019 Greg Kroah-Hartman <greg@kroah.com>
* Copyright (C) 2004 IBM Inc.
* Copyright (C) 2019 Linux Foundation <gregkh@linuxfoundation.org>
*
* debugfs is for people to use instead of /proc or /sys.
* See ./Documentation/core-api/kernel-api.rst for more details.
*/
#define pr_fmt(fmt) "debugfs: " fmt
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/namei.h>
#include <linux/debugfs.h>
#include <linux/fsnotify.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/security.h>
#include "internal.h"
#define DEBUGFS_DEFAULT_MODE 0700
static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS;
/*
* Don't allow access attributes to be changed whilst the kernel is locked down
* so that we can use the file mode as part of a heuristic to determine whether
* to lock down individual files.
*/
static int debugfs_setattr(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *ia)
{
int ret;
if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) {
ret = security_locked_down(LOCKDOWN_DEBUGFS);
if (ret)
return ret;
}
return simple_setattr(&init_user_ns, dentry, ia);
}
static const struct inode_operations debugfs_file_inode_operations = {
.setattr = debugfs_setattr,
};
static const struct inode_operations debugfs_dir_inode_operations = {
.lookup = simple_lookup,
.setattr = debugfs_setattr,
};
static const struct inode_operations debugfs_symlink_inode_operations = {
.get_link = simple_get_link,
.setattr = debugfs_setattr,
};
static struct inode *debugfs_get_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
inode->i_atime = inode->i_mtime =
inode->i_ctime = current_time(inode);
}
return inode;
}
struct debugfs_mount_opts {
kuid_t uid;
kgid_t gid;
umode_t mode;
};
enum {
Opt_uid,
Opt_gid,
Opt_mode,
Opt_err
};
static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_mode, "mode=%o"},
{Opt_err, NULL}
};
struct debugfs_fs_info {
struct debugfs_mount_opts mount_opts;
};
static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
{
substring_t args[MAX_OPT_ARGS];
int option;
int token;
kuid_t uid;
kgid_t gid;
char *p;
opts->mode = DEBUGFS_DEFAULT_MODE;
while ((p = strsep(&data, ",")) != NULL) {
if (!*p)
continue;
token = match_token(p, tokens, args);
switch (token) {
case Opt_uid:
if (match_int(&args[0], &option))
return -EINVAL;
uid = make_kuid(current_user_ns(), option);
if (!uid_valid(uid))
return -EINVAL;
opts->uid = uid;
break;
case Opt_gid:
if (match_int(&args[0], &option))
return -EINVAL;
gid = make_kgid(current_user_ns(), option);
if (!gid_valid(gid))
return -EINVAL;
opts->gid = gid;
break;
case Opt_mode:
if (match_octal(&args[0], &option))
return -EINVAL;
opts->mode = option & S_IALLUGO;
break;
/*
* We might like to report bad mount options here;
* but traditionally debugfs has ignored all mount options
*/
}
}
return 0;
}
static int debugfs_apply_options(struct super_block *sb)
{
struct debugfs_fs_info *fsi = sb->s_fs_info;
struct inode *inode = d_inode(sb->s_root);
struct debugfs_mount_opts *opts = &fsi->mount_opts;
inode->i_mode &= ~S_IALLUGO;
inode->i_mode |= opts->mode;
inode->i_uid = opts->uid;
inode->i_gid = opts->gid;
return 0;
}
static int debugfs_remount(struct super_block *sb, int *flags, char *data)
{
int err;
struct debugfs_fs_info *fsi = sb->s_fs_info;
sync_filesystem(sb);
err = debugfs_parse_options(data, &fsi->mount_opts);
if (err)
goto fail;
debugfs_apply_options(sb);
fail:
return err;
}
static int debugfs_show_options(struct seq_file *m, struct dentry *root)
{
struct debugfs_fs_info *fsi = root->d_sb->s_fs_info;
struct debugfs_mount_opts *opts = &fsi->mount_opts;
if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
seq_printf(m, ",uid=%u",
from_kuid_munged(&init_user_ns, opts->uid));
if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
seq_printf(m, ",gid=%u",
from_kgid_munged(&init_user_ns, opts->gid));
if (opts->mode != DEBUGFS_DEFAULT_MODE)
seq_printf(m, ",mode=%o", opts->mode);
return 0;
}
static void debugfs_free_inode(struct inode *inode)
{
if (S_ISLNK(inode->i_mode))
kfree(inode->i_link);
free_inode_nonrcu(inode);
}
static const struct super_operations debugfs_super_operations = {
.statfs = simple_statfs,
.remount_fs = debugfs_remount,
.show_options = debugfs_show_options,
.free_inode = debugfs_free_inode,
};
static void debugfs_release_dentry(struct dentry *dentry)
{
void *fsd = dentry->d_fsdata;
if (!((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))
kfree(dentry->d_fsdata);
}
static struct vfsmount *debugfs_automount(struct path *path)
{
debugfs_automount_t f;
f = (debugfs_automount_t)path->dentry->d_fsdata;
return f(path->dentry, d_inode(path->dentry)->i_private);
}
static const struct dentry_operations debugfs_dops = {
.d_delete = always_delete_dentry,
.d_release = debugfs_release_dentry,
.d_automount = debugfs_automount,
};
static int debug_fill_super(struct super_block *sb, void *data, int silent)
{
static const struct tree_descr debug_files[] = {{""}};
struct debugfs_fs_info *fsi;
int err;
fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL);
sb->s_fs_info = fsi;
if (!fsi) {
err = -ENOMEM;
goto fail;
}
err = debugfs_parse_options(data, &fsi->mount_opts);
if (err)
goto fail;
err = simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
if (err)
goto fail;
sb->s_op = &debugfs_super_operations;
sb->s_d_op = &debugfs_dops;
debugfs_apply_options(sb);
return 0;
fail:
kfree(fsi);
sb->s_fs_info = NULL;
return err;
}
static struct dentry *debug_mount(struct file_system_type *fs_type,
int flags, const char *dev_name,
void *data)
{
if (!(debugfs_allow & DEBUGFS_ALLOW_API))
return ERR_PTR(-EPERM);
return mount_single(fs_type, flags, data, debug_fill_super);
}
static struct file_system_type debug_fs_type = {
.owner = THIS_MODULE,
.name = "debugfs",
.mount = debug_mount,
.kill_sb = kill_litter_super,
};
MODULE_ALIAS_FS("debugfs");
/**
* debugfs_lookup() - look up an existing debugfs file
* @name: a pointer to a string containing the name of the file to look up.
* @parent: a pointer to the parent dentry of the file.
*
* This function will return a pointer to a dentry if it succeeds. If the file
* doesn't exist or an error occurs, %NULL will be returned. The returned
* dentry must be passed to dput() when it is no longer needed.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
*/
struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
{
struct dentry *dentry;
if (!debugfs_initialized() || IS_ERR_OR_NULL(name) || IS_ERR(parent))
return NULL;
if (!parent)
parent = debugfs_mount->mnt_root;
dentry = lookup_positive_unlocked(name, parent, strlen(name));
if (IS_ERR(dentry))
return NULL;
return dentry;
}
EXPORT_SYMBOL_GPL(debugfs_lookup);
static struct dentry *start_creating(const char *name, struct dentry *parent)
{
struct dentry *dentry;
int error;
if (!(debugfs_allow & DEBUGFS_ALLOW_API))
return ERR_PTR(-EPERM);
if (!debugfs_initialized())
return ERR_PTR(-ENOENT);
pr_debug("creating file '%s'\n", name);
if (IS_ERR(parent))
return parent;
error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
&debugfs_mount_count);
if (error) {
pr_err("Unable to pin filesystem for file '%s'\n", name);
return ERR_PTR(error);
}
/* If the parent is not specified, we create it in the root.
* We need the root dentry to do this, which is in the super
* block. A pointer to that is in the struct vfsmount that we
* have around.
*/
if (!parent) parent = debugfs_mount->mnt_root; inode_lock(d_inode(parent));
if (unlikely(IS_DEADDIR(d_inode(parent))))
dentry = ERR_PTR(-ENOENT);
else
dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
if (d_is_dir(dentry))
pr_err("Directory '%s' with parent '%s' already present!\n",
name, parent->d_name.name);
else
pr_err("File '%s' in directory '%s' already present!\n",
name, parent->d_name.name);
dput(dentry);
dentry = ERR_PTR(-EEXIST);
}
if (IS_ERR(dentry)) {
inode_unlock(d_inode(parent));
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
return dentry;
}
static struct dentry *failed_creating(struct dentry *dentry)
{
inode_unlock(d_inode(dentry->d_parent));
dput(dentry);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
return ERR_PTR(-ENOMEM);
}
static struct dentry *end_creating(struct dentry *dentry)
{
inode_unlock(d_inode(dentry->d_parent)); return dentry;
}
static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *proxy_fops,
const struct file_operations *real_fops)
{
struct dentry *dentry;
struct inode *inode;
if (!(mode & S_IFMT))
mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); dentry = start_creating(name, parent);
if (IS_ERR(dentry))
return dentry;
if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry);
return ERR_PTR(-EPERM);
}
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create file '%s'\n",
name);
return failed_creating(dentry);
}
inode->i_mode = mode;
inode->i_private = data;
inode->i_op = &debugfs_file_inode_operations;
inode->i_fop = proxy_fops;
dentry->d_fsdata = (void *)((unsigned long)real_fops |
DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
d_instantiate(dentry, inode);
fsnotify_create(d_inode(dentry->d_parent), dentry);
return end_creating(dentry);
}
/**
* debugfs_create_file - create a file in the debugfs filesystem
* @name: a pointer to a string containing the name of the file to create.
* @mode: the permission that the file should have.
* @parent: a pointer to the parent dentry for this file. This should be a
* directory dentry if set. If this parameter is NULL, then the
* file will be created in the root of the debugfs filesystem.
* @data: a pointer to something that the caller will want to get to later
* on. The inode.i_private pointer will point to this value on
* the open() call.
* @fops: a pointer to a struct file_operations that should be used for
* this file.
*
* This is the basic "create a file" function for debugfs. It allows for a
* wide range of flexibility in creating a file, or a directory (if you want
* to create a directory, the debugfs_create_dir() function is
* recommended to be used instead.)
*
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
* you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
* returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
*/
struct dentry *debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
return __debugfs_create_file(name, mode, parent, data,
fops ? &debugfs_full_proxy_file_operations :
&debugfs_noop_file_operations,
fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_file);
/**
* debugfs_create_file_unsafe - create a file in the debugfs filesystem
* @name: a pointer to a string containing the name of the file to create.
* @mode: the permission that the file should have.
* @parent: a pointer to the parent dentry for this file. This should be a
* directory dentry if set. If this parameter is NULL, then the
* file will be created in the root of the debugfs filesystem.
* @data: a pointer to something that the caller will want to get to later
* on. The inode.i_private pointer will point to this value on
* the open() call.
* @fops: a pointer to a struct file_operations that should be used for
* this file.
*
* debugfs_create_file_unsafe() is completely analogous to
* debugfs_create_file(), the only difference being that the fops
* handed it will not get protected against file removals by the
* debugfs core.
*
* It is your responsibility to protect your struct file_operation
* methods against file removals by means of debugfs_file_get()
* and debugfs_file_put(). ->open() is still protected by
* debugfs though.
*
* Any struct file_operations defined by means of
* DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and
* thus, may be used here.
*/
struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
return __debugfs_create_file(name, mode, parent, data,
fops ? &debugfs_open_proxy_file_operations :
&debugfs_noop_file_operations,
fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe);
/**
* debugfs_create_file_size - create a file in the debugfs filesystem
* @name: a pointer to a string containing the name of the file to create.
* @mode: the permission that the file should have.
* @parent: a pointer to the parent dentry for this file. This should be a
* directory dentry if set. If this parameter is NULL, then the
* file will be created in the root of the debugfs filesystem.
* @data: a pointer to something that the caller will want to get to later
* on. The inode.i_private pointer will point to this value on
* the open() call.
* @fops: a pointer to a struct file_operations that should be used for
* this file.
* @file_size: initial file size
*
* This is the basic "create a file" function for debugfs. It allows for a
* wide range of flexibility in creating a file, or a directory (if you want
* to create a directory, the debugfs_create_dir() function is
* recommended to be used instead.)
*/
void debugfs_create_file_size(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops,
loff_t file_size)
{
struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
if (!IS_ERR(de))
d_inode(de)->i_size = file_size;
}
EXPORT_SYMBOL_GPL(debugfs_create_file_size);
/**
* debugfs_create_dir - create a directory in the debugfs filesystem
* @name: a pointer to a string containing the name of the directory to
* create.
* @parent: a pointer to the parent dentry for this file. This should be a
* directory dentry if set. If this parameter is NULL, then the
* directory will be created in the root of the debugfs filesystem.
*
* This function creates a directory in debugfs with the given name.
*
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
* you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
* returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
*/
struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
{
struct dentry *dentry = start_creating(name, parent);
struct inode *inode;
if (IS_ERR(dentry))
return dentry;
if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry);
return ERR_PTR(-EPERM);
}
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create directory '%s'\n",
name);
return failed_creating(dentry);
}
inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
inode->i_op = &debugfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
inc_nlink(d_inode(dentry->d_parent));
fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_dir);
/**
* debugfs_create_automount - create automount point in the debugfs filesystem
* @name: a pointer to a string containing the name of the file to create.
* @parent: a pointer to the parent dentry for this file. This should be a
* directory dentry if set. If this parameter is NULL, then the
* file will be created in the root of the debugfs filesystem.
* @f: function to be called when pathname resolution steps on that one.
* @data: opaque argument to pass to f().
*
* @f should return what ->d_automount() would.
*/
struct dentry *debugfs_create_automount(const char *name,
struct dentry *parent,
debugfs_automount_t f,
void *data)
{
struct dentry *dentry = start_creating(name, parent);
struct inode *inode;
if (IS_ERR(dentry))
return dentry;
if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
failed_creating(dentry);
return ERR_PTR(-EPERM);
}
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create automount '%s'\n",
name);
return failed_creating(dentry);
}
make_empty_dir_inode(inode);
inode->i_flags |= S_AUTOMOUNT;
inode->i_private = data;
dentry->d_fsdata = (void *)f;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
inc_nlink(d_inode(dentry->d_parent));
fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
return end_creating(dentry);
}
EXPORT_SYMBOL(debugfs_create_automount);
/**
* debugfs_create_symlink- create a symbolic link in the debugfs filesystem
* @name: a pointer to a string containing the name of the symbolic link to
* create.
* @parent: a pointer to the parent dentry for this symbolic link. This
* should be a directory dentry if set. If this parameter is NULL,
* then the symbolic link will be created in the root of the debugfs
* filesystem.
* @target: a pointer to a string containing the path to the target of the
* symbolic link.
*
* This function creates a symbolic link with the given name in debugfs that
* links to the given target path.
*
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the symbolic
* link is to be removed (no automatic cleanup happens if your module is
* unloaded, you are responsible here.) If an error occurs, ERR_PTR(-ERROR)
* will be returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
*/
struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
const char *target)
{
struct dentry *dentry;
struct inode *inode;
char *link = kstrdup(target, GFP_KERNEL);
if (!link)
return ERR_PTR(-ENOMEM);
dentry = start_creating(name, parent);
if (IS_ERR(dentry)) {
kfree(link);
return dentry;
}
inode = debugfs_get_inode(dentry->d_sb);
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create symlink '%s'\n",
name);
kfree(link);
return failed_creating(dentry);
}
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_op = &debugfs_symlink_inode_operations;
inode->i_link = link;
d_instantiate(dentry, inode);
return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_symlink);
static void __debugfs_file_removed(struct dentry *dentry)
{
struct debugfs_fsdata *fsd;
/*
* Paired with the closing smp_mb() implied by a successful
* cmpxchg() in debugfs_file_get(): either
* debugfs_file_get() must see a dead dentry or we must see a
* debugfs_fsdata instance at ->d_fsdata here (or both).
*/
smp_mb();
fsd = READ_ONCE(dentry->d_fsdata);
if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
return;
if (!refcount_dec_and_test(&fsd->active_users)) wait_for_completion(&fsd->active_users_drained);
}
static void remove_one(struct dentry *victim)
{
if (d_is_reg(victim))
__debugfs_file_removed(victim);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
/**
* debugfs_remove - recursively removes a directory
* @dentry: a pointer to a the dentry of the directory to be removed. If this
* parameter is NULL or an error value, nothing will be done.
*
* This function recursively removes a directory tree in debugfs that
* was previously created with a call to another debugfs function
* (like debugfs_create_file() or variants thereof.)
*
* This function is required to be called in order for the file to be
* removed, no automatic cleanup of files will happen when a module is
* removed, you are responsible here.
*/
void debugfs_remove(struct dentry *dentry)
{
if (IS_ERR_OR_NULL(dentry))
return;
simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count);
simple_recursive_removal(dentry, remove_one);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
EXPORT_SYMBOL_GPL(debugfs_remove);
/**
* debugfs_rename - rename a file/directory in the debugfs filesystem
* @old_dir: a pointer to the parent dentry for the renamed object. This
* should be a directory dentry.
* @old_dentry: dentry of an object to be renamed.
* @new_dir: a pointer to the parent dentry where the object should be
* moved. This should be a directory dentry.
* @new_name: a pointer to a string containing the target name.
*
* This function renames a file/directory in debugfs. The target must not
* exist for rename to succeed.
*
* This function will return a pointer to old_dentry (which is updated to
* reflect renaming) if it succeeds. If an error occurs, %NULL will be
* returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
* returned.
*/
struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
struct dentry *new_dir, const char *new_name)
{
int error;
struct dentry *dentry = NULL, *trap;
struct name_snapshot old_name;
if (IS_ERR(old_dir))
return old_dir;
if (IS_ERR(new_dir))
return new_dir;
if (IS_ERR_OR_NULL(old_dentry))
return old_dentry;
trap = lock_rename(new_dir, old_dir);
/* Source or destination directories don't exist? */
if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir))
goto exit;
/* Source does not exist, cyclic rename, or mountpoint? */
if (d_really_is_negative(old_dentry) || old_dentry == trap ||
d_mountpoint(old_dentry))
goto exit;
dentry = lookup_one_len(new_name, new_dir, strlen(new_name));
/* Lookup failed, cyclic rename or target exists? */
if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry))
goto exit;
take_dentry_name_snapshot(&old_name, old_dentry);
error = simple_rename(&init_user_ns, d_inode(old_dir), old_dentry,
d_inode(new_dir), dentry, 0);
if (error) {
release_dentry_name_snapshot(&old_name);
goto exit;
}
d_move(old_dentry, dentry);
fsnotify_move(d_inode(old_dir), d_inode(new_dir), &old_name.name,
d_is_dir(old_dentry),
NULL, old_dentry);
release_dentry_name_snapshot(&old_name);
unlock_rename(new_dir, old_dir);
dput(dentry);
return old_dentry;
exit:
if (dentry && !IS_ERR(dentry))
dput(dentry);
unlock_rename(new_dir, old_dir);
if (IS_ERR(dentry))
return dentry;
return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(debugfs_rename);
/**
* debugfs_initialized - Tells whether debugfs has been registered
*/
bool debugfs_initialized(void)
{
return debugfs_registered;
}
EXPORT_SYMBOL_GPL(debugfs_initialized);
static int __init debugfs_kernel(char *str)
{
if (str) {
if (!strcmp(str, "on"))
debugfs_allow = DEBUGFS_ALLOW_API | DEBUGFS_ALLOW_MOUNT;
else if (!strcmp(str, "no-mount"))
debugfs_allow = DEBUGFS_ALLOW_API;
else if (!strcmp(str, "off"))
debugfs_allow = 0;
}
return 0;
}
early_param("debugfs", debugfs_kernel);
static int __init debugfs_init(void)
{
int retval;
if (!(debugfs_allow & DEBUGFS_ALLOW_MOUNT))
return -EPERM;
retval = sysfs_create_mount_point(kernel_kobj, "debug");
if (retval)
return retval;
retval = register_filesystem(&debug_fs_type);
if (retval)
sysfs_remove_mount_point(kernel_kobj, "debug");
else
debugfs_registered = true;
return retval;
}
core_initcall(debugfs_init);
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "misc.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"
#undef SCRAMBLE_DELAYED_REFS
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
return (cache->flags & bits) == bits;
}
int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes)
{
u64 end = start + num_bytes - 1;
set_extent_bits(&fs_info->excluded_extents, start, end,
EXTENT_UPTODATE);
return 0;
}
void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
u64 start, end;
start = cache->start;
end = start + cache->length - 1;
clear_extent_bits(&fs_info->excluded_extents, start, end,
EXTENT_UPTODATE);
}
/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
int ret;
struct btrfs_key key;
struct btrfs_path *path;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = start;
key.offset = len;
key.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
btrfs_free_path(path);
return ret;
}
/*
* helper function to lookup reference count and flags of a tree block.
*
* the head node for delayed ref is used to store the sum of all the
* reference count modifications queued up in the rbtree. the head
* node may also store the extent flags to set. This way you can check
* to see what the reference count and extent flags would be if all of
* the delayed refs are not processed.
*/
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags)
{
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
struct extent_buffer *leaf;
struct btrfs_key key;
u32 item_size;
u64 num_refs;
u64 extent_flags;
int ret;
/*
* If we don't have skinny metadata, don't bother doing anything
* different
*/
if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
offset = fs_info->nodesize;
metadata = 0;
}
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
if (!trans) {
path->skip_locking = 1;
path->search_commit_root = 1;
}
search_again:
key.objectid = bytenr;
key.offset = offset;
if (metadata)
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out_free;
if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
if (path->slots[0]) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
if (key.objectid == bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY &&
key.offset == fs_info->nodesize)
ret = 0;
}
}
if (ret == 0) {
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
if (item_size >= sizeof(*ei)) {
ei = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_item);
num_refs = btrfs_extent_refs(leaf, ei);
extent_flags = btrfs_extent_flags(leaf, ei);
} else {
ret = -EINVAL;
btrfs_print_v0_err(fs_info);
if (trans)
btrfs_abort_transaction(trans, ret);
else
btrfs_handle_fs_error(fs_info, ret, NULL);
goto out_free;
}
BUG_ON(num_refs == 0);
} else {
num_refs = 0;
extent_flags = 0;
ret = 0;
}
if (!trans)
goto out;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
/*
* Mutex was contended, block until it's released and try
* again
*/
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
goto search_again;
}
spin_lock(&head->lock);
if (head->extent_op && head->extent_op->update_flags)
extent_flags |= head->extent_op->flags_to_set;
else
BUG_ON(num_refs == 0);
num_refs += head->ref_mod;
spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
}
spin_unlock(&delayed_refs->lock);
out:
WARN_ON(num_refs == 0);
if (refs)
*refs = num_refs;
if (flags)
*flags = extent_flags;
out_free:
btrfs_free_path(path);
return ret;
}
/*
* Back reference rules. Back refs have three main goals:
*
* 1) differentiate between all holders of references to an extent so that
* when a reference is dropped we can make sure it was a valid reference
* before freeing the extent.
*
* 2) Provide enough information to quickly find the holders of an extent
* if we notice a given block is corrupted or bad.
*
* 3) Make it easy to migrate blocks for FS shrinking or storage pool
* maintenance. This is actually the same as #2, but with a slightly
* different use case.
*
* There are two kinds of back refs. The implicit back refs is optimized
* for pointers in non-shared tree blocks. For a given pointer in a block,
* back refs of this kind provide information about the block's owner tree
* and the pointer's key. These information allow us to find the block by
* b-tree searching. The full back refs is for pointers in tree blocks not
* referenced by their owner trees. The location of tree block is recorded
* in the back refs. Actually the full back refs is generic, and can be
* used in all cases the implicit back refs is used. The major shortcoming
* of the full back refs is its overhead. Every time a tree block gets
* COWed, we have to update back refs entry for all pointers in it.
*
* For a newly allocated tree block, we use implicit back refs for
* pointers in it. This means most tree related operations only involve
* implicit back refs. For a tree block created in old transaction, the
* only way to drop a reference to it is COW it. So we can detect the
* event that tree block loses its owner tree's reference and do the
* back refs conversion.
*
* When a tree block is COWed through a tree, there are four cases:
*
* The reference count of the block is one and the tree is the block's
* owner tree. Nothing to do in this case.
*
* The reference count of the block is one and the tree is not the
* block's owner tree. In this case, full back refs is used for pointers
* in the block. Remove these full back refs, add implicit back refs for
* every pointers in the new block.
*
* The reference count of the block is greater than one and the tree is
* the block's owner tree. In this case, implicit back refs is used for
* pointers in the block. Add full back refs for every pointers in the
* block, increase lower level extents' reference counts. The original
* implicit back refs are entailed to the new block.
*
* The reference count of the block is greater than one and the tree is
* not the block's owner tree. Add implicit back refs for every pointer in
* the new block, increase lower level extents' reference count.
*
* Back Reference Key composing:
*
* The key objectid corresponds to the first byte in the extent,
* The key type is used to differentiate between types of back refs.
* There are different meanings of the key offset for different types
* of back refs.
*
* File extents can be referenced by:
*
* - multiple snapshots, subvolumes, or different generations in one subvol
* - different files inside a single subvolume
* - different offsets inside a file (bookend extents in file.c)
*
* The extent ref structure for the implicit back refs has fields for:
*
* - Objectid of the subvolume root
* - objectid of the file holding the reference
* - original offset in the file
* - how many bookend extents
*
* The key offset for the implicit back refs is hash of the first
* three fields.
*
* The extent ref structure for the full back refs has field for:
*
* - number of pointers in the tree leaf
*
* The key offset for the implicit back refs is the first byte of
* the tree leaf
*
* When a file extent is allocated, The implicit back refs is used.
* the fields are filled in:
*
* (root_key.objectid, inode objectid, offset in file, 1)
*
* When a file extent is removed file truncation, we find the
* corresponding implicit back refs and check the following fields:
*
* (btrfs_header_owner(leaf), inode objectid, offset in file)
*
* Btree extents can be referenced by:
*
* - Different subvolumes
*
* Both the implicit back refs and the full back refs for tree blocks
* only consist of key. The key offset for the implicit back refs is
* objectid of block's owner tree. The key offset for the full back refs
* is the first byte of parent block.
*
* When implicit back refs is used, information about the lowest key and
* level of the tree block are required. These information are stored in
* tree block info structure.
*/
/*
* is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
* is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
*/
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data)
{
int type = btrfs_extent_inline_ref_type(eb, iref);
u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY || type == BTRFS_SHARED_DATA_REF_KEY ||
type == BTRFS_EXTENT_DATA_REF_KEY) {
if (is_data == BTRFS_REF_TYPE_BLOCK) {
if (type == BTRFS_TREE_BLOCK_REF_KEY)
return type;
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
ASSERT(eb->fs_info);
/*
* Every shared one has parent tree block,
* which must be aligned to sector size.
*/
if (offset && IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else if (is_data == BTRFS_REF_TYPE_DATA) { if (type == BTRFS_EXTENT_DATA_REF_KEY)
return type;
if (type == BTRFS_SHARED_DATA_REF_KEY) {
ASSERT(eb->fs_info);
/*
* Every shared one has parent tree block,
* which must be aligned to sector size.
*/
if (offset && IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else {
ASSERT(is_data == BTRFS_REF_TYPE_ANY);
return type;
}
}
btrfs_print_leaf((struct extent_buffer *)eb);
btrfs_err(eb->fs_info,
"eb %llu iref 0x%lx invalid extent inline ref type %d",
eb->start, (unsigned long)iref, type);
WARN_ON(1);
return BTRFS_REF_TYPE_INVALID;
}
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
u32 high_crc = ~(u32)0;
u32 low_crc = ~(u32)0;
__le64 lenum;
lenum = cpu_to_le64(root_objectid);
high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(owner);
low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(offset);
low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
struct btrfs_extent_data_ref *ref)
{
return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
btrfs_extent_data_ref_objectid(leaf, ref),
btrfs_extent_data_ref_offset(leaf, ref));
}
static int match_extent_data_ref(struct extent_buffer *leaf,
struct btrfs_extent_data_ref *ref,
u64 root_objectid, u64 owner, u64 offset)
{
if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
btrfs_extent_data_ref_offset(leaf, ref) != offset)
return 0;
return 1;
}
static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 bytenr, u64 parent,
u64 root_objectid,
u64 owner, u64 offset)
{
struct btrfs_root *root = trans->fs_info->extent_root;
struct btrfs_key key;
struct btrfs_extent_data_ref *ref;
struct extent_buffer *leaf;
u32 nritems;
int ret;
int recow;
int err = -ENOENT;
key.objectid = bytenr;
if (parent) {
key.type = BTRFS_SHARED_DATA_REF_KEY;
key.offset = parent;
} else {
key.type = BTRFS_EXTENT_DATA_REF_KEY;
key.offset = hash_extent_data_ref(root_objectid,
owner, offset);
}
again:
recow = 0;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0) {
err = ret;
goto fail;
}
if (parent) {
if (!ret)
return 0;
goto fail;
}
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
while (1) {
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
err = ret;
if (ret)
goto fail;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
recow = 1;
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != bytenr ||
key.type != BTRFS_EXTENT_DATA_REF_KEY)
goto fail;
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_data_ref);
if (match_extent_data_ref(leaf, ref, root_objectid,
owner, offset)) {
if (recow) {
btrfs_release_path(path);
goto again;
}
err = 0;
break;
}
path->slots[0]++;
}
fail:
return err;
}
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 bytenr, u64 parent,
u64 root_objectid, u64 owner,
u64 offset, int refs_to_add)
{
struct btrfs_root *root = trans->fs_info->extent_root;
struct btrfs_key key;
struct extent_buffer *leaf;
u32 size;
u32 num_refs;
int ret;
key.objectid = bytenr;
if (parent) {
key.type = BTRFS_SHARED_DATA_REF_KEY;
key.offset = parent;
size = sizeof(struct btrfs_shared_data_ref);
} else {
key.type = BTRFS_EXTENT_DATA_REF_KEY;
key.offset = hash_extent_data_ref(root_objectid,
owner, offset);
size = sizeof(struct btrfs_extent_data_ref);
}
ret = btrfs_insert_empty_item(trans, root, path, &key, size);
if (ret && ret != -EEXIST)
goto fail;
leaf = path->nodes[0];
if (parent) {
struct btrfs_shared_data_ref *ref;
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_shared_data_ref);
if (ret == 0) {
btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
} else {
num_refs = btrfs_shared_data_ref_count(leaf, ref);
num_refs += refs_to_add;
btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
}
} else {
struct btrfs_extent_data_ref *ref;
while (ret == -EEXIST) {
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_data_ref);
if (match_extent_data_ref(leaf, ref, root_objectid,
owner, offset))
break;
btrfs_release_path(path);
key.offset++;
ret = btrfs_insert_empty_item(trans, root, path, &key,
size);
if (ret && ret != -EEXIST)
goto fail;
leaf = path->nodes[0];
}
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_data_ref);
if (ret == 0) {
btrfs_set_extent_data_ref_root(leaf, ref,
root_objectid);
btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
btrfs_set_extent_data_ref_offset(leaf, ref, offset);
btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
} else {
num_refs = btrfs_extent_data_ref_count(leaf, ref);
num_refs += refs_to_add;
btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
}
}
btrfs_mark_buffer_dirty(leaf);
ret = 0;
fail:
btrfs_release_path(path);
return ret;
}
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
int refs_to_drop, int *last_ref)
{
struct btrfs_key key;
struct btrfs_extent_data_ref *ref1 = NULL;
struct btrfs_shared_data_ref *ref2 = NULL;
struct extent_buffer *leaf;
u32 num_refs = 0;
int ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
ref1 = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_data_ref);
num_refs = btrfs_extent_data_ref_count(leaf, ref1);
} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
ref2 = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_shared_data_ref);
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
btrfs_print_v0_err(trans->fs_info);
btrfs_abort_transaction(trans, -EINVAL);
return -EINVAL;
} else {
BUG();
}
BUG_ON(num_refs < refs_to_drop);
num_refs -= refs_to_drop;
if (num_refs == 0) {
ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
*last_ref = 1;
} else {
if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
btrfs_mark_buffer_dirty(leaf);
}
return ret;
}
static noinline u32 extent_data_ref_count(struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref)
{
struct btrfs_key key;
struct extent_buffer *leaf;
struct btrfs_extent_data_ref *ref1;
struct btrfs_shared_data_ref *ref2;
u32 num_refs = 0;
int type;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); if (iref) {
/*
* If type is invalid, we should have bailed out earlier than
* this call.
*/
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
ASSERT(type != BTRFS_REF_TYPE_INVALID);
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
num_refs = btrfs_extent_data_ref_count(leaf, ref1);
} else {
ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
}
} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { ref1 = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_data_ref);
num_refs = btrfs_extent_data_ref_count(leaf, ref1);
} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { ref2 = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_shared_data_ref);
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
} else {
WARN_ON(1);
}
return num_refs;
}
static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 bytenr, u64 parent,
u64 root_objectid)
{
struct btrfs_root *root = trans->fs_info->extent_root;
struct btrfs_key key;
int ret;
key.objectid = bytenr;
if (parent) {
key.type = BTRFS_SHARED_BLOCK_REF_KEY;
key.offset = parent;
} else {
key.type = BTRFS_TREE_BLOCK_REF_KEY;
key.offset = root_objectid;
}
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
ret = -ENOENT;
return ret;
}
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 bytenr, u64 parent,
u64 root_objectid)
{
struct btrfs_key key;
int ret;
key.objectid = bytenr;
if (parent) {
key.type = BTRFS_SHARED_BLOCK_REF_KEY;
key.offset = parent;
} else {
key.type = BTRFS_TREE_BLOCK_REF_KEY;
key.offset = root_objectid;
}
ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
path, &key, 0);
btrfs_release_path(path);
return ret;
}
static inline int extent_ref_type(u64 parent, u64 owner)
{
int type;
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
if (parent > 0)
type = BTRFS_SHARED_BLOCK_REF_KEY;
else
type = BTRFS_TREE_BLOCK_REF_KEY;
} else {
if (parent > 0)
type = BTRFS_SHARED_DATA_REF_KEY;
else
type = BTRFS_EXTENT_DATA_REF_KEY;
}
return type;
}
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key)
{
for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path->nodes[level])
break;
if (path->slots[level] + 1 >=
btrfs_header_nritems(path->nodes[level]))
continue;
if (level == 0)
btrfs_item_key_to_cpu(path->nodes[level], key,
path->slots[level] + 1);
else
btrfs_node_key_to_cpu(path->nodes[level], key,
path->slots[level] + 1);
return 0;
}
return 1;
}
/*
* look for inline back ref. if back ref is found, *ref_ret is set
* to the address of inline back ref, and 0 is returned.
*
* if back ref isn't found, *ref_ret is set to the address where it
* should be inserted, and -ENOENT is returned.
*
* if insert is true and there are too many inline back refs, the path
* points to the extent item, and -EAGAIN is returned.
*
* NOTE: inline back refs are ordered in the same way that back ref
* items in the tree are ordered.
*/
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_extent_inline_ref **ref_ret,
u64 bytenr, u64 num_bytes,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int insert)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_key key;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
u64 flags;
u64 item_size;
unsigned long ptr;
unsigned long end;
int extra_size;
int type;
int want;
int ret;
int err = 0;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
int needed;
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
want = extent_ref_type(parent, owner);
if (insert) {
extra_size = btrfs_extent_inline_ref_size(want);
path->search_for_extension = 1;
path->keep_locks = 1;
} else
extra_size = -1;
/*
* Owner is our level, so we can just add one to get the level for the
* block we are interested in.
*/
if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { key.type = BTRFS_METADATA_ITEM_KEY;
key.offset = owner;
}
again:
ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
if (ret < 0) {
err = ret;
goto out;
}
/*
* We may be a newly converted file system which still has the old fat
* extent entries for metadata, so try and see if we have one of those.
*/
if (ret > 0 && skinny_metadata) {
skinny_metadata = false;
if (path->slots[0]) { path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
if (key.objectid == bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == num_bytes)
ret = 0;
}
if (ret) {
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
btrfs_release_path(path);
goto again;
}
}
if (ret && !insert) {
err = -ENOENT;
goto out;
} else if (WARN_ON(ret)) {
err = -EIO;
goto out;
}
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
err = -EINVAL;
btrfs_print_v0_err(fs_info);
btrfs_abort_transaction(trans, err);
goto out;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
ptr = (unsigned long)(ei + 1);
end = (unsigned long)ei + item_size;
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end);
}
if (owner >= BTRFS_FIRST_FREE_OBJECTID)
needed = BTRFS_REF_TYPE_DATA;
else
needed = BTRFS_REF_TYPE_BLOCK;
err = -ENOENT;
while (1) {
if (ptr >= end) { WARN_ON(ptr > end);
break;
}
iref = (struct btrfs_extent_inline_ref *)ptr;
type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
if (type == BTRFS_REF_TYPE_INVALID) {
err = -EUCLEAN;
goto out;
}
if (want < type)
break;
if (want > type) {
ptr += btrfs_extent_inline_ref_size(type);
continue;
}
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
struct btrfs_extent_data_ref *dref;
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
if (match_extent_data_ref(leaf, dref, root_objectid,
owner, offset)) {
err = 0;
break;
}
if (hash_extent_data_ref_item(leaf, dref) <
hash_extent_data_ref(root_objectid, owner, offset))
break;
} else {
u64 ref_offset;
ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
if (parent > 0) {
if (parent == ref_offset) {
err = 0;
break;
}
if (ref_offset < parent)
break;
} else {
if (root_objectid == ref_offset) {
err = 0;
break;
}
if (ref_offset < root_objectid)
break;
}
}
ptr += btrfs_extent_inline_ref_size(type);
}
if (err == -ENOENT && insert) { if (item_size + extra_size >=
BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
err = -EAGAIN;
goto out;
}
/*
* To add new inline back ref, we have to make sure
* there is no corresponding back ref item.
* For simplicity, we just do not add new inline back
* ref if there is any kind of item for this block
*/
if (find_next_key(path, 0, &key) == 0 && key.objectid == bytenr && key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
err = -EAGAIN;
goto out;
}
}
*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
if (insert) { path->keep_locks = 0;
path->search_for_extension = 0;
btrfs_unlock_up_safe(path, 1);
}
return err;
}
/*
* helper to add new inline back ref
*/
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int refs_to_add,
struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
unsigned long ptr;
unsigned long end;
unsigned long item_offset;
u64 refs;
int size;
int type;
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
item_offset = (unsigned long)iref - (unsigned long)ei;
type = extent_ref_type(parent, owner);
size = btrfs_extent_inline_ref_size(type);
btrfs_extend_item(path, size);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
refs += refs_to_add;
btrfs_set_extent_refs(leaf, ei, refs);
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, ei);
ptr = (unsigned long)ei + item_offset;
end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
if (ptr < end - size)
memmove_extent_buffer(leaf, ptr + size, ptr,
end - size - ptr);
iref = (struct btrfs_extent_inline_ref *)ptr;
btrfs_set_extent_inline_ref_type(leaf, iref, type);
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
struct btrfs_extent_data_ref *dref;
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
btrfs_set_extent_data_ref_offset(leaf, dref, offset);
btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
struct btrfs_shared_data_ref *sref;
sref = (struct btrfs_shared_data_ref *)(iref + 1);
btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
} else {
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
}
btrfs_mark_buffer_dirty(leaf);
}
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_extent_inline_ref **ref_ret,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset)
{
int ret;
ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
num_bytes, parent, root_objectid,
owner, offset, 0);
if (ret != -ENOENT)
return ret;
btrfs_release_path(path);
*ref_ret = NULL;
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = lookup_tree_block_ref(trans, path, bytenr, parent,
root_objectid);
} else {
ret = lookup_extent_data_ref(trans, path, bytenr, parent,
root_objectid, owner, offset);
}
return ret;
}
/*
* helper to update/remove inline back ref
*/
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
struct btrfs_delayed_extent_op *extent_op,
int *last_ref)
{
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_extent_item *ei;
struct btrfs_extent_data_ref *dref = NULL;
struct btrfs_shared_data_ref *sref = NULL;
unsigned long ptr;
unsigned long end;
u32 item_size;
int size;
int type;
u64 refs;
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
refs += refs_to_mod;
btrfs_set_extent_refs(leaf, ei, refs);
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, ei);
/*
* If type is invalid, we should have bailed out after
* lookup_inline_extent_backref().
*/
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
ASSERT(type != BTRFS_REF_TYPE_INVALID);
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
refs = btrfs_extent_data_ref_count(leaf, dref);
} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
sref = (struct btrfs_shared_data_ref *)(iref + 1);
refs = btrfs_shared_data_ref_count(leaf, sref);
} else {
refs = 1;
BUG_ON(refs_to_mod != -1);
}
BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
refs += refs_to_mod;
if (refs > 0) {
if (type == BTRFS_EXTENT_DATA_REF_KEY)
btrfs_set_extent_data_ref_count(leaf, dref, refs);
else
btrfs_set_shared_data_ref_count(leaf, sref, refs);
} else {
*last_ref = 1;
size = btrfs_extent_inline_ref_size(type);
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ptr = (unsigned long)iref;
end = (unsigned long)ei + item_size;
if (ptr + size < end)
memmove_extent_buffer(leaf, ptr, ptr + size,
end - ptr - size);
item_size -= size;
btrfs_truncate_item(path, item_size, 1);
}
btrfs_mark_buffer_dirty(leaf);
}
static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner,
u64 offset, int refs_to_add,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_extent_inline_ref *iref;
int ret;
ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
num_bytes, parent, root_objectid,
owner, offset, 1);
if (ret == 0) {
/*
* We're adding refs to a tree block we already own, this
* should not happen at all.
*/
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
bytenr, num_bytes, root_objectid);
if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
WARN_ON(1);
btrfs_crit(trans->fs_info,
"path->slots[0]=%d path->nodes[0]:", path->slots[0]);
btrfs_print_leaf(path->nodes[0]);
}
return -EUCLEAN;
}
update_inline_extent_backref(path, iref, refs_to_add,
extent_op, NULL);
} else if (ret == -ENOENT) {
setup_inline_extent_backref(trans->fs_info, path, iref, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
ret = 0;
}
return ret;
}
static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_drop, int is_data, int *last_ref)
{
int ret = 0;
BUG_ON(!is_data && refs_to_drop != 1);
if (iref) {
update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
last_ref);
} else if (is_data) {
ret = remove_extent_data_ref(trans, path, refs_to_drop,
last_ref);
} else {
*last_ref = 1;
ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
}
return ret;
}
static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
u64 *discarded_bytes)
{
int j, ret = 0;
u64 bytes_left, end;
u64 aligned_start = ALIGN(start, 1 << 9); if (WARN_ON(start != aligned_start)) {
len -= aligned_start - start;
len = round_down(len, 1 << 9);
start = aligned_start;
}
*discarded_bytes = 0;
if (!len)
return 0;
end = start + len;
bytes_left = len;
/* Skip any superblocks on this device. */
for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
u64 sb_start = btrfs_sb_offset(j);
u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; u64 size = sb_start - start; if (!in_range(sb_start, start, bytes_left) && !in_range(sb_end, start, bytes_left) && !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
continue;
/*
* Superblock spans beginning of range. Adjust start and
* try again.
*/
if (sb_start <= start) {
start += sb_end - start;
if (start > end) {
bytes_left = 0;
break;
}
bytes_left = end - start;
continue;
}
if (size) {
ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
GFP_NOFS, 0);
if (!ret)
*discarded_bytes += size; else if (ret != -EOPNOTSUPP)
return ret;
}
start = sb_end;
if (start > end) {
bytes_left = 0;
break;
}
bytes_left = end - start;
}
if (bytes_left) { ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
GFP_NOFS, 0);
if (!ret)
*discarded_bytes += bytes_left;
}
return ret;
}
static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes)
{
struct btrfs_device *dev = stripe->dev;
struct btrfs_fs_info *fs_info = dev->fs_info;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
u64 phys = stripe->physical;
u64 len = stripe->length;
u64 discarded = 0;
int ret = 0;
/* Zone reset on a zoned filesystem */
if (btrfs_can_zone_reset(dev, phys, len)) {
u64 src_disc;
ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
if (ret)
goto out;
if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
dev != dev_replace->srcdev)
goto out;
src_disc = discarded;
/* Send to replace target as well */
ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
&discarded);
discarded += src_disc;
} else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) { ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
} else {
ret = 0;
*bytes = 0;
}
out:
*bytes = discarded; return ret;
}
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes)
{
int ret = 0;
u64 discarded_bytes = 0;
u64 end = bytenr + num_bytes;
u64 cur = bytenr;
struct btrfs_bio *bbio = NULL;
/*
* Avoid races with device replace and make sure our bbio has devices
* associated to its stripes that don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
while (cur < end) {
struct btrfs_bio_stripe *stripe;
int i;
num_bytes = end - cur;
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
&num_bytes, &bbio, 0);
/*
* Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
* -EOPNOTSUPP. For any such error, @num_bytes is not updated,
* thus we can't continue anyway.
*/
if (ret < 0)
goto out;
stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
struct btrfs_device *device = stripe->dev;
if (!device->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue;
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
continue;
ret = do_discard_extent(stripe, &bytes);
if (!ret) {
discarded_bytes += bytes;
} else if (ret != -EOPNOTSUPP) {
/*
* Logic errors or -ENOMEM, or -EIO, but
* unlikely to happen.
*
* And since there are two loops, explicitly
* go to out to avoid confusion.
*/
btrfs_put_bbio(bbio);
goto out;
}
/*
* Just in case we get back EOPNOTSUPP for some reason,
* just ignore the return value so we don't screw up
* people calling discard_extent.
*/
ret = 0;
}
btrfs_put_bbio(bbio);
cur += num_bytes;
}
out:
btrfs_bio_counter_dec(fs_info);
if (actual_bytes)
*actual_bytes = discarded_bytes; if (ret == -EOPNOTSUPP)
ret = 0;
return ret;
}
/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
else
ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
btrfs_ref_tree_mod(fs_info, generic_ref);
return ret;
}
/*
* __btrfs_inc_extent_ref - insert backreference for a given extent
*
* The counterpart is in __btrfs_free_extent(), with examples and more details
* how it works.
*
* @trans: Handle of transaction
*
* @node: The delayed ref node used to get the bytenr/length for
* extent whose references are incremented.
*
* @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
* BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
* bytenr of the parent block. Since new extents are always
* created with indirect references, this will only be the case
* when relocating a shared extent. In that case, root_objectid
* will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
* be 0
*
* @root_objectid: The id of the root where this modification has originated,
* this can be either one of the well-known metadata trees or
* the subvolume id which references this extent.
*
* @owner: For data extents it is the inode number of the owning file.
* For metadata extents this parameter holds the level in the
* tree of the extent.
*
* @offset: For metadata extents the offset is ignored and is currently
* always passed as 0. For data extents it is the fileoffset
* this extent belongs to.
*
* @refs_to_add Number of references to add
*
* @extent_op Pointer to a structure, holding information necessary when
* updating a tree block's flags
*
*/
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int refs_to_add,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
struct btrfs_key key;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
u64 refs;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
parent, root_objectid, owner,
offset, refs_to_add, extent_op);
if ((ret < 0 && ret != -EAGAIN) || !ret)
goto out;
/*
* Ok we had -EAGAIN which means we didn't have space to insert and
* inline extent ref, so just update the reference count and add a
* normal backref.
*/
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, item);
btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
/* now insert the actual backref */
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
BUG_ON(refs_to_add != 1);
ret = insert_tree_block_ref(trans, path, bytenr, parent,
root_objectid);
} else {
ret = insert_extent_data_ref(trans, path, bytenr, parent,
root_objectid, owner, offset,
refs_to_add);
}
if (ret)
btrfs_abort_transaction(trans, ret);
out:
btrfs_free_path(path);
return ret;
}
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
int insert_reserved)
{
int ret = 0;
struct btrfs_delayed_data_ref *ref;
struct btrfs_key ins;
u64 parent = 0;
u64 ref_root = 0;
u64 flags = 0;
ins.objectid = node->bytenr;
ins.offset = node->num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
ref = btrfs_delayed_node_to_data_ref(node);
trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; ref_root = ref->root;
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
if (extent_op) flags |= extent_op->flags_to_set; ret = alloc_reserved_file_extent(trans, parent, ref_root,
flags, ref->objectid,
ref->offset, &ins,
node->ref_mod);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
ref->objectid, ref->offset,
node->ref_mod, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, node, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
extent_op);
} else {
BUG();
}
return ret;
}
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
struct btrfs_extent_item *ei)
{
u64 flags = btrfs_extent_flags(leaf, ei);
if (extent_op->update_flags) {
flags |= extent_op->flags_to_set;
btrfs_set_extent_flags(leaf, ei, flags);
}
if (extent_op->update_key) {
struct btrfs_tree_block_info *bi;
BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
bi = (struct btrfs_tree_block_info *)(ei + 1);
btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
}
}
static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_key key;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
struct extent_buffer *leaf;
u32 item_size;
int ret;
int err = 0;
int metadata = !extent_op->is_data;
if (TRANS_ABORTED(trans))
return 0;
if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
metadata = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = head->bytenr;
if (metadata) {
key.type = BTRFS_METADATA_ITEM_KEY;
key.offset = extent_op->level;
} else {
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = head->num_bytes;
}
again:
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
if (ret < 0) {
err = ret;
goto out;
}
if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
if (key.objectid == head->bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == head->num_bytes)
ret = 0;
}
if (ret > 0) {
btrfs_release_path(path);
metadata = 0;
key.objectid = head->bytenr;
key.offset = head->num_bytes;
key.type = BTRFS_EXTENT_ITEM_KEY;
goto again;
}
} else {
err = -EIO;
goto out;
}
}
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
if (unlikely(item_size < sizeof(*ei))) {
err = -EINVAL;
btrfs_print_v0_err(fs_info);
btrfs_abort_transaction(trans, err);
goto out;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
__run_delayed_extent_op(extent_op, leaf, ei);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
return err;
}
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
int insert_reserved)
{
int ret = 0;
struct btrfs_delayed_tree_ref *ref;
u64 parent = 0;
u64 ref_root = 0;
ref = btrfs_delayed_node_to_tree_ref(node);
trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; ref_root = ref->root;
if (node->ref_mod != 1) {
btrfs_err(trans->fs_info,
"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
node->bytenr, node->ref_mod, node->action, ref_root,
parent);
return -EIO;
}
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { BUG_ON(!extent_op || !extent_op->update_flags);
ret = alloc_reserved_tree_block(trans, node, extent_op);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
ref->level, 0, 1, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) {
ret = __btrfs_free_extent(trans, node, parent, ref_root,
ref->level, 0, 1, extent_op);
} else {
BUG();
}
return ret;
}
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
int insert_reserved)
{
int ret = 0;
if (TRANS_ABORTED(trans)) {
if (insert_reserved) btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
return 0;
}
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
node->type == BTRFS_SHARED_BLOCK_REF_KEY)
ret = run_delayed_tree_ref(trans, node, extent_op,
insert_reserved);
else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
ret = run_delayed_data_ref(trans, node, extent_op,
insert_reserved);
else
BUG(); if (ret && insert_reserved) btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
return ret;
}
static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_ref_node *ref;
if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
return NULL;
/*
* Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
* This is to prevent a ref count from going down to zero, which deletes
* the extent item from the extent tree, when there still are references
* to add, which would fail because they would not find the extent item.
*/
if (!list_empty(&head->ref_add_list))
return list_first_entry(&head->ref_add_list,
struct btrfs_delayed_ref_node, add_list);
ref = rb_entry(rb_first_cached(&head->ref_tree),
struct btrfs_delayed_ref_node, ref_node);
ASSERT(list_empty(&ref->add_list));
return ref;
}
static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
spin_lock(&delayed_refs->lock);
head->processing = 0;
delayed_refs->num_heads_ready++;
spin_unlock(&delayed_refs->lock);
btrfs_delayed_ref_unlock(head);
}
static struct btrfs_delayed_extent_op *cleanup_extent_op(
struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
if (!extent_op)
return NULL;
if (head->must_insert_reserved) { head->extent_op = NULL;
btrfs_free_delayed_extent_op(extent_op);
return NULL;
}
return extent_op;
}
static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_extent_op *extent_op;
int ret;
extent_op = cleanup_extent_op(head);
if (!extent_op)
return 0;
head->extent_op = NULL;
spin_unlock(&head->lock);
ret = run_delayed_extent_op(trans, head, extent_op);
btrfs_free_delayed_extent_op(extent_op);
return ret ? ret : 1;
}
void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
int nr_items = 1; /* Dropping this ref head update. */
/*
* We had csum deletions accounted for in our delayed refs rsv, we need
* to drop the csum leaves for this update from our delayed_refs_rsv.
*/
if (head->total_ref_mod < 0 && head->is_data) {
spin_lock(&delayed_refs->lock);
delayed_refs->pending_csums -= head->num_bytes;
spin_unlock(&delayed_refs->lock);
nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
}
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
int ret;
delayed_refs = &trans->transaction->delayed_refs;
ret = run_and_cleanup_extent_op(trans, head);
if (ret < 0) {
unselect_delayed_ref_head(delayed_refs, head);
btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
return ret;
} else if (ret) {
return ret;
}
/*
* Need to drop our head ref lock and re-acquire the delayed ref lock
* and then re-check to make sure nobody got added.
*/
spin_unlock(&head->lock);
spin_lock(&delayed_refs->lock);
spin_lock(&head->lock);
if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
return 1;
}
btrfs_delete_ref_head(delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
if (head->must_insert_reserved) {
btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
if (head->is_data) {
ret = btrfs_del_csums(trans, fs_info->csum_root,
head->bytenr, head->num_bytes);
}
}
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
btrfs_put_delayed_ref_head(head);
return ret;
}
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
struct btrfs_trans_handle *trans)
{
struct btrfs_delayed_ref_root *delayed_refs =
&trans->transaction->delayed_refs;
struct btrfs_delayed_ref_head *head = NULL;
int ret;
spin_lock(&delayed_refs->lock);
head = btrfs_select_ref_head(delayed_refs);
if (!head) {
spin_unlock(&delayed_refs->lock);
return head;
}
/*
* Grab the lock that says we are going to process all the refs for
* this head
*/
ret = btrfs_delayed_ref_lock(delayed_refs, head);
spin_unlock(&delayed_refs->lock);
/*
* We may have dropped the spin lock to get the head mutex lock, and
* that might have given someone else time to free the head. If that's
* true, it has been removed from our list and we can move on.
*/
if (ret == -EAGAIN)
head = ERR_PTR(-EAGAIN);
return head;
}
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *locked_ref,
unsigned long *run_refs)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_extent_op *extent_op;
struct btrfs_delayed_ref_node *ref;
int must_insert_reserved = 0;
int ret;
delayed_refs = &trans->transaction->delayed_refs;
lockdep_assert_held(&locked_ref->mutex);
lockdep_assert_held(&locked_ref->lock);
while ((ref = select_delayed_ref(locked_ref))) { if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
unselect_delayed_ref_head(delayed_refs, locked_ref);
return -EAGAIN;
}
(*run_refs)++;
ref->in_tree = 0;
rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
/*
* When we play the delayed ref, also correct the ref_mod on
* head
*/
switch (ref->action) {
case BTRFS_ADD_DELAYED_REF:
case BTRFS_ADD_DELAYED_EXTENT:
locked_ref->ref_mod -= ref->ref_mod;
break;
case BTRFS_DROP_DELAYED_REF:
locked_ref->ref_mod += ref->ref_mod;
break;
default:
WARN_ON(1);
}
atomic_dec(&delayed_refs->num_entries);
/*
* Record the must_insert_reserved flag before we drop the
* spin lock.
*/
must_insert_reserved = locked_ref->must_insert_reserved;
locked_ref->must_insert_reserved = 0;
extent_op = locked_ref->extent_op;
locked_ref->extent_op = NULL;
spin_unlock(&locked_ref->lock);
ret = run_one_delayed_ref(trans, ref, extent_op,
must_insert_reserved);
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
unselect_delayed_ref_head(delayed_refs, locked_ref);
btrfs_put_delayed_ref(ref);
btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
ret);
return ret;
}
btrfs_put_delayed_ref(ref);
cond_resched();
spin_lock(&locked_ref->lock);
btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
}
return 0;
}
/*
* Returns 0 on success or if called with an already aborted transaction.
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
*/
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
unsigned long nr)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *locked_ref = NULL;
ktime_t start = ktime_get();
int ret;
unsigned long count = 0;
unsigned long actual_count = 0;
delayed_refs = &trans->transaction->delayed_refs;
do {
if (!locked_ref) { locked_ref = btrfs_obtain_ref_head(trans);
if (IS_ERR_OR_NULL(locked_ref)) {
if (PTR_ERR(locked_ref) == -EAGAIN) {
continue;
} else {
break;
}
}
count++;
}
/*
* We need to try and merge add/drops of the same ref since we
* can run into issues with relocate dropping the implicit ref
* and then it being added back again before the drop can
* finish. If we merged anything we need to re-loop so we can
* get a good ref.
* Or we can get node references of the same type that weren't
* merged when created due to bumps in the tree mod seq, and
* we need to merge them to prevent adding an inline extent
* backref before dropping it (triggering a BUG_ON at
* insert_inline_extent_backref()).
*/
spin_lock(&locked_ref->lock);
btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
&actual_count);
if (ret < 0 && ret != -EAGAIN) {
/*
* Error, btrfs_run_delayed_refs_for_head already
* unlocked everything so just bail out
*/
return ret;
} else if (!ret) {
/*
* Success, perform the usual cleanup of a processed
* head
*/
ret = cleanup_ref_head(trans, locked_ref);
if (ret > 0 ) {
/* We dropped our lock, we need to loop. */
ret = 0;
continue;
} else if (ret) {
return ret;
}
}
/*
* Either success case or btrfs_run_delayed_refs_for_head
* returned -EAGAIN, meaning we need to select another head
*/
locked_ref = NULL;
cond_resched();
} while ((nr != -1 && count < nr) || locked_ref);
/*
* We don't want to include ref heads since we can have empty ref heads
* and those will drastically skew our runtime down since we just do
* accounting, no actual extent tree updates.
*/
if (actual_count > 0) { u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
u64 avg;
/*
* We weigh the current average higher than our current runtime
* to avoid large swings in the average.
*/
spin_lock(&delayed_refs->lock);
avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
spin_unlock(&delayed_refs->lock);
}
return 0;
}
#ifdef SCRAMBLE_DELAYED_REFS
/*
* Normally delayed refs get processed in ascending bytenr order. This
* correlates in most cases to the order added. To expose dependencies on this
* order, we start to process the tree in the middle instead of the beginning
*/
static u64 find_middle(struct rb_root *root)
{
struct rb_node *n = root->rb_node;
struct btrfs_delayed_ref_node *entry;
int alt = 1;
u64 middle;
u64 first = 0, last = 0;
n = rb_first(root);
if (n) {
entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
first = entry->bytenr;
}
n = rb_last(root);
if (n) {
entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
last = entry->bytenr;
}
n = root->rb_node;
while (n) {
entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
WARN_ON(!entry->in_tree);
middle = entry->bytenr;
if (alt)
n = n->rb_left;
else
n = n->rb_right;
alt = 1 - alt;
}
return middle;
}
#endif
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
* 0, which means to process everything in the tree at the start
* of the run (but not newly added entries), or it can be some target
* number you'd like to process.
*
* Returns 0 on success or if called with an aborted transaction
* Returns <0 on error and aborts the transaction
*/
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
unsigned long count)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
/* We'll clean this up in btrfs_cleanup_transaction */
if (TRANS_ABORTED(trans))
return 0; if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
return 0;
delayed_refs = &trans->transaction->delayed_refs;
if (count == 0)
count = delayed_refs->num_heads_ready;
again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
ret = __btrfs_run_delayed_refs(trans, count);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
return ret;
}
if (run_all) { btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
node = rb_first_cached(&delayed_refs->href_root);
if (!node) {
spin_unlock(&delayed_refs->lock);
goto out;
}
head = rb_entry(node, struct btrfs_delayed_ref_head,
href_node);
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
/* Mutex was contended, block until it's released and retry. */
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
cond_resched();
goto again;
}
out:
return 0;
}
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags,
int level, int is_data)
{
struct btrfs_delayed_extent_op *extent_op;
int ret;
extent_op = btrfs_alloc_delayed_extent_op();
if (!extent_op)
return -ENOMEM;
extent_op->flags_to_set = flags;
extent_op->update_flags = true;
extent_op->update_key = false;
extent_op->is_data = is_data ? true : false;
extent_op->level = level;
ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
if (ret)
btrfs_free_delayed_extent_op(extent_op);
return ret;
}
static noinline int check_delayed_ref(struct btrfs_root *root,
struct btrfs_path *path,
u64 objectid, u64 offset, u64 bytenr)
{
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_data_ref *data_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_transaction *cur_trans;
struct rb_node *node;
int ret = 0;
spin_lock(&root->fs_info->trans_lock);
cur_trans = root->fs_info->running_transaction;
if (cur_trans)
refcount_inc(&cur_trans->use_count);
spin_unlock(&root->fs_info->trans_lock);
if (!cur_trans)
return 0;
delayed_refs = &cur_trans->delayed_refs;
spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (!head) {
spin_unlock(&delayed_refs->lock);
btrfs_put_transaction(cur_trans);
return 0;
}
if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
btrfs_release_path(path);
/*
* Mutex was contended, block until it's released and let
* caller try again
*/
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
btrfs_put_transaction(cur_trans);
return -EAGAIN;
}
spin_unlock(&delayed_refs->lock);
spin_lock(&head->lock);
/*
* XXX: We should replace this with a proper search function in the
* future.
*/
for (node = rb_first_cached(&head->ref_tree); node;
node = rb_next(node)) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
ret = 1;
break;
}
data_ref = btrfs_delayed_node_to_data_ref(ref);
/*
* If our ref doesn't match the one we're currently looking at
* then we have a cross reference.
*/
if (data_ref->root != root->root_key.objectid || data_ref->objectid != objectid || data_ref->offset != offset) {
ret = 1;
break;
}
}
spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
btrfs_put_transaction(cur_trans);
return ret;
}
static noinline int check_committed_ref(struct btrfs_root *root,
struct btrfs_path *path,
u64 objectid, u64 offset, u64 bytenr,
bool strict)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
struct extent_buffer *leaf;
struct btrfs_extent_data_ref *ref;
struct btrfs_extent_inline_ref *iref;
struct btrfs_extent_item *ei;
struct btrfs_key key;
u32 item_size;
int type;
int ret;
key.objectid = bytenr;
key.offset = (u64)-1;
key.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
BUG_ON(ret == 0); /* Corruption */
ret = -ENOENT;
if (path->slots[0] == 0)
goto out;
path->slots[0]--;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
goto out;
ret = 1;
item_size = btrfs_item_size_nr(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
/* If extent item has more than 1 inline ref then it's shared */
if (item_size != sizeof(*ei) +
btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
goto out;
/*
* If extent created before last snapshot => it's shared unless the
* snapshot has been deleted. Use the heuristic if strict is false.
*/
if (!strict &&
(btrfs_extent_generation(leaf, ei) <=
btrfs_root_last_snapshot(&root->root_item)))
goto out;
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
/* If this extent has SHARED_DATA_REF then it's shared */
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
if (type != BTRFS_EXTENT_DATA_REF_KEY)
goto out;
ref = (struct btrfs_extent_data_ref *)(&iref->offset);
if (btrfs_extent_refs(leaf, ei) !=
btrfs_extent_data_ref_count(leaf, ref) ||
btrfs_extent_data_ref_root(leaf, ref) !=
root->root_key.objectid ||
btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
btrfs_extent_data_ref_offset(leaf, ref) != offset)
goto out;
ret = 0;
out:
return ret;
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
u64 bytenr, bool strict)
{
struct btrfs_path *path;
int ret;
path = btrfs_alloc_path(); if (!path)
return -ENOMEM;
do {
ret = check_committed_ref(root, path, objectid,
offset, bytenr, strict);
if (ret && ret != -ENOENT)
goto out;
ret = check_delayed_ref(root, path, objectid, offset, bytenr);
} while (ret == -EAGAIN);
out:
btrfs_free_path(path); if (btrfs_is_data_reloc_root(root)) WARN_ON(ret > 0);
return ret;
}
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
int full_backref, int inc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 num_bytes;
u64 parent;
u64 ref_root;
u32 nritems;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
struct btrfs_ref generic_ref = { 0 };
bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
int i;
int action;
int level;
int ret = 0;
if (btrfs_is_testing(fs_info))
return 0;
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
return 0;
if (full_backref)
parent = buf->start;
else
parent = 0;
if (inc)
action = BTRFS_ADD_DELAYED_REF;
else
action = BTRFS_DROP_DELAYED_REF;
for (i = 0; i < nritems; i++) {
if (level == 0) {
btrfs_item_key_to_cpu(buf, &key, i);
if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(buf, i,
struct btrfs_file_extent_item);
if (btrfs_file_extent_type(buf, fi) ==
BTRFS_FILE_EXTENT_INLINE)
continue;
bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
if (bytenr == 0)
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
key.offset -= btrfs_file_extent_offset(buf, fi);
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
generic_ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
key.offset);
generic_ref.skip_qgroup = for_reloc;
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
ret = btrfs_free_extent(trans, &generic_ref);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = fs_info->nodesize;
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
generic_ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
generic_ref.skip_qgroup = for_reloc;
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
ret = btrfs_free_extent(trans, &generic_ref);
if (ret)
goto fail;
}
}
return 0;
fail:
return ret;
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref)
{
return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref)
{
return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 flags;
u64 ret;
if (data)
flags = BTRFS_BLOCK_GROUP_DATA;
else if (root == fs_info->chunk_root)
flags = BTRFS_BLOCK_GROUP_SYSTEM;
else
flags = BTRFS_BLOCK_GROUP_METADATA;
ret = btrfs_get_alloc_profile(fs_info, flags);
return ret;
}
static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
{
struct btrfs_block_group *cache;
u64 bytenr;
spin_lock(&fs_info->block_group_cache_lock);
bytenr = fs_info->first_logical_byte;
spin_unlock(&fs_info->block_group_cache_lock);
if (bytenr < (u64)-1)
return bytenr;
cache = btrfs_lookup_first_block_group(fs_info, search_start);
if (!cache)
return 0;
bytenr = cache->start;
btrfs_put_block_group(cache);
return bytenr;
}
static int pin_down_extent(struct btrfs_trans_handle *trans,
struct btrfs_block_group *cache,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned += num_bytes;
btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
num_bytes);
if (reserved) {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
}
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
}
int btrfs_pin_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
btrfs_put_block_group(cache);
return 0;
}
/*
* this function must be called within transaction
*/
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes)
{
struct btrfs_block_group *cache;
int ret;
cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
if (!cache)
return -EINVAL;
/*
* pull in the free space cache (if any) so that our pin
* removes the free space from the cache. We have load_only set
* to one because the slow code to read in the free extents does check
* the pinned extents.
*/
btrfs_cache_block_group(cache, 1);
/*
* Make sure we wait until the cache is completely built in case it is
* missing or is invalid and therefore needs to be rebuilt.
*/
ret = btrfs_wait_block_group_cache_done(cache);
if (ret)
goto out;
pin_down_extent(trans, cache, bytenr, num_bytes, 0);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
out:
btrfs_put_block_group(cache);
return ret;
}
static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes)
{
int ret;
struct btrfs_block_group *block_group;
block_group = btrfs_lookup_block_group(fs_info, start);
if (!block_group)
return -EINVAL;
btrfs_cache_block_group(block_group, 1);
/*
* Make sure we wait until the cache is completely built in case it is
* missing or is invalid and therefore needs to be rebuilt.
*/
ret = btrfs_wait_block_group_cache_done(block_group);
if (ret)
goto out;
ret = btrfs_remove_free_space(block_group, start, num_bytes);
out:
btrfs_put_block_group(block_group);
return ret;
}
int btrfs_exclude_logged_extents(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct btrfs_file_extent_item *item;
struct btrfs_key key;
int found_type;
int i;
int ret = 0;
if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
return 0;
for (i = 0; i < btrfs_header_nritems(eb); i++) {
btrfs_item_key_to_cpu(eb, &key, i);
if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(eb, item);
if (found_type == BTRFS_FILE_EXTENT_INLINE)
continue;
if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
continue;
key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
if (ret)
break;
}
return ret;
}
static void
btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
{
atomic_inc(&bg->reservations);
}
/*
* Returns the free cluster for the given space info and sets empty_cluster to
* what it should be based on the mount options.
*/
static struct btrfs_free_cluster *
fetch_cluster_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 *empty_cluster)
{
struct btrfs_free_cluster *ret = NULL;
*empty_cluster = 0;
if (btrfs_mixed_space_info(space_info))
return ret;
if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { ret = &fs_info->meta_alloc_cluster;
if (btrfs_test_opt(fs_info, SSD))
*empty_cluster = SZ_2M;
else
*empty_cluster = SZ_64K; } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(fs_info, SSD_SPREAD)) { *empty_cluster = SZ_2M; ret = &fs_info->data_alloc_cluster;
}
return ret;
}
static int unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end,
const bool return_free_space)
{
struct btrfs_block_group *cache = NULL;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_free_cluster *cluster = NULL;
u64 len;
u64 total_unpinned = 0;
u64 empty_cluster = 0;
bool readonly;
while (start <= end) {
readonly = false;
if (!cache || start >= cache->start + cache->length) {
if (cache)
btrfs_put_block_group(cache);
total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start); BUG_ON(!cache); /* Logic error */ cluster = fetch_cluster_info(fs_info,
cache->space_info,
&empty_cluster);
empty_cluster <<= 1;
}
len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
down_read(&fs_info->commit_root_sem);
if (start < cache->last_byte_to_unpin && return_free_space) { u64 add_len = min(len, cache->last_byte_to_unpin - start);
btrfs_add_free_space(cache, start, add_len);
}
up_read(&fs_info->commit_root_sem);
start += len;
total_unpinned += len;
space_info = cache->space_info;
/*
* If this space cluster has been marked as fragmented and we've
* unpinned enough in this block group to potentially allow a
* cluster to be created inside of it go ahead and clear the
* fragmented check.
*/
if (cluster && cluster->fragmented && total_unpinned > empty_cluster) {
spin_lock(&cluster->lock);
cluster->fragmented = 0;
spin_unlock(&cluster->lock);
}
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
space_info->max_extent_size = 0;
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
} else if (btrfs_is_zoned(fs_info)) {
/* Need reset before reusing in a zoned block group */
space_info->bytes_zone_unusable += len;
readonly = true;
}
spin_unlock(&cache->lock);
if (!readonly && return_free_space &&
global_rsv->space_info == space_info) {
u64 to_add = len;
spin_lock(&global_rsv->lock);
if (!global_rsv->full) {
to_add = min(len, global_rsv->size -
global_rsv->reserved);
global_rsv->reserved += to_add;
btrfs_space_info_update_bytes_may_use(fs_info,
space_info, to_add);
if (global_rsv->reserved >= global_rsv->size)
global_rsv->full = 1; len -= to_add;
}
spin_unlock(&global_rsv->lock);
}
/* Add to any tickets we may have */
if (!readonly && return_free_space && len) btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
if (cache) btrfs_put_block_group(cache); return 0;
}
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *block_group, *tmp;
struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
unpin = &trans->transaction->pinned_extents;
while (!TRANS_ABORTED(trans)) { struct extent_state *cached_state = NULL;
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, &cached_state);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start,
end + 1 - start, NULL);
clear_extent_dirty(unpin, start, end, &cached_state);
unpin_extent_range(fs_info, start, end, true);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
free_extent_state(cached_state);
cond_resched();
}
if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { btrfs_discard_calc_delay(&fs_info->discard_ctl);
btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
}
/*
* Transaction is finished. We don't need the lock anymore. We
* do need to clean up the block groups in case of a transaction
* abort.
*/
deleted_bgs = &trans->transaction->deleted_bgs;
list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
u64 trimmed = 0;
ret = -EROFS;
if (!TRANS_ABORTED(trans))
ret = btrfs_discard_extent(fs_info,
block_group->start,
block_group->length,
&trimmed);
list_del_init(&block_group->bg_list);
btrfs_unfreeze_block_group(block_group);
btrfs_put_block_group(block_group);
if (ret) {
const char *errstr = btrfs_decode_error(ret);
btrfs_warn(fs_info,
"discard failed while removing blockgroup: errno=%d %s",
ret, errstr);
}
}
return 0;
}
/*
* Drop one or more refs of @node.
*
* 1. Locate the extent refs.
* It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
* Locate it, then reduce the refs number or remove the ref line completely.
*
* 2. Update the refs count in EXTENT/METADATA_ITEM
*
* Inline backref case:
*
* in extent tree we have:
*
* item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
* refs 2 gen 6 flags DATA
* extent data backref root FS_TREE objectid 258 offset 0 count 1
* extent data backref root FS_TREE objectid 257 offset 0 count 1
*
* This function gets called with:
*
* node->bytenr = 13631488
* node->num_bytes = 1048576
* root_objectid = FS_TREE
* owner_objectid = 257
* owner_offset = 0
* refs_to_drop = 1
*
* Then we should get some like:
*
* item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
* refs 1 gen 6 flags DATA
* extent data backref root FS_TREE objectid 258 offset 0 count 1
*
* Keyed backref case:
*
* in extent tree we have:
*
* item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
* refs 754 gen 6 flags DATA
* [...]
* item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
* extent data backref root FS_TREE objectid 866 offset 0 count 1
*
* This function get called with:
*
* node->bytenr = 13631488
* node->num_bytes = 1048576
* root_objectid = FS_TREE
* owner_objectid = 866
* owner_offset = 0
* refs_to_drop = 1
*
* Then we should get some like:
*
* item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
* refs 753 gen 6 flags DATA
*
* And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
*/
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_key key;
struct btrfs_path *path;
struct btrfs_root *extent_root = info->extent_root;
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
int ret;
int is_data;
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
u32 item_size;
u64 refs;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
int last_ref = 0;
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; if (!is_data && refs_to_drop != 1) {
btrfs_crit(info,
"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
node->bytenr, refs_to_drop);
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
if (is_data)
skinny_metadata = false;
ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
parent, root_objectid, owner_objectid,
owner_offset);
if (ret == 0) {
/*
* Either the inline backref or the SHARED_DATA_REF/
* SHARED_BLOCK_REF is found
*
* Here is a quick path to locate EXTENT/METADATA_ITEM.
* It's possible the EXTENT/METADATA_ITEM is near current slot.
*/
extent_slot = path->slots[0];
while (extent_slot >= 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key,
extent_slot);
if (key.objectid != bytenr)
break;
if (key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == num_bytes) {
found_extent = 1;
break;
}
if (key.type == BTRFS_METADATA_ITEM_KEY && key.offset == owner_objectid) {
found_extent = 1;
break;
}
/* Quick path didn't find the EXTEMT/METADATA_ITEM */
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
}
if (!found_extent) {
if (iref) {
btrfs_crit(info,
"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
btrfs_abort_transaction(trans, -EUCLEAN);
goto err_dump;
}
/* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, path, NULL,
refs_to_drop,
is_data, &last_ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
/* Slow path to locate EXTENT/METADATA_ITEM */
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
if (!is_data && skinny_metadata) { key.type = BTRFS_METADATA_ITEM_KEY;
key.offset = owner_objectid;
}
ret = btrfs_search_slot(trans, extent_root,
&key, path, -1, 1);
if (ret > 0 && skinny_metadata && path->slots[0]) {
/*
* Couldn't find our skinny metadata item,
* see if we have ye olde extent item.
*/
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
if (key.objectid == bytenr &&
key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == num_bytes)
ret = 0;
}
if (ret > 0 && skinny_metadata) {
skinny_metadata = false;
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
btrfs_release_path(path);
ret = btrfs_search_slot(trans, extent_root,
&key, path, -1, 1);
}
if (ret) {
btrfs_err(info,
"umm, got %d back from search, was looking for %llu",
ret, bytenr);
if (ret > 0)
btrfs_print_leaf(path->nodes[0]);
}
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
}
extent_slot = path->slots[0];
}
} else if (WARN_ON(ret == -ENOENT)) {
btrfs_print_leaf(path->nodes[0]);
btrfs_err(info,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
bytenr, parent, root_objectid, owner_objectid,
owner_offset);
btrfs_abort_transaction(trans, ret);
goto out;
} else {
btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, extent_slot);
if (unlikely(item_size < sizeof(*ei))) {
ret = -EINVAL;
btrfs_print_v0_err(info);
btrfs_abort_transaction(trans, ret);
goto out;
}
ei = btrfs_item_ptr(leaf, extent_slot,
struct btrfs_extent_item);
if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
if (item_size < sizeof(*ei) + sizeof(*bi)) {
btrfs_crit(info,
"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
key.objectid, key.type, key.offset,
owner_objectid, item_size,
sizeof(*ei) + sizeof(*bi));
btrfs_abort_transaction(trans, -EUCLEAN);
goto err_dump;
}
bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
}
refs = btrfs_extent_refs(leaf, ei);
if (refs < refs_to_drop) {
btrfs_crit(info,
"trying to drop %d refs but we only have %llu for bytenr %llu",
refs_to_drop, refs, bytenr);
btrfs_abort_transaction(trans, -EUCLEAN);
goto err_dump;
}
refs -= refs_to_drop;
if (refs > 0) {
if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei);
/*
* In the case of inline back ref, reference count will
* be updated by remove_extent_backref
*/
if (iref) { if (!found_extent) {
btrfs_crit(info,
"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
btrfs_abort_transaction(trans, -EUCLEAN);
goto err_dump;
}
} else {
btrfs_set_extent_refs(leaf, ei, refs);
btrfs_mark_buffer_dirty(leaf);
}
if (found_extent) { ret = remove_extent_backref(trans, path, iref,
refs_to_drop, is_data,
&last_ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
} else {
/* In this branch refs == 1 */
if (found_extent) { if (is_data && refs_to_drop != extent_data_ref_count(path, iref)) { btrfs_crit(info,
"invalid refs_to_drop, current refs %u refs_to_drop %u",
extent_data_ref_count(path, iref),
refs_to_drop);
btrfs_abort_transaction(trans, -EUCLEAN);
goto err_dump;
}
if (iref) { if (path->slots[0] != extent_slot) {
btrfs_crit(info,
"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
key.objectid, key.type,
key.offset);
btrfs_abort_transaction(trans, -EUCLEAN);
goto err_dump;
}
} else {
/*
* No inline ref, we must be at SHARED_* item,
* And it's single ref, it must be:
* | extent_slot ||extent_slot + 1|
* [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
*/
if (path->slots[0] != extent_slot + 1) {
btrfs_crit(info,
"invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
btrfs_abort_transaction(trans, -EUCLEAN);
goto err_dump;
}
path->slots[0] = extent_slot;
num_to_del = 2;
}
}
last_ref = 1;
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
if (is_data) {
ret = btrfs_del_csums(trans, info->csum_root, bytenr,
num_bytes);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
ret = add_to_free_space_tree(trans, bytenr, num_bytes);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
btrfs_release_path(path);
out:
btrfs_free_path(path); return ret;
err_dump:
/*
* Leaf dump can take up a lot of log buffer, so we only do full leaf
* dump for debug build.
*/
if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
path->slots[0], extent_slot);
btrfs_print_leaf(path->nodes[0]);
}
btrfs_free_path(path);
return -EUCLEAN;
}
/*
* when we free an block, it is possible (and likely) that we free the last
* delayed ref for that extent as well. This searches the delayed ref tree for
* a given extent, and if there are no other delayed refs to be processed, it
* removes it from the tree.
*/
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
u64 bytenr)
{
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
if (!head)
goto out_delayed_unlock;
spin_lock(&head->lock);
if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
goto out;
if (cleanup_extent_op(head) != NULL)
goto out;
/*
* waiting for the lock here would deadlock. If someone else has it
* locked they are already in the process of dropping it anyway
*/
if (!mutex_trylock(&head->mutex))
goto out;
btrfs_delete_ref_head(delayed_refs, head);
head->processing = 0;
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
BUG_ON(head->extent_op); if (head->must_insert_reserved)
ret = 1;
btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
return ret;
out:
spin_unlock(&head->lock);
out_delayed_unlock:
spin_unlock(&delayed_refs->lock);
return 0;
}
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
u64 parent, int last_ref)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ref generic_ref = { 0 };
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
buf->start, buf->len, parent);
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
root->root_key.objectid);
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */
}
if (last_ref && btrfs_header_generation(buf) == trans->transid) {
struct btrfs_block_group *cache;
bool must_pin = false;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); if (!ret) {
btrfs_redirty_list_add(trans->transaction, buf);
goto out;
}
}
cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
pin_down_extent(trans, cache, buf->start, buf->len, 1);
btrfs_put_block_group(cache);
goto out;
}
/*
* If this is a leaf and there are tree mod log users, we may
* have recorded mod log operations that point to this leaf.
* So we must make sure no one reuses this leaf's extent before
* mod log operations are applied to a node, otherwise after
* rewinding a node using the mod log operations we get an
* inconsistent btree, as the leaf's extent may now be used as
* a node or leaf for another different btree.
* We are safe from races here because at this point no other
* node or root points to this extent buffer, so if after this
* check a new tree mod log user joins, it will not be able to
* find a node pointing to this leaf and record operations that
* point to this leaf.
*/
if (btrfs_header_level(buf) == 0 &&
test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
must_pin = true;
if (must_pin || btrfs_is_zoned(fs_info)) {
btrfs_redirty_list_add(trans->transaction, buf);
pin_down_extent(trans, cache, buf->start, buf->len, 1);
btrfs_put_block_group(cache);
goto out;
}
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len);
btrfs_free_reserved_bytes(cache, buf->len, 0);
btrfs_put_block_group(cache);
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
}
out:
if (last_ref) {
/*
* Deleting the buffer, clear the corrupt flag since it doesn't
* matter anymore.
*/
clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
}
}
/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
if (btrfs_is_testing(fs_info))
return 0;
/*
* tree log blocks never actually go into the extent allocation
* tree, just update pinning info and exit early.
*/
if ((ref->type == BTRFS_REF_METADATA && ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
} else {
ret = btrfs_add_delayed_data_ref(trans, ref, 0);
}
if (!((ref->type == BTRFS_REF_METADATA &&
ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
return ret;
}
enum btrfs_loop_type {
LOOP_CACHING_NOWAIT,
LOOP_CACHING_WAIT,
LOOP_ALLOC_CHUNK,
LOOP_NO_EMPTY_SIZE,
};
static inline void
btrfs_lock_block_group(struct btrfs_block_group *cache,
int delalloc)
{
if (delalloc)
down_read(&cache->data_rwsem);
}
static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
int delalloc)
{
btrfs_get_block_group(cache);
if (delalloc)
down_read(&cache->data_rwsem);
}
static struct btrfs_block_group *btrfs_lock_cluster(
struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
int delalloc)
__acquires(&cluster->refill_lock)
{
struct btrfs_block_group *used_bg = NULL;
spin_lock(&cluster->refill_lock);
while (1) {
used_bg = cluster->block_group;
if (!used_bg)
return NULL;
if (used_bg == block_group)
return used_bg;
btrfs_get_block_group(used_bg);
if (!delalloc)
return used_bg;
if (down_read_trylock(&used_bg->data_rwsem))
return used_bg;
spin_unlock(&cluster->refill_lock);
/* We should only have one-level nested. */
down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
spin_lock(&cluster->refill_lock);
if (used_bg == cluster->block_group)
return used_bg;
up_read(&used_bg->data_rwsem);
btrfs_put_block_group(used_bg);
}
}
static inline void
btrfs_release_block_group(struct btrfs_block_group *cache,
int delalloc)
{
if (delalloc) up_read(&cache->data_rwsem); btrfs_put_block_group(cache);
}
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
BTRFS_EXTENT_ALLOC_ZONED,
};
/*
* Structure used internally for find_free_extent() function. Wraps needed
* parameters.
*/
struct find_free_extent_ctl {
/* Basic allocation info */
u64 num_bytes;
u64 empty_size;
u64 flags;
int delalloc;
/* Where to start the search inside the bg */
u64 search_start;
/* For clustered allocation */
u64 empty_cluster;
struct btrfs_free_cluster *last_ptr;
bool use_cluster;
bool have_caching_bg;
bool orig_have_caching_bg;
/* Allocation is called for tree-log */
bool for_treelog;
/* Allocation is called for data relocation */
bool for_data_reloc;
/* RAID index, converted from flags */
int index;
/*
* Current loop number, check find_free_extent_update_loop() for details
*/
int loop;
/*
* Whether we're refilling a cluster, if true we need to re-search
* current block group but don't try to refill the cluster again.
*/
bool retry_clustered;
/*
* Whether we're updating free space cache, if true we need to re-search
* current block group but don't try updating free space cache again.
*/
bool retry_unclustered;
/* If current block group is cached */
int cached;
/* Max contiguous hole found */
u64 max_extent_size;
/* Total free space from free space cache, not always contiguous */
u64 total_free_space;
/* Found result */
u64 found_offset;
/* Hint where to start looking for an empty space */
u64 hint_byte;
/* Allocation policy */
enum btrfs_extent_allocation_policy policy;
};
/*
* Helper function for find_free_extent().
*
* Return -ENOENT to inform caller that we need fallback to unclustered mode.
* Return -EAGAIN to inform caller that we need to re-search this block group
* Return >0 to inform caller that we find nothing
* Return 0 means we have found a location and set ffe_ctl->found_offset.
*/
static int find_free_extent_clustered(struct btrfs_block_group *bg,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_block_group **cluster_bg_ret)
{
struct btrfs_block_group *cluster_bg;
struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
u64 aligned_cluster;
u64 offset;
int ret;
cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
if (!cluster_bg)
goto refill_cluster;
if (cluster_bg != bg && (cluster_bg->ro || !block_group_bits(cluster_bg, ffe_ctl->flags)))
goto release_cluster;
offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
ffe_ctl->num_bytes, cluster_bg->start,
&ffe_ctl->max_extent_size);
if (offset) {
/* We have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
trace_btrfs_reserve_extent_cluster(cluster_bg,
ffe_ctl->search_start, ffe_ctl->num_bytes);
*cluster_bg_ret = cluster_bg;
ffe_ctl->found_offset = offset;
return 0;
}
WARN_ON(last_ptr->block_group != cluster_bg);
release_cluster:
/*
* If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
* lets just skip it and let the allocator find whatever block it can
* find. If we reach this point, we will have tried the cluster
* allocator plenty of times and not have found anything, so we are
* likely way too fragmented for the clustering stuff to find anything.
*
* However, if the cluster is taken from the current block group,
* release the cluster first, so that we stand a better chance of
* succeeding in the unclustered allocation.
*/
if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
spin_unlock(&last_ptr->refill_lock);
btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
return -ENOENT;
}
/* This cluster didn't work out, free it and start over */
btrfs_return_cluster_to_free_space(NULL, last_ptr);
if (cluster_bg != bg)
btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
refill_cluster:
if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
spin_unlock(&last_ptr->refill_lock);
return -ENOENT;
}
aligned_cluster = max_t(u64,
ffe_ctl->empty_cluster + ffe_ctl->empty_size,
bg->full_stripe_len);
ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
ffe_ctl->num_bytes, aligned_cluster);
if (ret == 0) {
/* Now pull our allocation out of this cluster */
offset = btrfs_alloc_from_cluster(bg, last_ptr,
ffe_ctl->num_bytes, ffe_ctl->search_start,
&ffe_ctl->max_extent_size);
if (offset) {
/* We found one, proceed */
spin_unlock(&last_ptr->refill_lock);
trace_btrfs_reserve_extent_cluster(bg,
ffe_ctl->search_start,
ffe_ctl->num_bytes);
ffe_ctl->found_offset = offset;
return 0;
}
} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && !ffe_ctl->retry_clustered) {
spin_unlock(&last_ptr->refill_lock);
ffe_ctl->retry_clustered = true;
btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
ffe_ctl->empty_cluster + ffe_ctl->empty_size);
return -EAGAIN;
}
/*
* At this point we either didn't find a cluster or we weren't able to
* allocate a block from our cluster. Free the cluster we've been
* trying to use, and go to the next block group.
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
spin_unlock(&last_ptr->refill_lock);
return 1;
}
/*
* Return >0 to inform caller that we find nothing
* Return 0 when we found an free extent and set ffe_ctrl->found_offset
* Return -EAGAIN to inform caller that we need to re-search this block group
*/
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
struct find_free_extent_ctl *ffe_ctl)
{
struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
u64 offset;
/*
* We are doing an unclustered allocation, set the fragmented flag so
* we don't bother trying to setup a cluster again until we get more
* space.
*/
if (unlikely(last_ptr)) {
spin_lock(&last_ptr->lock);
last_ptr->fragmented = 1;
spin_unlock(&last_ptr->lock);
}
if (ffe_ctl->cached) {
struct btrfs_free_space_ctl *free_space_ctl;
free_space_ctl = bg->free_space_ctl;
spin_lock(&free_space_ctl->tree_lock);
if (free_space_ctl->free_space <
ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
ffe_ctl->empty_size) {
ffe_ctl->total_free_space = max_t(u64,
ffe_ctl->total_free_space,
free_space_ctl->free_space);
spin_unlock(&free_space_ctl->tree_lock);
return 1;
}
spin_unlock(&free_space_ctl->tree_lock);
}
offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
ffe_ctl->num_bytes, ffe_ctl->empty_size,
&ffe_ctl->max_extent_size);
/*
* If we didn't find a chunk, and we haven't failed on this block group
* before, and this block group is in the middle of caching and we are
* ok with waiting, then go ahead and wait for progress to be made, and
* set @retry_unclustered to true.
*
* If @retry_unclustered is true then we've already waited on this
* block group once and should move on to the next block group.
*/
if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT) { btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
ffe_ctl->empty_size);
ffe_ctl->retry_unclustered = true;
return -EAGAIN;
} else if (!offset) {
return 1;
}
ffe_ctl->found_offset = offset;
return 0;
}
static int do_allocation_clustered(struct btrfs_block_group *block_group,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_block_group **bg_ret)
{
int ret;
/* We want to try and use the cluster allocator, so lets look there */
if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
if (ret >= 0 || ret == -EAGAIN)
return ret;
/* ret == -ENOENT case falls through */
}
return find_free_extent_unclustered(block_group, ffe_ctl);
}
/*
* Tree-log block group locking
* ============================
*
* fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
* indicates the starting address of a block group, which is reserved only
* for tree-log metadata.
*
* Lock nesting
* ============
*
* space_info::lock
* block_group::lock
* fs_info::treelog_bg_lock
*/
/*
* Simple allocator for sequential-only block group. It only allows sequential
* allocation. No need to play with trees. This function also reserves the
* bytes as in btrfs_add_reserved_bytes.
*/
static int do_allocation_zoned(struct btrfs_block_group *block_group,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_block_group **bg_ret)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_space_info *space_info = block_group->space_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 start = block_group->start;
u64 num_bytes = ffe_ctl->num_bytes;
u64 avail;
u64 bytenr = block_group->start;
u64 log_bytenr;
u64 data_reloc_bytenr;
int ret = 0;
bool skip;
ASSERT(btrfs_is_zoned(block_group->fs_info));
/*
* Do not allow non-tree-log blocks in the dedicated tree-log block
* group, and vice versa.
*/
spin_lock(&fs_info->treelog_bg_lock);
log_bytenr = fs_info->treelog_bg;
skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || (!ffe_ctl->for_treelog && bytenr == log_bytenr));
spin_unlock(&fs_info->treelog_bg_lock);
if (skip)
return 1;
/*
* Do not allow non-relocation blocks in the dedicated relocation block
* group, and vice versa.
*/
spin_lock(&fs_info->relocation_bg_lock);
data_reloc_bytenr = fs_info->data_reloc_bg;
if (data_reloc_bytenr &&
((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
skip = true;
spin_unlock(&fs_info->relocation_bg_lock);
if (skip)
return 1;
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
spin_lock(&fs_info->treelog_bg_lock);
spin_lock(&fs_info->relocation_bg_lock);
ASSERT(!ffe_ctl->for_treelog ||
block_group->start == fs_info->treelog_bg ||
fs_info->treelog_bg == 0);
ASSERT(!ffe_ctl->for_data_reloc ||
block_group->start == fs_info->data_reloc_bg ||
fs_info->data_reloc_bg == 0);
if (block_group->ro) {
ret = 1;
goto out;
}
/*
* Do not allow currently using block group to be tree-log dedicated
* block group.
*/
if (ffe_ctl->for_treelog && !fs_info->treelog_bg && (block_group->used || block_group->reserved)) {
ret = 1;
goto out;
}
/*
* Do not allow currently used block group to be the data relocation
* dedicated block group.
*/
if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && (block_group->used || block_group->reserved)) {
ret = 1;
goto out;
}
avail = block_group->length - block_group->alloc_offset;
if (avail < num_bytes) {
if (ffe_ctl->max_extent_size < avail) {
/*
* With sequential allocator, free space is always
* contiguous
*/
ffe_ctl->max_extent_size = avail;
ffe_ctl->total_free_space = avail;
}
ret = 1;
goto out;
}
if (ffe_ctl->for_treelog && !fs_info->treelog_bg) fs_info->treelog_bg = block_group->start; if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg) fs_info->data_reloc_bg = block_group->start; ffe_ctl->found_offset = start + block_group->alloc_offset;
block_group->alloc_offset += num_bytes;
spin_lock(&ctl->tree_lock);
ctl->free_space -= num_bytes;
spin_unlock(&ctl->tree_lock);
/*
* We do not check if found_offset is aligned to stripesize. The
* address is anyway rewritten when using zone append writing.
*/
ffe_ctl->search_start = ffe_ctl->found_offset;
out:
if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; if (ret && ffe_ctl->for_data_reloc) fs_info->data_reloc_bg = 0;
spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
return ret;
}
static int do_allocation(struct btrfs_block_group *block_group,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_block_group **bg_ret)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
case BTRFS_EXTENT_ALLOC_ZONED:
return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
default:
BUG();
}
}
static void release_block_group(struct btrfs_block_group *block_group,
struct find_free_extent_ctl *ffe_ctl,
int delalloc)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
ffe_ctl->retry_clustered = false;
ffe_ctl->retry_unclustered = false;
break;
case BTRFS_EXTENT_ALLOC_ZONED:
/* Nothing to do */
break;
default:
BUG();
}
BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
ffe_ctl->index);
btrfs_release_block_group(block_group, delalloc);
}
static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
struct btrfs_key *ins)
{
struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
if (!ffe_ctl->use_cluster && last_ptr) {
spin_lock(&last_ptr->lock);
last_ptr->window_start = ins->objectid;
spin_unlock(&last_ptr->lock);
}
}
static void found_extent(struct find_free_extent_ctl *ffe_ctl,
struct btrfs_key *ins)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
found_extent_clustered(ffe_ctl, ins);
break;
case BTRFS_EXTENT_ALLOC_ZONED:
/* Nothing to do */
break;
default:
BUG();
}
}
static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
/*
* If we can't allocate a new chunk we've already looped through
* at least once, move on to the NO_EMPTY_SIZE case.
*/
ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
return 0;
case BTRFS_EXTENT_ALLOC_ZONED:
/* Give up here */
return -ENOSPC;
default:
BUG();
}
}
/*
* Return >0 means caller needs to re-search for free extent
* Return 0 means we have the needed free extent.
* Return <0 means we failed to locate any free extent.
*/
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
struct btrfs_key *ins,
struct find_free_extent_ctl *ffe_ctl,
bool full_search)
{
struct btrfs_root *root = fs_info->extent_root;
int ret;
if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) ffe_ctl->orig_have_caching_bg = true; if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
return 1;
if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
return 1;
if (ins->objectid) {
found_extent(ffe_ctl, ins);
return 0;
}
/*
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
* caching kthreads as we move along
* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
* LOOP_ALLOC_CHUNK, force a chunk allocation and try again
* LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
* again
*/
if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0;
if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
/*
* We want to skip the LOOP_CACHING_WAIT step if we
* don't have any uncached bgs and we've already done a
* full search through.
*/
if (ffe_ctl->orig_have_caching_bg || !full_search) ffe_ctl->loop = LOOP_CACHING_WAIT;
else
ffe_ctl->loop = LOOP_ALLOC_CHUNK;
} else {
ffe_ctl->loop++;
}
if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
struct btrfs_trans_handle *trans;
int exist = 0;
trans = current->journal_info;
if (trans)
exist = 1;
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
return ret;
}
ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
CHUNK_ALLOC_FORCE);
/* Do not bail out on ENOSPC since we can do more. */
if (ret == -ENOSPC)
ret = chunk_allocation_failed(ffe_ctl); else if (ret < 0) btrfs_abort_transaction(trans, ret);
else
ret = 0;
if (!exist) btrfs_end_transaction(trans);
if (ret)
return ret;
}
if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
return -ENOSPC;
/*
* Don't loop again if we already have no empty_size and
* no empty_cluster.
*/
if (ffe_ctl->empty_size == 0 &&
ffe_ctl->empty_cluster == 0)
return -ENOSPC;
ffe_ctl->empty_size = 0;
ffe_ctl->empty_cluster = 0;
}
return 1;
}
return -ENOSPC;
}
static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
struct btrfs_key *ins)
{
/*
* If our free space is heavily fragmented we may not be able to make
* big contiguous allocations, so instead of doing the expensive search
* for free space, simply return ENOSPC with our max_extent_size so we
* can go ahead and search for a more manageable chunk.
*
* If our max_extent_size is large enough for our allocation simply
* disable clustering since we will likely not be able to find enough
* space to create a cluster and induce latency trying.
*/
if (space_info->max_extent_size) {
spin_lock(&space_info->lock);
if (space_info->max_extent_size &&
ffe_ctl->num_bytes > space_info->max_extent_size) { ins->offset = space_info->max_extent_size;
spin_unlock(&space_info->lock);
return -ENOSPC;
} else if (space_info->max_extent_size) {
ffe_ctl->use_cluster = false;
}
spin_unlock(&space_info->lock);
}
ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
&ffe_ctl->empty_cluster);
if (ffe_ctl->last_ptr) {
struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
ffe_ctl->hint_byte = last_ptr->window_start; if (last_ptr->fragmented) {
/*
* We still set window_start so we can keep track of the
* last place we found an allocation to try and save
* some time.
*/
ffe_ctl->hint_byte = last_ptr->window_start;
ffe_ctl->use_cluster = false;
}
spin_unlock(&last_ptr->lock);
}
return 0;
}
static int prepare_allocation(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
struct btrfs_key *ins)
{
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
case BTRFS_EXTENT_ALLOC_ZONED:
if (ffe_ctl->for_treelog) {
spin_lock(&fs_info->treelog_bg_lock);
if (fs_info->treelog_bg)
ffe_ctl->hint_byte = fs_info->treelog_bg;
spin_unlock(&fs_info->treelog_bg_lock);
}
if (ffe_ctl->for_data_reloc) {
spin_lock(&fs_info->relocation_bg_lock);
if (fs_info->data_reloc_bg)
ffe_ctl->hint_byte = fs_info->data_reloc_bg;
spin_unlock(&fs_info->relocation_bg_lock);
}
return 0;
default:
BUG();
}
}
/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
* ins->objectid == start position
* ins->flags = BTRFS_EXTENT_ITEM_KEY
* ins->offset == the size of the hole.
* Any available blocks before search_start are skipped.
*
* If there is no suitable free space, we will record the max size of
* the free space extent currently.
*
* The overall logic and call chain:
*
* find_free_extent()
* |- Iterate through all block groups
* | |- Get a valid block group
* | |- Try to do clustered allocation in that block group
* | |- Try to do unclustered allocation in that block group
* | |- Check if the result is valid
* | | |- If valid, then exit
* | |- Jump to next block group
* |
* |- Push harder to find free extents
* |- If not found, re-iterate all block groups
*/
static noinline int find_free_extent(struct btrfs_root *root,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
u64 hint_byte_orig, struct btrfs_key *ins,
u64 flags, int delalloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
struct btrfs_block_group *block_group = NULL;
struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
bool full_search = false;
bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
bool for_data_reloc = (btrfs_is_data_reloc_root(root) &&
flags & BTRFS_BLOCK_GROUP_DATA); WARN_ON(num_bytes < fs_info->sectorsize); ffe_ctl.num_bytes = num_bytes;
ffe_ctl.empty_size = empty_size;
ffe_ctl.flags = flags;
ffe_ctl.search_start = 0;
ffe_ctl.delalloc = delalloc;
ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
ffe_ctl.have_caching_bg = false;
ffe_ctl.orig_have_caching_bg = false;
ffe_ctl.found_offset = 0;
ffe_ctl.hint_byte = hint_byte_orig;
ffe_ctl.for_treelog = for_treelog;
ffe_ctl.for_data_reloc = for_data_reloc;
ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
/* For clustered allocation */
ffe_ctl.retry_clustered = false;
ffe_ctl.retry_unclustered = false;
ffe_ctl.last_ptr = NULL;
ffe_ctl.use_cluster = true;
if (btrfs_is_zoned(fs_info))
ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED; ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
trace_find_free_extent(root, num_bytes, empty_size, flags);
space_info = btrfs_find_space_info(fs_info, flags);
if (!space_info) {
btrfs_err(fs_info, "No space info for %llu", flags);
return -ENOSPC;
}
ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
if (ret < 0)
return ret;
ffe_ctl.search_start = max(ffe_ctl.search_start,
first_logical_byte(fs_info, 0));
ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte); if (ffe_ctl.search_start == ffe_ctl.hint_byte) { block_group = btrfs_lookup_block_group(fs_info,
ffe_ctl.search_start);
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
*
* However if we are re-searching with an ideal block group
* picked out then we don't care that the block group is cached.
*/
if (block_group && block_group_bits(block_group, flags) && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
block_group->ro) {
/*
* someone is removing this block group,
* we can't jump into the have_block_group
* target because our list pointers are not
* valid
*/
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
ffe_ctl.index = btrfs_bg_flags_to_raid_index(
block_group->flags);
btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
}
} else if (block_group) {
btrfs_put_block_group(block_group);
}
}
search:
ffe_ctl.have_caching_bg = false; if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
ffe_ctl.index == 0)
full_search = true;
down_read(&space_info->groups_sem); list_for_each_entry(block_group,
&space_info->block_groups[ffe_ctl.index], list) {
struct btrfs_block_group *bg_ret;
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro)) { if (for_treelog) btrfs_clear_treelog_bg(block_group);
if (ffe_ctl.for_data_reloc)
btrfs_clear_data_reloc_bg(block_group);
continue;
}
btrfs_grab_block_group(block_group, delalloc);
ffe_ctl.search_start = block_group->start;
/*
* this can happen if we end up cycling through all the
* raid types, but we want to make sure we only allocate
* for the proper type.
*/
if (!block_group_bits(block_group, flags)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_RAID56_MASK |
BTRFS_BLOCK_GROUP_RAID10;
/*
* if they asked for extra copies and this block group
* doesn't provide them, bail. This does allow us to
* fill raid0 from raid1.
*/
if ((flags & extra) && !(block_group->flags & extra))
goto loop;
/*
* This block group has different flags than we want.
* It's possible that we have MIXED_GROUP flag but no
* block group is mixed. Just skip such block group.
*/
btrfs_release_block_group(block_group, delalloc);
continue;
}
have_block_group:
ffe_ctl.cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl.cached)) {
ffe_ctl.have_caching_bg = true;
ret = btrfs_cache_block_group(block_group, 0);
/*
* If we get ENOMEM here or something else we want to
* try other block groups, because it may not be fatal.
* However if we can't find anything else we need to
* save our return here so that we return the actual
* error that caused problems, not ENOSPC.
*/
if (ret < 0) { if (!cache_block_group_error)
cache_block_group_error = ret;
ret = 0;
goto loop;
}
ret = 0;
}
if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
goto loop;
bg_ret = NULL;
ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
if (ret == 0) { if (bg_ret && bg_ret != block_group) {
btrfs_release_block_group(block_group, delalloc);
block_group = bg_ret;
}
} else if (ret == -EAGAIN) {
goto have_block_group;
} else if (ret > 0) {
goto loop;
}
/* Checks */
ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
fs_info->stripesize);
/* move on to the next group */
if (ffe_ctl.search_start + num_bytes >
block_group->start + block_group->length) {
btrfs_add_free_space_unused(block_group,
ffe_ctl.found_offset, num_bytes);
goto loop;
}
if (ffe_ctl.found_offset < ffe_ctl.search_start) btrfs_add_free_space_unused(block_group,
ffe_ctl.found_offset,
ffe_ctl.search_start - ffe_ctl.found_offset);
ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
num_bytes, delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space_unused(block_group,
ffe_ctl.found_offset, num_bytes);
goto loop;
}
btrfs_inc_block_group_reservations(block_group);
/* we are all good, lets return */
ins->objectid = ffe_ctl.search_start;
ins->offset = num_bytes;
trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
num_bytes);
btrfs_release_block_group(block_group, delalloc);
break;
loop:
release_block_group(block_group, &ffe_ctl, delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
if (ret > 0)
goto search;
if (ret == -ENOSPC && !cache_block_group_error) {
/*
* Use ffe_ctl->total_free_space as fallback if we can't find
* any contiguous hole.
*/
if (!ffe_ctl.max_extent_size) ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
spin_lock(&space_info->lock);
space_info->max_extent_size = ffe_ctl.max_extent_size;
spin_unlock(&space_info->lock);
ins->offset = ffe_ctl.max_extent_size;
} else if (ret == -ENOSPC) {
ret = cache_block_group_error;
}
return ret;
}
/*
* btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
* hole that is at least as big as @num_bytes.
*
* @root - The root that will contain this extent
*
* @ram_bytes - The amount of space in ram that @num_bytes take. This
* is used for accounting purposes. This value differs
* from @num_bytes only in the case of compressed extents.
*
* @num_bytes - Number of bytes to allocate on-disk.
*
* @min_alloc_size - Indicates the minimum amount of space that the
* allocator should try to satisfy. In some cases
* @num_bytes may be larger than what is required and if
* the filesystem is fragmented then allocation fails.
* However, the presence of @min_alloc_size gives a
* chance to try and satisfy the smaller allocation.
*
* @empty_size - A hint that you plan on doing more COW. This is the
* size in bytes the allocator should try to find free
* next to the block it returns. This is just a hint and
* may be ignored by the allocator.
*
* @hint_byte - Hint to the allocator to start searching above the byte
* address passed. It might be ignored.
*
* @ins - This key is modified to record the found hole. It will
* have the following values:
* ins->objectid == start position
* ins->flags = BTRFS_EXTENT_ITEM_KEY
* ins->offset == the size of the hole.
*
* @is_data - Boolean flag indicating whether an extent is
* allocated for data (true) or metadata (false)
*
* @delalloc - Boolean flag indicating whether this allocation is for
* delalloc or not. If 'true' data_rwsem of block groups
* is going to be acquired.
*
*
* Returns 0 when an allocation succeeded or < 0 when an error occurred. In
* case -ENOSPC is returned then @ins->offset will contain the size of the
* largest available hole the allocator managed to find.
*/
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize); ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
hint_byte, ins, flags, delalloc);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid); } else if (ret == -ENOSPC) { if (!final_tried && ins->offset) { num_bytes = min(num_bytes >> 1, ins->offset);
num_bytes = round_down(num_bytes,
fs_info->sectorsize);
num_bytes = max(num_bytes, min_alloc_size);
ram_bytes = num_bytes;
if (num_bytes == min_alloc_size)
final_tried = true;
goto again;
} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
struct btrfs_space_info *sinfo;
sinfo = btrfs_find_space_info(fs_info, flags);
btrfs_err(fs_info,
"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
flags, num_bytes, for_treelog, for_data_reloc);
if (sinfo)
btrfs_dump_space_info(fs_info, sinfo,
num_bytes, 1);
}
}
return ret;
}
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc)
{
struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
if (!cache) {
btrfs_err(fs_info, "Unable to find block group for %llu",
start);
return -ENOSPC;
}
btrfs_add_free_space(cache, start, len);
btrfs_free_reserved_bytes(cache, len, delalloc);
trace_btrfs_reserved_extent_free(fs_info, start, len);
btrfs_put_block_group(cache); return 0;
}
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
u64 len)
{
struct btrfs_block_group *cache;
int ret = 0;
cache = btrfs_lookup_block_group(trans->fs_info, start);
if (!cache) {
btrfs_err(trans->fs_info, "unable to find block group for %llu",
start);
return -ENOSPC;
}
ret = pin_down_extent(trans, cache, start, len, 1);
btrfs_put_block_group(cache);
return ret;
}
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
struct btrfs_key *ins, int ref_mod)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_extent_item *extent_item;
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
int type;
u32 size;
if (parent > 0)
type = BTRFS_SHARED_DATA_REF_KEY;
else
type = BTRFS_EXTENT_DATA_REF_KEY;
size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
if (ret) {
btrfs_free_path(path);
return ret;
}
leaf = path->nodes[0];
extent_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_item);
btrfs_set_extent_refs(leaf, extent_item, ref_mod);
btrfs_set_extent_generation(leaf, extent_item, trans->transid);
btrfs_set_extent_flags(leaf, extent_item,
flags | BTRFS_EXTENT_FLAG_DATA);
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
btrfs_set_extent_inline_ref_type(leaf, iref, type);
if (parent > 0) {
struct btrfs_shared_data_ref *ref;
ref = (struct btrfs_shared_data_ref *)(iref + 1);
btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
} else {
struct btrfs_extent_data_ref *ref;
ref = (struct btrfs_extent_data_ref *)(&iref->offset);
btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
btrfs_set_extent_data_ref_offset(leaf, ref, offset);
btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
}
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
if (ret)
return ret;
ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
BUG();
}
trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
return ret;
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_extent_item *extent_item;
struct btrfs_key extent_key;
struct btrfs_tree_block_info *block_info;
struct btrfs_extent_inline_ref *iref;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_delayed_tree_ref *ref;
u32 size = sizeof(*extent_item) + sizeof(*iref);
u64 num_bytes;
u64 flags = extent_op->flags_to_set;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
ref = btrfs_delayed_node_to_tree_ref(node);
extent_key.objectid = node->bytenr;
if (skinny_metadata) {
extent_key.offset = ref->level;
extent_key.type = BTRFS_METADATA_ITEM_KEY;
num_bytes = fs_info->nodesize;
} else {
extent_key.offset = node->num_bytes;
extent_key.type = BTRFS_EXTENT_ITEM_KEY;
size += sizeof(*block_info);
num_bytes = node->num_bytes;
}
path = btrfs_alloc_path();
if (!path)
return -ENOMEM; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
&extent_key, size);
if (ret) {
btrfs_free_path(path); return ret;
}
leaf = path->nodes[0];
extent_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_item);
btrfs_set_extent_refs(leaf, extent_item, 1);
btrfs_set_extent_generation(leaf, extent_item, trans->transid);
btrfs_set_extent_flags(leaf, extent_item,
flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
if (skinny_metadata) {
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
btrfs_set_tree_block_level(leaf, block_info, ref->level);
iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
}
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_SHARED_BLOCK_REF_KEY);
btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
} else {
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_TREE_BLOCK_REF_KEY);
btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
}
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
ret = remove_from_free_space_tree(trans, extent_key.objectid,
num_bytes);
if (ret)
return ret;
ret = btrfs_update_block_group(trans, extent_key.objectid,
fs_info->nodesize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
extent_key.objectid, extent_key.offset);
BUG();
}
trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
fs_info->nodesize);
return ret;
}
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 owner,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins)
{
struct btrfs_ref generic_ref = { 0 }; BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins->objectid, ins->offset, 0);
btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}
/*
* this is used by the tree logging recovery code. It records that
* an extent has been allocated and makes sure to clear the free
* space cache bits as well
*/
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner, u64 offset,
struct btrfs_key *ins)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
/*
* Mixed block groups will exclude before processing the log so we only
* need to do the exclude dance if this fs isn't mixed.
*/
if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
ret = __exclude_logged_extent(fs_info, ins->objectid,
ins->offset);
if (ret)
return ret;
}
block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
if (!block_group)
return -EINVAL;
space_info = block_group->space_info;
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
space_info->bytes_reserved += ins->offset;
block_group->reserved += ins->offset;
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
offset, ins, 1);
if (ret)
btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
btrfs_put_block_group(block_group);
return ret;
}
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, int level, u64 owner,
enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
if (IS_ERR(buf))
return buf;
/*
* Extra safety check in case the extent tree is corrupted and extent
* allocator chooses to use a tree block which is already used and
* locked.
*/
if (buf->lock_owner == current->pid) { btrfs_err_rl(fs_info,
"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
buf->start, btrfs_header_owner(buf), current->pid);
free_extent_buffer(buf);
return ERR_PTR(-EUCLEAN);
}
/*
* This needs to stay, because we could allocate a freed block from an
* old tree into a new tree, so we need to make sure this new block is
* set to the appropriate level and owner.
*/
btrfs_set_buffer_lockdep_class(owner, buf, level);
__btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
set_extent_buffer_uptodate(buf);
memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
btrfs_set_header_level(buf, level);
btrfs_set_header_bytenr(buf, buf->start);
btrfs_set_header_generation(buf, trans->transid);
btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(buf, owner);
write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
buf->log_index = root->log_transid % 2;
/*
* we allow two log transactions at a time, use different
* EXTENT bit to differentiate dirty pages.
*/
if (buf->log_index == 0)
set_extent_dirty(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
else
set_extent_new(&root->dirty_log_pages, buf->start,
buf->start + buf->len - 1);
} else {
buf->log_index = -1;
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
}
/* this returns a buffer locked for blocking */
return buf;
}
/*
* finds a free extent and does all the dirty work required for allocation
* returns the tree buffer or an ERR_PTR on error.
*/
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
u64 empty_size,
enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
struct btrfs_delayed_extent_op *extent_op;
struct btrfs_ref generic_ref = { 0 };
u64 flags = 0;
int ret;
u32 blocksize = fs_info->nodesize;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (btrfs_is_testing(fs_info)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
level, root_objectid, nest);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
}
#endif
block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
empty_size, hint, &ins, 0, 0);
if (ret)
goto out_unuse;
buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
root_objectid, nest);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_free_reserved;
}
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) parent = ins.objectid;
flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
} else
BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
extent_op = btrfs_alloc_delayed_extent_op();
if (!extent_op) {
ret = -ENOMEM;
goto out_free_buf;
}
if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key));
else
memset(&extent_op->key, 0, sizeof(extent_op->key)); extent_op->flags_to_set = flags;
extent_op->update_key = skinny_metadata ? false : true;
extent_op->update_flags = true;
extent_op->is_data = false;
extent_op->level = level;
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins.objectid, ins.offset, parent);
generic_ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&generic_ref, level, root_objectid);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
goto out_free_delayed;
}
return buf;
out_free_delayed:
btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
btrfs_tree_unlock(buf);
free_extent_buffer(buf);
out_free_reserved:
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
out_unuse:
btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
}
struct walk_control {
u64 refs[BTRFS_MAX_LEVEL];
u64 flags[BTRFS_MAX_LEVEL];
struct btrfs_key update_progress;
struct btrfs_key drop_progress;
int drop_level;
int stage;
int level;
int shared_level;
int update_ref;
int keep_locks;
int reada_slot;
int reada_count;
int restarted;
};
#define DROP_REFERENCE 1
#define UPDATE_BACKREF 2
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct walk_control *wc,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 generation;
u64 refs;
u64 flags;
u32 nritems;
struct btrfs_key key;
struct extent_buffer *eb;
int ret;
int slot;
int nread = 0;
if (path->slots[wc->level] < wc->reada_slot) {
wc->reada_count = wc->reada_count * 2 / 3;
wc->reada_count = max(wc->reada_count, 2);
} else {
wc->reada_count = wc->reada_count * 3 / 2;
wc->reada_count = min_t(int, wc->reada_count,
BTRFS_NODEPTRS_PER_BLOCK(fs_info));
}
eb = path->nodes[wc->level];
nritems = btrfs_header_nritems(eb);
for (slot = path->slots[wc->level]; slot < nritems; slot++) {
if (nread >= wc->reada_count)
break;
cond_resched();
bytenr = btrfs_node_blockptr(eb, slot);
generation = btrfs_node_ptr_generation(eb, slot);
if (slot == path->slots[wc->level])
goto reada;
if (wc->stage == UPDATE_BACKREF &&
generation <= root->root_key.offset)
continue;
/* We don't lock the tree block, it's OK to be racy here */
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
wc->level - 1, 1, &refs,
&flags);
/* We don't care about errors in readahead. */
if (ret < 0)
continue;
BUG_ON(refs == 0);
if (wc->stage == DROP_REFERENCE) {
if (refs == 1)
goto reada;
if (wc->level == 1 &&
(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
continue;
if (!wc->update_ref ||
generation <= root->root_key.offset)
continue;
btrfs_node_key_to_cpu(eb, &key, slot);
ret = btrfs_comp_cpu_keys(&key,
&wc->update_progress);
if (ret < 0)
continue;
} else {
if (wc->level == 1 &&
(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
continue;
}
reada:
btrfs_readahead_node_child(eb, slot);
nread++;
}
wc->reada_slot = slot;
}
/*
* helper to process tree block while walking down the tree.
*
* when wc->stage == UPDATE_BACKREF, this function updates
* back refs for pointers in the block.
*
* NOTE: return value 1 means we should stop walking down.
*/
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct walk_control *wc, int lookup_info)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
int ret;
if (wc->stage == UPDATE_BACKREF &&
btrfs_header_owner(eb) != root->root_key.objectid)
return 1;
/*
* when reference count of tree block is 1, it won't increase
* again. once full backref flag is set, we never clear it.
*/
if (lookup_info &&
((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
(wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
BUG_ON(!path->locks[level]);
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
&wc->flags[level]);
BUG_ON(ret == -ENOMEM);
if (ret)
return ret;
BUG_ON(wc->refs[level] == 0);
}
if (wc->stage == DROP_REFERENCE) {
if (wc->refs[level] > 1)
return 1;
if (path->locks[level] && !wc->keep_locks) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
}
return 0;
}
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
BUG_ON(!path->locks[level]);
ret = btrfs_inc_ref(trans, root, eb, 1);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, eb, flag,
btrfs_header_level(eb), 0);
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
/*
* the block is shared by multiple trees, so it's not good to
* keep the tree lock
*/
if (path->locks[level] && level > 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
}
return 0;
}
/*
* This is used to verify a ref exists for this root to deal with a bug where we
* would have a drop_progress key that hadn't been updated properly.
*/
static int check_ref_exists(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 parent,
int level)
{
struct btrfs_path *path;
struct btrfs_extent_inline_ref *iref;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = lookup_extent_backref(trans, path, &iref, bytenr,
root->fs_info->nodesize, parent,
root->root_key.objectid, level, 0);
btrfs_free_path(path);
if (ret == -ENOENT)
return 0;
if (ret < 0)
return ret;
return 1;
}
/*
* helper to process tree block pointer.
*
* when wc->stage == DROP_REFERENCE, this function checks
* reference count of the block pointed to. if the block
* is shared and we need update back refs for the subtree
* rooted at the block, this function changes wc->stage to
* UPDATE_BACKREF. if the block is shared and there is no
* need to update back, this function drops the reference
* to the block.
*
* NOTE: return value 1 means we should stop walking down.
*/
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct walk_control *wc, int *lookup_info)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 generation;
u64 parent;
struct btrfs_key key;
struct btrfs_key first_key;
struct btrfs_ref ref = { 0 };
struct extent_buffer *next;
int level = wc->level;
int reada = 0;
int ret = 0;
bool need_account = false;
generation = btrfs_node_ptr_generation(path->nodes[level],
path->slots[level]);
/*
* if the lower level block was created before the snapshot
* was created, we know there is no need to update back refs
* for the subtree
*/
if (wc->stage == UPDATE_BACKREF &&
generation <= root->root_key.offset) {
*lookup_info = 1;
return 1;
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
btrfs_node_key_to_cpu(path->nodes[level], &first_key,
path->slots[level]);
next = find_extent_buffer(fs_info, bytenr);
if (!next) {
next = btrfs_find_create_tree_block(fs_info, bytenr,
root->root_key.objectid, level - 1);
if (IS_ERR(next))
return PTR_ERR(next);
reada = 1;
}
btrfs_tree_lock(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
&wc->flags[level - 1]);
if (ret < 0)
goto out_unlock;
if (unlikely(wc->refs[level - 1] == 0)) {
btrfs_err(fs_info, "Missing references.");
ret = -EIO;
goto out_unlock;
}
*lookup_info = 0;
if (wc->stage == DROP_REFERENCE) {
if (wc->refs[level - 1] > 1) {
need_account = true;
if (level == 1 &&
(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
goto skip;
if (!wc->update_ref ||
generation <= root->root_key.offset)
goto skip;
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
if (ret < 0)
goto skip;
wc->stage = UPDATE_BACKREF;
wc->shared_level = level - 1;
}
} else {
if (level == 1 &&
(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
goto skip;
}
if (!btrfs_buffer_uptodate(next, generation, 0)) {
btrfs_tree_unlock(next);
free_extent_buffer(next);
next = NULL;
*lookup_info = 1;
}
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
generation, level - 1, &first_key);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
free_extent_buffer(next);
return -EIO;
}
btrfs_tree_lock(next);
}
level--;
ASSERT(level == btrfs_header_level(next));
if (level != btrfs_header_level(next)) {
btrfs_err(root->fs_info, "mismatched level");
ret = -EIO;
goto out_unlock;
}
path->nodes[level] = next;
path->slots[level] = 0;
path->locks[level] = BTRFS_WRITE_LOCK;
wc->level = level;
if (wc->level == 1)
wc->reada_slot = 0;
return 0;
skip:
wc->refs[level - 1] = 0;
wc->flags[level - 1] = 0;
if (wc->stage == DROP_REFERENCE) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
parent = path->nodes[level]->start;
} else {
ASSERT(root->root_key.objectid ==
btrfs_header_owner(path->nodes[level]));
if (root->root_key.objectid !=
btrfs_header_owner(path->nodes[level])) {
btrfs_err(root->fs_info,
"mismatched block owner");
ret = -EIO;
goto out_unlock;
}
parent = 0;
}
/*
* If we had a drop_progress we need to verify the refs are set
* as expected. If we find our ref then we know that from here
* on out everything should be correct, and we can clear the
* ->restarted flag.
*/
if (wc->restarted) {
ret = check_ref_exists(trans, root, bytenr, parent,
level - 1);
if (ret < 0)
goto out_unlock;
if (ret == 0)
goto no_delete;
ret = 0;
wc->restarted = 0;
}
/*
* Reloc tree doesn't contribute to qgroup numbers, and we have
* already accounted them at merge time (replace_path),
* thus we could skip expensive subtree trace here.
*/
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
need_account) {
ret = btrfs_qgroup_trace_subtree(trans, next,
generation, level - 1);
if (ret) {
btrfs_err_rl(fs_info,
"Error %d accounting shared subtree. Quota is out of sync, rescan required.",
ret);
}
}
/*
* We need to update the next key in our walk control so we can
* update the drop_progress key accordingly. We don't care if
* find_next_key doesn't find a key because that means we're at
* the end and are going to clean up now.
*/
wc->drop_level = level;
find_next_key(path, level, &wc->drop_progress);
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
fs_info->nodesize, parent);
btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
ret = btrfs_free_extent(trans, &ref);
if (ret)
goto out_unlock;
}
no_delete:
*lookup_info = 1;
ret = 1;
out_unlock:
btrfs_tree_unlock(next);
free_extent_buffer(next);
return ret;
}
/*
* helper to process tree block while walking up the tree.
*
* when wc->stage == DROP_REFERENCE, this function drops
* reference count on the block.
*
* when wc->stage == UPDATE_BACKREF, this function changes
* wc->stage back to DROP_REFERENCE if we changed wc->stage
* to UPDATE_BACKREF previously while processing the block.
*
* NOTE: return value 1 means we should stop walking up.
*/
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
u64 parent = 0;
if (wc->stage == UPDATE_BACKREF) {
BUG_ON(wc->shared_level < level);
if (level < wc->shared_level)
goto out;
ret = find_next_key(path, level + 1, &wc->update_progress);
if (ret > 0)
wc->update_ref = 0;
wc->stage = DROP_REFERENCE;
wc->shared_level = -1;
path->slots[level] = 0;
/*
* check reference count again if the block isn't locked.
* we should start walking down the tree again if reference
* count is one.
*/
if (!path->locks[level]) {
BUG_ON(level == 0);
btrfs_tree_lock(eb);
path->locks[level] = BTRFS_WRITE_LOCK;
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
&wc->flags[level]);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
return ret;
}
BUG_ON(wc->refs[level] == 0);
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
return 1;
}
}
}
/* wc->stage == DROP_REFERENCE */
BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
if (wc->refs[level] == 1) {
if (level == 0) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
ret = btrfs_dec_ref(trans, root, eb, 1);
else
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
if (is_fstree(root->root_key.objectid)) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
if (ret) {
btrfs_err_rl(fs_info,
"error %d accounting leaf items, quota is out of sync, rescan required",
ret);
}
}
}
/* make block locked assertion in btrfs_clean_tree_block happy */
if (!path->locks[level] &&
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
path->locks[level] = BTRFS_WRITE_LOCK;
}
btrfs_clean_tree_block(eb);
}
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
else if (root->root_key.objectid != btrfs_header_owner(eb))
goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
else if (root->root_key.objectid !=
btrfs_header_owner(path->nodes[level + 1]))
goto owner_mismatch;
}
btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
return 0;
owner_mismatch:
btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
btrfs_header_owner(eb), root->root_key.objectid);
return -EUCLEAN;
}
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct walk_control *wc)
{
int level = wc->level;
int lookup_info = 1;
int ret;
while (level >= 0) {
ret = walk_down_proc(trans, root, path, wc, lookup_info);
if (ret > 0)
break;
if (level == 0)
break;
if (path->slots[level] >=
btrfs_header_nritems(path->nodes[level]))
break;
ret = do_walk_down(trans, root, path, wc, &lookup_info);
if (ret > 0) {
path->slots[level]++;
continue;
} else if (ret < 0)
return ret;
level = wc->level;
}
return 0;
}
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct walk_control *wc, int max_level)
{
int level = wc->level;
int ret;
path->slots[level] = btrfs_header_nritems(path->nodes[level]);
while (level < max_level && path->nodes[level]) {
wc->level = level;
if (path->slots[level] + 1 <
btrfs_header_nritems(path->nodes[level])) {
path->slots[level]++;
return 0;
} else {
ret = walk_up_proc(trans, root, path, wc);
if (ret > 0)
return 0;
if (ret < 0)
return ret;
if (path->locks[level]) {
btrfs_tree_unlock_rw(path->nodes[level],
path->locks[level]);
path->locks[level] = 0;
}
free_extent_buffer(path->nodes[level]);
path->nodes[level] = NULL;
level++;
}
}
return 1;
}
/*
* drop a subvolume tree.
*
* this function traverses the tree freeing any blocks that only
* referenced by the tree.
*
* when a shared tree block is found. this function decreases its
* reference count by one. if update_ref is true, this function
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
*
* If called with for_reloc == 0, may exit early with -EAGAIN
*/
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root_item *root_item = &root->root_item;
struct walk_control *wc;
struct btrfs_key key;
int err = 0;
int ret;
int level;
bool root_dropped = false;
bool unfinished_drop = false;
btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
goto out;
}
wc = kzalloc(sizeof(*wc), GFP_NOFS);
if (!wc) {
btrfs_free_path(path);
err = -ENOMEM;
goto out;
}
/*
* Use join to avoid potential EINTR from transaction start. See
* wait_reserve_ticket and the whole reservation callchain.
*/
if (for_reloc)
trans = btrfs_join_transaction(tree_root);
else
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
}
err = btrfs_run_delayed_items(trans);
if (err)
goto out_end_trans;
/*
* This will help us catch people modifying the fs tree while we're
* dropping it. It is unsafe to mess with the fs tree while it's being
* dropped as we unlock the root node and parent nodes as we walk down
* the tree, assuming nothing will change. If something does change
* then we'll have stale information and drop references to blocks we've
* already dropped.
*/
set_bit(BTRFS_ROOT_DELETING, &root->state);
unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
path->slots[level] = 0;
path->locks[level] = BTRFS_WRITE_LOCK;
memset(&wc->update_progress, 0,
sizeof(wc->update_progress));
} else {
btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
memcpy(&wc->update_progress, &key,
sizeof(wc->update_progress));
level = btrfs_root_drop_level(root_item);
BUG_ON(level == 0);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
path->lowest_level = 0;
if (ret < 0) {
err = ret;
goto out_end_trans;
}
WARN_ON(ret > 0);
/*
* unlock our path, this is safe because only this
* function is allowed to delete this snapshot
*/
btrfs_unlock_up_safe(path, 0);
level = btrfs_header_level(root->node);
while (1) {
btrfs_tree_lock(path->nodes[level]);
path->locks[level] = BTRFS_WRITE_LOCK;
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
level, 1, &wc->refs[level],
&wc->flags[level]);
if (ret < 0) {
err = ret;
goto out_end_trans;
}
BUG_ON(wc->refs[level] == 0);
if (level == btrfs_root_drop_level(root_item))
break;
btrfs_tree_unlock(path->nodes[level]);
path->locks[level] = 0;
WARN_ON(wc->refs[level] != 1);
level--;
}
}
wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
wc->level = level;
wc->shared_level = -1;
wc->stage = DROP_REFERENCE;
wc->update_ref = update_ref;
wc->keep_locks = 0;
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
while (1) {
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0) {
err = ret;
break;
}
ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
if (ret < 0) {
err = ret;
break;
}
if (ret > 0) {
BUG_ON(wc->stage != DROP_REFERENCE);
break;
}
if (wc->stage == DROP_REFERENCE) {
wc->drop_level = wc->level;
btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
&wc->drop_progress,
path->slots[wc->drop_level]);
}
btrfs_cpu_key_to_disk(&root_item->drop_progress,
&wc->drop_progress);
btrfs_set_root_drop_level(root_item, wc->drop_level);
BUG_ON(wc->level == 0);
if (btrfs_should_end_transaction(trans) ||
(!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
if (ret) {
btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
}
btrfs_end_transaction_throttle(trans);
if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
btrfs_debug(fs_info,
"drop snapshot early exit");
err = -EAGAIN;
goto out_free;
}
/*
* Use join to avoid potential EINTR from transaction
* start. See wait_reserve_ticket and the whole
* reservation callchain.
*/
if (for_reloc)
trans = btrfs_join_transaction(tree_root);
else
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
}
}
}
btrfs_release_path(path);
if (err)
goto out_end_trans;
ret = btrfs_del_root(trans, &root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
}
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
} else if (ret > 0) {
/* if we fail to delete the orphan item this time
* around, it'll get picked up the next time.
*
* The most common failure here is just -ENOENT.
*/
btrfs_del_orphan_item(trans, tree_root,
root->root_key.objectid);
}
}
/*
* This subvolume is going to be completely dropped, and won't be
* recorded as dirty roots, thus pertrans meta rsv will not be freed at
* commit transaction time. So free it here manually.
*/
btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
btrfs_qgroup_free_meta_all_pertrans(root);
if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
btrfs_add_dropped_root(trans, root);
else
btrfs_put_root(root);
root_dropped = true;
out_end_trans:
btrfs_end_transaction_throttle(trans);
out_free:
kfree(wc);
btrfs_free_path(path);
out:
/*
* We were an unfinished drop root, check to see if there are any
* pending, and if not clear and wake up any waiters.
*/
if (!err && unfinished_drop)
btrfs_maybe_wake_unfinished_drop(fs_info);
/*
* So if we need to stop dropping the snapshot for whatever reason we
* need to make sure to add it back to the dead root list so that we
* keep trying to do the work later. This also cleans up roots if we
* don't have it in the radix (like when we recover after a power fail
* or unmount) so we don't leak memory.
*/
if (!for_reloc && !root_dropped)
btrfs_add_dead_root(root);
return err;
}
/*
* drop subtree rooted at tree block 'node'.
*
* NOTE: this function will unlock and release tree block 'node'
* only used by relocation code
*/
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
struct extent_buffer *parent)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct walk_control *wc;
int level;
int parent_level;
int ret = 0;
int wret;
BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
wc = kzalloc(sizeof(*wc), GFP_NOFS);
if (!wc) {
btrfs_free_path(path);
return -ENOMEM;
}
btrfs_assert_tree_locked(parent);
parent_level = btrfs_header_level(parent);
atomic_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
btrfs_assert_tree_locked(node);
level = btrfs_header_level(node);
path->nodes[level] = node;
path->slots[level] = 0;
path->locks[level] = BTRFS_WRITE_LOCK;
wc->refs[parent_level] = 1;
wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
wc->level = level;
wc->shared_level = -1;
wc->stage = DROP_REFERENCE;
wc->update_ref = 0;
wc->keep_locks = 1;
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
while (1) {
wret = walk_down_tree(trans, root, path, wc);
if (wret < 0) {
ret = wret;
break;
}
wret = walk_up_tree(trans, root, path, wc, parent_level);
if (wret < 0)
ret = wret;
if (wret != 0)
break;
}
kfree(wc);
btrfs_free_path(path);
return ret;
}
/*
* helper to account the unused space of all the readonly block group in the
* space_info. takes mirrors into account.
*/
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
struct btrfs_block_group *block_group;
u64 free_bytes = 0;
int factor;
/* It's df, we don't care if it's racy */
if (list_empty(&sinfo->ro_bgs))
return 0;
spin_lock(&sinfo->lock);
list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
spin_lock(&block_group->lock);
if (!block_group->ro) {
spin_unlock(&block_group->lock);
continue;
}
factor = btrfs_bg_type_to_factor(block_group->flags);
free_bytes += (block_group->length -
block_group->used) * factor;
spin_unlock(&block_group->lock);
}
spin_unlock(&sinfo->lock);
return free_bytes;
}
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end)
{
return unpin_extent_range(fs_info, start, end, false);
}
/*
* It used to be that old block groups would be left around forever.
* Iterating over them would be enough to trim unused space. Since we
* now automatically remove them, we also need to iterate over unallocated
* space.
*
* We don't want a transaction for this since the discard may take a
* substantial amount of time. We don't require that a transaction be
* running, but we do need to take a running transaction into account
* to ensure that we're not discarding chunks that were released or
* allocated in the current transaction.
*
* Holding the chunks lock will prevent other threads from allocating
* or releasing chunks, but it won't prevent a running transaction
* from committing and releasing the memory that the pending chunks
* list head uses. For that, we need to take a reference to the
* transaction and hold the commit root sem. We only need to hold
* it while performing the free space search since we have already
* held back allocations.
*/
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
u64 start = SZ_1M, len = 0, end = 0;
int ret;
*trimmed = 0;
/* Discard not supported = nothing to do. */
if (!blk_queue_discard(bdev_get_queue(device->bdev)))
return 0;
/* Not writable = nothing to do. */
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return 0;
/* No free space = nothing to do. */
if (device->total_bytes <= device->bytes_used)
return 0;
ret = 0;
while (1) {
struct btrfs_fs_info *fs_info = device->fs_info;
u64 bytes;
ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
if (ret)
break;
find_first_clear_extent_bit(&device->alloc_state, start,
&start, &end,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
/* Check if there are any CHUNK_* bits left */
if (start > device->total_bytes) {
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
btrfs_warn_in_rcu(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
start, end - start + 1,
rcu_str_deref(device->name),
device->total_bytes);
mutex_unlock(&fs_info->chunk_mutex);
ret = 0;
break;
}
/* Ensure we skip the reserved area in the first 1M */
start = max_t(u64, start, SZ_1M);
/*
* If find_first_clear_extent_bit find a range that spans the
* end of the device it will set end to -1, in this case it's up
* to the caller to trim the value to the size of the device.
*/
end = min(end, device->total_bytes - 1);
len = end - start + 1;
/* We didn't find any extents */
if (!len) {
mutex_unlock(&fs_info->chunk_mutex);
ret = 0;
break;
}
ret = btrfs_issue_discard(device->bdev, start, len,
&bytes);
if (!ret)
set_extent_bits(&device->alloc_state, start,
start + bytes - 1,
CHUNK_TRIMMED);
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
break;
start += len;
*trimmed += bytes;
if (fatal_signal_pending(current)) {
ret = -ERESTARTSYS;
break;
}
cond_resched();
}
return ret;
}
/*
* Trim the whole filesystem by:
* 1) trimming the free space in each block group
* 2) trimming the unallocated space on each device
*
* This will also continue trimming even if a block group or device encounters
* an error. The return value will be the last error, or 0 if nothing bad
* happens.
*/
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_block_group *cache = NULL;
struct btrfs_device *device;
u64 group_trimmed;
u64 range_end = U64_MAX;
u64 start;
u64 end;
u64 trimmed = 0;
u64 bg_failed = 0;
u64 dev_failed = 0;
int bg_ret = 0;
int dev_ret = 0;
int ret = 0;
/*
* Check range overflow if range->len is set.
* The default range->len is U64_MAX.
*/
if (range->len != U64_MAX &&
check_add_overflow(range->start, range->len, &range_end))
return -EINVAL;
cache = btrfs_lookup_first_block_group(fs_info, range->start);
for (; cache; cache = btrfs_next_block_group(cache)) {
if (cache->start >= range_end) {
btrfs_put_block_group(cache);
break;
}
start = max(range->start, cache->start);
end = min(range_end, cache->start + cache->length);
if (end - start >= range->minlen) {
if (!btrfs_block_group_done(cache)) {
ret = btrfs_cache_block_group(cache, 0);
if (ret) {
bg_failed++;
bg_ret = ret;
continue;
}
ret = btrfs_wait_block_group_cache_done(cache);
if (ret) {
bg_failed++;
bg_ret = ret;
continue;
}
}
ret = btrfs_trim_block_group(cache,
&group_trimmed,
start,
end,
range->minlen);
trimmed += group_trimmed;
if (ret) {
bg_failed++;
bg_ret = ret;
continue;
}
}
}
if (bg_failed)
btrfs_warn(fs_info,
"failed to trim %llu block group(s), last error %d",
bg_failed, bg_ret);
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
continue;
ret = btrfs_trim_free_extents(device, &group_trimmed);
if (ret) {
dev_failed++;
dev_ret = ret;
break;
}
trimmed += group_trimmed;
}
mutex_unlock(&fs_devices->device_list_mutex);
if (dev_failed)
btrfs_warn(fs_info,
"failed to trim %llu device(s), last error %d",
dev_failed, dev_ret);
range->len = trimmed;
if (bg_ret)
return bg_ret;
return dev_ret;
}
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* fs/eventpoll.c (Efficient event retrieval implementation)
* Copyright (C) 2001,...,2009 Davide Libenzi
*
* Davide Libenzi <davidel@xmailserver.org>
*/
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
#include <net/busy_poll.h>
/*
* LOCKING:
* There are three level of locking required by epoll :
*
* 1) epmutex (mutex)
* 2) ep->mtx (mutex)
* 3) ep->lock (rwlock)
*
* The acquire order is the one listed above, from 1 to 3.
* We need a rwlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
* a spinlock. During the event transfer loop (from kernel to
* user space) we could end up sleeping due a copy_to_user(), so
* we need a lock that will allow us to sleep. This lock is a
* mutex (ep->mtx). It is acquired during the event transfer loop,
* during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
* Then we also need a global mutex to serialize eventpoll_release_file()
* and ep_free().
* This mutex is acquired by ep_free() during the epoll file
* cleanup path and it is also acquired by eventpoll_release_file()
* if a file has been pushed inside an epoll set and it is then
* close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
* It is also acquired when inserting an epoll fd onto another epoll
* fd. We do this so that we walk the epoll tree and ensure that this
* insertion does not create a cycle of epoll file descriptors, which
* could lead to deadlock. We need a global mutex to prevent two
* simultaneous inserts (A into B and B into A) from racing and
* constructing a cycle without either insert observing that it is
* going to.
* It is necessary to acquire multiple "ep->mtx"es at once in the
* case when one epoll fd is added to another. In this case, we
* always acquire the locks in the order of nesting (i.e. after
* epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
* before e2->mtx). Since we disallow cycles of epoll file
* descriptors, this ensures that the mutexes are well-ordered. In
* order to communicate this nesting to lockdep, when walking a tree
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
* mutex "epmutex" (together with "ep->lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
* Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
* a better scalability.
*/
/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
#define EP_UNACTIVE_PTR ((void *) -1L)
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
struct epoll_filefd {
struct file *file;
int fd;
} __packed;
/* Wait structure used by the poll hooks */
struct eppoll_entry {
/* List header used to link this structure to the "struct epitem" */
struct eppoll_entry *next;
/* The "base" pointer is set to the container "struct epitem" */
struct epitem *base;
/*
* Wait queue item that will be linked to the target file wait
* queue head.
*/
wait_queue_entry_t wait;
/* The wait queue head that linked the "wait" wait queue item */
wait_queue_head_t *whead;
};
/*
* Each file descriptor added to the eventpoll interface will
* have an entry of this type linked to the "rbr" RB tree.
* Avoid increasing the size of this struct, there can be many thousands
* of these on a server and we do not want this to take another cache line.
*/
struct epitem {
union {
/* RB tree node links this structure to the eventpoll RB tree */
struct rb_node rbn;
/* Used to free the struct epitem */
struct rcu_head rcu;
};
/* List header used to link this structure to the eventpoll ready list */
struct list_head rdllink;
/*
* Works together "struct eventpoll"->ovflist in keeping the
* single linked chain of items.
*/
struct epitem *next;
/* The file descriptor information this item refers to */
struct epoll_filefd ffd;
/* List containing poll wait queues */
struct eppoll_entry *pwqlist;
/* The "container" of this item */
struct eventpoll *ep;
/* List header used to link this item to the "struct file" items list */
struct hlist_node fllink;
/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source __rcu *ws;
/* The structure that describe the interested events and the source fd */
struct epoll_event event;
};
/*
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
*/
struct eventpoll {
/*
* This mutex is used to ensure that files are not removed
* while epoll is using them. This is held during the event
* collection loop, the file cleanup path, the epoll file exit
* code and the ctl operations.
*/
struct mutex mtx;
/* Wait queue used by sys_epoll_wait() */
wait_queue_head_t wq;
/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;
/* List of ready file descriptors */
struct list_head rdllist;
/* Lock which protects rdllist and ovflist */
rwlock_t lock;
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
* holding ->lock.
*/
struct epitem *ovflist;
/* wakeup_source used when ep_scan_ready_list is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
struct user_struct *user;
struct file *file;
/* used to optimize loop detection check */
u64 gen;
struct hlist_head refs;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* tracks wakeup nests for lockdep validation */
u8 nests;
#endif
};
/* Wrapper struct used by poll queueing */
struct ep_pqueue {
poll_table pt;
struct epitem *epi;
};
/*
* Configuration options available inside /proc/sys/fs/epoll/
*/
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;
/*
* This mutex is used to serialize ep_free() and eventpoll_release_file().
*/
static DEFINE_MUTEX(epmutex);
static u64 loop_check_gen = 0;
/* Used to check for epoll file descriptor inclusion loops */
static struct eventpoll *inserting_into;
/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;
/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;
/*
* List of files with newly added links, where we may need to limit the number
* of emanating paths. Protected by the epmutex.
*/
struct epitems_head {
struct hlist_head epitems;
struct epitems_head *next;
};
static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
static struct kmem_cache *ephead_cache __read_mostly;
static inline void free_ephead(struct epitems_head *head)
{
if (head)
kmem_cache_free(ephead_cache, head);
}
static void list_file(struct file *file)
{
struct epitems_head *head;
head = container_of(file->f_ep, struct epitems_head, epitems);
if (!head->next) {
head->next = tfile_check_list;
tfile_check_list = head;
}
}
static void unlist_file(struct epitems_head *head)
{
struct epitems_head *to_free = head;
struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
if (p) {
struct epitem *epi= container_of(p, struct epitem, fllink);
spin_lock(&epi->ffd.file->f_lock);
if (!hlist_empty(&head->epitems))
to_free = NULL;
head->next = NULL;
spin_unlock(&epi->ffd.file->f_lock);
}
free_ephead(to_free);
}
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
static long long_zero;
static long long_max = LONG_MAX;
struct ctl_table epoll_table[] = {
{
.procname = "max_user_watches",
.data = &max_user_watches,
.maxlen = sizeof(max_user_watches),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &long_zero,
.extra2 = &long_max,
},
{ }
};
#endif /* CONFIG_SYSCTL */
static const struct file_operations eventpoll_fops;
static inline int is_file_epoll(struct file *f)
{
return f->f_op == &eventpoll_fops;
}
/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
struct file *file, int fd)
{
ffd->file = file;
ffd->fd = fd;
}
/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
struct epoll_filefd *p2)
{
return (p1->file > p2->file ? +1:
(p1->file < p2->file ? -1 : p1->fd - p2->fd));
}
/* Tells us if the item is currently linked */
static inline int ep_is_linked(struct epitem *epi)
{
return !list_empty(&epi->rdllink);
}
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
return container_of(p, struct eppoll_entry, wait);
}
/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
return container_of(p, struct eppoll_entry, wait)->base;
}
/**
* ep_events_available - Checks if ready events might be available.
*
* @ep: Pointer to the eventpoll context.
*
* Return: a value different than %zero if ready events are available,
* or %zero otherwise.
*/
static inline int ep_events_available(struct eventpoll *ep)
{
return !list_empty_careful(&ep->rdllist) ||
READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}
#ifdef CONFIG_NET_RX_BUSY_POLL
static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
struct eventpoll *ep = p;
return ep_events_available(ep) || busy_loop_timeout(start_time);
}
/*
* Busy poll if globally on and supporting sockets found && no events,
* busy loop will return if need_resched or ep_events_available.
*
* we must do our busy polling with irqs enabled
*/
static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
BUSY_POLL_BUDGET);
if (ep_events_available(ep))
return true;
/*
* Busy poll timed out. Drop NAPI ID for now, we can add
* it back in when we have moved a socket with a valid NAPI
* ID onto the ready list.
*/
ep->napi_id = 0;
return false;
}
return false;
}
/*
* Set epoll busy poll NAPI ID from sk.
*/
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
struct eventpoll *ep;
unsigned int napi_id;
struct socket *sock;
struct sock *sk;
if (!net_busy_loop_on())
return;
sock = sock_from_file(epi->ffd.file);
if (!sock)
return;
sk = sock->sk;
if (!sk)
return;
napi_id = READ_ONCE(sk->sk_napi_id); ep = epi->ep;
/* Non-NAPI IDs can be rejected
* or
* Nothing to do if we already have this ID
*/
if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
return;
/* record NAPI ID for use in next busy poll */
ep->napi_id = napi_id;
}
#else
static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
return false;
}
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}
#endif /* CONFIG_NET_RX_BUSY_POLL */
/*
* As described in commit 0ccf831cb lockdep: annotate epoll
* the use of wait queues used by epoll is done in a very controlled
* manner. Wake ups can nest inside each other, but are never done
* with the same locking. For example:
*
* dfd = socket(...);
* efd1 = epoll_create();
* efd2 = epoll_create();
* epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
* epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
*
* When a packet arrives to the device underneath "dfd", the net code will
* issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
* callback wakeup entry on that queue, and the wake_up() performed by the
* "dfd" net code will end up in ep_poll_callback(). At this point epoll
* (efd1) notices that it may have some event ready, so it needs to wake up
* the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
* that ends up in another wake_up(), after having checked about the
* recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to
* avoid stack blasting.
*
* When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
* this special case of epoll.
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
struct eventpoll *ep_src;
unsigned long flags;
u8 nests = 0;
/*
* To set the subclass or nesting level for spin_lock_irqsave_nested()
* it might be natural to create a per-cpu nest count. However, since
* we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
* schedule() in the -rt kernel, the per-cpu variable are no longer
* protected. Thus, we are introducing a per eventpoll nest field.
* If we are not being call from ep_poll_callback(), epi is NULL and
* we are at the first level of nesting, 0. Otherwise, we are being
* called from ep_poll_callback() and if a previous wakeup source is
* not an epoll file itself, we are at depth 1 since the wakeup source
* is depth 0. If the wakeup source is a previous epoll file in the
* wakeup chain then we use its nests value and record ours as
* nests + 1. The previous epoll file nests value is stable since its
* already holding its own poll_wait.lock.
*/
if (epi) {
if ((is_file_epoll(epi->ffd.file))) {
ep_src = epi->ffd.file->private_data;
nests = ep_src->nests;
} else {
nests = 1;
}
}
spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
ep->nests = nests + 1;
wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
ep->nests = 0;
spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}
#else
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
wake_up_poll(&ep->poll_wait, EPOLLIN);
}
#endif
static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
wait_queue_head_t *whead;
rcu_read_lock();
/*
* If it is cleared by POLLFREE, it should be rcu-safe.
* If we read NULL we need a barrier paired with
* smp_store_release() in ep_poll_callback(), otherwise
* we rely on whead->lock.
*/
whead = smp_load_acquire(&pwq->whead);
if (whead)
remove_wait_queue(whead, &pwq->wait);
rcu_read_unlock();
}
/*
* This function unregisters poll callbacks from the associated file
* descriptor. Must be called with "mtx" held (or "epmutex" if called from
* ep_free).
*/
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
struct eppoll_entry **p = &epi->pwqlist;
struct eppoll_entry *pwq;
while ((pwq = *p) != NULL) {
*p = pwq->next;
ep_remove_wait_queue(pwq);
kmem_cache_free(pwq_cache, pwq);
}
}
/* call only when ep->mtx is held */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
}
/* call only when ep->mtx is held */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
struct wakeup_source *ws = ep_wakeup_source(epi);
if (ws)
__pm_stay_awake(ws);
}
static inline bool ep_has_wakeup_source(struct epitem *epi)
{
return rcu_access_pointer(epi->ws) ? true : false;
}
/* call when ep->mtx cannot be held (ep_poll_callback) */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
struct wakeup_source *ws;
rcu_read_lock();
ws = rcu_dereference(epi->ws);
if (ws)
__pm_stay_awake(ws);
rcu_read_unlock();
}
/*
* ep->mutex needs to be held because we could be hit by
* eventpoll_release_file() and epoll_ctl().
*/
static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
{
/*
* Steal the ready list, and re-init the original one to the
* empty list. Also, set ep->ovflist to NULL so that events
* happening while looping w/out locks, are not lost. We cannot
* have the poll callback to queue directly on ep->rdllist,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
lockdep_assert_irqs_enabled();
write_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, txlist);
WRITE_ONCE(ep->ovflist, NULL);
write_unlock_irq(&ep->lock);
}
static void ep_done_scan(struct eventpoll *ep,
struct list_head *txlist)
{
struct epitem *epi, *nepi;
write_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
* We re-insert them inside the main ready-list here.
*/
for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
/*
* We need to check if the item is already in the list.
* During the "sproc" callback execution time, items are
* queued into ->ovflist but the "txlist" might already
* contain them, and the list_splice() below takes care of them.
*/
if (!ep_is_linked(epi)) {
/*
* ->ovflist is LIFO, so we have to reverse it in order
* to keep in FIFO.
*/
list_add(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
/*
* We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
* releasing the lock, events will be queued in the normal way inside
* ep->rdllist.
*/
WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
/*
* Quickly re-inject items left on "txlist".
*/
list_splice(txlist, &ep->rdllist);
__pm_relax(ep->ws);
if (!list_empty(&ep->rdllist)) {
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
}
write_unlock_irq(&ep->lock);
}
static void epi_rcu_free(struct rcu_head *head)
{
struct epitem *epi = container_of(head, struct epitem, rcu);
kmem_cache_free(epi_cache, epi);
}
/*
* Removes a "struct epitem" from the eventpoll RB tree and deallocates
* all the associated resources. Must be called with "mtx" held.
*/
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
struct file *file = epi->ffd.file;
struct epitems_head *to_free;
struct hlist_head *head;
lockdep_assert_irqs_enabled();
/*
* Removes poll wait queue hooks.
*/
ep_unregister_pollwait(ep, epi);
/* Remove the current item from the list of epoll hooks */
spin_lock(&file->f_lock);
to_free = NULL;
head = file->f_ep;
if (head->first == &epi->fllink && !epi->fllink.next) {
file->f_ep = NULL;
if (!is_file_epoll(file)) {
struct epitems_head *v;
v = container_of(head, struct epitems_head, epitems);
if (!smp_load_acquire(&v->next))
to_free = v;
}
}
hlist_del_rcu(&epi->fllink);
spin_unlock(&file->f_lock);
free_ephead(to_free);
rb_erase_cached(&epi->rbn, &ep->rbr);
write_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
write_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
* At this point it is safe to free the eventpoll item. Use the union
* field epi->rcu, since we are trying to minimize the size of
* 'struct epitem'. The 'rbn' field is no longer in use. Protected by
* ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
* use of the rbn field.
*/
call_rcu(&epi->rcu, epi_rcu_free);
percpu_counter_dec(&ep->user->epoll_watches);
return 0;
}
static void ep_free(struct eventpoll *ep)
{
struct rb_node *rbp;
struct epitem *epi;
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
ep_poll_safewake(ep, NULL);
/*
* We need to lock this because we could be hit by
* eventpoll_release_file() while we're freeing the "struct eventpoll".
* We do not need to hold "ep->mtx" here because the epoll file
* is on the way to be removed and no one has references to it
* anymore. The only hit might come from eventpoll_release_file() but
* holding "epmutex" is sufficient here.
*/
mutex_lock(&epmutex);
/*
* Walks through the whole tree by unregistering poll callbacks.
*/
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_unregister_pollwait(ep, epi);
cond_resched();
}
/*
* Walks through the whole tree by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
* us during this operation. So we can avoid the lock on "ep->lock".
* We do not need to lock ep->mtx, either, we only do it to prevent
* a lockdep warning.
*/
mutex_lock(&ep->mtx);
while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_remove(ep, epi);
cond_resched();
}
mutex_unlock(&ep->mtx);
mutex_unlock(&epmutex);
mutex_destroy(&ep->mtx);
free_uid(ep->user);
wakeup_source_unregister(ep->ws);
kfree(ep);
}
static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
struct eventpoll *ep = file->private_data;
if (ep)
ep_free(ep);
return 0;
}
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);
static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
{
struct eventpoll *ep = file->private_data;
LIST_HEAD(txlist);
struct epitem *epi, *tmp;
poll_table pt;
__poll_t res = 0;
init_poll_funcptr(&pt, NULL);
/* Insert inside our poll wait queue */
poll_wait(file, &ep->poll_wait, wait);
/*
* Proceed to find out if wanted events are really available inside
* the ready list.
*/
mutex_lock_nested(&ep->mtx, depth);
ep_start_scan(ep, &txlist);
list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
if (ep_item_poll(epi, &pt, depth + 1)) {
res = EPOLLIN | EPOLLRDNORM;
break;
} else {
/*
* Item has been dropped into the ready list by the poll
* callback, but it's not actually ready, as far as
* caller requested events goes. We can remove it here.
*/
__pm_relax(ep_wakeup_source(epi));
list_del_init(&epi->rdllink);
}
}
ep_done_scan(ep, &txlist);
mutex_unlock(&ep->mtx);
return res;
}
/*
* Differs from ep_eventpoll_poll() in that internal callers already have
* the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
* is correctly annotated.
*/
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
int depth)
{
struct file *file = epi->ffd.file;
__poll_t res;
pt->_key = epi->event.events;
if (!is_file_epoll(file))
res = vfs_poll(file, pt);
else
res = __ep_eventpoll_poll(file, pt, depth);
return res & epi->event.events;
}
static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
{
return __ep_eventpoll_poll(file, wait, 0);
}
#ifdef CONFIG_PROC_FS
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
struct eventpoll *ep = f->private_data;
struct rb_node *rbp;
mutex_lock(&ep->mtx);
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
struct inode *inode = file_inode(epi->ffd.file);
seq_printf(m, "tfd: %8d events: %8x data: %16llx "
" pos:%lli ino:%lx sdev:%x\n",
epi->ffd.fd, epi->event.events,
(long long)epi->event.data,
(long long)epi->ffd.file->f_pos,
inode->i_ino, inode->i_sb->s_dev);
if (seq_has_overflowed(m))
break;
}
mutex_unlock(&ep->mtx);
}
#endif
/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = ep_show_fdinfo,
#endif
.release = ep_eventpoll_release,
.poll = ep_eventpoll_poll,
.llseek = noop_llseek,
};
/*
* This is called from eventpoll_release() to unlink files from the eventpoll
* interface. We need to have this facility to cleanup correctly files that are
* closed without being removed from the eventpoll interface.
*/
void eventpoll_release_file(struct file *file)
{
struct eventpoll *ep;
struct epitem *epi;
struct hlist_node *next;
/*
* We don't want to get "file->f_lock" because it is not
* necessary. It is not necessary because we're in the "struct file"
* cleanup path, and this means that no one is using this file anymore.
* So, for example, epoll_ctl() cannot hit here since if we reach this
* point, the file counter already went to zero and fget() would fail.
* The only hit might come from ep_free() but by holding the mutex
* will correctly serialize the operation. We do need to acquire
* "ep->mtx" after "epmutex" because ep_remove() requires it when called
* from anywhere but ep_free().
*
* Besides, ep_remove() acquires the lock, so we can't hold it here.
*/
mutex_lock(&epmutex);
if (unlikely(!file->f_ep)) {
mutex_unlock(&epmutex);
return;
}
hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
ep = epi->ep;
mutex_lock_nested(&ep->mtx, 0);
ep_remove(ep, epi);
mutex_unlock(&ep->mtx);
}
mutex_unlock(&epmutex);
}
static int ep_alloc(struct eventpoll **pep)
{
int error;
struct user_struct *user;
struct eventpoll *ep;
user = get_current_user();
error = -ENOMEM;
ep = kzalloc(sizeof(*ep), GFP_KERNEL);
if (unlikely(!ep))
goto free_uid;
mutex_init(&ep->mtx);
rwlock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
ep->rbr = RB_ROOT_CACHED;
ep->ovflist = EP_UNACTIVE_PTR;
ep->user = user;
*pep = ep;
return 0;
free_uid:
free_uid(user);
return error;
}
/*
* Search the file inside the eventpoll tree. The RB tree operations
* are protected by the "mtx" mutex, and ep_find() must be called with
* "mtx" held.
*/
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
int kcmp;
struct rb_node *rbp;
struct epitem *epi, *epir = NULL;
struct epoll_filefd ffd;
ep_set_ffd(&ffd, file, fd);
for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
epi = rb_entry(rbp, struct epitem, rbn);
kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
if (kcmp > 0)
rbp = rbp->rb_right;
else if (kcmp < 0)
rbp = rbp->rb_left;
else {
epir = epi;
break;
}
}
return epir;
}
#ifdef CONFIG_KCMP
static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
{
struct rb_node *rbp;
struct epitem *epi;
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (epi->ffd.fd == tfd) {
if (toff == 0)
return epi;
else
toff--;
}
cond_resched();
}
return NULL;
}
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
unsigned long toff)
{
struct file *file_raw;
struct eventpoll *ep;
struct epitem *epi;
if (!is_file_epoll(file))
return ERR_PTR(-EINVAL);
ep = file->private_data;
mutex_lock(&ep->mtx);
epi = ep_find_tfd(ep, tfd, toff);
if (epi)
file_raw = epi->ffd.file;
else
file_raw = ERR_PTR(-ENOENT);
mutex_unlock(&ep->mtx);
return file_raw;
}
#endif /* CONFIG_KCMP */
/*
* Adds a new entry to the tail of the list in a lockless way, i.e.
* multiple CPUs are allowed to call this function concurrently.
*
* Beware: it is necessary to prevent any other modifications of the
* existing list until all changes are completed, in other words
* concurrent list_add_tail_lockless() calls should be protected
* with a read lock, where write lock acts as a barrier which
* makes sure all list_add_tail_lockless() calls are fully
* completed.
*
* Also an element can be locklessly added to the list only in one
* direction i.e. either to the tail or to the head, otherwise
* concurrent access will corrupt the list.
*
* Return: %false if element has been already added to the list, %true
* otherwise.
*/
static inline bool list_add_tail_lockless(struct list_head *new,
struct list_head *head)
{
struct list_head *prev;
/*
* This is simple 'new->next = head' operation, but cmpxchg()
* is used in order to detect that same element has been just
* added to the list from another CPU: the winner observes
* new->next == new.
*/
if (cmpxchg(&new->next, new, head) != new)
return false;
/*
* Initially ->next of a new element must be updated with the head
* (we are inserting to the tail) and only then pointers are atomically
* exchanged. XCHG guarantees memory ordering, thus ->next should be
* updated before pointers are actually swapped and pointers are
* swapped before prev->next is updated.
*/
prev = xchg(&head->prev, new);
/*
* It is safe to modify prev->next and new->prev, because a new element
* is added only to the tail and new->next is updated before XCHG.
*/
prev->next = new;
new->prev = prev;
return true;
}
/*
* Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
* i.e. multiple CPUs are allowed to call this function concurrently.
*
* Return: %false if epi element has been already chained, %true otherwise.
*/
static inline bool chain_epi_lockless(struct epitem *epi)
{
struct eventpoll *ep = epi->ep;
/* Fast preliminary check */
if (epi->next != EP_UNACTIVE_PTR)
return false;
/* Check that the same epi has not been just chained from another CPU */
if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
return false;
/* Atomically exchange tail */
epi->next = xchg(&ep->ovflist, epi);
return true;
}
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
*
* This callback takes a read lock in order not to contend with concurrent
* events from another file descriptor, thus all modifications to ->rdllist
* or ->ovflist are lockless. Read lock is paired with the write lock from
* ep_scan_ready_list(), which stops all list modifications and guarantees
* that lists state is seen correctly.
*
* Another thing worth to mention is that ep_poll_callback() can be called
* concurrently for the same @epi from different CPUs if poll table was inited
* with several wait queues entries. Plural wakeup from different CPUs of a
* single wait queue is serialized by wq.lock, but the case when multiple wait
* queues are used should be detected accordingly. This is detected using
* cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
__poll_t pollflags = key_to_poll(key);
unsigned long flags;
int ewake = 0;
read_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
/*
* If the event mask does not contain any poll(2) event, we consider the
* descriptor to be disabled. This condition is likely the effect of the
* EPOLLONESHOT bit that disables the descriptor when an event is received,
* until the next EPOLL_CTL_MOD will be issued.
*/
if (!(epi->event.events & ~EP_PRIVATE_BITS))
goto out_unlock;
/*
* Check the events coming with the callback. At this stage, not
* every device reports the events in the "key" parameter of the
* callback. We need to be able to handle both cases here, hence the
* test for "key" != NULL before the event match test.
*/
if (pollflags && !(pollflags & epi->event.events))
goto out_unlock;
/*
* If we are transferring events to userspace, we can hold no locks
* (because we're accessing user memory, and because of linux f_op->poll()
* semantics). All the events that happen during that period of time are
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
if (chain_epi_lockless(epi))
ep_pm_stay_awake_rcu(epi);
} else if (!ep_is_linked(epi)) {
/* In the usual case, add event to ready list. */
if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
ep_pm_stay_awake_rcu(epi);
}
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list.
*/
if (waitqueue_active(&ep->wq)) {
if ((epi->event.events & EPOLLEXCLUSIVE) &&
!(pollflags & POLLFREE)) {
switch (pollflags & EPOLLINOUT_BITS) {
case EPOLLIN:
if (epi->event.events & EPOLLIN)
ewake = 1;
break;
case EPOLLOUT:
if (epi->event.events & EPOLLOUT)
ewake = 1;
break;
case 0:
ewake = 1;
break;
}
}
wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
read_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(ep, epi);
if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
if (pollflags & POLLFREE) {
/*
* If we race with ep_remove_wait_queue() it can miss
* ->whead = NULL and do another remove_wait_queue() after
* us, so we can't use __remove_wait_queue().
*/
list_del_init(&wait->entry);
/*
* ->whead != NULL protects us from the race with ep_free()
* or ep_remove(), ep_remove_wait_queue() takes whead->lock
* held by the caller. Once we nullify it, nothing protects
* ep/epi or even wait.
*/
smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
}
return ewake;
}
/*
* This is the callback that is used to add our wait queue to the
* target file wakeup lists.
*/
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt)
{
struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
struct epitem *epi = epq->epi;
struct eppoll_entry *pwq;
if (unlikely(!epi)) // an earlier allocation has failed
return;
pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
if (unlikely(!pwq)) {
epq->epi = NULL;
return;
}
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
if (epi->event.events & EPOLLEXCLUSIVE)
add_wait_queue_exclusive(whead, &pwq->wait);
else
add_wait_queue(whead, &pwq->wait);
pwq->next = epi->pwqlist;
epi->pwqlist = pwq;
}
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
int kcmp;
struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
struct epitem *epic;
bool leftmost = true;
while (*p) {
parent = *p;
epic = rb_entry(parent, struct epitem, rbn);
kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
if (kcmp > 0) {
p = &parent->rb_right;
leftmost = false;
} else
p = &parent->rb_left;
}
rb_link_node(&epi->rbn, parent, p);
rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
}
#define PATH_ARR_SIZE 5
/*
* These are the number paths of length 1 to 5, that we are allowing to emanate
* from a single file of interest. For example, we allow 1000 paths of length
* 1, to emanate from each file of interest. This essentially represents the
* potential wakeup paths, which need to be limited in order to avoid massive
* uncontrolled wakeup storms. The common use case should be a single ep which
* is connected to n file sources. In this case each file source has 1 path
* of length 1. Thus, the numbers below should be more than sufficient. These
* path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
* and delete can't add additional paths. Protected by the epmutex.
*/
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];
static int path_count_inc(int nests)
{
/* Allow an arbitrary number of depth 1 paths */
if (nests == 0)
return 0;
if (++path_count[nests] > path_limits[nests])
return -1;
return 0;
}
static void path_count_init(void)
{
int i;
for (i = 0; i < PATH_ARR_SIZE; i++)
path_count[i] = 0;
}
static int reverse_path_check_proc(struct hlist_head *refs, int depth)
{
int error = 0;
struct epitem *epi;
if (depth > EP_MAX_NESTS) /* too deep nesting */
return -1;
/* CTL_DEL can remove links here, but that can't increase our count */
hlist_for_each_entry_rcu(epi, refs, fllink) {
struct hlist_head *refs = &epi->ep->refs;
if (hlist_empty(refs))
error = path_count_inc(depth);
else
error = reverse_path_check_proc(refs, depth + 1);
if (error != 0)
break;
}
return error;
}
/**
* reverse_path_check - The tfile_check_list is list of epitem_head, which have
* links that are proposed to be newly added. We need to
* make sure that those added links don't add too many
* paths such that we will spend all our time waking up
* eventpoll objects.
*
* Return: %zero if the proposed links don't create too many paths,
* %-1 otherwise.
*/
static int reverse_path_check(void)
{
struct epitems_head *p;
for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
int error;
path_count_init();
rcu_read_lock();
error = reverse_path_check_proc(&p->epitems, 0);
rcu_read_unlock();
if (error)
return error;
}
return 0;
}
static int ep_create_wakeup_source(struct epitem *epi)
{
struct name_snapshot n;
struct wakeup_source *ws;
if (!epi->ep->ws) {
epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
if (!epi->ep->ws)
return -ENOMEM;
}
take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
ws = wakeup_source_register(NULL, n.name.name);
release_dentry_name_snapshot(&n);
if (!ws)
return -ENOMEM;
rcu_assign_pointer(epi->ws, ws);
return 0;
}
/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
struct wakeup_source *ws = ep_wakeup_source(epi);
RCU_INIT_POINTER(epi->ws, NULL);
/*
* wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
* used internally by wakeup_source_remove, too (called by
* wakeup_source_unregister), so we cannot use call_rcu
*/
synchronize_rcu();
wakeup_source_unregister(ws);
}
static int attach_epitem(struct file *file, struct epitem *epi)
{
struct epitems_head *to_free = NULL;
struct hlist_head *head = NULL;
struct eventpoll *ep = NULL;
if (is_file_epoll(file))
ep = file->private_data;
if (ep) {
head = &ep->refs;
} else if (!READ_ONCE(file->f_ep)) {
allocate:
to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
if (!to_free)
return -ENOMEM;
head = &to_free->epitems;
}
spin_lock(&file->f_lock);
if (!file->f_ep) {
if (unlikely(!head)) {
spin_unlock(&file->f_lock);
goto allocate;
}
file->f_ep = head;
to_free = NULL;
}
hlist_add_head_rcu(&epi->fllink, file->f_ep);
spin_unlock(&file->f_lock);
free_ephead(to_free);
return 0;
}
/*
* Must be called with "mtx" held.
*/
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
struct file *tfile, int fd, int full_check)
{
int error, pwake = 0;
__poll_t revents;
struct epitem *epi;
struct ep_pqueue epq;
struct eventpoll *tep = NULL;
if (is_file_epoll(tfile))
tep = tfile->private_data;
lockdep_assert_irqs_enabled();
if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
max_user_watches) >= 0))
return -ENOSPC;
percpu_counter_inc(&ep->user->epoll_watches);
if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
percpu_counter_dec(&ep->user->epoll_watches);
return -ENOMEM;
}
/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->next = EP_UNACTIVE_PTR;
if (tep)
mutex_lock_nested(&tep->mtx, 1);
/* Add the current item to the list of active epoll hook for this file */
if (unlikely(attach_epitem(tfile, epi) < 0)) {
if (tep)
mutex_unlock(&tep->mtx);
kmem_cache_free(epi_cache, epi);
percpu_counter_dec(&ep->user->epoll_watches);
return -ENOMEM;
}
if (full_check && !tep)
list_file(tfile);
/*
* Add the current item to the RB tree. All RB tree operations are
* protected by "mtx", and ep_insert() is called with "mtx" held.
*/
ep_rbtree_insert(ep, epi);
if (tep)
mutex_unlock(&tep->mtx);
/* now check if we've created too many backpaths */
if (unlikely(full_check && reverse_path_check())) {
ep_remove(ep, epi);
return -EINVAL;
}
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error) {
ep_remove(ep, epi);
return error;
}
}
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
* Attach the item to the poll hooks and get current event bits.
* We can safely use the file* here because its usage count has
* been increased by the caller of this function. Note that after
* this operation completes, the poll callback can start hitting
* the new item.
*/
revents = ep_item_poll(epi, &epq.pt, 1);
/*
* We have to check if something went wrong during the poll wait queue
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
if (unlikely(!epq.epi)) {
ep_remove(ep, epi);
return -ENOMEM;
}
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
/* If the file is already "ready" we drop it inside the ready list */
if (revents && !ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irq(&ep->lock);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(ep, NULL);
return 0;
}
/*
* Modify the interest event mask by dropping an event if the new mask
* has a match in the current file status. Must be called with "mtx" held.
*/
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
const struct epoll_event *event)
{
int pwake = 0;
poll_table pt;
lockdep_assert_irqs_enabled();
init_poll_funcptr(&pt, NULL);
/*
* Set the new event interest mask before calling f_op->poll();
* otherwise we might miss an event that happens between the
* f_op->poll() call and the new event set registering.
*/
epi->event.events = event->events; /* need barrier below */
epi->event.data = event->data; /* protected by mtx */
if (epi->event.events & EPOLLWAKEUP) {
if (!ep_has_wakeup_source(epi))
ep_create_wakeup_source(epi);
} else if (ep_has_wakeup_source(epi)) {
ep_destroy_wakeup_source(epi);
}
/*
* The following barrier has two effects:
*
* 1) Flush epi changes above to other CPUs. This ensures
* we do not miss events from ep_poll_callback if an
* event occurs immediately after we call f_op->poll().
* We need this because we did not take ep->lock while
* changing epi above (but ep_poll_callback does take
* ep->lock).
*
* 2) We also need to ensure we do not miss _past_ events
* when calling f_op->poll(). This barrier also
* pairs with the barrier in wq_has_sleeper (see
* comments for wq_has_sleeper).
*
* This barrier will now guarantee ep_poll_callback or f_op->poll
* (or both) will notice the readiness of an item.
*/
smp_mb();
/*
* Get current event bits. We can safely use the file* here because
* its usage count has been increased by the caller of this function.
* If the item is "hot" and it is not registered inside the ready
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
write_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(ep, NULL);
return 0;
}
static int ep_send_events(struct eventpoll *ep,
struct epoll_event __user *events, int maxevents)
{
struct epitem *epi, *tmp;
LIST_HEAD(txlist);
poll_table pt;
int res = 0;
/*
* Always short-circuit for fatal signals to allow threads to make a
* timely exit without the chance of finding more events available and
* fetching repeatedly.
*/
if (fatal_signal_pending(current))
return -EINTR;
init_poll_funcptr(&pt, NULL);
mutex_lock(&ep->mtx);
ep_start_scan(ep, &txlist);
/*
* We can loop without lock because we are passed a task private list.
* Items cannot vanish during the loop we are holding ep->mtx.
*/
list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
struct wakeup_source *ws;
__poll_t revents;
if (res >= maxevents)
break;
/*
* Activate ep->ws before deactivating epi->ws to prevent
* triggering auto-suspend here (in case we reactive epi->ws
* below).
*
* This could be rearranged to delay the deactivation of epi->ws
* instead, but then epi->ws would temporarily be out of sync
* with ep_is_linked().
*/
ws = ep_wakeup_source(epi);
if (ws) {
if (ws->active)
__pm_stay_awake(ep->ws);
__pm_relax(ws);
}
list_del_init(&epi->rdllink);
/*
* If the event mask intersect the caller-requested one,
* deliver the event to userspace. Again, we are holding ep->mtx,
* so no operations coming from userspace can change the item.
*/
revents = ep_item_poll(epi, &pt, 1);
if (!revents)
continue;
events = epoll_put_uevent(revents, epi->event.data, events);
if (!events) {
list_add(&epi->rdllink, &txlist);
ep_pm_stay_awake(epi);
if (!res)
res = -EFAULT;
break;
}
res++;
if (epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
else if (!(epi->event.events & EPOLLET)) {
/*
* If this file has been added with Level
* Trigger mode, we need to insert back inside
* the ready list, so that the next call to
* epoll_wait() will check again the events
* availability. At this point, no one can insert
* into ep->rdllist besides us. The epoll_ctl()
* callers are locked out by
* ep_scan_ready_list() holding "mtx" and the
* poll callback will queue them in ep->ovflist.
*/
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
}
ep_done_scan(ep, &txlist);
mutex_unlock(&ep->mtx);
return res;
}
static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
{
struct timespec64 now;
if (ms < 0)
return NULL;
if (!ms) {
to->tv_sec = 0;
to->tv_nsec = 0;
return to;
}
to->tv_sec = ms / MSEC_PER_SEC;
to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
ktime_get_ts64(&now);
*to = timespec64_add_safe(now, *to);
return to;
}
/**
* ep_poll - Retrieves ready events, and delivers them to the caller-supplied
* event buffer.
*
* @ep: Pointer to the eventpoll context.
* @events: Pointer to the userspace buffer where the ready events should be
* stored.
* @maxevents: Size (in terms of number of events) of the caller event buffer.
* @timeout: Maximum timeout for the ready events fetch operation, in
* timespec. If the timeout is zero, the function will not block,
* while if the @timeout ptr is NULL, the function will block
* until at least one event has been retrieved (or an error
* occurred).
*
* Return: the number of ready events which have been fetched, or an
* error code, in case of error.
*/
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, struct timespec64 *timeout)
{
int res, eavail, timed_out = 0;
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
lockdep_assert_irqs_enabled();
if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
slack = select_estimate_accuracy(timeout);
to = &expires;
*to = timespec64_to_ktime(*timeout);
} else if (timeout) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
* caller specified a non blocking operation.
*/
timed_out = 1;
}
/*
* This call is racy: We may or may not see events that are being added
* to the ready list under the lock (e.g., in IRQ callbacks). For cases
* with a non-zero timeout, this thread will check the ready list under
* lock and will add to the wait queue. For cases with a zero
* timeout, the user by definition should not care and will have to
* recheck again.
*/
eavail = ep_events_available(ep);
while (1) {
if (eavail) {
/*
* Try to transfer events to user space. In case we get
* 0 events and there's still timeout left over, we go
* trying again in search of more luck.
*/
res = ep_send_events(ep, events, maxevents);
if (res)
return res;
}
if (timed_out)
return 0;
eavail = ep_busy_loop(ep, timed_out);
if (eavail)
continue;
if (signal_pending(current))
return -EINTR;
/*
* Internally init_wait() uses autoremove_wake_function(),
* thus wait entry is removed from the wait queue on each
* wakeup. Why it is important? In case of several waiters
* each new wakeup will hit the next waiter, giving it the
* chance to harvest new event. Otherwise wakeup can be
* lost. This is also good performance-wise, because on
* normal wakeup path no need to call __remove_wait_queue()
* explicitly, thus ep->lock is not taken, which halts the
* event delivery.
*/
init_wait(&wait);
write_lock_irq(&ep->lock);
/*
* Barrierless variant, waitqueue_active() is called under
* the same lock on wakeup ep_poll_callback() side, so it
* is safe to avoid an explicit barrier.
*/
__set_current_state(TASK_INTERRUPTIBLE);
/*
* Do the final check under the lock. ep_scan_ready_list()
* plays with two lists (->rdllist and ->ovflist) and there
* is always a race when both lists are empty for short
* period of time although events are pending, so lock is
* important.
*/
eavail = ep_events_available(ep);
if (!eavail)
__add_wait_queue_exclusive(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
if (!eavail)
timed_out = !schedule_hrtimeout_range(to, slack,
HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
/*
* We were woken up, thus go and try to harvest some events.
* If timed out and still on the wait queue, recheck eavail
* carefully under lock, below.
*/
eavail = 1;
if (!list_empty_careful(&wait.entry)) {
write_lock_irq(&ep->lock);
/*
* If the thread timed out and is not on the wait queue,
* it means that the thread was woken up after its
* timeout expired before it could reacquire the lock.
* Thus, when wait.entry is empty, it needs to harvest
* events.
*/
if (timed_out)
eavail = list_empty(&wait.entry);
__remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
}
}
}
/**
* ep_loop_check_proc - verify that adding an epoll file inside another
* epoll structure does not violate the constraints, in
* terms of closed loops, or too deep chains (which can
* result in excessive stack usage).
*
* @ep: the &struct eventpoll to be currently checked.
* @depth: Current depth of the path being checked.
*
* Return: %zero if adding the epoll @file inside current epoll
* structure @ep does not violate the constraints, or %-1 otherwise.
*/
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
int error = 0;
struct rb_node *rbp;
struct epitem *epi;
mutex_lock_nested(&ep->mtx, depth + 1);
ep->gen = loop_check_gen;
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
struct eventpoll *ep_tovisit;
ep_tovisit = epi->ffd.file->private_data;
if (ep_tovisit->gen == loop_check_gen)
continue;
if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
error = -1;
else
error = ep_loop_check_proc(ep_tovisit, depth + 1);
if (error != 0)
break;
} else {
/*
* If we've reached a file that is not associated with
* an ep, then we need to check if the newly added
* links are going to add too many wakeup paths. We do
* this by adding it to the tfile_check_list, if it's
* not already there, and calling reverse_path_check()
* during ep_insert().
*/
list_file(epi->ffd.file);
}
}
mutex_unlock(&ep->mtx);
return error;
}
/**
* ep_loop_check - Performs a check to verify that adding an epoll file (@to)
* into another epoll file (represented by @ep) does not create
* closed loops or too deep chains.
*
* @ep: Pointer to the epoll we are inserting into.
* @to: Pointer to the epoll to be inserted.
*
* Return: %zero if adding the epoll @to inside the epoll @from
* does not violate the constraints, or %-1 otherwise.
*/
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
inserting_into = ep;
return ep_loop_check_proc(to, 0);
}
static void clear_tfile_check_list(void)
{
rcu_read_lock();
while (tfile_check_list != EP_UNACTIVE_PTR) {
struct epitems_head *head = tfile_check_list;
tfile_check_list = head->next;
unlist_file(head);
}
rcu_read_unlock();
}
/*
* Open an eventpoll file descriptor.
*/
static int do_epoll_create(int flags)
{
int error, fd;
struct eventpoll *ep = NULL;
struct file *file;
/* Check the EPOLL_* constant for consistency. */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
if (flags & ~EPOLL_CLOEXEC)
return -EINVAL;
/*
* Create the internal data structure ("struct eventpoll").
*/
error = ep_alloc(&ep);
if (error < 0)
return error;
/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
if (fd < 0) {
error = fd;
goto out_free_ep;
}
file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
O_RDWR | (flags & O_CLOEXEC));
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_fd;
}
ep->file = file;
fd_install(fd, file);
return fd;
out_free_fd:
put_unused_fd(fd);
out_free_ep:
ep_free(ep);
return error;
}
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
return do_epoll_create(flags);
}
SYSCALL_DEFINE1(epoll_create, int, size)
{
if (size <= 0)
return -EINVAL;
return do_epoll_create(0);
}
static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
bool nonblock)
{
if (!nonblock) {
mutex_lock_nested(mutex, depth);
return 0;
}
if (mutex_trylock(mutex))
return 0;
return -EAGAIN;
}
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct eventpoll *tep = NULL;
error = -EBADF;
f = fdget(epfd);
if (!f.file)
goto error_return;
/* Get the "struct file *" for the target file */
tf = fdget(fd);
if (!tf.file)
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
if (!file_can_poll(tf.file))
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
ep_take_care_of_epollwakeup(epds);
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
if (f.file == tf.file || !is_file_epoll(f.file))
goto error_tgt_fput;
/*
* epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
* Also, we do not currently supported nested exclusive wakeups.
*/
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = f.file->private_data;
/*
* When we insert an epoll file descriptor inside another epoll file
* descriptor, there is the chance of creating closed loops, which are
* better be handled here, than in more critical paths. While we are
* checking for loops we also determine the list of files reachable
* and hang them on the tfile_check_list, so we can check that we
* haven't created too many possible wakeup paths.
*
* We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
* the epoll file descriptor is attaching directly to a wakeup source,
* unless the epoll file descriptor is nested. The purpose of taking the
* 'epmutex' on add is to prevent complex toplogies such as loops and
* deep wakeup paths from forming in parallel through multiple
* EPOLL_CTL_ADD operations.
*/
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
is_file_epoll(tf.file)) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epmutex, 0, nonblock);
if (error)
goto error_tgt_fput;
loop_check_gen++;
full_check = 1;
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
error = -ELOOP;
if (ep_loop_check(ep, tep) != 0)
goto error_tgt_fput;
}
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
}
}
/*
* Try to lookup the file inside our RB tree. Since we grabbed "mtx"
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
epi = ep_find(ep, tf.file, fd);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_insert(ep, epds, tf.file, fd, full_check);
} else
error = -EEXIST;
break;
case EPOLL_CTL_DEL:
if (epi)
error = ep_remove(ep, epi);
else
error = -ENOENT;
break;
case EPOLL_CTL_MOD:
if (epi) {
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_modify(ep, epi, epds);
}
} else
error = -ENOENT;
break;
}
mutex_unlock(&ep->mtx);
error_tgt_fput:
if (full_check) {
clear_tfile_check_list();
loop_check_gen++;
mutex_unlock(&epmutex);
}
fdput(tf);
error_fput:
fdput(f);
error_return:
return error;
}
/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set.
*/
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
struct epoll_event epds;
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
return -EFAULT;
return do_epoll_ctl(epfd, op, fd, &epds, false);
}
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*/
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to)
{
int error;
struct fd f;
struct eventpoll *ep;
/* The maximum number of event must be greater than zero */
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;
/* Verify that the area passed by the user is writeable */
if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
return -EFAULT;
/* Get the "struct file *" for the eventpoll file */
f = fdget(epfd);
if (!f.file)
return -EBADF;
/*
* We have to check that the file structure underneath the fd
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
if (!is_file_epoll(f.file))
goto error_fput;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = f.file->private_data;
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, to);
error_fput:
fdput(f);
return error;
}
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout)
{
struct timespec64 to;
return do_epoll_wait(epfd, events, maxevents,
ep_timeout_to_timespec(&to, timeout));
}
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_pwait(2).
*/
static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to,
const sigset_t __user *sigmask, size_t sigsetsize)
{
int error;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
error = set_user_sigmask(sigmask, sigsetsize);
if (error)
return error;
error = do_epoll_wait(epfd, events, maxevents, to);
restore_saved_sigmask_unless(error == -EINTR);
return error;
}
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
struct timespec64 to;
return do_epoll_pwait(epfd, events, maxevents,
ep_timeout_to_timespec(&to, timeout),
sigmask, sigsetsize);
}
SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
int, maxevents, const struct __kernel_timespec __user *, timeout,
const sigset_t __user *, sigmask, size_t, sigsetsize)
{
struct timespec64 ts, *to = NULL;
if (timeout) {
if (get_timespec64(&ts, timeout))
return -EFAULT;
to = &ts;
if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
return -EINVAL;
}
return do_epoll_pwait(epfd, events, maxevents, to,
sigmask, sigsetsize);
}
#ifdef CONFIG_COMPAT
static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *timeout,
const compat_sigset_t __user *sigmask,
compat_size_t sigsetsize)
{
long err;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
err = set_compat_user_sigmask(sigmask, sigsetsize);
if (err)
return err;
err = do_epoll_wait(epfd, events, maxevents, timeout);
restore_saved_sigmask_unless(err == -EINTR);
return err;
}
COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
struct epoll_event __user *, events,
int, maxevents, int, timeout,
const compat_sigset_t __user *, sigmask,
compat_size_t, sigsetsize)
{
struct timespec64 to;
return do_compat_epoll_pwait(epfd, events, maxevents,
ep_timeout_to_timespec(&to, timeout),
sigmask, sigsetsize);
}
COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
struct epoll_event __user *, events,
int, maxevents,
const struct __kernel_timespec __user *, timeout,
const compat_sigset_t __user *, sigmask,
compat_size_t, sigsetsize)
{
struct timespec64 ts, *to = NULL;
if (timeout) {
if (get_timespec64(&ts, timeout))
return -EFAULT;
to = &ts;
if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
return -EINVAL;
}
return do_compat_epoll_pwait(epfd, events, maxevents, to,
sigmask, sigsetsize);
}
#endif
static int __init eventpoll_init(void)
{
struct sysinfo si;
si_meminfo(&si);
/*
* Allows top 4% of lomem to be allocated for epoll watches (per user).
*/
max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
EP_ITEM_COST;
BUG_ON(max_user_watches < 0);
/*
* We can have many thousands of epitems, so prevent this from
* using an extra cache line on 64-bit (and smaller) CPUs
*/
BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
/* Allocates slab cache used to allocate "struct epitem" items */
epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
/* Allocates slab cache used to allocate "struct eppoll_entry" */
pwq_cache = kmem_cache_create("eventpoll_pwq",
sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
ephead_cache = kmem_cache_create("ep_head",
sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
return 0;
}
fs_initcall(eventpoll_init);
// SPDX-License-Identifier: GPL-2.0
/*
* drivers/base/devres.c - device resource management
*
* Copyright (c) 2006 SUSE Linux Products GmbH
* Copyright (c) 2006 Tejun Heo <teheo@suse.de>
*/
#include <linux/device.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/percpu.h>
#include <asm/sections.h>
#include "base.h"
#include "trace.h"
struct devres_node {
struct list_head entry;
dr_release_t release;
const char *name;
size_t size;
};
struct devres {
struct devres_node node;
/*
* Some archs want to perform DMA into kmalloc caches
* and need a guaranteed alignment larger than
* the alignment of a 64-bit integer.
* Thus we use ARCH_KMALLOC_MINALIGN here and get exactly the same
* buffer alignment as if it was allocated by plain kmalloc().
*/
u8 __aligned(ARCH_KMALLOC_MINALIGN) data[];
};
struct devres_group {
struct devres_node node[2];
void *id;
int color;
/* -- 8 pointers */
};
static void set_node_dbginfo(struct devres_node *node, const char *name,
size_t size)
{
node->name = name;
node->size = size;
}
#ifdef CONFIG_DEBUG_DEVRES
static int log_devres = 0;
module_param_named(log, log_devres, int, S_IRUGO | S_IWUSR);
static void devres_dbg(struct device *dev, struct devres_node *node,
const char *op)
{
if (unlikely(log_devres))
dev_err(dev, "DEVRES %3s %p %s (%zu bytes)\n",
op, node, node->name, node->size);
}
#else /* CONFIG_DEBUG_DEVRES */
#define devres_dbg(dev, node, op) do {} while (0)
#endif /* CONFIG_DEBUG_DEVRES */
static void devres_log(struct device *dev, struct devres_node *node,
const char *op)
{
trace_devres_log(dev, op, node, node->name, node->size);
devres_dbg(dev, node, op);
}
/*
* Release functions for devres group. These callbacks are used only
* for identification.
*/
static void group_open_release(struct device *dev, void *res)
{
/* noop */
}
static void group_close_release(struct device *dev, void *res)
{
/* noop */
}
static struct devres_group * node_to_group(struct devres_node *node)
{
if (node->release == &group_open_release)
return container_of(node, struct devres_group, node[0]);
if (node->release == &group_close_release)
return container_of(node, struct devres_group, node[1]);
return NULL;
}
static bool check_dr_size(size_t size, size_t *tot_size)
{
/* We must catch any near-SIZE_MAX cases that could overflow. */
if (unlikely(check_add_overflow(sizeof(struct devres),
size, tot_size)))
return false;
return true;
}
static __always_inline struct devres * alloc_dr(dr_release_t release,
size_t size, gfp_t gfp, int nid)
{
size_t tot_size;
struct devres *dr;
if (!check_dr_size(size, &tot_size))
return NULL;
dr = kmalloc_node_track_caller(tot_size, gfp, nid);
if (unlikely(!dr))
return NULL;
memset(dr, 0, offsetof(struct devres, data));
INIT_LIST_HEAD(&dr->node.entry);
dr->node.release = release;
return dr;
}
static void add_dr(struct device *dev, struct devres_node *node)
{
devres_log(dev, node, "ADD");
BUG_ON(!list_empty(&node->entry));
list_add_tail(&node->entry, &dev->devres_head);
}
static void replace_dr(struct device *dev,
struct devres_node *old, struct devres_node *new)
{
devres_log(dev, old, "REPLACE");
BUG_ON(!list_empty(&new->entry));
list_replace(&old->entry, &new->entry);
}
/**
* __devres_alloc_node - Allocate device resource data
* @release: Release function devres will be associated with
* @size: Allocation size
* @gfp: Allocation flags
* @nid: NUMA node
* @name: Name of the resource
*
* Allocate devres of @size bytes. The allocated area is zeroed, then
* associated with @release. The returned pointer can be passed to
* other devres_*() functions.
*
* RETURNS:
* Pointer to allocated devres on success, NULL on failure.
*/
void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid,
const char *name)
{
struct devres *dr;
dr = alloc_dr(release, size, gfp | __GFP_ZERO, nid);
if (unlikely(!dr))
return NULL;
set_node_dbginfo(&dr->node, name, size);
return dr->data;
}
EXPORT_SYMBOL_GPL(__devres_alloc_node);
/**
* devres_for_each_res - Resource iterator
* @dev: Device to iterate resource from
* @release: Look for resources associated with this release function
* @match: Match function (optional)
* @match_data: Data for the match function
* @fn: Function to be called for each matched resource.
* @data: Data for @fn, the 3rd parameter of @fn
*
* Call @fn for each devres of @dev which is associated with @release
* and for which @match returns 1.
*
* RETURNS:
* void
*/
void devres_for_each_res(struct device *dev, dr_release_t release,
dr_match_t match, void *match_data,
void (*fn)(struct device *, void *, void *),
void *data)
{
struct devres_node *node;
struct devres_node *tmp;
unsigned long flags;
if (!fn)
return;
spin_lock_irqsave(&dev->devres_lock, flags);
list_for_each_entry_safe_reverse(node, tmp,
&dev->devres_head, entry) {
struct devres *dr = container_of(node, struct devres, node);
if (node->release != release)
continue;
if (match && !match(dev, dr->data, match_data))
continue;
fn(dev, dr->data, data);
}
spin_unlock_irqrestore(&dev->devres_lock, flags);
}
EXPORT_SYMBOL_GPL(devres_for_each_res);
/**
* devres_free - Free device resource data
* @res: Pointer to devres data to free
*
* Free devres created with devres_alloc().
*/
void devres_free(void *res)
{
if (res) {
struct devres *dr = container_of(res, struct devres, data);
BUG_ON(!list_empty(&dr->node.entry));
kfree(dr);
}
}
EXPORT_SYMBOL_GPL(devres_free);
/**
* devres_add - Register device resource
* @dev: Device to add resource to
* @res: Resource to register
*
* Register devres @res to @dev. @res should have been allocated
* using devres_alloc(). On driver detach, the associated release
* function will be invoked and devres will be freed automatically.
*/
void devres_add(struct device *dev, void *res)
{
struct devres *dr = container_of(res, struct devres, data);
unsigned long flags;
spin_lock_irqsave(&dev->devres_lock, flags);
add_dr(dev, &dr->node);
spin_unlock_irqrestore(&dev->devres_lock, flags);
}
EXPORT_SYMBOL_GPL(devres_add);
static struct devres *find_dr(struct device *dev, dr_release_t release,
dr_match_t match, void *match_data)
{
struct devres_node *node;
list_for_each_entry_reverse(node, &dev->devres_head, entry) {
struct devres *dr = container_of(node, struct devres, node);
if (node->release != release)
continue;
if (match && !match(dev, dr->data, match_data))
continue;
return dr;
}
return NULL;
}
/**
* devres_find - Find device resource
* @dev: Device to lookup resource from
* @release: Look for resources associated with this release function
* @match: Match function (optional)
* @match_data: Data for the match function
*
* Find the latest devres of @dev which is associated with @release
* and for which @match returns 1. If @match is NULL, it's considered
* to match all.
*
* RETURNS:
* Pointer to found devres, NULL if not found.
*/
void * devres_find(struct device *dev, dr_release_t release,
dr_match_t match, void *match_data)
{
struct devres *dr;
unsigned long flags;
spin_lock_irqsave(&dev->devres_lock, flags);
dr = find_dr(dev, release, match, match_data);
spin_unlock_irqrestore(&dev->devres_lock, flags);
if (dr)
return dr->data;
return NULL;
}
EXPORT_SYMBOL_GPL(devres_find);
/**
* devres_get - Find devres, if non-existent, add one atomically
* @dev: Device to lookup or add devres for
* @new_res: Pointer to new initialized devres to add if not found
* @match: Match function (optional)
* @match_data: Data for the match function
*
* Find the latest devres of @dev which has the same release function
* as @new_res and for which @match return 1. If found, @new_res is
* freed; otherwise, @new_res is added atomically.
*
* RETURNS:
* Pointer to found or added devres.
*/
void * devres_get(struct device *dev, void *new_res,
dr_match_t match, void *match_data)
{
struct devres *new_dr = container_of(new_res, struct devres, data);
struct devres *dr;
unsigned long flags;
spin_lock_irqsave(&dev->devres_lock, flags);
dr = find_dr(dev, new_dr->node.release, match, match_data);
if (!dr) {
add_dr(dev, &new_dr->node);
dr = new_dr;
new_res = NULL;
}
spin_unlock_irqrestore(&dev->devres_lock, flags);
devres_free(new_res);
return dr->data;
}
EXPORT_SYMBOL_GPL(devres_get);
/**
* devres_remove - Find a device resource and remove it
* @dev: Device to find resource from
* @release: Look for resources associated with this release function
* @match: Match function (optional)
* @match_data: Data for the match function
*
* Find the latest devres of @dev associated with @release and for
* which @match returns 1. If @match is NULL, it's considered to
* match all. If found, the resource is removed atomically and
* returned.
*
* RETURNS:
* Pointer to removed devres on success, NULL if not found.
*/
void * devres_remove(struct device *dev, dr_release_t release,
dr_match_t match, void *match_data)
{
struct devres *dr;
unsigned long flags;
spin_lock_irqsave(&dev->devres_lock, flags);
dr = find_dr(dev, release, match, match_data);
if (dr) {
list_del_init(&dr->node.entry);
devres_log(dev, &dr->node, "REM");
}
spin_unlock_irqrestore(&dev->devres_lock, flags);
if (dr)
return dr->data;
return NULL;
}
EXPORT_SYMBOL_GPL(devres_remove);
/**
* devres_destroy - Find a device resource and destroy it
* @dev: Device to find resource from
* @release: Look for resources associated with this release function
* @match: Match function (optional)
* @match_data: Data for the match function
*
* Find the latest devres of @dev associated with @release and for
* which @match returns 1. If @match is NULL, it's considered to
* match all. If found, the resource is removed atomically and freed.
*
* Note that the release function for the resource will not be called,
* only the devres-allocated data will be freed. The caller becomes
* responsible for freeing any other data.
*
* RETURNS:
* 0 if devres is found and freed, -ENOENT if not found.
*/
int devres_destroy(struct device *dev, dr_release_t release,
dr_match_t match, void *match_data)
{
void *res;
res = devres_remove(dev, release, match, match_data);
if (unlikely(!res))
return -ENOENT;
devres_free(res);
return 0;
}
EXPORT_SYMBOL_GPL(devres_destroy);
/**
* devres_release - Find a device resource and destroy it, calling release
* @dev: Device to find resource from
* @release: Look for resources associated with this release function
* @match: Match function (optional)
* @match_data: Data for the match function
*
* Find the latest devres of @dev associated with @release and for
* which @match returns 1. If @match is NULL, it's considered to
* match all. If found, the resource is removed atomically, the
* release function called and the resource freed.
*
* RETURNS:
* 0 if devres is found and freed, -ENOENT if not found.
*/
int devres_release(struct device *dev, dr_release_t release,
dr_match_t match, void *match_data)
{
void *res;
res = devres_remove(dev, release, match, match_data);
if (unlikely(!res))
return -ENOENT;
(*release)(dev, res);
devres_free(res);
return 0;
}
EXPORT_SYMBOL_GPL(devres_release);
static int remove_nodes(struct device *dev,
struct list_head *first, struct list_head *end,
struct list_head *todo)
{
struct devres_node *node, *n;
int cnt = 0, nr_groups = 0;
/* First pass - move normal devres entries to @todo and clear
* devres_group colors.
*/
node = list_entry(first, struct devres_node, entry);
list_for_each_entry_safe_from(node, n, end, entry) {
struct devres_group *grp;
grp = node_to_group(node);
if (grp) {
/* clear color of group markers in the first pass */
grp->color = 0;
nr_groups++;
} else {
/* regular devres entry */
if (&node->entry == first)
first = first->next;
list_move_tail(&node->entry, todo);
cnt++;
}
}
if (!nr_groups)
return cnt;
/* Second pass - Scan groups and color them. A group gets
* color value of two iff the group is wholly contained in
* [current node, end). That is, for a closed group, both opening
* and closing markers should be in the range, while just the
* opening marker is enough for an open group.
*/
node = list_entry(first, struct devres_node, entry);
list_for_each_entry_safe_from(node, n, end, entry) {
struct devres_group *grp;
grp = node_to_group(node);
BUG_ON(!grp || list_empty(&grp->node[0].entry));
grp->color++;
if (list_empty(&grp->node[1].entry))
grp->color++;
BUG_ON(grp->color <= 0 || grp->color > 2);
if (grp->color == 2) {
/* No need to update current node or end. The removed
* nodes are always before both.
*/
list_move_tail(&grp->node[0].entry, todo);
list_del_init(&grp->node[1].entry);
}
}
return cnt;
}
static void release_nodes(struct device *dev, struct list_head *todo)
{
struct devres *dr, *tmp;
/* Release. Note that both devres and devres_group are
* handled as devres in the following loop. This is safe.
*/
list_for_each_entry_safe_reverse(dr, tmp, todo, node.entry) {
devres_log(dev, &dr->node, "REL");
dr->node.release(dev, dr->data);
kfree(dr);
}
}
/**
* devres_release_all - Release all managed resources
* @dev: Device to release resources for
*
* Release all resources associated with @dev. This function is
* called on driver detach.
*/
int devres_release_all(struct device *dev)
{
unsigned long flags;
LIST_HEAD(todo);
int cnt;
/* Looks like an uninitialized device structure */
if (WARN_ON(dev->devres_head.next == NULL))
return -ENODEV;
/* Nothing to release if list is empty */
if (list_empty(&dev->devres_head))
return 0;
spin_lock_irqsave(&dev->devres_lock, flags);
cnt = remove_nodes(dev, dev->devres_head.next, &dev->devres_head, &todo);
spin_unlock_irqrestore(&dev->devres_lock, flags);
release_nodes(dev, &todo);
return cnt;
}
/**
* devres_open_group - Open a new devres group
* @dev: Device to open devres group for
* @id: Separator ID
* @gfp: Allocation flags
*
* Open a new devres group for @dev with @id. For @id, using a
* pointer to an object which won't be used for another group is
* recommended. If @id is NULL, address-wise unique ID is created.
*
* RETURNS:
* ID of the new group, NULL on failure.
*/
void * devres_open_group(struct device *dev, void *id, gfp_t gfp)
{
struct devres_group *grp;
unsigned long flags;
grp = kmalloc(sizeof(*grp), gfp);
if (unlikely(!grp))
return NULL;
grp->node[0].release = &group_open_release;
grp->node[1].release = &group_close_release;
INIT_LIST_HEAD(&grp->node[0].entry);
INIT_LIST_HEAD(&grp->node[1].entry);
set_node_dbginfo(&grp->node[0], "grp<", 0);
set_node_dbginfo(&grp->node[1], "grp>", 0);
grp->id = grp;
if (id)
grp->id = id;
spin_lock_irqsave(&dev->devres_lock, flags);
add_dr(dev, &grp->node[0]);
spin_unlock_irqrestore(&dev->devres_lock, flags);
return grp->id;
}
EXPORT_SYMBOL_GPL(devres_open_group);
/* Find devres group with ID @id. If @id is NULL, look for the latest. */
static struct devres_group * find_group(struct device *dev, void *id)
{
struct devres_node *node;
list_for_each_entry_reverse(node, &dev->devres_head, entry) {
struct devres_group *grp;
if (node->release != &group_open_release)
continue;
grp = container_of(node, struct devres_group, node[0]);
if (id) {
if (grp->id == id)
return grp;
} else if (list_empty(&grp->node[1].entry))
return grp;
}
return NULL;
}
/**
* devres_close_group - Close a devres group
* @dev: Device to close devres group for
* @id: ID of target group, can be NULL
*
* Close the group identified by @id. If @id is NULL, the latest open
* group is selected.
*/
void devres_close_group(struct device *dev, void *id)
{
struct devres_group *grp;
unsigned long flags;
spin_lock_irqsave(&dev->devres_lock, flags);
grp = find_group(dev, id);
if (grp)
add_dr(dev, &grp->node[1]);
else
WARN_ON(1);
spin_unlock_irqrestore(&dev->devres_lock, flags);
}
EXPORT_SYMBOL_GPL(devres_close_group);
/**
* devres_remove_group - Remove a devres group
* @dev: Device to remove group for
* @id: ID of target group, can be NULL
*
* Remove the group identified by @id. If @id is NULL, the latest
* open group is selected. Note that removing a group doesn't affect
* any other resources.
*/
void devres_remove_group(struct device *dev, void *id)
{
struct devres_group *grp;
unsigned long flags;
spin_lock_irqsave(&dev->devres_lock, flags);
grp = find_group(dev, id);
if (grp) {
list_del_init(&grp->node[0].entry);
list_del_init(&grp->node[1].entry);
devres_log(dev, &grp->node[0], "REM");
} else
WARN_ON(1);
spin_unlock_irqrestore(&dev->devres_lock, flags);
kfree(grp);
}
EXPORT_SYMBOL_GPL(devres_remove_group);
/**
* devres_release_group - Release resources in a devres group
* @dev: Device to release group for
* @id: ID of target group, can be NULL
*
* Release all resources in the group identified by @id. If @id is
* NULL, the latest open group is selected. The selected group and
* groups properly nested inside the selected group are removed.
*
* RETURNS:
* The number of released non-group resources.
*/
int devres_release_group(struct device *dev, void *id)
{
struct devres_group *grp;
unsigned long flags;
LIST_HEAD(todo);
int cnt = 0;
spin_lock_irqsave(&dev->devres_lock, flags);
grp = find_group(dev, id);
if (grp) {
struct list_head *first = &grp->node[0].entry;
struct list_head *end = &dev->devres_head;
if (!list_empty(&grp->node[1].entry))
end = grp->node[1].entry.next;
cnt = remove_nodes(dev, first, end, &todo);
spin_unlock_irqrestore(&dev->devres_lock, flags);
release_nodes(dev, &todo);
} else {
WARN_ON(1);
spin_unlock_irqrestore(&dev->devres_lock, flags);
}
return cnt;
}
EXPORT_SYMBOL_GPL(devres_release_group);
/*
* Custom devres actions allow inserting a simple function call
* into the teadown sequence.
*/
struct action_devres {
void *data;
void (*action)(void *);
};
static int devm_action_match(struct device *dev, void *res, void *p)
{
struct action_devres *devres = res;
struct action_devres *target = p;
return devres->action == target->action &&
devres->data == target->data;
}
static void devm_action_release(struct device *dev, void *res)
{
struct action_devres *devres = res;
devres->action(devres->data);
}
/**
* devm_add_action() - add a custom action to list of managed resources
* @dev: Device that owns the action
* @action: Function that should be called
* @data: Pointer to data passed to @action implementation
*
* This adds a custom action to the list of managed resources so that
* it gets executed as part of standard resource unwinding.
*/
int devm_add_action(struct device *dev, void (*action)(void *), void *data)
{
struct action_devres *devres;
devres = devres_alloc(devm_action_release,
sizeof(struct action_devres), GFP_KERNEL);
if (!devres)
return -ENOMEM;
devres->data = data;
devres->action = action;
devres_add(dev, devres);
return 0;
}
EXPORT_SYMBOL_GPL(devm_add_action);
/**
* devm_remove_action() - removes previously added custom action
* @dev: Device that owns the action
* @action: Function implementing the action
* @data: Pointer to data passed to @action implementation
*
* Removes instance of @action previously added by devm_add_action().
* Both action and data should match one of the existing entries.
*/
void devm_remove_action(struct device *dev, void (*action)(void *), void *data)
{
struct action_devres devres = {
.data = data,
.action = action,
};
WARN_ON(devres_destroy(dev, devm_action_release, devm_action_match,
&devres));
}
EXPORT_SYMBOL_GPL(devm_remove_action);
/**
* devm_release_action() - release previously added custom action
* @dev: Device that owns the action
* @action: Function implementing the action
* @data: Pointer to data passed to @action implementation
*
* Releases and removes instance of @action previously added by
* devm_add_action(). Both action and data should match one of the
* existing entries.
*/
void devm_release_action(struct device *dev, void (*action)(void *), void *data)
{
struct action_devres devres = {
.data = data,
.action = action,
};
WARN_ON(devres_release(dev, devm_action_release, devm_action_match,
&devres));
}
EXPORT_SYMBOL_GPL(devm_release_action);
/*
* Managed kmalloc/kfree
*/
static void devm_kmalloc_release(struct device *dev, void *res)
{
/* noop */
}
static int devm_kmalloc_match(struct device *dev, void *res, void *data)
{
return res == data;
}
/**
* devm_kmalloc - Resource-managed kmalloc
* @dev: Device to allocate memory for
* @size: Allocation size
* @gfp: Allocation gfp flags
*
* Managed kmalloc. Memory allocated with this function is
* automatically freed on driver detach. Like all other devres
* resources, guaranteed alignment is unsigned long long.
*
* RETURNS:
* Pointer to allocated memory on success, NULL on failure.
*/
void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp)
{
struct devres *dr;
if (unlikely(!size))
return ZERO_SIZE_PTR;
/* use raw alloc_dr for kmalloc caller tracing */
dr = alloc_dr(devm_kmalloc_release, size, gfp, dev_to_node(dev));
if (unlikely(!dr))
return NULL;
/*
* This is named devm_kzalloc_release for historical reasons
* The initial implementation did not support kmalloc, only kzalloc
*/
set_node_dbginfo(&dr->node, "devm_kzalloc_release", size);
devres_add(dev, dr->data);
return dr->data;
}
EXPORT_SYMBOL_GPL(devm_kmalloc);
/**
* devm_krealloc - Resource-managed krealloc()
* @dev: Device to re-allocate memory for
* @ptr: Pointer to the memory chunk to re-allocate
* @new_size: New allocation size
* @gfp: Allocation gfp flags
*
* Managed krealloc(). Resizes the memory chunk allocated with devm_kmalloc().
* Behaves similarly to regular krealloc(): if @ptr is NULL or ZERO_SIZE_PTR,
* it's the equivalent of devm_kmalloc(). If new_size is zero, it frees the
* previously allocated memory and returns ZERO_SIZE_PTR. This function doesn't
* change the order in which the release callback for the re-alloc'ed devres
* will be called (except when falling back to devm_kmalloc() or when freeing
* resources when new_size is zero). The contents of the memory are preserved
* up to the lesser of new and old sizes.
*/
void *devm_krealloc(struct device *dev, void *ptr, size_t new_size, gfp_t gfp)
{
size_t total_new_size, total_old_size;
struct devres *old_dr, *new_dr;
unsigned long flags;
if (unlikely(!new_size)) {
devm_kfree(dev, ptr);
return ZERO_SIZE_PTR;
}
if (unlikely(ZERO_OR_NULL_PTR(ptr)))
return devm_kmalloc(dev, new_size, gfp);
if (WARN_ON(is_kernel_rodata((unsigned long)ptr)))
/*
* We cannot reliably realloc a const string returned by
* devm_kstrdup_const().
*/
return NULL;
if (!check_dr_size(new_size, &total_new_size))
return NULL;
total_old_size = ksize(container_of(ptr, struct devres, data));
if (total_old_size == 0) {
WARN(1, "Pointer doesn't point to dynamically allocated memory.");
return NULL;
}
/*
* If new size is smaller or equal to the actual number of bytes
* allocated previously - just return the same pointer.
*/
if (total_new_size <= total_old_size)
return ptr;
/*
* Otherwise: allocate new, larger chunk. We need to allocate before
* taking the lock as most probably the caller uses GFP_KERNEL.
*/
new_dr = alloc_dr(devm_kmalloc_release,
total_new_size, gfp, dev_to_node(dev));
if (!new_dr)
return NULL;
/*
* The spinlock protects the linked list against concurrent
* modifications but not the resource itself.
*/
spin_lock_irqsave(&dev->devres_lock, flags);
old_dr = find_dr(dev, devm_kmalloc_release, devm_kmalloc_match, ptr);
if (!old_dr) {
spin_unlock_irqrestore(&dev->devres_lock, flags);
kfree(new_dr);
WARN(1, "Memory chunk not managed or managed by a different device.");
return NULL;
}
replace_dr(dev, &old_dr->node, &new_dr->node);
spin_unlock_irqrestore(&dev->devres_lock, flags);
/*
* We can copy the memory contents after releasing the lock as we're
* no longer modyfing the list links.
*/
memcpy(new_dr->data, old_dr->data,
total_old_size - offsetof(struct devres, data));
/*
* Same for releasing the old devres - it's now been removed from the
* list. This is also the reason why we must not use devm_kfree() - the
* links are no longer valid.
*/
kfree(old_dr);
return new_dr->data;
}
EXPORT_SYMBOL_GPL(devm_krealloc);
/**
* devm_kstrdup - Allocate resource managed space and
* copy an existing string into that.
* @dev: Device to allocate memory for
* @s: the string to duplicate
* @gfp: the GFP mask used in the devm_kmalloc() call when
* allocating memory
* RETURNS:
* Pointer to allocated string on success, NULL on failure.
*/
char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp)
{
size_t size;
char *buf;
if (!s)
return NULL;
size = strlen(s) + 1;
buf = devm_kmalloc(dev, size, gfp);
if (buf)
memcpy(buf, s, size);
return buf;
}
EXPORT_SYMBOL_GPL(devm_kstrdup);
/**
* devm_kstrdup_const - resource managed conditional string duplication
* @dev: device for which to duplicate the string
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
* Strings allocated by devm_kstrdup_const will be automatically freed when
* the associated device is detached.
*
* RETURNS:
* Source string if it is in .rodata section otherwise it falls back to
* devm_kstrdup.
*/
const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp)
{
if (is_kernel_rodata((unsigned long)s))
return s;
return devm_kstrdup(dev, s, gfp);
}
EXPORT_SYMBOL_GPL(devm_kstrdup_const);
/**
* devm_kvasprintf - Allocate resource managed space and format a string
* into that.
* @dev: Device to allocate memory for
* @gfp: the GFP mask used in the devm_kmalloc() call when
* allocating memory
* @fmt: The printf()-style format string
* @ap: Arguments for the format string
* RETURNS:
* Pointer to allocated string on success, NULL on failure.
*/
char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt,
va_list ap)
{
unsigned int len;
char *p;
va_list aq;
va_copy(aq, ap);
len = vsnprintf(NULL, 0, fmt, aq);
va_end(aq);
p = devm_kmalloc(dev, len+1, gfp);
if (!p)
return NULL;
vsnprintf(p, len+1, fmt, ap);
return p;
}
EXPORT_SYMBOL(devm_kvasprintf);
/**
* devm_kasprintf - Allocate resource managed space and format a string
* into that.
* @dev: Device to allocate memory for
* @gfp: the GFP mask used in the devm_kmalloc() call when
* allocating memory
* @fmt: The printf()-style format string
* @...: Arguments for the format string
* RETURNS:
* Pointer to allocated string on success, NULL on failure.
*/
char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...)
{
va_list ap;
char *p;
va_start(ap, fmt);
p = devm_kvasprintf(dev, gfp, fmt, ap);
va_end(ap);
return p;
}
EXPORT_SYMBOL_GPL(devm_kasprintf);
/**
* devm_kfree - Resource-managed kfree
* @dev: Device this memory belongs to
* @p: Memory to free
*
* Free memory allocated with devm_kmalloc().
*/
void devm_kfree(struct device *dev, const void *p)
{
int rc;
/*
* Special cases: pointer to a string in .rodata returned by
* devm_kstrdup_const() or NULL/ZERO ptr.
*/
if (unlikely(is_kernel_rodata((unsigned long)p) || ZERO_OR_NULL_PTR(p)))
return;
rc = devres_destroy(dev, devm_kmalloc_release,
devm_kmalloc_match, (void *)p);
WARN_ON(rc);
}
EXPORT_SYMBOL_GPL(devm_kfree);
/**
* devm_kmemdup - Resource-managed kmemdup
* @dev: Device this memory belongs to
* @src: Memory region to duplicate
* @len: Memory region length
* @gfp: GFP mask to use
*
* Duplicate region of a memory using resource managed kmalloc
*/
void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp)
{
void *p;
p = devm_kmalloc(dev, len, gfp);
if (p)
memcpy(p, src, len);
return p;
}
EXPORT_SYMBOL_GPL(devm_kmemdup);
struct pages_devres {
unsigned long addr;
unsigned int order;
};
static int devm_pages_match(struct device *dev, void *res, void *p)
{
struct pages_devres *devres = res;
struct pages_devres *target = p;
return devres->addr == target->addr;
}
static void devm_pages_release(struct device *dev, void *res)
{
struct pages_devres *devres = res;
free_pages(devres->addr, devres->order);
}
/**
* devm_get_free_pages - Resource-managed __get_free_pages
* @dev: Device to allocate memory for
* @gfp_mask: Allocation gfp flags
* @order: Allocation size is (1 << order) pages
*
* Managed get_free_pages. Memory allocated with this function is
* automatically freed on driver detach.
*
* RETURNS:
* Address of allocated memory on success, 0 on failure.
*/
unsigned long devm_get_free_pages(struct device *dev,
gfp_t gfp_mask, unsigned int order)
{
struct pages_devres *devres;
unsigned long addr;
addr = __get_free_pages(gfp_mask, order);
if (unlikely(!addr))
return 0;
devres = devres_alloc(devm_pages_release,
sizeof(struct pages_devres), GFP_KERNEL);
if (unlikely(!devres)) {
free_pages(addr, order);
return 0;
}
devres->addr = addr;
devres->order = order;
devres_add(dev, devres);
return addr;
}
EXPORT_SYMBOL_GPL(devm_get_free_pages);
/**
* devm_free_pages - Resource-managed free_pages
* @dev: Device this memory belongs to
* @addr: Memory to free
*
* Free memory allocated with devm_get_free_pages(). Unlike free_pages,
* there is no need to supply the @order.
*/
void devm_free_pages(struct device *dev, unsigned long addr)
{
struct pages_devres devres = { .addr = addr };
WARN_ON(devres_release(dev, devm_pages_release, devm_pages_match,
&devres));
}
EXPORT_SYMBOL_GPL(devm_free_pages);
static void devm_percpu_release(struct device *dev, void *pdata)
{
void __percpu *p;
p = *(void __percpu **)pdata;
free_percpu(p);
}
static int devm_percpu_match(struct device *dev, void *data, void *p)
{
struct devres *devr = container_of(data, struct devres, data);
return *(void **)devr->data == p;
}
/**
* __devm_alloc_percpu - Resource-managed alloc_percpu
* @dev: Device to allocate per-cpu memory for
* @size: Size of per-cpu memory to allocate
* @align: Alignment of per-cpu memory to allocate
*
* Managed alloc_percpu. Per-cpu memory allocated with this function is
* automatically freed on driver detach.
*
* RETURNS:
* Pointer to allocated memory on success, NULL on failure.
*/
void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
size_t align)
{
void *p;
void __percpu *pcpu;
pcpu = __alloc_percpu(size, align);
if (!pcpu)
return NULL;
p = devres_alloc(devm_percpu_release, sizeof(void *), GFP_KERNEL);
if (!p) {
free_percpu(pcpu);
return NULL;
}
*(void __percpu **)p = pcpu;
devres_add(dev, p);
return pcpu;
}
EXPORT_SYMBOL_GPL(__devm_alloc_percpu);
/**
* devm_free_percpu - Resource-managed free_percpu
* @dev: Device this memory belongs to
* @pdata: Per-cpu memory to free
*
* Free memory allocated with devm_alloc_percpu().
*/
void devm_free_percpu(struct device *dev, void __percpu *pdata)
{
WARN_ON(devres_destroy(dev, devm_percpu_release, devm_percpu_match,
(__force void *)pdata));
}
EXPORT_SYMBOL_GPL(devm_free_percpu);
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Generic Timer-queue
*
* Manages a simple queue of timers, ordered by expiration time.
* Uses rbtrees for quick list adds and expiration.
*
* NOTE: All of the following functions need to be serialized
* to avoid races. No locking is done by this library code.
*/
#include <linux/bug.h>
#include <linux/timerqueue.h>
#include <linux/rbtree.h>
#include <linux/export.h>
#define __node_2_tq(_n) \
rb_entry((_n), struct timerqueue_node, node)
static inline bool __timerqueue_less(struct rb_node *a, const struct rb_node *b)
{
return __node_2_tq(a)->expires < __node_2_tq(b)->expires;
}
/**
* timerqueue_add - Adds timer to timerqueue.
*
* @head: head of timerqueue
* @node: timer node to be added
*
* Adds the timer node to the timerqueue, sorted by the node's expires
* value. Returns true if the newly added timer is the first expiring timer in
* the queue.
*/
bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
{
/* Make sure we don't add nodes that are already added */
WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); return rb_add_cached(&node->node, &head->rb_root, __timerqueue_less);
}
EXPORT_SYMBOL_GPL(timerqueue_add);
/**
* timerqueue_del - Removes a timer from the timerqueue.
*
* @head: head of timerqueue
* @node: timer node to be removed
*
* Removes the timer node from the timerqueue. Returns true if the queue is
* not empty after the remove.
*/
bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
{
WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));
rb_erase_cached(&node->node, &head->rb_root);
RB_CLEAR_NODE(&node->node);
return !RB_EMPTY_ROOT(&head->rb_root.rb_root);
}
EXPORT_SYMBOL_GPL(timerqueue_del);
/**
* timerqueue_iterate_next - Returns the timer after the provided timer
*
* @node: Pointer to a timer.
*
* Provides the timer that is after the given node. This is used, when
* necessary, to iterate through the list of timers in a timer list
* without modifying the list.
*/
struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node)
{
struct rb_node *next;
if (!node)
return NULL;
next = rb_next(&node->node);
if (!next)
return NULL;
return container_of(next, struct timerqueue_node, node);
}
EXPORT_SYMBOL_GPL(timerqueue_iterate_next);
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/fs/buffer.c
*
* Copyright (C) 1991, 1992, 2002 Linus Torvalds
*/
/*
* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
*
* Removed a lot of unnecessary code and simplified things now that
* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
*
* Speed up hash, lru, and free list operations. Use gfp() for allocating
* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
*
* Added 32k buffer block sizes - these are required older ARM systems. - RMK
*
* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
*/
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <trace/events/block.h>
#include <linux/fscrypt.h>
#include "internal.h"
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
enum rw_hint hint, struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
inline void touch_buffer(struct buffer_head *bh)
{
trace_block_touch_buffer(bh);
mark_page_accessed(bh->b_page);
}
EXPORT_SYMBOL(touch_buffer);
void __lock_buffer(struct buffer_head *bh)
{
wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);
void unlock_buffer(struct buffer_head *bh)
{
clear_bit_unlock(BH_Lock, &bh->b_state);
smp_mb__after_atomic();
wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);
/*
* Returns if the page has dirty or writeback buffers. If all the buffers
* are unlocked and clean then the PageDirty information is stale. If
* any of the pages are locked, it is assumed they are locked for IO.
*/
void buffer_check_dirty_writeback(struct page *page,
bool *dirty, bool *writeback)
{
struct buffer_head *head, *bh;
*dirty = false;
*writeback = false;
BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
return;
if (PageWriteback(page))
*writeback = true;
head = page_buffers(page);
bh = head;
do {
if (buffer_locked(bh))
*writeback = true;
if (buffer_dirty(bh))
*dirty = true;
bh = bh->b_this_page;
} while (bh != head);
}
EXPORT_SYMBOL(buffer_check_dirty_writeback);
/*
* Block until a buffer comes unlocked. This doesn't stop it
* from becoming locked again - you have to lock it yourself
* if you want to preserve its state.
*/
void __wait_on_buffer(struct buffer_head * bh)
{
wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);
static void buffer_io_error(struct buffer_head *bh, char *msg)
{
if (!test_bit(BH_Quiet, &bh->b_state))
printk_ratelimited(KERN_ERR
"Buffer I/O error on dev %pg, logical block %llu%s\n",
bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
}
/*
* End-of-IO handler helper function which does not touch the bh after
* unlocking it.
* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
* a race there is benign: unlock_buffer() only use the bh's address for
* hashing after unlocking the buffer, so it doesn't actually touch the bh
* itself.
*/
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
if (uptodate) {
set_buffer_uptodate(bh);
} else {
/* This happens, due to failed read-ahead attempts. */
clear_buffer_uptodate(bh);
}
unlock_buffer(bh);
}
/*
* Default synchronous end-of-IO handler.. Just mark it up-to-date and
* unlock the buffer. This is what ll_rw_block uses too.
*/
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
__end_buffer_read_notouch(bh, uptodate);
put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
if (uptodate) {
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh, ", lost sync page write");
mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
}
unlock_buffer(bh);
put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);
/*
* Various filesystems appear to want __find_get_block to be non-blocking.
* But it's the page lock which protects the buffers. To get around this,
* we get exclusion from try_to_free_buffers with the blockdev mapping's
* private_lock.
*
* Hack idea: for the blockdev mapping, private_lock contention
* may be quite high. This code could TryLock the page, and if that
* succeeds, there is no need to take private_lock.
*/
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
struct inode *bd_inode = bdev->bd_inode;
struct address_space *bd_mapping = bd_inode->i_mapping;
struct buffer_head *ret = NULL;
pgoff_t index;
struct buffer_head *bh;
struct buffer_head *head;
struct page *page;
int all_mapped = 1;
static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
if (!page)
goto out;
spin_lock(&bd_mapping->private_lock);
if (!page_has_buffers(page))
goto out_unlock;
head = page_buffers(page);
bh = head;
do {
if (!buffer_mapped(bh))
all_mapped = 0;
else if (bh->b_blocknr == block) {
ret = bh;
get_bh(bh);
goto out_unlock;
}
bh = bh->b_this_page;
} while (bh != head);
/* we might be here because some of the buffers on this page are
* not mapped. This is due to various races between
* file io on the block device and getblk. It gets dealt with
* elsewhere, don't buffer_error if we had some unmapped buffers
*/
ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
if (all_mapped && __ratelimit(&last_warned)) {
printk("__find_get_block_slow() failed. block=%llu, "
"b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
"device %pg blocksize: %d\n",
(unsigned long long)block,
(unsigned long long)bh->b_blocknr,
bh->b_state, bh->b_size, bdev,
1 << bd_inode->i_blkbits);
}
out_unlock:
spin_unlock(&bd_mapping->private_lock);
put_page(page);
out:
return ret;
}
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
unsigned long flags;
struct buffer_head *first;
struct buffer_head *tmp;
struct page *page;
int page_uptodate = 1;
BUG_ON(!buffer_async_read(bh));
page = bh->b_page;
if (uptodate) {
set_buffer_uptodate(bh);
} else {
clear_buffer_uptodate(bh);
buffer_io_error(bh, ", async page read");
SetPageError(page);
}
/*
* Be _very_ careful from here on. Bad things can happen if
* two buffer heads end IO at almost the same time and both
* decide that the page is now completely done.
*/
first = page_buffers(page);
spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
do {
if (!buffer_uptodate(tmp))
page_uptodate = 0;
if (buffer_async_read(tmp)) {
BUG_ON(!buffer_locked(tmp));
goto still_busy;
}
tmp = tmp->b_this_page;
} while (tmp != bh);
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
/*
* If none of the buffers had errors and they are all
* uptodate then we can set the page uptodate.
*/
if (page_uptodate && !PageError(page))
SetPageUptodate(page);
unlock_page(page);
return;
still_busy:
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
struct decrypt_bh_ctx {
struct work_struct work;
struct buffer_head *bh;
};
static void decrypt_bh(struct work_struct *work)
{
struct decrypt_bh_ctx *ctx =
container_of(work, struct decrypt_bh_ctx, work);
struct buffer_head *bh = ctx->bh;
int err;
err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size,
bh_offset(bh));
end_buffer_async_read(bh, err == 0);
kfree(ctx);
}
/*
* I/O completion handler for block_read_full_page() - pages
* which come unlocked at the end of I/O.
*/
static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{
/* Decrypt if needed */
if (uptodate &&
fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) {
struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
if (ctx) {
INIT_WORK(&ctx->work, decrypt_bh);
ctx->bh = bh;
fscrypt_enqueue_decrypt_work(&ctx->work);
return;
}
uptodate = 0;
}
end_buffer_async_read(bh, uptodate);
}
/*
* Completion handler for block_write_full_page() - pages which are unlocked
* during I/O, and which have PageWriteback cleared upon I/O completion.
*/
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
unsigned long flags;
struct buffer_head *first;
struct buffer_head *tmp;
struct page *page;
BUG_ON(!buffer_async_write(bh));
page = bh->b_page;
if (uptodate) {
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh, ", lost async page write");
mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
SetPageError(page);
}
first = page_buffers(page);
spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_write(bh);
unlock_buffer(bh);
tmp = bh->b_this_page;
while (tmp != bh) {
if (buffer_async_write(tmp)) {
BUG_ON(!buffer_locked(tmp));
goto still_busy;
}
tmp = tmp->b_this_page;
}
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
end_page_writeback(page);
return;
still_busy:
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
EXPORT_SYMBOL(end_buffer_async_write);
/*
* If a page's buffers are under async readin (end_buffer_async_read
* completion) then there is a possibility that another thread of
* control could lock one of the buffers after it has completed
* but while some of the other buffers have not completed. This
* locked buffer would confuse end_buffer_async_read() into not unlocking
* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
* that this buffer is not under async I/O.
*
* The page comes unlocked when it has no locked buffer_async buffers
* left.
*
* PageLocked prevents anyone starting new async I/O reads any of
* the buffers.
*
* PageWriteback is used to prevent simultaneous writeout of the same
* page.
*
* PageLocked prevents anyone from starting writeback of a page which is
* under read I/O (PageWriteback is only ever set against a locked page).
*/
static void mark_buffer_async_read(struct buffer_head *bh)
{
bh->b_end_io = end_buffer_async_read_io;
set_buffer_async_read(bh);
}
static void mark_buffer_async_write_endio(struct buffer_head *bh,
bh_end_io_t *handler)
{
bh->b_end_io = handler;
set_buffer_async_write(bh);
}
void mark_buffer_async_write(struct buffer_head *bh)
{
mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);
/*
* fs/buffer.c contains helper functions for buffer-backed address space's
* fsync functions. A common requirement for buffer-based filesystems is
* that certain data from the backing blockdev needs to be written out for
* a successful fsync(). For example, ext2 indirect blocks need to be
* written back and waited upon before fsync() returns.
*
* The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
* inode_has_buffers() and invalidate_inode_buffers() are provided for the
* management of a list of dependent buffers at ->i_mapping->private_list.
*
* Locking is a little subtle: try_to_free_buffers() will remove buffers
* from their controlling inode's queue when they are being freed. But
* try_to_free_buffers() will be operating against the *blockdev* mapping
* at the time, not against the S_ISREG file which depends on those buffers.
* So the locking for private_list is via the private_lock in the address_space
* which backs the buffers. Which is different from the address_space
* against which the buffers are listed. So for a particular address_space,
* mapping->private_lock does *not* protect mapping->private_list! In fact,
* mapping->private_list will always be protected by the backing blockdev's
* ->private_lock.
*
* Which introduces a requirement: all buffers on an address_space's
* ->private_list must be from the same address_space: the blockdev's.
*
* address_spaces which do not place buffers at ->private_list via these
* utility functions are free to use private_lock and private_list for
* whatever they want. The only requirement is that list_empty(private_list)
* be true at clear_inode() time.
*
* FIXME: clear_inode should not call invalidate_inode_buffers(). The
* filesystems should do that. invalidate_inode_buffers() should just go
* BUG_ON(!list_empty).
*
* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
* take an address_space, not an inode. And it should be called
* mark_buffer_dirty_fsync() to clearly define why those buffers are being
* queued up.
*
* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
* list if it is already on a list. Because if the buffer is on a list,
* it *must* already be on the right one. If not, the filesystem is being
* silly. This will save a ton of locking. But first we have to ensure
* that buffers are taken *off* the old inode's list when they are freed
* (presumably in truncate). That requires careful auditing of all
* filesystems (do it inside bforget()). It could also be done by bringing
* b_inode back.
*/
/*
* The buffer's backing address_space's private_lock must be held
*/
static void __remove_assoc_queue(struct buffer_head *bh)
{
list_del_init(&bh->b_assoc_buffers); WARN_ON(!bh->b_assoc_map); bh->b_assoc_map = NULL;
}
int inode_has_buffers(struct inode *inode)
{
return !list_empty(&inode->i_data.private_list);
}
/*
* osync is designed to support O_SYNC io. It waits synchronously for
* all already-submitted IO to complete, but does not queue any new
* writes to the disk.
*
* To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
* you dirty the buffers, and then use osync_inode_buffers to wait for
* completion. Any other dirty buffers which are not yet queued for
* write will not be flushed to disk by the osync.
*/
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
struct buffer_head *bh;
struct list_head *p;
int err = 0;
spin_lock(lock);
repeat:
list_for_each_prev(p, list) {
bh = BH_ENTRY(p);
if (buffer_locked(bh)) {
get_bh(bh);
spin_unlock(lock);
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
err = -EIO;
brelse(bh);
spin_lock(lock);
goto repeat;
}
}
spin_unlock(lock);
return err;
}
void emergency_thaw_bdev(struct super_block *sb)
{
while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
}
/**
* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
* @mapping: the mapping which wants those buffers written
*
* Starts I/O against the buffers at mapping->private_list, and waits upon
* that I/O.
*
* Basically, this is a convenience function for fsync().
* @mapping is a file or directory which needs those buffers to be written for
* a successful fsync().
*/
int sync_mapping_buffers(struct address_space *mapping)
{
struct address_space *buffer_mapping = mapping->private_data;
if (buffer_mapping == NULL || list_empty(&mapping->private_list))
return 0;
return fsync_buffers_list(&buffer_mapping->private_lock,
&mapping->private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);
/*
* Called when we've recently written block `bblock', and it is known that
* `bblock' was for a buffer_boundary() buffer. This means that the block at
* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
* dirty, schedule it for IO. So that indirects merge nicely with their data.
*/
void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize)
{
struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
if (bh) {
if (buffer_dirty(bh))
ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
put_bh(bh);
}
}
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
struct address_space *mapping = inode->i_mapping;
struct address_space *buffer_mapping = bh->b_page->mapping;
mark_buffer_dirty(bh);
if (!mapping->private_data) {
mapping->private_data = buffer_mapping;
} else {
BUG_ON(mapping->private_data != buffer_mapping);
}
if (!bh->b_assoc_map) {
spin_lock(&buffer_mapping->private_lock);
list_move_tail(&bh->b_assoc_buffers,
&mapping->private_list);
bh->b_assoc_map = mapping;
spin_unlock(&buffer_mapping->private_lock);
}
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);
/*
* Add a page to the dirty page list.
*
* It is a sad fact of life that this function is called from several places
* deeply under spinlocking. It may not sleep.
*
* If the page has buffers, the uptodate buffers are set dirty, to preserve
* dirty-state coherency between the page and the buffers. It the page does
* not have buffers then when they are later attached they will all be set
* dirty.
*
* The buffers are dirtied before the page is dirtied. There's a small race
* window in which a writepage caller may see the page cleanness but not the
* buffer dirtiness. That's fine. If this code were to set the page dirty
* before the buffers, a concurrent writepage caller could clear the page dirty
* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
* page on the dirty page list.
*
* We use private_lock to lock against try_to_free_buffers while using the
* page's buffer list. Also use this to protect against clean buffers being
* added to the page after it was set dirty.
*
* FIXME: may need to call ->reservepage here as well. That's rather up to the
* address_space though.
*/
int __set_page_dirty_buffers(struct page *page)
{
int newly_dirty;
struct address_space *mapping = page_mapping(page);
if (unlikely(!mapping))
return !TestSetPageDirty(page);
spin_lock(&mapping->private_lock);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
do {
set_buffer_dirty(bh);
bh = bh->b_this_page;
} while (bh != head);
}
/*
* Lock out page's memcg migration to keep PageDirty
* synchronized with per-memcg dirty page counters.
*/
lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
if (newly_dirty)
__set_page_dirty(page, mapping, 1);
unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);
/*
* Write out and wait upon a list of buffers.
*
* We have conflicting pressures: we want to make sure that all
* initially dirty buffers get waited on, but that any subsequently
* dirtied buffers don't. After all, we don't want fsync to last
* forever if somebody is actively writing to the file.
*
* Do this in two main stages: first we copy dirty buffers to a
* temporary inode list, queueing the writes as we go. Then we clean
* up, waiting for those writes to complete.
*
* During this second stage, any subsequent updates to the file may end
* up refiling the buffer on the original inode's dirty list again, so
* there is a chance we will end up with a buffer queued for write but
* not yet completed on that list. So, as a final cleanup we go through
* the osync code to catch these locked, dirty buffers without requeuing
* any newly dirty buffers for write.
*/
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
struct buffer_head *bh;
struct list_head tmp;
struct address_space *mapping;
int err = 0, err2;
struct blk_plug plug;
INIT_LIST_HEAD(&tmp);
blk_start_plug(&plug);
spin_lock(lock);
while (!list_empty(list)) {
bh = BH_ENTRY(list->next);
mapping = bh->b_assoc_map;
__remove_assoc_queue(bh);
/* Avoid race with mark_buffer_dirty_inode() which does
* a lockless check and we rely on seeing the dirty bit */
smp_mb();
if (buffer_dirty(bh) || buffer_locked(bh)) {
list_add(&bh->b_assoc_buffers, &tmp);
bh->b_assoc_map = mapping;
if (buffer_dirty(bh)) {
get_bh(bh);
spin_unlock(lock);
/*
* Ensure any pending I/O completes so that
* write_dirty_buffer() actually writes the
* current contents - it is a noop if I/O is
* still in flight on potentially older
* contents.
*/
write_dirty_buffer(bh, REQ_SYNC);
/*
* Kick off IO for the previous mapping. Note
* that we will not run the very last mapping,
* wait_on_buffer() will do that for us
* through sync_buffer().
*/
brelse(bh);
spin_lock(lock);
}
}
}
spin_unlock(lock);
blk_finish_plug(&plug);
spin_lock(lock);
while (!list_empty(&tmp)) {
bh = BH_ENTRY(tmp.prev);
get_bh(bh);
mapping = bh->b_assoc_map;
__remove_assoc_queue(bh);
/* Avoid race with mark_buffer_dirty_inode() which does
* a lockless check and we rely on seeing the dirty bit */
smp_mb();
if (buffer_dirty(bh)) {
list_add(&bh->b_assoc_buffers,
&mapping->private_list);
bh->b_assoc_map = mapping;
}
spin_unlock(lock);
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
err = -EIO;
brelse(bh);
spin_lock(lock);
}
spin_unlock(lock);
err2 = osync_buffers_list(lock, list);
if (err)
return err;
else
return err2;
}
/*
* Invalidate any and all dirty buffers on a given inode. We are
* probably unmounting the fs, but that doesn't mean we have already
* done a sync(). Just drop the buffers from the inode list.
*
* NOTE: we take the inode's blockdev's mapping's private_lock. Which
* assumes that all the buffers are against the blockdev. Not true
* for reiserfs.
*/
void invalidate_inode_buffers(struct inode *inode)
{
if (inode_has_buffers(inode)) {
struct address_space *mapping = &inode->i_data;
struct list_head *list = &mapping->private_list;
struct address_space *buffer_mapping = mapping->private_data;
spin_lock(&buffer_mapping->private_lock);
while (!list_empty(list))
__remove_assoc_queue(BH_ENTRY(list->next));
spin_unlock(&buffer_mapping->private_lock);
}
}
EXPORT_SYMBOL(invalidate_inode_buffers);
/*
* Remove any clean buffers from the inode's buffer list. This is called
* when we're trying to free the inode itself. Those buffers can pin it.
*
* Returns true if all buffers were removed.
*/
int remove_inode_buffers(struct inode *inode)
{
int ret = 1;
if (inode_has_buffers(inode)) {
struct address_space *mapping = &inode->i_data;
struct list_head *list = &mapping->private_list;
struct address_space *buffer_mapping = mapping->private_data;
spin_lock(&buffer_mapping->private_lock);
while (!list_empty(list)) {
struct buffer_head *bh = BH_ENTRY(list->next);
if (buffer_dirty(bh)) {
ret = 0;
break;
}
__remove_assoc_queue(bh);
}
spin_unlock(&buffer_mapping->private_lock);
}
return ret;
}
/*
* Create the appropriate buffers when given a page for data area and
* the size of each buffer.. Use the bh->b_this_page linked list to
* follow the buffers created. Return NULL if unable to create more
* buffers.
*
* The retry flag is used to differentiate async IO (paging, swapping)
* which may not fail from ordinary buffer allocations.
*/
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
bool retry)
{
struct buffer_head *bh, *head;
gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
long offset;
struct mem_cgroup *memcg, *old_memcg;
if (retry)
gfp |= __GFP_NOFAIL;
/* The page lock pins the memcg */
memcg = page_memcg(page);
old_memcg = set_active_memcg(memcg);
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) { bh = alloc_buffer_head(gfp);
if (!bh)
goto no_grow;
bh->b_this_page = head;
bh->b_blocknr = -1;
head = bh;
bh->b_size = size;
/* Link the buffer to its page */
set_bh_page(bh, page, offset);
}
out:
set_active_memcg(old_memcg);
return head;
/*
* In case anything failed, we just free everything we got.
*/
no_grow:
if (head) {
do {
bh = head;
head = head->b_this_page;
free_buffer_head(bh);
} while (head);
}
goto out;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
static inline void
link_dev_buffers(struct page *page, struct buffer_head *head)
{
struct buffer_head *bh, *tail;
bh = head;
do {
tail = bh;
bh = bh->b_this_page;
} while (bh);
tail->b_this_page = head;
attach_page_private(page, head);
}
static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
{
sector_t retval = ~((sector_t)0);
loff_t sz = i_size_read(bdev->bd_inode);
if (sz) {
unsigned int sizebits = blksize_bits(size);
retval = (sz >> sizebits);
}
return retval;
}
/*
* Initialise the state of a blockdev page's buffers.
*/
static sector_t
init_page_buffers(struct page *page, struct block_device *bdev,
sector_t block, int size)
{
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
int uptodate = PageUptodate(page);
sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
do {
if (!buffer_mapped(bh)) {
bh->b_end_io = NULL;
bh->b_private = NULL;
bh->b_bdev = bdev;
bh->b_blocknr = block;
if (uptodate)
set_buffer_uptodate(bh);
if (block < end_block)
set_buffer_mapped(bh);
}
block++;
bh = bh->b_this_page;
} while (bh != head);
/*
* Caller needs to validate requested block against end of device.
*/
return end_block;
}
/*
* Create the page-cache page that contains the requested block.
*
* This is used purely for blockdev mappings.
*/
static int
grow_dev_page(struct block_device *bdev, sector_t block,
pgoff_t index, int size, int sizebits, gfp_t gfp)
{
struct inode *inode = bdev->bd_inode;
struct page *page;
struct buffer_head *bh;
sector_t end_block;
int ret = 0;
gfp_t gfp_mask;
gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
/*
* XXX: __getblk_slow() can not really deal with failure and
* will endlessly loop on improvised global reclaim. Prefer
* looping in the allocator rather than here, at least that
* code knows what it's doing.
*/
gfp_mask |= __GFP_NOFAIL;
page = find_or_create_page(inode->i_mapping, index, gfp_mask);
BUG_ON(!PageLocked(page));
if (page_has_buffers(page)) {
bh = page_buffers(page);
if (bh->b_size == size) {
end_block = init_page_buffers(page, bdev,
(sector_t)index << sizebits,
size);
goto done;
}
if (!try_to_free_buffers(page))
goto failed;
}
/*
* Allocate some buffers for this page
*/
bh = alloc_page_buffers(page, size, true);
/*
* Link the page to the buffers and initialise them. Take the
* lock to be atomic wrt __find_get_block(), which does not
* run under the page lock.
*/
spin_lock(&inode->i_mapping->private_lock);
link_dev_buffers(page, bh);
end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
size);
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
failed:
unlock_page(page);
put_page(page);
return ret;
}
/*
* Create buffers for the specified block device block's page. If
* that page was dirty, the buffers are set dirty also.
*/
static int
grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
{
pgoff_t index;
int sizebits;
sizebits = PAGE_SHIFT - __ffs(size);
index = block >> sizebits;
/*
* Check for a block which wants to lie outside our maximum possible
* pagecache index. (this comparison is done using sector_t types).
*/
if (unlikely(index != block >> sizebits)) {
printk(KERN_ERR "%s: requested out-of-range block %llu for "
"device %pg\n",
__func__, (unsigned long long)block,
bdev);
return -EIO;
}
/* Create a page with the proper size buffers.. */
return grow_dev_page(bdev, block, index, size, sizebits, gfp);
}
static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
/* Size must be multiple of hard sectorsize */
if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
(size < 512 || size > PAGE_SIZE))) {
printk(KERN_ERR "getblk(): invalid block size %d requested\n",
size);
printk(KERN_ERR "logical block size: %d\n",
bdev_logical_block_size(bdev));
dump_stack();
return NULL;
}
for (;;) {
struct buffer_head *bh;
int ret;
bh = __find_get_block(bdev, block, size);
if (bh)
return bh;
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
}
}
/*
* The relationship between dirty buffers and dirty pages:
*
* Whenever a page has any dirty buffers, the page's dirty bit is set, and
* the page is tagged dirty in the page cache.
*
* At all times, the dirtiness of the buffers represents the dirtiness of
* subsections of the page. If the page has buffers, the page dirty bit is
* merely a hint about the true dirty state.
*
* When a page is set dirty in its entirety, all its buffers are marked dirty
* (if the page has buffers).
*
* When a buffer is marked dirty, its page is dirtied, but the page's other
* buffers are not.
*
* Also. When blockdev buffers are explicitly read with bread(), they
* individually become uptodate. But their backing page remains not
* uptodate - even if all of its buffers are uptodate. A subsequent
* block_read_full_page() against that page will discover all the uptodate
* buffers, will set the page uptodate and will perform no I/O.
*/
/**
* mark_buffer_dirty - mark a buffer_head as needing writeout
* @bh: the buffer_head to mark dirty
*
* mark_buffer_dirty() will set the dirty bit against the buffer, then set
* its backing page dirty, then tag the page as dirty in the page cache
* and then attach the address_space's inode to its superblock's dirty
* inode list.
*
* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
* i_pages lock and mapping->host->i_lock.
*/
void mark_buffer_dirty(struct buffer_head *bh)
{
WARN_ON_ONCE(!buffer_uptodate(bh));
trace_block_dirty_buffer(bh);
/*
* Very *carefully* optimize the it-is-already-dirty case.
*
* Don't let the final "is it dirty" escape to before we
* perhaps modified the buffer.
*/
if (buffer_dirty(bh)) {
smp_mb();
if (buffer_dirty(bh))
return;
}
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
struct address_space *mapping = NULL;
lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
mapping = page_mapping(page);
if (mapping)
__set_page_dirty(page, mapping, 0);
}
unlock_page_memcg(page);
if (mapping)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
}
EXPORT_SYMBOL(mark_buffer_dirty);
void mark_buffer_write_io_error(struct buffer_head *bh)
{
struct super_block *sb;
set_buffer_write_io_error(bh);
/* FIXME: do we need to set this in both places? */
if (bh->b_page && bh->b_page->mapping)
mapping_set_error(bh->b_page->mapping, -EIO);
if (bh->b_assoc_map)
mapping_set_error(bh->b_assoc_map, -EIO);
rcu_read_lock();
sb = READ_ONCE(bh->b_bdev->bd_super);
if (sb)
errseq_set(&sb->s_wb_err, -EIO);
rcu_read_unlock();
}
EXPORT_SYMBOL(mark_buffer_write_io_error);
/*
* Decrement a buffer_head's reference count. If all buffers against a page
* have zero reference count, are clean and unlocked, and if the page is clean
* and unlocked then try_to_free_buffers() may strip the buffers from the page
* in preparation for freeing it (sometimes, rarely, buffers are removed from
* a page but it ends up not being freed, and buffers may later be reattached).
*/
void __brelse(struct buffer_head * buf)
{
if (atomic_read(&buf->b_count)) {
put_bh(buf);
return;
}
WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
}
EXPORT_SYMBOL(__brelse);
/*
* bforget() is like brelse(), except it discards any
* potentially dirty data.
*/
void __bforget(struct buffer_head *bh)
{
clear_buffer_dirty(bh);
if (bh->b_assoc_map) {
struct address_space *buffer_mapping = bh->b_page->mapping;
spin_lock(&buffer_mapping->private_lock);
list_del_init(&bh->b_assoc_buffers);
bh->b_assoc_map = NULL;
spin_unlock(&buffer_mapping->private_lock);
}
__brelse(bh);
}
EXPORT_SYMBOL(__bforget);
static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
lock_buffer(bh);
if (buffer_uptodate(bh)) {
unlock_buffer(bh);
return bh;
} else {
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(REQ_OP_READ, 0, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return bh;
}
brelse(bh);
return NULL;
}
/*
* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
* refcount elevated by one when they're in an LRU. A buffer can only appear
* once in a particular CPU's LRU. A single buffer can be present in multiple
* CPU's LRUs at the same time.
*
* This is a transparent caching front-end to sb_bread(), sb_getblk() and
* sb_find_get_block().
*
* The LRUs themselves only need locking against invalidate_bh_lrus. We use
* a local interrupt disable for that.
*/
#define BH_LRU_SIZE 16
struct bh_lru {
struct buffer_head *bhs[BH_LRU_SIZE];
};
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
#ifdef CONFIG_SMP
#define bh_lru_lock() local_irq_disable()
#define bh_lru_unlock() local_irq_enable()
#else
#define bh_lru_lock() preempt_disable()
#define bh_lru_unlock() preempt_enable()
#endif
static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
BUG_ON(irqs_disabled());
#endif
}
/*
* Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
* inserted at the front, and the buffer_head at the back if any is evicted.
* Or, if already in the LRU it is moved to the front.
*/
static void bh_lru_install(struct buffer_head *bh)
{
struct buffer_head *evictee = bh;
struct bh_lru *b;
int i;
check_irqs_on();
bh_lru_lock();
/*
* the refcount of buffer_head in bh_lru prevents dropping the
* attached page(i.e., try_to_free_buffers) so it could cause
* failing page migration.
* Skip putting upcoming bh into bh_lru until migration is done.
*/
if (lru_cache_disabled()) {
bh_lru_unlock();
return;
}
b = this_cpu_ptr(&bh_lrus);
for (i = 0; i < BH_LRU_SIZE; i++) {
swap(evictee, b->bhs[i]);
if (evictee == bh) {
bh_lru_unlock();
return;
}
}
get_bh(bh);
bh_lru_unlock();
brelse(evictee);
}
/*
* Look up the bh in this cpu's LRU. If it's there, move it to the head.
*/
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
struct buffer_head *ret = NULL;
unsigned int i;
check_irqs_on();
bh_lru_lock();
for (i = 0; i < BH_LRU_SIZE; i++) {
struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
bh->b_size == size) {
if (i) {
while (i) {
__this_cpu_write(bh_lrus.bhs[i],
__this_cpu_read(bh_lrus.bhs[i - 1]));
i--;
}
__this_cpu_write(bh_lrus.bhs[0], bh);
}
get_bh(bh);
ret = bh;
break;
}
}
bh_lru_unlock();
return ret;
}
/*
* Perform a pagecache lookup for the matching buffer. If it's there, refresh
* it in the LRU and mark it as accessed. If it is not present then return
* NULL
*/
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
if (bh == NULL) {
/* __find_get_block_slow will mark the page accessed */
bh = __find_get_block_slow(bdev, block);
if (bh)
bh_lru_install(bh);
} else
touch_buffer(bh);
return bh;
}
EXPORT_SYMBOL(__find_get_block);
/*
* __getblk_gfp() will locate (and, if necessary, create) the buffer_head
* which corresponds to the passed block_device, block and size. The
* returned buffer has its reference count incremented.
*
* __getblk_gfp() will lock up the machine if grow_dev_page's
* try_to_free_buffers() attempt is failing. FIXME, perhaps?
*/
struct buffer_head *
__getblk_gfp(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
struct buffer_head *bh = __find_get_block(bdev, block, size);
might_sleep();
if (bh == NULL)
bh = __getblk_slow(bdev, block, size, gfp);
return bh;
}
EXPORT_SYMBOL(__getblk_gfp);
/*
* Do async read-ahead on a buffer..
*/
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
struct buffer_head *bh = __getblk(bdev, block, size);
if (likely(bh)) {
ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
brelse(bh);
}
}
EXPORT_SYMBOL(__breadahead);
void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
gfp_t gfp)
{
struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
if (likely(bh)) {
ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
brelse(bh);
}
}
EXPORT_SYMBOL(__breadahead_gfp);
/**
* __bread_gfp() - reads a specified block and returns the bh
* @bdev: the block_device to read from
* @block: number of block
* @size: size (in bytes) to read
* @gfp: page allocation flag
*
* Reads a specified block, and returns buffer head that contains it.
* The page cache can be allocated from non-movable area
* not to prevent page migration if you set gfp to zero.
* It returns NULL if the block was unreadable.
*/
struct buffer_head *
__bread_gfp(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
if (likely(bh) && !buffer_uptodate(bh))
bh = __bread_slow(bh);
return bh;
}
EXPORT_SYMBOL(__bread_gfp);
static void __invalidate_bh_lrus(struct bh_lru *b)
{
int i;
for (i = 0; i < BH_LRU_SIZE; i++) {
brelse(b->bhs[i]);
b->bhs[i] = NULL;
}
}
/*
* invalidate_bh_lrus() is called rarely - but not only at unmount.
* This doesn't race because it runs in each cpu either in irq
* or with preempt disabled.
*/
static void invalidate_bh_lru(void *arg)
{
struct bh_lru *b = &get_cpu_var(bh_lrus);
__invalidate_bh_lrus(b);
put_cpu_var(bh_lrus);
}
bool has_bh_in_lru(int cpu, void *dummy)
{
struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
int i;
for (i = 0; i < BH_LRU_SIZE; i++) { if (b->bhs[i])
return true;
}
return false;
}
void invalidate_bh_lrus(void)
{
on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
/*
* It's called from workqueue context so we need a bh_lru_lock to close
* the race with preemption/irq.
*/
void invalidate_bh_lrus_cpu(void)
{
struct bh_lru *b;
bh_lru_lock();
b = this_cpu_ptr(&bh_lrus);
__invalidate_bh_lrus(b);
bh_lru_unlock();
}
void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset)
{
bh->b_page = page;
BUG_ON(offset >= PAGE_SIZE);
if (PageHighMem(page))
/*
* This catches illegal uses and preserves the offset:
*/
bh->b_data = (char *)(0 + offset);
else
bh->b_data = page_address(page) + offset;
}
EXPORT_SYMBOL(set_bh_page);
/*
* Called when truncating a buffer on a page completely.
*/
/* Bits that are cleared during an invalidate */
#define BUFFER_FLAGS_DISCARD \
(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1 << BH_Delay | 1 << BH_Unwritten)
static void discard_buffer(struct buffer_head * bh)
{
unsigned long b_state, b_state_old;
lock_buffer(bh);
clear_buffer_dirty(bh);
bh->b_bdev = NULL;
b_state = bh->b_state;
for (;;) {
b_state_old = cmpxchg(&bh->b_state, b_state,
(b_state & ~BUFFER_FLAGS_DISCARD));
if (b_state_old == b_state)
break;
b_state = b_state_old;
}
unlock_buffer(bh);
}
/**
* block_invalidatepage - invalidate part or all of a buffer-backed page
*
* @page: the page which is affected
* @offset: start of the range to invalidate
* @length: length of the range to invalidate
*
* block_invalidatepage() is called when all or part of the page has become
* invalidated by a truncate operation.
*
* block_invalidatepage() does not have to release all buffers, but it must
* ensure that no dirty buffer is left outside @offset and that no I/O
* is underway against any of the blocks which are outside the truncation
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
void block_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
unsigned int stop = length + offset; BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
goto out;
/*
* Check for overflow
*/
BUG_ON(stop > PAGE_SIZE || stop < length); head = page_buffers(page);
bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
/*
* Are we still fully in range ?
*/
if (next_off > stop)
goto out;
/*
* is this block fully invalidated?
*/
if (offset <= curr_off)
discard_buffer(bh);
curr_off = next_off;
bh = next;
} while (bh != head);
/*
* We release buffers only if the entire page is being invalidated.
* The get_block cached value has been unconditionally invalidated,
* so real IO is not possible anymore.
*/
if (length == PAGE_SIZE) try_to_release_page(page, 0);
out:
return;
}
EXPORT_SYMBOL(block_invalidatepage);
/*
* We attach and possibly dirty the buffers atomically wrt
* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
* is already excluded via the page lock.
*/
void create_empty_buffers(struct page *page,
unsigned long blocksize, unsigned long b_state)
{
struct buffer_head *bh, *head, *tail;
head = alloc_page_buffers(page, blocksize, true);
bh = head;
do {
bh->b_state |= b_state;
tail = bh;
bh = bh->b_this_page;
} while (bh);
tail->b_this_page = head;
spin_lock(&page->mapping->private_lock);
if (PageUptodate(page) || PageDirty(page)) {
bh = head;
do {
if (PageDirty(page))
set_buffer_dirty(bh);
if (PageUptodate(page))
set_buffer_uptodate(bh);
bh = bh->b_this_page;
} while (bh != head);
}
attach_page_private(page, head);
spin_unlock(&page->mapping->private_lock);
}
EXPORT_SYMBOL(create_empty_buffers);
/**
* clean_bdev_aliases: clean a range of buffers in block device
* @bdev: Block device to clean buffers in
* @block: Start of a range of blocks to clean
* @len: Number of blocks to clean
*
* We are taking a range of blocks for data and we don't want writeback of any
* buffer-cache aliases starting from return from this function and until the
* moment when something will explicitly mark the buffer dirty (hopefully that
* will not happen until we will free that block ;-) We don't even need to mark
* it not-uptodate - nobody can expect anything from a newly allocated buffer
* anyway. We used to use unmap_buffer() for such invalidation, but that was
* wrong. We definitely don't want to mark the alias unmapped, for example - it
* would confuse anyone who might pick it with bread() afterwards...
*
* Also.. Note that bforget() doesn't lock the buffer. So there can be
* writeout I/O going on against recently-freed buffers. We don't wait on that
* I/O in bforget() - it's more efficient to wait on the I/O only if we really
* need to. That happens here.
*/
void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
struct inode *bd_inode = bdev->bd_inode;
struct address_space *bd_mapping = bd_inode->i_mapping;
struct pagevec pvec;
pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
pgoff_t end;
int i, count;
struct buffer_head *bh;
struct buffer_head *head;
end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
pagevec_init(&pvec);
while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
count = pagevec_count(&pvec);
for (i = 0; i < count; i++) {
struct page *page = pvec.pages[i];
if (!page_has_buffers(page))
continue;
/*
* We use page lock instead of bd_mapping->private_lock
* to pin buffers here since we can afford to sleep and
* it scales better than a global spinlock lock.
*/
lock_page(page);
/* Recheck when the page is locked which pins bhs */
if (!page_has_buffers(page))
goto unlock_page;
head = page_buffers(page);
bh = head;
do {
if (!buffer_mapped(bh) || (bh->b_blocknr < block))
goto next;
if (bh->b_blocknr >= block + len)
break;
clear_buffer_dirty(bh);
wait_on_buffer(bh);
clear_buffer_req(bh);
next:
bh = bh->b_this_page;
} while (bh != head);
unlock_page:
unlock_page(page);
}
pagevec_release(&pvec);
cond_resched();
/* End of range already reached? */
if (index > end || !index)
break;
}
}
EXPORT_SYMBOL(clean_bdev_aliases);
/*
* Size is a power-of-two in the range 512..PAGE_SIZE,
* and the case we care about most is PAGE_SIZE.
*
* So this *could* possibly be written with those
* constraints in mind (relevant mostly if some
* architecture has a slow bit-scan instruction)
*/
static inline int block_size_bits(unsigned int blocksize)
{
return ilog2(blocksize);
}
static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
{
BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
b_state);
return page_buffers(page);
}
/*
* NOTE! All mapped/uptodate combinations are valid:
*
* Mapped Uptodate Meaning
*
* No No "unknown" - must do get_block()
* No Yes "hole" - zero-filled
* Yes No "allocated" - allocated on disk, not read in
* Yes Yes "valid" - allocated and up-to-date in memory.
*
* "Dirty" is valid only with the last case (mapped+uptodate).
*/
/*
* While block_write_full_page is writing back the dirty buffers under
* the page lock, whoever dirtied the buffers may decide to clean them
* again at any time. We handle that by only looking at the buffer
* state inside lock_buffer().
*
* If block_write_full_page() is called for regular writeback
* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
* locked buffer. This only can happen if someone has written the buffer
* directly, with submit_bh(). At the address_space level PageWriteback
* prevents this contention from occurring.
*
* If block_write_full_page() is called with wbc->sync_mode ==
* WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
* causes the writes to be flagged as synchronous writes.
*/
int __block_write_full_page(struct inode *inode, struct page *page,
get_block_t *get_block, struct writeback_control *wbc,
bh_end_io_t *handler)
{
int err;
sector_t block;
sector_t last_block;
struct buffer_head *bh, *head;
unsigned int blocksize, bbits;
int nr_underway = 0;
int write_flags = wbc_to_write_flags(wbc);
head = create_page_buffers(page, inode,
(1 << BH_Dirty)|(1 << BH_Uptodate));
/*
* Be very careful. We have no exclusion from __set_page_dirty_buffers
* here, and the (potentially unmapped) buffers may become dirty at
* any time. If a buffer becomes dirty here after we've inspected it
* then we just miss that fact, and the page stays dirty.
*
* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
* handle that here by just cleaning them.
*/
bh = head;
blocksize = bh->b_size;
bbits = block_size_bits(blocksize);
block = (sector_t)page->index << (PAGE_SHIFT - bbits);
last_block = (i_size_read(inode) - 1) >> bbits;
/*
* Get all the dirty buffers mapped to disk addresses and
* handle any aliases from the underlying blockdev's mapping.
*/
do {
if (block > last_block) {
/*
* mapped buffers outside i_size will occur, because
* this page can be outside i_size when there is a
* truncate in progress.
*/
/*
* The buffer was zeroed by block_write_full_page()
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
buffer_dirty(bh)) {
WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1);
if (err)
goto recover;
clear_buffer_delay(bh);
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
clean_bdev_bh_alias(bh);
}
}
bh = bh->b_this_page;
block++;
} while (bh != head);
do {
if (!buffer_mapped(bh))
continue;
/*
* If it's a fully non-blocking write attempt and we cannot
* lock the buffer then redirty the page. Note that this can
* potentially cause a busy-wait loop from writeback threads
* and kswapd activity, but those code paths have their own
* higher-level throttling.
*/
if (wbc->sync_mode != WB_SYNC_NONE) {
lock_buffer(bh);
} else if (!trylock_buffer(bh)) {
redirty_page_for_writepage(wbc, page);
continue;
}
if (test_clear_buffer_dirty(bh)) {
mark_buffer_async_write_endio(bh, handler);
} else {
unlock_buffer(bh);
}
} while ((bh = bh->b_this_page) != head);
/*
* The page and its buffers are protected by PageWriteback(), so we can
* drop the bh refcounts early.
*/
BUG_ON(PageWriteback(page));
set_page_writeback(page);
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
} while (bh != head); unlock_page(page);
err = 0;
done:
if (nr_underway == 0) {
/*
* The page was marked dirty, but the buffers were
* clean. Someone wrote them back by hand with
* ll_rw_block/submit_bh. A rare case.
*/
end_page_writeback(page);
/*
* The page and buffer_heads can be released at any time from
* here on.
*/
}
return err;
recover:
/*
* ENOSPC, or some other error. We may already have added some
* blocks to the file, so we need to write these out to avoid
* exposing stale data.
* The page is currently locked and not marked for writeback
*/
bh = head;
/* Recovery: lock and submit the mapped buffers */
do {
if (buffer_mapped(bh) && buffer_dirty(bh) &&
!buffer_delay(bh)) {
lock_buffer(bh);
mark_buffer_async_write_endio(bh, handler);
} else {
/*
* The buffer may have been set dirty during
* attachment to a dirty page.
*/
clear_buffer_dirty(bh);
}
} while ((bh = bh->b_this_page) != head);
SetPageError(page);
BUG_ON(PageWriteback(page)); mapping_set_error(page->mapping, err);
set_page_writeback(page);
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
} while (bh != head); unlock_page(page);
goto done;
}
EXPORT_SYMBOL(__block_write_full_page);
/*
* If a page has any new buffers, zero them out here, and mark them uptodate
* and dirty so they'll be written out (in order to prevent uninitialised
* block data from leaking). And clear the new bit.
*/
void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
{
unsigned int block_start, block_end;
struct buffer_head *head, *bh;
BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
return;
bh = head = page_buffers(page);
block_start = 0;
do {
block_end = block_start + bh->b_size;
if (buffer_new(bh)) {
if (block_end > from && block_start < to) {
if (!PageUptodate(page)) {
unsigned start, size;
start = max(from, block_start);
size = min(to, block_end) - start;
zero_user(page, start, size);
set_buffer_uptodate(bh);
}
clear_buffer_new(bh);
mark_buffer_dirty(bh);
}
}
block_start = block_end;
bh = bh->b_this_page;
} while (bh != head);
}
EXPORT_SYMBOL(page_zero_new_buffers);
static void
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
const struct iomap *iomap)
{
loff_t offset = block << inode->i_blkbits;
bh->b_bdev = iomap->bdev;
/*
* Block points to offset in file we need to map, iomap contains
* the offset at which the map starts. If the map ends before the
* current block, then do not map the buffer and let the caller
* handle it.
*/
BUG_ON(offset >= iomap->offset + iomap->length); switch (iomap->type) {
case IOMAP_HOLE:
/*
* If the buffer is not up to date or beyond the current EOF,
* we need to mark it as new to ensure sub-block zeroing is
* executed if necessary.
*/
if (!buffer_uptodate(bh) ||
(offset >= i_size_read(inode)))
set_buffer_new(bh);
break;
case IOMAP_DELALLOC:
if (!buffer_uptodate(bh) ||
(offset >= i_size_read(inode)))
set_buffer_new(bh);
set_buffer_uptodate(bh);
set_buffer_mapped(bh);
set_buffer_delay(bh);
break;
case IOMAP_UNWRITTEN:
/*
* For unwritten regions, we always need to ensure that regions
* in the block we are not writing to are zeroed. Mark the
* buffer as new to ensure this.
*/
set_buffer_new(bh);
set_buffer_unwritten(bh);
fallthrough;
case IOMAP_MAPPED:
if ((iomap->flags & IOMAP_F_NEW) || offset >= i_size_read(inode))
set_buffer_new(bh);
bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
inode->i_blkbits;
set_buffer_mapped(bh);
break;
}
}
int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block, const struct iomap *iomap)
{
unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
struct inode *inode = page->mapping->host;
unsigned block_start, block_end;
sector_t block;
int err = 0;
unsigned blocksize, bbits;
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
BUG_ON(!PageLocked(page));
BUG_ON(from > PAGE_SIZE);
BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); head = create_page_buffers(page, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
block = (sector_t)page->index << (PAGE_SHIFT - bbits);
for(bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) {
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
}
continue;
}
if (buffer_new(bh))
clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); if (get_block) { err = get_block(inode, block, bh, 1); if (err)
break;
} else {
iomap_to_bh(inode, block, bh, iomap);
}
if (buffer_new(bh)) {
clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
continue;
}
if (block_end > to || block_start < from)
zero_user_segments(page,
to, block_end,
block_start, from);
continue;
}
}
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
continue;
}
if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { ll_rw_block(REQ_OP_READ, 0, 1, &bh);
*wait_bh++=bh;
}
}
/*
* If we issued read requests - let them complete.
*/
while(wait_bh > wait) { wait_on_buffer(*--wait_bh);
if (!buffer_uptodate(*wait_bh))
err = -EIO;
}
if (unlikely(err)) page_zero_new_buffers(page, from, to); return err;
}
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
return __block_write_begin_int(page, pos, len, get_block, NULL);
}
EXPORT_SYMBOL(__block_write_begin);
static int __block_commit_write(struct inode *inode, struct page *page,
unsigned from, unsigned to)
{
unsigned block_start, block_end;
int partial = 0;
unsigned blocksize;
struct buffer_head *bh, *head;
bh = head = page_buffers(page);
blocksize = bh->b_size;
block_start = 0;
do {
block_end = block_start + blocksize; if (block_end <= from || block_start >= to) {
if (!buffer_uptodate(bh))
partial = 1;
} else {
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
}
if (buffer_new(bh))
clear_buffer_new(bh);
block_start = block_end;
bh = bh->b_this_page;
} while (bh != head);
/*
* If this is a partial write which happened to make all buffers
* uptodate then we can optimize away a bogus readpage() for
* the next read(). Here we 'discover' whether the page went
* uptodate as a result of this (potentially partial) write.
*/
if (!partial)
SetPageUptodate(page);
return 0;
}
/*
* block_write_begin takes care of the basic task of block allocation and
* bringing partial write blocks uptodate first.
*
* The filesystem needs to handle block truncation upon failure.
*/
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
unsigned flags, struct page **pagep, get_block_t *get_block)
{
pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
int status;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
status = __block_write_begin(page, pos, len, get_block);
if (unlikely(status)) {
unlock_page(page);
put_page(page);
page = NULL;
}
*pagep = page; return status;
}
EXPORT_SYMBOL(block_write_begin);
int block_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
unsigned start;
start = pos & (PAGE_SIZE - 1); if (unlikely(copied < len)) {
/*
* The buffers that were written will now be uptodate, so we
* don't have to worry about a readpage reading them and
* overwriting a partial write. However if we have encountered
* a short write and only partially written into a buffer, it
* will not be marked uptodate, so a readpage might come in and
* destroy our partial write.
*
* Do the simplest thing, and just treat any short write to a
* non uptodate page as a zero-length write, and force the
* caller to redo the whole thing.
*/
if (!PageUptodate(page))
copied = 0;
page_zero_new_buffers(page, start+copied, start+len);
}
flush_dcache_page(page);
/* This could be a short (even 0-length) commit */
__block_commit_write(inode, page, start, start+copied);
return copied;
}
EXPORT_SYMBOL(block_write_end);
int generic_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
bool i_size_changed = false;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
/*
* No need to use i_size_read() here, the i_size cannot change under us
* because we hold i_rwsem.
*
* But it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
*/
if (pos + copied > inode->i_size) {
i_size_write(inode, pos + copied);
i_size_changed = true;
}
unlock_page(page);
put_page(page);
if (old_size < pos)
pagecache_isize_extended(inode, old_size, pos);
/*
* Don't mark the inode dirty under page lock. First, it unnecessarily
* makes the holding time of page lock longer. Second, it forces lock
* ordering of page lock and transaction start for journaling
* filesystems.
*/
if (i_size_changed)
mark_inode_dirty(inode);
return copied;
}
EXPORT_SYMBOL(generic_write_end);
/*
* block_is_partially_uptodate checks whether buffers within a page are
* uptodate or not.
*
* Returns true if all buffers which correspond to a file portion
* we want to read are uptodate.
*/
int block_is_partially_uptodate(struct page *page, unsigned long from,
unsigned long count)
{
unsigned block_start, block_end, blocksize;
unsigned to;
struct buffer_head *bh, *head;
int ret = 1;
if (!page_has_buffers(page))
return 0;
head = page_buffers(page);
blocksize = head->b_size;
to = min_t(unsigned, PAGE_SIZE - from, count);
to = from + to;
if (from < blocksize && to > PAGE_SIZE - blocksize)
return 0;
bh = head;
block_start = 0;
do {
block_end = block_start + blocksize;
if (block_end > from && block_start < to) {
if (!buffer_uptodate(bh)) {
ret = 0;
break;
}
if (block_end >= to)
break;
}
block_start = block_end;
bh = bh->b_this_page;
} while (bh != head);
return ret;
}
EXPORT_SYMBOL(block_is_partially_uptodate);
/*
* Generic "read page" function for block devices that have the normal
* get_block functionality. This is most of the block device filesystems.
* Reads the page asynchronously --- the unlock_buffer() and
* set/clear_buffer_uptodate() functions propagate buffer state into the
* page struct once IO has completed.
*/
int block_read_full_page(struct page *page, get_block_t *get_block)
{
struct inode *inode = page->mapping->host;
sector_t iblock, lblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
unsigned int blocksize, bbits;
int nr, i;
int fully_mapped = 1;
head = create_page_buffers(page, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
lblock = (i_size_read(inode)+blocksize-1) >> bbits;
bh = head;
nr = 0;
i = 0;
do {
if (buffer_uptodate(bh))
continue;
if (!buffer_mapped(bh)) {
int err = 0;
fully_mapped = 0;
if (iblock < lblock) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0);
if (err)
SetPageError(page);
}
if (!buffer_mapped(bh)) {
zero_user(page, i * blocksize, blocksize); if (!err)
set_buffer_uptodate(bh);
continue;
}
/*
* get_block() might have updated the buffer
* synchronously
*/
if (buffer_uptodate(bh))
continue;
}
arr[nr++] = bh; } while (i++, iblock++, (bh = bh->b_this_page) != head); if (fully_mapped)
SetPageMappedToDisk(page);
if (!nr) {
/*
* All buffers are uptodate - we can set the page uptodate
* as well. But not if get_block() returned an error.
*/
if (!PageError(page))
SetPageUptodate(page);
unlock_page(page);
return 0;
}
/* Stage two: lock the buffers */
for (i = 0; i < nr; i++) { bh = arr[i];
lock_buffer(bh);
mark_buffer_async_read(bh);
}
/*
* Stage 3: start the IO. Check for uptodateness
* inside the buffer lock in case another process reading
* the underlying blockdev brought it uptodate (the sct fix).
*/
for (i = 0; i < nr; i++) { bh = arr[i];
if (buffer_uptodate(bh))
end_buffer_async_read(bh, 1);
else
submit_bh(REQ_OP_READ, 0, bh);
}
return 0;
}
EXPORT_SYMBOL(block_read_full_page);
/* utility function for filesystems that need to do work on expanding
* truncates. Uses filesystem pagecache writes to allow the filesystem to
* deal with the hole.
*/
int generic_cont_expand_simple(struct inode *inode, loff_t size)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
void *fsdata;
int err;
err = inode_newsize_ok(inode, size);
if (err)
goto out;
err = pagecache_write_begin(NULL, mapping, size, 0,
AOP_FLAG_CONT_EXPAND, &page, &fsdata);
if (err)
goto out;
err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
BUG_ON(err > 0);
out:
return err;
}
EXPORT_SYMBOL(generic_cont_expand_simple);
static int cont_expand_zero(struct file *file, struct address_space *mapping,
loff_t pos, loff_t *bytes)
{
struct inode *inode = mapping->host;
unsigned int blocksize = i_blocksize(inode);
struct page *page;
void *fsdata;
pgoff_t index, curidx;
loff_t curpos;
unsigned zerofrom, offset, len;
int err = 0;
index = pos >> PAGE_SHIFT;
offset = pos & ~PAGE_MASK;
while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
zerofrom = curpos & ~PAGE_MASK;
if (zerofrom & (blocksize-1)) {
*bytes |= (blocksize-1);
(*bytes)++;
}
len = PAGE_SIZE - zerofrom;
err = pagecache_write_begin(file, mapping, curpos, len, 0,
&page, &fsdata);
if (err)
goto out;
zero_user(page, zerofrom, len);
err = pagecache_write_end(file, mapping, curpos, len, len,
page, fsdata);
if (err < 0)
goto out;
BUG_ON(err != len);
err = 0;
balance_dirty_pages_ratelimited(mapping);
if (fatal_signal_pending(current)) {
err = -EINTR;
goto out;
}
}
/* page covers the boundary, find the boundary offset */
if (index == curidx) {
zerofrom = curpos & ~PAGE_MASK;
/* if we will expand the thing last block will be filled */
if (offset <= zerofrom) {
goto out;
}
if (zerofrom & (blocksize-1)) {
*bytes |= (blocksize-1);
(*bytes)++;
}
len = offset - zerofrom;
err = pagecache_write_begin(file, mapping, curpos, len, 0,
&page, &fsdata);
if (err)
goto out;
zero_user(page, zerofrom, len);
err = pagecache_write_end(file, mapping, curpos, len, len,
page, fsdata);
if (err < 0)
goto out;
BUG_ON(err != len);
err = 0;
}
out:
return err;
}
/*
* For moronic filesystems that do not allow holes in file.
* We may have to extend the file.
*/
int cont_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
get_block_t *get_block, loff_t *bytes)
{
struct inode *inode = mapping->host;
unsigned int blocksize = i_blocksize(inode);
unsigned int zerofrom;
int err;
err = cont_expand_zero(file, mapping, pos, bytes);
if (err)
return err;
zerofrom = *bytes & ~PAGE_MASK;
if (pos+len > *bytes && zerofrom & (blocksize-1)) {
*bytes |= (blocksize-1);
(*bytes)++;
}
return block_write_begin(mapping, pos, len, flags, pagep, get_block);
}
EXPORT_SYMBOL(cont_write_begin);
int block_commit_write(struct page *page, unsigned from, unsigned to)
{
struct inode *inode = page->mapping->host;
__block_commit_write(inode,page,from,to);
return 0;
}
EXPORT_SYMBOL(block_commit_write);
/*
* block_page_mkwrite() is not allowed to change the file size as it gets
* called from a page fault handler when a page is first dirtied. Hence we must
* be careful to check for EOF conditions here. We set the page up correctly
* for a written page which means we get ENOSPC checking when writing into
* holes and correct delalloc and unwritten extent mapping on filesystems that
* support these features.
*
* We are not allowed to take the i_mutex here so we have to play games to
* protect against truncate races as the page could now be beyond EOF. Because
* truncate writes the inode size before removing pages, once we have the
* page lock we can determine safely if the page is beyond EOF. If it is not
* beyond EOF, then the page is guaranteed safe against truncation until we
* unlock the page.
*
* Direct callers of this function should protect against filesystem freezing
* using sb_start_pagefault() - sb_end_pagefault() functions.
*/
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vma->vm_file);
unsigned long end;
loff_t size;
int ret;
lock_page(page);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
(page_offset(page) > size)) {
/* We overload EFAULT to mean page got truncated */
ret = -EFAULT;
goto out_unlock;
}
/* page is wholly or partially inside EOF */
if (((page->index + 1) << PAGE_SHIFT) > size)
end = size & ~PAGE_MASK;
else
end = PAGE_SIZE;
ret = __block_write_begin(page, 0, end, get_block);
if (!ret)
ret = block_commit_write(page, 0, end);
if (unlikely(ret < 0))
goto out_unlock;
set_page_dirty(page);
wait_for_stable_page(page);
return 0;
out_unlock:
unlock_page(page);
return ret;
}
EXPORT_SYMBOL(block_page_mkwrite);
/*
* nobh_write_begin()'s prereads are special: the buffer_heads are freed
* immediately, while under the page lock. So it needs a special end_io
* handler which does not touch the bh after unlocking it.
*/
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
__end_buffer_read_notouch(bh, uptodate);
}
/*
* Attach the singly-linked list of buffers created by nobh_write_begin, to
* the page (converting it to circular linked list and taking care of page
* dirty races).
*/
static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
{
struct buffer_head *bh;
BUG_ON(!PageLocked(page));
spin_lock(&page->mapping->private_lock);
bh = head;
do {
if (PageDirty(page))
set_buffer_dirty(bh);
if (!bh->b_this_page)
bh->b_this_page = head;
bh = bh->b_this_page;
} while (bh != head);
attach_page_private(page, head);
spin_unlock(&page->mapping->private_lock);
}
/*
* On entry, the page is fully not uptodate.
* On exit the page is fully uptodate in the areas outside (from,to)
* The filesystem needs to handle block truncation upon failure.
*/
int nobh_write_begin(struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
get_block_t *get_block)
{
struct inode *inode = mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocksize = 1 << blkbits;
struct buffer_head *head, *bh;
struct page *page;
pgoff_t index;
unsigned from, to;
unsigned block_in_page;
unsigned block_start, block_end;
sector_t block_in_file;
int nr_reads = 0;
int ret = 0;
int is_mapped_to_disk = 1;
index = pos >> PAGE_SHIFT;
from = pos & (PAGE_SIZE - 1);
to = from + len;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
*fsdata = NULL;
if (page_has_buffers(page)) {
ret = __block_write_begin(page, pos, len, get_block);
if (unlikely(ret))
goto out_release;
return ret;
}
if (PageMappedToDisk(page))
return 0;
/*
* Allocate buffers so that we can keep track of state, and potentially
* attach them to the page if an error occurs. In the common case of
* no error, they will just be freed again without ever being attached
* to the page (which is all OK, because we're under the page lock).
*
* Be careful: the buffer linked list is a NULL terminated one, rather
* than the circular one we're used to.
*/
head = alloc_page_buffers(page, blocksize, false);
if (!head) {
ret = -ENOMEM;
goto out_release;
}
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
/*
* We loop across all blocks in the page, whether or not they are
* part of the affected region. This is so we can discover if the
* page is fully mapped-to-disk.
*/
for (block_start = 0, block_in_page = 0, bh = head;
block_start < PAGE_SIZE;
block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
int create;
block_end = block_start + blocksize;
bh->b_state = 0;
create = 1;
if (block_start >= to)
create = 0;
ret = get_block(inode, block_in_file + block_in_page,
bh, create);
if (ret)
goto failed;
if (!buffer_mapped(bh))
is_mapped_to_disk = 0;
if (buffer_new(bh))
clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
set_buffer_uptodate(bh);
continue;
}
if (buffer_new(bh) || !buffer_mapped(bh)) {
zero_user_segments(page, block_start, from,
to, block_end);
continue;
}
if (buffer_uptodate(bh))
continue; /* reiserfs does this */
if (block_start < from || block_end > to) {
lock_buffer(bh);
bh->b_end_io = end_buffer_read_nobh;
submit_bh(REQ_OP_READ, 0, bh);
nr_reads++;
}
}
if (nr_reads) {
/*
* The page is locked, so these buffers are protected from
* any VM or truncate activity. Hence we don't need to care
* for the buffer_head refcounts.
*/
for (bh = head; bh; bh = bh->b_this_page) {
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
ret = -EIO;
}
if (ret)
goto failed;
}
if (is_mapped_to_disk)
SetPageMappedToDisk(page);
*fsdata = head; /* to be released by nobh_write_end */
return 0;
failed:
BUG_ON(!ret);
/*
* Error recovery is a bit difficult. We need to zero out blocks that
* were newly allocated, and dirty them to ensure they get written out.
* Buffers need to be attached to the page at this point, otherwise
* the handling of potential IO errors during writeout would be hard
* (could try doing synchronous writeout, but what if that fails too?)
*/
attach_nobh_buffers(page, head);
page_zero_new_buffers(page, from, to);
out_release:
unlock_page(page);
put_page(page);
*pagep = NULL;
return ret;
}
EXPORT_SYMBOL(nobh_write_begin);
int nobh_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = page->mapping->host;
struct buffer_head *head = fsdata;
struct buffer_head *bh;
BUG_ON(fsdata != NULL && page_has_buffers(page));
if (unlikely(copied < len) && head)
attach_nobh_buffers(page, head);
if (page_has_buffers(page))
return generic_write_end(file, mapping, pos, len,
copied, page, fsdata);
SetPageUptodate(page);
set_page_dirty(page);
if (pos+copied > inode->i_size) {
i_size_write(inode, pos+copied);
mark_inode_dirty(inode);
}
unlock_page(page);
put_page(page);
while (head) {
bh = head;
head = head->b_this_page;
free_buffer_head(bh);
}
return copied;
}
EXPORT_SYMBOL(nobh_write_end);
/*
* nobh_writepage() - based on block_full_write_page() except
* that it tries to operate without attaching bufferheads to
* the page.
*/
int nobh_writepage(struct page *page, get_block_t *get_block,
struct writeback_control *wbc)
{
struct inode * const inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = i_size >> PAGE_SHIFT;
unsigned offset;
int ret;
/* Is the page fully inside i_size? */
if (page->index < end_index)
goto out;
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
unlock_page(page);
return 0; /* don't care */
}
/*
* The page straddles i_size. It must be zeroed out on each and every
* writepage invocation because it may be mmapped. "A file is mapped
* in multiples of the page size. For a file that is not a multiple of
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
zero_user_segment(page, offset, PAGE_SIZE);
out:
ret = mpage_writepage(page, get_block, wbc);
if (ret == -EAGAIN)
ret = __block_write_full_page(inode, page, get_block, wbc,
end_buffer_async_write);
return ret;
}
EXPORT_SYMBOL(nobh_writepage);
int nobh_truncate_page(struct address_space *mapping,
loff_t from, get_block_t *get_block)
{
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize;
sector_t iblock;
unsigned length, pos;
struct inode *inode = mapping->host;
struct page *page;
struct buffer_head map_bh;
int err;
blocksize = i_blocksize(inode);
length = offset & (blocksize - 1);
/* Block boundary? Nothing to do */
if (!length)
return 0;
length = blocksize - length;
iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
page = grab_cache_page(mapping, index);
err = -ENOMEM;
if (!page)
goto out;
if (page_has_buffers(page)) {
has_buffers:
unlock_page(page);
put_page(page);
return block_truncate_page(mapping, from, get_block);
}
/* Find the buffer that contains "offset" */
pos = blocksize;
while (offset >= pos) {
iblock++;
pos += blocksize;
}
map_bh.b_size = blocksize;
map_bh.b_state = 0;
err = get_block(inode, iblock, &map_bh, 0);
if (err)
goto unlock;
/* unmapped? It's a hole - nothing to do */
if (!buffer_mapped(&map_bh))
goto unlock;
/* Ok, it's mapped. Make sure it's up-to-date */
if (!PageUptodate(page)) {
err = mapping->a_ops->readpage(NULL, page);
if (err) {
put_page(page);
goto out;
}
lock_page(page);
if (!PageUptodate(page)) {
err = -EIO;
goto unlock;
}
if (page_has_buffers(page))
goto has_buffers;
}
zero_user(page, offset, length);
set_page_dirty(page);
err = 0;
unlock:
unlock_page(page);
put_page(page);
out:
return err;
}
EXPORT_SYMBOL(nobh_truncate_page);
int block_truncate_page(struct address_space *mapping,
loff_t from, get_block_t *get_block)
{
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize;
sector_t iblock;
unsigned length, pos;
struct inode *inode = mapping->host;
struct page *page;
struct buffer_head *bh;
int err;
blocksize = i_blocksize(inode);
length = offset & (blocksize - 1);
/* Block boundary? Nothing to do */
if (!length)
return 0;
length = blocksize - length;
iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
page = grab_cache_page(mapping, index);
err = -ENOMEM;
if (!page)
goto out;
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
/* Find the buffer that contains "offset" */
bh = page_buffers(page);
pos = blocksize;
while (offset >= pos) {
bh = bh->b_this_page;
iblock++;
pos += blocksize;
}
err = 0;
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, iblock, bh, 0);
if (err)
goto unlock;
/* unmapped? It's a hole - nothing to do */
if (!buffer_mapped(bh))
goto unlock;
}
/* Ok, it's mapped. Make sure it's up-to-date */
if (PageUptodate(page))
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
err = -EIO;
ll_rw_block(REQ_OP_READ, 0, 1, &bh);
wait_on_buffer(bh);
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
goto unlock;
}
zero_user(page, offset, length);
mark_buffer_dirty(bh);
err = 0;
unlock:
unlock_page(page);
put_page(page);
out:
return err;
}
EXPORT_SYMBOL(block_truncate_page);
/*
* The generic ->writepage function for buffer-backed address_spaces
*/
int block_write_full_page(struct page *page, get_block_t *get_block,
struct writeback_control *wbc)
{
struct inode * const inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = i_size >> PAGE_SHIFT;
unsigned offset;
/* Is the page fully inside i_size? */
if (page->index < end_index)
return __block_write_full_page(inode, page, get_block, wbc,
end_buffer_async_write);
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_SIZE-1); if (page->index >= end_index+1 || !offset) { unlock_page(page); return 0; /* don't care */
}
/*
* The page straddles i_size. It must be zeroed out on each and every
* writepage invocation because it may be mmapped. "A file is mapped
* in multiples of the page size. For a file that is not a multiple of
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
zero_user_segment(page, offset, PAGE_SIZE);
return __block_write_full_page(inode, page, get_block, wbc,
end_buffer_async_write);
}
EXPORT_SYMBOL(block_write_full_page);
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
get_block_t *get_block)
{
struct inode *inode = mapping->host;
struct buffer_head tmp = {
.b_size = i_blocksize(inode),
};
get_block(inode, block, &tmp, 0);
return tmp.b_blocknr;
}
EXPORT_SYMBOL(generic_block_bmap);
static void end_bio_bh_io_sync(struct bio *bio)
{
struct buffer_head *bh = bio->bi_private;
if (unlikely(bio_flagged(bio, BIO_QUIET)))
set_bit(BH_Quiet, &bh->b_state);
bh->b_end_io(bh, !bio->bi_status);
bio_put(bio);
}
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
enum rw_hint write_hint, struct writeback_control *wbc)
{
struct bio *bio;
BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); BUG_ON(!bh->b_end_io); BUG_ON(buffer_delay(bh)); BUG_ON(buffer_unwritten(bh));
/*
* Only clear out a write error when rewriting
*/
if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
clear_buffer_write_io_error(bh);
bio = bio_alloc(GFP_NOIO, 1);
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
BUG_ON(bio->bi_iter.bi_size != bh->b_size); bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
if (buffer_meta(bh))
op_flags |= REQ_META;
if (buffer_prio(bh))
op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags);
/* Take care of bh's that straddle the end of the device */
guard_bio_eod(bio);
if (wbc) {
wbc_init_bio(wbc, bio);
wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
}
submit_bio(bio);
return 0;
}
int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
return submit_bh_wbc(op, op_flags, bh, 0, NULL);
}
EXPORT_SYMBOL(submit_bh);
/**
* ll_rw_block: low-level access to block devices (DEPRECATED)
* @op: whether to %READ or %WRITE
* @op_flags: req_flag_bits
* @nr: number of &struct buffer_heads in the array
* @bhs: array of pointers to &struct buffer_head
*
* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
* requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
* @op_flags contains flags modifying the detailed I/O behavior, most notably
* %REQ_RAHEAD.
*
* This function drops any buffer that it cannot get a lock on (with the
* BH_Lock state bit), any buffer that appears to be clean when doing a write
* request, and any buffer that appears to be up-to-date when doing read
* request. Further it marks as clean buffers that are processed for
* writing (the buffer cache won't assume that they are actually clean
* until the buffer gets unlocked).
*
* ll_rw_block sets b_end_io to simple completion handler that marks
* the buffer up-to-date (if appropriate), unlocks the buffer and wakes
* any waiters.
*
* All of the buffers must be for the same device, and must also be a
* multiple of the current approved size for the device.
*/
void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[])
{
int i;
for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i];
if (!trylock_buffer(bh))
continue;
if (op == WRITE) {
if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
submit_bh(op, op_flags, bh);
continue;
}
} else {
if (!buffer_uptodate(bh)) {
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
submit_bh(op, op_flags, bh);
continue;
}
}
unlock_buffer(bh);
}
}
EXPORT_SYMBOL(ll_rw_block);
void write_dirty_buffer(struct buffer_head *bh, int op_flags)
{
lock_buffer(bh);
if (!test_clear_buffer_dirty(bh)) {
unlock_buffer(bh);
return;
}
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
submit_bh(REQ_OP_WRITE, op_flags, bh);
}
EXPORT_SYMBOL(write_dirty_buffer);
/*
* For a data-integrity writeout, we need to wait upon any in-progress I/O
* and then start new I/O and then wait upon it. The caller must have a ref on
* the buffer_head.
*/
int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
{
int ret = 0;
WARN_ON(atomic_read(&bh->b_count) < 1);
lock_buffer(bh);
if (test_clear_buffer_dirty(bh)) {
/*
* The bh should be mapped, but it might not be if the
* device was hot-removed. Not much we can do but fail the I/O.
*/
if (!buffer_mapped(bh)) {
unlock_buffer(bh);
return -EIO;
}
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
wait_on_buffer(bh);
if (!ret && !buffer_uptodate(bh))
ret = -EIO;
} else {
unlock_buffer(bh);
}
return ret;
}
EXPORT_SYMBOL(__sync_dirty_buffer);
int sync_dirty_buffer(struct buffer_head *bh)
{
return __sync_dirty_buffer(bh, REQ_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);
/*
* try_to_free_buffers() checks if all the buffers on this particular page
* are unused, and releases them if so.
*
* Exclusion against try_to_free_buffers may be obtained by either
* locking the page or by holding its mapping's private_lock.
*
* If the page is dirty but all the buffers are clean then we need to
* be sure to mark the page clean as well. This is because the page
* may be against a block device, and a later reattachment of buffers
* to a dirty page will set *all* buffers dirty. Which would corrupt
* filesystem data on the same device.
*
* The same applies to regular filesystem pages: if all the buffers are
* clean then we set the page clean and proceed. To do that, we require
* total exclusion from __set_page_dirty_buffers(). That is obtained with
* private_lock.
*
* try_to_free_buffers() is non-blocking.
*/
static inline int buffer_busy(struct buffer_head *bh)
{
return atomic_read(&bh->b_count) |
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
static int
drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
{
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh;
bh = head;
do {
if (buffer_busy(bh))
goto failed;
bh = bh->b_this_page;
} while (bh != head);
do {
struct buffer_head *next = bh->b_this_page;
if (bh->b_assoc_map)
__remove_assoc_queue(bh);
bh = next;
} while (bh != head); *buffers_to_free = head;
detach_page_private(page);
return 1;
failed:
return 0;
}
int try_to_free_buffers(struct page *page)
{
struct address_space * const mapping = page->mapping;
struct buffer_head *buffers_to_free = NULL;
int ret = 0;
BUG_ON(!PageLocked(page));
if (PageWriteback(page))
return 0;
if (mapping == NULL) { /* can this still happen? */ ret = drop_buffers(page, &buffers_to_free);
goto out;
}
spin_lock(&mapping->private_lock);
ret = drop_buffers(page, &buffers_to_free);
/*
* If the filesystem writes its buffers by hand (eg ext3)
* then we can have clean buffers against a dirty page. We
* clean the page here; otherwise the VM will never notice
* that the filesystem did any IO at all.
*
* Also, during truncate, discard_buffer will have marked all
* the page's buffers clean. We discover that here and clean
* the page also.
*
* private_lock must be held over this entire operation in order
* to synchronise against __set_page_dirty_buffers and prevent the
* dirty bit from being lost.
*/
if (ret)
cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
do {
struct buffer_head *next = bh->b_this_page;
free_buffer_head(bh);
bh = next;
} while (bh != buffers_to_free);
}
return ret;
}
EXPORT_SYMBOL(try_to_free_buffers);
/*
* Buffer-head allocation
*/
static struct kmem_cache *bh_cachep __read_mostly;
/*
* Once the number of bh's in the machine exceeds this level, we start
* stripping them in writeback.
*/
static unsigned long max_buffer_heads;
int buffer_heads_over_limit;
struct bh_accounting {
int nr; /* Number of live bh's */
int ratelimit; /* Limit cacheline bouncing */
};
static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
static void recalc_bh_state(void)
{
int i;
int tot = 0;
if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
return;
__this_cpu_write(bh_accounting.ratelimit, 0); for_each_online_cpu(i) tot += per_cpu(bh_accounting, i).nr; buffer_heads_over_limit = (tot > max_buffer_heads);
}
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
if (ret) {
INIT_LIST_HEAD(&ret->b_assoc_buffers);
spin_lock_init(&ret->b_uptodate_lock);
preempt_disable();
__this_cpu_inc(bh_accounting.nr);
recalc_bh_state();
preempt_enable();
}
return ret;
}
EXPORT_SYMBOL(alloc_buffer_head);
void free_buffer_head(struct buffer_head *bh)
{
BUG_ON(!list_empty(&bh->b_assoc_buffers)); kmem_cache_free(bh_cachep, bh);
preempt_disable();
__this_cpu_dec(bh_accounting.nr);
recalc_bh_state();
preempt_enable();
}
EXPORT_SYMBOL(free_buffer_head);
static int buffer_exit_cpu_dead(unsigned int cpu)
{
int i;
struct bh_lru *b = &per_cpu(bh_lrus, cpu);
for (i = 0; i < BH_LRU_SIZE; i++) {
brelse(b->bhs[i]);
b->bhs[i] = NULL;
}
this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
per_cpu(bh_accounting, cpu).nr = 0;
return 0;
}
/**
* bh_uptodate_or_lock - Test whether the buffer is uptodate
* @bh: struct buffer_head
*
* Return true if the buffer is up-to-date and false,
* with the buffer locked, if not.
*/
int bh_uptodate_or_lock(struct buffer_head *bh)
{
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
if (!buffer_uptodate(bh))
return 0;
unlock_buffer(bh);
}
return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
/**
* bh_submit_read - Submit a locked buffer for reading
* @bh: struct buffer_head
*
* Returns zero on success and -EIO on error.
*/
int bh_submit_read(struct buffer_head *bh)
{
BUG_ON(!buffer_locked(bh));
if (buffer_uptodate(bh)) {
unlock_buffer(bh);
return 0;
}
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(REQ_OP_READ, 0, bh);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
return 0;
return -EIO;
}
EXPORT_SYMBOL(bh_submit_read);
void __init buffer_init(void)
{
unsigned long nrpages;
int ret;
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head), 0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
SLAB_MEM_SPREAD),
NULL);
/*
* Limit the bh occupancy to 10% of ZONE_NORMAL
*/
nrpages = (nr_free_buffer_pages() * 10) / 100;
max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
NULL, buffer_exit_cpu_dead);
WARN_ON(ret < 0);
}
// SPDX-License-Identifier: GPL-2.0-only
/*
* fs/kernfs/file.c - kernfs file implementation
*
* Copyright (c) 2001-3 Patrick Mochel
* Copyright (c) 2007 SUSE Linux Products GmbH
* Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
*/
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/uio.h>
#include "kernfs-internal.h"
/*
* There's one kernfs_open_file for each open file and one kernfs_open_node
* for each kernfs_node with one or more open files.
*
* kernfs_node->attr.open points to kernfs_open_node. attr.open is
* protected by kernfs_open_node_lock.
*
* filp->private_data points to seq_file whose ->private points to
* kernfs_open_file. kernfs_open_files are chained at
* kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
*/
static DEFINE_SPINLOCK(kernfs_open_node_lock);
static DEFINE_MUTEX(kernfs_open_file_mutex);
struct kernfs_open_node {
atomic_t refcnt;
atomic_t event;
wait_queue_head_t poll;
struct list_head files; /* goes through kernfs_open_file.list */
};
/*
* kernfs_notify() may be called from any context and bounces notifications
* through a work item. To minimize space overhead in kernfs_node, the
* pending queue is implemented as a singly linked list of kernfs_nodes.
* The list is terminated with the self pointer so that whether a
* kernfs_node is on the list or not can be determined by testing the next
* pointer for NULL.
*/
#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list)
static DEFINE_SPINLOCK(kernfs_notify_lock);
static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
static struct kernfs_open_file *kernfs_of(struct file *file)
{
return ((struct seq_file *)file->private_data)->private;
}
/*
* Determine the kernfs_ops for the given kernfs_node. This function must
* be called while holding an active reference.
*/
static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
{
if (kn->flags & KERNFS_LOCKDEP)
lockdep_assert_held(kn);
return kn->attr.ops;
}
/*
* As kernfs_seq_stop() is also called after kernfs_seq_start() or
* kernfs_seq_next() failure, it needs to distinguish whether it's stopping
* a seq_file iteration which is fully initialized with an active reference
* or an aborted kernfs_seq_start() due to get_active failure. The
* position pointer is the only context for each seq_file iteration and
* thus the stop condition should be encoded in it. As the return value is
* directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable
* choice to indicate get_active failure.
*
* Unfortunately, this is complicated due to the optional custom seq_file
* operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop()
* can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or
* custom seq_file operations and thus can't decide whether put_active
* should be performed or not only on ERR_PTR(-ENODEV).
*
* This is worked around by factoring out the custom seq_stop() and
* put_active part into kernfs_seq_stop_active(), skipping it from
* kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after
* custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures
* that kernfs_seq_stop_active() is skipped only after get_active failure.
*/
static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
{
struct kernfs_open_file *of = sf->private;
const struct kernfs_ops *ops = kernfs_ops(of->kn);
if (ops->seq_stop)
ops->seq_stop(sf, v);
kernfs_put_active(of->kn);
}
static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
{
struct kernfs_open_file *of = sf->private;
const struct kernfs_ops *ops;
/*
* @of->mutex nests outside active ref and is primarily to ensure that
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
if (!kernfs_get_active(of->kn))
return ERR_PTR(-ENODEV);
ops = kernfs_ops(of->kn);
if (ops->seq_start) {
void *next = ops->seq_start(sf, ppos);
/* see the comment above kernfs_seq_stop_active() */
if (next == ERR_PTR(-ENODEV))
kernfs_seq_stop_active(sf, next);
return next;
} else {
/*
* The same behavior and code as single_open(). Returns
* !NULL if pos is at the beginning; otherwise, NULL.
*/
return NULL + !*ppos;
}
}
static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
{
struct kernfs_open_file *of = sf->private;
const struct kernfs_ops *ops = kernfs_ops(of->kn);
if (ops->seq_next) {
void *next = ops->seq_next(sf, v, ppos);
/* see the comment above kernfs_seq_stop_active() */
if (next == ERR_PTR(-ENODEV))
kernfs_seq_stop_active(sf, next);
return next;
} else {
/*
* The same behavior and code as single_open(), always
* terminate after the initial read.
*/
++*ppos;
return NULL;
}
}
static void kernfs_seq_stop(struct seq_file *sf, void *v)
{
struct kernfs_open_file *of = sf->private;
if (v != ERR_PTR(-ENODEV))
kernfs_seq_stop_active(sf, v);
mutex_unlock(&of->mutex);
}
static int kernfs_seq_show(struct seq_file *sf, void *v)
{
struct kernfs_open_file *of = sf->private;
of->event = atomic_read(&of->kn->attr.open->event);
return of->kn->attr.ops->seq_show(sf, v);
}
static const struct seq_operations kernfs_seq_ops = {
.start = kernfs_seq_start,
.next = kernfs_seq_next,
.stop = kernfs_seq_stop,
.show = kernfs_seq_show,
};
/*
* As reading a bin file can have side-effects, the exact offset and bytes
* specified in read(2) call should be passed to the read callback making
* it difficult to use seq_file. Implement simplistic custom buffering for
* bin files.
*/
static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE);
const struct kernfs_ops *ops;
char *buf;
buf = of->prealloc_buf;
if (buf)
mutex_lock(&of->prealloc_mutex);
else
buf = kmalloc(len, GFP_KERNEL);
if (!buf)
return -ENOMEM;
/*
* @of->mutex nests outside active ref and is used both to ensure that
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
if (!kernfs_get_active(of->kn)) {
len = -ENODEV;
mutex_unlock(&of->mutex);
goto out_free;
}
of->event = atomic_read(&of->kn->attr.open->event);
ops = kernfs_ops(of->kn);
if (ops->read)
len = ops->read(of, buf, len, iocb->ki_pos);
else
len = -EINVAL;
kernfs_put_active(of->kn);
mutex_unlock(&of->mutex);
if (len < 0)
goto out_free;
if (copy_to_iter(buf, len, iter) != len) {
len = -EFAULT;
goto out_free;
}
iocb->ki_pos += len;
out_free:
if (buf == of->prealloc_buf)
mutex_unlock(&of->prealloc_mutex);
else
kfree(buf);
return len;
}
static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW)
return seq_read_iter(iocb, iter);
return kernfs_file_read_iter(iocb, iter);
}
/*
* Copy data in from userland and pass it to the matching kernfs write
* operation.
*
* There is no easy way for us to know if userspace is only doing a partial
* write, so we don't support them. We expect the entire buffer to come on
* the first write. Hint: if you're writing a value, first read the file,
* modify only the the value you're changing, then write entire buffer
* back.
*/
static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
ssize_t len = iov_iter_count(iter);
const struct kernfs_ops *ops;
char *buf;
if (of->atomic_write_len) {
if (len > of->atomic_write_len)
return -E2BIG;
} else {
len = min_t(size_t, len, PAGE_SIZE);
}
buf = of->prealloc_buf;
if (buf)
mutex_lock(&of->prealloc_mutex);
else
buf = kmalloc(len + 1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
if (copy_from_iter(buf, len, iter) != len) {
len = -EFAULT;
goto out_free;
}
buf[len] = '\0'; /* guarantee string termination */
/*
* @of->mutex nests outside active ref and is used both to ensure that
* the ops aren't called concurrently for the same open file.
*/
mutex_lock(&of->mutex);
if (!kernfs_get_active(of->kn)) {
mutex_unlock(&of->mutex);
len = -ENODEV;
goto out_free;
}
ops = kernfs_ops(of->kn);
if (ops->write)
len = ops->write(of, buf, len, iocb->ki_pos);
else
len = -EINVAL;
kernfs_put_active(of->kn);
mutex_unlock(&of->mutex);
if (len > 0)
iocb->ki_pos += len;
out_free:
if (buf == of->prealloc_buf)
mutex_unlock(&of->prealloc_mutex);
else
kfree(buf);
return len;
}
static void kernfs_vma_open(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
struct kernfs_open_file *of = kernfs_of(file);
if (!of->vm_ops)
return;
if (!kernfs_get_active(of->kn))
return;
if (of->vm_ops->open)
of->vm_ops->open(vma);
kernfs_put_active(of->kn);
}
static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct kernfs_open_file *of = kernfs_of(file);
vm_fault_t ret;
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
if (!kernfs_get_active(of->kn))
return VM_FAULT_SIGBUS;
ret = VM_FAULT_SIGBUS;
if (of->vm_ops->fault)
ret = of->vm_ops->fault(vmf);
kernfs_put_active(of->kn);
return ret;
}
static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct kernfs_open_file *of = kernfs_of(file);
vm_fault_t ret;
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
if (!kernfs_get_active(of->kn))
return VM_FAULT_SIGBUS;
ret = 0;
if (of->vm_ops->page_mkwrite)
ret = of->vm_ops->page_mkwrite(vmf);
else
file_update_time(file);
kernfs_put_active(of->kn);
return ret;
}
static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
struct file *file = vma->vm_file;
struct kernfs_open_file *of = kernfs_of(file);
int ret;
if (!of->vm_ops)
return -EINVAL;
if (!kernfs_get_active(of->kn))
return -EINVAL;
ret = -EINVAL;
if (of->vm_ops->access)
ret = of->vm_ops->access(vma, addr, buf, len, write);
kernfs_put_active(of->kn);
return ret;
}
#ifdef CONFIG_NUMA
static int kernfs_vma_set_policy(struct vm_area_struct *vma,
struct mempolicy *new)
{
struct file *file = vma->vm_file;
struct kernfs_open_file *of = kernfs_of(file);
int ret;
if (!of->vm_ops)
return 0;
if (!kernfs_get_active(of->kn))
return -EINVAL;
ret = 0;
if (of->vm_ops->set_policy)
ret = of->vm_ops->set_policy(vma, new);
kernfs_put_active(of->kn);
return ret;
}
static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
unsigned long addr)
{
struct file *file = vma->vm_file;
struct kernfs_open_file *of = kernfs_of(file);
struct mempolicy *pol;
if (!of->vm_ops)
return vma->vm_policy;
if (!kernfs_get_active(of->kn))
return vma->vm_policy;
pol = vma->vm_policy;
if (of->vm_ops->get_policy)
pol = of->vm_ops->get_policy(vma, addr);
kernfs_put_active(of->kn);
return pol;
}
#endif
static const struct vm_operations_struct kernfs_vm_ops = {
.open = kernfs_vma_open,
.fault = kernfs_vma_fault,
.page_mkwrite = kernfs_vma_page_mkwrite,
.access = kernfs_vma_access,
#ifdef CONFIG_NUMA
.set_policy = kernfs_vma_set_policy,
.get_policy = kernfs_vma_get_policy,
#endif
};
static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
{
struct kernfs_open_file *of = kernfs_of(file);
const struct kernfs_ops *ops;
int rc;
/*
* mmap path and of->mutex are prone to triggering spurious lockdep
* warnings and we don't want to add spurious locking dependency
* between the two. Check whether mmap is actually implemented
* without grabbing @of->mutex by testing HAS_MMAP flag. See the
* comment in kernfs_file_open() for more details.
*/
if (!(of->kn->flags & KERNFS_HAS_MMAP))
return -ENODEV;
mutex_lock(&of->mutex);
rc = -ENODEV;
if (!kernfs_get_active(of->kn))
goto out_unlock;
ops = kernfs_ops(of->kn);
rc = ops->mmap(of, vma);
if (rc)
goto out_put;
/*
* PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
* to satisfy versions of X which crash if the mmap fails: that
* substitutes a new vm_file, and we don't then want bin_vm_ops.
*/
if (vma->vm_file != file)
goto out_put;
rc = -EINVAL;
if (of->mmapped && of->vm_ops != vma->vm_ops)
goto out_put;
/*
* It is not possible to successfully wrap close.
* So error if someone is trying to use close.
*/
rc = -EINVAL;
if (vma->vm_ops && vma->vm_ops->close)
goto out_put;
rc = 0;
of->mmapped = true;
of->vm_ops = vma->vm_ops;
vma->vm_ops = &kernfs_vm_ops;
out_put:
kernfs_put_active(of->kn);
out_unlock:
mutex_unlock(&of->mutex);
return rc;
}
/**
* kernfs_get_open_node - get or create kernfs_open_node
* @kn: target kernfs_node
* @of: kernfs_open_file for this instance of open
*
* If @kn->attr.open exists, increment its reference count; otherwise,
* create one. @of is chained to the files list.
*
* LOCKING:
* Kernel thread context (may sleep).
*
* RETURNS:
* 0 on success, -errno on failure.
*/
static int kernfs_get_open_node(struct kernfs_node *kn,
struct kernfs_open_file *of)
{
struct kernfs_open_node *on, *new_on = NULL;
retry:
mutex_lock(&kernfs_open_file_mutex);
spin_lock_irq(&kernfs_open_node_lock);
if (!kn->attr.open && new_on) {
kn->attr.open = new_on;
new_on = NULL;
}
on = kn->attr.open;
if (on) {
atomic_inc(&on->refcnt);
list_add_tail(&of->list, &on->files);
}
spin_unlock_irq(&kernfs_open_node_lock);
mutex_unlock(&kernfs_open_file_mutex);
if (on) {
kfree(new_on);
return 0;
}
/* not there, initialize a new one and retry */
new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
if (!new_on)
return -ENOMEM;
atomic_set(&new_on->refcnt, 0);
atomic_set(&new_on->event, 1);
init_waitqueue_head(&new_on->poll);
INIT_LIST_HEAD(&new_on->files);
goto retry;
}
/**
* kernfs_put_open_node - put kernfs_open_node
* @kn: target kernfs_nodet
* @of: associated kernfs_open_file
*
* Put @kn->attr.open and unlink @of from the files list. If
* reference count reaches zero, disassociate and free it.
*
* LOCKING:
* None.
*/
static void kernfs_put_open_node(struct kernfs_node *kn,
struct kernfs_open_file *of)
{
struct kernfs_open_node *on = kn->attr.open;
unsigned long flags;
mutex_lock(&kernfs_open_file_mutex);
spin_lock_irqsave(&kernfs_open_node_lock, flags);
if (of)
list_del(&of->list);
if (atomic_dec_and_test(&on->refcnt))
kn->attr.open = NULL;
else
on = NULL;
spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
mutex_unlock(&kernfs_open_file_mutex);
kfree(on);
}
static int kernfs_fop_open(struct inode *inode, struct file *file)
{
struct kernfs_node *kn = inode->i_private;
struct kernfs_root *root = kernfs_root(kn);
const struct kernfs_ops *ops;
struct kernfs_open_file *of;
bool has_read, has_write, has_mmap;
int error = -EACCES;
if (!kernfs_get_active(kn))
return -ENODEV;
ops = kernfs_ops(kn);
has_read = ops->seq_show || ops->read || ops->mmap;
has_write = ops->write || ops->mmap;
has_mmap = ops->mmap;
/* see the flag definition for details */
if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
if ((file->f_mode & FMODE_WRITE) &&
(!(inode->i_mode & S_IWUGO) || !has_write))
goto err_out;
if ((file->f_mode & FMODE_READ) &&
(!(inode->i_mode & S_IRUGO) || !has_read))
goto err_out;
}
/* allocate a kernfs_open_file for the file */
error = -ENOMEM;
of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
if (!of)
goto err_out;
/*
* The following is done to give a different lockdep key to
* @of->mutex for files which implement mmap. This is a rather
* crude way to avoid false positive lockdep warning around
* mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and
* reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
* which mm->mmap_lock nests, while holding @of->mutex. As each
* open file has a separate mutex, it's okay as long as those don't
* happen on the same file. At this point, we can't easily give
* each file a separate locking class. Let's differentiate on
* whether the file has mmap or not for now.
*
* Both paths of the branch look the same. They're supposed to
* look that way and give @of->mutex different static lockdep keys.
*/
if (has_mmap)
mutex_init(&of->mutex);
else
mutex_init(&of->mutex);
of->kn = kn;
of->file = file;
/*
* Write path needs to atomic_write_len outside active reference.
* Cache it in open_file. See kernfs_fop_write_iter() for details.
*/
of->atomic_write_len = ops->atomic_write_len;
error = -EINVAL;
/*
* ->seq_show is incompatible with ->prealloc,
* as seq_read does its own allocation.
* ->read must be used instead.
*/
if (ops->prealloc && ops->seq_show)
goto err_free;
if (ops->prealloc) {
int len = of->atomic_write_len ?: PAGE_SIZE;
of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL);
error = -ENOMEM;
if (!of->prealloc_buf)
goto err_free;
mutex_init(&of->prealloc_mutex);
}
/*
* Always instantiate seq_file even if read access doesn't use
* seq_file or is not requested. This unifies private data access
* and readable regular files are the vast majority anyway.
*/
if (ops->seq_show)
error = seq_open(file, &kernfs_seq_ops);
else
error = seq_open(file, NULL);
if (error)
goto err_free;
of->seq_file = file->private_data;
of->seq_file->private = of;
/* seq_file clears PWRITE unconditionally, restore it if WRITE */
if (file->f_mode & FMODE_WRITE)
file->f_mode |= FMODE_PWRITE;
/* make sure we have open node struct */
error = kernfs_get_open_node(kn, of);
if (error)
goto err_seq_release;
if (ops->open) {
/* nobody has access to @of yet, skip @of->mutex */
error = ops->open(of);
if (error)
goto err_put_node;
}
/* open succeeded, put active references */
kernfs_put_active(kn);
return 0;
err_put_node:
kernfs_put_open_node(kn, of);
err_seq_release:
seq_release(inode, file);
err_free:
kfree(of->prealloc_buf);
kfree(of);
err_out:
kernfs_put_active(kn);
return error;
}
/* used from release/drain to ensure that ->release() is called exactly once */
static void kernfs_release_file(struct kernfs_node *kn,
struct kernfs_open_file *of)
{
/*
* @of is guaranteed to have no other file operations in flight and
* we just want to synchronize release and drain paths.
* @kernfs_open_file_mutex is enough. @of->mutex can't be used
* here because drain path may be called from places which can
* cause circular dependency.
*/
lockdep_assert_held(&kernfs_open_file_mutex);
if (!of->released) {
/*
* A file is never detached without being released and we
* need to be able to release files which are deactivated
* and being drained. Don't use kernfs_ops().
*/
kn->attr.ops->release(of);
of->released = true;
}
}
static int kernfs_fop_release(struct inode *inode, struct file *filp)
{
struct kernfs_node *kn = inode->i_private;
struct kernfs_open_file *of = kernfs_of(filp);
if (kn->flags & KERNFS_HAS_RELEASE) {
mutex_lock(&kernfs_open_file_mutex);
kernfs_release_file(kn, of);
mutex_unlock(&kernfs_open_file_mutex);
}
kernfs_put_open_node(kn, of);
seq_release(inode, filp);
kfree(of->prealloc_buf);
kfree(of);
return 0;
}
void kernfs_drain_open_files(struct kernfs_node *kn)
{
struct kernfs_open_node *on;
struct kernfs_open_file *of;
if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
return;
spin_lock_irq(&kernfs_open_node_lock);
on = kn->attr.open;
if (on)
atomic_inc(&on->refcnt);
spin_unlock_irq(&kernfs_open_node_lock);
if (!on)
return;
mutex_lock(&kernfs_open_file_mutex); list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); if (kn->flags & KERNFS_HAS_MMAP)
unmap_mapping_range(inode->i_mapping, 0, 0, 1);
if (kn->flags & KERNFS_HAS_RELEASE)
kernfs_release_file(kn, of);
}
mutex_unlock(&kernfs_open_file_mutex);
kernfs_put_open_node(kn, NULL);
}
/*
* Kernfs attribute files are pollable. The idea is that you read
* the content and then you use 'poll' or 'select' to wait for
* the content to change. When the content changes (assuming the
* manager for the kobject supports notification), poll will
* return EPOLLERR|EPOLLPRI, and select will return the fd whether
* it is waiting for read, write, or exceptions.
* Once poll/select indicates that the value has changed, you
* need to close and re-open the file, or seek to 0 and read again.
* Reminder: this only works for attributes which actively support
* it, and it is not possible to test an attribute from userspace
* to see if it supports poll (Neither 'poll' nor 'select' return
* an appropriate error code). When in doubt, set a suitable timeout value.
*/
__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
{
struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
struct kernfs_open_node *on = kn->attr.open;
poll_wait(of->file, &on->poll, wait);
if (of->event != atomic_read(&on->event))
return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
return DEFAULT_POLLMASK;
}
static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
{
struct kernfs_open_file *of = kernfs_of(filp);
struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
__poll_t ret;
if (!kernfs_get_active(kn))
return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
if (kn->attr.ops->poll)
ret = kn->attr.ops->poll(of, wait);
else
ret = kernfs_generic_poll(of, wait);
kernfs_put_active(kn);
return ret;
}
static void kernfs_notify_workfn(struct work_struct *work)
{
struct kernfs_node *kn;
struct kernfs_super_info *info;
repeat:
/* pop one off the notify_list */
spin_lock_irq(&kernfs_notify_lock);
kn = kernfs_notify_list;
if (kn == KERNFS_NOTIFY_EOL) {
spin_unlock_irq(&kernfs_notify_lock);
return;
}
kernfs_notify_list = kn->attr.notify_next;
kn->attr.notify_next = NULL;
spin_unlock_irq(&kernfs_notify_lock);
/* kick fsnotify */
down_write(&kernfs_rwsem);
list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
struct kernfs_node *parent;
struct inode *p_inode = NULL;
struct inode *inode;
struct qstr name;
/*
* We want fsnotify_modify() on @kn but as the
* modifications aren't originating from userland don't
* have the matching @file available. Look up the inodes
* and generate the events manually.
*/
inode = ilookup(info->sb, kernfs_ino(kn));
if (!inode)
continue;
name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name));
parent = kernfs_get_parent(kn);
if (parent) {
p_inode = ilookup(info->sb, kernfs_ino(parent));
if (p_inode) {
fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD,
inode, FSNOTIFY_EVENT_INODE,
p_inode, &name, inode, 0);
iput(p_inode);
}
kernfs_put(parent);
}
if (!p_inode)
fsnotify_inode(inode, FS_MODIFY);
iput(inode);
}
up_write(&kernfs_rwsem);
kernfs_put(kn);
goto repeat;
}
/**
* kernfs_notify - notify a kernfs file
* @kn: file to notify
*
* Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any
* context.
*/
void kernfs_notify(struct kernfs_node *kn)
{
static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
unsigned long flags;
struct kernfs_open_node *on;
if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
return;
/* kick poll immediately */
spin_lock_irqsave(&kernfs_open_node_lock, flags);
on = kn->attr.open;
if (on) {
atomic_inc(&on->event);
wake_up_interruptible(&on->poll);
}
spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
/* schedule work to kick fsnotify */
spin_lock_irqsave(&kernfs_notify_lock, flags);
if (!kn->attr.notify_next) {
kernfs_get(kn);
kn->attr.notify_next = kernfs_notify_list;
kernfs_notify_list = kn;
schedule_work(&kernfs_notify_work);
}
spin_unlock_irqrestore(&kernfs_notify_lock, flags);
}
EXPORT_SYMBOL_GPL(kernfs_notify);
const struct file_operations kernfs_file_fops = {
.read_iter = kernfs_fop_read_iter,
.write_iter = kernfs_fop_write_iter,
.llseek = generic_file_llseek,
.mmap = kernfs_fop_mmap,
.open = kernfs_fop_open,
.release = kernfs_fop_release,
.poll = kernfs_fop_poll,
.fsync = noop_fsync,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
};
/**
* __kernfs_create_file - kernfs internal function to create a file
* @parent: directory to create the file in
* @name: name of the file
* @mode: mode of the file
* @uid: uid of the file
* @gid: gid of the file
* @size: size of the file
* @ops: kernfs operations for the file
* @priv: private data for the file
* @ns: optional namespace tag of the file
* @key: lockdep key for the file's active_ref, %NULL to disable lockdep
*
* Returns the created node on success, ERR_PTR() value on error.
*/
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
const char *name,
umode_t mode, kuid_t uid, kgid_t gid,
loff_t size,
const struct kernfs_ops *ops,
void *priv, const void *ns,
struct lock_class_key *key)
{
struct kernfs_node *kn;
unsigned flags;
int rc;
flags = KERNFS_FILE;
kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG,
uid, gid, flags);
if (!kn)
return ERR_PTR(-ENOMEM);
kn->attr.ops = ops;
kn->attr.size = size;
kn->ns = ns;
kn->priv = priv;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (key) {
lockdep_init_map(&kn->dep_map, "kn->active", key, 0);
kn->flags |= KERNFS_LOCKDEP;
}
#endif
/*
* kn->attr.ops is accesible only while holding active ref. We
* need to know whether some ops are implemented outside active
* ref. Cache their existence in flags.
*/
if (ops->seq_show)
kn->flags |= KERNFS_HAS_SEQ_SHOW; if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; if (ops->release) kn->flags |= KERNFS_HAS_RELEASE; rc = kernfs_add_one(kn); if (rc) { kernfs_put(kn);
return ERR_PTR(rc);
}
return kn;
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MEMREMAP_H_
#define _LINUX_MEMREMAP_H_
#include <linux/range.h>
#include <linux/ioport.h>
#include <linux/percpu-refcount.h>
struct resource;
struct device;
/**
* struct vmem_altmap - pre-allocated storage for vmemmap_populate
* @base_pfn: base of the entire dev_pagemap mapping
* @reserve: pages mapped, but reserved for driver use (relative to @base)
* @free: free pages set aside in the mapping for memmap storage
* @align: pages reserved to meet allocation alignments
* @alloc: track pages consumed, private to vmemmap_populate()
*/
struct vmem_altmap {
unsigned long base_pfn;
const unsigned long end_pfn;
const unsigned long reserve;
unsigned long free;
unsigned long align;
unsigned long alloc;
};
/*
* Specialize ZONE_DEVICE memory into multiple types each has a different
* usage.
*
* MEMORY_DEVICE_PRIVATE:
* Device memory that is not directly addressable by the CPU: CPU can neither
* read nor write private memory. In this case, we do still have struct pages
* backing the device memory. Doing so simplifies the implementation, but it is
* important to remember that there are certain points at which the struct page
* must be treated as an opaque object, rather than a "normal" struct page.
*
* A more complete discussion of unaddressable memory may be found in
* include/linux/hmm.h and Documentation/vm/hmm.rst.
*
* MEMORY_DEVICE_FS_DAX:
* Host memory that has similar access semantics as System RAM i.e. DMA
* coherent and supports page pinning. In support of coordinating page
* pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a
* wakeup event whenever a page is unpinned and becomes idle. This
* wakeup is used to coordinate physical address space management (ex:
* fs truncate/hole punch) vs pinned pages (ex: device dma).
*
* MEMORY_DEVICE_GENERIC:
* Host memory that has similar access semantics as System RAM i.e. DMA
* coherent and supports page pinning. This is for example used by DAX devices
* that expose memory using a character device.
*
* MEMORY_DEVICE_PCI_P2PDMA:
* Device memory residing in a PCI BAR intended for use with Peer-to-Peer
* transactions.
*/
enum memory_type {
/* 0 is reserved to catch uninitialized type fields */
MEMORY_DEVICE_PRIVATE = 1,
MEMORY_DEVICE_FS_DAX,
MEMORY_DEVICE_GENERIC,
MEMORY_DEVICE_PCI_P2PDMA,
};
struct dev_pagemap_ops {
/*
* Called once the page refcount reaches 1. (ZONE_DEVICE pages never
* reach 0 refcount unless there is a refcount bug. This allows the
* device driver to implement its own memory management.)
*/
void (*page_free)(struct page *page);
/*
* Transition the refcount in struct dev_pagemap to the dead state.
*/
void (*kill)(struct dev_pagemap *pgmap);
/*
* Wait for refcount in struct dev_pagemap to be idle and reap it.
*/
void (*cleanup)(struct dev_pagemap *pgmap);
/*
* Used for private (un-addressable) device memory only. Must migrate
* the page back to a CPU accessible page.
*/
vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
};
#define PGMAP_ALTMAP_VALID (1 << 0)
/**
* struct dev_pagemap - metadata for ZONE_DEVICE mappings
* @altmap: pre-allocated/reserved memory for vmemmap allocations
* @ref: reference count that pins the devm_memremap_pages() mapping
* @internal_ref: internal reference if @ref is not provided by the caller
* @done: completion for @internal_ref
* @type: memory type: see MEMORY_* in memory_hotplug.h
* @flags: PGMAP_* flags to specify defailed behavior
* @ops: method table
* @owner: an opaque pointer identifying the entity that manages this
* instance. Used by various helpers to make sure that no
* foreign ZONE_DEVICE memory is accessed.
* @nr_range: number of ranges to be mapped
* @range: range to be mapped when nr_range == 1
* @ranges: array of ranges to be mapped when nr_range > 1
*/
struct dev_pagemap {
struct vmem_altmap altmap;
struct percpu_ref *ref;
struct percpu_ref internal_ref;
struct completion done;
enum memory_type type;
unsigned int flags;
const struct dev_pagemap_ops *ops;
void *owner;
int nr_range;
union {
struct range range;
struct range ranges[0];
};
};
static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
{
if (pgmap->flags & PGMAP_ALTMAP_VALID)
return &pgmap->altmap;
return NULL;
}
#ifdef CONFIG_ZONE_DEVICE
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
struct dev_pagemap *pgmap);
bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
unsigned long memremap_compat_align(void);
#else
static inline void *devm_memremap_pages(struct device *dev,
struct dev_pagemap *pgmap)
{
/*
* Fail attempts to call devm_memremap_pages() without
* ZONE_DEVICE support enabled, this requires callers to fall
* back to plain devm_memremap() based on config
*/
WARN_ON_ONCE(1);
return ERR_PTR(-ENXIO);
}
static inline void devm_memunmap_pages(struct device *dev,
struct dev_pagemap *pgmap)
{
}
static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
struct dev_pagemap *pgmap)
{
return NULL;
}
static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
{
return false;
}
static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
return 0;
}
static inline void vmem_altmap_free(struct vmem_altmap *altmap,
unsigned long nr_pfns)
{
}
/* when memremap_pages() is disabled all archs can remap a single page */
static inline unsigned long memremap_compat_align(void)
{
return PAGE_SIZE;
}
#endif /* CONFIG_ZONE_DEVICE */
static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
{
if (pgmap)
percpu_ref_put(pgmap->ref);
}
#endif /* _LINUX_MEMREMAP_H_ */
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SELinux NetLabel Support
*
* This file provides the necessary glue to tie NetLabel into the SELinux
* subsystem.
*
* Author: Paul Moore <paul@paul-moore.com>
*/
/*
* (c) Copyright Hewlett-Packard Development Company, L.P., 2007, 2008
*/
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/gfp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <net/sock.h>
#include <net/netlabel.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include "objsec.h"
#include "security.h"
#include "netlabel.h"
/**
* selinux_netlbl_sidlookup_cached - Cache a SID lookup
* @skb: the packet
* @secattr: the NetLabel security attributes
* @sid: the SID
*
* Description:
* Query the SELinux security server to lookup the correct SID for the given
* security attributes. If the query is successful, cache the result to speed
* up future lookups. Returns zero on success, negative values on failure.
*
*/
static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb,
u16 family,
struct netlbl_lsm_secattr *secattr,
u32 *sid)
{
int rc;
rc = security_netlbl_secattr_to_sid(&selinux_state, secattr, sid);
if (rc == 0 &&
(secattr->flags & NETLBL_SECATTR_CACHEABLE) &&
(secattr->flags & NETLBL_SECATTR_CACHE))
netlbl_cache_add(skb, family, secattr);
return rc;
}
/**
* selinux_netlbl_sock_genattr - Generate the NetLabel socket secattr
* @sk: the socket
*
* Description:
* Generate the NetLabel security attributes for a socket, making full use of
* the socket's attribute cache. Returns a pointer to the security attributes
* on success, NULL on failure.
*
*/
static struct netlbl_lsm_secattr *selinux_netlbl_sock_genattr(struct sock *sk)
{
int rc;
struct sk_security_struct *sksec = sk->sk_security;
struct netlbl_lsm_secattr *secattr;
if (sksec->nlbl_secattr != NULL)
return sksec->nlbl_secattr;
secattr = netlbl_secattr_alloc(GFP_ATOMIC);
if (secattr == NULL)
return NULL;
rc = security_netlbl_sid_to_secattr(&selinux_state, sksec->sid,
secattr);
if (rc != 0) {
netlbl_secattr_free(secattr);
return NULL;
}
sksec->nlbl_secattr = secattr;
return secattr;
}
/**
* selinux_netlbl_sock_getattr - Get the cached NetLabel secattr
* @sk: the socket
* @sid: the SID
*
* Query the socket's cached secattr and if the SID matches the cached value
* return the cache, otherwise return NULL.
*
*/
static struct netlbl_lsm_secattr *selinux_netlbl_sock_getattr(
const struct sock *sk,
u32 sid)
{
struct sk_security_struct *sksec = sk->sk_security;
struct netlbl_lsm_secattr *secattr = sksec->nlbl_secattr;
if (secattr == NULL)
return NULL;
if ((secattr->flags & NETLBL_SECATTR_SECID) &&
(secattr->attr.secid == sid))
return secattr;
return NULL;
}
/**
* selinux_netlbl_cache_invalidate - Invalidate the NetLabel cache
*
* Description:
* Invalidate the NetLabel security attribute mapping cache.
*
*/
void selinux_netlbl_cache_invalidate(void)
{
netlbl_cache_invalidate();
}
/**
* selinux_netlbl_err - Handle a NetLabel packet error
* @skb: the packet
* @error: the error code
* @gateway: true if host is acting as a gateway, false otherwise
*
* Description:
* When a packet is dropped due to a call to avc_has_perm() pass the error
* code to the NetLabel subsystem so any protocol specific processing can be
* done. This is safe to call even if you are unsure if NetLabel labeling is
* present on the packet, NetLabel is smart enough to only act when it should.
*
*/
void selinux_netlbl_err(struct sk_buff *skb, u16 family, int error, int gateway)
{
netlbl_skbuff_err(skb, family, error, gateway);
}
/**
* selinux_netlbl_sk_security_free - Free the NetLabel fields
* @sksec: the sk_security_struct
*
* Description:
* Free all of the memory in the NetLabel fields of a sk_security_struct.
*
*/
void selinux_netlbl_sk_security_free(struct sk_security_struct *sksec)
{
if (sksec->nlbl_secattr != NULL)
netlbl_secattr_free(sksec->nlbl_secattr);
}
/**
* selinux_netlbl_sk_security_reset - Reset the NetLabel fields
* @sksec: the sk_security_struct
* @family: the socket family
*
* Description:
* Called when the NetLabel state of a sk_security_struct needs to be reset.
* The caller is responsible for all the NetLabel sk_security_struct locking.
*
*/
void selinux_netlbl_sk_security_reset(struct sk_security_struct *sksec)
{
sksec->nlbl_state = NLBL_UNSET;
}
/**
* selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel
* @skb: the packet
* @family: protocol family
* @type: NetLabel labeling protocol type
* @sid: the SID
*
* Description:
* Call the NetLabel mechanism to get the security attributes of the given
* packet and use those attributes to determine the correct context/SID to
* assign to the packet. Returns zero on success, negative values on failure.
*
*/
int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
u16 family,
u32 *type,
u32 *sid)
{
int rc;
struct netlbl_lsm_secattr secattr;
if (!netlbl_enabled()) {
*sid = SECSID_NULL;
return 0;
}
netlbl_secattr_init(&secattr);
rc = netlbl_skbuff_getattr(skb, family, &secattr);
if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
rc = selinux_netlbl_sidlookup_cached(skb, family,
&secattr, sid);
else
*sid = SECSID_NULL;
*type = secattr.type;
netlbl_secattr_destroy(&secattr);
return rc;
}
/**
* selinux_netlbl_skbuff_setsid - Set the NetLabel on a packet given a sid
* @skb: the packet
* @family: protocol family
* @sid: the SID
*
* Description
* Call the NetLabel mechanism to set the label of a packet using @sid.
* Returns zero on success, negative values on failure.
*
*/
int selinux_netlbl_skbuff_setsid(struct sk_buff *skb,
u16 family,
u32 sid)
{
int rc;
struct netlbl_lsm_secattr secattr_storage;
struct netlbl_lsm_secattr *secattr = NULL;
struct sock *sk;
/* if this is a locally generated packet check to see if it is already
* being labeled by it's parent socket, if it is just exit */
sk = skb_to_full_sk(skb);
if (sk != NULL) {
struct sk_security_struct *sksec = sk->sk_security;
if (sksec->nlbl_state != NLBL_REQSKB)
return 0;
secattr = selinux_netlbl_sock_getattr(sk, sid);
}
if (secattr == NULL) {
secattr = &secattr_storage;
netlbl_secattr_init(secattr);
rc = security_netlbl_sid_to_secattr(&selinux_state, sid,
secattr);
if (rc != 0)
goto skbuff_setsid_return;
}
rc = netlbl_skbuff_setattr(skb, family, secattr);
skbuff_setsid_return:
if (secattr == &secattr_storage)
netlbl_secattr_destroy(secattr);
return rc;
}
/**
* selinux_netlbl_sctp_assoc_request - Label an incoming sctp association.
* @ep: incoming association endpoint.
* @skb: the packet.
*
* Description:
* A new incoming connection is represented by @ep, ......
* Returns zero on success, negative values on failure.
*
*/
int selinux_netlbl_sctp_assoc_request(struct sctp_endpoint *ep,
struct sk_buff *skb)
{
int rc;
struct netlbl_lsm_secattr secattr;
struct sk_security_struct *sksec = ep->base.sk->sk_security;
struct sockaddr_in addr4;
struct sockaddr_in6 addr6;
if (ep->base.sk->sk_family != PF_INET &&
ep->base.sk->sk_family != PF_INET6)
return 0;
netlbl_secattr_init(&secattr);
rc = security_netlbl_sid_to_secattr(&selinux_state,
ep->secid, &secattr);
if (rc != 0)
goto assoc_request_return;
/* Move skb hdr address info to a struct sockaddr and then call
* netlbl_conn_setattr().
*/
if (ip_hdr(skb)->version == 4) {
addr4.sin_family = AF_INET;
addr4.sin_addr.s_addr = ip_hdr(skb)->saddr;
rc = netlbl_conn_setattr(ep->base.sk, (void *)&addr4, &secattr);
} else if (IS_ENABLED(CONFIG_IPV6) && ip_hdr(skb)->version == 6) {
addr6.sin6_family = AF_INET6;
addr6.sin6_addr = ipv6_hdr(skb)->saddr;
rc = netlbl_conn_setattr(ep->base.sk, (void *)&addr6, &secattr);
} else {
rc = -EAFNOSUPPORT;
}
if (rc == 0)
sksec->nlbl_state = NLBL_LABELED;
assoc_request_return:
netlbl_secattr_destroy(&secattr);
return rc;
}
/**
* selinux_netlbl_inet_conn_request - Label an incoming stream connection
* @req: incoming connection request socket
*
* Description:
* A new incoming connection request is represented by @req, we need to label
* the new request_sock here and the stack will ensure the on-the-wire label
* will get preserved when a full sock is created once the connection handshake
* is complete. Returns zero on success, negative values on failure.
*
*/
int selinux_netlbl_inet_conn_request(struct request_sock *req, u16 family)
{
int rc;
struct netlbl_lsm_secattr secattr;
if (family != PF_INET && family != PF_INET6)
return 0;
netlbl_secattr_init(&secattr);
rc = security_netlbl_sid_to_secattr(&selinux_state, req->secid,
&secattr);
if (rc != 0)
goto inet_conn_request_return;
rc = netlbl_req_setattr(req, &secattr);
inet_conn_request_return:
netlbl_secattr_destroy(&secattr);
return rc;
}
/**
* selinux_netlbl_inet_csk_clone - Initialize the newly created sock
* @sk: the new sock
*
* Description:
* A new connection has been established using @sk, we've already labeled the
* socket via the request_sock struct in selinux_netlbl_inet_conn_request() but
* we need to set the NetLabel state here since we now have a sock structure.
*
*/
void selinux_netlbl_inet_csk_clone(struct sock *sk, u16 family)
{
struct sk_security_struct *sksec = sk->sk_security;
if (family == PF_INET)
sksec->nlbl_state = NLBL_LABELED;
else
sksec->nlbl_state = NLBL_UNSET;
}
/**
* selinux_netlbl_sctp_sk_clone - Copy state to the newly created sock
* @sk: current sock
* @newsk: the new sock
*
* Description:
* Called whenever a new socket is created by accept(2) or sctp_peeloff(3).
*/
void selinux_netlbl_sctp_sk_clone(struct sock *sk, struct sock *newsk)
{
struct sk_security_struct *sksec = sk->sk_security;
struct sk_security_struct *newsksec = newsk->sk_security;
newsksec->nlbl_state = sksec->nlbl_state;
}
/**
* selinux_netlbl_socket_post_create - Label a socket using NetLabel
* @sock: the socket to label
* @family: protocol family
*
* Description:
* Attempt to label a socket using the NetLabel mechanism using the given
* SID. Returns zero values on success, negative values on failure.
*
*/
int selinux_netlbl_socket_post_create(struct sock *sk, u16 family)
{
int rc;
struct sk_security_struct *sksec = sk->sk_security;
struct netlbl_lsm_secattr *secattr;
if (family != PF_INET && family != PF_INET6)
return 0;
secattr = selinux_netlbl_sock_genattr(sk);
if (secattr == NULL)
return -ENOMEM;
rc = netlbl_sock_setattr(sk, family, secattr);
switch (rc) {
case 0:
sksec->nlbl_state = NLBL_LABELED; break;
case -EDESTADDRREQ:
sksec->nlbl_state = NLBL_REQSKB; rc = 0;
break;
}
return rc;
}
/**
* selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel
* @sksec: the sock's sk_security_struct
* @skb: the packet
* @family: protocol family
* @ad: the audit data
*
* Description:
* Fetch the NetLabel security attributes from @skb and perform an access check
* against the receiving socket. Returns zero on success, negative values on
* error.
*
*/
int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
struct sk_buff *skb,
u16 family,
struct common_audit_data *ad)
{
int rc;
u32 nlbl_sid;
u32 perm;
struct netlbl_lsm_secattr secattr;
if (!netlbl_enabled())
return 0;
netlbl_secattr_init(&secattr);
rc = netlbl_skbuff_getattr(skb, family, &secattr);
if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE)
rc = selinux_netlbl_sidlookup_cached(skb, family,
&secattr, &nlbl_sid);
else
nlbl_sid = SECINITSID_UNLABELED;
netlbl_secattr_destroy(&secattr);
if (rc != 0)
return rc;
switch (sksec->sclass) {
case SECCLASS_UDP_SOCKET:
perm = UDP_SOCKET__RECVFROM;
break;
case SECCLASS_TCP_SOCKET:
perm = TCP_SOCKET__RECVFROM;
break;
default:
perm = RAWIP_SOCKET__RECVFROM;
}
rc = avc_has_perm(&selinux_state,
sksec->sid, nlbl_sid, sksec->sclass, perm, ad);
if (rc == 0)
return 0;
if (nlbl_sid != SECINITSID_UNLABELED)
netlbl_skbuff_err(skb, family, rc, 0);
return rc;
}
/**
* selinux_netlbl_option - Is this a NetLabel option
* @level: the socket level or protocol
* @optname: the socket option name
*
* Description:
* Returns true if @level and @optname refer to a NetLabel option.
* Helper for selinux_netlbl_socket_setsockopt().
*/
static inline int selinux_netlbl_option(int level, int optname)
{
return (level == IPPROTO_IP && optname == IP_OPTIONS) ||
(level == IPPROTO_IPV6 && optname == IPV6_HOPOPTS);
}
/**
* selinux_netlbl_socket_setsockopt - Do not allow users to remove a NetLabel
* @sock: the socket
* @level: the socket level or protocol
* @optname: the socket option name
*
* Description:
* Check the setsockopt() call and if the user is trying to replace the IP
* options on a socket and a NetLabel is in place for the socket deny the
* access; otherwise allow the access. Returns zero when the access is
* allowed, -EACCES when denied, and other negative values on error.
*
*/
int selinux_netlbl_socket_setsockopt(struct socket *sock,
int level,
int optname)
{
int rc = 0;
struct sock *sk = sock->sk;
struct sk_security_struct *sksec = sk->sk_security;
struct netlbl_lsm_secattr secattr;
if (selinux_netlbl_option(level, optname) &&
(sksec->nlbl_state == NLBL_LABELED ||
sksec->nlbl_state == NLBL_CONNLABELED)) {
netlbl_secattr_init(&secattr);
lock_sock(sk);
/* call the netlabel function directly as we want to see the
* on-the-wire label that is assigned via the socket's options
* and not the cached netlabel/lsm attributes */
rc = netlbl_sock_getattr(sk, &secattr);
release_sock(sk);
if (rc == 0)
rc = -EACCES;
else if (rc == -ENOMSG)
rc = 0;
netlbl_secattr_destroy(&secattr);
}
return rc;
}
/**
* selinux_netlbl_socket_connect_helper - Help label a client-side socket on
* connect
* @sk: the socket to label
* @addr: the destination address
*
* Description:
* Attempt to label a connected socket with NetLabel using the given address.
* Returns zero values on success, negative values on failure.
*
*/
static int selinux_netlbl_socket_connect_helper(struct sock *sk,
struct sockaddr *addr)
{
int rc;
struct sk_security_struct *sksec = sk->sk_security;
struct netlbl_lsm_secattr *secattr;
/* connected sockets are allowed to disconnect when the address family
* is set to AF_UNSPEC, if that is what is happening we want to reset
* the socket */
if (addr->sa_family == AF_UNSPEC) {
netlbl_sock_delattr(sk);
sksec->nlbl_state = NLBL_REQSKB;
rc = 0;
return rc;
}
secattr = selinux_netlbl_sock_genattr(sk);
if (secattr == NULL) {
rc = -ENOMEM;
return rc;
}
rc = netlbl_conn_setattr(sk, addr, secattr);
if (rc == 0)
sksec->nlbl_state = NLBL_CONNLABELED;
return rc;
}
/**
* selinux_netlbl_socket_connect_locked - Label a client-side socket on
* connect
* @sk: the socket to label
* @addr: the destination address
*
* Description:
* Attempt to label a connected socket that already has the socket locked
* with NetLabel using the given address.
* Returns zero values on success, negative values on failure.
*
*/
int selinux_netlbl_socket_connect_locked(struct sock *sk,
struct sockaddr *addr)
{
struct sk_security_struct *sksec = sk->sk_security;
if (sksec->nlbl_state != NLBL_REQSKB &&
sksec->nlbl_state != NLBL_CONNLABELED)
return 0;
return selinux_netlbl_socket_connect_helper(sk, addr);
}
/**
* selinux_netlbl_socket_connect - Label a client-side socket on connect
* @sk: the socket to label
* @addr: the destination address
*
* Description:
* Attempt to label a connected socket with NetLabel using the given address.
* Returns zero values on success, negative values on failure.
*
*/
int selinux_netlbl_socket_connect(struct sock *sk, struct sockaddr *addr)
{
int rc;
lock_sock(sk);
rc = selinux_netlbl_socket_connect_locked(sk, addr);
release_sock(sk);
return rc;
}
/*
* kmod - the kernel module loader
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/binfmts.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/resource.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/rwsem.h>
#include <linux/ptrace.h>
#include <linux/async.h>
#include <linux/uaccess.h>
#include <trace/events/module.h>
/*
* Assuming:
*
* threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
* (u64) THREAD_SIZE * 8UL);
*
* If you need less than 50 threads would mean we're dealing with systems
* smaller than 3200 pages. This assumes you are capable of having ~13M memory,
* and this would only be an upper limit, after which the OOM killer would take
* effect. Systems like these are very unlikely if modules are enabled.
*/
#define MAX_KMOD_CONCURRENT 50
static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
/*
* This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
* running at the same time without returning. When this happens we
* believe you've somehow ended up with a recursive module dependency
* creating a loop.
*
* We have no option but to fail.
*
* Userspace should proactively try to detect and prevent these.
*/
#define MAX_KMOD_ALL_BUSY_TIMEOUT 5
/*
modprobe_path is set via /proc/sys.
*/
char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH;
static void free_modprobe_argv(struct subprocess_info *info)
{
kfree(info->argv[3]); /* check call_modprobe() */
kfree(info->argv);
}
static int call_modprobe(char *module_name, int wait)
{
struct subprocess_info *info;
static char *envp[] = {
"HOME=/",
"TERM=linux",
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL
};
char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
if (!argv)
goto out;
module_name = kstrdup(module_name, GFP_KERNEL);
if (!module_name)
goto free_argv;
argv[0] = modprobe_path;
argv[1] = "-q";
argv[2] = "--";
argv[3] = module_name; /* check free_modprobe_argv() */
argv[4] = NULL;
info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
NULL, free_modprobe_argv, NULL);
if (!info)
goto free_module_name;
return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
free_module_name:
kfree(module_name);
free_argv:
kfree(argv);
out:
return -ENOMEM;
}
/**
* __request_module - try to load a kernel module
* @wait: wait (or not) for the operation to complete
* @fmt: printf style format string for the name of the module
* @...: arguments as specified in the format string
*
* Load a module using the user mode module loader. The function returns
* zero on success or a negative errno code or positive exit code from
* "modprobe" on failure. Note that a successful module load does not mean
* the module did not then unload and exit on an error of its own. Callers
* must check that the service they requested is now available not blindly
* invoke it.
*
* If module auto-loading support is disabled then this function
* simply returns -ENOENT.
*/
int __request_module(bool wait, const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
int ret;
/*
* We don't allow synchronous module loading from async. Module
* init may invoke async_synchronize_full() which will end up
* waiting for this task which already is waiting for the module
* loading to complete, leading to a deadlock.
*/
WARN_ON_ONCE(wait && current_is_async()); if (!modprobe_path[0])
return -ENOENT;
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
if (ret >= MODULE_NAME_LEN)
return -ENAMETOOLONG; ret = security_kernel_module_request(module_name); if (ret)
return ret;
if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
atomic_read(&kmod_concurrent_max),
MAX_KMOD_CONCURRENT, module_name);
ret = wait_event_killable_timeout(kmod_wq,
atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
if (!ret) {
pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
return -ETIME;
} else if (ret == -ERESTARTSYS) { pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
return ret;
}
}
trace_module_request(module_name, wait, _RET_IP_); ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
atomic_inc(&kmod_concurrent_max);
wake_up(&kmod_wq);
return ret;
}
EXPORT_SYMBOL(__request_module);
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/fs/locks.c
*
* Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls.
* Doug Evans (dje@spiff.uucp), August 07, 1992
*
* Deadlock detection added.
* FIXME: one thing isn't handled yet:
* - mandatory locks (requires lots of changes elsewhere)
* Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994.
*
* Miscellaneous edits, and a total rewrite of posix_lock_file() code.
* Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994
*
* Converted file_lock_table to a linked list from an array, which eliminates
* the limits on how many active file locks are open.
* Chad Page (pageone@netcom.com), November 27, 1994
*
* Removed dependency on file descriptors. dup()'ed file descriptors now
* get the same locks as the original file descriptors, and a close() on
* any file descriptor removes ALL the locks on the file for the current
* process. Since locks still depend on the process id, locks are inherited
* after an exec() but not after a fork(). This agrees with POSIX, and both
* BSD and SVR4 practice.
* Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995
*
* Scrapped free list which is redundant now that we allocate locks
* dynamically with kmalloc()/kfree().
* Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995
*
* Implemented two lock personalities - FL_FLOCK and FL_POSIX.
*
* FL_POSIX locks are created with calls to fcntl() and lockf() through the
* fcntl() system call. They have the semantics described above.
*
* FL_FLOCK locks are created with calls to flock(), through the flock()
* system call, which is new. Old C libraries implement flock() via fcntl()
* and will continue to use the old, broken implementation.
*
* FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated
* with a file pointer (filp). As a result they can be shared by a parent
* process and its children after a fork(). They are removed when the last
* file descriptor referring to the file pointer is closed (unless explicitly
* unlocked).
*
* FL_FLOCK locks never deadlock, an existing lock is always removed before
* upgrading from shared to exclusive (or vice versa). When this happens
* any processes blocked by the current lock are woken up and allowed to
* run before the new lock is applied.
* Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995
*
* Removed some race conditions in flock_lock_file(), marked other possible
* races. Just grep for FIXME to see them.
* Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996.
*
* Addressed Dmitry's concerns. Deadlock checking no longer recursive.
* Lock allocation changed to GFP_ATOMIC as we can't afford to sleep
* once we've checked for blocking and deadlocking.
* Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996.
*
* Initial implementation of mandatory locks. SunOS turned out to be
* a rotten model, so I implemented the "obvious" semantics.
* See 'Documentation/filesystems/mandatory-locking.rst' for details.
* Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
*
* Don't allow mandatory locks on mmap()'ed files. Added simple functions to
* check if a file has mandatory locks, used by mmap(), open() and creat() to
* see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference
* Manual, Section 2.
* Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996.
*
* Tidied up block list handling. Added '/proc/locks' interface.
* Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996.
*
* Fixed deadlock condition for pathological code that mixes calls to
* flock() and fcntl().
* Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996.
*
* Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use
* for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to
* guarantee sensible behaviour in the case where file system modules might
* be compiled with different options than the kernel itself.
* Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
*
* Added a couple of missing wake_up() calls. Thanks to Thomas Meckel
* (Thomas.Meckel@mni.fh-giessen.de) for spotting this.
* Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996.
*
* Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK
* locks. Changed process synchronisation to avoid dereferencing locks that
* have already been freed.
* Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996.
*
* Made the block list a circular list to minimise searching in the list.
* Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996.
*
* Made mandatory locking a mount option. Default is not to allow mandatory
* locking.
* Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996.
*
* Some adaptations for NFS support.
* Olaf Kirch (okir@monad.swb.de), Dec 1996,
*
* Fixed /proc/locks interface so that we can't overrun the buffer we are handed.
* Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997.
*
* Use slab allocator instead of kmalloc/kfree.
* Use generic list implementation from <linux/list.h>.
* Sped up posix_locks_deadlock by only considering blocked locks.
* Matthew Wilcox <willy@debian.org>, March, 2000.
*
* Leases and LOCK_MAND
* Matthew Wilcox <willy@debian.org>, June, 2000.
* Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000.
*
* Locking conflicts and dependencies:
* If multiple threads attempt to lock the same byte (or flock the same file)
* only one can be granted the lock, and other must wait their turn.
* The first lock has been "applied" or "granted", the others are "waiting"
* and are "blocked" by the "applied" lock..
*
* Waiting and applied locks are all kept in trees whose properties are:
*
* - the root of a tree may be an applied or waiting lock.
* - every other node in the tree is a waiting lock that
* conflicts with every ancestor of that node.
*
* Every such tree begins life as a waiting singleton which obviously
* satisfies the above properties.
*
* The only ways we modify trees preserve these properties:
*
* 1. We may add a new leaf node, but only after first verifying that it
* conflicts with all of its ancestors.
* 2. We may remove the root of a tree, creating a new singleton
* tree from the root and N new trees rooted in the immediate
* children.
* 3. If the root of a tree is not currently an applied lock, we may
* apply it (if possible).
* 4. We may upgrade the root of the tree (either extend its range,
* or upgrade its entire range from read to write).
*
* When an applied lock is modified in a way that reduces or downgrades any
* part of its range, we remove all its children (2 above). This particularly
* happens when a lock is unlocked.
*
* For each of those child trees we "wake up" the thread which is
* waiting for the lock so it can continue handling as follows: if the
* root of the tree applies, we do so (3). If it doesn't, it must
* conflict with some applied lock. We remove (wake up) all of its children
* (2), and add it is a new leaf to the tree rooted in the applied
* lock (1). We then repeat the process recursively with those
* children.
*
*/
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/time.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/hashtable.h>
#include <linux/percpu.h>
#define CREATE_TRACE_POINTS
#include <trace/events/filelock.h>
#include <linux/uaccess.h>
#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
#define IS_REMOTELCK(fl) (fl->fl_pid <= 0)
static bool lease_breaking(struct file_lock *fl)
{
return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
}
static int target_leasetype(struct file_lock *fl)
{
if (fl->fl_flags & FL_UNLOCK_PENDING)
return F_UNLCK;
if (fl->fl_flags & FL_DOWNGRADE_PENDING)
return F_RDLCK;
return fl->fl_type;
}
int leases_enable = 1;
int lease_break_time = 45;
/*
* The global file_lock_list is only used for displaying /proc/locks, so we
* keep a list on each CPU, with each list protected by its own spinlock.
* Global serialization is done using file_rwsem.
*
* Note that alterations to the list also require that the relevant flc_lock is
* held.
*/
struct file_lock_list_struct {
spinlock_t lock;
struct hlist_head hlist;
};
static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);
/*
* The blocked_hash is used to find POSIX lock loops for deadlock detection.
* It is protected by blocked_lock_lock.
*
* We hash locks by lockowner in order to optimize searching for the lock a
* particular lockowner is waiting on.
*
* FIXME: make this value scale via some heuristic? We generally will want more
* buckets when we have more lockowners holding locks, but that's a little
* difficult to determine without knowing what the workload will look like.
*/
#define BLOCKED_HASH_BITS 7
static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
/*
* This lock protects the blocked_hash. Generally, if you're accessing it, you
* want to be holding this lock.
*
* In addition, it also protects the fl->fl_blocked_requests list, and the
* fl->fl_blocker pointer for file_lock structures that are acting as lock
* requests (in contrast to those that are acting as records of acquired locks).
*
* Note that when we acquire this lock in order to change the above fields,
* we often hold the flc_lock as well. In certain cases, when reading the fields
* protected by this lock, we can skip acquiring it iff we already hold the
* flc_lock.
*/
static DEFINE_SPINLOCK(blocked_lock_lock);
static struct kmem_cache *flctx_cache __read_mostly;
static struct kmem_cache *filelock_cache __read_mostly;
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
{
struct file_lock_context *ctx;
/* paired with cmpxchg() below */
ctx = smp_load_acquire(&inode->i_flctx); if (likely(ctx) || type == F_UNLCK)
goto out;
ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
if (!ctx)
goto out;
spin_lock_init(&ctx->flc_lock);
INIT_LIST_HEAD(&ctx->flc_flock);
INIT_LIST_HEAD(&ctx->flc_posix);
INIT_LIST_HEAD(&ctx->flc_lease);
/*
* Assign the pointer if it's not already assigned. If it is, then
* free the context we just allocated.
*/
if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
kmem_cache_free(flctx_cache, ctx);
ctx = smp_load_acquire(&inode->i_flctx);
}
out:
trace_locks_get_lock_context(inode, type, ctx);
return ctx;
}
static void
locks_dump_ctx_list(struct list_head *list, char *list_type)
{
struct file_lock *fl;
list_for_each_entry(fl, list, fl_list) {
pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
}
}
static void
locks_check_ctx_lists(struct inode *inode)
{
struct file_lock_context *ctx = inode->i_flctx;
if (unlikely(!list_empty(&ctx->flc_flock) ||
!list_empty(&ctx->flc_posix) ||
!list_empty(&ctx->flc_lease))) {
pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n",
MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
inode->i_ino);
locks_dump_ctx_list(&ctx->flc_flock, "FLOCK");
locks_dump_ctx_list(&ctx->flc_posix, "POSIX");
locks_dump_ctx_list(&ctx->flc_lease, "LEASE");
}
}
static void
locks_check_ctx_file_list(struct file *filp, struct list_head *list,
char *list_type)
{
struct file_lock *fl;
struct inode *inode = locks_inode(filp); list_for_each_entry(fl, list, fl_list) if (fl->fl_file == filp)
pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx "
" fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
list_type, MAJOR(inode->i_sb->s_dev),
MINOR(inode->i_sb->s_dev), inode->i_ino,
fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
}
void
locks_free_lock_context(struct inode *inode)
{
struct file_lock_context *ctx = inode->i_flctx;
if (unlikely(ctx)) {
locks_check_ctx_lists(inode);
kmem_cache_free(flctx_cache, ctx);
}
}
static void locks_init_lock_heads(struct file_lock *fl)
{
INIT_HLIST_NODE(&fl->fl_link);
INIT_LIST_HEAD(&fl->fl_list);
INIT_LIST_HEAD(&fl->fl_blocked_requests);
INIT_LIST_HEAD(&fl->fl_blocked_member);
init_waitqueue_head(&fl->fl_wait);
}
/* Allocate an empty lock structure. */
struct file_lock *locks_alloc_lock(void)
{
struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);
if (fl)
locks_init_lock_heads(fl);
return fl;
}
EXPORT_SYMBOL_GPL(locks_alloc_lock);
void locks_release_private(struct file_lock *fl)
{
BUG_ON(waitqueue_active(&fl->fl_wait));
BUG_ON(!list_empty(&fl->fl_list));
BUG_ON(!list_empty(&fl->fl_blocked_requests));
BUG_ON(!list_empty(&fl->fl_blocked_member));
BUG_ON(!hlist_unhashed(&fl->fl_link));
if (fl->fl_ops) {
if (fl->fl_ops->fl_release_private)
fl->fl_ops->fl_release_private(fl);
fl->fl_ops = NULL;
}
if (fl->fl_lmops) {
if (fl->fl_lmops->lm_put_owner) {
fl->fl_lmops->lm_put_owner(fl->fl_owner);
fl->fl_owner = NULL;
}
fl->fl_lmops = NULL;
}
}
EXPORT_SYMBOL_GPL(locks_release_private);
/* Free a lock which is not in use. */
void locks_free_lock(struct file_lock *fl)
{
locks_release_private(fl);
kmem_cache_free(filelock_cache, fl);
}
EXPORT_SYMBOL(locks_free_lock);
static void
locks_dispose_list(struct list_head *dispose)
{
struct file_lock *fl;
while (!list_empty(dispose)) {
fl = list_first_entry(dispose, struct file_lock, fl_list);
list_del_init(&fl->fl_list);
locks_free_lock(fl);
}
}
void locks_init_lock(struct file_lock *fl)
{
memset(fl, 0, sizeof(struct file_lock));
locks_init_lock_heads(fl);
}
EXPORT_SYMBOL(locks_init_lock);
/*
* Initialize a new lock from an existing file_lock structure.
*/
void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
new->fl_owner = fl->fl_owner;
new->fl_pid = fl->fl_pid;
new->fl_file = NULL;
new->fl_flags = fl->fl_flags;
new->fl_type = fl->fl_type;
new->fl_start = fl->fl_start;
new->fl_end = fl->fl_end;
new->fl_lmops = fl->fl_lmops;
new->fl_ops = NULL;
if (fl->fl_lmops) {
if (fl->fl_lmops->lm_get_owner)
fl->fl_lmops->lm_get_owner(fl->fl_owner);
}
}
EXPORT_SYMBOL(locks_copy_conflock);
void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
/* "new" must be a freshly-initialized lock */
WARN_ON_ONCE(new->fl_ops);
locks_copy_conflock(new, fl);
new->fl_file = fl->fl_file;
new->fl_ops = fl->fl_ops;
if (fl->fl_ops) {
if (fl->fl_ops->fl_copy_lock)
fl->fl_ops->fl_copy_lock(new, fl);
}
}
EXPORT_SYMBOL(locks_copy_lock);
static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
{
struct file_lock *f;
/*
* As ctx->flc_lock is held, new requests cannot be added to
* ->fl_blocked_requests, so we don't need a lock to check if it
* is empty.
*/
if (list_empty(&fl->fl_blocked_requests))
return;
spin_lock(&blocked_lock_lock);
list_splice_init(&fl->fl_blocked_requests, &new->fl_blocked_requests);
list_for_each_entry(f, &new->fl_blocked_requests, fl_blocked_member)
f->fl_blocker = new;
spin_unlock(&blocked_lock_lock);
}
static inline int flock_translate_cmd(int cmd) {
if (cmd & LOCK_MAND) return cmd & (LOCK_MAND | LOCK_RW); switch (cmd) {
case LOCK_SH:
return F_RDLCK;
case LOCK_EX:
return F_WRLCK;
case LOCK_UN:
return F_UNLCK;
}
return -EINVAL;
}
/* Fill in a file_lock structure with an appropriate FLOCK lock. */
static struct file_lock *
flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl)
{
int type = flock_translate_cmd(cmd); if (type < 0)
return ERR_PTR(type);
if (fl == NULL) { fl = locks_alloc_lock();
if (fl == NULL)
return ERR_PTR(-ENOMEM);
} else {
locks_init_lock(fl);
}
fl->fl_file = filp;
fl->fl_owner = filp;
fl->fl_pid = current->tgid;
fl->fl_flags = FL_FLOCK;
fl->fl_type = type;
fl->fl_end = OFFSET_MAX;
return fl;}
static int assign_type(struct file_lock *fl, long type)
{
switch (type) {
case F_RDLCK:
case F_WRLCK:
case F_UNLCK:
fl->fl_type = type;
break;
default:
return -EINVAL;
}
return 0;
}
static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
struct flock64 *l)
{
switch (l->l_whence) {
case SEEK_SET:
fl->fl_start = 0;
break;
case SEEK_CUR:
fl->fl_start = filp->f_pos;
break;
case SEEK_END:
fl->fl_start = i_size_read(file_inode(filp));
break;
default:
return -EINVAL;
}
if (l->l_start > OFFSET_MAX - fl->fl_start)
return -EOVERFLOW;
fl->fl_start += l->l_start;
if (fl->fl_start < 0)
return -EINVAL;
/* POSIX-1996 leaves the case l->l_len < 0 undefined;
POSIX-2001 defines it. */
if (l->l_len > 0) {
if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
return -EOVERFLOW;
fl->fl_end = fl->fl_start + (l->l_len - 1);
} else if (l->l_len < 0) {
if (fl->fl_start + l->l_len < 0)
return -EINVAL;
fl->fl_end = fl->fl_start - 1;
fl->fl_start += l->l_len;
} else
fl->fl_end = OFFSET_MAX;
fl->fl_owner = current->files;
fl->fl_pid = current->tgid;
fl->fl_file = filp;
fl->fl_flags = FL_POSIX;
fl->fl_ops = NULL;
fl->fl_lmops = NULL;
return assign_type(fl, l->l_type);
}
/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
* style lock.
*/
static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
struct flock *l)
{
struct flock64 ll = {
.l_type = l->l_type,
.l_whence = l->l_whence,
.l_start = l->l_start,
.l_len = l->l_len,
};
return flock64_to_posix_lock(filp, fl, &ll);
}
/* default lease lock manager operations */
static bool
lease_break_callback(struct file_lock *fl)
{
kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);
return false;
}
static void
lease_setup(struct file_lock *fl, void **priv)
{
struct file *filp = fl->fl_file;
struct fasync_struct *fa = *priv;
/*
* fasync_insert_entry() returns the old entry if any. If there was no
* old entry, then it used "priv" and inserted it into the fasync list.
* Clear the pointer to indicate that it shouldn't be freed.
*/
if (!fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa))
*priv = NULL;
__f_setown(filp, task_pid(current), PIDTYPE_TGID, 0);
}
static const struct lock_manager_operations lease_manager_ops = {
.lm_break = lease_break_callback,
.lm_change = lease_modify,
.lm_setup = lease_setup,
};
/*
* Initialize a lease, use the default lock manager operations
*/
static int lease_init(struct file *filp, long type, struct file_lock *fl)
{
if (assign_type(fl, type) != 0)
return -EINVAL;
fl->fl_owner = filp;
fl->fl_pid = current->tgid;
fl->fl_file = filp;
fl->fl_flags = FL_LEASE;
fl->fl_start = 0;
fl->fl_end = OFFSET_MAX;
fl->fl_ops = NULL;
fl->fl_lmops = &lease_manager_ops;
return 0;
}
/* Allocate a file_lock initialised to this type of lease */
static struct file_lock *lease_alloc(struct file *filp, long type)
{
struct file_lock *fl = locks_alloc_lock();
int error = -ENOMEM;
if (fl == NULL)
return ERR_PTR(error);
error = lease_init(filp, type, fl);
if (error) {
locks_free_lock(fl);
return ERR_PTR(error);
}
return fl;
}
/* Check if two locks overlap each other.
*/
static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2)
{
return ((fl1->fl_end >= fl2->fl_start) &&
(fl2->fl_end >= fl1->fl_start));
}
/*
* Check whether two locks have the same owner.
*/
static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
{
return fl1->fl_owner == fl2->fl_owner;
}
/* Must be called with the flc_lock held! */
static void locks_insert_global_locks(struct file_lock *fl)
{
struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list);
percpu_rwsem_assert_held(&file_rwsem);
spin_lock(&fll->lock);
fl->fl_link_cpu = smp_processor_id();
hlist_add_head(&fl->fl_link, &fll->hlist);
spin_unlock(&fll->lock);
}
/* Must be called with the flc_lock held! */
static void locks_delete_global_locks(struct file_lock *fl)
{
struct file_lock_list_struct *fll;
percpu_rwsem_assert_held(&file_rwsem);
/*
* Avoid taking lock if already unhashed. This is safe since this check
* is done while holding the flc_lock, and new insertions into the list
* also require that it be held.
*/
if (hlist_unhashed(&fl->fl_link))
return;
fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu);
spin_lock(&fll->lock);
hlist_del_init(&fl->fl_link);
spin_unlock(&fll->lock);
}
static unsigned long
posix_owner_key(struct file_lock *fl)
{
return (unsigned long)fl->fl_owner;
}
static void locks_insert_global_blocked(struct file_lock *waiter)
{
lockdep_assert_held(&blocked_lock_lock);
hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
}
static void locks_delete_global_blocked(struct file_lock *waiter)
{
lockdep_assert_held(&blocked_lock_lock);
hash_del(&waiter->fl_link);
}
/* Remove waiter from blocker's block list.
* When blocker ends up pointing to itself then the list is empty.
*
* Must be called with blocked_lock_lock held.
*/
static void __locks_delete_block(struct file_lock *waiter)
{
locks_delete_global_blocked(waiter);
list_del_init(&waiter->fl_blocked_member);
}
static void __locks_wake_up_blocks(struct file_lock *blocker)
{
while (!list_empty(&blocker->fl_blocked_requests)) {
struct file_lock *waiter;
waiter = list_first_entry(&blocker->fl_blocked_requests,
struct file_lock, fl_blocked_member);
__locks_delete_block(waiter);
if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
waiter->fl_lmops->lm_notify(waiter);
else
wake_up(&waiter->fl_wait);
/*
* The setting of fl_blocker to NULL marks the "done"
* point in deleting a block. Paired with acquire at the top
* of locks_delete_block().
*/
smp_store_release(&waiter->fl_blocker, NULL);
}
}
/**
* locks_delete_block - stop waiting for a file lock
* @waiter: the lock which was waiting
*
* lockd/nfsd need to disconnect the lock while working on it.
*/
int locks_delete_block(struct file_lock *waiter)
{
int status = -ENOENT;
/*
* If fl_blocker is NULL, it won't be set again as this thread "owns"
* the lock and is the only one that might try to claim the lock.
*
* We use acquire/release to manage fl_blocker so that we can
* optimize away taking the blocked_lock_lock in many cases.
*
* The smp_load_acquire guarantees two things:
*
* 1/ that fl_blocked_requests can be tested locklessly. If something
* was recently added to that list it must have been in a locked region
* *before* the locked region when fl_blocker was set to NULL.
*
* 2/ that no other thread is accessing 'waiter', so it is safe to free
* it. __locks_wake_up_blocks is careful not to touch waiter after
* fl_blocker is released.
*
* If a lockless check of fl_blocker shows it to be NULL, we know that
* no new locks can be inserted into its fl_blocked_requests list, and
* can avoid doing anything further if the list is empty.
*/
if (!smp_load_acquire(&waiter->fl_blocker) &&
list_empty(&waiter->fl_blocked_requests))
return status;
spin_lock(&blocked_lock_lock);
if (waiter->fl_blocker)
status = 0;
__locks_wake_up_blocks(waiter);
__locks_delete_block(waiter);
/*
* The setting of fl_blocker to NULL marks the "done" point in deleting
* a block. Paired with acquire at the top of this function.
*/
smp_store_release(&waiter->fl_blocker, NULL);
spin_unlock(&blocked_lock_lock);
return status;
}
EXPORT_SYMBOL(locks_delete_block);
/* Insert waiter into blocker's block list.
* We use a circular list so that processes can be easily woken up in
* the order they blocked. The documentation doesn't require this but
* it seems like the reasonable thing to do.
*
* Must be called with both the flc_lock and blocked_lock_lock held. The
* fl_blocked_requests list itself is protected by the blocked_lock_lock,
* but by ensuring that the flc_lock is also held on insertions we can avoid
* taking the blocked_lock_lock in some cases when we see that the
* fl_blocked_requests list is empty.
*
* Rather than just adding to the list, we check for conflicts with any existing
* waiters, and add beneath any waiter that blocks the new waiter.
* Thus wakeups don't happen until needed.
*/
static void __locks_insert_block(struct file_lock *blocker,
struct file_lock *waiter,
bool conflict(struct file_lock *,
struct file_lock *))
{
struct file_lock *fl;
BUG_ON(!list_empty(&waiter->fl_blocked_member));
new_blocker:
list_for_each_entry(fl, &blocker->fl_blocked_requests, fl_blocked_member)
if (conflict(fl, waiter)) {
blocker = fl;
goto new_blocker;
}
waiter->fl_blocker = blocker;
list_add_tail(&waiter->fl_blocked_member, &blocker->fl_blocked_requests);
if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
locks_insert_global_blocked(waiter);
/* The requests in waiter->fl_blocked are known to conflict with
* waiter, but might not conflict with blocker, or the requests
* and lock which block it. So they all need to be woken.
*/
__locks_wake_up_blocks(waiter);
}
/* Must be called with flc_lock held. */
static void locks_insert_block(struct file_lock *blocker,
struct file_lock *waiter,
bool conflict(struct file_lock *,
struct file_lock *))
{
spin_lock(&blocked_lock_lock);
__locks_insert_block(blocker, waiter, conflict);
spin_unlock(&blocked_lock_lock);
}
/*
* Wake up processes blocked waiting for blocker.
*
* Must be called with the inode->flc_lock held!
*/
static void locks_wake_up_blocks(struct file_lock *blocker)
{
/*
* Avoid taking global lock if list is empty. This is safe since new
* blocked requests are only added to the list under the flc_lock, and
* the flc_lock is always held here. Note that removal from the
* fl_blocked_requests list does not require the flc_lock, so we must
* recheck list_empty() after acquiring the blocked_lock_lock.
*/
if (list_empty(&blocker->fl_blocked_requests))
return;
spin_lock(&blocked_lock_lock);
__locks_wake_up_blocks(blocker);
spin_unlock(&blocked_lock_lock);
}
static void
locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before)
{
list_add_tail(&fl->fl_list, before);
locks_insert_global_locks(fl);
}
static void
locks_unlink_lock_ctx(struct file_lock *fl)
{
locks_delete_global_locks(fl);
list_del_init(&fl->fl_list);
locks_wake_up_blocks(fl);
}
static void
locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose)
{
locks_unlink_lock_ctx(fl);
if (dispose)
list_add(&fl->fl_list, dispose);
else
locks_free_lock(fl);
}
/* Determine if lock sys_fl blocks lock caller_fl. Common functionality
* checks for shared/exclusive status of overlapping locks.
*/
static bool locks_conflict(struct file_lock *caller_fl,
struct file_lock *sys_fl)
{
if (sys_fl->fl_type == F_WRLCK)
return true;
if (caller_fl->fl_type == F_WRLCK)
return true;
return false;
}
/* Determine if lock sys_fl blocks lock caller_fl. POSIX specific
* checking before calling the locks_conflict().
*/
static bool posix_locks_conflict(struct file_lock *caller_fl,
struct file_lock *sys_fl)
{
/* POSIX locks owned by the same process do not conflict with
* each other.
*/
if (posix_same_owner(caller_fl, sys_fl))
return false;
/* Check whether they overlap */
if (!locks_overlap(caller_fl, sys_fl))
return false;
return locks_conflict(caller_fl, sys_fl);
}
/* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific
* checking before calling the locks_conflict().
*/
static bool flock_locks_conflict(struct file_lock *caller_fl,
struct file_lock *sys_fl)
{
/* FLOCK locks referring to the same filp do not conflict with
* each other.
*/
if (caller_fl->fl_file == sys_fl->fl_file)
return false;
if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND))
return false;
return locks_conflict(caller_fl, sys_fl);
}
void
posix_test_lock(struct file *filp, struct file_lock *fl)
{
struct file_lock *cfl;
struct file_lock_context *ctx;
struct inode *inode = locks_inode(filp);
ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
fl->fl_type = F_UNLCK;
return;
}
spin_lock(&ctx->flc_lock);
list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
if (posix_locks_conflict(fl, cfl)) {
locks_copy_conflock(fl, cfl);
goto out;
}
}
fl->fl_type = F_UNLCK;
out:
spin_unlock(&ctx->flc_lock);
return;
}
EXPORT_SYMBOL(posix_test_lock);
/*
* Deadlock detection:
*
* We attempt to detect deadlocks that are due purely to posix file
* locks.
*
* We assume that a task can be waiting for at most one lock at a time.
* So for any acquired lock, the process holding that lock may be
* waiting on at most one other lock. That lock in turns may be held by
* someone waiting for at most one other lock. Given a requested lock
* caller_fl which is about to wait for a conflicting lock block_fl, we
* follow this chain of waiters to ensure we are not about to create a
* cycle.
*
* Since we do this before we ever put a process to sleep on a lock, we
* are ensured that there is never a cycle; that is what guarantees that
* the while() loop in posix_locks_deadlock() eventually completes.
*
* Note: the above assumption may not be true when handling lock
* requests from a broken NFS client. It may also fail in the presence
* of tasks (such as posix threads) sharing the same open file table.
* To handle those cases, we just bail out after a few iterations.
*
* For FL_OFDLCK locks, the owner is the filp, not the files_struct.
* Because the owner is not even nominally tied to a thread of
* execution, the deadlock detection below can't reasonably work well. Just
* skip it for those.
*
* In principle, we could do a more limited deadlock detection on FL_OFDLCK
* locks that just checks for the case where two tasks are attempting to
* upgrade from read to write locks on the same inode.
*/
#define MAX_DEADLK_ITERATIONS 10
/* Find a lock that the owner of the given block_fl is blocking on. */
static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
{
struct file_lock *fl;
hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
if (posix_same_owner(fl, block_fl)) {
while (fl->fl_blocker)
fl = fl->fl_blocker;
return fl;
}
}
return NULL;
}
/* Must be called with the blocked_lock_lock held! */
static int posix_locks_deadlock(struct file_lock *caller_fl,
struct file_lock *block_fl)
{
int i = 0;
lockdep_assert_held(&blocked_lock_lock);
/*
* This deadlock detector can't reasonably detect deadlocks with
* FL_OFDLCK locks, since they aren't owned by a process, per-se.
*/
if (IS_OFDLCK(caller_fl))
return 0;
while ((block_fl = what_owner_is_waiting_for(block_fl))) {
if (i++ > MAX_DEADLK_ITERATIONS)
return 0;
if (posix_same_owner(caller_fl, block_fl))
return 1;
}
return 0;
}
/* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
* after any leases, but before any posix locks.
*
* Note that if called with an FL_EXISTS argument, the caller may determine
* whether or not a lock was successfully freed by testing the return
* value for -ENOENT.
*/
static int flock_lock_inode(struct inode *inode, struct file_lock *request)
{
struct file_lock *new_fl = NULL;
struct file_lock *fl;
struct file_lock_context *ctx;
int error = 0;
bool found = false;
LIST_HEAD(dispose);
ctx = locks_get_lock_context(inode, request->fl_type);
if (!ctx) {
if (request->fl_type != F_UNLCK)
return -ENOMEM;
return (request->fl_flags & FL_EXISTS) ? -ENOENT : 0;
}
if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { new_fl = locks_alloc_lock();
if (!new_fl)
return -ENOMEM;
}
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
if (request->fl_flags & FL_ACCESS)
goto find_conflict;
list_for_each_entry(fl, &ctx->flc_flock, fl_list) { if (request->fl_file != fl->fl_file)
continue;
if (request->fl_type == fl->fl_type)
goto out;
found = true;
locks_delete_lock_ctx(fl, &dispose);
break;
}
if (request->fl_type == F_UNLCK) { if ((request->fl_flags & FL_EXISTS) && !found)
error = -ENOENT;
goto out;
}
find_conflict:
list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
if (!flock_locks_conflict(request, fl))
continue;
error = -EAGAIN;
if (!(request->fl_flags & FL_SLEEP))
goto out;
error = FILE_LOCK_DEFERRED;
locks_insert_block(fl, request, flock_locks_conflict);
goto out;
}
if (request->fl_flags & FL_ACCESS)
goto out;
locks_copy_lock(new_fl, request);
locks_move_blocks(new_fl, request);
locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
new_fl = NULL;
error = 0;
out:
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
if (new_fl)
locks_free_lock(new_fl);
locks_dispose_list(&dispose);
trace_flock_lock_inode(inode, request, error);
return error;
}
static int posix_lock_inode(struct inode *inode, struct file_lock *request,
struct file_lock *conflock)
{
struct file_lock *fl, *tmp;
struct file_lock *new_fl = NULL;
struct file_lock *new_fl2 = NULL;
struct file_lock *left = NULL;
struct file_lock *right = NULL;
struct file_lock_context *ctx;
int error;
bool added = false;
LIST_HEAD(dispose);
ctx = locks_get_lock_context(inode, request->fl_type);
if (!ctx)
return (request->fl_type == F_UNLCK) ? 0 : -ENOMEM;
/*
* We may need two file_lock structures for this operation,
* so we get them in advance to avoid races.
*
* In some cases we can be sure, that no new locks will be needed
*/
if (!(request->fl_flags & FL_ACCESS) &&
(request->fl_type != F_UNLCK ||
request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
new_fl = locks_alloc_lock();
new_fl2 = locks_alloc_lock();
}
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
/*
* New lock request. Walk all POSIX locks and look for conflicts. If
* there are any, either return error or put the request on the
* blocker's list of waiters and the global blocked_hash.
*/
if (request->fl_type != F_UNLCK) {
list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
if (!posix_locks_conflict(request, fl))
continue;
if (conflock)
locks_copy_conflock(conflock, fl);
error = -EAGAIN;
if (!(request->fl_flags & FL_SLEEP))
goto out;
/*
* Deadlock detection and insertion into the blocked
* locks list must be done while holding the same lock!
*/
error = -EDEADLK;
spin_lock(&blocked_lock_lock);
/*
* Ensure that we don't find any locks blocked on this
* request during deadlock detection.
*/
__locks_wake_up_blocks(request);
if (likely(!posix_locks_deadlock(request, fl))) {
error = FILE_LOCK_DEFERRED;
__locks_insert_block(fl, request,
posix_locks_conflict);
}
spin_unlock(&blocked_lock_lock);
goto out;
}
}
/* If we're just looking for a conflict, we're done. */
error = 0;
if (request->fl_flags & FL_ACCESS)
goto out;
/* Find the first old lock with the same owner as the new lock */
list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
if (posix_same_owner(request, fl))
break;
}
/* Process locks with this owner. */
list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
if (!posix_same_owner(request, fl))
break;
/* Detect adjacent or overlapping regions (if same lock type) */
if (request->fl_type == fl->fl_type) {
/* In all comparisons of start vs end, use
* "start - 1" rather than "end + 1". If end
* is OFFSET_MAX, end + 1 will become negative.
*/
if (fl->fl_end < request->fl_start - 1)
continue;
/* If the next lock in the list has entirely bigger
* addresses than the new one, insert the lock here.
*/
if (fl->fl_start - 1 > request->fl_end)
break;
/* If we come here, the new and old lock are of the
* same type and adjacent or overlapping. Make one
* lock yielding from the lower start address of both
* locks to the higher end address.
*/
if (fl->fl_start > request->fl_start)
fl->fl_start = request->fl_start;
else
request->fl_start = fl->fl_start;
if (fl->fl_end < request->fl_end)
fl->fl_end = request->fl_end;
else
request->fl_end = fl->fl_end;
if (added) {
locks_delete_lock_ctx(fl, &dispose);
continue;
}
request = fl;
added = true;
} else {
/* Processing for different lock types is a bit
* more complex.
*/
if (fl->fl_end < request->fl_start)
continue;
if (fl->fl_start > request->fl_end)
break;
if (request->fl_type == F_UNLCK)
added = true;
if (fl->fl_start < request->fl_start)
left = fl;
/* If the next lock in the list has a higher end
* address than the new one, insert the new one here.
*/
if (fl->fl_end > request->fl_end) {
right = fl;
break;
}
if (fl->fl_start >= request->fl_start) {
/* The new lock completely replaces an old
* one (This may happen several times).
*/
if (added) {
locks_delete_lock_ctx(fl, &dispose);
continue;
}
/*
* Replace the old lock with new_fl, and
* remove the old one. It's safe to do the
* insert here since we know that we won't be
* using new_fl later, and that the lock is
* just replacing an existing lock.
*/
error = -ENOLCK;
if (!new_fl)
goto out;
locks_copy_lock(new_fl, request);
locks_move_blocks(new_fl, request);
request = new_fl;
new_fl = NULL;
locks_insert_lock_ctx(request, &fl->fl_list);
locks_delete_lock_ctx(fl, &dispose);
added = true;
}
}
}
/*
* The above code only modifies existing locks in case of merging or
* replacing. If new lock(s) need to be inserted all modifications are
* done below this, so it's safe yet to bail out.
*/
error = -ENOLCK; /* "no luck" */
if (right && left == right && !new_fl2)
goto out;
error = 0;
if (!added) {
if (request->fl_type == F_UNLCK) {
if (request->fl_flags & FL_EXISTS)
error = -ENOENT;
goto out;
}
if (!new_fl) {
error = -ENOLCK;
goto out;
}
locks_copy_lock(new_fl, request);
locks_move_blocks(new_fl, request);
locks_insert_lock_ctx(new_fl, &fl->fl_list);
fl = new_fl;
new_fl = NULL;
}
if (right) {
if (left == right) {
/* The new lock breaks the old one in two pieces,
* so we have to use the second new lock.
*/
left = new_fl2;
new_fl2 = NULL;
locks_copy_lock(left, right);
locks_insert_lock_ctx(left, &fl->fl_list);
}
right->fl_start = request->fl_end + 1;
locks_wake_up_blocks(right);
}
if (left) {
left->fl_end = request->fl_start - 1;
locks_wake_up_blocks(left);
}
out:
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
/*
* Free any unused locks.
*/
if (new_fl)
locks_free_lock(new_fl);
if (new_fl2)
locks_free_lock(new_fl2);
locks_dispose_list(&dispose);
trace_posix_lock_inode(inode, request, error);
return error;
}
/**
* posix_lock_file - Apply a POSIX-style lock to a file
* @filp: The file to apply the lock to
* @fl: The lock to be applied
* @conflock: Place to return a copy of the conflicting lock, if found.
*
* Add a POSIX style lock to a file.
* We merge adjacent & overlapping locks whenever possible.
* POSIX locks are sorted by owner task, then by starting address
*
* Note that if called with an FL_EXISTS argument, the caller may determine
* whether or not a lock was successfully freed by testing the return
* value for -ENOENT.
*/
int posix_lock_file(struct file *filp, struct file_lock *fl,
struct file_lock *conflock)
{
return posix_lock_inode(locks_inode(filp), fl, conflock);
}
EXPORT_SYMBOL(posix_lock_file);
/**
* posix_lock_inode_wait - Apply a POSIX-style lock to a file
* @inode: inode of file to which lock request should be applied
* @fl: The lock to be applied
*
* Apply a POSIX style lock request to an inode.
*/
static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int error;
might_sleep ();
for (;;) {
error = posix_lock_inode(inode, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl->fl_wait,
list_empty(&fl->fl_blocked_member));
if (error)
break;
}
locks_delete_block(fl);
return error;
}
static void lease_clear_pending(struct file_lock *fl, int arg)
{
switch (arg) {
case F_UNLCK:
fl->fl_flags &= ~FL_UNLOCK_PENDING;
fallthrough;
case F_RDLCK:
fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
}
}
/* We already had a lease on this file; just change its type */
int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
{
int error = assign_type(fl, arg);
if (error)
return error;
lease_clear_pending(fl, arg);
locks_wake_up_blocks(fl);
if (arg == F_UNLCK) {
struct file *filp = fl->fl_file;
f_delown(filp);
filp->f_owner.signum = 0;
fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync);
if (fl->fl_fasync != NULL) {
printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
fl->fl_fasync = NULL;
}
locks_delete_lock_ctx(fl, dispose);
}
return 0;
}
EXPORT_SYMBOL(lease_modify);
static bool past_time(unsigned long then)
{
if (!then)
/* 0 is a special value meaning "this never expires": */
return false;
return time_after(jiffies, then);
}
static void time_out_leases(struct inode *inode, struct list_head *dispose)
{
struct file_lock_context *ctx = inode->i_flctx;
struct file_lock *fl, *tmp;
lockdep_assert_held(&ctx->flc_lock);
list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
trace_time_out_leases(inode, fl);
if (past_time(fl->fl_downgrade_time))
lease_modify(fl, F_RDLCK, dispose);
if (past_time(fl->fl_break_time))
lease_modify(fl, F_UNLCK, dispose);
}
}
static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
{
bool rc;
if (lease->fl_lmops->lm_breaker_owns_lease
&& lease->fl_lmops->lm_breaker_owns_lease(lease))
return false;
if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) {
rc = false;
goto trace;
}
if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) {
rc = false;
goto trace;
}
rc = locks_conflict(breaker, lease);
trace:
trace_leases_conflict(rc, lease, breaker);
return rc;
}
static bool
any_leases_conflict(struct inode *inode, struct file_lock *breaker)
{
struct file_lock_context *ctx = inode->i_flctx;
struct file_lock *fl;
lockdep_assert_held(&ctx->flc_lock);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (leases_conflict(fl, breaker))
return true;
}
return false;
}
/**
* __break_lease - revoke all outstanding leases on file
* @inode: the inode of the file to return
* @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR:
* break all leases
* @type: FL_LEASE: break leases and delegations; FL_DELEG: break
* only delegations
*
* break_lease (inlined for speed) has checked there already is at least
* some kind of lock (maybe a lease) on this file. Leases are broken on
* a call to open() or truncate(). This function can sleep unless you
* specified %O_NONBLOCK to your open().
*/
int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
int error = 0;
struct file_lock_context *ctx;
struct file_lock *new_fl, *fl, *tmp;
unsigned long break_time;
int want_write = (mode & O_ACCMODE) != O_RDONLY;
LIST_HEAD(dispose);
new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
if (IS_ERR(new_fl))
return PTR_ERR(new_fl);
new_fl->fl_flags = type;
/* typically we will check that ctx is non-NULL before calling */
ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx) {
WARN_ON_ONCE(1);
goto free_lock;
}
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
if (!any_leases_conflict(inode, new_fl))
goto out;
break_time = 0;
if (lease_break_time > 0) {
break_time = jiffies + lease_break_time * HZ;
if (break_time == 0)
break_time++; /* so that 0 means no break time */
}
list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
if (!leases_conflict(fl, new_fl))
continue;
if (want_write) {
if (fl->fl_flags & FL_UNLOCK_PENDING)
continue;
fl->fl_flags |= FL_UNLOCK_PENDING;
fl->fl_break_time = break_time;
} else {
if (lease_breaking(fl))
continue;
fl->fl_flags |= FL_DOWNGRADE_PENDING;
fl->fl_downgrade_time = break_time;
}
if (fl->fl_lmops->lm_break(fl))
locks_delete_lock_ctx(fl, &dispose);
}
if (list_empty(&ctx->flc_lease))
goto out;
if (mode & O_NONBLOCK) {
trace_break_lease_noblock(inode, new_fl);
error = -EWOULDBLOCK;
goto out;
}
restart:
fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list);
break_time = fl->fl_break_time;
if (break_time != 0)
break_time -= jiffies;
if (break_time == 0)
break_time++;
locks_insert_block(fl, new_fl, leases_conflict);
trace_break_lease_block(inode, new_fl);
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
list_empty(&new_fl->fl_blocked_member),
break_time);
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
trace_break_lease_unblock(inode, new_fl);
locks_delete_block(new_fl);
if (error >= 0) {
/*
* Wait for the next conflicting lease that has not been
* broken yet
*/
if (error == 0)
time_out_leases(inode, &dispose);
if (any_leases_conflict(inode, new_fl))
goto restart;
error = 0;
}
out:
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
free_lock:
locks_free_lock(new_fl);
return error;
}
EXPORT_SYMBOL(__break_lease);
/**
* lease_get_mtime - update modified time of an inode with exclusive lease
* @inode: the inode
* @time: pointer to a timespec which contains the last modified time
*
* This is to force NFS clients to flush their caches for files with
* exclusive leases. The justification is that if someone has an
* exclusive lease, then they could be modifying it.
*/
void lease_get_mtime(struct inode *inode, struct timespec64 *time)
{
bool has_lease = false;
struct file_lock_context *ctx;
struct file_lock *fl;
ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
fl = list_first_entry_or_null(&ctx->flc_lease,
struct file_lock, fl_list);
if (fl && (fl->fl_type == F_WRLCK))
has_lease = true;
spin_unlock(&ctx->flc_lock);
}
if (has_lease)
*time = current_time(inode);
}
EXPORT_SYMBOL(lease_get_mtime);
/**
* fcntl_getlease - Enquire what lease is currently active
* @filp: the file
*
* The value returned by this function will be one of
* (if no lease break is pending):
*
* %F_RDLCK to indicate a shared lease is held.
*
* %F_WRLCK to indicate an exclusive lease is held.
*
* %F_UNLCK to indicate no lease is held.
*
* (if a lease break is pending):
*
* %F_RDLCK to indicate an exclusive lease needs to be
* changed to a shared lease (or removed).
*
* %F_UNLCK to indicate the lease needs to be removed.
*
* XXX: sfr & willy disagree over whether F_INPROGRESS
* should be returned to userspace.
*/
int fcntl_getlease(struct file *filp)
{
struct file_lock *fl;
struct inode *inode = locks_inode(filp);
struct file_lock_context *ctx;
int type = F_UNLCK;
LIST_HEAD(dispose);
ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (fl->fl_file != filp)
continue;
type = target_leasetype(fl);
break;
}
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
}
return type;
}
/**
* check_conflicting_open - see if the given file points to an inode that has
* an existing open that would conflict with the
* desired lease.
* @filp: file to check
* @arg: type of lease that we're trying to acquire
* @flags: current lock flags
*
* Check to see if there's an existing open fd on this file that would
* conflict with the lease we're trying to set.
*/
static int
check_conflicting_open(struct file *filp, const long arg, int flags)
{
struct inode *inode = locks_inode(filp);
int self_wcount = 0, self_rcount = 0;
if (flags & FL_LAYOUT)
return 0;
if (flags & FL_DELEG)
/* We leave these checks to the caller */
return 0;
if (arg == F_RDLCK)
return inode_is_open_for_write(inode) ? -EAGAIN : 0;
else if (arg != F_WRLCK)
return 0;
/*
* Make sure that only read/write count is from lease requestor.
* Note that this will result in denying write leases when i_writecount
* is negative, which is what we want. (We shouldn't grant write leases
* on files open for execution.)
*/
if (filp->f_mode & FMODE_WRITE)
self_wcount = 1;
else if (filp->f_mode & FMODE_READ)
self_rcount = 1;
if (atomic_read(&inode->i_writecount) != self_wcount ||
atomic_read(&inode->i_readcount) != self_rcount)
return -EAGAIN;
return 0;
}
static int
generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
{
struct file_lock *fl, *my_fl = NULL, *lease;
struct inode *inode = locks_inode(filp);
struct file_lock_context *ctx;
bool is_deleg = (*flp)->fl_flags & FL_DELEG;
int error;
LIST_HEAD(dispose);
lease = *flp;
trace_generic_add_lease(inode, lease);
/* Note that arg is never F_UNLCK here */
ctx = locks_get_lock_context(inode, arg);
if (!ctx)
return -ENOMEM;
/*
* In the delegation case we need mutual exclusion with
* a number of operations that take the i_mutex. We trylock
* because delegations are an optional optimization, and if
* there's some chance of a conflict--we'd rather not
* bother, maybe that's a sign this just isn't a good file to
* hand out a delegation on.
*/
if (is_deleg && !inode_trylock(inode))
return -EAGAIN;
if (is_deleg && arg == F_WRLCK) {
/* Write delegations are not currently supported: */
inode_unlock(inode);
WARN_ON_ONCE(1);
return -EINVAL;
}
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
error = check_conflicting_open(filp, arg, lease->fl_flags);
if (error)
goto out;
/*
* At this point, we know that if there is an exclusive
* lease on this file, then we hold it on this filp
* (otherwise our open of this file would have blocked).
* And if we are trying to acquire an exclusive lease,
* then the file is not open by anyone (including us)
* except for this filp.
*/
error = -EAGAIN;
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (fl->fl_file == filp &&
fl->fl_owner == lease->fl_owner) {
my_fl = fl;
continue;
}
/*
* No exclusive leases if someone else has a lease on
* this file:
*/
if (arg == F_WRLCK)
goto out;
/*
* Modifying our existing lease is OK, but no getting a
* new lease if someone else is opening for write:
*/
if (fl->fl_flags & FL_UNLOCK_PENDING)
goto out;
}
if (my_fl != NULL) {
lease = my_fl;
error = lease->fl_lmops->lm_change(lease, arg, &dispose);
if (error)
goto out;
goto out_setup;
}
error = -EINVAL;
if (!leases_enable)
goto out;
locks_insert_lock_ctx(lease, &ctx->flc_lease);
/*
* The check in break_lease() is lockless. It's possible for another
* open to race in after we did the earlier check for a conflicting
* open but before the lease was inserted. Check again for a
* conflicting open and cancel the lease if there is one.
*
* We also add a barrier here to ensure that the insertion of the lock
* precedes these checks.
*/
smp_mb();
error = check_conflicting_open(filp, arg, lease->fl_flags);
if (error) {
locks_unlink_lock_ctx(lease);
goto out;
}
out_setup:
if (lease->fl_lmops->lm_setup)
lease->fl_lmops->lm_setup(lease, priv);
out:
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
if (is_deleg)
inode_unlock(inode);
if (!error && !my_fl)
*flp = NULL;
return error;
}
static int generic_delete_lease(struct file *filp, void *owner)
{
int error = -EAGAIN;
struct file_lock *fl, *victim = NULL;
struct inode *inode = locks_inode(filp);
struct file_lock_context *ctx;
LIST_HEAD(dispose);
ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx) {
trace_generic_delete_lease(inode, NULL);
return error;
}
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (fl->fl_file == filp &&
fl->fl_owner == owner) {
victim = fl;
break;
}
}
trace_generic_delete_lease(inode, victim);
if (victim)
error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
return error;
}
/**
* generic_setlease - sets a lease on an open file
* @filp: file pointer
* @arg: type of lease to obtain
* @flp: input - file_lock to use, output - file_lock inserted
* @priv: private data for lm_setup (may be NULL if lm_setup
* doesn't require it)
*
* The (input) flp->fl_lmops->lm_break function is required
* by break_lease().
*/
int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
void **priv)
{
struct inode *inode = locks_inode(filp);
int error;
if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
return -EACCES;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
error = security_file_lock(filp, arg);
if (error)
return error;
switch (arg) {
case F_UNLCK:
return generic_delete_lease(filp, *priv);
case F_RDLCK:
case F_WRLCK:
if (!(*flp)->fl_lmops->lm_break) {
WARN_ON_ONCE(1);
return -ENOLCK;
}
return generic_add_lease(filp, arg, flp, priv);
default:
return -EINVAL;
}
}
EXPORT_SYMBOL(generic_setlease);
#if IS_ENABLED(CONFIG_SRCU)
/*
* Kernel subsystems can register to be notified on any attempt to set
* a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd
* to close files that it may have cached when there is an attempt to set a
* conflicting lease.
*/
static struct srcu_notifier_head lease_notifier_chain;
static inline void
lease_notifier_chain_init(void)
{
srcu_init_notifier_head(&lease_notifier_chain);
}
static inline void
setlease_notifier(long arg, struct file_lock *lease)
{
if (arg != F_UNLCK)
srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
}
int lease_register_notifier(struct notifier_block *nb)
{
return srcu_notifier_chain_register(&lease_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(lease_register_notifier);
void lease_unregister_notifier(struct notifier_block *nb)
{
srcu_notifier_chain_unregister(&lease_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(lease_unregister_notifier);
#else /* !IS_ENABLED(CONFIG_SRCU) */
static inline void
lease_notifier_chain_init(void)
{
}
static inline void
setlease_notifier(long arg, struct file_lock *lease)
{
}
int lease_register_notifier(struct notifier_block *nb)
{
return 0;
}
EXPORT_SYMBOL_GPL(lease_register_notifier);
void lease_unregister_notifier(struct notifier_block *nb)
{
}
EXPORT_SYMBOL_GPL(lease_unregister_notifier);
#endif /* IS_ENABLED(CONFIG_SRCU) */
/**
* vfs_setlease - sets a lease on an open file
* @filp: file pointer
* @arg: type of lease to obtain
* @lease: file_lock to use when adding a lease
* @priv: private info for lm_setup when adding a lease (may be
* NULL if lm_setup doesn't require it)
*
* Call this to establish a lease on the file. The "lease" argument is not
* used for F_UNLCK requests and may be NULL. For commands that set or alter
* an existing lease, the ``(*lease)->fl_lmops->lm_break`` operation must be
* set; if not, this function will return -ENOLCK (and generate a scary-looking
* stack trace).
*
* The "priv" pointer is passed directly to the lm_setup function as-is. It
* may be NULL if the lm_setup operation doesn't require it.
*/
int
vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
{
if (lease)
setlease_notifier(arg, *lease);
if (filp->f_op->setlease)
return filp->f_op->setlease(filp, arg, lease, priv);
else
return generic_setlease(filp, arg, lease, priv);
}
EXPORT_SYMBOL_GPL(vfs_setlease);
static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
{
struct file_lock *fl;
struct fasync_struct *new;
int error;
fl = lease_alloc(filp, arg);
if (IS_ERR(fl))
return PTR_ERR(fl);
new = fasync_alloc();
if (!new) {
locks_free_lock(fl);
return -ENOMEM;
}
new->fa_fd = fd;
error = vfs_setlease(filp, arg, &fl, (void **)&new);
if (fl)
locks_free_lock(fl);
if (new)
fasync_free(new);
return error;
}
/**
* fcntl_setlease - sets a lease on an open file
* @fd: open file descriptor
* @filp: file pointer
* @arg: type of lease to obtain
*
* Call this fcntl to establish a lease on the file.
* Note that you also need to call %F_SETSIG to
* receive a signal when the lease is broken.
*/
int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
{
if (arg == F_UNLCK)
return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
return do_fcntl_add_lease(fd, filp, arg);
}
/**
* flock_lock_inode_wait - Apply a FLOCK-style lock to a file
* @inode: inode of the file to apply to
* @fl: The lock to be applied
*
* Apply a FLOCK style lock request to an inode.
*/
static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int error;
might_sleep();
for (;;) {
error = flock_lock_inode(inode, fl);
if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl->fl_wait,
list_empty(&fl->fl_blocked_member));
if (error)
break;
}
locks_delete_block(fl);
return error;
}
/**
* locks_lock_inode_wait - Apply a lock to an inode
* @inode: inode of the file to apply to
* @fl: The lock to be applied
*
* Apply a POSIX or FLOCK style lock request to an inode.
*/
int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int res = 0;
switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
case FL_POSIX:
res = posix_lock_inode_wait(inode, fl);
break;
case FL_FLOCK:
res = flock_lock_inode_wait(inode, fl);
break;
default:
BUG();
}
return res;
}
EXPORT_SYMBOL(locks_lock_inode_wait);
/**
* sys_flock: - flock() system call.
* @fd: the file descriptor to lock.
* @cmd: the type of lock to apply.
*
* Apply a %FL_FLOCK style lock to an open file descriptor.
* The @cmd can be one of:
*
* - %LOCK_SH -- a shared lock.
* - %LOCK_EX -- an exclusive lock.
* - %LOCK_UN -- remove an existing lock.
* - %LOCK_MAND -- a 'mandatory' flock.
* This exists to emulate Windows Share Modes.
*
* %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
* processes read and write access respectively.
*/
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
struct fd f = fdget(fd);
struct file_lock *lock;
int can_sleep, unlock;
int error;
error = -EBADF;
if (!f.file)
goto out;
can_sleep = !(cmd & LOCK_NB);
cmd &= ~LOCK_NB;
unlock = (cmd == LOCK_UN);
if (!unlock && !(cmd & LOCK_MAND) &&
!(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
goto out_putf;
lock = flock_make_lock(f.file, cmd, NULL);
if (IS_ERR(lock)) {
error = PTR_ERR(lock);
goto out_putf;
}
if (can_sleep)
lock->fl_flags |= FL_SLEEP;
error = security_file_lock(f.file, lock->fl_type);
if (error)
goto out_free;
if (f.file->f_op->flock)
error = f.file->f_op->flock(f.file,
(can_sleep) ? F_SETLKW : F_SETLK,
lock);
else
error = locks_lock_file_wait(f.file, lock);
out_free:
locks_free_lock(lock);
out_putf:
fdput(f);
out:
return error;
}
/**
* vfs_test_lock - test file byte range lock
* @filp: The file to test lock for
* @fl: The lock to test; also used to hold result
*
* Returns -ERRNO on failure. Indicates presence of conflicting lock by
* setting conf->fl_type to something other than F_UNLCK.
*/
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
if (filp->f_op->lock)
return filp->f_op->lock(filp, F_GETLK, fl);
posix_test_lock(filp, fl);
return 0;
}
EXPORT_SYMBOL_GPL(vfs_test_lock);
/**
* locks_translate_pid - translate a file_lock's fl_pid number into a namespace
* @fl: The file_lock who's fl_pid should be translated
* @ns: The namespace into which the pid should be translated
*
* Used to tranlate a fl_pid into a namespace virtual pid number
*/
static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns)
{
pid_t vnr;
struct pid *pid;
if (IS_OFDLCK(fl))
return -1;
if (IS_REMOTELCK(fl))
return fl->fl_pid;
/*
* If the flock owner process is dead and its pid has been already
* freed, the translation below won't work, but we still want to show
* flock owner pid number in init pidns.
*/
if (ns == &init_pid_ns)
return (pid_t)fl->fl_pid;
rcu_read_lock();
pid = find_pid_ns(fl->fl_pid, &init_pid_ns);
vnr = pid_nr_ns(pid, ns);
rcu_read_unlock();
return vnr;
}
static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
{
flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current));
#if BITS_PER_LONG == 32
/*
* Make sure we can represent the posix lock via
* legacy 32bit flock.
*/
if (fl->fl_start > OFFT_OFFSET_MAX)
return -EOVERFLOW;
if (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX)
return -EOVERFLOW;
#endif
flock->l_start = fl->fl_start;
flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
fl->fl_end - fl->fl_start + 1;
flock->l_whence = 0;
flock->l_type = fl->fl_type;
return 0;
}
#if BITS_PER_LONG == 32
static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
{
flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current));
flock->l_start = fl->fl_start;
flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
fl->fl_end - fl->fl_start + 1;
flock->l_whence = 0;
flock->l_type = fl->fl_type;
}
#endif
/* Report the first existing lock that would conflict with l.
* This implements the F_GETLK command of fcntl().
*/
int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
{
struct file_lock *fl;
int error;
fl = locks_alloc_lock();
if (fl == NULL)
return -ENOMEM;
error = -EINVAL;
if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
goto out;
error = flock_to_posix_lock(filp, fl, flock);
if (error)
goto out;
if (cmd == F_OFD_GETLK) {
error = -EINVAL;
if (flock->l_pid != 0)
goto out;
fl->fl_flags |= FL_OFDLCK;
fl->fl_owner = filp;
}
error = vfs_test_lock(filp, fl);
if (error)
goto out;
flock->l_type = fl->fl_type;
if (fl->fl_type != F_UNLCK) {
error = posix_lock_to_flock(flock, fl);
if (error)
goto out;
}
out:
locks_free_lock(fl);
return error;
}
/**
* vfs_lock_file - file byte range lock
* @filp: The file to apply the lock to
* @cmd: type of locking operation (F_SETLK, F_GETLK, etc.)
* @fl: The lock to be applied
* @conf: Place to return a copy of the conflicting lock, if found.
*
* A caller that doesn't care about the conflicting lock may pass NULL
* as the final argument.
*
* If the filesystem defines a private ->lock() method, then @conf will
* be left unchanged; so a caller that cares should initialize it to
* some acceptable default.
*
* To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
* locks, the ->lock() interface may return asynchronously, before the lock has
* been granted or denied by the underlying filesystem, if (and only if)
* lm_grant is set. Callers expecting ->lock() to return asynchronously
* will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
* the request is for a blocking lock. When ->lock() does return asynchronously,
* it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock
* request completes.
* If the request is for non-blocking lock the file system should return
* FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
* with the result. If the request timed out the callback routine will return a
* nonzero return code and the file system should release the lock. The file
* system is also responsible to keep a corresponding posix lock when it
* grants a lock so the VFS can find out which locks are locally held and do
* the correct lock cleanup when required.
* The underlying filesystem must not drop the kernel lock or call
* ->lm_grant() before returning to the caller with a FILE_LOCK_DEFERRED
* return code.
*/
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
{
if (filp->f_op->lock)
return filp->f_op->lock(filp, cmd, fl);
else
return posix_lock_file(filp, fl, conf);
}
EXPORT_SYMBOL_GPL(vfs_lock_file);
static int do_lock_file_wait(struct file *filp, unsigned int cmd,
struct file_lock *fl)
{
int error;
error = security_file_lock(filp, fl->fl_type);
if (error)
return error;
for (;;) {
error = vfs_lock_file(filp, cmd, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl->fl_wait,
list_empty(&fl->fl_blocked_member));
if (error)
break;
}
locks_delete_block(fl);
return error;
}
/* Ensure that fl->fl_file has compatible f_mode for F_SETLK calls */
static int
check_fmode_for_setlk(struct file_lock *fl)
{
switch (fl->fl_type) {
case F_RDLCK:
if (!(fl->fl_file->f_mode & FMODE_READ))
return -EBADF;
break;
case F_WRLCK:
if (!(fl->fl_file->f_mode & FMODE_WRITE))
return -EBADF;
}
return 0;
}
/* Apply the lock described by l to an open file descriptor.
* This implements both the F_SETLK and F_SETLKW commands of fcntl().
*/
int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
struct flock *flock)
{
struct file_lock *file_lock = locks_alloc_lock();
struct inode *inode = locks_inode(filp);
struct file *f;
int error;
if (file_lock == NULL)
return -ENOLCK;
error = flock_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
error = check_fmode_for_setlk(file_lock);
if (error)
goto out;
/*
* If the cmd is requesting file-private locks, then set the
* FL_OFDLCK flag and override the owner.
*/
switch (cmd) {
case F_OFD_SETLK:
error = -EINVAL;
if (flock->l_pid != 0)
goto out;
cmd = F_SETLK;
file_lock->fl_flags |= FL_OFDLCK;
file_lock->fl_owner = filp;
break;
case F_OFD_SETLKW:
error = -EINVAL;
if (flock->l_pid != 0)
goto out;
cmd = F_SETLKW;
file_lock->fl_flags |= FL_OFDLCK;
file_lock->fl_owner = filp;
fallthrough;
case F_SETLKW:
file_lock->fl_flags |= FL_SLEEP;
}
error = do_lock_file_wait(filp, cmd, file_lock);
/*
* Attempt to detect a close/fcntl race and recover by releasing the
* lock that was just acquired. There is no need to do that when we're
* unlocking though, or for OFD locks.
*/
if (!error && file_lock->fl_type != F_UNLCK &&
!(file_lock->fl_flags & FL_OFDLCK)) {
struct files_struct *files = current->files;
/*
* We need that spin_lock here - it prevents reordering between
* update of i_flctx->flc_posix and check for it done in
* close(). rcu_read_lock() wouldn't do.
*/
spin_lock(&files->file_lock);
f = files_lookup_fd_locked(files, fd);
spin_unlock(&files->file_lock);
if (f != filp) {
file_lock->fl_type = F_UNLCK;
error = do_lock_file_wait(filp, cmd, file_lock);
WARN_ON_ONCE(error);
error = -EBADF;
}
}
out:
trace_fcntl_setlk(inode, file_lock, error);
locks_free_lock(file_lock);
return error;
}
#if BITS_PER_LONG == 32
/* Report the first existing lock that would conflict with l.
* This implements the F_GETLK command of fcntl().
*/
int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
{
struct file_lock *fl;
int error;
fl = locks_alloc_lock();
if (fl == NULL)
return -ENOMEM;
error = -EINVAL;
if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
goto out;
error = flock64_to_posix_lock(filp, fl, flock);
if (error)
goto out;
if (cmd == F_OFD_GETLK) {
error = -EINVAL;
if (flock->l_pid != 0)
goto out;
cmd = F_GETLK64;
fl->fl_flags |= FL_OFDLCK;
fl->fl_owner = filp;
}
error = vfs_test_lock(filp, fl);
if (error)
goto out;
flock->l_type = fl->fl_type;
if (fl->fl_type != F_UNLCK)
posix_lock_to_flock64(flock, fl);
out:
locks_free_lock(fl);
return error;
}
/* Apply the lock described by l to an open file descriptor.
* This implements both the F_SETLK and F_SETLKW commands of fcntl().
*/
int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
struct flock64 *flock)
{
struct file_lock *file_lock = locks_alloc_lock();
struct file *f;
int error;
if (file_lock == NULL)
return -ENOLCK;
error = flock64_to_posix_lock(filp, file_lock, flock);
if (error)
goto out;
error = check_fmode_for_setlk(file_lock);
if (error)
goto out;
/*
* If the cmd is requesting file-private locks, then set the
* FL_OFDLCK flag and override the owner.
*/
switch (cmd) {
case F_OFD_SETLK:
error = -EINVAL;
if (flock->l_pid != 0)
goto out;
cmd = F_SETLK64;
file_lock->fl_flags |= FL_OFDLCK;
file_lock->fl_owner = filp;
break;
case F_OFD_SETLKW:
error = -EINVAL;
if (flock->l_pid != 0)
goto out;
cmd = F_SETLKW64;
file_lock->fl_flags |= FL_OFDLCK;
file_lock->fl_owner = filp;
fallthrough;
case F_SETLKW64:
file_lock->fl_flags |= FL_SLEEP;
}
error = do_lock_file_wait(filp, cmd, file_lock);
/*
* Attempt to detect a close/fcntl race and recover by releasing the
* lock that was just acquired. There is no need to do that when we're
* unlocking though, or for OFD locks.
*/
if (!error && file_lock->fl_type != F_UNLCK &&
!(file_lock->fl_flags & FL_OFDLCK)) {
struct files_struct *files = current->files;
/*
* We need that spin_lock here - it prevents reordering between
* update of i_flctx->flc_posix and check for it done in
* close(). rcu_read_lock() wouldn't do.
*/
spin_lock(&files->file_lock);
f = files_lookup_fd_locked(files, fd);
spin_unlock(&files->file_lock);
if (f != filp) {
file_lock->fl_type = F_UNLCK;
error = do_lock_file_wait(filp, cmd, file_lock);
WARN_ON_ONCE(error);
error = -EBADF;
}
}
out:
locks_free_lock(file_lock);
return error;
}
#endif /* BITS_PER_LONG == 32 */
/*
* This function is called when the file is being removed
* from the task's fd array. POSIX locks belonging to this task
* are deleted at this time.
*/
void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
int error;
struct inode *inode = locks_inode(filp);
struct file_lock lock;
struct file_lock_context *ctx;
/*
* If there are no locks held on this file, we don't need to call
* posix_lock_file(). Another process could be setting a lock on this
* file at the same time, but we wouldn't remove that lock anyway.
*/
ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty(&ctx->flc_posix)) return; locks_init_lock(&lock);
lock.fl_type = F_UNLCK;
lock.fl_flags = FL_POSIX | FL_CLOSE;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
lock.fl_owner = owner;
lock.fl_pid = current->tgid;
lock.fl_file = filp;
lock.fl_ops = NULL;
lock.fl_lmops = NULL;
error = vfs_lock_file(filp, F_SETLK, &lock, NULL);
if (lock.fl_ops && lock.fl_ops->fl_release_private) lock.fl_ops->fl_release_private(&lock);
trace_locks_remove_posix(inode, &lock, error);
}
EXPORT_SYMBOL(locks_remove_posix);
/* The i_flctx must be valid when calling into here */
static void
locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
{
struct file_lock fl;
struct inode *inode = locks_inode(filp); if (list_empty(&flctx->flc_flock)) return;
flock_make_lock(filp, LOCK_UN, &fl);
fl.fl_flags |= FL_CLOSE;
if (filp->f_op->flock)
filp->f_op->flock(filp, F_SETLKW, &fl);
else
flock_lock_inode(inode, &fl); if (fl.fl_ops && fl.fl_ops->fl_release_private) fl.fl_ops->fl_release_private(&fl);
}
/* The i_flctx must be valid when calling into here */
static void
locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
{
struct file_lock *fl, *tmp;
LIST_HEAD(dispose);
if (list_empty(&ctx->flc_lease))
return;
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) if (filp == fl->fl_file) lease_modify(fl, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
}
/*
* This function is called on the last close of an open file.
*/
void locks_remove_file(struct file *filp)
{
struct file_lock_context *ctx;
ctx = smp_load_acquire(&locks_inode(filp)->i_flctx);
if (!ctx)
return;
/* remove any OFD locks */
locks_remove_posix(filp, filp);
/* remove flock locks */
locks_remove_flock(filp, ctx);
/* remove any leases */
locks_remove_lease(filp, ctx);
spin_lock(&ctx->flc_lock);
locks_check_ctx_file_list(filp, &ctx->flc_posix, "POSIX");
locks_check_ctx_file_list(filp, &ctx->flc_flock, "FLOCK");
locks_check_ctx_file_list(filp, &ctx->flc_lease, "LEASE");
spin_unlock(&ctx->flc_lock);
}
/**
* vfs_cancel_lock - file byte range unblock lock
* @filp: The file to apply the unblock to
* @fl: The lock to be unblocked
*
* Used by lock managers to cancel blocked requests
*/
int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
if (filp->f_op->lock)
return filp->f_op->lock(filp, F_CANCELLK, fl);
return 0;
}
EXPORT_SYMBOL_GPL(vfs_cancel_lock);
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
struct locks_iterator {
int li_cpu;
loff_t li_pos;
};
static void lock_get_status(struct seq_file *f, struct file_lock *fl,
loff_t id, char *pfx, int repeat)
{
struct inode *inode = NULL;
unsigned int fl_pid;
struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
fl_pid = locks_translate_pid(fl, proc_pidns);
/*
* If lock owner is dead (and pid is freed) or not visible in current
* pidns, zero is shown as a pid value. Check lock info from
* init_pid_ns to get saved lock pid value.
*/
if (fl->fl_file != NULL)
inode = locks_inode(fl->fl_file);
seq_printf(f, "%lld: ", id);
if (repeat)
seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx);
if (IS_POSIX(fl)) {
if (fl->fl_flags & FL_ACCESS)
seq_puts(f, "ACCESS");
else if (IS_OFDLCK(fl))
seq_puts(f, "OFDLCK");
else
seq_puts(f, "POSIX ");
seq_printf(f, " %s ",
(inode == NULL) ? "*NOINODE*" : "ADVISORY ");
} else if (IS_FLOCK(fl)) {
if (fl->fl_type & LOCK_MAND) {
seq_puts(f, "FLOCK MSNFS ");
} else {
seq_puts(f, "FLOCK ADVISORY ");
}
} else if (IS_LEASE(fl)) {
if (fl->fl_flags & FL_DELEG)
seq_puts(f, "DELEG ");
else
seq_puts(f, "LEASE ");
if (lease_breaking(fl))
seq_puts(f, "BREAKING ");
else if (fl->fl_file)
seq_puts(f, "ACTIVE ");
else
seq_puts(f, "BREAKER ");
} else {
seq_puts(f, "UNKNOWN UNKNOWN ");
}
if (fl->fl_type & LOCK_MAND) {
seq_printf(f, "%s ",
(fl->fl_type & LOCK_READ)
? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ "
: (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
} else {
int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
(type == F_RDLCK) ? "READ" : "UNLCK");
}
if (inode) {
/* userspace relies on this representation of dev_t */
seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
MAJOR(inode->i_sb->s_dev),
MINOR(inode->i_sb->s_dev), inode->i_ino);
} else {
seq_printf(f, "%d <none>:0 ", fl_pid);
}
if (IS_POSIX(fl)) {
if (fl->fl_end == OFFSET_MAX)
seq_printf(f, "%Ld EOF\n", fl->fl_start);
else
seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end);
} else {
seq_puts(f, "0 EOF\n");
}
}
static struct file_lock *get_next_blocked_member(struct file_lock *node)
{
struct file_lock *tmp;
/* NULL node or root node */
if (node == NULL || node->fl_blocker == NULL)
return NULL;
/* Next member in the linked list could be itself */
tmp = list_next_entry(node, fl_blocked_member);
if (list_entry_is_head(tmp, &node->fl_blocker->fl_blocked_requests, fl_blocked_member)
|| tmp == node) {
return NULL;
}
return tmp;
}
static int locks_show(struct seq_file *f, void *v)
{
struct locks_iterator *iter = f->private;
struct file_lock *cur, *tmp;
struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
int level = 0;
cur = hlist_entry(v, struct file_lock, fl_link);
if (locks_translate_pid(cur, proc_pidns) == 0)
return 0;
/* View this crossed linked list as a binary tree, the first member of fl_blocked_requests
* is the left child of current node, the next silibing in fl_blocked_member is the
* right child, we can alse get the parent of current node from fl_blocker, so this
* question becomes traversal of a binary tree
*/
while (cur != NULL) {
if (level)
lock_get_status(f, cur, iter->li_pos, "-> ", level);
else
lock_get_status(f, cur, iter->li_pos, "", level);
if (!list_empty(&cur->fl_blocked_requests)) {
/* Turn left */
cur = list_first_entry_or_null(&cur->fl_blocked_requests,
struct file_lock, fl_blocked_member);
level++;
} else {
/* Turn right */
tmp = get_next_blocked_member(cur);
/* Fall back to parent node */
while (tmp == NULL && cur->fl_blocker != NULL) {
cur = cur->fl_blocker;
level--;
tmp = get_next_blocked_member(cur);
}
cur = tmp;
}
}
return 0;
}
static void __show_fd_locks(struct seq_file *f,
struct list_head *head, int *id,
struct file *filp, struct files_struct *files)
{
struct file_lock *fl;
list_for_each_entry(fl, head, fl_list) {
if (filp != fl->fl_file)
continue;
if (fl->fl_owner != files &&
fl->fl_owner != filp)
continue;
(*id)++;
seq_puts(f, "lock:\t");
lock_get_status(f, fl, *id, "", 0);
}
}
void show_fd_locks(struct seq_file *f,
struct file *filp, struct files_struct *files)
{
struct inode *inode = locks_inode(filp);
struct file_lock_context *ctx;
int id = 0;
ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx)
return;
spin_lock(&ctx->flc_lock);
__show_fd_locks(f, &ctx->flc_flock, &id, filp, files);
__show_fd_locks(f, &ctx->flc_posix, &id, filp, files);
__show_fd_locks(f, &ctx->flc_lease, &id, filp, files);
spin_unlock(&ctx->flc_lock);
}
static void *locks_start(struct seq_file *f, loff_t *pos)
__acquires(&blocked_lock_lock)
{
struct locks_iterator *iter = f->private;
iter->li_pos = *pos + 1;
percpu_down_write(&file_rwsem);
spin_lock(&blocked_lock_lock);
return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos);
}
static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
{
struct locks_iterator *iter = f->private;
++iter->li_pos;
return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos);
}
static void locks_stop(struct seq_file *f, void *v)
__releases(&blocked_lock_lock)
{
spin_unlock(&blocked_lock_lock);
percpu_up_write(&file_rwsem);
}
static const struct seq_operations locks_seq_operations = {
.start = locks_start,
.next = locks_next,
.stop = locks_stop,
.show = locks_show,
};
static int __init proc_locks_init(void)
{
proc_create_seq_private("locks", 0, NULL, &locks_seq_operations,
sizeof(struct locks_iterator), NULL);
return 0;
}
fs_initcall(proc_locks_init);
#endif
static int __init filelock_init(void)
{
int i;
flctx_cache = kmem_cache_create("file_lock_ctx",
sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
for_each_possible_cpu(i) {
struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
spin_lock_init(&fll->lock);
INIT_HLIST_HEAD(&fll->hlist);
}
lease_notifier_chain_init();
return 0;
}
core_initcall(filelock_init);
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* fs/inotify_user.c - inotify support for userspace
*
* Authors:
* John McCutchan <ttb@tentacle.dhs.org>
* Robert Love <rml@novell.com>
*
* Copyright (C) 2005 John McCutchan
* Copyright 2006 Hewlett-Packard Development Company, L.P.
*
* Copyright (C) 2009 Eric Paris <Red Hat Inc>
* inotify was largely rewriten to make use of the fsnotify infrastructure
*/
#include <linux/dcache.h> /* d_unlinked */
#include <linux/fs.h> /* struct inode */
#include <linux/fsnotify_backend.h>
#include <linux/inotify.h>
#include <linux/path.h> /* struct path */
#include <linux/slab.h> /* kmem_* */
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/user.h>
#include <linux/sched/mm.h>
#include "inotify.h"
/*
* Check if 2 events contain the same information.
*/
static bool event_compare(struct fsnotify_event *old_fsn,
struct fsnotify_event *new_fsn)
{
struct inotify_event_info *old, *new;
old = INOTIFY_E(old_fsn);
new = INOTIFY_E(new_fsn);
if (old->mask & FS_IN_IGNORED)
return false;
if ((old->mask == new->mask) &&
(old->wd == new->wd) &&
(old->name_len == new->name_len) && (!old->name_len || !strcmp(old->name, new->name)))
return true;
return false;
}
static int inotify_merge(struct fsnotify_group *group,
struct fsnotify_event *event)
{
struct list_head *list = &group->notification_list;
struct fsnotify_event *last_event;
last_event = list_entry(list->prev, struct fsnotify_event, list); return event_compare(last_event, event);
}
int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
struct inode *inode, struct inode *dir,
const struct qstr *name, u32 cookie)
{
struct inotify_inode_mark *i_mark;
struct inotify_event_info *event;
struct fsnotify_event *fsn_event;
struct fsnotify_group *group = inode_mark->group;
int ret;
int len = 0;
int alloc_len = sizeof(struct inotify_event_info);
struct mem_cgroup *old_memcg;
if (name) {
len = name->len;
alloc_len += len + 1;
}
pr_debug("%s: group=%p mark=%p mask=%x\n", __func__, group, inode_mark,
mask);
i_mark = container_of(inode_mark, struct inotify_inode_mark,
fsn_mark);
/*
* Whoever is interested in the event, pays for the allocation. Do not
* trigger OOM killer in the target monitoring memcg as it may have
* security repercussion.
*/
old_memcg = set_active_memcg(group->memcg);
event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
set_active_memcg(old_memcg);
if (unlikely(!event)) {
/*
* Treat lost event due to ENOMEM the same way as queue
* overflow to let userspace know event was lost.
*/
fsnotify_queue_overflow(group);
return -ENOMEM;
}
/*
* We now report FS_ISDIR flag with MOVE_SELF and DELETE_SELF events
* for fanotify. inotify never reported IN_ISDIR with those events.
* It looks like an oversight, but to avoid the risk of breaking
* existing inotify programs, mask the flag out from those events.
*/
if (mask & (IN_MOVE_SELF | IN_DELETE_SELF)) mask &= ~IN_ISDIR; fsn_event = &event->fse;
fsnotify_init_event(fsn_event);
event->mask = mask;
event->wd = i_mark->wd;
event->sync_cookie = cookie;
event->name_len = len;
if (len)
strcpy(event->name, name->name); ret = fsnotify_add_event(group, fsn_event, inotify_merge, NULL);
if (ret) {
/* Our event wasn't used in the end. Free it. */
fsnotify_destroy_event(group, fsn_event);
}
if (inode_mark->mask & IN_ONESHOT) fsnotify_destroy_mark(inode_mark, group);
return 0;
}
static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
{
inotify_ignored_and_remove_idr(fsn_mark, group);
}
/*
* This is NEVER supposed to be called. Inotify marks should either have been
* removed from the idr when the watch was removed or in the
* fsnotify_destroy_mark_by_group() call when the inotify instance was being
* torn down. This is only called if the idr is about to be freed but there
* are still marks in it.
*/
static int idr_callback(int id, void *p, void *data)
{
struct fsnotify_mark *fsn_mark;
struct inotify_inode_mark *i_mark;
static bool warned = false;
if (warned)
return 0;
warned = true;
fsn_mark = p;
i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in "
"idr. Probably leaking memory\n", id, p, data);
/*
* I'm taking the liberty of assuming that the mark in question is a
* valid address and I'm dereferencing it. This might help to figure
* out why we got here and the panic is no worse than the original
* BUG() that was here.
*/
if (fsn_mark)
printk(KERN_WARNING "fsn_mark->group=%p wd=%d\n",
fsn_mark->group, i_mark->wd);
return 0;
}
static void inotify_free_group_priv(struct fsnotify_group *group)
{
/* ideally the idr is empty and we won't hit the BUG in the callback */
idr_for_each(&group->inotify_data.idr, idr_callback, group);
idr_destroy(&group->inotify_data.idr);
if (group->inotify_data.ucounts)
dec_inotify_instances(group->inotify_data.ucounts);
}
static void inotify_free_event(struct fsnotify_event *fsn_event)
{
kfree(INOTIFY_E(fsn_event));
}
/* ding dong the mark is dead */
static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
{
struct inotify_inode_mark *i_mark;
i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
kmem_cache_free(inotify_inode_mark_cachep, i_mark);
}
const struct fsnotify_ops inotify_fsnotify_ops = {
.handle_inode_event = inotify_handle_inode_event,
.free_group_priv = inotify_free_group_priv,
.free_event = inotify_free_event,
.freeing_mark = inotify_freeing_mark,
.free_mark = inotify_free_mark,
};
// SPDX-License-Identifier: GPL-2.0
/*
* blk-mq scheduling framework
*
* Copyright (C) 2016 Jens Axboe
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/list_sort.h>
#include <trace/events/block.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"
void blk_mq_sched_assign_ioc(struct request *rq)
{
struct request_queue *q = rq->q;
struct io_context *ioc;
struct io_cq *icq;
/*
* May not have an IO context if it's a passthrough request
*/
ioc = current->io_context;
if (!ioc)
return;
spin_lock_irq(&q->queue_lock);
icq = ioc_lookup_icq(ioc, q);
spin_unlock_irq(&q->queue_lock);
if (!icq) {
icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
if (!icq)
return;
}
get_io_context(icq->ioc);
rq->elv.icq = icq;
}
/*
* Mark a hardware queue as needing a restart. For shared queues, maintain
* a count of how many hardware queues are marked for restart.
*/
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
return;
set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
return;
clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
/*
* Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch)
* in blk_mq_run_hw_queue(). Its pair is the barrier in
* blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART,
* meantime new request added to hctx->dispatch is missed to check in
* blk_mq_run_hw_queue().
*/
smp_mb();
blk_mq_run_hw_queue(hctx, true);
}
static int sched_rq_cmp(void *priv, const struct list_head *a,
const struct list_head *b)
{
struct request *rqa = container_of(a, struct request, queuelist);
struct request *rqb = container_of(b, struct request, queuelist);
return rqa->mq_hctx > rqb->mq_hctx;
}
static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
{
struct blk_mq_hw_ctx *hctx =
list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
struct request *rq;
LIST_HEAD(hctx_list);
unsigned int count = 0;
list_for_each_entry(rq, rq_list, queuelist) {
if (rq->mq_hctx != hctx) {
list_cut_before(&hctx_list, rq_list, &rq->queuelist);
goto dispatch;
}
count++;
}
list_splice_tail_init(rq_list, &hctx_list);
dispatch:
return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
}
#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
/*
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
*
* Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
* be run again. This is necessary to avoid starving flushes.
*/
static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
bool multi_hctxs = false, run_queue = false;
bool dispatched = false, busy = false;
unsigned int max_dispatch;
LIST_HEAD(rq_list);
int count = 0;
if (hctx->dispatch_busy)
max_dispatch = 1;
else
max_dispatch = hctx->queue->nr_requests;
do {
struct request *rq;
int budget_token;
if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
break;
if (!list_empty_careful(&hctx->dispatch)) {
busy = true;
break;
}
budget_token = blk_mq_get_dispatch_budget(q);
if (budget_token < 0)
break;
rq = e->type->ops.dispatch_request(hctx);
if (!rq) {
blk_mq_put_dispatch_budget(q, budget_token);
/*
* We're releasing without dispatching. Holding the
* budget could have blocked any "hctx"s with the
* same queue and if we didn't dispatch then there's
* no guarantee anyone will kick the queue. Kick it
* ourselves.
*/
run_queue = true;
break;
}
blk_mq_set_rq_budget_token(rq, budget_token);
/*
* Now this rq owns the budget which has to be released
* if this rq won't be queued to driver via .queue_rq()
* in blk_mq_dispatch_rq_list().
*/
list_add_tail(&rq->queuelist, &rq_list);
count++;
if (rq->mq_hctx != hctx)
multi_hctxs = true;
/*
* If we cannot get tag for the request, stop dequeueing
* requests from the IO scheduler. We are unlikely to be able
* to submit them anyway and it creates false impression for
* scheduling heuristics that the device can take more IO.
*/
if (!blk_mq_get_driver_tag(rq))
break;
} while (count < max_dispatch); if (!count) {
if (run_queue)
blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); } else if (multi_hctxs) {
/*
* Requests from different hctx may be dequeued from some
* schedulers, such as bfq and deadline.
*
* Sort the requests in the list according to their hctx,
* dispatch batching requests from same hctx at a time.
*/
list_sort(NULL, &rq_list, sched_rq_cmp);
do {
dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
} while (!list_empty(&rq_list));
} else {
dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
}
if (busy)
return -EAGAIN;
return !!dispatched;
}
static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
unsigned long end = jiffies + HZ;
int ret;
do {
ret = __blk_mq_do_dispatch_sched(hctx);
if (ret != 1)
break;
if (need_resched() || time_is_before_jiffies(end)) { blk_mq_delay_run_hw_queue(hctx, 0);
break;
}
} while (1);
return ret;
}
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
unsigned short idx = ctx->index_hw[hctx->type];
if (++idx == hctx->nr_ctx)
idx = 0;
return hctx->ctxs[idx];
}
/*
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
*
* Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
* be run again. This is necessary to avoid starving flushes.
*/
static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
LIST_HEAD(rq_list);
struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
int ret = 0;
struct request *rq;
do {
int budget_token;
if (!list_empty_careful(&hctx->dispatch)) {
ret = -EAGAIN;
break;
}
if (!sbitmap_any_bit_set(&hctx->ctx_map))
break;
budget_token = blk_mq_get_dispatch_budget(q);
if (budget_token < 0)
break;
rq = blk_mq_dequeue_from_ctx(hctx, ctx);
if (!rq) {
blk_mq_put_dispatch_budget(q, budget_token);
/*
* We're releasing without dispatching. Holding the
* budget could have blocked any "hctx"s with the
* same queue and if we didn't dispatch then there's
* no guarantee anyone will kick the queue. Kick it
* ourselves.
*/
blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
break;
}
blk_mq_set_rq_budget_token(rq, budget_token);
/*
* Now this rq owns the budget which has to be released
* if this rq won't be queued to driver via .queue_rq()
* in blk_mq_dispatch_rq_list().
*/
list_add(&rq->queuelist, &rq_list);
/* round robin for fair dispatch */
ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));
WRITE_ONCE(hctx->dispatch_from, ctx);
return ret;
}
static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
const bool has_sched = q->elevator;
int ret = 0;
LIST_HEAD(rq_list);
/*
* If we have previous entries on our dispatch list, grab them first for
* more fair dispatch.
*/
if (!list_empty_careful(&hctx->dispatch)) {
spin_lock(&hctx->lock);
if (!list_empty(&hctx->dispatch))
list_splice_init(&hctx->dispatch, &rq_list);
spin_unlock(&hctx->lock);
}
/*
* Only ask the scheduler for requests, if we didn't have residual
* requests from the dispatch list. This is to avoid the case where
* we only ever dispatch a fraction of the requests available because
* of low device queue depth. Once we pull requests out of the IO
* scheduler, we can no longer merge or sort them. So it's best to
* leave them there for as long as we can. Mark the hw queue as
* needing a restart in that case.
*
* We want to dispatch from the scheduler if there was nothing
* on the dispatch list or we were able to dispatch from the
* dispatch list.
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) {
if (has_sched)
ret = blk_mq_do_dispatch_sched(hctx);
else
ret = blk_mq_do_dispatch_ctx(hctx);
}
} else if (has_sched) { ret = blk_mq_do_dispatch_sched(hctx); } else if (hctx->dispatch_busy) {
/* dequeue request one by one from sw queue if queue is busy */
ret = blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
}
return ret;
}
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
/* RCU or SRCU read lock is needed before checking quiesced flag */
if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
return;
hctx->run++;
/*
* A return of -EAGAIN is an indication that hctx->dispatch is not
* empty and we must run again in order to avoid starving flushes.
*/
if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) blk_mq_run_hw_queue(hctx, true);
}
}
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
bool ret = false;
enum hctx_type type;
if (e && e->type->ops.bio_merge) return e->type->ops.bio_merge(q, bio, nr_segs);
ctx = blk_mq_get_ctx(q);
hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
type = hctx->type;
if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
list_empty_careful(&ctx->rq_lists[type]))
return false;
/* default per sw-queue merge */
spin_lock(&ctx->lock);
/*
* Reverse check our software queue for entries that we could
* potentially merge with. Currently includes a hand-wavy stop
* count of 8, to not spend too much time checking for merges.
*/
if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
ctx->rq_merged++;
ret = true;
}
spin_unlock(&ctx->lock);
return ret;
}
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
struct list_head *free)
{
return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
/*
* dispatch flush and passthrough rq directly
*
* passthrough request has to be added to hctx->dispatch directly.
* For some reason, device may be in one situation which can't
* handle FS request, so STS_RESOURCE is always returned and the
* FS request will be added to hctx->dispatch. However passthrough
* request may be required at that time for fixing the problem. If
* passthrough request is added to scheduler queue, there isn't any
* chance to dispatch it given we prioritize requests in hctx->dispatch.
*/
if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
return true;
return false;
}
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));
if (blk_mq_sched_bypass_insert(hctx, rq)) {
/*
* Firstly normal IO request is inserted to scheduler queue or
* sw queue, meantime we add flush request to dispatch queue(
* hctx->dispatch) directly and there is at most one in-flight
* flush request for each hw queue, so it doesn't matter to add
* flush request to tail or front of the dispatch queue.
*
* Secondly in case of NCQ, flush request belongs to non-NCQ
* command, and queueing it will fail when there is any
* in-flight normal IO request(NCQ command). When adding flush
* rq to the front of hctx->dispatch, it is easier to introduce
* extra time to flush rq's latency because of S_SCHED_RESTART
* compared with adding to the tail of dispatch queue, then
* chance of flush merge is increased, and less flush requests
* will be issued to controller. It is observed that ~10% time
* is saved in blktests block/004 on disk attached to AHCI/NCQ
* drive when adding flush rq to the front of hctx->dispatch.
*
* Simply queue flush rq to the front of hctx->dispatch so that
* intensive flush workloads can benefit in case of NCQ HW.
*/
at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head;
blk_mq_request_bypass_insert(rq, at_head, false);
goto run;
}
if (e) {
LIST_HEAD(list);
list_add(&rq->queuelist, &list);
e->type->ops.insert_requests(hctx, &list, at_head);
} else {
spin_lock(&ctx->lock);
__blk_mq_insert_request(hctx, rq, at_head);
spin_unlock(&ctx->lock);
}
run:
if (run_queue) blk_mq_run_hw_queue(hctx, async);}
void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx,
struct list_head *list, bool run_queue_async)
{
struct elevator_queue *e;
struct request_queue *q = hctx->queue;
/*
* blk_mq_sched_insert_requests() is called from flush plug
* context only, and hold one usage counter to prevent queue
* from being released.
*/
percpu_ref_get(&q->q_usage_counter);
e = hctx->queue->elevator;
if (e) {
e->type->ops.insert_requests(hctx, list, false);
} else {
/*
* try to issue requests directly if the hw queue isn't
* busy in case of 'none' scheduler, and this way may save
* us one extra enqueue & dequeue to sw queue.
*/
if (!hctx->dispatch_busy && !e && !run_queue_async) { blk_mq_try_issue_list_directly(hctx, list);
if (list_empty(list))
goto out;
}
blk_mq_insert_requests(hctx, ctx, list);
}
blk_mq_run_hw_queue(hctx, run_queue_async);
out:
percpu_ref_put(&q->q_usage_counter);
}
static int blk_mq_sched_alloc_tags(struct request_queue *q,
struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx)
{
struct blk_mq_tag_set *set = q->tag_set;
int ret;
hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
set->reserved_tags, set->flags);
if (!hctx->sched_tags)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
if (ret) {
blk_mq_free_rq_map(hctx->sched_tags, set->flags);
hctx->sched_tags = NULL;
}
return ret;
}
/* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags) {
blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
hctx->sched_tags = NULL;
}
}
}
static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
{
struct blk_mq_tag_set *set = queue->tag_set;
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
struct blk_mq_hw_ctx *hctx;
int ret, i;
/*
* Set initial depth at max so that we don't need to reallocate for
* updating nr_requests.
*/
ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
&queue->sched_breserved_tags,
MAX_SCHED_RQ, set->reserved_tags,
set->numa_node, alloc_policy);
if (ret)
return ret;
queue_for_each_hw_ctx(queue, hctx, i) {
hctx->sched_tags->bitmap_tags =
&queue->sched_bitmap_tags;
hctx->sched_tags->breserved_tags =
&queue->sched_breserved_tags;
}
sbitmap_queue_resize(&queue->sched_bitmap_tags,
queue->nr_requests - set->reserved_tags);
return 0;
}
static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
{
sbitmap_queue_free(&queue->sched_bitmap_tags);
sbitmap_queue_free(&queue->sched_breserved_tags);
}
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
struct blk_mq_hw_ctx *hctx;
struct elevator_queue *eq;
unsigned int i;
int ret;
if (!e) {
q->elevator = NULL;
q->nr_requests = q->tag_set->queue_depth;
return 0;
}
/*
* Default to double of smaller one between hw queue_depth and 128,
* since we don't split into sync/async like the old code did.
* Additionally, this is a per-hw queue depth.
*/
q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
BLKDEV_MAX_RQ);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_sched_alloc_tags(q, hctx, i);
if (ret)
goto err_free_tags;
}
if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
ret = blk_mq_init_sched_shared_sbitmap(q);
if (ret)
goto err_free_tags;
}
ret = e->ops.init_sched(q, e);
if (ret)
goto err_free_sbitmap;
blk_mq_debugfs_register_sched(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (e->ops.init_hctx) {
ret = e->ops.init_hctx(hctx, i);
if (ret) {
eq = q->elevator;
blk_mq_sched_free_requests(q);
blk_mq_exit_sched(q, eq);
kobject_put(&eq->kobj);
return ret;
}
}
blk_mq_debugfs_register_sched_hctx(q, hctx);
}
return 0;
err_free_sbitmap:
if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
blk_mq_exit_sched_shared_sbitmap(q);
err_free_tags:
blk_mq_sched_free_requests(q);
blk_mq_sched_tags_teardown(q);
q->elevator = NULL;
return ret;
}
/*
* called in either blk_queue_cleanup or elevator_switch, tagset
* is required for freeing requests
*/
void blk_mq_sched_free_requests(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags)
blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
}
}
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) {
blk_mq_debugfs_unregister_sched_hctx(hctx);
if (e->type->ops.exit_hctx && hctx->sched_data) {
e->type->ops.exit_hctx(hctx, i);
hctx->sched_data = NULL;
}
flags = hctx->flags;
}
blk_mq_debugfs_unregister_sched(q);
if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e);
blk_mq_sched_tags_teardown(q);
if (blk_mq_is_sbitmap_shared(flags))
blk_mq_exit_sched_shared_sbitmap(q);
q->elevator = NULL;
}
// SPDX-License-Identifier: GPL-2.0
/*
* Tag allocation using scalable bitmaps. Uses active queue tracking to support
* fairer distribution of tags between multiple submitters when a shared tag map
* is used.
*
* Copyright (C) 2013-2014 Jens Axboe
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
/*
* If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail
* to get tag when first time, the other shared-tag users could reserve
* budget for it.
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
!test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
atomic_inc(&set->active_queues_shared_sbitmap);
} else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
atomic_inc(&hctx->tags->active_queues);
}
return true;
}
/*
* Wakeup all potentially sleeping on tags
*/
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
sbitmap_queue_wake_all(tags->bitmap_tags);
if (include_reserve)
sbitmap_queue_wake_all(tags->breserved_tags);
}
/*
* If a previously busy queue goes inactive, potential waiters could now
* be allowed to queue. Wake them up and check.
*/
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
&q->queue_flags))
return;
atomic_dec(&set->active_queues_shared_sbitmap);
} else {
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;
atomic_dec(&tags->active_queues);
}
blk_mq_tag_wakeup_all(tags, false);
}
static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
struct sbitmap_queue *bt)
{
if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) && !hctx_may_queue(data->hctx, bt))
return BLK_MQ_NO_TAG;
if (data->shallow_depth) return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
else
return __sbitmap_queue_get(bt);
}
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct sbitmap_queue *bt;
struct sbq_wait_state *ws;
DEFINE_SBQ_WAIT(wait);
unsigned int tag_offset;
int tag;
if (data->flags & BLK_MQ_REQ_RESERVED) {
if (unlikely(!tags->nr_reserved_tags)) { WARN_ON_ONCE(1);
return BLK_MQ_NO_TAG;
}
bt = tags->breserved_tags;
tag_offset = 0;
} else {
bt = tags->bitmap_tags;
tag_offset = tags->nr_reserved_tags;
}
tag = __blk_mq_get_tag(data, bt);
if (tag != BLK_MQ_NO_TAG)
goto found_tag;
if (data->flags & BLK_MQ_REQ_NOWAIT)
return BLK_MQ_NO_TAG;
ws = bt_wait_ptr(bt, data->hctx);
do {
struct sbitmap_queue *bt_prev;
/*
* We're out of tags on this hardware queue, kick any
* pending IO submits before going to sleep waiting for
* some to complete.
*/
blk_mq_run_hw_queue(data->hctx, false);
/*
* Retry tag allocation after running the hardware queue,
* as running the queue may also have found completions.
*/
tag = __blk_mq_get_tag(data, bt);
if (tag != BLK_MQ_NO_TAG)
break;
sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
tag = __blk_mq_get_tag(data, bt);
if (tag != BLK_MQ_NO_TAG)
break;
bt_prev = bt;
io_schedule();
sbitmap_finish_wait(bt, ws, &wait);
data->ctx = blk_mq_get_ctx(data->q);
data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
data->ctx);
tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED)
bt = tags->breserved_tags;
else
bt = tags->bitmap_tags;
/*
* If destination hw queue is changed, fake wake up on
* previous queue for compensating the wake up miss, so
* other allocations on previous queue won't be starved.
*/
if (bt != bt_prev) sbitmap_queue_wake_up(bt_prev);
ws = bt_wait_ptr(bt, data->hctx);
} while (1);
sbitmap_finish_wait(bt, ws, &wait);
found_tag:
/*
* Give up this allocation if the hctx is inactive. The caller will
* retry on an active hctx.
*/
if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) { blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
return BLK_MQ_NO_TAG;
}
return tag + tag_offset;
}
void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
unsigned int tag)
{
if (!blk_mq_tag_is_reserved(tags, tag)) {
const int real_tag = tag - tags->nr_reserved_tags;
BUG_ON(real_tag >= tags->nr_tags);
sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
} else {
BUG_ON(tag >= tags->nr_reserved_tags);
sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
}
}
struct bt_iter_data {
struct blk_mq_hw_ctx *hctx;
busy_iter_fn *fn;
void *data;
bool reserved;
};
static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
unsigned int bitnr)
{
struct request *rq;
unsigned long flags;
spin_lock_irqsave(&tags->lock, flags);
rq = tags->rqs[bitnr];
if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref))
rq = NULL;
spin_unlock_irqrestore(&tags->lock, flags);
return rq;
}
static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
struct bt_iter_data *iter_data = data;
struct blk_mq_hw_ctx *hctx = iter_data->hctx;
struct blk_mq_tags *tags = hctx->tags;
bool reserved = iter_data->reserved;
struct request *rq;
bool ret = true;
if (!reserved)
bitnr += tags->nr_reserved_tags;
/*
* We can hit rq == NULL here, because the tagging functions
* test and set the bit before assigning ->rqs[].
*/
rq = blk_mq_find_and_get_req(tags, bitnr);
if (!rq)
return true;
if (rq->q == hctx->queue && rq->mq_hctx == hctx)
ret = iter_data->fn(hctx, rq, iter_data->data, reserved);
blk_mq_put_rq_ref(rq);
return ret;
}
/**
* bt_for_each - iterate over the requests associated with a hardware queue
* @hctx: Hardware queue to examine.
* @bt: sbitmap to examine. This is either the breserved_tags member
* or the bitmap_tags member of struct blk_mq_tags.
* @fn: Pointer to the function that will be called for each request
* associated with @hctx that has been assigned a driver tag.
* @fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
* where rq is a pointer to a request. Return true to continue
* iterating tags, false to stop.
* @data: Will be passed as third argument to @fn.
* @reserved: Indicates whether @bt is the breserved_tags member or the
* bitmap_tags member of struct blk_mq_tags.
*/
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
busy_iter_fn *fn, void *data, bool reserved)
{
struct bt_iter_data iter_data = {
.hctx = hctx,
.fn = fn,
.data = data,
.reserved = reserved,
};
sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
}
struct bt_tags_iter_data {
struct blk_mq_tags *tags;
busy_tag_iter_fn *fn;
void *data;
unsigned int flags;
};
#define BT_TAG_ITER_RESERVED (1 << 0)
#define BT_TAG_ITER_STARTED (1 << 1)
#define BT_TAG_ITER_STATIC_RQS (1 << 2)
static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
struct bt_tags_iter_data *iter_data = data;
struct blk_mq_tags *tags = iter_data->tags;
bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
struct request *rq;
bool ret = true;
bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS);
if (!reserved)
bitnr += tags->nr_reserved_tags;
/*
* We can hit rq == NULL here, because the tagging functions
* test and set the bit before assigning ->rqs[].
*/
if (iter_static_rqs)
rq = tags->static_rqs[bitnr];
else
rq = blk_mq_find_and_get_req(tags, bitnr);
if (!rq)
return true;
if (!(iter_data->flags & BT_TAG_ITER_STARTED) ||
blk_mq_request_started(rq))
ret = iter_data->fn(rq, iter_data->data, reserved);
if (!iter_static_rqs)
blk_mq_put_rq_ref(rq);
return ret;
}
/**
* bt_tags_for_each - iterate over the requests in a tag map
* @tags: Tag map to iterate over.
* @bt: sbitmap to examine. This is either the breserved_tags member
* or the bitmap_tags member of struct blk_mq_tags.
* @fn: Pointer to the function that will be called for each started
* request. @fn will be called as follows: @fn(rq, @data,
* @reserved) where rq is a pointer to a request. Return true
* to continue iterating tags, false to stop.
* @data: Will be passed as second argument to @fn.
* @flags: BT_TAG_ITER_*
*/
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
busy_tag_iter_fn *fn, void *data, unsigned int flags)
{
struct bt_tags_iter_data iter_data = {
.tags = tags,
.fn = fn,
.data = data,
.flags = flags,
};
if (tags->rqs)
sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}
static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
busy_tag_iter_fn *fn, void *priv, unsigned int flags)
{
WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
if (tags->nr_reserved_tags)
bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
flags | BT_TAG_ITER_RESERVED);
bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
}
/**
* blk_mq_all_tag_iter - iterate over all requests in a tag map
* @tags: Tag map to iterate over.
* @fn: Pointer to the function that will be called for each
* request. @fn will be called as follows: @fn(rq, @priv,
* reserved) where rq is a pointer to a request. 'reserved'
* indicates whether or not @rq is a reserved request. Return
* true to continue iterating tags, false to stop.
* @priv: Will be passed as second argument to @fn.
*
* Caller has to pass the tag map from which requests are allocated.
*/
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
void *priv)
{
__blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS);
}
/**
* blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
* @tagset: Tag set to iterate over.
* @fn: Pointer to the function that will be called for each started
* request. @fn will be called as follows: @fn(rq, @priv,
* reserved) where rq is a pointer to a request. 'reserved'
* indicates whether or not @rq is a reserved request. Return
* true to continue iterating tags, false to stop.
* @priv: Will be passed as second argument to @fn.
*
* We grab one request reference before calling @fn and release it after
* @fn returns.
*/
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv)
{
int i;
for (i = 0; i < tagset->nr_hw_queues; i++) {
if (tagset->tags && tagset->tags[i])
__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
BT_TAG_ITER_STARTED);
}
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
void *data, bool reserved)
{
unsigned *count = data;
if (blk_mq_request_completed(rq))
(*count)++;
return true;
}
/**
* blk_mq_tagset_wait_completed_request - Wait until all scheduled request
* completions have finished.
* @tagset: Tag set to drain completed request
*
* Note: This function has to be run after all IO queues are shutdown
*/
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
{
while (true) {
unsigned count = 0;
blk_mq_tagset_busy_iter(tagset,
blk_mq_tagset_count_completed_rqs, &count);
if (!count)
break;
msleep(5);
}
}
EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
/**
* blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
* @q: Request queue to examine.
* @fn: Pointer to the function that will be called for each request
* on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
* reserved) where rq is a pointer to a request and hctx points
* to the hardware queue associated with the request. 'reserved'
* indicates whether or not @rq is a reserved request.
* @priv: Will be passed as third argument to @fn.
*
* Note: if @q->tag_set is shared with other request queues then @fn will be
* called for all requests on all queues that share that tag set and not only
* for requests associated with @q.
*/
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv)
{
struct blk_mq_hw_ctx *hctx;
int i;
/*
* __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
* while the queue is frozen. So we can use q_usage_counter to avoid
* racing with it.
*/
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_tags *tags = hctx->tags;
/*
* If no software queues are currently mapped to this
* hardware queue, there's nothing to check
*/
if (!blk_mq_hw_queue_mapped(hctx))
continue;
if (tags->nr_reserved_tags)
bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
}
blk_queue_exit(q);
}
static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
bool round_robin, int node)
{
return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
node);
}
int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
struct sbitmap_queue *breserved_tags,
unsigned int queue_depth, unsigned int reserved,
int node, int alloc_policy)
{
unsigned int depth = queue_depth - reserved;
bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
if (bt_alloc(bitmap_tags, depth, round_robin, node))
return -ENOMEM;
if (bt_alloc(breserved_tags, reserved, round_robin, node))
goto free_bitmap_tags;
return 0;
free_bitmap_tags:
sbitmap_queue_free(bitmap_tags);
return -ENOMEM;
}
static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
int node, int alloc_policy)
{
int ret;
ret = blk_mq_init_bitmaps(&tags->__bitmap_tags,
&tags->__breserved_tags,
tags->nr_tags, tags->nr_reserved_tags,
node, alloc_policy);
if (ret)
return ret;
tags->bitmap_tags = &tags->__bitmap_tags;
tags->breserved_tags = &tags->__breserved_tags;
return 0;
}
int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
{
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
int i, ret;
ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
set->queue_depth, set->reserved_tags,
set->numa_node, alloc_policy);
if (ret)
return ret;
for (i = 0; i < set->nr_hw_queues; i++) {
struct blk_mq_tags *tags = set->tags[i];
tags->bitmap_tags = &set->__bitmap_tags;
tags->breserved_tags = &set->__breserved_tags;
}
return 0;
}
void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
{
sbitmap_queue_free(&set->__bitmap_tags);
sbitmap_queue_free(&set->__breserved_tags);
}
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
unsigned int reserved_tags,
int node, unsigned int flags)
{
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) {
pr_err("blk-mq: tag depth too large\n");
return NULL;
}
tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
if (!tags)
return NULL;
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock);
if (blk_mq_is_sbitmap_shared(flags))
return tags;
if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
kfree(tags);
return NULL;
}
return tags;
}
void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
{
if (!blk_mq_is_sbitmap_shared(flags)) {
sbitmap_queue_free(tags->bitmap_tags);
sbitmap_queue_free(tags->breserved_tags);
}
kfree(tags);
}
int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
struct blk_mq_tags **tagsptr, unsigned int tdepth,
bool can_grow)
{
struct blk_mq_tags *tags = *tagsptr;
if (tdepth <= tags->nr_reserved_tags)
return -EINVAL;
/*
* If we are allowed to grow beyond the original size, allocate
* a new set of tags before freeing the old one.
*/
if (tdepth > tags->nr_tags) {
struct blk_mq_tag_set *set = hctx->queue->tag_set;
struct blk_mq_tags *new;
bool ret;
if (!can_grow)
return -EINVAL;
/*
* We need some sort of upper limit, set it high enough that
* no valid use cases should require more.
*/
if (tdepth > MAX_SCHED_RQ)
return -EINVAL;
new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
tags->nr_reserved_tags, set->flags);
if (!new)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
if (ret) {
blk_mq_free_rq_map(new, set->flags);
return -ENOMEM;
}
blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
blk_mq_free_rq_map(*tagsptr, set->flags);
*tagsptr = new;
} else {
/*
* Don't need (or can't) update reserved tags here, they
* remain static and should never need resizing.
*/
sbitmap_queue_resize(tags->bitmap_tags,
tdepth - tags->nr_reserved_tags);
}
return 0;
}
void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
{
sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
}
/**
* blk_mq_unique_tag() - return a tag that is unique queue-wide
* @rq: request for which to compute a unique tag
*
* The tag field in struct request is unique per hardware queue but not over
* all hardware queues. Hence this function that returns a tag with the
* hardware context index in the upper bits and the per hardware queue tag in
* the lower bits.
*
* Note: When called for a request that is queued on a non-multiqueue request
* queue, the hardware context index is set to zero.
*/
u32 blk_mq_unique_tag(struct request *rq)
{
return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) |
(rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
}
EXPORT_SYMBOL(blk_mq_unique_tag);
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* kref.h - library routines for handling generic reference counted objects
*
* Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
* Copyright (C) 2004 IBM Corp.
*
* based on kobject.h which was:
* Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org>
* Copyright (C) 2002-2003 Open Source Development Labs
*/
#ifndef _KREF_H_
#define _KREF_H_
#include <linux/spinlock.h>
#include <linux/refcount.h>
struct kref {
refcount_t refcount;
};
#define KREF_INIT(n) { .refcount = REFCOUNT_INIT(n), }
/**
* kref_init - initialize object.
* @kref: object in question.
*/
static inline void kref_init(struct kref *kref)
{
refcount_set(&kref->refcount, 1);
}
static inline unsigned int kref_read(const struct kref *kref)
{
return refcount_read(&kref->refcount);
}
/**
* kref_get - increment refcount for object.
* @kref: object.
*/
static inline void kref_get(struct kref *kref)
{
refcount_inc(&kref->refcount);
}
/**
* kref_put - decrement refcount for object.
* @kref: object.
* @release: pointer to the function that will clean up the object when the
* last reference to the object is released.
* This pointer is required, and it is not acceptable to pass kfree
* in as this function.
*
* Decrement the refcount, and if 0, call release().
* Return 1 if the object was removed, otherwise return 0. Beware, if this
* function returns 0, you still can not count on the kref from remaining in
* memory. Only use the return value if you want to see if the kref is now
* gone, not present.
*/
static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
if (refcount_dec_and_test(&kref->refcount)) {
release(kref);
return 1;
}
return 0;
}
static inline int kref_put_mutex(struct kref *kref,
void (*release)(struct kref *kref),
struct mutex *lock)
{
if (refcount_dec_and_mutex_lock(&kref->refcount, lock)) {
release(kref);
return 1;
}
return 0;
}
static inline int kref_put_lock(struct kref *kref,
void (*release)(struct kref *kref),
spinlock_t *lock)
{
if (refcount_dec_and_lock(&kref->refcount, lock)) {
release(kref);
return 1;
}
return 0;
}
/**
* kref_get_unless_zero - Increment refcount for object unless it is zero.
* @kref: object.
*
* Return non-zero if the increment succeeded. Otherwise return 0.
*
* This function is intended to simplify locking around refcounting for
* objects that can be looked up from a lookup structure, and which are
* removed from that lookup structure in the object destructor.
* Operations on such objects require at least a read lock around
* lookup + kref_get, and a write lock around kref_put + remove from lookup
* structure. Furthermore, RCU implementations become extremely tricky.
* With a lookup followed by a kref_get_unless_zero *with return value check*
* locking in the kref_put path can be deferred to the actual removal from
* the lookup structure and RCU lookups become trivial.
*/
static inline int __must_check kref_get_unless_zero(struct kref *kref)
{
return refcount_inc_not_zero(&kref->refcount);
}
#endif /* _KREF_H_ */
/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file provides wrappers with sanitizer instrumentation for non-atomic
* bit operations.
*
* To use this functionality, an arch's bitops.h file needs to define each of
* the below bit operations with an arch_ prefix (e.g. arch_set_bit(),
* arch___set_bit(), etc.).
*/
#ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
#define _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H
#include <linux/instrumented.h>
/**
* __set_bit - Set a bit in memory
* @nr: the bit to set
* @addr: the address to start counting from
*
* Unlike set_bit(), this function is non-atomic. If it is called on the same
* region of memory concurrently, the effect may be that only one operation
* succeeds.
*/
static inline void __set_bit(long nr, volatile unsigned long *addr)
{
instrument_write(addr + BIT_WORD(nr), sizeof(long));
arch___set_bit(nr, addr);
}
/**
* __clear_bit - Clears a bit in memory
* @nr: the bit to clear
* @addr: the address to start counting from
*
* Unlike clear_bit(), this function is non-atomic. If it is called on the same
* region of memory concurrently, the effect may be that only one operation
* succeeds.
*/
static inline void __clear_bit(long nr, volatile unsigned long *addr)
{
instrument_write(addr + BIT_WORD(nr), sizeof(long));
arch___clear_bit(nr, addr);
}
/**
* __change_bit - Toggle a bit in memory
* @nr: the bit to change
* @addr: the address to start counting from
*
* Unlike change_bit(), this function is non-atomic. If it is called on the same
* region of memory concurrently, the effect may be that only one operation
* succeeds.
*/
static inline void __change_bit(long nr, volatile unsigned long *addr)
{
instrument_write(addr + BIT_WORD(nr), sizeof(long));
arch___change_bit(nr, addr);
}
static inline void __instrument_read_write_bitop(long nr, volatile unsigned long *addr)
{
if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) {
/*
* We treat non-atomic read-write bitops a little more special.
* Given the operations here only modify a single bit, assuming
* non-atomicity of the writer is sufficient may be reasonable
* for certain usage (and follows the permissible nature of the
* assume-plain-writes-atomic rule):
* 1. report read-modify-write races -> check read;
* 2. do not report races with marked readers, but do report
* races with unmarked readers -> check "atomic" write.
*/
kcsan_check_read(addr + BIT_WORD(nr), sizeof(long));
/*
* Use generic write instrumentation, in case other sanitizers
* or tools are enabled alongside KCSAN.
*/
instrument_write(addr + BIT_WORD(nr), sizeof(long));
} else {
instrument_read_write(addr + BIT_WORD(nr), sizeof(long));
}
}
/**
* __test_and_set_bit - Set a bit and return its old value
* @nr: Bit to set
* @addr: Address to count from
*
* This operation is non-atomic. If two instances of this operation race, one
* can appear to succeed but actually fail.
*/
static inline bool __test_and_set_bit(long nr, volatile unsigned long *addr)
{
__instrument_read_write_bitop(nr, addr);
return arch___test_and_set_bit(nr, addr);
}
/**
* __test_and_clear_bit - Clear a bit and return its old value
* @nr: Bit to clear
* @addr: Address to count from
*
* This operation is non-atomic. If two instances of this operation race, one
* can appear to succeed but actually fail.
*/
static inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr)
{
__instrument_read_write_bitop(nr, addr);
return arch___test_and_clear_bit(nr, addr);
}
/**
* __test_and_change_bit - Change a bit and return its old value
* @nr: Bit to change
* @addr: Address to count from
*
* This operation is non-atomic. If two instances of this operation race, one
* can appear to succeed but actually fail.
*/
static inline bool __test_and_change_bit(long nr, volatile unsigned long *addr)
{
__instrument_read_write_bitop(nr, addr);
return arch___test_and_change_bit(nr, addr);
}
/**
* test_bit - Determine whether a bit is set
* @nr: bit number to test
* @addr: Address to start counting from
*/
static inline bool test_bit(long nr, const volatile unsigned long *addr)
{
instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long));
return arch_test_bit(nr, addr);
}
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
/* SPDX-License-Identifier: GPL-2.0 */
/* Freezer declarations */
#ifndef FREEZER_H_INCLUDED
#define FREEZER_H_INCLUDED
#include <linux/debug_locks.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#ifdef CONFIG_FREEZER
extern atomic_t system_freezing_cnt; /* nr of freezing conds in effect */
extern bool pm_freezing; /* PM freezing in effect */
extern bool pm_nosig_freezing; /* PM nosig freezing in effect */
/*
* Timeout for stopping processes
*/
extern unsigned int freeze_timeout_msecs;
/*
* Check if a process has been frozen
*/
static inline bool frozen(struct task_struct *p)
{
return p->flags & PF_FROZEN;
}
extern bool freezing_slow_path(struct task_struct *p);
/*
* Check if there is a request to freeze a process
*/
static inline bool freezing(struct task_struct *p)
{
if (likely(!atomic_read(&system_freezing_cnt)))
return false;
return freezing_slow_path(p);
}
/* Takes and releases task alloc lock using task_lock() */
extern void __thaw_task(struct task_struct *t);
extern bool __refrigerator(bool check_kthr_stop);
extern int freeze_processes(void);
extern int freeze_kernel_threads(void);
extern void thaw_processes(void);
extern void thaw_kernel_threads(void);
/*
* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION
* If try_to_freeze causes a lockdep warning it means the caller may deadlock
*/
static inline bool try_to_freeze_unsafe(void)
{
might_sleep();
if (likely(!freezing(current)))
return false;
return __refrigerator(false);
}
static inline bool try_to_freeze(void)
{
if (!(current->flags & PF_NOFREEZE))
debug_check_no_locks_held();
return try_to_freeze_unsafe();
}
extern bool freeze_task(struct task_struct *p);
extern bool set_freezable(void);
#ifdef CONFIG_CGROUP_FREEZER
extern bool cgroup_freezing(struct task_struct *task);
#else /* !CONFIG_CGROUP_FREEZER */
static inline bool cgroup_freezing(struct task_struct *task)
{
return false;
}
#endif /* !CONFIG_CGROUP_FREEZER */
/*
* The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
* calls wait_for_completion(&vfork) and reset right after it returns from this
* function. Next, the parent should call try_to_freeze() to freeze itself
* appropriately in case the child has exited before the freezing of tasks is
* complete. However, we don't want kernel threads to be frozen in unexpected
* places, so we allow them to block freeze_processes() instead or to set
* PF_NOFREEZE if needed. Fortunately, in the ____call_usermodehelper() case the
* parent won't really block freeze_processes(), since ____call_usermodehelper()
* (the child) does a little before exec/exit and it can't be frozen before
* waking up the parent.
*/
/**
* freezer_do_not_count - tell freezer to ignore %current
*
* Tell freezers to ignore the current task when determining whether the
* target frozen state is reached. IOW, the current task will be
* considered frozen enough by freezers.
*
* The caller shouldn't do anything which isn't allowed for a frozen task
* until freezer_cont() is called. Usually, freezer[_do_not]_count() pair
* wrap a scheduling operation and nothing much else.
*/
static inline void freezer_do_not_count(void)
{
current->flags |= PF_FREEZER_SKIP;
}
/**
* freezer_count - tell freezer to stop ignoring %current
*
* Undo freezer_do_not_count(). It tells freezers that %current should be
* considered again and tries to freeze if freezing condition is already in
* effect.
*/
static inline void freezer_count(void)
{
current->flags &= ~PF_FREEZER_SKIP;
/*
* If freezing is in progress, the following paired with smp_mb()
* in freezer_should_skip() ensures that either we see %true
* freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP.
*/
smp_mb();
try_to_freeze();
}
/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
static inline void freezer_count_unsafe(void)
{
current->flags &= ~PF_FREEZER_SKIP;
smp_mb();
try_to_freeze_unsafe();
}
/**
* freezer_should_skip - whether to skip a task when determining frozen
* state is reached
* @p: task in quesion
*
* This function is used by freezers after establishing %true freezing() to
* test whether a task should be skipped when determining the target frozen
* state is reached. IOW, if this function returns %true, @p is considered
* frozen enough.
*/
static inline bool freezer_should_skip(struct task_struct *p)
{
/*
* The following smp_mb() paired with the one in freezer_count()
* ensures that either freezer_count() sees %true freezing() or we
* see cleared %PF_FREEZER_SKIP and return %false. This makes it
* impossible for a task to slip frozen state testing after
* clearing %PF_FREEZER_SKIP.
*/
smp_mb();
return p->flags & PF_FREEZER_SKIP;
}
/*
* These functions are intended to be used whenever you want allow a sleeping
* task to be frozen. Note that neither return any clear indication of
* whether a freeze event happened while in this function.
*/
/* Like schedule(), but should not block the freezer. */
static inline void freezable_schedule(void)
{
freezer_do_not_count();
schedule();
freezer_count();
}
/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
static inline void freezable_schedule_unsafe(void)
{
freezer_do_not_count();
schedule();
freezer_count_unsafe();
}
/*
* Like schedule_timeout(), but should not block the freezer. Do not
* call this with locks held.
*/
static inline long freezable_schedule_timeout(long timeout)
{
long __retval;
freezer_do_not_count();
__retval = schedule_timeout(timeout);
freezer_count();
return __retval;
}
/*
* Like schedule_timeout_interruptible(), but should not block the freezer. Do not
* call this with locks held.
*/
static inline long freezable_schedule_timeout_interruptible(long timeout)
{
long __retval;
freezer_do_not_count();
__retval = schedule_timeout_interruptible(timeout);
freezer_count();
return __retval;
}
/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
static inline long freezable_schedule_timeout_interruptible_unsafe(long timeout)
{
long __retval;
freezer_do_not_count();
__retval = schedule_timeout_interruptible(timeout);
freezer_count_unsafe();
return __retval;
}
/* Like schedule_timeout_killable(), but should not block the freezer. */
static inline long freezable_schedule_timeout_killable(long timeout)
{
long __retval;
freezer_do_not_count();
__retval = schedule_timeout_killable(timeout);
freezer_count();
return __retval;
}
/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
static inline long freezable_schedule_timeout_killable_unsafe(long timeout)
{
long __retval;
freezer_do_not_count();
__retval = schedule_timeout_killable(timeout);
freezer_count_unsafe();
return __retval;
}
/*
* Like schedule_hrtimeout_range(), but should not block the freezer. Do not
* call this with locks held.
*/
static inline int freezable_schedule_hrtimeout_range(ktime_t *expires,
u64 delta, const enum hrtimer_mode mode)
{
int __retval;
freezer_do_not_count();
__retval = schedule_hrtimeout_range(expires, delta, mode);
freezer_count();
return __retval;
}
/*
* Freezer-friendly wrappers around wait_event_interruptible(),
* wait_event_killable() and wait_event_interruptible_timeout(), originally
* defined in <linux/wait.h>
*/
/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
#define wait_event_freezekillable_unsafe(wq, condition) \
({ \
int __retval; \
freezer_do_not_count(); \
__retval = wait_event_killable(wq, (condition)); \
freezer_count_unsafe(); \
__retval; \
})
#else /* !CONFIG_FREEZER */
static inline bool frozen(struct task_struct *p) { return false; }
static inline bool freezing(struct task_struct *p) { return false; }
static inline void __thaw_task(struct task_struct *t) {}
static inline bool __refrigerator(bool check_kthr_stop) { return false; }
static inline int freeze_processes(void) { return -ENOSYS; }
static inline int freeze_kernel_threads(void) { return -ENOSYS; }
static inline void thaw_processes(void) {}
static inline void thaw_kernel_threads(void) {}
static inline bool try_to_freeze(void) { return false; }
static inline void freezer_do_not_count(void) {}
static inline void freezer_count(void) {}
static inline int freezer_should_skip(struct task_struct *p) { return 0; }
static inline void set_freezable(void) {}
#define freezable_schedule() schedule()
#define freezable_schedule_unsafe() schedule()
#define freezable_schedule_timeout(timeout) schedule_timeout(timeout)
#define freezable_schedule_timeout_interruptible(timeout) \
schedule_timeout_interruptible(timeout)
#define freezable_schedule_timeout_interruptible_unsafe(timeout) \
schedule_timeout_interruptible(timeout)
#define freezable_schedule_timeout_killable(timeout) \
schedule_timeout_killable(timeout)
#define freezable_schedule_timeout_killable_unsafe(timeout) \
schedule_timeout_killable(timeout)
#define freezable_schedule_hrtimeout_range(expires, delta, mode) \
schedule_hrtimeout_range(expires, delta, mode)
#define wait_event_freezekillable_unsafe(wq, condition) \
wait_event_killable(wq, condition)
#endif /* !CONFIG_FREEZER */
#endif /* FREEZER_H_INCLUDED */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMZONE_H
#define _LINUX_MMZONE_H
#ifndef __ASSEMBLY__
#ifndef __GENERATING_BOUNDS_H
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/numa.h>
#include <linux/init.h>
#include <linux/seqlock.h>
#include <linux/nodemask.h>
#include <linux/pageblock-flags.h>
#include <linux/page-flags-layout.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
#include <asm/page.h>
/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
/*
* PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
* costly to service. That is between allocation orders which should
* coalesce naturally under reasonable reclaim pressure and those which
* will not.
*/
#define PAGE_ALLOC_COSTLY_ORDER 3
enum migratetype {
MIGRATE_UNMOVABLE,
MIGRATE_MOVABLE,
MIGRATE_RECLAIMABLE,
MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
/*
* MIGRATE_CMA migration type is designed to mimic the way
* ZONE_MOVABLE works. Only movable pages can be allocated
* from MIGRATE_CMA pageblocks and page allocator never
* implicitly change migration type of MIGRATE_CMA pageblock.
*
* The way to use it is to change migratetype of a range of
* pageblocks to MIGRATE_CMA which can be done by
* __free_pageblock_cma() function. What is important though
* is that a range of pageblocks must be aligned to
* MAX_ORDER_NR_PAGES should biggest page be bigger than
* a single pageblock.
*/
MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
MIGRATE_ISOLATE, /* can't allocate from here */
#endif
MIGRATE_TYPES
};
/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern const char * const migratetype_names[MIGRATE_TYPES];
#ifdef CONFIG_CMA
# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
#else
# define is_migrate_cma(migratetype) false
# define is_migrate_cma_page(_page) false
#endif
static inline bool is_migrate_movable(int mt)
{
return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE;
}
#define for_each_migratetype_order(order, type) \
for (order = 0; order < MAX_ORDER; order++) \
for (type = 0; type < MIGRATE_TYPES; type++)
extern int page_group_by_mobility_disabled;
#define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1)
#define get_pageblock_migratetype(page) \
get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK)
struct free_area {
struct list_head free_list[MIGRATE_TYPES];
unsigned long nr_free;
};
static inline struct page *get_page_from_free_area(struct free_area *area,
int migratetype)
{
return list_first_entry_or_null(&area->free_list[migratetype],
struct page, lru);
}
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
return list_empty(&area->free_list[migratetype]);
}
struct pglist_data;
/*
* Add a wild amount of padding here to ensure data fall into separate
* cachelines. There are very few zone structures in the machine, so space
* consumption is not a concern here.
*/
#if defined(CONFIG_SMP)
struct zone_padding {
char x[0];
} ____cacheline_internodealigned_in_smp;
#define ZONE_PADDING(name) struct zone_padding name;
#else
#define ZONE_PADDING(name)
#endif
#ifdef CONFIG_NUMA
enum numa_stat_item {
NUMA_HIT, /* allocated in intended node */
NUMA_MISS, /* allocated in non intended node */
NUMA_FOREIGN, /* was intended here, hit elsewhere */
NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */
NUMA_LOCAL, /* allocation from local node */
NUMA_OTHER, /* allocation from other node */
NR_VM_NUMA_EVENT_ITEMS
};
#else
#define NR_VM_NUMA_EVENT_ITEMS 0
#endif
enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
NR_ZONE_ACTIVE_ANON,
NR_ZONE_INACTIVE_FILE,
NR_ZONE_ACTIVE_FILE,
NR_ZONE_UNEVICTABLE,
NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
NR_MLOCK, /* mlock()ed pages found and moved off LRU */
/* Second 128 byte cacheline */
NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
NR_ZSPAGES, /* allocated in zsmalloc */
#endif
NR_FREE_CMA_PAGES,
NR_VM_ZONE_STAT_ITEMS };
enum node_stat_item {
NR_LRU_BASE,
NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
NR_ACTIVE_ANON, /* " " " " " */
NR_INACTIVE_FILE, /* " " " " " */
NR_ACTIVE_FILE, /* " " " " " */
NR_UNEVICTABLE, /* " " " " " */
NR_SLAB_RECLAIMABLE_B,
NR_SLAB_UNRECLAIMABLE_B,
NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
WORKINGSET_NODES,
WORKINGSET_REFAULT_BASE,
WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE,
WORKINGSET_REFAULT_FILE,
WORKINGSET_ACTIVATE_BASE,
WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE,
WORKINGSET_ACTIVATE_FILE,
WORKINGSET_RESTORE_BASE,
WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE,
WORKINGSET_RESTORE_FILE,
WORKINGSET_NODERECLAIM,
NR_ANON_MAPPED, /* Mapped anonymous pages */
NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
only modified from process context */
NR_FILE_PAGES,
NR_FILE_DIRTY,
NR_WRITEBACK,
NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */
NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
NR_SHMEM_THPS,
NR_SHMEM_PMDMAPPED,
NR_FILE_THPS,
NR_FILE_PMDMAPPED,
NR_ANON_THPS,
NR_VMSCAN_WRITE,
NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
NR_DIRTIED, /* page dirtyings since bootup */
NR_WRITTEN, /* page writings since bootup */
NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */
NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */
NR_KERNEL_STACK_KB, /* measured in KiB */
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
NR_KERNEL_SCS_KB, /* measured in KiB */
#endif
NR_PAGETABLE, /* used for pagetables */
#ifdef CONFIG_SWAP
NR_SWAPCACHE,
#endif
NR_VM_NODE_STAT_ITEMS
};
/*
* Returns true if the item should be printed in THPs (/proc/vmstat
* currently prints number of anon, file and shmem THPs. But the item
* is charged in pages).
*/
static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
{
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return false;
return item == NR_ANON_THPS ||
item == NR_FILE_THPS ||
item == NR_SHMEM_THPS ||
item == NR_SHMEM_PMDMAPPED ||
item == NR_FILE_PMDMAPPED;
}
/*
* Returns true if the value is measured in bytes (most vmstat values are
* measured in pages). This defines the API part, the internal representation
* might be different.
*/
static __always_inline bool vmstat_item_in_bytes(int idx)
{
/*
* Global and per-node slab counters track slab pages.
* It's expected that changes are multiples of PAGE_SIZE.
* Internally values are stored in pages.
*
* Per-memcg and per-lruvec counters track memory, consumed
* by individual slab objects. These counters are actually
* byte-precise.
*/
return (idx == NR_SLAB_RECLAIMABLE_B ||
idx == NR_SLAB_UNRECLAIMABLE_B);
}
/*
* We do arithmetic on the LRU lists in various places in the code,
* so it is important to keep the active lists LRU_ACTIVE higher in
* the array than the corresponding inactive lists, and to keep
* the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
*
* This has to be kept in sync with the statistics in zone_stat_item
* above and the descriptions in vmstat_text in mm/vmstat.c
*/
#define LRU_BASE 0
#define LRU_ACTIVE 1
#define LRU_FILE 2
enum lru_list {
LRU_INACTIVE_ANON = LRU_BASE,
LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
LRU_UNEVICTABLE,
NR_LRU_LISTS
};
#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)
#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
static inline bool is_file_lru(enum lru_list lru)
{
return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
}
static inline bool is_active_lru(enum lru_list lru)
{
return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
}
#define ANON_AND_FILE 2
enum lruvec_flags {
LRUVEC_CONGESTED, /* lruvec has many dirty pages
* backed by a congested BDI
*/
};
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
spinlock_t lru_lock;
/*
* These track the cost of reclaiming one LRU - file or anon -
* over the other. As the observed cost of reclaiming one LRU
* increases, the reclaim scan balance tips toward the other.
*/
unsigned long anon_cost;
unsigned long file_cost;
/* Non-resident age, driven by LRU movement */
atomic_long_t nonresident_age;
/* Refaults at the time of last reclaim cycle */
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
};
/* Isolate unmapped pages */
#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2)
/* Isolate for asynchronous migration */
#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4)
/* Isolate unevictable pages */
#define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8)
/* LRU Isolation modes. */
typedef unsigned __bitwise isolate_mode_t;
enum zone_watermarks {
WMARK_MIN,
WMARK_LOW,
WMARK_HIGH,
NR_WMARK
};
/*
* One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional
* for pageblock size for THP if configured.
*/
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_PCP_THP 1
#else
#define NR_PCP_THP 0
#endif
#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP))
/*
* Shift to encode migratetype and order in the same integer, with order
* in the least significant bits.
*/
#define NR_PCP_ORDER_WIDTH 8
#define NR_PCP_ORDER_MASK ((1<<NR_PCP_ORDER_WIDTH) - 1)
#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
/* Fields and list protected by pagesets local_lock in page_alloc.c */
struct per_cpu_pages {
int count; /* number of pages in the list */
int high; /* high watermark, emptying needed */
int batch; /* chunk size for buddy add/remove */
short free_factor; /* batch scaling factor during free */
#ifdef CONFIG_NUMA
short expire; /* When 0, remote pagesets are drained */
#endif
/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[NR_PCP_LISTS];
};
struct per_cpu_zonestat {
#ifdef CONFIG_SMP
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
s8 stat_threshold;
#endif
#ifdef CONFIG_NUMA
/*
* Low priority inaccurate counters that are only folded
* on demand. Use a large type to avoid the overhead of
* folding during refresh_cpu_vm_stats.
*/
unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
#endif
};
struct per_cpu_nodestat {
s8 stat_threshold;
s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
};
#endif /* !__GENERATING_BOUNDS.H */
enum zone_type {
/*
* ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
* to DMA to all of the addressable memory (ZONE_NORMAL).
* On architectures where this area covers the whole 32 bit address
* space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
* DMA addressing constraints. This distinction is important as a 32bit
* DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
* platforms may need both zones as they support peripherals with
* different DMA addressing limitations.
*/
#ifdef CONFIG_ZONE_DMA
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
ZONE_DMA32,
#endif
/*
* Normal addressable memory is in ZONE_NORMAL. DMA operations can be
* performed on pages in ZONE_NORMAL if the DMA devices support
* transfers to all addressable memory.
*/
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
/*
* A memory area that is only addressable by the kernel through
* mapping portions into its own address space. This is for example
* used by i386 to allow the kernel to address the memory beyond
* 900MB. The kernel will set up special mappings (page
* table entries on i386) for each page that the kernel needs to
* access.
*/
ZONE_HIGHMEM,
#endif
/*
* ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
* movable pages with few exceptional cases described below. Main use
* cases for ZONE_MOVABLE are to make memory offlining/unplug more
* likely to succeed, and to locally limit unmovable allocations - e.g.,
* to increase the number of THP/huge pages. Notable special cases are:
*
* 1. Pinned pages: (long-term) pinning of movable pages might
* essentially turn such pages unmovable. Therefore, we do not allow
* pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
* faulted, they come from the right zone right away. However, it is
* still possible that address space already has pages in
* ZONE_MOVABLE at the time when pages are pinned (i.e. user has
* touches that memory before pinning). In such case we migrate them
* to a different zone. When migration fails - pinning fails.
* 2. memblock allocations: kernelcore/movablecore setups might create
* situations where ZONE_MOVABLE contains unmovable allocations
* after boot. Memory offlining and allocations fail early.
* 3. Memory holes: kernelcore/movablecore setups might create very rare
* situations where ZONE_MOVABLE contains memory holes after boot,
* for example, if we have sections that are only partially
* populated. Memory offlining and allocations fail early.
* 4. PG_hwpoison pages: while poisoned pages can be skipped during
* memory offlining, such pages cannot be allocated.
* 5. Unmovable PG_offline pages: in paravirtualized environments,
* hotplugged memory blocks might only partially be managed by the
* buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
* parts not manged by the buddy are unmovable PG_offline pages. In
* some cases (virtio-mem), such pages can be skipped during
* memory offlining, however, cannot be moved/allocated. These
* techniques might use alloc_contig_range() to hide previously
* exposed pages from the buddy again (e.g., to implement some sort
* of memory unplug in virtio-mem).
* 6. ZERO_PAGE(0), kernelcore/movablecore setups might create
* situations where ZERO_PAGE(0) which is allocated differently
* on different platforms may end up in a movable zone. ZERO_PAGE(0)
* cannot be migrated.
* 7. Memory-hotplug: when using memmap_on_memory and onlining the
* memory to the MOVABLE zone, the vmemmap pages are also placed in
* such zone. Such pages cannot be really moved around as they are
* self-stored in the range, but they are treated as movable when
* the range they describe is about to be offlined.
*
* In general, no unmovable allocations that degrade memory offlining
* should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
* have to expect that migrating pages in ZONE_MOVABLE can fail (even
* if has_unmovable_pages() states that there are no unmovable pages,
* there can be false negatives).
*/
ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
ZONE_DEVICE,
#endif
__MAX_NR_ZONES
};
#ifndef __GENERATING_BOUNDS_H
#define ASYNC_AND_SYNC 2
struct zone {
/* Read-mostly fields */
/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long _watermark[NR_WMARK];
unsigned long watermark_boost;
unsigned long nr_reserved_highatomic;
/*
* We don't know if the memory that we're going to allocate will be
* freeable or/and it will be released eventually, so to avoid totally
* wasting several GB of ram we must reserve some of the lower zone
* memory (otherwise we risk to run OOM on the lower zones despite
* there being tons of freeable ram on the higher zones). This array is
* recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
* changes.
*/
long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
int node;
#endif
struct pglist_data *zone_pgdat;
struct per_cpu_pages __percpu *per_cpu_pageset;
struct per_cpu_zonestat __percpu *per_cpu_zonestats;
/*
* the high and batch values are copied to individual pagesets for
* faster access
*/
int pageset_high;
int pageset_batch;
#ifndef CONFIG_SPARSEMEM
/*
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
* In SPARSEMEM, this map is stored in struct mem_section
*/
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
/*
* spanned_pages is the total pages spanned by the zone, including
* holes, which is calculated as:
* spanned_pages = zone_end_pfn - zone_start_pfn;
*
* present_pages is physical pages existing within the zone, which
* is calculated as:
* present_pages = spanned_pages - absent_pages(pages in holes);
*
* present_early_pages is present pages existing within the zone
* located on memory available since early boot, excluding hotplugged
* memory.
*
* managed_pages is present pages managed by the buddy system, which
* is calculated as (reserved_pages includes pages allocated by the
* bootmem allocator):
* managed_pages = present_pages - reserved_pages;
*
* cma pages is present pages that are assigned for CMA use
* (MIGRATE_CMA).
*
* So present_pages may be used by memory hotplug or memory power
* management logic to figure out unmanaged pages by checking
* (present_pages - managed_pages). And managed_pages should be used
* by page allocator and vm scanner to calculate all kinds of watermarks
* and thresholds.
*
* Locking rules:
*
* zone_start_pfn and spanned_pages are protected by span_seqlock.
* It is a seqlock because it has to be read outside of zone->lock,
* and it is done in the main allocator path. But, it is written
* quite infrequently.
*
* The span_seq lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*
* Write access to present_pages at runtime should be protected by
* mem_hotplug_begin/end(). Any reader who can't tolerant drift of
* present_pages should get_online_mems() to get a stable value.
*/
atomic_long_t managed_pages;
unsigned long spanned_pages;
unsigned long present_pages;
#if defined(CONFIG_MEMORY_HOTPLUG)
unsigned long present_early_pages;
#endif
#ifdef CONFIG_CMA
unsigned long cma_pages;
#endif
const char *name;
#ifdef CONFIG_MEMORY_ISOLATION
/*
* Number of isolated pageblock. It is used to solve incorrect
* freepage counting problem due to racy retrieving migratetype
* of pageblock. Protected by zone->lock.
*/
unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif
int initialized;
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
/* free areas of different sizes */
struct free_area free_area[MAX_ORDER];
/* zone flags, see below */
unsigned long flags;
/* Primarily protects free_area */
spinlock_t lock;
/* Write-intensive fields used by compaction and vmstats. */
ZONE_PADDING(_pad2_)
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
* drift allowing watermarks to be breached
*/
unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* pfn where compaction free scanner should start */
unsigned long compact_cached_free_pfn;
/* pfn where compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC];
unsigned long compact_init_migrate_pfn;
unsigned long compact_init_free_pfn;
#endif
#ifdef CONFIG_COMPACTION
/*
* On compaction failure, 1<<compact_defer_shift compactions
* are skipped before trying again. The number attempted since
* last failure is tracked with compact_considered.
* compact_order_failed is the minimum compaction failed order.
*/
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* Set to true when the PG_migrate_skip bits should be cleared */
bool compact_blockskip_flush;
#endif
bool contiguous;
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
} ____cacheline_internodealigned_in_smp;
enum pgdat_flags {
PGDAT_DIRTY, /* reclaim scanning has recently found
* many dirty file pages at the tail
* of the LRU.
*/
PGDAT_WRITEBACK, /* reclaim scanning has recently found
* many pages under writeback
*/
PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */
};
enum zone_flags {
ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks.
* Cleared when kswapd is woken.
*/
ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */
};
static inline unsigned long zone_managed_pages(struct zone *zone)
{
return (unsigned long)atomic_long_read(&zone->managed_pages);
}
static inline unsigned long zone_cma_pages(struct zone *zone)
{
#ifdef CONFIG_CMA
return zone->cma_pages;
#else
return 0;
#endif
}
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
return zone->zone_start_pfn + zone->spanned_pages;
}
static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
}
static inline bool zone_is_initialized(struct zone *zone)
{
return zone->initialized;
}
static inline bool zone_is_empty(struct zone *zone)
{
return zone->spanned_pages == 0;
}
/*
* Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
* intersection with the given zone
*/
static inline bool zone_intersects(struct zone *zone,
unsigned long start_pfn, unsigned long nr_pages)
{
if (zone_is_empty(zone))
return false;
if (start_pfn >= zone_end_pfn(zone) ||
start_pfn + nr_pages <= zone->zone_start_pfn)
return false;
return true;
}
/*
* The "priority" of VM scanning is how much of the queues we will scan in one
* go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
* queues ("queue_length >> 12") during an aging round.
*/
#define DEF_PRIORITY 12
/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
enum {
ZONELIST_FALLBACK, /* zonelist with fallback */
#ifdef CONFIG_NUMA
/*
* The NUMA zonelists are doubled because we need zonelists that
* restrict the allocations to a single node for __GFP_THISNODE.
*/
ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */
#endif
MAX_ZONELISTS
};
/*
* This struct contains information about a zone in a zonelist. It is stored
* here to avoid dereferences into large structures and lookups of tables
*/
struct zoneref {
struct zone *zone; /* Pointer to actual zone */
int zone_idx; /* zone_idx(zoneref->zone) */
};
/*
* One allocation request operates on a zonelist. A zonelist
* is a list of zones, the first one is the 'goal' of the
* allocation, the other zones are fallback zones, in decreasing
* priority.
*
* To speed the reading of the zonelist, the zonerefs contain the zone index
* of the entry being read. Helper functions to access information given
* a struct zoneref are
*
* zonelist_zone() - Return the struct zone * for an entry in _zonerefs
* zonelist_zone_idx() - Return the index of the zone for an entry
* zonelist_node_idx() - Return the index of the node for an entry
*/
struct zonelist {
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};
/*
* The array of struct pages for flatmem.
* It must be declared for SPARSEMEM as well because there are configurations
* that rely on that.
*/
extern struct page *mem_map;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct deferred_split {
spinlock_t split_queue_lock;
struct list_head split_queue;
unsigned long split_queue_len;
};
#endif
/*
* On NUMA machines, each NUMA node would have a pg_data_t to describe
* it's memory layout. On UMA machines there is a single pglist_data which
* describes the whole memory.
*
* Memory statistics and page replacement data structures are maintained on a
* per-zone basis.
*/
typedef struct pglist_data {
/*
* node_zones contains just the zones for THIS node. Not all of the
* zones may be populated, but it is the full list. It is referenced by
* this node's node_zonelists as well as other node's node_zonelists.
*/
struct zone node_zones[MAX_NR_ZONES];
/*
* node_zonelists contains references to all zones in all nodes.
* Generally the first zones will be references to this node's
* node_zones.
*/
struct zonelist node_zonelists[MAX_ZONELISTS];
int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLATMEM /* means !SPARSEMEM */
struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
/*
* Must be held any time you expect node_start_pfn,
* node_present_pages, node_spanned_pages or nr_zones to stay constant.
* Also synchronizes pgdat->first_deferred_pfn during deferred page
* init.
*
* pgdat_resize_lock() and pgdat_resize_unlock() are provided to
* manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
* or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
*
* Nests above zone->lock and zone->span_seqlock
*/
spinlock_t node_size_lock;
#endif
unsigned long node_start_pfn;
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
int node_id;
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
struct task_struct *kswapd; /* Protected by
mem_hotplug_begin/end() */
int kswapd_order;
enum zone_type kswapd_highest_zoneidx;
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_highest_zoneidx;
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
bool proactive_compact_trigger;
#endif
/*
* This is a per-node reserve of pages that are not available
* to userspace allocations.
*/
unsigned long totalreserve_pages;
#ifdef CONFIG_NUMA
/*
* node reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
/* Write-intensive fields used by page reclaim */
ZONE_PADDING(_pad1_)
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* If memory initialisation on large machines is deferred then this
* is the first PFN that needs to be initialised.
*/
unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct deferred_split deferred_split_queue;
#endif
/* Fields commonly accessed by the page reclaim scanner */
/*
* NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
*
* Use mem_cgroup_lruvec() to look up lruvecs.
*/
struct lruvec __lruvec;
unsigned long flags;
ZONE_PADDING(_pad2_)
/* Per-node vmstats */
struct per_cpu_nodestat __percpu *per_cpu_nodestats;
atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
#ifdef CONFIG_FLATMEM
#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr))
#else
#define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr))
#endif
#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr))
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
return pgdat->node_start_pfn + pgdat->node_spanned_pages;
}
static inline bool pgdat_is_empty(pg_data_t *pgdat)
{
return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
}
#include <linux/memory_hotplug.h>
void build_all_zonelists(pg_data_t *pgdat);
void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
enum zone_type highest_zoneidx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int highest_zoneidx, unsigned int alloc_flags,
long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx,
unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx);
/*
* Memory initialization context, use to differentiate memory added by
* the platform statically or via memory hotplug interface.
*/
enum meminit_context {
MEMINIT_EARLY,
MEMINIT_HOTPLUG,
};
extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
unsigned long size);
extern void lruvec_init(struct lruvec *lruvec);
static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
{
#ifdef CONFIG_MEMCG
return lruvec->pgdat;
#else
return container_of(lruvec, struct pglist_data, __lruvec);
#endif
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
int local_memory_node(int node_id);
#else
static inline int local_memory_node(int node_id) { return node_id; };
#endif
/*
* zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
*/
#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
#ifdef CONFIG_ZONE_DEVICE
static inline bool zone_is_zone_device(struct zone *zone)
{
return zone_idx(zone) == ZONE_DEVICE;
}
#else
static inline bool zone_is_zone_device(struct zone *zone)
{
return false;
}
#endif
/*
* Returns true if a zone has pages managed by the buddy allocator.
* All the reclaim decisions have to use this function rather than
* populated_zone(). If the whole zone is reserved then we can easily
* end up with populated_zone() && !managed_zone().
*/
static inline bool managed_zone(struct zone *zone)
{
return zone_managed_pages(zone);
}
/* Returns true if a zone has memory */
static inline bool populated_zone(struct zone *zone)
{
return zone->present_pages;
}
#ifdef CONFIG_NUMA
static inline int zone_to_nid(struct zone *zone)
{
return zone->node;
}
static inline void zone_set_nid(struct zone *zone, int nid)
{
zone->node = nid;
}
#else
static inline int zone_to_nid(struct zone *zone)
{
return 0;
}
static inline void zone_set_nid(struct zone *zone, int nid) {}
#endif
extern int movable_zone;
static inline int is_highmem_idx(enum zone_type idx)
{
#ifdef CONFIG_HIGHMEM
return (idx == ZONE_HIGHMEM ||
(idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM));
#else
return 0;
#endif
}
#ifdef CONFIG_ZONE_DMA
bool has_managed_dma(void);
#else
static inline bool has_managed_dma(void)
{
return false;
}
#endif
/**
* is_highmem - helper function to quickly check if a struct zone is a
* highmem zone or not. This is an attempt to keep references
* to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
* @zone: pointer to struct zone variable
* Return: 1 for a highmem zone, 0 otherwise
*/
static inline int is_highmem(struct zone *zone)
{
#ifdef CONFIG_HIGHMEM
return is_highmem_idx(zone_idx(zone));
#else
return 0;
#endif
}
/* These two functions are used to setup the per zone pages min values */
struct ctl_table;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *,
loff_t *);
int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
size_t *, loff_t *);
int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int,
void *, size_t *, loff_t *);
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
void *, size_t *, loff_t *);
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
void *, size_t *, loff_t *);
int numa_zonelist_order_handler(struct ctl_table *, int,
void *, size_t *, loff_t *);
extern int percpu_pagelist_high_fraction;
extern char numa_zonelist_order[];
#define NUMA_ZONELIST_ORDER_LEN 16
#ifndef CONFIG_NUMA
extern struct pglist_data contig_page_data;
static inline struct pglist_data *NODE_DATA(int nid)
{
return &contig_page_data;
}
#define NODE_MEM_MAP(nid) mem_map
#else /* CONFIG_NUMA */
#include <asm/mmzone.h>
#endif /* !CONFIG_NUMA */
extern struct pglist_data *first_online_pgdat(void);
extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
extern struct zone *next_zone(struct zone *zone);
/**
* for_each_online_pgdat - helper macro to iterate over all online nodes
* @pgdat: pointer to a pg_data_t variable
*/
#define for_each_online_pgdat(pgdat) \
for (pgdat = first_online_pgdat(); \
pgdat; \
pgdat = next_online_pgdat(pgdat))
/**
* for_each_zone - helper macro to iterate over all memory zones
* @zone: pointer to struct zone variable
*
* The user only needs to declare the zone variable, for_each_zone
* fills it in.
*/
#define for_each_zone(zone) \
for (zone = (first_online_pgdat())->node_zones; \
zone; \
zone = next_zone(zone))
#define for_each_populated_zone(zone) \
for (zone = (first_online_pgdat())->node_zones; \
zone; \
zone = next_zone(zone)) \
if (!populated_zone(zone)) \
; /* do nothing */ \
else
static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
return zoneref->zone;
}
static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
return zoneref->zone_idx;
}
static inline int zonelist_node_idx(struct zoneref *zoneref)
{
return zone_to_nid(zoneref->zone);
}
struct zoneref *__next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes);
/**
* next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
* @z: The cursor used as a starting point for the search
* @highest_zoneidx: The zone index of the highest zone to return
* @nodes: An optional nodemask to filter the zonelist with
*
* This function returns the next zone at or below a given zone index that is
* within the allowed nodemask using a cursor as the starting point for the
* search. The zoneref returned is a cursor that represents the current zone
* being examined. It should be advanced by one before calling
* next_zones_zonelist again.
*
* Return: the next zone at or below highest_zoneidx within the allowed
* nodemask using a cursor within a zonelist as a starting point
*/
static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes)
{
if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
return z;
return __next_zones_zonelist(z, highest_zoneidx, nodes);
}
/**
* first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
* @zonelist: The zonelist to search for a suitable zone
* @highest_zoneidx: The zone index of the highest zone to return
* @nodes: An optional nodemask to filter the zonelist with
*
* This function returns the first zone at or below a given zone index that is
* within the allowed nodemask. The zoneref returned is a cursor that can be
* used to iterate the zonelist with next_zones_zonelist by advancing it by
* one before calling.
*
* When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
* never NULL). This may happen either genuinely, or due to concurrent nodemask
* update due to cpuset modification.
*
* Return: Zoneref pointer for the first suitable zone found
*/
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
enum zone_type highest_zoneidx,
nodemask_t *nodes)
{
return next_zones_zonelist(zonelist->_zonerefs,
highest_zoneidx, nodes);
}
/**
* for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
* @zone: The current zone in the iterator
* @z: The current pointer within zonelist->_zonerefs being iterated
* @zlist: The zonelist being iterated
* @highidx: The zone index of the highest zone to return
* @nodemask: Nodemask allowed by the allocator
*
* This iterator iterates though all zones at or below a given zone index and
* within a given nodemask
*/
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \
zone; \
z = next_zones_zonelist(++z, highidx, nodemask), \
zone = zonelist_zone(z))
#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
for (zone = z->zone; \
zone; \
z = next_zones_zonelist(++z, highidx, nodemask), \
zone = zonelist_zone(z))
/**
* for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
* @zone: The current zone in the iterator
* @z: The current pointer within zonelist->zones being iterated
* @zlist: The zonelist being iterated
* @highidx: The zone index of the highest zone to return
*
* This iterator iterates though all zones at or below a given zone index.
*/
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#endif
#ifdef CONFIG_FLATMEM
#define pfn_to_nid(pfn) (0)
#endif
#ifdef CONFIG_SPARSEMEM
/*
* PA_SECTION_SHIFT physical address to/from section number
* PFN_SECTION_SHIFT pfn to/from section number
*/
#define PA_SECTION_SHIFT (SECTION_SIZE_BITS)
#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)
#define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT)
#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1))
#define SECTION_BLOCKFLAGS_BITS \
((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_ORDER exceeds SECTION_SIZE
#endif
static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
return pfn >> PFN_SECTION_SHIFT;
}
static inline unsigned long section_nr_to_pfn(unsigned long sec)
{
return sec << PFN_SECTION_SHIFT;
}
#define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
#define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
#define SUBSECTION_SHIFT 21
#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT)
#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
#define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))
#if SUBSECTION_SHIFT > SECTION_SIZE_BITS
#error Subsection size exceeds section size
#else
#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
#endif
#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
struct mem_section_usage {
#ifdef CONFIG_SPARSEMEM_VMEMMAP
DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
/* See declaration of similar field in struct zone */
unsigned long pageblock_flags[0];
};
void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
struct page;
struct page_ext;
struct mem_section {
/*
* This is, logically, a pointer to an array of struct
* pages. However, it is stored with some other magic.
* (see sparse.c::sparse_init_one_section())
*
* Additionally during early boot we encode node id of
* the location of the section here to guide allocation.
* (see sparse.c::memory_present())
*
* Making it a UL at least makes someone do a cast
* before using it wrong.
*/
unsigned long section_mem_map;
struct mem_section_usage *usage;
#ifdef CONFIG_PAGE_EXTENSION
/*
* If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
* section. (see page_ext.h about this.)
*/
struct page_ext *page_ext;
unsigned long pad;
#endif
/*
* WARNING: mem_section must be a power-of-2 in size for the
* calculation and use of SECTION_ROOT_MASK to make sense.
*/
};
#ifdef CONFIG_SPARSEMEM_EXTREME
#define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))
#else
#define SECTIONS_PER_ROOT 1
#endif
#define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)
#define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)
#ifdef CONFIG_SPARSEMEM_EXTREME
extern struct mem_section **mem_section;
#else
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif
static inline unsigned long *section_to_usemap(struct mem_section *ms)
{
return ms->usage->pageblock_flags;
}
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
unsigned long root = SECTION_NR_TO_ROOT(nr);
if (unlikely(root >= NR_SECTION_ROOTS))
return NULL;
#ifdef CONFIG_SPARSEMEM_EXTREME
if (!mem_section || !mem_section[root])
return NULL;
#endif
return &mem_section[root][nr & SECTION_ROOT_MASK];
}
extern size_t mem_section_usage_size(void);
/*
* We use the lower bits of the mem_map pointer to store
* a little bit of information. The pointer is calculated
* as mem_map - section_nr_to_pfn(pnum). The result is
* aligned to the minimum alignment of the two values:
* 1. All mem_map arrays are page-aligned.
* 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
* lowest bits. PFN_SECTION_SHIFT is arch-specific
* (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
* worst combination is powerpc with 256k pages,
* which results in PFN_SECTION_SHIFT equal 6.
* To sum it up, at least 6 bits are available.
*/
#define SECTION_MARKED_PRESENT (1UL<<0)
#define SECTION_HAS_MEM_MAP (1UL<<1)
#define SECTION_IS_ONLINE (1UL<<2)
#define SECTION_IS_EARLY (1UL<<3)
#define SECTION_TAINT_ZONE_DEVICE (1UL<<4)
#define SECTION_MAP_LAST_BIT (1UL<<5)
#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
#define SECTION_NID_SHIFT 6
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
unsigned long map = section->section_mem_map;
map &= SECTION_MAP_MASK;
return (struct page *)map;
}
static inline int present_section(struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
}
static inline int present_section_nr(unsigned long nr)
{
return present_section(__nr_to_section(nr));
}
static inline int valid_section(struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
}
static inline int early_section(struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_IS_EARLY));
}
static inline int valid_section_nr(unsigned long nr)
{
return valid_section(__nr_to_section(nr));
}
static inline int online_section(struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_IS_ONLINE));
}
static inline int online_device_section(struct mem_section *section)
{
unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;
return section && ((section->section_mem_map & flags) == flags);
}
static inline int online_section_nr(unsigned long nr)
{
return online_section(__nr_to_section(nr));
}
#ifdef CONFIG_MEMORY_HOTPLUG
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
#endif
static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
return __nr_to_section(pfn_to_section_nr(pfn));
}
extern unsigned long __highest_present_section_nr;
static inline int subsection_map_index(unsigned long pfn)
{
return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION;
}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
int idx = subsection_map_index(pfn);
return test_bit(idx, ms->usage->subsection_map);
}
#else
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
return 1;
}
#endif
#ifndef CONFIG_HAVE_ARCH_PFN_VALID
/**
* pfn_valid - check if there is a valid memory map entry for a PFN
* @pfn: the page frame number to check
*
* Check if there is a valid memory map entry aka struct page for the @pfn.
* Note, that availability of the memory map entry does not imply that
* there is actual usable memory at that @pfn. The struct page may
* represent a hole or an unusable page frame.
*
* Return: 1 for PFNs that have memory map entries and 0 otherwise
*/
static inline int pfn_valid(unsigned long pfn)
{
struct mem_section *ms;
/*
* Ensure the upper PAGE_SHIFT bits are clear in the
* pfn. Else it might lead to false positives when
* some of the upper bits are set, but the lower bits
* match a valid pfn.
*/
if (PHYS_PFN(PFN_PHYS(pfn)) != pfn)
return 0;
if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
return 0;
ms = __nr_to_section(pfn_to_section_nr(pfn));
if (!valid_section(ms))
return 0;
/*
* Traditionally early sections always returned pfn_valid() for
* the entire section-sized span.
*/
return early_section(ms) || pfn_section_valid(ms, pfn);
}
#endif
static inline int pfn_in_present_section(unsigned long pfn)
{
if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
return 0;
return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
}
static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
while (++section_nr <= __highest_present_section_nr) {
if (present_section_nr(section_nr))
return section_nr;
}
return -1;
}
/*
* These are _only_ used during initialisation, therefore they
* can use __initdata ... They could have names to indicate
* this restriction.
*/
#ifdef CONFIG_NUMA
#define pfn_to_nid(pfn) \
({ \
unsigned long __pfn_to_nid_pfn = (pfn); \
page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \
})
#else
#define pfn_to_nid(pfn) (0)
#endif
void sparse_init(void);
#else
#define sparse_init() do {} while (0)
#define sparse_index_init(_sec, _nid) do {} while (0)
#define pfn_in_present_section pfn_valid
#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
#endif /* CONFIG_SPARSEMEM */
#endif /* !__GENERATING_BOUNDS.H */
#endif /* !__ASSEMBLY__ */
#endif /* _LINUX_MMZONE_H */
// SPDX-License-Identifier: GPL-2.0
#include "ctree.h"
#include "delalloc-space.h"
#include "block-rsv.h"
#include "btrfs_inode.h"
#include "space-info.h"
#include "transaction.h"
#include "qgroup.h"
#include "block-group.h"
/*
* HOW DOES THIS WORK
*
* There are two stages to data reservations, one for data and one for metadata
* to handle the new extents and checksums generated by writing data.
*
*
* DATA RESERVATION
* The general flow of the data reservation is as follows
*
* -> Reserve
* We call into btrfs_reserve_data_bytes() for the user request bytes that
* they wish to write. We make this reservation and add it to
* space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree
* for the range and carry on if this is buffered, or follow up trying to
* make a real allocation if we are pre-allocating or doing O_DIRECT.
*
* -> Use
* At writepages()/prealloc/O_DIRECT time we will call into
* btrfs_reserve_extent() for some part or all of this range of bytes. We
* will make the allocation and subtract space_info->bytes_may_use by the
* original requested length and increase the space_info->bytes_reserved by
* the allocated length. This distinction is important because compression
* may allocate a smaller on disk extent than we previously reserved.
*
* -> Allocation
* finish_ordered_io() will insert the new file extent item for this range,
* and then add a delayed ref update for the extent tree. Once that delayed
* ref is written the extent size is subtracted from
* space_info->bytes_reserved and added to space_info->bytes_used.
*
* Error handling
*
* -> By the reservation maker
* This is the simplest case, we haven't completed our operation and we know
* how much we reserved, we can simply call
* btrfs_free_reserved_data_space*() and it will be removed from
* space_info->bytes_may_use.
*
* -> After the reservation has been made, but before cow_file_range()
* This is specifically for the delalloc case. You must clear
* EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will
* be subtracted from space_info->bytes_may_use.
*
* METADATA RESERVATION
* The general metadata reservation lifetimes are discussed elsewhere, this
* will just focus on how it is used for delalloc space.
*
* We keep track of two things on a per inode bases
*
* ->outstanding_extents
* This is the number of file extent items we'll need to handle all of the
* outstanding DELALLOC space we have in this inode. We limit the maximum
* size of an extent, so a large contiguous dirty area may require more than
* one outstanding_extent, which is why count_max_extents() is used to
* determine how many outstanding_extents get added.
*
* ->csum_bytes
* This is essentially how many dirty bytes we have for this inode, so we
* can calculate the number of checksum items we would have to add in order
* to checksum our outstanding data.
*
* We keep a per-inode block_rsv in order to make it easier to keep track of
* our reservation. We use btrfs_calculate_inode_block_rsv_size() to
* calculate the current theoretical maximum reservation we would need for the
* metadata for this inode. We call this and then adjust our reservation as
* necessary, either by attempting to reserve more space, or freeing up excess
* space.
*
* OUTSTANDING_EXTENTS HANDLING
*
* ->outstanding_extents is used for keeping track of how many extents we will
* need to use for this inode, and it will fluctuate depending on where you are
* in the life cycle of the dirty data. Consider the following normal case for
* a completely clean inode, with a num_bytes < our maximum allowed extent size
*
* -> reserve
* ->outstanding_extents += 1 (current value is 1)
*
* -> set_delalloc
* ->outstanding_extents += 1 (current value is 2)
*
* -> btrfs_delalloc_release_extents()
* ->outstanding_extents -= 1 (current value is 1)
*
* We must call this once we are done, as we hold our reservation for the
* duration of our operation, and then assume set_delalloc will update the
* counter appropriately.
*
* -> add ordered extent
* ->outstanding_extents += 1 (current value is 2)
*
* -> btrfs_clear_delalloc_extent
* ->outstanding_extents -= 1 (current value is 1)
*
* -> finish_ordered_io/btrfs_remove_ordered_extent
* ->outstanding_extents -= 1 (current value is 0)
*
* Each stage is responsible for their own accounting of the extent, thus
* making error handling and cleanup easier.
*/
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
/* Make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, fs_info->sectorsize);
if (btrfs_is_free_space_inode(inode))
flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
return btrfs_reserve_data_bytes(fs_info, bytes, flush);
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
/* align the range */
len = round_up(start + len, fs_info->sectorsize) -
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
ret = btrfs_alloc_data_chunk_ondemand(inode, len);
if (ret < 0)
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
if (ret < 0) {
btrfs_free_reserved_data_space_noquota(fs_info, len);
extent_changeset_free(*reserved);
*reserved = NULL;
} else {
ret = 0;
}
return ret;
}
/*
* Called if we need to clear a data reservation for this inode
* Normally in a error case.
*
* This one will *NOT* use accurate qgroup reserved space API, just for case
* which we can't sleep and is sure it won't affect qgroup reserved space.
* Like clear_bit_hook().
*/
void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
u64 len)
{
struct btrfs_space_info *data_sinfo;
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
}
/*
* Called if we need to clear a data reservation for this inode
* Normally in a error case.
*
* This one will handle the per-inode data rsv map for accurate reserved
* space framework.
*/
void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
/* Make sure the range is aligned to sectorsize */
len = round_up(start + len, fs_info->sectorsize) -
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
btrfs_free_reserved_data_space_noquota(fs_info, len);
btrfs_qgroup_free_data(inode, reserved, start, len);
}
/**
* Release any excessive reservation
*
* @inode: the inode we need to release from
* @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup
* meta reservation needs to know if we are freeing qgroup
* reservation or just converting it into per-trans. Normally
* @qgroup_free is true for error handling, and false for normal
* release.
*
* This is the same as btrfs_block_rsv_release, except that it handles the
* tracepoint for the reservation.
*/
static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 released = 0;
u64 qgroup_to_release = 0;
/*
* Since we statically set the block_rsv->size we just want to say we
* are releasing 0 bytes, and then we'll just get the reservation over
* the size free'd.
*/
released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
&qgroup_to_release);
if (released > 0)
trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), released, 0);
if (qgroup_free)
btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
else
btrfs_qgroup_convert_reserved_meta(inode->root,
qgroup_to_release);
}
static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
struct btrfs_inode *inode)
{
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 reserve_size = 0;
u64 qgroup_rsv_size = 0;
u64 csum_leaves;
unsigned outstanding_extents;
lockdep_assert_held(&inode->lock);
outstanding_extents = inode->outstanding_extents;
/*
* Insert size for the number of outstanding extents, 1 normal size for
* updating the inode.
*/
if (outstanding_extents) {
reserve_size = btrfs_calc_insert_metadata_size(fs_info,
outstanding_extents);
reserve_size += btrfs_calc_metadata_size(fs_info, 1);
}
csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
inode->csum_bytes);
reserve_size += btrfs_calc_insert_metadata_size(fs_info,
csum_leaves);
/*
* For qgroup rsv, the calculation is very simple:
* account one nodesize for each outstanding extent
*
* This is overestimating in most cases.
*/
qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
spin_lock(&block_rsv->lock);
block_rsv->size = reserve_size;
block_rsv->qgroup_rsv_size = qgroup_rsv_size;
spin_unlock(&block_rsv->lock);
}
static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
u64 num_bytes, u64 *meta_reserve,
u64 *qgroup_reserve)
{
u64 nr_extents = count_max_extents(num_bytes);
u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
nr_extents + csum_leaves);
/*
* finish_ordered_io has to update the inode, so add the space required
* for an inode update.
*/
*meta_reserve += inode_update;
*qgroup_reserve = nr_extents * fs_info->nodesize;
}
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 meta_reserve, qgroup_reserve;
unsigned nr_extents;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
/*
* If we are a free space inode we need to not flush since we will be in
* the middle of a transaction commit. We also don't need the delalloc
* mutex since we won't race with anybody. We need this mostly to make
* lockdep shut its filthy mouth.
*
* If we have a transaction open (can happen if we call truncate_block
* from truncate), then we need FLUSH_LIMIT so we don't deadlock.
*/
if (btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
} else {
if (current->journal_info)
flush = BTRFS_RESERVE_FLUSH_LIMIT;
if (btrfs_transaction_in_commit(fs_info)) schedule_timeout(1);
}
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
/*
* We always want to do it this way, every other way is wrong and ends
* in tears. Pre-reserving the amount we are going to add will always
* be the right way, because otherwise if we have enough parallelism we
* could end up with thousands of inodes all holding little bits of
* reservations they were able to make previously and the only way to
* reclaim that space is to ENOSPC out the operations and clear
* everything out and try again, which is bad. This way we just
* over-reserve slightly, and clean up the mess when we are done.
*/
calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
&qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
if (ret)
return ret;
ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
if (ret) {
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
return ret;
}
/*
* Now we need to update our outstanding extents and csum bytes _first_
* and then add the reservation to the block_rsv. This keeps us from
* racing with an ordered completion or some such that would think it
* needs to free the reservation we just made.
*/
spin_lock(&inode->lock);
nr_extents = count_max_extents(num_bytes);
btrfs_mod_outstanding_extents(inode, nr_extents);
inode->csum_bytes += num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
/* Now we can safely add our space to our block rsv */
btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), meta_reserve, 1);
spin_lock(&block_rsv->lock);
block_rsv->qgroup_rsv_reserved += qgroup_reserve;
spin_unlock(&block_rsv->lock);
return 0;
}
/**
* Release a metadata reservation for an inode
*
* @inode: the inode to release the reservation for.
* @num_bytes: the number of bytes we are releasing.
* @qgroup_free: free qgroup reservation or convert it to per-trans reservation
*
* This will release the metadata reservation for an inode. This can be called
* once we complete IO for a given set of bytes to release their metadata
* reservations, or on error for the same reason.
*/
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
spin_lock(&inode->lock);
inode->csum_bytes -= num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
if (btrfs_is_testing(fs_info))
return;
btrfs_inode_rsv_release(inode, qgroup_free);
}
/**
* btrfs_delalloc_release_extents - release our outstanding_extents
* @inode: the inode to balance the reservation for.
* @num_bytes: the number of bytes we originally reserved with
*
* When we reserve space we increase outstanding_extents for the extents we may
* add. Once we've set the range as delalloc or created our ordered extents we
* have outstanding_extents to track the real usage, so we use this to free our
* temporarily tracked outstanding_extents. This _must_ be used in conjunction
* with btrfs_delalloc_reserve_metadata.
*/
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned num_extents;
spin_lock(&inode->lock);
num_extents = count_max_extents(num_bytes);
btrfs_mod_outstanding_extents(inode, -num_extents);
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
if (btrfs_is_testing(fs_info))
return;
btrfs_inode_rsv_release(inode, true);
}
/**
* btrfs_delalloc_reserve_space - reserve data and metadata space for
* delalloc
* @inode: inode we're writing to
* @start: start range we are writing to
* @len: how long the range we are writing to
* @reserved: mandatory parameter, record actually reserved qgroup ranges of
* current reservation.
*
* This will do the following things
*
* - reserve space in data space info for num bytes
* and reserve precious corresponding qgroup space
* (Done in check_data_free_space)
*
* - reserve space for metadata space, based on the number of outstanding
* extents and how much csums will be needed
* also reserve metadata space in a per root over-reserve method.
* - add to the inodes->delalloc_bytes
* - add it to the fs_info's delalloc inodes list.
* (Above 3 all done in delalloc_reserve_metadata)
*
* Return 0 for success
* Return <0 for error(-ENOSPC or -EQUOT)
*/
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
{
int ret;
ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
ret = btrfs_delalloc_reserve_metadata(inode, len); if (ret < 0) { btrfs_free_reserved_data_space(inode, *reserved, start, len);
extent_changeset_free(*reserved);
*reserved = NULL;
}
return ret;
}
/**
* Release data and metadata space for delalloc
*
* @inode: inode we're releasing space for
* @reserved: list of changed/reserved ranges
* @start: start position of the space already reserved
* @len: length of the space already reserved
* @qgroup_free: should qgroup reserved-space also be freed
*
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
* list if there are no delalloc bytes left.
* Also it will handle the qgroup reserved space.
*/
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free)
{
btrfs_delalloc_release_metadata(inode, len, qgroup_free);
btrfs_free_reserved_data_space(inode, reserved, start, len);
}
// SPDX-License-Identifier: GPL-2.0
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/tcp.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <net/inetpeer.h>
#include <net/tcp.h>
void tcp_fastopen_init_key_once(struct net *net)
{
u8 key[TCP_FASTOPEN_KEY_LENGTH];
struct tcp_fastopen_context *ctxt;
rcu_read_lock();
ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
if (ctxt) {
rcu_read_unlock();
return;
}
rcu_read_unlock();
/* tcp_fastopen_reset_cipher publishes the new context
* atomically, so we allow this race happening here.
*
* All call sites of tcp_fastopen_cookie_gen also check
* for a valid cookie, so this is an acceptable risk.
*/
get_random_bytes(key, sizeof(key));
tcp_fastopen_reset_cipher(net, NULL, key, NULL);
}
static void tcp_fastopen_ctx_free(struct rcu_head *head)
{
struct tcp_fastopen_context *ctx =
container_of(head, struct tcp_fastopen_context, rcu);
kfree_sensitive(ctx);
}
void tcp_fastopen_destroy_cipher(struct sock *sk)
{
struct tcp_fastopen_context *ctx;
ctx = rcu_dereference_protected(
inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1);
if (ctx)
call_rcu(&ctx->rcu, tcp_fastopen_ctx_free);
}
void tcp_fastopen_ctx_destroy(struct net *net)
{
struct tcp_fastopen_context *ctxt;
ctxt = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, NULL);
if (ctxt)
call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
}
int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
void *primary_key, void *backup_key)
{
struct tcp_fastopen_context *ctx, *octx;
struct fastopen_queue *q;
int err = 0;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx) {
err = -ENOMEM;
goto out;
}
ctx->key[0].key[0] = get_unaligned_le64(primary_key);
ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8);
if (backup_key) {
ctx->key[1].key[0] = get_unaligned_le64(backup_key);
ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8);
ctx->num = 2;
} else {
ctx->num = 1;
}
if (sk) {
q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
octx = xchg((__force struct tcp_fastopen_context **)&q->ctx, ctx);
} else {
octx = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, ctx);
}
if (octx)
call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
out:
return err;
}
int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk,
u64 *key)
{
struct tcp_fastopen_context *ctx;
int n_keys = 0, i;
rcu_read_lock();
if (icsk)
ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
else
ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
if (ctx) {
n_keys = tcp_fastopen_context_len(ctx);
for (i = 0; i < n_keys; i++) {
put_unaligned_le64(ctx->key[i].key[0], key + (i * 2));
put_unaligned_le64(ctx->key[i].key[1], key + (i * 2) + 1);
}
}
rcu_read_unlock();
return n_keys;
}
static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
struct sk_buff *syn,
const siphash_key_t *key,
struct tcp_fastopen_cookie *foc)
{
BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64));
if (req->rsk_ops->family == AF_INET) {
const struct iphdr *iph = ip_hdr(syn);
foc->val[0] = cpu_to_le64(siphash(&iph->saddr,
sizeof(iph->saddr) +
sizeof(iph->daddr),
key));
foc->len = TCP_FASTOPEN_COOKIE_SIZE;
return true;
}
#if IS_ENABLED(CONFIG_IPV6)
if (req->rsk_ops->family == AF_INET6) {
const struct ipv6hdr *ip6h = ipv6_hdr(syn);
foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr,
sizeof(ip6h->saddr) +
sizeof(ip6h->daddr),
key));
foc->len = TCP_FASTOPEN_COOKIE_SIZE;
return true;
}
#endif
return false;
}
/* Generate the fastopen cookie by applying SipHash to both the source and
* destination addresses.
*/
static void tcp_fastopen_cookie_gen(struct sock *sk,
struct request_sock *req,
struct sk_buff *syn,
struct tcp_fastopen_cookie *foc)
{
struct tcp_fastopen_context *ctx;
rcu_read_lock();
ctx = tcp_fastopen_get_ctx(sk);
if (ctx)
__tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc);
rcu_read_unlock();
}
/* If an incoming SYN or SYNACK frame contains a payload and/or FIN,
* queue this additional data / FIN.
*/
void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
return;
skb = skb_clone(skb, GFP_ATOMIC);
if (!skb)
return;
skb_dst_drop(skb);
/* segs_in has been initialized to 1 in tcp_create_openreq_child().
* Hence, reset segs_in to 0 before calling tcp_segs_in()
* to avoid double counting. Also, tcp_segs_in() expects
* skb->len to include the tcp_hdrlen. Hence, it should
* be called before __skb_pull().
*/
tp->segs_in = 0;
tcp_segs_in(tp, skb);
__skb_pull(skb, tcp_hdrlen(skb));
sk_forced_mem_schedule(sk, skb->truesize);
skb_set_owner_r(skb, sk);
TCP_SKB_CB(skb)->seq++;
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
__skb_queue_tail(&sk->sk_receive_queue, skb);
tp->syn_data_acked = 1;
/* u64_stats_update_begin(&tp->syncp) not needed here,
* as we certainly are not changing upper 32bit value (0)
*/
tp->bytes_received = skb->len;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
tcp_fin(sk);
}
/* returns 0 - no key match, 1 for primary, 2 for backup */
static int tcp_fastopen_cookie_gen_check(struct sock *sk,
struct request_sock *req,
struct sk_buff *syn,
struct tcp_fastopen_cookie *orig,
struct tcp_fastopen_cookie *valid_foc)
{
struct tcp_fastopen_cookie search_foc = { .len = -1 };
struct tcp_fastopen_cookie *foc = valid_foc;
struct tcp_fastopen_context *ctx;
int i, ret = 0;
rcu_read_lock();
ctx = tcp_fastopen_get_ctx(sk);
if (!ctx)
goto out;
for (i = 0; i < tcp_fastopen_context_len(ctx); i++) {
__tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc);
if (tcp_fastopen_cookie_match(foc, orig)) {
ret = i + 1;
goto out;
}
foc = &search_foc;
}
out:
rcu_read_unlock();
return ret;
}
static struct sock *tcp_fastopen_create_child(struct sock *sk,
struct sk_buff *skb,
struct request_sock *req)
{
struct tcp_sock *tp;
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
struct sock *child;
bool own_req;
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
NULL, &own_req);
if (!child)
return NULL;
spin_lock(&queue->fastopenq.lock);
queue->fastopenq.qlen++;
spin_unlock(&queue->fastopenq.lock);
/* Initialize the child socket. Have to fix some values to take
* into account the child is a Fast Open socket and is created
* only out of the bits carried in the SYN packet.
*/
tp = tcp_sk(child);
rcu_assign_pointer(tp->fastopen_rsk, req);
tcp_rsk(req)->tfo_listener = true;
/* RFC1323: The window in SYN & SYN/ACK segments is never
* scaled. So correct it appropriately.
*/
tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
tp->max_window = tp->snd_wnd;
/* Activate the retrans timer so that SYNACK can be retransmitted.
* The request socket is not added to the ehash
* because it's been added to the accept queue directly.
*/
inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT, TCP_RTO_MAX);
refcount_set(&req->rsk_refcnt, 2);
/* Now finish processing the fastopen child socket. */
tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
tcp_fastopen_add_skb(child, skb);
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
tp->rcv_wup = tp->rcv_nxt;
/* tcp_conn_request() is sending the SYNACK,
* and queues the child into listener accept queue.
*/
return child;
}
static bool tcp_fastopen_queue_check(struct sock *sk)
{
struct fastopen_queue *fastopenq;
/* Make sure the listener has enabled fastopen, and we don't
* exceed the max # of pending TFO requests allowed before trying
* to validating the cookie in order to avoid burning CPU cycles
* unnecessarily.
*
* XXX (TFO) - The implication of checking the max_qlen before
* processing a cookie request is that clients can't differentiate
* between qlen overflow causing Fast Open to be disabled
* temporarily vs a server not supporting Fast Open at all.
*/
fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
if (fastopenq->max_qlen == 0)
return false;
if (fastopenq->qlen >= fastopenq->max_qlen) {
struct request_sock *req1;
spin_lock(&fastopenq->lock);
req1 = fastopenq->rskq_rst_head;
if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
spin_unlock(&fastopenq->lock);
return false;
}
fastopenq->rskq_rst_head = req1->dl_next;
fastopenq->qlen--;
spin_unlock(&fastopenq->lock);
reqsk_put(req1);
}
return true;
}
static bool tcp_fastopen_no_cookie(const struct sock *sk,
const struct dst_entry *dst,
int flag)
{
return (sock_net(sk)->ipv4.sysctl_tcp_fastopen & flag) ||
tcp_sk(sk)->fastopen_no_cookie ||
(dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE));
}
/* Returns true if we should perform Fast Open on the SYN. The cookie (foc)
* may be updated and return the client in the SYN-ACK later. E.g., Fast Open
* cookie request (foc->len == 0).
*/
struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
const struct dst_entry *dst)
{
bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
struct tcp_fastopen_cookie valid_foc = { .len = -1 };
struct sock *child;
int ret = 0;
if (foc->len == 0) /* Client requests a cookie */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
if (!((tcp_fastopen & TFO_SERVER_ENABLE) &&
(syn_data || foc->len >= 0) &&
tcp_fastopen_queue_check(sk))) {
foc->len = -1;
return NULL;
}
if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
goto fastopen;
if (foc->len == 0) {
/* Client requests a cookie. */
tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc);
} else if (foc->len > 0) {
ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc,
&valid_foc);
if (!ret) {
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
} else {
/* Cookie is valid. Create a (full) child socket to
* accept the data in SYN before returning a SYN-ACK to
* ack the data. If we fail to create the socket, fall
* back and ack the ISN only but includes the same
* cookie.
*
* Note: Data-less SYN with valid cookie is allowed to
* send data in SYN_RECV state.
*/
fastopen:
child = tcp_fastopen_create_child(sk, skb, req);
if (child) {
if (ret == 2) {
valid_foc.exp = foc->exp;
*foc = valid_foc;
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVEALTKEY);
} else {
foc->len = -1;
}
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVE);
return child;
}
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
}
}
valid_foc.exp = foc->exp;
*foc = valid_foc;
return NULL;
}
bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie)
{
const struct dst_entry *dst;
tcp_fastopen_cache_get(sk, mss, cookie);
/* Firewall blackhole issue check */
if (tcp_fastopen_active_should_disable(sk)) {
cookie->len = -1;
return false;
}
dst = __sk_dst_get(sk);
if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) {
cookie->len = -1;
return true;
}
if (cookie->len > 0)
return true;
tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE;
return false;
}
/* This function checks if we want to defer sending SYN until the first
* write(). We defer under the following conditions:
* 1. fastopen_connect sockopt is set
* 2. we have a valid cookie
* Return value: return true if we want to defer until application writes data
* return false if we want to send out SYN immediately
*/
bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
{
struct tcp_fastopen_cookie cookie = { .len = 0 };
struct tcp_sock *tp = tcp_sk(sk);
u16 mss;
if (tp->fastopen_connect && !tp->fastopen_req) { if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) { inet_sk(sk)->defer_connect = 1;
return true;
}
/* Alloc fastopen_req in order for FO option to be included
* in SYN
*/
tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
sk->sk_allocation);
if (tp->fastopen_req)
tp->fastopen_req->cookie = cookie;
else
*err = -ENOBUFS;
}
return false;
}
EXPORT_SYMBOL(tcp_fastopen_defer_connect);
/*
* The following code block is to deal with middle box issues with TFO:
* Middlebox firewall issues can potentially cause server's data being
* blackholed after a successful 3WHS using TFO.
* The proposed solution is to disable active TFO globally under the
* following circumstances:
* 1. client side TFO socket receives out of order FIN
* 2. client side TFO socket receives out of order RST
* 3. client side TFO socket has timed out three times consecutively during
* or after handshake
* We disable active side TFO globally for 1hr at first. Then if it
* happens again, we disable it for 2h, then 4h, 8h, ...
* And we reset the timeout back to 1hr when we see a successful active
* TFO connection with data exchanges.
*/
/* Disable active TFO and record current jiffies and
* tfo_active_disable_times
*/
void tcp_fastopen_active_disable(struct sock *sk)
{
struct net *net = sock_net(sk);
if (!sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout)
return;
/* Paired with READ_ONCE() in tcp_fastopen_active_should_disable() */
WRITE_ONCE(net->ipv4.tfo_active_disable_stamp, jiffies);
/* Paired with smp_rmb() in tcp_fastopen_active_should_disable().
* We want net->ipv4.tfo_active_disable_stamp to be updated first.
*/
smp_mb__before_atomic();
atomic_inc(&net->ipv4.tfo_active_disable_times);
NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE);
}
/* Calculate timeout for tfo active disable
* Return true if we are still in the active TFO disable period
* Return false if timeout already expired and we should use active TFO
*/
bool tcp_fastopen_active_should_disable(struct sock *sk)
{
unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout;
unsigned long timeout;
int tfo_da_times;
int multiplier;
if (!tfo_bh_timeout)
return false;
tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times);
if (!tfo_da_times)
return false;
/* Paired with smp_mb__before_atomic() in tcp_fastopen_active_disable() */
smp_rmb();
/* Limit timeout to max: 2^6 * initial timeout */
multiplier = 1 << min(tfo_da_times - 1, 6);
/* Paired with the WRITE_ONCE() in tcp_fastopen_active_disable(). */
timeout = READ_ONCE(sock_net(sk)->ipv4.tfo_active_disable_stamp) +
multiplier * tfo_bh_timeout * HZ;
if (time_before(jiffies, timeout))
return true;
/* Mark check bit so we can check for successful active TFO
* condition and reset tfo_active_disable_times
*/
tcp_sk(sk)->syn_fastopen_ch = 1;
return false;
}
/* Disable active TFO if FIN is the only packet in the ofo queue
* and no data is received.
* Also check if we can reset tfo_active_disable_times if data is
* received successfully on a marked active TFO sockets opened on
* a non-loopback interface
*/
void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst;
struct sk_buff *skb;
if (!tp->syn_fastopen)
return;
if (!tp->data_segs_in) { skb = skb_rb_first(&tp->out_of_order_queue); if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk);
return;
}
}
} else if (tp->syn_fastopen_ch &&
atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
dst = sk_dst_get(sk);
if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
dst_release(dst);
}
}
void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired)
{
u32 timeouts = inet_csk(sk)->icsk_retransmits;
struct tcp_sock *tp = tcp_sk(sk);
/* Broken middle-boxes may black-hole Fast Open connection during or
* even after the handshake. Be extremely conservative and pause
* Fast Open globally after hitting the third consecutive timeout or
* exceeding the configured timeout limit.
*/
if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) &&
(timeouts == 2 || (timeouts < 2 && expired))) {
tcp_fastopen_active_disable(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
}
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. NET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Definitions for the Ethernet handlers.
*
* Version: @(#)eth.h 1.0.4 05/13/93
*
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
*
* Relocated to include/linux where it belongs by Alan Cox
* <gw4pts@gw4pts.ampr.org>
*/
#ifndef _LINUX_ETHERDEVICE_H
#define _LINUX_ETHERDEVICE_H
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/random.h>
#include <linux/crc32.h>
#include <asm/unaligned.h>
#include <asm/bitsperlong.h>
#ifdef __KERNEL__
struct device;
int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
unsigned char *arch_get_platform_mac_address(void);
int nvmem_get_mac_address(struct device *dev, void *addrbuf);
u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len);
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
extern const struct header_ops eth_header_ops;
int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
const void *daddr, const void *saddr, unsigned len);
int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);
int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh,
__be16 type);
void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev,
const unsigned char *haddr);
__be16 eth_header_parse_protocol(const struct sk_buff *skb);
int eth_prepare_mac_addr_change(struct net_device *dev, void *p);
void eth_commit_mac_addr_change(struct net_device *dev, void *p);
int eth_mac_addr(struct net_device *dev, void *p);
int eth_validate_addr(struct net_device *dev);
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
unsigned int rxqs);
#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
#define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count)
struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
unsigned int txqs,
unsigned int rxqs);
#define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1)
struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb);
int eth_gro_complete(struct sk_buff *skb, int nhoff);
/* Reserved Ethernet Addresses per IEEE 802.1Q */
static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) =
{ 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
#define eth_stp_addr eth_reserved_addr_base
/**
* is_link_local_ether_addr - Determine if given Ethernet address is link-local
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Return true if address is link local reserved addr (01:80:c2:00:00:0X) per
* IEEE 802.1Q 8.6.3 Frame filtering.
*
* Please note: addr must be aligned to u16.
*/
static inline bool is_link_local_ether_addr(const u8 *addr)
{
__be16 *a = (__be16 *)addr;
static const __be16 *b = (const __be16 *)eth_reserved_addr_base;
static const __be16 m = cpu_to_be16(0xfff0);
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
return (((*(const u32 *)addr) ^ (*(const u32 *)b)) |
(__force int)((a[2] ^ b[2]) & m)) == 0;
#else
return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0;
#endif
}
/**
* is_zero_ether_addr - Determine if give Ethernet address is all zeros.
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Return true if the address is all zeroes.
*
* Please note: addr must be aligned to u16.
*/
static inline bool is_zero_ether_addr(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
return ((*(const u32 *)addr) | (*(const u16 *)(addr + 4))) == 0;
#else
return (*(const u16 *)(addr + 0) |
*(const u16 *)(addr + 2) |
*(const u16 *)(addr + 4)) == 0;
#endif
}
/**
* is_multicast_ether_addr - Determine if the Ethernet address is a multicast.
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Return true if the address is a multicast address.
* By definition the broadcast address is also a multicast address.
*/
static inline bool is_multicast_ether_addr(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
u32 a = *(const u32 *)addr;
#else
u16 a = *(const u16 *)addr;
#endif
#ifdef __BIG_ENDIAN
return 0x01 & (a >> ((sizeof(a) * 8) - 8));
#else
return 0x01 & a;
#endif
}
static inline bool is_multicast_ether_addr_64bits(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
#ifdef __BIG_ENDIAN
return 0x01 & ((*(const u64 *)addr) >> 56);
#else
return 0x01 & (*(const u64 *)addr);
#endif
#else
return is_multicast_ether_addr(addr);
#endif
}
/**
* is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802).
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Return true if the address is a local address.
*/
static inline bool is_local_ether_addr(const u8 *addr)
{
return 0x02 & addr[0];
}
/**
* is_broadcast_ether_addr - Determine if the Ethernet address is broadcast
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Return true if the address is the broadcast address.
*
* Please note: addr must be aligned to u16.
*/
static inline bool is_broadcast_ether_addr(const u8 *addr)
{
return (*(const u16 *)(addr + 0) &
*(const u16 *)(addr + 2) &
*(const u16 *)(addr + 4)) == 0xffff;
}
/**
* is_unicast_ether_addr - Determine if the Ethernet address is unicast
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Return true if the address is a unicast address.
*/
static inline bool is_unicast_ether_addr(const u8 *addr)
{
return !is_multicast_ether_addr(addr);
}
/**
* is_valid_ether_addr - Determine if the given Ethernet address is valid
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not
* a multicast address, and is not FF:FF:FF:FF:FF:FF.
*
* Return true if the address is valid.
*
* Please note: addr must be aligned to u16.
*/
static inline bool is_valid_ether_addr(const u8 *addr)
{
/* FF:FF:FF:FF:FF:FF is a multicast address so we don't need to
* explicitly check for it here. */
return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr);
}
/**
* eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol
* @proto: Ethertype/length value to be tested
*
* Check that the value from the Ethertype/length field is a valid Ethertype.
*
* Return true if the valid is an 802.3 supported Ethertype.
*/
static inline bool eth_proto_is_802_3(__be16 proto)
{
#ifndef __BIG_ENDIAN
/* if CPU is little endian mask off bits representing LSB */
proto &= htons(0xFF00);
#endif
/* cast both to u16 and compare since LSB can be ignored */
return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN);
}
/**
* eth_random_addr - Generate software assigned random Ethernet address
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Generate a random Ethernet address (MAC) that is not multicast
* and has the local assigned bit set.
*/
static inline void eth_random_addr(u8 *addr)
{
get_random_bytes(addr, ETH_ALEN);
addr[0] &= 0xfe; /* clear multicast bit */
addr[0] |= 0x02; /* set local assignment bit (IEEE802) */
}
#define random_ether_addr(addr) eth_random_addr(addr)
/**
* eth_broadcast_addr - Assign broadcast address
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Assign the broadcast address to the given address array.
*/
static inline void eth_broadcast_addr(u8 *addr)
{
memset(addr, 0xff, ETH_ALEN);
}
/**
* eth_zero_addr - Assign zero address
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Assign the zero address to the given address array.
*/
static inline void eth_zero_addr(u8 *addr)
{
memset(addr, 0x00, ETH_ALEN);
}
/**
* eth_hw_addr_random - Generate software assigned random Ethernet and
* set device flag
* @dev: pointer to net_device structure
*
* Generate a random Ethernet address (MAC) to be used by a net device
* and set addr_assign_type so the state can be read by sysfs and be
* used by userspace.
*/
static inline void eth_hw_addr_random(struct net_device *dev)
{
dev->addr_assign_type = NET_ADDR_RANDOM;
eth_random_addr(dev->dev_addr);
}
/**
* eth_hw_addr_crc - Calculate CRC from netdev_hw_addr
* @ha: pointer to hardware address
*
* Calculate CRC from a hardware address as basis for filter hashes.
*/
static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha)
{
return ether_crc(ETH_ALEN, ha->addr);
}
/**
* ether_addr_copy - Copy an Ethernet address
* @dst: Pointer to a six-byte array Ethernet address destination
* @src: Pointer to a six-byte array Ethernet address source
*
* Please note: dst & src must both be aligned to u16.
*/
static inline void ether_addr_copy(u8 *dst, const u8 *src)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
*(u32 *)dst = *(const u32 *)src;
*(u16 *)(dst + 4) = *(const u16 *)(src + 4);
#else
u16 *a = (u16 *)dst;
const u16 *b = (const u16 *)src;
a[0] = b[0];
a[1] = b[1];
a[2] = b[2];
#endif
}
/**
* eth_hw_addr_set - Assign Ethernet address to a net_device
* @dev: pointer to net_device structure
* @addr: address to assign
*
* Assign given address to the net_device, addr_assign_type is not changed.
*/
static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr)
{
__dev_addr_set(dev, addr, ETH_ALEN);
}
/**
* eth_hw_addr_inherit - Copy dev_addr from another net_device
* @dst: pointer to net_device to copy dev_addr to
* @src: pointer to net_device to copy dev_addr from
*
* Copy the Ethernet address from one net_device to another along with
* the address attributes (addr_assign_type).
*/
static inline void eth_hw_addr_inherit(struct net_device *dst,
struct net_device *src)
{
dst->addr_assign_type = src->addr_assign_type;
ether_addr_copy(dst->dev_addr, src->dev_addr);
}
/**
* ether_addr_equal - Compare two Ethernet addresses
* @addr1: Pointer to a six-byte array containing the Ethernet address
* @addr2: Pointer other six-byte array containing the Ethernet address
*
* Compare two Ethernet addresses, returns true if equal
*
* Please note: addr1 & addr2 must both be aligned to u16.
*/
static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) |
((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4)));
return fold == 0;
#else
const u16 *a = (const u16 *)addr1;
const u16 *b = (const u16 *)addr2;
return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0;
#endif
}
/**
* ether_addr_equal_64bits - Compare two Ethernet addresses
* @addr1: Pointer to an array of 8 bytes
* @addr2: Pointer to an other array of 8 bytes
*
* Compare two Ethernet addresses, returns true if equal, false otherwise.
*
* The function doesn't need any conditional branches and possibly uses
* word memory accesses on CPU allowing cheap unaligned memory reads.
* arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 }
*
* Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits.
*/
static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2);
#ifdef __BIG_ENDIAN
return (fold >> 16) == 0;
#else
return (fold << 16) == 0;
#endif
#else
return ether_addr_equal(addr1, addr2);
#endif
}
/**
* ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses
* @addr1: Pointer to a six-byte array containing the Ethernet address
* @addr2: Pointer other six-byte array containing the Ethernet address
*
* Compare two Ethernet addresses, returns true if equal
*
* Please note: Use only when any Ethernet address may not be u16 aligned.
*/
static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
return ether_addr_equal(addr1, addr2);
#else
return memcmp(addr1, addr2, ETH_ALEN) == 0;
#endif
}
/**
* ether_addr_equal_masked - Compare two Ethernet addresses with a mask
* @addr1: Pointer to a six-byte array containing the 1st Ethernet address
* @addr2: Pointer to a six-byte array containing the 2nd Ethernet address
* @mask: Pointer to a six-byte array containing the Ethernet address bitmask
*
* Compare two Ethernet addresses with a mask, returns true if for every bit
* set in the bitmask the equivalent bits in the ethernet addresses are equal.
* Using a mask with all bits set is a slower ether_addr_equal.
*/
static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2,
const u8 *mask)
{
int i;
for (i = 0; i < ETH_ALEN; i++) {
if ((addr1[i] ^ addr2[i]) & mask[i])
return false;
}
return true;
}
/**
* ether_addr_to_u64 - Convert an Ethernet address into a u64 value.
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Return a u64 value of the address
*/
static inline u64 ether_addr_to_u64(const u8 *addr)
{
u64 u = 0;
int i;
for (i = 0; i < ETH_ALEN; i++)
u = u << 8 | addr[i];
return u;
}
/**
* u64_to_ether_addr - Convert a u64 to an Ethernet address.
* @u: u64 to convert to an Ethernet MAC address
* @addr: Pointer to a six-byte array to contain the Ethernet address
*/
static inline void u64_to_ether_addr(u64 u, u8 *addr)
{
int i;
for (i = ETH_ALEN - 1; i >= 0; i--) {
addr[i] = u & 0xff;
u = u >> 8;
}
}
/**
* eth_addr_dec - Decrement the given MAC address
*
* @addr: Pointer to a six-byte array containing Ethernet address to decrement
*/
static inline void eth_addr_dec(u8 *addr)
{
u64 u = ether_addr_to_u64(addr);
u--;
u64_to_ether_addr(u, addr);
}
/**
* eth_addr_inc() - Increment the given MAC address.
* @addr: Pointer to a six-byte array containing Ethernet address to increment.
*/
static inline void eth_addr_inc(u8 *addr)
{
u64 u = ether_addr_to_u64(addr);
u++;
u64_to_ether_addr(u, addr);
}
/**
* is_etherdev_addr - Tell if given Ethernet address belongs to the device.
* @dev: Pointer to a device structure
* @addr: Pointer to a six-byte array containing the Ethernet address
*
* Compare passed address with all addresses of the device. Return true if the
* address if one of the device addresses.
*
* Note that this function calls ether_addr_equal_64bits() so take care of
* the right padding.
*/
static inline bool is_etherdev_addr(const struct net_device *dev,
const u8 addr[6 + 2])
{
struct netdev_hw_addr *ha;
bool res = false;
rcu_read_lock();
for_each_dev_addr(dev, ha) {
res = ether_addr_equal_64bits(addr, ha->addr);
if (res)
break;
}
rcu_read_unlock();
return res;
}
#endif /* __KERNEL__ */
/**
* compare_ether_header - Compare two Ethernet headers
* @a: Pointer to Ethernet header
* @b: Pointer to Ethernet header
*
* Compare two Ethernet headers, returns 0 if equal.
* This assumes that the network header (i.e., IP header) is 4-byte
* aligned OR the platform can handle unaligned access. This is the
* case for all packets coming into netif_receive_skb or similar
* entry points.
*/
static inline unsigned long compare_ether_header(const void *a, const void *b)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
unsigned long fold;
/*
* We want to compare 14 bytes:
* [a0 ... a13] ^ [b0 ... b13]
* Use two long XOR, ORed together, with an overlap of two bytes.
* [a0 a1 a2 a3 a4 a5 a6 a7 ] ^ [b0 b1 b2 b3 b4 b5 b6 b7 ] |
* [a6 a7 a8 a9 a10 a11 a12 a13] ^ [b6 b7 b8 b9 b10 b11 b12 b13]
* This means the [a6 a7] ^ [b6 b7] part is done two times.
*/
fold = *(unsigned long *)a ^ *(unsigned long *)b;
fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6);
return fold;
#else
u32 *a32 = (u32 *)((u8 *)a + 2);
u32 *b32 = (u32 *)((u8 *)b + 2);
return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) |
(a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
#endif
}
/**
* eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame
* @skb: Buffer to pad
*
* An Ethernet frame should have a minimum size of 60 bytes. This function
* takes short frames and pads them with zeros up to the 60 byte limit.
*/
static inline int eth_skb_pad(struct sk_buff *skb)
{
return skb_put_padto(skb, ETH_ZLEN);
}
#endif /* _LINUX_ETHERDEVICE_H */
/* CPU control.
* (C) 2001, 2002, 2003, 2004 Rusty Russell
*
* This code is licenced under the GPL.
*/
#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched/signal.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/isolation.h>
#include <linux/sched/task.h>
#include <linux/sched/smt.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <linux/irq.h>
#include <linux/nmi.h>
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
#include <trace/events/cpuhp.h>
#include "smpboot.h"
/**
* struct cpuhp_cpu_state - Per cpu hotplug state storage
* @state: The current cpu state
* @target: The target state
* @fail: Current CPU hotplug callback state
* @thread: Pointer to the hotplug thread
* @should_run: Thread should execute
* @rollback: Perform a rollback
* @single: Single callback invocation
* @bringup: Single callback bringup or teardown selector
* @cpu: CPU number
* @node: Remote CPU node; for multi-instance, do a
* single entry callback for install/remove
* @last: For multi-instance rollback, remember how far we got
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
* @done_up: Signal completion to the issuer of the task for cpu-up
* @done_down: Signal completion to the issuer of the task for cpu-down
*/
struct cpuhp_cpu_state {
enum cpuhp_state state;
enum cpuhp_state target;
enum cpuhp_state fail;
#ifdef CONFIG_SMP
struct task_struct *thread;
bool should_run;
bool rollback;
bool single;
bool bringup;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
int result;
struct completion done_up;
struct completion done_down;
#endif
};
static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
.fail = CPUHP_INVALID,
};
#ifdef CONFIG_SMP
cpumask_t cpus_booted_once_mask;
#endif
#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
static struct lockdep_map cpuhp_state_up_map =
STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
static struct lockdep_map cpuhp_state_down_map =
STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
static inline void cpuhp_lock_acquire(bool bringup)
{
lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
}
static inline void cpuhp_lock_release(bool bringup)
{
lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
}
#else
static inline void cpuhp_lock_acquire(bool bringup) { }
static inline void cpuhp_lock_release(bool bringup) { }
#endif
/**
* struct cpuhp_step - Hotplug state machine step
* @name: Name of the step
* @startup: Startup function of the step
* @teardown: Teardown function of the step
* @cant_stop: Bringup/teardown can't be stopped at this step
* @multi_instance: State has multiple instances which get added afterwards
*/
struct cpuhp_step {
const char *name;
union {
int (*single)(unsigned int cpu);
int (*multi)(unsigned int cpu,
struct hlist_node *node);
} startup;
union {
int (*single)(unsigned int cpu);
int (*multi)(unsigned int cpu,
struct hlist_node *node);
} teardown;
/* private: */
struct hlist_head list;
/* public: */
bool cant_stop;
bool multi_instance;
};
static DEFINE_MUTEX(cpuhp_state_mutex);
static struct cpuhp_step cpuhp_hp_states[];
static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
{
return cpuhp_hp_states + state;
}
static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
{
return bringup ? !step->startup.single : !step->teardown.single;
}
/**
* cpuhp_invoke_callback - Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
* @state: The state to do callbacks for
* @bringup: True if the bringup callback should be invoked
* @node: For multi-instance, do a single entry callback for install/remove
* @lastp: For multi-instance rollback, remember how far we got
*
* Called from cpu hotplug and from the state register machinery.
*
* Return: %0 on success or a negative errno code
*/
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
bool bringup, struct hlist_node *node,
struct hlist_node **lastp)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct cpuhp_step *step = cpuhp_get_step(state);
int (*cbm)(unsigned int cpu, struct hlist_node *node);
int (*cb)(unsigned int cpu);
int ret, cnt;
if (st->fail == state) {
st->fail = CPUHP_INVALID;
return -EAGAIN;
}
if (cpuhp_step_empty(bringup, step)) {
WARN_ON_ONCE(1);
return 0;
}
if (!step->multi_instance) {
WARN_ON_ONCE(lastp && *lastp);
cb = bringup ? step->startup.single : step->teardown.single;
trace_cpuhp_enter(cpu, st->target, state, cb);
ret = cb(cpu);
trace_cpuhp_exit(cpu, st->state, state, ret);
return ret;
}
cbm = bringup ? step->startup.multi : step->teardown.multi;
/* Single invocation for instance add/remove */
if (node) {
WARN_ON_ONCE(lastp && *lastp);
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
return ret;
}
/* State transition. Invoke on all instances */
cnt = 0;
hlist_for_each(node, &step->list) {
if (lastp && node == *lastp)
break;
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
if (ret) {
if (!lastp)
goto err;
*lastp = node;
return ret;
}
cnt++;
}
if (lastp)
*lastp = NULL;
return 0;
err:
/* Rollback the instances if one failed */
cbm = !bringup ? step->startup.multi : step->teardown.multi;
if (!cbm)
return ret;
hlist_for_each(node, &step->list) {
if (!cnt--)
break;
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
/*
* Rollback must not fail,
*/
WARN_ON_ONCE(ret);
}
return ret;
}
#ifdef CONFIG_SMP
static bool cpuhp_is_ap_state(enum cpuhp_state state)
{
/*
* The extra check for CPUHP_TEARDOWN_CPU is only for documentation
* purposes as that state is handled explicitly in cpu_down.
*/
return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
}
static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
{
struct completion *done = bringup ? &st->done_up : &st->done_down;
wait_for_completion(done);
}
static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
{
struct completion *done = bringup ? &st->done_up : &st->done_down;
complete(done);
}
/*
* The former STARTING/DYING states, ran with IRQs disabled and must not fail.
*/
static bool cpuhp_is_atomic_state(enum cpuhp_state state)
{
return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
}
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
bool cpuhp_tasks_frozen;
EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);
/*
* The following two APIs (cpu_maps_update_begin/done) must be used when
* attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
*/
void cpu_maps_update_begin(void)
{
mutex_lock(&cpu_add_remove_lock);
}
void cpu_maps_update_done(void)
{
mutex_unlock(&cpu_add_remove_lock);
}
/*
* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
*/
static int cpu_hotplug_disabled;
#ifdef CONFIG_HOTPLUG_CPU
DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
void cpus_read_lock(void)
{
percpu_down_read(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_lock);
int cpus_read_trylock(void)
{
return percpu_down_read_trylock(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_trylock);
void cpus_read_unlock(void)
{
percpu_up_read(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_unlock);
void cpus_write_lock(void)
{
percpu_down_write(&cpu_hotplug_lock);
}
void cpus_write_unlock(void)
{
percpu_up_write(&cpu_hotplug_lock);
}
void lockdep_assert_cpus_held(void)
{
/*
* We can't have hotplug operations before userspace starts running,
* and some init codepaths will knowingly not take the hotplug lock.
* This is all valid, so mute lockdep until it makes sense to report
* unheld locks.
*/
if (system_state < SYSTEM_RUNNING)
return;
percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
#ifdef CONFIG_LOCKDEP
int lockdep_is_cpus_held(void)
{
return percpu_rwsem_is_held(&cpu_hotplug_lock);
}
#endif
static void lockdep_acquire_cpus_lock(void)
{
rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
}
static void lockdep_release_cpus_lock(void)
{
rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
}
/*
* Wait for currently running CPU hotplug operations to complete (if any) and
* disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
* the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
* hotplug path before performing hotplug operations. So acquiring that lock
* guarantees mutual exclusion from any currently running hotplug operations.
*/
void cpu_hotplug_disable(void)
{
cpu_maps_update_begin();
cpu_hotplug_disabled++;
cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
static void __cpu_hotplug_enable(void)
{
if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
return;
cpu_hotplug_disabled--;
}
void cpu_hotplug_enable(void)
{
cpu_maps_update_begin();
__cpu_hotplug_enable();
cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#else
static void lockdep_acquire_cpus_lock(void)
{
}
static void lockdep_release_cpus_lock(void)
{
}
#endif /* CONFIG_HOTPLUG_CPU */
/*
* Architectures that need SMT-specific errata handling during SMT hotplug
* should override this.
*/
void __weak arch_smt_update(void) { }
#ifdef CONFIG_HOTPLUG_SMT
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
void __init cpu_smt_disable(bool force)
{
if (!cpu_smt_possible())
return;
if (force) {
pr_info("SMT: Force disabled\n");
cpu_smt_control = CPU_SMT_FORCE_DISABLED;
} else {
pr_info("SMT: disabled\n");
cpu_smt_control = CPU_SMT_DISABLED;
}
}
/*
* The decision whether SMT is supported can only be done after the full
* CPU identification. Called from architecture code.
*/
void __init cpu_smt_check_topology(void)
{
if (!topology_smt_supported())
cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
}
static int __init smt_cmdline_disable(char *str)
{
cpu_smt_disable(str && !strcmp(str, "force"));
return 0;
}
early_param("nosmt", smt_cmdline_disable);
static inline bool cpu_smt_allowed(unsigned int cpu)
{
if (cpu_smt_control == CPU_SMT_ENABLED)
return true;
if (topology_is_primary_thread(cpu))
return true;
/*
* On x86 it's required to boot all logical CPUs at least once so
* that the init code can get a chance to set CR4.MCE on each
* CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any
* core will shutdown the machine.
*/
return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
}
/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */
bool cpu_smt_possible(void)
{
return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
}
EXPORT_SYMBOL_GPL(cpu_smt_possible);
#else
static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
#endif
static inline enum cpuhp_state
cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
bool bringup = st->state < target;
st->rollback = false;
st->last = NULL;
st->target = target;
st->single = false;
st->bringup = bringup;
if (cpu_dying(cpu) != !bringup)
set_cpu_dying(cpu, !bringup);
return prev_state;
}
static inline void
cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state prev_state)
{
bool bringup = !st->bringup;
st->target = prev_state;
/*
* Already rolling back. No need invert the bringup value or to change
* the current state.
*/
if (st->rollback)
return;
st->rollback = true;
/*
* If we have st->last we need to undo partial multi_instance of this
* state first. Otherwise start undo at the previous state.
*/
if (!st->last) {
if (st->bringup)
st->state--;
else
st->state++;
}
st->bringup = bringup;
if (cpu_dying(cpu) != !bringup)
set_cpu_dying(cpu, !bringup);
}
/* Regular hotplug invocation of the AP hotplug thread */
static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
{
if (!st->single && st->state == st->target)
return;
st->result = 0;
/*
* Make sure the above stores are visible before should_run becomes
* true. Paired with the mb() above in cpuhp_thread_fun()
*/
smp_mb();
st->should_run = true;
wake_up_process(st->thread);
wait_for_ap_thread(st, st->bringup);
}
static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
enum cpuhp_state prev_state;
int ret;
prev_state = cpuhp_set_state(cpu, st, target);
__cpuhp_kick_ap(st);
if ((ret = st->result)) {
cpuhp_reset_state(cpu, st, prev_state);
__cpuhp_kick_ap(st);
}
return ret;
}
static int bringup_wait_for_ap(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
/* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
wait_for_ap_thread(st, true);
if (WARN_ON_ONCE((!cpu_online(cpu))))
return -ECANCELED;
/* Unpark the hotplug thread of the target cpu */
kthread_unpark(st->thread);
/*
* SMT soft disabling on X86 requires to bring the CPU out of the
* BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
* CPU marked itself as booted_once in notify_cpu_starting() so the
* cpu_smt_allowed() check will now return false if this is not the
* primary sibling.
*/
if (!cpu_smt_allowed(cpu))
return -ECANCELED;
if (st->target <= CPUHP_AP_ONLINE_IDLE)
return 0;
return cpuhp_kick_ap(cpu, st, st->target);
}
static int bringup_cpu(unsigned int cpu)
{
struct task_struct *idle = idle_thread_get(cpu);
int ret;
/*
* Reset stale stack state from the last time this CPU was online.
*/
scs_task_reset(idle);
kasan_unpoison_task_stack(idle);
/*
* Some architectures have to walk the irq descriptors to
* setup the vector space for the cpu which comes online.
* Prevent irq alloc/free across the bringup.
*/
irq_lock_sparse();
/* Arch-specific enabling code. */
ret = __cpu_up(cpu, idle);
irq_unlock_sparse();
if (ret)
return ret;
return bringup_wait_for_ap(cpu);
}
static int finish_cpu(unsigned int cpu)
{
struct task_struct *idle = idle_thread_get(cpu);
struct mm_struct *mm = idle->active_mm;
/*
* idle_task_exit() will have switched to &init_mm, now
* clean up any remaining active_mm state.
*/
if (mm != &init_mm)
idle->active_mm = &init_mm;
mmdrop(mm);
return 0;
}
/*
* Hotplug state machine related functions
*/
/*
* Get the next state to run. Empty ones will be skipped. Returns true if a
* state must be run.
*
* st->state will be modified ahead of time, to match state_to_run, as if it
* has already ran.
*/
static bool cpuhp_next_state(bool bringup,
enum cpuhp_state *state_to_run,
struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
do {
if (bringup) {
if (st->state >= target)
return false;
*state_to_run = ++st->state;
} else {
if (st->state <= target)
return false;
*state_to_run = st->state--;
}
if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
break;
} while (true);
return true;
}
static int cpuhp_invoke_callback_range(bool bringup,
unsigned int cpu,
struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
enum cpuhp_state state;
int err = 0;
while (cpuhp_next_state(bringup, &state, st, target)) {
err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
if (err)
break;
}
return err;
}
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
{
if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
return true;
/*
* When CPU hotplug is disabled, then taking the CPU down is not
* possible because takedown_cpu() and the architecture and
* subsystem specific mechanisms are not available. So the CPU
* which would be completely unplugged again needs to stay around
* in the current state.
*/
return st->state <= CPUHP_BRINGUP_CPU;
}
static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
int ret = 0;
ret = cpuhp_invoke_callback_range(true, cpu, st, target);
if (ret) {
pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n",
ret, cpu, cpuhp_get_step(st->state)->name,
st->state);
cpuhp_reset_state(cpu, st, prev_state);
if (can_rollback_cpu(st))
WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
prev_state));
}
return ret;
}
/*
* The cpu hotplug threads manage the bringup and teardown of the cpus
*/
static void cpuhp_create(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
init_completion(&st->done_up);
init_completion(&st->done_down);
}
static int cpuhp_should_run(unsigned int cpu)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
return st->should_run;
}
/*
* Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
* callbacks when a state gets [un]installed at runtime.
*
* Each invocation of this function by the smpboot thread does a single AP
* state callback.
*
* It has 3 modes of operation:
* - single: runs st->cb_state
* - up: runs ++st->state, while st->state < st->target
* - down: runs st->state--, while st->state > st->target
*
* When complete or on error, should_run is cleared and the completion is fired.
*/
static void cpuhp_thread_fun(unsigned int cpu)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
bool bringup = st->bringup;
enum cpuhp_state state;
if (WARN_ON_ONCE(!st->should_run))
return;
/*
* ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
* that if we see ->should_run we also see the rest of the state.
*/
smp_mb();
/*
* The BP holds the hotplug lock, but we're now running on the AP,
* ensure that anybody asserting the lock is held, will actually find
* it so.
*/
lockdep_acquire_cpus_lock();
cpuhp_lock_acquire(bringup);
if (st->single) {
state = st->cb_state;
st->should_run = false;
} else {
st->should_run = cpuhp_next_state(bringup, &state, st, st->target);
if (!st->should_run)
goto end;
}
WARN_ON_ONCE(!cpuhp_is_ap_state(state));
if (cpuhp_is_atomic_state(state)) {
local_irq_disable();
st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
local_irq_enable();
/*
* STARTING/DYING must not fail!
*/
WARN_ON_ONCE(st->result);
} else {
st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
}
if (st->result) {
/*
* If we fail on a rollback, we're up a creek without no
* paddle, no way forward, no way back. We loose, thanks for
* playing.
*/
WARN_ON_ONCE(st->rollback);
st->should_run = false;
}
end:
cpuhp_lock_release(bringup);
lockdep_release_cpus_lock();
if (!st->should_run)
complete_ap_thread(st, bringup);
}
/* Invoke a single callback on a remote cpu */
static int
cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
struct hlist_node *node)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int ret;
if (!cpu_online(cpu))
return 0;
cpuhp_lock_acquire(false);
cpuhp_lock_release(false);
cpuhp_lock_acquire(true);
cpuhp_lock_release(true);
/*
* If we are up and running, use the hotplug thread. For early calls
* we invoke the thread function directly.
*/
if (!st->thread)
return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
st->rollback = false;
st->last = NULL;
st->node = node;
st->bringup = bringup;
st->cb_state = state;
st->single = true;
__cpuhp_kick_ap(st);
/*
* If we failed and did a partial, do a rollback.
*/
if ((ret = st->result) && st->last) {
st->rollback = true;
st->bringup = !bringup;
__cpuhp_kick_ap(st);
}
/*
* Clean up the leftovers so the next hotplug operation wont use stale
* data.
*/
st->node = st->last = NULL;
return ret;
}
static int cpuhp_kick_ap_work(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state prev_state = st->state;
int ret;
cpuhp_lock_acquire(false);
cpuhp_lock_release(false);
cpuhp_lock_acquire(true);
cpuhp_lock_release(true);
trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
ret = cpuhp_kick_ap(cpu, st, st->target);
trace_cpuhp_exit(cpu, st->state, prev_state, ret);
return ret;
}
static struct smp_hotplug_thread cpuhp_threads = {
.store = &cpuhp_state.thread,
.create = &cpuhp_create,
.thread_should_run = cpuhp_should_run,
.thread_fn = cpuhp_thread_fun,
.thread_comm = "cpuhp/%u",
.selfparking = true,
};
void __init cpuhp_threads_init(void)
{
BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
kthread_unpark(this_cpu_read(cpuhp_state.thread));
}
/*
*
* Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
* protected region.
*
* The operation is still serialized against concurrent CPU hotplug via
* cpu_add_remove_lock, i.e. CPU map protection. But it is _not_
* serialized against other hotplug related activity like adding or
* removing of state callbacks and state instances, which invoke either the
* startup or the teardown callback of the affected state.
*
* This is required for subsystems which are unfixable vs. CPU hotplug and
* evade lock inversion problems by scheduling work which has to be
* completed _before_ cpu_up()/_cpu_down() returns.
*
* Don't even think about adding anything to this for any new code or even
* drivers. It's only purpose is to keep existing lock order trainwrecks
* working.
*
* For cpu_down() there might be valid reasons to finish cleanups which are
* not required to be done under cpu_hotplug_lock, but that's a different
* story and would be not invoked via this.
*/
static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
{
/*
* cpusets delegate hotplug operations to a worker to "solve" the
* lock order problems. Wait for the worker, but only if tasks are
* _not_ frozen (suspend, hibernate) as that would wait forever.
*
* The wait is required because otherwise the hotplug operation
* returns with inconsistent state, which could even be observed in
* user space when a new CPU is brought up. The CPU plug uevent
* would be delivered and user space reacting on it would fail to
* move tasks to the newly plugged CPU up to the point where the
* work has finished because up to that point the newly plugged CPU
* is not assignable in cpusets/cgroups. On unplug that's not
* necessarily a visible issue, but it is still inconsistent state,
* which is the real problem which needs to be "fixed". This can't
* prevent the transient state between scheduling the work and
* returning from waiting for it.
*/
if (!tasks_frozen)
cpuset_wait_for_hotplug();
}
#ifdef CONFIG_HOTPLUG_CPU
#ifndef arch_clear_mm_cpumask_cpu
#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
#endif
/**
* clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
* @cpu: a CPU id
*
* This function walks all processes, finds a valid mm struct for each one and
* then clears a corresponding bit in mm's cpumask. While this all sounds
* trivial, there are various non-obvious corner cases, which this function
* tries to solve in a safe manner.
*
* Also note that the function uses a somewhat relaxed locking scheme, so it may
* be called only for an already offlined CPU.
*/
void clear_tasks_mm_cpumask(int cpu)
{
struct task_struct *p;
/*
* This function is called after the cpu is taken down and marked
* offline, so its not like new tasks will ever get this cpu set in
* their mm mask. -- Peter Zijlstra
* Thus, we may use rcu_read_lock() here, instead of grabbing
* full-fledged tasklist_lock.
*/
WARN_ON(cpu_online(cpu));
rcu_read_lock();
for_each_process(p) {
struct task_struct *t;
/*
* Main thread might exit, but other threads may still have
* a valid mm. Find one.
*/
t = find_lock_task_mm(p);
if (!t)
continue;
arch_clear_mm_cpumask_cpu(cpu, t->mm);
task_unlock(t);
}
rcu_read_unlock();
}
/* Take this CPU down. */
static int take_cpu_down(void *_param)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
int ret;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
return err;
/*
* Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going
* down, that the current state is CPUHP_TEARDOWN_CPU - 1.
*/
WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
/* Invoke the former CPU_DYING callbacks */
ret = cpuhp_invoke_callback_range(false, cpu, st, target);
/*
* DYING must not fail!
*/
WARN_ON_ONCE(ret);
/* Give up timekeeping duties */
tick_handover_do_timer();
/* Remove CPU from timer broadcasting */
tick_offline_cpu(cpu);
/* Park the stopper thread */
stop_machine_park(cpu);
return 0;
}
static int takedown_cpu(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int err;
/* Park the smpboot threads */
kthread_park(st->thread);
/*
* Prevent irq alloc/free while the dying cpu reorganizes the
* interrupt affinities.
*/
irq_lock_sparse();
/*
* So now all preempt/rcu users must observe !cpu_active().
*/
err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
if (err) {
/* CPU refused to die */
irq_unlock_sparse();
/* Unpark the hotplug thread so we can rollback there */
kthread_unpark(st->thread);
return err;
}
BUG_ON(cpu_online(cpu));
/*
* The teardown callback for CPUHP_AP_SCHED_STARTING will have removed
* all runnable tasks from the CPU, there's only the idle task left now
* that the migration thread is done doing the stop_machine thing.
*
* Wait for the stop thread to go away.
*/
wait_for_ap_thread(st, false);
BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
/* Interrupts are moved away from the dying cpu, reenable alloc/free */
irq_unlock_sparse();
hotplug_cpu__broadcast_tick_pull(cpu);
/* This actually kills the CPU. */
__cpu_die(cpu);
tick_cleanup_dead_cpu(cpu);
rcutree_migrate_callbacks(cpu);
return 0;
}
static void cpuhp_complete_idle_dead(void *arg)
{
struct cpuhp_cpu_state *st = arg;
complete_ap_thread(st, false);
}
void cpuhp_report_idle_dead(void)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
BUG_ON(st->state != CPUHP_AP_OFFLINE);
rcu_report_dead(smp_processor_id());
st->state = CPUHP_AP_IDLE_DEAD;
/*
* We cannot call complete after rcu_report_dead() so we delegate it
* to an online cpu.
*/
smp_call_function_single(cpumask_first(cpu_online_mask),
cpuhp_complete_idle_dead, st, 0);
}
static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
int ret = 0;
ret = cpuhp_invoke_callback_range(false, cpu, st, target);
if (ret) {
pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n",
ret, cpu, cpuhp_get_step(st->state)->name,
st->state);
cpuhp_reset_state(cpu, st, prev_state);
if (st->state < prev_state)
WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
prev_state));
}
return ret;
}
/* Requires cpu_add_remove_lock to be held */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
enum cpuhp_state target)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int prev_state, ret = 0;
if (num_online_cpus() == 1)
return -EBUSY;
if (!cpu_present(cpu))
return -EINVAL;
cpus_write_lock();
cpuhp_tasks_frozen = tasks_frozen;
prev_state = cpuhp_set_state(cpu, st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread.
*/
if (st->state > CPUHP_TEARDOWN_CPU) {
st->target = max((int)target, CPUHP_TEARDOWN_CPU);
ret = cpuhp_kick_ap_work(cpu);
/*
* The AP side has done the error rollback already. Just
* return the error code..
*/
if (ret)
goto out;
/*
* We might have stopped still in the range of the AP hotplug
* thread. Nothing to do anymore.
*/
if (st->state > CPUHP_TEARDOWN_CPU)
goto out;
st->target = target;
}
/*
* The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
* to do the further cleanups.
*/
ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state < prev_state) {
if (st->state == CPUHP_TEARDOWN_CPU) {
cpuhp_reset_state(cpu, st, prev_state);
__cpuhp_kick_ap(st);
} else {
WARN(1, "DEAD callback error for CPU%d", cpu);
}
}
out:
cpus_write_unlock();
/*
* Do post unplug cleanup. This is still protected against
* concurrent CPU hotplug via cpu_add_remove_lock.
*/
lockup_detector_cleanup();
arch_smt_update();
cpu_up_down_serialize_trainwrecks(tasks_frozen);
return ret;
}
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
if (cpu_hotplug_disabled)
return -EBUSY;
return _cpu_down(cpu, 0, target);
}
static int cpu_down(unsigned int cpu, enum cpuhp_state target)
{
int err;
cpu_maps_update_begin();
err = cpu_down_maps_locked(cpu, target);
cpu_maps_update_done();
return err;
}
/**
* cpu_device_down - Bring down a cpu device
* @dev: Pointer to the cpu device to offline
*
* This function is meant to be used by device core cpu subsystem only.
*
* Other subsystems should use remove_cpu() instead.
*
* Return: %0 on success or a negative errno code
*/
int cpu_device_down(struct device *dev)
{
return cpu_down(dev->id, CPUHP_OFFLINE);
}
int remove_cpu(unsigned int cpu)
{
int ret;
lock_device_hotplug();
ret = device_offline(get_cpu_device(cpu));
unlock_device_hotplug();
return ret;
}
EXPORT_SYMBOL_GPL(remove_cpu);
void smp_shutdown_nonboot_cpus(unsigned int primary_cpu)
{
unsigned int cpu;
int error;
cpu_maps_update_begin();
/*
* Make certain the cpu I'm about to reboot on is online.
*
* This is inline to what migrate_to_reboot_cpu() already do.
*/
if (!cpu_online(primary_cpu))
primary_cpu = cpumask_first(cpu_online_mask);
for_each_online_cpu(cpu) {
if (cpu == primary_cpu)
continue;
error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
if (error) {
pr_err("Failed to offline CPU%d - error=%d",
cpu, error);
break;
}
}
/*
* Ensure all but the reboot CPU are offline.
*/
BUG_ON(num_online_cpus() > 1);
/*
* Make sure the CPUs won't be enabled by someone else after this
* point. Kexec will reboot to a new kernel shortly resetting
* everything along the way.
*/
cpu_hotplug_disabled++;
cpu_maps_update_done();
}
#else
#define takedown_cpu NULL
#endif /*CONFIG_HOTPLUG_CPU*/
/**
* notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
* @cpu: cpu that just started
*
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
cpumask_set_cpu(cpu, &cpus_booted_once_mask);
ret = cpuhp_invoke_callback_range(true, cpu, st, target);
/*
* STARTING must not fail!
*/
WARN_ON_ONCE(ret);
}
/*
* Called from the idle task. Wake up the controlling task which brings the
* hotplug thread of the upcoming CPU up and then delegates the rest of the
* online bringup to the hotplug thread.
*/
void cpuhp_online_idle(enum cpuhp_state state)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
/* Happens for the boot cpu */
if (state != CPUHP_AP_ONLINE_IDLE)
return;
/*
* Unpart the stopper thread before we start the idle loop (and start
* scheduling); this ensures the stopper task is always available.
*/
stop_machine_unpark(smp_processor_id());
st->state = CPUHP_AP_ONLINE_IDLE;
complete_ap_thread(st, true);
}
/* Requires cpu_add_remove_lock to be held */
static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct task_struct *idle;
int ret = 0;
cpus_write_lock();
if (!cpu_present(cpu)) {
ret = -EINVAL;
goto out;
}
/*
* The caller of cpu_up() might have raced with another
* caller. Nothing to do.
*/
if (st->state >= target)
goto out;
if (st->state == CPUHP_OFFLINE) {
/* Let it fail before we try to bring the cpu up */
idle = idle_thread_get(cpu);
if (IS_ERR(idle)) {
ret = PTR_ERR(idle);
goto out;
}
}
cpuhp_tasks_frozen = tasks_frozen;
cpuhp_set_state(cpu, st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread once more.
*/
if (st->state > CPUHP_BRINGUP_CPU) {
ret = cpuhp_kick_ap_work(cpu);
/*
* The AP side has done the error rollback already. Just
* return the error code..
*/
if (ret)
goto out;
}
/*
* Try to reach the target state. We max out on the BP at
* CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
* responsible for bringing it up to the target state.
*/
target = min((int)target, CPUHP_BRINGUP_CPU);
ret = cpuhp_up_callbacks(cpu, st, target);
out:
cpus_write_unlock();
arch_smt_update();
cpu_up_down_serialize_trainwrecks(tasks_frozen);
return ret;
}
static int cpu_up(unsigned int cpu, enum cpuhp_state target)
{
int err = 0;
if (!cpu_possible(cpu)) {
pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
cpu);
#if defined(CONFIG_IA64)
pr_err("please check additional_cpus= boot parameter\n");
#endif
return -EINVAL;
}
err = try_online_node(cpu_to_node(cpu));
if (err)
return err;
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
err = -EBUSY;
goto out;
}
if (!cpu_smt_allowed(cpu)) {
err = -EPERM;
goto out;
}
err = _cpu_up(cpu, 0, target);
out:
cpu_maps_update_done();
return err;
}
/**
* cpu_device_up - Bring up a cpu device
* @dev: Pointer to the cpu device to online
*
* This function is meant to be used by device core cpu subsystem only.
*
* Other subsystems should use add_cpu() instead.
*
* Return: %0 on success or a negative errno code
*/
int cpu_device_up(struct device *dev)
{
return cpu_up(dev->id, CPUHP_ONLINE);
}
int add_cpu(unsigned int cpu)
{
int ret;
lock_device_hotplug();
ret = device_online(get_cpu_device(cpu));
unlock_device_hotplug();
return ret;
}
EXPORT_SYMBOL_GPL(add_cpu);
/**
* bringup_hibernate_cpu - Bring up the CPU that we hibernated on
* @sleep_cpu: The cpu we hibernated on and should be brought up.
*
* On some architectures like arm64, we can hibernate on any CPU, but on
* wake up the CPU we hibernated on might be offline as a side effect of
* using maxcpus= for example.
*
* Return: %0 on success or a negative errno code
*/
int bringup_hibernate_cpu(unsigned int sleep_cpu)
{
int ret;
if (!cpu_online(sleep_cpu)) {
pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n");
ret = cpu_up(sleep_cpu, CPUHP_ONLINE);
if (ret) {
pr_err("Failed to bring hibernate-CPU up!\n");
return ret;
}
}
return 0;
}
void bringup_nonboot_cpus(unsigned int setup_max_cpus)
{
unsigned int cpu;
for_each_present_cpu(cpu) {
if (num_online_cpus() >= setup_max_cpus)
break;
if (!cpu_online(cpu))
cpu_up(cpu, CPUHP_ONLINE);
}
}
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
int freeze_secondary_cpus(int primary)
{
int cpu, error = 0;
cpu_maps_update_begin();
if (primary == -1) {
primary = cpumask_first(cpu_online_mask);
if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
primary = housekeeping_any_cpu(HK_FLAG_TIMER);
} else {
if (!cpu_online(primary))
primary = cpumask_first(cpu_online_mask);
}
/*
* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
*/
cpumask_clear(frozen_cpus);
pr_info("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
if (cpu == primary)
continue;
if (pm_wakeup_pending()) {
pr_info("Wakeup pending. Abort CPU freeze\n");
error = -EBUSY;
break;
}
trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
if (!error)
cpumask_set_cpu(cpu, frozen_cpus);
else {
pr_err("Error taking CPU%d down: %d\n", cpu, error);
break;
}
}
if (!error)
BUG_ON(num_online_cpus() > 1);
else
pr_err("Non-boot CPUs are not disabled\n");
/*
* Make sure the CPUs won't be enabled by someone else. We need to do
* this even in case of failure as all freeze_secondary_cpus() users are
* supposed to do thaw_secondary_cpus() on the failure path.
*/
cpu_hotplug_disabled++;
cpu_maps_update_done();
return error;
}
void __weak arch_thaw_secondary_cpus_begin(void)
{
}
void __weak arch_thaw_secondary_cpus_end(void)
{
}
void thaw_secondary_cpus(void)
{
int cpu, error;
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
__cpu_hotplug_enable();
if (cpumask_empty(frozen_cpus))
goto out;
pr_info("Enabling non-boot CPUs ...\n");
arch_thaw_secondary_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
trace_suspend_resume(TPS("CPU_ON"), cpu, true);
error = _cpu_up(cpu, 1, CPUHP_ONLINE);
trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
pr_info("CPU%d is up\n", cpu);
continue;
}
pr_warn("Error taking CPU%d up: %d\n", cpu, error);
}
arch_thaw_secondary_cpus_end();
cpumask_clear(frozen_cpus);
out:
cpu_maps_update_done();
}
static int __init alloc_frozen_cpus(void)
{
if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
return -ENOMEM;
return 0;
}
core_initcall(alloc_frozen_cpus);
/*
* When callbacks for CPU hotplug notifications are being executed, we must
* ensure that the state of the system with respect to the tasks being frozen
* or not, as reported by the notification, remains unchanged *throughout the
* duration* of the execution of the callbacks.
* Hence we need to prevent the freezer from racing with regular CPU hotplug.
*
* This synchronization is implemented by mutually excluding regular CPU
* hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
* Hibernate notifications.
*/
static int
cpu_hotplug_pm_callback(struct notifier_block *nb,
unsigned long action, void *ptr)
{
switch (action) {
case PM_SUSPEND_PREPARE:
case PM_HIBERNATION_PREPARE:
cpu_hotplug_disable();
break;
case PM_POST_SUSPEND:
case PM_POST_HIBERNATION:
cpu_hotplug_enable();
break;
default:
return NOTIFY_DONE;
}
return NOTIFY_OK;
}
static int __init cpu_hotplug_pm_sync_init(void)
{
/*
* cpu_hotplug_pm_callback has higher priority than x86
* bsp_pm_callback which depends on cpu_hotplug_pm_callback
* to disable cpu hotplug to avoid cpu hotplug race.
*/
pm_notifier(cpu_hotplug_pm_callback, 0);
return 0;
}
core_initcall(cpu_hotplug_pm_sync_init);
#endif /* CONFIG_PM_SLEEP_SMP */
int __boot_cpu_id;
#endif /* CONFIG_SMP */
/* Boot processor state steps */
static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_OFFLINE] = {
.name = "offline",
.startup.single = NULL,
.teardown.single = NULL,
},
#ifdef CONFIG_SMP
[CPUHP_CREATE_THREADS]= {
.name = "threads:prepare",
.startup.single = smpboot_create_threads,
.teardown.single = NULL,
.cant_stop = true,
},
[CPUHP_PERF_PREPARE] = {
.name = "perf:prepare",
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
[CPUHP_WORKQUEUE_PREP] = {
.name = "workqueue:prepare",
.startup.single = workqueue_prepare_cpu,
.teardown.single = NULL,
},
[CPUHP_HRTIMERS_PREPARE] = {
.name = "hrtimers:prepare",
.startup.single = hrtimers_prepare_cpu,
.teardown.single = hrtimers_dead_cpu,
},
[CPUHP_SMPCFD_PREPARE] = {
.name = "smpcfd:prepare",
.startup.single = smpcfd_prepare_cpu,
.teardown.single = smpcfd_dead_cpu,
},
[CPUHP_RELAY_PREPARE] = {
.name = "relay:prepare",
.startup.single = relay_prepare_cpu,
.teardown.single = NULL,
},
[CPUHP_SLAB_PREPARE] = {
.name = "slab:prepare",
.startup.single = slab_prepare_cpu,
.teardown.single = slab_dead_cpu,
},
[CPUHP_RCUTREE_PREP] = {
.name = "RCU/tree:prepare",
.startup.single = rcutree_prepare_cpu,
.teardown.single = rcutree_dead_cpu,
},
/*
* On the tear-down path, timers_dead_cpu() must be invoked
* before blk_mq_queue_reinit_notify() from notify_dead(),
* otherwise a RCU stall occurs.
*/
[CPUHP_TIMERS_PREPARE] = {
.name = "timers:prepare",
.startup.single = timers_prepare_cpu,
.teardown.single = timers_dead_cpu,
},
/* Kicks the plugged cpu into life */
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
.teardown.single = finish_cpu,
.cant_stop = true,
},
/* Final state before CPU kills itself */
[CPUHP_AP_IDLE_DEAD] = {
.name = "idle:dead",
},
/*
* Last state before CPU enters the idle loop to die. Transient state
* for synchronization.
*/
[CPUHP_AP_OFFLINE] = {
.name = "ap:offline",
.cant_stop = true,
},
/* First state is scheduler control. Interrupts are disabled */
[CPUHP_AP_SCHED_STARTING] = {
.name = "sched:starting",
.startup.single = sched_cpu_starting,
.teardown.single = sched_cpu_dying,
},
[CPUHP_AP_RCUTREE_DYING] = {
.name = "RCU/tree:dying",
.startup.single = NULL,
.teardown.single = rcutree_dying_cpu,
},
[CPUHP_AP_SMPCFD_DYING] = {
.name = "smpcfd:dying",
.startup.single = NULL,
.teardown.single = smpcfd_dying_cpu,
},
/* Entry state on starting. Interrupts enabled from here on. Transient
* state for synchronsization */
[CPUHP_AP_ONLINE] = {
.name = "ap:online",
},
/*
* Handled on control processor until the plugged processor manages
* this itself.
*/
[CPUHP_TEARDOWN_CPU] = {
.name = "cpu:teardown",
.startup.single = NULL,
.teardown.single = takedown_cpu,
.cant_stop = true,
},
[CPUHP_AP_SCHED_WAIT_EMPTY] = {
.name = "sched:waitempty",
.startup.single = NULL,
.teardown.single = sched_cpu_wait_empty,
},
/* Handle smpboot threads park/unpark */
[CPUHP_AP_SMPBOOT_THREADS] = {
.name = "smpboot/threads:online",
.startup.single = smpboot_unpark_threads,
.teardown.single = smpboot_park_threads,
},
[CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
.name = "irq/affinity:online",
.startup.single = irq_affinity_online_cpu,
.teardown.single = NULL,
},
[CPUHP_AP_PERF_ONLINE] = {
.name = "perf:online",
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
[CPUHP_AP_WATCHDOG_ONLINE] = {
.name = "lockup_detector:online",
.startup.single = lockup_detector_online_cpu,
.teardown.single = lockup_detector_offline_cpu,
},
[CPUHP_AP_WORKQUEUE_ONLINE] = {
.name = "workqueue:online",
.startup.single = workqueue_online_cpu,
.teardown.single = workqueue_offline_cpu,
},
[CPUHP_AP_RCUTREE_ONLINE] = {
.name = "RCU/tree:online",
.startup.single = rcutree_online_cpu,
.teardown.single = rcutree_offline_cpu,
},
#endif
/*
* The dynamically registered state space is here
*/
#ifdef CONFIG_SMP
/* Last state is scheduler control setting the cpu active */
[CPUHP_AP_ACTIVE] = {
.name = "sched:active",
.startup.single = sched_cpu_activate,
.teardown.single = sched_cpu_deactivate,
},
#endif
/* CPU is fully up and running. */
[CPUHP_ONLINE] = {
.name = "online",
.startup.single = NULL,
.teardown.single = NULL,
},
};
/* Sanity check for callbacks */
static int cpuhp_cb_check(enum cpuhp_state state)
{
if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE)
return -EINVAL;
return 0;
}
/*
* Returns a free for dynamic slot assignment of the Online state. The states
* are protected by the cpuhp_slot_states mutex and an empty slot is identified
* by having no name assigned.
*/
static int cpuhp_reserve_state(enum cpuhp_state state)
{
enum cpuhp_state i, end;
struct cpuhp_step *step;
switch (state) {
case CPUHP_AP_ONLINE_DYN:
step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN;
end = CPUHP_AP_ONLINE_DYN_END;
break;
case CPUHP_BP_PREPARE_DYN:
step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
end = CPUHP_BP_PREPARE_DYN_END;
break;
default:
return -EINVAL;
}
for (i = state; i <= end; i++, step++) {
if (!step->name)
return i;
}
WARN(1, "No more dynamic states available for CPU hotplug\n");
return -ENOSPC;
}
static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
int (*startup)(unsigned int cpu),
int (*teardown)(unsigned int cpu),
bool multi_instance)
{
/* (Un)Install the callbacks for further cpu hotplug operations */
struct cpuhp_step *sp;
int ret = 0;
/*
* If name is NULL, then the state gets removed.
*
* CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on
* the first allocation from these dynamic ranges, so the removal
* would trigger a new allocation and clear the wrong (already
* empty) state, leaving the callbacks of the to be cleared state
* dangling, which causes wreckage on the next hotplug operation.
*/
if (name && (state == CPUHP_AP_ONLINE_DYN ||
state == CPUHP_BP_PREPARE_DYN)) {
ret = cpuhp_reserve_state(state);
if (ret < 0)
return ret;
state = ret;
}
sp = cpuhp_get_step(state);
if (name && sp->name)
return -EBUSY;
sp->startup.single = startup;
sp->teardown.single = teardown;
sp->name = name;
sp->multi_instance = multi_instance;
INIT_HLIST_HEAD(&sp->list);
return ret;
}
static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
{
return cpuhp_get_step(state)->teardown.single;
}
/*
* Call the startup/teardown function for a step either on the AP or
* on the current CPU.
*/
static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
struct hlist_node *node)
{
struct cpuhp_step *sp = cpuhp_get_step(state);
int ret;
/*
* If there's nothing to do, we done.
* Relies on the union for multi_instance.
*/
if (cpuhp_step_empty(bringup, sp))
return 0;
/*
* The non AP bound callbacks can fail on bringup. On teardown
* e.g. module removal we crash for now.
*/
#ifdef CONFIG_SMP
if (cpuhp_is_ap_state(state))
ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
else
ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#else
ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#endif
BUG_ON(ret && !bringup);
return ret;
}
/*
* Called from __cpuhp_setup_state on a recoverable failure.
*
* Note: The teardown callbacks for rollback are not allowed to fail!
*/
static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
struct hlist_node *node)
{
int cpu;
/* Roll back the already executed steps on the other cpus */
for_each_present_cpu(cpu) {
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int cpustate = st->state;
if (cpu >= failedcpu)
break;
/* Did we invoke the startup call on that cpu ? */
if (cpustate >= state)
cpuhp_issue_call(cpu, state, false, node);
}
}
int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
struct hlist_node *node,
bool invoke)
{
struct cpuhp_step *sp;
int cpu;
int ret;
lockdep_assert_cpus_held();
sp = cpuhp_get_step(state);
if (sp->multi_instance == false)
return -EINVAL;
mutex_lock(&cpuhp_state_mutex);
if (!invoke || !sp->startup.multi)
goto add_node;
/*
* Try to call the startup callback for each present cpu
* depending on the hotplug state of the cpu.
*/
for_each_present_cpu(cpu) {
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int cpustate = st->state;
if (cpustate < state)
continue;
ret = cpuhp_issue_call(cpu, state, true, node);
if (ret) {
if (sp->teardown.multi)
cpuhp_rollback_install(cpu, state, node);
goto unlock;
}
}
add_node:
ret = 0;
hlist_add_head(node, &sp->list);
unlock:
mutex_unlock(&cpuhp_state_mutex);
return ret;
}
int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
bool invoke)
{
int ret;
cpus_read_lock();
ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
/**
* __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
* @state: The state to setup
* @name: Name of the step
* @invoke: If true, the startup function is invoked for cpus where
* cpu state >= @state
* @startup: startup callback function
* @teardown: teardown callback function
* @multi_instance: State is set up for multiple instances which get
* added afterwards.
*
* The caller needs to hold cpus read locked while calling this function.
* Return:
* On success:
* Positive state number if @state is CPUHP_AP_ONLINE_DYN;
* 0 for all other states
* On failure: proper (negative) error code
*/
int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
const char *name, bool invoke,
int (*startup)(unsigned int cpu),
int (*teardown)(unsigned int cpu),
bool multi_instance)
{
int cpu, ret = 0;
bool dynstate;
lockdep_assert_cpus_held();
if (cpuhp_cb_check(state) || !name)
return -EINVAL;
mutex_lock(&cpuhp_state_mutex);
ret = cpuhp_store_callbacks(state, name, startup, teardown,
multi_instance);
dynstate = state == CPUHP_AP_ONLINE_DYN;
if (ret > 0 && dynstate) {
state = ret;
ret = 0;
}
if (ret || !invoke || !startup)
goto out;
/*
* Try to call the startup callback for each present cpu
* depending on the hotplug state of the cpu.
*/
for_each_present_cpu(cpu) {
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int cpustate = st->state;
if (cpustate < state)
continue;
ret = cpuhp_issue_call(cpu, state, true, NULL);
if (ret) {
if (teardown)
cpuhp_rollback_install(cpu, state, NULL);
cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
goto out;
}
}
out:
mutex_unlock(&cpuhp_state_mutex);
/*
* If the requested state is CPUHP_AP_ONLINE_DYN, return the
* dynamically allocated state in case of success.
*/
if (!ret && dynstate)
return state;
return ret;
}
EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);
int __cpuhp_setup_state(enum cpuhp_state state,
const char *name, bool invoke,
int (*startup)(unsigned int cpu),
int (*teardown)(unsigned int cpu),
bool multi_instance)
{
int ret;
cpus_read_lock();
ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
teardown, multi_instance);
cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL(__cpuhp_setup_state);
int __cpuhp_state_remove_instance(enum cpuhp_state state,
struct hlist_node *node, bool invoke)
{
struct cpuhp_step *sp = cpuhp_get_step(state);
int cpu;
BUG_ON(cpuhp_cb_check(state));
if (!sp->multi_instance)
return -EINVAL;
cpus_read_lock();
mutex_lock(&cpuhp_state_mutex);
if (!invoke || !cpuhp_get_teardown_cb(state))
goto remove;
/*
* Call the teardown callback for each present cpu depending
* on the hotplug state of the cpu. This function is not
* allowed to fail currently!
*/
for_each_present_cpu(cpu) {
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int cpustate = st->state;
if (cpustate >= state)
cpuhp_issue_call(cpu, state, false, node);
}
remove:
hlist_del(node);
mutex_unlock(&cpuhp_state_mutex);
cpus_read_unlock();
return 0;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
/**
* __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state
* @state: The state to remove
* @invoke: If true, the teardown function is invoked for cpus where
* cpu state >= @state
*
* The caller needs to hold cpus read locked while calling this function.
* The teardown callback is currently not allowed to fail. Think
* about module removal!
*/
void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
{
struct cpuhp_step *sp = cpuhp_get_step(state);
int cpu;
BUG_ON(cpuhp_cb_check(state));
lockdep_assert_cpus_held();
mutex_lock(&cpuhp_state_mutex);
if (sp->multi_instance) {
WARN(!hlist_empty(&sp->list),
"Error: Removing state %d which has instances left.\n",
state);
goto remove;
}
if (!invoke || !cpuhp_get_teardown_cb(state))
goto remove;
/*
* Call the teardown callback for each present cpu depending
* on the hotplug state of the cpu. This function is not
* allowed to fail currently!
*/
for_each_present_cpu(cpu) {
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int cpustate = st->state;
if (cpustate >= state)
cpuhp_issue_call(cpu, state, false, NULL);
}
remove:
cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
mutex_unlock(&cpuhp_state_mutex);
}
EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);
void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
{
cpus_read_lock();
__cpuhp_remove_state_cpuslocked(state, invoke);
cpus_read_unlock();
}
EXPORT_SYMBOL(__cpuhp_remove_state);
#ifdef CONFIG_HOTPLUG_SMT
static void cpuhp_offline_cpu_device(unsigned int cpu)
{
struct device *dev = get_cpu_device(cpu);
dev->offline = true;
/* Tell user space about the state change */
kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
}
static void cpuhp_online_cpu_device(unsigned int cpu)
{
struct device *dev = get_cpu_device(cpu);
dev->offline = false;
/* Tell user space about the state change */
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
}
int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
{
int cpu, ret = 0;
cpu_maps_update_begin();
for_each_online_cpu(cpu) {
if (topology_is_primary_thread(cpu))
continue;
ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
if (ret)
break;
/*
* As this needs to hold the cpu maps lock it's impossible
* to call device_offline() because that ends up calling
* cpu_down() which takes cpu maps lock. cpu maps lock
* needs to be held as this might race against in kernel
* abusers of the hotplug machinery (thermal management).
*
* So nothing would update device:offline state. That would
* leave the sysfs entry stale and prevent onlining after
* smt control has been changed to 'off' again. This is
* called under the sysfs hotplug lock, so it is properly
* serialized against the regular offline usage.
*/
cpuhp_offline_cpu_device(cpu);
}
if (!ret)
cpu_smt_control = ctrlval;
cpu_maps_update_done();
return ret;
}
int cpuhp_smt_enable(void)
{
int cpu, ret = 0;
cpu_maps_update_begin();
cpu_smt_control = CPU_SMT_ENABLED;
for_each_present_cpu(cpu) {
/* Skip online CPUs and CPUs on offline nodes */
if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
continue;
ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
if (ret)
break;
/* See comment in cpuhp_smt_disable() */
cpuhp_online_cpu_device(cpu);
}
cpu_maps_update_done();
return ret;
}
#endif
#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
static ssize_t state_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->state);
}
static DEVICE_ATTR_RO(state);
static ssize_t target_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
struct cpuhp_step *sp;
int target, ret;
ret = kstrtoint(buf, 10, &target);
if (ret)
return ret;
#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL
if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE)
return -EINVAL;
#else
if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE)
return -EINVAL;
#endif
ret = lock_device_hotplug_sysfs();
if (ret)
return ret;
mutex_lock(&cpuhp_state_mutex);
sp = cpuhp_get_step(target);
ret = !sp->name || sp->cant_stop ? -EINVAL : 0;
mutex_unlock(&cpuhp_state_mutex);
if (ret)
goto out;
if (st->state < target)
ret = cpu_up(dev->id, target);
else
ret = cpu_down(dev->id, target);
out:
unlock_device_hotplug();
return ret ? ret : count;
}
static ssize_t target_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->target);
}
static DEVICE_ATTR_RW(target);
static ssize_t fail_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
struct cpuhp_step *sp;
int fail, ret;
ret = kstrtoint(buf, 10, &fail);
if (ret)
return ret;
if (fail == CPUHP_INVALID) {
st->fail = fail;
return count;
}
if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
return -EINVAL;
/*
* Cannot fail STARTING/DYING callbacks.
*/
if (cpuhp_is_atomic_state(fail))
return -EINVAL;
/*
* DEAD callbacks cannot fail...
* ... neither can CPUHP_BRINGUP_CPU during hotunplug. The latter
* triggering STARTING callbacks, a failure in this state would
* hinder rollback.
*/
if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU)
return -EINVAL;
/*
* Cannot fail anything that doesn't have callbacks.
*/
mutex_lock(&cpuhp_state_mutex);
sp = cpuhp_get_step(fail);
if (!sp->startup.single && !sp->teardown.single)
ret = -EINVAL;
mutex_unlock(&cpuhp_state_mutex);
if (ret)
return ret;
st->fail = fail;
return count;
}
static ssize_t fail_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->fail);
}
static DEVICE_ATTR_RW(fail);
static struct attribute *cpuhp_cpu_attrs[] = {
&dev_attr_state.attr,
&dev_attr_target.attr,
&dev_attr_fail.attr,
NULL
};
static const struct attribute_group cpuhp_cpu_attr_group = {
.attrs = cpuhp_cpu_attrs,
.name = "hotplug",
NULL
};
static ssize_t states_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
ssize_t cur, res = 0;
int i;
mutex_lock(&cpuhp_state_mutex);
for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) {
struct cpuhp_step *sp = cpuhp_get_step(i);
if (sp->name) {
cur = sprintf(buf, "%3d: %s\n", i, sp->name);
buf += cur;
res += cur;
}
}
mutex_unlock(&cpuhp_state_mutex);
return res;
}
static DEVICE_ATTR_RO(states);
static struct attribute *cpuhp_cpu_root_attrs[] = {
&dev_attr_states.attr,
NULL
};
static const struct attribute_group cpuhp_cpu_root_attr_group = {
.attrs = cpuhp_cpu_root_attrs,
.name = "hotplug",
NULL
};
#ifdef CONFIG_HOTPLUG_SMT
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
int ctrlval, ret;
if (sysfs_streq(buf, "on"))
ctrlval = CPU_SMT_ENABLED;
else if (sysfs_streq(buf, "off"))
ctrlval = CPU_SMT_DISABLED;
else if (sysfs_streq(buf, "forceoff"))
ctrlval = CPU_SMT_FORCE_DISABLED;
else
return -EINVAL;
if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
return -EPERM;
if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
return -ENODEV;
ret = lock_device_hotplug_sysfs();
if (ret)
return ret;
if (ctrlval != cpu_smt_control) {
switch (ctrlval) {
case CPU_SMT_ENABLED:
ret = cpuhp_smt_enable();
break;
case CPU_SMT_DISABLED:
case CPU_SMT_FORCE_DISABLED:
ret = cpuhp_smt_disable(ctrlval);
break;
}
}
unlock_device_hotplug();
return ret ? ret : count;
}
#else /* !CONFIG_HOTPLUG_SMT */
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
return -ENODEV;
}
#endif /* CONFIG_HOTPLUG_SMT */
static const char *smt_states[] = {
[CPU_SMT_ENABLED] = "on",
[CPU_SMT_DISABLED] = "off",
[CPU_SMT_FORCE_DISABLED] = "forceoff",
[CPU_SMT_NOT_SUPPORTED] = "notsupported",
[CPU_SMT_NOT_IMPLEMENTED] = "notimplemented",
};
static ssize_t control_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
const char *state = smt_states[cpu_smt_control];
return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
}
static ssize_t control_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
return __store_smt_control(dev, attr, buf, count);
}
static DEVICE_ATTR_RW(control);
static ssize_t active_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
}
static DEVICE_ATTR_RO(active);
static struct attribute *cpuhp_smt_attrs[] = {
&dev_attr_control.attr,
&dev_attr_active.attr,
NULL
};
static const struct attribute_group cpuhp_smt_attr_group = {
.attrs = cpuhp_smt_attrs,
.name = "smt",
NULL
};
static int __init cpu_smt_sysfs_init(void)
{
return sysfs_create_group(&cpu_subsys.dev_root->kobj,
&cpuhp_smt_attr_group);
}
static int __init cpuhp_sysfs_init(void)
{
int cpu, ret;
ret = cpu_smt_sysfs_init();
if (ret)
return ret;
ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
&cpuhp_cpu_root_attr_group);
if (ret)
return ret;
for_each_possible_cpu(cpu) {
struct device *dev = get_cpu_device(cpu);
if (!dev)
continue;
ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group);
if (ret)
return ret;
}
return 0;
}
device_initcall(cpuhp_sysfs_init);
#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */
/*
* cpu_bit_bitmap[] is a special, "compressed" data structure that
* represents all NR_CPUS bits binary values of 1<<nr.
*
* It is used by cpumask_of() to get a constant address to a CPU
* mask value that has a single bit set only.
*/
/* cpu_bit_bitmap[0] is empty - so we can back into it */
#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
MASK_DECLARE_8(0), MASK_DECLARE_8(8),
MASK_DECLARE_8(16), MASK_DECLARE_8(24),
#if BITS_PER_LONG > 32
MASK_DECLARE_8(32), MASK_DECLARE_8(40),
MASK_DECLARE_8(48), MASK_DECLARE_8(56),
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
struct cpumask __cpu_possible_mask __read_mostly
= {CPU_BITS_ALL};
#else
struct cpumask __cpu_possible_mask __read_mostly;
#endif
EXPORT_SYMBOL(__cpu_possible_mask);
struct cpumask __cpu_online_mask __read_mostly;
EXPORT_SYMBOL(__cpu_online_mask);
struct cpumask __cpu_present_mask __read_mostly;
EXPORT_SYMBOL(__cpu_present_mask);
struct cpumask __cpu_active_mask __read_mostly;
EXPORT_SYMBOL(__cpu_active_mask);
struct cpumask __cpu_dying_mask __read_mostly;
EXPORT_SYMBOL(__cpu_dying_mask);
atomic_t __num_online_cpus __read_mostly;
EXPORT_SYMBOL(__num_online_cpus);
void init_cpu_present(const struct cpumask *src)
{
cpumask_copy(&__cpu_present_mask, src);
}
void init_cpu_possible(const struct cpumask *src)
{
cpumask_copy(&__cpu_possible_mask, src);
}
void init_cpu_online(const struct cpumask *src)
{
cpumask_copy(&__cpu_online_mask, src);
}
void set_cpu_online(unsigned int cpu, bool online)
{
/*
* atomic_inc/dec() is required to handle the horrid abuse of this
* function by the reboot and kexec code which invoke it from
* IPI/NMI broadcasts when shutting down CPUs. Invocation from
* regular CPU hotplug is properly serialized.
*
* Note, that the fact that __num_online_cpus is of type atomic_t
* does not protect readers which are not serialized against
* concurrent hotplug operations.
*/
if (online) {
if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
atomic_inc(&__num_online_cpus);
} else {
if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
atomic_dec(&__num_online_cpus);
}
}
/*
* Activate the first processor.
*/
void __init boot_cpu_init(void)
{
int cpu = smp_processor_id();
/* Mark the boot cpu "present", "online" etc for SMP and UP case */
set_cpu_online(cpu, true);
set_cpu_active(cpu, true);
set_cpu_present(cpu, true);
set_cpu_possible(cpu, true);
#ifdef CONFIG_SMP
__boot_cpu_id = cpu;
#endif
}
/*
* Must be called _AFTER_ setting up the per_cpu areas
*/
void __init boot_cpu_hotplug_init(void)
{
#ifdef CONFIG_SMP
cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}
/*
* These are used for a global "mitigations=" cmdline option for toggling
* optional CPU mitigations.
*/
enum cpu_mitigations {
CPU_MITIGATIONS_OFF,
CPU_MITIGATIONS_AUTO,
CPU_MITIGATIONS_AUTO_NOSMT,
};
static enum cpu_mitigations cpu_mitigations __ro_after_init =
CPU_MITIGATIONS_AUTO;
static int __init mitigations_parse_cmdline(char *arg)
{
if (!strcmp(arg, "off"))
cpu_mitigations = CPU_MITIGATIONS_OFF;
else if (!strcmp(arg, "auto"))
cpu_mitigations = CPU_MITIGATIONS_AUTO;
else if (!strcmp(arg, "auto,nosmt"))
cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
else
pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
arg);
return 0;
}
early_param("mitigations", mitigations_parse_cmdline);
/* mitigations=off */
bool cpu_mitigations_off(void)
{
return cpu_mitigations == CPU_MITIGATIONS_OFF;
}
EXPORT_SYMBOL_GPL(cpu_mitigations_off);
/* mitigations=auto,nosmt */
bool cpu_mitigations_auto_nosmt(void)
{
return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
}
EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __FS_NOTIFY_FSNOTIFY_H_
#define __FS_NOTIFY_FSNOTIFY_H_
#include <linux/list.h>
#include <linux/fsnotify.h>
#include <linux/srcu.h>
#include <linux/types.h>
#include "../mount.h"
static inline struct inode *fsnotify_conn_inode(
struct fsnotify_mark_connector *conn)
{
return container_of(conn->obj, struct inode, i_fsnotify_marks);
}
static inline struct mount *fsnotify_conn_mount(
struct fsnotify_mark_connector *conn)
{
return container_of(conn->obj, struct mount, mnt_fsnotify_marks);
}
static inline struct super_block *fsnotify_conn_sb(
struct fsnotify_mark_connector *conn)
{
return container_of(conn->obj, struct super_block, s_fsnotify_marks);
}
static inline struct super_block *fsnotify_connector_sb(
struct fsnotify_mark_connector *conn)
{
switch (conn->type) {
case FSNOTIFY_OBJ_TYPE_INODE:
return fsnotify_conn_inode(conn)->i_sb;
case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
return fsnotify_conn_mount(conn)->mnt.mnt_sb;
case FSNOTIFY_OBJ_TYPE_SB:
return fsnotify_conn_sb(conn);
default:
return NULL;
}
}
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);
/* protects reads of inode and vfsmount marks list */
extern struct srcu_struct fsnotify_mark_srcu;
/* compare two groups for sorting of marks lists */
extern int fsnotify_compare_groups(struct fsnotify_group *a,
struct fsnotify_group *b);
/* Destroy all marks attached to an object via connector */
extern void fsnotify_destroy_marks(fsnotify_connp_t *connp);
/* run the list of all marks associated with inode and destroy them */
static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
{
fsnotify_destroy_marks(&inode->i_fsnotify_marks);
}
/* run the list of all marks associated with vfsmount and destroy them */
static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
{
fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks);
}
/* run the list of all marks associated with sb and destroy them */
static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
{
fsnotify_destroy_marks(&sb->s_fsnotify_marks);
}
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
*/
extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
/* allocate and destroy and event holder to attach events to notification/access queues */
extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
extern struct kmem_cache *fsnotify_mark_connector_cachep;
#endif /* __FS_NOTIFY_FSNOTIFY_H_ */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_JIFFIES_H
#define _LINUX_JIFFIES_H
#include <linux/cache.h>
#include <linux/limits.h>
#include <linux/math64.h>
#include <linux/minmax.h>
#include <linux/types.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <vdso/jiffies.h>
#include <asm/param.h> /* for HZ */
#include <generated/timeconst.h>
/*
* The following defines establish the engineering parameters of the PLL
* model. The HZ variable establishes the timer interrupt frequency, 100 Hz
* for the SunOS kernel, 256 Hz for the Ultrix kernel and 1024 Hz for the
* OSF/1 kernel. The SHIFT_HZ define expresses the same value as the
* nearest power of two in order to avoid hardware multiply operations.
*/
#if HZ >= 12 && HZ < 24
# define SHIFT_HZ 4
#elif HZ >= 24 && HZ < 48
# define SHIFT_HZ 5
#elif HZ >= 48 && HZ < 96
# define SHIFT_HZ 6
#elif HZ >= 96 && HZ < 192
# define SHIFT_HZ 7
#elif HZ >= 192 && HZ < 384
# define SHIFT_HZ 8
#elif HZ >= 384 && HZ < 768
# define SHIFT_HZ 9
#elif HZ >= 768 && HZ < 1536
# define SHIFT_HZ 10
#elif HZ >= 1536 && HZ < 3072
# define SHIFT_HZ 11
#elif HZ >= 3072 && HZ < 6144
# define SHIFT_HZ 12
#elif HZ >= 6144 && HZ < 12288
# define SHIFT_HZ 13
#else
# error Invalid value of HZ.
#endif
/* Suppose we want to divide two numbers NOM and DEN: NOM/DEN, then we can
* improve accuracy by shifting LSH bits, hence calculating:
* (NOM << LSH) / DEN
* This however means trouble for large NOM, because (NOM << LSH) may no
* longer fit in 32 bits. The following way of calculating this gives us
* some slack, under the following conditions:
* - (NOM / DEN) fits in (32 - LSH) bits.
* - (NOM % DEN) fits in (32 - LSH) bits.
*/
#define SH_DIV(NOM,DEN,LSH) ( (((NOM) / (DEN)) << (LSH)) \
+ ((((NOM) % (DEN)) << (LSH)) + (DEN) / 2) / (DEN))
/* LATCH is used in the interval timer and ftape setup. */
#define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */
extern int register_refined_jiffies(long clock_tick_rate);
/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */
#define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ)
/* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
#define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ)
#ifndef __jiffy_arch_data
#define __jiffy_arch_data
#endif
/*
* The 64-bit value is not atomic - you MUST NOT read it
* without sampling the sequence number in jiffies_lock.
* get_jiffies_64() will do this for you as appropriate.
*/
extern u64 __cacheline_aligned_in_smp jiffies_64;
extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies;
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void);
#else
static inline u64 get_jiffies_64(void)
{
return (u64)jiffies;
}
#endif
/*
* These inlines deal with timer wrapping correctly. You are
* strongly encouraged to use them
* 1. Because people otherwise forget
* 2. Because if the timer wrap changes in future you won't have to
* alter your driver code.
*
* time_after(a,b) returns true if the time a is after time b.
*
* Do this with "<0" and ">=0" to only test the sign of the result. A
* good compiler would generate better code (and a really good compiler
* wouldn't care). Gcc is currently neither.
*/
#define time_after(a,b) \
(typecheck(unsigned long, a) && \
typecheck(unsigned long, b) && \
((long)((b) - (a)) < 0))
#define time_before(a,b) time_after(b,a)
#define time_after_eq(a,b) \
(typecheck(unsigned long, a) && \
typecheck(unsigned long, b) && \
((long)((a) - (b)) >= 0))
#define time_before_eq(a,b) time_after_eq(b,a)
/*
* Calculate whether a is in the range of [b, c].
*/
#define time_in_range(a,b,c) \
(time_after_eq(a,b) && \
time_before_eq(a,c))
/*
* Calculate whether a is in the range of [b, c).
*/
#define time_in_range_open(a,b,c) \
(time_after_eq(a,b) && \
time_before(a,c))
/* Same as above, but does so with platform independent 64bit types.
* These must be used when utilizing jiffies_64 (i.e. return value of
* get_jiffies_64() */
#define time_after64(a,b) \
(typecheck(__u64, a) && \
typecheck(__u64, b) && \
((__s64)((b) - (a)) < 0))
#define time_before64(a,b) time_after64(b,a)
#define time_after_eq64(a,b) \
(typecheck(__u64, a) && \
typecheck(__u64, b) && \
((__s64)((a) - (b)) >= 0))
#define time_before_eq64(a,b) time_after_eq64(b,a)
#define time_in_range64(a, b, c) \
(time_after_eq64(a, b) && \
time_before_eq64(a, c))
/*
* These four macros compare jiffies and 'a' for convenience.
*/
/* time_is_before_jiffies(a) return true if a is before jiffies */
#define time_is_before_jiffies(a) time_after(jiffies, a)
#define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a)
/* time_is_after_jiffies(a) return true if a is after jiffies */
#define time_is_after_jiffies(a) time_before(jiffies, a)
#define time_is_after_jiffies64(a) time_before64(get_jiffies_64(), a)
/* time_is_before_eq_jiffies(a) return true if a is before or equal to jiffies*/
#define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a)
#define time_is_before_eq_jiffies64(a) time_after_eq64(get_jiffies_64(), a)
/* time_is_after_eq_jiffies(a) return true if a is after or equal to jiffies*/
#define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a)
#define time_is_after_eq_jiffies64(a) time_before_eq64(get_jiffies_64(), a)
/*
* Have the 32 bit jiffies value wrap 5 minutes after boot
* so jiffies wrap bugs show up earlier.
*/
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
/*
* Change timeval to jiffies, trying to avoid the
* most obvious overflows..
*
* And some not so obvious.
*
* Note that we don't want to return LONG_MAX, because
* for various timeout reasons we often end up having
* to wait "jiffies+1" in order to guarantee that we wait
* at _least_ "jiffies" - so "jiffies+1" had better still
* be positive.
*/
#define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1)
extern unsigned long preset_lpj;
/*
* We want to do realistic conversions of time so we need to use the same
* values the update wall clock code uses as the jiffies size. This value
* is: TICK_NSEC (which is defined in timex.h). This
* is a constant and is in nanoseconds. We will use scaled math
* with a set of scales defined here as SEC_JIFFIE_SC, USEC_JIFFIE_SC and
* NSEC_JIFFIE_SC. Note that these defines contain nothing but
* constants and so are computed at compile time. SHIFT_HZ (computed in
* timex.h) adjusts the scaling for different HZ values.
* Scaled math??? What is that?
*
* Scaled math is a way to do integer math on values that would,
* otherwise, either overflow, underflow, or cause undesired div
* instructions to appear in the execution path. In short, we "scale"
* up the operands so they take more bits (more precision, less
* underflow), do the desired operation and then "scale" the result back
* by the same amount. If we do the scaling by shifting we avoid the
* costly mpy and the dastardly div instructions.
* Suppose, for example, we want to convert from seconds to jiffies
* where jiffies is defined in nanoseconds as NSEC_PER_JIFFIE. The
* simple math is: jiff = (sec * NSEC_PER_SEC) / NSEC_PER_JIFFIE; We
* observe that (NSEC_PER_SEC / NSEC_PER_JIFFIE) is a constant which we
* might calculate at compile time, however, the result will only have
* about 3-4 bits of precision (less for smaller values of HZ).
*
* So, we scale as follows:
* jiff = (sec) * (NSEC_PER_SEC / NSEC_PER_JIFFIE);
* jiff = ((sec) * ((NSEC_PER_SEC * SCALE)/ NSEC_PER_JIFFIE)) / SCALE;
* Then we make SCALE a power of two so:
* jiff = ((sec) * ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) >> SCALE;
* Now we define:
* #define SEC_CONV = ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE))
* jiff = (sec * SEC_CONV) >> SCALE;
*
* Often the math we use will expand beyond 32-bits so we tell C how to
* do this and pass the 64-bit result of the mpy through the ">> SCALE"
* which should take the result back to 32-bits. We want this expansion
* to capture as much precision as possible. At the same time we don't
* want to overflow so we pick the SCALE to avoid this. In this file,
* that means using a different scale for each range of HZ values (as
* defined in timex.h).
*
* For those who want to know, gcc will give a 64-bit result from a "*"
* operator if the result is a long long AND at least one of the
* operands is cast to long long (usually just prior to the "*" so as
* not to confuse it into thinking it really has a 64-bit operand,
* which, buy the way, it can do, but it takes more code and at least 2
* mpys).
* We also need to be aware that one second in nanoseconds is only a
* couple of bits away from overflowing a 32-bit word, so we MUST use
* 64-bits to get the full range time in nanoseconds.
*/
/*
* Here are the scales we will use. One for seconds, nanoseconds and
* microseconds.
*
* Within the limits of cpp we do a rough cut at the SEC_JIFFIE_SC and
* check if the sign bit is set. If not, we bump the shift count by 1.
* (Gets an extra bit of precision where we can use it.)
* We know it is set for HZ = 1024 and HZ = 100 not for 1000.
* Haven't tested others.
* Limits of cpp (for #if expressions) only long (no long long), but
* then we only need the most signicant bit.
*/
#define SEC_JIFFIE_SC (31 - SHIFT_HZ)
#if !((((NSEC_PER_SEC << 2) / TICK_NSEC) << (SEC_JIFFIE_SC - 2)) & 0x80000000)
#undef SEC_JIFFIE_SC
#define SEC_JIFFIE_SC (32 - SHIFT_HZ)
#endif
#define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29)
#define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\
TICK_NSEC -1) / (u64)TICK_NSEC))
#define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\
TICK_NSEC -1) / (u64)TICK_NSEC))
/*
* The maximum jiffie value is (MAX_INT >> 1). Here we translate that
* into seconds. The 64-bit case will overflow if we are not careful,
* so use the messy SH_DIV macro to do it. Still all constants.
*/
#if BITS_PER_LONG < 64
# define MAX_SEC_IN_JIFFIES \
(long)((u64)((u64)MAX_JIFFY_OFFSET * TICK_NSEC) / NSEC_PER_SEC)
#else /* take care of overflow on 64 bits machines */
# define MAX_SEC_IN_JIFFIES \
(SH_DIV((MAX_JIFFY_OFFSET >> SEC_JIFFIE_SC) * TICK_NSEC, NSEC_PER_SEC, 1) - 1)
#endif
/*
* Convert various time units to each other:
*/
extern unsigned int jiffies_to_msecs(const unsigned long j);
extern unsigned int jiffies_to_usecs(const unsigned long j);
static inline u64 jiffies_to_nsecs(const unsigned long j)
{
return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
}
extern u64 jiffies64_to_nsecs(u64 j);
extern u64 jiffies64_to_msecs(u64 j);
extern unsigned long __msecs_to_jiffies(const unsigned int m);
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
/*
* HZ is equal to or smaller than 1000, and 1000 is a nice round
* multiple of HZ, divide with the factor between them, but round
* upwards:
*/
static inline unsigned long _msecs_to_jiffies(const unsigned int m)
{
return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
}
#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
/*
* HZ is larger than 1000, and HZ is a nice round multiple of 1000 -
* simply multiply with the factor between them.
*
* But first make sure the multiplication result cannot overflow:
*/
static inline unsigned long _msecs_to_jiffies(const unsigned int m)
{
if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
return MAX_JIFFY_OFFSET;
return m * (HZ / MSEC_PER_SEC);
}
#else
/*
* Generic case - multiply, round and divide. But first check that if
* we are doing a net multiplication, that we wouldn't overflow:
*/
static inline unsigned long _msecs_to_jiffies(const unsigned int m)
{
if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
return MAX_JIFFY_OFFSET;
return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32;
}
#endif
/**
* msecs_to_jiffies: - convert milliseconds to jiffies
* @m: time in milliseconds
*
* conversion is done as follows:
*
* - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
*
* - 'too large' values [that would result in larger than
* MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
*
* - all other values are converted to jiffies by either multiplying
* the input value by a factor or dividing it with a factor and
* handling any 32-bit overflows.
* for the details see __msecs_to_jiffies()
*
* msecs_to_jiffies() checks for the passed in value being a constant
* via __builtin_constant_p() allowing gcc to eliminate most of the
* code, __msecs_to_jiffies() is called if the value passed does not
* allow constant folding and the actual conversion must be done at
* runtime.
* the HZ range specific helpers _msecs_to_jiffies() are called both
* directly here and from __msecs_to_jiffies() in the case where
* constant folding is not possible.
*/
static __always_inline unsigned long msecs_to_jiffies(const unsigned int m)
{
if (__builtin_constant_p(m)) {
if ((int)m < 0)
return MAX_JIFFY_OFFSET;
return _msecs_to_jiffies(m);
} else {
return __msecs_to_jiffies(m);
}
}
extern unsigned long __usecs_to_jiffies(const unsigned int u);
#if !(USEC_PER_SEC % HZ)
static inline unsigned long _usecs_to_jiffies(const unsigned int u)
{
return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
}
#else
static inline unsigned long _usecs_to_jiffies(const unsigned int u)
{
return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
>> USEC_TO_HZ_SHR32;
}
#endif
/**
* usecs_to_jiffies: - convert microseconds to jiffies
* @u: time in microseconds
*
* conversion is done as follows:
*
* - 'too large' values [that would result in larger than
* MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
*
* - all other values are converted to jiffies by either multiplying
* the input value by a factor or dividing it with a factor and
* handling any 32-bit overflows as for msecs_to_jiffies.
*
* usecs_to_jiffies() checks for the passed in value being a constant
* via __builtin_constant_p() allowing gcc to eliminate most of the
* code, __usecs_to_jiffies() is called if the value passed does not
* allow constant folding and the actual conversion must be done at
* runtime.
* the HZ range specific helpers _usecs_to_jiffies() are called both
* directly here and from __msecs_to_jiffies() in the case where
* constant folding is not possible.
*/
static __always_inline unsigned long usecs_to_jiffies(const unsigned int u)
{
if (__builtin_constant_p(u)) {
if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
return MAX_JIFFY_OFFSET;
return _usecs_to_jiffies(u);
} else {
return __usecs_to_jiffies(u);
}
}
extern unsigned long timespec64_to_jiffies(const struct timespec64 *value);
extern void jiffies_to_timespec64(const unsigned long jiffies,
struct timespec64 *value);
extern clock_t jiffies_to_clock_t(unsigned long x);
static inline clock_t jiffies_delta_to_clock_t(long delta)
{
return jiffies_to_clock_t(max(0L, delta));
}
static inline unsigned int jiffies_delta_to_msecs(long delta)
{
return jiffies_to_msecs(max(0L, delta));
}
extern unsigned long clock_t_to_jiffies(unsigned long x);
extern u64 jiffies_64_to_clock_t(u64 x);
extern u64 nsec_to_clock_t(u64 x);
extern u64 nsecs_to_jiffies64(u64 n);
extern unsigned long nsecs_to_jiffies(u64 n);
#define TIMESTAMP_SIZE 30
#endif
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/lib/vsprintf.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
/* vsprintf.c -- Lars Wirzenius & Linus Torvalds. */
/*
* Wirzenius wrote this portably, Torvalds fucked it up :-)
*/
/*
* Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel@datastacks.com>
* - changed to provide snprintf and vsnprintf functions
* So Feb 1 16:51:32 CET 2004 Juergen Quade <quade@hsnr.de>
* - scnprintf and vscnprintf
*/
#include <linux/stdarg.h>
#include <linux/build_bug.h>
#include <linux/clk.h>
#include <linux/clk-provider.h>
#include <linux/errname.h>
#include <linux/module.h> /* for KSYM_SYMBOL_LEN */
#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
#include <linux/kallsyms.h>
#include <linux/math64.h>
#include <linux/uaccess.h>
#include <linux/ioport.h>
#include <linux/dcache.h>
#include <linux/cred.h>
#include <linux/rtc.h>
#include <linux/time.h>
#include <linux/uuid.h>
#include <linux/of.h>
#include <net/addrconf.h>
#include <linux/siphash.h>
#include <linux/compiler.h>
#include <linux/property.h>
#ifdef CONFIG_BLOCK
#include <linux/blkdev.h>
#endif
#include "../mm/internal.h" /* For the trace_print_flags arrays */
#include <asm/page.h> /* for PAGE_SIZE */
#include <asm/byteorder.h> /* cpu_to_le16 */
#include <asm/unaligned.h>
#include <linux/string_helpers.h>
#include "kstrtox.h"
/* Disable pointer hashing if requested */
bool no_hash_pointers __ro_after_init;
EXPORT_SYMBOL_GPL(no_hash_pointers);
static noinline unsigned long long simple_strntoull(const char *startp, size_t max_chars, char **endp, unsigned int base)
{
const char *cp;
unsigned long long result = 0ULL;
size_t prefix_chars;
unsigned int rv;
cp = _parse_integer_fixup_radix(startp, &base);
prefix_chars = cp - startp;
if (prefix_chars < max_chars) {
rv = _parse_integer_limit(cp, base, &result, max_chars - prefix_chars);
/* FIXME */
cp += (rv & ~KSTRTOX_OVERFLOW);
} else {
/* Field too short for prefix + digit, skip over without converting */
cp = startp + max_chars;
}
if (endp) *endp = (char *)cp; return result;
}
/**
* simple_strtoull - convert a string to an unsigned long long
* @cp: The start of the string
* @endp: A pointer to the end of the parsed string will be placed here
* @base: The number base to use
*
* This function has caveats. Please use kstrtoull instead.
*/
noinline
unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
{
return simple_strntoull(cp, INT_MAX, endp, base);
}
EXPORT_SYMBOL(simple_strtoull);
/**
* simple_strtoul - convert a string to an unsigned long
* @cp: The start of the string
* @endp: A pointer to the end of the parsed string will be placed here
* @base: The number base to use
*
* This function has caveats. Please use kstrtoul instead.
*/
unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base)
{
return simple_strtoull(cp, endp, base);
}
EXPORT_SYMBOL(simple_strtoul);
/**
* simple_strtol - convert a string to a signed long
* @cp: The start of the string
* @endp: A pointer to the end of the parsed string will be placed here
* @base: The number base to use
*
* This function has caveats. Please use kstrtol instead.
*/
long simple_strtol(const char *cp, char **endp, unsigned int base)
{
if (*cp == '-') return -simple_strtoul(cp + 1, endp, base);
return simple_strtoul(cp, endp, base);
}
EXPORT_SYMBOL(simple_strtol);
static long long simple_strntoll(const char *cp, size_t max_chars, char **endp,
unsigned int base)
{
/*
* simple_strntoull() safely handles receiving max_chars==0 in the
* case cp[0] == '-' && max_chars == 1.
* If max_chars == 0 we can drop through and pass it to simple_strntoull()
* and the content of *cp is irrelevant.
*/
if (*cp == '-' && max_chars > 0)
return -simple_strntoull(cp + 1, max_chars - 1, endp, base);
return simple_strntoull(cp, max_chars, endp, base);
}
/**
* simple_strtoll - convert a string to a signed long long
* @cp: The start of the string
* @endp: A pointer to the end of the parsed string will be placed here
* @base: The number base to use
*
* This function has caveats. Please use kstrtoll instead.
*/
long long simple_strtoll(const char *cp, char **endp, unsigned int base)
{
return simple_strntoll(cp, INT_MAX, endp, base);
}
EXPORT_SYMBOL(simple_strtoll);
static noinline_for_stack
int skip_atoi(const char **s)
{
int i = 0;
do {
i = i*10 + *((*s)++) - '0';
} while (isdigit(**s));
return i;
}
/*
* Decimal conversion is by far the most typical, and is used for
* /proc and /sys data. This directly impacts e.g. top performance
* with many processes running. We optimize it for speed by emitting
* two characters at a time, using a 200 byte lookup table. This
* roughly halves the number of multiplications compared to computing
* the digits one at a time. Implementation strongly inspired by the
* previous version, which in turn used ideas described at
* <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
* from the author, Douglas W. Jones).
*
* It turns out there is precisely one 26 bit fixed-point
* approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
* holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
* range happens to be somewhat larger (x <= 1073741898), but that's
* irrelevant for our purpose.
*
* For dividing a number in the range [10^4, 10^6-1] by 100, we still
* need a 32x32->64 bit multiply, so we simply use the same constant.
*
* For dividing a number in the range [100, 10^4-1] by 100, there are
* several options. The simplest is (x * 0x147b) >> 19, which is valid
* for all x <= 43698.
*/
static const u16 decpair[100] = {
#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
_( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
_(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
_(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
_(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
_(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
_(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
_(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
_(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
_(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
_(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
#undef _
};
/*
* This will print a single '0' even if r == 0, since we would
* immediately jump to out_r where two 0s would be written but only
* one of them accounted for in buf. This is needed by ip4_string
* below. All other callers pass a non-zero value of r.
*/
static noinline_for_stack
char *put_dec_trunc8(char *buf, unsigned r)
{
unsigned q;
/* 1 <= r < 10^8 */
if (r < 100)
goto out_r;
/* 100 <= r < 10^8 */
q = (r * (u64)0x28f5c29) >> 32;
*((u16 *)buf) = decpair[r - 100*q];
buf += 2;
/* 1 <= q < 10^6 */
if (q < 100)
goto out_q;
/* 100 <= q < 10^6 */
r = (q * (u64)0x28f5c29) >> 32;
*((u16 *)buf) = decpair[q - 100*r];
buf += 2;
/* 1 <= r < 10^4 */
if (r < 100)
goto out_r;
/* 100 <= r < 10^4 */
q = (r * 0x147b) >> 19;
*((u16 *)buf) = decpair[r - 100*q];
buf += 2;
out_q:
/* 1 <= q < 100 */
r = q;
out_r:
/* 1 <= r < 100 */
*((u16 *)buf) = decpair[r]; buf += r < 10 ? 1 : 2;
return buf;
}
#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
static noinline_for_stack
char *put_dec_full8(char *buf, unsigned r)
{
unsigned q;
/* 0 <= r < 10^8 */
q = (r * (u64)0x28f5c29) >> 32;
*((u16 *)buf) = decpair[r - 100*q];
buf += 2;
/* 0 <= q < 10^6 */
r = (q * (u64)0x28f5c29) >> 32;
*((u16 *)buf) = decpair[q - 100*r];
buf += 2;
/* 0 <= r < 10^4 */
q = (r * 0x147b) >> 19;
*((u16 *)buf) = decpair[r - 100*q];
buf += 2;
/* 0 <= q < 100 */
*((u16 *)buf) = decpair[q];
buf += 2;
return buf;
}
static noinline_for_stack
char *put_dec(char *buf, unsigned long long n)
{
if (n >= 100*1000*1000) buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
/* 1 <= n <= 1.6e11 */
if (n >= 100*1000*1000)
buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
/* 1 <= n < 1e8 */
return put_dec_trunc8(buf, n);
}
#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64
static void
put_dec_full4(char *buf, unsigned r)
{
unsigned q;
/* 0 <= r < 10^4 */
q = (r * 0x147b) >> 19;
*((u16 *)buf) = decpair[r - 100*q];
buf += 2;
/* 0 <= q < 100 */
*((u16 *)buf) = decpair[q];
}
/*
* Call put_dec_full4 on x % 10000, return x / 10000.
* The approximation x/10000 == (x * 0x346DC5D7) >> 43
* holds for all x < 1,128,869,999. The largest value this
* helper will ever be asked to convert is 1,125,520,955.
* (second call in the put_dec code, assuming n is all-ones).
*/
static noinline_for_stack
unsigned put_dec_helper4(char *buf, unsigned x)
{
uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;
put_dec_full4(buf, x - q * 10000);
return q;
}
/* Based on code by Douglas W. Jones found at
* <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
* (with permission from the author).
* Performs no 64-bit division and hence should be fast on 32-bit machines.
*/
static
char *put_dec(char *buf, unsigned long long n)
{
uint32_t d3, d2, d1, q, h;
if (n < 100*1000*1000)
return put_dec_trunc8(buf, n);
d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
h = (n >> 32);
d2 = (h ) & 0xffff;
d3 = (h >> 16); /* implicit "& 0xffff" */
/* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
= 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
q = put_dec_helper4(buf, q);
q += 7671 * d3 + 9496 * d2 + 6 * d1;
q = put_dec_helper4(buf+4, q);
q += 4749 * d3 + 42 * d2;
q = put_dec_helper4(buf+8, q);
q += 281 * d3;
buf += 12;
if (q)
buf = put_dec_trunc8(buf, q);
else while (buf[-1] == '0')
--buf;
return buf;
}
#endif
/*
* Convert passed number to decimal string.
* Returns the length of string. On buffer overflow, returns 0.
*
* If speed is not important, use snprintf(). It's easy to read the code.
*/
int num_to_str(char *buf, int size, unsigned long long num, unsigned int width)
{
/* put_dec requires 2-byte alignment of the buffer. */
char tmp[sizeof(num) * 3] __aligned(2);
int idx, len;
/* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
if (num <= 9) {
tmp[0] = '0' + num;
len = 1;
} else {
len = put_dec(tmp, num) - tmp;
}
if (len > size || width > size)
return 0;
if (width > len) {
width = width - len;
for (idx = 0; idx < width; idx++)
buf[idx] = ' ';
} else {
width = 0;
}
for (idx = 0; idx < len; ++idx)
buf[idx + width] = tmp[len - idx - 1];
return len + width;
}
#define SIGN 1 /* unsigned/signed, must be 1 */
#define LEFT 2 /* left justified */
#define PLUS 4 /* show plus */
#define SPACE 8 /* space if plus */
#define ZEROPAD 16 /* pad with zero, must be 16 == '0' - ' ' */
#define SMALL 32 /* use lowercase in hex (must be 32 == 0x20) */
#define SPECIAL 64 /* prefix hex with "0x", octal with "0" */
static_assert(ZEROPAD == ('0' - ' '));
static_assert(SMALL == ' ');
enum format_type {
FORMAT_TYPE_NONE, /* Just a string part */
FORMAT_TYPE_WIDTH,
FORMAT_TYPE_PRECISION,
FORMAT_TYPE_CHAR,
FORMAT_TYPE_STR,
FORMAT_TYPE_PTR,
FORMAT_TYPE_PERCENT_CHAR,
FORMAT_TYPE_INVALID,
FORMAT_TYPE_LONG_LONG,
FORMAT_TYPE_ULONG,
FORMAT_TYPE_LONG,
FORMAT_TYPE_UBYTE,
FORMAT_TYPE_BYTE,
FORMAT_TYPE_USHORT,
FORMAT_TYPE_SHORT,
FORMAT_TYPE_UINT,
FORMAT_TYPE_INT,
FORMAT_TYPE_SIZE_T,
FORMAT_TYPE_PTRDIFF
};
struct printf_spec {
unsigned int type:8; /* format_type enum */
signed int field_width:24; /* width of output field */
unsigned int flags:8; /* flags to number() */
unsigned int base:8; /* number base, 8, 10 or 16 only */
signed int precision:16; /* # of digits/chars */
} __packed;
static_assert(sizeof(struct printf_spec) == 8);
#define FIELD_WIDTH_MAX ((1 << 23) - 1)
#define PRECISION_MAX ((1 << 15) - 1)
static noinline_for_stack
char *number(char *buf, char *end, unsigned long long num,
struct printf_spec spec)
{
/* put_dec requires 2-byte alignment of the buffer. */
char tmp[3 * sizeof(num)] __aligned(2);
char sign;
char locase;
int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
int i;
bool is_zero = num == 0LL;
int field_width = spec.field_width;
int precision = spec.precision;
/* locase = 0 or 0x20. ORing digits or letters with 'locase'
* produces same digits or (maybe lowercased) letters */
locase = (spec.flags & SMALL);
if (spec.flags & LEFT) spec.flags &= ~ZEROPAD;
sign = 0;
if (spec.flags & SIGN) {
if ((signed long long)num < 0) {
sign = '-';
num = -(signed long long)num;
field_width--;
} else if (spec.flags & PLUS) { sign = '+'; field_width--; } else if (spec.flags & SPACE) {
sign = ' ';
field_width--;
}
}
if (need_pfx) { if (spec.base == 16) field_width -= 2; else if (!is_zero) field_width--;
}
/* generate full string in tmp[], in reverse order */
i = 0;
if (num < spec.base) tmp[i++] = hex_asc_upper[num] | locase; else if (spec.base != 10) { /* 8 or 16 */ int mask = spec.base - 1;
int shift = 3;
if (spec.base == 16)
shift = 4;
do {
tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase);
num >>= shift;
} while (num);
} else { /* base 10 */
i = put_dec(tmp, num) - tmp;
}
/* printing 100 using %2d gives "100", not "00" */
if (i > precision)
precision = i;
/* leading space padding */
field_width -= precision;
if (!(spec.flags & (ZEROPAD | LEFT))) {
while (--field_width >= 0) { if (buf < end) *buf = ' '; ++buf;
}
}
/* sign */
if (sign) { if (buf < end) *buf = sign; ++buf;
}
/* "0x" / "0" prefix */
if (need_pfx) { if (spec.base == 16 || !is_zero) { if (buf < end) *buf = '0';
++buf;
}
if (spec.base == 16) {
if (buf < end) *buf = ('X' | locase); ++buf;
}
}
/* zero or space padding */
if (!(spec.flags & LEFT)) { char c = ' ' + (spec.flags & ZEROPAD); while (--field_width >= 0) { if (buf < end) *buf = c; ++buf;
}
}
/* hmm even more zero padding? */
while (i <= --precision) { if (buf < end) *buf = '0'; ++buf;
}
/* actual digits of result */
while (--i >= 0) { if (buf < end) *buf = tmp[i]; ++buf;
}
/* trailing space padding */
while (--field_width >= 0) { if (buf < end) *buf = ' '; ++buf;
}
return buf;
}
static noinline_for_stack
char *special_hex_number(char *buf, char *end, unsigned long long num, int size)
{
struct printf_spec spec;
spec.type = FORMAT_TYPE_PTR;
spec.field_width = 2 + 2 * size; /* 0x + hex */
spec.flags = SPECIAL | SMALL | ZEROPAD;
spec.base = 16;
spec.precision = -1;
return number(buf, end, num, spec);
}
static void move_right(char *buf, char *end, unsigned len, unsigned spaces)
{
size_t size;
if (buf >= end) /* nowhere to put anything */
return;
size = end - buf;
if (size <= spaces) {
memset(buf, ' ', size);
return;
}
if (len) {
if (len > size - spaces) len = size - spaces; memmove(buf + spaces, buf, len);
}
memset(buf, ' ', spaces);
}
/*
* Handle field width padding for a string.
* @buf: current buffer position
* @n: length of string
* @end: end of output buffer
* @spec: for field width and flags
* Returns: new buffer position after padding.
*/
static noinline_for_stack
char *widen_string(char *buf, int n, char *end, struct printf_spec spec)
{
unsigned spaces;
if (likely(n >= spec.field_width))
return buf;
/* we want to pad the sucker */
spaces = spec.field_width - n;
if (!(spec.flags & LEFT)) {
move_right(buf - n, end, n, spaces); return buf + spaces;
}
while (spaces--) { if (buf < end) *buf = ' '; ++buf;
}
return buf;
}
/* Handle string from a well known address. */
static char *string_nocheck(char *buf, char *end, const char *s,
struct printf_spec spec)
{
int len = 0;
int lim = spec.precision;
while (lim--) {
char c = *s++;
if (!c)
break;
if (buf < end) *buf = c; ++buf;
++len;
}
return widen_string(buf, len, end, spec);
}
static char *err_ptr(char *buf, char *end, void *ptr,
struct printf_spec spec)
{
int err = PTR_ERR(ptr);
const char *sym = errname(err);
if (sym)
return string_nocheck(buf, end, sym, spec);
/*
* Somebody passed ERR_PTR(-1234) or some other non-existing
* Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to
* printing it as its decimal representation.
*/
spec.flags |= SIGN;
spec.base = 10;
return number(buf, end, err, spec);
}
/* Be careful: error messages must fit into the given buffer. */
static char *error_string(char *buf, char *end, const char *s,
struct printf_spec spec)
{
/*
* Hard limit to avoid a completely insane messages. It actually
* works pretty well because most error messages are in
* the many pointer format modifiers.
*/
if (spec.precision == -1)
spec.precision = 2 * sizeof(void *);
return string_nocheck(buf, end, s, spec);
}
/*
* Do not call any complex external code here. Nested printk()/vsprintf()
* might cause infinite loops. Failures might break printk() and would
* be hard to debug.
*/
static const char *check_pointer_msg(const void *ptr)
{
if (!ptr) return "(null)"; if ((unsigned long)ptr < PAGE_SIZE || IS_ERR_VALUE(ptr))
return "(efault)";
return NULL;
}
static int check_pointer(char **buf, char *end, const void *ptr,
struct printf_spec spec)
{
const char *err_msg;
err_msg = check_pointer_msg(ptr);
if (err_msg) {
*buf = error_string(*buf, end, err_msg, spec);
return -EFAULT;
}
return 0;
}
static noinline_for_stack
char *string(char *buf, char *end, const char *s,
struct printf_spec spec)
{
if (check_pointer(&buf, end, s, spec)) return buf; return string_nocheck(buf, end, s, spec);
}
static char *pointer_string(char *buf, char *end,
const void *ptr,
struct printf_spec spec)
{
spec.base = 16;
spec.flags |= SMALL;
if (spec.field_width == -1) {
spec.field_width = 2 * sizeof(ptr);
spec.flags |= ZEROPAD;
}
return number(buf, end, (unsigned long int)ptr, spec);
}
/* Make pointers available for printing early in the boot sequence. */
static int debug_boot_weak_hash __ro_after_init;
static int __init debug_boot_weak_hash_enable(char *str)
{
debug_boot_weak_hash = 1;
pr_info("debug_boot_weak_hash enabled\n");
return 0;
}
early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable);
static DEFINE_STATIC_KEY_TRUE(not_filled_random_ptr_key);
static siphash_key_t ptr_key __read_mostly;
static void enable_ptr_key_workfn(struct work_struct *work)
{
get_random_bytes(&ptr_key, sizeof(ptr_key));
/* Needs to run from preemptible context */
static_branch_disable(¬_filled_random_ptr_key);
}
static DECLARE_WORK(enable_ptr_key_work, enable_ptr_key_workfn);
static void fill_random_ptr_key(struct random_ready_callback *unused)
{
/* This may be in an interrupt handler. */
queue_work(system_unbound_wq, &enable_ptr_key_work);
}
static struct random_ready_callback random_ready = {
.func = fill_random_ptr_key
};
static int __init initialize_ptr_random(void)
{
int key_size = sizeof(ptr_key);
int ret;
/* Use hw RNG if available. */
if (get_random_bytes_arch(&ptr_key, key_size) == key_size) {
static_branch_disable(¬_filled_random_ptr_key);
return 0;
}
ret = add_random_ready_callback(&random_ready);
if (!ret) {
return 0;
} else if (ret == -EALREADY) {
/* This is in preemptible context */
enable_ptr_key_workfn(&enable_ptr_key_work);
return 0;
}
return ret;
}
early_initcall(initialize_ptr_random);
/* Maps a pointer to a 32 bit unique identifier. */
static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
unsigned long hashval;
if (static_branch_unlikely(¬_filled_random_ptr_key))
return -EAGAIN;
#ifdef CONFIG_64BIT
hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);
/*
* Mask off the first 32 bits, this makes explicit that we have
* modified the address (and 32 bits is plenty for a unique ID).
*/
hashval = hashval & 0xffffffff;
#else
hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
#endif
*hashval_out = hashval;
return 0;
}
int ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
return __ptr_to_hashval(ptr, hashval_out);
}
static char *ptr_to_id(char *buf, char *end, const void *ptr,
struct printf_spec spec)
{
const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)";
unsigned long hashval;
int ret;
/*
* Print the real pointer value for NULL and error pointers,
* as they are not actual addresses.
*/
if (IS_ERR_OR_NULL(ptr))
return pointer_string(buf, end, ptr, spec);
/* When debugging early boot use non-cryptographically secure hash. */
if (unlikely(debug_boot_weak_hash)) {
hashval = hash_long((unsigned long)ptr, 32);
return pointer_string(buf, end, (const void *)hashval, spec);
}
ret = __ptr_to_hashval(ptr, &hashval);
if (ret) {
spec.field_width = 2 * sizeof(ptr);
/* string length must be less than default_width */
return error_string(buf, end, str, spec);
}
return pointer_string(buf, end, (const void *)hashval, spec);
}
static char *default_pointer(char *buf, char *end, const void *ptr,
struct printf_spec spec)
{
/*
* default is to _not_ leak addresses, so hash before printing,
* unless no_hash_pointers is specified on the command line.
*/
if (unlikely(no_hash_pointers))
return pointer_string(buf, end, ptr, spec);
return ptr_to_id(buf, end, ptr, spec);
}
int kptr_restrict __read_mostly;
static noinline_for_stack
char *restricted_pointer(char *buf, char *end, const void *ptr,
struct printf_spec spec)
{
switch (kptr_restrict) {
case 0:
/* Handle as %p, hash and do _not_ leak addresses. */
return default_pointer(buf, end, ptr, spec);
case 1: {
const struct cred *cred;
/*
* kptr_restrict==1 cannot be used in IRQ context
* because its test for CAP_SYSLOG would be meaningless.
*/
if (in_irq() || in_serving_softirq() || in_nmi()) {
if (spec.field_width == -1)
spec.field_width = 2 * sizeof(ptr);
return error_string(buf, end, "pK-error", spec);
}
/*
* Only print the real pointer value if the current
* process has CAP_SYSLOG and is running with the
* same credentials it started with. This is because
* access to files is checked at open() time, but %pK
* checks permission at read() time. We don't want to
* leak pointer values if a binary opens a file using
* %pK and then elevates privileges before reading it.
*/
cred = current_cred();
if (!has_capability_noaudit(current, CAP_SYSLOG) ||
!uid_eq(cred->euid, cred->uid) ||
!gid_eq(cred->egid, cred->gid))
ptr = NULL;
break;
}
case 2:
default:
/* Always print 0's for %pK */
ptr = NULL;
break;
}
return pointer_string(buf, end, ptr, spec);
}
static noinline_for_stack
char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec,
const char *fmt)
{
const char *array[4], *s;
const struct dentry *p;
int depth;
int i, n;
switch (fmt[1]) {
case '2': case '3': case '4':
depth = fmt[1] - '0';
break;
default:
depth = 1;
}
rcu_read_lock();
for (i = 0; i < depth; i++, d = p) {
if (check_pointer(&buf, end, d, spec)) {
rcu_read_unlock();
return buf;
}
p = READ_ONCE(d->d_parent);
array[i] = READ_ONCE(d->d_name.name);
if (p == d) {
if (i)
array[i] = "";
i++;
break;
}
}
s = array[--i];
for (n = 0; n != spec.precision; n++, buf++) {
char c = *s++;
if (!c) {
if (!i)
break;
c = '/';
s = array[--i];
}
if (buf < end)
*buf = c;
}
rcu_read_unlock();
return widen_string(buf, n, end, spec);
}
static noinline_for_stack
char *file_dentry_name(char *buf, char *end, const struct file *f,
struct printf_spec spec, const char *fmt)
{
if (check_pointer(&buf, end, f, spec))
return buf;
return dentry_name(buf, end, f->f_path.dentry, spec, fmt);
}
#ifdef CONFIG_BLOCK
static noinline_for_stack
char *bdev_name(char *buf, char *end, struct block_device *bdev,
struct printf_spec spec, const char *fmt)
{
struct gendisk *hd;
if (check_pointer(&buf, end, bdev, spec)) return buf; hd = bdev->bd_disk;
buf = string(buf, end, hd->disk_name, spec);
if (bdev->bd_partno) {
if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) { if (buf < end) *buf = 'p'; buf++;
}
buf = number(buf, end, bdev->bd_partno, spec);
}
return buf;
}
#endif
static noinline_for_stack
char *symbol_string(char *buf, char *end, void *ptr,
struct printf_spec spec, const char *fmt)
{
unsigned long value;
#ifdef CONFIG_KALLSYMS
char sym[KSYM_SYMBOL_LEN];
#endif
if (fmt[1] == 'R')
ptr = __builtin_extract_return_addr(ptr);
value = (unsigned long)ptr;
#ifdef CONFIG_KALLSYMS
if (*fmt == 'B' && fmt[1] == 'b')
sprint_backtrace_build_id(sym, value);
else if (*fmt == 'B')
sprint_backtrace(sym, value);
else if (*fmt == 'S' && (fmt[1] == 'b' || (fmt[1] == 'R' && fmt[2] == 'b')))
sprint_symbol_build_id(sym, value);
else if (*fmt != 's')
sprint_symbol(sym, value);
else
sprint_symbol_no_offset(sym, value);
return string_nocheck(buf, end, sym, spec);
#else
return special_hex_number(buf, end, value, sizeof(void *));
#endif
}
static const struct printf_spec default_str_spec = {
.field_width = -1,
.precision = -1,
};
static const struct printf_spec default_flag_spec = {
.base = 16,
.precision = -1,
.flags = SPECIAL | SMALL,
};
static const struct printf_spec default_dec_spec = {
.base = 10,
.precision = -1,
};
static const struct printf_spec default_dec02_spec = {
.base = 10,
.field_width = 2,
.precision = -1,
.flags = ZEROPAD,
};
static const struct printf_spec default_dec04_spec = {
.base = 10,
.field_width = 4,
.precision = -1,
.flags = ZEROPAD,
};
static noinline_for_stack
char *resource_string(char *buf, char *end, struct resource *res,
struct printf_spec spec, const char *fmt)
{
#ifndef IO_RSRC_PRINTK_SIZE
#define IO_RSRC_PRINTK_SIZE 6
#endif
#ifndef MEM_RSRC_PRINTK_SIZE
#define MEM_RSRC_PRINTK_SIZE 10
#endif
static const struct printf_spec io_spec = {
.base = 16,
.field_width = IO_RSRC_PRINTK_SIZE,
.precision = -1,
.flags = SPECIAL | SMALL | ZEROPAD,
};
static const struct printf_spec mem_spec = {
.base = 16,
.field_width = MEM_RSRC_PRINTK_SIZE,
.precision = -1,
.flags = SPECIAL | SMALL | ZEROPAD,
};
static const struct printf_spec bus_spec = {
.base = 16,
.field_width = 2,
.precision = -1,
.flags = SMALL | ZEROPAD,
};
static const struct printf_spec str_spec = {
.field_width = -1,
.precision = 10,
.flags = LEFT,
};
/* 32-bit res (sizeof==4): 10 chars in dec, 10 in hex ("0x" + 8)
* 64-bit res (sizeof==8): 20 chars in dec, 18 in hex ("0x" + 16) */
#define RSRC_BUF_SIZE ((2 * sizeof(resource_size_t)) + 4)
#define FLAG_BUF_SIZE (2 * sizeof(res->flags))
#define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]")
#define RAW_BUF_SIZE sizeof("[mem - flags 0x]")
char sym[max(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE,
2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)];
char *p = sym, *pend = sym + sizeof(sym);
int decode = (fmt[0] == 'R') ? 1 : 0;
const struct printf_spec *specp;
if (check_pointer(&buf, end, res, spec))
return buf;
*p++ = '[';
if (res->flags & IORESOURCE_IO) {
p = string_nocheck(p, pend, "io ", str_spec);
specp = &io_spec;
} else if (res->flags & IORESOURCE_MEM) {
p = string_nocheck(p, pend, "mem ", str_spec);
specp = &mem_spec;
} else if (res->flags & IORESOURCE_IRQ) {
p = string_nocheck(p, pend, "irq ", str_spec);
specp = &default_dec_spec;
} else if (res->flags & IORESOURCE_DMA) {
p = string_nocheck(p, pend, "dma ", str_spec);
specp = &default_dec_spec;
} else if (res->flags & IORESOURCE_BUS) {
p = string_nocheck(p, pend, "bus ", str_spec);
specp = &bus_spec;
} else {
p = string_nocheck(p, pend, "??? ", str_spec);
specp = &mem_spec;
decode = 0;
}
if (decode && res->flags & IORESOURCE_UNSET) {
p = string_nocheck(p, pend, "size ", str_spec);
p = number(p, pend, resource_size(res), *specp);
} else {
p = number(p, pend, res->start, *specp);
if (res->start != res->end) {
*p++ = '-';
p = number(p, pend, res->end, *specp);
}
}
if (decode) {
if (res->flags & IORESOURCE_MEM_64)
p = string_nocheck(p, pend, " 64bit", str_spec);
if (res->flags & IORESOURCE_PREFETCH)
p = string_nocheck(p, pend, " pref", str_spec);
if (res->flags & IORESOURCE_WINDOW)
p = string_nocheck(p, pend, " window", str_spec);
if (res->flags & IORESOURCE_DISABLED)
p = string_nocheck(p, pend, " disabled", str_spec);
} else {
p = string_nocheck(p, pend, " flags ", str_spec);
p = number(p, pend, res->flags, default_flag_spec);
}
*p++ = ']';
*p = '\0';
return string_nocheck(buf, end, sym, spec);
}
static noinline_for_stack
char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
const char *fmt)
{
int i, len = 1; /* if we pass '%ph[CDN]', field width remains
negative value, fallback to the default */
char separator;
if (spec.field_width == 0)
/* nothing to print */
return buf;
if (check_pointer(&buf, end, addr, spec))
return buf;
switch (fmt[1]) {
case 'C':
separator = ':';
break;
case 'D':
separator = '-';
break;
case 'N':
separator = 0;
break;
default:
separator = ' ';
break;
}
if (spec.field_width > 0)
len = min_t(int, spec.field_width, 64);
for (i = 0; i < len; ++i) {
if (buf < end)
*buf = hex_asc_hi(addr[i]);
++buf;
if (buf < end)
*buf = hex_asc_lo(addr[i]);
++buf;
if (separator && i != len - 1) {
if (buf < end)
*buf = separator;
++buf;
}
}
return buf;
}
static noinline_for_stack
char *bitmap_string(char *buf, char *end, unsigned long *bitmap,
struct printf_spec spec, const char *fmt)
{
const int CHUNKSZ = 32;
int nr_bits = max_t(int, spec.field_width, 0);
int i, chunksz;
bool first = true;
if (check_pointer(&buf, end, bitmap, spec))
return buf;
/* reused to print numbers */
spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 };
chunksz = nr_bits & (CHUNKSZ - 1);
if (chunksz == 0)
chunksz = CHUNKSZ;
i = ALIGN(nr_bits, CHUNKSZ) - CHUNKSZ;
for (; i >= 0; i -= CHUNKSZ) {
u32 chunkmask, val;
int word, bit;
chunkmask = ((1ULL << chunksz) - 1);
word = i / BITS_PER_LONG;
bit = i % BITS_PER_LONG;
val = (bitmap[word] >> bit) & chunkmask;
if (!first) {
if (buf < end)
*buf = ',';
buf++;
}
first = false;
spec.field_width = DIV_ROUND_UP(chunksz, 4);
buf = number(buf, end, val, spec);
chunksz = CHUNKSZ;
}
return buf;
}
static noinline_for_stack
char *bitmap_list_string(char *buf, char *end, unsigned long *bitmap,
struct printf_spec spec, const char *fmt)
{
int nr_bits = max_t(int, spec.field_width, 0);
/* current bit is 'cur', most recently seen range is [rbot, rtop] */
int cur, rbot, rtop;
bool first = true;
if (check_pointer(&buf, end, bitmap, spec))
return buf;
rbot = cur = find_first_bit(bitmap, nr_bits);
while (cur < nr_bits) {
rtop = cur;
cur = find_next_bit(bitmap, nr_bits, cur + 1);
if (cur < nr_bits && cur <= rtop + 1)
continue;
if (!first) {
if (buf < end)
*buf = ',';
buf++;
}
first = false;
buf = number(buf, end, rbot, default_dec_spec);
if (rbot < rtop) {
if (buf < end)
*buf = '-';
buf++;
buf = number(buf, end, rtop, default_dec_spec);
}
rbot = cur;
}
return buf;
}
static noinline_for_stack
char *mac_address_string(char *buf, char *end, u8 *addr,
struct printf_spec spec, const char *fmt)
{
char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")];
char *p = mac_addr;
int i;
char separator;
bool reversed = false;
if (check_pointer(&buf, end, addr, spec))
return buf;
switch (fmt[1]) {
case 'F':
separator = '-';
break;
case 'R':
reversed = true;
fallthrough;
default:
separator = ':';
break;
}
for (i = 0; i < 6; i++) {
if (reversed)
p = hex_byte_pack(p, addr[5 - i]);
else
p = hex_byte_pack(p, addr[i]);
if (fmt[0] == 'M' && i != 5)
*p++ = separator;
}
*p = '\0';
return string_nocheck(buf, end, mac_addr, spec);
}
static noinline_for_stack
char *ip4_string(char *p, const u8 *addr, const char *fmt)
{
int i;
bool leading_zeros = (fmt[0] == 'i');
int index;
int step;
switch (fmt[2]) {
case 'h':
#ifdef __BIG_ENDIAN
index = 0;
step = 1;
#else
index = 3;
step = -1;
#endif
break;
case 'l':
index = 3;
step = -1;
break;
case 'n':
case 'b':
default:
index = 0;
step = 1;
break;
}
for (i = 0; i < 4; i++) {
char temp[4] __aligned(2); /* hold each IP quad in reverse order */
int digits = put_dec_trunc8(temp, addr[index]) - temp;
if (leading_zeros) {
if (digits < 3)
*p++ = '0';
if (digits < 2)
*p++ = '0';
}
/* reverse the digits in the quad */
while (digits--)
*p++ = temp[digits];
if (i < 3)
*p++ = '.';
index += step;
}
*p = '\0';
return p;
}
static noinline_for_stack
char *ip6_compressed_string(char *p, const char *addr)
{
int i, j, range;
unsigned char zerolength[8];
int longest = 1;
int colonpos = -1;
u16 word;
u8 hi, lo;
bool needcolon = false;
bool useIPv4;
struct in6_addr in6;
memcpy(&in6, addr, sizeof(struct in6_addr));
useIPv4 = ipv6_addr_v4mapped(&in6) || ipv6_addr_is_isatap(&in6);
memset(zerolength, 0, sizeof(zerolength));
if (useIPv4)
range = 6;
else
range = 8;
/* find position of longest 0 run */
for (i = 0; i < range; i++) {
for (j = i; j < range; j++) {
if (in6.s6_addr16[j] != 0)
break;
zerolength[i]++;
}
}
for (i = 0; i < range; i++) {
if (zerolength[i] > longest) {
longest = zerolength[i];
colonpos = i;
}
}
if (longest == 1) /* don't compress a single 0 */
colonpos = -1;
/* emit address */
for (i = 0; i < range; i++) {
if (i == colonpos) {
if (needcolon || i == 0)
*p++ = ':';
*p++ = ':';
needcolon = false;
i += longest - 1;
continue;
}
if (needcolon) {
*p++ = ':';
needcolon = false;
}
/* hex u16 without leading 0s */
word = ntohs(in6.s6_addr16[i]);
hi = word >> 8;
lo = word & 0xff;
if (hi) {
if (hi > 0x0f)
p = hex_byte_pack(p, hi);
else
*p++ = hex_asc_lo(hi);
p = hex_byte_pack(p, lo);
}
else if (lo > 0x0f)
p = hex_byte_pack(p, lo);
else
*p++ = hex_asc_lo(lo);
needcolon = true;
}
if (useIPv4) {
if (needcolon)
*p++ = ':';
p = ip4_string(p, &in6.s6_addr[12], "I4");
}
*p = '\0';
return p;
}
static noinline_for_stack
char *ip6_string(char *p, const char *addr, const char *fmt)
{
int i;
for (i = 0; i < 8; i++) {
p = hex_byte_pack(p, *addr++);
p = hex_byte_pack(p, *addr++);
if (fmt[0] == 'I' && i != 7)
*p++ = ':';
}
*p = '\0';
return p;
}
static noinline_for_stack
char *ip6_addr_string(char *buf, char *end, const u8 *addr,
struct printf_spec spec, const char *fmt)
{
char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")];
if (fmt[0] == 'I' && fmt[2] == 'c')
ip6_compressed_string(ip6_addr, addr);
else
ip6_string(ip6_addr, addr, fmt);
return string_nocheck(buf, end, ip6_addr, spec);
}
static noinline_for_stack
char *ip4_addr_string(char *buf, char *end, const u8 *addr,
struct printf_spec spec, const char *fmt)
{
char ip4_addr[sizeof("255.255.255.255")];
ip4_string(ip4_addr, addr, fmt);
return string_nocheck(buf, end, ip4_addr, spec);
}
static noinline_for_stack
char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa,
struct printf_spec spec, const char *fmt)
{
bool have_p = false, have_s = false, have_f = false, have_c = false;
char ip6_addr[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") +
sizeof(":12345") + sizeof("/123456789") +
sizeof("%1234567890")];
char *p = ip6_addr, *pend = ip6_addr + sizeof(ip6_addr);
const u8 *addr = (const u8 *) &sa->sin6_addr;
char fmt6[2] = { fmt[0], '6' };
u8 off = 0;
fmt++;
while (isalpha(*++fmt)) {
switch (*fmt) {
case 'p':
have_p = true;
break;
case 'f':
have_f = true;
break;
case 's':
have_s = true;
break;
case 'c':
have_c = true;
break;
}
}
if (have_p || have_s || have_f) {
*p = '[';
off = 1;
}
if (fmt6[0] == 'I' && have_c)
p = ip6_compressed_string(ip6_addr + off, addr);
else
p = ip6_string(ip6_addr + off, addr, fmt6);
if (have_p || have_s || have_f)
*p++ = ']';
if (have_p) {
*p++ = ':';
p = number(p, pend, ntohs(sa->sin6_port), spec);
}
if (have_f) {
*p++ = '/';
p = number(p, pend, ntohl(sa->sin6_flowinfo &
IPV6_FLOWINFO_MASK), spec);
}
if (have_s) {
*p++ = '%';
p = number(p, pend, sa->sin6_scope_id, spec);
}
*p = '\0';
return string_nocheck(buf, end, ip6_addr, spec);
}
static noinline_for_stack
char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa,
struct printf_spec spec, const char *fmt)
{
bool have_p = false;
char *p, ip4_addr[sizeof("255.255.255.255") + sizeof(":12345")];
char *pend = ip4_addr + sizeof(ip4_addr);
const u8 *addr = (const u8 *) &sa->sin_addr.s_addr;
char fmt4[3] = { fmt[0], '4', 0 };
fmt++;
while (isalpha(*++fmt)) {
switch (*fmt) {
case 'p':
have_p = true;
break;
case 'h':
case 'l':
case 'n':
case 'b':
fmt4[2] = *fmt;
break;
}
}
p = ip4_string(ip4_addr, addr, fmt4);
if (have_p) {
*p++ = ':';
p = number(p, pend, ntohs(sa->sin_port), spec);
}
*p = '\0';
return string_nocheck(buf, end, ip4_addr, spec);
}
static noinline_for_stack
char *ip_addr_string(char *buf, char *end, const void *ptr,
struct printf_spec spec, const char *fmt)
{
char *err_fmt_msg;
if (check_pointer(&buf, end, ptr, spec))
return buf;
switch (fmt[1]) {
case '6':
return ip6_addr_string(buf, end, ptr, spec, fmt);
case '4':
return ip4_addr_string(buf, end, ptr, spec, fmt);
case 'S': {
const union {
struct sockaddr raw;
struct sockaddr_in v4;
struct sockaddr_in6 v6;
} *sa = ptr;
switch (sa->raw.sa_family) {
case AF_INET:
return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt);
case AF_INET6:
return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt);
default:
return error_string(buf, end, "(einval)", spec);
}}
}
err_fmt_msg = fmt[0] == 'i' ? "(%pi?)" : "(%pI?)";
return error_string(buf, end, err_fmt_msg, spec);
}
static noinline_for_stack
char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
const char *fmt)
{
bool found = true;
int count = 1;
unsigned int flags = 0;
int len;
if (spec.field_width == 0)
return buf; /* nothing to print */
if (check_pointer(&buf, end, addr, spec))
return buf;
do {
switch (fmt[count++]) {
case 'a':
flags |= ESCAPE_ANY;
break;
case 'c':
flags |= ESCAPE_SPECIAL;
break;
case 'h':
flags |= ESCAPE_HEX;
break;
case 'n':
flags |= ESCAPE_NULL;
break;
case 'o':
flags |= ESCAPE_OCTAL;
break;
case 'p':
flags |= ESCAPE_NP;
break;
case 's':
flags |= ESCAPE_SPACE;
break;
default:
found = false;
break;
}
} while (found);
if (!flags)
flags = ESCAPE_ANY_NP;
len = spec.field_width < 0 ? 1 : spec.field_width;
/*
* string_escape_mem() writes as many characters as it can to
* the given buffer, and returns the total size of the output
* had the buffer been big enough.
*/
buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL);
return buf;
}
static char *va_format(char *buf, char *end, struct va_format *va_fmt,
struct printf_spec spec, const char *fmt)
{
va_list va;
if (check_pointer(&buf, end, va_fmt, spec)) return buf; va_copy(va, *va_fmt->va); buf += vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va);
va_end(va);
return buf;
}
static noinline_for_stack
char *uuid_string(char *buf, char *end, const u8 *addr,
struct printf_spec spec, const char *fmt)
{
char uuid[UUID_STRING_LEN + 1];
char *p = uuid;
int i;
const u8 *index = uuid_index;
bool uc = false;
if (check_pointer(&buf, end, addr, spec)) return buf; switch (*(++fmt)) {
case 'L':
uc = true;
fallthrough;
case 'l':
index = guid_index;
break;
case 'B':
uc = true;
break;
}
for (i = 0; i < 16; i++) { if (uc)
p = hex_byte_pack_upper(p, addr[index[i]]);
else
p = hex_byte_pack(p, addr[index[i]]);
switch (i) {
case 3:
case 5:
case 7:
case 9:
*p++ = '-';
break;
}
}
*p = 0; return string_nocheck(buf, end, uuid, spec);
}
static noinline_for_stack
char *netdev_bits(char *buf, char *end, const void *addr,
struct printf_spec spec, const char *fmt)
{
unsigned long long num;
int size;
if (check_pointer(&buf, end, addr, spec))
return buf;
switch (fmt[1]) {
case 'F':
num = *(const netdev_features_t *)addr;
size = sizeof(netdev_features_t);
break;
default:
return error_string(buf, end, "(%pN?)", spec);
}
return special_hex_number(buf, end, num, size);
}
static noinline_for_stack
char *fourcc_string(char *buf, char *end, const u32 *fourcc,
struct printf_spec spec, const char *fmt)
{
char output[sizeof("0123 little-endian (0x01234567)")];
char *p = output;
unsigned int i;
u32 orig, val;
if (fmt[1] != 'c' || fmt[2] != 'c')
return error_string(buf, end, "(%p4?)", spec);
if (check_pointer(&buf, end, fourcc, spec))
return buf;
orig = get_unaligned(fourcc);
val = orig & ~BIT(31);
for (i = 0; i < sizeof(u32); i++) {
unsigned char c = val >> (i * 8);
/* Print non-control ASCII characters as-is, dot otherwise */
*p++ = isascii(c) && isprint(c) ? c : '.';
}
strcpy(p, orig & BIT(31) ? " big-endian" : " little-endian");
p += strlen(p);
*p++ = ' ';
*p++ = '(';
p = special_hex_number(p, output + sizeof(output) - 2, orig, sizeof(u32));
*p++ = ')';
*p = '\0';
return string(buf, end, output, spec);
}
static noinline_for_stack
char *address_val(char *buf, char *end, const void *addr,
struct printf_spec spec, const char *fmt)
{
unsigned long long num;
int size;
if (check_pointer(&buf, end, addr, spec))
return buf;
switch (fmt[1]) {
case 'd':
num = *(const dma_addr_t *)addr;
size = sizeof(dma_addr_t);
break;
case 'p':
default:
num = *(const phys_addr_t *)addr;
size = sizeof(phys_addr_t);
break;
}
return special_hex_number(buf, end, num, size);
}
static noinline_for_stack
char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r)
{
int year = tm->tm_year + (r ? 0 : 1900);
int mon = tm->tm_mon + (r ? 0 : 1);
buf = number(buf, end, year, default_dec04_spec);
if (buf < end)
*buf = '-';
buf++;
buf = number(buf, end, mon, default_dec02_spec);
if (buf < end)
*buf = '-';
buf++;
return number(buf, end, tm->tm_mday, default_dec02_spec);
}
static noinline_for_stack
char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r)
{
buf = number(buf, end, tm->tm_hour, default_dec02_spec);
if (buf < end)
*buf = ':';
buf++;
buf = number(buf, end, tm->tm_min, default_dec02_spec);
if (buf < end)
*buf = ':';
buf++;
return number(buf, end, tm->tm_sec, default_dec02_spec);
}
static noinline_for_stack
char *rtc_str(char *buf, char *end, const struct rtc_time *tm,
struct printf_spec spec, const char *fmt)
{
bool have_t = true, have_d = true;
bool raw = false, iso8601_separator = true;
bool found = true;
int count = 2;
if (check_pointer(&buf, end, tm, spec))
return buf;
switch (fmt[count]) {
case 'd':
have_t = false;
count++;
break;
case 't':
have_d = false;
count++;
break;
}
do {
switch (fmt[count++]) {
case 'r':
raw = true;
break;
case 's':
iso8601_separator = false;
break;
default:
found = false;
break;
}
} while (found);
if (have_d)
buf = date_str(buf, end, tm, raw);
if (have_d && have_t) {
if (buf < end)
*buf = iso8601_separator ? 'T' : ' ';
buf++;
}
if (have_t)
buf = time_str(buf, end, tm, raw);
return buf;
}
static noinline_for_stack
char *time64_str(char *buf, char *end, const time64_t time,
struct printf_spec spec, const char *fmt)
{
struct rtc_time rtc_time;
struct tm tm;
time64_to_tm(time, 0, &tm);
rtc_time.tm_sec = tm.tm_sec;
rtc_time.tm_min = tm.tm_min;
rtc_time.tm_hour = tm.tm_hour;
rtc_time.tm_mday = tm.tm_mday;
rtc_time.tm_mon = tm.tm_mon;
rtc_time.tm_year = tm.tm_year;
rtc_time.tm_wday = tm.tm_wday;
rtc_time.tm_yday = tm.tm_yday;
rtc_time.tm_isdst = 0;
return rtc_str(buf, end, &rtc_time, spec, fmt);
}
static noinline_for_stack
char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec,
const char *fmt)
{
switch (fmt[1]) {
case 'R':
return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt);
case 'T':
return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt);
default:
return error_string(buf, end, "(%pt?)", spec);
}
}
static noinline_for_stack
char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
const char *fmt)
{
if (!IS_ENABLED(CONFIG_HAVE_CLK))
return error_string(buf, end, "(%pC?)", spec);
if (check_pointer(&buf, end, clk, spec))
return buf;
switch (fmt[1]) {
case 'n':
default:
#ifdef CONFIG_COMMON_CLK
return string(buf, end, __clk_get_name(clk), spec);
#else
return ptr_to_id(buf, end, clk, spec);
#endif
}
}
static
char *format_flags(char *buf, char *end, unsigned long flags,
const struct trace_print_flags *names)
{
unsigned long mask;
for ( ; flags && names->name; names++) {
mask = names->mask;
if ((flags & mask) != mask)
continue;
buf = string(buf, end, names->name, default_str_spec);
flags &= ~mask;
if (flags) {
if (buf < end)
*buf = '|';
buf++;
}
}
if (flags)
buf = number(buf, end, flags, default_flag_spec);
return buf;
}
struct page_flags_fields {
int width;
int shift;
int mask;
const struct printf_spec *spec;
const char *name;
};
static const struct page_flags_fields pff[] = {
{SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK,
&default_dec_spec, "section"},
{NODES_WIDTH, NODES_PGSHIFT, NODES_MASK,
&default_dec_spec, "node"},
{ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK,
&default_dec_spec, "zone"},
{LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK,
&default_flag_spec, "lastcpupid"},
{KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK,
&default_flag_spec, "kasantag"},
};
static
char *format_page_flags(char *buf, char *end, unsigned long flags)
{
unsigned long main_flags = flags & PAGEFLAGS_MASK;
bool append = false;
int i;
/* Page flags from the main area. */
if (main_flags) {
buf = format_flags(buf, end, main_flags, pageflag_names);
append = true;
}
/* Page flags from the fields area */
for (i = 0; i < ARRAY_SIZE(pff); i++) {
/* Skip undefined fields. */
if (!pff[i].width)
continue;
/* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */
if (append) {
if (buf < end)
*buf = '|';
buf++;
}
buf = string(buf, end, pff[i].name, default_str_spec);
if (buf < end)
*buf = '=';
buf++;
buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask,
*pff[i].spec);
append = true;
}
return buf;
}
static noinline_for_stack
char *flags_string(char *buf, char *end, void *flags_ptr,
struct printf_spec spec, const char *fmt)
{
unsigned long flags;
const struct trace_print_flags *names;
if (check_pointer(&buf, end, flags_ptr, spec))
return buf;
switch (fmt[1]) {
case 'p':
return format_page_flags(buf, end, *(unsigned long *)flags_ptr);
case 'v':
flags = *(unsigned long *)flags_ptr;
names = vmaflag_names;
break;
case 'g':
flags = (__force unsigned long)(*(gfp_t *)flags_ptr);
names = gfpflag_names;
break;
default:
return error_string(buf, end, "(%pG?)", spec);
}
return format_flags(buf, end, flags, names);
}
static noinline_for_stack
char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf,
char *end)
{
int depth;
/* Loop starting from the root node to the current node. */
for (depth = fwnode_count_parents(fwnode); depth >= 0; depth--) {
struct fwnode_handle *__fwnode =
fwnode_get_nth_parent(fwnode, depth);
buf = string(buf, end, fwnode_get_name_prefix(__fwnode),
default_str_spec);
buf = string(buf, end, fwnode_get_name(__fwnode),
default_str_spec);
fwnode_handle_put(__fwnode);
}
return buf;
}
static noinline_for_stack
char *device_node_string(char *buf, char *end, struct device_node *dn,
struct printf_spec spec, const char *fmt)
{
char tbuf[sizeof("xxxx") + 1];
const char *p;
int ret;
char *buf_start = buf;
struct property *prop;
bool has_mult, pass;
struct printf_spec str_spec = spec;
str_spec.field_width = -1;
if (fmt[0] != 'F')
return error_string(buf, end, "(%pO?)", spec);
if (!IS_ENABLED(CONFIG_OF))
return error_string(buf, end, "(%pOF?)", spec);
if (check_pointer(&buf, end, dn, spec))
return buf;
/* simple case without anything any more format specifiers */
fmt++;
if (fmt[0] == '\0' || strcspn(fmt,"fnpPFcC") > 0)
fmt = "f";
for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) {
int precision;
if (pass) {
if (buf < end)
*buf = ':';
buf++;
}
switch (*fmt) {
case 'f': /* full_name */
buf = fwnode_full_name_string(of_fwnode_handle(dn), buf,
end);
break;
case 'n': /* name */
p = fwnode_get_name(of_fwnode_handle(dn));
precision = str_spec.precision;
str_spec.precision = strchrnul(p, '@') - p;
buf = string(buf, end, p, str_spec);
str_spec.precision = precision;
break;
case 'p': /* phandle */
buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec);
break;
case 'P': /* path-spec */
p = fwnode_get_name(of_fwnode_handle(dn));
if (!p[1])
p = "/";
buf = string(buf, end, p, str_spec);
break;
case 'F': /* flags */
tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-';
tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-';
tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-';
tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-';
tbuf[4] = 0;
buf = string_nocheck(buf, end, tbuf, str_spec);
break;
case 'c': /* major compatible string */
ret = of_property_read_string(dn, "compatible", &p);
if (!ret)
buf = string(buf, end, p, str_spec);
break;
case 'C': /* full compatible string */
has_mult = false;
of_property_for_each_string(dn, "compatible", prop, p) {
if (has_mult)
buf = string_nocheck(buf, end, ",", str_spec);
buf = string_nocheck(buf, end, "\"", str_spec);
buf = string(buf, end, p, str_spec);
buf = string_nocheck(buf, end, "\"", str_spec);
has_mult = true;
}
break;
default:
break;
}
}
return widen_string(buf, buf - buf_start, end, spec);
}
static noinline_for_stack
char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode,
struct printf_spec spec, const char *fmt)
{
struct printf_spec str_spec = spec;
char *buf_start = buf;
str_spec.field_width = -1;
if (*fmt != 'w')
return error_string(buf, end, "(%pf?)", spec);
if (check_pointer(&buf, end, fwnode, spec))
return buf;
fmt++;
switch (*fmt) {
case 'P': /* name */
buf = string(buf, end, fwnode_get_name(fwnode), str_spec);
break;
case 'f': /* full_name */
default:
buf = fwnode_full_name_string(fwnode, buf, end);
break;
}
return widen_string(buf, buf - buf_start, end, spec);
}
int __init no_hash_pointers_enable(char *str)
{
if (no_hash_pointers)
return 0;
no_hash_pointers = true;
pr_warn("**********************************************************\n");
pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
pr_warn("** **\n");
pr_warn("** This system shows unhashed kernel memory addresses **\n");
pr_warn("** via the console, logs, and other interfaces. This **\n");
pr_warn("** might reduce the security of your system. **\n");
pr_warn("** **\n");
pr_warn("** If you see this message and you are not debugging **\n");
pr_warn("** the kernel, report this immediately to your system **\n");
pr_warn("** administrator! **\n");
pr_warn("** **\n");
pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
pr_warn("**********************************************************\n");
return 0;
}
early_param("no_hash_pointers", no_hash_pointers_enable);
/*
* Show a '%p' thing. A kernel extension is that the '%p' is followed
* by an extra set of alphanumeric characters that are extended format
* specifiers.
*
* Please update scripts/checkpatch.pl when adding/removing conversion
* characters. (Search for "check for vsprintf extension").
*
* Right now we handle:
*
* - 'S' For symbolic direct pointers (or function descriptors) with offset
* - 's' For symbolic direct pointers (or function descriptors) without offset
* - '[Ss]R' as above with __builtin_extract_return_addr() translation
* - 'S[R]b' as above with module build ID (for use in backtraces)
* - '[Ff]' %pf and %pF were obsoleted and later removed in favor of
* %ps and %pS. Be careful when re-using these specifiers.
* - 'B' For backtraced symbolic direct pointers with offset
* - 'Bb' as above with module build ID (for use in backtraces)
* - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref]
* - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201]
* - 'b[l]' For a bitmap, the number of bits is determined by the field
* width which must be explicitly specified either as part of the
* format string '%32b[l]' or through '%*b[l]', [l] selects
* range-list format instead of hex format
* - 'M' For a 6-byte MAC address, it prints the address in the
* usual colon-separated hex notation
* - 'm' For a 6-byte MAC address, it prints the hex address without colons
* - 'MF' For a 6-byte MAC FDDI address, it prints the address
* with a dash-separated hex notation
* - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth)
* - 'I' [46] for IPv4/IPv6 addresses printed in the usual way
* IPv4 uses dot-separated decimal without leading 0's (1.2.3.4)
* IPv6 uses colon separated network-order 16 bit hex with leading 0's
* [S][pfs]
* Generic IPv4/IPv6 address (struct sockaddr *) that falls back to
* [4] or [6] and is able to print port [p], flowinfo [f], scope [s]
* - 'i' [46] for 'raw' IPv4/IPv6 addresses
* IPv6 omits the colons (01020304...0f)
* IPv4 uses dot-separated decimal with leading 0's (010.123.045.006)
* [S][pfs]
* Generic IPv4/IPv6 address (struct sockaddr *) that falls back to
* [4] or [6] and is able to print port [p], flowinfo [f], scope [s]
* - '[Ii][4S][hnbl]' IPv4 addresses in host, network, big or little endian order
* - 'I[6S]c' for IPv6 addresses printed as specified by
* https://tools.ietf.org/html/rfc5952
* - 'E[achnops]' For an escaped buffer, where rules are defined by combination
* of the following flags (see string_escape_mem() for the
* details):
* a - ESCAPE_ANY
* c - ESCAPE_SPECIAL
* h - ESCAPE_HEX
* n - ESCAPE_NULL
* o - ESCAPE_OCTAL
* p - ESCAPE_NP
* s - ESCAPE_SPACE
* By default ESCAPE_ANY_NP is used.
* - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form
* "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
* Options for %pU are:
* b big endian lower case hex (default)
* B big endian UPPER case hex
* l little endian lower case hex
* L little endian UPPER case hex
* big endian output byte order is:
* [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15]
* little endian output byte order is:
* [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15]
* - 'V' For a struct va_format which contains a format string * and va_list *,
* call vsnprintf(->format, *->va_list).
* Implements a "recursive vsnprintf".
* Do not use this feature without some mechanism to verify the
* correctness of the format string and va_list arguments.
* - 'K' For a kernel pointer that should be hidden from unprivileged users.
* Use only for procfs, sysfs and similar files, not printk(); please
* read the documentation (path below) first.
* - 'NF' For a netdev_features_t
* - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value.
* - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with
* a certain separator (' ' by default):
* C colon
* D dash
* N no separator
* The maximum supported length is 64 bytes of the input. Consider
* to use print_hex_dump() for the larger input.
* - 'a[pd]' For address types [p] phys_addr_t, [d] dma_addr_t and derivatives
* (default assumed to be phys_addr_t, passed by reference)
* - 'd[234]' For a dentry name (optionally 2-4 last components)
* - 'D[234]' Same as 'd' but for a struct file
* - 'g' For block_device name (gendisk + partition number)
* - 't[RT][dt][r][s]' For time and date as represented by:
* R struct rtc_time
* T time64_t
* - 'C' For a clock, it prints the name (Common Clock Framework) or address
* (legacy clock framework) of the clock
* - 'Cn' For a clock, it prints the name (Common Clock Framework) or address
* (legacy clock framework) of the clock
* - 'G' For flags to be printed as a collection of symbolic strings that would
* construct the specific value. Supported flags given by option:
* p page flags (see struct page) given as pointer to unsigned long
* g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t
* v vma flags (VM_*) given as pointer to unsigned long
* - 'OF[fnpPcCF]' For a device tree object
* Without any optional arguments prints the full_name
* f device node full_name
* n device node name
* p device node phandle
* P device node path spec (name + @unit)
* F device node flags
* c major compatible string
* C full compatible string
* - 'fw[fP]' For a firmware node (struct fwnode_handle) pointer
* Without an option prints the full name of the node
* f full name
* P node name, including a possible unit address
* - 'x' For printing the address unmodified. Equivalent to "%lx".
* Please read the documentation (path below) before using!
* - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of
* bpf_trace_printk() where [ku] prefix specifies either kernel (k)
* or user (u) memory to probe, and:
* s a string, equivalent to "%s" on direct vsnprintf() use
*
* ** When making changes please also update:
* Documentation/core-api/printk-formats.rst
*
* Note: The default behaviour (unadorned %p) is to hash the address,
* rendering it useful as a unique identifier.
*/
static noinline_for_stack
char *pointer(const char *fmt, char *buf, char *end, void *ptr,
struct printf_spec spec)
{
switch (*fmt) {
case 'S':
case 's':
ptr = dereference_symbol_descriptor(ptr);
fallthrough;
case 'B':
return symbol_string(buf, end, ptr, spec, fmt);
case 'R':
case 'r':
return resource_string(buf, end, ptr, spec, fmt);
case 'h':
return hex_string(buf, end, ptr, spec, fmt);
case 'b':
switch (fmt[1]) {
case 'l':
return bitmap_list_string(buf, end, ptr, spec, fmt);
default:
return bitmap_string(buf, end, ptr, spec, fmt);
}
case 'M': /* Colon separated: 00:01:02:03:04:05 */
case 'm': /* Contiguous: 000102030405 */
/* [mM]F (FDDI) */
/* [mM]R (Reverse order; Bluetooth) */
return mac_address_string(buf, end, ptr, spec, fmt);
case 'I': /* Formatted IP supported
* 4: 1.2.3.4
* 6: 0001:0203:...:0708
* 6c: 1::708 or 1::1.2.3.4
*/
case 'i': /* Contiguous:
* 4: 001.002.003.004
* 6: 000102...0f
*/
return ip_addr_string(buf, end, ptr, spec, fmt);
case 'E':
return escaped_string(buf, end, ptr, spec, fmt);
case 'U':
return uuid_string(buf, end, ptr, spec, fmt);
case 'V':
return va_format(buf, end, ptr, spec, fmt);
case 'K':
return restricted_pointer(buf, end, ptr, spec);
case 'N':
return netdev_bits(buf, end, ptr, spec, fmt);
case '4':
return fourcc_string(buf, end, ptr, spec, fmt);
case 'a':
return address_val(buf, end, ptr, spec, fmt);
case 'd':
return dentry_name(buf, end, ptr, spec, fmt);
case 't':
return time_and_date(buf, end, ptr, spec, fmt);
case 'C':
return clock(buf, end, ptr, spec, fmt);
case 'D':
return file_dentry_name(buf, end, ptr, spec, fmt);
#ifdef CONFIG_BLOCK
case 'g':
return bdev_name(buf, end, ptr, spec, fmt);
#endif
case 'G':
return flags_string(buf, end, ptr, spec, fmt);
case 'O':
return device_node_string(buf, end, ptr, spec, fmt + 1);
case 'f':
return fwnode_string(buf, end, ptr, spec, fmt + 1);
case 'x':
return pointer_string(buf, end, ptr, spec);
case 'e':
/* %pe with a non-ERR_PTR gets treated as plain %p */
if (!IS_ERR(ptr))
return default_pointer(buf, end, ptr, spec);
return err_ptr(buf, end, ptr, spec);
case 'u':
case 'k':
switch (fmt[1]) {
case 's':
return string(buf, end, ptr, spec);
default:
return error_string(buf, end, "(einval)", spec);
}
default:
return default_pointer(buf, end, ptr, spec);
}
}
/*
* Helper function to decode printf style format.
* Each call decode a token from the format and return the
* number of characters read (or likely the delta where it wants
* to go on the next call).
* The decoded token is returned through the parameters
*
* 'h', 'l', or 'L' for integer fields
* 'z' support added 23/7/1999 S.H.
* 'z' changed to 'Z' --davidm 1/25/99
* 'Z' changed to 'z' --adobriyan 2017-01-25
* 't' added for ptrdiff_t
*
* @fmt: the format string
* @type of the token returned
* @flags: various flags such as +, -, # tokens..
* @field_width: overwritten width
* @base: base of the number (octal, hex, ...)
* @precision: precision of a number
* @qualifier: qualifier of a number (long, size_t, ...)
*/
static noinline_for_stack
int format_decode(const char *fmt, struct printf_spec *spec)
{
const char *start = fmt;
char qualifier;
/* we finished early by reading the field width */
if (spec->type == FORMAT_TYPE_WIDTH) {
if (spec->field_width < 0) { spec->field_width = -spec->field_width;
spec->flags |= LEFT;
}
spec->type = FORMAT_TYPE_NONE;
goto precision;
}
/* we finished early by reading the precision */
if (spec->type == FORMAT_TYPE_PRECISION) { if (spec->precision < 0) spec->precision = 0; spec->type = FORMAT_TYPE_NONE;
goto qualifier;
}
/* By default */
spec->type = FORMAT_TYPE_NONE; for (; *fmt ; ++fmt) { if (*fmt == '%')
break;
}
/* Return the current non-format string */
if (fmt != start || !*fmt)
return fmt - start;
/* Process flags */
spec->flags = 0;
while (1) { /* this also skips first '%' */
bool found = true;
++fmt;
switch (*fmt) {
case '-': spec->flags |= LEFT; break; case '+': spec->flags |= PLUS; break; case ' ': spec->flags |= SPACE; break; case '#': spec->flags |= SPECIAL; break; case '0': spec->flags |= ZEROPAD; break;
default: found = false;
}
if (!found)
break;
}
/* get field width */
spec->field_width = -1;
if (isdigit(*fmt))
spec->field_width = skip_atoi(&fmt); else if (*fmt == '*') {
/* it's the next argument */
spec->type = FORMAT_TYPE_WIDTH;
return ++fmt - start;
}
precision:
/* get the precision */
spec->precision = -1;
if (*fmt == '.') {
++fmt;
if (isdigit(*fmt)) {
spec->precision = skip_atoi(&fmt); if (spec->precision < 0) spec->precision = 0; } else if (*fmt == '*') {
/* it's the next argument */
spec->type = FORMAT_TYPE_PRECISION;
return ++fmt - start;
}
}
qualifier:
/* get the conversion qualifier */
qualifier = 0;
if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
*fmt == 'z' || *fmt == 't') {
qualifier = *fmt++;
if (unlikely(qualifier == *fmt)) {
if (qualifier == 'l') {
qualifier = 'L';
++fmt; } else if (qualifier == 'h') {
qualifier = 'H';
++fmt;
}
}
}
/* default base */
spec->base = 10;
switch (*fmt) {
case 'c':
spec->type = FORMAT_TYPE_CHAR;
return ++fmt - start;
case 's':
spec->type = FORMAT_TYPE_STR;
return ++fmt - start;
case 'p':
spec->type = FORMAT_TYPE_PTR;
return ++fmt - start;
case '%':
spec->type = FORMAT_TYPE_PERCENT_CHAR;
return ++fmt - start;
/* integer number formats - set up the flags and "break" */
case 'o':
spec->base = 8;
break;
case 'x':
spec->flags |= SMALL;
fallthrough;
case 'X':
spec->base = 16;
break;
case 'd':
case 'i':
spec->flags |= SIGN;
break;
case 'u':
break;
case 'n':
/*
* Since %n poses a greater security risk than
* utility, treat it as any other invalid or
* unsupported format specifier.
*/
fallthrough;
default:
WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt); spec->type = FORMAT_TYPE_INVALID;
return fmt - start;
}
if (qualifier == 'L') spec->type = FORMAT_TYPE_LONG_LONG; else if (qualifier == 'l') {
BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG);
spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN); } else if (qualifier == 'z') { spec->type = FORMAT_TYPE_SIZE_T; } else if (qualifier == 't') { spec->type = FORMAT_TYPE_PTRDIFF; } else if (qualifier == 'H') {
BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE);
spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN); } else if (qualifier == 'h') {
BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT);
spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN);
} else {
BUILD_BUG_ON(FORMAT_TYPE_UINT + SIGN != FORMAT_TYPE_INT);
spec->type = FORMAT_TYPE_UINT + (spec->flags & SIGN);
}
return ++fmt - start;
}
static void
set_field_width(struct printf_spec *spec, int width)
{
spec->field_width = width;
if (WARN_ONCE(spec->field_width != width, "field width %d too large", width)) {
spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX);
}
}
static void
set_precision(struct printf_spec *spec, int prec)
{
spec->precision = prec; if (WARN_ONCE(spec->precision != prec, "precision %d too large", prec)) { spec->precision = clamp(prec, 0, PRECISION_MAX);
}
}
/**
* vsnprintf - Format a string and place it in a buffer
* @buf: The buffer to place the result into
* @size: The size of the buffer, including the trailing null space
* @fmt: The format string to use
* @args: Arguments for the format string
*
* This function generally follows C99 vsnprintf, but has some
* extensions and a few limitations:
*
* - ``%n`` is unsupported
* - ``%p*`` is handled by pointer()
*
* See pointer() or Documentation/core-api/printk-formats.rst for more
* extensive description.
*
* **Please update the documentation in both places when making changes**
*
* The return value is the number of characters which would
* be generated for the given input, excluding the trailing
* '\0', as per ISO C99. If you want to have the exact
* number of characters written into @buf as return value
* (not including the trailing '\0'), use vscnprintf(). If the
* return is greater than or equal to @size, the resulting
* string is truncated.
*
* If you're not already dealing with a va_list consider using snprintf().
*/
int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
unsigned long long num;
char *str, *end;
struct printf_spec spec = {0};
/* Reject out-of-range values early. Large positive sizes are
used for unknown buffer sizes. */
if (WARN_ON_ONCE(size > INT_MAX)) return 0;
str = buf;
end = buf + size;
/* Make sure end is always >= buf */
if (end < buf) {
end = ((void *)-1);
size = end - buf;
}
while (*fmt) {
const char *old_fmt = fmt;
int read = format_decode(fmt, &spec);
fmt += read;
switch (spec.type) {
case FORMAT_TYPE_NONE: {
int copy = read;
if (str < end) { if (copy > end - str) copy = end - str; memcpy(str, old_fmt, copy);
}
str += read;
break;
}
case FORMAT_TYPE_WIDTH:
set_field_width(&spec, va_arg(args, int));
break;
case FORMAT_TYPE_PRECISION:
set_precision(&spec, va_arg(args, int));
break;
case FORMAT_TYPE_CHAR: {
char c;
if (!(spec.flags & LEFT)) { while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str;
}
}
c = (unsigned char) va_arg(args, int); if (str < end)
*str = c;
++str; while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str;
}
break;
}
case FORMAT_TYPE_STR:
str = string(str, end, va_arg(args, char *), spec);
break;
case FORMAT_TYPE_PTR:
str = pointer(fmt, str, end, va_arg(args, void *),
spec);
while (isalnum(*fmt))
fmt++;
break;
case FORMAT_TYPE_PERCENT_CHAR:
if (str < end) *str = '%'; ++str;
break;
case FORMAT_TYPE_INVALID:
/*
* Presumably the arguments passed gcc's type
* checking, but there is no safe or sane way
* for us to continue parsing the format and
* fetching from the va_list; the remaining
* specifiers and arguments would be out of
* sync.
*/
goto out;
default:
switch (spec.type) {
case FORMAT_TYPE_LONG_LONG:
num = va_arg(args, long long);
break;
case FORMAT_TYPE_ULONG:
num = va_arg(args, unsigned long);
break;
case FORMAT_TYPE_LONG:
num = va_arg(args, long);
break;
case FORMAT_TYPE_SIZE_T:
if (spec.flags & SIGN) num = va_arg(args, ssize_t);
else
num = va_arg(args, size_t);
break;
case FORMAT_TYPE_PTRDIFF:
num = va_arg(args, ptrdiff_t);
break;
case FORMAT_TYPE_UBYTE:
num = (unsigned char) va_arg(args, int);
break;
case FORMAT_TYPE_BYTE:
num = (signed char) va_arg(args, int);
break;
case FORMAT_TYPE_USHORT:
num = (unsigned short) va_arg(args, int);
break;
case FORMAT_TYPE_SHORT:
num = (short) va_arg(args, int);
break;
case FORMAT_TYPE_INT:
num = (int) va_arg(args, int);
break;
default:
num = va_arg(args, unsigned int);
}
str = number(str, end, num, spec);
}
}
out:
if (size > 0) { if (str < end) *str = '\0';
else
end[-1] = '\0';
}
/* the trailing null byte doesn't count towards the total */
return str-buf;
}
EXPORT_SYMBOL(vsnprintf);
/**
* vscnprintf - Format a string and place it in a buffer
* @buf: The buffer to place the result into
* @size: The size of the buffer, including the trailing null space
* @fmt: The format string to use
* @args: Arguments for the format string
*
* The return value is the number of characters which have been written into
* the @buf not including the trailing '\0'. If @size is == 0 the function
* returns 0.
*
* If you're not already dealing with a va_list consider using scnprintf().
*
* See the vsnprintf() documentation for format string extensions over C99.
*/
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
int i;
i = vsnprintf(buf, size, fmt, args); if (likely(i < size))
return i;
if (size != 0) return size - 1;
return 0;
}
EXPORT_SYMBOL(vscnprintf);
/**
* snprintf - Format a string and place it in a buffer
* @buf: The buffer to place the result into
* @size: The size of the buffer, including the trailing null space
* @fmt: The format string to use
* @...: Arguments for the format string
*
* The return value is the number of characters which would be
* generated for the given input, excluding the trailing null,
* as per ISO C99. If the return is greater than or equal to
* @size, the resulting string is truncated.
*
* See the vsnprintf() documentation for format string extensions over C99.
*/
int snprintf(char *buf, size_t size, const char *fmt, ...)
{
va_list args;
int i;
va_start(args, fmt);
i = vsnprintf(buf, size, fmt, args);
va_end(args);
return i;
}
EXPORT_SYMBOL(snprintf);
/**
* scnprintf - Format a string and place it in a buffer
* @buf: The buffer to place the result into
* @size: The size of the buffer, including the trailing null space
* @fmt: The format string to use
* @...: Arguments for the format string
*
* The return value is the number of characters written into @buf not including
* the trailing '\0'. If @size is == 0 the function returns 0.
*/
int scnprintf(char *buf, size_t size, const char *fmt, ...)
{
va_list args;
int i;
va_start(args, fmt);
i = vscnprintf(buf, size, fmt, args);
va_end(args);
return i;
}
EXPORT_SYMBOL(scnprintf);
/**
* vsprintf - Format a string and place it in a buffer
* @buf: The buffer to place the result into
* @fmt: The format string to use
* @args: Arguments for the format string
*
* The function returns the number of characters written
* into @buf. Use vsnprintf() or vscnprintf() in order to avoid
* buffer overflows.
*
* If you're not already dealing with a va_list consider using sprintf().
*
* See the vsnprintf() documentation for format string extensions over C99.
*/
int vsprintf(char *buf, const char *fmt, va_list args)
{
return vsnprintf(buf, INT_MAX, fmt, args);
}
EXPORT_SYMBOL(vsprintf);
/**
* sprintf - Format a string and place it in a buffer
* @buf: The buffer to place the result into
* @fmt: The format string to use
* @...: Arguments for the format string
*
* The function returns the number of characters written
* into @buf. Use snprintf() or scnprintf() in order to avoid
* buffer overflows.
*
* See the vsnprintf() documentation for format string extensions over C99.
*/
int sprintf(char *buf, const char *fmt, ...)
{
va_list args;
int i;
va_start(args, fmt);
i = vsnprintf(buf, INT_MAX, fmt, args);
va_end(args);
return i;
}
EXPORT_SYMBOL(sprintf);
#ifdef CONFIG_BINARY_PRINTF
/*
* bprintf service:
* vbin_printf() - VA arguments to binary data
* bstr_printf() - Binary data to text string
*/
/**
* vbin_printf - Parse a format string and place args' binary value in a buffer
* @bin_buf: The buffer to place args' binary value
* @size: The size of the buffer(by words(32bits), not characters)
* @fmt: The format string to use
* @args: Arguments for the format string
*
* The format follows C99 vsnprintf, except %n is ignored, and its argument
* is skipped.
*
* The return value is the number of words(32bits) which would be generated for
* the given input.
*
* NOTE:
* If the return value is greater than @size, the resulting bin_buf is NOT
* valid for bstr_printf().
*/
int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args)
{
struct printf_spec spec = {0};
char *str, *end;
int width;
str = (char *)bin_buf;
end = (char *)(bin_buf + size);
#define save_arg(type) \
({ \
unsigned long long value; \
if (sizeof(type) == 8) { \
unsigned long long val8; \
str = PTR_ALIGN(str, sizeof(u32)); \
val8 = va_arg(args, unsigned long long); \
if (str + sizeof(type) <= end) { \
*(u32 *)str = *(u32 *)&val8; \
*(u32 *)(str + 4) = *((u32 *)&val8 + 1); \
} \
value = val8; \
} else { \
unsigned int val4; \
str = PTR_ALIGN(str, sizeof(type)); \
val4 = va_arg(args, int); \
if (str + sizeof(type) <= end) \
*(typeof(type) *)str = (type)(long)val4; \
value = (unsigned long long)val4; \
} \
str += sizeof(type); \
value; \
})
while (*fmt) {
int read = format_decode(fmt, &spec);
fmt += read;
switch (spec.type) {
case FORMAT_TYPE_NONE:
case FORMAT_TYPE_PERCENT_CHAR:
break;
case FORMAT_TYPE_INVALID:
goto out;
case FORMAT_TYPE_WIDTH:
case FORMAT_TYPE_PRECISION:
width = (int)save_arg(int);
/* Pointers may require the width */
if (*fmt == 'p')
set_field_width(&spec, width);
break;
case FORMAT_TYPE_CHAR:
save_arg(char);
break;
case FORMAT_TYPE_STR: {
const char *save_str = va_arg(args, char *);
const char *err_msg;
size_t len;
err_msg = check_pointer_msg(save_str);
if (err_msg)
save_str = err_msg;
len = strlen(save_str) + 1;
if (str + len < end)
memcpy(str, save_str, len);
str += len;
break;
}
case FORMAT_TYPE_PTR:
/* Dereferenced pointers must be done now */
switch (*fmt) {
/* Dereference of functions is still OK */
case 'S':
case 's':
case 'x':
case 'K':
case 'e':
save_arg(void *);
break;
default:
if (!isalnum(*fmt)) {
save_arg(void *);
break;
}
str = pointer(fmt, str, end, va_arg(args, void *),
spec);
if (str + 1 < end)
*str++ = '\0';
else
end[-1] = '\0'; /* Must be nul terminated */
}
/* skip all alphanumeric pointer suffixes */
while (isalnum(*fmt))
fmt++;
break;
default:
switch (spec.type) {
case FORMAT_TYPE_LONG_LONG:
save_arg(long long);
break;
case FORMAT_TYPE_ULONG:
case FORMAT_TYPE_LONG:
save_arg(unsigned long);
break;
case FORMAT_TYPE_SIZE_T:
save_arg(size_t);
break;
case FORMAT_TYPE_PTRDIFF:
save_arg(ptrdiff_t);
break;
case FORMAT_TYPE_UBYTE:
case FORMAT_TYPE_BYTE:
save_arg(char);
break;
case FORMAT_TYPE_USHORT:
case FORMAT_TYPE_SHORT:
save_arg(short);
break;
default:
save_arg(int);
}
}
}
out:
return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf;
#undef save_arg
}
EXPORT_SYMBOL_GPL(vbin_printf);
/**
* bstr_printf - Format a string from binary arguments and place it in a buffer
* @buf: The buffer to place the result into
* @size: The size of the buffer, including the trailing null space
* @fmt: The format string to use
* @bin_buf: Binary arguments for the format string
*
* This function like C99 vsnprintf, but the difference is that vsnprintf gets
* arguments from stack, and bstr_printf gets arguments from @bin_buf which is
* a binary buffer that generated by vbin_printf.
*
* The format follows C99 vsnprintf, but has some extensions:
* see vsnprintf comment for details.
*
* The return value is the number of characters which would
* be generated for the given input, excluding the trailing
* '\0', as per ISO C99. If you want to have the exact
* number of characters written into @buf as return value
* (not including the trailing '\0'), use vscnprintf(). If the
* return is greater than or equal to @size, the resulting
* string is truncated.
*/
int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
{
struct printf_spec spec = {0};
char *str, *end;
const char *args = (const char *)bin_buf;
if (WARN_ON_ONCE(size > INT_MAX))
return 0;
str = buf;
end = buf + size;
#define get_arg(type) \
({ \
typeof(type) value; \
if (sizeof(type) == 8) { \
args = PTR_ALIGN(args, sizeof(u32)); \
*(u32 *)&value = *(u32 *)args; \
*((u32 *)&value + 1) = *(u32 *)(args + 4); \
} else { \
args = PTR_ALIGN(args, sizeof(type)); \
value = *(typeof(type) *)args; \
} \
args += sizeof(type); \
value; \
})
/* Make sure end is always >= buf */
if (end < buf) {
end = ((void *)-1);
size = end - buf;
}
while (*fmt) {
const char *old_fmt = fmt;
int read = format_decode(fmt, &spec);
fmt += read;
switch (spec.type) {
case FORMAT_TYPE_NONE: {
int copy = read;
if (str < end) {
if (copy > end - str)
copy = end - str;
memcpy(str, old_fmt, copy);
}
str += read;
break;
}
case FORMAT_TYPE_WIDTH:
set_field_width(&spec, get_arg(int));
break;
case FORMAT_TYPE_PRECISION:
set_precision(&spec, get_arg(int));
break;
case FORMAT_TYPE_CHAR: {
char c;
if (!(spec.flags & LEFT)) {
while (--spec.field_width > 0) {
if (str < end)
*str = ' ';
++str;
}
}
c = (unsigned char) get_arg(char);
if (str < end)
*str = c;
++str;
while (--spec.field_width > 0) {
if (str < end)
*str = ' ';
++str;
}
break;
}
case FORMAT_TYPE_STR: {
const char *str_arg = args;
args += strlen(str_arg) + 1;
str = string(str, end, (char *)str_arg, spec);
break;
}
case FORMAT_TYPE_PTR: {
bool process = false;
int copy, len;
/* Non function dereferences were already done */
switch (*fmt) {
case 'S':
case 's':
case 'x':
case 'K':
case 'e':
process = true;
break;
default:
if (!isalnum(*fmt)) {
process = true;
break;
}
/* Pointer dereference was already processed */
if (str < end) {
len = copy = strlen(args);
if (copy > end - str)
copy = end - str;
memcpy(str, args, copy);
str += len;
args += len + 1;
}
}
if (process)
str = pointer(fmt, str, end, get_arg(void *), spec);
while (isalnum(*fmt))
fmt++;
break;
}
case FORMAT_TYPE_PERCENT_CHAR:
if (str < end)
*str = '%';
++str;
break;
case FORMAT_TYPE_INVALID:
goto out;
default: {
unsigned long long num;
switch (spec.type) {
case FORMAT_TYPE_LONG_LONG:
num = get_arg(long long);
break;
case FORMAT_TYPE_ULONG:
case FORMAT_TYPE_LONG:
num = get_arg(unsigned long);
break;
case FORMAT_TYPE_SIZE_T:
num = get_arg(size_t);
break;
case FORMAT_TYPE_PTRDIFF:
num = get_arg(ptrdiff_t);
break;
case FORMAT_TYPE_UBYTE:
num = get_arg(unsigned char);
break;
case FORMAT_TYPE_BYTE:
num = get_arg(signed char);
break;
case FORMAT_TYPE_USHORT:
num = get_arg(unsigned short);
break;
case FORMAT_TYPE_SHORT:
num = get_arg(short);
break;
case FORMAT_TYPE_UINT:
num = get_arg(unsigned int);
break;
default:
num = get_arg(int);
}
str = number(str, end, num, spec);
} /* default: */
} /* switch(spec.type) */
} /* while(*fmt) */
out:
if (size > 0) {
if (str < end)
*str = '\0';
else
end[-1] = '\0';
}
#undef get_arg
/* the trailing null byte doesn't count towards the total */
return str - buf;
}
EXPORT_SYMBOL_GPL(bstr_printf);
/**
* bprintf - Parse a format string and place args' binary value in a buffer
* @bin_buf: The buffer to place args' binary value
* @size: The size of the buffer(by words(32bits), not characters)
* @fmt: The format string to use
* @...: Arguments for the format string
*
* The function returns the number of words(u32) written
* into @bin_buf.
*/
int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...)
{
va_list args;
int ret;
va_start(args, fmt);
ret = vbin_printf(bin_buf, size, fmt, args);
va_end(args);
return ret;
}
EXPORT_SYMBOL_GPL(bprintf);
#endif /* CONFIG_BINARY_PRINTF */
/**
* vsscanf - Unformat a buffer into a list of arguments
* @buf: input buffer
* @fmt: format of buffer
* @args: arguments
*/
int vsscanf(const char *buf, const char *fmt, va_list args)
{
const char *str = buf;
char *next;
char digit;
int num = 0;
u8 qualifier;
unsigned int base;
union {
long long s;
unsigned long long u;
} val;
s16 field_width;
bool is_sign;
while (*fmt) {
/* skip any white space in format */
/* white space in format matches any amount of
* white space, including none, in the input.
*/
if (isspace(*fmt)) {
fmt = skip_spaces(++fmt);
str = skip_spaces(str);
}
/* anything that is not a conversion must match exactly */
if (*fmt != '%' && *fmt) {
if (*fmt++ != *str++)
break;
continue;
}
if (!*fmt)
break;
++fmt;
/* skip this conversion.
* advance both strings to next white space
*/
if (*fmt == '*') {
if (!*str)
break;
while (!isspace(*fmt) && *fmt != '%' && *fmt) {
/* '%*[' not yet supported, invalid format */
if (*fmt == '[')
return num;
fmt++;
}
while (!isspace(*str) && *str)
str++;
continue;
}
/* get field width */
field_width = -1;
if (isdigit(*fmt)) {
field_width = skip_atoi(&fmt);
if (field_width <= 0)
break;
}
/* get conversion qualifier */
qualifier = -1;
if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
*fmt == 'z') {
qualifier = *fmt++;
if (unlikely(qualifier == *fmt)) {
if (qualifier == 'h') {
qualifier = 'H';
fmt++;
} else if (qualifier == 'l') {
qualifier = 'L';
fmt++;
}
}
}
if (!*fmt)
break;
if (*fmt == 'n') {
/* return number of characters read so far */
*va_arg(args, int *) = str - buf;
++fmt;
continue;
}
if (!*str)
break;
base = 10;
is_sign = false;
switch (*fmt++) {
case 'c':
{
char *s = (char *)va_arg(args, char*);
if (field_width == -1)
field_width = 1;
do {
*s++ = *str++;
} while (--field_width > 0 && *str);
num++;
}
continue;
case 's':
{
char *s = (char *)va_arg(args, char *);
if (field_width == -1)
field_width = SHRT_MAX;
/* first, skip leading white space in buffer */
str = skip_spaces(str);
/* now copy until next white space */
while (*str && !isspace(*str) && field_width--)
*s++ = *str++;
*s = '\0';
num++;
}
continue;
/*
* Warning: This implementation of the '[' conversion specifier
* deviates from its glibc counterpart in the following ways:
* (1) It does NOT support ranges i.e. '-' is NOT a special
* character
* (2) It cannot match the closing bracket ']' itself
* (3) A field width is required
* (4) '%*[' (discard matching input) is currently not supported
*
* Example usage:
* ret = sscanf("00:0a:95","%2[^:]:%2[^:]:%2[^:]",
* buf1, buf2, buf3);
* if (ret < 3)
* // etc..
*/
case '[':
{
char *s = (char *)va_arg(args, char *);
DECLARE_BITMAP(set, 256) = {0};
unsigned int len = 0;
bool negate = (*fmt == '^');
/* field width is required */
if (field_width == -1)
return num;
if (negate)
++fmt;
for ( ; *fmt && *fmt != ']'; ++fmt, ++len)
set_bit((u8)*fmt, set);
/* no ']' or no character set found */
if (!*fmt || !len)
return num;
++fmt;
if (negate) {
bitmap_complement(set, set, 256);
/* exclude null '\0' byte */
clear_bit(0, set);
}
/* match must be non-empty */
if (!test_bit((u8)*str, set))
return num;
while (test_bit((u8)*str, set) && field_width--)
*s++ = *str++;
*s = '\0';
++num;
}
continue;
case 'o':
base = 8;
break;
case 'x':
case 'X':
base = 16;
break;
case 'i':
base = 0;
fallthrough;
case 'd':
is_sign = true;
fallthrough;
case 'u':
break;
case '%':
/* looking for '%' in str */
if (*str++ != '%')
return num;
continue;
default:
/* invalid format; stop here */
return num;
}
/* have some sort of integer conversion.
* first, skip white space in buffer.
*/
str = skip_spaces(str);
digit = *str;
if (is_sign && digit == '-') {
if (field_width == 1)
break;
digit = *(str + 1);
}
if (!digit
|| (base == 16 && !isxdigit(digit))
|| (base == 10 && !isdigit(digit))
|| (base == 8 && (!isdigit(digit) || digit > '7'))
|| (base == 0 && !isdigit(digit)))
break;
if (is_sign)
val.s = simple_strntoll(str,
field_width >= 0 ? field_width : INT_MAX,
&next, base);
else
val.u = simple_strntoull(str,
field_width >= 0 ? field_width : INT_MAX,
&next, base);
switch (qualifier) {
case 'H': /* that's 'hh' in format */
if (is_sign)
*va_arg(args, signed char *) = val.s;
else
*va_arg(args, unsigned char *) = val.u;
break;
case 'h':
if (is_sign)
*va_arg(args, short *) = val.s;
else
*va_arg(args, unsigned short *) = val.u;
break;
case 'l':
if (is_sign)
*va_arg(args, long *) = val.s;
else
*va_arg(args, unsigned long *) = val.u;
break;
case 'L':
if (is_sign)
*va_arg(args, long long *) = val.s;
else
*va_arg(args, unsigned long long *) = val.u;
break;
case 'z':
*va_arg(args, size_t *) = val.u;
break;
default:
if (is_sign)
*va_arg(args, int *) = val.s;
else
*va_arg(args, unsigned int *) = val.u;
break;
}
num++;
if (!next)
break;
str = next;
}
return num;
}
EXPORT_SYMBOL(vsscanf);
/**
* sscanf - Unformat a buffer into a list of arguments
* @buf: input buffer
* @fmt: formatting of buffer
* @...: resulting arguments
*/
int sscanf(const char *buf, const char *fmt, ...)
{
va_list args;
int i;
va_start(args, fmt);
i = vsscanf(buf, fmt, args);
va_end(args);
return i;
}
EXPORT_SYMBOL(sscanf);
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM signal
#if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SIGNAL_H
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/tracepoint.h>
#define TP_STORE_SIGINFO(__entry, info) \
do { \
if (info == SEND_SIG_NOINFO) { \
__entry->errno = 0; \
__entry->code = SI_USER; \
} else if (info == SEND_SIG_PRIV) { \
__entry->errno = 0; \
__entry->code = SI_KERNEL; \
} else { \
__entry->errno = info->si_errno; \
__entry->code = info->si_code; \
} \
} while (0)
#ifndef TRACE_HEADER_MULTI_READ
enum {
TRACE_SIGNAL_DELIVERED,
TRACE_SIGNAL_IGNORED,
TRACE_SIGNAL_ALREADY_PENDING,
TRACE_SIGNAL_OVERFLOW_FAIL,
TRACE_SIGNAL_LOSE_INFO,
};
#endif
/**
* signal_generate - called when a signal is generated
* @sig: signal number
* @info: pointer to struct siginfo
* @task: pointer to struct task_struct
* @group: shared or private
* @result: TRACE_SIGNAL_*
*
* Current process sends a 'sig' signal to 'task' process with
* 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV,
* 'info' is not a pointer and you can't access its field. Instead,
* SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV
* means that si_code is SI_KERNEL.
*/
TRACE_EVENT(signal_generate,
TP_PROTO(int sig, struct kernel_siginfo *info, struct task_struct *task,
int group, int result),
TP_ARGS(sig, info, task, group, result),
TP_STRUCT__entry(
__field( int, sig )
__field( int, errno )
__field( int, code )
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( int, group )
__field( int, result )
),
TP_fast_assign(
__entry->sig = sig;
TP_STORE_SIGINFO(__entry, info);
memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
__entry->pid = task->pid;
__entry->group = group;
__entry->result = result;
),
TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d grp=%d res=%d",
__entry->sig, __entry->errno, __entry->code,
__entry->comm, __entry->pid, __entry->group,
__entry->result)
);
/**
* signal_deliver - called when a signal is delivered
* @sig: signal number
* @info: pointer to struct siginfo
* @ka: pointer to struct k_sigaction
*
* A 'sig' signal is delivered to current process with 'info' siginfo,
* and it will be handled by 'ka'. ka->sa.sa_handler can be SIG_IGN or
* SIG_DFL.
* Note that some signals reported by signal_generate tracepoint can be
* lost, ignored or modified (by debugger) before hitting this tracepoint.
* This means, this can show which signals are actually delivered, but
* matching generated signals and delivered signals may not be correct.
*/
TRACE_EVENT(signal_deliver,
TP_PROTO(int sig, struct kernel_siginfo *info, struct k_sigaction *ka),
TP_ARGS(sig, info, ka),
TP_STRUCT__entry(
__field( int, sig )
__field( int, errno )
__field( int, code )
__field( unsigned long, sa_handler )
__field( unsigned long, sa_flags )
),
TP_fast_assign(
__entry->sig = sig;
TP_STORE_SIGINFO(__entry, info);
__entry->sa_handler = (unsigned long)ka->sa.sa_handler;
__entry->sa_flags = ka->sa.sa_flags;
),
TP_printk("sig=%d errno=%d code=%d sa_handler=%lx sa_flags=%lx",
__entry->sig, __entry->errno, __entry->code,
__entry->sa_handler, __entry->sa_flags)
);
#endif /* _TRACE_SIGNAL_H */
/* This part must be outside protection */
#include <trace/define_trace.h>
// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/namei.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
/*
* Some corrections by tytso.
*/
/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
* lookup logic.
*/
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
*/
#include <linux/init.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/ima.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <linux/hash.h>
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>
#include "internal.h"
#include "mount.h"
/* [Feb-1997 T. Schoebel-Theuer]
* Fundamental changes in the pathname lookup mechanisms (namei)
* were necessary because of omirr. The reason is that omirr needs
* to know the _real_ pathname, not the user-supplied one, in case
* of symlinks (and also when transname replacements occur).
*
* The new code replaces the old recursive symlink resolution with
* an iterative one (in case of non-nested symlink chains). It does
* this with calls to <fs>_follow_link().
* As a side effect, dir_namei(), _namei() and follow_link() are now
* replaced with a single function lookup_dentry() that can handle all
* the special cases of the former code.
*
* With the new dcache, the pathname is stored at each inode, at least as
* long as the refcount of the inode is positive. As a side effect, the
* size of the dcache depends on the inode cache and thus is dynamic.
*
* [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
* resolution to correspond with current state of the code.
*
* Note that the symlink resolution is not *completely* iterative.
* There is still a significant amount of tail- and mid- recursion in
* the algorithm. Also, note that <fs>_readlink() is not used in
* lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
* may return different results than <fs>_follow_link(). Many virtual
* filesystems (including /proc) exhibit this behavior.
*/
/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
* New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
* and the name already exists in form of a symlink, try to create the new
* name indicated by the symlink. The old code always complained that the
* name already exists, due to not following the symlink even if its target
* is nonexistent. The new semantics affects also mknod() and link() when
* the name is a symlink pointing to a non-existent name.
*
* I don't know which semantics is the right one, since I have no access
* to standards. But I found by trial that HP-UX 9.0 has the full "new"
* semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
* "old" one. Personally, I think the new semantics is much more logical.
* Note that "ln old new" where "new" is a symlink pointing to a non-existing
* file does succeed in both HP-UX and SunOs, but not in Solaris
* and in the old Linux semantics.
*/
/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
* semantics. See the comments in "open_namei" and "do_link" below.
*
* [10-Sep-98 Alan Modra] Another symlink change.
*/
/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
* inside the path - always follow.
* in the last component in creation/removal/renaming - never follow.
* if LOOKUP_FOLLOW passed - follow.
* if the pathname has trailing slashes - follow.
* otherwise - don't follow.
* (applied in that order).
*
* [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
* restored for 2.4. This is the last surviving part of old 4.2BSD bug.
* During the 2.4 we need to fix the userland stuff depending on it -
* hopefully we will be able to get rid of that wart in 2.5. So far only
* XEmacs seems to be relying on it...
*/
/*
* [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
* implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
* any extra contention...
*/
/* In order to reduce some races, while at the same time doing additional
* checking and hopefully speeding things up, we copy filenames to the
* kernel data space before using them..
*
* POSIX.1 2.4: an empty pathname is invalid (ENOENT).
* PATH_MAX includes the nul terminator --RR.
*/
#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
struct filename *result;
char *kname;
int len;
result = audit_reusename(filename);
if (result)
return result;
result = __getname();
if (unlikely(!result))
return ERR_PTR(-ENOMEM);
/*
* First, try to embed the struct filename inside the names_cache
* allocation
*/
kname = (char *)result->iname;
result->name = kname;
len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
if (unlikely(len < 0)) {
__putname(result);
return ERR_PTR(len);
}
/*
* Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
* separate struct filename so we can dedicate the entire
* names_cache allocation for the pathname, and re-do the copy from
* userland.
*/
if (unlikely(len == EMBEDDED_NAME_MAX)) {
const size_t size = offsetof(struct filename, iname[1]);
kname = (char *)result;
/*
* size is chosen that way we to guarantee that
* result->iname[0] is within the same object and that
* kname can't be equal to result->iname, no matter what.
*/
result = kzalloc(size, GFP_KERNEL);
if (unlikely(!result)) {
__putname(kname);
return ERR_PTR(-ENOMEM);
}
result->name = kname;
len = strncpy_from_user(kname, filename, PATH_MAX);
if (unlikely(len < 0)) {
__putname(kname);
kfree(result);
return ERR_PTR(len);
}
if (unlikely(len == PATH_MAX)) { __putname(kname);
kfree(result);
return ERR_PTR(-ENAMETOOLONG);
}
}
result->refcnt = 1;
/* The empty path is special. */
if (unlikely(!len)) {
if (empty) *empty = 1; if (!(flags & LOOKUP_EMPTY)) { putname(result);
return ERR_PTR(-ENOENT);
}
}
result->uptr = filename;
result->aname = NULL;
audit_getname(result);
return result;
}
struct filename *
getname_uflags(const char __user *filename, int uflags)
{
int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
return getname_flags(filename, flags, NULL);
}
struct filename *
getname(const char __user * filename)
{
return getname_flags(filename, 0, NULL);
}
struct filename *
getname_kernel(const char * filename)
{
struct filename *result;
int len = strlen(filename) + 1;
result = __getname();
if (unlikely(!result))
return ERR_PTR(-ENOMEM);
if (len <= EMBEDDED_NAME_MAX) { result->name = (char *)result->iname; } else if (len <= PATH_MAX) {
const size_t size = offsetof(struct filename, iname[1]);
struct filename *tmp;
tmp = kmalloc(size, GFP_KERNEL);
if (unlikely(!tmp)) {
__putname(result);
return ERR_PTR(-ENOMEM);
}
tmp->name = (char *)result;
result = tmp;
} else {
__putname(result);
return ERR_PTR(-ENAMETOOLONG);
}
memcpy((char *)result->name, filename, len);
result->uptr = NULL;
result->aname = NULL;
result->refcnt = 1;
audit_getname(result);
return result;
}
void putname(struct filename *name)
{
if (IS_ERR(name))
return;
BUG_ON(name->refcnt <= 0); if (--name->refcnt > 0)
return;
if (name->name != name->iname) { __putname(name->name);
kfree(name);
} else
__putname(name);
}
/**
* check_acl - perform ACL permission checking
* @mnt_userns: user namespace of the mount the inode was found from
* @inode: inode to check permissions on
* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
*
* This function performs the ACL permission checking. Since this function
* retrieve POSIX acls it needs to know whether it is called from a blocking or
* non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
static int check_acl(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *acl;
if (mask & MAY_NOT_BLOCK) { acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
if (!acl)
return -EAGAIN;
/* no ->get_acl() calls in RCU mode... */
if (is_uncached_acl(acl))
return -ECHILD;
return posix_acl_permission(mnt_userns, inode, acl, mask);
}
acl = get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl) { int error = posix_acl_permission(mnt_userns, inode, acl, mask);
posix_acl_release(acl);
return error;
}
#endif
return -EAGAIN;
}
/**
* acl_permission_check - perform basic UNIX permission checking
* @mnt_userns: user namespace of the mount the inode was found from
* @inode: inode to check permissions on
* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
*
* This function performs the basic UNIX permission checking. Since this
* function may retrieve POSIX acls it needs to know whether it is called from a
* blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
static int acl_permission_check(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
unsigned int mode = inode->i_mode;
kuid_t i_uid;
/* Are we the owner? If so, ACL's don't matter */
i_uid = i_uid_into_mnt(mnt_userns, inode);
if (likely(uid_eq(current_fsuid(), i_uid))) {
mask &= 7;
mode >>= 6;
return (mask & ~mode) ? -EACCES : 0;
}
/* Do we have ACL's? */
if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
int error = check_acl(mnt_userns, inode, mask);
if (error != -EAGAIN)
return error;
}
/* Only RWX matters for group/other mode bits */
mask &= 7;
/*
* Are the group permissions different from
* the other permissions in the bits we care
* about? Need to check group ownership if so.
*/
if (mask & (mode ^ (mode >> 3))) {
kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
if (in_group_p(kgid))
mode >>= 3;
}
/* Bits in 'mode' clear that we require? */
return (mask & ~mode) ? -EACCES : 0;
}
/**
* generic_permission - check for access rights on a Posix-like filesystem
* @mnt_userns: user namespace of the mount the inode was found from
* @inode: inode to check access rights for
* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
* %MAY_NOT_BLOCK ...)
*
* Used to check for read/write/execute permissions on a file.
* We use "fsuid" for this, letting us set arbitrary permissions
* for filesystem access without changing the "normal" uids which
* are used for other things.
*
* generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
* request cannot be satisfied (eg. requires blocking or too much complexity).
* It would then be called again in ref-walk mode.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
int generic_permission(struct user_namespace *mnt_userns, struct inode *inode,
int mask)
{
int ret;
/*
* Do the basic permission checks.
*/
ret = acl_permission_check(mnt_userns, inode, mask);
if (ret != -EACCES)
return ret;
if (S_ISDIR(inode->i_mode)) {
/* DACs are overridable for directories */
if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(mnt_userns, inode,
CAP_DAC_READ_SEARCH))
return 0;
if (capable_wrt_inode_uidgid(mnt_userns, inode,
CAP_DAC_OVERRIDE))
return 0;
return -EACCES;
}
/*
* Searching includes executable on directories, else just read.
*/
mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
if (mask == MAY_READ)
if (capable_wrt_inode_uidgid(mnt_userns, inode,
CAP_DAC_READ_SEARCH))
return 0;
/*
* Read/write DACs are always overridable.
* Executable DACs are overridable when there is
* at least one exec bit set.
*/
if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(mnt_userns, inode,
CAP_DAC_OVERRIDE))
return 0;
return -EACCES;
}
EXPORT_SYMBOL(generic_permission);
/**
* do_inode_permission - UNIX permission checking
* @mnt_userns: user namespace of the mount the inode was found from
* @inode: inode to check permissions on
* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
*
* We _really_ want to just do "generic_permission()" without
* even looking at the inode->i_op values. So we keep a cache
* flag in inode->i_opflags, that says "this has not special
* permission function, use the fast case".
*/
static inline int do_inode_permission(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) return inode->i_op->permission(mnt_userns, inode, mask);
/* This gets set once for the inode lifetime */
spin_lock(&inode->i_lock);
inode->i_opflags |= IOP_FASTPERM;
spin_unlock(&inode->i_lock);
}
return generic_permission(mnt_userns, inode, mask);
}
/**
* sb_permission - Check superblock-level permissions
* @sb: Superblock of inode to check permission on
* @inode: Inode to check permission on
* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
*
* Separate out file-system wide checks from inode-specific permission checks.
*/
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode;
/* Nobody gets write access to a read-only fs. */
if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
return -EROFS;
}
return 0;
}
/**
* inode_permission - Check for access rights to a given inode
* @mnt_userns: User namespace of the mount the inode was found from
* @inode: Inode to check permission on
* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
*
* Check for read/write/execute permissions on an inode. We use fs[ug]id for
* this, letting us set arbitrary permissions for filesystem access without
* changing the "normal" UIDs which are used for other things.
*
* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
*/
int inode_permission(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
int retval;
retval = sb_permission(inode->i_sb, inode, mask);
if (retval)
return retval;
if (unlikely(mask & MAY_WRITE)) {
/*
* Nobody gets write access to an immutable file.
*/
if (IS_IMMUTABLE(inode))
return -EPERM;
/*
* Updating mtime will likely cause i_uid and i_gid to be
* written back improperly if their true value is unknown
* to the vfs.
*/
if (HAS_UNMAPPED_ID(mnt_userns, inode))
return -EACCES;
}
retval = do_inode_permission(mnt_userns, inode, mask);
if (retval)
return retval;
retval = devcgroup_inode_permission(inode, mask);
if (retval)
return retval;
return security_inode_permission(inode, mask);
}
EXPORT_SYMBOL(inode_permission);
/**
* path_get - get a reference to a path
* @path: path to get the reference to
*
* Given a path increment the reference count to the dentry and the vfsmount.
*/
void path_get(const struct path *path)
{
mntget(path->mnt);
dget(path->dentry);
}
EXPORT_SYMBOL(path_get);
/**
* path_put - put a reference to a path
* @path: path to put the reference to
*
* Given a path decrement the reference count to the dentry and the vfsmount.
*/
void path_put(const struct path *path)
{
dput(path->dentry);
mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);
#define EMBEDDED_LEVELS 2
struct nameidata {
struct path path;
struct qstr last;
struct path root;
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags, state;
unsigned seq, m_seq, r_seq;
int last_type;
unsigned depth;
int total_link_count;
struct saved {
struct path link;
struct delayed_call done;
const char *name;
unsigned seq;
} *stack, internal[EMBEDDED_LEVELS];
struct filename *name;
struct nameidata *saved;
unsigned root_seq;
int dfd;
kuid_t dir_uid;
umode_t dir_mode;
} __randomize_layout;
#define ND_ROOT_PRESET 1
#define ND_ROOT_GRABBED 2
#define ND_JUMPED 4
static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
struct nameidata *old = current->nameidata;
p->stack = p->internal;
p->depth = 0;
p->dfd = dfd;
p->name = name;
p->path.mnt = NULL;
p->path.dentry = NULL;
p->total_link_count = old ? old->total_link_count : 0;
p->saved = old;
current->nameidata = p;
}
static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
const struct path *root)
{
__set_nameidata(p, dfd, name);
p->state = 0;
if (unlikely(root)) {
p->state = ND_ROOT_PRESET;
p->root = *root;
}
}
static void restore_nameidata(void)
{
struct nameidata *now = current->nameidata, *old = now->saved;
current->nameidata = old;
if (old)
old->total_link_count = now->total_link_count; if (now->stack != now->internal) kfree(now->stack);
}
static bool nd_alloc_stack(struct nameidata *nd)
{
struct saved *p;
p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
if (unlikely(!p))
return false;
memcpy(p, nd->internal, sizeof(nd->internal));
nd->stack = p;
return true;}
/**
* path_connected - Verify that a dentry is below mnt.mnt_root
*
* Rename can sometimes move a file or directory outside of a bind
* mount, path_connected allows those cases to be detected.
*/
static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
{
struct super_block *sb = mnt->mnt_sb;
/* Bind mounts can have disconnected paths */
if (mnt->mnt_root == sb->s_root)
return true;
return is_subdir(dentry, mnt->mnt_root);
}
static void drop_links(struct nameidata *nd)
{
int i = nd->depth;
while (i--) {
struct saved *last = nd->stack + i;
do_delayed_call(&last->done);
clear_delayed_call(&last->done);
}
}
static void terminate_walk(struct nameidata *nd)
{
drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) {
int i;
path_put(&nd->path);
for (i = 0; i < nd->depth; i++)
path_put(&nd->stack[i].link); if (nd->state & ND_ROOT_GRABBED) {
path_put(&nd->root);
nd->state &= ~ND_ROOT_GRABBED;
}
} else {
nd->flags &= ~LOOKUP_RCU;
rcu_read_unlock();
}
nd->depth = 0;
nd->path.mnt = NULL;
nd->path.dentry = NULL;
}
/* path_put is needed afterwards regardless of success or failure */
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
{
int res = __legitimize_mnt(path->mnt, mseq);
if (unlikely(res)) {
if (res > 0) path->mnt = NULL; path->dentry = NULL;
return false;
}
if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
path->dentry = NULL;
return false;
}
return !read_seqcount_retry(&path->dentry->d_seq, seq);
}
static inline bool legitimize_path(struct nameidata *nd,
struct path *path, unsigned seq)
{
return __legitimize_path(path, seq, nd->m_seq);
}
static bool legitimize_links(struct nameidata *nd)
{
int i;
if (unlikely(nd->flags & LOOKUP_CACHED)) {
drop_links(nd);
nd->depth = 0;
return false;
}
for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i;
if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
drop_links(nd); nd->depth = i + 1; return false;
}
}
return true;
}
static bool legitimize_root(struct nameidata *nd)
{
/*
* For scoped-lookups (where nd->root has been zeroed), we need to
* restart the whole lookup from scratch -- because set_root() is wrong
* for these lookups (nd->dfd is the root, not the filesystem root).
*/
if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
return false;
/* Nothing to do if nd->root is zero or is managed by the VFS user. */
if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
return true;
nd->state |= ND_ROOT_GRABBED;
return legitimize_path(nd, &nd->root, nd->root_seq);
}
/*
* Path walking has 2 modes, rcu-walk and ref-walk (see
* Documentation/filesystems/path-lookup.txt). In situations when we can't
* continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
* normal reference counts on dentries and vfsmounts to transition to ref-walk
* mode. Refcounts are grabbed at the last known good point before rcu-walk
* got stuck, so ref-walk may continue from there. If this is not successful
* (eg. a seqcount has changed), then failure is returned and it's up to caller
* to restart the path walk from the beginning in ref-walk mode.
*/
/**
* try_to_unlazy - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
* Returns: true on success, false on failure
*
* try_to_unlazy attempts to legitimize the current nd->path and nd->root
* for ref-walk mode.
* Must be called from rcu-walk context.
* Nothing should touch nameidata between try_to_unlazy() failure and
* terminate_walk().
*/
static bool try_to_unlazy(struct nameidata *nd)
{
struct dentry *parent = nd->path.dentry; BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU;
if (unlikely(!legitimize_links(nd)))
goto out1;
if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
goto out;
if (unlikely(!legitimize_root(nd)))
goto out;
rcu_read_unlock();
BUG_ON(nd->inode != parent->d_inode);
return true;
out1:
nd->path.mnt = NULL;
nd->path.dentry = NULL;
out:
rcu_read_unlock();
return false;
}
/**
* try_to_unlazy_next - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
* @dentry: next dentry to step into
* @seq: seq number to check @dentry against
* Returns: true on success, false on failure
*
* Similar to to try_to_unlazy(), but here we have the next dentry already
* picked by rcu-walk and want to legitimize that in addition to the current
* nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context.
* Nothing should touch nameidata between try_to_unlazy_next() failure and
* terminate_walk().
*/
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
BUG_ON(!(nd->flags & LOOKUP_RCU));
nd->flags &= ~LOOKUP_RCU;
if (unlikely(!legitimize_links(nd)))
goto out2;
if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
goto out2;
if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
goto out1;
/*
* We need to move both the parent and the dentry from the RCU domain
* to be properly refcounted. And the sequence number in the dentry
* validates *both* dentry counters, since we checked the sequence
* number of the parent after we got the child sequence number. So we
* know the parent must still be valid if the child sequence number is
*/
if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
goto out;
if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
goto out_dput;
/*
* Sequence counts matched. Now make sure that the root is
* still valid and get it if required.
*/
if (unlikely(!legitimize_root(nd)))
goto out_dput;
rcu_read_unlock();
return true;
out2:
nd->path.mnt = NULL;
out1:
nd->path.dentry = NULL;
out:
rcu_read_unlock();
return false;
out_dput:
rcu_read_unlock();
dput(dentry);
return false;
}
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
{
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) return dentry->d_op->d_revalidate(dentry, flags);
else
return 1;
}
/**
* complete_walk - successful completion of path walk
* @nd: pointer nameidata
*
* If we had been in RCU mode, drop out of it and legitimize nd->path.
* Revalidate the final result, unless we'd already done that during
* the path walk or the filesystem doesn't ask for it. Return 0 on
* success, -error on failure. In case of failure caller does not
* need to drop nd->path.
*/
static int complete_walk(struct nameidata *nd)
{
struct dentry *dentry = nd->path.dentry;
int status;
if (nd->flags & LOOKUP_RCU) {
/*
* We don't want to zero nd->root for scoped-lookups or
* externally-managed nd->root.
*/
if (!(nd->state & ND_ROOT_PRESET))
if (!(nd->flags & LOOKUP_IS_SCOPED))
nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd))
return -ECHILD;
}
if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
/*
* While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
* ever step outside the root during lookup" and should already
* be guaranteed by the rest of namei, we want to avoid a namei
* BUG resulting in userspace being given a path that was not
* scoped within the root at some point during the lookup.
*
* So, do a final sanity-check to make sure that in the
* worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
* we won't silently return an fd completely outside of the
* requested root to userspace.
*
* Userspace could move the path outside the root after this
* check, but as discussed elsewhere this is not a concern (the
* resolved file was inside the root at some point).
*/
if (!path_is_under(&nd->path, &nd->root))
return -EXDEV;
}
if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
return 0;
status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
if (status > 0)
return 0;
if (!status)
status = -ESTALE;
return status;
}
static int set_root(struct nameidata *nd)
{
struct fs_struct *fs = current->fs;
/*
* Jumping to the real root in a scoped-lookup is a BUG in namei, but we
* still have to ensure it doesn't happen because it will cause a breakout
* from the dirfd.
*/
if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
return -ENOTRECOVERABLE;
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
do {
seq = read_seqcount_begin(&fs->seq);
nd->root = fs->root;
nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_root(fs, &nd->root);
nd->state |= ND_ROOT_GRABBED;
}
return 0;
}
static int nd_jump_root(struct nameidata *nd)
{
if (unlikely(nd->flags & LOOKUP_BENEATH))
return -EXDEV;
if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
/* Absolute path arguments to path_init() are allowed. */
if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
return -EXDEV;
}
if (!nd->root.mnt) { int error = set_root(nd); if (error)
return error;
}
if (nd->flags & LOOKUP_RCU) {
struct dentry *d;
nd->path = nd->root;
d = nd->path.dentry;
nd->inode = d->d_inode;
nd->seq = nd->root_seq;
if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
return -ECHILD;
} else {
path_put(&nd->path);
nd->path = nd->root;
path_get(&nd->path);
nd->inode = nd->path.dentry->d_inode;
}
nd->state |= ND_JUMPED; return 0;
}
/*
* Helper to directly jump to a known parsed path from ->get_link,
* caller must have taken a reference to path beforehand.
*/
int nd_jump_link(struct path *path)
{
int error = -ELOOP;
struct nameidata *nd = current->nameidata;
if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
goto err;
error = -EXDEV;
if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
if (nd->path.mnt != path->mnt)
goto err;
}
/* Not currently safe for scoped-lookups. */
if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
goto err;
path_put(&nd->path);
nd->path = *path;
nd->inode = nd->path.dentry->d_inode;
nd->state |= ND_JUMPED;
return 0;
err:
path_put(path);
return error;
}
static inline void put_link(struct nameidata *nd)
{
struct saved *last = nd->stack + --nd->depth;
do_delayed_call(&last->done);
if (!(nd->flags & LOOKUP_RCU))
path_put(&last->link);
}
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
int sysctl_protected_fifos __read_mostly;
int sysctl_protected_regular __read_mostly;
/**
* may_follow_link - Check symlink following for unsafe situations
* @nd: nameidata pathwalk data
*
* In the case of the sysctl_protected_symlinks sysctl being enabled,
* CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
* in a sticky world-writable directory. This is to protect privileged
* processes from failing races against path names that may change out
* from under them by way of other users creating malicious symlinks.
* It will permit symlinks to be followed only when outside a sticky
* world-writable directory, or when the uid of the symlink and follower
* match, or when the directory owner matches the symlink's owner.
*
* Returns 0 if following the symlink is allowed, -ve on error.
*/
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
{
struct user_namespace *mnt_userns;
kuid_t i_uid;
if (!sysctl_protected_symlinks)
return 0;
mnt_userns = mnt_user_ns(nd->path.mnt);
i_uid = i_uid_into_mnt(mnt_userns, inode);
/* Allowed if owner and follower match. */
if (uid_eq(current_cred()->fsuid, i_uid))
return 0;
/* Allowed if parent directory not sticky and world-writable. */
if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
return 0;
/* Allowed if parent directory and link owner match. */
if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
return 0;
if (nd->flags & LOOKUP_RCU)
return -ECHILD;
audit_inode(nd->name, nd->stack[0].link.dentry, 0); audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
return -EACCES;
}
/**
* safe_hardlink_source - Check for safe hardlink conditions
* @mnt_userns: user namespace of the mount the inode was found from
* @inode: the source inode to hardlink from
*
* Return false if at least one of the following conditions:
* - inode is not a regular file
* - inode is setuid
* - inode is setgid and group-exec
* - access failure for read and write
*
* Otherwise returns true.
*/
static bool safe_hardlink_source(struct user_namespace *mnt_userns,
struct inode *inode)
{
umode_t mode = inode->i_mode;
/* Special files should not get pinned to the filesystem. */
if (!S_ISREG(mode))
return false;
/* Setuid files should not get pinned to the filesystem. */
if (mode & S_ISUID)
return false;
/* Executable setgid files should not get pinned to the filesystem. */
if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
return false;
/* Hardlinking to unreadable or unwritable sources is dangerous. */
if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE))
return false;
return true;
}
/**
* may_linkat - Check permissions for creating a hardlink
* @mnt_userns: user namespace of the mount the inode was found from
* @link: the source to hardlink from
*
* Block hardlink when all of:
* - sysctl_protected_hardlinks enabled
* - fsuid does not match inode
* - hardlink source is unsafe (see safe_hardlink_source() above)
* - not CAP_FOWNER in a namespace with the inode owner uid mapped
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*
* Returns 0 if successful, -ve on error.
*/
int may_linkat(struct user_namespace *mnt_userns, struct path *link)
{
struct inode *inode = link->dentry->d_inode;
/* Inode writeback is not safe when the uid or gid are invalid. */
if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
!gid_valid(i_gid_into_mnt(mnt_userns, inode)))
return -EOVERFLOW;
if (!sysctl_protected_hardlinks) return 0;
/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
* otherwise, it must be a safe source.
*/
if (safe_hardlink_source(mnt_userns, inode) ||
inode_owner_or_capable(mnt_userns, inode))
return 0;
audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
return -EPERM;
}
/**
* may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
* should be allowed, or not, on files that already
* exist.
* @mnt_userns: user namespace of the mount the inode was found from
* @nd: nameidata pathwalk data
* @inode: the inode of the file to open
*
* Block an O_CREAT open of a FIFO (or a regular file) when:
* - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
* - the file already exists
* - we are in a sticky directory
* - we don't own the file
* - the owner of the directory doesn't own the file
* - the directory is world writable
* If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
* the directory doesn't have to be world writable: being group writable will
* be enough.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*
* Returns 0 if the open is allowed, -ve on error.
*/
static int may_create_in_sticky(struct user_namespace *mnt_userns,
struct nameidata *nd, struct inode *const inode)
{
umode_t dir_mode = nd->dir_mode; kuid_t dir_uid = nd->dir_uid; if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
likely(!(dir_mode & S_ISVTX)) ||
uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
return 0;
if (likely(dir_mode & 0002) || (dir_mode & 0020 && ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { const char *operation = S_ISFIFO(inode->i_mode) ?
"sticky_create_fifo" :
"sticky_create_regular";
audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
return -EACCES;
}
return 0;
}
/*
* follow_up - Find the mountpoint of path's vfsmount
*
* Given a path, find the mountpoint of its source file system.
* Replace @path with the path of the mountpoint in the parent mount.
* Up is towards /.
*
* Return 1 if we went up a level and 0 if we were already at the
* root.
*/
int follow_up(struct path *path)
{
struct mount *mnt = real_mount(path->mnt);
struct mount *parent;
struct dentry *mountpoint;
read_seqlock_excl(&mount_lock);
parent = mnt->mnt_parent;
if (parent == mnt) {
read_sequnlock_excl(&mount_lock);
return 0;
}
mntget(&parent->mnt);
mountpoint = dget(mnt->mnt_mountpoint);
read_sequnlock_excl(&mount_lock);
dput(path->dentry);
path->dentry = mountpoint;
mntput(path->mnt);
path->mnt = &parent->mnt;
return 1;
}
EXPORT_SYMBOL(follow_up);
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
struct path *path, unsigned *seqp)
{
while (mnt_has_parent(m)) {
struct dentry *mountpoint = m->mnt_mountpoint;
m = m->mnt_parent;
if (unlikely(root->dentry == mountpoint &&
root->mnt == &m->mnt))
break;
if (mountpoint != m->mnt.mnt_root) { path->mnt = &m->mnt;
path->dentry = mountpoint;
*seqp = read_seqcount_begin(&mountpoint->d_seq);
return true;
}
}
return false;
}
static bool choose_mountpoint(struct mount *m, const struct path *root,
struct path *path)
{
bool found;
rcu_read_lock();
while (1) {
unsigned seq, mseq = read_seqbegin(&mount_lock);
found = choose_mountpoint_rcu(m, root, path, &seq);
if (unlikely(!found)) {
if (!read_seqretry(&mount_lock, mseq))
break;
} else {
if (likely(__legitimize_path(path, seq, mseq)))
break;
rcu_read_unlock();
path_put(path);
rcu_read_lock();
}
}
rcu_read_unlock();
return found;
}
/*
* Perform an automount
* - return -EISDIR to tell follow_managed() to stop and return the path we
* were called with.
*/
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
{
struct dentry *dentry = path->dentry;
/* We don't want to mount if someone's just doing a stat -
* unless they're stat'ing a directory and appended a '/' to
* the name.
*
* We do, however, want to mount if someone wants to open or
* create a file of any type under the mountpoint, wants to
* traverse through the mountpoint or wants to open the
* mounted directory. Also, autofs may mark negative dentries
* as being automount points. These will need the attentions
* of the daemon to instantiate them before they can be used.
*/
if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
dentry->d_inode)
return -EISDIR;
if (count && (*count)++ >= MAXSYMLINKS)
return -ELOOP;
return finish_automount(dentry->d_op->d_automount(path), path);
}
/*
* mount traversal - out-of-line part. One note on ->d_flags accesses -
* dentries are pinned but not locked here, so negative dentry can go
* positive right under us. Use of smp_load_acquire() provides a barrier
* sufficient for ->d_inode and ->d_flags consistency.
*/
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
int *count, unsigned lookup_flags)
{
struct vfsmount *mnt = path->mnt;
bool need_mntput = false;
int ret = 0;
while (flags & DCACHE_MANAGED_DENTRY) {
/* Allow the filesystem to manage the transit without i_mutex
* being held. */
if (flags & DCACHE_MANAGE_TRANSIT) { ret = path->dentry->d_op->d_manage(path, false);
flags = smp_load_acquire(&path->dentry->d_flags);
if (ret < 0)
break;
}
if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path);
if (mounted) { // ... in our namespace
dput(path->dentry);
if (need_mntput)
mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root);
// here we know it's positive
flags = path->dentry->d_flags;
need_mntput = true;
continue;
}
}
if (!(flags & DCACHE_NEED_AUTOMOUNT))
break;
// uncovered automount point
ret = follow_automount(path, count, lookup_flags);
flags = smp_load_acquire(&path->dentry->d_flags);
if (ret < 0)
break;
}
if (ret == -EISDIR)
ret = 0;
// possible if you race with several mount --move
if (need_mntput && path->mnt == mnt) mntput(path->mnt); if (!ret && unlikely(d_flags_negative(flags)))
ret = -ENOENT;
*jumped = need_mntput;
return ret;
}
static inline int traverse_mounts(struct path *path, bool *jumped,
int *count, unsigned lookup_flags)
{
unsigned flags = smp_load_acquire(&path->dentry->d_flags);
/* fastpath */
if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
*jumped = false;
if (unlikely(d_flags_negative(flags)))
return -ENOENT;
return 0;
}
return __traverse_mounts(path, flags, jumped, count, lookup_flags);
}
int follow_down_one(struct path *path)
{
struct vfsmount *mounted;
mounted = lookup_mnt(path);
if (mounted) {
dput(path->dentry);
mntput(path->mnt);
path->mnt = mounted;
path->dentry = dget(mounted->mnt_root);
return 1;
}
return 0;
}
EXPORT_SYMBOL(follow_down_one);
/*
* Follow down to the covering mount currently visible to userspace. At each
* point, the filesystem owning that dentry may be queried as to whether the
* caller is permitted to proceed or not.
*/
int follow_down(struct path *path)
{
struct vfsmount *mnt = path->mnt;
bool jumped;
int ret = traverse_mounts(path, &jumped, NULL, 0);
if (path->mnt != mnt)
mntput(mnt);
return ret;
}
EXPORT_SYMBOL(follow_down);
/*
* Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
* we meet a managed dentry that would need blocking.
*/
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
struct inode **inode, unsigned *seqp)
{
struct dentry *dentry = path->dentry;
unsigned int flags = dentry->d_flags; if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
return true;
if (unlikely(nd->flags & LOOKUP_NO_XDEV))
return false;
for (;;) {
/*
* Don't forget we might have a non-mountpoint managed dentry
* that wants to block transit.
*/
if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { int res = dentry->d_op->d_manage(path, true);
if (res)
return res == -EISDIR; flags = dentry->d_flags;
}
if (flags & DCACHE_MOUNTED) { struct mount *mounted = __lookup_mnt(path->mnt, dentry);
if (mounted) {
path->mnt = &mounted->mnt;
dentry = path->dentry = mounted->mnt.mnt_root;
nd->state |= ND_JUMPED;
*seqp = read_seqcount_begin(&dentry->d_seq);
*inode = dentry->d_inode;
/*
* We don't need to re-check ->d_seq after this
* ->d_inode read - there will be an RCU delay
* between mount hash removal and ->mnt_root
* becoming unpinned.
*/
flags = dentry->d_flags;
continue;
}
if (read_seqretry(&mount_lock, nd->m_seq))
return false;
}
return !(flags & DCACHE_NEED_AUTOMOUNT);
}
}
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
struct path *path, struct inode **inode,
unsigned int *seqp)
{
bool jumped;
int ret;
path->mnt = nd->path.mnt;
path->dentry = dentry;
if (nd->flags & LOOKUP_RCU) {
unsigned int seq = *seqp;
if (unlikely(!*inode))
return -ENOENT;
if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
return 0;
if (!try_to_unlazy_next(nd, dentry, seq))
return -ECHILD;
// *path might've been clobbered by __follow_mount_rcu()
path->mnt = nd->path.mnt; path->dentry = dentry;
}
ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { if (unlikely(nd->flags & LOOKUP_NO_XDEV))
ret = -EXDEV;
else
nd->state |= ND_JUMPED;
}
if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt);
} else {
*inode = d_backing_inode(path->dentry);
*seqp = 0; /* out of RCU mode, so the value doesn't matter */
}
return ret;
}
/*
* This looks up the name in dcache and possibly revalidates the found dentry.
* NULL is returned if the dentry does not exist in the cache.
*/
static struct dentry *lookup_dcache(const struct qstr *name,
struct dentry *dir,
unsigned int flags)
{
struct dentry *dentry = d_lookup(dir, name);
if (dentry) {
int error = d_revalidate(dentry, flags);
if (unlikely(error <= 0)) {
if (!error) d_invalidate(dentry); dput(dentry);
return ERR_PTR(error);
}
}
return dentry;
}
/*
* Parent directory has inode locked exclusive. This is one
* and only case when ->lookup() gets called on non in-lookup
* dentries - as the matter of fact, this only gets called
* when directory is guaranteed to have no in-lookup children
* at all.
*/
static struct dentry *__lookup_hash(const struct qstr *name,
struct dentry *base, unsigned int flags)
{
struct dentry *dentry = lookup_dcache(name, base, flags);
struct dentry *old;
struct inode *dir = base->d_inode; if (dentry)
return dentry;
/* Don't create child dentry for a dead directory. */
if (unlikely(IS_DEADDIR(dir)))
return ERR_PTR(-ENOENT);
dentry = d_alloc(base, name);
if (unlikely(!dentry))
return ERR_PTR(-ENOMEM);
old = dir->i_op->lookup(dir, dentry, flags);
if (unlikely(old)) {
dput(dentry);
dentry = old;
}
return dentry;
}
static struct dentry *lookup_fast(struct nameidata *nd,
struct inode **inode,
unsigned *seqp)
{
struct dentry *dentry, *parent = nd->path.dentry;
int status = 1;
/*
* Rename seqlock is not required here because in the off chance
* of a false negative due to a concurrent rename, the caller is
* going to fall back to non-racy lookup.
*/
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
dentry = __d_lookup_rcu(parent, &nd->last, &seq);
if (unlikely(!dentry)) {
if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD);
return NULL;
}
/*
* This sequence count validates that the inode matches
* the dentry name information from lookup.
*/
*inode = d_backing_inode(dentry);
if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
return ERR_PTR(-ECHILD);
/*
* This sequence count validates that the parent had no
* changes while we did the lookup of the dentry above.
*
* The memory barrier in read_seqcount_begin of child is
* enough, we can use __read_seqcount_retry here.
*/
if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
return ERR_PTR(-ECHILD);
*seqp = seq; status = d_revalidate(dentry, nd->flags);
if (likely(status > 0))
return dentry;
if (!try_to_unlazy_next(nd, dentry, seq))
return ERR_PTR(-ECHILD);
if (status == -ECHILD)
/* we'd been told to redo it in non-rcu mode */
status = d_revalidate(dentry, nd->flags);
} else {
dentry = __d_lookup(parent, &nd->last);
if (unlikely(!dentry))
return NULL;
status = d_revalidate(dentry, nd->flags);
}
if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry);
return ERR_PTR(status);
}
return dentry;
}
/* Fast lookup failed, do it the slow way */
static struct dentry *__lookup_slow(const struct qstr *name,
struct dentry *dir,
unsigned int flags)
{
struct dentry *dentry, *old;
struct inode *inode = dir->d_inode;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
/* Don't go there if it's already dead */
if (unlikely(IS_DEADDIR(inode)))
return ERR_PTR(-ENOENT);
again:
dentry = d_alloc_parallel(dir, name, &wq);
if (IS_ERR(dentry))
return dentry;
if (unlikely(!d_in_lookup(dentry))) {
int error = d_revalidate(dentry, flags);
if (unlikely(error <= 0)) {
if (!error) { d_invalidate(dentry);
dput(dentry);
goto again;
}
dput(dentry);
dentry = ERR_PTR(error);
}
} else {
old = inode->i_op->lookup(inode, dentry, flags);
d_lookup_done(dentry);
if (unlikely(old)) { dput(dentry);
dentry = old;
}
}
return dentry;
}
static struct dentry *lookup_slow(const struct qstr *name,
struct dentry *dir,
unsigned int flags)
{
struct inode *inode = dir->d_inode;
struct dentry *res;
inode_lock_shared(inode);
res = __lookup_slow(name, dir, flags);
inode_unlock_shared(inode);
return res;
}
static inline int may_lookup(struct user_namespace *mnt_userns,
struct nameidata *nd)
{
if (nd->flags & LOOKUP_RCU) {
int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD || !try_to_unlazy(nd))
return err;
}
return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
}
static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
{
if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
return -ELOOP;
if (likely(nd->depth != EMBEDDED_LEVELS))
return 0;
if (likely(nd->stack != nd->internal))
return 0;
if (likely(nd_alloc_stack(nd)))
return 0;
if (nd->flags & LOOKUP_RCU) {
// we need to grab link before we do unlazy. And we can't skip
// unlazy even if we fail to grab the link - cleanup needs it
bool grabbed_link = legitimize_path(nd, link, seq); if (!try_to_unlazy(nd) != 0 || !grabbed_link)
return -ECHILD;
if (nd_alloc_stack(nd))
return 0;
}
return -ENOMEM;
}
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
static const char *pick_link(struct nameidata *nd, struct path *link,
struct inode *inode, unsigned seq, int flags)
{
struct saved *last;
const char *res;
int error = reserve_stack(nd, link, seq);
if (unlikely(error)) {
if (!(nd->flags & LOOKUP_RCU))
path_put(link);
return ERR_PTR(error);
}
last = nd->stack + nd->depth++;
last->link = *link;
clear_delayed_call(&last->done);
last->seq = seq;
if (flags & WALK_TRAILING) {
error = may_follow_link(nd, inode);
if (unlikely(error))
return ERR_PTR(error);
}
if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
return ERR_PTR(-ELOOP);
if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link);
cond_resched();
} else if (atime_needs_update(&last->link, inode)) { if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
touch_atime(&last->link);
}
error = security_inode_follow_link(link->dentry, inode,
nd->flags & LOOKUP_RCU);
if (unlikely(error))
return ERR_PTR(error);
res = READ_ONCE(inode->i_link);
if (!res) {
const char * (*get)(struct dentry *, struct inode *,
struct delayed_call *);
get = inode->i_op->get_link;
if (nd->flags & LOOKUP_RCU) {
res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
res = get(link->dentry, inode, &last->done);
} else {
res = get(link->dentry, inode, &last->done);
}
if (!res)
goto all_done;
if (IS_ERR(res))
return res;
}
if (*res == '/') { error = nd_jump_root(nd);
if (unlikely(error))
return ERR_PTR(error); while (unlikely(*++res == '/'))
;
}
if (*res)
return res;
all_done: // pure jump
put_link(nd);
return NULL;
}
/*
* Do we need to follow links? We _really_ want to be able
* to do this check without having to look at inode->i_op,
* so we keep a cache of "no, this doesn't need follow_link"
* for the common case.
*/
static const char *step_into(struct nameidata *nd, int flags,
struct dentry *dentry, struct inode *inode, unsigned seq)
{
struct path path;
int err = handle_mounts(nd, dentry, &path, &inode, &seq);
if (err < 0)
return ERR_PTR(err);
if (likely(!d_is_symlink(path.dentry)) ||
((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) {
/* not a symlink or should not follow */
if (!(nd->flags & LOOKUP_RCU)) { dput(nd->path.dentry);
if (nd->path.mnt != path.mnt)
mntput(nd->path.mnt);
}
nd->path = path;
nd->inode = inode;
nd->seq = seq;
return NULL;
}
if (nd->flags & LOOKUP_RCU) {
/* make sure that d_is_symlink above matches inode */
if (read_seqcount_retry(&path.dentry->d_seq, seq))
return ERR_PTR(-ECHILD);
} else {
if (path.mnt == nd->path.mnt) mntget(path.mnt);
}
return pick_link(nd, &path, inode, seq, flags);
}
static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
struct inode **inodep,
unsigned *seqp)
{
struct dentry *parent, *old;
if (path_equal(&nd->path, &nd->root))
goto in_root;
if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
struct path path;
unsigned seq;
if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
&nd->root, &path, &seq))
goto in_root;
if (unlikely(nd->flags & LOOKUP_NO_XDEV))
return ERR_PTR(-ECHILD);
nd->path = path;
nd->inode = path.dentry->d_inode;
nd->seq = seq;
if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
return ERR_PTR(-ECHILD);
/* we know that mountpoint was pinned */
}
old = nd->path.dentry;
parent = old->d_parent;
*inodep = parent->d_inode;
*seqp = read_seqcount_begin(&parent->d_seq);
if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
return ERR_PTR(-ECHILD);
if (unlikely(!path_connected(nd->path.mnt, parent)))
return ERR_PTR(-ECHILD);
return parent;
in_root:
if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
return ERR_PTR(-ECHILD);
if (unlikely(nd->flags & LOOKUP_BENEATH))
return ERR_PTR(-ECHILD);
return NULL;
}
static struct dentry *follow_dotdot(struct nameidata *nd,
struct inode **inodep,
unsigned *seqp)
{
struct dentry *parent;
if (path_equal(&nd->path, &nd->root))
goto in_root;
if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
struct path path;
if (!choose_mountpoint(real_mount(nd->path.mnt),
&nd->root, &path))
goto in_root;
path_put(&nd->path);
nd->path = path;
nd->inode = path.dentry->d_inode;
if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-EXDEV);
}
/* rare case of legitimate dget_parent()... */
parent = dget_parent(nd->path.dentry);
if (unlikely(!path_connected(nd->path.mnt, parent))) {
dput(parent);
return ERR_PTR(-ENOENT);
}
*seqp = 0;
*inodep = parent->d_inode;
return parent;
in_root:
if (unlikely(nd->flags & LOOKUP_BENEATH))
return ERR_PTR(-EXDEV);
dget(nd->path.dentry);
return NULL;
}
static const char *handle_dots(struct nameidata *nd, int type)
{
if (type == LAST_DOTDOT) {
const char *error = NULL;
struct dentry *parent;
struct inode *inode;
unsigned seq;
if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); if (error)
return error;
}
if (nd->flags & LOOKUP_RCU)
parent = follow_dotdot_rcu(nd, &inode, &seq);
else
parent = follow_dotdot(nd, &inode, &seq);
if (IS_ERR(parent))
return ERR_CAST(parent);
if (unlikely(!parent)) error = step_into(nd, WALK_NOFOLLOW,
nd->path.dentry, nd->inode, nd->seq);
else
error = step_into(nd, WALK_NOFOLLOW,
parent, inode, seq);
if (unlikely(error))
return error;
if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
/*
* If there was a racing rename or mount along our
* path, then we can't be sure that ".." hasn't jumped
* above nd->root (and so userspace should retry or use
* some fallback).
*/
smp_rmb();
if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
return ERR_PTR(-EAGAIN);
if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
return ERR_PTR(-EAGAIN);
}
}
return NULL;
}
static const char *walk_component(struct nameidata *nd, int flags)
{
struct dentry *dentry;
struct inode *inode;
unsigned seq;
/*
* "." and ".." are special - ".." especially so because it has
* to be able to know about the current root directory and
* parent relationships.
*/
if (unlikely(nd->last_type != LAST_NORM)) { if (!(flags & WALK_MORE) && nd->depth)
put_link(nd);
return handle_dots(nd, nd->last_type);
}
dentry = lookup_fast(nd, &inode, &seq);
if (IS_ERR(dentry))
return ERR_CAST(dentry);
if (unlikely(!dentry)) { dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
if (IS_ERR(dentry))
return ERR_CAST(dentry);
}
if (!(flags & WALK_MORE) && nd->depth)
put_link(nd);
return step_into(nd, flags, dentry, inode, seq);
}
/*
* We can do the critical dentry name comparison and hashing
* operations one word at a time, but we are limited to:
*
* - Architectures with fast unaligned word accesses. We could
* do a "get_unaligned()" if this helps and is sufficiently
* fast.
*
* - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
* do not trap on the (extremely unlikely) case of a page
* crossing operation.
*
* - Furthermore, we need an efficient 64-bit compile for the
* 64-bit case in order to generate the "number of bytes in
* the final mask". Again, that could be replaced with a
* efficient population count instruction or similar.
*/
#ifdef CONFIG_DCACHE_WORD_ACCESS
#include <asm/word-at-a-time.h>
#ifdef HASH_MIX
/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
#elif defined(CONFIG_64BIT)
/*
* Register pressure in the mixing function is an issue, particularly
* on 32-bit x86, but almost any function requires one state value and
* one temporary. Instead, use a function designed for two state values
* and no temporaries.
*
* This function cannot create a collision in only two iterations, so
* we have two iterations to achieve avalanche. In those two iterations,
* we have six layers of mixing, which is enough to spread one bit's
* influence out to 2^6 = 64 state bits.
*
* Rotate constants are scored by considering either 64 one-bit input
* deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
* probability of that delta causing a change to each of the 128 output
* bits, using a sample of random initial states.
*
* The Shannon entropy of the computed probabilities is then summed
* to produce a score. Ideally, any input change has a 50% chance of
* toggling any given output bit.
*
* Mixing scores (in bits) for (12,45):
* Input delta: 1-bit 2-bit
* 1 round: 713.3 42542.6
* 2 rounds: 2753.7 140389.8
* 3 rounds: 5954.1 233458.2
* 4 rounds: 7862.6 256672.2
* Perfect: 8192 258048
* (64*128) (64*63/2 * 128)
*/
#define HASH_MIX(x, y, a) \
( x ^= (a), \
y ^= x, x = rol64(x,12),\
x += y, y = rol64(y,45),\
y *= 9 )
/*
* Fold two longs into one 32-bit hash value. This must be fast, but
* latency isn't quite as critical, as there is a fair bit of additional
* work done before the hash value is used.
*/
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
y ^= x * GOLDEN_RATIO_64;
y *= GOLDEN_RATIO_64;
return y >> 32;
}
#else /* 32-bit case */
/*
* Mixing scores (in bits) for (7,20):
* Input delta: 1-bit 2-bit
* 1 round: 330.3 9201.6
* 2 rounds: 1246.4 25475.4
* 3 rounds: 1907.1 31295.1
* 4 rounds: 2042.3 31718.6
* Perfect: 2048 31744
* (32*64) (32*31/2 * 64)
*/
#define HASH_MIX(x, y, a) \
( x ^= (a), \
y ^= x, x = rol32(x, 7),\
x += y, y = rol32(y,20),\
y *= 9 )
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
/* Use arch-optimized multiply if one exists */
return __hash_32(y ^ __hash_32(x));
}
#endif
/*
* Return the hash of a string of known length. This is carfully
* designed to match hash_name(), which is the more critical function.
* In particular, we must end by hashing a final word containing 0..7
* payload bytes, to match the way that hash_name() iterates until it
* finds the delimiter after the name.
*/
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
unsigned long a, x = 0, y = (unsigned long)salt;
for (;;) {
if (!len)
goto done;
a = load_unaligned_zeropad(name);
if (len < sizeof(unsigned long))
break;
HASH_MIX(x, y, a);
name += sizeof(unsigned long);
len -= sizeof(unsigned long);
}
x ^= a & bytemask_from_count(len);
done:
return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
/* Return the "hash_len" (hash and length) of a null-terminated string */
u64 hashlen_string(const void *salt, const char *name)
{
unsigned long a = 0, x = 0, y = (unsigned long)salt;
unsigned long adata, mask, len;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
len = 0;
goto inside;
do {
HASH_MIX(x, y, a);
len += sizeof(unsigned long);
inside:
a = load_unaligned_zeropad(name+len);
} while (!has_zero(a, &adata, &constants));
adata = prep_zero_mask(a, adata, &constants);
mask = create_zero_mask(adata);
x ^= a & zero_bytemask(mask);
return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);
/*
* Calculate the length and hash of the path component, and
* return the "hash_len" as the result.
*/
static inline u64 hash_name(const void *salt, const char *name)
{
unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
unsigned long adata, bdata, mask, len;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
len = 0;
goto inside;
do {
HASH_MIX(x, y, a);
len += sizeof(unsigned long);
inside:
a = load_unaligned_zeropad(name+len);
b = a ^ REPEAT_BYTE('/');
} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
adata = prep_zero_mask(a, adata, &constants);
bdata = prep_zero_mask(b, bdata, &constants);
mask = create_zero_mask(adata | bdata);
x ^= a & zero_bytemask(mask);
return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
/* Return the hash of a string of known length */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
unsigned long hash = init_name_hash(salt);
while (len--)
hash = partial_name_hash((unsigned char)*name++, hash);
return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
/* Return the "hash_len" (hash and length) of a null-terminated string */
u64 hashlen_string(const void *salt, const char *name)
{
unsigned long hash = init_name_hash(salt);
unsigned long len = 0, c;
c = (unsigned char)*name;
while (c) {
len++;
hash = partial_name_hash(c, hash);
c = (unsigned char)name[len];
}
return hashlen_create(end_name_hash(hash), len);
}
EXPORT_SYMBOL(hashlen_string);
/*
* We know there's a real path component here of at least
* one character.
*/
static inline u64 hash_name(const void *salt, const char *name)
{
unsigned long hash = init_name_hash(salt);
unsigned long len = 0, c;
c = (unsigned char)*name;
do {
len++;
hash = partial_name_hash(c, hash);
c = (unsigned char)name[len];
} while (c && c != '/');
return hashlen_create(end_name_hash(hash), len);
}
#endif
/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
* the final dentry. We expect 'base' to be positive and a directory.
*
* Returns 0 and nd will have valid dentry and mnt on success.
* Returns error and drops reference to input namei data on failure.
*/
static int link_path_walk(const char *name, struct nameidata *nd)
{
int depth = 0; // depth <= nd->depth
int err;
nd->last_type = LAST_ROOT;
nd->flags |= LOOKUP_PARENT;
if (IS_ERR(name))
return PTR_ERR(name);
while (*name=='/') name++; if (!*name) { nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0;
}
/* At this point we know we have a real path component. */
for(;;) {
struct user_namespace *mnt_userns;
const char *link;
u64 hash_len;
int type;
mnt_userns = mnt_user_ns(nd->path.mnt);
err = may_lookup(mnt_userns, nd);
if (err)
return err;
hash_len = hash_name(nd->path.dentry, name);
type = LAST_NORM;
if (name[0] == '.') switch (hashlen_len(hash_len)) {
case 2:
if (name[1] == '.') {
type = LAST_DOTDOT;
nd->state |= ND_JUMPED;
}
break;
case 1:
type = LAST_DOT;
}
if (likely(type == LAST_NORM)) {
struct dentry *parent = nd->path.dentry;
nd->state &= ~ND_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { struct qstr this = { { .hash_len = hash_len }, .name = name };
err = parent->d_op->d_hash(parent, &this);
if (err < 0)
return err; hash_len = this.hash_len;
name = this.name;
}
}
nd->last.hash_len = hash_len;
nd->last.name = name;
nd->last_type = type;
name += hashlen_len(hash_len);
if (!*name)
goto OK;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
*/
do {
name++;
} while (unlikely(*name == '/'));
if (unlikely(!*name)) {
OK:
/* pathname or trailing symlink, done */
if (!depth) { nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
nd->dir_mode = nd->inode->i_mode;
nd->flags &= ~LOOKUP_PARENT;
return 0;
}
/* last component of nested symlink */
name = nd->stack[--depth].name;
link = walk_component(nd, 0);
} else {
/* not the last component */
link = walk_component(nd, WALK_MORE);
}
if (unlikely(link)) {
if (IS_ERR(link))
return PTR_ERR(link);
/* a symlink to follow */
nd->stack[depth++].name = name;
name = link;
continue;
}
if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd))
return -ECHILD;
}
return -ENOTDIR;
}
}
}
/* must be paired with terminate_walk() */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
int error;
const char *s = nd->name->name;
/* LOOKUP_CACHED requires RCU, ask caller to retry */
if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
return ERR_PTR(-EAGAIN);
if (!*s)
flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU)
rcu_read_lock();
nd->flags = flags;
nd->state |= ND_JUMPED;
nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
smp_rmb();
if (nd->state & ND_ROOT_PRESET) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
if (*s && unlikely(!d_can_lookup(root)))
return ERR_PTR(-ENOTDIR);
nd->path = nd->root;
nd->inode = inode;
if (flags & LOOKUP_RCU) {
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq;
} else {
path_get(&nd->path);
}
return s;
}
nd->root.mnt = NULL;
/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { error = nd_jump_root(nd);
if (unlikely(error))
return ERR_PTR(error);
return s;
}
/* Relative pathname -- get the starting-point it is relative to. */
if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) {
struct fs_struct *fs = current->fs;
unsigned seq;
do {
seq = read_seqcount_begin(&fs->seq);
nd->path = fs->pwd;
nd->inode = nd->path.dentry->d_inode;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
nd->inode = nd->path.dentry->d_inode;
}
} else {
/* Caller must check execute permissions on the starting path component */
struct fd f = fdget_raw(nd->dfd);
struct dentry *dentry;
if (!f.file)
return ERR_PTR(-EBADF);
dentry = f.file->f_path.dentry;
if (*s && unlikely(!d_can_lookup(dentry))) {
fdput(f);
return ERR_PTR(-ENOTDIR);
}
nd->path = f.file->f_path;
if (flags & LOOKUP_RCU) {
nd->inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
} else {
path_get(&nd->path);
nd->inode = nd->path.dentry->d_inode;
}
fdput(f);
}
/* For scoped-lookups we need to set the root to the dirfd as well. */
if (flags & LOOKUP_IS_SCOPED) { nd->root = nd->path;
if (flags & LOOKUP_RCU) {
nd->root_seq = nd->seq;
} else {
path_get(&nd->root);
nd->state |= ND_ROOT_GRABBED;
}
}
return s;
}
static inline const char *lookup_last(struct nameidata *nd)
{
if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; return walk_component(nd, WALK_TRAILING);
}
static int handle_lookup_down(struct nameidata *nd)
{
if (!(nd->flags & LOOKUP_RCU))
dget(nd->path.dentry);
return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
nd->path.dentry, nd->inode, nd->seq));
}
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
const char *s = path_init(nd, flags);
int err;
if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
err = handle_lookup_down(nd);
if (unlikely(err < 0))
s = ERR_PTR(err);
}
while (!(err = link_path_walk(s, nd)) &&
(s = lookup_last(nd)) != NULL)
;
if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd);
nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
}
if (!err)
err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry))
err = -ENOTDIR;
if (!err) {
*path = nd->path;
nd->path.mnt = NULL;
nd->path.dentry = NULL;
}
terminate_walk(nd);
return err;
}
int filename_lookup(int dfd, struct filename *name, unsigned flags,
struct path *path, struct path *root)
{
int retval;
struct nameidata nd;
if (IS_ERR(name))
return PTR_ERR(name);
set_nameidata(&nd, dfd, name, root);
retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
if (unlikely(retval == -ECHILD))
retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) audit_inode(name, path->dentry,
flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
restore_nameidata(); return retval;
}
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
static int path_parentat(struct nameidata *nd, unsigned flags,
struct path *parent)
{
const char *s = path_init(nd, flags);
int err = link_path_walk(s, nd);
if (!err) err = complete_walk(nd);
if (!err) {
*parent = nd->path;
nd->path.mnt = NULL;
nd->path.dentry = NULL;
}
terminate_walk(nd);
return err;
}
/* Note: this does not consume "name" */
static int filename_parentat(int dfd, struct filename *name,
unsigned int flags, struct path *parent,
struct qstr *last, int *type)
{
int retval;
struct nameidata nd;
if (IS_ERR(name))
return PTR_ERR(name);
set_nameidata(&nd, dfd, name, NULL);
retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
if (unlikely(retval == -ECHILD))
retval = path_parentat(&nd, flags, parent); if (unlikely(retval == -ESTALE)) retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); if (likely(!retval)) { *last = nd.last;
*type = nd.last_type;
audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
}
restore_nameidata(); return retval;
}
/* does lookup, returns the object with parent locked */
static struct dentry *__kern_path_locked(struct filename *name, struct path *path)
{
struct dentry *d;
struct qstr last;
int type, error;
error = filename_parentat(AT_FDCWD, name, 0, path, &last, &type);
if (error)
return ERR_PTR(error);
if (unlikely(type != LAST_NORM)) {
path_put(path);
return ERR_PTR(-EINVAL);
}
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
d = __lookup_hash(&last, path->dentry, 0);
if (IS_ERR(d)) {
inode_unlock(path->dentry->d_inode);
path_put(path);
}
return d;
}
struct dentry *kern_path_locked(const char *name, struct path *path)
{
struct filename *filename = getname_kernel(name);
struct dentry *res = __kern_path_locked(filename, path);
putname(filename);
return res;
}
int kern_path(const char *name, unsigned int flags, struct path *path)
{
struct filename *filename = getname_kernel(name);
int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL);
putname(filename);
return ret;
}
EXPORT_SYMBOL(kern_path);
/**
* vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
* @dentry: pointer to dentry of the base directory
* @mnt: pointer to vfs mount of the base directory
* @name: pointer to file name
* @flags: lookup flags
* @path: pointer to struct path to fill
*/
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
const char *name, unsigned int flags,
struct path *path)
{
struct filename *filename;
struct path root = {.mnt = mnt, .dentry = dentry};
int ret;
filename = getname_kernel(name);
/* the first argument of filename_lookup() is ignored with root */
ret = filename_lookup(AT_FDCWD, filename, flags, path, &root);
putname(filename);
return ret;
}
EXPORT_SYMBOL(vfs_path_lookup);
static int lookup_one_common(struct user_namespace *mnt_userns,
const char *name, struct dentry *base, int len,
struct qstr *this)
{
this->name = name;
this->len = len;
this->hash = full_name_hash(base, name, len);
if (!len)
return -EACCES; if (unlikely(name[0] == '.')) { if (len < 2 || (len == 2 && name[1] == '.'))
return -EACCES;
}
while (len--) { unsigned int c = *(const unsigned char *)name++; if (c == '/' || c == '\0')
return -EACCES;
}
/*
* See if the low-level filesystem might want
* to use its own hash..
*/
if (base->d_flags & DCACHE_OP_HASH) { int err = base->d_op->d_hash(base, this);
if (err < 0)
return err;
}
return inode_permission(mnt_userns, base->d_inode, MAY_EXEC);
}
/**
* try_lookup_one_len - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
* @base: base directory to lookup from
* @len: maximum length @len should be interpreted to
*
* Look up a dentry by name in the dcache, returning NULL if it does not
* currently exist. The function does not try to create a dentry.
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
*
* The caller must hold base->i_mutex.
*/
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
struct qstr this;
int err;
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
err = lookup_one_common(&init_user_ns, name, base, len, &this);
if (err)
return ERR_PTR(err);
return lookup_dcache(&this, base, 0);
}
EXPORT_SYMBOL(try_lookup_one_len);
/**
* lookup_one_len - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
* @base: base directory to lookup from
* @len: maximum length @len should be interpreted to
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
*
* The caller must hold base->i_mutex.
*/
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
struct dentry *dentry;
struct qstr this;
int err;
WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(&init_user_ns, name, base, len, &this);
if (err)
return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one_len);
/**
* lookup_one - filesystem helper to lookup single pathname component
* @mnt_userns: user namespace of the mount the lookup is performed from
* @name: pathname component to lookup
* @base: base directory to lookup from
* @len: maximum length @len should be interpreted to
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
*
* The caller must hold base->i_mutex.
*/
struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name,
struct dentry *base, int len)
{
struct dentry *dentry;
struct qstr this;
int err;
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
err = lookup_one_common(mnt_userns, name, base, len, &this);
if (err)
return ERR_PTR(err);
dentry = lookup_dcache(&this, base, 0);
return dentry ? dentry : __lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one);
/**
* lookup_one_len_unlocked - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
* @base: base directory to lookup from
* @len: maximum length @len should be interpreted to
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
*
* Unlike lookup_one_len, it should be called without the parent
* i_mutex held, and will take the i_mutex itself if necessary.
*/
struct dentry *lookup_one_len_unlocked(const char *name,
struct dentry *base, int len)
{
struct qstr this;
int err;
struct dentry *ret;
err = lookup_one_common(&init_user_ns, name, base, len, &this);
if (err)
return ERR_PTR(err);
ret = lookup_dcache(&this, base, 0);
if (!ret)
ret = lookup_slow(&this, base, 0);
return ret;
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
/*
* Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
* on negatives. Returns known positive or ERR_PTR(); that's what
* most of the users want. Note that pinned negative with unlocked parent
* _can_ become positive at any time, so callers of lookup_one_len_unlocked()
* need to be very careful; pinned positives have ->d_inode stable, so
* this one avoids such problems.
*/
struct dentry *lookup_positive_unlocked(const char *name,
struct dentry *base, int len)
{
struct dentry *ret = lookup_one_len_unlocked(name, base, len);
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
dput(ret);
ret = ERR_PTR(-ENOENT);
}
return ret;
}
EXPORT_SYMBOL(lookup_positive_unlocked);
#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
/* Find something mounted on "pts" in the same directory as
* the input path.
*/
struct dentry *parent = dget_parent(path->dentry);
struct dentry *child;
struct qstr this = QSTR_INIT("pts", 3);
if (unlikely(!path_connected(path->mnt, parent))) {
dput(parent);
return -ENOENT;
}
dput(path->dentry);
path->dentry = parent;
child = d_hash_and_lookup(parent, &this);
if (!child)
return -ENOENT;
path->dentry = child;
dput(parent);
follow_down(path);
return 0;
}
#endif
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
struct filename *filename = getname_flags(name, flags, empty);
int ret = filename_lookup(dfd, filename, flags, path, NULL);
putname(filename);
return ret;
}
EXPORT_SYMBOL(user_path_at_empty);
int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
struct inode *inode)
{
kuid_t fsuid = current_fsuid();
if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
return 0;
if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
return 0;
return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
}
EXPORT_SYMBOL(__check_sticky);
/*
* Check whether we can remove a link victim from directory dir, check
* whether the type of victim is right.
* 1. We can't do it if dir is read-only (done in permission())
* 2. We should have write and exec permissions on dir
* 3. We can't remove anything from append-only dir
* 4. We can't do anything with immutable dir (done in permission())
* 5. If the sticky bit on dir is set we should either
* a. be owner of dir, or
* b. be owner of victim, or
* c. have CAP_FOWNER capability
* 6. If the victim is append-only or immutable we can't do antyhing with
* links pointing to it.
* 7. If the victim has an unknown uid or gid we can't change the inode.
* 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
* 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
* 10. We can't remove a root or mountpoint.
* 11. We don't allow removal of NFS sillyrenamed files; it's handled by
* nfs_async_unlink().
*/
static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *victim, bool isdir)
{
struct inode *inode = d_backing_inode(victim);
int error;
if (d_is_negative(victim))
return -ENOENT;
BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir);
/* Inode writeback is not safe when the uid or gid are invalid. */
if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
!gid_valid(i_gid_into_mnt(mnt_userns, inode)))
return -EOVERFLOW;
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
if (error)
return error;
if (IS_APPEND(dir))
return -EPERM;
if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) ||
IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
HAS_UNMAPPED_ID(mnt_userns, inode))
return -EPERM;
if (isdir) {
if (!d_is_dir(victim))
return -ENOTDIR; if (IS_ROOT(victim))
return -EBUSY;
} else if (d_is_dir(victim))
return -EISDIR;
if (IS_DEADDIR(dir))
return -ENOENT;
if (victim->d_flags & DCACHE_NFSFS_RENAMED)
return -EBUSY;
return 0;
}
/* Check whether we can create an object with dentry child in directory
* dir.
* 1. We can't do it if child already exists (open has special treatment for
* this case, but since we are inlined it's OK)
* 2. We can't do it if dir is read-only (done in permission())
* 3. We can't do it if the fs can't represent the fsuid or fsgid.
* 4. We should have write and exec permissions on dir
* 5. We can't do it if dir is immutable (done in permission())
*/
static inline int may_create(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *child)
{
audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
if (child->d_inode)
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
return -EOVERFLOW; return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
}
/*
* p1 and p2 should be directories on the same fs.
*/
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
struct dentry *p;
if (p1 == p2) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
return NULL;
}
mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
p = d_ancestor(p2, p1);
if (p) {
inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
return p;
}
p = d_ancestor(p1, p2);
if (p) {
inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
return p;
}
inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
return NULL;
}
EXPORT_SYMBOL(lock_rename);
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
inode_unlock(p1->d_inode);
if (p1 != p2) {
inode_unlock(p2->d_inode);
mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
}
}
EXPORT_SYMBOL(unlock_rename);
/**
* vfs_create - create new file
* @mnt_userns: user namespace of the mount the inode was found from
* @dir: inode of @dentry
* @dentry: pointer to dentry of the base directory
* @mode: mode of the new file
* @want_excl: whether the file must not yet exist
*
* Create a new file.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, bool want_excl)
{
int error = may_create(mnt_userns, dir, dentry);
if (error)
return error;
if (!dir->i_op->create)
return -EACCES; /* shouldn't it be ENOSYS? */
mode &= S_IALLUGO;
mode |= S_IFREG;
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl);
if (!error)
fsnotify_create(dir, dentry);
return error;
}
EXPORT_SYMBOL(vfs_create);
int vfs_mkobj(struct dentry *dentry, umode_t mode,
int (*f)(struct dentry *, umode_t, void *),
void *arg)
{
struct inode *dir = dentry->d_parent->d_inode;
int error = may_create(&init_user_ns, dir, dentry);
if (error)
return error;
mode &= S_IALLUGO;
mode |= S_IFREG;
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
error = f(dentry, mode, arg);
if (!error)
fsnotify_create(dir, dentry);
return error;
}
EXPORT_SYMBOL(vfs_mkobj);
bool may_open_dev(const struct path *path)
{
return !(path->mnt->mnt_flags & MNT_NODEV) && !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}
static int may_open(struct user_namespace *mnt_userns, const struct path *path,
int acc_mode, int flag)
{
struct dentry *dentry = path->dentry;
struct inode *inode = dentry->d_inode;
int error;
if (!inode)
return -ENOENT;
switch (inode->i_mode & S_IFMT) {
case S_IFLNK:
return -ELOOP;
case S_IFDIR:
if (acc_mode & MAY_WRITE)
return -EISDIR;
if (acc_mode & MAY_EXEC)
return -EACCES;
break;
case S_IFBLK:
case S_IFCHR:
if (!may_open_dev(path))
return -EACCES;
fallthrough;
case S_IFIFO:
case S_IFSOCK:
if (acc_mode & MAY_EXEC)
return -EACCES;
flag &= ~O_TRUNC;
break;
case S_IFREG:
if ((acc_mode & MAY_EXEC) && path_noexec(path))
return -EACCES;
break;
}
error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode);
if (error)
return error;
/*
* An append-only file must be opened in append mode for writing.
*/
if (IS_APPEND(inode)) { if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
return -EPERM;
if (flag & O_TRUNC)
return -EPERM;
}
/* O_NOATIME can only be set by the owner or superuser */
if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
return 0;
}
static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
{
const struct path *path = &filp->f_path;
struct inode *inode = path->dentry->d_inode;
int error = get_write_access(inode);
if (error)
return error;
/*
* Refuse to truncate files with mandatory locks held on them.
*/
error = security_path_truncate(path);
if (!error) {
error = do_truncate(mnt_userns, path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
filp);
}
put_write_access(inode);
return error;
}
static inline int open_to_namei_flags(int flag)
{
if ((flag & O_ACCMODE) == 3)
flag--;
return flag;
}
static int may_o_create(struct user_namespace *mnt_userns,
const struct path *dir, struct dentry *dentry,
umode_t mode)
{
int error = security_path_mknod(dir, dentry, mode, 0);
if (error)
return error;
if (!fsuidgid_has_mapping(dir->dentry->d_sb, mnt_userns))
return -EOVERFLOW;
error = inode_permission(mnt_userns, dir->dentry->d_inode,
MAY_WRITE | MAY_EXEC);
if (error)
return error;
return security_inode_create(dir->dentry->d_inode, dentry, mode);
}
/*
* Attempt to atomically look up, create and open a file from a negative
* dentry.
*
* Returns 0 if successful. The file will have been created and attached to
* @file by the filesystem calling finish_open().
*
* If the file was looked up only or didn't need creating, FMODE_OPENED won't
* be set. The caller will need to perform the open themselves. @path will
* have been updated to point to the new dentry. This may be negative.
*
* Returns an error code otherwise.
*/
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
struct file *file,
int open_flag, umode_t mode)
{
struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
struct inode *dir = nd->path.dentry->d_inode;
int error;
if (nd->flags & LOOKUP_DIRECTORY)
open_flag |= O_DIRECTORY; file->f_path.dentry = DENTRY_NOT_SET;
file->f_path.mnt = nd->path.mnt;
error = dir->i_op->atomic_open(dir, dentry, file,
open_to_namei_flags(open_flag), mode);
d_lookup_done(dentry);
if (!error) { if (file->f_mode & FMODE_OPENED) { if (unlikely(dentry != file->f_path.dentry)) { dput(dentry);
dentry = dget(file->f_path.dentry);
}
} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
error = -EIO;
} else {
if (file->f_path.dentry) { dput(dentry);
dentry = file->f_path.dentry;
}
if (unlikely(d_is_negative(dentry)))
error = -ENOENT;
}
}
if (error) {
dput(dentry);
dentry = ERR_PTR(error);
}
return dentry;
}
/*
* Look up and maybe create and open the last component.
*
* Must be called with parent locked (exclusive in O_CREAT case).
*
* Returns 0 on success, that is, if
* the file was successfully atomically created (if necessary) and opened, or
* the file was not completely opened at this time, though lookups and
* creations were performed.
* These case are distinguished by presence of FMODE_OPENED on file->f_mode.
* In the latter case dentry returned in @path might be negative if O_CREAT
* hadn't been specified.
*
* An error code is returned on failure.
*/
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
const struct open_flags *op,
bool got_write)
{
struct user_namespace *mnt_userns;
struct dentry *dir = nd->path.dentry;
struct inode *dir_inode = dir->d_inode;
int open_flag = op->open_flag;
struct dentry *dentry;
int error, create_error = 0;
umode_t mode = op->mode;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
if (unlikely(IS_DEADDIR(dir_inode)))
return ERR_PTR(-ENOENT);
file->f_mode &= ~FMODE_CREATED;
dentry = d_lookup(dir, &nd->last);
for (;;) {
if (!dentry) {
dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry))
return dentry;
}
if (d_in_lookup(dentry))
break;
error = d_revalidate(dentry, nd->flags);
if (likely(error > 0))
break;
if (error)
goto out_dput;
d_invalidate(dentry);
dput(dentry);
dentry = NULL;
}
if (dentry->d_inode) {
/* Cached positive dentry: will open in f_op->open */
return dentry;
}
/*
* Checking write permission is tricky, bacuse we don't know if we are
* going to actually need it: O_CREAT opens should work as long as the
* file exists. But checking existence breaks atomicity. The trick is
* to check access and if not granted clear O_CREAT from the flags.
*
* Another problem is returing the "right" error value (e.g. for an
* O_EXCL open we want to return EEXIST not EROFS).
*/
if (unlikely(!got_write)) open_flag &= ~O_TRUNC; mnt_userns = mnt_user_ns(nd->path.mnt);
if (open_flag & O_CREAT) {
if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; if (!IS_POSIXACL(dir->d_inode)) mode &= ~current_umask(); if (likely(got_write))
create_error = may_o_create(mnt_userns, &nd->path,
dentry, mode);
else
create_error = -EROFS;
}
if (create_error)
open_flag &= ~O_CREAT; if (dir_inode->i_op->atomic_open) { dentry = atomic_open(nd, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) dentry = ERR_PTR(create_error);
return dentry;
}
if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
nd->flags);
d_lookup_done(dentry);
if (unlikely(res)) {
if (IS_ERR(res)) {
error = PTR_ERR(res);
goto out_dput;
}
dput(dentry);
dentry = res;
}
}
/* Negative dentry, just create the file */
if (!dentry->d_inode && (open_flag & O_CREAT)) { file->f_mode |= FMODE_CREATED;
audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
if (!dir_inode->i_op->create) {
error = -EACCES;
goto out_dput;
}
error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry,
mode, open_flag & O_EXCL);
if (error)
goto out_dput;
}
if (unlikely(create_error) && !dentry->d_inode) {
error = create_error;
goto out_dput;
}
return dentry;
out_dput:
dput(dentry);
return ERR_PTR(error);
}
static const char *open_last_lookups(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct dentry *dir = nd->path.dentry;
int open_flag = op->open_flag;
bool got_write = false;
unsigned seq;
struct inode *inode;
struct dentry *dentry;
const char *res;
nd->flags |= op->intent;
if (nd->last_type != LAST_NORM) {
if (nd->depth)
put_link(nd);
return handle_dots(nd, nd->last_type);
}
if (!(open_flag & O_CREAT)) { if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
/* we _can_ be in RCU mode here */
dentry = lookup_fast(nd, &inode, &seq);
if (IS_ERR(dentry))
return ERR_CAST(dentry);
if (likely(dentry))
goto finish_lookup;
BUG_ON(nd->flags & LOOKUP_RCU);
} else {
/* create side of things */
if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
}
audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
/* trailing slashes? */
if (unlikely(nd->last.name[nd->last.len])) return ERR_PTR(-EISDIR);
}
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { got_write = !mnt_want_write(nd->path.mnt);
/*
* do _not_ fail yet - we might not need that or fail with
* a different error; let lookup_open() decide; we'll be
* dropping this one anyway.
*/
}
if (open_flag & O_CREAT)
inode_lock(dir->d_inode);
else
inode_lock_shared(dir->d_inode);
dentry = lookup_open(nd, file, op, got_write);
if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
fsnotify_create(dir->d_inode, dentry);
if (open_flag & O_CREAT)
inode_unlock(dir->d_inode);
else
inode_unlock_shared(dir->d_inode);
if (got_write) mnt_drop_write(nd->path.mnt); if (IS_ERR(dentry))
return ERR_CAST(dentry);
if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry);
nd->path.dentry = dentry;
return NULL;
}
finish_lookup:
if (nd->depth)
put_link(nd);
res = step_into(nd, WALK_TRAILING, dentry, inode, seq); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
return res;
}
/*
* Handle the last step of open()
*/
static int do_open(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct user_namespace *mnt_userns;
int open_flag = op->open_flag;
bool do_truncate;
int acc_mode;
int error;
if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
error = complete_walk(nd); if (error)
return error;
}
if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); mnt_userns = mnt_user_ns(nd->path.mnt);
if (open_flag & O_CREAT) {
if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
return -EEXIST;
if (d_is_dir(nd->path.dentry))
return -EISDIR;
error = may_create_in_sticky(mnt_userns, nd,
d_backing_inode(nd->path.dentry));
if (unlikely(error))
return error;
}
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
return -ENOTDIR;
do_truncate = false; acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) {
/* Don't check for write permission, don't truncate */
open_flag &= ~O_TRUNC;
acc_mode = 0;
} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt);
if (error)
return error;
do_truncate = true;
}
error = may_open(mnt_userns, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file);
if (!error)
error = ima_file_check(file, op->acc_mode);
if (!error && do_truncate)
error = handle_truncate(mnt_userns, file);
if (unlikely(error > 0)) { WARN_ON(1);
error = -EINVAL;
}
if (do_truncate) mnt_drop_write(nd->path.mnt);
return error;
}
/**
* vfs_tmpfile - create tmpfile
* @mnt_userns: user namespace of the mount the inode was found from
* @dentry: pointer to dentry of the base directory
* @mode: mode of the new tmpfile
* @open_flag: flags
*
* Create a temporary file.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
struct dentry *dentry, umode_t mode, int open_flag)
{
struct dentry *child = NULL; struct inode *dir = dentry->d_inode;
struct inode *inode;
int error;
/* we want directory to be writable */
error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
if (error)
goto out_err;
error = -EOPNOTSUPP;
if (!dir->i_op->tmpfile)
goto out_err;
error = -ENOMEM;
child = d_alloc(dentry, &slash_name);
if (unlikely(!child))
goto out_err;
error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
if (error)
goto out_err;
error = -ENOENT;
inode = child->d_inode;
if (unlikely(!inode))
goto out_err;
if (!(open_flag & O_EXCL)) {
spin_lock(&inode->i_lock);
inode->i_state |= I_LINKABLE;
spin_unlock(&inode->i_lock);
}
ima_post_create_tmpfile(mnt_userns, inode);
return child;
out_err:
dput(child);
return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_tmpfile);
static int do_tmpfile(struct nameidata *nd, unsigned flags,
const struct open_flags *op,
struct file *file)
{
struct user_namespace *mnt_userns;
struct dentry *child;
struct path path;
int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
if (unlikely(error))
return error;
error = mnt_want_write(path.mnt);
if (unlikely(error))
goto out;
mnt_userns = mnt_user_ns(path.mnt);
child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
error = PTR_ERR(child);
if (IS_ERR(child))
goto out2;
dput(path.dentry);
path.dentry = child;
audit_inode(nd->name, child, 0);
/* Don't check for other permissions, the inode was just created */
error = may_open(mnt_userns, &path, 0, op->open_flag);
if (!error)
error = vfs_open(&path, file);
out2:
mnt_drop_write(path.mnt);
out:
path_put(&path);
return error;
}
static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
struct path path;
int error = path_lookupat(nd, flags, &path);
if (!error) {
audit_inode(nd->name, path.dentry, 0); error = vfs_open(&path, file);
path_put(&path);
}
return error;
}
static struct file *path_openat(struct nameidata *nd,
const struct open_flags *op, unsigned flags)
{
struct file *file;
int error;
file = alloc_empty_file(op->open_flag, current_cred());
if (IS_ERR(file))
return file;
if (unlikely(file->f_flags & __O_TMPFILE)) {
error = do_tmpfile(nd, flags, op, file);
} else if (unlikely(file->f_flags & O_PATH)) {
error = do_o_path(nd, flags, file);
} else {
const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) &&
(s = open_last_lookups(nd, file, op)) != NULL)
;
if (!error)
error = do_open(nd, file, op); terminate_walk(nd);
}
if (likely(!error)) { if (likely(file->f_mode & FMODE_OPENED))
return file;
WARN_ON(1);
error = -EINVAL;
}
fput(file);
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
else
error = -ESTALE;
}
return ERR_PTR(error);
}
struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op)
{
struct nameidata nd;
int flags = op->lookup_flags;
struct file *filp;
set_nameidata(&nd, dfd, pathname, NULL);
filp = path_openat(&nd, op, flags | LOOKUP_RCU);
if (unlikely(filp == ERR_PTR(-ECHILD)))
filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata();
return filp;
}
struct file *do_file_open_root(const struct path *root,
const char *name, const struct open_flags *op)
{
struct nameidata nd;
struct file *file;
struct filename *filename;
int flags = op->lookup_flags;
if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN)
return ERR_PTR(-ELOOP);
filename = getname_kernel(name);
if (IS_ERR(filename))
return ERR_CAST(filename);
set_nameidata(&nd, -1, filename, root);
file = path_openat(&nd, op, flags | LOOKUP_RCU);
if (unlikely(file == ERR_PTR(-ECHILD)))
file = path_openat(&nd, op, flags);
if (unlikely(file == ERR_PTR(-ESTALE)))
file = path_openat(&nd, op, flags | LOOKUP_REVAL);
restore_nameidata();
putname(filename);
return file;
}
static struct dentry *filename_create(int dfd, struct filename *name,
struct path *path, unsigned int lookup_flags)
{
struct dentry *dentry = ERR_PTR(-EEXIST);
struct qstr last;
bool want_dir = lookup_flags & LOOKUP_DIRECTORY; unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
int type;
int err2;
int error;
error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
if (error)
return ERR_PTR(error);
/*
* Yucky last component or no last component at all?
* (foo/., foo/.., /////)
*/
if (unlikely(type != LAST_NORM))
goto out;
/* don't fail immediately if it's r/o, at least try to report other errors */
err2 = mnt_want_write(path->mnt);
/*
* Do the final lookup. Suppress 'create' if there is a trailing
* '/', and a directory wasn't requested.
*/
if (last.name[last.len] && !want_dir)
create_flags = 0;
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags);
if (IS_ERR(dentry))
goto unlock;
error = -EEXIST;
if (d_is_positive(dentry))
goto fail;
/*
* Special case - lookup gave negative, but... we had foo/bar/
* From the vfs_mknod() POV we just have a negative dentry -
* all is fine. Let's be bastards - you had / on the end, you've
* been asking for (non-existent) directory. -ENOENT for you.
*/
if (unlikely(!create_flags)) {
error = -ENOENT;
goto fail;
}
if (unlikely(err2)) {
error = err2;
goto fail;
}
return dentry;
fail:
dput(dentry);
dentry = ERR_PTR(error);
unlock:
inode_unlock(path->dentry->d_inode);
if (!err2)
mnt_drop_write(path->mnt);
out:
path_put(path);
return dentry;
}
struct dentry *kern_path_create(int dfd, const char *pathname,
struct path *path, unsigned int lookup_flags)
{
struct filename *filename = getname_kernel(pathname);
struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
putname(filename);
return res;
}
EXPORT_SYMBOL(kern_path_create);
void done_path_create(struct path *path, struct dentry *dentry)
{
dput(dentry);
inode_unlock(path->dentry->d_inode);
mnt_drop_write(path->mnt);
path_put(path);
}
EXPORT_SYMBOL(done_path_create);
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
struct path *path, unsigned int lookup_flags)
{
struct filename *filename = getname(pathname);
struct dentry *res = filename_create(dfd, filename, path, lookup_flags);
putname(filename);
return res;
}
EXPORT_SYMBOL(user_path_create);
/**
* vfs_mknod - create device node or file
* @mnt_userns: user namespace of the mount the inode was found from
* @dir: inode of @dentry
* @dentry: pointer to dentry of the base directory
* @mode: mode of the new device node or file
* @dev: device number of device to create
*
* Create a device node or file.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t dev)
{
bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
int error = may_create(mnt_userns, dir, dentry);
if (error)
return error;
if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout &&
!capable(CAP_MKNOD))
return -EPERM;
if (!dir->i_op->mknod)
return -EPERM;
error = devcgroup_inode_mknod(mode, dev);
if (error)
return error;
error = security_inode_mknod(dir, dentry, mode, dev);
if (error)
return error;
error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev);
if (!error)
fsnotify_create(dir, dentry);
return error;
}
EXPORT_SYMBOL(vfs_mknod);
static int may_mknod(umode_t mode)
{
switch (mode & S_IFMT) {
case S_IFREG:
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
case 0: /* zero mode translates to S_IFREG */
return 0;
case S_IFDIR:
return -EPERM;
default:
return -EINVAL;
}
}
static int do_mknodat(int dfd, struct filename *name, umode_t mode,
unsigned int dev)
{
struct user_namespace *mnt_userns;
struct dentry *dentry;
struct path path;
int error;
unsigned int lookup_flags = 0;
error = may_mknod(mode);
if (error)
goto out1;
retry:
dentry = filename_create(dfd, name, &path, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out1;
if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
error = security_path_mknod(&path, dentry, mode, dev);
if (error)
goto out2;
mnt_userns = mnt_user_ns(path.mnt);
switch (mode & S_IFMT) {
case 0: case S_IFREG:
error = vfs_create(mnt_userns, path.dentry->d_inode,
dentry, mode, true);
if (!error)
ima_post_path_mknod(mnt_userns, dentry);
break;
case S_IFCHR: case S_IFBLK:
error = vfs_mknod(mnt_userns, path.dentry->d_inode,
dentry, mode, new_decode_dev(dev));
break;
case S_IFIFO: case S_IFSOCK:
error = vfs_mknod(mnt_userns, path.dentry->d_inode,
dentry, mode, 0);
break;
}
out2:
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
out1:
putname(name);
return error;
}
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
unsigned int, dev)
{
return do_mknodat(dfd, getname(filename), mode, dev);
}
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
return do_mknodat(AT_FDCWD, getname(filename), mode, dev);
}
/**
* vfs_mkdir - create directory
* @mnt_userns: user namespace of the mount the inode was found from
* @dir: inode of @dentry
* @dentry: pointer to dentry of the base directory
* @mode: mode of the new directory
*
* Create a directory.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
int error = may_create(mnt_userns, dir, dentry);
unsigned max_links = dir->i_sb->s_max_links;
if (error)
return error;
if (!dir->i_op->mkdir)
return -EPERM;
mode &= (S_IRWXUGO|S_ISVTX);
error = security_inode_mkdir(dir, dentry, mode);
if (error)
return error;
if (max_links && dir->i_nlink >= max_links)
return -EMLINK;
error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode); if (!error)
fsnotify_mkdir(dir, dentry);
return error;
}
EXPORT_SYMBOL(vfs_mkdir);
int do_mkdirat(int dfd, struct filename *name, umode_t mode)
{
struct dentry *dentry;
struct path path;
int error;
unsigned int lookup_flags = LOOKUP_DIRECTORY;
retry:
dentry = filename_create(dfd, name, &path, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_putname;
if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); error = security_path_mkdir(&path, dentry, mode);
if (!error) {
struct user_namespace *mnt_userns;
mnt_userns = mnt_user_ns(path.mnt);
error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry,
mode);
}
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
out_putname:
putname(name);
return error;
}
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
return do_mkdirat(dfd, getname(pathname), mode);
}
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
return do_mkdirat(AT_FDCWD, getname(pathname), mode);
}
/**
* vfs_rmdir - remove directory
* @mnt_userns: user namespace of the mount the inode was found from
* @dir: inode of @dentry
* @dentry: pointer to dentry of the base directory
*
* Remove a directory.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry)
{
int error = may_delete(mnt_userns, dir, dentry, 1);
if (error)
return error;
if (!dir->i_op->rmdir)
return -EPERM;
dget(dentry);
inode_lock(dentry->d_inode);
error = -EBUSY;
if (is_local_mountpoint(dentry))
goto out;
error = security_inode_rmdir(dir, dentry);
if (error)
goto out;
error = dir->i_op->rmdir(dir, dentry);
if (error)
goto out;
shrink_dcache_parent(dentry);
dentry->d_inode->i_flags |= S_DEAD;
dont_mount(dentry);
detach_mounts(dentry);
out:
inode_unlock(dentry->d_inode);
dput(dentry);
if (!error)
d_delete_notify(dir, dentry);
return error;
}
EXPORT_SYMBOL(vfs_rmdir);
int do_rmdir(int dfd, struct filename *name)
{
struct user_namespace *mnt_userns;
int error;
struct dentry *dentry;
struct path path;
struct qstr last;
int type;
unsigned int lookup_flags = 0;
retry:
error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
if (error)
goto exit1;
switch (type) {
case LAST_DOTDOT:
error = -ENOTEMPTY;
goto exit2;
case LAST_DOT:
error = -EINVAL;
goto exit2;
case LAST_ROOT:
error = -EBUSY;
goto exit2;
}
error = mnt_want_write(path.mnt);
if (error)
goto exit2;
inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto exit3;
if (!dentry->d_inode) {
error = -ENOENT;
goto exit4;
}
error = security_path_rmdir(&path, dentry);
if (error)
goto exit4;
mnt_userns = mnt_user_ns(path.mnt);
error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
exit4:
dput(dentry);
exit3:
inode_unlock(path.dentry->d_inode);
mnt_drop_write(path.mnt);
exit2:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
exit1:
putname(name);
return error;
}
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
return do_rmdir(AT_FDCWD, getname(pathname));
}
/**
* vfs_unlink - unlink a filesystem object
* @mnt_userns: user namespace of the mount the inode was found from
* @dir: parent directory
* @dentry: victim
* @delegated_inode: returns victim inode, if the inode is delegated.
*
* The caller must hold dir->i_mutex.
*
* If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
* return a reference to the inode in delegated_inode. The caller
* should then break the delegation on that inode and retry. Because
* breaking a delegation may take a long time, the caller should drop
* dir->i_mutex before doing so.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, struct inode **delegated_inode)
{
struct inode *target = dentry->d_inode;
int error = may_delete(mnt_userns, dir, dentry, 0);
if (error)
return error;
if (!dir->i_op->unlink)
return -EPERM;
inode_lock(target);
if (IS_SWAPFILE(target))
error = -EPERM;
else if (is_local_mountpoint(dentry))
error = -EBUSY;
else {
error = security_inode_unlink(dir, dentry);
if (!error) {
error = try_break_deleg(target, delegated_inode);
if (error)
goto out;
error = dir->i_op->unlink(dir, dentry);
if (!error) {
dont_mount(dentry);
detach_mounts(dentry);
}
}
}
out:
inode_unlock(target);
/* We don't d_delete() NFS sillyrenamed files--they still exist. */
if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
fsnotify_unlink(dir, dentry);
} else if (!error) {
fsnotify_link_count(target);
d_delete_notify(dir, dentry);
}
return error;
}
EXPORT_SYMBOL(vfs_unlink);
/*
* Make sure that the actual truncation of the file will occur outside its
* directory's i_mutex. Truncate can take a long time if there is a lot of
* writeout happening, and we don't want to prevent access to the directory
* while waiting on the I/O.
*/
int do_unlinkat(int dfd, struct filename *name)
{
int error;
struct dentry *dentry;
struct path path;
struct qstr last;
int type;
struct inode *inode = NULL;
struct inode *delegated_inode = NULL;
unsigned int lookup_flags = 0;
retry:
error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
if (error)
goto exit1;
error = -EISDIR;
if (type != LAST_NORM)
goto exit2;
error = mnt_want_write(path.mnt);
if (error)
goto exit2;
retry_deleg:
inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (!IS_ERR(dentry)) {
struct user_namespace *mnt_userns;
/* Why not before? Because we want correct error value */
if (last.name[last.len])
goto slashes;
inode = dentry->d_inode;
if (d_is_negative(dentry))
goto slashes;
ihold(inode);
error = security_path_unlink(&path, dentry);
if (error)
goto exit3;
mnt_userns = mnt_user_ns(path.mnt);
error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
&delegated_inode);
exit3:
dput(dentry);
}
inode_unlock(path.dentry->d_inode);
if (inode)
iput(inode); /* truncate the inode here */
inode = NULL;
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
}
mnt_drop_write(path.mnt);
exit2:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
inode = NULL;
goto retry;
}
exit1:
putname(name);
return error;
slashes:
if (d_is_negative(dentry))
error = -ENOENT;
else if (d_is_dir(dentry))
error = -EISDIR;
else
error = -ENOTDIR;
goto exit3;
}
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
if ((flag & ~AT_REMOVEDIR) != 0)
return -EINVAL;
if (flag & AT_REMOVEDIR)
return do_rmdir(dfd, getname(pathname));
return do_unlinkat(dfd, getname(pathname));
}
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
return do_unlinkat(AT_FDCWD, getname(pathname));
}
/**
* vfs_symlink - create symlink
* @mnt_userns: user namespace of the mount the inode was found from
* @dir: inode of @dentry
* @dentry: pointer to dentry of the base directory
* @oldname: name of the file to link to
*
* Create a symlink.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
struct dentry *dentry, const char *oldname)
{
int error = may_create(mnt_userns, dir, dentry);
if (error)
return error;
if (!dir->i_op->symlink)
return -EPERM;
error = security_inode_symlink(dir, dentry, oldname);
if (error)
return error;
error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname); if (!error)
fsnotify_create(dir, dentry);
return error;
}
EXPORT_SYMBOL(vfs_symlink);
int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
{
int error;
struct dentry *dentry;
struct path path;
unsigned int lookup_flags = 0;
if (IS_ERR(from)) {
error = PTR_ERR(from);
goto out_putnames;
}
retry:
dentry = filename_create(newdfd, to, &path, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_putnames;
error = security_path_symlink(&path, dentry, from->name);
if (!error) {
struct user_namespace *mnt_userns;
mnt_userns = mnt_user_ns(path.mnt);
error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry,
from->name);
}
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
out_putnames:
putname(to);
putname(from);
return error;
}
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
int, newdfd, const char __user *, newname)
{
return do_symlinkat(getname(oldname), newdfd, getname(newname));
}
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname));
}
/**
* vfs_link - create a new link
* @old_dentry: object to be linked
* @mnt_userns: the user namespace of the mount
* @dir: new parent
* @new_dentry: where to create the new link
* @delegated_inode: returns inode needing a delegation break
*
* The caller must hold dir->i_mutex
*
* If vfs_link discovers a delegation on the to-be-linked file in need
* of breaking, it will return -EWOULDBLOCK and return a reference to the
* inode in delegated_inode. The caller should then break the delegation
* and retry. Because breaking a delegation may take a long time, the
* caller should drop the i_mutex before doing so.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then take
* care to map the inode according to @mnt_userns before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply passs init_user_ns.
*/
int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *new_dentry,
struct inode **delegated_inode)
{
struct inode *inode = old_dentry->d_inode;
unsigned max_links = dir->i_sb->s_max_links;
int error;
if (!inode)
return -ENOENT;
error = may_create(mnt_userns, dir, new_dentry);
if (error)
return error;
if (dir->i_sb != inode->i_sb)
return -EXDEV;
/*
* A link to an append-only or immutable file cannot be created.
*/
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return -EPERM;
/*
* Updating the link count will likely cause i_uid and i_gid to
* be writen back improperly if their true value is unknown to
* the vfs.
*/
if (HAS_UNMAPPED_ID(mnt_userns, inode))
return -EPERM;
if (!dir->i_op->link)
return -EPERM;
if (S_ISDIR(inode->i_mode))
return -EPERM;
error = security_inode_link(old_dentry, dir, new_dentry);
if (error)
return error;
inode_lock(inode);
/* Make sure we don't allow creating hardlink to an unlinked file */
if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
error = -ENOENT;
else if (max_links && inode->i_nlink >= max_links)
error = -EMLINK;
else {
error = try_break_deleg(inode, delegated_inode);
if (!error) error = dir->i_op->link(old_dentry, dir, new_dentry);
}
if (!error && (inode->i_state & I_LINKABLE)) {
spin_lock(&inode->i_lock);
inode->i_state &= ~I_LINKABLE;
spin_unlock(&inode->i_lock);
}
inode_unlock(inode);
if (!error)
fsnotify_link(dir, inode, new_dentry);
return error;
}
EXPORT_SYMBOL(vfs_link);
/*
* Hardlinks are often used in delicate situations. We avoid
* security-related surprises by not following symlinks on the
* newname. --KAB
*
* We don't follow them on the oldname either to be compatible
* with linux 2.0, and to avoid hard-linking to directories
* and other special files. --ADM
*/
int do_linkat(int olddfd, struct filename *old, int newdfd,
struct filename *new, int flags)
{
struct user_namespace *mnt_userns;
struct dentry *new_dentry;
struct path old_path, new_path;
struct inode *delegated_inode = NULL;
int how = 0;
int error;
if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) {
error = -EINVAL;
goto out_putnames;
}
/*
* To use null names we require CAP_DAC_READ_SEARCH
* This ensures that not everyone will be able to create
* handlink using the passed filedescriptor.
*/
if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) {
error = -ENOENT;
goto out_putnames;
}
if (flags & AT_SYMLINK_FOLLOW)
how |= LOOKUP_FOLLOW;
retry:
error = filename_lookup(olddfd, old, how, &old_path, NULL);
if (error)
goto out_putnames;
new_dentry = filename_create(newdfd, new, &new_path,
(how & LOOKUP_REVAL));
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto out_putpath;
error = -EXDEV;
if (old_path.mnt != new_path.mnt)
goto out_dput;
mnt_userns = mnt_user_ns(new_path.mnt);
error = may_linkat(mnt_userns, &old_path);
if (unlikely(error))
goto out_dput;
error = security_path_link(old_path.dentry, &new_path, new_dentry);
if (error)
goto out_dput;
error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
new_dentry, &delegated_inode);
out_dput:
done_path_create(&new_path, new_dentry);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error) {
path_put(&old_path);
goto retry;
}
}
if (retry_estale(error, how)) {
path_put(&old_path);
how |= LOOKUP_REVAL;
goto retry;
}
out_putpath:
path_put(&old_path);
out_putnames:
putname(old);
putname(new);
return error;
}
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname, int, flags)
{
return do_linkat(olddfd, getname_uflags(oldname, flags),
newdfd, getname(newname), flags);
}
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0);
}
/**
* vfs_rename - rename a filesystem object
* @rd: pointer to &struct renamedata info
*
* The caller must hold multiple mutexes--see lock_rename()).
*
* If vfs_rename discovers a delegation in need of breaking at either
* the source or destination, it will return -EWOULDBLOCK and return a
* reference to the inode in delegated_inode. The caller should then
* break the delegation and retry. Because breaking a delegation may
* take a long time, the caller should drop all locks before doing
* so.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported.
*
* The worst of all namespace operations - renaming directory. "Perverted"
* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
* Problems:
*
* a) we can get into loop creation.
* b) race potential - two innocent renames can create a loop together.
* That's where 4.4 screws up. Current fix: serialization on
* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
* story.
* c) we have to lock _four_ objects - parents and victim (if it exists),
* and source (if it is not a directory).
* And that - after we got ->i_mutex on parents (until then we don't know
* whether the target exists). Solution: try to be smart with locking
* order for inodes. We rely on the fact that tree topology may change
* only under ->s_vfs_rename_mutex _and_ that parent of the object we
* move will be locked. Thus we can rank directories by the tree
* (ancestors first) and rank all non-directories after them.
* That works since everybody except rename does "lock parent, lookup,
* lock child" and rename is under ->s_vfs_rename_mutex.
* HOWEVER, it relies on the assumption that any object with ->lookup()
* has no more than 1 dentry. If "hybrid" objects will ever appear,
* we'd better make sure that there's no link(2) for them.
* d) conversion from fhandle to dentry may come in the wrong moment - when
* we are removing the target. Solution: we will have to grab ->i_mutex
* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
* ->i_mutex on parents, which works but leads to some truly excessive
* locking].
*/
int vfs_rename(struct renamedata *rd)
{
int error;
struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; struct dentry *old_dentry = rd->old_dentry;
struct dentry *new_dentry = rd->new_dentry;
struct inode **delegated_inode = rd->delegated_inode;
unsigned int flags = rd->flags;
bool is_dir = d_is_dir(old_dentry);
struct inode *source = old_dentry->d_inode;
struct inode *target = new_dentry->d_inode;
bool new_is_dir = false;
unsigned max_links = new_dir->i_sb->s_max_links;
struct name_snapshot old_name;
if (source == target)
return 0;
error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir);
if (error)
return error;
if (!target) {
error = may_create(rd->new_mnt_userns, new_dir, new_dentry);
} else {
new_is_dir = d_is_dir(new_dentry);
if (!(flags & RENAME_EXCHANGE)) error = may_delete(rd->new_mnt_userns, new_dir,
new_dentry, is_dir);
else
error = may_delete(rd->new_mnt_userns, new_dir,
new_dentry, new_is_dir);
}
if (error)
return error;
if (!old_dir->i_op->rename)
return -EPERM;
/*
* If we are going to change the parent - check write permissions,
* we'll need to flip '..'.
*/
if (new_dir != old_dir) { if (is_dir) { error = inode_permission(rd->old_mnt_userns, source,
MAY_WRITE);
if (error)
return error;
}
if ((flags & RENAME_EXCHANGE) && new_is_dir) { error = inode_permission(rd->new_mnt_userns, target,
MAY_WRITE);
if (error)
return error;
}
}
error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
flags);
if (error)
return error;
take_dentry_name_snapshot(&old_name, old_dentry);
dget(new_dentry);
if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); else if (target)
inode_lock(target);
error = -EPERM;
if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
goto out;
error = -EBUSY;
if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
goto out;
if (max_links && new_dir != old_dir) {
error = -EMLINK;
if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
goto out;
if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && old_dir->i_nlink >= max_links)
goto out;
}
if (!is_dir) {
error = try_break_deleg(source, delegated_inode);
if (error)
goto out;
}
if (target && !new_is_dir) {
error = try_break_deleg(target, delegated_inode);
if (error)
goto out;
}
error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry,
new_dir, new_dentry, flags);
if (error)
goto out;
if (!(flags & RENAME_EXCHANGE) && target) { if (is_dir) { shrink_dcache_parent(new_dentry);
target->i_flags |= S_DEAD;
}
dont_mount(new_dentry);
detach_mounts(new_dentry);
}
if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
if (!(flags & RENAME_EXCHANGE))
d_move(old_dentry, new_dentry);
else
d_exchange(old_dentry, new_dentry);
}
out:
if (!is_dir || (flags & RENAME_EXCHANGE)) unlock_two_nondirectories(source, target); else if (target)
inode_unlock(target);
dput(new_dentry);
if (!error) {
fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
!(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) {
fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
new_is_dir, NULL, new_dentry);
}
}
release_dentry_name_snapshot(&old_name); return error;
}
EXPORT_SYMBOL(vfs_rename);
int do_renameat2(int olddfd, struct filename *from, int newdfd,
struct filename *to, unsigned int flags)
{
struct renamedata rd;
struct dentry *old_dentry, *new_dentry;
struct dentry *trap;
struct path old_path, new_path;
struct qstr old_last, new_last;
int old_type, new_type;
struct inode *delegated_inode = NULL;
unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
bool should_retry = false;
int error = -EINVAL;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
goto put_names;
if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
(flags & RENAME_EXCHANGE))
goto put_names;
if (flags & RENAME_EXCHANGE)
target_flags = 0;
retry:
error = filename_parentat(olddfd, from, lookup_flags, &old_path,
&old_last, &old_type);
if (error)
goto put_names;
error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
&new_type);
if (error)
goto exit1;
error = -EXDEV;
if (old_path.mnt != new_path.mnt)
goto exit2;
error = -EBUSY;
if (old_type != LAST_NORM)
goto exit2;
if (flags & RENAME_NOREPLACE)
error = -EEXIST;
if (new_type != LAST_NORM)
goto exit2;
error = mnt_want_write(old_path.mnt);
if (error)
goto exit2;
retry_deleg:
trap = lock_rename(new_path.dentry, old_path.dentry);
old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
error = PTR_ERR(old_dentry);
if (IS_ERR(old_dentry))
goto exit3;
/* source must exist */
error = -ENOENT;
if (d_is_negative(old_dentry))
goto exit4;
new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto exit4;
error = -EEXIST;
if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
goto exit5;
if (flags & RENAME_EXCHANGE) {
error = -ENOENT;
if (d_is_negative(new_dentry))
goto exit5;
if (!d_is_dir(new_dentry)) {
error = -ENOTDIR;
if (new_last.name[new_last.len])
goto exit5;
}
}
/* unless the source is a directory trailing slashes give -ENOTDIR */
if (!d_is_dir(old_dentry)) {
error = -ENOTDIR;
if (old_last.name[old_last.len])
goto exit5;
if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
goto exit5;
}
/* source should not be ancestor of target */
error = -EINVAL;
if (old_dentry == trap)
goto exit5;
/* target should not be an ancestor of source */
if (!(flags & RENAME_EXCHANGE))
error = -ENOTEMPTY;
if (new_dentry == trap)
goto exit5;
error = security_path_rename(&old_path, old_dentry,
&new_path, new_dentry, flags);
if (error)
goto exit5;
rd.old_dir = old_path.dentry->d_inode;
rd.old_dentry = old_dentry;
rd.old_mnt_userns = mnt_user_ns(old_path.mnt);
rd.new_dir = new_path.dentry->d_inode;
rd.new_dentry = new_dentry;
rd.new_mnt_userns = mnt_user_ns(new_path.mnt);
rd.delegated_inode = &delegated_inode;
rd.flags = flags;
error = vfs_rename(&rd);
exit5:
dput(new_dentry);
exit4:
dput(old_dentry);
exit3:
unlock_rename(new_path.dentry, old_path.dentry);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
}
mnt_drop_write(old_path.mnt);
exit2:
if (retry_estale(error, lookup_flags))
should_retry = true;
path_put(&new_path);
exit1:
path_put(&old_path);
if (should_retry) {
should_retry = false;
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
put_names:
putname(from);
putname(to);
return error;
}
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname, unsigned int, flags)
{
return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
flags);
}
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname)
{
return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
0);
}
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
getname(newname), 0);
}
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
int len = PTR_ERR(link);
if (IS_ERR(link))
goto out;
len = strlen(link);
if (len > (unsigned) buflen)
len = buflen;
if (copy_to_user(buffer, link, len))
len = -EFAULT;
out:
return len;
}
/**
* vfs_readlink - copy symlink body into userspace buffer
* @dentry: dentry on which to get symbolic link
* @buffer: user memory pointer
* @buflen: size of buffer
*
* Does not touch atime. That's up to the caller if necessary
*
* Does not call security hook.
*/
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
struct inode *inode = d_inode(dentry);
DEFINE_DELAYED_CALL(done);
const char *link;
int res;
if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen);
if (!d_is_symlink(dentry))
return -EINVAL;
spin_lock(&inode->i_lock);
inode->i_opflags |= IOP_DEFAULT_READLINK;
spin_unlock(&inode->i_lock);
}
link = READ_ONCE(inode->i_link);
if (!link) {
link = inode->i_op->get_link(dentry, inode, &done);
if (IS_ERR(link))
return PTR_ERR(link);
}
res = readlink_copy(buffer, buflen, link);
do_delayed_call(&done);
return res;
}
EXPORT_SYMBOL(vfs_readlink);
/**
* vfs_get_link - get symlink body
* @dentry: dentry on which to get symbolic link
* @done: caller needs to free returned data with this
*
* Calls security hook and i_op->get_link() on the supplied inode.
*
* It does not touch atime. That's up to the caller if necessary.
*
* Does not work on "special" symlinks like /proc/$$/fd/N
*/
const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
{
const char *res = ERR_PTR(-EINVAL);
struct inode *inode = d_inode(dentry);
if (d_is_symlink(dentry)) {
res = ERR_PTR(security_inode_readlink(dentry));
if (!res)
res = inode->i_op->get_link(dentry, inode, done);
}
return res;
}
EXPORT_SYMBOL(vfs_get_link);
/* get the link contents into pagecache */
const char *page_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *callback)
{
char *kaddr;
struct page *page;
struct address_space *mapping = inode->i_mapping;
if (!dentry) {
page = find_get_page(mapping, 0);
if (!page)
return ERR_PTR(-ECHILD);
if (!PageUptodate(page)) {
put_page(page);
return ERR_PTR(-ECHILD);
}
} else {
page = read_mapping_page(mapping, 0, NULL);
if (IS_ERR(page))
return (char*)page;
}
set_delayed_call(callback, page_put_link, page);
BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
kaddr = page_address(page);
nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
return kaddr;
}
EXPORT_SYMBOL(page_get_link);
void page_put_link(void *arg)
{
put_page(arg);
}
EXPORT_SYMBOL(page_put_link);
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
DEFINE_DELAYED_CALL(done);
int res = readlink_copy(buffer, buflen,
page_get_link(dentry, d_inode(dentry),
&done));
do_delayed_call(&done);
return res;
}
EXPORT_SYMBOL(page_readlink);
/*
* The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
*/
int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
void *fsdata;
int err;
unsigned int flags = 0;
if (nofs)
flags |= AOP_FLAG_NOFS;
retry:
err = pagecache_write_begin(NULL, mapping, 0, len-1,
flags, &page, &fsdata);
if (err)
goto fail;
memcpy(page_address(page), symname, len-1);
err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
page, fsdata);
if (err < 0)
goto fail;
if (err < len-1)
goto retry;
mark_inode_dirty(inode);
return 0;
fail:
return err;
}
EXPORT_SYMBOL(__page_symlink);
int page_symlink(struct inode *inode, const char *symname, int len)
{
return __page_symlink(inode, symname, len,
!mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
}
EXPORT_SYMBOL(page_symlink);
const struct inode_operations page_symlink_inode_operations = {
.get_link = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/filemap.c
*
* Copyright (C) 1994-1999 Linus Torvalds
*/
/*
* This file handles the generic file mmap semantics used by
* most "normal" filesystems (but you don't /have/ to use this:
* the NFS filesystem used to do this differently, for example)
*/
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>
/*
* FIXME: remove all knowledge of the buffer layer from the core VM
*/
#include <linux/buffer_head.h> /* for try_to_free_buffers */
#include <asm/mman.h>
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
*
* Shared mappings now work. 15.8.1995 Bruno.
*
* finished 'unifying' the page and buffer cache and SMP-threaded the
* page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
*
* SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
*/
/*
* Lock ordering:
*
* ->i_mmap_rwsem (truncate_pagecache)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->i_pages lock
*
* ->i_rwsem
* ->invalidate_lock (acquired by fs in truncate path)
* ->i_mmap_rwsem (truncate->unmap_mapping_range)
*
* ->mmap_lock
* ->i_mmap_rwsem
* ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_lock
* ->invalidate_lock (filemap_fault)
* ->lock_page (filemap_fault, access_process_vm)
*
* ->i_rwsem (generic_perform_write)
* ->mmap_lock (fault_in_readable->do_page_fault)
*
* bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c)
* ->i_pages lock (__sync_single_inode)
*
* ->i_mmap_rwsem
* ->anon_vma.lock (vma_adjust)
*
* ->anon_vma.lock
* ->page_table_lock or pte_lock (anon_vma_prepare and various)
*
* ->page_table_lock or pte_lock
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->i_pages lock (try_to_unmap_one)
* ->lruvec->lru_lock (follow_page->mark_page_accessed)
* ->lruvec->lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->i_pages lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
* ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
* ->i_mmap_rwsem
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/
static void page_cache_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
XA_STATE(xas, &mapping->i_pages, page->index);
unsigned int nr = 1;
mapping_set_update(&xas, mapping);
/* hugetlb pages are represented by a single entry in the xarray */
if (!PageHuge(page)) { xas_set_order(&xas, page->index, compound_order(page));
nr = compound_nr(page);
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(nr != 1 && shadow, page);
xas_store(&xas, shadow);
xas_init_marks(&xas);
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
mapping->nrpages -= nr;
}
static void unaccount_page_cache_page(struct address_space *mapping,
struct page *page)
{
int nr;
/*
* if we're uptodate, flush out into the cleancache, otherwise
* invalidate any existing cleancache entries. We can't leave
* stale data around in the cleancache once our page is gone
*/
if (PageUptodate(page) && PageMappedToDisk(page))
cleancache_put_page(page);
else
cleancache_invalidate_page(mapping, page);
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(page_mapped(page), page);
if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
int mapcount;
pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
dump_page(page, "still mapped when deleted");
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
mapcount = page_mapcount(page);
if (mapping_exiting(mapping) &&
page_count(page) >= mapcount + 2) {
/*
* All vmas have already been torn down, so it's
* a good bet that actually the page is unmapped,
* and we'd prefer not to leak it: if we're wrong,
* some other bad page check should catch it later.
*/
page_mapcount_reset(page);
page_ref_sub(page, mapcount);
}
}
/* hugetlb pages do not participate in page cache accounting. */
if (PageHuge(page))
return;
nr = thp_nr_pages(page);
__mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
if (PageSwapBacked(page)) {
__mod_lruvec_page_state(page, NR_SHMEM, -nr);
if (PageTransHuge(page))
__mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr);
} else if (PageTransHuge(page)) {
__mod_lruvec_page_state(page, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
}
/*
* At this point page must be either written or cleaned by
* truncate. Dirty page here signals a bug and loss of
* unwritten data.
*
* This fixes dirty accounting after removing the page entirely
* but leaves PageDirty set: it has no effect for truncated
* page and anyway will be cleared before returning page into
* buddy allocator.
*/
if (WARN_ON_ONCE(PageDirty(page)))
account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold the i_pages lock.
*/
void __delete_from_page_cache(struct page *page, void *shadow)
{
struct address_space *mapping = page->mapping;
trace_mm_filemap_delete_from_page_cache(page);
unaccount_page_cache_page(mapping, page);
page_cache_delete(mapping, page, shadow);
}
static void page_cache_free_page(struct address_space *mapping,
struct page *page)
{
void (*freepage)(struct page *);
freepage = mapping->a_ops->freepage;
if (freepage)
freepage(page);
if (PageTransHuge(page) && !PageHuge(page)) {
page_ref_sub(page, thp_nr_pages(page));
VM_BUG_ON_PAGE(page_count(page) <= 0, page);
} else {
put_page(page);
}
}
/**
* delete_from_page_cache - delete page from page cache
* @page: the page which the kernel is trying to remove from page cache
*
* This must be called only on pages that have been verified to be in the page
* cache and locked. It will never put the page into the free list, the caller
* has a reference on the page.
*/
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page_mapping(page); BUG_ON(!PageLocked(page));
xa_lock_irq(&mapping->i_pages);
__delete_from_page_cache(page, NULL);
xa_unlock_irq(&mapping->i_pages);
page_cache_free_page(mapping, page);
}
EXPORT_SYMBOL(delete_from_page_cache);
/*
* page_cache_delete_batch - delete several pages from page cache
* @mapping: the mapping to which pages belong
* @pvec: pagevec with pages to delete
*
* The function walks over mapping->i_pages and removes pages passed in @pvec
* from the mapping. The function expects @pvec to be sorted by page index
* and is optimised for it to be dense.
* It tolerates holes in @pvec (mapping entries at those indices are not
* modified). The function expects only THP head pages to be present in the
* @pvec.
*
* The function expects the i_pages lock to be held.
*/
static void page_cache_delete_batch(struct address_space *mapping,
struct pagevec *pvec)
{
XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
int total_pages = 0;
int i = 0;
struct page *page;
mapping_set_update(&xas, mapping);
xas_for_each(&xas, page, ULONG_MAX) { if (i >= pagevec_count(pvec))
break;
/* A swap/dax/shadow entry got inserted? Skip it. */
if (xa_is_value(page))
continue;
/*
* A page got inserted in our range? Skip it. We have our
* pages locked so they are protected from being removed.
* If we see a page whose index is higher than ours, it
* means our page has been removed, which shouldn't be
* possible because we're holding the PageLock.
*/
if (page != pvec->pages[i]) {
VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
page);
continue;
}
WARN_ON_ONCE(!PageLocked(page)); if (page->index == xas.xa_index) page->mapping = NULL;
/* Leave page->index set: truncation lookup relies on it */
/*
* Move to the next page in the vector if this is a regular
* page or the index is of the last sub-page of this compound
* page.
*/
if (page->index + compound_nr(page) - 1 == xas.xa_index) i++; xas_store(&xas, NULL); total_pages++;
}
mapping->nrpages -= total_pages;
}
void delete_from_page_cache_batch(struct address_space *mapping,
struct pagevec *pvec)
{
int i;
if (!pagevec_count(pvec))
return;
xa_lock_irq(&mapping->i_pages);
for (i = 0; i < pagevec_count(pvec); i++) {
trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); unaccount_page_cache_page(mapping, pvec->pages[i]);
}
page_cache_delete_batch(mapping, pvec);
xa_unlock_irq(&mapping->i_pages);
for (i = 0; i < pagevec_count(pvec); i++) page_cache_free_page(mapping, pvec->pages[i]);
}
int filemap_check_errors(struct address_space *mapping)
{
int ret = 0;
/* Check for outstanding write errors */
if (test_bit(AS_ENOSPC, &mapping->flags) &&
test_and_clear_bit(AS_ENOSPC, &mapping->flags))
ret = -ENOSPC;
if (test_bit(AS_EIO, &mapping->flags) &&
test_and_clear_bit(AS_EIO, &mapping->flags))
ret = -EIO;
return ret;
}
EXPORT_SYMBOL(filemap_check_errors);
static int filemap_check_and_keep_errors(struct address_space *mapping)
{
/* Check for outstanding write errors */
if (test_bit(AS_EIO, &mapping->flags))
return -EIO;
if (test_bit(AS_ENOSPC, &mapping->flags))
return -ENOSPC;
return 0;
}
/**
* filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
* @wbc: the writeback_control controlling the writeout
*
* Call writepages on the mapping using the provided wbc to control the
* writeout.
*
* Return: %0 on success, negative error code otherwise.
*/
int filemap_fdatawrite_wbc(struct address_space *mapping,
struct writeback_control *wbc)
{
int ret;
if (!mapping_can_writeback(mapping) ||
!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
wbc_attach_fdatawrite_inode(wbc, mapping->host);
ret = do_writepages(mapping, wbc);
wbc_detach_inode(wbc);
return ret;
}
EXPORT_SYMBOL(filemap_fdatawrite_wbc);
/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
* @start: offset in bytes where the range starts
* @end: offset in bytes where the range ends (inclusive)
* @sync_mode: enable synchronous operation
*
* Start writeback against all of a mapping's dirty pages that lie
* within the byte offsets <start, end> inclusive.
*
* If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
* opposed to a regular memory cleansing writeback. The difference between
* these two operations is that if a dirty page/buffer is encountered, it must
* be waited upon, and not just skipped over.
*
* Return: %0 on success, negative error code otherwise.
*/
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end, int sync_mode)
{
struct writeback_control wbc = {
.sync_mode = sync_mode,
.nr_to_write = LONG_MAX,
.range_start = start,
.range_end = end,
};
return filemap_fdatawrite_wbc(mapping, &wbc);
}
static inline int __filemap_fdatawrite(struct address_space *mapping,
int sync_mode)
{
return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}
int filemap_fdatawrite(struct address_space *mapping)
{
return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);
int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end)
{
return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);
/**
* filemap_flush - mostly a non-blocking flush
* @mapping: target address_space
*
* This is a mostly non-blocking flush. Not suitable for data-integrity
* purposes - I/O may not be started against all dirty pages.
*
* Return: %0 on success, negative error code otherwise.
*/
int filemap_flush(struct address_space *mapping)
{
return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);
/**
* filemap_range_has_page - check if a page exists in range.
* @mapping: address space within which to check
* @start_byte: offset in bytes where the range starts
* @end_byte: offset in bytes where the range ends (inclusive)
*
* Find at least one page in the range supplied, usually used to check if
* direct writing in this range will trigger a writeback.
*
* Return: %true if at least one page exists in the specified range,
* %false otherwise.
*/
bool filemap_range_has_page(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
struct page *page;
XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT;
if (end_byte < start_byte)
return false;
rcu_read_lock();
for (;;) {
page = xas_find(&xas, max);
if (xas_retry(&xas, page))
continue;
/* Shadow entries don't count */
if (xa_is_value(page))
continue;
/*
* We don't need to try to pin this page; we're about to
* release the RCU lock anyway. It is enough to know that
* there was a page here recently.
*/
break;
}
rcu_read_unlock();
return page != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);
static void __filemap_fdatawait_range(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT;
struct pagevec pvec;
int nr_pages;
if (end_byte < start_byte)
return;
pagevec_init(&pvec);
while (index <= end) {
unsigned i;
nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
end, PAGECACHE_TAG_WRITEBACK);
if (!nr_pages)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
wait_on_page_writeback(page);
ClearPageError(page);
}
pagevec_release(&pvec);
cond_resched();
}
}
/**
* filemap_fdatawait_range - wait for writeback to complete
* @mapping: address space structure to wait for
* @start_byte: offset in bytes where the range starts
* @end_byte: offset in bytes where the range ends (inclusive)
*
* Walk the list of under-writeback pages of the given address space
* in the given range and wait for all of them. Check error status of
* the address space and return it.
*
* Since the error status of the address space is cleared by this function,
* callers are responsible for checking the return value and handling and/or
* reporting the error.
*
* Return: error status of the address space.
*/
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
loff_t end_byte)
{
__filemap_fdatawait_range(mapping, start_byte, end_byte);
return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);
/**
* filemap_fdatawait_range_keep_errors - wait for writeback to complete
* @mapping: address space structure to wait for
* @start_byte: offset in bytes where the range starts
* @end_byte: offset in bytes where the range ends (inclusive)
*
* Walk the list of under-writeback pages of the given address space in the
* given range and wait for all of them. Unlike filemap_fdatawait_range(),
* this function does not clear error status of the address space.
*
* Use this function if callers don't handle errors themselves. Expected
* call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
* fsfreeze(8)
*/
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
__filemap_fdatawait_range(mapping, start_byte, end_byte);
return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
/**
* file_fdatawait_range - wait for writeback to complete
* @file: file pointing to address space structure to wait for
* @start_byte: offset in bytes where the range starts
* @end_byte: offset in bytes where the range ends (inclusive)
*
* Walk the list of under-writeback pages of the address space that file
* refers to, in the given range and wait for all of them. Check error
* status of the address space vs. the file->f_wb_err cursor and return it.
*
* Since the error status of the file is advanced by this function,
* callers are responsible for checking the return value and handling and/or
* reporting the error.
*
* Return: error status of the address space vs. the file->f_wb_err cursor.
*/
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
struct address_space *mapping = file->f_mapping;
__filemap_fdatawait_range(mapping, start_byte, end_byte);
return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);
/**
* filemap_fdatawait_keep_errors - wait for writeback without clearing errors
* @mapping: address space structure to wait for
*
* Walk the list of under-writeback pages of the given address space
* and wait for all of them. Unlike filemap_fdatawait(), this function
* does not clear error status of the address space.
*
* Use this function if callers don't handle errors themselves. Expected
* call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
* fsfreeze(8)
*
* Return: error status of the address space.
*/
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
return mapping->nrpages;
}
/**
* filemap_range_needs_writeback - check if range potentially needs writeback
* @mapping: address space within which to check
* @start_byte: offset in bytes where the range starts
* @end_byte: offset in bytes where the range ends (inclusive)
*
* Find at least one page in the range supplied, usually used to check if
* direct writing in this range will trigger a writeback. Used by O_DIRECT
* read/write with IOCB_NOWAIT, to see if the caller needs to do
* filemap_write_and_wait_range() before proceeding.
*
* Return: %true if the caller should do filemap_write_and_wait_range() before
* doing O_DIRECT to a page in this range, %false otherwise.
*/
bool filemap_range_needs_writeback(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
pgoff_t max = end_byte >> PAGE_SHIFT;
struct page *page;
if (!mapping_needs_writeback(mapping))
return false;
if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
return false;
if (end_byte < start_byte)
return false;
rcu_read_lock();
xas_for_each(&xas, page, max) {
if (xas_retry(&xas, page))
continue;
if (xa_is_value(page))
continue;
if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
break;
}
rcu_read_unlock();
return page != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_needs_writeback);
/**
* filemap_write_and_wait_range - write out & wait on a file range
* @mapping: the address_space for the pages
* @lstart: offset in bytes where the range starts
* @lend: offset in bytes where the range ends (inclusive)
*
* Write out and wait upon file offsets lstart->lend, inclusive.
*
* Note that @lend is inclusive (describes the last byte to be written) so
* that this function can be used to write to the very end-of-file (end = -1).
*
* Return: error status of the address space.
*/
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
int err = 0;
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/*
* Even if the above returned error, the pages may be
* written partially (e.g. -ENOSPC), so we wait for it.
* But the -EIO is special case, it may indicate the worst
* thing (e.g. bug) happened, so we avoid waiting for it.
*/
if (err != -EIO) {
int err2 = filemap_fdatawait_range(mapping,
lstart, lend);
if (!err)
err = err2;
} else {
/* Clear any previously stored errors */
filemap_check_errors(mapping);
}
} else {
err = filemap_check_errors(mapping);
}
return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
errseq_t eseq = errseq_set(&mapping->wb_err, err);
trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);
/**
* file_check_and_advance_wb_err - report wb error (if any) that was previously
* and advance wb_err to current one
* @file: struct file on which the error is being reported
*
* When userland calls fsync (or something like nfsd does the equivalent), we
* want to report any writeback errors that occurred since the last fsync (or
* since the file was opened if there haven't been any).
*
* Grab the wb_err from the mapping. If it matches what we have in the file,
* then just quickly return 0. The file is all caught up.
*
* If it doesn't match, then take the mapping value, set the "seen" flag in
* it and try to swap it into place. If it works, or another task beat us
* to it with the new value, then update the f_wb_err and return the error
* portion. The error at this point must be reported via proper channels
* (a'la fsync, or NFS COMMIT operation, etc.).
*
* While we handle mapping->wb_err with atomic operations, the f_wb_err
* value is protected by the f_lock since we must ensure that it reflects
* the latest value swapped in for this file descriptor.
*
* Return: %0 on success, negative error code otherwise.
*/
int file_check_and_advance_wb_err(struct file *file)
{
int err = 0;
errseq_t old = READ_ONCE(file->f_wb_err);
struct address_space *mapping = file->f_mapping;
/* Locklessly handle the common case where nothing has changed */
if (errseq_check(&mapping->wb_err, old)) {
/* Something changed, must use slow path */
spin_lock(&file->f_lock);
old = file->f_wb_err;
err = errseq_check_and_advance(&mapping->wb_err,
&file->f_wb_err);
trace_file_check_and_advance_wb_err(file, old);
spin_unlock(&file->f_lock);
}
/*
* We're mostly using this function as a drop in replacement for
* filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
* that the legacy code would have had on these flags.
*/
clear_bit(AS_EIO, &mapping->flags);
clear_bit(AS_ENOSPC, &mapping->flags);
return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);
/**
* file_write_and_wait_range - write out & wait on a file range
* @file: file pointing to address_space with pages
* @lstart: offset in bytes where the range starts
* @lend: offset in bytes where the range ends (inclusive)
*
* Write out and wait upon file offsets lstart->lend, inclusive.
*
* Note that @lend is inclusive (describes the last byte to be written) so
* that this function can be used to write to the very end-of-file (end = -1).
*
* After writing out and waiting on the data, we check and advance the
* f_wb_err cursor to the latest value, and return any errors detected there.
*
* Return: %0 on success, negative error code otherwise.
*/
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
int err = 0, err2;
struct address_space *mapping = file->f_mapping;
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
if (err != -EIO)
__filemap_fdatawait_range(mapping, lstart, lend);
}
err2 = file_check_and_advance_wb_err(file);
if (!err)
err = err2;
return err;
}
EXPORT_SYMBOL(file_write_and_wait_range);
/**
* replace_page_cache_page - replace a pagecache page with a new one
* @old: page to be replaced
* @new: page to replace with
*
* This function replaces a page in the pagecache with a new one. On
* success it acquires the pagecache reference for the new page and
* drops it for the old page. Both the old and new pages must be
* locked. This function does not add the new page to the LRU, the
* caller must do that.
*
* The remove + add is atomic. This function cannot fail.
*/
void replace_page_cache_page(struct page *old, struct page *new)
{
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *) = mapping->a_ops->freepage;
pgoff_t offset = old->index;
XA_STATE(xas, &mapping->i_pages, offset);
VM_BUG_ON_PAGE(!PageLocked(old), old);
VM_BUG_ON_PAGE(!PageLocked(new), new);
VM_BUG_ON_PAGE(new->mapping, new);
get_page(new);
new->mapping = mapping;
new->index = offset;
mem_cgroup_migrate(old, new);
xas_lock_irq(&xas);
xas_store(&xas, new);
old->mapping = NULL;
/* hugetlb pages do not participate in page cache accounting. */
if (!PageHuge(old))
__dec_lruvec_page_state(old, NR_FILE_PAGES);
if (!PageHuge(new))
__inc_lruvec_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(old))
__dec_lruvec_page_state(old, NR_SHMEM);
if (PageSwapBacked(new))
__inc_lruvec_page_state(new, NR_SHMEM);
xas_unlock_irq(&xas);
if (freepage)
freepage(old);
put_page(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
noinline int __add_to_page_cache_locked(struct page *page,
struct address_space *mapping,
pgoff_t offset, gfp_t gfp,
void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
int error;
bool charged = false;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
mapping_set_update(&xas, mapping);
get_page(page);
page->mapping = mapping;
page->index = offset;
if (!huge) {
error = mem_cgroup_charge(page, NULL, gfp);
if (error)
goto error;
charged = true;
}
gfp &= GFP_RECLAIM_MASK;
do {
unsigned int order = xa_get_order(xas.xa, xas.xa_index);
void *entry, *old = NULL;
if (order > thp_order(page))
xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
order, gfp);
xas_lock_irq(&xas);
xas_for_each_conflict(&xas, entry) {
old = entry;
if (!xa_is_value(entry)) {
xas_set_err(&xas, -EEXIST);
goto unlock;
}
}
if (old) { if (shadowp) *shadowp = old;
/* entry may have been split before we acquired lock */
order = xa_get_order(xas.xa, xas.xa_index);
if (order > thp_order(page)) {
xas_split(&xas, old, order);
xas_reset(&xas);
}
}
xas_store(&xas, page);
if (xas_error(&xas))
goto unlock;
mapping->nrpages++;
/* hugetlb pages do not participate in page cache accounting */
if (!huge)
__inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
error = xas_error(&xas);
if (charged)
mem_cgroup_uncharge(page);
goto error;
}
trace_mm_filemap_add_to_page_cache(page);
return 0;
error:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
put_page(page);
return error;
}
ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
/**
* add_to_page_cache_locked - add a locked page to the pagecache
* @page: page to add
* @mapping: the page's address_space
* @offset: page index
* @gfp_mask: page allocation mode
*
* This function is used to add a page to the pagecache. It must be locked.
* This function does not add the page to the LRU. The caller must do that.
*
* Return: %0 on success, negative error code otherwise.
*/
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
return __add_to_page_cache_locked(page, mapping, offset,
gfp_mask, NULL);
}
EXPORT_SYMBOL(add_to_page_cache_locked);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
void *shadow = NULL;
int ret;
__SetPageLocked(page);
ret = __add_to_page_cache_locked(page, mapping, offset,
gfp_mask, &shadow);
if (unlikely(ret))
__ClearPageLocked(page);
else {
/*
* The page might have been evicted from cache only
* recently, in which case it should be activated like
* any other repeatedly accessed page.
* The exception is pages getting rewritten; evicting other
* data from the working set, only to cache data that will
* get overwritten with something else, is a waste of memory.
*/
WARN_ON_ONCE(PageActive(page)); if (!(gfp_mask & __GFP_WRITE) && shadow) workingset_refault(page, shadow); lru_cache_add(page);
}
return ret;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
int n;
struct page *page;
if (cpuset_do_page_mem_spread()) {
unsigned int cpuset_mems_cookie;
do {
cpuset_mems_cookie = read_mems_allowed_begin();
n = cpuset_mem_spread_node();
page = __alloc_pages_node(n, gfp, 0);
} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
return page;
}
return alloc_pages(gfp, 0);
}
EXPORT_SYMBOL(__page_cache_alloc);
#endif
/*
* filemap_invalidate_lock_two - lock invalidate_lock for two mappings
*
* Lock exclusively invalidate_lock of any passed mapping that is not NULL.
*
* @mapping1: the first mapping to lock
* @mapping2: the second mapping to lock
*/
void filemap_invalidate_lock_two(struct address_space *mapping1,
struct address_space *mapping2)
{
if (mapping1 > mapping2)
swap(mapping1, mapping2);
if (mapping1)
down_write(&mapping1->invalidate_lock);
if (mapping2 && mapping1 != mapping2)
down_write_nested(&mapping2->invalidate_lock, 1);
}
EXPORT_SYMBOL(filemap_invalidate_lock_two);
/*
* filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
*
* Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
*
* @mapping1: the first mapping to unlock
* @mapping2: the second mapping to unlock
*/
void filemap_invalidate_unlock_two(struct address_space *mapping1,
struct address_space *mapping2)
{
if (mapping1)
up_write(&mapping1->invalidate_lock);
if (mapping2 && mapping1 != mapping2)
up_write(&mapping2->invalidate_lock);
}
EXPORT_SYMBOL(filemap_invalidate_unlock_two);
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
* waitqueues where the bucket discipline is to maintain all
* waiters on the same queue and wake all when any of the pages
* become available, and for the woken contexts to check to be
* sure the appropriate page became available, this saves space
* at a cost of "thundering herd" phenomena during rare hash
* collisions.
*/
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
static wait_queue_head_t *page_waitqueue(struct page *page)
{
return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
}
void __init pagecache_init(void)
{
int i;
for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
init_waitqueue_head(&page_wait_table[i]);
page_writeback_init();
}
/*
* The page wait code treats the "wait->flags" somewhat unusually, because
* we have multiple different kinds of waits, not just the usual "exclusive"
* one.
*
* We have:
*
* (a) no special bits set:
*
* We're just waiting for the bit to be released, and when a waker
* calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
* and remove it from the wait queue.
*
* Simple and straightforward.
*
* (b) WQ_FLAG_EXCLUSIVE:
*
* The waiter is waiting to get the lock, and only one waiter should
* be woken up to avoid any thundering herd behavior. We'll set the
* WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
*
* This is the traditional exclusive wait.
*
* (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
*
* The waiter is waiting to get the bit, and additionally wants the
* lock to be transferred to it for fair lock behavior. If the lock
* cannot be taken, we stop walking the wait queue without waking
* the waiter.
*
* This is the "fair lock handoff" case, and in addition to setting
* WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
* that it now has the lock.
*/
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
unsigned int flags;
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
= container_of(wait, struct wait_page_queue, wait);
if (!wake_page_match(wait_page, key)) return 0;
/*
* If it's a lock handoff wait, we get the bit for it, and
* stop walking (and do not wake it up) if we can't.
*/
flags = wait->flags;
if (flags & WQ_FLAG_EXCLUSIVE) {
if (test_bit(key->bit_nr, &key->page->flags))
return -1;
if (flags & WQ_FLAG_CUSTOM) { if (test_and_set_bit(key->bit_nr, &key->page->flags))
return -1;
flags |= WQ_FLAG_DONE;
}
}
/*
* We are holding the wait-queue lock, but the waiter that
* is waiting for this will be checking the flags without
* any locking.
*
* So update the flags atomically, and wake up the waiter
* afterwards to avoid any races. This store-release pairs
* with the load-acquire in wait_on_page_bit_common().
*/
smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
wake_up_state(wait->private, mode);
/*
* Ok, we have successfully done what we're waiting for,
* and we can unconditionally remove the wait entry.
*
* Note that this pairs with the "finish_wait()" in the
* waiter, and has to be the absolute last thing we do.
* After this list_del_init(&wait->entry) the wait entry
* might be de-allocated and the process might even have
* exited.
*/
list_del_init_careful(&wait->entry);
return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}
static void wake_up_page_bit(struct page *page, int bit_nr)
{
wait_queue_head_t *q = page_waitqueue(page);
struct wait_page_key key;
unsigned long flags;
wait_queue_entry_t bookmark;
key.page = page;
key.bit_nr = bit_nr;
key.page_match = 0;
bookmark.flags = 0;
bookmark.private = NULL;
bookmark.func = NULL;
INIT_LIST_HEAD(&bookmark.entry);
spin_lock_irqsave(&q->lock, flags);
__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
while (bookmark.flags & WQ_FLAG_BOOKMARK) {
/*
* Take a breather from holding the lock,
* allow pages that finish wake up asynchronously
* to acquire the lock and remove themselves
* from wait queue
*/
spin_unlock_irqrestore(&q->lock, flags);
cpu_relax();
spin_lock_irqsave(&q->lock, flags);
__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
}
/*
* It is possible for other pages to have collided on the waitqueue
* hash, so in that case check for a page match. That prevents a long-
* term waiter
*
* It is still possible to miss a case here, when we woke page waiters
* and removed them from the waitqueue, but there are still other
* page waiters.
*/
if (!waitqueue_active(q) || !key.page_match) {
ClearPageWaiters(page);
/*
* It's possible to miss clearing Waiters here, when we woke
* our page waiters, but the hashed waitqueue has waiters for
* other pages on it.
*
* That's okay, it's a rare case. The next waker will clear it.
*/
}
spin_unlock_irqrestore(&q->lock, flags);
}
static void wake_up_page(struct page *page, int bit)
{
if (!PageWaiters(page))
return;
wake_up_page_bit(page, bit);
}
/*
* A choice of three behaviors for wait_on_page_bit_common():
*/
enum behavior {
EXCLUSIVE, /* Hold ref to page and take the bit when woken, like
* __lock_page() waiting on then setting PG_locked.
*/
SHARED, /* Hold ref to page and check the bit when woken, like
* wait_on_page_writeback() waiting on PG_writeback.
*/
DROP, /* Drop ref to page before wait, no check when woken,
* like put_and_wait_on_page_locked() on PG_locked.
*/
};
/*
* Attempt to check (or get) the page bit, and mark us done
* if successful.
*/
static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
struct wait_queue_entry *wait)
{
if (wait->flags & WQ_FLAG_EXCLUSIVE) {
if (test_and_set_bit(bit_nr, &page->flags))
return false;
} else if (test_bit(bit_nr, &page->flags))
return false;
wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
return true;
}
/* How many times do we accept lock stealing from under a waiter? */
int sysctl_page_lock_unfairness = 5;
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, enum behavior behavior)
{
int unfairness = sysctl_page_lock_unfairness;
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
bool thrashing = false;
bool delayacct = false;
unsigned long pflags;
if (bit_nr == PG_locked &&
!PageUptodate(page) && PageWorkingset(page)) {
if (!PageSwapBacked(page)) {
delayacct_thrashing_start();
delayacct = true;
}
psi_memstall_enter(&pflags);
thrashing = true;
}
init_wait(wait);
wait->func = wake_page_function;
wait_page.page = page;
wait_page.bit_nr = bit_nr;
repeat:
wait->flags = 0;
if (behavior == EXCLUSIVE) {
wait->flags = WQ_FLAG_EXCLUSIVE; if (--unfairness < 0) wait->flags |= WQ_FLAG_CUSTOM;
}
/*
* Do one last check whether we can get the
* page bit synchronously.
*
* Do the SetPageWaiters() marking before that
* to let any waker we _just_ missed know they
* need to wake us up (otherwise they'll never
* even go to the slow case that looks at the
* page queue), and add ourselves to the wait
* queue if we need to sleep.
*
* This part needs to be done under the queue
* lock to avoid races.
*/
spin_lock_irq(&q->lock);
SetPageWaiters(page);
if (!trylock_page_bit_common(page, bit_nr, wait))
__add_wait_queue_entry_tail(q, wait);
spin_unlock_irq(&q->lock);
/*
* From now on, all the logic will be based on
* the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
* see whether the page bit testing has already
* been done by the wake function.
*
* We can drop our reference to the page.
*/
if (behavior == DROP)
put_page(page);
/*
* Note that until the "finish_wait()", or until
* we see the WQ_FLAG_WOKEN flag, we need to
* be very careful with the 'wait->flags', because
* we may race with a waker that sets them.
*/
for (;;) {
unsigned int flags;
set_current_state(state);
/* Loop until we've been woken or interrupted */
flags = smp_load_acquire(&wait->flags);
if (!(flags & WQ_FLAG_WOKEN)) {
if (signal_pending_state(state, current))
break;
io_schedule();
continue;
}
/* If we were non-exclusive, we're done */
if (behavior != EXCLUSIVE)
break;
/* If the waker got the lock for us, we're done */
if (flags & WQ_FLAG_DONE)
break;
/*
* Otherwise, if we're getting the lock, we need to
* try to get it ourselves.
*
* And if that fails, we'll have to retry this all.
*/
if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
goto repeat;
wait->flags |= WQ_FLAG_DONE;
break;
}
/*
* If a signal happened, this 'finish_wait()' may remove the last
* waiter from the wait-queues, but the PageWaiters bit will remain
* set. That's ok. The next wakeup will take care of it, and trying
* to do it here would be difficult and prone to races.
*/
finish_wait(q, wait);
if (thrashing) {
if (delayacct)
delayacct_thrashing_end();
psi_memstall_leave(&pflags);
}
/*
* NOTE! The wait->flags weren't stable until we've done the
* 'finish_wait()', and we could have exited the loop above due
* to a signal, and had a wakeup event happen after the signal
* test but before the 'finish_wait()'.
*
* So only after the finish_wait() can we reliably determine
* if we got woken up or not, so we can now figure out the final
* return value based on that state without races.
*
* Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
* waiter, but an exclusive one requires WQ_FLAG_DONE.
*/
if (behavior == EXCLUSIVE)
return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
void wait_on_page_bit(struct page *page, int bit_nr)
{
wait_queue_head_t *q = page_waitqueue(page);
wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit);
int wait_on_page_bit_killable(struct page *page, int bit_nr)
{
wait_queue_head_t *q = page_waitqueue(page);
return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit_killable);
/**
* put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
* @page: The page to wait for.
* @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
*
* The caller should hold a reference on @page. They expect the page to
* become unlocked relatively soon, but do not wish to hold up migration
* (for example) by holding the reference while waiting for the page to
* come unlocked. After this function returns, the caller should not
* dereference @page.
*
* Return: 0 if the page was unlocked or -EINTR if interrupted by a signal.
*/
int put_and_wait_on_page_locked(struct page *page, int state)
{
wait_queue_head_t *q;
page = compound_head(page);
q = page_waitqueue(page);
return wait_on_page_bit_common(q, page, PG_locked, state, DROP);
}
/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
* @page: Page defining the wait queue of interest
* @waiter: Waiter to add to the queue
*
* Add an arbitrary @waiter to the wait queue for the nominated @page.
*/
void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
{
wait_queue_head_t *q = page_waitqueue(page);
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue_entry_tail(q, waiter);
SetPageWaiters(page);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);
#ifndef clear_bit_unlock_is_negative_byte
/*
* PG_waiters is the high bit in the same byte as PG_lock.
*
* On x86 (and on many other architectures), we can clear PG_lock and
* test the sign bit at the same time. But if the architecture does
* not support that special operation, we just do this all by hand
* instead.
*
* The read of PG_waiters has to be after (or concurrently with) PG_locked
* being cleared, but a memory barrier should be unnecessary since it is
* in the same byte as PG_locked.
*/
static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
{
clear_bit_unlock(nr, mem);
/* smp_mb__after_atomic(); */
return test_bit(PG_waiters, mem);
}
#endif
/**
* unlock_page - unlock a locked page
* @page: the page
*
* Unlocks the page and wakes up sleepers in wait_on_page_locked().
* Also wakes sleepers in wait_on_page_writeback() because the wakeup
* mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
*
* Note that this depends on PG_waiters being the sign bit in the byte
* that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
* clear the PG_locked bit and test PG_waiters at the same time fairly
* portably (architectures that do LL/SC can test any bit, while x86 can
* test the sign bit).
*/
void unlock_page(struct page *page)
{
BUILD_BUG_ON(PG_waiters != 7);
page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags)) wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
/**
* end_page_private_2 - Clear PG_private_2 and release any waiters
* @page: The page
*
* Clear the PG_private_2 bit on a page and wake up any sleepers waiting for
* this. The page ref held for PG_private_2 being set is released.
*
* This is, for example, used when a netfs page is being written to a local
* disk cache, thereby allowing writes to the cache for the same page to be
* serialised.
*/
void end_page_private_2(struct page *page)
{
page = compound_head(page);
VM_BUG_ON_PAGE(!PagePrivate2(page), page);
clear_bit_unlock(PG_private_2, &page->flags);
wake_up_page_bit(page, PG_private_2);
put_page(page);
}
EXPORT_SYMBOL(end_page_private_2);
/**
* wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page
* @page: The page to wait on
*
* Wait for PG_private_2 (aka PG_fscache) to be cleared on a page.
*/
void wait_on_page_private_2(struct page *page)
{
page = compound_head(page);
while (PagePrivate2(page))
wait_on_page_bit(page, PG_private_2);
}
EXPORT_SYMBOL(wait_on_page_private_2);
/**
* wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page
* @page: The page to wait on
*
* Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a
* fatal signal is received by the calling task.
*
* Return:
* - 0 if successful.
* - -EINTR if a fatal signal was encountered.
*/
int wait_on_page_private_2_killable(struct page *page)
{
int ret = 0;
page = compound_head(page);
while (PagePrivate2(page)) {
ret = wait_on_page_bit_killable(page, PG_private_2);
if (ret < 0)
break;
}
return ret;
}
EXPORT_SYMBOL(wait_on_page_private_2_killable);
/**
* end_page_writeback - end writeback against a page
* @page: the page
*/
void end_page_writeback(struct page *page)
{
/*
* TestClearPageReclaim could be used here but it is an atomic
* operation and overkill in this particular case. Failing to
* shuffle a page marked for immediate reclaim is too mild to
* justify taking an atomic operation penalty at the end of
* ever page writeback.
*/
if (PageReclaim(page)) {
ClearPageReclaim(page);
rotate_reclaimable_page(page);
}
/*
* Writeback does not hold a page reference of its own, relying
* on truncation to wait for the clearing of PG_writeback.
* But here we must make sure that the page is not freed and
* reused before the wake_up_page().
*/
get_page(page);
if (!test_clear_page_writeback(page))
BUG();
smp_mb__after_atomic();
wake_up_page(page, PG_writeback);
put_page(page);
}
EXPORT_SYMBOL(end_page_writeback);
/*
* After completing I/O on a page, call this routine to update the page
* flags appropriately
*/
void page_endio(struct page *page, bool is_write, int err)
{
if (!is_write) {
if (!err) {
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
SetPageError(page);
}
unlock_page(page);
} else {
if (err) {
struct address_space *mapping;
SetPageError(page);
mapping = page_mapping(page);
if (mapping)
mapping_set_error(mapping, err);
}
end_page_writeback(page);
}
}
EXPORT_SYMBOL_GPL(page_endio);
/**
* __lock_page - get a lock on the page, assuming we need to sleep to get it
* @__page: the page to lock
*/
void __lock_page(struct page *__page)
{
struct page *page = compound_head(__page);
wait_queue_head_t *q = page_waitqueue(page);
wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
EXCLUSIVE);
}
EXPORT_SYMBOL(__lock_page);
int __lock_page_killable(struct page *__page)
{
struct page *page = compound_head(__page);
wait_queue_head_t *q = page_waitqueue(page);
return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
int __lock_page_async(struct page *page, struct wait_page_queue *wait)
{
struct wait_queue_head *q = page_waitqueue(page);
int ret = 0;
wait->page = page;
wait->bit_nr = PG_locked;
spin_lock_irq(&q->lock);
__add_wait_queue_entry_tail(q, &wait->wait);
SetPageWaiters(page);
ret = !trylock_page(page);
/*
* If we were successful now, we know we're still on the
* waitqueue as we're still under the lock. This means it's
* safe to remove and return success, we know the callback
* isn't going to trigger.
*/
if (!ret)
__remove_wait_queue(q, &wait->wait);
else
ret = -EIOCBQUEUED;
spin_unlock_irq(&q->lock);
return ret;
}
/*
* Return values:
* 1 - page is locked; mmap_lock is still held.
* 0 - page is not locked.
* mmap_lock has been released (mmap_read_unlock(), unless flags had both
* FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
* which case mmap_lock is still held.
*
* If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
* with the page locked and the mmap_lock unperturbed.
*/
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags)
{
if (fault_flag_allow_retry_first(flags)) {
/*
* CAUTION! In this case, mmap_lock is not released
* even though return 0.
*/
if (flags & FAULT_FLAG_RETRY_NOWAIT)
return 0;
mmap_read_unlock(mm);
if (flags & FAULT_FLAG_KILLABLE)
wait_on_page_locked_killable(page);
else
wait_on_page_locked(page);
return 0;
}
if (flags & FAULT_FLAG_KILLABLE) {
int ret;
ret = __lock_page_killable(page);
if (ret) {
mmap_read_unlock(mm);
return 0;
}
} else {
__lock_page(page);
}
return 1;
}
/**
* page_cache_next_miss() - Find the next gap in the page cache.
* @mapping: Mapping.
* @index: Index.
* @max_scan: Maximum range to search.
*
* Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
* gap with the lowest index.
*
* This function may be called under the rcu_read_lock. However, this will
* not atomically search a snapshot of the cache at a single point in time.
* For example, if a gap is created at index 5, then subsequently a gap is
* created at index 10, page_cache_next_miss covering both indices may
* return 10 if called under the rcu_read_lock.
*
* Return: The index of the gap if found, otherwise an index outside the
* range specified (in which case 'return - index >= max_scan' will be true).
* In the rare case of index wrap-around, 0 will be returned.
*/
pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
XA_STATE(xas, &mapping->i_pages, index); while (max_scan--) {
void *entry = xas_next(&xas);
if (!entry || xa_is_value(entry))
break;
if (xas.xa_index == 0)
break;
}
return xas.xa_index;}
EXPORT_SYMBOL(page_cache_next_miss);
/**
* page_cache_prev_miss() - Find the previous gap in the page cache.
* @mapping: Mapping.
* @index: Index.
* @max_scan: Maximum range to search.
*
* Search the range [max(index - max_scan + 1, 0), index] for the
* gap with the highest index.
*
* This function may be called under the rcu_read_lock. However, this will
* not atomically search a snapshot of the cache at a single point in time.
* For example, if a gap is created at index 10, then subsequently a gap is
* created at index 5, page_cache_prev_miss() covering both indices may
* return 5 if called under the rcu_read_lock.
*
* Return: The index of the gap if found, otherwise an index outside the
* range specified (in which case 'index - return >= max_scan' will be true).
* In the rare case of wrap-around, ULONG_MAX will be returned.
*/
pgoff_t page_cache_prev_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
XA_STATE(xas, &mapping->i_pages, index); while (max_scan--) {
void *entry = xas_prev(&xas);
if (!entry || xa_is_value(entry))
break;
if (xas.xa_index == ULONG_MAX)
break;
}
return xas.xa_index;}
EXPORT_SYMBOL(page_cache_prev_miss);
/*
* mapping_get_entry - Get a page cache entry.
* @mapping: the address_space to search
* @index: The page cache index.
*
* Looks up the page cache slot at @mapping & @index. If there is a
* page cache page, the head page is returned with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
* Return: The head page or shadow entry, %NULL if nothing is found.
*/
static struct page *mapping_get_entry(struct address_space *mapping,
pgoff_t index)
{
XA_STATE(xas, &mapping->i_pages, index);
struct page *page;
rcu_read_lock();
repeat:
xas_reset(&xas);
page = xas_load(&xas);
if (xas_retry(&xas, page))
goto repeat;
/*
* A shadow entry of a recently evicted page, or a swap entry from
* shmem/tmpfs. Return it without attempting to raise page count.
*/
if (!page || xa_is_value(page))
goto out;
if (!page_cache_get_speculative(page))
goto repeat;
/*
* Has the page moved or been split?
* This is part of the lockless pagecache protocol. See
* include/linux/pagemap.h for details.
*/
if (unlikely(page != xas_reload(&xas))) {
put_page(page);
goto repeat;
}
out:
rcu_read_unlock();
return page;
}
/**
* pagecache_get_page - Find and get a reference to a page.
* @mapping: The address_space to search.
* @index: The page index.
* @fgp_flags: %FGP flags modify how the page is returned.
* @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
*
* Looks up the page cache entry at @mapping & @index.
*
* @fgp_flags can be zero or more of these flags:
*
* * %FGP_ACCESSED - The page will be marked accessed.
* * %FGP_LOCK - The page is returned locked.
* * %FGP_HEAD - If the page is present and a THP, return the head page
* rather than the exact page specified by the index.
* * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
* instead of allocating a new page to replace it.
* * %FGP_CREAT - If no page is present then a new page is allocated using
* @gfp_mask and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
* * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
* page is already in cache. If the page was allocated, unlock it before
* returning so the caller can do the same dance.
* * %FGP_WRITE - The page will be written
* * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
* * %FGP_NOWAIT - Don't get blocked by page lock
*
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
*
* If there is a page cache page, it is returned with an increased refcount.
*
* Return: The found page or %NULL otherwise.
*/
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
int fgp_flags, gfp_t gfp_mask)
{
struct page *page;
repeat:
page = mapping_get_entry(mapping, index);
if (xa_is_value(page)) {
if (fgp_flags & FGP_ENTRY)
return page;
page = NULL;
}
if (!page)
goto no_page;
if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) {
if (!trylock_page(page)) {
put_page(page);
return NULL;
}
} else {
lock_page(page);
}
/* Has the page been truncated? */
if (unlikely(page->mapping != mapping)) { unlock_page(page);
put_page(page);
goto repeat;
}
VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
if (fgp_flags & FGP_ACCESSED) mark_page_accessed(page);
else if (fgp_flags & FGP_WRITE) {
/* Clear idle flag for buffer write */
if (page_is_idle(page))
clear_page_idle(page);
}
if (!(fgp_flags & FGP_HEAD))
page = find_subpage(page, index);
no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
int err;
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp_mask |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) gfp_mask &= ~__GFP_FS; page = __page_cache_alloc(gfp_mask);
if (!page)
return NULL;
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
/* Init accessed so avoid atomic mark_page_accessed later */
if (fgp_flags & FGP_ACCESSED)
__SetPageReferenced(page);
err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
if (unlikely(err)) {
put_page(page);
page = NULL;
if (err == -EEXIST)
goto repeat;
}
/*
* add_to_page_cache_lru locks the page, and for mmap we expect
* an unlocked page.
*/
if (page && (fgp_flags & FGP_FOR_MMAP)) unlock_page(page);
}
return page;
}
EXPORT_SYMBOL(pagecache_get_page);
static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max,
xa_mark_t mark)
{
struct page *page;
retry:
if (mark == XA_PRESENT) page = xas_find(xas, max);
else
page = xas_find_marked(xas, max, mark);
if (xas_retry(xas, page))
goto retry;
/*
* A shadow entry of a recently evicted page, a swap
* entry from shmem/tmpfs or a DAX entry. Return it
* without attempting to raise page count.
*/
if (!page || xa_is_value(page))
return page;
if (!page_cache_get_speculative(page))
goto reset;
/* Has the page moved or been split? */
if (unlikely(page != xas_reload(xas))) {
put_page(page);
goto reset;
}
return page;
reset:
xas_reset(xas);
goto retry;
}
/**
* find_get_entries - gang pagecache lookup
* @mapping: The address_space to search
* @start: The starting page cache index
* @end: The final page index (inclusive).
* @pvec: Where the resulting entries are placed.
* @indices: The cache indices corresponding to the entries in @entries
*
* find_get_entries() will search for and return a batch of entries in
* the mapping. The entries are placed in @pvec. find_get_entries()
* takes a reference on any actual pages it returns.
*
* The search returns a group of mapping-contiguous page cache entries
* with ascending indexes. There may be holes in the indices due to
* not-present pages.
*
* Any shadow entries of evicted pages, or swap entries from
* shmem/tmpfs, are included in the returned array.
*
* If it finds a Transparent Huge Page, head or tail, find_get_entries()
* stops at that page: the caller is likely to have a better way to handle
* the compound page as a whole, and then skip its extent, than repeatedly
* calling find_get_entries() to return all its tails.
*
* Return: the number of pages and shadow entries which were found.
*/
unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
pgoff_t end, struct pagevec *pvec, pgoff_t *indices)
{
XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
unsigned int ret = 0;
unsigned nr_entries = PAGEVEC_SIZE;
rcu_read_lock();
while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
/*
* Terminate early on finding a THP, to allow the caller to
* handle it all at once; but continue if this is hugetlbfs.
*/
if (!xa_is_value(page) && PageTransHuge(page) &&
!PageHuge(page)) {
page = find_subpage(page, xas.xa_index);
nr_entries = ret + 1;
}
indices[ret] = xas.xa_index;
pvec->pages[ret] = page;
if (++ret == nr_entries)
break;
}
rcu_read_unlock();
pvec->nr = ret;
return ret;
}
/**
* find_lock_entries - Find a batch of pagecache entries.
* @mapping: The address_space to search.
* @start: The starting page cache index.
* @end: The final page index (inclusive).
* @pvec: Where the resulting entries are placed.
* @indices: The cache indices of the entries in @pvec.
*
* find_lock_entries() will return a batch of entries from @mapping.
* Swap, shadow and DAX entries are included. Pages are returned
* locked and with an incremented refcount. Pages which are locked by
* somebody else or under writeback are skipped. Only the head page of
* a THP is returned. Pages which are partially outside the range are
* not returned.
*
* The entries have ascending indexes. The indices may not be consecutive
* due to not-present entries, THP pages, pages which could not be locked
* or pages under writeback.
*
* Return: The number of entries which were found.
*/
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
pgoff_t end, struct pagevec *pvec, pgoff_t *indices)
{
XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
rcu_read_lock();
while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
if (!xa_is_value(page)) { if (page->index < start)
goto put;
if (page->index + thp_nr_pages(page) - 1 > end)
goto put;
if (!trylock_page(page))
goto put;
if (page->mapping != mapping || PageWriteback(page))
goto unlock;
VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index),
page);
}
indices[pvec->nr] = xas.xa_index;
if (!pagevec_add(pvec, page))
break;
goto next;
unlock:
unlock_page(page);
put:
put_page(page);
next:
if (!xa_is_value(page) && PageTransHuge(page)) {
unsigned int nr_pages = thp_nr_pages(page);
/* Final THP may cross MAX_LFS_FILESIZE on 32-bit */
xas_set(&xas, page->index + nr_pages);
if (xas.xa_index < nr_pages)
break;
}
}
rcu_read_unlock();
return pagevec_count(pvec);
}
/**
* find_get_pages_range - gang pagecache lookup
* @mapping: The address_space to search
* @start: The starting page index
* @end: The final page index (inclusive)
* @nr_pages: The maximum number of pages
* @pages: Where the resulting pages are placed
*
* find_get_pages_range() will search for and return a group of up to @nr_pages
* pages in the mapping starting at index @start and up to index @end
* (inclusive). The pages are placed at @pages. find_get_pages_range() takes
* a reference against the returned pages.
*
* The search returns a group of mapping-contiguous pages with ascending
* indexes. There may be holes in the indices due to not-present pages.
* We also update @start to index the next page for the traversal.
*
* Return: the number of pages which were found. If this number is
* smaller than @nr_pages, the end of specified range has been
* reached.
*/
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
pgoff_t end, unsigned int nr_pages,
struct page **pages)
{
XA_STATE(xas, &mapping->i_pages, *start);
struct page *page;
unsigned ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
/* Skip over shadow, swap and DAX entries */
if (xa_is_value(page))
continue;
pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*start = xas.xa_index + 1;
goto out;
}
}
/*
* We come here when there is no page beyond @end. We take care to not
* overflow the index @start as it confuses some of the callers. This
* breaks the iteration when there is a page at index -1 but that is
* already broken anyway.
*/
if (end == (pgoff_t)-1)
*start = (pgoff_t)-1;
else
*start = end + 1;
out:
rcu_read_unlock();
return ret;
}
/**
* find_get_pages_contig - gang contiguous pagecache lookup
* @mapping: The address_space to search
* @index: The starting page index
* @nr_pages: The maximum number of pages
* @pages: Where the resulting pages are placed
*
* find_get_pages_contig() works exactly like find_get_pages(), except
* that the returned number of pages are guaranteed to be contiguous.
*
* Return: the number of pages which were found.
*/
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
unsigned int nr_pages, struct page **pages)
{
XA_STATE(xas, &mapping->i_pages, index);
struct page *page;
unsigned int ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
for (page = xas_load(&xas); page; page = xas_next(&xas)) {
if (xas_retry(&xas, page))
continue;
/*
* If the entry has been swapped out, we can stop looking.
* No current caller is looking for DAX entries.
*/
if (xa_is_value(page))
break;
if (!page_cache_get_speculative(page))
goto retry;
/* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages)
break;
continue;
put_page:
put_page(page);
retry:
xas_reset(&xas);
}
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(find_get_pages_contig);
/**
* find_get_pages_range_tag - Find and return head pages matching @tag.
* @mapping: the address_space to search
* @index: the starting page index
* @end: The final page index (inclusive)
* @tag: the tag index
* @nr_pages: the maximum number of pages
* @pages: where the resulting pages are placed
*
* Like find_get_pages(), except we only return head pages which are tagged
* with @tag. @index is updated to the index immediately after the last
* page we return, ready for the next iteration.
*
* Return: the number of pages which were found.
*/
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
struct page **pages)
{
XA_STATE(xas, &mapping->i_pages, *index);
struct page *page;
unsigned ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
while ((page = find_get_entry(&xas, end, tag))) {
/*
* Shadow entries should never be tagged, but this iteration
* is lockless so there is a window for page reclaim to evict
* a page we saw tagged. Skip over it.
*/
if (xa_is_value(page))
continue;
pages[ret] = page;
if (++ret == nr_pages) {
*index = page->index + thp_nr_pages(page);
goto out;
}
}
/*
* We come here when we got to @end. We take care to not overflow the
* index @index as it confuses some of the callers. This breaks the
* iteration when there is a page at index -1 but that is already
* broken anyway.
*/
if (end == (pgoff_t)-1) *index = (pgoff_t)-1;
else
*index = end + 1;
out:
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(find_get_pages_range_tag);
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
*
* ---R__________________________________________B__________
* ^ reading here ^ bad block(assume 4k)
*
* read(R) => miss => readahead(R...B) => media error => frustrating retries
* => failing the whole request => read(R) => read(R+1) =>
* readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
* readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
* readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
*
* It is going insane. Fix it by quickly scaling down the readahead size.
*/
static void shrink_readahead_size_eio(struct file_ra_state *ra)
{
ra->ra_pages /= 4;
}
/*
* filemap_get_read_batch - Get a batch of pages for read
*
* Get a batch of pages which represent a contiguous range of bytes
* in the file. No tail pages will be returned. If @index is in the
* middle of a THP, the entire THP will be returned. The last page in
* the batch may have Readahead set or be not Uptodate so that the
* caller can take the appropriate action.
*/
static void filemap_get_read_batch(struct address_space *mapping,
pgoff_t index, pgoff_t max, struct pagevec *pvec)
{
XA_STATE(xas, &mapping->i_pages, index);
struct page *head;
rcu_read_lock();
for (head = xas_load(&xas); head; head = xas_next(&xas)) {
if (xas_retry(&xas, head))
continue;
if (xas.xa_index > max || xa_is_value(head))
break;
if (!page_cache_get_speculative(head))
goto retry;
/* Has the page moved or been split? */
if (unlikely(head != xas_reload(&xas)))
goto put_page;
if (!pagevec_add(pvec, head))
break;
if (!PageUptodate(head))
break;
if (PageReadahead(head))
break;
if (PageHead(head)) {
xas_set(&xas, head->index + thp_nr_pages(head));
/* Handle wrap correctly */
if (xas.xa_index - 1 >= max)
break;
}
continue;
put_page:
put_page(head);
retry:
xas_reset(&xas);
}
rcu_read_unlock();
}
static int filemap_read_page(struct file *file, struct address_space *mapping,
struct page *page)
{
int error;
/*
* A previous I/O error may have been due to temporary failures,
* eg. multipath errors. PG_error will be set again if readpage
* fails.
*/
ClearPageError(page);
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(file, page);
if (error)
return error;
error = wait_on_page_locked_killable(page);
if (error)
return error;
if (PageUptodate(page))
return 0;
shrink_readahead_size_eio(&file->f_ra);
return -EIO;
}
static bool filemap_range_uptodate(struct address_space *mapping,
loff_t pos, struct iov_iter *iter, struct page *page)
{
int count;
if (PageUptodate(page))
return true;
/* pipes can't handle partially uptodate pages */
if (iov_iter_is_pipe(iter))
return false;
if (!mapping->a_ops->is_partially_uptodate)
return false;
if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page)))
return false;
count = iter->count;
if (page_offset(page) > pos) {
count -= page_offset(page) - pos;
pos = 0;
} else {
pos -= page_offset(page);
}
return mapping->a_ops->is_partially_uptodate(page, pos, count);
}
static int filemap_update_page(struct kiocb *iocb,
struct address_space *mapping, struct iov_iter *iter,
struct page *page)
{
int error;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!filemap_invalidate_trylock_shared(mapping))
return -EAGAIN;
} else {
filemap_invalidate_lock_shared(mapping);
}
if (!trylock_page(page)) {
error = -EAGAIN;
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
goto unlock_mapping;
if (!(iocb->ki_flags & IOCB_WAITQ)) {
filemap_invalidate_unlock_shared(mapping);
put_and_wait_on_page_locked(page, TASK_KILLABLE);
return AOP_TRUNCATED_PAGE;
}
error = __lock_page_async(page, iocb->ki_waitq);
if (error)
goto unlock_mapping;
}
error = AOP_TRUNCATED_PAGE;
if (!page->mapping)
goto unlock;
error = 0;
if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
goto unlock;
error = -EAGAIN;
if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
goto unlock;
error = filemap_read_page(iocb->ki_filp, mapping, page);
goto unlock_mapping;
unlock:
unlock_page(page);
unlock_mapping:
filemap_invalidate_unlock_shared(mapping);
if (error == AOP_TRUNCATED_PAGE)
put_page(page);
return error;
}
static int filemap_create_page(struct file *file,
struct address_space *mapping, pgoff_t index,
struct pagevec *pvec)
{
struct page *page;
int error;
page = page_cache_alloc(mapping);
if (!page)
return -ENOMEM;
/*
* Protect against truncate / hole punch. Grabbing invalidate_lock here
* assures we cannot instantiate and bring uptodate new pagecache pages
* after evicting page cache during truncate and before actually
* freeing blocks. Note that we could release invalidate_lock after
* inserting the page into page cache as the locked page would then be
* enough to synchronize with hole punching. But there are code paths
* such as filemap_update_page() filling in partially uptodate pages or
* ->readpages() that need to hold invalidate_lock while mapping blocks
* for IO so let's hold the lock here as well to keep locking rules
* simple.
*/
filemap_invalidate_lock_shared(mapping);
error = add_to_page_cache_lru(page, mapping, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error == -EEXIST)
error = AOP_TRUNCATED_PAGE;
if (error)
goto error;
error = filemap_read_page(file, mapping, page);
if (error)
goto error;
filemap_invalidate_unlock_shared(mapping);
pagevec_add(pvec, page);
return 0;
error:
filemap_invalidate_unlock_shared(mapping);
put_page(page);
return error;
}
static int filemap_readahead(struct kiocb *iocb, struct file *file,
struct address_space *mapping, struct page *page,
pgoff_t last_index)
{
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
page_cache_async_readahead(mapping, &file->f_ra, file, page,
page->index, last_index - page->index);
return 0;
}
static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
struct pagevec *pvec)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index;
struct page *page;
int err = 0;
last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
retry:
if (fatal_signal_pending(current))
return -EINTR;
filemap_get_read_batch(mapping, index, last_index, pvec);
if (!pagevec_count(pvec)) {
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
page_cache_sync_readahead(mapping, ra, filp, index,
last_index - index);
filemap_get_read_batch(mapping, index, last_index, pvec);
}
if (!pagevec_count(pvec)) {
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
return -EAGAIN;
err = filemap_create_page(filp, mapping,
iocb->ki_pos >> PAGE_SHIFT, pvec);
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
}
page = pvec->pages[pagevec_count(pvec) - 1];
if (PageReadahead(page)) {
err = filemap_readahead(iocb, filp, mapping, page, last_index);
if (err)
goto err;
}
if (!PageUptodate(page)) {
if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1) iocb->ki_flags |= IOCB_NOWAIT;
err = filemap_update_page(iocb, mapping, iter, page);
if (err)
goto err;
}
return 0;
err:
if (err < 0)
put_page(page);
if (likely(--pvec->nr))
return 0;
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
}
/**
* filemap_read - Read data from the page cache.
* @iocb: The iocb to read.
* @iter: Destination for the data.
* @already_read: Number of bytes already read by the caller.
*
* Copies data from the page cache. If the data is not currently present,
* uses the readahead and readpage address_space operations to fetch it.
*
* Return: Total number of bytes copied, including those already read by
* the caller. If an error happens before any bytes are copied, returns
* a negative error number.
*/
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
ssize_t already_read)
{
struct file *filp = iocb->ki_filp;
struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct pagevec pvec;
int i, error = 0;
bool writably_mapped;
loff_t isize, end_offset;
if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
if (unlikely(!iov_iter_count(iter)))
return 0;
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
pagevec_init(&pvec);
do {
cond_resched();
/*
* If we've already successfully copied some data, then we
* can no longer safely return -EIOCBQUEUED. Hence mark
* an async read NOWAIT at that point.
*/
if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; error = filemap_get_pages(iocb, iter, &pvec);
if (error < 0)
break;
/*
* i_size must be checked after we know the pages are Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
isize = i_size_read(inode);
if (unlikely(iocb->ki_pos >= isize))
goto put_pages;
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
/*
* Once we start copying data, we don't want to be touching any
* cachelines that might be contended:
*/
writably_mapped = mapping_writably_mapped(mapping);
/*
* When a sequential read accesses a page several times, only
* mark it as accessed the first time.
*/
if (iocb->ki_pos >> PAGE_SHIFT !=
ra->prev_pos >> PAGE_SHIFT)
mark_page_accessed(pvec.pages[0]); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i];
size_t page_size = thp_size(page);
size_t offset = iocb->ki_pos & (page_size - 1);
size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
page_size - offset);
size_t copied;
if (end_offset < page_offset(page))
break;
if (i > 0) mark_page_accessed(page);
/*
* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (writably_mapped) {
int j;
for (j = 0; j < thp_nr_pages(page); j++)
flush_dcache_page(page + j);
}
copied = copy_page_to_iter(page, offset, bytes, iter);
already_read += copied;
iocb->ki_pos += copied;
ra->prev_pos = iocb->ki_pos;
if (copied < bytes) {
error = -EFAULT;
break;
}
}
put_pages:
for (i = 0; i < pagevec_count(&pvec); i++) put_page(pvec.pages[i]);
pagevec_reinit(&pvec);
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
file_accessed(filp);
return already_read ? already_read : error;
}
EXPORT_SYMBOL_GPL(filemap_read);
/**
* generic_file_read_iter - generic filesystem read routine
* @iocb: kernel I/O control block
* @iter: destination for the data read
*
* This is the "read_iter()" routine for all filesystems
* that can use the page cache directly.
*
* The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
* be returned when no data can be read without waiting for I/O requests
* to complete; it doesn't prevent readahead.
*
* The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
* requests shall be made for the read or for readahead. When no data
* can be read, -EAGAIN shall be returned. When readahead would be
* triggered, a partial, possibly empty read shall be returned.
*
* Return:
* * number of bytes copied, even for partial reads
* * negative error code (or 0 if IOCB_NOIO) if nothing was read
*/
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
size_t count = iov_iter_count(iter);
ssize_t retval = 0;
if (!count)
return 0; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) { struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
loff_t size;
size = i_size_read(inode);
if (iocb->ki_flags & IOCB_NOWAIT) {
if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
iocb->ki_pos + count - 1))
return -EAGAIN;
} else {
retval = filemap_write_and_wait_range(mapping,
iocb->ki_pos,
iocb->ki_pos + count - 1);
if (retval < 0)
return retval;
}
file_accessed(file);
retval = mapping->a_ops->direct_IO(iocb, iter);
if (retval >= 0) {
iocb->ki_pos += retval;
count -= retval;
}
if (retval != -EIOCBQUEUED) iov_iter_revert(iter, count - iov_iter_count(iter));
/*
* Btrfs can have a short DIO read if we encounter
* compressed extents, so if there was an error, or if
* we've already read everything we wanted to, or if
* there was a short read because we hit EOF, go ahead
* and return. Otherwise fallthrough to buffered io for
* the rest of the read. Buffered reads will not work for
* DAX files, so don't bother trying.
*/
if (retval < 0 || !count || iocb->ki_pos >= size ||
IS_DAX(inode))
return retval;
}
return filemap_read(iocb, iter, retval);
}
EXPORT_SYMBOL(generic_file_read_iter);
static inline loff_t page_seek_hole_data(struct xa_state *xas,
struct address_space *mapping, struct page *page,
loff_t start, loff_t end, bool seek_data)
{
const struct address_space_operations *ops = mapping->a_ops;
size_t offset, bsz = i_blocksize(mapping->host);
if (xa_is_value(page) || PageUptodate(page))
return seek_data ? start : end;
if (!ops->is_partially_uptodate)
return seek_data ? end : start;
xas_pause(xas);
rcu_read_unlock();
lock_page(page);
if (unlikely(page->mapping != mapping))
goto unlock;
offset = offset_in_thp(page, start) & ~(bsz - 1);
do {
if (ops->is_partially_uptodate(page, offset, bsz) == seek_data)
break;
start = (start + bsz) & ~(bsz - 1);
offset += bsz;
} while (offset < thp_size(page));
unlock:
unlock_page(page);
rcu_read_lock();
return start;
}
static inline
unsigned int seek_page_size(struct xa_state *xas, struct page *page)
{
if (xa_is_value(page))
return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
return thp_size(page);
}
/**
* mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
* @mapping: Address space to search.
* @start: First byte to consider.
* @end: Limit of search (exclusive).
* @whence: Either SEEK_HOLE or SEEK_DATA.
*
* If the page cache knows which blocks contain holes and which blocks
* contain data, your filesystem can use this function to implement
* SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are
* entirely memory-based such as tmpfs, and filesystems which support
* unwritten extents.
*
* Return: The requested offset on success, or -ENXIO if @whence specifies
* SEEK_DATA and there is no data after @start. There is an implicit hole
* after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
* and @end contain data.
*/
loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
loff_t end, int whence)
{
XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
pgoff_t max = (end - 1) >> PAGE_SHIFT;
bool seek_data = (whence == SEEK_DATA);
struct page *page;
if (end <= start)
return -ENXIO;
rcu_read_lock();
while ((page = find_get_entry(&xas, max, XA_PRESENT))) {
loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
unsigned int seek_size;
if (start < pos) {
if (!seek_data)
goto unlock;
start = pos;
}
seek_size = seek_page_size(&xas, page);
pos = round_up(pos + 1, seek_size);
start = page_seek_hole_data(&xas, mapping, page, start, pos,
seek_data);
if (start < pos)
goto unlock;
if (start >= end)
break;
if (seek_size > PAGE_SIZE)
xas_set(&xas, pos >> PAGE_SHIFT);
if (!xa_is_value(page))
put_page(page);
}
if (seek_data)
start = -ENXIO;
unlock:
rcu_read_unlock();
if (page && !xa_is_value(page))
put_page(page);
if (start > end)
return end;
return start;
}
#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS (100)
/*
* lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
* @vmf - the vm_fault for this fault.
* @page - the page to lock.
* @fpin - the pointer to the file we may pin (or is already pinned).
*
* This works similar to lock_page_or_retry in that it can drop the mmap_lock.
* It differs in that it actually returns the page locked if it returns 1 and 0
* if it couldn't lock the page. If we did have to drop the mmap_lock then fpin
* will point to the pinned file and needs to be fput()'ed at a later point.
*/
static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
struct file **fpin)
{
if (trylock_page(page))
return 1;
/*
* NOTE! This will make us return with VM_FAULT_RETRY, but with
* the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
* is supposed to work. We have way too many special cases..
*/
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
return 0;
*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
if (vmf->flags & FAULT_FLAG_KILLABLE) {
if (__lock_page_killable(page)) {
/*
* We didn't have the right flags to drop the mmap_lock,
* but all fault_handlers only check for fatal signals
* if we return VM_FAULT_RETRY, so we need to drop the
* mmap_lock here and return 0 if we don't have a fpin.
*/
if (*fpin == NULL)
mmap_read_unlock(vmf->vma->vm_mm);
return 0;
}
} else
__lock_page(page);
return 1;
}
/*
* Synchronous readahead happens when we don't even find a page in the page
* cache at all. We don't want to perform IO under the mmap sem, so if we have
* to drop the mmap sem we return the file that was pinned in order for us to do
* that. If we didn't pin a file then we return NULL. The file that is
* returned needs to be fput()'ed when we're done with it.
*/
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
struct file *fpin = NULL;
unsigned int mmap_miss;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
return fpin;
if (!ra->ra_pages)
return fpin;
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_sync_ra(&ractl, ra->ra_pages);
return fpin;
}
/* Avoid banging the cache line if not needed */
mmap_miss = READ_ONCE(ra->mmap_miss);
if (mmap_miss < MMAP_LOTSAMISS * 10)
WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
/*
* Do we miss much more than hit in this file? If so,
* stop bothering with read-ahead. It will only hurt.
*/
if (mmap_miss > MMAP_LOTSAMISS)
return fpin;
/*
* mmap read-around
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
ractl._index = ra->start;
do_page_cache_ra(&ractl, ra->size, ra->async_size);
return fpin;
}
/*
* Asynchronous readahead happens when we find the page and PG_readahead,
* so we want to possibly extend the readahead further. We return the file that
* was pinned if we have to drop the mmap_lock in order to do IO.
*/
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct page *page)
{
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
struct file *fpin = NULL;
unsigned int mmap_miss;
pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
mmap_miss = READ_ONCE(ra->mmap_miss);
if (mmap_miss)
WRITE_ONCE(ra->mmap_miss, --mmap_miss);
if (PageReadahead(page)) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_async_readahead(mapping, ra, file,
page, offset, ra->ra_pages);
}
return fpin;
}
/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
*
* filemap_fault() is invoked via the vma operations vector for a
* mapped memory region to read in file data during a page fault.
*
* The goto's are kind of ugly, but this streamlines the normal case of having
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*
* vma->vm_mm->mmap_lock must be held on entry.
*
* If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
* may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
*
* If our return value does not have VM_FAULT_RETRY set, the mmap_lock
* has not been released.
*
* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
*
* Return: bitwise-OR of %VM_FAULT_ codes.
*/
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
int error;
struct file *file = vmf->vma->vm_file;
struct file *fpin = NULL;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
pgoff_t max_off;
struct page *page;
vm_fault_t ret = 0;
bool mapping_locked = false;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off))
return VM_FAULT_SIGBUS;
/*
* Do we have something in the page cache already?
*/
page = find_get_page(mapping, offset);
if (likely(page)) {
/*
* We found the page, so try async readahead before waiting for
* the lock.
*/
if (!(vmf->flags & FAULT_FLAG_TRIED))
fpin = do_async_mmap_readahead(vmf, page);
if (unlikely(!PageUptodate(page))) {
filemap_invalidate_lock_shared(mapping);
mapping_locked = true;
}
} else {
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
fpin = do_sync_mmap_readahead(vmf);
retry_find:
/*
* See comment in filemap_create_page() why we need
* invalidate_lock
*/
if (!mapping_locked) {
filemap_invalidate_lock_shared(mapping);
mapping_locked = true;
}
page = pagecache_get_page(mapping, offset,
FGP_CREAT|FGP_FOR_MMAP,
vmf->gfp_mask);
if (!page) {
if (fpin)
goto out_retry;
filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_OOM;
}
}
if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
goto out_retry;
/* Did it get truncated? */
if (unlikely(compound_head(page)->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto retry_find;
}
VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
*/
if (unlikely(!PageUptodate(page))) {
/*
* The page was in cache and uptodate and now it is not.
* Strange but possible since we didn't hold the page lock all
* the time. Let's drop everything get the invalidate lock and
* try again.
*/
if (!mapping_locked) {
unlock_page(page);
put_page(page);
goto retry_find;
}
goto page_not_uptodate;
}
/*
* We've made it this far and we had to drop our mmap_lock, now is the
* time to return to the upper layer and have it re-find the vma and
* redo the fault.
*/
if (fpin) {
unlock_page(page);
goto out_retry;
}
if (mapping_locked)
filemap_invalidate_unlock_shared(mapping);
/*
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off)) {
unlock_page(page);
put_page(page);
return VM_FAULT_SIGBUS;
}
vmf->page = page;
return ret | VM_FAULT_LOCKED;
page_not_uptodate:
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
* because there really aren't any performance issues here
* and we need to check for errors.
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
error = filemap_read_page(file, mapping, page);
if (fpin)
goto out_retry;
put_page(page);
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
filemap_invalidate_unlock_shared(mapping);
return VM_FAULT_SIGBUS;
out_retry:
/*
* We dropped the mmap_lock, we need to return to the fault handler to
* re-find the vma and come back and find our hopefully still populated
* page.
*/
if (page)
put_page(page);
if (mapping_locked)
filemap_invalidate_unlock_shared(mapping);
if (fpin)
fput(fpin);
return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);
static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
{
struct mm_struct *mm = vmf->vma->vm_mm;
/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*vmf->pmd)) {
unlock_page(page);
put_page(page);
return true;
}
if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
vm_fault_t ret = do_set_pmd(vmf, page);
if (!ret) {
/* The page is mapped successfully, reference consumed. */
unlock_page(page);
return true;
}
}
if (pmd_none(*vmf->pmd)) {
vmf->ptl = pmd_lock(mm, vmf->pmd);
if (likely(pmd_none(*vmf->pmd))) {
mm_inc_nr_ptes(mm);
pmd_populate(mm, vmf->pmd, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
spin_unlock(vmf->ptl);
}
/* See comment in handle_pte_fault() */
if (pmd_devmap_trans_unstable(vmf->pmd)) {
unlock_page(page);
put_page(page);
return true;
}
return false;
}
static struct page *next_uptodate_page(struct page *page,
struct address_space *mapping,
struct xa_state *xas, pgoff_t end_pgoff)
{
unsigned long max_idx;
do {
if (!page) return NULL;
if (xas_retry(xas, page))
continue;
if (xa_is_value(page))
continue;
if (PageLocked(page))
continue;
if (!page_cache_get_speculative(page))
continue;
/* Has the page moved or been split? */
if (unlikely(page != xas_reload(xas)))
goto skip;
if (!PageUptodate(page) || PageReadahead(page))
goto skip;
if (PageHWPoison(page))
goto skip;
if (!trylock_page(page))
goto skip;
if (page->mapping != mapping)
goto unlock;
if (!PageUptodate(page))
goto unlock;
max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
if (xas->xa_index >= max_idx)
goto unlock;
return page;
unlock:
unlock_page(page);
skip:
put_page(page);
} while ((page = xas_next_entry(xas, end_pgoff)) != NULL);
return NULL;
}
static inline struct page *first_map_page(struct address_space *mapping,
struct xa_state *xas,
pgoff_t end_pgoff)
{
return next_uptodate_page(xas_find(xas, end_pgoff),
mapping, xas, end_pgoff);
}
static inline struct page *next_map_page(struct address_space *mapping,
struct xa_state *xas,
pgoff_t end_pgoff)
{
return next_uptodate_page(xas_next_entry(xas, end_pgoff),
mapping, xas, end_pgoff);
}
vm_fault_t filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct vm_area_struct *vma = vmf->vma;
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
unsigned long addr;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct page *head, *page;
unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
vm_fault_t ret = 0;
rcu_read_lock();
head = first_map_page(mapping, &xas, end_pgoff);
if (!head)
goto out;
if (filemap_map_pmd(vmf, head)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
do {
page = find_subpage(head, xas.xa_index);
if (PageHWPoison(page))
goto unlock;
if (mmap_miss > 0)
mmap_miss--; addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
vmf->pte += xas.xa_index - last_pgoff;
last_pgoff = xas.xa_index;
if (!pte_none(*vmf->pte))
goto unlock;
/* We're about to handle the fault */
if (vmf->address == addr)
ret = VM_FAULT_NOPAGE;
do_set_pte(vmf, page, addr);
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, addr, vmf->pte);
unlock_page(head);
continue;
unlock:
unlock_page(head);
put_page(head);
} while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL);
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
rcu_read_unlock();
WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
return ret;
}
EXPORT_SYMBOL(filemap_map_pages);
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
struct page *page = vmf->page;
vm_fault_t ret = VM_FAULT_LOCKED;
sb_start_pagefault(mapping->host->i_sb);
file_update_time(vmf->vma->vm_file);
lock_page(page);
if (page->mapping != mapping) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
goto out;
}
/*
* We mark the page dirty already here so that when freeze is in
* progress, we are guaranteed that writeback during freezing will
* see the dirty page and writeprotect it again.
*/
set_page_dirty(page);
wait_for_stable_page(page);
out:
sb_end_pagefault(mapping->host->i_sb);
return ret;
}
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = filemap_page_mkwrite,
};
/* This is used for a general mmap of a disk file */
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct address_space *mapping = file->f_mapping;
if (!mapping->a_ops->readpage)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &generic_file_vm_ops;
return 0;
}
/*
* This is for filesystems which do not implement ->writepage.
*/
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
return -EINVAL;
return generic_file_mmap(file, vma);
}
#else
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
return VM_FAULT_SIGBUS;
}
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
return -ENOSYS;
}
#endif /* CONFIG_MMU */
EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
static struct page *wait_on_page_read(struct page *page)
{
if (!IS_ERR(page)) {
wait_on_page_locked(page);
if (!PageUptodate(page)) {
put_page(page);
page = ERR_PTR(-EIO);
}
}
return page;
}
static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
void *data,
gfp_t gfp)
{
struct page *page;
int err;
repeat:
page = find_get_page(mapping, index);
if (!page) {
page = __page_cache_alloc(gfp);
if (!page)
return ERR_PTR(-ENOMEM);
err = add_to_page_cache_lru(page, mapping, index, gfp);
if (unlikely(err)) {
put_page(page);
if (err == -EEXIST)
goto repeat;
/* Presumably ENOMEM for xarray node */
return ERR_PTR(err);
}
filler:
if (filler) err = filler(data, page);
else
err = mapping->a_ops->readpage(data, page); if (err < 0) {
put_page(page);
return ERR_PTR(err);
}
page = wait_on_page_read(page);
if (IS_ERR(page))
return page;
goto out;
}
if (PageUptodate(page))
goto out;
/*
* Page is not up to date and may be locked due to one of the following
* case a: Page is being filled and the page lock is held
* case b: Read/write error clearing the page uptodate status
* case c: Truncation in progress (page locked)
* case d: Reclaim in progress
*
* Case a, the page will be up to date when the page is unlocked.
* There is no need to serialise on the page lock here as the page
* is pinned so the lock gives no additional protection. Even if the
* page is truncated, the data is still valid if PageUptodate as
* it's a race vs truncate race.
* Case b, the page will not be up to date
* Case c, the page may be truncated but in itself, the data may still
* be valid after IO completes as it's a read vs truncate race. The
* operation must restart if the page is not uptodate on unlock but
* otherwise serialising on page lock to stabilise the mapping gives
* no additional guarantees to the caller as the page lock is
* released before return.
* Case d, similar to truncation. If reclaim holds the page lock, it
* will be a race with remove_mapping that determines if the mapping
* is valid on unlock but otherwise the data is valid and there is
* no need to serialise with page lock.
*
* As the page lock gives no additional guarantee, we optimistically
* wait on the page to be unlocked and check if it's up to date and
* use the page if it is. Otherwise, the page lock is required to
* distinguish between the different cases. The motivation is that we
* avoid spurious serialisations and wakeups when multiple processes
* wait on the same page for IO to complete.
*/
wait_on_page_locked(page);
if (PageUptodate(page))
goto out;
/* Distinguish between all the cases under the safety of the lock */
lock_page(page);
/* Case c or d, restart the operation */
if (!page->mapping) { unlock_page(page);
put_page(page);
goto repeat;
}
/* Someone else locked and filled the page in a very small window */
if (PageUptodate(page)) {
unlock_page(page);
goto out;
}
/*
* A previous I/O error may have been due to temporary
* failures.
* Clear page error before actual read, PG_error will be
* set again if read page fails.
*/
ClearPageError(page);
goto filler;
out:
mark_page_accessed(page); return page;
}
/**
* read_cache_page - read into page cache, fill it if needed
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
* @data: first arg to filler(data, page) function, often left as NULL
*
* Read into the page cache. If a page already exists, and PageUptodate() is
* not set, try to fill the page and wait for it to become unlocked.
*
* If the page does not get brought uptodate, return -EIO.
*
* The function expects mapping->invalidate_lock to be already held.
*
* Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
void *data)
{
return do_read_cache_page(mapping, index, filler, data,
mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);
/**
* read_cache_page_gfp - read into page cache, using specified page allocation flags.
* @mapping: the page's address_space
* @index: the page index
* @gfp: the page allocator flags to use if allocating
*
* This is the same as "read_mapping_page(mapping, index, NULL)", but with
* any new page allocations done using the specified allocation flags.
*
* If the page does not get brought uptodate, return -EIO.
*
* The function expects mapping->invalidate_lock to be already held.
*
* Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page_gfp(struct address_space *mapping,
pgoff_t index,
gfp_t gfp)
{
return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);
int pagecache_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
const struct address_space_operations *aops = mapping->a_ops;
return aops->write_begin(file, mapping, pos, len, flags,
pagep, fsdata);
}
EXPORT_SYMBOL(pagecache_write_begin);
int pagecache_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
const struct address_space_operations *aops = mapping->a_ops;
return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
}
EXPORT_SYMBOL(pagecache_write_end);
/*
* Warn about a page cache invalidation failure during a direct I/O write.
*/
void dio_warn_stale_pagecache(struct file *filp)
{
static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
char pathname[128];
char *path;
errseq_set(&filp->f_mapping->wb_err, -EIO);
if (__ratelimit(&_rs)) {
path = file_path(filp, pathname, sizeof(pathname));
if (IS_ERR(path))
path = "(unknown)";
pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
current->comm);
}
}
ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
loff_t pos = iocb->ki_pos;
ssize_t written;
size_t write_len;
pgoff_t end;
write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_SHIFT;
if (iocb->ki_flags & IOCB_NOWAIT) {
/* If there are pages to writeback, return */
if (filemap_range_has_page(file->f_mapping, pos,
pos + write_len - 1))
return -EAGAIN;
} else {
written = filemap_write_and_wait_range(mapping, pos,
pos + write_len - 1);
if (written)
goto out;
}
/*
* After a write we want buffered reads to be sure to go to disk to get
* the new data. We invalidate clean cached page from the region we're
* about to write. We do this *before* the write so that we can return
* without clobbering -EIOCBQUEUED from ->direct_IO().
*/
written = invalidate_inode_pages2_range(mapping,
pos >> PAGE_SHIFT, end);
/*
* If a page can not be invalidated, return 0 to fall back
* to buffered write.
*/
if (written) {
if (written == -EBUSY)
return 0;
goto out;
}
written = mapping->a_ops->direct_IO(iocb, from);
/*
* Finally, try again to invalidate clean pages which might have been
* cached by non-direct readahead, or faulted in by get_user_pages()
* if the source of the write was an mmap'ed region of the file
* we're writing. Either one is a pretty crazy thing to do,
* so we don't support it 100%. If this invalidation
* fails, tough, the write still worked...
*
* Most of the time we do not need this since dio_complete() will do
* the invalidation for us. However there are some file systems that
* do not end up with dio_complete() being called, so let's not break
* them by removing it completely.
*
* Noticeable example is a blkdev_direct_IO().
*
* Skip invalidation for async writes or if mapping has no pages.
*/
if (written > 0 && mapping->nrpages && invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) dio_warn_stale_pagecache(file);
if (written > 0) {
pos += written;
write_len -= written;
if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
iocb->ki_pos = pos;
}
if (written != -EIOCBQUEUED) iov_iter_revert(from, write_len - iov_iter_count(from));
out:
return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
/*
* Find or create a page at the given pagecache position. Return the locked
* page. This function is specifically for buffered writes.
*/
struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags)
{
struct page *page;
int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
if (flags & AOP_FLAG_NOFS)
fgp_flags |= FGP_NOFS;
page = pagecache_get_page(mapping, index, fgp_flags,
mapping_gfp_mask(mapping));
if (page)
wait_for_stable_page(page); return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
{
struct address_space *mapping = file->f_mapping;
const struct address_space_operations *a_ops = mapping->a_ops;
long status = 0;
ssize_t written = 0;
unsigned int flags = 0;
do {
struct page *page;
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
void *fsdata;
offset = (pos & (PAGE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i));
again:
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*/
if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
status = -EFAULT;
break;
}
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
}
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
if (unlikely(status < 0))
break;
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
copied = copy_page_from_iter_atomic(page, offset, bytes, i);
flush_dcache_page(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
if (unlikely(status != copied)) {
iov_iter_revert(i, copied - max(status, 0L));
if (unlikely(status < 0))
break;
}
cond_resched();
if (unlikely(status == 0)) {
/*
* A short copy made ->write_end() reject the
* thing entirely. Might be memory poisoning
* halfway through, might be a race with munmap,
* might be severe memory pressure.
*/
if (copied)
bytes = copied;
goto again;
}
pos += status;
written += status;
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
return written ? written : status;
}
EXPORT_SYMBOL(generic_perform_write);
/**
* __generic_file_write_iter - write data to a file
* @iocb: IO state structure (file, offset, etc.)
* @from: iov_iter with data to write
*
* This function does all the work needed for actually writing data to a
* file. It does all basic checks, removes SUID from the file, updates
* modification times and calls proper subroutines depending on whether we
* do direct IO or a standard buffered write.
*
* It expects i_rwsem to be grabbed unless we work on a block device or similar
* object which does not need locking at all.
*
* This function does *not* take care of syncing data in case of O_SYNC write.
* A caller has to handle it. This is mainly due to the fact that we want to
* avoid syncing under i_rwsem.
*
* Return:
* * number of bytes written, even for truncated writes
* * negative error code if no data has been written at all
*/
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t written = 0;
ssize_t err;
ssize_t status;
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
err = file_remove_privs(file);
if (err)
goto out;
err = file_update_time(file);
if (err)
goto out;
if (iocb->ki_flags & IOCB_DIRECT) {
loff_t pos, endbyte;
written = generic_file_direct_write(iocb, from);
/*
* If the write stopped short of completing, fall back to
* buffered writes. Some filesystems do this for writes to
* holes, for example. For DAX files, a buffered write will
* not succeed (even if it did, DAX does not handle dirty
* page-cache pages correctly).
*/
if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
goto out;
status = generic_perform_write(file, from, pos = iocb->ki_pos);
/*
* If generic_perform_write() returned a synchronous error
* then we want to return the number of bytes which were
* direct-written, or the error code if that was zero. Note
* that this differs from normal direct-io semantics, which
* will return -EFOO even if some bytes were written.
*/
if (unlikely(status < 0)) {
err = status;
goto out;
}
/*
* We need to ensure that the page cache pages are written to
* disk and invalidated to preserve the expected O_DIRECT
* semantics.
*/
endbyte = pos + status - 1;
err = filemap_write_and_wait_range(mapping, pos, endbyte);
if (err == 0) {
iocb->ki_pos = endbyte + 1;
written += status;
invalidate_mapping_pages(mapping,
pos >> PAGE_SHIFT,
endbyte >> PAGE_SHIFT);
} else {
/*
* We don't know how much we wrote, so just return
* the number of bytes which were direct-written
*/
}
} else {
written = generic_perform_write(file, from, iocb->ki_pos);
if (likely(written > 0))
iocb->ki_pos += written;
}
out:
current->backing_dev_info = NULL;
return written ? written : err;
}
EXPORT_SYMBOL(__generic_file_write_iter);
/**
* generic_file_write_iter - write data to a file
* @iocb: IO state structure
* @from: iov_iter with data to write
*
* This is a wrapper around __generic_file_write_iter() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
* and acquires i_rwsem as needed.
* Return:
* * negative error code if no data has been written at all of
* vfs_fsync_range() failed for a synchronous write
* * number of bytes written, even for truncated writes
*/
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
inode_lock(inode);
ret = generic_write_checks(iocb, from);
if (ret > 0)
ret = __generic_file_write_iter(iocb, from);
inode_unlock(inode);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
return ret;
}
EXPORT_SYMBOL(generic_file_write_iter);
/**
* try_to_release_page() - release old fs-specific metadata on a page
*
* @page: the page which the kernel is trying to free
* @gfp_mask: memory allocation flags (and I/O mode)
*
* The address_space is to try to release any data against the page
* (presumably at page->private).
*
* This may also be called if PG_fscache is set on a page, indicating that the
* page is known to the local caching routines.
*
* The @gfp_mask argument specifies whether I/O may be performed to release
* this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
*
* Return: %1 if the release was successful, otherwise return zero.
*/
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
struct address_space * const mapping = page->mapping; BUG_ON(!PageLocked(page));
if (PageWriteback(page))
return 0;
if (mapping && mapping->a_ops->releasepage) return mapping->a_ops->releasepage(page, gfp_mask); return try_to_free_buffers(page);
}
EXPORT_SYMBOL(try_to_release_page);
// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Implementation of the Transmission Control Protocol(TCP).
*
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
* Corey Minyard <wf-rch!minyard@relay.EU.net>
* Florian La Roche, <flla@stud.uni-sb.de>
* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
* Linus Torvalds, <torvalds@cs.helsinki.fi>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
* Matthew Dillon, <dillon@apollo.west.oic.com>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
* Jorge Cwik, <jorge@laser.satlink.net>
*/
#include <linux/module.h>
#include <linux/gfp.h>
#include <net/tcp.h>
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
u32 elapsed, start_ts;
s32 remaining;
start_ts = tcp_sk(sk)->retrans_stamp;
if (!icsk->icsk_user_timeout)
return icsk->icsk_rto;
elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
remaining = icsk->icsk_user_timeout - elapsed;
if (remaining <= 0)
return 1; /* user timeout has passed; fire ASAP */
return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining));
}
u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
{
struct inet_connection_sock *icsk = inet_csk(sk);
u32 remaining;
s32 elapsed;
if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp)
return when;
elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
if (unlikely(elapsed < 0))
elapsed = 0;
remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed;
remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);
return min_t(u32, remaining, when);
}
/**
* tcp_write_err() - close socket and save error info
* @sk: The socket the error has appeared on.
*
* Returns: Nothing (void)
*/
static void tcp_write_err(struct sock *sk)
{
sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
sk_error_report(sk);
tcp_write_queue_purge(sk);
tcp_done(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
}
/**
* tcp_out_of_resources() - Close socket if out of resources
* @sk: pointer to current socket
* @do_reset: send a last packet with reset flag
*
* Do not allow orphaned sockets to eat all our resources.
* This is direct violation of TCP specs, but it is required
* to prevent DoS attacks. It is called when a retransmission timeout
* or zero probe timeout occurs on orphaned socket.
*
* Also close if our net namespace is exiting; in that case there is no
* hope of ever communicating again since all netns interfaces are already
* down (or about to be down), and we need to release our dst references,
* which have been moved to the netns loopback interface, so the namespace
* can finish exiting. This condition is only possible if we are a kernel
* socket, as those do not hold references to the namespace.
*
* Criteria is still not confirmed experimentally and may change.
* We kill the socket, if:
* 1. If number of orphaned sockets exceeds an administratively configured
* limit.
* 2. If we have strong memory pressure.
* 3. If our net namespace is exiting.
*/
static int tcp_out_of_resources(struct sock *sk, bool do_reset)
{
struct tcp_sock *tp = tcp_sk(sk);
int shift = 0;
/* If peer does not open window for long time, or did not transmit
* anything for long time, penalize it. */
if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
shift++;
/* If some dubious ICMP arrived, penalize even more. */
if (sk->sk_err_soft)
shift++;
if (tcp_check_oom(sk, shift)) {
/* Catch exceptional cases, when connection requires reset.
* 1. Last segment was sent recently. */
if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
/* 2. Window is closed. */
(!tp->snd_wnd && !tp->packets_out))
do_reset = true;
if (do_reset)
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_done(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
return 1;
}
if (!check_net(sock_net(sk))) {
/* Not possible to send reset; just close */
tcp_done(sk);
return 1;
}
return 0;
}
/**
* tcp_orphan_retries() - Returns maximal number of retries on an orphaned socket
* @sk: Pointer to the current socket.
* @alive: bool, socket alive state
*/
static int tcp_orphan_retries(struct sock *sk, bool alive)
{
int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */
/* We know from an ICMP that something is wrong. */
if (sk->sk_err_soft && !alive)
retries = 0;
/* However, if socket sent something recently, select some safe
* number of retries. 8 corresponds to >100 seconds with minimal
* RTO of 200msec. */
if (retries == 0 && alive)
retries = 8;
return retries;
}
static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
{
const struct net *net = sock_net(sk);
int mss;
/* Black hole detection */
if (!net->ipv4.sysctl_tcp_mtu_probing)
return;
if (!icsk->icsk_mtup.enabled) {
icsk->icsk_mtup.enabled = 1;
icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
} else {
mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor);
mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
}
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
static unsigned int tcp_model_timeout(struct sock *sk,
unsigned int boundary,
unsigned int rto_base)
{
unsigned int linear_backoff_thresh, timeout;
linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base);
if (boundary <= linear_backoff_thresh)
timeout = ((2 << boundary) - 1) * rto_base;
else
timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
return jiffies_to_msecs(timeout);
}
/**
* retransmits_timed_out() - returns true if this connection has timed out
* @sk: The current socket
* @boundary: max number of retransmissions
* @timeout: A custom timeout value.
* If set to 0 the default timeout is calculated and used.
* Using TCP_RTO_MIN and the number of unsuccessful retransmits.
*
* The default "timeout" value this function can calculate and use
* is equivalent to the timeout of a TCP Connection
* after "boundary" unsuccessful, exponentially backed-off
* retransmissions with an initial RTO of TCP_RTO_MIN.
*/
static bool retransmits_timed_out(struct sock *sk,
unsigned int boundary,
unsigned int timeout)
{
unsigned int start_ts;
if (!inet_csk(sk)->icsk_retransmits)
return false;
start_ts = tcp_sk(sk)->retrans_stamp;
if (likely(timeout == 0)) {
unsigned int rto_base = TCP_RTO_MIN;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
rto_base = tcp_timeout_init(sk);
timeout = tcp_model_timeout(sk, boundary, rto_base);
}
return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0;
}
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
bool expired = false, do_reset;
int retry_until;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
__dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
expired = icsk->icsk_retransmits >= retry_until;
} else {
if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
__dst_negative_advice(sk);
}
retry_until = net->ipv4.sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
!retransmits_timed_out(sk, retry_until, 0);
if (tcp_out_of_resources(sk, do_reset))
return 1;
}
}
if (!expired)
expired = retransmits_timed_out(sk, retry_until,
icsk->icsk_user_timeout);
tcp_fastopen_active_detect_blackhole(sk, expired);
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
icsk->icsk_retransmits,
icsk->icsk_rto, (int)expired);
if (expired) {
/* Has it gone just too far? */
tcp_write_err(sk);
return 1;
}
if (sk_rethink_txhash(sk)) {
tp->timeout_rehash++;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
}
return 0;
}
/* Called with BH disabled */
void tcp_delack_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
sk_mem_reclaim_partial(sk);
if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
!(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
goto out;
if (time_after(icsk->icsk_ack.timeout, jiffies)) {
sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
goto out;
}
icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
if (inet_csk_ack_scheduled(sk)) {
if (!inet_csk_in_pingpong_mode(sk)) {
/* Delayed ACK missed: inflate ATO. */
icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
} else {
/* Delayed ACK missed: leave pingpong mode and
* deflate ATO.
*/
inet_csk_exit_pingpong_mode(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
tcp_mstamp_refresh(tcp_sk(sk));
tcp_send_ack(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
out:
if (tcp_under_memory_pressure(sk))
sk_mem_reclaim(sk);
}
/**
* tcp_delack_timer() - The TCP delayed ACK timeout handler
* @t: Pointer to the timer. (gets casted to struct sock *)
*
* This function gets (indirectly) called when the kernel timer for a TCP packet
* of this socket expires. Calls tcp_delack_timer_handler() to do the actual work.
*
* Returns: Nothing (void)
*/
static void tcp_delack_timer(struct timer_list *t)
{
struct inet_connection_sock *icsk =
from_timer(icsk, t, icsk_delack_timer);
struct sock *sk = &icsk->icsk_inet.sk;
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
tcp_delack_timer_handler(sk);
} else {
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
sock_put(sk);
}
static void tcp_probe_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb = tcp_send_head(sk);
struct tcp_sock *tp = tcp_sk(sk);
int max_probes;
if (tp->packets_out || !skb) {
icsk->icsk_probes_out = 0;
icsk->icsk_probes_tstamp = 0;
return;
}
/* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
* long as the receiver continues to respond probes. We support this by
* default and reset icsk_probes_out with incoming ACKs. But if the
* socket is orphaned or the user specifies TCP_USER_TIMEOUT, we
* kill the socket when the retry count and the time exceeds the
* corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer().
*/
if (!icsk->icsk_probes_tstamp)
icsk->icsk_probes_tstamp = tcp_jiffies32;
else if (icsk->icsk_user_timeout &&
(s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
msecs_to_jiffies(icsk->icsk_user_timeout))
goto abort;
max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
max_probes = tcp_orphan_retries(sk, alive);
if (!alive && icsk->icsk_backoff >= max_probes)
goto abort;
if (tcp_out_of_resources(sk, true))
return;
}
if (icsk->icsk_probes_out >= max_probes) {
abort: tcp_write_err(sk);
} else {
/* Only send another probe if we didn't close things up. */
tcp_send_probe0(sk);
}
}
/*
* Timer for Fast Open socket to retransmit SYNACK. Note that the
* sk here is the child socket, not the parent (listener) socket.
*/
static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int max_retries = icsk->icsk_syn_retries ? :
sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
struct tcp_sock *tp = tcp_sk(sk);
req->rsk_ops->syn_ack_timeout(req);
if (req->num_timeout >= max_retries) {
tcp_write_err(sk);
return;
}
/* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */
if (icsk->icsk_retransmits == 1)
tcp_enter_loss(sk);
/* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
* returned from rtx_syn_ack() to make it more persistent like
* regular retransmit because if the child socket has been accepted
* it's not good to give up too easily.
*/
inet_rtx_syn_ack(sk, req);
req->num_timeout++;
icsk->icsk_retransmits++;
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_time_stamp(tp);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
}
/**
* tcp_retransmit_timer() - The TCP retransmit timeout handler
* @sk: Pointer to the current socket.
*
* This function gets called when the kernel timer for a TCP packet
* of this socket expires.
*
* It handles retransmission, timer adjustment and other necessary measures.
*
* Returns: Nothing (void)
*/
void tcp_retransmit_timer(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock *req;
struct sk_buff *skb;
req = rcu_dereference_protected(tp->fastopen_rsk,
lockdep_sock_is_held(sk));
if (req) {
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
tcp_fastopen_synack_timer(sk, req);
/* Before we receive ACK to our SYN-ACK don't retransmit
* anything else (e.g., data or FIN segments).
*/
return;
}
if (!tp->packets_out)
return;
skb = tcp_rtx_queue_head(sk);
if (WARN_ON_ONCE(!skb))
return;
tp->tlp_high_seq = 0;
if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
!((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
/* Receiver dastardly shrinks window. Our retransmits
* become zero probes, but we should not timeout this
* connection. If the socket is an orphan, time it out,
* we cannot allow such beasts to hang infinitely.
*/
struct inet_sock *inet = inet_sk(sk);
if (sk->sk_family == AF_INET) {
net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
&inet->inet_daddr,
ntohs(inet->inet_dport),
inet->inet_num,
tp->snd_una, tp->snd_nxt);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
&sk->sk_v6_daddr,
ntohs(inet->inet_dport),
inet->inet_num,
tp->snd_una, tp->snd_nxt);
}
#endif
if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) {
tcp_write_err(sk);
goto out;
}
tcp_enter_loss(sk);
tcp_retransmit_skb(sk, skb, 1);
__sk_dst_reset(sk);
goto out_reset_timer;
}
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
if (tcp_write_timeout(sk))
goto out;
if (icsk->icsk_retransmits == 0) {
int mib_idx = 0;
if (icsk->icsk_ca_state == TCP_CA_Recovery) {
if (tcp_is_sack(tp))
mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
else
mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
mib_idx = LINUX_MIB_TCPLOSSFAILURES;
} else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
tp->sacked_out) {
if (tcp_is_sack(tp))
mib_idx = LINUX_MIB_TCPSACKFAILURES;
else
mib_idx = LINUX_MIB_TCPRENOFAILURES;
}
if (mib_idx)
__NET_INC_STATS(sock_net(sk), mib_idx);
}
tcp_enter_loss(sk);
icsk->icsk_retransmits++;
if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
/* Retransmission failed because of local congestion,
* Let senders fight for local resources conservatively.
*/
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
TCP_RESOURCE_PROBE_INTERVAL,
TCP_RTO_MAX);
goto out;
}
/* Increase the timeout each time we retransmit. Note that
* we do not increase the rtt estimate. rto is initialized
* from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
* that doubling rto each time is the least we can get away with.
* In KA9Q, Karn uses this for the first few times, and then
* goes to quadratic. netBSD doubles, but only goes up to *64,
* and clamps at 1 to 64 sec afterwards. Note that 120 sec is
* defined in the protocol as the maximum possible RTT. I guess
* we'll have to use something other than TCP to talk to the
* University of Mars.
*
* PAWS allows us longer timeouts and large windows, so once
* implemented ftp to mars will work nicely. We will have to fix
* the 120 second clamps though!
*/
icsk->icsk_backoff++;
out_reset_timer:
/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
* used to reset timer, set to 0. Recalculate 'icsk_rto' as this
* might be increased if the stream oscillates between thin and thick,
* thus the old value might already be too high compared to the value
* set by 'tcp_set_rto' in tcp_input.c which resets the rto without
* backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
* exponential backoff behaviour to avoid continue hammering
* linear-timeout retransmissions into a black hole
*/
if (sk->sk_state == TCP_ESTABLISHED &&
(tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) &&
tcp_stream_is_thin(tp) &&
icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
icsk->icsk_backoff = 0;
icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
} else {
/* Use normal (exponential) backoff */
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
__sk_dst_reset(sk);
out:;
}
/* Called with bottom-half processing disabled.
Called by tcp_write_timer() */
void tcp_write_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int event;
if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
!icsk->icsk_pending)
goto out;
if (time_after(icsk->icsk_timeout, jiffies)) {
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
goto out;
}
tcp_mstamp_refresh(tcp_sk(sk));
event = icsk->icsk_pending;
switch (event) {
case ICSK_TIME_REO_TIMEOUT:
tcp_rack_reo_timeout(sk);
break;
case ICSK_TIME_LOSS_PROBE:
tcp_send_loss_probe(sk);
break;
case ICSK_TIME_RETRANS:
icsk->icsk_pending = 0;
tcp_retransmit_timer(sk);
break;
case ICSK_TIME_PROBE0:
icsk->icsk_pending = 0;
tcp_probe_timer(sk);
break;
}
out:
sk_mem_reclaim(sk);
}
static void tcp_write_timer(struct timer_list *t)
{
struct inet_connection_sock *icsk =
from_timer(icsk, t, icsk_retransmit_timer);
struct sock *sk = &icsk->icsk_inet.sk;
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
tcp_write_timer_handler(sk);
} else {
/* delegate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
sock_put(sk);
}
void tcp_syn_ack_timeout(const struct request_sock *req)
{
struct net *net = read_pnet(&inet_rsk(req)->ireq_net);
__NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS);
}
EXPORT_SYMBOL(tcp_syn_ack_timeout);
void tcp_set_keepalive(struct sock *sk, int val)
{
if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
return;
if (val && !sock_flag(sk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
else if (!val)
inet_csk_delete_keepalive_timer(sk);
}
EXPORT_SYMBOL_GPL(tcp_set_keepalive);
static void tcp_keepalive_timer (struct timer_list *t)
{
struct sock *sk = from_timer(sk, t, sk_timer);
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
u32 elapsed;
/* Only process if socket is not in use. */
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
inet_csk_reset_keepalive_timer (sk, HZ/20);
goto out;
}
if (sk->sk_state == TCP_LISTEN) {
pr_err("Hmm... keepalive on a LISTEN ???\n");
goto out;
}
tcp_mstamp_refresh(tp);
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
if (tp->linger2 >= 0) {
const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
if (tmo > 0) {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
goto out;
}
}
tcp_send_active_reset(sk, GFP_ATOMIC);
goto death;
}
if (!sock_flag(sk, SOCK_KEEPOPEN) ||
((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)))
goto out;
elapsed = keepalive_time_when(tp);
/* It is alive without keepalive 8) */
if (tp->packets_out || !tcp_write_queue_empty(sk))
goto resched;
elapsed = keepalive_time_elapsed(tp);
if (elapsed >= keepalive_time_when(tp)) {
/* If the TCP_USER_TIMEOUT option is enabled, use that
* to determine when to timeout instead.
*/
if ((icsk->icsk_user_timeout != 0 &&
elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
icsk->icsk_probes_out > 0) ||
(icsk->icsk_user_timeout == 0 &&
icsk->icsk_probes_out >= keepalive_probes(tp))) {
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_write_err(sk);
goto out;
}
if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
icsk->icsk_probes_out++;
elapsed = keepalive_intvl_when(tp);
} else {
/* If keepalive was lost due to local congestion,
* try harder.
*/
elapsed = TCP_RESOURCE_PROBE_INTERVAL;
}
} else {
/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
elapsed = keepalive_time_when(tp) - elapsed;
}
sk_mem_reclaim(sk);
resched:
inet_csk_reset_keepalive_timer (sk, elapsed);
goto out;
death:
tcp_done(sk);
out:
bh_unlock_sock(sk);
sock_put(sk);
}
static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
{
struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer);
struct sock *sk = (struct sock *)tp;
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
if (tp->compressed_ack) {
/* Since we have to send one ack finally,
* subtract one from tp->compressed_ack to keep
* LINUX_MIB_TCPACKCOMPRESSED accurate.
*/
tp->compressed_ack--;
tcp_send_ack(sk);
}
} else {
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
&sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
sock_put(sk);
return HRTIMER_NORESTART;
}
void tcp_init_xmit_timers(struct sock *sk)
{
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_PINNED_SOFT);
tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL_PINNED_SOFT);
tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick;
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MM_PERCPU_INTERNAL_H
#define _MM_PERCPU_INTERNAL_H
#include <linux/types.h>
#include <linux/percpu.h>
/*
* pcpu_block_md is the metadata block struct.
* Each chunk's bitmap is split into a number of full blocks.
* All units are in terms of bits.
*
* The scan hint is the largest known contiguous area before the contig hint.
* It is not necessarily the actual largest contig hint though. There is an
* invariant that the scan_hint_start > contig_hint_start iff
* scan_hint == contig_hint. This is necessary because when scanning forward,
* we don't know if a new contig hint would be better than the current one.
*/
struct pcpu_block_md {
int scan_hint; /* scan hint for block */
int scan_hint_start; /* block relative starting
position of the scan hint */
int contig_hint; /* contig hint for block */
int contig_hint_start; /* block relative starting
position of the contig hint */
int left_free; /* size of free space along
the left side of the block */
int right_free; /* size of free space along
the right side of the block */
int first_free; /* block position of first free */
int nr_bits; /* total bits responsible for */
};
struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
int nr_alloc; /* # of allocations */
size_t max_alloc_size; /* largest allocation size */
#endif
struct list_head list; /* linked to pcpu_slot lists */
int free_bytes; /* free bytes in the chunk */
struct pcpu_block_md chunk_md;
void *base_addr; /* base address of this chunk */
unsigned long *alloc_map; /* allocation map */
unsigned long *bound_map; /* boundary map */
struct pcpu_block_md *md_blocks; /* metadata blocks */
void *data; /* chunk data */
bool immutable; /* no [de]population allowed */
bool isolated; /* isolated from active chunk
slots */
int start_offset; /* the overlap with the previous
region to have a page aligned
base_addr */
int end_offset; /* additional area required to
have the region end page
aligned */
#ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup **obj_cgroups; /* vector of object cgroups */
#endif
int nr_pages; /* # of pages served by this chunk */
int nr_populated; /* # of populated pages */
int nr_empty_pop_pages; /* # of empty populated pages */
unsigned long populated[]; /* populated bitmap */
};
extern spinlock_t pcpu_lock;
extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
extern int pcpu_sidelined_slot;
extern int pcpu_to_depopulate_slot;
extern int pcpu_nr_empty_pop_pages;
extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk;
/**
* pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks
* @chunk: chunk of interest
*
* This conversion is from the number of physical pages that the chunk
* serves to the number of bitmap blocks used.
*/
static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
{
return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
}
/**
* pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap
* @pages: number of physical pages
*
* This conversion is from physical pages to the number of bits
* required in the bitmap.
*/
static inline int pcpu_nr_pages_to_map_bits(int pages)
{
return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
}
/**
* pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap
* @chunk: chunk of interest
*
* This conversion is from the number of physical pages that the chunk
* serves to the number of bits in the bitmap.
*/
static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
{
return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
}
#ifdef CONFIG_PERCPU_STATS
#include <linux/spinlock.h>
struct percpu_stats {
u64 nr_alloc; /* lifetime # of allocations */
u64 nr_dealloc; /* lifetime # of deallocations */
u64 nr_cur_alloc; /* current # of allocations */
u64 nr_max_alloc; /* max # of live allocations */
u32 nr_chunks; /* current # of live chunks */
u32 nr_max_chunks; /* max # of live chunks */
size_t min_alloc_size; /* min allocation size */
size_t max_alloc_size; /* max allocation size */
};
extern struct percpu_stats pcpu_stats;
extern struct pcpu_alloc_info pcpu_stats_ai;
/*
* For debug purposes. We don't care about the flexible array.
*/
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info));
/* initialize min_alloc_size to unit_size */
pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
}
/*
* pcpu_stats_area_alloc - increment area allocation stats
* @chunk: the location of the area being allocated
* @size: size of area to allocate in bytes
*
* CONTEXT:
* pcpu_lock.
*/
static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
lockdep_assert_held(&pcpu_lock);
pcpu_stats.nr_alloc++;
pcpu_stats.nr_cur_alloc++;
pcpu_stats.nr_max_alloc =
max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc);
pcpu_stats.min_alloc_size =
min(pcpu_stats.min_alloc_size, size);
pcpu_stats.max_alloc_size =
max(pcpu_stats.max_alloc_size, size);
chunk->nr_alloc++;
chunk->max_alloc_size = max(chunk->max_alloc_size, size);
}
/*
* pcpu_stats_area_dealloc - decrement allocation stats
* @chunk: the location of the area being deallocated
*
* CONTEXT:
* pcpu_lock.
*/
static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
lockdep_assert_held(&pcpu_lock);
pcpu_stats.nr_dealloc++;
pcpu_stats.nr_cur_alloc--;
chunk->nr_alloc--;
}
/*
* pcpu_stats_chunk_alloc - increment chunk stats
*/
static inline void pcpu_stats_chunk_alloc(void)
{
unsigned long flags;
spin_lock_irqsave(&pcpu_lock, flags);
pcpu_stats.nr_chunks++;
pcpu_stats.nr_max_chunks =
max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks);
spin_unlock_irqrestore(&pcpu_lock, flags);
}
/*
* pcpu_stats_chunk_dealloc - decrement chunk stats
*/
static inline void pcpu_stats_chunk_dealloc(void)
{
unsigned long flags;
spin_lock_irqsave(&pcpu_lock, flags);
pcpu_stats.nr_chunks--;
spin_unlock_irqrestore(&pcpu_lock, flags);
}
#else
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
}
static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
}
static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
}
static inline void pcpu_stats_chunk_alloc(void)
{
}
static inline void pcpu_stats_chunk_dealloc(void)
{
}
#endif /* !CONFIG_PERCPU_STATS */
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CGROUP_H
#define _LINUX_CGROUP_H
/*
* cgroup interface
*
* Copyright (C) 2003 BULL SA
* Copyright (C) 2004-2006 Silicon Graphics, Inc.
*
*/
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/jump_label.h>
#include <linux/types.h>
#include <linux/ns_common.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup-defs.h>
struct kernel_clone_args;
#ifdef CONFIG_CGROUPS
/*
* All weight knobs on the default hierarchy should use the following min,
* default and max values. The default value is the logarithmic center of
* MIN and MAX and allows 100x to be expressed in both directions.
*/
#define CGROUP_WEIGHT_MIN 1
#define CGROUP_WEIGHT_DFL 100
#define CGROUP_WEIGHT_MAX 10000
/* walk only threadgroup leaders */
#define CSS_TASK_ITER_PROCS (1U << 0)
/* walk all threaded css_sets in the domain */
#define CSS_TASK_ITER_THREADED (1U << 1)
/* internal flags */
#define CSS_TASK_ITER_SKIPPED (1U << 16)
/* a css_task_iter should be treated as an opaque object */
struct css_task_iter {
struct cgroup_subsys *ss;
unsigned int flags;
struct list_head *cset_pos;
struct list_head *cset_head;
struct list_head *tcset_pos;
struct list_head *tcset_head;
struct list_head *task_pos;
struct list_head *cur_tasks_head;
struct css_set *cur_cset;
struct css_set *cur_dcset;
struct task_struct *cur_task;
struct list_head iters_node; /* css_set->task_iters */
};
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;
#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
#include <linux/cgroup_subsys.h>
#undef SUBSYS
#define SUBSYS(_x) \
extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \
extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key;
#include <linux/cgroup_subsys.h>
#undef SUBSYS
/**
* cgroup_subsys_enabled - fast test on whether a subsys is enabled
* @ss: subsystem in question
*/
#define cgroup_subsys_enabled(ss) \
static_branch_likely(&ss ## _enabled_key)
/**
* cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy
* @ss: subsystem in question
*/
#define cgroup_subsys_on_dfl(ss) \
static_branch_likely(&ss ## _on_dfl_key)
bool css_has_online_children(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
struct cgroup_subsys *ss);
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup_subsys *ss);
struct cgroup *cgroup_get_from_path(const char *path);
struct cgroup *cgroup_get_from_fd(int fd);
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
void cgroup_file_notify(struct cgroup_file *cfile);
int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk);
void cgroup_fork(struct task_struct *p);
extern int cgroup_can_fork(struct task_struct *p,
struct kernel_clone_args *kargs);
extern void cgroup_cancel_fork(struct task_struct *p,
struct kernel_clone_args *kargs);
extern void cgroup_post_fork(struct task_struct *p,
struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);
int cgroup_init_early(void);
int cgroup_init(void);
int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v);
/*
* Iteration helpers and macros.
*/
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *parent);
struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos);
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *css);
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
struct cgroup_subsys_state **dst_cssp);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
struct cgroup_subsys_state **dst_cssp);
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it);
struct task_struct *css_task_iter_next(struct css_task_iter *it);
void css_task_iter_end(struct css_task_iter *it);
/**
* css_for_each_child - iterate through children of a css
* @pos: the css * to use as the loop cursor
* @parent: css whose children to walk
*
* Walk @parent's children. Must be called under rcu_read_lock().
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
* future iterations and will stay visible until the last reference is put.
* A css which hasn't finished ->css_online() or already finished
* ->css_offline() may show up during traversal. It's each subsystem's
* responsibility to synchronize against on/offlining.
*
* It is allowed to temporarily drop RCU read lock during iteration. The
* caller is responsible for ensuring that @pos remains accessible until
* the start of the next iteration by, for example, bumping the css refcnt.
*/
#define css_for_each_child(pos, parent) \
for ((pos) = css_next_child(NULL, (parent)); (pos); \
(pos) = css_next_child((pos), (parent)))
/**
* css_for_each_descendant_pre - pre-order walk of a css's descendants
* @pos: the css * to use as the loop cursor
* @root: css whose descendants to walk
*
* Walk @root's descendants. @root is included in the iteration and the
* first node to be visited. Must be called under rcu_read_lock().
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
* future iterations and will stay visible until the last reference is put.
* A css which hasn't finished ->css_online() or already finished
* ->css_offline() may show up during traversal. It's each subsystem's
* responsibility to synchronize against on/offlining.
*
* For example, the following guarantees that a descendant can't escape
* state updates of its ancestors.
*
* my_online(@css)
* {
* Lock @css's parent and @css;
* Inherit state from the parent;
* Unlock both.
* }
*
* my_update_state(@css)
* {
* css_for_each_descendant_pre(@pos, @css) {
* Lock @pos;
* if (@pos == @css)
* Update @css's state;
* else
* Verify @pos is alive and inherit state from its parent;
* Unlock @pos;
* }
* }
*
* As long as the inheriting step, including checking the parent state, is
* enclosed inside @pos locking, double-locking the parent isn't necessary
* while inheriting. The state update to the parent is guaranteed to be
* visible by walking order and, as long as inheriting operations to the
* same @pos are atomic to each other, multiple updates racing each other
* still result in the correct state. It's guaranateed that at least one
* inheritance happens for any css after the latest update to its parent.
*
* If checking parent's state requires locking the parent, each inheriting
* iteration should lock and unlock both @pos->parent and @pos.
*
* Alternatively, a subsystem may choose to use a single global lock to
* synchronize ->css_online() and ->css_offline() against tree-walking
* operations.
*
* It is allowed to temporarily drop RCU read lock during iteration. The
* caller is responsible for ensuring that @pos remains accessible until
* the start of the next iteration by, for example, bumping the css refcnt.
*/
#define css_for_each_descendant_pre(pos, css) \
for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \
(pos) = css_next_descendant_pre((pos), (css)))
/**
* css_for_each_descendant_post - post-order walk of a css's descendants
* @pos: the css * to use as the loop cursor
* @css: css whose descendants to walk
*
* Similar to css_for_each_descendant_pre() but performs post-order
* traversal instead. @root is included in the iteration and the last
* node to be visited.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
* future iterations and will stay visible until the last reference is put.
* A css which hasn't finished ->css_online() or already finished
* ->css_offline() may show up during traversal. It's each subsystem's
* responsibility to synchronize against on/offlining.
*
* Note that the walk visibility guarantee example described in pre-order
* walk doesn't apply the same to post-order walks.
*/
#define css_for_each_descendant_post(pos, css) \
for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \
(pos) = css_next_descendant_post((pos), (css)))
/**
* cgroup_taskset_for_each - iterate cgroup_taskset
* @task: the loop cursor
* @dst_css: the destination css
* @tset: taskset to iterate
*
* @tset may contain multiple tasks and they may belong to multiple
* processes.
*
* On the v2 hierarchy, there may be tasks from multiple processes and they
* may not share the source or destination csses.
*
* On traditional hierarchies, when there are multiple tasks in @tset, if a
* task of a process is in @tset, all tasks of the process are in @tset.
* Also, all are guaranteed to share the same source and destination csses.
*
* Iteration is not in any specific order.
*/
#define cgroup_taskset_for_each(task, dst_css, tset) \
for ((task) = cgroup_taskset_first((tset), &(dst_css)); \
(task); \
(task) = cgroup_taskset_next((tset), &(dst_css)))
/**
* cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
* @leader: the loop cursor
* @dst_css: the destination css
* @tset: taskset to iterate
*
* Iterate threadgroup leaders of @tset. For single-task migrations, @tset
* may not contain any.
*/
#define cgroup_taskset_for_each_leader(leader, dst_css, tset) \
for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \
(leader); \
(leader) = cgroup_taskset_next((tset), &(dst_css))) \
if ((leader) != (leader)->group_leader) \
; \
else
/*
* Inline functions.
*/
static inline u64 cgroup_id(const struct cgroup *cgrp)
{
return cgrp->kn->id;
}
/**
* css_get - obtain a reference on the specified css
* @css: target css
*
* The caller must already have a reference.
*/
static inline void css_get(struct cgroup_subsys_state *css)
{
if (!(css->flags & CSS_NO_REF))
percpu_ref_get(&css->refcnt);
}
/**
* css_get_many - obtain references on the specified css
* @css: target css
* @n: number of references to get
*
* The caller must already have a reference.
*/
static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n)
{
if (!(css->flags & CSS_NO_REF))
percpu_ref_get_many(&css->refcnt, n);
}
/**
* css_tryget - try to obtain a reference on the specified css
* @css: target css
*
* Obtain a reference on @css unless it already has reached zero and is
* being released. This function doesn't care whether @css is on or
* offline. The caller naturally needs to ensure that @css is accessible
* but doesn't have to be holding a reference on it - IOW, RCU protected
* access is good enough for this function. Returns %true if a reference
* count was successfully obtained; %false otherwise.
*/
static inline bool css_tryget(struct cgroup_subsys_state *css)
{
if (!(css->flags & CSS_NO_REF))
return percpu_ref_tryget(&css->refcnt);
return true;
}
/**
* css_tryget_online - try to obtain a reference on the specified css if online
* @css: target css
*
* Obtain a reference on @css if it's online. The caller naturally needs
* to ensure that @css is accessible but doesn't have to be holding a
* reference on it - IOW, RCU protected access is good enough for this
* function. Returns %true if a reference count was successfully obtained;
* %false otherwise.
*/
static inline bool css_tryget_online(struct cgroup_subsys_state *css)
{
if (!(css->flags & CSS_NO_REF))
return percpu_ref_tryget_live(&css->refcnt);
return true;
}
/**
* css_is_dying - test whether the specified css is dying
* @css: target css
*
* Test whether @css is in the process of offlining or already offline. In
* most cases, ->css_online() and ->css_offline() callbacks should be
* enough; however, the actual offline operations are RCU delayed and this
* test returns %true also when @css is scheduled to be offlined.
*
* This is useful, for example, when the use case requires synchronous
* behavior with respect to cgroup removal. cgroup removal schedules css
* offlining but the css can seem alive while the operation is being
* delayed. If the delay affects user visible semantics, this test can be
* used to resolve the situation.
*/
static inline bool css_is_dying(struct cgroup_subsys_state *css)
{
return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt);
}
/**
* css_put - put a css reference
* @css: target css
*
* Put a reference obtained via css_get() and css_tryget_online().
*/
static inline void css_put(struct cgroup_subsys_state *css)
{
if (!(css->flags & CSS_NO_REF))
percpu_ref_put(&css->refcnt);
}
/**
* css_put_many - put css references
* @css: target css
* @n: number of references to put
*
* Put references obtained via css_get() and css_tryget_online().
*/
static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
{
if (!(css->flags & CSS_NO_REF))
percpu_ref_put_many(&css->refcnt, n);
}
static inline void cgroup_get(struct cgroup *cgrp)
{
css_get(&cgrp->self);
}
static inline bool cgroup_tryget(struct cgroup *cgrp)
{
return css_tryget(&cgrp->self);
}
static inline void cgroup_put(struct cgroup *cgrp)
{
css_put(&cgrp->self);
}
/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
* @__c: extra condition expression to be passed to rcu_dereference_check()
*
* A task's css_set is RCU protected, initialized and exited while holding
* task_lock(), and can only be modified while holding both cgroup_mutex
* and task_lock() while the task is alive. This macro verifies that the
* caller is inside proper critical section and returns @task's css_set.
*
* The caller can also specify additional allowed conditions via @__c, such
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
lockdep_is_held(&cgroup_mutex) || \
lockdep_is_held(&css_set_lock) || \
((task)->flags & PF_EXITING) || (__c))
#else
#define task_css_set_check(task, __c) \
rcu_dereference((task)->cgroups)
#endif
/**
* task_css_check - obtain css for (task, subsys) w/ extra access conds
* @task: the target task
* @subsys_id: the target subsystem ID
* @__c: extra condition expression to be passed to rcu_dereference_check()
*
* Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The
* synchronization rules are the same as task_css_set_check().
*/
#define task_css_check(task, subsys_id, __c) \
task_css_set_check((task), (__c))->subsys[(subsys_id)]
/**
* task_css_set - obtain a task's css_set
* @task: the task to obtain css_set for
*
* See task_css_set_check().
*/
static inline struct css_set *task_css_set(struct task_struct *task)
{
return task_css_set_check(task, false);
}
/**
* task_css - obtain css for (task, subsys)
* @task: the target task
* @subsys_id: the target subsystem ID
*
* See task_css_check().
*/
static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
int subsys_id)
{
return task_css_check(task, subsys_id, false);
}
/**
* task_get_css - find and get the css for (task, subsys)
* @task: the target task
* @subsys_id: the target subsystem ID
*
* Find the css for the (@task, @subsys_id) combination, increment a
* reference on and return it. This function is guaranteed to return a
* valid css. The returned css may already have been offlined.
*/
static inline struct cgroup_subsys_state *
task_get_css(struct task_struct *task, int subsys_id)
{
struct cgroup_subsys_state *css;
rcu_read_lock();
while (true) {
css = task_css(task, subsys_id);
/*
* Can't use css_tryget_online() here. A task which has
* PF_EXITING set may stay associated with an offline css.
* If such task calls this function, css_tryget_online()
* will keep failing.
*/
if (likely(css_tryget(css)))
break;
cpu_relax();
}
rcu_read_unlock();
return css;
}
/**
* task_css_is_root - test whether a task belongs to the root css
* @task: the target task
* @subsys_id: the target subsystem ID
*
* Test whether @task belongs to the root css on the specified subsystem.
* May be invoked in any context.
*/
static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
{
return task_css_check(task, subsys_id, true) ==
init_css_set.subsys[subsys_id];
}
static inline struct cgroup *task_cgroup(struct task_struct *task,
int subsys_id)
{
return task_css(task, subsys_id)->cgroup;
}
static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
{
return task_css_set(task)->dfl_cgrp;
}
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
struct cgroup_subsys_state *parent_css = cgrp->self.parent;
if (parent_css)
return container_of(parent_css, struct cgroup, self);
return NULL;
}
/**
* cgroup_is_descendant - test ancestry
* @cgrp: the cgroup to be tested
* @ancestor: possible ancestor of @cgrp
*
* Test whether @cgrp is a descendant of @ancestor. It also returns %true
* if @cgrp == @ancestor. This function is safe to call as long as @cgrp
* and @ancestor are accessible.
*/
static inline bool cgroup_is_descendant(struct cgroup *cgrp,
struct cgroup *ancestor)
{
if (cgrp->root != ancestor->root || cgrp->level < ancestor->level)
return false;
return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor);
}
/**
* cgroup_ancestor - find ancestor of cgroup
* @cgrp: cgroup to find ancestor of
* @ancestor_level: level of ancestor to find starting from root
*
* Find ancestor of cgroup at specified level starting from root if it exists
* and return pointer to it. Return NULL if @cgrp doesn't have ancestor at
* @ancestor_level.
*
* This function is safe to call as long as @cgrp is accessible.
*/
static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
int ancestor_level)
{
if (cgrp->level < ancestor_level)
return NULL;
while (cgrp && cgrp->level > ancestor_level)
cgrp = cgroup_parent(cgrp);
return cgrp;
}
/**
* task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
* @task: the task to be tested
* @ancestor: possible ancestor of @task's cgroup
*
* Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
* It follows all the same rules as cgroup_is_descendant, and only applies
* to the default hierarchy.
*/
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
struct cgroup *ancestor)
{
struct css_set *cset = task_css_set(task);
return cgroup_is_descendant(cset->dfl_cgrp, ancestor);
}
/* no synchronization, the result can only be used as a hint */
static inline bool cgroup_is_populated(struct cgroup *cgrp)
{
return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children +
cgrp->nr_populated_threaded_children;
}
/* returns ino associated with a cgroup */
static inline ino_t cgroup_ino(struct cgroup *cgrp)
{
return kernfs_ino(cgrp->kn);
}
/* cft/css accessors for cftype->write() operation */
static inline struct cftype *of_cft(struct kernfs_open_file *of)
{
return of->kn->priv;
}
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);
/* cft/css accessors for cftype->seq_*() operations */
static inline struct cftype *seq_cft(struct seq_file *seq)
{
return of_cft(seq->private);
}
static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
{
return of_css(seq->private);
}
/*
* Name / path handling functions. All are thin wrappers around the kernfs
* counterparts and can be called under any context.
*/
static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
{
return kernfs_name(cgrp->kn, buf, buflen);
}
static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen)
{
return kernfs_path(cgrp->kn, buf, buflen);
}
static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
{
pr_cont_kernfs_name(cgrp->kn);
}
static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
{
pr_cont_kernfs_path(cgrp->kn);
}
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
return &cgrp->psi;
}
bool cgroup_psi_enabled(void);
static inline void cgroup_init_kthreadd(void)
{
/*
* kthreadd is inherited by all kthreads, keep it in the root so
* that the new kthreads are guaranteed to stay in the root until
* initialization is finished.
*/
current->no_cgroup_migration = 1;
}
static inline void cgroup_kthread_ready(void)
{
/*
* This kthread finished initialization. The creator should have
* set PF_NO_SETAFFINITY if this kthread should stay in the root.
*/
current->no_cgroup_migration = 0;
}
void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
struct cgroup *cgroup_get_from_id(u64 id);
#else /* !CONFIG_CGROUPS */
struct cgroup_subsys_state;
struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry) { return -EINVAL; }
static inline void cgroup_fork(struct task_struct *p) {}
static inline int cgroup_can_fork(struct task_struct *p,
struct kernel_clone_args *kargs) { return 0; }
static inline void cgroup_cancel_fork(struct task_struct *p,
struct kernel_clone_args *kargs) {}
static inline void cgroup_post_fork(struct task_struct *p,
struct kernel_clone_args *kargs) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_release(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {}
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_init_kthreadd(void) {}
static inline void cgroup_kthread_ready(void) {}
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
return NULL;
}
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
return NULL;
}
static inline bool cgroup_psi_enabled(void)
{
return false;
}
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
struct cgroup *ancestor)
{
return true;
}
static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{}
static inline struct cgroup *cgroup_get_from_id(u64 id)
{
return NULL;
}
#endif /* !CONFIG_CGROUPS */
#ifdef CONFIG_CGROUPS
/*
* cgroup scalable recursive statistics.
*/
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
void cgroup_rstat_flush(struct cgroup *cgrp);
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp);
void cgroup_rstat_flush_hold(struct cgroup *cgrp);
void cgroup_rstat_flush_release(void);
/*
* Basic resource stats.
*/
#ifdef CONFIG_CGROUP_CPUACCT
void cpuacct_charge(struct task_struct *tsk, u64 cputime);
void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
static inline void cpuacct_account_field(struct task_struct *tsk, int index,
u64 val) {}
#endif
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec);
static inline void cgroup_account_cputime(struct task_struct *task,
u64 delta_exec)
{
struct cgroup *cgrp;
cpuacct_charge(task, delta_exec);
rcu_read_lock();
cgrp = task_dfl_cgroup(task);
if (cgroup_parent(cgrp))
__cgroup_account_cputime(cgrp, delta_exec);
rcu_read_unlock();
}
static inline void cgroup_account_cputime_field(struct task_struct *task,
enum cpu_usage_stat index,
u64 delta_exec)
{
struct cgroup *cgrp;
cpuacct_account_field(task, index, delta_exec);
rcu_read_lock();
cgrp = task_dfl_cgroup(task);
if (cgroup_parent(cgrp))
__cgroup_account_cputime_field(cgrp, index, delta_exec);
rcu_read_unlock();
}
#else /* CONFIG_CGROUPS */
static inline void cgroup_account_cputime(struct task_struct *task,
u64 delta_exec) {}
static inline void cgroup_account_cputime_field(struct task_struct *task,
enum cpu_usage_stat index,
u64 delta_exec) {}
#endif /* CONFIG_CGROUPS */
/*
* sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
* definition in cgroup-defs.h.
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
return skcd->cgroup;
}
#else /* CONFIG_CGROUP_DATA */
static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
#endif /* CONFIG_CGROUP_DATA */
struct cgroup_namespace {
struct ns_common ns;
struct user_namespace *user_ns;
struct ucounts *ucounts;
struct css_set *root_cset;
};
extern struct cgroup_namespace init_cgroup_ns;
#ifdef CONFIG_CGROUPS
void free_cgroup_ns(struct cgroup_namespace *ns);
struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
struct user_namespace *user_ns,
struct cgroup_namespace *old_ns);
int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
#else /* !CONFIG_CGROUPS */
static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
static inline struct cgroup_namespace *
copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
struct cgroup_namespace *old_ns)
{
return old_ns;
}
#endif /* !CONFIG_CGROUPS */
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
{
if (ns)
refcount_inc(&ns->ns.count);
}
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
{
if (ns && refcount_dec_and_test(&ns->ns.count))
free_cgroup_ns(ns);
}
#ifdef CONFIG_CGROUPS
void cgroup_enter_frozen(void);
void cgroup_leave_frozen(bool always_leave);
void cgroup_update_frozen(struct cgroup *cgrp);
void cgroup_freeze(struct cgroup *cgrp, bool freeze);
void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
struct cgroup *dst);
static inline bool cgroup_task_frozen(struct task_struct *task)
{
return task->frozen;
}
#else /* !CONFIG_CGROUPS */
static inline void cgroup_enter_frozen(void) { }
static inline void cgroup_leave_frozen(bool always_leave) { }
static inline bool cgroup_task_frozen(struct task_struct *task)
{
return false;
}
#endif /* !CONFIG_CGROUPS */
#ifdef CONFIG_CGROUP_BPF
static inline void cgroup_bpf_get(struct cgroup *cgrp)
{
percpu_ref_get(&cgrp->bpf.refcnt);
}
static inline void cgroup_bpf_put(struct cgroup *cgrp)
{
percpu_ref_put(&cgrp->bpf.refcnt);
}
#else /* CONFIG_CGROUP_BPF */
static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
#endif /* CONFIG_CGROUP_BPF */
#endif /* _LINUX_CGROUP_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2014 Davidlohr Bueso.
*/
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
/*
* Hash based on the pmd of addr if configured with MMU, which provides a good
* hit rate for workloads with spatial locality. Otherwise, use pages.
*/
#ifdef CONFIG_MMU
#define VMACACHE_SHIFT PMD_SHIFT
#else
#define VMACACHE_SHIFT PAGE_SHIFT
#endif
#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK)
/*
* This task may be accessing a foreign mm via (for example)
* get_user_pages()->find_vma(). The vmacache is task-local and this
* task's vmacache pertains to a different mm (ie, its own). There is
* nothing we can do here.
*
* Also handle the case where a kernel thread has adopted this mm via
* kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm.
*/
static inline bool vmacache_valid_mm(struct mm_struct *mm)
{
return current->mm == mm && !(current->flags & PF_KTHREAD);
}
void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
{
if (vmacache_valid_mm(newvma->vm_mm))
current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma;
}
static bool vmacache_valid(struct mm_struct *mm)
{
struct task_struct *curr;
if (!vmacache_valid_mm(mm))
return false;
curr = current;
if (mm->vmacache_seqnum != curr->vmacache.seqnum) {
/*
* First attempt will always be invalid, initialize
* the new cache for this task here.
*/
curr->vmacache.seqnum = mm->vmacache_seqnum;
vmacache_flush(curr);
return false;
}
return true;
}
struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
{
int idx = VMACACHE_HASH(addr);
int i;
count_vm_vmacache_event(VMACACHE_FIND_CALLS);
if (!vmacache_valid(mm))
return NULL; for (i = 0; i < VMACACHE_SIZE; i++) {
struct vm_area_struct *vma = current->vmacache.vmas[idx];
if (vma) {
#ifdef CONFIG_DEBUG_VM_VMACACHE
if (WARN_ON_ONCE(vma->vm_mm != mm))
break;
#endif
if (vma->vm_start <= addr && vma->vm_end > addr) {
count_vm_vmacache_event(VMACACHE_FIND_HITS);
return vma;
}
}
if (++idx == VMACACHE_SIZE)
idx = 0;
}
return NULL;
}
#ifndef CONFIG_MMU
struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
int idx = VMACACHE_HASH(start);
int i;
count_vm_vmacache_event(VMACACHE_FIND_CALLS);
if (!vmacache_valid(mm))
return NULL;
for (i = 0; i < VMACACHE_SIZE; i++) {
struct vm_area_struct *vma = current->vmacache.vmas[idx];
if (vma && vma->vm_start == start && vma->vm_end == end) {
count_vm_vmacache_event(VMACACHE_FIND_HITS);
return vma;
}
if (++idx == VMACACHE_SIZE)
idx = 0;
}
return NULL;
}
#endif
// SPDX-License-Identifier: GPL-2.0
/*
* fs/sysfs/dir.c - sysfs core and dir operation implementation
*
* Copyright (c) 2001-3 Patrick Mochel
* Copyright (c) 2007 SUSE Linux Products GmbH
* Copyright (c) 2007 Tejun Heo <teheo@suse.de>
*
* Please see Documentation/filesystems/sysfs.rst for more information.
*/
#define pr_fmt(fmt) "sysfs: " fmt
#include <linux/fs.h>
#include <linux/kobject.h>
#include <linux/slab.h>
#include "sysfs.h"
DEFINE_SPINLOCK(sysfs_symlink_target_lock);
void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
{
char *buf;
buf = kzalloc(PATH_MAX, GFP_KERNEL);
if (buf)
kernfs_path(parent, buf, PATH_MAX);
pr_warn("cannot create duplicate filename '%s/%s'\n", buf, name);
dump_stack();
kfree(buf);
}
/**
* sysfs_create_dir_ns - create a directory for an object with a namespace tag
* @kobj: object we're creating directory for
* @ns: the namespace tag to use
*/
int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
{
struct kernfs_node *parent, *kn;
kuid_t uid;
kgid_t gid;
if (WARN_ON(!kobj))
return -EINVAL;
if (kobj->parent) parent = kobj->parent->sd;
else
parent = sysfs_root_kn; if (!parent)
return -ENOENT;
kobject_get_ownership(kobj, &uid, &gid);
kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
S_IRWXU | S_IRUGO | S_IXUGO, uid, gid,
kobj, ns);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
sysfs_warn_dup(parent, kobject_name(kobj)); return PTR_ERR(kn);
}
kobj->sd = kn; return 0;
}
/**
* sysfs_remove_dir - remove an object's directory.
* @kobj: object.
*
* The only thing special about this is that we remove any files in
* the directory before we remove the directory, and we've inlined
* what used to be sysfs_rmdir() below, instead of calling separately.
*/
void sysfs_remove_dir(struct kobject *kobj)
{
struct kernfs_node *kn = kobj->sd;
/*
* In general, kboject owner is responsible for ensuring removal
* doesn't race with other operations and sysfs doesn't provide any
* protection; however, when @kobj is used as a symlink target, the
* symlinking entity usually doesn't own @kobj and thus has no
* control over removal. @kobj->sd may be removed anytime
* and symlink code may end up dereferencing an already freed node.
*
* sysfs_symlink_target_lock synchronizes @kobj->sd
* disassociation against symlink operations so that symlink code
* can safely dereference @kobj->sd.
*/
spin_lock(&sysfs_symlink_target_lock);
kobj->sd = NULL;
spin_unlock(&sysfs_symlink_target_lock);
if (kn) {
WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); kernfs_remove(kn);
}
}
int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
const void *new_ns)
{
struct kernfs_node *parent;
int ret;
parent = kernfs_get_parent(kobj->sd);
ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
kernfs_put(parent);
return ret;
}
int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
const void *new_ns)
{
struct kernfs_node *kn = kobj->sd;
struct kernfs_node *new_parent;
new_parent = new_parent_kobj && new_parent_kobj->sd ?
new_parent_kobj->sd : sysfs_root_kn;
return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
}
/**
* sysfs_create_mount_point - create an always empty directory
* @parent_kobj: kobject that will contain this always empty directory
* @name: The name of the always empty directory to add
*/
int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name)
{
struct kernfs_node *kn, *parent = parent_kobj->sd;
kn = kernfs_create_empty_dir(parent, name);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
sysfs_warn_dup(parent, name);
return PTR_ERR(kn);
}
return 0;
}
EXPORT_SYMBOL_GPL(sysfs_create_mount_point);
/**
* sysfs_remove_mount_point - remove an always empty directory.
* @parent_kobj: kobject that will contain this always empty directory
* @name: The name of the always empty directory to remove
*
*/
void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name)
{
struct kernfs_node *parent = parent_kobj->sd;
kernfs_remove_by_name_ns(parent, name, NULL);
}
EXPORT_SYMBOL_GPL(sysfs_remove_mount_point);
// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/page-writeback.c
*
* Copyright (C) 2002, Linus Torvalds.
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*
* Contains functions related to writing back dirty pages at the
* address_space level.
*
* 10Apr2002 Andrew Morton
* Initial version
*/
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>
#include "internal.h"
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
*/
#define MAX_PAUSE max(HZ/5, 1)
/*
* Try to keep balance_dirty_pages() call intervals higher than this many pages
* by raising pause time to max_pause when falls below it.
*/
#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
/*
* Estimate write bandwidth at 200ms intervals.
*/
#define BANDWIDTH_INTERVAL max(HZ/5, 1)
#define RATELIMIT_CALC_SHIFT 10
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
*/
static long ratelimit_pages = 32;
/* The following parameters are exported via /proc/sys/vm */
/*
* Start background writeback (via writeback threads) at this percentage
*/
int dirty_background_ratio = 10;
/*
* dirty_background_bytes starts at 0 (disabled) so that it is a function of
* dirty_background_ratio * the amount of dirtyable memory
*/
unsigned long dirty_background_bytes;
/*
* free highmem will not be subtracted from the total free memory
* for calculating free ratios if vm_highmem_is_dirtyable is true
*/
int vm_highmem_is_dirtyable;
/*
* The generator of dirty data starts writeback at this percentage
*/
int vm_dirty_ratio = 20;
/*
* vm_dirty_bytes starts at 0 (disabled) so that it is a function of
* vm_dirty_ratio * the amount of dirtyable memory
*/
unsigned long vm_dirty_bytes;
/*
* The interval between `kupdate'-style writebacks
*/
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
EXPORT_SYMBOL_GPL(dirty_writeback_interval);
/*
* The longest time for which data is allowed to remain dirty
*/
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
/*
* Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
* a full sync is triggered after this time elapses without any disk activity.
*/
int laptop_mode;
EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
struct wb_domain global_wb_domain;
/* consolidated parameters for balance_dirty_pages() and its subroutines */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
struct wb_domain *dom;
struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */
#endif
struct bdi_writeback *wb;
struct fprop_local_percpu *wb_completions;
unsigned long avail; /* dirtyable */
unsigned long dirty; /* file_dirty + write + nfs */
unsigned long thresh; /* dirty threshold */
unsigned long bg_thresh; /* dirty background threshold */
unsigned long wb_dirty; /* per-wb counterparts */
unsigned long wb_thresh;
unsigned long wb_bg_thresh;
unsigned long pos_ratio;
};
/*
* Length of period for aging writeout fractions of bdis. This is an
* arbitrarily chosen number. The longer the period, the slower fractions will
* reflect changes in current writeout rate.
*/
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
#ifdef CONFIG_CGROUP_WRITEBACK
#define GDTC_INIT(__wb) .wb = (__wb), \
.dom = &global_wb_domain, \
.wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB .dom = &global_wb_domain
#define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \
.dom = mem_cgroup_wb_domain(__wb), \
.wb_completions = &(__wb)->memcg_completions, \
.gdtc = __gdtc
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
return dtc->dom;
}
static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
return dtc->dom;
}
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
return mdtc->gdtc;
}
static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
return &wb->memcg_completions;
}
static void wb_min_max_ratio(struct bdi_writeback *wb,
unsigned long *minp, unsigned long *maxp)
{
unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
unsigned long long min = wb->bdi->min_ratio;
unsigned long long max = wb->bdi->max_ratio;
/*
* @wb may already be clean by the time control reaches here and
* the total may not include its bw.
*/
if (this_bw < tot_bw) {
if (min) {
min *= this_bw;
min = div64_ul(min, tot_bw);
}
if (max < 100) {
max *= this_bw;
max = div64_ul(max, tot_bw);
}
}
*minp = min;
*maxp = max;
}
#else /* CONFIG_CGROUP_WRITEBACK */
#define GDTC_INIT(__wb) .wb = (__wb), \
.wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
return false;
}
static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
return &global_wb_domain;
}
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
return NULL;
}
static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
return NULL;
}
static void wb_min_max_ratio(struct bdi_writeback *wb,
unsigned long *minp, unsigned long *maxp)
{
*minp = wb->bdi->min_ratio;
*maxp = wb->bdi->max_ratio;
}
#endif /* CONFIG_CGROUP_WRITEBACK */
/*
* In a memory zone, there is a certain amount of pages we consider
* available for the page cache, which is essentially the number of
* free and reclaimable pages, minus some zone reserves to protect
* lowmem and the ability to uphold the zone's watermarks without
* requiring writeback.
*
* This number of dirtyable pages is the base value of which the
* user-configurable dirty ratio is the effective number of pages that
* are allowed to be actually dirtied. Per individual zone, or
* globally by using the sum of dirtyable pages over all zones.
*
* Because the user is allowed to specify the dirty limit globally as
* absolute number of bytes, calculating the per-zone dirty limit can
* require translating the configured limit into a percentage of
* global dirtyable memory first.
*/
/**
* node_dirtyable_memory - number of dirtyable pages in a node
* @pgdat: the node
*
* Return: the node's number of pages potentially available for dirty
* page cache. This is the base value for the per-node dirty limits.
*/
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{
unsigned long nr_pages = 0;
int z;
for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = pgdat->node_zones + z;
if (!populated_zone(zone))
continue;
nr_pages += zone_page_state(zone, NR_FREE_PAGES);
}
/*
* Pages reserved for the kernel should not be considered
* dirtyable, to prevent a situation where reclaim has to
* clean pages in order to balance the zones.
*/
nr_pages -= min(nr_pages, pgdat->totalreserve_pages);
nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);
return nr_pages;
}
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
int node;
unsigned long x = 0;
int i;
for_each_node_state(node, N_HIGH_MEMORY) {
for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
struct zone *z;
unsigned long nr_pages;
if (!is_highmem_idx(i))
continue;
z = &NODE_DATA(node)->node_zones[i];
if (!populated_zone(z))
continue;
nr_pages = zone_page_state(z, NR_FREE_PAGES);
/* watch for underflows */
nr_pages -= min(nr_pages, high_wmark_pages(z));
nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
x += nr_pages;
}
}
/*
* Unreclaimable memory (kernel memory or anonymous memory
* without swap) can bring down the dirtyable pages below
* the zone's dirty balance reserve and the above calculation
* will underflow. However we still want to add in nodes
* which are below threshold (negative values) to get a more
* accurate calculation but make sure that the total never
* underflows.
*/
if ((long)x < 0)
x = 0;
/*
* Make sure that the number of highmem pages is never larger
* than the number of the total dirtyable memory. This can only
* occur in very strange VM situations but we want to make sure
* that this does not occur.
*/
return min(x, total);
#else
return 0;
#endif
}
/**
* global_dirtyable_memory - number of globally dirtyable pages
*
* Return: the global number of pages potentially available for dirty
* page cache. This is the base value for the global dirty limits.
*/
static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
x = global_zone_page_state(NR_FREE_PAGES);
/*
* Pages reserved for the kernel should not be considered
* dirtyable, to prevent a situation where reclaim has to
* clean pages in order to balance the zones.
*/
x -= min(x, totalreserve_pages);
x += global_node_page_state(NR_INACTIVE_FILE);
x += global_node_page_state(NR_ACTIVE_FILE);
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
return x + 1; /* Ensure that we never return 0 */
}
/**
* domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
* @dtc: dirty_throttle_control of interest
*
* Calculate @dtc->thresh and ->bg_thresh considering
* vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
* must ensure that @dtc->avail is set before calling this function. The
* dirty limits will be lifted by 1/4 for real-time tasks.
*/
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
const unsigned long available_memory = dtc->avail;
struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
unsigned long bytes = vm_dirty_bytes;
unsigned long bg_bytes = dirty_background_bytes;
/* convert ratios to per-PAGE_SIZE for higher precision */
unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
unsigned long thresh;
unsigned long bg_thresh;
struct task_struct *tsk;
/* gdtc is !NULL iff @dtc is for memcg domain */
if (gdtc) {
unsigned long global_avail = gdtc->avail;
/*
* The byte settings can't be applied directly to memcg
* domains. Convert them to ratios by scaling against
* globally available memory. As the ratios are in
* per-PAGE_SIZE, they can be obtained by dividing bytes by
* number of pages.
*/
if (bytes)
ratio = min(DIV_ROUND_UP(bytes, global_avail),
PAGE_SIZE);
if (bg_bytes)
bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
PAGE_SIZE);
bytes = bg_bytes = 0;
}
if (bytes)
thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
else
thresh = (ratio * available_memory) / PAGE_SIZE;
if (bg_bytes) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
else
bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
if (bg_thresh >= thresh) bg_thresh = thresh / 2;
tsk = current;
if (rt_task(tsk)) {
bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
}
dtc->thresh = thresh;
dtc->bg_thresh = bg_thresh;
/* we should eventually report the domain in the TP */
if (!gdtc)
trace_global_dirty_state(bg_thresh, thresh);
}
/**
* global_dirty_limits - background-writeback and dirty-throttling thresholds
* @pbackground: out parameter for bg_thresh
* @pdirty: out parameter for thresh
*
* Calculate bg_thresh and thresh for global_wb_domain. See
* domain_dirty_limits() for details.
*/
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
gdtc.avail = global_dirtyable_memory();
domain_dirty_limits(&gdtc);
*pbackground = gdtc.bg_thresh;
*pdirty = gdtc.thresh;
}
/**
* node_dirty_limit - maximum number of dirty pages allowed in a node
* @pgdat: the node
*
* Return: the maximum number of dirty pages allowed in a node, based
* on the node's dirtyable memory.
*/
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{
unsigned long node_memory = node_dirtyable_memory(pgdat);
struct task_struct *tsk = current;
unsigned long dirty;
if (vm_dirty_bytes)
dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
node_memory / global_dirtyable_memory();
else
dirty = vm_dirty_ratio * node_memory / 100; if (rt_task(tsk)) dirty += dirty / 4;
return dirty;
}
/**
* node_dirty_ok - tells whether a node is within its dirty limits
* @pgdat: the node to check
*
* Return: %true when the dirty pages in @pgdat are within the node's
* dirty limit, %false if the limit is exceeded.
*/
bool node_dirty_ok(struct pglist_data *pgdat)
{
unsigned long limit = node_dirty_limit(pgdat);
unsigned long nr_pages = 0;
nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
nr_pages += node_page_state(pgdat, NR_WRITEBACK);
return nr_pages <= limit;
}
int dirty_background_ratio_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
dirty_background_bytes = 0;
return ret;
}
int dirty_background_bytes_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
dirty_background_ratio = 0;
return ret;
}
int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int old_ratio = vm_dirty_ratio;
int ret;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
writeback_set_ratelimit();
vm_dirty_bytes = 0;
}
return ret;
}
int dirty_bytes_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned long old_bytes = vm_dirty_bytes;
int ret;
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
writeback_set_ratelimit();
vm_dirty_ratio = 0;
}
return ret;
}
static unsigned long wp_next_time(unsigned long cur_time)
{
cur_time += VM_COMPLETIONS_PERIOD_LEN;
/* 0 has a special meaning... */
if (!cur_time)
return 1;
return cur_time;
}
static void wb_domain_writeout_inc(struct wb_domain *dom,
struct fprop_local_percpu *completions,
unsigned int max_prop_frac)
{
__fprop_inc_percpu_max(&dom->completions, completions,
max_prop_frac);
/* First event after period switching was turned off? */
if (unlikely(!dom->period_time)) {
/*
* We can race with other __bdi_writeout_inc calls here but
* it does not cause any harm since the resulting time when
* timer will fire and what is in writeout_period_time will be
* roughly the same.
*/
dom->period_time = wp_next_time(jiffies);
mod_timer(&dom->period_timer, dom->period_time);
}
}
/*
* Increment @wb's writeout completion count and the global writeout
* completion count. Called from test_clear_page_writeback().
*/
static inline void __wb_writeout_inc(struct bdi_writeback *wb)
{
struct wb_domain *cgdom;
inc_wb_stat(wb, WB_WRITTEN);
wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
wb->bdi->max_prop_frac);
cgdom = mem_cgroup_wb_domain(wb);
if (cgdom)
wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
wb->bdi->max_prop_frac);
}
void wb_writeout_inc(struct bdi_writeback *wb)
{
unsigned long flags;
local_irq_save(flags);
__wb_writeout_inc(wb);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);
/*
* On idle system, we can be called long after we scheduled because we use
* deferred timers so count with missed periods.
*/
static void writeout_period(struct timer_list *t)
{
struct wb_domain *dom = from_timer(dom, t, period_timer);
int miss_periods = (jiffies - dom->period_time) /
VM_COMPLETIONS_PERIOD_LEN;
if (fprop_new_period(&dom->completions, miss_periods + 1)) {
dom->period_time = wp_next_time(dom->period_time +
miss_periods * VM_COMPLETIONS_PERIOD_LEN);
mod_timer(&dom->period_timer, dom->period_time);
} else {
/*
* Aging has zeroed all fractions. Stop wasting CPU on period
* updates.
*/
dom->period_time = 0;
}
}
int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
{
memset(dom, 0, sizeof(*dom));
spin_lock_init(&dom->lock);
timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);
dom->dirty_limit_tstamp = jiffies;
return fprop_global_init(&dom->completions, gfp);
}
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom)
{
del_timer_sync(&dom->period_timer);
fprop_global_destroy(&dom->completions);
}
#endif
/*
* bdi_min_ratio keeps the sum of the minimum dirty shares of all
* registered backing devices, which, for obvious reasons, can not
* exceed 100%.
*/
static unsigned int bdi_min_ratio;
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
int ret = 0;
spin_lock_bh(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
min_ratio -= bdi->min_ratio;
if (bdi_min_ratio + min_ratio < 100) {
bdi_min_ratio += min_ratio;
bdi->min_ratio += min_ratio;
} else {
ret = -EINVAL;
}
}
spin_unlock_bh(&bdi_lock);
return ret;
}
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
int ret = 0;
if (max_ratio > 100)
return -EINVAL;
spin_lock_bh(&bdi_lock);
if (bdi->min_ratio > max_ratio) {
ret = -EINVAL;
} else {
bdi->max_ratio = max_ratio;
bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
}
spin_unlock_bh(&bdi_lock);
return ret;
}
EXPORT_SYMBOL(bdi_set_max_ratio);
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
unsigned long bg_thresh)
{
return (thresh + bg_thresh) / 2;
}
static unsigned long hard_dirty_limit(struct wb_domain *dom,
unsigned long thresh)
{
return max(thresh, dom->dirty_limit);
}
/*
* Memory which can be further allocated to a memcg domain is capped by
* system-wide clean memory excluding the amount being used in the domain.
*/
static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
unsigned long filepages, unsigned long headroom)
{
struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
unsigned long clean = filepages - min(filepages, mdtc->dirty);
unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
unsigned long other_clean = global_clean - min(global_clean, clean);
mdtc->avail = filepages + min(headroom, other_clean);
}
/**
* __wb_calc_thresh - @wb's share of dirty throttling threshold
* @dtc: dirty_throttle_context of interest
*
* Note that balance_dirty_pages() will only seriously take it as a hard limit
* when sleeping max_pause per page is not enough to keep the dirty pages under
* control. For example, when the device is completely stalled due to some error
* conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
* In the other normal situations, it acts more gently by throttling the tasks
* more (rather than completely block them) when the wb dirty pages go high.
*
* It allocates high/low dirty limits to fast/slow devices, in order to prevent
* - starving fast devices
* - piling up dirty pages (that will take long time to sync) on slow devices
*
* The wb's share of dirty limit will be adapting to its throughput and
* bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
*
* Return: @wb's dirty limit in pages. The term "dirty" in the context of
* dirty balancing includes all PG_dirty and PG_writeback pages.
*/
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
struct wb_domain *dom = dtc_dom(dtc);
unsigned long thresh = dtc->thresh;
u64 wb_thresh;
unsigned long numerator, denominator;
unsigned long wb_min_ratio, wb_max_ratio;
/*
* Calculate this BDI's share of the thresh ratio.
*/
fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
&numerator, &denominator);
wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
wb_thresh *= numerator;
wb_thresh = div64_ul(wb_thresh, denominator);
wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
wb_thresh += (thresh * wb_min_ratio) / 100;
if (wb_thresh > (thresh * wb_max_ratio) / 100)
wb_thresh = thresh * wb_max_ratio / 100;
return wb_thresh;
}
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
.thresh = thresh };
return __wb_calc_thresh(&gdtc);
}
/*
* setpoint - dirty 3
* f(dirty) := 1.0 + (----------------)
* limit - setpoint
*
* it's a 3rd order polynomial that subjects to
*
* (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
* (2) f(setpoint) = 1.0 => the balance point
* (3) f(limit) = 0 => the hard limit
* (4) df/dx <= 0 => negative feedback control
* (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
* => fast response on large errors; small oscillation near setpoint
*/
static long long pos_ratio_polynom(unsigned long setpoint,
unsigned long dirty,
unsigned long limit)
{
long long pos_ratio;
long x;
x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
(limit - setpoint) | 1);
pos_ratio = x;
pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}
/*
* Dirty position control.
*
* (o) global/bdi setpoints
*
* We want the dirty pages be balanced around the global/wb setpoints.
* When the number of dirty pages is higher/lower than the setpoint, the
* dirty position control ratio (and hence task dirty ratelimit) will be
* decreased/increased to bring the dirty pages back to the setpoint.
*
* pos_ratio = 1 << RATELIMIT_CALC_SHIFT
*
* if (dirty < setpoint) scale up pos_ratio
* if (dirty > setpoint) scale down pos_ratio
*
* if (wb_dirty < wb_setpoint) scale up pos_ratio
* if (wb_dirty > wb_setpoint) scale down pos_ratio
*
* task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
*
* (o) global control line
*
* ^ pos_ratio
* |
* | |<===== global dirty control scope ======>|
* 2.0 * * * * * * *
* | .*
* | . *
* | . *
* | . *
* | . *
* | . *
* 1.0 ................................*
* | . . *
* | . . *
* | . . *
* | . . *
* | . . *
* 0 +------------.------------------.----------------------*------------->
* freerun^ setpoint^ limit^ dirty pages
*
* (o) wb control line
*
* ^ pos_ratio
* |
* | *
* | *
* | *
* | *
* | * |<=========== span ============>|
* 1.0 .......................*
* | . *
* | . *
* | . *
* | . *
* | . *
* | . *
* | . *
* | . *
* | . *
* | . *
* | . *
* 1/4 ...............................................* * * * * * * * * * * *
* | . .
* | . .
* | . .
* 0 +----------------------.-------------------------------.------------->
* wb_setpoint^ x_intercept^
*
* The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
* be smoothly throttled down to normal if it starts high in situations like
* - start writing to a slow SD card and a fast disk at the same time. The SD
* card's wb_dirty may rush to many times higher than wb_setpoint.
* - the wb dirty thresh drops quickly due to change of JBOD workload
*/
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
struct bdi_writeback *wb = dtc->wb;
unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long wb_thresh = dtc->wb_thresh;
unsigned long x_intercept;
unsigned long setpoint; /* dirty pages' target balance point */
unsigned long wb_setpoint;
unsigned long span;
long long pos_ratio; /* for scaling up/down the rate limit */
long x;
dtc->pos_ratio = 0;
if (unlikely(dtc->dirty >= limit))
return;
/*
* global setpoint
*
* See comment for pos_ratio_polynom().
*/
setpoint = (freerun + limit) / 2;
pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
/*
* The strictlimit feature is a tool preventing mistrusted filesystems
* from growing a large number of dirty pages before throttling. For
* such filesystems balance_dirty_pages always checks wb counters
* against wb limits. Even if global "nr_dirty" is under "freerun".
* This is especially important for fuse which sets bdi->max_ratio to
* 1% by default. Without strictlimit feature, fuse writeback may
* consume arbitrary amount of RAM because it is accounted in
* NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
*
* Here, in wb_position_ratio(), we calculate pos_ratio based on
* two values: wb_dirty and wb_thresh. Let's consider an example:
* total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
* limits are set by default to 10% and 20% (background and throttle).
* Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
* wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
* about ~6K pages (as the average of background and throttle wb
* limits). The 3rd order polynomial will provide positive feedback if
* wb_dirty is under wb_setpoint and vice versa.
*
* Note, that we cannot use global counters in these calculations
* because we want to throttle process writing to a strictlimit wb
* much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
* in the example above).
*/
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
long long wb_pos_ratio;
if (dtc->wb_dirty < 8) { dtc->pos_ratio = min_t(long long, pos_ratio * 2,
2 << RATELIMIT_CALC_SHIFT);
return;
}
if (dtc->wb_dirty >= wb_thresh)
return;
wb_setpoint = dirty_freerun_ceiling(wb_thresh,
dtc->wb_bg_thresh);
if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
return;
wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
wb_thresh);
/*
* Typically, for strictlimit case, wb_setpoint << setpoint
* and pos_ratio >> wb_pos_ratio. In the other words global
* state ("dirty") is not limiting factor and we have to
* make decision based on wb counters. But there is an
* important case when global pos_ratio should get precedence:
* global limits are exceeded (e.g. due to activities on other
* wb's) while given strictlimit wb is below limit.
*
* "pos_ratio * wb_pos_ratio" would work for the case above,
* but it would look too non-natural for the case of all
* activity in the system coming from a single strictlimit wb
* with bdi->max_ratio == 100%.
*
* Note that min() below somewhat changes the dynamics of the
* control system. Normally, pos_ratio value can be well over 3
* (when globally we are at freerun and wb is well below wb
* setpoint). Now the maximum pos_ratio in the same situation
* is 2. We might want to tweak this if we observe the control
* system is too slow to adapt.
*/
dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
return;
}
/*
* We have computed basic pos_ratio above based on global situation. If
* the wb is over/under its share of dirty pages, we want to scale
* pos_ratio further down/up. That is done by the following mechanism.
*/
/*
* wb setpoint
*
* f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
*
* x_intercept - wb_dirty
* := --------------------------
* x_intercept - wb_setpoint
*
* The main wb control line is a linear function that subjects to
*
* (1) f(wb_setpoint) = 1.0
* (2) k = - 1 / (8 * write_bw) (in single wb case)
* or equally: x_intercept = wb_setpoint + 8 * write_bw
*
* For single wb case, the dirty pages are observed to fluctuate
* regularly within range
* [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
* for various filesystems, where (2) can yield in a reasonable 12.5%
* fluctuation range for pos_ratio.
*
* For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
* own size, so move the slope over accordingly and choose a slope that
* yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
*/
if (unlikely(wb_thresh > dtc->thresh))
wb_thresh = dtc->thresh;
/*
* It's very possible that wb_thresh is close to 0 not because the
* device is slow, but that it has remained inactive for long time.
* Honour such devices a reasonable good (hopefully IO efficient)
* threshold, so that the occasional writes won't be blocked and active
* writes can rampup the threshold quickly.
*/
wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
/*
* scale global setpoint to wb's:
* wb_setpoint = setpoint * wb_thresh / thresh
*/
x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
wb_setpoint = setpoint * (u64)x >> 16;
/*
* Use span=(8*write_bw) in single wb case as indicated by
* (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
*
* wb_thresh thresh - wb_thresh
* span = --------- * (8 * write_bw) + ------------------ * wb_thresh
* thresh thresh
*/
span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
x_intercept = wb_setpoint + span;
if (dtc->wb_dirty < x_intercept - span / 4) {
pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
(x_intercept - wb_setpoint) | 1);
} else
pos_ratio /= 4;
/*
* wb reserve area, safeguard against dirty pool underrun and disk idle
* It may push the desired control point of global dirty pages higher
* than setpoint.
*/
x_intercept = wb_thresh / 2; if (dtc->wb_dirty < x_intercept) { if (dtc->wb_dirty > x_intercept / 8) pos_ratio = div_u64(pos_ratio * x_intercept,
dtc->wb_dirty);
else
pos_ratio *= 8;
}
dtc->pos_ratio = pos_ratio;
}
static void wb_update_write_bandwidth(struct bdi_writeback *wb,
unsigned long elapsed,
unsigned long written)
{
const unsigned long period = roundup_pow_of_two(3 * HZ);
unsigned long avg = wb->avg_write_bandwidth;
unsigned long old = wb->write_bandwidth;
u64 bw;
/*
* bw = written * HZ / elapsed
*
* bw * elapsed + write_bandwidth * (period - elapsed)
* write_bandwidth = ---------------------------------------------------
* period
*
* @written may have decreased due to account_page_redirty().
* Avoid underflowing @bw calculation.
*/
bw = written - min(written, wb->written_stamp);
bw *= HZ;
if (unlikely(elapsed > period)) {
bw = div64_ul(bw, elapsed);
avg = bw;
goto out;
}
bw += (u64)wb->write_bandwidth * (period - elapsed);
bw >>= ilog2(period);
/*
* one more level of smoothing, for filtering out sudden spikes
*/
if (avg > old && old >= (unsigned long)bw) avg -= (avg - old) >> 3; if (avg < old && old <= (unsigned long)bw) avg += (old - avg) >> 3;
out:
/* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
avg = max(avg, 1LU);
if (wb_has_dirty_io(wb)) {
long delta = avg - wb->avg_write_bandwidth; WARN_ON_ONCE(atomic_long_add_return(delta,
&wb->bdi->tot_write_bandwidth) <= 0);
}
wb->write_bandwidth = bw;
WRITE_ONCE(wb->avg_write_bandwidth, avg);
}
static void update_dirty_limit(struct dirty_throttle_control *dtc)
{
struct wb_domain *dom = dtc_dom(dtc);
unsigned long thresh = dtc->thresh;
unsigned long limit = dom->dirty_limit;
/*
* Follow up in one step.
*/
if (limit < thresh) {
limit = thresh;
goto update;
}
/*
* Follow down slowly. Use the higher one as the target, because thresh
* may drop below dirty. This is exactly the reason to introduce
* dom->dirty_limit which is guaranteed to lie above the dirty pages.
*/
thresh = max(thresh, dtc->dirty);
if (limit > thresh) {
limit -= (limit - thresh) >> 5;
goto update;
}
return;
update:
dom->dirty_limit = limit;
}
static void domain_update_dirty_limit(struct dirty_throttle_control *dtc,
unsigned long now)
{
struct wb_domain *dom = dtc_dom(dtc);
/*
* check locklessly first to optimize away locking for the most time
*/
if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
return;
spin_lock(&dom->lock);
if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
update_dirty_limit(dtc); dom->dirty_limit_tstamp = now;
}
spin_unlock(&dom->lock);
}
/*
* Maintain wb->dirty_ratelimit, the base dirty throttle rate.
*
* Normal wb tasks will be curbed at or below it in long term.
* Obviously it should be around (write_bw / N) when there are N dd tasks.
*/
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
unsigned long dirtied,
unsigned long elapsed)
{
struct bdi_writeback *wb = dtc->wb;
unsigned long dirty = dtc->dirty;
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long setpoint = (freerun + limit) / 2;
unsigned long write_bw = wb->avg_write_bandwidth;
unsigned long dirty_ratelimit = wb->dirty_ratelimit;
unsigned long dirty_rate;
unsigned long task_ratelimit;
unsigned long balanced_dirty_ratelimit;
unsigned long step;
unsigned long x;
unsigned long shift;
/*
* The dirty rate will match the writeout rate in long term, except
* when dirty pages are truncated by userspace or re-dirtied by FS.
*/
dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
/*
* task_ratelimit reflects each dd's dirty rate for the past 200ms.
*/
task_ratelimit = (u64)dirty_ratelimit *
dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
/*
* A linear estimation of the "balanced" throttle rate. The theory is,
* if there are N dd tasks, each throttled at task_ratelimit, the wb's
* dirty_rate will be measured to be (N * task_ratelimit). So the below
* formula will yield the balanced rate limit (write_bw / N).
*
* Note that the expanded form is not a pure rate feedback:
* rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
* but also takes pos_ratio into account:
* rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
*
* (1) is not realistic because pos_ratio also takes part in balancing
* the dirty rate. Consider the state
* pos_ratio = 0.5 (3)
* rate = 2 * (write_bw / N) (4)
* If (1) is used, it will stuck in that state! Because each dd will
* be throttled at
* task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
* yielding
* dirty_rate = N * task_ratelimit = write_bw (6)
* put (6) into (1) we get
* rate_(i+1) = rate_(i) (7)
*
* So we end up using (2) to always keep
* rate_(i+1) ~= (write_bw / N) (8)
* regardless of the value of pos_ratio. As long as (8) is satisfied,
* pos_ratio is able to drive itself to 1.0, which is not only where
* the dirty count meet the setpoint, but also where the slope of
* pos_ratio is most flat and hence task_ratelimit is least fluctuated.
*/
balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
dirty_rate | 1);
/*
* balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
*/
if (unlikely(balanced_dirty_ratelimit > write_bw))
balanced_dirty_ratelimit = write_bw;
/*
* We could safely do this and return immediately:
*
* wb->dirty_ratelimit = balanced_dirty_ratelimit;
*
* However to get a more stable dirty_ratelimit, the below elaborated
* code makes use of task_ratelimit to filter out singular points and
* limit the step size.
*
* The below code essentially only uses the relative value of
*
* task_ratelimit - dirty_ratelimit
* = (pos_ratio - 1) * dirty_ratelimit
*
* which reflects the direction and size of dirty position error.
*/
/*
* dirty_ratelimit will follow balanced_dirty_ratelimit iff
* task_ratelimit is on the same side of dirty_ratelimit, too.
* For example, when
* - dirty_ratelimit > balanced_dirty_ratelimit
* - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
* lowering dirty_ratelimit will help meet both the position and rate
* control targets. Otherwise, don't update dirty_ratelimit if it will
* only help meet the rate target. After all, what the users ultimately
* feel and care are stable dirty rate and small position error.
*
* |task_ratelimit - dirty_ratelimit| is used to limit the step size
* and filter out the singular points of balanced_dirty_ratelimit. Which
* keeps jumping around randomly and can even leap far away at times
* due to the small 200ms estimation period of dirty_rate (we want to
* keep that period small to reduce time lags).
*/
step = 0;
/*
* For strictlimit case, calculations above were based on wb counters
* and limits (starting from pos_ratio = wb_position_ratio() and up to
* balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
* Hence, to calculate "step" properly, we have to use wb_dirty as
* "dirty" and wb_setpoint as "setpoint".
*
* We rampup dirty_ratelimit forcibly if wb_dirty is low because
* it's possible that wb_thresh is close to zero due to inactivity
* of backing device.
*/
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
dirty = dtc->wb_dirty;
if (dtc->wb_dirty < 8)
setpoint = dtc->wb_dirty + 1;
else
setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
}
if (dirty < setpoint) { x = min3(wb->balanced_dirty_ratelimit,
balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit < x)
step = x - dirty_ratelimit;
} else {
x = max3(wb->balanced_dirty_ratelimit,
balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit > x)
step = dirty_ratelimit - x;
}
/*
* Don't pursue 100% rate matching. It's impossible since the balanced
* rate itself is constantly fluctuating. So decrease the track speed
* when it gets close to the target. Helps eliminate pointless tremors.
*/
shift = dirty_ratelimit / (2 * step + 1);
if (shift < BITS_PER_LONG) step = DIV_ROUND_UP(step >> shift, 8);
else
step = 0;
if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step;
else
dirty_ratelimit -= step; WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL));
wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
}
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
struct dirty_throttle_control *mdtc,
bool update_ratelimit)
{
struct bdi_writeback *wb = gdtc->wb;
unsigned long now = jiffies;
unsigned long elapsed;
unsigned long dirtied;
unsigned long written;
spin_lock(&wb->list_lock);
/*
* Lockless checks for elapsed time are racy and delayed update after
* IO completion doesn't do it at all (to make sure written pages are
* accounted reasonably quickly). Make sure elapsed >= 1 to avoid
* division errors.
*/
elapsed = max(now - wb->bw_time_stamp, 1UL);
dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
if (update_ratelimit) {
domain_update_dirty_limit(gdtc, now);
wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
/*
* @mdtc is always NULL if !CGROUP_WRITEBACK but the
* compiler has no way to figure that out. Help it.
*/
if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
domain_update_dirty_limit(mdtc, now);
wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
}
}
wb_update_write_bandwidth(wb, elapsed, written);
wb->dirtied_stamp = dirtied;
wb->written_stamp = written;
WRITE_ONCE(wb->bw_time_stamp, now);
spin_unlock(&wb->list_lock);
}
void wb_update_bandwidth(struct bdi_writeback *wb)
{
struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
__wb_update_bandwidth(&gdtc, NULL, false);
}
/* Interval after which we consider wb idle and don't estimate bandwidth */
#define WB_BANDWIDTH_IDLE_JIF (HZ)
static void wb_bandwidth_estimate_start(struct bdi_writeback *wb)
{
unsigned long now = jiffies;
unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp);
if (elapsed > WB_BANDWIDTH_IDLE_JIF &&
!atomic_read(&wb->writeback_inodes)) {
spin_lock(&wb->list_lock);
wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED);
wb->written_stamp = wb_stat(wb, WB_WRITTEN);
WRITE_ONCE(wb->bw_time_stamp, now);
spin_unlock(&wb->list_lock);
}
}
/*
* After a task dirtied this many pages, balance_dirty_pages_ratelimited()
* will look to see if it needs to start dirty throttling.
*
* If dirty_poll_interval is too low, big NUMA machines will call the expensive
* global_zone_page_state() too often. So scale it near-sqrt to the safety margin
* (the number of pages we may dirty without exceeding the dirty limits).
*/
static unsigned long dirty_poll_interval(unsigned long dirty,
unsigned long thresh)
{
if (thresh > dirty) return 1UL << (ilog2(thresh - dirty) >> 1);
return 1;
}
static unsigned long wb_max_pause(struct bdi_writeback *wb,
unsigned long wb_dirty)
{
unsigned long bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long t;
/*
* Limit pause time for small memory systems. If sleeping for too long
* time, a small pool of dirty/writeback pages may go empty and disk go
* idle.
*
* 8 serves as the safety ratio.
*/
t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
t++;
return min_t(unsigned long, t, MAX_PAUSE);
}
static long wb_min_pause(struct bdi_writeback *wb,
long max_pause,
unsigned long task_ratelimit,
unsigned long dirty_ratelimit,
int *nr_dirtied_pause)
{
long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth));
long lo = ilog2(READ_ONCE(wb->dirty_ratelimit));
long t; /* target pause */
long pause; /* estimated next pause */
int pages; /* target nr_dirtied_pause */
/* target for 10ms pause on 1-dd case */
t = max(1, HZ / 100);
/*
* Scale up pause time for concurrent dirtiers in order to reduce CPU
* overheads.
*
* (N * 10ms) on 2^N concurrent tasks.
*/
if (hi > lo)
t += (hi - lo) * (10 * HZ) / 1024;
/*
* This is a bit convoluted. We try to base the next nr_dirtied_pause
* on the much more stable dirty_ratelimit. However the next pause time
* will be computed based on task_ratelimit and the two rate limits may
* depart considerably at some time. Especially if task_ratelimit goes
* below dirty_ratelimit/2 and the target pause is max_pause, the next
* pause time will be max_pause*2 _trimmed down_ to max_pause. As a
* result task_ratelimit won't be executed faithfully, which could
* eventually bring down dirty_ratelimit.
*
* We apply two rules to fix it up:
* 1) try to estimate the next pause time and if necessary, use a lower
* nr_dirtied_pause so as not to exceed max_pause. When this happens,
* nr_dirtied_pause will be "dancing" with task_ratelimit.
* 2) limit the target pause time to max_pause/2, so that the normal
* small fluctuations of task_ratelimit won't trigger rule (1) and
* nr_dirtied_pause will remain as stable as dirty_ratelimit.
*/
t = min(t, 1 + max_pause / 2);
pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
/*
* Tiny nr_dirtied_pause is found to hurt I/O performance in the test
* case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
* When the 16 consecutive reads are often interrupted by some dirty
* throttling pause during the async writes, cfq will go into idles
* (deadline is fine). So push nr_dirtied_pause as high as possible
* until reaches DIRTY_POLL_THRESH=32 pages.
*/
if (pages < DIRTY_POLL_THRESH) {
t = max_pause;
pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
if (pages > DIRTY_POLL_THRESH) {
pages = DIRTY_POLL_THRESH;
t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
}
}
pause = HZ * pages / (task_ratelimit + 1);
if (pause > max_pause) {
t = max_pause;
pages = task_ratelimit * t / roundup_pow_of_two(HZ);
}
*nr_dirtied_pause = pages;
/*
* The minimal pause time will normally be half the target pause time.
*/
return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}
static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
struct bdi_writeback *wb = dtc->wb;
unsigned long wb_reclaimable;
/*
* wb_thresh is not treated as some limiting factor as
* dirty_thresh, due to reasons
* - in JBOD setup, wb_thresh can fluctuate a lot
* - in a system with HDD and USB key, the USB key may somehow
* go into state (wb_dirty >> wb_thresh) either because
* wb_dirty starts high, or because wb_thresh drops low.
* In this case we don't want to hard throttle the USB key
* dirtiers for 100 seconds until wb_dirty drops under
* wb_thresh. Instead the auxiliary wb control line in
* wb_position_ratio() will let the dirtier task progress
* at some rate <= (write_bw / 2) for bringing down wb_dirty.
*/
dtc->wb_thresh = __wb_calc_thresh(dtc); dtc->wb_bg_thresh = dtc->thresh ? div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
/*
* In order to avoid the stacked BDI deadlock we need
* to ensure we accurately count the 'dirty' pages when
* the threshold is low.
*
* Otherwise it would be possible to get thresh+n pages
* reported dirty, even though there are thresh-m pages
* actually dirty; with m+n sitting in the percpu
* deltas.
*/
if (dtc->wb_thresh < 2 * wb_stat_error()) {
wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
} else {
wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
}
}
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
* the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
* If we're over `background_thresh' then the writeback threads are woken to
* perform some writeout.
*/
static void balance_dirty_pages(struct bdi_writeback *wb,
unsigned long pages_dirtied)
{
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
struct dirty_throttle_control * const gdtc = &gdtc_stor;
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
struct dirty_throttle_control *sdtc;
unsigned long nr_reclaimable; /* = file_dirty */
long period;
long pause;
long max_pause;
long min_pause;
int nr_dirtied_pause;
bool dirty_exceeded = false;
unsigned long task_ratelimit;
unsigned long dirty_ratelimit;
struct backing_dev_info *bdi = wb->bdi;
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
for (;;) {
unsigned long now = jiffies;
unsigned long dirty, thresh, bg_thresh;
unsigned long m_dirty = 0; /* stop bogus uninit warnings */
unsigned long m_thresh = 0;
unsigned long m_bg_thresh = 0;
nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
gdtc->avail = global_dirtyable_memory();
gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
domain_dirty_limits(gdtc);
if (unlikely(strictlimit)) {
wb_dirty_limits(gdtc);
dirty = gdtc->wb_dirty;
thresh = gdtc->wb_thresh;
bg_thresh = gdtc->wb_bg_thresh;
} else {
dirty = gdtc->dirty;
thresh = gdtc->thresh;
bg_thresh = gdtc->bg_thresh;
}
if (mdtc) {
unsigned long filepages, headroom, writeback;
/*
* If @wb belongs to !root memcg, repeat the same
* basic calculations for the memcg domain.
*/
mem_cgroup_wb_stats(wb, &filepages, &headroom,
&mdtc->dirty, &writeback);
mdtc->dirty += writeback;
mdtc_calc_avail(mdtc, filepages, headroom);
domain_dirty_limits(mdtc);
if (unlikely(strictlimit)) {
wb_dirty_limits(mdtc);
m_dirty = mdtc->wb_dirty;
m_thresh = mdtc->wb_thresh;
m_bg_thresh = mdtc->wb_bg_thresh;
} else {
m_dirty = mdtc->dirty;
m_thresh = mdtc->thresh;
m_bg_thresh = mdtc->bg_thresh;
}
}
/*
* Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts
* when the wb limits are ramping up in case of !strictlimit.
*
* In strictlimit case make decision based on the wb counters
* and limits. Small writeouts when the wb limits are ramping
* up are the price we consciously pay for strictlimit-ing.
*
* If memcg domain is in effect, @dirty should be under
* both global and memcg freerun ceilings.
*/
if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
(!mdtc ||
m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
unsigned long intv;
unsigned long m_intv;
free_running:
intv = dirty_poll_interval(dirty, thresh);
m_intv = ULONG_MAX;
current->dirty_paused_when = now;
current->nr_dirtied = 0;
if (mdtc)
m_intv = dirty_poll_interval(m_dirty, m_thresh);
current->nr_dirtied_pause = min(intv, m_intv);
break;
}
if (unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb);
mem_cgroup_flush_foreign(wb);
/*
* Calculate global domain's pos_ratio and select the
* global dtc by default.
*/
if (!strictlimit) {
wb_dirty_limits(gdtc);
if ((current->flags & PF_LOCAL_THROTTLE) &&
gdtc->wb_dirty <
dirty_freerun_ceiling(gdtc->wb_thresh,
gdtc->wb_bg_thresh))
/*
* LOCAL_THROTTLE tasks must not be throttled
* when below the per-wb freerun ceiling.
*/
goto free_running;
}
dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && ((gdtc->dirty > gdtc->thresh) || strictlimit);
wb_position_ratio(gdtc);
sdtc = gdtc;
if (mdtc) {
/*
* If memcg domain is in effect, calculate its
* pos_ratio. @wb should satisfy constraints from
* both global and memcg domains. Choose the one
* w/ lower pos_ratio.
*/
if (!strictlimit) {
wb_dirty_limits(mdtc);
if ((current->flags & PF_LOCAL_THROTTLE) &&
mdtc->wb_dirty <
dirty_freerun_ceiling(mdtc->wb_thresh,
mdtc->wb_bg_thresh))
/*
* LOCAL_THROTTLE tasks must not be
* throttled when below the per-wb
* freerun ceiling.
*/
goto free_running;
}
dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
((mdtc->dirty > mdtc->thresh) || strictlimit);
wb_position_ratio(mdtc);
if (mdtc->pos_ratio < gdtc->pos_ratio)
sdtc = mdtc;
}
if (dirty_exceeded && !wb->dirty_exceeded) wb->dirty_exceeded = 1; if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
BANDWIDTH_INTERVAL))
__wb_update_bandwidth(gdtc, mdtc, true);
/* throttle according to the chosen dtc */
dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
RATELIMIT_CALC_SHIFT;
max_pause = wb_max_pause(wb, sdtc->wb_dirty);
min_pause = wb_min_pause(wb, max_pause,
task_ratelimit, dirty_ratelimit,
&nr_dirtied_pause);
if (unlikely(task_ratelimit == 0)) {
period = max_pause;
pause = max_pause;
goto pause;
}
period = HZ * pages_dirtied / task_ratelimit;
pause = period;
if (current->dirty_paused_when)
pause -= now - current->dirty_paused_when;
/*
* For less than 1s think time (ext3/4 may block the dirtier
* for up to 800ms from time to time on 1-HDD; so does xfs,
* however at much less frequency), try to compensate it in
* future periods by updating the virtual time; otherwise just
* do a reset, as it may be a light dirtier.
*/
if (pause < min_pause) { trace_balance_dirty_pages(wb,
sdtc->thresh,
sdtc->bg_thresh,
sdtc->dirty,
sdtc->wb_thresh,
sdtc->wb_dirty,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
period,
min(pause, 0L),
start_time);
if (pause < -HZ) {
current->dirty_paused_when = now;
current->nr_dirtied = 0;
} else if (period) {
current->dirty_paused_when += period;
current->nr_dirtied = 0;
} else if (current->nr_dirtied_pause <= pages_dirtied)
current->nr_dirtied_pause += pages_dirtied;
break;
}
if (unlikely(pause > max_pause)) {
/* for occasional dropped task_ratelimit */
now += min(pause - max_pause, max_pause);
pause = max_pause;
}
pause:
trace_balance_dirty_pages(wb,
sdtc->thresh,
sdtc->bg_thresh,
sdtc->dirty,
sdtc->wb_thresh,
sdtc->wb_dirty,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
period,
pause,
start_time);
__set_current_state(TASK_KILLABLE);
wb->dirty_sleep = now;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
current->nr_dirtied = 0;
current->nr_dirtied_pause = nr_dirtied_pause;
/*
* This is typically equal to (dirty < thresh) and can also
* keep "1000+ dd on a slow USB stick" under control.
*/
if (task_ratelimit)
break;
/*
* In the case of an unresponsive NFS server and the NFS dirty
* pages exceeds dirty_thresh, give the other good wb's a pipe
* to go through, so that tasks on them still remain responsive.
*
* In theory 1 page is enough to keep the consumer-producer
* pipe going: the flusher cleans 1 page => the task dirties 1
* more page. However wb_dirty has accounting errors. So use
* the larger and more IO friendly wb_stat_error.
*/
if (sdtc->wb_dirty <= wb_stat_error())
break;
if (fatal_signal_pending(current))
break;
}
if (!dirty_exceeded && wb->dirty_exceeded) wb->dirty_exceeded = 0;
if (writeback_in_progress(wb))
return;
/*
* In laptop mode, we wait until hitting the higher threshold before
* starting background writeout, and then write out all the way down
* to the lower threshold. So slow writers cause minimal disk activity.
*
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
if (laptop_mode)
return;
if (nr_reclaimable > gdtc->bg_thresh) wb_start_background_writeback(wb);
}
static DEFINE_PER_CPU(int, bdp_ratelimits);
/*
* Normal tasks are throttled by
* loop {
* dirty tsk->nr_dirtied_pause pages;
* take a snap in balance_dirty_pages();
* }
* However there is a worst case. If every task exit immediately when dirtied
* (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
* called to throttle the page dirties. The solution is to save the not yet
* throttled page dirties in dirty_throttle_leaks on task exit and charge them
* randomly into the running tasks. This works well for the above worst case,
* as the new task will pick up and accumulate the old task's leaked dirty
* count and eventually get throttled.
*/
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
/**
* balance_dirty_pages_ratelimited - balance dirty memory state
* @mapping: address_space which was dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
* Once we're over the dirty memory limit we decrease the ratelimiting
* by a lot, to prevent individual processes from overshooting the limit
* by (ratelimit_pages) each.
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct bdi_writeback *wb = NULL;
int ratelimit;
int *p;
if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
return;
if (inode_cgwb_enabled(inode))
wb = wb_get_create_current(bdi, GFP_KERNEL);
if (!wb)
wb = &bdi->wb;
ratelimit = current->nr_dirtied_pause;
if (wb->dirty_exceeded)
ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); preempt_disable();
/*
* This prevents one CPU to accumulate too many dirtied pages without
* calling into balance_dirty_pages(), which can happen when there are
* 1000+ tasks, all of them start dirtying pages at exactly the same
* time, hence all honoured too large initial task->nr_dirtied_pause.
*/
p = this_cpu_ptr(&bdp_ratelimits);
if (unlikely(current->nr_dirtied >= ratelimit))
*p = 0; else if (unlikely(*p >= ratelimit_pages)) { *p = 0;
ratelimit = 0;
}
/*
* Pick up the dirtied pages by the exited tasks. This avoids lots of
* short-lived tasks (eg. gcc invocations in a kernel build) escaping
* the dirty throttling and livelock other long-run dirtiers.
*/
p = this_cpu_ptr(&dirty_throttle_leaks);
if (*p > 0 && current->nr_dirtied < ratelimit) {
unsigned long nr_pages_dirtied;
nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
*p -= nr_pages_dirtied;
current->nr_dirtied += nr_pages_dirtied;
}
preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit))
balance_dirty_pages(wb, current->nr_dirtied);
wb_put(wb);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
/**
* wb_over_bg_thresh - does @wb need to be written back?
* @wb: bdi_writeback of interest
*
* Determines whether background writeback should keep writing @wb or it's
* clean enough.
*
* Return: %true if writeback should continue.
*/
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
struct dirty_throttle_control * const gdtc = &gdtc_stor;
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
unsigned long reclaimable;
unsigned long thresh;
/*
* Similar to balance_dirty_pages() but ignores pages being written
* as we're trying to decide whether to put more under writeback.
*/
gdtc->avail = global_dirtyable_memory();
gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
domain_dirty_limits(gdtc);
if (gdtc->dirty > gdtc->bg_thresh)
return true;
thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh);
if (thresh < 2 * wb_stat_error())
reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
else
reclaimable = wb_stat(wb, WB_RECLAIMABLE);
if (reclaimable > thresh)
return true;
if (mdtc) {
unsigned long filepages, headroom, writeback;
mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
&writeback);
mdtc_calc_avail(mdtc, filepages, headroom);
domain_dirty_limits(mdtc); /* ditto, ignore writeback */
if (mdtc->dirty > mdtc->bg_thresh)
return true;
thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh);
if (thresh < 2 * wb_stat_error())
reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
else
reclaimable = wb_stat(wb, WB_RECLAIMABLE);
if (reclaimable > thresh)
return true;
}
return false;
}
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
unsigned int old_interval = dirty_writeback_interval;
int ret;
ret = proc_dointvec(table, write, buffer, length, ppos);
/*
* Writing 0 to dirty_writeback_interval will disable periodic writeback
* and a different non-zero value will wakeup the writeback threads.
* wb_wakeup_delayed() would be more appropriate, but it's a pain to
* iterate over all bdis and wbs.
* The reason we do this is to make the change take effect immediately.
*/
if (!ret && write && dirty_writeback_interval &&
dirty_writeback_interval != old_interval)
wakeup_flusher_threads(WB_REASON_PERIODIC);
return ret;
}
void laptop_mode_timer_fn(struct timer_list *t)
{
struct backing_dev_info *backing_dev_info =
from_timer(backing_dev_info, t, laptop_mode_wb_timer);
wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
}
/*
* We've spun up the disk and we're in laptop mode: schedule writeback
* of all dirty data a few seconds from now. If the flush is already scheduled
* then push it back - the user is still using the disk.
*/
void laptop_io_completion(struct backing_dev_info *info)
{
mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
}
/*
* We're in laptop mode and we've just synced. The sync's writes will have
* caused another writeback to be scheduled by laptop_io_completion.
* Nothing needs to be written back anymore, so we unschedule the writeback.
*/
void laptop_sync_completion(void)
{
struct backing_dev_info *bdi;
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
del_timer(&bdi->laptop_mode_wb_timer);
rcu_read_unlock();
}
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
* if a large number of processes all perform writes at the same time.
*
* Here we set ratelimit_pages to a level which ensures that when all CPUs are
* dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
* thresholds.
*/
void writeback_set_ratelimit(void)
{
struct wb_domain *dom = &global_wb_domain;
unsigned long background_thresh;
unsigned long dirty_thresh;
global_dirty_limits(&background_thresh, &dirty_thresh);
dom->dirty_limit = dirty_thresh;
ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
if (ratelimit_pages < 16)
ratelimit_pages = 16;
}
static int page_writeback_cpu_online(unsigned int cpu)
{
writeback_set_ratelimit();
return 0;
}
/*
* Called early on to tune the page writeback dirty limits.
*
* We used to scale dirty pages according to how total memory
* related to pages that could be allocated for buffers.
*
* However, that was when we used "dirty_ratio" to scale with
* all memory, and we don't do that any more. "dirty_ratio"
* is now applied to total non-HIGHPAGE memory, and as such we can't
* get into the old insane situation any more where we had
* large amounts of dirty pages compared to a small amount of
* non-HIGHMEM memory.
*
* But we might still want to scale the dirty_ratio by how
* much memory the box has..
*/
void __init page_writeback_init(void)
{
BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
page_writeback_cpu_online, NULL);
cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
page_writeback_cpu_online);
}
/**
* tag_pages_for_writeback - tag pages to be written by write_cache_pages
* @mapping: address space structure to write
* @start: starting page index
* @end: ending page index (inclusive)
*
* This function scans the page range from @start to @end (inclusive) and tags
* all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
* that write_cache_pages (or whoever calls this function) will then use
* TOWRITE tag to identify pages eligible for writeback. This mechanism is
* used to avoid livelocking of writeback by a process steadily creating new
* dirty pages in the file (thus it is important for this function to be quick
* so that it can tag pages faster than a dirtying process can create them).
*/
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
XA_STATE(xas, &mapping->i_pages, start);
unsigned int tagged = 0;
void *page;
xas_lock_irq(&xas);
xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
if (++tagged % XA_CHECK_SCHED)
continue;
xas_pause(&xas);
xas_unlock_irq(&xas);
cond_resched();
xas_lock_irq(&xas);
}
xas_unlock_irq(&xas);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
/**
* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
* @writepage: function called for each page
* @data: data passed to writepage function
*
* If a page is already under I/O, write_cache_pages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
* and msync() need to guarantee that all the data which was dirty at the time
* the call was made get new I/O started against them. If wbc->sync_mode is
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete.
*
* To avoid livelocks (when other process dirties new pages), we first tag
* pages which should be written back with TOWRITE tag and only then start
* writing them. For data-integrity sync we have to be careful so that we do
* not miss some pages (e.g., because some other process has cleared TOWRITE
* tag we set). The rule we follow is that TOWRITE tag can be cleared only
* by the process clearing the DIRTY tag (and submitting the page for IO).
*
* To avoid deadlocks between range_cyclic writeback and callers that hold
* pages in PageWriteback to aggregate IO until write_cache_pages() returns,
* we do not loop back to the start of the file. Doing so causes a page
* lock/page writeback access order inversion - we should only ever lock
* multiple pages in ascending page->index order, and looping back to the start
* of the file violates that rule and causes deadlocks.
*
* Return: %0 on success, negative error code otherwise
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
void *data)
{
int ret = 0;
int done = 0;
int error;
struct pagevec pvec;
int nr_pages;
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
int range_whole = 0;
xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* prev offset */
end = -1;
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { tag_pages_for_writeback(mapping, index, end);
tag = PAGECACHE_TAG_TOWRITE;
} else {
tag = PAGECACHE_TAG_DIRTY;
}
done_index = index;
while (!done && (index <= end)) {
int i;
nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
tag);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i];
done_index = page->index;
lock_page(page);
/*
* Page truncated or invalidated. We can freely skip it
* then, even for data integrity operations: the page
* has disappeared concurrently, so there could be no
* real expectation of this data integrity operation
* even if there is now a new, dirty page at the same
* pagecache address.
*/
if (unlikely(page->mapping != mapping)) {
continue_unlock:
unlock_page(page);
continue;
}
if (!PageDirty(page)) {
/* someone wrote it for us */
goto continue_unlock;
}
if (PageWriteback(page)) {
if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page);
else
goto continue_unlock;
}
BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page))
goto continue_unlock;
trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); error = (*writepage)(page, wbc, data);
if (unlikely(error)) {
/*
* Handle errors according to the type of
* writeback. There's no need to continue for
* background writeback. Just push done_index
* past this page so media errors won't choke
* writeout for the entire file. For integrity
* writeback, we must process the entire dirty
* set regardless of errors because the fs may
* still have state to clear for each page. In
* that case we continue processing and return
* the first error.
*/
if (error == AOP_WRITEPAGE_ACTIVATE) { unlock_page(page);
error = 0;
} else if (wbc->sync_mode != WB_SYNC_ALL) {
ret = error;
done_index = page->index + 1;
done = 1;
break;
}
if (!ret)
ret = error;
}
/*
* We stop writing back only if we are not doing
* integrity sync. In case of integrity sync we have to
* keep going until we have written all the pages
* we tagged for writeback prior to entering this loop.
*/
if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) {
done = 1;
break;
}
}
pagevec_release(&pvec);
cond_resched();
}
/*
* If we hit the last page and there is more work to be done: wrap
* back the index back to the start of the file for the next
* time we are called.
*/
if (wbc->range_cyclic && !done)
done_index = 0;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; return ret;
}
EXPORT_SYMBOL(write_cache_pages);
/*
* Function used by generic_writepages to call the real writepage
* function and set the mapping flags on error
*/
static int __writepage(struct page *page, struct writeback_control *wbc,
void *data)
{
struct address_space *mapping = data;
int ret = mapping->a_ops->writepage(page, wbc);
mapping_set_error(mapping, ret);
return ret;
}
/**
* generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
*
* This is a library function, which implements the writepages()
* address_space_operation.
*
* Return: %0 on success, negative error code otherwise
*/
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct blk_plug plug;
int ret;
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
return 0;
blk_start_plug(&plug);
ret = write_cache_pages(mapping, wbc, __writepage, mapping);
blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL(generic_writepages);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
struct bdi_writeback *wb;
if (wbc->nr_to_write <= 0)
return 0;
wb = inode_to_wb_wbc(mapping->host, wbc);
wb_bandwidth_estimate_start(wb);
while (1) {
if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc);
else
ret = generic_writepages(mapping, wbc);
if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
break;
cond_resched();
congestion_wait(BLK_RW_ASYNC, HZ/50);
}
/*
* Usually few pages are written by now from those we've just submitted
* but if there's constant writeback being submitted, this makes sure
* writeback bandwidth is updated once in a while.
*/
if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
BANDWIDTH_INTERVAL))
wb_update_bandwidth(wb);
return ret;
}
/**
* write_one_page - write out a single page and wait on I/O
* @page: the page to write
*
* The page must be locked by the caller and will be unlocked upon return.
*
* Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
* function returns.
*
* Return: %0 on success, negative error code otherwise
*/
int write_one_page(struct page *page)
{
struct address_space *mapping = page->mapping;
int ret = 0;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 1,
};
BUG_ON(!PageLocked(page));
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
get_page(page);
ret = mapping->a_ops->writepage(page, &wbc);
if (ret == 0)
wait_on_page_writeback(page);
put_page(page);
} else {
unlock_page(page);
}
if (!ret)
ret = filemap_check_errors(mapping);
return ret;
}
EXPORT_SYMBOL(write_one_page);
/*
* For address_spaces which do not use buffers nor write back.
*/
int __set_page_dirty_no_writeback(struct page *page)
{
if (!PageDirty(page)) return !TestSetPageDirty(page);
return 0;
}
EXPORT_SYMBOL(__set_page_dirty_no_writeback);
/*
* Helper function for set_page_dirty family.
*
* Caller must hold lock_page_memcg().
*
* NOTE: This relies on being atomic wrt interrupts.
*/
static void account_page_dirtied(struct page *page,
struct address_space *mapping)
{
struct inode *inode = mapping->host;
trace_writeback_dirty_page(page, mapping);
if (mapping_can_writeback(mapping)) {
struct bdi_writeback *wb;
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
__inc_lruvec_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
__inc_node_page_state(page, NR_DIRTIED);
inc_wb_stat(wb, WB_RECLAIMABLE);
inc_wb_stat(wb, WB_DIRTIED);
task_io_account_write(PAGE_SIZE);
current->nr_dirtied++;
__this_cpu_inc(bdp_ratelimits);
mem_cgroup_track_foreign_dirty(page, wb);
}
}
/*
* Helper function for deaccounting dirty page without writeback.
*
* Caller must hold lock_page_memcg().
*/
void account_page_cleaned(struct page *page, struct address_space *mapping,
struct bdi_writeback *wb)
{
if (mapping_can_writeback(mapping)) {
dec_lruvec_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
task_io_account_cancelled_write(PAGE_SIZE);
}
}
/*
* Mark the page dirty, and set it dirty in the page cache, and mark the inode
* dirty.
*
* If warn is true, then emit a warning if the page is not uptodate and has
* not been truncated.
*
* The caller must hold lock_page_memcg().
*/
void __set_page_dirty(struct page *page, struct address_space *mapping,
int warn)
{
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
__xa_set_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* the xarray.
*
* This is also used when a single buffer is being dirtied: we want to set the
* page dirty in that case, but not all the buffers. This is a "bottom-up"
* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
*
* The caller must ensure this doesn't race with truncation. Most will simply
* hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
* the pte lock held, which also locks out truncation.
*/
int __set_page_dirty_nobuffers(struct page *page)
{
lock_page_memcg(page);
if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page);
if (!mapping) {
unlock_page_memcg(page);
return 1;
}
__set_page_dirty(page, mapping, !PagePrivate(page));
unlock_page_memcg(page);
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
return 1;
}
unlock_page_memcg(page);
return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
/*
* Call this whenever redirtying a page, to de-account the dirty counters
* (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
* counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
* systematic errors in balanced_dirty_ratelimit and the dirty pages position
* control.
*/
void account_page_redirty(struct page *page)
{
struct address_space *mapping = page->mapping;
if (mapping && mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
wb = unlocked_inode_to_wb_begin(inode, &cookie);
current->nr_dirtied--;
dec_node_page_state(page, NR_DIRTIED);
dec_wb_stat(wb, WB_DIRTIED);
unlocked_inode_to_wb_end(inode, &cookie);
}
}
EXPORT_SYMBOL(account_page_redirty);
/*
* When a writepage implementation decides that it doesn't want to write this
* page for some reason, it should redirty the locked page via
* redirty_page_for_writepage() and it should then unlock the page and return 0
*/
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
int ret;
wbc->pages_skipped++;
ret = __set_page_dirty_nobuffers(page);
account_page_redirty(page);
return ret;
}
EXPORT_SYMBOL(redirty_page_for_writepage);
/*
* Dirty a page.
*
* For pages with a mapping this should be done under the page lock for the
* benefit of asynchronous memory errors who prefer a consistent dirty state.
* This rule can be broken in some special cases, but should be better not to.
*/
int set_page_dirty(struct page *page)
{
struct address_space *mapping = page_mapping(page);
page = compound_head(page);
if (likely(mapping)) {
/*
* readahead/lru_deactivate_page could remain
* PG_readahead/PG_reclaim due to race with end_page_writeback
* About readahead, if the page is written, the flags would be
* reset. So no problem.
* About lru_deactivate_page, if the page is redirty, the flag
* will be reset. So no problem. but if the page is used by readahead
* it will confuse readahead and make it restart the size rampup
* process. But it's a trivial problem.
*/
if (PageReclaim(page))
ClearPageReclaim(page);
return mapping->a_ops->set_page_dirty(page);
}
if (!PageDirty(page)) {
if (!TestSetPageDirty(page))
return 1;
}
return 0;
}
EXPORT_SYMBOL(set_page_dirty);
/*
* set_page_dirty() is racy if the caller has no reference against
* page->mapping->host, and if the page is unlocked. This is because another
* CPU could truncate the page off the mapping and then free the mapping.
*
* Usually, the page _is_ locked, or the caller is a user-space process which
* holds a reference on the inode by having an open file.
*
* In other cases, the page should be locked before running set_page_dirty().
*/
int set_page_dirty_lock(struct page *page)
{
int ret;
lock_page(page);
ret = set_page_dirty(page);
unlock_page(page);
return ret;
}
EXPORT_SYMBOL(set_page_dirty_lock);
/*
* This cancels just the dirty bit on the kernel page itself, it does NOT
* actually remove dirty bits on any mmap's that may be around. It also
* leaves the page tagged dirty, so any sync activity will still find it on
* the dirty lists, and in particular, clear_page_dirty_for_io() will still
* look at the dirty bits in the VM.
*
* Doing this should *normally* only ever be done when a page is truncated,
* and is not actually mapped anywhere at all. However, fs/buffer.c does
* this when it notices that somebody has cleaned out all the buffers on a
* page without actually doing it through the VM. Can you say "ext3 is
* horribly ugly"? Thought you could.
*/
void __cancel_dirty_page(struct page *page)
{
struct address_space *mapping = page_mapping(page);
if (mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
lock_page_memcg(page);
wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (TestClearPageDirty(page))
account_page_cleaned(page, mapping, wb);
unlocked_inode_to_wb_end(inode, &cookie);
unlock_page_memcg(page);
} else {
ClearPageDirty(page);
}
}
EXPORT_SYMBOL(__cancel_dirty_page);
/*
* Clear a page's dirty flag, while caring for dirty memory accounting.
* Returns true if the page was previously dirty.
*
* This is for preparing to put the page under writeout. We leave the page
* tagged as dirty in the xarray so that a concurrent write-for-sync
* can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
* implementation will run either set_page_writeback() or set_page_dirty(),
* at which stage we bring the page's dirty flag and xarray dirty tag
* back into sync.
*
* This incoherency between the page's dirty flag and xarray tag is
* unfortunate, but it only exists while the page is locked.
*/
int clear_page_dirty_for_io(struct page *page)
{
struct address_space *mapping = page_mapping(page);
int ret = 0;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (mapping && mapping_can_writeback(mapping)) { struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
/*
* Yes, Virginia, this is indeed insane.
*
* We use this sequence to make sure that
* (a) we account for dirty stats properly
* (b) we tell the low-level filesystem to
* mark the whole page dirty if it was
* dirty in a pagetable. Only to then
* (c) clean the page again and return 1 to
* cause the writeback.
*
* This way we avoid all nasty races with the
* dirty bit in multiple places and clearing
* them concurrently from different threads.
*
* Note! Normally the "set_page_dirty(page)"
* has no effect on the actual dirty bit - since
* that will already usually be set. But we
* need the side effects, and it can help us
* avoid races.
*
* We basically use the page "master dirty bit"
* as a serialization point for all the different
* threads doing their things.
*/
if (page_mkclean(page))
set_page_dirty(page);
/*
* We carefully synchronise fault handlers against
* installing a dirty pte and marking the page dirty
* at this point. We do this by having them hold the
* page lock while dirtying the page, and pages are
* always locked coming in here, so we get the desired
* exclusion.
*/
wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (TestClearPageDirty(page)) {
dec_lruvec_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
}
unlocked_inode_to_wb_end(inode, &cookie);
return ret;
}
return TestClearPageDirty(page);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);
static void wb_inode_writeback_start(struct bdi_writeback *wb)
{
atomic_inc(&wb->writeback_inodes);
}
static void wb_inode_writeback_end(struct bdi_writeback *wb)
{
atomic_dec(&wb->writeback_inodes);
/*
* Make sure estimate of writeback throughput gets updated after
* writeback completed. We delay the update by BANDWIDTH_INTERVAL
* (which is the interval other bandwidth updates use for batching) so
* that if multiple inodes end writeback at a similar time, they get
* batched into one bandwidth update.
*/
queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
}
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
int ret;
lock_page_memcg(page);
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
ret = TestClearPageWriteback(page);
if (ret) {
__xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
struct bdi_writeback *wb = inode_to_wb(inode);
dec_wb_stat(wb, WB_WRITEBACK);
__wb_writeout_inc(wb);
if (!mapping_tagged(mapping,
PAGECACHE_TAG_WRITEBACK))
wb_inode_writeback_end(wb);
}
}
if (mapping->host && !mapping_tagged(mapping,
PAGECACHE_TAG_WRITEBACK))
sb_clear_inode_writeback(mapping->host);
xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
ret = TestClearPageWriteback(page);
}
if (ret) {
dec_lruvec_page_state(page, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
}
unlock_page_memcg(page);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
int ret, access_ret;
lock_page_memcg(page);
if (mapping && mapping_use_writeback_tags(mapping)) {
XA_STATE(xas, &mapping->i_pages, page_index(page));
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
xas_lock_irqsave(&xas, flags);
xas_load(&xas);
ret = TestSetPageWriteback(page);
if (!ret) {
bool on_wblist;
on_wblist = mapping_tagged(mapping,
PAGECACHE_TAG_WRITEBACK);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
struct bdi_writeback *wb = inode_to_wb(inode);
inc_wb_stat(wb, WB_WRITEBACK);
if (!on_wblist)
wb_inode_writeback_start(wb);
}
/*
* We can come through here when swapping anonymous
* pages, so we don't necessarily have an inode to track
* for sync.
*/
if (mapping->host && !on_wblist) sb_mark_inode_writeback(mapping->host);
}
if (!PageDirty(page))
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irqrestore(&xas, flags);
} else {
ret = TestSetPageWriteback(page);
}
if (!ret) {
inc_lruvec_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
}
unlock_page_memcg(page);
access_ret = arch_make_page_accessible(page);
/*
* If writeback has been triggered on a page that cannot be made
* accessible, it is too late to recover here.
*/
VM_BUG_ON_PAGE(access_ret != 0, page);
return ret;
}
EXPORT_SYMBOL(__test_set_page_writeback);
/*
* Wait for a page to complete writeback
*/
void wait_on_page_writeback(struct page *page)
{
while (PageWriteback(page)) {
trace_wait_on_page_writeback(page, page_mapping(page)); wait_on_page_bit(page, PG_writeback);
}
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback);
/*
* Wait for a page to complete writeback. Returns -EINTR if we get a
* fatal signal while waiting.
*/
int wait_on_page_writeback_killable(struct page *page)
{
while (PageWriteback(page)) {
trace_wait_on_page_writeback(page, page_mapping(page));
if (wait_on_page_bit_killable(page, PG_writeback))
return -EINTR;
}
return 0;
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable);
/**
* wait_for_stable_page() - wait for writeback to finish, if necessary.
* @page: The page to wait on.
*
* This function determines if the given page is related to a backing device
* that requires page contents to be held stable during writeback. If so, then
* it will wait for any pending writeback to complete.
*/
void wait_for_stable_page(struct page *page)
{
page = thp_head(page);
if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES) wait_on_page_writeback(page);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);
// SPDX-License-Identifier: GPL-2.0
/*
* arch/x86_64/lib/csum-partial.c
*
* This file contains network checksum routines that are better done
* in an architecture-specific manner due to speed.
*/
#include <linux/compiler.h>
#include <linux/export.h>
#include <asm/checksum.h>
static inline unsigned short from32to16(unsigned a)
{
unsigned short b = a >> 16;
asm("addw %w2,%w0\n\t"
"adcw $0,%w0\n"
: "=r" (b)
: "0" (b), "r" (a));
return b;
}
/*
* Do a 64-bit checksum on an arbitrary memory area.
* Returns a 32bit checksum.
*
* This isn't as time critical as it used to be because many NICs
* do hardware checksumming these days.
*
* Things tried and found to not make it faster:
* Manual Prefetching
* Unrolling to an 128 bytes inner loop.
* Using interleaving with more registers to break the carry chains.
*/
static unsigned do_csum(const unsigned char *buff, unsigned len)
{
unsigned odd, count;
unsigned long result = 0;
if (unlikely(len == 0))
return result;
odd = 1 & (unsigned long) buff;
if (unlikely(odd)) {
result = *buff << 8;
len--;
buff++;
}
count = len >> 1; /* nr of 16-bit words.. */
if (count) {
if (2 & (unsigned long) buff) { result += *(unsigned short *)buff;
count--;
len -= 2;
buff += 2;
}
count >>= 1; /* nr of 32-bit words.. */
if (count) {
unsigned long zero;
unsigned count64;
if (4 & (unsigned long) buff) { result += *(unsigned int *) buff;
count--;
len -= 4;
buff += 4;
}
count >>= 1; /* nr of 64-bit words.. */
/* main loop using 64byte blocks */
zero = 0;
count64 = count >> 3;
while (count64) {
asm("addq 0*8(%[src]),%[res]\n\t"
"adcq 1*8(%[src]),%[res]\n\t"
"adcq 2*8(%[src]),%[res]\n\t"
"adcq 3*8(%[src]),%[res]\n\t"
"adcq 4*8(%[src]),%[res]\n\t"
"adcq 5*8(%[src]),%[res]\n\t"
"adcq 6*8(%[src]),%[res]\n\t"
"adcq 7*8(%[src]),%[res]\n\t"
"adcq %[zero],%[res]"
: [res] "=r" (result)
: [src] "r" (buff), [zero] "r" (zero),
"[res]" (result));
buff += 64;
count64--;
}
/* last up to 7 8byte blocks */
count %= 8;
while (count) {
asm("addq %1,%0\n\t"
"adcq %2,%0\n"
: "=r" (result)
: "m" (*(unsigned long *)buff),
"r" (zero), "0" (result));
--count;
buff += 8;
}
result = add32_with_carry(result>>32,
result&0xffffffff);
if (len & 4) {
result += *(unsigned int *) buff;
buff += 4;
}
}
if (len & 2) { result += *(unsigned short *) buff;
buff += 2;
}
}
if (len & 1) result += *buff; result = add32_with_carry(result>>32, result & 0xffffffff); if (unlikely(odd)) {
result = from32to16(result);
result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
}
return result;
}
/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
* returns a 32-bit number suitable for feeding into itself
* or csum_tcpudp_magic
*
* this function must be called with even lengths, except
* for the last fragment, which may be odd
*
* it's best to have buff aligned on a 64-bit boundary
*/
__wsum csum_partial(const void *buff, int len, __wsum sum)
{
return (__force __wsum)add32_with_carry(do_csum(buff, len),
(__force u32)sum);
}
EXPORT_SYMBOL(csum_partial);
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
*/
__sum16 ip_compute_csum(const void *buff, int len)
{
return csum_fold(csum_partial(buff,len,0));
}
EXPORT_SYMBOL(ip_compute_csum);
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2002, 2003 Andi Kleen, SuSE Labs.
*
* Wrappers of assembly checksum functions for x86-64.
*/
#include <asm/checksum.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <asm/smap.h>
/**
* csum_and_copy_from_user - Copy and checksum from user space.
* @src: source address (user space)
* @dst: destination address
* @len: number of bytes to be copied.
* @isum: initial sum that is added into the result (32bit unfolded)
* @errp: set to -EFAULT for an bad source address.
*
* Returns an 32bit unfolded checksum of the buffer.
* src and dst are best aligned to 64bits.
*/
__wsum
csum_and_copy_from_user(const void __user *src, void *dst, int len)
{
__wsum sum;
might_sleep();
if (!user_access_begin(src, len))
return 0;
sum = csum_partial_copy_generic((__force const void *)src, dst, len);
user_access_end();
return sum;
}
EXPORT_SYMBOL(csum_and_copy_from_user);
/**
* csum_and_copy_to_user - Copy and checksum to user space.
* @src: source address
* @dst: destination address (user space)
* @len: number of bytes to be copied.
* @isum: initial sum that is added into the result (32bit unfolded)
* @errp: set to -EFAULT for an bad destination address.
*
* Returns an 32bit unfolded checksum of the buffer.
* src and dst are best aligned to 64bits.
*/
__wsum
csum_and_copy_to_user(const void *src, void __user *dst, int len)
{
__wsum sum;
might_sleep();
if (!user_access_begin(dst, len))
return 0;
sum = csum_partial_copy_generic(src, (void __force *)dst, len);
user_access_end();
return sum;
}
EXPORT_SYMBOL(csum_and_copy_to_user);
/**
* csum_partial_copy_nocheck - Copy and checksum.
* @src: source address
* @dst: destination address
* @len: number of bytes to be copied.
* @sum: initial sum that is added into the result (32bit unfolded)
*
* Returns an 32bit unfolded checksum of the buffer.
*/
__wsum
csum_partial_copy_nocheck(const void *src, void *dst, int len)
{
return csum_partial_copy_generic(src, dst, len);
}
EXPORT_SYMBOL(csum_partial_copy_nocheck);
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
__u32 len, __u8 proto, __wsum sum)
{
__u64 rest, sum64;
rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
(__force __u64)sum;
asm(" addq (%[saddr]),%[sum]\n"
" adcq 8(%[saddr]),%[sum]\n"
" adcq (%[daddr]),%[sum]\n"
" adcq 8(%[daddr]),%[sum]\n"
" adcq $0,%[sum]\n"
: [sum] "=r" (sum64)
: "[sum]" (rest), [saddr] "r" (saddr), [daddr] "r" (daddr));
return csum_fold(
(__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
}
EXPORT_SYMBOL(csum_ipv6_magic);
// SPDX-License-Identifier: GPL-2.0
/*
* kobject.c - library routines for handling generic kernel objects
*
* Copyright (c) 2002-2003 Patrick Mochel <mochel@osdl.org>
* Copyright (c) 2006-2007 Greg Kroah-Hartman <greg@kroah.com>
* Copyright (c) 2006-2007 Novell Inc.
*
* Please see the file Documentation/core-api/kobject.rst for critical information
* about using the kobject interface.
*/
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/random.h>
/**
* kobject_namespace() - Return @kobj's namespace tag.
* @kobj: kobject in question
*
* Returns namespace tag of @kobj if its parent has namespace ops enabled
* and thus @kobj should have a namespace tag associated with it. Returns
* %NULL otherwise.
*/
const void *kobject_namespace(struct kobject *kobj)
{
const struct kobj_ns_type_operations *ns_ops = kobj_ns_ops(kobj);
if (!ns_ops || ns_ops->type == KOBJ_NS_TYPE_NONE)
return NULL;
return kobj->ktype->namespace(kobj);}
/**
* kobject_get_ownership() - Get sysfs ownership data for @kobj.
* @kobj: kobject in question
* @uid: kernel user ID for sysfs objects
* @gid: kernel group ID for sysfs objects
*
* Returns initial uid/gid pair that should be used when creating sysfs
* representation of given kobject. Normally used to adjust ownership of
* objects in a container.
*/
void kobject_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
*uid = GLOBAL_ROOT_UID;
*gid = GLOBAL_ROOT_GID;
if (kobj->ktype->get_ownership)
kobj->ktype->get_ownership(kobj, uid, gid);
}
/*
* populate_dir - populate directory with attributes.
* @kobj: object we're working on.
*
* Most subsystems have a set of default attributes that are associated
* with an object that registers with them. This is a helper called during
* object registration that loops through the default attributes of the
* subsystem and creates attributes files for them in sysfs.
*/
static int populate_dir(struct kobject *kobj)
{
struct kobj_type *t = get_ktype(kobj);
struct attribute *attr;
int error = 0;
int i;
if (t && t->default_attrs) { for (i = 0; (attr = t->default_attrs[i]) != NULL; i++) {
error = sysfs_create_file(kobj, attr);
if (error)
break;
}
}
return error;
}
static int create_dir(struct kobject *kobj)
{
const struct kobj_type *ktype = get_ktype(kobj);
const struct kobj_ns_type_operations *ops;
int error;
error = sysfs_create_dir_ns(kobj, kobject_namespace(kobj));
if (error)
return error;
error = populate_dir(kobj);
if (error) {
sysfs_remove_dir(kobj);
return error;
}
if (ktype) { error = sysfs_create_groups(kobj, ktype->default_groups);
if (error) {
sysfs_remove_dir(kobj);
return error;
}
}
/*
* @kobj->sd may be deleted by an ancestor going away. Hold an
* extra reference so that it stays until @kobj is gone.
*/
sysfs_get(kobj->sd);
/*
* If @kobj has ns_ops, its children need to be filtered based on
* their namespace tags. Enable namespace support on @kobj->sd.
*/
ops = kobj_child_ns_ops(kobj);
if (ops) {
BUG_ON(ops->type <= KOBJ_NS_TYPE_NONE); BUG_ON(ops->type >= KOBJ_NS_TYPES); BUG_ON(!kobj_ns_type_registered(ops->type)); sysfs_enable_ns(kobj->sd);
}
return 0;
}
static int get_kobj_path_length(struct kobject *kobj)
{
int length = 1;
struct kobject *parent = kobj;
/* walk up the ancestors until we hit the one pointing to the
* root.
* Add 1 to strlen for leading '/' of each level.
*/
do {
if (kobject_name(parent) == NULL)
return 0;
length += strlen(kobject_name(parent)) + 1;
parent = parent->parent;
} while (parent);
return length;
}
static void fill_kobj_path(struct kobject *kobj, char *path, int length)
{
struct kobject *parent;
--length;
for (parent = kobj; parent; parent = parent->parent) {
int cur = strlen(kobject_name(parent));
/* back up enough to print this name with '/' */
length -= cur;
memcpy(path + length, kobject_name(parent), cur);
*(path + --length) = '/';
}
pr_debug("kobject: '%s' (%p): %s: path = '%s'\n", kobject_name(kobj),
kobj, __func__, path);
}
/**
* kobject_get_path() - Allocate memory and fill in the path for @kobj.
* @kobj: kobject in question, with which to build the path
* @gfp_mask: the allocation type used to allocate the path
*
* Return: The newly allocated memory, caller must free with kfree().
*/
char *kobject_get_path(struct kobject *kobj, gfp_t gfp_mask)
{
char *path;
int len;
len = get_kobj_path_length(kobj);
if (len == 0) return NULL; path = kzalloc(len, gfp_mask);
if (!path)
return NULL;
fill_kobj_path(kobj, path, len);
return path;
}
EXPORT_SYMBOL_GPL(kobject_get_path);
/* add the kobject to its kset's list */
static void kobj_kset_join(struct kobject *kobj)
{
if (!kobj->kset)
return;
kset_get(kobj->kset);
spin_lock(&kobj->kset->list_lock);
list_add_tail(&kobj->entry, &kobj->kset->list);
spin_unlock(&kobj->kset->list_lock);
}
/* remove the kobject from its kset's list */
static void kobj_kset_leave(struct kobject *kobj)
{
if (!kobj->kset)
return;
spin_lock(&kobj->kset->list_lock);
list_del_init(&kobj->entry);
spin_unlock(&kobj->kset->list_lock);
kset_put(kobj->kset);
}
static void kobject_init_internal(struct kobject *kobj)
{
if (!kobj)
return;
kref_init(&kobj->kref);
INIT_LIST_HEAD(&kobj->entry);
kobj->state_in_sysfs = 0;
kobj->state_add_uevent_sent = 0;
kobj->state_remove_uevent_sent = 0;
kobj->state_initialized = 1;
}
static int kobject_add_internal(struct kobject *kobj)
{
int error = 0;
struct kobject *parent;
if (!kobj)
return -ENOENT;
if (!kobj->name || !kobj->name[0]) { WARN(1,
"kobject: (%p): attempted to be registered with empty name!\n",
kobj);
return -EINVAL;
}
parent = kobject_get(kobj->parent);
/* join kset if set, use it as parent if we do not already have one */
if (kobj->kset) {
if (!parent) parent = kobject_get(&kobj->kset->kobj);
kobj_kset_join(kobj);
kobj->parent = parent;
}
pr_debug("kobject: '%s' (%p): %s: parent: '%s', set: '%s'\n",
kobject_name(kobj), kobj, __func__,
parent ? kobject_name(parent) : "<NULL>",
kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>");
error = create_dir(kobj);
if (error) {
kobj_kset_leave(kobj);
kobject_put(parent);
kobj->parent = NULL;
/* be noisy on error issues */
if (error == -EEXIST)
pr_err("%s failed for %s with -EEXIST, don't try to register things with the same name in the same directory.\n",
__func__, kobject_name(kobj));
else
pr_err("%s failed for %s (error: %d parent: %s)\n",
__func__, kobject_name(kobj), error,
parent ? kobject_name(parent) : "'none'");
} else
kobj->state_in_sysfs = 1;
return error;
}
/**
* kobject_set_name_vargs() - Set the name of a kobject.
* @kobj: struct kobject to set the name of
* @fmt: format string used to build the name
* @vargs: vargs to format the string.
*/
int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
va_list vargs)
{
const char *s;
if (kobj->name && !fmt)
return 0;
s = kvasprintf_const(GFP_KERNEL, fmt, vargs);
if (!s)
return -ENOMEM;
/*
* ewww... some of these buggers have '/' in the name ... If
* that's the case, we need to make sure we have an actual
* allocated copy to modify, since kvasprintf_const may have
* returned something from .rodata.
*/
if (strchr(s, '/')) {
char *t;
t = kstrdup(s, GFP_KERNEL);
kfree_const(s);
if (!t)
return -ENOMEM;
strreplace(t, '/', '!');
s = t;
}
kfree_const(kobj->name);
kobj->name = s;
return 0;}
/**
* kobject_set_name() - Set the name of a kobject.
* @kobj: struct kobject to set the name of
* @fmt: format string used to build the name
*
* This sets the name of the kobject. If you have already added the
* kobject to the system, you must call kobject_rename() in order to
* change the name of the kobject.
*/
int kobject_set_name(struct kobject *kobj, const char *fmt, ...)
{
va_list vargs;
int retval;
va_start(vargs, fmt);
retval = kobject_set_name_vargs(kobj, fmt, vargs);
va_end(vargs);
return retval;
}
EXPORT_SYMBOL(kobject_set_name);
/**
* kobject_init() - Initialize a kobject structure.
* @kobj: pointer to the kobject to initialize
* @ktype: pointer to the ktype for this kobject.
*
* This function will properly initialize a kobject such that it can then
* be passed to the kobject_add() call.
*
* After this function is called, the kobject MUST be cleaned up by a call
* to kobject_put(), not by a call to kfree directly to ensure that all of
* the memory is cleaned up properly.
*/
void kobject_init(struct kobject *kobj, struct kobj_type *ktype)
{
char *err_str;
if (!kobj) {
err_str = "invalid kobject pointer!";
goto error;
}
if (!ktype) {
err_str = "must have a ktype to be initialized properly!\n";
goto error;
}
if (kobj->state_initialized) {
/* do not error out as sometimes we can recover */
pr_err("kobject (%p): tried to init an initialized object, something is seriously wrong.\n",
kobj);
dump_stack();
}
kobject_init_internal(kobj);
kobj->ktype = ktype;
return;
error:
pr_err("kobject (%p): %s\n", kobj, err_str);
dump_stack();
}
EXPORT_SYMBOL(kobject_init);
static __printf(3, 0) int kobject_add_varg(struct kobject *kobj,
struct kobject *parent,
const char *fmt, va_list vargs)
{
int retval;
retval = kobject_set_name_vargs(kobj, fmt, vargs);
if (retval) {
pr_err("kobject: can not set name properly!\n");
return retval;
}
kobj->parent = parent;
return kobject_add_internal(kobj);
}
/**
* kobject_add() - The main kobject add function.
* @kobj: the kobject to add
* @parent: pointer to the parent of the kobject.
* @fmt: format to name the kobject with.
*
* The kobject name is set and added to the kobject hierarchy in this
* function.
*
* If @parent is set, then the parent of the @kobj will be set to it.
* If @parent is NULL, then the parent of the @kobj will be set to the
* kobject associated with the kset assigned to this kobject. If no kset
* is assigned to the kobject, then the kobject will be located in the
* root of the sysfs tree.
*
* Note, no "add" uevent will be created with this call, the caller should set
* up all of the necessary sysfs files for the object and then call
* kobject_uevent() with the UEVENT_ADD parameter to ensure that
* userspace is properly notified of this kobject's creation.
*
* Return: If this function returns an error, kobject_put() must be
* called to properly clean up the memory associated with the
* object. Under no instance should the kobject that is passed
* to this function be directly freed with a call to kfree(),
* that can leak memory.
*
* If this function returns success, kobject_put() must also be called
* in order to properly clean up the memory associated with the object.
*
* In short, once this function is called, kobject_put() MUST be called
* when the use of the object is finished in order to properly free
* everything.
*/
int kobject_add(struct kobject *kobj, struct kobject *parent,
const char *fmt, ...)
{
va_list args;
int retval;
if (!kobj)
return -EINVAL;
if (!kobj->state_initialized) {
pr_err("kobject '%s' (%p): tried to add an uninitialized object, something is seriously wrong.\n",
kobject_name(kobj), kobj);
dump_stack();
return -EINVAL;
}
va_start(args, fmt);
retval = kobject_add_varg(kobj, parent, fmt, args);
va_end(args); return retval;
}
EXPORT_SYMBOL(kobject_add);
/**
* kobject_init_and_add() - Initialize a kobject structure and add it to
* the kobject hierarchy.
* @kobj: pointer to the kobject to initialize
* @ktype: pointer to the ktype for this kobject.
* @parent: pointer to the parent of this kobject.
* @fmt: the name of the kobject.
*
* This function combines the call to kobject_init() and kobject_add().
*
* If this function returns an error, kobject_put() must be called to
* properly clean up the memory associated with the object. This is the
* same type of error handling after a call to kobject_add() and kobject
* lifetime rules are the same here.
*/
int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
struct kobject *parent, const char *fmt, ...)
{
va_list args;
int retval;
kobject_init(kobj, ktype);
va_start(args, fmt);
retval = kobject_add_varg(kobj, parent, fmt, args);
va_end(args);
return retval;
}
EXPORT_SYMBOL_GPL(kobject_init_and_add);
/**
* kobject_rename() - Change the name of an object.
* @kobj: object in question.
* @new_name: object's new name
*
* It is the responsibility of the caller to provide mutual
* exclusion between two different calls of kobject_rename
* on the same kobject and to ensure that new_name is valid and
* won't conflict with other kobjects.
*/
int kobject_rename(struct kobject *kobj, const char *new_name)
{
int error = 0;
const char *devpath = NULL;
const char *dup_name = NULL, *name;
char *devpath_string = NULL;
char *envp[2];
kobj = kobject_get(kobj);
if (!kobj)
return -EINVAL;
if (!kobj->parent) {
kobject_put(kobj);
return -EINVAL;
}
devpath = kobject_get_path(kobj, GFP_KERNEL);
if (!devpath) {
error = -ENOMEM;
goto out;
}
devpath_string = kmalloc(strlen(devpath) + 15, GFP_KERNEL);
if (!devpath_string) {
error = -ENOMEM;
goto out;
}
sprintf(devpath_string, "DEVPATH_OLD=%s", devpath);
envp[0] = devpath_string;
envp[1] = NULL;
name = dup_name = kstrdup_const(new_name, GFP_KERNEL);
if (!name) {
error = -ENOMEM;
goto out;
}
error = sysfs_rename_dir_ns(kobj, new_name, kobject_namespace(kobj));
if (error)
goto out;
/* Install the new kobject name */
dup_name = kobj->name;
kobj->name = name;
/* This function is mostly/only used for network interface.
* Some hotplug package track interfaces by their name and
* therefore want to know when the name is changed by the user. */
kobject_uevent_env(kobj, KOBJ_MOVE, envp);
out:
kfree_const(dup_name);
kfree(devpath_string);
kfree(devpath);
kobject_put(kobj);
return error;
}
EXPORT_SYMBOL_GPL(kobject_rename);
/**
* kobject_move() - Move object to another parent.
* @kobj: object in question.
* @new_parent: object's new parent (can be NULL)
*/
int kobject_move(struct kobject *kobj, struct kobject *new_parent)
{
int error;
struct kobject *old_parent;
const char *devpath = NULL;
char *devpath_string = NULL;
char *envp[2];
kobj = kobject_get(kobj);
if (!kobj)
return -EINVAL;
new_parent = kobject_get(new_parent);
if (!new_parent) {
if (kobj->kset)
new_parent = kobject_get(&kobj->kset->kobj);
}
/* old object path */
devpath = kobject_get_path(kobj, GFP_KERNEL);
if (!devpath) {
error = -ENOMEM;
goto out;
}
devpath_string = kmalloc(strlen(devpath) + 15, GFP_KERNEL);
if (!devpath_string) {
error = -ENOMEM;
goto out;
}
sprintf(devpath_string, "DEVPATH_OLD=%s", devpath);
envp[0] = devpath_string;
envp[1] = NULL;
error = sysfs_move_dir_ns(kobj, new_parent, kobject_namespace(kobj));
if (error)
goto out;
old_parent = kobj->parent;
kobj->parent = new_parent;
new_parent = NULL;
kobject_put(old_parent);
kobject_uevent_env(kobj, KOBJ_MOVE, envp);
out:
kobject_put(new_parent);
kobject_put(kobj);
kfree(devpath_string);
kfree(devpath);
return error;
}
EXPORT_SYMBOL_GPL(kobject_move);
static void __kobject_del(struct kobject *kobj)
{
struct kernfs_node *sd;
const struct kobj_type *ktype;
sd = kobj->sd;
ktype = get_ktype(kobj);
if (ktype)
sysfs_remove_groups(kobj, ktype->default_groups);
/* send "remove" if the caller did not do it but sent "add" */
if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) {
pr_debug("kobject: '%s' (%p): auto cleanup 'remove' event\n",
kobject_name(kobj), kobj);
kobject_uevent(kobj, KOBJ_REMOVE);
}
sysfs_remove_dir(kobj);
sysfs_put(sd);
kobj->state_in_sysfs = 0;
kobj_kset_leave(kobj);
kobj->parent = NULL;
}
/**
* kobject_del() - Unlink kobject from hierarchy.
* @kobj: object.
*
* This is the function that should be called to delete an object
* successfully added via kobject_add().
*/
void kobject_del(struct kobject *kobj)
{
struct kobject *parent;
if (!kobj)
return;
parent = kobj->parent;
__kobject_del(kobj);
kobject_put(parent);
}
EXPORT_SYMBOL(kobject_del);
/**
* kobject_get() - Increment refcount for object.
* @kobj: object.
*/
struct kobject *kobject_get(struct kobject *kobj)
{
if (kobj) { if (!kobj->state_initialized) WARN(1, KERN_WARNING
"kobject: '%s' (%p): is not initialized, yet kobject_get() is being called.\n",
kobject_name(kobj), kobj);
kref_get(&kobj->kref);
}
return kobj;
}
EXPORT_SYMBOL(kobject_get);
struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
{
if (!kobj)
return NULL;
if (!kref_get_unless_zero(&kobj->kref))
kobj = NULL;
return kobj;
}
EXPORT_SYMBOL(kobject_get_unless_zero);
/*
* kobject_cleanup - free kobject resources.
* @kobj: object to cleanup
*/
static void kobject_cleanup(struct kobject *kobj)
{
struct kobject *parent = kobj->parent;
struct kobj_type *t = get_ktype(kobj);
const char *name = kobj->name;
pr_debug("kobject: '%s' (%p): %s, parent %p\n",
kobject_name(kobj), kobj, __func__, kobj->parent);
if (t && !t->release)
pr_debug("kobject: '%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n",
kobject_name(kobj), kobj);
/* remove from sysfs if the caller did not do it */
if (kobj->state_in_sysfs) {
pr_debug("kobject: '%s' (%p): auto cleanup kobject_del\n",
kobject_name(kobj), kobj);
__kobject_del(kobj);
} else {
/* avoid dropping the parent reference unnecessarily */
parent = NULL;
}
if (t && t->release) {
pr_debug("kobject: '%s' (%p): calling ktype release\n",
kobject_name(kobj), kobj);
t->release(kobj);
}
/* free name if we allocated it */
if (name) {
pr_debug("kobject: '%s': free name\n", name);
kfree_const(name);
}
kobject_put(parent);
}
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
static void kobject_delayed_cleanup(struct work_struct *work)
{
kobject_cleanup(container_of(to_delayed_work(work),
struct kobject, release));
}
#endif
static void kobject_release(struct kref *kref)
{
struct kobject *kobj = container_of(kref, struct kobject, kref);
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
unsigned long delay = HZ + HZ * (get_random_int() & 0x3);
pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n",
kobject_name(kobj), kobj, __func__, kobj->parent, delay);
INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup);
schedule_delayed_work(&kobj->release, delay);
#else
kobject_cleanup(kobj);
#endif
}
/**
* kobject_put() - Decrement refcount for object.
* @kobj: object.
*
* Decrement the refcount, and if 0, call kobject_cleanup().
*/
void kobject_put(struct kobject *kobj)
{
if (kobj) { if (!kobj->state_initialized) WARN(1, KERN_WARNING
"kobject: '%s' (%p): is not initialized, yet kobject_put() is being called.\n",
kobject_name(kobj), kobj);
kref_put(&kobj->kref, kobject_release);
}
}
EXPORT_SYMBOL(kobject_put);
static void dynamic_kobj_release(struct kobject *kobj)
{
pr_debug("kobject: (%p): %s\n", kobj, __func__);
kfree(kobj);
}
static struct kobj_type dynamic_kobj_ktype = {
.release = dynamic_kobj_release,
.sysfs_ops = &kobj_sysfs_ops,
};
/**
* kobject_create() - Create a struct kobject dynamically.
*
* This function creates a kobject structure dynamically and sets it up
* to be a "dynamic" kobject with a default release function set up.
*
* If the kobject was not able to be created, NULL will be returned.
* The kobject structure returned from here must be cleaned up with a
* call to kobject_put() and not kfree(), as kobject_init() has
* already been called on this structure.
*/
struct kobject *kobject_create(void)
{
struct kobject *kobj;
kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
if (!kobj)
return NULL;
kobject_init(kobj, &dynamic_kobj_ktype); return kobj;
}
/**
* kobject_create_and_add() - Create a struct kobject dynamically and
* register it with sysfs.
* @name: the name for the kobject
* @parent: the parent kobject of this kobject, if any.
*
* This function creates a kobject structure dynamically and registers it
* with sysfs. When you are finished with this structure, call
* kobject_put() and the structure will be dynamically freed when
* it is no longer being used.
*
* If the kobject was not able to be created, NULL will be returned.
*/
struct kobject *kobject_create_and_add(const char *name, struct kobject *parent)
{
struct kobject *kobj;
int retval;
kobj = kobject_create();
if (!kobj)
return NULL;
retval = kobject_add(kobj, parent, "%s", name); if (retval) {
pr_warn("%s: kobject_add error: %d\n", __func__, retval);
kobject_put(kobj);
kobj = NULL;
}
return kobj;
}
EXPORT_SYMBOL_GPL(kobject_create_and_add);
/**
* kset_init() - Initialize a kset for use.
* @k: kset
*/
void kset_init(struct kset *k)
{
kobject_init_internal(&k->kobj);
INIT_LIST_HEAD(&k->list);
spin_lock_init(&k->list_lock);
}
/* default kobject attribute operations */
static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
struct kobj_attribute *kattr;
ssize_t ret = -EIO;
kattr = container_of(attr, struct kobj_attribute, attr);
if (kattr->show)
ret = kattr->show(kobj, kattr, buf);
return ret;
}
static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
struct kobj_attribute *kattr;
ssize_t ret = -EIO;
kattr = container_of(attr, struct kobj_attribute, attr);
if (kattr->store)
ret = kattr->store(kobj, kattr, buf, count);
return ret;
}
const struct sysfs_ops kobj_sysfs_ops = {
.show = kobj_attr_show,
.store = kobj_attr_store,
};
EXPORT_SYMBOL_GPL(kobj_sysfs_ops);
/**
* kset_register() - Initialize and add a kset.
* @k: kset.
*/
int kset_register(struct kset *k)
{
int err;
if (!k)
return -EINVAL;
kset_init(k);
err = kobject_add_internal(&k->kobj);
if (err)
return err;
kobject_uevent(&k->kobj, KOBJ_ADD);
return 0;
}
EXPORT_SYMBOL(kset_register);
/**
* kset_unregister() - Remove a kset.
* @k: kset.
*/
void kset_unregister(struct kset *k)
{
if (!k)
return;
kobject_del(&k->kobj);
kobject_put(&k->kobj);
}
EXPORT_SYMBOL(kset_unregister);
/**
* kset_find_obj() - Search for object in kset.
* @kset: kset we're looking in.
* @name: object's name.
*
* Lock kset via @kset->subsys, and iterate over @kset->list,
* looking for a matching kobject. If matching object is found
* take a reference and return the object.
*/
struct kobject *kset_find_obj(struct kset *kset, const char *name)
{
struct kobject *k;
struct kobject *ret = NULL;
spin_lock(&kset->list_lock);
list_for_each_entry(k, &kset->list, entry) {
if (kobject_name(k) && !strcmp(kobject_name(k), name)) {
ret = kobject_get_unless_zero(k);
break;
}
}
spin_unlock(&kset->list_lock);
return ret;
}
EXPORT_SYMBOL_GPL(kset_find_obj);
static void kset_release(struct kobject *kobj)
{
struct kset *kset = container_of(kobj, struct kset, kobj);
pr_debug("kobject: '%s' (%p): %s\n",
kobject_name(kobj), kobj, __func__);
kfree(kset);
}
static void kset_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
if (kobj->parent)
kobject_get_ownership(kobj->parent, uid, gid);
}
static struct kobj_type kset_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.release = kset_release,
.get_ownership = kset_get_ownership,
};
/**
* kset_create() - Create a struct kset dynamically.
*
* @name: the name for the kset
* @uevent_ops: a struct kset_uevent_ops for the kset
* @parent_kobj: the parent kobject of this kset, if any.
*
* This function creates a kset structure dynamically. This structure can
* then be registered with the system and show up in sysfs with a call to
* kset_register(). When you are finished with this structure, if
* kset_register() has been called, call kset_unregister() and the
* structure will be dynamically freed when it is no longer being used.
*
* If the kset was not able to be created, NULL will be returned.
*/
static struct kset *kset_create(const char *name,
const struct kset_uevent_ops *uevent_ops,
struct kobject *parent_kobj)
{
struct kset *kset;
int retval;
kset = kzalloc(sizeof(*kset), GFP_KERNEL);
if (!kset)
return NULL;
retval = kobject_set_name(&kset->kobj, "%s", name);
if (retval) {
kfree(kset);
return NULL;
}
kset->uevent_ops = uevent_ops;
kset->kobj.parent = parent_kobj;
/*
* The kobject of this kset will have a type of kset_ktype and belong to
* no kset itself. That way we can properly free it when it is
* finished being used.
*/
kset->kobj.ktype = &kset_ktype;
kset->kobj.kset = NULL;
return kset;
}
/**
* kset_create_and_add() - Create a struct kset dynamically and add it to sysfs.
*
* @name: the name for the kset
* @uevent_ops: a struct kset_uevent_ops for the kset
* @parent_kobj: the parent kobject of this kset, if any.
*
* This function creates a kset structure dynamically and registers it
* with sysfs. When you are finished with this structure, call
* kset_unregister() and the structure will be dynamically freed when it
* is no longer being used.
*
* If the kset was not able to be created, NULL will be returned.
*/
struct kset *kset_create_and_add(const char *name,
const struct kset_uevent_ops *uevent_ops,
struct kobject *parent_kobj)
{
struct kset *kset;
int error;
kset = kset_create(name, uevent_ops, parent_kobj);
if (!kset)
return NULL;
error = kset_register(kset);
if (error) {
kfree(kset);
return NULL;
}
return kset;
}
EXPORT_SYMBOL_GPL(kset_create_and_add);
static DEFINE_SPINLOCK(kobj_ns_type_lock);
static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES];
int kobj_ns_type_register(const struct kobj_ns_type_operations *ops)
{
enum kobj_ns_type type = ops->type;
int error;
spin_lock(&kobj_ns_type_lock);
error = -EINVAL;
if (type >= KOBJ_NS_TYPES)
goto out;
error = -EINVAL;
if (type <= KOBJ_NS_TYPE_NONE)
goto out;
error = -EBUSY;
if (kobj_ns_ops_tbl[type])
goto out;
error = 0;
kobj_ns_ops_tbl[type] = ops;
out:
spin_unlock(&kobj_ns_type_lock);
return error;
}
int kobj_ns_type_registered(enum kobj_ns_type type)
{
int registered = 0;
spin_lock(&kobj_ns_type_lock);
if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES))
registered = kobj_ns_ops_tbl[type] != NULL;
spin_unlock(&kobj_ns_type_lock);
return registered;
}
const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent)
{
const struct kobj_ns_type_operations *ops = NULL;
if (parent && parent->ktype && parent->ktype->child_ns_type) ops = parent->ktype->child_ns_type(parent);
return ops;
}
const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj)
{
return kobj_child_ns_ops(kobj->parent);
}
bool kobj_ns_current_may_mount(enum kobj_ns_type type)
{
bool may_mount = true;
spin_lock(&kobj_ns_type_lock);
if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
kobj_ns_ops_tbl[type])
may_mount = kobj_ns_ops_tbl[type]->current_may_mount();
spin_unlock(&kobj_ns_type_lock);
return may_mount;
}
void *kobj_ns_grab_current(enum kobj_ns_type type)
{
void *ns = NULL;
spin_lock(&kobj_ns_type_lock);
if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
kobj_ns_ops_tbl[type])
ns = kobj_ns_ops_tbl[type]->grab_current_ns();
spin_unlock(&kobj_ns_type_lock);
return ns;
}
EXPORT_SYMBOL_GPL(kobj_ns_grab_current);
const void *kobj_ns_netlink(enum kobj_ns_type type, struct sock *sk)
{
const void *ns = NULL;
spin_lock(&kobj_ns_type_lock);
if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
kobj_ns_ops_tbl[type])
ns = kobj_ns_ops_tbl[type]->netlink_ns(sk);
spin_unlock(&kobj_ns_type_lock);
return ns;
}
const void *kobj_ns_initial(enum kobj_ns_type type)
{
const void *ns = NULL;
spin_lock(&kobj_ns_type_lock);
if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
kobj_ns_ops_tbl[type])
ns = kobj_ns_ops_tbl[type]->initial_ns();
spin_unlock(&kobj_ns_type_lock);
return ns;
}
void kobj_ns_drop(enum kobj_ns_type type, void *ns)
{
spin_lock(&kobj_ns_type_lock);
if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
kobj_ns_ops_tbl[type] && kobj_ns_ops_tbl[type]->drop_ns)
kobj_ns_ops_tbl[type]->drop_ns(ns);
spin_unlock(&kobj_ns_type_lock);
}
EXPORT_SYMBOL_GPL(kobj_ns_drop);
/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Sleepable Read-Copy Update mechanism for mutual exclusion
*
* Copyright (C) IBM Corporation, 2006
* Copyright (C) Fujitsu, 2012
*
* Author: Paul McKenney <paulmck@linux.ibm.com>
* Lai Jiangshan <laijs@cn.fujitsu.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU/ *.txt
*
*/
#ifndef _LINUX_SRCU_H
#define _LINUX_SRCU_H
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/rcu_segcblist.h>
struct srcu_struct;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
struct lock_class_key *key);
#define init_srcu_struct(ssp) \
({ \
static struct lock_class_key __srcu_key; \
\
__init_srcu_struct((ssp), #ssp, &__srcu_key); \
})
#define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name },
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
int init_srcu_struct(struct srcu_struct *ssp);
#define __SRCU_DEP_MAP_INIT(srcu_name)
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
#ifdef CONFIG_TINY_SRCU
#include <linux/srcutiny.h>
#elif defined(CONFIG_TREE_SRCU)
#include <linux/srcutree.h>
#elif defined(CONFIG_SRCU)
#error "Unknown SRCU implementation specified to kernel configuration"
#else
/* Dummy definition for things like notifiers. Actual use gets link error. */
struct srcu_struct { };
#endif
void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
void (*func)(struct rcu_head *head));
void cleanup_srcu_struct(struct srcu_struct *ssp);
int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
void synchronize_srcu(struct srcu_struct *ssp);
unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);
#ifdef CONFIG_SRCU
void srcu_init(void);
#else /* #ifdef CONFIG_SRCU */
static inline void srcu_init(void) { }
#endif /* #else #ifdef CONFIG_SRCU */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
* srcu_read_lock_held - might we be in SRCU read-side critical section?
* @ssp: The srcu_struct structure to check
*
* If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU
* read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
* this assumes we are in an SRCU read-side critical section unless it can
* prove otherwise.
*
* Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
* and while lockdep is disabled.
*
* Note that SRCU is based on its own statemachine and it doesn't
* relies on normal RCU, it can be called from the CPU which
* is in the idle loop from an RCU point of view or offline.
*/
static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
{
if (!debug_lockdep_rcu_enabled())
return 1;
return lock_is_held(&ssp->dep_map);
}
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
{
return 1;
}
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/**
* srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
* @p: the pointer to fetch and protect for later dereferencing
* @ssp: pointer to the srcu_struct, which is used to check that we
* really are in an SRCU read-side critical section.
* @c: condition to check for update-side use
*
* If PROVE_RCU is enabled, invoking this outside of an RCU read-side
* critical section will result in an RCU-lockdep splat, unless @c evaluates
* to 1. The @c argument will normally be a logical expression containing
* lockdep_is_held() calls.
*/
#define srcu_dereference_check(p, ssp, c) \
__rcu_dereference_check((p), (c) || srcu_read_lock_held(ssp), __rcu)
/**
* srcu_dereference - fetch SRCU-protected pointer for later dereferencing
* @p: the pointer to fetch and protect for later dereferencing
* @ssp: pointer to the srcu_struct, which is used to check that we
* really are in an SRCU read-side critical section.
*
* Makes rcu_dereference_check() do the dirty work. If PROVE_RCU
* is enabled, invoking this outside of an RCU read-side critical
* section will result in an RCU-lockdep splat.
*/
#define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0)
/**
* srcu_dereference_notrace - no tracing and no lockdep calls from here
* @p: the pointer to fetch and protect for later dereferencing
* @ssp: pointer to the srcu_struct, which is used to check that we
* really are in an SRCU read-side critical section.
*/
#define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1)
/**
* srcu_read_lock - register a new reader for an SRCU-protected structure.
* @ssp: srcu_struct in which to register the new reader.
*
* Enter an SRCU read-side critical section. Note that SRCU read-side
* critical sections may be nested. However, it is illegal to
* call anything that waits on an SRCU grace period for the same
* srcu_struct, whether directly or indirectly. Please note that
* one way to indirectly wait on an SRCU grace period is to acquire
* a mutex that is held elsewhere while calling synchronize_srcu() or
* synchronize_srcu_expedited().
*
* Note that srcu_read_lock() and the matching srcu_read_unlock() must
* occur in the same context, for example, it is illegal to invoke
* srcu_read_unlock() in an irq handler if the matching srcu_read_lock()
* was invoked in process context.
*/
static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
{
int retval;
retval = __srcu_read_lock(ssp);
rcu_lock_acquire(&(ssp)->dep_map);
return retval;
}
/* Used by tracing, cannot be traced and cannot invoke lockdep. */
static inline notrace int
srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
{
int retval;
retval = __srcu_read_lock(ssp);
return retval;
}
/**
* srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
* @ssp: srcu_struct in which to unregister the old reader.
* @idx: return value from corresponding srcu_read_lock().
*
* Exit an SRCU read-side critical section.
*/
static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
__releases(ssp)
{
WARN_ON_ONCE(idx & ~0x1);
rcu_lock_release(&(ssp)->dep_map);
__srcu_read_unlock(ssp, idx);
}
/* Used by tracing, cannot be traced and cannot call lockdep. */
static inline notrace void
srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp)
{
__srcu_read_unlock(ssp, idx);
}
/**
* smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock
*
* Converts the preceding srcu_read_unlock into a two-way memory barrier.
*
* Call this after srcu_read_unlock, to guarantee that all memory operations
* that occur after smp_mb__after_srcu_read_unlock will appear to happen after
* the preceding srcu_read_unlock.
*/
static inline void smp_mb__after_srcu_read_unlock(void)
{
/* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
}
#endif
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Oracle. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
/*
* Extent buffer locking
* =====================
*
* We use a rw_semaphore for tree locking, and the semantics are exactly the
* same:
*
* - reader/writer exclusion
* - writer/writer exclusion
* - reader/reader sharing
* - try-lock semantics for readers and writers
*
* The rwsem implementation does opportunistic spinning which reduces number of
* times the locking task needs to sleep.
*/
/*
* __btrfs_tree_read_lock - lock extent buffer for read
* @eb: the eb to be locked
* @nest: the nesting level to be used for lockdep
*
* This takes the read lock on the extent buffer, using the specified nesting
* level for lockdep purposes.
*/
void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
u64 start_ns = 0;
if (trace_btrfs_tree_read_lock_enabled())
start_ns = ktime_get_ns();
down_read_nested(&eb->lock, nest);
eb->lock_owner = current->pid;
trace_btrfs_tree_read_lock(eb, start_ns);
}
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
__btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
}
/*
* Try-lock for read.
*
* Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
if (down_read_trylock(&eb->lock)) {
eb->lock_owner = current->pid;
trace_btrfs_try_tree_read_lock(eb);
return 1;
}
return 0;
}
/*
* Try-lock for write.
*
* Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
if (down_write_trylock(&eb->lock)) {
eb->lock_owner = current->pid;
trace_btrfs_try_tree_write_lock(eb);
return 1;
}
return 0;
}
/*
* Release read lock.
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_read_unlock(eb);
eb->lock_owner = 0;
up_read(&eb->lock);
}
/*
* __btrfs_tree_lock - lock eb for write
* @eb: the eb to lock
* @nest: the nesting to use for the lock
*
* Returns with the eb->lock write locked.
*/
void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
__acquires(&eb->lock)
{
u64 start_ns = 0;
if (trace_btrfs_tree_lock_enabled())
start_ns = ktime_get_ns();
down_write_nested(&eb->lock, nest);
eb->lock_owner = current->pid;
trace_btrfs_tree_lock(eb, start_ns);
}
void btrfs_tree_lock(struct extent_buffer *eb)
{
__btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
}
/*
* Release the write lock.
*/
void btrfs_tree_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_unlock(eb);
eb->lock_owner = 0;
up_write(&eb->lock);
}
/*
* This releases any locks held in the path starting at level and going all the
* way up to the root.
*
* btrfs_search_slot will keep the lock held on higher nodes in a few corner
* cases, such as COW of the block at slot zero in the node. This ignores
* those rules, and it should only be called when there are no more updates to
* be done higher up in the tree.
*/
void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
int i;
if (path->keep_locks)
return;
for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i])
continue;
if (!path->locks[i])
continue;
btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); path->locks[i] = 0;
}
}
/*
* Loop around taking references on and locking the root node of the tree until
* we end up with a lock on the root node.
*
* Return: root extent buffer with write lock held
*/
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
btrfs_tree_lock(eb);
if (eb == root->node)
break;
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
}
return eb;
}
/*
* Loop around taking references on and locking the root node of the tree until
* we end up with a lock on the root node.
*
* Return: root extent buffer with read lock held
*/
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
btrfs_tree_read_lock(eb);
if (eb == root->node)
break;
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
}
return eb;
}
/*
* DREW locks
* ==========
*
* DREW stands for double-reader-writer-exclusion lock. It's used in situation
* where you want to provide A-B exclusion but not AA or BB.
*
* Currently implementation gives more priority to reader. If a reader and a
* writer both race to acquire their respective sides of the lock the writer
* would yield its lock as soon as it detects a concurrent reader. Additionally
* if there are pending readers no new writers would be allowed to come in and
* acquire the lock.
*/
int btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
int ret;
ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL); if (ret)
return ret;
atomic_set(&lock->readers, 0);
init_waitqueue_head(&lock->pending_readers);
init_waitqueue_head(&lock->pending_writers);
return 0;
}
void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock)
{
percpu_counter_destroy(&lock->writers);
}
/* Return true if acquisition is successful, false otherwise */
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
{
if (atomic_read(&lock->readers))
return false;
percpu_counter_inc(&lock->writers);
/* Ensure writers count is updated before we check for pending readers */
smp_mb();
if (atomic_read(&lock->readers)) {
btrfs_drew_write_unlock(lock);
return false;
}
return true;
}
void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
{
while (true) {
if (btrfs_drew_try_write_lock(lock)) return; wait_event(lock->pending_writers, !atomic_read(&lock->readers));
}
}
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
percpu_counter_dec(&lock->writers); cond_wake_up(&lock->pending_readers);
}
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
{
atomic_inc(&lock->readers);
/*
* Ensure the pending reader count is perceieved BEFORE this reader
* goes to sleep in case of active writers. This guarantees new writers
* won't be allowed and that the current reader will be woken up when
* the last active writer finishes its jobs.
*/
smp_mb__after_atomic();
wait_event(lock->pending_readers,
percpu_counter_sum(&lock->writers) == 0);
}
void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
{
/*
* atomic_dec_and_test implies a full barrier, so woken up writers
* are guaranteed to see the decrement
*/
if (atomic_dec_and_test(&lock->readers))
wake_up(&lock->pending_writers);
}
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM printk
#if !defined(_TRACE_PRINTK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PRINTK_H
#include <linux/tracepoint.h>
TRACE_EVENT(console,
TP_PROTO(const char *text, size_t len),
TP_ARGS(text, len),
TP_STRUCT__entry(
__dynamic_array(char, msg, len + 1)
),
TP_fast_assign(
/*
* Each trace entry is printed in a new line.
* If the msg finishes with '\n', cut it off
* to avoid blank lines in the trace.
*/
if ((len > 0) && (text[len-1] == '\n'))
len -= 1;
memcpy(__get_str(msg), text, len);
__get_str(msg)[len] = 0;
),
TP_printk("%s", __get_str(msg))
);
#endif /* _TRACE_PRINTK_H */
/* This part must be outside protection */
#include <trace/define_trace.h>
// SPDX-License-Identifier: GPL-2.0
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include "ctree.h"
#include "volumes.h"
#include "extent_map.h"
#include "compression.h"
static struct kmem_cache *extent_map_cache;
int __init extent_map_init(void)
{
extent_map_cache = kmem_cache_create("btrfs_extent_map",
sizeof(struct extent_map), 0,
SLAB_MEM_SPREAD, NULL);
if (!extent_map_cache)
return -ENOMEM;
return 0;
}
void __cold extent_map_exit(void)
{
kmem_cache_destroy(extent_map_cache);
}
/**
* extent_map_tree_init - initialize extent map tree
* @tree: tree to initialize
*
* Initialize the extent tree @tree. Should be called for each new inode
* or other user of the extent_map interface.
*/
void extent_map_tree_init(struct extent_map_tree *tree)
{
tree->map = RB_ROOT_CACHED;
INIT_LIST_HEAD(&tree->modified_extents);
rwlock_init(&tree->lock);
}
/**
* alloc_extent_map - allocate new extent map structure
*
* Allocate a new extent_map structure. The new structure is
* returned with a reference count of one and needs to be
* freed using free_extent_map()
*/
struct extent_map *alloc_extent_map(void)
{
struct extent_map *em;
em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
if (!em)
return NULL;
RB_CLEAR_NODE(&em->rb_node);
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
em->generation = 0;
refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
}
/**
* free_extent_map - drop reference count of an extent_map
* @em: extent map being released
*
* Drops the reference out on @em by one and free the structure
* if the reference count hits zero.
*/
void free_extent_map(struct extent_map *em)
{
if (!em)
return;
WARN_ON(refcount_read(&em->refs) == 0);
if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) kfree(em->map_lookup); kmem_cache_free(extent_map_cache, em);
}
}
/* simple helper to do math around the end of an extent, handling wrap */
static u64 range_end(u64 start, u64 len)
{
if (start + len < start)
return (u64)-1;
return start + len;
}
static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *parent = NULL;
struct extent_map *entry = NULL;
struct rb_node *orig_parent = NULL;
u64 end = range_end(em->start, em->len);
bool leftmost = true;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
if (em->start < entry->start) { p = &(*p)->rb_left; } else if (em->start >= extent_map_end(entry)) { p = &(*p)->rb_right;
leftmost = false;
} else {
return -EEXIST;
}
}
orig_parent = parent;
while (parent && em->start >= extent_map_end(entry)) { parent = rb_next(parent);
entry = rb_entry(parent, struct extent_map, rb_node);
}
if (parent)
if (end > entry->start && em->start < extent_map_end(entry))
return -EEXIST;
parent = orig_parent;
entry = rb_entry(parent, struct extent_map, rb_node);
while (parent && em->start < entry->start) { parent = rb_prev(parent);
entry = rb_entry(parent, struct extent_map, rb_node);
}
if (parent)
if (end > entry->start && em->start < extent_map_end(entry))
return -EEXIST;
rb_link_node(&em->rb_node, orig_parent, p);
rb_insert_color_cached(&em->rb_node, root, leftmost);
return 0;
}
/*
* search through the tree for an extent_map with a given offset. If
* it can't be found, try to find some neighboring extents
*/
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
struct rb_node **prev_ret,
struct rb_node **next_ret)
{
struct rb_node *n = root->rb_node;
struct rb_node *prev = NULL;
struct rb_node *orig_prev = NULL;
struct extent_map *entry;
struct extent_map *prev_entry = NULL;
while (n) {
entry = rb_entry(n, struct extent_map, rb_node);
prev = n;
prev_entry = entry;
if (offset < entry->start) n = n->rb_left; else if (offset >= extent_map_end(entry)) n = n->rb_right;
else
return n;
}
if (prev_ret) {
orig_prev = prev;
while (prev && offset >= extent_map_end(prev_entry)) { prev = rb_next(prev);
prev_entry = rb_entry(prev, struct extent_map, rb_node);
}
*prev_ret = prev;
prev = orig_prev;
}
if (next_ret) {
prev_entry = rb_entry(prev, struct extent_map, rb_node);
while (prev && offset < prev_entry->start) { prev = rb_prev(prev);
prev_entry = rb_entry(prev, struct extent_map, rb_node);
}
*next_ret = prev;
}
return NULL;
}
/* check to see if two extent_map structs are adjacent and safe to merge */
static int mergable_maps(struct extent_map *prev, struct extent_map *next)
{
if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) return 0;
/*
* don't merge compressed extents, we need to know their
* actual size
*/
if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
return 0;
if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
test_bit(EXTENT_FLAG_LOGGING, &next->flags))
return 0;
/*
* We don't want to merge stuff that hasn't been written to the log yet
* since it may not reflect exactly what is on disk, and that would be
* bad.
*/
if (!list_empty(&prev->list) || !list_empty(&next->list))
return 0;
ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
prev->block_start != EXTENT_MAP_DELALLOC);
if (prev->map_lookup || next->map_lookup)
ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));
if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->map_lookup == next->map_lookup && ((next->block_start == EXTENT_MAP_HOLE && prev->block_start == EXTENT_MAP_HOLE) ||
(next->block_start == EXTENT_MAP_INLINE &&
prev->block_start == EXTENT_MAP_INLINE) || (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && next->block_start == extent_map_block_end(prev)))) {
return 1;
}
return 0;
}
static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
{
struct extent_map *merge = NULL;
struct rb_node *rb;
/*
* We can't modify an extent map that is in the tree and that is being
* used by another task, as it can cause that other task to see it in
* inconsistent state during the merging. We always have 1 reference for
* the tree and 1 for this task (which is unpinning the extent map or
* clearing the logging flag), so anything > 2 means it's being used by
* other tasks too.
*/
if (refcount_read(&em->refs) > 2)
return;
if (em->start != 0) { rb = rb_prev(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && mergable_maps(merge, em)) { em->start = merge->start;
em->orig_start = merge->orig_start;
em->len += merge->len;
em->block_len += merge->block_len;
em->block_start = merge->block_start;
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
}
}
rb = rb_next(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && mergable_maps(em, merge)) { em->len += merge->len;
em->block_len += merge->block_len;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
free_extent_map(merge);
}
}
/**
* unpin_extent_cache - unpin an extent from the cache
* @tree: tree to unpin the extent in
* @start: logical offset in the file
* @len: length of the extent
* @gen: generation that this extent has been modified in
*
* Called after an extent has been written to disk properly. Set the generation
* to the generation that actually added the file item to the inode so we know
* we need to sync this extent when we call fsync().
*/
int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
u64 gen)
{
int ret = 0;
struct extent_map *em;
bool prealloc = false;
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, start, len);
WARN_ON(!em || em->start != start);
if (!em)
goto out;
em->generation = gen;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
em->mod_start = em->start;
em->mod_len = em->len;
if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
prealloc = true;
clear_bit(EXTENT_FLAG_FILLING, &em->flags);
}
try_merge_map(tree, em);
if (prealloc) {
em->mod_start = em->start;
em->mod_len = em->len;
}
free_extent_map(em);
out:
write_unlock(&tree->lock);
return ret;
}
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
if (extent_map_in_tree(em))
try_merge_map(tree, em);
}
static inline void setup_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em,
int modified)
{
refcount_inc(&em->refs);
em->mod_start = em->start;
em->mod_len = em->len;
if (modified)
list_move(&em->list, &tree->modified_extents);
else
try_merge_map(tree, em);
}
static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
{
struct map_lookup *map = em->map_lookup;
u64 stripe_size = em->orig_block_len;
int i;
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_bio_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
set_extent_bits_nowait(&device->alloc_state, stripe->physical,
stripe->physical + stripe_size - 1, bits);
}
}
static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
{
struct map_lookup *map = em->map_lookup;
u64 stripe_size = em->orig_block_len;
int i;
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_bio_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
__clear_extent_bit(&device->alloc_state, stripe->physical,
stripe->physical + stripe_size - 1, bits,
0, 0, NULL, GFP_NOWAIT, NULL);
}
}
/**
* Add new extent map to the extent tree
*
* @tree: tree to insert new map in
* @em: map to insert
* @modified: indicate whether the given @em should be added to the
* modified list, which indicates the extent needs to be logged
*
* Insert @em into @tree or perform a simple forward/backward merge with
* existing mappings. The extent_map struct passed in will be inserted
* into the tree directly, with an additional reference taken, or a
* reference dropped if the merge attempt was successful.
*/
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em, int modified)
{
int ret = 0;
lockdep_assert_held_write(&tree->lock);
ret = tree_insert(&tree->map, em);
if (ret)
goto out;
setup_extent_mapping(tree, em, modified);
if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) { extent_map_device_set_bits(em, CHUNK_ALLOCATED); extent_map_device_clear_bits(em, CHUNK_TRIMMED);
}
out:
return ret;
}
static struct extent_map *
__lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len, int strict)
{
struct extent_map *em;
struct rb_node *rb_node;
struct rb_node *prev = NULL;
struct rb_node *next = NULL;
u64 end = range_end(start, len);
rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next);
if (!rb_node) {
if (prev)
rb_node = prev;
else if (next)
rb_node = next;
else
return NULL;
}
em = rb_entry(rb_node, struct extent_map, rb_node);
if (strict && !(end > em->start && start < extent_map_end(em)))
return NULL;
refcount_inc(&em->refs);
return em;
}
/**
* lookup_extent_mapping - lookup extent_map
* @tree: tree to lookup in
* @start: byte offset to start the search
* @len: length of the lookup range
*
* Find and return the first extent_map struct in @tree that intersects the
* [start, len] range. There may be additional objects in the tree that
* intersect, so check the object returned carefully to make sure that no
* additional lookups are needed.
*/
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len)
{
return __lookup_extent_mapping(tree, start, len, 1);
}
/**
* search_extent_mapping - find a nearby extent map
* @tree: tree to lookup in
* @start: byte offset to start the search
* @len: length of the lookup range
*
* Find and return the first extent_map struct in @tree that intersects the
* [start, len] range.
*
* If one can't be found, any nearby extent may be returned
*/
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len)
{
return __lookup_extent_mapping(tree, start, len, 0);
}
/**
* remove_extent_mapping - removes an extent_map from the extent tree
* @tree: extent tree to remove from
* @em: extent map being removed
*
* Removes @em from @tree. No reference counts are dropped, and no checks
* are done to see if the range is in use
*/
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase_cached(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
list_del_init(&em->list);
if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
extent_map_device_clear_bits(em, CHUNK_ALLOCATED); RB_CLEAR_NODE(&em->rb_node);
}
void replace_extent_mapping(struct extent_map_tree *tree,
struct extent_map *cur,
struct extent_map *new,
int modified)
{
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
ASSERT(extent_map_in_tree(cur));
if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
list_del_init(&cur->list); rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
RB_CLEAR_NODE(&cur->rb_node);
setup_extent_mapping(tree, new, modified);
}
static struct extent_map *next_extent_map(struct extent_map *em)
{
struct rb_node *next;
next = rb_next(&em->rb_node);
if (!next)
return NULL;
return container_of(next, struct extent_map, rb_node);
}
static struct extent_map *prev_extent_map(struct extent_map *em)
{
struct rb_node *prev;
prev = rb_prev(&em->rb_node);
if (!prev)
return NULL;
return container_of(prev, struct extent_map, rb_node);
}
/*
* Helper for btrfs_get_extent. Given an existing extent in the tree,
* the existing extent is the nearest extent to map_start,
* and an extent that you want to insert, deal with overlap and insert
* the best fitted new extent into the tree.
*/
static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
struct extent_map *existing,
struct extent_map *em,
u64 map_start)
{
struct extent_map *prev;
struct extent_map *next;
u64 start;
u64 end;
u64 start_diff;
BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); if (existing->start > map_start) {
next = existing;
prev = prev_extent_map(next);
} else {
prev = existing;
next = next_extent_map(prev);
}
start = prev ? extent_map_end(prev) : em->start;
start = max_t(u64, start, em->start);
end = next ? next->start : extent_map_end(em);
end = min_t(u64, end, extent_map_end(em));
start_diff = start - em->start;
em->start = start;
em->len = end - start;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { em->block_start += start_diff;
em->block_len = em->len;
}
return add_extent_mapping(em_tree, em, 0);
}
/**
* Add extent mapping into em_tree
*
* @fs_info: the filesystem
* @em_tree: extent tree into which we want to insert the extent mapping
* @em_in: extent we are inserting
* @start: start of the logical range btrfs_get_extent() is requesting
* @len: length of the logical range btrfs_get_extent() is requesting
*
* Note that @em_in's range may be different from [start, start+len),
* but they must be overlapped.
*
* Insert @em_in into @em_tree. In case there is an overlapping range, handle
* the -EEXIST by either:
* a) Returning the existing extent in @em_in if @start is within the
* existing em.
* b) Merge the existing extent with @em_in passed in.
*
* Return 0 on success, otherwise -EEXIST.
*
*/
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree,
struct extent_map **em_in, u64 start, u64 len)
{
int ret;
struct extent_map *em = *em_in;
ret = add_extent_mapping(em_tree, em, 0);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
* an overlapping map exists in the tree
*/
if (ret == -EEXIST) {
struct extent_map *existing;
ret = 0;
existing = search_extent_mapping(em_tree, start, len);
trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
/*
* existing will always be non-NULL, since there must be
* extent causing the -EEXIST.
*/
if (start >= existing->start && start < extent_map_end(existing)) { free_extent_map(em);
*em_in = existing;
ret = 0;
} else {
u64 orig_start = em->start;
u64 orig_len = em->len;
/*
* The existing extent map is the one nearest to
* the [start, start + len) range which overlaps
*/
ret = merge_extent_mapping(em_tree, existing,
em, start);
if (ret) {
free_extent_map(em);
*em_in = NULL;
WARN_ONCE(ret,
"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
ret, existing->start, existing->len,
orig_start, orig_len);
}
free_extent_map(existing);
}
}
ASSERT(ret == 0 || ret == -EEXIST);
return ret;
}
// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic helpers for smp ipi calls
*
* (C) Jens Axboe <jens.axboe@oracle.com> 2008
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/irq_work.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/hypervisor.h>
#include <linux/sched/clock.h>
#include <linux/nmi.h>
#include <linux/sched/debug.h>
#include <linux/jump_label.h>
#include "smpboot.h"
#include "sched/smp.h"
#define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
union cfd_seq_cnt {
u64 val;
struct {
u64 src:16;
u64 dst:16;
#define CFD_SEQ_NOCPU 0xffff
u64 type:4;
#define CFD_SEQ_QUEUE 0
#define CFD_SEQ_IPI 1
#define CFD_SEQ_NOIPI 2
#define CFD_SEQ_PING 3
#define CFD_SEQ_PINGED 4
#define CFD_SEQ_HANDLE 5
#define CFD_SEQ_DEQUEUE 6
#define CFD_SEQ_IDLE 7
#define CFD_SEQ_GOTIPI 8
#define CFD_SEQ_HDLEND 9
u64 cnt:28;
} u;
};
static char *seq_type[] = {
[CFD_SEQ_QUEUE] = "queue",
[CFD_SEQ_IPI] = "ipi",
[CFD_SEQ_NOIPI] = "noipi",
[CFD_SEQ_PING] = "ping",
[CFD_SEQ_PINGED] = "pinged",
[CFD_SEQ_HANDLE] = "handle",
[CFD_SEQ_DEQUEUE] = "dequeue (src CPU 0 == empty)",
[CFD_SEQ_IDLE] = "idle",
[CFD_SEQ_GOTIPI] = "gotipi",
[CFD_SEQ_HDLEND] = "hdlend (src CPU 0 == early)",
};
struct cfd_seq_local {
u64 ping;
u64 pinged;
u64 handle;
u64 dequeue;
u64 idle;
u64 gotipi;
u64 hdlend;
};
#endif
struct cfd_percpu {
call_single_data_t csd;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
u64 seq_queue;
u64 seq_ipi;
u64 seq_noipi;
#endif
};
struct call_function_data {
struct cfd_percpu __percpu *pcpu;
cpumask_var_t cpumask;
cpumask_var_t cpumask_ipi;
};
static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
static void flush_smp_call_function_queue(bool warn_cpu_offline);
int smpcfd_prepare_cpu(unsigned int cpu)
{
struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
cpu_to_node(cpu)))
return -ENOMEM;
if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
cpu_to_node(cpu))) {
free_cpumask_var(cfd->cpumask);
return -ENOMEM;
}
cfd->pcpu = alloc_percpu(struct cfd_percpu);
if (!cfd->pcpu) {
free_cpumask_var(cfd->cpumask);
free_cpumask_var(cfd->cpumask_ipi);
return -ENOMEM;
}
return 0;
}
int smpcfd_dead_cpu(unsigned int cpu)
{
struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
free_cpumask_var(cfd->cpumask);
free_cpumask_var(cfd->cpumask_ipi);
free_percpu(cfd->pcpu);
return 0;
}
int smpcfd_dying_cpu(unsigned int cpu)
{
/*
* The IPIs for the smp-call-function callbacks queued by other
* CPUs might arrive late, either due to hardware latencies or
* because this CPU disabled interrupts (inside stop-machine)
* before the IPIs were sent. So flush out any pending callbacks
* explicitly (without waiting for the IPIs to arrive), to
* ensure that the outgoing CPU doesn't go offline with work
* still pending.
*/
flush_smp_call_function_queue(false);
irq_work_run();
return 0;
}
void __init call_function_init(void)
{
int i;
for_each_possible_cpu(i)
init_llist_head(&per_cpu(call_single_queue, i));
smpcfd_prepare_cpu(smp_processor_id());
}
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
static DEFINE_STATIC_KEY_FALSE(csdlock_debug_enabled);
static DEFINE_STATIC_KEY_FALSE(csdlock_debug_extended);
static int __init csdlock_debug(char *str)
{
unsigned int val = 0;
if (str && !strcmp(str, "ext")) {
val = 1;
static_branch_enable(&csdlock_debug_extended);
} else
get_option(&str, &val);
if (val)
static_branch_enable(&csdlock_debug_enabled);
return 0;
}
early_param("csdlock_debug", csdlock_debug);
static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
static DEFINE_PER_CPU(void *, cur_csd_info);
static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local);
#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC)
static atomic_t csd_bug_count = ATOMIC_INIT(0);
static u64 cfd_seq;
#define CFD_SEQ(s, d, t, c) \
(union cfd_seq_cnt){ .u.src = s, .u.dst = d, .u.type = t, .u.cnt = c }
static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type)
{
union cfd_seq_cnt new, old;
new = CFD_SEQ(src, dst, type, 0);
do {
old.val = READ_ONCE(cfd_seq);
new.u.cnt = old.u.cnt + 1;
} while (cmpxchg(&cfd_seq, old.val, new.val) != old.val);
return old.val;
}
#define cfd_seq_store(var, src, dst, type) \
do { \
if (static_branch_unlikely(&csdlock_debug_extended)) \
var = cfd_seq_inc(src, dst, type); \
} while (0)
/* Record current CSD work for current CPU, NULL to erase. */
static void __csd_lock_record(struct __call_single_data *csd)
{
if (!csd) {
smp_mb(); /* NULL cur_csd after unlock. */
__this_cpu_write(cur_csd, NULL);
return;
}
__this_cpu_write(cur_csd_func, csd->func);
__this_cpu_write(cur_csd_info, csd->info);
smp_wmb(); /* func and info before csd. */
__this_cpu_write(cur_csd, csd);
smp_mb(); /* Update cur_csd before function call. */
/* Or before unlock, as the case may be. */
}
static __always_inline void csd_lock_record(struct __call_single_data *csd)
{
if (static_branch_unlikely(&csdlock_debug_enabled))
__csd_lock_record(csd);
}
static int csd_lock_wait_getcpu(struct __call_single_data *csd)
{
unsigned int csd_type;
csd_type = CSD_TYPE(csd);
if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC)
return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */
return -1;
}
static void cfd_seq_data_add(u64 val, unsigned int src, unsigned int dst,
unsigned int type, union cfd_seq_cnt *data,
unsigned int *n_data, unsigned int now)
{
union cfd_seq_cnt new[2];
unsigned int i, j, k;
new[0].val = val;
new[1] = CFD_SEQ(src, dst, type, new[0].u.cnt + 1);
for (i = 0; i < 2; i++) {
if (new[i].u.cnt <= now)
new[i].u.cnt |= 0x80000000U;
for (j = 0; j < *n_data; j++) {
if (new[i].u.cnt == data[j].u.cnt) {
/* Direct read value trumps generated one. */
if (i == 0)
data[j].val = new[i].val;
break;
}
if (new[i].u.cnt < data[j].u.cnt) {
for (k = *n_data; k > j; k--)
data[k].val = data[k - 1].val;
data[j].val = new[i].val;
(*n_data)++;
break;
}
}
if (j == *n_data) {
data[j].val = new[i].val;
(*n_data)++;
}
}
}
static const char *csd_lock_get_type(unsigned int type)
{
return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type];
}
static void csd_lock_print_extended(struct __call_single_data *csd, int cpu)
{
struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu);
unsigned int srccpu = csd->node.src;
struct call_function_data *cfd = per_cpu_ptr(&cfd_data, srccpu);
struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu);
unsigned int now;
union cfd_seq_cnt data[2 * ARRAY_SIZE(seq_type)];
unsigned int n_data = 0, i;
data[0].val = READ_ONCE(cfd_seq);
now = data[0].u.cnt;
cfd_seq_data_add(pcpu->seq_queue, srccpu, cpu, CFD_SEQ_QUEUE, data, &n_data, now);
cfd_seq_data_add(pcpu->seq_ipi, srccpu, cpu, CFD_SEQ_IPI, data, &n_data, now);
cfd_seq_data_add(pcpu->seq_noipi, srccpu, cpu, CFD_SEQ_NOIPI, data, &n_data, now);
cfd_seq_data_add(per_cpu(cfd_seq_local.ping, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PING, data, &n_data, now);
cfd_seq_data_add(per_cpu(cfd_seq_local.pinged, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED, data, &n_data, now);
cfd_seq_data_add(seq->idle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_IDLE, data, &n_data, now);
cfd_seq_data_add(seq->gotipi, CFD_SEQ_NOCPU, cpu, CFD_SEQ_GOTIPI, data, &n_data, now);
cfd_seq_data_add(seq->handle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HANDLE, data, &n_data, now);
cfd_seq_data_add(seq->dequeue, CFD_SEQ_NOCPU, cpu, CFD_SEQ_DEQUEUE, data, &n_data, now);
cfd_seq_data_add(seq->hdlend, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HDLEND, data, &n_data, now);
for (i = 0; i < n_data; i++) {
pr_alert("\tcsd: cnt(%07x): %04x->%04x %s\n",
data[i].u.cnt & ~0x80000000U, data[i].u.src,
data[i].u.dst, csd_lock_get_type(data[i].u.type));
}
pr_alert("\tcsd: cnt now: %07x\n", now);
}
/*
* Complain if too much time spent waiting. Note that only
* the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
* so waiting on other types gets much less information.
*/
static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id)
{
int cpu = -1;
int cpux;
bool firsttime;
u64 ts2, ts_delta;
call_single_data_t *cpu_cur_csd;
unsigned int flags = READ_ONCE(csd->node.u_flags);
if (!(flags & CSD_FLAG_LOCK)) {
if (!unlikely(*bug_id))
return true;
cpu = csd_lock_wait_getcpu(csd);
pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
*bug_id, raw_smp_processor_id(), cpu);
return true;
}
ts2 = sched_clock();
ts_delta = ts2 - *ts1;
if (likely(ts_delta <= CSD_LOCK_TIMEOUT))
return false;
firsttime = !*bug_id;
if (firsttime)
*bug_id = atomic_inc_return(&csd_bug_count);
cpu = csd_lock_wait_getcpu(csd);
if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu))
cpux = 0;
else
cpux = cpu;
cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0,
cpu, csd->func, csd->info);
if (cpu_cur_csd && csd != cpu_cur_csd) {
pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
*bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
READ_ONCE(per_cpu(cur_csd_info, cpux)));
} else {
pr_alert("\tcsd: CSD lock (#%d) %s.\n",
*bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
}
if (cpu >= 0) {
if (static_branch_unlikely(&csdlock_debug_extended))
csd_lock_print_extended(csd, cpu);
if (!trigger_single_cpu_backtrace(cpu))
dump_cpu_task(cpu);
if (!cpu_cur_csd) {
pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
arch_send_call_function_single_ipi(cpu);
}
}
dump_stack();
*ts1 = ts2;
return false;
}
/*
* csd_lock/csd_unlock used to serialize access to per-cpu csd resources
*
* For non-synchronous ipi calls the csd can still be in use by the
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
static void __csd_lock_wait(struct __call_single_data *csd)
{
int bug_id = 0;
u64 ts0, ts1;
ts1 = ts0 = sched_clock();
for (;;) {
if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
break;
cpu_relax();
}
smp_acquire__after_ctrl_dep();
}
static __always_inline void csd_lock_wait(struct __call_single_data *csd)
{
if (static_branch_unlikely(&csdlock_debug_enabled)) {
__csd_lock_wait(csd);
return;
}
smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
static void __smp_call_single_queue_debug(int cpu, struct llist_node *node)
{
unsigned int this_cpu = smp_processor_id();
struct cfd_seq_local *seq = this_cpu_ptr(&cfd_seq_local);
struct call_function_data *cfd = this_cpu_ptr(&cfd_data);
struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu);
cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
if (llist_add(node, &per_cpu(call_single_queue, cpu))) {
cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI);
cfd_seq_store(seq->ping, this_cpu, cpu, CFD_SEQ_PING);
send_call_function_single_ipi(cpu);
cfd_seq_store(seq->pinged, this_cpu, cpu, CFD_SEQ_PINGED);
} else {
cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI);
}
}
#else
#define cfd_seq_store(var, src, dst, type)
static void csd_lock_record(struct __call_single_data *csd)
{
}
static __always_inline void csd_lock_wait(struct __call_single_data *csd)
{
smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#endif
static __always_inline void csd_lock(struct __call_single_data *csd)
{
csd_lock_wait(csd);
csd->node.u_flags |= CSD_FLAG_LOCK;
/*
* prevent CPU from reordering the above assignment
* to ->flags with any subsequent assignments to other
* fields of the specified call_single_data_t structure:
*/
smp_wmb();
}
static __always_inline void csd_unlock(struct __call_single_data *csd)
{
WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
*/
smp_store_release(&csd->node.u_flags, 0);
}
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
void __smp_call_single_queue(int cpu, struct llist_node *node)
{
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
if (static_branch_unlikely(&csdlock_debug_extended)) {
unsigned int type;
type = CSD_TYPE(container_of(node, call_single_data_t,
node.llist));
if (type == CSD_TYPE_SYNC || type == CSD_TYPE_ASYNC) {
__smp_call_single_queue_debug(cpu, node);
return;
}
}
#endif
/*
* The list addition should be visible before sending the IPI
* handler locks the list to pull the entry off it because of
* normal cache coherency rules implied by spinlocks.
*
* If IPIs can go out of order to the cache coherency protocol
* in an architecture, sufficient synchronisation should be added
* to arch code to make it appear to obey cache coherency WRT
* locking and barrier primitives. Generic code isn't really
* equipped to do the right thing...
*/
if (llist_add(node, &per_cpu(call_single_queue, cpu))) send_call_function_single_ipi(cpu);
}
/*
* Insert a previously allocated call_single_data_t element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
static int generic_exec_single(int cpu, struct __call_single_data *csd)
{
if (cpu == smp_processor_id()) {
smp_call_func_t func = csd->func;
void *info = csd->info;
unsigned long flags;
/*
* We can unlock early even for the synchronous on-stack case,
* since we're doing this from the same CPU..
*/
csd_lock_record(csd);
csd_unlock(csd);
local_irq_save(flags);
func(info);
csd_lock_record(NULL);
local_irq_restore(flags);
return 0;
}
if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
csd_unlock(csd);
return -ENXIO;
}
__smp_call_single_queue(cpu, &csd->node.llist);
return 0;
}
/**
* generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
*
* Invoked by arch to handle an IPI for call function single.
* Must be called with interrupts disabled.
*/
void generic_smp_call_function_single_interrupt(void)
{
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_GOTIPI);
flush_smp_call_function_queue(true);
}
/**
* flush_smp_call_function_queue - Flush pending smp-call-function callbacks
*
* @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
* offline CPU. Skip this check if set to 'false'.
*
* Flush any pending smp-call-function callbacks queued on this CPU. This is
* invoked by the generic IPI handler, as well as by a CPU about to go offline,
* to ensure that all pending IPI callbacks are run before it goes completely
* offline.
*
* Loop through the call_single_queue and run all the queued callbacks.
* Must be called with interrupts disabled.
*/
static void flush_smp_call_function_queue(bool warn_cpu_offline)
{
call_single_data_t *csd, *csd_next;
struct llist_node *entry, *prev;
struct llist_head *head;
static bool warned;
lockdep_assert_irqs_disabled();
head = this_cpu_ptr(&call_single_queue);
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->handle, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_HANDLE);
entry = llist_del_all(head);
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->dequeue,
/* Special meaning of source cpu: 0 == queue empty */
entry ? CFD_SEQ_NOCPU : 0,
smp_processor_id(), CFD_SEQ_DEQUEUE);
entry = llist_reverse_order(entry);
/* There shouldn't be any pending callbacks on an offline CPU. */
if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
!warned && entry != NULL)) {
warned = true;
WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
/*
* We don't have to use the _safe() variant here
* because we are not invoking the IPI handlers yet.
*/
llist_for_each_entry(csd, entry, node.llist) {
switch (CSD_TYPE(csd)) {
case CSD_TYPE_ASYNC:
case CSD_TYPE_SYNC:
case CSD_TYPE_IRQ_WORK:
pr_warn("IPI callback %pS sent to offline CPU\n",
csd->func);
break;
case CSD_TYPE_TTWU:
pr_warn("IPI task-wakeup sent to offline CPU\n");
break;
default:
pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
CSD_TYPE(csd));
break;
}
}
}
/*
* First; run all SYNC callbacks, people are waiting for us.
*/
prev = NULL;
llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
/* Do we wait until *after* callback? */
if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
smp_call_func_t func = csd->func;
void *info = csd->info;
if (prev) {
prev->next = &csd_next->node.llist;
} else {
entry = &csd_next->node.llist;
}
csd_lock_record(csd);
func(info);
csd_unlock(csd);
csd_lock_record(NULL);
} else {
prev = &csd->node.llist;
}
}
if (!entry) {
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend,
0, smp_processor_id(),
CFD_SEQ_HDLEND);
return;
}
/*
* Second; run all !SYNC callbacks.
*/
prev = NULL;
llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
int type = CSD_TYPE(csd);
if (type != CSD_TYPE_TTWU) {
if (prev) {
prev->next = &csd_next->node.llist;
} else {
entry = &csd_next->node.llist;
}
if (type == CSD_TYPE_ASYNC) {
smp_call_func_t func = csd->func;
void *info = csd->info;
csd_lock_record(csd);
csd_unlock(csd);
func(info);
csd_lock_record(NULL);
} else if (type == CSD_TYPE_IRQ_WORK) {
irq_work_single(csd);
}
} else {
prev = &csd->node.llist;
}
}
/*
* Third; only CSD_TYPE_TTWU is left, issue those.
*/
if (entry)
sched_ttwu_pending(entry);
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_HDLEND);
}
void flush_smp_call_function_from_idle(void)
{
unsigned long flags;
if (llist_empty(this_cpu_ptr(&call_single_queue)))
return;
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_IDLE);
local_irq_save(flags);
flush_smp_call_function_queue(true);
if (local_softirq_pending())
do_softirq();
local_irq_restore(flags);
}
/*
* smp_call_function_single - Run a function on a specific CPU
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: If true, wait until function has completed on other CPUs.
*
* Returns 0 on success, else a negative status code.
*/
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int wait)
{
call_single_data_t *csd;
call_single_data_t csd_stack = {
.node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, },
};
int this_cpu;
int err;
/*
* prevent preemption and reschedule on another processor,
* as well as CPU removal
*/
this_cpu = get_cpu();
/*
* Can deadlock when called with interrupts disabled.
* We allow cpu's that are not yet online though, as no one else can
* send smp call function interrupt to this cpu and as such deadlocks
* can't happen.
*/
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
&& !oops_in_progress);
/*
* When @wait we can deadlock when we interrupt between llist_add() and
* arch_send_call_function_ipi*(); when !@wait we can deadlock due to
* csd_lock() on because the interrupt context uses the same csd
* storage.
*/
WARN_ON_ONCE(!in_task());
csd = &csd_stack;
if (!wait) {
csd = this_cpu_ptr(&csd_data);
csd_lock(csd);
}
csd->func = func;
csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
csd->node.src = smp_processor_id();
csd->node.dst = cpu;
#endif
err = generic_exec_single(cpu, csd);
if (wait)
csd_lock_wait(csd);
put_cpu();
return err;
}
EXPORT_SYMBOL(smp_call_function_single);
/**
* smp_call_function_single_async() - Run an asynchronous function on a
* specific CPU.
* @cpu: The CPU to run on.
* @csd: Pre-allocated and setup data structure
*
* Like smp_call_function_single(), but the call is asynchonous and
* can thus be done from contexts with disabled interrupts.
*
* The caller passes his own pre-allocated data structure
* (ie: embedded in an object) and is responsible for synchronizing it
* such that the IPIs performed on the @csd are strictly serialized.
*
* If the function is called with one csd which has not yet been
* processed by previous call to smp_call_function_single_async(), the
* function will return immediately with -EBUSY showing that the csd
* object is still in progress.
*
* NOTE: Be careful, there is unfortunately no current debugging facility to
* validate the correctness of this serialization.
*
* Return: %0 on success or negative errno value on error
*/
int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
{
int err = 0;
preempt_disable();
if (csd->node.u_flags & CSD_FLAG_LOCK) {
err = -EBUSY;
goto out;
}
csd->node.u_flags = CSD_FLAG_LOCK;
smp_wmb();
err = generic_exec_single(cpu, csd);
out:
preempt_enable();
return err;
}
EXPORT_SYMBOL_GPL(smp_call_function_single_async);
/*
* smp_call_function_any - Run a function on any of the given cpus
* @mask: The mask of cpus it can run on.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: If true, wait until function has completed.
*
* Returns 0 on success, else a negative status code (if no cpus were online).
*
* Selection preference:
* 1) current cpu if in @mask
* 2) any cpu of current node if in @mask
* 3) any other online cpu in @mask
*/
int smp_call_function_any(const struct cpumask *mask,
smp_call_func_t func, void *info, int wait)
{
unsigned int cpu;
const struct cpumask *nodemask;
int ret;
/* Try for same CPU (cheapest) */
cpu = get_cpu();
if (cpumask_test_cpu(cpu, mask))
goto call;
/* Try for same node. */
nodemask = cpumask_of_node(cpu_to_node(cpu));
for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
cpu = cpumask_next_and(cpu, nodemask, mask)) {
if (cpu_online(cpu))
goto call;
}
/* Any online will do: smp_call_function_single handles nr_cpu_ids. */
cpu = cpumask_any_and(mask, cpu_online_mask);
call:
ret = smp_call_function_single(cpu, func, info, wait);
put_cpu();
return ret;
}
EXPORT_SYMBOL_GPL(smp_call_function_any);
/*
* Flags to be used as scf_flags argument of smp_call_function_many_cond().
*
* %SCF_WAIT: Wait until function execution is completed
* %SCF_RUN_LOCAL: Run also locally if local cpu is set in cpumask
*/
#define SCF_WAIT (1U << 0)
#define SCF_RUN_LOCAL (1U << 1)
static void smp_call_function_many_cond(const struct cpumask *mask,
smp_call_func_t func, void *info,
unsigned int scf_flags,
smp_cond_func_t cond_func)
{
int cpu, last_cpu, this_cpu = smp_processor_id();
struct call_function_data *cfd;
bool wait = scf_flags & SCF_WAIT;
bool run_remote = false;
bool run_local = false;
int nr_cpus = 0;
lockdep_assert_preemption_disabled();
/*
* Can deadlock when called with interrupts disabled.
* We allow cpu's that are not yet online though, as no one else can
* send smp call function interrupt to this cpu and as such deadlocks
* can't happen.
*/
if (cpu_online(this_cpu) && !oops_in_progress &&
!early_boot_irqs_disabled)
lockdep_assert_irqs_enabled();
/*
* When @wait we can deadlock when we interrupt between llist_add() and
* arch_send_call_function_ipi*(); when !@wait we can deadlock due to
* csd_lock() on because the interrupt context uses the same csd
* storage.
*/
WARN_ON_ONCE(!in_task());
/* Check if we need local execution. */
if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask))
run_local = true;
/* Check if we need remote execution, i.e., any CPU excluding this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
if (cpu == this_cpu)
cpu = cpumask_next_and(cpu, mask, cpu_online_mask); if (cpu < nr_cpu_ids)
run_remote = true;
if (run_remote) {
cfd = this_cpu_ptr(&cfd_data);
cpumask_and(cfd->cpumask, mask, cpu_online_mask);
__cpumask_clear_cpu(this_cpu, cfd->cpumask);
cpumask_clear(cfd->cpumask_ipi);
for_each_cpu(cpu, cfd->cpumask) { struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu);
call_single_data_t *csd = &pcpu->csd;
if (cond_func && !cond_func(cpu, info))
continue;
csd_lock(csd);
if (wait)
csd->node.u_flags |= CSD_TYPE_SYNC; csd->func = func;
csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
csd->node.src = smp_processor_id();
csd->node.dst = cpu;
#endif
cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
nr_cpus++;
last_cpu = cpu;
cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI);
} else {
cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI);
}
}
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->ping, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PING);
/*
* Choose the most efficient way to send an IPI. Note that the
* number of CPUs might be zero due to concurrent changes to the
* provided mask.
*/
if (nr_cpus == 1) send_call_function_single_ipi(last_cpu); else if (likely(nr_cpus > 1))
arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->pinged, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED);
}
if (run_local && (!cond_func || cond_func(this_cpu, info))) {
unsigned long flags;
local_irq_save(flags);
func(info);
local_irq_restore(flags);
}
if (run_remote && wait) { for_each_cpu(cpu, cfd->cpumask) {
call_single_data_t *csd;
csd = &per_cpu_ptr(cfd->pcpu, cpu)->csd;
csd_lock_wait(csd);
}
}
}
/**
* smp_call_function_many(): Run a function on a set of CPUs.
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait
* (atomically) until function has completed on other CPUs. If
* %SCF_RUN_LOCAL is set, the function will also be run locally
* if the local CPU is set in the @cpumask.
*
* If @wait is true, then returns once @func has returned.
*
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler. Preemption
* must be disabled when calling this function.
*/
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
{
smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL);
}
EXPORT_SYMBOL(smp_call_function_many);
/**
* smp_call_function(): Run a function on all other CPUs.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: If true, wait (atomically) until function has completed
* on other CPUs.
*
* Returns 0.
*
* If @wait is true, then returns once @func has returned; otherwise
* it returns just before the target cpu calls @func.
*
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler.
*/
void smp_call_function(smp_call_func_t func, void *info, int wait)
{
preempt_disable();
smp_call_function_many(cpu_online_mask, func, info, wait);
preempt_enable();
}
EXPORT_SYMBOL(smp_call_function);
/* Setup configured maximum number of CPUs to activate */
unsigned int setup_max_cpus = NR_CPUS;
EXPORT_SYMBOL(setup_max_cpus);
/*
* Setup routine for controlling SMP activation
*
* Command-line option of "nosmp" or "maxcpus=0" will disable SMP
* activation entirely (the MPS table probe still happens, though).
*
* Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
* greater than 0, limits the maximum number of CPUs activated in
* SMP mode to <NUM>.
*/
void __weak arch_disable_smp_support(void) { }
static int __init nosmp(char *str)
{
setup_max_cpus = 0;
arch_disable_smp_support();
return 0;
}
early_param("nosmp", nosmp);
/* this is hard limit */
static int __init nrcpus(char *str)
{
int nr_cpus;
if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids)
nr_cpu_ids = nr_cpus;
return 0;
}
early_param("nr_cpus", nrcpus);
static int __init maxcpus(char *str)
{
get_option(&str, &setup_max_cpus);
if (setup_max_cpus == 0)
arch_disable_smp_support();
return 0;
}
early_param("maxcpus", maxcpus);
/* Setup number of possible processor ids */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
void __init setup_nr_cpu_ids(void)
{
nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
}
/* Called by boot processor to activate the rest. */
void __init smp_init(void)
{
int num_nodes, num_cpus;
idle_threads_init();
cpuhp_threads_init();
pr_info("Bringing up secondary CPUs ...\n");
bringup_nonboot_cpus(setup_max_cpus);
num_nodes = num_online_nodes();
num_cpus = num_online_cpus();
pr_info("Brought up %d node%s, %d CPU%s\n",
num_nodes, (num_nodes > 1 ? "s" : ""),
num_cpus, (num_cpus > 1 ? "s" : ""));
/* Any cleanup work */
smp_cpus_done(setup_max_cpus);
}
/*
* on_each_cpu_cond(): Call a function on each processor for which
* the supplied function cond_func returns true, optionally waiting
* for all the required CPUs to finish. This may include the local
* processor.
* @cond_func: A callback function that is passed a cpu id and
* the info parameter. The function is called
* with preemption disabled. The function should
* return a blooean value indicating whether to IPI
* the specified CPU.
* @func: The function to run on all applicable CPUs.
* This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to both functions.
* @wait: If true, wait (atomically) until function has
* completed on other CPUs.
*
* Preemption is disabled to protect against CPUs going offline but not online.
* CPUs going online during the call will not be seen or sent an IPI.
*
* You must not call this function with disabled interrupts or
* from a hardware interrupt handler or from a bottom half handler.
*/
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
void *info, bool wait, const struct cpumask *mask)
{
unsigned int scf_flags = SCF_RUN_LOCAL;
if (wait)
scf_flags |= SCF_WAIT;
preempt_disable();
smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
static void do_nothing(void *unused)
{
}
/**
* kick_all_cpus_sync - Force all cpus out of idle
*
* Used to synchronize the update of pm_idle function pointer. It's
* called after the pointer is updated and returns after the dummy
* callback function has been executed on all cpus. The execution of
* the function can only happen on the remote cpus after they have
* left the idle function which had been called via pm_idle function
* pointer. So it's guaranteed that nothing uses the previous pointer
* anymore.
*/
void kick_all_cpus_sync(void)
{
/* Make sure the change is visible before we kick the cpus */
smp_mb();
smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
/**
* wake_up_all_idle_cpus - break all cpus out of idle
* wake_up_all_idle_cpus try to break all cpus which is in idle state even
* including idle polling cpus, for non-idle cpus, we will do nothing
* for them.
*/
void wake_up_all_idle_cpus(void)
{
int cpu;
preempt_disable();
for_each_online_cpu(cpu) {
if (cpu == smp_processor_id())
continue;
wake_up_if_idle(cpu);
}
preempt_enable();
}
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
/**
* struct smp_call_on_cpu_struct - Call a function on a specific CPU
* @work: &work_struct
* @done: &completion to signal
* @func: function to call
* @data: function's data argument
* @ret: return value from @func
* @cpu: target CPU (%-1 for any CPU)
*
* Used to call a function on a specific cpu and wait for it to return.
* Optionally make sure the call is done on a specified physical cpu via vcpu
* pinning in order to support virtualized environments.
*/
struct smp_call_on_cpu_struct {
struct work_struct work;
struct completion done;
int (*func)(void *);
void *data;
int ret;
int cpu;
};
static void smp_call_on_cpu_callback(struct work_struct *work)
{
struct smp_call_on_cpu_struct *sscs;
sscs = container_of(work, struct smp_call_on_cpu_struct, work);
if (sscs->cpu >= 0)
hypervisor_pin_vcpu(sscs->cpu);
sscs->ret = sscs->func(sscs->data);
if (sscs->cpu >= 0)
hypervisor_pin_vcpu(-1);
complete(&sscs->done);
}
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
{
struct smp_call_on_cpu_struct sscs = {
.done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
.func = func,
.data = par,
.cpu = phys ? cpu : -1,
};
INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback);
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
return -ENXIO;
queue_work_on(cpu, system_wq, &sscs.work);
wait_for_completion(&sscs.done);
return sscs.ret;
}
EXPORT_SYMBOL_GPL(smp_call_on_cpu);
/*
* Generic process-grouping system.
*
* Based originally on the cpuset system, extracted by Paul Menage
* Copyright (C) 2006 Google, Inc
*
* Notifications support
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* Copyright notices from the original cpuset code:
* --------------------------------------------------
* Copyright (C) 2003 BULL SA.
* Copyright (C) 2004-2006 Silicon Graphics, Inc.
*
* Portions derived from Patrick Mochel's sysfs code.
* sysfs is Copyright (c) 2001-3 Patrick Mochel
*
* 2003-10-10 Written by Simon Derr.
* 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson.
* ---------------------------------------------------
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "cgroup-internal.h"
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/psi.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>
#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
MAX_CFTYPE_NAME + 2)
/* let's not notify more than 100 times per second */
#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
/*
* To avoid confusing the compiler (and generating warnings) with code
* that attempts to access what would be a 0-element array (i.e. sized
* to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
* constant expression can be added.
*/
#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0)
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
* css_set_lock protects task->cgroups pointer, the list of css_set
* objects, and the chain of tasks off each css_set.
*
* These locks are exported if CONFIG_PROVE_RCU so that accessors in
* cgroup.h can use them for lockdep annotations.
*/
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
* grabbing cgroup_mutex.
*/
static DEFINE_SPINLOCK(cgroup_idr_lock);
/*
* Protects cgroup_file->kn for !self csses. It synchronizes notifications
* against file removal/re-creation across css hiding.
*/
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
#define cgroup_assert_mutex_or_rcu_locked() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&cgroup_mutex), \
"cgroup_mutex or RCU read lock required");
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
* destruction work items don't end up filling up max_active of system_wq
* which may lead to deadlock.
*/
static struct workqueue_struct *cgroup_destroy_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x) \
DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
/* the default hierarchy */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
* The default hierarchy always exists but is hidden until mounted for the
* first time. This is for backward compatibility.
*/
static bool cgrp_dfl_visible;
/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;
/* some controllers can be threaded on the default hierarchy */
static u16 cgrp_dfl_threaded_ss_mask;
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);
/*
* Assign a monotonically increasing serial number to csses. It guarantees
* cgroups with bigger numbers are newer than those with smaller numbers.
* Also, as csses are always appended to the parent's ->children list, it
* guarantees that sibling csses are always sorted in the ascending serial
* number order on the list. Protected by cgroup_mutex.
*/
static u64 css_serial_nr_next = 1;
/*
* These bitmasks identify subsystems with specific features to avoid
* having to do iterative checks repeatedly.
*/
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
.ns.count = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
.ns.inum = PROC_CGROUP_INIT_INO,
.root_cset = &init_css_set,
};
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
/* cgroup optional features */
enum cgroup_opt_features {
#ifdef CONFIG_PSI
OPT_FEATURE_PRESSURE,
#endif
OPT_FEATURE_COUNT
};
static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
#ifdef CONFIG_PSI
"pressure",
#endif
};
static u16 cgroup_feature_disable_mask __read_mostly;
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
/**
* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
* @ssid: subsys ID of interest
*
* cgroup_subsys_enabled() can only be used with literal subsys names which
* is fine for individual subsystems but unsuitable for cgroup core. This
* is slower static_key_enabled() based test indexed by @ssid.
*/
bool cgroup_ssid_enabled(int ssid)
{
if (!CGROUP_HAS_SUBSYS_CONFIG)
return false;
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
/**
* cgroup_on_dfl - test whether a cgroup is on the default hierarchy
* @cgrp: the cgroup of interest
*
* The default hierarchy is the v2 interface of cgroup and this function
* can be used to test whether a cgroup is on the default hierarchy for
* cases where a subsystem should behave differently depending on the
* interface version.
*
* List of changed behaviors:
*
* - Mount options "noprefix", "xattr", "clone_children", "release_agent"
* and "name" are disallowed.
*
* - When mounting an existing superblock, mount options should match.
*
* - Remount is disallowed.
*
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
* "cgroup.procs" instead.
*
* - "cgroup.procs" is not sorted. pids will be unique unless they got
* recycled in-between reads.
*
* - "release_agent" and "notify_on_release" are removed. Replacement
* notification mechanism will be implemented.
*
* - "cgroup.clone_children" is removed.
*
* - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
* and its descendants contain no task; otherwise, 1. The file also
* generates kernfs notification which can be monitored through poll and
* [di]notify when the value of the file changes.
*
* - cpuset: tasks will be kept in empty cpusets when hotplug happens and
* take masks of ancestors with non-empty cpus/mems, instead of being
* moved to an ancestor.
*
* - cpuset: a task can be moved into an empty cpuset, and again it takes
* masks of ancestors.
*
* - blkcg: blk-throttle becomes properly hierarchical.
*
* - debug: disallowed on the default hierarchy.
*/
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
return cgrp->root == &cgrp_dfl_root;
}
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
gfp_t gfp_mask)
{
int ret;
idr_preload(gfp_mask);
spin_lock_bh(&cgroup_idr_lock);
ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
spin_unlock_bh(&cgroup_idr_lock);
idr_preload_end();
return ret;
}
static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
void *ret;
spin_lock_bh(&cgroup_idr_lock);
ret = idr_replace(idr, ptr, id);
spin_unlock_bh(&cgroup_idr_lock);
return ret;
}
static void cgroup_idr_remove(struct idr *idr, int id)
{
spin_lock_bh(&cgroup_idr_lock);
idr_remove(idr, id);
spin_unlock_bh(&cgroup_idr_lock);
}
static bool cgroup_has_tasks(struct cgroup *cgrp)
{
return cgrp->nr_populated_csets;
}
bool cgroup_is_threaded(struct cgroup *cgrp)
{
return cgrp->dom_cgrp != cgrp;
}
/* can @cgrp host both domain and threaded children? */
static bool cgroup_is_mixable(struct cgroup *cgrp)
{
/*
* Root isn't under domain level resource control exempting it from
* the no-internal-process constraint, so it can serve as a thread
* root and a parent of resource domains at the same time.
*/
return !cgroup_parent(cgrp);
}
/* can @cgrp become a thread root? Should always be true for a thread root */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
/* mixables don't care */
if (cgroup_is_mixable(cgrp))
return true;
/* domain roots can't be nested under threaded */
if (cgroup_is_threaded(cgrp))
return false;
/* can only have either domain or threaded children */
if (cgrp->nr_populated_domain_children)
return false;
/* and no domain controllers can be enabled */
if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
return false;
return true;
}
/* is @cgrp root of a threaded subtree? */
bool cgroup_is_thread_root(struct cgroup *cgrp)
{
/* thread root should be a domain */
if (cgroup_is_threaded(cgrp))
return false;
/* a domain w/ threaded children is a thread root */
if (cgrp->nr_threaded_children)
return true;
/*
* A domain which has tasks and explicit threaded controllers
* enabled is a thread root.
*/
if (cgroup_has_tasks(cgrp) &&
(cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
return true;
return false;
}
/* a domain which isn't connected to the root w/o brekage can't be used */
static bool cgroup_is_valid_domain(struct cgroup *cgrp)
{
/* the cgroup itself can be a thread root */
if (cgroup_is_threaded(cgrp))
return false;
/* but the ancestors can't be unless mixable */
while ((cgrp = cgroup_parent(cgrp))) {
if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
return false;
if (cgroup_is_threaded(cgrp))
return false;
}
return true;
}
/* subsystems visibly enabled on a cgroup */
static u16 cgroup_control(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
u16 root_ss_mask = cgrp->root->subsys_mask;
if (parent) {
u16 ss_mask = parent->subtree_control;
/* threaded cgroups can only have threaded controllers */
if (cgroup_is_threaded(cgrp))
ss_mask &= cgrp_dfl_threaded_ss_mask;
return ss_mask;
}
if (cgroup_on_dfl(cgrp))
root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
cgrp_dfl_implicit_ss_mask);
return root_ss_mask;
}
/* subsystems enabled on a cgroup */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
if (parent) {
u16 ss_mask = parent->subtree_ss_mask;
/* threaded cgroups can only have threaded controllers */
if (cgroup_is_threaded(cgrp))
ss_mask &= cgrp_dfl_threaded_ss_mask;
return ss_mask;
}
return cgrp->root->subsys_mask;
}
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
*
* Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
* function must be called either under cgroup_mutex or rcu_read_lock() and
* the caller is responsible for pinning the returned css if it wants to
* keep accessing it outside the said locks. This function may return
* %NULL if @cgrp doesn't have @subsys_id enabled.
*/
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
if (CGROUP_HAS_SUBSYS_CONFIG && ss)
return rcu_dereference_check(cgrp->subsys[ss->id],
lockdep_is_held(&cgroup_mutex));
else
return &cgrp->self;
}
/**
* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
* Find and get @cgrp's css associated with @ss. If the css doesn't exist
* or is offline, %NULL is returned.
*/
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
rcu_read_lock();
css = cgroup_css(cgrp, ss);
if (css && !css_tryget_online(css))
css = NULL;
rcu_read_unlock();
return css;
}
/**
* cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
*
* Similar to cgroup_css() but returns the effective css, which is defined
* as the matching css of the nearest ancestor including self which has @ss
* enabled. If @ss is associated with the hierarchy @cgrp is on, this
* function is guaranteed to return non-NULL css.
*/
static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
lockdep_assert_held(&cgroup_mutex);
if (!ss)
return &cgrp->self;
/*
* This function is used while updating css associations and thus
* can't test the csses directly. Test ss_mask.
*/
while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
cgrp = cgroup_parent(cgrp);
if (!cgrp)
return NULL;
}
return cgroup_css(cgrp, ss);
}
/**
* cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
* Find and get the effective css of @cgrp for @ss. The effective css is
* defined as the matching css of the nearest ancestor including self which
* has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
* the root css is returned, so this function always returns a valid css.
*
* The returned css is not guaranteed to be online, and therefore it is the
* callers responsibility to try get a reference for it.
*/
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
if (!CGROUP_HAS_SUBSYS_CONFIG)
return NULL;
do {
css = cgroup_css(cgrp, ss);
if (css)
return css;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
return init_css_set.subsys[ss->id];
}
/**
* cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
* Find and get the effective css of @cgrp for @ss. The effective css is
* defined as the matching css of the nearest ancestor including self which
* has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
* the root css is returned, so this function always returns a valid css.
* The returned css must be put using css_put().
*/
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
if (!CGROUP_HAS_SUBSYS_CONFIG)
return NULL;
rcu_read_lock();
do {
css = cgroup_css(cgrp, ss);
if (css && css_tryget_online(css))
goto out_unlock;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
css = init_css_set.subsys[ss->id];
css_get(css);
out_unlock:
rcu_read_unlock();
return css;
}
EXPORT_SYMBOL_GPL(cgroup_get_e_css);
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
}
/**
* __cgroup_task_count - count the number of tasks in a cgroup. The caller
* is responsible for taking the css_set_lock.
* @cgrp: the cgroup in question
*/
int __cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
struct cgrp_cset_link *link;
lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += link->cset->nr_tasks;
return count;
}
/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
*/
int cgroup_task_count(const struct cgroup *cgrp)
{
int count;
spin_lock_irq(&css_set_lock);
count = __cgroup_task_count(cgrp);
spin_unlock_irq(&css_set_lock);
return count;
}
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of_cft(of);
/*
* This is open and unprotected implementation of cgroup_css().
* seq_css() is only called from a kernfs file operation which has
* an active reference on the file. Because all the subsystem
* files are drained before a css is disassociated with a cgroup,
* the matching css from the cgroup's subsys table is guaranteed to
* be and stay valid until the enclosing operation is complete.
*/
if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
else
return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);
/**
* for_each_css - iterate all css's of a cgroup
* @css: the iteration cursor
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
* Should be called under cgroup_[tree_]mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = rcu_dereference_check( \
(cgrp)->subsys[(ssid)], \
lockdep_is_held(&cgroup_mutex)))) { } \
else
/**
* for_each_e_css - iterate all effective css's of a cgroup
* @css: the iteration cursor
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
* Should be called under cgroup_[tree_]mutex.
*/
#define for_each_e_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = cgroup_e_css_by_mask(cgrp, \
cgroup_subsys[(ssid)]))) \
; \
else
/**
* do_each_subsys_mask - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
* @ss_mask: the bitmask
*
* The block will only run for cases where the ssid-th bit (1 << ssid) of
* @ss_mask is set.
*/
#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
unsigned long __ss_mask = (ss_mask); \
if (!CGROUP_HAS_SUBSYS_CONFIG) { \
(ssid) = 0; \
break; \
} \
for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
(ss) = cgroup_subsys[ssid]; \
{
#define while_each_subsys_mask() \
} \
} \
} while (false)
/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp) \
list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
if (({ lockdep_assert_held(&cgroup_mutex); \
cgroup_is_dead(child); })) \
; \
else
/* walk live descendants in pre order */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
if (({ lockdep_assert_held(&cgroup_mutex); \
(dsct) = (d_css)->cgroup; \
cgroup_is_dead(dsct); })) \
; \
else
/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
if (({ lockdep_assert_held(&cgroup_mutex); \
(dsct) = (d_css)->cgroup; \
cgroup_is_dead(dsct); })) \
; \
else
/*
* The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
* for each subsystem. Also used to anchor the list of css_sets. Not
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
struct css_set init_css_set = {
.refcount = REFCOUNT_INIT(1),
.dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
/*
* The following field is re-initialized when this cset gets linked
* in cgroup_init(). However, let's initialize the field
* statically too so that the default cgroup can be accessed safely
* early during boot.
*/
.dfl_cgrp = &cgrp_dfl_root.cgrp,
};
static int css_set_count = 1; /* 1 for init_css_set */
static bool css_set_threaded(struct css_set *cset)
{
return cset->dom_cset != cset;
}
/**
* css_set_populated - does a css_set contain any tasks?
* @cset: target css_set
*
* css_set_populated() should be the same as !!cset->nr_tasks at steady
* state. However, css_set_populated() can be called while a task is being
* added to or removed from the linked list before the nr_tasks is
* properly updated. Hence, we can't just look at ->nr_tasks here.
*/
static bool css_set_populated(struct css_set *cset)
{
lockdep_assert_held(&css_set_lock);
return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}
/**
* cgroup_update_populated - update the populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
* One of the css_sets associated with @cgrp is either getting its first
* task or losing the last. Update @cgrp->nr_populated_* accordingly. The
* count is propagated towards root so that a given cgroup's
* nr_populated_children is zero iff none of its descendants contain any
* tasks.
*
* @cgrp's interface file "cgroup.populated" is zero if both
* @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
* 1 otherwise. When the sum changes from or to zero, userland is notified
* that the content of the interface file has changed. This can be used to
* detect when @cgrp and its descendants become populated or empty.
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
struct cgroup *child = NULL;
int adj = populated ? 1 : -1;
lockdep_assert_held(&css_set_lock);
do {
bool was_populated = cgroup_is_populated(cgrp);
if (!child) {
cgrp->nr_populated_csets += adj;
} else {
if (cgroup_is_threaded(child))
cgrp->nr_populated_threaded_children += adj;
else
cgrp->nr_populated_domain_children += adj;
}
if (was_populated == cgroup_is_populated(cgrp))
break;
cgroup1_check_for_release(cgrp);
TRACE_CGROUP_PATH(notify_populated, cgrp,
cgroup_is_populated(cgrp));
cgroup_file_notify(&cgrp->events_file);
child = cgrp;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
}
/**
* css_set_update_populated - update populated state of a css_set
* @cset: target css_set
* @populated: whether @cset is populated or depopulated
*
* @cset is either getting the first task or losing the last. Update the
* populated counters of all associated cgroups accordingly.
*/
static void css_set_update_populated(struct css_set *cset, bool populated)
{
struct cgrp_cset_link *link;
lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
cgroup_update_populated(link->cgrp, populated);
}
/*
* @task is leaving, advance task iterators which are pointing to it so
* that they can resume at the next position. Advancing an iterator might
* remove it from the list, use safe walk. See css_task_iter_skip() for
* details.
*/
static void css_set_skip_task_iters(struct css_set *cset,
struct task_struct *task)
{
struct css_task_iter *it, *pos;
list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
css_task_iter_skip(it, task);
}
/**
* css_set_move_task - move a task from one css_set to another
* @task: task being moved
* @from_cset: css_set @task currently belongs to (may be NULL)
* @to_cset: new css_set @task is being moved to (may be NULL)
* @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
*
* Move @task from @from_cset to @to_cset. If @task didn't belong to any
* css_set, @from_cset can be NULL. If @task is being disassociated
* instead of moved, @to_cset can be NULL.
*
* This function automatically handles populated counter updates and
* css_task_iter adjustments but the caller is responsible for managing
* @from_cset and @to_cset's reference counts.
*/
static void css_set_move_task(struct task_struct *task,
struct css_set *from_cset, struct css_set *to_cset,
bool use_mg_tasks)
{
lockdep_assert_held(&css_set_lock);
if (to_cset && !css_set_populated(to_cset))
css_set_update_populated(to_cset, true);
if (from_cset) {
WARN_ON_ONCE(list_empty(&task->cg_list));
css_set_skip_task_iters(from_cset, task);
list_del_init(&task->cg_list);
if (!css_set_populated(from_cset))
css_set_update_populated(from_cset, false);
} else {
WARN_ON_ONCE(!list_empty(&task->cg_list));
}
if (to_cset) {
/*
* We are synchronized through cgroup_threadgroup_rwsem
* against PF_EXITING setting such that we can't race
* against cgroup_exit()/cgroup_free() dropping the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
cgroup_move_task(task, to_cset);
list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
&to_cset->tasks);
}
}
/*
* hash table for cgroup groups. This improves the performance to find
* an existing css_set. This hash doesn't (currently) take into
* account cgroups in empty hierarchies.
*/
#define CSS_SET_HASH_BITS 7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
unsigned long key = 0UL;
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
key += (unsigned long)css[i];
key = (key >> 16) ^ key;
return key;
}
void put_css_set_locked(struct css_set *cset)
{
struct cgrp_cset_link *link, *tmp_link;
struct cgroup_subsys *ss;
int ssid;
lockdep_assert_held(&css_set_lock);
if (!refcount_dec_and_test(&cset->refcount))
return;
WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
/* This css_set is dead. Unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
css_put(cset->subsys[ssid]);
}
hash_del(&cset->hlist);
css_set_count--;
list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
list_del(&link->cset_link);
list_del(&link->cgrp_link);
if (cgroup_parent(link->cgrp))
cgroup_put(link->cgrp);
kfree(link);
}
if (css_set_threaded(cset)) {
list_del(&cset->threaded_csets_node);
put_css_set_locked(cset->dom_cset);
}
kfree_rcu(cset, rcu_head);
}
/**
* compare_css_sets - helper function for find_existing_css_set().
* @cset: candidate css_set being tested
* @old_cset: existing css_set for a task
* @new_cgrp: cgroup that's being entered by the task
* @template: desired set of css pointers in css_set (pre-calculated)
*
* Returns true if "cset" matches "old_cset" except for the hierarchy
* which "new_cgrp" belongs to, for which it should match "new_cgrp".
*/
static bool compare_css_sets(struct css_set *cset,
struct css_set *old_cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
struct cgroup *new_dfl_cgrp;
struct list_head *l1, *l2;
/*
* On the default hierarchy, there can be csets which are
* associated with the same set of cgroups but different csses.
* Let's first ensure that csses match.
*/
if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
/* @cset's domain should match the default cgroup's */
if (cgroup_on_dfl(new_cgrp))
new_dfl_cgrp = new_cgrp;
else
new_dfl_cgrp = old_cset->dfl_cgrp;
if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
return false;
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in hierarchies. As different cgroups may
* share the same effective css, this comparison is always
* necessary.
*/
l1 = &cset->cgrp_links;
l2 = &old_cset->cgrp_links;
while (1) {
struct cgrp_cset_link *link1, *link2;
struct cgroup *cgrp1, *cgrp2;
l1 = l1->next;
l2 = l2->next;
/* See if we reached the end - both lists are equal length. */
if (l1 == &cset->cgrp_links) {
BUG_ON(l2 != &old_cset->cgrp_links);
break;
} else {
BUG_ON(l2 == &old_cset->cgrp_links);
}
/* Locate the cgroups associated with these links. */
link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
cgrp1 = link1->cgrp;
cgrp2 = link2->cgrp;
/* Hierarchies should be linked in the same order. */
BUG_ON(cgrp1->root != cgrp2->root);
/*
* If this hierarchy is the hierarchy of the cgroup
* that's changing, then we need to check that this
* css_set points to the new cgroup; if it's any other
* hierarchy, then this css_set should point to the
* same cgroup as the old css_set.
*/
if (cgrp1->root == new_cgrp->root) {
if (cgrp1 != new_cgrp)
return false;
} else {
if (cgrp1 != cgrp2)
return false;
}
}
return true;
}
/**
* find_existing_css_set - init css array and find the matching css_set
* @old_cset: the css_set that we're using before the cgroup transition
* @cgrp: the cgroup that we're moving into
* @template: out param for the new set of csses, should be clear on entry
*/
static struct css_set *find_existing_css_set(struct css_set *old_cset,
struct cgroup *cgrp,
struct cgroup_subsys_state *template[])
{
struct cgroup_root *root = cgrp->root;
struct cgroup_subsys *ss;
struct css_set *cset;
unsigned long key;
int i;
/*
* Build the set of subsystem state objects that we want to see in the
* new css_set. While subsystems can change globally, the entries here
* won't change, so no need for locking.
*/
for_each_subsys(ss, i) {
if (root->subsys_mask & (1UL << i)) {
/*
* @ss is in this hierarchy, so we want the
* effective css from @cgrp.
*/
template[i] = cgroup_e_css_by_mask(cgrp, ss);
} else {
/*
* @ss is not in this hierarchy, so we don't want
* to change the css.
*/
template[i] = old_cset->subsys[i];
}
}
key = css_set_hash(template);
hash_for_each_possible(css_set_table, cset, hlist, key) {
if (!compare_css_sets(cset, old_cset, cgrp, template))
continue;
/* This css_set matches what we need */
return cset;
}
/* No existing cgroup group matched */
return NULL;
}
static void free_cgrp_cset_links(struct list_head *links_to_free)
{
struct cgrp_cset_link *link, *tmp_link;
list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
list_del(&link->cset_link);
kfree(link);
}
}
/**
* allocate_cgrp_cset_links - allocate cgrp_cset_links
* @count: the number of links to allocate
* @tmp_links: list_head the allocated links are put on
*
* Allocate @count cgrp_cset_link structures and chain them on @tmp_links
* through ->cset_link. Returns 0 on success or -errno.
*/
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
struct cgrp_cset_link *link;
int i;
INIT_LIST_HEAD(tmp_links);
for (i = 0; i < count; i++) {
link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
free_cgrp_cset_links(tmp_links);
return -ENOMEM;
}
list_add(&link->cset_link, tmp_links);
}
return 0;
}
/**
* link_css_set - a helper function to link a css_set to a cgroup
* @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
* @cset: the css_set to be linked
* @cgrp: the destination cgroup
*/
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
struct cgroup *cgrp)
{
struct cgrp_cset_link *link;
BUG_ON(list_empty(tmp_links));
if (cgroup_on_dfl(cgrp))
cset->dfl_cgrp = cgrp;
link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
link->cset = cset;
link->cgrp = cgrp;
/*
* Always add links to the tail of the lists so that the lists are
* in chronological order.
*/
list_move_tail(&link->cset_link, &cgrp->cset_links);
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
if (cgroup_parent(cgrp))
cgroup_get_live(cgrp);
}
/**
* find_css_set - return a new css_set with one cgroup updated
* @old_cset: the baseline css_set
* @cgrp: the cgroup to be updated
*
* Return a new css_set that's equivalent to @old_cset, but with @cgrp
* substituted into the appropriate hierarchy.
*/
static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup *cgrp)
{
struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
struct css_set *cset;
struct list_head tmp_links;
struct cgrp_cset_link *link;
struct cgroup_subsys *ss;
unsigned long key;
int ssid;
lockdep_assert_held(&cgroup_mutex);
/* First see if we already have a cgroup group that matches
* the desired set */
spin_lock_irq(&css_set_lock);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
spin_unlock_irq(&css_set_lock);
if (cset)
return cset;
cset = kzalloc(sizeof(*cset), GFP_KERNEL);
if (!cset)
return NULL;
/* Allocate all the cgrp_cset_link objects that we'll need */
if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
kfree(cset);
return NULL;
}
refcount_set(&cset->refcount, 1);
cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->dying_tasks);
INIT_LIST_HEAD(&cset->task_iters);
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->mg_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
memcpy(cset->subsys, template, sizeof(cset->subsys));
spin_lock_irq(&css_set_lock);
/* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == cgrp->root)
c = cgrp;
link_css_set(&tmp_links, cset, c);
}
BUG_ON(!list_empty(&tmp_links));
css_set_count++;
/* Add @cset to the hash table */
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cset->subsys[ssid];
list_add_tail(&cset->e_cset_node[ssid],
&css->cgroup->e_csets[ssid]);
css_get(css);
}
spin_unlock_irq(&css_set_lock);
/*
* If @cset should be threaded, look up the matching dom_cset and
* link them up. We first fully initialize @cset then look for the
* dom_cset. It's simpler this way and safe as @cset is guaranteed
* to stay empty until we return.
*/
if (cgroup_is_threaded(cset->dfl_cgrp)) {
struct css_set *dcset;
dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
if (!dcset) {
put_css_set(cset);
return NULL;
}
spin_lock_irq(&css_set_lock);
cset->dom_cset = dcset;
list_add_tail(&cset->threaded_csets_node,
&dcset->threaded_csets);
spin_unlock_irq(&css_set_lock);
}
return cset;
}
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
struct cgroup *root_cgrp = kf_root->kn->priv;
return root_cgrp->root;
}
static int cgroup_init_root_id(struct cgroup_root *root)
{
int id;
lockdep_assert_held(&cgroup_mutex);
id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
if (id < 0)
return id;
root->hierarchy_id = id;
return 0;
}
static void cgroup_exit_root_id(struct cgroup_root *root)
{
lockdep_assert_held(&cgroup_mutex);
idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}
void cgroup_free_root(struct cgroup_root *root)
{
kfree(root);
}
static void cgroup_destroy_root(struct cgroup_root *root)
{
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
trace_cgroup_destroy_root(root);
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
BUG_ON(atomic_read(&root->nr_cgrps));
BUG_ON(!list_empty(&cgrp->self.children));
/* Rebind all subsystems back to the default hierarchy */
WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
/*
* Release all the links from cset_links to this hierarchy's
* root cgroup
*/
spin_lock_irq(&css_set_lock);
list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
list_del(&link->cset_link);
list_del(&link->cgrp_link);
kfree(link);
}
spin_unlock_irq(&css_set_lock);
if (!list_empty(&root->root_list)) {
list_del(&root->root_list);
cgroup_root_count--;
}
cgroup_exit_root_id(root);
mutex_unlock(&cgroup_mutex);
cgroup_rstat_exit(cgrp);
kernfs_destroy_root(root->kf_root);
cgroup_free_root(root);
}
/*
* look up cgroup associated with current task's cgroup namespace on the
* specified hierarchy
*/
static struct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{
struct cgroup *res = NULL;
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
rcu_read_lock();
cset = current->nsproxy->cgroup_ns->root_cset;
if (cset == &init_css_set) {
res = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
res = c;
break;
}
}
}
rcu_read_unlock();
BUG_ON(!res);
return res;
}
/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
{
struct cgroup *res = NULL;
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
if (cset == &init_css_set) {
res = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
res = c;
break;
}
}
}
BUG_ON(!res);
return res;
}
/*
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex and css_set_lock held.
*/
struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroup_root *root)
{
/*
* No need to lock the task - since we hold css_set_lock the
* task can't change groups.
*/
return cset_cgroup_from_root(task_css_set(task), root);
}
/*
* A task must hold cgroup_mutex to modify cgroups.
*
* Any task can increment and decrement the count field without lock.
* So in general, code holding cgroup_mutex can't rely on the count
* field not changing. However, if the count goes to zero, then only
* cgroup_attach_task() can increment it again. Because a count of zero
* means that no tasks are currently attached, therefore there is no
* way a task attached to that cgroup can fork (the other way to
* increment the count). So code holding cgroup_mutex can safely
* assume that if the count is zero, it will stay zero. Similarly, if
* a task holds cgroup_mutex on a cgroup with zero count, it
* knows that the cgroup won't be removed, as cgroup_rmdir()
* needs that mutex.
*
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
* least one task in the system (init, pid == 1), therefore, root cgroup
* always has either children cgroups and/or using tasks. So we don't
* need a special hack to ensure that root cgroup cannot be deleted.
*
* P.S. One more locking exception. RCU is used to guard the
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
char *buf)
{
struct cgroup_subsys *ss = cft->ss;
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
!(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
} else {
strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
}
return buf;
}
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
*
* S_IRUGO for read, S_IWUSR for write.
*/
static umode_t cgroup_file_mode(const struct cftype *cft)
{
umode_t mode = 0;
if (cft->read_u64 || cft->read_s64 || cft->seq_show)
mode |= S_IRUGO;
if (cft->write_u64 || cft->write_s64 || cft->write) {
if (cft->flags & CFTYPE_WORLD_WRITABLE)
mode |= S_IWUGO;
else
mode |= S_IWUSR;
}
return mode;
}
/**
* cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
* @subtree_control: the new subtree_control mask to consider
* @this_ss_mask: available subsystems
*
* On the default hierarchy, a subsystem may request other subsystems to be
* enabled together through its ->depends_on mask. In such cases, more
* subsystems than specified in "cgroup.subtree_control" may be enabled.
*
* This function calculates which subsystems need to be enabled if
* @subtree_control is to be applied while restricted to @this_ss_mask.
*/
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
u16 cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
lockdep_assert_held(&cgroup_mutex);
cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
while (true) {
u16 new_ss_mask = cur_ss_mask;
do_each_subsys_mask(ss, ssid, cur_ss_mask) {
new_ss_mask |= ss->depends_on;
} while_each_subsys_mask();
/*
* Mask out subsystems which aren't available. This can
* happen only if some depended-upon subsystems were bound
* to non-default hierarchies.
*/
new_ss_mask &= this_ss_mask;
if (new_ss_mask == cur_ss_mask)
break;
cur_ss_mask = new_ss_mask;
}
return cur_ss_mask;
}
/**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
* This helper undoes cgroup_kn_lock_live() and should be invoked before
* the method finishes if locking succeeded. Note that once this function
* returns the cgroup returned by cgroup_kn_lock_live() may become
* inaccessible any time. If the caller intends to continue to access the
* cgroup, it should pin it before invoking this function.
*/
void cgroup_kn_unlock(struct kernfs_node *kn)
{
struct cgroup *cgrp;
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
cgrp = kn->parent->priv;
mutex_unlock(&cgroup_mutex);
kernfs_unbreak_active_protection(kn);
cgroup_put(cgrp);
}
/**
* cgroup_kn_lock_live - locking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
* @drain_offline: perform offline draining on the cgroup
*
* This helper is to be used by a cgroup kernfs method currently servicing
* @kn. It breaks the active protection, performs cgroup locking and
* verifies that the associated cgroup is alive. Returns the cgroup if
* alive; otherwise, %NULL. A successful return should be undone by a
* matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the
* cgroup is drained of offlining csses before return.
*
* Any cgroup kernfs method implementation which requires locking the
* associated cgroup should use this helper. It avoids nesting cgroup
* locking under kernfs active protection and allows all kernfs operations
* including self-removal.
*/
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
struct cgroup *cgrp;
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
cgrp = kn->parent->priv;
/*
* We're gonna grab cgroup_mutex which nests outside kernfs
* active_ref. cgroup liveliness check alone provides enough
* protection against removal. Ensure @cgrp stays accessible and
* break the active_ref protection.
*/
if (!cgroup_tryget(cgrp))
return NULL;
kernfs_break_active_protection(kn);
if (drain_offline)
cgroup_lock_and_drain_offline(cgrp);
else
mutex_lock(&cgroup_mutex);
if (!cgroup_is_dead(cgrp))
return cgrp;
cgroup_kn_unlock(kn);
return NULL;
}
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
char name[CGROUP_FILE_NAME_MAX];
lockdep_assert_held(&cgroup_mutex);
if (cft->file_offset) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
struct cgroup_file *cfile = (void *)css + cft->file_offset;
spin_lock_irq(&cgroup_file_kn_lock);
cfile->kn = NULL;
spin_unlock_irq(&cgroup_file_kn_lock);
del_timer_sync(&cfile->notify_timer);
}
kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}
/**
* css_clear_dir - remove subsys files in a cgroup directory
* @css: target css
*/
static void css_clear_dir(struct cgroup_subsys_state *css)
{
struct cgroup *cgrp = css->cgroup;
struct cftype *cfts;
if (!(css->flags & CSS_VISIBLE))
return;
css->flags &= ~CSS_VISIBLE;
if (!css->ss) {
if (cgroup_on_dfl(cgrp))
cfts = cgroup_base_files;
else
cfts = cgroup1_base_files;
cgroup_addrm_files(css, cgrp, cfts, false);
} else {
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
}
}
/**
* css_populate_dir - create subsys files in a cgroup directory
* @css: target css
*
* On failure, no file is added.
*/
static int css_populate_dir(struct cgroup_subsys_state *css)
{
struct cgroup *cgrp = css->cgroup;
struct cftype *cfts, *failed_cfts;
int ret;
if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
return 0;
if (!css->ss) {
if (cgroup_on_dfl(cgrp))
cfts = cgroup_base_files;
else
cfts = cgroup1_base_files;
ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
if (ret < 0)
return ret;
} else {
list_for_each_entry(cfts, &css->ss->cfts, node) {
ret = cgroup_addrm_files(css, cgrp, cfts, true);
if (ret < 0) {
failed_cfts = cfts;
goto err;
}
}
}
css->flags |= CSS_VISIBLE;
return 0;
err:
list_for_each_entry(cfts, &css->ss->cfts, node) {
if (cfts == failed_cfts)
break;
cgroup_addrm_files(css, cgrp, cfts, false);
}
return ret;
}
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
int ssid, i, ret;
u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
do_each_subsys_mask(ss, ssid, ss_mask) {
/*
* If @ss has non-root csses attached to it, can't move.
* If @ss is an implicit controller, it is exempt from this
* rule and can be stolen.
*/
if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
!ss->implicit_on_dfl)
return -EBUSY;
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
/*
* Collect ssid's that need to be disabled from default
* hierarchy.
*/
if (ss->root == &cgrp_dfl_root)
dfl_disable_ss_mask |= 1 << ssid;
} while_each_subsys_mask();
if (dfl_disable_ss_mask) {
struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
/*
* Controllers from default hierarchy that need to be rebound
* are all disabled together in one go.
*/
cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
WARN_ON(cgroup_apply_control(scgrp));
cgroup_finalize_control(scgrp, 0);
}
do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
struct css_set *cset;
WARN_ON(!css || cgroup_css(dcgrp, ss));
if (src_root != &cgrp_dfl_root) {
/* disable from the source */
src_root->subsys_mask &= ~(1 << ssid);
WARN_ON(cgroup_apply_control(scgrp));
cgroup_finalize_control(scgrp, 0);
}
/* rebind */
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
css->cgroup = dcgrp;
spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist)
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]);
spin_unlock_irq(&css_set_lock);
if (ss->css_rstat_flush) {
list_del_rcu(&css->rstat_css_node);
list_add_rcu(&css->rstat_css_node,
&dcgrp->rstat_css_list);
}
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
} else {
dcgrp->subtree_control |= 1 << ssid;
static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
}
ret = cgroup_apply_control(dcgrp);
if (ret)
pr_warn("partial failure to rebind %s controller (err=%d)\n",
ss->name, ret);
if (ss->bind)
ss->bind(css);
} while_each_subsys_mask();
kernfs_activate(dcgrp->kn);
return 0;
}
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
struct kernfs_root *kf_root)
{
int len = 0;
char *buf = NULL;
struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
struct cgroup *ns_cgroup;
buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
return -ENOMEM;
spin_lock_irq(&css_set_lock);
ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
spin_unlock_irq(&css_set_lock);
if (len >= PATH_MAX)
len = -ERANGE;
else if (len > 0) {
seq_escape(sf, buf, " \t\n\\");
len = 0;
}
kfree(buf);
return len;
}
enum cgroup2_param {
Opt_nsdelegate,
Opt_memory_localevents,
Opt_memory_recursiveprot,
nr__cgroup2_params
};
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
{}
};
static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct fs_parse_result result;
int opt;
opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
if (opt < 0)
return opt;
switch (opt) {
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
case Opt_memory_localevents:
ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
return 0;
case Opt_memory_recursiveprot:
ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
return 0;
}
return -EINVAL;
}
static void apply_cgroup_root_flags(unsigned int root_flags)
{
if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
if (root_flags & CGRP_ROOT_NS_DELEGATE)
cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
}
}
static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
seq_puts(seq, ",memory_localevents");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
seq_puts(seq, ",memory_recursiveprot");
return 0;
}
static int cgroup_reconfigure(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
apply_cgroup_root_flags(ctx->flags);
return 0;
}
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
int ssid;
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
INIT_LIST_HEAD(&cgrp->cset_links);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
cgrp->self.flags |= CSS_ONLINE;
cgrp->dom_cgrp = cgrp;
cgrp->max_descendants = INT_MAX;
cgrp->max_depth = INT_MAX;
INIT_LIST_HEAD(&cgrp->rstat_css_list);
prev_cputime_init(&cgrp->prev_cputime);
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
init_waitqueue_head(&cgrp->offline_waitq);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
void init_cgroup_root(struct cgroup_fs_context *ctx)
{
struct cgroup_root *root = ctx->root;
struct cgroup *cgrp = &root->cgrp;
INIT_LIST_HEAD(&root->root_list);
atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
root->flags = ctx->flags;
if (ctx->release_agent)
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
if (ctx->name)
strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
if (ctx->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
struct kernfs_syscall_ops *kf_sops;
struct css_set *cset;
int i, ret;
lockdep_assert_held(&cgroup_mutex);
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
0, GFP_KERNEL);
if (ret)
goto out;
/*
* We're accessing css_set_count without locking css_set_lock here,
* but that's OK - it can only be increased by someone holding
* cgroup_lock, and that's us. Later rebinding may disable
* controllers on the default hierarchy and thus create new csets,
* which can't be more than the existing ones. Allocate 2x.
*/
ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
if (ret)
goto cancel_ref;
ret = cgroup_init_root_id(root);
if (ret)
goto cancel_ref;
kf_sops = root == &cgrp_dfl_root ?
&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED |
KERNFS_ROOT_SUPPORT_EXPORTOP |
KERNFS_ROOT_SUPPORT_USER_XATTR,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
root_cgrp->kn = root->kf_root->kn;
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
ret = css_populate_dir(&root_cgrp->self);
if (ret)
goto destroy_root;
ret = cgroup_rstat_init(root_cgrp);
if (ret)
goto destroy_root;
ret = rebind_subsystems(root, ss_mask);
if (ret)
goto exit_stats;
ret = cgroup_bpf_inherit(root_cgrp);
WARN_ON_ONCE(ret);
trace_cgroup_setup_root(root);
/*
* There must be no failure case after here, since rebinding takes
* care of subsystems' refcounts, which are explicitly dropped in
* the failure exit path.
*/
list_add(&root->root_list, &cgroup_roots);
cgroup_root_count++;
/*
* Link the root cgroup in this hierarchy into all the css_set
* objects.
*/
spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist) {
link_css_set(&tmp_links, cset, root_cgrp);
if (css_set_populated(cset))
cgroup_update_populated(root_cgrp, true);
}
spin_unlock_irq(&css_set_lock);
BUG_ON(!list_empty(&root_cgrp->self.children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
ret = 0;
goto out;
exit_stats:
cgroup_rstat_exit(root_cgrp);
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
exit_root_id:
cgroup_exit_root_id(root);
cancel_ref:
percpu_ref_exit(&root_cgrp->self.refcnt);
out:
free_cgrp_cset_links(&tmp_links);
return ret;
}
int cgroup_do_get_tree(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
ctx->kfc.root = ctx->root->kf_root;
if (fc->fs_type == &cgroup2_fs_type)
ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
else
ctx->kfc.magic = CGROUP_SUPER_MAGIC;
ret = kernfs_get_tree(fc);
/*
* In non-init cgroup namespace, instead of root cgroup's dentry,
* we return the dentry corresponding to the cgroupns->root_cgrp.
*/
if (!ret && ctx->ns != &init_cgroup_ns) {
struct dentry *nsdentry;
struct super_block *sb = fc->root->d_sb;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(fc->root);
if (IS_ERR(nsdentry)) {
deactivate_locked_super(sb);
ret = PTR_ERR(nsdentry);
nsdentry = NULL;
}
fc->root = nsdentry;
}
if (!ctx->kfc.new_sb_created)
cgroup_put(&ctx->root->cgrp);
return ret;
}
/*
* Destroy a cgroup filesystem context.
*/
static void cgroup_fs_context_free(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
kfree(ctx->name);
kfree(ctx->release_agent);
put_cgroup_ns(ctx->ns);
kernfs_free_fs_context(fc);
kfree(ctx);
}
static int cgroup_get_tree(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
cgrp_dfl_visible = true;
cgroup_get_live(&cgrp_dfl_root.cgrp);
ctx->root = &cgrp_dfl_root;
ret = cgroup_do_get_tree(fc);
if (!ret)
apply_cgroup_root_flags(ctx->flags);
return ret;
}
static const struct fs_context_operations cgroup_fs_context_ops = {
.free = cgroup_fs_context_free,
.parse_param = cgroup2_parse_param,
.get_tree = cgroup_get_tree,
.reconfigure = cgroup_reconfigure,
};
static const struct fs_context_operations cgroup1_fs_context_ops = {
.free = cgroup_fs_context_free,
.parse_param = cgroup1_parse_param,
.get_tree = cgroup1_get_tree,
.reconfigure = cgroup1_reconfigure,
};
/*
* Initialise the cgroup filesystem creation/reconfiguration context. Notably,
* we select the namespace we're going to use.
*/
static int cgroup_init_fs_context(struct fs_context *fc)
{
struct cgroup_fs_context *ctx;
ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
ctx->ns = current->nsproxy->cgroup_ns;
get_cgroup_ns(ctx->ns);
fc->fs_private = &ctx->kfc;
if (fc->fs_type == &cgroup2_fs_type)
fc->ops = &cgroup_fs_context_ops;
else
fc->ops = &cgroup1_fs_context_ops;
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
return 0;
}
static void cgroup_kill_sb(struct super_block *sb)
{
struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
/*
* If @root doesn't have any children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
*
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
!percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
cgroup_bpf_offline(&root->cgrp);
percpu_ref_kill(&root->cgrp.self.refcnt);
}
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.init_fs_context = cgroup_init_fs_context,
.parameters = cgroup1_fs_parameters,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
static struct file_system_type cgroup2_fs_type = {
.name = "cgroup2",
.init_fs_context = cgroup_init_fs_context,
.parameters = cgroup2_fs_parameters,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
#ifdef CONFIG_CPUSETS
static const struct fs_context_operations cpuset_fs_context_ops = {
.get_tree = cgroup1_get_tree,
.free = cgroup_fs_context_free,
};
/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
*/
static int cpuset_init_fs_context(struct fs_context *fc)
{
char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
struct cgroup_fs_context *ctx;
int err;
err = cgroup_init_fs_context(fc);
if (err) {
kfree(agent);
return err;
}
fc->ops = &cpuset_fs_context_ops;
ctx = cgroup_fc2context(fc);
ctx->subsys_mask = 1 << cpuset_cgrp_id;
ctx->flags |= CGRP_ROOT_NOPREFIX;
ctx->release_agent = agent;
get_filesystem(&cgroup_fs_type);
put_filesystem(fc->fs_type);
fc->fs_type = &cgroup_fs_type;
return 0;
}
static struct file_system_type cpuset_fs_type = {
.name = "cpuset",
.init_fs_context = cpuset_init_fs_context,
.fs_flags = FS_USERNS_MOUNT,
};
#endif
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns)
{
struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
}
int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns)
{
int ret;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
* Determine @task's cgroup on the first (the one with the lowest non-zero
* hierarchy_id) cgroup hierarchy and copy its path into @buf. This
* function grabs cgroup_mutex and shouldn't be used inside locks used by
* cgroup controller callbacks.
*
* Return value is the same as kernfs_path().
*/
int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
struct cgroup_root *root;
struct cgroup *cgrp;
int hierarchy_id = 1;
int ret;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
if (root) {
cgrp = task_cgroup_from_root(task, root);
ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
ret = strlcpy(buf, "/", buflen);
}
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
/**
* cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
* @mgctx: target migration context
*
* Add @task, which is a migration target, to @mgctx->tset. This function
* becomes noop if @task doesn't need to be migrated. @task's css_set
* should have been added as a migration source and @task->cg_list will be
* moved from the css_set's tasks list to mg_tasks one.
*/
static void cgroup_migrate_add_task(struct task_struct *task,
struct cgroup_mgctx *mgctx)
{
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
/* @task either already exited or can't exit until the end */
if (task->flags & PF_EXITING)
return;
/* cgroup_threadgroup_rwsem protects racing against forks */
WARN_ON_ONCE(list_empty(&task->cg_list));
cset = task_css_set(task);
if (!cset->mg_src_cgrp)
return;
mgctx->tset.nr_tasks++;
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
list_add_tail(&cset->mg_node,
&mgctx->tset.src_csets);
if (list_empty(&cset->mg_dst_cset->mg_node))
list_add_tail(&cset->mg_dst_cset->mg_node,
&mgctx->tset.dst_csets);
}
/**
* cgroup_taskset_first - reset taskset and return the first task
* @tset: taskset of interest
* @dst_cssp: output variable for the destination css
*
* @tset iteration is initialized and the first task is returned.
*/
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
struct cgroup_subsys_state **dst_cssp)
{
tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
tset->cur_task = NULL;
return cgroup_taskset_next(tset, dst_cssp);
}
/**
* cgroup_taskset_next - iterate to the next task in taskset
* @tset: taskset of interest
* @dst_cssp: output variable for the destination css
*
* Return the next task in @tset. Iteration must have been initialized
* with cgroup_taskset_first().
*/
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
struct cgroup_subsys_state **dst_cssp)
{
struct css_set *cset = tset->cur_cset;
struct task_struct *task = tset->cur_task;
while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
if (!task)
task = list_first_entry(&cset->mg_tasks,
struct task_struct, cg_list);
else
task = list_next_entry(task, cg_list);
if (&task->cg_list != &cset->mg_tasks) {
tset->cur_cset = cset;
tset->cur_task = task;
/*
* This function may be called both before and
* after cgroup_taskset_migrate(). The two cases
* can be distinguished by looking at whether @cset
* has its ->mg_dst_cset set.
*/
if (cset->mg_dst_cset)
*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
else
*dst_cssp = cset->subsys[tset->ssid];
return task;
}
cset = list_next_entry(cset, mg_node);
task = NULL;
}
return NULL;
}
/**
* cgroup_migrate_execute - migrate a taskset
* @mgctx: migration context
*
* Migrate tasks in @mgctx as setup by migration preparation functions.
* This function fails iff one of the ->can_attach callbacks fails and
* guarantees that either all or none of the tasks in @mgctx are migrated.
* @mgctx is consumed regardless of success.
*/
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
struct cgroup_taskset *tset = &mgctx->tset;
struct cgroup_subsys *ss;
struct task_struct *task, *tmp_task;
struct css_set *cset, *tmp_cset;
int ssid, failed_ssid, ret;
/* check that we can legitimately attach to the cgroup */
if (tset->nr_tasks) {
do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ss->can_attach) {
tset->ssid = ssid;
ret = ss->can_attach(tset);
if (ret) {
failed_ssid = ssid;
goto out_cancel_attach;
}
}
} while_each_subsys_mask();
}
/*
* Now that we're guaranteed success, proceed to move all tasks to
* the new cgroup. There are no failure cases after here, so this
* is the commit point.
*/
spin_lock_irq(&css_set_lock);
list_for_each_entry(cset, &tset->src_csets, mg_node) {
list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
struct css_set *from_cset = task_css_set(task);
struct css_set *to_cset = cset->mg_dst_cset;
get_css_set(to_cset);
to_cset->nr_tasks++;
css_set_move_task(task, from_cset, to_cset, true);
from_cset->nr_tasks--;
/*
* If the source or destination cgroup is frozen,
* the task might require to change its state.
*/
cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
to_cset->dfl_cgrp);
put_css_set_locked(from_cset);
}
}
spin_unlock_irq(&css_set_lock);
/*
* Migration is committed, all target tasks are now on dst_csets.
* Nothing is sensitive to fork() after this point. Notify
* controllers that migration is complete.
*/
tset->csets = &tset->dst_csets;
if (tset->nr_tasks) {
do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ss->attach) {
tset->ssid = ssid;
ss->attach(tset);
}
} while_each_subsys_mask();
}
ret = 0;
goto out_release_tset;
out_cancel_attach:
if (tset->nr_tasks) {
do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ssid == failed_ssid)
break;
if (ss->cancel_attach) {
tset->ssid = ssid;
ss->cancel_attach(tset);
}
} while_each_subsys_mask();
}
out_release_tset:
spin_lock_irq(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
list_del_init(&cset->mg_node);
}
spin_unlock_irq(&css_set_lock);
/*
* Re-initialize the cgroup_taskset structure in case it is reused
* again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
* iteration.
*/
tset->nr_tasks = 0;
tset->csets = &tset->src_csets;
return ret;
}
/**
* cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
* @dst_cgrp: destination cgroup to test
*
* On the default hierarchy, except for the mixable, (possible) thread root
* and threaded cgroups, subtree_control must be zero for migration
* destination cgroups with tasks so that child cgroups don't compete
* against tasks.
*/
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
/* v1 doesn't have any restriction */
if (!cgroup_on_dfl(dst_cgrp))
return 0;
/* verify @dst_cgrp can host resources */
if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
return -EOPNOTSUPP;
/* mixables don't care */
if (cgroup_is_mixable(dst_cgrp))
return 0;
/*
* If @dst_cgrp is already or can become a thread root or is
* threaded, it doesn't matter.
*/
if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
return 0;
/* apply no-internal-process constraint */
if (dst_cgrp->subtree_control)
return -EBUSY;
return 0;
}
/**
* cgroup_migrate_finish - cleanup after attach
* @mgctx: migration context
*
* Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
* those functions for details.
*/
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
LIST_HEAD(preloaded);
struct css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset);
}
spin_unlock_irq(&css_set_lock);
}
/**
* cgroup_migrate_add_src - add a migration source css_set
* @src_cset: the source css_set to add
* @dst_cgrp: the destination cgroup
* @mgctx: migration context
*
* Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
* @src_cset and add it to @mgctx->src_csets, which should later be cleaned
* up by cgroup_migrate_finish().
*
* This function may be called without holding cgroup_threadgroup_rwsem
* even if the target is a process. Threads may be created and destroyed
* but as long as cgroup_mutex is not dropped, no new css_set can be put
* into play and the preloaded css_sets are guaranteed to cover all
* migrations.
*/
void cgroup_migrate_add_src(struct css_set *src_cset,
struct cgroup *dst_cgrp,
struct cgroup_mgctx *mgctx)
{
struct cgroup *src_cgrp;
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
/*
* If ->dead, @src_set is associated with one or more dead cgroups
* and doesn't contain any migratable tasks. Ignore it early so
* that the rest of migration path doesn't get confused by it.
*/
if (src_cset->dead)
return;
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
if (!list_empty(&src_cset->mg_preload_node))
return;
WARN_ON(src_cset->mg_src_cgrp);
WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
WARN_ON(!list_empty(&src_cset->mg_node));
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
}
/**
* cgroup_migrate_prepare_dst - prepare destination css_sets for migration
* @mgctx: migration context
*
* Tasks are about to be moved and all the source css_sets have been
* preloaded to @mgctx->preloaded_src_csets. This function looks up and
* pins all destination css_sets, links each to its source, and append them
* to @mgctx->preloaded_dst_csets.
*
* This function must be called after cgroup_migrate_add_src() has been
* called on each migration source css_set. After migration is performed
* using cgroup_migrate(), cgroup_migrate_finish() must be called on
* @mgctx.
*/
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
struct css_set *src_cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
/* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
mg_preload_node) {
struct css_set *dst_cset;
struct cgroup_subsys *ss;
int ssid;
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
if (!dst_cset)
return -ENOMEM;
WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
/*
* If src cset equals dst, it's noop. Drop the src.
* cgroup_migrate() will skip the cset too. Note that we
* can't handle src == dst as some nodes are used by both.
*/
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
src_cset->mg_dst_cgrp = NULL;
list_del_init(&src_cset->mg_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset);
continue;
}
src_cset->mg_dst_cset = dst_cset;
if (list_empty(&dst_cset->mg_preload_node))
list_add_tail(&dst_cset->mg_preload_node,
&mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
for_each_subsys(ss, ssid)
if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
mgctx->ss_mask |= 1 << ssid;
}
return 0;
}
/**
* cgroup_migrate - migrate a process or task to a cgroup
* @leader: the leader of the process or the task to migrate
* @threadgroup: whether @leader points to the whole process or a single task
* @mgctx: migration context
*
* Migrate a process or task denoted by @leader. If migrating a process,
* the caller must be holding cgroup_threadgroup_rwsem. The caller is also
* responsible for invoking cgroup_migrate_add_src() and
* cgroup_migrate_prepare_dst() on the targets before invoking this
* function and following up with cgroup_migrate_finish().
*
* As long as a controller's ->can_attach() doesn't fail, this function is
* guaranteed to succeed. This means that, excluding ->can_attach()
* failure, when migrating multiple targets, the success or failure can be
* decided for all targets by invoking group_migrate_prepare_dst() before
* actually starting migrating.
*/
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
struct cgroup_mgctx *mgctx)
{
struct task_struct *task;
/*
* Prevent freeing of tasks while we take a snapshot. Tasks that are
* already PF_EXITING could be freed from underneath us unless we
* take an rcu_read_lock.
*/
spin_lock_irq(&css_set_lock);
rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_task(task, mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
return cgroup_migrate_execute(mgctx);
}
/**
* cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
* @dst_cgrp: the cgroup to attach to
* @leader: the task or the leader of the threadgroup to be attached
* @threadgroup: attach the whole threadgroup?
*
* Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
*/
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup)
{
DEFINE_CGROUP_MGCTX(mgctx);
struct task_struct *task;
int ret = 0;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (!ret)
ret = cgroup_migrate(leader, threadgroup, &mgctx);
cgroup_migrate_finish(&mgctx);
if (!ret)
TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
return ret;
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
__acquires(&cgroup_threadgroup_rwsem)
{
struct task_struct *tsk;
pid_t pid;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL);
/*
* If we migrate a single thread, we don't care about threadgroup
* stability. If the thread is `current`, it won't exit(2) under our
* hands or change PID through exec(2). We exclude
* cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
* callers by cgroup_mutex.
* Therefore, we can skip the global lock.
*/
lockdep_assert_held(&cgroup_mutex);
if (pid || threadgroup) {
percpu_down_write(&cgroup_threadgroup_rwsem);
*locked = true;
} else {
*locked = false;
}
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
tsk = ERR_PTR(-ESRCH);
goto out_unlock_threadgroup;
}
} else {
tsk = current;
}
if (threadgroup)
tsk = tsk->group_leader;
/*
* kthreads may acquire PF_NO_SETAFFINITY during initialization.
* If userland migrates such a kthread to a non-root cgroup, it can
* become trapped in a cpuset, or RT kthread may be born in a
* cgroup with no rt_runtime allocated. Just say no.
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
tsk = ERR_PTR(-EINVAL);
goto out_unlock_threadgroup;
}
get_task_struct(tsk);
goto out_unlock_rcu;
out_unlock_threadgroup:
if (*locked) {
percpu_up_write(&cgroup_threadgroup_rwsem);
*locked = false;
}
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
void cgroup_procs_write_finish(struct task_struct *task, bool locked)
__releases(&cgroup_threadgroup_rwsem)
{
struct cgroup_subsys *ss;
int ssid;
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
if (locked)
percpu_up_write(&cgroup_threadgroup_rwsem);
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
struct cgroup_subsys *ss;
bool printed = false;
int ssid;
do_each_subsys_mask(ss, ssid, ss_mask) {
if (printed)
seq_putc(seq, ' ');
seq_puts(seq, ss->name);
printed = true;
} while_each_subsys_mask();
if (printed)
seq_putc(seq, '\n');
}
/* show controllers which are enabled from the parent */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
cgroup_print_ss_mask(seq, cgroup_control(cgrp));
return 0;
}
/* show controllers which are enabled for a given cgroup's children */
static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
cgroup_print_ss_mask(seq, cgrp->subtree_control);
return 0;
}
/**
* cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
* @cgrp: root of the subtree to update csses for
*
* @cgrp's control masks have changed and its subtree's css associations
* need to be updated accordingly. This function looks up all css_sets
* which are attached to the subtree, creates the matching updated css_sets
* and migrates the tasks to the new ones.
*/
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
DEFINE_CGROUP_MGCTX(mgctx);
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
int ret;
lockdep_assert_held(&cgroup_mutex);
percpu_down_write(&cgroup_threadgroup_rwsem);
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
list_for_each_entry(link, &dsct->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
struct task_struct *task, *ntask;
/* all tasks in src_csets need to be migrated */
list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
cgroup_migrate_add_task(task, &mgctx);
}
spin_unlock_irq(&css_set_lock);
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
/**
* cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
* @cgrp: root of the target subtree
*
* Because css offlining is asynchronous, userland may try to re-enable a
* controller while the previous css is still around. This function grabs
* cgroup_mutex and drains the previous css instances of @cgrp's subtree.
*/
void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
__acquires(&cgroup_mutex)
{
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
struct cgroup_subsys *ss;
int ssid;
restart:
mutex_lock(&cgroup_mutex);
cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
DEFINE_WAIT(wait);
if (!css || !percpu_ref_is_dying(&css->refcnt))
continue;
cgroup_get_live(dsct);
prepare_to_wait(&dsct->offline_waitq, &wait,
TASK_UNINTERRUPTIBLE);
mutex_unlock(&cgroup_mutex);
schedule();
finish_wait(&dsct->offline_waitq, &wait);
cgroup_put(dsct);
goto restart;
}
}
}
/**
* cgroup_save_control - save control masks and dom_cgrp of a subtree
* @cgrp: root of the target subtree
*
* Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the
* respective old_ prefixed fields for @cgrp's subtree including @cgrp
* itself.
*/
static void cgroup_save_control(struct cgroup *cgrp)
{
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
dsct->old_subtree_control = dsct->subtree_control;
dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
dsct->old_dom_cgrp = dsct->dom_cgrp;
}
}
/**
* cgroup_propagate_control - refresh control masks of a subtree
* @cgrp: root of the target subtree
*
* For @cgrp and its subtree, ensure ->subtree_ss_mask matches
* ->subtree_control and propagate controller availability through the
* subtree so that descendants don't have unavailable controllers enabled.
*/
static void cgroup_propagate_control(struct cgroup *cgrp)
{
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
dsct->subtree_control &= cgroup_control(dsct);
dsct->subtree_ss_mask =
cgroup_calc_subtree_ss_mask(dsct->subtree_control,
cgroup_ss_mask(dsct));
}
}
/**
* cgroup_restore_control - restore control masks and dom_cgrp of a subtree
* @cgrp: root of the target subtree
*
* Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the
* respective old_ prefixed fields for @cgrp's subtree including @cgrp
* itself.
*/
static void cgroup_restore_control(struct cgroup *cgrp)
{
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
dsct->subtree_control = dsct->old_subtree_control;
dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
dsct->dom_cgrp = dsct->old_dom_cgrp;
}
}
static bool css_visible(struct cgroup_subsys_state *css)
{
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
if (cgroup_control(cgrp) & (1 << ss->id))
return true;
if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
return false;
return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
}
/**
* cgroup_apply_control_enable - enable or show csses according to control
* @cgrp: root of the target subtree
*
* Walk @cgrp's subtree and create new csses or make the existing ones
* visible. A css is created invisible if it's being implicitly enabled
* through dependency. An invisible css is made visible when the userland
* explicitly enables it.
*
* Returns 0 on success, -errno on failure. On failure, csses which have
* been processed already aren't cleaned up. The caller is responsible for
* cleaning up with cgroup_apply_control_disable().
*/
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
struct cgroup_subsys *ss;
int ssid, ret;
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
continue;
if (!css) {
css = css_create(dsct, ss);
if (IS_ERR(css))
return PTR_ERR(css);
}
WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
if (css_visible(css)) {
ret = css_populate_dir(css);
if (ret)
return ret;
}
}
}
return 0;
}
/**
* cgroup_apply_control_disable - kill or hide csses according to control
* @cgrp: root of the target subtree
*
* Walk @cgrp's subtree and kill and hide csses so that they match
* cgroup_ss_mask() and cgroup_visible_mask().
*
* A css is hidden when the userland requests it to be disabled while other
* subsystems are still depending on it. The css must not actively control
* resources and be in the vanilla state if it's made visible again later.
* Controllers which may be depended upon should provide ->css_reset() for
* this purpose.
*/
static void cgroup_apply_control_disable(struct cgroup *cgrp)
{
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
struct cgroup_subsys *ss;
int ssid;
cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
if (!css)
continue;
WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
if (css->parent &&
!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
kill_css(css);
} else if (!css_visible(css)) {
css_clear_dir(css);
if (ss->css_reset)
ss->css_reset(css);
}
}
}
}
/**
* cgroup_apply_control - apply control mask updates to the subtree
* @cgrp: root of the target subtree
*
* subsystems can be enabled and disabled in a subtree using the following
* steps.
*
* 1. Call cgroup_save_control() to stash the current state.
* 2. Update ->subtree_control masks in the subtree as desired.
* 3. Call cgroup_apply_control() to apply the changes.
* 4. Optionally perform other related operations.
* 5. Call cgroup_finalize_control() to finish up.
*
* This function implements step 3 and propagates the mask changes
* throughout @cgrp's subtree, updates csses accordingly and perform
* process migrations.
*/
static int cgroup_apply_control(struct cgroup *cgrp)
{
int ret;
cgroup_propagate_control(cgrp);
ret = cgroup_apply_control_enable(cgrp);
if (ret)
return ret;
/*
* At this point, cgroup_e_css_by_mask() results reflect the new csses
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
ret = cgroup_update_dfl_csses(cgrp);
if (ret)
return ret;
return 0;
}
/**
* cgroup_finalize_control - finalize control mask update
* @cgrp: root of the target subtree
* @ret: the result of the update
*
* Finalize control mask update. See cgroup_apply_control() for more info.
*/
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
if (ret) {
cgroup_restore_control(cgrp);
cgroup_propagate_control(cgrp);
}
cgroup_apply_control_disable(cgrp);
}
static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
{
u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
/* if nothing is getting enabled, nothing to worry about */
if (!enable)
return 0;
/* can @cgrp host any resources? */
if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
return -EOPNOTSUPP;
/* mixables don't care */
if (cgroup_is_mixable(cgrp))
return 0;
if (domain_enable) {
/* can't enable domain controllers inside a thread subtree */
if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
return -EOPNOTSUPP;
} else {
/*
* Threaded controllers can handle internal competitions
* and are always allowed inside a (prospective) thread
* subtree.
*/
if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
return 0;
}
/*
* Controllers can't be enabled for a cgroup with tasks to avoid
* child cgroups competing against tasks.
*/
if (cgroup_has_tasks(cgrp))
return -EBUSY;
return 0;
}
/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
u16 enable = 0, disable = 0;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
int ssid, ret;
/*
* Parse input - space separated list of subsystem names prefixed
* with either + or -.
*/
buf = strstrip(buf);
while ((tok = strsep(&buf, " "))) {
if (tok[0] == '\0')
continue;
do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
if (!cgroup_ssid_enabled(ssid) ||
strcmp(tok + 1, ss->name))
continue;
if (*tok == '+') {
enable |= 1 << ssid;
disable &= ~(1 << ssid);
} else if (*tok == '-') {
disable |= 1 << ssid;
enable &= ~(1 << ssid);
} else {
return -EINVAL;
}
break;
} while_each_subsys_mask();
if (ssid == CGROUP_SUBSYS_COUNT)
return -EINVAL;
}
cgrp = cgroup_kn_lock_live(of->kn, true);
if (!cgrp)
return -ENODEV;
for_each_subsys(ss, ssid) {
if (enable & (1 << ssid)) {
if (cgrp->subtree_control & (1 << ssid)) {
enable &= ~(1 << ssid);
continue;
}
if (!(cgroup_control(cgrp) & (1 << ssid))) {
ret = -ENOENT;
goto out_unlock;
}
} else if (disable & (1 << ssid)) {
if (!(cgrp->subtree_control & (1 << ssid))) {
disable &= ~(1 << ssid);
continue;
}
/* a child has it enabled? */
cgroup_for_each_live_child(child, cgrp) {
if (child->subtree_control & (1 << ssid)) {
ret = -EBUSY;
goto out_unlock;
}
}
}
}
if (!enable && !disable) {
ret = 0;
goto out_unlock;
}
ret = cgroup_vet_subtree_control_enable(cgrp, enable);
if (ret)
goto out_unlock;
/* save and update control masks and prepare csses */
cgroup_save_control(cgrp);
cgrp->subtree_control |= enable;
cgrp->subtree_control &= ~disable;
ret = cgroup_apply_control(cgrp);
cgroup_finalize_control(cgrp, ret);
if (ret)
goto out_unlock;
kernfs_activate(cgrp->kn);
out_unlock:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
/**
* cgroup_enable_threaded - make @cgrp threaded
* @cgrp: the target cgroup
*
* Called when "threaded" is written to the cgroup.type interface file and
* tries to make @cgrp threaded and join the parent's resource domain.
* This function is never called on the root cgroup as cgroup.type doesn't
* exist on it.
*/
static int cgroup_enable_threaded(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup *dom_cgrp = parent->dom_cgrp;
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
int ret;
lockdep_assert_held(&cgroup_mutex);
/* noop if already threaded */
if (cgroup_is_threaded(cgrp))
return 0;
/*
* If @cgroup is populated or has domain controllers enabled, it
* can't be switched. While the below cgroup_can_be_thread_root()
* test can catch the same conditions, that's only when @parent is
* not mixable, so let's check it explicitly.
*/
if (cgroup_is_populated(cgrp) ||
cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
return -EOPNOTSUPP;
/* we're joining the parent's domain, ensure its validity */
if (!cgroup_is_valid_domain(dom_cgrp) ||
!cgroup_can_be_thread_root(dom_cgrp))
return -EOPNOTSUPP;
/*
* The following shouldn't cause actual migrations and should
* always succeed.
*/
cgroup_save_control(cgrp);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
if (dsct == cgrp || cgroup_is_threaded(dsct))
dsct->dom_cgrp = dom_cgrp;
ret = cgroup_apply_control(cgrp);
if (!ret)
parent->nr_threaded_children++;
cgroup_finalize_control(cgrp, ret);
return ret;
}
static int cgroup_type_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
if (cgroup_is_threaded(cgrp))
seq_puts(seq, "threaded\n");
else if (!cgroup_is_valid_domain(cgrp))
seq_puts(seq, "domain invalid\n");
else if (cgroup_is_thread_root(cgrp))
seq_puts(seq, "domain threaded\n");
else
seq_puts(seq, "domain\n");
return 0;
}
static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
int ret;
/* only switching to threaded mode is supported */
if (strcmp(strstrip(buf), "threaded"))
return -EINVAL;
/* drain dying csses before we re-apply (threaded) subtree control */
cgrp = cgroup_kn_lock_live(of->kn, true);
if (!cgrp)
return -ENOENT;
/* threaded can only be enabled */
ret = cgroup_enable_threaded(cgrp);
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
int descendants = READ_ONCE(cgrp->max_descendants);
if (descendants == INT_MAX)
seq_puts(seq, "max\n");
else
seq_printf(seq, "%d\n", descendants);
return 0;
}
static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
int descendants;
ssize_t ret;
buf = strstrip(buf);
if (!strcmp(buf, "max")) {
descendants = INT_MAX;
} else {
ret = kstrtoint(buf, 0, &descendants);
if (ret)
return ret;
}
if (descendants < 0)
return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENOENT;
cgrp->max_descendants = descendants;
cgroup_kn_unlock(of->kn);
return nbytes;
}
static int cgroup_max_depth_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
int depth = READ_ONCE(cgrp->max_depth);
if (depth == INT_MAX)
seq_puts(seq, "max\n");
else
seq_printf(seq, "%d\n", depth);
return 0;
}
static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
ssize_t ret;
int depth;
buf = strstrip(buf);
if (!strcmp(buf, "max")) {
depth = INT_MAX;
} else {
ret = kstrtoint(buf, 0, &depth);
if (ret)
return ret;
}
if (depth < 0)
return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENOENT;
cgrp->max_depth = depth;
cgroup_kn_unlock(of->kn);
return nbytes;
}
static int cgroup_events_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
return 0;
}
static int cgroup_stat_show(struct seq_file *seq, void *v)
{
struct cgroup *cgroup = seq_css(seq)->cgroup;
seq_printf(seq, "nr_descendants %d\n",
cgroup->nr_descendants);
seq_printf(seq, "nr_dying_descendants %d\n",
cgroup->nr_dying_descendants);
return 0;
}
static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
struct cgroup *cgrp, int ssid)
{
struct cgroup_subsys *ss = cgroup_subsys[ssid];
struct cgroup_subsys_state *css;
int ret;
if (!ss->css_extra_stat_show)
return 0;
css = cgroup_tryget_css(cgrp, ss);
if (!css)
return 0;
ret = ss->css_extra_stat_show(seq, css);
css_put(css);
return ret;
}
static int cpu_stat_show(struct seq_file *seq, void *v)
{
struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
int ret = 0;
cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
#endif
return ret;
}
#ifdef CONFIG_PSI
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_CPU);
}
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
/* Allow only one trigger per file descriptor */
if (ctx->psi.trigger) {
cgroup_put(cgrp);
return -EBUSY;
}
psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
new = psi_trigger_create(psi, buf, nbytes, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
}
smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
return nbytes;
}
static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
struct cgroup_file_ctx *ctx = of->priv;
return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
struct cgroup_file_ctx *ctx = of->priv;
psi_trigger_destroy(ctx->psi.trigger);
}
bool cgroup_psi_enabled(void)
{
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}
#else /* CONFIG_PSI */
bool cgroup_psi_enabled(void)
{
return false;
}
#endif /* CONFIG_PSI */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
seq_printf(seq, "%d\n", cgrp->freezer.freeze);
return 0;
}
static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
ssize_t ret;
int freeze;
ret = kstrtoint(strstrip(buf), 0, &freeze);
if (ret)
return ret;
if (freeze < 0 || freeze > 1)
return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENOENT;
cgroup_freeze(cgrp, freeze);
cgroup_kn_unlock(of->kn);
return nbytes;
}
static void __cgroup_kill(struct cgroup *cgrp)
{
struct css_task_iter it;
struct task_struct *task;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
set_bit(CGRP_KILL, &cgrp->flags);
spin_unlock_irq(&css_set_lock);
css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
while ((task = css_task_iter_next(&it))) {
/* Ignore kernel threads here. */
if (task->flags & PF_KTHREAD)
continue;
/* Skip tasks that are already dying. */
if (__fatal_signal_pending(task))
continue;
send_sig(SIGKILL, task, 0);
}
css_task_iter_end(&it);
spin_lock_irq(&css_set_lock);
clear_bit(CGRP_KILL, &cgrp->flags);
spin_unlock_irq(&css_set_lock);
}
static void cgroup_kill(struct cgroup *cgrp)
{
struct cgroup_subsys_state *css;
struct cgroup *dsct;
lockdep_assert_held(&cgroup_mutex);
cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
__cgroup_kill(dsct);
}
static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
ssize_t ret = 0;
int kill;
struct cgroup *cgrp;
ret = kstrtoint(strstrip(buf), 0, &kill);
if (ret)
return ret;
if (kill != 1)
return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENOENT;
/*
* Killing is a process directed operation, i.e. the whole thread-group
* is taken down so act like we do for cgroup.procs and only make this
* writable in non-threaded cgroups.
*/
if (cgroup_is_threaded(cgrp))
ret = -EOPNOTSUPP;
else
cgroup_kill(cgrp);
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
struct cgroup_file_ctx *ctx;
int ret;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
ctx->ns = current->nsproxy->cgroup_ns;
get_cgroup_ns(ctx->ns);
of->priv = ctx;
if (!cft->open)
return 0;
ret = cft->open(of);
if (ret) {
put_cgroup_ns(ctx->ns);
kfree(ctx);
}
return ret;
}
static void cgroup_file_release(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
struct cgroup_file_ctx *ctx = of->priv;
if (cft->release)
cft->release(of);
put_cgroup_ns(ctx->ns);
kfree(ctx);
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
int ret;
if (!nbytes)
return 0;
/*
* If namespaces are delegation boundaries, disallow writes to
* files in an non-init namespace root from inside the namespace
* except for the files explicitly marked delegatable -
* cgroup.procs and cgroup.subtree_control.
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
return -EPERM;
if (cft->write)
return cft->write(of, buf, nbytes, off);
/*
* kernfs guarantees that a file isn't deleted with operations in
* flight, which means that the matching css is and stays alive and
* doesn't need to be pinned. The RCU locking is not necessary
* either. It's just for the convenience of using cgroup_css().
*/
rcu_read_lock();
css = cgroup_css(cgrp, cft->ss);
rcu_read_unlock();
if (cft->write_u64) {
unsigned long long v;
ret = kstrtoull(buf, 0, &v);
if (!ret)
ret = cft->write_u64(css, cft, v);
} else if (cft->write_s64) {
long long v;
ret = kstrtoll(buf, 0, &v);
if (!ret)
ret = cft->write_s64(css, cft, v);
} else {
ret = -EINVAL;
}
return ret ?: nbytes;
}
static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
{
struct cftype *cft = of_cft(of);
if (cft->poll)
return cft->poll(of, pt);
return kernfs_generic_poll(of, pt);
}
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
return seq_cft(seq)->seq_start(seq, ppos);
}
static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
return seq_cft(seq)->seq_next(seq, v, ppos);
}
static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
if (seq_cft(seq)->seq_stop)
seq_cft(seq)->seq_stop(seq, v);
}
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
struct cftype *cft = seq_cft(m);
struct cgroup_subsys_state *css = seq_css(m);
if (cft->seq_show)
return cft->seq_show(m, arg);
if (cft->read_u64)
seq_printf(m, "%llu\n", cft->read_u64(css, cft));
else if (cft->read_s64)
seq_printf(m, "%lld\n", cft->read_s64(css, cft));
else
return -EINVAL;
return 0;
}
static struct kernfs_ops cgroup_kf_single_ops = {
.atomic_write_len = PAGE_SIZE,
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
.poll = cgroup_file_poll,
.seq_show = cgroup_seqfile_show,
};
static struct kernfs_ops cgroup_kf_ops = {
.atomic_write_len = PAGE_SIZE,
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
.poll = cgroup_file_poll,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
.seq_stop = cgroup_seqfile_stop,
.seq_show = cgroup_seqfile_show,
};
/* set uid and gid of cgroup dirs and files to that of the creator */
static int cgroup_kn_set_ugid(struct kernfs_node *kn)
{
struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = current_fsuid(),
.ia_gid = current_fsgid(), };
if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
return 0;
return kernfs_setattr(kn, &iattr);
}
static void cgroup_file_notify_timer(struct timer_list *timer)
{
cgroup_file_notify(container_of(timer, struct cgroup_file,
notify_timer));
}
static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
struct cftype *cft)
{
char name[CGROUP_FILE_NAME_MAX];
struct kernfs_node *kn;
struct lock_class_key *key = NULL;
int ret;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
key = &cft->lockdep_key;
#endif
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
cgroup_file_mode(cft),
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
0, cft->kf_ops, cft,
NULL, key);
if (IS_ERR(kn))
return PTR_ERR(kn);
ret = cgroup_kn_set_ugid(kn);
if (ret) {
kernfs_remove(kn);
return ret;
}
if (cft->file_offset) {
struct cgroup_file *cfile = (void *)css + cft->file_offset;
timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
spin_lock_irq(&cgroup_file_kn_lock);
cfile->kn = kn;
spin_unlock_irq(&cgroup_file_kn_lock);
}
return 0;
}
/**
* cgroup_addrm_files - add or remove files to a cgroup directory
* @css: the target css
* @cgrp: the target cgroup (usually css->cgroup)
* @cfts: array of cftypes to be added
* @is_add: whether to add or remove
*
* Depending on @is_add, add or remove files defined by @cfts on @cgrp.
* For removals, this function never fails.
*/
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add)
{
struct cftype *cft, *cft_end = NULL;
int ret = 0;
lockdep_assert_held(&cgroup_mutex);
restart:
for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
continue;
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
continue;
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
continue;
if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
continue;
if (is_add) {
ret = cgroup_add_file(css, cgrp, cft);
if (ret) {
pr_warn("%s: failed to add %s, err=%d\n",
__func__, cft->name, ret);
cft_end = cft;
is_add = false;
goto restart;
}
} else {
cgroup_rm_file(cgrp, cft);
}
}
return ret;
}
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
struct cgroup_subsys *ss = cfts[0].ss;
struct cgroup *root = &ss->root->cgrp;
struct cgroup_subsys_state *css;
int ret = 0;
lockdep_assert_held(&cgroup_mutex);
/* add/rm files for all cgroups created before */
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
struct cgroup *cgrp = css->cgroup;
if (!(css->flags & CSS_VISIBLE))
continue;
ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
if (ret)
break;
}
if (is_add && !ret)
kernfs_activate(root->kn);
return ret;
}
static void cgroup_exit_cftypes(struct cftype *cfts)
{
struct cftype *cft;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* free copy for custom atomic_write_len, see init_cftypes() */
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
kfree(cft->kf_ops);
cft->kf_ops = NULL;
cft->ss = NULL;
/* revert flags set by cgroup core while adding @cfts */
cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
}
}
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
struct kernfs_ops *kf_ops;
WARN_ON(cft->ss || cft->kf_ops);
if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
continue;
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
else
kf_ops = &cgroup_kf_single_ops;
/*
* Ugh... if @cft wants a custom max_write_len, we need to
* make a copy of kf_ops to set its atomic_write_len.
*/
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
if (!kf_ops) {
cgroup_exit_cftypes(cfts);
return -ENOMEM;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
cft->kf_ops = kf_ops;
cft->ss = ss;
}
return 0;
}
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
if (!cfts || !cfts[0].ss)
return -ENOENT;
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
return 0;
}
/**
* cgroup_rm_cftypes - remove an array of cftypes from a subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Unregister @cfts. Files described by @cfts are removed from all
* existing cgroups and all future cgroups won't have them either. This
* function can be called anytime whether @cfts' subsys is attached or not.
*
* Returns 0 on successful unregistration, -ENOENT if @cfts is not
* registered.
*/
int cgroup_rm_cftypes(struct cftype *cfts)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_mutex);
return ret;
}
/**
* cgroup_add_cftypes - add an array of cftypes to a subsystem
* @ss: target cgroup subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Register @cfts to @ss. Files described by @cfts are created for all
* existing cgroups to which @ss is attached and all future cgroups will
* have them too. This function can be called anytime whether @ss is
* attached or not.
*
* Returns 0 on successful registration, -errno on failure. Note that this
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
if (!cgroup_ssid_enabled(ss->id))
return 0;
if (!cfts || cfts[0].name[0] == '\0')
return 0;
ret = cgroup_init_cftypes(ss, cfts);
if (ret)
return ret;
mutex_lock(&cgroup_mutex);
list_add_tail(&cfts->node, &ss->cfts);
ret = cgroup_apply_cftypes(cfts, true);
if (ret)
cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_mutex);
return ret;
}
/**
* cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
* @ss: target cgroup subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Similar to cgroup_add_cftypes() but the added files are only used for
* the default hierarchy.
*/
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
cft->flags |= __CFTYPE_ONLY_ON_DFL;
return cgroup_add_cftypes(ss, cfts);
}
/**
* cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
* @ss: target cgroup subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Similar to cgroup_add_cftypes() but the added files are only used for
* the legacy hierarchies.
*/
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
cft->flags |= __CFTYPE_NOT_ON_DFL;
return cgroup_add_cftypes(ss, cfts);
}
/**
* cgroup_file_notify - generate a file modified event for a cgroup_file
* @cfile: target cgroup_file
*
* @cfile must have been obtained by setting cftype->file_offset.
*/
void cgroup_file_notify(struct cgroup_file *cfile)
{
unsigned long flags;
spin_lock_irqsave(&cgroup_file_kn_lock, flags);
if (cfile->kn) {
unsigned long last = cfile->notified_at;
unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
if (time_in_range(jiffies, last, next)) {
timer_reduce(&cfile->notify_timer, next);
} else {
kernfs_notify(cfile->kn);
cfile->notified_at = jiffies;
}
}
spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}
/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
*
* This function returns the next child of @parent and should be called
* under either cgroup_mutex or RCU read lock. The only requirement is
* that @parent and @pos are accessible. The next sibling is guaranteed to
* be returned regardless of their states.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
* future iterations and will stay visible until the last reference is put.
* A css which hasn't finished ->css_online() or already finished
* ->css_offline() may show up during traversal. It's each subsystem's
* responsibility to synchronize against on/offlining.
*/
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *parent)
{
struct cgroup_subsys_state *next;
cgroup_assert_mutex_or_rcu_locked();
/*
* @pos could already have been unlinked from the sibling list.
* Once a cgroup is removed, its ->sibling.next is no longer
* updated when its next sibling changes. CSS_RELEASED is set when
* @pos is taken off list, at which time its next pointer is valid,
* and, as releases are serialized, the one pointed to by the next
* pointer is guaranteed to not have started release yet. This
* implies that if we observe !CSS_RELEASED on @pos in this RCU
* critical section, the one pointed to by its next pointer is
* guaranteed to not have finished its RCU grace period even if we
* have dropped rcu_read_lock() in-between iterations.
*
* If @pos has CSS_RELEASED set, its next pointer can't be
* dereferenced; however, as each css is given a monotonically
* increasing unique serial number and always appended to the
* sibling list, the next one can be found by walking the parent's
* children until the first css with higher serial number than
* @pos's. While this path can be slower, it happens iff iteration
* races against release and the race window is very small.
*/
if (!pos) {
next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
} else if (likely(!(pos->flags & CSS_RELEASED))) {
next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
} else {
list_for_each_entry_rcu(next, &parent->children, sibling,
lockdep_is_held(&cgroup_mutex))
if (next->serial_nr > pos->serial_nr)
break;
}
/*
* @next, if not pointing to the head, can be dereferenced and is
* the next sibling.
*/
if (&next->sibling != &parent->children)
return next;
return NULL;
}
/**
* css_next_descendant_pre - find the next descendant for pre-order walk
* @pos: the current position (%NULL to initiate traversal)
* @root: css whose descendants to walk
*
* To be used by css_for_each_descendant_pre(). Find the next descendant
* to visit for pre-order traversal of @root's descendants. @root is
* included in the iteration and the first node to be visited.
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct next descendant as long
* as both @pos and @root are accessible and @pos is a descendant of @root.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
* future iterations and will stay visible until the last reference is put.
* A css which hasn't finished ->css_online() or already finished
* ->css_offline() may show up during traversal. It's each subsystem's
* responsibility to synchronize against on/offlining.
*/
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *root)
{
struct cgroup_subsys_state *next;
cgroup_assert_mutex_or_rcu_locked();
/* if first iteration, visit @root */
if (!pos)
return root;
/* visit the first child if exists */
next = css_next_child(NULL, pos);
if (next)
return next;
/* no child, visit my or the closest ancestor's next sibling */
while (pos != root) {
next = css_next_child(pos, pos->parent);
if (next)
return next;
pos = pos->parent;
}
return NULL;
}
EXPORT_SYMBOL_GPL(css_next_descendant_pre);
/**
* css_rightmost_descendant - return the rightmost descendant of a css
* @pos: css of interest
*
* Return the rightmost descendant of @pos. If there's no descendant, @pos
* is returned. This can be used during pre-order traversal to skip
* subtree of @pos.
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct rightmost descendant as
* long as @pos is accessible.
*/
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
struct cgroup_subsys_state *last, *tmp;
cgroup_assert_mutex_or_rcu_locked();
do {
last = pos;
/* ->prev isn't RCU safe, walk ->next till the end */
pos = NULL;
css_for_each_child(tmp, last)
pos = tmp;
} while (pos);
return last;
}
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
{
struct cgroup_subsys_state *last;
do {
last = pos;
pos = css_next_child(NULL, pos);
} while (pos);
return last;
}
/**
* css_next_descendant_post - find the next descendant for post-order walk
* @pos: the current position (%NULL to initiate traversal)
* @root: css whose descendants to walk
*
* To be used by css_for_each_descendant_post(). Find the next descendant
* to visit for post-order traversal of @root's descendants. @root is
* included in the iteration and the last node to be visited.
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct next descendant as long
* as both @pos and @cgroup are accessible and @pos is a descendant of
* @cgroup.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
* future iterations and will stay visible until the last reference is put.
* A css which hasn't finished ->css_online() or already finished
* ->css_offline() may show up during traversal. It's each subsystem's
* responsibility to synchronize against on/offlining.
*/
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *root)
{
struct cgroup_subsys_state *next;
cgroup_assert_mutex_or_rcu_locked();
/* if first iteration, visit leftmost descendant which may be @root */
if (!pos)
return css_leftmost_descendant(root);
/* if we visited @root, we're done */
if (pos == root)
return NULL;
/* if there's an unvisited sibling, visit its leftmost descendant */
next = css_next_child(pos, pos->parent);
if (next)
return css_leftmost_descendant(next);
/* no sibling left, visit parent */
return pos->parent;
}
/**
* css_has_online_children - does a css have online children
* @css: the target css
*
* Returns %true if @css has any online children; otherwise, %false. This
* function can be called from any context but the caller is responsible
* for synchronizing against on/offlining as necessary.
*/
bool css_has_online_children(struct cgroup_subsys_state *css)
{
struct cgroup_subsys_state *child;
bool ret = false;
rcu_read_lock();
css_for_each_child(child, css) {
if (child->flags & CSS_ONLINE) {
ret = true;
break;
}
}
rcu_read_unlock();
return ret;
}
static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
{
struct list_head *l;
struct cgrp_cset_link *link;
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
/* find the next threaded cset */
if (it->tcset_pos) {
l = it->tcset_pos->next;
if (l != it->tcset_head) {
it->tcset_pos = l;
return container_of(l, struct css_set,
threaded_csets_node);
}
it->tcset_pos = NULL;
}
/* find the next cset */
l = it->cset_pos;
l = l->next;
if (l == it->cset_head) {
it->cset_pos = NULL;
return NULL;
}
if (it->ss) {
cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
} else {
link = list_entry(l, struct cgrp_cset_link, cset_link);
cset = link->cset;
}
it->cset_pos = l;
/* initialize threaded css_set walking */
if (it->flags & CSS_TASK_ITER_THREADED) {
if (it->cur_dcset)
put_css_set_locked(it->cur_dcset);
it->cur_dcset = cset;
get_css_set(cset);
it->tcset_head = &cset->threaded_csets;
it->tcset_pos = &cset->threaded_csets;
}
return cset;
}
/**
* css_task_iter_advance_css_set - advance a task iterator to the next css_set
* @it: the iterator to advance
*
* Advance @it to the next css_set to walk.
*/
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
/* Advance to the next non-empty css_set and find first non-empty tasks list*/
while ((cset = css_task_iter_next_css_set(it))) {
if (!list_empty(&cset->tasks)) {
it->cur_tasks_head = &cset->tasks;
break;
} else if (!list_empty(&cset->mg_tasks)) {
it->cur_tasks_head = &cset->mg_tasks;
break;
} else if (!list_empty(&cset->dying_tasks)) {
it->cur_tasks_head = &cset->dying_tasks;
break;
}
}
if (!cset) {
it->task_pos = NULL;
return;
}
it->task_pos = it->cur_tasks_head->next;
/*
* We don't keep css_sets locked across iteration steps and thus
* need to take steps to ensure that iteration can be resumed after
* the lock is re-acquired. Iteration is performed at two levels -
* css_sets and tasks in them.
*
* Once created, a css_set never leaves its cgroup lists, so a
* pinned css_set is guaranteed to stay put and we can resume
* iteration afterwards.
*
* Tasks may leave @cset across iteration steps. This is resolved
* by registering each iterator with the css_set currently being
* walked and making css_set_move_task() advance iterators whose
* next task is leaving.
*/
if (it->cur_cset) {
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
}
get_css_set(cset);
it->cur_cset = cset;
list_add(&it->iters_node, &cset->task_iters);
}
static void css_task_iter_skip(struct css_task_iter *it,
struct task_struct *task)
{
lockdep_assert_held(&css_set_lock);
if (it->task_pos == &task->cg_list) {
it->task_pos = it->task_pos->next;
it->flags |= CSS_TASK_ITER_SKIPPED;
}
}
static void css_task_iter_advance(struct css_task_iter *it)
{
struct task_struct *task;
lockdep_assert_held(&css_set_lock);
repeat:
if (it->task_pos) {
/*
* Advance iterator to find next entry. We go through cset
* tasks, mg_tasks and dying_tasks, when consumed we move onto
* the next cset.
*/
if (it->flags & CSS_TASK_ITER_SKIPPED)
it->flags &= ~CSS_TASK_ITER_SKIPPED;
else
it->task_pos = it->task_pos->next;
if (it->task_pos == &it->cur_cset->tasks) {
it->cur_tasks_head = &it->cur_cset->mg_tasks;
it->task_pos = it->cur_tasks_head->next;
}
if (it->task_pos == &it->cur_cset->mg_tasks) {
it->cur_tasks_head = &it->cur_cset->dying_tasks;
it->task_pos = it->cur_tasks_head->next;
}
if (it->task_pos == &it->cur_cset->dying_tasks)
css_task_iter_advance_css_set(it);
} else {
/* called from start, proceed to the first cset */
css_task_iter_advance_css_set(it);
}
if (!it->task_pos)
return;
task = list_entry(it->task_pos, struct task_struct, cg_list);
if (it->flags & CSS_TASK_ITER_PROCS) {
/* if PROCS, skip over tasks which aren't group leaders */
if (!thread_group_leader(task))
goto repeat;
/* and dying leaders w/o live member threads */
if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
!atomic_read(&task->signal->live))
goto repeat;
} else {
/* skip all dying ones */
if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
goto repeat;
}
}
/**
* css_task_iter_start - initiate task iteration
* @css: the css to walk tasks of
* @flags: CSS_TASK_ITER_* flags
* @it: the task iterator to use
*
* Initiate iteration through the tasks of @css. The caller can call
* css_task_iter_next() to walk through the tasks until the function
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
*/
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it)
{
memset(it, 0, sizeof(*it));
spin_lock_irq(&css_set_lock);
it->ss = css->ss;
it->flags = flags;
if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
else
it->cset_pos = &css->cgroup->cset_links;
it->cset_head = it->cset_pos;
css_task_iter_advance(it);
spin_unlock_irq(&css_set_lock);
}
/**
* css_task_iter_next - return the next task for the iterator
* @it: the task iterator being iterated
*
* The "next" function for task iteration. @it should have been
* initialized via css_task_iter_start(). Returns NULL when the iteration
* reaches the end.
*/
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
if (it->cur_task) {
put_task_struct(it->cur_task);
it->cur_task = NULL;
}
spin_lock_irq(&css_set_lock);
/* @it may be half-advanced by skips, finish advancing */
if (it->flags & CSS_TASK_ITER_SKIPPED)
css_task_iter_advance(it);
if (it->task_pos) {
it->cur_task = list_entry(it->task_pos, struct task_struct,
cg_list);
get_task_struct(it->cur_task);
css_task_iter_advance(it);
}
spin_unlock_irq(&css_set_lock);
return it->cur_task;
}
/**
* css_task_iter_end - finish task iteration
* @it: the task iterator to finish
*
* Finish task iteration started by css_task_iter_start().
*/
void css_task_iter_end(struct css_task_iter *it)
{
if (it->cur_cset) {
spin_lock_irq(&css_set_lock);
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
spin_unlock_irq(&css_set_lock);
}
if (it->cur_dcset)
put_css_set(it->cur_dcset);
if (it->cur_task)
put_task_struct(it->cur_task);
}
static void cgroup_procs_release(struct kernfs_open_file *of)
{
struct cgroup_file_ctx *ctx = of->priv;
if (ctx->procs.started)
css_task_iter_end(&ctx->procs.iter);
}
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
struct cgroup_file_ctx *ctx = of->priv;
if (pos)
(*pos)++;
return css_task_iter_next(&ctx->procs.iter);
}
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
unsigned int iter_flags)
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_file_ctx *ctx = of->priv;
struct css_task_iter *it = &ctx->procs.iter;
/*
* When a seq_file is seeked, it's always traversed sequentially
* from position 0, so we can simply keep iterating on !0 *pos.
*/
if (!ctx->procs.started) {
if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
css_task_iter_start(&cgrp->self, iter_flags, it);
ctx->procs.started = true;
} else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
} else
return it->cur_task;
return cgroup_procs_next(s, NULL, NULL);
}
static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
{
struct cgroup *cgrp = seq_css(s)->cgroup;
/*
* All processes of a threaded subtree belong to the domain cgroup
* of the subtree. Only threads can be distributed across the
* subtree. Reject reads on cgroup.procs in the subtree proper.
* They're always empty anyway.
*/
if (cgroup_is_threaded(cgrp))
return ERR_PTR(-EOPNOTSUPP);
return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
CSS_TASK_ITER_THREADED);
}
static int cgroup_procs_show(struct seq_file *s, void *v)
{
seq_printf(s, "%d\n", task_pid_vnr(v));
return 0;
}
static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
{
int ret;
struct inode *inode;
lockdep_assert_held(&cgroup_mutex);
inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
if (!inode)
return -ENOMEM;
ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
iput(inode);
return ret;
}
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
struct super_block *sb,
struct cgroup_namespace *ns)
{
struct cgroup *com_cgrp = src_cgrp;
int ret;
lockdep_assert_held(&cgroup_mutex);
/* find the common ancestor */
while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
com_cgrp = cgroup_parent(com_cgrp);
/* %current should be authorized to migrate to the common ancestor */
ret = cgroup_may_write(com_cgrp, sb);
if (ret)
return ret;
/*
* If namespaces are delegation boundaries, %current must be able
* to see both source and destination cgroups from its namespace.
*/
if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
(!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
!cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
return -ENOENT;
return 0;
}
static int cgroup_attach_permissions(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
struct super_block *sb, bool threadgroup,
struct cgroup_namespace *ns)
{
int ret = 0;
ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
if (ret)
return ret;
ret = cgroup_migrate_vet_dst(dst_cgrp);
if (ret)
return ret;
if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
ret = -EOPNOTSUPP;
return ret;
}
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
bool threadgroup)
{
struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
const struct cred *saved_cred;
ssize_t ret;
bool locked;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
task = cgroup_procs_write_start(buf, threadgroup, &locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
/* find the source cgroup */
spin_lock_irq(&css_set_lock);
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
/*
* Process and thread migrations follow same delegation rule. Check
* permissions using the credentials from file open to protect against
* inherited fd attacks.
*/
saved_cred = override_creds(of->file->f_cred);
ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
of->file->f_path.dentry->d_sb,
threadgroup, ctx->ns);
revert_creds(saved_cred);
if (ret)
goto out_finish;
ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
cgroup_procs_write_finish(task, locked);
out_unlock:
cgroup_kn_unlock(of->kn);
return ret;
}
static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
return __cgroup_procs_write(of, buf, true) ?: nbytes;
}
static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
{
return __cgroup_procs_start(s, pos, 0);
}
static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
return __cgroup_procs_write(of, buf, false) ?: nbytes;
}
/* cgroup core interface files for the default hierarchy */
static struct cftype cgroup_base_files[] = {
{
.name = "cgroup.type",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_type_show,
.write = cgroup_type_write,
},
{
.name = "cgroup.procs",
.flags = CFTYPE_NS_DELEGATABLE,
.file_offset = offsetof(struct cgroup, procs_file),
.release = cgroup_procs_release,
.seq_start = cgroup_procs_start,
.seq_next = cgroup_procs_next,
.seq_show = cgroup_procs_show,
.write = cgroup_procs_write,
},
{
.name = "cgroup.threads",
.flags = CFTYPE_NS_DELEGATABLE,
.release = cgroup_procs_release,
.seq_start = cgroup_threads_start,
.seq_next = cgroup_procs_next,
.seq_show = cgroup_procs_show,
.write = cgroup_threads_write,
},
{
.name = "cgroup.controllers",
.seq_show = cgroup_controllers_show,
},
{
.name = "cgroup.subtree_control",
.flags = CFTYPE_NS_DELEGATABLE,
.seq_show = cgroup_subtree_control_show,
.write = cgroup_subtree_control_write,
},
{
.name = "cgroup.events",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct cgroup, events_file),
.seq_show = cgroup_events_show,
},
{
.name = "cgroup.max.descendants",
.seq_show = cgroup_max_descendants_show,
.write = cgroup_max_descendants_write,
},
{
.name = "cgroup.max.depth",
.seq_show = cgroup_max_depth_show,
.write = cgroup_max_depth_write,
},
{
.name = "cgroup.stat",
.seq_show = cgroup_stat_show,
},
{
.name = "cgroup.freeze",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_freeze_show,
.write = cgroup_freeze_write,
},
{
.name = "cgroup.kill",
.flags = CFTYPE_NOT_ON_ROOT,
.write = cgroup_kill_write,
},
{
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
#ifdef CONFIG_PSI
{
.name = "io.pressure",
.flags = CFTYPE_PRESSURE,
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
{
.name = "memory.pressure",
.flags = CFTYPE_PRESSURE,
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
{
.name = "cpu.pressure",
.flags = CFTYPE_PRESSURE,
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
#endif /* CONFIG_PSI */
{ } /* terminate */
};
/*
* css destruction is four-stage process.
*
* 1. Destruction starts. Killing of the percpu_ref is initiated.
* Implemented in kill_css().
*
* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
* and thus css_tryget_online() is guaranteed to fail, the css can be
* offlined by invoking offline_css(). After offlining, the base ref is
* put. Implemented in css_killed_work_fn().
*
* 3. When the percpu_ref reaches zero, the only possible remaining
* accessors are inside RCU read sections. css_release() schedules the
* RCU callback.
*
* 4. After the grace period, the css can be freed. Implemented in
* css_free_work_fn().
*
* It is actually hairier because both step 2 and 4 require process context
* and thus involve punting to css->destroy_work adding two additional
* steps to the already complex sequence.
*/
static void css_free_rwork_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
struct cgroup_subsys_state, destroy_rwork);
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
percpu_ref_exit(&css->refcnt);
if (ss) {
/* css free path */
struct cgroup_subsys_state *parent = css->parent;
int id = css->id;
ss->css_free(css);
cgroup_idr_remove(&ss->css_idr, id);
cgroup_put(cgrp);
if (parent)
css_put(parent);
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
if (cgroup_parent(cgrp)) {
/*
* We get a ref to the parent, and put the ref when
* this cgroup is being freed, so it's guaranteed
* that the parent won't be destroyed before its
* children.
*/
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
psi_cgroup_free(cgrp);
cgroup_rstat_exit(cgrp);
kfree(cgrp);
} else {
/*
* This is root cgroup's refcnt reaching zero,
* which indicates that the root should be
* released.
*/
cgroup_destroy_root(cgrp->root);
}
}
}
static void css_release_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
mutex_lock(&cgroup_mutex);
css->flags |= CSS_RELEASED;
list_del_rcu(&css->sibling);
if (ss) {
/* css release path */
if (!list_empty(&css->rstat_css_node)) {
cgroup_rstat_flush(cgrp);
list_del_rcu(&css->rstat_css_node);
}
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
ss->css_released(css);
} else {
struct cgroup *tcgrp;
/* cgroup release path */
TRACE_CGROUP_PATH(release, cgrp);
cgroup_rstat_flush(cgrp);
spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
tcgrp = cgroup_parent(tcgrp))
tcgrp->nr_dying_descendants--;
spin_unlock_irq(&css_set_lock);
/*
* There are two control paths which try to determine
* cgroup from dentry without going through kernfs -
* cgroupstats_build() and css_tryget_online_from_dir().
* Those are supported by RCU protecting clearing of
* cgrp->kn->priv backpointer.
*/
if (cgrp->kn)
RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
NULL);
}
mutex_unlock(&cgroup_mutex);
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
}
static void css_release(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_release_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);
}
static void init_and_link_css(struct cgroup_subsys_state *css,
struct cgroup_subsys *ss, struct cgroup *cgrp)
{
lockdep_assert_held(&cgroup_mutex);
cgroup_get_live(cgrp);
memset(css, 0, sizeof(*css));
css->cgroup = cgrp;
css->ss = ss;
css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
INIT_LIST_HEAD(&css->rstat_css_node);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
if (cgroup_parent(cgrp)) {
css->parent = cgroup_css(cgroup_parent(cgrp), ss);
css_get(css->parent);
}
if (ss->css_rstat_flush)
list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
BUG_ON(cgroup_css(cgrp, ss));
}
/* invoke ->css_online() on a new CSS and mark it online if successful */
static int online_css(struct cgroup_subsys_state *css)
{
struct cgroup_subsys *ss = css->ss;
int ret = 0;
lockdep_assert_held(&cgroup_mutex);
if (ss->css_online)
ret = ss->css_online(css);
if (!ret) {
css->flags |= CSS_ONLINE;
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
atomic_inc(&css->online_cnt);
if (css->parent)
atomic_inc(&css->parent->online_cnt);
}
return ret;
}
/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
static void offline_css(struct cgroup_subsys_state *css)
{
struct cgroup_subsys *ss = css->ss;
lockdep_assert_held(&cgroup_mutex);
if (!(css->flags & CSS_ONLINE))
return;
if (ss->css_offline)
ss->css_offline(css);
css->flags &= ~CSS_ONLINE;
RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
wake_up_all(&css->cgroup->offline_waitq);
}
/**
* css_create - create a cgroup_subsys_state
* @cgrp: the cgroup new css will be associated with
* @ss: the subsys of new css
*
* Create a new css associated with @cgrp - @ss pair. On success, the new
* css is online and installed in @cgrp. This function doesn't create the
* interface files. Returns 0 on success, -errno on failure.
*/
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
struct cgroup_subsys_state *css;
int err;
lockdep_assert_held(&cgroup_mutex);
css = ss->css_alloc(parent_css);
if (!css)
css = ERR_PTR(-ENOMEM);
if (IS_ERR(css))
return css;
init_and_link_css(css, ss, cgrp);
err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
if (err)
goto err_free_css;
err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
if (err < 0)
goto err_free_css;
css->id = err;
/* @css is ready to be brought online now, make it visible */
list_add_tail_rcu(&css->sibling, &parent_css->children);
cgroup_idr_replace(&ss->css_idr, css, css->id);
err = online_css(css);
if (err)
goto err_list_del;
return css;
err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
list_del_rcu(&css->rstat_css_node);
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
return ERR_PTR(err);
}
/*
* The returned cgroup is fully initialized including its control mask, but
* it isn't associated with its kernfs_node and doesn't have the control
* mask applied.
*/
static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
umode_t mode)
{
struct cgroup_root *root = parent->root;
struct cgroup *cgrp, *tcgrp;
struct kernfs_node *kn;
int level = parent->level + 1;
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
if (ret)
goto out_free_cgrp;
ret = cgroup_rstat_init(cgrp);
if (ret)
goto out_cancel_ref;
/* create the directory */
kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
goto out_stat_exit;
}
cgrp->kn = kn;
init_cgroup_housekeeping(cgrp);
cgrp->self.parent = &parent->self;
cgrp->root = root;
cgrp->level = level;
ret = psi_cgroup_alloc(cgrp);
if (ret)
goto out_kernfs_remove;
ret = cgroup_bpf_inherit(cgrp);
if (ret)
goto out_psi_free;
/*
* New cgroup inherits effective freeze counter, and
* if the parent has to be frozen, the child has too.
*/
cgrp->freezer.e_freeze = parent->freezer.e_freeze;
if (cgrp->freezer.e_freeze) {
/*
* Set the CGRP_FREEZE flag, so when a process will be
* attached to the child cgroup, it will become frozen.
* At this point the new cgroup is unpopulated, so we can
* consider it frozen immediately.
*/
set_bit(CGRP_FREEZE, &cgrp->flags);
set_bit(CGRP_FROZEN, &cgrp->flags);
}
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
/*
* If the new cgroup is frozen, all ancestor cgroups
* get a new frozen descendant, but their state can't
* change because of this.
*/
if (cgrp->freezer.e_freeze)
tcgrp->freezer.nr_frozen_descendants++;
}
}
spin_unlock_irq(&css_set_lock);
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
cgrp->self.serial_nr = css_serial_nr_next++;
/* allocation complete, commit to creation */
list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
atomic_inc(&root->nr_cgrps);
cgroup_get_live(parent);
/*
* On the default hierarchy, a child doesn't automatically inherit
* subtree_control from the parent. Each is configured manually.
*/
if (!cgroup_on_dfl(cgrp))
cgrp->subtree_control = cgroup_control(cgrp);
cgroup_propagate_control(cgrp);
return cgrp;
out_psi_free:
psi_cgroup_free(cgrp);
out_kernfs_remove:
kernfs_remove(cgrp->kn);
out_stat_exit:
cgroup_rstat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
kfree(cgrp);
return ERR_PTR(ret);
}
static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{
struct cgroup *cgroup;
int ret = false;
int level = 1;
lockdep_assert_held(&cgroup_mutex);
for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
if (cgroup->nr_descendants >= cgroup->max_descendants)
goto fail;
if (level > cgroup->max_depth)
goto fail;
level++;
}
ret = true;
fail:
return ret;
}
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
int ret;
/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
if (strchr(name, '\n'))
return -EINVAL;
parent = cgroup_kn_lock_live(parent_kn, false);
if (!parent)
return -ENODEV;
if (!cgroup_check_hierarchy_limits(parent)) {
ret = -EAGAIN;
goto out_unlock;
}
cgrp = cgroup_create(parent, name, mode);
if (IS_ERR(cgrp)) {
ret = PTR_ERR(cgrp);
goto out_unlock;
}
/*
* This extra ref will be put in cgroup_free_fn() and guarantees
* that @cgrp->kn is always accessible.
*/
kernfs_get(cgrp->kn);
ret = cgroup_kn_set_ugid(cgrp->kn);
if (ret)
goto out_destroy;
ret = css_populate_dir(&cgrp->self);
if (ret)
goto out_destroy;
ret = cgroup_apply_control_enable(cgrp);
if (ret)
goto out_destroy;
TRACE_CGROUP_PATH(mkdir, cgrp);
/* let's create and online css's */
kernfs_activate(cgrp->kn);
ret = 0;
goto out_unlock;
out_destroy:
cgroup_destroy_locked(cgrp);
out_unlock:
cgroup_kn_unlock(parent_kn);
return ret;
}
/*
* This is called when the refcnt of a css is confirmed to be killed.
* css_tryget_online() is now guaranteed to fail. Tell the subsystem to
* initiate destruction and put the css ref from kill_css().
*/
static void css_killed_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
mutex_lock(&cgroup_mutex);
do {
offline_css(css);
css_put(css);
/* @css can't go away while we're holding cgroup_mutex */
css = css->parent;
} while (css && atomic_dec_and_test(&css->online_cnt));
mutex_unlock(&cgroup_mutex);
}
/* css kill confirmation processing requires process context, bounce */
static void css_killed_ref_fn(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
if (atomic_dec_and_test(&css->online_cnt)) {
INIT_WORK(&css->destroy_work, css_killed_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);
}
}
/**
* kill_css - destroy a css
* @css: css to destroy
*
* This function initiates destruction of @css by removing cgroup interface
* files and putting its base reference. ->css_offline() will be invoked
* asynchronously once css_tryget_online() is guaranteed to fail and when
* the reference count reaches zero, @css will be released.
*/
static void kill_css(struct cgroup_subsys_state *css)
{
lockdep_assert_held(&cgroup_mutex);
if (css->flags & CSS_DYING)
return;
css->flags |= CSS_DYING;
/*
* This must happen before css is disassociated with its cgroup.
* See seq_css() for details.
*/
css_clear_dir(css);
/*
* Killing would put the base ref, but we need to keep it alive
* until after ->css_offline().
*/
css_get(css);
/*
* cgroup core guarantees that, by the time ->css_offline() is
* invoked, no new css reference will be given out via
* css_tryget_online(). We can't simply call percpu_ref_kill() and
* proceed to offlining css's because percpu_ref_kill() doesn't
* guarantee that the ref is seen as killed on all CPUs on return.
*
* Use percpu_ref_kill_and_confirm() to get notifications as each
* css is confirmed to be seen as killed on all CPUs.
*/
percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}
/**
* cgroup_destroy_locked - the first stage of cgroup destruction
* @cgrp: cgroup to be destroyed
*
* css's make use of percpu refcnts whose killing latency shouldn't be
* exposed to userland and are RCU protected. Also, cgroup core needs to
* guarantee that css_tryget_online() won't succeed by the time
* ->css_offline() is invoked. To satisfy all the requirements,
* destruction is implemented in the following two steps.
*
* s1. Verify @cgrp can be destroyed and mark it dying. Remove all
* userland visible parts and start killing the percpu refcnts of
* css's. Set up so that the next stage will be kicked off once all
* the percpu refcnts are confirmed to be killed.
*
* s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
* rest of destruction. Once all cgroup references are gone, the
* cgroup is RCU-freed.
*
* This function implements s1. After this step, @cgrp is gone as far as
* the userland is concerned and a new cgroup with the same name may be
* created. As cgroup doesn't care about the names internally, this
* doesn't cause any problem.
*/
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
int ssid;
lockdep_assert_held(&cgroup_mutex);
/*
* Only migration can raise populated from zero and we're already
* holding cgroup_mutex.
*/
if (cgroup_is_populated(cgrp))
return -EBUSY;
/*
* Make sure there's no live children. We can't test emptiness of
* ->self.children as dead children linger on it while being
* drained; otherwise, "rmdir parent/child parent" may fail.
*/
if (css_has_online_children(&cgrp->self))
return -EBUSY;
/*
* Mark @cgrp and the associated csets dead. The former prevents
* further task migration and child creation by disabling
* cgroup_lock_live_group(). The latter makes the csets ignored by
* the migration path.
*/
cgrp->self.flags &= ~CSS_ONLINE;
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
link->cset->dead = true;
spin_unlock_irq(&css_set_lock);
/* initiate massacre of all css's */
for_each_css(css, ssid, cgrp)
kill_css(css);
/* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
css_clear_dir(&cgrp->self);
kernfs_remove(cgrp->kn);
if (parent && cgroup_is_threaded(cgrp))
parent->nr_threaded_children--;
spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
tcgrp->nr_descendants--;
tcgrp->nr_dying_descendants++;
/*
* If the dying cgroup is frozen, decrease frozen descendants
* counters of ancestor cgroups.
*/
if (test_bit(CGRP_FROZEN, &cgrp->flags))
tcgrp->freezer.nr_frozen_descendants--;
}
spin_unlock_irq(&css_set_lock);
cgroup1_check_for_release(parent);
cgroup_bpf_offline(cgrp);
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
return 0;
};
int cgroup_rmdir(struct kernfs_node *kn)
{
struct cgroup *cgrp;
int ret = 0;
cgrp = cgroup_kn_lock_live(kn, false);
if (!cgrp)
return 0;
ret = cgroup_destroy_locked(cgrp);
if (!ret)
TRACE_CGROUP_PATH(rmdir, cgrp);
cgroup_kn_unlock(kn);
return ret;
}
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.show_options = cgroup_show_options,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.show_path = cgroup_show_path,
};
static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
struct cgroup_subsys_state *css;
pr_debug("Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_mutex);
idr_init(&ss->css_idr);
INIT_LIST_HEAD(&ss->cfts);
/* Create the root cgroup state for this subsystem */
ss->root = &cgrp_dfl_root;
css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
/*
* Root csses are never destroyed and we can't initialize
* percpu_ref during early init. Disable refcnting.
*/
css->flags |= CSS_NO_REF;
if (early) {
/* allocation can't be done safely during early init */
css->id = 1;
} else {
css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
BUG_ON(css->id < 0);
}
/* Update the init_css_set to contain a subsys
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
* init_css_set is in the subsystem's root cgroup. */
init_css_set.subsys[ss->id] = css;
have_fork_callback |= (bool)ss->fork << ss->id;
have_exit_callback |= (bool)ss->exit << ss->id;
have_release_callback |= (bool)ss->release << ss->id;
have_canfork_callback |= (bool)ss->can_fork << ss->id;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
BUG_ON(online_css(css));
mutex_unlock(&cgroup_mutex);
}
/**
* cgroup_init_early - cgroup initialization at system boot
*
* Initialize cgroups at system boot, and initialize any
* subsystems that request early init.
*/
int __init cgroup_init_early(void)
{
static struct cgroup_fs_context __initdata ctx;
struct cgroup_subsys *ss;
int i;
ctx.root = &cgrp_dfl_root;
init_cgroup_root(&ctx);
cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
for_each_subsys(ss, i) {
WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
"invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
ss->id, ss->name);
WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
"cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
ss->id = i;
ss->name = cgroup_subsys_name[i];
if (!ss->legacy_name)
ss->legacy_name = cgroup_subsys_name[i];
if (ss->early_init)
cgroup_init_subsys(ss, true);
}
return 0;
}
/**
* cgroup_init - cgroup initialization
*
* Register cgroup filesystem and /proc file, and initialize
* any subsystems that didn't request early init.
*/
int __init cgroup_init(void)
{
struct cgroup_subsys *ss;
int ssid;
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
cgroup_rstat_boot();
/*
* The latency of the synchronize_rcu() is too high for cgroups,
* avoid it at the cost of forcing all readers into the slow path.
*/
rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
get_user_ns(init_cgroup_ns.user_ns);
mutex_lock(&cgroup_mutex);
/*
* Add init_css_set to the hash table so that dfl_root can link to
* it during init.
*/
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
mutex_unlock(&cgroup_mutex);
for_each_subsys(ss, ssid) {
if (ss->early_init) {
struct cgroup_subsys_state *css =
init_css_set.subsys[ss->id];
css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
GFP_KERNEL);
BUG_ON(css->id < 0);
} else {
cgroup_init_subsys(ss, false);
}
list_add_tail(&init_css_set.e_cset_node[ssid],
&cgrp_dfl_root.cgrp.e_csets[ssid]);
/*
* Setting dfl_root subsys_mask needs to consider the
* disabled flag and cftype registration needs kmalloc,
* both of which aren't available during early_init.
*/
if (!cgroup_ssid_enabled(ssid))
continue;
if (cgroup1_ssid_disabled(ssid))
printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
ss->name);
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
/* implicit controllers must be threaded too */
WARN_ON(ss->implicit_on_dfl && !ss->threaded);
if (ss->implicit_on_dfl)
cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
else if (!ss->dfl_cftypes)
cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
if (ss->threaded)
cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
if (ss->dfl_cftypes == ss->legacy_cftypes) {
WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
} else {
WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
}
if (ss->bind)
ss->bind(init_css_set.subsys[ssid]);
mutex_lock(&cgroup_mutex);
css_populate_dir(init_css_set.subsys[ssid]);
mutex_unlock(&cgroup_mutex);
}
/* init_css_set.subsys[] has been updated, re-hash */
hash_del(&init_css_set.hlist);
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
#ifdef CONFIG_CPUSETS
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
return 0;
}
static int __init cgroup_wq_init(void)
{
/*
* There isn't much point in executing destruction path in
* parallel. Good chunk is serialized with cgroup_mutex anyway.
* Use 1 for @max_active.
*
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
BUG_ON(!cgroup_destroy_wq);
return 0;
}
core_initcall(cgroup_wq_init);
void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{
struct kernfs_node *kn;
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
return;
kernfs_path(kn, buf, buflen);
kernfs_put(kn);
}
/*
* cgroup_get_from_id : get the cgroup associated with cgroup id
* @id: cgroup id
* On success return the cgrp, on failure return NULL
*/
struct cgroup *cgroup_get_from_id(u64 id)
{
struct kernfs_node *kn;
struct cgroup *cgrp = NULL;
mutex_lock(&cgroup_mutex);
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
goto out_unlock;
cgrp = kn->priv;
if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp))
cgrp = NULL;
kernfs_put(kn);
out_unlock:
mutex_unlock(&cgroup_mutex);
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_id);
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
*/
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk)
{
char *buf;
int retval;
struct cgroup_root *root;
retval = -ENOMEM;
buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
for_each_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
int ssid, count = 0;
if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
continue;
seq_printf(m, "%d:", root->hierarchy_id);
if (root != &cgrp_dfl_root)
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_printf(m, "%s%s", count++ ? "," : "",
ss->legacy_name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
cgrp = task_cgroup_from_root(tsk, root);
/*
* On traditional hierarchies, all zombie tasks show up as
* belonging to the root cgroup. On the default hierarchy,
* while a zombie doesn't show up in "cgroup.procs" and
* thus can't be migrated, its /proc/PID/cgroup keeps
* reporting the cgroup it belonged to before exiting. If
* the cgroup is removed before the zombie is reaped,
* " (deleted)" is appended to the cgroup path.
*/
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
if (retval >= PATH_MAX)
retval = -ENAMETOOLONG;
if (retval < 0)
goto out_unlock;
seq_puts(m, buf);
} else {
seq_puts(m, "/");
}
if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
seq_puts(m, " (deleted)\n");
else
seq_putc(m, '\n');
}
retval = 0;
out_unlock:
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
kfree(buf);
out:
return retval;
}
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
*
* A task is associated with the init_css_set until cgroup_post_fork()
* attaches it to the target css_set.
*/
void cgroup_fork(struct task_struct *child)
{
RCU_INIT_POINTER(child->cgroups, &init_css_set);
INIT_LIST_HEAD(&child->cg_list);
}
static struct cgroup *cgroup_get_from_file(struct file *f)
{
struct cgroup_subsys_state *css;
struct cgroup *cgrp;
css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
if (IS_ERR(css))
return ERR_CAST(css);
cgrp = css->cgroup;
if (!cgroup_on_dfl(cgrp)) {
cgroup_put(cgrp);
return ERR_PTR(-EBADF);
}
return cgrp;
}
/**
* cgroup_css_set_fork - find or create a css_set for a child process
* @kargs: the arguments passed to create the child process
*
* This functions finds or creates a new css_set which the child
* process will be attached to in cgroup_post_fork(). By default,
* the child process will be given the same css_set as its parent.
*
* If CLONE_INTO_CGROUP is specified this function will try to find an
* existing css_set which includes the requested cgroup and if not create
* a new css_set that the child will be attached to later. If this function
* succeeds it will hold cgroup_threadgroup_rwsem on return. If
* CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
* before grabbing cgroup_threadgroup_rwsem and will hold a reference
* to the target cgroup.
*/
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
int ret;
struct cgroup *dst_cgrp = NULL;
struct css_set *cset;
struct super_block *sb;
struct file *f;
if (kargs->flags & CLONE_INTO_CGROUP)
mutex_lock(&cgroup_mutex);
cgroup_threadgroup_change_begin(current);
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
spin_unlock_irq(&css_set_lock);
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
kargs->cset = cset;
return 0;
}
f = fget_raw(kargs->cgroup);
if (!f) {
ret = -EBADF;
goto err;
}
sb = f->f_path.dentry->d_sb;
dst_cgrp = cgroup_get_from_file(f);
if (IS_ERR(dst_cgrp)) {
ret = PTR_ERR(dst_cgrp);
dst_cgrp = NULL;
goto err;
}
if (cgroup_is_dead(dst_cgrp)) {
ret = -ENODEV;
goto err;
}
/*
* Verify that we the target cgroup is writable for us. This is
* usually done by the vfs layer but since we're not going through
* the vfs layer here we need to do it "manually".
*/
ret = cgroup_may_write(dst_cgrp, sb);
if (ret)
goto err;
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
!(kargs->flags & CLONE_THREAD),
current->nsproxy->cgroup_ns);
if (ret)
goto err;
kargs->cset = find_css_set(cset, dst_cgrp);
if (!kargs->cset) {
ret = -ENOMEM;
goto err;
}
put_css_set(cset);
fput(f);
kargs->cgrp = dst_cgrp;
return ret;
err:
cgroup_threadgroup_change_end(current);
mutex_unlock(&cgroup_mutex);
if (f)
fput(f);
if (dst_cgrp)
cgroup_put(dst_cgrp);
put_css_set(cset);
if (kargs->cset)
put_css_set(kargs->cset);
return ret;
}
/**
* cgroup_css_set_put_fork - drop references we took during fork
* @kargs: the arguments passed to create the child process
*
* Drop references to the prepared css_set and target cgroup if
* CLONE_INTO_CGROUP was requested.
*/
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
cgroup_threadgroup_change_end(current);
if (kargs->flags & CLONE_INTO_CGROUP) {
struct cgroup *cgrp = kargs->cgrp;
struct css_set *cset = kargs->cset;
mutex_unlock(&cgroup_mutex);
if (cset) {
put_css_set(cset);
kargs->cset = NULL;
}
if (cgrp) {
cgroup_put(cgrp);
kargs->cgrp = NULL;
}
}
}
/**
* cgroup_can_fork - called on a new task before the process is exposed
* @child: the child process
*
* This prepares a new css_set for the child process which the child will
* be attached to in cgroup_post_fork().
* This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
* callback returns an error, the fork aborts with that error code. This
* allows for a cgroup subsystem to conditionally allow or deny new forks.
*/
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
struct cgroup_subsys *ss;
int i, j, ret;
ret = cgroup_css_set_fork(kargs);
if (ret)
return ret;
do_each_subsys_mask(ss, i, have_canfork_callback) {
ret = ss->can_fork(child, kargs->cset);
if (ret)
goto out_revert;
} while_each_subsys_mask();
return 0;
out_revert:
for_each_subsys(ss, j) {
if (j >= i)
break;
if (ss->cancel_fork)
ss->cancel_fork(child, kargs->cset);
}
cgroup_css_set_put_fork(kargs);
return ret;
}
/**
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
* @child: the child process
* @kargs: the arguments passed to create the child process
*
* This calls the cancel_fork() callbacks if a fork failed *after*
* cgroup_can_fork() succeeded and cleans up references we took to
* prepare a new css_set for the child process in cgroup_can_fork().
*/
void cgroup_cancel_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
ss->cancel_fork(child, kargs->cset);
cgroup_css_set_put_fork(kargs);
}
/**
* cgroup_post_fork - finalize cgroup setup for the child process
* @child: the child process
*
* Attach the child process to its css_set calling the subsystem fork()
* callbacks.
*/
void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
unsigned long cgrp_flags = 0;
bool kill = false;
struct cgroup_subsys *ss;
struct css_set *cset;
int i;
cset = kargs->cset;
kargs->cset = NULL;
spin_lock_irq(&css_set_lock);
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
if (kargs->cgrp)
cgrp_flags = kargs->cgrp->flags;
else
cgrp_flags = cset->dfl_cgrp->flags;
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
} else {
put_css_set(cset);
cset = NULL;
}
if (!(child->flags & PF_KTHREAD)) {
if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
/*
* If the cgroup has to be frozen, the new task has
* too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
* get the task into the frozen state.
*/
spin_lock(&child->sighand->siglock);
WARN_ON_ONCE(child->frozen);
child->jobctl |= JOBCTL_TRAP_FREEZE;
spin_unlock(&child->sighand->siglock);
/*
* Calling cgroup_update_frozen() isn't required here,
* because it will be called anyway a bit later from
* do_freezer_trap(). So we avoid cgroup's transient
* switch from the frozen state and back.
*/
}
/*
* If the cgroup is to be killed notice it now and take the
* child down right after we finished preparing it for
* userspace.
*/
kill = test_bit(CGRP_KILL, &cgrp_flags);
}
spin_unlock_irq(&css_set_lock);
/*
* Call ss->fork(). This must happen after @child is linked on
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
do_each_subsys_mask(ss, i, have_fork_callback) {
ss->fork(child);
} while_each_subsys_mask();
/* Make the new cset the root_cset of the new cgroup namespace. */
if (kargs->flags & CLONE_NEWCGROUP) {
struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
get_css_set(cset);
child->nsproxy->cgroup_ns->root_cset = cset;
put_css_set(rcset);
}
/* Cgroup has to be killed so take down child immediately. */
if (unlikely(kill))
do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
cgroup_css_set_put_fork(kargs);
}
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
*
* Description: Detach cgroup from @tsk.
*
*/
void cgroup_exit(struct task_struct *tsk)
{
struct cgroup_subsys *ss;
struct css_set *cset;
int i;
spin_lock_irq(&css_set_lock);
WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
css_set_move_task(tsk, cset, NULL, false);
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
WARN_ON_ONCE(cgroup_task_frozen(tsk));
if (unlikely(!(tsk->flags & PF_KTHREAD) &&
test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
spin_unlock_irq(&css_set_lock);
/* see cgroup_post_fork() for details */
do_each_subsys_mask(ss, i, have_exit_callback) {
ss->exit(tsk);
} while_each_subsys_mask();
}
void cgroup_release(struct task_struct *task)
{
struct cgroup_subsys *ss;
int ssid;
do_each_subsys_mask(ss, ssid, have_release_callback) {
ss->release(task);
} while_each_subsys_mask();
spin_lock_irq(&css_set_lock);
css_set_skip_task_iters(task_css_set(task), task);
list_del_init(&task->cg_list);
spin_unlock_irq(&css_set_lock);
}
void cgroup_free(struct task_struct *task)
{
struct css_set *cset = task_css_set(task);
put_css_set(cset);
}
static int __init cgroup_disable(char *str)
{
struct cgroup_subsys *ss;
char *token;
int i;
while ((token = strsep(&str, ",")) != NULL) {
if (!*token)
continue;
for_each_subsys(ss, i) {
if (strcmp(token, ss->name) &&
strcmp(token, ss->legacy_name))
continue;
static_branch_disable(cgroup_subsys_enabled_key[i]);
pr_info("Disabling %s control group subsystem\n",
ss->name);
}
for (i = 0; i < OPT_FEATURE_COUNT; i++) {
if (strcmp(token, cgroup_opt_feature_names[i]))
continue;
cgroup_feature_disable_mask |= 1 << i;
pr_info("Disabling %s control group feature\n",
cgroup_opt_feature_names[i]);
break;
}
}
return 1;
}
__setup("cgroup_disable=", cgroup_disable);
void __init __weak enable_debug_cgroup(void) { }
static int __init enable_cgroup_debug(char *str)
{
cgroup_debug = true;
enable_debug_cgroup();
return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
* @ss: subsystem of interest
*
* If @dentry is a directory for a cgroup which has @ss enabled on it, try
* to get the corresponding css and return it. If such css doesn't exist
* or can't be pinned, an ERR_PTR value is returned.
*/
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup_subsys *ss)
{
struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct file_system_type *s_type = dentry->d_sb->s_type;
struct cgroup_subsys_state *css = NULL;
struct cgroup *cgrp;
/* is @dentry a cgroup dir? */
if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
!kn || kernfs_type(kn) != KERNFS_DIR)
return ERR_PTR(-EBADF);
rcu_read_lock();
/*
* This path doesn't originate from kernfs and @kn could already
* have been or be removed at any point. @kn->priv is RCU
* protected for this access. See css_release_work_fn() for details.
*/
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
if (cgrp)
css = cgroup_css(cgrp, ss);
if (!css || !css_tryget_online(css))
css = ERR_PTR(-ENOENT);
rcu_read_unlock();
return css;
}
/**
* css_from_id - lookup css by id
* @id: the cgroup id
* @ss: cgroup subsys to be looked into
*
* Returns the css if there's valid one with @id, otherwise returns NULL.
* Should be called under rcu_read_lock().
*/
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
WARN_ON_ONCE(!rcu_read_lock_held());
return idr_find(&ss->css_idr, id);
}
/**
* cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
* @path: path on the default hierarchy
*
* Find the cgroup at @path on the default hierarchy, increment its
* reference count and return it. Returns pointer to the found cgroup on
* success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
* if @path points to a non-directory.
*/
struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
if (kn) {
if (kernfs_type(kn) == KERNFS_DIR) {
cgrp = kn->priv;
cgroup_get_live(cgrp);
} else {
cgrp = ERR_PTR(-ENOTDIR);
}
kernfs_put(kn);
} else {
cgrp = ERR_PTR(-ENOENT);
}
mutex_unlock(&cgroup_mutex);
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
/**
* cgroup_get_from_fd - get a cgroup pointer from a fd
* @fd: fd obtained by open(cgroup2_dir)
*
* Find the cgroup from a fd which should be obtained
* by opening a cgroup directory. Returns a pointer to the
* cgroup on success. ERR_PTR is returned if the cgroup
* cannot be found.
*/
struct cgroup *cgroup_get_from_fd(int fd)
{
struct cgroup *cgrp;
struct file *f;
f = fget_raw(fd);
if (!f)
return ERR_PTR(-EBADF);
cgrp = cgroup_get_from_file(f);
fput(f);
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
static u64 power_of_ten(int power)
{
u64 v = 1;
while (power--)
v *= 10;
return v;
}
/**
* cgroup_parse_float - parse a floating number
* @input: input string
* @dec_shift: number of decimal digits to shift
* @v: output
*
* Parse a decimal floating point number in @input and store the result in
* @v with decimal point right shifted @dec_shift times. For example, if
* @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.
* Returns 0 on success, -errno otherwise.
*
* There's nothing cgroup specific about this function except that it's
* currently the only user.
*/
int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
{
s64 whole, frac = 0;
int fstart = 0, fend = 0, flen;
if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
return -EINVAL;
if (frac < 0)
return -EINVAL;
flen = fend > fstart ? fend - fstart : 0;
if (flen < dec_shift)
frac *= power_of_ten(dec_shift - flen);
else
frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
*v = whole * power_of_ten(dec_shift) + frac;
return 0;
}
/*
* sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
* definition in cgroup-defs.h.
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
struct cgroup *cgroup;
rcu_read_lock();
/* Don't associate the sock with unrelated interrupted task's cgroup. */
if (in_interrupt()) {
cgroup = &cgrp_dfl_root.cgrp;
cgroup_get(cgroup);
goto out;
}
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
cgroup = cset->dfl_cgrp;
break;
}
cpu_relax();
}
out:
skcd->cgroup = cgroup;
cgroup_bpf_get(cgroup);
rcu_read_unlock();
}
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
/*
* We might be cloning a socket which is left in an empty
* cgroup and the cgroup might have already been rmdir'd.
* Don't use cgroup_get_live().
*/
cgroup_get(cgrp);
cgroup_bpf_get(cgrp);
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
cgroup_bpf_put(cgrp);
cgroup_put(cgrp);
}
#endif /* CONFIG_SOCK_CGROUP_DATA */
#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_prog *prog, struct bpf_prog *replace_prog,
struct bpf_cgroup_link *link,
enum bpf_attach_type type,
u32 flags)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
mutex_unlock(&cgroup_mutex);
return ret;
}
int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
mutex_unlock(&cgroup_mutex);
return ret;
}
int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = __cgroup_bpf_query(cgrp, attr, uattr);
mutex_unlock(&cgroup_mutex);
return ret;
}
#endif /* CONFIG_CGROUP_BPF */
#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
ssize_t size, const char *prefix)
{
struct cftype *cft;
ssize_t ret = 0;
for (cft = files; cft && cft->name[0] != '\0'; cft++) {
if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
continue;
if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
continue;
if (prefix)
ret += snprintf(buf + ret, size - ret, "%s.", prefix);
ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
if (WARN_ON(ret >= size))
break;
}
return ret;
}
static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct cgroup_subsys *ss;
int ssid;
ssize_t ret = 0;
ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
NULL);
for_each_subsys(ss, ssid)
ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
PAGE_SIZE - ret,
cgroup_subsys_name[ssid]);
return ret;
}
static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
return snprintf(buf, PAGE_SIZE,
"nsdelegate\n"
"memory_localevents\n"
"memory_recursiveprot\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
static struct attribute *cgroup_sysfs_attrs[] = {
&cgroup_delegate_attr.attr,
&cgroup_features_attr.attr,
NULL,
};
static const struct attribute_group cgroup_sysfs_attr_group = {
.attrs = cgroup_sysfs_attrs,
.name = "cgroup",
};
static int __init cgroup_sysfs_init(void)
{
return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
}
subsys_initcall(cgroup_sysfs_init);
#endif /* CONFIG_SYSFS */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SEQLOCK_H
#define __LINUX_SEQLOCK_H
/*
* seqcount_t / seqlock_t - a reader-writer consistency mechanism with
* lockless readers (read-only retry loops), and no writer starvation.
*
* See Documentation/locking/seqlock.rst
*
* Copyrights:
* - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli
* - Sequence counters with associated locks, (C) 2020 Linutronix GmbH
*/
#include <linux/compiler.h>
#include <linux/kcsan-checks.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/ww_mutex.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <asm/processor.h>
/*
* The seqlock seqcount_t interface does not prescribe a precise sequence of
* read begin/retry/end. For readers, typically there is a call to
* read_seqcount_begin() and read_seqcount_retry(), however, there are more
* esoteric cases which do not follow this pattern.
*
* As a consequence, we take the following best-effort approach for raw usage
* via seqcount_t under KCSAN: upon beginning a seq-reader critical section,
* pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as
* atomics; if there is a matching read_seqcount_retry() call, no following
* memory operations are considered atomic. Usage of the seqlock_t interface
* is not affected.
*/
#define KCSAN_SEQLOCK_REGION_MAX 1000
/*
* Sequence counters (seqcount_t)
*
* This is the raw counting mechanism, without any writer protection.
*
* Write side critical sections must be serialized and non-preemptible.
*
* If readers can be invoked from hardirq or softirq contexts,
* interrupts or bottom halves must also be respectively disabled before
* entering the write section.
*
* This mechanism can't be used if the protected data contains pointers,
* as the writer can invalidate a pointer that a reader is following.
*
* If the write serialization mechanism is one of the common kernel
* locking primitives, use a sequence counter with associated lock
* (seqcount_LOCKNAME_t) instead.
*
* If it's desired to automatically handle the sequence counter writer
* serialization and non-preemptibility requirements, use a sequential
* lock (seqlock_t) instead.
*
* See Documentation/locking/seqlock.rst
*/
typedef struct seqcount {
unsigned sequence;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
} seqcount_t;
static inline void __seqcount_init(seqcount_t *s, const char *name,
struct lock_class_key *key)
{
/*
* Make sure we are not reinitializing a held lock:
*/
lockdep_init_map(&s->dep_map, name, key, 0);
s->sequence = 0;
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define SEQCOUNT_DEP_MAP_INIT(lockname) \
.dep_map = { .name = #lockname }
/**
* seqcount_init() - runtime initializer for seqcount_t
* @s: Pointer to the seqcount_t instance
*/
# define seqcount_init(s) \
do { \
static struct lock_class_key __key; \
__seqcount_init((s), #s, &__key); \
} while (0)
static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
{
seqcount_t *l = (seqcount_t *)s;
unsigned long flags;
local_irq_save(flags);
seqcount_acquire_read(&l->dep_map, 0, 0, _RET_IP_);
seqcount_release(&l->dep_map, _RET_IP_);
local_irq_restore(flags);
}
#else
# define SEQCOUNT_DEP_MAP_INIT(lockname)
# define seqcount_init(s) __seqcount_init(s, NULL, NULL)
# define seqcount_lockdep_reader_access(x)
#endif
/**
* SEQCNT_ZERO() - static initializer for seqcount_t
* @name: Name of the seqcount_t instance
*/
#define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) }
/*
* Sequence counters with associated locks (seqcount_LOCKNAME_t)
*
* A sequence counter which associates the lock used for writer
* serialization at initialization time. This enables lockdep to validate
* that the write side critical section is properly serialized.
*
* For associated locks which do not implicitly disable preemption,
* preemption protection is enforced in the write side function.
*
* Lockdep is never used in any for the raw write variants.
*
* See Documentation/locking/seqlock.rst
*/
/*
* For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot
* disable preemption. It can lead to higher latencies, and the write side
* sections will not be able to acquire locks which become sleeping locks
* (e.g. spinlock_t).
*
* To remain preemptible while avoiding a possible livelock caused by the
* reader preempting the writer, use a different technique: let the reader
* detect if a seqcount_LOCKNAME_t writer is in progress. If that is the
* case, acquire then release the associated LOCKNAME writer serialization
* lock. This will allow any possibly-preempted writer to make progress
* until the end of its writer serialization lock critical section.
*
* This lock-unlock technique must be implemented for all of PREEMPT_RT
* sleeping locks. See Documentation/locking/locktypes.rst
*/
#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT)
#define __SEQ_LOCK(expr) expr
#else
#define __SEQ_LOCK(expr)
#endif
/*
* typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated
* @seqcount: The real sequence counter
* @lock: Pointer to the associated lock
*
* A plain sequence counter with external writer synchronization by
* LOCKNAME @lock. The lock is associated to the sequence counter in the
* static initializer or init function. This enables lockdep to validate
* that the write side critical section is properly serialized.
*
* LOCKNAME: raw_spinlock, spinlock, rwlock, mutex, or ww_mutex.
*/
/*
* seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t
* @s: Pointer to the seqcount_LOCKNAME_t instance
* @lock: Pointer to the associated lock
*/
#define seqcount_LOCKNAME_init(s, _lock, lockname) \
do { \
seqcount_##lockname##_t *____s = (s); \
seqcount_init(&____s->seqcount); \
__SEQ_LOCK(____s->lock = (_lock)); \
} while (0)
#define seqcount_raw_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, raw_spinlock)
#define seqcount_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, spinlock)
#define seqcount_rwlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, rwlock)
#define seqcount_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, mutex)
#define seqcount_ww_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, ww_mutex)
/*
* SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers
* seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t
*
* @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t
* @locktype: LOCKNAME canonical C data type
* @preemptible: preemptibility of above locktype
* @lockmember: argument for lockdep_assert_held()
* @lockbase: associated lock release function (prefix only)
* @lock_acquire: associated lock acquisition function (full call)
*/
#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockmember, lockbase, lock_acquire) \
typedef struct seqcount_##lockname { \
seqcount_t seqcount; \
__SEQ_LOCK(locktype *lock); \
} seqcount_##lockname##_t; \
\
static __always_inline seqcount_t * \
__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \
{ \
return &s->seqcount; \
} \
\
static __always_inline unsigned \
__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
{ \
unsigned seq = READ_ONCE(s->seqcount.sequence); \
\
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
return seq; \
\
if (preemptible && unlikely(seq & 1)) { \
__SEQ_LOCK(lock_acquire); \
__SEQ_LOCK(lockbase##_unlock(s->lock)); \
\
/* \
* Re-read the sequence counter since the (possibly \
* preempted) writer made progress. \
*/ \
seq = READ_ONCE(s->seqcount.sequence); \
} \
\
return seq; \
} \
\
static __always_inline bool \
__seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \
{ \
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
return preemptible; \
\
/* PREEMPT_RT relies on the above LOCK+UNLOCK */ \
return false; \
} \
\
static __always_inline void \
__seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \
{ \
__SEQ_LOCK(lockdep_assert_held(lockmember)); \
}
/*
* __seqprop() for seqcount_t
*/
static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
{
return s;
}
static inline unsigned __seqprop_sequence(const seqcount_t *s)
{
return READ_ONCE(s->sequence);
}
static inline bool __seqprop_preemptible(const seqcount_t *s)
{
return false;
}
static inline void __seqprop_assert(const seqcount_t *s)
{
lockdep_assert_preemption_disabled();
}
#define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT)
SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, s->lock, raw_spin, raw_spin_lock(s->lock))SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, s->lock, spin, spin_lock(s->lock))
SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, s->lock, read, read_lock(s->lock))
SEQCOUNT_LOCKNAME(mutex, struct mutex, true, s->lock, mutex, mutex_lock(s->lock))
SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mutex, ww_mutex_lock(s->lock, NULL))
/*
* SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
* @name: Name of the seqcount_LOCKNAME_t instance
* @lock: Pointer to the associated LOCKNAME
*/
#define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) { \
.seqcount = SEQCNT_ZERO(seq_name.seqcount), \
__SEQ_LOCK(.lock = (assoc_lock)) \
}
#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define __seqprop_case(s, lockname, prop) \
seqcount_##lockname##_t: __seqprop_##lockname##_##prop((void *)(s))
#define __seqprop(s, prop) _Generic(*(s), \
seqcount_t: __seqprop_##prop((void *)(s)), \
__seqprop_case((s), raw_spinlock, prop), \
__seqprop_case((s), spinlock, prop), \
__seqprop_case((s), rwlock, prop), \
__seqprop_case((s), mutex, prop), \
__seqprop_case((s), ww_mutex, prop))
#define seqprop_ptr(s) __seqprop(s, ptr)
#define seqprop_sequence(s) __seqprop(s, sequence)
#define seqprop_preemptible(s) __seqprop(s, preemptible)
#define seqprop_assert(s) __seqprop(s, assert)
/**
* __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
* barrier. Callers should ensure that smp_rmb() or equivalent ordering is
* provided before actually loading any of the variables that are to be
* protected in this critical section.
*
* Use carefully, only in critical code, and comment how the barrier is
* provided.
*
* Return: count to be passed to read_seqcount_retry()
*/
#define __read_seqcount_begin(s) \
({ \
unsigned __seq; \
\
while ((__seq = seqprop_sequence(s)) & 1) \
cpu_relax(); \
\
kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
__seq; \
})
/**
* raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* Return: count to be passed to read_seqcount_retry()
*/
#define raw_read_seqcount_begin(s) \
({ \
unsigned _seq = __read_seqcount_begin(s); \
\
smp_rmb(); \
_seq; \
})
/**
* read_seqcount_begin() - begin a seqcount_t read critical section
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* Return: count to be passed to read_seqcount_retry()
*/
#define read_seqcount_begin(s) \
({ \
seqcount_lockdep_reader_access(seqprop_ptr(s)); \
raw_read_seqcount_begin(s); \
})
/**
* raw_read_seqcount() - read the raw seqcount_t counter value
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* raw_read_seqcount opens a read critical section of the given
* seqcount_t, without any lockdep checking, and without checking or
* masking the sequence counter LSB. Calling code is responsible for
* handling that.
*
* Return: count to be passed to read_seqcount_retry()
*/
#define raw_read_seqcount(s) \
({ \
unsigned __seq = seqprop_sequence(s); \
\
smp_rmb(); \
kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
__seq; \
})
/**
* raw_seqcount_begin() - begin a seqcount_t read critical section w/o
* lockdep and w/o counter stabilization
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* raw_seqcount_begin opens a read critical section of the given
* seqcount_t. Unlike read_seqcount_begin(), this function will not wait
* for the count to stabilize. If a writer is active when it begins, it
* will fail the read_seqcount_retry() at the end of the read critical
* section instead of stabilizing at the beginning of it.
*
* Use this only in special kernel hot paths where the read section is
* small and has a high probability of success through other external
* means. It will save a single branching instruction.
*
* Return: count to be passed to read_seqcount_retry()
*/
#define raw_seqcount_begin(s) \
({ \
/* \
* If the counter is odd, let read_seqcount_retry() fail \
* by decrementing the counter. \
*/ \
raw_read_seqcount(s) & ~1; \
})
/**
* __read_seqcount_retry() - end a seqcount_t read section w/o barrier
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
* @start: count, from read_seqcount_begin()
*
* __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb()
* barrier. Callers should ensure that smp_rmb() or equivalent ordering is
* provided before actually loading any of the variables that are to be
* protected in this critical section.
*
* Use carefully, only in critical code, and comment how the barrier is
* provided.
*
* Return: true if a read section retry is required, else false
*/
#define __read_seqcount_retry(s, start) \
do___read_seqcount_retry(seqprop_ptr(s), start)
static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
{
kcsan_atomic_next(0);
return unlikely(READ_ONCE(s->sequence) != start);
}
/**
* read_seqcount_retry() - end a seqcount_t read critical section
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
* @start: count, from read_seqcount_begin()
*
* read_seqcount_retry closes the read critical section of given
* seqcount_t. If the critical section was invalid, it must be ignored
* (and typically retried).
*
* Return: true if a read section retry is required, else false
*/
#define read_seqcount_retry(s, start) \
do_read_seqcount_retry(seqprop_ptr(s), start)
static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start)
{
smp_rmb();
return do___read_seqcount_retry(s, start);
}
/**
* raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* Context: check write_seqcount_begin()
*/
#define raw_write_seqcount_begin(s) \
do { \
if (seqprop_preemptible(s)) \
preempt_disable(); \
\
do_raw_write_seqcount_begin(seqprop_ptr(s)); \
} while (0)
static inline void do_raw_write_seqcount_begin(seqcount_t *s)
{
kcsan_nestable_atomic_begin();
s->sequence++;
smp_wmb();
}
/**
* raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* Context: check write_seqcount_end()
*/
#define raw_write_seqcount_end(s) \
do { \
do_raw_write_seqcount_end(seqprop_ptr(s)); \
\
if (seqprop_preemptible(s)) \
preempt_enable(); \
} while (0)
static inline void do_raw_write_seqcount_end(seqcount_t *s)
{
smp_wmb();
s->sequence++;
kcsan_nestable_atomic_end();
}
/**
* write_seqcount_begin_nested() - start a seqcount_t write section with
* custom lockdep nesting level
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
* @subclass: lockdep nesting level
*
* See Documentation/locking/lockdep-design.rst
* Context: check write_seqcount_begin()
*/
#define write_seqcount_begin_nested(s, subclass) \
do { \
seqprop_assert(s); \
\
if (seqprop_preemptible(s)) \
preempt_disable(); \
\
do_write_seqcount_begin_nested(seqprop_ptr(s), subclass); \
} while (0)
static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass)
{
do_raw_write_seqcount_begin(s);
seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
}
/**
* write_seqcount_begin() - start a seqcount_t write side critical section
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* Context: sequence counter write side sections must be serialized and
* non-preemptible. Preemption will be automatically disabled if and
* only if the seqcount write serialization lock is associated, and
* preemptible. If readers can be invoked from hardirq or softirq
* context, interrupts or bottom halves must be respectively disabled.
*/
#define write_seqcount_begin(s) \
do { \
seqprop_assert(s); \
\
if (seqprop_preemptible(s)) \
preempt_disable(); \
\
do_write_seqcount_begin(seqprop_ptr(s)); \
} while (0)
static inline void do_write_seqcount_begin(seqcount_t *s)
{
do_write_seqcount_begin_nested(s, 0);
}
/**
* write_seqcount_end() - end a seqcount_t write side critical section
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* Context: Preemption will be automatically re-enabled if and only if
* the seqcount write serialization lock is associated, and preemptible.
*/
#define write_seqcount_end(s) \
do { \
do_write_seqcount_end(seqprop_ptr(s)); \
\
if (seqprop_preemptible(s)) \
preempt_enable(); \
} while (0)
static inline void do_write_seqcount_end(seqcount_t *s)
{
seqcount_release(&s->dep_map, _RET_IP_);
do_raw_write_seqcount_end(s);
}
/**
* raw_write_seqcount_barrier() - do a seqcount_t write barrier
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* This can be used to provide an ordering guarantee instead of the usual
* consistency guarantee. It is one wmb cheaper, because it can collapse
* the two back-to-back wmb()s.
*
* Note that writes surrounding the barrier should be declared atomic (e.g.
* via WRITE_ONCE): a) to ensure the writes become visible to other threads
* atomically, avoiding compiler optimizations; b) to document which writes are
* meant to propagate to the reader critical section. This is necessary because
* neither writes before and after the barrier are enclosed in a seq-writer
* critical section that would ensure readers are aware of ongoing writes::
*
* seqcount_t seq;
* bool X = true, Y = false;
*
* void read(void)
* {
* bool x, y;
*
* do {
* int s = read_seqcount_begin(&seq);
*
* x = X; y = Y;
*
* } while (read_seqcount_retry(&seq, s));
*
* BUG_ON(!x && !y);
* }
*
* void write(void)
* {
* WRITE_ONCE(Y, true);
*
* raw_write_seqcount_barrier(seq);
*
* WRITE_ONCE(X, false);
* }
*/
#define raw_write_seqcount_barrier(s) \
do_raw_write_seqcount_barrier(seqprop_ptr(s))
static inline void do_raw_write_seqcount_barrier(seqcount_t *s)
{
kcsan_nestable_atomic_begin();
s->sequence++;
smp_wmb();
s->sequence++;
kcsan_nestable_atomic_end();
}
/**
* write_seqcount_invalidate() - invalidate in-progress seqcount_t read
* side operations
* @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
*
* After write_seqcount_invalidate, no seqcount_t read side operations
* will complete successfully and see data older than this.
*/
#define write_seqcount_invalidate(s) \
do_write_seqcount_invalidate(seqprop_ptr(s))
static inline void do_write_seqcount_invalidate(seqcount_t *s)
{
smp_wmb();
kcsan_nestable_atomic_begin();
s->sequence+=2;
kcsan_nestable_atomic_end();
}
/*
* Latch sequence counters (seqcount_latch_t)
*
* A sequence counter variant where the counter even/odd value is used to
* switch between two copies of protected data. This allows the read path,
* typically NMIs, to safely interrupt the write side critical section.
*
* As the write sections are fully preemptible, no special handling for
* PREEMPT_RT is needed.
*/
typedef struct {
seqcount_t seqcount;
} seqcount_latch_t;
/**
* SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t
* @seq_name: Name of the seqcount_latch_t instance
*/
#define SEQCNT_LATCH_ZERO(seq_name) { \
.seqcount = SEQCNT_ZERO(seq_name.seqcount), \
}
/**
* seqcount_latch_init() - runtime initializer for seqcount_latch_t
* @s: Pointer to the seqcount_latch_t instance
*/
#define seqcount_latch_init(s) seqcount_init(&(s)->seqcount)
/**
* raw_read_seqcount_latch() - pick even/odd latch data copy
* @s: Pointer to seqcount_latch_t
*
* See raw_write_seqcount_latch() for details and a full reader/writer
* usage example.
*
* Return: sequence counter raw value. Use the lowest bit as an index for
* picking which data copy to read. The full counter must then be checked
* with read_seqcount_latch_retry().
*/
static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
{
/*
* Pairs with the first smp_wmb() in raw_write_seqcount_latch().
* Due to the dependent load, a full smp_rmb() is not needed.
*/
return READ_ONCE(s->seqcount.sequence);
}
/**
* read_seqcount_latch_retry() - end a seqcount_latch_t read section
* @s: Pointer to seqcount_latch_t
* @start: count, from raw_read_seqcount_latch()
*
* Return: true if a read section retry is required, else false
*/
static inline int
read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
{
return read_seqcount_retry(&s->seqcount, start);
}
/**
* raw_write_seqcount_latch() - redirect latch readers to even/odd copy
* @s: Pointer to seqcount_latch_t
*
* The latch technique is a multiversion concurrency control method that allows
* queries during non-atomic modifications. If you can guarantee queries never
* interrupt the modification -- e.g. the concurrency is strictly between CPUs
* -- you most likely do not need this.
*
* Where the traditional RCU/lockless data structures rely on atomic
* modifications to ensure queries observe either the old or the new state the
* latch allows the same for non-atomic updates. The trade-off is doubling the
* cost of storage; we have to maintain two copies of the entire data
* structure.
*
* Very simply put: we first modify one copy and then the other. This ensures
* there is always one copy in a stable state, ready to give us an answer.
*
* The basic form is a data structure like::
*
* struct latch_struct {
* seqcount_latch_t seq;
* struct data_struct data[2];
* };
*
* Where a modification, which is assumed to be externally serialized, does the
* following::
*
* void latch_modify(struct latch_struct *latch, ...)
* {
* smp_wmb(); // Ensure that the last data[1] update is visible
* latch->seq.sequence++;
* smp_wmb(); // Ensure that the seqcount update is visible
*
* modify(latch->data[0], ...);
*
* smp_wmb(); // Ensure that the data[0] update is visible
* latch->seq.sequence++;
* smp_wmb(); // Ensure that the seqcount update is visible
*
* modify(latch->data[1], ...);
* }
*
* The query will have a form like::
*
* struct entry *latch_query(struct latch_struct *latch, ...)
* {
* struct entry *entry;
* unsigned seq, idx;
*
* do {
* seq = raw_read_seqcount_latch(&latch->seq);
*
* idx = seq & 0x01;
* entry = data_query(latch->data[idx], ...);
*
* // This includes needed smp_rmb()
* } while (read_seqcount_latch_retry(&latch->seq, seq));
*
* return entry;
* }
*
* So during the modification, queries are first redirected to data[1]. Then we
* modify data[0]. When that is complete, we redirect queries back to data[0]
* and we can modify data[1].
*
* NOTE:
*
* The non-requirement for atomic modifications does _NOT_ include
* the publishing of new entries in the case where data is a dynamic
* data structure.
*
* An iteration might start in data[0] and get suspended long enough
* to miss an entire modification sequence, once it resumes it might
* observe the new entry.
*
* NOTE2:
*
* When data is a dynamic data structure; one should use regular RCU
* patterns to manage the lifetimes of the objects within.
*/
static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
{
smp_wmb(); /* prior stores before incrementing "sequence" */
s->seqcount.sequence++;
smp_wmb(); /* increment "sequence" before following stores */
}
/*
* Sequential locks (seqlock_t)
*
* Sequence counters with an embedded spinlock for writer serialization
* and non-preemptibility.
*
* For more info, see:
* - Comments on top of seqcount_t
* - Documentation/locking/seqlock.rst
*/
typedef struct {
/*
* Make sure that readers don't starve writers on PREEMPT_RT: use
* seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK().
*/
seqcount_spinlock_t seqcount;
spinlock_t lock;
} seqlock_t;
#define __SEQLOCK_UNLOCKED(lockname) \
{ \
.seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \
.lock = __SPIN_LOCK_UNLOCKED(lockname) \
}
/**
* seqlock_init() - dynamic initializer for seqlock_t
* @sl: Pointer to the seqlock_t instance
*/
#define seqlock_init(sl) \
do { \
spin_lock_init(&(sl)->lock); \
seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock); \
} while (0)
/**
* DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t
* @sl: Name of the seqlock_t instance
*/
#define DEFINE_SEQLOCK(sl) \
seqlock_t sl = __SEQLOCK_UNLOCKED(sl)
/**
* read_seqbegin() - start a seqlock_t read side critical section
* @sl: Pointer to seqlock_t
*
* Return: count, to be passed to read_seqretry()
*/
static inline unsigned read_seqbegin(const seqlock_t *sl)
{
unsigned ret = read_seqcount_begin(&sl->seqcount);
kcsan_atomic_next(0); /* non-raw usage, assume closing read_seqretry() */
kcsan_flat_atomic_begin();
return ret;
}
/**
* read_seqretry() - end a seqlock_t read side section
* @sl: Pointer to seqlock_t
* @start: count, from read_seqbegin()
*
* read_seqretry closes the read side critical section of given seqlock_t.
* If the critical section was invalid, it must be ignored (and typically
* retried).
*
* Return: true if a read section retry is required, else false
*/
static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
{
/*
* Assume not nested: read_seqretry() may be called multiple times when
* completing read critical section.
*/
kcsan_flat_atomic_end();
return read_seqcount_retry(&sl->seqcount, start);
}
/*
* For all seqlock_t write side functions, use the the internal
* do_write_seqcount_begin() instead of generic write_seqcount_begin().
* This way, no redundant lockdep_assert_held() checks are added.
*/
/**
* write_seqlock() - start a seqlock_t write side critical section
* @sl: Pointer to seqlock_t
*
* write_seqlock opens a write side critical section for the given
* seqlock_t. It also implicitly acquires the spinlock_t embedded inside
* that sequential lock. All seqlock_t write side sections are thus
* automatically serialized and non-preemptible.
*
* Context: if the seqlock_t read section, or other write side critical
* sections, can be invoked from hardirq or softirq contexts, use the
* _irqsave or _bh variants of this function instead.
*/
static inline void write_seqlock(seqlock_t *sl)
{
spin_lock(&sl->lock);
do_write_seqcount_begin(&sl->seqcount.seqcount);
}
/**
* write_sequnlock() - end a seqlock_t write side critical section
* @sl: Pointer to seqlock_t
*
* write_sequnlock closes the (serialized and non-preemptible) write side
* critical section of given seqlock_t.
*/
static inline void write_sequnlock(seqlock_t *sl)
{
do_write_seqcount_end(&sl->seqcount.seqcount);
spin_unlock(&sl->lock);
}
/**
* write_seqlock_bh() - start a softirqs-disabled seqlock_t write section
* @sl: Pointer to seqlock_t
*
* _bh variant of write_seqlock(). Use only if the read side section, or
* other write side sections, can be invoked from softirq contexts.
*/
static inline void write_seqlock_bh(seqlock_t *sl)
{
spin_lock_bh(&sl->lock);
do_write_seqcount_begin(&sl->seqcount.seqcount);
}
/**
* write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section
* @sl: Pointer to seqlock_t
*
* write_sequnlock_bh closes the serialized, non-preemptible, and
* softirqs-disabled, seqlock_t write side critical section opened with
* write_seqlock_bh().
*/
static inline void write_sequnlock_bh(seqlock_t *sl)
{
do_write_seqcount_end(&sl->seqcount.seqcount);
spin_unlock_bh(&sl->lock);
}
/**
* write_seqlock_irq() - start a non-interruptible seqlock_t write section
* @sl: Pointer to seqlock_t
*
* _irq variant of write_seqlock(). Use only if the read side section, or
* other write sections, can be invoked from hardirq contexts.
*/
static inline void write_seqlock_irq(seqlock_t *sl)
{
spin_lock_irq(&sl->lock);
do_write_seqcount_begin(&sl->seqcount.seqcount);
}
/**
* write_sequnlock_irq() - end a non-interruptible seqlock_t write section
* @sl: Pointer to seqlock_t
*
* write_sequnlock_irq closes the serialized and non-interruptible
* seqlock_t write side section opened with write_seqlock_irq().
*/
static inline void write_sequnlock_irq(seqlock_t *sl)
{
do_write_seqcount_end(&sl->seqcount.seqcount);
spin_unlock_irq(&sl->lock);
}
static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
{
unsigned long flags;
spin_lock_irqsave(&sl->lock, flags);
do_write_seqcount_begin(&sl->seqcount.seqcount);
return flags;
}
/**
* write_seqlock_irqsave() - start a non-interruptible seqlock_t write
* section
* @lock: Pointer to seqlock_t
* @flags: Stack-allocated storage for saving caller's local interrupt
* state, to be passed to write_sequnlock_irqrestore().
*
* _irqsave variant of write_seqlock(). Use it only if the read side
* section, or other write sections, can be invoked from hardirq context.
*/
#define write_seqlock_irqsave(lock, flags) \
do { flags = __write_seqlock_irqsave(lock); } while (0)
/**
* write_sequnlock_irqrestore() - end non-interruptible seqlock_t write
* section
* @sl: Pointer to seqlock_t
* @flags: Caller's saved interrupt state, from write_seqlock_irqsave()
*
* write_sequnlock_irqrestore closes the serialized and non-interruptible
* seqlock_t write section previously opened with write_seqlock_irqsave().
*/
static inline void
write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
{
do_write_seqcount_end(&sl->seqcount.seqcount);
spin_unlock_irqrestore(&sl->lock, flags);
}
/**
* read_seqlock_excl() - begin a seqlock_t locking reader section
* @sl: Pointer to seqlock_t
*
* read_seqlock_excl opens a seqlock_t locking reader critical section. A
* locking reader exclusively locks out *both* other writers *and* other
* locking readers, but it does not update the embedded sequence number.
*
* Locking readers act like a normal spin_lock()/spin_unlock().
*
* Context: if the seqlock_t write section, *or other read sections*, can
* be invoked from hardirq or softirq contexts, use the _irqsave or _bh
* variant of this function instead.
*
* The opened read section must be closed with read_sequnlock_excl().
*/
static inline void read_seqlock_excl(seqlock_t *sl)
{
spin_lock(&sl->lock);
}
/**
* read_sequnlock_excl() - end a seqlock_t locking reader critical section
* @sl: Pointer to seqlock_t
*/
static inline void read_sequnlock_excl(seqlock_t *sl)
{
spin_unlock(&sl->lock);
}
/**
* read_seqlock_excl_bh() - start a seqlock_t locking reader section with
* softirqs disabled
* @sl: Pointer to seqlock_t
*
* _bh variant of read_seqlock_excl(). Use this variant only if the
* seqlock_t write side section, *or other read sections*, can be invoked
* from softirq contexts.
*/
static inline void read_seqlock_excl_bh(seqlock_t *sl)
{
spin_lock_bh(&sl->lock);
}
/**
* read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking
* reader section
* @sl: Pointer to seqlock_t
*/
static inline void read_sequnlock_excl_bh(seqlock_t *sl)
{
spin_unlock_bh(&sl->lock);
}
/**
* read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking
* reader section
* @sl: Pointer to seqlock_t
*
* _irq variant of read_seqlock_excl(). Use this only if the seqlock_t
* write side section, *or other read sections*, can be invoked from a
* hardirq context.
*/
static inline void read_seqlock_excl_irq(seqlock_t *sl)
{
spin_lock_irq(&sl->lock);
}
/**
* read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t
* locking reader section
* @sl: Pointer to seqlock_t
*/
static inline void read_sequnlock_excl_irq(seqlock_t *sl)
{
spin_unlock_irq(&sl->lock);
}
static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl)
{
unsigned long flags;
spin_lock_irqsave(&sl->lock, flags);
return flags;
}
/**
* read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t
* locking reader section
* @lock: Pointer to seqlock_t
* @flags: Stack-allocated storage for saving caller's local interrupt
* state, to be passed to read_sequnlock_excl_irqrestore().
*
* _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t
* write side section, *or other read sections*, can be invoked from a
* hardirq context.
*/
#define read_seqlock_excl_irqsave(lock, flags) \
do { flags = __read_seqlock_excl_irqsave(lock); } while (0)
/**
* read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t
* locking reader section
* @sl: Pointer to seqlock_t
* @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave()
*/
static inline void
read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
{
spin_unlock_irqrestore(&sl->lock, flags);
}
/**
* read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader
* @lock: Pointer to seqlock_t
* @seq : Marker and return parameter. If the passed value is even, the
* reader will become a *lockless* seqlock_t reader as in read_seqbegin().
* If the passed value is odd, the reader will become a *locking* reader
* as in read_seqlock_excl(). In the first call to this function, the
* caller *must* initialize and pass an even value to @seq; this way, a
* lockless read can be optimistically tried first.
*
* read_seqbegin_or_lock is an API designed to optimistically try a normal
* lockless seqlock_t read section first. If an odd counter is found, the
* lockless read trial has failed, and the next read iteration transforms
* itself into a full seqlock_t locking reader.
*
* This is typically used to avoid seqlock_t lockless readers starvation
* (too much retry loops) in the case of a sharp spike in write side
* activity.
*
* Context: if the seqlock_t write section, *or other read sections*, can
* be invoked from hardirq or softirq contexts, use the _irqsave or _bh
* variant of this function instead.
*
* Check Documentation/locking/seqlock.rst for template example code.
*
* Return: the encountered sequence counter value, through the @seq
* parameter, which is overloaded as a return parameter. This returned
* value must be checked with need_seqretry(). If the read section need to
* be retried, this returned value must also be passed as the @seq
* parameter of the next read_seqbegin_or_lock() iteration.
*/
static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
{
if (!(*seq & 1)) /* Even */
*seq = read_seqbegin(lock);
else /* Odd */
read_seqlock_excl(lock);
}
/**
* need_seqretry() - validate seqlock_t "locking or lockless" read section
* @lock: Pointer to seqlock_t
* @seq: sequence count, from read_seqbegin_or_lock()
*
* Return: true if a read section retry is required, false otherwise
*/
static inline int need_seqretry(seqlock_t *lock, int seq)
{
return !(seq & 1) && read_seqretry(lock, seq);
}
/**
* done_seqretry() - end seqlock_t "locking or lockless" reader section
* @lock: Pointer to seqlock_t
* @seq: count, from read_seqbegin_or_lock()
*
* done_seqretry finishes the seqlock_t read side critical section started
* with read_seqbegin_or_lock() and validated by need_seqretry().
*/
static inline void done_seqretry(seqlock_t *lock, int seq)
{
if (seq & 1)
read_sequnlock_excl(lock);
}
/**
* read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or
* a non-interruptible locking reader
* @lock: Pointer to seqlock_t
* @seq: Marker and return parameter. Check read_seqbegin_or_lock().
*
* This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if
* the seqlock_t write section, *or other read sections*, can be invoked
* from hardirq context.
*
* Note: Interrupts will be disabled only for "locking reader" mode.
*
* Return:
*
* 1. The saved local interrupts state in case of a locking reader, to
* be passed to done_seqretry_irqrestore().
*
* 2. The encountered sequence counter value, returned through @seq
* overloaded as a return parameter. Check read_seqbegin_or_lock().
*/
static inline unsigned long
read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
{
unsigned long flags = 0;
if (!(*seq & 1)) /* Even */
*seq = read_seqbegin(lock);
else /* Odd */
read_seqlock_excl_irqsave(lock, flags);
return flags;
}
/**
* done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a
* non-interruptible locking reader section
* @lock: Pointer to seqlock_t
* @seq: Count, from read_seqbegin_or_lock_irqsave()
* @flags: Caller's saved local interrupt state in case of a locking
* reader, also from read_seqbegin_or_lock_irqsave()
*
* This is the _irqrestore variant of done_seqretry(). The read section
* must've been opened with read_seqbegin_or_lock_irqsave(), and validated
* by need_seqretry().
*/
static inline void
done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
{
if (seq & 1)
read_sequnlock_excl_irqrestore(lock, flags);
}
#endif /* __LINUX_SEQLOCK_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2007 Jens Axboe <jens.axboe@oracle.com>
*
* Scatterlist handling helpers.
*/
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>
#include <linux/kmemleak.h>
/**
* sg_next - return the next scatterlist entry in a list
* @sg: The current sg entry
*
* Description:
* Usually the next entry will be @sg@ + 1, but if this sg element is part
* of a chained scatterlist, it could jump to the start of a new
* scatterlist array.
*
**/
struct scatterlist *sg_next(struct scatterlist *sg)
{
if (sg_is_last(sg))
return NULL;
sg++; if (unlikely(sg_is_chain(sg))) sg = sg_chain_ptr(sg);
return sg;
}
EXPORT_SYMBOL(sg_next);
/**
* sg_nents - return total count of entries in scatterlist
* @sg: The scatterlist
*
* Description:
* Allows to know how many entries are in sg, taking into account
* chaining as well
*
**/
int sg_nents(struct scatterlist *sg)
{
int nents;
for (nents = 0; sg; sg = sg_next(sg))
nents++;
return nents;
}
EXPORT_SYMBOL(sg_nents);
/**
* sg_nents_for_len - return total count of entries in scatterlist
* needed to satisfy the supplied length
* @sg: The scatterlist
* @len: The total required length
*
* Description:
* Determines the number of entries in sg that are required to meet
* the supplied length, taking into account chaining as well
*
* Returns:
* the number of sg entries needed, negative error on failure
*
**/
int sg_nents_for_len(struct scatterlist *sg, u64 len)
{
int nents;
u64 total;
if (!len)
return 0;
for (nents = 0, total = 0; sg; sg = sg_next(sg)) {
nents++;
total += sg->length;
if (total >= len)
return nents;
}
return -EINVAL;
}
EXPORT_SYMBOL(sg_nents_for_len);
/**
* sg_last - return the last scatterlist entry in a list
* @sgl: First entry in the scatterlist
* @nents: Number of entries in the scatterlist
*
* Description:
* Should only be used casually, it (currently) scans the entire list
* to get the last entry.
*
* Note that the @sgl@ pointer passed in need not be the first one,
* the important bit is that @nents@ denotes the number of entries that
* exist from @sgl@.
*
**/
struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents)
{
struct scatterlist *sg, *ret = NULL;
unsigned int i;
for_each_sg(sgl, sg, nents, i)
ret = sg;
BUG_ON(!sg_is_last(ret));
return ret;
}
EXPORT_SYMBOL(sg_last);
/**
* sg_init_table - Initialize SG table
* @sgl: The SG table
* @nents: Number of entries in table
*
* Notes:
* If this is part of a chained sg table, sg_mark_end() should be
* used only on the last table part.
*
**/
void sg_init_table(struct scatterlist *sgl, unsigned int nents)
{
memset(sgl, 0, sizeof(*sgl) * nents);
sg_init_marker(sgl, nents);
}
EXPORT_SYMBOL(sg_init_table);
/**
* sg_init_one - Initialize a single entry sg list
* @sg: SG entry
* @buf: Virtual address for IO
* @buflen: IO length
*
**/
void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen)
{
sg_init_table(sg, 1);
sg_set_buf(sg, buf, buflen);
}
EXPORT_SYMBOL(sg_init_one);
/*
* The default behaviour of sg_alloc_table() is to use these kmalloc/kfree
* helpers.
*/
static struct scatterlist *sg_kmalloc(unsigned int nents, gfp_t gfp_mask)
{
if (nents == SG_MAX_SINGLE_ALLOC) {
/*
* Kmemleak doesn't track page allocations as they are not
* commonly used (in a raw form) for kernel data structures.
* As we chain together a list of pages and then a normal
* kmalloc (tracked by kmemleak), in order to for that last
* allocation not to become decoupled (and thus a
* false-positive) we need to inform kmemleak of all the
* intermediate allocations.
*/
void *ptr = (void *) __get_free_page(gfp_mask);
kmemleak_alloc(ptr, PAGE_SIZE, 1, gfp_mask);
return ptr;
} else
return kmalloc_array(nents, sizeof(struct scatterlist),
gfp_mask);
}
static void sg_kfree(struct scatterlist *sg, unsigned int nents)
{
if (nents == SG_MAX_SINGLE_ALLOC) {
kmemleak_free(sg);
free_page((unsigned long) sg);
} else
kfree(sg);
}
/**
* __sg_free_table - Free a previously mapped sg table
* @table: The sg table header to use
* @max_ents: The maximum number of entries per single scatterlist
* @nents_first_chunk: Number of entries int the (preallocated) first
* scatterlist chunk, 0 means no such preallocated first chunk
* @free_fn: Free function
* @num_ents: Number of entries in the table
*
* Description:
* Free an sg table previously allocated and setup with
* __sg_alloc_table(). The @max_ents value must be identical to
* that previously used with __sg_alloc_table().
*
**/
void __sg_free_table(struct sg_table *table, unsigned int max_ents,
unsigned int nents_first_chunk, sg_free_fn *free_fn,
unsigned int num_ents)
{
struct scatterlist *sgl, *next;
unsigned curr_max_ents = nents_first_chunk ?: max_ents;
if (unlikely(!table->sgl))
return;
sgl = table->sgl;
while (num_ents) {
unsigned int alloc_size = num_ents;
unsigned int sg_size;
/*
* If we have more than max_ents segments left,
* then assign 'next' to the sg table after the current one.
* sg_size is then one less than alloc size, since the last
* element is the chain pointer.
*/
if (alloc_size > curr_max_ents) {
next = sg_chain_ptr(&sgl[curr_max_ents - 1]);
alloc_size = curr_max_ents;
sg_size = alloc_size - 1;
} else {
sg_size = alloc_size;
next = NULL;
}
num_ents -= sg_size;
if (nents_first_chunk)
nents_first_chunk = 0;
else
free_fn(sgl, alloc_size);
sgl = next;
curr_max_ents = max_ents;
}
table->sgl = NULL;
}
EXPORT_SYMBOL(__sg_free_table);
/**
* sg_free_append_table - Free a previously allocated append sg table.
* @table: The mapped sg append table header
*
**/
void sg_free_append_table(struct sg_append_table *table)
{
__sg_free_table(&table->sgt, SG_MAX_SINGLE_ALLOC, false, sg_kfree,
table->total_nents);
}
EXPORT_SYMBOL(sg_free_append_table);
/**
* sg_free_table - Free a previously allocated sg table
* @table: The mapped sg table header
*
**/
void sg_free_table(struct sg_table *table)
{
__sg_free_table(table, SG_MAX_SINGLE_ALLOC, false, sg_kfree,
table->orig_nents);
}
EXPORT_SYMBOL(sg_free_table);
/**
* __sg_alloc_table - Allocate and initialize an sg table with given allocator
* @table: The sg table header to use
* @nents: Number of entries in sg list
* @max_ents: The maximum number of entries the allocator returns per call
* @nents_first_chunk: Number of entries int the (preallocated) first
* scatterlist chunk, 0 means no such preallocated chunk provided by user
* @gfp_mask: GFP allocation mask
* @alloc_fn: Allocator to use
*
* Description:
* This function returns a @table @nents long. The allocator is
* defined to return scatterlist chunks of maximum size @max_ents.
* Thus if @nents is bigger than @max_ents, the scatterlists will be
* chained in units of @max_ents.
*
* Notes:
* If this function returns non-0 (eg failure), the caller must call
* __sg_free_table() to cleanup any leftover allocations.
*
**/
int __sg_alloc_table(struct sg_table *table, unsigned int nents,
unsigned int max_ents, struct scatterlist *first_chunk,
unsigned int nents_first_chunk, gfp_t gfp_mask,
sg_alloc_fn *alloc_fn)
{
struct scatterlist *sg, *prv;
unsigned int left;
unsigned curr_max_ents = nents_first_chunk ?: max_ents;
unsigned prv_max_ents;
memset(table, 0, sizeof(*table));
if (nents == 0)
return -EINVAL;
#ifdef CONFIG_ARCH_NO_SG_CHAIN
if (WARN_ON_ONCE(nents > max_ents))
return -EINVAL;
#endif
left = nents;
prv = NULL;
do {
unsigned int sg_size, alloc_size = left;
if (alloc_size > curr_max_ents) {
alloc_size = curr_max_ents;
sg_size = alloc_size - 1;
} else
sg_size = alloc_size;
left -= sg_size;
if (first_chunk) {
sg = first_chunk;
first_chunk = NULL;
} else {
sg = alloc_fn(alloc_size, gfp_mask);
}
if (unlikely(!sg)) {
/*
* Adjust entry count to reflect that the last
* entry of the previous table won't be used for
* linkage. Without this, sg_kfree() may get
* confused.
*/
if (prv) table->nents = ++table->orig_nents;
return -ENOMEM;
}
sg_init_table(sg, alloc_size);
table->nents = table->orig_nents += sg_size;
/*
* If this is the first mapping, assign the sg table header.
* If this is not the first mapping, chain previous part.
*/
if (prv)
sg_chain(prv, prv_max_ents, sg);
else
table->sgl = sg;
/*
* If no more entries after this one, mark the end
*/
if (!left) sg_mark_end(&sg[sg_size - 1]);
prv = sg;
prv_max_ents = curr_max_ents;
curr_max_ents = max_ents;
} while (left);
return 0;
}
EXPORT_SYMBOL(__sg_alloc_table);
/**
* sg_alloc_table - Allocate and initialize an sg table
* @table: The sg table header to use
* @nents: Number of entries in sg list
* @gfp_mask: GFP allocation mask
*
* Description:
* Allocate and initialize an sg table. If @nents@ is larger than
* SG_MAX_SINGLE_ALLOC a chained sg table will be setup.
*
**/
int sg_alloc_table(struct sg_table *table, unsigned int nents, gfp_t gfp_mask)
{
int ret;
ret = __sg_alloc_table(table, nents, SG_MAX_SINGLE_ALLOC,
NULL, 0, gfp_mask, sg_kmalloc);
if (unlikely(ret))
sg_free_table(table);
return ret;
}
EXPORT_SYMBOL(sg_alloc_table);
static struct scatterlist *get_next_sg(struct sg_append_table *table,
struct scatterlist *cur,
unsigned long needed_sges,
gfp_t gfp_mask)
{
struct scatterlist *new_sg, *next_sg;
unsigned int alloc_size;
if (cur) {
next_sg = sg_next(cur);
/* Check if last entry should be keeped for chainning */
if (!sg_is_last(next_sg) || needed_sges == 1)
return next_sg;
}
alloc_size = min_t(unsigned long, needed_sges, SG_MAX_SINGLE_ALLOC);
new_sg = sg_kmalloc(alloc_size, gfp_mask);
if (!new_sg)
return ERR_PTR(-ENOMEM);
sg_init_table(new_sg, alloc_size);
if (cur) {
table->total_nents += alloc_size - 1;
__sg_chain(next_sg, new_sg);
} else {
table->sgt.sgl = new_sg;
table->total_nents = alloc_size;
}
return new_sg;
}
/**
* sg_alloc_append_table_from_pages - Allocate and initialize an append sg
* table from an array of pages
* @sgt_append: The sg append table to use
* @pages: Pointer to an array of page pointers
* @n_pages: Number of pages in the pages array
* @offset: Offset from start of the first page to the start of a buffer
* @size: Number of valid bytes in the buffer (after offset)
* @max_segment: Maximum size of a scatterlist element in bytes
* @left_pages: Left pages caller have to set after this call
* @gfp_mask: GFP allocation mask
*
* Description:
* In the first call it allocate and initialize an sg table from a list of
* pages, else reuse the scatterlist from sgt_append. Contiguous ranges of
* the pages are squashed into a single scatterlist entry up to the maximum
* size specified in @max_segment. A user may provide an offset at a start
* and a size of valid data in a buffer specified by the page array. The
* returned sg table is released by sg_free_append_table
*
* Returns:
* 0 on success, negative error on failure
*
* Notes:
* If this function returns non-0 (eg failure), the caller must call
* sg_free_append_table() to cleanup any leftover allocations.
*
* In the fist call, sgt_append must by initialized.
*/
int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append,
struct page **pages, unsigned int n_pages, unsigned int offset,
unsigned long size, unsigned int max_segment,
unsigned int left_pages, gfp_t gfp_mask)
{
unsigned int chunks, cur_page, seg_len, i, prv_len = 0;
unsigned int added_nents = 0;
struct scatterlist *s = sgt_append->prv;
/*
* The algorithm below requires max_segment to be aligned to PAGE_SIZE
* otherwise it can overshoot.
*/
max_segment = ALIGN_DOWN(max_segment, PAGE_SIZE);
if (WARN_ON(max_segment < PAGE_SIZE))
return -EINVAL;
if (IS_ENABLED(CONFIG_ARCH_NO_SG_CHAIN) && sgt_append->prv)
return -EOPNOTSUPP;
if (sgt_append->prv) {
unsigned long paddr =
(page_to_pfn(sg_page(sgt_append->prv)) * PAGE_SIZE +
sgt_append->prv->offset + sgt_append->prv->length) /
PAGE_SIZE;
if (WARN_ON(offset))
return -EINVAL;
/* Merge contiguous pages into the last SG */
prv_len = sgt_append->prv->length;
while (n_pages && page_to_pfn(pages[0]) == paddr) {
if (sgt_append->prv->length + PAGE_SIZE > max_segment)
break;
sgt_append->prv->length += PAGE_SIZE;
paddr++;
pages++;
n_pages--;
}
if (!n_pages)
goto out;
}
/* compute number of contiguous chunks */
chunks = 1;
seg_len = 0;
for (i = 1; i < n_pages; i++) {
seg_len += PAGE_SIZE;
if (seg_len >= max_segment ||
page_to_pfn(pages[i]) != page_to_pfn(pages[i - 1]) + 1) {
chunks++;
seg_len = 0;
}
}
/* merging chunks and putting them into the scatterlist */
cur_page = 0;
for (i = 0; i < chunks; i++) {
unsigned int j, chunk_size;
/* look for the end of the current chunk */
seg_len = 0;
for (j = cur_page + 1; j < n_pages; j++) {
seg_len += PAGE_SIZE;
if (seg_len >= max_segment ||
page_to_pfn(pages[j]) !=
page_to_pfn(pages[j - 1]) + 1)
break;
}
/* Pass how many chunks might be left */
s = get_next_sg(sgt_append, s, chunks - i + left_pages,
gfp_mask);
if (IS_ERR(s)) {
/*
* Adjust entry length to be as before function was
* called.
*/
if (sgt_append->prv)
sgt_append->prv->length = prv_len;
return PTR_ERR(s);
}
chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset;
sg_set_page(s, pages[cur_page],
min_t(unsigned long, size, chunk_size), offset);
added_nents++;
size -= chunk_size;
offset = 0;
cur_page = j;
}
sgt_append->sgt.nents += added_nents;
sgt_append->sgt.orig_nents = sgt_append->sgt.nents;
sgt_append->prv = s;
out:
if (!left_pages)
sg_mark_end(s);
return 0;
}
EXPORT_SYMBOL(sg_alloc_append_table_from_pages);
/**
* sg_alloc_table_from_pages_segment - Allocate and initialize an sg table from
* an array of pages and given maximum
* segment.
* @sgt: The sg table header to use
* @pages: Pointer to an array of page pointers
* @n_pages: Number of pages in the pages array
* @offset: Offset from start of the first page to the start of a buffer
* @size: Number of valid bytes in the buffer (after offset)
* @max_segment: Maximum size of a scatterlist element in bytes
* @gfp_mask: GFP allocation mask
*
* Description:
* Allocate and initialize an sg table from a list of pages. Contiguous
* ranges of the pages are squashed into a single scatterlist node up to the
* maximum size specified in @max_segment. A user may provide an offset at a
* start and a size of valid data in a buffer specified by the page array.
*
* The returned sg table is released by sg_free_table.
*
* Returns:
* 0 on success, negative error on failure
*/
int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages,
unsigned int n_pages, unsigned int offset,
unsigned long size, unsigned int max_segment,
gfp_t gfp_mask)
{
struct sg_append_table append = {};
int err;
err = sg_alloc_append_table_from_pages(&append, pages, n_pages, offset,
size, max_segment, 0, gfp_mask);
if (err) {
sg_free_append_table(&append);
return err;
}
memcpy(sgt, &append.sgt, sizeof(*sgt));
WARN_ON(append.total_nents != sgt->orig_nents);
return 0;
}
EXPORT_SYMBOL(sg_alloc_table_from_pages_segment);
#ifdef CONFIG_SGL_ALLOC
/**
* sgl_alloc_order - allocate a scatterlist and its pages
* @length: Length in bytes of the scatterlist. Must be at least one
* @order: Second argument for alloc_pages()
* @chainable: Whether or not to allocate an extra element in the scatterlist
* for scatterlist chaining purposes
* @gfp: Memory allocation flags
* @nent_p: [out] Number of entries in the scatterlist that have pages
*
* Returns: A pointer to an initialized scatterlist or %NULL upon failure.
*/
struct scatterlist *sgl_alloc_order(unsigned long long length,
unsigned int order, bool chainable,
gfp_t gfp, unsigned int *nent_p)
{
struct scatterlist *sgl, *sg;
struct page *page;
unsigned int nent, nalloc;
u32 elem_len;
nent = round_up(length, PAGE_SIZE << order) >> (PAGE_SHIFT + order);
/* Check for integer overflow */
if (length > (nent << (PAGE_SHIFT + order)))
return NULL;
nalloc = nent;
if (chainable) {
/* Check for integer overflow */
if (nalloc + 1 < nalloc)
return NULL;
nalloc++;
}
sgl = kmalloc_array(nalloc, sizeof(struct scatterlist),
gfp & ~GFP_DMA);
if (!sgl)
return NULL;
sg_init_table(sgl, nalloc);
sg = sgl;
while (length) {
elem_len = min_t(u64, length, PAGE_SIZE << order);
page = alloc_pages(gfp, order);
if (!page) {
sgl_free_order(sgl, order);
return NULL;
}
sg_set_page(sg, page, elem_len, 0);
length -= elem_len;
sg = sg_next(sg);
}
WARN_ONCE(length, "length = %lld\n", length);
if (nent_p)
*nent_p = nent;
return sgl;
}
EXPORT_SYMBOL(sgl_alloc_order);
/**
* sgl_alloc - allocate a scatterlist and its pages
* @length: Length in bytes of the scatterlist
* @gfp: Memory allocation flags
* @nent_p: [out] Number of entries in the scatterlist
*
* Returns: A pointer to an initialized scatterlist or %NULL upon failure.
*/
struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp,
unsigned int *nent_p)
{
return sgl_alloc_order(length, 0, false, gfp, nent_p);
}
EXPORT_SYMBOL(sgl_alloc);
/**
* sgl_free_n_order - free a scatterlist and its pages
* @sgl: Scatterlist with one or more elements
* @nents: Maximum number of elements to free
* @order: Second argument for __free_pages()
*
* Notes:
* - If several scatterlists have been chained and each chain element is
* freed separately then it's essential to set nents correctly to avoid that a
* page would get freed twice.
* - All pages in a chained scatterlist can be freed at once by setting @nents
* to a high number.
*/
void sgl_free_n_order(struct scatterlist *sgl, int nents, int order)
{
struct scatterlist *sg;
struct page *page;
int i;
for_each_sg(sgl, sg, nents, i) {
if (!sg)
break;
page = sg_page(sg);
if (page)
__free_pages(page, order);
}
kfree(sgl);
}
EXPORT_SYMBOL(sgl_free_n_order);
/**
* sgl_free_order - free a scatterlist and its pages
* @sgl: Scatterlist with one or more elements
* @order: Second argument for __free_pages()
*/
void sgl_free_order(struct scatterlist *sgl, int order)
{
sgl_free_n_order(sgl, INT_MAX, order);
}
EXPORT_SYMBOL(sgl_free_order);
/**
* sgl_free - free a scatterlist and its pages
* @sgl: Scatterlist with one or more elements
*/
void sgl_free(struct scatterlist *sgl)
{
sgl_free_order(sgl, 0);
}
EXPORT_SYMBOL(sgl_free);
#endif /* CONFIG_SGL_ALLOC */
void __sg_page_iter_start(struct sg_page_iter *piter,
struct scatterlist *sglist, unsigned int nents,
unsigned long pgoffset)
{
piter->__pg_advance = 0;
piter->__nents = nents;
piter->sg = sglist;
piter->sg_pgoffset = pgoffset;
}
EXPORT_SYMBOL(__sg_page_iter_start);
static int sg_page_count(struct scatterlist *sg)
{
return PAGE_ALIGN(sg->offset + sg->length) >> PAGE_SHIFT;
}
bool __sg_page_iter_next(struct sg_page_iter *piter)
{
if (!piter->__nents || !piter->sg)
return false;
piter->sg_pgoffset += piter->__pg_advance;
piter->__pg_advance = 1;
while (piter->sg_pgoffset >= sg_page_count(piter->sg)) {
piter->sg_pgoffset -= sg_page_count(piter->sg);
piter->sg = sg_next(piter->sg);
if (!--piter->__nents || !piter->sg)
return false;
}
return true;
}
EXPORT_SYMBOL(__sg_page_iter_next);
static int sg_dma_page_count(struct scatterlist *sg)
{
return PAGE_ALIGN(sg->offset + sg_dma_len(sg)) >> PAGE_SHIFT;
}
bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter)
{
struct sg_page_iter *piter = &dma_iter->base;
if (!piter->__nents || !piter->sg)
return false;
piter->sg_pgoffset += piter->__pg_advance;
piter->__pg_advance = 1;
while (piter->sg_pgoffset >= sg_dma_page_count(piter->sg)) {
piter->sg_pgoffset -= sg_dma_page_count(piter->sg);
piter->sg = sg_next(piter->sg);
if (!--piter->__nents || !piter->sg)
return false;
}
return true;
}
EXPORT_SYMBOL(__sg_page_iter_dma_next);
/**
* sg_miter_start - start mapping iteration over a sg list
* @miter: sg mapping iter to be started
* @sgl: sg list to iterate over
* @nents: number of sg entries
*
* Description:
* Starts mapping iterator @miter.
*
* Context:
* Don't care.
*/
void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl,
unsigned int nents, unsigned int flags)
{
memset(miter, 0, sizeof(struct sg_mapping_iter));
__sg_page_iter_start(&miter->piter, sgl, nents, 0);
WARN_ON(!(flags & (SG_MITER_TO_SG | SG_MITER_FROM_SG)));
miter->__flags = flags;
}
EXPORT_SYMBOL(sg_miter_start);
static bool sg_miter_get_next_page(struct sg_mapping_iter *miter)
{
if (!miter->__remaining) {
struct scatterlist *sg;
if (!__sg_page_iter_next(&miter->piter))
return false;
sg = miter->piter.sg;
miter->__offset = miter->piter.sg_pgoffset ? 0 : sg->offset;
miter->piter.sg_pgoffset += miter->__offset >> PAGE_SHIFT;
miter->__offset &= PAGE_SIZE - 1;
miter->__remaining = sg->offset + sg->length -
(miter->piter.sg_pgoffset << PAGE_SHIFT) -
miter->__offset;
miter->__remaining = min_t(unsigned long, miter->__remaining,
PAGE_SIZE - miter->__offset);
}
return true;
}
/**
* sg_miter_skip - reposition mapping iterator
* @miter: sg mapping iter to be skipped
* @offset: number of bytes to plus the current location
*
* Description:
* Sets the offset of @miter to its current location plus @offset bytes.
* If mapping iterator @miter has been proceeded by sg_miter_next(), this
* stops @miter.
*
* Context:
* Don't care if @miter is stopped, or not proceeded yet.
* Otherwise, preemption disabled if the SG_MITER_ATOMIC is set.
*
* Returns:
* true if @miter contains the valid mapping. false if end of sg
* list is reached.
*/
bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset)
{
sg_miter_stop(miter);
while (offset) {
off_t consumed;
if (!sg_miter_get_next_page(miter))
return false;
consumed = min_t(off_t, offset, miter->__remaining);
miter->__offset += consumed;
miter->__remaining -= consumed;
offset -= consumed;
}
return true;
}
EXPORT_SYMBOL(sg_miter_skip);
/**
* sg_miter_next - proceed mapping iterator to the next mapping
* @miter: sg mapping iter to proceed
*
* Description:
* Proceeds @miter to the next mapping. @miter should have been started
* using sg_miter_start(). On successful return, @miter->page,
* @miter->addr and @miter->length point to the current mapping.
*
* Context:
* Preemption disabled if SG_MITER_ATOMIC. Preemption must stay disabled
* till @miter is stopped. May sleep if !SG_MITER_ATOMIC.
*
* Returns:
* true if @miter contains the next mapping. false if end of sg
* list is reached.
*/
bool sg_miter_next(struct sg_mapping_iter *miter)
{
sg_miter_stop(miter);
/*
* Get to the next page if necessary.
* __remaining, __offset is adjusted by sg_miter_stop
*/
if (!sg_miter_get_next_page(miter))
return false;
miter->page = sg_page_iter_page(&miter->piter);
miter->consumed = miter->length = miter->__remaining;
if (miter->__flags & SG_MITER_ATOMIC)
miter->addr = kmap_atomic(miter->page) + miter->__offset;
else
miter->addr = kmap(miter->page) + miter->__offset;
return true;
}
EXPORT_SYMBOL(sg_miter_next);
/**
* sg_miter_stop - stop mapping iteration
* @miter: sg mapping iter to be stopped
*
* Description:
* Stops mapping iterator @miter. @miter should have been started
* using sg_miter_start(). A stopped iteration can be resumed by
* calling sg_miter_next() on it. This is useful when resources (kmap)
* need to be released during iteration.
*
* Context:
* Preemption disabled if the SG_MITER_ATOMIC is set. Don't care
* otherwise.
*/
void sg_miter_stop(struct sg_mapping_iter *miter)
{
WARN_ON(miter->consumed > miter->length);
/* drop resources from the last iteration */
if (miter->addr) {
miter->__offset += miter->consumed;
miter->__remaining -= miter->consumed;
if (miter->__flags & SG_MITER_TO_SG)
flush_dcache_page(miter->page);
if (miter->__flags & SG_MITER_ATOMIC) {
WARN_ON_ONCE(preemptible());
kunmap_atomic(miter->addr);
} else
kunmap(miter->page);
miter->page = NULL;
miter->addr = NULL;
miter->length = 0;
miter->consumed = 0;
}
}
EXPORT_SYMBOL(sg_miter_stop);
/**
* sg_copy_buffer - Copy data between a linear buffer and an SG list
* @sgl: The SG list
* @nents: Number of SG entries
* @buf: Where to copy from
* @buflen: The number of bytes to copy
* @skip: Number of bytes to skip before copying
* @to_buffer: transfer direction (true == from an sg list to a
* buffer, false == from a buffer to an sg list)
*
* Returns the number of copied bytes.
*
**/
size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
size_t buflen, off_t skip, bool to_buffer)
{
unsigned int offset = 0;
struct sg_mapping_iter miter;
unsigned int sg_flags = SG_MITER_ATOMIC;
if (to_buffer)
sg_flags |= SG_MITER_FROM_SG;
else
sg_flags |= SG_MITER_TO_SG;
sg_miter_start(&miter, sgl, nents, sg_flags);
if (!sg_miter_skip(&miter, skip))
return 0;
while ((offset < buflen) && sg_miter_next(&miter)) {
unsigned int len;
len = min(miter.length, buflen - offset);
if (to_buffer)
memcpy(buf + offset, miter.addr, len);
else
memcpy(miter.addr, buf + offset, len);
offset += len;
}
sg_miter_stop(&miter);
return offset;
}
EXPORT_SYMBOL(sg_copy_buffer);
/**
* sg_copy_from_buffer - Copy from a linear buffer to an SG list
* @sgl: The SG list
* @nents: Number of SG entries
* @buf: Where to copy from
* @buflen: The number of bytes to copy
*
* Returns the number of copied bytes.
*
**/
size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
const void *buf, size_t buflen)
{
return sg_copy_buffer(sgl, nents, (void *)buf, buflen, 0, false);
}
EXPORT_SYMBOL(sg_copy_from_buffer);
/**
* sg_copy_to_buffer - Copy from an SG list to a linear buffer
* @sgl: The SG list
* @nents: Number of SG entries
* @buf: Where to copy to
* @buflen: The number of bytes to copy
*
* Returns the number of copied bytes.
*
**/
size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents,
void *buf, size_t buflen)
{
return sg_copy_buffer(sgl, nents, buf, buflen, 0, true);
}
EXPORT_SYMBOL(sg_copy_to_buffer);
/**
* sg_pcopy_from_buffer - Copy from a linear buffer to an SG list
* @sgl: The SG list
* @nents: Number of SG entries
* @buf: Where to copy from
* @buflen: The number of bytes to copy
* @skip: Number of bytes to skip before copying
*
* Returns the number of copied bytes.
*
**/
size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents,
const void *buf, size_t buflen, off_t skip)
{
return sg_copy_buffer(sgl, nents, (void *)buf, buflen, skip, false);
}
EXPORT_SYMBOL(sg_pcopy_from_buffer);
/**
* sg_pcopy_to_buffer - Copy from an SG list to a linear buffer
* @sgl: The SG list
* @nents: Number of SG entries
* @buf: Where to copy to
* @buflen: The number of bytes to copy
* @skip: Number of bytes to skip before copying
*
* Returns the number of copied bytes.
*
**/
size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents,
void *buf, size_t buflen, off_t skip)
{
return sg_copy_buffer(sgl, nents, buf, buflen, skip, true);
}
EXPORT_SYMBOL(sg_pcopy_to_buffer);
/**
* sg_zero_buffer - Zero-out a part of a SG list
* @sgl: The SG list
* @nents: Number of SG entries
* @buflen: The number of bytes to zero out
* @skip: Number of bytes to skip before zeroing
*
* Returns the number of bytes zeroed.
**/
size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents,
size_t buflen, off_t skip)
{
unsigned int offset = 0;
struct sg_mapping_iter miter;
unsigned int sg_flags = SG_MITER_ATOMIC | SG_MITER_TO_SG;
sg_miter_start(&miter, sgl, nents, sg_flags);
if (!sg_miter_skip(&miter, skip))
return false;
while (offset < buflen && sg_miter_next(&miter)) {
unsigned int len;
len = min(miter.length, buflen - offset);
memset(miter.addr, 0, len);
offset += len;
}
sg_miter_stop(&miter);
return offset;
}
EXPORT_SYMBOL(sg_zero_buffer);
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Prevent the compiler from merging or refetching reads or writes. The
* compiler is also forbidden from reordering successive instances of
* READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some
* particular ordering. One way to make the compiler aware of ordering is to
* put the two invocations of READ_ONCE or WRITE_ONCE in different C
* statements.
*
* These two macros will also work on aggregate data types like structs or
* unions.
*
* Their two major use cases are: (1) Mediating communication between
* process-level code and irq/NMI handlers, all running on the same CPU,
* and (2) Ensuring that the compiler does not fold, spindle, or otherwise
* mutilate accesses that either do not require ordering or that interact
* with an explicit memory barrier or atomic instruction that provides the
* required ordering.
*/
#ifndef __ASM_GENERIC_RWONCE_H
#define __ASM_GENERIC_RWONCE_H
#ifndef __ASSEMBLY__
#include <linux/compiler_types.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>
/*
* Yes, this permits 64-bit accesses on 32-bit architectures. These will
* actually be atomic in some cases (namely Armv7 + LPAE), but for others we
* rely on the access being split into 2x32-bit accesses for a 32-bit quantity
* (e.g. a virtual address) and a strong prevailing wind.
*/
#define compiletime_assert_rwonce_type(t) \
compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
"Unsupported access size for {READ,WRITE}_ONCE().")
/*
* Use __READ_ONCE() instead of READ_ONCE() if you do not require any
* atomicity. Note that this may result in tears!
*/
#ifndef __READ_ONCE
#define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x))
#endif
#define READ_ONCE(x) \
({ \
compiletime_assert_rwonce_type(x); \
__READ_ONCE(x); \
})
#define __WRITE_ONCE(x, val) \
do { \
*(volatile typeof(x) *)&(x) = (val); \
} while (0)
#define WRITE_ONCE(x, val) \
do { \
compiletime_assert_rwonce_type(x); \
__WRITE_ONCE(x, val); \
} while (0)
static __no_sanitize_or_inline
unsigned long __read_once_word_nocheck(const void *addr)
{
return __READ_ONCE(*(unsigned long *)addr);
}
/*
* Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a
* word from memory atomically but without telling KASAN/KCSAN. This is
* usually used by unwinding code when walking the stack of a running process.
*/
#define READ_ONCE_NOCHECK(x) \
({ \
compiletime_assert(sizeof(x) == sizeof(unsigned long), \
"Unsupported access size for READ_ONCE_NOCHECK()."); \
(typeof(x))__read_once_word_nocheck(&(x)); \
})
static __no_kasan_or_inline
unsigned long read_word_at_a_time(const void *addr)
{
kasan_check_read(addr, 1);
return *(unsigned long *)addr;
}
#endif /* __ASSEMBLY__ */
#endif /* __ASM_GENERIC_RWONCE_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 1994, Karl Keyte: Added support for disk statistics
* Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
* Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
* kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
* - July2000
* bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
*/
/*
* This handles all read/write requests to block devices
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-pm.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/psi.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-rq-qos.h"
struct dentry *blk_debugfs_root;
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
DEFINE_IDA(blk_queue_ida);
/*
* For queue allocation
*/
struct kmem_cache *blk_requestq_cachep;
/*
* Controlling structure to kblockd
*/
static struct workqueue_struct *kblockd_workqueue;
/**
* blk_queue_flag_set - atomically set a queue flag
* @flag: flag to be set
* @q: request queue
*/
void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
{
set_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_flag_set);
/**
* blk_queue_flag_clear - atomically clear a queue flag
* @flag: flag to be cleared
* @q: request queue
*/
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
{
clear_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_flag_clear);
/**
* blk_queue_flag_test_and_set - atomically test and set a queue flag
* @flag: flag to be set
* @q: request queue
*
* Returns the previous value of @flag - 0 if the flag was not set and 1 if
* the flag was already set.
*/
bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
{
return test_and_set_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
void blk_rq_init(struct request_queue *q, struct request *rq)
{
memset(rq, 0, sizeof(*rq));
INIT_LIST_HEAD(&rq->queuelist);
rq->q = q;
rq->__sector = (sector_t) -1;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);
#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
static const char *const blk_op_name[] = {
REQ_OP_NAME(READ),
REQ_OP_NAME(WRITE),
REQ_OP_NAME(FLUSH),
REQ_OP_NAME(DISCARD),
REQ_OP_NAME(SECURE_ERASE),
REQ_OP_NAME(ZONE_RESET),
REQ_OP_NAME(ZONE_RESET_ALL),
REQ_OP_NAME(ZONE_OPEN),
REQ_OP_NAME(ZONE_CLOSE),
REQ_OP_NAME(ZONE_FINISH),
REQ_OP_NAME(ZONE_APPEND),
REQ_OP_NAME(WRITE_SAME),
REQ_OP_NAME(WRITE_ZEROES),
REQ_OP_NAME(DRV_IN),
REQ_OP_NAME(DRV_OUT),
};
#undef REQ_OP_NAME
/**
* blk_op_str - Return string XXX in the REQ_OP_XXX.
* @op: REQ_OP_XXX.
*
* Description: Centralize block layer function to convert REQ_OP_XXX into
* string format. Useful in the debugging and tracing bio or request. For
* invalid REQ_OP_XXX it returns string "UNKNOWN".
*/
inline const char *blk_op_str(unsigned int op)
{
const char *op_str = "UNKNOWN";
if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
op_str = blk_op_name[op];
return op_str;
}
EXPORT_SYMBOL_GPL(blk_op_str);
static const struct {
int errno;
const char *name;
} blk_errors[] = {
[BLK_STS_OK] = { 0, "" },
[BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" },
[BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" },
[BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" },
[BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" },
[BLK_STS_TARGET] = { -EREMOTEIO, "critical target" },
[BLK_STS_NEXUS] = { -EBADE, "critical nexus" },
[BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
[BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
[BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
/* device mapper special case, should not leak out: */
[BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
/* zone device specific errors */
[BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" },
[BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" },
/* everything else not covered above: */
[BLK_STS_IOERR] = { -EIO, "I/O" },
};
blk_status_t errno_to_blk_status(int errno)
{
int i;
for (i = 0; i < ARRAY_SIZE(blk_errors); i++) { if (blk_errors[i].errno == errno) return (__force blk_status_t)i;
}
return BLK_STS_IOERR;
}
EXPORT_SYMBOL_GPL(errno_to_blk_status);
int blk_status_to_errno(blk_status_t status)
{
int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
return -EIO;
return blk_errors[idx].errno;
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);
static void print_req_error(struct request *req, blk_status_t status,
const char *caller)
{
int idx = (__force int)status;
if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
return;
printk_ratelimited(KERN_ERR
"%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
"phys_seg %u prio class %u\n",
caller, blk_errors[idx].name,
req->rq_disk ? req->rq_disk->disk_name : "?",
blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
req->cmd_flags & ~REQ_OP_MASK,
req->nr_phys_segments,
IOPRIO_PRIO_CLASS(req->ioprio));
}
static void req_bio_endio(struct request *rq, struct bio *bio,
unsigned int nbytes, blk_status_t error)
{
if (error)
bio->bi_status = error;
if (unlikely(rq->rq_flags & RQF_QUIET))
bio_set_flag(bio, BIO_QUIET);
bio_advance(bio, nbytes);
if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
/*
* Partial zone append completions cannot be supported as the
* BIO fragments may end up not being written sequentially.
*/
if (bio->bi_iter.bi_size)
bio->bi_status = BLK_STS_IOERR;
else
bio->bi_iter.bi_sector = rq->__sector;
}
/* don't actually finish bio if it's part of flush sequence */
if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
bio_endio(bio);
}
void blk_dump_rq_flags(struct request *rq, char *msg)
{
printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
rq->rq_disk ? rq->rq_disk->disk_name : "?",
(unsigned long long) rq->cmd_flags);
printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
(unsigned long long)blk_rq_pos(rq),
blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
printk(KERN_INFO " bio %p, biotail %p, len %u\n",
rq->bio, rq->biotail, blk_rq_bytes(rq));
}
EXPORT_SYMBOL(blk_dump_rq_flags);
/**
* blk_sync_queue - cancel any pending callbacks on a queue
* @q: the queue
*
* Description:
* The block layer may perform asynchronous callback activity
* on a queue, such as calling the unplug function after a timeout.
* A block device may call blk_sync_queue to ensure that any
* such activity is cancelled, thus allowing it to release resources
* that the callbacks might use. The caller must already have made sure
* that its ->submit_bio will not re-add plugging prior to calling
* this function.
*
* This function does not cancel any asynchronous activity arising
* out of elevator or throttling code. That would require elevator_exit()
* and blkcg_exit_queue() to be called with queue lock initialized.
*
*/
void blk_sync_queue(struct request_queue *q)
{
del_timer_sync(&q->timeout);
cancel_work_sync(&q->timeout_work);
}
EXPORT_SYMBOL(blk_sync_queue);
/**
* blk_set_pm_only - increment pm_only counter
* @q: request queue pointer
*/
void blk_set_pm_only(struct request_queue *q)
{
atomic_inc(&q->pm_only);
}
EXPORT_SYMBOL_GPL(blk_set_pm_only);
void blk_clear_pm_only(struct request_queue *q)
{
int pm_only;
pm_only = atomic_dec_return(&q->pm_only);
WARN_ON_ONCE(pm_only < 0);
if (pm_only == 0)
wake_up_all(&q->mq_freeze_wq);
}
EXPORT_SYMBOL_GPL(blk_clear_pm_only);
/**
* blk_put_queue - decrement the request_queue refcount
* @q: the request_queue structure to decrement the refcount for
*
* Decrements the refcount of the request_queue kobject. When this reaches 0
* we'll have blk_release_queue() called.
*
* Context: Any context, but the last reference must not be dropped from
* atomic context.
*/
void blk_put_queue(struct request_queue *q)
{
kobject_put(&q->kobj);
}
EXPORT_SYMBOL(blk_put_queue);
void blk_queue_start_drain(struct request_queue *q)
{
/*
* When queue DYING flag is set, we need to block new req
* entering queue, so we call blk_freeze_queue_start() to
* prevent I/O from crossing blk_queue_enter().
*/
blk_freeze_queue_start(q);
if (queue_is_mq(q))
blk_mq_wake_waiters(q);
/* Make blk_queue_enter() reexamine the DYING flag. */
wake_up_all(&q->mq_freeze_wq);
}
/**
* blk_cleanup_queue - shutdown a request queue
* @q: request queue to shutdown
*
* Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
* put it. All future requests will be failed immediately with -ENODEV.
*
* Context: can sleep
*/
void blk_cleanup_queue(struct request_queue *q)
{
/* cannot be called from atomic context */
might_sleep();
WARN_ON_ONCE(blk_queue_registered(q));
/* mark @q DYING, no new request or merges will be allowed afterwards */
blk_queue_flag_set(QUEUE_FLAG_DYING, q);
blk_queue_start_drain(q);
blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
/*
* Drain all requests queued before DYING marking. Set DEAD flag to
* prevent that blk_mq_run_hw_queues() accesses the hardware queues
* after draining finished.
*/
blk_freeze_queue(q);
/* cleanup rq qos structures for queue without disk */
rq_qos_exit(q);
blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
blk_sync_queue(q);
if (queue_is_mq(q)) {
blk_mq_cancel_work_sync(q);
blk_mq_exit_queue(q);
}
/*
* In theory, request pool of sched_tags belongs to request queue.
* However, the current implementation requires tag_set for freeing
* requests, so free the pool now.
*
* Queue has become frozen, there can't be any in-queue requests, so
* it is safe to free requests now.
*/
mutex_lock(&q->sysfs_lock);
if (q->elevator)
blk_mq_sched_free_requests(q);
mutex_unlock(&q->sysfs_lock);
percpu_ref_exit(&q->q_usage_counter);
/* @q is and will stay empty, shutdown and put */
blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);
static bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
rcu_read_lock();
if (!percpu_ref_tryget_live(&q->q_usage_counter))
goto fail;
/*
* The code that increments the pm_only counter must ensure that the
* counter is globally visible before the queue is unfrozen.
*/
if (blk_queue_pm_only(q) && (!pm || queue_rpm_status(q) == RPM_SUSPENDED))
goto fail_put;
rcu_read_unlock();
return true;
fail_put:
percpu_ref_put(&q->q_usage_counter);
fail:
rcu_read_unlock();
return false;
}
/**
* blk_queue_enter() - try to increase q->q_usage_counter
* @q: request queue pointer
* @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
*/
int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
{
const bool pm = flags & BLK_MQ_REQ_PM;
while (!blk_try_enter_queue(q, pm)) {
if (flags & BLK_MQ_REQ_NOWAIT)
return -EBUSY;
/*
* read pair of barrier in blk_freeze_queue_start(), we need to
* order reading __PERCPU_REF_DEAD flag of .q_usage_counter and
* reading .mq_freeze_depth or queue dying flag, otherwise the
* following wait may never return if the two reads are
* reordered.
*/
smp_rmb();
wait_event(q->mq_freeze_wq,
(!q->mq_freeze_depth &&
blk_pm_resume_queue(pm, q)) ||
blk_queue_dying(q));
if (blk_queue_dying(q))
return -ENODEV;
}
return 0;
}
static inline int bio_queue_enter(struct bio *bio)
{
struct gendisk *disk = bio->bi_bdev->bd_disk;
struct request_queue *q = disk->queue;
while (!blk_try_enter_queue(q, false)) { if (bio->bi_opf & REQ_NOWAIT) {
if (test_bit(GD_DEAD, &disk->state))
goto dead;
bio_wouldblock_error(bio);
return -EBUSY;
}
/*
* read pair of barrier in blk_freeze_queue_start(), we need to
* order reading __PERCPU_REF_DEAD flag of .q_usage_counter and
* reading .mq_freeze_depth or queue dying flag, otherwise the
* following wait may never return if the two reads are
* reordered.
*/
smp_rmb(); wait_event(q->mq_freeze_wq,
(!q->mq_freeze_depth &&
blk_pm_resume_queue(false, q)) ||
test_bit(GD_DEAD, &disk->state));
if (test_bit(GD_DEAD, &disk->state))
goto dead;
}
return 0;
dead:
bio_io_error(bio);
return -ENODEV;
}
void blk_queue_exit(struct request_queue *q)
{
percpu_ref_put(&q->q_usage_counter);
}
static void blk_queue_usage_counter_release(struct percpu_ref *ref)
{
struct request_queue *q =
container_of(ref, struct request_queue, q_usage_counter);
wake_up_all(&q->mq_freeze_wq);
}
static void blk_rq_timed_out_timer(struct timer_list *t)
{
struct request_queue *q = from_timer(q, t, timeout);
kblockd_schedule_work(&q->timeout_work);
}
static void blk_timeout_work(struct work_struct *work)
{
}
struct request_queue *blk_alloc_queue(int node_id)
{
struct request_queue *q;
int ret;
q = kmem_cache_alloc_node(blk_requestq_cachep,
GFP_KERNEL | __GFP_ZERO, node_id);
if (!q)
return NULL;
q->last_merge = NULL;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
if (q->id < 0)
goto fail_q;
ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
if (ret)
goto fail_id;
q->stats = blk_alloc_queue_stats();
if (!q->stats)
goto fail_split;
q->node = node_id;
atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
INIT_WORK(&q->timeout_work, blk_timeout_work);
INIT_LIST_HEAD(&q->icq_list);
#ifdef CONFIG_BLK_CGROUP
INIT_LIST_HEAD(&q->blkg_list);
#endif
kobject_init(&q->kobj, &blk_queue_ktype);
mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock);
spin_lock_init(&q->queue_lock);
init_waitqueue_head(&q->mq_freeze_wq);
mutex_init(&q->mq_freeze_lock);
/*
* Init percpu_ref in atomic mode so that it's faster to shutdown.
* See blk_register_queue() for details.
*/
if (percpu_ref_init(&q->q_usage_counter,
blk_queue_usage_counter_release,
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
goto fail_stats;
if (blkcg_init_queue(q))
goto fail_ref;
blk_queue_dma_alignment(q, 511);
blk_set_default_limits(&q->limits);
q->nr_requests = BLKDEV_MAX_RQ;
return q;
fail_ref:
percpu_ref_exit(&q->q_usage_counter);
fail_stats:
blk_free_queue_stats(q->stats);
fail_split:
bioset_exit(&q->bio_split);
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
}
/**
* blk_get_queue - increment the request_queue refcount
* @q: the request_queue structure to increment the refcount for
*
* Increment the refcount of the request_queue kobject.
*
* Context: Any context.
*/
bool blk_get_queue(struct request_queue *q)
{
if (likely(!blk_queue_dying(q))) {
__blk_get_queue(q);
return true;
}
return false;
}
EXPORT_SYMBOL(blk_get_queue);
/**
* blk_get_request - allocate a request
* @q: request queue to allocate a request for
* @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
* @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
*/
struct request *blk_get_request(struct request_queue *q, unsigned int op,
blk_mq_req_flags_t flags)
{
struct request *req;
WARN_ON_ONCE(op & REQ_NOWAIT);
WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM));
req = blk_mq_alloc_request(q, op, flags);
if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
q->mq_ops->initialize_rq_fn(req);
return req;
}
EXPORT_SYMBOL(blk_get_request);
void blk_put_request(struct request *req)
{
blk_mq_free_request(req);
}
EXPORT_SYMBOL(blk_put_request);
static void handle_bad_sector(struct bio *bio, sector_t maxsector)
{
char b[BDEVNAME_SIZE];
pr_info_ratelimited("attempt to access beyond end of device\n"
"%s: rw=%d, want=%llu, limit=%llu\n",
bio_devname(bio, b), bio->bi_opf,
bio_end_sector(bio), maxsector);
}
#ifdef CONFIG_FAIL_MAKE_REQUEST
static DECLARE_FAULT_ATTR(fail_make_request);
static int __init setup_fail_make_request(char *str)
{
return setup_fault_attr(&fail_make_request, str);
}
__setup("fail_make_request=", setup_fail_make_request);
static bool should_fail_request(struct block_device *part, unsigned int bytes)
{
return part->bd_make_it_fail && should_fail(&fail_make_request, bytes);
}
static int __init fail_make_request_debugfs(void)
{
struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
NULL, &fail_make_request);
return PTR_ERR_OR_ZERO(dir);
}
late_initcall(fail_make_request_debugfs);
#else /* CONFIG_FAIL_MAKE_REQUEST */
static inline bool should_fail_request(struct block_device *part,
unsigned int bytes)
{
return false;
}
#endif /* CONFIG_FAIL_MAKE_REQUEST */
static inline bool bio_check_ro(struct bio *bio)
{
if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
char b[BDEVNAME_SIZE];
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
return false;
WARN_ONCE(1,
"Trying to write to read-only block-device %s (partno %d)\n",
bio_devname(bio, b), bio->bi_bdev->bd_partno);
/* Older lvm-tools actually trigger this */
return false;
}
return false;
}
static noinline int should_fail_bio(struct bio *bio)
{
if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size))
return -EIO;
return 0;
}
ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
/*
* Check whether this bio extends beyond the end of the device or partition.
* This may well happen - the kernel calls bread() without checking the size of
* the device, e.g., when mounting a file system.
*/
static inline int bio_check_eod(struct bio *bio)
{
sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio);
if (nr_sectors && maxsector &&
(nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
handle_bad_sector(bio, maxsector);
return -EIO;
}
return 0;
}
/*
* Remap block n of partition p to block n+start(p) of the disk.
*/
static int blk_partition_remap(struct bio *bio)
{
struct block_device *p = bio->bi_bdev;
if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
return -EIO;
if (bio_sectors(bio)) {
bio->bi_iter.bi_sector += p->bd_start_sect;
trace_block_bio_remap(bio, p->bd_dev,
bio->bi_iter.bi_sector -
p->bd_start_sect);
}
bio_set_flag(bio, BIO_REMAPPED);
return 0;
}
/*
* Check write append to a zoned block device.
*/
static inline blk_status_t blk_check_zone_append(struct request_queue *q,
struct bio *bio)
{
sector_t pos = bio->bi_iter.bi_sector;
int nr_sectors = bio_sectors(bio);
/* Only applicable to zoned block devices */
if (!blk_queue_is_zoned(q))
return BLK_STS_NOTSUPP;
/* The bio sector must point to the start of a sequential zone */
if (pos & (blk_queue_zone_sectors(q) - 1) ||
!blk_queue_zone_is_seq(q, pos))
return BLK_STS_IOERR;
/*
* Not allowed to cross zone boundaries. Otherwise, the BIO will be
* split and could result in non-contiguous sectors being written in
* different zones.
*/
if (nr_sectors > q->limits.chunk_sectors)
return BLK_STS_IOERR;
/* Make sure the BIO is small enough and will not get split */
if (nr_sectors > q->limits.max_zone_append_sectors)
return BLK_STS_IOERR;
bio->bi_opf |= REQ_NOMERGE;
return BLK_STS_OK;
}
static noinline_for_stack bool submit_bio_checks(struct bio *bio)
{
struct block_device *bdev = bio->bi_bdev;
struct request_queue *q = bdev->bd_disk->queue;
blk_status_t status = BLK_STS_IOERR;
struct blk_plug *plug;
might_sleep();
plug = blk_mq_plug(q, bio);
if (plug && plug->nowait) bio->bi_opf |= REQ_NOWAIT;
/*
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue does not support NOWAIT.
*/
if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q))
goto not_supported;
if (should_fail_bio(bio))
goto end_io;
if (unlikely(bio_check_ro(bio)))
goto end_io;
if (!bio_flagged(bio, BIO_REMAPPED)) {
if (unlikely(bio_check_eod(bio)))
goto end_io;
if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
goto end_io;
}
/*
* Filter flush bio's early so that bio based drivers without flush
* support don't have to worry about them.
*/
if (op_is_flush(bio->bi_opf) &&
!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
if (!bio_sectors(bio)) {
status = BLK_STS_OK;
goto end_io;
}
}
if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
bio_clear_hipri(bio);
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
if (!blk_queue_discard(q))
goto not_supported;
break;
case REQ_OP_SECURE_ERASE:
if (!blk_queue_secure_erase(q))
goto not_supported;
break;
case REQ_OP_WRITE_SAME:
if (!q->limits.max_write_same_sectors)
goto not_supported;
break;
case REQ_OP_ZONE_APPEND:
status = blk_check_zone_append(q, bio);
if (status != BLK_STS_OK)
goto end_io;
break;
case REQ_OP_ZONE_RESET:
case REQ_OP_ZONE_OPEN:
case REQ_OP_ZONE_CLOSE:
case REQ_OP_ZONE_FINISH:
if (!blk_queue_is_zoned(q))
goto not_supported;
break;
case REQ_OP_ZONE_RESET_ALL:
if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
goto not_supported;
break;
case REQ_OP_WRITE_ZEROES:
if (!q->limits.max_write_zeroes_sectors)
goto not_supported;
break;
default:
break;
}
/*
* Various block parts want %current->io_context, so allocate it up
* front rather than dealing with lots of pain to allocate it only
* where needed. This may fail and the block layer knows how to live
* with it.
*/
if (unlikely(!current->io_context))
create_task_io_context(current, GFP_ATOMIC, q->node);
if (blk_throtl_bio(bio))
return false;
blk_cgroup_bio_start(bio);
blkcg_bio_issue_init(bio);
if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_queue(bio);
/* Now that enqueuing has been traced, we need to trace
* completion as well.
*/
bio_set_flag(bio, BIO_TRACE_COMPLETION);
}
return true;
not_supported:
status = BLK_STS_NOTSUPP;
end_io:
bio->bi_status = status;
bio_endio(bio);
return false;
}
static blk_qc_t __submit_bio(struct bio *bio)
{
struct gendisk *disk = bio->bi_bdev->bd_disk;
blk_qc_t ret = BLK_QC_T_NONE;
if (unlikely(bio_queue_enter(bio) != 0))
return BLK_QC_T_NONE;
if (!submit_bio_checks(bio) || !blk_crypto_bio_prep(&bio))
goto queue_exit;
if (disk->fops->submit_bio) { ret = disk->fops->submit_bio(bio);
goto queue_exit;
}
return blk_mq_submit_bio(bio);
queue_exit:
blk_queue_exit(disk->queue);
return ret;
}
/*
* The loop in this function may be a bit non-obvious, and so deserves some
* explanation:
*
* - Before entering the loop, bio->bi_next is NULL (as all callers ensure
* that), so we have a list with a single bio.
* - We pretend that we have just taken it off a longer list, so we assign
* bio_list to a pointer to the bio_list_on_stack, thus initialising the
* bio_list of new bios to be added. ->submit_bio() may indeed add some more
* bios through a recursive call to submit_bio_noacct. If it did, we find a
* non-NULL value in bio_list and re-enter the loop from the top.
* - In this case we really did just take the bio of the top of the list (no
* pretending) and so remove it from bio_list, and call into ->submit_bio()
* again.
*
* bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
* bio_list_on_stack[1] contains bios that were submitted before the current
* ->submit_bio_bio, but that haven't been processed yet.
*/
static blk_qc_t __submit_bio_noacct(struct bio *bio)
{
struct bio_list bio_list_on_stack[2];
blk_qc_t ret = BLK_QC_T_NONE;
BUG_ON(bio->bi_next);
bio_list_init(&bio_list_on_stack[0]);
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct bio_list lower, same;
/*
* Create a fresh bio_list for all subordinate requests.
*/
bio_list_on_stack[1] = bio_list_on_stack[0];
bio_list_init(&bio_list_on_stack[0]);
ret = __submit_bio(bio);
/*
* Sort new bios into those for a lower level and those for the
* same level.
*/
bio_list_init(&lower);
bio_list_init(&same);
while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
if (q == bio->bi_bdev->bd_disk->queue)
bio_list_add(&same, bio);
else
bio_list_add(&lower, bio);
/*
* Now assemble so we handle the lowest level first.
*/
bio_list_merge(&bio_list_on_stack[0], &lower);
bio_list_merge(&bio_list_on_stack[0], &same);
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
current->bio_list = NULL;
return ret;
}
static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
{
struct bio_list bio_list[2] = { };
blk_qc_t ret;
current->bio_list = bio_list;
do {
ret = __submit_bio(bio);
} while ((bio = bio_list_pop(&bio_list[0])));
current->bio_list = NULL;
return ret;
}
/**
* submit_bio_noacct - re-submit a bio to the block device layer for I/O
* @bio: The bio describing the location in memory and on the device.
*
* This is a version of submit_bio() that shall only be used for I/O that is
* resubmitted to lower level drivers by stacking block drivers. All file
* systems and other upper level users of the block layer should use
* submit_bio() instead.
*/
blk_qc_t submit_bio_noacct(struct bio *bio)
{
/*
* We only want one ->submit_bio to be active at a time, else stack
* usage with stacked devices could be a problem. Use current->bio_list
* to collect a list of requests submited by a ->submit_bio method while
* it is active, and then process them after it returned.
*/
if (current->bio_list) {
bio_list_add(¤t->bio_list[0], bio);
return BLK_QC_T_NONE;
}
if (!bio->bi_bdev->bd_disk->fops->submit_bio)
return __submit_bio_noacct_mq(bio);
return __submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio_noacct);
/**
* submit_bio - submit a bio to the block device layer for I/O
* @bio: The &struct bio which describes the I/O
*
* submit_bio() is used to submit I/O requests to block devices. It is passed a
* fully set up &struct bio that describes the I/O that needs to be done. The
* bio will be send to the device described by the bi_bdev field.
*
* The success/failure status of the request, along with notification of
* completion, is delivered asynchronously through the ->bi_end_io() callback
* in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has
* been called.
*/
blk_qc_t submit_bio(struct bio *bio)
{
if (blkcg_punt_bio_submit(bio))
return BLK_QC_T_NONE;
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
*/
if (bio_has_data(bio)) {
unsigned int count;
if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
count = queue_logical_block_size(
bio->bi_bdev->bd_disk->queue) >> 9;
else
count = bio_sectors(bio);
if (op_is_write(bio_op(bio))) {
count_vm_events(PGPGOUT, count);
} else {
task_io_account_read(bio->bi_iter.bi_size);
count_vm_events(PGPGIN, count);
}
}
/*
* If we're reading data that is part of the userspace workingset, count
* submission time as memory stall. When the device is congested, or
* the submitting cgroup IO-throttled, submission can be a significant
* part of overall IO time.
*/
if (unlikely(bio_op(bio) == REQ_OP_READ &&
bio_flagged(bio, BIO_WORKINGSET))) {
unsigned long pflags;
blk_qc_t ret;
psi_memstall_enter(&pflags);
ret = submit_bio_noacct(bio);
psi_memstall_leave(&pflags);
return ret;
}
return submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);
/**
* blk_cloned_rq_check_limits - Helper function to check a cloned request
* for the new queue limits
* @q: the queue
* @rq: the request being checked
*
* Description:
* @rq may have been made based on weaker limitations of upper-level queues
* in request stacking drivers, and it may violate the limitation of @q.
* Since the block layer and the underlying device driver trust @rq
* after it is inserted to @q, it should be checked against @q before
* the insertion using this generic function.
*
* Request stacking drivers like request-based dm may change the queue
* limits when retrying requests on other queues. Those requests need
* to be checked against the new queue limits again during dispatch.
*/
static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
struct request *rq)
{
unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
if (blk_rq_sectors(rq) > max_sectors) {
/*
* SCSI device does not have a good way to return if
* Write Same/Zero is actually supported. If a device rejects
* a non-read/write command (discard, write same,etc.) the
* low-level device driver will set the relevant queue limit to
* 0 to prevent blk-lib from issuing more of the offending
* operations. Commands queued prior to the queue limit being
* reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
* errors being propagated to upper layers.
*/
if (max_sectors == 0)
return BLK_STS_NOTSUPP;
printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
__func__, blk_rq_sectors(rq), max_sectors);
return BLK_STS_IOERR;
}
/*
* The queue settings related to segment counting may differ from the
* original queue.
*/
rq->nr_phys_segments = blk_recalc_rq_segments(rq);
if (rq->nr_phys_segments > queue_max_segments(q)) {
printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
__func__, rq->nr_phys_segments, queue_max_segments(q));
return BLK_STS_IOERR;
}
return BLK_STS_OK;
}
/**
* blk_insert_cloned_request - Helper for stacking drivers to submit a request
* @q: the queue to submit the request
* @rq: the request being queued
*/
blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
{
blk_status_t ret;
ret = blk_cloned_rq_check_limits(q, rq);
if (ret != BLK_STS_OK)
return ret;
if (rq->rq_disk &&
should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq)))
return BLK_STS_IOERR;
if (blk_crypto_insert_cloned_request(rq))
return BLK_STS_IOERR;
if (blk_queue_io_stat(q))
blk_account_io_start(rq);
/*
* Since we have a scheduler attached on the top device,
* bypass a potential scheduler on the bottom device for
* insert.
*/
return blk_mq_request_issue_directly(rq, true);
}
EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
/**
* blk_rq_err_bytes - determine number of bytes till the next failure boundary
* @rq: request to examine
*
* Description:
* A request could be merge of IOs which require different failure
* handling. This function determines the number of bytes which
* can be failed from the beginning of the request without
* crossing into area which need to be retried further.
*
* Return:
* The number of bytes to fail.
*/
unsigned int blk_rq_err_bytes(const struct request *rq)
{
unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
unsigned int bytes = 0;
struct bio *bio;
if (!(rq->rq_flags & RQF_MIXED_MERGE))
return blk_rq_bytes(rq);
/*
* Currently the only 'mixing' which can happen is between
* different fastfail types. We can safely fail portions
* which have all the failfast bits that the first one has -
* the ones which are at least as eager to fail as the first
* one.
*/
for (bio = rq->bio; bio; bio = bio->bi_next) {
if ((bio->bi_opf & ff) != ff)
break;
bytes += bio->bi_iter.bi_size;
}
/* this could lead to infinite loop */
BUG_ON(blk_rq_bytes(rq) && !bytes);
return bytes;
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
static void update_io_ticks(struct block_device *part, unsigned long now,
bool end)
{
unsigned long stamp;
again:
stamp = READ_ONCE(part->bd_stamp);
if (unlikely(time_after(now, stamp))) {
if (likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp)) __part_stat_add(part, io_ticks, end ? now - stamp : 1);
}
if (part->bd_partno) { part = bdev_whole(part);
goto again;
}
}
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (req->part && blk_do_io_stat(req)) {
const int sgrp = op_stat_group(req_op(req));
part_stat_lock();
part_stat_add(req->part, sectors[sgrp], bytes >> 9);
part_stat_unlock();
}
}
void blk_account_io_done(struct request *req, u64 now)
{
/*
* Account IO completion. flush_rq isn't accounted as a
* normal IO on queueing nor completion. Accounting the
* containing request is enough.
*/
if (req->part && blk_do_io_stat(req) &&
!(req->rq_flags & RQF_FLUSH_SEQ)) {
const int sgrp = op_stat_group(req_op(req));
part_stat_lock();
update_io_ticks(req->part, jiffies, true);
part_stat_inc(req->part, ios[sgrp]);
part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
part_stat_unlock();
}
}
void blk_account_io_start(struct request *rq)
{
if (!blk_do_io_stat(rq))
return;
/* passthrough requests can hold bios that do not have ->bi_bdev set */
if (rq->bio && rq->bio->bi_bdev)
rq->part = rq->bio->bi_bdev;
else
rq->part = rq->rq_disk->part0;
part_stat_lock();
update_io_ticks(rq->part, jiffies, false);
part_stat_unlock();
}
static unsigned long __part_start_io_acct(struct block_device *part,
unsigned int sectors, unsigned int op,
unsigned long start_time)
{
const int sgrp = op_stat_group(op);
part_stat_lock();
update_io_ticks(part, start_time, false);
part_stat_inc(part, ios[sgrp]);
part_stat_add(part, sectors[sgrp], sectors);
part_stat_local_inc(part, in_flight[op_is_write(op)]);
part_stat_unlock();
return start_time;
}
/**
* bio_start_io_acct_time - start I/O accounting for bio based drivers
* @bio: bio to start account for
* @start_time: start time that should be passed back to bio_end_io_acct().
*/
void bio_start_io_acct_time(struct bio *bio, unsigned long start_time)
{
__part_start_io_acct(bio->bi_bdev, bio_sectors(bio),
bio_op(bio), start_time);
}
EXPORT_SYMBOL_GPL(bio_start_io_acct_time);
/**
* bio_start_io_acct - start I/O accounting for bio based drivers
* @bio: bio to start account for
*
* Returns the start time that should be passed back to bio_end_io_acct().
*/
unsigned long bio_start_io_acct(struct bio *bio)
{
return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio),
bio_op(bio), jiffies);
}
EXPORT_SYMBOL_GPL(bio_start_io_acct);
unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
unsigned int op)
{
return __part_start_io_acct(disk->part0, sectors, op, jiffies);
}
EXPORT_SYMBOL(disk_start_io_acct);
static void __part_end_io_acct(struct block_device *part, unsigned int op,
unsigned long start_time)
{
const int sgrp = op_stat_group(op);
unsigned long now = READ_ONCE(jiffies);
unsigned long duration = now - start_time;
part_stat_lock();
update_io_ticks(part, now, true);
part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
part_stat_local_dec(part, in_flight[op_is_write(op)]);
part_stat_unlock();
}
void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
struct block_device *orig_bdev)
{
__part_end_io_acct(orig_bdev, bio_op(bio), start_time);
}
EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
void disk_end_io_acct(struct gendisk *disk, unsigned int op,
unsigned long start_time)
{
__part_end_io_acct(disk->part0, op, start_time);
}
EXPORT_SYMBOL(disk_end_io_acct);
/*
* Steal bios from a request and add them to a bio list.
* The request must not have been partially completed before.
*/
void blk_steal_bios(struct bio_list *list, struct request *rq)
{
if (rq->bio) {
if (list->tail)
list->tail->bi_next = rq->bio;
else
list->head = rq->bio;
list->tail = rq->biotail;
rq->bio = NULL;
rq->biotail = NULL;
}
rq->__data_len = 0;
}
EXPORT_SYMBOL_GPL(blk_steal_bios);
/**
* blk_update_request - Complete multiple bytes without completing the request
* @req: the request being processed
* @error: block status code
* @nr_bytes: number of bytes to complete for @req
*
* Description:
* Ends I/O on a number of bytes attached to @req, but doesn't complete
* the request structure even if @req doesn't have leftover.
* If @req has leftover, sets it up for the next range of segments.
*
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
* %false return from this function.
*
* Note:
* The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
* except in the consistency check at the end of this function.
*
* Return:
* %false - this request doesn't have any more data
* %true - this request has more data
**/
bool blk_update_request(struct request *req, blk_status_t error,
unsigned int nr_bytes)
{
int total_bytes;
trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);
if (!req->bio)
return false;
#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
error == BLK_STS_OK)
req->q->integrity.profile->complete_fn(req, nr_bytes);
#endif
if (unlikely(error && !blk_rq_is_passthrough(req) &&
!(req->rq_flags & RQF_QUIET)))
print_req_error(req, error, __func__);
blk_account_io_completion(req, nr_bytes);
total_bytes = 0;
while (req->bio) {
struct bio *bio = req->bio;
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
if (bio_bytes == bio->bi_iter.bi_size)
req->bio = bio->bi_next;
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
req_bio_endio(req, bio, bio_bytes, error);
total_bytes += bio_bytes;
nr_bytes -= bio_bytes;
if (!nr_bytes)
break;
}
/*
* completely done
*/
if (!req->bio) {
/*
* Reset counters so that the request stacking driver
* can find how many bytes remain in the request
* later.
*/
req->__data_len = 0;
return false;
}
req->__data_len -= total_bytes;
/* update sector only for requests with clear definition of sector */
if (!blk_rq_is_passthrough(req))
req->__sector += total_bytes >> 9;
/* mixed attributes always follow the first bio */
if (req->rq_flags & RQF_MIXED_MERGE) {
req->cmd_flags &= ~REQ_FAILFAST_MASK;
req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
}
if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
/*
* If total number of sectors is less than the first segment
* size, something has gone terribly wrong.
*/
if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
blk_dump_rq_flags(req, "request botched");
req->__data_len = blk_rq_cur_bytes(req);
}
/* recalculate the number of segments */
req->nr_phys_segments = blk_recalc_rq_segments(req);
}
return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/**
* rq_flush_dcache_pages - Helper function to flush all pages in a request
* @rq: the request to be flushed
*
* Description:
* Flush all pages in @rq.
*/
void rq_flush_dcache_pages(struct request *rq)
{
struct req_iterator iter;
struct bio_vec bvec;
rq_for_each_segment(bvec, rq, iter)
flush_dcache_page(bvec.bv_page);
}
EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
#endif
/**
* blk_lld_busy - Check if underlying low-level drivers of a device are busy
* @q : the queue of the device being checked
*
* Description:
* Check if underlying low-level drivers of a device are busy.
* If the drivers want to export their busy state, they must set own
* exporting function using blk_queue_lld_busy() first.
*
* Basically, this function is used only by request stacking drivers
* to stop dispatching requests to underlying devices when underlying
* devices are busy. This behavior helps more I/O merging on the queue
* of the request stacking driver and prevents I/O throughput regression
* on burst I/O load.
*
* Return:
* 0 - Not busy (The request stacking driver should dispatch request)
* 1 - Busy (The request stacking driver should stop dispatching request)
*/
int blk_lld_busy(struct request_queue *q)
{
if (queue_is_mq(q) && q->mq_ops->busy)
return q->mq_ops->busy(q);
return 0;
}
EXPORT_SYMBOL_GPL(blk_lld_busy);
/**
* blk_rq_unprep_clone - Helper function to free all bios in a cloned request
* @rq: the clone request to be cleaned up
*
* Description:
* Free all bios in @rq for a cloned request.
*/
void blk_rq_unprep_clone(struct request *rq)
{
struct bio *bio;
while ((bio = rq->bio) != NULL) {
rq->bio = bio->bi_next;
bio_put(bio);
}
}
EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
/**
* blk_rq_prep_clone - Helper function to setup clone request
* @rq: the request to be setup
* @rq_src: original request to be cloned
* @bs: bio_set that bios for clone are allocated from
* @gfp_mask: memory allocation mask for bio
* @bio_ctr: setup function to be called for each clone bio.
* Returns %0 for success, non %0 for failure.
* @data: private data to be passed to @bio_ctr
*
* Description:
* Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
* Also, pages which the original bios are pointing to are not copied
* and the cloned bios just point same pages.
* So cloned bios must be completed before original bios, which means
* the caller must complete @rq before @rq_src.
*/
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
struct bio_set *bs, gfp_t gfp_mask,
int (*bio_ctr)(struct bio *, struct bio *, void *),
void *data)
{
struct bio *bio, *bio_src;
if (!bs)
bs = &fs_bio_set;
__rq_for_each_bio(bio_src, rq_src) {
bio = bio_clone_fast(bio_src, gfp_mask, bs);
if (!bio)
goto free_and_out;
if (bio_ctr && bio_ctr(bio, bio_src, data))
goto free_and_out;
if (rq->bio) {
rq->biotail->bi_next = bio;
rq->biotail = bio;
} else {
rq->bio = rq->biotail = bio;
}
bio = NULL;
}
/* Copy attributes of the original request to the clone request. */
rq->__sector = blk_rq_pos(rq_src);
rq->__data_len = blk_rq_bytes(rq_src);
if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
rq->special_vec = rq_src->special_vec;
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
rq->ioprio = rq_src->ioprio;
if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
goto free_and_out;
return 0;
free_and_out:
if (bio)
bio_put(bio);
blk_rq_unprep_clone(rq);
return -ENOMEM;
}
EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
int kblockd_schedule_work(struct work_struct *work)
{
return queue_work(kblockd_workqueue, work);
}
EXPORT_SYMBOL(kblockd_schedule_work);
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
unsigned long delay)
{
return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
/**
* blk_start_plug - initialize blk_plug and track it inside the task_struct
* @plug: The &struct blk_plug that needs to be initialized
*
* Description:
* blk_start_plug() indicates to the block layer an intent by the caller
* to submit multiple I/O requests in a batch. The block layer may use
* this hint to defer submitting I/Os from the caller until blk_finish_plug()
* is called. However, the block layer may choose to submit requests
* before a call to blk_finish_plug() if the number of queued I/Os
* exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
* %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if
* the task schedules (see below).
*
* Tracking blk_plug inside the task_struct will help with auto-flushing the
* pending I/O should the task end up blocking between blk_start_plug() and
* blk_finish_plug(). This is important from a performance perspective, but
* also ensures that we don't deadlock. For instance, if the task is blocking
* for a memory allocation, memory reclaim could end up wanting to free a
* page belonging to that request that is currently residing in our private
* plug. By flushing the pending I/O when the process goes to sleep, we avoid
* this kind of deadlock.
*/
void blk_start_plug(struct blk_plug *plug)
{
struct task_struct *tsk = current;
/*
* If this is a nested plug, don't actually assign it.
*/
if (tsk->plug)
return;
INIT_LIST_HEAD(&plug->mq_list);
INIT_LIST_HEAD(&plug->cb_list);
plug->rq_count = 0;
plug->multiple_queues = false;
plug->nowait = false;
/*
* Store ordering should not be needed here, since a potential
* preempt will imply a full memory barrier
*/
tsk->plug = plug;
}
EXPORT_SYMBOL(blk_start_plug);
static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
{
LIST_HEAD(callbacks);
while (!list_empty(&plug->cb_list)) {
list_splice_init(&plug->cb_list, &callbacks);
while (!list_empty(&callbacks)) {
struct blk_plug_cb *cb = list_first_entry(&callbacks,
struct blk_plug_cb,
list);
list_del(&cb->list);
cb->callback(cb, from_schedule);
}
}
}
struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
int size)
{
struct blk_plug *plug = current->plug;
struct blk_plug_cb *cb;
if (!plug)
return NULL;
list_for_each_entry(cb, &plug->cb_list, list)
if (cb->callback == unplug && cb->data == data)
return cb;
/* Not currently on the callback list */
BUG_ON(size < sizeof(*cb));
cb = kzalloc(size, GFP_ATOMIC);
if (cb) {
cb->data = data;
cb->callback = unplug;
list_add(&cb->list, &plug->cb_list);
}
return cb;
}
EXPORT_SYMBOL(blk_check_plugged);
void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
flush_plug_callbacks(plug, from_schedule);
if (!list_empty(&plug->mq_list))
blk_mq_flush_plug_list(plug, from_schedule);}
/**
* blk_finish_plug - mark the end of a batch of submitted I/O
* @plug: The &struct blk_plug passed to blk_start_plug()
*
* Description:
* Indicate that a batch of I/O submissions is complete. This function
* must be paired with an initial call to blk_start_plug(). The intent
* is to allow the block layer to optimize I/O submission. See the
* documentation for blk_start_plug() for more information.
*/
void blk_finish_plug(struct blk_plug *plug)
{
if (plug != current->plug)
return;
blk_flush_plug_list(plug, false);
current->plug = NULL;
}
EXPORT_SYMBOL(blk_finish_plug);
void blk_io_schedule(void)
{
/* Prevent hang_check timer from firing at us during very long I/O */
unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
if (timeout)
io_schedule_timeout(timeout);
else
io_schedule();
}
EXPORT_SYMBOL_GPL(blk_io_schedule);
int __init blk_dev_init(void)
{
BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
sizeof_field(struct bio, bi_opf));
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
if (!kblockd_workqueue)
panic("Failed to create kblockd\n");
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
blk_debugfs_root = debugfs_create_dir("block", NULL);
return 0;
}
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 1993 Linus Torvalds
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
* Numa awareness, Christoph Lameter, SGI, June 2005
* Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
*/
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
#include "internal.h"
#include "pgalloc-track.h"
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;
static int __init set_nohugeiomap(char *str)
{
ioremap_max_page_shift = PAGE_SHIFT;
return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
static bool __ro_after_init vmap_allow_huge = true;
static int __init set_nohugevmalloc(char *str)
{
vmap_allow_huge = false;
return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
static const bool vmap_allow_huge = false;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
bool is_vmalloc_addr(const void *x)
{
unsigned long addr = (unsigned long)x; return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);
struct vfree_deferred {
struct llist_head list;
struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
static void __vunmap(const void *, int);
static void free_work(struct work_struct *w)
{
struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
struct llist_node *t, *llnode;
llist_for_each_safe(llnode, t, llist_del_all(&p->list))
__vunmap((void *)llnode, 1);
}
/*** Page table manipulation functions ***/
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
pte_t *pte;
u64 pfn;
unsigned long size = PAGE_SIZE;
pfn = phys_addr >> PAGE_SHIFT;
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
do {
BUG_ON(!pte_none(*pte));
#ifdef CONFIG_HUGETLB_PAGE
size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
if (size != PAGE_SIZE) {
pte_t entry = pfn_pte(pfn, prot);
entry = pte_mkhuge(entry);
entry = arch_make_huge_pte(entry, ilog2(size), 0);
set_huge_pte_at(&init_mm, addr, pte, entry);
pfn += PFN_DOWN(size);
continue;
}
#endif
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int max_page_shift)
{
if (max_page_shift < PMD_SHIFT)
return 0;
if (!arch_vmap_pmd_supported(prot))
return 0;
if ((end - addr) != PMD_SIZE)
return 0;
if (!IS_ALIGNED(addr, PMD_SIZE))
return 0;
if (!IS_ALIGNED(phys_addr, PMD_SIZE))
return 0;
if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
return 0;
return pmd_set_huge(pmd, phys_addr, prot);
}
static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
if (!pmd)
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
max_page_shift)) {
*mask |= PGTBL_PMD_MODIFIED;
continue;
}
if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
return -ENOMEM;
} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
}
static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int max_page_shift)
{
if (max_page_shift < PUD_SHIFT)
return 0;
if (!arch_vmap_pud_supported(prot))
return 0;
if ((end - addr) != PUD_SIZE)
return 0;
if (!IS_ALIGNED(addr, PUD_SIZE))
return 0;
if (!IS_ALIGNED(phys_addr, PUD_SIZE))
return 0;
if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
return 0;
return pud_set_huge(pud, phys_addr, prot);
}
static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
pud = pud_alloc_track(&init_mm, p4d, addr, mask);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
max_page_shift)) {
*mask |= PGTBL_PUD_MODIFIED;
continue;
}
if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
max_page_shift, mask))
return -ENOMEM;
} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
}
static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int max_page_shift)
{
if (max_page_shift < P4D_SHIFT)
return 0;
if (!arch_vmap_p4d_supported(prot))
return 0;
if ((end - addr) != P4D_SIZE)
return 0;
if (!IS_ALIGNED(addr, P4D_SIZE))
return 0;
if (!IS_ALIGNED(phys_addr, P4D_SIZE))
return 0;
if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
return 0;
return p4d_set_huge(p4d, phys_addr, prot);
}
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
if (!p4d)
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
max_page_shift)) {
*mask |= PGTBL_P4D_MODIFIED;
continue;
}
if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
max_page_shift, mask))
return -ENOMEM;
} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
}
static int vmap_range_noflush(unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int max_page_shift)
{
pgd_t *pgd;
unsigned long start;
unsigned long next;
int err;
pgtbl_mod_mask mask = 0;
might_sleep();
BUG_ON(addr >= end);
start = addr;
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
max_page_shift, &mask);
if (err)
break;
} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
return err;
}
int ioremap_page_range(unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot)
{
int err;
err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
ioremap_max_page_shift);
flush_cache_vmap(addr, end);
return err;
}
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
pte_t *pte;
pte = pte_offset_kernel(pmd, addr);
do {
pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr += PAGE_SIZE, addr != end);
*mask |= PGTBL_PTE_MODIFIED;
}
static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
int cleared;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end); cleared = pmd_clear_huge(pmd); if (cleared || pmd_bad(*pmd))
*mask |= PGTBL_PMD_MODIFIED;
if (cleared)
continue;
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next, mask);
cond_resched();
} while (pmd++, addr = next, addr != end);
}
static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
int cleared;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end); cleared = pud_clear_huge(pud); if (cleared || pud_bad(*pud))
*mask |= PGTBL_PUD_MODIFIED;
if (cleared)
continue;
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next, mask);
} while (pud++, addr = next, addr != end);
}
static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
int cleared;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end); cleared = p4d_clear_huge(p4d); if (cleared || p4d_bad(*p4d))
*mask |= PGTBL_P4D_MODIFIED;
if (cleared)
continue;
if (p4d_none_or_clear_bad(p4d))
continue;
vunmap_pud_range(p4d, addr, next, mask);
} while (p4d++, addr = next, addr != end);
}
/*
* vunmap_range_noflush is similar to vunmap_range, but does not
* flush caches or TLBs.
*
* The caller is responsible for calling flush_cache_vmap() before calling
* this function, and flush_tlb_kernel_range after it has returned
* successfully (and before the addresses are expected to cause a page fault
* or be re-mapped for something else, if TLB flushes are being delayed or
* coalesced).
*
* This is an internal function only. Do not use outside mm/.
*/
void vunmap_range_noflush(unsigned long start, unsigned long end)
{
unsigned long next;
pgd_t *pgd;
unsigned long addr = start;
pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end); pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end); if (pgd_bad(*pgd))
mask |= PGTBL_PGD_MODIFIED;
if (pgd_none_or_clear_bad(pgd))
continue;
vunmap_p4d_range(pgd, addr, next, &mask);
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
}
/**
* vunmap_range - unmap kernel virtual addresses
* @addr: start of the VM area to unmap
* @end: end of the VM area to unmap (non-inclusive)
*
* Clears any present PTEs in the virtual address range, flushes TLBs and
* caches. Any subsequent access to the address before it has been re-mapped
* is a kernel bug.
*/
void vunmap_range(unsigned long addr, unsigned long end)
{
flush_cache_vunmap(addr, end);
vunmap_range_noflush(addr, end);
flush_tlb_kernel_range(addr, end);
}
static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
pte_t *pte;
/*
* nr is a running index into the array which helps higher level
* callers keep track of where we're up to.
*/
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
do {
struct page *page = pages[*nr]; if (WARN_ON(!pte_none(*pte)))
return -EBUSY;
if (WARN_ON(!page))
return -ENOMEM;
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
if (!pmd)
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
pud = pud_alloc_track(&init_mm, p4d, addr, mask);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
if (!p4d)
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (p4d++, addr = next, addr != end);
return 0;
}
static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages)
{
unsigned long start = addr;
pgd_t *pgd;
unsigned long next;
int err = 0;
int nr = 0;
pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end); pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_bad(*pgd))
mask |= PGTBL_PGD_MODIFIED;
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
return 0;
}
/*
* vmap_pages_range_noflush is similar to vmap_pages_range, but does not
* flush caches.
*
* The caller is responsible for calling flush_cache_vmap() after this
* function returns successfully and before the addresses are accessed.
*
* This is an internal function only. Do not use outside mm/.
*/
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
WARN_ON(page_shift < PAGE_SHIFT);
if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
page_shift == PAGE_SHIFT)
return vmap_small_pages_range_noflush(addr, end, prot, pages);
for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
int err;
err = vmap_range_noflush(addr, addr + (1UL << page_shift),
__pa(page_address(pages[i])), prot,
page_shift);
if (err)
return err;
addr += 1UL << page_shift;
}
return 0;
}
/**
* vmap_pages_range - map pages to a kernel virtual address
* @addr: start of the VM area to map
* @end: end of the VM area to map (non-inclusive)
* @prot: page protection flags to use
* @pages: pages to map (always PAGE_SIZE pages)
* @page_shift: maximum shift that the pages may be mapped with, @pages must
* be aligned and contiguous up to at least this shift.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
static int vmap_pages_range(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
int err;
err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
flush_cache_vmap(addr, end);
return err;
}
int is_vmalloc_or_module_addr(const void *x)
{
/*
* ARM, x86-64 and sparc64 put modules in a special place,
* and fall back on vmalloc() if that fails. Others
* just put it in the vmalloc space.
*/
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
unsigned long addr = (unsigned long)x;
if (addr >= MODULES_VADDR && addr < MODULES_END)
return 1;
#endif
return is_vmalloc_addr(x);
}
/*
* Walk a vmap address to the struct page it maps. Huge vmap mappings will
* return the tail page that corresponds to the base page address, which
* matches small vmap mappings.
*/
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
unsigned long addr = (unsigned long) vmalloc_addr;
struct page *page = NULL;
pgd_t *pgd = pgd_offset_k(addr);
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
/*
* XXX we might need to change this if we add VIRTUAL_BUG_ON for
* architectures that do not vmalloc module space
*/
VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
if (pgd_none(*pgd))
return NULL;
if (WARN_ON_ONCE(pgd_leaf(*pgd)))
return NULL; /* XXX: no allowance for huge pgd */
if (WARN_ON_ONCE(pgd_bad(*pgd)))
return NULL;
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d))
return NULL;
if (p4d_leaf(*p4d))
return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
if (WARN_ON_ONCE(p4d_bad(*p4d)))
return NULL;
pud = pud_offset(p4d, addr);
if (pud_none(*pud))
return NULL;
if (pud_leaf(*pud))
return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
if (WARN_ON_ONCE(pud_bad(*pud)))
return NULL;
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return NULL;
if (pmd_leaf(*pmd))
return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
if (WARN_ON_ONCE(pmd_bad(*pmd)))
return NULL;
ptep = pte_offset_map(pmd, addr);
pte = *ptep;
if (pte_present(pte))
page = pte_page(pte);
pte_unmap(ptep);
return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
/*
* Map a vmalloc()-space virtual address to the physical page frame number.
*/
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;
static struct rb_root purge_vmap_area_root = RB_ROOT;
static LIST_HEAD(purge_vmap_area_list);
static DEFINE_SPINLOCK(purge_vmap_area_lock);
/*
* This kmem_cache is used for vmap_area objects. Instead of
* allocating from slab we reuse an object from this cache to
* make things faster. Especially in "no edge" splitting of
* free block.
*/
static struct kmem_cache *vmap_area_cachep;
/*
* This linked list is used in pair with free_vmap_area_root.
* It gives O(1) access to prev/next to perform fast coalescing.
*/
static LIST_HEAD(free_vmap_area_list);
/*
* This augment red-black tree represents the free vmap space.
* All vmap_area objects in this tree are sorted by va->va_start
* address. It is used for allocation and merging when a vmap
* object is released.
*
* Each vmap_area node contains a maximum available free block
* of its sub-tree, right or left. Therefore it is possible to
* find a lowest match of free area.
*/
static struct rb_root free_vmap_area_root = RB_ROOT;
/*
* Preload a CPU with one object for "no edge" split case. The
* aim is to get rid of allocations from the atomic context, thus
* to use more permissive allocation masks.
*/
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
static __always_inline unsigned long
va_size(struct vmap_area *va)
{
return (va->va_end - va->va_start);
}
static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
struct vmap_area *va;
va = rb_entry_safe(node, struct vmap_area, rb_node); return va ? va->subtree_max_size : 0;
}
/*
* Gets called when remove the node and rotate.
*/
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
return max3(va_size(va),
get_subtree_max_size(va->rb_node.rb_left),
get_subtree_max_size(va->rb_node.rb_right));
}
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static unsigned long lazy_max_pages(void);
static atomic_long_t nr_vmalloc_pages;
unsigned long vmalloc_nr_pages(void)
{
return atomic_long_read(&nr_vmalloc_pages);
}
static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
{
struct vmap_area *va = NULL;
struct rb_node *n = vmap_area_root.rb_node;
while (n) {
struct vmap_area *tmp;
tmp = rb_entry(n, struct vmap_area, rb_node);
if (tmp->va_end > addr) {
va = tmp;
if (tmp->va_start <= addr)
break;
n = n->rb_left;
} else
n = n->rb_right;
}
return va;
}
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
struct rb_node *n = vmap_area_root.rb_node;
while (n) {
struct vmap_area *va;
va = rb_entry(n, struct vmap_area, rb_node); if (addr < va->va_start) n = n->rb_left; else if (addr >= va->va_end) n = n->rb_right;
else
return va;
}
return NULL;
}
/*
* This function returns back addresses of parent node
* and its left or right link for further processing.
*
* Otherwise NULL is returned. In that case all further
* steps regarding inserting of conflicting overlap range
* have to be declined and actually considered as a bug.
*/
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
struct rb_root *root, struct rb_node *from,
struct rb_node **parent)
{
struct vmap_area *tmp_va;
struct rb_node **link;
if (root) {
link = &root->rb_node;
if (unlikely(!*link)) {
*parent = NULL;
return link;
}
} else {
link = &from;
}
/*
* Go to the bottom of the tree. When we hit the last point
* we end up with parent rb_node and correct direction, i name
* it link, where the new va->rb_node will be attached to.
*/
do {
tmp_va = rb_entry(*link, struct vmap_area, rb_node);
/*
* During the traversal we also do some sanity check.
* Trigger the BUG() if there are sides(left/right)
* or full overlaps.
*/
if (va->va_start < tmp_va->va_end &&
va->va_end <= tmp_va->va_start)
link = &(*link)->rb_left; else if (va->va_end > tmp_va->va_start &&
va->va_start >= tmp_va->va_end)
link = &(*link)->rb_right;
else {
WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
return NULL;
}
} while (*link); *parent = &tmp_va->rb_node;
return link;
}
static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
struct list_head *list;
if (unlikely(!parent))
/*
* The red-black tree where we try to find VA neighbors
* before merging or inserting is empty, i.e. it means
* there is no free vmap space. Normally it does not
* happen but we handle this case anyway.
*/
return NULL;
list = &rb_entry(parent, struct vmap_area, rb_node)->list; return (&parent->rb_right == link ? list->next : list);
}
static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
struct rb_node *parent, struct rb_node **link, struct list_head *head)
{
/*
* VA is still not in the list, but we can
* identify its future previous list_head node.
*/
if (likely(parent)) {
head = &rb_entry(parent, struct vmap_area, rb_node)->list; if (&parent->rb_right != link) head = head->prev;
}
/* Insert to the rb-tree */
rb_link_node(&va->rb_node, parent, link);
if (root == &free_vmap_area_root) {
/*
* Some explanation here. Just perform simple insertion
* to the tree. We do not set va->subtree_max_size to
* its current size before calling rb_insert_augmented().
* It is because of we populate the tree from the bottom
* to parent levels when the node _is_ in the tree.
*
* Therefore we set subtree_max_size to zero after insertion,
* to let __augment_tree_propagate_from() puts everything to
* the correct order later on.
*/
rb_insert_augmented(&va->rb_node,
root, &free_vmap_area_rb_augment_cb);
va->subtree_max_size = 0;
} else {
rb_insert_color(&va->rb_node, root);
}
/* Address-sort this list */
list_add(&va->list, head);
}
static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
return;
if (root == &free_vmap_area_root)
rb_erase_augmented(&va->rb_node,
root, &free_vmap_area_rb_augment_cb);
else
rb_erase(&va->rb_node, root);
list_del(&va->list);
RB_CLEAR_NODE(&va->rb_node);
}
#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
augment_tree_propagate_check(void)
{
struct vmap_area *va;
unsigned long computed_size;
list_for_each_entry(va, &free_vmap_area_list, list) {
computed_size = compute_subtree_max_size(va);
if (computed_size != va->subtree_max_size)
pr_emerg("tree is corrupted: %lu, %lu\n",
va_size(va), va->subtree_max_size);
}
}
#endif
/*
* This function populates subtree_max_size from bottom to upper
* levels starting from VA point. The propagation must be done
* when VA size is modified by changing its va_start/va_end. Or
* in case of newly inserting of VA to the tree.
*
* It means that __augment_tree_propagate_from() must be called:
* - After VA has been inserted to the tree(free path);
* - After VA has been shrunk(allocation path);
* - After VA has been increased(merging path).
*
* Please note that, it does not mean that upper parent nodes
* and their subtree_max_size are recalculated all the time up
* to the root node.
*
* 4--8
* /\
* / \
* / \
* 2--2 8--8
*
* For example if we modify the node 4, shrinking it to 2, then
* no any modification is required. If we shrink the node 2 to 1
* its subtree_max_size is updated only, and set to 1. If we shrink
* the node 8 to 6, then its subtree_max_size is set to 6 and parent
* node becomes 4--6.
*/
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
/*
* Populate the tree from bottom towards the root until
* the calculated maximum available size of checked node
* is equal to its current one.
*/
free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
#if DEBUG_AUGMENT_PROPAGATE_CHECK
augment_tree_propagate_check();
#endif
}
static void
insert_vmap_area(struct vmap_area *va,
struct rb_root *root, struct list_head *head)
{
struct rb_node **link;
struct rb_node *parent;
link = find_va_links(va, root, NULL, &parent);
if (link)
link_va(va, root, parent, link, head);
}
static void
insert_vmap_area_augment(struct vmap_area *va,
struct rb_node *from, struct rb_root *root,
struct list_head *head)
{
struct rb_node **link;
struct rb_node *parent;
if (from)
link = find_va_links(va, NULL, from, &parent);
else
link = find_va_links(va, root, NULL, &parent);
if (link) {
link_va(va, root, parent, link, head);
augment_tree_propagate_from(va);
}
}
/*
* Merge de-allocated chunk of VA memory with previous
* and next free blocks. If coalesce is not done a new
* free area is inserted. If VA has been merged, it is
* freed.
*
* Please note, it can return NULL in case of overlap
* ranges, followed by WARN() report. Despite it is a
* buggy behaviour, a system can be alive and keep
* ongoing.
*/
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
struct rb_root *root, struct list_head *head)
{
struct vmap_area *sibling;
struct list_head *next;
struct rb_node **link;
struct rb_node *parent;
bool merged = false;
/*
* Find a place in the tree where VA potentially will be
* inserted, unless it is merged with its sibling/siblings.
*/
link = find_va_links(va, root, NULL, &parent);
if (!link)
return NULL;
/*
* Get next node of VA to check if merging can be done.
*/
next = get_va_next_sibling(parent, link);
if (unlikely(next == NULL))
goto insert;
/*
* start end
* | |
* |<------VA------>|<-----Next----->|
* | |
* start end
*/
if (next != head) { sibling = list_entry(next, struct vmap_area, list);
if (sibling->va_start == va->va_end) {
sibling->va_start = va->va_start;
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
/* Point to the new merged area. */
va = sibling;
merged = true;
}
}
/*
* start end
* | |
* |<-----Prev----->|<------VA------>|
* | |
* start end
*/
if (next->prev != head) {
sibling = list_entry(next->prev, struct vmap_area, list);
if (sibling->va_end == va->va_start) {
/*
* If both neighbors are coalesced, it is important
* to unlink the "next" node first, followed by merging
* with "previous" one. Otherwise the tree might not be
* fully populated if a sibling's augmented value is
* "normalized" because of rotation operations.
*/
if (merged)
unlink_va(va, root);
sibling->va_end = va->va_end;
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
/* Point to the new merged area. */
va = sibling;
merged = true;
}
}
insert:
if (!merged)
link_va(va, root, parent, link, head);
return va;
}
static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
struct rb_root *root, struct list_head *head)
{
va = merge_or_add_vmap_area(va, root, head);
if (va)
augment_tree_propagate_from(va);
return va;
}
static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
unsigned long align, unsigned long vstart)
{
unsigned long nva_start_addr;
if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align);
else
nva_start_addr = ALIGN(vstart, align);
/* Can be overflowed due to big size or alignment. */
if (nva_start_addr + size < nva_start_addr ||
nva_start_addr < vstart)
return false;
return (nva_start_addr + size <= va->va_end);
}
/*
* Find the first free block(lowest start address) in the tree,
* that will accomplish the request corresponding to passing
* parameters.
*/
static __always_inline struct vmap_area *
find_vmap_lowest_match(unsigned long size,
unsigned long align, unsigned long vstart)
{
struct vmap_area *va;
struct rb_node *node;
unsigned long length;
/* Start from the root. */
node = free_vmap_area_root.rb_node;
/* Adjust the search size for alignment overhead. */
length = size + align - 1;
while (node) { va = rb_entry(node, struct vmap_area, rb_node); if (get_subtree_max_size(node->rb_left) >= length &&
vstart < va->va_start) {
node = node->rb_left;
} else {
if (is_within_this_va(va, size, align, vstart))
return va;
/*
* Does not make sense to go deeper towards the right
* sub-tree if it does not have a free block that is
* equal or bigger to the requested search length.
*/
if (get_subtree_max_size(node->rb_right) >= length) {
node = node->rb_right;
continue;
}
/*
* OK. We roll back and find the first right sub-tree,
* that will satisfy the search criteria. It can happen
* only once due to "vstart" restriction.
*/
while ((node = rb_parent(node))) { va = rb_entry(node, struct vmap_area, rb_node);
if (is_within_this_va(va, size, align, vstart))
return va;
if (get_subtree_max_size(node->rb_right) >= length &&
vstart <= va->va_start) {
node = node->rb_right;
break;
}
}
}
}
return NULL;
}
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>
static struct vmap_area *
find_vmap_lowest_linear_match(unsigned long size,
unsigned long align, unsigned long vstart)
{
struct vmap_area *va;
list_for_each_entry(va, &free_vmap_area_list, list) {
if (!is_within_this_va(va, size, align, vstart))
continue;
return va;
}
return NULL;
}
static void
find_vmap_lowest_match_check(unsigned long size)
{
struct vmap_area *va_1, *va_2;
unsigned long vstart;
unsigned int rnd;
get_random_bytes(&rnd, sizeof(rnd));
vstart = VMALLOC_START + rnd;
va_1 = find_vmap_lowest_match(size, 1, vstart);
va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
if (va_1 != va_2)
pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
va_1, va_2, vstart);
}
#endif
enum fit_type {
NOTHING_FIT = 0,
FL_FIT_TYPE = 1, /* full fit */
LE_FIT_TYPE = 2, /* left edge fit */
RE_FIT_TYPE = 3, /* right edge fit */
NE_FIT_TYPE = 4 /* no edge fit */
};
static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
unsigned long nva_start_addr, unsigned long size)
{
enum fit_type type;
/* Check if it is within VA. */
if (nva_start_addr < va->va_start || nva_start_addr + size > va->va_end)
return NOTHING_FIT;
/* Now classify. */
if (va->va_start == nva_start_addr) { if (va->va_end == nva_start_addr + size)
type = FL_FIT_TYPE;
else
type = LE_FIT_TYPE;
} else if (va->va_end == nva_start_addr + size) {
type = RE_FIT_TYPE;
} else {
type = NE_FIT_TYPE;
}
return type;
}
static __always_inline int
adjust_va_to_fit_type(struct vmap_area *va,
unsigned long nva_start_addr, unsigned long size,
enum fit_type type)
{
struct vmap_area *lva = NULL;
if (type == FL_FIT_TYPE) {
/*
* No need to split VA, it fully fits.
*
* | |
* V NVA V
* |---------------|
*/
unlink_va(va, &free_vmap_area_root);
kmem_cache_free(vmap_area_cachep, va);
} else if (type == LE_FIT_TYPE) {
/*
* Split left edge of fit VA.
*
* | |
* V NVA V R
* |-------|-------|
*/
va->va_start += size;
} else if (type == RE_FIT_TYPE) {
/*
* Split right edge of fit VA.
*
* | |
* L V NVA V
* |-------|-------|
*/
va->va_end = nva_start_addr;
} else if (type == NE_FIT_TYPE) {
/*
* Split no edge of fit VA.
*
* | |
* L V NVA V R
* |---|-------|---|
*/
lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
if (unlikely(!lva)) {
/*
* For percpu allocator we do not do any pre-allocation
* and leave it as it is. The reason is it most likely
* never ends up with NE_FIT_TYPE splitting. In case of
* percpu allocations offsets and sizes are aligned to
* fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
* are its main fitting cases.
*
* There are a few exceptions though, as an example it is
* a first allocation (early boot up) when we have "one"
* big free space that has to be split.
*
* Also we can hit this path in case of regular "vmap"
* allocations, if "this" current CPU was not preloaded.
* See the comment in alloc_vmap_area() why. If so, then
* GFP_NOWAIT is used instead to get an extra object for
* split purpose. That is rare and most time does not
* occur.
*
* What happens if an allocation gets failed. Basically,
* an "overflow" path is triggered to purge lazily freed
* areas to free some memory, then, the "retry" path is
* triggered to repeat one more time. See more details
* in alloc_vmap_area() function.
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
return -1;
}
/*
* Build the remainder.
*/
lva->va_start = va->va_start;
lva->va_end = nva_start_addr;
/*
* Shrink this VA to remaining size.
*/
va->va_start = nva_start_addr + size;
} else {
return -1;
}
if (type != FL_FIT_TYPE) {
augment_tree_propagate_from(va);
if (lva) /* type == NE_FIT_TYPE */ insert_vmap_area_augment(lva, &va->rb_node,
&free_vmap_area_root, &free_vmap_area_list);
}
return 0;
}
/*
* Returns a start address of the newly allocated area, if success.
* Otherwise a vend is returned that indicates failure.
*/
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
unsigned long vstart, unsigned long vend)
{
unsigned long nva_start_addr;
struct vmap_area *va;
enum fit_type type;
int ret;
va = find_vmap_lowest_match(size, align, vstart);
if (unlikely(!va))
return vend;
if (va->va_start > vstart) nva_start_addr = ALIGN(va->va_start, align);
else
nva_start_addr = ALIGN(vstart, align);
/* Check the "vend" restriction. */
if (nva_start_addr + size > vend)
return vend;
/* Classify what we have found. */
type = classify_va_fit_type(va, nva_start_addr, size);
if (WARN_ON_ONCE(type == NOTHING_FIT))
return vend;
/* Update the free vmap_area. */
ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
if (ret)
return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
find_vmap_lowest_match_check(size);
#endif
return nva_start_addr;
}
/*
* Free a region of KVA allocated by alloc_vmap_area
*/
static void free_vmap_area(struct vmap_area *va)
{
/*
* Remove from the busy tree/list.
*/
spin_lock(&vmap_area_lock);
unlink_va(va, &vmap_area_root);
spin_unlock(&vmap_area_lock);
/*
* Insert/Merge it back to the free tree/list.
*/
spin_lock(&free_vmap_area_lock);
merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
spin_unlock(&free_vmap_area_lock);
}
static inline void
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
{
struct vmap_area *va = NULL;
/*
* Preload this CPU with one extra vmap_area object. It is used
* when fit type of free area is NE_FIT_TYPE. It guarantees that
* a CPU that does an allocation is preloaded.
*
* We do it in non-atomic context, thus it allows us to use more
* permissive allocation masks to be more stable under low memory
* condition and high memory pressure.
*/
if (!this_cpu_read(ne_fit_preload_node)) va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
spin_lock(lock);
if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va)) kmem_cache_free(vmap_area_cachep, va);
}
/*
* Allocate a region of KVA of the specified size and alignment, within the
* vstart and vend.
*/
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask)
{
struct vmap_area *va;
unsigned long freed;
unsigned long addr;
int purged = 0;
int ret;
BUG_ON(!size); BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); if (unlikely(!vmap_initialized))
return ERR_PTR(-EBUSY);
might_sleep();
gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
/*
* Only scan the relevant parts containing pointers to other objects
* to avoid false negatives.
*/
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
retry:
preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
addr = __alloc_vmap_area(size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
*/
if (unlikely(addr == vend))
goto overflow;
va->va_start = addr;
va->va_end = addr + size;
va->vm = NULL;
spin_lock(&vmap_area_lock);
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend);
ret = kasan_populate_vmalloc(addr, size);
if (ret) {
free_vmap_area(va);
return ERR_PTR(ret);
}
return va;
overflow:
if (!purged) { purge_vmap_area_lazy();
purged = 1;
goto retry;
}
freed = 0;
blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
if (freed > 0) {
purged = 0;
goto retry;
}
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
size);
kmem_cache_free(vmap_area_cachep, va);
return ERR_PTR(-EBUSY);
}
int register_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
/*
* lazy_max_pages is the maximum amount of virtual address space we gather up
* before attempting to purge with a TLB flush.
*
* There is a tradeoff here: a larger number will cover more kernel page tables
* and take slightly longer to purge, but it will linearly reduce the number of
* global TLB flushes that must be performed. It would seem natural to scale
* this number up linearly with the number of CPUs (because vmapping activity
* could also scale linearly with the number of CPUs), however it is likely
* that in practice, workloads might be constrained in other ways that mean
* vmap activity will not scale linearly with CPUs. Also, I want to be
* conservative and not introduce a big latency on huge systems, so go with
* a less aggressive log scale. It will still be an improvement over the old
* code, and it will be simple to change the scale factor if we find that it
* becomes a problem on bigger systems.
*/
static unsigned long lazy_max_pages(void)
{
unsigned int log;
log = fls(num_online_cpus());
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
/*
* Serialize vmap purging. There is no actual critical section protected
* by this look, but we want to avoid concurrent calls for performance
* reasons and to make the pcpu_get_vm_areas more deterministic.
*/
static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
#ifdef CONFIG_X86_64
/*
* called before a call to iounmap() if the caller wants vm_area_struct's
* immediately freed.
*/
void set_iounmap_nonlazy(void)
{
atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}
#endif /* CONFIG_X86_64 */
/*
* Purges all lazily-freed vmap areas.
*/
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
unsigned long resched_threshold;
struct list_head local_pure_list;
struct vmap_area *va, *n_va;
lockdep_assert_held(&vmap_purge_lock);
spin_lock(&purge_vmap_area_lock);
purge_vmap_area_root = RB_ROOT;
list_replace_init(&purge_vmap_area_list, &local_pure_list);
spin_unlock(&purge_vmap_area_lock);
if (unlikely(list_empty(&local_pure_list)))
return false;
start = min(start,
list_first_entry(&local_pure_list,
struct vmap_area, list)->va_start);
end = max(end,
list_last_entry(&local_pure_list,
struct vmap_area, list)->va_end);
flush_tlb_kernel_range(start, end);
resched_threshold = lazy_max_pages() << 1;
spin_lock(&free_vmap_area_lock);
list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
unsigned long orig_start = va->va_start;
unsigned long orig_end = va->va_end;
/*
* Finally insert or merge lazily-freed area. It is
* detached and there is no need to "unlink" it from
* anything.
*/
va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
&free_vmap_area_list);
if (!va)
continue;
if (is_vmalloc_or_module_addr((void *)orig_start))
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
atomic_long_sub(nr, &vmap_lazy_nr);
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
cond_resched_lock(&free_vmap_area_lock);
}
spin_unlock(&free_vmap_area_lock);
return true;
}
/*
* Kick off a purge of the outstanding lazy areas. Don't bother if somebody
* is already purging.
*/
static void try_purge_vmap_area_lazy(void)
{
if (mutex_trylock(&vmap_purge_lock)) { __purge_vmap_area_lazy(ULONG_MAX, 0);
mutex_unlock(&vmap_purge_lock);
}
}
/*
* Kick off a purge of the outstanding lazy areas.
*/
static void purge_vmap_area_lazy(void)
{
mutex_lock(&vmap_purge_lock);
purge_fragmented_blocks_allcpus();
__purge_vmap_area_lazy(ULONG_MAX, 0);
mutex_unlock(&vmap_purge_lock);
}
/*
* Free a vmap area, caller ensuring that the area has been unmapped
* and flush_cache_vunmap had been called for the correct range
* previously.
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
unsigned long nr_lazy;
spin_lock(&vmap_area_lock);
unlink_va(va, &vmap_area_root);
spin_unlock(&vmap_area_lock);
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
PAGE_SHIFT, &vmap_lazy_nr);
/*
* Merge or place it to the purge tree/list.
*/
spin_lock(&purge_vmap_area_lock);
merge_or_add_vmap_area(va,
&purge_vmap_area_root, &purge_vmap_area_list);
spin_unlock(&purge_vmap_area_lock);
/* After this point, we may free va at any time */
if (unlikely(nr_lazy > lazy_max_pages()))
try_purge_vmap_area_lazy();
}
/*
* Free and unmap a vmap area
*/
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
vunmap_range_noflush(va->va_start, va->va_end);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
free_vmap_area_noflush(va);
}
static struct vmap_area *find_vmap_area(unsigned long addr)
{
struct vmap_area *va;
spin_lock(&vmap_area_lock);
va = __find_vmap_area(addr);
spin_unlock(&vmap_area_lock);
return va;
}
/*** Per cpu kva allocator ***/
/*
* vmap space is limited especially on 32 bit architectures. Ensure there is
* room for at least 16 percpu vmap blocks per CPU.
*/
/*
* If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
* to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
* instead (we just need a rough idea)
*/
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE (128UL*1024*1024)
#else
#define VMALLOC_SPACE (128UL*1024*1024*1024)
#endif
#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS \
VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
struct vmap_block_queue {
spinlock_t lock;
struct list_head free;
};
struct vmap_block {
spinlock_t lock;
struct vmap_area *va;
unsigned long free, dirty;
unsigned long dirty_min, dirty_max; /*< dirty range */
struct list_head free_list;
struct rcu_head rcu_head;
struct list_head purge;
};
/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
/*
* XArray of vmap blocks, indexed by address, to quickly find a vmap block
* in the free path. Could get rid of this if we change the API to return a
* "cookie" from alloc, to be passed to free. But no big deal yet.
*/
static DEFINE_XARRAY(vmap_blocks);
/*
* We should probably have a fallback mechanism to allocate virtual memory
* out of partially filled vmap blocks. However vmap block sizing should be
* fairly reasonable according to the vmalloc size, so it shouldn't be a
* big problem.
*/
static unsigned long addr_to_vb_idx(unsigned long addr)
{
addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
addr /= VMAP_BLOCK_SIZE;
return addr;
}
static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
unsigned long addr;
addr = va_start + (pages_off << PAGE_SHIFT);
BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
return (void *)addr;
}
/**
* new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
* block. Of course pages number can't exceed VMAP_BBMAP_BITS
* @order: how many 2^order pages should be occupied in newly allocated block
* @gfp_mask: flags for the page level allocator
*
* Return: virtual address in a newly allocated block or ERR_PTR(-errno)
*/
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
struct vmap_area *va;
unsigned long vb_idx;
int node, err;
void *vaddr;
node = numa_node_id();
vb = kmalloc_node(sizeof(struct vmap_block),
gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!vb))
return ERR_PTR(-ENOMEM);
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
}
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
/* At least something should be left free */
BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
vb->free = VMAP_BBMAP_BITS - (1UL << order);
vb->dirty = 0;
vb->dirty_min = VMAP_BBMAP_BITS;
vb->dirty_max = 0;
INIT_LIST_HEAD(&vb->free_list);
vb_idx = addr_to_vb_idx(va->va_start);
err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
if (err) {
kfree(vb);
free_vmap_area(va);
return ERR_PTR(err);
}
vbq = &get_cpu_var(vmap_block_queue);
spin_lock(&vbq->lock);
list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
put_cpu_var(vmap_block_queue);
return vaddr;
}
static void free_vmap_block(struct vmap_block *vb)
{
struct vmap_block *tmp;
tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
free_vmap_area_noflush(vb->va);
kfree_rcu(vb, rcu_head);
}
static void purge_fragmented_blocks(int cpu)
{
LIST_HEAD(purge);
struct vmap_block *vb;
struct vmap_block *n_vb;
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
continue;
spin_lock(&vb->lock);
if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
vb->free = 0; /* prevent further allocs after releasing lock */
vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
vb->dirty_min = 0;
vb->dirty_max = VMAP_BBMAP_BITS;
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
spin_unlock(&vbq->lock);
spin_unlock(&vb->lock);
list_add_tail(&vb->purge, &purge);
} else
spin_unlock(&vb->lock);
}
rcu_read_unlock();
list_for_each_entry_safe(vb, n_vb, &purge, purge) {
list_del(&vb->purge);
free_vmap_block(vb);
}
}
static void purge_fragmented_blocks_allcpus(void)
{
int cpu;
for_each_possible_cpu(cpu)
purge_fragmented_blocks(cpu);
}
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
void *vaddr = NULL;
unsigned int order;
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
if (WARN_ON(size == 0)) {
/*
* Allocating 0 bytes isn't what caller wants since
* get_order(0) returns funny result. Just warn and terminate
* early.
*/
return NULL;
}
order = get_order(size);
rcu_read_lock();
vbq = &get_cpu_var(vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
unsigned long pages_off;
spin_lock(&vb->lock);
if (vb->free < (1UL << order)) {
spin_unlock(&vb->lock);
continue;
}
pages_off = VMAP_BBMAP_BITS - vb->free;
vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
vb->free -= 1UL << order;
if (vb->free == 0) {
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
spin_unlock(&vbq->lock);
}
spin_unlock(&vb->lock);
break;
}
put_cpu_var(vmap_block_queue);
rcu_read_unlock();
/* Allocate new block if nothing was found */
if (!vaddr)
vaddr = new_vmap_block(order, gfp_mask);
return vaddr;
}
static void vb_free(unsigned long addr, unsigned long size)
{
unsigned long offset;
unsigned int order;
struct vmap_block *vb;
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
flush_cache_vunmap(addr, addr + size);
order = get_order(size);
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
vunmap_range_noflush(addr, addr + size);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(addr, addr + size);
spin_lock(&vb->lock);
/* Expand dirty range */
vb->dirty_min = min(vb->dirty_min, offset);
vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
vb->dirty += 1UL << order;
if (vb->dirty == VMAP_BBMAP_BITS) {
BUG_ON(vb->free);
spin_unlock(&vb->lock);
free_vmap_block(vb);
} else
spin_unlock(&vb->lock);
}
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
int cpu;
if (unlikely(!vmap_initialized))
return;
might_sleep();
for_each_possible_cpu(cpu) {
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
struct vmap_block *vb;
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
spin_lock(&vb->lock);
if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
unsigned long va_start = vb->va->va_start;
unsigned long s, e;
s = va_start + (vb->dirty_min << PAGE_SHIFT);
e = va_start + (vb->dirty_max << PAGE_SHIFT);
start = min(s, start);
end = max(e, end);
flush = 1;
}
spin_unlock(&vb->lock);
}
rcu_read_unlock();
}
mutex_lock(&vmap_purge_lock);
purge_fragmented_blocks_allcpus();
if (!__purge_vmap_area_lazy(start, end) && flush)
flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock);
}
/**
* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
*
* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
* to amortize TLB flushing overheads. What this means is that any page you
* have now, may, in a former life, have been mapped into kernel virtual
* address by the vmap layer and so there might be some CPUs with TLB entries
* still referencing that page (additional to the regular 1:1 kernel mapping).
*
* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
* be sure that none of the pages we have control over will have any aliases
* from the vmap layer.
*/
void vm_unmap_aliases(void)
{
unsigned long start = ULONG_MAX, end = 0;
int flush = 0;
_vm_unmap_aliases(start, end, flush);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
/**
* vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
* @mem: the pointer returned by vm_map_ram
* @count: the count passed to that vm_map_ram call (cannot unmap partial)
*/
void vm_unmap_ram(const void *mem, unsigned int count)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
unsigned long addr = (unsigned long)mem;
struct vmap_area *va;
might_sleep();
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
BUG_ON(!PAGE_ALIGNED(addr));
kasan_poison_vmalloc(mem, size);
if (likely(count <= VMAP_MAX_ALLOC)) {
debug_check_no_locks_freed(mem, size);
vb_free(addr, size);
return;
}
va = find_vmap_area(addr);
BUG_ON(!va);
debug_check_no_locks_freed((void *)va->va_start,
(va->va_end - va->va_start));
free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);
/**
* vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
* @pages: an array of pointers to the pages to be mapped
* @count: number of pages
* @node: prefer to allocate data structures on this node
*
* If you use this function for less than VMAP_MAX_ALLOC pages, it could be
* faster than vmap so it's good. But if you mix long-life and short-life
* objects with vm_map_ram(), it could consume lots of address space through
* fragmentation (especially on a 32bit machine). You could see failures in
* the end. Please use this function for short-lived objects.
*
* Returns: a pointer to the address that has been mapped, or %NULL on failure
*/
void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
unsigned long addr;
void *mem;
if (likely(count <= VMAP_MAX_ALLOC)) {
mem = vb_alloc(size, GFP_KERNEL);
if (IS_ERR(mem))
return NULL;
addr = (unsigned long)mem;
} else {
struct vmap_area *va;
va = alloc_vmap_area(size, PAGE_SIZE,
VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
if (IS_ERR(va))
return NULL;
addr = va->va_start;
mem = (void *)addr;
}
kasan_unpoison_vmalloc(mem, size);
if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
pages, PAGE_SHIFT) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
return mem;
}
EXPORT_SYMBOL(vm_map_ram);
static struct vm_struct *vmlist __initdata;
static inline unsigned int vm_area_page_order(struct vm_struct *vm)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
return vm->page_order;
#else
return 0;
#endif
}
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
vm->page_order = order;
#else
BUG_ON(order != 0);
#endif
}
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
*
* This function is used to add fixed kernel vm area to vmlist before
* vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
* should contain proper values and the other fields should be zero.
*
* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
*/
void __init vm_area_add_early(struct vm_struct *vm)
{
struct vm_struct *tmp, **p;
BUG_ON(vmap_initialized);
for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
if (tmp->addr >= vm->addr) {
BUG_ON(tmp->addr < vm->addr + vm->size);
break;
} else
BUG_ON(tmp->addr + tmp->size > vm->addr);
}
vm->next = *p;
*p = vm;
}
/**
* vm_area_register_early - register vmap area early during boot
* @vm: vm_struct to register
* @align: requested alignment
*
* This function is used to register kernel vm area before
* vmalloc_init() is called. @vm->size and @vm->flags should contain
* proper values on entry and other fields should be zero. On return,
* vm->addr contains the allocated address.
*
* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
*/
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
static size_t vm_init_off __initdata;
unsigned long addr;
addr = ALIGN(VMALLOC_START + vm_init_off, align);
vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
vm->addr = (void *)addr;
vm_area_add_early(vm);
}
static void vmap_init_free_space(void)
{
unsigned long vmap_start = 1;
const unsigned long vmap_end = ULONG_MAX;
struct vmap_area *busy, *free;
/*
* B F B B B F
* -|-----|.....|-----|-----|-----|.....|-
* | The KVA space |
* |<--------------------------------->|
*/
list_for_each_entry(busy, &vmap_area_list, list) {
if (busy->va_start - vmap_start > 0) {
free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
if (!WARN_ON_ONCE(!free)) {
free->va_start = vmap_start;
free->va_end = busy->va_start;
insert_vmap_area_augment(free, NULL,
&free_vmap_area_root,
&free_vmap_area_list);
}
}
vmap_start = busy->va_end;
}
if (vmap_end - vmap_start > 0) {
free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
if (!WARN_ON_ONCE(!free)) {
free->va_start = vmap_start;
free->va_end = vmap_end;
insert_vmap_area_augment(free, NULL,
&free_vmap_area_root,
&free_vmap_area_list);
}
}
}
void __init vmalloc_init(void)
{
struct vmap_area *va;
struct vm_struct *tmp;
int i;
/*
* Create the cache for vmap_area objects.
*/
vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
for_each_possible_cpu(i) {
struct vmap_block_queue *vbq;
struct vfree_deferred *p;
vbq = &per_cpu(vmap_block_queue, i);
spin_lock_init(&vbq->lock);
INIT_LIST_HEAD(&vbq->free);
p = &per_cpu(vfree_deferred, i);
init_llist_head(&p->list);
INIT_WORK(&p->wq, free_work);
}
/* Import existing vmlist entries. */
for (tmp = vmlist; tmp; tmp = tmp->next) {
va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
if (WARN_ON_ONCE(!va))
continue;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
va->vm = tmp;
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
}
/*
* Now we can initialize a free vmap space.
*/
vmap_init_free_space();
vmap_initialized = true;
}
static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
}
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, const void *caller)
{
spin_lock(&vmap_area_lock);
setup_vmalloc_vm_locked(vm, va, flags, caller);
spin_unlock(&vmap_area_lock);
}
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
* Before removing VM_UNINITIALIZED,
* we should make sure that vm has proper values.
* Pair with smp_rmb() in show_numa_info().
*/
smp_wmb();
vm->flags &= ~VM_UNINITIALIZED;
}
static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long shift, unsigned long flags,
unsigned long start, unsigned long end, int node,
gfp_t gfp_mask, const void *caller)
{
struct vmap_area *va;
struct vm_struct *area;
unsigned long requested_size = size;
BUG_ON(in_interrupt()); size = ALIGN(size, 1ul << shift);
if (unlikely(!size))
return NULL;
if (flags & VM_IOREMAP)
align = 1ul << clamp_t(int, get_count_order_long(size),
PAGE_SHIFT, IOREMAP_MAX_ORDER);
area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
return NULL;
if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
if (IS_ERR(va)) {
kfree(area);
return NULL;
}
kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
setup_vmalloc_vm(area, va, flags, caller);
return area;
}
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
const void *caller)
{
return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
NUMA_NO_NODE, GFP_KERNEL, caller);
}
/**
* get_vm_area - reserve a contiguous kernel virtual area
* @size: size of the area
* @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
*
* Search an area of @size in the kernel virtual mapping area,
* and reserved it for out purposes. Returns the area descriptor
* on success or %NULL on failure.
*
* Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL,
__builtin_return_address(0));
}
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
const void *caller)
{
return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL, caller);
}
/**
* find_vm_area - find a continuous kernel virtual area
* @addr: base address
*
* Search for the kernel VM area starting at @addr, and return it.
* It is up to the caller to do all required locking to keep the returned
* pointer valid.
*
* Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *find_vm_area(const void *addr)
{
struct vmap_area *va;
va = find_vmap_area((unsigned long)addr);
if (!va)
return NULL;
return va->vm;
}
/**
* remove_vm_area - find and remove a continuous kernel virtual area
* @addr: base address
*
* Search for the kernel VM area starting at @addr, and remove it.
* This function returns the found VM area, but using it is NOT safe
* on SMP machines, except for its size or flags.
*
* Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *remove_vm_area(const void *addr)
{
struct vmap_area *va;
might_sleep();
spin_lock(&vmap_area_lock);
va = __find_vmap_area((unsigned long)addr);
if (va && va->vm) {
struct vm_struct *vm = va->vm;
va->vm = NULL;
spin_unlock(&vmap_area_lock);
kasan_free_shadow(vm);
free_unmap_vmap_area(va);
return vm;
}
spin_unlock(&vmap_area_lock);
return NULL;
}
static inline void set_area_direct_map(const struct vm_struct *area,
int (*set_direct_map)(struct page *page))
{
int i;
/* HUGE_VMALLOC passes small pages to set_direct_map */
for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]);
}
/* Handle removing and resetting vm mappings related to the vm_struct. */
static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
{
unsigned long start = ULONG_MAX, end = 0;
unsigned int page_order = vm_area_page_order(area);
int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
int flush_dmap = 0;
int i;
remove_vm_area(area->addr);
/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
if (!flush_reset)
return;
/*
* If not deallocating pages, just do the flush of the VM area and
* return.
*/
if (!deallocate_pages) {
vm_unmap_aliases();
return;
}
/*
* If execution gets here, flush the vm mapping and reset the direct
* map. Find the start and end range of the direct mappings to make sure
* the vm_unmap_aliases() flush includes the direct map.
*/
for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]);
if (addr) {
unsigned long page_size;
page_size = PAGE_SIZE << page_order;
start = min(addr, start);
end = max(addr + page_size, end);
flush_dmap = 1;
}
}
/*
* Set direct map to something invalid so that it won't be cached if
* there are any accesses after the TLB flush, then flush the TLB and
* reset the direct map permissions to the default.
*/
set_area_direct_map(area, set_direct_map_invalid_noflush);
_vm_unmap_aliases(start, end, flush_dmap);
set_area_direct_map(area, set_direct_map_default_noflush);
}
static void __vunmap(const void *addr, int deallocate_pages)
{
struct vm_struct *area;
if (!addr)
return;
if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
addr))
return;
area = find_vm_area(addr);
if (unlikely(!area)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
return;
}
debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
unsigned int page_order = vm_area_page_order(area);
int i;
for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page = area->pages[i]; BUG_ON(!page); __free_pages(page, page_order);
cond_resched();
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
kvfree(area->pages);
}
kfree(area);
}
static inline void __vfree_deferred(const void *addr)
{
/*
* Use raw_cpu_ptr() because this can be called from preemptible
* context. Preemption is absolutely fine here, because the llist_add()
* implementation is lockless, so it works even if we are adding to
* another cpu's list. schedule_work() should be fine with this too.
*/
struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
if (llist_add((struct llist_node *)addr, &p->list))
schedule_work(&p->wq);
}
/**
* vfree_atomic - release memory allocated by vmalloc()
* @addr: memory base address
*
* This one is just like vfree() but can be called in any atomic context
* except NMIs.
*/
void vfree_atomic(const void *addr)
{
BUG_ON(in_nmi());
kmemleak_free(addr);
if (!addr)
return;
__vfree_deferred(addr);
}
static void __vfree(const void *addr)
{
if (unlikely(in_interrupt()))
__vfree_deferred(addr);
else
__vunmap(addr, 1);
}
/**
* vfree - Release memory allocated by vmalloc()
* @addr: Memory base address
*
* Free the virtually continuous memory area starting at @addr, as obtained
* from one of the vmalloc() family of APIs. This will usually also free the
* physical memory underlying the virtual allocation, but that memory is
* reference counted, so it will not be freed until the last user goes away.
*
* If @addr is NULL, no operation is performed.
*
* Context:
* May sleep if called *not* from interrupt context.
* Must not be called in NMI context (strictly speaking, it could be
* if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
* conventions for vfree() arch-dependent would be a really bad idea).
*/
void vfree(const void *addr)
{
BUG_ON(in_nmi());
kmemleak_free(addr);
might_sleep_if(!in_interrupt()); if (!addr)
return;
__vfree(addr);
}
EXPORT_SYMBOL(vfree);
/**
* vunmap - release virtual mapping obtained by vmap()
* @addr: memory base address
*
* Free the virtually contiguous memory area starting at @addr,
* which was created from the page array passed to vmap().
*
* Must not be called in interrupt context.
*/
void vunmap(const void *addr)
{
BUG_ON(in_interrupt());
might_sleep();
if (addr)
__vunmap(addr, 0);
}
EXPORT_SYMBOL(vunmap);
/**
* vmap - map an array of pages into virtually contiguous space
* @pages: array of page pointers
* @count: number of pages to map
* @flags: vm_area->flags
* @prot: page protection for the mapping
*
* Maps @count pages from @pages into contiguous kernel virtual space.
* If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
* (which must be kmalloc or vmalloc memory) and one reference per pages in it
* are transferred from the caller to vmap(), and will be freed / dropped when
* vfree() is called on the return value.
*
* Return: the address of the area or %NULL on failure
*/
void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot)
{
struct vm_struct *area;
unsigned long addr;
unsigned long size; /* In bytes */
might_sleep();
if (count > totalram_pages())
return NULL;
size = (unsigned long)count << PAGE_SHIFT;
area = get_vm_area_caller(size, flags, __builtin_return_address(0));
if (!area)
return NULL;
addr = (unsigned long)area->addr;
if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
pages, PAGE_SHIFT) < 0) {
vunmap(area->addr);
return NULL;
}
if (flags & VM_MAP_PUT_PAGES) {
area->pages = pages;
area->nr_pages = count;
}
return area->addr;
}
EXPORT_SYMBOL(vmap);
#ifdef CONFIG_VMAP_PFN
struct vmap_pfn_data {
unsigned long *pfns;
pgprot_t prot;
unsigned int idx;
};
static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
{
struct vmap_pfn_data *data = private;
if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
return -EINVAL;
*pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
return 0;
}
/**
* vmap_pfn - map an array of PFNs into virtually contiguous space
* @pfns: array of PFNs
* @count: number of pages to map
* @prot: page protection for the mapping
*
* Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
* the start address of the mapping.
*/
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
{
struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
struct vm_struct *area;
area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
__builtin_return_address(0));
if (!area)
return NULL;
if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
count * PAGE_SIZE, vmap_pfn_apply, &data)) {
free_vm_area(area);
return NULL;
}
return area->addr;
}
EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */
static inline unsigned int
vm_area_alloc_pages(gfp_t gfp, int nid,
unsigned int order, unsigned int nr_pages, struct page **pages)
{
unsigned int nr_allocated = 0;
struct page *page;
int i;
/*
* For order-0 pages we make use of bulk allocator, if
* the page array is partly or not at all populated due
* to fails, fallback to a single page allocator that is
* more permissive.
*/
if (!order && nid != NUMA_NO_NODE) { while (nr_allocated < nr_pages) {
unsigned int nr, nr_pages_request;
/*
* A maximum allowed request is hard-coded and is 100
* pages per call. That is done in order to prevent a
* long preemption off scenario in the bulk-allocator
* so the range is [1:100].
*/
nr_pages_request = min(100U, nr_pages - nr_allocated);
nr = alloc_pages_bulk_array_node(gfp, nid,
nr_pages_request, pages + nr_allocated);
nr_allocated += nr;
cond_resched();
/*
* If zero or pages were obtained partly,
* fallback to a single page allocator.
*/
if (nr != nr_pages_request)
break;
}
} else if (order)
/*
* Compound pages required for remap_vmalloc_page if
* high-order pages.
*/
gfp |= __GFP_COMP;
/* High-order pages or fallback path if "bulk" fails. */
while (nr_allocated < nr_pages) { if (nid == NUMA_NO_NODE) page = alloc_pages(gfp, order);
else
page = alloc_pages_node(nid, gfp, order);
if (unlikely(!page))
break;
/*
* Careful, we allocate and map page-order pages, but
* tracking is done per PAGE_SIZE page so as to keep the
* vm_struct APIs independent of the physical/mapped size.
*/
for (i = 0; i < (1U << order); i++)
pages[nr_allocated + i] = page + i;
cond_resched();
nr_allocated += 1U << order;
}
return nr_allocated;
}
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, unsigned int page_shift,
int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
unsigned long addr = (unsigned long)area->addr;
unsigned long size = get_vm_area_size(area);
unsigned long array_size;
unsigned int nr_small_pages = size >> PAGE_SHIFT;
unsigned int page_order;
array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
gfp_mask |= __GFP_NOWARN;
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) { area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
area->caller);
} else {
area->pages = kmalloc_node(array_size, nested_gfp, node);
}
if (!area->pages) { warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocated page array size %lu",
nr_small_pages * PAGE_SIZE, array_size);
free_vm_area(area);
return NULL;
}
set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
page_order = vm_area_page_order(area);
area->nr_pages = vm_area_alloc_pages(gfp_mask, node,
page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
/*
* If not enough pages were obtained to accomplish an
* allocation request, free them via __vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, page order %u, failed to allocate pages",
area->nr_pages * PAGE_SIZE, page_order);
goto fail;
}
if (vmap_pages_range(addr, addr + size, prot, area->pages,
page_shift) < 0) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to map pages",
area->nr_pages * PAGE_SIZE);
goto fail;
}
return area->addr;
fail:
__vfree(area->addr);
return NULL;
}
/**
* __vmalloc_node_range - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment
* @start: vm area range start
* @end: vm area range end
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
* @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
* @node: node to use for allocation or NUMA_NO_NODE
* @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*
* Return: the address of the area or %NULL on failure
*/
void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller)
{
struct vm_struct *area;
void *addr;
unsigned long real_size = size;
unsigned long real_align = align;
unsigned int shift = PAGE_SHIFT;
if (WARN_ON_ONCE(!size))
return NULL;
if ((size >> PAGE_SHIFT) > totalram_pages()) { warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, exceeds total pages",
real_size);
return NULL;
}
if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) {
unsigned long size_per_node;
/*
* Try huge pages. Only try for PAGE_KERNEL allocations,
* others like modules don't yet expect huge pages in
* their allocations due to apply_to_page_range not
* supporting them.
*/
size_per_node = size;
if (node == NUMA_NO_NODE)
size_per_node /= num_online_nodes();
if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
shift = PMD_SHIFT;
else
shift = arch_vmap_pte_supported_shift(size_per_node);
align = max(real_align, 1UL << shift);
size = ALIGN(real_size, 1UL << shift);
}
again:
area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
VM_UNINITIALIZED | vm_flags, start, end, node,
gfp_mask, caller);
if (!area) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, vm_struct allocation failed",
real_size);
goto fail;
}
addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
if (!addr)
goto fail;
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
* flag. It means that vm_struct is not fully initialized.
* Now, it is fully initialized, so remove this flag here.
*/
clear_vm_uninitialized_flag(area);
size = PAGE_ALIGN(size);
if (!(vm_flags & VM_DEFER_KMEMLEAK))
kmemleak_vmalloc(area, size, gfp_mask);
return addr;
fail:
if (shift > PAGE_SHIFT) {
shift = PAGE_SHIFT;
align = real_align;
size = real_size;
goto again;
}
return NULL;
}
/**
* __vmalloc_node - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment
* @gfp_mask: flags for the page level allocator
* @node: node to use for allocation or NUMA_NO_NODE
* @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level allocator with
* @gfp_mask flags. Map them into contiguous kernel virtual space.
*
* Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
* and __GFP_NOFAIL are not supported
*
* Any use of gfp flags outside of GFP_KERNEL should be consulted
* with mm people.
*
* Return: pointer to the allocated memory or %NULL on error
*/
void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, int node, const void *caller)
{
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller);
}
/*
* This is only for performance analysis of vmalloc and stress purpose.
* It is required by vmalloc test module, therefore do not use it other
* than that.
*/
#ifdef CONFIG_TEST_VMALLOC_MODULE
EXPORT_SYMBOL_GPL(__vmalloc_node);
#endif
void *__vmalloc(unsigned long size, gfp_t gfp_mask)
{
return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*
* Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc(unsigned long size)
{
return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc);
/**
* vmalloc_no_huge - allocate virtually contiguous memory using small pages
* @size: allocation size
*
* Allocate enough non-huge pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
* Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_no_huge(unsigned long size)
{
return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP,
NUMA_NO_NODE, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_no_huge);
/**
* vzalloc - allocate virtually contiguous memory with zero fill
* @size: allocation size
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
* The memory allocated is set to zero.
*
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*
* Return: pointer to the allocated memory or %NULL on error
*/
void *vzalloc(unsigned long size)
{
return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc);
/**
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
* @size: allocation size
*
* The resulting memory area is zeroed so it can be mapped to userspace
* without leaking data.
*
* Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_user(unsigned long size)
{
return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
VM_USERMAP, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_user);
/**
* vmalloc_node - allocate memory on a specific node
* @size: allocation size
* @node: numa node
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*
* Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_node(unsigned long size, int node)
{
return __vmalloc_node(size, 1, GFP_KERNEL, node,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);
/**
* vzalloc_node - allocate memory on a specific node with zero fill
* @size: allocation size
* @node: numa node
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
* The memory allocated is set to zero.
*
* Return: pointer to the allocated memory or %NULL on error
*/
void *vzalloc_node(unsigned long size, int node)
{
return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc_node);
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
/*
* 64b systems should always have either DMA or DMA32 zones. For others
* GFP_DMA32 should do the right thing and use the normal zone.
*/
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif
/**
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
* @size: allocation size
*
* Allocate enough 32bit PA addressable pages to cover @size from the
* page level allocator and map them into contiguous kernel virtual space.
*
* Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_32(unsigned long size)
{
return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);
/**
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
* @size: allocation size
*
* The resulting memory area is 32bit addressable and zeroed so it can be
* mapped to userspace without leaking data.
*
* Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_32_user(unsigned long size)
{
return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
VM_USERMAP, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32_user);
/*
* small helper routine , copy contents to buf from addr.
* If the page is not present, fill zero.
*/
static int aligned_vread(char *buf, char *addr, unsigned long count)
{
struct page *p;
int copied = 0;
while (count) {
unsigned long offset, length;
offset = offset_in_page(addr);
length = PAGE_SIZE - offset;
if (length > count)
length = count;
p = vmalloc_to_page(addr);
/*
* To do safe access to this _mapped_ area, we need
* lock. But adding lock here means that we need to add
* overhead of vmalloc()/vfree() calls for this _debug_
* interface, rarely used. Instead of that, we'll use
* kmap() and get small overhead in this access function.
*/
if (p) {
/* We can expect USER0 is not used -- see vread() */
void *map = kmap_atomic(p);
memcpy(buf, map + offset, length);
kunmap_atomic(map);
} else
memset(buf, 0, length);
addr += length;
buf += length;
copied += length;
count -= length;
}
return copied;
}
/**
* vread() - read vmalloc area in a safe way.
* @buf: buffer for reading data
* @addr: vm address.
* @count: number of bytes to be read.
*
* This function checks that addr is a valid vmalloc'ed area, and
* copy data from that area to a given buffer. If the given memory range
* of [addr...addr+count) includes some valid address, data is copied to
* proper area of @buf. If there are memory holes, they'll be zero-filled.
* IOREMAP area is treated as memory hole and no copy is done.
*
* If [addr...addr+count) doesn't includes any intersects with alive
* vm_struct area, returns 0. @buf should be kernel's buffer.
*
* Note: In usual ops, vread() is never necessary because the caller
* should know vmalloc() area is valid and can use memcpy().
* This is for routines which have to access vmalloc area without
* any information, as /proc/kcore.
*
* Return: number of bytes for which addr and buf should be increased
* (same number as @count) or %0 if [addr...addr+count) doesn't
* include any intersection with valid vmalloc area
*/
long vread(char *buf, char *addr, unsigned long count)
{
struct vmap_area *va;
struct vm_struct *vm;
char *vaddr, *buf_start = buf;
unsigned long buflen = count;
unsigned long n;
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
spin_lock(&vmap_area_lock);
va = find_vmap_area_exceed_addr((unsigned long)addr);
if (!va)
goto finished;
/* no intersects with alive vmap_area */
if ((unsigned long)addr + count <= va->va_start)
goto finished;
list_for_each_entry_from(va, &vmap_area_list, list) {
if (!count)
break;
if (!va->vm)
continue;
vm = va->vm;
vaddr = (char *) vm->addr;
if (addr >= vaddr + get_vm_area_size(vm))
continue;
while (addr < vaddr) {
if (count == 0)
goto finished;
*buf = '\0';
buf++;
addr++;
count--;
}
n = vaddr + get_vm_area_size(vm) - addr;
if (n > count)
n = count;
if (!(vm->flags & VM_IOREMAP))
aligned_vread(buf, addr, n);
else /* IOREMAP area is treated as memory hole */
memset(buf, 0, n);
buf += n;
addr += n;
count -= n;
}
finished:
spin_unlock(&vmap_area_lock);
if (buf == buf_start)
return 0;
/* zero-fill memory holes */
if (buf != buf_start + buflen)
memset(buf, 0, buflen - (buf - buf_start));
return buflen;
}
/**
* remap_vmalloc_range_partial - map vmalloc pages to userspace
* @vma: vma to cover
* @uaddr: target user address to start at
* @kaddr: virtual address of vmalloc kernel memory
* @pgoff: offset from @kaddr to start at
* @size: size of map area
*
* Returns: 0 for success, -Exxx on failure
*
* This function checks that @kaddr is a valid vmalloc'ed area,
* and that it is big enough to cover the range starting at
* @uaddr in @vma. Will return failure if that criteria isn't
* met.
*
* Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
void *kaddr, unsigned long pgoff,
unsigned long size)
{
struct vm_struct *area;
unsigned long off;
unsigned long end_index;
if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
return -EINVAL;
size = PAGE_ALIGN(size);
if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
return -EINVAL;
area = find_vm_area(kaddr);
if (!area)
return -EINVAL;
if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
return -EINVAL;
if (check_add_overflow(size, off, &end_index) ||
end_index > get_vm_area_size(area))
return -EINVAL;
kaddr += off;
do {
struct page *page = vmalloc_to_page(kaddr);
int ret;
ret = vm_insert_page(vma, uaddr, page);
if (ret)
return ret;
uaddr += PAGE_SIZE;
kaddr += PAGE_SIZE;
size -= PAGE_SIZE;
} while (size > 0);
vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
return 0;
}
/**
* remap_vmalloc_range - map vmalloc pages to userspace
* @vma: vma to cover (map full range of vma)
* @addr: vmalloc memory
* @pgoff: number of pages into addr before first page to map
*
* Returns: 0 for success, -Exxx on failure
*
* This function checks that addr is a valid vmalloc'ed area, and
* that it is big enough to cover the vma. Will return failure if
* that criteria isn't met.
*
* Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
{
return remap_vmalloc_range_partial(vma, vma->vm_start,
addr, pgoff,
vma->vm_end - vma->vm_start);
}
EXPORT_SYMBOL(remap_vmalloc_range);
void free_vm_area(struct vm_struct *area)
{
struct vm_struct *ret;
ret = remove_vm_area(area->addr);
BUG_ON(ret != area);
kfree(area);
}
EXPORT_SYMBOL_GPL(free_vm_area);
#ifdef CONFIG_SMP
static struct vmap_area *node_to_va(struct rb_node *n)
{
return rb_entry_safe(n, struct vmap_area, rb_node);
}
/**
* pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
* @addr: target address
*
* Returns: vmap_area if it is found. If there is no such area
* the first highest(reverse order) vmap_area is returned
* i.e. va->va_start < addr && va->va_end < addr or NULL
* if there are no any areas before @addr.
*/
static struct vmap_area *
pvm_find_va_enclose_addr(unsigned long addr)
{
struct vmap_area *va, *tmp;
struct rb_node *n;
n = free_vmap_area_root.rb_node;
va = NULL;
while (n) {
tmp = rb_entry(n, struct vmap_area, rb_node);
if (tmp->va_start <= addr) {
va = tmp;
if (tmp->va_end >= addr)
break;
n = n->rb_right;
} else {
n = n->rb_left;
}
}
return va;
}
/**
* pvm_determine_end_from_reverse - find the highest aligned address
* of free block below VMALLOC_END
* @va:
* in - the VA we start the search(reverse order);
* out - the VA with the highest aligned end address.
* @align: alignment for required highest address
*
* Returns: determined end address within vmap_area
*/
static unsigned long
pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
{
unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
unsigned long addr;
if (likely(*va)) {
list_for_each_entry_from_reverse((*va),
&free_vmap_area_list, list) {
addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
if ((*va)->va_start < addr)
return addr;
}
}
return 0;
}
/**
* pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
* @offsets: array containing offset of each area
* @sizes: array containing size of each area
* @nr_vms: the number of areas to allocate
* @align: alignment, all entries in @offsets and @sizes must be aligned to this
*
* Returns: kmalloc'd vm_struct pointer array pointing to allocated
* vm_structs on success, %NULL on failure
*
* Percpu allocator wants to use congruent vm areas so that it can
* maintain the offsets among percpu areas. This function allocates
* congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
* be scattered pretty far, distance between two areas easily going up
* to gigabytes. To avoid interacting with regular vmallocs, these
* areas are allocated from top.
*
* Despite its complicated look, this allocator is rather simple. It
* does everything top-down and scans free blocks from the end looking
* for matching base. While scanning, if any of the areas do not fit the
* base address is pulled down to fit the area. Scanning is repeated till
* all the areas fit and then all necessary data structures are inserted
* and the result is returned.
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
size_t align)
{
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
struct vmap_area **vas, *va;
struct vm_struct **vms;
int area, area2, last_area, term_area;
unsigned long base, start, size, end, last_end, orig_start, orig_end;
bool purged = false;
enum fit_type type;
/* verify parameters and allocate data structures */
BUG_ON(offset_in_page(align) || !is_power_of_2(align));
for (last_area = 0, area = 0; area < nr_vms; area++) {
start = offsets[area];
end = start + sizes[area];
/* is everything aligned properly? */
BUG_ON(!IS_ALIGNED(offsets[area], align));
BUG_ON(!IS_ALIGNED(sizes[area], align));
/* detect the area with the highest address */
if (start > offsets[last_area])
last_area = area;
for (area2 = area + 1; area2 < nr_vms; area2++) {
unsigned long start2 = offsets[area2];
unsigned long end2 = start2 + sizes[area2];
BUG_ON(start2 < end && start < end2);
}
}
last_end = offsets[last_area] + sizes[last_area];
if (vmalloc_end - vmalloc_start < last_end) {
WARN_ON(true);
return NULL;
}
vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
if (!vas || !vms)
goto err_free2;
for (area = 0; area < nr_vms; area++) {
vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
if (!vas[area] || !vms[area])
goto err_free;
}
retry:
spin_lock(&free_vmap_area_lock);
/* start scanning - we scan from the top, begin with the last area */
area = term_area = last_area;
start = offsets[area];
end = start + sizes[area];
va = pvm_find_va_enclose_addr(vmalloc_end);
base = pvm_determine_end_from_reverse(&va, align) - end;
while (true) {
/*
* base might have underflowed, add last_end before
* comparing.
*/
if (base + last_end < vmalloc_start + last_end)
goto overflow;
/*
* Fitting base has not been found.
*/
if (va == NULL)
goto overflow;
/*
* If required width exceeds current VA block, move
* base downwards and then recheck.
*/
if (base + end > va->va_end) {
base = pvm_determine_end_from_reverse(&va, align) - end;
term_area = area;
continue;
}
/*
* If this VA does not fit, move base downwards and recheck.
*/
if (base + start < va->va_start) {
va = node_to_va(rb_prev(&va->rb_node));
base = pvm_determine_end_from_reverse(&va, align) - end;
term_area = area;
continue;
}
/*
* This area fits, move on to the previous one. If
* the previous one is the terminal one, we're done.
*/
area = (area + nr_vms - 1) % nr_vms;
if (area == term_area)
break;
start = offsets[area];
end = start + sizes[area];
va = pvm_find_va_enclose_addr(base + end);
}
/* we've found a fitting base, insert all va's */
for (area = 0; area < nr_vms; area++) {
int ret;
start = base + offsets[area];
size = sizes[area];
va = pvm_find_va_enclose_addr(start);
if (WARN_ON_ONCE(va == NULL))
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
type = classify_va_fit_type(va, start, size);
if (WARN_ON_ONCE(type == NOTHING_FIT))
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
ret = adjust_va_to_fit_type(va, start, size, type);
if (unlikely(ret))
goto recovery;
/* Allocated area. */
va = vas[area];
va->va_start = start;
va->va_end = start + size;
}
spin_unlock(&free_vmap_area_lock);
/* populate the kasan shadow space */
for (area = 0; area < nr_vms; area++) {
if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
goto err_free_shadow;
kasan_unpoison_vmalloc((void *)vas[area]->va_start,
sizes[area]);
}
/* insert all vm's */
spin_lock(&vmap_area_lock);
for (area = 0; area < nr_vms; area++) {
insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
}
spin_unlock(&vmap_area_lock);
kfree(vas);
return vms;
recovery:
/*
* Remove previously allocated areas. There is no
* need in removing these areas from the busy tree,
* because they are inserted only on the final step
* and when pcpu_get_vm_areas() is success.
*/
while (area--) {
orig_start = vas[area]->va_start;
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
&free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
vas[area] = NULL;
}
overflow:
spin_unlock(&free_vmap_area_lock);
if (!purged) {
purge_vmap_area_lazy();
purged = true;
/* Before "retry", check if we recover. */
for (area = 0; area < nr_vms; area++) {
if (vas[area])
continue;
vas[area] = kmem_cache_zalloc(
vmap_area_cachep, GFP_KERNEL);
if (!vas[area])
goto err_free;
}
goto retry;
}
err_free:
for (area = 0; area < nr_vms; area++) {
if (vas[area])
kmem_cache_free(vmap_area_cachep, vas[area]);
kfree(vms[area]);
}
err_free2:
kfree(vas);
kfree(vms);
return NULL;
err_free_shadow:
spin_lock(&free_vmap_area_lock);
/*
* We release all the vmalloc shadows, even the ones for regions that
* hadn't been successfully added. This relies on kasan_release_vmalloc
* being able to tolerate this case.
*/
for (area = 0; area < nr_vms; area++) {
orig_start = vas[area]->va_start;
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
&free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
vas[area] = NULL;
kfree(vms[area]);
}
spin_unlock(&free_vmap_area_lock);
kfree(vas);
kfree(vms);
return NULL;
}
/**
* pcpu_free_vm_areas - free vmalloc areas for percpu allocator
* @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
* @nr_vms: the number of allocated areas
*
* Free vm_structs and the array allocated by pcpu_get_vm_areas().
*/
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
int i;
for (i = 0; i < nr_vms; i++)
free_vm_area(vms[i]);
kfree(vms);
}
#endif /* CONFIG_SMP */
#ifdef CONFIG_PRINTK
bool vmalloc_dump_obj(void *object)
{
struct vm_struct *vm;
void *objp = (void *)PAGE_ALIGN((unsigned long)object);
vm = find_vm_area(objp);
if (!vm)
return false;
pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
vm->nr_pages, (unsigned long)vm->addr, vm->caller);
return true;
}
#endif
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
__acquires(&vmap_purge_lock)
__acquires(&vmap_area_lock)
{
mutex_lock(&vmap_purge_lock);
spin_lock(&vmap_area_lock);
return seq_list_start(&vmap_area_list, *pos);
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
return seq_list_next(p, &vmap_area_list, pos);
}
static void s_stop(struct seq_file *m, void *p)
__releases(&vmap_area_lock)
__releases(&vmap_purge_lock)
{
spin_unlock(&vmap_area_lock);
mutex_unlock(&vmap_purge_lock);
}
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
if (IS_ENABLED(CONFIG_NUMA)) {
unsigned int nr, *counters = m->private;
if (!counters)
return;
if (v->flags & VM_UNINITIALIZED)
return;
/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
smp_rmb();
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
for (nr = 0; nr < v->nr_pages; nr++)
counters[page_to_nid(v->pages[nr])]++;
for_each_node_state(nr, N_HIGH_MEMORY)
if (counters[nr])
seq_printf(m, " N%u=%u", nr, counters[nr]);
}
}
static void show_purge_info(struct seq_file *m)
{
struct vmap_area *va;
spin_lock(&purge_vmap_area_lock);
list_for_each_entry(va, &purge_vmap_area_list, list) {
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
(void *)va->va_start, (void *)va->va_end,
va->va_end - va->va_start);
}
spin_unlock(&purge_vmap_area_lock);
}
static int s_show(struct seq_file *m, void *p)
{
struct vmap_area *va;
struct vm_struct *v;
va = list_entry(p, struct vmap_area, list);
/*
* s_show can encounter race with remove_vm_area, !vm on behalf
* of vmap area is being tear down or vm_map_ram allocation.
*/
if (!va->vm) {
seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
(void *)va->va_start, (void *)va->va_end,
va->va_end - va->va_start);
return 0;
}
v = va->vm;
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
if (v->caller)
seq_printf(m, " %pS", v->caller);
if (v->nr_pages)
seq_printf(m, " pages=%d", v->nr_pages);
if (v->phys_addr)
seq_printf(m, " phys=%pa", &v->phys_addr);
if (v->flags & VM_IOREMAP)
seq_puts(m, " ioremap");
if (v->flags & VM_ALLOC)
seq_puts(m, " vmalloc");
if (v->flags & VM_MAP)
seq_puts(m, " vmap");
if (v->flags & VM_USERMAP)
seq_puts(m, " user");
if (v->flags & VM_DMA_COHERENT)
seq_puts(m, " dma-coherent");
if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
show_numa_info(m, v);
seq_putc(m, '\n');
/*
* As a final step, dump "unpurged" areas.
*/
if (list_is_last(&va->list, &vmap_area_list))
show_purge_info(m);
return 0;
}
static const struct seq_operations vmalloc_op = {
.start = s_start,
.next = s_next,
.stop = s_stop,
.show = s_show,
};
static int __init proc_vmalloc_init(void)
{
if (IS_ENABLED(CONFIG_NUMA))
proc_create_seq_private("vmallocinfo", 0400, NULL,
&vmalloc_op,
nr_node_ids * sizeof(unsigned int), NULL);
else
proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
return 0;
}
module_init(proc_vmalloc_init);
#endif
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2008 Oracle. All rights reserved.
*/
#ifndef BTRFS_LOCKING_H
#define BTRFS_LOCKING_H
#include <linux/atomic.h>
#include <linux/wait.h>
#include <linux/percpu_counter.h>
#include "extent_io.h"
#define BTRFS_WRITE_LOCK 1
#define BTRFS_READ_LOCK 2
/*
* We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at
* the time of this patch is 8, which is how many we use. Keep this in mind if
* you decide you want to add another subclass.
*/
enum btrfs_lock_nesting {
BTRFS_NESTING_NORMAL,
/*
* When we COW a block we are holding the lock on the original block,
* and since our lockdep maps are rootid+level, this confuses lockdep
* when we lock the newly allocated COW'd block. Handle this by having
* a subclass for COW'ed blocks so that lockdep doesn't complain.
*/
BTRFS_NESTING_COW,
/*
* Oftentimes we need to lock adjacent nodes on the same level while
* still holding the lock on the original node we searched to, such as
* for searching forward or for split/balance.
*
* Because of this we need to indicate to lockdep that this is
* acceptable by having a different subclass for each of these
* operations.
*/
BTRFS_NESTING_LEFT,
BTRFS_NESTING_RIGHT,
/*
* When splitting we will be holding a lock on the left/right node when
* we need to cow that node, thus we need a new set of subclasses for
* these two operations.
*/
BTRFS_NESTING_LEFT_COW,
BTRFS_NESTING_RIGHT_COW,
/*
* When splitting we may push nodes to the left or right, but still use
* the subsequent nodes in our path, keeping our locks on those adjacent
* blocks. Thus when we go to allocate a new split block we've already
* used up all of our available subclasses, so this subclass exists to
* handle this case where we need to allocate a new split block.
*/
BTRFS_NESTING_SPLIT,
/*
* When promoting a new block to a root we need to have a special
* subclass so we don't confuse lockdep, as it will appear that we are
* locking a higher level node before a lower level one. Copying also
* has this problem as it appears we're locking the same block again
* when we make a snapshot of an existing root.
*/
BTRFS_NESTING_NEW_ROOT,
/*
* We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so
* add this in here and add a static_assert to keep us from going over
* the limit. As of this writing we're limited to 8, and we're
* definitely using 8, hence this check to keep us from messing up in
* the future.
*/
BTRFS_NESTING_MAX,
};
static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
"too many lock subclasses defined");
struct btrfs_path;
void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
lockdep_assert_held(&eb->lock);
}
#else
static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
#endif
void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
{
if (rw == BTRFS_WRITE_LOCK) btrfs_tree_unlock(eb); else if (rw == BTRFS_READ_LOCK) btrfs_tree_read_unlock(eb);
else
BUG();
}
struct btrfs_drew_lock {
atomic_t readers;
struct percpu_counter writers;
wait_queue_head_t pending_writers;
wait_queue_head_t pending_readers;
};
int btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock);
void btrfs_drew_write_lock(struct btrfs_drew_lock *lock);
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock);
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock);
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock);
void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock);
#endif
// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/percpu.c - percpu memory allocator
*
* Copyright (C) 2009 SUSE Linux Products GmbH
* Copyright (C) 2009 Tejun Heo <tj@kernel.org>
*
* Copyright (C) 2017 Facebook Inc.
* Copyright (C) 2017 Dennis Zhou <dennis@kernel.org>
*
* The percpu allocator handles both static and dynamic areas. Percpu
* areas are allocated in chunks which are divided into units. There is
* a 1-to-1 mapping for units to possible cpus. These units are grouped
* based on NUMA properties of the machine.
*
* c0 c1 c2
* ------------------- ------------------- ------------
* | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
* ------------------- ...... ------------------- .... ------------
*
* Allocation is done by offsets into a unit's address space. Ie., an
* area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
* c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear
* and even sparse. Access is handled by configuring percpu base
* registers according to the cpu to unit mappings and offsetting the
* base address using pcpu_unit_size.
*
* There is special consideration for the first chunk which must handle
* the static percpu variables in the kernel image as allocation services
* are not online yet. In short, the first chunk is structured like so:
*
* <Static | [Reserved] | Dynamic>
*
* The static data is copied from the original section managed by the
* linker. The reserved section, if non-zero, primarily manages static
* percpu variables from kernel modules. Finally, the dynamic section
* takes care of normal allocations.
*
* The allocator organizes chunks into lists according to free size and
* memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT
* flag should be passed. All memcg-aware allocations are sharing one set
* of chunks and all unaccounted allocations and allocations performed
* by processes belonging to the root memory cgroup are using the second set.
*
* The allocator tries to allocate from the fullest chunk first. Each chunk
* is managed by a bitmap with metadata blocks. The allocation map is updated
* on every allocation and free to reflect the current state while the boundary
* map is only updated on allocation. Each metadata block contains
* information to help mitigate the need to iterate over large portions
* of the bitmap. The reverse mapping from page to chunk is stored in
* the page's index. Lastly, units are lazily backed and grow in unison.
*
* There is a unique conversion that goes on here between bytes and bits.
* Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk
* tracks the number of pages it is responsible for in nr_pages. Helper
* functions are used to convert from between the bytes, bits, and blocks.
* All hints are managed in bits unless explicitly stated.
*
* To use this allocator, arch code should do the following:
*
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
* regular address to percpu pointer and back if they need to be
* different from the default
*
* - use pcpu_setup_first_chunk() during percpu area initialization to
* setup the first chunk containing the kernel static percpu area
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/bitmap.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>
#include "percpu-internal.h"
/*
* The slots are sorted by the size of the biggest continuous free area.
* 1-31 bytes share the same slot.
*/
#define PCPU_SLOT_BASE_SHIFT 5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD 3
#define PCPU_EMPTY_POP_PAGES_LOW 2
#define PCPU_EMPTY_POP_PAGES_HIGH 4
#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr) \
(void __percpu *)((unsigned long)(addr) - \
(unsigned long)pcpu_base_addr + \
(unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr) \
(void __force *)((unsigned long)(ptr) + \
(unsigned long)pcpu_base_addr - \
(unsigned long)__per_cpu_start)
#endif
#else /* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
#endif /* CONFIG_SMP */
static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static int pcpu_free_slot __ro_after_init;
int pcpu_sidelined_slot __ro_after_init;
int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;
/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */
/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;
/*
* The first chunk which always exists. Note that unlike other
* chunks, this one can be allocated and mapped in several different
* ways and thus often doesn't live in the vmalloc area.
*/
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
/*
* Optional reserved chunk. This chunk reserves part of the first
* chunk and serves it for reserved allocations. When the reserved
* region doesn't exist, the following variable is NULL.
*/
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);
/*
* The number of empty populated pages, protected by pcpu_lock.
* The reserved chunk doesn't contribute to the count.
*/
int pcpu_nr_empty_pop_pages;
/*
* The number of populated pages in use by the allocator, protected by
* pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets
* allocated/deallocated, it is allocated/deallocated in all units of a chunk
* and increments/decrements this count by 1).
*/
static unsigned long pcpu_nr_populated;
/*
* Balance work is used to populate or destroy chunks asynchronously. We
* try to keep the number of populated free pages between
* PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
* empty chunk.
*/
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;
static void pcpu_schedule_balance_work(void)
{
if (pcpu_async_enabled)
schedule_work(&pcpu_balance_work);
}
/**
* pcpu_addr_in_chunk - check if the address is served from this chunk
* @chunk: chunk of interest
* @addr: percpu address
*
* RETURNS:
* True if the address is served from this chunk.
*/
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
void *start_addr, *end_addr;
if (!chunk)
return false;
start_addr = chunk->base_addr + chunk->start_offset; end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
chunk->end_offset;
return addr >= start_addr && addr < end_addr;
}
static int __pcpu_size_to_slot(int size)
{
int highbit = fls(size); /* size is in bytes */
return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}
static int pcpu_size_to_slot(int size)
{
if (size == pcpu_unit_size) return pcpu_free_slot;
return __pcpu_size_to_slot(size);
}
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk_md->contig_hint == 0)
return 0;
return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
}
/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
page->index = (unsigned long)pcpu;
}
/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
return (struct pcpu_chunk *)page->index;
}
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
}
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
return (unsigned long)chunk->base_addr +
pcpu_unit_page_offset(cpu, page_idx);
}
/*
* The following are helper functions to help access bitmaps and convert
* between bitmap offsets to address offsets.
*/
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
return chunk->alloc_map +
(index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
}
static unsigned long pcpu_off_to_block_index(int off)
{
return off / PCPU_BITMAP_BLOCK_BITS;
}
static unsigned long pcpu_off_to_block_off(int off)
{
return off & (PCPU_BITMAP_BLOCK_BITS - 1);
}
static unsigned long pcpu_block_off_to_off(int index, int off)
{
return index * PCPU_BITMAP_BLOCK_BITS + off;
}
/**
* pcpu_check_block_hint - check against the contig hint
* @block: block of interest
* @bits: size of allocation
* @align: alignment of area (max PAGE_SIZE)
*
* Check to see if the allocation can fit in the block's contig hint.
* Note, a chunk uses the same hints as a block so this can also check against
* the chunk's contig hint.
*/
static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
size_t align)
{
int bit_off = ALIGN(block->contig_hint_start, align) -
block->contig_hint_start;
return bit_off + bits <= block->contig_hint;
}
/*
* pcpu_next_hint - determine which hint to use
* @block: block of interest
* @alloc_bits: size of allocation
*
* This determines if we should scan based on the scan_hint or first_free.
* In general, we want to scan from first_free to fulfill allocations by
* first fit. However, if we know a scan_hint at position scan_hint_start
* cannot fulfill an allocation, we can begin scanning from there knowing
* the contig_hint will be our fallback.
*/
static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
{
/*
* The three conditions below determine if we can skip past the
* scan_hint. First, does the scan hint exist. Second, is the
* contig_hint after the scan_hint (possibly not true iff
* contig_hint == scan_hint). Third, is the allocation request
* larger than the scan_hint.
*/
if (block->scan_hint && block->contig_hint_start > block->scan_hint_start &&
alloc_bits > block->scan_hint)
return block->scan_hint_start + block->scan_hint; return block->first_free;
}
/**
* pcpu_next_md_free_region - finds the next hint free area
* @chunk: chunk of interest
* @bit_off: chunk offset
* @bits: size of free area
*
* Helper function for pcpu_for_each_md_free_region. It checks
* block->contig_hint and performs aggregation across blocks to find the
* next hint. It modifies bit_off and bits in-place to be consumed in the
* loop.
*/
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
int *bits)
{
int i = pcpu_off_to_block_index(*bit_off);
int block_off = pcpu_off_to_block_off(*bit_off);
struct pcpu_block_md *block;
*bits = 0;
for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); block++, i++) {
/* handles contig area across blocks */
if (*bits) { *bits += block->left_free;
if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
continue;
return;
}
/*
* This checks three things. First is there a contig_hint to
* check. Second, have we checked this hint before by
* comparing the block_off. Third, is this the same as the
* right contig hint. In the last case, it spills over into
* the next block and should be handled by the contig area
* across blocks code.
*/
*bits = block->contig_hint; if (*bits && block->contig_hint_start >= block_off && *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
*bit_off = pcpu_block_off_to_off(i,
block->contig_hint_start);
return;
}
/* reset to satisfy the second predicate above */
block_off = 0;
*bits = block->right_free;
*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
}
}
/**
* pcpu_next_fit_region - finds fit areas for a given allocation request
* @chunk: chunk of interest
* @alloc_bits: size of allocation
* @align: alignment of area (max PAGE_SIZE)
* @bit_off: chunk offset
* @bits: size of free area
*
* Finds the next free region that is viable for use with a given size and
* alignment. This only returns if there is a valid area to be used for this
* allocation. block->first_free is returned if the allocation request fits
* within the block to see if the request can be fulfilled prior to the contig
* hint.
*/
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
int align, int *bit_off, int *bits)
{
int i = pcpu_off_to_block_index(*bit_off);
int block_off = pcpu_off_to_block_off(*bit_off);
struct pcpu_block_md *block;
*bits = 0;
for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
block++, i++) {
/* handles contig area across blocks */
if (*bits) { *bits += block->left_free;
if (*bits >= alloc_bits)
return;
if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
continue;
}
/* check block->contig_hint */
*bits = ALIGN(block->contig_hint_start, align) -
block->contig_hint_start;
/*
* This uses the block offset to determine if this has been
* checked in the prior iteration.
*/
if (block->contig_hint &&
block->contig_hint_start >= block_off && block->contig_hint >= *bits + alloc_bits) {
int start = pcpu_next_hint(block, alloc_bits);
*bits += alloc_bits + block->contig_hint_start -
start;
*bit_off = pcpu_block_off_to_off(i, start);
return;
}
/* reset to satisfy the second predicate above */
block_off = 0;
*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
align);
*bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
*bit_off = pcpu_block_off_to_off(i, *bit_off);
if (*bits >= alloc_bits)
return;
}
/* no valid offsets were found - fail condition */
*bit_off = pcpu_chunk_map_bits(chunk);
}
/*
* Metadata free area iterators. These perform aggregation of free areas
* based on the metadata blocks and return the offset @bit_off and size in
* bits of the free area @bits. pcpu_for_each_fit_region only returns when
* a fit is found for the allocation request.
*/
#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \
for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \
(bit_off) < pcpu_chunk_map_bits((chunk)); \
(bit_off) += (bits) + 1, \
pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \
for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
&(bits)); \
(bit_off) < pcpu_chunk_map_bits((chunk)); \
(bit_off) += (bits), \
pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
&(bits)))
/**
* pcpu_mem_zalloc - allocate memory
* @size: bytes to allocate
* @gfp: allocation flags
*
* Allocate @size bytes. If @size is smaller than PAGE_SIZE,
* kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
* This is to facilitate passing through whitelisted flags. The
* returned memory is always zeroed.
*
* RETURNS:
* Pointer to the allocated area on success, NULL on failure.
*/
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
if (WARN_ON_ONCE(!slab_is_available()))
return NULL;
if (size <= PAGE_SIZE)
return kzalloc(size, gfp);
else
return __vmalloc(size, gfp | __GFP_ZERO);
}
/**
* pcpu_mem_free - free memory
* @ptr: memory to free
*
* Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
*/
static void pcpu_mem_free(void *ptr)
{
kvfree(ptr);
}
static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
bool move_front)
{
if (chunk != pcpu_reserved_chunk) {
if (move_front)
list_move(&chunk->list, &pcpu_chunk_lists[slot]);
else
list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]);
}
}
static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
{
__pcpu_chunk_move(chunk, slot, true);
}
/**
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
* @chunk: chunk of interest
* @oslot: the previous slot it was on
*
* This function is called after an allocation or free changed @chunk.
* New slot according to the changed state is determined and @chunk is
* moved to the slot. Note that the reserved chunk is never put on
* chunk slots.
*
* CONTEXT:
* pcpu_lock.
*/
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
int nslot = pcpu_chunk_slot(chunk);
/* leave isolated chunks in-place */
if (chunk->isolated)
return;
if (oslot != nslot) __pcpu_chunk_move(chunk, nslot, oslot < nslot);
}
static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
{
lockdep_assert_held(&pcpu_lock);
if (!chunk->isolated) { chunk->isolated = true;
pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
}
list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
}
static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
{
lockdep_assert_held(&pcpu_lock);
if (chunk->isolated) { chunk->isolated = false;
pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
pcpu_chunk_relocate(chunk, -1);
}
}
/*
* pcpu_update_empty_pages - update empty page counters
* @chunk: chunk of interest
* @nr: nr of empty pages
*
* This is used to keep track of the empty pages now based on the premise
* a md_block covers a page. The hint update functions recognize if a block
* is made full or broken to calculate deltas for keeping track of free pages.
*/
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
chunk->nr_empty_pop_pages += nr; if (chunk != pcpu_reserved_chunk && !chunk->isolated) pcpu_nr_empty_pop_pages += nr;
}
/*
* pcpu_region_overlap - determines if two regions overlap
* @a: start of first region, inclusive
* @b: end of first region, exclusive
* @x: start of second region, inclusive
* @y: end of second region, exclusive
*
* This is used to determine if the hint region [a, b) overlaps with the
* allocated region [x, y).
*/
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
return (a < y) && (x < b);
}
/**
* pcpu_block_update - updates a block given a free area
* @block: block of interest
* @start: start offset in block
* @end: end offset in block
*
* Updates a block given a known free area. The region [start, end) is
* expected to be the entirety of the free area within a block. Chooses
* the best starting offset if the contig hints are equal.
*/
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
int contig = end - start;
block->first_free = min(block->first_free, start);
if (start == 0)
block->left_free = contig; if (end == block->nr_bits) block->right_free = contig; if (contig > block->contig_hint) {
/* promote the old contig_hint to be the new scan_hint */
if (start > block->contig_hint_start) { if (block->contig_hint > block->scan_hint) { block->scan_hint_start =
block->contig_hint_start;
block->scan_hint = block->contig_hint;
} else if (start < block->scan_hint_start) {
/*
* The old contig_hint == scan_hint. But, the
* new contig is larger so hold the invariant
* scan_hint_start < contig_hint_start.
*/
block->scan_hint = 0;
}
} else {
block->scan_hint = 0;
}
block->contig_hint_start = start;
block->contig_hint = contig;
} else if (contig == block->contig_hint) { if (block->contig_hint_start &&
(!start ||
__ffs(start) > __ffs(block->contig_hint_start))) {
/* start has a better alignment so use it */
block->contig_hint_start = start;
if (start < block->scan_hint_start &&
block->contig_hint > block->scan_hint) block->scan_hint = 0; } else if (start > block->scan_hint_start || block->contig_hint > block->scan_hint) {
/*
* Knowing contig == contig_hint, update the scan_hint
* if it is farther than or larger than the current
* scan_hint.
*/
block->scan_hint_start = start;
block->scan_hint = contig;
}
} else {
/*
* The region is smaller than the contig_hint. So only update
* the scan_hint if it is larger than or equal and farther than
* the current scan_hint.
*/
if ((start < block->contig_hint_start && (contig > block->scan_hint ||
(contig == block->scan_hint &&
start > block->scan_hint_start)))) { block->scan_hint_start = start;
block->scan_hint = contig;
}
}
}
/*
* pcpu_block_update_scan - update a block given a free area from a scan
* @chunk: chunk of interest
* @bit_off: chunk offset
* @bits: size of free area
*
* Finding the final allocation spot first goes through pcpu_find_block_fit()
* to find a block that can hold the allocation and then pcpu_alloc_area()
* where a scan is used. When allocations require specific alignments,
* we can inadvertently create holes which will not be seen in the alloc
* or free paths.
*
* This takes a given free area hole and updates a block as it may change the
* scan_hint. We need to scan backwards to ensure we don't miss free bits
* from alignment.
*/
static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
int bits)
{
int s_off = pcpu_off_to_block_off(bit_off);
int e_off = s_off + bits;
int s_index, l_bit;
struct pcpu_block_md *block;
if (e_off > PCPU_BITMAP_BLOCK_BITS)
return;
s_index = pcpu_off_to_block_index(bit_off);
block = chunk->md_blocks + s_index;
/* scan backwards in case of alignment skipping free bits */
l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
s_off = (s_off == l_bit) ? 0 : l_bit + 1; pcpu_block_update(block, s_off, e_off);
}
/**
* pcpu_chunk_refresh_hint - updates metadata about a chunk
* @chunk: chunk of interest
* @full_scan: if we should scan from the beginning
*
* Iterates over the metadata blocks to find the largest contig area.
* A full scan can be avoided on the allocation path as this is triggered
* if we broke the contig_hint. In doing so, the scan_hint will be before
* the contig_hint or after if the scan_hint == contig_hint. This cannot
* be prevented on freeing as we want to find the largest area possibly
* spanning blocks.
*/
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{
struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int bit_off, bits;
/* promote scan_hint to contig_hint */
if (!full_scan && chunk_md->scan_hint) { bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
chunk_md->contig_hint_start = chunk_md->scan_hint_start;
chunk_md->contig_hint = chunk_md->scan_hint;
chunk_md->scan_hint = 0;
} else {
bit_off = chunk_md->first_free;
chunk_md->contig_hint = 0;
}
bits = 0; pcpu_for_each_md_free_region(chunk, bit_off, bits) pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}
/**
* pcpu_block_refresh_hint
* @chunk: chunk of interest
* @index: index of the metadata block
*
* Scans over the block beginning at first_free and updates the block
* metadata accordingly.
*/
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
struct pcpu_block_md *block = chunk->md_blocks + index;
unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
unsigned int rs, re, start; /* region start, region end */
/* promote scan_hint to contig_hint */
if (block->scan_hint) {
start = block->scan_hint_start + block->scan_hint;
block->contig_hint_start = block->scan_hint_start;
block->contig_hint = block->scan_hint;
block->scan_hint = 0;
} else {
start = block->first_free;
block->contig_hint = 0;
}
block->right_free = 0;
/* iterate over free areas and update the contig hints */
bitmap_for_each_clear_region(alloc_map, rs, re, start,
PCPU_BITMAP_BLOCK_BITS)
pcpu_block_update(block, rs, re);
}
/**
* pcpu_block_update_hint_alloc - update hint on allocation path
* @chunk: chunk of interest
* @bit_off: chunk offset
* @bits: size of request
*
* Updates metadata for the allocation path. The metadata only has to be
* refreshed by a full scan iff the chunk's contig hint is broken. Block level
* scans are required if the block's contig hint is broken.
*/
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
int bits)
{
struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int nr_empty_pages = 0;
struct pcpu_block_md *s_block, *e_block, *block;
int s_index, e_index; /* block indexes of the freed allocation */
int s_off, e_off; /* block offsets of the freed allocation */
/*
* Calculate per block offsets.
* The calculation uses an inclusive range, but the resulting offsets
* are [start, end). e_index always points to the last block in the
* range.
*/
s_index = pcpu_off_to_block_index(bit_off);
e_index = pcpu_off_to_block_index(bit_off + bits - 1);
s_off = pcpu_off_to_block_off(bit_off);
e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
s_block = chunk->md_blocks + s_index;
e_block = chunk->md_blocks + e_index;
/*
* Update s_block.
* block->first_free must be updated if the allocation takes its place.
* If the allocation breaks the contig_hint, a scan is required to
* restore this hint.
*/
if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
if (s_off == s_block->first_free)
s_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, s_index),
PCPU_BITMAP_BLOCK_BITS,
s_off + bits);
if (pcpu_region_overlap(s_block->scan_hint_start,
s_block->scan_hint_start + s_block->scan_hint,
s_off,
s_off + bits))
s_block->scan_hint = 0; if (pcpu_region_overlap(s_block->contig_hint_start, s_block->contig_hint_start +
s_block->contig_hint,
s_off,
s_off + bits)) {
/* block contig hint is broken - scan to fix it */
if (!s_off) s_block->left_free = 0; pcpu_block_refresh_hint(chunk, s_index);
} else {
/* update left and right contig manually */
s_block->left_free = min(s_block->left_free, s_off);
if (s_index == e_index)
s_block->right_free = min_t(int, s_block->right_free,
PCPU_BITMAP_BLOCK_BITS - e_off);
else
s_block->right_free = 0;
}
/*
* Update e_block.
*/
if (s_index != e_index) {
if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
/*
* When the allocation is across blocks, the end is along
* the left part of the e_block.
*/
e_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, e_index),
PCPU_BITMAP_BLOCK_BITS, e_off);
if (e_off == PCPU_BITMAP_BLOCK_BITS) {
/* reset the block */
e_block++;
} else {
if (e_off > e_block->scan_hint_start)
e_block->scan_hint = 0; e_block->left_free = 0;
if (e_off > e_block->contig_hint_start) {
/* contig hint is broken - scan to fix it */
pcpu_block_refresh_hint(chunk, e_index);
} else {
e_block->right_free =
min_t(int, e_block->right_free,
PCPU_BITMAP_BLOCK_BITS - e_off);
}
}
/* update in-between md_blocks */
nr_empty_pages += (e_index - s_index - 1);
for (block = s_block + 1; block < e_block; block++) {
block->scan_hint = 0;
block->contig_hint = 0;
block->left_free = 0;
block->right_free = 0;
}
}
if (nr_empty_pages)
pcpu_update_empty_pages(chunk, -nr_empty_pages);
if (pcpu_region_overlap(chunk_md->scan_hint_start,
chunk_md->scan_hint_start + chunk_md->scan_hint,
bit_off,
bit_off + bits))
chunk_md->scan_hint = 0;
/*
* The only time a full chunk scan is required is if the chunk
* contig hint is broken. Otherwise, it means a smaller space
* was used and therefore the chunk contig hint is still correct.
*/
if (pcpu_region_overlap(chunk_md->contig_hint_start,
chunk_md->contig_hint_start + chunk_md->contig_hint,
bit_off,
bit_off + bits))
pcpu_chunk_refresh_hint(chunk, false);
}
/**
* pcpu_block_update_hint_free - updates the block hints on the free path
* @chunk: chunk of interest
* @bit_off: chunk offset
* @bits: size of request
*
* Updates metadata for the allocation path. This avoids a blind block
* refresh by making use of the block contig hints. If this fails, it scans
* forward and backward to determine the extent of the free area. This is
* capped at the boundary of blocks.
*
* A chunk update is triggered if a page becomes free, a block becomes free,
* or the free spans across blocks. This tradeoff is to minimize iterating
* over the block metadata to update chunk_md->contig_hint.
* chunk_md->contig_hint may be off by up to a page, but it will never be more
* than the available space. If the contig hint is contained in one block, it
* will be accurate.
*/
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
int bits)
{
int nr_empty_pages = 0;
struct pcpu_block_md *s_block, *e_block, *block;
int s_index, e_index; /* block indexes of the freed allocation */
int s_off, e_off; /* block offsets of the freed allocation */
int start, end; /* start and end of the whole free area */
/*
* Calculate per block offsets.
* The calculation uses an inclusive range, but the resulting offsets
* are [start, end). e_index always points to the last block in the
* range.
*/
s_index = pcpu_off_to_block_index(bit_off);
e_index = pcpu_off_to_block_index(bit_off + bits - 1);
s_off = pcpu_off_to_block_off(bit_off);
e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
s_block = chunk->md_blocks + s_index;
e_block = chunk->md_blocks + e_index;
/*
* Check if the freed area aligns with the block->contig_hint.
* If it does, then the scan to find the beginning/end of the
* larger free area can be avoided.
*
* start and end refer to beginning and end of the free area
* within each their respective blocks. This is not necessarily
* the entire free area as it may span blocks past the beginning
* or end of the block.
*/
start = s_off;
if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
start = s_block->contig_hint_start;
} else {
/*
* Scan backwards to find the extent of the free area.
* find_last_bit returns the starting bit, so if the start bit
* is returned, that means there was no last bit and the
* remainder of the chunk is free.
*/
int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
start);
start = (start == l_bit) ? 0 : l_bit + 1;
}
end = e_off;
if (e_off == e_block->contig_hint_start) end = e_block->contig_hint_start + e_block->contig_hint;
else
end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
PCPU_BITMAP_BLOCK_BITS, end);
/* update s_block */
e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
nr_empty_pages++;
pcpu_block_update(s_block, start, e_off);
/* freeing in the same block */
if (s_index != e_index) {
/* update e_block */
if (end == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; pcpu_block_update(e_block, 0, end);
/* reset md_blocks in the middle */
nr_empty_pages += (e_index - s_index - 1);
for (block = s_block + 1; block < e_block; block++) {
block->first_free = 0;
block->scan_hint = 0;
block->contig_hint_start = 0;
block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
block->left_free = PCPU_BITMAP_BLOCK_BITS;
block->right_free = PCPU_BITMAP_BLOCK_BITS;
}
}
if (nr_empty_pages)
pcpu_update_empty_pages(chunk, nr_empty_pages);
/*
* Refresh chunk metadata when the free makes a block free or spans
* across blocks. The contig_hint may be off by up to a page, but if
* the contig_hint is contained in a block, it will be accurate with
* the else condition below.
*/
if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index) pcpu_chunk_refresh_hint(chunk, true);
else
pcpu_block_update(&chunk->chunk_md,
pcpu_block_off_to_off(s_index, start),
end);
}
/**
* pcpu_is_populated - determines if the region is populated
* @chunk: chunk of interest
* @bit_off: chunk offset
* @bits: size of area
* @next_off: return value for the next offset to start searching
*
* For atomic allocations, check if the backing pages are populated.
*
* RETURNS:
* Bool if the backing pages are populated.
* next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
*/
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
int *next_off)
{
unsigned int page_start, page_end, rs, re;
page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
rs = page_start;
bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
if (rs >= page_end)
return true;
*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
return false;
}
/**
* pcpu_find_block_fit - finds the block index to start searching
* @chunk: chunk of interest
* @alloc_bits: size of request in allocation units
* @align: alignment of area (max PAGE_SIZE bytes)
* @pop_only: use populated regions only
*
* Given a chunk and an allocation spec, find the offset to begin searching
* for a free region. This iterates over the bitmap metadata blocks to
* find an offset that will be guaranteed to fit the requirements. It is
* not quite first fit as if the allocation does not fit in the contig hint
* of a block or chunk, it is skipped. This errs on the side of caution
* to prevent excess iteration. Poor alignment can cause the allocator to
* skip over blocks and chunks that have valid free areas.
*
* RETURNS:
* The offset in the bitmap to begin searching.
* -1 if no offset is found.
*/
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
size_t align, bool pop_only)
{
struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int bit_off, bits, next_off;
/*
* This is an optimization to prevent scanning by assuming if the
* allocation cannot fit in the global hint, there is memory pressure
* and creating a new chunk would happen soon.
*/
if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
return -1;
bit_off = pcpu_next_hint(chunk_md, alloc_bits);
bits = 0;
pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
&next_off))
break;
bit_off = next_off;
bits = 0;
}
if (bit_off == pcpu_chunk_map_bits(chunk))
return -1;
return bit_off;
}
/*
* pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
* @map: the address to base the search on
* @size: the bitmap size in bits
* @start: the bitnumber to start searching at
* @nr: the number of zeroed bits we're looking for
* @align_mask: alignment mask for zero area
* @largest_off: offset of the largest area skipped
* @largest_bits: size of the largest area skipped
*
* The @align_mask should be one less than a power of 2.
*
* This is a modified version of bitmap_find_next_zero_area_off() to remember
* the largest area that was skipped. This is imperfect, but in general is
* good enough. The largest remembered region is the largest failed region
* seen. This does not include anything we possibly skipped due to alignment.
* pcpu_block_update_scan() does scan backwards to try and recover what was
* lost to alignment. While this can cause scanning to miss earlier possible
* free areas, smaller allocations will eventually fill those holes.
*/
static unsigned long pcpu_find_zero_area(unsigned long *map,
unsigned long size,
unsigned long start,
unsigned long nr,
unsigned long align_mask,
unsigned long *largest_off,
unsigned long *largest_bits)
{
unsigned long index, end, i, area_off, area_bits;
again:
index = find_next_zero_bit(map, size, start);
/* Align allocation */
index = __ALIGN_MASK(index, align_mask);
area_off = index;
end = index + nr;
if (end > size)
return end;
i = find_next_bit(map, end, index);
if (i < end) {
area_bits = i - area_off;
/* remember largest unused area with best alignment */
if (area_bits > *largest_bits || (area_bits == *largest_bits && *largest_off &&
(!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
*largest_off = area_off;
*largest_bits = area_bits;
}
start = i + 1;
goto again;
}
return index;
}
/**
* pcpu_alloc_area - allocates an area from a pcpu_chunk
* @chunk: chunk of interest
* @alloc_bits: size of request in allocation units
* @align: alignment of area (max PAGE_SIZE)
* @start: bit_off to start searching
*
* This function takes in a @start offset to begin searching to fit an
* allocation of @alloc_bits with alignment @align. It needs to scan
* the allocation map because if it fits within the block's contig hint,
* @start will be block->first_free. This is an attempt to fill the
* allocation prior to breaking the contig hint. The allocation and
* boundary maps are updated accordingly if it confirms a valid
* free area.
*
* RETURNS:
* Allocated addr offset in @chunk on success.
* -1 if no matching area is found.
*/
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
size_t align, int start)
{
struct pcpu_block_md *chunk_md = &chunk->chunk_md;
size_t align_mask = (align) ? (align - 1) : 0;
unsigned long area_off = 0, area_bits = 0;
int bit_off, end, oslot;
lockdep_assert_held(&pcpu_lock);
oslot = pcpu_chunk_slot(chunk);
/*
* Search to find a fit.
*/
end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
pcpu_chunk_map_bits(chunk));
bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
align_mask, &area_off, &area_bits);
if (bit_off >= end)
return -1;
if (area_bits) pcpu_block_update_scan(chunk, area_off, area_bits);
/* update alloc map */
bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
/* update boundary map */
set_bit(bit_off, chunk->bound_map);
bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
set_bit(bit_off + alloc_bits, chunk->bound_map);
chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
/* update first free bit */
if (bit_off == chunk_md->first_free)
chunk_md->first_free = find_next_zero_bit(
chunk->alloc_map,
pcpu_chunk_map_bits(chunk),
bit_off + alloc_bits);
pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
pcpu_chunk_relocate(chunk, oslot);
return bit_off * PCPU_MIN_ALLOC_SIZE;
}
/**
* pcpu_free_area - frees the corresponding offset
* @chunk: chunk of interest
* @off: addr offset into chunk
*
* This function determines the size of an allocation to free using
* the boundary bitmap and clears the allocation map.
*
* RETURNS:
* Number of freed bytes.
*/
static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int bit_off, bits, end, oslot, freed;
lockdep_assert_held(&pcpu_lock);
pcpu_stats_area_dealloc(chunk);
oslot = pcpu_chunk_slot(chunk);
bit_off = off / PCPU_MIN_ALLOC_SIZE;
/* find end index */
end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
bit_off + 1);
bits = end - bit_off;
bitmap_clear(chunk->alloc_map, bit_off, bits);
freed = bits * PCPU_MIN_ALLOC_SIZE;
/* update metadata */
chunk->free_bytes += freed;
/* update first free bit */
chunk_md->first_free = min(chunk_md->first_free, bit_off);
pcpu_block_update_hint_free(chunk, bit_off, bits);
pcpu_chunk_relocate(chunk, oslot);
return freed;
}
static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
{
block->scan_hint = 0;
block->contig_hint = nr_bits;
block->left_free = nr_bits;
block->right_free = nr_bits;
block->first_free = 0;
block->nr_bits = nr_bits;
}
static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
struct pcpu_block_md *md_block;
/* init the chunk's block */
pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
for (md_block = chunk->md_blocks;
md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
md_block++)
pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
}
/**
* pcpu_alloc_first_chunk - creates chunks that serve the first chunk
* @tmp_addr: the start of the region served
* @map_size: size of the region served
*
* This is responsible for creating the chunks that serve the first chunk. The
* base_addr is page aligned down of @tmp_addr while the region end is page
* aligned up. Offsets are kept track of to determine the region served. All
* this is done to appease the bitmap allocator in avoiding partial blocks.
*
* RETURNS:
* Chunk serving the region at @tmp_addr of @map_size.
*/
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
int map_size)
{
struct pcpu_chunk *chunk;
unsigned long aligned_addr, lcm_align;
int start_offset, offset_bits, region_size, region_bits;
size_t alloc_size;
/* region calculations */
aligned_addr = tmp_addr & PAGE_MASK;
start_offset = tmp_addr - aligned_addr;
/*
* Align the end of the region with the LCM of PAGE_SIZE and
* PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of
* the other.
*/
lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
region_size = ALIGN(start_offset + map_size, lcm_align);
/* allocate chunk */
alloc_size = struct_size(chunk, populated,
BITS_TO_LONGS(region_size >> PAGE_SHIFT));
chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
if (!chunk)
panic("%s: Failed to allocate %zu bytes\n", __func__,
alloc_size);
INIT_LIST_HEAD(&chunk->list);
chunk->base_addr = (void *)aligned_addr;
chunk->start_offset = start_offset;
chunk->end_offset = region_size - chunk->start_offset - map_size;
chunk->nr_pages = region_size >> PAGE_SHIFT;
region_bits = pcpu_chunk_map_bits(chunk);
alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
if (!chunk->alloc_map)
panic("%s: Failed to allocate %zu bytes\n", __func__,
alloc_size);
alloc_size =
BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
if (!chunk->bound_map)
panic("%s: Failed to allocate %zu bytes\n", __func__,
alloc_size);
alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
if (!chunk->md_blocks)
panic("%s: Failed to allocate %zu bytes\n", __func__,
alloc_size);
#ifdef CONFIG_MEMCG_KMEM
/* first chunk is free to use */
chunk->obj_cgroups = NULL;
#endif
pcpu_init_md_blocks(chunk);
/* manage populated page bitmap */
chunk->immutable = true;
bitmap_fill(chunk->populated, chunk->nr_pages);
chunk->nr_populated = chunk->nr_pages;
chunk->nr_empty_pop_pages = chunk->nr_pages;
chunk->free_bytes = map_size;
if (chunk->start_offset) {
/* hide the beginning of the bitmap */
offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
bitmap_set(chunk->alloc_map, 0, offset_bits);
set_bit(0, chunk->bound_map);
set_bit(offset_bits, chunk->bound_map);
chunk->chunk_md.first_free = offset_bits;
pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
}
if (chunk->end_offset) {
/* hide the end of the bitmap */
offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
bitmap_set(chunk->alloc_map,
pcpu_chunk_map_bits(chunk) - offset_bits,
offset_bits);
set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
chunk->bound_map);
set_bit(region_bits, chunk->bound_map);
pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
- offset_bits, offset_bits);
}
return chunk;
}
static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
int region_bits;
chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
if (!chunk)
return NULL;
INIT_LIST_HEAD(&chunk->list);
chunk->nr_pages = pcpu_unit_pages;
region_bits = pcpu_chunk_map_bits(chunk);
chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
sizeof(chunk->alloc_map[0]), gfp);
if (!chunk->alloc_map)
goto alloc_map_fail;
chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
sizeof(chunk->bound_map[0]), gfp);
if (!chunk->bound_map)
goto bound_map_fail;
chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
sizeof(chunk->md_blocks[0]), gfp);
if (!chunk->md_blocks)
goto md_blocks_fail;
#ifdef CONFIG_MEMCG_KMEM
if (!mem_cgroup_kmem_disabled()) {
chunk->obj_cgroups =
pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
sizeof(struct obj_cgroup *), gfp);
if (!chunk->obj_cgroups)
goto objcg_fail;
}
#endif
pcpu_init_md_blocks(chunk);
/* init metadata */
chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
return chunk;
#ifdef CONFIG_MEMCG_KMEM
objcg_fail:
pcpu_mem_free(chunk->md_blocks);
#endif
md_blocks_fail:
pcpu_mem_free(chunk->bound_map);
bound_map_fail:
pcpu_mem_free(chunk->alloc_map);
alloc_map_fail:
pcpu_mem_free(chunk);
return NULL;
}
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
#ifdef CONFIG_MEMCG_KMEM
pcpu_mem_free(chunk->obj_cgroups);
#endif
pcpu_mem_free(chunk->md_blocks);
pcpu_mem_free(chunk->bound_map);
pcpu_mem_free(chunk->alloc_map);
pcpu_mem_free(chunk);
}
/**
* pcpu_chunk_populated - post-population bookkeeping
* @chunk: pcpu_chunk which got populated
* @page_start: the start page
* @page_end: the end page
*
* Pages in [@page_start,@page_end) have been populated to @chunk. Update
* the bookkeeping information accordingly. Must be called after each
* successful population.
*/
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
int page_end)
{
int nr = page_end - page_start;
lockdep_assert_held(&pcpu_lock);
bitmap_set(chunk->populated, page_start, nr);
chunk->nr_populated += nr;
pcpu_nr_populated += nr;
pcpu_update_empty_pages(chunk, nr);
}
/**
* pcpu_chunk_depopulated - post-depopulation bookkeeping
* @chunk: pcpu_chunk which got depopulated
* @page_start: the start page
* @page_end: the end page
*
* Pages in [@page_start,@page_end) have been depopulated from @chunk.
* Update the bookkeeping information accordingly. Must be called after
* each successful depopulation.
*/
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
int page_start, int page_end)
{
int nr = page_end - page_start;
lockdep_assert_held(&pcpu_lock);
bitmap_clear(chunk->populated, page_start, nr);
chunk->nr_populated -= nr;
pcpu_nr_populated -= nr;
pcpu_update_empty_pages(chunk, -nr);
}
/*
* Chunk management implementation.
*
* To allow different implementations, chunk alloc/free and
* [de]population are implemented in a separate file which is pulled
* into this file and compiled together. The following functions
* should be implemented.
*
* pcpu_populate_chunk - populate the specified range of a chunk
* pcpu_depopulate_chunk - depopulate the specified range of a chunk
* pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk
* pcpu_create_chunk - create a new chunk
* pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
* pcpu_addr_to_page - translate address to physical address
* pcpu_verify_alloc_info - check alloc_info is acceptable during init
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end);
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
int page_start, int page_end);
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
#endif
/**
* pcpu_chunk_addr_search - determine chunk containing specified address
* @addr: address for which the chunk needs to be determined.
*
* This is an internal function that handles all but static allocations.
* Static percpu address values should never be passed into the allocator.
*
* RETURNS:
* The address of the found chunk.
*/
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
/* is it in the dynamic region (first chunk)? */
if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
return pcpu_first_chunk;
/* is it in the reserved region? */
if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
return pcpu_reserved_chunk;
/*
* The address is relative to unit0 which might be unused and
* thus unmapped. Offset the address to the unit space of the
* current processor before looking it up in the vmalloc
* space. Note that any possible cpu id can be used here, so
* there's no need to worry about preemption or cpu hotplug.
*/
addr += pcpu_unit_offsets[raw_smp_processor_id()];
return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}
#ifdef CONFIG_MEMCG_KMEM
static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
struct obj_cgroup **objcgp)
{
struct obj_cgroup *objcg;
if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
return true;
objcg = get_obj_cgroup_from_current();
if (!objcg)
return true;
if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
obj_cgroup_put(objcg);
return false;
}
*objcgp = objcg;
return true;
}
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
struct pcpu_chunk *chunk, int off,
size_t size)
{
if (!objcg)
return;
if (likely(chunk && chunk->obj_cgroups)) {
chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
rcu_read_lock();
mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
size * num_possible_cpus());
rcu_read_unlock();
} else {
obj_cgroup_uncharge(objcg, size * num_possible_cpus());
obj_cgroup_put(objcg);
}
}
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
struct obj_cgroup *objcg;
if (unlikely(!chunk->obj_cgroups))
return;
objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
if (!objcg)
return;
chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
obj_cgroup_uncharge(objcg, size * num_possible_cpus());
rcu_read_lock();
mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
-(size * num_possible_cpus()));
rcu_read_unlock();
obj_cgroup_put(objcg);
}
#else /* CONFIG_MEMCG_KMEM */
static bool
pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
{
return true;
}
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
struct pcpu_chunk *chunk, int off,
size_t size)
{
}
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
}
#endif /* CONFIG_MEMCG_KMEM */
/**
* pcpu_alloc - the percpu allocator
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
* @reserved: allocate from the reserved chunk if available
* @gfp: allocation flags
*
* Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't
* contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
* then no warning will be triggered on invalid or failed allocation
* requests.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t gfp)
{
gfp_t pcpu_gfp;
bool is_atomic;
bool do_warn;
struct obj_cgroup *objcg = NULL;
static int warn_limit = 10;
struct pcpu_chunk *chunk, *next;
const char *err;
int slot, off, cpu, ret;
unsigned long flags;
void __percpu *ptr;
size_t bits, bit_align;
gfp = current_gfp_context(gfp);
/* whitelisted flags that can be passed to the backing allocators */
pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; do_warn = !(gfp & __GFP_NOWARN);
/*
* There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
* therefore alignment must be a minimum of that many bytes.
* An allocation may have internal fragmentation from rounding up
* of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
*/
if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
align = PCPU_MIN_ALLOC_SIZE;
size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
bits = size >> PCPU_MIN_ALLOC_SHIFT;
bit_align = align >> PCPU_MIN_ALLOC_SHIFT; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
!is_power_of_2(align))) {
WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
size, align);
return NULL;
}
if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
return NULL;
if (!is_atomic) {
/*
* pcpu_balance_workfn() allocates memory under this mutex,
* and it may wait for memory reclaim. Allow current task
* to become OOM victim, in case of memory pressure.
*/
if (gfp & __GFP_NOFAIL) { mutex_lock(&pcpu_alloc_mutex); } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
return NULL;
}
}
spin_lock_irqsave(&pcpu_lock, flags);
/* serve reserved allocations from the reserved chunk if available */
if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
if (off < 0) {
err = "alloc from reserved chunk failed";
goto fail_unlock;
}
off = pcpu_alloc_area(chunk, bits, bit_align, off);
if (off >= 0)
goto area_found;
err = "alloc from reserved chunk failed";
goto fail_unlock;
}
restart:
/* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) { list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
list) {
off = pcpu_find_block_fit(chunk, bits, bit_align,
is_atomic);
if (off < 0) {
if (slot < PCPU_SLOT_FAIL_THRESHOLD)
pcpu_chunk_move(chunk, 0);
continue;
}
off = pcpu_alloc_area(chunk, bits, bit_align, off);
if (off >= 0) {
pcpu_reintegrate_chunk(chunk);
goto area_found;
}
}
}
spin_unlock_irqrestore(&pcpu_lock, flags);
/*
* No space left. Create a new chunk. We don't want multiple
* tasks to create chunks simultaneously. Serialize and create iff
* there's still no empty chunk after grabbing the mutex.
*/
if (is_atomic) {
err = "atomic alloc failed, no space left";
goto fail;
}
if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) { chunk = pcpu_create_chunk(pcpu_gfp);
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
}
spin_lock_irqsave(&pcpu_lock, flags);
pcpu_chunk_relocate(chunk, -1);
} else {
spin_lock_irqsave(&pcpu_lock, flags);
}
goto restart;
area_found:
pcpu_stats_area_alloc(chunk, size);
spin_unlock_irqrestore(&pcpu_lock, flags);
/* populate if not all pages are already there */
if (!is_atomic) {
unsigned int page_start, page_end, rs, re;
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);
bitmap_for_each_clear_region(chunk->populated, rs, re,
page_start, page_end) {
WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
pcpu_free_area(chunk, off);
err = "failed to populate";
goto fail_unlock;
}
pcpu_chunk_populated(chunk, rs, re);
spin_unlock_irqrestore(&pcpu_lock, flags);
}
mutex_unlock(&pcpu_alloc_mutex);
}
if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
pcpu_schedule_balance_work();
/* clear the areas and return address relative to base address */
for_each_possible_cpu(cpu)
memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
kmemleak_alloc_percpu(ptr, size, gfp);
trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
chunk->base_addr, off, ptr);
pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
return ptr;
fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
if (!is_atomic && do_warn && warn_limit) {
pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
size, align, is_atomic, err);
dump_stack();
if (!--warn_limit)
pr_info("limit reached, disable warning\n");
}
if (is_atomic) {
/* see the flag handling in pcpu_balance_workfn() */
pcpu_atomic_alloc_failed = true;
pcpu_schedule_balance_work();
} else {
mutex_unlock(&pcpu_alloc_mutex);
}
pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
return NULL;
}
/**
* __alloc_percpu_gfp - allocate dynamic percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
* @gfp: allocation flags
*
* Allocate zero-filled percpu area of @size bytes aligned at @align. If
* @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
* be called from any context but is a lot more likely to fail. If @gfp
* has __GFP_NOWARN then no warning will be triggered on invalid or failed
* allocation requests.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
{
return pcpu_alloc(size, align, false, gfp);
}
EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
/**
* __alloc_percpu - allocate dynamic percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
* Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
*/
void __percpu *__alloc_percpu(size_t size, size_t align)
{
return pcpu_alloc(size, align, false, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
/**
* __alloc_reserved_percpu - allocate reserved percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
* Allocate zero-filled percpu area of @size bytes aligned at @align
* from reserved percpu area if arch has set it up; otherwise,
* allocation is served from the same dynamic area. Might sleep.
* Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
{
return pcpu_alloc(size, align, true, GFP_KERNEL);
}
/**
* pcpu_balance_free - manage the amount of free chunks
* @empty_only: free chunks only if there are no populated pages
*
* If empty_only is %false, reclaim all fully free chunks regardless of the
* number of populated pages. Otherwise, only reclaim chunks that have no
* populated pages.
*
* CONTEXT:
* pcpu_lock (can be dropped temporarily)
*/
static void pcpu_balance_free(bool empty_only)
{
LIST_HEAD(to_free);
struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
struct pcpu_chunk *chunk, *next;
lockdep_assert_held(&pcpu_lock);
/*
* There's no reason to keep around multiple unused chunks and VM
* areas can be scarce. Destroy all free chunks except for one.
*/
list_for_each_entry_safe(chunk, next, free_head, list) {
WARN_ON(chunk->immutable);
/* spare the first one */
if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
continue;
if (!empty_only || chunk->nr_empty_pop_pages == 0)
list_move(&chunk->list, &to_free);
}
if (list_empty(&to_free))
return;
spin_unlock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, &to_free, list) {
unsigned int rs, re;
bitmap_for_each_set_region(chunk->populated, rs, re, 0,
chunk->nr_pages) {
pcpu_depopulate_chunk(chunk, rs, re);
spin_lock_irq(&pcpu_lock);
pcpu_chunk_depopulated(chunk, rs, re);
spin_unlock_irq(&pcpu_lock);
}
pcpu_destroy_chunk(chunk);
cond_resched();
}
spin_lock_irq(&pcpu_lock);
}
/**
* pcpu_balance_populated - manage the amount of populated pages
*
* Maintain a certain amount of populated pages to satisfy atomic allocations.
* It is possible that this is called when physical memory is scarce causing
* OOM killer to be triggered. We should avoid doing so until an actual
* allocation causes the failure as it is possible that requests can be
* serviced from already backed regions.
*
* CONTEXT:
* pcpu_lock (can be dropped temporarily)
*/
static void pcpu_balance_populated(void)
{
/* gfp flags passed to underlying allocators */
const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
struct pcpu_chunk *chunk;
int slot, nr_to_pop, ret;
lockdep_assert_held(&pcpu_lock);
/*
* Ensure there are certain number of free populated pages for
* atomic allocs. Fill up from the most packed so that atomic
* allocs don't increase fragmentation. If atomic allocation
* failed previously, always populate the maximum amount. This
* should prevent atomic allocs larger than PAGE_SIZE from keeping
* failing indefinitely; however, large atomic allocs are not
* something we support properly and can be highly unreliable and
* inefficient.
*/
retry_pop:
if (pcpu_atomic_alloc_failed) {
nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
/* best effort anyway, don't worry about synchronization */
pcpu_atomic_alloc_failed = false;
} else {
nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
pcpu_nr_empty_pop_pages,
0, PCPU_EMPTY_POP_PAGES_HIGH);
}
for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
unsigned int nr_unpop = 0, rs, re;
if (!nr_to_pop)
break;
list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
nr_unpop = chunk->nr_pages - chunk->nr_populated;
if (nr_unpop)
break;
}
if (!nr_unpop)
continue;
/* @chunk can't go away while pcpu_alloc_mutex is held */
bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
chunk->nr_pages) {
int nr = min_t(int, re - rs, nr_to_pop);
spin_unlock_irq(&pcpu_lock);
ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
cond_resched();
spin_lock_irq(&pcpu_lock);
if (!ret) {
nr_to_pop -= nr;
pcpu_chunk_populated(chunk, rs, rs + nr);
} else {
nr_to_pop = 0;
}
if (!nr_to_pop)
break;
}
}
if (nr_to_pop) {
/* ran out of chunks to populate, create a new one and retry */
spin_unlock_irq(&pcpu_lock);
chunk = pcpu_create_chunk(gfp);
cond_resched();
spin_lock_irq(&pcpu_lock);
if (chunk) {
pcpu_chunk_relocate(chunk, -1);
goto retry_pop;
}
}
}
/**
* pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
*
* Scan over chunks in the depopulate list and try to release unused populated
* pages back to the system. Depopulated chunks are sidelined to prevent
* repopulating these pages unless required. Fully free chunks are reintegrated
* and freed accordingly (1 is kept around). If we drop below the empty
* populated pages threshold, reintegrate the chunk if it has empty free pages.
* Each chunk is scanned in the reverse order to keep populated pages close to
* the beginning of the chunk.
*
* CONTEXT:
* pcpu_lock (can be dropped temporarily)
*
*/
static void pcpu_reclaim_populated(void)
{
struct pcpu_chunk *chunk;
struct pcpu_block_md *block;
int freed_page_start, freed_page_end;
int i, end;
bool reintegrate;
lockdep_assert_held(&pcpu_lock);
/*
* Once a chunk is isolated to the to_depopulate list, the chunk is no
* longer discoverable to allocations whom may populate pages. The only
* other accessor is the free path which only returns area back to the
* allocator not touching the populated bitmap.
*/
while (!list_empty(&pcpu_chunk_lists[pcpu_to_depopulate_slot])) {
chunk = list_first_entry(&pcpu_chunk_lists[pcpu_to_depopulate_slot],
struct pcpu_chunk, list);
WARN_ON(chunk->immutable);
/*
* Scan chunk's pages in the reverse order to keep populated
* pages close to the beginning of the chunk.
*/
freed_page_start = chunk->nr_pages;
freed_page_end = 0;
reintegrate = false;
for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
/* no more work to do */
if (chunk->nr_empty_pop_pages == 0)
break;
/* reintegrate chunk to prevent atomic alloc failures */
if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
reintegrate = true;
goto end_chunk;
}
/*
* If the page is empty and populated, start or
* extend the (i, end) range. If i == 0, decrease
* i and perform the depopulation to cover the last
* (first) page in the chunk.
*/
block = chunk->md_blocks + i;
if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
test_bit(i, chunk->populated)) {
if (end == -1)
end = i;
if (i > 0)
continue;
i--;
}
/* depopulate if there is an active range */
if (end == -1)
continue;
spin_unlock_irq(&pcpu_lock);
pcpu_depopulate_chunk(chunk, i + 1, end + 1);
cond_resched();
spin_lock_irq(&pcpu_lock);
pcpu_chunk_depopulated(chunk, i + 1, end + 1);
freed_page_start = min(freed_page_start, i + 1);
freed_page_end = max(freed_page_end, end + 1);
/* reset the range and continue */
end = -1;
}
end_chunk:
/* batch tlb flush per chunk to amortize cost */
if (freed_page_start < freed_page_end) {
spin_unlock_irq(&pcpu_lock);
pcpu_post_unmap_tlb_flush(chunk,
freed_page_start,
freed_page_end);
cond_resched();
spin_lock_irq(&pcpu_lock);
}
if (reintegrate || chunk->free_bytes == pcpu_unit_size)
pcpu_reintegrate_chunk(chunk);
else
list_move_tail(&chunk->list,
&pcpu_chunk_lists[pcpu_sidelined_slot]);
}
}
/**
* pcpu_balance_workfn - manage the amount of free chunks and populated pages
* @work: unused
*
* For each chunk type, manage the number of fully free chunks and the number of
* populated pages. An important thing to consider is when pages are freed and
* how they contribute to the global counts.
*/
static void pcpu_balance_workfn(struct work_struct *work)
{
/*
* pcpu_balance_free() is called twice because the first time we may
* trim pages in the active pcpu_nr_empty_pop_pages which may cause us
* to grow other chunks. This then gives pcpu_reclaim_populated() time
* to move fully free chunks to the active list to be freed if
* appropriate.
*/
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
pcpu_balance_free(false);
pcpu_reclaim_populated();
pcpu_balance_populated();
pcpu_balance_free(true);
spin_unlock_irq(&pcpu_lock);
mutex_unlock(&pcpu_alloc_mutex);
}
/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
* Free percpu area @ptr.
*
* CONTEXT:
* Can be called from atomic context.
*/
void free_percpu(void __percpu *ptr)
{
void *addr;
struct pcpu_chunk *chunk;
unsigned long flags;
int size, off;
bool need_balance = false;
if (!ptr)
return;
kmemleak_free_percpu(ptr);
addr = __pcpu_ptr_to_addr(ptr);
spin_lock_irqsave(&pcpu_lock, flags);
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->base_addr;
size = pcpu_free_area(chunk, off);
pcpu_memcg_free_hook(chunk, off, size);
/*
* If there are more than one fully free chunks, wake up grim reaper.
* If the chunk is isolated, it may be in the process of being
* reclaimed. Let reclaim manage cleaning up of that chunk.
*/
if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
struct pcpu_chunk *pos;
list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list) if (pos != chunk) {
need_balance = true;
break;
}
} else if (pcpu_should_reclaim_chunk(chunk)) {
pcpu_isolate_chunk(chunk);
need_balance = true;
}
trace_percpu_free_percpu(chunk->base_addr, off, ptr);
spin_unlock_irqrestore(&pcpu_lock, flags);
if (need_balance)
pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);
bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
const size_t static_size = __per_cpu_end - __per_cpu_start;
void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
unsigned int cpu;
for_each_possible_cpu(cpu) {
void *start = per_cpu_ptr(base, cpu);
void *va = (void *)addr;
if (va >= start && va < start + static_size) {
if (can_addr) {
*can_addr = (unsigned long) (va - start);
*can_addr += (unsigned long)
per_cpu_ptr(base, get_boot_cpu_id());
}
return true;
}
}
#endif
/* on UP, can't distinguish from other static vars, always false */
return false;
}
/**
* is_kernel_percpu_address - test whether address is from static percpu area
* @addr: address to test
*
* Test whether @addr belongs to in-kernel static percpu area. Module
* static percpu areas are not considered. For those, use
* is_module_percpu_address().
*
* RETURNS:
* %true if @addr is from in-kernel static percpu area, %false otherwise.
*/
bool is_kernel_percpu_address(unsigned long addr)
{
return __is_kernel_percpu_address(addr, NULL);
}
/**
* per_cpu_ptr_to_phys - convert translated percpu address to physical address
* @addr: the address to be converted to physical address
*
* Given @addr which is dereferenceable address obtained via one of
* percpu access macros, this function translates it into its physical
* address. The caller is responsible for ensuring @addr stays valid
* until this function finishes.
*
* percpu allocator has special setup for the first chunk, which currently
* supports either embedding in linear address space or vmalloc mapping,
* and, from the second one, the backing allocator (currently either vm or
* km) provides translation.
*
* The addr can be translated simply without checking if it falls into the
* first chunk. But the current code reflects better how percpu allocator
* actually works, and the verification can discover both bugs in percpu
* allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
* code.
*
* RETURNS:
* The physical address for @addr.
*/
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
bool in_first_chunk = false;
unsigned long first_low, first_high;
unsigned int cpu;
/*
* The following test on unit_low/high isn't strictly
* necessary but will speed up lookups of addresses which
* aren't in the first chunk.
*
* The address check is against full chunk sizes. pcpu_base_addr
* points to the beginning of the first chunk including the
* static region. Assumes good intent as the first chunk may
* not be full (ie. < pcpu_unit_pages in size).
*/
first_low = (unsigned long)pcpu_base_addr +
pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
first_high = (unsigned long)pcpu_base_addr +
pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
if ((unsigned long)addr >= first_low &&
(unsigned long)addr < first_high) {
for_each_possible_cpu(cpu) {
void *start = per_cpu_ptr(base, cpu);
if (addr >= start && addr < start + pcpu_unit_size) {
in_first_chunk = true;
break;
}
}
}
if (in_first_chunk) {
if (!is_vmalloc_addr(addr))
return __pa(addr);
else
return page_to_phys(vmalloc_to_page(addr)) +
offset_in_page(addr);
} else
return page_to_phys(pcpu_addr_to_page(addr)) +
offset_in_page(addr);
}
/**
* pcpu_alloc_alloc_info - allocate percpu allocation info
* @nr_groups: the number of groups
* @nr_units: the number of units
*
* Allocate ai which is large enough for @nr_groups groups containing
* @nr_units units. The returned ai's groups[0].cpu_map points to the
* cpu_map array which is long enough for @nr_units and filled with
* NR_CPUS. It's the caller's responsibility to initialize cpu_map
* pointer of other groups.
*
* RETURNS:
* Pointer to the allocated pcpu_alloc_info on success, NULL on
* failure.
*/
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
int nr_units)
{
struct pcpu_alloc_info *ai;
size_t base_size, ai_size;
void *ptr;
int unit;
base_size = ALIGN(struct_size(ai, groups, nr_groups),
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
if (!ptr)
return NULL;
ai = ptr;
ptr += base_size;
ai->groups[0].cpu_map = ptr;
for (unit = 0; unit < nr_units; unit++)
ai->groups[0].cpu_map[unit] = NR_CPUS;
ai->nr_groups = nr_groups;
ai->__ai_size = PFN_ALIGN(ai_size);
return ai;
}
/**
* pcpu_free_alloc_info - free percpu allocation info
* @ai: pcpu_alloc_info to free
*
* Free @ai which was allocated by pcpu_alloc_alloc_info().
*/
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
memblock_free_early(__pa(ai), ai->__ai_size);
}
/**
* pcpu_dump_alloc_info - print out information about pcpu_alloc_info
* @lvl: loglevel
* @ai: allocation info to dump
*
* Print out information about @ai using loglevel @lvl.
*/
static void pcpu_dump_alloc_info(const char *lvl,
const struct pcpu_alloc_info *ai)
{
int group_width = 1, cpu_width = 1, width;
char empty_str[] = "--------";
int alloc = 0, alloc_end = 0;
int group, v;
int upa, apl; /* units per alloc, allocs per line */
v = ai->nr_groups;
while (v /= 10)
group_width++;
v = num_possible_cpus();
while (v /= 10)
cpu_width++;
empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
upa = ai->alloc_size / ai->unit_size;
width = upa * (cpu_width + 1) + group_width + 3;
apl = rounddown_pow_of_two(max(60 / width, 1));
printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
for (group = 0; group < ai->nr_groups; group++) {
const struct pcpu_group_info *gi = &ai->groups[group];
int unit = 0, unit_end = 0;
BUG_ON(gi->nr_units % upa);
for (alloc_end += gi->nr_units / upa;
alloc < alloc_end; alloc++) {
if (!(alloc % apl)) {
pr_cont("\n");
printk("%spcpu-alloc: ", lvl);
}
pr_cont("[%0*d] ", group_width, group);
for (unit_end += upa; unit < unit_end; unit++)
if (gi->cpu_map[unit] != NR_CPUS)
pr_cont("%0*d ",
cpu_width, gi->cpu_map[unit]);
else
pr_cont("%s ", empty_str);
}
}
pr_cont("\n");
}
/**
* pcpu_setup_first_chunk - initialize the first percpu chunk
* @ai: pcpu_alloc_info describing how to percpu area is shaped
* @base_addr: mapped address
*
* Initialize the first percpu chunk which contains the kernel static
* percpu area. This function is to be called from arch percpu area
* setup path.
*
* @ai contains all information necessary to initialize the first
* chunk and prime the dynamic percpu allocator.
*
* @ai->static_size is the size of static percpu area.
*
* @ai->reserved_size, if non-zero, specifies the amount of bytes to
* reserve after the static area in the first chunk. This reserves
* the first chunk such that it's available only through reserved
* percpu allocation. This is primarily used to serve module percpu
* static areas on architectures where the addressing model has
* limited offset range for symbol relocations to guarantee module
* percpu symbols fall inside the relocatable range.
*
* @ai->dyn_size determines the number of bytes available for dynamic
* allocation in the first chunk. The area between @ai->static_size +
* @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
*
* @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
* and equal to or larger than @ai->static_size + @ai->reserved_size +
* @ai->dyn_size.
*
* @ai->atom_size is the allocation atom size and used as alignment
* for vm areas.
*
* @ai->alloc_size is the allocation size and always multiple of
* @ai->atom_size. This is larger than @ai->atom_size if
* @ai->unit_size is larger than @ai->atom_size.
*
* @ai->nr_groups and @ai->groups describe virtual memory layout of
* percpu areas. Units which should be colocated are put into the
* same group. Dynamic VM areas will be allocated according to these
* groupings. If @ai->nr_groups is zero, a single group containing
* all units is assumed.
*
* The caller should have mapped the first chunk at @base_addr and
* copied static data to each unit.
*
* The first chunk will always contain a static and a dynamic region.
* However, the static region is not managed by any chunk. If the first
* chunk also contains a reserved region, it is served by two chunks -
* one for the reserved region and one for the dynamic region. They
* share the same vm, but use offset regions in the area allocation map.
* The chunk serving the dynamic region is circulated in the chunk slots
* and available for dynamic allocation like any other chunk.
*/
void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr)
{
size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
size_t static_size, dyn_size;
struct pcpu_chunk *chunk;
unsigned long *group_offsets;
size_t *group_sizes;
unsigned long *unit_off;
unsigned int cpu;
int *unit_map;
int group, unit, i;
int map_size;
unsigned long tmp_addr;
size_t alloc_size;
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
pr_emerg("failed to initialize, %s\n", #cond); \
pr_emerg("cpu_possible_mask=%*pb\n", \
cpumask_pr_args(cpu_possible_mask)); \
pcpu_dump_alloc_info(KERN_EMERG, ai); \
BUG(); \
} \
} while (0)
/* sanity checks */
PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
PCPU_SETUP_BUG_ON(!ai->static_size);
PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
#endif
PCPU_SETUP_BUG_ON(!base_addr);
PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
PCPU_SETUP_BUG_ON(!ai->dyn_size);
PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
/* process group information and build config tables accordingly */
alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
if (!group_offsets)
panic("%s: Failed to allocate %zu bytes\n", __func__,
alloc_size);
alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
if (!group_sizes)
panic("%s: Failed to allocate %zu bytes\n", __func__,
alloc_size);
alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
if (!unit_map)
panic("%s: Failed to allocate %zu bytes\n", __func__,
alloc_size);
alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
if (!unit_off)
panic("%s: Failed to allocate %zu bytes\n", __func__,
alloc_size);
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
pcpu_low_unit_cpu = NR_CPUS;
pcpu_high_unit_cpu = NR_CPUS;
for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
const struct pcpu_group_info *gi = &ai->groups[group];
group_offsets[group] = gi->base_offset;
group_sizes[group] = gi->nr_units * ai->unit_size;
for (i = 0; i < gi->nr_units; i++) {
cpu = gi->cpu_map[i];
if (cpu == NR_CPUS)
continue;
PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
unit_map[cpu] = unit + i;
unit_off[cpu] = gi->base_offset + i * ai->unit_size;
/* determine low/high unit_cpu */
if (pcpu_low_unit_cpu == NR_CPUS ||
unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
pcpu_low_unit_cpu = cpu;
if (pcpu_high_unit_cpu == NR_CPUS ||
unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
pcpu_high_unit_cpu = cpu;
}
}
pcpu_nr_units = unit;
for_each_possible_cpu(cpu)
PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
/* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
pcpu_dump_alloc_info(KERN_DEBUG, ai);
pcpu_nr_groups = ai->nr_groups;
pcpu_group_offsets = group_offsets;
pcpu_group_sizes = group_sizes;
pcpu_unit_map = unit_map;
pcpu_unit_offsets = unit_off;
/* determine basic parameters */
pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_atom_size = ai->atom_size;
pcpu_chunk_struct_size = struct_size(chunk, populated,
BITS_TO_LONGS(pcpu_unit_pages));
pcpu_stats_save_ai(ai);
/*
* Allocate chunk slots. The slots after the active slots are:
* sidelined_slot - isolated, depopulated chunks
* free_slot - fully free chunks
* to_depopulate_slot - isolated, chunks to depopulate
*/
pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
pcpu_free_slot = pcpu_sidelined_slot + 1;
pcpu_to_depopulate_slot = pcpu_free_slot + 1;
pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
sizeof(pcpu_chunk_lists[0]),
SMP_CACHE_BYTES);
if (!pcpu_chunk_lists)
panic("%s: Failed to allocate %zu bytes\n", __func__,
pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
/*
* The end of the static region needs to be aligned with the
* minimum allocation size as this offsets the reserved and
* dynamic region. The first chunk ends page aligned by
* expanding the dynamic region, therefore the dynamic region
* can be shrunk to compensate while still staying above the
* configured sizes.
*/
static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
dyn_size = ai->dyn_size - (static_size - ai->static_size);
/*
* Initialize first chunk.
* If the reserved_size is non-zero, this initializes the reserved
* chunk. If the reserved_size is zero, the reserved chunk is NULL
* and the dynamic region is initialized here. The first chunk,
* pcpu_first_chunk, will always point to the chunk that serves
* the dynamic region.
*/
tmp_addr = (unsigned long)base_addr + static_size;
map_size = ai->reserved_size ?: dyn_size;
chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
/* init dynamic chunk if necessary */
if (ai->reserved_size) {
pcpu_reserved_chunk = chunk;
tmp_addr = (unsigned long)base_addr + static_size +
ai->reserved_size;
map_size = dyn_size;
chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
}
/* link the first chunk in */
pcpu_first_chunk = chunk;
pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
/* include all regions of the first chunk */
pcpu_nr_populated += PFN_DOWN(size_sum);
pcpu_stats_chunk_alloc();
trace_percpu_create_chunk(base_addr);
/* we're done */
pcpu_base_addr = base_addr;
}
#ifdef CONFIG_SMP
const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
[PCPU_FC_AUTO] = "auto",
[PCPU_FC_EMBED] = "embed",
[PCPU_FC_PAGE] = "page",
};
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
static int __init percpu_alloc_setup(char *str)
{
if (!str)
return -EINVAL;
if (0)
/* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
else if (!strcmp(str, "embed"))
pcpu_chosen_fc = PCPU_FC_EMBED;
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
else if (!strcmp(str, "page"))
pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
else
pr_warn("unknown allocator %s specified\n", str);
return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);
/*
* pcpu_embed_first_chunk() is used by the generic percpu setup.
* Build it if needed by the arch config or the generic setup is going
* to be used.
*/
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
#define BUILD_EMBED_FIRST_CHUNK
#endif
/* build pcpu_page_first_chunk() iff needed by the arch config */
#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#define BUILD_PAGE_FIRST_CHUNK
#endif
/* pcpu_build_alloc_info() is used by both embed and page first chunk */
#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
/**
* pcpu_build_alloc_info - build alloc_info considering distances between CPUs
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: minimum free size for dynamic allocation in bytes
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
*
* This function determines grouping of units, their mappings to cpus
* and other parameters considering needed percpu size, allocation
* atom size and distances between CPUs.
*
* Groups are always multiples of atom size and CPUs which are of
* LOCAL_DISTANCE both ways are grouped together and share space for
* units in the same group. The returned configuration is guaranteed
* to have CPUs on different nodes on different groups and >=75% usage
* of allocated virtual address space.
*
* RETURNS:
* On success, pointer to the new allocation_info is returned. On
* failure, ERR_PTR value is returned.
*/
static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
static int group_map[NR_CPUS] __initdata;
static int group_cnt[NR_CPUS] __initdata;
static struct cpumask mask __initdata;
const size_t static_size = __per_cpu_end - __per_cpu_start;
int nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size;
int upa, max_upa, best_upa; /* units_per_alloc */
int last_allocs, group, unit;
unsigned int cpu, tcpu;
struct pcpu_alloc_info *ai;
unsigned int *cpu_map;
/* this function may be called multiple times */
memset(group_map, 0, sizeof(group_map));
memset(group_cnt, 0, sizeof(group_cnt));
cpumask_clear(&mask);
/* calculate size_sum and ensure dyn_size is enough for early alloc */
size_sum = PFN_ALIGN(static_size + reserved_size +
max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
dyn_size = size_sum - static_size - reserved_size;
/*
* Determine min_unit_size, alloc_size and max_upa such that
* alloc_size is multiple of atom_size and is the smallest
* which can accommodate 4k aligned segments which are equal to
* or larger than min_unit_size.
*/
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
/* determine the maximum # of units that can fit in an allocation */
alloc_size = roundup(min_unit_size, atom_size);
upa = alloc_size / min_unit_size;
while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
upa--;
max_upa = upa;
cpumask_copy(&mask, cpu_possible_mask);
/* group cpus according to their proximity */
for (group = 0; !cpumask_empty(&mask); group++) {
/* pop the group's first cpu */
cpu = cpumask_first(&mask);
group_map[cpu] = group;
group_cnt[group]++;
cpumask_clear_cpu(cpu, &mask);
for_each_cpu(tcpu, &mask) {
if (!cpu_distance_fn ||
(cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
group_map[tcpu] = group;
group_cnt[group]++;
cpumask_clear_cpu(tcpu, &mask);
}
}
}
nr_groups = group;
/*
* Wasted space is caused by a ratio imbalance of upa to group_cnt.
* Expand the unit_size until we use >= 75% of the units allocated.
* Related to atom_size, which could be much larger than the unit_size.
*/
last_allocs = INT_MAX;
best_upa = 0;
for (upa = max_upa; upa; upa--) {
int allocs = 0, wasted = 0;
if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
continue;
for (group = 0; group < nr_groups; group++) {
int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
allocs += this_allocs;
wasted += this_allocs * upa - group_cnt[group];
}
/*
* Don't accept if wastage is over 1/3. The
* greater-than comparison ensures upa==1 always
* passes the following check.
*/
if (wasted > num_possible_cpus() / 3)
continue;
/* and then don't consume more memory */
if (allocs > last_allocs)
break;
last_allocs = allocs;
best_upa = upa;
}
BUG_ON(!best_upa);
upa = best_upa;
/* allocate and fill alloc_info */
for (group = 0; group < nr_groups; group++)
nr_units += roundup(group_cnt[group], upa);
ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
if (!ai)
return ERR_PTR(-ENOMEM);
cpu_map = ai->groups[0].cpu_map;
for (group = 0; group < nr_groups; group++) {
ai->groups[group].cpu_map = cpu_map;
cpu_map += roundup(group_cnt[group], upa);
}
ai->static_size = static_size;
ai->reserved_size = reserved_size;
ai->dyn_size = dyn_size;
ai->unit_size = alloc_size / upa;
ai->atom_size = atom_size;
ai->alloc_size = alloc_size;
for (group = 0, unit = 0; group < nr_groups; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
/*
* Initialize base_offset as if all groups are located
* back-to-back. The caller should update this to
* reflect actual allocation.
*/
gi->base_offset = unit * ai->unit_size;
for_each_possible_cpu(cpu)
if (group_map[cpu] == group)
gi->cpu_map[gi->nr_units++] = cpu;
gi->nr_units = roundup(gi->nr_units, upa);
unit += gi->nr_units;
}
BUG_ON(unit != nr_units);
return ai;
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: minimum free size for dynamic allocation in bytes
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
* @alloc_fn: function to allocate percpu page
* @free_fn: function to free percpu page
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* If this function is used to setup the first chunk, it is allocated
* by calling @alloc_fn and used as-is without being mapped into
* vmalloc area. Allocations are always whole multiples of @atom_size
* aligned to @atom_size.
*
* This enables the first chunk to piggy back on the linear physical
* mapping which often uses larger page size. Please note that this
* can result in very sparse cpu->unit mapping on NUMA machines thus
* requiring large vmalloc address space. Don't use this allocator if
* vmalloc space is not orders of magnitude larger than distances
* between node memory addresses (ie. 32bit NUMA machines).
*
* @dyn_size specifies the minimum dynamic area size.
*
* If the needed size is smaller than the minimum or specified unit
* size, the leftover is returned using @free_fn.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn)
{
void *base = (void *)ULONG_MAX;
void **areas = NULL;
struct pcpu_alloc_info *ai;
size_t size_sum, areas_size;
unsigned long max_distance;
int group, i, highest_group, rc = 0;
ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
cpu_distance_fn);
if (IS_ERR(ai))
return PTR_ERR(ai);
size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
if (!areas) {
rc = -ENOMEM;
goto out_free;
}
/* allocate, copy and determine base address & max_distance */
highest_group = 0;
for (group = 0; group < ai->nr_groups; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
unsigned int cpu = NR_CPUS;
void *ptr;
for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
cpu = gi->cpu_map[i];
BUG_ON(cpu == NR_CPUS);
/* allocate space for the whole group */
ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
if (!ptr) {
rc = -ENOMEM;
goto out_free_areas;
}
/* kmemleak tracks the percpu allocations separately */
kmemleak_free(ptr);
areas[group] = ptr;
base = min(ptr, base);
if (ptr > areas[highest_group])
highest_group = group;
}
max_distance = areas[highest_group] - base;
max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
/* warn if maximum distance is further than 75% of vmalloc space */
if (max_distance > VMALLOC_TOTAL * 3 / 4) {
pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/* and fail if we have fallback */
rc = -EINVAL;
goto out_free_areas;
#endif
}
/*
* Copy data and free unused parts. This should happen after all
* allocations are complete; otherwise, we may end up with
* overlapping groups.
*/
for (group = 0; group < ai->nr_groups; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
void *ptr = areas[group];
for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
if (gi->cpu_map[i] == NR_CPUS) {
/* unused unit, free whole */
free_fn(ptr, ai->unit_size);
continue;
}
/* copy and return the unused part */
memcpy(ptr, __per_cpu_load, ai->static_size);
free_fn(ptr + size_sum, ai->unit_size - size_sum);
}
}
/* base address is now known, determine group base offsets */
for (group = 0; group < ai->nr_groups; group++) {
ai->groups[group].base_offset = areas[group] - base;
}
pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
pcpu_setup_first_chunk(ai, base);
goto out_free;
out_free_areas:
for (group = 0; group < ai->nr_groups; group++)
if (areas[group])
free_fn(areas[group],
ai->groups[group].nr_units * ai->unit_size);
out_free:
pcpu_free_alloc_info(ai);
if (areas)
memblock_free_early(__pa(areas), areas_size);
return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */
#ifdef BUILD_PAGE_FIRST_CHUNK
/**
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
* @free_fn: function to free percpu page, always called with PAGE_SIZE
* @populate_pte_fn: function to populate pte
*
* This is a helper to ease setting up page-remapped first percpu
* chunk and can be called where pcpu_setup_first_chunk() is expected.
*
* This is the basic allocator. Static percpu area is allocated
* page-by-page into vmalloc area.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init pcpu_page_first_chunk(size_t reserved_size,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
static struct vm_struct vm;
struct pcpu_alloc_info *ai;
char psize_str[16];
int unit_pages;
size_t pages_size;
struct page **pages;
int unit, i, j, rc = 0;
int upa;
int nr_g0_units;
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
if (IS_ERR(ai))
return PTR_ERR(ai);
BUG_ON(ai->nr_groups != 1);
upa = ai->alloc_size/ai->unit_size;
nr_g0_units = roundup(num_possible_cpus(), upa);
if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
pcpu_free_alloc_info(ai);
return -EINVAL;
}
unit_pages = ai->unit_size >> PAGE_SHIFT;
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
if (!pages)
panic("%s: Failed to allocate %zu bytes\n", __func__,
pages_size);
/* allocate pages */
j = 0;
for (unit = 0; unit < num_possible_cpus(); unit++) {
unsigned int cpu = ai->groups[0].cpu_map[unit];
for (i = 0; i < unit_pages; i++) {
void *ptr;
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
if (!ptr) {
pr_warn("failed to allocate %s page for cpu%u\n",
psize_str, cpu);
goto enomem;
}
/* kmemleak tracks the percpu allocations separately */
kmemleak_free(ptr);
pages[j++] = virt_to_page(ptr);
}
}
/* allocate vm area, map the pages and copy static data */
vm.flags = VM_ALLOC;
vm.size = num_possible_cpus() * ai->unit_size;
vm_area_register_early(&vm, PAGE_SIZE);
for (unit = 0; unit < num_possible_cpus(); unit++) {
unsigned long unit_addr =
(unsigned long)vm.addr + unit * ai->unit_size;
for (i = 0; i < unit_pages; i++)
populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
/* pte already populated, the following shouldn't fail */
rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
unit_pages);
if (rc < 0)
panic("failed to map percpu area, err=%d\n", rc);
/*
* FIXME: Archs with virtual cache should flush local
* cache for the linear mapping here - something
* equivalent to flush_cache_vmap() on the local cpu.
* flush_cache_vmap() can't be used as most supporting
* data structures are not set up yet.
*/
/* copy static data */
memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
}
/* we're ready, commit */
pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
unit_pages, psize_str, ai->static_size,
ai->reserved_size, ai->dyn_size);
pcpu_setup_first_chunk(ai, vm.addr);
goto out_free_ar;
enomem:
while (--j >= 0)
free_fn(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
memblock_free_early(__pa(pages), pages_size);
pcpu_free_alloc_info(ai);
return rc;
}
#endif /* BUILD_PAGE_FIRST_CHUNK */
#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
* Generic SMP percpu area setup.
*
* The embedding helper is used because its behavior closely resembles
* the original non-dynamic generic percpu area setup. This is
* important because many archs have addressing restrictions and might
* fail if the percpu area is located far away from the previous
* location. As an added bonus, in non-NUMA cases, embedding is
* generally a good idea TLB-wise because percpu area can piggy back
* on the physical linear memory mapping which uses large page
* mappings on applicable archs.
*/
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
size_t align)
{
return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
}
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
memblock_free_early(__pa(ptr), size);
}
void __init setup_per_cpu_areas(void)
{
unsigned long delta;
unsigned int cpu;
int rc;
/*
* Always reserve area for module percpu variables. That's
* what the legacy allocator did.
*/
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
if (rc < 0)
panic("Failed to initialize percpu areas.");
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu)
__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
#else /* CONFIG_SMP */
/*
* UP percpu area setup.
*
* UP always uses km-based percpu allocator with identity mapping.
* Static percpu variables are indistinguishable from the usual static
* variables and don't require any special preparation.
*/
void __init setup_per_cpu_areas(void)
{
const size_t unit_size =
roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
PERCPU_DYNAMIC_RESERVE));
struct pcpu_alloc_info *ai;
void *fc;
ai = pcpu_alloc_alloc_info(1, 1);
fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
/* kmemleak tracks the percpu allocations separately */
kmemleak_free(fc);
ai->dyn_size = unit_size;
ai->unit_size = unit_size;
ai->atom_size = unit_size;
ai->alloc_size = unit_size;
ai->groups[0].nr_units = 1;
ai->groups[0].cpu_map[0] = 0;
pcpu_setup_first_chunk(ai, fc);
pcpu_free_alloc_info(ai);
}
#endif /* CONFIG_SMP */
/*
* pcpu_nr_pages - calculate total number of populated backing pages
*
* This reflects the number of pages populated to back chunks. Metadata is
* excluded in the number exposed in meminfo as the number of backing pages
* scales with the number of cpus and can quickly outweigh the memory used for
* metadata. It also keeps this calculation nice and simple.
*
* RETURNS:
* Total number of populated backing pages in use by the allocator.
*/
unsigned long pcpu_nr_pages(void)
{
return pcpu_nr_populated * pcpu_nr_units;
}
/*
* Percpu allocator is initialized early during boot when neither slab or
* workqueue is available. Plug async management until everything is up
* and running.
*/
static int __init percpu_enable_async(void)
{
pcpu_async_enabled = true;
return 0;
}
subsys_initcall(percpu_enable_async);
// SPDX-License-Identifier: GPL-2.0-only
/*
* Network port table
*
* SELinux must keep a mapping of network ports to labels/SIDs. This
* mapping is maintained as part of the normal policy but a fast cache is
* needed to reduce the lookup overhead.
*
* Author: Paul Moore <paul@paul-moore.com>
*
* This code is heavily based on the "netif" concept originally developed by
* James Morris <jmorris@redhat.com>
* (see security/selinux/netif.c for more information)
*/
/*
* (c) Copyright Hewlett-Packard Development Company, L.P., 2008
*/
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include "netport.h"
#include "objsec.h"
#define SEL_NETPORT_HASH_SIZE 256
#define SEL_NETPORT_HASH_BKT_LIMIT 16
struct sel_netport_bkt {
int size;
struct list_head list;
};
struct sel_netport {
struct netport_security_struct psec;
struct list_head list;
struct rcu_head rcu;
};
/* NOTE: we are using a combined hash table for both IPv4 and IPv6, the reason
* for this is that I suspect most users will not make heavy use of both
* address families at the same time so one table will usually end up wasted,
* if this becomes a problem we can always add a hash table for each address
* family later */
static DEFINE_SPINLOCK(sel_netport_lock);
static struct sel_netport_bkt sel_netport_hash[SEL_NETPORT_HASH_SIZE];
/**
* sel_netport_hashfn - Hashing function for the port table
* @pnum: port number
*
* Description:
* This is the hashing function for the port table, it returns the bucket
* number for the given port.
*
*/
static unsigned int sel_netport_hashfn(u16 pnum)
{
return (pnum & (SEL_NETPORT_HASH_SIZE - 1));
}
/**
* sel_netport_find - Search for a port record
* @protocol: protocol
* @port: pnum
*
* Description:
* Search the network port table and return the matching record. If an entry
* can not be found in the table return NULL.
*
*/
static struct sel_netport *sel_netport_find(u8 protocol, u16 pnum)
{
unsigned int idx;
struct sel_netport *port;
idx = sel_netport_hashfn(pnum);
list_for_each_entry_rcu(port, &sel_netport_hash[idx].list, list) if (port->psec.port == pnum && port->psec.protocol == protocol)
return port;
return NULL;
}
/**
* sel_netport_insert - Insert a new port into the table
* @port: the new port record
*
* Description:
* Add a new port record to the network address hash table.
*
*/
static void sel_netport_insert(struct sel_netport *port)
{
unsigned int idx;
/* we need to impose a limit on the growth of the hash table so check
* this bucket to make sure it is within the specified bounds */
idx = sel_netport_hashfn(port->psec.port);
list_add_rcu(&port->list, &sel_netport_hash[idx].list);
if (sel_netport_hash[idx].size == SEL_NETPORT_HASH_BKT_LIMIT) {
struct sel_netport *tail;
tail = list_entry(
rcu_dereference_protected(
sel_netport_hash[idx].list.prev,
lockdep_is_held(&sel_netport_lock)),
struct sel_netport, list);
list_del_rcu(&tail->list);
kfree_rcu(tail, rcu);
} else
sel_netport_hash[idx].size++;
}
/**
* sel_netport_sid_slow - Lookup the SID of a network address using the policy
* @protocol: protocol
* @pnum: port
* @sid: port SID
*
* Description:
* This function determines the SID of a network port by querying the security
* policy. The result is added to the network port table to speedup future
* queries. Returns zero on success, negative values on failure.
*
*/
static int sel_netport_sid_slow(u8 protocol, u16 pnum, u32 *sid)
{
int ret;
struct sel_netport *port;
struct sel_netport *new;
spin_lock_bh(&sel_netport_lock);
port = sel_netport_find(protocol, pnum);
if (port != NULL) { *sid = port->psec.sid;
spin_unlock_bh(&sel_netport_lock);
return 0;
}
ret = security_port_sid(&selinux_state, protocol, pnum, sid);
if (ret != 0)
goto out;
new = kzalloc(sizeof(*new), GFP_ATOMIC);
if (new) {
new->psec.port = pnum;
new->psec.protocol = protocol;
new->psec.sid = *sid;
sel_netport_insert(new);
}
out:
spin_unlock_bh(&sel_netport_lock);
if (unlikely(ret))
pr_warn("SELinux: failure in %s(), unable to determine network port label\n",
__func__);
return ret;
}
/**
* sel_netport_sid - Lookup the SID of a network port
* @protocol: protocol
* @pnum: port
* @sid: port SID
*
* Description:
* This function determines the SID of a network port using the fastest method
* possible. First the port table is queried, but if an entry can't be found
* then the policy is queried and the result is added to the table to speedup
* future queries. Returns zero on success, negative values on failure.
*
*/
int sel_netport_sid(u8 protocol, u16 pnum, u32 *sid)
{
struct sel_netport *port;
rcu_read_lock();
port = sel_netport_find(protocol, pnum); if (port != NULL) { *sid = port->psec.sid;
rcu_read_unlock();
return 0;
}
rcu_read_unlock();
return sel_netport_sid_slow(protocol, pnum, sid);
}
/**
* sel_netport_flush - Flush the entire network port table
*
* Description:
* Remove all entries from the network address table.
*
*/
void sel_netport_flush(void)
{
unsigned int idx;
struct sel_netport *port, *port_tmp;
spin_lock_bh(&sel_netport_lock);
for (idx = 0; idx < SEL_NETPORT_HASH_SIZE; idx++) {
list_for_each_entry_safe(port, port_tmp,
&sel_netport_hash[idx].list, list) {
list_del_rcu(&port->list);
kfree_rcu(port, rcu);
}
sel_netport_hash[idx].size = 0;
}
spin_unlock_bh(&sel_netport_lock);
}
static __init int sel_netport_init(void)
{
int iter;
if (!selinux_enabled_boot)
return 0;
for (iter = 0; iter < SEL_NETPORT_HASH_SIZE; iter++) {
INIT_LIST_HEAD(&sel_netport_hash[iter].list);
sel_netport_hash[iter].size = 0;
}
return 0;
}
__initcall(sel_netport_init);
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PROCESSOR_H
#define _ASM_X86_PROCESSOR_H
#include <asm/processor-flags.h>
/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
struct io_bitmap;
struct vm86;
#include <asm/math_emu.h>
#include <asm/segment.h>
#include <asm/types.h>
#include <uapi/asm/sigcontext.h>
#include <asm/current.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>
#include <asm/percpu.h>
#include <asm/msr.h>
#include <asm/desc_defs.h>
#include <asm/nops.h>
#include <asm/special_insns.h>
#include <asm/fpu/types.h>
#include <asm/unwind_hints.h>
#include <asm/vmxfeatures.h>
#include <asm/vdso/processor.h>
#include <linux/personality.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/math64.h>
#include <linux/err.h>
#include <linux/irqflags.h>
#include <linux/mem_encrypt.h>
/*
* We handle most unaligned accesses in hardware. On the other hand
* unaligned DMA can be quite expensive on some Nehalem processors.
*
* Based on this we disable the IP header alignment in network drivers.
*/
#define NET_IP_ALIGN 0
#define HBP_NUM 4
/*
* These alignment constraints are for performance in the vSMP case,
* but in the task_struct case we must also meet hardware imposed
* alignment requirements of the FPU state:
*/
#ifdef CONFIG_X86_VSMP
# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
#else
# define ARCH_MIN_TASKALIGN __alignof__(union fpregs_state)
# define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
enum tlb_infos {
ENTRIES,
NR_INFO
};
extern u16 __read_mostly tlb_lli_4k[NR_INFO];
extern u16 __read_mostly tlb_lli_2m[NR_INFO];
extern u16 __read_mostly tlb_lli_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_4k[NR_INFO];
extern u16 __read_mostly tlb_lld_2m[NR_INFO];
extern u16 __read_mostly tlb_lld_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_1g[NR_INFO];
/*
* CPU type and hardware bug flags. Kept separately for each CPU.
* Members of this structure are referenced in head_32.S, so think twice
* before touching them. [mj]
*/
struct cpuinfo_x86 {
__u8 x86; /* CPU family */
__u8 x86_vendor; /* CPU vendor */
__u8 x86_model;
__u8 x86_stepping;
#ifdef CONFIG_X86_64
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
int x86_tlbsize;
#endif
#ifdef CONFIG_X86_VMX_FEATURE_NAMES
__u32 vmx_capability[NVMXINTS];
#endif
__u8 x86_virt_bits;
__u8 x86_phys_bits;
/* CPUID returned core id bits: */
__u8 x86_coreid_bits;
__u8 cu_id;
/* Max extended CPUID function supported: */
__u32 extended_cpuid_level;
/* Maximum supported CPUID level, -1=no CPUID: */
int cpuid_level;
/*
* Align to size of unsigned long because the x86_capability array
* is passed to bitops which require the alignment. Use unnamed
* union to enforce the array is aligned to size of unsigned long.
*/
union {
__u32 x86_capability[NCAPINTS + NBUGINTS];
unsigned long x86_capability_alignment;
};
char x86_vendor_id[16];
char x86_model_id[64];
/* in KB - valid for CPUS which support this call: */
unsigned int x86_cache_size;
int x86_cache_alignment; /* In bytes */
/* Cache QoS architectural values, valid only on the BSP: */
int x86_cache_max_rmid; /* max index */
int x86_cache_occ_scale; /* scale to bytes */
int x86_cache_mbm_width_offset;
int x86_power;
unsigned long loops_per_jiffy;
/* cpuid returned max cores value: */
u16 x86_max_cores;
u16 apicid;
u16 initial_apicid;
u16 x86_clflush_size;
/* number of cores as seen by the OS: */
u16 booted_cores;
/* Physical processor id: */
u16 phys_proc_id;
/* Logical processor id: */
u16 logical_proc_id;
/* Core id: */
u16 cpu_core_id;
u16 cpu_die_id;
u16 logical_die_id;
/* Index into per_cpu list: */
u16 cpu_index;
/* Is SMT active on this core? */
bool smt_active;
u32 microcode;
/* Address space bits used by the cache internally */
u8 x86_cache_bits;
unsigned initialized : 1;
} __randomize_layout;
struct cpuid_regs {
u32 eax, ebx, ecx, edx;
};
enum cpuid_regs_idx {
CPUID_EAX = 0,
CPUID_EBX,
CPUID_ECX,
CPUID_EDX,
};
#define X86_VENDOR_INTEL 0
#define X86_VENDOR_CYRIX 1
#define X86_VENDOR_AMD 2
#define X86_VENDOR_UMC 3
#define X86_VENDOR_CENTAUR 5
#define X86_VENDOR_TRANSMETA 7
#define X86_VENDOR_NSC 8
#define X86_VENDOR_HYGON 9
#define X86_VENDOR_ZHAOXIN 10
#define X86_VENDOR_NUM 11
#define X86_VENDOR_UNKNOWN 0xff
/*
* capabilities of CPUs
*/
extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;
extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
#ifdef CONFIG_SMP
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
#define cpu_data(cpu) per_cpu(cpu_info, cpu)
#else
#define cpu_info boot_cpu_data
#define cpu_data(cpu) boot_cpu_data
#endif
extern const struct seq_operations cpuinfo_op;
#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
extern void cpu_detect(struct cpuinfo_x86 *c);
static inline unsigned long long l1tf_pfn_limit(void)
{
return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT);
}
extern void early_cpu_init(void);
extern void identify_boot_cpu(void);
extern void identify_secondary_cpu(struct cpuinfo_x86 *);
extern void print_cpu_info(struct cpuinfo_x86 *);
void print_cpu_msr(struct cpuinfo_x86 *);
#ifdef CONFIG_X86_32
extern int have_cpuid_p(void);
#else
static inline int have_cpuid_p(void)
{
return 1;
}
#endif
static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
/* ecx is often an input as well as an output. */
asm volatile("cpuid"
: "=a" (*eax),
"=b" (*ebx),
"=c" (*ecx),
"=d" (*edx)
: "0" (*eax), "2" (*ecx)
: "memory");
}
#define native_cpuid_reg(reg) \
static inline unsigned int native_cpuid_##reg(unsigned int op) \
{ \
unsigned int eax = op, ebx, ecx = 0, edx; \
\
native_cpuid(&eax, &ebx, &ecx, &edx); \
\
return reg; \
}
/*
* Native CPUID functions returning a single datum.
*/
native_cpuid_reg(eax)
native_cpuid_reg(ebx)
native_cpuid_reg(ecx)
native_cpuid_reg(edx)
/*
* Friendlier CR3 helpers.
*/
static inline unsigned long read_cr3_pa(void)
{
return __read_cr3() & CR3_ADDR_MASK;
}
static inline unsigned long native_read_cr3_pa(void)
{
return __native_read_cr3() & CR3_ADDR_MASK;
}
static inline void load_cr3(pgd_t *pgdir)
{
write_cr3(__sme_pa(pgdir));
}
/*
* Note that while the legacy 'TSS' name comes from 'Task State Segment',
* on modern x86 CPUs the TSS also holds information important to 64-bit mode,
* unrelated to the task-switch mechanism:
*/
#ifdef CONFIG_X86_32
/* This is the TSS defined by the hardware. */
struct x86_hw_tss {
unsigned short back_link, __blh;
unsigned long sp0;
unsigned short ss0, __ss0h;
unsigned long sp1;
/*
* We don't use ring 1, so ss1 is a convenient scratch space in
* the same cacheline as sp0. We use ss1 to cache the value in
* MSR_IA32_SYSENTER_CS. When we context switch
* MSR_IA32_SYSENTER_CS, we first check if the new value being
* written matches ss1, and, if it's not, then we wrmsr the new
* value and update ss1.
*
* The only reason we context switch MSR_IA32_SYSENTER_CS is
* that we set it to zero in vm86 tasks to avoid corrupting the
* stack if we were to go through the sysenter path from vm86
* mode.
*/
unsigned short ss1; /* MSR_IA32_SYSENTER_CS */
unsigned short __ss1h;
unsigned long sp2;
unsigned short ss2, __ss2h;
unsigned long __cr3;
unsigned long ip;
unsigned long flags;
unsigned long ax;
unsigned long cx;
unsigned long dx;
unsigned long bx;
unsigned long sp;
unsigned long bp;
unsigned long si;
unsigned long di;
unsigned short es, __esh;
unsigned short cs, __csh;
unsigned short ss, __ssh;
unsigned short ds, __dsh;
unsigned short fs, __fsh;
unsigned short gs, __gsh;
unsigned short ldt, __ldth;
unsigned short trace;
unsigned short io_bitmap_base;
} __attribute__((packed));
#else
struct x86_hw_tss {
u32 reserved1;
u64 sp0;
u64 sp1;
/*
* Since Linux does not use ring 2, the 'sp2' slot is unused by
* hardware. entry_SYSCALL_64 uses it as scratch space to stash
* the user RSP value.
*/
u64 sp2;
u64 reserved2;
u64 ist[7];
u32 reserved3;
u32 reserved4;
u16 reserved5;
u16 io_bitmap_base;
} __attribute__((packed));
#endif
/*
* IO-bitmap sizes:
*/
#define IO_BITMAP_BITS 65536
#define IO_BITMAP_BYTES (IO_BITMAP_BITS / BITS_PER_BYTE)
#define IO_BITMAP_LONGS (IO_BITMAP_BYTES / sizeof(long))
#define IO_BITMAP_OFFSET_VALID_MAP \
(offsetof(struct tss_struct, io_bitmap.bitmap) - \
offsetof(struct tss_struct, x86_tss))
#define IO_BITMAP_OFFSET_VALID_ALL \
(offsetof(struct tss_struct, io_bitmap.mapall) - \
offsetof(struct tss_struct, x86_tss))
#ifdef CONFIG_X86_IOPL_IOPERM
/*
* sizeof(unsigned long) coming from an extra "long" at the end of the
* iobitmap. The limit is inclusive, i.e. the last valid byte.
*/
# define __KERNEL_TSS_LIMIT \
(IO_BITMAP_OFFSET_VALID_ALL + IO_BITMAP_BYTES + \
sizeof(unsigned long) - 1)
#else
# define __KERNEL_TSS_LIMIT \
(offsetof(struct tss_struct, x86_tss) + sizeof(struct x86_hw_tss) - 1)
#endif
/* Base offset outside of TSS_LIMIT so unpriviledged IO causes #GP */
#define IO_BITMAP_OFFSET_INVALID (__KERNEL_TSS_LIMIT + 1)
struct entry_stack {
char stack[PAGE_SIZE];
};
struct entry_stack_page {
struct entry_stack stack;
} __aligned(PAGE_SIZE);
/*
* All IO bitmap related data stored in the TSS:
*/
struct x86_io_bitmap {
/* The sequence number of the last active bitmap. */
u64 prev_sequence;
/*
* Store the dirty size of the last io bitmap offender. The next
* one will have to do the cleanup as the switch out to a non io
* bitmap user will just set x86_tss.io_bitmap_base to a value
* outside of the TSS limit. So for sane tasks there is no need to
* actually touch the io_bitmap at all.
*/
unsigned int prev_max;
/*
* The extra 1 is there because the CPU will access an
* additional byte beyond the end of the IO permission
* bitmap. The extra byte must be all 1 bits, and must
* be within the limit.
*/
unsigned long bitmap[IO_BITMAP_LONGS + 1];
/*
* Special I/O bitmap to emulate IOPL(3). All bytes zero,
* except the additional byte at the end.
*/
unsigned long mapall[IO_BITMAP_LONGS + 1];
};
struct tss_struct {
/*
* The fixed hardware portion. This must not cross a page boundary
* at risk of violating the SDM's advice and potentially triggering
* errata.
*/
struct x86_hw_tss x86_tss;
struct x86_io_bitmap io_bitmap;
} __aligned(PAGE_SIZE);
DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
/* Per CPU interrupt stacks */
struct irq_stack {
char stack[IRQ_STACK_SIZE];
} __aligned(IRQ_STACK_SIZE);
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#ifdef CONFIG_X86_64
struct fixed_percpu_data {
/*
* GCC hardcodes the stack canary as %gs:40. Since the
* irq_stack is the object at %gs:0, we reserve the bottom
* 48 bytes of the irq stack for the canary.
*
* Once we are willing to require -mstack-protector-guard-symbol=
* support for x86_64 stackprotector, we can get rid of this.
*/
char gs_base[40];
unsigned long stack_canary;
};
DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible;
DECLARE_INIT_PER_CPU(fixed_percpu_data);
static inline unsigned long cpu_kernelmode_gs_base(int cpu)
{
return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
}
DECLARE_PER_CPU(void *, hardirq_stack_ptr);
DECLARE_PER_CPU(bool, hardirq_stack_inuse);
extern asmlinkage void ignore_sysret(void);
/* Save actual FS/GS selectors and bases to current->thread */
void current_save_fsgs(void);
#else /* X86_64 */
#ifdef CONFIG_STACKPROTECTOR
DECLARE_PER_CPU(unsigned long, __stack_chk_guard);
#endif
DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
#endif /* !X86_64 */
extern unsigned int fpu_kernel_xstate_size;
extern unsigned int fpu_user_xstate_size;
struct perf_event;
struct thread_struct {
/* Cached TLS descriptors: */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
#ifdef CONFIG_X86_32
unsigned long sp0;
#endif
unsigned long sp;
#ifdef CONFIG_X86_32
unsigned long sysenter_cs;
#else
unsigned short es;
unsigned short ds;
unsigned short fsindex;
unsigned short gsindex;
#endif
#ifdef CONFIG_X86_64
unsigned long fsbase;
unsigned long gsbase;
#else
/*
* XXX: this could presumably be unsigned short. Alternatively,
* 32-bit kernels could be taught to use fsindex instead.
*/
unsigned long fs;
unsigned long gs;
#endif
/* Save middle states of ptrace breakpoints */
struct perf_event *ptrace_bps[HBP_NUM];
/* Debug status used for traps, single steps, etc... */
unsigned long virtual_dr6;
/* Keep track of the exact dr7 value set by the user */
unsigned long ptrace_dr7;
/* Fault info: */
unsigned long cr2;
unsigned long trap_nr;
unsigned long error_code;
#ifdef CONFIG_VM86
/* Virtual 86 mode info */
struct vm86 *vm86;
#endif
/* IO permissions: */
struct io_bitmap *io_bitmap;
/*
* IOPL. Privilege level dependent I/O permission which is
* emulated via the I/O bitmap to prevent user space from disabling
* interrupts.
*/
unsigned long iopl_emul;
unsigned int iopl_warn:1;
unsigned int sig_on_uaccess_err:1;
/*
* Protection Keys Register for Userspace. Loaded immediately on
* context switch. Store it in thread_struct to avoid a lookup in
* the tasks's FPU xstate buffer. This value is only valid when a
* task is scheduled out. For 'current' the authoritative source of
* PKRU is the hardware itself.
*/
u32 pkru;
/* Floating point and extended processor state */
struct fpu fpu;
/*
* WARNING: 'fpu' is dynamically-sized. It *MUST* be at
* the end.
*/
};
/* Whitelist the FPU state from the task_struct for hardened usercopy. */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
unsigned long *size)
{
*offset = offsetof(struct thread_struct, fpu.state);
*size = fpu_kernel_xstate_size;
}
static inline void
native_load_sp0(unsigned long sp0)
{
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
static __always_inline void native_swapgs(void)
{
#ifdef CONFIG_X86_64
asm volatile("swapgs" ::: "memory");
#endif
}
static inline unsigned long current_top_of_stack(void)
{
/*
* We can't read directly from tss.sp0: sp0 on x86_32 is special in
* and around vm86 mode and sp0 on x86_64 is special because of the
* entry trampoline.
*/
return this_cpu_read_stable(cpu_current_top_of_stack);
}
static inline bool on_thread_stack(void)
{
return (unsigned long)(current_top_of_stack() -
current_stack_pointer) < THREAD_SIZE;
}
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define __cpuid native_cpuid
static inline void load_sp0(unsigned long sp0)
{
native_load_sp0(sp0);
}
#endif /* CONFIG_PARAVIRT_XXL */
/* Free all resources held by a thread. */
extern void release_thread(struct task_struct *);
unsigned long get_wchan(struct task_struct *p);
/*
* Generic CPUID function
* clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
* resulting in stale register contents being returned.
*/
static inline void cpuid(unsigned int op,
unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
*eax = op;
*ecx = 0;
__cpuid(eax, ebx, ecx, edx);
}
/* Some CPUID calls want 'count' to be placed in ecx */
static inline void cpuid_count(unsigned int op, int count,
unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
*eax = op;
*ecx = count;
__cpuid(eax, ebx, ecx, edx);
}
/*
* CPUID functions returning a single datum
*/
static inline unsigned int cpuid_eax(unsigned int op)
{
unsigned int eax, ebx, ecx, edx;
cpuid(op, &eax, &ebx, &ecx, &edx);
return eax;
}
static inline unsigned int cpuid_ebx(unsigned int op)
{
unsigned int eax, ebx, ecx, edx;
cpuid(op, &eax, &ebx, &ecx, &edx);
return ebx;
}
static inline unsigned int cpuid_ecx(unsigned int op)
{
unsigned int eax, ebx, ecx, edx;
cpuid(op, &eax, &ebx, &ecx, &edx);
return ecx;
}
static inline unsigned int cpuid_edx(unsigned int op)
{
unsigned int eax, ebx, ecx, edx;
cpuid(op, &eax, &ebx, &ecx, &edx);
return edx;
}
extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void amd_e400_c1e_apic_setup(void);
extern unsigned long boot_option_idle_override;
enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
IDLE_POLL};
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
/* Defined in head.S */
extern struct desc_ptr early_gdt_descr;
extern void switch_to_new_gdt(int);
extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);
extern void cpu_init_secondary(void);
extern void cpu_init_exception_handling(void);
extern void cr4_init(void);
static inline unsigned long get_debugctlmsr(void)
{
unsigned long debugctlmsr = 0;
#ifndef CONFIG_X86_DEBUGCTLMSR
if (boot_cpu_data.x86 < 6)
return 0;
#endif
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
return debugctlmsr;
}
static inline void update_debugctlmsr(unsigned long debugctlmsr)
{
#ifndef CONFIG_X86_DEBUGCTLMSR
if (boot_cpu_data.x86 < 6)
return;
#endif
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
}
extern void set_task_blockstep(struct task_struct *task, bool on);
/* Boot loader type from the setup header: */
extern int bootloader_type;
extern int bootloader_version;
extern char ignore_fpu_irq;
#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
#define ARCH_HAS_PREFETCHW
#define ARCH_HAS_SPINLOCK_PREFETCH
#ifdef CONFIG_X86_32
# define BASE_PREFETCH ""
# define ARCH_HAS_PREFETCH
#else
# define BASE_PREFETCH "prefetcht0 %P1"
#endif
/*
* Prefetch instructions for Pentium III (+) and AMD Athlon (+)
*
* It's not worth to care about 3dnow prefetches for the K6
* because they are microcoded there and very slow.
*/
static inline void prefetch(const void *x)
{
alternative_input(BASE_PREFETCH, "prefetchnta %P1",
X86_FEATURE_XMM,
"m" (*(const char *)x));
}
/*
* 3dnow prefetch to get an exclusive cache line.
* Useful for spinlocks to avoid one state transition in the
* cache coherency protocol:
*/
static __always_inline void prefetchw(const void *x)
{
alternative_input(BASE_PREFETCH, "prefetchw %P1",
X86_FEATURE_3DNOWPREFETCH,
"m" (*(const char *)x));
}
static inline void spin_lock_prefetch(const void *x)
{
prefetchw(x);
}
#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
TOP_OF_KERNEL_STACK_PADDING)
#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
#define task_pt_regs(task) \
({ \
unsigned long __ptr = (unsigned long)task_stack_page(task); \
__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
((struct pt_regs *)__ptr) - 1; \
})
#ifdef CONFIG_X86_32
#define INIT_THREAD { \
.sp0 = TOP_OF_INIT_STACK, \
.sysenter_cs = __KERNEL_CS, \
}
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
#define INIT_THREAD { }
extern unsigned long KSTK_ESP(struct task_struct *task);
#endif /* CONFIG_X86_64 */
extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp);
/*
* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3))
#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW)
#define KSTK_EIP(task) (task_pt_regs(task)->ip)
/* Get/set a process' ability to use the timestamp counter instruction */
#define GET_TSC_CTL(adr) get_tsc_mode((adr))
#define SET_TSC_CTL(val) set_tsc_mode((val))
extern int get_tsc_mode(unsigned long adr);
extern int set_tsc_mode(unsigned int val);
DECLARE_PER_CPU(u64, msr_misc_features_shadow);
extern u16 get_llc_id(unsigned int cpu);
#ifdef CONFIG_CPU_SUP_AMD
extern u32 amd_get_nodes_per_socket(void);
extern u32 amd_get_highest_perf(void);
#else
static inline u32 amd_get_nodes_per_socket(void) { return 0; }
static inline u32 amd_get_highest_perf(void) { return 0; }
#endif
static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
{
uint32_t base, eax, signature[3];
for (base = 0x40000000; base < 0x40010000; base += 0x100) {
cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
if (!memcmp(sig, signature, 12) &&
(leaves == 0 || ((eax - base) >= leaves)))
return base;
}
return 0;
}
extern unsigned long arch_align_stack(unsigned long sp);
void free_init_pages(const char *what, unsigned long begin, unsigned long end);
extern void free_kernel_image_pages(const char *what, void *begin, void *end);
void default_idle(void);
#ifdef CONFIG_XEN
bool xen_set_default_idle(void);
#else
#define xen_set_default_idle 0
#endif
void stop_this_cpu(void *dummy);
void microcode_check(void);
enum l1tf_mitigations {
L1TF_MITIGATION_OFF,
L1TF_MITIGATION_FLUSH_NOWARN,
L1TF_MITIGATION_FLUSH,
L1TF_MITIGATION_FLUSH_NOSMT,
L1TF_MITIGATION_FULL,
L1TF_MITIGATION_FULL_FORCE
};
extern enum l1tf_mitigations l1tf_mitigation;
enum mds_mitigations {
MDS_MITIGATION_OFF,
MDS_MITIGATION_FULL,
MDS_MITIGATION_VMWERV,
};
#endif /* _ASM_X86_PROCESSOR_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Kernel internal timers
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
*
* 1997-09-10 Updated NTP code according to technical memorandum Jan '96
* "A Kernel Model for Precision Timekeeping" by Dave Mills
* 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
* serialize accesses to xtime/lost_ticks).
* Copyright (C) 1998 Andrea Arcangeli
* 1999-03-10 Improved NTP compatibility by Ulrich Windl
* 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
* 2000-10-05 Implemented scalable SMP per-CPU timer handling.
* Copyright (C) 2000, 2001, 2002 Ingo Molnar
* Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
*/
#include <linux/kernel_stat.h>
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pid_namespace.h>
#include <linux/notifier.h>
#include <linux/thread_info.h>
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/random.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>
#include "tick-internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>
__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
/*
* The timer wheel has LVL_DEPTH array levels. Each level provides an array of
* LVL_SIZE buckets. Each level is driven by its own clock and therefor each
* level has a different granularity.
*
* The level granularity is: LVL_CLK_DIV ^ lvl
* The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
*
* The array level of a newly armed timer depends on the relative expiry
* time. The farther the expiry time is away the higher the array level and
* therefor the granularity becomes.
*
* Contrary to the original timer wheel implementation, which aims for 'exact'
* expiry of the timers, this implementation removes the need for recascading
* the timers into the lower array levels. The previous 'classic' timer wheel
* implementation of the kernel already violated the 'exact' expiry by adding
* slack to the expiry time to provide batched expiration. The granularity
* levels provide implicit batching.
*
* This is an optimization of the original timer wheel implementation for the
* majority of the timer wheel use cases: timeouts. The vast majority of
* timeout timers (networking, disk I/O ...) are canceled before expiry. If
* the timeout expires it indicates that normal operation is disturbed, so it
* does not matter much whether the timeout comes with a slight delay.
*
* The only exception to this are networking timers with a small expiry
* time. They rely on the granularity. Those fit into the first wheel level,
* which has HZ granularity.
*
* We don't have cascading anymore. timers with a expiry time above the
* capacity of the last wheel level are force expired at the maximum timeout
* value of the last wheel level. From data sampling we know that the maximum
* value observed is 5 days (network connection tracking), so this should not
* be an issue.
*
* The currently chosen array constants values are a good compromise between
* array size and granularity.
*
* This results in the following granularity and range levels:
*
* HZ 1000 steps
* Level Offset Granularity Range
* 0 0 1 ms 0 ms - 63 ms
* 1 64 8 ms 64 ms - 511 ms
* 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s)
* 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s)
* 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m)
* 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m)
* 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h)
* 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d)
* 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d)
*
* HZ 300
* Level Offset Granularity Range
* 0 0 3 ms 0 ms - 210 ms
* 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s)
* 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s)
* 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m)
* 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m)
* 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h)
* 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h)
* 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d)
* 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
*
* HZ 250
* Level Offset Granularity Range
* 0 0 4 ms 0 ms - 255 ms
* 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s)
* 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s)
* 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m)
* 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m)
* 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h)
* 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h)
* 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d)
* 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
*
* HZ 100
* Level Offset Granularity Range
* 0 0 10 ms 0 ms - 630 ms
* 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s)
* 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s)
* 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m)
* 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m)
* 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h)
* 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d)
* 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
*/
/* Clock divisor for the next level */
#define LVL_CLK_SHIFT 3
#define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT)
#define LVL_CLK_MASK (LVL_CLK_DIV - 1)
#define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n) (1UL << LVL_SHIFT(n))
/*
* The time start value for each level to select the bucket at enqueue
* time. We start from the last possible delta of the previous level
* so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
*/
#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
/* Size of each clock level */
#define LVL_BITS 6
#define LVL_SIZE (1UL << LVL_BITS)
#define LVL_MASK (LVL_SIZE - 1)
#define LVL_OFFS(n) ((n) * LVL_SIZE)
/* Level depth */
#if HZ > 100
# define LVL_DEPTH 9
# else
# define LVL_DEPTH 8
#endif
/* The cutoff (max. capacity of the wheel) */
#define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH))
#define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
/*
* The resulting wheel size. If NOHZ is configured we allocate two
* wheels so we have a separate storage for the deferrable timers.
*/
#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
#ifdef CONFIG_NO_HZ_COMMON
# define NR_BASES 2
# define BASE_STD 0
# define BASE_DEF 1
#else
# define NR_BASES 1
# define BASE_STD 0
# define BASE_DEF 0
#endif
struct timer_base {
raw_spinlock_t lock;
struct timer_list *running_timer;
#ifdef CONFIG_PREEMPT_RT
spinlock_t expiry_lock;
atomic_t timer_waiters;
#endif
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
bool next_expiry_recalc;
bool is_idle;
bool timers_pending;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
#ifdef CONFIG_NO_HZ_COMMON
static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
static DEFINE_MUTEX(timer_keys_mutex);
static void timer_update_keys(struct work_struct *work);
static DECLARE_WORK(timer_update_work, timer_update_keys);
#ifdef CONFIG_SMP
unsigned int sysctl_timer_migration = 1;
DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
static void timers_update_migration(void)
{
if (sysctl_timer_migration && tick_nohz_active)
static_branch_enable(&timers_migration_enabled);
else
static_branch_disable(&timers_migration_enabled);
}
#else
static inline void timers_update_migration(void) { }
#endif /* !CONFIG_SMP */
static void timer_update_keys(struct work_struct *work)
{
mutex_lock(&timer_keys_mutex);
timers_update_migration();
static_branch_enable(&timers_nohz_active);
mutex_unlock(&timer_keys_mutex);
}
void timers_update_nohz(void)
{
schedule_work(&timer_update_work);
}
int timer_migration_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
mutex_lock(&timer_keys_mutex);
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write)
timers_update_migration();
mutex_unlock(&timer_keys_mutex);
return ret;
}
static inline bool is_timers_nohz_active(void)
{
return static_branch_unlikely(&timers_nohz_active);
}
#else
static inline bool is_timers_nohz_active(void) { return false; }
#endif /* NO_HZ_COMMON */
static unsigned long round_jiffies_common(unsigned long j, int cpu,
bool force_up)
{
int rem;
unsigned long original = j;
/*
* We don't want all cpus firing their timers at once hitting the
* same lock or cachelines, so we skew each extra cpu with an extra
* 3 jiffies. This 3 jiffies came originally from the mm/ code which
* already did this.
* The skew is done by adding 3*cpunr, then round, then subtract this
* extra offset again.
*/
j += cpu * 3;
rem = j % HZ;
/*
* If the target jiffie is just after a whole second (which can happen
* due to delays of the timer irq, long irq off times etc etc) then
* we should round down to the whole second, not up. Use 1/4th second
* as cutoff for this rounding as an extreme upper bound for this.
* But never round down if @force_up is set.
*/
if (rem < HZ/4 && !force_up) /* round down */
j = j - rem;
else /* round up */
j = j - rem + HZ;
/* now that we have rounded, subtract the extra skew again */
j -= cpu * 3;
/*
* Make sure j is still in the future. Otherwise return the
* unmodified value.
*/
return time_is_after_jiffies(j) ? j : original;
}
/**
* __round_jiffies - function to round jiffies to a full second
* @j: the time in (absolute) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
*
* __round_jiffies() rounds an absolute time in the future (in jiffies)
* up or down to (approximately) full seconds. This is useful for timers
* for which the exact time they fire does not matter too much, as long as
* they fire approximately every X seconds.
*
* By rounding these timers to whole seconds, all such timers will fire
* at the same time, rather than at various times spread out. The goal
* of this is to have the CPU wake up less, which saves power.
*
* The exact rounding is skewed for each processor to avoid all
* processors firing at the exact same time, which could lead
* to lock contention or spurious cache line bouncing.
*
* The return value is the rounded version of the @j parameter.
*/
unsigned long __round_jiffies(unsigned long j, int cpu)
{
return round_jiffies_common(j, cpu, false);
}
EXPORT_SYMBOL_GPL(__round_jiffies);
/**
* __round_jiffies_relative - function to round jiffies to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
*
* __round_jiffies_relative() rounds a time delta in the future (in jiffies)
* up or down to (approximately) full seconds. This is useful for timers
* for which the exact time they fire does not matter too much, as long as
* they fire approximately every X seconds.
*
* By rounding these timers to whole seconds, all such timers will fire
* at the same time, rather than at various times spread out. The goal
* of this is to have the CPU wake up less, which saves power.
*
* The exact rounding is skewed for each processor to avoid all
* processors firing at the exact same time, which could lead
* to lock contention or spurious cache line bouncing.
*
* The return value is the rounded version of the @j parameter.
*/
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
unsigned long j0 = jiffies;
/* Use j0 because jiffies might change while we run */
return round_jiffies_common(j + j0, cpu, false) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);
/**
* round_jiffies - function to round jiffies to a full second
* @j: the time in (absolute) jiffies that should be rounded
*
* round_jiffies() rounds an absolute time in the future (in jiffies)
* up or down to (approximately) full seconds. This is useful for timers
* for which the exact time they fire does not matter too much, as long as
* they fire approximately every X seconds.
*
* By rounding these timers to whole seconds, all such timers will fire
* at the same time, rather than at various times spread out. The goal
* of this is to have the CPU wake up less, which saves power.
*
* The return value is the rounded version of the @j parameter.
*/
unsigned long round_jiffies(unsigned long j)
{
return round_jiffies_common(j, raw_smp_processor_id(), false);
}
EXPORT_SYMBOL_GPL(round_jiffies);
/**
* round_jiffies_relative - function to round jiffies to a full second
* @j: the time in (relative) jiffies that should be rounded
*
* round_jiffies_relative() rounds a time delta in the future (in jiffies)
* up or down to (approximately) full seconds. This is useful for timers
* for which the exact time they fire does not matter too much, as long as
* they fire approximately every X seconds.
*
* By rounding these timers to whole seconds, all such timers will fire
* at the same time, rather than at various times spread out. The goal
* of this is to have the CPU wake up less, which saves power.
*
* The return value is the rounded version of the @j parameter.
*/
unsigned long round_jiffies_relative(unsigned long j)
{
return __round_jiffies_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);
/**
* __round_jiffies_up - function to round jiffies up to a full second
* @j: the time in (absolute) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
*
* This is the same as __round_jiffies() except that it will never
* round down. This is useful for timeouts for which the exact time
* of firing does not matter too much, as long as they don't fire too
* early.
*/
unsigned long __round_jiffies_up(unsigned long j, int cpu)
{
return round_jiffies_common(j, cpu, true);
}
EXPORT_SYMBOL_GPL(__round_jiffies_up);
/**
* __round_jiffies_up_relative - function to round jiffies up to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
*
* This is the same as __round_jiffies_relative() except that it will never
* round down. This is useful for timeouts for which the exact time
* of firing does not matter too much, as long as they don't fire too
* early.
*/
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
{
unsigned long j0 = jiffies;
/* Use j0 because jiffies might change while we run */
return round_jiffies_common(j + j0, cpu, true) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
/**
* round_jiffies_up - function to round jiffies up to a full second
* @j: the time in (absolute) jiffies that should be rounded
*
* This is the same as round_jiffies() except that it will never
* round down. This is useful for timeouts for which the exact time
* of firing does not matter too much, as long as they don't fire too
* early.
*/
unsigned long round_jiffies_up(unsigned long j)
{
return round_jiffies_common(j, raw_smp_processor_id(), true);
}
EXPORT_SYMBOL_GPL(round_jiffies_up);
/**
* round_jiffies_up_relative - function to round jiffies up to a full second
* @j: the time in (relative) jiffies that should be rounded
*
* This is the same as round_jiffies_relative() except that it will never
* round down. This is useful for timeouts for which the exact time
* of firing does not matter too much, as long as they don't fire too
* early.
*/
unsigned long round_jiffies_up_relative(unsigned long j)
{
return __round_jiffies_up_relative(j, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
static inline unsigned int timer_get_idx(struct timer_list *timer)
{
return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
}
static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
{
timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
idx << TIMER_ARRAYSHIFT;
}
/*
* Helper function to calculate the array index for a given expiry
* time.
*/
static inline unsigned calc_index(unsigned long expires, unsigned lvl,
unsigned long *bucket_expiry)
{
/*
* The timer wheel has to guarantee that a timer does not fire
* early. Early expiry can happen due to:
* - Timer is armed at the edge of a tick
* - Truncation of the expiry time in the outer wheel levels
*
* Round up with level granularity to prevent this.
*/
expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
*bucket_expiry = expires << LVL_SHIFT(lvl);
return LVL_OFFS(lvl) + (expires & LVL_MASK);
}
static int calc_wheel_index(unsigned long expires, unsigned long clk,
unsigned long *bucket_expiry)
{
unsigned long delta = expires - clk;
unsigned int idx;
if (delta < LVL_START(1)) {
idx = calc_index(expires, 0, bucket_expiry);
} else if (delta < LVL_START(2)) {
idx = calc_index(expires, 1, bucket_expiry);
} else if (delta < LVL_START(3)) {
idx = calc_index(expires, 2, bucket_expiry);
} else if (delta < LVL_START(4)) {
idx = calc_index(expires, 3, bucket_expiry);
} else if (delta < LVL_START(5)) {
idx = calc_index(expires, 4, bucket_expiry);
} else if (delta < LVL_START(6)) {
idx = calc_index(expires, 5, bucket_expiry);
} else if (delta < LVL_START(7)) {
idx = calc_index(expires, 6, bucket_expiry);
} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
idx = calc_index(expires, 7, bucket_expiry);
} else if ((long) delta < 0) { idx = clk & LVL_MASK;
*bucket_expiry = clk;
} else {
/*
* Force expire obscene large timeouts to expire at the
* capacity limit of the wheel.
*/
if (delta >= WHEEL_TIMEOUT_CUTOFF) expires = clk + WHEEL_TIMEOUT_MAX;
idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
}
return idx;
}
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
if (!is_timers_nohz_active())
return;
/*
* TODO: This wants some optimizing similar to the code below, but we
* will do that when we switch from push to pull for deferrable timers.
*/
if (timer->flags & TIMER_DEFERRABLE) {
if (tick_nohz_full_cpu(base->cpu))
wake_up_nohz_cpu(base->cpu);
return;
}
/*
* We might have to IPI the remote CPU if the base is idle and the
* timer is not deferrable. If the other CPU is on the way to idle
* then it can't set base->is_idle as we hold the base lock:
*/
if (base->is_idle) wake_up_nohz_cpu(base->cpu);
}
/*
* Enqueue the timer into the hash bucket, mark it pending in
* the bitmap, store the index in the timer flags then wake up
* the target CPU if needed.
*/
static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
unsigned int idx, unsigned long bucket_expiry)
{
hlist_add_head(&timer->entry, base->vectors + idx);
__set_bit(idx, base->pending_map);
timer_set_idx(timer, idx);
trace_timer_start(timer, timer->expires, timer->flags);
/*
* Check whether this is the new first expiring timer. The
* effective expiry time of the timer is required here
* (bucket_expiry) instead of timer->expires.
*/
if (time_before(bucket_expiry, base->next_expiry)) {
/*
* Set the next expiry time and kick the CPU so it
* can reevaluate the wheel:
*/
base->next_expiry = bucket_expiry;
base->timers_pending = true;
base->next_expiry_recalc = false;
trigger_dyntick_cpu(base, timer);
}
}
static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
unsigned long bucket_expiry;
unsigned int idx;
idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
enqueue_timer(base, timer, idx, bucket_expiry);
}
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
static const struct debug_obj_descr timer_debug_descr;
static void *timer_debug_hint(void *addr)
{
return ((struct timer_list *) addr)->function;
}
static bool timer_is_static_object(void *addr)
{
struct timer_list *timer = addr;
return (timer->entry.pprev == NULL &&
timer->entry.next == TIMER_ENTRY_STATIC);
}
/*
* fixup_init is called when:
* - an active object is initialized
*/
static bool timer_fixup_init(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
switch (state) {
case ODEBUG_STATE_ACTIVE:
del_timer_sync(timer);
debug_object_init(timer, &timer_debug_descr);
return true;
default:
return false;
}
}
/* Stub timer callback for improperly used timers. */
static void stub_timer(struct timer_list *unused)
{
WARN_ON(1);
}
/*
* fixup_activate is called when:
* - an active object is activated
* - an unknown non-static object is activated
*/
static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
timer_setup(timer, stub_timer, 0);
return true;
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
fallthrough;
default:
return false;
}
}
/*
* fixup_free is called when:
* - an active object is freed
*/
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
switch (state) {
case ODEBUG_STATE_ACTIVE:
del_timer_sync(timer);
debug_object_free(timer, &timer_debug_descr);
return true;
default:
return false;
}
}
/*
* fixup_assert_init is called when:
* - an untracked/uninit-ed object is found
*/
static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
{
struct timer_list *timer = addr;
switch (state) {
case ODEBUG_STATE_NOTAVAILABLE:
timer_setup(timer, stub_timer, 0);
return true;
default:
return false;
}
}
static const struct debug_obj_descr timer_debug_descr = {
.name = "timer_list",
.debug_hint = timer_debug_hint,
.is_static_object = timer_is_static_object,
.fixup_init = timer_fixup_init,
.fixup_activate = timer_fixup_activate,
.fixup_free = timer_fixup_free,
.fixup_assert_init = timer_fixup_assert_init,
};
static inline void debug_timer_init(struct timer_list *timer)
{
debug_object_init(timer, &timer_debug_descr);
}
static inline void debug_timer_activate(struct timer_list *timer)
{
debug_object_activate(timer, &timer_debug_descr);
}
static inline void debug_timer_deactivate(struct timer_list *timer)
{
debug_object_deactivate(timer, &timer_debug_descr);
}
static inline void debug_timer_assert_init(struct timer_list *timer)
{
debug_object_assert_init(timer, &timer_debug_descr);
}
static void do_init_timer(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags,
const char *name, struct lock_class_key *key);
void init_timer_on_stack_key(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_object_init_on_stack(timer, &timer_debug_descr);
do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
void destroy_timer_on_stack(struct timer_list *timer)
{
debug_object_free(timer, &timer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
#else
static inline void debug_timer_init(struct timer_list *timer) { }
static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
static inline void debug_timer_assert_init(struct timer_list *timer) { }
#endif
static inline void debug_init(struct timer_list *timer)
{
debug_timer_init(timer);
trace_timer_init(timer);
}
static inline void debug_deactivate(struct timer_list *timer)
{
debug_timer_deactivate(timer);
trace_timer_cancel(timer);
}
static inline void debug_assert_init(struct timer_list *timer)
{
debug_timer_assert_init(timer);
}
static void do_init_timer(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags,
const char *name, struct lock_class_key *key)
{
timer->entry.pprev = NULL;
timer->function = func;
if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
flags &= TIMER_INIT_FLAGS;
timer->flags = flags | raw_smp_processor_id();
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
/**
* init_timer_key - initialize a timer
* @timer: the timer to be initialized
* @func: timer callback function
* @flags: timer flags
* @name: name of the timer
* @key: lockdep class key of the fake lock used for tracking timer
* sync lock dependencies
*
* init_timer_key() must be done to a timer prior calling *any* of the
* other timer functions.
*/
void init_timer_key(struct timer_list *timer,
void (*func)(struct timer_list *), unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_init(timer);
do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
struct hlist_node *entry = &timer->entry;
debug_deactivate(timer);
__hlist_del(entry); if (clear_pending) entry->pprev = NULL; entry->next = LIST_POISON2;
}
static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
bool clear_pending)
{
unsigned idx = timer_get_idx(timer);
if (!timer_pending(timer))
return 0;
if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { __clear_bit(idx, base->pending_map);
base->next_expiry_recalc = true;
}
detach_timer(timer, clear_pending);
return 1;
}
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
/*
* If the timer is deferrable and NO_HZ_COMMON is set then we need
* to use the deferrable base.
*/
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
return base;
}
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
/*
* If the timer is deferrable and NO_HZ_COMMON is set then we need
* to use the deferrable base.
*/
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = this_cpu_ptr(&timer_bases[BASE_DEF]);
return base;
}
static inline struct timer_base *get_timer_base(u32 tflags)
{
return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}
static inline struct timer_base *
get_target_base(struct timer_base *base, unsigned tflags)
{
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) &&
!(tflags & TIMER_PINNED))
return get_timer_cpu_base(tflags, get_nohz_timer_target());
#endif
return get_timer_this_cpu_base(tflags);
}
static inline void forward_timer_base(struct timer_base *base)
{
unsigned long jnow = READ_ONCE(jiffies);
/*
* No need to forward if we are close enough below jiffies.
* Also while executing timers, base->clk is 1 offset ahead
* of jiffies to avoid endless requeuing to current jiffies.
*/
if ((long)(jnow - base->clk) < 1)
return;
/*
* If the next expiry value is > jiffies, then we fast forward to
* jiffies otherwise we forward to the next expiry value.
*/
if (time_after(base->next_expiry, jnow)) { base->clk = jnow;
} else {
if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
return;
base->clk = base->next_expiry;
}
}
/*
* We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
* that all timers which are tied to this base are locked, and the base itself
* is locked too.
*
* So __run_timers/migrate_timers can safely modify all timers which could
* be found in the base->vectors array.
*
* When a timer is migrating then the TIMER_MIGRATING flag is set and we need
* to wait until the migration is done.
*/
static struct timer_base *lock_timer_base(struct timer_list *timer,
unsigned long *flags)
__acquires(timer->base->lock)
{
for (;;) {
struct timer_base *base;
u32 tf;
/*
* We need to use READ_ONCE() here, otherwise the compiler
* might re-read @tf between the check for TIMER_MIGRATING
* and spin_lock().
*/
tf = READ_ONCE(timer->flags);
if (!(tf & TIMER_MIGRATING)) {
base = get_timer_base(tf);
raw_spin_lock_irqsave(&base->lock, *flags);
if (timer->flags == tf)
return base; raw_spin_unlock_irqrestore(&base->lock, *flags);
}
cpu_relax();
}
}
#define MOD_TIMER_PENDING_ONLY 0x01
#define MOD_TIMER_REDUCE 0x02
#define MOD_TIMER_NOTPENDING 0x04
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
unsigned long clk = 0, flags, bucket_expiry;
struct timer_base *base, *new_base;
unsigned int idx = UINT_MAX;
int ret = 0;
BUG_ON(!timer->function);
/*
* This is a common optimization triggered by the networking code - if
* the timer is re-modified to have the same timeout or ends up in the
* same array bucket then just return:
*/
if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
/*
* The downside of this optimization is that it can result in
* larger granularity than you would get from adding a new
* timer with this expiry.
*/
long diff = timer->expires - expires;
if (!diff)
return 1;
if (options & MOD_TIMER_REDUCE && diff <= 0)
return 1;
/*
* We lock timer base and calculate the bucket index right
* here. If the timer ends up in the same bucket, then we
* just update the expiry time and avoid the whole
* dequeue/enqueue dance.
*/
base = lock_timer_base(timer, &flags);
forward_timer_base(base);
if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
time_before_eq(timer->expires, expires)) {
ret = 1;
goto out_unlock;
}
clk = base->clk;
idx = calc_wheel_index(expires, clk, &bucket_expiry);
/*
* Retrieve and compare the array index of the pending
* timer. If it matches set the expiry to the new value so a
* subsequent call will exit in the expires check above.
*/
if (idx == timer_get_idx(timer)) {
if (!(options & MOD_TIMER_REDUCE))
timer->expires = expires;
else if (time_after(timer->expires, expires))
timer->expires = expires;
ret = 1;
goto out_unlock;
}
} else {
base = lock_timer_base(timer, &flags);
forward_timer_base(base);
}
ret = detach_if_pending(timer, base, false);
if (!ret && (options & MOD_TIMER_PENDING_ONLY))
goto out_unlock;
new_base = get_target_base(base, timer->flags);
if (base != new_base) {
/*
* We are trying to schedule the timer on the new base.
* However we can't change timer's base while it is running,
* otherwise del_timer_sync() can't detect that the timer's
* handler yet has not finished. This also guarantees that the
* timer is serialized wrt itself.
*/
if (likely(base->running_timer != timer)) {
/* See the comment in lock_timer_base() */
timer->flags |= TIMER_MIGRATING;
raw_spin_unlock(&base->lock);
base = new_base;
raw_spin_lock(&base->lock);
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | base->cpu);
forward_timer_base(base);
}
}
debug_timer_activate(timer);
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
* between calculating 'idx' and possibly switching the base, only
* enqueue_timer() is required. Otherwise we need to (re)calculate
* the wheel index via internal_add_timer().
*/
if (idx != UINT_MAX && clk == base->clk) enqueue_timer(base, timer, idx, bucket_expiry);
else
internal_add_timer(base, timer);
out_unlock:
raw_spin_unlock_irqrestore(&base->lock, flags); return ret;
}
/**
* mod_timer_pending - modify a pending timer's timeout
* @timer: the pending timer to be modified
* @expires: new timeout in jiffies
*
* mod_timer_pending() is the same for pending timers as mod_timer(),
* but will not re-activate and modify already deleted timers.
*
* It is useful for unserialized use of timers.
*/
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
}
EXPORT_SYMBOL(mod_timer_pending);
/**
* mod_timer - modify a timer's timeout
* @timer: the timer to be modified
* @expires: new timeout in jiffies
*
* mod_timer() is a more efficient way to update the expire field of an
* active timer (if the timer is inactive it will be activated)
*
* mod_timer(timer, expires) is equivalent to:
*
* del_timer(timer); timer->expires = expires; add_timer(timer);
*
* Note that if there are multiple unserialized concurrent users of the
* same timer, then mod_timer() is the only safe way to modify the timeout,
* since add_timer() cannot modify an already running timer.
*
* The function returns whether it has modified a pending timer or not.
* (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
* active timer returns 1.)
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
return __mod_timer(timer, expires, 0);
}
EXPORT_SYMBOL(mod_timer);
/**
* timer_reduce - Modify a timer's timeout if it would reduce the timeout
* @timer: The timer to be modified
* @expires: New timeout in jiffies
*
* timer_reduce() is very similar to mod_timer(), except that it will only
* modify a running timer if that would reduce the expiration time (it will
* start a timer that isn't running).
*/
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
}
EXPORT_SYMBOL(timer_reduce);
/**
* add_timer - start a timer
* @timer: the timer to be added
*
* The kernel will do a ->function(@timer) callback from the
* timer interrupt at the ->expires point in the future. The
* current time is 'jiffies'.
*
* The timer's ->expires, ->function fields must be set prior calling this
* function.
*
* Timers with an ->expires field in the past will be executed in the next
* timer tick.
*/
void add_timer(struct timer_list *timer)
{
BUG_ON(timer_pending(timer)); __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);}
EXPORT_SYMBOL(add_timer);
/**
* add_timer_on - start a timer on a particular CPU
* @timer: the timer to be added
* @cpu: the CPU to start it on
*
* This is not very scalable on SMP. Double adds are not possible.
*/
void add_timer_on(struct timer_list *timer, int cpu)
{
struct timer_base *new_base, *base;
unsigned long flags;
BUG_ON(timer_pending(timer) || !timer->function);
new_base = get_timer_cpu_base(timer->flags, cpu);
/*
* If @timer was on a different CPU, it should be migrated with the
* old base locked to prevent other operations proceeding with the
* wrong base locked. See lock_timer_base().
*/
base = lock_timer_base(timer, &flags);
if (base != new_base) {
timer->flags |= TIMER_MIGRATING;
raw_spin_unlock(&base->lock);
base = new_base;
raw_spin_lock(&base->lock);
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | cpu);
}
forward_timer_base(base);
debug_timer_activate(timer);
internal_add_timer(base, timer);
raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);
/**
* del_timer - deactivate a timer.
* @timer: the timer to be deactivated
*
* del_timer() deactivates a timer - this works on both active and inactive
* timers.
*
* The function returns whether it has deactivated a pending timer or not.
* (ie. del_timer() of an inactive timer returns 0, del_timer() of an
* active timer returns 1.)
*/
int del_timer(struct timer_list *timer)
{
struct timer_base *base;
unsigned long flags;
int ret = 0;
debug_assert_init(timer);
if (timer_pending(timer)) {
base = lock_timer_base(timer, &flags);
ret = detach_if_pending(timer, base, true);
raw_spin_unlock_irqrestore(&base->lock, flags);
}
return ret;
}
EXPORT_SYMBOL(del_timer);
/**
* try_to_del_timer_sync - Try to deactivate a timer
* @timer: timer to delete
*
* This function tries to deactivate a timer. Upon successful (ret >= 0)
* exit the timer is not queued and the handler is not running on any CPU.
*/
int try_to_del_timer_sync(struct timer_list *timer)
{
struct timer_base *base;
unsigned long flags;
int ret = -1;
debug_assert_init(timer);
base = lock_timer_base(timer, &flags);
if (base->running_timer != timer)
ret = detach_if_pending(timer, base, true); raw_spin_unlock_irqrestore(&base->lock, flags);
return ret;
}
EXPORT_SYMBOL(try_to_del_timer_sync);
#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
spin_lock_init(&base->expiry_lock);
}
static inline void timer_base_lock_expiry(struct timer_base *base)
{
spin_lock(&base->expiry_lock);
}
static inline void timer_base_unlock_expiry(struct timer_base *base)
{
spin_unlock(&base->expiry_lock);
}
/*
* The counterpart to del_timer_wait_running().
*
* If there is a waiter for base->expiry_lock, then it was waiting for the
* timer callback to finish. Drop expiry_lock and reacquire it. That allows
* the waiter to acquire the lock and make progress.
*/
static void timer_sync_wait_running(struct timer_base *base)
{
if (atomic_read(&base->timer_waiters)) {
raw_spin_unlock_irq(&base->lock);
spin_unlock(&base->expiry_lock);
spin_lock(&base->expiry_lock);
raw_spin_lock_irq(&base->lock);
}
}
/*
* This function is called on PREEMPT_RT kernels when the fast path
* deletion of a timer failed because the timer callback function was
* running.
*
* This prevents priority inversion, if the softirq thread on a remote CPU
* got preempted, and it prevents a life lock when the task which tries to
* delete a timer preempted the softirq thread running the timer callback
* function.
*/
static void del_timer_wait_running(struct timer_list *timer)
{
u32 tf;
tf = READ_ONCE(timer->flags);
if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
struct timer_base *base = get_timer_base(tf);
/*
* Mark the base as contended and grab the expiry lock,
* which is held by the softirq across the timer
* callback. Drop the lock immediately so the softirq can
* expire the next timer. In theory the timer could already
* be running again, but that's more than unlikely and just
* causes another wait loop.
*/
atomic_inc(&base->timer_waiters);
spin_lock_bh(&base->expiry_lock);
atomic_dec(&base->timer_waiters);
spin_unlock_bh(&base->expiry_lock);
}
}
#else
static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
static inline void timer_base_lock_expiry(struct timer_base *base) { }
static inline void timer_base_unlock_expiry(struct timer_base *base) { }
static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
*
* This function only differs from del_timer() on SMP: besides deactivating
* the timer it also makes sure the handler has finished executing on other
* CPUs.
*
* Synchronization rules: Callers must prevent restarting of the timer,
* otherwise this function is meaningless. It must not be called from
* interrupt contexts unless the timer is an irqsafe one. The caller must
* not hold locks which would prevent completion of the timer's
* handler. The timer's handler must not call add_timer_on(). Upon exit the
* timer is not queued and the handler is not running on any CPU.
*
* Note: For !irqsafe timers, you must not hold locks that are held in
* interrupt context while calling this function. Even if the lock has
* nothing to do with the timer in question. Here's why::
*
* CPU0 CPU1
* ---- ----
* <SOFTIRQ>
* call_timer_fn();
* base->running_timer = mytimer;
* spin_lock_irq(somelock);
* <IRQ>
* spin_lock(somelock);
* del_timer_sync(mytimer);
* while (base->running_timer == mytimer);
*
* Now del_timer_sync() will never return and never release somelock.
* The interrupt on the other CPU is waiting to grab somelock but
* it has interrupted the softirq that CPU0 is waiting to finish.
*
* The function returns whether it has deactivated a pending timer or not.
*/
int del_timer_sync(struct timer_list *timer)
{
int ret;
#ifdef CONFIG_LOCKDEP
unsigned long flags;
/*
* If lockdep gives a backtrace here, please reference
* the synchronization rules above.
*/
local_irq_save(flags);
lock_map_acquire(&timer->lockdep_map);
lock_map_release(&timer->lockdep_map);
local_irq_restore(flags);
#endif
/*
* don't use it in hardirq context, because it
* could lead to deadlock.
*/
WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
/*
* Must be able to sleep on PREEMPT_RT because of the slowpath in
* del_timer_wait_running().
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
lockdep_assert_preemption_enabled();
do {
ret = try_to_del_timer_sync(timer);
if (unlikely(ret < 0)) {
del_timer_wait_running(timer);
cpu_relax();
}
} while (ret < 0);
return ret;
}
EXPORT_SYMBOL(del_timer_sync);
#endif
static void call_timer_fn(struct timer_list *timer,
void (*fn)(struct timer_list *),
unsigned long baseclk)
{
int count = preempt_count();
#ifdef CONFIG_LOCKDEP
/*
* It is permissible to free the timer from inside the
* function that is called from it, this we need to take into
* account for lockdep too. To avoid bogus "held lock freed"
* warnings as well as problems when looking into
* timer->lockdep_map, make a copy and use that here.
*/
struct lockdep_map lockdep_map;
lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
/*
* Couple the lock chain with the lock chain at
* del_timer_sync() by acquiring the lock_map around the fn()
* call here and in del_timer_sync().
*/
lock_map_acquire(&lockdep_map);
trace_timer_expire_entry(timer, baseclk);
fn(timer);
trace_timer_expire_exit(timer);
lock_map_release(&lockdep_map);
if (count != preempt_count()) {
WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
fn, count, preempt_count());
/*
* Restore the preempt count. That gives us a decent
* chance to survive and extract information. If the
* callback kept a lock held, bad luck, but not worse
* than the BUG() we had.
*/
preempt_count_set(count);
}
}
static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
/*
* This value is required only for tracing. base->clk was
* incremented directly before expire_timers was called. But expiry
* is related to the old base->clk value.
*/
unsigned long baseclk = base->clk - 1;
while (!hlist_empty(head)) {
struct timer_list *timer;
void (*fn)(struct timer_list *);
timer = hlist_entry(head->first, struct timer_list, entry);
base->running_timer = timer;
detach_timer(timer, true);
fn = timer->function;
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn, baseclk);
raw_spin_lock(&base->lock);
base->running_timer = NULL;
} else {
raw_spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, baseclk);
raw_spin_lock_irq(&base->lock);
base->running_timer = NULL;
timer_sync_wait_running(base);
}
}
}
static int collect_expired_timers(struct timer_base *base,
struct hlist_head *heads)
{
unsigned long clk = base->clk = base->next_expiry;
struct hlist_head *vec;
int i, levels = 0;
unsigned int idx;
for (i = 0; i < LVL_DEPTH; i++) {
idx = (clk & LVL_MASK) + i * LVL_SIZE;
if (__test_and_clear_bit(idx, base->pending_map)) {
vec = base->vectors + idx;
hlist_move_list(vec, heads++);
levels++;
}
/* Is it time to look at the next level? */
if (clk & LVL_CLK_MASK)
break;
/* Shift clock for the next level granularity */
clk >>= LVL_CLK_SHIFT;
}
return levels;
}
/*
* Find the next pending bucket of a level. Search from level start (@offset)
* + @clk upwards and if nothing there, search from start of the level
* (@offset) up to @offset + clk.
*/
static int next_pending_bucket(struct timer_base *base, unsigned offset,
unsigned clk)
{
unsigned pos, start = offset + clk;
unsigned end = offset + LVL_SIZE;
pos = find_next_bit(base->pending_map, end, start);
if (pos < end)
return pos - start;
pos = find_next_bit(base->pending_map, start, offset);
return pos < start ? pos + LVL_SIZE - start : -1;
}
/*
* Search the first expiring timer in the various clock levels. Caller must
* hold base->lock.
*/
static unsigned long __next_timer_interrupt(struct timer_base *base)
{
unsigned long clk, next, adj;
unsigned lvl, offset = 0;
next = base->clk + NEXT_TIMER_MAX_DELTA;
clk = base->clk;
for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
unsigned long lvl_clk = clk & LVL_CLK_MASK;
if (pos >= 0) {
unsigned long tmp = clk + (unsigned long) pos;
tmp <<= LVL_SHIFT(lvl);
if (time_before(tmp, next))
next = tmp;
/*
* If the next expiration happens before we reach
* the next level, no need to check further.
*/
if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
break;
}
/*
* Clock for the next level. If the current level clock lower
* bits are zero, we look at the next level as is. If not we
* need to advance it by one because that's going to be the
* next expiring bucket in that level. base->clk is the next
* expiring jiffie. So in case of:
*
* LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
* 0 0 0 0 0 0
*
* we have to look at all levels @index 0. With
*
* LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
* 0 0 0 0 0 2
*
* LVL0 has the next expiring bucket @index 2. The upper
* levels have the next expiring bucket @index 1.
*
* In case that the propagation wraps the next level the same
* rules apply:
*
* LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
* 0 0 0 0 F 2
*
* So after looking at LVL0 we get:
*
* LVL5 LVL4 LVL3 LVL2 LVL1
* 0 0 0 1 0
*
* So no propagation from LVL1 to LVL2 because that happened
* with the add already, but then we need to propagate further
* from LVL2 to LVL3.
*
* So the simple check whether the lower bits of the current
* level are 0 or not is sufficient for all cases.
*/
adj = lvl_clk ? 1 : 0;
clk >>= LVL_CLK_SHIFT;
clk += adj;
}
base->next_expiry_recalc = false;
base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
return next;
}
#ifdef CONFIG_NO_HZ_COMMON
/*
* Check, if the next hrtimer event is before the next timer wheel
* event:
*/
static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
u64 nextevt = hrtimer_get_next_event();
/*
* If high resolution timers are enabled
* hrtimer_get_next_event() returns KTIME_MAX.
*/
if (expires <= nextevt)
return expires;
/*
* If the next timer is already expired, return the tick base
* time so the tick is fired immediately.
*/
if (nextevt <= basem)
return basem;
/*
* Round up to the next jiffie. High resolution timers are
* off, so the hrtimers are expired in the tick and we need to
* make sure that this tick really expires the timer to avoid
* a ping pong of the nohz stop code.
*
* Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
*/
return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
}
/**
* get_next_timer_interrupt - return the time (clock mono) of the next timer
* @basej: base time jiffies
* @basem: base time clock monotonic
*
* Returns the tick aligned clock monotonic time of the next pending
* timer or KTIME_MAX if no timer is pending.
*/
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
u64 expires = KTIME_MAX;
unsigned long nextevt;
/*
* Pretend that there is no timer pending if the cpu is offline.
* Possible pending timers will be migrated later to an active cpu.
*/
if (cpu_is_offline(smp_processor_id()))
return expires;
raw_spin_lock(&base->lock);
if (base->next_expiry_recalc)
base->next_expiry = __next_timer_interrupt(base);
nextevt = base->next_expiry;
/*
* We have a fresh next event. Check whether we can forward the
* base. We can only do that when @basej is past base->clk
* otherwise we might rewind base->clk.
*/
if (time_after(basej, base->clk)) {
if (time_after(nextevt, basej))
base->clk = basej;
else if (time_after(nextevt, base->clk))
base->clk = nextevt;
}
if (time_before_eq(nextevt, basej)) {
expires = basem;
base->is_idle = false;
} else {
if (base->timers_pending)
expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
* If we expect to sleep more than a tick, mark the base idle.
* Also the tick is stopped so any added timer must forward
* the base clk itself to keep granularity small. This idle
* logic is only maintained for the BASE_STD base, deferrable
* timers may still see large granularity skew (by design).
*/
if ((expires - basem) > TICK_NSEC)
base->is_idle = true;
}
raw_spin_unlock(&base->lock);
return cmp_next_hrtimer_event(basem, expires);
}
/**
* timer_clear_idle - Clear the idle state of the timer base
*
* Called with interrupts disabled
*/
void timer_clear_idle(void)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
/*
* We do this unlocked. The worst outcome is a remote enqueue sending
* a pointless IPI, but taking the lock would just make the window for
* sending the IPI a few instructions smaller for the cost of taking
* the lock in the exit from idle path.
*/
base->is_idle = false;
}
#endif
/**
* __run_timers - run all expired timers (if any) on this CPU.
* @base: the timer vector to be processed.
*/
static inline void __run_timers(struct timer_base *base)
{
struct hlist_head heads[LVL_DEPTH];
int levels;
if (time_before(jiffies, base->next_expiry))
return;
timer_base_lock_expiry(base);
raw_spin_lock_irq(&base->lock);
while (time_after_eq(jiffies, base->clk) &&
time_after_eq(jiffies, base->next_expiry)) {
levels = collect_expired_timers(base, heads);
/*
* The two possible reasons for not finding any expired
* timer at this clk are that all matching timers have been
* dequeued or no timer has been queued since
* base::next_expiry was set to base::clk +
* NEXT_TIMER_MAX_DELTA.
*/
WARN_ON_ONCE(!levels && !base->next_expiry_recalc
&& base->timers_pending);
base->clk++;
base->next_expiry = __next_timer_interrupt(base);
while (levels--)
expire_timers(base, heads + levels);
}
raw_spin_unlock_irq(&base->lock);
timer_base_unlock_expiry(base);
}
/*
* This function runs timers and the timer-tq in bottom half context.
*/
static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
__run_timers(base);
if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
}
/*
* Called by the local, per-CPU timer interrupt on SMP.
*/
static void run_local_timers(void)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
hrtimer_run_queues();
/* Raise the softirq only if required. */
if (time_before(jiffies, base->next_expiry)) {
if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
return;
/* CPU is awake, so check the deferrable base. */
base++;
if (time_before(jiffies, base->next_expiry))
return;
}
raise_softirq(TIMER_SOFTIRQ);
}
/*
* Called from the timer interrupt handler to charge one tick to the current
* process. user_tick is 1 if the tick is user time, 0 for system.
*/
void update_process_times(int user_tick)
{
struct task_struct *p = current;
PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_tick();
#endif
scheduler_tick();
if (IS_ENABLED(CONFIG_POSIX_TIMERS))
run_posix_cpu_timers();
}
/*
* Since schedule_timeout()'s timer is defined on the stack, it must store
* the target task on the stack as well.
*/
struct process_timer {
struct timer_list timer;
struct task_struct *task;
};
static void process_timeout(struct timer_list *t)
{
struct process_timer *timeout = from_timer(timeout, t, timer);
wake_up_process(timeout->task);
}
/**
* schedule_timeout - sleep until timeout
* @timeout: timeout value in jiffies
*
* Make the current task sleep until @timeout jiffies have elapsed.
* The function behavior depends on the current task state
* (see also set_current_state() description):
*
* %TASK_RUNNING - the scheduler is called, but the task does not sleep
* at all. That happens because sched_submit_work() does nothing for
* tasks in %TASK_RUNNING state.
*
* %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
* pass before the routine returns unless the current task is explicitly
* woken up, (e.g. by wake_up_process()).
*
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is
* delivered to the current task or the current task is explicitly woken
* up.
*
* The current task state is guaranteed to be %TASK_RUNNING when this
* routine returns.
*
* Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
* the CPU away without a bound on the timeout. In this case the return
* value will be %MAX_SCHEDULE_TIMEOUT.
*
* Returns 0 when the timer has expired otherwise the remaining time in
* jiffies will be returned. In all cases the return value is guaranteed
* to be non-negative.
*/
signed long __sched schedule_timeout(signed long timeout)
{
struct process_timer timer;
unsigned long expire;
switch (timeout)
{
case MAX_SCHEDULE_TIMEOUT:
/*
* These two special cases are useful to be comfortable
* in the caller. Nothing more. We could take
* MAX_SCHEDULE_TIMEOUT from one of the negative value
* but I' d like to return a valid offset (>=0) to allow
* the caller to do everything it want with the retval.
*/
schedule();
goto out;
default:
/*
* Another bit of PARANOID. Note that the retval will be
* 0 since no piece of kernel is supposed to do a check
* for a negative retval of schedule_timeout() (since it
* should never happens anyway). You just have the printk()
* that will tell you if something is gone wrong and where.
*/
if (timeout < 0) { printk(KERN_ERR "schedule_timeout: wrong timeout "
"value %lx\n", timeout);
dump_stack();
__set_current_state(TASK_RUNNING);
goto out;
}
}
expire = timeout + jiffies;
timer.task = current;
timer_setup_on_stack(&timer.timer, process_timeout, 0);
__mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
schedule();
del_singleshot_timer_sync(&timer.timer);
/* Remove the timer from the object tracker */
destroy_timer_on_stack(&timer.timer);
timeout = expire - jiffies;
out:
return timeout < 0 ? 0 : timeout;}
EXPORT_SYMBOL(schedule_timeout);
/*
* We can use __set_current_state() here because schedule_timeout() calls
* schedule() unconditionally.
*/
signed long __sched schedule_timeout_interruptible(signed long timeout)
{
__set_current_state(TASK_INTERRUPTIBLE);
return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_interruptible);
signed long __sched schedule_timeout_killable(signed long timeout)
{
__set_current_state(TASK_KILLABLE);
return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_killable);
signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
__set_current_state(TASK_UNINTERRUPTIBLE);
return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
/*
* Like schedule_timeout_uninterruptible(), except this task will not contribute
* to load average.
*/
signed long __sched schedule_timeout_idle(signed long timeout)
{
__set_current_state(TASK_IDLE);
return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_idle);
#ifdef CONFIG_HOTPLUG_CPU
static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
struct timer_list *timer;
int cpu = new_base->cpu;
while (!hlist_empty(head)) {
timer = hlist_entry(head->first, struct timer_list, entry);
detach_timer(timer, false);
timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
internal_add_timer(new_base, timer);
}
}
int timers_prepare_cpu(unsigned int cpu)
{
struct timer_base *base;
int b;
for (b = 0; b < NR_BASES; b++) {
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
base->timers_pending = false;
base->is_idle = false;
}
return 0;
}
int timers_dead_cpu(unsigned int cpu)
{
struct timer_base *old_base;
struct timer_base *new_base;
int b, i;
BUG_ON(cpu_online(cpu));
for (b = 0; b < NR_BASES; b++) {
old_base = per_cpu_ptr(&timer_bases[b], cpu);
new_base = get_cpu_ptr(&timer_bases[b]);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
*/
raw_spin_lock_irq(&new_base->lock);
raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
/*
* The current CPUs base clock might be stale. Update it
* before moving the timers over.
*/
forward_timer_base(new_base);
BUG_ON(old_base->running_timer);
for (i = 0; i < WHEEL_SIZE; i++)
migrate_timer_list(new_base, old_base->vectors + i);
raw_spin_unlock(&old_base->lock);
raw_spin_unlock_irq(&new_base->lock);
put_cpu_ptr(&timer_bases);
}
return 0;
}
#endif /* CONFIG_HOTPLUG_CPU */
static void __init init_timer_cpu(int cpu)
{
struct timer_base *base;
int i;
for (i = 0; i < NR_BASES; i++) {
base = per_cpu_ptr(&timer_bases[i], cpu);
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
timer_base_init_expiry_lock(base);
}
}
static void __init init_timer_cpus(void)
{
int cpu;
for_each_possible_cpu(cpu)
init_timer_cpu(cpu);
}
void __init init_timers(void)
{
init_timer_cpus();
posix_cputimers_init_work();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
/**
* msleep - sleep safely even with waitqueue interruptions
* @msecs: Time in milliseconds to sleep for
*/
void msleep(unsigned int msecs)
{
unsigned long timeout = msecs_to_jiffies(msecs) + 1;
while (timeout)
timeout = schedule_timeout_uninterruptible(timeout);
}
EXPORT_SYMBOL(msleep);
/**
* msleep_interruptible - sleep waiting for signals
* @msecs: Time in milliseconds to sleep for
*/
unsigned long msleep_interruptible(unsigned int msecs)
{
unsigned long timeout = msecs_to_jiffies(msecs) + 1;
while (timeout && !signal_pending(current))
timeout = schedule_timeout_interruptible(timeout);
return jiffies_to_msecs(timeout);
}
EXPORT_SYMBOL(msleep_interruptible);
/**
* usleep_range_state - Sleep for an approximate time in a given state
* @min: Minimum time in usecs to sleep
* @max: Maximum time in usecs to sleep
* @state: State of the current task that will be while sleeping
*
* In non-atomic context where the exact wakeup time is flexible, use
* usleep_range_state() instead of udelay(). The sleep improves responsiveness
* by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
* power usage by allowing hrtimers to take advantage of an already-
* scheduled interrupt instead of scheduling a new one just for this sleep.
*/
void __sched usleep_range_state(unsigned long min, unsigned long max,
unsigned int state)
{
ktime_t exp = ktime_add_us(ktime_get(), min);
u64 delta = (u64)(max - min) * NSEC_PER_USEC;
for (;;) {
__set_current_state(state);
/* Do not return before the requested sleep time has elapsed */
if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
break;
}
}
EXPORT_SYMBOL(usleep_range_state);
// SPDX-License-Identifier: GPL-2.0-only
/*
* lib/bitmap.c
* Helper functions for bitmap.h.
*/
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/ctype.h>
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/thread_info.h>
#include <linux/uaccess.h>
#include <asm/page.h>
#include "kstrtox.h"
/**
* DOC: bitmap introduction
*
* bitmaps provide an array of bits, implemented using an
* array of unsigned longs. The number of valid bits in a
* given bitmap does _not_ need to be an exact multiple of
* BITS_PER_LONG.
*
* The possible unused bits in the last, partially used word
* of a bitmap are 'don't care'. The implementation makes
* no particular effort to keep them zero. It ensures that
* their value will not affect the results of any operation.
* The bitmap operations that return Boolean (bitmap_empty,
* for example) or scalar (bitmap_weight, for example) results
* carefully filter out these unused bits from impacting their
* results.
*
* The byte ordering of bitmaps is more natural on little
* endian architectures. See the big-endian headers
* include/asm-ppc64/bitops.h and include/asm-s390/bitops.h
* for the best explanations of this ordering.
*/
int __bitmap_equal(const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits)
{
unsigned int k, lim = bits/BITS_PER_LONG;
for (k = 0; k < lim; ++k)
if (bitmap1[k] != bitmap2[k])
return 0;
if (bits % BITS_PER_LONG)
if ((bitmap1[k] ^ bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits))
return 0;
return 1;
}
EXPORT_SYMBOL(__bitmap_equal);
bool __bitmap_or_equal(const unsigned long *bitmap1,
const unsigned long *bitmap2,
const unsigned long *bitmap3,
unsigned int bits)
{
unsigned int k, lim = bits / BITS_PER_LONG;
unsigned long tmp;
for (k = 0; k < lim; ++k) {
if ((bitmap1[k] | bitmap2[k]) != bitmap3[k])
return false;
}
if (!(bits % BITS_PER_LONG))
return true;
tmp = (bitmap1[k] | bitmap2[k]) ^ bitmap3[k];
return (tmp & BITMAP_LAST_WORD_MASK(bits)) == 0;
}
void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits)
{
unsigned int k, lim = BITS_TO_LONGS(bits);
for (k = 0; k < lim; ++k)
dst[k] = ~src[k];
}
EXPORT_SYMBOL(__bitmap_complement);
/**
* __bitmap_shift_right - logical right shift of the bits in a bitmap
* @dst : destination bitmap
* @src : source bitmap
* @shift : shift by this many bits
* @nbits : bitmap size, in bits
*
* Shifting right (dividing) means moving bits in the MS -> LS bit
* direction. Zeros are fed into the vacated MS positions and the
* LS bits shifted off the bottom are lost.
*/
void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
unsigned shift, unsigned nbits)
{
unsigned k, lim = BITS_TO_LONGS(nbits);
unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
unsigned long mask = BITMAP_LAST_WORD_MASK(nbits);
for (k = 0; off + k < lim; ++k) {
unsigned long upper, lower;
/*
* If shift is not word aligned, take lower rem bits of
* word above and make them the top rem bits of result.
*/
if (!rem || off + k + 1 >= lim)
upper = 0;
else {
upper = src[off + k + 1];
if (off + k + 1 == lim - 1)
upper &= mask;
upper <<= (BITS_PER_LONG - rem);
}
lower = src[off + k];
if (off + k == lim - 1)
lower &= mask;
lower >>= rem;
dst[k] = lower | upper;
}
if (off)
memset(&dst[lim - off], 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_right);
/**
* __bitmap_shift_left - logical left shift of the bits in a bitmap
* @dst : destination bitmap
* @src : source bitmap
* @shift : shift by this many bits
* @nbits : bitmap size, in bits
*
* Shifting left (multiplying) means moving bits in the LS -> MS
* direction. Zeros are fed into the vacated LS bit positions
* and those MS bits shifted off the top are lost.
*/
void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
unsigned int shift, unsigned int nbits)
{
int k;
unsigned int lim = BITS_TO_LONGS(nbits);
unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
for (k = lim - off - 1; k >= 0; --k) {
unsigned long upper, lower;
/*
* If shift is not word aligned, take upper rem bits of
* word below and make them the bottom rem bits of result.
*/
if (rem && k > 0)
lower = src[k - 1] >> (BITS_PER_LONG - rem);
else
lower = 0;
upper = src[k] << rem;
dst[k + off] = lower | upper;
}
if (off)
memset(dst, 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_left);
/**
* bitmap_cut() - remove bit region from bitmap and right shift remaining bits
* @dst: destination bitmap, might overlap with src
* @src: source bitmap
* @first: start bit of region to be removed
* @cut: number of bits to remove
* @nbits: bitmap size, in bits
*
* Set the n-th bit of @dst iff the n-th bit of @src is set and
* n is less than @first, or the m-th bit of @src is set for any
* m such that @first <= n < nbits, and m = n + @cut.
*
* In pictures, example for a big-endian 32-bit architecture:
*
* The @src bitmap is::
*
* 31 63
* | |
* 10000000 11000001 11110010 00010101 10000000 11000001 01110010 00010101
* | | | |
* 16 14 0 32
*
* if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is::
*
* 31 63
* | |
* 10110000 00011000 00110010 00010101 00010000 00011000 00101110 01000010
* | | |
* 14 (bit 17 0 32
* from @src)
*
* Note that @dst and @src might overlap partially or entirely.
*
* This is implemented in the obvious way, with a shift and carry
* step for each moved bit. Optimisation is left as an exercise
* for the compiler.
*/
void bitmap_cut(unsigned long *dst, const unsigned long *src,
unsigned int first, unsigned int cut, unsigned int nbits)
{
unsigned int len = BITS_TO_LONGS(nbits);
unsigned long keep = 0, carry;
int i;
if (first % BITS_PER_LONG) {
keep = src[first / BITS_PER_LONG] &
(~0UL >> (BITS_PER_LONG - first % BITS_PER_LONG));
}
memmove(dst, src, len * sizeof(*dst));
while (cut--) {
for (i = first / BITS_PER_LONG; i < len; i++) {
if (i < len - 1)
carry = dst[i + 1] & 1UL;
else
carry = 0;
dst[i] = (dst[i] >> 1) | (carry << (BITS_PER_LONG - 1));
}
}
dst[first / BITS_PER_LONG] &= ~0UL << (first % BITS_PER_LONG);
dst[first / BITS_PER_LONG] |= keep;
}
EXPORT_SYMBOL(bitmap_cut);
int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits)
{
unsigned int k;
unsigned int lim = bits/BITS_PER_LONG;
unsigned long result = 0;
for (k = 0; k < lim; k++)
result |= (dst[k] = bitmap1[k] & bitmap2[k]);
if (bits % BITS_PER_LONG)
result |= (dst[k] = bitmap1[k] & bitmap2[k] &
BITMAP_LAST_WORD_MASK(bits));
return result != 0;
}
EXPORT_SYMBOL(__bitmap_and);
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits)
{
unsigned int k;
unsigned int nr = BITS_TO_LONGS(bits);
for (k = 0; k < nr; k++)
dst[k] = bitmap1[k] | bitmap2[k];
}
EXPORT_SYMBOL(__bitmap_or);
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits)
{
unsigned int k;
unsigned int nr = BITS_TO_LONGS(bits);
for (k = 0; k < nr; k++)
dst[k] = bitmap1[k] ^ bitmap2[k];
}
EXPORT_SYMBOL(__bitmap_xor);
int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits)
{
unsigned int k;
unsigned int lim = bits/BITS_PER_LONG;
unsigned long result = 0;
for (k = 0; k < lim; k++)
result |= (dst[k] = bitmap1[k] & ~bitmap2[k]);
if (bits % BITS_PER_LONG)
result |= (dst[k] = bitmap1[k] & ~bitmap2[k] &
BITMAP_LAST_WORD_MASK(bits));
return result != 0;
}
EXPORT_SYMBOL(__bitmap_andnot);
void __bitmap_replace(unsigned long *dst,
const unsigned long *old, const unsigned long *new,
const unsigned long *mask, unsigned int nbits)
{
unsigned int k;
unsigned int nr = BITS_TO_LONGS(nbits);
for (k = 0; k < nr; k++)
dst[k] = (old[k] & ~mask[k]) | (new[k] & mask[k]);
}
EXPORT_SYMBOL(__bitmap_replace);
int __bitmap_intersects(const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits)
{
unsigned int k, lim = bits/BITS_PER_LONG;
for (k = 0; k < lim; ++k)
if (bitmap1[k] & bitmap2[k])
return 1;
if (bits % BITS_PER_LONG)
if ((bitmap1[k] & bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits))
return 1;
return 0;
}
EXPORT_SYMBOL(__bitmap_intersects);
int __bitmap_subset(const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits)
{
unsigned int k, lim = bits/BITS_PER_LONG;
for (k = 0; k < lim; ++k)
if (bitmap1[k] & ~bitmap2[k])
return 0;
if (bits % BITS_PER_LONG)
if ((bitmap1[k] & ~bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits))
return 0;
return 1;
}
EXPORT_SYMBOL(__bitmap_subset);
int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
{
unsigned int k, lim = bits/BITS_PER_LONG;
int w = 0;
for (k = 0; k < lim; k++)
w += hweight_long(bitmap[k]); if (bits % BITS_PER_LONG) w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits)); return w;
}
EXPORT_SYMBOL(__bitmap_weight);
void __bitmap_set(unsigned long *map, unsigned int start, int len)
{
unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len;
int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
while (len - bits_to_set >= 0) {
*p |= mask_to_set;
len -= bits_to_set;
bits_to_set = BITS_PER_LONG;
mask_to_set = ~0UL;
p++;
}
if (len) {
mask_to_set &= BITMAP_LAST_WORD_MASK(size);
*p |= mask_to_set;
}
}
EXPORT_SYMBOL(__bitmap_set);
void __bitmap_clear(unsigned long *map, unsigned int start, int len)
{
unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len;
int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
while (len - bits_to_clear >= 0) {
*p &= ~mask_to_clear;
len -= bits_to_clear;
bits_to_clear = BITS_PER_LONG;
mask_to_clear = ~0UL;
p++;
}
if (len) {
mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
*p &= ~mask_to_clear;
}
}
EXPORT_SYMBOL(__bitmap_clear);
/**
* bitmap_find_next_zero_area_off - find a contiguous aligned zero area
* @map: The address to base the search on
* @size: The bitmap size in bits
* @start: The bitnumber to start searching at
* @nr: The number of zeroed bits we're looking for
* @align_mask: Alignment mask for zero area
* @align_offset: Alignment offset for zero area.
*
* The @align_mask should be one less than a power of 2; the effect is that
* the bit offset of all zero areas this function finds plus @align_offset
* is multiple of that power of 2.
*/
unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
unsigned long size,
unsigned long start,
unsigned int nr,
unsigned long align_mask,
unsigned long align_offset)
{
unsigned long index, end, i;
again:
index = find_next_zero_bit(map, size, start);
/* Align allocation */
index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset;
end = index + nr;
if (end > size)
return end;
i = find_next_bit(map, end, index);
if (i < end) {
start = i + 1;
goto again;
}
return index;
}
EXPORT_SYMBOL(bitmap_find_next_zero_area_off);
/*
* Bitmap printing & parsing functions: first version by Nadia Yvette Chambers,
* second version by Paul Jackson, third by Joe Korty.
*/
/**
* bitmap_parse_user - convert an ASCII hex string in a user buffer into a bitmap
*
* @ubuf: pointer to user buffer containing string.
* @ulen: buffer size in bytes. If string is smaller than this
* then it must be terminated with a \0.
* @maskp: pointer to bitmap array that will contain result.
* @nmaskbits: size of bitmap, in bits.
*/
int bitmap_parse_user(const char __user *ubuf,
unsigned int ulen, unsigned long *maskp,
int nmaskbits)
{
char *buf;
int ret;
buf = memdup_user_nul(ubuf, ulen);
if (IS_ERR(buf))
return PTR_ERR(buf);
ret = bitmap_parse(buf, UINT_MAX, maskp, nmaskbits);
kfree(buf);
return ret;
}
EXPORT_SYMBOL(bitmap_parse_user);
/**
* bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string
* @list: indicates whether the bitmap must be list
* @buf: page aligned buffer into which string is placed
* @maskp: pointer to bitmap to convert
* @nmaskbits: size of bitmap, in bits
*
* Output format is a comma-separated list of decimal numbers and
* ranges if list is specified or hex digits grouped into comma-separated
* sets of 8 digits/set. Returns the number of characters written to buf.
*
* It is assumed that @buf is a pointer into a PAGE_SIZE, page-aligned
* area and that sufficient storage remains at @buf to accommodate the
* bitmap_print_to_pagebuf() output. Returns the number of characters
* actually printed to @buf, excluding terminating '\0'.
*/
int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp,
int nmaskbits)
{
ptrdiff_t len = PAGE_SIZE - offset_in_page(buf);
return list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) :
scnprintf(buf, len, "%*pb\n", nmaskbits, maskp);
}
EXPORT_SYMBOL(bitmap_print_to_pagebuf);
/**
* bitmap_print_to_buf - convert bitmap to list or hex format ASCII string
* @list: indicates whether the bitmap must be list
* true: print in decimal list format
* false: print in hexadecimal bitmask format
*/
static int bitmap_print_to_buf(bool list, char *buf, const unsigned long *maskp,
int nmaskbits, loff_t off, size_t count)
{
const char *fmt = list ? "%*pbl\n" : "%*pb\n";
ssize_t size;
void *data;
data = kasprintf(GFP_KERNEL, fmt, nmaskbits, maskp);
if (!data)
return -ENOMEM;
size = memory_read_from_buffer(buf, count, &off, data, strlen(data) + 1);
kfree(data);
return size;
}
/**
* bitmap_print_bitmask_to_buf - convert bitmap to hex bitmask format ASCII string
*
* The bitmap_print_to_pagebuf() is used indirectly via its cpumap wrapper
* cpumap_print_to_pagebuf() or directly by drivers to export hexadecimal
* bitmask and decimal list to userspace by sysfs ABI.
* Drivers might be using a normal attribute for this kind of ABIs. A
* normal attribute typically has show entry as below:
* static ssize_t example_attribute_show(struct device *dev,
* struct device_attribute *attr, char *buf)
* {
* ...
* return bitmap_print_to_pagebuf(true, buf, &mask, nr_trig_max);
* }
* show entry of attribute has no offset and count parameters and this
* means the file is limited to one page only.
* bitmap_print_to_pagebuf() API works terribly well for this kind of
* normal attribute with buf parameter and without offset, count:
* bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp,
* int nmaskbits)
* {
* }
* The problem is once we have a large bitmap, we have a chance to get a
* bitmask or list more than one page. Especially for list, it could be
* as complex as 0,3,5,7,9,... We have no simple way to know it exact size.
* It turns out bin_attribute is a way to break this limit. bin_attribute
* has show entry as below:
* static ssize_t
* example_bin_attribute_show(struct file *filp, struct kobject *kobj,
* struct bin_attribute *attr, char *buf,
* loff_t offset, size_t count)
* {
* ...
* }
* With the new offset and count parameters, this makes sysfs ABI be able
* to support file size more than one page. For example, offset could be
* >= 4096.
* bitmap_print_bitmask_to_buf(), bitmap_print_list_to_buf() wit their
* cpumap wrapper cpumap_print_bitmask_to_buf(), cpumap_print_list_to_buf()
* make those drivers be able to support large bitmask and list after they
* move to use bin_attribute. In result, we have to pass the corresponding
* parameters such as off, count from bin_attribute show entry to this API.
*
* @buf: buffer into which string is placed
* @maskp: pointer to bitmap to convert
* @nmaskbits: size of bitmap, in bits
* @off: in the string from which we are copying, We copy to @buf
* @count: the maximum number of bytes to print
*
* The role of cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf()
* is similar with cpumap_print_to_pagebuf(), the difference is that
* bitmap_print_to_pagebuf() mainly serves sysfs attribute with the assumption
* the destination buffer is exactly one page and won't be more than one page.
* cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf(), on the other
* hand, mainly serves bin_attribute which doesn't work with exact one page,
* and it can break the size limit of converted decimal list and hexadecimal
* bitmask.
*
* WARNING!
*
* This function is not a replacement for sprintf() or bitmap_print_to_pagebuf().
* It is intended to workaround sysfs limitations discussed above and should be
* used carefully in general case for the following reasons:
* - Time complexity is O(nbits^2/count), comparing to O(nbits) for snprintf().
* - Memory complexity is O(nbits), comparing to O(1) for snprintf().
* - @off and @count are NOT offset and number of bits to print.
* - If printing part of bitmap as list, the resulting string is not a correct
* list representation of bitmap. Particularly, some bits within or out of
* related interval may be erroneously set or unset. The format of the string
* may be broken, so bitmap_parselist-like parser may fail parsing it.
* - If printing the whole bitmap as list by parts, user must ensure the order
* of calls of the function such that the offset is incremented linearly.
* - If printing the whole bitmap as list by parts, user must keep bitmap
* unchanged between the very first and very last call. Otherwise concatenated
* result may be incorrect, and format may be broken.
*
* Returns the number of characters actually printed to @buf
*/
int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp,
int nmaskbits, loff_t off, size_t count)
{
return bitmap_print_to_buf(false, buf, maskp, nmaskbits, off, count);
}
EXPORT_SYMBOL(bitmap_print_bitmask_to_buf);
/**
* bitmap_print_list_to_buf - convert bitmap to decimal list format ASCII string
*
* Everything is same with the above bitmap_print_bitmask_to_buf() except
* the print format.
*/
int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp,
int nmaskbits, loff_t off, size_t count)
{
return bitmap_print_to_buf(true, buf, maskp, nmaskbits, off, count);
}
EXPORT_SYMBOL(bitmap_print_list_to_buf);
/*
* Region 9-38:4/10 describes the following bitmap structure:
* 0 9 12 18 38 N
* .........****......****......****..................
* ^ ^ ^ ^ ^
* start off group_len end nbits
*/
struct region {
unsigned int start;
unsigned int off;
unsigned int group_len;
unsigned int end;
unsigned int nbits;
};
static void bitmap_set_region(const struct region *r, unsigned long *bitmap)
{
unsigned int start;
for (start = r->start; start <= r->end; start += r->group_len)
bitmap_set(bitmap, start, min(r->end - start + 1, r->off));
}
static int bitmap_check_region(const struct region *r)
{
if (r->start > r->end || r->group_len == 0 || r->off > r->group_len)
return -EINVAL;
if (r->end >= r->nbits)
return -ERANGE;
return 0;
}
static const char *bitmap_getnum(const char *str, unsigned int *num,
unsigned int lastbit)
{
unsigned long long n;
unsigned int len;
if (str[0] == 'N') {
*num = lastbit;
return str + 1;
}
len = _parse_integer(str, 10, &n);
if (!len)
return ERR_PTR(-EINVAL);
if (len & KSTRTOX_OVERFLOW || n != (unsigned int)n)
return ERR_PTR(-EOVERFLOW);
*num = n;
return str + len;
}
static inline bool end_of_str(char c)
{
return c == '\0' || c == '\n';
}
static inline bool __end_of_region(char c)
{
return isspace(c) || c == ',';
}
static inline bool end_of_region(char c)
{
return __end_of_region(c) || end_of_str(c);
}
/*
* The format allows commas and whitespaces at the beginning
* of the region.
*/
static const char *bitmap_find_region(const char *str)
{
while (__end_of_region(*str))
str++;
return end_of_str(*str) ? NULL : str;
}
static const char *bitmap_find_region_reverse(const char *start, const char *end)
{
while (start <= end && __end_of_region(*end))
end--;
return end;
}
static const char *bitmap_parse_region(const char *str, struct region *r)
{
unsigned int lastbit = r->nbits - 1;
if (!strncasecmp(str, "all", 3)) {
r->start = 0;
r->end = lastbit;
str += 3;
goto check_pattern;
}
str = bitmap_getnum(str, &r->start, lastbit);
if (IS_ERR(str))
return str;
if (end_of_region(*str))
goto no_end;
if (*str != '-')
return ERR_PTR(-EINVAL);
str = bitmap_getnum(str + 1, &r->end, lastbit);
if (IS_ERR(str))
return str;
check_pattern:
if (end_of_region(*str))
goto no_pattern;
if (*str != ':')
return ERR_PTR(-EINVAL);
str = bitmap_getnum(str + 1, &r->off, lastbit);
if (IS_ERR(str))
return str;
if (*str != '/')
return ERR_PTR(-EINVAL);
return bitmap_getnum(str + 1, &r->group_len, lastbit);
no_end:
r->end = r->start;
no_pattern:
r->off = r->end + 1;
r->group_len = r->end + 1;
return end_of_str(*str) ? NULL : str;
}
/**
* bitmap_parselist - convert list format ASCII string to bitmap
* @buf: read user string from this buffer; must be terminated
* with a \0 or \n.
* @maskp: write resulting mask here
* @nmaskbits: number of bits in mask to be written
*
* Input format is a comma-separated list of decimal numbers and
* ranges. Consecutively set bits are shown as two hyphen-separated
* decimal numbers, the smallest and largest bit numbers set in
* the range.
* Optionally each range can be postfixed to denote that only parts of it
* should be set. The range will divided to groups of specific size.
* From each group will be used only defined amount of bits.
* Syntax: range:used_size/group_size
* Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769
* The value 'N' can be used as a dynamically substituted token for the
* maximum allowed value; i.e (nmaskbits - 1). Keep in mind that it is
* dynamic, so if system changes cause the bitmap width to change, such
* as more cores in a CPU list, then any ranges using N will also change.
*
* Returns: 0 on success, -errno on invalid input strings. Error values:
*
* - ``-EINVAL``: wrong region format
* - ``-EINVAL``: invalid character in string
* - ``-ERANGE``: bit number specified too large for mask
* - ``-EOVERFLOW``: integer overflow in the input parameters
*/
int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits)
{
struct region r;
long ret;
r.nbits = nmaskbits;
bitmap_zero(maskp, r.nbits);
while (buf) {
buf = bitmap_find_region(buf);
if (buf == NULL)
return 0;
buf = bitmap_parse_region(buf, &r);
if (IS_ERR(buf))
return PTR_ERR(buf);
ret = bitmap_check_region(&r);
if (ret)
return ret;
bitmap_set_region(&r, maskp);
}
return 0;
}
EXPORT_SYMBOL(bitmap_parselist);
/**
* bitmap_parselist_user()
*
* @ubuf: pointer to user buffer containing string.
* @ulen: buffer size in bytes. If string is smaller than this
* then it must be terminated with a \0.
* @maskp: pointer to bitmap array that will contain result.
* @nmaskbits: size of bitmap, in bits.
*
* Wrapper for bitmap_parselist(), providing it with user buffer.
*/
int bitmap_parselist_user(const char __user *ubuf,
unsigned int ulen, unsigned long *maskp,
int nmaskbits)
{
char *buf;
int ret;
buf = memdup_user_nul(ubuf, ulen);
if (IS_ERR(buf))
return PTR_ERR(buf);
ret = bitmap_parselist(buf, maskp, nmaskbits);
kfree(buf);
return ret;
}
EXPORT_SYMBOL(bitmap_parselist_user);
static const char *bitmap_get_x32_reverse(const char *start,
const char *end, u32 *num)
{
u32 ret = 0;
int c, i;
for (i = 0; i < 32; i += 4) {
c = hex_to_bin(*end--);
if (c < 0)
return ERR_PTR(-EINVAL);
ret |= c << i;
if (start > end || __end_of_region(*end))
goto out;
}
if (hex_to_bin(*end--) >= 0)
return ERR_PTR(-EOVERFLOW);
out:
*num = ret;
return end;
}
/**
* bitmap_parse - convert an ASCII hex string into a bitmap.
* @start: pointer to buffer containing string.
* @buflen: buffer size in bytes. If string is smaller than this
* then it must be terminated with a \0 or \n. In that case,
* UINT_MAX may be provided instead of string length.
* @maskp: pointer to bitmap array that will contain result.
* @nmaskbits: size of bitmap, in bits.
*
* Commas group hex digits into chunks. Each chunk defines exactly 32
* bits of the resultant bitmask. No chunk may specify a value larger
* than 32 bits (%-EOVERFLOW), and if a chunk specifies a smaller value
* then leading 0-bits are prepended. %-EINVAL is returned for illegal
* characters. Grouping such as "1,,5", ",44", "," or "" is allowed.
* Leading, embedded and trailing whitespace accepted.
*/
int bitmap_parse(const char *start, unsigned int buflen,
unsigned long *maskp, int nmaskbits)
{
const char *end = strnchrnul(start, buflen, '\n') - 1;
int chunks = BITS_TO_U32(nmaskbits);
u32 *bitmap = (u32 *)maskp;
int unset_bit;
int chunk;
for (chunk = 0; ; chunk++) {
end = bitmap_find_region_reverse(start, end);
if (start > end)
break;
if (!chunks--)
return -EOVERFLOW;
#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
end = bitmap_get_x32_reverse(start, end, &bitmap[chunk ^ 1]);
#else
end = bitmap_get_x32_reverse(start, end, &bitmap[chunk]);
#endif
if (IS_ERR(end))
return PTR_ERR(end);
}
unset_bit = (BITS_TO_U32(nmaskbits) - chunks) * 32;
if (unset_bit < nmaskbits) {
bitmap_clear(maskp, unset_bit, nmaskbits - unset_bit);
return 0;
}
if (find_next_bit(maskp, unset_bit, nmaskbits) != unset_bit)
return -EOVERFLOW;
return 0;
}
EXPORT_SYMBOL(bitmap_parse);
/**
* bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
* @buf: pointer to a bitmap
* @pos: a bit position in @buf (0 <= @pos < @nbits)
* @nbits: number of valid bit positions in @buf
*
* Map the bit at position @pos in @buf (of length @nbits) to the
* ordinal of which set bit it is. If it is not set or if @pos
* is not a valid bit position, map to -1.
*
* If for example, just bits 4 through 7 are set in @buf, then @pos
* values 4 through 7 will get mapped to 0 through 3, respectively,
* and other @pos values will get mapped to -1. When @pos value 7
* gets mapped to (returns) @ord value 3 in this example, that means
* that bit 7 is the 3rd (starting with 0th) set bit in @buf.
*
* The bit positions 0 through @bits are valid positions in @buf.
*/
static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits)
{
if (pos >= nbits || !test_bit(pos, buf))
return -1;
return __bitmap_weight(buf, pos);
}
/**
* bitmap_ord_to_pos - find position of n-th set bit in bitmap
* @buf: pointer to bitmap
* @ord: ordinal bit position (n-th set bit, n >= 0)
* @nbits: number of valid bit positions in @buf
*
* Map the ordinal offset of bit @ord in @buf to its position in @buf.
* Value of @ord should be in range 0 <= @ord < weight(buf). If @ord
* >= weight(buf), returns @nbits.
*
* If for example, just bits 4 through 7 are set in @buf, then @ord
* values 0 through 3 will get mapped to 4 through 7, respectively,
* and all other @ord values returns @nbits. When @ord value 3
* gets mapped to (returns) @pos value 7 in this example, that means
* that the 3rd set bit (starting with 0th) is at position 7 in @buf.
*
* The bit positions 0 through @nbits-1 are valid positions in @buf.
*/
unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsigned int nbits)
{
unsigned int pos;
for (pos = find_first_bit(buf, nbits);
pos < nbits && ord;
pos = find_next_bit(buf, nbits, pos + 1))
ord--;
return pos;
}
/**
* bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap
* @dst: remapped result
* @src: subset to be remapped
* @old: defines domain of map
* @new: defines range of map
* @nbits: number of bits in each of these bitmaps
*
* Let @old and @new define a mapping of bit positions, such that
* whatever position is held by the n-th set bit in @old is mapped
* to the n-th set bit in @new. In the more general case, allowing
* for the possibility that the weight 'w' of @new is less than the
* weight of @old, map the position of the n-th set bit in @old to
* the position of the m-th set bit in @new, where m == n % w.
*
* If either of the @old and @new bitmaps are empty, or if @src and
* @dst point to the same location, then this routine copies @src
* to @dst.
*
* The positions of unset bits in @old are mapped to themselves
* (the identify map).
*
* Apply the above specified mapping to @src, placing the result in
* @dst, clearing any bits previously set in @dst.
*
* For example, lets say that @old has bits 4 through 7 set, and
* @new has bits 12 through 15 set. This defines the mapping of bit
* position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
* bit positions unchanged. So if say @src comes into this routine
* with bits 1, 5 and 7 set, then @dst should leave with bits 1,
* 13 and 15 set.
*/
void bitmap_remap(unsigned long *dst, const unsigned long *src,
const unsigned long *old, const unsigned long *new,
unsigned int nbits)
{
unsigned int oldbit, w;
if (dst == src) /* following doesn't handle inplace remaps */
return;
bitmap_zero(dst, nbits);
w = bitmap_weight(new, nbits);
for_each_set_bit(oldbit, src, nbits) {
int n = bitmap_pos_to_ord(old, oldbit, nbits);
if (n < 0 || w == 0)
set_bit(oldbit, dst); /* identity map */
else
set_bit(bitmap_ord_to_pos(new, n % w, nbits), dst);
}
}
EXPORT_SYMBOL(bitmap_remap);
/**
* bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit
* @oldbit: bit position to be mapped
* @old: defines domain of map
* @new: defines range of map
* @bits: number of bits in each of these bitmaps
*
* Let @old and @new define a mapping of bit positions, such that
* whatever position is held by the n-th set bit in @old is mapped
* to the n-th set bit in @new. In the more general case, allowing
* for the possibility that the weight 'w' of @new is less than the
* weight of @old, map the position of the n-th set bit in @old to
* the position of the m-th set bit in @new, where m == n % w.
*
* The positions of unset bits in @old are mapped to themselves
* (the identify map).
*
* Apply the above specified mapping to bit position @oldbit, returning
* the new bit position.
*
* For example, lets say that @old has bits 4 through 7 set, and
* @new has bits 12 through 15 set. This defines the mapping of bit
* position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
* bit positions unchanged. So if say @oldbit is 5, then this routine
* returns 13.
*/
int bitmap_bitremap(int oldbit, const unsigned long *old,
const unsigned long *new, int bits)
{
int w = bitmap_weight(new, bits);
int n = bitmap_pos_to_ord(old, oldbit, bits);
if (n < 0 || w == 0)
return oldbit;
else
return bitmap_ord_to_pos(new, n % w, bits);
}
EXPORT_SYMBOL(bitmap_bitremap);
#ifdef CONFIG_NUMA
/**
* bitmap_onto - translate one bitmap relative to another
* @dst: resulting translated bitmap
* @orig: original untranslated bitmap
* @relmap: bitmap relative to which translated
* @bits: number of bits in each of these bitmaps
*
* Set the n-th bit of @dst iff there exists some m such that the
* n-th bit of @relmap is set, the m-th bit of @orig is set, and
* the n-th bit of @relmap is also the m-th _set_ bit of @relmap.
* (If you understood the previous sentence the first time your
* read it, you're overqualified for your current job.)
*
* In other words, @orig is mapped onto (surjectively) @dst,
* using the map { <n, m> | the n-th bit of @relmap is the
* m-th set bit of @relmap }.
*
* Any set bits in @orig above bit number W, where W is the
* weight of (number of set bits in) @relmap are mapped nowhere.
* In particular, if for all bits m set in @orig, m >= W, then
* @dst will end up empty. In situations where the possibility
* of such an empty result is not desired, one way to avoid it is
* to use the bitmap_fold() operator, below, to first fold the
* @orig bitmap over itself so that all its set bits x are in the
* range 0 <= x < W. The bitmap_fold() operator does this by
* setting the bit (m % W) in @dst, for each bit (m) set in @orig.
*
* Example [1] for bitmap_onto():
* Let's say @relmap has bits 30-39 set, and @orig has bits
* 1, 3, 5, 7, 9 and 11 set. Then on return from this routine,
* @dst will have bits 31, 33, 35, 37 and 39 set.
*
* When bit 0 is set in @orig, it means turn on the bit in
* @dst corresponding to whatever is the first bit (if any)
* that is turned on in @relmap. Since bit 0 was off in the
* above example, we leave off that bit (bit 30) in @dst.
*
* When bit 1 is set in @orig (as in the above example), it
* means turn on the bit in @dst corresponding to whatever
* is the second bit that is turned on in @relmap. The second
* bit in @relmap that was turned on in the above example was
* bit 31, so we turned on bit 31 in @dst.
*
* Similarly, we turned on bits 33, 35, 37 and 39 in @dst,
* because they were the 4th, 6th, 8th and 10th set bits
* set in @relmap, and the 4th, 6th, 8th and 10th bits of
* @orig (i.e. bits 3, 5, 7 and 9) were also set.
*
* When bit 11 is set in @orig, it means turn on the bit in
* @dst corresponding to whatever is the twelfth bit that is
* turned on in @relmap. In the above example, there were
* only ten bits turned on in @relmap (30..39), so that bit
* 11 was set in @orig had no affect on @dst.
*
* Example [2] for bitmap_fold() + bitmap_onto():
* Let's say @relmap has these ten bits set::
*
* 40 41 42 43 45 48 53 61 74 95
*
* (for the curious, that's 40 plus the first ten terms of the
* Fibonacci sequence.)
*
* Further lets say we use the following code, invoking
* bitmap_fold() then bitmap_onto, as suggested above to
* avoid the possibility of an empty @dst result::
*
* unsigned long *tmp; // a temporary bitmap's bits
*
* bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits);
* bitmap_onto(dst, tmp, relmap, bits);
*
* Then this table shows what various values of @dst would be, for
* various @orig's. I list the zero-based positions of each set bit.
* The tmp column shows the intermediate result, as computed by
* using bitmap_fold() to fold the @orig bitmap modulo ten
* (the weight of @relmap):
*
* =============== ============== =================
* @orig tmp @dst
* 0 0 40
* 1 1 41
* 9 9 95
* 10 0 40 [#f1]_
* 1 3 5 7 1 3 5 7 41 43 48 61
* 0 1 2 3 4 0 1 2 3 4 40 41 42 43 45
* 0 9 18 27 0 9 8 7 40 61 74 95
* 0 10 20 30 0 40
* 0 11 22 33 0 1 2 3 40 41 42 43
* 0 12 24 36 0 2 4 6 40 42 45 53
* 78 102 211 1 2 8 41 42 74 [#f1]_
* =============== ============== =================
*
* .. [#f1]
*
* For these marked lines, if we hadn't first done bitmap_fold()
* into tmp, then the @dst result would have been empty.
*
* If either of @orig or @relmap is empty (no set bits), then @dst
* will be returned empty.
*
* If (as explained above) the only set bits in @orig are in positions
* m where m >= W, (where W is the weight of @relmap) then @dst will
* once again be returned empty.
*
* All bits in @dst not set by the above rule are cleared.
*/
void bitmap_onto(unsigned long *dst, const unsigned long *orig,
const unsigned long *relmap, unsigned int bits)
{
unsigned int n, m; /* same meaning as in above comment */
if (dst == orig) /* following doesn't handle inplace mappings */
return;
bitmap_zero(dst, bits);
/*
* The following code is a more efficient, but less
* obvious, equivalent to the loop:
* for (m = 0; m < bitmap_weight(relmap, bits); m++) {
* n = bitmap_ord_to_pos(orig, m, bits);
* if (test_bit(m, orig))
* set_bit(n, dst);
* }
*/
m = 0;
for_each_set_bit(n, relmap, bits) {
/* m == bitmap_pos_to_ord(relmap, n, bits) */
if (test_bit(m, orig))
set_bit(n, dst);
m++;
}
}
/**
* bitmap_fold - fold larger bitmap into smaller, modulo specified size
* @dst: resulting smaller bitmap
* @orig: original larger bitmap
* @sz: specified size
* @nbits: number of bits in each of these bitmaps
*
* For each bit oldbit in @orig, set bit oldbit mod @sz in @dst.
* Clear all other bits in @dst. See further the comment and
* Example [2] for bitmap_onto() for why and how to use this.
*/
void bitmap_fold(unsigned long *dst, const unsigned long *orig,
unsigned int sz, unsigned int nbits)
{
unsigned int oldbit;
if (dst == orig) /* following doesn't handle inplace mappings */
return;
bitmap_zero(dst, nbits);
for_each_set_bit(oldbit, orig, nbits)
set_bit(oldbit % sz, dst);
}
#endif /* CONFIG_NUMA */
/*
* Common code for bitmap_*_region() routines.
* bitmap: array of unsigned longs corresponding to the bitmap
* pos: the beginning of the region
* order: region size (log base 2 of number of bits)
* reg_op: operation(s) to perform on that region of bitmap
*
* Can set, verify and/or release a region of bits in a bitmap,
* depending on which combination of REG_OP_* flag bits is set.
*
* A region of a bitmap is a sequence of bits in the bitmap, of
* some size '1 << order' (a power of two), aligned to that same
* '1 << order' power of two.
*
* Returns 1 if REG_OP_ISFREE succeeds (region is all zero bits).
* Returns 0 in all other cases and reg_ops.
*/
enum {
REG_OP_ISFREE, /* true if region is all zero bits */
REG_OP_ALLOC, /* set all bits in region */
REG_OP_RELEASE, /* clear all bits in region */
};
static int __reg_op(unsigned long *bitmap, unsigned int pos, int order, int reg_op)
{
int nbits_reg; /* number of bits in region */
int index; /* index first long of region in bitmap */
int offset; /* bit offset region in bitmap[index] */
int nlongs_reg; /* num longs spanned by region in bitmap */
int nbitsinlong; /* num bits of region in each spanned long */
unsigned long mask; /* bitmask for one long of region */
int i; /* scans bitmap by longs */
int ret = 0; /* return value */
/*
* Either nlongs_reg == 1 (for small orders that fit in one long)
* or (offset == 0 && mask == ~0UL) (for larger multiword orders.)
*/
nbits_reg = 1 << order;
index = pos / BITS_PER_LONG;
offset = pos - (index * BITS_PER_LONG);
nlongs_reg = BITS_TO_LONGS(nbits_reg);
nbitsinlong = min(nbits_reg, BITS_PER_LONG);
/*
* Can't do "mask = (1UL << nbitsinlong) - 1", as that
* overflows if nbitsinlong == BITS_PER_LONG.
*/
mask = (1UL << (nbitsinlong - 1));
mask += mask - 1;
mask <<= offset;
switch (reg_op) {
case REG_OP_ISFREE:
for (i = 0; i < nlongs_reg; i++) {
if (bitmap[index + i] & mask)
goto done;
}
ret = 1; /* all bits in region free (zero) */
break;
case REG_OP_ALLOC:
for (i = 0; i < nlongs_reg; i++)
bitmap[index + i] |= mask;
break;
case REG_OP_RELEASE:
for (i = 0; i < nlongs_reg; i++)
bitmap[index + i] &= ~mask;
break;
}
done:
return ret;
}
/**
* bitmap_find_free_region - find a contiguous aligned mem region
* @bitmap: array of unsigned longs corresponding to the bitmap
* @bits: number of bits in the bitmap
* @order: region size (log base 2 of number of bits) to find
*
* Find a region of free (zero) bits in a @bitmap of @bits bits and
* allocate them (set them to one). Only consider regions of length
* a power (@order) of two, aligned to that power of two, which
* makes the search algorithm much faster.
*
* Return the bit offset in bitmap of the allocated region,
* or -errno on failure.
*/
int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
{
unsigned int pos, end; /* scans bitmap by regions of size order */
for (pos = 0 ; (end = pos + (1U << order)) <= bits; pos = end) {
if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE))
continue;
__reg_op(bitmap, pos, order, REG_OP_ALLOC);
return pos;
}
return -ENOMEM;
}
EXPORT_SYMBOL(bitmap_find_free_region);
/**
* bitmap_release_region - release allocated bitmap region
* @bitmap: array of unsigned longs corresponding to the bitmap
* @pos: beginning of bit region to release
* @order: region size (log base 2 of number of bits) to release
*
* This is the complement to __bitmap_find_free_region() and releases
* the found region (by clearing it in the bitmap).
*
* No return value.
*/
void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order)
{
__reg_op(bitmap, pos, order, REG_OP_RELEASE);
}
EXPORT_SYMBOL(bitmap_release_region);
/**
* bitmap_allocate_region - allocate bitmap region
* @bitmap: array of unsigned longs corresponding to the bitmap
* @pos: beginning of bit region to allocate
* @order: region size (log base 2 of number of bits) to allocate
*
* Allocate (set bits in) a specified region of a bitmap.
*
* Return 0 on success, or %-EBUSY if specified region wasn't
* free (not all bits were zero).
*/
int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order)
{
if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE))
return -EBUSY;
return __reg_op(bitmap, pos, order, REG_OP_ALLOC);
}
EXPORT_SYMBOL(bitmap_allocate_region);
/**
* bitmap_copy_le - copy a bitmap, putting the bits into little-endian order.
* @dst: destination buffer
* @src: bitmap to copy
* @nbits: number of bits in the bitmap
*
* Require nbits % BITS_PER_LONG == 0.
*/
#ifdef __BIG_ENDIAN
void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits)
{
unsigned int i;
for (i = 0; i < nbits/BITS_PER_LONG; i++) {
if (BITS_PER_LONG == 64)
dst[i] = cpu_to_le64(src[i]);
else
dst[i] = cpu_to_le32(src[i]);
}
}
EXPORT_SYMBOL(bitmap_copy_le);
#endif
unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags)
{
return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long),
flags);
}
EXPORT_SYMBOL(bitmap_alloc);
unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags)
{
return bitmap_alloc(nbits, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(bitmap_zalloc);
void bitmap_free(const unsigned long *bitmap)
{
kfree(bitmap);
}
EXPORT_SYMBOL(bitmap_free);
static void devm_bitmap_free(void *data)
{
unsigned long *bitmap = data;
bitmap_free(bitmap);
}
unsigned long *devm_bitmap_alloc(struct device *dev,
unsigned int nbits, gfp_t flags)
{
unsigned long *bitmap;
int ret;
bitmap = bitmap_alloc(nbits, flags);
if (!bitmap)
return NULL;
ret = devm_add_action_or_reset(dev, devm_bitmap_free, bitmap);
if (ret)
return NULL;
return bitmap;
}
EXPORT_SYMBOL_GPL(devm_bitmap_alloc);
unsigned long *devm_bitmap_zalloc(struct device *dev,
unsigned int nbits, gfp_t flags)
{
return devm_bitmap_alloc(dev, nbits, flags | __GFP_ZERO);
}
EXPORT_SYMBOL_GPL(devm_bitmap_zalloc);
#if BITS_PER_LONG == 64
/**
* bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap
* @bitmap: array of unsigned longs, the destination bitmap
* @buf: array of u32 (in host byte order), the source bitmap
* @nbits: number of bits in @bitmap
*/
void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits)
{
unsigned int i, halfwords;
halfwords = DIV_ROUND_UP(nbits, 32);
for (i = 0; i < halfwords; i++) {
bitmap[i/2] = (unsigned long) buf[i];
if (++i < halfwords)
bitmap[i/2] |= ((unsigned long) buf[i]) << 32;
}
/* Clear tail bits in last word beyond nbits. */
if (nbits % BITS_PER_LONG)
bitmap[(halfwords - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr32);
/**
* bitmap_to_arr32 - copy the contents of bitmap to a u32 array of bits
* @buf: array of u32 (in host byte order), the dest bitmap
* @bitmap: array of unsigned longs, the source bitmap
* @nbits: number of bits in @bitmap
*/
void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits)
{
unsigned int i, halfwords;
halfwords = DIV_ROUND_UP(nbits, 32);
for (i = 0; i < halfwords; i++) {
buf[i] = (u32) (bitmap[i/2] & UINT_MAX);
if (++i < halfwords)
buf[i] = (u32) (bitmap[i/2] >> 32);
}
/* Clear tail bits in last element of array beyond nbits. */
if (nbits % BITS_PER_LONG)
buf[halfwords - 1] &= (u32) (UINT_MAX >> ((-nbits) & 31));
}
EXPORT_SYMBOL(bitmap_to_arr32);
#endif
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Definitions for the Interfaces handler.
*
* Version: @(#)dev.h 1.0.10 08/12/93
*
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Corey Minyard <wf-rch!minyard@relay.EU.net>
* Donald J. Becker, <becker@cesdis.gsfc.nasa.gov>
* Alan Cox, <alan@lxorguk.ukuu.org.uk>
* Bjorn Ekwall. <bj0rn@blox.se>
* Pekka Riikonen <priikone@poseidon.pspt.fi>
*
* Moved to /usr/include/linux for NET3
*/
#ifndef _LINUX_NETDEVICE_H
#define _LINUX_NETDEVICE_H
#include <linux/timer.h>
#include <linux/bug.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
#include <asm/cache.h>
#include <asm/byteorder.h>
#include <linux/percpu.h>
#include <linux/rculist.h>
#include <linux/workqueue.h>
#include <linux/dynamic_queue_limits.h>
#include <net/net_namespace.h>
#ifdef CONFIG_DCB
#include <net/dcbnl.h>
#endif
#include <net/netprio_cgroup.h>
#include <net/xdp.h>
#include <linux/netdev_features.h>
#include <linux/neighbour.h>
#include <uapi/linux/netdevice.h>
#include <uapi/linux/if_bonding.h>
#include <uapi/linux/pkt_cls.h>
#include <linux/hashtable.h>
#include <linux/rbtree.h>
struct netpoll_info;
struct device;
struct ethtool_ops;
struct phy_device;
struct dsa_port;
struct ip_tunnel_parm;
struct macsec_context;
struct macsec_ops;
struct sfp_bus;
/* 802.11 specific */
struct wireless_dev;
/* 802.15.4 specific */
struct wpan_dev;
struct mpls_dev;
/* UDP Tunnel offloads */
struct udp_tunnel_info;
struct udp_tunnel_nic_info;
struct udp_tunnel_nic;
struct bpf_prog;
struct xdp_buff;
void synchronize_net(void);
void netdev_set_default_ethtool_ops(struct net_device *dev,
const struct ethtool_ops *ops);
/* Backlog congestion levels */
#define NET_RX_SUCCESS 0 /* keep 'em coming, baby */
#define NET_RX_DROP 1 /* packet dropped */
#define MAX_NEST_DEV 8
/*
* Transmit return codes: transmit return codes originate from three different
* namespaces:
*
* - qdisc return codes
* - driver transmit return codes
* - errno values
*
* Drivers are allowed to return any one of those in their hard_start_xmit()
* function. Real network devices commonly used with qdiscs should only return
* the driver transmit return codes though - when qdiscs are used, the actual
* transmission happens asynchronously, so the value is not propagated to
* higher layers. Virtual network devices transmit synchronously; in this case
* the driver transmit return codes are consumed by dev_queue_xmit(), and all
* others are propagated to higher layers.
*/
/* qdisc ->enqueue() return codes. */
#define NET_XMIT_SUCCESS 0x00
#define NET_XMIT_DROP 0x01 /* skb dropped */
#define NET_XMIT_CN 0x02 /* congestion notification */
#define NET_XMIT_MASK 0x0f /* qdisc flags in net/sch_generic.h */
/* NET_XMIT_CN is special. It does not guarantee that this packet is lost. It
* indicates that the device will soon be dropping packets, or already drops
* some packets of the same priority; prompting us to send less aggressively. */
#define net_xmit_eval(e) ((e) == NET_XMIT_CN ? 0 : (e))
#define net_xmit_errno(e) ((e) != NET_XMIT_CN ? -ENOBUFS : 0)
/* Driver transmit return codes */
#define NETDEV_TX_MASK 0xf0
enum netdev_tx {
__NETDEV_TX_MIN = INT_MIN, /* make sure enum is signed */
NETDEV_TX_OK = 0x00, /* driver took care of packet */
NETDEV_TX_BUSY = 0x10, /* driver tx path was busy*/
};
typedef enum netdev_tx netdev_tx_t;
/*
* Current order: NETDEV_TX_MASK > NET_XMIT_MASK >= 0 is significant;
* hard_start_xmit() return < NET_XMIT_MASK means skb was consumed.
*/
static inline bool dev_xmit_complete(int rc)
{
/*
* Positive cases with an skb consumed by a driver:
* - successful transmission (rc == NETDEV_TX_OK)
* - error while transmitting (rc < 0)
* - error while queueing to a different device (rc & NET_XMIT_MASK)
*/
if (likely(rc < NET_XMIT_MASK))
return true;
return false;
}
/*
* Compute the worst-case header length according to the protocols
* used.
*/
#if defined(CONFIG_HYPERV_NET)
# define LL_MAX_HEADER 128
#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25)
# if defined(CONFIG_MAC80211_MESH)
# define LL_MAX_HEADER 128
# else
# define LL_MAX_HEADER 96
# endif
#else
# define LL_MAX_HEADER 32
#endif
#if !IS_ENABLED(CONFIG_NET_IPIP) && !IS_ENABLED(CONFIG_NET_IPGRE) && \
!IS_ENABLED(CONFIG_IPV6_SIT) && !IS_ENABLED(CONFIG_IPV6_TUNNEL)
#define MAX_HEADER LL_MAX_HEADER
#else
#define MAX_HEADER (LL_MAX_HEADER + 48)
#endif
/*
* Old network device statistics. Fields are native words
* (unsigned long) so they can be read and written atomically.
*/
struct net_device_stats {
unsigned long rx_packets;
unsigned long tx_packets;
unsigned long rx_bytes;
unsigned long tx_bytes;
unsigned long rx_errors;
unsigned long tx_errors;
unsigned long rx_dropped;
unsigned long tx_dropped;
unsigned long multicast;
unsigned long collisions;
unsigned long rx_length_errors;
unsigned long rx_over_errors;
unsigned long rx_crc_errors;
unsigned long rx_frame_errors;
unsigned long rx_fifo_errors;
unsigned long rx_missed_errors;
unsigned long tx_aborted_errors;
unsigned long tx_carrier_errors;
unsigned long tx_fifo_errors;
unsigned long tx_heartbeat_errors;
unsigned long tx_window_errors;
unsigned long rx_compressed;
unsigned long tx_compressed;
};
#include <linux/cache.h>
#include <linux/skbuff.h>
#ifdef CONFIG_RPS
#include <linux/static_key.h>
extern struct static_key_false rps_needed;
extern struct static_key_false rfs_needed;
#endif
struct neighbour;
struct neigh_parms;
struct sk_buff;
struct netdev_hw_addr {
struct list_head list;
struct rb_node node;
unsigned char addr[MAX_ADDR_LEN];
unsigned char type;
#define NETDEV_HW_ADDR_T_LAN 1
#define NETDEV_HW_ADDR_T_SAN 2
#define NETDEV_HW_ADDR_T_UNICAST 3
#define NETDEV_HW_ADDR_T_MULTICAST 4
bool global_use;
int sync_cnt;
int refcount;
int synced;
struct rcu_head rcu_head;
};
struct netdev_hw_addr_list {
struct list_head list;
int count;
/* Auxiliary tree for faster lookup on addition and deletion */
struct rb_root tree;
};
#define netdev_hw_addr_list_count(l) ((l)->count)
#define netdev_hw_addr_list_empty(l) (netdev_hw_addr_list_count(l) == 0)
#define netdev_hw_addr_list_for_each(ha, l) \
list_for_each_entry(ha, &(l)->list, list)
#define netdev_uc_count(dev) netdev_hw_addr_list_count(&(dev)->uc)
#define netdev_uc_empty(dev) netdev_hw_addr_list_empty(&(dev)->uc)
#define netdev_for_each_uc_addr(ha, dev) \
netdev_hw_addr_list_for_each(ha, &(dev)->uc)
#define netdev_mc_count(dev) netdev_hw_addr_list_count(&(dev)->mc)
#define netdev_mc_empty(dev) netdev_hw_addr_list_empty(&(dev)->mc)
#define netdev_for_each_mc_addr(ha, dev) \
netdev_hw_addr_list_for_each(ha, &(dev)->mc)
struct hh_cache {
unsigned int hh_len;
seqlock_t hh_lock;
/* cached hardware header; allow for machine alignment needs. */
#define HH_DATA_MOD 16
#define HH_DATA_OFF(__len) \
(HH_DATA_MOD - (((__len - 1) & (HH_DATA_MOD - 1)) + 1))
#define HH_DATA_ALIGN(__len) \
(((__len)+(HH_DATA_MOD-1))&~(HH_DATA_MOD - 1))
unsigned long hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)];
};
/* Reserve HH_DATA_MOD byte-aligned hard_header_len, but at least that much.
* Alternative is:
* dev->hard_header_len ? (dev->hard_header_len +
* (HH_DATA_MOD - 1)) & ~(HH_DATA_MOD - 1) : 0
*
* We could use other alignment values, but we must maintain the
* relationship HH alignment <= LL alignment.
*/
#define LL_RESERVED_SPACE(dev) \
((((dev)->hard_header_len+(dev)->needed_headroom)&~(HH_DATA_MOD - 1)) + HH_DATA_MOD)
#define LL_RESERVED_SPACE_EXTRA(dev,extra) \
((((dev)->hard_header_len+(dev)->needed_headroom+(extra))&~(HH_DATA_MOD - 1)) + HH_DATA_MOD)
struct header_ops {
int (*create) (struct sk_buff *skb, struct net_device *dev,
unsigned short type, const void *daddr,
const void *saddr, unsigned int len);
int (*parse)(const struct sk_buff *skb, unsigned char *haddr);
int (*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type);
void (*cache_update)(struct hh_cache *hh,
const struct net_device *dev,
const unsigned char *haddr);
bool (*validate)(const char *ll_header, unsigned int len);
__be16 (*parse_protocol)(const struct sk_buff *skb);
};
/* These flag bits are private to the generic network queueing
* layer; they may not be explicitly referenced by any other
* code.
*/
enum netdev_state_t {
__LINK_STATE_START,
__LINK_STATE_PRESENT,
__LINK_STATE_NOCARRIER,
__LINK_STATE_LINKWATCH_PENDING,
__LINK_STATE_DORMANT,
__LINK_STATE_TESTING,
};
struct gro_list {
struct list_head list;
int count;
};
/*
* size of gro hash buckets, must less than bit number of
* napi_struct::gro_bitmask
*/
#define GRO_HASH_BUCKETS 8
/*
* Structure for NAPI scheduling similar to tasklet but with weighting
*/
struct napi_struct {
/* The poll_list must only be managed by the entity which
* changes the state of the NAPI_STATE_SCHED bit. This means
* whoever atomically sets that bit can add this napi_struct
* to the per-CPU poll_list, and whoever clears that bit
* can remove from the list right before clearing the bit.
*/
struct list_head poll_list;
unsigned long state;
int weight;
int defer_hard_irqs_count;
unsigned long gro_bitmask;
int (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
int poll_owner;
#endif
struct net_device *dev;
struct gro_list gro_hash[GRO_HASH_BUCKETS];
struct sk_buff *skb;
struct list_head rx_list; /* Pending GRO_NORMAL skbs */
int rx_count; /* length of rx_list */
struct hrtimer timer;
struct list_head dev_list;
struct hlist_node napi_hash_node;
unsigned int napi_id;
struct task_struct *thread;
};
enum {
NAPI_STATE_SCHED, /* Poll is scheduled */
NAPI_STATE_MISSED, /* reschedule a napi */
NAPI_STATE_DISABLE, /* Disable pending */
NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
NAPI_STATE_LISTED, /* NAPI added to system lists */
NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */
NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/
NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/
NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */
};
enum {
NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED),
NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED),
NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE),
NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC),
NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL),
NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED),
};
enum gro_result {
GRO_MERGED,
GRO_MERGED_FREE,
GRO_HELD,
GRO_NORMAL,
GRO_CONSUMED,
};
typedef enum gro_result gro_result_t;
/*
* enum rx_handler_result - Possible return values for rx_handlers.
* @RX_HANDLER_CONSUMED: skb was consumed by rx_handler, do not process it
* further.
* @RX_HANDLER_ANOTHER: Do another round in receive path. This is indicated in
* case skb->dev was changed by rx_handler.
* @RX_HANDLER_EXACT: Force exact delivery, no wildcard.
* @RX_HANDLER_PASS: Do nothing, pass the skb as if no rx_handler was called.
*
* rx_handlers are functions called from inside __netif_receive_skb(), to do
* special processing of the skb, prior to delivery to protocol handlers.
*
* Currently, a net_device can only have a single rx_handler registered. Trying
* to register a second rx_handler will return -EBUSY.
*
* To register a rx_handler on a net_device, use netdev_rx_handler_register().
* To unregister a rx_handler on a net_device, use
* netdev_rx_handler_unregister().
*
* Upon return, rx_handler is expected to tell __netif_receive_skb() what to
* do with the skb.
*
* If the rx_handler consumed the skb in some way, it should return
* RX_HANDLER_CONSUMED. This is appropriate when the rx_handler arranged for
* the skb to be delivered in some other way.
*
* If the rx_handler changed skb->dev, to divert the skb to another
* net_device, it should return RX_HANDLER_ANOTHER. The rx_handler for the
* new device will be called if it exists.
*
* If the rx_handler decides the skb should be ignored, it should return
* RX_HANDLER_EXACT. The skb will only be delivered to protocol handlers that
* are registered on exact device (ptype->dev == skb->dev).
*
* If the rx_handler didn't change skb->dev, but wants the skb to be normally
* delivered, it should return RX_HANDLER_PASS.
*
* A device without a registered rx_handler will behave as if rx_handler
* returned RX_HANDLER_PASS.
*/
enum rx_handler_result {
RX_HANDLER_CONSUMED,
RX_HANDLER_ANOTHER,
RX_HANDLER_EXACT,
RX_HANDLER_PASS,
};
typedef enum rx_handler_result rx_handler_result_t;
typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
void __napi_schedule(struct napi_struct *n);
void __napi_schedule_irqoff(struct napi_struct *n);
static inline bool napi_disable_pending(struct napi_struct *n)
{
return test_bit(NAPI_STATE_DISABLE, &n->state);
}
static inline bool napi_prefer_busy_poll(struct napi_struct *n)
{
return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
}
bool napi_schedule_prep(struct napi_struct *n);
/**
* napi_schedule - schedule NAPI poll
* @n: NAPI context
*
* Schedule NAPI poll routine to be called if it is not already
* running.
*/
static inline void napi_schedule(struct napi_struct *n)
{
if (napi_schedule_prep(n))
__napi_schedule(n);
}
/**
* napi_schedule_irqoff - schedule NAPI poll
* @n: NAPI context
*
* Variant of napi_schedule(), assuming hard irqs are masked.
*/
static inline void napi_schedule_irqoff(struct napi_struct *n)
{
if (napi_schedule_prep(n))
__napi_schedule_irqoff(n);
}
/* Try to reschedule poll. Called by dev->poll() after napi_complete(). */
static inline bool napi_reschedule(struct napi_struct *napi)
{
if (napi_schedule_prep(napi)) {
__napi_schedule(napi);
return true;
}
return false;
}
bool napi_complete_done(struct napi_struct *n, int work_done);
/**
* napi_complete - NAPI processing complete
* @n: NAPI context
*
* Mark NAPI processing as complete.
* Consider using napi_complete_done() instead.
* Return false if device should avoid rearming interrupts.
*/
static inline bool napi_complete(struct napi_struct *n)
{
return napi_complete_done(n, 0);
}
int dev_set_threaded(struct net_device *dev, bool threaded);
/**
* napi_disable - prevent NAPI from scheduling
* @n: NAPI context
*
* Stop NAPI from being scheduled on this context.
* Waits till any outstanding processing completes.
*/
void napi_disable(struct napi_struct *n);
void napi_enable(struct napi_struct *n);
/**
* napi_synchronize - wait until NAPI is not running
* @n: NAPI context
*
* Wait until NAPI is done being scheduled on this context.
* Waits till any outstanding processing completes but
* does not disable future activations.
*/
static inline void napi_synchronize(const struct napi_struct *n)
{
if (IS_ENABLED(CONFIG_SMP))
while (test_bit(NAPI_STATE_SCHED, &n->state))
msleep(1);
else
barrier();
}
/**
* napi_if_scheduled_mark_missed - if napi is running, set the
* NAPIF_STATE_MISSED
* @n: NAPI context
*
* If napi is running, set the NAPIF_STATE_MISSED, and return true if
* NAPI is scheduled.
**/
static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n)
{
unsigned long val, new;
do {
val = READ_ONCE(n->state);
if (val & NAPIF_STATE_DISABLE)
return true;
if (!(val & NAPIF_STATE_SCHED))
return false;
new = val | NAPIF_STATE_MISSED;
} while (cmpxchg(&n->state, val, new) != val);
return true;
}
enum netdev_queue_state_t {
__QUEUE_STATE_DRV_XOFF,
__QUEUE_STATE_STACK_XOFF,
__QUEUE_STATE_FROZEN,
};
#define QUEUE_STATE_DRV_XOFF (1 << __QUEUE_STATE_DRV_XOFF)
#define QUEUE_STATE_STACK_XOFF (1 << __QUEUE_STATE_STACK_XOFF)
#define QUEUE_STATE_FROZEN (1 << __QUEUE_STATE_FROZEN)
#define QUEUE_STATE_ANY_XOFF (QUEUE_STATE_DRV_XOFF | QUEUE_STATE_STACK_XOFF)
#define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \
QUEUE_STATE_FROZEN)
#define QUEUE_STATE_DRV_XOFF_OR_FROZEN (QUEUE_STATE_DRV_XOFF | \
QUEUE_STATE_FROZEN)
/*
* __QUEUE_STATE_DRV_XOFF is used by drivers to stop the transmit queue. The
* netif_tx_* functions below are used to manipulate this flag. The
* __QUEUE_STATE_STACK_XOFF flag is used by the stack to stop the transmit
* queue independently. The netif_xmit_*stopped functions below are called
* to check if the queue has been stopped by the driver or stack (either
* of the XOFF bits are set in the state). Drivers should not need to call
* netif_xmit*stopped functions, they should only be using netif_tx_*.
*/
struct netdev_queue {
/*
* read-mostly part
*/
struct net_device *dev;
struct Qdisc __rcu *qdisc;
struct Qdisc *qdisc_sleeping;
#ifdef CONFIG_SYSFS
struct kobject kobj;
#endif
#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
int numa_node;
#endif
unsigned long tx_maxrate;
/*
* Number of TX timeouts for this queue
* (/sys/class/net/DEV/Q/trans_timeout)
*/
unsigned long trans_timeout;
/* Subordinate device that the queue has been assigned to */
struct net_device *sb_dev;
#ifdef CONFIG_XDP_SOCKETS
struct xsk_buff_pool *pool;
#endif
/*
* write-mostly part
*/
spinlock_t _xmit_lock ____cacheline_aligned_in_smp;
int xmit_lock_owner;
/*
* Time (in jiffies) of last Tx
*/
unsigned long trans_start;
unsigned long state;
#ifdef CONFIG_BQL
struct dql dql;
#endif
} ____cacheline_aligned_in_smp;
extern int sysctl_fb_tunnels_only_for_init_net;
extern int sysctl_devconf_inherit_init_net;
/*
* sysctl_fb_tunnels_only_for_init_net == 0 : For all netns
* == 1 : For initns only
* == 2 : For none.
*/
static inline bool net_has_fallback_tunnels(const struct net *net)
{
return !IS_ENABLED(CONFIG_SYSCTL) ||
!sysctl_fb_tunnels_only_for_init_net ||
(net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1);
}
static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
{
#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
return q->numa_node;
#else
return NUMA_NO_NODE;
#endif
}
static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node)
{
#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
q->numa_node = node;
#endif
}
#ifdef CONFIG_RPS
/*
* This structure holds an RPS map which can be of variable length. The
* map is an array of CPUs.
*/
struct rps_map {
unsigned int len;
struct rcu_head rcu;
u16 cpus[];
};
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))
/*
* The rps_dev_flow structure contains the mapping of a flow to a CPU, the
* tail pointer for that CPU's input queue at the time of last enqueue, and
* a hardware filter index.
*/
struct rps_dev_flow {
u16 cpu;
u16 filter;
unsigned int last_qtail;
};
#define RPS_NO_FILTER 0xffff
/*
* The rps_dev_flow_table structure contains a table of flow mappings.
*/
struct rps_dev_flow_table {
unsigned int mask;
struct rcu_head rcu;
struct rps_dev_flow flows[];
};
#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
((_num) * sizeof(struct rps_dev_flow)))
/*
* The rps_sock_flow_table contains mappings of flows to the last CPU
* on which they were processed by the application (set in recvmsg).
* Each entry is a 32bit value. Upper part is the high-order bits
* of flow hash, lower part is CPU number.
* rps_cpu_mask is used to partition the space, depending on number of
* possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
* For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
* meaning we use 32-6=26 bits for the hash.
*/
struct rps_sock_flow_table {
u32 mask;
u32 ents[] ____cacheline_aligned_in_smp;
};
#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
#define RPS_NO_CPU 0xffff
extern u32 rps_cpu_mask;
extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
u32 hash)
{
if (table && hash) { unsigned int index = hash & table->mask;
u32 val = hash & ~rps_cpu_mask;
/* We only give a hint, preemption can change CPU under us */
val |= raw_smp_processor_id();
if (table->ents[index] != val)
table->ents[index] = val;
}
}
#ifdef CONFIG_RFS_ACCEL
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
u16 filter_id);
#endif
#endif /* CONFIG_RPS */
/* This structure contains an instance of an RX queue. */
struct netdev_rx_queue {
struct xdp_rxq_info xdp_rxq;
#ifdef CONFIG_RPS
struct rps_map __rcu *rps_map;
struct rps_dev_flow_table __rcu *rps_flow_table;
#endif
struct kobject kobj;
struct net_device *dev;
#ifdef CONFIG_XDP_SOCKETS
struct xsk_buff_pool *pool;
#endif
} ____cacheline_aligned_in_smp;
/*
* RX queue sysfs structures and functions.
*/
struct rx_queue_attribute {
struct attribute attr;
ssize_t (*show)(struct netdev_rx_queue *queue, char *buf);
ssize_t (*store)(struct netdev_rx_queue *queue,
const char *buf, size_t len);
};
/* XPS map type and offset of the xps map within net_device->xps_maps[]. */
enum xps_map_type {
XPS_CPUS = 0,
XPS_RXQS,
XPS_MAPS_MAX,
};
#ifdef CONFIG_XPS
/*
* This structure holds an XPS map which can be of variable length. The
* map is an array of queues.
*/
struct xps_map {
unsigned int len;
unsigned int alloc_len;
struct rcu_head rcu;
u16 queues[];
};
#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + ((_num) * sizeof(u16)))
#define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \
- sizeof(struct xps_map)) / sizeof(u16))
/*
* This structure holds all XPS maps for device. Maps are indexed by CPU.
*
* We keep track of the number of cpus/rxqs used when the struct is allocated,
* in nr_ids. This will help not accessing out-of-bound memory.
*
* We keep track of the number of traffic classes used when the struct is
* allocated, in num_tc. This will be used to navigate the maps, to ensure we're
* not crossing its upper bound, as the original dev->num_tc can be updated in
* the meantime.
*/
struct xps_dev_maps {
struct rcu_head rcu;
unsigned int nr_ids;
s16 num_tc;
struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */
};
#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
(_rxqs * (_tcs) * sizeof(struct xps_map *)))
#endif /* CONFIG_XPS */
#define TC_MAX_QUEUE 16
#define TC_BITMASK 15
/* HW offloaded queuing disciplines txq count and offset maps */
struct netdev_tc_txq {
u16 count;
u16 offset;
};
#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
/*
* This structure is to hold information about the device
* configured to run FCoE protocol stack.
*/
struct netdev_fcoe_hbainfo {
char manufacturer[64];
char serial_number[64];
char hardware_version[64];
char driver_version[64];
char optionrom_version[64];
char firmware_version[64];
char model[256];
char model_description[256];
};
#endif
#define MAX_PHYS_ITEM_ID_LEN 32
/* This structure holds a unique identifier to identify some
* physical item (port for example) used by a netdevice.
*/
struct netdev_phys_item_id {
unsigned char id[MAX_PHYS_ITEM_ID_LEN];
unsigned char id_len;
};
static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a,
struct netdev_phys_item_id *b)
{
return a->id_len == b->id_len &&
memcmp(a->id, b->id, a->id_len) == 0;
}
typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
struct sk_buff *skb,
struct net_device *sb_dev);
enum net_device_path_type {
DEV_PATH_ETHERNET = 0,
DEV_PATH_VLAN,
DEV_PATH_BRIDGE,
DEV_PATH_PPPOE,
DEV_PATH_DSA,
};
struct net_device_path {
enum net_device_path_type type;
const struct net_device *dev;
union {
struct {
u16 id;
__be16 proto;
u8 h_dest[ETH_ALEN];
} encap;
struct {
enum {
DEV_PATH_BR_VLAN_KEEP,
DEV_PATH_BR_VLAN_TAG,
DEV_PATH_BR_VLAN_UNTAG,
DEV_PATH_BR_VLAN_UNTAG_HW,
} vlan_mode;
u16 vlan_id;
__be16 vlan_proto;
} bridge;
struct {
int port;
u16 proto;
} dsa;
};
};
#define NET_DEVICE_PATH_STACK_MAX 5
#define NET_DEVICE_PATH_VLAN_MAX 2
struct net_device_path_stack {
int num_paths;
struct net_device_path path[NET_DEVICE_PATH_STACK_MAX];
};
struct net_device_path_ctx {
const struct net_device *dev;
const u8 *daddr;
int num_vlans;
struct {
u16 id;
__be16 proto;
} vlan[NET_DEVICE_PATH_VLAN_MAX];
};
enum tc_setup_type {
TC_SETUP_QDISC_MQPRIO,
TC_SETUP_CLSU32,
TC_SETUP_CLSFLOWER,
TC_SETUP_CLSMATCHALL,
TC_SETUP_CLSBPF,
TC_SETUP_BLOCK,
TC_SETUP_QDISC_CBS,
TC_SETUP_QDISC_RED,
TC_SETUP_QDISC_PRIO,
TC_SETUP_QDISC_MQ,
TC_SETUP_QDISC_ETF,
TC_SETUP_ROOT_QDISC,
TC_SETUP_QDISC_GRED,
TC_SETUP_QDISC_TAPRIO,
TC_SETUP_FT,
TC_SETUP_QDISC_ETS,
TC_SETUP_QDISC_TBF,
TC_SETUP_QDISC_FIFO,
TC_SETUP_QDISC_HTB,
};
/* These structures hold the attributes of bpf state that are being passed
* to the netdevice through the bpf op.
*/
enum bpf_netdev_command {
/* Set or clear a bpf program used in the earliest stages of packet
* rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee
* is responsible for calling bpf_prog_put on any old progs that are
* stored. In case of error, the callee need not release the new prog
* reference, but on success it takes ownership and must bpf_prog_put
* when it is no longer used.
*/
XDP_SETUP_PROG,
XDP_SETUP_PROG_HW,
/* BPF program for offload callbacks, invoked at program load time. */
BPF_OFFLOAD_MAP_ALLOC,
BPF_OFFLOAD_MAP_FREE,
XDP_SETUP_XSK_POOL,
};
struct bpf_prog_offload_ops;
struct netlink_ext_ack;
struct xdp_umem;
struct xdp_dev_bulk_queue;
struct bpf_xdp_link;
enum bpf_xdp_mode {
XDP_MODE_SKB = 0,
XDP_MODE_DRV = 1,
XDP_MODE_HW = 2,
__MAX_XDP_MODE
};
struct bpf_xdp_entity {
struct bpf_prog *prog;
struct bpf_xdp_link *link;
};
struct netdev_bpf {
enum bpf_netdev_command command;
union {
/* XDP_SETUP_PROG */
struct {
u32 flags;
struct bpf_prog *prog;
struct netlink_ext_ack *extack;
};
/* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
struct {
struct bpf_offloaded_map *offmap;
};
/* XDP_SETUP_XSK_POOL */
struct {
struct xsk_buff_pool *pool;
u16 queue_id;
} xsk;
};
};
/* Flags for ndo_xsk_wakeup. */
#define XDP_WAKEUP_RX (1 << 0)
#define XDP_WAKEUP_TX (1 << 1)
#ifdef CONFIG_XFRM_OFFLOAD
struct xfrmdev_ops {
int (*xdo_dev_state_add) (struct xfrm_state *x);
void (*xdo_dev_state_delete) (struct xfrm_state *x);
void (*xdo_dev_state_free) (struct xfrm_state *x);
bool (*xdo_dev_offload_ok) (struct sk_buff *skb,
struct xfrm_state *x);
void (*xdo_dev_state_advance_esn) (struct xfrm_state *x);
};
#endif
struct dev_ifalias {
struct rcu_head rcuhead;
char ifalias[];
};
struct devlink;
struct tlsdev_ops;
struct netdev_name_node {
struct hlist_node hlist;
struct list_head list;
struct net_device *dev;
const char *name;
};
int netdev_name_node_alt_create(struct net_device *dev, const char *name);
int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);
struct netdev_net_notifier {
struct list_head list;
struct notifier_block *nb;
};
/*
* This structure defines the management hooks for network devices.
* The following hooks can be defined; unless noted otherwise, they are
* optional and can be filled with a null pointer.
*
* int (*ndo_init)(struct net_device *dev);
* This function is called once when a network device is registered.
* The network device can use this for any late stage initialization
* or semantic validation. It can fail with an error code which will
* be propagated back to register_netdev.
*
* void (*ndo_uninit)(struct net_device *dev);
* This function is called when device is unregistered or when registration
* fails. It is not called if init fails.
*
* int (*ndo_open)(struct net_device *dev);
* This function is called when a network device transitions to the up
* state.
*
* int (*ndo_stop)(struct net_device *dev);
* This function is called when a network device transitions to the down
* state.
*
* netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
* struct net_device *dev);
* Called when a packet needs to be transmitted.
* Returns NETDEV_TX_OK. Can return NETDEV_TX_BUSY, but you should stop
* the queue before that can happen; it's for obsolete devices and weird
* corner cases, but the stack really does a non-trivial amount
* of useless work if you return NETDEV_TX_BUSY.
* Required; cannot be NULL.
*
* netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
* struct net_device *dev
* netdev_features_t features);
* Called by core transmit path to determine if device is capable of
* performing offload operations on a given packet. This is to give
* the device an opportunity to implement any restrictions that cannot
* be otherwise expressed by feature flags. The check is called with
* the set of features that the stack has calculated and it returns
* those the driver believes to be appropriate.
*
* u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
* struct net_device *sb_dev);
* Called to decide which queue to use when device supports multiple
* transmit queues.
*
* void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
* This function is called to allow device receiver to make
* changes to configuration when multicast or promiscuous is enabled.
*
* void (*ndo_set_rx_mode)(struct net_device *dev);
* This function is called device changes address list filtering.
* If driver handles unicast address filtering, it should set
* IFF_UNICAST_FLT in its priv_flags.
*
* int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
* This function is called when the Media Access Control address
* needs to be changed. If this interface is not defined, the
* MAC address can not be changed.
*
* int (*ndo_validate_addr)(struct net_device *dev);
* Test if Media Access Control address is valid for the device.
*
* int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
* Old-style ioctl entry point. This is used internally by the
* appletalk and ieee802154 subsystems but is no longer called by
* the device ioctl handler.
*
* int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd);
* Used by the bonding driver for its device specific ioctls:
* SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR, SIOCBONDCHANGEACTIVE,
* SIOCBONDSLAVEINFOQUERY, and SIOCBONDINFOQUERY
*
* * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
* Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG,
* SIOCSMIIREG, SIOCSHWTSTAMP and SIOCGHWTSTAMP.
*
* int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
* Used to set network devices bus interface parameters. This interface
* is retained for legacy reasons; new devices should use the bus
* interface (PCI) for low level management.
*
* int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
* Called when a user wants to change the Maximum Transfer Unit
* of a device.
*
* void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue);
* Callback used when the transmitter has not made any progress
* for dev->watchdog ticks.
*
* void (*ndo_get_stats64)(struct net_device *dev,
* struct rtnl_link_stats64 *storage);
* struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
* Called when a user wants to get the network device usage
* statistics. Drivers must do one of the following:
* 1. Define @ndo_get_stats64 to fill in a zero-initialised
* rtnl_link_stats64 structure passed by the caller.
* 2. Define @ndo_get_stats to update a net_device_stats structure
* (which should normally be dev->stats) and return a pointer to
* it. The structure may be changed asynchronously only if each
* field is written atomically.
* 3. Update dev->stats asynchronously and atomically, and define
* neither operation.
*
* bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id)
* Return true if this device supports offload stats of this attr_id.
*
* int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev,
* void *attr_data)
* Get statistics for offload operations by attr_id. Write it into the
* attr_data pointer.
*
* int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid);
* If device supports VLAN filtering this function is called when a
* VLAN id is registered.
*
* int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid);
* If device supports VLAN filtering this function is called when a
* VLAN id is unregistered.
*
* void (*ndo_poll_controller)(struct net_device *dev);
*
* SR-IOV management functions.
* int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
* int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan,
* u8 qos, __be16 proto);
* int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate,
* int max_tx_rate);
* int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting);
* int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting);
* int (*ndo_get_vf_config)(struct net_device *dev,
* int vf, struct ifla_vf_info *ivf);
* int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state);
* int (*ndo_set_vf_port)(struct net_device *dev, int vf,
* struct nlattr *port[]);
*
* Enable or disable the VF ability to query its RSS Redirection Table and
* Hash Key. This is needed since on some devices VF share this information
* with PF and querying it may introduce a theoretical security risk.
* int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
* int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
* int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type,
* void *type_data);
* Called to setup any 'tc' scheduler, classifier or action on @dev.
* This is always called from the stack with the rtnl lock held and netif
* tx queues stopped. This allows the netdevice to perform queue
* management safely.
*
* Fiber Channel over Ethernet (FCoE) offload functions.
* int (*ndo_fcoe_enable)(struct net_device *dev);
* Called when the FCoE protocol stack wants to start using LLD for FCoE
* so the underlying device can perform whatever needed configuration or
* initialization to support acceleration of FCoE traffic.
*
* int (*ndo_fcoe_disable)(struct net_device *dev);
* Called when the FCoE protocol stack wants to stop using LLD for FCoE
* so the underlying device can perform whatever needed clean-ups to
* stop supporting acceleration of FCoE traffic.
*
* int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid,
* struct scatterlist *sgl, unsigned int sgc);
* Called when the FCoE Initiator wants to initialize an I/O that
* is a possible candidate for Direct Data Placement (DDP). The LLD can
* perform necessary setup and returns 1 to indicate the device is set up
* successfully to perform DDP on this I/O, otherwise this returns 0.
*
* int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid);
* Called when the FCoE Initiator/Target is done with the DDPed I/O as
* indicated by the FC exchange id 'xid', so the underlying device can
* clean up and reuse resources for later DDP requests.
*
* int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid,
* struct scatterlist *sgl, unsigned int sgc);
* Called when the FCoE Target wants to initialize an I/O that
* is a possible candidate for Direct Data Placement (DDP). The LLD can
* perform necessary setup and returns 1 to indicate the device is set up
* successfully to perform DDP on this I/O, otherwise this returns 0.
*
* int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
* struct netdev_fcoe_hbainfo *hbainfo);
* Called when the FCoE Protocol stack wants information on the underlying
* device. This information is utilized by the FCoE protocol stack to
* register attributes with Fiber Channel management service as per the
* FC-GS Fabric Device Management Information(FDMI) specification.
*
* int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type);
* Called when the underlying device wants to override default World Wide
* Name (WWN) generation mechanism in FCoE protocol stack to pass its own
* World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE
* protocol stack to use.
*
* RFS acceleration.
* int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
* u16 rxq_index, u32 flow_id);
* Set hardware filter for RFS. rxq_index is the target queue index;
* flow_id is a flow ID to be passed to rps_may_expire_flow() later.
* Return the filter ID on success, or a negative error code.
*
* Slave management functions (for bridge, bonding, etc).
* int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev);
* Called to make another netdev an underling.
*
* int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev);
* Called to release previously enslaved netdev.
*
* struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev,
* struct sk_buff *skb,
* bool all_slaves);
* Get the xmit slave of master device. If all_slaves is true, function
* assume all the slaves can transmit.
*
* Feature/offload setting functions.
* netdev_features_t (*ndo_fix_features)(struct net_device *dev,
* netdev_features_t features);
* Adjusts the requested feature flags according to device-specific
* constraints, and returns the resulting flags. Must not modify
* the device state.
*
* int (*ndo_set_features)(struct net_device *dev, netdev_features_t features);
* Called to update device configuration to new features. Passed
* feature set might be less than what was returned by ndo_fix_features()).
* Must return >0 or -errno if it changed dev->features itself.
*
* int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
* struct net_device *dev,
* const unsigned char *addr, u16 vid, u16 flags,
* struct netlink_ext_ack *extack);
* Adds an FDB entry to dev for addr.
* int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
* struct net_device *dev,
* const unsigned char *addr, u16 vid)
* Deletes the FDB entry from dev coresponding to addr.
* int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
* struct net_device *dev, struct net_device *filter_dev,
* int *idx)
* Used to add FDB entries to dump requests. Implementers should add
* entries to skb and update idx with the number of entries.
*
* int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh,
* u16 flags, struct netlink_ext_ack *extack)
* int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
* struct net_device *dev, u32 filter_mask,
* int nlflags)
* int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
* u16 flags);
*
* int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
* Called to change device carrier. Soft-devices (like dummy, team, etc)
* which do not represent real hardware may define this to allow their
* userspace components to manage their virtual carrier state. Devices
* that determine carrier state from physical hardware properties (eg
* network cables) or protocol-dependent mechanisms (eg
* USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
*
* int (*ndo_get_phys_port_id)(struct net_device *dev,
* struct netdev_phys_item_id *ppid);
* Called to get ID of physical port of this device. If driver does
* not implement this, it is assumed that the hw is not able to have
* multiple net devices on single physical port.
*
* int (*ndo_get_port_parent_id)(struct net_device *dev,
* struct netdev_phys_item_id *ppid)
* Called to get the parent ID of the physical port of this device.
*
* void* (*ndo_dfwd_add_station)(struct net_device *pdev,
* struct net_device *dev)
* Called by upper layer devices to accelerate switching or other
* station functionality into hardware. 'pdev is the lowerdev
* to use for the offload and 'dev' is the net device that will
* back the offload. Returns a pointer to the private structure
* the upper layer will maintain.
* void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv)
* Called by upper layer device to delete the station created
* by 'ndo_dfwd_add_station'. 'pdev' is the net device backing
* the station and priv is the structure returned by the add
* operation.
* int (*ndo_set_tx_maxrate)(struct net_device *dev,
* int queue_index, u32 maxrate);
* Called when a user wants to set a max-rate limitation of specific
* TX queue.
* int (*ndo_get_iflink)(const struct net_device *dev);
* Called to get the iflink value of this device.
* void (*ndo_change_proto_down)(struct net_device *dev,
* bool proto_down);
* This function is used to pass protocol port error state information
* to the switch driver. The switch driver can react to the proto_down
* by doing a phys down on the associated switch port.
* int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb);
* This function is used to get egress tunnel information for given skb.
* This is useful for retrieving outer tunnel header parameters while
* sampling packet.
* void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom);
* This function is used to specify the headroom that the skb must
* consider when allocation skb during packet reception. Setting
* appropriate rx headroom value allows avoiding skb head copy on
* forward. Setting a negative value resets the rx headroom to the
* default value.
* int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf);
* This function is used to set or query state related to XDP on the
* netdevice and manage BPF offload. See definition of
* enum bpf_netdev_command for details.
* int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp,
* u32 flags);
* This function is used to submit @n XDP packets for transmit on a
* netdevice. Returns number of frames successfully transmitted, frames
* that got dropped are freed/returned via xdp_return_frame().
* Returns negative number, means general error invoking ndo, meaning
* no frames were xmit'ed and core-caller will free all frames.
* struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
* struct xdp_buff *xdp);
* Get the xmit slave of master device based on the xdp_buff.
* int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
* This function is used to wake up the softirq, ksoftirqd or kthread
* responsible for sending and/or receiving packets on a specific
* queue id bound to an AF_XDP socket. The flags field specifies if
* only RX, only Tx, or both should be woken up using the flags
* XDP_WAKEUP_RX and XDP_WAKEUP_TX.
* struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev);
* Get devlink port instance associated with a given netdev.
* Called with a reference on the netdevice and devlink locks only,
* rtnl_lock is not held.
* int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p,
* int cmd);
* Add, change, delete or get information on an IPv4 tunnel.
* struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
* If a device is paired with a peer device, return the peer instance.
* The caller must be under RCU read context.
* int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path);
* Get the forwarding path to reach the real device from the HW destination address
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
void (*ndo_uninit)(struct net_device *dev);
int (*ndo_open)(struct net_device *dev);
int (*ndo_stop)(struct net_device *dev);
netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
struct net_device *dev);
netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features);
u16 (*ndo_select_queue)(struct net_device *dev,
struct sk_buff *skb,
struct net_device *sb_dev);
void (*ndo_change_rx_flags)(struct net_device *dev,
int flags);
void (*ndo_set_rx_mode)(struct net_device *dev);
int (*ndo_set_mac_address)(struct net_device *dev,
void *addr);
int (*ndo_validate_addr)(struct net_device *dev);
int (*ndo_do_ioctl)(struct net_device *dev,
struct ifreq *ifr, int cmd);
int (*ndo_eth_ioctl)(struct net_device *dev,
struct ifreq *ifr, int cmd);
int (*ndo_siocbond)(struct net_device *dev,
struct ifreq *ifr, int cmd);
int (*ndo_siocwandev)(struct net_device *dev,
struct if_settings *ifs);
int (*ndo_siocdevprivate)(struct net_device *dev,
struct ifreq *ifr,
void __user *data, int cmd);
int (*ndo_set_config)(struct net_device *dev,
struct ifmap *map);
int (*ndo_change_mtu)(struct net_device *dev,
int new_mtu);
int (*ndo_neigh_setup)(struct net_device *dev,
struct neigh_parms *);
void (*ndo_tx_timeout) (struct net_device *dev,
unsigned int txqueue);
void (*ndo_get_stats64)(struct net_device *dev,
struct rtnl_link_stats64 *storage);
bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id);
int (*ndo_get_offload_stats)(int attr_id,
const struct net_device *dev,
void *attr_data);
struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
int (*ndo_vlan_rx_add_vid)(struct net_device *dev,
__be16 proto, u16 vid);
int (*ndo_vlan_rx_kill_vid)(struct net_device *dev,
__be16 proto, u16 vid);
#ifdef CONFIG_NET_POLL_CONTROLLER
void (*ndo_poll_controller)(struct net_device *dev);
int (*ndo_netpoll_setup)(struct net_device *dev,
struct netpoll_info *info);
void (*ndo_netpoll_cleanup)(struct net_device *dev);
#endif
int (*ndo_set_vf_mac)(struct net_device *dev,
int queue, u8 *mac);
int (*ndo_set_vf_vlan)(struct net_device *dev,
int queue, u16 vlan,
u8 qos, __be16 proto);
int (*ndo_set_vf_rate)(struct net_device *dev,
int vf, int min_tx_rate,
int max_tx_rate);
int (*ndo_set_vf_spoofchk)(struct net_device *dev,
int vf, bool setting);
int (*ndo_set_vf_trust)(struct net_device *dev,
int vf, bool setting);
int (*ndo_get_vf_config)(struct net_device *dev,
int vf,
struct ifla_vf_info *ivf);
int (*ndo_set_vf_link_state)(struct net_device *dev,
int vf, int link_state);
int (*ndo_get_vf_stats)(struct net_device *dev,
int vf,
struct ifla_vf_stats
*vf_stats);
int (*ndo_set_vf_port)(struct net_device *dev,
int vf,
struct nlattr *port[]);
int (*ndo_get_vf_port)(struct net_device *dev,
int vf, struct sk_buff *skb);
int (*ndo_get_vf_guid)(struct net_device *dev,
int vf,
struct ifla_vf_guid *node_guid,
struct ifla_vf_guid *port_guid);
int (*ndo_set_vf_guid)(struct net_device *dev,
int vf, u64 guid,
int guid_type);
int (*ndo_set_vf_rss_query_en)(
struct net_device *dev,
int vf, bool setting);
int (*ndo_setup_tc)(struct net_device *dev,
enum tc_setup_type type,
void *type_data);
#if IS_ENABLED(CONFIG_FCOE)
int (*ndo_fcoe_enable)(struct net_device *dev);
int (*ndo_fcoe_disable)(struct net_device *dev);
int (*ndo_fcoe_ddp_setup)(struct net_device *dev,
u16 xid,
struct scatterlist *sgl,
unsigned int sgc);
int (*ndo_fcoe_ddp_done)(struct net_device *dev,
u16 xid);
int (*ndo_fcoe_ddp_target)(struct net_device *dev,
u16 xid,
struct scatterlist *sgl,
unsigned int sgc);
int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
struct netdev_fcoe_hbainfo *hbainfo);
#endif
#if IS_ENABLED(CONFIG_LIBFCOE)
#define NETDEV_FCOE_WWNN 0
#define NETDEV_FCOE_WWPN 1
int (*ndo_fcoe_get_wwn)(struct net_device *dev,
u64 *wwn, int type);
#endif
#ifdef CONFIG_RFS_ACCEL
int (*ndo_rx_flow_steer)(struct net_device *dev,
const struct sk_buff *skb,
u16 rxq_index,
u32 flow_id);
#endif
int (*ndo_add_slave)(struct net_device *dev,
struct net_device *slave_dev,
struct netlink_ext_ack *extack);
int (*ndo_del_slave)(struct net_device *dev,
struct net_device *slave_dev);
struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev,
struct sk_buff *skb,
bool all_slaves);
struct net_device* (*ndo_sk_get_lower_dev)(struct net_device *dev,
struct sock *sk);
netdev_features_t (*ndo_fix_features)(struct net_device *dev,
netdev_features_t features);
int (*ndo_set_features)(struct net_device *dev,
netdev_features_t features);
int (*ndo_neigh_construct)(struct net_device *dev,
struct neighbour *n);
void (*ndo_neigh_destroy)(struct net_device *dev,
struct neighbour *n);
int (*ndo_fdb_add)(struct ndmsg *ndm,
struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr,
u16 vid,
u16 flags,
struct netlink_ext_ack *extack);
int (*ndo_fdb_del)(struct ndmsg *ndm,
struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr,
u16 vid);
int (*ndo_fdb_dump)(struct sk_buff *skb,
struct netlink_callback *cb,
struct net_device *dev,
struct net_device *filter_dev,
int *idx);
int (*ndo_fdb_get)(struct sk_buff *skb,
struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr,
u16 vid, u32 portid, u32 seq,
struct netlink_ext_ack *extack);
int (*ndo_bridge_setlink)(struct net_device *dev,
struct nlmsghdr *nlh,
u16 flags,
struct netlink_ext_ack *extack);
int (*ndo_bridge_getlink)(struct sk_buff *skb,
u32 pid, u32 seq,
struct net_device *dev,
u32 filter_mask,
int nlflags);
int (*ndo_bridge_dellink)(struct net_device *dev,
struct nlmsghdr *nlh,
u16 flags);
int (*ndo_change_carrier)(struct net_device *dev,
bool new_carrier);
int (*ndo_get_phys_port_id)(struct net_device *dev,
struct netdev_phys_item_id *ppid);
int (*ndo_get_port_parent_id)(struct net_device *dev,
struct netdev_phys_item_id *ppid);
int (*ndo_get_phys_port_name)(struct net_device *dev,
char *name, size_t len);
void* (*ndo_dfwd_add_station)(struct net_device *pdev,
struct net_device *dev);
void (*ndo_dfwd_del_station)(struct net_device *pdev,
void *priv);
int (*ndo_set_tx_maxrate)(struct net_device *dev,
int queue_index,
u32 maxrate);
int (*ndo_get_iflink)(const struct net_device *dev);
int (*ndo_change_proto_down)(struct net_device *dev,
bool proto_down);
int (*ndo_fill_metadata_dst)(struct net_device *dev,
struct sk_buff *skb);
void (*ndo_set_rx_headroom)(struct net_device *dev,
int needed_headroom);
int (*ndo_bpf)(struct net_device *dev,
struct netdev_bpf *bpf);
int (*ndo_xdp_xmit)(struct net_device *dev, int n,
struct xdp_frame **xdp,
u32 flags);
struct net_device * (*ndo_xdp_get_xmit_slave)(struct net_device *dev,
struct xdp_buff *xdp);
int (*ndo_xsk_wakeup)(struct net_device *dev,
u32 queue_id, u32 flags);
struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev);
int (*ndo_tunnel_ctl)(struct net_device *dev,
struct ip_tunnel_parm *p, int cmd);
struct net_device * (*ndo_get_peer_dev)(struct net_device *dev);
int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx,
struct net_device_path *path);
};
/**
* enum netdev_priv_flags - &struct net_device priv_flags
*
* These are the &struct net_device, they are only set internally
* by drivers and used in the kernel. These flags are invisible to
* userspace; this means that the order of these flags can change
* during any kernel release.
*
* You should have a pretty good reason to be extending these flags.
*
* @IFF_802_1Q_VLAN: 802.1Q VLAN device
* @IFF_EBRIDGE: Ethernet bridging device
* @IFF_BONDING: bonding master or slave
* @IFF_ISATAP: ISATAP interface (RFC4214)
* @IFF_WAN_HDLC: WAN HDLC device
* @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to
* release skb->dst
* @IFF_DONT_BRIDGE: disallow bridging this ether dev
* @IFF_DISABLE_NETPOLL: disable netpoll at run-time
* @IFF_MACVLAN_PORT: device used as macvlan port
* @IFF_BRIDGE_PORT: device used as bridge port
* @IFF_OVS_DATAPATH: device used as Open vSwitch datapath port
* @IFF_TX_SKB_SHARING: The interface supports sharing skbs on transmit
* @IFF_UNICAST_FLT: Supports unicast filtering
* @IFF_TEAM_PORT: device used as team port
* @IFF_SUPP_NOFCS: device supports sending custom FCS
* @IFF_LIVE_ADDR_CHANGE: device supports hardware address
* change when it's running
* @IFF_MACVLAN: Macvlan device
* @IFF_XMIT_DST_RELEASE_PERM: IFF_XMIT_DST_RELEASE not taking into account
* underlying stacked devices
* @IFF_L3MDEV_MASTER: device is an L3 master device
* @IFF_NO_QUEUE: device can run without qdisc attached
* @IFF_OPENVSWITCH: device is a Open vSwitch master
* @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
* @IFF_TEAM: device is a team device
* @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
* @IFF_PHONY_HEADROOM: the headroom value is controlled by an external
* entity (i.e. the master device for bridged veth)
* @IFF_MACSEC: device is a MACsec device
* @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
* @IFF_FAILOVER: device is a failover master device
* @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
* @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
* @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
* @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
* skb_headlen(skb) == 0 (data starts from frag0)
*/
enum netdev_priv_flags {
IFF_802_1Q_VLAN = 1<<0,
IFF_EBRIDGE = 1<<1,
IFF_BONDING = 1<<2,
IFF_ISATAP = 1<<3,
IFF_WAN_HDLC = 1<<4,
IFF_XMIT_DST_RELEASE = 1<<5,
IFF_DONT_BRIDGE = 1<<6,
IFF_DISABLE_NETPOLL = 1<<7,
IFF_MACVLAN_PORT = 1<<8,
IFF_BRIDGE_PORT = 1<<9,
IFF_OVS_DATAPATH = 1<<10,
IFF_TX_SKB_SHARING = 1<<11,
IFF_UNICAST_FLT = 1<<12,
IFF_TEAM_PORT = 1<<13,
IFF_SUPP_NOFCS = 1<<14,
IFF_LIVE_ADDR_CHANGE = 1<<15,
IFF_MACVLAN = 1<<16,
IFF_XMIT_DST_RELEASE_PERM = 1<<17,
IFF_L3MDEV_MASTER = 1<<18,
IFF_NO_QUEUE = 1<<19,
IFF_OPENVSWITCH = 1<<20,
IFF_L3MDEV_SLAVE = 1<<21,
IFF_TEAM = 1<<22,
IFF_RXFH_CONFIGURED = 1<<23,
IFF_PHONY_HEADROOM = 1<<24,
IFF_MACSEC = 1<<25,
IFF_NO_RX_HANDLER = 1<<26,
IFF_FAILOVER = 1<<27,
IFF_FAILOVER_SLAVE = 1<<28,
IFF_L3MDEV_RX_HANDLER = 1<<29,
IFF_LIVE_RENAME_OK = 1<<30,
IFF_TX_SKB_NO_LINEAR = 1<<31,
};
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
#define IFF_EBRIDGE IFF_EBRIDGE
#define IFF_BONDING IFF_BONDING
#define IFF_ISATAP IFF_ISATAP
#define IFF_WAN_HDLC IFF_WAN_HDLC
#define IFF_XMIT_DST_RELEASE IFF_XMIT_DST_RELEASE
#define IFF_DONT_BRIDGE IFF_DONT_BRIDGE
#define IFF_DISABLE_NETPOLL IFF_DISABLE_NETPOLL
#define IFF_MACVLAN_PORT IFF_MACVLAN_PORT
#define IFF_BRIDGE_PORT IFF_BRIDGE_PORT
#define IFF_OVS_DATAPATH IFF_OVS_DATAPATH
#define IFF_TX_SKB_SHARING IFF_TX_SKB_SHARING
#define IFF_UNICAST_FLT IFF_UNICAST_FLT
#define IFF_TEAM_PORT IFF_TEAM_PORT
#define IFF_SUPP_NOFCS IFF_SUPP_NOFCS
#define IFF_LIVE_ADDR_CHANGE IFF_LIVE_ADDR_CHANGE
#define IFF_MACVLAN IFF_MACVLAN
#define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM
#define IFF_L3MDEV_MASTER IFF_L3MDEV_MASTER
#define IFF_NO_QUEUE IFF_NO_QUEUE
#define IFF_OPENVSWITCH IFF_OPENVSWITCH
#define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE
#define IFF_TEAM IFF_TEAM
#define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
#define IFF_PHONY_HEADROOM IFF_PHONY_HEADROOM
#define IFF_MACSEC IFF_MACSEC
#define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
#define IFF_FAILOVER IFF_FAILOVER
#define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE
#define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER
#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK
#define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR
/* Specifies the type of the struct net_device::ml_priv pointer */
enum netdev_ml_priv_type {
ML_PRIV_NONE,
ML_PRIV_CAN,
};
/**
* struct net_device - The DEVICE structure.
*
* Actually, this whole structure is a big mistake. It mixes I/O
* data with strictly "high-level" data, and it has to know about
* almost every data structure used in the INET module.
*
* @name: This is the first field of the "visible" part of this structure
* (i.e. as seen by users in the "Space.c" file). It is the name
* of the interface.
*
* @name_node: Name hashlist node
* @ifalias: SNMP alias
* @mem_end: Shared memory end
* @mem_start: Shared memory start
* @base_addr: Device I/O address
* @irq: Device IRQ number
*
* @state: Generic network queuing layer state, see netdev_state_t
* @dev_list: The global list of network devices
* @napi_list: List entry used for polling NAPI devices
* @unreg_list: List entry when we are unregistering the
* device; see the function unregister_netdev
* @close_list: List entry used when we are closing the device
* @ptype_all: Device-specific packet handlers for all protocols
* @ptype_specific: Device-specific, protocol-specific packet handlers
*
* @adj_list: Directly linked devices, like slaves for bonding
* @features: Currently active device features
* @hw_features: User-changeable features
*
* @wanted_features: User-requested features
* @vlan_features: Mask of features inheritable by VLAN devices
*
* @hw_enc_features: Mask of features inherited by encapsulating devices
* This field indicates what encapsulation
* offloads the hardware is capable of doing,
* and drivers will need to set them appropriately.
*
* @mpls_features: Mask of features inheritable by MPLS
* @gso_partial_features: value(s) from NETIF_F_GSO\*
*
* @ifindex: interface index
* @group: The group the device belongs to
*
* @stats: Statistics struct, which was left as a legacy, use
* rtnl_link_stats64 instead
*
* @rx_dropped: Dropped packets by core network,
* do not use this in drivers
* @tx_dropped: Dropped packets by core network,
* do not use this in drivers
* @rx_nohandler: nohandler dropped packets by core network on
* inactive devices, do not use this in drivers
* @carrier_up_count: Number of times the carrier has been up
* @carrier_down_count: Number of times the carrier has been down
*
* @wireless_handlers: List of functions to handle Wireless Extensions,
* instead of ioctl,
* see <net/iw_handler.h> for details.
* @wireless_data: Instance data managed by the core of wireless extensions
*
* @netdev_ops: Includes several pointers to callbacks,
* if one wants to override the ndo_*() functions
* @ethtool_ops: Management operations
* @l3mdev_ops: Layer 3 master device operations
* @ndisc_ops: Includes callbacks for different IPv6 neighbour
* discovery handling. Necessary for e.g. 6LoWPAN.
* @xfrmdev_ops: Transformation offload operations
* @tlsdev_ops: Transport Layer Security offload operations
* @header_ops: Includes callbacks for creating,parsing,caching,etc
* of Layer 2 headers.
*
* @flags: Interface flags (a la BSD)
* @priv_flags: Like 'flags' but invisible to userspace,
* see if.h for the definitions
* @gflags: Global flags ( kept as legacy )
* @padded: How much padding added by alloc_netdev()
* @operstate: RFC2863 operstate
* @link_mode: Mapping policy to operstate
* @if_port: Selectable AUI, TP, ...
* @dma: DMA channel
* @mtu: Interface MTU value
* @min_mtu: Interface Minimum MTU value
* @max_mtu: Interface Maximum MTU value
* @type: Interface hardware type
* @hard_header_len: Maximum hardware header length.
* @min_header_len: Minimum hardware header length
*
* @needed_headroom: Extra headroom the hardware may need, but not in all
* cases can this be guaranteed
* @needed_tailroom: Extra tailroom the hardware may need, but not in all
* cases can this be guaranteed. Some cases also use
* LL_MAX_HEADER instead to allocate the skb
*
* interface address info:
*
* @perm_addr: Permanent hw address
* @addr_assign_type: Hw address assignment type
* @addr_len: Hardware address length
* @upper_level: Maximum depth level of upper devices.
* @lower_level: Maximum depth level of lower devices.
* @neigh_priv_len: Used in neigh_alloc()
* @dev_id: Used to differentiate devices that share
* the same link layer address
* @dev_port: Used to differentiate devices that share
* the same function
* @addr_list_lock: XXX: need comments on this one
* @name_assign_type: network interface name assignment type
* @uc_promisc: Counter that indicates promiscuous mode
* has been enabled due to the need to listen to
* additional unicast addresses in a device that
* does not implement ndo_set_rx_mode()
* @uc: unicast mac addresses
* @mc: multicast mac addresses
* @dev_addrs: list of device hw addresses
* @queues_kset: Group of all Kobjects in the Tx and RX queues
* @promiscuity: Number of times the NIC is told to work in
* promiscuous mode; if it becomes 0 the NIC will
* exit promiscuous mode
* @allmulti: Counter, enables or disables allmulticast mode
*
* @vlan_info: VLAN info
* @dsa_ptr: dsa specific data
* @tipc_ptr: TIPC specific data
* @atalk_ptr: AppleTalk link
* @ip_ptr: IPv4 specific data
* @dn_ptr: DECnet specific data
* @ip6_ptr: IPv6 specific data
* @ax25_ptr: AX.25 specific data
* @ieee80211_ptr: IEEE 802.11 specific data, assign before registering
* @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network
* device struct
* @mpls_ptr: mpls_dev struct pointer
* @mctp_ptr: MCTP specific data
*
* @dev_addr: Hw address (before bcast,
* because most packets are unicast)
*
* @_rx: Array of RX queues
* @num_rx_queues: Number of RX queues
* allocated at register_netdev() time
* @real_num_rx_queues: Number of RX queues currently active in device
* @xdp_prog: XDP sockets filter program pointer
* @gro_flush_timeout: timeout for GRO layer in NAPI
* @napi_defer_hard_irqs: If not zero, provides a counter that would
* allow to avoid NIC hard IRQ, on busy queues.
*
* @rx_handler: handler for received packets
* @rx_handler_data: XXX: need comments on this one
* @miniq_ingress: ingress/clsact qdisc specific data for
* ingress processing
* @ingress_queue: XXX: need comments on this one
* @nf_hooks_ingress: netfilter hooks executed for ingress packets
* @broadcast: hw bcast address
*
* @rx_cpu_rmap: CPU reverse-mapping for RX completion interrupts,
* indexed by RX queue number. Assigned by driver.
* This must only be set if the ndo_rx_flow_steer
* operation is defined
* @index_hlist: Device index hash chain
*
* @_tx: Array of TX queues
* @num_tx_queues: Number of TX queues allocated at alloc_netdev_mq() time
* @real_num_tx_queues: Number of TX queues currently active in device
* @qdisc: Root qdisc from userspace point of view
* @tx_queue_len: Max frames per queue allowed
* @tx_global_lock: XXX: need comments on this one
* @xdp_bulkq: XDP device bulk queue
* @xps_maps: all CPUs/RXQs maps for XPS device
*
* @xps_maps: XXX: need comments on this one
* @miniq_egress: clsact qdisc specific data for
* egress processing
* @qdisc_hash: qdisc hash table
* @watchdog_timeo: Represents the timeout that is used by
* the watchdog (see dev_watchdog())
* @watchdog_timer: List of timers
*
* @proto_down_reason: reason a netdev interface is held down
* @pcpu_refcnt: Number of references to this device
* @dev_refcnt: Number of references to this device
* @todo_list: Delayed register/unregister
* @link_watch_list: XXX: need comments on this one
*
* @reg_state: Register/unregister state machine
* @dismantle: Device is going to be freed
* @rtnl_link_state: This enum represents the phases of creating
* a new link
*
* @needs_free_netdev: Should unregister perform free_netdev?
* @priv_destructor: Called from unregister
* @npinfo: XXX: need comments on this one
* @nd_net: Network namespace this network device is inside
*
* @ml_priv: Mid-layer private
* @ml_priv_type: Mid-layer private type
* @lstats: Loopback statistics
* @tstats: Tunnel statistics
* @dstats: Dummy statistics
* @vstats: Virtual ethernet statistics
*
* @garp_port: GARP
* @mrp_port: MRP
*
* @dev: Class/net/name entry
* @sysfs_groups: Space for optional device, statistics and wireless
* sysfs groups
*
* @sysfs_rx_queue_group: Space for optional per-rx queue attributes
* @rtnl_link_ops: Rtnl_link_ops
*
* @gso_max_size: Maximum size of generic segmentation offload
* @gso_max_segs: Maximum number of segments that can be passed to the
* NIC for GSO
*
* @dcbnl_ops: Data Center Bridging netlink ops
* @num_tc: Number of traffic classes in the net device
* @tc_to_txq: XXX: need comments on this one
* @prio_tc_map: XXX: need comments on this one
*
* @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp
*
* @priomap: XXX: need comments on this one
* @phydev: Physical device may attach itself
* for hardware timestamping
* @sfp_bus: attached &struct sfp_bus structure.
*
* @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock
* @qdisc_running_key: lockdep class annotating Qdisc->running seqcount
*
* @proto_down: protocol port state information can be sent to the
* switch driver and used to set the phys state of the
* switch port.
*
* @wol_enabled: Wake-on-LAN is enabled
*
* @threaded: napi threaded mode is enabled
*
* @net_notifier_list: List of per-net netdev notifier block
* that follow this device when it is moved
* to another network namespace.
*
* @macsec_ops: MACsec offloading ops
*
* @udp_tunnel_nic_info: static structure describing the UDP tunnel
* offload capabilities of the device
* @udp_tunnel_nic: UDP tunnel offload state
* @xdp_state: stores info on attached XDP BPF programs
*
* @nested_level: Used as as a parameter of spin_lock_nested() of
* dev->addr_list_lock.
* @unlink_list: As netif_addr_lock() can be called recursively,
* keep a list of interfaces to be deleted.
*
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
*/
struct net_device {
char name[IFNAMSIZ];
struct netdev_name_node *name_node;
struct dev_ifalias __rcu *ifalias;
/*
* I/O specific fields
* FIXME: Merge these and struct ifmap into one
*/
unsigned long mem_end;
unsigned long mem_start;
unsigned long base_addr;
/*
* Some hardware also needs these fields (state,dev_list,
* napi_list,unreg_list,close_list) but they are not
* part of the usual set specified in Space.c.
*/
unsigned long state;
struct list_head dev_list;
struct list_head napi_list;
struct list_head unreg_list;
struct list_head close_list;
struct list_head ptype_all;
struct list_head ptype_specific;
struct {
struct list_head upper;
struct list_head lower;
} adj_list;
/* Read-mostly cache-line for fast-path access */
unsigned int flags;
unsigned int priv_flags;
const struct net_device_ops *netdev_ops;
int ifindex;
unsigned short gflags;
unsigned short hard_header_len;
/* Note : dev->mtu is often read without holding a lock.
* Writers usually hold RTNL.
* It is recommended to use READ_ONCE() to annotate the reads,
* and to use WRITE_ONCE() to annotate the writes.
*/
unsigned int mtu;
unsigned short needed_headroom;
unsigned short needed_tailroom;
netdev_features_t features;
netdev_features_t hw_features;
netdev_features_t wanted_features;
netdev_features_t vlan_features;
netdev_features_t hw_enc_features;
netdev_features_t mpls_features;
netdev_features_t gso_partial_features;
unsigned int min_mtu;
unsigned int max_mtu;
unsigned short type;
unsigned char min_header_len;
unsigned char name_assign_type;
int group;
struct net_device_stats stats; /* not used by modern drivers */
atomic_long_t rx_dropped;
atomic_long_t tx_dropped;
atomic_long_t rx_nohandler;
/* Stats to monitor link on/off, flapping */
atomic_t carrier_up_count;
atomic_t carrier_down_count;
#ifdef CONFIG_WIRELESS_EXT
const struct iw_handler_def *wireless_handlers;
struct iw_public_data *wireless_data;
#endif
const struct ethtool_ops *ethtool_ops;
#ifdef CONFIG_NET_L3_MASTER_DEV
const struct l3mdev_ops *l3mdev_ops;
#endif
#if IS_ENABLED(CONFIG_IPV6)
const struct ndisc_ops *ndisc_ops;
#endif
#ifdef CONFIG_XFRM_OFFLOAD
const struct xfrmdev_ops *xfrmdev_ops;
#endif
#if IS_ENABLED(CONFIG_TLS_DEVICE)
const struct tlsdev_ops *tlsdev_ops;
#endif
const struct header_ops *header_ops;
unsigned char operstate;
unsigned char link_mode;
unsigned char if_port;
unsigned char dma;
/* Interface address info. */
unsigned char perm_addr[MAX_ADDR_LEN];
unsigned char addr_assign_type;
unsigned char addr_len;
unsigned char upper_level;
unsigned char lower_level;
unsigned short neigh_priv_len;
unsigned short dev_id;
unsigned short dev_port;
unsigned short padded;
spinlock_t addr_list_lock;
int irq;
struct netdev_hw_addr_list uc;
struct netdev_hw_addr_list mc;
struct netdev_hw_addr_list dev_addrs;
#ifdef CONFIG_SYSFS
struct kset *queues_kset;
#endif
#ifdef CONFIG_LOCKDEP
struct list_head unlink_list;
#endif
unsigned int promiscuity;
unsigned int allmulti;
bool uc_promisc;
#ifdef CONFIG_LOCKDEP
unsigned char nested_level;
#endif
/* Protocol-specific pointers */
#if IS_ENABLED(CONFIG_VLAN_8021Q)
struct vlan_info __rcu *vlan_info;
#endif
#if IS_ENABLED(CONFIG_NET_DSA)
struct dsa_port *dsa_ptr;
#endif
#if IS_ENABLED(CONFIG_TIPC)
struct tipc_bearer __rcu *tipc_ptr;
#endif
#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK)
void *atalk_ptr;
#endif
struct in_device __rcu *ip_ptr;
#if IS_ENABLED(CONFIG_DECNET)
struct dn_dev __rcu *dn_ptr;
#endif
struct inet6_dev __rcu *ip6_ptr;
#if IS_ENABLED(CONFIG_AX25)
void *ax25_ptr;
#endif
struct wireless_dev *ieee80211_ptr;
struct wpan_dev *ieee802154_ptr;
#if IS_ENABLED(CONFIG_MPLS_ROUTING)
struct mpls_dev __rcu *mpls_ptr;
#endif
#if IS_ENABLED(CONFIG_MCTP)
struct mctp_dev __rcu *mctp_ptr;
#endif
/*
* Cache lines mostly used on receive path (including eth_type_trans())
*/
/* Interface address info used in eth_type_trans() */
unsigned char *dev_addr;
struct netdev_rx_queue *_rx;
unsigned int num_rx_queues;
unsigned int real_num_rx_queues;
struct bpf_prog __rcu *xdp_prog;
unsigned long gro_flush_timeout;
int napi_defer_hard_irqs;
rx_handler_func_t __rcu *rx_handler;
void __rcu *rx_handler_data;
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_ingress;
#endif
struct netdev_queue __rcu *ingress_queue;
#ifdef CONFIG_NETFILTER_INGRESS
struct nf_hook_entries __rcu *nf_hooks_ingress;
#endif
unsigned char broadcast[MAX_ADDR_LEN];
#ifdef CONFIG_RFS_ACCEL
struct cpu_rmap *rx_cpu_rmap;
#endif
struct hlist_node index_hlist;
/*
* Cache lines mostly used on transmit path
*/
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
unsigned int num_tx_queues;
unsigned int real_num_tx_queues;
struct Qdisc __rcu *qdisc;
unsigned int tx_queue_len;
spinlock_t tx_global_lock;
struct xdp_dev_bulk_queue __percpu *xdp_bulkq;
#ifdef CONFIG_XPS
struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
#endif
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
#endif
#ifdef CONFIG_NET_SCHED
DECLARE_HASHTABLE (qdisc_hash, 4);
#endif
/* These may be needed for future network-power-down code. */
struct timer_list watchdog_timer;
int watchdog_timeo;
u32 proto_down_reason;
struct list_head todo_list;
#ifdef CONFIG_PCPU_DEV_REFCNT
int __percpu *pcpu_refcnt;
#else
refcount_t dev_refcnt;
#endif
struct list_head link_watch_list;
enum { NETREG_UNINITIALIZED=0,
NETREG_REGISTERED, /* completed register_netdevice */
NETREG_UNREGISTERING, /* called unregister_netdevice */
NETREG_UNREGISTERED, /* completed unregister todo */
NETREG_RELEASED, /* called free_netdev */
NETREG_DUMMY, /* dummy device for NAPI poll */
} reg_state:8;
bool dismantle;
enum {
RTNL_LINK_INITIALIZED,
RTNL_LINK_INITIALIZING,
} rtnl_link_state:16;
bool needs_free_netdev;
void (*priv_destructor)(struct net_device *dev);
#ifdef CONFIG_NETPOLL
struct netpoll_info __rcu *npinfo;
#endif
possible_net_t nd_net;
/* mid-layer private */
void *ml_priv;
enum netdev_ml_priv_type ml_priv_type;
union {
struct pcpu_lstats __percpu *lstats;
struct pcpu_sw_netstats __percpu *tstats;
struct pcpu_dstats __percpu *dstats;
};
#if IS_ENABLED(CONFIG_GARP)
struct garp_port __rcu *garp_port;
#endif
#if IS_ENABLED(CONFIG_MRP)
struct mrp_port __rcu *mrp_port;
#endif
struct device dev;
const struct attribute_group *sysfs_groups[4];
const struct attribute_group *sysfs_rx_queue_group;
const struct rtnl_link_ops *rtnl_link_ops;
/* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SIZE 65536
unsigned int gso_max_size;
#define GSO_MAX_SEGS 65535
u16 gso_max_segs;
#ifdef CONFIG_DCB
const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif
s16 num_tc;
struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
u8 prio_tc_map[TC_BITMASK + 1];
#if IS_ENABLED(CONFIG_FCOE)
unsigned int fcoe_ddp_xid;
#endif
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
struct netprio_map __rcu *priomap;
#endif
struct phy_device *phydev;
struct sfp_bus *sfp_bus;
struct lock_class_key *qdisc_tx_busylock;
struct lock_class_key *qdisc_running_key;
bool proto_down;
unsigned wol_enabled:1;
unsigned threaded:1;
struct list_head net_notifier_list;
#if IS_ENABLED(CONFIG_MACSEC)
/* MACsec management functions */
const struct macsec_ops *macsec_ops;
#endif
const struct udp_tunnel_nic_info *udp_tunnel_nic_info;
struct udp_tunnel_nic *udp_tunnel_nic;
/* protected by rtnl_lock */
struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE];
};
#define to_net_dev(d) container_of(d, struct net_device, dev)
static inline bool netif_elide_gro(const struct net_device *dev)
{
if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
return true;
return false;
}
#define NETDEV_ALIGN 32
static inline
int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio)
{
return dev->prio_tc_map[prio & TC_BITMASK];
}
static inline
int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
{
if (tc >= dev->num_tc)
return -EINVAL;
dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK;
return 0;
}
int netdev_txq_to_tc(struct net_device *dev, unsigned int txq);
void netdev_reset_tc(struct net_device *dev);
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset);
int netdev_set_num_tc(struct net_device *dev, u8 num_tc);
static inline
int netdev_get_num_tc(struct net_device *dev)
{
return dev->num_tc;
}
static inline void net_prefetch(void *p)
{
prefetch(p);
#if L1_CACHE_BYTES < 128
prefetch((u8 *)p + L1_CACHE_BYTES);
#endif
}
static inline void net_prefetchw(void *p)
{
prefetchw(p);
#if L1_CACHE_BYTES < 128
prefetchw((u8 *)p + L1_CACHE_BYTES);
#endif
}
void netdev_unbind_sb_channel(struct net_device *dev,
struct net_device *sb_dev);
int netdev_bind_sb_channel_queue(struct net_device *dev,
struct net_device *sb_dev,
u8 tc, u16 count, u16 offset);
int netdev_set_sb_channel(struct net_device *dev, u16 channel);
static inline int netdev_get_sb_channel(struct net_device *dev)
{
return max_t(int, -dev->num_tc, 0);
}
static inline
struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
unsigned int index)
{
return &dev->_tx[index];
}
static inline struct netdev_queue *skb_get_tx_queue(const struct net_device *dev,
const struct sk_buff *skb)
{
return netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
}
static inline void netdev_for_each_tx_queue(struct net_device *dev,
void (*f)(struct net_device *,
struct netdev_queue *,
void *),
void *arg)
{
unsigned int i;
for (i = 0; i < dev->num_tx_queues; i++)
f(dev, &dev->_tx[i], arg);
}
#define netdev_lockdep_set_classes(dev) \
{ \
static struct lock_class_key qdisc_tx_busylock_key; \
static struct lock_class_key qdisc_running_key; \
static struct lock_class_key qdisc_xmit_lock_key; \
static struct lock_class_key dev_addr_list_lock_key; \
unsigned int i; \
\
(dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \
(dev)->qdisc_running_key = &qdisc_running_key; \
lockdep_set_class(&(dev)->addr_list_lock, \
&dev_addr_list_lock_key); \
for (i = 0; i < (dev)->num_tx_queues; i++) \
lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \
&qdisc_xmit_lock_key); \
}
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev);
struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
struct sk_buff *skb,
struct net_device *sb_dev);
/* returns the headroom that the master device needs to take in account
* when forwarding to this dev
*/
static inline unsigned netdev_get_fwd_headroom(struct net_device *dev)
{
return dev->priv_flags & IFF_PHONY_HEADROOM ? 0 : dev->needed_headroom;
}
static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr)
{
if (dev->netdev_ops->ndo_set_rx_headroom)
dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr);
}
/* set the device rx headroom to the dev's default */
static inline void netdev_reset_rx_headroom(struct net_device *dev)
{
netdev_set_rx_headroom(dev, -1);
}
static inline void *netdev_get_ml_priv(struct net_device *dev,
enum netdev_ml_priv_type type)
{
if (dev->ml_priv_type != type)
return NULL;
return dev->ml_priv;
}
static inline void netdev_set_ml_priv(struct net_device *dev,
void *ml_priv,
enum netdev_ml_priv_type type)
{
WARN(dev->ml_priv_type && dev->ml_priv_type != type,
"Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n",
dev->ml_priv_type, type);
WARN(!dev->ml_priv_type && dev->ml_priv,
"Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n");
dev->ml_priv = ml_priv;
dev->ml_priv_type = type;
}
/*
* Net namespace inlines
*/
static inline
struct net *dev_net(const struct net_device *dev)
{
return read_pnet(&dev->nd_net);
}
static inline
void dev_net_set(struct net_device *dev, struct net *net)
{
write_pnet(&dev->nd_net, net);
}
/**
* netdev_priv - access network device private data
* @dev: network device
*
* Get network device private data
*/
static inline void *netdev_priv(const struct net_device *dev)
{
return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
}
/* Set the sysfs physical device reference for the network logical device
* if set prior to registration will cause a symlink during initialization.
*/
#define SET_NETDEV_DEV(net, pdev) ((net)->dev.parent = (pdev))
/* Set the sysfs device type for the network logical device to allow
* fine-grained identification of different network device types. For
* example Ethernet, Wireless LAN, Bluetooth, WiMAX etc.
*/
#define SET_NETDEV_DEVTYPE(net, devtype) ((net)->dev.type = (devtype))
/* Default NAPI poll() weight
* Device drivers are strongly advised to not use bigger value
*/
#define NAPI_POLL_WEIGHT 64
/**
* netif_napi_add - initialize a NAPI context
* @dev: network device
* @napi: NAPI context
* @poll: polling function
* @weight: default weight
*
* netif_napi_add() must be used to initialize a NAPI context prior to calling
* *any* of the other NAPI-related functions.
*/
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight);
/**
* netif_tx_napi_add - initialize a NAPI context
* @dev: network device
* @napi: NAPI context
* @poll: polling function
* @weight: default weight
*
* This variant of netif_napi_add() should be used from drivers using NAPI
* to exclusively poll a TX queue.
* This will avoid we add it into napi_hash[], thus polluting this hash table.
*/
static inline void netif_tx_napi_add(struct net_device *dev,
struct napi_struct *napi,
int (*poll)(struct napi_struct *, int),
int weight)
{
set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state);
netif_napi_add(dev, napi, poll, weight);
}
/**
* __netif_napi_del - remove a NAPI context
* @napi: NAPI context
*
* Warning: caller must observe RCU grace period before freeing memory
* containing @napi. Drivers might want to call this helper to combine
* all the needed RCU grace periods into a single one.
*/
void __netif_napi_del(struct napi_struct *napi);
/**
* netif_napi_del - remove a NAPI context
* @napi: NAPI context
*
* netif_napi_del() removes a NAPI context from the network device NAPI list
*/
static inline void netif_napi_del(struct napi_struct *napi)
{
__netif_napi_del(napi);
synchronize_net();
}
struct napi_gro_cb {
/* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */
void *frag0;
/* Length of frag0. */
unsigned int frag0_len;
/* This indicates where we are processing relative to skb->data. */
int data_offset;
/* This is non-zero if the packet cannot be merged with the new skb. */
u16 flush;
/* Save the IP ID here and check when we get to the transport layer */
u16 flush_id;
/* Number of segments aggregated. */
u16 count;
/* Start offset for remote checksum offload */
u16 gro_remcsum_start;
/* jiffies when first packet was created/queued */
unsigned long age;
/* Used in ipv6_gro_receive() and foo-over-udp */
u16 proto;
/* This is non-zero if the packet may be of the same flow. */
u8 same_flow:1;
/* Used in tunnel GRO receive */
u8 encap_mark:1;
/* GRO checksum is valid */
u8 csum_valid:1;
/* Number of checksums via CHECKSUM_UNNECESSARY */
u8 csum_cnt:3;
/* Free the skb? */
u8 free:2;
#define NAPI_GRO_FREE 1
#define NAPI_GRO_FREE_STOLEN_HEAD 2
/* Used in foo-over-udp, set in udp[46]_gro_receive */
u8 is_ipv6:1;
/* Used in GRE, set in fou/gue_gro_receive */
u8 is_fou:1;
/* Used to determine if flush_id can be ignored */
u8 is_atomic:1;
/* Number of gro_receive callbacks this packet already went through */
u8 recursion_counter:4;
/* GRO is done by frag_list pointer chaining. */
u8 is_flist:1;
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum csum;
/* used in skb_gro_receive() slow path */
struct sk_buff *last;
};
#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
#define GRO_RECURSION_LIMIT 15
static inline int gro_recursion_inc_test(struct sk_buff *skb)
{
return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT;
}
typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *);
static inline struct sk_buff *call_gro_receive(gro_receive_t cb,
struct list_head *head,
struct sk_buff *skb)
{
if (unlikely(gro_recursion_inc_test(skb))) {
NAPI_GRO_CB(skb)->flush |= 1;
return NULL;
}
return cb(head, skb);
}
typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *,
struct sk_buff *);
static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb,
struct sock *sk,
struct list_head *head,
struct sk_buff *skb)
{
if (unlikely(gro_recursion_inc_test(skb))) {
NAPI_GRO_CB(skb)->flush |= 1;
return NULL;
}
return cb(sk, head, skb);
}
struct packet_type {
__be16 type; /* This is really htons(ether_type). */
bool ignore_outgoing;
struct net_device *dev; /* NULL is wildcarded here */
int (*func) (struct sk_buff *,
struct net_device *,
struct packet_type *,
struct net_device *);
void (*list_func) (struct list_head *,
struct packet_type *,
struct net_device *);
bool (*id_match)(struct packet_type *ptype,
struct sock *sk);
struct net *af_packet_net;
void *af_packet_priv;
struct list_head list;
};
struct offload_callbacks {
struct sk_buff *(*gso_segment)(struct sk_buff *skb,
netdev_features_t features);
struct sk_buff *(*gro_receive)(struct list_head *head,
struct sk_buff *skb);
int (*gro_complete)(struct sk_buff *skb, int nhoff);
};
struct packet_offload {
__be16 type; /* This is really htons(ether_type). */
u16 priority;
struct offload_callbacks callbacks;
struct list_head list;
};
/* often modified stats are per-CPU, other are shared (netdev->stats) */
struct pcpu_sw_netstats {
u64 rx_packets;
u64 rx_bytes;
u64 tx_packets;
u64 tx_bytes;
struct u64_stats_sync syncp;
} __aligned(4 * sizeof(u64));
struct pcpu_lstats {
u64_stats_t packets;
u64_stats_t bytes;
struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));
void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes);
static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len)
{
struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
u64_stats_update_begin(&tstats->syncp);
tstats->rx_bytes += len;
tstats->rx_packets++;
u64_stats_update_end(&tstats->syncp);
}
static inline void dev_sw_netstats_tx_add(struct net_device *dev,
unsigned int packets,
unsigned int len)
{
struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
u64_stats_update_begin(&tstats->syncp);
tstats->tx_bytes += len;
tstats->tx_packets += packets;
u64_stats_update_end(&tstats->syncp);
}
static inline void dev_lstats_add(struct net_device *dev, unsigned int len)
{
struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats);
u64_stats_update_begin(&lstats->syncp);
u64_stats_add(&lstats->bytes, len);
u64_stats_inc(&lstats->packets);
u64_stats_update_end(&lstats->syncp);
}
#define __netdev_alloc_pcpu_stats(type, gfp) \
({ \
typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\
if (pcpu_stats) { \
int __cpu; \
for_each_possible_cpu(__cpu) { \
typeof(type) *stat; \
stat = per_cpu_ptr(pcpu_stats, __cpu); \
u64_stats_init(&stat->syncp); \
} \
} \
pcpu_stats; \
})
#define netdev_alloc_pcpu_stats(type) \
__netdev_alloc_pcpu_stats(type, GFP_KERNEL)
#define devm_netdev_alloc_pcpu_stats(dev, type) \
({ \
typeof(type) __percpu *pcpu_stats = devm_alloc_percpu(dev, type);\
if (pcpu_stats) { \
int __cpu; \
for_each_possible_cpu(__cpu) { \
typeof(type) *stat; \
stat = per_cpu_ptr(pcpu_stats, __cpu); \
u64_stats_init(&stat->syncp); \
} \
} \
pcpu_stats; \
})
enum netdev_lag_tx_type {
NETDEV_LAG_TX_TYPE_UNKNOWN,
NETDEV_LAG_TX_TYPE_RANDOM,
NETDEV_LAG_TX_TYPE_BROADCAST,
NETDEV_LAG_TX_TYPE_ROUNDROBIN,
NETDEV_LAG_TX_TYPE_ACTIVEBACKUP,
NETDEV_LAG_TX_TYPE_HASH,
};
enum netdev_lag_hash {
NETDEV_LAG_HASH_NONE,
NETDEV_LAG_HASH_L2,
NETDEV_LAG_HASH_L34,
NETDEV_LAG_HASH_L23,
NETDEV_LAG_HASH_E23,
NETDEV_LAG_HASH_E34,
NETDEV_LAG_HASH_VLAN_SRCMAC,
NETDEV_LAG_HASH_UNKNOWN,
};
struct netdev_lag_upper_info {
enum netdev_lag_tx_type tx_type;
enum netdev_lag_hash hash_type;
};
struct netdev_lag_lower_state_info {
u8 link_up : 1,
tx_enabled : 1;
};
#include <linux/notifier.h>
/* netdevice notifier chain. Please remember to update netdev_cmd_to_name()
* and the rtnetlink notification exclusion list in rtnetlink_event() when
* adding new types.
*/
enum netdev_cmd {
NETDEV_UP = 1, /* For now you can't veto a device up/down */
NETDEV_DOWN,
NETDEV_REBOOT, /* Tell a protocol stack a network interface
detected a hardware crash and restarted
- we can use this eg to kick tcp sessions
once done */
NETDEV_CHANGE, /* Notify device state change */
NETDEV_REGISTER,
NETDEV_UNREGISTER,
NETDEV_CHANGEMTU, /* notify after mtu change happened */
NETDEV_CHANGEADDR, /* notify after the address change */
NETDEV_PRE_CHANGEADDR, /* notify before the address change */
NETDEV_GOING_DOWN,
NETDEV_CHANGENAME,
NETDEV_FEAT_CHANGE,
NETDEV_BONDING_FAILOVER,
NETDEV_PRE_UP,
NETDEV_PRE_TYPE_CHANGE,
NETDEV_POST_TYPE_CHANGE,
NETDEV_POST_INIT,
NETDEV_RELEASE,
NETDEV_NOTIFY_PEERS,
NETDEV_JOIN,
NETDEV_CHANGEUPPER,
NETDEV_RESEND_IGMP,
NETDEV_PRECHANGEMTU, /* notify before mtu change happened */
NETDEV_CHANGEINFODATA,
NETDEV_BONDING_INFO,
NETDEV_PRECHANGEUPPER,
NETDEV_CHANGELOWERSTATE,
NETDEV_UDP_TUNNEL_PUSH_INFO,
NETDEV_UDP_TUNNEL_DROP_INFO,
NETDEV_CHANGE_TX_QUEUE_LEN,
NETDEV_CVLAN_FILTER_PUSH_INFO,
NETDEV_CVLAN_FILTER_DROP_INFO,
NETDEV_SVLAN_FILTER_PUSH_INFO,
NETDEV_SVLAN_FILTER_DROP_INFO,
};
const char *netdev_cmd_to_name(enum netdev_cmd cmd);
int register_netdevice_notifier(struct notifier_block *nb);
int unregister_netdevice_notifier(struct notifier_block *nb);
int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb);
int unregister_netdevice_notifier_net(struct net *net,
struct notifier_block *nb);
int register_netdevice_notifier_dev_net(struct net_device *dev,
struct notifier_block *nb,
struct netdev_net_notifier *nn);
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
struct notifier_block *nb,
struct netdev_net_notifier *nn);
struct netdev_notifier_info {
struct net_device *dev;
struct netlink_ext_ack *extack;
};
struct netdev_notifier_info_ext {
struct netdev_notifier_info info; /* must be first */
union {
u32 mtu;
} ext;
};
struct netdev_notifier_change_info {
struct netdev_notifier_info info; /* must be first */
unsigned int flags_changed;
};
struct netdev_notifier_changeupper_info {
struct netdev_notifier_info info; /* must be first */
struct net_device *upper_dev; /* new upper dev */
bool master; /* is upper dev master */
bool linking; /* is the notification for link or unlink */
void *upper_info; /* upper dev info */
};
struct netdev_notifier_changelowerstate_info {
struct netdev_notifier_info info; /* must be first */
void *lower_state_info; /* is lower dev state */
};
struct netdev_notifier_pre_changeaddr_info {
struct netdev_notifier_info info; /* must be first */
const unsigned char *dev_addr;
};
static inline void netdev_notifier_info_init(struct netdev_notifier_info *info,
struct net_device *dev)
{
info->dev = dev;
info->extack = NULL;
}
static inline struct net_device *
netdev_notifier_info_to_dev(const struct netdev_notifier_info *info)
{
return info->dev;
}
static inline struct netlink_ext_ack *
netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
{
return info->extack;
}
int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
extern rwlock_t dev_base_lock; /* Device list lock */
#define for_each_netdev(net, d) \
list_for_each_entry(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_reverse(net, d) \
list_for_each_entry_reverse(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_rcu(net, d) \
list_for_each_entry_rcu(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_safe(net, d, n) \
list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue(net, d) \
list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue_reverse(net, d) \
list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \
dev_list)
#define for_each_netdev_continue_rcu(net, d) \
list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_in_bond_rcu(bond, slave) \
for_each_netdev_rcu(&init_net, slave) \
if (netdev_master_upper_dev_get_rcu(slave) == (bond))
#define net_device_entry(lh) list_entry(lh, struct net_device, dev_list)
static inline struct net_device *next_net_device(struct net_device *dev)
{
struct list_head *lh;
struct net *net;
net = dev_net(dev);
lh = dev->dev_list.next;
return lh == &net->dev_base_head ? NULL : net_device_entry(lh);
}
static inline struct net_device *next_net_device_rcu(struct net_device *dev)
{
struct list_head *lh;
struct net *net;
net = dev_net(dev);
lh = rcu_dereference(list_next_rcu(&dev->dev_list));
return lh == &net->dev_base_head ? NULL : net_device_entry(lh);
}
static inline struct net_device *first_net_device(struct net *net)
{
return list_empty(&net->dev_base_head) ? NULL :
net_device_entry(net->dev_base_head.next);
}
static inline struct net_device *first_net_device_rcu(struct net *net)
{
struct list_head *lh = rcu_dereference(list_next_rcu(&net->dev_base_head));
return lh == &net->dev_base_head ? NULL : net_device_entry(lh);
}
int netdev_boot_setup_check(struct net_device *dev);
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
const char *hwaddr);
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type);
void dev_add_pack(struct packet_type *pt);
void dev_remove_pack(struct packet_type *pt);
void __dev_remove_pack(struct packet_type *pt);
void dev_add_offload(struct packet_offload *po);
void dev_remove_offload(struct packet_offload *po);
int dev_get_iflink(const struct net_device *dev);
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
struct net_device_path_stack *stack);
struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
unsigned short mask);
struct net_device *dev_get_by_name(struct net *net, const char *name);
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name);
struct net_device *__dev_get_by_name(struct net *net, const char *name);
int dev_alloc_name(struct net_device *dev, const char *name);
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack);
void dev_close(struct net_device *dev);
void dev_close_many(struct list_head *head, bool unlink);
void dev_disable_lro(struct net_device *dev);
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev);
u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev);
int dev_queue_xmit(struct sk_buff *skb);
int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
int ret;
ret = __dev_direct_xmit(skb, queue_id);
if (!dev_xmit_complete(ret))
kfree_skb(skb);
return ret;
}
int register_netdevice(struct net_device *dev);
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
void unregister_netdevice_many(struct list_head *head);
static inline void unregister_netdevice(struct net_device *dev)
{
unregister_netdevice_queue(dev, NULL);
}
int netdev_refcnt_read(const struct net_device *dev);
void free_netdev(struct net_device *dev);
void netdev_freemem(struct net_device *dev);
int init_dummy_netdev(struct net_device *dev);
struct net_device *netdev_get_xmit_slave(struct net_device *dev,
struct sk_buff *skb,
bool all_slaves);
struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
struct sock *sk);
struct net_device *dev_get_by_index(struct net *net, int ifindex);
struct net_device *__dev_get_by_index(struct net *net, int ifindex);
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
struct net_device *dev_get_by_napi_id(unsigned int napi_id);
int netdev_get_name(struct net *net, char *name, int ifindex);
int dev_restart(struct net_device *dev);
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
static inline unsigned int skb_gro_offset(const struct sk_buff *skb)
{
return NAPI_GRO_CB(skb)->data_offset;
}
static inline unsigned int skb_gro_len(const struct sk_buff *skb)
{
return skb->len - NAPI_GRO_CB(skb)->data_offset;
}
static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len)
{
NAPI_GRO_CB(skb)->data_offset += len;
}
static inline void *skb_gro_header_fast(struct sk_buff *skb,
unsigned int offset)
{
return NAPI_GRO_CB(skb)->frag0 + offset;
}
static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen)
{
return NAPI_GRO_CB(skb)->frag0_len < hlen;
}
static inline void skb_gro_frag0_invalidate(struct sk_buff *skb)
{
NAPI_GRO_CB(skb)->frag0 = NULL;
NAPI_GRO_CB(skb)->frag0_len = 0;
}
static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen,
unsigned int offset)
{
if (!pskb_may_pull(skb, hlen))
return NULL;
skb_gro_frag0_invalidate(skb);
return skb->data + offset;
}
static inline void *skb_gro_network_header(struct sk_buff *skb)
{
return (NAPI_GRO_CB(skb)->frag0 ?: skb->data) +
skb_network_offset(skb);
}
static inline void skb_gro_postpull_rcsum(struct sk_buff *skb,
const void *start, unsigned int len)
{
if (NAPI_GRO_CB(skb)->csum_valid)
NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum,
csum_partial(start, len, 0));
}
/* GRO checksum functions. These are logical equivalents of the normal
* checksum functions (in skbuff.h) except that they operate on the GRO
* offsets and fields in sk_buff.
*/
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb);
static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb)
{
return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb));
}
static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
bool zero_okay,
__sum16 check)
{
return ((skb->ip_summed != CHECKSUM_PARTIAL ||
skb_checksum_start_offset(skb) <
skb_gro_offset(skb)) &&
!skb_at_gro_remcsum_start(skb) &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
(!zero_okay || check));
}
static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb,
__wsum psum)
{
if (NAPI_GRO_CB(skb)->csum_valid &&
!csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum)))
return 0;
NAPI_GRO_CB(skb)->csum = psum;
return __skb_gro_checksum_complete(skb);
}
static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb)
{
if (NAPI_GRO_CB(skb)->csum_cnt > 0) {
/* Consume a checksum from CHECKSUM_UNNECESSARY */
NAPI_GRO_CB(skb)->csum_cnt--;
} else {
/* Update skb for CHECKSUM_UNNECESSARY and csum_level when we
* verified a new top level checksum or an encapsulated one
* during GRO. This saves work if we fallback to normal path.
*/
__skb_incr_checksum_unnecessary(skb);
}
}
#define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \
compute_pseudo) \
({ \
__sum16 __ret = 0; \
if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \
__ret = __skb_gro_checksum_validate_complete(skb, \
compute_pseudo(skb, proto)); \
if (!__ret) \
skb_gro_incr_csum_unnecessary(skb); \
__ret; \
})
#define skb_gro_checksum_validate(skb, proto, compute_pseudo) \
__skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo)
#define skb_gro_checksum_validate_zero_check(skb, proto, check, \
compute_pseudo) \
__skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo)
#define skb_gro_checksum_simple_validate(skb) \
__skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo)
static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb)
{
return (NAPI_GRO_CB(skb)->csum_cnt == 0 &&
!NAPI_GRO_CB(skb)->csum_valid);
}
static inline void __skb_gro_checksum_convert(struct sk_buff *skb,
__wsum pseudo)
{
NAPI_GRO_CB(skb)->csum = ~pseudo;
NAPI_GRO_CB(skb)->csum_valid = 1;
}
#define skb_gro_checksum_try_convert(skb, proto, compute_pseudo) \
do { \
if (__skb_gro_checksum_convert_check(skb)) \
__skb_gro_checksum_convert(skb, \
compute_pseudo(skb, proto)); \
} while (0)
struct gro_remcsum {
int offset;
__wsum delta;
};
static inline void skb_gro_remcsum_init(struct gro_remcsum *grc)
{
grc->offset = 0;
grc->delta = 0;
}
static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
unsigned int off, size_t hdrlen,
int start, int offset,
struct gro_remcsum *grc,
bool nopartial)
{
__wsum delta;
size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
BUG_ON(!NAPI_GRO_CB(skb)->csum_valid);
if (!nopartial) {
NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start;
return ptr;
}
ptr = skb_gro_header_fast(skb, off);
if (skb_gro_header_hard(skb, off + plen)) {
ptr = skb_gro_header_slow(skb, off + plen, off);
if (!ptr)
return NULL;
}
delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum,
start, offset);
/* Adjust skb->csum since we changed the packet */
NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
grc->offset = off + hdrlen + offset;
grc->delta = delta;
return ptr;
}
static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
struct gro_remcsum *grc)
{
void *ptr;
size_t plen = grc->offset + sizeof(u16);
if (!grc->delta)
return;
ptr = skb_gro_header_fast(skb, grc->offset);
if (skb_gro_header_hard(skb, grc->offset + sizeof(u16))) {
ptr = skb_gro_header_slow(skb, plen, grc->offset);
if (!ptr)
return;
}
remcsum_unadjust((__sum16 *)ptr, grc->delta);
}
#ifdef CONFIG_XFRM_OFFLOAD
static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
{
if (PTR_ERR(pp) != -EINPROGRESS)
NAPI_GRO_CB(skb)->flush |= flush;
}
static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
struct sk_buff *pp,
int flush,
struct gro_remcsum *grc)
{
if (PTR_ERR(pp) != -EINPROGRESS) {
NAPI_GRO_CB(skb)->flush |= flush;
skb_gro_remcsum_cleanup(skb, grc);
skb->remcsum_offload = 0;
}
}
#else
static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
{
NAPI_GRO_CB(skb)->flush |= flush;
}
static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb,
struct sk_buff *pp,
int flush,
struct gro_remcsum *grc)
{
NAPI_GRO_CB(skb)->flush |= flush;
skb_gro_remcsum_cleanup(skb, grc);
skb->remcsum_offload = 0;
}
#endif
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
const void *daddr, const void *saddr,
unsigned int len)
{
if (!dev->header_ops || !dev->header_ops->create)
return 0;
return dev->header_ops->create(skb, dev, type, daddr, saddr, len);
}
static inline int dev_parse_header(const struct sk_buff *skb,
unsigned char *haddr)
{
const struct net_device *dev = skb->dev;
if (!dev->header_ops || !dev->header_ops->parse)
return 0;
return dev->header_ops->parse(skb, haddr);
}
static inline __be16 dev_parse_header_protocol(const struct sk_buff *skb)
{
const struct net_device *dev = skb->dev; if (!dev->header_ops || !dev->header_ops->parse_protocol)
return 0;
return dev->header_ops->parse_protocol(skb);
}
/* ll_header must have at least hard_header_len allocated */
static inline bool dev_validate_header(const struct net_device *dev,
char *ll_header, int len)
{
if (likely(len >= dev->hard_header_len))
return true;
if (len < dev->min_header_len)
return false;
if (capable(CAP_SYS_RAWIO)) {
memset(ll_header + len, 0, dev->hard_header_len - len); return true;
}
if (dev->header_ops && dev->header_ops->validate) return dev->header_ops->validate(ll_header, len);
return false;
}
static inline bool dev_has_header(const struct net_device *dev)
{
return dev->header_ops && dev->header_ops->create;
}
#ifdef CONFIG_NET_FLOW_LIMIT
#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */
struct sd_flow_limit {
u64 count;
unsigned int num_buckets;
unsigned int history_head;
u16 history[FLOW_LIMIT_HISTORY];
u8 buckets[];
};
extern int netdev_flow_limit_table_len;
#endif /* CONFIG_NET_FLOW_LIMIT */
/*
* Incoming packets are placed on per-CPU queues
*/
struct softnet_data {
struct list_head poll_list;
struct sk_buff_head process_queue;
/* stats */
unsigned int processed;
unsigned int time_squeeze;
unsigned int received_rps;
#ifdef CONFIG_RPS
struct softnet_data *rps_ipi_list;
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
struct sd_flow_limit __rcu *flow_limit;
#endif
struct Qdisc *output_queue;
struct Qdisc **output_queue_tailp;
struct sk_buff *completion_queue;
#ifdef CONFIG_XFRM_OFFLOAD
struct sk_buff_head xfrm_backlog;
#endif
/* written and read only by owning cpu: */
struct {
u16 recursion;
u8 more;
} xmit;
#ifdef CONFIG_RPS
/* input_queue_head should be written by cpu owning this struct,
* and only read by other cpus. Worth using a cache line.
*/
unsigned int input_queue_head ____cacheline_aligned_in_smp;
/* Elements below can be accessed between CPUs for RPS/RFS */
call_single_data_t csd ____cacheline_aligned_in_smp;
struct softnet_data *rps_ipi_next;
unsigned int cpu;
unsigned int input_queue_tail;
#endif
unsigned int dropped;
struct sk_buff_head input_pkt_queue;
struct napi_struct backlog;
};
static inline void input_queue_head_incr(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
sd->input_queue_head++;
#endif
}
static inline void input_queue_tail_incr_save(struct softnet_data *sd,
unsigned int *qtail)
{
#ifdef CONFIG_RPS
*qtail = ++sd->input_queue_tail;
#endif
}
DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
static inline int dev_recursion_level(void)
{
return this_cpu_read(softnet_data.xmit.recursion);
}
#define XMIT_RECURSION_LIMIT 8
static inline bool dev_xmit_recursion(void)
{
return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
XMIT_RECURSION_LIMIT);
}
static inline void dev_xmit_recursion_inc(void)
{
__this_cpu_inc(softnet_data.xmit.recursion);
}
static inline void dev_xmit_recursion_dec(void)
{
__this_cpu_dec(softnet_data.xmit.recursion);
}
void __netif_schedule(struct Qdisc *q);
void netif_schedule_queue(struct netdev_queue *txq);
static inline void netif_tx_schedule_all(struct net_device *dev)
{
unsigned int i;
for (i = 0; i < dev->num_tx_queues; i++)
netif_schedule_queue(netdev_get_tx_queue(dev, i));
}
static __always_inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
{
clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}
/**
* netif_start_queue - allow transmit
* @dev: network device
*
* Allow upper layers to call the device hard_start_xmit routine.
*/
static inline void netif_start_queue(struct net_device *dev)
{
netif_tx_start_queue(netdev_get_tx_queue(dev, 0));
}
static inline void netif_tx_start_all_queues(struct net_device *dev)
{
unsigned int i;
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
netif_tx_start_queue(txq);
}
}
void netif_tx_wake_queue(struct netdev_queue *dev_queue);
/**
* netif_wake_queue - restart transmit
* @dev: network device
*
* Allow upper layers to call the device hard_start_xmit routine.
* Used for flow control when transmit resources are available.
*/
static inline void netif_wake_queue(struct net_device *dev)
{
netif_tx_wake_queue(netdev_get_tx_queue(dev, 0));
}
static inline void netif_tx_wake_all_queues(struct net_device *dev)
{
unsigned int i;
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
netif_tx_wake_queue(txq);
}
}
static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
{
set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}
/**
* netif_stop_queue - stop transmitted packets
* @dev: network device
*
* Stop upper layers calling the device hard_start_xmit routine.
* Used for flow control when transmit resources are unavailable.
*/
static inline void netif_stop_queue(struct net_device *dev)
{
netif_tx_stop_queue(netdev_get_tx_queue(dev, 0));
}
void netif_tx_stop_all_queues(struct net_device *dev);
static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue)
{
return test_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}
/**
* netif_queue_stopped - test if transmit queue is flowblocked
* @dev: network device
*
* Test if transmit queue on device is currently unable to send.
*/
static inline bool netif_queue_stopped(const struct net_device *dev)
{
return netif_tx_queue_stopped(netdev_get_tx_queue(dev, 0));
}
static inline bool netif_xmit_stopped(const struct netdev_queue *dev_queue)
{
return dev_queue->state & QUEUE_STATE_ANY_XOFF;
}
static inline bool
netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue)
{
return dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN;
}
static inline bool
netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue)
{
return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN;
}
/**
* netdev_queue_set_dql_min_limit - set dql minimum limit
* @dev_queue: pointer to transmit queue
* @min_limit: dql minimum limit
*
* Forces xmit_more() to return true until the minimum threshold
* defined by @min_limit is reached (or until the tx queue is
* empty). Warning: to be use with care, misuse will impact the
* latency.
*/
static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue,
unsigned int min_limit)
{
#ifdef CONFIG_BQL
dev_queue->dql.min_limit = min_limit;
#endif
}
/**
* netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write
* @dev_queue: pointer to transmit queue
*
* BQL enabled drivers might use this helper in their ndo_start_xmit(),
* to give appropriate hint to the CPU.
*/
static inline void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
prefetchw(&dev_queue->dql.num_queued);
#endif
}
/**
* netdev_txq_bql_complete_prefetchw - prefetch bql data for write
* @dev_queue: pointer to transmit queue
*
* BQL enabled drivers might use this helper in their TX completion path,
* to give appropriate hint to the CPU.
*/
static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
prefetchw(&dev_queue->dql.limit);
#endif
}
static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue,
unsigned int bytes)
{
#ifdef CONFIG_BQL
dql_queued(&dev_queue->dql, bytes);
if (likely(dql_avail(&dev_queue->dql) >= 0))
return;
set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);
/*
* The XOFF flag must be set before checking the dql_avail below,
* because in netdev_tx_completed_queue we update the dql_completed
* before checking the XOFF flag.
*/
smp_mb();
/* check again in case another CPU has just made room avail */
if (unlikely(dql_avail(&dev_queue->dql) >= 0))
clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);
#endif
}
/* Variant of netdev_tx_sent_queue() for drivers that are aware
* that they should not test BQL status themselves.
* We do want to change __QUEUE_STATE_STACK_XOFF only for the last
* skb of a batch.
* Returns true if the doorbell must be used to kick the NIC.
*/
static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue,
unsigned int bytes,
bool xmit_more)
{
if (xmit_more) {
#ifdef CONFIG_BQL
dql_queued(&dev_queue->dql, bytes);
#endif
return netif_tx_queue_stopped(dev_queue);
}
netdev_tx_sent_queue(dev_queue, bytes);
return true;
}
/**
* netdev_sent_queue - report the number of bytes queued to hardware
* @dev: network device
* @bytes: number of bytes queued to the hardware device queue
*
* Report the number of bytes queued for sending/completion to the network
* device hardware queue. @bytes should be a good approximation and should
* exactly match netdev_completed_queue() @bytes
*/
static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes)
{
netdev_tx_sent_queue(netdev_get_tx_queue(dev, 0), bytes);
}
static inline bool __netdev_sent_queue(struct net_device *dev,
unsigned int bytes,
bool xmit_more)
{
return __netdev_tx_sent_queue(netdev_get_tx_queue(dev, 0), bytes,
xmit_more);
}
static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue,
unsigned int pkts, unsigned int bytes)
{
#ifdef CONFIG_BQL
if (unlikely(!bytes))
return;
dql_completed(&dev_queue->dql, bytes);
/*
* Without the memory barrier there is a small possiblity that
* netdev_tx_sent_queue will miss the update and cause the queue to
* be stopped forever
*/
smp_mb();
if (unlikely(dql_avail(&dev_queue->dql) < 0))
return;
if (test_and_clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state))
netif_schedule_queue(dev_queue);
#endif
}
/**
* netdev_completed_queue - report bytes and packets completed by device
* @dev: network device
* @pkts: actual number of packets sent over the medium
* @bytes: actual number of bytes sent over the medium
*
* Report the number of bytes and packets transmitted by the network device
* hardware queue over the physical medium, @bytes must exactly match the
* @bytes amount passed to netdev_sent_queue()
*/
static inline void netdev_completed_queue(struct net_device *dev,
unsigned int pkts, unsigned int bytes)
{
netdev_tx_completed_queue(netdev_get_tx_queue(dev, 0), pkts, bytes);
}
static inline void netdev_tx_reset_queue(struct netdev_queue *q)
{
#ifdef CONFIG_BQL
clear_bit(__QUEUE_STATE_STACK_XOFF, &q->state);
dql_reset(&q->dql);
#endif
}
/**
* netdev_reset_queue - reset the packets and bytes count of a network device
* @dev_queue: network device
*
* Reset the bytes and packet count of a network device and clear the
* software flow control OFF bit for this network device
*/
static inline void netdev_reset_queue(struct net_device *dev_queue)
{
netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0));
}
/**
* netdev_cap_txqueue - check if selected tx queue exceeds device queues
* @dev: network device
* @queue_index: given tx queue index
*
* Returns 0 if given tx queue index >= number of device tx queues,
* otherwise returns the originally passed tx queue index.
*/
static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
if (unlikely(queue_index >= dev->real_num_tx_queues)) { net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
dev->name, queue_index,
dev->real_num_tx_queues);
return 0;
}
return queue_index;
}
/**
* netif_running - test if up
* @dev: network device
*
* Test if the device has been brought up.
*/
static inline bool netif_running(const struct net_device *dev)
{
return test_bit(__LINK_STATE_START, &dev->state);
}
/*
* Routines to manage the subqueues on a device. We only need start,
* stop, and a check if it's stopped. All other device management is
* done at the overall netdevice level.
* Also test the device if we're multiqueue.
*/
/**
* netif_start_subqueue - allow sending packets on subqueue
* @dev: network device
* @queue_index: sub queue index
*
* Start individual transmit queue of a device with multiple transmit queues.
*/
static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index)
{
struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
netif_tx_start_queue(txq);
}
/**
* netif_stop_subqueue - stop sending packets on subqueue
* @dev: network device
* @queue_index: sub queue index
*
* Stop individual transmit queue of a device with multiple transmit queues.
*/
static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index)
{
struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
netif_tx_stop_queue(txq);
}
/**
* __netif_subqueue_stopped - test status of subqueue
* @dev: network device
* @queue_index: sub queue index
*
* Check individual transmit queue of a device with multiple transmit queues.
*/
static inline bool __netif_subqueue_stopped(const struct net_device *dev,
u16 queue_index)
{
struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
return netif_tx_queue_stopped(txq);
}
/**
* netif_subqueue_stopped - test status of subqueue
* @dev: network device
* @skb: sub queue buffer pointer
*
* Check individual transmit queue of a device with multiple transmit queues.
*/
static inline bool netif_subqueue_stopped(const struct net_device *dev,
struct sk_buff *skb)
{
return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb));
}
/**
* netif_wake_subqueue - allow sending packets on subqueue
* @dev: network device
* @queue_index: sub queue index
*
* Resume individual transmit queue of a device with multiple transmit queues.
*/
static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
netif_tx_wake_queue(txq);
}
#ifdef CONFIG_XPS
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index);
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
u16 index, enum xps_map_type type);
/**
* netif_attr_test_mask - Test a CPU or Rx queue set in a mask
* @j: CPU/Rx queue index
* @mask: bitmask of all cpus/rx queues
* @nr_bits: number of bits in the bitmask
*
* Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.
*/
static inline bool netif_attr_test_mask(unsigned long j,
const unsigned long *mask,
unsigned int nr_bits)
{
cpu_max_bits_warn(j, nr_bits);
return test_bit(j, mask);
}
/**
* netif_attr_test_online - Test for online CPU/Rx queue
* @j: CPU/Rx queue index
* @online_mask: bitmask for CPUs/Rx queues that are online
* @nr_bits: number of bits in the bitmask
*
* Returns true if a CPU/Rx queue is online.
*/
static inline bool netif_attr_test_online(unsigned long j,
const unsigned long *online_mask,
unsigned int nr_bits)
{
cpu_max_bits_warn(j, nr_bits);
if (online_mask)
return test_bit(j, online_mask);
return (j < nr_bits);
}
/**
* netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask
* @n: CPU/Rx queue index
* @srcp: the cpumask/Rx queue mask pointer
* @nr_bits: number of bits in the bitmask
*
* Returns >= nr_bits if no further CPUs/Rx queues set.
*/
static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp,
unsigned int nr_bits)
{
/* -1 is a legal arg here. */
if (n != -1)
cpu_max_bits_warn(n, nr_bits);
if (srcp)
return find_next_bit(srcp, nr_bits, n + 1);
return n + 1;
}
/**
* netif_attrmask_next_and - get the next CPU/Rx queue in \*src1p & \*src2p
* @n: CPU/Rx queue index
* @src1p: the first CPUs/Rx queues mask pointer
* @src2p: the second CPUs/Rx queues mask pointer
* @nr_bits: number of bits in the bitmask
*
* Returns >= nr_bits if no further CPUs/Rx queues set in both.
*/
static inline int netif_attrmask_next_and(int n, const unsigned long *src1p,
const unsigned long *src2p,
unsigned int nr_bits)
{
/* -1 is a legal arg here. */
if (n != -1)
cpu_max_bits_warn(n, nr_bits);
if (src1p && src2p)
return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
else if (src1p)
return find_next_bit(src1p, nr_bits, n + 1);
else if (src2p)
return find_next_bit(src2p, nr_bits, n + 1);
return n + 1;
}
#else
static inline int netif_set_xps_queue(struct net_device *dev,
const struct cpumask *mask,
u16 index)
{
return 0;
}
static inline int __netif_set_xps_queue(struct net_device *dev,
const unsigned long *mask,
u16 index, enum xps_map_type type)
{
return 0;
}
#endif
/**
* netif_is_multiqueue - test if device has multiple transmit queues
* @dev: network device
*
* Check if device has multiple transmit queues
*/
static inline bool netif_is_multiqueue(const struct net_device *dev)
{
return dev->num_tx_queues > 1;
}
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq);
#ifdef CONFIG_SYSFS
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq);
#else
static inline int netif_set_real_num_rx_queues(struct net_device *dev,
unsigned int rxqs)
{
dev->real_num_rx_queues = rxqs;
return 0;
}
#endif
int netif_set_real_num_queues(struct net_device *dev,
unsigned int txq, unsigned int rxq);
static inline struct netdev_rx_queue *
__netif_get_rx_queue(struct net_device *dev, unsigned int rxq)
{
return dev->_rx + rxq;
}
#ifdef CONFIG_SYSFS
static inline unsigned int get_netdev_rx_queue_index(
struct netdev_rx_queue *queue)
{
struct net_device *dev = queue->dev;
int index = queue - dev->_rx;
BUG_ON(index >= dev->num_rx_queues);
return index;
}
#endif
#define DEFAULT_MAX_NUM_RSS_QUEUES (8)
int netif_get_num_default_rss_queues(void);
enum skb_free_reason {
SKB_REASON_CONSUMED,
SKB_REASON_DROPPED,
};
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason);
void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason);
/*
* It is not allowed to call kfree_skb() or consume_skb() from hardware
* interrupt context or with hardware interrupts being disabled.
* (in_hardirq() || irqs_disabled())
*
* We provide four helpers that can be used in following contexts :
*
* dev_kfree_skb_irq(skb) when caller drops a packet from irq context,
* replacing kfree_skb(skb)
*
* dev_consume_skb_irq(skb) when caller consumes a packet from irq context.
* Typically used in place of consume_skb(skb) in TX completion path
*
* dev_kfree_skb_any(skb) when caller doesn't know its current irq context,
* replacing kfree_skb(skb)
*
* dev_consume_skb_any(skb) when caller doesn't know its current irq context,
* and consumed a packet. Used in place of consume_skb(skb)
*/
static inline void dev_kfree_skb_irq(struct sk_buff *skb)
{
__dev_kfree_skb_irq(skb, SKB_REASON_DROPPED);
}
static inline void dev_consume_skb_irq(struct sk_buff *skb)
{
__dev_kfree_skb_irq(skb, SKB_REASON_CONSUMED);
}
static inline void dev_kfree_skb_any(struct sk_buff *skb)
{
__dev_kfree_skb_any(skb, SKB_REASON_DROPPED);
}
static inline void dev_consume_skb_any(struct sk_buff *skb)
{
__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
}
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog);
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
int netif_rx(struct sk_buff *skb);
int netif_rx_ni(struct sk_buff *skb);
int netif_rx_any_context(struct sk_buff *skb);
int netif_receive_skb(struct sk_buff *skb);
int netif_receive_skb_core(struct sk_buff *skb);
void netif_receive_skb_list(struct list_head *head);
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
void napi_gro_flush(struct napi_struct *napi, bool flush_old);
struct sk_buff *napi_get_frags(struct napi_struct *napi);
gro_result_t napi_gro_frags(struct napi_struct *napi);
struct packet_offload *gro_find_receive_by_type(__be16 type);
struct packet_offload *gro_find_complete_by_type(__be16 type);
static inline void napi_free_frags(struct napi_struct *napi)
{
kfree_skb(napi->skb);
napi->skb = NULL;
}
bool netdev_is_rx_handler_busy(struct net_device *dev);
int netdev_rx_handler_register(struct net_device *dev,
rx_handler_func_t *rx_handler,
void *rx_handler_data);
void netdev_rx_handler_unregister(struct net_device *dev);
bool dev_valid_name(const char *name);
static inline bool is_socket_ioctl_cmd(unsigned int cmd)
{
return _IOC_TYPE(cmd) == SOCK_IOC_TYPE;
}
int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg);
int put_user_ifreq(struct ifreq *ifr, void __user *arg);
int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
void __user *data, bool *need_copyout);
int dev_ifconf(struct net *net, struct ifconf __user *ifc);
int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata);
unsigned int dev_get_flags(const struct net_device *);
int __dev_change_flags(struct net_device *dev, unsigned int flags,
struct netlink_ext_ack *extack);
int dev_change_flags(struct net_device *dev, unsigned int flags,
struct netlink_ext_ack *extack);
void __dev_notify_flags(struct net_device *, unsigned int old_flags,
unsigned int gchanges);
int dev_change_name(struct net_device *, const char *);
int dev_set_alias(struct net_device *, const char *, size_t);
int dev_get_alias(const struct net_device *, char *, size_t);
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
const char *pat, int new_ifindex);
static inline
int dev_change_net_namespace(struct net_device *dev, struct net *net,
const char *pat)
{
return __dev_change_net_namespace(dev, net, pat, 0);
}
int __dev_set_mtu(struct net_device *, int);
int dev_validate_mtu(struct net_device *dev, int mtu,
struct netlink_ext_ack *extack);
int dev_set_mtu_ext(struct net_device *dev, int mtu,
struct netlink_ext_ack *extack);
int dev_set_mtu(struct net_device *, int);
int dev_change_tx_queue_len(struct net_device *, unsigned long);
void dev_set_group(struct net_device *, int);
int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
struct netlink_ext_ack *extack);
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
struct netlink_ext_ack *extack);
int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
struct netlink_ext_ack *extack);
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
int dev_change_carrier(struct net_device *, bool new_carrier);
int dev_get_phys_port_id(struct net_device *dev,
struct netdev_phys_item_id *ppid);
int dev_get_phys_port_name(struct net_device *dev,
char *name, size_t len);
int dev_get_port_parent_id(struct net_device *dev,
struct netdev_phys_item_id *ppid, bool recurse);
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);
int dev_change_proto_down(struct net_device *dev, bool proto_down);
int dev_change_proto_down_generic(struct net_device *dev, bool proto_down);
void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
u32 value);
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again);
struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq, int *ret);
typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
int fd, int expected_fd, u32 flags);
int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
u8 dev_xdp_prog_count(struct net_device *dev);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb);
bool is_skb_forwardable(const struct net_device *dev,
const struct sk_buff *skb);
static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
const struct sk_buff *skb,
const bool check_mtu)
{
const u32 vlan_hdr_len = 4; /* VLAN_HLEN */
unsigned int len;
if (!(dev->flags & IFF_UP))
return false;
if (!check_mtu)
return true;
len = dev->mtu + dev->hard_header_len + vlan_hdr_len;
if (skb->len <= len)
return true;
/* if TSO is enabled, we don't care about the length as the packet
* could be forwarded without being segmented before
*/
if (skb_is_gso(skb))
return true;
return false;
}
static __always_inline int ____dev_forward_skb(struct net_device *dev,
struct sk_buff *skb,
const bool check_mtu)
{
if (skb_orphan_frags(skb, GFP_ATOMIC) ||
unlikely(!__is_skb_forwardable(dev, skb, check_mtu))) {
atomic_long_inc(&dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
skb_scrub_packet(skb, !net_eq(dev_net(dev), dev_net(skb->dev)));
skb->priority = 0;
return 0;
}
bool dev_nit_active(struct net_device *dev);
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
extern int netdev_budget;
extern unsigned int netdev_budget_usecs;
/* Called by rtnetlink.c:rtnl_unlock() */
void netdev_run_todo(void);
/**
* dev_put - release reference to device
* @dev: network device
*
* Release reference to device to allow it to be freed.
*/
static inline void dev_put(struct net_device *dev)
{
if (dev) {
#ifdef CONFIG_PCPU_DEV_REFCNT
this_cpu_dec(*dev->pcpu_refcnt);
#else
refcount_dec(&dev->dev_refcnt);
#endif
}
}
/**
* dev_hold - get reference to device
* @dev: network device
*
* Hold reference to device to keep it from being freed.
*/
static inline void dev_hold(struct net_device *dev)
{
if (dev) {
#ifdef CONFIG_PCPU_DEV_REFCNT
this_cpu_inc(*dev->pcpu_refcnt);
#else
refcount_inc(&dev->dev_refcnt);
#endif
}
}
/* Carrier loss detection, dial on demand. The functions netif_carrier_on
* and _off may be called from IRQ context, but it is caller
* who is responsible for serialization of these calls.
*
* The name carrier is inappropriate, these functions should really be
* called netif_lowerlayer_*() because they represent the state of any
* kind of lower layer not just hardware media.
*/
void linkwatch_init_dev(struct net_device *dev);
void linkwatch_fire_event(struct net_device *dev);
void linkwatch_forget_dev(struct net_device *dev);
/**
* netif_carrier_ok - test if carrier present
* @dev: network device
*
* Check if carrier is present on device
*/
static inline bool netif_carrier_ok(const struct net_device *dev)
{
return !test_bit(__LINK_STATE_NOCARRIER, &dev->state);
}
unsigned long dev_trans_start(struct net_device *dev);
void __netdev_watchdog_up(struct net_device *dev);
void netif_carrier_on(struct net_device *dev);
void netif_carrier_off(struct net_device *dev);
void netif_carrier_event(struct net_device *dev);
/**
* netif_dormant_on - mark device as dormant.
* @dev: network device
*
* Mark device as dormant (as per RFC2863).
*
* The dormant state indicates that the relevant interface is not
* actually in a condition to pass packets (i.e., it is not 'up') but is
* in a "pending" state, waiting for some external event. For "on-
* demand" interfaces, this new state identifies the situation where the
* interface is waiting for events to place it in the up state.
*/
static inline void netif_dormant_on(struct net_device *dev)
{
if (!test_and_set_bit(__LINK_STATE_DORMANT, &dev->state))
linkwatch_fire_event(dev);
}
/**
* netif_dormant_off - set device as not dormant.
* @dev: network device
*
* Device is not in dormant state.
*/
static inline void netif_dormant_off(struct net_device *dev)
{
if (test_and_clear_bit(__LINK_STATE_DORMANT, &dev->state))
linkwatch_fire_event(dev);
}
/**
* netif_dormant - test if device is dormant
* @dev: network device
*
* Check if device is dormant.
*/
static inline bool netif_dormant(const struct net_device *dev)
{
return test_bit(__LINK_STATE_DORMANT, &dev->state);
}
/**
* netif_testing_on - mark device as under test.
* @dev: network device
*
* Mark device as under test (as per RFC2863).
*
* The testing state indicates that some test(s) must be performed on
* the interface. After completion, of the test, the interface state
* will change to up, dormant, or down, as appropriate.
*/
static inline void netif_testing_on(struct net_device *dev)
{
if (!test_and_set_bit(__LINK_STATE_TESTING, &dev->state))
linkwatch_fire_event(dev);
}
/**
* netif_testing_off - set device as not under test.
* @dev: network device
*
* Device is not in testing state.
*/
static inline void netif_testing_off(struct net_device *dev)
{
if (test_and_clear_bit(__LINK_STATE_TESTING, &dev->state))
linkwatch_fire_event(dev);
}
/**
* netif_testing - test if device is under test
* @dev: network device
*
* Check if device is under test
*/
static inline bool netif_testing(const struct net_device *dev)
{
return test_bit(__LINK_STATE_TESTING, &dev->state);
}
/**
* netif_oper_up - test if device is operational
* @dev: network device
*
* Check if carrier is operational
*/
static inline bool netif_oper_up(const struct net_device *dev)
{
return (dev->operstate == IF_OPER_UP ||
dev->operstate == IF_OPER_UNKNOWN /* backward compat */);
}
/**
* netif_device_present - is device available or removed
* @dev: network device
*
* Check if device has not been removed from system.
*/
static inline bool netif_device_present(const struct net_device *dev)
{
return test_bit(__LINK_STATE_PRESENT, &dev->state);
}
void netif_device_detach(struct net_device *dev);
void netif_device_attach(struct net_device *dev);
/*
* Network interface message level settings
*/
enum {
NETIF_MSG_DRV_BIT,
NETIF_MSG_PROBE_BIT,
NETIF_MSG_LINK_BIT,
NETIF_MSG_TIMER_BIT,
NETIF_MSG_IFDOWN_BIT,
NETIF_MSG_IFUP_BIT,
NETIF_MSG_RX_ERR_BIT,
NETIF_MSG_TX_ERR_BIT,
NETIF_MSG_TX_QUEUED_BIT,
NETIF_MSG_INTR_BIT,
NETIF_MSG_TX_DONE_BIT,
NETIF_MSG_RX_STATUS_BIT,
NETIF_MSG_PKTDATA_BIT,
NETIF_MSG_HW_BIT,
NETIF_MSG_WOL_BIT,
/* When you add a new bit above, update netif_msg_class_names array
* in net/ethtool/common.c
*/
NETIF_MSG_CLASS_COUNT,
};
/* Both ethtool_ops interface and internal driver implementation use u32 */
static_assert(NETIF_MSG_CLASS_COUNT <= 32);
#define __NETIF_MSG_BIT(bit) ((u32)1 << (bit))
#define __NETIF_MSG(name) __NETIF_MSG_BIT(NETIF_MSG_ ## name ## _BIT)
#define NETIF_MSG_DRV __NETIF_MSG(DRV)
#define NETIF_MSG_PROBE __NETIF_MSG(PROBE)
#define NETIF_MSG_LINK __NETIF_MSG(LINK)
#define NETIF_MSG_TIMER __NETIF_MSG(TIMER)
#define NETIF_MSG_IFDOWN __NETIF_MSG(IFDOWN)
#define NETIF_MSG_IFUP __NETIF_MSG(IFUP)
#define NETIF_MSG_RX_ERR __NETIF_MSG(RX_ERR)
#define NETIF_MSG_TX_ERR __NETIF_MSG(TX_ERR)
#define NETIF_MSG_TX_QUEUED __NETIF_MSG(TX_QUEUED)
#define NETIF_MSG_INTR __NETIF_MSG(INTR)
#define NETIF_MSG_TX_DONE __NETIF_MSG(TX_DONE)
#define NETIF_MSG_RX_STATUS __NETIF_MSG(RX_STATUS)
#define NETIF_MSG_PKTDATA __NETIF_MSG(PKTDATA)
#define NETIF_MSG_HW __NETIF_MSG(HW)
#define NETIF_MSG_WOL __NETIF_MSG(WOL)
#define netif_msg_drv(p) ((p)->msg_enable & NETIF_MSG_DRV)
#define netif_msg_probe(p) ((p)->msg_enable & NETIF_MSG_PROBE)
#define netif_msg_link(p) ((p)->msg_enable & NETIF_MSG_LINK)
#define netif_msg_timer(p) ((p)->msg_enable & NETIF_MSG_TIMER)
#define netif_msg_ifdown(p) ((p)->msg_enable & NETIF_MSG_IFDOWN)
#define netif_msg_ifup(p) ((p)->msg_enable & NETIF_MSG_IFUP)
#define netif_msg_rx_err(p) ((p)->msg_enable & NETIF_MSG_RX_ERR)
#define netif_msg_tx_err(p) ((p)->msg_enable & NETIF_MSG_TX_ERR)
#define netif_msg_tx_queued(p) ((p)->msg_enable & NETIF_MSG_TX_QUEUED)
#define netif_msg_intr(p) ((p)->msg_enable & NETIF_MSG_INTR)
#define netif_msg_tx_done(p) ((p)->msg_enable & NETIF_MSG_TX_DONE)
#define netif_msg_rx_status(p) ((p)->msg_enable & NETIF_MSG_RX_STATUS)
#define netif_msg_pktdata(p) ((p)->msg_enable & NETIF_MSG_PKTDATA)
#define netif_msg_hw(p) ((p)->msg_enable & NETIF_MSG_HW)
#define netif_msg_wol(p) ((p)->msg_enable & NETIF_MSG_WOL)
static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
{
/* use default */
if (debug_value < 0 || debug_value >= (sizeof(u32) * 8))
return default_msg_enable_bits;
if (debug_value == 0) /* no output */
return 0;
/* set low N bits */
return (1U << debug_value) - 1;
}
static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
{
spin_lock(&txq->_xmit_lock);
/* Pairs with READ_ONCE() in __dev_queue_xmit() */
WRITE_ONCE(txq->xmit_lock_owner, cpu);
}
static inline bool __netif_tx_acquire(struct netdev_queue *txq)
{
__acquire(&txq->_xmit_lock);
return true;
}
static inline void __netif_tx_release(struct netdev_queue *txq)
{
__release(&txq->_xmit_lock);
}
static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
{
spin_lock_bh(&txq->_xmit_lock);
/* Pairs with READ_ONCE() in __dev_queue_xmit() */
WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
}
static inline bool __netif_tx_trylock(struct netdev_queue *txq)
{
bool ok = spin_trylock(&txq->_xmit_lock);
if (likely(ok)) {
/* Pairs with READ_ONCE() in __dev_queue_xmit() */
WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
}
return ok;
}
static inline void __netif_tx_unlock(struct netdev_queue *txq)
{
/* Pairs with READ_ONCE() in __dev_queue_xmit() */
WRITE_ONCE(txq->xmit_lock_owner, -1);
spin_unlock(&txq->_xmit_lock);
}
static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
{
/* Pairs with READ_ONCE() in __dev_queue_xmit() */
WRITE_ONCE(txq->xmit_lock_owner, -1);
spin_unlock_bh(&txq->_xmit_lock);
}
static inline void txq_trans_update(struct netdev_queue *txq)
{
if (txq->xmit_lock_owner != -1)
txq->trans_start = jiffies;
}
/* legacy drivers only, netdev_start_xmit() sets txq->trans_start */
static inline void netif_trans_update(struct net_device *dev)
{
struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
if (txq->trans_start != jiffies)
txq->trans_start = jiffies;
}
/**
* netif_tx_lock - grab network device transmit lock
* @dev: network device
*
* Get network device transmit lock
*/
static inline void netif_tx_lock(struct net_device *dev)
{
unsigned int i;
int cpu;
spin_lock(&dev->tx_global_lock);
cpu = smp_processor_id();
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
/* We are the only thread of execution doing a
* freeze, but we have to grab the _xmit_lock in
* order to synchronize with threads which are in
* the ->hard_start_xmit() handler and already
* checked the frozen bit.
*/
__netif_tx_lock(txq, cpu);
set_bit(__QUEUE_STATE_FROZEN, &txq->state);
__netif_tx_unlock(txq);
}
}
static inline void netif_tx_lock_bh(struct net_device *dev)
{
local_bh_disable();
netif_tx_lock(dev);
}
static inline void netif_tx_unlock(struct net_device *dev)
{
unsigned int i;
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
/* No need to grab the _xmit_lock here. If the
* queue is not stopped for another reason, we
* force a schedule.
*/
clear_bit(__QUEUE_STATE_FROZEN, &txq->state);
netif_schedule_queue(txq);
}
spin_unlock(&dev->tx_global_lock);
}
static inline void netif_tx_unlock_bh(struct net_device *dev)
{
netif_tx_unlock(dev);
local_bh_enable();
}
#define HARD_TX_LOCK(dev, txq, cpu) { \
if ((dev->features & NETIF_F_LLTX) == 0) { \
__netif_tx_lock(txq, cpu); \
} else { \
__netif_tx_acquire(txq); \
} \
}
#define HARD_TX_TRYLOCK(dev, txq) \
(((dev->features & NETIF_F_LLTX) == 0) ? \
__netif_tx_trylock(txq) : \
__netif_tx_acquire(txq))
#define HARD_TX_UNLOCK(dev, txq) { \
if ((dev->features & NETIF_F_LLTX) == 0) { \
__netif_tx_unlock(txq); \
} else { \
__netif_tx_release(txq); \
} \
}
static inline void netif_tx_disable(struct net_device *dev)
{
unsigned int i;
int cpu;
local_bh_disable();
cpu = smp_processor_id();
spin_lock(&dev->tx_global_lock);
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
__netif_tx_lock(txq, cpu);
netif_tx_stop_queue(txq);
__netif_tx_unlock(txq);
}
spin_unlock(&dev->tx_global_lock);
local_bh_enable();
}
static inline void netif_addr_lock(struct net_device *dev)
{
unsigned char nest_level = 0;
#ifdef CONFIG_LOCKDEP
nest_level = dev->nested_level;
#endif
spin_lock_nested(&dev->addr_list_lock, nest_level);
}
static inline void netif_addr_lock_bh(struct net_device *dev)
{
unsigned char nest_level = 0;
#ifdef CONFIG_LOCKDEP
nest_level = dev->nested_level;
#endif
local_bh_disable();
spin_lock_nested(&dev->addr_list_lock, nest_level);
}
static inline void netif_addr_unlock(struct net_device *dev)
{
spin_unlock(&dev->addr_list_lock);
}
static inline void netif_addr_unlock_bh(struct net_device *dev)
{
spin_unlock_bh(&dev->addr_list_lock);
}
/*
* dev_addrs walker. Should be used only for read access. Call with
* rcu_read_lock held.
*/
#define for_each_dev_addr(dev, ha) \
list_for_each_entry_rcu(ha, &dev->dev_addrs.list, list)
/* These functions live elsewhere (drivers/net/net_init.c, but related) */
void ether_setup(struct net_device *dev);
/* Support for loadable net-drivers */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned char name_assign_type,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs);
#define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \
alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)
#define alloc_netdev_mq(sizeof_priv, name, name_assign_type, setup, count) \
alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, count, \
count)
int register_netdev(struct net_device *dev);
void unregister_netdev(struct net_device *dev);
int devm_register_netdev(struct device *dev, struct net_device *ndev);
/* General hardware address lists handling functions */
int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
struct netdev_hw_addr_list *from_list, int addr_len);
void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
struct netdev_hw_addr_list *from_list, int addr_len);
int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
struct net_device *dev,
int (*sync)(struct net_device *, const unsigned char *),
int (*unsync)(struct net_device *,
const unsigned char *));
int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list,
struct net_device *dev,
int (*sync)(struct net_device *,
const unsigned char *, int),
int (*unsync)(struct net_device *,
const unsigned char *, int));
void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list,
struct net_device *dev,
int (*unsync)(struct net_device *,
const unsigned char *, int));
void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
struct net_device *dev,
int (*unsync)(struct net_device *,
const unsigned char *));
void __hw_addr_init(struct netdev_hw_addr_list *list);
/* Functions used for device addresses handling */
static inline void
__dev_addr_set(struct net_device *dev, const u8 *addr, size_t len)
{
memcpy(dev->dev_addr, addr, len);
}
static inline void dev_addr_set(struct net_device *dev, const u8 *addr)
{
__dev_addr_set(dev, addr, dev->addr_len);
}
static inline void
dev_addr_mod(struct net_device *dev, unsigned int offset,
const u8 *addr, size_t len)
{
memcpy(&dev->dev_addr[offset], addr, len);
}
int dev_addr_add(struct net_device *dev, const unsigned char *addr,
unsigned char addr_type);
int dev_addr_del(struct net_device *dev, const unsigned char *addr,
unsigned char addr_type);
void dev_addr_flush(struct net_device *dev);
int dev_addr_init(struct net_device *dev);
/* Functions used for unicast addresses handling */
int dev_uc_add(struct net_device *dev, const unsigned char *addr);
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_uc_del(struct net_device *dev, const unsigned char *addr);
int dev_uc_sync(struct net_device *to, struct net_device *from);
int dev_uc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_uc_unsync(struct net_device *to, struct net_device *from);
void dev_uc_flush(struct net_device *dev);
void dev_uc_init(struct net_device *dev);
/**
* __dev_uc_sync - Synchonize device's unicast list
* @dev: device to sync
* @sync: function to call if address should be added
* @unsync: function to call if address should be removed
*
* Add newly added addresses to the interface, and release
* addresses that have been deleted.
*/
static inline int __dev_uc_sync(struct net_device *dev,
int (*sync)(struct net_device *,
const unsigned char *),
int (*unsync)(struct net_device *,
const unsigned char *))
{
return __hw_addr_sync_dev(&dev->uc, dev, sync, unsync);
}
/**
* __dev_uc_unsync - Remove synchronized addresses from device
* @dev: device to sync
* @unsync: function to call if address should be removed
*
* Remove all addresses that were added to the device by dev_uc_sync().
*/
static inline void __dev_uc_unsync(struct net_device *dev,
int (*unsync)(struct net_device *,
const unsigned char *))
{
__hw_addr_unsync_dev(&dev->uc, dev, unsync);
}
/* Functions used for multicast addresses handling */
int dev_mc_add(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_mc_del(struct net_device *dev, const unsigned char *addr);
int dev_mc_del_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_sync(struct net_device *to, struct net_device *from);
int dev_mc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_mc_unsync(struct net_device *to, struct net_device *from);
void dev_mc_flush(struct net_device *dev);
void dev_mc_init(struct net_device *dev);
/**
* __dev_mc_sync - Synchonize device's multicast list
* @dev: device to sync
* @sync: function to call if address should be added
* @unsync: function to call if address should be removed
*
* Add newly added addresses to the interface, and release
* addresses that have been deleted.
*/
static inline int __dev_mc_sync(struct net_device *dev,
int (*sync)(struct net_device *,
const unsigned char *),
int (*unsync)(struct net_device *,
const unsigned char *))
{
return __hw_addr_sync_dev(&dev->mc, dev, sync, unsync);
}
/**
* __dev_mc_unsync - Remove synchronized addresses from device
* @dev: device to sync
* @unsync: function to call if address should be removed
*
* Remove all addresses that were added to the device by dev_mc_sync().
*/
static inline void __dev_mc_unsync(struct net_device *dev,
int (*unsync)(struct net_device *,
const unsigned char *))
{
__hw_addr_unsync_dev(&dev->mc, dev, unsync);
}
/* Functions used for secondary unicast and multicast support */
void dev_set_rx_mode(struct net_device *dev);
void __dev_set_rx_mode(struct net_device *dev);
int dev_set_promiscuity(struct net_device *dev, int inc);
int dev_set_allmulti(struct net_device *dev, int inc);
void netdev_state_change(struct net_device *dev);
void __netdev_notify_peers(struct net_device *dev);
void netdev_notify_peers(struct net_device *dev);
void netdev_features_change(struct net_device *dev);
/* Load a device via the kmod */
void dev_load(struct net *net, const char *name);
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
struct rtnl_link_stats64 *storage);
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
const struct net_device_stats *netdev_stats);
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
const struct pcpu_sw_netstats __percpu *netstats);
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s);
extern int netdev_max_backlog;
extern int netdev_tstamp_prequeue;
extern int netdev_unregister_timeout_secs;
extern int weight_p;
extern int dev_weight_rx_bias;
extern int dev_weight_tx_bias;
extern int dev_rx_weight;
extern int dev_tx_weight;
extern int gro_normal_batch;
enum {
NESTED_SYNC_IMM_BIT,
NESTED_SYNC_TODO_BIT,
};
#define __NESTED_SYNC_BIT(bit) ((u32)1 << (bit))
#define __NESTED_SYNC(name) __NESTED_SYNC_BIT(NESTED_SYNC_ ## name ## _BIT)
#define NESTED_SYNC_IMM __NESTED_SYNC(IMM)
#define NESTED_SYNC_TODO __NESTED_SYNC(TODO)
struct netdev_nested_priv {
unsigned char flags;
void *data;
};
bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
struct list_head **iter);
struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
struct list_head **iter);
#ifdef CONFIG_LOCKDEP
static LIST_HEAD(net_unlink_list);
static inline void net_unlink_todo(struct net_device *dev)
{
if (list_empty(&dev->unlink_list))
list_add_tail(&dev->unlink_list, &net_unlink_list);
}
#endif
/* iterate through upper list, must be called under RCU read lock */
#define netdev_for_each_upper_dev_rcu(dev, updev, iter) \
for (iter = &(dev)->adj_list.upper, \
updev = netdev_upper_get_next_dev_rcu(dev, &(iter)); \
updev; \
updev = netdev_upper_get_next_dev_rcu(dev, &(iter)))
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
int (*fn)(struct net_device *upper_dev,
struct netdev_nested_priv *priv),
struct netdev_nested_priv *priv);
bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
struct net_device *upper_dev);
bool netdev_has_any_upper_dev(struct net_device *dev);
void *netdev_lower_get_next_private(struct net_device *dev,
struct list_head **iter);
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
struct list_head **iter);
#define netdev_for_each_lower_private(dev, priv, iter) \
for (iter = (dev)->adj_list.lower.next, \
priv = netdev_lower_get_next_private(dev, &(iter)); \
priv; \
priv = netdev_lower_get_next_private(dev, &(iter)))
#define netdev_for_each_lower_private_rcu(dev, priv, iter) \
for (iter = &(dev)->adj_list.lower, \
priv = netdev_lower_get_next_private_rcu(dev, &(iter)); \
priv; \
priv = netdev_lower_get_next_private_rcu(dev, &(iter)))
void *netdev_lower_get_next(struct net_device *dev,
struct list_head **iter);
#define netdev_for_each_lower_dev(dev, ldev, iter) \
for (iter = (dev)->adj_list.lower.next, \
ldev = netdev_lower_get_next(dev, &(iter)); \
ldev; \
ldev = netdev_lower_get_next(dev, &(iter)))
struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
struct list_head **iter);
int netdev_walk_all_lower_dev(struct net_device *dev,
int (*fn)(struct net_device *lower_dev,
struct netdev_nested_priv *priv),
struct netdev_nested_priv *priv);
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
int (*fn)(struct net_device *lower_dev,
struct netdev_nested_priv *priv),
struct netdev_nested_priv *priv);
void *netdev_adjacent_get_private(struct list_head *adj_list);
void *netdev_lower_get_first_private_rcu(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev);
int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev,
struct netlink_ext_ack *extack);
int netdev_master_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev,
void *upper_priv, void *upper_info,
struct netlink_ext_ack *extack);
void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev);
int netdev_adjacent_change_prepare(struct net_device *old_dev,
struct net_device *new_dev,
struct net_device *dev,
struct netlink_ext_ack *extack);
void netdev_adjacent_change_commit(struct net_device *old_dev,
struct net_device *new_dev,
struct net_device *dev);
void netdev_adjacent_change_abort(struct net_device *old_dev,
struct net_device *new_dev,
struct net_device *dev);
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname);
void *netdev_lower_dev_get_private(struct net_device *dev,
struct net_device *lower_dev);
void netdev_lower_state_changed(struct net_device *lower_dev,
void *lower_state_info);
/* RSS keys are 40 or 52 bytes long */
#define NETDEV_RSS_KEY_LEN 52
extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
void netdev_rss_key_fill(void *buffer, size_t len);
int skb_checksum_help(struct sk_buff *skb);
int skb_crc32c_csum_help(struct sk_buff *skb);
int skb_csum_hwoffload_help(struct sk_buff *skb,
const netdev_features_t features);
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path);
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
netdev_features_t features);
struct netdev_bonding_info {
ifslave slave;
ifbond master;
};
struct netdev_notifier_bonding_info {
struct netdev_notifier_info info; /* must be first */
struct netdev_bonding_info bonding_info;
};
void netdev_bonding_info_change(struct net_device *dev,
struct netdev_bonding_info *bonding_info);
#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK)
void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data);
#else
static inline void ethtool_notify(struct net_device *dev, unsigned int cmd,
const void *data)
{
}
#endif
static inline
struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features)
{
return __skb_gso_segment(skb, features, true);
}
__be16 skb_network_protocol(struct sk_buff *skb, int *depth);
static inline bool can_checksum_protocol(netdev_features_t features,
__be16 protocol)
{
if (protocol == htons(ETH_P_FCOE)) return !!(features & NETIF_F_FCOE_CRC);
/* Assume this is an IP checksum (not SCTP CRC) */
if (features & NETIF_F_HW_CSUM) {
/* Can checksum everything */
return true;
}
switch (protocol) {
case htons(ETH_P_IP):
return !!(features & NETIF_F_IP_CSUM);
case htons(ETH_P_IPV6):
return !!(features & NETIF_F_IPV6_CSUM);
default:
return false;
}
}
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb);
#else
static inline void netdev_rx_csum_fault(struct net_device *dev,
struct sk_buff *skb)
{
}
#endif
/* rx skb timestamps */
void net_enable_timestamp(void);
void net_disable_timestamp(void);
#ifdef CONFIG_PROC_FS
int __init dev_proc_init(void);
#else
#define dev_proc_init() 0
#endif
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
struct sk_buff *skb, struct net_device *dev,
bool more)
{
__this_cpu_write(softnet_data.xmit.more, more);
return ops->ndo_start_xmit(skb, dev);
}
static inline bool netdev_xmit_more(void)
{
return __this_cpu_read(softnet_data.xmit.more);
}
static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq, bool more)
{
const struct net_device_ops *ops = dev->netdev_ops;
netdev_tx_t rc;
rc = __netdev_start_xmit(ops, skb, dev, more);
if (rc == NETDEV_TX_OK)
txq_trans_update(txq);
return rc;
}
int netdev_class_create_file_ns(const struct class_attribute *class_attr,
const void *ns);
void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
const void *ns);
extern const struct kobj_ns_type_operations net_ns_type_operations;
const char *netdev_drivername(const struct net_device *dev);
void linkwatch_run_queue(void);
static inline netdev_features_t netdev_intersect_features(netdev_features_t f1,
netdev_features_t f2)
{
if ((f1 ^ f2) & NETIF_F_HW_CSUM) {
if (f1 & NETIF_F_HW_CSUM) f1 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
else
f2 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
}
return f1 & f2;
}
static inline netdev_features_t netdev_get_wanted_features(
struct net_device *dev)
{
return (dev->features & ~dev->hw_features) | dev->wanted_features;
}
netdev_features_t netdev_increment_features(netdev_features_t all,
netdev_features_t one, netdev_features_t mask);
/* Allow TSO being used on stacked device :
* Performing the GSO segmentation before last device
* is a performance improvement.
*/
static inline netdev_features_t netdev_add_tso_features(netdev_features_t features,
netdev_features_t mask)
{
return netdev_increment_features(features, NETIF_F_ALL_TSO, mask);
}
int __netdev_update_features(struct net_device *dev);
void netdev_update_features(struct net_device *dev);
void netdev_change_features(struct net_device *dev);
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
struct net_device *dev);
netdev_features_t passthru_features_check(struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features);
netdev_features_t netif_skb_features(struct sk_buff *skb);
static inline bool net_gso_ok(netdev_features_t features, int gso_type)
{
netdev_features_t feature = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT;
/* check flags correspondence */
BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_IPXIP4 != (NETIF_F_GSO_IPXIP4 >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_IPXIP6 != (NETIF_F_GSO_IPXIP6 >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT));
BUILD_BUG_ON(SKB_GSO_FRAGLIST != (NETIF_F_GSO_FRAGLIST >> NETIF_F_GSO_SHIFT));
return (features & feature) == feature;
}
static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
{
return net_gso_ok(features, skb_shinfo(skb)->gso_type) && (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST));
}
static inline bool netif_needs_gso(struct sk_buff *skb,
netdev_features_t features)
{
return skb_is_gso(skb) && (!skb_gso_ok(skb, features) || unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
(skb->ip_summed != CHECKSUM_UNNECESSARY)));
}
static inline void netif_set_gso_max_size(struct net_device *dev,
unsigned int size)
{
dev->gso_max_size = size;
}
static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
int pulled_hlen, u16 mac_offset,
int mac_len)
{
skb->protocol = protocol;
skb->encapsulation = 1;
skb_push(skb, pulled_hlen);
skb_reset_transport_header(skb);
skb->mac_header = mac_offset;
skb->network_header = skb->mac_header + mac_len;
skb->mac_len = mac_len;
}
static inline bool netif_is_macsec(const struct net_device *dev)
{
return dev->priv_flags & IFF_MACSEC;
}
static inline bool netif_is_macvlan(const struct net_device *dev)
{
return dev->priv_flags & IFF_MACVLAN;
}
static inline bool netif_is_macvlan_port(const struct net_device *dev)
{
return dev->priv_flags & IFF_MACVLAN_PORT;
}
static inline bool netif_is_bond_master(const struct net_device *dev)
{
return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING;
}
static inline bool netif_is_bond_slave(const struct net_device *dev)
{
return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING;
}
static inline bool netif_supports_nofcs(struct net_device *dev)
{
return dev->priv_flags & IFF_SUPP_NOFCS;
}
static inline bool netif_has_l3_rx_handler(const struct net_device *dev)
{
return dev->priv_flags & IFF_L3MDEV_RX_HANDLER;
}
static inline bool netif_is_l3_master(const struct net_device *dev)
{
return dev->priv_flags & IFF_L3MDEV_MASTER;
}
static inline bool netif_is_l3_slave(const struct net_device *dev)
{
return dev->priv_flags & IFF_L3MDEV_SLAVE;
}
static inline bool netif_is_bridge_master(const struct net_device *dev)
{
return dev->priv_flags & IFF_EBRIDGE;
}
static inline bool netif_is_bridge_port(const struct net_device *dev)
{
return dev->priv_flags & IFF_BRIDGE_PORT;
}
static inline bool netif_is_ovs_master(const struct net_device *dev)
{
return dev->priv_flags & IFF_OPENVSWITCH;
}
static inline bool netif_is_ovs_port(const struct net_device *dev)
{
return dev->priv_flags & IFF_OVS_DATAPATH;
}
static inline bool netif_is_any_bridge_port(const struct net_device *dev)
{
return netif_is_bridge_port(dev) || netif_is_ovs_port(dev);
}
static inline bool netif_is_team_master(const struct net_device *dev)
{
return dev->priv_flags & IFF_TEAM;
}
static inline bool netif_is_team_port(const struct net_device *dev)
{
return dev->priv_flags & IFF_TEAM_PORT;
}
static inline bool netif_is_lag_master(const struct net_device *dev)
{
return netif_is_bond_master(dev) || netif_is_team_master(dev);
}
static inline bool netif_is_lag_port(const struct net_device *dev)
{
return netif_is_bond_slave(dev) || netif_is_team_port(dev);
}
static inline bool netif_is_rxfh_configured(const struct net_device *dev)
{
return dev->priv_flags & IFF_RXFH_CONFIGURED;
}
static inline bool netif_is_failover(const struct net_device *dev)
{
return dev->priv_flags & IFF_FAILOVER;
}
static inline bool netif_is_failover_slave(const struct net_device *dev)
{
return dev->priv_flags & IFF_FAILOVER_SLAVE;
}
/* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
static inline void netif_keep_dst(struct net_device *dev)
{
dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM);
}
/* return true if dev can't cope with mtu frames that need vlan tag insertion */
static inline bool netif_reduces_vlan_mtu(struct net_device *dev)
{
/* TODO: reserve and use an additional IFF bit, if we get more users */
return dev->priv_flags & IFF_MACSEC;
}
extern struct pernet_operations __net_initdata loopback_net_ops;
/* Logging, debugging and troubleshooting/diagnostic helpers. */
/* netdev_printk helpers, similar to dev_printk */
static inline const char *netdev_name(const struct net_device *dev)
{
if (!dev->name[0] || strchr(dev->name, '%'))
return "(unnamed net_device)";
return dev->name;
}
static inline bool netdev_unregistering(const struct net_device *dev)
{
return dev->reg_state == NETREG_UNREGISTERING;
}
static inline const char *netdev_reg_state(const struct net_device *dev)
{
switch (dev->reg_state) {
case NETREG_UNINITIALIZED: return " (uninitialized)";
case NETREG_REGISTERED: return "";
case NETREG_UNREGISTERING: return " (unregistering)";
case NETREG_UNREGISTERED: return " (unregistered)";
case NETREG_RELEASED: return " (released)";
case NETREG_DUMMY: return " (dummy)";
}
WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, dev->reg_state);
return " (unknown)";
}
__printf(3, 4) __cold
void netdev_printk(const char *level, const struct net_device *dev,
const char *format, ...);
__printf(2, 3) __cold
void netdev_emerg(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_alert(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_crit(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_err(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_warn(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_notice(const struct net_device *dev, const char *format, ...);
__printf(2, 3) __cold
void netdev_info(const struct net_device *dev, const char *format, ...);
#define netdev_level_once(level, dev, fmt, ...) \
do { \
static bool __print_once __read_mostly; \
\
if (!__print_once) { \
__print_once = true; \
netdev_printk(level, dev, fmt, ##__VA_ARGS__); \
} \
} while (0)
#define netdev_emerg_once(dev, fmt, ...) \
netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__)
#define netdev_alert_once(dev, fmt, ...) \
netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__)
#define netdev_crit_once(dev, fmt, ...) \
netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__)
#define netdev_err_once(dev, fmt, ...) \
netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__)
#define netdev_warn_once(dev, fmt, ...) \
netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__)
#define netdev_notice_once(dev, fmt, ...) \
netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__)
#define netdev_info_once(dev, fmt, ...) \
netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__)
#define MODULE_ALIAS_NETDEV(device) \
MODULE_ALIAS("netdev-" device)
#if defined(CONFIG_DYNAMIC_DEBUG) || \
(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define netdev_dbg(__dev, format, args...) \
do { \
dynamic_netdev_dbg(__dev, format, ##args); \
} while (0)
#elif defined(DEBUG)
#define netdev_dbg(__dev, format, args...) \
netdev_printk(KERN_DEBUG, __dev, format, ##args)
#else
#define netdev_dbg(__dev, format, args...) \
({ \
if (0) \
netdev_printk(KERN_DEBUG, __dev, format, ##args); \
})
#endif
#if defined(VERBOSE_DEBUG)
#define netdev_vdbg netdev_dbg
#else
#define netdev_vdbg(dev, format, args...) \
({ \
if (0) \
netdev_printk(KERN_DEBUG, dev, format, ##args); \
0; \
})
#endif
/*
* netdev_WARN() acts like dev_printk(), but with the key difference
* of using a WARN/WARN_ON to get the message out, including the
* file/line information and a backtrace.
*/
#define netdev_WARN(dev, format, args...) \
WARN(1, "netdevice: %s%s: " format, netdev_name(dev), \
netdev_reg_state(dev), ##args)
#define netdev_WARN_ONCE(dev, format, args...) \
WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev), \
netdev_reg_state(dev), ##args)
/* netif printk helpers, similar to netdev_printk */
#define netif_printk(priv, type, level, dev, fmt, args...) \
do { \
if (netif_msg_##type(priv)) \
netdev_printk(level, (dev), fmt, ##args); \
} while (0)
#define netif_level(level, priv, type, dev, fmt, args...) \
do { \
if (netif_msg_##type(priv)) \
netdev_##level(dev, fmt, ##args); \
} while (0)
#define netif_emerg(priv, type, dev, fmt, args...) \
netif_level(emerg, priv, type, dev, fmt, ##args)
#define netif_alert(priv, type, dev, fmt, args...) \
netif_level(alert, priv, type, dev, fmt, ##args)
#define netif_crit(priv, type, dev, fmt, args...) \
netif_level(crit, priv, type, dev, fmt, ##args)
#define netif_err(priv, type, dev, fmt, args...) \
netif_level(err, priv, type, dev, fmt, ##args)
#define netif_warn(priv, type, dev, fmt, args...) \
netif_level(warn, priv, type, dev, fmt, ##args)
#define netif_notice(priv, type, dev, fmt, args...) \
netif_level(notice, priv, type, dev, fmt, ##args)
#define netif_info(priv, type, dev, fmt, args...) \
netif_level(info, priv, type, dev, fmt, ##args)
#if defined(CONFIG_DYNAMIC_DEBUG) || \
(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define netif_dbg(priv, type, netdev, format, args...) \
do { \
if (netif_msg_##type(priv)) \
dynamic_netdev_dbg(netdev, format, ##args); \
} while (0)
#elif defined(DEBUG)
#define netif_dbg(priv, type, dev, format, args...) \
netif_printk(priv, type, KERN_DEBUG, dev, format, ##args)
#else
#define netif_dbg(priv, type, dev, format, args...) \
({ \
if (0) \
netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \
0; \
})
#endif
/* if @cond then downgrade to debug, else print at @level */
#define netif_cond_dbg(priv, type, netdev, cond, level, fmt, args...) \
do { \
if (cond) \
netif_dbg(priv, type, netdev, fmt, ##args); \
else \
netif_ ## level(priv, type, netdev, fmt, ##args); \
} while (0)
#if defined(VERBOSE_DEBUG)
#define netif_vdbg netif_dbg
#else
#define netif_vdbg(priv, type, dev, format, args...) \
({ \
if (0) \
netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \
0; \
})
#endif
/*
* The list of packet types we will receive (as opposed to discard)
* and the routines to invoke.
*
* Why 16. Because with 16 the only overlap we get on a hash of the
* low nibble of the protocol value is RARP/SNAP/X.25.
*
* 0800 IP
* 0001 802.3
* 0002 AX.25
* 0004 802.2
* 8035 RARP
* 0005 SNAP
* 0805 X.25
* 0806 ARP
* 8137 IPX
* 0009 Localtalk
* 86DD IPv6
*/
#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
extern struct list_head ptype_all __read_mostly;
extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
extern struct net_device *blackhole_netdev;
#endif /* _LINUX_NETDEVICE_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/err.h>
#include <linux/uuid.h>
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
#include "qgroup.h"
#include "space-info.h"
/*
* Read a root item from the tree. In case we detect a root item smaller then
* sizeof(root_item), we know it's an old version of the root structure and
* initialize all new fields to zero. The same happens if we detect mismatching
* generation numbers as then we know the root was once mounted with an older
* kernel that was not aware of the root item structure change.
*/
static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
struct btrfs_root_item *item)
{
u32 len;
int need_reset = 0;
len = btrfs_item_size_nr(eb, slot);
read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
min_t(u32, len, sizeof(*item)));
if (len < sizeof(*item))
need_reset = 1;
if (!need_reset && btrfs_root_generation(item)
!= btrfs_root_generation_v2(item)) {
if (btrfs_root_generation_v2(item) != 0) {
btrfs_warn(eb->fs_info,
"mismatching generation and generation_v2 found in root item. This root was probably mounted with an older kernel. Resetting all new fields.");
}
need_reset = 1;
}
if (need_reset) {
memset(&item->generation_v2, 0,
sizeof(*item) - offsetof(struct btrfs_root_item,
generation_v2));
generate_random_guid(item->uuid);
}
}
/*
* btrfs_find_root - lookup the root by the key.
* root: the root of the root tree
* search_key: the key to search
* path: the path we search
* root_item: the root item of the tree we look for
* root_key: the root key of the tree we look for
*
* If ->offset of 'search_key' is -1ULL, it means we are not sure the offset
* of the search key, just lookup the root with the highest offset for a
* given objectid.
*
* If we find something return 0, otherwise > 0, < 0 on error.
*/
int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
struct btrfs_path *path, struct btrfs_root_item *root_item,
struct btrfs_key *root_key)
{
struct btrfs_key found_key;
struct extent_buffer *l;
int ret;
int slot;
ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
if (ret < 0)
return ret;
if (search_key->offset != -1ULL) { /* the search key is exact */ if (ret > 0)
goto out;
} else {
BUG_ON(ret == 0); /* Logical error */ if (path->slots[0] == 0)
goto out;
path->slots[0]--;
ret = 0;
}
l = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
if (found_key.objectid != search_key->objectid ||
found_key.type != BTRFS_ROOT_ITEM_KEY) {
ret = 1;
goto out;
}
if (root_item)
btrfs_read_root_item(l, slot, root_item);
if (root_key) memcpy(root_key, &found_key, sizeof(found_key));
out:
btrfs_release_path(path); return ret;
}
void btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node)
{
btrfs_set_root_bytenr(item, node->start);
btrfs_set_root_level(item, btrfs_header_level(node));
btrfs_set_root_generation(item, btrfs_header_generation(node));
}
/*
* copy the data in 'item' into the btree
*/
int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_root_item
*item)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *l;
int ret;
int slot;
unsigned long ptr;
u32 old_len;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret < 0)
goto out;
if (ret > 0) {
btrfs_crit(fs_info,
"unable to find root key (%llu %u %llu) in tree %llu",
key->objectid, key->type, key->offset,
root->root_key.objectid);
ret = -EUCLEAN;
btrfs_abort_transaction(trans, ret);
goto out;
}
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr_offset(l, slot);
old_len = btrfs_item_size_nr(l, slot);
/*
* If this is the first time we update the root item which originated
* from an older kernel, we need to enlarge the item size to make room
* for the added fields.
*/
if (old_len < sizeof(*item)) {
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, key, path,
-1, 1);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path,
key, sizeof(*item));
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
}
l = path->nodes[0];
slot = path->slots[0];
ptr = btrfs_item_ptr_offset(l, slot);
}
/*
* Update generation_v2 so at the next mount we know the new root
* fields are valid.
*/
btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
write_extent_buffer(l, item, ptr, sizeof(*item));
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
btrfs_free_path(path); return ret;
}
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, struct btrfs_root_item *item)
{
/*
* Make sure generation v1 and v2 match. See update_root for details.
*/
btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
return btrfs_insert_item(trans, root, key, item, sizeof(*item));
}
int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_root *root;
int err = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = 0;
while (1) {
u64 root_objectid;
ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
if (ret < 0) {
err = ret;
break;
}
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(tree_root, path);
if (ret < 0)
err = ret;
if (ret != 0)
break;
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
root_objectid = key.offset;
key.offset++;
root = btrfs_get_fs_root(fs_info, root_objectid, false);
err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
break;
} else if (err == -ENOENT) {
struct btrfs_trans_handle *trans;
btrfs_release_path(path);
trans = btrfs_join_transaction(tree_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
btrfs_handle_fs_error(fs_info, err,
"Failed to start trans to delete orphan item");
break;
}
err = btrfs_del_orphan_item(trans, tree_root,
root_objectid);
btrfs_end_transaction(trans);
if (err) {
btrfs_handle_fs_error(fs_info, err,
"Failed to delete root orphan item");
break;
}
continue;
}
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
if (btrfs_root_refs(&root->root_item) == 0) {
struct btrfs_key drop_key;
btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
/*
* If we have a non-zero drop_progress then we know we
* made it partly through deleting this snapshot, and
* thus we need to make sure we block any balance from
* happening until this snapshot is completely dropped.
*/
if (drop_key.objectid != 0 || drop_key.type != 0 ||
drop_key.offset != 0) {
set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
}
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
}
btrfs_put_root(root);
}
btrfs_free_path(path); return err;
}
/* drop the root item for 'key' from the tree root */
int btrfs_del_root(struct btrfs_trans_handle *trans,
const struct btrfs_key *key)
{
struct btrfs_root *root = trans->fs_info->tree_root;
struct btrfs_path *path;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, -1, 1);
if (ret < 0)
goto out;
BUG_ON(ret != 0);
ret = btrfs_del_item(trans, root, path);
out:
btrfs_free_path(path);
return ret;
}
int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 *sequence, const char *name,
int name_len)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_path *path;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
struct btrfs_key key;
unsigned long ptr;
int err = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = root_id;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
ptr = (unsigned long)(ref + 1);
if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
(btrfs_root_ref_name_len(leaf, ref) != name_len) ||
memcmp_extent_buffer(leaf, name, ptr, name_len)) {
err = -ENOENT;
goto out;
}
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
if (ret) {
err = ret;
goto out;
}
} else
err = -ENOENT;
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
key.objectid = ref_id;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = root_id;
goto again;
}
out:
btrfs_free_path(path);
return err;
}
/*
* add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
* or BTRFS_ROOT_BACKREF_KEY.
*
* The dirid, sequence, name and name_len refer to the directory entry
* that is referencing the root.
*
* For a forward ref, the root_id is the id of the tree referencing
* the root and ref_id is the id of the subvol or snapshot.
*
* For a back ref the root_id is the id of the subvol or snapshot and
* ref_id is the id of the tree referencing it.
*
* Will return 0, -ENOMEM, or anything from the CoW path
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 sequence, const char *name,
int name_len)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_key key;
int ret;
struct btrfs_path *path;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
unsigned long ptr;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = root_id;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = ref_id;
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
return ret;
}
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
btrfs_set_root_ref_dirid(leaf, ref, dirid);
btrfs_set_root_ref_sequence(leaf, ref, sequence);
btrfs_set_root_ref_name_len(leaf, ref, name_len);
ptr = (unsigned long)(ref + 1);
write_extent_buffer(leaf, name, ptr, name_len);
btrfs_mark_buffer_dirty(leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(path);
key.objectid = ref_id;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = root_id;
goto again;
}
btrfs_free_path(path);
return 0;
}
/*
* Old btrfs forgets to init root_item->flags and root_item->byte_limit
* for subvolumes. To work around this problem, we steal a bit from
* root_item->inode_item->flags, and use it to indicate if those fields
* have been properly initialized.
*/
void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
{
u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode);
if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
btrfs_set_stack_inode_flags(&root_item->inode, inode_flags);
btrfs_set_root_flags(root_item, 0);
btrfs_set_root_limit(root_item, 0);
}
}
void btrfs_update_root_times(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_root_item *item = &root->root_item;
struct timespec64 ct;
ktime_get_real_ts64(&ct);
spin_lock(&root->root_item_lock);
btrfs_set_root_ctransid(item, trans->transid);
btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
spin_unlock(&root->root_item_lock);
}
/*
* btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
* root: the root of the parent directory
* rsv: block reservation
* items: the number of items that we need do reservation
* use_global_rsv: allow fallback to the global block reservation
*
* This function is used to reserve the space for snapshot/subvolume
* creation and deletion. Those operations are different with the
* common file/directory operations, they change two fs/file trees
* and root tree, the number of items that the qgroup reserves is
* different with the free space reservation. So we can not use
* the space reservation mechanism in start_transaction().
*/
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv, int items,
bool use_global_rsv)
{
u64 qgroup_num_bytes = 0;
u64 num_bytes;
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
/* One for parent inode, two for dir entries */
qgroup_num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root,
qgroup_num_bytes, true);
if (ret)
return ret;
}
num_bytes = btrfs_calc_insert_metadata_size(fs_info, items);
rsv->space_info = btrfs_find_space_info(fs_info,
BTRFS_BLOCK_GROUP_METADATA);
ret = btrfs_block_rsv_add(root, rsv, num_bytes,
BTRFS_RESERVE_FLUSH_ALL);
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
if (ret && qgroup_num_bytes)
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
if (!ret) {
spin_lock(&rsv->lock);
rsv->qgroup_rsv_reserved += qgroup_num_bytes;
spin_unlock(&rsv->lock);
}
return ret;
}
void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 qgroup_to_release;
btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_IO_H
#define _ASM_X86_IO_H
/*
* This file contains the definitions for the x86 IO instructions
* inb/inw/inl/outb/outw/outl and the "string versions" of the same
* (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
* versions of the single-IO instructions (inb_p/inw_p/..).
*
* This file is not meant to be obfuscating: it's just complicated
* to (a) handle it all in a way that makes gcc able to optimize it
* as well as possible and (b) trying to avoid writing the same thing
* over and over again with slight variations and possibly making a
* mistake somewhere.
*/
/*
* Thanks to James van Artsdalen for a better timing-fix than
* the two short jumps: using outb's to a nonexistent port seems
* to guarantee better timings even on fast machines.
*
* On the other hand, I'd like to be sure of a non-existent port:
* I feel a bit unsafe about using 0x80 (should be safe, though)
*
* Linus
*/
/*
* Bit simplified and optimized by Jan Hubicka
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
*
* isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
* isa_read[wl] and isa_write[wl] fixed
* - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*/
#define ARCH_HAS_IOREMAP_WC
#define ARCH_HAS_IOREMAP_WT
#include <linux/string.h>
#include <linux/compiler.h>
#include <asm/page.h>
#include <asm/early_ioremap.h>
#include <asm/pgtable_types.h>
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
:"m" (*(volatile type __force *)addr) barrier); return ret; }
#define build_mmio_write(name, size, type, reg, barrier) \
static inline void name(type val, volatile void __iomem *addr) \
{ asm volatile("mov" size " %0,%1": :reg (val), \
"m" (*(volatile type __force *)addr) barrier); }
build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
build_mmio_read(__readb, "b", unsigned char, "=q", )
build_mmio_read(__readw, "w", unsigned short, "=r", )
build_mmio_read(__readl, "l", unsigned int, "=r", )
build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
build_mmio_write(writew, "w", unsigned short, "r", :"memory")
build_mmio_write(writel, "l", unsigned int, "r", :"memory")
build_mmio_write(__writeb, "b", unsigned char, "q", )
build_mmio_write(__writew, "w", unsigned short, "r", )
build_mmio_write(__writel, "l", unsigned int, "r", )
#define readb readb
#define readw readw
#define readl readl
#define readb_relaxed(a) __readb(a)
#define readw_relaxed(a) __readw(a)
#define readl_relaxed(a) __readl(a)
#define __raw_readb __readb
#define __raw_readw __readw
#define __raw_readl __readl
#define writeb writeb
#define writew writew
#define writel writel
#define writeb_relaxed(v, a) __writeb(v, a)
#define writew_relaxed(v, a) __writew(v, a)
#define writel_relaxed(v, a) __writel(v, a)
#define __raw_writeb __writeb
#define __raw_writew __writew
#define __raw_writel __writel
#ifdef CONFIG_X86_64
build_mmio_read(readq, "q", u64, "=r", :"memory")
build_mmio_read(__readq, "q", u64, "=r", )
build_mmio_write(writeq, "q", u64, "r", :"memory")
build_mmio_write(__writeq, "q", u64, "r", )
#define readq_relaxed(a) __readq(a)
#define writeq_relaxed(v, a) __writeq(v, a)
#define __raw_readq __readq
#define __raw_writeq __writeq
/* Let people know that we have them */
#define readq readq
#define writeq writeq
#endif
#define ARCH_HAS_VALID_PHYS_ADDR_RANGE
extern int valid_phys_addr_range(phys_addr_t addr, size_t size);
extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size);
/**
* virt_to_phys - map virtual addresses to physical
* @address: address to remap
*
* The returned physical address is the physical (CPU) mapping for
* the memory address given. It is only valid to use this function on
* addresses directly mapped or allocated via kmalloc.
*
* This function does not give bus mappings for DMA transfers. In
* almost all conceivable cases a device driver should not be using
* this function
*/
static inline phys_addr_t virt_to_phys(volatile void *address)
{
return __pa(address);
}
#define virt_to_phys virt_to_phys
/**
* phys_to_virt - map physical address to virtual
* @address: address to remap
*
* The returned virtual address is a current CPU mapping for
* the memory address given. It is only valid to use this function on
* addresses that have a kernel mapping
*
* This function does not handle bus mappings for DMA transfers. In
* almost all conceivable cases a device driver should not be using
* this function
*/
static inline void *phys_to_virt(phys_addr_t address)
{
return __va(address);
}
#define phys_to_virt phys_to_virt
/*
* Change "struct page" to physical address.
*/
#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
/*
* ISA I/O bus memory addresses are 1:1 with the physical address.
* However, we truncate the address to unsigned int to avoid undesirable
* promotions in legacy drivers.
*/
static inline unsigned int isa_virt_to_bus(volatile void *address)
{
return (unsigned int)virt_to_phys(address);
}
#define isa_bus_to_virt phys_to_virt
/*
* However PCI ones are not necessarily 1:1 and therefore these interfaces
* are forbidden in portable PCI drivers.
*
* Allow them on x86 for legacy drivers, though.
*/
#define virt_to_bus virt_to_phys
#define bus_to_virt phys_to_virt
/*
* The default ioremap() behavior is non-cached; if you need something
* else, you probably want one of the following.
*/
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
#define ioremap_uc ioremap_uc
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
#define ioremap_cache ioremap_cache
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
#define ioremap_prot ioremap_prot
extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
#define ioremap_encrypted ioremap_encrypted
/**
* ioremap - map bus memory into CPU space
* @offset: bus address of the memory
* @size: size of the resource to map
*
* ioremap performs a platform specific sequence of operations to
* make bus memory CPU accessible via the readb/readw/readl/writeb/
* writew/writel functions and the other mmio helpers. The returned
* address is not guaranteed to be usable directly as a virtual
* address.
*
* If the area you are trying to map is a PCI BAR you should have a
* look at pci_iomap().
*/
void __iomem *ioremap(resource_size_t offset, unsigned long size);
#define ioremap ioremap
extern void iounmap(volatile void __iomem *addr);
#define iounmap iounmap
extern void set_iounmap_nonlazy(void);
#ifdef __KERNEL__
void memcpy_fromio(void *, const volatile void __iomem *, size_t);
void memcpy_toio(volatile void __iomem *, const void *, size_t);
void memset_io(volatile void __iomem *, int, size_t);
#define memcpy_fromio memcpy_fromio
#define memcpy_toio memcpy_toio
#define memset_io memset_io
#include <asm-generic/iomap.h>
/*
* ISA space is 'always mapped' on a typical x86 system, no need to
* explicitly ioremap() it. The fact that the ISA IO space is mapped
* to PAGE_OFFSET is pure coincidence - it does not mean ISA values
* are physical addresses. The following constant pointer can be
* used as the IO-area pointer (it can be iounmapped as well, so the
* analogy with PCI is quite large):
*/
#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
#endif /* __KERNEL__ */
extern void native_io_delay(void);
extern int io_delay_type;
extern void io_delay_init(void);
#if defined(CONFIG_PARAVIRT)
#include <asm/paravirt.h>
#else
static inline void slow_down_io(void)
{
native_io_delay();
#ifdef REALLY_SLOW_IO
native_io_delay();
native_io_delay();
native_io_delay();
#endif
}
#endif
#ifdef CONFIG_AMD_MEM_ENCRYPT
#include <linux/jump_label.h>
extern struct static_key_false sev_enable_key;
static inline bool sev_key_active(void)
{
return static_branch_unlikely(&sev_enable_key);
}
#else /* !CONFIG_AMD_MEM_ENCRYPT */
static inline bool sev_key_active(void) { return false; }
#endif /* CONFIG_AMD_MEM_ENCRYPT */
#define BUILDIO(bwl, bw, type) \
static inline void out##bwl(unsigned type value, int port) \
{ \
asm volatile("out" #bwl " %" #bw "0, %w1" \
: : "a"(value), "Nd"(port)); \
} \
\
static inline unsigned type in##bwl(int port) \
{ \
unsigned type value; \
asm volatile("in" #bwl " %w1, %" #bw "0" \
: "=a"(value) : "Nd"(port)); \
return value; \
} \
\
static inline void out##bwl##_p(unsigned type value, int port) \
{ \
out##bwl(value, port); \
slow_down_io(); \
} \
\
static inline unsigned type in##bwl##_p(int port) \
{ \
unsigned type value = in##bwl(port); \
slow_down_io(); \
return value; \
} \
\
static inline void outs##bwl(int port, const void *addr, unsigned long count) \
{ \
if (sev_key_active()) { \
unsigned type *value = (unsigned type *)addr; \
while (count) { \
out##bwl(*value, port); \
value++; \
count--; \
} \
} else { \
asm volatile("rep; outs" #bwl \
: "+S"(addr), "+c"(count) \
: "d"(port) : "memory"); \
} \
} \
\
static inline void ins##bwl(int port, void *addr, unsigned long count) \
{ \
if (sev_key_active()) { \
unsigned type *value = (unsigned type *)addr; \
while (count) { \
*value = in##bwl(port); \
value++; \
count--; \
} \
} else { \
asm volatile("rep; ins" #bwl \
: "+D"(addr), "+c"(count) \
: "d"(port) : "memory"); \
} \
}
BUILDIO(b, b, char)
BUILDIO(w, w, short)
BUILDIO(l, , int)
#define inb inb
#define inw inw
#define inl inl
#define inb_p inb_p
#define inw_p inw_p
#define inl_p inl_p
#define insb insb
#define insw insw
#define insl insl
#define outb outb
#define outw outw
#define outl outl
#define outb_p outb_p
#define outw_p outw_p
#define outl_p outl_p
#define outsb outsb
#define outsw outsw
#define outsl outsl
extern void *xlate_dev_mem_ptr(phys_addr_t phys);
extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
#define xlate_dev_mem_ptr xlate_dev_mem_ptr
#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
enum page_cache_mode pcm);
extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
#define ioremap_wc ioremap_wc
extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
#define ioremap_wt ioremap_wt
extern bool is_early_ioremap_ptep(pte_t *ptep);
#define IO_SPACE_LIMIT 0xffff
#include <asm-generic/io.h>
#undef PCI_IOBASE
#ifdef CONFIG_MTRR
extern int __must_check arch_phys_wc_index(int handle);
#define arch_phys_wc_index arch_phys_wc_index
extern int __must_check arch_phys_wc_add(unsigned long base,
unsigned long size);
extern void arch_phys_wc_del(int handle);
#define arch_phys_wc_add arch_phys_wc_add
#endif
#ifdef CONFIG_X86_PAT
extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size);
extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size);
#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
#endif
extern bool arch_memremap_can_ram_remap(resource_size_t offset,
unsigned long size,
unsigned long flags);
#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
extern bool phys_mem_access_encrypted(unsigned long phys_addr,
unsigned long size);
/**
* iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units
* @dst: destination, in MMIO space (must be 512-bit aligned)
* @src: source
* @count: number of 512 bits quantities to submit
*
* Submit data from kernel space to MMIO space, in units of 512 bits at a
* time. Order of access is not guaranteed, nor is a memory barrier
* performed afterwards.
*
* Warning: Do not use this helper unless your driver has checked that the CPU
* instruction is supported on the platform.
*/
static inline void iosubmit_cmds512(void __iomem *dst, const void *src,
size_t count)
{
const u8 *from = src;
const u8 *end = from + count * 64;
while (from < end) {
movdir64b(dst, from);
from += 64;
}
}
#endif /* _ASM_X86_IO_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic pidhash and scalable, time-bounded PID allocator
*
* (C) 2002-2003 Nadia Yvette Chambers, IBM
* (C) 2004 Nadia Yvette Chambers, Oracle
* (C) 2002-2004 Ingo Molnar, Red Hat
*
* pid-structures are backing objects for tasks sharing a given ID to chain
* against. There is very little to them aside from hashing them and
* parking tasks using given ID's on a list.
*
* The hash is always changed with the tasklist_lock write-acquired,
* and the hash is only accessed with the tasklist_lock at least
* read-acquired, so there's no additional SMP locking needed here.
*
* We have a list of bitmap pages, which bitmaps represent the PID space.
* Allocating and freeing PIDs is completely lockless. The worst-case
* allocation scenario when all but one out of 1 million PIDs possible are
* allocated already: the scanning of 32 list entries and at most PAGE_SIZE
* bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
*
* Pid namespaces:
* (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
* (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
* Many thanks to Oleg Nesterov for comments and help
*
*/
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/memblock.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>
struct pid init_struct_pid = {
.count = REFCOUNT_INIT(1),
.tasks = {
{ .first = NULL },
{ .first = NULL },
{ .first = NULL },
},
.level = 0,
.numbers = { {
.nr = 0,
.ns = &init_pid_ns,
}, }
};
int pid_max = PID_MAX_DEFAULT;
#define RESERVED_PIDS 300
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
/*
* PID-map pages start out as NULL, they get allocated upon
* first use and are never deallocated. This way a low pid_max
* value does not cause lots of bitmaps to be allocated, but
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
.ns.count = REFCOUNT_INIT(2),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
.ns.inum = PROC_PID_INIT_INO,
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
/*
* Note: disable interrupts while the pidmap_lock is held as an
* interrupt might come in and do read_lock(&tasklist_lock).
*
* If we don't disable interrupts there is a nasty deadlock between
* detach_pid()->free_pid() and another cpu that does
* spin_lock(&pidmap_lock) followed by an interrupt routine that does
* read_lock(&tasklist_lock);
*
* After we clean up the tasklist_lock and know there are no
* irq handlers that take it we can leave the interrupts enabled.
* For now it is easier to be safe than to prove it can't happen.
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
if (!pid)
return;
ns = pid->numbers[pid->level].ns;
if (refcount_dec_and_test(&pid->count)) {
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
}
EXPORT_SYMBOL_GPL(put_pid);
static void delayed_put_pid(struct rcu_head *rhp)
{
struct pid *pid = container_of(rhp, struct pid, rcu);
put_pid(pid);
}
void free_pid(struct pid *pid)
{
/* We can be called with write_lock_irq(&tasklist_lock) held */
int i;
unsigned long flags;
spin_lock_irqsave(&pidmap_lock, flags);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
switch (--ns->pid_allocated) {
case 2:
case 1:
/* When all that is left in the pid namespace
* is the reaper wake up the reaper. The reaper
* may be sleeping in zap_pid_ns_processes().
*/
wake_up_process(ns->child_reaper);
break;
case PIDNS_ADDING:
/* Handle a fork failure of the first process */
WARN_ON(ns->child_reaper);
ns->pid_allocated = 0;
break;
}
idr_remove(&ns->idr, upid->nr);
}
spin_unlock_irqrestore(&pidmap_lock, flags);
call_rcu(&pid->rcu, delayed_put_pid);
}
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size)
{
struct pid *pid;
enum pid_type type;
int i, nr;
struct pid_namespace *tmp;
struct upid *upid;
int retval = -ENOMEM;
/*
* set_tid_size contains the size of the set_tid array. Starting at
* the most nested currently active PID namespace it tells alloc_pid()
* which PID to set for a process in that most nested PID namespace
* up to set_tid_size PID namespaces. It does not have to set the PID
* for a process in all nested PID namespaces but set_tid_size must
* never be greater than the current ns->level + 1.
*/
if (set_tid_size > ns->level + 1)
return ERR_PTR(-EINVAL);
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
return ERR_PTR(retval);
tmp = ns;
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
int tid = 0;
if (set_tid_size) {
tid = set_tid[ns->level - i];
retval = -EINVAL;
if (tid < 1 || tid >= pid_max)
goto out_free;
/*
* Also fail if a PID != 1 is requested and
* no PID 1 exists.
*/
if (tid != 1 && !tmp->child_reaper)
goto out_free;
retval = -EPERM;
if (!checkpoint_restore_ns_capable(tmp->user_ns))
goto out_free;
set_tid_size--;
}
idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock);
if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
tid + 1, GFP_ATOMIC);
/*
* If ENOSPC is returned it means that the PID is
* alreay in use. Return EEXIST in that case.
*/
if (nr == -ENOSPC)
nr = -EEXIST;
} else {
int pid_min = 1;
/*
* init really needs pid 1, but after reaching the
* maximum wrap back to RESERVED_PIDS
*/
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
pid_min = RESERVED_PIDS;
/*
* Store a null pointer so find_pid_ns does not find
* a partially initialized PID (see below).
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
}
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
if (nr < 0) {
retval = (nr == -ENOSPC) ? -EAGAIN : nr;
goto out_free;
}
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
tmp = tmp->parent;
}
/*
* ENOMEM is not the most obvious choice especially for the case
* where the child subreaper has already exited and the pid
* namespace denies the creation of any new processes. But ENOMEM
* is what we have exposed to userspace for a long time and it is
* documented behavior for pid namespaces. So we can't easily
* change it even if there were an error code better suited.
*/
retval = -ENOMEM;
get_pid_ns(ns);
refcount_set(&pid->count, 1);
spin_lock_init(&pid->lock);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
init_waitqueue_head(&pid->wait_pidfd);
INIT_HLIST_HEAD(&pid->inodes);
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
}
spin_unlock_irq(&pidmap_lock);
return pid;
out_unlock:
spin_unlock_irq(&pidmap_lock);
put_pid_ns(ns);
out_free:
spin_lock_irq(&pidmap_lock);
while (++i <= ns->level) {
upid = pid->numbers + i;
idr_remove(&upid->ns->idr, upid->nr);
}
/* On failure to allocate the first pid, reset the state */
if (ns->pid_allocated == PIDNS_ADDING)
idr_set_cursor(&ns->idr, 0);
spin_unlock_irq(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
}
void disable_pid_allocation(struct pid_namespace *ns)
{
spin_lock_irq(&pidmap_lock);
ns->pid_allocated &= ~PIDNS_ADDING;
spin_unlock_irq(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);
struct pid *find_vpid(int nr)
{
return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);
static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
{
return (type == PIDTYPE_PID) ?
&task->thread_pid : &task->signal->pids[type];
}
/*
* attach_pid() must be called with the tasklist_lock write-held.
*/
void attach_pid(struct task_struct *task, enum pid_type type)
{
struct pid *pid = *task_pid_ptr(task, type);
hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}
static void __change_pid(struct task_struct *task, enum pid_type type,
struct pid *new)
{
struct pid **pid_ptr = task_pid_ptr(task, type);
struct pid *pid;
int tmp;
pid = *pid_ptr;
hlist_del_rcu(&task->pid_links[type]);
*pid_ptr = new;
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (pid_has_task(pid, tmp))
return;
free_pid(pid);
}
void detach_pid(struct task_struct *task, enum pid_type type)
{
__change_pid(task, type, NULL);
}
void change_pid(struct task_struct *task, enum pid_type type,
struct pid *pid)
{
__change_pid(task, type, pid);
attach_pid(task, type);
}
void exchange_tids(struct task_struct *left, struct task_struct *right)
{
struct pid *pid1 = left->thread_pid;
struct pid *pid2 = right->thread_pid;
struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
/* Swap the single entry tid lists */
hlists_swap_heads_rcu(head1, head2);
/* Swap the per task_struct pid */
rcu_assign_pointer(left->thread_pid, pid2);
rcu_assign_pointer(right->thread_pid, pid1);
/* Swap the cached value */
WRITE_ONCE(left->pid, pid_nr(pid2));
WRITE_ONCE(right->pid, pid_nr(pid1));
}
/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
if (type == PIDTYPE_PID)
new->thread_pid = old->thread_pid;
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
struct task_struct *result = NULL;
if (pid) {
struct hlist_node *first;
first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
lockdep_tasklist_lock_is_held());
if (first)
result = hlist_entry(first, struct task_struct, pid_links[(type)]);
}
return result;
}
EXPORT_SYMBOL(pid_task);
/*
* Must be called under rcu_read_lock().
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
"find_task_by_pid_ns() needs rcu_read_lock() protection");
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
struct task_struct *find_task_by_vpid(pid_t vnr)
{
return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}
struct task_struct *find_get_task_by_vpid(pid_t nr)
{
struct task_struct *task;
rcu_read_lock();
task = find_task_by_vpid(nr);
if (task)
get_task_struct(task);
rcu_read_unlock();
return task;
}
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
struct pid *pid;
rcu_read_lock();
pid = get_pid(rcu_dereference(*task_pid_ptr(task, type)));
rcu_read_unlock();
return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);
struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
struct task_struct *result;
rcu_read_lock();
result = pid_task(pid, type);
if (result)
get_task_struct(result);
rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(get_pid_task);
struct pid *find_get_pid(pid_t nr)
{
struct pid *pid;
rcu_read_lock();
pid = get_pid(find_vpid(nr));
rcu_read_unlock();
return pid;
}
EXPORT_SYMBOL_GPL(find_get_pid);
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
struct upid *upid;
pid_t nr = 0;
if (pid && ns->level <= pid->level) {
upid = &pid->numbers[ns->level];
if (upid->ns == ns) nr = upid->nr;
}
return nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);
pid_t pid_vnr(struct pid *pid)
{
return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
struct pid_namespace *ns)
{
pid_t nr = 0;
rcu_read_lock();
if (!ns)
ns = task_active_pid_ns(current);
nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
rcu_read_unlock();
return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);
/*
* Used by proc to find the first pid that is greater than or equal to nr.
*
* If there is a pid at nr this function is exactly the same as find_pid_ns.
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
return idr_get_next(&ns->idr, &nr);
}
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
struct fd f;
struct pid *pid;
f = fdget(fd);
if (!f.file)
return ERR_PTR(-EBADF);
pid = pidfd_pid(f.file);
if (!IS_ERR(pid)) {
get_pid(pid);
*flags = f.file->f_flags;
}
fdput(f);
return pid;
}
/**
* pidfd_create() - Create a new pid file descriptor.
*
* @pid: struct pid that the pidfd will reference
* @flags: flags to pass
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set.
*
* Note, that this function can only be called after the fd table has
* been unshared to avoid leaking the pidfd to the new process.
*
* This symbol should not be explicitly exported to loadable modules.
*
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
int pidfd_create(struct pid *pid, unsigned int flags)
{
int fd;
if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
return -EINVAL;
if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
return -EINVAL;
fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
flags | O_RDWR | O_CLOEXEC);
if (fd < 0)
put_pid(pid);
return fd;
}
/**
* pidfd_open() - Open new pid file descriptor.
*
* @pid: pid for which to retrieve a pidfd
* @flags: flags to pass
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set for
* the process identified by @pid. Currently, the process identified by
* @pid must be a thread-group leader. This restriction currently exists
* for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
* be used with CLONE_THREAD) and pidfd polling (only supports thread group
* leaders).
*
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
int fd;
struct pid *p;
if (flags & ~PIDFD_NONBLOCK)
return -EINVAL;
if (pid <= 0)
return -EINVAL;
p = find_get_pid(pid);
if (!p)
return -ESRCH;
fd = pidfd_create(p, flags);
put_pid(p);
return fd;
}
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
pid_max = min(pid_max_max, max_t(int, pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
idr_init(&init_pid_ns.idr);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
struct file *file;
int ret;
ret = down_read_killable(&task->signal->exec_update_lock);
if (ret)
return ERR_PTR(ret);
if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
file = fget_task(task, fd);
else
file = ERR_PTR(-EPERM);
up_read(&task->signal->exec_update_lock);
return file ?: ERR_PTR(-EBADF);
}
static int pidfd_getfd(struct pid *pid, int fd)
{
struct task_struct *task;
struct file *file;
int ret;
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
return -ESRCH;
file = __pidfd_fget(task, fd);
put_task_struct(task);
if (IS_ERR(file))
return PTR_ERR(file);
ret = receive_fd(file, O_CLOEXEC);
fput(file);
return ret;
}
/**
* sys_pidfd_getfd() - Get a file descriptor from another process
*
* @pidfd: the pidfd file descriptor of the process
* @fd: the file descriptor number to get
* @flags: flags on how to get the fd (reserved)
*
* This syscall gets a copy of a file descriptor from another process
* based on the pidfd, and file descriptor number. It requires that
* the calling process has the ability to ptrace the process represented
* by the pidfd. The process which is having its file descriptor copied
* is otherwise unaffected.
*
* Return: On success, a cloexec file descriptor is returned.
* On error, a negative errno number will be returned.
*/
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
unsigned int, flags)
{
struct pid *pid;
struct fd f;
int ret;
/* flags is currently unused - make sure it's unset */
if (flags)
return -EINVAL;
f = fdget(pidfd);
if (!f.file)
return -EBADF;
pid = pidfd_pid(f.file);
if (IS_ERR(pid))
ret = PTR_ERR(pid);
else
ret = pidfd_getfd(pid, fd);
fdput(f);
return ret;
}
/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update mechanism for mutual exclusion
*
* Copyright IBM Corporation, 2001
*
* Author: Dipankar Sarma <dipankar@in.ibm.com>
*
* Based on the original work by Paul McKenney <paulmck@vnet.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
* http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
*
* For detailed explanation of Read-Copy Update mechanism see -
* http://lse.sourceforge.net/locking/rcupdate.html
*
*/
#ifndef __LINUX_RCUPDATE_H
#define __LINUX_RCUPDATE_H
#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/atomic.h>
#include <linux/irqflags.h>
#include <linux/preempt.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <asm/processor.h>
#include <linux/cpumask.h>
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
#define ulong2long(a) (*(long *)(&(a)))
#define USHORT_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
#define USHORT_CMP_LT(a, b) (USHRT_MAX / 2 < (unsigned short)((a) - (b)))
/* Exported common interfaces */
void call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier_tasks(void);
void rcu_barrier_tasks_rude(void);
void synchronize_rcu(void);
#ifdef CONFIG_PREEMPT_RCU
void __rcu_read_lock(void);
void __rcu_read_unlock(void);
/*
* Defined as a macro as it is a very low level header included from
* areas that don't even know about current. This gives the rcu_read_lock()
* nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
* types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
*/
#define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting)
#else /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TINY_RCU
#define rcu_read_unlock_strict() do { } while (0)
#else
void rcu_read_unlock_strict(void);
#endif
static inline void __rcu_read_lock(void)
{
preempt_disable();}
static inline void __rcu_read_unlock(void)
{
preempt_enable(); rcu_read_unlock_strict();
}
static inline int rcu_preempt_depth(void)
{
return 0;
}
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/* Internal to kernel */
void rcu_init(void);
extern int rcu_scheduler_active __read_mostly;
void rcu_sched_clock_irq(int user);
void rcu_report_dead(unsigned int cpu);
void rcutree_migrate_callbacks(int cpu);
#ifdef CONFIG_TASKS_RCU_GENERIC
void rcu_init_tasks_generic(void);
#else
static inline void rcu_init_tasks_generic(void) { }
#endif
#ifdef CONFIG_RCU_STALL_COMMON
void rcu_sysrq_start(void);
void rcu_sysrq_end(void);
#else /* #ifdef CONFIG_RCU_STALL_COMMON */
static inline void rcu_sysrq_start(void) { }
static inline void rcu_sysrq_end(void) { }
#endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */
#ifdef CONFIG_NO_HZ_FULL
void rcu_user_enter(void);
void rcu_user_exit(void);
#else
static inline void rcu_user_enter(void) { }
static inline void rcu_user_exit(void) { }
#endif /* CONFIG_NO_HZ_FULL */
#ifdef CONFIG_RCU_NOCB_CPU
void rcu_init_nohz(void);
int rcu_nocb_cpu_offload(int cpu);
int rcu_nocb_cpu_deoffload(int cpu);
void rcu_nocb_flush_deferred_wakeup(void);
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static inline void rcu_init_nohz(void) { }
static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
static inline void rcu_nocb_flush_deferred_wakeup(void) { }
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
/**
* RCU_NONIDLE - Indicate idle-loop code that needs RCU readers
* @a: Code that RCU needs to pay attention to.
*
* RCU read-side critical sections are forbidden in the inner idle loop,
* that is, between the rcu_idle_enter() and the rcu_idle_exit() -- RCU
* will happily ignore any such read-side critical sections. However,
* things like powertop need tracepoints in the inner idle loop.
*
* This macro provides the way out: RCU_NONIDLE(do_something_with_RCU())
* will tell RCU that it needs to pay attention, invoke its argument
* (in this example, calling the do_something_with_RCU() function),
* and then tell RCU to go back to ignoring this CPU. It is permissible
* to nest RCU_NONIDLE() wrappers, but not indefinitely (but the limit is
* on the order of a million or so, even on 32-bit systems). It is
* not legal to block within RCU_NONIDLE(), nor is it permissible to
* transfer control either into or out of RCU_NONIDLE()'s statement.
*/
#define RCU_NONIDLE(a) \
do { \
rcu_irq_enter_irqson(); \
do { a; } while (0); \
rcu_irq_exit_irqson(); \
} while (0)
/*
* Note a quasi-voluntary context switch for RCU-tasks's benefit.
* This is a macro rather than an inline function to avoid #include hell.
*/
#ifdef CONFIG_TASKS_RCU_GENERIC
# ifdef CONFIG_TASKS_RCU
# define rcu_tasks_classic_qs(t, preempt) \
do { \
if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \
WRITE_ONCE((t)->rcu_tasks_holdout, false); \
} while (0)
void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks(void);
# else
# define rcu_tasks_classic_qs(t, preempt) do { } while (0)
# define call_rcu_tasks call_rcu
# define synchronize_rcu_tasks synchronize_rcu
# endif
# ifdef CONFIG_TASKS_TRACE_RCU
# define rcu_tasks_trace_qs(t) \
do { \
if (!likely(READ_ONCE((t)->trc_reader_checked)) && \
!unlikely(READ_ONCE((t)->trc_reader_nesting))) { \
smp_store_release(&(t)->trc_reader_checked, true); \
smp_mb(); /* Readers partitioned by store. */ \
} \
} while (0)
# else
# define rcu_tasks_trace_qs(t) do { } while (0)
# endif
#define rcu_tasks_qs(t, preempt) \
do { \
rcu_tasks_classic_qs((t), (preempt)); \
rcu_tasks_trace_qs((t)); \
} while (0)
# ifdef CONFIG_TASKS_RUDE_RCU
void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks_rude(void);
# endif
#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false)
void exit_tasks_rcu_start(void);
void exit_tasks_rcu_finish(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
#define rcu_tasks_qs(t, preempt) do { } while (0)
#define rcu_note_voluntary_context_switch(t) do { } while (0)
#define call_rcu_tasks call_rcu
#define synchronize_rcu_tasks synchronize_rcu
static inline void exit_tasks_rcu_start(void) { }
static inline void exit_tasks_rcu_finish(void) { }
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
/**
* cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
*
* This macro resembles cond_resched(), except that it is defined to
* report potential quiescent states to RCU-tasks even if the cond_resched()
* machinery were to be shut off, as some advocate for PREEMPTION kernels.
*/
#define cond_resched_tasks_rcu_qs() \
do { \
rcu_tasks_qs(current, false); \
cond_resched(); \
} while (0)
/*
* Infrastructure to implement the synchronize_() primitives in
* TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
*/
#if defined(CONFIG_TREE_RCU)
#include <linux/rcutree.h>
#elif defined(CONFIG_TINY_RCU)
#include <linux/rcutiny.h>
#else
#error "Unknown RCU implementation specified to kernel configuration"
#endif
/*
* The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls
* are needed for dynamic initialization and destruction of rcu_head
* on the stack, and init_rcu_head()/destroy_rcu_head() are needed for
* dynamic initialization and destruction of statically allocated rcu_head
* structures. However, rcu_head structures allocated dynamically in the
* heap don't need any initialization.
*/
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head);
void destroy_rcu_head(struct rcu_head *head);
void init_rcu_head_on_stack(struct rcu_head *head);
void destroy_rcu_head_on_stack(struct rcu_head *head);
#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
static inline void init_rcu_head(struct rcu_head *head) { }
static inline void destroy_rcu_head(struct rcu_head *head) { }
static inline void init_rcu_head_on_stack(struct rcu_head *head) { }
static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { }
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU)
bool rcu_lockdep_current_cpu_online(void);
#else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */
static inline bool rcu_lockdep_current_cpu_online(void) { return true; }
#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */
extern struct lockdep_map rcu_lock_map;
extern struct lockdep_map rcu_bh_lock_map;
extern struct lockdep_map rcu_sched_lock_map;
extern struct lockdep_map rcu_callback_map;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static inline void rcu_lock_acquire(struct lockdep_map *map)
{
lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
}
static inline void rcu_lock_release(struct lockdep_map *map)
{
lock_release(map, _THIS_IP_);
}
int debug_lockdep_rcu_enabled(void);
int rcu_read_lock_held(void);
int rcu_read_lock_bh_held(void);
int rcu_read_lock_sched_held(void);
int rcu_read_lock_any_held(void);
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
# define rcu_lock_acquire(a) do { } while (0)
# define rcu_lock_release(a) do { } while (0)
static inline int rcu_read_lock_held(void)
{
return 1;
}
static inline int rcu_read_lock_bh_held(void)
{
return 1;
}
static inline int rcu_read_lock_sched_held(void)
{
return !preemptible();
}
static inline int rcu_read_lock_any_held(void)
{
return !preemptible();
}
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
#ifdef CONFIG_PROVE_RCU
/**
* RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met
* @c: condition to check
* @s: informative message
*/
#define RCU_LOCKDEP_WARN(c, s) \
do { \
static bool __section(".data.unlikely") __warned; \
if ((c) && debug_lockdep_rcu_enabled() && !__warned) { \
__warned = true; \
lockdep_rcu_suspicious(__FILE__, __LINE__, s); \
} \
} while (0)
#if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU)
static inline void rcu_preempt_sleep_check(void)
{
RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
"Illegal context switch in RCU read-side critical section");
}
#else /* #ifdef CONFIG_PROVE_RCU */
static inline void rcu_preempt_sleep_check(void) { }
#endif /* #else #ifdef CONFIG_PROVE_RCU */
#define rcu_sleep_check() \
do { \
rcu_preempt_sleep_check(); \
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \
"Illegal context switch in RCU-bh read-side critical section"); \
RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \
"Illegal context switch in RCU-sched read-side critical section"); \
} while (0)
#else /* #ifdef CONFIG_PROVE_RCU */
#define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c))
#define rcu_sleep_check() do { } while (0)
#endif /* #else #ifdef CONFIG_PROVE_RCU */
/*
* Helper functions for rcu_dereference_check(), rcu_dereference_protected()
* and rcu_assign_pointer(). Some of these could be folded into their
* callers, but they are left separate in order to ease introduction of
* multiple pointers markings to match different RCU implementations
* (e.g., __srcu), should this make sense in the future.
*/
#ifdef __CHECKER__
#define rcu_check_sparse(p, space) \
((void)(((typeof(*p) space *)p) == p))
#else /* #ifdef __CHECKER__ */
#define rcu_check_sparse(p, space)
#endif /* #else #ifdef __CHECKER__ */
/**
* unrcu_pointer - mark a pointer as not being RCU protected
* @p: pointer needing to lose its __rcu property
*
* Converts @p from an __rcu pointer to a __kernel pointer.
* This allows an __rcu pointer to be used with xchg() and friends.
*/
#define unrcu_pointer(p) \
({ \
typeof(*p) *_________p1 = (typeof(*p) *__force)(p); \
rcu_check_sparse(p, __rcu); \
((typeof(*p) __force __kernel *)(_________p1)); \
})
#define __rcu_access_pointer(p, space) \
({ \
typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
rcu_check_sparse(p, space); \
((typeof(*p) __force __kernel *)(_________p1)); \
})
#define __rcu_dereference_check(p, c, space) \
({ \
/* Dependency order vs. p above. */ \
typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \
RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \
rcu_check_sparse(p, space); \
((typeof(*p) __force __kernel *)(________p1)); \
})
#define __rcu_dereference_protected(p, c, space) \
({ \
RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \
rcu_check_sparse(p, space); \
((typeof(*p) __force __kernel *)(p)); \
})
#define rcu_dereference_raw(p) \
({ \
/* Dependency order vs. p above. */ \
typeof(p) ________p1 = READ_ONCE(p); \
((typeof(*p) __force __kernel *)(________p1)); \
})
/**
* RCU_INITIALIZER() - statically initialize an RCU-protected global variable
* @v: The value to statically initialize with.
*/
#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
/**
* rcu_assign_pointer() - assign to RCU-protected pointer
* @p: pointer to assign to
* @v: value to assign (publish)
*
* Assigns the specified value to the specified RCU-protected
* pointer, ensuring that any concurrent RCU readers will see
* any prior initialization.
*
* Inserts memory barriers on architectures that require them
* (which is most of them), and also prevents the compiler from
* reordering the code that initializes the structure after the pointer
* assignment. More importantly, this call documents which pointers
* will be dereferenced by RCU read-side code.
*
* In some special cases, you may use RCU_INIT_POINTER() instead
* of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due
* to the fact that it does not constrain either the CPU or the compiler.
* That said, using RCU_INIT_POINTER() when you should have used
* rcu_assign_pointer() is a very bad thing that results in
* impossible-to-diagnose memory corruption. So please be careful.
* See the RCU_INIT_POINTER() comment header for details.
*
* Note that rcu_assign_pointer() evaluates each of its arguments only
* once, appearances notwithstanding. One of the "extra" evaluations
* is in typeof() and the other visible only to sparse (__CHECKER__),
* neither of which actually execute the argument. As with most cpp
* macros, this execute-arguments-only-once property is important, so
* please be careful when making changes to rcu_assign_pointer() and the
* other macros that it invokes.
*/
#define rcu_assign_pointer(p, v) \
do { \
uintptr_t _r_a_p__v = (uintptr_t)(v); \
rcu_check_sparse(p, __rcu); \
\
if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \
WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \
else \
smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
} while (0)
/**
* rcu_replace_pointer() - replace an RCU pointer, returning its old value
* @rcu_ptr: RCU pointer, whose old value is returned
* @ptr: regular pointer
* @c: the lockdep conditions under which the dereference will take place
*
* Perform a replacement, where @rcu_ptr is an RCU-annotated
* pointer and @c is the lockdep argument that is passed to the
* rcu_dereference_protected() call used to read that pointer. The old
* value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr.
*/
#define rcu_replace_pointer(rcu_ptr, ptr, c) \
({ \
typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \
rcu_assign_pointer((rcu_ptr), (ptr)); \
__tmp; \
})
/**
* rcu_access_pointer() - fetch RCU pointer with no dereferencing
* @p: The pointer to read
*
* Return the value of the specified RCU-protected pointer, but omit the
* lockdep checks for being in an RCU read-side critical section. This is
* useful when the value of this pointer is accessed, but the pointer is
* not dereferenced, for example, when testing an RCU-protected pointer
* against NULL. Although rcu_access_pointer() may also be used in cases
* where update-side locks prevent the value of the pointer from changing,
* you should instead use rcu_dereference_protected() for this use case.
*
* It is also permissible to use rcu_access_pointer() when read-side
* access to the pointer was removed at least one grace period ago, as
* is the case in the context of the RCU callback that is freeing up
* the data, or after a synchronize_rcu() returns. This can be useful
* when tearing down multi-linked structures after a grace period
* has elapsed.
*/
#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu)
/**
* rcu_dereference_check() - rcu_dereference with debug checking
* @p: The pointer to read, prior to dereferencing
* @c: The conditions under which the dereference will take place
*
* Do an rcu_dereference(), but check that the conditions under which the
* dereference will take place are correct. Typically the conditions
* indicate the various locking conditions that should be held at that
* point. The check should return true if the conditions are satisfied.
* An implicit check for being in an RCU read-side critical section
* (rcu_read_lock()) is included.
*
* For example:
*
* bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
*
* could be used to indicate to lockdep that foo->bar may only be dereferenced
* if either rcu_read_lock() is held, or that the lock required to replace
* the bar struct at foo->bar is held.
*
* Note that the list of conditions may also include indications of when a lock
* need not be held, for example during initialisation or destruction of the
* target struct:
*
* bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
* atomic_read(&foo->usage) == 0);
*
* Inserts memory barriers on architectures that require them
* (currently only the Alpha), prevents the compiler from refetching
* (and from merging fetches), and, more importantly, documents exactly
* which pointers are protected by RCU and checks that the pointer is
* annotated as __rcu.
*/
#define rcu_dereference_check(p, c) \
__rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu)
/**
* rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
* @p: The pointer to read, prior to dereferencing
* @c: The conditions under which the dereference will take place
*
* This is the RCU-bh counterpart to rcu_dereference_check(). However,
* please note that starting in v5.0 kernels, vanilla RCU grace periods
* wait for local_bh_disable() regions of code in addition to regions of
* code demarked by rcu_read_lock() and rcu_read_unlock(). This means
* that synchronize_rcu(), call_rcu, and friends all take not only
* rcu_read_lock() but also rcu_read_lock_bh() into account.
*/
#define rcu_dereference_bh_check(p, c) \
__rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu)
/**
* rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
* @p: The pointer to read, prior to dereferencing
* @c: The conditions under which the dereference will take place
*
* This is the RCU-sched counterpart to rcu_dereference_check().
* However, please note that starting in v5.0 kernels, vanilla RCU grace
* periods wait for preempt_disable() regions of code in addition to
* regions of code demarked by rcu_read_lock() and rcu_read_unlock().
* This means that synchronize_rcu(), call_rcu, and friends all take not
* only rcu_read_lock() but also rcu_read_lock_sched() into account.
*/
#define rcu_dereference_sched_check(p, c) \
__rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \
__rcu)
/*
* The tracing infrastructure traces RCU (we want that), but unfortunately
* some of the RCU checks causes tracing to lock up the system.
*
* The no-tracing version of rcu_dereference_raw() must not call
* rcu_read_lock_held().
*/
#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu)
/**
* rcu_dereference_protected() - fetch RCU pointer when updates prevented
* @p: The pointer to read, prior to dereferencing
* @c: The conditions under which the dereference will take place
*
* Return the value of the specified RCU-protected pointer, but omit
* the READ_ONCE(). This is useful in cases where update-side locks
* prevent the value of the pointer from changing. Please note that this
* primitive does *not* prevent the compiler from repeating this reference
* or combining it with other references, so it should not be used without
* protection of appropriate locks.
*
* This function is only for update-side use. Using this function
* when protected only by rcu_read_lock() will result in infrequent
* but very ugly failures.
*/
#define rcu_dereference_protected(p, c) \
__rcu_dereference_protected((p), (c), __rcu)
/**
* rcu_dereference() - fetch RCU-protected pointer for dereferencing
* @p: The pointer to read, prior to dereferencing
*
* This is a simple wrapper around rcu_dereference_check().
*/
#define rcu_dereference(p) rcu_dereference_check(p, 0)
/**
* rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
* @p: The pointer to read, prior to dereferencing
*
* Makes rcu_dereference_check() do the dirty work.
*/
#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0)
/**
* rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
* @p: The pointer to read, prior to dereferencing
*
* Makes rcu_dereference_check() do the dirty work.
*/
#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)
/**
* rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism
* @p: The pointer to hand off
*
* This is simply an identity function, but it documents where a pointer
* is handed off from RCU to some other synchronization mechanism, for
* example, reference counting or locking. In C11, it would map to
* kill_dependency(). It could be used as follows::
*
* rcu_read_lock();
* p = rcu_dereference(gp);
* long_lived = is_long_lived(p);
* if (long_lived) {
* if (!atomic_inc_not_zero(p->refcnt))
* long_lived = false;
* else
* p = rcu_pointer_handoff(p);
* }
* rcu_read_unlock();
*/
#define rcu_pointer_handoff(p) (p)
/**
* rcu_read_lock() - mark the beginning of an RCU read-side critical section
*
* When synchronize_rcu() is invoked on one CPU while other CPUs
* are within RCU read-side critical sections, then the
* synchronize_rcu() is guaranteed to block until after all the other
* CPUs exit their critical sections. Similarly, if call_rcu() is invoked
* on one CPU while other CPUs are within RCU read-side critical
* sections, invocation of the corresponding RCU callback is deferred
* until after the all the other CPUs exit their critical sections.
*
* In v5.0 and later kernels, synchronize_rcu() and call_rcu() also
* wait for regions of code with preemption disabled, including regions of
* code with interrupts or softirqs disabled. In pre-v5.0 kernels, which
* define synchronize_sched(), only code enclosed within rcu_read_lock()
* and rcu_read_unlock() are guaranteed to be waited for.
*
* Note, however, that RCU callbacks are permitted to run concurrently
* with new RCU read-side critical sections. One way that this can happen
* is via the following sequence of events: (1) CPU 0 enters an RCU
* read-side critical section, (2) CPU 1 invokes call_rcu() to register
* an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
* (4) CPU 2 enters a RCU read-side critical section, (5) the RCU
* callback is invoked. This is legal, because the RCU read-side critical
* section that was running concurrently with the call_rcu() (and which
* therefore might be referencing something that the corresponding RCU
* callback would free up) has completed before the corresponding
* RCU callback is invoked.
*
* RCU read-side critical sections may be nested. Any deferred actions
* will be deferred until the outermost RCU read-side critical section
* completes.
*
* You can avoid reading and understanding the next paragraph by
* following this rule: don't put anything in an rcu_read_lock() RCU
* read-side critical section that would block in a !PREEMPTION kernel.
* But if you want the full story, read on!
*
* In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU),
* it is illegal to block while in an RCU read-side critical section.
* In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION
* kernel builds, RCU read-side critical sections may be preempted,
* but explicit blocking is illegal. Finally, in preemptible RCU
* implementations in real-time (with -rt patchset) kernel builds, RCU
* read-side critical sections may be preempted and they may also block, but
* only when acquiring spinlocks that are subject to priority inheritance.
*/
static __always_inline void rcu_read_lock(void)
{
__rcu_read_lock();
__acquire(RCU);
rcu_lock_acquire(&rcu_lock_map);
RCU_LOCKDEP_WARN(!rcu_is_watching(),
"rcu_read_lock() used illegally while idle");
}
/*
* So where is rcu_write_lock()? It does not exist, as there is no
* way for writers to lock out RCU readers. This is a feature, not
* a bug -- this property is what provides RCU's performance benefits.
* Of course, writers must coordinate with each other. The normal
* spinlock primitives work well for this, but any other technique may be
* used as well. RCU does not care how the writers keep out of each
* others' way, as long as they do so.
*/
/**
* rcu_read_unlock() - marks the end of an RCU read-side critical section.
*
* In almost all situations, rcu_read_unlock() is immune from deadlock.
* In recent kernels that have consolidated synchronize_sched() and
* synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity
* also extends to the scheduler's runqueue and priority-inheritance
* spinlocks, courtesy of the quiescent-state deferral that is carried
* out when rcu_read_unlock() is invoked with interrupts disabled.
*
* See rcu_read_lock() for more information.
*/
static inline void rcu_read_unlock(void)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(),
"rcu_read_unlock() used illegally while idle");
__release(RCU);
__rcu_read_unlock();
rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
}
/**
* rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
*
* This is equivalent to rcu_read_lock(), but also disables softirqs.
* Note that anything else that disables softirqs can also serve as an RCU
* read-side critical section. However, please note that this equivalence
* applies only to v5.0 and later. Before v5.0, rcu_read_lock() and
* rcu_read_lock_bh() were unrelated.
*
* Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
* must occur in the same context, for example, it is illegal to invoke
* rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh()
* was invoked from some other task.
*/
static inline void rcu_read_lock_bh(void)
{
local_bh_disable();
__acquire(RCU_BH);
rcu_lock_acquire(&rcu_bh_lock_map);
RCU_LOCKDEP_WARN(!rcu_is_watching(),
"rcu_read_lock_bh() used illegally while idle");
}
/**
* rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section
*
* See rcu_read_lock_bh() for more information.
*/
static inline void rcu_read_unlock_bh(void)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(),
"rcu_read_unlock_bh() used illegally while idle");
rcu_lock_release(&rcu_bh_lock_map);
__release(RCU_BH);
local_bh_enable();
}
/**
* rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
*
* This is equivalent to rcu_read_lock(), but also disables preemption.
* Read-side critical sections can also be introduced by anything else that
* disables preemption, including local_irq_disable() and friends. However,
* please note that the equivalence to rcu_read_lock() applies only to
* v5.0 and later. Before v5.0, rcu_read_lock() and rcu_read_lock_sched()
* were unrelated.
*
* Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
* must occur in the same context, for example, it is illegal to invoke
* rcu_read_unlock_sched() from process context if the matching
* rcu_read_lock_sched() was invoked from an NMI handler.
*/
static inline void rcu_read_lock_sched(void)
{
preempt_disable();
__acquire(RCU_SCHED);
rcu_lock_acquire(&rcu_sched_lock_map);
RCU_LOCKDEP_WARN(!rcu_is_watching(),
"rcu_read_lock_sched() used illegally while idle");
}
/* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */
static inline notrace void rcu_read_lock_sched_notrace(void)
{
preempt_disable_notrace();
__acquire(RCU_SCHED);
}
/**
* rcu_read_unlock_sched() - marks the end of a RCU-classic critical section
*
* See rcu_read_lock_sched() for more information.
*/
static inline void rcu_read_unlock_sched(void)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(),
"rcu_read_unlock_sched() used illegally while idle");
rcu_lock_release(&rcu_sched_lock_map);
__release(RCU_SCHED);
preempt_enable();
}
/* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */
static inline notrace void rcu_read_unlock_sched_notrace(void)
{
__release(RCU_SCHED);
preempt_enable_notrace();
}
/**
* RCU_INIT_POINTER() - initialize an RCU protected pointer
* @p: The pointer to be initialized.
* @v: The value to initialized the pointer to.
*
* Initialize an RCU-protected pointer in special cases where readers
* do not need ordering constraints on the CPU or the compiler. These
* special cases are:
*
* 1. This use of RCU_INIT_POINTER() is NULLing out the pointer *or*
* 2. The caller has taken whatever steps are required to prevent
* RCU readers from concurrently accessing this pointer *or*
* 3. The referenced data structure has already been exposed to
* readers either at compile time or via rcu_assign_pointer() *and*
*
* a. You have not made *any* reader-visible changes to
* this structure since then *or*
* b. It is OK for readers accessing this structure from its
* new location to see the old state of the structure. (For
* example, the changes were to statistical counters or to
* other state where exact synchronization is not required.)
*
* Failure to follow these rules governing use of RCU_INIT_POINTER() will
* result in impossible-to-diagnose memory corruption. As in the structures
* will look OK in crash dumps, but any concurrent RCU readers might
* see pre-initialized values of the referenced data structure. So
* please be very careful how you use RCU_INIT_POINTER()!!!
*
* If you are creating an RCU-protected linked structure that is accessed
* by a single external-to-structure RCU-protected pointer, then you may
* use RCU_INIT_POINTER() to initialize the internal RCU-protected
* pointers, but you must use rcu_assign_pointer() to initialize the
* external-to-structure pointer *after* you have completely initialized
* the reader-accessible portions of the linked structure.
*
* Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no
* ordering guarantees for either the CPU or the compiler.
*/
#define RCU_INIT_POINTER(p, v) \
do { \
rcu_check_sparse(p, __rcu); \
WRITE_ONCE(p, RCU_INITIALIZER(v)); \
} while (0)
/**
* RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer
* @p: The pointer to be initialized.
* @v: The value to initialized the pointer to.
*
* GCC-style initialization for an RCU-protected pointer in a structure field.
*/
#define RCU_POINTER_INITIALIZER(p, v) \
.p = RCU_INITIALIZER(v)
/*
* Does the specified offset indicate that the corresponding rcu_head
* structure can be handled by kvfree_rcu()?
*/
#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
/**
* kfree_rcu() - kfree an object after a grace period.
* @ptr: pointer to kfree for both single- and double-argument invocations.
* @rhf: the name of the struct rcu_head within the type of @ptr,
* but only for double-argument invocations.
*
* Many rcu callbacks functions just call kfree() on the base structure.
* These functions are trivial, but their size adds up, and furthermore
* when they are used in a kernel module, that module must invoke the
* high-latency rcu_barrier() function at module-unload time.
*
* The kfree_rcu() function handles this issue. Rather than encoding a
* function address in the embedded rcu_head structure, kfree_rcu() instead
* encodes the offset of the rcu_head structure within the base structure.
* Because the functions are not allowed in the low-order 4096 bytes of
* kernel virtual memory, offsets up to 4095 bytes can be accommodated.
* If the offset is larger than 4095 bytes, a compile-time error will
* be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
* either fall back to use of call_rcu() or rearrange the structure to
* position the rcu_head structure into the first 4096 bytes.
*
* Note that the allowable offset might decrease in the future, for example,
* to allow something like kmem_cache_free_rcu().
*
* The BUILD_BUG_ON check must not involve any function calls, hence the
* checks are done in macros here.
*/
#define kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf)
/**
* kvfree_rcu() - kvfree an object after a grace period.
*
* This macro consists of one or two arguments and it is
* based on whether an object is head-less or not. If it
* has a head then a semantic stays the same as it used
* to be before:
*
* kvfree_rcu(ptr, rhf);
*
* where @ptr is a pointer to kvfree(), @rhf is the name
* of the rcu_head structure within the type of @ptr.
*
* When it comes to head-less variant, only one argument
* is passed and that is just a pointer which has to be
* freed after a grace period. Therefore the semantic is
*
* kvfree_rcu(ptr);
*
* where @ptr is a pointer to kvfree().
*
* Please note, head-less way of freeing is permitted to
* use from a context that has to follow might_sleep()
* annotation. Otherwise, please switch and embed the
* rcu_head structure within the type of @ptr.
*/
#define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__, \
kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__)
#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME
#define kvfree_rcu_arg_2(ptr, rhf) \
do { \
typeof (ptr) ___p = (ptr); \
\
if (___p) { \
BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \
kvfree_call_rcu(&((___p)->rhf), (rcu_callback_t)(unsigned long) \
(offsetof(typeof(*(ptr)), rhf))); \
} \
} while (0)
#define kvfree_rcu_arg_1(ptr) \
do { \
typeof(ptr) ___p = (ptr); \
\
if (___p) \
kvfree_call_rcu(NULL, (rcu_callback_t) (___p)); \
} while (0)
/*
* Place this after a lock-acquisition primitive to guarantee that
* an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies
* if the UNLOCK and LOCK are executed by the same CPU or if the
* UNLOCK and LOCK operate on the same lock variable.
*/
#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE
#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */
#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
#define smp_mb__after_unlock_lock() do { } while (0)
#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
/* Has the specified rcu_head structure been handed to call_rcu()? */
/**
* rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
* @rhp: The rcu_head structure to initialize.
*
* If you intend to invoke rcu_head_after_call_rcu() to test whether a
* given rcu_head structure has already been passed to call_rcu(), then
* you must also invoke this rcu_head_init() function on it just after
* allocating that structure. Calls to this function must not race with
* calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
*/
static inline void rcu_head_init(struct rcu_head *rhp)
{
rhp->func = (rcu_callback_t)~0L;
}
/**
* rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()?
* @rhp: The rcu_head structure to test.
* @f: The function passed to call_rcu() along with @rhp.
*
* Returns @true if the @rhp has been passed to call_rcu() with @func,
* and @false otherwise. Emits a warning in any other case, including
* the case where @rhp has already been invoked after a grace period.
* Calls to this function must not race with callback invocation. One way
* to avoid such races is to enclose the call to rcu_head_after_call_rcu()
* in an RCU read-side critical section that includes a read-side fetch
* of the pointer to the structure containing @rhp.
*/
static inline bool
rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
{
rcu_callback_t func = READ_ONCE(rhp->func);
if (func == f)
return true;
WARN_ON_ONCE(func != (rcu_callback_t)~0L);
return false;
}
/* kernel/ksysfs.c definitions */
extern int rcu_expedited;
extern int rcu_normal;
#endif /* __LINUX_RCUPDATE_H */
/*
* kernel/cpuset.c
*
* Processor and Memory placement constraints for sets of tasks.
*
* Copyright (C) 2003 BULL SA.
* Copyright (C) 2004-2007 Silicon Graphics, Inc.
* Copyright (C) 2006 Google, Inc
*
* Portions derived from Patrick Mochel's sysfs code.
* sysfs is Copyright (c) 2001-3 Patrick Mochel
*
* 2003-10-10 Written by Simon Derr.
* 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson.
* 2006 Rework by Paul Menage to use generic cgroups
* 2008 Rework of the scheduler domains and CPU hotplug handling
* by Max Krasnyansky
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
struct fmeter {
int cnt; /* unprocessed events count */
int val; /* most recent output value */
time64_t time; /* clock (secs) when val computed */
spinlock_t lock; /* guards read or write of above */
};
struct cpuset {
struct cgroup_subsys_state css;
unsigned long flags; /* "unsigned long" so bitops work */
/*
* On default hierarchy:
*
* The user-configured masks can only be changed by writing to
* cpuset.cpus and cpuset.mems, and won't be limited by the
* parent masks.
*
* The effective masks is the real masks that apply to the tasks
* in the cpuset. They may be changed if the configured masks are
* changed or hotplug happens.
*
* effective_mask == configured_mask & parent's effective_mask,
* and if it ends up empty, it will inherit the parent's mask.
*
*
* On legacy hierarchy:
*
* The user-configured masks are always the same with effective masks.
*/
/* user-configured CPUs and Memory Nodes allow to tasks */
cpumask_var_t cpus_allowed;
nodemask_t mems_allowed;
/* effective CPUs and Memory Nodes allow to tasks */
cpumask_var_t effective_cpus;
nodemask_t effective_mems;
/*
* CPUs allocated to child sub-partitions (default hierarchy only)
* - CPUs granted by the parent = effective_cpus U subparts_cpus
* - effective_cpus and subparts_cpus are mutually exclusive.
*
* effective_cpus contains only onlined CPUs, but subparts_cpus
* may have offlined ones.
*/
cpumask_var_t subparts_cpus;
/*
* This is old Memory Nodes tasks took on.
*
* - top_cpuset.old_mems_allowed is initialized to mems_allowed.
* - A new cpuset's old_mems_allowed is initialized when some
* task is moved into it.
* - old_mems_allowed is used in cpuset_migrate_mm() when we change
* cpuset.mems_allowed and have tasks' nodemask updated, and
* then old_mems_allowed is updated to mems_allowed.
*/
nodemask_t old_mems_allowed;
struct fmeter fmeter; /* memory_pressure filter */
/*
* Tasks are being attached to this cpuset. Used to prevent
* zeroing cpus/mems_allowed between ->can_attach() and ->attach().
*/
int attach_in_progress;
/* partition number for rebuild_sched_domains() */
int pn;
/* for custom sched domain */
int relax_domain_level;
/* number of CPUs in subparts_cpus */
int nr_subparts_cpus;
/* partition root state */
int partition_root_state;
/*
* Default hierarchy only:
* use_parent_ecpus - set if using parent's effective_cpus
* child_ecpus_count - # of children with use_parent_ecpus set
*/
int use_parent_ecpus;
int child_ecpus_count;
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
};
/*
* Partition root states:
*
* 0 - not a partition root
*
* 1 - partition root
*
* -1 - invalid partition root
* None of the cpus in cpus_allowed can be put into the parent's
* subparts_cpus. In this case, the cpuset is not a real partition
* root anymore. However, the CPU_EXCLUSIVE bit will still be set
* and the cpuset can be restored back to a partition root if the
* parent cpuset can give more CPUs back to this child cpuset.
*/
#define PRS_DISABLED 0
#define PRS_ENABLED 1
#define PRS_ERROR -1
/*
* Temporary cpumasks for working with partitions that are passed among
* functions to avoid memory allocation in inner functions.
*/
struct tmpmasks {
cpumask_var_t addmask, delmask; /* For partition root */
cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
};
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct cpuset, css) : NULL;
}
/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
return css_cs(task_css(task, cpuset_cgrp_id));
}
static inline struct cpuset *parent_cs(struct cpuset *cs)
{
return css_cs(cs->css.parent);
}
/* bits in struct cpuset flags field */
typedef enum {
CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
CS_MEMORY_MIGRATE,
CS_SCHED_LOAD_BALANCE,
CS_SPREAD_PAGE,
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}
static inline int is_mem_exclusive(const struct cpuset *cs)
{
return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}
static inline int is_mem_hardwall(const struct cpuset *cs)
{
return test_bit(CS_MEM_HARDWALL, &cs->flags);
}
static inline int is_sched_load_balance(const struct cpuset *cs)
{
return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
static inline int is_memory_migrate(const struct cpuset *cs)
{
return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}
static inline int is_spread_page(const struct cpuset *cs)
{
return test_bit(CS_SPREAD_PAGE, &cs->flags);
}
static inline int is_spread_slab(const struct cpuset *cs)
{
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
static inline int is_partition_root(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}
/*
* Send notification event of whenever partition_root_state changes.
*/
static inline void notify_partition_change(struct cpuset *cs,
int old_prs, int new_prs)
{
if (old_prs != new_prs)
cgroup_file_notify(&cs->partition_file);
}
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
.partition_root_state = PRS_ENABLED,
};
/**
* cpuset_for_each_child - traverse online children of a cpuset
* @child_cs: loop cursor pointing to the current child
* @pos_css: used for iteration
* @parent_cs: target cpuset to walk children of
*
* Walk @child_cs through the online children of @parent_cs. Must be used
* with RCU read locked.
*/
#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
css_for_each_child((pos_css), &(parent_cs)->css) \
if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
/**
* cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
* @des_cs: loop cursor pointing to the current descendant
* @pos_css: used for iteration
* @root_cs: target cpuset to walk ancestor of
*
* Walk @des_cs through the online descendants of @root_cs. Must be used
* with RCU read locked. The caller may modify @pos_css by calling
* css_rightmost_descendant() to skip subtree. @root_cs is included in the
* iteration and the first node to be visited.
*/
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
* There are two global locks guarding cpuset structures - cpuset_rwsem and
* callback_lock. We also require taking task_lock() when dereferencing a
* task's cpuset pointer. See "The task_lock() exception", at the end of this
* comment. The cpuset code uses only cpuset_rwsem write lock. Other
* kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
* prevent change to cpuset structures.
*
* A task must hold both locks to modify cpusets. If a task holds
* cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
* is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
* first, knowing nothing will change. It can also allocate memory while
* just holding cpuset_rwsem. While it is performing these checks, various
* callback routines can briefly acquire callback_lock to query cpusets.
* Once it is ready to make the changes, it takes callback_lock, blocking
* everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_lock, as that would risk double tripping on callback_lock
* from one of the callbacks into the cpuset code from within
* __alloc_pages().
*
* If a task is only holding callback_lock, then it has read-only
* access to cpusets.
*
* Now, the task_struct fields mems_allowed and mempolicy may be changed
* by other task, we use alloc_lock in the task_struct fields to protect
* them.
*
* The cpuset_common_file_read() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*
* Accessing a task's cpuset should be done in accordance with the
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
void cpuset_read_lock(void)
{
percpu_down_read(&cpuset_rwsem);
}
void cpuset_read_unlock(void)
{
percpu_up_read(&cpuset_rwsem);
}
static DEFINE_SPINLOCK(callback_lock);
static struct workqueue_struct *cpuset_migrate_mm_wq;
/*
* CPU / memory hotplug is handled asynchronously.
*/
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
/*
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
* the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
* With v2 behavior, "cpus" and "mems" are always what the users have
* requested and won't be changed by hotplug events. Only the effective
* cpus or mems will be affected.
*/
static inline bool is_in_v2_mode(void)
{
return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
/*
* Return in pmask the portion of a task's cpusets's cpus_allowed that
* are online and are capable of running the task. If none are found,
* walk up the cpuset hierarchy until we find one that does have some
* appropriate cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
* Call with callback_lock or cpuset_rwsem held.
*/
static void guarantee_online_cpus(struct task_struct *tsk,
struct cpumask *pmask)
{
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
struct cpuset *cs;
if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
cpumask_copy(pmask, cpu_online_mask);
rcu_read_lock();
cs = task_cs(tsk);
while (!cpumask_intersects(cs->effective_cpus, pmask)) {
cs = parent_cs(cs);
if (unlikely(!cs)) {
/*
* The top cpuset doesn't have any online cpu as a
* consequence of a race between cpuset_hotplug_work
* and cpu hotplug notifier. But we know the top
* cpuset's effective_cpus is on its way to be
* identical to cpu_online_mask.
*/
goto out_unlock;
}
}
cpumask_and(pmask, pmask, cs->effective_cpus);
out_unlock:
rcu_read_unlock();
}
/*
* Return in *pmask the portion of a cpusets's mems_allowed that
* are online, with memory. If none are online with memory, walk
* up the cpuset hierarchy until we find one that does have some
* online mems. The top cpuset always has some mems online.
*
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
* Call with callback_lock or cpuset_rwsem held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
* Call with callback_lock or cpuset_rwsem held.
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
{
if (is_spread_page(cs))
task_set_spread_page(tsk);
else
task_clear_spread_page(tsk);
if (is_spread_slab(cs))
task_set_spread_slab(tsk);
else
task_clear_spread_slab(tsk);
}
/*
* is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
* are only set if the other's are set. Call holding cpuset_rwsem.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
nodes_subset(p->mems_allowed, q->mems_allowed) &&
is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
is_mem_exclusive(p) <= is_mem_exclusive(q);
}
/**
* alloc_cpumasks - allocate three cpumasks for cpuset
* @cs: the cpuset that have cpumasks to be allocated.
* @tmp: the tmpmasks structure pointer
* Return: 0 if successful, -ENOMEM otherwise.
*
* Only one of the two input arguments should be non-NULL.
*/
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
cpumask_var_t *pmask1, *pmask2, *pmask3;
if (cs) {
pmask1 = &cs->cpus_allowed;
pmask2 = &cs->effective_cpus;
pmask3 = &cs->subparts_cpus;
} else {
pmask1 = &tmp->new_cpus;
pmask2 = &tmp->addmask;
pmask3 = &tmp->delmask;
}
if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
return -ENOMEM;
if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
goto free_one;
if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
goto free_two;
return 0;
free_two:
free_cpumask_var(*pmask2);
free_one:
free_cpumask_var(*pmask1);
return -ENOMEM;
}
/**
* free_cpumasks - free cpumasks in a tmpmasks structure
* @cs: the cpuset that have cpumasks to be free.
* @tmp: the tmpmasks structure pointer
*/
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
if (cs) {
free_cpumask_var(cs->cpus_allowed);
free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->subparts_cpus);
}
if (tmp) {
free_cpumask_var(tmp->new_cpus);
free_cpumask_var(tmp->addmask);
free_cpumask_var(tmp->delmask);
}
}
/**
* alloc_trial_cpuset - allocate a trial cpuset
* @cs: the cpuset that the trial cpuset duplicates
*/
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
struct cpuset *trial;
trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
if (!trial)
return NULL;
if (alloc_cpumasks(trial, NULL)) {
kfree(trial);
return NULL;
}
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
}
/**
* free_cpuset - free the cpuset
* @cs: the cpuset to be freed
*/
static inline void free_cpuset(struct cpuset *cs)
{
free_cpumasks(cs, NULL);
kfree(cs);
}
/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
*
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
* cpuset_rwsem held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
* cpuset in the list must use cur below, not trial.
*
* 'trial' is the address of bulk structure copy of cur, with
* perhaps one or more of the fields cpus_allowed, mems_allowed,
* or flags changed to new, trial values.
*
* Return 0 if valid, -errno if not.
*/
static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
int ret;
rcu_read_lock();
/* Each of our child cpusets must be a subset of us */
ret = -EBUSY;
cpuset_for_each_child(c, css, cur)
if (!is_cpuset_subset(c, trial))
goto out;
/* Remaining checks don't apply to root cpuset */
ret = 0;
if (cur == &top_cpuset)
goto out;
par = parent_cs(cur);
/* On legacy hierarchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
goto out;
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
*/
ret = -EINVAL;
cpuset_for_each_child(c, css, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
goto out;
}
/*
* Cpusets with tasks - existing or newly being attached - can't
* be changed to have empty cpus_allowed or mems_allowed.
*/
ret = -ENOSPC;
if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
if (!cpumask_empty(cur->cpus_allowed) &&
cpumask_empty(trial->cpus_allowed))
goto out;
if (!nodes_empty(cur->mems_allowed) &&
nodes_empty(trial->mems_allowed))
goto out;
}
/*
* We can't shrink if we won't have enough room for SCHED_DEADLINE
* tasks.
*/
ret = -EBUSY;
if (is_cpu_exclusive(cur) &&
!cpuset_cpumask_can_shrink(cur->cpus_allowed,
trial->cpus_allowed))
goto out;
ret = 0;
out:
rcu_read_unlock();
return ret;
}
#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
* Do cpusets a, b have overlapping effective cpus_allowed masks?
*/
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}
static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
if (dattr->relax_domain_level < c->relax_domain_level)
dattr->relax_domain_level = c->relax_domain_level;
return;
}
static void update_domain_attr_tree(struct sched_domain_attr *dattr,
struct cpuset *root_cs)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
/* skip the whole subtree if @cp doesn't have any CPU */
if (cpumask_empty(cp->cpus_allowed)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp);
}
rcu_read_unlock();
}
/* Must be called with cpuset_rwsem held. */
static inline int nr_cpusets(void)
{
/* jump label reference count + the top-level cpuset */
return static_key_count(&cpusets_enabled_key.key) + 1;
}
/*
* generate_sched_domains()
*
* This function builds a partial partition of the systems CPUs
* A 'partial partition' is a set of non-overlapping subsets whose
* union is a subset of that set.
* The output of this function needs to be passed to kernel/sched/core.c
* partition_sched_domains() routine, which will rebuild the scheduler's
* load balancing domains (sched domains) as specified by that partial
* partition.
*
* See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
* for a background explanation of this.
*
* Does not return errors, on the theory that the callers of this
* routine would rather not worry about failures to rebuild sched
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
* Must be called with cpuset_rwsem held.
*
* The three key local variables below are:
* cp - cpuset pointer, used (together with pos_css) to perform a
* top-down scan of all cpusets. For our purposes, rebuilding
* the schedulers sched domains, we can ignore !is_sched_load_
* balance cpusets.
* csa - (for CpuSet Array) Array of pointers to all the cpusets
* that need to be load balanced, for convenient iterative
* access by the subsequent code that finds the best partition,
* i.e the set of domains (subsets) of CPUs such that the
* cpus_allowed of every cpuset marked is_sched_load_balance
* is a subset of one of these domains, while there are as
* many such domains as possible, each as small as possible.
* doms - Conversion of 'csa' to an array of cpumasks, for passing to
* the kernel/sched/core.c routine partition_sched_domains() in a
* convenient format, that can be easily compared to the prior
* value to determine what partition elements (sched domains)
* were changed (added or removed.)
*
* Finding the best partition (set of domains):
* The triple nested loops below over i, j, k scan over the
* load balanced cpusets (using the array of cpuset pointers in
* csa[]) looking for pairs of cpusets that have overlapping
* cpus_allowed, but which don't have the same 'pn' partition
* number and gives them in the same partition number. It keeps
* looping on the 'restart' label until it can no longer find
* any such pairs.
*
* The union of the cpus_allowed masks from the set of
* all cpusets having the same 'pn' value then form the one
* element of the partition (one sched domain) to be passed to
* partition_sched_domains().
*/
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
struct cpuset *cp; /* top-down scan of cpusets */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
bool root_load_balance = is_sched_load_balance(&top_cpuset);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
goto done;
dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
if (dattr) {
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
housekeeping_cpumask(HK_FLAG_DOMAIN));
goto done;
}
csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
csn = 0;
rcu_read_lock();
if (root_load_balance)
csa[csn++] = &top_cpuset;
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
if (cp == &top_cpuset)
continue;
/*
* Continue traversing beyond @cp iff @cp has some CPUs and
* isn't load balancing. The former is obvious. The
* latter: All child cpusets contain a subset of the
* parent's cpus, so just skip them, and then we call
* update_domain_attr_tree() to calc relax_domain_level of
* the corresponding sched domain.
*
* If root is load-balancing, we can skip @cp if it
* is a subset of the root's effective_cpus.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
cpumask_intersects(cp->cpus_allowed,
housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
if (root_load_balance &&
cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
continue;
if (is_sched_load_balance(cp) &&
!cpumask_empty(cp->effective_cpus))
csa[csn++] = cp;
/* skip @cp's subtree if not a partition root */
if (!is_partition_root(cp))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
for (i = 0; i < csn; i++)
csa[i]->pn = i;
ndoms = csn;
restart:
/* Find the best partition (set of sched domains) */
for (i = 0; i < csn; i++) {
struct cpuset *a = csa[i];
int apn = a->pn;
for (j = 0; j < csn; j++) {
struct cpuset *b = csa[j];
int bpn = b->pn;
if (apn != bpn && cpusets_overlap(a, b)) {
for (k = 0; k < csn; k++) {
struct cpuset *c = csa[k];
if (c->pn == bpn)
c->pn = apn;
}
ndoms--; /* one less element */
goto restart;
}
}
}
/*
* Now we know how many domains to create.
* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
*/
doms = alloc_sched_domains(ndoms);
if (!doms)
goto done;
/*
* The rest of the code, including the scheduler, can deal with
* dattr==NULL case. No need to abort if alloc fails.
*/
dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
GFP_KERNEL);
for (nslot = 0, i = 0; i < csn; i++) {
struct cpuset *a = csa[i];
struct cpumask *dp;
int apn = a->pn;
if (apn < 0) {
/* Skip completed partitions */
continue;
}
dp = doms[nslot];
if (nslot == ndoms) {
static int warnings = 10;
if (warnings) {
pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
nslot, ndoms, csn, i, apn);
warnings--;
}
continue;
}
cpumask_clear(dp);
if (dattr)
*(dattr + nslot) = SD_ATTR_INIT;
for (j = i; j < csn; j++) {
struct cpuset *b = csa[j];
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
/* Done with this partition */
b->pn = -1;
}
}
nslot++;
}
BUG_ON(nslot != ndoms);
done:
kfree(csa);
/*
* Fallback to the default domain if kmalloc() failed.
* See comments in partition_sched_domains().
*/
if (doms == NULL)
ndoms = 1;
*domains = doms;
*attributes = dattr;
return ndoms;
}
static void update_tasks_root_domain(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
dl_add_task_root_domain(task);
css_task_iter_end(&it);
}
static void rebuild_root_domains(void)
{
struct cpuset *cs = NULL;
struct cgroup_subsys_state *pos_css;
percpu_rwsem_assert_held(&cpuset_rwsem);
lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex);
rcu_read_lock();
/*
* Clear default root domain DL accounting, it will be computed again
* if a task belongs to it.
*/
dl_clear_root_domain(&def_root_domain);
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
if (cpumask_empty(cs->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
css_get(&cs->css);
rcu_read_unlock();
update_tasks_root_domain(cs);
rcu_read_lock();
css_put(&cs->css);
}
rcu_read_unlock();
}
static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new)
{
mutex_lock(&sched_domains_mutex);
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
rebuild_root_domains();
mutex_unlock(&sched_domains_mutex);
}
/*
* Rebuild scheduler domains.
*
* If the flag 'sched_load_balance' of any cpuset with non-empty
* 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
* which has that flag enabled, or if any cpuset with a non-empty
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
* Call with cpuset_rwsem held. Takes cpus_read_lock().
*/
static void rebuild_sched_domains_locked(void)
{
struct cgroup_subsys_state *pos_css;
struct sched_domain_attr *attr;
cpumask_var_t *doms;
struct cpuset *cs;
int ndoms;
lockdep_assert_cpus_held();
percpu_rwsem_assert_held(&cpuset_rwsem);
/*
* If we have raced with CPU hotplug, return early to avoid
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
*
* With no CPUs in any subpartitions, top_cpuset's effective CPUs
* should be the same as the active CPUs, so checking only top_cpuset
* is enough to detect racing CPU offlines.
*/
if (!top_cpuset.nr_subparts_cpus &&
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
return;
/*
* With subpartition CPUs, however, the effective CPUs of a partition
* root should be only a subset of the active CPUs. Since a CPU in any
* partition root could be offlined, all must be checked.
*/
if (top_cpuset.nr_subparts_cpus) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
if (!is_partition_root(cs)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
if (!cpumask_subset(cs->effective_cpus,
cpu_active_mask)) {
rcu_read_unlock();
return;
}
}
rcu_read_unlock();
}
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */
void rebuild_sched_domains(void)
{
cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
rebuild_sched_domains_locked();
percpu_up_write(&cpuset_rwsem);
cpus_read_unlock();
}
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its cpus_allowed to the
* effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_cpumask(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
}
/**
* compute_effective_cpumask - Compute the effective cpumask of the cpuset
* @new_cpus: the temp variable for the new effective_cpus mask
* @cs: the cpuset the need to recompute the new effective_cpus mask
* @parent: the parent cpuset
*
* If the parent has subpartition CPUs, include them in the list of
* allowable CPUs in computing the new effective_cpus mask. Since offlined
* CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
* to mask those out.
*/
static void compute_effective_cpumask(struct cpumask *new_cpus,
struct cpuset *cs, struct cpuset *parent)
{
if (parent->nr_subparts_cpus) {
cpumask_or(new_cpus, parent->effective_cpus,
parent->subparts_cpus);
cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
cpumask_and(new_cpus, new_cpus, cpu_active_mask);
} else {
cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
}
}
/*
* Commands for update_parent_subparts_cpumask
*/
enum subparts_cmd {
partcmd_enable, /* Enable partition root */
partcmd_disable, /* Disable partition root */
partcmd_update, /* Update parent's subparts_cpus */
};
/**
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
* @cpuset: The cpuset that requests change in partition root state
* @cmd: Partition root state change command
* @newmask: Optional new cpumask for partcmd_update
* @tmp: Temporary addmask and delmask
* Return: 0, 1 or an error code
*
* For partcmd_enable, the cpuset is being transformed from a non-partition
* root to a partition root. The cpus_allowed mask of the given cpuset will
* be put into parent's subparts_cpus and taken away from parent's
* effective_cpus. The function will return 0 if all the CPUs listed in
* cpus_allowed can be granted or an error code will be returned.
*
* For partcmd_disable, the cpuset is being transofrmed from a partition
* root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
* into parent's effective_cpus. 0 should always be returned.
*
* For partcmd_update, if the optional newmask is specified, the cpu
* list is to be changed from cpus_allowed to newmask. Otherwise,
* cpus_allowed is assumed to remain the same. The cpuset should either
* be a partition root or an invalid partition root. The partition root
* state may change if newmask is NULL and none of the requested CPUs can
* be granted by the parent. The function will return 1 if changes to
* parent's subparts_cpus and effective_cpus happen or 0 otherwise.
* Error code should only be returned when newmask is non-NULL.
*
* The partcmd_enable and partcmd_disable commands are used by
* update_prstate(). The partcmd_update command is used by
* update_cpumasks_hier() with newmask NULL and update_cpumask() with
* newmask set.
*
* The checking is more strict when enabling partition root than the
* other two commands.
*
* Because of the implicit cpu exclusive nature of a partition root,
* cpumask changes that violates the cpu exclusivity rule will not be
* permitted when checked by validate_change(). The validate_change()
* function will also prevent any changes to the cpu list if it is not
* a superset of children's cpu lists.
*/
static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
struct cpumask *newmask,
struct tmpmasks *tmp)
{
struct cpuset *parent = parent_cs(cpuset);
int adding; /* Moving cpus from effective_cpus to subparts_cpus */
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
int old_prs, new_prs;
bool part_error = false; /* Partition error? */
percpu_rwsem_assert_held(&cpuset_rwsem);
/*
* The parent must be a partition root.
* The new cpumask, if present, or the current cpus_allowed must
* not be empty.
*/
if (!is_partition_root(parent) ||
(newmask && cpumask_empty(newmask)) ||
(!newmask && cpumask_empty(cpuset->cpus_allowed)))
return -EINVAL;
/*
* Enabling/disabling partition root is not allowed if there are
* online children.
*/
if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
return -EBUSY;
/*
* Enabling partition root is not allowed if not all the CPUs
* can be granted from parent's effective_cpus or at least one
* CPU will be left after that.
*/
if ((cmd == partcmd_enable) &&
(!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
return -EINVAL;
/*
* A cpumask update cannot make parent's effective_cpus become empty.
*/
adding = deleting = false;
old_prs = new_prs = cpuset->partition_root_state;
if (cmd == partcmd_enable) {
cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
adding = true;
} else if (cmd == partcmd_disable) {
deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
parent->subparts_cpus);
} else if (newmask) {
/*
* partcmd_update with newmask:
*
* delmask = cpus_allowed & ~newmask & parent->subparts_cpus
* addmask = newmask & parent->effective_cpus
* & ~parent->subparts_cpus
*/
cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
deleting = cpumask_and(tmp->delmask, tmp->delmask,
parent->subparts_cpus);
cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
adding = cpumask_andnot(tmp->addmask, tmp->addmask,
parent->subparts_cpus);
/*
* Return error if the new effective_cpus could become empty.
*/
if (adding &&
cpumask_equal(parent->effective_cpus, tmp->addmask)) {
if (!deleting)
return -EINVAL;
/*
* As some of the CPUs in subparts_cpus might have
* been offlined, we need to compute the real delmask
* to confirm that.
*/
if (!cpumask_and(tmp->addmask, tmp->delmask,
cpu_active_mask))
return -EINVAL;
cpumask_copy(tmp->addmask, parent->effective_cpus);
}
} else {
/*
* partcmd_update w/o newmask:
*
* addmask = cpus_allowed & parent->effective_cpus
*
* Note that parent's subparts_cpus may have been
* pre-shrunk in case there is a change in the cpu list.
* So no deletion is needed.
*/
adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
parent->effective_cpus);
part_error = cpumask_equal(tmp->addmask,
parent->effective_cpus);
}
if (cmd == partcmd_update) {
int prev_prs = cpuset->partition_root_state;
/*
* Check for possible transition between PRS_ENABLED
* and PRS_ERROR.
*/
switch (cpuset->partition_root_state) {
case PRS_ENABLED:
if (part_error)
new_prs = PRS_ERROR;
break;
case PRS_ERROR:
if (!part_error)
new_prs = PRS_ENABLED;
break;
}
/*
* Set part_error if previously in invalid state.
*/
part_error = (prev_prs == PRS_ERROR);
}
if (!part_error && (new_prs == PRS_ERROR))
return 0; /* Nothing need to be done */
if (new_prs == PRS_ERROR) {
/*
* Remove all its cpus from parent's subparts_cpus.
*/
adding = false;
deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
parent->subparts_cpus);
}
if (!adding && !deleting && (new_prs == old_prs))
return 0;
/*
* Change the parent's subparts_cpus.
* Newly added CPUs will be removed from effective_cpus and
* newly deleted ones will be added back to effective_cpus.
*/
spin_lock_irq(&callback_lock);
if (adding) {
cpumask_or(parent->subparts_cpus,
parent->subparts_cpus, tmp->addmask);
cpumask_andnot(parent->effective_cpus,
parent->effective_cpus, tmp->addmask);
}
if (deleting) {
cpumask_andnot(parent->subparts_cpus,
parent->subparts_cpus, tmp->delmask);
/*
* Some of the CPUs in subparts_cpus might have been offlined.
*/
cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
cpumask_or(parent->effective_cpus,
parent->effective_cpus, tmp->delmask);
}
parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
if (old_prs != new_prs)
cpuset->partition_root_state = new_prs;
spin_unlock_irq(&callback_lock);
notify_partition_change(cpuset, old_prs, new_prs);
return cmd == partcmd_update;
}
/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
*
* When configured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
*
* On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
* Called with cpuset_rwsem held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
bool need_rebuild_sched_domains = false;
int old_prs, new_prs;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
compute_effective_cpumask(tmp->new_cpus, cp, parent);
/*
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
cpumask_copy(tmp->new_cpus, parent->effective_cpus);
if (!cp->use_parent_ecpus) {
cp->use_parent_ecpus = true;
parent->child_ecpus_count++;
}
} else if (cp->use_parent_ecpus) {
cp->use_parent_ecpus = false;
WARN_ON_ONCE(!parent->child_ecpus_count);
parent->child_ecpus_count--;
}
/*
* Skip the whole subtree if the cpumask remains the same
* and has no partition root state.
*/
if (!cp->partition_root_state &&
cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
/*
* update_parent_subparts_cpumask() should have been called
* for cs already in update_cpumask(). We should also call
* update_tasks_cpumask() again for tasks in the parent
* cpuset if the parent's subparts_cpus changes.
*/
old_prs = new_prs = cp->partition_root_state;
if ((cp != cs) && old_prs) {
switch (parent->partition_root_state) {
case PRS_DISABLED:
/*
* If parent is not a partition root or an
* invalid partition root, clear its state
* and its CS_CPU_EXCLUSIVE flag.
*/
WARN_ON_ONCE(cp->partition_root_state
!= PRS_ERROR);
new_prs = PRS_DISABLED;
/*
* clear_bit() is an atomic operation and
* readers aren't interested in the state
* of CS_CPU_EXCLUSIVE anyway. So we can
* just update the flag without holding
* the callback_lock.
*/
clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
break;
case PRS_ENABLED:
if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
update_tasks_cpumask(parent);
break;
case PRS_ERROR:
/*
* When parent is invalid, it has to be too.
*/
new_prs = PRS_ERROR;
break;
}
}
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
cp->nr_subparts_cpus = 0;
cpumask_clear(cp->subparts_cpus);
} else if (cp->nr_subparts_cpus) {
/*
* Make sure that effective_cpus & subparts_cpus
* are mutually exclusive.
*
* In the unlikely event that effective_cpus
* becomes empty. we clear cp->nr_subparts_cpus and
* let its child partition roots to compete for
* CPUs again.
*/
cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
cp->subparts_cpus);
if (cpumask_empty(cp->effective_cpus)) {
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cpumask_clear(cp->subparts_cpus);
cp->nr_subparts_cpus = 0;
} else if (!cpumask_subset(cp->subparts_cpus,
tmp->new_cpus)) {
cpumask_andnot(cp->subparts_cpus,
cp->subparts_cpus, tmp->new_cpus);
cp->nr_subparts_cpus
= cpumask_weight(cp->subparts_cpus);
}
}
if (new_prs != old_prs)
cp->partition_root_state = new_prs;
spin_unlock_irq(&callback_lock);
notify_partition_change(cp, old_prs, new_prs);
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
update_tasks_cpumask(cp);
/*
* On legacy hierarchy, if the effective cpumask of any non-
* empty cpuset is changed, we need to rebuild sched domains.
* On default hierarchy, the cpuset needs to be a partition
* root as well.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
is_partition_root(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
if (need_rebuild_sched_domains)
rebuild_sched_domains_locked();
}
/**
* update_sibling_cpumasks - Update siblings cpumasks
* @parent: Parent cpuset
* @cs: Current cpuset
* @tmp: Temp variables
*/
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct tmpmasks *tmp)
{
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
percpu_rwsem_assert_held(&cpuset_rwsem);
/*
* Check all its siblings and call update_cpumasks_hier()
* if their use_parent_ecpus flag is set in order for them
* to use the right effective_cpus value.
*
* The update_cpumasks_hier() function may sleep. So we have to
* release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
if (sibling == cs)
continue;
if (!sibling->use_parent_ecpus)
continue;
if (!css_tryget_online(&sibling->css))
continue;
rcu_read_unlock();
update_cpumasks_hier(sibling, tmp);
rcu_read_lock();
css_put(&sibling->css);
}
rcu_read_unlock();
}
/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
* @trialcs: trial cpuset
* @buf: buffer of cpu numbers written to this cpuset
*/
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
struct tmpmasks tmp;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
return -EACCES;
/*
* An empty cpus_allowed is ok only if the cpuset has no tasks.
* Since cpulist_parse() fails on an empty mask, we special case
* that parsing. The validate_change() call ensures that cpusets
* with tasks have cpus.
*/
if (!*buf) {
cpumask_clear(trialcs->cpus_allowed);
} else {
retval = cpulist_parse(buf, trialcs->cpus_allowed);
if (retval < 0)
return retval;
if (!cpumask_subset(trialcs->cpus_allowed,
top_cpuset.cpus_allowed))
return -EINVAL;
}
/* Nothing to do if the cpus didn't change */
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
retval = validate_change(cs, trialcs);
if (retval < 0)
return retval;
#ifdef CONFIG_CPUMASK_OFFSTACK
/*
* Use the cpumasks in trialcs for tmpmasks when they are pointers
* to allocated cpumasks.
*/
tmp.addmask = trialcs->subparts_cpus;
tmp.delmask = trialcs->effective_cpus;
tmp.new_cpus = trialcs->cpus_allowed;
#endif
if (cs->partition_root_state) {
/* Cpumask of a partition root cannot be empty */
if (cpumask_empty(trialcs->cpus_allowed))
return -EINVAL;
if (update_parent_subparts_cpumask(cs, partcmd_update,
trialcs->cpus_allowed, &tmp) < 0)
return -EINVAL;
}
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
/*
* Make sure that subparts_cpus is a subset of cpus_allowed.
*/
if (cs->nr_subparts_cpus) {
cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
}
spin_unlock_irq(&callback_lock);
update_cpumasks_hier(cs, &tmp);
if (cs->partition_root_state) {
struct cpuset *parent = parent_cs(cs);
/*
* For partition root, update the cpumasks of sibling
* cpusets if they use parent's effective_cpus.
*/
if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmp);
}
return 0;
}
/*
* Migrate memory region from one set of nodes to another. This is
* performed asynchronously as it can be called from process migration path
* holding locks involved in process management. All mm migrations are
* performed in the queued order and can be waited for by flushing
* cpuset_migrate_mm_wq.
*/
struct cpuset_migrate_mm_work {
struct work_struct work;
struct mm_struct *mm;
nodemask_t from;
nodemask_t to;
};
static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
struct cpuset_migrate_mm_work *mwork =
container_of(work, struct cpuset_migrate_mm_work, work);
/* on a wq worker, no need to worry about %current's mems_allowed */
do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
mmput(mwork->mm);
kfree(mwork);
}
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct cpuset_migrate_mm_work *mwork;
if (nodes_equal(*from, *to)) {
mmput(mm);
return;
}
mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
if (mwork) {
mwork->mm = mm;
mwork->from = *from;
mwork->to = *to;
INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
queue_work(cpuset_migrate_mm_wq, &mwork->work);
} else {
mmput(mm);
}
}
static void cpuset_post_attach(void)
{
flush_workqueue(cpuset_migrate_mm_wq);
}
/*
* cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
* @tsk: the task to change
* @newmems: new nodes that the task will be set
*
* We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
* and rebind an eventual tasks' mempolicy. If the task is allocating in
* parallel, it might temporarily see an empty intersection, which results in
* a seqlock check and retry before OOM or allocation failure.
*/
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
task_lock(tsk);
local_irq_disable();
write_seqcount_begin(&tsk->mems_allowed_seq);
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
mpol_rebind_task(tsk, newmems);
tsk->mems_allowed = *newmems;
write_seqcount_end(&tsk->mems_allowed_seq);
local_irq_enable();
task_unlock(tsk);
}
static void *cpuset_being_rebound;
/**
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its mems_allowed to the
* effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_nodemask(struct cpuset *cs)
{
static nodemask_t newmems; /* protected by cpuset_rwsem */
struct css_task_iter it;
struct task_struct *task;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
guarantee_online_mems(cs, &newmems);
/*
* The mpol_rebind_mm() call takes mmap_lock, which we couldn't
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
* the global cpuset_rwsem, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
struct mm_struct *mm;
bool migrate;
cpuset_change_task_nodemask(task, &newmems);
mm = get_task_mm(task);
if (!mm)
continue;
migrate = is_memory_migrate(cs);
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
else
mmput(mm);
}
css_task_iter_end(&it);
/*
* All the tasks' nodemasks have been updated, update
* cs->old_mems_allowed.
*/
cs->old_mems_allowed = newmems;
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
cpuset_being_rebound = NULL;
}
/*
* update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
* @cs: the cpuset to consider
* @new_mems: a temp variable for calculating new effective_mems
*
* When configured nodemask is changed, the effective nodemasks of this cpuset
* and all its descendants need to be updated.
*
* On legacy hierarchy, effective_mems will be the same with mems_allowed.
*
* Called with cpuset_rwsem held
*/
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
/*
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
if (is_in_v2_mode() && nodes_empty(*new_mems))
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
if (nodes_equal(*new_mems, cp->effective_mems)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
spin_lock_irq(&callback_lock);
cp->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
update_tasks_nodemask(cp);
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
}
/*
* Handle user request to change the 'mems' memory placement
* of a cpuset. Needs to validate the request, update the
* cpusets mems_allowed, and for each task in the cpuset,
* update mems_allowed and rebind task's mempolicy and any vma
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
* Call with cpuset_rwsem held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_lock, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
*/
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
/*
* top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
* it's read-only
*/
if (cs == &top_cpuset) {
retval = -EACCES;
goto done;
}
/*
* An empty mems_allowed is ok iff there are no tasks in the cpuset.
* Since nodelist_parse() fails on an empty mask, we special case
* that parsing. The validate_change() call ensures that cpusets
* with tasks have memory.
*/
if (!*buf) {
nodes_clear(trialcs->mems_allowed);
} else {
retval = nodelist_parse(buf, trialcs->mems_allowed);
if (retval < 0)
goto done;
if (!nodes_subset(trialcs->mems_allowed,
top_cpuset.mems_allowed)) {
retval = -EINVAL;
goto done;
}
}
if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
goto done;
}
retval = validate_change(cs, trialcs);
if (retval < 0)
goto done;
spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
spin_unlock_irq(&callback_lock);
/* use trialcs->mems_allowed as a temp variable */
update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
return retval;
}
bool current_cpuset_is_being_rebound(void)
{
bool ret;
rcu_read_lock();
ret = task_cs(current) == cpuset_being_rebound;
rcu_read_unlock();
return ret;
}
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
if (val < -1 || val >= sched_domain_level_max)
return -EINVAL;
#endif
if (val != cs->relax_domain_level) {
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
rebuild_sched_domains_locked();
}
return 0;
}
/**
* update_tasks_flags - update the spread flags of tasks in the cpuset.
* @cs: the cpuset in which each task's spread flags needs to be changed
*
* Iterate through each task of @cs updating its spread flags. As this
* function is called with cpuset_rwsem held, cpuset membership stays
* stable.
*/
static void update_tasks_flags(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
cpuset_update_task_spread_flag(cs, task);
css_task_iter_end(&it);
}
/*
* update_flag - read a 0 or a 1 in a file and update associated flag
* bit: the bit to update (see cpuset_flagbits_t)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
* Call with cpuset_rwsem held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on)
{
struct cpuset *trialcs;
int balance_flag_changed;
int spread_flag_changed;
int err;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs)
return -ENOMEM;
if (turning_on)
set_bit(bit, &trialcs->flags);
else
clear_bit(bit, &trialcs->flags);
err = validate_change(cs, trialcs);
if (err < 0)
goto out;
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(trialcs));
spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
|| (is_spread_page(cs) != is_spread_page(trialcs)));
spin_lock_irq(&callback_lock);
cs->flags = trialcs->flags;
spin_unlock_irq(&callback_lock);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
rebuild_sched_domains_locked();
if (spread_flag_changed)
update_tasks_flags(cs);
out:
free_cpuset(trialcs);
return err;
}
/*
* update_prstate - update partititon_root_state
* cs: the cpuset to update
* new_prs: new partition root state
*
* Call with cpuset_rwsem held.
*/
static int update_prstate(struct cpuset *cs, int new_prs)
{
int err, old_prs = cs->partition_root_state;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
if (old_prs == new_prs)
return 0;
/*
* Cannot force a partial or invalid partition root to a full
* partition root.
*/
if (new_prs && (old_prs == PRS_ERROR))
return -EINVAL;
if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
err = -EINVAL;
if (!old_prs) {
/*
* Turning on partition root requires setting the
* CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
* cannot be NULL.
*/
if (cpumask_empty(cs->cpus_allowed))
goto out;
err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
if (err)
goto out;
err = update_parent_subparts_cpumask(cs, partcmd_enable,
NULL, &tmpmask);
if (err) {
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
goto out;
}
} else {
/*
* Turning off partition root will clear the
* CS_CPU_EXCLUSIVE bit.
*/
if (old_prs == PRS_ERROR) {
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
err = 0;
goto out;
}
err = update_parent_subparts_cpumask(cs, partcmd_disable,
NULL, &tmpmask);
if (err)
goto out;
/* Turning off CS_CPU_EXCLUSIVE will not return error */
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
}
/*
* Update cpumask of parent's tasks except when it is the top
* cpuset as some system daemons cannot be mapped to other CPUs.
*/
if (parent != &top_cpuset)
update_tasks_cpumask(parent);
if (parent->child_ecpus_count)
update_sibling_cpumasks(parent, cs, &tmpmask);
rebuild_sched_domains_locked();
out:
if (!err) {
spin_lock_irq(&callback_lock);
cs->partition_root_state = new_prs;
spin_unlock_irq(&callback_lock);
notify_partition_change(cs, old_prs, new_prs);
}
free_cpumasks(NULL, &tmpmask);
return err;
}
/*
* Frequency meter - How fast is some event occurring?
*
* These routines manage a digitally filtered, constant time based,
* event frequency meter. There are four routines:
* fmeter_init() - initialize a frequency meter.
* fmeter_markevent() - called each time the event happens.
* fmeter_getrate() - returns the recent rate of such events.
* fmeter_update() - internal routine used to update fmeter.
*
* A common data structure is passed to each of these routines,
* which is used to keep track of the state required to manage the
* frequency meter and its digital filter.
*
* The filter works on the number of events marked per unit time.
* The filter is single-pole low-pass recursive (IIR). The time unit
* is 1 second. Arithmetic is done using 32-bit integers scaled to
* simulate 3 decimal digits of precision (multiplied by 1000).
*
* With an FM_COEF of 933, and a time base of 1 second, the filter
* has a half-life of 10 seconds, meaning that if the events quit
* happening, then the rate returned from the fmeter_getrate()
* will be cut in half each 10 seconds, until it converges to zero.
*
* It is not worth doing a real infinitely recursive filter. If more
* than FM_MAXTICKS ticks have elapsed since the last filter event,
* just compute FM_MAXTICKS ticks worth, by which point the level
* will be stable.
*
* Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
* arithmetic overflow in the fmeter_update() routine.
*
* Given the simple 32 bit integer arithmetic used, this meter works
* best for reporting rates between one per millisecond (msec) and
* one per 32 (approx) seconds. At constant rates faster than one
* per msec it maxes out at values just under 1,000,000. At constant
* rates between one per msec, and one per second it will stabilize
* to a value N*1000, where N is the rate of events per second.
* At constant rates between one per second and one per 32 seconds,
* it will be choppy, moving up on the seconds that have an event,
* and then decaying until the next event. At rates slower than
* about one in 32 seconds, it decays all the way back to zero between
* each event.
*/
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
#define FM_SCALE 1000 /* faux fixed point scale */
/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
fmp->cnt = 0;
fmp->val = 0;
fmp->time = 0;
spin_lock_init(&fmp->lock);
}
/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
time64_t now;
u32 ticks;
now = ktime_get_seconds();
ticks = now - fmp->time;
if (ticks == 0)
return;
ticks = min(FM_MAXTICKS, ticks);
while (ticks-- > 0)
fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
fmp->time = now;
fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
fmp->cnt = 0;
}
/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
spin_lock(&fmp->lock);
fmeter_update(fmp);
fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
spin_unlock(&fmp->lock);
}
/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
int val;
spin_lock(&fmp->lock);
fmeter_update(fmp);
val = fmp->val;
spin_unlock(&fmp->lock);
return val;
}
static struct cpuset *cpuset_attach_old_cs;
/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
struct cpuset *cs;
struct task_struct *task;
int ret;
/* used later by cpuset_attach() */
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
cs = css_cs(css);
percpu_down_write(&cpuset_rwsem);
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
if (!is_in_v2_mode() &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
cgroup_taskset_for_each(task, css, tset) {
ret = task_can_attach(task, cs->cpus_allowed);
if (ret)
goto out_unlock;
ret = security_task_setscheduler(task);
if (ret)
goto out_unlock;
}
/*
* Mark attach is in progress. This makes validate_change() fail
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
ret = 0;
out_unlock:
percpu_up_write(&cpuset_rwsem);
return ret;
}
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
cgroup_taskset_first(tset, &css);
percpu_down_write(&cpuset_rwsem);
css_cs(css)->attach_in_progress--;
percpu_up_write(&cpuset_rwsem);
}
/*
* Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
static cpumask_var_t cpus_attach;
static void cpuset_attach(struct cgroup_taskset *tset)
{
/* static buf protected by cpuset_rwsem */
static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
struct cgroup_subsys_state *css;
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, css, tset) {
if (cs != &top_cpuset)
guarantee_online_cpus(task, cpus_attach);
else
cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
/*
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
*/
WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, task);
}
/*
* Change mm for all threadgroup leaders. This is expensive and may
* sleep and should be moved outside migration path proper.
*/
cpuset_attach_nodemask_to = cs->effective_mems;
cgroup_taskset_for_each_leader(leader, css, tset) {
struct mm_struct *mm = get_task_mm(leader);
if (mm) {
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
/*
* old_mems_allowed is the same with mems_allowed
* here, except if this task is being moved
* automatically due to hotplug. In that case
* @mems_allowed has been updated and is empty, so
* @old_mems_allowed is the right nodesets that we
* migrate mm from.
*/
if (is_memory_migrate(cs))
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
else
mmput(mm);
}
}
cs->old_mems_allowed = cpuset_attach_nodemask_to;
cs->attach_in_progress--;
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
typedef enum {
FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
FILE_EFFECTIVE_CPULIST,
FILE_EFFECTIVE_MEMLIST,
FILE_SUBPARTS_CPULIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
FILE_SCHED_LOAD_BALANCE,
FILE_PARTITION_ROOT,
FILE_SCHED_RELAX_DOMAIN_LEVEL,
FILE_MEMORY_PRESSURE_ENABLED,
FILE_MEMORY_PRESSURE,
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
} cpuset_filetype_t;
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
u64 val)
{
struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
int retval = 0;
cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
}
switch (type) {
case FILE_CPU_EXCLUSIVE:
retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
break;
case FILE_MEM_EXCLUSIVE:
retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
break;
case FILE_MEM_HARDWALL:
retval = update_flag(CS_MEM_HARDWALL, cs, val);
break;
case FILE_SCHED_LOAD_BALANCE:
retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
break;
case FILE_MEMORY_MIGRATE:
retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
break;
case FILE_MEMORY_PRESSURE_ENABLED:
cpuset_memory_pressure_enabled = !!val;
break;
case FILE_SPREAD_PAGE:
retval = update_flag(CS_SPREAD_PAGE, cs, val);
break;
case FILE_SPREAD_SLAB:
retval = update_flag(CS_SPREAD_SLAB, cs, val);
break;
default:
retval = -EINVAL;
break;
}
out_unlock:
percpu_up_write(&cpuset_rwsem);
cpus_read_unlock();
return retval;
}
static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
s64 val)
{
struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
retval = update_relax_domain_level(cs, val);
break;
default:
retval = -EINVAL;
break;
}
out_unlock:
percpu_up_write(&cpuset_rwsem);
cpus_read_unlock();
return retval;
}
/*
* Common handling for a write to a "cpus" or "mems" file.
*/
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cpuset *cs = css_cs(of_css(of));
struct cpuset *trialcs;
int retval = -ENODEV;
buf = strstrip(buf);
/*
* CPU or memory hotunplug may leave @cs w/o any execution
* resources, in which case the hotplug code asynchronously updates
* configuration and transfers all tasks to the nearest ancestor
* which can execute.
*
* As writes to "cpus" or "mems" may restore @cs's execution
* resources, wait for the previously scheduled operations before
* proceeding, so that we don't end up keep removing tasks added
* after execution capability is restored.
*
* cpuset_hotplug_work calls back into cgroup core via
* cgroup_transfer_tasks() and waiting for it from a cgroupfs
* operation like this one can lead to a deadlock through kernfs
* active_ref protection. Let's break the protection. Losing the
* protection is okay as we check whether @cs is online after
* grabbing cpuset_rwsem anyway. This only happens on the legacy
* hierarchies.
*/
css_get(&cs->css);
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
goto out_unlock;
}
switch (of_cft(of)->private) {
case FILE_CPULIST:
retval = update_cpumask(cs, trialcs, buf);
break;
case FILE_MEMLIST:
retval = update_nodemask(cs, trialcs, buf);
break;
default:
retval = -EINVAL;
break;
}
free_cpuset(trialcs);
out_unlock:
percpu_up_write(&cpuset_rwsem);
cpus_read_unlock();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
return retval ?: nbytes;
}
/*
* These ascii lists should be read in a single call, by using a user
* buffer large enough to hold the entire map. If read in smaller
* chunks, there is no guarantee of atomicity. Since the display format
* used, list of ranges of sequential numbers, is variable length,
* and since these maps can change value dynamically, one could read
* gibberish by doing partial reads while a list was changing.
*/
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
struct cpuset *cs = css_cs(seq_css(sf));
cpuset_filetype_t type = seq_cft(sf)->private;
int ret = 0;
spin_lock_irq(&callback_lock);
switch (type) {
case FILE_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
break;
case FILE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
break;
case FILE_EFFECTIVE_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
break;
case FILE_EFFECTIVE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
case FILE_SUBPARTS_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
break;
default:
ret = -EINVAL;
}
spin_unlock_irq(&callback_lock);
return ret;
}
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
switch (type) {
case FILE_CPU_EXCLUSIVE:
return is_cpu_exclusive(cs);
case FILE_MEM_EXCLUSIVE:
return is_mem_exclusive(cs);
case FILE_MEM_HARDWALL:
return is_mem_hardwall(cs);
case FILE_SCHED_LOAD_BALANCE:
return is_sched_load_balance(cs);
case FILE_MEMORY_MIGRATE:
return is_memory_migrate(cs);
case FILE_MEMORY_PRESSURE_ENABLED:
return cpuset_memory_pressure_enabled;
case FILE_MEMORY_PRESSURE:
return fmeter_getrate(&cs->fmeter);
case FILE_SPREAD_PAGE:
return is_spread_page(cs);
case FILE_SPREAD_SLAB:
return is_spread_slab(cs);
default:
BUG();
}
/* Unreachable but makes gcc happy */
return 0;
}
static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
return cs->relax_domain_level;
default:
BUG();
}
/* Unreachable but makes gcc happy */
return 0;
}
static int sched_partition_show(struct seq_file *seq, void *v)
{
struct cpuset *cs = css_cs(seq_css(seq));
switch (cs->partition_root_state) {
case PRS_ENABLED:
seq_puts(seq, "root\n");
break;
case PRS_DISABLED:
seq_puts(seq, "member\n");
break;
case PRS_ERROR:
seq_puts(seq, "root invalid\n");
break;
}
return 0;
}
static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cpuset *cs = css_cs(of_css(of));
int val;
int retval = -ENODEV;
buf = strstrip(buf);
/*
* Convert "root" to ENABLED, and convert "member" to DISABLED.
*/
if (!strcmp(buf, "root"))
val = PRS_ENABLED;
else if (!strcmp(buf, "member"))
val = PRS_DISABLED;
else
return -EINVAL;
css_get(&cs->css);
cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
retval = update_prstate(cs, val);
out_unlock:
percpu_up_write(&cpuset_rwsem);
cpus_read_unlock();
css_put(&cs->css);
return retval ?: nbytes;
}
/*
* for the common functions, 'private' gives the type of file
*/
static struct cftype legacy_files[] = {
{
.name = "cpus",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
},
{
.name = "mems",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
},
{
.name = "effective_cpus",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_CPULIST,
},
{
.name = "effective_mems",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_MEMLIST,
},
{
.name = "cpu_exclusive",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_CPU_EXCLUSIVE,
},
{
.name = "mem_exclusive",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_MEM_EXCLUSIVE,
},
{
.name = "mem_hardwall",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_MEM_HARDWALL,
},
{
.name = "sched_load_balance",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_SCHED_LOAD_BALANCE,
},
{
.name = "sched_relax_domain_level",
.read_s64 = cpuset_read_s64,
.write_s64 = cpuset_write_s64,
.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
},
{
.name = "memory_migrate",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_MEMORY_MIGRATE,
},
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
.private = FILE_MEMORY_PRESSURE,
},
{
.name = "memory_spread_page",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_SPREAD_PAGE,
},
{
.name = "memory_spread_slab",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_SPREAD_SLAB,
},
{
.name = "memory_pressure_enabled",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_MEMORY_PRESSURE_ENABLED,
},
{ } /* terminate */
};
/*
* This is currently a minimal set for the default hierarchy. It can be
* expanded later on by migrating more features and control files from v1.
*/
static struct cftype dfl_files[] = {
{
.name = "cpus",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
.flags = CFTYPE_NOT_ON_ROOT,
},
{
.name = "mems",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
.flags = CFTYPE_NOT_ON_ROOT,
},
{
.name = "cpus.effective",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_CPULIST,
},
{
.name = "mems.effective",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_MEMLIST,
},
{
.name = "cpus.partition",
.seq_show = sched_partition_show,
.write = sched_partition_write,
.private = FILE_PARTITION_ROOT,
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct cpuset, partition_file),
},
{
.name = "cpus.subpartitions",
.seq_show = cpuset_common_seq_show,
.private = FILE_SUBPARTS_CPULIST,
.flags = CFTYPE_DEBUG,
},
{ } /* terminate */
};
/*
* cpuset_css_alloc - allocate a cpuset css
* cgrp: control group that the new cpuset will be part of
*/
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cpuset *cs;
if (!parent_css)
return &top_cpuset.css;
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
if (alloc_cpumasks(cs, NULL)) {
kfree(cs);
return ERR_PTR(-ENOMEM);
}
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
nodes_clear(cs->mems_allowed);
nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
/* Set CS_MEMORY_MIGRATE for default hierarchy */
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
return &cs->css;
}
static int cpuset_css_online(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
struct cpuset *parent = parent_cs(cs);
struct cpuset *tmp_cs;
struct cgroup_subsys_state *pos_css;
if (!parent)
return 0;
cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
set_bit(CS_ONLINE, &cs->flags);
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
set_bit(CS_SPREAD_SLAB, &cs->flags);
cpuset_inc();
spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
}
spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
* set. This flag handling is implemented in cgroup core for
* histrical reasons - the flag may be specified during mount.
*
* Currently, if any sibling cpusets have exclusive cpus or mem, we
* refuse to clone the configuration - thereby refusing the task to
* be entered, and as a result refusing the sys_unshare() or
* clone() which initiated it. If this becomes a problem for some
* users who wish to allow that scenario, then this could be
* changed to grant parent->cpus_allowed-sibling_cpus_exclusive
* (and likewise for mems) to the new cgroup.
*/
rcu_read_lock();
cpuset_for_each_child(tmp_cs, pos_css, parent) {
if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
rcu_read_unlock();
goto out_unlock;
}
}
rcu_read_unlock();
spin_lock_irq(&callback_lock);
cs->mems_allowed = parent->mems_allowed;
cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
percpu_up_write(&cpuset_rwsem);
cpus_read_unlock();
return 0;
}
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
* will call rebuild_sched_domains_locked(). That is not needed
* in the default hierarchy where only changes in partition
* will cause repartitioning.
*
* If the cpuset has the 'sched.partition' flag enabled, simulate
* turning 'sched.partition" off.
*/
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (is_partition_root(cs))
update_prstate(cs, 0);
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
if (cs->use_parent_ecpus) {
struct cpuset *parent = parent_cs(cs);
cs->use_parent_ecpus = false;
parent->child_ecpus_count--;
}
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
percpu_up_write(&cpuset_rwsem);
cpus_read_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
free_cpuset(cs);
}
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
percpu_down_write(&cpuset_rwsem);
spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
cpumask_copy(top_cpuset.cpus_allowed,
top_cpuset.effective_cpus);
top_cpuset.mems_allowed = top_cpuset.effective_mems;
}
spin_unlock_irq(&callback_lock);
percpu_up_write(&cpuset_rwsem);
}
/*
* Make sure the new task conform to the current state of its parent,
* which could have been changed by cpuset just after it inherits the
* state from the parent and before it sits on the cgroup's task list.
*/
static void cpuset_fork(struct task_struct *task)
{
if (task_css_is_root(task, cpuset_cgrp_id))
return;
set_cpus_allowed_ptr(task, current->cpus_ptr);
task->mems_allowed = current->mems_allowed;
}
struct cgroup_subsys cpuset_cgrp_subsys = {
.css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
.css_offline = cpuset_css_offline,
.css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
.post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.fork = cpuset_fork,
.legacy_cftypes = legacy_files,
.dfl_cftypes = dfl_files,
.early_init = true,
.threaded = true,
};
/**
* cpuset_init - initialize cpusets at system boot
*
* Description: Initialize top_cpuset
**/
int __init cpuset_init(void)
{
BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
cpumask_setall(top_cpuset.effective_cpus);
nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
top_cpuset.relax_domain_level = -1;
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
return 0;
}
/*
* If CPU and/or memory hotplug handlers, below, unplug any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty
* cpuset to its next-highest non-empty parent.
*/
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
struct cpuset *parent;
/*
* Find its next-highest non-empty parent, (top cpuset
* has online cpus, so can't be empty).
*/
parent = parent_cs(cs);
while (cpumask_empty(parent->cpus_allowed) ||
nodes_empty(parent->mems_allowed))
parent = parent_cs(parent);
if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
pr_cont_cgroup_name(cs->css.cgroup);
pr_cont("\n");
}
}
static void
hotplug_update_tasks_legacy(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
bool is_empty;
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, new_cpus);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->mems_allowed = *new_mems;
cs->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
* as the tasks will be migratecd to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
update_tasks_cpumask(cs);
if (mems_updated && !nodes_empty(cs->mems_allowed))
update_tasks_nodemask(cs);
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
percpu_up_write(&cpuset_rwsem);
/*
* Move tasks to the nearest ancestor with execution resources,
* This is full cgroup operation which will also call back into
* cpuset. Should be done outside any lock.
*/
if (is_empty)
remove_tasks_in_empty_cpuset(cs);
percpu_down_write(&cpuset_rwsem);
}
static void
hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
if (cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
spin_lock_irq(&callback_lock);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
if (cpus_updated)
update_tasks_cpumask(cs);
if (mems_updated)
update_tasks_nodemask(cs);
}
static bool force_rebuild;
void cpuset_force_rebuild(void)
{
force_rebuild = true;
}
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
* @tmp: the tmpmasks structure pointer
*
* Compare @cs's cpu and mem masks against top_cpuset and if some have gone
* offline, update @cs accordingly. If @cs ends up with no CPU or memory,
* all its tasks are moved to the nearest ancestor with both resources.
*/
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated;
bool mems_updated;
struct cpuset *parent;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
percpu_down_write(&cpuset_rwsem);
/*
* We have raced with task attaching. We wait until attaching
* is finished, so we won't attach a task to an empty cpuset.
*/
if (cs->attach_in_progress) {
percpu_up_write(&cpuset_rwsem);
goto retry;
}
parent = parent_cs(cs);
compute_effective_cpumask(&new_cpus, cs, parent);
nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
if (cs->nr_subparts_cpus)
/*
* Make sure that CPUs allocated to child partitions
* do not show up in effective_cpus.
*/
cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
if (!tmp || !cs->partition_root_state)
goto update_tasks;
/*
* In the unlikely event that a partition root has empty
* effective_cpus or its parent becomes erroneous, we have to
* transition it to the erroneous state.
*/
if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
(parent->partition_root_state == PRS_ERROR))) {
if (cs->nr_subparts_cpus) {
spin_lock_irq(&callback_lock);
cs->nr_subparts_cpus = 0;
cpumask_clear(cs->subparts_cpus);
spin_unlock_irq(&callback_lock);
compute_effective_cpumask(&new_cpus, cs, parent);
}
/*
* If the effective_cpus is empty because the child
* partitions take away all the CPUs, we can keep
* the current partition and let the child partitions
* fight for available CPUs.
*/
if ((parent->partition_root_state == PRS_ERROR) ||
cpumask_empty(&new_cpus)) {
int old_prs;
update_parent_subparts_cpumask(cs, partcmd_disable,
NULL, tmp);
old_prs = cs->partition_root_state;
if (old_prs != PRS_ERROR) {
spin_lock_irq(&callback_lock);
cs->partition_root_state = PRS_ERROR;
spin_unlock_irq(&callback_lock);
notify_partition_change(cs, old_prs, PRS_ERROR);
}
}
cpuset_force_rebuild();
}
/*
* On the other hand, an erroneous partition root may be transitioned
* back to a regular one or a partition root with no CPU allocated
* from the parent may change to erroneous.
*/
if (is_partition_root(parent) &&
((cs->partition_root_state == PRS_ERROR) ||
!cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
cpuset_force_rebuild();
update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
percpu_up_write(&cpuset_rwsem);
}
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
* This function is called after either CPU or memory configuration has
* changed and updates cpuset accordingly. The top_cpuset is always
* synchronized to cpu_active_mask and N_MEMORY, which is necessary in
* order to make cpusets transparent (of no affect) on systems that are
* actively using CPU hotplug but making no active use of cpusets.
*
* Non-root cpusets are only affected by offlining. If any CPUs or memory
* nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
* all descendants.
*
* Note that CPU offlining during suspend is ignored. We don't modify
* cpusets across suspend/resume cycles at all.
*/
static void cpuset_hotplug_workfn(struct work_struct *work)
{
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
bool on_dfl = is_in_v2_mode();
struct tmpmasks tmp, *ptmp = NULL;
if (on_dfl && !alloc_cpumasks(NULL, &tmp))
ptmp = &tmp;
percpu_down_write(&cpuset_rwsem);
/* fetch the available cpus/mems and find out which changed how */
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
/*
* If subparts_cpus is populated, it is likely that the check below
* will produce a false positive on cpus_updated when the cpu list
* isn't changed. It is extra work, but it is better to be safe.
*/
cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
/*
* In the rare case that hotplug removes all the cpus in subparts_cpus,
* we assumed that cpus are updated.
*/
if (!cpus_updated && top_cpuset.nr_subparts_cpus)
cpus_updated = true;
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
/*
* Make sure that CPUs allocated to child partitions
* do not show up in effective_cpus. If no CPU is left,
* we clear the subparts_cpus & let the child partitions
* fight for the CPUs again.
*/
if (top_cpuset.nr_subparts_cpus) {
if (cpumask_subset(&new_cpus,
top_cpuset.subparts_cpus)) {
top_cpuset.nr_subparts_cpus = 0;
cpumask_clear(top_cpuset.subparts_cpus);
} else {
cpumask_andnot(&new_cpus, &new_cpus,
top_cpuset.subparts_cpus);
}
}
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
spin_unlock_irq(&callback_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
spin_lock_irq(&callback_lock);
if (!on_dfl)
top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
spin_unlock_irq(&callback_lock);
update_tasks_nodemask(&top_cpuset);
}
percpu_up_write(&cpuset_rwsem);
/* if cpus or mems changed, we need to propagate to descendants */
if (cpus_updated || mems_updated) {
struct cpuset *cs;
struct cgroup_subsys_state *pos_css;
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
if (cs == &top_cpuset || !css_tryget_online(&cs->css))
continue;
rcu_read_unlock();
cpuset_hotplug_update_tasks(cs, ptmp);
rcu_read_lock();
css_put(&cs->css);
}
rcu_read_unlock();
}
/* rebuild sched domains if cpus_allowed has changed */
if (cpus_updated || force_rebuild) {
force_rebuild = false;
rebuild_sched_domains();
}
free_cpumasks(NULL, ptmp);
}
void cpuset_update_active_cpus(void)
{
/*
* We're inside cpu hotplug critical region which usually nests
* inside cgroup synchronization. Bounce actual hotplug processing
* to a work item to avoid reverse locking order.
*/
schedule_work(&cpuset_hotplug_work);
}
void cpuset_wait_for_hotplug(void)
{
flush_work(&cpuset_hotplug_work);
}
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
* See cpuset_update_active_cpus() for CPU hotplug handling.
*/
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
schedule_work(&cpuset_hotplug_work);
return NOTIFY_OK;
}
static struct notifier_block cpuset_track_online_nodes_nb = {
.notifier_call = cpuset_track_online_nodes,
.priority = 10, /* ??! */
};
/**
* cpuset_init_smp - initialize cpus_allowed
*
* Description: Finish top cpuset after cpu, node maps are initialized
*/
void __init cpuset_init_smp(void)
{
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
top_cpuset.mems_allowed = node_states[N_MEMORY];
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
}
/**
* cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
* @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
*
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
* subset of cpu_online_mask, even if this means going outside the
* tasks cpuset.
**/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
guarantee_online_cpus(tsk, pmask);
spin_unlock_irqrestore(&callback_lock, flags);
}
/**
* cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
* @tsk: pointer to task_struct with which the scheduler is struggling
*
* Description: In the case that the scheduler cannot find an allowed cpu in
* tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
* mode however, this value is the same as task_cs(tsk)->effective_cpus,
* which will not contain a sane cpumask during cases such as cpu hotplugging.
* This is the absolute last resort for the scheduler and it is only used if
* _every_ other avenue has been traveled.
*
* Returns true if the affinity of @tsk was changed, false otherwise.
**/
bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
const struct cpumask *cs_mask;
bool changed = false;
rcu_read_lock();
cs_mask = task_cs(tsk)->cpus_allowed;
if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
do_set_cpus_allowed(tsk, cs_mask);
changed = true;
}
rcu_read_unlock();
/*
* We own tsk->cpus_allowed, nobody can change it under us.
*
* But we used cs && cs->cpus_allowed lockless and thus can
* race with cgroup_attach_task() or update_cpumask() and get
* the wrong tsk->cpus_allowed. However, both cases imply the
* subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
* which takes task_rq_lock().
*
* If we are called after it dropped the lock we must see all
* changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
* set any mask even if it is not right from task_cs() pov,
* the pending set_cpus_allowed_ptr() will fix things.
*
* select_fallback_rq() will fix things ups and set cpu_possible_mask
* if required.
*/
return changed;
}
void __init cpuset_init_current_mems_allowed(void)
{
nodes_setall(current->mems_allowed);
}
/**
* cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
* @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
*
* Description: Returns the nodemask_t mems_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
* subset of node_states[N_MEMORY], even if this means going outside the
* tasks cpuset.
**/
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
nodemask_t mask;
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
return mask;
}
/**
* cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
* @nodemask: the nodemask to be checked
*
* Are any of the nodes in the nodemask allowed in current->mems_allowed?
*/
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
return nodes_intersects(*nodemask, current->mems_allowed);
}
/*
* nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
* mem_hardwall ancestor to the specified cpuset. Call holding
* callback_lock. If no ancestor is mem_exclusive or mem_hardwall
* (an unusual configuration), then returns the root cpuset.
*/
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
cs = parent_cs(cs);
return cs;
}
/**
* cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
* If we're in interrupt, yes, we can always allocate. If @node is set in
* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
* yes. If current has access to memory reserves as an oom victim, yes.
* Otherwise, no.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
* unless the task has been OOM killed.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
* Scanning up parent cpusets requires callback_lock. The
* __alloc_pages() routine only calls here with __GFP_HARDWALL bit
* _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
* current tasks mems_allowed came up empty on the first pass over
* the zonelist. So only GFP_KERNEL allocations, if all nodes in the
* cpuset are short of memory, might require taking the callback_lock.
*
* The first call here from mm/page_alloc:get_page_from_freelist()
* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
* so no allocation on a node outside the cpuset is allowed (unless
* in interrupt, of course).
*
* The second pass through get_page_from_freelist() doesn't even call
* here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
* variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
* in alloc_flags. That logic and the checks below have the combined
* affect that:
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
* tsk_is_oom_victim - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
unsigned long flags;
if (in_interrupt())
return true;
if (node_isset(node, current->mems_allowed))
return true;
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
*/
if (unlikely(tsk_is_oom_victim(current)))
return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return false;
if (current->flags & PF_EXITING) /* Let dying task have memory */
return true;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
spin_lock_irqsave(&callback_lock, flags);
rcu_read_lock();
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
rcu_read_unlock();
spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
}
/**
* cpuset_mem_spread_node() - On which node to begin search for a file page
* cpuset_slab_spread_node() - On which node to begin search for a slab page
*
* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
* tasks in a cpuset with is_spread_page or is_spread_slab set),
* and if the memory allocation used cpuset_mem_spread_node()
* to determine on which node to start looking, as it will for
* certain page cache or slab cache pages such as used for file
* system buffers and inode caches, then instead of starting on the
* local node to look for a free page, rather spread the starting
* node around the tasks mems_allowed nodes.
*
* We don't have to worry about the returned node being offline
* because "it can't happen", and even if it did, it would be ok.
*
* The routines calling guarantee_online_mems() are careful to
* only set nodes in task->mems_allowed that are online. So it
* should not be possible for the following code to return an
* offline node. But if it did, that would be ok, as this routine
* is not returning the node where the allocation must be, only
* the node where the search should start. The zonelist passed to
* __alloc_pages() will include all nodes. If the slab allocator
* is passed an offline node, it will fall back to the local node.
* See kmem_cache_alloc_node().
*/
static int cpuset_spread_node(int *rotor)
{
return *rotor = next_node_in(*rotor, current->mems_allowed);
}
int cpuset_mem_spread_node(void)
{
if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
current->cpuset_mem_spread_rotor =
node_random(¤t->mems_allowed);
return cpuset_spread_node(¤t->cpuset_mem_spread_rotor);
}
int cpuset_slab_spread_node(void)
{
if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
current->cpuset_slab_spread_rotor =
node_random(¤t->mems_allowed);
return cpuset_spread_node(¤t->cpuset_slab_spread_rotor);
}
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
/**
* cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
* @tsk1: pointer to task_struct of some task.
* @tsk2: pointer to task_struct of some other task.
*
* Description: Return true if @tsk1's mems_allowed intersects the
* mems_allowed of @tsk2. Used by the OOM killer to determine if
* one of the task's memory usage might impact the memory available
* to the other.
**/
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
const struct task_struct *tsk2)
{
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
/**
* cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
*
* Description: Prints current's name, cpuset name, and cached copy of its
* mems_allowed to the kernel log.
*/
void cpuset_print_current_mems_allowed(void)
{
struct cgroup *cgrp;
rcu_read_lock();
cgrp = task_cs(current)->css.cgroup;
pr_cont(",cpuset=");
pr_cont_cgroup_name(cgrp);
pr_cont(",mems_allowed=%*pbl",
nodemask_pr_args(¤t->mems_allowed));
rcu_read_unlock();
}
/*
* Collection of memory_pressure is suppressed unless
* this flag is enabled by writing "1" to the special
* cpuset file 'memory_pressure_enabled' in the root cpuset.
*/
int cpuset_memory_pressure_enabled __read_mostly;
/**
* cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
*
* Keep a running average of the rate of synchronous (direct)
* page reclaim efforts initiated by tasks in each cpuset.
*
* This represents the rate at which some task in the cpuset
* ran low on memory on all nodes it was allowed to use, and
* had to enter the kernels page reclaim code in an effort to
* create more free memory by tossing clean pages or swapping
* or writing dirty pages.
*
* Display to user space in the per-cpuset read-only file
* "memory_pressure". Value displayed is an integer
* representing the recent rate of entry into the synchronous
* (direct) page reclaim by any task attached to the cpuset.
**/
void __cpuset_memory_pressure_bump(void)
{
rcu_read_lock();
fmeter_markevent(&task_cs(current)->fmeter);
rcu_read_unlock();
}
#ifdef CONFIG_PROC_PID_CPUSET
/*
* proc_cpuset_show()
* - Print tasks cpuset path into seq_file.
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
* and we take cpuset_rwsem, keeping cpuset_attach() from changing it
* anyway.
*/
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk)
{
char *buf;
struct cgroup_subsys_state *css;
int retval;
retval = -ENOMEM;
buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
css = task_get_css(tsk, cpuset_cgrp_id);
retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
css_put(css);
if (retval >= PATH_MAX)
retval = -ENAMETOOLONG;
if (retval < 0)
goto out_free;
seq_puts(m, buf);
seq_putc(m, '\n');
retval = 0;
out_free:
kfree(buf);
out:
return retval;
}
#endif /* CONFIG_PROC_PID_CPUSET */
/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
seq_printf(m, "Mems_allowed:\t%*pb\n",
nodemask_pr_args(&task->mems_allowed));
seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
nodemask_pr_args(&task->mems_allowed));
}
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/uaccess.h>
/* out-of-line parts */
#ifndef INLINE_COPY_FROM_USER
unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n)
{
unsigned long res = n;
might_fault();
if (!should_fail_usercopy() && likely(access_ok(from, n))) {
instrument_copy_from_user(to, from, n);
res = raw_copy_from_user(to, from, n);
}
if (unlikely(res)) memset(to + (n - res), 0, res); return res;}
EXPORT_SYMBOL(_copy_from_user);
#endif
#ifndef INLINE_COPY_TO_USER
unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
{
might_fault();
if (should_fail_usercopy())
return n;
if (likely(access_ok(to, n))) {
instrument_copy_to_user(to, from, n);
n = raw_copy_to_user(to, from, n);
}
return n;
}
EXPORT_SYMBOL(_copy_to_user);
#endif
/**
* check_zeroed_user: check if a userspace buffer only contains zero bytes
* @from: Source address, in userspace.
* @size: Size of buffer.
*
* This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for
* userspace addresses (and is more efficient because we don't care where the
* first non-zero byte is).
*
* Returns:
* * 0: There were non-zero bytes present in the buffer.
* * 1: The buffer was full of zero bytes.
* * -EFAULT: access to userspace failed.
*/
int check_zeroed_user(const void __user *from, size_t size)
{
unsigned long val;
uintptr_t align = (uintptr_t) from % sizeof(unsigned long);
if (unlikely(size == 0))
return 1;
from -= align;
size += align;
if (!user_read_access_begin(from, size))
return -EFAULT;
unsafe_get_user(val, (unsigned long __user *) from, err_fault);
if (align)
val &= ~aligned_byte_mask(align);
while (size > sizeof(unsigned long)) {
if (unlikely(val))
goto done;
from += sizeof(unsigned long);
size -= sizeof(unsigned long);
unsafe_get_user(val, (unsigned long __user *) from, err_fault);
}
if (size < sizeof(unsigned long))
val &= aligned_byte_mask(size);
done:
user_read_access_end();
return (val == 0);
err_fault:
user_read_access_end();
return -EFAULT;
}
EXPORT_SYMBOL(check_zeroed_user);
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/fs/file_table.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
*/
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/eventpoll.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
#include <linux/capability.h>
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/task_work.h>
#include <linux/ima.h>
#include <linux/swap.h>
#include <linux/atomic.h>
#include "internal.h"
/* sysctl tunables... */
struct files_stat_struct files_stat = {
.max_files = NR_FILE
};
/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __read_mostly;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
static void file_free_rcu(struct rcu_head *head)
{
struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
put_cred(f->f_cred);
kmem_cache_free(filp_cachep, f);
}
static inline void file_free(struct file *f)
{
security_file_free(f);
if (!(f->f_mode & FMODE_NOACCOUNT))
percpu_counter_dec(&nr_files);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}
/*
* Return the total number of open files in the system
*/
static long get_nr_files(void)
{
return percpu_counter_read_positive(&nr_files);
}
/*
* Return the maximum number of open files in the system
*/
unsigned long get_max_files(void)
{
return files_stat.max_files;
}
EXPORT_SYMBOL_GPL(get_max_files);
/*
* Handle nr_files sysctl
*/
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
int proc_nr_files(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
files_stat.nr_files = get_nr_files();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#else
int proc_nr_files(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
}
#endif
static struct file *__alloc_file(int flags, const struct cred *cred)
{
struct file *f;
int error;
f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
f->f_cred = get_cred(cred);
error = security_file_alloc(f);
if (unlikely(error)) {
file_free_rcu(&f->f_u.fu_rcuhead);
return ERR_PTR(error);
}
atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
mutex_init(&f->f_pos_lock);
f->f_flags = flags;
f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
return f;
}
/* Find an unused file structure and return a pointer to it.
* Returns an error pointer if some error happend e.g. we over file
* structures limit, run out of memory or operation is not permitted.
*
* Be very careful using this. You are responsible for
* getting write access to any mount that you might assign
* to this filp, if it is opened for write. If this is not
* done, you will imbalance int the mount's writer count
* and a warning at __fput() time.
*/
struct file *alloc_empty_file(int flags, const struct cred *cred)
{
static long old_max;
struct file *f;
/*
* Privileged users can go above max_files
*/
if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
/*
* percpu_counters are inaccurate. Do an expensive check before
* we go and fail.
*/
if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
goto over;
}
f = __alloc_file(flags, cred);
if (!IS_ERR(f))
percpu_counter_inc(&nr_files);
return f;
over:
/* Ran out of filps - report that */
if (get_nr_files() > old_max) {
pr_info("VFS: file-max limit %lu reached\n", get_max_files());
old_max = get_nr_files();
}
return ERR_PTR(-ENFILE);
}
/*
* Variant of alloc_empty_file() that doesn't check and modify nr_files.
*
* Should not be used unless there's a very good reason to do so.
*/
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
struct file *f = __alloc_file(flags, cred);
if (!IS_ERR(f))
f->f_mode |= FMODE_NOACCOUNT;
return f;
}
/**
* alloc_file - allocate and initialize a 'struct file'
*
* @path: the (dentry, vfsmount) pair for the new file
* @flags: O_... flags with which the new file will be opened
* @fop: the 'struct file_operations' for the new file
*/
static struct file *alloc_file(const struct path *path, int flags,
const struct file_operations *fop)
{
struct file *file;
file = alloc_empty_file(flags, current_cred());
if (IS_ERR(file))
return file;
file->f_path = *path;
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
file->f_sb_err = file_sample_sb_err(file);
if ((file->f_mode & FMODE_READ) &&
likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) file->f_mode |= FMODE_CAN_WRITE; file->f_mode |= FMODE_OPENED;
file->f_op = fop;
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode);
return file;
}
struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
const char *name, int flags,
const struct file_operations *fops)
{
static const struct dentry_operations anon_ops = {
.d_dname = simple_dname
};
struct qstr this = QSTR_INIT(name, strlen(name));
struct path path;
struct file *file;
path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
if (!path.dentry)
return ERR_PTR(-ENOMEM);
if (!mnt->mnt_sb->s_d_op) d_set_d_op(path.dentry, &anon_ops); path.mnt = mntget(mnt);
d_instantiate(path.dentry, inode);
file = alloc_file(&path, flags, fops);
if (IS_ERR(file)) { ihold(inode);
path_put(&path);
}
return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);
struct file *alloc_file_clone(struct file *base, int flags,
const struct file_operations *fops)
{
struct file *f = alloc_file(&base->f_path, flags, fops);
if (!IS_ERR(f)) {
path_get(&f->f_path);
f->f_mapping = base->f_mapping;
}
return f;
}
/* the real guts of fput() - releasing the last reference to file
*/
static void __fput(struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
struct vfsmount *mnt = file->f_path.mnt;
struct inode *inode = file->f_inode;
fmode_t mode = file->f_mode;
if (unlikely(!(file->f_mode & FMODE_OPENED)))
goto out;
might_sleep();
fsnotify_close(file);
/*
* The function eventpoll_release() should be the first called
* in the file cleanup chain.
*/
eventpoll_release(file);
locks_remove_file(file);
ima_file_free(file);
if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) file->f_op->fasync(-1, file, 0);
}
if (file->f_op->release) file->f_op->release(inode, file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
!(mode & FMODE_PATH))) {
cdev_put(inode->i_cdev);
}
fops_put(file->f_op); put_pid(file->f_owner.pid);
if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_dec(inode);
if (mode & FMODE_WRITER) {
put_write_access(inode);
__mnt_drop_write(mnt);
}
dput(dentry);
if (unlikely(mode & FMODE_NEED_UNMOUNT))
dissolve_on_fput(mnt); mntput(mnt);
out:
file_free(file);
}
static LLIST_HEAD(delayed_fput_list);
static void delayed_fput(struct work_struct *unused)
{
struct llist_node *node = llist_del_all(&delayed_fput_list);
struct file *f, *t;
llist_for_each_entry_safe(f, t, node, f_u.fu_llist)
__fput(f);
}
static void ____fput(struct callback_head *work)
{
__fput(container_of(work, struct file, f_u.fu_rcuhead));
}
/*
* If kernel thread really needs to have the final fput() it has done
* to complete, call this. The only user right now is the boot - we
* *do* need to make sure our writes to binaries on initramfs has
* not left us with opened struct file waiting for __fput() - execve()
* won't work without that. Please, don't add more callers without
* very good reasons; in particular, never call that with locks
* held and never call that from a thread that might need to do
* some work on any kind of umount.
*/
void flush_delayed_fput(void)
{
delayed_fput(NULL);
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
void fput_many(struct file *file, unsigned int refs)
{
if (atomic_long_sub_and_test(refs, &file->f_count)) {
struct task_struct *task = current;
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
init_task_work(&file->f_u.fu_rcuhead, ____fput);
if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME))
return;
/*
* After this task has run exit_task_work(),
* task_work_add() will fail. Fall through to delayed
* fput to avoid leaking *file.
*/
}
if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
schedule_delayed_work(&delayed_fput_work, 1);
}
}
void fput(struct file *file)
{
fput_many(file, 1);
}
/*
* synchronous analog of fput(); for kernel threads that might be needed
* in some umount() (and thus can't use flush_delayed_fput() without
* risking deadlocks), need to wait for completion of __fput() and know
* for this specific struct file it won't involve anything that would
* need them. Use only if you really need it - at the very least,
* don't blindly convert fput() by kernel thread to that.
*/
void __fput_sync(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
BUG_ON(!(task->flags & PF_KTHREAD));
__fput(file);
}
}
EXPORT_SYMBOL(fput);
void __init files_init(void)
{
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
/*
* One file with associated inode and dcache is very roughly 1K. Per default
* do not use more than 10% of our memory for files.
*/
void __init files_maxfiles_init(void)
{
unsigned long n;
unsigned long nr_pages = totalram_pages();
unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;
memreserve = min(memreserve, nr_pages - 1);
n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = max_t(unsigned long, n, NR_FILE);
}
// SPDX-License-Identifier: GPL-2.0
/*
* bus.c - bus driver management
*
* Copyright (c) 2002-3 Patrick Mochel
* Copyright (c) 2002-3 Open Source Development Labs
* Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de>
* Copyright (c) 2007 Novell Inc.
*/
#include <linux/async.h>
#include <linux/device/bus.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/sysfs.h>
#include "base.h"
#include "power/power.h"
/* /sys/devices/system */
static struct kset *system_kset;
#define to_bus_attr(_attr) container_of(_attr, struct bus_attribute, attr)
/*
* sysfs bindings for drivers
*/
#define to_drv_attr(_attr) container_of(_attr, struct driver_attribute, attr)
#define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \
struct driver_attribute driver_attr_##_name = \
__ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store)
static int __must_check bus_rescan_devices_helper(struct device *dev,
void *data);
static struct bus_type *bus_get(struct bus_type *bus)
{
if (bus) {
kset_get(&bus->p->subsys);
return bus;
}
return NULL;
}
static void bus_put(struct bus_type *bus)
{
if (bus)
kset_put(&bus->p->subsys);
}
static ssize_t drv_attr_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
struct driver_attribute *drv_attr = to_drv_attr(attr);
struct driver_private *drv_priv = to_driver(kobj);
ssize_t ret = -EIO;
if (drv_attr->show)
ret = drv_attr->show(drv_priv->driver, buf);
return ret;
}
static ssize_t drv_attr_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
struct driver_attribute *drv_attr = to_drv_attr(attr);
struct driver_private *drv_priv = to_driver(kobj);
ssize_t ret = -EIO;
if (drv_attr->store)
ret = drv_attr->store(drv_priv->driver, buf, count);
return ret;
}
static const struct sysfs_ops driver_sysfs_ops = {
.show = drv_attr_show,
.store = drv_attr_store,
};
static void driver_release(struct kobject *kobj)
{
struct driver_private *drv_priv = to_driver(kobj);
pr_debug("driver: '%s': %s\n", kobject_name(kobj), __func__);
kfree(drv_priv);
}
static struct kobj_type driver_ktype = {
.sysfs_ops = &driver_sysfs_ops,
.release = driver_release,
};
/*
* sysfs bindings for buses
*/
static ssize_t bus_attr_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
struct bus_attribute *bus_attr = to_bus_attr(attr);
struct subsys_private *subsys_priv = to_subsys_private(kobj);
ssize_t ret = 0;
if (bus_attr->show)
ret = bus_attr->show(subsys_priv->bus, buf);
return ret;
}
static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
struct bus_attribute *bus_attr = to_bus_attr(attr);
struct subsys_private *subsys_priv = to_subsys_private(kobj);
ssize_t ret = 0;
if (bus_attr->store)
ret = bus_attr->store(subsys_priv->bus, buf, count);
return ret;
}
static const struct sysfs_ops bus_sysfs_ops = {
.show = bus_attr_show,
.store = bus_attr_store,
};
int bus_create_file(struct bus_type *bus, struct bus_attribute *attr)
{
int error;
if (bus_get(bus)) {
error = sysfs_create_file(&bus->p->subsys.kobj, &attr->attr);
bus_put(bus);
} else
error = -EINVAL;
return error;
}
EXPORT_SYMBOL_GPL(bus_create_file);
void bus_remove_file(struct bus_type *bus, struct bus_attribute *attr)
{
if (bus_get(bus)) {
sysfs_remove_file(&bus->p->subsys.kobj, &attr->attr);
bus_put(bus);
}
}
EXPORT_SYMBOL_GPL(bus_remove_file);
static void bus_release(struct kobject *kobj)
{
struct subsys_private *priv = to_subsys_private(kobj);
struct bus_type *bus = priv->bus;
kfree(priv);
bus->p = NULL;
}
static struct kobj_type bus_ktype = {
.sysfs_ops = &bus_sysfs_ops,
.release = bus_release,
};
static int bus_uevent_filter(struct kset *kset, struct kobject *kobj)
{
struct kobj_type *ktype = get_ktype(kobj);
if (ktype == &bus_ktype)
return 1;
return 0;
}
static const struct kset_uevent_ops bus_uevent_ops = {
.filter = bus_uevent_filter,
};
static struct kset *bus_kset;
/* Manually detach a device from its associated driver. */
static ssize_t unbind_store(struct device_driver *drv, const char *buf,
size_t count)
{
struct bus_type *bus = bus_get(drv->bus);
struct device *dev;
int err = -ENODEV;
dev = bus_find_device_by_name(bus, NULL, buf);
if (dev && dev->driver == drv) {
device_driver_detach(dev);
err = count;
}
put_device(dev);
bus_put(bus);
return err;
}
static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, 0200, NULL, unbind_store);
/*
* Manually attach a device to a driver.
* Note: the driver must want to bind to the device,
* it is not possible to override the driver's id table.
*/
static ssize_t bind_store(struct device_driver *drv, const char *buf,
size_t count)
{
struct bus_type *bus = bus_get(drv->bus);
struct device *dev;
int err = -ENODEV;
dev = bus_find_device_by_name(bus, NULL, buf);
if (dev && driver_match_device(drv, dev)) {
err = device_driver_attach(drv, dev);
if (!err) {
/* success */
err = count;
}
}
put_device(dev);
bus_put(bus);
return err;
}
static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store);
static ssize_t drivers_autoprobe_show(struct bus_type *bus, char *buf)
{
return sysfs_emit(buf, "%d\n", bus->p->drivers_autoprobe);
}
static ssize_t drivers_autoprobe_store(struct bus_type *bus,
const char *buf, size_t count)
{
if (buf[0] == '0')
bus->p->drivers_autoprobe = 0;
else
bus->p->drivers_autoprobe = 1;
return count;
}
static ssize_t drivers_probe_store(struct bus_type *bus,
const char *buf, size_t count)
{
struct device *dev;
int err = -EINVAL;
dev = bus_find_device_by_name(bus, NULL, buf);
if (!dev)
return -ENODEV;
if (bus_rescan_devices_helper(dev, NULL) == 0)
err = count;
put_device(dev);
return err;
}
static struct device *next_device(struct klist_iter *i)
{
struct klist_node *n = klist_next(i);
struct device *dev = NULL;
struct device_private *dev_prv;
if (n) {
dev_prv = to_device_private_bus(n);
dev = dev_prv->device;
}
return dev;
}
/**
* bus_for_each_dev - device iterator.
* @bus: bus type.
* @start: device to start iterating from.
* @data: data for the callback.
* @fn: function to be called for each device.
*
* Iterate over @bus's list of devices, and call @fn for each,
* passing it @data. If @start is not NULL, we use that device to
* begin iterating from.
*
* We check the return of @fn each time. If it returns anything
* other than 0, we break out and return that value.
*
* NOTE: The device that returns a non-zero value is not retained
* in any way, nor is its refcount incremented. If the caller needs
* to retain this data, it should do so, and increment the reference
* count in the supplied callback.
*/
int bus_for_each_dev(struct bus_type *bus, struct device *start,
void *data, int (*fn)(struct device *, void *))
{
struct klist_iter i;
struct device *dev;
int error = 0;
if (!bus || !bus->p)
return -EINVAL;
klist_iter_init_node(&bus->p->klist_devices, &i,
(start ? &start->p->knode_bus : NULL));
while (!error && (dev = next_device(&i)))
error = fn(dev, data);
klist_iter_exit(&i);
return error;
}
EXPORT_SYMBOL_GPL(bus_for_each_dev);
/**
* bus_find_device - device iterator for locating a particular device.
* @bus: bus type
* @start: Device to begin with
* @data: Data to pass to match function
* @match: Callback function to check device
*
* This is similar to the bus_for_each_dev() function above, but it
* returns a reference to a device that is 'found' for later use, as
* determined by the @match callback.
*
* The callback should return 0 if the device doesn't match and non-zero
* if it does. If the callback returns non-zero, this function will
* return to the caller and not iterate over any more devices.
*/
struct device *bus_find_device(struct bus_type *bus,
struct device *start, const void *data,
int (*match)(struct device *dev, const void *data))
{
struct klist_iter i;
struct device *dev;
if (!bus || !bus->p)
return NULL;
klist_iter_init_node(&bus->p->klist_devices, &i,
(start ? &start->p->knode_bus : NULL));
while ((dev = next_device(&i)))
if (match(dev, data) && get_device(dev))
break;
klist_iter_exit(&i);
return dev;
}
EXPORT_SYMBOL_GPL(bus_find_device);
/**
* subsys_find_device_by_id - find a device with a specific enumeration number
* @subsys: subsystem
* @id: index 'id' in struct device
* @hint: device to check first
*
* Check the hint's next object and if it is a match return it directly,
* otherwise, fall back to a full list search. Either way a reference for
* the returned object is taken.
*/
struct device *subsys_find_device_by_id(struct bus_type *subsys, unsigned int id,
struct device *hint)
{
struct klist_iter i;
struct device *dev;
if (!subsys)
return NULL;
if (hint) {
klist_iter_init_node(&subsys->p->klist_devices, &i, &hint->p->knode_bus);
dev = next_device(&i);
if (dev && dev->id == id && get_device(dev)) {
klist_iter_exit(&i);
return dev;
}
klist_iter_exit(&i);
}
klist_iter_init_node(&subsys->p->klist_devices, &i, NULL);
while ((dev = next_device(&i))) {
if (dev->id == id && get_device(dev)) {
klist_iter_exit(&i);
return dev;
}
}
klist_iter_exit(&i);
return NULL;
}
EXPORT_SYMBOL_GPL(subsys_find_device_by_id);
static struct device_driver *next_driver(struct klist_iter *i)
{
struct klist_node *n = klist_next(i);
struct driver_private *drv_priv;
if (n) {
drv_priv = container_of(n, struct driver_private, knode_bus);
return drv_priv->driver;
}
return NULL;
}
/**
* bus_for_each_drv - driver iterator
* @bus: bus we're dealing with.
* @start: driver to start iterating on.
* @data: data to pass to the callback.
* @fn: function to call for each driver.
*
* This is nearly identical to the device iterator above.
* We iterate over each driver that belongs to @bus, and call
* @fn for each. If @fn returns anything but 0, we break out
* and return it. If @start is not NULL, we use it as the head
* of the list.
*
* NOTE: we don't return the driver that returns a non-zero
* value, nor do we leave the reference count incremented for that
* driver. If the caller needs to know that info, it must set it
* in the callback. It must also be sure to increment the refcount
* so it doesn't disappear before returning to the caller.
*/
int bus_for_each_drv(struct bus_type *bus, struct device_driver *start,
void *data, int (*fn)(struct device_driver *, void *))
{
struct klist_iter i;
struct device_driver *drv;
int error = 0;
if (!bus)
return -EINVAL;
klist_iter_init_node(&bus->p->klist_drivers, &i,
start ? &start->p->knode_bus : NULL);
while ((drv = next_driver(&i)) && !error)
error = fn(drv, data);
klist_iter_exit(&i);
return error;
}
EXPORT_SYMBOL_GPL(bus_for_each_drv);
/**
* bus_add_device - add device to bus
* @dev: device being added
*
* - Add device's bus attributes.
* - Create links to device's bus.
* - Add the device to its bus's list of devices.
*/
int bus_add_device(struct device *dev)
{
struct bus_type *bus = bus_get(dev->bus);
int error = 0;
if (bus) {
pr_debug("bus: '%s': add device %s\n", bus->name, dev_name(dev));
error = device_add_groups(dev, bus->dev_groups);
if (error)
goto out_put;
error = sysfs_create_link(&bus->p->devices_kset->kobj,
&dev->kobj, dev_name(dev));
if (error)
goto out_groups;
error = sysfs_create_link(&dev->kobj,
&dev->bus->p->subsys.kobj, "subsystem");
if (error)
goto out_subsys;
klist_add_tail(&dev->p->knode_bus, &bus->p->klist_devices);
}
return 0;
out_subsys:
sysfs_remove_link(&bus->p->devices_kset->kobj, dev_name(dev));
out_groups:
device_remove_groups(dev, bus->dev_groups);
out_put:
bus_put(dev->bus);
return error;
}
/**
* bus_probe_device - probe drivers for a new device
* @dev: device to probe
*
* - Automatically probe for a driver if the bus allows it.
*/
void bus_probe_device(struct device *dev)
{
struct bus_type *bus = dev->bus;
struct subsys_interface *sif;
if (!bus)
return;
if (bus->p->drivers_autoprobe) device_initial_probe(dev); mutex_lock(&bus->p->mutex); list_for_each_entry(sif, &bus->p->interfaces, node) if (sif->add_dev) sif->add_dev(dev, sif); mutex_unlock(&bus->p->mutex);
}
/**
* bus_remove_device - remove device from bus
* @dev: device to be removed
*
* - Remove device from all interfaces.
* - Remove symlink from bus' directory.
* - Delete device from bus's list.
* - Detach from its driver.
* - Drop reference taken in bus_add_device().
*/
void bus_remove_device(struct device *dev)
{
struct bus_type *bus = dev->bus;
struct subsys_interface *sif;
if (!bus)
return;
mutex_lock(&bus->p->mutex); list_for_each_entry(sif, &bus->p->interfaces, node) if (sif->remove_dev) sif->remove_dev(dev, sif); mutex_unlock(&bus->p->mutex);
sysfs_remove_link(&dev->kobj, "subsystem");
sysfs_remove_link(&dev->bus->p->devices_kset->kobj,
dev_name(dev));
device_remove_groups(dev, dev->bus->dev_groups);
if (klist_node_attached(&dev->p->knode_bus))
klist_del(&dev->p->knode_bus);
pr_debug("bus: '%s': remove device %s\n",
dev->bus->name, dev_name(dev));
device_release_driver(dev);
bus_put(dev->bus);
}
static int __must_check add_bind_files(struct device_driver *drv)
{
int ret;
ret = driver_create_file(drv, &driver_attr_unbind);
if (ret == 0) {
ret = driver_create_file(drv, &driver_attr_bind);
if (ret)
driver_remove_file(drv, &driver_attr_unbind);
}
return ret;
}
static void remove_bind_files(struct device_driver *drv)
{
driver_remove_file(drv, &driver_attr_bind);
driver_remove_file(drv, &driver_attr_unbind);
}
static BUS_ATTR_WO(drivers_probe);
static BUS_ATTR_RW(drivers_autoprobe);
static int add_probe_files(struct bus_type *bus)
{
int retval;
retval = bus_create_file(bus, &bus_attr_drivers_probe);
if (retval)
goto out;
retval = bus_create_file(bus, &bus_attr_drivers_autoprobe);
if (retval)
bus_remove_file(bus, &bus_attr_drivers_probe);
out:
return retval;
}
static void remove_probe_files(struct bus_type *bus)
{
bus_remove_file(bus, &bus_attr_drivers_autoprobe);
bus_remove_file(bus, &bus_attr_drivers_probe);
}
static ssize_t uevent_store(struct device_driver *drv, const char *buf,
size_t count)
{
int rc;
rc = kobject_synth_uevent(&drv->p->kobj, buf, count);
return rc ? rc : count;
}
static DRIVER_ATTR_WO(uevent);
/**
* bus_add_driver - Add a driver to the bus.
* @drv: driver.
*/
int bus_add_driver(struct device_driver *drv)
{
struct bus_type *bus;
struct driver_private *priv;
int error = 0;
bus = bus_get(drv->bus);
if (!bus)
return -EINVAL;
pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name);
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (!priv) {
error = -ENOMEM;
goto out_put_bus;
}
klist_init(&priv->klist_devices, NULL, NULL);
priv->driver = drv;
drv->p = priv;
priv->kobj.kset = bus->p->drivers_kset;
error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL,
"%s", drv->name);
if (error)
goto out_unregister;
klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers);
if (drv->bus->p->drivers_autoprobe) {
error = driver_attach(drv);
if (error)
goto out_unregister;
}
module_add_driver(drv->owner, drv);
error = driver_create_file(drv, &driver_attr_uevent);
if (error) {
printk(KERN_ERR "%s: uevent attr (%s) failed\n",
__func__, drv->name);
}
error = driver_add_groups(drv, bus->drv_groups);
if (error) {
/* How the hell do we get out of this pickle? Give up */
printk(KERN_ERR "%s: driver_add_groups(%s) failed\n",
__func__, drv->name);
}
if (!drv->suppress_bind_attrs) {
error = add_bind_files(drv);
if (error) {
/* Ditto */
printk(KERN_ERR "%s: add_bind_files(%s) failed\n",
__func__, drv->name);
}
}
return 0;
out_unregister:
kobject_put(&priv->kobj);
/* drv->p is freed in driver_release() */
drv->p = NULL;
out_put_bus:
bus_put(bus);
return error;
}
/**
* bus_remove_driver - delete driver from bus's knowledge.
* @drv: driver.
*
* Detach the driver from the devices it controls, and remove
* it from its bus's list of drivers. Finally, we drop the reference
* to the bus we took in bus_add_driver().
*/
void bus_remove_driver(struct device_driver *drv)
{
if (!drv->bus)
return;
if (!drv->suppress_bind_attrs)
remove_bind_files(drv);
driver_remove_groups(drv, drv->bus->drv_groups);
driver_remove_file(drv, &driver_attr_uevent);
klist_remove(&drv->p->knode_bus);
pr_debug("bus: '%s': remove driver %s\n", drv->bus->name, drv->name);
driver_detach(drv);
module_remove_driver(drv);
kobject_put(&drv->p->kobj);
bus_put(drv->bus);
}
/* Helper for bus_rescan_devices's iter */
static int __must_check bus_rescan_devices_helper(struct device *dev,
void *data)
{
int ret = 0;
if (!dev->driver) {
if (dev->parent && dev->bus->need_parent_lock)
device_lock(dev->parent);
ret = device_attach(dev);
if (dev->parent && dev->bus->need_parent_lock)
device_unlock(dev->parent);
}
return ret < 0 ? ret : 0;
}
/**
* bus_rescan_devices - rescan devices on the bus for possible drivers
* @bus: the bus to scan.
*
* This function will look for devices on the bus with no driver
* attached and rescan it against existing drivers to see if it matches
* any by calling device_attach() for the unbound devices.
*/
int bus_rescan_devices(struct bus_type *bus)
{
return bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper);
}
EXPORT_SYMBOL_GPL(bus_rescan_devices);
/**
* device_reprobe - remove driver for a device and probe for a new driver
* @dev: the device to reprobe
*
* This function detaches the attached driver (if any) for the given
* device and restarts the driver probing process. It is intended
* to use if probing criteria changed during a devices lifetime and
* driver attachment should change accordingly.
*/
int device_reprobe(struct device *dev)
{
if (dev->driver)
device_driver_detach(dev);
return bus_rescan_devices_helper(dev, NULL);
}
EXPORT_SYMBOL_GPL(device_reprobe);
static int bus_add_groups(struct bus_type *bus,
const struct attribute_group **groups)
{
return sysfs_create_groups(&bus->p->subsys.kobj, groups);
}
static void bus_remove_groups(struct bus_type *bus,
const struct attribute_group **groups)
{
sysfs_remove_groups(&bus->p->subsys.kobj, groups);
}
static void klist_devices_get(struct klist_node *n)
{
struct device_private *dev_prv = to_device_private_bus(n);
struct device *dev = dev_prv->device;
get_device(dev);
}
static void klist_devices_put(struct klist_node *n)
{
struct device_private *dev_prv = to_device_private_bus(n);
struct device *dev = dev_prv->device;
put_device(dev);
}
static ssize_t bus_uevent_store(struct bus_type *bus,
const char *buf, size_t count)
{
int rc;
rc = kobject_synth_uevent(&bus->p->subsys.kobj, buf, count);
return rc ? rc : count;
}
/*
* "open code" the old BUS_ATTR() macro here. We want to use BUS_ATTR_WO()
* here, but can not use it as earlier in the file we have
* DEVICE_ATTR_WO(uevent), which would cause a clash with the with the store
* function name.
*/
static struct bus_attribute bus_attr_uevent = __ATTR(uevent, 0200, NULL,
bus_uevent_store);
/**
* bus_register - register a driver-core subsystem
* @bus: bus to register
*
* Once we have that, we register the bus with the kobject
* infrastructure, then register the children subsystems it has:
* the devices and drivers that belong to the subsystem.
*/
int bus_register(struct bus_type *bus)
{
int retval;
struct subsys_private *priv;
struct lock_class_key *key = &bus->lock_key;
priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL);
if (!priv)
return -ENOMEM;
priv->bus = bus;
bus->p = priv;
BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier);
retval = kobject_set_name(&priv->subsys.kobj, "%s", bus->name);
if (retval)
goto out;
priv->subsys.kobj.kset = bus_kset;
priv->subsys.kobj.ktype = &bus_ktype;
priv->drivers_autoprobe = 1;
retval = kset_register(&priv->subsys);
if (retval)
goto out;
retval = bus_create_file(bus, &bus_attr_uevent);
if (retval)
goto bus_uevent_fail;
priv->devices_kset = kset_create_and_add("devices", NULL,
&priv->subsys.kobj);
if (!priv->devices_kset) {
retval = -ENOMEM;
goto bus_devices_fail;
}
priv->drivers_kset = kset_create_and_add("drivers", NULL,
&priv->subsys.kobj);
if (!priv->drivers_kset) {
retval = -ENOMEM;
goto bus_drivers_fail;
}
INIT_LIST_HEAD(&priv->interfaces);
__mutex_init(&priv->mutex, "subsys mutex", key);
klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put);
klist_init(&priv->klist_drivers, NULL, NULL);
retval = add_probe_files(bus);
if (retval)
goto bus_probe_files_fail;
retval = bus_add_groups(bus, bus->bus_groups);
if (retval)
goto bus_groups_fail;
pr_debug("bus: '%s': registered\n", bus->name);
return 0;
bus_groups_fail:
remove_probe_files(bus);
bus_probe_files_fail:
kset_unregister(bus->p->drivers_kset);
bus_drivers_fail:
kset_unregister(bus->p->devices_kset);
bus_devices_fail:
bus_remove_file(bus, &bus_attr_uevent);
bus_uevent_fail:
kset_unregister(&bus->p->subsys);
out:
kfree(bus->p);
bus->p = NULL;
return retval;
}
EXPORT_SYMBOL_GPL(bus_register);
/**
* bus_unregister - remove a bus from the system
* @bus: bus.
*
* Unregister the child subsystems and the bus itself.
* Finally, we call bus_put() to release the refcount
*/
void bus_unregister(struct bus_type *bus)
{
pr_debug("bus: '%s': unregistering\n", bus->name);
if (bus->dev_root)
device_unregister(bus->dev_root);
bus_remove_groups(bus, bus->bus_groups);
remove_probe_files(bus);
kset_unregister(bus->p->drivers_kset);
kset_unregister(bus->p->devices_kset);
bus_remove_file(bus, &bus_attr_uevent);
kset_unregister(&bus->p->subsys);
}
EXPORT_SYMBOL_GPL(bus_unregister);
int bus_register_notifier(struct bus_type *bus, struct notifier_block *nb)
{
return blocking_notifier_chain_register(&bus->p->bus_notifier, nb);
}
EXPORT_SYMBOL_GPL(bus_register_notifier);
int bus_unregister_notifier(struct bus_type *bus, struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&bus->p->bus_notifier, nb);
}
EXPORT_SYMBOL_GPL(bus_unregister_notifier);
struct kset *bus_get_kset(struct bus_type *bus)
{
return &bus->p->subsys;
}
EXPORT_SYMBOL_GPL(bus_get_kset);
struct klist *bus_get_device_klist(struct bus_type *bus)
{
return &bus->p->klist_devices;
}
EXPORT_SYMBOL_GPL(bus_get_device_klist);
/*
* Yes, this forcibly breaks the klist abstraction temporarily. It
* just wants to sort the klist, not change reference counts and
* take/drop locks rapidly in the process. It does all this while
* holding the lock for the list, so objects can't otherwise be
* added/removed while we're swizzling.
*/
static void device_insertion_sort_klist(struct device *a, struct list_head *list,
int (*compare)(const struct device *a,
const struct device *b))
{
struct klist_node *n;
struct device_private *dev_prv;
struct device *b;
list_for_each_entry(n, list, n_node) {
dev_prv = to_device_private_bus(n);
b = dev_prv->device;
if (compare(a, b) <= 0) {
list_move_tail(&a->p->knode_bus.n_node,
&b->p->knode_bus.n_node);
return;
}
}
list_move_tail(&a->p->knode_bus.n_node, list);
}
void bus_sort_breadthfirst(struct bus_type *bus,
int (*compare)(const struct device *a,
const struct device *b))
{
LIST_HEAD(sorted_devices);
struct klist_node *n, *tmp;
struct device_private *dev_prv;
struct device *dev;
struct klist *device_klist;
device_klist = bus_get_device_klist(bus);
spin_lock(&device_klist->k_lock);
list_for_each_entry_safe(n, tmp, &device_klist->k_list, n_node) {
dev_prv = to_device_private_bus(n);
dev = dev_prv->device;
device_insertion_sort_klist(dev, &sorted_devices, compare);
}
list_splice(&sorted_devices, &device_klist->k_list);
spin_unlock(&device_klist->k_lock);
}
EXPORT_SYMBOL_GPL(bus_sort_breadthfirst);
/**
* subsys_dev_iter_init - initialize subsys device iterator
* @iter: subsys iterator to initialize
* @subsys: the subsys we wanna iterate over
* @start: the device to start iterating from, if any
* @type: device_type of the devices to iterate over, NULL for all
*
* Initialize subsys iterator @iter such that it iterates over devices
* of @subsys. If @start is set, the list iteration will start there,
* otherwise if it is NULL, the iteration starts at the beginning of
* the list.
*/
void subsys_dev_iter_init(struct subsys_dev_iter *iter, struct bus_type *subsys,
struct device *start, const struct device_type *type)
{
struct klist_node *start_knode = NULL;
if (start)
start_knode = &start->p->knode_bus;
klist_iter_init_node(&subsys->p->klist_devices, &iter->ki, start_knode);
iter->type = type;
}
EXPORT_SYMBOL_GPL(subsys_dev_iter_init);
/**
* subsys_dev_iter_next - iterate to the next device
* @iter: subsys iterator to proceed
*
* Proceed @iter to the next device and return it. Returns NULL if
* iteration is complete.
*
* The returned device is referenced and won't be released till
* iterator is proceed to the next device or exited. The caller is
* free to do whatever it wants to do with the device including
* calling back into subsys code.
*/
struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter)
{
struct klist_node *knode;
struct device *dev;
for (;;) {
knode = klist_next(&iter->ki);
if (!knode)
return NULL;
dev = to_device_private_bus(knode)->device;
if (!iter->type || iter->type == dev->type)
return dev;
}
}
EXPORT_SYMBOL_GPL(subsys_dev_iter_next);
/**
* subsys_dev_iter_exit - finish iteration
* @iter: subsys iterator to finish
*
* Finish an iteration. Always call this function after iteration is
* complete whether the iteration ran till the end or not.
*/
void subsys_dev_iter_exit(struct subsys_dev_iter *iter)
{
klist_iter_exit(&iter->ki);
}
EXPORT_SYMBOL_GPL(subsys_dev_iter_exit);
int subsys_interface_register(struct subsys_interface *sif)
{
struct bus_type *subsys;
struct subsys_dev_iter iter;
struct device *dev;
if (!sif || !sif->subsys)
return -ENODEV;
subsys = bus_get(sif->subsys);
if (!subsys)
return -EINVAL;
mutex_lock(&subsys->p->mutex);
list_add_tail(&sif->node, &subsys->p->interfaces);
if (sif->add_dev) {
subsys_dev_iter_init(&iter, subsys, NULL, NULL);
while ((dev = subsys_dev_iter_next(&iter)))
sif->add_dev(dev, sif);
subsys_dev_iter_exit(&iter);
}
mutex_unlock(&subsys->p->mutex);
return 0;
}
EXPORT_SYMBOL_GPL(subsys_interface_register);
void subsys_interface_unregister(struct subsys_interface *sif)
{
struct bus_type *subsys;
struct subsys_dev_iter iter;
struct device *dev;
if (!sif || !sif->subsys)
return;
subsys = sif->subsys;
mutex_lock(&subsys->p->mutex);
list_del_init(&sif->node);
if (sif->remove_dev) {
subsys_dev_iter_init(&iter, subsys, NULL, NULL);
while ((dev = subsys_dev_iter_next(&iter)))
sif->remove_dev(dev, sif);
subsys_dev_iter_exit(&iter);
}
mutex_unlock(&subsys->p->mutex);
bus_put(subsys);
}
EXPORT_SYMBOL_GPL(subsys_interface_unregister);
static void system_root_device_release(struct device *dev)
{
kfree(dev);
}
static int subsys_register(struct bus_type *subsys,
const struct attribute_group **groups,
struct kobject *parent_of_root)
{
struct device *dev;
int err;
err = bus_register(subsys);
if (err < 0)
return err;
dev = kzalloc(sizeof(struct device), GFP_KERNEL);
if (!dev) {
err = -ENOMEM;
goto err_dev;
}
err = dev_set_name(dev, "%s", subsys->name);
if (err < 0)
goto err_name;
dev->kobj.parent = parent_of_root;
dev->groups = groups;
dev->release = system_root_device_release;
err = device_register(dev);
if (err < 0)
goto err_dev_reg;
subsys->dev_root = dev;
return 0;
err_dev_reg:
put_device(dev);
dev = NULL;
err_name:
kfree(dev);
err_dev:
bus_unregister(subsys);
return err;
}
/**
* subsys_system_register - register a subsystem at /sys/devices/system/
* @subsys: system subsystem
* @groups: default attributes for the root device
*
* All 'system' subsystems have a /sys/devices/system/<name> root device
* with the name of the subsystem. The root device can carry subsystem-
* wide attributes. All registered devices are below this single root
* device and are named after the subsystem with a simple enumeration
* number appended. The registered devices are not explicitly named;
* only 'id' in the device needs to be set.
*
* Do not use this interface for anything new, it exists for compatibility
* with bad ideas only. New subsystems should use plain subsystems; and
* add the subsystem-wide attributes should be added to the subsystem
* directory itself and not some create fake root-device placed in
* /sys/devices/system/<name>.
*/
int subsys_system_register(struct bus_type *subsys,
const struct attribute_group **groups)
{
return subsys_register(subsys, groups, &system_kset->kobj);
}
EXPORT_SYMBOL_GPL(subsys_system_register);
/**
* subsys_virtual_register - register a subsystem at /sys/devices/virtual/
* @subsys: virtual subsystem
* @groups: default attributes for the root device
*
* All 'virtual' subsystems have a /sys/devices/system/<name> root device
* with the name of the subystem. The root device can carry subsystem-wide
* attributes. All registered devices are below this single root device.
* There's no restriction on device naming. This is for kernel software
* constructs which need sysfs interface.
*/
int subsys_virtual_register(struct bus_type *subsys,
const struct attribute_group **groups)
{
struct kobject *virtual_dir;
virtual_dir = virtual_device_parent(NULL);
if (!virtual_dir)
return -ENOMEM;
return subsys_register(subsys, groups, virtual_dir);
}
EXPORT_SYMBOL_GPL(subsys_virtual_register);
int __init buses_init(void)
{
bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL);
if (!bus_kset)
return -ENOMEM;
system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj);
if (!system_kset)
return -ENOMEM;
return 0;
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
*/
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
#include <linux/iocontext.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/blk-cgroup.h>
#include <linux/highmem.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"
struct bio_alloc_cache {
struct bio_list free_list;
unsigned int nr;
};
static struct biovec_slab {
int nr_vecs;
char *name;
struct kmem_cache *slab;
} bvec_slabs[] __read_mostly = {
{ .nr_vecs = 16, .name = "biovec-16" },
{ .nr_vecs = 64, .name = "biovec-64" },
{ .nr_vecs = 128, .name = "biovec-128" },
{ .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" },
};
static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
{
switch (nr_vecs) {
/* smaller bios use inline vecs */
case 5 ... 16:
return &bvec_slabs[0];
case 17 ... 64:
return &bvec_slabs[1];
case 65 ... 128:
return &bvec_slabs[2];
case 129 ... BIO_MAX_VECS:
return &bvec_slabs[3];
default:
BUG();
return NULL;
}
}
/*
* fs_bio_set is the bio_set containing bio and iovec memory pools used by
* IO code that does not need private memory pools.
*/
struct bio_set fs_bio_set;
EXPORT_SYMBOL(fs_bio_set);
/*
* Our slab pool management
*/
struct bio_slab {
struct kmem_cache *slab;
unsigned int slab_ref;
unsigned int slab_size;
char name[8];
};
static DEFINE_MUTEX(bio_slab_lock);
static DEFINE_XARRAY(bio_slabs);
static struct bio_slab *create_bio_slab(unsigned int size)
{
struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL);
if (!bslab)
return NULL;
snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
bslab->slab = kmem_cache_create(bslab->name, size,
ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL);
if (!bslab->slab)
goto fail_alloc_slab;
bslab->slab_ref = 1;
bslab->slab_size = size;
if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL)))
return bslab;
kmem_cache_destroy(bslab->slab);
fail_alloc_slab:
kfree(bslab);
return NULL;
}
static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
{
return bs->front_pad + sizeof(struct bio) + bs->back_pad;
}
static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
{
unsigned int size = bs_bio_slab_size(bs);
struct bio_slab *bslab;
mutex_lock(&bio_slab_lock);
bslab = xa_load(&bio_slabs, size);
if (bslab)
bslab->slab_ref++;
else
bslab = create_bio_slab(size);
mutex_unlock(&bio_slab_lock);
if (bslab)
return bslab->slab;
return NULL;
}
static void bio_put_slab(struct bio_set *bs)
{
struct bio_slab *bslab = NULL;
unsigned int slab_size = bs_bio_slab_size(bs);
mutex_lock(&bio_slab_lock);
bslab = xa_load(&bio_slabs, slab_size);
if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
goto out;
WARN_ON_ONCE(bslab->slab != bs->bio_slab);
WARN_ON(!bslab->slab_ref);
if (--bslab->slab_ref)
goto out;
xa_erase(&bio_slabs, slab_size);
kmem_cache_destroy(bslab->slab);
kfree(bslab);
out:
mutex_unlock(&bio_slab_lock);
}
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{
BIO_BUG_ON(nr_vecs > BIO_MAX_VECS); if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); else if (nr_vecs > BIO_INLINE_VECS) kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
}
/*
* Make the first allocation restricted and don't dump info on allocation
* failures, since we'll fall back to the mempool in case of failure.
*/
static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
{
return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
}
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask)
{
struct biovec_slab *bvs = biovec_slab(*nr_vecs);
if (WARN_ON_ONCE(!bvs))
return NULL;
/*
* Upgrade the nr_vecs request to take full advantage of the allocation.
* We also rely on this in the bvec_free path.
*/
*nr_vecs = bvs->nr_vecs;
/*
* Try a slab allocation first for all smaller allocations. If that
* fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
* The mempool is sized to handle up to BIO_MAX_VECS entries.
*/
if (*nr_vecs < BIO_MAX_VECS) {
struct bio_vec *bvl;
bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
return bvl;
*nr_vecs = BIO_MAX_VECS;
}
return mempool_alloc(pool, gfp_mask);
}
void bio_uninit(struct bio *bio)
{
#ifdef CONFIG_BLK_CGROUP
if (bio->bi_blkg) {
blkg_put(bio->bi_blkg);
bio->bi_blkg = NULL;
}
#endif
if (bio_integrity(bio))
bio_integrity_free(bio);
bio_crypt_free_ctx(bio);
}
EXPORT_SYMBOL(bio_uninit);
static void bio_free(struct bio *bio)
{
struct bio_set *bs = bio->bi_pool;
void *p;
bio_uninit(bio);
if (bs) {
bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
/*
* If we have front padding, adjust the bio pointer before freeing
*/
p = bio;
p -= bs->front_pad;
mempool_free(p, &bs->bio_pool);
} else {
/* Bio was allocated by bio_kmalloc() */
kfree(bio);
}
}
/*
* Users of this function have their own bio allocation. Subsequently,
* they must remember to pair any call to bio_init() with bio_uninit()
* when IO has completed, or when the bio is released.
*/
void bio_init(struct bio *bio, struct bio_vec *table,
unsigned short max_vecs)
{
bio->bi_next = NULL;
bio->bi_bdev = NULL;
bio->bi_opf = 0;
bio->bi_flags = 0;
bio->bi_ioprio = 0;
bio->bi_write_hint = 0;
bio->bi_status = 0;
bio->bi_iter.bi_sector = 0;
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_idx = 0;
bio->bi_iter.bi_bvec_done = 0;
bio->bi_end_io = NULL;
bio->bi_private = NULL;
#ifdef CONFIG_BLK_CGROUP
bio->bi_blkg = NULL;
bio->bi_issue.value = 0;
#ifdef CONFIG_BLK_CGROUP_IOCOST
bio->bi_iocost_cost = 0;
#endif
#endif
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
bio->bi_crypt_context = NULL;
#endif
#ifdef CONFIG_BLK_DEV_INTEGRITY
bio->bi_integrity = NULL;
#endif
bio->bi_vcnt = 0;
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
bio->bi_max_vecs = max_vecs;
bio->bi_io_vec = table;
bio->bi_pool = NULL;
}
EXPORT_SYMBOL(bio_init);
/**
* bio_reset - reinitialize a bio
* @bio: bio to reset
*
* Description:
* After calling bio_reset(), @bio will be in the same state as a freshly
* allocated bio returned bio bio_alloc_bioset() - the only fields that are
* preserved are the ones that are initialized by bio_alloc_bioset(). See
* comment in struct bio.
*/
void bio_reset(struct bio *bio)
{
bio_uninit(bio);
memset(bio, 0, BIO_RESET_BYTES);
atomic_set(&bio->__bi_remaining, 1);
}
EXPORT_SYMBOL(bio_reset);
static struct bio *__bio_chain_endio(struct bio *bio)
{
struct bio *parent = bio->bi_private; if (bio->bi_status && !parent->bi_status) parent->bi_status = bio->bi_status; bio_put(bio);
return parent;
}
static void bio_chain_endio(struct bio *bio)
{
bio_endio(__bio_chain_endio(bio));
}
/**
* bio_chain - chain bio completions
* @bio: the target bio
* @parent: the parent bio of @bio
*
* The caller won't have a bi_end_io called when @bio completes - instead,
* @parent's bi_end_io won't be called until both @parent and @bio have
* completed; the chained bio will also be freed when it completes.
*
* The caller must not set bi_private or bi_end_io in @bio.
*/
void bio_chain(struct bio *bio, struct bio *parent)
{
BUG_ON(bio->bi_private || bio->bi_end_io); bio->bi_private = parent;
bio->bi_end_io = bio_chain_endio;
bio_inc_remaining(parent);
}
EXPORT_SYMBOL(bio_chain);
static void bio_alloc_rescue(struct work_struct *work)
{
struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
struct bio *bio;
while (1) {
spin_lock(&bs->rescue_lock);
bio = bio_list_pop(&bs->rescue_list);
spin_unlock(&bs->rescue_lock);
if (!bio)
break;
submit_bio_noacct(bio);
}
}
static void punt_bios_to_rescuer(struct bio_set *bs)
{
struct bio_list punt, nopunt;
struct bio *bio;
if (WARN_ON_ONCE(!bs->rescue_workqueue))
return;
/*
* In order to guarantee forward progress we must punt only bios that
* were allocated from this bio_set; otherwise, if there was a bio on
* there for a stacking driver higher up in the stack, processing it
* could require allocating bios from this bio_set, and doing that from
* our own rescuer would be bad.
*
* Since bio lists are singly linked, pop them all instead of trying to
* remove from the middle of the list:
*/
bio_list_init(&punt);
bio_list_init(&nopunt);
while ((bio = bio_list_pop(¤t->bio_list[0])))
bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
current->bio_list[0] = nopunt;
bio_list_init(&nopunt);
while ((bio = bio_list_pop(¤t->bio_list[1])))
bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
current->bio_list[1] = nopunt;
spin_lock(&bs->rescue_lock);
bio_list_merge(&bs->rescue_list, &punt);
spin_unlock(&bs->rescue_lock);
queue_work(bs->rescue_workqueue, &bs->rescue_work);
}
/**
* bio_alloc_bioset - allocate a bio for I/O
* @gfp_mask: the GFP_* mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
* Allocate a bio from the mempools in @bs.
*
* If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to
* allocate a bio. This is due to the mempool guarantees. To make this work,
* callers must never allocate more than 1 bio at a time from the general pool.
* Callers that need to allocate more than 1 bio must always submit the
* previously allocated bio for IO before attempting to allocate a new one.
* Failure to do so can cause deadlocks under memory pressure.
*
* Note that when running under submit_bio_noacct() (i.e. any block driver),
* bios are not submitted until after you return - see the code in
* submit_bio_noacct() that converts recursion into iteration, to prevent
* stack overflows.
*
* This would normally mean allocating multiple bios under submit_bio_noacct()
* would be susceptible to deadlocks, but we have
* deadlock avoidance code that resubmits any blocked bios from a rescuer
* thread.
*
* However, we do not guarantee forward progress for allocations from other
* mempools. Doing multiple allocations from the same mempool under
* submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
* for per bio allocations.
*
* Returns: Pointer to new bio on success, NULL on failure.
*/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned short nr_iovecs,
struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
struct bio *bio;
void *p;
/* should not use nobvec bioset for nr_iovecs > 0 */
if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_iovecs > 0))
return NULL;
/*
* submit_bio_noacct() converts recursion to iteration; this means if
* we're running beneath it, any bios we allocate and submit will not be
* submitted (and thus freed) until after we return.
*
* This exposes us to a potential deadlock if we allocate multiple bios
* from the same bio_set() while running underneath submit_bio_noacct().
* If we were to allocate multiple bios (say a stacking block driver
* that was splitting bios), we would deadlock if we exhausted the
* mempool's reserve.
*
* We solve this, and guarantee forward progress, with a rescuer
* workqueue per bio_set. If we go to allocate and there are bios on
* current->bio_list, we first try the allocation without
* __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
* blocking to the rescuer workqueue before we retry with the original
* gfp_flags.
*/
if (current->bio_list &&
(!bio_list_empty(¤t->bio_list[0]) ||
!bio_list_empty(¤t->bio_list[1])) &&
bs->rescue_workqueue) gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(&bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
p = mempool_alloc(&bs->bio_pool, gfp_mask);
}
if (unlikely(!p))
return NULL;
bio = p + bs->front_pad;
if (nr_iovecs > BIO_INLINE_VECS) {
struct bio_vec *bvl = NULL;
bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
bvl = bvec_alloc(&bs->bvec_pool, &nr_iovecs, gfp_mask);
}
if (unlikely(!bvl))
goto err_free;
bio_init(bio, bvl, nr_iovecs); } else if (nr_iovecs) { bio_init(bio, bio->bi_inline_vecs, BIO_INLINE_VECS);
} else {
bio_init(bio, NULL, 0);
}
bio->bi_pool = bs; return bio;
err_free:
mempool_free(p, &bs->bio_pool);
return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);
/**
* bio_kmalloc - kmalloc a bio for I/O
* @gfp_mask: the GFP_* mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
*
* Use kmalloc to allocate and initialize a bio.
*
* Returns: Pointer to new bio on success, NULL on failure.
*/
struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs)
{
struct bio *bio;
if (nr_iovecs > UIO_MAXIOV)
return NULL;
bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
if (unlikely(!bio))
return NULL;
bio_init(bio, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs);
bio->bi_pool = NULL;
return bio;
}
EXPORT_SYMBOL(bio_kmalloc);
void zero_fill_bio(struct bio *bio)
{
struct bio_vec bv;
struct bvec_iter iter;
bio_for_each_segment(bv, bio, iter)
memzero_bvec(&bv);
}
EXPORT_SYMBOL(zero_fill_bio);
/**
* bio_truncate - truncate the bio to small size of @new_size
* @bio: the bio to be truncated
* @new_size: new size for truncating the bio
*
* Description:
* Truncate the bio to new size of @new_size. If bio_op(bio) is
* REQ_OP_READ, zero the truncated part. This function should only
* be used for handling corner cases, such as bio eod.
*/
void bio_truncate(struct bio *bio, unsigned new_size)
{
struct bio_vec bv;
struct bvec_iter iter;
unsigned int done = 0;
bool truncated = false;
if (new_size >= bio->bi_iter.bi_size)
return;
if (bio_op(bio) != REQ_OP_READ)
goto exit;
bio_for_each_segment(bv, bio, iter) {
if (done + bv.bv_len > new_size) {
unsigned offset;
if (!truncated) offset = new_size - done;
else
offset = 0;
zero_user(bv.bv_page, bv.bv_offset + offset,
bv.bv_len - offset);
truncated = true;
}
done += bv.bv_len;
}
exit:
/*
* Don't touch bvec table here and make it really immutable, since
* fs bio user has to retrieve all pages via bio_for_each_segment_all
* in its .end_bio() callback.
*
* It is enough to truncate bio by updating .bi_size since we can make
* correct bvec with the updated .bi_size for drivers.
*/
bio->bi_iter.bi_size = new_size;
}
/**
* guard_bio_eod - truncate a BIO to fit the block device
* @bio: bio to truncate
*
* This allows us to do IO even on the odd last sectors of a device, even if the
* block size is some multiple of the physical sector size.
*
* We'll just truncate the bio to the size of the device, and clear the end of
* the buffer head manually. Truly out-of-range accesses will turn into actual
* I/O errors, this only handles the "we need to be able to do I/O at the final
* sector" case.
*/
void guard_bio_eod(struct bio *bio)
{
sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
if (!maxsector)
return;
/*
* If the *whole* IO is past the end of the device,
* let it through, and the IO layer will turn it into
* an EIO.
*/
if (unlikely(bio->bi_iter.bi_sector >= maxsector))
return;
maxsector -= bio->bi_iter.bi_sector;
if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
return;
bio_truncate(bio, maxsector << 9);
}
#define ALLOC_CACHE_MAX 512
#define ALLOC_CACHE_SLACK 64
static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
unsigned int nr)
{
unsigned int i = 0;
struct bio *bio;
while ((bio = bio_list_pop(&cache->free_list)) != NULL) {
cache->nr--;
bio_free(bio);
if (++i == nr)
break;
}
}
static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
struct bio_set *bs;
bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead);
if (bs->cache) {
struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu);
bio_alloc_cache_prune(cache, -1U);
}
return 0;
}
static void bio_alloc_cache_destroy(struct bio_set *bs)
{
int cpu;
if (!bs->cache)
return;
cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
for_each_possible_cpu(cpu) {
struct bio_alloc_cache *cache;
cache = per_cpu_ptr(bs->cache, cpu);
bio_alloc_cache_prune(cache, -1U);
}
free_percpu(bs->cache);
}
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
*
* Description:
* Put a reference to a &struct bio, either one you have gotten with
* bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it.
**/
void bio_put(struct bio *bio)
{
if (unlikely(bio_flagged(bio, BIO_REFFED))) {
BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); if (!atomic_dec_and_test(&bio->__bi_cnt))
return;
}
if (bio_flagged(bio, BIO_PERCPU_CACHE)) {
struct bio_alloc_cache *cache;
bio_uninit(bio);
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
bio_list_add_head(&cache->free_list, bio);
if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); put_cpu();
} else {
bio_free(bio);
}
}
EXPORT_SYMBOL(bio_put);
/**
* __bio_clone_fast - clone a bio that shares the original bio's biovec
* @bio: destination bio
* @bio_src: bio to clone
*
* Clone a &bio. Caller will own the returned bio, but not
* the actual data it points to. Reference count of returned
* bio will be one.
*
* Caller must ensure that @bio_src is not freed before @bio.
*/
void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
{
WARN_ON_ONCE(bio->bi_pool && bio->bi_max_vecs);
/*
* most users will be overriding ->bi_bdev with a new target,
* so we don't set nor calculate new physical/hw segment counts here
*/
bio->bi_bdev = bio_src->bi_bdev;
bio_set_flag(bio, BIO_CLONED);
if (bio_flagged(bio_src, BIO_THROTTLED))
bio_set_flag(bio, BIO_THROTTLED);
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter = bio_src->bi_iter;
bio->bi_io_vec = bio_src->bi_io_vec;
bio_clone_blkg_association(bio, bio_src);
blkcg_bio_issue_init(bio);
}
EXPORT_SYMBOL(__bio_clone_fast);
/**
* bio_clone_fast - clone a bio that shares the original bio's biovec
* @bio: bio to clone
* @gfp_mask: allocation priority
* @bs: bio_set to allocate from
*
* Like __bio_clone_fast, only also allocates the returned bio
*/
struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
{
struct bio *b;
b = bio_alloc_bioset(gfp_mask, 0, bs);
if (!b)
return NULL;
__bio_clone_fast(b, bio);
if (bio_crypt_clone(b, bio, gfp_mask) < 0)
goto err_put;
if (bio_integrity(bio) &&
bio_integrity_clone(b, bio, gfp_mask) < 0)
goto err_put;
return b;
err_put:
bio_put(b);
return NULL;
}
EXPORT_SYMBOL(bio_clone_fast);
const char *bio_devname(struct bio *bio, char *buf)
{
return bdevname(bio->bi_bdev, buf);
}
EXPORT_SYMBOL(bio_devname);
static inline bool page_is_mergeable(const struct bio_vec *bv,
struct page *page, unsigned int len, unsigned int off,
bool *same_page)
{
size_t bv_end = bv->bv_offset + bv->bv_len;
phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
phys_addr_t page_addr = page_to_phys(page);
if (vec_end_addr + 1 != page_addr + off)
return false;
if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
return false;
*same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
if (*same_page)
return true;
return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
}
/*
* Try to merge a page into a segment, while obeying the hardware segment
* size limit. This is not for normal read/write bios, but for passthrough
* or Zone Append operations that we can't split.
*/
static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
struct page *page, unsigned len,
unsigned offset, bool *same_page)
{
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
unsigned long mask = queue_segment_boundary(q);
phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
if ((addr1 | mask) != (addr2 | mask))
return false;
if (bv->bv_len + len > queue_max_segment_size(q))
return false;
return __bio_try_merge_page(bio, page, len, offset, same_page);
}
/**
* bio_add_hw_page - attempt to add a page to a bio with hw constraints
* @q: the target queue
* @bio: destination bio
* @page: page to add
* @len: vec entry length
* @offset: vec entry offset
* @max_sectors: maximum number of sectors that can be added
* @same_page: return if the segment has been merged inside the same page
*
* Add a page to a bio while respecting the hardware max_sectors, max_segment
* and gap limitations.
*/
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page)
{
struct bio_vec *bvec;
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
return 0;
if (bio->bi_vcnt > 0) {
if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page))
return len;
/*
* If the queue doesn't support SG gaps and adding this segment
* would create a gap, disallow it.
*/
bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (bvec_gap_to_prev(q, bvec, offset))
return 0;
}
if (bio_full(bio, len))
return 0;
if (bio->bi_vcnt >= queue_max_segments(q))
return 0;
bvec = &bio->bi_io_vec[bio->bi_vcnt];
bvec->bv_page = page;
bvec->bv_len = len;
bvec->bv_offset = offset;
bio->bi_vcnt++;
bio->bi_iter.bi_size += len;
return len;
}
/**
* bio_add_pc_page - attempt to add page to passthrough bio
* @q: the target queue
* @bio: destination bio
* @page: page to add
* @len: vec entry length
* @offset: vec entry offset
*
* Attempt to add a page to the bio_vec maplist. This can fail for a
* number of reasons, such as the bio being full or target block device
* limitations. The target block device must allow bio's up to PAGE_SIZE,
* so it is always possible to add a single page to an empty bio.
*
* This should only be used by passthrough bios.
*/
int bio_add_pc_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset)
{
bool same_page = false;
return bio_add_hw_page(q, bio, page, len, offset,
queue_max_hw_sectors(q), &same_page);
}
EXPORT_SYMBOL(bio_add_pc_page);
/**
* bio_add_zone_append_page - attempt to add page to zone-append bio
* @bio: destination bio
* @page: page to add
* @len: vec entry length
* @offset: vec entry offset
*
* Attempt to add a page to the bio_vec maplist of a bio that will be submitted
* for a zone-append request. This can fail for a number of reasons, such as the
* bio being full or the target block device is not a zoned block device or
* other limitations of the target block device. The target block device must
* allow bio's up to PAGE_SIZE, so it is always possible to add a single page
* to an empty bio.
*
* Returns: number of bytes added to the bio, or 0 in case of a failure.
*/
int bio_add_zone_append_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
bool same_page = false;
if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
return 0;
if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
return 0;
return bio_add_hw_page(q, bio, page, len, offset,
queue_max_zone_append_sectors(q), &same_page);
}
EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
/**
* __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add
* @off: offset of the data relative to @page
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
*
* Return %true on success or %false on failure.
*/
bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool *same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
if (bio->bi_iter.bi_size > UINT_MAX - len) { *same_page = false; return false;
}
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
}
return false;
}
EXPORT_SYMBOL_GPL(__bio_try_merge_page);
/**
* __bio_add_page - add page(s) to a bio in a new segment
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add, may cross pages
* @off: offset of the data relative to @page, may cross pages
*
* Add the data at @page + @off to @bio as a new bvec. The caller must ensure
* that @bio has space for another bvec.
*/
void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off)
{
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); bv->bv_page = page;
bv->bv_offset = off;
bv->bv_len = len;
bio->bi_iter.bi_size += len;
bio->bi_vcnt++;
if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
bio_set_flag(bio, BIO_WORKINGSET);
}
EXPORT_SYMBOL_GPL(__bio_add_page);
/**
* bio_add_page - attempt to add page(s) to bio
* @bio: destination bio
* @page: start page to add
* @len: vec entry length, may cross pages
* @offset: vec entry offset relative to @page, may cross pages
*
* Attempt to add page(s) to the bio_vec maplist. This will only fail
* if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
*/
int bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
bool same_page = false;
if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
if (bio_full(bio, len))
return 0;
__bio_add_page(bio, page, len, offset);
}
return len;
}
EXPORT_SYMBOL(bio_add_page);
void bio_release_pages(struct bio *bio, bool mark_dirty)
{
struct bvec_iter_all iter_all;
struct bio_vec *bvec;
if (bio_flagged(bio, BIO_NO_PAGE_REF))
return;
bio_for_each_segment_all(bvec, bio, iter_all) {
if (mark_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page);
}
}
EXPORT_SYMBOL_GPL(bio_release_pages);
static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
WARN_ON_ONCE(bio->bi_max_vecs); bio->bi_vcnt = iter->nr_segs;
bio->bi_io_vec = (struct bio_vec *)iter->bvec;
bio->bi_iter.bi_bvec_done = iter->iov_offset;
bio->bi_iter.bi_size = iter->count;
bio_set_flag(bio, BIO_NO_PAGE_REF);
bio_set_flag(bio, BIO_CLONED);
}
static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
__bio_iov_bvec_set(bio, iter);
iov_iter_advance(iter, iter->count);
return 0;
}
static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct iov_iter i = *iter;
iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9);
__bio_iov_bvec_set(bio, &i);
iov_iter_advance(iter, i.count);
return 0;
}
static void bio_put_pages(struct page **pages, size_t size, size_t off)
{
size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
for (i = 0; i < nr; i++)
put_page(pages[i]);
}
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
/**
* __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
* @bio: bio to add pages to
* @iter: iov iterator describing the region to be mapped
*
* Pins pages from *iter and appends them to @bio's bvec array. The
* pages will have to be released using put_page() when done.
* For multi-segment *iter, this function only adds pages from the
* next non-empty segment of the iov iterator.
*/
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
bool same_page = false;
ssize_t size, left;
unsigned len, i;
size_t offset;
/*
* Move page array up in the allocated memory for the bio vecs as far as
* possible so that we can start filling biovecs from the beginning
* without overwriting the temporary page array.
*/
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
if (unlikely(size <= 0)) return size ? size : -EFAULT;
for (left = size, i = 0; left > 0; left -= len, i++) {
struct page *page = pages[i];
len = min_t(size_t, PAGE_SIZE - offset, left);
if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
if (same_page)
put_page(page);
} else {
if (WARN_ON_ONCE(bio_full(bio, len))) {
bio_put_pages(pages + i, left, offset);
return -EINVAL;
}
__bio_add_page(bio, page, len, offset);
}
offset = 0;
}
iov_iter_advance(iter, size);
return 0;
}
static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
{
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
ssize_t size, left;
unsigned len, i;
size_t offset;
int ret = 0;
if (WARN_ON_ONCE(!max_append_sectors))
return 0;
/*
* Move page array up in the allocated memory for the bio vecs as far as
* possible so that we can start filling biovecs from the beginning
* without overwriting the temporary page array.
*/
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
if (unlikely(size <= 0)) return size ? size : -EFAULT;
for (left = size, i = 0; left > 0; left -= len, i++) {
struct page *page = pages[i];
bool same_page = false;
len = min_t(size_t, PAGE_SIZE - offset, left);
if (bio_add_hw_page(q, bio, page, len, offset,
max_append_sectors, &same_page) != len) {
bio_put_pages(pages + i, left, offset);
ret = -EINVAL;
break;
}
if (same_page)
put_page(page);
offset = 0;
}
iov_iter_advance(iter, size - left);
return ret;
}
/**
* bio_iov_iter_get_pages - add user or kernel pages to a bio
* @bio: bio to add pages to
* @iter: iov iterator describing the region to be added
*
* This takes either an iterator pointing to user memory, or one pointing to
* kernel pages (BVEC iterator). If we're adding user pages, we pin them and
* map them into the kernel. On IO completion, the caller should put those
* pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided
* bvecs rather than copying them. Hence anyone issuing kiocb based IO needs
* to ensure the bvecs and pages stay referenced until the submitted I/O is
* completed by a call to ->ki_complete() or returns with an error other than
* -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF
* on IO completion. If it isn't, then pages should be released.
*
* The function tries, but does not guarantee, to pin as many pages as
* fit into the bio, or are requested in @iter, whatever is smaller. If
* MM encounters an error pinning the requested pages, it stops. Error
* is returned only if 0 pages could be pinned.
*
* It's intended for direct IO, so doesn't do PSI tracking, the caller is
* responsible for setting BIO_WORKINGSET if necessary.
*/
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
int ret = 0;
if (iov_iter_is_bvec(iter)) { if (bio_op(bio) == REQ_OP_ZONE_APPEND)
return bio_iov_bvec_set_append(bio, iter);
return bio_iov_bvec_set(bio, iter);
}
do {
if (bio_op(bio) == REQ_OP_ZONE_APPEND)
ret = __bio_iov_append_get_pages(bio, iter);
else
ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
/* don't account direct I/O as memory stall */
bio_clear_flag(bio, BIO_WORKINGSET);
return bio->bi_vcnt ? 0 : ret;
}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
static void submit_bio_wait_endio(struct bio *bio)
{
complete(bio->bi_private);
}
/**
* submit_bio_wait - submit a bio, and wait until it completes
* @bio: The &struct bio which describes the I/O
*
* Simple wrapper around submit_bio(). Returns 0 on success, or the error from
* bio_endio() on failure.
*
* WARNING: Unlike to how submit_bio() is usually used, this function does not
* result in bio reference to be consumed. The caller must drop the reference
* on his own.
*/
int submit_bio_wait(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
unsigned long hang_check;
bio->bi_private = &done;
bio->bi_end_io = submit_bio_wait_endio;
bio->bi_opf |= REQ_SYNC;
submit_bio(bio);
/* Prevent hang_check timer from firing at us during very long I/O */
hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)
while (!wait_for_completion_io_timeout(&done,
hang_check * (HZ/2)))
;
else
wait_for_completion_io(&done);
return blk_status_to_errno(bio->bi_status);
}
EXPORT_SYMBOL(submit_bio_wait);
/**
* bio_advance - increment/complete a bio by some number of bytes
* @bio: bio to advance
* @bytes: number of bytes to complete
*
* This updates bi_sector, bi_size and bi_idx; if the number of bytes to
* complete doesn't align with a bvec boundary, then bv_len and bv_offset will
* be updated on the last bvec as well.
*
* @bio will then represent the remaining, uncompleted portion of the io.
*/
void bio_advance(struct bio *bio, unsigned bytes)
{
if (bio_integrity(bio))
bio_integrity_advance(bio, bytes);
bio_crypt_advance(bio, bytes);
bio_advance_iter(bio, &bio->bi_iter, bytes);
}
EXPORT_SYMBOL(bio_advance);
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
{
while (src_iter->bi_size && dst_iter->bi_size) {
struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
void *src_buf;
src_buf = bvec_kmap_local(&src_bv);
memcpy_to_bvec(&dst_bv, src_buf);
kunmap_local(src_buf);
bio_advance_iter_single(src, src_iter, bytes);
bio_advance_iter_single(dst, dst_iter, bytes);
}
}
EXPORT_SYMBOL(bio_copy_data_iter);
/**
* bio_copy_data - copy contents of data buffers from one bio to another
* @src: source bio
* @dst: destination bio
*
* Stops when it reaches the end of either @src or @dst - that is, copies
* min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
*/
void bio_copy_data(struct bio *dst, struct bio *src)
{
struct bvec_iter src_iter = src->bi_iter;
struct bvec_iter dst_iter = dst->bi_iter;
bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
}
EXPORT_SYMBOL(bio_copy_data);
void bio_free_pages(struct bio *bio)
{
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, iter_all)
__free_page(bvec->bv_page);
}
EXPORT_SYMBOL(bio_free_pages);
/*
* bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
* for performing direct-IO in BIOs.
*
* The problem is that we cannot run set_page_dirty() from interrupt context
* because the required locks are not interrupt-safe. So what we can do is to
* mark the pages dirty _before_ performing IO. And in interrupt context,
* check that the pages are still dirty. If so, fine. If not, redirty them
* in process context.
*
* We special-case compound pages here: normally this means reads into hugetlb
* pages. The logic in here doesn't really work right for compound pages
* because the VM does not uniformly chase down the head page in all cases.
* But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
* handle them at all. So we skip compound pages here at an early stage.
*
* Note that this code is very hard to test under normal circumstances because
* direct-io pins the pages with get_user_pages(). This makes
* is_page_cache_freeable return false, and the VM will not clean the pages.
* But other code (eg, flusher threads) could clean the pages if they are mapped
* pagecache.
*
* Simply disabling the call to bio_set_pages_dirty() is a good way to test the
* deferred bio dirtying paths.
*/
/*
* bio_set_pages_dirty() will mark all the bio's pages as dirty.
*/
void bio_set_pages_dirty(struct bio *bio)
{
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, iter_all) {
if (!PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
}
}
/*
* bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
* If they are, then fine. If, however, some pages are clean then they must
* have been written out during the direct-IO read. So we take another ref on
* the BIO and re-dirty the pages in process context.
*
* It is expected that bio_check_pages_dirty() will wholly own the BIO from
* here on. It will run one put_page() against each page and will run one
* bio_put() against the BIO.
*/
static void bio_dirty_fn(struct work_struct *work);
static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;
/*
* This runs in process context
*/
static void bio_dirty_fn(struct work_struct *work)
{
struct bio *bio, *next;
spin_lock_irq(&bio_dirty_lock);
next = bio_dirty_list;
bio_dirty_list = NULL;
spin_unlock_irq(&bio_dirty_lock);
while ((bio = next) != NULL) {
next = bio->bi_private;
bio_release_pages(bio, true);
bio_put(bio);
}
}
void bio_check_pages_dirty(struct bio *bio)
{
struct bio_vec *bvec;
unsigned long flags;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, iter_all) {
if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
goto defer;
}
bio_release_pages(bio, false);
bio_put(bio);
return;
defer:
spin_lock_irqsave(&bio_dirty_lock, flags);
bio->bi_private = bio_dirty_list;
bio_dirty_list = bio;
spin_unlock_irqrestore(&bio_dirty_lock, flags);
schedule_work(&bio_dirty_work);
}
static inline bool bio_remaining_done(struct bio *bio)
{
/*
* If we're not chaining, then ->__bi_remaining is always 1 and
* we always end io on the first invocation.
*/
if (!bio_flagged(bio, BIO_CHAIN))
return true;
BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
if (atomic_dec_and_test(&bio->__bi_remaining)) {
bio_clear_flag(bio, BIO_CHAIN);
return true;
}
return false;
}
/**
* bio_endio - end I/O on a bio
* @bio: bio
*
* Description:
* bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
* way to end I/O on a bio. No one should call bi_end_io() directly on a
* bio unless they own it and thus know that it has an end_io function.
*
* bio_endio() can be called several times on a bio that has been chained
* using bio_chain(). The ->bi_end_io() function will only be called the
* last time.
**/
void bio_endio(struct bio *bio)
{
again:
if (!bio_remaining_done(bio))
return;
if (!bio_integrity_endio(bio))
return;
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED)) rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
}
/*
* Need to have a real endio function for chained bios, otherwise
* various corner cases will break (like stacking block devices that
* save/restore bi_end_io) - however, we want to avoid unbounded
* recursion and blowing the stack. Tail call optimization would
* handle this, but compiling with frame pointers also disables
* gcc's sibling call optimization.
*/
if (bio->bi_end_io == bio_chain_endio) {
bio = __bio_chain_endio(bio);
goto again;
}
blk_throtl_bio_endio(bio);
/* release cgroup info */
bio_uninit(bio);
if (bio->bi_end_io) bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);
/**
* bio_split - split a bio
* @bio: bio to split
* @sectors: number of sectors to split from the front of @bio
* @gfp: gfp mask
* @bs: bio set to allocate from
*
* Allocates and returns a new bio which represents @sectors from the start of
* @bio, and updates @bio to represent the remaining sectors.
*
* Unless this is a discard request the newly allocated bio will point
* to @bio's bi_io_vec. It is the caller's responsibility to ensure that
* neither @bio nor @bs are freed before the split bio.
*/
struct bio *bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs)
{
struct bio *split;
BUG_ON(sectors <= 0); BUG_ON(sectors >= bio_sectors(bio));
/* Zone append commands cannot be split */
if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) return NULL; split = bio_clone_fast(bio, gfp, bs);
if (!split)
return NULL;
split->bi_iter.bi_size = sectors << 9;
if (bio_integrity(split))
bio_integrity_trim(split);
bio_advance(bio, split->bi_iter.bi_size);
if (bio_flagged(bio, BIO_TRACE_COMPLETION))
bio_set_flag(split, BIO_TRACE_COMPLETION);
return split;
}
EXPORT_SYMBOL(bio_split);
/**
* bio_trim - trim a bio
* @bio: bio to trim
* @offset: number of sectors to trim from the front of @bio
* @size: size we want to trim @bio to, in sectors
*
* This function is typically used for bios that are cloned and submitted
* to the underlying device in parts.
*/
void bio_trim(struct bio *bio, sector_t offset, sector_t size)
{
if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
offset + size > bio_sectors(bio)))
return;
size <<= 9; if (offset == 0 && size == bio->bi_iter.bi_size)
return;
bio_advance(bio, offset << 9); bio->bi_iter.bi_size = size;
if (bio_integrity(bio))
bio_integrity_trim(bio);
}
EXPORT_SYMBOL_GPL(bio_trim);
/*
* create memory pools for biovec's in a bio_set.
* use the global biovec slabs created for general use.
*/
int biovec_init_pool(mempool_t *pool, int pool_entries)
{
struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
return mempool_init_slab_pool(pool, pool_entries, bp->slab);
}
/*
* bioset_exit - exit a bioset initialized with bioset_init()
*
* May be called on a zeroed but uninitialized bioset (i.e. allocated with
* kzalloc()).
*/
void bioset_exit(struct bio_set *bs)
{
bio_alloc_cache_destroy(bs);
if (bs->rescue_workqueue)
destroy_workqueue(bs->rescue_workqueue);
bs->rescue_workqueue = NULL;
mempool_exit(&bs->bio_pool);
mempool_exit(&bs->bvec_pool);
bioset_integrity_free(bs);
if (bs->bio_slab)
bio_put_slab(bs);
bs->bio_slab = NULL;
}
EXPORT_SYMBOL(bioset_exit);
/**
* bioset_init - Initialize a bio_set
* @bs: pool to initialize
* @pool_size: Number of bio and bio_vecs to cache in the mempool
* @front_pad: Number of bytes to allocate in front of the returned bio
* @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS
* and %BIOSET_NEED_RESCUER
*
* Description:
* Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
* to ask for a number of bytes to be allocated in front of the bio.
* Front pad allocation is useful for embedding the bio inside
* another structure, to avoid allocating extra data to go with the bio.
* Note that the bio must be embedded at the END of that structure always,
* or things will break badly.
* If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
* for allocating iovecs. This pool is not needed e.g. for bio_clone_fast().
* If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to
* dispatch queued requests when the mempool runs out of space.
*
*/
int bioset_init(struct bio_set *bs,
unsigned int pool_size,
unsigned int front_pad,
int flags)
{
bs->front_pad = front_pad;
if (flags & BIOSET_NEED_BVECS)
bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
else
bs->back_pad = 0;
spin_lock_init(&bs->rescue_lock);
bio_list_init(&bs->rescue_list);
INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
bs->bio_slab = bio_find_or_create_slab(bs);
if (!bs->bio_slab)
return -ENOMEM;
if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
goto bad;
if ((flags & BIOSET_NEED_BVECS) &&
biovec_init_pool(&bs->bvec_pool, pool_size))
goto bad;
if (flags & BIOSET_NEED_RESCUER) {
bs->rescue_workqueue = alloc_workqueue("bioset",
WQ_MEM_RECLAIM, 0);
if (!bs->rescue_workqueue)
goto bad;
}
if (flags & BIOSET_PERCPU_CACHE) {
bs->cache = alloc_percpu(struct bio_alloc_cache);
if (!bs->cache)
goto bad;
cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
}
return 0;
bad:
bioset_exit(bs);
return -ENOMEM;
}
EXPORT_SYMBOL(bioset_init);
/*
* Initialize and setup a new bio_set, based on the settings from
* another bio_set.
*/
int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
{
int flags;
flags = 0;
if (src->bvec_pool.min_nr)
flags |= BIOSET_NEED_BVECS;
if (src->rescue_workqueue)
flags |= BIOSET_NEED_RESCUER;
return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags);
}
EXPORT_SYMBOL(bioset_init_from_src);
/**
* bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb
* @kiocb: kiocb describing the IO
* @nr_vecs: number of iovecs to pre-allocate
* @bs: bio_set to allocate from
*
* Description:
* Like @bio_alloc_bioset, but pass in the kiocb. The kiocb is only
* used to check if we should dip into the per-cpu bio_set allocation
* cache. The allocation uses GFP_KERNEL internally. On return, the
* bio is marked BIO_PERCPU_CACHEABLE, and the final put of the bio
* MUST be done from process context, not hard/soft IRQ.
*
*/
struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
struct bio_set *bs)
{
struct bio_alloc_cache *cache;
struct bio *bio;
if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS) return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); cache = per_cpu_ptr(bs->cache, get_cpu());
bio = bio_list_pop(&cache->free_list);
if (bio) {
cache->nr--;
put_cpu();
bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
bio->bi_pool = bs;
bio_set_flag(bio, BIO_PERCPU_CACHE);
return bio;
}
put_cpu();
bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
bio_set_flag(bio, BIO_PERCPU_CACHE);
return bio;
}
EXPORT_SYMBOL_GPL(bio_alloc_kiocb);
static int __init init_bio(void)
{
int i;
bio_integrity_init();
for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
struct biovec_slab *bvs = bvec_slabs + i;
bvs->slab = kmem_cache_create(bvs->name,
bvs->nr_vecs * sizeof(struct bio_vec), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}
cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
bio_cpu_dead);
if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
panic("bio: can't allocate bios\n");
if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
panic("bio: can't create integrity pool\n");
return 0;
}
subsys_initcall(init_bio);
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Percpu refcounts:
* (C) 2012 Google, Inc.
* Author: Kent Overstreet <koverstreet@google.com>
*
* This implements a refcount with similar semantics to atomic_t - atomic_inc(),
* atomic_dec_and_test() - but percpu.
*
* There's one important difference between percpu refs and normal atomic_t
* refcounts; you have to keep track of your initial refcount, and then when you
* start shutting down you call percpu_ref_kill() _before_ dropping the initial
* refcount.
*
* The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
* than an atomic_t - this is because of the way shutdown works, see
* percpu_ref_kill()/PERCPU_COUNT_BIAS.
*
* Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
* refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
* puts the ref back in single atomic_t mode, collecting the per cpu refs and
* issuing the appropriate barriers, and then marks the ref as shutting down so
* that percpu_ref_put() will check for the ref hitting 0. After it returns,
* it's safe to drop the initial ref.
*
* USAGE:
*
* See fs/aio.c for some example usage; it's used there for struct kioctx, which
* is created when userspaces calls io_setup(), and destroyed when userspace
* calls io_destroy() or the process exits.
*
* In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
* removes the kioctx from the proccess's table of kioctxs and kills percpu_ref.
* After that, there can't be any new users of the kioctx (from lookup_ioctx())
* and it's then safe to drop the initial ref with percpu_ref_put().
*
* Note that the free path, free_ioctx(), needs to go through explicit call_rcu()
* to synchronize with RCU protected lookup_ioctx(). percpu_ref operations don't
* imply RCU grace periods of any kind and if a user wants to combine percpu_ref
* with RCU protection, it must be done explicitly.
*
* Code that does a two stage shutdown like this often needs some kind of
* explicit synchronization to ensure the initial refcount can only be dropped
* once - percpu_ref_kill() does this for you, it returns true once and false if
* someone else already called it. The aio code uses it this way, but it's not
* necessary if the code has some other mechanism to synchronize teardown.
* around.
*/
#ifndef _LINUX_PERCPU_REFCOUNT_H
#define _LINUX_PERCPU_REFCOUNT_H
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/gfp.h>
struct percpu_ref;
typedef void (percpu_ref_func_t)(struct percpu_ref *);
/* flags set in the lower bits of percpu_ref->percpu_count_ptr */
enum {
__PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */
__PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */
__PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD,
__PERCPU_REF_FLAG_BITS = 2,
};
/* @flags for percpu_ref_init() */
enum {
/*
* Start w/ ref == 1 in atomic mode. Can be switched to percpu
* operation using percpu_ref_switch_to_percpu(). If initialized
* with this flag, the ref will stay in atomic mode until
* percpu_ref_switch_to_percpu() is invoked on it.
* Implies ALLOW_REINIT.
*/
PERCPU_REF_INIT_ATOMIC = 1 << 0,
/*
* Start dead w/ ref == 0 in atomic mode. Must be revived with
* percpu_ref_reinit() before used. Implies INIT_ATOMIC and
* ALLOW_REINIT.
*/
PERCPU_REF_INIT_DEAD = 1 << 1,
/*
* Allow switching from atomic mode to percpu mode.
*/
PERCPU_REF_ALLOW_REINIT = 1 << 2,
};
struct percpu_ref_data {
atomic_long_t count;
percpu_ref_func_t *release;
percpu_ref_func_t *confirm_switch;
bool force_atomic:1;
bool allow_reinit:1;
struct rcu_head rcu;
struct percpu_ref *ref;
};
struct percpu_ref {
/*
* The low bit of the pointer indicates whether the ref is in percpu
* mode; if set, then get/put will manipulate the atomic_t.
*/
unsigned long percpu_count_ptr;
/*
* 'percpu_ref' is often embedded into user structure, and only
* 'percpu_count_ptr' is required in fast path, move other fields
* into 'percpu_ref_data', so we can reduce memory footprint in
* fast path.
*/
struct percpu_ref_data *data;
};
int __must_check percpu_ref_init(struct percpu_ref *ref,
percpu_ref_func_t *release, unsigned int flags,
gfp_t gfp);
void percpu_ref_exit(struct percpu_ref *ref);
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
percpu_ref_func_t *confirm_switch);
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
percpu_ref_func_t *confirm_kill);
void percpu_ref_resurrect(struct percpu_ref *ref);
void percpu_ref_reinit(struct percpu_ref *ref);
bool percpu_ref_is_zero(struct percpu_ref *ref);
/**
* percpu_ref_kill - drop the initial ref
* @ref: percpu_ref to kill
*
* Must be used to drop the initial ref on a percpu refcount; must be called
* precisely once before shutdown.
*
* Switches @ref into atomic mode before gathering up the percpu counters
* and dropping the initial ref.
*
* There are no implied RCU grace periods between kill and release.
*/
static inline void percpu_ref_kill(struct percpu_ref *ref)
{
percpu_ref_kill_and_confirm(ref, NULL);
}
/*
* Internal helper. Don't use outside percpu-refcount proper. The
* function doesn't return the pointer and let the caller test it for NULL
* because doing so forces the compiler to generate two conditional
* branches as it can't assume that @ref->percpu_count is not NULL.
*/
static inline bool __ref_is_percpu(struct percpu_ref *ref,
unsigned long __percpu **percpu_countp)
{
unsigned long percpu_ptr;
/*
* The value of @ref->percpu_count_ptr is tested for
* !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then
* used as a pointer. If the compiler generates a separate fetch
* when using it as a pointer, __PERCPU_REF_ATOMIC may be set in
* between contaminating the pointer value, meaning that
* READ_ONCE() is required when fetching it.
*
* The dependency ordering from the READ_ONCE() pairs
* with smp_store_release() in __percpu_ref_switch_to_percpu().
*/
percpu_ptr = READ_ONCE(ref->percpu_count_ptr);
/*
* Theoretically, the following could test just ATOMIC; however,
* then we'd have to mask off DEAD separately as DEAD may be
* visible without ATOMIC if we race with percpu_ref_kill(). DEAD
* implies ATOMIC anyway. Test them together.
*/
if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD))
return false;
*percpu_countp = (unsigned long __percpu *)percpu_ptr;
return true;
}
/**
* percpu_ref_get_many - increment a percpu refcount
* @ref: percpu_ref to get
* @nr: number of references to get
*
* Analogous to atomic_long_add().
*
* This function is safe to call as long as @ref is between init and exit.
*/
static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
{
unsigned long __percpu *percpu_count;
rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count))
this_cpu_add(*percpu_count, nr);
else
atomic_long_add(nr, &ref->data->count);
rcu_read_unlock();
}
/**
* percpu_ref_get - increment a percpu refcount
* @ref: percpu_ref to get
*
* Analogous to atomic_long_inc().
*
* This function is safe to call as long as @ref is between init and exit.
*/
static inline void percpu_ref_get(struct percpu_ref *ref)
{
percpu_ref_get_many(ref, 1);
}
/**
* percpu_ref_tryget_many - try to increment a percpu refcount
* @ref: percpu_ref to try-get
* @nr: number of references to get
*
* Increment a percpu refcount by @nr unless its count already reached zero.
* Returns %true on success; %false on failure.
*
* This function is safe to call as long as @ref is between init and exit.
*/
static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
unsigned long nr)
{
unsigned long __percpu *percpu_count;
bool ret;
rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count)) {
this_cpu_add(*percpu_count, nr);
ret = true;
} else {
ret = atomic_long_add_unless(&ref->data->count, nr, 0);
}
rcu_read_unlock();
return ret;
}
/**
* percpu_ref_tryget - try to increment a percpu refcount
* @ref: percpu_ref to try-get
*
* Increment a percpu refcount unless its count already reached zero.
* Returns %true on success; %false on failure.
*
* This function is safe to call as long as @ref is between init and exit.
*/
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
return percpu_ref_tryget_many(ref, 1);
}
/**
* percpu_ref_tryget_live - try to increment a live percpu refcount
* @ref: percpu_ref to try-get
*
* Increment a percpu refcount unless it has already been killed. Returns
* %true on success; %false on failure.
*
* Completion of percpu_ref_kill() in itself doesn't guarantee that this
* function will fail. For such guarantee, percpu_ref_kill_and_confirm()
* should be used. After the confirm_kill callback is invoked, it's
* guaranteed that no new reference will be given out by
* percpu_ref_tryget_live().
*
* This function is safe to call as long as @ref is between init and exit.
*/
static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
{
unsigned long __percpu *percpu_count;
bool ret = false;
rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count)) {
this_cpu_inc(*percpu_count);
ret = true;
} else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { ret = atomic_long_inc_not_zero(&ref->data->count);
}
rcu_read_unlock();
return ret;
}
/**
* percpu_ref_put_many - decrement a percpu refcount
* @ref: percpu_ref to put
* @nr: number of references to put
*
* Decrement the refcount, and if 0, call the release function (which was passed
* to percpu_ref_init())
*
* This function is safe to call as long as @ref is between init and exit.
*/
static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
{
unsigned long __percpu *percpu_count;
rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count))
this_cpu_sub(*percpu_count, nr);
else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count))) ref->data->release(ref);
rcu_read_unlock();
}
/**
* percpu_ref_put - decrement a percpu refcount
* @ref: percpu_ref to put
*
* Decrement the refcount, and if 0, call the release function (which was passed
* to percpu_ref_init())
*
* This function is safe to call as long as @ref is between init and exit.
*/
static inline void percpu_ref_put(struct percpu_ref *ref)
{
percpu_ref_put_many(ref, 1);
}
/**
* percpu_ref_is_dying - test whether a percpu refcount is dying or dead
* @ref: percpu_ref to test
*
* Returns %true if @ref is dying or dead.
*
* This function is safe to call as long as @ref is between init and exit
* and the caller is responsible for synchronizing against state changes.
*/
static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
{
return ref->percpu_count_ptr & __PERCPU_REF_DEAD;
}
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_SCSI_HOST_H
#define _SCSI_SCSI_HOST_H
#include <linux/device.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
#include <linux/blk-mq.h>
#include <scsi/scsi.h>
struct block_device;
struct completion;
struct module;
struct scsi_cmnd;
struct scsi_device;
struct scsi_host_cmd_pool;
struct scsi_target;
struct Scsi_Host;
struct scsi_transport_template;
#define SG_ALL SG_CHUNK_SIZE
#define MODE_UNKNOWN 0x00
#define MODE_INITIATOR 0x01
#define MODE_TARGET 0x02
struct scsi_host_template {
/*
* Put fields referenced in IO submission path together in
* same cacheline
*/
/*
* Additional per-command data allocated for the driver.
*/
unsigned int cmd_size;
/*
* The queuecommand function is used to queue up a scsi
* command block to the LLDD. When the driver finished
* processing the command the done callback is invoked.
*
* If queuecommand returns 0, then the driver has accepted the
* command. It must also push it to the HBA if the scsi_cmnd
* flag SCMD_LAST is set, or if the driver does not implement
* commit_rqs. The done() function must be called on the command
* when the driver has finished with it. (you may call done on the
* command before queuecommand returns, but in this case you
* *must* return 0 from queuecommand).
*
* Queuecommand may also reject the command, in which case it may
* not touch the command and must not call done() for it.
*
* There are two possible rejection returns:
*
* SCSI_MLQUEUE_DEVICE_BUSY: Block this device temporarily, but
* allow commands to other devices serviced by this host.
*
* SCSI_MLQUEUE_HOST_BUSY: Block all devices served by this
* host temporarily.
*
* For compatibility, any other non-zero return is treated the
* same as SCSI_MLQUEUE_HOST_BUSY.
*
* NOTE: "temporarily" means either until the next command for#
* this device/host completes, or a period of time determined by
* I/O pressure in the system if there are no other outstanding
* commands.
*
* STATUS: REQUIRED
*/
int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);
/*
* The commit_rqs function is used to trigger a hardware
* doorbell after some requests have been queued with
* queuecommand, when an error is encountered before sending
* the request with SCMD_LAST set.
*
* STATUS: OPTIONAL
*/
void (*commit_rqs)(struct Scsi_Host *, u16);
struct module *module;
const char *name;
/*
* The info function will return whatever useful information the
* developer sees fit. If not provided, then the name field will
* be used instead.
*
* Status: OPTIONAL
*/
const char *(*info)(struct Scsi_Host *);
/*
* Ioctl interface
*
* Status: OPTIONAL
*/
int (*ioctl)(struct scsi_device *dev, unsigned int cmd,
void __user *arg);
#ifdef CONFIG_COMPAT
/*
* Compat handler. Handle 32bit ABI.
* When unknown ioctl is passed return -ENOIOCTLCMD.
*
* Status: OPTIONAL
*/
int (*compat_ioctl)(struct scsi_device *dev, unsigned int cmd,
void __user *arg);
#endif
int (*init_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd);
int (*exit_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd);
/*
* This is an error handling strategy routine. You don't need to
* define one of these if you don't want to - there is a default
* routine that is present that should work in most cases. For those
* driver authors that have the inclination and ability to write their
* own strategy routine, this is where it is specified. Note - the
* strategy routine is *ALWAYS* run in the context of the kernel eh
* thread. Thus you are guaranteed to *NOT* be in an interrupt
* handler when you execute this, and you are also guaranteed to
* *NOT* have any other commands being queued while you are in the
* strategy routine. When you return from this function, operations
* return to normal.
*
* See scsi_error.c scsi_unjam_host for additional comments about
* what this function should and should not be attempting to do.
*
* Status: REQUIRED (at least one of them)
*/
int (* eh_abort_handler)(struct scsi_cmnd *);
int (* eh_device_reset_handler)(struct scsi_cmnd *);
int (* eh_target_reset_handler)(struct scsi_cmnd *);
int (* eh_bus_reset_handler)(struct scsi_cmnd *);
int (* eh_host_reset_handler)(struct scsi_cmnd *);
/*
* Before the mid layer attempts to scan for a new device where none
* currently exists, it will call this entry in your driver. Should
* your driver need to allocate any structs or perform any other init
* items in order to send commands to a currently unused target/lun
* combo, then this is where you can perform those allocations. This
* is specifically so that drivers won't have to perform any kind of
* "is this a new device" checks in their queuecommand routine,
* thereby making the hot path a bit quicker.
*
* Return values: 0 on success, non-0 on failure
*
* Deallocation: If we didn't find any devices at this ID, you will
* get an immediate call to slave_destroy(). If we find something
* here then you will get a call to slave_configure(), then the
* device will be used for however long it is kept around, then when
* the device is removed from the system (or * possibly at reboot
* time), you will then get a call to slave_destroy(). This is
* assuming you implement slave_configure and slave_destroy.
* However, if you allocate memory and hang it off the device struct,
* then you must implement the slave_destroy() routine at a minimum
* in order to avoid leaking memory
* each time a device is tore down.
*
* Status: OPTIONAL
*/
int (* slave_alloc)(struct scsi_device *);
/*
* Once the device has responded to an INQUIRY and we know the
* device is online, we call into the low level driver with the
* struct scsi_device *. If the low level device driver implements
* this function, it *must* perform the task of setting the queue
* depth on the device. All other tasks are optional and depend
* on what the driver supports and various implementation details.
*
* Things currently recommended to be handled at this time include:
*
* 1. Setting the device queue depth. Proper setting of this is
* described in the comments for scsi_change_queue_depth.
* 2. Determining if the device supports the various synchronous
* negotiation protocols. The device struct will already have
* responded to INQUIRY and the results of the standard items
* will have been shoved into the various device flag bits, eg.
* device->sdtr will be true if the device supports SDTR messages.
* 3. Allocating command structs that the device will need.
* 4. Setting the default timeout on this device (if needed).
* 5. Anything else the low level driver might want to do on a device
* specific setup basis...
* 6. Return 0 on success, non-0 on error. The device will be marked
* as offline on error so that no access will occur. If you return
* non-0, your slave_destroy routine will never get called for this
* device, so don't leave any loose memory hanging around, clean
* up after yourself before returning non-0
*
* Status: OPTIONAL
*/
int (* slave_configure)(struct scsi_device *);
/*
* Immediately prior to deallocating the device and after all activity
* has ceased the mid layer calls this point so that the low level
* driver may completely detach itself from the scsi device and vice
* versa. The low level driver is responsible for freeing any memory
* it allocated in the slave_alloc or slave_configure calls.
*
* Status: OPTIONAL
*/
void (* slave_destroy)(struct scsi_device *);
/*
* Before the mid layer attempts to scan for a new device attached
* to a target where no target currently exists, it will call this
* entry in your driver. Should your driver need to allocate any
* structs or perform any other init items in order to send commands
* to a currently unused target, then this is where you can perform
* those allocations.
*
* Return values: 0 on success, non-0 on failure
*
* Status: OPTIONAL
*/
int (* target_alloc)(struct scsi_target *);
/*
* Immediately prior to deallocating the target structure, and
* after all activity to attached scsi devices has ceased, the
* midlayer calls this point so that the driver may deallocate
* and terminate any references to the target.
*
* Status: OPTIONAL
*/
void (* target_destroy)(struct scsi_target *);
/*
* If a host has the ability to discover targets on its own instead
* of scanning the entire bus, it can fill in this function and
* call scsi_scan_host(). This function will be called periodically
* until it returns 1 with the scsi_host and the elapsed time of
* the scan in jiffies.
*
* Status: OPTIONAL
*/
int (* scan_finished)(struct Scsi_Host *, unsigned long);
/*
* If the host wants to be called before the scan starts, but
* after the midlayer has set up ready for the scan, it can fill
* in this function.
*
* Status: OPTIONAL
*/
void (* scan_start)(struct Scsi_Host *);
/*
* Fill in this function to allow the queue depth of this host
* to be changeable (on a per device basis). Returns either
* the current queue depth setting (may be different from what
* was passed in) or an error. An error should only be
* returned if the requested depth is legal but the driver was
* unable to set it. If the requested depth is illegal, the
* driver should set and return the closest legal queue depth.
*
* Status: OPTIONAL
*/
int (* change_queue_depth)(struct scsi_device *, int);
/*
* This functions lets the driver expose the queue mapping
* to the block layer.
*
* Status: OPTIONAL
*/
int (* map_queues)(struct Scsi_Host *shost);
/*
* SCSI interface of blk_poll - poll for IO completions.
* Only applicable if SCSI LLD exposes multiple h/w queues.
*
* Return value: Number of completed entries found.
*
* Status: OPTIONAL
*/
int (* mq_poll)(struct Scsi_Host *shost, unsigned int queue_num);
/*
* Check if scatterlists need to be padded for DMA draining.
*
* Status: OPTIONAL
*/
bool (* dma_need_drain)(struct request *rq);
/*
* This function determines the BIOS parameters for a given
* harddisk. These tend to be numbers that are made up by
* the host adapter. Parameters:
* size, device, list (heads, sectors, cylinders)
*
* Status: OPTIONAL
*/
int (* bios_param)(struct scsi_device *, struct block_device *,
sector_t, int []);
/*
* This function is called when one or more partitions on the
* device reach beyond the end of the device.
*
* Status: OPTIONAL
*/
void (*unlock_native_capacity)(struct scsi_device *);
/*
* Can be used to export driver statistics and other infos to the
* world outside the kernel ie. userspace and it also provides an
* interface to feed the driver with information.
*
* Status: OBSOLETE
*/
int (*show_info)(struct seq_file *, struct Scsi_Host *);
int (*write_info)(struct Scsi_Host *, char *, int);
/*
* This is an optional routine that allows the transport to become
* involved when a scsi io timer fires. The return value tells the
* timer routine how to finish the io timeout handling.
*
* Status: OPTIONAL
*/
enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);
/*
* Optional routine that allows the transport to decide if a cmd
* is retryable. Return true if the transport is in a state the
* cmd should be retried on.
*/
bool (*eh_should_retry_cmd)(struct scsi_cmnd *scmd);
/* This is an optional routine that allows transport to initiate
* LLD adapter or firmware reset using sysfs attribute.
*
* Return values: 0 on success, -ve value on failure.
*
* Status: OPTIONAL
*/
int (*host_reset)(struct Scsi_Host *shost, int reset_type);
#define SCSI_ADAPTER_RESET 1
#define SCSI_FIRMWARE_RESET 2
/*
* Name of proc directory
*/
const char *proc_name;
/*
* Used to store the procfs directory if a driver implements the
* show_info method.
*/
struct proc_dir_entry *proc_dir;
/*
* This determines if we will use a non-interrupt driven
* or an interrupt driven scheme. It is set to the maximum number
* of simultaneous commands a single hw queue in HBA will accept.
*/
int can_queue;
/*
* In many instances, especially where disconnect / reconnect are
* supported, our host also has an ID on the SCSI bus. If this is
* the case, then it must be reserved. Please set this_id to -1 if
* your setup is in single initiator mode, and the host lacks an
* ID.
*/
int this_id;
/*
* This determines the degree to which the host adapter is capable
* of scatter-gather.
*/
unsigned short sg_tablesize;
unsigned short sg_prot_tablesize;
/*
* Set this if the host adapter has limitations beside segment count.
*/
unsigned int max_sectors;
/*
* Maximum size in bytes of a single segment.
*/
unsigned int max_segment_size;
/*
* DMA scatter gather segment boundary limit. A segment crossing this
* boundary will be split in two.
*/
unsigned long dma_boundary;
unsigned long virt_boundary_mask;
/*
* This specifies "machine infinity" for host templates which don't
* limit the transfer size. Note this limit represents an absolute
* maximum, and may be over the transfer limits allowed for
* individual devices (e.g. 256 for SCSI-1).
*/
#define SCSI_DEFAULT_MAX_SECTORS 1024
/*
* True if this host adapter can make good use of linked commands.
* This will allow more than one command to be queued to a given
* unit on a given host. Set this to the maximum number of command
* blocks to be provided for each device. Set this to 1 for one
* command block per lun, 2 for two, etc. Do not set this to 0.
* You should make sure that the host adapter will do the right thing
* before you try setting this above 1.
*/
short cmd_per_lun;
/*
* present contains counter indicating how many boards of this
* type were found when we did the scan.
*/
unsigned char present;
/* If use block layer to manage tags, this is tag allocation policy */
int tag_alloc_policy;
/*
* Track QUEUE_FULL events and reduce queue depth on demand.
*/
unsigned track_queue_depth:1;
/*
* This specifies the mode that a LLD supports.
*/
unsigned supported_mode:2;
/*
* True for emulated SCSI host adapters (e.g. ATAPI).
*/
unsigned emulated:1;
/*
* True if the low-level driver performs its own reset-settle delays.
*/
unsigned skip_settle_delay:1;
/* True if the controller does not support WRITE SAME */
unsigned no_write_same:1;
/* True if the host uses host-wide tagspace */
unsigned host_tagset:1;
/*
* Countdown for host blocking with no commands outstanding.
*/
unsigned int max_host_blocked;
/*
* Default value for the blocking. If the queue is empty,
* host_blocked counts down in the request_fn until it restarts
* host operations as zero is reached.
*
* FIXME: This should probably be a value in the template
*/
#define SCSI_DEFAULT_HOST_BLOCKED 7
/*
* Pointer to the sysfs class properties for this host, NULL terminated.
*/
struct device_attribute **shost_attrs;
/*
* Pointer to the SCSI device properties for this host, NULL terminated.
*/
struct device_attribute **sdev_attrs;
/*
* Pointer to the SCSI device attribute groups for this host,
* NULL terminated.
*/
const struct attribute_group **sdev_groups;
/*
* Vendor Identifier associated with the host
*
* Note: When specifying vendor_id, be sure to read the
* Vendor Type and ID formatting requirements specified in
* scsi_netlink.h
*/
u64 vendor_id;
struct scsi_host_cmd_pool *cmd_pool;
/* Delay for runtime autosuspend */
int rpm_autosuspend_delay;
};
/*
* Temporary #define for host lock push down. Can be removed when all
* drivers have been updated to take advantage of unlocked
* queuecommand.
*
*/
#define DEF_SCSI_QCMD(func_name) \
int func_name(struct Scsi_Host *shost, struct scsi_cmnd *cmd) \
{ \
unsigned long irq_flags; \
int rc; \
spin_lock_irqsave(shost->host_lock, irq_flags); \
rc = func_name##_lck (cmd, cmd->scsi_done); \
spin_unlock_irqrestore(shost->host_lock, irq_flags); \
return rc; \
}
/*
* shost state: If you alter this, you also need to alter scsi_sysfs.c
* (for the ascii descriptions) and the state model enforcer:
* scsi_host_set_state()
*/
enum scsi_host_state {
SHOST_CREATED = 1,
SHOST_RUNNING,
SHOST_CANCEL,
SHOST_DEL,
SHOST_RECOVERY,
SHOST_CANCEL_RECOVERY,
SHOST_DEL_RECOVERY,
};
struct Scsi_Host {
/*
* __devices is protected by the host_lock, but you should
* usually use scsi_device_lookup / shost_for_each_device
* to access it and don't care about locking yourself.
* In the rare case of being in irq context you can use
* their __ prefixed variants with the lock held. NEVER
* access this list directly from a driver.
*/
struct list_head __devices;
struct list_head __targets;
struct list_head starved_list;
spinlock_t default_lock;
spinlock_t *host_lock;
struct mutex scan_mutex;/* serialize scanning activity */
struct list_head eh_abort_list;
struct list_head eh_cmd_q;
struct task_struct * ehandler; /* Error recovery thread. */
struct completion * eh_action; /* Wait for specific actions on the
host. */
wait_queue_head_t host_wait;
struct scsi_host_template *hostt;
struct scsi_transport_template *transportt;
/* Area to keep a shared tag map */
struct blk_mq_tag_set tag_set;
atomic_t host_blocked;
unsigned int host_failed; /* commands that failed.
protected by host_lock */
unsigned int host_eh_scheduled; /* EH scheduled without command */
unsigned int host_no; /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. */
/* next two fields are used to bound the time spent in error handling */
int eh_deadline;
unsigned long last_reset;
/*
* These three parameters can be used to allow for wide scsi,
* and for host adapters that support multiple busses
* The last two should be set to 1 more than the actual max id
* or lun (e.g. 8 for SCSI parallel systems).
*/
unsigned int max_channel;
unsigned int max_id;
u64 max_lun;
/*
* This is a unique identifier that must be assigned so that we
* have some way of identifying each detected host adapter properly
* and uniquely. For hosts that do not support more than one card
* in the system at one time, this does not need to be set. It is
* initialized to 0 in scsi_register.
*/
unsigned int unique_id;
/*
* The maximum length of SCSI commands that this host can accept.
* Probably 12 for most host adapters, but could be 16 for others.
* or 260 if the driver supports variable length cdbs.
* For drivers that don't set this field, a value of 12 is
* assumed.
*/
unsigned short max_cmd_len;
int this_id;
int can_queue;
short cmd_per_lun;
short unsigned int sg_tablesize;
short unsigned int sg_prot_tablesize;
unsigned int max_sectors;
unsigned int max_segment_size;
unsigned long dma_boundary;
unsigned long virt_boundary_mask;
/*
* In scsi-mq mode, the number of hardware queues supported by the LLD.
*
* Note: it is assumed that each hardware queue has a queue depth of
* can_queue. In other words, the total queue depth per host
* is nr_hw_queues * can_queue. However, for when host_tagset is set,
* the total queue depth is can_queue.
*/
unsigned nr_hw_queues;
unsigned nr_maps;
unsigned active_mode:2;
/*
* Host has requested that no further requests come through for the
* time being.
*/
unsigned host_self_blocked:1;
/*
* Host uses correct SCSI ordering not PC ordering. The bit is
* set for the minority of drivers whose authors actually read
* the spec ;).
*/
unsigned reverse_ordering:1;
/* Task mgmt function in progress */
unsigned tmf_in_progress:1;
/* Asynchronous scan in progress */
unsigned async_scan:1;
/* Don't resume host in EH */
unsigned eh_noresume:1;
/* The controller does not support WRITE SAME */
unsigned no_write_same:1;
/* True if the host uses host-wide tagspace */
unsigned host_tagset:1;
/* Host responded with short (<36 bytes) INQUIRY result */
unsigned short_inquiry:1;
/* The transport requires the LUN bits NOT to be stored in CDB[1] */
unsigned no_scsi2_lun_in_cdb:1;
/*
* Optional work queue to be utilized by the transport
*/
char work_q_name[20];
struct workqueue_struct *work_q;
/*
* Task management function work queue
*/
struct workqueue_struct *tmf_work_q;
/*
* Value host_blocked counts down from
*/
unsigned int max_host_blocked;
/* Protection Information */
unsigned int prot_capabilities;
unsigned char prot_guard_type;
/* legacy crap */
unsigned long base;
unsigned long io_port;
unsigned char n_io_port;
unsigned char dma_channel;
unsigned int irq;
enum scsi_host_state shost_state;
/* ldm bits */
struct device shost_gendev, shost_dev;
/*
* Points to the transport data (if any) which is allocated
* separately
*/
void *shost_data;
/*
* Points to the physical bus device we'd use to do DMA
* Needed just in case we have virtual hosts.
*/
struct device *dma_dev;
/*
* We should ensure that this is aligned, both for better performance
* and also because some compilers (m68k) don't automatically force
* alignment to a long boundary.
*/
unsigned long hostdata[] /* Used for storage of host specific stuff */
__attribute__ ((aligned (sizeof(unsigned long))));
};
#define class_to_shost(d) \
container_of(d, struct Scsi_Host, shost_dev)
#define shost_printk(prefix, shost, fmt, a...) \
dev_printk(prefix, &(shost)->shost_gendev, fmt, ##a)
static inline void *shost_priv(struct Scsi_Host *shost)
{
return (void *)shost->hostdata;
}
int scsi_is_host_device(const struct device *);
static inline struct Scsi_Host *dev_to_shost(struct device *dev)
{
while (!scsi_is_host_device(dev)) {
if (!dev->parent)
return NULL;
dev = dev->parent;
}
return container_of(dev, struct Scsi_Host, shost_gendev);
}
static inline int scsi_host_in_recovery(struct Scsi_Host *shost)
{
return shost->shost_state == SHOST_RECOVERY ||
shost->shost_state == SHOST_CANCEL_RECOVERY ||
shost->shost_state == SHOST_DEL_RECOVERY ||
shost->tmf_in_progress;
}
extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *);
extern void scsi_flush_work(struct Scsi_Host *);
extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int);
extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *,
struct device *,
struct device *);
extern void scsi_scan_host(struct Scsi_Host *);
extern void scsi_rescan_device(struct device *);
extern void scsi_remove_host(struct Scsi_Host *);
extern struct Scsi_Host *scsi_host_get(struct Scsi_Host *);
extern int scsi_host_busy(struct Scsi_Host *shost);
extern void scsi_host_put(struct Scsi_Host *t);
extern struct Scsi_Host *scsi_host_lookup(unsigned short);
extern const char *scsi_host_state_name(enum scsi_host_state);
extern void scsi_host_complete_all_commands(struct Scsi_Host *shost,
enum scsi_host_status status);
static inline int __must_check scsi_add_host(struct Scsi_Host *host,
struct device *dev)
{
return scsi_add_host_with_dma(host, dev, dev);
}
static inline struct device *scsi_get_device(struct Scsi_Host *shost)
{
return shost->shost_gendev.parent;
}
/**
* scsi_host_scan_allowed - Is scanning of this host allowed
* @shost: Pointer to Scsi_Host.
**/
static inline int scsi_host_scan_allowed(struct Scsi_Host *shost)
{
return shost->shost_state == SHOST_RUNNING ||
shost->shost_state == SHOST_RECOVERY;
}
extern void scsi_unblock_requests(struct Scsi_Host *);
extern void scsi_block_requests(struct Scsi_Host *);
extern int scsi_host_block(struct Scsi_Host *shost);
extern int scsi_host_unblock(struct Scsi_Host *shost, int new_state);
void scsi_host_busy_iter(struct Scsi_Host *,
bool (*fn)(struct scsi_cmnd *, void *, bool), void *priv);
struct class_container;
/*
* These two functions are used to allocate and free a pseudo device
* which will connect to the host adapter itself rather than any
* physical device. You must deallocate when you are done with the
* thing. This physical pseudo-device isn't real and won't be available
* from any high-level drivers.
*/
extern void scsi_free_host_dev(struct scsi_device *);
extern struct scsi_device *scsi_get_host_dev(struct Scsi_Host *);
/*
* DIF defines the exchange of protection information between
* initiator and SBC block device.
*
* DIX defines the exchange of protection information between OS and
* initiator.
*/
enum scsi_host_prot_capabilities {
SHOST_DIF_TYPE1_PROTECTION = 1 << 0, /* T10 DIF Type 1 */
SHOST_DIF_TYPE2_PROTECTION = 1 << 1, /* T10 DIF Type 2 */
SHOST_DIF_TYPE3_PROTECTION = 1 << 2, /* T10 DIF Type 3 */
SHOST_DIX_TYPE0_PROTECTION = 1 << 3, /* DIX between OS and HBA only */
SHOST_DIX_TYPE1_PROTECTION = 1 << 4, /* DIX with DIF Type 1 */
SHOST_DIX_TYPE2_PROTECTION = 1 << 5, /* DIX with DIF Type 2 */
SHOST_DIX_TYPE3_PROTECTION = 1 << 6, /* DIX with DIF Type 3 */
};
/*
* SCSI hosts which support the Data Integrity Extensions must
* indicate their capabilities by setting the prot_capabilities using
* this call.
*/
static inline void scsi_host_set_prot(struct Scsi_Host *shost, unsigned int mask)
{
shost->prot_capabilities = mask;
}
static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost)
{
return shost->prot_capabilities;
}
static inline int scsi_host_prot_dma(struct Scsi_Host *shost)
{
return shost->prot_capabilities >= SHOST_DIX_TYPE0_PROTECTION;
}
static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type)
{
static unsigned char cap[] = { 0,
SHOST_DIF_TYPE1_PROTECTION,
SHOST_DIF_TYPE2_PROTECTION,
SHOST_DIF_TYPE3_PROTECTION };
if (target_type >= ARRAY_SIZE(cap))
return 0;
return shost->prot_capabilities & cap[target_type] ? target_type : 0;
}
static inline unsigned int scsi_host_dix_capable(struct Scsi_Host *shost, unsigned int target_type)
{
#if defined(CONFIG_BLK_DEV_INTEGRITY)
static unsigned char cap[] = { SHOST_DIX_TYPE0_PROTECTION,
SHOST_DIX_TYPE1_PROTECTION,
SHOST_DIX_TYPE2_PROTECTION,
SHOST_DIX_TYPE3_PROTECTION };
if (target_type >= ARRAY_SIZE(cap))
return 0;
return shost->prot_capabilities & cap[target_type];
#endif
return 0;
}
/*
* All DIX-capable initiators must support the T10-mandated CRC
* checksum. Controllers can optionally implement the IP checksum
* scheme which has much lower impact on system performance. Note
* that the main rationale for the checksum is to match integrity
* metadata with data. Detecting bit errors are a job for ECC memory
* and buses.
*/
enum scsi_host_guard_type {
SHOST_DIX_GUARD_CRC = 1 << 0,
SHOST_DIX_GUARD_IP = 1 << 1,
};
static inline void scsi_host_set_guard(struct Scsi_Host *shost, unsigned char type)
{
shost->prot_guard_type = type;
}
static inline unsigned char scsi_host_get_guard(struct Scsi_Host *shost)
{
return shost->prot_guard_type;
}
extern int scsi_host_set_state(struct Scsi_Host *, enum scsi_host_state);
#endif /* _SCSI_SCSI_HOST_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_WAIT_H
#define _LINUX_WAIT_H
/*
* Linux wait queue related types and methods
*/
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/spinlock.h>
#include <asm/current.h>
#include <uapi/linux/wait.h>
typedef struct wait_queue_entry wait_queue_entry_t;
typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
/* wait_queue_entry::flags */
#define WQ_FLAG_EXCLUSIVE 0x01
#define WQ_FLAG_WOKEN 0x02
#define WQ_FLAG_BOOKMARK 0x04
#define WQ_FLAG_CUSTOM 0x08
#define WQ_FLAG_DONE 0x10
#define WQ_FLAG_PRIORITY 0x20
/*
* A single wait-queue entry structure:
*/
struct wait_queue_entry {
unsigned int flags;
void *private;
wait_queue_func_t func;
struct list_head entry;
};
struct wait_queue_head {
spinlock_t lock;
struct list_head head;
};
typedef struct wait_queue_head wait_queue_head_t;
struct task_struct;
/*
* Macros for declaration and initialisaton of the datatypes
*/
#define __WAITQUEUE_INITIALIZER(name, tsk) { \
.private = tsk, \
.func = default_wake_function, \
.entry = { NULL, NULL } }
#define DECLARE_WAITQUEUE(name, tsk) \
struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk)
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
.head = LIST_HEAD_INIT(name.head) }
#define DECLARE_WAIT_QUEUE_HEAD(name) \
struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *);
#define init_waitqueue_head(wq_head) \
do { \
static struct lock_class_key __key; \
\
__init_waitqueue_head((wq_head), #wq_head, &__key); \
} while (0)
#ifdef CONFIG_LOCKDEP
# define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
({ init_waitqueue_head(&name); name; })
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \
struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name)
#else
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
#endif
static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p)
{
wq_entry->flags = 0;
wq_entry->private = p;
wq_entry->func = default_wake_function;
}
static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
wq_entry->flags = 0;
wq_entry->private = NULL;
wq_entry->func = func;
}
/**
* waitqueue_active -- locklessly test for waiters on the queue
* @wq_head: the waitqueue to test for waiters
*
* returns true if the wait list is not empty
*
* NOTE: this function is lockless and requires care, incorrect usage _will_
* lead to sporadic and non-obvious failure.
*
* Use either while holding wait_queue_head::lock or when used for wakeups
* with an extra smp_mb() like::
*
* CPU0 - waker CPU1 - waiter
*
* for (;;) {
* @cond = true; prepare_to_wait(&wq_head, &wait, state);
* smp_mb(); // smp_mb() from set_current_state()
* if (waitqueue_active(wq_head)) if (@cond)
* wake_up(wq_head); break;
* schedule();
* }
* finish_wait(&wq_head, &wait);
*
* Because without the explicit smp_mb() it's possible for the
* waitqueue_active() load to get hoisted over the @cond store such that we'll
* observe an empty wait list while the waiter might not observe @cond.
*
* Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
* which (when the lock is uncontended) are of roughly equal cost.
*/
static inline int waitqueue_active(struct wait_queue_head *wq_head)
{
return !list_empty(&wq_head->head);
}
/**
* wq_has_single_sleeper - check if there is only one sleeper
* @wq_head: wait queue head
*
* Returns true of wq_head has only one sleeper on the list.
*
* Please refer to the comment for waitqueue_active.
*/
static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head)
{
return list_is_singular(&wq_head->head);
}
/**
* wq_has_sleeper - check if there are any waiting processes
* @wq_head: wait queue head
*
* Returns true if wq_head has waiting processes
*
* Please refer to the comment for waitqueue_active.
*/
static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
{
/*
* We need to be sure we are in sync with the
* add_wait_queue modifications to the wait queue.
*
* This memory barrier should be paired with one on the
* waiting side.
*/
smp_mb();
return waitqueue_active(wq_head);
}
extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
struct list_head *head = &wq_head->head;
struct wait_queue_entry *wq;
list_for_each_entry(wq, &wq_head->head, entry) { if (!(wq->flags & WQ_FLAG_PRIORITY))
break;
head = &wq->entry;
}
list_add(&wq_entry->entry, head);
}
/*
* Used for wake-one threads:
*/
static inline void
__add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue(wq_head, wq_entry);
}
static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
list_add_tail(&wq_entry->entry, &wq_head->head);
}
static inline void
__add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_entry_tail(wq_head, wq_entry);
}
static inline void
__remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
list_del(&wq_entry->entry);
}
void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
unsigned int mode, void *key, wait_queue_entry_t *bookmark);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
void __wake_up_pollfree(struct wait_queue_head *wq_head);
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL)
#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1)
#define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0)
#define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
#define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
#define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE)
/*
* Wakeup macros to be used to report events to the targets.
*/
#define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m))
#define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))
#define wake_up_poll(x, m) \
__wake_up(x, TASK_NORMAL, 1, poll_to_key(m))
#define wake_up_locked_poll(x, m) \
__wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m))
#define wake_up_interruptible_poll(x, m) \
__wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m))
#define wake_up_interruptible_sync_poll(x, m) \
__wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
#define wake_up_interruptible_sync_poll_locked(x, m) \
__wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
/**
* wake_up_pollfree - signal that a polled waitqueue is going away
* @wq_head: the wait queue head
*
* In the very rare cases where a ->poll() implementation uses a waitqueue whose
* lifetime is tied to a task rather than to the 'struct file' being polled,
* this function must be called before the waitqueue is freed so that
* non-blocking polls (e.g. epoll) are notified that the queue is going away.
*
* The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
* an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
*/
static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
{
/*
* For performance reasons, we don't always take the queue lock here.
* Therefore, we might race with someone removing the last entry from
* the queue, and proceed while they still hold the queue lock.
* However, rcu_read_lock() is required to be held in such cases, so we
* can safely proceed with an RCU-delayed free.
*/
if (waitqueue_active(wq_head))
__wake_up_pollfree(wq_head);
}
#define ___wait_cond_timeout(condition) \
({ \
bool __cond = (condition); \
if (__cond && !__ret) \
__ret = 1; \
__cond || !__ret; \
})
#define ___wait_is_interruptible(state) \
(!__builtin_constant_p(state) || \
state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \
extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);
/*
* The below macro ___wait_event() has an explicit shadow of the __ret
* variable when used from the wait_event_*() macros.
*
* This is so that both can use the ___wait_cond_timeout() construct
* to wrap the condition.
*
* The type inconsistency of the wait_event_*() __ret variable is also
* on purpose; we use long where we can return timeout values and int
* otherwise.
*/
#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \
({ \
__label__ __out; \
struct wait_queue_entry __wq_entry; \
long __ret = ret; /* explicit shadow */ \
\
init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \
for (;;) { \
long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
\
if (condition) \
break; \
\
if (___wait_is_interruptible(state) && __int) { \
__ret = __int; \
goto __out; \
} \
\
cmd; \
} \
finish_wait(&wq_head, &__wq_entry); \
__out: __ret; \
})
#define __wait_event(wq_head, condition) \
(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
schedule())
/**
* wait_event - sleep until a condition gets true
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true. The @condition is checked each time
* the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*/
#define wait_event(wq_head, condition) \
do { \
might_sleep(); \
if (condition) \
break; \
__wait_event(wq_head, condition); \
} while (0)
#define __io_wait_event(wq_head, condition) \
(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
io_schedule())
/*
* io_wait_event() -- like wait_event() but with io_schedule()
*/
#define io_wait_event(wq_head, condition) \
do { \
might_sleep(); \
if (condition) \
break; \
__io_wait_event(wq_head, condition); \
} while (0)
#define __wait_event_freezable(wq_head, condition) \
___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \
freezable_schedule())
/**
* wait_event_freezable - sleep (or freeze) until a condition gets true
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute
* to system load) until the @condition evaluates to true. The
* @condition is checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*/
#define wait_event_freezable(wq_head, condition) \
({ \
int __ret = 0; \
might_sleep(); \
if (!(condition)) \
__ret = __wait_event_freezable(wq_head, condition); \
__ret; \
})
#define __wait_event_timeout(wq_head, condition, timeout) \
___wait_event(wq_head, ___wait_cond_timeout(condition), \
TASK_UNINTERRUPTIBLE, 0, timeout, \
__ret = schedule_timeout(__ret))
/**
* wait_event_timeout - sleep until a condition gets true or a timeout elapses
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout, in jiffies
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true. The @condition is checked each time
* the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* Returns:
* 0 if the @condition evaluated to %false after the @timeout elapsed,
* 1 if the @condition evaluated to %true after the @timeout elapsed,
* or the remaining jiffies (at least 1) if the @condition evaluated
* to %true before the @timeout elapsed.
*/
#define wait_event_timeout(wq_head, condition, timeout) \
({ \
long __ret = timeout; \
might_sleep(); \
if (!___wait_cond_timeout(condition)) \
__ret = __wait_event_timeout(wq_head, condition, timeout); \
__ret; \
})
#define __wait_event_freezable_timeout(wq_head, condition, timeout) \
___wait_event(wq_head, ___wait_cond_timeout(condition), \
TASK_INTERRUPTIBLE, 0, timeout, \
__ret = freezable_schedule_timeout(__ret))
/*
* like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
* increasing load and is freezable.
*/
#define wait_event_freezable_timeout(wq_head, condition, timeout) \
({ \
long __ret = timeout; \
might_sleep(); \
if (!___wait_cond_timeout(condition)) \
__ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \
__ret; \
})
#define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \
(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0, \
cmd1; schedule(); cmd2)
/*
* Just like wait_event_cmd(), except it sets exclusive flag
*/
#define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \
do { \
if (condition) \
break; \
__wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2); \
} while (0)
#define __wait_event_cmd(wq_head, condition, cmd1, cmd2) \
(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
cmd1; schedule(); cmd2)
/**
* wait_event_cmd - sleep until a condition gets true
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @cmd1: the command will be executed before sleep
* @cmd2: the command will be executed after sleep
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true. The @condition is checked each time
* the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*/
#define wait_event_cmd(wq_head, condition, cmd1, cmd2) \
do { \
if (condition) \
break; \
__wait_event_cmd(wq_head, condition, cmd1, cmd2); \
} while (0)
#define __wait_event_interruptible(wq_head, condition) \
___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \
schedule())
/**
* wait_event_interruptible - sleep until a condition gets true
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
* The @condition is checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
#define wait_event_interruptible(wq_head, condition) \
({ \
int __ret = 0; \
might_sleep(); \
if (!(condition)) \
__ret = __wait_event_interruptible(wq_head, condition); \
__ret; \
})
#define __wait_event_interruptible_timeout(wq_head, condition, timeout) \
___wait_event(wq_head, ___wait_cond_timeout(condition), \
TASK_INTERRUPTIBLE, 0, timeout, \
__ret = schedule_timeout(__ret))
/**
* wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout, in jiffies
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
* The @condition is checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* Returns:
* 0 if the @condition evaluated to %false after the @timeout elapsed,
* 1 if the @condition evaluated to %true after the @timeout elapsed,
* the remaining jiffies (at least 1) if the @condition evaluated
* to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
* interrupted by a signal.
*/
#define wait_event_interruptible_timeout(wq_head, condition, timeout) \
({ \
long __ret = timeout; \
might_sleep(); \
if (!___wait_cond_timeout(condition)) \
__ret = __wait_event_interruptible_timeout(wq_head, \
condition, timeout); \
__ret; \
})
#define __wait_event_hrtimeout(wq_head, condition, timeout, state) \
({ \
int __ret = 0; \
struct hrtimer_sleeper __t; \
\
hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \
HRTIMER_MODE_REL); \
if ((timeout) != KTIME_MAX) \
hrtimer_start_range_ns(&__t.timer, timeout, \
current->timer_slack_ns, \
HRTIMER_MODE_REL); \
\
__ret = ___wait_event(wq_head, condition, state, 0, 0, \
if (!__t.task) { \
__ret = -ETIME; \
break; \
} \
schedule()); \
\
hrtimer_cancel(&__t.timer); \
destroy_hrtimer_on_stack(&__t.timer); \
__ret; \
})
/**
* wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout, as a ktime_t
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
* The @condition is checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* The function returns 0 if @condition became true, or -ETIME if the timeout
* elapsed.
*/
#define wait_event_hrtimeout(wq_head, condition, timeout) \
({ \
int __ret = 0; \
might_sleep(); \
if (!(condition)) \
__ret = __wait_event_hrtimeout(wq_head, condition, timeout, \
TASK_UNINTERRUPTIBLE); \
__ret; \
})
/**
* wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout, as a ktime_t
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
* The @condition is checked each time the waitqueue @wq is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* The function returns 0 if @condition became true, -ERESTARTSYS if it was
* interrupted by a signal, or -ETIME if the timeout elapsed.
*/
#define wait_event_interruptible_hrtimeout(wq, condition, timeout) \
({ \
long __ret = 0; \
might_sleep(); \
if (!(condition)) \
__ret = __wait_event_hrtimeout(wq, condition, timeout, \
TASK_INTERRUPTIBLE); \
__ret; \
})
#define __wait_event_interruptible_exclusive(wq, condition) \
___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
schedule())
#define wait_event_interruptible_exclusive(wq, condition) \
({ \
int __ret = 0; \
might_sleep(); \
if (!(condition)) \
__ret = __wait_event_interruptible_exclusive(wq, condition); \
__ret; \
})
#define __wait_event_killable_exclusive(wq, condition) \
___wait_event(wq, condition, TASK_KILLABLE, 1, 0, \
schedule())
#define wait_event_killable_exclusive(wq, condition) \
({ \
int __ret = 0; \
might_sleep(); \
if (!(condition)) \
__ret = __wait_event_killable_exclusive(wq, condition); \
__ret; \
})
#define __wait_event_freezable_exclusive(wq, condition) \
___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
freezable_schedule())
#define wait_event_freezable_exclusive(wq, condition) \
({ \
int __ret = 0; \
might_sleep(); \
if (!(condition)) \
__ret = __wait_event_freezable_exclusive(wq, condition); \
__ret; \
})
/**
* wait_event_idle - wait for a condition without contributing to system load
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_IDLE) until the
* @condition evaluates to true.
* The @condition is checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
*/
#define wait_event_idle(wq_head, condition) \
do { \
might_sleep(); \
if (!(condition)) \
___wait_event(wq_head, condition, TASK_IDLE, 0, 0, schedule()); \
} while (0)
/**
* wait_event_idle_exclusive - wait for a condition with contributing to system load
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_IDLE) until the
* @condition evaluates to true.
* The @condition is checked each time the waitqueue @wq_head is woken up.
*
* The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag
* set thus if other processes wait on the same list, when this
* process is woken further processes are not considered.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
*/
#define wait_event_idle_exclusive(wq_head, condition) \
do { \
might_sleep(); \
if (!(condition)) \
___wait_event(wq_head, condition, TASK_IDLE, 1, 0, schedule()); \
} while (0)
#define __wait_event_idle_timeout(wq_head, condition, timeout) \
___wait_event(wq_head, ___wait_cond_timeout(condition), \
TASK_IDLE, 0, timeout, \
__ret = schedule_timeout(__ret))
/**
* wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout, in jiffies
*
* The process is put to sleep (TASK_IDLE) until the
* @condition evaluates to true. The @condition is checked each time
* the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* Returns:
* 0 if the @condition evaluated to %false after the @timeout elapsed,
* 1 if the @condition evaluated to %true after the @timeout elapsed,
* or the remaining jiffies (at least 1) if the @condition evaluated
* to %true before the @timeout elapsed.
*/
#define wait_event_idle_timeout(wq_head, condition, timeout) \
({ \
long __ret = timeout; \
might_sleep(); \
if (!___wait_cond_timeout(condition)) \
__ret = __wait_event_idle_timeout(wq_head, condition, timeout); \
__ret; \
})
#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \
___wait_event(wq_head, ___wait_cond_timeout(condition), \
TASK_IDLE, 1, timeout, \
__ret = schedule_timeout(__ret))
/**
* wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout, in jiffies
*
* The process is put to sleep (TASK_IDLE) until the
* @condition evaluates to true. The @condition is checked each time
* the waitqueue @wq_head is woken up.
*
* The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag
* set thus if other processes wait on the same list, when this
* process is woken further processes are not considered.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* Returns:
* 0 if the @condition evaluated to %false after the @timeout elapsed,
* 1 if the @condition evaluated to %true after the @timeout elapsed,
* or the remaining jiffies (at least 1) if the @condition evaluated
* to %true before the @timeout elapsed.
*/
#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \
({ \
long __ret = timeout; \
might_sleep(); \
if (!___wait_cond_timeout(condition)) \
__ret = __wait_event_idle_exclusive_timeout(wq_head, condition, timeout);\
__ret; \
})
extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *);
extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);
#define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \
({ \
int __ret; \
DEFINE_WAIT(__wait); \
if (exclusive) \
__wait.flags |= WQ_FLAG_EXCLUSIVE; \
do { \
__ret = fn(&(wq), &__wait); \
if (__ret) \
break; \
} while (!(condition)); \
__remove_wait_queue(&(wq), &__wait); \
__set_current_state(TASK_RUNNING); \
__ret; \
})
/**
* wait_event_interruptible_locked - sleep until a condition gets true
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
* The @condition is checked each time the waitqueue @wq is woken up.
*
* It must be called with wq.lock being held. This spinlock is
* unlocked while sleeping but @condition testing is done while lock
* is held and when this macro exits the lock is held.
*
* The lock is locked/unlocked using spin_lock()/spin_unlock()
* functions which must match the way they are locked/unlocked outside
* of this macro.
*
* wake_up_locked() has to be called after changing any variable that could
* change the result of the wait condition.
*
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
#define wait_event_interruptible_locked(wq, condition) \
((condition) \
? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr))
/**
* wait_event_interruptible_locked_irq - sleep until a condition gets true
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
* The @condition is checked each time the waitqueue @wq is woken up.
*
* It must be called with wq.lock being held. This spinlock is
* unlocked while sleeping but @condition testing is done while lock
* is held and when this macro exits the lock is held.
*
* The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq()
* functions which must match the way they are locked/unlocked outside
* of this macro.
*
* wake_up_locked() has to be called after changing any variable that could
* change the result of the wait condition.
*
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
#define wait_event_interruptible_locked_irq(wq, condition) \
((condition) \
? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq))
/**
* wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
* The @condition is checked each time the waitqueue @wq is woken up.
*
* It must be called with wq.lock being held. This spinlock is
* unlocked while sleeping but @condition testing is done while lock
* is held and when this macro exits the lock is held.
*
* The lock is locked/unlocked using spin_lock()/spin_unlock()
* functions which must match the way they are locked/unlocked outside
* of this macro.
*
* The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag
* set thus when other process waits process on the list if this
* process is awaken further processes are not considered.
*
* wake_up_locked() has to be called after changing any variable that could
* change the result of the wait condition.
*
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
#define wait_event_interruptible_exclusive_locked(wq, condition) \
((condition) \
? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr))
/**
* wait_event_interruptible_exclusive_locked_irq - sleep until a condition gets true
* @wq: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received.
* The @condition is checked each time the waitqueue @wq is woken up.
*
* It must be called with wq.lock being held. This spinlock is
* unlocked while sleeping but @condition testing is done while lock
* is held and when this macro exits the lock is held.
*
* The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq()
* functions which must match the way they are locked/unlocked outside
* of this macro.
*
* The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag
* set thus when other process waits process on the list if this
* process is awaken further processes are not considered.
*
* wake_up_locked() has to be called after changing any variable that could
* change the result of the wait condition.
*
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
#define wait_event_interruptible_exclusive_locked_irq(wq, condition) \
((condition) \
? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq))
#define __wait_event_killable(wq, condition) \
___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule())
/**
* wait_event_killable - sleep until a condition gets true
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
*
* The process is put to sleep (TASK_KILLABLE) until the
* @condition evaluates to true or a signal is received.
* The @condition is checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* The function will return -ERESTARTSYS if it was interrupted by a
* signal and 0 if @condition evaluated to true.
*/
#define wait_event_killable(wq_head, condition) \
({ \
int __ret = 0; \
might_sleep(); \
if (!(condition)) \
__ret = __wait_event_killable(wq_head, condition); \
__ret; \
})
#define __wait_event_killable_timeout(wq_head, condition, timeout) \
___wait_event(wq_head, ___wait_cond_timeout(condition), \
TASK_KILLABLE, 0, timeout, \
__ret = schedule_timeout(__ret))
/**
* wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @timeout: timeout, in jiffies
*
* The process is put to sleep (TASK_KILLABLE) until the
* @condition evaluates to true or a kill signal is received.
* The @condition is checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* Returns:
* 0 if the @condition evaluated to %false after the @timeout elapsed,
* 1 if the @condition evaluated to %true after the @timeout elapsed,
* the remaining jiffies (at least 1) if the @condition evaluated
* to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
* interrupted by a kill signal.
*
* Only kill signals interrupt this process.
*/
#define wait_event_killable_timeout(wq_head, condition, timeout) \
({ \
long __ret = timeout; \
might_sleep(); \
if (!___wait_cond_timeout(condition)) \
__ret = __wait_event_killable_timeout(wq_head, \
condition, timeout); \
__ret; \
})
#define __wait_event_lock_irq(wq_head, condition, lock, cmd) \
(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
spin_unlock_irq(&lock); \
cmd; \
schedule(); \
spin_lock_irq(&lock))
/**
* wait_event_lock_irq_cmd - sleep until a condition gets true. The
* condition is checked under the lock. This
* is expected to be called with the lock
* taken.
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @lock: a locked spinlock_t, which will be released before cmd
* and schedule() and reacquired afterwards.
* @cmd: a command which is invoked outside the critical section before
* sleep
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true. The @condition is checked each time
* the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* This is supposed to be called while holding the lock. The lock is
* dropped before invoking the cmd and going to sleep and is reacquired
* afterwards.
*/
#define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd) \
do { \
if (condition) \
break; \
__wait_event_lock_irq(wq_head, condition, lock, cmd); \
} while (0)
/**
* wait_event_lock_irq - sleep until a condition gets true. The
* condition is checked under the lock. This
* is expected to be called with the lock
* taken.
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @lock: a locked spinlock_t, which will be released before schedule()
* and reacquired afterwards.
*
* The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
* @condition evaluates to true. The @condition is checked each time
* the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* This is supposed to be called while holding the lock. The lock is
* dropped before going to sleep and is reacquired afterwards.
*/
#define wait_event_lock_irq(wq_head, condition, lock) \
do { \
if (condition) \
break; \
__wait_event_lock_irq(wq_head, condition, lock, ); \
} while (0)
#define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd) \
___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \
spin_unlock_irq(&lock); \
cmd; \
schedule(); \
spin_lock_irq(&lock))
/**
* wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
* The condition is checked under the lock. This is expected to
* be called with the lock taken.
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @lock: a locked spinlock_t, which will be released before cmd and
* schedule() and reacquired afterwards.
* @cmd: a command which is invoked outside the critical section before
* sleep
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or a signal is received. The @condition is
* checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* This is supposed to be called while holding the lock. The lock is
* dropped before invoking the cmd and going to sleep and is reacquired
* afterwards.
*
* The macro will return -ERESTARTSYS if it was interrupted by a signal
* and 0 if @condition evaluated to true.
*/
#define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd) \
({ \
int __ret = 0; \
if (!(condition)) \
__ret = __wait_event_interruptible_lock_irq(wq_head, \
condition, lock, cmd); \
__ret; \
})
/**
* wait_event_interruptible_lock_irq - sleep until a condition gets true.
* The condition is checked under the lock. This is expected
* to be called with the lock taken.
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @lock: a locked spinlock_t, which will be released before schedule()
* and reacquired afterwards.
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or signal is received. The @condition is
* checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* This is supposed to be called while holding the lock. The lock is
* dropped before going to sleep and is reacquired afterwards.
*
* The macro will return -ERESTARTSYS if it was interrupted by a signal
* and 0 if @condition evaluated to true.
*/
#define wait_event_interruptible_lock_irq(wq_head, condition, lock) \
({ \
int __ret = 0; \
if (!(condition)) \
__ret = __wait_event_interruptible_lock_irq(wq_head, \
condition, lock,); \
__ret; \
})
#define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state) \
___wait_event(wq_head, ___wait_cond_timeout(condition), \
state, 0, timeout, \
spin_unlock_irq(&lock); \
__ret = schedule_timeout(__ret); \
spin_lock_irq(&lock));
/**
* wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
* true or a timeout elapses. The condition is checked under
* the lock. This is expected to be called with the lock taken.
* @wq_head: the waitqueue to wait on
* @condition: a C expression for the event to wait for
* @lock: a locked spinlock_t, which will be released before schedule()
* and reacquired afterwards.
* @timeout: timeout, in jiffies
*
* The process is put to sleep (TASK_INTERRUPTIBLE) until the
* @condition evaluates to true or signal is received. The @condition is
* checked each time the waitqueue @wq_head is woken up.
*
* wake_up() has to be called after changing any variable that could
* change the result of the wait condition.
*
* This is supposed to be called while holding the lock. The lock is
* dropped before going to sleep and is reacquired afterwards.
*
* The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it
* was interrupted by a signal, and the remaining jiffies otherwise
* if the condition evaluated to true before the timeout elapsed.
*/
#define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock, \
timeout) \
({ \
long __ret = timeout; \
if (!___wait_cond_timeout(condition)) \
__ret = __wait_event_lock_irq_timeout( \
wq_head, condition, lock, timeout, \
TASK_INTERRUPTIBLE); \
__ret; \
})
#define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout) \
({ \
long __ret = timeout; \
if (!___wait_cond_timeout(condition)) \
__ret = __wait_event_lock_irq_timeout( \
wq_head, condition, lock, timeout, \
TASK_UNINTERRUPTIBLE); \
__ret; \
})
/*
* Waitqueues which are removed from the waitqueue_head at wakeup time
*/
void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
#define DEFINE_WAIT_FUNC(name, function) \
struct wait_queue_entry name = { \
.private = current, \
.func = function, \
.entry = LIST_HEAD_INIT((name).entry), \
}
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
#define init_wait(wait) \
do { \
(wait)->private = current; \
(wait)->func = autoremove_wake_function; \
INIT_LIST_HEAD(&(wait)->entry); \
(wait)->flags = 0; \
} while (0)
bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg);
#endif /* _LINUX_WAIT_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
#include "reflink.h"
#include "subpage.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
* when auto defrag is enabled we
* queue up these defrag structs to remember which
* inodes need defragging passes
*/
struct inode_defrag {
struct rb_node rb_node;
/* objectid */
u64 ino;
/*
* transid where the defrag was added, we search for
* extents newer than this
*/
u64 transid;
/* root objectid */
u64 root;
/* last offset we were able to defrag */
u64 last_offset;
/* if we've wrapped around back to zero once already */
int cycled;
};
static int __compare_inode_defrag(struct inode_defrag *defrag1,
struct inode_defrag *defrag2)
{
if (defrag1->root > defrag2->root)
return 1;
else if (defrag1->root < defrag2->root)
return -1;
else if (defrag1->ino > defrag2->ino)
return 1;
else if (defrag1->ino < defrag2->ino)
return -1;
else
return 0;
}
/* pop a record for an inode into the defrag tree. The lock
* must be held already
*
* If you're inserting a record for an older transid than an
* existing record, the transid already in the tree is lowered
*
* If an existing record is found the defrag item you
* pass in is freed
*/
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct inode_defrag *entry;
struct rb_node **p;
struct rb_node *parent = NULL;
int ret;
p = &fs_info->defrag_inodes.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
ret = __compare_inode_defrag(defrag, entry);
if (ret < 0)
p = &parent->rb_left;
else if (ret > 0)
p = &parent->rb_right;
else {
/* if we're reinserting an entry for
* an old defrag run, make sure to
* lower the transid of our existing record
*/
if (defrag->transid < entry->transid)
entry->transid = defrag->transid;
if (defrag->last_offset > entry->last_offset)
entry->last_offset = defrag->last_offset;
return -EEXIST;
}
}
set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
rb_link_node(&defrag->rb_node, parent, p);
rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
return 0;
}
static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
return 0;
if (btrfs_fs_closing(fs_info))
return 0;
return 1;
}
/*
* insert a defrag record for this inode if auto defrag is
* enabled
*/
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode_defrag *defrag;
u64 transid;
int ret;
if (!__need_auto_defrag(fs_info))
return 0; if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
return 0;
if (trans) transid = trans->transid;
else
transid = inode->root->last_trans; defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
if (!defrag)
return -ENOMEM;
defrag->ino = btrfs_ino(inode);
defrag->transid = transid;
defrag->root = root->root_key.objectid;
spin_lock(&fs_info->defrag_inodes_lock);
if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
/*
* If we set IN_DEFRAG flag and evict the inode from memory,
* and then re-read this inode, this new inode doesn't have
* IN_DEFRAG flag. At the case, we may find the existed defrag.
*/
ret = __btrfs_add_inode_defrag(inode, defrag);
if (ret)
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
} else {
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}
spin_unlock(&fs_info->defrag_inodes_lock);
return 0;
}
/*
* Requeue the defrag object. If there is a defrag object that points to
* the same inode in the tree, we will merge them together (by
* __btrfs_add_inode_defrag()) and free the one that we want to requeue.
*/
static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
if (!__need_auto_defrag(fs_info))
goto out;
/*
* Here we don't check the IN_DEFRAG flag, because we need merge
* them together.
*/
spin_lock(&fs_info->defrag_inodes_lock);
ret = __btrfs_add_inode_defrag(inode, defrag);
spin_unlock(&fs_info->defrag_inodes_lock);
if (ret)
goto out;
return;
out:
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}
/*
* pick the defragable inode that we want, if it doesn't exist, we will get
* the next one.
*/
static struct inode_defrag *
btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
struct inode_defrag *entry = NULL;
struct inode_defrag tmp;
struct rb_node *p;
struct rb_node *parent = NULL;
int ret;
tmp.ino = ino;
tmp.root = root;
spin_lock(&fs_info->defrag_inodes_lock);
p = fs_info->defrag_inodes.rb_node;
while (p) {
parent = p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
ret = __compare_inode_defrag(&tmp, entry);
if (ret < 0)
p = parent->rb_left;
else if (ret > 0)
p = parent->rb_right;
else
goto out;
}
if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
parent = rb_next(parent);
if (parent)
entry = rb_entry(parent, struct inode_defrag, rb_node);
else
entry = NULL;
}
out:
if (entry)
rb_erase(parent, &fs_info->defrag_inodes);
spin_unlock(&fs_info->defrag_inodes_lock);
return entry;
}
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
struct inode_defrag *defrag;
struct rb_node *node;
spin_lock(&fs_info->defrag_inodes_lock);
node = rb_first(&fs_info->defrag_inodes);
while (node) {
rb_erase(node, &fs_info->defrag_inodes);
defrag = rb_entry(node, struct inode_defrag, rb_node);
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
cond_resched_lock(&fs_info->defrag_inodes_lock);
node = rb_first(&fs_info->defrag_inodes);
}
spin_unlock(&fs_info->defrag_inodes_lock);
}
#define BTRFS_DEFRAG_BATCH 1024
static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct inode_defrag *defrag)
{
struct btrfs_root *inode_root;
struct inode *inode;
struct btrfs_ioctl_defrag_range_args range;
int num_defrag;
int ret;
/* get the inode */
inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
if (IS_ERR(inode_root)) {
ret = PTR_ERR(inode_root);
goto cleanup;
}
inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
btrfs_put_root(inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto cleanup;
}
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
range.start = defrag->last_offset;
sb_start_write(fs_info->sb);
num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
/*
* if we filled the whole defrag batch, there
* must be more work to do. Queue this defrag
* again
*/
if (num_defrag == BTRFS_DEFRAG_BATCH) {
defrag->last_offset = range.start;
btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
} else if (defrag->last_offset && !defrag->cycled) {
/*
* we didn't fill our defrag batch, but
* we didn't start at zero. Make sure we loop
* around to the start of the file.
*/
defrag->last_offset = 0;
defrag->cycled = 1;
btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
} else {
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}
iput(inode);
return 0;
cleanup:
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
return ret;
}
/*
* run through the list of inodes in the FS that need
* defragging
*/
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
struct inode_defrag *defrag;
u64 first_ino = 0;
u64 root_objectid = 0;
atomic_inc(&fs_info->defrag_running);
while (1) {
/* Pause the auto defragger. */
if (test_bit(BTRFS_FS_STATE_REMOUNTING,
&fs_info->fs_state))
break;
if (!__need_auto_defrag(fs_info))
break;
/* find an inode to defrag */
defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
first_ino);
if (!defrag) {
if (root_objectid || first_ino) {
root_objectid = 0;
first_ino = 0;
continue;
} else {
break;
}
}
first_ino = defrag->ino + 1;
root_objectid = defrag->root;
__btrfs_run_defrag_inode(fs_info, defrag);
}
atomic_dec(&fs_info->defrag_running);
/*
* during unmount, we use the transaction_wait queue to
* wait for the defragger to stop
*/
wake_up(&fs_info->transaction_wait);
return 0;
}
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code.
*/
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
struct page **prepared_pages,
struct iov_iter *i)
{
size_t copied = 0;
size_t total_copied = 0;
int pg = 0;
int offset = offset_in_page(pos); while (write_bytes > 0) { size_t count = min_t(size_t,
PAGE_SIZE - offset, write_bytes);
struct page *page = prepared_pages[pg];
/*
* Copy data from userspace to the current page
*/
copied = copy_page_from_iter_atomic(page, offset, count, i);
/* Flush processor's dcache for this page */
flush_dcache_page(page);
/*
* if we get a partial write, we can end up with
* partially up to date pages. These add
* a lot of complexity, so make sure they don't
* happen by forcing this copy to be retried.
*
* The rest of the btrfs_file_write code will fall
* back to page at a time copies after we return 0.
*/
if (unlikely(copied < count)) {
if (!PageUptodate(page)) {
iov_iter_revert(i, copied);
copied = 0;
}
if (!copied)
break;
}
write_bytes -= copied;
total_copied += copied;
offset += copied;
if (offset == PAGE_SIZE) {
pg++;
offset = 0;
}
}
return total_copied;
}
/*
* unlocks pages after btrfs_file_write is done with them
*/
static void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
size_t i;
for (i = 0; i < num_pages; i++) {
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
* clear it here. There should be no need to mark the pages
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page()
*/
ClearPageChecked(pages[i]);
unlock_page(pages[i]);
put_page(pages[i]);
}
}
/*
* After btrfs_copy_from_user(), update the following things for delalloc:
* - Mark newly dirtied pages as DELALLOC in the io tree.
* Used to advise which range is to be written back.
* - Mark modified pages as Uptodate/Dirty and not needing COW fixup
* - Update inode size for past EOF write
*/
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached, bool noreserve)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int err = 0;
int i;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
u64 end_pos = pos + write_bytes;
loff_t isize = i_size_read(&inode->vfs_inode);
unsigned int extra_bits = 0;
if (write_bytes == 0)
return 0;
if (noreserve)
extra_bits |= EXTENT_NORESERVE;
start_pos = round_down(pos, fs_info->sectorsize);
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
ASSERT(num_bytes <= U32_MAX);
end_of_last_block = start_pos + num_bytes - 1;
/*
* The pages may have already been dirty, clear out old accounting so
* we can set things up properly
*/
clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, cached);
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
if (err)
return err;
for (i = 0; i < num_pages; i++) { struct page *p = pages[i];
btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
ClearPageChecked(p);
btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
}
/*
* we've only changed i_size in ram, and we haven't updated
* the disk i_size. There is no need to log the inode
* at this time.
*/
if (end_pos > isize) i_size_write(&inode->vfs_inode, end_pos);
return 0;
}
/*
* this drops all the extents in the cache that intersect the range
* [start, end]. Existing extents are split as required.
*/
void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
int skip_pinned)
{
struct extent_map *em;
struct extent_map *split = NULL;
struct extent_map *split2 = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree; u64 len = end - start + 1;
u64 gen;
int ret;
int testend = 1;
unsigned long flags;
int compressed = 0;
bool modified;
WARN_ON(end < start); if (end == (u64)-1) {
len = (u64)-1;
testend = 0;
}
while (1) {
int no_splits = 0;
modified = false;
if (!split) split = alloc_extent_map(); if (!split2) split2 = alloc_extent_map(); if (!split || !split2)
no_splits = 1;
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (!em) {
write_unlock(&em_tree->lock);
break;
}
flags = em->flags;
gen = em->generation;
if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
if (testend && em->start + em->len >= start + len) { free_extent_map(em);
write_unlock(&em_tree->lock);
break;
}
start = em->start + em->len;
if (testend)
len = start + len - (em->start + em->len);
free_extent_map(em);
write_unlock(&em_tree->lock);
continue;
}
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &flags);
modified = !list_empty(&em->list);
if (no_splits)
goto next;
if (em->start < start) {
split->start = em->start;
split->len = start - em->start;
if (em->block_start < EXTENT_MAP_LAST_BYTE) {
split->orig_start = em->orig_start;
split->block_start = em->block_start;
if (compressed) split->block_len = em->block_len;
else
split->block_len = split->len;
split->orig_block_len = max(split->block_len,
em->orig_block_len);
split->ram_bytes = em->ram_bytes;
} else {
split->orig_start = split->start;
split->block_len = 0;
split->block_start = em->block_start;
split->orig_block_len = 0;
split->ram_bytes = split->len;
}
split->generation = gen;
split->flags = flags;
split->compress_type = em->compress_type;
replace_extent_mapping(em_tree, em, split, modified);
free_extent_map(split);
split = split2;
split2 = NULL;
}
if (testend && em->start + em->len > start + len) {
u64 diff = start + len - em->start;
split->start = start + len;
split->len = em->start + em->len - (start + len);
split->flags = flags;
split->compress_type = em->compress_type;
split->generation = gen;
if (em->block_start < EXTENT_MAP_LAST_BYTE) {
split->orig_block_len = max(em->block_len,
em->orig_block_len);
split->ram_bytes = em->ram_bytes;
if (compressed) {
split->block_len = em->block_len;
split->block_start = em->block_start;
split->orig_start = em->orig_start;
} else {
split->block_len = split->len;
split->block_start = em->block_start
+ diff;
split->orig_start = em->orig_start;
}
} else {
split->ram_bytes = split->len;
split->orig_start = split->start;
split->block_len = 0;
split->block_start = em->block_start;
split->orig_block_len = 0;
}
if (extent_map_in_tree(em)) {
replace_extent_mapping(em_tree, em, split,
modified);
} else {
ret = add_extent_mapping(em_tree, split,
modified);
ASSERT(ret == 0); /* Logic error */
}
free_extent_map(split);
split = NULL;
}
next:
if (extent_map_in_tree(em))
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* once for us */
free_extent_map(em);
/* once for the tree*/
free_extent_map(em);
}
if (split) free_extent_map(split); if (split2) free_extent_map(split2);
}
/*
* this is very complex, but the basic idea is to drop all extents
* in the range start - end. hint_block is filled in with a block number
* that would be a good hint to the block allocator for this file.
*
* If an extent intersects the range but is not entirely inside the range
* it is either truncated or split. Anything entirely inside the range
* is deleted from the tree.
*
* Note: the VFS' inode number of bytes is not updated, it's up to the caller
* to deal with that. We set the field 'bytes_found' of the arguments structure
* with the number of allocated bytes found in the target range, so that the
* caller can update the inode's number of bytes in an atomic way when
* replacing extents in a range to avoid races with stat(2).
*/
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_drop_extents_args *args)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
u64 ino = btrfs_ino(inode);
u64 search_start = args->start;
u64 disk_bytenr = 0;
u64 num_bytes = 0;
u64 extent_offset = 0;
u64 extent_end = 0;
u64 last_end = args->start;
int del_nr = 0;
int del_slot = 0;
int extent_type;
int recow;
int ret;
int modify_tree = -1;
int update_refs;
int found = 0;
int leafs_visited = 0;
struct btrfs_path *path = args->path;
args->bytes_found = 0;
args->extent_inserted = false;
/* Must always have a path if ->replace_extent is true */
ASSERT(!(args->replace_extent && !args->path));
if (!path) {
path = btrfs_alloc_path(); if (!path) {
ret = -ENOMEM;
goto out;
}
}
if (args->drop_cache) btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0); if (args->start >= inode->disk_i_size && !args->replace_extent)
modify_tree = 0;
update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
search_start, modify_tree);
if (ret < 0)
break;
if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--;
}
ret = 0;
leafs_visited++;
next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) { BUG_ON(del_nr > 0);
ret = btrfs_next_leaf(root, path);
if (ret < 0)
break;
if (ret > 0) {
ret = 0;
break;
}
leafs_visited++;
leaf = path->nodes[0];
recow = 1;
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid > ino)
break;
if (WARN_ON_ONCE(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) {
ASSERT(del_nr == 0);
path->slots[0]++; goto next_slot;
}
if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
break;
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
extent_offset = btrfs_file_extent_offset(leaf, fi);
extent_end = key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = key.offset +
btrfs_file_extent_ram_bytes(leaf, fi);
} else {
/* can't happen */
BUG();
}
/*
* Don't skip extent items representing 0 byte lengths. They
* used to be created (bug) if while punching holes we hit
* -ENOSPC condition. So if we find one here, just ensure we
* delete it, otherwise we would insert a new file extent item
* with the same key (offset) as that 0 bytes length file
* extent item in the call to setup_items_for_insert() later
* in this function.
*/
if (extent_end == key.offset && extent_end >= search_start) {
last_end = extent_end;
goto delete_extent_item;
}
if (extent_end <= search_start) { path->slots[0]++;
goto next_slot;
}
found = 1;
search_start = max(key.offset, args->start); if (recow || !modify_tree) {
modify_tree = -1;
btrfs_release_path(path);
continue;
}
/*
* | - range to drop - |
* | -------- extent -------- |
*/
if (args->start > key.offset && args->end < extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
break;
}
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = args->start;
ret = btrfs_duplicate_item(trans, root, path,
&new_key);
if (ret == -EAGAIN) {
btrfs_release_path(path);
continue;
}
if (ret < 0)
break;
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
btrfs_set_file_extent_num_bytes(leaf, fi,
args->start - key.offset);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_offset += args->start - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - args->start);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
disk_bytenr, num_bytes, 0);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
args->start - extent_offset);
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
}
key.offset = args->start;
}
/*
* From here on out we will have actually dropped something, so
* last_end can be updated.
*/
last_end = extent_end;
/*
* | ---- range to drop ----- |
* | -------- extent -------- |
*/
if (args->start <= key.offset && args->end < extent_end) { if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
break;
}
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = args->end;
btrfs_set_item_key_safe(fs_info, path, &new_key);
extent_offset += args->end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - args->end);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0) args->bytes_found += args->end - key.offset;
break;
}
search_start = extent_end;
/*
* | ---- range to drop ----- |
* | -------- extent -------- |
*/
if (args->start > key.offset && args->end >= extent_end) {
BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
break;
}
btrfs_set_file_extent_num_bytes(leaf, fi,
args->start - key.offset);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0) args->bytes_found += extent_end - args->start; if (args->end == extent_end)
break;
path->slots[0]++;
goto next_slot;
}
/*
* | ---- range to drop ----- |
* | ------ extent ------ |
*/
if (args->start <= key.offset && args->end >= extent_end) {
delete_extent_item:
if (del_nr == 0) { del_slot = path->slots[0];
del_nr = 1;
} else {
BUG_ON(del_slot + del_nr != path->slots[0]); del_nr++;
}
if (update_refs &&
extent_type == BTRFS_FILE_EXTENT_INLINE) {
args->bytes_found += extent_end - key.offset;
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
} else if (update_refs && disk_bytenr > 0) {
btrfs_init_generic_ref(&ref,
BTRFS_DROP_DELAYED_REF,
disk_bytenr, num_bytes, 0);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key.objectid,
key.offset - extent_offset);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */ args->bytes_found += extent_end - key.offset;
}
if (args->end == extent_end)
break;
if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { path->slots[0]++;
goto next_slot;
}
ret = btrfs_del_items(trans, root, path, del_slot,
del_nr);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
del_nr = 0;
del_slot = 0;
btrfs_release_path(path);
continue;
}
BUG();
}
if (!ret && del_nr > 0) {
/*
* Set path->slots[0] to first slot, so that after the delete
* if items are move off from our leaf to its immediate left or
* right neighbor leafs, we end up with a correct and adjusted
* path->slots[0] for our insertion (if args->replace_extent).
*/
path->slots[0] = del_slot;
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret)
btrfs_abort_transaction(trans, ret);
}
leaf = path->nodes[0];
/*
* If btrfs_del_items() was called, it might have deleted a leaf, in
* which case it unlocked our path, so check path->locks[0] matches a
* write lock.
*/
if (!ret && args->replace_extent && leafs_visited == 1 && path->locks[0] == BTRFS_WRITE_LOCK && btrfs_leaf_free_space(leaf) >=
sizeof(struct btrfs_item) + args->extent_item_size) {
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = args->start;
if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
struct btrfs_key slot_key;
btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
setup_items_for_insert(root, path, &key,
&args->extent_item_size, 1);
args->extent_inserted = true;
}
if (!args->path) btrfs_free_path(path); else if (!args->extent_inserted) btrfs_release_path(path);
out:
args->drop_end = found ? min(args->end, last_end) : args->end;
return ret;
}
static int extent_mergeable(struct extent_buffer *leaf, int slot,
u64 objectid, u64 bytenr, u64 orig_offset,
u64 *start, u64 *end)
{
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 extent_end;
if (slot < 0 || slot >= btrfs_header_nritems(leaf)) return 0;
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) return 0;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi))
return 0;
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if ((*start && *start != key.offset) || (*end && *end != extent_end))
return 0;
*start = key.offset;
*end = extent_end;
return 1;
}
/*
* Mark extent in the range start - end as written.
*
* This changes extent type from 'pre-allocated' to 'regular'. If only
* part of extent is marked as written, the extent will be split into
* two or three.
*/
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_file_extent_item *fi;
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
u64 bytenr;
u64 num_bytes;
u64 extent_end;
u64 orig_offset;
u64 other_start;
u64 other_end;
u64 split;
int del_nr = 0;
int del_slot = 0;
int recow;
int ret = 0;
u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
again:
recow = 0;
split = start;
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = split;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret > 0 && path->slots[0] > 0) path->slots[0]--; leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != ino ||
key.type != BTRFS_EXTENT_DATA_KEY) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if (key.offset > start || extent_end < end) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
memcpy(&new_key, &key, sizeof(new_key));
if (start == key.offset && end < extent_end) { other_start = 0;
other_end = start;
if (extent_mergeable(leaf, path->slots[0] - 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
new_key.offset = end;
btrfs_set_item_key_safe(fs_info, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi,
trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - end);
btrfs_set_file_extent_offset(leaf, fi,
end - orig_offset);
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi,
trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
end - other_start);
btrfs_mark_buffer_dirty(leaf);
goto out;
}
}
if (start > key.offset && end == extent_end) { other_start = end;
other_end = 0;
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
btrfs_set_file_extent_generation(leaf, fi,
trans->transid);
path->slots[0]++;
new_key.offset = start;
btrfs_set_item_key_safe(fs_info, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi,
trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
other_end - start);
btrfs_set_file_extent_offset(leaf, fi,
start - orig_offset);
btrfs_mark_buffer_dirty(leaf);
goto out;
}
}
while (start > key.offset || end < extent_end) { if (key.offset == start)
split = end;
new_key.offset = split;
ret = btrfs_duplicate_item(trans, root, path, &new_key);
if (ret == -EAGAIN) {
btrfs_release_path(path);
goto again;
}
if (ret < 0) { btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
split - key.offset);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - split);
btrfs_mark_buffer_dirty(leaf);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
num_bytes, 0);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
orig_offset);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
if (split == start) { key.offset = start;
} else {
if (start != key.offset) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
path->slots[0]--;
extent_end = end;
}
recow = 1;
}
other_start = end;
other_end = 0;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, 0);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
if (recow) {
btrfs_release_path(path);
goto again;
}
extent_end = other_end;
del_slot = path->slots[0] + 1;
del_nr++;
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
other_start = 0;
other_end = start;
if (extent_mergeable(leaf, path->slots[0] - 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
if (recow) { btrfs_release_path(path);
goto again;
}
key.offset = other_start;
del_slot = path->slots[0];
del_nr++;
ret = btrfs_free_extent(trans, &ref);
if (ret) { btrfs_abort_transaction(trans, ret);
goto out;
}
}
if (del_nr == 0) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_mark_buffer_dirty(leaf);
} else {
fi = btrfs_item_ptr(leaf, del_slot - 1,
struct btrfs_file_extent_item);
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - key.offset);
btrfs_mark_buffer_dirty(leaf);
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
out:
btrfs_free_path(path); return ret;
}
/*
* on error we return an unlocked page and the error value
* on success we return a locked page and 0
*/
static int prepare_uptodate_page(struct inode *inode,
struct page *page, u64 pos,
bool force_uptodate)
{
int ret = 0;
if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
!PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
if (ret)
return ret;
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
return -EIO;
}
/*
* Since btrfs_readpage() will unlock the page before it
* returns, there is a window where btrfs_releasepage() can be
* called to release the page. Here we check both inode
* mapping and PagePrivate() to make sure the page was not
* released.
*
* The private flag check is essential for subpage as we need
* to store extra bitmap using page->private.
*/
if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
unlock_page(page);
return -EAGAIN;
}
}
return 0;
}
/*
* this just gets pages into the page cache and locks them down.
*/
static noinline int prepare_pages(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
size_t write_bytes, bool force_uptodate)
{
int i;
unsigned long index = pos >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int err = 0;
int faili;
for (i = 0; i < num_pages; i++) {
again:
pages[i] = find_or_create_page(inode->i_mapping, index + i,
mask | __GFP_WRITE);
if (!pages[i]) {
faili = i - 1;
err = -ENOMEM;
goto fail;
}
err = set_page_extent_mapped(pages[i]);
if (err < 0) {
faili = i;
goto fail;
}
if (i == 0) err = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
if (!err && i == num_pages - 1) err = prepare_uptodate_page(inode, pages[i],
pos + write_bytes, false);
if (err) {
put_page(pages[i]);
if (err == -EAGAIN) {
err = 0;
goto again;
}
faili = i - 1;
goto fail;
}
wait_on_page_writeback(pages[i]);
}
return 0;
fail:
while (faili >= 0) { unlock_page(pages[faili]);
put_page(pages[faili]);
faili--;
}
return err;
}
/*
* This function locks the extent and properly waits for data=ordered extents
* to finish before allowing the pages to be modified if need.
*
* The return value:
* 1 - the extent is locked
* 0 - the extent is not locked, and everything is OK
* -EAGAIN - need re-prepare the pages
* the other < 0 number - Something wrong happens
*/
static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
size_t write_bytes,
u64 *lockstart, u64 *lockend,
struct extent_state **cached_state)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start_pos;
u64 last_pos;
int i;
int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize);
last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&inode->io_tree, start_pos, last_pos,
cached_state);
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
ordered->file_offset + ordered->num_bytes > start_pos &&
ordered->file_offset <= last_pos) {
unlock_extent_cached(&inode->io_tree, start_pos,
last_pos, cached_state);
for (i = 0; i < num_pages; i++) { unlock_page(pages[i]);
put_page(pages[i]);
}
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
}
if (ordered)
btrfs_put_ordered_extent(ordered); *lockstart = start_pos;
*lockend = last_pos;
ret = 1;
}
/*
* We should be called after prepare_pages() which should have locked
* all pages in the range.
*/
for (i = 0; i < num_pages; i++) WARN_ON(!PageLocked(pages[i]));
return ret;
}
static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
u64 lockstart, lockend;
u64 num_bytes;
int ret;
if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
return 0;
if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
num_bytes = lockend - lockstart + 1;
if (nowait) {
struct btrfs_ordered_extent *ordered;
if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
return -EAGAIN;
ordered = btrfs_lookup_ordered_range(inode, lockstart,
num_bytes);
if (ordered) {
btrfs_put_ordered_extent(ordered);
ret = -EAGAIN;
goto out_unlock;
}
} else {
btrfs_lock_and_flush_ordered_range(inode, lockstart,
lockend, NULL);
}
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
NULL, NULL, NULL, false);
if (ret <= 0) {
ret = 0;
if (!nowait)
btrfs_drew_write_unlock(&root->snapshot_lock);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
}
out_unlock:
unlock_extent(&inode->io_tree, lockstart, lockend);
return ret;
}
static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes)
{
return check_can_nocow(inode, pos, write_bytes, true);
}
/*
* Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
*
* @pos: File offset
* @write_bytes: The length to write, will be updated to the nocow writeable
* range
*
* This function will flush ordered extents in the range to ensure proper
* nocow checks.
*
* Return:
* >0 and update @write_bytes if we can do nocow write
* 0 if we can't do nocow write
* -EAGAIN if we can't get the needed lock or there are ordered extents
* for * (nowait == true) case
* <0 if other error happened
*
* NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
*/
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes)
{
return check_can_nocow(inode, pos, write_bytes, false);
}
void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
static void update_time_for_write(struct inode *inode)
{
struct timespec64 now;
if (IS_NOCMTIME(inode))
return;
now = current_time(inode);
if (!timespec64_equal(&inode->i_mtime, &now))
inode->i_mtime = now; if (!timespec64_equal(&inode->i_ctime, &now)) inode->i_ctime = now; if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
}
static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
size_t count)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
loff_t pos = iocb->ki_pos;
int ret;
loff_t oldsize;
loff_t start_pos;
if (iocb->ki_flags & IOCB_NOWAIT) {
size_t nocow_bytes = count;
/* We will allocate space in case nodatacow is not set, so bail */
if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0)
return -EAGAIN;
/*
* There are holes in the range or parts of the range that must
* be COWed (shared extents, RO block groups, etc), so just bail
* out.
*/
if (nocow_bytes < count)
return -EAGAIN;
}
current->backing_dev_info = inode_to_bdi(inode);
ret = file_remove_privs(file);
if (ret)
return ret;
/*
* We reserve space for updating the inode when we reserve space for the
* extent we are going to write, so we will enospc out there. We don't
* need to start yet another transaction to update the inode as we will
* update the inode when we finish writing whatever data we write.
*/
update_time_for_write(inode);
start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
if (start_pos > oldsize) {
/* Expand hole size to cover write data, preventing empty gap */
loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
if (ret) {
current->backing_dev_info = NULL;
return ret;
}
}
return 0;
}
static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct iov_iter *i)
{
struct file *file = iocb->ki_filp;
loff_t pos;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
size_t num_written = 0;
int nrptrs;
ssize_t ret;
bool only_release_metadata = false;
bool force_page_uptodate = false;
loff_t old_isize = i_size_read(inode);
unsigned int ilock_flags = 0;
if (iocb->ki_flags & IOCB_NOWAIT)
ilock_flags |= BTRFS_ILOCK_TRY;
ret = btrfs_inode_lock(inode, ilock_flags);
if (ret < 0)
return ret;
ret = generic_write_checks(iocb, i);
if (ret <= 0)
goto out;
ret = btrfs_write_check(iocb, i, ret);
if (ret < 0)
goto out;
pos = iocb->ki_pos;
nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
PAGE_SIZE / (sizeof(struct page *)));
nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
nrptrs = max(nrptrs, 8);
pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
if (!pages) {
ret = -ENOMEM;
goto out;
}
while (iov_iter_count(i) > 0) { struct extent_state *cached_state = NULL;
size_t offset = offset_in_page(pos);
size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_SIZE -
offset);
size_t num_pages;
size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
size_t dirty_sectors;
size_t num_sectors;
int extents_locked;
/*
* Fault pages before locking them in prepare_pages
* to avoid recursive lock
*/
if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
ret = -EFAULT;
break;
}
only_release_metadata = false;
sector_offset = pos & (fs_info->sectorsize - 1);
extent_changeset_release(data_reserved);
ret = btrfs_check_data_free_space(BTRFS_I(inode),
&data_reserved, pos,
write_bytes);
if (ret < 0) {
/*
* If we don't have to COW at the offset, reserve
* metadata only. write_bytes may get smaller than
* requested here.
*/
if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
&write_bytes) > 0)
only_release_metadata = true;
else
break;
}
num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); WARN_ON(num_pages > nrptrs); reserve_bytes = round_up(write_bytes + sector_offset,
fs_info->sectorsize);
WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
reserve_bytes);
if (ret) { if (!only_release_metadata) btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, pos,
write_bytes);
else
btrfs_check_nocow_unlock(BTRFS_I(inode));
break;
}
release_bytes = reserve_bytes;
again:
/*
* This is going to setup the pages array with the number of
* pages we want, so we don't really need to worry about the
* contents of pages from loop to loop
*/
ret = prepare_pages(inode, pages, num_pages,
pos, write_bytes,
force_page_uptodate);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
break;
}
extents_locked = lock_and_cleanup_extent_if_need(
BTRFS_I(inode), pages,
num_pages, pos, write_bytes, &lockstart,
&lockend, &cached_state);
if (extents_locked < 0) {
if (extents_locked == -EAGAIN)
goto again;
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
ret = extents_locked;
break;
}
copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
dirty_sectors = round_up(copied + sector_offset,
fs_info->sectorsize);
dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
/*
* if we have trouble faulting in the pages, fall
* back to one page at a time
*/
if (copied < write_bytes)
nrptrs = 1;
if (copied == 0) {
force_page_uptodate = true;
dirty_sectors = 0;
dirty_pages = 0;
} else {
force_page_uptodate = false;
dirty_pages = DIV_ROUND_UP(copied + offset,
PAGE_SIZE);
}
if (num_sectors > dirty_sectors) {
/* release everything except the sectors we dirtied */
release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
if (only_release_metadata) {
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
u64 __pos;
__pos = round_down(pos,
fs_info->sectorsize) +
(dirty_pages << PAGE_SHIFT);
btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved, __pos,
release_bytes, true);
}
}
release_bytes = round_up(copied + sector_offset,
fs_info->sectorsize);
ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
dirty_pages, pos, copied,
&cached_state, only_release_metadata);
/*
* If we have not locked the extent range, because the range's
* start offset is >= i_size, we might still have a non-NULL
* cached extent state, acquired while marking the extent range
* as delalloc through btrfs_dirty_pages(). Therefore free any
* possible cached extent state to avoid a memory leak.
*/
if (extents_locked)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
lockstart, lockend, &cached_state);
else
free_extent_state(cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
btrfs_drop_pages(pages, num_pages);
break;
}
release_bytes = 0;
if (only_release_metadata)
btrfs_check_nocow_unlock(BTRFS_I(inode));
btrfs_drop_pages(pages, num_pages);
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
pos += copied;
num_written += copied;
}
kfree(pages);
if (release_bytes) {
if (only_release_metadata) {
btrfs_check_nocow_unlock(BTRFS_I(inode));
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved,
round_down(pos, fs_info->sectorsize),
release_bytes, true);
}
}
extent_changeset_free(data_reserved); if (num_written > 0) { pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
iocb->ki_pos += num_written;
}
out:
btrfs_inode_unlock(inode, ilock_flags); return num_written ? num_written : ret;
}
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
const struct iov_iter *iter, loff_t offset)
{
const u32 blocksize_mask = fs_info->sectorsize - 1;
if (offset & blocksize_mask)
return -EINVAL;
if (iov_iter_alignment(iter) & blocksize_mask)
return -EINVAL;
return 0;
}
static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC); struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
loff_t pos;
ssize_t written = 0;
ssize_t written_buffered;
size_t prev_left = 0;
loff_t endbyte;
ssize_t err;
unsigned int ilock_flags = 0;
if (iocb->ki_flags & IOCB_NOWAIT)
ilock_flags |= BTRFS_ILOCK_TRY;
/* If the write DIO is within EOF, use a shared lock */
if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode)) ilock_flags |= BTRFS_ILOCK_SHARED;
relock:
err = btrfs_inode_lock(inode, ilock_flags);
if (err < 0)
return err;
err = generic_write_checks(iocb, from);
if (err <= 0) {
btrfs_inode_unlock(inode, ilock_flags);
return err;
}
err = btrfs_write_check(iocb, from, err);
if (err < 0) {
btrfs_inode_unlock(inode, ilock_flags);
goto out;
}
pos = iocb->ki_pos;
/*
* Re-check since file size may have changed just before taking the
* lock or pos may have changed because of O_APPEND in generic_write_check()
*/
if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
pos + iov_iter_count(from) > i_size_read(inode)) { btrfs_inode_unlock(inode, ilock_flags);
ilock_flags &= ~BTRFS_ILOCK_SHARED;
goto relock;
}
if (check_direct_IO(fs_info, from, pos)) { btrfs_inode_unlock(inode, ilock_flags);
goto buffered;
}
/*
* We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
* calls generic_write_sync() (through iomap_dio_complete()), because
* that results in calling fsync (btrfs_sync_file()) which will try to
* lock the inode in exclusive/write mode.
*/
if (is_sync_write)
iocb->ki_flags &= ~IOCB_DSYNC;
/*
* The iov_iter can be mapped to the same file range we are writing to.
* If that's the case, then we will deadlock in the iomap code, because
* it first calls our callback btrfs_dio_iomap_begin(), which will create
* an ordered extent, and after that it will fault in the pages that the
* iov_iter refers to. During the fault in we end up in the readahead
* pages code (starting at btrfs_readahead()), which will lock the range,
* find that ordered extent and then wait for it to complete (at
* btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
* obviously the ordered extent can never complete as we didn't submit
* yet the respective bio(s). This always happens when the buffer is
* memory mapped to the same file range, since the iomap DIO code always
* invalidates pages in the target file range (after starting and waiting
* for any writeback).
*
* So here we disable page faults in the iov_iter and then retry if we
* got -EFAULT, faulting in the pages before the retry.
*/
again:
from->nofault = true;
err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
IOMAP_DIO_PARTIAL, written);
from->nofault = false;
/* No increment (+=) because iomap returns a cumulative value. */
if (err > 0)
written = err;
if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
const size_t left = iov_iter_count(from);
/*
* We have more data left to write. Try to fault in as many as
* possible of the remainder pages and retry. We do this without
* releasing and locking again the inode, to prevent races with
* truncate.
*
* Also, in case the iov refers to pages in the file range of the
* file we want to write to (due to a mmap), we could enter an
* infinite loop if we retry after faulting the pages in, since
* iomap will invalidate any pages in the range early on, before
* it tries to fault in the pages of the iov. So we keep track of
* how much was left of iov in the previous EFAULT and fallback
* to buffered IO in case we haven't made any progress.
*/
if (left == prev_left) {
err = -ENOTBLK;
} else {
fault_in_iov_iter_readable(from, left);
prev_left = left;
goto again;
}
}
btrfs_inode_unlock(inode, ilock_flags);
/*
* Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
* the fsync (call generic_write_sync()).
*/
if (is_sync_write)
iocb->ki_flags |= IOCB_DSYNC;
/* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
goto out;
buffered:
pos = iocb->ki_pos;
written_buffered = btrfs_buffered_write(iocb, from);
if (written_buffered < 0) {
err = written_buffered;
goto out;
}
/*
* Ensure all data is persisted. We want the next direct IO read to be
* able to read what was just written.
*/
endbyte = pos + written_buffered - 1;
err = btrfs_fdatawrite_range(inode, pos, endbyte);
if (err)
goto out;
err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
if (err)
goto out;
written += written_buffered;
iocb->ki_pos = pos + written_buffered;
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
endbyte >> PAGE_SHIFT);
out:
return err < 0 ? err : written;
}
static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct btrfs_inode *inode = BTRFS_I(file_inode(file));
ssize_t num_written = 0;
const bool sync = iocb->ki_flags & IOCB_DSYNC;
/*
* If the fs flips readonly due to some impossible error, although we
* have opened a file as writable, we have to stop this write operation
* to ensure consistency.
*/
if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
return -EROFS;
if (!(iocb->ki_flags & IOCB_DIRECT) &&
(iocb->ki_flags & IOCB_NOWAIT))
return -EOPNOTSUPP;
if (sync) atomic_inc(&inode->sync_writers); if (iocb->ki_flags & IOCB_DIRECT)
num_written = btrfs_direct_write(iocb, from);
else
num_written = btrfs_buffered_write(iocb, from);
btrfs_set_inode_last_sub_trans(inode);
if (num_written > 0)
num_written = generic_write_sync(iocb, num_written);
if (sync) atomic_dec(&inode->sync_writers);
current->backing_dev_info = NULL;
return num_written;
}
int btrfs_release_file(struct inode *inode, struct file *filp)
{
struct btrfs_file_private *private = filp->private_data; if (private && private->filldir_buf) kfree(private->filldir_buf); kfree(private);
filp->private_data = NULL;
/*
* Set by setattr when we are about to truncate a file from a non-zero
* size to a zero size. This tries to flush down new bytes that may
* have been written if the application were using truncate to replace
* a file in place.
*/
if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping); return 0;
}
static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
{
int ret;
struct blk_plug plug;
/*
* This is only called in fsync, which would do synchronous writes, so
* a plug can merge adjacent IOs as much as possible. Esp. in case of
* multiple disks using raid profile, a large IO can be split to
* several segments of stripe length (currently 64K).
*/
blk_start_plug(&plug);
atomic_inc(&BTRFS_I(inode)->sync_writers);
ret = btrfs_fdatawrite_range(inode, start, end);
atomic_dec(&BTRFS_I(inode)->sync_writers);
blk_finish_plug(&plug);
return ret;
}
static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
{
struct btrfs_inode *inode = BTRFS_I(ctx->inode);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (btrfs_inode_in_log(inode, fs_info->generation) &&
list_empty(&ctx->ordered_extents))
return true;
/*
* If we are doing a fast fsync we can not bail out if the inode's
* last_trans is <= then the last committed transaction, because we only
* update the last_trans of the inode during ordered extent completion,
* and for a fast fsync we don't wait for that, we only wait for the
* writeback to complete.
*/
if (inode->last_trans <= fs_info->last_trans_committed && (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
list_empty(&ctx->ordered_extents)))
return true;
return false;
}
/*
* fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible.
*
* It needs to call filemap_fdatawait so that all ordered extent updates are
* in the metadata btree are up to date for copying to the log.
*
* It drops the inode mutex before doing the tree log commit. This is an
* important optimization for directories because holding the mutex prevents
* new operations on the dir while we write to disk.
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
u64 len;
bool full_sync;
trace_btrfs_sync_file(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
/*
* Always set the range to a full range, otherwise we can get into
* several problems, from missing file extent items to represent holes
* when not using the NO_HOLES feature, to log tree corruption due to
* races between hole detection during logging and completion of ordered
* extents outside the range, to missing checksums due to ordered extents
* for which we flushed only a subset of their pages.
*/
start = 0;
end = LLONG_MAX;
len = (u64)LLONG_MAX + 1;
/*
* We write the dirty pages in the range and wait until they complete
* out of the ->i_mutex. If so, we can flush the dirty pages by
* multi-task, and make the performance up. See
* btrfs_wait_ordered_range for an explanation of the ASYNC check.
*/
ret = start_ordered_ops(inode, start, end);
if (ret)
goto out;
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
/*
* Always check for the full sync flag while holding the inode's lock,
* to avoid races with other tasks. The flag must be either set all the
* time during logging or always off all the time while logging.
*/
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
/*
* Before we acquired the inode's lock and the mmap lock, someone may
* have dirtied more pages in the target range. We need to make sure
* that writeback for any such pages does not start while we are logging
* the inode, because if it does, any of the following might happen when
* we are not doing a full inode sync:
*
* 1) We log an extent after its writeback finishes but before its
* checksums are added to the csum tree, leading to -EIO errors
* when attempting to read the extent after a log replay.
*
* 2) We can end up logging an extent before its writeback finishes.
* Therefore after the log replay we will have a file extent item
* pointing to an unwritten extent (and no data checksums as well).
*
* So trigger writeback for any eventual new dirty pages and then we
* wait for all ordered extents to complete below.
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
/*
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
*
* For a full fsync we wait for the ordered extents to complete while
* for a fast fsync we wait just for writeback to complete, and then
* attach the ordered extents to the transaction so that a transaction
* commit waits for their completion, to avoid data loss if we fsync,
* the current transaction commits before the ordered extents complete
* and a power failure happens right after that.
*
* For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
* logical address recorded in the ordered extent may change. We need
* to wait for the IO to stabilize the logical address.
*/
if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len);
} else {
/*
* Get our ordered extents as soon as possible to avoid doing
* checksum lookups in the csum tree, and use instead the
* checksums attached to the ordered extents.
*/
btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
&ctx.ordered_extents);
ret = filemap_fdatawait_range(inode->i_mapping, start, end);
}
if (ret)
goto out_release_extents;
atomic_inc(&root->log_batch);
smp_mb();
if (skip_inode_logging(&ctx)) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
* reason, it's no longer relevant.
*/
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
/*
* An ordered extent might have started before and completed
* already with io errors, in which case the inode was not
* updated and we end up here. So check the inode's mapping
* for any errors that might have happened since we last
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
goto out_release_extents;
}
/*
* We use start here because we will need to wait on the IO to complete
* in btrfs_sync_log, which could require joining a transaction (for
* example checking cross references in the nocow path). If we use join
* here we could get into a situation where we're waiting on IO to
* happen that is blocked on a transaction trying to commit. With start
* we inc the extwriter counter, so we wait for all extwriters to exit
* before we start blocking joiners. This comment is to keep somebody
* from thinking they are super smart and changing this to
* btrfs_join_transaction *cough*Josef*cough*.
*/
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_release_extents;
}
trans->in_fsync = true;
ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
}
/* we've logged all the items and now have a consistent
* version of the file in the log. It is possible that
* someone will come in and modify the file, but that's
* fine because the log is consistent on disk, and we
* have references to all of the file's extents
*
* It is possible that someone will come in and log the
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) { ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
ret = btrfs_end_transaction(trans);
goto out;
}
}
if (!full_sync) { ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
btrfs_end_transaction(trans);
goto out;
}
}
ret = btrfs_commit_transaction(trans);
} else {
ret = btrfs_end_transaction(trans);
}
out:
ASSERT(list_empty(&ctx.list));
err = file_check_and_advance_wb_err(file);
if (!ret)
ret = err;
return ret > 0 ? -EIO : ret;
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = btrfs_page_mkwrite,
};
static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct address_space *mapping = filp->f_mapping;
if (!mapping->a_ops->readpage)
return -ENOEXEC;
file_accessed(filp);
vma->vm_ops = &btrfs_file_vm_ops;
return 0;
}
static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
int slot, u64 start, u64 end)
{
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
if (slot < 0 || slot >= btrfs_header_nritems(leaf)) return 0;
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) return 0;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
return 0;
if (btrfs_file_extent_disk_bytenr(leaf, fi))
return 0;
if (key.offset == end)
return 1;
if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
return 1;
return 0;
}
static int fill_holes(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path, u64 offset, u64 end)
{
struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
struct extent_map *hole_em;
struct extent_map_tree *em_tree = &inode->extent_tree;
struct btrfs_key key;
int ret;
if (btrfs_fs_incompat(fs_info, NO_HOLES))
goto out;
key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = offset;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret <= 0) {
/*
* We should have dropped this offset, so if we find it then
* something has gone horribly wrong.
*/
if (ret == 0)
ret = -EINVAL;
return ret;
}
leaf = path->nodes[0];
if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
u64 num_bytes;
path->slots[0]--;
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
end - offset;
btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_mark_buffer_dirty(leaf);
goto out;
}
if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
u64 num_bytes;
key.offset = offset;
btrfs_set_item_key_safe(fs_info, path, &key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
offset;
btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
btrfs_set_file_extent_offset(leaf, fi, 0);
btrfs_mark_buffer_dirty(leaf);
goto out;
}
btrfs_release_path(path); ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
if (ret)
return ret;
out:
btrfs_release_path(path);
hole_em = alloc_extent_map();
if (!hole_em) {
btrfs_drop_extent_cache(inode, offset, end - 1, 0);
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
} else {
hole_em->start = offset;
hole_em->len = end - offset;
hole_em->ram_bytes = hole_em->len;
hole_em->orig_start = offset;
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
do {
btrfs_drop_extent_cache(inode, offset, end - 1, 0);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, hole_em, 1);
write_unlock(&em_tree->lock);
} while (ret == -EEXIST);
free_extent_map(hole_em); if (ret)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags);
}
return 0;
}
/*
* Find a hole extent on given inode and change start/len to the end of hole
* extent.(hole/vacuum extent whose em->start <= start &&
* em->start + em->len > start)
* When a hole extent is found, return 1 and modify start/len.
*/
static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map *em;
int ret = 0;
em = btrfs_get_extent(inode, NULL, 0,
round_down(*start, fs_info->sectorsize),
round_up(*len, fs_info->sectorsize));
if (IS_ERR(em))
return PTR_ERR(em);
/* Hole or vacuum extent(only exists in no-hole mode) */
if (em->block_start == EXTENT_MAP_HOLE) {
ret = 1;
*len = em->start + em->len > *start + *len ? 0 : *start + *len - em->start - em->len;
*start = em->start + em->len;
}
free_extent_map(em); return ret;
}
static int btrfs_punch_hole_lock_range(struct inode *inode,
const u64 lockstart,
const u64 lockend,
struct extent_state **cached_state)
{
/*
* For subpage case, if the range is not at page boundary, we could
* have pages at the leading/tailing part of the range.
* This could lead to dead loop since filemap_range_has_page()
* will always return true.
* So here we need to do extra page alignment for
* filemap_range_has_page().
*/
const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
int ret;
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
lockend);
/*
* We need to make sure we have no ordered extents in this range
* and nobody raced in and read a page in this range, if we did
* we need to try again.
*/
if ((!ordered ||
(ordered->file_offset + ordered->num_bytes <= lockstart ||
ordered->file_offset > lockend)) &&
!filemap_range_has_page(inode->i_mapping,
page_lockstart, page_lockend)) {
if (ordered)
btrfs_put_ordered_extent(ordered);
break;
}
if (ordered)
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
lockend, cached_state);
ret = btrfs_wait_ordered_range(inode, lockstart,
lockend - lockstart + 1);
if (ret)
return ret;
}
return 0;
}
static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_replace_extent_info *extent_info,
const u64 replace_len,
const u64 bytes_to_drop)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
int slot;
struct btrfs_ref ref = { 0 };
int ret;
if (replace_len == 0)
return 0;
if (extent_info->disk_offset == 0 && btrfs_fs_incompat(fs_info, NO_HOLES)) { btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
return 0;
}
key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = extent_info->file_offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_file_extent_item));
if (ret)
return ret;
leaf = path->nodes[0];
slot = path->slots[0];
write_extent_buffer(leaf, extent_info->extent_buf,
btrfs_item_ptr_offset(leaf, slot),
sizeof(struct btrfs_file_extent_item));
extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
if (extent_info->is_new_extent)
btrfs_set_file_extent_generation(leaf, extent, trans->transid); btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
replace_len);
if (ret)
return ret;
/* If it's a hole, nothing more needs to be done. */
if (extent_info->disk_offset == 0) {
btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
return 0;
}
btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop); if (extent_info->is_new_extent && extent_info->insertions == 0) { key.objectid = extent_info->disk_offset;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = extent_info->disk_len;
ret = btrfs_alloc_reserved_file_extent(trans, root,
btrfs_ino(inode),
extent_info->file_offset,
extent_info->qgroup_reserved,
&key);
} else {
u64 ref_offset;
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
extent_info->disk_offset,
extent_info->disk_len, 0);
ref_offset = extent_info->file_offset - extent_info->data_offset;
btrfs_init_data_ref(&ref, root->root_key.objectid,
btrfs_ino(inode), ref_offset);
ret = btrfs_inc_extent_ref(trans, &ref);
}
extent_info->insertions++; return ret;
}
/*
* The respective range must have been previously locked, as well as the inode.
* The end offset is inclusive (last byte of the range).
* @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
* the file range with an extent.
* When not punching a hole, we don't want to end up in a state where we dropped
* extents without inserting a new one, so we must abort the transaction to avoid
* a corruption.
*/
int btrfs_replace_file_extents(struct btrfs_inode *inode,
struct btrfs_path *path, const u64 start,
const u64 end,
struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
struct btrfs_block_rsv *rsv;
unsigned int rsv_count;
u64 cur_offset;
u64 len = end - start;
int ret = 0;
if (end <= start)
return -EINVAL;
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv) {
ret = -ENOMEM;
goto out;
}
rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
rsv->failfast = 1;
/*
* 1 - update the inode
* 1 - removing the extents in the range
* 1 - adding the hole extent if no_holes isn't set or if we are
* replacing the range with a new extent
*/
if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
rsv_count = 3;
else
rsv_count = 2;
trans = btrfs_start_transaction(root, rsv_count);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out_free;
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
min_size, false);
BUG_ON(ret); trans->block_rsv = rsv;
cur_offset = start;
drop_args.path = path;
drop_args.end = end + 1;
drop_args.drop_cache = true;
while (cur_offset < end) { drop_args.start = cur_offset;
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
/* If we are punching a hole decrement the inode's byte count */
if (!extent_info)
btrfs_update_inode_bytes(inode, 0,
drop_args.bytes_found);
if (ret != -ENOSPC) {
/*
* The only time we don't want to abort is if we are
* attempting to clone a partial inline extent, in which
* case we'll get EOPNOTSUPP. However if we aren't
* clone we need to abort no matter what, because if we
* got EOPNOTSUPP via prealloc then we messed up and
* need to abort.
*/
if (ret && (ret != -EOPNOTSUPP || (extent_info && extent_info->is_new_extent))) btrfs_abort_transaction(trans, ret);
break;
}
trans->block_rsv = &fs_info->trans_block_rsv; if (!extent_info && cur_offset < drop_args.drop_end &&
cur_offset < ino_size) {
ret = fill_holes(trans, inode, path, cur_offset,
drop_args.drop_end);
if (ret) {
/*
* If we failed then we didn't insert our hole
* entries for the area we dropped, so now the
* fs is corrupted, so we must abort the
* transaction.
*/
btrfs_abort_transaction(trans, ret);
break;
}
} else if (!extent_info && cur_offset < drop_args.drop_end) {
/*
* We are past the i_size here, but since we didn't
* insert holes we need to clear the mapped area so we
* know to not set disk_i_size in this area until a new
* file extent is inserted here.
*/
ret = btrfs_inode_clear_file_extent_range(inode,
cur_offset,
drop_args.drop_end - cur_offset);
if (ret) {
/*
* We couldn't clear our area, so we could
* presumably adjust up and corrupt the fs, so
* we need to abort.
*/
btrfs_abort_transaction(trans, ret);
break;
}
}
if (extent_info &&
drop_args.drop_end > extent_info->file_offset) {
u64 replace_len = drop_args.drop_end -
extent_info->file_offset;
ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, replace_len,
drop_args.bytes_found);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
extent_info->data_len -= replace_len;
extent_info->data_offset += replace_len;
extent_info->file_offset += replace_len;
}
ret = btrfs_update_inode(trans, root, inode);
if (ret)
break;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
trans = btrfs_start_transaction(root, rsv_count);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
break;
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv;
cur_offset = drop_args.drop_end;
len = end - cur_offset;
if (!extent_info && len) { ret = find_first_non_hole(inode, &cur_offset, &len);
if (unlikely(ret < 0))
break;
if (ret && !len) {
ret = 0;
break;
}
}
}
/*
* If we were cloning, force the next fsync to be a full one since we
* we replaced (or just dropped in the case of cloning holes when
* NO_HOLES is enabled) file extent items and did not setup new extent
* maps for the replacement extents (or holes).
*/
if (extent_info && !extent_info->is_new_extent) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); if (ret)
goto out_trans;
trans->block_rsv = &fs_info->trans_block_rsv;
/*
* If we are using the NO_HOLES feature we might have had already an
* hole that overlaps a part of the region [lockstart, lockend] and
* ends at (or beyond) lockend. Since we have no file extent items to
* represent holes, drop_end can be less than lockend and so we must
* make sure we have an extent map representing the existing hole (the
* call to __btrfs_drop_extents() might have dropped the existing extent
* map representing the existing hole), otherwise the fast fsync path
* will not record the existence of the hole region
* [existing_hole_start, lockend].
*/
if (drop_args.drop_end <= end)
drop_args.drop_end = end + 1;
/*
* Don't insert file hole extent item if it's for a range beyond eof
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
*/
if (!extent_info && cur_offset < ino_size &&
cur_offset < drop_args.drop_end) {
ret = fill_holes(trans, inode, path, cur_offset,
drop_args.drop_end);
if (ret) {
/* Same comment as above. */
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
} else if (!extent_info && cur_offset < drop_args.drop_end) {
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
drop_args.drop_end - cur_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
}
if (extent_info) {
ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, extent_info->data_len,
drop_args.bytes_found);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
}
out_trans:
if (!trans)
goto out_free;
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret)
btrfs_end_transaction(trans);
else
*trans_out = trans;
out_free:
btrfs_free_block_rsv(fs_info, rsv);
out:
return ret;
}
static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
u64 lockstart;
u64 lockend;
u64 tail_start;
u64 tail_len;
u64 orig_start = offset;
int ret = 0;
bool same_block;
u64 ino_size;
bool truncated_block = false;
bool updated_inode = false;
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
return ret;
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); ino_size = round_up(inode->i_size, fs_info->sectorsize);
ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
goto out_only_mutex;
if (ret && !len) {
/* Already in a large hole */
ret = 0;
goto out_only_mutex;
}
ret = file_modified(file);
if (ret)
goto out_only_mutex;
lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len,
btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
* We needn't truncate any block which is beyond the end of the file
* because we are sure there is no data there.
*/
/*
* Only do this if we are in the same block and we aren't doing the
* entire block.
*/
if (same_block && len < fs_info->sectorsize) { if (offset < ino_size) {
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
0);
} else {
ret = 0;
}
goto out_only_mutex;
}
/* zero back part of the first block */
if (offset < ino_size) {
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) { btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
}
/* Check the aligned pages after the first unaligned page,
* if offset != orig_start, which means the first unaligned page
* including several following pages are already in holes,
* the extra check can be skipped */
if (offset == orig_start) {
/* after truncate page, check hole again */
len = offset + len - lockstart;
offset = lockstart;
ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
goto out_only_mutex;
if (ret && !len) {
ret = 0;
goto out_only_mutex;
}
lockstart = offset;
}
/* Check the tail unaligned part is in a hole */
tail_start = lockend + 1;
tail_len = offset + len - tail_start;
if (tail_len) {
ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
if (unlikely(ret < 0))
goto out_only_mutex;
if (!ret) {
/* zero the front end of the last page */
if (tail_start + tail_len < ino_size) {
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode),
tail_start + tail_len,
0, 1);
if (ret)
goto out_only_mutex;
}
}
}
if (lockend < lockstart) {
ret = 0;
goto out_only_mutex;
}
ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
&cached_state);
if (ret)
goto out_only_mutex;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
lockend, NULL, &trans);
btrfs_free_path(path);
if (ret)
goto out;
ASSERT(trans != NULL);
inode_inc_iversion(inode);
inode->i_mtime = inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
updated_inode = true;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
out_only_mutex:
if (!updated_inode && truncated_block && !ret) {
/*
* If we only end up zeroing part of a page, we still need to
* update the inode item, so that all the time fields are
* updated as well as the necessary btrfs inode in memory fields
* for detecting, at fsync time, if the inode isn't yet in the
* log tree or it's there but not up to date.
*/
struct timespec64 now = current_time(inode);
inode_inc_iversion(inode);
inode->i_mtime = now;
inode->i_ctime = now;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
} else {
int ret2;
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
if (!ret)
ret = ret2;
}
}
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
/* Helper structure to record which range is already reserved */
struct falloc_range {
struct list_head list;
u64 start;
u64 len;
};
/*
* Helper function to add falloc range
*
* Caller should have locked the larger range of extent containing
* [start, len)
*/
static int add_falloc_range(struct list_head *head, u64 start, u64 len)
{
struct falloc_range *range = NULL;
if (!list_empty(head)) {
/*
* As fallocate iterates by bytenr order, we only need to check
* the last range.
*/
range = list_last_entry(head, struct falloc_range, list);
if (range->start + range->len == start) {
range->len += len;
return 0;
}
}
range = kmalloc(sizeof(*range), GFP_KERNEL);
if (!range)
return -ENOMEM;
range->start = start;
range->len = len;
list_add_tail(&range->list, head);
return 0;
}
static int btrfs_fallocate_update_isize(struct inode *inode,
const u64 end,
const int mode)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
int ret2;
if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) return 0;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans))
return PTR_ERR(trans);
inode->i_ctime = current_time(inode);
i_size_write(inode, end);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
return ret ? ret : ret2;
}
enum {
RANGE_BOUNDARY_WRITTEN_EXTENT,
RANGE_BOUNDARY_PREALLOC_EXTENT,
RANGE_BOUNDARY_HOLE,
};
static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
u64 offset)
{
const u64 sectorsize = btrfs_inode_sectorsize(inode);
struct extent_map *em;
int ret;
offset = round_down(offset, sectorsize);
em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em); if (em->block_start == EXTENT_MAP_HOLE)
ret = RANGE_BOUNDARY_HOLE;
else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
else
ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
free_extent_map(em); return ret;
}
static int btrfs_zero_range(struct inode *inode,
loff_t offset,
loff_t len,
const int mode)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct extent_map *em;
struct extent_changeset *data_reserved = NULL;
int ret;
u64 alloc_hint = 0;
const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
bool space_reserved = false;
inode_dio_wait(inode);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
alloc_end - alloc_start);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
}
/*
* Avoid hole punching and extent allocation for some cases. More cases
* could be considered, but these are unlikely common and we keep things
* as simple as possible for now. Also, intentionally, if the target
* range contains one or more prealloc extents together with regular
* extents and holes, we drop all the existing extents and allocate a
* new prealloc extent, so that we get a larger contiguous disk extent.
*/
if (em->start <= alloc_start && test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { const u64 em_end = em->start + em->len;
if (em_end >= offset + len) {
/*
* The whole range is already a prealloc extent,
* do nothing except updating the inode's i_size if
* needed.
*/
free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
goto out;
}
/*
* Part of the range is already a prealloc extent, so operate
* only on the remaining part of the range.
*/
alloc_start = em_end;
ASSERT(IS_ALIGNED(alloc_start, sectorsize));
len = offset + len - alloc_start;
offset = alloc_start;
alloc_hint = em->block_start + em->len;
}
free_extent_map(em);
if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
sectorsize);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
}
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
goto out;
}
if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { free_extent_map(em);
ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
0);
if (!ret)
ret = btrfs_fallocate_update_isize(inode,
offset + len,
mode);
return ret;
}
free_extent_map(em);
alloc_start = round_down(offset, sectorsize);
alloc_end = alloc_start + sectorsize;
goto reserve_space;
}
alloc_start = round_up(offset, sectorsize);
alloc_end = round_down(offset + len, sectorsize);
/*
* For unaligned ranges, check the pages at the boundaries, they might
* map to an extent, in which case we need to partially zero them, or
* they might map to a hole, in which case we need our allocation range
* to cover them.
*/
if (!IS_ALIGNED(offset, sectorsize)) {
ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
offset);
if (ret < 0)
goto out;
if (ret == RANGE_BOUNDARY_HOLE) { alloc_start = round_down(offset, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret)
goto out;
} else {
ret = 0;
}
}
if (!IS_ALIGNED(offset + len, sectorsize)) { ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
offset + len);
if (ret < 0)
goto out;
if (ret == RANGE_BOUNDARY_HOLE) { alloc_end = round_up(offset + len, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
0, 1);
if (ret)
goto out;
} else {
ret = 0;
}
}
reserve_space:
if (alloc_start < alloc_end) { struct extent_state *cached_state = NULL;
const u64 lockstart = alloc_start;
const u64 lockend = alloc_end - 1;
bytes_to_reserve = alloc_end - alloc_start;
ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
bytes_to_reserve);
if (ret < 0)
goto out;
space_reserved = true;
ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
&cached_state);
if (ret)
goto out;
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret) {
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
lockend, &cached_state);
goto out;
}
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
i_blocksize(inode),
offset + len, &alloc_hint);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
lockend, &cached_state);
/* btrfs_prealloc_file_range releases reserved space on error */
if (ret) {
space_reserved = false;
goto out;
}
}
ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
out:
if (ret && space_reserved)
btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
alloc_start, bytes_to_reserve);
extent_changeset_free(data_reserved);
return ret;
}
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
struct falloc_range *range;
struct falloc_range *tmp;
struct list_head reserve_list;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
u64 actual_end = 0;
struct extent_map *em;
int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
/* Do not allow fallocate in ZONED mode */
if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
return -EOPNOTSUPP;
alloc_start = round_down(offset, blocksize);
alloc_end = round_up(offset + len, blocksize);
cur_offset = alloc_start;
/* Make sure we aren't being give some crap mode */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE) return btrfs_punch_hole(file, offset, len);
/*
* Only trigger disk allocation, don't trigger qgroup reserve
*
* For qgroup space, it will be checked later.
*/
if (!(mode & FALLOC_FL_ZERO_RANGE)) { ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
alloc_end - alloc_start);
if (ret < 0)
return ret;
}
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len);
if (ret)
goto out;
}
ret = file_modified(file);
if (ret)
goto out;
/*
* TODO: Move these two operations after we have checked
* accurate reserved space, or fallocate can still fail but
* with page truncated or size expanded.
*
* But that's a minor problem and won't do much harm BTW.
*/
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
alloc_start);
if (ret)
goto out;
} else if (offset + len > inode->i_size) {
/*
* If we are fallocating from the end of the file onward we
* need to zero out the end of the block if i_size lands in the
* middle of a block.
*/
ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
if (ret)
goto out;
}
/*
* wait for ordered IO before we have any locks. We'll loop again
* below with the locks held.
*/
ret = btrfs_wait_ordered_range(inode, alloc_start,
alloc_end - alloc_start);
if (ret)
goto out;
if (mode & FALLOC_FL_ZERO_RANGE) {
ret = btrfs_zero_range(inode, offset, len, mode);
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
/* the extent lock is ordered inside the running
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
locked_end);
if (ordered &&
ordered->file_offset + ordered->num_bytes > alloc_start &&
ordered->file_offset < alloc_end) {
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
alloc_start, locked_end,
&cached_state);
/*
* we can't wait on the range with the transaction
* running or with the extent lock held
*/
ret = btrfs_wait_ordered_range(inode, alloc_start,
alloc_end - alloc_start);
if (ret)
goto out;
} else {
if (ordered)
btrfs_put_ordered_extent(ordered);
break;
}
}
/* First, check if we exceed the qgroup limit */
INIT_LIST_HEAD(&reserve_list);
while (cur_offset < alloc_end) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
alloc_end - cur_offset);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
break;
}
last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { ret = add_falloc_range(&reserve_list, cur_offset,
last_byte - cur_offset);
if (ret < 0) {
free_extent_map(em);
break;
}
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
&data_reserved, cur_offset,
last_byte - cur_offset);
if (ret < 0) {
cur_offset = last_byte;
free_extent_map(em);
break;
}
} else {
/*
* Do not need to reserve unwritten extent for this
* range, free reserved data space first, otherwise
* it'll result in false ENOSPC error.
*/
btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, cur_offset,
last_byte - cur_offset);
}
free_extent_map(em);
cur_offset = last_byte;
}
/*
* If ret is still 0, means we're OK to fallocate.
* Or just cleanup the list and exit.
*/
list_for_each_entry_safe(range, tmp, &reserve_list, list) { if (!ret)
ret = btrfs_prealloc_file_range(inode, mode,
range->start,
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
else
btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, range->start,
range->len);
list_del(&range->list);
kfree(range);
}
if (ret < 0)
goto out_unlock;
/*
* We didn't need to allocate any more space, but we still extended the
* size of the file so we need to update i_size and the inode item.
*/
ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state);
out:
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
/* Let go of our reservation. */
if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
cur_offset, alloc_end - cur_offset);
extent_changeset_free(data_reserved); return ret;
}
static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
int whence)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
loff_t i_size = inode->vfs_inode.i_size;
u64 lockstart;
u64 lockend;
u64 start;
u64 len;
int ret = 0;
if (i_size == 0 || offset >= i_size)
return -ENXIO;
/*
* offset can be negative, in this case we start finding DATA/HOLE from
* the very start of the file.
*/
start = max_t(loff_t, 0, offset);
lockstart = round_down(start, fs_info->sectorsize);
lockend = round_up(i_size, fs_info->sectorsize);
if (lockend <= lockstart)
lockend = lockstart + fs_info->sectorsize; lockend--;
len = lockend - lockstart + 1;
lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
while (start < i_size) {
em = btrfs_get_extent_fiemap(inode, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
em = NULL;
break;
}
if (whence == SEEK_HOLE &&
(em->block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
break;
else if (whence == SEEK_DATA &&
(em->block_start != EXTENT_MAP_HOLE &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
break;
start = em->start + em->len;
free_extent_map(em);
em = NULL;
cond_resched();
}
free_extent_map(em);
unlock_extent_cached(&inode->io_tree, lockstart, lockend,
&cached_state);
if (ret) {
offset = ret;
} else {
if (whence == SEEK_DATA && start >= i_size)
offset = -ENXIO;
else
offset = min_t(loff_t, start, i_size);
}
return offset;
}
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host; switch (whence) {
default:
return generic_file_llseek(file, offset, whence);
case SEEK_DATA:
case SEEK_HOLE:
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
offset = find_desired_extent(BTRFS_I(inode), offset, whence);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
break;
}
if (offset < 0)
return offset;
return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
int ret;
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
ret = fsverity_file_open(inode, filp);
if (ret)
return ret;
return generic_file_open(inode, filp);
}
static int check_direct_read(struct btrfs_fs_info *fs_info,
const struct iov_iter *iter, loff_t offset)
{
int ret;
int i, seg;
ret = check_direct_IO(fs_info, iter, offset);
if (ret < 0)
return ret;
if (!iter_is_iovec(iter))
return 0;
for (seg = 0; seg < iter->nr_segs; seg++) for (i = seg + 1; i < iter->nr_segs; i++) if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
return -EINVAL;
return 0;
}
static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
size_t prev_left = 0;
ssize_t read = 0;
ssize_t ret;
if (fsverity_active(inode))
return 0;
if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
return 0;
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
again:
/*
* This is similar to what we do for direct IO writes, see the comment
* at btrfs_direct_write(), but we also disable page faults in addition
* to disabling them only at the iov_iter level. This is because when
* reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
* which can still trigger page fault ins despite having set ->nofault
* to true of our 'to' iov_iter.
*
* The difference to direct IO writes is that we deadlock when trying
* to lock the extent range in the inode's tree during he page reads
* triggered by the fault in (while for writes it is due to waiting for
* our own ordered extent). This is because for direct IO reads,
* btrfs_dio_iomap_begin() returns with the extent range locked, which
* is only unlocked in the endio callback (end_bio_extent_readpage()).
*/
pagefault_disable();
to->nofault = true;
ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
IOMAP_DIO_PARTIAL, read);
to->nofault = false;
pagefault_enable();
/* No increment (+=) because iomap returns a cumulative value. */
if (ret > 0)
read = ret;
if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
const size_t left = iov_iter_count(to);
if (left == prev_left) {
/*
* We didn't make any progress since the last attempt,
* fallback to a buffered read for the remainder of the
* range. This is just to avoid any possibility of looping
* for too long.
*/
ret = read;
} else {
/*
* We made some progress since the last retry or this is
* the first time we are retrying. Fault in as many pages
* as possible and retry.
*/
fault_in_iov_iter_writeable(to, left);
prev_left = left;
goto again;
}
}
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret < 0 ? ret : read;
}
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t ret = 0;
if (iocb->ki_flags & IOCB_DIRECT) {
ret = btrfs_direct_read(iocb, to);
if (ret < 0 || !iov_iter_count(to) || iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
return ret;
}
return filemap_read(iocb, to, ret);
}
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
.read_iter = btrfs_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
.splice_write = iter_file_splice_write,
.mmap = btrfs_file_mmap,
.open = btrfs_file_open,
.release = btrfs_release_file,
.fsync = btrfs_sync_file,
.fallocate = btrfs_fallocate,
.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_compat_ioctl,
#endif
.remap_file_range = btrfs_remap_file_range,
};
void __cold btrfs_auto_defrag_exit(void)
{
kmem_cache_destroy(btrfs_inode_defrag_cachep);
}
int __init btrfs_auto_defrag_init(void)
{
btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
sizeof(struct inode_defrag), 0,
SLAB_MEM_SPREAD,
NULL);
if (!btrfs_inode_defrag_cachep)
return -ENOMEM;
return 0;
}
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
{
int ret;
/*
* So with compression we will find and lock a dirty page and clear the
* first one as dirty, setup an async extent, and immediately return
* with the entire range locked but with nobody actually marked with
* writeback. So we can't just filemap_write_and_wait_range() and
* expect it to work since it will just kick off a thread to do the
* actual work. So we need to call filemap_fdatawrite_range _again_
* since it will wait on the page lock, which won't be unlocked until
* after the pages have been marked as writeback and so we're good to go
* from there. We have to do this otherwise we'll miss the ordered
* extents and that results in badness. Please Josef, do not think you
* know better and pull this out at some point in the future, it is
* right and you are wrong.
*/
ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags)) ret = filemap_fdatawrite_range(inode->i_mapping, start, end); return ret;
}
/*
* mm/rmap.c - physical to virtual reverse mappings
*
* Copyright 2001, Rik van Riel <riel@conectiva.com.br>
* Released under the General Public License (GPL).
*
* Simple, low overhead reverse mapping scheme.
* Please try to keep this thing as modular as possible.
*
* Provides methods for unmapping each kind of mapped page:
* the anon methods track anonymous pages, and
* the file methods track pages belonging to an inode.
*
* Original design by Rik van Riel <riel@conectiva.com.br> 2001
* File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
* Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
* Contributions by Hugh Dickins 2003, 2004
*/
/*
* Lock ordering in mm:
*
* inode->i_rwsem (while writing or truncating, not reading or faulting)
* mm->mmap_lock
* mapping->invalidate_lock (in filemap_fault)
* page->flags PG_locked (lock_page) * (see hugetlbfs below)
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
* mapping->i_mmap_rwsem
* hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
* lock_page_memcg move_lock (in __set_page_dirty_buffers)
* i_pages lock (widely used)
* lruvec->lru_lock (in lock_page_lruvec_irq)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
* i_pages lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
* within bdi.wb->list_lock in __sync_single_inode)
*
* anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
*
* * hugetlbfs PageHuge() pages take locks in this order:
* mapping->i_mmap_rwsem
* hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* page->flags PG_locked (lock_page)
*/
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <asm/tlbflush.h>
#include <trace/events/tlb.h>
#include "internal.h"
static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;
static inline struct anon_vma *anon_vma_alloc(void)
{
struct anon_vma *anon_vma;
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
atomic_set(&anon_vma->refcount, 1);
anon_vma->degree = 1; /* Reference for first vma */
anon_vma->parent = anon_vma;
/*
* Initialise the anon_vma root to point to itself. If called
* from fork, the root will be reset to the parents anon_vma.
*/
anon_vma->root = anon_vma;
}
return anon_vma;
}
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
VM_BUG_ON(atomic_read(&anon_vma->refcount));
/*
* Synchronize against page_lock_anon_vma_read() such that
* we can safely hold the lock without the anon_vma getting
* freed.
*
* Relies on the full mb implied by the atomic_dec_and_test() from
* put_anon_vma() against the acquire barrier implied by
* down_read_trylock() from page_lock_anon_vma_read(). This orders:
*
* page_lock_anon_vma_read() VS put_anon_vma()
* down_read_trylock() atomic_dec_and_test()
* LOCK MB
* atomic_read() rwsem_is_locked()
*
* LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows.
*/
might_sleep();
if (rwsem_is_locked(&anon_vma->root->rwsem)) {
anon_vma_lock_write(anon_vma);
anon_vma_unlock_write(anon_vma);
}
kmem_cache_free(anon_vma_cachep, anon_vma);
}
static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}
static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}
static void anon_vma_chain_link(struct vm_area_struct *vma,
struct anon_vma_chain *avc,
struct anon_vma *anon_vma)
{
avc->vma = vma;
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}
/**
* __anon_vma_prepare - attach an anon_vma to a memory region
* @vma: the memory region in question
*
* This makes sure the memory mapping described by 'vma' has
* an 'anon_vma' attached to it, so that we can associate the
* anonymous pages mapped into it with that anon_vma.
*
* The common case will be that we already have one, which
* is handled inline by anon_vma_prepare(). But if
* not we either need to find an adjacent mapping that we
* can re-use the anon_vma from (very common when the only
* reason for splitting a vma has been mprotect()), or we
* allocate a new one.
*
* Anon-vma allocations are very subtle, because we may have
* optimistically looked up an anon_vma in page_lock_anon_vma_read()
* and that may actually touch the rwsem even in the newly
* allocated vma (it depends on RCU to make sure that the
* anon_vma isn't actually destroyed).
*
* As a result, we need to do proper anon_vma locking even
* for the new allocation. At the same time, we do not want
* to do any locking for the common case of already having
* an anon_vma.
*
* This must be called with the mmap_lock held for reading.
*/
int __anon_vma_prepare(struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
struct anon_vma *anon_vma, *allocated;
struct anon_vma_chain *avc;
might_sleep();
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_enomem;
anon_vma = find_mergeable_anon_vma(vma);
allocated = NULL;
if (!anon_vma) {
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
allocated = anon_vma;
}
anon_vma_lock_write(anon_vma);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
anon_vma_chain_link(vma, avc, anon_vma);
/* vma reference or self-parent link for new root */
anon_vma->degree++;
allocated = NULL;
avc = NULL;
}
spin_unlock(&mm->page_table_lock);
anon_vma_unlock_write(anon_vma);
if (unlikely(allocated))
put_anon_vma(allocated);
if (unlikely(avc))
anon_vma_chain_free(avc);
return 0;
out_enomem_free_avc:
anon_vma_chain_free(avc);
out_enomem:
return -ENOMEM;
}
/*
* This is a useful helper function for locking the anon_vma root as
* we traverse the vma->anon_vma_chain, looping over anon_vma's that
* have the same vma.
*
* Such anon_vma's should have the same root, so you'd expect to see
* just a single mutex_lock for the whole traversal.
*/
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
struct anon_vma *new_root = anon_vma->root;
if (new_root != root) {
if (WARN_ON_ONCE(root))
up_write(&root->rwsem);
root = new_root;
down_write(&root->rwsem);
}
return root;
}
static inline void unlock_anon_vma_root(struct anon_vma *root)
{
if (root)
up_write(&root->rwsem);
}
/*
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
*
* anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and
* anon_vma_fork(). The first three want an exact copy of src, while the last
* one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
* endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
* we can identify this case by checking (!dst->anon_vma && src->anon_vma).
*
* If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
* and reuse existing anon_vma which has no vmas and only one child anon_vma.
* This prevents degradation of anon_vma hierarchy to endless linear chain in
* case of constantly forking task. On the other hand, an anon_vma with more
* than one child isn't reused even if there was no alive vma, thus rmap
* walker has a good chance of avoiding scanning the whole hierarchy when it
* searches where page is mapped.
*/
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
struct anon_vma_chain *avc, *pavc;
struct anon_vma *root = NULL;
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma;
avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
if (unlikely(!avc)) {
unlock_anon_vma_root(root);
root = NULL;
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto enomem_failure;
}
anon_vma = pavc->anon_vma;
root = lock_anon_vma_root(root, anon_vma);
anon_vma_chain_link(dst, avc, anon_vma);
/*
* Reuse existing anon_vma if its degree lower than two,
* that means it has no vma and only one anon_vma child.
*
* Do not chose parent anon_vma, otherwise first child
* will always reuse it. Root anon_vma is never reused:
* it has self-parent reference and at least one child.
*/
if (!dst->anon_vma && src->anon_vma &&
anon_vma != src->anon_vma && anon_vma->degree < 2)
dst->anon_vma = anon_vma;
}
if (dst->anon_vma)
dst->anon_vma->degree++;
unlock_anon_vma_root(root);
return 0;
enomem_failure:
/*
* dst->anon_vma is dropped here otherwise its degree can be incorrectly
* decremented in unlink_anon_vmas().
* We can safely do this because callers of anon_vma_clone() don't care
* about dst->anon_vma if anon_vma_clone() failed.
*/
dst->anon_vma = NULL;
unlink_anon_vmas(dst);
return -ENOMEM;
}
/*
* Attach vma to its own anon_vma, as well as to the anon_vmas that
* the corresponding VMA in the parent process is attached to.
* Returns 0 on success, non-zero on failure.
*/
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
struct anon_vma_chain *avc;
struct anon_vma *anon_vma;
int error;
/* Don't bother if the parent process has no anon_vma here. */
if (!pvma->anon_vma)
return 0;
/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
vma->anon_vma = NULL;
/*
* First, attach the new VMA to the parent VMA's anon_vmas,
* so rmap can find non-COWed pages in child processes.
*/
error = anon_vma_clone(vma, pvma);
if (error)
return error;
/* An existing anon_vma has been reused, all done then. */
if (vma->anon_vma)
return 0;
/* Then add our own anon_vma. */
anon_vma = anon_vma_alloc();
if (!anon_vma)
goto out_error;
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_error_free_anon_vma;
/*
* The root anon_vma's rwsem is the lock actually used when we
* lock any of the anon_vmas in this anon_vma tree.
*/
anon_vma->root = pvma->anon_vma->root;
anon_vma->parent = pvma->anon_vma;
/*
* With refcounts, an anon_vma can stay around longer than the
* process it belongs to. The root anon_vma needs to be pinned until
* this anon_vma is freed, because the lock lives in the root.
*/
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
anon_vma->parent->degree++;
anon_vma_unlock_write(anon_vma);
return 0;
out_error_free_anon_vma:
put_anon_vma(anon_vma);
out_error:
unlink_anon_vmas(vma);
return -ENOMEM;
}
void unlink_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
struct anon_vma *root = NULL;
/*
* Unlink each anon_vma chained to the VMA. This list is ordered
* from newest to oldest, ensuring the root anon_vma gets freed last.
*/
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
root = lock_anon_vma_root(root, anon_vma);
anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
/*
* Leave empty anon_vmas on the list - we'll need
* to free them outside the lock.
*/
if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
anon_vma->parent->degree--;
continue;
}
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
if (vma->anon_vma) {
vma->anon_vma->degree--;
/*
* vma would still be needed after unlink, and anon_vma will be prepared
* when handle fault.
*/
vma->anon_vma = NULL;
}
unlock_anon_vma_root(root);
/*
* Iterate the list once more, it now only contains empty and unlinked
* anon_vmas, destroy them. Could not do before due to __put_anon_vma()
* needing to write-acquire the anon_vma->root->rwsem.
*/
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
VM_WARN_ON(anon_vma->degree);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
}
static void anon_vma_ctor(void *data)
{
struct anon_vma *anon_vma = data;
init_rwsem(&anon_vma->rwsem);
atomic_set(&anon_vma->refcount, 0);
anon_vma->rb_root = RB_ROOT_CACHED;
}
void __init anon_vma_init(void)
{
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
anon_vma_ctor);
anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
SLAB_PANIC|SLAB_ACCOUNT);
}
/*
* Getting a lock on a stable anon_vma from a page off the LRU is tricky!
*
* Since there is no serialization what so ever against page_remove_rmap()
* the best this function can do is return a refcount increased anon_vma
* that might have been relevant to this page.
*
* The page might have been remapped to a different anon_vma or the anon_vma
* returned may already be freed (and even reused).
*
* In case it was remapped to a different anon_vma, the new anon_vma will be a
* child of the old anon_vma, and the anon_vma lifetime rules will therefore
* ensure that any anon_vma obtained from the page will still be valid for as
* long as we observe page_mapped() [ hence all those page_mapped() tests ].
*
* All users of this function must be very careful when walking the anon_vma
* chain and verify that the page in question is indeed mapped in it
* [ something equivalent to page_mapped_in_vma() ].
*
* Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
* page_remove_rmap() that the anon_vma pointer from page->mapping is valid
* if there is a mapcount, we can dereference the anon_vma after observing
* those.
*/
struct anon_vma *page_get_anon_vma(struct page *page)
{
struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
rcu_read_lock();
anon_mapping = (unsigned long)READ_ONCE(page->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
if (!page_mapped(page))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
if (!atomic_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
}
/*
* If this page is still mapped, then its anon_vma cannot have been
* freed. But if it has been unmapped, we have no security against the
* anon_vma structure being freed and reused (for another anon_vma:
* SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
* above cannot corrupt).
*/
if (!page_mapped(page)) {
rcu_read_unlock();
put_anon_vma(anon_vma);
return NULL;
}
out:
rcu_read_unlock();
return anon_vma;
}
/*
* Similar to page_get_anon_vma() except it locks the anon_vma.
*
* Its a little more complex as it tries to keep the fast path to a single
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
* reference like with page_get_anon_vma() and then block on the mutex.
*/
struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma;
unsigned long anon_mapping;
rcu_read_lock();
anon_mapping = (unsigned long)READ_ONCE(page->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
if (!page_mapped(page))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
root_anon_vma = READ_ONCE(anon_vma->root);
if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
* If the page is still mapped, then this anon_vma is still
* its anon_vma, and holding the mutex ensures that it will
* not go away, see anon_vma_free().
*/
if (!page_mapped(page)) {
up_read(&root_anon_vma->rwsem);
anon_vma = NULL;
}
goto out;
}
/* trylock failed, we got to sleep */
if (!atomic_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
goto out;
}
if (!page_mapped(page)) {
rcu_read_unlock();
put_anon_vma(anon_vma);
return NULL;
}
/* we pinned the anon_vma, its safe to sleep */
rcu_read_unlock();
anon_vma_lock_read(anon_vma);
if (atomic_dec_and_test(&anon_vma->refcount)) {
/*
* Oops, we held the last refcount, release the lock
* and bail -- can't simply use put_anon_vma() because
* we'll deadlock on the anon_vma_lock_write() recursion.
*/
anon_vma_unlock_read(anon_vma);
__put_anon_vma(anon_vma);
anon_vma = NULL;
}
return anon_vma;
out:
rcu_read_unlock();
return anon_vma;
}
void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
anon_vma_unlock_read(anon_vma);
}
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
* Flush TLB entries for recently unmapped pages from remote CPUs. It is
* important if a PTE was dirty when it was unmapped that it's flushed
* before any IO is initiated on the page to prevent lost writes. Similarly,
* it must be flushed before freeing to prevent data leakage.
*/
void try_to_unmap_flush(void)
{
struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc;
if (!tlb_ubc->flush_required)
return;
arch_tlbbatch_flush(&tlb_ubc->arch);
tlb_ubc->flush_required = false;
tlb_ubc->writable = false;
}
/* Flush iff there are potentially writable TLB entries that can race with IO */
void try_to_unmap_flush_dirty(void)
{
struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc;
if (tlb_ubc->writable)
try_to_unmap_flush();
}
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc;
arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
/*
* Ensure compiler does not re-order the setting of tlb_flush_batched
* before the PTE is cleared.
*/
barrier();
mm->tlb_flush_batched = true;
/*
* If the PTE was dirty then it's best to assume it's writable. The
* caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
* before the page is queued for IO.
*/
if (writable)
tlb_ubc->writable = true;
}
/*
* Returns true if the TLB flush should be deferred to the end of a batch of
* unmap operations to reduce IPIs.
*/
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
bool should_defer = false;
if (!(flags & TTU_BATCH_FLUSH))
return false;
/* If remote CPUs need to be flushed then defer batch the flush */
if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
should_defer = true;
put_cpu();
return should_defer;
}
/*
* Reclaim unmaps pages under the PTL but do not flush the TLB prior to
* releasing the PTL if TLB flushes are batched. It's possible for a parallel
* operation such as mprotect or munmap to race between reclaim unmapping
* the page and flushing the page. If this race occurs, it potentially allows
* access to data via a stale TLB entry. Tracking all mm's that have TLB
* batching in flight would be expensive during reclaim so instead track
* whether TLB batching occurred in the past and if so then do a flush here
* if required. This will cost one additional flush per reclaim cycle paid
* by the first operation at risk such as mprotect and mumap.
*
* This must be called under the PTL so that an access to tlb_flush_batched
* that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
* via the PTL.
*/
void flush_tlb_batched_pending(struct mm_struct *mm)
{
if (data_race(mm->tlb_flush_batched)) {
flush_tlb_mm(mm);
/*
* Do not allow the compiler to re-order the clearing of
* tlb_flush_batched before the tlb is flushed.
*/
barrier();
mm->tlb_flush_batched = false;
}
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
}
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
/*
* At what user virtual address is page expected in vma?
* Caller should check the page is actually part of the vma.
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
if (PageAnon(page)) {
struct anon_vma *page__anon_vma = page_anon_vma(page);
/*
* Note: swapoff's unuse_vma() is more efficient with this
* check, and needs it to match anon_vma when KSM is active.
*/
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
} else if (!vma->vm_file) {
return -EFAULT;
} else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
return -EFAULT;
}
return vma_address(page, vma);
}
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd = NULL;
pmd_t pmde;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
goto out;
p4d = p4d_offset(pgd, address);
if (!p4d_present(*p4d))
goto out;
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
goto out;
pmd = pmd_offset(pud, address);
/*
* Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
* without holding anon_vma lock for write. So when looking for a
* genuine pmde (in which to find pte), test present and !THP together.
*/
pmde = *pmd;
barrier();
if (!pmd_present(pmde) || pmd_trans_huge(pmde))
pmd = NULL;
out:
return pmd;
}
struct page_referenced_arg {
int mapcount;
int referenced;
unsigned long vm_flags;
struct mem_cgroup *memcg;
};
/*
* arg: page_referenced_arg will be passed
*/
static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct page_referenced_arg *pra = arg;
struct page_vma_mapped_walk pvmw = {
.page = page,
.vma = vma,
.address = address,
};
int referenced = 0;
while (page_vma_mapped_walk(&pvmw)) {
address = pvmw.address;
if (vma->vm_flags & VM_LOCKED) {
page_vma_mapped_walk_done(&pvmw);
pra->vm_flags |= VM_LOCKED;
return false; /* To break the loop */
}
if (pvmw.pte) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
/*
* Don't treat a reference through
* a sequentially read mapping as such.
* If the page has been used in another mapping,
* we will catch it; if this other mapping is
* already gone, the unmap path will have set
* PG_referenced or activated the page.
*/
if (likely(!(vma->vm_flags & VM_SEQ_READ)))
referenced++;
}
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_flush_young_notify(vma, address,
pvmw.pmd))
referenced++;
} else {
/* unexpected pmd-mapped page? */
WARN_ON_ONCE(1);
}
pra->mapcount--;
}
if (referenced)
clear_page_idle(page);
if (test_and_clear_page_young(page))
referenced++;
if (referenced) {
pra->referenced++;
pra->vm_flags |= vma->vm_flags;
}
if (!pra->mapcount)
return false; /* To break the loop */
return true;
}
static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
{
struct page_referenced_arg *pra = arg;
struct mem_cgroup *memcg = pra->memcg;
if (!mm_match_cgroup(vma->vm_mm, memcg))
return true;
return false;
}
/**
* page_referenced - test if the page was referenced
* @page: the page to test
* @is_locked: caller holds lock on the page
* @memcg: target memory cgroup
* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
*/
int page_referenced(struct page *page,
int is_locked,
struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
int we_locked = 0;
struct page_referenced_arg pra = {
.mapcount = total_mapcount(page),
.memcg = memcg,
};
struct rmap_walk_control rwc = {
.rmap_one = page_referenced_one,
.arg = (void *)&pra,
.anon_lock = page_lock_anon_vma_read,
};
*vm_flags = 0;
if (!pra.mapcount)
return 0;
if (!page_rmapping(page))
return 0;
if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
we_locked = trylock_page(page);
if (!we_locked)
return 1;
}
/*
* If we are reclaiming on behalf of a cgroup, skip
* counting on behalf of references from different
* cgroups
*/
if (memcg) {
rwc.invalid_vma = invalid_page_referenced_vma;
}
rmap_walk(page, &rwc);
*vm_flags = pra.vm_flags;
if (we_locked)
unlock_page(page);
return pra.referenced;
}
static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct page_vma_mapped_walk pvmw = {
.page = page,
.vma = vma,
.address = address,
.flags = PVMW_SYNC,
};
struct mmu_notifier_range range;
int *cleaned = arg;
/*
* We have to assume the worse case ie pmd for invalidation. Note that
* the page can not be free from this function.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
vma_address_end(page, vma));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
int ret = 0;
address = pvmw.address;
if (pvmw.pte) {
pte_t entry;
pte_t *pte = pvmw.pte;
if (!pte_dirty(*pte) && !pte_write(*pte))
continue;
flush_cache_page(vma, address, pte_pfn(*pte));
entry = ptep_clear_flush(vma, address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(vma->vm_mm, address, pte, entry);
ret = 1;
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pmd_t *pmd = pvmw.pmd;
pmd_t entry;
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
flush_cache_page(vma, address, page_to_pfn(page));
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
ret = 1;
#else
/* unexpected pmd-mapped page? */
WARN_ON_ONCE(1);
#endif
}
/*
* No need to call mmu_notifier_invalidate_range() as we are
* downgrading page table protection not changing it to point
* to a new page.
*
* See Documentation/vm/mmu_notifier.rst
*/
if (ret)
(*cleaned)++;
}
mmu_notifier_invalidate_range_end(&range);
return true;
}
static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
if (vma->vm_flags & VM_SHARED)
return false;
return true;
}
int page_mkclean(struct page *page)
{
int cleaned = 0;
struct address_space *mapping;
struct rmap_walk_control rwc = {
.arg = (void *)&cleaned,
.rmap_one = page_mkclean_one,
.invalid_vma = invalid_mkclean_vma,
};
BUG_ON(!PageLocked(page)); if (!page_mapped(page)) return 0; mapping = page_mapping(page);
if (!mapping)
return 0;
rmap_walk(page, &rwc);
return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);
/**
* page_move_anon_rmap - move a page to our anon_vma
* @page: the page to move to our anon_vma
* @vma: the vma the page belongs to
*
* When a page belongs exclusively to one process after a COW event,
* that page can be moved into the anon_vma that belongs to just that
* process, so the rmap code will not search the parent or sibling
* processes.
*/
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_VMA(!anon_vma, vma);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
/*
* Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
* simultaneously, so a concurrent reader (eg page_referenced()'s
* PageAnon()) will not see one without the other.
*/
WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
}
/**
* __page_set_anon_rmap - set up new anonymous rmap
* @page: Page or Hugepage to add to rmap
* @vma: VM area to add page to.
* @address: User virtual address of the mapping
* @exclusive: the page is exclusively owned by the current process
*/
static void __page_set_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, int exclusive)
{
struct anon_vma *anon_vma = vma->anon_vma; BUG_ON(!anon_vma);
if (PageAnon(page))
return;
/*
* If the page isn't exclusively mapped into this vma,
* we must use the _oldest_ possible anon_vma for the
* page mapping!
*/
if (!exclusive) anon_vma = anon_vma->root;
/*
* page_idle does a lockless/optimistic rmap scan on page->mapping.
* Make sure the compiler doesn't split the stores of anon_vma and
* the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
* could mistake the mapping for a struct address_space and crash.
*/
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
page->index = linear_page_index(vma, address);
}
/**
* __page_check_anon_rmap - sanity check anonymous rmap addition
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*/
static void __page_check_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
/*
* The page's anon-rmap details (mapping and index) are guaranteed to
* be set up correctly at this point.
*
* We have exclusion against page_add_anon_rmap because the caller
* always holds the page locked.
*
* We have exclusion against page_add_new_anon_rmap because those pages
* are initially only visible via the pagetables, and the pte is locked
* over the call to page_add_new_anon_rmap.
*/
VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
page);
}
/**
* page_add_anon_rmap - add pte mapping to an anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
* @compound: charge the page as compound or small page
*
* The caller needs to hold the pte lock, and the page must be locked in
* the anon_vma case: to serialize mapping,index checking after setting,
* and to ensure that PageAnon is not being upgraded racily to PageKsm
* (but PageKsm is never downgraded to PageAnon).
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, bool compound)
{
do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
}
/*
* Special version of the above for do_swap_page, which often runs
* into pages that are exclusively owned by the current process.
* Everybody else should continue to use page_add_anon_rmap above.
*/
void do_page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, int flags)
{
bool compound = flags & RMAP_COMPOUND;
bool first;
if (unlikely(PageKsm(page)))
lock_page_memcg(page);
else
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (compound) {
atomic_t *mapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
mapcount = compound_mapcount_ptr(page);
first = atomic_inc_and_test(mapcount);
} else {
first = atomic_inc_and_test(&page->_mapcount);
}
if (first) {
int nr = compound ? thp_nr_pages(page) : 1;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption
* disabled.
*/
if (compound)
__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
}
if (unlikely(PageKsm(page))) {
unlock_page_memcg(page);
return;
}
/* address might be in next vma when migration races vma_adjust */
if (first)
__page_set_anon_rmap(page, vma, address,
flags & RMAP_EXCLUSIVE);
else
__page_check_anon_rmap(page, vma, address);
}
/**
* page_add_new_anon_rmap - add pte mapping to a new anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
* @compound: charge the page as compound or small page
*
* Same as page_add_anon_rmap but must only be called on *new* pages.
* This means the inc-and-test can be bypassed.
* Page does not have to be locked.
*/
void page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, bool compound)
{
int nr = compound ? thp_nr_pages(page) : 1;
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
__SetPageSwapBacked(page);
if (compound) {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */
atomic_set(compound_mapcount_ptr(page), 0);
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
} else {
/* Anon THP always mapped first with PMD */
VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* increment count (starts at -1) */
atomic_set(&page->_mapcount, 0);
}
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
__page_set_anon_rmap(page, vma, address, 1);
}
/**
* page_add_file_rmap - add pte mapping to a file page
* @page: the page to add the mapping to
* @compound: charge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
void page_add_file_rmap(struct page *page, bool compound)
{
int i, nr = 1;
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
if (compound && PageTransHuge(page)) {
int nr_pages = thp_nr_pages(page);
for (i = 0, nr = 0; i < nr_pages; i++) {
if (atomic_inc_and_test(&page[i]._mapcount))
nr++;
}
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
if (PageSwapBacked(page))
__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
nr_pages);
else
__mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
nr_pages);
} else {
if (PageTransCompound(page) && page_mapping(page)) {
struct page *head = compound_head(page);
VM_WARN_ON_ONCE(!PageLocked(page));
SetPageDoubleMap(head);
if (PageMlocked(page))
clear_page_mlock(head);
}
if (!atomic_inc_and_test(&page->_mapcount))
goto out;
}
__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:
unlock_page_memcg(page);
}
static void page_remove_file_rmap(struct page *page, bool compound)
{
int i, nr = 1;
VM_BUG_ON_PAGE(compound && !PageHead(page), page);
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
if (unlikely(PageHuge(page))) {
/* hugetlb pages are always mapped with pmds */
atomic_dec(compound_mapcount_ptr(page));
return;
}
/* page still mapped by someone else? */
if (compound && PageTransHuge(page)) {
int nr_pages = thp_nr_pages(page);
for (i = 0, nr = 0; i < nr_pages; i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
return;
if (PageSwapBacked(page))
__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
-nr_pages);
else
__mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
-nr_pages);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
return;
}
/*
* We use the irq-unsafe __{inc|mod}_lruvec_page_state because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
__mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
}
static void page_remove_anon_compound_rmap(struct page *page)
{
int i, nr;
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
return;
/* Hugepages are not counted in NR_ANON_PAGES for now. */
if (unlikely(PageHuge(page)))
return;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return;
__mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
if (TestClearPageDoubleMap(page)) {
/*
* Subpages can be mapped with PTEs too. Check how many of
* them are still mapped.
*/
for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
/*
* Queue the page for deferred split if at least one small
* page of the compound page is unmapped, but at least one
* small page is still mapped.
*/
if (nr && nr < thp_nr_pages(page))
deferred_split_huge_page(page);
} else {
nr = thp_nr_pages(page);
}
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
if (nr)
__mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
}
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
* @compound: uncharge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
void page_remove_rmap(struct page *page, bool compound)
{
lock_page_memcg(page);
if (!PageAnon(page)) {
page_remove_file_rmap(page, compound);
goto out;
}
if (compound) {
page_remove_anon_compound_rmap(page);
goto out;
}
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
__dec_lruvec_page_state(page, NR_ANON_MAPPED);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
if (PageTransCompound(page))
deferred_split_huge_page(compound_head(page));
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
* which increments mapcount after us but sets mapping
* before us: so leave the reset to free_unref_page,
* and remember that it's only reliable while mapped.
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
out:
unlock_page_memcg(page);
}
/*
* @arg: enum ttu_flags will be passed to this argument
*/
static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
struct page_vma_mapped_walk pvmw = {
.page = page,
.vma = vma,
.address = address,
};
pte_t pteval;
struct page *subpage;
bool ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and page_remove_rmap(),
* try_to_unmap() may return before page_mapped() has become false,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
if (flags & TTU_SYNC)
pvmw.flags = PVMW_SYNC;
if (flags & TTU_SPLIT_HUGE_PMD)
split_huge_pmd_address(vma, address, false, page);
/*
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* invalidation in the case of pmd sharing.
*
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
range.end = PageKsm(page) ?
address + PAGE_SIZE : vma_address_end(page, vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address, range.end);
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
*/
adjust_range_if_pmd_sharing_possible(vma, &range.start,
&range.end);
}
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
/*
* If the page is mlock()d, we cannot swap it out.
*/
if (!(flags & TTU_IGNORE_MLOCK) &&
(vma->vm_flags & VM_LOCKED)) {
/*
* PTE-mapped THP are never marked as mlocked: so do
* not set it on a DoubleMap THP, nor on an Anon THP
* (which may still be PTE-mapped after DoubleMap was
* cleared). But stop unmapping even in those cases.
*/
if (!PageTransCompound(page) || (PageHead(page) &&
!PageDoubleMap(page) && !PageAnon(page)))
mlock_vma_page(page);
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_PAGE(!pvmw.pte, page);
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
if (PageHuge(page) && !PageAnon(page)) {
/*
* To call huge_pmd_unshare, i_mmap_rwsem must be
* held in write mode. Caller needs to explicitly
* do this outside rmap routines.
*/
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
/*
* huge_pmd_unshare unmapped an entire PMD
* page. There is no way of knowing exactly
* which PMDs may be cached for this mm, so
* we must flush them all. start/end were
* already adjusted above to cover this range.
*/
flush_cache_range(vma, range.start, range.end);
flush_tlb_range(vma, range.start, range.end);
mmu_notifier_invalidate_range(mm, range.start,
range.end);
/*
* The ref count of the PMD page was dropped
* which is part of the way map counting
* is done for shared PMDs. Return 'true'
* here. When there is no other sharing,
* huge_pmd_unshare returns false and we will
* unmap the actual page and drop map count
* to zero.
*/
page_vma_mapped_walk_done(&pvmw);
break;
}
}
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
if (should_defer_flush(mm, flags)) {
/*
* We clear the PTE but do not flush so potentially
* a remote CPU could still be writing to the page.
* If the entry was previously clean then the
* architecture must guarantee that a clear->dirty
* transition on a cached TLB entry is written through
* and traps if the PTE is unmapped.
*/
pteval = ptep_get_and_clear(mm, address, pvmw.pte);
set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
} else {
pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
/* Move the dirty bit to the page. Now the pte is gone. */
if (pte_dirty(pteval))
set_page_dirty(page);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (PageHuge(page)) {
hugetlb_count_sub(compound_nr(page), mm);
set_huge_swap_pte_at(mm, address,
pvmw.pte, pteval,
vma_mmu_pagesize(vma));
} else {
dec_mm_counter(mm, mm_counter(page));
set_pte_at(mm, address, pvmw.pte, pteval);
}
} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
* will take care of the rest.
* A future reference will then fault in a new zero
* page. When userfaultfd is active, we must not drop
* this page though, as its main user (postcopy
* migration) will not expect userfaults on already
* copied pages.
*/
dec_mm_counter(mm, mm_counter(page));
/* We have to invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
address + PAGE_SIZE);
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(subpage) };
pte_t swp_pte;
/*
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
WARN_ON_ONCE(1);
ret = false;
/* We have to invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
address + PAGE_SIZE);
page_vma_mapped_walk_done(&pvmw);
break;
}
/* MADV_FREE page check */
if (!PageSwapBacked(page)) {
int ref_count, map_count;
/*
* Synchronize with gup_pte_range():
* - clear PTE; barrier; read refcount
* - inc refcount; barrier; read PTE
*/
smp_mb();
ref_count = page_ref_count(page);
map_count = page_mapcount(page);
/*
* Order reads for page refcount and dirty flag
* (see comments in __remove_mapping()).
*/
smp_rmb();
/*
* The only page refs must be one from isolation
* plus the rmap(s) (dropped by discard:).
*/
if (ref_count == 1 + map_count &&
!PageDirty(page)) {
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
dec_mm_counter(mm, MM_ANONPAGES);
goto discard;
}
/*
* If the page was redirtied, it cannot be
* discarded. Remap the page to page table.
*/
set_pte_at(mm, address, pvmw.pte, pteval);
SetPageSwapBacked(page);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
if (swap_duplicate(entry) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
if (list_empty(&mm->mmlist))
list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
dec_mm_counter(mm, MM_ANONPAGES);
inc_mm_counter(mm, MM_SWAPENTS);
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
address + PAGE_SIZE);
} else {
/*
* This is a locked file-backed page, thus it cannot
* be removed from the page cache and replaced by a new
* page before mmu_notifier_invalidate_range_end, so no
* concurrent thread might update its page table to
* point at new page while a device still is using this
* page.
*
* See Documentation/vm/mmu_notifier.rst
*/
dec_mm_counter(mm, mm_counter_file(page));
}
discard:
/*
* No need to call mmu_notifier_invalidate_range() it has be
* done above for all cases requiring it to happen under page
* table lock before mmu_notifier_invalidate_range_end()
*
* See Documentation/vm/mmu_notifier.rst
*/
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
}
mmu_notifier_invalidate_range_end(&range);
return ret;
}
static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
return vma_is_temporary_stack(vma);
}
static int page_not_mapped(struct page *page)
{
return !page_mapped(page);
}
/**
* try_to_unmap - try to remove all page table mappings to a page
* @page: the page to get unmapped
* @flags: action and flags
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock.
*
* It is the caller's responsibility to check if the page is still
* mapped when needed (use TTU_SYNC to prevent accounting races).
*/
void try_to_unmap(struct page *page, enum ttu_flags flags)
{
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = (void *)flags,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
};
if (flags & TTU_RMAP_LOCKED)
rmap_walk_locked(page, &rwc);
else
rmap_walk(page, &rwc);
}
/*
* @arg: enum ttu_flags will be passed to this argument.
*
* If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
* containing migration entries.
*/
static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
struct page_vma_mapped_walk pvmw = {
.page = page,
.vma = vma,
.address = address,
};
pte_t pteval;
struct page *subpage;
bool ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and page_remove_rmap(),
* try_to_migrate() may return before page_mapped() has become false,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
if (flags & TTU_SYNC)
pvmw.flags = PVMW_SYNC;
/*
* unmap_page() in mm/huge_memory.c is the only user of migration with
* TTU_SPLIT_HUGE_PMD and it wants to freeze.
*/
if (flags & TTU_SPLIT_HUGE_PMD)
split_huge_pmd_address(vma, address, true, page);
/*
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* invalidation in the case of pmd sharing.
*
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
range.end = PageKsm(page) ?
address + PAGE_SIZE : vma_address_end(page, vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address, range.end);
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
*/
adjust_range_if_pmd_sharing_possible(vma, &range.start,
&range.end);
}
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
VM_BUG_ON_PAGE(PageHuge(page) ||
!PageTransCompound(page), page);
set_pmd_migration_entry(&pvmw, page);
continue;
}
#endif
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_PAGE(!pvmw.pte, page);
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
if (PageHuge(page) && !PageAnon(page)) {
/*
* To call huge_pmd_unshare, i_mmap_rwsem must be
* held in write mode. Caller needs to explicitly
* do this outside rmap routines.
*/
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
/*
* huge_pmd_unshare unmapped an entire PMD
* page. There is no way of knowing exactly
* which PMDs may be cached for this mm, so
* we must flush them all. start/end were
* already adjusted above to cover this range.
*/
flush_cache_range(vma, range.start, range.end);
flush_tlb_range(vma, range.start, range.end);
mmu_notifier_invalidate_range(mm, range.start,
range.end);
/*
* The ref count of the PMD page was dropped
* which is part of the way map counting
* is done for shared PMDs. Return 'true'
* here. When there is no other sharing,
* huge_pmd_unshare returns false and we will
* unmap the actual page and drop map count
* to zero.
*/
page_vma_mapped_walk_done(&pvmw);
break;
}
}
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
pteval = ptep_clear_flush(vma, address, pvmw.pte);
/* Move the dirty bit to the page. Now the pte is gone. */
if (pte_dirty(pteval))
set_page_dirty(page);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
if (is_zone_device_page(page)) {
swp_entry_t entry;
pte_t swp_pte;
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
entry = make_readable_migration_entry(
page_to_pfn(page));
swp_pte = swp_entry_to_pte(entry);
/*
* pteval maps a zone device page and is therefore
* a swap pte.
*/
if (pte_swp_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_swp_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
*
* The assignment to subpage above was computed from a
* swap PTE which results in an invalid pointer.
* Since only PAGE_SIZE pages can currently be
* migrated, just set it to page. This will need to be
* changed when hugepage migrations to device private
* memory are supported.
*/
subpage = page;
} else if (PageHWPoison(page)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (PageHuge(page)) {
hugetlb_count_sub(compound_nr(page), mm);
set_huge_swap_pte_at(mm, address,
pvmw.pte, pteval,
vma_mmu_pagesize(vma));
} else {
dec_mm_counter(mm, mm_counter(page));
set_pte_at(mm, address, pvmw.pte, pteval);
}
} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
* will take care of the rest.
* A future reference will then fault in a new zero
* page. When userfaultfd is active, we must not drop
* this page though, as its main user (postcopy
* migration) will not expect userfaults on already
* copied pages.
*/
dec_mm_counter(mm, mm_counter(page));
/* We have to invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
address + PAGE_SIZE);
} else {
swp_entry_t entry;
pte_t swp_pte;
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
if (pte_write(pteval))
entry = make_writable_migration_entry(
page_to_pfn(subpage));
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
*/
}
/*
* No need to call mmu_notifier_invalidate_range() it has be
* done above for all cases requiring it to happen under page
* table lock before mmu_notifier_invalidate_range_end()
*
* See Documentation/vm/mmu_notifier.rst
*/
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
}
mmu_notifier_invalidate_range_end(&range);
return ret;
}
/**
* try_to_migrate - try to replace all page table mappings with swap entries
* @page: the page to replace page table entries for
* @flags: action and flags
*
* Tries to remove all the page table entries which are mapping this page and
* replace them with special swap entries. Caller must hold the page lock.
*/
void try_to_migrate(struct page *page, enum ttu_flags flags)
{
struct rmap_walk_control rwc = {
.rmap_one = try_to_migrate_one,
.arg = (void *)flags,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
};
/*
* Migration always ignores mlock and only supports TTU_RMAP_LOCKED and
* TTU_SPLIT_HUGE_PMD and TTU_SYNC flags.
*/
if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
TTU_SYNC)))
return;
if (is_zone_device_page(page) && !is_device_private_page(page))
return;
/*
* During exec, a temporary VMA is setup and later moved.
* The VMA is moved under the anon_vma lock but not the
* page tables leading to a race where migration cannot
* find the migration ptes. Rather than increasing the
* locking requirements of exec(), migration skips
* temporary VMAs until after exec() completes.
*/
if (!PageKsm(page) && PageAnon(page))
rwc.invalid_vma = invalid_migration_vma;
if (flags & TTU_RMAP_LOCKED)
rmap_walk_locked(page, &rwc);
else
rmap_walk(page, &rwc);
}
/*
* Walks the vma's mapping a page and mlocks the page if any locked vma's are
* found. Once one is found the page is locked and the scan can be terminated.
*/
static bool page_mlock_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *unused)
{
struct page_vma_mapped_walk pvmw = {
.page = page,
.vma = vma,
.address = address,
};
/* An un-locked vma doesn't have any pages to lock, continue the scan */
if (!(vma->vm_flags & VM_LOCKED))
return true;
while (page_vma_mapped_walk(&pvmw)) {
/*
* Need to recheck under the ptl to serialise with
* __munlock_pagevec_fill() after VM_LOCKED is cleared in
* munlock_vma_pages_range().
*/
if (vma->vm_flags & VM_LOCKED) {
/*
* PTE-mapped THP are never marked as mlocked; but
* this function is never called on a DoubleMap THP,
* nor on an Anon THP (which may still be PTE-mapped
* after DoubleMap was cleared).
*/
mlock_vma_page(page);
/*
* No need to scan further once the page is marked
* as mlocked.
*/
page_vma_mapped_walk_done(&pvmw);
return false;
}
}
return true;
}
/**
* page_mlock - try to mlock a page
* @page: the page to be mlocked
*
* Called from munlock code. Checks all of the VMAs mapping the page and mlocks
* the page if any are found. The page will be returned with PG_mlocked cleared
* if it is not mapped by any locked vmas.
*/
void page_mlock(struct page *page)
{
struct rmap_walk_control rwc = {
.rmap_one = page_mlock_one,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
};
VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
/* Anon THP are only marked as mlocked when singly mapped */
if (PageTransCompound(page) && PageAnon(page))
return;
rmap_walk(page, &rwc);
}
#ifdef CONFIG_DEVICE_PRIVATE
struct make_exclusive_args {
struct mm_struct *mm;
unsigned long address;
void *owner;
bool valid;
};
static bool page_make_device_exclusive_one(struct page *page,
struct vm_area_struct *vma, unsigned long address, void *priv)
{
struct mm_struct *mm = vma->vm_mm;
struct page_vma_mapped_walk pvmw = {
.page = page,
.vma = vma,
.address = address,
};
struct make_exclusive_args *args = priv;
pte_t pteval;
struct page *subpage;
bool ret = true;
struct mmu_notifier_range range;
swp_entry_t entry;
pte_t swp_pte;
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
vma->vm_mm, address, min(vma->vm_end,
address + page_size(page)), args->owner);
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_PAGE(!pvmw.pte, page);
if (!pte_present(*pvmw.pte)) {
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
pteval = ptep_clear_flush(vma, address, pvmw.pte);
/* Move the dirty bit to the page. Now the pte is gone. */
if (pte_dirty(pteval))
set_page_dirty(page);
/*
* Check that our target page is still mapped at the expected
* address.
*/
if (args->mm == mm && args->address == address &&
pte_write(pteval))
args->valid = true;
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
if (pte_write(pteval))
entry = make_writable_device_exclusive_entry(
page_to_pfn(subpage));
else
entry = make_readable_device_exclusive_entry(
page_to_pfn(subpage));
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
/*
* There is a reference on the page for the swap entry which has
* been removed, so shouldn't take another.
*/
page_remove_rmap(subpage, false);
}
mmu_notifier_invalidate_range_end(&range);
return ret;
}
/**
* page_make_device_exclusive - mark the page exclusively owned by a device
* @page: the page to replace page table entries for
* @mm: the mm_struct where the page is expected to be mapped
* @address: address where the page is expected to be mapped
* @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
*
* Tries to remove all the page table entries which are mapping this page and
* replace them with special device exclusive swap entries to grant a device
* exclusive access to the page. Caller must hold the page lock.
*
* Returns false if the page is still mapped, or if it could not be unmapped
* from the expected address. Otherwise returns true (success).
*/
static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm,
unsigned long address, void *owner)
{
struct make_exclusive_args args = {
.mm = mm,
.address = address,
.owner = owner,
.valid = false,
};
struct rmap_walk_control rwc = {
.rmap_one = page_make_device_exclusive_one,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
.arg = &args,
};
/*
* Restrict to anonymous pages for now to avoid potential writeback
* issues. Also tail pages shouldn't be passed to rmap_walk so skip
* those.
*/
if (!PageAnon(page) || PageTail(page))
return false;
rmap_walk(page, &rwc);
return args.valid && !page_mapcount(page);
}
/**
* make_device_exclusive_range() - Mark a range for exclusive use by a device
* @mm: mm_struct of assoicated target process
* @start: start of the region to mark for exclusive device access
* @end: end address of region
* @pages: returns the pages which were successfully marked for exclusive access
* @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
*
* Returns: number of pages found in the range by GUP. A page is marked for
* exclusive access only if the page pointer is non-NULL.
*
* This function finds ptes mapping page(s) to the given address range, locks
* them and replaces mappings with special swap entries preventing userspace CPU
* access. On fault these entries are replaced with the original mapping after
* calling MMU notifiers.
*
* A driver using this to program access from a device must use a mmu notifier
* critical section to hold a device specific lock during programming. Once
* programming is complete it should drop the page lock and reference after
* which point CPU access to the page will revoke the exclusive access.
*/
int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct page **pages,
void *owner)
{
long npages = (end - start) >> PAGE_SHIFT;
long i;
npages = get_user_pages_remote(mm, start, npages,
FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
pages, NULL, NULL);
if (npages < 0)
return npages;
for (i = 0; i < npages; i++, start += PAGE_SIZE) {
if (!trylock_page(pages[i])) {
put_page(pages[i]);
pages[i] = NULL;
continue;
}
if (!page_make_device_exclusive(pages[i], mm, start, owner)) {
unlock_page(pages[i]);
put_page(pages[i]);
pages[i] = NULL;
}
}
return npages;
}
EXPORT_SYMBOL_GPL(make_device_exclusive_range);
#endif
void __put_anon_vma(struct anon_vma *anon_vma)
{
struct anon_vma *root = anon_vma->root;
anon_vma_free(anon_vma);
if (root != anon_vma && atomic_dec_and_test(&root->refcount))
anon_vma_free(root);
}
static struct anon_vma *rmap_walk_anon_lock(struct page *page,
struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
if (rwc->anon_lock)
return rwc->anon_lock(page);
/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
* because that depends on page_mapped(); but not all its usages
* are holding mmap_lock. Users without mmap_lock are required to
* take a reference count to prevent the anon_vma disappearing
*/
anon_vma = page_anon_vma(page);
if (!anon_vma)
return NULL;
anon_vma_lock_read(anon_vma);
return anon_vma;
}
/*
* rmap_walk_anon - do something to anonymous page using the object-based
* rmap method
* @page: the page to be handled
* @rwc: control variable according to each walk type
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the anon_vma struct it points to.
*
* When called from page_mlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
bool locked)
{
struct anon_vma *anon_vma;
pgoff_t pgoff_start, pgoff_end;
struct anon_vma_chain *avc;
if (locked) {
anon_vma = page_anon_vma(page);
/* anon_vma disappear under us? */
VM_BUG_ON_PAGE(!anon_vma, page);
} else {
anon_vma = rmap_walk_anon_lock(page, rwc);
}
if (!anon_vma)
return;
pgoff_start = page_to_pgoff(page);
pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
pgoff_start, pgoff_end) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
if (!rwc->rmap_one(page, vma, address, rwc->arg))
break;
if (rwc->done && rwc->done(page))
break;
}
if (!locked)
anon_vma_unlock_read(anon_vma);
}
/*
* rmap_walk_file - do something to file page using the object-based rmap method
* @page: the page to be handled
* @rwc: control variable according to each walk type
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
*
* When called from page_mlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
bool locked)
{
struct address_space *mapping = page_mapping(page);
pgoff_t pgoff_start, pgoff_end;
struct vm_area_struct *vma;
/*
* The page lock not only makes sure that page->mapping cannot
* suddenly be NULLified by truncation, it makes sure that the
* structure at mapping cannot be freed and reused yet,
* so we can safely take mapping->i_mmap_rwsem.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!mapping)
return;
pgoff_start = page_to_pgoff(page);
pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
if (!locked)
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
unsigned long address = vma_address(page, vma);
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
if (!rwc->rmap_one(page, vma, address, rwc->arg))
goto done;
if (rwc->done && rwc->done(page))
goto done;
}
done:
if (!locked)
i_mmap_unlock_read(mapping);
}
void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
if (unlikely(PageKsm(page)))
rmap_walk_ksm(page, rwc);
else if (PageAnon(page))
rmap_walk_anon(page, rwc, false);
else
rmap_walk_file(page, rwc, false);
}
/* Like rmap_walk, but caller holds relevant rmap lock */
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
{
/* no ksm support for now */
VM_BUG_ON_PAGE(PageKsm(page), page);
if (PageAnon(page))
rmap_walk_anon(page, rwc, true);
else
rmap_walk_file(page, rwc, true);
}
#ifdef CONFIG_HUGETLB_PAGE
/*
* The following two functions are for anonymous (private mapped) hugepages.
* Unlike common anonymous pages, anonymous hugepages have no accounting code
* and no lru code, because we handle hugepages differently from common pages.
*/
void hugepage_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
struct anon_vma *anon_vma = vma->anon_vma;
int first;
BUG_ON(!PageLocked(page));
BUG_ON(!anon_vma);
/* address might be in next vma when migration races vma_adjust */
first = atomic_inc_and_test(compound_mapcount_ptr(page));
if (first)
__page_set_anon_rmap(page, vma, address, 0);
}
void hugepage_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
atomic_set(compound_mapcount_ptr(page), 0);
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
__page_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_GETORDER_H
#define __ASM_GENERIC_GETORDER_H
#ifndef __ASSEMBLY__
#include <linux/compiler.h>
#include <linux/log2.h>
/**
* get_order - Determine the allocation order of a memory size
* @size: The size for which to get the order
*
* Determine the allocation order of a particular sized block of memory. This
* is on a logarithmic scale, where:
*
* 0 -> 2^0 * PAGE_SIZE and below
* 1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1
* 2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1
* 3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1
* 4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1
* ...
*
* The order returned is used to find the smallest allocation granule required
* to hold an object of the specified size.
*
* The result is undefined if the size is 0.
*/
static __always_inline __attribute_const__ int get_order(unsigned long size)
{
if (__builtin_constant_p(size)) {
if (!size)
return BITS_PER_LONG - PAGE_SHIFT;
if (size < (1UL << PAGE_SHIFT))
return 0;
return ilog2((size) - 1) - PAGE_SHIFT + 1;
}
size--;
size >>= PAGE_SHIFT;
#if BITS_PER_LONG == 32
return fls(size);
#else
return fls64(size);
#endif
}
#endif /* __ASSEMBLY__ */
#endif /* __ASM_GENERIC_GETORDER_H */
// SPDX-License-Identifier: GPL-2.0-or-later
/* audit.c -- Auditing support
* Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
* System-call specific features have moved to auditsc.c
*
* Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
* All Rights Reserved.
*
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Goals: 1) Integrate fully with Security Modules.
* 2) Minimal run-time overhead:
* a) Minimal when syscall auditing is disabled (audit_enable=0).
* b) Small when syscall auditing is enabled and no audit record
* is generated (defer as much work as possible to record
* generation time):
* i) context is allocated,
* ii) names from getname are stored without a copy, and
* iii) inode information stored from path_lookup.
* 3) Ability to disable syscall auditing at boot time (audit=0).
* 4) Usable by other parts of the kernel (if audit_log* is called,
* then a syscall record will be generated automatically for the
* current syscall).
* 5) Netlink interface to user-space.
* 6) Support low-overhead kernel-based filtering to minimize the
* information that must be passed to user-space.
*
* Audit userspace, documentation, tests, and bug/issue trackers:
* https://github.com/linux-audit
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/file.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/pid.h>
#include <linux/audit.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#ifdef CONFIG_SECURITY
#include <linux/security.h>
#endif
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>
#include "audit.h"
/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
* (Initialization happens after skb_init is called.) */
#define AUDIT_DISABLED -1
#define AUDIT_UNINITIALIZED 0
#define AUDIT_INITIALIZED 1
static int audit_initialized = AUDIT_UNINITIALIZED;
u32 audit_enabled = AUDIT_OFF;
bool audit_ever_enabled = !!AUDIT_OFF;
EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
static u32 audit_default = AUDIT_OFF;
/* If auditing cannot proceed, audit_failure selects what happens. */
static u32 audit_failure = AUDIT_FAIL_PRINTK;
/* private audit network namespace index */
static unsigned int audit_net_id;
/**
* struct audit_net - audit private network namespace data
* @sk: communication socket
*/
struct audit_net {
struct sock *sk;
};
/**
* struct auditd_connection - kernel/auditd connection state
* @pid: auditd PID
* @portid: netlink portid
* @net: the associated network namespace
* @rcu: RCU head
*
* Description:
* This struct is RCU protected; you must either hold the RCU lock for reading
* or the associated spinlock for writing.
*/
struct auditd_connection {
struct pid *pid;
u32 portid;
struct net *net;
struct rcu_head rcu;
};
static struct auditd_connection __rcu *auditd_conn;
static DEFINE_SPINLOCK(auditd_conn_lock);
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
* audit records being dropped. */
static u32 audit_rate_limit;
/* Number of outstanding audit_buffers allowed.
* When set to zero, this means unlimited. */
static u32 audit_backlog_limit = 64;
#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
/* The identity of the user shutting down the audit system. */
static kuid_t audit_sig_uid = INVALID_UID;
static pid_t audit_sig_pid = -1;
static u32 audit_sig_sid;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
2) out of memory in audit_log_move [alloc_skb]
3) suppressed due to audit_rate_limit
4) suppressed due to audit_backlog_limit
*/
static atomic_t audit_lost = ATOMIC_INIT(0);
/* Monotonically increasing sum of time the kernel has spent
* waiting while the backlog limit is exceeded.
*/
static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0);
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
static struct kmem_cache *audit_buffer_cache;
/* queue msgs to send via kauditd_task */
static struct sk_buff_head audit_queue;
/* queue msgs due to temporary unicast send problems */
static struct sk_buff_head audit_retry_queue;
/* queue msgs waiting for new auditd connection */
static struct sk_buff_head audit_hold_queue;
/* queue servicing thread */
static struct task_struct *kauditd_task;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
/* waitqueue for callers who are blocked on the audit backlog */
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
.mask = -1,
.features = 0,
.lock = 0,};
static char *audit_feature_names[2] = {
"only_unset_loginuid",
"loginuid_immutable",
};
/**
* struct audit_ctl_mutex - serialize requests from userspace
* @lock: the mutex used for locking
* @owner: the task which owns the lock
*
* Description:
* This is the lock struct used to ensure we only process userspace requests
* in an orderly fashion. We can't simply use a mutex/lock here because we
* need to track lock ownership so we don't end up blocking the lock owner in
* audit_log_start() or similar.
*/
static struct audit_ctl_mutex {
struct mutex lock;
void *owner;
} audit_cmd_mutex;
/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
* audit records. Since printk uses a 1024 byte buffer, this buffer
* should be at least that large. */
#define AUDIT_BUFSIZ 1024
/* The audit_buffer is used when formatting an audit record. The caller
* locks briefly to get the record off the freelist or to allocate the
* buffer, and locks briefly to send the buffer to the netlink layer or
* to place it on a transmit queue. Multiple audit_buffers can be in
* use simultaneously. */
struct audit_buffer {
struct sk_buff *skb; /* formatted skb ready to send */
struct audit_context *ctx; /* NULL or associated context */
gfp_t gfp_mask;
};
struct audit_reply {
__u32 portid;
struct net *net;
struct sk_buff *skb;
};
/**
* auditd_test_task - Check to see if a given task is an audit daemon
* @task: the task to check
*
* Description:
* Return 1 if the task is a registered audit daemon, 0 otherwise.
*/
int auditd_test_task(struct task_struct *task)
{
int rc;
struct auditd_connection *ac;
rcu_read_lock();
ac = rcu_dereference(auditd_conn);
rc = (ac && ac->pid == task_tgid(task) ? 1 : 0);
rcu_read_unlock();
return rc;
}
/**
* audit_ctl_lock - Take the audit control lock
*/
void audit_ctl_lock(void)
{
mutex_lock(&audit_cmd_mutex.lock);
audit_cmd_mutex.owner = current;
}
/**
* audit_ctl_unlock - Drop the audit control lock
*/
void audit_ctl_unlock(void)
{
audit_cmd_mutex.owner = NULL;
mutex_unlock(&audit_cmd_mutex.lock);
}
/**
* audit_ctl_owner_current - Test to see if the current task owns the lock
*
* Description:
* Return true if the current task owns the audit control lock, false if it
* doesn't own the lock.
*/
static bool audit_ctl_owner_current(void)
{
return (current == audit_cmd_mutex.owner);
}
/**
* auditd_pid_vnr - Return the auditd PID relative to the namespace
*
* Description:
* Returns the PID in relation to the namespace, 0 on failure.
*/
static pid_t auditd_pid_vnr(void)
{
pid_t pid;
const struct auditd_connection *ac;
rcu_read_lock();
ac = rcu_dereference(auditd_conn);
if (!ac || !ac->pid)
pid = 0;
else
pid = pid_vnr(ac->pid);
rcu_read_unlock();
return pid;
}
/**
* audit_get_sk - Return the audit socket for the given network namespace
* @net: the destination network namespace
*
* Description:
* Returns the sock pointer if valid, NULL otherwise. The caller must ensure
* that a reference is held for the network namespace while the sock is in use.
*/
static struct sock *audit_get_sk(const struct net *net)
{
struct audit_net *aunet;
if (!net)
return NULL;
aunet = net_generic(net, audit_net_id);
return aunet->sk;
}
void audit_panic(const char *message)
{
switch (audit_failure) {
case AUDIT_FAIL_SILENT:
break;
case AUDIT_FAIL_PRINTK:
if (printk_ratelimit())
pr_err("%s\n", message);
break;
case AUDIT_FAIL_PANIC:
panic("audit: %s\n", message);
break;
}
}
static inline int audit_rate_check(void)
{
static unsigned long last_check = 0;
static int messages = 0;
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
unsigned long elapsed;
int retval = 0;
if (!audit_rate_limit) return 1;
spin_lock_irqsave(&lock, flags);
if (++messages < audit_rate_limit) {
retval = 1;
} else {
now = jiffies;
elapsed = now - last_check;
if (elapsed > HZ) {
last_check = now;
messages = 0;
retval = 1;
}
}
spin_unlock_irqrestore(&lock, flags);
return retval;
}
/**
* audit_log_lost - conditionally log lost audit message event
* @message: the message stating reason for lost audit message
*
* Emit at least 1 message per second, even if audit_rate_check is
* throttling.
* Always increment the lost messages counter.
*/
void audit_log_lost(const char *message)
{
static unsigned long last_msg = 0;
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
int print;
atomic_inc(&audit_lost);
print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
if (!print) {
spin_lock_irqsave(&lock, flags);
now = jiffies;
if (now - last_msg > HZ) {
print = 1;
last_msg = now;
}
spin_unlock_irqrestore(&lock, flags);
}
if (print) {
if (printk_ratelimit())
pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n",
atomic_read(&audit_lost),
audit_rate_limit,
audit_backlog_limit);
audit_panic(message);
}
}
static int audit_log_config_change(char *function_name, u32 new, u32 old,
int allow_changes)
{
struct audit_buffer *ab;
int rc = 0;
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return rc;
audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old);
audit_log_session_info(ab);
rc = audit_log_task_context(ab);
if (rc)
allow_changes = 0; /* Something weird, deny request */
audit_log_format(ab, " res=%d", allow_changes);
audit_log_end(ab);
return rc;
}
static int audit_do_config_change(char *function_name, u32 *to_change, u32 new)
{
int allow_changes, rc = 0;
u32 old = *to_change;
/* check if we are locked */
if (audit_enabled == AUDIT_LOCKED)
allow_changes = 0;
else
allow_changes = 1;
if (audit_enabled != AUDIT_OFF) {
rc = audit_log_config_change(function_name, new, old, allow_changes);
if (rc)
allow_changes = 0;
}
/* If we are allowed, make the change */
if (allow_changes == 1)
*to_change = new;
/* Not allowed, update reason */
else if (rc == 0)
rc = -EPERM;
return rc;
}
static int audit_set_rate_limit(u32 limit)
{
return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
}
static int audit_set_backlog_limit(u32 limit)
{
return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
}
static int audit_set_backlog_wait_time(u32 timeout)
{
return audit_do_config_change("audit_backlog_wait_time",
&audit_backlog_wait_time, timeout);
}
static int audit_set_enabled(u32 state)
{
int rc;
if (state > AUDIT_LOCKED)
return -EINVAL;
rc = audit_do_config_change("audit_enabled", &audit_enabled, state);
if (!rc)
audit_ever_enabled |= !!state;
return rc;
}
static int audit_set_failure(u32 state)
{
if (state != AUDIT_FAIL_SILENT
&& state != AUDIT_FAIL_PRINTK
&& state != AUDIT_FAIL_PANIC)
return -EINVAL;
return audit_do_config_change("audit_failure", &audit_failure, state);
}
/**
* auditd_conn_free - RCU helper to release an auditd connection struct
* @rcu: RCU head
*
* Description:
* Drop any references inside the auditd connection tracking struct and free
* the memory.
*/
static void auditd_conn_free(struct rcu_head *rcu)
{
struct auditd_connection *ac;
ac = container_of(rcu, struct auditd_connection, rcu);
put_pid(ac->pid);
put_net(ac->net);
kfree(ac);
}
/**
* auditd_set - Set/Reset the auditd connection state
* @pid: auditd PID
* @portid: auditd netlink portid
* @net: auditd network namespace pointer
*
* Description:
* This function will obtain and drop network namespace references as
* necessary. Returns zero on success, negative values on failure.
*/
static int auditd_set(struct pid *pid, u32 portid, struct net *net)
{
unsigned long flags;
struct auditd_connection *ac_old, *ac_new;
if (!pid || !net)
return -EINVAL;
ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL);
if (!ac_new)
return -ENOMEM;
ac_new->pid = get_pid(pid);
ac_new->portid = portid;
ac_new->net = get_net(net);
spin_lock_irqsave(&auditd_conn_lock, flags);
ac_old = rcu_dereference_protected(auditd_conn,
lockdep_is_held(&auditd_conn_lock));
rcu_assign_pointer(auditd_conn, ac_new);
spin_unlock_irqrestore(&auditd_conn_lock, flags);
if (ac_old)
call_rcu(&ac_old->rcu, auditd_conn_free);
return 0;
}
/**
* kauditd_printk_skb - Print the audit record to the ring buffer
* @skb: audit record
*
* Whatever the reason, this packet may not make it to the auditd connection
* so write it via printk so the information isn't completely lost.
*/
static void kauditd_printk_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
char *data = nlmsg_data(nlh);
if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit())
pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
}
/**
* kauditd_rehold_skb - Handle a audit record send failure in the hold queue
* @skb: audit record
* @error: error code (unused)
*
* Description:
* This should only be used by the kauditd_thread when it fails to flush the
* hold queue.
*/
static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
{
/* put the record back in the queue */
skb_queue_tail(&audit_hold_queue, skb);
}
/**
* kauditd_hold_skb - Queue an audit record, waiting for auditd
* @skb: audit record
* @error: error code
*
* Description:
* Queue the audit record, waiting for an instance of auditd. When this
* function is called we haven't given up yet on sending the record, but things
* are not looking good. The first thing we want to do is try to write the
* record via printk and then see if we want to try and hold on to the record
* and queue it, if we have room. If we want to hold on to the record, but we
* don't have room, record a record lost message.
*/
static void kauditd_hold_skb(struct sk_buff *skb, int error)
{
/* at this point it is uncertain if we will ever send this to auditd so
* try to send the message via printk before we go any further */
kauditd_printk_skb(skb);
/* can we just silently drop the message? */
if (!audit_default)
goto drop;
/* the hold queue is only for when the daemon goes away completely,
* not -EAGAIN failures; if we are in a -EAGAIN state requeue the
* record on the retry queue unless it's full, in which case drop it
*/
if (error == -EAGAIN) {
if (!audit_backlog_limit ||
skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
skb_queue_tail(&audit_retry_queue, skb);
return;
}
audit_log_lost("kauditd retry queue overflow");
goto drop;
}
/* if we have room in the hold queue, queue the message */
if (!audit_backlog_limit ||
skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
skb_queue_tail(&audit_hold_queue, skb);
return;
}
/* we have no other options - drop the message */
audit_log_lost("kauditd hold queue overflow");
drop:
kfree_skb(skb);
}
/**
* kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
* @skb: audit record
* @error: error code (unused)
*
* Description:
* Not as serious as kauditd_hold_skb() as we still have a connected auditd,
* but for some reason we are having problems sending it audit records so
* queue the given record and attempt to resend.
*/
static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
{
if (!audit_backlog_limit ||
skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
skb_queue_tail(&audit_retry_queue, skb);
return;
}
/* we have to drop the record, send it via printk as a last effort */
kauditd_printk_skb(skb);
audit_log_lost("kauditd retry queue overflow");
kfree_skb(skb);
}
/**
* auditd_reset - Disconnect the auditd connection
* @ac: auditd connection state
*
* Description:
* Break the auditd/kauditd connection and move all the queued records into the
* hold queue in case auditd reconnects. It is important to note that the @ac
* pointer should never be dereferenced inside this function as it may be NULL
* or invalid, you can only compare the memory address! If @ac is NULL then
* the connection will always be reset.
*/
static void auditd_reset(const struct auditd_connection *ac)
{
unsigned long flags;
struct sk_buff *skb;
struct auditd_connection *ac_old;
/* if it isn't already broken, break the connection */
spin_lock_irqsave(&auditd_conn_lock, flags);
ac_old = rcu_dereference_protected(auditd_conn,
lockdep_is_held(&auditd_conn_lock));
if (ac && ac != ac_old) {
/* someone already registered a new auditd connection */
spin_unlock_irqrestore(&auditd_conn_lock, flags);
return;
}
rcu_assign_pointer(auditd_conn, NULL);
spin_unlock_irqrestore(&auditd_conn_lock, flags);
if (ac_old)
call_rcu(&ac_old->rcu, auditd_conn_free);
/* flush the retry queue to the hold queue, but don't touch the main
* queue since we need to process that normally for multicast */
while ((skb = skb_dequeue(&audit_retry_queue)))
kauditd_hold_skb(skb, -ECONNREFUSED);
}
/**
* auditd_send_unicast_skb - Send a record via unicast to auditd
* @skb: audit record
*
* Description:
* Send a skb to the audit daemon, returns positive/zero values on success and
* negative values on failure; in all cases the skb will be consumed by this
* function. If the send results in -ECONNREFUSED the connection with auditd
* will be reset. This function may sleep so callers should not hold any locks
* where this would cause a problem.
*/
static int auditd_send_unicast_skb(struct sk_buff *skb)
{
int rc;
u32 portid;
struct net *net;
struct sock *sk;
struct auditd_connection *ac;
/* NOTE: we can't call netlink_unicast while in the RCU section so
* take a reference to the network namespace and grab local
* copies of the namespace, the sock, and the portid; the
* namespace and sock aren't going to go away while we hold a
* reference and if the portid does become invalid after the RCU
* section netlink_unicast() should safely return an error */
rcu_read_lock();
ac = rcu_dereference(auditd_conn);
if (!ac) {
rcu_read_unlock();
kfree_skb(skb);
rc = -ECONNREFUSED;
goto err;
}
net = get_net(ac->net);
sk = audit_get_sk(net);
portid = ac->portid;
rcu_read_unlock();
rc = netlink_unicast(sk, skb, portid, 0);
put_net(net);
if (rc < 0)
goto err;
return rc;
err:
if (ac && rc == -ECONNREFUSED) auditd_reset(ac);
return rc;
}
/**
* kauditd_send_queue - Helper for kauditd_thread to flush skb queues
* @sk: the sending sock
* @portid: the netlink destination
* @queue: the skb queue to process
* @retry_limit: limit on number of netlink unicast failures
* @skb_hook: per-skb hook for additional processing
* @err_hook: hook called if the skb fails the netlink unicast send
*
* Description:
* Run through the given queue and attempt to send the audit records to auditd,
* returns zero on success, negative values on failure. It is up to the caller
* to ensure that the @sk is valid for the duration of this function.
*
*/
static int kauditd_send_queue(struct sock *sk, u32 portid,
struct sk_buff_head *queue,
unsigned int retry_limit,
void (*skb_hook)(struct sk_buff *skb),
void (*err_hook)(struct sk_buff *skb, int error))
{
int rc = 0;
struct sk_buff *skb = NULL;
struct sk_buff *skb_tail;
unsigned int failed = 0;
/* NOTE: kauditd_thread takes care of all our locking, we just use
* the netlink info passed to us (e.g. sk and portid) */
skb_tail = skb_peek_tail(queue);
while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
/* call the skb_hook for each skb we touch */
if (skb_hook)
(*skb_hook)(skb);
/* can we send to anyone via unicast? */
if (!sk) {
if (err_hook)
(*err_hook)(skb, -ECONNREFUSED);
continue;
}
retry:
/* grab an extra skb reference in case of error */
skb_get(skb);
rc = netlink_unicast(sk, skb, portid, 0);
if (rc < 0) {
/* send failed - try a few times unless fatal error */
if (++failed >= retry_limit ||
rc == -ECONNREFUSED || rc == -EPERM) {
sk = NULL;
if (err_hook)
(*err_hook)(skb, rc);
if (rc == -EAGAIN)
rc = 0;
/* continue to drain the queue */
continue;
} else
goto retry;
} else {
/* skb sent - drop the extra reference and continue */
consume_skb(skb);
failed = 0;
}
}
return (rc >= 0 ? 0 : rc);
}
/*
* kauditd_send_multicast_skb - Send a record to any multicast listeners
* @skb: audit record
*
* Description:
* Write a multicast message to anyone listening in the initial network
* namespace. This function doesn't consume an skb as might be expected since
* it has to copy it anyways.
*/
static void kauditd_send_multicast_skb(struct sk_buff *skb)
{
struct sk_buff *copy;
struct sock *sock = audit_get_sk(&init_net);
struct nlmsghdr *nlh;
/* NOTE: we are not taking an additional reference for init_net since
* we don't have to worry about it going away */
if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
return;
/*
* The seemingly wasteful skb_copy() rather than bumping the refcount
* using skb_get() is necessary because non-standard mods are made to
* the skb by the original kaudit unicast socket send routine. The
* existing auditd daemon assumes this breakage. Fixing this would
* require co-ordinating a change in the established protocol between
* the kaudit kernel subsystem and the auditd userspace code. There is
* no reason for new multicast clients to continue with this
* non-compliance.
*/
copy = skb_copy(skb, GFP_KERNEL);
if (!copy)
return;
nlh = nlmsg_hdr(copy);
nlh->nlmsg_len = skb->len;
nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
}
/**
* kauditd_thread - Worker thread to send audit records to userspace
* @dummy: unused
*/
static int kauditd_thread(void *dummy)
{
int rc;
u32 portid = 0;
struct net *net = NULL;
struct sock *sk = NULL;
struct auditd_connection *ac;
#define UNICAST_RETRIES 5
set_freezable();
while (!kthread_should_stop()) {
/* NOTE: see the lock comments in auditd_send_unicast_skb() */
rcu_read_lock();
ac = rcu_dereference(auditd_conn);
if (!ac) {
rcu_read_unlock();
goto main_queue;
}
net = get_net(ac->net);
sk = audit_get_sk(net);
portid = ac->portid;
rcu_read_unlock();
/* attempt to flush the hold queue */
rc = kauditd_send_queue(sk, portid,
&audit_hold_queue, UNICAST_RETRIES,
NULL, kauditd_rehold_skb);
if (rc < 0) {
sk = NULL;
auditd_reset(ac);
goto main_queue;
}
/* attempt to flush the retry queue */
rc = kauditd_send_queue(sk, portid,
&audit_retry_queue, UNICAST_RETRIES,
NULL, kauditd_hold_skb);
if (rc < 0) {
sk = NULL;
auditd_reset(ac);
goto main_queue;
}
main_queue:
/* process the main queue - do the multicast send and attempt
* unicast, dump failed record sends to the retry queue; if
* sk == NULL due to previous failures we will just do the
* multicast send and move the record to the hold queue */
rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
kauditd_send_multicast_skb,
(sk ?
kauditd_retry_skb : kauditd_hold_skb));
if (ac && rc < 0)
auditd_reset(ac);
sk = NULL;
/* drop our netns reference, no auditd sends past this line */
if (net) {
put_net(net);
net = NULL;
}
/* we have processed all the queues so wake everyone */
wake_up(&audit_backlog_wait);
/* NOTE: we want to wake up if there is anything on the queue,
* regardless of if an auditd is connected, as we need to
* do the multicast send and rotate records from the
* main queue to the retry/hold queues */
wait_event_freezable(kauditd_wait,
(skb_queue_len(&audit_queue) ? 1 : 0));
}
return 0;
}
int audit_send_list_thread(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
struct sock *sk = audit_get_sk(dest->net);
/* wait for parent to finish and send an ACK */
audit_ctl_lock();
audit_ctl_unlock();
while ((skb = __skb_dequeue(&dest->q)) != NULL)
netlink_unicast(sk, skb, dest->portid, 0);
put_net(dest->net);
kfree(dest);
return 0;
}
struct sk_buff *audit_make_reply(int seq, int type, int done,
int multi, const void *payload, int size)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
void *data;
int flags = multi ? NLM_F_MULTI : 0;
int t = done ? NLMSG_DONE : type;
skb = nlmsg_new(size, GFP_KERNEL);
if (!skb)
return NULL;
nlh = nlmsg_put(skb, 0, seq, t, size, flags);
if (!nlh)
goto out_kfree_skb;
data = nlmsg_data(nlh);
memcpy(data, payload, size);
return skb;
out_kfree_skb:
kfree_skb(skb);
return NULL;
}
static void audit_free_reply(struct audit_reply *reply)
{
if (!reply)
return;
kfree_skb(reply->skb);
if (reply->net)
put_net(reply->net);
kfree(reply);
}
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
audit_ctl_lock();
audit_ctl_unlock();
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0);
reply->skb = NULL;
audit_free_reply(reply);
return 0;
}
/**
* audit_send_reply - send an audit reply message via netlink
* @request_skb: skb of request we are replying to (used to target the reply)
* @seq: sequence number
* @type: audit message type
* @done: done (last) flag
* @multi: multi-part message flag
* @payload: payload data
* @size: payload size
*
* Allocates a skb, builds the netlink message, and sends it to the port id.
*/
static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
int multi, const void *payload, int size)
{
struct task_struct *tsk;
struct audit_reply *reply;
reply = kzalloc(sizeof(*reply), GFP_KERNEL);
if (!reply)
return;
reply->skb = audit_make_reply(seq, type, done, multi, payload, size);
if (!reply->skb)
goto err;
reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
reply->portid = NETLINK_CB(request_skb).portid;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
if (IS_ERR(tsk))
goto err;
return;
err:
audit_free_reply(reply);
}
/*
* Check for appropriate CAP_AUDIT_ capabilities on incoming audit
* control messages.
*/
static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
{
int err = 0;
/* Only support initial user namespace for now. */
/*
* We return ECONNREFUSED because it tricks userspace into thinking
* that audit was not configured into the kernel. Lots of users
* configure their PAM stack (because that's what the distro does)
* to reject login if unable to send messages to audit. If we return
* ECONNREFUSED the PAM stack thinks the kernel does not have audit
* configured in and will let login proceed. If we return EPERM
* userspace will reject all logins. This should be removed when we
* support non init namespaces!!
*/
if (current_user_ns() != &init_user_ns)
return -ECONNREFUSED;
switch (msg_type) {
case AUDIT_LIST:
case AUDIT_ADD:
case AUDIT_DEL:
return -EOPNOTSUPP;
case AUDIT_GET:
case AUDIT_SET:
case AUDIT_GET_FEATURE:
case AUDIT_SET_FEATURE:
case AUDIT_LIST_RULES:
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
case AUDIT_SIGNAL_INFO:
case AUDIT_TTY_GET:
case AUDIT_TTY_SET:
case AUDIT_TRIM:
case AUDIT_MAKE_EQUIV:
/* Only support auditd and auditctl in initial pid namespace
* for now. */
if (task_active_pid_ns(current) != &init_pid_ns)
return -EPERM;
if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!netlink_capable(skb, CAP_AUDIT_WRITE))
err = -EPERM;
break;
default: /* bad msg */
err = -EINVAL;
}
return err;
}
static void audit_log_common_recv_msg(struct audit_context *context,
struct audit_buffer **ab, u16 msg_type)
{
uid_t uid = from_kuid(&init_user_ns, current_uid());
pid_t pid = task_tgid_nr(current);
if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
*ab = NULL;
return;
}
*ab = audit_log_start(context, GFP_KERNEL, msg_type);
if (unlikely(!*ab))
return;
audit_log_format(*ab, "pid=%d uid=%u ", pid, uid);
audit_log_session_info(*ab);
audit_log_task_context(*ab);
}
static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
u16 msg_type)
{
audit_log_common_recv_msg(NULL, ab, msg_type);
}
int is_audit_feature_set(int i)
{
return af.features & AUDIT_FEATURE_TO_MASK(i);
}
static int audit_get_feature(struct sk_buff *skb)
{
u32 seq;
seq = nlmsg_hdr(skb)->nlmsg_seq;
audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af));
return 0;
}
static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
u32 old_lock, u32 new_lock, int res)
{
struct audit_buffer *ab;
if (audit_enabled == AUDIT_OFF)
return;
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE);
if (!ab)
return;
audit_log_task_info(ab);
audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
audit_feature_names[which], !!old_feature, !!new_feature,
!!old_lock, !!new_lock, res);
audit_log_end(ab);
}
static int audit_set_feature(struct audit_features *uaf)
{
int i;
BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
/* if there is ever a version 2 we should handle that here */
for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i);
u32 old_feature, new_feature, old_lock, new_lock;
/* if we are not changing this feature, move along */
if (!(feature & uaf->mask))
continue;
old_feature = af.features & feature;
new_feature = uaf->features & feature;
new_lock = (uaf->lock | af.lock) & feature; old_lock = af.lock & feature;
/* are we changing a locked feature? */
if (old_lock && (new_feature != old_feature)) {
audit_log_feature_change(i, old_feature, new_feature,
old_lock, new_lock, 0);
return -EPERM;
}
}
/* nothing invalid, do the changes */
for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { u32 feature = AUDIT_FEATURE_TO_MASK(i);
u32 old_feature, new_feature, old_lock, new_lock;
/* if we are not changing this feature, move along */
if (!(feature & uaf->mask))
continue;
old_feature = af.features & feature;
new_feature = uaf->features & feature;
old_lock = af.lock & feature;
new_lock = (uaf->lock | af.lock) & feature;
if (new_feature != old_feature)
audit_log_feature_change(i, old_feature, new_feature,
old_lock, new_lock, 1);
if (new_feature) af.features |= feature;
else
af.features &= ~feature;
af.lock |= new_lock;
}
return 0;
}
static int audit_replace(struct pid *pid)
{
pid_t pvnr;
struct sk_buff *skb;
pvnr = pid_vnr(pid);
skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr));
if (!skb)
return -ENOMEM;
return auditd_send_unicast_skb(skb);
}
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
u32 seq;
void *data;
int data_len;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
struct audit_sig_info *sig_data;
char *ctx = NULL;
u32 len;
err = audit_netlink_ok(skb, msg_type);
if (err)
return err;
seq = nlh->nlmsg_seq;
data = nlmsg_data(nlh);
data_len = nlmsg_len(nlh);
switch (msg_type) {
case AUDIT_GET: {
struct audit_status s;
memset(&s, 0, sizeof(s));
s.enabled = audit_enabled;
s.failure = audit_failure;
/* NOTE: use pid_vnr() so the PID is relative to the current
* namespace */
s.pid = auditd_pid_vnr();
s.rate_limit = audit_rate_limit;
s.backlog_limit = audit_backlog_limit;
s.lost = atomic_read(&audit_lost);
s.backlog = skb_queue_len(&audit_queue);
s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
s.backlog_wait_time = audit_backlog_wait_time;
s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual);
audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_SET: {
struct audit_status s;
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
if (s.mask & AUDIT_STATUS_ENABLED) {
err = audit_set_enabled(s.enabled); if (err < 0)
return err;
}
if (s.mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(s.failure); if (err < 0)
return err;
}
if (s.mask & AUDIT_STATUS_PID) {
/* NOTE: we are using the vnr PID functions below
* because the s.pid value is relative to the
* namespace of the caller; at present this
* doesn't matter much since you can really only
* run auditd from the initial pid namespace, but
* something to keep in mind if this changes */
pid_t new_pid = s.pid;
pid_t auditd_pid;
struct pid *req_pid = task_tgid(current);
/* Sanity check - PID values must match. Setting
* pid to 0 is how auditd ends auditing. */
if (new_pid && (new_pid != pid_vnr(req_pid)))
return -EINVAL;
/* test the auditd connection */
audit_replace(req_pid);
auditd_pid = auditd_pid_vnr();
if (auditd_pid) {
/* replacing a healthy auditd is not allowed */
if (new_pid) { audit_log_config_change("audit_pid",
new_pid, auditd_pid, 0);
return -EEXIST;
}
/* only current auditd can unregister itself */
if (pid_vnr(req_pid) != auditd_pid) { audit_log_config_change("audit_pid",
new_pid, auditd_pid, 0);
return -EACCES;
}
}
if (new_pid) {
/* register a new auditd connection */
err = auditd_set(req_pid,
NETLINK_CB(skb).portid,
sock_net(NETLINK_CB(skb).sk)); if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid",
new_pid,
auditd_pid,
err ? 0 : 1);
if (err)
return err;
/* try to process any backlog */
wake_up_interruptible(&kauditd_wait);
} else {
if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid",
new_pid,
auditd_pid, 1);
/* unregister the auditd connection */
auditd_reset(NULL);
}
}
if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); if (err < 0)
return err;
}
if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) { err = audit_set_backlog_limit(s.backlog_limit); if (err < 0)
return err;
}
if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { if (sizeof(s) > (size_t)nlh->nlmsg_len)
return -EINVAL;
if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
return -EINVAL;
err = audit_set_backlog_wait_time(s.backlog_wait_time);
if (err < 0)
return err;
}
if (s.mask == AUDIT_STATUS_LOST) {
u32 lost = atomic_xchg(&audit_lost, 0);
audit_log_config_change("lost", 0, lost, 1);
return lost;
}
if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) {
u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0);
audit_log_config_change("backlog_wait_time_actual", 0, actual, 1);
return actual;
}
break;
}
case AUDIT_GET_FEATURE:
err = audit_get_feature(skb);
if (err)
return err;
break;
case AUDIT_SET_FEATURE:
if (data_len < sizeof(struct audit_features))
return -EINVAL;
err = audit_set_feature(data);
if (err)
return err;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
/* exit early if there isn't at least one character to print */
if (data_len < 2)
return -EINVAL;
err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */
char *str = data;
err = 0;
if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push();
if (err)
break;
}
audit_log_user_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY) {
/* ensure NULL termination */
str[data_len - 1] = '\0';
audit_log_format(ab, " msg='%.*s'",
AUDIT_MESSAGE_TEXT_MAX,
str);
} else {
audit_log_format(ab, " data=");
if (data_len > 0 && str[data_len - 1] == '\0')
data_len--;
audit_log_n_untrustedstring(ab, str, data_len);
}
audit_log_end(ab);
}
break;
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
if (data_len < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
audit_log_common_recv_msg(audit_context(), &ab,
AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=%s audit_enabled=%d res=0",
msg_type == AUDIT_ADD_RULE ?
"add_rule" : "remove_rule",
audit_enabled);
audit_log_end(ab);
return -EPERM;
}
err = audit_rule_change(msg_type, seq, data, data_len);
break;
case AUDIT_LIST_RULES:
err = audit_list_rules_send(skb, seq);
break;
case AUDIT_TRIM:
audit_trim_trees();
audit_log_common_recv_msg(audit_context(), &ab,
AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=trim res=1");
audit_log_end(ab);
break;
case AUDIT_MAKE_EQUIV: {
void *bufp = data;
u32 sizes[2];
size_t msglen = data_len;
char *old, *new;
err = -EINVAL;
if (msglen < 2 * sizeof(u32))
break;
memcpy(sizes, bufp, 2 * sizeof(u32));
bufp += 2 * sizeof(u32);
msglen -= 2 * sizeof(u32);
old = audit_unpack_string(&bufp, &msglen, sizes[0]);
if (IS_ERR(old)) {
err = PTR_ERR(old);
break;
}
new = audit_unpack_string(&bufp, &msglen, sizes[1]);
if (IS_ERR(new)) {
err = PTR_ERR(new);
kfree(old);
break;
}
/* OK, here comes... */
err = audit_tag_tree(old, new);
audit_log_common_recv_msg(audit_context(), &ab,
AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=make_equiv old=");
audit_log_untrustedstring(ab, old);
audit_log_format(ab, " new=");
audit_log_untrustedstring(ab, new);
audit_log_format(ab, " res=%d", !err);
audit_log_end(ab);
kfree(old);
kfree(new);
break;
}
case AUDIT_SIGNAL_INFO:
len = 0;
if (audit_sig_sid) {
err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); if (err)
return err;
}
sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
if (!sig_data) {
if (audit_sig_sid) security_release_secctx(ctx, len);
return -ENOMEM;
}
sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
sig_data->pid = audit_sig_pid;
if (audit_sig_sid) {
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
sig_data, sizeof(*sig_data) + len);
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
struct audit_tty_status s;
unsigned int t;
t = READ_ONCE(current->signal->audit_tty);
s.enabled = t & AUDIT_TTY_ENABLE;
s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
struct audit_tty_status s, old;
struct audit_buffer *ab;
unsigned int t;
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
/* check if new data is valid */
if ((s.enabled != 0 && s.enabled != 1) ||
(s.log_passwd != 0 && s.log_passwd != 1))
err = -EINVAL;
if (err)
t = READ_ONCE(current->signal->audit_tty);
else {
t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD);
t = xchg(¤t->signal->audit_tty, t);
}
old.enabled = t & AUDIT_TTY_ENABLE;
old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
audit_log_common_recv_msg(audit_context(), &ab,
AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
" old-log_passwd=%d new-log_passwd=%d res=%d",
old.enabled, s.enabled, old.log_passwd,
s.log_passwd, !err);
audit_log_end(ab);
break;
}
default:
err = -EINVAL;
break;
}
return err < 0 ? err : 0;
}
/**
* audit_receive - receive messages from a netlink control socket
* @skb: the message buffer
*
* Parse the provided skb and deal with any messages that may be present,
* malformed skbs are discarded.
*/
static void audit_receive(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
/*
* len MUST be signed for nlmsg_next to be able to dec it below 0
* if the nlmsg_len was not aligned
*/
int len;
int err;
nlh = nlmsg_hdr(skb);
len = skb->len;
audit_ctl_lock();
while (nlmsg_ok(nlh, len)) {
err = audit_receive_msg(skb, nlh);
/* if err or if this message says it wants a response */
if (err || (nlh->nlmsg_flags & NLM_F_ACK)) netlink_ack(skb, nlh, err, NULL);
nlh = nlmsg_next(nlh, &len);
}
audit_ctl_unlock();
/* can't block with the ctrl lock, so penalize the sender now */
if (audit_backlog_limit &&
(skb_queue_len(&audit_queue) > audit_backlog_limit)) { DECLARE_WAITQUEUE(wait, current);
/* wake kauditd to try and flush the queue */
wake_up_interruptible(&kauditd_wait);
add_wait_queue_exclusive(&audit_backlog_wait, &wait);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(audit_backlog_wait_time);
remove_wait_queue(&audit_backlog_wait, &wait);
}
}
/* Log information about who is connecting to the audit multicast socket */
static void audit_log_multicast(int group, const char *op, int err)
{
const struct cred *cred;
struct tty_struct *tty;
char comm[sizeof(current->comm)];
struct audit_buffer *ab;
if (!audit_enabled)
return;
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER);
if (!ab)
return;
cred = current_cred();
tty = audit_get_tty();
audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u",
task_pid_nr(current),
from_kuid(&init_user_ns, cred->uid),
from_kuid(&init_user_ns, audit_get_loginuid(current)),
tty ? tty_name(tty) : "(none)",
audit_get_sessionid(current));
audit_put_tty(tty);
audit_log_task_context(ab); /* subj= */
audit_log_format(ab, " comm=");
audit_log_untrustedstring(ab, get_task_comm(comm, current));
audit_log_d_path_exe(ab, current->mm); /* exe= */
audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err);
audit_log_end(ab);
}
/* Run custom bind function on netlink socket group connect or bind requests. */
static int audit_multicast_bind(struct net *net, int group)
{
int err = 0;
if (!capable(CAP_AUDIT_READ))
err = -EPERM;
audit_log_multicast(group, "connect", err);
return err;
}
static void audit_multicast_unbind(struct net *net, int group)
{
audit_log_multicast(group, "disconnect", 0);
}
static int __net_init audit_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
.input = audit_receive,
.bind = audit_multicast_bind,
.unbind = audit_multicast_unbind,
.flags = NL_CFG_F_NONROOT_RECV,
.groups = AUDIT_NLGRP_MAX,
};
struct audit_net *aunet = net_generic(net, audit_net_id);
aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
if (aunet->sk == NULL) {
audit_panic("cannot initialize netlink socket in namespace");
return -ENOMEM;
}
/* limit the timeout in case auditd is blocked/stopped */
aunet->sk->sk_sndtimeo = HZ / 10;
return 0;
}
static void __net_exit audit_net_exit(struct net *net)
{
struct audit_net *aunet = net_generic(net, audit_net_id);
/* NOTE: you would think that we would want to check the auditd
* connection and potentially reset it here if it lives in this
* namespace, but since the auditd connection tracking struct holds a
* reference to this namespace (see auditd_set()) we are only ever
* going to get here after that connection has been released */
netlink_kernel_release(aunet->sk);
}
static struct pernet_operations audit_net_ops __net_initdata = {
.init = audit_net_init,
.exit = audit_net_exit,
.id = &audit_net_id,
.size = sizeof(struct audit_net),
};
/* Initialize audit support at boot time. */
static int __init audit_init(void)
{
int i;
if (audit_initialized == AUDIT_DISABLED)
return 0;
audit_buffer_cache = kmem_cache_create("audit_buffer",
sizeof(struct audit_buffer),
0, SLAB_PANIC, NULL);
skb_queue_head_init(&audit_queue);
skb_queue_head_init(&audit_retry_queue);
skb_queue_head_init(&audit_hold_queue);
for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
INIT_LIST_HEAD(&audit_inode_hash[i]);
mutex_init(&audit_cmd_mutex.lock);
audit_cmd_mutex.owner = NULL;
pr_info("initializing netlink subsys (%s)\n",
audit_default ? "enabled" : "disabled");
register_pernet_subsys(&audit_net_ops);
audit_initialized = AUDIT_INITIALIZED;
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
if (IS_ERR(kauditd_task)) {
int err = PTR_ERR(kauditd_task);
panic("audit: failed to start the kauditd thread (%d)\n", err);
}
audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL,
"state=initialized audit_enabled=%u res=1",
audit_enabled);
return 0;
}
postcore_initcall(audit_init);
/*
* Process kernel command-line parameter at boot time.
* audit={0|off} or audit={1|on}.
*/
static int __init audit_enable(char *str)
{
if (!strcasecmp(str, "off") || !strcmp(str, "0"))
audit_default = AUDIT_OFF;
else if (!strcasecmp(str, "on") || !strcmp(str, "1"))
audit_default = AUDIT_ON;
else {
pr_err("audit: invalid 'audit' parameter value (%s)\n", str);
audit_default = AUDIT_ON;
}
if (audit_default == AUDIT_OFF)
audit_initialized = AUDIT_DISABLED;
if (audit_set_enabled(audit_default))
pr_err("audit: error setting audit state (%d)\n",
audit_default);
pr_info("%s\n", audit_default ?
"enabled (after initialization)" : "disabled (until reboot)");
return 1;
}
__setup("audit=", audit_enable);
/* Process kernel command-line parameter at boot time.
* audit_backlog_limit=<n> */
static int __init audit_backlog_limit_set(char *str)
{
u32 audit_backlog_limit_arg;
pr_info("audit_backlog_limit: ");
if (kstrtouint(str, 0, &audit_backlog_limit_arg)) {
pr_cont("using default of %u, unable to parse %s\n",
audit_backlog_limit, str);
return 1;
}
audit_backlog_limit = audit_backlog_limit_arg;
pr_cont("%d\n", audit_backlog_limit);
return 1;
}
__setup("audit_backlog_limit=", audit_backlog_limit_set);
static void audit_buffer_free(struct audit_buffer *ab)
{
if (!ab)
return;
kfree_skb(ab->skb);
kmem_cache_free(audit_buffer_cache, ab);
}
static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
gfp_t gfp_mask, int type)
{
struct audit_buffer *ab;
ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask);
if (!ab)
return NULL;
ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
goto err;
if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
goto err;
ab->ctx = ctx;
ab->gfp_mask = gfp_mask;
return ab;
err:
audit_buffer_free(ab);
return NULL;
}
/**
* audit_serial - compute a serial number for the audit record
*
* Compute a serial number for the audit record. Audit records are
* written to user-space as soon as they are generated, so a complete
* audit record may be written in several pieces. The timestamp of the
* record and this serial number are used by the user-space tools to
* determine which pieces belong to the same audit record. The
* (timestamp,serial) tuple is unique for each syscall and is live from
* syscall entry to syscall exit.
*
* NOTE: Another possibility is to store the formatted records off the
* audit context (for those records that have a context), and emit them
* all at syscall exit. However, this could delay the reporting of
* significant errors until syscall exit (or never, if the system
* halts).
*/
unsigned int audit_serial(void)
{
static atomic_t serial = ATOMIC_INIT(0);
return atomic_inc_return(&serial);
}
static inline void audit_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial)
{
if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
ktime_get_coarse_real_ts64(t);
*serial = audit_serial();
}
}
/**
* audit_log_start - obtain an audit buffer
* @ctx: audit_context (may be NULL)
* @gfp_mask: type of allocation
* @type: audit message type
*
* Returns audit_buffer pointer on success or NULL on error.
*
* Obtain an audit buffer. This routine does locking to obtain the
* audit buffer, but then no locking is required for calls to
* audit_log_*format. If the task (ctx) is a task that is currently in a
* syscall, then the syscall is marked as auditable and an audit record
* will be written at syscall exit. If there is no associated task, then
* task context (ctx) should be NULL.
*/
struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
int type)
{
struct audit_buffer *ab;
struct timespec64 t;
unsigned int serial;
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE)))
return NULL;
/* NOTE: don't ever fail/sleep on these two conditions:
* 1. auditd generated record - since we need auditd to drain the
* queue; also, when we are checking for auditd, compare PIDs using
* task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
* using a PID anchored in the caller's namespace
* 2. generator holding the audit_cmd_mutex - we don't want to block
* while holding the mutex, although we do penalize the sender
* later in audit_receive() when it is safe to block
*/
if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
long stime = audit_backlog_wait_time;
while (audit_backlog_limit &&
(skb_queue_len(&audit_queue) > audit_backlog_limit)) {
/* wake kauditd to try and flush the queue */
wake_up_interruptible(&kauditd_wait);
/* sleep if we are allowed and we haven't exhausted our
* backlog wait limit */
if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {
long rtime = stime;
DECLARE_WAITQUEUE(wait, current);
add_wait_queue_exclusive(&audit_backlog_wait,
&wait);
set_current_state(TASK_UNINTERRUPTIBLE);
stime = schedule_timeout(rtime);
atomic_add(rtime - stime, &audit_backlog_wait_time_actual);
remove_wait_queue(&audit_backlog_wait, &wait);
} else {
if (audit_rate_check() && printk_ratelimit())
pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
skb_queue_len(&audit_queue),
audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
return NULL;
}
}
}
ab = audit_buffer_alloc(ctx, gfp_mask, type);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
return NULL;
}
audit_get_stamp(ab->ctx, &t, &serial);
/* cancel dummy context to enable supporting records */
if (ctx)
ctx->dummy = 0;
audit_log_format(ab, "audit(%llu.%03lu:%u): ",
(unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
return ab;
}
/**
* audit_expand - expand skb in the audit buffer
* @ab: audit_buffer
* @extra: space to add at tail of the skb
*
* Returns 0 (no space) on failed expansion, or available space if
* successful.
*/
static inline int audit_expand(struct audit_buffer *ab, int extra)
{
struct sk_buff *skb = ab->skb;
int oldtail = skb_tailroom(skb);
int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask);
int newtail = skb_tailroom(skb);
if (ret < 0) {
audit_log_lost("out of memory in audit_expand");
return 0;
}
skb->truesize += newtail - oldtail;
return newtail;
}
/*
* Format an audit message into the audit buffer. If there isn't enough
* room in the audit buffer, more room will be allocated and vsnprint
* will be called a second time. Currently, we assume that a printk
* can't format message larger than 1024 bytes, so we don't either.
*/
static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
va_list args)
{
int len, avail;
struct sk_buff *skb;
va_list args2;
if (!ab)
return;
BUG_ON(!ab->skb);
skb = ab->skb;
avail = skb_tailroom(skb);
if (avail == 0) {
avail = audit_expand(ab, AUDIT_BUFSIZ);
if (!avail)
goto out;
}
va_copy(args2, args);
len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args);
if (len >= avail) {
/* The printk buffer is 1024 bytes long, so if we get
* here and AUDIT_BUFSIZ is at least 1024, then we can
* log everything that printk could have logged. */
avail = audit_expand(ab,
max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
if (!avail)
goto out_va_end;
len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
}
if (len > 0)
skb_put(skb, len);
out_va_end:
va_end(args2);
out:
return;
}
/**
* audit_log_format - format a message into the audit buffer.
* @ab: audit_buffer
* @fmt: format string
* @...: optional parameters matching @fmt string
*
* All the work is done in audit_log_vformat.
*/
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
{
va_list args;
if (!ab)
return;
va_start(args, fmt);
audit_log_vformat(ab, fmt, args);
va_end(args);
}
/**
* audit_log_n_hex - convert a buffer to hex and append it to the audit skb
* @ab: the audit_buffer
* @buf: buffer to convert to hex
* @len: length of @buf to be converted
*
* No return value; failure to expand is silently ignored.
*
* This function will take the passed buf and convert it into a string of
* ascii hex digits. The new string is placed onto the skb.
*/
void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
size_t len)
{
int i, avail, new_len;
unsigned char *ptr;
struct sk_buff *skb;
if (!ab)
return;
BUG_ON(!ab->skb);
skb = ab->skb;
avail = skb_tailroom(skb);
new_len = len<<1;
if (new_len >= avail) {
/* Round the buffer request up to the next multiple */
new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1);
avail = audit_expand(ab, new_len);
if (!avail)
return;
}
ptr = skb_tail_pointer(skb);
for (i = 0; i < len; i++)
ptr = hex_byte_pack_upper(ptr, buf[i]);
*ptr = 0;
skb_put(skb, len << 1); /* new string is twice the old string */
}
/*
* Format a string of no more than slen characters into the audit buffer,
* enclosed in quote marks.
*/
void audit_log_n_string(struct audit_buffer *ab, const char *string,
size_t slen)
{
int avail, new_len;
unsigned char *ptr;
struct sk_buff *skb;
if (!ab)
return;
BUG_ON(!ab->skb);
skb = ab->skb;
avail = skb_tailroom(skb);
new_len = slen + 3; /* enclosing quotes + null terminator */
if (new_len > avail) {
avail = audit_expand(ab, new_len);
if (!avail)
return;
}
ptr = skb_tail_pointer(skb);
*ptr++ = '"';
memcpy(ptr, string, slen);
ptr += slen;
*ptr++ = '"';
*ptr = 0;
skb_put(skb, slen + 2); /* don't include null terminator */
}
/**
* audit_string_contains_control - does a string need to be logged in hex
* @string: string to be checked
* @len: max length of the string to check
*/
bool audit_string_contains_control(const char *string, size_t len)
{
const unsigned char *p;
for (p = string; p < (const unsigned char *)string + len; p++) {
if (*p == '"' || *p < 0x21 || *p > 0x7e)
return true;
}
return false;
}
/**
* audit_log_n_untrustedstring - log a string that may contain random characters
* @ab: audit_buffer
* @len: length of string (not including trailing null)
* @string: string to be logged
*
* This code will escape a string that is passed to it if the string
* contains a control character, unprintable character, double quote mark,
* or a space. Unescaped strings will start and end with a double quote mark.
* Strings that are escaped are printed in hex (2 digits per char).
*
* The caller specifies the number of characters in the string to log, which may
* or may not be the entire string.
*/
void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
size_t len)
{
if (audit_string_contains_control(string, len))
audit_log_n_hex(ab, string, len);
else
audit_log_n_string(ab, string, len);
}
/**
* audit_log_untrustedstring - log a string that may contain random characters
* @ab: audit_buffer
* @string: string to be logged
*
* Same as audit_log_n_untrustedstring(), except that strlen is used to
* determine string length.
*/
void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
{
audit_log_n_untrustedstring(ab, string, strlen(string));
}
/* This is a helper-function to print the escaped d_path */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
const struct path *path)
{
char *p, *pathname;
if (prefix)
audit_log_format(ab, "%s", prefix);
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
if (!pathname) {
audit_log_format(ab, "\"<no_memory>\"");
return;
}
p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
audit_log_format(ab, "\"<too_long>\"");
} else
audit_log_untrustedstring(ab, p);
kfree(pathname);
}
void audit_log_session_info(struct audit_buffer *ab)
{
unsigned int sessionid = audit_get_sessionid(current);
uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
audit_log_format(ab, "auid=%u ses=%u", auid, sessionid);
}
void audit_log_key(struct audit_buffer *ab, char *key)
{
audit_log_format(ab, " key=");
if (key)
audit_log_untrustedstring(ab, key);
else
audit_log_format(ab, "(null)");
}
int audit_log_task_context(struct audit_buffer *ab)
{
char *ctx = NULL;
unsigned len;
int error;
u32 sid;
security_task_getsecid_subj(current, &sid);
if (!sid)
return 0;
error = security_secid_to_secctx(sid, &ctx, &len);
if (error) {
if (error != -EINVAL)
goto error_path;
return 0;
}
audit_log_format(ab, " subj=%s", ctx);
security_release_secctx(ctx, len);
return 0;
error_path:
audit_panic("error in audit_log_task_context");
return error;
}
EXPORT_SYMBOL(audit_log_task_context);
void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm)
{
struct file *exe_file;
if (!mm)
goto out_null;
exe_file = get_mm_exe_file(mm);
if (!exe_file)
goto out_null;
audit_log_d_path(ab, " exe=", &exe_file->f_path);
fput(exe_file);
return;
out_null:
audit_log_format(ab, " exe=(null)");
}
struct tty_struct *audit_get_tty(void)
{
struct tty_struct *tty = NULL;
unsigned long flags;
spin_lock_irqsave(¤t->sighand->siglock, flags);
if (current->signal)
tty = tty_kref_get(current->signal->tty);
spin_unlock_irqrestore(¤t->sighand->siglock, flags);
return tty;
}
void audit_put_tty(struct tty_struct *tty)
{
tty_kref_put(tty);
}
void audit_log_task_info(struct audit_buffer *ab)
{
const struct cred *cred;
char comm[sizeof(current->comm)];
struct tty_struct *tty;
if (!ab)
return;
cred = current_cred();
tty = audit_get_tty();
audit_log_format(ab,
" ppid=%d pid=%d auid=%u uid=%u gid=%u"
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
task_ppid_nr(current),
task_tgid_nr(current),
from_kuid(&init_user_ns, audit_get_loginuid(current)),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
from_kuid(&init_user_ns, cred->euid),
from_kuid(&init_user_ns, cred->suid),
from_kuid(&init_user_ns, cred->fsuid),
from_kgid(&init_user_ns, cred->egid),
from_kgid(&init_user_ns, cred->sgid),
from_kgid(&init_user_ns, cred->fsgid),
tty ? tty_name(tty) : "(none)",
audit_get_sessionid(current));
audit_put_tty(tty);
audit_log_format(ab, " comm=");
audit_log_untrustedstring(ab, get_task_comm(comm, current));
audit_log_d_path_exe(ab, current->mm);
audit_log_task_context(ab);
}
EXPORT_SYMBOL(audit_log_task_info);
/**
* audit_log_path_denied - report a path restriction denial
* @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc)
* @operation: specific operation name
*/
void audit_log_path_denied(int type, const char *operation)
{
struct audit_buffer *ab;
if (!audit_enabled || audit_dummy_context())
return;
/* Generate log with subject, operation, outcome. */
ab = audit_log_start(audit_context(), GFP_KERNEL, type);
if (!ab)
return;
audit_log_format(ab, "op=%s", operation);
audit_log_task_info(ab);
audit_log_format(ab, " res=0");
audit_log_end(ab);
}
/* global counter which is incremented every time something logs in */
static atomic_t session_id = ATOMIC_INIT(0);
static int audit_set_loginuid_perm(kuid_t loginuid)
{
/* if we are unset, we don't need privs */
if (!audit_loginuid_set(current))
return 0;
/* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
return -EPERM;
/* it is set, you need permission */
if (!capable(CAP_AUDIT_CONTROL))
return -EPERM;
/* reject if this is not an unset and we don't allow that */
if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID)
&& uid_valid(loginuid))
return -EPERM;
return 0;
}
static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
unsigned int oldsessionid,
unsigned int sessionid, int rc)
{
struct audit_buffer *ab;
uid_t uid, oldloginuid, loginuid;
struct tty_struct *tty;
if (!audit_enabled)
return;
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN);
if (!ab)
return;
uid = from_kuid(&init_user_ns, task_uid(current));
oldloginuid = from_kuid(&init_user_ns, koldloginuid);
loginuid = from_kuid(&init_user_ns, kloginuid);
tty = audit_get_tty();
audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
audit_log_task_context(ab);
audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
oldsessionid, sessionid, !rc);
audit_put_tty(tty);
audit_log_end(ab);
}
/**
* audit_set_loginuid - set current task's loginuid
* @loginuid: loginuid value
*
* Returns 0.
*
* Called (set) from fs/proc/base.c::proc_loginuid_write().
*/
int audit_set_loginuid(kuid_t loginuid)
{
unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
kuid_t oldloginuid;
int rc;
oldloginuid = audit_get_loginuid(current);
oldsessionid = audit_get_sessionid(current);
rc = audit_set_loginuid_perm(loginuid);
if (rc)
goto out;
/* are we setting or clearing? */
if (uid_valid(loginuid)) {
sessionid = (unsigned int)atomic_inc_return(&session_id);
if (unlikely(sessionid == AUDIT_SID_UNSET))
sessionid = (unsigned int)atomic_inc_return(&session_id);
}
current->sessionid = sessionid;
current->loginuid = loginuid;
out:
audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
return rc;
}
/**
* audit_signal_info - record signal info for shutting down audit subsystem
* @sig: signal value
* @t: task being signaled
*
* If the audit subsystem is being terminated, record the task (pid)
* and uid that is doing that.
*/
int audit_signal_info(int sig, struct task_struct *t)
{
kuid_t uid = current_uid(), auid;
if (auditd_test_task(t) &&
(sig == SIGTERM || sig == SIGHUP ||
sig == SIGUSR1 || sig == SIGUSR2)) {
audit_sig_pid = task_tgid_nr(current);
auid = audit_get_loginuid(current);
if (uid_valid(auid))
audit_sig_uid = auid;
else
audit_sig_uid = uid;
security_task_getsecid_subj(current, &audit_sig_sid);
}
return audit_signal_info_syscall(t);
}
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
* We can not do a netlink send inside an irq context because it blocks (last
* arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a
* queue and a kthread is scheduled to remove them from the queue outside the
* irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
if (!ab)
return;
if (audit_rate_check()) {
skb = ab->skb;
ab->skb = NULL;
/* setup the netlink header, see the comments in
* kauditd_send_multicast_skb() for length quirks */
nlh = nlmsg_hdr(skb);
nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
/* queue the netlink packet and poke the kauditd thread */
skb_queue_tail(&audit_queue, skb);
wake_up_interruptible(&kauditd_wait);
} else
audit_log_lost("rate limit exceeded");
audit_buffer_free(ab);
}
/**
* audit_log - Log an audit record
* @ctx: audit context
* @gfp_mask: type of allocation
* @type: audit message type
* @fmt: format string to use
* @...: variable parameters matching the format string
*
* This is a convenience function that calls audit_log_start,
* audit_log_vformat, and audit_log_end. It may be called
* in any context.
*/
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
const char *fmt, ...)
{
struct audit_buffer *ab;
va_list args;
ab = audit_log_start(ctx, gfp_mask, type);
if (ab) {
va_start(args, fmt);
audit_log_vformat(ab, fmt, args);
va_end(args);
audit_log_end(ab);
}
}
EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
EXPORT_SYMBOL(audit_log);
// SPDX-License-Identifier: GPL-2.0
/*
* Kernel timekeeping code and accessor functions. Based on code from
* timer.c, moved in commit 8524070b7982.
*/
#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/clock.h>
#include <linux/syscore_ops.h>
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/tick.h>
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
#include <linux/compiler.h>
#include <linux/audit.h>
#include "tick-internal.h"
#include "ntp_internal.h"
#include "timekeeping_internal.h"
#define TK_CLEAR_NTP (1 << 0)
#define TK_MIRROR (1 << 1)
#define TK_CLOCK_WAS_SET (1 << 2)
enum timekeeping_adv_mode {
/* Update timekeeper when a tick has passed */
TK_ADV_TICK,
/* Update timekeeper on a direct frequency change */
TK_ADV_FREQ
};
DEFINE_RAW_SPINLOCK(timekeeper_lock);
/*
* The most important data for readout fits into a single 64 byte
* cache line.
*/
static struct {
seqcount_raw_spinlock_t seq;
struct timekeeper timekeeper;
} tk_core ____cacheline_aligned = {
.seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
};
static struct timekeeper shadow_timekeeper;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
/**
* struct tk_fast - NMI safe timekeeper
* @seq: Sequence counter for protecting updates. The lowest bit
* is the index for the tk_read_base array
* @base: tk_read_base array. Access is indexed by the lowest bit of
* @seq.
*
* See @update_fast_timekeeper() below.
*/
struct tk_fast {
seqcount_latch_t seq;
struct tk_read_base base[2];
};
/* Suspend-time cycles value for halted fast timekeeper. */
static u64 cycles_at_suspend;
static u64 dummy_clock_read(struct clocksource *cs)
{
if (timekeeping_suspended)
return cycles_at_suspend;
return local_clock();
}
static struct clocksource dummy_clock = {
.read = dummy_clock_read,
};
/*
* Boot time initialization which allows local_clock() to be utilized
* during early boot when clocksources are not available. local_clock()
* returns nanoseconds already so no conversion is required, hence mult=1
* and shift=0. When the first proper clocksource is installed then
* the fast time keepers are updated with the correct values.
*/
#define FAST_TK_INIT \
{ \
.clock = &dummy_clock, \
.mask = CLOCKSOURCE_MASK(64), \
.mult = 1, \
.shift = 0, \
}
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
.seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
.base[0] = FAST_TK_INIT,
.base[1] = FAST_TK_INIT,
};
static struct tk_fast tk_fast_raw ____cacheline_aligned = {
.seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
.base[0] = FAST_TK_INIT,
.base[1] = FAST_TK_INIT,
};
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
tk->xtime_sec++;
}
while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
tk->raw_sec++;
}
}
static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
{
struct timespec64 ts;
ts.tv_sec = tk->xtime_sec;
ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
return ts;
}
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec += ts->tv_sec;
tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
}
static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
{
struct timespec64 tmp;
/*
* Verify consistency of: offset_real = -wall_to_monotonic
* before modifying anything
*/
set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
-tk->wall_to_monotonic.tv_nsec);
WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
tk->wall_to_monotonic = wtm;
set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
tk->offs_real = timespec64_to_ktime(tmp);
tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
}
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
tk->offs_boot = ktime_add(tk->offs_boot, delta);
/*
* Timespec representation for VDSO update to avoid 64bit division
* on every update.
*/
tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
}
/*
* tk_clock_read - atomic clocksource read() helper
*
* This helper is necessary to use in the read paths because, while the
* seqcount ensures we don't return a bad value while structures are updated,
* it doesn't protect from potential crashes. There is the possibility that
* the tkr's clocksource may change between the read reference, and the
* clock reference passed to the read function. This can cause crashes if
* the wrong clocksource is passed to the wrong read function.
* This isn't necessary to use when holding the timekeeper_lock or doing
* a read of the fast-timekeeper tkrs (which is protected by its own locking
* and update logic).
*/
static inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
struct clocksource *clock = READ_ONCE(tkr->clock);
return clock->read(clock);
}
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
u64 max_cycles = tk->tkr_mono.clock->max_cycles;
const char *name = tk->tkr_mono.clock->name;
if (offset > max_cycles) {
printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
offset, name, max_cycles);
printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
} else {
if (offset > (max_cycles >> 1)) {
printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n",
offset, name, max_cycles >> 1);
printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
}
}
if (tk->underflow_seen) {
if (jiffies - tk->last_warning > WARNING_FREQ) {
printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
printk_deferred(" Your kernel is probably still fine.\n");
tk->last_warning = jiffies;
}
tk->underflow_seen = 0;
}
if (tk->overflow_seen) {
if (jiffies - tk->last_warning > WARNING_FREQ) {
printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
printk_deferred(" Please report this, consider using a different clocksource, if possible.\n");
printk_deferred(" Your kernel is probably still fine.\n");
tk->last_warning = jiffies;
}
tk->overflow_seen = 0;
}
}
static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
{
struct timekeeper *tk = &tk_core.timekeeper;
u64 now, last, mask, max, delta;
unsigned int seq;
/*
* Since we're called holding a seqcount, the data may shift
* under us while we're doing the calculation. This can cause
* false positives, since we'd note a problem but throw the
* results away. So nest another seqcount here to atomically
* grab the points we are checking with.
*/
do {
seq = read_seqcount_begin(&tk_core.seq);
now = tk_clock_read(tkr);
last = tkr->cycle_last;
mask = tkr->mask;
max = tkr->clock->max_cycles;
} while (read_seqcount_retry(&tk_core.seq, seq));
delta = clocksource_delta(now, last, mask);
/*
* Try to catch underflows by checking if we are seeing small
* mask-relative negative values.
*/
if (unlikely((~delta & mask) < (mask >> 3))) {
tk->underflow_seen = 1;
delta = 0;
}
/* Cap delta value to the max_cycles values to avoid mult overflows */
if (unlikely(delta > max)) {
tk->overflow_seen = 1;
delta = tkr->clock->max_cycles;
}
return delta;
}
#else
static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
}
static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
{
u64 cycle_now, delta;
/* read clocksource */
cycle_now = tk_clock_read(tkr);
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
return delta;
}
#endif
/**
* tk_setup_internals - Set up internals to use clocksource clock.
*
* @tk: The target timekeeper to setup.
* @clock: Pointer to clocksource.
*
* Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
* pair and interval request.
*
* Unless you're the timekeeping code, you should not be using this!
*/
static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
u64 interval;
u64 tmp, ntpinterval;
struct clocksource *old_clock;
++tk->cs_was_changed_seq;
old_clock = tk->tkr_mono.clock;
tk->tkr_mono.clock = clock;
tk->tkr_mono.mask = clock->mask;
tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);
tk->tkr_raw.clock = clock;
tk->tkr_raw.mask = clock->mask;
tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
tmp <<= clock->shift;
ntpinterval = tmp;
tmp += clock->mult/2;
do_div(tmp, clock->mult);
if (tmp == 0)
tmp = 1;
interval = (u64) tmp;
tk->cycle_interval = interval;
/* Go back from cycles -> shifted ns */
tk->xtime_interval = interval * clock->mult;
tk->xtime_remainder = ntpinterval - tk->xtime_interval;
tk->raw_interval = interval * clock->mult;
/* if changing clocks, convert xtime_nsec shift units */
if (old_clock) {
int shift_change = clock->shift - old_clock->shift;
if (shift_change < 0) {
tk->tkr_mono.xtime_nsec >>= -shift_change;
tk->tkr_raw.xtime_nsec >>= -shift_change;
} else {
tk->tkr_mono.xtime_nsec <<= shift_change;
tk->tkr_raw.xtime_nsec <<= shift_change;
}
}
tk->tkr_mono.shift = clock->shift;
tk->tkr_raw.shift = clock->shift;
tk->ntp_error = 0;
tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
tk->ntp_tick = ntpinterval << tk->ntp_error_shift;
/*
* The timekeeper keeps its own mult values for the currently
* active clocksource. These value will be adjusted via NTP
* to counteract clock drifting.
*/
tk->tkr_mono.mult = clock->mult;
tk->tkr_raw.mult = clock->mult;
tk->ntp_err_mult = 0;
tk->skip_second_overflow = 0;
}
/* Timekeeper helper functions. */
static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta)
{
u64 nsec;
nsec = delta * tkr->mult + tkr->xtime_nsec;
nsec >>= tkr->shift;
return nsec;
}
static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
u64 delta;
delta = timekeeping_get_delta(tkr);
return timekeeping_delta_to_ns(tkr, delta);
}
static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
u64 delta;
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
return timekeeping_delta_to_ns(tkr, delta);
}
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
* @tkr: Timekeeping readout base from which we take the update
* @tkf: Pointer to NMI safe timekeeper
*
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself.
*
* Employ the latch technique; see @raw_write_seqcount_latch.
*
* So if a NMI hits the update of base[0] then it will use base[1]
* which is still consistent. In the worst case this can result is a
* slightly wrong timestamp (a few nanoseconds). See
* @ktime_get_mono_fast_ns.
*/
static void update_fast_timekeeper(const struct tk_read_base *tkr,
struct tk_fast *tkf)
{
struct tk_read_base *base = tkf->base;
/* Force readers off to base[1] */
raw_write_seqcount_latch(&tkf->seq);
/* Update base[0] */
memcpy(base, tkr, sizeof(*base));
/* Force readers back to base[0] */
raw_write_seqcount_latch(&tkf->seq);
/* Update base[1] */
memcpy(base + 1, base, sizeof(*base));
}
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
struct tk_read_base *tkr;
unsigned int seq;
u64 now;
do {
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
now += timekeeping_delta_to_ns(tkr,
clocksource_delta(
tk_clock_read(tkr),
tkr->cycle_last,
tkr->mask));
} while (read_seqcount_latch_retry(&tkf->seq, seq));
return now;
}
/**
* ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
*
* This timestamp is not guaranteed to be monotonic across an update.
* The timestamp is calculated by:
*
* now = base_mono + clock_delta * slope
*
* So if the update lowers the slope, readers who are forced to the
* not yet updated second array are still using the old steeper slope.
*
* tmono
* ^
* | o n
* | o n
* | u
* | o
* |o
* |12345678---> reader order
*
* o = old slope
* u = update
* n = new slope
*
* So reader 6 will observe time going backwards versus reader 5.
*
* While other CPUs are likely to be able to observe that, the only way
* for a CPU local observation is when an NMI hits in the middle of
* the update. Timestamps taken from that NMI context might be ahead
* of the following timestamps. Callers need to be aware of that and
* deal with it.
*/
u64 ktime_get_mono_fast_ns(void)
{
return __ktime_get_fast_ns(&tk_fast_mono);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
/**
* ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw
*
* Contrary to ktime_get_mono_fast_ns() this is always correct because the
* conversion factor is not affected by NTP/PTP correction.
*/
u64 ktime_get_raw_fast_ns(void)
{
return __ktime_get_fast_ns(&tk_fast_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
/**
* ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
*
* To keep it NMI safe since we're accessing from tracing, we're not using a
* separate timekeeper with updates to monotonic clock and boot offset
* protected with seqcounts. This has the following minor side effects:
*
* (1) Its possible that a timestamp be taken after the boot offset is updated
* but before the timekeeper is updated. If this happens, the new boot offset
* is added to the old timekeeping making the clock appear to update slightly
* earlier:
* CPU 0 CPU 1
* timekeeping_inject_sleeptime64()
* __timekeeping_inject_sleeptime(tk, delta);
* timestamp();
* timekeeping_update(tk, TK_CLEAR_NTP...);
*
* (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
* partially updated. Since the tk->offs_boot update is a rare event, this
* should be a rare occurrence which postprocessing should be able to handle.
*
* The caveats vs. timestamp ordering as documented for ktime_get_fast_ns()
* apply as well.
*/
u64 notrace ktime_get_boot_fast_ns(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
{
struct tk_read_base *tkr;
u64 basem, baser, delta;
unsigned int seq;
do {
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
basem = ktime_to_ns(tkr->base);
baser = ktime_to_ns(tkr->base_real);
delta = timekeeping_delta_to_ns(tkr,
clocksource_delta(tk_clock_read(tkr),
tkr->cycle_last, tkr->mask));
} while (read_seqcount_latch_retry(&tkf->seq, seq));
if (mono)
*mono = basem + delta;
return baser + delta;
}
/**
* ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
*
* See ktime_get_fast_ns() for documentation of the time stamp ordering.
*/
u64 ktime_get_real_fast_ns(void)
{
return __ktime_get_real_fast(&tk_fast_mono, NULL);
}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
/**
* ktime_get_fast_timestamps: - NMI safe timestamps
* @snapshot: Pointer to timestamp storage
*
* Stores clock monotonic, boottime and realtime timestamps.
*
* Boot time is a racy access on 32bit systems if the sleep time injection
* happens late during resume and not in timekeeping_resume(). That could
* be avoided by expanding struct tk_read_base with boot offset for 32bit
* and adding more overhead to the update. As this is a hard to observe
* once per resume event which can be filtered with reasonable effort using
* the accurate mono/real timestamps, it's probably not worth the trouble.
*
* Aside of that it might be possible on 32 and 64 bit to observe the
* following when the sleep time injection happens late:
*
* CPU 0 CPU 1
* timekeeping_resume()
* ktime_get_fast_timestamps()
* mono, real = __ktime_get_real_fast()
* inject_sleep_time()
* update boot offset
* boot = mono + bootoffset;
*
* That means that boot time already has the sleep time adjustment, but
* real time does not. On the next readout both are in sync again.
*
* Preventing this for 64bit is not really feasible without destroying the
* careful cache layout of the timekeeper because the sequence count and
* struct tk_read_base would then need two cache lines instead of one.
*
* Access to the time keeper clock source is disabled across the innermost
* steps of suspend/resume. The accessors still work, but the timestamps
* are frozen until time keeping is resumed which happens very early.
*
* For regular suspend/resume there is no observable difference vs. sched
* clock, but it might affect some of the nasty low level debug printks.
*
* OTOH, access to sched clock is not guaranteed across suspend/resume on
* all systems either so it depends on the hardware in use.
*
* If that turns out to be a real problem then this could be mitigated by
* using sched clock in a similar way as during early boot. But it's not as
* trivial as on early boot because it needs some careful protection
* against the clock monotonic timestamp jumping backwards on resume.
*/
void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot)
{
struct timekeeper *tk = &tk_core.timekeeper;
snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono);
snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot));
}
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
* @tk: Timekeeper to snapshot.
*
* It generally is unsafe to access the clocksource after timekeeping has been
* suspended, so take a snapshot of the readout base of @tk and use it as the
* fast timekeeper's readout base while suspended. It will return the same
* number of cycles every time until timekeeping is resumed at which time the
* proper readout base for the fast timekeeper will be restored automatically.
*/
static void halt_fast_timekeeper(const struct timekeeper *tk)
{
static struct tk_read_base tkr_dummy;
const struct tk_read_base *tkr = &tk->tkr_mono;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
cycles_at_suspend = tk_clock_read(tkr);
tkr_dummy.clock = &dummy_clock;
tkr_dummy.base_real = tkr->base + tk->offs_real;
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
tkr = &tk->tkr_raw;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
{
raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
}
/**
* pvclock_gtod_register_notifier - register a pvclock timedata update listener
* @nb: Pointer to the notifier block to register
*/
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
int ret;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
update_pvclock_gtod(tk, true);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
/**
* pvclock_gtod_unregister_notifier - unregister a pvclock
* timedata update listener
* @nb: Pointer to the notifier block to unregister
*/
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
unsigned long flags;
int ret;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
/*
* tk_update_leap_state - helper to update the next_leap_ktime
*/
static inline void tk_update_leap_state(struct timekeeper *tk)
{
tk->next_leap_ktime = ntp_get_next_leap();
if (tk->next_leap_ktime != KTIME_MAX)
/* Convert to monotonic time */
tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
}
/*
* Update the ktime_t based scalar nsec members of the timekeeper
*/
static inline void tk_update_ktime_data(struct timekeeper *tk)
{
u64 seconds;
u32 nsec;
/*
* The xtime based monotonic readout is:
* nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
* The ktime based monotonic readout is:
* nsec = base_mono + now();
* ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
*/
seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
nsec = (u32) tk->wall_to_monotonic.tv_nsec;
tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
/*
* The sum of the nanoseconds portions of xtime and
* wall_to_monotonic can be greater/equal one second. Take
* this into account before updating tk->ktime_sec.
*/
nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
if (nsec >= NSEC_PER_SEC)
seconds++;
tk->ktime_sec = seconds;
/* Update the monotonic raw base */
tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
/* must hold timekeeper_lock */
static void timekeeping_update(struct timekeeper *tk, unsigned int action)
{
if (action & TK_CLEAR_NTP) {
tk->ntp_error = 0;
ntp_clear();
}
tk_update_leap_state(tk);
tk_update_ktime_data(tk);
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
if (action & TK_CLOCK_WAS_SET)
tk->clock_was_set_seq++;
/*
* The mirroring of the data to the shadow-timekeeper needs
* to happen last here to ensure we don't over-write the
* timekeeper structure on the next update with stale data
*/
if (action & TK_MIRROR)
memcpy(&shadow_timekeeper, &tk_core.timekeeper,
sizeof(tk_core.timekeeper));
}
/**
* timekeeping_forward_now - update clock to the current time
* @tk: Pointer to the timekeeper to update
*
* Forward the current clock to update its state since the last call to
* update_wall_time(). This is useful before significant clock changes,
* as it avoids having to deal with this time offset explicitly.
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
u64 cycle_now, delta;
cycle_now = tk_clock_read(&tk->tkr_mono);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
tk_normalize_xtime(tk);
}
/**
* ktime_get_real_ts64 - Returns the time of day in a timespec64.
* @ts: pointer to the timespec to be set
*
* Returns the time of day in a timespec64 (WARN if suspended).
*/
void ktime_get_real_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
u64 nsecs;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
ts->tv_nsec = 0;
timespec64_add_ns(ts, nsecs);
}
EXPORT_SYMBOL(ktime_get_real_ts64);
ktime_t ktime_get(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base;
u64 nsecs;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqcount_begin(&tk_core.seq);
base = tk->tkr_mono.base;
nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get);
u32 ktime_get_resolution_ns(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
u32 nsecs;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqcount_begin(&tk_core.seq);
nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift;
} while (read_seqcount_retry(&tk_core.seq, seq));
return nsecs;
}
EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
static ktime_t *offsets[TK_OFFS_MAX] = {
[TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
[TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
[TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai,
};
ktime_t ktime_get_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base, *offset = offsets[offs];
u64 nsecs;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset);
nsecs = timekeeping_get_ns(&tk->tkr_mono);
} while (read_seqcount_retry(&tk_core.seq, seq));
return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_with_offset);
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base, *offset = offsets[offs];
u64 nsecs;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr_mono.base, *offset);
nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
} while (read_seqcount_retry(&tk_core.seq, seq));
return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
/**
* ktime_mono_to_any() - convert monotonic time to any other time
* @tmono: time to convert.
* @offs: which offset to use
*/
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
ktime_t *offset = offsets[offs];
unsigned int seq;
ktime_t tconv;
do {
seq = read_seqcount_begin(&tk_core.seq);
tconv = ktime_add(tmono, *offset);
} while (read_seqcount_retry(&tk_core.seq, seq));
return tconv;
}
EXPORT_SYMBOL_GPL(ktime_mono_to_any);
/**
* ktime_get_raw - Returns the raw monotonic time in ktime_t format
*/
ktime_t ktime_get_raw(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base;
u64 nsecs;
do {
seq = read_seqcount_begin(&tk_core.seq);
base = tk->tkr_raw.base;
nsecs = timekeeping_get_ns(&tk->tkr_raw);
} while (read_seqcount_retry(&tk_core.seq, seq));
return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_raw);
/**
* ktime_get_ts64 - get the monotonic clock in timespec64 format
* @ts: pointer to timespec variable
*
* The function calculates the monotonic clock from the realtime
* clock and the wall_to_monotonic offset and stores the result
* in normalized timespec64 format in the variable pointed to by @ts.
*/
void ktime_get_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 tomono;
unsigned int seq;
u64 nsec;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
nsec = timekeeping_get_ns(&tk->tkr_mono);
tomono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
ts->tv_sec += tomono.tv_sec;
ts->tv_nsec = 0;
timespec64_add_ns(ts, nsec + tomono.tv_nsec);
}
EXPORT_SYMBOL_GPL(ktime_get_ts64);
/**
* ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
*
* Returns the seconds portion of CLOCK_MONOTONIC with a single non
* serialized read. tk->ktime_sec is of type 'unsigned long' so this
* works on both 32 and 64 bit systems. On 32 bit systems the readout
* covers ~136 years of uptime which should be enough to prevent
* premature wrap arounds.
*/
time64_t ktime_get_seconds(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
WARN_ON(timekeeping_suspended); return tk->ktime_sec;
}
EXPORT_SYMBOL_GPL(ktime_get_seconds);
/**
* ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
*
* Returns the wall clock seconds since 1970.
*
* For 64bit systems the fast access to tk->xtime_sec is preserved. On
* 32bit systems the access must be protected with the sequence
* counter to provide "atomic" access to the 64bit tk->xtime_sec
* value.
*/
time64_t ktime_get_real_seconds(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
time64_t seconds;
unsigned int seq;
if (IS_ENABLED(CONFIG_64BIT))
return tk->xtime_sec;
do {
seq = read_seqcount_begin(&tk_core.seq);
seconds = tk->xtime_sec;
} while (read_seqcount_retry(&tk_core.seq, seq));
return seconds;
}
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
/**
* __ktime_get_real_seconds - The same as ktime_get_real_seconds
* but without the sequence counter protect. This internal function
* is called just when timekeeping lock is already held.
*/
noinstr time64_t __ktime_get_real_seconds(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
return tk->xtime_sec;
}
/**
* ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
* @systime_snapshot: pointer to struct receiving the system time snapshot
*/
void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base_raw;
ktime_t base_real;
u64 nsec_raw;
u64 nsec_real;
u64 now;
WARN_ON_ONCE(timekeeping_suspended);
do {
seq = read_seqcount_begin(&tk_core.seq);
now = tk_clock_read(&tk->tkr_mono);
systime_snapshot->cs_id = tk->tkr_mono.clock->id;
systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
base_real = ktime_add(tk->tkr_mono.base,
tk_core.timekeeper.offs_real);
base_raw = tk->tkr_raw.base;
nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
} while (read_seqcount_retry(&tk_core.seq, seq));
systime_snapshot->cycles = now;
systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_snapshot);
/* Scale base by mult/div checking for overflow */
static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
{
u64 tmp, rem;
tmp = div64_u64_rem(*base, div, &rem);
if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) ||
((int)sizeof(u64)*8 - fls64(mult) < fls64(rem)))
return -EOVERFLOW;
tmp *= mult;
rem = div64_u64(rem * mult, div);
*base = tmp + rem;
return 0;
}
/**
* adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
* @history: Snapshot representing start of history
* @partial_history_cycles: Cycle offset into history (fractional part)
* @total_history_cycles: Total history length in cycles
* @discontinuity: True indicates clock was set on history period
* @ts: Cross timestamp that should be adjusted using
* partial/total ratio
*
* Helper function used by get_device_system_crosststamp() to correct the
* crosstimestamp corresponding to the start of the current interval to the
* system counter value (timestamp point) provided by the driver. The
* total_history_* quantities are the total history starting at the provided
* reference point and ending at the start of the current interval. The cycle
* count between the driver timestamp point and the start of the current
* interval is partial_history_cycles.
*/
static int adjust_historical_crosststamp(struct system_time_snapshot *history,
u64 partial_history_cycles,
u64 total_history_cycles,
bool discontinuity,
struct system_device_crosststamp *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
u64 corr_raw, corr_real;
bool interp_forward;
int ret;
if (total_history_cycles == 0 || partial_history_cycles == 0)
return 0;
/* Interpolate shortest distance from beginning or end of history */
interp_forward = partial_history_cycles > total_history_cycles / 2;
partial_history_cycles = interp_forward ?
total_history_cycles - partial_history_cycles :
partial_history_cycles;
/*
* Scale the monotonic raw time delta by:
* partial_history_cycles / total_history_cycles
*/
corr_raw = (u64)ktime_to_ns(
ktime_sub(ts->sys_monoraw, history->raw));
ret = scale64_check_overflow(partial_history_cycles,
total_history_cycles, &corr_raw);
if (ret)
return ret;
/*
* If there is a discontinuity in the history, scale monotonic raw
* correction by:
* mult(real)/mult(raw) yielding the realtime correction
* Otherwise, calculate the realtime correction similar to monotonic
* raw calculation
*/
if (discontinuity) {
corr_real = mul_u64_u32_div
(corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult);
} else {
corr_real = (u64)ktime_to_ns(
ktime_sub(ts->sys_realtime, history->real));
ret = scale64_check_overflow(partial_history_cycles,
total_history_cycles, &corr_real);
if (ret)
return ret;
}
/* Fixup monotonic raw and real time time values */
if (interp_forward) {
ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw);
ts->sys_realtime = ktime_add_ns(history->real, corr_real);
} else {
ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw);
ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real);
}
return 0;
}
/*
* cycle_between - true if test occurs chronologically between before and after
*/
static bool cycle_between(u64 before, u64 test, u64 after)
{
if (test > before && test < after)
return true;
if (test < before && before > after)
return true;
return false;
}
/**
* get_device_system_crosststamp - Synchronously capture system/device timestamp
* @get_time_fn: Callback to get simultaneous device time and
* system counter from the device driver
* @ctx: Context passed to get_time_fn()
* @history_begin: Historical reference point used to interpolate system
* time when counter provided by the driver is before the current interval
* @xtstamp: Receives simultaneously captured system and device time
*
* Reads a timestamp from a device and correlates it to system time
*/
int get_device_system_crosststamp(int (*get_time_fn)
(ktime_t *device_time,
struct system_counterval_t *sys_counterval,
void *ctx),
void *ctx,
struct system_time_snapshot *history_begin,
struct system_device_crosststamp *xtstamp)
{
struct system_counterval_t system_counterval;
struct timekeeper *tk = &tk_core.timekeeper;
u64 cycles, now, interval_start;
unsigned int clock_was_set_seq = 0;
ktime_t base_real, base_raw;
u64 nsec_real, nsec_raw;
u8 cs_was_changed_seq;
unsigned int seq;
bool do_interp;
int ret;
do {
seq = read_seqcount_begin(&tk_core.seq);
/*
* Try to synchronously capture device time and a system
* counter value calling back into the device driver
*/
ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
if (ret)
return ret;
/*
* Verify that the clocksource associated with the captured
* system counter value is the same as the currently installed
* timekeeper clocksource
*/
if (tk->tkr_mono.clock != system_counterval.cs)
return -ENODEV;
cycles = system_counterval.cycles;
/*
* Check whether the system counter value provided by the
* device driver is on the current timekeeping interval.
*/
now = tk_clock_read(&tk->tkr_mono);
interval_start = tk->tkr_mono.cycle_last;
if (!cycle_between(interval_start, cycles, now)) {
clock_was_set_seq = tk->clock_was_set_seq;
cs_was_changed_seq = tk->cs_was_changed_seq;
cycles = interval_start;
do_interp = true;
} else {
do_interp = false;
}
base_real = ktime_add(tk->tkr_mono.base,
tk_core.timekeeper.offs_real);
base_raw = tk->tkr_raw.base;
nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono,
system_counterval.cycles);
nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw,
system_counterval.cycles);
} while (read_seqcount_retry(&tk_core.seq, seq));
xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);
/*
* Interpolate if necessary, adjusting back from the start of the
* current interval
*/
if (do_interp) {
u64 partial_history_cycles, total_history_cycles;
bool discontinuity;
/*
* Check that the counter value occurs after the provided
* history reference and that the history doesn't cross a
* clocksource change
*/
if (!history_begin ||
!cycle_between(history_begin->cycles,
system_counterval.cycles, cycles) ||
history_begin->cs_was_changed_seq != cs_was_changed_seq)
return -EINVAL;
partial_history_cycles = cycles - system_counterval.cycles;
total_history_cycles = cycles - history_begin->cycles;
discontinuity =
history_begin->clock_was_set_seq != clock_was_set_seq;
ret = adjust_historical_crosststamp(history_begin,
partial_history_cycles,
total_history_cycles,
discontinuity, xtstamp);
if (ret)
return ret;
}
return 0;
}
EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
/**
* do_settimeofday64 - Sets the time of day.
* @ts: pointer to the timespec64 variable containing the new time
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
int do_settimeofday64(const struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 ts_delta, xt;
unsigned long flags;
int ret = 0;
if (!timespec64_valid_settod(ts))
return -EINVAL;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
xt = tk_xtime(tk);
ts_delta = timespec64_sub(*ts, xt);
if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
ret = -EINVAL;
goto out;
}
tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
tk_set_xtime(tk, ts);
out:
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* Signal hrtimers about time change */
clock_was_set(CLOCK_SET_WALL);
if (!ret)
audit_tk_injoffset(ts_delta);
return ret;
}
EXPORT_SYMBOL(do_settimeofday64);
/**
* timekeeping_inject_offset - Adds or subtracts from the current time.
* @ts: Pointer to the timespec variable containing the offset
*
* Adds or subtracts an offset value from the current time.
*/
static int timekeeping_inject_offset(const struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
struct timespec64 tmp;
int ret = 0;
if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
/* Make sure the proposed value is valid */
tmp = timespec64_add(tk_xtime(tk), *ts);
if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
!timespec64_valid_settod(&tmp)) {
ret = -EINVAL;
goto error;
}
tk_xtime_add(tk, ts);
tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts));
error: /* even if we error out, we forwarded the time, so call update */
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* Signal hrtimers about time change */
clock_was_set(CLOCK_SET_WALL);
return ret;
}
/*
* Indicates if there is an offset between the system clock and the hardware
* clock/persistent clock/rtc.
*/
int persistent_clock_is_local;
/*
* Adjust the time obtained from the CMOS to be UTC time instead of
* local time.
*
* This is ugly, but preferable to the alternatives. Otherwise we
* would either need to write a program to do it in /etc/rc (and risk
* confusion if the program gets run more than once; it would also be
* hard to make the program warp the clock precisely n hours) or
* compile in the timezone information into the kernel. Bad, bad....
*
* - TYT, 1992-01-01
*
* The best thing to do is to keep the CMOS clock in universal time (UTC)
* as real UNIX machines always do it. This avoids all headaches about
* daylight saving times and warping kernel clocks.
*/
void timekeeping_warp_clock(void)
{
if (sys_tz.tz_minuteswest != 0) {
struct timespec64 adjust;
persistent_clock_is_local = 1;
adjust.tv_sec = sys_tz.tz_minuteswest * 60;
adjust.tv_nsec = 0;
timekeeping_inject_offset(&adjust);
}
}
/*
* __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
*/
static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
{
tk->tai_offset = tai_offset;
tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
}
/*
* change_clocksource - Swaps clocksources if a new one is available
*
* Accumulates current time interval and initializes new clocksource
*/
static int change_clocksource(void *data)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct clocksource *new, *old = NULL;
unsigned long flags;
bool change = false;
new = (struct clocksource *) data;
/*
* If the cs is in module, get a module reference. Succeeds
* for built-in code (owner == NULL) as well.
*/
if (try_module_get(new->owner)) {
if (!new->enable || new->enable(new) == 0)
change = true;
else
module_put(new->owner);
}
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
if (change) {
old = tk->tkr_mono.clock;
tk_setup_internals(tk, new);
}
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
if (old) {
if (old->disable)
old->disable(old);
module_put(old->owner);
}
return 0;
}
/**
* timekeeping_notify - Install a new clock source
* @clock: pointer to the clock source
*
* This function is called from clocksource.c after a new, better clock
* source has been registered. The caller holds the clocksource_mutex.
*/
int timekeeping_notify(struct clocksource *clock)
{
struct timekeeper *tk = &tk_core.timekeeper;
if (tk->tkr_mono.clock == clock)
return 0;
stop_machine(change_clocksource, clock, NULL);
tick_clock_notify();
return tk->tkr_mono.clock == clock ? 0 : -1;
}
/**
* ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
* @ts: pointer to the timespec64 to be set
*
* Returns the raw monotonic time (completely un-modified by ntp)
*/
void ktime_get_raw_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
u64 nsecs;
do {
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->raw_sec;
nsecs = timekeeping_get_ns(&tk->tkr_raw);
} while (read_seqcount_retry(&tk_core.seq, seq));
ts->tv_nsec = 0;
timespec64_add_ns(ts, nsecs);
}
EXPORT_SYMBOL(ktime_get_raw_ts64);
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
*/
int timekeeping_valid_for_hres(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
int ret;
do {
seq = read_seqcount_begin(&tk_core.seq);
ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
} while (read_seqcount_retry(&tk_core.seq, seq));
return ret;
}
/**
* timekeeping_max_deferment - Returns max time the clocksource can be deferred
*/
u64 timekeeping_max_deferment(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
u64 ret;
do {
seq = read_seqcount_begin(&tk_core.seq);
ret = tk->tkr_mono.clock->max_idle_ns;
} while (read_seqcount_retry(&tk_core.seq, seq));
return ret;
}
/**
* read_persistent_clock64 - Return time from the persistent clock.
* @ts: Pointer to the storage for the readout value
*
* Weak dummy function for arches that do not yet support it.
* Reads the time from the battery backed persistent clock.
* Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
*
* XXX - Do be sure to remove it once all arches implement it.
*/
void __weak read_persistent_clock64(struct timespec64 *ts)
{
ts->tv_sec = 0;
ts->tv_nsec = 0;
}
/**
* read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
* from the boot.
*
* Weak dummy function for arches that do not yet support it.
* @wall_time: - current time as returned by persistent clock
* @boot_offset: - offset that is defined as wall_time - boot_time
*
* The default function calculates offset based on the current value of
* local_clock(). This way architectures that support sched_clock() but don't
* support dedicated boot time clock will provide the best estimate of the
* boot time.
*/
void __weak __init
read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
struct timespec64 *boot_offset)
{
read_persistent_clock64(wall_time);
*boot_offset = ns_to_timespec64(local_clock());
}
/*
* Flag reflecting whether timekeeping_resume() has injected sleeptime.
*
* The flag starts of false and is only set when a suspend reaches
* timekeeping_suspend(), timekeeping_resume() sets it to false when the
* timekeeper clocksource is not stopping across suspend and has been
* used to update sleep time. If the timekeeper clocksource has stopped
* then the flag stays true and is used by the RTC resume code to decide
* whether sleeptime must be injected and if so the flag gets false then.
*
* If a suspend fails before reaching timekeeping_resume() then the flag
* stays false and prevents erroneous sleeptime injection.
*/
static bool suspend_timing_needed;
/* Flag for if there is a persistent clock on this platform */
static bool persistent_clock_exists;
/*
* timekeeping_init - Initializes the clocksource and common timekeeping values
*/
void __init timekeeping_init(void)
{
struct timespec64 wall_time, boot_offset, wall_to_mono;
struct timekeeper *tk = &tk_core.timekeeper;
struct clocksource *clock;
unsigned long flags;
read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
if (timespec64_valid_settod(&wall_time) &&
timespec64_to_ns(&wall_time) > 0) {
persistent_clock_exists = true;
} else if (timespec64_to_ns(&wall_time) != 0) {
pr_warn("Persistent clock returned invalid value");
wall_time = (struct timespec64){0};
}
if (timespec64_compare(&wall_time, &boot_offset) < 0)
boot_offset = (struct timespec64){0};
/*
* We want set wall_to_mono, so the following is true:
* wall time + wall_to_mono = boot time
*/
wall_to_mono = timespec64_sub(boot_offset, wall_time);
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
ntp_init();
clock = clocksource_default_clock();
if (clock->enable)
clock->enable(clock);
tk_setup_internals(tk, clock);
tk_set_xtime(tk, &wall_time);
tk->raw_sec = 0;
tk_set_wall_to_mono(tk, wall_to_mono);
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
/* time in seconds when suspend began for persistent clock */
static struct timespec64 timekeeping_suspend_time;
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
* @tk: Pointer to the timekeeper to be updated
* @delta: Pointer to the delta value in timespec64 format
*
* Takes a timespec offset measuring a suspend interval and properly
* adds the sleep offset to the timekeeping variables.
*/
static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
const struct timespec64 *delta)
{
if (!timespec64_valid_strict(delta)) {
printk_deferred(KERN_WARNING
"__timekeeping_inject_sleeptime: Invalid "
"sleep delta value!\n");
return;
}
tk_xtime_add(tk, delta);
tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
tk_debug_account_sleep_time(delta);
}
#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
/**
* We have three kinds of time sources to use for sleep time
* injection, the preference order is:
* 1) non-stop clocksource
* 2) persistent clock (ie: RTC accessible when irqs are off)
* 3) RTC
*
* 1) and 2) are used by timekeeping, 3) by RTC subsystem.
* If system has neither 1) nor 2), 3) will be used finally.
*
*
* If timekeeping has injected sleeptime via either 1) or 2),
* 3) becomes needless, so in this case we don't need to call
* rtc_resume(), and this is what timekeeping_rtc_skipresume()
* means.
*/
bool timekeeping_rtc_skipresume(void)
{
return !suspend_timing_needed;
}
/**
* 1) can be determined whether to use or not only when doing
* timekeeping_resume() which is invoked after rtc_suspend(),
* so we can't skip rtc_suspend() surely if system has 1).
*
* But if system has 2), 2) will definitely be used, so in this
* case we don't need to call rtc_suspend(), and this is what
* timekeeping_rtc_skipsuspend() means.
*/
bool timekeeping_rtc_skipsuspend(void)
{
return persistent_clock_exists;
}
/**
* timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
* @delta: pointer to a timespec64 delta value
*
* This hook is for architectures that cannot support read_persistent_clock64
* because their RTC/persistent clock is only accessible when irqs are enabled.
* and also don't have an effective nonstop clocksource.
*
* This function should only be called by rtc_resume(), and allows
* a suspend offset to be injected into the timekeeping values.
*/
void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
suspend_timing_needed = false;
timekeeping_forward_now(tk);
__timekeeping_inject_sleeptime(tk, delta);
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* Signal hrtimers about time change */
clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT);
}
#endif
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
*/
void timekeeping_resume(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct clocksource *clock = tk->tkr_mono.clock;
unsigned long flags;
struct timespec64 ts_new, ts_delta;
u64 cycle_now, nsec;
bool inject_sleeptime = false;
read_persistent_clock64(&ts_new);
clockevents_resume();
clocksource_resume();
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
/*
* After system resumes, we need to calculate the suspended time and
* compensate it for the OS time. There are 3 sources that could be
* used: Nonstop clocksource during suspend, persistent clock and rtc
* device.
*
* One specific platform may have 1 or 2 or all of them, and the
* preference will be:
* suspend-nonstop clocksource -> persistent clock -> rtc
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
cycle_now = tk_clock_read(&tk->tkr_mono);
nsec = clocksource_stop_suspend_timing(clock, cycle_now);
if (nsec > 0) {
ts_delta = ns_to_timespec64(nsec);
inject_sleeptime = true;
} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
inject_sleeptime = true;
}
if (inject_sleeptime) {
suspend_timing_needed = false;
__timekeeping_inject_sleeptime(tk, &ts_delta);
}
/* Re-base the last cycle value */
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
tk->ntp_error = 0;
timekeeping_suspended = 0;
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
touch_softlockup_watchdog();
/* Resume the clockevent device(s) and hrtimers */
tick_resume();
/* Notify timerfd as resume is equivalent to clock_was_set() */
timerfd_resume();
}
int timekeeping_suspend(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
struct timespec64 delta, delta_delta;
static struct timespec64 old_delta;
struct clocksource *curr_clock;
u64 cycle_now;
read_persistent_clock64(&timekeeping_suspend_time);
/*
* On some systems the persistent_clock can not be detected at
* timekeeping_init by its return value, so if we see a valid
* value returned, update the persistent_clock_exists flag.
*/
if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
persistent_clock_exists = true;
suspend_timing_needed = true;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
timekeeping_forward_now(tk);
timekeeping_suspended = 1;
/*
* Since we've called forward_now, cycle_last stores the value
* just read from the current clocksource. Save this to potentially
* use in suspend timing.
*/
curr_clock = tk->tkr_mono.clock;
cycle_now = tk->tkr_mono.cycle_last;
clocksource_start_suspend_timing(curr_clock, cycle_now);
if (persistent_clock_exists) {
/*
* To avoid drift caused by repeated suspend/resumes,
* which each can add ~1 second drift error,
* try to compensate so the difference in system time
* and persistent_clock time stays close to constant.
*/
delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
delta_delta = timespec64_sub(delta, old_delta);
if (abs(delta_delta.tv_sec) >= 2) {
/*
* if delta_delta is too large, assume time correction
* has occurred and set old_delta to the current delta.
*/
old_delta = delta;
} else {
/* Otherwise try to adjust old_system to compensate */
timekeeping_suspend_time =
timespec64_add(timekeeping_suspend_time, delta_delta);
}
}
timekeeping_update(tk, TK_MIRROR);
halt_fast_timekeeper(tk);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
tick_suspend();
clocksource_suspend();
clockevents_suspend();
return 0;
}
/* sysfs resume/suspend bits for timekeeping */
static struct syscore_ops timekeeping_syscore_ops = {
.resume = timekeeping_resume,
.suspend = timekeeping_suspend,
};
static int __init timekeeping_init_ops(void)
{
register_syscore_ops(&timekeeping_syscore_ops);
return 0;
}
device_initcall(timekeeping_init_ops);
/*
* Apply a multiplier adjustment to the timekeeper
*/
static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
s64 offset,
s32 mult_adj)
{
s64 interval = tk->cycle_interval;
if (mult_adj == 0) {
return;
} else if (mult_adj == -1) {
interval = -interval;
offset = -offset;
} else if (mult_adj != 1) {
interval *= mult_adj;
offset *= mult_adj;
}
/*
* So the following can be confusing.
*
* To keep things simple, lets assume mult_adj == 1 for now.
*
* When mult_adj != 1, remember that the interval and offset values
* have been appropriately scaled so the math is the same.
*
* The basic idea here is that we're increasing the multiplier
* by one, this causes the xtime_interval to be incremented by
* one cycle_interval. This is because:
* xtime_interval = cycle_interval * mult
* So if mult is being incremented by one:
* xtime_interval = cycle_interval * (mult + 1)
* Its the same as:
* xtime_interval = (cycle_interval * mult) + cycle_interval
* Which can be shortened to:
* xtime_interval += cycle_interval
*
* So offset stores the non-accumulated cycles. Thus the current
* time (in shifted nanoseconds) is:
* now = (offset * adj) + xtime_nsec
* Now, even though we're adjusting the clock frequency, we have
* to keep time consistent. In other words, we can't jump back
* in time, and we also want to avoid jumping forward in time.
*
* So given the same offset value, we need the time to be the same
* both before and after the freq adjustment.
* now = (offset * adj_1) + xtime_nsec_1
* now = (offset * adj_2) + xtime_nsec_2
* So:
* (offset * adj_1) + xtime_nsec_1 =
* (offset * adj_2) + xtime_nsec_2
* And we know:
* adj_2 = adj_1 + 1
* So:
* (offset * adj_1) + xtime_nsec_1 =
* (offset * (adj_1+1)) + xtime_nsec_2
* (offset * adj_1) + xtime_nsec_1 =
* (offset * adj_1) + offset + xtime_nsec_2
* Canceling the sides:
* xtime_nsec_1 = offset + xtime_nsec_2
* Which gives us:
* xtime_nsec_2 = xtime_nsec_1 - offset
* Which simplifies to:
* xtime_nsec -= offset
*/
if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
/* NTP adjustment caused clocksource mult overflow */
WARN_ON_ONCE(1);
return;
}
tk->tkr_mono.mult += mult_adj;
tk->xtime_interval += interval;
tk->tkr_mono.xtime_nsec -= offset;
}
/*
* Adjust the timekeeper's multiplier to the correct frequency
* and also to reduce the accumulated error value.
*/
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
u32 mult;
/*
* Determine the multiplier from the current NTP tick length.
* Avoid expensive division when the tick length doesn't change.
*/
if (likely(tk->ntp_tick == ntp_tick_length())) {
mult = tk->tkr_mono.mult - tk->ntp_err_mult;
} else {
tk->ntp_tick = ntp_tick_length();
mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
tk->xtime_remainder, tk->cycle_interval);
}
/*
* If the clock is behind the NTP time, increase the multiplier by 1
* to catch up with it. If it's ahead and there was a remainder in the
* tick division, the clock will slow down. Otherwise it will stay
* ahead until the tick length changes to a non-divisible value.
*/
tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0;
mult += tk->ntp_err_mult;
timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult);
if (unlikely(tk->tkr_mono.clock->maxadj &&
(abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
> tk->tkr_mono.clock->maxadj))) {
printk_once(KERN_WARNING
"Adjusting %s more than 11%% (%ld vs %ld)\n",
tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
(long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
}
/*
* It may be possible that when we entered this function, xtime_nsec
* was very small. Further, if we're slightly speeding the clocksource
* in the code above, its possible the required corrective factor to
* xtime_nsec could cause it to underflow.
*
* Now, since we have already accumulated the second and the NTP
* subsystem has been notified via second_overflow(), we need to skip
* the next update.
*/
if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC <<
tk->tkr_mono.shift;
tk->xtime_sec--;
tk->skip_second_overflow = 1;
}
}
/*
* accumulate_nsecs_to_secs - Accumulates nsecs into secs
*
* Helper function that accumulates the nsecs greater than a second
* from the xtime_nsec field to the xtime_secs field.
* It also calls into the NTP code to handle leapsecond processing.
*/
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
unsigned int clock_set = 0;
while (tk->tkr_mono.xtime_nsec >= nsecps) {
int leap;
tk->tkr_mono.xtime_nsec -= nsecps;
tk->xtime_sec++;
/*
* Skip NTP update if this second was accumulated before,
* i.e. xtime_nsec underflowed in timekeeping_adjust()
*/
if (unlikely(tk->skip_second_overflow)) {
tk->skip_second_overflow = 0;
continue;
}
/* Figure out if its a leap sec and apply if needed */
leap = second_overflow(tk->xtime_sec);
if (unlikely(leap)) {
struct timespec64 ts;
tk->xtime_sec += leap;
ts.tv_sec = leap;
ts.tv_nsec = 0;
tk_set_wall_to_mono(tk,
timespec64_sub(tk->wall_to_monotonic, ts));
__timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
clock_set = TK_CLOCK_WAS_SET;
}
}
return clock_set;
}
/*
* logarithmic_accumulation - shifted accumulation of cycles
*
* This functions accumulates a shifted interval of cycles into
* a shifted interval nanoseconds. Allows for O(log) accumulation
* loop.
*
* Returns the unconsumed cycles.
*/
static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
u32 shift, unsigned int *clock_set)
{
u64 interval = tk->cycle_interval << shift;
u64 snsec_per_sec;
/* If the offset is smaller than a shifted interval, do nothing */
if (offset < interval)
return offset;
/* Accumulate one shifted interval */
offset -= interval;
tk->tkr_mono.cycle_last += interval;
tk->tkr_raw.cycle_last += interval;
tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
tk->tkr_raw.xtime_nsec -= snsec_per_sec;
tk->raw_sec++;
}
/* Accumulate error between NTP and clock interval */
tk->ntp_error += tk->ntp_tick << shift;
tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
(tk->ntp_error_shift + shift);
return offset;
}
/*
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
*/
static bool timekeeping_advance(enum timekeeping_adv_mode mode)
{
struct timekeeper *real_tk = &tk_core.timekeeper;
struct timekeeper *tk = &shadow_timekeeper;
u64 offset;
int shift = 0, maxshift;
unsigned int clock_set = 0;
unsigned long flags;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
goto out;
offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
goto out;
/* Do some additional sanity checking */
timekeeping_check_update(tk, offset);
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* (think "ticks") worth of time at once. To do this efficiently,
* we calculate the largest doubling multiple of cycle_intervals
* that is smaller than the offset. We then accumulate that
* chunk in one go, and then try to consume the next smaller
* doubled multiple.
*/
shift = ilog2(offset) - ilog2(tk->cycle_interval);
shift = max(0, shift);
/* Bound shift to one less than what overflows tick_length */
maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
shift = min(shift, maxshift);
while (offset >= tk->cycle_interval) {
offset = logarithmic_accumulation(tk, offset, shift,
&clock_set);
if (offset < tk->cycle_interval<<shift)
shift--;
}
/* Adjust the multiplier to correct NTP error */
timekeeping_adjust(tk, offset);
/*
* Finally, make sure that after the rounding
* xtime_nsec isn't larger than NSEC_PER_SEC
*/
clock_set |= accumulate_nsecs_to_secs(tk);
write_seqcount_begin(&tk_core.seq);
/*
* Update the real timekeeper.
*
* We could avoid this memcpy by switching pointers, but that
* requires changes to all other timekeeper usage sites as
* well, i.e. move the timekeeper pointer getter into the
* spinlocked/seqcount protected sections. And we trade this
* memcpy under the tk_core.seq against one before we start
* updating.
*/
timekeeping_update(tk, clock_set);
memcpy(real_tk, tk, sizeof(*tk));
/* The memcpy must come last. Do not put anything here! */
write_seqcount_end(&tk_core.seq);
out:
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
return !!clock_set;
}
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
*/
void update_wall_time(void)
{
if (timekeeping_advance(TK_ADV_TICK))
clock_was_set_delayed();
}
/**
* getboottime64 - Return the real time of system boot.
* @ts: pointer to the timespec64 to be set
*
* Returns the wall-time of boot in a timespec64.
*
* This is based on the wall_to_monotonic offset and the total suspend
* time. Calls to settimeofday will affect the value returned (which
* basically means that however wrong your real time clock is at boot time,
* you get the right time here).
*/
void getboottime64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
*ts = ktime_to_timespec64(t);
}
EXPORT_SYMBOL_GPL(getboottime64);
void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
do {
seq = read_seqcount_begin(&tk_core.seq);
*ts = tk_xtime(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
void ktime_get_coarse_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 now, mono;
unsigned int seq;
do {
seq = read_seqcount_begin(&tk_core.seq);
now = tk_xtime(tk);
mono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);
/*
* Must hold jiffies_lock
*/
void do_timer(unsigned long ticks)
{
jiffies_64 += ticks;
calc_global_load();
}
/**
* ktime_get_update_offsets_now - hrtimer helper
* @cwsseq: pointer to check and store the clock was set sequence number
* @offs_real: pointer to storage for monotonic -> realtime offset
* @offs_boot: pointer to storage for monotonic -> boottime offset
* @offs_tai: pointer to storage for monotonic -> clock tai offset
*
* Returns current monotonic time and updates the offsets if the
* sequence number in @cwsseq and timekeeper.clock_was_set_seq are
* different.
*
* Called from hrtimer_interrupt() or retrigger_next_event()
*/
ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
ktime_t *offs_boot, ktime_t *offs_tai)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base;
u64 nsecs;
do {
seq = read_seqcount_begin(&tk_core.seq);
base = tk->tkr_mono.base;
nsecs = timekeeping_get_ns(&tk->tkr_mono);
base = ktime_add_ns(base, nsecs);
if (*cwsseq != tk->clock_was_set_seq) {
*cwsseq = tk->clock_was_set_seq;
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;
*offs_tai = tk->offs_tai;
}
/* Handle leapsecond insertion adjustments */
if (unlikely(base >= tk->next_leap_ktime))
*offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));
} while (read_seqcount_retry(&tk_core.seq, seq));
return base;
}
/*
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
static int timekeeping_validate_timex(const struct __kernel_timex *txc)
{
if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
return -EINVAL;
if (!(txc->modes & ADJ_OFFSET_READONLY) &&
!capable(CAP_SYS_TIME))
return -EPERM;
} else {
/* In order to modify anything, you gotta be super-user! */
if (txc->modes && !capable(CAP_SYS_TIME))
return -EPERM;
/*
* if the quartz is off by more than 10% then
* something is VERY wrong!
*/
if (txc->modes & ADJ_TICK &&
(txc->tick < 900000/USER_HZ ||
txc->tick > 1100000/USER_HZ))
return -EINVAL;
}
if (txc->modes & ADJ_SETOFFSET) {
/* In order to inject time, you gotta be super-user! */
if (!capable(CAP_SYS_TIME))
return -EPERM;
/*
* Validate if a timespec/timeval used to inject a time
* offset is valid. Offsets can be positive or negative, so
* we don't check tv_sec. The value of the timeval/timespec
* is the sum of its fields,but *NOTE*:
* The field tv_usec/tv_nsec must always be non-negative and
* we can't have more nanoseconds/microseconds than a second.
*/
if (txc->time.tv_usec < 0)
return -EINVAL;
if (txc->modes & ADJ_NANO) {
if (txc->time.tv_usec >= NSEC_PER_SEC)
return -EINVAL;
} else {
if (txc->time.tv_usec >= USEC_PER_SEC)
return -EINVAL;
}
}
/*
* Check for potential multiplication overflows that can
* only happen on 64-bit systems:
*/
if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
if (LLONG_MIN / PPM_SCALE > txc->freq)
return -EINVAL;
if (LLONG_MAX / PPM_SCALE < txc->freq)
return -EINVAL;
}
return 0;
}
/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
*/
int do_adjtimex(struct __kernel_timex *txc)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct audit_ntp_data ad;
bool clock_set = false;
struct timespec64 ts;
unsigned long flags;
s32 orig_tai, tai;
int ret;
/* Validate the data before disabling interrupts */
ret = timekeeping_validate_timex(txc);
if (ret)
return ret;
if (txc->modes & ADJ_SETOFFSET) {
struct timespec64 delta;
delta.tv_sec = txc->time.tv_sec;
delta.tv_nsec = txc->time.tv_usec;
if (!(txc->modes & ADJ_NANO))
delta.tv_nsec *= 1000;
ret = timekeeping_inject_offset(&delta);
if (ret)
return ret;
audit_tk_injoffset(delta);
}
audit_ntp_init(&ad);
ktime_get_real_ts64(&ts);
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
orig_tai = tai = tk->tai_offset;
ret = __do_adjtimex(txc, &ts, &tai, &ad);
if (tai != orig_tai) {
__timekeeping_set_tai_offset(tk, tai);
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
clock_set = true;
}
tk_update_leap_state(tk);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
audit_ntp_log(&ad);
/* Update the multiplier immediately if frequency was set directly */
if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
clock_set |= timekeeping_advance(TK_ADV_FREQ);
if (clock_set)
clock_was_set(CLOCK_REALTIME);
ntp_notify_cmos_timer();
return ret;
}
#ifdef CONFIG_NTP_PPS
/**
* hardpps() - Accessor function to NTP __hardpps function
*/
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
unsigned long flags;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
__hardpps(phase_ts, raw_ts);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/srcu.h>
#include <linux/interval_tree.h>
struct mmu_notifier_subscriptions;
struct mmu_notifier;
struct mmu_notifier_range;
struct mmu_interval_notifier;
/**
* enum mmu_notifier_event - reason for the mmu notifier callback
* @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that
* move the range
*
* @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
* madvise() or replacing a page by another one, ...).
*
* @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range
* ie using the vma access permission (vm_page_prot) to update the whole range
* is enough no need to inspect changes to the CPU page table (mprotect()
* syscall)
*
* @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for
* pages in the range so to mirror those changes the user must inspect the CPU
* page table (from the end callback).
*
* @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
* access flags). User should soft dirty the page in the end callback to make
* sure that anyone relying on soft dirtiness catch pages that might be written
* through non CPU mappings.
*
* @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
* that the mm refcount is zero and the range is no longer accessible.
*
* @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
* a device driver to possibly ignore the invalidation if the
* owner field matches the driver's device private pgmap owner.
*
* @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no
* longer have exclusive access to the page. When sent during creation of an
* exclusive range the owner will be initialised to the value provided by the
* caller of make_device_exclusive_range(), otherwise the owner will be NULL.
*/
enum mmu_notifier_event {
MMU_NOTIFY_UNMAP = 0,
MMU_NOTIFY_CLEAR,
MMU_NOTIFY_PROTECTION_VMA,
MMU_NOTIFY_PROTECTION_PAGE,
MMU_NOTIFY_SOFT_DIRTY,
MMU_NOTIFY_RELEASE,
MMU_NOTIFY_MIGRATE,
MMU_NOTIFY_EXCLUSIVE,
};
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
struct mmu_notifier_ops {
/*
* Called either by mmu_notifier_unregister or when the mm is
* being destroyed by exit_mmap, always before all pages are
* freed. This can run concurrently with other mmu notifier
* methods (the ones invoked outside the mm context) and it
* should tear down all secondary mmu mappings and freeze the
* secondary mmu. If this method isn't implemented you've to
* be sure that nothing could possibly write to the pages
* through the secondary mmu by the time the last thread with
* tsk->mm == mm exits.
*
* As side note: the pages freed after ->release returns could
* be immediately reallocated by the gart at an alias physical
* address with a different cache model, so if ->release isn't
* implemented because all _software_ driven memory accesses
* through the secondary mmu are terminated by the time the
* last thread of this mm quits, you've also to be sure that
* speculative _hardware_ operations can't allocate dirty
* cachelines in the cpu that could not be snooped and made
* coherent with the other read and write operations happening
* through the gart alias address, so leading to memory
* corruption.
*/
void (*release)(struct mmu_notifier *subscription,
struct mm_struct *mm);
/*
* clear_flush_young is called after the VM is
* test-and-clearing the young/accessed bitflag in the
* pte. This way the VM will provide proper aging to the
* accesses to the page through the secondary MMUs and not
* only to the ones through the Linux pte.
* Start-end is necessary in case the secondary MMU is mapping the page
* at a smaller granularity than the primary MMU.
*/
int (*clear_flush_young)(struct mmu_notifier *subscription,
struct mm_struct *mm,
unsigned long start,
unsigned long end);
/*
* clear_young is a lightweight version of clear_flush_young. Like the
* latter, it is supposed to test-and-clear the young/accessed bitflag
* in the secondary pte, but it may omit flushing the secondary tlb.
*/
int (*clear_young)(struct mmu_notifier *subscription,
struct mm_struct *mm,
unsigned long start,
unsigned long end);
/*
* test_young is called to check the young/accessed bitflag in
* the secondary pte. This is used to know if the page is
* frequently used without actually clearing the flag or tearing
* down the secondary mapping on the page.
*/
int (*test_young)(struct mmu_notifier *subscription,
struct mm_struct *mm,
unsigned long address);
/*
* change_pte is called in cases that pte mapping to page is changed:
* for example, when ksm remaps pte to point to a new shared page.
*/
void (*change_pte)(struct mmu_notifier *subscription,
struct mm_struct *mm,
unsigned long address,
pte_t pte);
/*
* invalidate_range_start() and invalidate_range_end() must be
* paired and are called only when the mmap_lock and/or the
* locks protecting the reverse maps are held. If the subsystem
* can't guarantee that no additional references are taken to
* the pages in the range, it has to implement the
* invalidate_range() notifier to remove any references taken
* after invalidate_range_start().
*
* Invalidation of multiple concurrent ranges may be
* optionally permitted by the driver. Either way the
* establishment of sptes is forbidden in the range passed to
* invalidate_range_begin/end for the whole duration of the
* invalidate_range_begin/end critical section.
*
* invalidate_range_start() is called when all pages in the
* range are still mapped and have at least a refcount of one.
*
* invalidate_range_end() is called when all pages in the
* range have been unmapped and the pages have been freed by
* the VM.
*
* The VM will remove the page table entries and potentially
* the page between invalidate_range_start() and
* invalidate_range_end(). If the page must not be freed
* because of pending I/O or other circumstances then the
* invalidate_range_start() callback (or the initial mapping
* by the driver) must make sure that the refcount is kept
* elevated.
*
* If the driver increases the refcount when the pages are
* initially mapped into an address space then either
* invalidate_range_start() or invalidate_range_end() may
* decrease the refcount. If the refcount is decreased on
* invalidate_range_start() then the VM can free pages as page
* table entries are removed. If the refcount is only
* dropped on invalidate_range_end() then the driver itself
* will drop the last refcount but it must take care to flush
* any secondary tlb before doing the final free on the
* page. Pages will no longer be referenced by the linux
* address space but may still be referenced by sptes until
* the last refcount is dropped.
*
* If blockable argument is set to false then the callback cannot
* sleep and has to return with -EAGAIN if sleeping would be required.
* 0 should be returned otherwise. Please note that notifiers that can
* fail invalidate_range_start are not allowed to implement
* invalidate_range_end, as there is no mechanism for informing the
* notifier that its start failed.
*/
int (*invalidate_range_start)(struct mmu_notifier *subscription,
const struct mmu_notifier_range *range);
void (*invalidate_range_end)(struct mmu_notifier *subscription,
const struct mmu_notifier_range *range);
/*
* invalidate_range() is either called between
* invalidate_range_start() and invalidate_range_end() when the
* VM has to free pages that where unmapped, but before the
* pages are actually freed, or outside of _start()/_end() when
* a (remote) TLB is necessary.
*
* If invalidate_range() is used to manage a non-CPU TLB with
* shared page-tables, it not necessary to implement the
* invalidate_range_start()/end() notifiers, as
* invalidate_range() already catches the points in time when an
* external TLB range needs to be flushed. For more in depth
* discussion on this see Documentation/vm/mmu_notifier.rst
*
* Note that this function might be called with just a sub-range
* of what was passed to invalidate_range_start()/end(), if
* called between those functions.
*/
void (*invalidate_range)(struct mmu_notifier *subscription,
struct mm_struct *mm,
unsigned long start,
unsigned long end);
/*
* These callbacks are used with the get/put interface to manage the
* lifetime of the mmu_notifier memory. alloc_notifier() returns a new
* notifier for use with the mm.
*
* free_notifier() is only called after the mmu_notifier has been
* fully put, calls to any ops callback are prevented and no ops
* callbacks are currently running. It is called from a SRCU callback
* and cannot sleep.
*/
struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
void (*free_notifier)(struct mmu_notifier *subscription);
};
/*
* The notifier chains are protected by mmap_lock and/or the reverse map
* semaphores. Notifier chains are only changed when all reverse maps and
* the mmap_lock locks are taken.
*
* Therefore notifier chains can only be traversed when either
*
* 1. mmap_lock is held.
* 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
* 3. No other concurrent thread can access the list (release)
*/
struct mmu_notifier {
struct hlist_node hlist;
const struct mmu_notifier_ops *ops;
struct mm_struct *mm;
struct rcu_head rcu;
unsigned int users;
};
/**
* struct mmu_interval_notifier_ops
* @invalidate: Upon return the caller must stop using any SPTEs within this
* range. This function can sleep. Return false only if sleeping
* was required but mmu_notifier_range_blockable(range) is false.
*/
struct mmu_interval_notifier_ops {
bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
const struct mmu_notifier_range *range,
unsigned long cur_seq);
};
struct mmu_interval_notifier {
struct interval_tree_node interval_tree;
const struct mmu_interval_notifier_ops *ops;
struct mm_struct *mm;
struct hlist_node deferred_item;
unsigned long invalidate_seq;
};
#ifdef CONFIG_MMU_NOTIFIER
#ifdef CONFIG_LOCKDEP
extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
#endif
struct mmu_notifier_range {
struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long start;
unsigned long end;
unsigned flags;
enum mmu_notifier_event event;
void *owner;
};
static inline int mm_has_notifiers(struct mm_struct *mm)
{
return unlikely(mm->notifier_subscriptions);
}
struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
struct mm_struct *mm);
static inline struct mmu_notifier *
mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
{
struct mmu_notifier *ret;
mmap_write_lock(mm);
ret = mmu_notifier_get_locked(ops, mm);
mmap_write_unlock(mm);
return ret;
}
void mmu_notifier_put(struct mmu_notifier *subscription);
void mmu_notifier_synchronize(void);
extern int mmu_notifier_register(struct mmu_notifier *subscription,
struct mm_struct *mm);
extern int __mmu_notifier_register(struct mmu_notifier *subscription,
struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *subscription,
struct mm_struct *mm);
unsigned long
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub);
int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
struct mm_struct *mm, unsigned long start,
unsigned long length,
const struct mmu_interval_notifier_ops *ops);
int mmu_interval_notifier_insert_locked(
struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
unsigned long start, unsigned long length,
const struct mmu_interval_notifier_ops *ops);
void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub);
/**
* mmu_interval_set_seq - Save the invalidation sequence
* @interval_sub - The subscription passed to invalidate
* @cur_seq - The cur_seq passed to the invalidate() callback
*
* This must be called unconditionally from the invalidate callback of a
* struct mmu_interval_notifier_ops under the same lock that is used to call
* mmu_interval_read_retry(). It updates the sequence number for later use by
* mmu_interval_read_retry(). The provided cur_seq will always be odd.
*
* If the caller does not call mmu_interval_read_begin() or
* mmu_interval_read_retry() then this call is not required.
*/
static inline void
mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub,
unsigned long cur_seq)
{
WRITE_ONCE(interval_sub->invalidate_seq, cur_seq);
}
/**
* mmu_interval_read_retry - End a read side critical section against a VA range
* interval_sub: The subscription
* seq: The return of the paired mmu_interval_read_begin()
*
* This MUST be called under a user provided lock that is also held
* unconditionally by op->invalidate() when it calls mmu_interval_set_seq().
*
* Each call should be paired with a single mmu_interval_read_begin() and
* should be used to conclude the read side.
*
* Returns true if an invalidation collided with this critical section, and
* the caller should retry.
*/
static inline bool
mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub,
unsigned long seq)
{
return interval_sub->invalidate_seq != seq;
}
/**
* mmu_interval_check_retry - Test if a collision has occurred
* interval_sub: The subscription
* seq: The return of the matching mmu_interval_read_begin()
*
* This can be used in the critical section between mmu_interval_read_begin()
* and mmu_interval_read_retry(). A return of true indicates an invalidation
* has collided with this critical region and a future
* mmu_interval_read_retry() will return true.
*
* False is not reliable and only suggests a collision may not have
* occurred. It can be called many times and does not have to hold the user
* provided lock.
*
* This call can be used as part of loops and other expensive operations to
* expedite a retry.
*/
static inline bool
mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,
unsigned long seq)
{
/* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
return READ_ONCE(interval_sub->invalidate_seq) != seq;
}
extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long start,
unsigned long end);
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
unsigned long start,
unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
unsigned long address, pte_t pte);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end);
extern bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE);
}
static inline void mmu_notifier_release(struct mm_struct *mm)
{
if (mm_has_notifiers(mm))
__mmu_notifier_release(mm);
}
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
if (mm_has_notifiers(mm))
return __mmu_notifier_clear_flush_young(mm, start, end);
return 0;
}
static inline int mmu_notifier_clear_young(struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
if (mm_has_notifiers(mm))
return __mmu_notifier_clear_young(mm, start, end);
return 0;
}
static inline int mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address)
{
if (mm_has_notifiers(mm))
return __mmu_notifier_test_young(mm, address);
return 0;
}
static inline void mmu_notifier_change_pte(struct mm_struct *mm,
unsigned long address, pte_t pte)
{
if (mm_has_notifiers(mm))
__mmu_notifier_change_pte(mm, address, pte);
}
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
might_sleep();
lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
if (mm_has_notifiers(range->mm)) {
range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
__mmu_notifier_invalidate_range_start(range);
}
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}
static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
int ret = 0;
lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
if (mm_has_notifiers(range->mm)) {
range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
ret = __mmu_notifier_invalidate_range_start(range);
}
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
return ret;
}
static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
if (mmu_notifier_range_blockable(range))
might_sleep();
if (mm_has_notifiers(range->mm))
__mmu_notifier_invalidate_range_end(range, false);
}
static inline void
mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
{
if (mm_has_notifiers(range->mm))
__mmu_notifier_invalidate_range_end(range, true);
}
static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
if (mm_has_notifiers(mm))
__mmu_notifier_invalidate_range(mm, start, end);
}
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
mm->notifier_subscriptions = NULL;
}
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
if (mm_has_notifiers(mm))
__mmu_notifier_subscriptions_destroy(mm);
}
static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
enum mmu_notifier_event event,
unsigned flags,
struct vm_area_struct *vma,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
range->vma = vma;
range->event = event;
range->mm = mm;
range->start = start;
range->end = end;
range->flags = flags;
}
static inline void mmu_notifier_range_init_owner(
struct mmu_notifier_range *range,
enum mmu_notifier_event event, unsigned int flags,
struct vm_area_struct *vma, struct mm_struct *mm,
unsigned long start, unsigned long end, void *owner)
{
mmu_notifier_range_init(range, event, flags, vma, mm, start, end);
range->owner = owner;
}
#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
({ \
int __young; \
struct vm_area_struct *___vma = __vma; \
unsigned long ___address = __address; \
__young = ptep_clear_flush_young(___vma, ___address, __ptep); \
__young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
___address, \
___address + \
PAGE_SIZE); \
__young; \
})
#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \
({ \
int __young; \
struct vm_area_struct *___vma = __vma; \
unsigned long ___address = __address; \
__young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \
__young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \
___address, \
___address + \
PMD_SIZE); \
__young; \
})
#define ptep_clear_young_notify(__vma, __address, __ptep) \
({ \
int __young; \
struct vm_area_struct *___vma = __vma; \
unsigned long ___address = __address; \
__young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \
___address + PAGE_SIZE); \
__young; \
})
#define pmdp_clear_young_notify(__vma, __address, __pmdp) \
({ \
int __young; \
struct vm_area_struct *___vma = __vma; \
unsigned long ___address = __address; \
__young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \
___address + PMD_SIZE); \
__young; \
})
#define ptep_clear_flush_notify(__vma, __address, __ptep) \
({ \
unsigned long ___addr = __address & PAGE_MASK; \
struct mm_struct *___mm = (__vma)->vm_mm; \
pte_t ___pte; \
\
___pte = ptep_clear_flush(__vma, __address, __ptep); \
mmu_notifier_invalidate_range(___mm, ___addr, \
___addr + PAGE_SIZE); \
\
___pte; \
})
#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \
({ \
unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
struct mm_struct *___mm = (__vma)->vm_mm; \
pmd_t ___pmd; \
\
___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \
mmu_notifier_invalidate_range(___mm, ___haddr, \
___haddr + HPAGE_PMD_SIZE); \
\
___pmd; \
})
#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \
({ \
unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \
struct mm_struct *___mm = (__vma)->vm_mm; \
pud_t ___pud; \
\
___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \
mmu_notifier_invalidate_range(___mm, ___haddr, \
___haddr + HPAGE_PUD_SIZE); \
\
___pud; \
})
/*
* set_pte_at_notify() sets the pte _after_ running the notifier.
* This is safe to start by updating the secondary MMUs, because the primary MMU
* pte invalidate must have already happened with a ptep_clear_flush() before
* set_pte_at_notify() has been invoked. Updating the secondary MMUs first is
* required when we change both the protection of the mapping from read-only to
* read-write and the pfn (like during copy on write page faults). Otherwise the
* old page would remain mapped readonly in the secondary MMUs after the new
* page is already writable by some CPU through the primary MMU.
*/
#define set_pte_at_notify(__mm, __address, __ptep, __pte) \
({ \
struct mm_struct *___mm = __mm; \
unsigned long ___address = __address; \
pte_t ___pte = __pte; \
\
mmu_notifier_change_pte(___mm, ___address, ___pte); \
set_pte_at(___mm, ___address, __ptep, ___pte); \
})
#else /* CONFIG_MMU_NOTIFIER */
struct mmu_notifier_range {
unsigned long start;
unsigned long end;
};
static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
unsigned long start,
unsigned long end)
{
range->start = start;
range->end = end;
}
#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \
_mmu_notifier_range_init(range, start, end)
#define mmu_notifier_range_init_owner(range, event, flags, vma, mm, start, \
end, owner) \
_mmu_notifier_range_init(range, start, end)
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
return true;
}
static inline int mm_has_notifiers(struct mm_struct *mm)
{
return 0;
}
static inline void mmu_notifier_release(struct mm_struct *mm)
{
}
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
return 0;
}
static inline int mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address)
{
return 0;
}
static inline void mmu_notifier_change_pte(struct mm_struct *mm,
unsigned long address, pte_t pte)
{
}
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
}
static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
return 0;
}
static inline
void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
}
static inline void
mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
{
}
static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
}
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
}
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
}
#define mmu_notifier_range_update_to_read_only(r) false
#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
#define pmdp_clear_young_notify pmdp_test_and_clear_young
#define ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
#define set_pte_at_notify set_pte_at
static inline void mmu_notifier_synchronize(void)
{
}
#endif /* CONFIG_MMU_NOTIFIER */
#endif /* _LINUX_MMU_NOTIFIER_H */
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/bitops.h>
#include <asm/word-at-a-time.h>
/*
* Do a strnlen, return length of string *with* final '\0'.
* 'count' is the user-supplied count, while 'max' is the
* address space maximum.
*
* Return 0 for exceptions (which includes hitting the address
* space maximum), or 'count+1' if hitting the user-supplied
* maximum count.
*
* NOTE! We can sometimes overshoot the user-supplied maximum
* if it fits in a aligned 'long'. The caller needs to check
* the return value against "> max".
*/
static inline long do_strnlen_user(const char __user *src, unsigned long count, unsigned long max)
{
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
unsigned long align, res = 0;
unsigned long c;
/*
* Do everything aligned. But that means that we
* need to also expand the maximum..
*/
align = (sizeof(unsigned long) - 1) & (unsigned long)src;
src -= align;
max += align;
unsafe_get_user(c, (unsigned long __user *)src, efault);
c |= aligned_byte_mask(align);
for (;;) {
unsigned long data;
if (has_zero(c, &data, &constants)) {
data = prep_zero_mask(c, data, &constants);
data = create_zero_mask(data);
return res + find_zero(data) + 1 - align;
}
res += sizeof(unsigned long);
/* We already handled 'unsigned long' bytes. Did we do it all ? */
if (unlikely(max <= sizeof(unsigned long)))
break;
max -= sizeof(unsigned long);
unsafe_get_user(c, (unsigned long __user *)(src+res), efault);
}
res -= align;
/*
* Uhhuh. We hit 'max'. But was that the user-specified maximum
* too? If so, return the marker for "too long".
*/
if (res >= count)
return count+1;
/*
* Nope: we hit the address space limit, and we still had more
* characters the caller would have wanted. That's 0.
*/
efault:
return 0;
}
/**
* strnlen_user: - Get the size of a user string INCLUDING final NUL.
* @str: The string to measure.
* @count: Maximum count (including NUL character)
*
* Context: User context only. This function may sleep if pagefaults are
* enabled.
*
* Get the size of a NUL-terminated string in user space.
*
* Returns the size of the string INCLUDING the terminating NUL.
* If the string is too long, returns a number larger than @count. User
* has to check the return value against "> count".
* On exception (or invalid count), returns 0.
*
* NOTE! You should basically never use this function. There is
* almost never any valid case for using the length of a user space
* string, since the string can be changed at any time by other
* threads. Use "strncpy_from_user()" instead to get a stable copy
* of the string.
*/
long strnlen_user(const char __user *str, long count)
{
unsigned long max_addr, src_addr;
if (unlikely(count <= 0))
return 0;
max_addr = user_addr_max();
src_addr = (unsigned long)untagged_addr(str);
if (likely(src_addr < max_addr)) {
unsigned long max = max_addr - src_addr;
long retval;
/*
* Truncate 'max' to the user-specified limit, so that
* we only have one limit we need to check in the loop
*/
if (max > count)
max = count;
if (user_read_access_begin(str, max)) {
retval = do_strnlen_user(str, count, max);
user_read_access_end();
return retval;
}
}
return 0;
}
EXPORT_SYMBOL(strnlen_user);
// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/fcntl.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#include <linux/syscalls.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/sched/task.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/pipe_fs_i.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/memfd.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/poll.h>
#include <asm/siginfo.h>
#include <linux/uaccess.h>
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
static int setfl(int fd, struct file * filp, unsigned long arg)
{
struct inode * inode = file_inode(filp);
int error = 0;
/*
* O_APPEND cannot be cleared if the file is marked as append-only
* and the file is open for write.
*/
if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode))
return -EPERM;
/* O_NOATIME can only be set by the owner or superuser */
if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
if (!inode_owner_or_capable(file_mnt_user_ns(filp), inode))
return -EPERM;
/* required for strict SunOS emulation */
if (O_NONBLOCK != O_NDELAY)
if (arg & O_NDELAY)
arg |= O_NONBLOCK;
/* Pipe packetized mode is controlled by O_DIRECT flag */
if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) {
if (!filp->f_mapping || !filp->f_mapping->a_ops ||
!filp->f_mapping->a_ops->direct_IO)
return -EINVAL;
}
if (filp->f_op->check_flags)
error = filp->f_op->check_flags(arg);
if (error)
return error;
/*
* ->fasync() is responsible for setting the FASYNC bit.
*/
if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) {
error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
if (error < 0)
goto out;
if (error > 0)
error = 0;
}
spin_lock(&filp->f_lock);
filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
spin_unlock(&filp->f_lock);
out:
return error;
}
static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
int force)
{
write_lock_irq(&filp->f_owner.lock);
if (force || !filp->f_owner.pid) {
put_pid(filp->f_owner.pid);
filp->f_owner.pid = get_pid(pid);
filp->f_owner.pid_type = type;
if (pid) {
const struct cred *cred = current_cred();
filp->f_owner.uid = cred->uid;
filp->f_owner.euid = cred->euid;
}
}
write_unlock_irq(&filp->f_owner.lock);
}
void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
int force)
{
security_file_set_fowner(filp);
f_modown(filp, pid, type, force);
}
EXPORT_SYMBOL(__f_setown);
int f_setown(struct file *filp, unsigned long arg, int force)
{
enum pid_type type;
struct pid *pid = NULL;
int who = arg, ret = 0;
type = PIDTYPE_TGID;
if (who < 0) {
/* avoid overflow below */
if (who == INT_MIN)
return -EINVAL;
type = PIDTYPE_PGID;
who = -who;
}
rcu_read_lock();
if (who) {
pid = find_vpid(who);
if (!pid)
ret = -ESRCH;
}
if (!ret)
__f_setown(filp, pid, type, force);
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(f_setown);
void f_delown(struct file *filp)
{
f_modown(filp, NULL, PIDTYPE_TGID, 1);
}
pid_t f_getown(struct file *filp)
{
pid_t pid = 0;
read_lock_irq(&filp->f_owner.lock);
rcu_read_lock();
if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
pid = pid_vnr(filp->f_owner.pid);
if (filp->f_owner.pid_type == PIDTYPE_PGID)
pid = -pid;
}
rcu_read_unlock();
read_unlock_irq(&filp->f_owner.lock);
return pid;
}
static int f_setown_ex(struct file *filp, unsigned long arg)
{
struct f_owner_ex __user *owner_p = (void __user *)arg;
struct f_owner_ex owner;
struct pid *pid;
int type;
int ret;
ret = copy_from_user(&owner, owner_p, sizeof(owner));
if (ret)
return -EFAULT;
switch (owner.type) {
case F_OWNER_TID:
type = PIDTYPE_PID;
break;
case F_OWNER_PID:
type = PIDTYPE_TGID;
break;
case F_OWNER_PGRP:
type = PIDTYPE_PGID;
break;
default:
return -EINVAL;
}
rcu_read_lock();
pid = find_vpid(owner.pid);
if (owner.pid && !pid)
ret = -ESRCH;
else
__f_setown(filp, pid, type, 1);
rcu_read_unlock();
return ret;
}
static int f_getown_ex(struct file *filp, unsigned long arg)
{
struct f_owner_ex __user *owner_p = (void __user *)arg;
struct f_owner_ex owner = {};
int ret = 0;
read_lock_irq(&filp->f_owner.lock);
rcu_read_lock();
if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
owner.pid = pid_vnr(filp->f_owner.pid);
rcu_read_unlock();
switch (filp->f_owner.pid_type) {
case PIDTYPE_PID:
owner.type = F_OWNER_TID;
break;
case PIDTYPE_TGID:
owner.type = F_OWNER_PID;
break;
case PIDTYPE_PGID:
owner.type = F_OWNER_PGRP;
break;
default:
WARN_ON(1);
ret = -EINVAL;
break;
}
read_unlock_irq(&filp->f_owner.lock);
if (!ret) {
ret = copy_to_user(owner_p, &owner, sizeof(owner));
if (ret)
ret = -EFAULT;
}
return ret;
}
#ifdef CONFIG_CHECKPOINT_RESTORE
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
struct user_namespace *user_ns = current_user_ns();
uid_t __user *dst = (void __user *)arg;
uid_t src[2];
int err;
read_lock_irq(&filp->f_owner.lock);
src[0] = from_kuid(user_ns, filp->f_owner.uid);
src[1] = from_kuid(user_ns, filp->f_owner.euid);
read_unlock_irq(&filp->f_owner.lock);
err = put_user(src[0], &dst[0]);
err |= put_user(src[1], &dst[1]);
return err;
}
#else
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
return -EINVAL;
}
#endif
static bool rw_hint_valid(enum rw_hint hint)
{
switch (hint) {
case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
case RWH_WRITE_LIFE_SHORT:
case RWH_WRITE_LIFE_MEDIUM:
case RWH_WRITE_LIFE_LONG:
case RWH_WRITE_LIFE_EXTREME:
return true;
default:
return false;
}
}
static long fcntl_rw_hint(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
enum rw_hint hint;
u64 h;
switch (cmd) {
case F_GET_FILE_RW_HINT:
h = file_write_hint(file);
if (copy_to_user(argp, &h, sizeof(*argp)))
return -EFAULT;
return 0;
case F_SET_FILE_RW_HINT:
if (copy_from_user(&h, argp, sizeof(h)))
return -EFAULT;
hint = (enum rw_hint) h;
if (!rw_hint_valid(hint))
return -EINVAL;
spin_lock(&file->f_lock);
file->f_write_hint = hint;
spin_unlock(&file->f_lock);
return 0;
case F_GET_RW_HINT:
h = inode->i_write_hint;
if (copy_to_user(argp, &h, sizeof(*argp)))
return -EFAULT;
return 0;
case F_SET_RW_HINT:
if (copy_from_user(&h, argp, sizeof(h)))
return -EFAULT;
hint = (enum rw_hint) h;
if (!rw_hint_valid(hint))
return -EINVAL;
inode_lock(inode);
inode->i_write_hint = hint;
inode_unlock(inode);
return 0;
default:
return -EINVAL;
}
}
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
void __user *argp = (void __user *)arg;
struct flock flock;
long err = -EINVAL;
switch (cmd) {
case F_DUPFD:
err = f_dupfd(arg, filp, 0);
break;
case F_DUPFD_CLOEXEC:
err = f_dupfd(arg, filp, O_CLOEXEC);
break;
case F_GETFD:
err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
break;
case F_SETFD:
err = 0;
set_close_on_exec(fd, arg & FD_CLOEXEC);
break;
case F_GETFL:
err = filp->f_flags;
break;
case F_SETFL:
err = setfl(fd, filp, arg);
break;
#if BITS_PER_LONG != 32
/* 32-bit arches must use fcntl64() */
case F_OFD_GETLK:
#endif
case F_GETLK:
if (copy_from_user(&flock, argp, sizeof(flock)))
return -EFAULT;
err = fcntl_getlk(filp, cmd, &flock);
if (!err && copy_to_user(argp, &flock, sizeof(flock)))
return -EFAULT;
break;
#if BITS_PER_LONG != 32
/* 32-bit arches must use fcntl64() */
case F_OFD_SETLK:
case F_OFD_SETLKW:
fallthrough;
#endif
case F_SETLK:
case F_SETLKW:
if (copy_from_user(&flock, argp, sizeof(flock)))
return -EFAULT;
err = fcntl_setlk(fd, filp, cmd, &flock);
break;
case F_GETOWN:
/*
* XXX If f_owner is a process group, the
* negative return value will get converted
* into an error. Oops. If we keep the
* current syscall conventions, the only way
* to fix this will be in libc.
*/
err = f_getown(filp);
force_successful_syscall_return();
break;
case F_SETOWN:
err = f_setown(filp, arg, 1);
break;
case F_GETOWN_EX:
err = f_getown_ex(filp, arg);
break;
case F_SETOWN_EX:
err = f_setown_ex(filp, arg);
break;
case F_GETOWNER_UIDS:
err = f_getowner_uids(filp, arg);
break;
case F_GETSIG:
err = filp->f_owner.signum;
break;
case F_SETSIG:
/* arg == 0 restores default behaviour. */
if (!valid_signal(arg)) {
break;
}
err = 0;
filp->f_owner.signum = arg;
break;
case F_GETLEASE:
err = fcntl_getlease(filp);
break;
case F_SETLEASE:
err = fcntl_setlease(fd, filp, arg);
break;
case F_NOTIFY:
err = fcntl_dirnotify(fd, filp, arg);
break;
case F_SETPIPE_SZ:
case F_GETPIPE_SZ:
err = pipe_fcntl(filp, cmd, arg);
break;
case F_ADD_SEALS:
case F_GET_SEALS:
err = memfd_fcntl(filp, cmd, arg);
break;
case F_GET_RW_HINT:
case F_SET_RW_HINT:
case F_GET_FILE_RW_HINT:
case F_SET_FILE_RW_HINT:
err = fcntl_rw_hint(filp, cmd, arg);
break;
default:
break;
}
return err;
}
static int check_fcntl_cmd(unsigned cmd)
{
switch (cmd) {
case F_DUPFD:
case F_DUPFD_CLOEXEC:
case F_GETFD:
case F_SETFD:
case F_GETFL:
return 1;
}
return 0;
}
SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
struct fd f = fdget_raw(fd);
long err = -EBADF;
if (!f.file)
goto out;
if (unlikely(f.file->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
goto out1;
}
err = security_file_fcntl(f.file, cmd, arg);
if (!err)
err = do_fcntl(fd, cmd, arg, f.file);
out1:
fdput(f);
out:
return err;
}
#if BITS_PER_LONG == 32
SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
unsigned long, arg)
{
void __user *argp = (void __user *)arg;
struct fd f = fdget_raw(fd);
struct flock64 flock;
long err = -EBADF;
if (!f.file)
goto out;
if (unlikely(f.file->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
goto out1;
}
err = security_file_fcntl(f.file, cmd, arg);
if (err)
goto out1;
switch (cmd) {
case F_GETLK64:
case F_OFD_GETLK:
err = -EFAULT;
if (copy_from_user(&flock, argp, sizeof(flock)))
break;
err = fcntl_getlk64(f.file, cmd, &flock);
if (!err && copy_to_user(argp, &flock, sizeof(flock)))
err = -EFAULT;
break;
case F_SETLK64:
case F_SETLKW64:
case F_OFD_SETLK:
case F_OFD_SETLKW:
err = -EFAULT;
if (copy_from_user(&flock, argp, sizeof(flock)))
break;
err = fcntl_setlk64(fd, f.file, cmd, &flock);
break;
default:
err = do_fcntl(fd, cmd, arg, f.file);
break;
}
out1:
fdput(f);
out:
return err;
}
#endif
#ifdef CONFIG_COMPAT
/* careful - don't use anywhere else */
#define copy_flock_fields(dst, src) \
(dst)->l_type = (src)->l_type; \
(dst)->l_whence = (src)->l_whence; \
(dst)->l_start = (src)->l_start; \
(dst)->l_len = (src)->l_len; \
(dst)->l_pid = (src)->l_pid;
static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl)
{
struct compat_flock fl;
if (copy_from_user(&fl, ufl, sizeof(struct compat_flock)))
return -EFAULT;
copy_flock_fields(kfl, &fl);
return 0;
}
static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl)
{
struct compat_flock64 fl;
if (copy_from_user(&fl, ufl, sizeof(struct compat_flock64)))
return -EFAULT;
copy_flock_fields(kfl, &fl);
return 0;
}
static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl)
{
struct compat_flock fl;
memset(&fl, 0, sizeof(struct compat_flock));
copy_flock_fields(&fl, kfl);
if (copy_to_user(ufl, &fl, sizeof(struct compat_flock)))
return -EFAULT;
return 0;
}
static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl)
{
struct compat_flock64 fl;
BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start));
BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len));
memset(&fl, 0, sizeof(struct compat_flock64));
copy_flock_fields(&fl, kfl);
if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64)))
return -EFAULT;
return 0;
}
#undef copy_flock_fields
static unsigned int
convert_fcntl_cmd(unsigned int cmd)
{
switch (cmd) {
case F_GETLK64:
return F_GETLK;
case F_SETLK64:
return F_SETLK;
case F_SETLKW64:
return F_SETLKW;
}
return cmd;
}
/*
* GETLK was successful and we need to return the data, but it needs to fit in
* the compat structure.
* l_start shouldn't be too big, unless the original start + end is greater than
* COMPAT_OFF_T_MAX, in which case the app was asking for trouble, so we return
* -EOVERFLOW in that case. l_len could be too big, in which case we just
* truncate it, and only allow the app to see that part of the conflicting lock
* that might make sense to it anyway
*/
static int fixup_compat_flock(struct flock *flock)
{
if (flock->l_start > COMPAT_OFF_T_MAX)
return -EOVERFLOW;
if (flock->l_len > COMPAT_OFF_T_MAX)
flock->l_len = COMPAT_OFF_T_MAX;
return 0;
}
static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
compat_ulong_t arg)
{
struct fd f = fdget_raw(fd);
struct flock flock;
long err = -EBADF;
if (!f.file)
return err;
if (unlikely(f.file->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
goto out_put;
}
err = security_file_fcntl(f.file, cmd, arg);
if (err)
goto out_put;
switch (cmd) {
case F_GETLK:
err = get_compat_flock(&flock, compat_ptr(arg));
if (err)
break;
err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
if (err)
break;
err = fixup_compat_flock(&flock);
if (!err)
err = put_compat_flock(&flock, compat_ptr(arg));
break;
case F_GETLK64:
case F_OFD_GETLK:
err = get_compat_flock64(&flock, compat_ptr(arg));
if (err)
break;
err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
if (!err)
err = put_compat_flock64(&flock, compat_ptr(arg));
break;
case F_SETLK:
case F_SETLKW:
err = get_compat_flock(&flock, compat_ptr(arg));
if (err)
break;
err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
break;
case F_SETLK64:
case F_SETLKW64:
case F_OFD_SETLK:
case F_OFD_SETLKW:
err = get_compat_flock64(&flock, compat_ptr(arg));
if (err)
break;
err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
break;
default:
err = do_fcntl(fd, cmd, arg, f.file);
break;
}
out_put:
fdput(f);
return err;
}
COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
compat_ulong_t, arg)
{
return do_compat_fcntl64(fd, cmd, arg);
}
COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
compat_ulong_t, arg)
{
switch (cmd) {
case F_GETLK64:
case F_SETLK64:
case F_SETLKW64:
case F_OFD_GETLK:
case F_OFD_SETLK:
case F_OFD_SETLKW:
return -EINVAL;
}
return do_compat_fcntl64(fd, cmd, arg);
}
#endif
/* Table to convert sigio signal codes into poll band bitmaps */
static const __poll_t band_table[NSIGPOLL] = {
EPOLLIN | EPOLLRDNORM, /* POLL_IN */
EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND, /* POLL_OUT */
EPOLLIN | EPOLLRDNORM | EPOLLMSG, /* POLL_MSG */
EPOLLERR, /* POLL_ERR */
EPOLLPRI | EPOLLRDBAND, /* POLL_PRI */
EPOLLHUP | EPOLLERR /* POLL_HUP */
};
static inline int sigio_perm(struct task_struct *p,
struct fown_struct *fown, int sig)
{
const struct cred *cred;
int ret;
rcu_read_lock();
cred = __task_cred(p);
ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) ||
uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) ||
uid_eq(fown->uid, cred->suid) || uid_eq(fown->uid, cred->uid)) &&
!security_file_send_sigiotask(p, fown, sig));
rcu_read_unlock();
return ret;
}
static void send_sigio_to_task(struct task_struct *p,
struct fown_struct *fown,
int fd, int reason, enum pid_type type)
{
/*
* F_SETSIG can change ->signum lockless in parallel, make
* sure we read it once and use the same value throughout.
*/
int signum = READ_ONCE(fown->signum);
if (!sigio_perm(p, fown, signum))
return;
switch (signum) {
default: {
kernel_siginfo_t si;
/* Queue a rt signal with the appropriate fd as its
value. We use SI_SIGIO as the source, not
SI_KERNEL, since kernel signals always get
delivered even if we can't queue. Failure to
queue in this case _should_ be reported; we fall
back to SIGIO in that case. --sct */
clear_siginfo(&si);
si.si_signo = signum;
si.si_errno = 0;
si.si_code = reason;
/*
* Posix definies POLL_IN and friends to be signal
* specific si_codes for SIG_POLL. Linux extended
* these si_codes to other signals in a way that is
* ambiguous if other signals also have signal
* specific si_codes. In that case use SI_SIGIO instead
* to remove the ambiguity.
*/
if ((signum != SIGPOLL) && sig_specific_sicodes(signum))
si.si_code = SI_SIGIO;
/* Make sure we are called with one of the POLL_*
reasons, otherwise we could leak kernel stack into
userspace. */
BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL));
if (reason - POLL_IN >= NSIGPOLL)
si.si_band = ~0L;
else
si.si_band = mangle_poll(band_table[reason - POLL_IN]);
si.si_fd = fd;
if (!do_send_sig_info(signum, &si, p, type))
break;
}
fallthrough; /* fall back on the old plain SIGIO signal */
case 0:
do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
}
}
void send_sigio(struct fown_struct *fown, int fd, int band)
{
struct task_struct *p;
enum pid_type type;
unsigned long flags;
struct pid *pid;
read_lock_irqsave(&fown->lock, flags);
type = fown->pid_type;
pid = fown->pid;
if (!pid)
goto out_unlock_fown;
if (type <= PIDTYPE_TGID) {
rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (p)
send_sigio_to_task(p, fown, fd, band, type);
rcu_read_unlock();
} else {
read_lock(&tasklist_lock);
do_each_pid_task(pid, type, p) {
send_sigio_to_task(p, fown, fd, band, type);
} while_each_pid_task(pid, type, p);
read_unlock(&tasklist_lock);
}
out_unlock_fown:
read_unlock_irqrestore(&fown->lock, flags);
}
static void send_sigurg_to_task(struct task_struct *p,
struct fown_struct *fown, enum pid_type type)
{
if (sigio_perm(p, fown, SIGURG))
do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type);
}
int send_sigurg(struct fown_struct *fown)
{
struct task_struct *p;
enum pid_type type;
struct pid *pid;
unsigned long flags;
int ret = 0;
read_lock_irqsave(&fown->lock, flags);
type = fown->pid_type;
pid = fown->pid;
if (!pid)
goto out_unlock_fown;
ret = 1;
if (type <= PIDTYPE_TGID) {
rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (p)
send_sigurg_to_task(p, fown, type);
rcu_read_unlock();
} else {
read_lock(&tasklist_lock);
do_each_pid_task(pid, type, p) {
send_sigurg_to_task(p, fown, type);
} while_each_pid_task(pid, type, p);
read_unlock(&tasklist_lock);
}
out_unlock_fown:
read_unlock_irqrestore(&fown->lock, flags);
return ret;
}
static DEFINE_SPINLOCK(fasync_lock);
static struct kmem_cache *fasync_cache __read_mostly;
static void fasync_free_rcu(struct rcu_head *head)
{
kmem_cache_free(fasync_cache,
container_of(head, struct fasync_struct, fa_rcu));
}
/*
* Remove a fasync entry. If successfully removed, return
* positive and clear the FASYNC flag. If no entry exists,
* do nothing and return 0.
*
* NOTE! It is very important that the FASYNC flag always
* match the state "is the filp on a fasync list".
*
*/
int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
{
struct fasync_struct *fa, **fp;
int result = 0;
spin_lock(&filp->f_lock);
spin_lock(&fasync_lock);
for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
if (fa->fa_file != filp)
continue;
write_lock_irq(&fa->fa_lock);
fa->fa_file = NULL;
write_unlock_irq(&fa->fa_lock);
*fp = fa->fa_next;
call_rcu(&fa->fa_rcu, fasync_free_rcu);
filp->f_flags &= ~FASYNC;
result = 1;
break;
}
spin_unlock(&fasync_lock);
spin_unlock(&filp->f_lock);
return result;
}
struct fasync_struct *fasync_alloc(void)
{
return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
}
/*
* NOTE! This can be used only for unused fasync entries:
* entries that actually got inserted on the fasync list
* need to be released by rcu - see fasync_remove_entry.
*/
void fasync_free(struct fasync_struct *new)
{
kmem_cache_free(fasync_cache, new);
}
/*
* Insert a new entry into the fasync list. Return the pointer to the
* old one if we didn't use the new one.
*
* NOTE! It is very important that the FASYNC flag always
* match the state "is the filp on a fasync list".
*/
struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
{
struct fasync_struct *fa, **fp;
spin_lock(&filp->f_lock);
spin_lock(&fasync_lock);
for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
if (fa->fa_file != filp)
continue;
write_lock_irq(&fa->fa_lock);
fa->fa_fd = fd;
write_unlock_irq(&fa->fa_lock);
goto out;
}
rwlock_init(&new->fa_lock);
new->magic = FASYNC_MAGIC;
new->fa_file = filp;
new->fa_fd = fd;
new->fa_next = *fapp;
rcu_assign_pointer(*fapp, new);
filp->f_flags |= FASYNC;
out:
spin_unlock(&fasync_lock);
spin_unlock(&filp->f_lock);
return fa;
}
/*
* Add a fasync entry. Return negative on error, positive if
* added, and zero if did nothing but change an existing one.
*/
static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
{
struct fasync_struct *new;
new = fasync_alloc();
if (!new)
return -ENOMEM;
/*
* fasync_insert_entry() returns the old (update) entry if
* it existed.
*
* So free the (unused) new entry and return 0 to let the
* caller know that we didn't add any new fasync entries.
*/
if (fasync_insert_entry(fd, filp, fapp, new)) {
fasync_free(new);
return 0;
}
return 1;
}
/*
* fasync_helper() is used by almost all character device drivers
* to set up the fasync queue, and for regular files by the file
* lease code. It returns negative on error, 0 if it did no changes
* and positive if it added/deleted the entry.
*/
int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
{
if (!on)
return fasync_remove_entry(filp, fapp);
return fasync_add_entry(fd, filp, fapp);
}
EXPORT_SYMBOL(fasync_helper);
/*
* rcu_read_lock() is held
*/
static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
while (fa) {
struct fown_struct *fown;
unsigned long flags;
if (fa->magic != FASYNC_MAGIC) {
printk(KERN_ERR "kill_fasync: bad magic number in "
"fasync_struct!\n");
return;
}
read_lock_irqsave(&fa->fa_lock, flags);
if (fa->fa_file) {
fown = &fa->fa_file->f_owner;
/* Don't send SIGURG to processes which have not set a
queued signum: SIGURG has its own default signalling
mechanism. */
if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band);
}
read_unlock_irqrestore(&fa->fa_lock, flags);
fa = rcu_dereference(fa->fa_next);
}
}
void kill_fasync(struct fasync_struct **fp, int sig, int band)
{
/* First a quick test without locking: usually
* the list is empty.
*/
if (*fp) {
rcu_read_lock();
kill_fasync_rcu(rcu_dereference(*fp), sig, band);
rcu_read_unlock();
}
}
EXPORT_SYMBOL(kill_fasync);
static int __init fcntl_init(void)
{
/*
* Please add new bits here to ensure allocation uniqueness.
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
HWEIGHT32(
(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
__FMODE_EXEC | __FMODE_NONOTIFY));
fasync_cache = kmem_cache_create("fasync_cache",
sizeof(struct fasync_struct), 0,
SLAB_PANIC | SLAB_ACCOUNT, NULL);
return 0;
}
module_init(fcntl_init)
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_UACCESS_H__
#define __LINUX_UACCESS_H__
#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/minmax.h>
#include <linux/sched.h>
#include <linux/thread_info.h>
#include <asm/uaccess.h>
#ifdef CONFIG_SET_FS
/*
* Force the uaccess routines to be wired up for actual userspace access,
* overriding any possible set_fs(KERNEL_DS) still lingering around. Undone
* using force_uaccess_end below.
*/
static inline mm_segment_t force_uaccess_begin(void)
{
mm_segment_t fs = get_fs();
set_fs(USER_DS);
return fs;
}
static inline void force_uaccess_end(mm_segment_t oldfs)
{
set_fs(oldfs);
}
#else /* CONFIG_SET_FS */
typedef struct {
/* empty dummy */
} mm_segment_t;
#ifndef TASK_SIZE_MAX
#define TASK_SIZE_MAX TASK_SIZE
#endif
#define uaccess_kernel() (false)
#define user_addr_max() (TASK_SIZE_MAX)
static inline mm_segment_t force_uaccess_begin(void)
{
return (mm_segment_t) { };
}
static inline void force_uaccess_end(mm_segment_t oldfs)
{
}
#endif /* CONFIG_SET_FS */
/*
* Architectures should provide two primitives (raw_copy_{to,from}_user())
* and get rid of their private instances of copy_{to,from}_user() and
* __copy_{to,from}_user{,_inatomic}().
*
* raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and
* return the amount left to copy. They should assume that access_ok() has
* already been checked (and succeeded); they should *not* zero-pad anything.
* No KASAN or object size checks either - those belong here.
*
* Both of these functions should attempt to copy size bytes starting at from
* into the area starting at to. They must not fetch or store anything
* outside of those areas. Return value must be between 0 (everything
* copied successfully) and size (nothing copied).
*
* If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting
* at to must become equal to the bytes fetched from the corresponding area
* starting at from. All data past to + size - N must be left unmodified.
*
* If copying succeeds, the return value must be 0. If some data cannot be
* fetched, it is permitted to copy less than had been fetched; the only
* hard requirement is that not storing anything at all (i.e. returning size)
* should happen only when nothing could be copied. In other words, you don't
* have to squeeze as much as possible - it is allowed, but not necessary.
*
* For raw_copy_from_user() to always points to kernel memory and no faults
* on store should happen. Interpretation of from is affected by set_fs().
* For raw_copy_to_user() it's the other way round.
*
* Both can be inlined - it's up to architectures whether it wants to bother
* with that. They should not be used directly; they are used to implement
* the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic())
* that are used instead. Out of those, __... ones are inlined. Plain
* copy_{to,from}_user() might or might not be inlined. If you want them
* inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER.
*
* NOTE: only copy_from_user() zero-pads the destination in case of short copy.
* Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything
* at all; their callers absolutely must check the return value.
*
* Biarch ones should also provide raw_copy_in_user() - similar to the above,
* but both source and destination are __user pointers (affected by set_fs()
* as usual) and both source and destination can trigger faults.
*/
static __always_inline __must_check unsigned long
__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
{
instrument_copy_from_user(to, from, n);
check_object_size(to, n, false);
return raw_copy_from_user(to, from, n);
}
static __always_inline __must_check unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
might_fault();
if (should_fail_usercopy())
return n;
instrument_copy_from_user(to, from, n);
check_object_size(to, n, false);
return raw_copy_from_user(to, from, n);
}
/**
* __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
* @to: Destination address, in user space.
* @from: Source address, in kernel space.
* @n: Number of bytes to copy.
*
* Context: User context only.
*
* Copy data from kernel space to user space. Caller must check
* the specified block with access_ok() before calling this function.
* The caller should also make sure he pins the user space address
* so that we don't result in page fault and sleep.
*/
static __always_inline __must_check unsigned long
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
{
if (should_fail_usercopy())
return n;
instrument_copy_to_user(to, from, n);
check_object_size(from, n, true);
return raw_copy_to_user(to, from, n);
}
static __always_inline __must_check unsigned long
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
might_fault();
if (should_fail_usercopy())
return n;
instrument_copy_to_user(to, from, n);
check_object_size(from, n, true);
return raw_copy_to_user(to, from, n);
}
#ifdef INLINE_COPY_FROM_USER
static inline __must_check unsigned long
_copy_from_user(void *to, const void __user *from, unsigned long n)
{
unsigned long res = n;
might_fault();
if (!should_fail_usercopy() && likely(access_ok(from, n))) {
instrument_copy_from_user(to, from, n);
res = raw_copy_from_user(to, from, n);
}
if (unlikely(res))
memset(to + (n - res), 0, res);
return res;
}
#else
extern __must_check unsigned long
_copy_from_user(void *, const void __user *, unsigned long);
#endif
#ifdef INLINE_COPY_TO_USER
static inline __must_check unsigned long
_copy_to_user(void __user *to, const void *from, unsigned long n)
{
might_fault();
if (should_fail_usercopy())
return n;
if (access_ok(to, n)) {
instrument_copy_to_user(to, from, n);
n = raw_copy_to_user(to, from, n);
}
return n;
}
#else
extern __must_check unsigned long
_copy_to_user(void __user *, const void *, unsigned long);
#endif
static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
if (likely(check_copy_size(to, n, false)))
n = _copy_from_user(to, from, n);
return n;
}
static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
if (likely(check_copy_size(from, n, true)))
n = _copy_to_user(to, from, n);
return n;
}
#ifndef copy_mc_to_kernel
/*
* Without arch opt-in this generic copy_mc_to_kernel() will not handle
* #MC (or arch equivalent) during source read.
*/
static inline unsigned long __must_check
copy_mc_to_kernel(void *dst, const void *src, size_t cnt)
{
memcpy(dst, src, cnt);
return 0;
}
#endif
static __always_inline void pagefault_disabled_inc(void)
{
current->pagefault_disabled++;
}
static __always_inline void pagefault_disabled_dec(void)
{
current->pagefault_disabled--;
}
/*
* These routines enable/disable the pagefault handler. If disabled, it will
* not take any locks and go straight to the fixup table.
*
* User access methods will not sleep when called from a pagefault_disabled()
* environment.
*/
static inline void pagefault_disable(void)
{
pagefault_disabled_inc();
/*
* make sure to have issued the store before a pagefault
* can hit.
*/
barrier();
}
static inline void pagefault_enable(void)
{
/*
* make sure to issue those last loads/stores before enabling
* the pagefault handler again.
*/
barrier();
pagefault_disabled_dec();
}
/*
* Is the pagefault handler disabled? If so, user access methods will not sleep.
*/
static inline bool pagefault_disabled(void)
{
return current->pagefault_disabled != 0;
}
/*
* The pagefault handler is in general disabled by pagefault_disable() or
* when in irq context (via in_atomic()).
*
* This function should only be used by the fault handlers. Other users should
* stick to pagefault_disabled().
* Please NEVER use preempt_disable() to disable the fault handler. With
* !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled.
* in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT.
*/
#define faulthandler_disabled() (pagefault_disabled() || in_atomic())
#ifndef ARCH_HAS_NOCACHE_UACCESS
static inline __must_check unsigned long
__copy_from_user_inatomic_nocache(void *to, const void __user *from,
unsigned long n)
{
return __copy_from_user_inatomic(to, from, n);
}
#endif /* ARCH_HAS_NOCACHE_UACCESS */
extern __must_check int check_zeroed_user(const void __user *from, size_t size);
/**
* copy_struct_from_user: copy a struct from userspace
* @dst: Destination address, in kernel space. This buffer must be @ksize
* bytes long.
* @ksize: Size of @dst struct.
* @src: Source address, in userspace.
* @usize: (Alleged) size of @src struct.
*
* Copies a struct from userspace to kernel space, in a way that guarantees
* backwards-compatibility for struct syscall arguments (as long as future
* struct extensions are made such that all new fields are *appended* to the
* old struct, and zeroed-out new fields have the same meaning as the old
* struct).
*
* @ksize is just sizeof(*dst), and @usize should've been passed by userspace.
* The recommended usage is something like the following:
*
* SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize)
* {
* int err;
* struct foo karg = {};
*
* if (usize > PAGE_SIZE)
* return -E2BIG;
* if (usize < FOO_SIZE_VER0)
* return -EINVAL;
*
* err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
* if (err)
* return err;
*
* // ...
* }
*
* There are three cases to consider:
* * If @usize == @ksize, then it's copied verbatim.
* * If @usize < @ksize, then the userspace has passed an old struct to a
* newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize)
* are to be zero-filled.
* * If @usize > @ksize, then the userspace has passed a new struct to an
* older kernel. The trailing bytes unknown to the kernel (@usize - @ksize)
* are checked to ensure they are zeroed, otherwise -E2BIG is returned.
*
* Returns (in all cases, some data may have been copied):
* * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src.
* * -EFAULT: access to userspace failed.
*/
static __always_inline __must_check int
copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
size_t usize)
{
size_t size = min(ksize, usize);
size_t rest = max(ksize, usize) - size;
/* Deal with trailing bytes. */
if (usize < ksize) {
memset(dst + size, 0, rest);
} else if (usize > ksize) {
int ret = check_zeroed_user(src + size, rest);
if (ret <= 0)
return ret ?: -E2BIG;
}
/* Copy the interoperable parts of the struct. */
if (copy_from_user(dst, src, size))
return -EFAULT;
return 0;
}
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size);
long copy_from_kernel_nofault(void *dst, const void *src, size_t size);
long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size);
long copy_from_user_nofault(void *dst, const void __user *src, size_t size);
long notrace copy_to_user_nofault(void __user *dst, const void *src,
size_t size);
long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr,
long count);
long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
long count);
long strnlen_user_nofault(const void __user *unsafe_addr, long count);
/**
* get_kernel_nofault(): safely attempt to read from a location
* @val: read into this variable
* @ptr: address to read from
*
* Returns 0 on success, or -EFAULT.
*/
#define get_kernel_nofault(val, ptr) ({ \
const typeof(val) *__gk_ptr = (ptr); \
copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\
})
#ifndef user_access_begin
#define user_access_begin(ptr,len) access_ok(ptr, len)
#define user_access_end() do { } while (0)
#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e)
#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
#define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e)
static inline unsigned long user_access_save(void) { return 0UL; }
static inline void user_access_restore(unsigned long flags) { }
#endif
#ifndef user_write_access_begin
#define user_write_access_begin user_access_begin
#define user_write_access_end user_access_end
#endif
#ifndef user_read_access_begin
#define user_read_access_begin user_access_begin
#define user_read_access_end user_access_end
#endif
#ifdef CONFIG_HARDENED_USERCOPY
void usercopy_warn(const char *name, const char *detail, bool to_user,
unsigned long offset, unsigned long len);
void __noreturn usercopy_abort(const char *name, const char *detail,
bool to_user, unsigned long offset,
unsigned long len);
#endif
#endif /* __LINUX_UACCESS_H__ */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_H
#define _ASM_X86_UACCESS_H
/*
* User space memory access functions
*/
#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/string.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
#include <asm/extable.h>
/*
* Test whether a block of memory is a valid user space address.
* Returns 0 if the range is valid, nonzero otherwise.
*/
static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
{
/*
* If we have used "sizeof()" for the size,
* we know it won't overflow the limit (but
* it might overflow the 'addr', so it's
* important to subtract the size from the
* limit, not add it to the address).
*/
if (__builtin_constant_p(size))
return unlikely(addr > limit - size);
/* Arbitrary sizes? Be careful about overflow */
addr += size;
if (unlikely(addr < size))
return true;
return unlikely(addr > limit);
}
#define __range_not_ok(addr, size, limit) \
({ \
__chk_user_ptr(addr); \
__chk_range_not_ok((unsigned long __force)(addr), size, limit); \
})
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline bool pagefault_disabled(void);
# define WARN_ON_IN_IRQ() \
WARN_ON_ONCE(!in_task() && !pagefault_disabled())
#else
# define WARN_ON_IN_IRQ()
#endif
/**
* access_ok - Checks if a user space pointer is valid
* @addr: User space pointer to start of block to check
* @size: Size of block to check
*
* Context: User context only. This function may sleep if pagefaults are
* enabled.
*
* Checks if a pointer to a block of memory in user space is valid.
*
* Note that, depending on architecture, this function probably just
* checks that the pointer is in the user space range - after calling
* this function, memory access functions may still return -EFAULT.
*
* Return: true (nonzero) if the memory block may be valid, false (zero)
* if it is definitely invalid.
*/
#define access_ok(addr, size) \
({ \
WARN_ON_IN_IRQ(); \
likely(!__range_not_ok(addr, size, TASK_SIZE_MAX)); \
})
extern int __get_user_1(void);
extern int __get_user_2(void);
extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_nocheck_1(void);
extern int __get_user_nocheck_2(void);
extern int __get_user_nocheck_4(void);
extern int __get_user_nocheck_8(void);
extern int __get_user_bad(void);
#define __uaccess_begin() stac()
#define __uaccess_end() clac()
#define __uaccess_begin_nospec() \
({ \
stac(); \
barrier_nospec(); \
})
/*
* This is the smallest unsigned integer type that can fit a value
* (up to 'long long')
*/
#define __inttype(x) __typeof__( \
__typefits(x,char, \
__typefits(x,short, \
__typefits(x,int, \
__typefits(x,long,0ULL)))))
#define __typefits(x,type,not) \
__builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not)
/*
* This is used for both get_user() and __get_user() to expand to
* the proper special function call that has odd calling conventions
* due to returning both a value and an error, and that depends on
* the size of the pointer passed in.
*
* Careful: we have to cast the result to the type of the pointer
* for sign reasons.
*
* The use of _ASM_DX as the register specifier is a bit of a
* simplification, as gcc only cares about it as the starting point
* and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
* (%ecx being the next register in gcc's x86 register sequence), and
* %rdx on 64 bits.
*
* Clang/LLVM cares about the size of the register, but still wants
* the base register for something that ends up being a pair.
*/
#define do_get_user_call(fn,x,ptr) \
({ \
int __ret_gu; \
register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
__chk_user_ptr(ptr); \
asm volatile("call __" #fn "_%P4" \
: "=a" (__ret_gu), "=r" (__val_gu), \
ASM_CALL_CONSTRAINT \
: "0" (ptr), "i" (sizeof(*(ptr)))); \
(x) = (__force __typeof__(*(ptr))) __val_gu; \
__builtin_expect(__ret_gu, 0); \
})
/**
* get_user - Get a simple variable from user space.
* @x: Variable to store result.
* @ptr: Source address, in user space.
*
* Context: User context only. This function may sleep if pagefaults are
* enabled.
*
* This macro copies a single simple variable from user space to kernel
* space. It supports simple types like char and int, but not larger
* data types like structures or arrays.
*
* @ptr must have pointer-to-simple-variable type, and the result of
* dereferencing @ptr must be assignable to @x without a cast.
*
* Return: zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
#define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); })
/**
* __get_user - Get a simple variable from user space, with less checking.
* @x: Variable to store result.
* @ptr: Source address, in user space.
*
* Context: User context only. This function may sleep if pagefaults are
* enabled.
*
* This macro copies a single simple variable from user space to kernel
* space. It supports simple types like char and int, but not larger
* data types like structures or arrays.
*
* @ptr must have pointer-to-simple-variable type, and the result of
* dereferencing @ptr must be assignable to @x without a cast.
*
* Caller must check the pointer with access_ok() before calling this
* function.
*
* Return: zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
#define __get_user(x,ptr) do_get_user_call(get_user_nocheck,x,ptr)
#ifdef CONFIG_X86_32
#define __put_user_goto_u64(x, addr, label) \
asm_volatile_goto("\n" \
"1: movl %%eax,0(%1)\n" \
"2: movl %%edx,4(%1)\n" \
_ASM_EXTABLE_UA(1b, %l2) \
_ASM_EXTABLE_UA(2b, %l2) \
: : "A" (x), "r" (addr) \
: : label)
#else
#define __put_user_goto_u64(x, ptr, label) \
__put_user_goto(x, ptr, "q", "er", label)
#endif
extern void __put_user_bad(void);
/*
* Strange magic calling convention: pointer in %ecx,
* value in %eax(:%edx), return value in %ecx. clobbers %rbx
*/
extern void __put_user_1(void);
extern void __put_user_2(void);
extern void __put_user_4(void);
extern void __put_user_8(void);
extern void __put_user_nocheck_1(void);
extern void __put_user_nocheck_2(void);
extern void __put_user_nocheck_4(void);
extern void __put_user_nocheck_8(void);
/*
* ptr must be evaluated and assigned to the temporary __ptr_pu before
* the assignment of x to __val_pu, to avoid any function calls
* involved in the ptr expression (possibly implicitly generated due
* to KASAN) from clobbering %ax.
*/
#define do_put_user_call(fn,x,ptr) \
({ \
int __ret_pu; \
void __user *__ptr_pu; \
register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \
__chk_user_ptr(ptr); \
__ptr_pu = (ptr); \
__val_pu = (x); \
asm volatile("call __" #fn "_%P[size]" \
: "=c" (__ret_pu), \
ASM_CALL_CONSTRAINT \
: "0" (__ptr_pu), \
"r" (__val_pu), \
[size] "i" (sizeof(*(ptr))) \
:"ebx"); \
__builtin_expect(__ret_pu, 0); \
})
/**
* put_user - Write a simple value into user space.
* @x: Value to copy to user space.
* @ptr: Destination address, in user space.
*
* Context: User context only. This function may sleep if pagefaults are
* enabled.
*
* This macro copies a single simple value from kernel space to user
* space. It supports simple types like char and int, but not larger
* data types like structures or arrays.
*
* @ptr must have pointer-to-simple-variable type, and @x must be assignable
* to the result of dereferencing @ptr.
*
* Return: zero on success, or -EFAULT on error.
*/
#define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); })
/**
* __put_user - Write a simple value into user space, with less checking.
* @x: Value to copy to user space.
* @ptr: Destination address, in user space.
*
* Context: User context only. This function may sleep if pagefaults are
* enabled.
*
* This macro copies a single simple value from kernel space to user
* space. It supports simple types like char and int, but not larger
* data types like structures or arrays.
*
* @ptr must have pointer-to-simple-variable type, and @x must be assignable
* to the result of dereferencing @ptr.
*
* Caller must check the pointer with access_ok() before calling this
* function.
*
* Return: zero on success, or -EFAULT on error.
*/
#define __put_user(x, ptr) do_put_user_call(put_user_nocheck,x,ptr)
#define __put_user_size(x, ptr, size, label) \
do { \
__chk_user_ptr(ptr); \
switch (size) { \
case 1: \
__put_user_goto(x, ptr, "b", "iq", label); \
break; \
case 2: \
__put_user_goto(x, ptr, "w", "ir", label); \
break; \
case 4: \
__put_user_goto(x, ptr, "l", "ir", label); \
break; \
case 8: \
__put_user_goto_u64(x, ptr, label); \
break; \
default: \
__put_user_bad(); \
} \
} while (0)
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, label) do { \
unsigned int __gu_low, __gu_high; \
const unsigned int __user *__gu_ptr; \
__gu_ptr = (const void __user *)(ptr); \
__get_user_asm(__gu_low, __gu_ptr, "l", "=r", label); \
__get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label); \
(x) = ((unsigned long long)__gu_high << 32) | __gu_low; \
} while (0)
#else
#define __get_user_asm_u64(x, ptr, label) \
__get_user_asm(x, ptr, "q", "=r", label)
#endif
#define __get_user_size(x, ptr, size, label) \
do { \
__chk_user_ptr(ptr); \
switch (size) { \
case 1: { \
unsigned char x_u8__; \
__get_user_asm(x_u8__, ptr, "b", "=q", label); \
(x) = x_u8__; \
break; \
} \
case 2: \
__get_user_asm(x, ptr, "w", "=r", label); \
break; \
case 4: \
__get_user_asm(x, ptr, "l", "=r", label); \
break; \
case 8: \
__get_user_asm_u64(x, ptr, label); \
break; \
default: \
(x) = __get_user_bad(); \
} \
} while (0)
#define __get_user_asm(x, addr, itype, ltype, label) \
asm_volatile_goto("\n" \
"1: mov"itype" %[umem],%[output]\n" \
_ASM_EXTABLE_UA(1b, %l2) \
: [output] ltype(x) \
: [umem] "m" (__m(addr)) \
: : label)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, retval) \
({ \
__typeof__(ptr) __ptr = (ptr); \
asm volatile("\n" \
"1: movl %[lowbits],%%eax\n" \
"2: movl %[highbits],%%edx\n" \
"3:\n" \
".section .fixup,\"ax\"\n" \
"4: mov %[efault],%[errout]\n" \
" xorl %%eax,%%eax\n" \
" xorl %%edx,%%edx\n" \
" jmp 3b\n" \
".previous\n" \
_ASM_EXTABLE_UA(1b, 4b) \
_ASM_EXTABLE_UA(2b, 4b) \
: [errout] "=r" (retval), \
[output] "=&A"(x) \
: [lowbits] "m" (__m(__ptr)), \
[highbits] "m" __m(((u32 __user *)(__ptr)) + 1), \
[efault] "i" (-EFAULT), "0" (retval)); \
})
#else
#define __get_user_asm_u64(x, ptr, retval) \
__get_user_asm(x, ptr, retval, "q", "=r")
#endif
#define __get_user_size(x, ptr, size, retval) \
do { \
unsigned char x_u8__; \
\
retval = 0; \
__chk_user_ptr(ptr); \
switch (size) { \
case 1: \
__get_user_asm(x_u8__, ptr, retval, "b", "=q"); \
(x) = x_u8__; \
break; \
case 2: \
__get_user_asm(x, ptr, retval, "w", "=r"); \
break; \
case 4: \
__get_user_asm(x, ptr, retval, "l", "=r"); \
break; \
case 8: \
__get_user_asm_u64(x, ptr, retval); \
break; \
default: \
(x) = __get_user_bad(); \
} \
} while (0)
#define __get_user_asm(x, addr, err, itype, ltype) \
asm volatile("\n" \
"1: mov"itype" %[umem],%[output]\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: mov %[efault],%[errout]\n" \
" xorl %k[output],%k[output]\n" \
" jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE_UA(1b, 3b) \
: [errout] "=r" (err), \
[output] ltype(x) \
: [umem] "m" (__m(addr)), \
[efault] "i" (-EFAULT), "0" (err))
#endif // CONFIG_CC_ASM_GOTO_OUTPUT
/* FIXME: this hack is definitely wrong -AK */
struct __large_struct { unsigned long buf[100]; };
#define __m(x) (*(struct __large_struct __user *)(x))
/*
* Tell gcc we read from memory instead of writing: this is because
* we do not write to any memory gcc knows about, so there are no
* aliasing issues.
*/
#define __put_user_goto(x, addr, itype, ltype, label) \
asm_volatile_goto("\n" \
"1: mov"itype" %0,%1\n" \
_ASM_EXTABLE_UA(1b, %l2) \
: : ltype(x), "m" (__m(addr)) \
: : label)
extern unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
extern __must_check long
strncpy_from_user(char *dst, const char __user *src, long count);
extern __must_check long strnlen_user(const char __user *str, long n);
unsigned long __must_check clear_user(void __user *mem, unsigned long len);
unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
#ifdef CONFIG_ARCH_HAS_COPY_MC
unsigned long __must_check
copy_mc_to_kernel(void *to, const void *from, unsigned len);
#define copy_mc_to_kernel copy_mc_to_kernel
unsigned long __must_check
copy_mc_to_user(void *to, const void *from, unsigned len);
#endif
/*
* movsl can be slow when source and dest are not both 8-byte aligned
*/
#ifdef CONFIG_X86_INTEL_USERCOPY
extern struct movsl_mask {
int mask;
} ____cacheline_aligned_in_smp movsl_mask;
#endif
#define ARCH_HAS_NOCACHE_UACCESS 1
#ifdef CONFIG_X86_32
# include <asm/uaccess_32.h>
#else
# include <asm/uaccess_64.h>
#endif
/*
* The "unsafe" user accesses aren't really "unsafe", but the naming
* is a big fat warning: you have to not only do the access_ok()
* checking before using them, but you have to surround them with the
* user_access_begin/end() pair.
*/
static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
{
if (unlikely(!access_ok(ptr,len)))
return 0;
__uaccess_begin_nospec();
return 1;
}
#define user_access_begin(a,b) user_access_begin(a,b)
#define user_access_end() __uaccess_end()
#define user_access_save() smap_save()
#define user_access_restore(x) smap_restore(x)
#define unsafe_put_user(x, ptr, label) \
__put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label)
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define unsafe_get_user(x, ptr, err_label) \
do { \
__inttype(*(ptr)) __gu_val; \
__get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
} while (0)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define unsafe_get_user(x, ptr, err_label) \
do { \
int __gu_err; \
__inttype(*(ptr)) __gu_val; \
__get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
if (unlikely(__gu_err)) goto err_label; \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/*
* We want the unsafe accessors to always be inlined and use
* the error labels - thus the macro games.
*/
#define unsafe_copy_loop(dst, src, len, type, label) \
while (len >= sizeof(type)) { \
unsafe_put_user(*(type *)(src),(type __user *)(dst),label); \
dst += sizeof(type); \
src += sizeof(type); \
len -= sizeof(type); \
}
#define unsafe_copy_to_user(_dst,_src,_len,label) \
do { \
char __user *__ucu_dst = (_dst); \
const char *__ucu_src = (_src); \
size_t __ucu_len = (_len); \
unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \
unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \
unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \
unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \
} while (0)
#define HAVE_GET_KERNEL_NOFAULT
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label) \
__get_user_size(*((type *)(dst)), (__force type __user *)(src), \
sizeof(type), err_label)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label) \
do { \
int __kr_err; \
\
__get_user_size(*((type *)(dst)), (__force type __user *)(src), \
sizeof(type), __kr_err); \
if (unlikely(__kr_err)) \
goto err_label; \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __put_kernel_nofault(dst, src, type, err_label) \
__put_user_size(*((type *)(src)), (__force type __user *)(dst), \
sizeof(type), err_label)
#endif /* _ASM_X86_UACCESS_H */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
linux/include/linux/rbtree.h
To use rbtrees you'll have to implement your own insert and search cores.
This will avoid us to use callbacks and to drop drammatically performances.
I know it's not the cleaner way, but in C (not in C++) to get
performances and genericity...
See Documentation/core-api/rbtree.rst for documentation and samples.
*/
#ifndef _LINUX_RBTREE_H
#define _LINUX_RBTREE_H
#include <linux/rbtree_types.h>
#include <linux/kernel.h>
#include <linux/stddef.h>
#include <linux/rcupdate.h>
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
#define RB_EMPTY_NODE(node) \
((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node) \
((node)->__rb_parent_color = (unsigned long)(node))
extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);
/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);
/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);
/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
*rb_link = node;
}
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
struct rb_node **rb_link)
{
node->__rb_parent_color = (unsigned long)parent;
node->rb_left = node->rb_right = NULL;
rcu_assign_pointer(*rb_link, node);
}
#define rb_entry_safe(ptr, type, member) \
({ typeof(ptr) ____ptr = (ptr); \
____ptr ? rb_entry(____ptr, type, member) : NULL; \
})
/**
* rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
* given type allowing the backing memory of @pos to be invalidated
*
* @pos: the 'type *' to use as a loop cursor.
* @n: another 'type *' to use as temporary storage
* @root: 'rb_root *' of the rbtree.
* @field: the name of the rb_node field within 'type'.
*
* rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
* list_for_each_entry_safe() and allows the iteration to continue independent
* of changes to @pos by the body of the loop.
*
* Note, however, that it cannot handle other modifications that re-order the
* rbtree it is iterating over. This includes calling rb_erase() on @pos, as
* rb_erase() may rebalance the tree, causing us to miss some nodes.
*/
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
typeof(*pos), field); 1; }); \
pos = n)
/* Same as rb_first(), but O(1) */
#define rb_first_cached(root) (root)->rb_leftmost
static inline void rb_insert_color_cached(struct rb_node *node,
struct rb_root_cached *root,
bool leftmost)
{
if (leftmost)
root->rb_leftmost = node; rb_insert_color(node, &root->rb_root);
}
static inline struct rb_node *
rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
{
struct rb_node *leftmost = NULL;
if (root->rb_leftmost == node)
leftmost = root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root);
return leftmost;
}
static inline void rb_replace_node_cached(struct rb_node *victim,
struct rb_node *new,
struct rb_root_cached *root)
{
if (root->rb_leftmost == victim)
root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root);
}
/*
* The below helper functions use 2 operators with 3 different
* calling conventions. The operators are related like:
*
* comp(a->key,b) < 0 := less(a,b)
* comp(a->key,b) > 0 := less(b,a)
* comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
*
* If these operators define a partial order on the elements we make no
* guarantee on which of the elements matching the key is found. See
* rb_find().
*
* The reason for this is to allow the find() interface without requiring an
* on-stack dummy object, which might not be feasible due to object size.
*/
/**
* rb_add_cached() - insert @node into the leftmost cached tree @tree
* @node: node to insert
* @tree: leftmost cached tree to insert @node into
* @less: operator defining the (partial) node order
*
* Returns @node when it is the new leftmost, or NULL.
*/
static __always_inline struct rb_node *
rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
bool (*less)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_root.rb_node;
struct rb_node *parent = NULL;
bool leftmost = true;
while (*link) {
parent = *link;
if (less(node, parent)) {
link = &parent->rb_left;
} else {
link = &parent->rb_right;
leftmost = false;
}
}
rb_link_node(node, parent, link);
rb_insert_color_cached(node, tree, leftmost);
return leftmost ? node : NULL;
}
/**
* rb_add() - insert @node into @tree
* @node: node to insert
* @tree: tree to insert @node into
* @less: operator defining the (partial) node order
*/
static __always_inline void
rb_add(struct rb_node *node, struct rb_root *tree,
bool (*less)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_node;
struct rb_node *parent = NULL;
while (*link) {
parent = *link;
if (less(node, parent))
link = &parent->rb_left;
else
link = &parent->rb_right;
}
rb_link_node(node, parent, link);
rb_insert_color(node, tree);
}
/**
* rb_find_add() - find equivalent @node in @tree, or add @node
* @node: node to look-for / insert
* @tree: tree to search / modify
* @cmp: operator defining the node order
*
* Returns the rb_node matching @node, or NULL when no match is found and @node
* is inserted.
*/
static __always_inline struct rb_node *
rb_find_add(struct rb_node *node, struct rb_root *tree,
int (*cmp)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_node;
struct rb_node *parent = NULL;
int c;
while (*link) {
parent = *link;
c = cmp(node, parent);
if (c < 0)
link = &parent->rb_left;
else if (c > 0)
link = &parent->rb_right;
else
return parent;
}
rb_link_node(node, parent, link);
rb_insert_color(node, tree);
return NULL;
}
/**
* rb_find() - find @key in tree @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining the node order
*
* Returns the rb_node matching @key or NULL.
*/
static __always_inline struct rb_node *
rb_find(const void *key, const struct rb_root *tree,
int (*cmp)(const void *key, const struct rb_node *))
{
struct rb_node *node = tree->rb_node;
while (node) {
int c = cmp(key, node);
if (c < 0)
node = node->rb_left;
else if (c > 0)
node = node->rb_right;
else
return node;
}
return NULL;
}
/**
* rb_find_first() - find the first @key in @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*
* Returns the leftmost node matching @key, or NULL.
*/
static __always_inline struct rb_node *
rb_find_first(const void *key, const struct rb_root *tree,
int (*cmp)(const void *key, const struct rb_node *))
{
struct rb_node *node = tree->rb_node;
struct rb_node *match = NULL;
while (node) {
int c = cmp(key, node);
if (c <= 0) {
if (!c)
match = node;
node = node->rb_left;
} else if (c > 0) {
node = node->rb_right;
}
}
return match;
}
/**
* rb_next_match() - find the next @key in @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*
* Returns the next node matching @key, or NULL.
*/
static __always_inline struct rb_node *
rb_next_match(const void *key, struct rb_node *node,
int (*cmp)(const void *key, const struct rb_node *))
{
node = rb_next(node);
if (node && cmp(key, node))
node = NULL;
return node;
}
/**
* rb_for_each() - iterates a subtree matching @key
* @node: iterator
* @key: key to match
* @tree: tree to search
* @cmp: operator defining node order
*/
#define rb_for_each(node, key, tree, cmp) \
for ((node) = rb_find_first((key), (tree), (cmp)); \
(node); (node) = rb_next_match((key), (node), (cmp)))
#endif /* _LINUX_RBTREE_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_X86_XSAVE_H
#define __ASM_X86_XSAVE_H
#include <linux/uaccess.h>
#include <linux/types.h>
#include <asm/processor.h>
#include <asm/fpu/api.h>
#include <asm/user.h>
/* Bit 63 of XCR0 is reserved for future expansion */
#define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))
#define XSTATE_CPUID 0x0000000d
#define FXSAVE_SIZE 512
#define XSAVE_HDR_SIZE 64
#define XSAVE_HDR_OFFSET FXSAVE_SIZE
#define XSAVE_YMM_SIZE 256
#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
#define XSAVE_ALIGNMENT 64
/* All currently supported user features */
#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \
XFEATURE_MASK_SSE | \
XFEATURE_MASK_YMM | \
XFEATURE_MASK_OPMASK | \
XFEATURE_MASK_ZMM_Hi256 | \
XFEATURE_MASK_Hi16_ZMM | \
XFEATURE_MASK_PKRU | \
XFEATURE_MASK_BNDREGS | \
XFEATURE_MASK_BNDCSR)
/*
* Features which are restored when returning to user space.
* PKRU is not restored on return to user space because PKRU
* is switched eagerly in switch_to() and flush_thread()
*/
#define XFEATURE_MASK_USER_RESTORE \
(XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU)
/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)
/*
* A supervisor state component may not always contain valuable information,
* and its size may be huge. Saving/restoring such supervisor state components
* at each context switch can cause high CPU and space overhead, which should
* be avoided. Such supervisor state components should only be saved/restored
* on demand. The on-demand supervisor features are set in this mask.
*
* Unlike the existing supported supervisor features, an independent supervisor
* feature does not allocate a buffer in task->fpu, and the corresponding
* supervisor state component cannot be saved/restored at each context switch.
*
* To support an independent supervisor feature, a developer should follow the
* dos and don'ts as below:
* - Do dynamically allocate a buffer for the supervisor state component.
* - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the
* state component to/from the buffer.
* - Don't set the bit corresponding to the independent supervisor feature in
* IA32_XSS at run time, since it has been set at boot time.
*/
#define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR)
/*
* Unsupported supervisor features. When a supervisor feature in this mask is
* supported in the future, move it to the supported supervisor feature mask.
*/
#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)
/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
XFEATURE_MASK_INDEPENDENT | \
XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)
#ifdef CONFIG_X86_64
#define REX_PREFIX "0x48, "
#else
#define REX_PREFIX
#endif
extern u64 xfeatures_mask_all;
static inline u64 xfeatures_mask_supervisor(void)
{
return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
}
/*
* The xfeatures which are enabled in XCR0 and expected to be in ptrace
* buffers and signal frames.
*/
static inline u64 xfeatures_mask_uabi(void)
{
return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED;
}
/*
* The xfeatures which are restored by the kernel when returning to user
* mode. This is not necessarily the same as xfeatures_mask_uabi() as the
* kernel does not manage all XCR0 enabled features via xsave/xrstor as
* some of them have to be switched eagerly on context switch and exec().
*/
static inline u64 xfeatures_mask_restore_user(void)
{
return xfeatures_mask_all & XFEATURE_MASK_USER_RESTORE;
}
/*
* Like xfeatures_mask_restore_user() but additionally restors the
* supported supervisor states.
*/
static inline u64 xfeatures_mask_fpstate(void)
{
return xfeatures_mask_all & \
(XFEATURE_MASK_USER_RESTORE | XFEATURE_MASK_SUPERVISOR_SUPPORTED);
}
static inline u64 xfeatures_mask_independent(void)
{
if (!boot_cpu_has(X86_FEATURE_ARCH_LBR))
return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR;
return XFEATURE_MASK_INDEPENDENT;
}
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
extern void __init update_regset_xstate_info(unsigned int size,
u64 xstate_mask);
void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
int xfeature_size(int xfeature_nr);
int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
void xsaves(struct xregs_state *xsave, u64 mask);
void xrstors(struct xregs_state *xsave, u64 mask);
enum xstate_copy_mode {
XSTATE_COPY_FP,
XSTATE_COPY_FX,
XSTATE_COPY_XSAVE,
};
struct membuf;
void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
enum xstate_copy_mode mode);
#endif
// SPDX-License-Identifier: GPL-2.0
/*
* Convert integer string representation to an integer.
* If an integer doesn't fit into specified type, -E is returned.
*
* Integer starts with optional sign.
* kstrtou*() functions do not accept sign "-".
*
* Radix 0 means autodetection: leading "0x" implies radix 16,
* leading "0" implies radix 8, otherwise radix is 10.
* Autodetection hints work after optional sign, but not before.
*
* If -E is returned, result is not touched.
*/
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/kstrtox.h>
#include <linux/math64.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include "kstrtox.h"
const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
{
if (*base == 0) { if (s[0] == '0') { if (_tolower(s[1]) == 'x' && isxdigit(s[2])) *base = 16;
else
*base = 8;
} else
*base = 10;
}
if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x') s += 2; return s;
}
/*
* Convert non-negative integer string representation in explicitly given radix
* to an integer. A maximum of max_chars characters will be converted.
*
* Return number of characters consumed maybe or-ed with overflow bit.
* If overflow occurs, result integer (incorrect) is still returned.
*
* Don't you dare use this function.
*/
unsigned int _parse_integer_limit(const char *s, unsigned int base, unsigned long long *p,
size_t max_chars)
{
unsigned long long res;
unsigned int rv;
res = 0;
rv = 0;
while (max_chars--) { unsigned int c = *s;
unsigned int lc = c | 0x20; /* don't tolower() this line */
unsigned int val;
if ('0' <= c && c <= '9')
val = c - '0';
else if ('a' <= lc && lc <= 'f')
val = lc - 'a' + 10;
else
break;
if (val >= base)
break;
/*
* Check for overflow only if we are within range of
* it in the max base we support (16)
*/
if (unlikely(res & (~0ull << 60))) { if (res > div_u64(ULLONG_MAX - val, base)) rv |= KSTRTOX_OVERFLOW;
}
res = res * base + val;
rv++;
s++;
}
*p = res;
return rv;
}
unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *p)
{
return _parse_integer_limit(s, base, p, INT_MAX);
}
static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res)
{
unsigned long long _res;
unsigned int rv;
s = _parse_integer_fixup_radix(s, &base);
rv = _parse_integer(s, base, &_res);
if (rv & KSTRTOX_OVERFLOW)
return -ERANGE;
if (rv == 0)
return -EINVAL;
s += rv;
if (*s == '\n')
s++; if (*s)
return -EINVAL;
*res = _res; return 0;
}
/**
* kstrtoull - convert a string to an unsigned long long
* @s: The start of the string. The string must be null-terminated, and may also
* include a single newline before its terminating null. The first character
* may also be a plus sign, but not a minus sign.
* @base: The number base to use. The maximum supported base is 16. If base is
* given as 0, then the base of the string is automatically detected with the
* conventional semantics - If it begins with 0x the number will be parsed as a
* hexadecimal (case insensitive), if it otherwise begins with 0, it will be
* parsed as an octal number. Otherwise it will be parsed as a decimal.
* @res: Where to write the result of the conversion on success.
*
* Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
* Preferred over simple_strtoull(). Return code must be checked.
*/
int kstrtoull(const char *s, unsigned int base, unsigned long long *res)
{
if (s[0] == '+') s++; return _kstrtoull(s, base, res);
}
EXPORT_SYMBOL(kstrtoull);
/**
* kstrtoll - convert a string to a long long
* @s: The start of the string. The string must be null-terminated, and may also
* include a single newline before its terminating null. The first character
* may also be a plus sign or a minus sign.
* @base: The number base to use. The maximum supported base is 16. If base is
* given as 0, then the base of the string is automatically detected with the
* conventional semantics - If it begins with 0x the number will be parsed as a
* hexadecimal (case insensitive), if it otherwise begins with 0, it will be
* parsed as an octal number. Otherwise it will be parsed as a decimal.
* @res: Where to write the result of the conversion on success.
*
* Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
* Preferred over simple_strtoll(). Return code must be checked.
*/
int kstrtoll(const char *s, unsigned int base, long long *res)
{
unsigned long long tmp;
int rv;
if (s[0] == '-') {
rv = _kstrtoull(s + 1, base, &tmp);
if (rv < 0)
return rv;
if ((long long)-tmp > 0)
return -ERANGE;
*res = -tmp;
} else {
rv = kstrtoull(s, base, &tmp);
if (rv < 0)
return rv;
if ((long long)tmp < 0)
return -ERANGE;
*res = tmp;
}
return 0;
}
EXPORT_SYMBOL(kstrtoll);
/* Internal, do not use. */
int _kstrtoul(const char *s, unsigned int base, unsigned long *res)
{
unsigned long long tmp;
int rv;
rv = kstrtoull(s, base, &tmp);
if (rv < 0)
return rv;
if (tmp != (unsigned long)tmp)
return -ERANGE;
*res = tmp;
return 0;
}
EXPORT_SYMBOL(_kstrtoul);
/* Internal, do not use. */
int _kstrtol(const char *s, unsigned int base, long *res)
{
long long tmp;
int rv;
rv = kstrtoll(s, base, &tmp);
if (rv < 0)
return rv;
if (tmp != (long)tmp)
return -ERANGE;
*res = tmp;
return 0;
}
EXPORT_SYMBOL(_kstrtol);
/**
* kstrtouint - convert a string to an unsigned int
* @s: The start of the string. The string must be null-terminated, and may also
* include a single newline before its terminating null. The first character
* may also be a plus sign, but not a minus sign.
* @base: The number base to use. The maximum supported base is 16. If base is
* given as 0, then the base of the string is automatically detected with the
* conventional semantics - If it begins with 0x the number will be parsed as a
* hexadecimal (case insensitive), if it otherwise begins with 0, it will be
* parsed as an octal number. Otherwise it will be parsed as a decimal.
* @res: Where to write the result of the conversion on success.
*
* Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
* Preferred over simple_strtoul(). Return code must be checked.
*/
int kstrtouint(const char *s, unsigned int base, unsigned int *res)
{
unsigned long long tmp;
int rv;
rv = kstrtoull(s, base, &tmp);
if (rv < 0)
return rv;
if (tmp != (unsigned int)tmp)
return -ERANGE;
*res = tmp; return 0;
}
EXPORT_SYMBOL(kstrtouint);
/**
* kstrtoint - convert a string to an int
* @s: The start of the string. The string must be null-terminated, and may also
* include a single newline before its terminating null. The first character
* may also be a plus sign or a minus sign.
* @base: The number base to use. The maximum supported base is 16. If base is
* given as 0, then the base of the string is automatically detected with the
* conventional semantics - If it begins with 0x the number will be parsed as a
* hexadecimal (case insensitive), if it otherwise begins with 0, it will be
* parsed as an octal number. Otherwise it will be parsed as a decimal.
* @res: Where to write the result of the conversion on success.
*
* Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
* Preferred over simple_strtol(). Return code must be checked.
*/
int kstrtoint(const char *s, unsigned int base, int *res)
{
long long tmp;
int rv;
rv = kstrtoll(s, base, &tmp);
if (rv < 0)
return rv;
if (tmp != (int)tmp)
return -ERANGE;
*res = tmp;
return 0;
}
EXPORT_SYMBOL(kstrtoint);
int kstrtou16(const char *s, unsigned int base, u16 *res)
{
unsigned long long tmp;
int rv;
rv = kstrtoull(s, base, &tmp);
if (rv < 0)
return rv;
if (tmp != (u16)tmp)
return -ERANGE;
*res = tmp;
return 0;
}
EXPORT_SYMBOL(kstrtou16);
int kstrtos16(const char *s, unsigned int base, s16 *res)
{
long long tmp;
int rv;
rv = kstrtoll(s, base, &tmp);
if (rv < 0)
return rv;
if (tmp != (s16)tmp)
return -ERANGE;
*res = tmp;
return 0;
}
EXPORT_SYMBOL(kstrtos16);
int kstrtou8(const char *s, unsigned int base, u8 *res)
{
unsigned long long tmp;
int rv;
rv = kstrtoull(s, base, &tmp);
if (rv < 0)
return rv;
if (tmp != (u8)tmp)
return -ERANGE;
*res = tmp;
return 0;
}
EXPORT_SYMBOL(kstrtou8);
int kstrtos8(const char *s, unsigned int base, s8 *res)
{
long long tmp;
int rv;
rv = kstrtoll(s, base, &tmp);
if (rv < 0)
return rv;
if (tmp != (s8)tmp)
return -ERANGE;
*res = tmp;
return 0;
}
EXPORT_SYMBOL(kstrtos8);
/**
* kstrtobool - convert common user inputs into boolean values
* @s: input string
* @res: result
*
* This routine returns 0 iff the first character is one of 'Yy1Nn0', or
* [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL. Value
* pointed to by res is updated upon finding a match.
*/
int kstrtobool(const char *s, bool *res)
{
if (!s)
return -EINVAL;
switch (s[0]) {
case 'y':
case 'Y':
case '1':
*res = true;
return 0;
case 'n':
case 'N':
case '0':
*res = false;
return 0;
case 'o':
case 'O':
switch (s[1]) {
case 'n':
case 'N':
*res = true;
return 0;
case 'f':
case 'F':
*res = false;
return 0;
default:
break;
}
break;
default:
break;
}
return -EINVAL;
}
EXPORT_SYMBOL(kstrtobool);
/*
* Since "base" would be a nonsense argument, this open-codes the
* _from_user helper instead of using the helper macro below.
*/
int kstrtobool_from_user(const char __user *s, size_t count, bool *res)
{
/* Longest string needed to differentiate, newline, terminator */
char buf[4];
count = min(count, sizeof(buf) - 1);
if (copy_from_user(buf, s, count))
return -EFAULT;
buf[count] = '\0';
return kstrtobool(buf, res);
}
EXPORT_SYMBOL(kstrtobool_from_user);
#define kstrto_from_user(f, g, type) \
int f(const char __user *s, size_t count, unsigned int base, type *res) \
{ \
/* sign, base 2 representation, newline, terminator */ \
char buf[1 + sizeof(type) * 8 + 1 + 1]; \
\
count = min(count, sizeof(buf) - 1); \
if (copy_from_user(buf, s, count)) \
return -EFAULT; \
buf[count] = '\0'; \
return g(buf, base, res); \
} \
EXPORT_SYMBOL(f)
kstrto_from_user(kstrtoull_from_user, kstrtoull, unsigned long long);
kstrto_from_user(kstrtoll_from_user, kstrtoll, long long);
kstrto_from_user(kstrtoul_from_user, kstrtoul, unsigned long);
kstrto_from_user(kstrtol_from_user, kstrtol, long);
kstrto_from_user(kstrtouint_from_user, kstrtouint, unsigned int);
kstrto_from_user(kstrtoint_from_user, kstrtoint, int);
kstrto_from_user(kstrtou16_from_user, kstrtou16, u16);
kstrto_from_user(kstrtos16_from_user, kstrtos16, s16);
kstrto_from_user(kstrtou8_from_user, kstrtou8, u8);
kstrto_from_user(kstrtos8_from_user, kstrtos8, s8);
/* SPDX-License-Identifier: GPL-2.0 */
/* interrupt.h */
#ifndef _LINUX_INTERRUPT_H
#define _LINUX_INTERRUPT_H
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/cpumask.h>
#include <linux/irqreturn.h>
#include <linux/irqnr.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
#include <linux/hrtimer.h>
#include <linux/kref.h>
#include <linux/workqueue.h>
#include <linux/jump_label.h>
#include <linux/atomic.h>
#include <asm/ptrace.h>
#include <asm/irq.h>
#include <asm/sections.h>
/*
* These correspond to the IORESOURCE_IRQ_* defines in
* linux/ioport.h to select the interrupt line behaviour. When
* requesting an interrupt without specifying a IRQF_TRIGGER, the
* setting should be assumed to be "as already configured", which
* may be as per machine or firmware initialisation.
*/
#define IRQF_TRIGGER_NONE 0x00000000
#define IRQF_TRIGGER_RISING 0x00000001
#define IRQF_TRIGGER_FALLING 0x00000002
#define IRQF_TRIGGER_HIGH 0x00000004
#define IRQF_TRIGGER_LOW 0x00000008
#define IRQF_TRIGGER_MASK (IRQF_TRIGGER_HIGH | IRQF_TRIGGER_LOW | \
IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING)
#define IRQF_TRIGGER_PROBE 0x00000010
/*
* These flags used only by the kernel as part of the
* irq handling routines.
*
* IRQF_SHARED - allow sharing the irq among several devices
* IRQF_PROBE_SHARED - set by callers when they expect sharing mismatches to occur
* IRQF_TIMER - Flag to mark this interrupt as timer interrupt
* IRQF_PERCPU - Interrupt is per cpu
* IRQF_NOBALANCING - Flag to exclude this interrupt from irq balancing
* IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is
* registered first in a shared interrupt is considered for
* performance reasons)
* IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished.
* Used by threaded interrupts which need to keep the
* irq line disabled until the threaded handler has been run.
* IRQF_NO_SUSPEND - Do not disable this IRQ during suspend. Does not guarantee
* that this interrupt will wake the system from a suspended
* state. See Documentation/power/suspend-and-interrupts.rst
* IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
* IRQF_NO_THREAD - Interrupt cannot be threaded
* IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device
* resume time.
* IRQF_COND_SUSPEND - If the IRQ is shared with a NO_SUSPEND user, execute this
* interrupt handler after suspending interrupts. For system
* wakeup devices users need to implement wakeup detection in
* their interrupt handlers.
* IRQF_NO_AUTOEN - Don't enable IRQ or NMI automatically when users request it.
* Users will enable it explicitly by enable_irq() or enable_nmi()
* later.
* IRQF_NO_DEBUG - Exclude from runnaway detection for IPI and similar handlers,
* depends on IRQF_PERCPU.
*/
#define IRQF_SHARED 0x00000080
#define IRQF_PROBE_SHARED 0x00000100
#define __IRQF_TIMER 0x00000200
#define IRQF_PERCPU 0x00000400
#define IRQF_NOBALANCING 0x00000800
#define IRQF_IRQPOLL 0x00001000
#define IRQF_ONESHOT 0x00002000
#define IRQF_NO_SUSPEND 0x00004000
#define IRQF_FORCE_RESUME 0x00008000
#define IRQF_NO_THREAD 0x00010000
#define IRQF_EARLY_RESUME 0x00020000
#define IRQF_COND_SUSPEND 0x00040000
#define IRQF_NO_AUTOEN 0x00080000
#define IRQF_NO_DEBUG 0x00100000
#define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
/*
* These values can be returned by request_any_context_irq() and
* describe the context the interrupt will be run in.
*
* IRQC_IS_HARDIRQ - interrupt runs in hardirq context
* IRQC_IS_NESTED - interrupt runs in a nested threaded context
*/
enum {
IRQC_IS_HARDIRQ = 0,
IRQC_IS_NESTED,
};
typedef irqreturn_t (*irq_handler_t)(int, void *);
/**
* struct irqaction - per interrupt action descriptor
* @handler: interrupt handler function
* @name: name of the device
* @dev_id: cookie to identify the device
* @percpu_dev_id: cookie to identify the device
* @next: pointer to the next irqaction for shared interrupts
* @irq: interrupt number
* @flags: flags (see IRQF_* above)
* @thread_fn: interrupt handler function for threaded interrupts
* @thread: thread pointer for threaded interrupts
* @secondary: pointer to secondary irqaction (force threading)
* @thread_flags: flags related to @thread
* @thread_mask: bitmask for keeping track of @thread activity
* @dir: pointer to the proc/irq/NN/name entry
*/
struct irqaction {
irq_handler_t handler;
void *dev_id;
void __percpu *percpu_dev_id;
struct irqaction *next;
irq_handler_t thread_fn;
struct task_struct *thread;
struct irqaction *secondary;
unsigned int irq;
unsigned int flags;
unsigned long thread_flags;
unsigned long thread_mask;
const char *name;
struct proc_dir_entry *dir;
} ____cacheline_internodealigned_in_smp;
extern irqreturn_t no_action(int cpl, void *dev_id);
/*
* If a (PCI) device interrupt is not connected we set dev->irq to
* IRQ_NOTCONNECTED. This causes request_irq() to fail with -ENOTCONN, so we
* can distingiush that case from other error returns.
*
* 0x80000000 is guaranteed to be outside the available range of interrupts
* and easy to distinguish from other possible incorrect values.
*/
#define IRQ_NOTCONNECTED (1U << 31)
extern int __must_check
request_threaded_irq(unsigned int irq, irq_handler_t handler,
irq_handler_t thread_fn,
unsigned long flags, const char *name, void *dev);
/**
* request_irq - Add a handler for an interrupt line
* @irq: The interrupt line to allocate
* @handler: Function to be called when the IRQ occurs.
* Primary handler for threaded interrupts
* If NULL, the default primary handler is installed
* @flags: Handling flags
* @name: Name of the device generating this interrupt
* @dev: A cookie passed to the handler function
*
* This call allocates an interrupt and establishes a handler; see
* the documentation for request_threaded_irq() for details.
*/
static inline int __must_check
request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
const char *name, void *dev)
{
return request_threaded_irq(irq, handler, NULL, flags, name, dev);
}
extern int __must_check
request_any_context_irq(unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *name, void *dev_id);
extern int __must_check
__request_percpu_irq(unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *devname,
void __percpu *percpu_dev_id);
extern int __must_check
request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags,
const char *name, void *dev);
static inline int __must_check
request_percpu_irq(unsigned int irq, irq_handler_t handler,
const char *devname, void __percpu *percpu_dev_id)
{
return __request_percpu_irq(irq, handler, 0,
devname, percpu_dev_id);
}
extern int __must_check
request_percpu_nmi(unsigned int irq, irq_handler_t handler,
const char *devname, void __percpu *dev);
extern const void *free_irq(unsigned int, void *);
extern void free_percpu_irq(unsigned int, void __percpu *);
extern const void *free_nmi(unsigned int irq, void *dev_id);
extern void free_percpu_nmi(unsigned int irq, void __percpu *percpu_dev_id);
struct device;
extern int __must_check
devm_request_threaded_irq(struct device *dev, unsigned int irq,
irq_handler_t handler, irq_handler_t thread_fn,
unsigned long irqflags, const char *devname,
void *dev_id);
static inline int __must_check
devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler,
unsigned long irqflags, const char *devname, void *dev_id)
{
return devm_request_threaded_irq(dev, irq, handler, NULL, irqflags,
devname, dev_id);
}
extern int __must_check
devm_request_any_context_irq(struct device *dev, unsigned int irq,
irq_handler_t handler, unsigned long irqflags,
const char *devname, void *dev_id);
extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
/*
* On lockdep we dont want to enable hardirqs in hardirq
* context. Use local_irq_enable_in_hardirq() to annotate
* kernel code that has to do this nevertheless (pretty much
* the only valid case is for old/broken hardware that is
* insanely slow).
*
* NOTE: in theory this might break fragile code that relies
* on hardirq delivery - in practice we dont seem to have such
* places left. So the only effect should be slightly increased
* irqs-off latencies.
*/
#ifdef CONFIG_LOCKDEP
# define local_irq_enable_in_hardirq() do { } while (0)
#else
# define local_irq_enable_in_hardirq() local_irq_enable()
#endif
bool irq_has_action(unsigned int irq);
extern void disable_irq_nosync(unsigned int irq);
extern bool disable_hardirq(unsigned int irq);
extern void disable_irq(unsigned int irq);
extern void disable_percpu_irq(unsigned int irq);
extern void enable_irq(unsigned int irq);
extern void enable_percpu_irq(unsigned int irq, unsigned int type);
extern bool irq_percpu_is_enabled(unsigned int irq);
extern void irq_wake_thread(unsigned int irq, void *dev_id);
extern void disable_nmi_nosync(unsigned int irq);
extern void disable_percpu_nmi(unsigned int irq);
extern void enable_nmi(unsigned int irq);
extern void enable_percpu_nmi(unsigned int irq, unsigned int type);
extern int prepare_percpu_nmi(unsigned int irq);
extern void teardown_percpu_nmi(unsigned int irq);
extern int irq_inject_interrupt(unsigned int irq);
/* The following three functions are for the core kernel use only. */
extern void suspend_device_irqs(void);
extern void resume_device_irqs(void);
extern void rearm_wake_irq(unsigned int irq);
/**
* struct irq_affinity_notify - context for notification of IRQ affinity changes
* @irq: Interrupt to which notification applies
* @kref: Reference count, for internal use
* @work: Work item, for internal use
* @notify: Function to be called on change. This will be
* called in process context.
* @release: Function to be called on release. This will be
* called in process context. Once registered, the
* structure must only be freed when this function is
* called or later.
*/
struct irq_affinity_notify {
unsigned int irq;
struct kref kref;
struct work_struct work;
void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
void (*release)(struct kref *ref);
};
#define IRQ_AFFINITY_MAX_SETS 4
/**
* struct irq_affinity - Description for automatic irq affinity assignements
* @pre_vectors: Don't apply affinity to @pre_vectors at beginning of
* the MSI(-X) vector space
* @post_vectors: Don't apply affinity to @post_vectors at end of
* the MSI(-X) vector space
* @nr_sets: The number of interrupt sets for which affinity
* spreading is required
* @set_size: Array holding the size of each interrupt set
* @calc_sets: Callback for calculating the number and size
* of interrupt sets
* @priv: Private data for usage by @calc_sets, usually a
* pointer to driver/device specific data.
*/
struct irq_affinity {
unsigned int pre_vectors;
unsigned int post_vectors;
unsigned int nr_sets;
unsigned int set_size[IRQ_AFFINITY_MAX_SETS];
void (*calc_sets)(struct irq_affinity *, unsigned int nvecs);
void *priv;
};
/**
* struct irq_affinity_desc - Interrupt affinity descriptor
* @mask: cpumask to hold the affinity assignment
* @is_managed: 1 if the interrupt is managed internally
*/
struct irq_affinity_desc {
struct cpumask mask;
unsigned int is_managed : 1;
};
#if defined(CONFIG_SMP)
extern cpumask_var_t irq_default_affinity;
extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask);
extern int irq_can_set_affinity(unsigned int irq);
extern int irq_select_affinity(unsigned int irq);
extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
extern int irq_update_affinity_desc(unsigned int irq,
struct irq_affinity_desc *affinity);
extern int
irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
struct irq_affinity_desc *
irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd);
unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
const struct irq_affinity *affd);
#else /* CONFIG_SMP */
static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
{
return -EINVAL;
}
static inline int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
{
return 0;
}
static inline int irq_can_set_affinity(unsigned int irq)
{
return 0;
}
static inline int irq_select_affinity(unsigned int irq) { return 0; }
static inline int irq_set_affinity_hint(unsigned int irq,
const struct cpumask *m)
{
return -EINVAL;
}
static inline int irq_update_affinity_desc(unsigned int irq,
struct irq_affinity_desc *affinity)
{
return -EINVAL;
}
static inline int
irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
{
return 0;
}
static inline struct irq_affinity_desc *
irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd)
{
return NULL;
}
static inline unsigned int
irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
const struct irq_affinity *affd)
{
return maxvec;
}
#endif /* CONFIG_SMP */
/*
* Special lockdep variants of irq disabling/enabling.
* These should be used for locking constructs that
* know that a particular irq context which is disabled,
* and which is the only irq-context user of a lock,
* that it's safe to take the lock in the irq-disabled
* section without disabling hardirqs.
*
* On !CONFIG_LOCKDEP they are equivalent to the normal
* irq disable/enable methods.
*/
static inline void disable_irq_nosync_lockdep(unsigned int irq)
{
disable_irq_nosync(irq);
#ifdef CONFIG_LOCKDEP
local_irq_disable();
#endif
}
static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags)
{
disable_irq_nosync(irq);
#ifdef CONFIG_LOCKDEP
local_irq_save(*flags);
#endif
}
static inline void disable_irq_lockdep(unsigned int irq)
{
disable_irq(irq);
#ifdef CONFIG_LOCKDEP
local_irq_disable();
#endif
}
static inline void enable_irq_lockdep(unsigned int irq)
{
#ifdef CONFIG_LOCKDEP
local_irq_enable();
#endif
enable_irq(irq);
}
static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags)
{
#ifdef CONFIG_LOCKDEP
local_irq_restore(*flags);
#endif
enable_irq(irq);
}
/* IRQ wakeup (PM) control: */
extern int irq_set_irq_wake(unsigned int irq, unsigned int on);
static inline int enable_irq_wake(unsigned int irq)
{
return irq_set_irq_wake(irq, 1);
}
static inline int disable_irq_wake(unsigned int irq)
{
return irq_set_irq_wake(irq, 0);
}
/*
* irq_get_irqchip_state/irq_set_irqchip_state specific flags
*/
enum irqchip_irq_state {
IRQCHIP_STATE_PENDING, /* Is interrupt pending? */
IRQCHIP_STATE_ACTIVE, /* Is interrupt in progress? */
IRQCHIP_STATE_MASKED, /* Is interrupt masked? */
IRQCHIP_STATE_LINE_LEVEL, /* Is IRQ line high? */
};
extern int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
bool *state);
extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
bool state);
#ifdef CONFIG_IRQ_FORCED_THREADING
# ifdef CONFIG_PREEMPT_RT
# define force_irqthreads() (true)
# else
DECLARE_STATIC_KEY_FALSE(force_irqthreads_key);
# define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key))
# endif
#else
#define force_irqthreads() (false)
#endif
#ifndef local_softirq_pending
#ifndef local_softirq_pending_ref
#define local_softirq_pending_ref irq_stat.__softirq_pending
#endif
#define local_softirq_pending() (__this_cpu_read(local_softirq_pending_ref))
#define set_softirq_pending(x) (__this_cpu_write(local_softirq_pending_ref, (x)))
#define or_softirq_pending(x) (__this_cpu_or(local_softirq_pending_ref, (x)))
#endif /* local_softirq_pending */
/* Some architectures might implement lazy enabling/disabling of
* interrupts. In some cases, such as stop_machine, we might want
* to ensure that after a local_irq_disable(), interrupts have
* really been disabled in hardware. Such architectures need to
* implement the following hook.
*/
#ifndef hard_irq_disable
#define hard_irq_disable() do { } while(0)
#endif
/* PLEASE, avoid to allocate new softirqs, if you need not _really_ high
frequency threaded job scheduling. For almost all the purposes
tasklets are more than enough. F.e. all serial device BHs et
al. should be converted to tasklets, not to softirqs.
*/
enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
IRQ_POLL_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
NR_SOFTIRQS
};
#define SOFTIRQ_STOP_IDLE_MASK (~(1 << RCU_SOFTIRQ))
/* map softirq index to softirq name. update 'softirq_to_name' in
* kernel/softirq.c when adding a new softirq.
*/
extern const char * const softirq_to_name[NR_SOFTIRQS];
/* softirq mask and active fields moved to irq_cpustat_t in
* asm/hardirq.h to get better cache usage. KAO
*/
struct softirq_action
{
void (*action)(struct softirq_action *);
};
asmlinkage void do_softirq(void);
asmlinkage void __do_softirq(void);
extern void open_softirq(int nr, void (*action)(struct softirq_action *));
extern void softirq_init(void);
extern void __raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq(unsigned int nr);
DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
static inline struct task_struct *this_cpu_ksoftirqd(void)
{
return this_cpu_read(ksoftirqd);
}
/* Tasklets --- multithreaded analogue of BHs.
This API is deprecated. Please consider using threaded IRQs instead:
https://lore.kernel.org/lkml/20200716081538.2sivhkj4hcyrusem@linutronix.de
Main feature differing them of generic softirqs: tasklet
is running only on one CPU simultaneously.
Main feature differing them of BHs: different tasklets
may be run simultaneously on different CPUs.
Properties:
* If tasklet_schedule() is called, then tasklet is guaranteed
to be executed on some cpu at least once after this.
* If the tasklet is already scheduled, but its execution is still not
started, it will be executed only once.
* If this tasklet is already running on another CPU (or schedule is called
from tasklet itself), it is rescheduled for later.
* Tasklet is strictly serialized wrt itself, but not
wrt another tasklets. If client needs some intertask synchronization,
he makes it with spinlocks.
*/
struct tasklet_struct
{
struct tasklet_struct *next;
unsigned long state;
atomic_t count;
bool use_callback;
union {
void (*func)(unsigned long data);
void (*callback)(struct tasklet_struct *t);
};
unsigned long data;
};
#define DECLARE_TASKLET(name, _callback) \
struct tasklet_struct name = { \
.count = ATOMIC_INIT(0), \
.callback = _callback, \
.use_callback = true, \
}
#define DECLARE_TASKLET_DISABLED(name, _callback) \
struct tasklet_struct name = { \
.count = ATOMIC_INIT(1), \
.callback = _callback, \
.use_callback = true, \
}
#define from_tasklet(var, callback_tasklet, tasklet_fieldname) \
container_of(callback_tasklet, typeof(*var), tasklet_fieldname)
#define DECLARE_TASKLET_OLD(name, _func) \
struct tasklet_struct name = { \
.count = ATOMIC_INIT(0), \
.func = _func, \
}
#define DECLARE_TASKLET_DISABLED_OLD(name, _func) \
struct tasklet_struct name = { \
.count = ATOMIC_INIT(1), \
.func = _func, \
}
enum
{
TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */
TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
};
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
static inline int tasklet_trylock(struct tasklet_struct *t)
{
return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
}
void tasklet_unlock(struct tasklet_struct *t);
void tasklet_unlock_wait(struct tasklet_struct *t);
void tasklet_unlock_spin_wait(struct tasklet_struct *t);
#else
static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; }
static inline void tasklet_unlock(struct tasklet_struct *t) { }
static inline void tasklet_unlock_wait(struct tasklet_struct *t) { }
static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) { }
#endif
extern void __tasklet_schedule(struct tasklet_struct *t);
static inline void tasklet_schedule(struct tasklet_struct *t)
{
if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
__tasklet_schedule(t);
}
extern void __tasklet_hi_schedule(struct tasklet_struct *t);
static inline void tasklet_hi_schedule(struct tasklet_struct *t)
{
if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
__tasklet_hi_schedule(t);
}
static inline void tasklet_disable_nosync(struct tasklet_struct *t)
{
atomic_inc(&t->count);
smp_mb__after_atomic();
}
/*
* Do not use in new code. Disabling tasklets from atomic contexts is
* error prone and should be avoided.
*/
static inline void tasklet_disable_in_atomic(struct tasklet_struct *t)
{
tasklet_disable_nosync(t);
tasklet_unlock_spin_wait(t);
smp_mb();
}
static inline void tasklet_disable(struct tasklet_struct *t)
{
tasklet_disable_nosync(t);
tasklet_unlock_wait(t);
smp_mb();
}
static inline void tasklet_enable(struct tasklet_struct *t)
{
smp_mb__before_atomic();
atomic_dec(&t->count);
}
extern void tasklet_kill(struct tasklet_struct *t);
extern void tasklet_init(struct tasklet_struct *t,
void (*func)(unsigned long), unsigned long data);
extern void tasklet_setup(struct tasklet_struct *t,
void (*callback)(struct tasklet_struct *));
/*
* Autoprobing for irqs:
*
* probe_irq_on() and probe_irq_off() provide robust primitives
* for accurate IRQ probing during kernel initialization. They are
* reasonably simple to use, are not "fooled" by spurious interrupts,
* and, unlike other attempts at IRQ probing, they do not get hung on
* stuck interrupts (such as unused PS2 mouse interfaces on ASUS boards).
*
* For reasonably foolproof probing, use them as follows:
*
* 1. clear and/or mask the device's internal interrupt.
* 2. sti();
* 3. irqs = probe_irq_on(); // "take over" all unassigned idle IRQs
* 4. enable the device and cause it to trigger an interrupt.
* 5. wait for the device to interrupt, using non-intrusive polling or a delay.
* 6. irq = probe_irq_off(irqs); // get IRQ number, 0=none, negative=multiple
* 7. service the device to clear its pending interrupt.
* 8. loop again if paranoia is required.
*
* probe_irq_on() returns a mask of allocated irq's.
*
* probe_irq_off() takes the mask as a parameter,
* and returns the irq number which occurred,
* or zero if none occurred, or a negative irq number
* if more than one irq occurred.
*/
#if !defined(CONFIG_GENERIC_IRQ_PROBE)
static inline unsigned long probe_irq_on(void)
{
return 0;
}
static inline int probe_irq_off(unsigned long val)
{
return 0;
}
static inline unsigned int probe_irq_mask(unsigned long val)
{
return 0;
}
#else
extern unsigned long probe_irq_on(void); /* returns 0 on failure */
extern int probe_irq_off(unsigned long); /* returns 0 or negative on failure */
extern unsigned int probe_irq_mask(unsigned long); /* returns mask of ISA interrupts */
#endif
#ifdef CONFIG_PROC_FS
/* Initialize /proc/irq/ */
extern void init_irq_proc(void);
#else
static inline void init_irq_proc(void)
{
}
#endif
#ifdef CONFIG_IRQ_TIMINGS
void irq_timings_enable(void);
void irq_timings_disable(void);
u64 irq_timings_next_event(u64 now);
#endif
struct seq_file;
int show_interrupts(struct seq_file *p, void *v);
int arch_show_interrupts(struct seq_file *p, int prec);
extern int early_irq_init(void);
extern int arch_probe_nr_irqs(void);
extern int arch_early_irq_init(void);
/*
* We want to know which function is an entrypoint of a hardirq or a softirq.
*/
#ifndef __irq_entry
# define __irq_entry __section(".irqentry.text")
#endif
#define __softirq_entry __section(".softirqentry.text")
#endif
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/signal.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* 1997-11-02 Modified for POSIX.1b signals by Richard Henderson
*
* 2003-06-02 Jim Houston - Concurrent Computer Corp.
* Changes to use preallocated sigqueue structures
* to allow signals to be sent reliably.
*/
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/user.h>
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/coredump.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/signalfd.h>
#include <linux/ratelimit.h>
#include <linux/tracehook.h>
#include <linux/capability.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/uprobes.h>
#include <linux/compat.h>
#include <linux/cn_proc.h>
#include <linux/compiler.h>
#include <linux/posix-timers.h>
#include <linux/cgroup.h>
#include <linux/audit.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
#include <asm/param.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/siginfo.h>
#include <asm/cacheflush.h>
#include <asm/syscall.h> /* for syscall_get_* */
/*
* SLAB caches for signal bits.
*/
static struct kmem_cache *sigqueue_cachep;
int print_fatal_signals __read_mostly;
static void __user *sig_handler(struct task_struct *t, int sig)
{
return t->sighand->action[sig - 1].sa.sa_handler;
}
static inline bool sig_handler_ignored(void __user *handler, int sig)
{
/* Is it explicitly or implicitly ignored? */
return handler == SIG_IGN || (handler == SIG_DFL && sig_kernel_ignore(sig));
}
static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
{
void __user *handler;
handler = sig_handler(t, sig);
/* SIGKILL and SIGSTOP may not be sent to the global init */
if (unlikely(is_global_init(t) && sig_kernel_only(sig)))
return true;
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL && !(force && sig_kernel_only(sig)))
return true;
/* Only allow kernel generated signals to this kthread */
if (unlikely((t->flags & PF_KTHREAD) &&
(handler == SIG_KTHREAD_KERNEL) && !force))
return true;
return sig_handler_ignored(handler, sig);
}
static bool sig_ignored(struct task_struct *t, int sig, bool force)
{
/*
* Blocked signals are never ignored, since the
* signal handler may change by the time it is
* unblocked.
*/
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return false;
/*
* Tracers may want to know about even ignored signal unless it
* is SIGKILL which can't be reported anyway but can be ignored
* by SIGNAL_UNKILLABLE task.
*/
if (t->ptrace && sig != SIGKILL)
return false;
return sig_task_ignored(t, sig, force);
}
/*
* Re-calculate pending state from the set of locally pending
* signals, globally pending signals, and blocked signals.
*/
static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
{
unsigned long ready;
long i;
switch (_NSIG_WORDS) {
default:
for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;)
ready |= signal->sig[i] &~ blocked->sig[i];
break;
case 4: ready = signal->sig[3] &~ blocked->sig[3];
ready |= signal->sig[2] &~ blocked->sig[2];
ready |= signal->sig[1] &~ blocked->sig[1];
ready |= signal->sig[0] &~ blocked->sig[0];
break;
case 2: ready = signal->sig[1] &~ blocked->sig[1];
ready |= signal->sig[0] &~ blocked->sig[0];
break;
case 1: ready = signal->sig[0] &~ blocked->sig[0];
}
return ready != 0;
}
#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
static bool recalc_sigpending_tsk(struct task_struct *t)
{
if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked) ||
cgroup_task_frozen(t)) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
return true;
}
/*
* We must never clear the flag in another thread, or in current
* when it's possible the current syscall is returning -ERESTART*.
* So we don't clear it here, and only callers who know they should do.
*/
return false;
}
/*
* After recalculating TIF_SIGPENDING, we need to make sure the task wakes up.
* This is superfluous when called on current, the wakeup is a harmless no-op.
*/
void recalc_sigpending_and_wake(struct task_struct *t)
{
if (recalc_sigpending_tsk(t))
signal_wake_up(t, 0);
}
void recalc_sigpending(void)
{
if (!recalc_sigpending_tsk(current) && !freezing(current))
clear_thread_flag(TIF_SIGPENDING);
}
EXPORT_SYMBOL(recalc_sigpending);
void calculate_sigpending(void)
{
/* Have any signals or users of TIF_SIGPENDING been delayed
* until after fork?
*/
spin_lock_irq(¤t->sighand->siglock);
set_tsk_thread_flag(current, TIF_SIGPENDING);
recalc_sigpending();
spin_unlock_irq(¤t->sighand->siglock);
}
/* Given the mask, find the first available signal that should be serviced. */
#define SYNCHRONOUS_MASK \
(sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
int next_signal(struct sigpending *pending, sigset_t *mask)
{
unsigned long i, *s, *m, x;
int sig = 0;
s = pending->signal.sig;
m = mask->sig;
/*
* Handle the first word specially: it contains the
* synchronous signals that need to be dequeued first.
*/
x = *s &~ *m;
if (x) {
if (x & SYNCHRONOUS_MASK)
x &= SYNCHRONOUS_MASK;
sig = ffz(~x) + 1;
return sig;
}
switch (_NSIG_WORDS) {
default:
for (i = 1; i < _NSIG_WORDS; ++i) {
x = *++s &~ *++m;
if (!x)
continue;
sig = ffz(~x) + i*_NSIG_BPW + 1;
break;
}
break;
case 2:
x = s[1] &~ m[1];
if (!x)
break;
sig = ffz(~x) + _NSIG_BPW + 1;
break;
case 1:
/* Nothing to do */
break;
}
return sig;
}
static inline void print_dropped_signal(int sig)
{
static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
if (!print_fatal_signals)
return;
if (!__ratelimit(&ratelimit_state))
return;
pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
current->comm, current->pid, sig);
}
/**
* task_set_jobctl_pending - set jobctl pending bits
* @task: target task
* @mask: pending bits to set
*
* Clear @mask from @task->jobctl. @mask must be subset of
* %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
* %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is
* cleared. If @task is already being killed or exiting, this function
* becomes noop.
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*
* RETURNS:
* %true if @mask is set, %false if made noop because @task was dying.
*/
bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
{
BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
return false;
if (mask & JOBCTL_STOP_SIGMASK)
task->jobctl &= ~JOBCTL_STOP_SIGMASK;
task->jobctl |= mask;
return true;
}
/**
* task_clear_jobctl_trapping - clear jobctl trapping bit
* @task: target task
*
* If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
* Clear it and wake up the ptracer. Note that we don't need any further
* locking. @task->siglock guarantees that @task->parent points to the
* ptracer.
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*/
void task_clear_jobctl_trapping(struct task_struct *task)
{
if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
task->jobctl &= ~JOBCTL_TRAPPING;
smp_mb(); /* advised by wake_up_bit() */
wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
}
}
/**
* task_clear_jobctl_pending - clear jobctl pending bits
* @task: target task
* @mask: pending bits to clear
*
* Clear @mask from @task->jobctl. @mask must be subset of
* %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other
* STOP bits are cleared together.
*
* If clearing of @mask leaves no stop or trap pending, this function calls
* task_clear_jobctl_trapping().
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*/
void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
{
BUG_ON(mask & ~JOBCTL_PENDING_MASK);
if (mask & JOBCTL_STOP_PENDING)
mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
task->jobctl &= ~mask;
if (!(task->jobctl & JOBCTL_PENDING_MASK))
task_clear_jobctl_trapping(task);
}
/**
* task_participate_group_stop - participate in a group stop
* @task: task participating in a group stop
*
* @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
* Group stop states are cleared and the group stop count is consumed if
* %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
* stop, the appropriate `SIGNAL_*` flags are set.
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*
* RETURNS:
* %true if group stop completion should be notified to the parent, %false
* otherwise.
*/
static bool task_participate_group_stop(struct task_struct *task)
{
struct signal_struct *sig = task->signal;
bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
if (!consume)
return false;
if (!WARN_ON_ONCE(sig->group_stop_count == 0))
sig->group_stop_count--;
/*
* Tell the caller to notify completion iff we are entering into a
* fresh group stop. Read comment in do_signal_stop() for details.
*/
if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED);
return true;
}
return false;
}
void task_join_group_stop(struct task_struct *task)
{
unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK;
struct signal_struct *sig = current->signal;
if (sig->group_stop_count) {
sig->group_stop_count++;
mask |= JOBCTL_STOP_CONSUME;
} else if (!(sig->flags & SIGNAL_STOP_STOPPED))
return;
/* Have the new thread join an on-going signal group stop */
task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING);
}
/*
* allocate a new signal queue record
* - this may be called without locks if and only if t == current, otherwise an
* appropriate lock must be held to stop the target task from exiting
*/
static struct sigqueue *
__sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
int override_rlimit, const unsigned int sigqueue_flags)
{
struct sigqueue *q = NULL;
struct ucounts *ucounts = NULL;
long sigpending;
/*
* Protect access to @t credentials. This can go away when all
* callers hold rcu read lock.
*
* NOTE! A pending signal will hold on to the user refcount,
* and we get/put the refcount only when the sigpending count
* changes from/to zero.
*/
rcu_read_lock();
ucounts = task_ucounts(t);
sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
if (!sigpending)
return NULL;
if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
} else {
print_dropped_signal(sig);
}
if (unlikely(q == NULL)) {
dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
} else {
INIT_LIST_HEAD(&q->list);
q->flags = sigqueue_flags;
q->ucounts = ucounts;
}
return q;
}
static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
if (q->ucounts) { dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING);
q->ucounts = NULL;
}
kmem_cache_free(sigqueue_cachep, q);
}
void flush_sigqueue(struct sigpending *queue)
{
struct sigqueue *q;
sigemptyset(&queue->signal);
while (!list_empty(&queue->list)) {
q = list_entry(queue->list.next, struct sigqueue , list);
list_del_init(&q->list);
__sigqueue_free(q);
}
}
/*
* Flush all pending signals for this kthread.
*/
void flush_signals(struct task_struct *t)
{
unsigned long flags;
spin_lock_irqsave(&t->sighand->siglock, flags);
clear_tsk_thread_flag(t, TIF_SIGPENDING);
flush_sigqueue(&t->pending);
flush_sigqueue(&t->signal->shared_pending);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
EXPORT_SYMBOL(flush_signals);
#ifdef CONFIG_POSIX_TIMERS
static void __flush_itimer_signals(struct sigpending *pending)
{
sigset_t signal, retain;
struct sigqueue *q, *n;
signal = pending->signal;
sigemptyset(&retain);
list_for_each_entry_safe(q, n, &pending->list, list) {
int sig = q->info.si_signo;
if (likely(q->info.si_code != SI_TIMER)) {
sigaddset(&retain, sig);
} else {
sigdelset(&signal, sig);
list_del_init(&q->list);
__sigqueue_free(q);
}
}
sigorsets(&pending->signal, &signal, &retain);
}
void flush_itimer_signals(void)
{
struct task_struct *tsk = current;
unsigned long flags;
spin_lock_irqsave(&tsk->sighand->siglock, flags);
__flush_itimer_signals(&tsk->pending);
__flush_itimer_signals(&tsk->signal->shared_pending);
spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
}
#endif
void ignore_signals(struct task_struct *t)
{
int i;
for (i = 0; i < _NSIG; ++i)
t->sighand->action[i].sa.sa_handler = SIG_IGN;
flush_signals(t);
}
/*
* Flush all handlers for a task.
*/
void
flush_signal_handlers(struct task_struct *t, int force_default)
{
int i;
struct k_sigaction *ka = &t->sighand->action[0];
for (i = _NSIG ; i != 0 ; i--) {
if (force_default || ka->sa.sa_handler != SIG_IGN)
ka->sa.sa_handler = SIG_DFL;
ka->sa.sa_flags = 0;
#ifdef __ARCH_HAS_SA_RESTORER
ka->sa.sa_restorer = NULL;
#endif
sigemptyset(&ka->sa.sa_mask);
ka++;
}
}
bool unhandled_signal(struct task_struct *tsk, int sig)
{
void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
if (is_global_init(tsk))
return true;
if (handler != SIG_IGN && handler != SIG_DFL)
return false;
/* if ptraced, let the tracer determine */
return !tsk->ptrace;
}
static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info,
bool *resched_timer)
{
struct sigqueue *q, *first = NULL;
/*
* Collect the siginfo appropriate to this signal. Check if
* there is another siginfo for the same signal.
*/
list_for_each_entry(q, &list->list, list) {
if (q->info.si_signo == sig) {
if (first)
goto still_pending;
first = q;
}
}
sigdelset(&list->signal, sig);
if (first) {
still_pending:
list_del_init(&first->list);
copy_siginfo(info, &first->info);
*resched_timer =
(first->flags & SIGQUEUE_PREALLOC) &&
(info->si_code == SI_TIMER) &&
(info->si_sys_private);
__sigqueue_free(first);
} else {
/*
* Ok, it wasn't in the queue. This must be
* a fast-pathed signal or we must have been
* out of queue space. So zero out the info.
*/
clear_siginfo(info);
info->si_signo = sig;
info->si_errno = 0;
info->si_code = SI_USER;
info->si_pid = 0;
info->si_uid = 0;
}
}
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
kernel_siginfo_t *info, bool *resched_timer)
{
int sig = next_signal(pending, mask);
if (sig)
collect_signal(sig, pending, info, resched_timer);
return sig;
}
/*
* Dequeue a signal and return the element to the caller, which is
* expected to free it.
*
* All callers have to hold the siglock.
*/
int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *info)
{
bool resched_timer = false;
int signr;
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
if (!signr) {
signr = __dequeue_signal(&tsk->signal->shared_pending,
mask, info, &resched_timer);
#ifdef CONFIG_POSIX_TIMERS
/*
* itimer signal ?
*
* itimers are process shared and we restart periodic
* itimers in the signal delivery path to prevent DoS
* attacks in the high resolution timer case. This is
* compliant with the old way of self-restarting
* itimers, as the SIGALRM is a legacy signal and only
* queued once. Changing the restart behaviour to
* restart the timer in the signal dequeue path is
* reducing the timer noise on heavy loaded !highres
* systems too.
*/
if (unlikely(signr == SIGALRM)) {
struct hrtimer *tmr = &tsk->signal->real_timer;
if (!hrtimer_is_queued(tmr) &&
tsk->signal->it_real_incr != 0) {
hrtimer_forward(tmr, tmr->base->get_time(),
tsk->signal->it_real_incr);
hrtimer_restart(tmr);
}
}
#endif
}
recalc_sigpending();
if (!signr)
return 0;
if (unlikely(sig_kernel_stop(signr))) {
/*
* Set a marker that we have dequeued a stop signal. Our
* caller might release the siglock and then the pending
* stop signal it is about to process is no longer in the
* pending bitmasks, but must still be cleared by a SIGCONT
* (and overruled by a SIGKILL). So those cases clear this
* shared flag after we've set it. Note that this flag may
* remain set after the signal we return is ignored or
* handled. That doesn't matter because its only purpose
* is to alert stop-signal processing code when another
* processor has come along and cleared the flag.
*/
current->jobctl |= JOBCTL_STOP_DEQUEUED;
}
#ifdef CONFIG_POSIX_TIMERS
if (resched_timer) {
/*
* Release the siglock to ensure proper locking order
* of timer locks outside of siglocks. Note, we leave
* irqs disabled here, since the posix-timers code is
* about to disable them again anyway.
*/
spin_unlock(&tsk->sighand->siglock);
posixtimer_rearm(info);
spin_lock(&tsk->sighand->siglock);
/* Don't expose the si_sys_private value to userspace */
info->si_sys_private = 0;
}
#endif
return signr;
}
EXPORT_SYMBOL_GPL(dequeue_signal);
static int dequeue_synchronous_signal(kernel_siginfo_t *info)
{
struct task_struct *tsk = current;
struct sigpending *pending = &tsk->pending;
struct sigqueue *q, *sync = NULL;
/*
* Might a synchronous signal be in the queue?
*/
if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK))
return 0;
/*
* Return the first synchronous signal in the queue.
*/
list_for_each_entry(q, &pending->list, list) {
/* Synchronous signals have a positive si_code */
if ((q->info.si_code > SI_USER) && (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) {
sync = q;
goto next;
}
}
return 0;
next:
/*
* Check if there is another siginfo for the same signal.
*/
list_for_each_entry_continue(q, &pending->list, list) { if (q->info.si_signo == sync->info.si_signo)
goto still_pending;
}
sigdelset(&pending->signal, sync->info.si_signo);
recalc_sigpending();
still_pending:
list_del_init(&sync->list);
copy_siginfo(info, &sync->info);
__sigqueue_free(sync);
return info->si_signo;
}
/*
* Tell a process that it has a new active signal..
*
* NOTE! we rely on the previous spin_lock to
* lock interrupts for us! We can only be called with
* "siglock" held, and the local interrupt must
* have been disabled when that got acquired!
*
* No need to set need_resched since signal event passing
* goes through ->blocked
*/
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
set_tsk_thread_flag(t, TIF_SIGPENDING);
/*
* TASK_WAKEKILL also means wake it up in the stopped/traced/killable
* case. We don't check t->state here because there is a race with it
* executing another processor and just now entering stopped state.
* By using wake_up_state, we ensure the process will wake up and
* handle its death signal.
*/
if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
kick_process(t);
}
/*
* Remove signals in mask from the pending set and queue.
* Returns 1 if any signals were found.
*
* All callers must be holding the siglock.
*/
static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
{
struct sigqueue *q, *n;
sigset_t m;
sigandsets(&m, mask, &s->signal);
if (sigisemptyset(&m))
return;
sigandnsets(&s->signal, &s->signal, mask);
list_for_each_entry_safe(q, n, &s->list, list) {
if (sigismember(mask, q->info.si_signo)) {
list_del_init(&q->list);
__sigqueue_free(q);
}
}
}
static inline int is_si_special(const struct kernel_siginfo *info)
{
return info <= SEND_SIG_PRIV;
}
static inline bool si_fromuser(const struct kernel_siginfo *info)
{
return info == SEND_SIG_NOINFO ||
(!is_si_special(info) && SI_FROMUSER(info));
}
/*
* called with RCU read lock from check_kill_permission()
*/
static bool kill_ok_by_cred(struct task_struct *t)
{
const struct cred *cred = current_cred();
const struct cred *tcred = __task_cred(t);
return uid_eq(cred->euid, tcred->suid) ||
uid_eq(cred->euid, tcred->uid) ||
uid_eq(cred->uid, tcred->suid) ||
uid_eq(cred->uid, tcred->uid) ||
ns_capable(tcred->user_ns, CAP_KILL);
}
/*
* Bad permissions for sending the signal
* - the caller must hold the RCU read lock
*/
static int check_kill_permission(int sig, struct kernel_siginfo *info,
struct task_struct *t)
{
struct pid *sid;
int error;
if (!valid_signal(sig))
return -EINVAL;
if (!si_fromuser(info))
return 0;
error = audit_signal_info(sig, t); /* Let audit system see the signal */
if (error)
return error;
if (!same_thread_group(current, t) &&
!kill_ok_by_cred(t)) {
switch (sig) {
case SIGCONT:
sid = task_session(t);
/*
* We don't return the error if sid == NULL. The
* task was unhashed, the caller must notice this.
*/
if (!sid || sid == task_session(current))
break;
fallthrough;
default:
return -EPERM;
}
}
return security_task_kill(t, info, sig, NULL);
}
/**
* ptrace_trap_notify - schedule trap to notify ptracer
* @t: tracee wanting to notify tracer
*
* This function schedules sticky ptrace trap which is cleared on the next
* TRAP_STOP to notify ptracer of an event. @t must have been seized by
* ptracer.
*
* If @t is running, STOP trap will be taken. If trapped for STOP and
* ptracer is listening for events, tracee is woken up so that it can
* re-trap for the new event. If trapped otherwise, STOP trap will be
* eventually taken without returning to userland after the existing traps
* are finished by PTRACE_CONT.
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
*/
static void ptrace_trap_notify(struct task_struct *t)
{
WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
assert_spin_locked(&t->sighand->siglock);
task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
}
/*
* Handle magic process-wide effects of stop/continue signals. Unlike
* the signal actions, these happen immediately at signal-generation
* time regardless of blocking, ignoring, or handling. This does the
* actual continuing for SIGCONT, but not the actual stopping for stop
* signals. The process stop is done as a signal action for SIG_DFL.
*
* Returns true if the signal should be actually delivered, otherwise
* it should be dropped.
*/
static bool prepare_signal(int sig, struct task_struct *p, bool force)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
sigset_t flush;
if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
if (!(signal->flags & SIGNAL_GROUP_EXIT)) return sig == SIGKILL;
/*
* The process is in the middle of dying, nothing to do.
*/
} else if (sig_kernel_stop(sig)) {
/*
* This is a stop signal. Remove SIGCONT from all queues.
*/
siginitset(&flush, sigmask(SIGCONT));
flush_sigqueue_mask(&flush, &signal->shared_pending);
for_each_thread(p, t)
flush_sigqueue_mask(&flush, &t->pending); } else if (sig == SIGCONT) {
unsigned int why;
/*
* Remove all stop signals from all queues, wake all threads.
*/
siginitset(&flush, SIG_KERNEL_STOP_MASK);
flush_sigqueue_mask(&flush, &signal->shared_pending);
for_each_thread(p, t) { flush_sigqueue_mask(&flush, &t->pending);
task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
if (likely(!(t->ptrace & PT_SEIZED))) wake_up_state(t, __TASK_STOPPED);
else
ptrace_trap_notify(t);
}
/*
* Notify the parent with CLD_CONTINUED if we were stopped.
*
* If we were in the middle of a group stop, we pretend it
* was already finished, and then continued. Since SIGCHLD
* doesn't queue we report only CLD_STOPPED, as if the next
* CLD_CONTINUED was dropped.
*/
why = 0;
if (signal->flags & SIGNAL_STOP_STOPPED)
why |= SIGNAL_CLD_CONTINUED;
else if (signal->group_stop_count)
why |= SIGNAL_CLD_STOPPED;
if (why) {
/*
* The first thread which returns from do_signal_stop()
* will take ->siglock, notice SIGNAL_CLD_MASK, and
* notify its parent. See get_signal().
*/
signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED);
signal->group_stop_count = 0;
signal->group_exit_code = 0;
}
}
return !sig_ignored(p, sig, force);
}
/*
* Test if P wants to take SIG. After we've checked all threads with this,
* it's equivalent to finding no threads not blocking SIG. Any threads not
* blocking SIG were ruled out because they are not running and already
* have pending signals. Such threads will dequeue from the shared queue
* as soon as they're available, so putting the signal on the shared queue
* will be equivalent to sending it to one such thread.
*/
static inline bool wants_signal(int sig, struct task_struct *p)
{
if (sigismember(&p->blocked, sig))
return false;
if (p->flags & PF_EXITING)
return false;
if (sig == SIGKILL)
return true;
if (task_is_stopped_or_traced(p))
return false;
return task_curr(p) || !task_sigpending(p);
}
static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
/*
* Now find a thread we can wake up to take the signal off the queue.
*
* If the main thread wants the signal, it gets first crack.
* Probably the least surprising to the average bear.
*/
if (wants_signal(sig, p))
t = p;
else if ((type == PIDTYPE_PID) || thread_group_empty(p))
/*
* There is just one thread and it does not need to be woken.
* It will dequeue unblocked signals before it runs again.
*/
return;
else {
/*
* Otherwise try to find a suitable thread.
*/
t = signal->curr_target;
while (!wants_signal(sig, t)) {
t = next_thread(t);
if (t == signal->curr_target)
/*
* No thread needs to be woken.
* Any eligible threads will see
* the signal in the queue soon.
*/
return;
}
signal->curr_target = t;
}
/*
* Found a killable thread. If the signal will be fatal,
* then start taking the whole group down immediately.
*/
if (sig_fatal(p, sig) && !(signal->flags & SIGNAL_GROUP_EXIT) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || !p->ptrace)) {
/*
* This signal will be fatal to the whole group.
*/
if (!sig_kernel_coredump(sig)) {
/*
* Start a group exit and wake everybody up.
* This way we don't have other threads
* running and doing things after a slower
* thread has the fatal signal pending.
*/
signal->flags = SIGNAL_GROUP_EXIT;
signal->group_exit_code = sig;
signal->group_stop_count = 0;
t = p;
do {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
} while_each_thread(p, t);
return;
}
}
/*
* The signal is already in the shared-pending queue.
* Tell the chosen thread to wake up and dequeue it.
*/
signal_wake_up(t, sig == SIGKILL);
return;
}
static inline bool legacy_queue(struct sigpending *signals, int sig)
{
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
enum pid_type type, bool force)
{
struct sigpending *pending;
struct sigqueue *q;
int override_rlimit;
int ret = 0, result;
assert_spin_locked(&t->sighand->siglock);
result = TRACE_SIGNAL_IGNORED;
if (!prepare_signal(sig, t, force))
goto ret;
pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
/*
* Short-circuit ignored signals and support queuing
* exactly one non-rt signal, so that we can get more
* detailed information about the cause of the signal.
*/
result = TRACE_SIGNAL_ALREADY_PENDING;
if (legacy_queue(pending, sig))
goto ret;
result = TRACE_SIGNAL_DELIVERED;
/*
* Skip useless siginfo allocation for SIGKILL and kernel threads.
*/
if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
goto out_set;
/*
* Real-time signals must be queued if sent by sigqueue, or
* some other real-time mechanism. It is implementation
* defined whether kill() does so. We attempt to do so, on
* the principle of least surprise, but since kill is not
* allowed to fail with EAGAIN when low on memory we just
* make sure at least one signal gets delivered and don't
* pass on the info struct.
*/
if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0);
else
override_rlimit = 0;
q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit, 0);
if (q) {
list_add_tail(&q->list, &pending->list);
switch ((unsigned long) info) {
case (unsigned long) SEND_SIG_NOINFO:
clear_siginfo(&q->info);
q->info.si_signo = sig;
q->info.si_errno = 0;
q->info.si_code = SI_USER;
q->info.si_pid = task_tgid_nr_ns(current,
task_active_pid_ns(t));
rcu_read_lock();
q->info.si_uid =
from_kuid_munged(task_cred_xxx(t, user_ns),
current_uid());
rcu_read_unlock();
break;
case (unsigned long) SEND_SIG_PRIV:
clear_siginfo(&q->info);
q->info.si_signo = sig;
q->info.si_errno = 0;
q->info.si_code = SI_KERNEL;
q->info.si_pid = 0;
q->info.si_uid = 0;
break;
default:
copy_siginfo(&q->info, info);
break;
}
} else if (!is_si_special(info) && sig >= SIGRTMIN && info->si_code != SI_USER) {
/*
* Queue overflow, abort. We may abort if the
* signal was rt and sent by user using something
* other than kill().
*/
result = TRACE_SIGNAL_OVERFLOW_FAIL;
ret = -EAGAIN;
goto ret;
} else {
/*
* This is a silent loss of information. We still
* send the signal, but the *info bits are lost.
*/
result = TRACE_SIGNAL_LOSE_INFO;
}
out_set:
signalfd_notify(t, sig);
sigaddset(&pending->signal, sig);
/* Let multiprocess signals appear after on-going forks */
if (type > PIDTYPE_TGID) {
struct multiprocess_signals *delayed;
hlist_for_each_entry(delayed, &t->signal->multiprocess, node) {
sigset_t *signal = &delayed->signal;
/* Can't queue both a stop and a continue signal */
if (sig == SIGCONT)
sigdelsetmask(signal, SIG_KERNEL_STOP_MASK);
else if (sig_kernel_stop(sig))
sigdelset(signal, SIGCONT);
sigaddset(signal, sig);
}
}
complete_signal(sig, t, type);
ret:
trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result);
return ret;
}
static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
{
bool ret = false;
switch (siginfo_layout(info->si_signo, info->si_code)) {
case SIL_KILL:
case SIL_CHLD:
case SIL_RT:
ret = true;
break;
case SIL_TIMER:
case SIL_POLL:
case SIL_FAULT:
case SIL_FAULT_TRAPNO:
case SIL_FAULT_MCEERR:
case SIL_FAULT_BNDERR:
case SIL_FAULT_PKUERR:
case SIL_FAULT_PERF_EVENT:
case SIL_SYS:
ret = false;
break;
}
return ret;
}
static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
enum pid_type type)
{
/* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
bool force = false;
if (info == SEND_SIG_NOINFO) {
/* Force if sent from an ancestor pid namespace */
force = !task_pid_nr_ns(current, task_active_pid_ns(t)); } else if (info == SEND_SIG_PRIV) {
/* Don't ignore kernel generated signals */
force = true;
} else if (has_si_pid_and_uid(info)) {
/* SIGKILL and SIGSTOP is special or has ids */
struct user_namespace *t_user_ns;
rcu_read_lock();
t_user_ns = task_cred_xxx(t, user_ns);
if (current_user_ns() != t_user_ns) {
kuid_t uid = make_kuid(current_user_ns(), info->si_uid); info->si_uid = from_kuid_munged(t_user_ns, uid);
}
rcu_read_unlock();
/* A kernel generated signal? */
force = (info->si_code == SI_KERNEL);
/* From an ancestor pid namespace? */
if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
info->si_pid = 0;
force = true;
}
}
return __send_signal(sig, info, t, type, force);
}
static void print_fatal_signal(int signr)
{
struct pt_regs *regs = signal_pt_regs();
pr_info("potentially unexpected fatal signal %d.\n", signr);
#if defined(__i386__) && !defined(__arch_um__)
pr_info("code at %08lx: ", regs->ip);
{
int i;
for (i = 0; i < 16; i++) {
unsigned char insn;
if (get_user(insn, (unsigned char *)(regs->ip + i)))
break;
pr_cont("%02x ", insn);
}
}
pr_cont("\n");
#endif
preempt_disable();
show_regs(regs);
preempt_enable();
}
static int __init setup_print_fatal_signals(char *str)
{
get_option (&str, &print_fatal_signals);
return 1;
}
__setup("print-fatal-signals=", setup_print_fatal_signals);
int
__group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p)
{
return send_signal(sig, info, p, PIDTYPE_TGID);
}
int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p,
enum pid_type type)
{
unsigned long flags;
int ret = -ESRCH;
if (lock_task_sighand(p, &flags)) {
ret = send_signal(sig, info, p, type);
unlock_task_sighand(p, &flags);
}
return ret;
}
enum sig_handler {
HANDLER_CURRENT, /* If reachable use the current handler */
HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */
HANDLER_EXIT, /* Only visible as the process exit code */
};
/*
* Force a signal that the process can't ignore: if necessary
* we unblock the signal and change any SIG_IGN to SIG_DFL.
*
* Note: If we unblock the signal, we always reset it to SIG_DFL,
* since we do not want to have a signal handler that was blocked
* be invoked when user space had explicitly blocked it.
*
* We don't want to have recursive SIGSEGV's etc, for example,
* that is why we also clear SIGNAL_UNKILLABLE.
*/
static int
force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
enum sig_handler handler)
{
unsigned long int flags;
int ret, blocked, ignored;
struct k_sigaction *action;
int sig = info->si_signo;
spin_lock_irqsave(&t->sighand->siglock, flags);
action = &t->sighand->action[sig-1];
ignored = action->sa.sa_handler == SIG_IGN;
blocked = sigismember(&t->blocked, sig);
if (blocked || ignored || (handler != HANDLER_CURRENT)) {
action->sa.sa_handler = SIG_DFL;
if (handler == HANDLER_EXIT)
action->sa.sa_flags |= SA_IMMUTABLE; if (blocked) {
sigdelset(&t->blocked, sig);
recalc_sigpending_and_wake(t);
}
}
/*
* Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
* debugging to leave init killable. But HANDLER_EXIT is always fatal.
*/
if (action->sa.sa_handler == SIG_DFL && (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = send_signal(sig, info, t, PIDTYPE_PID);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
return ret;
}
int force_sig_info(struct kernel_siginfo *info)
{
return force_sig_info_to_task(info, current, HANDLER_CURRENT);
}
/*
* Nuke all other threads in the group.
*/
int zap_other_threads(struct task_struct *p)
{
struct task_struct *t = p;
int count = 0;
p->signal->group_stop_count = 0;
while_each_thread(p, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
count++;
/* Don't bother with already dead threads */
if (t->exit_state)
continue;
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
}
return count;
}
struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
unsigned long *flags)
{
struct sighand_struct *sighand;
rcu_read_lock();
for (;;) {
sighand = rcu_dereference(tsk->sighand);
if (unlikely(sighand == NULL))
break;
/*
* This sighand can be already freed and even reused, but
* we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
* initializes ->siglock: this slab can't go away, it has
* the same object type, ->siglock can't be reinitialized.
*
* We need to ensure that tsk->sighand is still the same
* after we take the lock, we can race with de_thread() or
* __exit_signal(). In the latter case the next iteration
* must see ->sighand == NULL.
*/
spin_lock_irqsave(&sighand->siglock, *flags);
if (likely(sighand == rcu_access_pointer(tsk->sighand)))
break;
spin_unlock_irqrestore(&sighand->siglock, *flags);
}
rcu_read_unlock();
return sighand;
}
#ifdef CONFIG_LOCKDEP
void lockdep_assert_task_sighand_held(struct task_struct *task)
{
struct sighand_struct *sighand;
rcu_read_lock();
sighand = rcu_dereference(task->sighand);
if (sighand)
lockdep_assert_held(&sighand->siglock);
else
WARN_ON_ONCE(1);
rcu_read_unlock();
}
#endif
/*
* send signal info to all the members of a group
*/
int group_send_sig_info(int sig, struct kernel_siginfo *info,
struct task_struct *p, enum pid_type type)
{
int ret;
rcu_read_lock();
ret = check_kill_permission(sig, info, p);
rcu_read_unlock();
if (!ret && sig)
ret = do_send_sig_info(sig, info, p, type);
return ret;
}
/*
* __kill_pgrp_info() sends a signal to a process group: this is what the tty
* control characters do (^C, ^Z etc)
* - the caller must hold at least a readlock on tasklist_lock
*/
int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
struct task_struct *p = NULL;
int retval, success;
success = 0;
retval = -ESRCH;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
success |= !err;
retval = err;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
return success ? 0 : retval;
}
int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
{
int error = -ESRCH;
struct task_struct *p;
for (;;) {
rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (p)
error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
rcu_read_unlock();
if (likely(!p || error != -ESRCH))
return error;
/*
* The task was unhashed in between, try again. If it
* is dead, pid_task() will return NULL, if we race with
* de_thread() it will find the new leader.
*/
}
}
static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
int error;
rcu_read_lock();
error = kill_pid_info(sig, info, find_vpid(pid));
rcu_read_unlock();
return error;
}
static inline bool kill_as_cred_perm(const struct cred *cred,
struct task_struct *target)
{
const struct cred *pcred = __task_cred(target);
return uid_eq(cred->euid, pcred->suid) ||
uid_eq(cred->euid, pcred->uid) ||
uid_eq(cred->uid, pcred->suid) ||
uid_eq(cred->uid, pcred->uid);
}
/*
* The usb asyncio usage of siginfo is wrong. The glibc support
* for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT.
* AKA after the generic fields:
* kernel_pid_t si_pid;
* kernel_uid32_t si_uid;
* sigval_t si_value;
*
* Unfortunately when usb generates SI_ASYNCIO it assumes the layout
* after the generic fields is:
* void __user *si_addr;
*
* This is a practical problem when there is a 64bit big endian kernel
* and a 32bit userspace. As the 32bit address will encoded in the low
* 32bits of the pointer. Those low 32bits will be stored at higher
* address than appear in a 32 bit pointer. So userspace will not
* see the address it was expecting for it's completions.
*
* There is nothing in the encoding that can allow
* copy_siginfo_to_user32 to detect this confusion of formats, so
* handle this by requiring the caller of kill_pid_usb_asyncio to
* notice when this situration takes place and to store the 32bit
* pointer in sival_int, instead of sival_addr of the sigval_t addr
* parameter.
*/
int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
struct pid *pid, const struct cred *cred)
{
struct kernel_siginfo info;
struct task_struct *p;
unsigned long flags;
int ret = -EINVAL;
if (!valid_signal(sig))
return ret;
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = errno;
info.si_code = SI_ASYNCIO;
*((sigval_t *)&info.si_pid) = addr;
rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (!p) {
ret = -ESRCH;
goto out_unlock;
}
if (!kill_as_cred_perm(cred, p)) {
ret = -EPERM;
goto out_unlock;
}
ret = security_task_kill(p, &info, sig, cred);
if (ret)
goto out_unlock;
if (sig) {
if (lock_task_sighand(p, &flags)) {
ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false);
unlock_task_sighand(p, &flags);
} else
ret = -ESRCH;
}
out_unlock:
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio);
/*
* kill_something_info() interprets pid in interesting ways just like kill(2).
*
* POSIX specifies that kill(-1,sig) is unspecified, but what we have
* is probably wrong. Should make it like BSD or SYSV.
*/
static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
int ret;
if (pid > 0)
return kill_proc_info(sig, info, pid);
/* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */
if (pid == INT_MIN)
return -ESRCH;
read_lock(&tasklist_lock);
if (pid != -1) {
ret = __kill_pgrp_info(sig, info,
pid ? find_vpid(-pid) : task_pgrp(current));
} else {
int retval = 0, count = 0;
struct task_struct * p;
for_each_process(p) {
if (task_pid_vnr(p) > 1 &&
!same_thread_group(p, current)) {
int err = group_send_sig_info(sig, info, p,
PIDTYPE_MAX);
++count;
if (err != -EPERM)
retval = err;
}
}
ret = count ? retval : -ESRCH;
}
read_unlock(&tasklist_lock);
return ret;
}
/*
* These are for backward compatibility with the rest of the kernel source.
*/
int send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p)
{
/*
* Make sure legacy kernel users don't send in bad values
* (normal paths check this in check_kill_permission).
*/
if (!valid_signal(sig))
return -EINVAL;
return do_send_sig_info(sig, info, p, PIDTYPE_PID);
}
EXPORT_SYMBOL(send_sig_info);
#define __si_special(priv) \
((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)
int
send_sig(int sig, struct task_struct *p, int priv)
{
return send_sig_info(sig, __si_special(priv), p);
}
EXPORT_SYMBOL(send_sig);
void force_sig(int sig)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_KERNEL;
info.si_pid = 0;
info.si_uid = 0;
force_sig_info(&info);
}
EXPORT_SYMBOL(force_sig);
void force_fatal_sig(int sig)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_KERNEL;
info.si_pid = 0;
info.si_uid = 0;
force_sig_info_to_task(&info, current, HANDLER_SIG_DFL);
}
void force_exit_sig(int sig)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_KERNEL;
info.si_pid = 0;
info.si_uid = 0;
force_sig_info_to_task(&info, current, HANDLER_EXIT);
}
/*
* When things go south during signal handling, we
* will force a SIGSEGV. And if the signal that caused
* the problem was already a SIGSEGV, we'll want to
* make sure we don't even try to deliver the signal..
*/
void force_sigsegv(int sig)
{
if (sig == SIGSEGV)
force_fatal_sig(SIGSEGV);
else
force_sig(SIGSEGV);
}
int force_sig_fault_to_task(int sig, int code, void __user *addr
___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
, struct task_struct *t)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
#ifdef __ia64__
info.si_imm = imm;
info.si_flags = flags;
info.si_isr = isr;
#endif
return force_sig_info_to_task(&info, t, HANDLER_CURRENT);
}
int force_sig_fault(int sig, int code, void __user *addr
___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
{
return force_sig_fault_to_task(sig, code, addr
___ARCH_SI_IA64(imm, flags, isr), current);
}
int send_sig_fault(int sig, int code, void __user *addr
___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
, struct task_struct *t)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
#ifdef __ia64__
info.si_imm = imm;
info.si_flags = flags;
info.si_isr = isr;
#endif
return send_sig_info(info.si_signo, &info, t);
}
int force_sig_mceerr(int code, void __user *addr, short lsb)
{
struct kernel_siginfo info;
WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
clear_siginfo(&info);
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
info.si_addr_lsb = lsb;
return force_sig_info(&info);
}
int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
{
struct kernel_siginfo info;
WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
clear_siginfo(&info);
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
info.si_addr_lsb = lsb;
return send_sig_info(info.si_signo, &info, t);
}
EXPORT_SYMBOL(send_sig_mceerr);
int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = SIGSEGV;
info.si_errno = 0;
info.si_code = SEGV_BNDERR;
info.si_addr = addr;
info.si_lower = lower;
info.si_upper = upper;
return force_sig_info(&info);
}
#ifdef SEGV_PKUERR
int force_sig_pkuerr(void __user *addr, u32 pkey)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = SIGSEGV;
info.si_errno = 0;
info.si_code = SEGV_PKUERR;
info.si_addr = addr;
info.si_pkey = pkey;
return force_sig_info(&info);
}
#endif
int force_sig_perf(void __user *addr, u32 type, u64 sig_data)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = SIGTRAP;
info.si_errno = 0;
info.si_code = TRAP_PERF;
info.si_addr = addr;
info.si_perf_data = sig_data;
info.si_perf_type = type;
return force_sig_info(&info);
}
/**
* force_sig_seccomp - signals the task to allow in-process syscall emulation
* @syscall: syscall number to send to userland
* @reason: filter-supplied reason code to send to userland (via si_errno)
*
* Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
*/
int force_sig_seccomp(int syscall, int reason, bool force_coredump)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = SIGSYS;
info.si_code = SYS_SECCOMP;
info.si_call_addr = (void __user *)KSTK_EIP(current);
info.si_errno = reason;
info.si_arch = syscall_get_arch(current);
info.si_syscall = syscall;
return force_sig_info_to_task(&info, current,
force_coredump ? HANDLER_EXIT : HANDLER_CURRENT);
}
/* For the crazy architectures that include trap information in
* the errno field, instead of an actual errno value.
*/
int force_sig_ptrace_errno_trap(int errno, void __user *addr)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = SIGTRAP;
info.si_errno = errno;
info.si_code = TRAP_HWBKPT;
info.si_addr = addr;
return force_sig_info(&info);
}
/* For the rare architectures that include trap information using
* si_trapno.
*/
int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
info.si_trapno = trapno;
return force_sig_info(&info);
}
/* For the rare architectures that include trap information using
* si_trapno.
*/
int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
struct task_struct *t)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = code;
info.si_addr = addr;
info.si_trapno = trapno;
return send_sig_info(info.si_signo, &info, t);
}
int kill_pgrp(struct pid *pid, int sig, int priv)
{
int ret;
read_lock(&tasklist_lock);
ret = __kill_pgrp_info(sig, __si_special(priv), pid);
read_unlock(&tasklist_lock);
return ret;
}
EXPORT_SYMBOL(kill_pgrp);
int kill_pid(struct pid *pid, int sig, int priv)
{
return kill_pid_info(sig, __si_special(priv), pid);
}
EXPORT_SYMBOL(kill_pid);
/*
* These functions support sending signals using preallocated sigqueue
* structures. This is needed "because realtime applications cannot
* afford to lose notifications of asynchronous events, like timer
* expirations or I/O completions". In the case of POSIX Timers
* we allocate the sigqueue structure from the timer_create. If this
* allocation fails we are able to report the failure to the application
* with an EAGAIN error.
*/
struct sigqueue *sigqueue_alloc(void)
{
return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC);
}
void sigqueue_free(struct sigqueue *q)
{
unsigned long flags;
spinlock_t *lock = ¤t->sighand->siglock;
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
/*
* We must hold ->siglock while testing q->list
* to serialize with collect_signal() or with
* __exit_signal()->flush_sigqueue().
*/
spin_lock_irqsave(lock, flags);
q->flags &= ~SIGQUEUE_PREALLOC;
/*
* If it is queued it will be freed when dequeued,
* like the "regular" sigqueue.
*/
if (!list_empty(&q->list))
q = NULL;
spin_unlock_irqrestore(lock, flags);
if (q)
__sigqueue_free(q);
}
int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
{
int sig = q->info.si_signo;
struct sigpending *pending;
struct task_struct *t;
unsigned long flags;
int ret, result;
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
ret = -1;
rcu_read_lock();
t = pid_task(pid, type);
if (!t || !likely(lock_task_sighand(t, &flags)))
goto ret;
ret = 1; /* the signal is ignored */
result = TRACE_SIGNAL_IGNORED;
if (!prepare_signal(sig, t, false))
goto out;
ret = 0;
if (unlikely(!list_empty(&q->list))) {
/*
* If an SI_TIMER entry is already queue just increment
* the overrun count.
*/
BUG_ON(q->info.si_code != SI_TIMER);
q->info.si_overrun++;
result = TRACE_SIGNAL_ALREADY_PENDING;
goto out;
}
q->info.si_overrun = 0;
signalfd_notify(t, sig);
pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
list_add_tail(&q->list, &pending->list);
sigaddset(&pending->signal, sig);
complete_signal(sig, t, type);
result = TRACE_SIGNAL_DELIVERED;
out:
trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result);
unlock_task_sighand(t, &flags);
ret:
rcu_read_unlock();
return ret;
}
static void do_notify_pidfd(struct task_struct *task)
{
struct pid *pid;
WARN_ON(task->exit_state == 0);
pid = task_pid(task);
wake_up_all(&pid->wait_pidfd);
}
/*
* Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
*
* Returns true if our parent ignored us and so we've switched to
* self-reaping.
*/
bool do_notify_parent(struct task_struct *tsk, int sig)
{
struct kernel_siginfo info;
unsigned long flags;
struct sighand_struct *psig;
bool autoreap = false;
u64 utime, stime;
BUG_ON(sig == -1);
/* do_notify_parent_cldstop should have been called instead. */
BUG_ON(task_is_stopped_or_traced(tsk));
BUG_ON(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
/* Wake up all pidfd waiters */
do_notify_pidfd(tsk);
if (sig != SIGCHLD) {
/*
* This is only possible if parent == real_parent.
* Check if it has changed security domain.
*/
if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id))
sig = SIGCHLD;
}
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
/*
* We are under tasklist_lock here so our parent is tied to
* us and cannot change.
*
* task_active_pid_ns will always return the same pid namespace
* until a task passes through release_task.
*
* write_lock() currently calls preempt_disable() which is the
* same as rcu_read_lock(), but according to Oleg, this is not
* correct to rely on this
*/
rcu_read_lock();
info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
task_uid(tsk));
rcu_read_unlock();
task_cputime(tsk, &utime, &stime);
info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime);
info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime);
info.si_status = tsk->exit_code & 0x7f;
if (tsk->exit_code & 0x80)
info.si_code = CLD_DUMPED;
else if (tsk->exit_code & 0x7f)
info.si_code = CLD_KILLED;
else {
info.si_code = CLD_EXITED;
info.si_status = tsk->exit_code >> 8;
}
psig = tsk->parent->sighand;
spin_lock_irqsave(&psig->siglock, flags);
if (!tsk->ptrace && sig == SIGCHLD &&
(psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
(psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
/*
* We are exiting and our parent doesn't care. POSIX.1
* defines special semantics for setting SIGCHLD to SIG_IGN
* or setting the SA_NOCLDWAIT flag: we should be reaped
* automatically and not left for our parent's wait4 call.
* Rather than having the parent do it as a magic kind of
* signal handler, we just set this to tell do_exit that we
* can be cleaned up without becoming a zombie. Note that
* we still call __wake_up_parent in this case, because a
* blocked sys_wait4 might now return -ECHILD.
*
* Whether we send SIGCHLD or not for SA_NOCLDWAIT
* is implementation-defined: we do (if you don't want
* it, just use SIG_IGN instead).
*/
autoreap = true;
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
sig = 0;
}
/*
* Send with __send_signal as si_pid and si_uid are in the
* parent's namespaces.
*/
if (valid_signal(sig) && sig)
__send_signal(sig, &info, tsk->parent, PIDTYPE_TGID, false);
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
return autoreap;
}
/**
* do_notify_parent_cldstop - notify parent of stopped/continued state change
* @tsk: task reporting the state change
* @for_ptracer: the notification is for ptracer
* @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
*
* Notify @tsk's parent that the stopped/continued state has changed. If
* @for_ptracer is %false, @tsk's group leader notifies to its real parent.
* If %true, @tsk reports to @tsk->parent which should be the ptracer.
*
* CONTEXT:
* Must be called with tasklist_lock at least read locked.
*/
static void do_notify_parent_cldstop(struct task_struct *tsk,
bool for_ptracer, int why)
{
struct kernel_siginfo info;
unsigned long flags;
struct task_struct *parent;
struct sighand_struct *sighand;
u64 utime, stime;
if (for_ptracer) {
parent = tsk->parent;
} else {
tsk = tsk->group_leader;
parent = tsk->real_parent;
}
clear_siginfo(&info);
info.si_signo = SIGCHLD;
info.si_errno = 0;
/*
* see comment in do_notify_parent() about the following 4 lines
*/
rcu_read_lock();
info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
rcu_read_unlock();
task_cputime(tsk, &utime, &stime);
info.si_utime = nsec_to_clock_t(utime);
info.si_stime = nsec_to_clock_t(stime);
info.si_code = why;
switch (why) {
case CLD_CONTINUED:
info.si_status = SIGCONT;
break;
case CLD_STOPPED:
info.si_status = tsk->signal->group_exit_code & 0x7f;
break;
case CLD_TRAPPED:
info.si_status = tsk->exit_code & 0x7f;
break;
default:
BUG();
}
sighand = parent->sighand;
spin_lock_irqsave(&sighand->siglock, flags);
if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
!(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
__group_send_sig_info(SIGCHLD, &info, parent);
/*
* Even if SIGCHLD is not generated, we must wake up wait4 calls.
*/
__wake_up_parent(tsk, parent);
spin_unlock_irqrestore(&sighand->siglock, flags);
}
static inline bool may_ptrace_stop(void)
{
if (!likely(current->ptrace))
return false;
/*
* Are we in the middle of do_coredump?
* If so and our tracer is also part of the coredump stopping
* is a deadlock situation, and pointless because our tracer
* is dead so don't allow us to stop.
* If SIGKILL was already sent before the caller unlocked
* ->siglock we must see ->core_state != NULL. Otherwise it
* is safe to enter schedule().
*
* This is almost outdated, a task with the pending SIGKILL can't
* block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
* after SIGKILL was already dequeued.
*/
if (unlikely(current->mm->core_state) &&
unlikely(current->mm == current->parent->mm))
return false;
return true;
}
/*
* This must be called with current->sighand->siglock held.
*
* This should be the path for all ptrace stops.
* We always set current->last_siginfo while stopped here.
* That makes it a way to test a stopped process for
* being ptrace-stopped vs being job-control-stopped.
*
* If we actually decide not to stop at all because the tracer
* is gone, we keep current->exit_code unless clear_code.
*/
static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t *info)
__releases(¤t->sighand->siglock)
__acquires(¤t->sighand->siglock)
{
bool gstop_done = false;
if (arch_ptrace_stop_needed(exit_code, info)) {
/*
* The arch code has something special to do before a
* ptrace stop. This is allowed to block, e.g. for faults
* on user stack pages. We can't keep the siglock while
* calling arch_ptrace_stop, so we must release it now.
* To preserve proper semantics, we must do this before
* any signal bookkeeping like checking group_stop_count.
*/
spin_unlock_irq(¤t->sighand->siglock);
arch_ptrace_stop(exit_code, info);
spin_lock_irq(¤t->sighand->siglock);
}
/*
* schedule() will not sleep if there is a pending signal that
* can awaken the task.
*/
set_special_state(TASK_TRACED);
/*
* We're committing to trapping. TRACED should be visible before
* TRAPPING is cleared; otherwise, the tracer might fail do_wait().
* Also, transition to TRACED and updates to ->jobctl should be
* atomic with respect to siglock and should be done after the arch
* hook as siglock is released and regrabbed across it.
*
* TRACER TRACEE
*
* ptrace_attach()
* [L] wait_on_bit(JOBCTL_TRAPPING) [S] set_special_state(TRACED)
* do_wait()
* set_current_state() smp_wmb();
* ptrace_do_wait()
* wait_task_stopped()
* task_stopped_code()
* [L] task_is_traced() [S] task_clear_jobctl_trapping();
*/
smp_wmb();
current->last_siginfo = info;
current->exit_code = exit_code;
/*
* If @why is CLD_STOPPED, we're trapping to participate in a group
* stop. Do the bookkeeping. Note that if SIGCONT was delievered
* across siglock relocks since INTERRUPT was scheduled, PENDING
* could be clear now. We act as if SIGCONT is received after
* TASK_TRACED is entered - ignore it.
*/
if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
gstop_done = task_participate_group_stop(current);
/* any trap clears pending STOP trap, STOP trap clears NOTIFY */
task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
/* entering a trap, clear TRAPPING */
task_clear_jobctl_trapping(current);
spin_unlock_irq(¤t->sighand->siglock);
read_lock(&tasklist_lock);
if (may_ptrace_stop()) {
/*
* Notify parents of the stop.
*
* While ptraced, there are two parents - the ptracer and
* the real_parent of the group_leader. The ptracer should
* know about every stop while the real parent is only
* interested in the completion of group stop. The states
* for the two don't interact with each other. Notify
* separately unless they're gonna be duplicates.
*/
do_notify_parent_cldstop(current, true, why);
if (gstop_done && ptrace_reparented(current))
do_notify_parent_cldstop(current, false, why);
/*
* Don't want to allow preemption here, because
* sys_ptrace() needs this task to be inactive.
*
* XXX: implement read_unlock_no_resched().
*/
preempt_disable();
read_unlock(&tasklist_lock);
cgroup_enter_frozen();
preempt_enable_no_resched();
freezable_schedule();
cgroup_leave_frozen(true);
} else {
/*
* By the time we got the lock, our tracer went away.
* Don't drop the lock yet, another tracer may come.
*
* If @gstop_done, the ptracer went away between group stop
* completion and here. During detach, it would have set
* JOBCTL_STOP_PENDING on us and we'll re-enter
* TASK_STOPPED in do_signal_stop() on return, so notifying
* the real parent of the group stop completion is enough.
*/
if (gstop_done)
do_notify_parent_cldstop(current, false, why);
/* tasklist protects us from ptrace_freeze_traced() */
__set_current_state(TASK_RUNNING);
if (clear_code)
current->exit_code = 0;
read_unlock(&tasklist_lock);
}
/*
* We are back. Now reacquire the siglock before touching
* last_siginfo, so that we are sure to have synchronized with
* any signal-sending on another CPU that wants to examine it.
*/
spin_lock_irq(¤t->sighand->siglock);
current->last_siginfo = NULL;
/* LISTENING can be set only during STOP traps, clear it */
current->jobctl &= ~JOBCTL_LISTENING;
/*
* Queued signals ignored us while we were stopped for tracing.
* So check for any that we should take before resuming user mode.
* This sets TIF_SIGPENDING, but never clears it.
*/
recalc_sigpending_tsk(current);
}
static void ptrace_do_notify(int signr, int exit_code, int why)
{
kernel_siginfo_t info;
clear_siginfo(&info);
info.si_signo = signr;
info.si_code = exit_code;
info.si_pid = task_pid_vnr(current);
info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
/* Let the debugger run. */
ptrace_stop(exit_code, why, 1, &info);
}
void ptrace_notify(int exit_code)
{
BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
if (unlikely(current->task_works))
task_work_run();
spin_lock_irq(¤t->sighand->siglock);
ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
spin_unlock_irq(¤t->sighand->siglock);
}
/**
* do_signal_stop - handle group stop for SIGSTOP and other stop signals
* @signr: signr causing group stop if initiating
*
* If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
* and participate in it. If already set, participate in the existing
* group stop. If participated in a group stop (and thus slept), %true is
* returned with siglock released.
*
* If ptraced, this function doesn't handle stop itself. Instead,
* %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
* untouched. The caller must ensure that INTERRUPT trap handling takes
* places afterwards.
*
* CONTEXT:
* Must be called with @current->sighand->siglock held, which is released
* on %true return.
*
* RETURNS:
* %false if group stop is already cancelled or ptrace trap is scheduled.
* %true if participated in group stop.
*/
static bool do_signal_stop(int signr)
__releases(¤t->sighand->siglock)
{
struct signal_struct *sig = current->signal;
if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
struct task_struct *t;
/* signr will be recorded in task->jobctl for retries */
WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
unlikely(signal_group_exit(sig)))
return false;
/*
* There is no group stop already in progress. We must
* initiate one now.
*
* While ptraced, a task may be resumed while group stop is
* still in effect and then receive a stop signal and
* initiate another group stop. This deviates from the
* usual behavior as two consecutive stop signals can't
* cause two group stops when !ptraced. That is why we
* also check !task_is_stopped(t) below.
*
* The condition can be distinguished by testing whether
* SIGNAL_STOP_STOPPED is already set. Don't generate
* group_exit_code in such case.
*
* This is not necessary for SIGNAL_STOP_CONTINUED because
* an intervening stop signal is required to cause two
* continued events regardless of ptrace.
*/
if (!(sig->flags & SIGNAL_STOP_STOPPED))
sig->group_exit_code = signr;
sig->group_stop_count = 0;
if (task_set_jobctl_pending(current, signr | gstop))
sig->group_stop_count++;
t = current;
while_each_thread(current, t) {
/*
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held,
* so this check has no races.
*/
if (!task_is_stopped(t) &&
task_set_jobctl_pending(t, signr | gstop)) {
sig->group_stop_count++;
if (likely(!(t->ptrace & PT_SEIZED)))
signal_wake_up(t, 0);
else
ptrace_trap_notify(t);
}
}
}
if (likely(!current->ptrace)) {
int notify = 0;
/*
* If there are no other threads in the group, or if there
* is a group stop in progress and we are the last to stop,
* report to the parent.
*/
if (task_participate_group_stop(current))
notify = CLD_STOPPED;
set_special_state(TASK_STOPPED);
spin_unlock_irq(¤t->sighand->siglock);
/*
* Notify the parent of the group stop completion. Because
* we're not holding either the siglock or tasklist_lock
* here, ptracer may attach inbetween; however, this is for
* group stop and should always be delivered to the real
* parent of the group leader. The new ptracer will get
* its notification when this task transitions into
* TASK_TRACED.
*/
if (notify) {
read_lock(&tasklist_lock);
do_notify_parent_cldstop(current, false, notify);
read_unlock(&tasklist_lock);
}
/* Now we don't run again until woken by SIGCONT or SIGKILL */
cgroup_enter_frozen();
freezable_schedule();
return true;
} else {
/*
* While ptraced, group stop is handled by STOP trap.
* Schedule it and let the caller deal with it.
*/
task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
return false;
}
}
/**
* do_jobctl_trap - take care of ptrace jobctl traps
*
* When PT_SEIZED, it's used for both group stop and explicit
* SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with
* accompanying siginfo. If stopped, lower eight bits of exit_code contain
* the stop signal; otherwise, %SIGTRAP.
*
* When !PT_SEIZED, it's used only for group stop trap with stop signal
* number as exit_code and no siginfo.
*
* CONTEXT:
* Must be called with @current->sighand->siglock held, which may be
* released and re-acquired before returning with intervening sleep.
*/
static void do_jobctl_trap(void)
{
struct signal_struct *signal = current->signal;
int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
if (current->ptrace & PT_SEIZED) {
if (!signal->group_stop_count &&
!(signal->flags & SIGNAL_STOP_STOPPED))
signr = SIGTRAP;
WARN_ON_ONCE(!signr); ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
CLD_STOPPED);
} else {
WARN_ON_ONCE(!signr); ptrace_stop(signr, CLD_STOPPED, 0, NULL);
current->exit_code = 0;
}
}
/**
* do_freezer_trap - handle the freezer jobctl trap
*
* Puts the task into frozen state, if only the task is not about to quit.
* In this case it drops JOBCTL_TRAP_FREEZE.
*
* CONTEXT:
* Must be called with @current->sighand->siglock held,
* which is always released before returning.
*/
static void do_freezer_trap(void)
__releases(¤t->sighand->siglock)
{
/*
* If there are other trap bits pending except JOBCTL_TRAP_FREEZE,
* let's make another loop to give it a chance to be handled.
* In any case, we'll return back.
*/
if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) !=
JOBCTL_TRAP_FREEZE) {
spin_unlock_irq(¤t->sighand->siglock);
return;
}
/*
* Now we're sure that there is no pending fatal signal and no
* pending traps. Clear TIF_SIGPENDING to not get out of schedule()
* immediately (if there is a non-fatal signal pending), and
* put the task into sleep.
*/
__set_current_state(TASK_INTERRUPTIBLE);
clear_thread_flag(TIF_SIGPENDING);
spin_unlock_irq(¤t->sighand->siglock);
cgroup_enter_frozen();
freezable_schedule();
}
static int ptrace_signal(int signr, kernel_siginfo_t *info)
{
/*
* We do not check sig_kernel_stop(signr) but set this marker
* unconditionally because we do not know whether debugger will
* change signr. This flag has no meaning unless we are going
* to stop after return from ptrace_stop(). In this case it will
* be checked in do_signal_stop(), we should only stop if it was
* not cleared by SIGCONT while we were sleeping. See also the
* comment in dequeue_signal().
*/
current->jobctl |= JOBCTL_STOP_DEQUEUED;
ptrace_stop(signr, CLD_TRAPPED, 0, info);
/* We're back. Did the debugger cancel the sig? */
signr = current->exit_code;
if (signr == 0)
return signr;
current->exit_code = 0;
/*
* Update the siginfo structure if the signal has
* changed. If the debugger wanted something
* specific in the siginfo structure then it should
* have updated *info via PTRACE_SETSIGINFO.
*/
if (signr != info->si_signo) {
clear_siginfo(info);
info->si_signo = signr;
info->si_errno = 0;
info->si_code = SI_USER;
rcu_read_lock();
info->si_pid = task_pid_vnr(current->parent);
info->si_uid = from_kuid_munged(current_user_ns(),
task_uid(current->parent));
rcu_read_unlock();
}
/* If the (new) signal is now blocked, requeue it. */
if (sigismember(¤t->blocked, signr)) {
send_signal(signr, info, current, PIDTYPE_PID);
signr = 0;
}
return signr;
}
static void hide_si_addr_tag_bits(struct ksignal *ksig)
{
switch (siginfo_layout(ksig->sig, ksig->info.si_code)) {
case SIL_FAULT:
case SIL_FAULT_TRAPNO:
case SIL_FAULT_MCEERR:
case SIL_FAULT_BNDERR:
case SIL_FAULT_PKUERR:
case SIL_FAULT_PERF_EVENT:
ksig->info.si_addr = arch_untagged_si_addr(
ksig->info.si_addr, ksig->sig, ksig->info.si_code);
break;
case SIL_KILL:
case SIL_TIMER:
case SIL_POLL:
case SIL_CHLD:
case SIL_RT:
case SIL_SYS:
break;
}
}
bool get_signal(struct ksignal *ksig)
{
struct sighand_struct *sighand = current->sighand;
struct signal_struct *signal = current->signal;
int signr;
if (unlikely(current->task_works))
task_work_run();
/*
* For non-generic architectures, check for TIF_NOTIFY_SIGNAL so
* that the arch handlers don't all have to do it. If we get here
* without TIF_SIGPENDING, just exit after running signal work.
*/
if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
if (test_thread_flag(TIF_NOTIFY_SIGNAL))
tracehook_notify_signal();
if (!task_sigpending(current))
return false;
}
if (unlikely(uprobe_deny_signal()))
return false;
/*
* Do this once, we can't return to user-mode if freezing() == T.
* do_signal_stop() and ptrace_stop() do freezable_schedule() and
* thus do not need another check after return.
*/
try_to_freeze();
relock:
spin_lock_irq(&sighand->siglock);
/*
* Every stopped thread goes here after wakeup. Check to see if
* we should notify the parent, prepare_signal(SIGCONT) encodes
* the CLD_ si_code into SIGNAL_CLD_MASK bits.
*/
if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
int why;
if (signal->flags & SIGNAL_CLD_CONTINUED)
why = CLD_CONTINUED;
else
why = CLD_STOPPED;
signal->flags &= ~SIGNAL_CLD_MASK;
spin_unlock_irq(&sighand->siglock);
/*
* Notify the parent that we're continuing. This event is
* always per-process and doesn't make whole lot of sense
* for ptracers, who shouldn't consume the state via
* wait(2) either, but, for backward compatibility, notify
* the ptracer of the group leader too unless it's gonna be
* a duplicate.
*/
read_lock(&tasklist_lock);
do_notify_parent_cldstop(current, false, why);
if (ptrace_reparented(current->group_leader))
do_notify_parent_cldstop(current->group_leader,
true, why);
read_unlock(&tasklist_lock);
goto relock;
}
for (;;) {
struct k_sigaction *ka;
/* Has this task already been marked for death? */
if (signal_group_exit(signal)) { ksig->info.si_signo = signr = SIGKILL;
sigdelset(¤t->pending.signal, SIGKILL);
trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
&sighand->action[SIGKILL - 1]);
recalc_sigpending();
goto fatal;
}
if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
do_signal_stop(0))
goto relock;
if (unlikely(current->jobctl &
(JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) {
if (current->jobctl & JOBCTL_TRAP_MASK) {
do_jobctl_trap();
spin_unlock_irq(&sighand->siglock);
} else if (current->jobctl & JOBCTL_TRAP_FREEZE)
do_freezer_trap();
goto relock;
}
/*
* If the task is leaving the frozen state, let's update
* cgroup counters and reset the frozen bit.
*/
if (unlikely(cgroup_task_frozen(current))) {
spin_unlock_irq(&sighand->siglock);
cgroup_leave_frozen(false);
goto relock;
}
/*
* Signals generated by the execution of an instruction
* need to be delivered before any other pending signals
* so that the instruction pointer in the signal stack
* frame points to the faulting instruction.
*/
signr = dequeue_synchronous_signal(&ksig->info); if (!signr)
signr = dequeue_signal(current, ¤t->blocked, &ksig->info);
if (!signr)
break; /* will return 0 */
if (unlikely(current->ptrace) && (signr != SIGKILL) && !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) {
signr = ptrace_signal(signr, &ksig->info);
if (!signr)
continue;
}
ka = &sighand->action[signr-1];
/* Trace actually delivered signals. */
trace_signal_deliver(signr, &ksig->info, ka);
if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
continue;
if (ka->sa.sa_handler != SIG_DFL) {
/* Run the handler. */
ksig->ka = *ka;
if (ka->sa.sa_flags & SA_ONESHOT)
ka->sa.sa_handler = SIG_DFL;
break; /* will return non-zero "signr" value */
}
/*
* Now we are doing the default action for this signal.
*/
if (sig_kernel_ignore(signr)) /* Default is nothing. */
continue;
/*
* Global init gets no signals it doesn't want.
* Container-init gets no signals it doesn't want from same
* container.
*
* Note that if global/container-init sees a sig_kernel_only()
* signal here, the signal must have been generated internally
* or must have come from an ancestor namespace. In either
* case, the signal cannot be dropped.
*/
if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && !sig_kernel_only(signr))
continue;
if (sig_kernel_stop(signr)) {
/*
* The default action is to stop all threads in
* the thread group. The job control signals
* do nothing in an orphaned pgrp, but SIGSTOP
* always works. Note that siglock needs to be
* dropped during the call to is_orphaned_pgrp()
* because of lock ordering with tasklist_lock.
* This allows an intervening SIGCONT to be posted.
* We need to check for that and bail out if necessary.
*/
if (signr != SIGSTOP) {
spin_unlock_irq(&sighand->siglock);
/* signals can be posted during this window */
if (is_current_pgrp_orphaned())
goto relock;
spin_lock_irq(&sighand->siglock);
}
if (likely(do_signal_stop(ksig->info.si_signo))) {
/* It released the siglock. */
goto relock;
}
/*
* We didn't actually stop, due to a race
* with SIGCONT or something like that.
*/
continue;
}
fatal:
spin_unlock_irq(&sighand->siglock);
if (unlikely(cgroup_task_frozen(current)))
cgroup_leave_frozen(true);
/*
* Anything else is fatal, maybe with a core dump.
*/
current->flags |= PF_SIGNALED;
if (sig_kernel_coredump(signr)) { if (print_fatal_signals)
print_fatal_signal(ksig->info.si_signo);
proc_coredump_connector(current);
/*
* If it was able to dump core, this kills all
* other threads in the group and synchronizes with
* their demise. If we lost the race with another
* thread getting here, it set group_exit_code
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
do_coredump(&ksig->info);
}
/*
* PF_IO_WORKER threads will catch and exit on fatal signals
* themselves. They have cleanup that must be performed, so
* we cannot call do_exit() on their behalf.
*/
if (current->flags & PF_IO_WORKER)
goto out;
/*
* Death signals, no core dump.
*/
do_group_exit(ksig->info.si_signo);
/* NOTREACHED */
}
spin_unlock_irq(&sighand->siglock);
out:
ksig->sig = signr;
if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
hide_si_addr_tag_bits(ksig);
return ksig->sig > 0;
}
/**
* signal_delivered -
* @ksig: kernel signal struct
* @stepping: nonzero if debugger single-step or block-step in use
*
* This function should be called when a signal has successfully been
* delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
* is always blocked, and the signal itself is blocked unless %SA_NODEFER
* is set in @ksig->ka.sa.sa_flags. Tracing is notified.
*/
static void signal_delivered(struct ksignal *ksig, int stepping)
{
sigset_t blocked;
/* A signal was successfully delivered, and the
saved sigmask was stored on the signal frame,
and will be restored by sigreturn. So we can
simply clear the restore sigmask flag. */
clear_restore_sigmask();
sigorsets(&blocked, ¤t->blocked, &ksig->ka.sa.sa_mask);
if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
sigaddset(&blocked, ksig->sig);
set_current_blocked(&blocked);
if (current->sas_ss_flags & SS_AUTODISARM)
sas_ss_reset(current);
tracehook_signal_handler(stepping);
}
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
{
if (failed) force_sigsegv(ksig->sig);
else
signal_delivered(ksig, stepping);
}
/*
* It could be that complete_signal() picked us to notify about the
* group-wide signal. Other threads should be notified now to take
* the shared signals in @which since we will not.
*/
static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
{
sigset_t retarget;
struct task_struct *t;
sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
if (sigisemptyset(&retarget))
return;
t = tsk;
while_each_thread(tsk, t) {
if (t->flags & PF_EXITING)
continue;
if (!has_pending_signals(&retarget, &t->blocked))
continue;
/* Remove the signals this thread can handle. */
sigandsets(&retarget, &retarget, &t->blocked);
if (!task_sigpending(t))
signal_wake_up(t, 0);
if (sigisemptyset(&retarget))
break;
}
}
void exit_signals(struct task_struct *tsk)
{
int group_stop = 0;
sigset_t unblocked;
/*
* @tsk is about to have PF_EXITING set - lock out users which
* expect stable threadgroup.
*/
cgroup_threadgroup_change_begin(tsk);
if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
return;
}
spin_lock_irq(&tsk->sighand->siglock);
/*
* From now this task is not visible for group-wide signals,
* see wants_signal(), do_signal_stop().
*/
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
if (!task_sigpending(tsk))
goto out;
unblocked = tsk->blocked;
signotset(&unblocked);
retarget_shared_pending(tsk, &unblocked);
if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
task_participate_group_stop(tsk))
group_stop = CLD_STOPPED;
out:
spin_unlock_irq(&tsk->sighand->siglock);
/*
* If group stop has completed, deliver the notification. This
* should always go to the real parent of the group leader.
*/
if (unlikely(group_stop)) {
read_lock(&tasklist_lock);
do_notify_parent_cldstop(tsk, false, group_stop);
read_unlock(&tasklist_lock);
}
}
/*
* System call entry points.
*/
/**
* sys_restart_syscall - restart a system call
*/
SYSCALL_DEFINE0(restart_syscall)
{
struct restart_block *restart = ¤t->restart_block;
return restart->fn(restart);
}
long do_no_restart_syscall(struct restart_block *param)
{
return -EINTR;
}
static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
{
if (task_sigpending(tsk) && !thread_group_empty(tsk)) {
sigset_t newblocked;
/* A set of now blocked but previously unblocked signals. */
sigandnsets(&newblocked, newset, ¤t->blocked);
retarget_shared_pending(tsk, &newblocked);
}
tsk->blocked = *newset;
recalc_sigpending();
}
/**
* set_current_blocked - change current->blocked mask
* @newset: new mask
*
* It is wrong to change ->blocked directly, this helper should be used
* to ensure the process can't miss a shared signal we are going to block.
*/
void set_current_blocked(sigset_t *newset)
{
sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
__set_current_blocked(newset);
}
void __set_current_blocked(const sigset_t *newset)
{
struct task_struct *tsk = current;
/*
* In case the signal mask hasn't changed, there is nothing we need
* to do. The current->blocked shouldn't be modified by other task.
*/
if (sigequalsets(&tsk->blocked, newset))
return;
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, newset);
spin_unlock_irq(&tsk->sighand->siglock);
}
/*
* This is also useful for kernel threads that want to temporarily
* (or permanently) block certain signals.
*
* NOTE! Unlike the user-mode sys_sigprocmask(), the kernel
* interface happily blocks "unblockable" signals like SIGKILL
* and friends.
*/
int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
{
struct task_struct *tsk = current;
sigset_t newset;
/* Lockless, only current can change ->blocked, never from irq */
if (oldset)
*oldset = tsk->blocked;
switch (how) {
case SIG_BLOCK:
sigorsets(&newset, &tsk->blocked, set);
break;
case SIG_UNBLOCK:
sigandnsets(&newset, &tsk->blocked, set);
break;
case SIG_SETMASK:
newset = *set;
break;
default:
return -EINVAL;
}
__set_current_blocked(&newset);
return 0;
}
EXPORT_SYMBOL(sigprocmask);
/*
* The api helps set app-provided sigmasks.
*
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed from userland for the syscalls.
*
* Note that it does set_restore_sigmask() in advance, so it must be always
* paired with restore_saved_sigmask_unless() before return from syscall.
*/
int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
sigset_t kmask;
if (!umask)
return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (copy_from_user(&kmask, umask, sizeof(sigset_t)))
return -EFAULT;
set_restore_sigmask();
current->saved_sigmask = current->blocked;
set_current_blocked(&kmask);
return 0;
}
#ifdef CONFIG_COMPAT
int set_compat_user_sigmask(const compat_sigset_t __user *umask,
size_t sigsetsize)
{
sigset_t kmask;
if (!umask)
return 0;
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
if (get_compat_sigset(&kmask, umask))
return -EFAULT;
set_restore_sigmask();
current->saved_sigmask = current->blocked;
set_current_blocked(&kmask);
return 0;
}
#endif
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
* @nset: stores pending signals
* @oset: previous value of signal mask if non-null
* @sigsetsize: size of sigset_t type
*/
SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
sigset_t __user *, oset, size_t, sigsetsize)
{
sigset_t old_set, new_set;
int error;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
old_set = current->blocked;
if (nset) {
if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
return -EFAULT;
sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
error = sigprocmask(how, &new_set, NULL);
if (error)
return error;
}
if (oset) {
if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
return -EFAULT;
}
return 0;
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
sigset_t old_set = current->blocked;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (nset) {
sigset_t new_set;
int error;
if (get_compat_sigset(&new_set, nset))
return -EFAULT;
sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
error = sigprocmask(how, &new_set, NULL);
if (error)
return error;
}
return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0;
}
#endif
static void do_sigpending(sigset_t *set)
{
spin_lock_irq(¤t->sighand->siglock);
sigorsets(set, ¤t->pending.signal,
¤t->signal->shared_pending.signal);
spin_unlock_irq(¤t->sighand->siglock);
/* Outside the lock because only this thread touches it. */
sigandsets(set, ¤t->blocked, set);
}
/**
* sys_rt_sigpending - examine a pending signal that has been raised
* while blocked
* @uset: stores pending signals
* @sigsetsize: size of sigset_t type or larger
*/
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
sigset_t set;
if (sigsetsize > sizeof(*uset))
return -EINVAL;
do_sigpending(&set);
if (copy_to_user(uset, &set, sigsetsize))
return -EFAULT;
return 0;
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
compat_size_t, sigsetsize)
{
sigset_t set;
if (sigsetsize > sizeof(*uset))
return -EINVAL;
do_sigpending(&set);
return put_compat_sigset(uset, &set, sigsetsize);
}
#endif
static const struct {
unsigned char limit, layout;
} sig_sicodes[] = {
[SIGILL] = { NSIGILL, SIL_FAULT },
[SIGFPE] = { NSIGFPE, SIL_FAULT },
[SIGSEGV] = { NSIGSEGV, SIL_FAULT },
[SIGBUS] = { NSIGBUS, SIL_FAULT },
[SIGTRAP] = { NSIGTRAP, SIL_FAULT },
#if defined(SIGEMT)
[SIGEMT] = { NSIGEMT, SIL_FAULT },
#endif
[SIGCHLD] = { NSIGCHLD, SIL_CHLD },
[SIGPOLL] = { NSIGPOLL, SIL_POLL },
[SIGSYS] = { NSIGSYS, SIL_SYS },
};
static bool known_siginfo_layout(unsigned sig, int si_code)
{
if (si_code == SI_KERNEL)
return true;
else if ((si_code > SI_USER)) {
if (sig_specific_sicodes(sig)) {
if (si_code <= sig_sicodes[sig].limit)
return true;
}
else if (si_code <= NSIGPOLL)
return true;
}
else if (si_code >= SI_DETHREAD)
return true;
else if (si_code == SI_ASYNCNL)
return true;
return false;
}
enum siginfo_layout siginfo_layout(unsigned sig, int si_code)
{
enum siginfo_layout layout = SIL_KILL;
if ((si_code > SI_USER) && (si_code < SI_KERNEL)) { if ((sig < ARRAY_SIZE(sig_sicodes)) && (si_code <= sig_sicodes[sig].limit)) { layout = sig_sicodes[sig].layout;
/* Handle the exceptions */
if ((sig == SIGBUS) &&
(si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO))
layout = SIL_FAULT_MCEERR;
else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR))
layout = SIL_FAULT_BNDERR;
#ifdef SEGV_PKUERR
else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR))
layout = SIL_FAULT_PKUERR;
#endif
else if ((sig == SIGTRAP) && (si_code == TRAP_PERF))
layout = SIL_FAULT_PERF_EVENT;
else if (IS_ENABLED(CONFIG_SPARC) &&
(sig == SIGILL) && (si_code == ILL_ILLTRP))
layout = SIL_FAULT_TRAPNO;
else if (IS_ENABLED(CONFIG_ALPHA) &&
((sig == SIGFPE) ||
((sig == SIGTRAP) && (si_code == TRAP_UNK))))
layout = SIL_FAULT_TRAPNO;
}
else if (si_code <= NSIGPOLL)
layout = SIL_POLL;
} else {
if (si_code == SI_TIMER)
layout = SIL_TIMER;
else if (si_code == SI_SIGIO)
layout = SIL_POLL;
else if (si_code < 0)
layout = SIL_RT;
}
return layout;
}
static inline char __user *si_expansion(const siginfo_t __user *info)
{
return ((char __user *)info) + sizeof(struct kernel_siginfo);
}
int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from)
{
char __user *expansion = si_expansion(to);
if (copy_to_user(to, from , sizeof(struct kernel_siginfo)))
return -EFAULT;
if (clear_user(expansion, SI_EXPANSION_SIZE))
return -EFAULT;
return 0;
}
static int post_copy_siginfo_from_user(kernel_siginfo_t *info,
const siginfo_t __user *from)
{
if (unlikely(!known_siginfo_layout(info->si_signo, info->si_code))) {
char __user *expansion = si_expansion(from);
char buf[SI_EXPANSION_SIZE];
int i;
/*
* An unknown si_code might need more than
* sizeof(struct kernel_siginfo) bytes. Verify all of the
* extra bytes are 0. This guarantees copy_siginfo_to_user
* will return this data to userspace exactly.
*/
if (copy_from_user(&buf, expansion, SI_EXPANSION_SIZE))
return -EFAULT;
for (i = 0; i < SI_EXPANSION_SIZE; i++) {
if (buf[i] != 0)
return -E2BIG;
}
}
return 0;
}
static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to,
const siginfo_t __user *from)
{
if (copy_from_user(to, from, sizeof(struct kernel_siginfo)))
return -EFAULT;
to->si_signo = signo;
return post_copy_siginfo_from_user(to, from);
}
int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from)
{
if (copy_from_user(to, from, sizeof(struct kernel_siginfo)))
return -EFAULT;
return post_copy_siginfo_from_user(to, from);
}
#ifdef CONFIG_COMPAT
/**
* copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo
* @to: compat siginfo destination
* @from: kernel siginfo source
*
* Note: This function does not work properly for the SIGCHLD on x32, but
* fortunately it doesn't have to. The only valid callers for this function are
* copy_siginfo_to_user32, which is overriden for x32 and the coredump code.
* The latter does not care because SIGCHLD will never cause a coredump.
*/
void copy_siginfo_to_external32(struct compat_siginfo *to,
const struct kernel_siginfo *from)
{
memset(to, 0, sizeof(*to));
to->si_signo = from->si_signo;
to->si_errno = from->si_errno;
to->si_code = from->si_code;
switch(siginfo_layout(from->si_signo, from->si_code)) {
case SIL_KILL:
to->si_pid = from->si_pid;
to->si_uid = from->si_uid;
break;
case SIL_TIMER:
to->si_tid = from->si_tid;
to->si_overrun = from->si_overrun;
to->si_int = from->si_int;
break;
case SIL_POLL:
to->si_band = from->si_band;
to->si_fd = from->si_fd;
break;
case SIL_FAULT:
to->si_addr = ptr_to_compat(from->si_addr);
break;
case SIL_FAULT_TRAPNO:
to->si_addr = ptr_to_compat(from->si_addr);
to->si_trapno = from->si_trapno;
break;
case SIL_FAULT_MCEERR:
to->si_addr = ptr_to_compat(from->si_addr);
to->si_addr_lsb = from->si_addr_lsb;
break;
case SIL_FAULT_BNDERR:
to->si_addr = ptr_to_compat(from->si_addr);
to->si_lower = ptr_to_compat(from->si_lower);
to->si_upper = ptr_to_compat(from->si_upper);
break;
case SIL_FAULT_PKUERR:
to->si_addr = ptr_to_compat(from->si_addr);
to->si_pkey = from->si_pkey;
break;
case SIL_FAULT_PERF_EVENT:
to->si_addr = ptr_to_compat(from->si_addr);
to->si_perf_data = from->si_perf_data;
to->si_perf_type = from->si_perf_type;
break;
case SIL_CHLD:
to->si_pid = from->si_pid;
to->si_uid = from->si_uid;
to->si_status = from->si_status;
to->si_utime = from->si_utime;
to->si_stime = from->si_stime;
break;
case SIL_RT:
to->si_pid = from->si_pid;
to->si_uid = from->si_uid;
to->si_int = from->si_int;
break;
case SIL_SYS:
to->si_call_addr = ptr_to_compat(from->si_call_addr);
to->si_syscall = from->si_syscall;
to->si_arch = from->si_arch;
break;
}
}
int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
const struct kernel_siginfo *from)
{
struct compat_siginfo new;
copy_siginfo_to_external32(&new, from);
if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
return -EFAULT;
return 0;
}
static int post_copy_siginfo_from_user32(kernel_siginfo_t *to,
const struct compat_siginfo *from)
{
clear_siginfo(to);
to->si_signo = from->si_signo;
to->si_errno = from->si_errno;
to->si_code = from->si_code;
switch(siginfo_layout(from->si_signo, from->si_code)) {
case SIL_KILL:
to->si_pid = from->si_pid;
to->si_uid = from->si_uid;
break;
case SIL_TIMER:
to->si_tid = from->si_tid;
to->si_overrun = from->si_overrun;
to->si_int = from->si_int;
break;
case SIL_POLL:
to->si_band = from->si_band;
to->si_fd = from->si_fd;
break;
case SIL_FAULT:
to->si_addr = compat_ptr(from->si_addr);
break;
case SIL_FAULT_TRAPNO:
to->si_addr = compat_ptr(from->si_addr);
to->si_trapno = from->si_trapno;
break;
case SIL_FAULT_MCEERR:
to->si_addr = compat_ptr(from->si_addr);
to->si_addr_lsb = from->si_addr_lsb;
break;
case SIL_FAULT_BNDERR:
to->si_addr = compat_ptr(from->si_addr);
to->si_lower = compat_ptr(from->si_lower);
to->si_upper = compat_ptr(from->si_upper);
break;
case SIL_FAULT_PKUERR:
to->si_addr = compat_ptr(from->si_addr);
to->si_pkey = from->si_pkey;
break;
case SIL_FAULT_PERF_EVENT:
to->si_addr = compat_ptr(from->si_addr);
to->si_perf_data = from->si_perf_data;
to->si_perf_type = from->si_perf_type;
break;
case SIL_CHLD:
to->si_pid = from->si_pid;
to->si_uid = from->si_uid;
to->si_status = from->si_status;
#ifdef CONFIG_X86_X32_ABI
if (in_x32_syscall()) {
to->si_utime = from->_sifields._sigchld_x32._utime;
to->si_stime = from->_sifields._sigchld_x32._stime;
} else
#endif
{
to->si_utime = from->si_utime;
to->si_stime = from->si_stime;
}
break;
case SIL_RT:
to->si_pid = from->si_pid;
to->si_uid = from->si_uid;
to->si_int = from->si_int;
break;
case SIL_SYS:
to->si_call_addr = compat_ptr(from->si_call_addr);
to->si_syscall = from->si_syscall;
to->si_arch = from->si_arch;
break;
}
return 0;
}
static int __copy_siginfo_from_user32(int signo, struct kernel_siginfo *to,
const struct compat_siginfo __user *ufrom)
{
struct compat_siginfo from;
if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo)))
return -EFAULT;
from.si_signo = signo;
return post_copy_siginfo_from_user32(to, &from);
}
int copy_siginfo_from_user32(struct kernel_siginfo *to,
const struct compat_siginfo __user *ufrom)
{
struct compat_siginfo from;
if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo)))
return -EFAULT;
return post_copy_siginfo_from_user32(to, &from);
}
#endif /* CONFIG_COMPAT */
/**
* do_sigtimedwait - wait for queued signals specified in @which
* @which: queued signals to wait for
* @info: if non-null, the signal's siginfo is returned here
* @ts: upper bound on process time suspension
*/
static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
const struct timespec64 *ts)
{
ktime_t *to = NULL, timeout = KTIME_MAX;
struct task_struct *tsk = current;
sigset_t mask = *which;
int sig, ret = 0;
if (ts) {
if (!timespec64_valid(ts))
return -EINVAL;
timeout = timespec64_to_ktime(*ts);
to = &timeout;
}
/*
* Invert the set of allowed signals to get those we want to block.
*/
sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
signotset(&mask);
spin_lock_irq(&tsk->sighand->siglock);
sig = dequeue_signal(tsk, &mask, info);
if (!sig && timeout) {
/*
* None ready, temporarily unblock those we're interested
* while we are sleeping in so that we'll be awakened when
* they arrive. Unblocking is always fine, we can avoid
* set_current_blocked().
*/
tsk->real_blocked = tsk->blocked;
sigandsets(&tsk->blocked, &tsk->blocked, &mask);
recalc_sigpending();
spin_unlock_irq(&tsk->sighand->siglock);
__set_current_state(TASK_INTERRUPTIBLE);
ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns,
HRTIMER_MODE_REL);
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
sigemptyset(&tsk->real_blocked);
sig = dequeue_signal(tsk, &mask, info);
}
spin_unlock_irq(&tsk->sighand->siglock);
if (sig)
return sig;
return ret ? -EINTR : -EAGAIN;
}
/**
* sys_rt_sigtimedwait - synchronously wait for queued signals specified
* in @uthese
* @uthese: queued signals to wait for
* @uinfo: if non-null, the signal's siginfo is returned here
* @uts: upper bound on process time suspension
* @sigsetsize: size of sigset_t type
*/
SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
siginfo_t __user *, uinfo,
const struct __kernel_timespec __user *, uts,
size_t, sigsetsize)
{
sigset_t these;
struct timespec64 ts;
kernel_siginfo_t info;
int ret;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (copy_from_user(&these, uthese, sizeof(these)))
return -EFAULT;
if (uts) {
if (get_timespec64(&ts, uts))
return -EFAULT;
}
ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
if (ret > 0 && uinfo) {
if (copy_siginfo_to_user(uinfo, &info))
ret = -EFAULT;
}
return ret;
}
#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese,
siginfo_t __user *, uinfo,
const struct old_timespec32 __user *, uts,
size_t, sigsetsize)
{
sigset_t these;
struct timespec64 ts;
kernel_siginfo_t info;
int ret;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (copy_from_user(&these, uthese, sizeof(these)))
return -EFAULT;
if (uts) {
if (get_old_timespec32(&ts, uts))
return -EFAULT;
}
ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
if (ret > 0 && uinfo) {
if (copy_siginfo_to_user(uinfo, &info))
ret = -EFAULT;
}
return ret;
}
#endif
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,
struct compat_siginfo __user *, uinfo,
struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize)
{
sigset_t s;
struct timespec64 t;
kernel_siginfo_t info;
long ret;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (get_compat_sigset(&s, uthese))
return -EFAULT;
if (uts) {
if (get_timespec64(&t, uts))
return -EFAULT;
}
ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
if (ret > 0 && uinfo) {
if (copy_siginfo_to_user32(uinfo, &info))
ret = -EFAULT;
}
return ret;
}
#ifdef CONFIG_COMPAT_32BIT_TIME
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
struct compat_siginfo __user *, uinfo,
struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)
{
sigset_t s;
struct timespec64 t;
kernel_siginfo_t info;
long ret;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (get_compat_sigset(&s, uthese))
return -EFAULT;
if (uts) {
if (get_old_timespec32(&t, uts))
return -EFAULT;
}
ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
if (ret > 0 && uinfo) {
if (copy_siginfo_to_user32(uinfo, &info))
ret = -EFAULT;
}
return ret;
}
#endif
#endif
static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
{
clear_siginfo(info);
info->si_signo = sig;
info->si_errno = 0;
info->si_code = SI_USER;
info->si_pid = task_tgid_vnr(current);
info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
}
/**
* sys_kill - send a signal to a process
* @pid: the PID of the process
* @sig: signal to be sent
*/
SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct kernel_siginfo info;
prepare_kill_siginfo(sig, &info);
return kill_something_info(sig, &info, pid);
}
/*
* Verify that the signaler and signalee either are in the same pid namespace
* or that the signaler's pid namespace is an ancestor of the signalee's pid
* namespace.
*/
static bool access_pidfd_pidns(struct pid *pid)
{
struct pid_namespace *active = task_active_pid_ns(current);
struct pid_namespace *p = ns_of_pid(pid);
for (;;) {
if (!p)
return false;
if (p == active)
break;
p = p->parent;
}
return true;
}
static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo,
siginfo_t __user *info)
{
#ifdef CONFIG_COMPAT
/*
* Avoid hooking up compat syscalls and instead handle necessary
* conversions here. Note, this is a stop-gap measure and should not be
* considered a generic solution.
*/
if (in_compat_syscall())
return copy_siginfo_from_user32(
kinfo, (struct compat_siginfo __user *)info);
#endif
return copy_siginfo_from_user(kinfo, info);
}
static struct pid *pidfd_to_pid(const struct file *file)
{
struct pid *pid;
pid = pidfd_pid(file);
if (!IS_ERR(pid))
return pid;
return tgid_pidfd_to_pid(file);
}
/**
* sys_pidfd_send_signal - Signal a process through a pidfd
* @pidfd: file descriptor of the process
* @sig: signal to send
* @info: signal info
* @flags: future flags
*
* The syscall currently only signals via PIDTYPE_PID which covers
* kill(<positive-pid>, <signal>. It does not signal threads or process
* groups.
* In order to extend the syscall to threads and process groups the @flags
* argument should be used. In essence, the @flags argument will determine
* what is signaled and not the file descriptor itself. Put in other words,
* grouping is a property of the flags argument not a property of the file
* descriptor.
*
* Return: 0 on success, negative errno on failure
*/
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
siginfo_t __user *, info, unsigned int, flags)
{
int ret;
struct fd f;
struct pid *pid;
kernel_siginfo_t kinfo;
/* Enforce flags be set to 0 until we add an extension. */
if (flags)
return -EINVAL;
f = fdget(pidfd);
if (!f.file)
return -EBADF;
/* Is this a pidfd? */
pid = pidfd_to_pid(f.file);
if (IS_ERR(pid)) {
ret = PTR_ERR(pid);
goto err;
}
ret = -EINVAL;
if (!access_pidfd_pidns(pid))
goto err;
if (info) {
ret = copy_siginfo_from_user_any(&kinfo, info);
if (unlikely(ret))
goto err;
ret = -EINVAL;
if (unlikely(sig != kinfo.si_signo))
goto err;
/* Only allow sending arbitrary signals to yourself. */
ret = -EPERM;
if ((task_pid(current) != pid) &&
(kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
goto err;
} else {
prepare_kill_siginfo(sig, &kinfo);
}
ret = kill_pid_info(sig, &kinfo, pid);
err:
fdput(f);
return ret;
}
static int
do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
{
struct task_struct *p;
int error = -ESRCH;
rcu_read_lock();
p = find_task_by_vpid(pid);
if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
error = check_kill_permission(sig, info, p);
/*
* The null signal is a permissions and process existence
* probe. No signal is actually delivered.
*/
if (!error && sig) {
error = do_send_sig_info(sig, info, p, PIDTYPE_PID);
/*
* If lock_task_sighand() failed we pretend the task
* dies after receiving the signal. The window is tiny,
* and the signal is private anyway.
*/
if (unlikely(error == -ESRCH))
error = 0;
}
}
rcu_read_unlock();
return error;
}
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
struct kernel_siginfo info;
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_TKILL;
info.si_pid = task_tgid_vnr(current);
info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
return do_send_specific(tgid, pid, sig, &info);
}
/**
* sys_tgkill - send signal to one specific thread
* @tgid: the thread group ID of the thread
* @pid: the PID of the thread
* @sig: signal to be sent
*
* This syscall also checks the @tgid and returns -ESRCH even if the PID
* exists but it's not belonging to the target process anymore. This
* method solves the problem of threads exiting and PIDs getting reused.
*/
SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
{
/* This is only valid for single tasks */
if (pid <= 0 || tgid <= 0)
return -EINVAL;
return do_tkill(tgid, pid, sig);
}
/**
* sys_tkill - send signal to one specific task
* @pid: the PID of the task
* @sig: signal to be sent
*
* Send a signal to only one task, even if it's a CLONE_THREAD task.
*/
SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
{
/* This is only valid for single tasks */
if (pid <= 0)
return -EINVAL;
return do_tkill(0, pid, sig);
}
static int do_rt_sigqueueinfo(pid_t pid, int sig, kernel_siginfo_t *info)
{
/* Not even root can pretend to send signals from the kernel.
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
(task_pid_vnr(current) != pid))
return -EPERM;
/* POSIX.1b doesn't mention process groups. */
return kill_proc_info(sig, info, pid);
}
/**
* sys_rt_sigqueueinfo - send signal information to a signal
* @pid: the PID of the thread
* @sig: signal to be sent
* @uinfo: signal info to be sent
*/
SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
siginfo_t __user *, uinfo)
{
kernel_siginfo_t info;
int ret = __copy_siginfo_from_user(sig, &info, uinfo);
if (unlikely(ret))
return ret;
return do_rt_sigqueueinfo(pid, sig, &info);
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
compat_pid_t, pid,
int, sig,
struct compat_siginfo __user *, uinfo)
{
kernel_siginfo_t info;
int ret = __copy_siginfo_from_user32(sig, &info, uinfo);
if (unlikely(ret))
return ret;
return do_rt_sigqueueinfo(pid, sig, &info);
}
#endif
static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, kernel_siginfo_t *info)
{
/* This is only valid for single tasks */
if (pid <= 0 || tgid <= 0)
return -EINVAL;
/* Not even root can pretend to send signals from the kernel.
* Nor can they impersonate a kill()/tgkill(), which adds source info.
*/
if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
(task_pid_vnr(current) != pid))
return -EPERM;
return do_send_specific(tgid, pid, sig, info);
}
SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
siginfo_t __user *, uinfo)
{
kernel_siginfo_t info;
int ret = __copy_siginfo_from_user(sig, &info, uinfo);
if (unlikely(ret))
return ret;
return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
compat_pid_t, tgid,
compat_pid_t, pid,
int, sig,
struct compat_siginfo __user *, uinfo)
{
kernel_siginfo_t info;
int ret = __copy_siginfo_from_user32(sig, &info, uinfo);
if (unlikely(ret))
return ret;
return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
#endif
/*
* For kthreads only, must not be used if cloned with CLONE_SIGHAND
*/
void kernel_sigaction(int sig, __sighandler_t action)
{
spin_lock_irq(¤t->sighand->siglock);
current->sighand->action[sig - 1].sa.sa_handler = action;
if (action == SIG_IGN) {
sigset_t mask;
sigemptyset(&mask);
sigaddset(&mask, sig);
flush_sigqueue_mask(&mask, ¤t->signal->shared_pending);
flush_sigqueue_mask(&mask, ¤t->pending);
recalc_sigpending();
}
spin_unlock_irq(¤t->sighand->siglock);
}
EXPORT_SYMBOL(kernel_sigaction);
void __weak sigaction_compat_abi(struct k_sigaction *act,
struct k_sigaction *oact)
{
}
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
struct task_struct *p = current, *t;
struct k_sigaction *k;
sigset_t mask;
if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
return -EINVAL;
k = &p->sighand->action[sig-1];
spin_lock_irq(&p->sighand->siglock);
if (k->sa.sa_flags & SA_IMMUTABLE) {
spin_unlock_irq(&p->sighand->siglock);
return -EINVAL;
}
if (oact)
*oact = *k;
/*
* Make sure that we never accidentally claim to support SA_UNSUPPORTED,
* e.g. by having an architecture use the bit in their uapi.
*/
BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED);
/*
* Clear unknown flag bits in order to allow userspace to detect missing
* support for flag bits and to allow the kernel to use non-uapi bits
* internally.
*/
if (act)
act->sa.sa_flags &= UAPI_SA_FLAGS;
if (oact)
oact->sa.sa_flags &= UAPI_SA_FLAGS;
sigaction_compat_abi(act, oact);
if (act) {
sigdelsetmask(&act->sa.sa_mask,
sigmask(SIGKILL) | sigmask(SIGSTOP));
*k = *act;
/*
* POSIX 3.3.1.3:
* "Setting a signal action to SIG_IGN for a signal that is
* pending shall cause the pending signal to be discarded,
* whether or not it is blocked."
*
* "Setting a signal action to SIG_DFL for a signal that is
* pending and whose default action is to ignore the signal
* (for example, SIGCHLD), shall cause the pending signal to
* be discarded, whether or not it is blocked"
*/
if (sig_handler_ignored(sig_handler(p, sig), sig)) {
sigemptyset(&mask);
sigaddset(&mask, sig);
flush_sigqueue_mask(&mask, &p->signal->shared_pending);
for_each_thread(p, t)
flush_sigqueue_mask(&mask, &t->pending);
}
}
spin_unlock_irq(&p->sighand->siglock);
return 0;
}
static int
do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp,
size_t min_ss_size)
{
struct task_struct *t = current;
if (oss) {
memset(oss, 0, sizeof(stack_t));
oss->ss_sp = (void __user *) t->sas_ss_sp;
oss->ss_size = t->sas_ss_size;
oss->ss_flags = sas_ss_flags(sp) |
(current->sas_ss_flags & SS_FLAG_BITS);
}
if (ss) {
void __user *ss_sp = ss->ss_sp;
size_t ss_size = ss->ss_size;
unsigned ss_flags = ss->ss_flags;
int ss_mode;
if (unlikely(on_sig_stack(sp)))
return -EPERM;
ss_mode = ss_flags & ~SS_FLAG_BITS;
if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
ss_mode != 0))
return -EINVAL;
if (ss_mode == SS_DISABLE) {
ss_size = 0;
ss_sp = NULL;
} else {
if (unlikely(ss_size < min_ss_size))
return -ENOMEM;
}
t->sas_ss_sp = (unsigned long) ss_sp;
t->sas_ss_size = ss_size;
t->sas_ss_flags = ss_flags;
}
return 0;
}
SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
{
stack_t new, old;
int err;
if (uss && copy_from_user(&new, uss, sizeof(stack_t)))
return -EFAULT;
err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL,
current_user_stack_pointer(),
MINSIGSTKSZ);
if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t)))
err = -EFAULT;
return err;
}
int restore_altstack(const stack_t __user *uss)
{
stack_t new;
if (copy_from_user(&new, uss, sizeof(stack_t)))
return -EFAULT;
(void)do_sigaltstack(&new, NULL, current_user_stack_pointer(),
MINSIGSTKSZ);
/* squash all but EFAULT for now */
return 0;
}
int __save_altstack(stack_t __user *uss, unsigned long sp)
{
struct task_struct *t = current;
int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
__put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
return err;
}
#ifdef CONFIG_COMPAT
static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr,
compat_stack_t __user *uoss_ptr)
{
stack_t uss, uoss;
int ret;
if (uss_ptr) {
compat_stack_t uss32;
if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
return -EFAULT;
uss.ss_sp = compat_ptr(uss32.ss_sp);
uss.ss_flags = uss32.ss_flags;
uss.ss_size = uss32.ss_size;
}
ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss,
compat_user_stack_pointer(),
COMPAT_MINSIGSTKSZ);
if (ret >= 0 && uoss_ptr) {
compat_stack_t old;
memset(&old, 0, sizeof(old));
old.ss_sp = ptr_to_compat(uoss.ss_sp);
old.ss_flags = uoss.ss_flags;
old.ss_size = uoss.ss_size;
if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t)))
ret = -EFAULT;
}
return ret;
}
COMPAT_SYSCALL_DEFINE2(sigaltstack,
const compat_stack_t __user *, uss_ptr,
compat_stack_t __user *, uoss_ptr)
{
return do_compat_sigaltstack(uss_ptr, uoss_ptr);
}
int compat_restore_altstack(const compat_stack_t __user *uss)
{
int err = do_compat_sigaltstack(uss, NULL);
/* squash all but -EFAULT for now */
return err == -EFAULT ? err : 0;
}
int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
{
int err;
struct task_struct *t = current;
err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp),
&uss->ss_sp) |
__put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
return err;
}
#endif
#ifdef __ARCH_WANT_SYS_SIGPENDING
/**
* sys_sigpending - examine pending signals
* @uset: where mask of pending signal is returned
*/
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
{
sigset_t set;
if (sizeof(old_sigset_t) > sizeof(*uset))
return -EINVAL;
do_sigpending(&set);
if (copy_to_user(uset, &set, sizeof(old_sigset_t)))
return -EFAULT;
return 0;
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
sigset_t set;
do_sigpending(&set);
return put_user(set.sig[0], set32);
}
#endif
#endif
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/**
* sys_sigprocmask - examine and change blocked signals
* @how: whether to add, remove, or set signals
* @nset: signals to add or remove (if non-null)
* @oset: previous value of signal mask if non-null
*
* Some platforms have their own version with special arguments;
* others support only sys_rt_sigprocmask.
*/
SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
old_sigset_t __user *, oset)
{
old_sigset_t old_set, new_set;
sigset_t new_blocked;
old_set = current->blocked.sig[0];
if (nset) {
if (copy_from_user(&new_set, nset, sizeof(*nset)))
return -EFAULT;
new_blocked = current->blocked;
switch (how) {
case SIG_BLOCK:
sigaddsetmask(&new_blocked, new_set);
break;
case SIG_UNBLOCK:
sigdelsetmask(&new_blocked, new_set);
break;
case SIG_SETMASK:
new_blocked.sig[0] = new_set;
break;
default:
return -EINVAL;
}
set_current_blocked(&new_blocked);
}
if (oset) {
if (copy_to_user(oset, &old_set, sizeof(*oset)))
return -EFAULT;
}
return 0;
}
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
#ifndef CONFIG_ODD_RT_SIGACTION
/**
* sys_rt_sigaction - alter an action taken by a process
* @sig: signal to be sent
* @act: new sigaction
* @oact: used to save the previous sigaction
* @sigsetsize: size of sigset_t type
*/
SYSCALL_DEFINE4(rt_sigaction, int, sig,
const struct sigaction __user *, act,
struct sigaction __user *, oact,
size_t, sigsetsize)
{
struct k_sigaction new_sa, old_sa;
int ret;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
return -EFAULT;
ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
if (ret)
return ret;
if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
return -EFAULT;
return 0;
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
const struct compat_sigaction __user *, act,
struct compat_sigaction __user *, oact,
compat_size_t, sigsetsize)
{
struct k_sigaction new_ka, old_ka;
#ifdef __ARCH_HAS_SA_RESTORER
compat_uptr_t restorer;
#endif
int ret;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
if (act) {
compat_uptr_t handler;
ret = get_user(handler, &act->sa_handler);
new_ka.sa.sa_handler = compat_ptr(handler);
#ifdef __ARCH_HAS_SA_RESTORER
ret |= get_user(restorer, &act->sa_restorer);
new_ka.sa.sa_restorer = compat_ptr(restorer);
#endif
ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
if (ret)
return -EFAULT;
}
ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
if (!ret && oact) {
ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
&oact->sa_handler);
ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
sizeof(oact->sa_mask));
ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
#ifdef __ARCH_HAS_SA_RESTORER
ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
&oact->sa_restorer);
#endif
}
return ret;
}
#endif
#endif /* !CONFIG_ODD_RT_SIGACTION */
#ifdef CONFIG_OLD_SIGACTION
SYSCALL_DEFINE3(sigaction, int, sig,
const struct old_sigaction __user *, act,
struct old_sigaction __user *, oact)
{
struct k_sigaction new_ka, old_ka;
int ret;
if (act) {
old_sigset_t mask;
if (!access_ok(act, sizeof(*act)) ||
__get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
__get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
__get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
__get_user(mask, &act->sa_mask))
return -EFAULT;
#ifdef __ARCH_HAS_KA_RESTORER
new_ka.ka_restorer = NULL;
#endif
siginitset(&new_ka.sa.sa_mask, mask);
}
ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
if (!ret && oact) {
if (!access_ok(oact, sizeof(*oact)) ||
__put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
__put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
__put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
__put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
return -EFAULT;
}
return ret;
}
#endif
#ifdef CONFIG_COMPAT_OLD_SIGACTION
COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
const struct compat_old_sigaction __user *, act,
struct compat_old_sigaction __user *, oact)
{
struct k_sigaction new_ka, old_ka;
int ret;
compat_old_sigset_t mask;
compat_uptr_t handler, restorer;
if (act) {
if (!access_ok(act, sizeof(*act)) ||
__get_user(handler, &act->sa_handler) ||
__get_user(restorer, &act->sa_restorer) ||
__get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
__get_user(mask, &act->sa_mask))
return -EFAULT;
#ifdef __ARCH_HAS_KA_RESTORER
new_ka.ka_restorer = NULL;
#endif
new_ka.sa.sa_handler = compat_ptr(handler);
new_ka.sa.sa_restorer = compat_ptr(restorer);
siginitset(&new_ka.sa.sa_mask, mask);
}
ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
if (!ret && oact) {
if (!access_ok(oact, sizeof(*oact)) ||
__put_user(ptr_to_compat(old_ka.sa.sa_handler),
&oact->sa_handler) ||
__put_user(ptr_to_compat(old_ka.sa.sa_restorer),
&oact->sa_restorer) ||
__put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
__put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
return -EFAULT;
}
return ret;
}
#endif
#ifdef CONFIG_SGETMASK_SYSCALL
/*
* For backwards compatibility. Functionality superseded by sigprocmask.
*/
SYSCALL_DEFINE0(sgetmask)
{
/* SMP safe */
return current->blocked.sig[0];
}
SYSCALL_DEFINE1(ssetmask, int, newmask)
{
int old = current->blocked.sig[0];
sigset_t newset;
siginitset(&newset, newmask);
set_current_blocked(&newset);
return old;
}
#endif /* CONFIG_SGETMASK_SYSCALL */
#ifdef __ARCH_WANT_SYS_SIGNAL
/*
* For backwards compatibility. Functionality superseded by sigaction.
*/
SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
{
struct k_sigaction new_sa, old_sa;
int ret;
new_sa.sa.sa_handler = handler;
new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
sigemptyset(&new_sa.sa.sa_mask);
ret = do_sigaction(sig, &new_sa, &old_sa);
return ret ? ret : (unsigned long)old_sa.sa.sa_handler;
}
#endif /* __ARCH_WANT_SYS_SIGNAL */
#ifdef __ARCH_WANT_SYS_PAUSE
SYSCALL_DEFINE0(pause)
{
while (!signal_pending(current)) {
__set_current_state(TASK_INTERRUPTIBLE);
schedule();
}
return -ERESTARTNOHAND;
}
#endif
static int sigsuspend(sigset_t *set)
{
current->saved_sigmask = current->blocked;
set_current_blocked(set);
while (!signal_pending(current)) {
__set_current_state(TASK_INTERRUPTIBLE);
schedule();
}
set_restore_sigmask();
return -ERESTARTNOHAND;
}
/**
* sys_rt_sigsuspend - replace the signal mask for a value with the
* @unewset value until a signal is received
* @unewset: new signal mask value
* @sigsetsize: size of sigset_t type
*/
SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
{
sigset_t newset;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (copy_from_user(&newset, unewset, sizeof(newset)))
return -EFAULT;
return sigsuspend(&newset);
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
{
sigset_t newset;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if (get_compat_sigset(&newset, unewset))
return -EFAULT;
return sigsuspend(&newset);
}
#endif
#ifdef CONFIG_OLD_SIGSUSPEND
SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
{
sigset_t blocked;
siginitset(&blocked, mask);
return sigsuspend(&blocked);
}
#endif
#ifdef CONFIG_OLD_SIGSUSPEND3
SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
{
sigset_t blocked;
siginitset(&blocked, mask);
return sigsuspend(&blocked);
}
#endif
__weak const char *arch_vma_name(struct vm_area_struct *vma)
{
return NULL;
}
static inline void siginfo_buildtime_checks(void)
{
BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE);
/* Verify the offsets in the two siginfos match */
#define CHECK_OFFSET(field) \
BUILD_BUG_ON(offsetof(siginfo_t, field) != offsetof(kernel_siginfo_t, field))
/* kill */
CHECK_OFFSET(si_pid);
CHECK_OFFSET(si_uid);
/* timer */
CHECK_OFFSET(si_tid);
CHECK_OFFSET(si_overrun);
CHECK_OFFSET(si_value);
/* rt */
CHECK_OFFSET(si_pid);
CHECK_OFFSET(si_uid);
CHECK_OFFSET(si_value);
/* sigchld */
CHECK_OFFSET(si_pid);
CHECK_OFFSET(si_uid);
CHECK_OFFSET(si_status);
CHECK_OFFSET(si_utime);
CHECK_OFFSET(si_stime);
/* sigfault */
CHECK_OFFSET(si_addr);
CHECK_OFFSET(si_trapno);
CHECK_OFFSET(si_addr_lsb);
CHECK_OFFSET(si_lower);
CHECK_OFFSET(si_upper);
CHECK_OFFSET(si_pkey);
CHECK_OFFSET(si_perf_data);
CHECK_OFFSET(si_perf_type);
/* sigpoll */
CHECK_OFFSET(si_band);
CHECK_OFFSET(si_fd);
/* sigsys */
CHECK_OFFSET(si_call_addr);
CHECK_OFFSET(si_syscall);
CHECK_OFFSET(si_arch);
#undef CHECK_OFFSET
/* usb asyncio */
BUILD_BUG_ON(offsetof(struct siginfo, si_pid) !=
offsetof(struct siginfo, si_addr));
if (sizeof(int) == sizeof(void __user *)) {
BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) !=
sizeof(void __user *));
} else {
BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) +
sizeof_field(struct siginfo, si_uid)) !=
sizeof(void __user *));
BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) !=
offsetof(struct siginfo, si_uid));
}
#ifdef CONFIG_COMPAT
BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) !=
offsetof(struct compat_siginfo, si_addr));
BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
sizeof(compat_uptr_t));
BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
sizeof_field(struct siginfo, si_pid));
#endif
}
void __init signals_init(void)
{
siginfo_buildtime_checks();
sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT);
}
#ifdef CONFIG_KGDB_KDB
#include <linux/kdb.h>
/*
* kdb_send_sig - Allows kdb to send signals without exposing
* signal internals. This function checks if the required locks are
* available before calling the main signal code, to avoid kdb
* deadlocks.
*/
void kdb_send_sig(struct task_struct *t, int sig)
{
static struct task_struct *kdb_prev_t;
int new_t, ret;
if (!spin_trylock(&t->sighand->siglock)) {
kdb_printf("Can't do kill command now.\n"
"The sigmask lock is held somewhere else in "
"kernel, try again later\n");
return;
}
new_t = kdb_prev_t != t;
kdb_prev_t = t;
if (!task_is_running(t) && new_t) {
spin_unlock(&t->sighand->siglock);
kdb_printf("Process is not RUNNING, sending a signal from "
"kdb risks deadlock\n"
"on the run queue locks. "
"The signal has _not_ been sent.\n"
"Reissue the kill command if you want to risk "
"the deadlock.\n");
return;
}
ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
spin_unlock(&t->sighand->siglock);
if (ret)
kdb_printf("Fail to deliver Signal %d to process %d.\n",
sig, t->pid);
else
kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
}
#endif /* CONFIG_KGDB_KDB */
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET3 Protocol independent device support routines.
*
* Derived from the non IP parts of dev.c 1.0.19
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Mark Evans, <evansmp@uhura.aston.ac.uk>
*
* Additional Authors:
* Florian la Roche <rzsfl@rz.uni-sb.de>
* Alan Cox <gw4pts@gw4pts.ampr.org>
* David Hinds <dahinds@users.sourceforge.net>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
* Adam Sulmicki <adam@cfar.umd.edu>
* Pekka Riikonen <priikone@poesidon.pspt.fi>
*
* Changes:
* D.J. Barrow : Fixed bug where dev->refcnt gets set
* to 2 if register_netdev gets called
* before net_dev_init & also removed a
* few lines of code in the process.
* Alan Cox : device private ioctl copies fields back.
* Alan Cox : Transmit queue code does relevant
* stunts to keep the queue safe.
* Alan Cox : Fixed double lock.
* Alan Cox : Fixed promisc NULL pointer trap
* ???????? : Support the full private ioctl range
* Alan Cox : Moved ioctl permission check into
* drivers
* Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
* Alan Cox : 100 backlog just doesn't cut it when
* you start doing multicast video 8)
* Alan Cox : Rewrote net_bh and list manager.
* Alan Cox : Fix ETH_P_ALL echoback lengths.
* Alan Cox : Took out transmit every packet pass
* Saved a few bytes in the ioctl handler
* Alan Cox : Network driver sets packet type before
* calling netif_rx. Saves a function
* call a packet.
* Alan Cox : Hashed net_bh()
* Richard Kooijman: Timestamp fixes.
* Alan Cox : Wrong field in SIOCGIFDSTADDR
* Alan Cox : Device lock protection.
* Alan Cox : Fixed nasty side effect of device close
* changes.
* Rudi Cilibrasi : Pass the right thing to
* set_mac_address()
* Dave Miller : 32bit quantity for the device lock to
* make it work out on a Sparc.
* Bjorn Ekwall : Added KERNELD hack.
* Alan Cox : Cleaned up the backlog initialise.
* Craig Metz : SIOCGIFCONF fix if space for under
* 1 device.
* Thomas Bogendoerfer : Return ENODEV for dev_open, if there
* is no device open function.
* Andi Kleen : Fix error reporting for SIOCGIFCONF
* Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
* Cyrus Durgin : Cleaned for KMOD
* Adam Sulmicki : Bug Fix : Network Device Unload
* A network device unload needs to purge
* the backlog queue.
* Paul Rusty Russell : SIOCSIFNAME
* Pekka Riikonen : Netdev boot-time settings code
* Andrew Morton : Make unregister_netdevice wait
* indefinitely on dev->refcnt
* J Hadi Salim : - Backlog queue sampling
* - netif_rx() feedback
*/
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/gro.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <trace/events/qdisc.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <net/devlink.h>
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
#include "net-sysfs.h"
#define MAX_GRO_SKBS 8
/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly; /* Taps */
static struct list_head offload_base __read_mostly;
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
struct netdev_notifier_info *info);
static int call_netdevice_notifiers_extack(unsigned long val,
struct net_device *dev,
struct netlink_ext_ack *extack);
static struct napi_struct *napi_by_id(unsigned int napi_id);
/*
* The @dev_base_head list is protected by @dev_base_lock and the rtnl
* semaphore.
*
* Pure readers hold dev_base_lock for reading, or rcu_read_lock()
*
* Writers must hold the rtnl semaphore while they loop through the
* dev_base_head list, and hold dev_base_lock for writing when they do the
* actual updates. This allows pure readers to access the list even
* while a writer is preparing to update it.
*
* To put it another way, dev_base_lock is held for writing only to
* protect against pure readers; the rtnl semaphore provides the
* protection against other writers.
*
* See, for example usages, register_netdevice() and
* unregister_netdevice(), which must be called with the rtnl
* semaphore held.
*/
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
static DEFINE_MUTEX(ifalias_mutex);
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
static DECLARE_RWSEM(devnet_rename_sem);
static inline void dev_base_seq_inc(struct net *net)
{
while (++net->dev_base_seq == 0)
;
}
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}
static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
spin_lock(&sd->input_pkt_queue.lock);
#endif
}
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
const char *name)
{
struct netdev_name_node *name_node;
name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
if (!name_node)
return NULL;
INIT_HLIST_NODE(&name_node->hlist);
name_node->dev = dev;
name_node->name = name;
return name_node;
}
static struct netdev_name_node *
netdev_name_node_head_alloc(struct net_device *dev)
{
struct netdev_name_node *name_node;
name_node = netdev_name_node_alloc(dev, dev->name);
if (!name_node)
return NULL;
INIT_LIST_HEAD(&name_node->list);
return name_node;
}
static void netdev_name_node_free(struct netdev_name_node *name_node)
{
kfree(name_node);
}
static void netdev_name_node_add(struct net *net,
struct netdev_name_node *name_node)
{
hlist_add_head_rcu(&name_node->hlist,
dev_name_hash(net, name_node->name));
}
static void netdev_name_node_del(struct netdev_name_node *name_node)
{
hlist_del_rcu(&name_node->hlist);
}
static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
const char *name)
{
struct hlist_head *head = dev_name_hash(net, name);
struct netdev_name_node *name_node;
hlist_for_each_entry(name_node, head, hlist)
if (!strcmp(name_node->name, name))
return name_node;
return NULL;
}
static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
const char *name)
{
struct hlist_head *head = dev_name_hash(net, name);
struct netdev_name_node *name_node;
hlist_for_each_entry_rcu(name_node, head, hlist)
if (!strcmp(name_node->name, name))
return name_node;
return NULL;
}
int netdev_name_node_alt_create(struct net_device *dev, const char *name)
{
struct netdev_name_node *name_node;
struct net *net = dev_net(dev);
name_node = netdev_name_node_lookup(net, name);
if (name_node)
return -EEXIST;
name_node = netdev_name_node_alloc(dev, name);
if (!name_node)
return -ENOMEM;
netdev_name_node_add(net, name_node);
/* The node that holds dev->name acts as a head of per-device list. */
list_add_tail(&name_node->list, &dev->name_node->list);
return 0;
}
EXPORT_SYMBOL(netdev_name_node_alt_create);
static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
list_del(&name_node->list);
netdev_name_node_del(name_node);
kfree(name_node->name);
netdev_name_node_free(name_node);
}
int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
{
struct netdev_name_node *name_node;
struct net *net = dev_net(dev);
name_node = netdev_name_node_lookup(net, name);
if (!name_node)
return -ENOENT;
/* lookup might have found our primary name or a name belonging
* to another device.
*/
if (name_node == dev->name_node || name_node->dev != dev)
return -EINVAL;
__netdev_name_node_alt_destroy(name_node);
return 0;
}
EXPORT_SYMBOL(netdev_name_node_alt_destroy);
static void netdev_name_node_alt_flush(struct net_device *dev)
{
struct netdev_name_node *name_node, *tmp;
list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
__netdev_name_node_alt_destroy(name_node);
}
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
struct net *net = dev_net(dev);
ASSERT_RTNL();
write_lock_bh(&dev_base_lock);
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
netdev_name_node_add(net, dev->name_node);
hlist_add_head_rcu(&dev->index_hlist,
dev_index_hash(net, dev->ifindex));
write_unlock_bh(&dev_base_lock);
dev_base_seq_inc(net);
}
/* Device list removal
* caller must respect a RCU grace period before freeing/reusing dev
*/
static void unlist_netdevice(struct net_device *dev)
{
ASSERT_RTNL();
/* Unlink dev from the device chain */
write_lock_bh(&dev_base_lock);
list_del_rcu(&dev->dev_list);
netdev_name_node_del(dev->name_node);
hlist_del_rcu(&dev->index_hlist);
write_unlock_bh(&dev_base_lock);
dev_base_seq_inc(dev_net(dev));
}
/*
* Our notifier list
*/
static RAW_NOTIFIER_HEAD(netdev_chain);
/*
* Device drivers call our routines to queue packets here. We empty the
* queue in the local softnet handler.
*/
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
#ifdef CONFIG_LOCKDEP
/*
* register_netdevice() inits txq->_xmit_lock and sets lockdep class
* according to dev->type
*/
static const unsigned short netdev_lock_type[] = {
ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
static const char *const netdev_lock_name[] = {
"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
int i;
for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
if (netdev_lock_type[i] == dev_type)
return i;
/* the last key is used by default */
return ARRAY_SIZE(netdev_lock_type) - 1;
}
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
unsigned short dev_type)
{
int i;
i = netdev_lock_pos(dev_type);
lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
netdev_lock_name[i]);
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
int i;
i = netdev_lock_pos(dev->type);
lockdep_set_class_and_name(&dev->addr_list_lock,
&netdev_addr_lock_key[i],
netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
/*******************************************************************************
*
* Protocol management and registration routines
*
*******************************************************************************/
/*
* Add a protocol ID to the list. Now that the input handler is
* smarter we can dispense with all the messy stuff that used to be
* here.
*
* BEWARE!!! Protocol handlers, mangling input packets,
* MUST BE last in hash buckets and checking protocol handlers
* MUST start from promiscuous ptype_all chain in net_bh.
* It is true now, do not change it.
* Explanation follows: if protocol handler, mangling packet, will
* be the first on list, it is not able to sense, that packet
* is cloned and should be copied-on-write, so that it will
* change it and subsequent readers will get broken packet.
* --ANK (980803)
*/
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
if (pt->type == htons(ETH_P_ALL))
return pt->dev ? &pt->dev->ptype_all : &ptype_all;
else
return pt->dev ? &pt->dev->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
/**
* dev_add_pack - add packet handler
* @pt: packet type declaration
*
* Add a protocol handler to the networking stack. The passed &packet_type
* is linked into kernel lists and may not be freed until it has been
* removed from the kernel lists.
*
* This call does not sleep therefore it can not
* guarantee all CPU's that are in middle of receiving packets
* will see the new packet type (until the next received packet).
*/
void dev_add_pack(struct packet_type *pt)
{
struct list_head *head = ptype_head(pt);
spin_lock(&ptype_lock);
list_add_rcu(&pt->list, head);
spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
/**
* __dev_remove_pack - remove packet handler
* @pt: packet type declaration
*
* Remove a protocol handler that was previously added to the kernel
* protocol handlers by dev_add_pack(). The passed &packet_type is removed
* from the kernel lists and can be freed or reused once this function
* returns.
*
* The packet type might still be in use by receivers
* and must not be freed until after all the CPU's have gone
* through a quiescent state.
*/
void __dev_remove_pack(struct packet_type *pt)
{
struct list_head *head = ptype_head(pt);
struct packet_type *pt1;
spin_lock(&ptype_lock);
list_for_each_entry(pt1, head, list) {
if (pt == pt1) {
list_del_rcu(&pt->list);
goto out;
}
}
pr_warn("dev_remove_pack: %p not found\n", pt);
out:
spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
/**
* dev_remove_pack - remove packet handler
* @pt: packet type declaration
*
* Remove a protocol handler that was previously added to the kernel
* protocol handlers by dev_add_pack(). The passed &packet_type is removed
* from the kernel lists and can be freed or reused once this function
* returns.
*
* This call sleeps to guarantee that no CPU is looking at the packet
* type after return.
*/
void dev_remove_pack(struct packet_type *pt)
{
__dev_remove_pack(pt);
synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
/**
* dev_add_offload - register offload handlers
* @po: protocol offload declaration
*
* Add protocol offload handlers to the networking stack. The passed
* &proto_offload is linked into kernel lists and may not be freed until
* it has been removed from the kernel lists.
*
* This call does not sleep therefore it can not
* guarantee all CPU's that are in middle of receiving packets
* will see the new offload handlers (until the next received packet).
*/
void dev_add_offload(struct packet_offload *po)
{
struct packet_offload *elem;
spin_lock(&offload_lock);
list_for_each_entry(elem, &offload_base, list) {
if (po->priority < elem->priority)
break;
}
list_add_rcu(&po->list, elem->list.prev);
spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
/**
* __dev_remove_offload - remove offload handler
* @po: packet offload declaration
*
* Remove a protocol offload handler that was previously added to the
* kernel offload handlers by dev_add_offload(). The passed &offload_type
* is removed from the kernel lists and can be freed or reused once this
* function returns.
*
* The packet type might still be in use by receivers
* and must not be freed until after all the CPU's have gone
* through a quiescent state.
*/
static void __dev_remove_offload(struct packet_offload *po)
{
struct list_head *head = &offload_base;
struct packet_offload *po1;
spin_lock(&offload_lock);
list_for_each_entry(po1, head, list) {
if (po == po1) {
list_del_rcu(&po->list);
goto out;
}
}
pr_warn("dev_remove_offload: %p not found\n", po);
out:
spin_unlock(&offload_lock);
}
/**
* dev_remove_offload - remove packet offload handler
* @po: packet offload declaration
*
* Remove a packet offload handler that was previously added to the kernel
* offload handlers by dev_add_offload(). The passed &offload_type is
* removed from the kernel lists and can be freed or reused once this
* function returns.
*
* This call sleeps to guarantee that no CPU is looking at the packet
* type after return.
*/
void dev_remove_offload(struct packet_offload *po)
{
__dev_remove_offload(po);
synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
/*******************************************************************************
*
* Device Interface Subroutines
*
*******************************************************************************/
/**
* dev_get_iflink - get 'iflink' value of a interface
* @dev: targeted interface
*
* Indicates the ifindex the interface is linked to.
* Physical interfaces have the same 'ifindex' and 'iflink' values.
*/
int dev_get_iflink(const struct net_device *dev)
{
if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
return dev->netdev_ops->ndo_get_iflink(dev);
return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);
/**
* dev_fill_metadata_dst - Retrieve tunnel egress information.
* @dev: targeted interface
* @skb: The packet.
*
* For better visibility of tunnel traffic OVS needs to retrieve
* egress tunnel information for a packet. Following API allows
* user to get this info.
*/
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
struct ip_tunnel_info *info;
if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
return -EINVAL;
info = skb_tunnel_info_unclone(skb);
if (!info)
return -ENOMEM;
if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
return -EINVAL;
return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
{
int k = stack->num_paths++;
if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
return NULL;
return &stack->path[k];
}
int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
struct net_device_path_stack *stack)
{
const struct net_device *last_dev;
struct net_device_path_ctx ctx = {
.dev = dev,
.daddr = daddr,
};
struct net_device_path *path;
int ret = 0;
stack->num_paths = 0;
while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
last_dev = ctx.dev;
path = dev_fwd_path(stack);
if (!path)
return -1;
memset(path, 0, sizeof(struct net_device_path));
ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
if (ret < 0)
return -1;
if (WARN_ON_ONCE(last_dev == ctx.dev))
return -1;
}
path = dev_fwd_path(stack);
if (!path)
return -1;
path->type = DEV_PATH_ETHERNET;
path->dev = ctx.dev;
return ret;
}
EXPORT_SYMBOL_GPL(dev_fill_forward_path);
/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
* @name: name to find
*
* Find an interface by name. Must be called under RTNL semaphore
* or @dev_base_lock. If the name is found a pointer to the device
* is returned. If the name is not found then %NULL is returned. The
* reference counters are not incremented so the caller must be
* careful with locks.
*/
struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
struct netdev_name_node *node_name;
node_name = netdev_name_node_lookup(net, name);
return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
/**
* dev_get_by_name_rcu - find a device by its name
* @net: the applicable net namespace
* @name: name to find
*
* Find an interface by name.
* If the name is found a pointer to the device is returned.
* If the name is not found then %NULL is returned.
* The reference counters are not incremented so the caller must be
* careful with locks. The caller must hold RCU lock.
*/
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
struct netdev_name_node *node_name;
node_name = netdev_name_node_lookup_rcu(net, name);
return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
/**
* dev_get_by_name - find a device by its name
* @net: the applicable net namespace
* @name: name to find
*
* Find an interface by name. This can be called from any
* context and does its own locking. The returned handle has
* the usage count incremented and the caller must use dev_put() to
* release it when it is no longer needed. %NULL is returned if no
* matching device is found.
*/
struct net_device *dev_get_by_name(struct net *net, const char *name)
{
struct net_device *dev;
rcu_read_lock();
dev = dev_get_by_name_rcu(net, name);
dev_hold(dev);
rcu_read_unlock();
return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
/**
* __dev_get_by_index - find a device by its ifindex
* @net: the applicable net namespace
* @ifindex: index of device
*
* Search for an interface by index. Returns %NULL if the device
* is not found or a pointer to the device. The device has not
* had its reference counter increased so the caller must be careful
* about locking. The caller must hold either the RTNL semaphore
* or @dev_base_lock.
*/
struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex)
return dev;
return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);
/**
* dev_get_by_index_rcu - find a device by its ifindex
* @net: the applicable net namespace
* @ifindex: index of device
*
* Search for an interface by index. Returns %NULL if the device
* is not found or a pointer to the device. The device has not
* had its reference counter increased so the caller must be careful
* about locking. The caller must hold RCU lock.
*/
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
struct net_device *dev;
struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry_rcu(dev, head, index_hlist) if (dev->ifindex == ifindex)
return dev;
return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);
/**
* dev_get_by_index - find a device by its ifindex
* @net: the applicable net namespace
* @ifindex: index of device
*
* Search for an interface by index. Returns NULL if the device
* is not found or a pointer to the device. The device returned has
* had a reference added and the pointer is safe until the user calls
* dev_put to indicate they have finished with it.
*/
struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
dev_hold(dev);
rcu_read_unlock();
return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
/**
* dev_get_by_napi_id - find a device by napi_id
* @napi_id: ID of the NAPI struct
*
* Search for an interface by NAPI ID. Returns %NULL if the device
* is not found or a pointer to the device. The device has not had
* its reference counter increased so the caller must be careful
* about locking. The caller must hold RCU lock.
*/
struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
struct napi_struct *napi;
WARN_ON_ONCE(!rcu_read_lock_held());
if (napi_id < MIN_NAPI_ID)
return NULL;
napi = napi_by_id(napi_id);
return napi ? napi->dev : NULL;
}
EXPORT_SYMBOL(dev_get_by_napi_id);
/**
* netdev_get_name - get a netdevice name, knowing its ifindex.
* @net: network namespace
* @name: a pointer to the buffer where the name will be stored.
* @ifindex: the ifindex of the interface to get the name from.
*/
int netdev_get_name(struct net *net, char *name, int ifindex)
{
struct net_device *dev;
int ret;
down_read(&devnet_rename_sem);
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
if (!dev) {
ret = -ENODEV;
goto out;
}
strcpy(name, dev->name);
ret = 0;
out:
rcu_read_unlock();
up_read(&devnet_rename_sem);
return ret;
}
/**
* dev_getbyhwaddr_rcu - find a device by its hardware address
* @net: the applicable net namespace
* @type: media type of device
* @ha: hardware address
*
* Search for an interface by MAC address. Returns NULL if the device
* is not found or a pointer to the device.
* The caller must hold RCU or RTNL.
* The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
*/
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
const char *ha)
{
struct net_device *dev;
for_each_netdev_rcu(net, dev)
if (dev->type == type &&
!memcmp(dev->dev_addr, ha, dev->addr_len))
return dev;
return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
struct net_device *dev, *ret = NULL;
rcu_read_lock();
for_each_netdev_rcu(net, dev)
if (dev->type == type) {
dev_hold(dev);
ret = dev;
break;
}
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
* __dev_get_by_flags - find any device with given flags
* @net: the applicable net namespace
* @if_flags: IFF_* values
* @mask: bitmask of bits in if_flags to check
*
* Search for any interface with the given flags. Returns NULL if a device
* is not found or a pointer to the device. Must be called inside
* rtnl_lock(), and result refcount is unchanged.
*/
struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
unsigned short mask)
{
struct net_device *dev, *ret;
ASSERT_RTNL();
ret = NULL;
for_each_netdev(net, dev) {
if (((dev->flags ^ if_flags) & mask) == 0) {
ret = dev;
break;
}
}
return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);
/**
* dev_valid_name - check if name is okay for network device
* @name: name string
*
* Network device names need to be valid file names to
* allow sysfs to work. We also disallow any kind of
* whitespace.
*/
bool dev_valid_name(const char *name)
{
if (*name == '\0')
return false;
if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
return false;
if (!strcmp(name, ".") || !strcmp(name, ".."))
return false;
while (*name) {
if (*name == '/' || *name == ':' || isspace(*name))
return false;
name++;
}
return true;
}
EXPORT_SYMBOL(dev_valid_name);
/**
* __dev_alloc_name - allocate a name for a device
* @net: network namespace to allocate the device name in
* @name: name format string
* @buf: scratch buffer and result name string
*
* Passed a format string - eg "lt%d" it will try and find a suitable
* id. It scans list of devices to build up a free map, then chooses
* the first empty slot. The caller must hold the dev_base or rtnl lock
* while allocating the name and adding the device in order to avoid
* duplicates.
* Limited to bits_per_byte * page size devices (ie 32K on most platforms).
* Returns the number of the unit assigned or a negative errno code.
*/
static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
int i = 0;
const char *p;
const int max_netdevices = 8*PAGE_SIZE;
unsigned long *inuse;
struct net_device *d;
if (!dev_valid_name(name))
return -EINVAL;
p = strchr(name, '%');
if (p) {
/*
* Verify the string as this thing may have come from
* the user. There must be either one "%d" and no other "%"
* characters.
*/
if (p[1] != 'd' || strchr(p + 2, '%'))
return -EINVAL;
/* Use one page as a bit array of possible slots */
inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
if (!inuse)
return -ENOMEM;
for_each_netdev(net, d) {
struct netdev_name_node *name_node;
list_for_each_entry(name_node, &d->name_node->list, list) {
if (!sscanf(name_node->name, name, &i))
continue;
if (i < 0 || i >= max_netdevices)
continue;
/* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i);
if (!strncmp(buf, name_node->name, IFNAMSIZ))
set_bit(i, inuse);
}
if (!sscanf(d->name, name, &i))
continue;
if (i < 0 || i >= max_netdevices)
continue;
/* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i);
if (!strncmp(buf, d->name, IFNAMSIZ))
set_bit(i, inuse);
}
i = find_first_zero_bit(inuse, max_netdevices);
free_page((unsigned long) inuse);
}
snprintf(buf, IFNAMSIZ, name, i);
if (!__dev_get_by_name(net, buf))
return i;
/* It is possible to run out of possible slots
* when the name is long and there isn't enough space left
* for the digits, or if all bits are used.
*/
return -ENFILE;
}
static int dev_alloc_name_ns(struct net *net,
struct net_device *dev,
const char *name)
{
char buf[IFNAMSIZ];
int ret;
BUG_ON(!net);
ret = __dev_alloc_name(net, name, buf);
if (ret >= 0)
strlcpy(dev->name, buf, IFNAMSIZ);
return ret;
}
/**
* dev_alloc_name - allocate a name for a device
* @dev: device
* @name: name format string
*
* Passed a format string - eg "lt%d" it will try and find a suitable
* id. It scans list of devices to build up a free map, then chooses
* the first empty slot. The caller must hold the dev_base or rtnl lock
* while allocating the name and adding the device in order to avoid
* duplicates.
* Limited to bits_per_byte * page size devices (ie 32K on most platforms).
* Returns the number of the unit assigned or a negative errno code.
*/
int dev_alloc_name(struct net_device *dev, const char *name)
{
return dev_alloc_name_ns(dev_net(dev), dev, name);
}
EXPORT_SYMBOL(dev_alloc_name);
static int dev_get_valid_name(struct net *net, struct net_device *dev,
const char *name)
{
BUG_ON(!net);
if (!dev_valid_name(name))
return -EINVAL;
if (strchr(name, '%'))
return dev_alloc_name_ns(net, dev, name);
else if (__dev_get_by_name(net, name))
return -EEXIST;
else if (dev->name != name)
strlcpy(dev->name, name, IFNAMSIZ);
return 0;
}
/**
* dev_change_name - change name of a device
* @dev: device
* @newname: name (or format string) must be at least IFNAMSIZ
*
* Change name of a device, can pass format strings "eth%d".
* for wildcarding.
*/
int dev_change_name(struct net_device *dev, const char *newname)
{
unsigned char old_assign_type;
char oldname[IFNAMSIZ];
int err = 0;
int ret;
struct net *net;
ASSERT_RTNL();
BUG_ON(!dev_net(dev));
net = dev_net(dev);
/* Some auto-enslaved devices e.g. failover slaves are
* special, as userspace might rename the device after
* the interface had been brought up and running since
* the point kernel initiated auto-enslavement. Allow
* live name change even when these slave devices are
* up and running.
*
* Typically, users of these auto-enslaving devices
* don't actually care about slave name change, as
* they are supposed to operate on master interface
* directly.
*/
if (dev->flags & IFF_UP &&
likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
return -EBUSY;
down_write(&devnet_rename_sem);
if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
up_write(&devnet_rename_sem);
return 0;
}
memcpy(oldname, dev->name, IFNAMSIZ);
err = dev_get_valid_name(net, dev, newname);
if (err < 0) {
up_write(&devnet_rename_sem);
return err;
}
if (oldname[0] && !strchr(oldname, '%'))
netdev_info(dev, "renamed from %s\n", oldname);
old_assign_type = dev->name_assign_type;
dev->name_assign_type = NET_NAME_RENAMED;
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
memcpy(dev->name, oldname, IFNAMSIZ);
dev->name_assign_type = old_assign_type;
up_write(&devnet_rename_sem);
return ret;
}
up_write(&devnet_rename_sem);
netdev_adjacent_rename_links(dev, oldname);
write_lock_bh(&dev_base_lock);
netdev_name_node_del(dev->name_node);
write_unlock_bh(&dev_base_lock);
synchronize_rcu();
write_lock_bh(&dev_base_lock);
netdev_name_node_add(net, dev->name_node);
write_unlock_bh(&dev_base_lock);
ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
ret = notifier_to_errno(ret);
if (ret) {
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
down_write(&devnet_rename_sem);
memcpy(dev->name, oldname, IFNAMSIZ);
memcpy(oldname, newname, IFNAMSIZ);
dev->name_assign_type = old_assign_type;
old_assign_type = NET_NAME_RENAMED;
goto rollback;
} else {
pr_err("%s: name change rollback failed: %d\n",
dev->name, ret);
}
}
return err;
}
/**
* dev_set_alias - change ifalias of a device
* @dev: device
* @alias: name up to IFALIASZ
* @len: limit of bytes to copy from info
*
* Set ifalias for a device,
*/
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
struct dev_ifalias *new_alias = NULL;
if (len >= IFALIASZ)
return -EINVAL;
if (len) {
new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
if (!new_alias)
return -ENOMEM;
memcpy(new_alias->ifalias, alias, len);
new_alias->ifalias[len] = 0;
}
mutex_lock(&ifalias_mutex);
new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
mutex_is_locked(&ifalias_mutex));
mutex_unlock(&ifalias_mutex);
if (new_alias)
kfree_rcu(new_alias, rcuhead);
return len;
}
EXPORT_SYMBOL(dev_set_alias);
/**
* dev_get_alias - get ifalias of a device
* @dev: device
* @name: buffer to store name of ifalias
* @len: size of buffer
*
* get ifalias for a device. Caller must make sure dev cannot go
* away, e.g. rcu read lock or own a reference count to device.
*/
int dev_get_alias(const struct net_device *dev, char *name, size_t len)
{
const struct dev_ifalias *alias;
int ret = 0;
rcu_read_lock();
alias = rcu_dereference(dev->ifalias);
if (alias)
ret = snprintf(name, len, "%s", alias->ifalias);
rcu_read_unlock();
return ret;
}
/**
* netdev_features_change - device changes features
* @dev: device to cause notification
*
* Called to indicate a device has changed features.
*/
void netdev_features_change(struct net_device *dev)
{
call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
/**
* netdev_state_change - device changes state
* @dev: device to cause notification
*
* Called to indicate a device has changed state. This function calls
* the notifier chains for netdev_chain and sends a NEWLINK message
* to the routing socket.
*/
void netdev_state_change(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
struct netdev_notifier_change_info change_info = {
.info.dev = dev,
};
call_netdevice_notifiers_info(NETDEV_CHANGE,
&change_info.info);
rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
}
}
EXPORT_SYMBOL(netdev_state_change);
/**
* __netdev_notify_peers - notify network peers about existence of @dev,
* to be called when rtnl lock is already held.
* @dev: network device
*
* Generate traffic such that interested network peers are aware of
* @dev, such as by generating a gratuitous ARP. This may be used when
* a device wants to inform the rest of the network about some sort of
* reconfiguration such as a failover event or virtual machine
* migration.
*/
void __netdev_notify_peers(struct net_device *dev)
{
ASSERT_RTNL();
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
}
EXPORT_SYMBOL(__netdev_notify_peers);
/**
* netdev_notify_peers - notify network peers about existence of @dev
* @dev: network device
*
* Generate traffic such that interested network peers are aware of
* @dev, such as by generating a gratuitous ARP. This may be used when
* a device wants to inform the rest of the network about some sort of
* reconfiguration such as a failover event or virtual machine
* migration.
*/
void netdev_notify_peers(struct net_device *dev)
{
rtnl_lock();
__netdev_notify_peers(dev);
rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
static int napi_threaded_poll(void *data);
static int napi_kthread_create(struct napi_struct *n)
{
int err = 0;
/* Create and wake up the kthread once to put it in
* TASK_INTERRUPTIBLE mode to avoid the blocked task
* warning and work with loadavg.
*/
n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
n->dev->name, n->napi_id);
if (IS_ERR(n->thread)) {
err = PTR_ERR(n->thread);
pr_err("kthread_run failed with err %d\n", err);
n->thread = NULL;
}
return err;
}
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int ret;
ASSERT_RTNL();
if (!netif_device_present(dev)) {
/* may be detached because parent is runtime-suspended */
if (dev->dev.parent)
pm_runtime_resume(dev->dev.parent);
if (!netif_device_present(dev))
return -ENODEV;
}
/* Block netpoll from trying to do any rx path servicing.
* If we don't do this there is a chance ndo_poll_controller
* or ndo_poll may be running while we open the device
*/
netpoll_poll_disable(dev);
ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
ret = notifier_to_errno(ret);
if (ret)
return ret;
set_bit(__LINK_STATE_START, &dev->state);
if (ops->ndo_validate_addr)
ret = ops->ndo_validate_addr(dev);
if (!ret && ops->ndo_open)
ret = ops->ndo_open(dev);
netpoll_poll_enable(dev);
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
else {
dev->flags |= IFF_UP;
dev_set_rx_mode(dev);
dev_activate(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
}
return ret;
}
/**
* dev_open - prepare an interface for use.
* @dev: device to open
* @extack: netlink extended ack
*
* Takes a device from down to up state. The device's private open
* function is invoked and then the multicast lists are loaded. Finally
* the device is moved into the up state and a %NETDEV_UP message is
* sent to the netdev notifier chain.
*
* Calling this function on an active interface is a nop. On a failure
* a negative errno code is returned.
*/
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
int ret;
if (dev->flags & IFF_UP)
return 0;
ret = __dev_open(dev, extack);
if (ret < 0)
return ret;
rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
call_netdevice_notifiers(NETDEV_UP, dev);
return ret;
}
EXPORT_SYMBOL(dev_open);
static void __dev_close_many(struct list_head *head)
{
struct net_device *dev;
ASSERT_RTNL();
might_sleep();
list_for_each_entry(dev, head, close_list) {
/* Temporarily disable netpoll until the interface is down */
netpoll_poll_disable(dev);
call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
clear_bit(__LINK_STATE_START, &dev->state);
/* Synchronize to scheduled poll. We cannot touch poll list, it
* can be even on different cpu. So just clear netif_running().
*
* dev->stop() will invoke napi_disable() on all of it's
* napi_struct instances on this device.
*/
smp_mb__after_atomic(); /* Commit netif_running(). */
}
dev_deactivate_many(head);
list_for_each_entry(dev, head, close_list) {
const struct net_device_ops *ops = dev->netdev_ops;
/*
* Call the device specific close. This cannot fail.
* Only if device is UP
*
* We allow it to be called even after a DETACH hot-plug
* event.
*/
if (ops->ndo_stop)
ops->ndo_stop(dev);
dev->flags &= ~IFF_UP;
netpoll_poll_enable(dev);
}
}
static void __dev_close(struct net_device *dev)
{
LIST_HEAD(single);
list_add(&dev->close_list, &single);
__dev_close_many(&single);
list_del(&single);
}
void dev_close_many(struct list_head *head, bool unlink)
{
struct net_device *dev, *tmp;
/* Remove the devices that don't need to be closed */
list_for_each_entry_safe(dev, tmp, head, close_list)
if (!(dev->flags & IFF_UP))
list_del_init(&dev->close_list);
__dev_close_many(head);
list_for_each_entry_safe(dev, tmp, head, close_list) {
rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
call_netdevice_notifiers(NETDEV_DOWN, dev);
if (unlink)
list_del_init(&dev->close_list);
}
}
EXPORT_SYMBOL(dev_close_many);
/**
* dev_close - shutdown an interface.
* @dev: device to shutdown
*
* This function moves an active device into down state. A
* %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
* is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
* chain.
*/
void dev_close(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
LIST_HEAD(single);
list_add(&dev->close_list, &single);
dev_close_many(&single, true);
list_del(&single);
}
}
EXPORT_SYMBOL(dev_close);
/**
* dev_disable_lro - disable Large Receive Offload on a device
* @dev: device
*
* Disable Large Receive Offload (LRO) on a net device. Must be
* called under RTNL. This is needed if received packets may be
* forwarded to another interface.
*/
void dev_disable_lro(struct net_device *dev)
{
struct net_device *lower_dev;
struct list_head *iter;
dev->wanted_features &= ~NETIF_F_LRO;
netdev_update_features(dev);
if (unlikely(dev->features & NETIF_F_LRO))
netdev_WARN(dev, "failed to disable LRO!\n");
netdev_for_each_lower_dev(dev, lower_dev, iter)
dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);
/**
* dev_disable_gro_hw - disable HW Generic Receive Offload on a device
* @dev: device
*
* Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
* called under RTNL. This is needed if Generic XDP is installed on
* the device.
*/
static void dev_disable_gro_hw(struct net_device *dev)
{
dev->wanted_features &= ~NETIF_F_GRO_HW;
netdev_update_features(dev);
if (unlikely(dev->features & NETIF_F_GRO_HW))
netdev_WARN(dev, "failed to disable GRO_HW!\n");
}
const char *netdev_cmd_to_name(enum netdev_cmd cmd)
{
#define N(val) \
case NETDEV_##val: \
return "NETDEV_" __stringify(val);
switch (cmd) {
N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
N(PRE_CHANGEADDR)
}
#undef N
return "UNKNOWN_NETDEV_EVENT";
}
EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
struct net_device *dev)
{
struct netdev_notifier_info info = {
.dev = dev,
};
return nb->notifier_call(nb, val, &info);
}
static int call_netdevice_register_notifiers(struct notifier_block *nb,
struct net_device *dev)
{
int err;
err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
err = notifier_to_errno(err);
if (err)
return err;
if (!(dev->flags & IFF_UP))
return 0;
call_netdevice_notifier(nb, NETDEV_UP, dev);
return 0;
}
static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
struct net_device *dev)
{
if (dev->flags & IFF_UP) {
call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
dev);
call_netdevice_notifier(nb, NETDEV_DOWN, dev);
}
call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
}
static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
struct net *net)
{
struct net_device *dev;
int err;
for_each_netdev(net, dev) {
err = call_netdevice_register_notifiers(nb, dev);
if (err)
goto rollback;
}
return 0;
rollback:
for_each_netdev_continue_reverse(net, dev)
call_netdevice_unregister_notifiers(nb, dev);
return err;
}
static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
struct net *net)
{
struct net_device *dev;
for_each_netdev(net, dev)
call_netdevice_unregister_notifiers(nb, dev);
}
static int dev_boot_phase = 1;
/**
* register_netdevice_notifier - register a network notifier block
* @nb: notifier
*
* Register a notifier to be called when network device events occur.
* The notifier passed is linked into the kernel structures and must
* not be reused until it has been unregistered. A negative errno code
* is returned on a failure.
*
* When registered all registration and up events are replayed
* to the new notifier to allow device to have a race free
* view of the network device list.
*/
int register_netdevice_notifier(struct notifier_block *nb)
{
struct net *net;
int err;
/* Close race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
rtnl_lock();
err = raw_notifier_chain_register(&netdev_chain, nb);
if (err)
goto unlock;
if (dev_boot_phase)
goto unlock;
for_each_net(net) {
err = call_netdevice_register_net_notifiers(nb, net);
if (err)
goto rollback;
}
unlock:
rtnl_unlock();
up_write(&pernet_ops_rwsem);
return err;
rollback:
for_each_net_continue_reverse(net)
call_netdevice_unregister_net_notifiers(nb, net);
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
/**
* unregister_netdevice_notifier - unregister a network notifier block
* @nb: notifier
*
* Unregister a notifier previously registered by
* register_netdevice_notifier(). The notifier is unlinked into the
* kernel structures and may then be reused. A negative errno code
* is returned on a failure.
*
* After unregistering unregister and down device events are synthesized
* for all devices on the device list to the removed notifier to remove
* the need for special case cleanup code.
*/
int unregister_netdevice_notifier(struct notifier_block *nb)
{
struct net *net;
int err;
/* Close race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
rtnl_lock();
err = raw_notifier_chain_unregister(&netdev_chain, nb);
if (err)
goto unlock;
for_each_net(net)
call_netdevice_unregister_net_notifiers(nb, net);
unlock:
rtnl_unlock();
up_write(&pernet_ops_rwsem);
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
static int __register_netdevice_notifier_net(struct net *net,
struct notifier_block *nb,
bool ignore_call_fail)
{
int err;
err = raw_notifier_chain_register(&net->netdev_chain, nb);
if (err)
return err;
if (dev_boot_phase)
return 0;
err = call_netdevice_register_net_notifiers(nb, net);
if (err && !ignore_call_fail)
goto chain_unregister;
return 0;
chain_unregister:
raw_notifier_chain_unregister(&net->netdev_chain, nb);
return err;
}
static int __unregister_netdevice_notifier_net(struct net *net,
struct notifier_block *nb)
{
int err;
err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
if (err)
return err;
call_netdevice_unregister_net_notifiers(nb, net);
return 0;
}
/**
* register_netdevice_notifier_net - register a per-netns network notifier block
* @net: network namespace
* @nb: notifier
*
* Register a notifier to be called when network device events occur.
* The notifier passed is linked into the kernel structures and must
* not be reused until it has been unregistered. A negative errno code
* is returned on a failure.
*
* When registered all registration and up events are replayed
* to the new notifier to allow device to have a race free
* view of the network device list.
*/
int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
int err;
rtnl_lock();
err = __register_netdevice_notifier_net(net, nb, false);
rtnl_unlock();
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);
/**
* unregister_netdevice_notifier_net - unregister a per-netns
* network notifier block
* @net: network namespace
* @nb: notifier
*
* Unregister a notifier previously registered by
* register_netdevice_notifier(). The notifier is unlinked into the
* kernel structures and may then be reused. A negative errno code
* is returned on a failure.
*
* After unregistering unregister and down device events are synthesized
* for all devices on the device list to the removed notifier to remove
* the need for special case cleanup code.
*/
int unregister_netdevice_notifier_net(struct net *net,
struct notifier_block *nb)
{
int err;
rtnl_lock();
err = __unregister_netdevice_notifier_net(net, nb);
rtnl_unlock();
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
int register_netdevice_notifier_dev_net(struct net_device *dev,
struct notifier_block *nb,
struct netdev_net_notifier *nn)
{
int err;
rtnl_lock();
err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
if (!err) {
nn->nb = nb;
list_add(&nn->list, &dev->net_notifier_list);
}
rtnl_unlock();
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
struct notifier_block *nb,
struct netdev_net_notifier *nn)
{
int err;
rtnl_lock();
list_del(&nn->list);
err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
rtnl_unlock();
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
static void move_netdevice_notifiers_dev_net(struct net_device *dev,
struct net *net)
{
struct netdev_net_notifier *nn;
list_for_each_entry(nn, &dev->net_notifier_list, list) {
__unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
__register_netdevice_notifier_net(net, nn->nb, true);
}
}
/**
* call_netdevice_notifiers_info - call all network notifier blocks
* @val: value passed unmodified to notifier function
* @info: notifier information data
*
* Call all network notifier blocks. Parameters and return value
* are as for raw_notifier_call_chain().
*/
static int call_netdevice_notifiers_info(unsigned long val,
struct netdev_notifier_info *info)
{
struct net *net = dev_net(info->dev);
int ret;
ASSERT_RTNL();
/* Run per-netns notifier block chain first, then run the global one.
* Hopefully, one day, the global one is going to be removed after
* all notifier block registrators get converted to be per-netns.
*/
ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
if (ret & NOTIFY_STOP_MASK)
return ret;
return raw_notifier_call_chain(&netdev_chain, val, info);
}
static int call_netdevice_notifiers_extack(unsigned long val,
struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct netdev_notifier_info info = {
.dev = dev,
.extack = extack,
};
return call_netdevice_notifiers_info(val, &info);
}
/**
* call_netdevice_notifiers - call all network notifier blocks
* @val: value passed unmodified to notifier function
* @dev: net_device pointer passed unmodified to notifier function
*
* Call all network notifier blocks. Parameters and return value
* are as for raw_notifier_call_chain().
*/
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
return call_netdevice_notifiers_extack(val, dev, NULL);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
/**
* call_netdevice_notifiers_mtu - call all network notifier blocks
* @val: value passed unmodified to notifier function
* @dev: net_device pointer passed unmodified to notifier function
* @arg: additional u32 argument passed to the notifier function
*
* Call all network notifier blocks. Parameters and return value
* are as for raw_notifier_call_chain().
*/
static int call_netdevice_notifiers_mtu(unsigned long val,
struct net_device *dev, u32 arg)
{
struct netdev_notifier_info_ext info = {
.info.dev = dev,
.ext.mtu = arg,
};
BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
return call_netdevice_notifiers_info(val, &info.info);
}
#ifdef CONFIG_NET_INGRESS
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
void net_inc_ingress_queue(void)
{
static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
void net_dec_ingress_queue(void)
{
static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
#ifdef CONFIG_NET_EGRESS
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
void net_inc_egress_queue(void)
{
static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);
void net_dec_egress_queue(void)
{
static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
static void netstamp_clear(struct work_struct *work)
{
int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
int wanted;
wanted = atomic_add_return(deferred, &netstamp_wanted);
if (wanted > 0)
static_branch_enable(&netstamp_needed_key);
else
static_branch_disable(&netstamp_needed_key);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
void net_enable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
int wanted;
while (1) {
wanted = atomic_read(&netstamp_wanted);
if (wanted <= 0)
break;
if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
return;
}
atomic_inc(&netstamp_needed_deferred);
schedule_work(&netstamp_work);
#else
static_branch_inc(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);
void net_disable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
int wanted;
while (1) {
wanted = atomic_read(&netstamp_wanted);
if (wanted <= 1)
break;
if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
return;
}
atomic_dec(&netstamp_needed_deferred);
schedule_work(&netstamp_work);
#else
static_branch_dec(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
skb->tstamp = 0;
if (static_branch_unlikely(&netstamp_needed_key))
__net_timestamp(skb);
}
#define net_timestamp_check(COND, SKB) \
if (static_branch_unlikely(&netstamp_needed_key)) { \
if ((COND) && !(SKB)->tstamp) \
__net_timestamp(SKB); \
} \
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
return __is_skb_forwardable(dev, skb, true);
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);
static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
bool check_mtu)
{
int ret = ____dev_forward_skb(dev, skb, check_mtu);
if (likely(!ret)) {
skb->protocol = eth_type_trans(skb, dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
}
return ret;
}
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
return __dev_forward_skb2(dev, skb, true);
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
* dev_forward_skb - loopback an skb to another netif
*
* @dev: destination network device
* @skb: buffer to forward
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped, but freed)
*
* dev_forward_skb can be used for injecting an skb from the
* start_xmit function of one device into the receive queue
* of another device.
*
* The receiving device may be in another namespace, so
* we have to clear all information in the skb that could
* impact namespace isolation.
*/
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
{
return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
}
static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
return -ENOMEM;
refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
static inline void deliver_ptype_list_skb(struct sk_buff *skb,
struct packet_type **pt,
struct net_device *orig_dev,
__be16 type,
struct list_head *ptype_list)
{
struct packet_type *ptype, *pt_prev = *pt;
list_for_each_entry_rcu(ptype, ptype_list, list) {
if (ptype->type != type)
continue;
if (pt_prev)
deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
*pt = pt_prev;
}
static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
if (!ptype->af_packet_priv || !skb->sk)
return false;
if (ptype->id_match) return ptype->id_match(ptype, skb->sk); else if ((struct sock *)ptype->af_packet_priv == skb->sk)
return true;
return false;
}
/**
* dev_nit_active - return true if any network interface taps are in use
*
* @dev: network device to check for the presence of taps
*/
bool dev_nit_active(struct net_device *dev)
{
return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
}
EXPORT_SYMBOL_GPL(dev_nit_active);
/*
* Support routine. Sends outgoing frames to any network
* taps currently in use.
*/
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
struct sk_buff *skb2 = NULL;
struct packet_type *pt_prev = NULL;
struct list_head *ptype_list = &ptype_all;
rcu_read_lock();
again:
list_for_each_entry_rcu(ptype, ptype_list, list) { if (ptype->ignore_outgoing)
continue;
/* Never send packets back to the socket
* they originated from - MvS (miquels@drinkel.ow.org)
*/
if (skb_loop_sk(ptype, skb))
continue;
if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev);
pt_prev = ptype;
continue;
}
/* need to clone skb, done only once */
skb2 = skb_clone(skb, GFP_ATOMIC);
if (!skb2)
goto out_unlock;
net_timestamp_set(skb2);
/* skb->nh should be correctly
* set by sender, so that the second statement is
* just protection against buggy protocols.
*/
skb_reset_mac_header(skb2);
if (skb_network_header(skb2) < skb2->data ||
skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
ntohs(skb2->protocol),
dev->name);
skb_reset_network_header(skb2);
}
skb2->transport_header = skb2->network_header;
skb2->pkt_type = PACKET_OUTGOING;
pt_prev = ptype;
}
if (ptype_list == &ptype_all) { ptype_list = &dev->ptype_all;
goto again;
}
out_unlock:
if (pt_prev) {
if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
else
kfree_skb(skb2);
}
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
/**
* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
* @dev: Network device
* @txq: number of queues available
*
* If real_num_tx_queues is changed the tc mappings may no longer be
* valid. To resolve this verify the tc mapping remains valid and if
* not NULL the mapping. With no priorities mapping to this
* offset/count pair it will no longer be used. In the worst case TC0
* is invalid nothing can be done so disable priority mappings. If is
* expected that drivers will fix this mapping if they can before
* calling netif_set_real_num_tx_queues.
*/
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
int i;
struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
/* If TC0 is invalidated disable TC mapping */
if (tc->offset + tc->count > txq) {
pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
dev->num_tc = 0;
return;
}
/* Invalidated prio to tc mappings set to TC0 */
for (i = 1; i < TC_BITMASK + 1; i++) {
int q = netdev_get_prio_tc_map(dev, i);
tc = &dev->tc_to_txq[q];
if (tc->offset + tc->count > txq) {
pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
i, q);
netdev_set_prio_tc_map(dev, i, 0);
}
}
}
int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
{
if (dev->num_tc) {
struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
int i;
/* walk through the TCs and see if it falls into any of them */
for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
if ((txq - tc->offset) < tc->count)
return i;
}
/* didn't find it, just return -1 to indicate no match */
return -1;
}
return 0;
}
EXPORT_SYMBOL(netdev_txq_to_tc);
#ifdef CONFIG_XPS
static struct static_key xps_needed __read_mostly;
static struct static_key xps_rxqs_needed __read_mostly;
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
struct xps_dev_maps *old_maps, int tci, u16 index)
{
struct xps_map *map = NULL;
int pos;
if (dev_maps)
map = xmap_dereference(dev_maps->attr_map[tci]);
if (!map)
return false;
for (pos = map->len; pos--;) {
if (map->queues[pos] != index)
continue;
if (map->len > 1) {
map->queues[pos] = map->queues[--map->len];
break;
}
if (old_maps)
RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
kfree_rcu(map, rcu);
return false;
}
return true;
}
static bool remove_xps_queue_cpu(struct net_device *dev,
struct xps_dev_maps *dev_maps,
int cpu, u16 offset, u16 count)
{
int num_tc = dev_maps->num_tc;
bool active = false;
int tci;
for (tci = cpu * num_tc; num_tc--; tci++) {
int i, j;
for (i = count, j = offset; i--; j++) {
if (!remove_xps_queue(dev_maps, NULL, tci, j))
break;
}
active |= i < 0;
}
return active;
}
static void reset_xps_maps(struct net_device *dev,
struct xps_dev_maps *dev_maps,
enum xps_map_type type)
{
static_key_slow_dec_cpuslocked(&xps_needed);
if (type == XPS_RXQS)
static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
RCU_INIT_POINTER(dev->xps_maps[type], NULL);
kfree_rcu(dev_maps, rcu);
}
static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
u16 offset, u16 count)
{
struct xps_dev_maps *dev_maps;
bool active = false;
int i, j;
dev_maps = xmap_dereference(dev->xps_maps[type]);
if (!dev_maps)
return;
for (j = 0; j < dev_maps->nr_ids; j++)
active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
if (!active)
reset_xps_maps(dev, dev_maps, type);
if (type == XPS_CPUS) {
for (i = offset + (count - 1); count--; i--)
netdev_queue_numa_node_write(
netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
}
}
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
u16 count)
{
if (!static_key_false(&xps_needed))
return;
cpus_read_lock();
mutex_lock(&xps_map_mutex);
if (static_key_false(&xps_rxqs_needed))
clean_xps_maps(dev, XPS_RXQS, offset, count);
clean_xps_maps(dev, XPS_CPUS, offset, count);
mutex_unlock(&xps_map_mutex);
cpus_read_unlock();
}
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}
static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
u16 index, bool is_rxqs_map)
{
struct xps_map *new_map;
int alloc_len = XPS_MIN_MAP_ALLOC;
int i, pos;
for (pos = 0; map && pos < map->len; pos++) {
if (map->queues[pos] != index)
continue;
return map;
}
/* Need to add tx-queue to this CPU's/rx-queue's existing map */
if (map) {
if (pos < map->alloc_len)
return map;
alloc_len = map->alloc_len * 2;
}
/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
* map
*/
if (is_rxqs_map)
new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
else
new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
cpu_to_node(attr_index));
if (!new_map)
return NULL;
for (i = 0; i < pos; i++)
new_map->queues[i] = map->queues[i];
new_map->alloc_len = alloc_len;
new_map->len = pos;
return new_map;
}
/* Copy xps maps at a given index */
static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
struct xps_dev_maps *new_dev_maps, int index,
int tc, bool skip_tc)
{
int i, tci = index * dev_maps->num_tc;
struct xps_map *map;
/* copy maps belonging to foreign traffic classes */
for (i = 0; i < dev_maps->num_tc; i++, tci++) {
if (i == tc && skip_tc)
continue;
/* fill in the new device map from the old device map */
map = xmap_dereference(dev_maps->attr_map[tci]);
RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
}
/* Must be called under cpus_read_lock */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
u16 index, enum xps_map_type type)
{
struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
const unsigned long *online_mask = NULL;
bool active = false, copy = false;
int i, j, tci, numa_node_id = -2;
int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
unsigned int nr_ids;
if (dev->num_tc) {
/* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
if (num_tc < 0)
return -EINVAL;
/* If queue belongs to subordinate dev use its map */
dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
}
mutex_lock(&xps_map_mutex);
dev_maps = xmap_dereference(dev->xps_maps[type]);
if (type == XPS_RXQS) {
maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
nr_ids = dev->num_rx_queues;
} else {
maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
if (num_possible_cpus() > 1)
online_mask = cpumask_bits(cpu_online_mask);
nr_ids = nr_cpu_ids;
}
if (maps_sz < L1_CACHE_BYTES)
maps_sz = L1_CACHE_BYTES;
/* The old dev_maps could be larger or smaller than the one we're
* setting up now, as dev->num_tc or nr_ids could have been updated in
* between. We could try to be smart, but let's be safe instead and only
* copy foreign traffic classes if the two map sizes match.
*/
if (dev_maps &&
dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
copy = true;
/* allocate memory for queue storage */
for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
j < nr_ids;) {
if (!new_dev_maps) {
new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
mutex_unlock(&xps_map_mutex);
return -ENOMEM;
}
new_dev_maps->nr_ids = nr_ids;
new_dev_maps->num_tc = num_tc;
}
tci = j * num_tc + tc;
map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
map = expand_xps_map(map, j, index, type == XPS_RXQS);
if (!map)
goto error;
RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
if (!new_dev_maps)
goto out_no_new_maps;
if (!dev_maps) {
/* Increment static keys at most once per type */
static_key_slow_inc_cpuslocked(&xps_needed);
if (type == XPS_RXQS)
static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
}
for (j = 0; j < nr_ids; j++) {
bool skip_tc = false;
tci = j * num_tc + tc;
if (netif_attr_test_mask(j, mask, nr_ids) &&
netif_attr_test_online(j, online_mask, nr_ids)) {
/* add tx-queue to CPU/rx-queue maps */
int pos = 0;
skip_tc = true;
map = xmap_dereference(new_dev_maps->attr_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
if (pos == map->len)
map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
if (type == XPS_CPUS) {
if (numa_node_id == -2)
numa_node_id = cpu_to_node(j);
else if (numa_node_id != cpu_to_node(j))
numa_node_id = -1;
}
#endif
}
if (copy)
xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
skip_tc);
}
rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
/* Cleanup old maps */
if (!dev_maps)
goto out_no_old_maps;
for (j = 0; j < dev_maps->nr_ids; j++) {
for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
map = xmap_dereference(dev_maps->attr_map[tci]);
if (!map)
continue;
if (copy) {
new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
if (map == new_map)
continue;
}
RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
kfree_rcu(map, rcu);
}
}
old_dev_maps = dev_maps;
out_no_old_maps:
dev_maps = new_dev_maps;
active = true;
out_no_new_maps:
if (type == XPS_CPUS)
/* update Tx queue numa node */
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
(numa_node_id >= 0) ?
numa_node_id : NUMA_NO_NODE);
if (!dev_maps)
goto out_no_maps;
/* removes tx-queue from unused CPUs/rx-queues */
for (j = 0; j < dev_maps->nr_ids; j++) {
tci = j * dev_maps->num_tc;
for (i = 0; i < dev_maps->num_tc; i++, tci++) {
if (i == tc &&
netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
continue;
active |= remove_xps_queue(dev_maps,
copy ? old_dev_maps : NULL,
tci, index);
}
}
if (old_dev_maps)
kfree_rcu(old_dev_maps, rcu);
/* free map if not active */
if (!active)
reset_xps_maps(dev, dev_maps, type);
out_no_maps:
mutex_unlock(&xps_map_mutex);
return 0;
error:
/* remove any maps that we added */
for (j = 0; j < nr_ids; j++) {
for (i = num_tc, tci = j * num_tc; i--; tci++) {
new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
map = copy ?
xmap_dereference(dev_maps->attr_map[tci]) :
NULL;
if (new_map && new_map != map)
kfree(new_map);
}
}
mutex_unlock(&xps_map_mutex);
kfree(new_dev_maps);
return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index)
{
int ret;
cpus_read_lock();
ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL(netif_set_xps_queue);
#endif
static void netdev_unbind_all_sb_channels(struct net_device *dev)
{
struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
/* Unbind any subordinate channels */
while (txq-- != &dev->_tx[0]) {
if (txq->sb_dev)
netdev_unbind_sb_channel(dev, txq->sb_dev);
}
}
void netdev_reset_tc(struct net_device *dev)
{
#ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, 0);
#endif
netdev_unbind_all_sb_channels(dev);
/* Reset TC configuration of device */
dev->num_tc = 0;
memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
}
EXPORT_SYMBOL(netdev_reset_tc);
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
{
if (tc >= dev->num_tc)
return -EINVAL;
#ifdef CONFIG_XPS
netif_reset_xps_queues(dev, offset, count);
#endif
dev->tc_to_txq[tc].count = count;
dev->tc_to_txq[tc].offset = offset;
return 0;
}
EXPORT_SYMBOL(netdev_set_tc_queue);
int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
{
if (num_tc > TC_MAX_QUEUE)
return -EINVAL;
#ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, 0);
#endif
netdev_unbind_all_sb_channels(dev);
dev->num_tc = num_tc;
return 0;
}
EXPORT_SYMBOL(netdev_set_num_tc);
void netdev_unbind_sb_channel(struct net_device *dev,
struct net_device *sb_dev)
{
struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
#ifdef CONFIG_XPS
netif_reset_xps_queues_gt(sb_dev, 0);
#endif
memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
while (txq-- != &dev->_tx[0]) {
if (txq->sb_dev == sb_dev)
txq->sb_dev = NULL;
}
}
EXPORT_SYMBOL(netdev_unbind_sb_channel);
int netdev_bind_sb_channel_queue(struct net_device *dev,
struct net_device *sb_dev,
u8 tc, u16 count, u16 offset)
{
/* Make certain the sb_dev and dev are already configured */
if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
return -EINVAL;
/* We cannot hand out queues we don't have */
if ((offset + count) > dev->real_num_tx_queues)
return -EINVAL;
/* Record the mapping */
sb_dev->tc_to_txq[tc].count = count;
sb_dev->tc_to_txq[tc].offset = offset;
/* Provide a way for Tx queue to find the tc_to_txq map or
* XPS map for itself.
*/
while (count--)
netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
return 0;
}
EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
int netdev_set_sb_channel(struct net_device *dev, u16 channel)
{
/* Do not use a multiqueue device to represent a subordinate channel */
if (netif_is_multiqueue(dev))
return -ENODEV;
/* We allow channels 1 - 32767 to be used for subordinate channels.
* Channel 0 is meant to be "native" mode and used only to represent
* the main root device. We allow writing 0 to reset the device back
* to normal mode after being used as a subordinate channel.
*/
if (channel > S16_MAX)
return -EINVAL;
dev->num_tc = -channel;
return 0;
}
EXPORT_SYMBOL(netdev_set_sb_channel);
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
*/
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
bool disabling;
int rc;
disabling = txq < dev->real_num_tx_queues;
if (txq < 1 || txq > dev->num_tx_queues)
return -EINVAL;
if (dev->reg_state == NETREG_REGISTERED ||
dev->reg_state == NETREG_UNREGISTERING) {
ASSERT_RTNL();
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
txq);
if (rc)
return rc;
if (dev->num_tc)
netif_setup_tc(dev, txq);
dev_qdisc_change_real_num_tx(dev, txq);
dev->real_num_tx_queues = txq;
if (disabling) {
synchronize_net();
qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, txq);
#endif
}
} else {
dev->real_num_tx_queues = txq;
}
return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
#ifdef CONFIG_SYSFS
/**
* netif_set_real_num_rx_queues - set actual number of RX queues used
* @dev: Network device
* @rxq: Actual number of RX queues
*
* This must be called either with the rtnl_lock held or before
* registration of the net device. Returns 0 on success, or a
* negative error code. If called before registration, it always
* succeeds.
*/
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
int rc;
if (rxq < 1 || rxq > dev->num_rx_queues)
return -EINVAL;
if (dev->reg_state == NETREG_REGISTERED) {
ASSERT_RTNL();
rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
rxq);
if (rc)
return rc;
}
dev->real_num_rx_queues = rxq;
return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
/**
* netif_set_real_num_queues - set actual number of RX and TX queues used
* @dev: Network device
* @txq: Actual number of TX queues
* @rxq: Actual number of RX queues
*
* Set the real number of both TX and RX queues.
* Does nothing if the number of queues is already correct.
*/
int netif_set_real_num_queues(struct net_device *dev,
unsigned int txq, unsigned int rxq)
{
unsigned int old_rxq = dev->real_num_rx_queues;
int err;
if (txq < 1 || txq > dev->num_tx_queues ||
rxq < 1 || rxq > dev->num_rx_queues)
return -EINVAL;
/* Start from increases, so the error path only does decreases -
* decreases can't fail.
*/
if (rxq > dev->real_num_rx_queues) {
err = netif_set_real_num_rx_queues(dev, rxq);
if (err)
return err;
}
if (txq > dev->real_num_tx_queues) {
err = netif_set_real_num_tx_queues(dev, txq);
if (err)
goto undo_rx;
}
if (rxq < dev->real_num_rx_queues)
WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
if (txq < dev->real_num_tx_queues)
WARN_ON(netif_set_real_num_tx_queues(dev, txq));
return 0;
undo_rx:
WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
return err;
}
EXPORT_SYMBOL(netif_set_real_num_queues);
/**
* netif_get_num_default_rss_queues - default number of RSS queues
*
* This routine should set an upper limit on the number of RSS queues
* used by default by multiqueue devices.
*/
int netif_get_num_default_rss_queues(void)
{
return is_kdump_kernel() ?
1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
static void __netif_reschedule(struct Qdisc *q)
{
struct softnet_data *sd;
unsigned long flags;
local_irq_save(flags);
sd = this_cpu_ptr(&softnet_data);
q->next_sched = NULL;
*sd->output_queue_tailp = q;
sd->output_queue_tailp = &q->next_sched;
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}
void __netif_schedule(struct Qdisc *q)
{
if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);
struct dev_kfree_skb_cb {
enum skb_free_reason reason;
};
static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
return (struct dev_kfree_skb_cb *)skb->cb;
}
void netif_schedule_queue(struct netdev_queue *txq)
{
rcu_read_lock();
if (!netif_xmit_stopped(txq)) {
struct Qdisc *q = rcu_dereference(txq->qdisc);
__netif_schedule(q);
}
rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);
void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
struct Qdisc *q;
rcu_read_lock();
q = rcu_dereference(dev_queue->qdisc);
__netif_schedule(q);
rcu_read_unlock();
}
}
EXPORT_SYMBOL(netif_tx_wake_queue);
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
unsigned long flags;
if (unlikely(!skb))
return;
if (likely(refcount_read(&skb->users) == 1)) {
smp_rmb();
refcount_set(&skb->users, 0);
} else if (likely(!refcount_dec_and_test(&skb->users))) {
return;
}
get_kfree_skb_cb(skb)->reason = reason;
local_irq_save(flags);
skb->next = __this_cpu_read(softnet_data.completion_queue);
__this_cpu_write(softnet_data.completion_queue, skb);
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
if (in_hardirq() || irqs_disabled())
__dev_kfree_skb_irq(skb, reason);
else
dev_kfree_skb(skb);
}
EXPORT_SYMBOL(__dev_kfree_skb_any);
/**
* netif_device_detach - mark device as removed
* @dev: network device
*
* Mark device as removed from system and therefore no longer available.
*/
void netif_device_detach(struct net_device *dev)
{
if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
netif_tx_stop_all_queues(dev);
}
}
EXPORT_SYMBOL(netif_device_detach);
/**
* netif_device_attach - mark device as attached
* @dev: network device
*
* Mark device as attached from system and restart if needed.
*/
void netif_device_attach(struct net_device *dev)
{
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
netif_tx_wake_all_queues(dev);
__netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_device_attach);
/*
* Returns a Tx hash based on the given packet descriptor a Tx queues' number
* to be used as a distribution range.
*/
static u16 skb_tx_hash(const struct net_device *dev,
const struct net_device *sb_dev,
struct sk_buff *skb)
{
u32 hash;
u16 qoffset = 0;
u16 qcount = dev->real_num_tx_queues;
if (dev->num_tc) {
u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
qoffset = sb_dev->tc_to_txq[tc].offset;
qcount = sb_dev->tc_to_txq[tc].count;
if (unlikely(!qcount)) {
net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
sb_dev->name, qoffset, tc);
qoffset = 0;
qcount = dev->real_num_tx_queues;
}
}
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
if (hash >= qoffset)
hash -= qoffset;
while (unlikely(hash >= qcount))
hash -= qcount;
return hash + qoffset;
}
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
static const netdev_features_t null_features;
struct net_device *dev = skb->dev;
const char *name = "";
if (!net_ratelimit())
return;
if (dev) {
if (dev->dev.parent)
name = dev_driver_string(dev->dev.parent);
else
name = netdev_name(dev);
}
skb_dump(KERN_WARNING, skb, false);
WARN(1, "%s: caps=(%pNF, %pNF)\n",
name, dev ? &dev->features : &null_features,
skb->sk ? &skb->sk->sk_route_caps : &null_features);
}
/*
* Invalidate hardware checksum when packet is to be mangled, and
* complete checksum manually on outgoing path.
*/
int skb_checksum_help(struct sk_buff *skb)
{
__wsum csum;
int ret = 0, offset;
if (skb->ip_summed == CHECKSUM_COMPLETE)
goto out_set_summed;
if (unlikely(skb_is_gso(skb))) {
skb_warn_bad_offload(skb);
return -EINVAL;
}
/* Before computing a checksum, we should make sure no frag could
* be modified by an external entity : checksum could be wrong.
*/
if (skb_has_shared_frag(skb)) {
ret = __skb_linearize(skb);
if (ret)
goto out;
}
offset = skb_checksum_start_offset(skb);
BUG_ON(offset >= skb_headlen(skb)); csum = skb_checksum(skb, offset, skb->len - offset, 0);
offset += skb->csum_offset;
BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
if (ret)
goto out;
*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
skb->ip_summed = CHECKSUM_NONE;
out:
return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
int skb_crc32c_csum_help(struct sk_buff *skb)
{
__le32 crc32c_csum;
int ret = 0, offset, start;
if (skb->ip_summed != CHECKSUM_PARTIAL)
goto out;
if (unlikely(skb_is_gso(skb)))
goto out;
/* Before computing a checksum, we should make sure no frag could
* be modified by an external entity : checksum could be wrong.
*/
if (unlikely(skb_has_shared_frag(skb))) {
ret = __skb_linearize(skb);
if (ret)
goto out;
}
start = skb_checksum_start_offset(skb);
offset = start + offsetof(struct sctphdr, checksum);
if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
ret = -EINVAL;
goto out;
}
ret = skb_ensure_writable(skb, offset + sizeof(__le32));
if (ret)
goto out;
crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
skb->len - start, ~(__u32)0,
crc32c_csum_stub));
*(__le32 *)(skb->data + offset) = crc32c_csum;
skb->ip_summed = CHECKSUM_NONE;
skb->csum_not_inet = 0;
out:
return ret;
}
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
__be16 type = skb->protocol;
/* Tunnel gso handlers can set protocol to ethernet. */
if (type == htons(ETH_P_TEB)) {
struct ethhdr *eth;
if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
return 0;
eth = (struct ethhdr *)skb->data;
type = eth->h_proto;
}
return __vlan_get_protocol(skb, type, depth);
}
/**
* skb_mac_gso_segment - mac layer segmentation handler.
* @skb: buffer to segment
* @features: features for the output path (see dev->features)
*/
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_offload *ptype;
int vlan_depth = skb->mac_len;
__be16 type = skb_network_protocol(skb, &vlan_depth);
if (unlikely(!type))
return ERR_PTR(-EINVAL);
__skb_pull(skb, vlan_depth);
rcu_read_lock();
list_for_each_entry_rcu(ptype, &offload_base, list) {
if (ptype->type == type && ptype->callbacks.gso_segment) {
segs = ptype->callbacks.gso_segment(skb, features);
break;
}
}
rcu_read_unlock();
__skb_push(skb, skb->data - skb_mac_header(skb));
return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);
/* openvswitch calls this on rx path, so we need a different check.
*/
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
if (tx_path)
return skb->ip_summed != CHECKSUM_PARTIAL &&
skb->ip_summed != CHECKSUM_UNNECESSARY;
return skb->ip_summed == CHECKSUM_NONE;
}
/**
* __skb_gso_segment - Perform segmentation on skb.
* @skb: buffer to segment
* @features: features for the output path (see dev->features)
* @tx_path: whether it is called in TX path
*
* This function segments the given skb and returns a list of segments.
*
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
*
* Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
*/
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path)
{
struct sk_buff *segs;
if (unlikely(skb_needs_check(skb, tx_path))) {
int err;
/* We're going to init ->check field in TCP or UDP header */
err = skb_cow_head(skb, 0);
if (err < 0)
return ERR_PTR(err);
}
/* Only report GSO partial support if it will enable us to
* support segmentation on this frame without needing additional
* work.
*/
if (features & NETIF_F_GSO_PARTIAL) {
netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
struct net_device *dev = skb->dev;
partial_features |= dev->features & dev->gso_partial_features;
if (!skb_gso_ok(skb, features | partial_features))
features &= ~NETIF_F_GSO_PARTIAL;
}
BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
SKB_GSO_CB(skb)->encap_level = 0;
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);
segs = skb_mac_gso_segment(skb, features);
if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
skb_warn_bad_offload(skb);
return segs;
}
EXPORT_SYMBOL(__skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
skb_dump(KERN_ERR, skb, true);
dump_stack();
}
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
/* XXX: check that highmem exists at all on the given machine. */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
int i;
if (!(dev->features & NETIF_F_HIGHDMA)) {
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (PageHighMem(skb_frag_page(frag)))
return 1;
}
}
#endif
return 0;
}
/* If MPLS offload request, verify we are testing hardware MPLS features
* instead of standard features for the netdev.
*/
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
netdev_features_t features,
__be16 type)
{
if (eth_p_mpls(type))
features &= skb->dev->mpls_features;
return features;
}
#else
static netdev_features_t net_mpls_features(struct sk_buff *skb,
netdev_features_t features,
__be16 type)
{
return features;
}
#endif
static netdev_features_t harmonize_features(struct sk_buff *skb,
netdev_features_t features)
{
__be16 type;
type = skb_network_protocol(skb, NULL);
features = net_mpls_features(skb, features, type);
if (skb->ip_summed != CHECKSUM_NONE &&
!can_checksum_protocol(features, type)) {
features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
}
if (illegal_highdma(skb->dev, skb))
features &= ~NETIF_F_SG;
return features;
}
netdev_features_t passthru_features_check(struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features)
{
return features;
}
EXPORT_SYMBOL(passthru_features_check);
static netdev_features_t dflt_features_check(struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features)
{
return vlan_features_check(skb, features);
}
static netdev_features_t gso_features_check(const struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features)
{
u16 gso_segs = skb_shinfo(skb)->gso_segs;
if (gso_segs > dev->gso_max_segs)
return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { skb_warn_bad_offload(skb);
return features & ~NETIF_F_GSO_MASK;
}
/* Support for GSO partial features requires software
* intervention before we can actually process the packets
* so we need to strip support for any partial features now
* and we can pull them back in after we have partially
* segmented the frame.
*/
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
features &= ~dev->gso_partial_features;
/* Make sure to clear the IPv4 ID mangling feature if the
* IPv4 header has the potential to be fragmented.
*/
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
struct iphdr *iph = skb->encapsulation ?
inner_ip_hdr(skb) : ip_hdr(skb);
if (!(iph->frag_off & htons(IP_DF)))
features &= ~NETIF_F_TSO_MANGLEID;
}
return features;
}
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
netdev_features_t features = dev->features;
if (skb_is_gso(skb)) features = gso_features_check(skb, dev, features);
/* If encapsulation offload request, verify we are testing
* hardware encapsulation features instead of standard
* features for the netdev
*/
if (skb->encapsulation) features &= dev->hw_enc_features;
if (skb_vlan_tagged(skb))
features = netdev_intersect_features(features,
dev->vlan_features |
NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX);
if (dev->netdev_ops->ndo_features_check) features &= dev->netdev_ops->ndo_features_check(skb, dev,
features);
else
features &= dflt_features_check(skb, dev, features);
return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq, bool more)
{
unsigned int len;
int rc;
if (dev_nit_active(dev))
dev_queue_xmit_nit(skb, dev); len = skb->len;
PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
trace_net_dev_start_xmit(skb, dev);
rc = netdev_start_xmit(skb, dev, txq, more);
trace_net_dev_xmit(skb, rc, dev, len);
return rc;
}
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
struct netdev_queue *txq, int *ret)
{
struct sk_buff *skb = first;
int rc = NETDEV_TX_OK;
while (skb) { struct sk_buff *next = skb->next;
skb_mark_not_on_list(skb);
rc = xmit_one(skb, dev, txq, next != NULL);
if (unlikely(!dev_xmit_complete(rc))) {
skb->next = next;
goto out;
}
skb = next;
if (netif_tx_queue_stopped(txq) && skb) {
rc = NETDEV_TX_BUSY;
break;
}
}
out:
*ret = rc;
return skb;
}
static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
netdev_features_t features)
{
if (skb_vlan_tag_present(skb) &&
!vlan_hw_offload_capable(features, skb->vlan_proto))
skb = __vlan_hwaccel_push_inside(skb);
return skb;
}
int skb_csum_hwoffload_help(struct sk_buff *skb,
const netdev_features_t features)
{
if (unlikely(skb_csum_is_sctp(skb)))
return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb); if (features & NETIF_F_HW_CSUM)
return 0;
if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { switch (skb->csum_offset) {
case offsetof(struct tcphdr, check):
case offsetof(struct udphdr, check):
return 0;
}
}
return skb_checksum_help(skb);
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
{
netdev_features_t features;
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
goto out_null;
skb = sk_validate_xmit_skb(skb, dev);
if (unlikely(!skb))
goto out_null;
if (netif_needs_gso(skb, features)) {
struct sk_buff *segs;
segs = skb_gso_segment(skb, features);
if (IS_ERR(segs)) {
goto out_kfree_skb;
} else if (segs) { consume_skb(skb);
skb = segs;
}
} else {
if (skb_needs_linearize(skb, features) &&
__skb_linearize(skb))
goto out_kfree_skb;
/* If packet is not checksummed and device does not
* support checksumming for this protocol, complete
* checksumming here.
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) { if (skb->encapsulation)
skb_set_inner_transport_header(skb,
skb_checksum_start_offset(skb));
else
skb_set_transport_header(skb,
skb_checksum_start_offset(skb));
if (skb_csum_hwoffload_help(skb, features))
goto out_kfree_skb;
}
}
skb = validate_xmit_xfrm(skb, features, again);
return skb;
out_kfree_skb:
kfree_skb(skb);
out_null:
atomic_long_inc(&dev->tx_dropped); return NULL;
}
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
{
struct sk_buff *next, *head = NULL, *tail;
for (; skb != NULL; skb = next) {
next = skb->next;
skb_mark_not_on_list(skb);
/* in case skb wont be segmented, point to itself */
skb->prev = skb;
skb = validate_xmit_skb(skb, dev, again);
if (!skb)
continue;
if (!head)
head = skb;
else
tail->next = skb;
/* If skb was segmented, skb->prev points to
* the last segment. If not, it still contains skb.
*/
tail = skb->prev;
}
return head;
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
const struct skb_shared_info *shinfo = skb_shinfo(skb);
qdisc_skb_cb(skb)->pkt_len = skb->len;
/* To get more precise estimation of bytes sent on wire,
* we add to pkt_len the headers size of all segments
*/
if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
unsigned int hdr_len;
u16 gso_segs = shinfo->gso_segs;
/* mac layer + network layer */
hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
/* + transport layer */
if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
const struct tcphdr *th;
struct tcphdr _tcphdr;
th = skb_header_pointer(skb, skb_transport_offset(skb),
sizeof(_tcphdr), &_tcphdr);
if (likely(th))
hdr_len += __tcp_hdrlen(th);
} else {
struct udphdr _udphdr;
if (skb_header_pointer(skb, skb_transport_offset(skb),
sizeof(_udphdr), &_udphdr))
hdr_len += sizeof(struct udphdr);
}
if (shinfo->gso_type & SKB_GSO_DODGY) gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
shinfo->gso_size);
qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
}
static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
struct sk_buff **to_free,
struct netdev_queue *txq)
{
int rc;
rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
if (rc == NET_XMIT_SUCCESS)
trace_qdisc_enqueue(q, txq, skb);
return rc;
}
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
{
spinlock_t *root_lock = qdisc_lock(q);
struct sk_buff *to_free = NULL;
bool contended;
int rc;
qdisc_calculate_pkt_len(skb, q);
if (q->flags & TCQ_F_NOLOCK) { if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
qdisc_run_begin(q)) {
/* Retest nolock_qdisc_is_empty() within the protection
* of q->seqlock to protect from racing with requeuing.
*/
if (unlikely(!nolock_qdisc_is_empty(q))) {
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
__qdisc_run(q);
qdisc_run_end(q);
goto no_lock_out;
}
qdisc_bstats_cpu_update(q, skb);
if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
!nolock_qdisc_is_empty(q))
__qdisc_run(q);
qdisc_run_end(q);
return NET_XMIT_SUCCESS;
}
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
qdisc_run(q);
no_lock_out:
if (unlikely(to_free)) kfree_skb_list(to_free);
return rc;
}
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
* This permits qdisc->running owner to get the lock more
* often and dequeue packets faster.
*/
contended = qdisc_is_running(q);
if (unlikely(contended))
spin_lock(&q->busylock);
spin_lock(root_lock);
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
__qdisc_drop(skb, &to_free);
rc = NET_XMIT_DROP;
} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
qdisc_run_begin(q)) {
/*
* This is a work-conserving queue; there are no old skbs
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
qdisc_bstats_update(q, skb);
if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
__qdisc_run(q);
}
qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
contended = false;
}
__qdisc_run(q);
qdisc_run_end(q);
}
}
spin_unlock(root_lock);
if (unlikely(to_free))
kfree_skb_list(to_free); if (unlikely(contended))
spin_unlock(&q->busylock);
return rc;
}
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
static void skb_update_prio(struct sk_buff *skb)
{
const struct netprio_map *map;
const struct sock *sk;
unsigned int prioidx;
if (skb->priority)
return;
map = rcu_dereference_bh(skb->dev->priomap);
if (!map)
return;
sk = skb_to_full_sk(skb);
if (!sk)
return;
prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
if (prioidx < map->priomap_len)
skb->priority = map->priomap[prioidx];
}
#else
#define skb_update_prio(skb)
#endif
/**
* dev_loopback_xmit - loop back @skb
* @net: network namespace this loopback is happening in
* @sk: sk needed to be a netfilter okfn
* @skb: buffer to transmit
*/
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
skb_reset_mac_header(skb);
__skb_pull(skb, skb_network_offset(skb));
skb->pkt_type = PACKET_LOOPBACK;
if (skb->ip_summed == CHECKSUM_NONE)
skb->ip_summed = CHECKSUM_UNNECESSARY; WARN_ON(!skb_dst(skb));
skb_dst_force(skb);
netif_rx_ni(skb);
return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
#ifdef CONFIG_NET_EGRESS
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
struct tcf_result cl_res;
if (!miniq)
return skb;
/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
tc_skb_cb(skb)->mru = 0;
tc_skb_cb(skb)->post_ct = false;
mini_qdisc_bstats_cpu_update(miniq, skb);
switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
break;
case TC_ACT_SHOT:
mini_qdisc_qstats_cpu_drop(miniq);
*ret = NET_XMIT_DROP;
kfree_skb(skb);
return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
*ret = NET_XMIT_SUCCESS;
consume_skb(skb);
return NULL;
case TC_ACT_REDIRECT:
/* No need to push/pop skb's mac_header here on egress! */
skb_do_redirect(skb);
*ret = NET_XMIT_SUCCESS;
return NULL;
default:
break;
}
return skb;
}
#endif /* CONFIG_NET_EGRESS */
#ifdef CONFIG_XPS
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
struct xps_dev_maps *dev_maps, unsigned int tci)
{
int tc = netdev_get_prio_tc_map(dev, skb->priority);
struct xps_map *map;
int queue_index = -1;
if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
return queue_index;
tci *= dev_maps->num_tc;
tci += tc;
map = rcu_dereference(dev_maps->attr_map[tci]);
if (map) {
if (map->len == 1)
queue_index = map->queues[0];
else
queue_index = map->queues[reciprocal_scale(
skb_get_hash(skb), map->len)];
if (unlikely(queue_index >= dev->real_num_tx_queues))
queue_index = -1;
}
return queue_index;
}
#endif
static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
struct sk_buff *skb)
{
#ifdef CONFIG_XPS
struct xps_dev_maps *dev_maps;
struct sock *sk = skb->sk;
int queue_index = -1;
if (!static_key_false(&xps_needed))
return -1;
rcu_read_lock();
if (!static_key_false(&xps_rxqs_needed))
goto get_cpus_map;
dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
if (dev_maps) {
int tci = sk_rx_queue_get(sk);
if (tci >= 0)
queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
tci);
}
get_cpus_map:
if (queue_index < 0) {
dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
if (dev_maps) {
unsigned int tci = skb->sender_cpu - 1;
queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
tci);
}
}
rcu_read_unlock();
return queue_index;
#else
return -1;
#endif
}
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev)
{
return 0;
}
EXPORT_SYMBOL(dev_pick_tx_zero);
u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev)
{
return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
}
EXPORT_SYMBOL(dev_pick_tx_cpu_id);
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev)
{
struct sock *sk = skb->sk;
int queue_index = sk_tx_queue_get(sk);
sb_dev = sb_dev ? : dev;
if (queue_index < 0 || skb->ooo_okay ||
queue_index >= dev->real_num_tx_queues) {
int new_index = get_xps_queue(dev, sb_dev, skb);
if (new_index < 0)
new_index = skb_tx_hash(dev, sb_dev, skb);
if (queue_index != new_index && sk &&
sk_fullsock(sk) &&
rcu_access_pointer(sk->sk_dst_cache))
sk_tx_queue_set(sk, new_index);
queue_index = new_index;
}
return queue_index;
}
EXPORT_SYMBOL(netdev_pick_tx);
struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
struct sk_buff *skb,
struct net_device *sb_dev)
{
int queue_index = 0;
#ifdef CONFIG_XPS
u32 sender_cpu = skb->sender_cpu - 1;
if (sender_cpu >= (u32)NR_CPUS)
skb->sender_cpu = raw_smp_processor_id() + 1;
#endif
if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_select_queue)
queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
else
queue_index = netdev_pick_tx(dev, skb, sb_dev);
queue_index = netdev_cap_txqueue(dev, queue_index);
}
skb_set_queue_mapping(skb, queue_index);
return netdev_get_tx_queue(dev, queue_index);
}
/**
* __dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
* @sb_dev: suboordinate device used for L2 forwarding offload
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
* this function. The function can be called from an interrupt.
*
* A negative errno code is returned on a failure. A success does not
* guarantee the frame will be transmitted as it may be dropped due
* to congestion or traffic shaping.
*
* -----------------------------------------------------------------------------------
* I notice this method can also return errors from the queue disciplines,
* including NET_XMIT_DROP, which is a positive value. So, errors can also
* be positive.
*
* Regardless of the return value, the skb is consumed, so it is currently
* difficult to retry a send to this method. (You can bump the ref count
* before sending to hold a reference for retry if you are careful.)
*
* When calling this method, interrupts MUST be enabled. This is because
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
struct Qdisc *q;
int rc = -ENOMEM;
bool again = false;
skb_reset_mac_header(skb);
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
rcu_read_lock_bh();
skb_update_prio(skb);
qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
if (static_branch_unlikely(&egress_needed_key)) {
skb = sch_handle_egress(skb, &rc, dev);
if (!skb)
goto out;
}
# endif
#endif
/* If device/qdisc don't need skb->dst, release it right now while
* its hot in this cpu cache.
*/
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
else
skb_dst_force(skb);
txq = netdev_core_pick_tx(dev, skb, sb_dev);
q = rcu_dereference_bh(txq->qdisc);
trace_net_dev_queue(skb);
if (q->enqueue) {
rc = __dev_xmit_skb(skb, q, dev, txq);
goto out;
}
/* The device has no queue. Common case for software devices:
* loopback, all the sorts of tunnels...
* Really, it is unlikely that netif_tx_lock protection is necessary
* here. (f.e. loopback and IP tunnels are clean ignoring statistics
* counters.)
* However, it is possible, that they rely on protection
* made by us here.
* Check this and shot the lock. It is not prone from deadlocks.
*Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */
/* Other cpus might concurrently change txq->xmit_lock_owner
* to -1 or to their cpu id, but not to our id.
*/
if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
if (dev_xmit_recursion())
goto recursion_alert;
skb = validate_xmit_skb(skb, dev, &again);
if (!skb)
goto out;
PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
dev_xmit_recursion_inc();
skb = dev_hard_start_xmit(skb, dev, txq, &rc);
dev_xmit_recursion_dec();
if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq);
goto out;
}
}
HARD_TX_UNLOCK(dev, txq); net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
dev->name);
} else {
/* Recursion is detected! It is possible,
* unfortunately
*/
recursion_alert:
net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
dev->name);
}
}
rc = -ENETDOWN;
rcu_read_unlock_bh();
atomic_long_inc(&dev->tx_dropped);
kfree_skb_list(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
int dev_queue_xmit(struct sk_buff *skb)
{
return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
{
return __dev_queue_xmit(skb, sb_dev);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
struct net_device *dev = skb->dev;
struct sk_buff *orig_skb = skb;
struct netdev_queue *txq;
int ret = NETDEV_TX_BUSY;
bool again = false;
if (unlikely(!netif_running(dev) ||
!netif_carrier_ok(dev)))
goto drop;
skb = validate_xmit_skb_list(skb, dev, &again);
if (skb != orig_skb)
goto drop;
skb_set_queue_mapping(skb, queue_id);
txq = skb_get_tx_queue(dev, skb);
PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
local_bh_disable();
dev_xmit_recursion_inc();
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_frozen_or_drv_stopped(txq))
ret = netdev_start_xmit(skb, dev, txq, false);
HARD_TX_UNLOCK(dev, txq);
dev_xmit_recursion_dec();
local_bh_enable();
return ret;
drop:
atomic_long_inc(&dev->tx_dropped);
kfree_skb_list(skb);
return NET_XMIT_DROP;
}
EXPORT_SYMBOL(__dev_direct_xmit);
/*************************************************************************
* Receiver routines
*************************************************************************/
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
int weight_p __read_mostly = 64; /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
int dev_rx_weight __read_mostly = 64;
int dev_tx_weight __read_mostly = 64;
/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
int gro_normal_batch __read_mostly = 8;
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
struct task_struct *thread;
if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
/* Paired with smp_mb__before_atomic() in
* napi_enable()/dev_set_threaded().
* Use READ_ONCE() to guarantee a complete
* read on napi->thread. Only call
* wake_up_process() when it's not NULL.
*/
thread = READ_ONCE(napi->thread);
if (thread) {
/* Avoid doing set_bit() if the thread is in
* INTERRUPTIBLE state, cause napi_thread_wait()
* makes sure to proceed with napi polling
* if the thread is explicitly woken from here.
*/
if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
wake_up_process(thread);
return;
}
}
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
#ifdef CONFIG_RPS
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
{
if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *old_rflow;
u32 flow_id;
u16 rxq_index;
int rc;
/* Should we steer this flow to a different hardware queue? */
if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
!(dev->features & NETIF_F_NTUPLE))
goto out;
rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
if (rxq_index == skb_get_rx_queue(skb))
goto out;
rxqueue = dev->_rx + rxq_index;
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
flow_id = skb_get_hash(skb) & flow_table->mask;
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
goto out;
old_rflow = rflow;
rflow = &flow_table->flows[flow_id];
rflow->filter = rc;
if (old_rflow->filter == rflow->filter)
old_rflow->filter = RPS_NO_FILTER;
out:
#endif
rflow->last_qtail =
per_cpu(softnet_data, next_cpu).input_queue_head;
}
rflow->cpu = next_cpu;
return rflow;
}
/*
* get_rps_cpu is called from netif_receive_skb and returns the target
* CPU from the RPS map of the receiving queue for a given skb.
* rcu_read_lock must be held on entry.
*/
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow **rflowp)
{
const struct rps_sock_flow_table *sock_flow_table;
struct netdev_rx_queue *rxqueue = dev->_rx;
struct rps_dev_flow_table *flow_table;
struct rps_map *map;
int cpu = -1;
u32 tcpu;
u32 hash;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
if (unlikely(index >= dev->real_num_rx_queues)) {
WARN_ONCE(dev->real_num_rx_queues > 1,
"%s received packet on queue %u, but number "
"of RX queues is %u\n",
dev->name, index, dev->real_num_rx_queues);
goto done;
}
rxqueue += index;
}
/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
flow_table = rcu_dereference(rxqueue->rps_flow_table);
map = rcu_dereference(rxqueue->rps_map);
if (!flow_table && !map)
goto done;
skb_reset_network_header(skb);
hash = skb_get_hash(skb);
if (!hash)
goto done;
sock_flow_table = rcu_dereference(rps_sock_flow_table);
if (flow_table && sock_flow_table) {
struct rps_dev_flow *rflow;
u32 next_cpu;
u32 ident;
/* First check into global flow table if there is a match */
ident = sock_flow_table->ents[hash & sock_flow_table->mask];
if ((ident ^ hash) & ~rps_cpu_mask)
goto try_rps;
next_cpu = ident & rps_cpu_mask;
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
rflow = &flow_table->flows[hash & flow_table->mask];
tcpu = rflow->cpu;
/*
* If the desired CPU (where last recvmsg was done) is
* different from current CPU (one in the rx-queue flow
* table entry), switch if one of the following holds:
* - Current CPU is unset (>= nr_cpu_ids).
* - Current CPU is offline.
* - The current CPU's queue tail has advanced beyond the
* last packet that was enqueued using this table entry.
* This guarantees that all previous packets for the flow
* have been dequeued, thus preserving in order delivery.
*/
if (unlikely(tcpu != next_cpu) &&
(tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
}
if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu;
goto done;
}
}
try_rps:
if (map) {
tcpu = map->cpus[reciprocal_scale(hash, map->len)];
if (cpu_online(tcpu)) {
cpu = tcpu;
goto done;
}
}
done:
return cpu;
}
#ifdef CONFIG_RFS_ACCEL
/**
* rps_may_expire_flow - check whether an RFS hardware filter may be removed
* @dev: Device on which the filter was set
* @rxq_index: RX queue index
* @flow_id: Flow ID passed to ndo_rx_flow_steer()
* @filter_id: Filter ID returned by ndo_rx_flow_steer()
*
* Drivers that implement ndo_rx_flow_steer() should periodically call
* this function for each installed filter and remove the filters for
* which it returns %true.
*/
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
u32 flow_id, u16 filter_id)
{
struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
unsigned int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id <= flow_table->mask) {
rflow = &flow_table->flows[flow_id];
cpu = READ_ONCE(rflow->cpu);
if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
((int)(per_cpu(softnet_data, cpu).input_queue_head -
rflow->last_qtail) <
(int)(10 * flow_table->mask)))
expire = false;
}
rcu_read_unlock();
return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);
#endif /* CONFIG_RFS_ACCEL */
/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
struct softnet_data *sd = data;
____napi_schedule(sd, &sd->backlog);
sd->received_rps++;
}
#endif /* CONFIG_RPS */
/*
* Check if this softnet_data structure is another cpu one
* If yes, queue it to our IPI list and return 1
* If no, return 0
*/
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
if (sd != mysd) {
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
return 1;
}
#endif /* CONFIG_RPS */
return 0;
}
#ifdef CONFIG_NET_FLOW_LIMIT
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
struct sd_flow_limit *fl;
struct softnet_data *sd;
unsigned int old_flow, new_flow;
if (qlen < (netdev_max_backlog >> 1))
return false;
sd = this_cpu_ptr(&softnet_data);
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
if (fl) {
new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
old_flow = fl->history[fl->history_head];
fl->history[fl->history_head] = new_flow;
fl->history_head++;
fl->history_head &= FLOW_LIMIT_HISTORY - 1;
if (likely(fl->buckets[old_flow]))
fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { fl->count++;
rcu_read_unlock();
return true;
}
}
rcu_read_unlock();
#endif
return false;
}
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
* queue (may be a remote CPU queue).
*/
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
unsigned int *qtail)
{
struct softnet_data *sd;
unsigned long flags;
unsigned int qlen;
sd = &per_cpu(softnet_data, cpu);
local_irq_save(flags);
rps_lock(sd);
if (!netif_running(skb->dev))
goto drop;
qlen = skb_queue_len(&sd->input_pkt_queue);
if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
if (qlen) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
rps_unlock(sd);
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
/* Schedule NAPI for backlog device
* We can use non atomic operation since we own the queue lock
*/
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
if (!rps_ipi_queued(sd))
____napi_schedule(sd, &sd->backlog);
}
goto enqueue;
}
drop:
sd->dropped++;
rps_unlock(sd);
local_irq_restore(flags);
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
return NET_RX_DROP;
}
static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct netdev_rx_queue *rxqueue;
rxqueue = dev->_rx;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
if (unlikely(index >= dev->real_num_rx_queues)) {
WARN_ONCE(dev->real_num_rx_queues > 1,
"%s received packet on queue %u, but number "
"of RX queues is %u\n",
dev->name, index, dev->real_num_rx_queues);
return rxqueue; /* Return first rxqueue */
}
rxqueue += index;
}
return rxqueue;
}
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
bool orig_bcast, orig_host;
u32 mac_len, frame_sz;
__be16 orig_eth_type;
struct ethhdr *eth;
u32 metalen, act;
int off;
/* The XDP program wants to see the packet starting at the MAC
* header.
*/
mac_len = skb->data - skb_mac_header(skb);
hard_start = skb->data - skb_headroom(skb);
/* SKB "head" area always have tailroom for skb_shared_info */
frame_sz = (void *)skb_end_pointer(skb) - hard_start;
frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
rxqueue = netif_get_rxqueue(skb);
xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
skb_headlen(skb) + mac_len, true);
orig_data_end = xdp->data_end;
orig_data = xdp->data;
eth = (struct ethhdr *)xdp->data;
orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
orig_eth_type = eth->h_proto;
act = bpf_prog_run_xdp(xdp_prog, xdp);
/* check if bpf_xdp_adjust_head was used */
off = xdp->data - orig_data;
if (off) {
if (off > 0)
__skb_pull(skb, off);
else if (off < 0)
__skb_push(skb, -off);
skb->mac_header += off;
skb_reset_network_header(skb);
}
/* check if bpf_xdp_adjust_tail was used */
off = xdp->data_end - orig_data_end;
if (off != 0) {
skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
skb->len += off; /* positive on grow, negative on shrink */
}
/* check if XDP changed eth hdr such SKB needs update */
eth = (struct ethhdr *)xdp->data;
if ((orig_eth_type != eth->h_proto) ||
(orig_host != ether_addr_equal_64bits(eth->h_dest,
skb->dev->dev_addr)) ||
(orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
__skb_push(skb, ETH_HLEN);
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, skb->dev);
}
/* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
* before calling us again on redirect path. We do not call do_redirect
* as we leave that up to the caller.
*
* Caller is responsible for managing lifetime of skb (i.e. calling
* kfree_skb in response to actions it cannot handle/XDP_DROP).
*/
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
__skb_push(skb, mac_len);
break;
case XDP_PASS:
metalen = xdp->data - xdp->data_meta;
if (metalen)
skb_metadata_set(skb, metalen);
break;
}
return act;
}
static u32 netif_receive_generic_xdp(struct sk_buff *skb,
struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
u32 act = XDP_DROP;
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
*/
if (skb_is_redirected(skb))
return XDP_PASS;
/* XDP packets must be linear and must have sufficient headroom
* of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
* native XDP provides, thus we need to do it here as well.
*/
if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
int troom = skb->tail + skb->data_len - skb->end;
/* In case we have to go down the path and also linearize,
* then lets do the pskb_expand_head() work just once here.
*/
if (pskb_expand_head(skb,
hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
goto do_drop;
if (skb_linearize(skb))
goto do_drop;
}
act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
case XDP_PASS:
break;
default:
bpf_warn_invalid_xdp_action(act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(skb->dev, xdp_prog, act);
fallthrough;
case XDP_DROP:
do_drop:
kfree_skb(skb);
break;
}
return act;
}
/* When doing generic XDP we have to bypass the qdisc layer and the
* network taps in order to match in-driver-XDP behavior.
*/
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
bool free_skb = true;
int cpu, rc;
txq = netdev_core_pick_tx(dev, skb, NULL);
cpu = smp_processor_id();
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
rc = netdev_start_xmit(skb, dev, txq, 0);
if (dev_xmit_complete(rc))
free_skb = false;
}
HARD_TX_UNLOCK(dev, txq);
if (free_skb) {
trace_xdp_exception(dev, xdp_prog, XDP_TX);
kfree_skb(skb);
}
}
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
{
if (xdp_prog) {
struct xdp_buff xdp;
u32 act;
int err;
act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
if (act != XDP_PASS) {
switch (act) {
case XDP_REDIRECT:
err = xdp_do_generic_redirect(skb->dev, skb,
&xdp, xdp_prog);
if (err)
goto out_redir;
break;
case XDP_TX:
generic_xdp_tx(skb, xdp_prog);
break;
}
return XDP_DROP;
}
}
return XDP_PASS;
out_redir:
kfree_skb(skb);
return XDP_DROP;
}
EXPORT_SYMBOL_GPL(do_xdp_generic);
static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
net_timestamp_check(netdev_tstamp_prequeue, skb);
trace_netif_rx(skb);
#ifdef CONFIG_RPS
if (static_branch_unlikely(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
preempt_disable();
rcu_read_lock();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu < 0)
cpu = smp_processor_id(); ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
preempt_enable();
} else
#endif
{
unsigned int qtail;
ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
put_cpu();
}
return ret;
}
/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
* the upper (protocol) levels to process. It always succeeds. The buffer
* may be dropped during processing for congestion control or by the
* protocol layers.
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped)
*
*/
int netif_rx(struct sk_buff *skb)
{
int ret;
trace_netif_rx_entry(skb);
ret = netif_rx_internal(skb);
trace_netif_rx_exit(ret);
return ret;
}
EXPORT_SYMBOL(netif_rx);
int netif_rx_ni(struct sk_buff *skb)
{
int err;
trace_netif_rx_ni_entry(skb);
preempt_disable();
err = netif_rx_internal(skb);
if (local_softirq_pending())
do_softirq(); preempt_enable();
trace_netif_rx_ni_exit(err);
return err;
}
EXPORT_SYMBOL(netif_rx_ni);
int netif_rx_any_context(struct sk_buff *skb)
{
/*
* If invoked from contexts which do not invoke bottom half
* processing either at return from interrupt or when softrqs are
* reenabled, use netif_rx_ni() which invokes bottomhalf processing
* directly.
*/
if (in_interrupt())
return netif_rx(skb);
else
return netif_rx_ni(skb);
}
EXPORT_SYMBOL(netif_rx_any_context);
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
if (sd->completion_queue) {
struct sk_buff *clist;
local_irq_disable();
clist = sd->completion_queue;
sd->completion_queue = NULL;
local_irq_enable();
while (clist) {
struct sk_buff *skb = clist;
clist = clist->next;
WARN_ON(refcount_read(&skb->users));
if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
trace_consume_skb(skb);
else
trace_kfree_skb(skb, net_tx_action);
if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
else
__kfree_skb_defer(skb);
}
}
if (sd->output_queue) {
struct Qdisc *head;
local_irq_disable();
head = sd->output_queue;
sd->output_queue = NULL;
sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
rcu_read_lock();
while (head) {
struct Qdisc *q = head;
spinlock_t *root_lock = NULL;
head = head->next_sched;
/* We need to make sure head->next_sched is read
* before clearing __QDISC_STATE_SCHED
*/
smp_mb__before_atomic();
if (!(q->flags & TCQ_F_NOLOCK)) {
root_lock = qdisc_lock(q);
spin_lock(root_lock);
} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
&q->state))) {
/* There is a synchronize_net() between
* STATE_DEACTIVATED flag being set and
* qdisc_reset()/some_qdisc_is_busy() in
* dev_deactivate(), so we can safely bail out
* early here to avoid data race between
* qdisc_deactivate() and some_qdisc_is_busy()
* for lockless qdisc.
*/
clear_bit(__QDISC_STATE_SCHED, &q->state);
continue;
}
clear_bit(__QDISC_STATE_SCHED, &q->state);
qdisc_run(q);
if (root_lock)
spin_unlock(root_lock);
}
rcu_read_unlock();
}
xfrm_dev_backlog(sd);
}
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
struct net_device *orig_dev, bool *another)
{
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
struct tcf_result cl_res;
/* If there's at least one ingress present somewhere (so
* we get here via enabled static key), remaining devices
* that are not configured with an ingress qdisc will bail
* out here.
*/
if (!miniq)
return skb;
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
qdisc_skb_cb(skb)->pkt_len = skb->len;
tc_skb_cb(skb)->mru = 0;
tc_skb_cb(skb)->post_ct = false;
skb->tc_at_ingress = 1;
mini_qdisc_bstats_cpu_update(miniq, skb);
switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
break;
case TC_ACT_SHOT:
mini_qdisc_qstats_cpu_drop(miniq);
kfree_skb(skb);
return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
consume_skb(skb);
return NULL;
case TC_ACT_REDIRECT:
/* skb_mac_header check was done by cls/act_bpf, so
* we can safely push the L2 header back before
* redirecting to another netdev
*/
__skb_push(skb, skb->mac_len);
if (skb_do_redirect(skb) == -EAGAIN) {
__skb_pull(skb, skb->mac_len);
*another = true;
break;
}
return NULL;
case TC_ACT_CONSUMED:
return NULL;
default:
break;
}
#endif /* CONFIG_NET_CLS_ACT */
return skb;
}
/**
* netdev_is_rx_handler_busy - check if receive handler is registered
* @dev: device to check
*
* Check if a receive handler is already registered for a given device.
* Return true if there one.
*
* The caller must hold the rtnl_mutex.
*/
bool netdev_is_rx_handler_busy(struct net_device *dev)
{
ASSERT_RTNL();
return dev && rtnl_dereference(dev->rx_handler);
}
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
/**
* netdev_rx_handler_register - register receive handler
* @dev: device to register a handler for
* @rx_handler: receive handler to register
* @rx_handler_data: data pointer that is used by rx handler
*
* Register a receive handler for a device. This handler will then be
* called from __netif_receive_skb. A negative errno code is returned
* on a failure.
*
* The caller must hold the rtnl_mutex.
*
* For a general description of rx_handler, see enum rx_handler_result.
*/
int netdev_rx_handler_register(struct net_device *dev,
rx_handler_func_t *rx_handler,
void *rx_handler_data)
{
if (netdev_is_rx_handler_busy(dev))
return -EBUSY;
if (dev->priv_flags & IFF_NO_RX_HANDLER)
return -EINVAL;
/* Note: rx_handler_data must be set before rx_handler */
rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
rcu_assign_pointer(dev->rx_handler, rx_handler);
return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
/**
* netdev_rx_handler_unregister - unregister receive handler
* @dev: device to unregister a handler from
*
* Unregister a receive handler from a device.
*
* The caller must hold the rtnl_mutex.
*/
void netdev_rx_handler_unregister(struct net_device *dev)
{
ASSERT_RTNL();
RCU_INIT_POINTER(dev->rx_handler, NULL);
/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
* section has a guarantee to see a non NULL rx_handler_data
* as well.
*/
synchronize_net();
RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
/*
* Limit the use of PFMEMALLOC reserves to those protocols that implement
* the special handling of PFMEMALLOC skbs.
*/
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
switch (skb->protocol) {
case htons(ETH_P_ARP):
case htons(ETH_P_IP):
case htons(ETH_P_IPV6):
case htons(ETH_P_8021Q):
case htons(ETH_P_8021AD):
return true;
default:
return false;
}
}
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
int *ret, struct net_device *orig_dev)
{
if (nf_hook_ingress_active(skb)) {
int ingress_retval;
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
rcu_read_lock();
ingress_retval = nf_hook_ingress(skb);
rcu_read_unlock();
return ingress_retval;
}
return 0;
}
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct sk_buff *skb = *pskb;
struct net_device *orig_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
net_timestamp_check(!netdev_tstamp_prequeue, skb);
trace_netif_receive_skb(skb);
orig_dev = skb->dev;
skb_reset_network_header(skb);
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
pt_prev = NULL;
another_round:
skb->skb_iif = skb->dev->ifindex;
__this_cpu_inc(softnet_data.processed);
if (static_branch_unlikely(&generic_xdp_needed_key)) {
int ret2;
migrate_disable();
ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
migrate_enable();
if (ret2 != XDP_PASS) {
ret = NET_RX_DROP;
goto out;
}
}
if (eth_type_vlan(skb->protocol)) {
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto out;
}
if (skb_skip_tc_classify(skb))
goto skip_classify;
if (pfmemalloc)
goto skip_taps;
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
bool another = false;
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
&another);
if (another)
goto another_round;
if (!skb)
goto out;
if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
goto out;
}
#endif
skb_reset_redirect(skb);
skip_classify:
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
if (skb_vlan_tag_present(skb)) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
if (vlan_do_receive(&skb))
goto another_round;
else if (unlikely(!skb))
goto out;
}
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
ret = NET_RX_SUCCESS;
goto out;
case RX_HANDLER_ANOTHER:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
break;
case RX_HANDLER_PASS:
break;
default:
BUG();
}
}
if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id:
if (skb_vlan_tag_get_id(skb)) {
/* Vlan id is non 0 and vlan_do_receive() above couldn't
* find vlan device.
*/
skb->pkt_type = PACKET_OTHERHOST;
} else if (eth_type_vlan(skb->protocol)) {
/* Outer header is 802.1P with vlan 0, inner header is
* 802.1Q or 802.1AD and vlan_do_receive() above could
* not find vlan dev for vlan id 0.
*/
__vlan_hwaccel_clear_tag(skb);
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto out;
if (vlan_do_receive(&skb))
/* After stripping off 802.1P header with vlan 0
* vlan dev is found for inner header.
*/
goto another_round;
else if (unlikely(!skb))
goto out;
else
/* We have stripped outer 802.1P vlan 0 header.
* But could not find vlan dev.
* check again for vlan id to set OTHERHOST.
*/
goto check_vlan_id;
}
/* Note: we might in the future use prio bits
* and set skb->priority like in vlan_do_receive()
* For the time being, just ignore Priority Code Point
*/
__vlan_hwaccel_clear_tag(skb);
}
type = skb->protocol;
/* deliver only exact match when indicated */
if (likely(!deliver_exact)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
}
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&orig_dev->ptype_specific);
if (unlikely(skb->dev != orig_dev)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&skb->dev->ptype_specific);
}
if (pt_prev) {
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
goto drop;
*ppt_prev = pt_prev;
} else {
drop:
if (!deliver_exact)
atomic_long_inc(&skb->dev->rx_dropped);
else
atomic_long_inc(&skb->dev->rx_nohandler);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out:
/* The invariant here is that if *ppt_prev is not NULL
* then skb should also be non-NULL.
*
* Apparently *ppt_prev assignment above holds this invariant due to
* skb dereferencing near it.
*/
*pskb = skb;
return ret;
}
static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
{
struct net_device *orig_dev = skb->dev;
struct packet_type *pt_prev = NULL;
int ret;
ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (pt_prev)
ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
skb->dev, pt_prev, orig_dev);
return ret;
}
/**
* netif_receive_skb_core - special purpose version of netif_receive_skb
* @skb: buffer to process
*
* More direct receive version of netif_receive_skb(). It should
* only be used by callers that have a need to skip RPS and Generic XDP.
* Caller must also take care of handling if ``(page_is_)pfmemalloc``.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
int netif_receive_skb_core(struct sk_buff *skb)
{
int ret;
rcu_read_lock();
ret = __netif_receive_skb_one_core(skb, false);
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(netif_receive_skb_core);
static inline void __netif_receive_skb_list_ptype(struct list_head *head,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
struct sk_buff *skb, *next;
if (!pt_prev)
return;
if (list_empty(head))
return;
if (pt_prev->list_func != NULL)
INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
ip_list_rcv, head, pt_prev, orig_dev);
else
list_for_each_entry_safe(skb, next, head, list) {
skb_list_del_init(skb);
pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
}
static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
{
/* Fast-path assumptions:
* - There is no RX handler.
* - Only one packet_type matches.
* If either of these fails, we will end up doing some per-packet
* processing in-line, then handling the 'last ptype' for the whole
* sublist. This can't cause out-of-order delivery to any single ptype,
* because the 'last ptype' must be constant across the sublist, and all
* other ptypes are handled per-packet.
*/
/* Current (common) ptype of sublist */
struct packet_type *pt_curr = NULL;
/* Current (common) orig_dev of sublist */
struct net_device *od_curr = NULL;
struct list_head sublist;
struct sk_buff *skb, *next;
INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
struct net_device *orig_dev = skb->dev;
struct packet_type *pt_prev = NULL;
skb_list_del_init(skb);
__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (!pt_prev)
continue;
if (pt_curr != pt_prev || od_curr != orig_dev) {
/* dispatch old sublist */
__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
/* start new sublist */
INIT_LIST_HEAD(&sublist);
pt_curr = pt_prev;
od_curr = orig_dev;
}
list_add_tail(&skb->list, &sublist);
}
/* dispatch final sublist */
__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
}
static int __netif_receive_skb(struct sk_buff *skb)
{
int ret;
if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
unsigned int noreclaim_flag;
/*
* PFMEMALLOC skbs are special, they should
* - be delivered to SOCK_MEMALLOC sockets only
* - stay away from userspace
* - have bounded memory usage
*
* Use PF_MEMALLOC as this saves us from propagating the allocation
* context down to all allocation sites.
*/
noreclaim_flag = memalloc_noreclaim_save();
ret = __netif_receive_skb_one_core(skb, true);
memalloc_noreclaim_restore(noreclaim_flag);
} else
ret = __netif_receive_skb_one_core(skb, false);
return ret;
}
static void __netif_receive_skb_list(struct list_head *head)
{
unsigned long noreclaim_flag = 0;
struct sk_buff *skb, *next;
bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
list_for_each_entry_safe(skb, next, head, list) {
if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
struct list_head sublist;
/* Handle the previous sublist */
list_cut_before(&sublist, head, &skb->list);
if (!list_empty(&sublist))
__netif_receive_skb_list_core(&sublist, pfmemalloc);
pfmemalloc = !pfmemalloc;
/* See comments in __netif_receive_skb */
if (pfmemalloc)
noreclaim_flag = memalloc_noreclaim_save();
else
memalloc_noreclaim_restore(noreclaim_flag);
}
}
/* Handle the remaining sublist */
if (!list_empty(head))
__netif_receive_skb_list_core(head, pfmemalloc);
/* Restore pflags */
if (pfmemalloc)
memalloc_noreclaim_restore(noreclaim_flag);
}
static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
{
struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
struct bpf_prog *new = xdp->prog;
int ret = 0;
switch (xdp->command) {
case XDP_SETUP_PROG:
rcu_assign_pointer(dev->xdp_prog, new);
if (old)
bpf_prog_put(old);
if (old && !new) {
static_branch_dec(&generic_xdp_needed_key);
} else if (new && !old) {
static_branch_inc(&generic_xdp_needed_key);
dev_disable_lro(dev);
dev_disable_gro_hw(dev);
}
break;
default:
ret = -EINVAL;
break;
}
return ret;
}
static int netif_receive_skb_internal(struct sk_buff *skb)
{
int ret;
net_timestamp_check(netdev_tstamp_prequeue, skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
rcu_read_lock();
#ifdef CONFIG_RPS
if (static_branch_unlikely(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu >= 0) {
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
return ret;
}
}
#endif
ret = __netif_receive_skb(skb);
rcu_read_unlock();
return ret;
}
static void netif_receive_skb_list_internal(struct list_head *head)
{
struct sk_buff *skb, *next;
struct list_head sublist;
INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
net_timestamp_check(netdev_tstamp_prequeue, skb);
skb_list_del_init(skb);
if (!skb_defer_rx_timestamp(skb))
list_add_tail(&skb->list, &sublist);
}
list_splice_init(&sublist, head);
rcu_read_lock();
#ifdef CONFIG_RPS
if (static_branch_unlikely(&rps_needed)) {
list_for_each_entry_safe(skb, next, head, list) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu >= 0) {
/* Will be handled, remove from list */
skb_list_del_init(skb);
enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
}
}
}
#endif
__netif_receive_skb_list(head);
rcu_read_unlock();
}
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
* netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
int netif_receive_skb(struct sk_buff *skb)
{
int ret;
trace_netif_receive_skb_entry(skb);
ret = netif_receive_skb_internal(skb);
trace_netif_receive_skb_exit(ret);
return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
/**
* netif_receive_skb_list - process many receive buffers from network
* @head: list of skbs to process.
*
* Since return value of netif_receive_skb() is normally ignored, and
* wouldn't be meaningful for a list, this function returns void.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*/
void netif_receive_skb_list(struct list_head *head)
{
struct sk_buff *skb;
if (list_empty(head))
return;
if (trace_netif_receive_skb_list_entry_enabled()) {
list_for_each_entry(skb, head, list)
trace_netif_receive_skb_list_entry(skb);
}
netif_receive_skb_list_internal(head);
trace_netif_receive_skb_list_exit(0);
}
EXPORT_SYMBOL(netif_receive_skb_list);
static DEFINE_PER_CPU(struct work_struct, flush_works);
/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
{
struct sk_buff *skb, *tmp;
struct softnet_data *sd;
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
local_irq_disable();
rps_lock(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
dev_kfree_skb_irq(skb);
input_queue_head_incr(sd);
}
}
rps_unlock(sd);
local_irq_enable();
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
kfree_skb(skb);
input_queue_head_incr(sd);
}
}
local_bh_enable();
}
static bool flush_required(int cpu)
{
#if IS_ENABLED(CONFIG_RPS)
struct softnet_data *sd = &per_cpu(softnet_data, cpu);
bool do_flush;
local_irq_disable();
rps_lock(sd);
/* as insertion into process_queue happens with the rps lock held,
* process_queue access may race only with dequeue
*/
do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
!skb_queue_empty_lockless(&sd->process_queue);
rps_unlock(sd);
local_irq_enable();
return do_flush;
#endif
/* without RPS we can't safely check input_pkt_queue: during a
* concurrent remote skb_queue_splice() we can detect as empty both
* input_pkt_queue and process_queue even if the latter could end-up
* containing a lot of packets.
*/
return true;
}
static void flush_all_backlogs(void)
{
static cpumask_t flush_cpus;
unsigned int cpu;
/* since we are under rtnl lock protection we can use static data
* for the cpumask and avoid allocating on stack the possibly
* large mask
*/
ASSERT_RTNL();
cpus_read_lock();
cpumask_clear(&flush_cpus);
for_each_online_cpu(cpu) {
if (flush_required(cpu)) {
queue_work_on(cpu, system_highpri_wq,
per_cpu_ptr(&flush_works, cpu));
cpumask_set_cpu(cpu, &flush_cpus);
}
}
/* we can have in flight packet[s] on the cpus we are not flushing,
* synchronize_net() in unregister_netdevice_many() will take care of
* them
*/
for_each_cpu(cpu, &flush_cpus)
flush_work(per_cpu_ptr(&flush_works, cpu));
cpus_read_unlock();
}
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
static void gro_normal_list(struct napi_struct *napi)
{
if (!napi->rx_count)
return;
netif_receive_skb_list_internal(&napi->rx_list);
INIT_LIST_HEAD(&napi->rx_list);
napi->rx_count = 0;
}
/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
* pass the whole batch up to the stack.
*/
static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
{
list_add_tail(&skb->list, &napi->rx_list);
napi->rx_count += segs;
if (napi->rx_count >= gro_normal_batch)
gro_normal_list(napi);
}
static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
{
struct packet_offload *ptype;
__be16 type = skb->protocol;
struct list_head *head = &offload_base;
int err = -ENOENT;
BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
if (NAPI_GRO_CB(skb)->count == 1) {
skb_shinfo(skb)->gso_size = 0;
goto out;
}
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || !ptype->callbacks.gro_complete)
continue;
err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
ipv6_gro_complete, inet_gro_complete,
skb, 0);
break;
}
rcu_read_unlock();
if (err) {
WARN_ON(&ptype->list == head);
kfree_skb(skb);
return NET_RX_SUCCESS;
}
out:
gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
return NET_RX_SUCCESS;
}
static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
bool flush_old)
{
struct list_head *head = &napi->gro_hash[index].list;
struct sk_buff *skb, *p;
list_for_each_entry_safe_reverse(skb, p, head, list) {
if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
return;
skb_list_del_init(skb);
napi_gro_complete(napi, skb);
napi->gro_hash[index].count--;
}
if (!napi->gro_hash[index].count)
__clear_bit(index, &napi->gro_bitmask);
}
/* napi->gro_hash[].list contains packets ordered by age.
* youngest packets at the head of it.
* Complete skbs in reverse order to reduce latencies.
*/
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
unsigned long bitmask = napi->gro_bitmask;
unsigned int i, base = ~0U;
while ((i = ffs(bitmask)) != 0) {
bitmask >>= i;
base += i;
__napi_gro_flush_chain(napi, base, flush_old);
}
}
EXPORT_SYMBOL(napi_gro_flush);
static void gro_list_prepare(const struct list_head *head,
const struct sk_buff *skb)
{
unsigned int maclen = skb->dev->hard_header_len;
u32 hash = skb_get_hash_raw(skb);
struct sk_buff *p;
list_for_each_entry(p, head, list) {
unsigned long diffs;
NAPI_GRO_CB(p)->flush = 0;
if (hash != skb_get_hash_raw(p)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
if (skb_vlan_tag_present(p))
diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
diffs |= skb_metadata_differs(p, skb);
if (maclen == ETH_HLEN)
diffs |= compare_ether_header(skb_mac_header(p),
skb_mac_header(skb));
else if (!diffs)
diffs = memcmp(skb_mac_header(p),
skb_mac_header(skb),
maclen);
/* in most common scenarions 'slow_gro' is 0
* otherwise we are already on some slower paths
* either skip all the infrequent tests altogether or
* avoid trying too hard to skip each of them individually
*/
if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) {
#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
struct tc_skb_ext *skb_ext;
struct tc_skb_ext *p_ext;
#endif
diffs |= p->sk != skb->sk;
diffs |= skb_metadata_dst_cmp(p, skb);
diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
skb_ext = skb_ext_find(skb, TC_SKB_EXT);
p_ext = skb_ext_find(p, TC_SKB_EXT);
diffs |= (!!p_ext) ^ (!!skb_ext);
if (!diffs && unlikely(skb_ext))
diffs |= p_ext->chain ^ skb_ext->chain;
#endif
}
NAPI_GRO_CB(p)->same_flow = !diffs;
}
}
static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
{
const struct skb_shared_info *pinfo = skb_shinfo(skb);
const skb_frag_t *frag0 = &pinfo->frags[0];
NAPI_GRO_CB(skb)->data_offset = 0;
NAPI_GRO_CB(skb)->frag0 = NULL;
NAPI_GRO_CB(skb)->frag0_len = 0;
if (!skb_headlen(skb) && pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0)) &&
(!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
skb_frag_size(frag0),
skb->end - skb->tail);
}
}
static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
struct skb_shared_info *pinfo = skb_shinfo(skb);
BUG_ON(skb->end - skb->tail < grow);
memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
skb->data_len -= grow;
skb->tail += grow;
skb_frag_off_add(&pinfo->frags[0], grow);
skb_frag_size_sub(&pinfo->frags[0], grow);
if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
skb_frag_unref(skb, 0);
memmove(pinfo->frags, pinfo->frags + 1,
--pinfo->nr_frags * sizeof(pinfo->frags[0]));
}
}
static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
{
struct sk_buff *oldest;
oldest = list_last_entry(head, struct sk_buff, list);
/* We are called with head length >= MAX_GRO_SKBS, so this is
* impossible.
*/
if (WARN_ON_ONCE(!oldest))
return;
/* Do not adjust napi->gro_hash[].count, caller is adding a new
* SKB to the chain.
*/
skb_list_del_init(oldest);
napi_gro_complete(napi, oldest);
}
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
struct gro_list *gro_list = &napi->gro_hash[bucket];
struct list_head *head = &offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
struct sk_buff *pp = NULL;
enum gro_result ret;
int same_flow;
int grow;
if (netif_elide_gro(skb->dev))
goto normal;
gro_list_prepare(&gro_list->list, skb);
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || !ptype->callbacks.gro_receive)
continue;
skb_set_network_header(skb, skb_gro_offset(skb));
skb_reset_mac_len(skb);
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->encap_mark = 0;
NAPI_GRO_CB(skb)->recursion_counter = 0;
NAPI_GRO_CB(skb)->is_fou = 0;
NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
/* Setup for GRO checksum validation */
switch (skb->ip_summed) {
case CHECKSUM_COMPLETE:
NAPI_GRO_CB(skb)->csum = skb->csum;
NAPI_GRO_CB(skb)->csum_valid = 1;
NAPI_GRO_CB(skb)->csum_cnt = 0;
break;
case CHECKSUM_UNNECESSARY:
NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
NAPI_GRO_CB(skb)->csum_valid = 0;
break;
default:
NAPI_GRO_CB(skb)->csum_cnt = 0;
NAPI_GRO_CB(skb)->csum_valid = 0;
}
pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
ipv6_gro_receive, inet_gro_receive,
&gro_list->list, skb);
break;
}
rcu_read_unlock();
if (&ptype->list == head)
goto normal;
if (PTR_ERR(pp) == -EINPROGRESS) {
ret = GRO_CONSUMED;
goto ok;
}
same_flow = NAPI_GRO_CB(skb)->same_flow;
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
if (pp) {
skb_list_del_init(pp);
napi_gro_complete(napi, pp);
gro_list->count--;
}
if (same_flow)
goto ok;
if (NAPI_GRO_CB(skb)->flush)
goto normal;
if (unlikely(gro_list->count >= MAX_GRO_SKBS))
gro_flush_oldest(napi, &gro_list->list);
else
gro_list->count++;
NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
list_add(&skb->list, &gro_list->list);
ret = GRO_HELD;
pull:
grow = skb_gro_offset(skb) - skb_headlen(skb);
if (grow > 0)
gro_pull_from_frag0(skb, grow);
ok:
if (gro_list->count) {
if (!test_bit(bucket, &napi->gro_bitmask))
__set_bit(bucket, &napi->gro_bitmask);
} else if (test_bit(bucket, &napi->gro_bitmask)) {
__clear_bit(bucket, &napi->gro_bitmask);
}
return ret;
normal:
ret = GRO_NORMAL;
goto pull;
}
struct packet_offload *gro_find_receive_by_type(__be16 type)
{
struct list_head *offload_head = &offload_base;
struct packet_offload *ptype;
list_for_each_entry_rcu(ptype, offload_head, list) {
if (ptype->type != type || !ptype->callbacks.gro_receive)
continue;
return ptype;
}
return NULL;
}
EXPORT_SYMBOL(gro_find_receive_by_type);
struct packet_offload *gro_find_complete_by_type(__be16 type)
{
struct list_head *offload_head = &offload_base;
struct packet_offload *ptype;
list_for_each_entry_rcu(ptype, offload_head, list) {
if (ptype->type != type || !ptype->callbacks.gro_complete)
continue;
return ptype;
}
return NULL;
}
EXPORT_SYMBOL(gro_find_complete_by_type);
static gro_result_t napi_skb_finish(struct napi_struct *napi,
struct sk_buff *skb,
gro_result_t ret)
{
switch (ret) {
case GRO_NORMAL:
gro_normal_one(napi, skb, 1);
break;
case GRO_MERGED_FREE:
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
napi_skb_free_stolen_head(skb);
else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
else
__kfree_skb_defer(skb);
break;
case GRO_HELD:
case GRO_MERGED:
case GRO_CONSUMED:
break;
}
return ret;
}
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
gro_result_t ret;
skb_mark_napi_id(skb, napi);
trace_napi_gro_receive_entry(skb);
skb_gro_reset_offset(skb, 0);
ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
trace_napi_gro_receive_exit(ret);
return ret;
}
EXPORT_SYMBOL(napi_gro_receive);
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
if (unlikely(skb->pfmemalloc)) {
consume_skb(skb);
return;
}
__skb_pull(skb, skb_headlen(skb));
/* restore the reserve we had after netdev_alloc_skb_ip_align() */
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
__vlan_hwaccel_clear_tag(skb);
skb->dev = napi->dev;
skb->skb_iif = 0;
/* eth_type_trans() assumes pkt_type is PACKET_HOST */
skb->pkt_type = PACKET_HOST;
skb->encapsulation = 0;
skb_shinfo(skb)->gso_type = 0;
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
if (unlikely(skb->slow_gro)) {
skb_orphan(skb);
skb_ext_reset(skb);
nf_reset_ct(skb);
skb->slow_gro = 0;
}
napi->skb = skb;
}
struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
struct sk_buff *skb = napi->skb;
if (!skb) {
skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
if (skb) {
napi->skb = skb;
skb_mark_napi_id(skb, napi);
}
}
return skb;
}
EXPORT_SYMBOL(napi_get_frags);
static gro_result_t napi_frags_finish(struct napi_struct *napi,
struct sk_buff *skb,
gro_result_t ret)
{
switch (ret) {
case GRO_NORMAL:
case GRO_HELD:
__skb_push(skb, ETH_HLEN);
skb->protocol = eth_type_trans(skb, skb->dev);
if (ret == GRO_NORMAL)
gro_normal_one(napi, skb, 1);
break;
case GRO_MERGED_FREE:
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
napi_skb_free_stolen_head(skb);
else
napi_reuse_skb(napi, skb);
break;
case GRO_MERGED:
case GRO_CONSUMED:
break;
}
return ret;
}
/* Upper GRO stack assumes network header starts at gro_offset=0
* Drivers could call both napi_gro_frags() and napi_gro_receive()
* We copy ethernet header into skb->data to have a common layout.
*/
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
struct sk_buff *skb = napi->skb;
const struct ethhdr *eth;
unsigned int hlen = sizeof(*eth);
napi->skb = NULL;
skb_reset_mac_header(skb);
skb_gro_reset_offset(skb, hlen);
if (unlikely(skb_gro_header_hard(skb, hlen))) {
eth = skb_gro_header_slow(skb, hlen, 0);
if (unlikely(!eth)) {
net_warn_ratelimited("%s: dropping impossible skb from %s\n",
__func__, napi->dev->name);
napi_reuse_skb(napi, skb);
return NULL;
}
} else {
eth = (const struct ethhdr *)skb->data;
gro_pull_from_frag0(skb, hlen);
NAPI_GRO_CB(skb)->frag0 += hlen;
NAPI_GRO_CB(skb)->frag0_len -= hlen;
}
__skb_pull(skb, hlen);
/*
* This works because the only protocols we care about don't require
* special handling.
* We'll fix it up properly in napi_frags_finish()
*/
skb->protocol = eth->h_proto;
return skb;
}
gro_result_t napi_gro_frags(struct napi_struct *napi)
{
gro_result_t ret;
struct sk_buff *skb = napi_frags_skb(napi);
trace_napi_gro_frags_entry(skb);
ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
trace_napi_gro_frags_exit(ret);
return ret;
}
EXPORT_SYMBOL(napi_gro_frags);
/* Compute the checksum from gro_offset and return the folded value
* after adding in any pseudo checksum.
*/
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
{
__wsum wsum;
__sum16 sum;
wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
/* See comments in __skb_checksum_complete(). */
if (likely(!sum)) {
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev, skb);
}
NAPI_GRO_CB(skb)->csum = wsum;
NAPI_GRO_CB(skb)->csum_valid = 1;
return sum;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
while (remsd) {
struct softnet_data *next = remsd->rps_ipi_next;
if (cpu_online(remsd->cpu))
smp_call_function_single_async(remsd->cpu, &remsd->csd);
remsd = next;
}
#endif
}
/*
* net_rps_action_and_irq_enable sends any pending IPI's for rps.
* Note: called with local irq disabled, but exits with local irq enabled.
*/
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
struct softnet_data *remsd = sd->rps_ipi_list;
if (remsd) {
sd->rps_ipi_list = NULL;
local_irq_enable();
/* Send pending IPI's to kick RPS processing on remote cpus. */
net_rps_send_ipi(remsd);
} else
#endif
local_irq_enable();
}
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
return sd->rps_ipi_list != NULL;
#else
return false;
#endif
}
static int process_backlog(struct napi_struct *napi, int quota)
{
struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
bool again = true;
int work = 0;
/* Check if we have pending ipi, its better to send them now,
* not waiting net_rx_action() end.
*/
if (sd_has_rps_ipi_waiting(sd)) {
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
napi->weight = dev_rx_weight;
while (again) {
struct sk_buff *skb;
while ((skb = __skb_dequeue(&sd->process_queue))) {
rcu_read_lock();
__netif_receive_skb(skb);
rcu_read_unlock();
input_queue_head_incr(sd);
if (++work >= quota)
return work;
}
local_irq_disable();
rps_lock(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
* only current cpu owns and manipulates this napi,
* and NAPI_STATE_SCHED is the only possible flag set
* on backlog.
* We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
napi->state = 0;
again = false;
} else {
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
}
rps_unlock(sd);
local_irq_enable();
}
return work;
}
/**
* __napi_schedule - schedule for receive
* @n: entry to schedule
*
* The entry's receive function will be scheduled to run.
* Consider using __napi_schedule_irqoff() if hard irqs are masked.
*/
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);
/**
* napi_schedule_prep - check if napi can be scheduled
* @n: napi context
*
* Test if NAPI routine is already running, and if not mark
* it as running. This is used as a condition variable to
* insure only one NAPI poll instance runs. We also make
* sure there is no pending NAPI disable.
*/
bool napi_schedule_prep(struct napi_struct *n)
{
unsigned long val, new;
do {
val = READ_ONCE(n->state);
if (unlikely(val & NAPIF_STATE_DISABLE))
return false;
new = val | NAPIF_STATE_SCHED;
/* Sets STATE_MISSED bit if STATE_SCHED was already set
* This was suggested by Alexander Duyck, as compiler
* emits better code than :
* if (val & NAPIF_STATE_SCHED)
* new |= NAPIF_STATE_MISSED;
*/
new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
NAPIF_STATE_MISSED;
} while (cmpxchg(&n->state, val, new) != val);
return !(val & NAPIF_STATE_SCHED);
}
EXPORT_SYMBOL(napi_schedule_prep);
/**
* __napi_schedule_irqoff - schedule for receive
* @n: entry to schedule
*
* Variant of __napi_schedule() assuming hard irqs are masked.
*
* On PREEMPT_RT enabled kernels this maps to __napi_schedule()
* because the interrupt disabled assumption might not be true
* due to force-threaded interrupts and spinlock substitution.
*/
void __napi_schedule_irqoff(struct napi_struct *n)
{
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
____napi_schedule(this_cpu_ptr(&softnet_data), n);
else
__napi_schedule(n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
bool napi_complete_done(struct napi_struct *n, int work_done)
{
unsigned long flags, val, new, timeout = 0;
bool ret = true;
/*
* 1) Don't let napi dequeue from the cpu poll list
* just in case its running on a different cpu.
* 2) If we are busy polling, do nothing here, we have
* the guarantee we will be called later.
*/
if (unlikely(n->state & (NAPIF_STATE_NPSVC |
NAPIF_STATE_IN_BUSY_POLL)))
return false;
if (work_done) {
if (n->gro_bitmask)
timeout = READ_ONCE(n->dev->gro_flush_timeout);
n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
}
if (n->defer_hard_irqs_count > 0) {
n->defer_hard_irqs_count--;
timeout = READ_ONCE(n->dev->gro_flush_timeout);
if (timeout)
ret = false;
}
if (n->gro_bitmask) {
/* When the NAPI instance uses a timeout and keeps postponing
* it, we need to bound somehow the time packets are kept in
* the GRO layer
*/
napi_gro_flush(n, !!timeout);
}
gro_normal_list(n);
if (unlikely(!list_empty(&n->poll_list))) {
/* If n->poll_list is not empty, we need to mask irqs */
local_irq_save(flags);
list_del_init(&n->poll_list);
local_irq_restore(flags);
}
do {
val = READ_ONCE(n->state);
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
NAPIF_STATE_SCHED_THREADED |
NAPIF_STATE_PREFER_BUSY_POLL);
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
* This C code was suggested by Alexander Duyck to help gcc.
*/
new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
NAPIF_STATE_SCHED;
} while (cmpxchg(&n->state, val, new) != val);
if (unlikely(val & NAPIF_STATE_MISSED)) {
__napi_schedule(n);
return false;
}
if (timeout)
hrtimer_start(&n->timer, ns_to_ktime(timeout),
HRTIMER_MODE_REL_PINNED);
return ret;
}
EXPORT_SYMBOL(napi_complete_done);
/* must be called under rcu_read_lock(), as we dont take a reference */
static struct napi_struct *napi_by_id(unsigned int napi_id)
{
unsigned int hash = napi_id % HASH_SIZE(napi_hash);
struct napi_struct *napi;
hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
if (napi->napi_id == napi_id)
return napi;
return NULL;
}
#if defined(CONFIG_NET_RX_BUSY_POLL)
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
{
if (!skip_schedule) {
gro_normal_list(napi);
__napi_schedule(napi);
return;
}
if (napi->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
napi_gro_flush(napi, HZ >= 1000);
}
gro_normal_list(napi);
clear_bit(NAPI_STATE_SCHED, &napi->state);
}
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
u16 budget)
{
bool skip_schedule = false;
unsigned long timeout;
int rc;
/* Busy polling means there is a high chance device driver hard irq
* could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
* set in napi_schedule_prep().
* Since we are about to call napi->poll() once more, we can safely
* clear NAPI_STATE_MISSED.
*
* Note: x86 could use a single "lock and ..." instruction
* to perform these two clear_bit()
*/
clear_bit(NAPI_STATE_MISSED, &napi->state);
clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
local_bh_disable();
if (prefer_busy_poll) {
napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
timeout = READ_ONCE(napi->dev->gro_flush_timeout);
if (napi->defer_hard_irqs_count && timeout) {
hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
skip_schedule = true;
}
}
/* All we really want here is to re-enable device interrupts.
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/
rc = napi->poll(napi, budget);
/* We can't gro_normal_list() here, because napi->poll() might have
* rearmed the napi (napi_complete_done()) in which case it could
* already be running on another CPU.
*/
trace_napi_poll(napi, rc, budget);
netpoll_poll_unlock(have_poll_lock);
if (rc == budget)
__busy_poll_stop(napi, skip_schedule);
local_bh_enable();
}
void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
void *have_poll_lock = NULL;
struct napi_struct *napi;
restart:
napi_poll = NULL;
rcu_read_lock();
napi = napi_by_id(napi_id);
if (!napi)
goto out;
preempt_disable();
for (;;) {
int work = 0;
local_bh_disable();
if (!napi_poll) {
unsigned long val = READ_ONCE(napi->state);
/* If multiple threads are competing for this napi,
* we avoid dirtying napi->state as much as we can.
*/
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
NAPIF_STATE_IN_BUSY_POLL)) {
if (prefer_busy_poll)
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
}
if (cmpxchg(&napi->state, val,
val | NAPIF_STATE_IN_BUSY_POLL |
NAPIF_STATE_SCHED) != val) {
if (prefer_busy_poll)
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
}
have_poll_lock = netpoll_poll_lock(napi);
napi_poll = napi->poll;
}
work = napi_poll(napi, budget);
trace_napi_poll(napi, work, budget);
gro_normal_list(napi);
count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
LINUX_MIB_BUSYPOLLRXPACKETS, work);
local_bh_enable();
if (!loop_end || loop_end(loop_end_arg, start_time))
break;
if (unlikely(need_resched())) {
if (napi_poll)
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
rcu_read_unlock();
cond_resched();
if (loop_end(loop_end_arg, start_time))
return;
goto restart;
}
cpu_relax();
}
if (napi_poll)
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
out:
rcu_read_unlock();
}
EXPORT_SYMBOL(napi_busy_loop);
#endif /* CONFIG_NET_RX_BUSY_POLL */
static void napi_hash_add(struct napi_struct *napi)
{
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
return;
spin_lock(&napi_hash_lock);
/* 0..NR_CPUS range is reserved for sender_cpu use */
do {
if (unlikely(++napi_gen_id < MIN_NAPI_ID))
napi_gen_id = MIN_NAPI_ID;
} while (napi_by_id(napi_gen_id));
napi->napi_id = napi_gen_id;
hlist_add_head_rcu(&napi->napi_hash_node,
&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
spin_unlock(&napi_hash_lock);
}
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
*/
static void napi_hash_del(struct napi_struct *napi)
{
spin_lock(&napi_hash_lock);
hlist_del_init_rcu(&napi->napi_hash_node);
spin_unlock(&napi_hash_lock);
}
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
struct napi_struct *napi;
napi = container_of(timer, struct napi_struct, timer);
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
if (!napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
__napi_schedule_irqoff(napi);
}
return HRTIMER_NORESTART;
}
static void init_gro_hash(struct napi_struct *napi)
{
int i;
for (i = 0; i < GRO_HASH_BUCKETS; i++) {
INIT_LIST_HEAD(&napi->gro_hash[i].list);
napi->gro_hash[i].count = 0;
}
napi->gro_bitmask = 0;
}
int dev_set_threaded(struct net_device *dev, bool threaded)
{
struct napi_struct *napi;
int err = 0;
if (dev->threaded == threaded)
return 0;
if (threaded) {
list_for_each_entry(napi, &dev->napi_list, dev_list) {
if (!napi->thread) {
err = napi_kthread_create(napi);
if (err) {
threaded = false;
break;
}
}
}
}
dev->threaded = threaded;
/* Make sure kthread is created before THREADED bit
* is set.
*/
smp_mb__before_atomic();
/* Setting/unsetting threaded mode on a napi might not immediately
* take effect, if the current napi instance is actively being
* polled. In this case, the switch between threaded mode and
* softirq mode will happen in the next round of napi_schedule().
* This should not cause hiccups/stalls to the live traffic.
*/
list_for_each_entry(napi, &dev->napi_list, dev_list) {
if (threaded)
set_bit(NAPI_STATE_THREADED, &napi->state);
else
clear_bit(NAPI_STATE_THREADED, &napi->state);
}
return err;
}
EXPORT_SYMBOL(dev_set_threaded);
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
return;
INIT_LIST_HEAD(&napi->poll_list);
INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
init_gro_hash(napi);
napi->skb = NULL;
INIT_LIST_HEAD(&napi->rx_list);
napi->rx_count = 0;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
netdev_err_once(dev, "%s() called with weight %d\n", __func__,
weight);
napi->weight = weight;
napi->dev = dev;
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
napi_hash_add(napi);
/* Create kthread for this napi if dev->threaded is set.
* Clear dev->threaded if kthread creation failed so that
* threaded mode will not be enabled in napi_enable().
*/
if (dev->threaded && napi_kthread_create(napi))
dev->threaded = 0;
}
EXPORT_SYMBOL(netif_napi_add);
void napi_disable(struct napi_struct *n)
{
might_sleep();
set_bit(NAPI_STATE_DISABLE, &n->state);
while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
msleep(1);
while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
msleep(1);
hrtimer_cancel(&n->timer);
clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state);
clear_bit(NAPI_STATE_THREADED, &n->state);
}
EXPORT_SYMBOL(napi_disable);
/**
* napi_enable - enable NAPI scheduling
* @n: NAPI context
*
* Resume NAPI from being scheduled on this context.
* Must be paired with napi_disable.
*/
void napi_enable(struct napi_struct *n)
{
unsigned long val, new;
do {
val = READ_ONCE(n->state);
BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
if (n->dev->threaded && n->thread)
new |= NAPIF_STATE_THREADED;
} while (cmpxchg(&n->state, val, new) != val);
}
EXPORT_SYMBOL(napi_enable);
static void flush_gro_hash(struct napi_struct *napi)
{
int i;
for (i = 0; i < GRO_HASH_BUCKETS; i++) {
struct sk_buff *skb, *n;
list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
kfree_skb(skb);
napi->gro_hash[i].count = 0;
}
}
/* Must be called in process context */
void __netif_napi_del(struct napi_struct *napi)
{
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
return;
napi_hash_del(napi);
list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
flush_gro_hash(napi);
napi->gro_bitmask = 0;
if (napi->thread) {
kthread_stop(napi->thread);
napi->thread = NULL;
}
}
EXPORT_SYMBOL(__netif_napi_del);
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
int work, weight;
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the ->poll() call. Therefore we avoid
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
trace_napi_poll(n, work, weight);
}
if (unlikely(work > weight))
pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
n->poll, work, weight);
if (likely(work < weight))
return work;
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still "owns" the NAPI instance and therefore can
* move the instance around on the list at-will.
*/
if (unlikely(napi_disable_pending(n))) {
napi_complete(n);
return work;
}
/* The NAPI context has more processing work, but busy-polling
* is preferred. Exit early.
*/
if (napi_prefer_busy_poll(n)) {
if (napi_complete_done(n, work)) {
/* If timeout is not set, we need to make sure
* that the NAPI is re-scheduled.
*/
napi_schedule(n);
}
return work;
}
if (n->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
napi_gro_flush(n, HZ >= 1000);
}
gro_normal_list(n);
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
*/
if (unlikely(!list_empty(&n->poll_list))) {
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
n->dev ? n->dev->name : "backlog");
return work;
}
*repoll = true;
return work;
}
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
bool do_repoll = false;
void *have;
int work;
list_del_init(&n->poll_list);
have = netpoll_poll_lock(n);
work = __napi_poll(n, &do_repoll);
if (do_repoll)
list_add_tail(&n->poll_list, repoll);
netpoll_poll_unlock(have);
return work;
}
static int napi_thread_wait(struct napi_struct *napi)
{
bool woken = false;
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
/* Testing SCHED_THREADED bit here to make sure the current
* kthread owns this napi and could poll on this napi.
* Testing SCHED bit is not enough because SCHED bit might be
* set by some other busy poll thread or by napi_disable().
*/
if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
WARN_ON(!list_empty(&napi->poll_list));
__set_current_state(TASK_RUNNING);
return 0;
}
schedule();
/* woken being true indicates this thread owns this napi. */
woken = true;
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
return -1;
}
static int napi_threaded_poll(void *data)
{
struct napi_struct *napi = data;
void *have;
while (!napi_thread_wait(napi)) {
for (;;) {
bool repoll = false;
local_bh_disable();
have = netpoll_poll_lock(napi);
__napi_poll(napi, &repoll);
netpoll_poll_unlock(have);
local_bh_enable();
if (!repoll)
break;
cond_resched();
}
}
return 0;
}
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(netdev_budget_usecs);
int budget = netdev_budget;
LIST_HEAD(list);
LIST_HEAD(repoll);
local_irq_disable();
list_splice_init(&sd->poll_list, &list);
local_irq_enable();
for (;;) {
struct napi_struct *n;
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
return;
break;
}
n = list_first_entry(&list, struct napi_struct, poll_list);
budget -= napi_poll(n, &repoll);
/* If softirq window is exhausted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
sd->time_squeeze++;
break;
}
}
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
list_splice_tail(&repoll, &list);
list_splice(&list, &sd->poll_list);
if (!list_empty(&sd->poll_list))
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
}
struct netdev_adjacent {
struct net_device *dev;
/* upper master flag, there can only be one master device per list */
bool master;
/* lookup ignore flag */
bool ignore;
/* counter for the number of times this device was added to us */
u16 ref_nr;
/* private field for the users */
void *private;
struct list_head list;
struct rcu_head rcu;
};
static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
struct list_head *adj_list)
{
struct netdev_adjacent *adj;
list_for_each_entry(adj, adj_list, list) {
if (adj->dev == adj_dev)
return adj;
}
return NULL;
}
static int ____netdev_has_upper_dev(struct net_device *upper_dev,
struct netdev_nested_priv *priv)
{
struct net_device *dev = (struct net_device *)priv->data;
return upper_dev == dev;
}
/**
* netdev_has_upper_dev - Check if device is linked to an upper device
* @dev: device
* @upper_dev: upper device to check
*
* Find out if a device is linked to specified upper device and return true
* in case it is. Note that this checks only immediate upper device,
* not through a complete stack of devices. The caller must hold the RTNL lock.
*/
bool netdev_has_upper_dev(struct net_device *dev,
struct net_device *upper_dev)
{
struct netdev_nested_priv priv = {
.data = (void *)upper_dev,
};
ASSERT_RTNL();
return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
&priv);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
/**
* netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
* @dev: device
* @upper_dev: upper device to check
*
* Find out if a device is linked to specified upper device and return true
* in case it is. Note that this checks the entire upper device chain.
* The caller must hold rcu lock.
*/
bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
struct net_device *upper_dev)
{
struct netdev_nested_priv priv = {
.data = (void *)upper_dev,
};
return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
&priv);
}
EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
/**
* netdev_has_any_upper_dev - Check if device is linked to some device
* @dev: device
*
* Find out if a device is linked to an upper device and return true in case
* it is. The caller must hold the RTNL lock.
*/
bool netdev_has_any_upper_dev(struct net_device *dev)
{
ASSERT_RTNL();
return !list_empty(&dev->adj_list.upper);
}
EXPORT_SYMBOL(netdev_has_any_upper_dev);
/**
* netdev_master_upper_dev_get - Get master upper device
* @dev: device
*
* Find a master upper device and return pointer to it or NULL in case
* it's not there. The caller must hold the RTNL lock.
*/
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
struct netdev_adjacent *upper;
ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper))
return NULL;
upper = list_first_entry(&dev->adj_list.upper,
struct netdev_adjacent, list);
if (likely(upper->master))
return upper->dev;
return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);
static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
{
struct netdev_adjacent *upper;
ASSERT_RTNL();
if (list_empty(&dev->adj_list.upper))
return NULL;
upper = list_first_entry(&dev->adj_list.upper,
struct netdev_adjacent, list);
if (likely(upper->master) && !upper->ignore)
return upper->dev;
return NULL;
}
/**
* netdev_has_any_lower_dev - Check if device is linked to some device
* @dev: device
*
* Find out if a device is linked to a lower device and return true in case
* it is. The caller must hold the RTNL lock.
*/
static bool netdev_has_any_lower_dev(struct net_device *dev)
{
ASSERT_RTNL();
return !list_empty(&dev->adj_list.lower);
}
void *netdev_adjacent_get_private(struct list_head *adj_list)
{
struct netdev_adjacent *adj;
adj = list_entry(adj_list, struct netdev_adjacent, list);
return adj->private;
}
EXPORT_SYMBOL(netdev_adjacent_get_private);
/**
* netdev_upper_get_next_dev_rcu - Get the next dev from upper list
* @dev: device
* @iter: list_head ** of the current position
*
* Gets the next device from the dev's upper list, starting from iter
* position. The caller must hold RCU read lock.
*/
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
struct list_head **iter)
{
struct netdev_adjacent *upper;
WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
if (&upper->list == &dev->adj_list.upper)
return NULL;
*iter = &upper->list;
return upper->dev;
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
struct list_head **iter,
bool *ignore)
{
struct netdev_adjacent *upper;
upper = list_entry((*iter)->next, struct netdev_adjacent, list);
if (&upper->list == &dev->adj_list.upper)
return NULL;
*iter = &upper->list;
*ignore = upper->ignore;
return upper->dev;
}
static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
struct list_head **iter)
{
struct netdev_adjacent *upper;
WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
if (&upper->list == &dev->adj_list.upper)
return NULL;
*iter = &upper->list;
return upper->dev;
}
static int __netdev_walk_all_upper_dev(struct net_device *dev,
int (*fn)(struct net_device *dev,
struct netdev_nested_priv *priv),
struct netdev_nested_priv *priv)
{
struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
int ret, cur = 0;
bool ignore;
now = dev;
iter = &dev->adj_list.upper;
while (1) {
if (now != dev) {
ret = fn(now, priv);
if (ret)
return ret;
}
next = NULL;
while (1) {
udev = __netdev_next_upper_dev(now, &iter, &ignore);
if (!udev)
break;
if (ignore)
continue;
next = udev;
niter = &udev->adj_list.upper;
dev_stack[cur] = now;
iter_stack[cur++] = iter;
break;
}
if (!next) {
if (!cur)
return 0;
next = dev_stack[--cur];
niter = iter_stack[cur];
}
now = next;
iter = niter;
}
return 0;
}
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
int (*fn)(struct net_device *dev,
struct netdev_nested_priv *priv),
struct netdev_nested_priv *priv)
{
struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
int ret, cur = 0;
now = dev;
iter = &dev->adj_list.upper;
while (1) {
if (now != dev) {
ret = fn(now, priv);
if (ret)
return ret;
}
next = NULL;
while (1) {
udev = netdev_next_upper_dev_rcu(now, &iter);
if (!udev)
break;
next = udev;
niter = &udev->adj_list.upper;
dev_stack[cur] = now;
iter_stack[cur++] = iter;
break;
}
if (!next) {
if (!cur)
return 0;
next = dev_stack[--cur];
niter = iter_stack[cur];
}
now = next;
iter = niter;
}
return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
static bool __netdev_has_upper_dev(struct net_device *dev,
struct net_device *upper_dev)
{
struct netdev_nested_priv priv = {
.flags = 0,
.data = (void *)upper_dev,
};
ASSERT_RTNL();
return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
&priv);
}
/**
* netdev_lower_get_next_private - Get the next ->private from the
* lower neighbour list
* @dev: device
* @iter: list_head ** of the current position
*
* Gets the next netdev_adjacent->private from the dev's lower neighbour
* list, starting from iter position. The caller must hold either hold the
* RTNL lock or its own locking that guarantees that the neighbour lower
* list will remain unchanged.
*/
void *netdev_lower_get_next_private(struct net_device *dev,
struct list_head **iter)
{
struct netdev_adjacent *lower;
lower = list_entry(*iter, struct netdev_adjacent, list);
if (&lower->list == &dev->adj_list.lower)
return NULL;
*iter = lower->list.next;
return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private);
/**
* netdev_lower_get_next_private_rcu - Get the next ->private from the
* lower neighbour list, RCU
* variant
* @dev: device
* @iter: list_head ** of the current position
*
* Gets the next netdev_adjacent->private from the dev's lower neighbour
* list, starting from iter position. The caller must hold RCU read lock.
*/
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
struct list_head **iter)
{
struct netdev_adjacent *lower;
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
if (&lower->list == &dev->adj_list.lower)
return NULL;
*iter = &lower->list;
return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
/**
* netdev_lower_get_next - Get the next device from the lower neighbour
* list
* @dev: device
* @iter: list_head ** of the current position
*
* Gets the next netdev_adjacent from the dev's lower neighbour
* list, starting from iter position. The caller must hold RTNL lock or
* its own locking that guarantees that the neighbour lower
* list will remain unchanged.
*/
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
struct netdev_adjacent *lower;
lower = list_entry(*iter, struct netdev_adjacent, list);
if (&lower->list == &dev->adj_list.lower)
return NULL;
*iter = lower->list.next;
return lower->dev;
}
EXPORT_SYMBOL(netdev_lower_get_next);
static struct net_device *netdev_next_lower_dev(struct net_device *dev,
struct list_head **iter)
{
struct netdev_adjacent *lower;
lower = list_entry((*iter)->next, struct netdev_adjacent, list);
if (&lower->list == &dev->adj_list.lower)
return NULL;
*iter = &lower->list;
return lower->dev;
}
static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
struct list_head **iter,
bool *ignore)
{
struct netdev_adjacent *lower;
lower = list_entry((*iter)->next, struct netdev_adjacent, list);
if (&lower->list == &dev->adj_list.lower)
return NULL;
*iter = &lower->list;
*ignore = lower->ignore;
return lower->dev;
}
int netdev_walk_all_lower_dev(struct net_device *dev,
int (*fn)(struct net_device *dev,
struct netdev_nested_priv *priv),
struct netdev_nested_priv *priv)
{
struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
int ret, cur = 0;
now = dev;
iter = &dev->adj_list.lower;
while (1) {
if (now != dev) {
ret = fn(now, priv);
if (ret)
return ret;
}
next = NULL;
while (1) {
ldev = netdev_next_lower_dev(now, &iter);
if (!ldev)
break;
next = ldev;
niter = &ldev->adj_list.lower;
dev_stack[cur] = now;
iter_stack[cur++] = iter;
break;
}
if (!next) {
if (!cur)
return 0;
next = dev_stack[--cur];
niter = iter_stack[cur];
}
now = next;
iter = niter;
}
return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
static int __netdev_walk_all_lower_dev(struct net_device *dev,
int (*fn)(struct net_device *dev,
struct netdev_nested_priv *priv),
struct netdev_nested_priv *priv)
{
struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
int ret, cur = 0;
bool ignore;
now = dev;
iter = &dev->adj_list.lower;
while (1) {
if (now != dev) {
ret = fn(now, priv);
if (ret)
return ret;
}
next = NULL;
while (1) {
ldev = __netdev_next_lower_dev(now, &iter, &ignore);
if (!ldev)
break;
if (ignore)
continue;
next = ldev;
niter = &ldev->adj_list.lower;
dev_stack[cur] = now;
iter_stack[cur++] = iter;
break;
}
if (!next) {
if (!cur)
return 0;
next = dev_stack[--cur];
niter = iter_stack[cur];
}
now = next;
iter = niter;
}
return 0;
}
struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
struct list_head **iter)
{
struct netdev_adjacent *lower;
lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
if (&lower->list == &dev->adj_list.lower)
return NULL;
*iter = &lower->list;
return lower->dev;
}
EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
static u8 __netdev_upper_depth(struct net_device *dev)
{
struct net_device *udev;
struct list_head *iter;
u8 max_depth = 0;
bool ignore;
for (iter = &dev->adj_list.upper,
udev = __netdev_next_upper_dev(dev, &iter, &ignore);
udev;
udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
if (ignore)
continue;
if (max_depth < udev->upper_level)
max_depth = udev->upper_level;
}
return max_depth;
}
static u8 __netdev_lower_depth(struct net_device *dev)
{
struct net_device *ldev;
struct list_head *iter;
u8 max_depth = 0;
bool ignore;
for (iter = &dev->adj_list.lower,
ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
ldev;
ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
if (ignore)
continue;
if (max_depth < ldev->lower_level)
max_depth = ldev->lower_level;
}
return max_depth;
}
static int __netdev_update_upper_level(struct net_device *dev,
struct netdev_nested_priv *__unused)
{
dev->upper_level = __netdev_upper_depth(dev) + 1;
return 0;
}
static int __netdev_update_lower_level(struct net_device *dev,
struct netdev_nested_priv *priv)
{
dev->lower_level = __netdev_lower_depth(dev) + 1;
#ifdef CONFIG_LOCKDEP
if (!priv)
return 0;
if (priv->flags & NESTED_SYNC_IMM)
dev->nested_level = dev->lower_level - 1;
if (priv->flags & NESTED_SYNC_TODO)
net_unlink_todo(dev);
#endif
return 0;
}
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
int (*fn)(struct net_device *dev,
struct netdev_nested_priv *priv),
struct netdev_nested_priv *priv)
{
struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
int ret, cur = 0;
now = dev;
iter = &dev->adj_list.lower;
while (1) {
if (now != dev) {
ret = fn(now, priv);
if (ret)
return ret;
}
next = NULL;
while (1) {
ldev = netdev_next_lower_dev_rcu(now, &iter);
if (!ldev)
break;
next = ldev;
niter = &ldev->adj_list.lower;
dev_stack[cur] = now;
iter_stack[cur++] = iter;
break;
}
if (!next) {
if (!cur)
return 0;
next = dev_stack[--cur];
niter = iter_stack[cur];
}
now = next;
iter = niter;
}
return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
/**
* netdev_lower_get_first_private_rcu - Get the first ->private from the
* lower neighbour list, RCU
* variant
* @dev: device
*
* Gets the first netdev_adjacent->private from the dev's lower neighbour
* list. The caller must hold RCU read lock.
*/
void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
struct netdev_adjacent *lower;
lower = list_first_or_null_rcu(&dev->adj_list.lower,
struct netdev_adjacent, list);
if (lower)
return lower->private;
return NULL;
}
EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
/**
* netdev_master_upper_dev_get_rcu - Get master upper device
* @dev: device
*
* Find a master upper device and return pointer to it or NULL in case
* it's not there. The caller must hold the RCU read lock.
*/
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
struct netdev_adjacent *upper;
upper = list_first_or_null_rcu(&dev->adj_list.upper,
struct netdev_adjacent, list);
if (upper && likely(upper->master))
return upper->dev;
return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
static int netdev_adjacent_sysfs_add(struct net_device *dev,
struct net_device *adj_dev,
struct list_head *dev_list)
{
char linkname[IFNAMSIZ+7];
sprintf(linkname, dev_list == &dev->adj_list.upper ?
"upper_%s" : "lower_%s", adj_dev->name);
return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
linkname);
}
static void netdev_adjacent_sysfs_del(struct net_device *dev,
char *name,
struct list_head *dev_list)
{
char linkname[IFNAMSIZ+7];
sprintf(linkname, dev_list == &dev->adj_list.upper ?
"upper_%s" : "lower_%s", name);
sysfs_remove_link(&(dev->dev.kobj), linkname);
}
static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
struct net_device *adj_dev,
struct list_head *dev_list)
{
return (dev_list == &dev->adj_list.upper ||
dev_list == &dev->adj_list.lower) &&
net_eq(dev_net(dev), dev_net(adj_dev));
}
static int __netdev_adjacent_dev_insert(struct net_device *dev,
struct net_device *adj_dev,
struct list_head *dev_list,
void *private, bool master)
{
struct netdev_adjacent *adj;
int ret;
adj = __netdev_find_adj(adj_dev, dev_list);
if (adj) {
adj->ref_nr += 1;
pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
dev->name, adj_dev->name, adj->ref_nr);
return 0;
}
adj = kmalloc(sizeof(*adj), GFP_KERNEL);
if (!adj)
return -ENOMEM;
adj->dev = adj_dev;
adj->master = master;
adj->ref_nr = 1;
adj->private = private;
adj->ignore = false;
dev_hold(adj_dev);
pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
if (ret)
goto free_adj;
}
/* Ensure that master link is always the first item in list. */
if (master) {
ret = sysfs_create_link(&(dev->dev.kobj),
&(adj_dev->dev.kobj), "master");
if (ret)
goto remove_symlinks;
list_add_rcu(&adj->list, dev_list);
} else {
list_add_tail_rcu(&adj->list, dev_list);
}
return 0;
remove_symlinks:
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
kfree(adj);
dev_put(adj_dev);
return ret;
}
static void __netdev_adjacent_dev_remove(struct net_device *dev,
struct net_device *adj_dev,
u16 ref_nr,
struct list_head *dev_list)
{
struct netdev_adjacent *adj;
pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
dev->name, adj_dev->name, ref_nr);
adj = __netdev_find_adj(adj_dev, dev_list);
if (!adj) {
pr_err("Adjacency does not exist for device %s from %s\n",
dev->name, adj_dev->name);
WARN_ON(1);
return;
}
if (adj->ref_nr > ref_nr) {
pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
dev->name, adj_dev->name, ref_nr,
adj->ref_nr - ref_nr);
adj->ref_nr -= ref_nr;
return;
}
if (adj->master)
sysfs_remove_link(&(dev->dev.kobj), "master");
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
list_del_rcu(&adj->list);
pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
dev_put(adj_dev);
kfree_rcu(adj, rcu);
}
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
struct net_device *upper_dev,
struct list_head *up_list,
struct list_head *down_list,
void *private, bool master)
{
int ret;
ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
private, master);
if (ret)
return ret;
ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
private, false);
if (ret) {
__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
return ret;
}
return 0;
}
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
struct net_device *upper_dev,
u16 ref_nr,
struct list_head *up_list,
struct list_head *down_list)
{
__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
struct net_device *upper_dev,
void *private, bool master)
{
return __netdev_adjacent_dev_link_lists(dev, upper_dev,
&dev->adj_list.upper,
&upper_dev->adj_list.lower,
private, master);
}
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
struct net_device *upper_dev)
{
__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
&dev->adj_list.upper,
&upper_dev->adj_list.lower);
}
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
void *upper_priv, void *upper_info,
struct netdev_nested_priv *priv,
struct netlink_ext_ack *extack)
{
struct netdev_notifier_changeupper_info changeupper_info = {
.info = {
.dev = dev,
.extack = extack,
},
.upper_dev = upper_dev,
.master = master,
.linking = true,
.upper_info = upper_info,
};
struct net_device *master_dev;
int ret = 0;
ASSERT_RTNL();
if (dev == upper_dev)
return -EBUSY;
/* To prevent loops, check if dev is not upper device to upper_dev. */
if (__netdev_has_upper_dev(upper_dev, dev))
return -EBUSY;
if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
return -EMLINK;
if (!master) {
if (__netdev_has_upper_dev(dev, upper_dev))
return -EEXIST;
} else {
master_dev = __netdev_master_upper_dev_get(dev);
if (master_dev)
return master_dev == upper_dev ? -EEXIST : -EBUSY;
}
ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
&changeupper_info.info);
ret = notifier_to_errno(ret);
if (ret)
return ret;
ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
master);
if (ret)
return ret;
ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
ret = notifier_to_errno(ret);
if (ret)
goto rollback;
__netdev_update_upper_level(dev, NULL);
__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
__netdev_update_lower_level(upper_dev, priv);
__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
priv);
return 0;
rollback:
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
return ret;
}
/**
* netdev_upper_dev_link - Add a link to the upper device
* @dev: device
* @upper_dev: new upper device
* @extack: netlink extended ack
*
* Adds a link to device which is upper to this one. The caller must hold
* the RTNL lock. On a failure a negative errno code is returned.
* On success the reference counts are adjusted and the function
* returns zero.
*/
int netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev,
struct netlink_ext_ack *extack)
{
struct netdev_nested_priv priv = {
.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
.data = NULL,
};
return __netdev_upper_dev_link(dev, upper_dev, false,
NULL, NULL, &priv, extack);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
/**
* netdev_master_upper_dev_link - Add a master link to the upper device
* @dev: device
* @upper_dev: new upper device
* @upper_priv: upper device private
* @upper_info: upper info to be passed down via notifier
* @extack: netlink extended ack
*
* Adds a link to device which is upper to this one. In this case, only
* one master upper device can be linked, although other non-master devices
* might be linked as well. The caller must hold the RTNL lock.
* On a failure a negative errno code is returned. On success the reference
* counts are adjusted and the function returns zero.
*/
int netdev_master_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev,
void *upper_priv, void *upper_info,
struct netlink_ext_ack *extack)
{
struct netdev_nested_priv priv = {
.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
.data = NULL,
};
return __netdev_upper_dev_link(dev, upper_dev, true,
upper_priv, upper_info, &priv, extack);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
static void __netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev,
struct netdev_nested_priv *priv)
{
struct netdev_notifier_changeupper_info changeupper_info = {
.info = {
.dev = dev,
},
.upper_dev = upper_dev,
.linking = false,
};
ASSERT_RTNL();
changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
&changeupper_info.info);
__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
__netdev_update_upper_level(dev, NULL);
__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
__netdev_update_lower_level(upper_dev, priv);
__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
priv);
}
/**
* netdev_upper_dev_unlink - Removes a link to upper device
* @dev: device
* @upper_dev: new upper device
*
* Removes a link to device which is upper to this one. The caller must hold
* the RTNL lock.
*/
void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev)
{
struct netdev_nested_priv priv = {
.flags = NESTED_SYNC_TODO,
.data = NULL,
};
__netdev_upper_dev_unlink(dev, upper_dev, &priv);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
struct net_device *lower_dev,
bool val)
{
struct netdev_adjacent *adj;
adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
if (adj)
adj->ignore = val;
adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
if (adj)
adj->ignore = val;
}
static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
struct net_device *lower_dev)
{
__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
}
static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
struct net_device *lower_dev)
{
__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
}
int netdev_adjacent_change_prepare(struct net_device *old_dev,
struct net_device *new_dev,
struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct netdev_nested_priv priv = {
.flags = 0,
.data = NULL,
};
int err;
if (!new_dev)
return 0;
if (old_dev && new_dev != old_dev)
netdev_adjacent_dev_disable(dev, old_dev);
err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
extack);
if (err) {
if (old_dev && new_dev != old_dev)
netdev_adjacent_dev_enable(dev, old_dev);
return err;
}
return 0;
}
EXPORT_SYMBOL(netdev_adjacent_change_prepare);
void netdev_adjacent_change_commit(struct net_device *old_dev,
struct net_device *new_dev,
struct net_device *dev)
{
struct netdev_nested_priv priv = {
.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
.data = NULL,
};
if (!new_dev || !old_dev)
return;
if (new_dev == old_dev)
return;
netdev_adjacent_dev_enable(dev, old_dev);
__netdev_upper_dev_unlink(old_dev, dev, &priv);
}
EXPORT_SYMBOL(netdev_adjacent_change_commit);
void netdev_adjacent_change_abort(struct net_device *old_dev,
struct net_device *new_dev,
struct net_device *dev)
{
struct netdev_nested_priv priv = {
.flags = 0,
.data = NULL,
};
if (!new_dev)
return;
if (old_dev && new_dev != old_dev)
netdev_adjacent_dev_enable(dev, old_dev);
__netdev_upper_dev_unlink(new_dev, dev, &priv);
}
EXPORT_SYMBOL(netdev_adjacent_change_abort);
/**
* netdev_bonding_info_change - Dispatch event about slave change
* @dev: device
* @bonding_info: info to dispatch
*
* Send NETDEV_BONDING_INFO to netdev notifiers with info.
* The caller must hold the RTNL lock.
*/
void netdev_bonding_info_change(struct net_device *dev,
struct netdev_bonding_info *bonding_info)
{
struct netdev_notifier_bonding_info info = {
.info.dev = dev,
};
memcpy(&info.bonding_info, bonding_info,
sizeof(struct netdev_bonding_info));
call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
&info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);
/**
* netdev_get_xmit_slave - Get the xmit slave of master device
* @dev: device
* @skb: The packet
* @all_slaves: assume all the slaves are active
*
* The reference counters are not incremented so the caller must be
* careful with locks. The caller must hold RCU lock.
* %NULL is returned if no slave is found.
*/
struct net_device *netdev_get_xmit_slave(struct net_device *dev,
struct sk_buff *skb,
bool all_slaves)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (!ops->ndo_get_xmit_slave)
return NULL;
return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
}
EXPORT_SYMBOL(netdev_get_xmit_slave);
static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
struct sock *sk)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (!ops->ndo_sk_get_lower_dev)
return NULL;
return ops->ndo_sk_get_lower_dev(dev, sk);
}
/**
* netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
* @dev: device
* @sk: the socket
*
* %NULL is returned if no lower device is found.
*/
struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
struct sock *sk)
{
struct net_device *lower;
lower = netdev_sk_get_lower_dev(dev, sk);
while (lower) {
dev = lower;
lower = netdev_sk_get_lower_dev(dev, sk);
}
return dev;
}
EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
static void netdev_adjacent_add_links(struct net_device *dev)
{
struct netdev_adjacent *iter;
struct net *net = dev_net(dev);
list_for_each_entry(iter, &dev->adj_list.upper, list) {
if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_add(iter->dev, dev,
&iter->dev->adj_list.lower);
netdev_adjacent_sysfs_add(dev, iter->dev,
&dev->adj_list.upper);
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_add(iter->dev, dev,
&iter->dev->adj_list.upper);
netdev_adjacent_sysfs_add(dev, iter->dev,
&dev->adj_list.lower);
}
}
static void netdev_adjacent_del_links(struct net_device *dev)
{
struct netdev_adjacent *iter;
struct net *net = dev_net(dev);
list_for_each_entry(iter, &dev->adj_list.upper, list) {
if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, dev->name,
&iter->dev->adj_list.lower);
netdev_adjacent_sysfs_del(dev, iter->dev->name,
&dev->adj_list.upper);
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, dev->name,
&iter->dev->adj_list.upper);
netdev_adjacent_sysfs_del(dev, iter->dev->name,
&dev->adj_list.lower);
}
}
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
struct netdev_adjacent *iter;
struct net *net = dev_net(dev);
list_for_each_entry(iter, &dev->adj_list.upper, list) {
if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, oldname,
&iter->dev->adj_list.lower);
netdev_adjacent_sysfs_add(iter->dev, dev,
&iter->dev->adj_list.lower);
}
list_for_each_entry(iter, &dev->adj_list.lower, list) {
if (!net_eq(net, dev_net(iter->dev)))
continue;
netdev_adjacent_sysfs_del(iter->dev, oldname,
&iter->dev->adj_list.upper);
netdev_adjacent_sysfs_add(iter->dev, dev,
&iter->dev->adj_list.upper);
}
}
void *netdev_lower_dev_get_private(struct net_device *dev,
struct net_device *lower_dev)
{
struct netdev_adjacent *lower;
if (!lower_dev)
return NULL;
lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
if (!lower)
return NULL;
return lower->private;
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);
/**
* netdev_lower_state_changed - Dispatch event about lower device state change
* @lower_dev: device
* @lower_state_info: state to dispatch
*
* Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
* The caller must hold the RTNL lock.
*/
void netdev_lower_state_changed(struct net_device *lower_dev,
void *lower_state_info)
{
struct netdev_notifier_changelowerstate_info changelowerstate_info = {
.info.dev = lower_dev,
};
ASSERT_RTNL();
changelowerstate_info.lower_state_info = lower_state_info;
call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
&changelowerstate_info.info);
}
EXPORT_SYMBOL(netdev_lower_state_changed);
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_change_rx_flags)
ops->ndo_change_rx_flags(dev, flags);
}
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags;
kuid_t uid;
kgid_t gid;
ASSERT_RTNL();
dev->flags |= IFF_PROMISC;
dev->promiscuity += inc;
if (dev->promiscuity == 0) {
/*
* Avoid overflow.
* If inc causes overflow, untouch promisc and return error.
*/
if (inc < 0)
dev->flags &= ~IFF_PROMISC;
else {
dev->promiscuity -= inc;
pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
dev->name);
return -EOVERFLOW;
}
}
if (dev->flags != old_flags) {
pr_info("device %s %s promiscuous mode\n",
dev->name,
dev->flags & IFF_PROMISC ? "entered" : "left");
if (audit_enabled) {
current_uid_gid(&uid, &gid);
audit_log(audit_context(), GFP_ATOMIC,
AUDIT_ANOM_PROMISCUOUS,
"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
dev->name, (dev->flags & IFF_PROMISC),
(old_flags & IFF_PROMISC),
from_kuid(&init_user_ns, audit_get_loginuid(current)),
from_kuid(&init_user_ns, uid),
from_kgid(&init_user_ns, gid),
audit_get_sessionid(current));
}
dev_change_rx_flags(dev, IFF_PROMISC);
}
if (notify)
__dev_notify_flags(dev, old_flags, IFF_PROMISC);
return 0;
}
/**
* dev_set_promiscuity - update promiscuity count on a device
* @dev: device
* @inc: modifier
*
* Add or remove promiscuity from a device. While the count in the device
* remains above zero the interface remains promiscuous. Once it hits zero
* the device reverts back to normal filtering operation. A negative inc
* value is used to drop promiscuity on the device.
* Return 0 if successful or a negative errno code on error.
*/
int dev_set_promiscuity(struct net_device *dev, int inc)
{
unsigned int old_flags = dev->flags;
int err;
err = __dev_set_promiscuity(dev, inc, true);
if (err < 0)
return err;
if (dev->flags != old_flags)
dev_set_rx_mode(dev);
return err;
}
EXPORT_SYMBOL(dev_set_promiscuity);
static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
ASSERT_RTNL();
dev->flags |= IFF_ALLMULTI;
dev->allmulti += inc;
if (dev->allmulti == 0) {
/*
* Avoid overflow.
* If inc causes overflow, untouch allmulti and return error.
*/
if (inc < 0)
dev->flags &= ~IFF_ALLMULTI;
else {
dev->allmulti -= inc;
pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
dev->name);
return -EOVERFLOW;
}
}
if (dev->flags ^ old_flags) {
dev_change_rx_flags(dev, IFF_ALLMULTI);
dev_set_rx_mode(dev);
if (notify)
__dev_notify_flags(dev, old_flags,
dev->gflags ^ old_gflags);
}
return 0;
}
/**
* dev_set_allmulti - update allmulti count on a device
* @dev: device
* @inc: modifier
*
* Add or remove reception of all multicast frames to a device. While the
* count in the device remains above zero the interface remains listening
* to all interfaces. Once it hits zero the device reverts back to normal
* filtering operation. A negative @inc value is used to drop the counter
* when releasing a resource needing all multicasts.
* Return 0 if successful or a negative errno code on error.
*/
int dev_set_allmulti(struct net_device *dev, int inc)
{
return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);
/*
* Upload unicast and multicast address lists to device and
* configure RX filtering. When the device doesn't support unicast
* filtering it is put in promiscuous mode while unicast addresses
* are present.
*/
void __dev_set_rx_mode(struct net_device *dev)
{
const struct net_device_ops *ops = dev->netdev_ops;
/* dev_open will call this function so the list will stay sane. */
if (!(dev->flags&IFF_UP))
return;
if (!netif_device_present(dev))
return;
if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
/* Unicast addresses changes may only happen under the rtnl,
* therefore calling __dev_set_promiscuity here is safe.
*/
if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
__dev_set_promiscuity(dev, 1, false);
dev->uc_promisc = true;
} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
__dev_set_promiscuity(dev, -1, false);
dev->uc_promisc = false;
}
}
if (ops->ndo_set_rx_mode)
ops->ndo_set_rx_mode(dev);
}
void dev_set_rx_mode(struct net_device *dev)
{
netif_addr_lock_bh(dev);
__dev_set_rx_mode(dev);
netif_addr_unlock_bh(dev);
}
/**
* dev_get_flags - get flags reported to userspace
* @dev: device
*
* Get the combination of flag bits exported through APIs to userspace.
*/
unsigned int dev_get_flags(const struct net_device *dev)
{
unsigned int flags;
flags = (dev->flags & ~(IFF_PROMISC |
IFF_ALLMULTI |
IFF_RUNNING |
IFF_LOWER_UP |
IFF_DORMANT)) |
(dev->gflags & (IFF_PROMISC |
IFF_ALLMULTI));
if (netif_running(dev)) {
if (netif_oper_up(dev))
flags |= IFF_RUNNING;
if (netif_carrier_ok(dev))
flags |= IFF_LOWER_UP;
if (netif_dormant(dev))
flags |= IFF_DORMANT;
}
return flags;
}
EXPORT_SYMBOL(dev_get_flags);
int __dev_change_flags(struct net_device *dev, unsigned int flags,
struct netlink_ext_ack *extack)
{
unsigned int old_flags = dev->flags;
int ret;
ASSERT_RTNL();
/*
* Set the flags on our device.
*/
dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
IFF_AUTOMEDIA)) |
(dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
IFF_ALLMULTI));
/*
* Load in the correct multicast list now the flags have changed.
*/
if ((old_flags ^ flags) & IFF_MULTICAST)
dev_change_rx_flags(dev, IFF_MULTICAST);
dev_set_rx_mode(dev);
/*
* Have we downed the interface. We handle IFF_UP ourselves
* according to user attempts to set it, rather than blindly
* setting it.
*/
ret = 0;
if ((old_flags ^ flags) & IFF_UP) {
if (old_flags & IFF_UP)
__dev_close(dev);
else
ret = __dev_open(dev, extack);
}
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
unsigned int old_flags = dev->flags;
dev->gflags ^= IFF_PROMISC;
if (__dev_set_promiscuity(dev, inc, false) >= 0)
if (dev->flags != old_flags)
dev_set_rx_mode(dev);
}
/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
* is important. Some (broken) drivers set IFF_PROMISC, when
* IFF_ALLMULTI is requested not asking us and not reporting.
*/
if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
dev->gflags ^= IFF_ALLMULTI;
__dev_set_allmulti(dev, inc, false);
}
return ret;
}
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
unsigned int gchanges)
{
unsigned int changes = dev->flags ^ old_flags;
if (gchanges)
rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
if (changes & IFF_UP) {
if (dev->flags & IFF_UP)
call_netdevice_notifiers(NETDEV_UP, dev);
else
call_netdevice_notifiers(NETDEV_DOWN, dev);
}
if (dev->flags & IFF_UP &&
(changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
struct netdev_notifier_change_info change_info = {
.info = {
.dev = dev,
},
.flags_changed = changes,
};
call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
}
}
/**
* dev_change_flags - change device settings
* @dev: device
* @flags: device state flags
* @extack: netlink extended ack
*
* Change settings on device based state flags. The flags are
* in the userspace exported format.
*/
int dev_change_flags(struct net_device *dev, unsigned int flags,
struct netlink_ext_ack *extack)
{
int ret;
unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
ret = __dev_change_flags(dev, flags, extack);
if (ret < 0)
return ret;
changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
__dev_notify_flags(dev, old_flags, changes);
return ret;
}
EXPORT_SYMBOL(dev_change_flags);
int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_change_mtu)
return ops->ndo_change_mtu(dev, new_mtu);
/* Pairs with all the lockless reads of dev->mtu in the stack */
WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
EXPORT_SYMBOL(__dev_set_mtu);
int dev_validate_mtu(struct net_device *dev, int new_mtu,
struct netlink_ext_ack *extack)
{
/* MTU must be positive, and in range */
if (new_mtu < 0 || new_mtu < dev->min_mtu) {
NL_SET_ERR_MSG(extack, "mtu less than device minimum");
return -EINVAL;
}
if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
return -EINVAL;
}
return 0;
}
/**
* dev_set_mtu_ext - Change maximum transfer unit
* @dev: device
* @new_mtu: new transfer unit
* @extack: netlink extended ack
*
* Change the maximum transfer size of the network device.
*/
int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
struct netlink_ext_ack *extack)
{
int err, orig_mtu;
if (new_mtu == dev->mtu)
return 0;
err = dev_validate_mtu(dev, new_mtu, extack);
if (err)
return err;
if (!netif_device_present(dev))
return -ENODEV;
err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
err = notifier_to_errno(err);
if (err)
return err;
orig_mtu = dev->mtu;
err = __dev_set_mtu(dev, new_mtu);
if (!err) {
err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
orig_mtu);
err = notifier_to_errno(err);
if (err) {
/* setting mtu back and notifying everyone again,
* so that they have a chance to revert changes.
*/
__dev_set_mtu(dev, orig_mtu);
call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
new_mtu);
}
}
return err;
}
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
struct netlink_ext_ack extack;
int err;
memset(&extack, 0, sizeof(extack));
err = dev_set_mtu_ext(dev, new_mtu, &extack);
if (err && extack._msg)
net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
return err;
}
EXPORT_SYMBOL(dev_set_mtu);
/**
* dev_change_tx_queue_len - Change TX queue length of a netdevice
* @dev: device
* @new_len: new tx queue length
*/
int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
unsigned int orig_len = dev->tx_queue_len;
int res;
if (new_len != (unsigned int)new_len)
return -ERANGE;
if (new_len != orig_len) {
dev->tx_queue_len = new_len;
res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
res = notifier_to_errno(res);
if (res)
goto err_rollback;
res = dev_qdisc_change_tx_queue_len(dev);
if (res)
goto err_rollback;
}
return 0;
err_rollback:
netdev_err(dev, "refused to change device tx_queue_len\n");
dev->tx_queue_len = orig_len;
return res;
}
/**
* dev_set_group - Change group this device belongs to
* @dev: device
* @new_group: group this device should belong to
*/
void dev_set_group(struct net_device *dev, int new_group)
{
dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
/**
* dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
* @dev: device
* @addr: new address
* @extack: netlink extended ack
*/
int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
struct netlink_ext_ack *extack)
{
struct netdev_notifier_pre_changeaddr_info info = {
.info.dev = dev,
.info.extack = extack,
.dev_addr = addr,
};
int rc;
rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
return notifier_to_errno(rc);
}
EXPORT_SYMBOL(dev_pre_changeaddr_notify);
/**
* dev_set_mac_address - Change Media Access Control Address
* @dev: device
* @sa: new address
* @extack: netlink extended ack
*
* Change the hardware (MAC) address of the device
*/
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
if (!ops->ndo_set_mac_address)
return -EOPNOTSUPP;
if (sa->sa_family != dev->type)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
if (err)
return err;
err = ops->ndo_set_mac_address(dev, sa);
if (err)
return err;
dev->addr_assign_type = NET_ADDR_SET;
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
return 0;
}
EXPORT_SYMBOL(dev_set_mac_address);
static DECLARE_RWSEM(dev_addr_sem);
int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
struct netlink_ext_ack *extack)
{
int ret;
down_write(&dev_addr_sem);
ret = dev_set_mac_address(dev, sa, extack);
up_write(&dev_addr_sem);
return ret;
}
EXPORT_SYMBOL(dev_set_mac_address_user);
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
{
size_t size = sizeof(sa->sa_data);
struct net_device *dev;
int ret = 0;
down_read(&dev_addr_sem);
rcu_read_lock();
dev = dev_get_by_name_rcu(net, dev_name);
if (!dev) {
ret = -ENODEV;
goto unlock;
}
if (!dev->addr_len)
memset(sa->sa_data, 0, size);
else
memcpy(sa->sa_data, dev->dev_addr,
min_t(size_t, size, dev->addr_len));
sa->sa_family = dev->type;
unlock:
rcu_read_unlock();
up_read(&dev_addr_sem);
return ret;
}
EXPORT_SYMBOL(dev_get_mac_address);
/**
* dev_change_carrier - Change device carrier
* @dev: device
* @new_carrier: new value
*
* Change device carrier
*/
int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (!ops->ndo_change_carrier)
return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
return ops->ndo_change_carrier(dev, new_carrier);
}
EXPORT_SYMBOL(dev_change_carrier);
/**
* dev_get_phys_port_id - Get device physical port ID
* @dev: device
* @ppid: port ID
*
* Get device physical port ID
*/
int dev_get_phys_port_id(struct net_device *dev,
struct netdev_phys_item_id *ppid)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (!ops->ndo_get_phys_port_id)
return -EOPNOTSUPP;
return ops->ndo_get_phys_port_id(dev, ppid);
}
EXPORT_SYMBOL(dev_get_phys_port_id);
/**
* dev_get_phys_port_name - Get device physical port name
* @dev: device
* @name: port name
* @len: limit of bytes to copy to name
*
* Get device physical port name
*/
int dev_get_phys_port_name(struct net_device *dev,
char *name, size_t len)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
if (ops->ndo_get_phys_port_name) {
err = ops->ndo_get_phys_port_name(dev, name, len);
if (err != -EOPNOTSUPP)
return err;
}
return devlink_compat_phys_port_name_get(dev, name, len);
}
EXPORT_SYMBOL(dev_get_phys_port_name);
/**
* dev_get_port_parent_id - Get the device's port parent identifier
* @dev: network device
* @ppid: pointer to a storage for the port's parent identifier
* @recurse: allow/disallow recursion to lower devices
*
* Get the devices's port parent identifier
*/
int dev_get_port_parent_id(struct net_device *dev,
struct netdev_phys_item_id *ppid,
bool recurse)
{
const struct net_device_ops *ops = dev->netdev_ops;
struct netdev_phys_item_id first = { };
struct net_device *lower_dev;
struct list_head *iter;
int err;
if (ops->ndo_get_port_parent_id) {
err = ops->ndo_get_port_parent_id(dev, ppid);
if (err != -EOPNOTSUPP)
return err;
}
err = devlink_compat_switch_id_get(dev, ppid);
if (!err || err != -EOPNOTSUPP)
return err;
if (!recurse)
return -EOPNOTSUPP;
netdev_for_each_lower_dev(dev, lower_dev, iter) {
err = dev_get_port_parent_id(lower_dev, ppid, recurse);
if (err)
break;
if (!first.id_len)
first = *ppid;
else if (memcmp(&first, ppid, sizeof(*ppid)))
return -EOPNOTSUPP;
}
return err;
}
EXPORT_SYMBOL(dev_get_port_parent_id);
/**
* netdev_port_same_parent_id - Indicate if two network devices have
* the same port parent identifier
* @a: first network device
* @b: second network device
*/
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
{
struct netdev_phys_item_id a_id = { };
struct netdev_phys_item_id b_id = { };
if (dev_get_port_parent_id(a, &a_id, true) ||
dev_get_port_parent_id(b, &b_id, true))
return false;
return netdev_phys_item_id_same(&a_id, &b_id);
}
EXPORT_SYMBOL(netdev_port_same_parent_id);
/**
* dev_change_proto_down - update protocol port state information
* @dev: device
* @proto_down: new value
*
* This info can be used by switch drivers to set the phys state of the
* port.
*/
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (!ops->ndo_change_proto_down)
return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
return ops->ndo_change_proto_down(dev, proto_down);
}
EXPORT_SYMBOL(dev_change_proto_down);
/**
* dev_change_proto_down_generic - generic implementation for
* ndo_change_proto_down that sets carrier according to
* proto_down.
*
* @dev: device
* @proto_down: new value
*/
int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
{
if (proto_down)
netif_carrier_off(dev);
else
netif_carrier_on(dev);
dev->proto_down = proto_down;
return 0;
}
EXPORT_SYMBOL(dev_change_proto_down_generic);
/**
* dev_change_proto_down_reason - proto down reason
*
* @dev: device
* @mask: proto down mask
* @value: proto down value
*/
void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
u32 value)
{
int b;
if (!mask) {
dev->proto_down_reason = value;
} else {
for_each_set_bit(b, &mask, 32) {
if (value & (1 << b))
dev->proto_down_reason |= BIT(b);
else
dev->proto_down_reason &= ~BIT(b);
}
}
}
EXPORT_SYMBOL(dev_change_proto_down_reason);
struct bpf_xdp_link {
struct bpf_link link;
struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
int flags;
};
static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
{
if (flags & XDP_FLAGS_HW_MODE)
return XDP_MODE_HW;
if (flags & XDP_FLAGS_DRV_MODE)
return XDP_MODE_DRV;
if (flags & XDP_FLAGS_SKB_MODE)
return XDP_MODE_SKB;
return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
}
static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
{
switch (mode) {
case XDP_MODE_SKB:
return generic_xdp_install;
case XDP_MODE_DRV:
case XDP_MODE_HW:
return dev->netdev_ops->ndo_bpf;
default:
return NULL;
}
}
static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
enum bpf_xdp_mode mode)
{
return dev->xdp_state[mode].link;
}
static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
enum bpf_xdp_mode mode)
{
struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
if (link)
return link->link.prog;
return dev->xdp_state[mode].prog;
}
u8 dev_xdp_prog_count(struct net_device *dev)
{
u8 count = 0;
int i;
for (i = 0; i < __MAX_XDP_MODE; i++)
if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
count++;
return count;
}
EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
struct bpf_prog *prog = dev_xdp_prog(dev, mode);
return prog ? prog->aux->id : 0;
}
static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
struct bpf_xdp_link *link)
{
dev->xdp_state[mode].link = link;
dev->xdp_state[mode].prog = NULL;
}
static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
struct bpf_prog *prog)
{
dev->xdp_state[mode].link = NULL;
dev->xdp_state[mode].prog = prog;
}
static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
bpf_op_t bpf_op, struct netlink_ext_ack *extack,
u32 flags, struct bpf_prog *prog)
{
struct netdev_bpf xdp;
int err;
memset(&xdp, 0, sizeof(xdp));
xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
xdp.extack = extack;
xdp.flags = flags;
xdp.prog = prog;
/* Drivers assume refcnt is already incremented (i.e, prog pointer is
* "moved" into driver), so they don't increment it on their own, but
* they do decrement refcnt when program is detached or replaced.
* Given net_device also owns link/prog, we need to bump refcnt here
* to prevent drivers from underflowing it.
*/
if (prog)
bpf_prog_inc(prog);
err = bpf_op(dev, &xdp);
if (err) {
if (prog)
bpf_prog_put(prog);
return err;
}
if (mode != XDP_MODE_HW)
bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
return 0;
}
static void dev_xdp_uninstall(struct net_device *dev)
{
struct bpf_xdp_link *link;
struct bpf_prog *prog;
enum bpf_xdp_mode mode;
bpf_op_t bpf_op;
ASSERT_RTNL();
for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
prog = dev_xdp_prog(dev, mode);
if (!prog)
continue;
bpf_op = dev_xdp_bpf_op(dev, mode);
if (!bpf_op)
continue;
WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
/* auto-detach link from net device */
link = dev_xdp_link(dev, mode);
if (link)
link->dev = NULL;
else
bpf_prog_put(prog);
dev_xdp_set_link(dev, mode, NULL);
}
}
static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
struct bpf_xdp_link *link, struct bpf_prog *new_prog,
struct bpf_prog *old_prog, u32 flags)
{
unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
struct bpf_prog *cur_prog;
struct net_device *upper;
struct list_head *iter;
enum bpf_xdp_mode mode;
bpf_op_t bpf_op;
int err;
ASSERT_RTNL();
/* either link or prog attachment, never both */
if (link && (new_prog || old_prog))
return -EINVAL;
/* link supports only XDP mode flags */
if (link && (flags & ~XDP_FLAGS_MODES)) {
NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
return -EINVAL;
}
/* just one XDP mode bit should be set, zero defaults to drv/skb mode */
if (num_modes > 1) {
NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
return -EINVAL;
}
/* avoid ambiguity if offload + drv/skb mode progs are both loaded */
if (!num_modes && dev_xdp_prog_count(dev) > 1) {
NL_SET_ERR_MSG(extack,
"More than one program loaded, unset mode is ambiguous");
return -EINVAL;
}
/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
return -EINVAL;
}
mode = dev_xdp_mode(dev, flags);
/* can't replace attached link */
if (dev_xdp_link(dev, mode)) {
NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
return -EBUSY;
}
/* don't allow if an upper device already has a program */
netdev_for_each_upper_dev_rcu(dev, upper, iter) {
if (dev_xdp_prog_count(upper) > 0) {
NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
return -EEXIST;
}
}
cur_prog = dev_xdp_prog(dev, mode);
/* can't replace attached prog with link */
if (link && cur_prog) {
NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
return -EBUSY;
}
if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
NL_SET_ERR_MSG(extack, "Active program does not match expected");
return -EEXIST;
}
/* put effective new program into new_prog */
if (link)
new_prog = link->link.prog;
if (new_prog) {
bool offload = mode == XDP_MODE_HW;
enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
? XDP_MODE_DRV : XDP_MODE_SKB;
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
NL_SET_ERR_MSG(extack, "XDP program already attached");
return -EBUSY;
}
if (!offload && dev_xdp_prog(dev, other_mode)) {
NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
return -EEXIST;
}
if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
return -EINVAL;
}
if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
return -EINVAL;
}
if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
return -EINVAL;
}
}
/* don't call drivers if the effective program didn't change */
if (new_prog != cur_prog) {
bpf_op = dev_xdp_bpf_op(dev, mode);
if (!bpf_op) {
NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
return -EOPNOTSUPP;
}
err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
if (err)
return err;
}
if (link)
dev_xdp_set_link(dev, mode, link);
else
dev_xdp_set_prog(dev, mode, new_prog);
if (cur_prog)
bpf_prog_put(cur_prog);
return 0;
}
static int dev_xdp_attach_link(struct net_device *dev,
struct netlink_ext_ack *extack,
struct bpf_xdp_link *link)
{
return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
}
static int dev_xdp_detach_link(struct net_device *dev,
struct netlink_ext_ack *extack,
struct bpf_xdp_link *link)
{
enum bpf_xdp_mode mode;
bpf_op_t bpf_op;
ASSERT_RTNL();
mode = dev_xdp_mode(dev, link->flags);
if (dev_xdp_link(dev, mode) != link)
return -EINVAL;
bpf_op = dev_xdp_bpf_op(dev, mode);
WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
dev_xdp_set_link(dev, mode, NULL);
return 0;
}
static void bpf_xdp_link_release(struct bpf_link *link)
{
struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
rtnl_lock();
/* if racing with net_device's tear down, xdp_link->dev might be
* already NULL, in which case link was already auto-detached
*/
if (xdp_link->dev) {
WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
xdp_link->dev = NULL;
}
rtnl_unlock();
}
static int bpf_xdp_link_detach(struct bpf_link *link)
{
bpf_xdp_link_release(link);
return 0;
}
static void bpf_xdp_link_dealloc(struct bpf_link *link)
{
struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
kfree(xdp_link);
}
static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
u32 ifindex = 0;
rtnl_lock();
if (xdp_link->dev)
ifindex = xdp_link->dev->ifindex;
rtnl_unlock();
seq_printf(seq, "ifindex:\t%u\n", ifindex);
}
static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
u32 ifindex = 0;
rtnl_lock();
if (xdp_link->dev)
ifindex = xdp_link->dev->ifindex;
rtnl_unlock();
info->xdp.ifindex = ifindex;
return 0;
}
static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
struct bpf_prog *old_prog)
{
struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
enum bpf_xdp_mode mode;
bpf_op_t bpf_op;
int err = 0;
rtnl_lock();
/* link might have been auto-released already, so fail */
if (!xdp_link->dev) {
err = -ENOLINK;
goto out_unlock;
}
if (old_prog && link->prog != old_prog) {
err = -EPERM;
goto out_unlock;
}
old_prog = link->prog;
if (old_prog->type != new_prog->type ||
old_prog->expected_attach_type != new_prog->expected_attach_type) {
err = -EINVAL;
goto out_unlock;
}
if (old_prog == new_prog) {
/* no-op, don't disturb drivers */
bpf_prog_put(new_prog);
goto out_unlock;
}
mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
xdp_link->flags, new_prog);
if (err)
goto out_unlock;
old_prog = xchg(&link->prog, new_prog);
bpf_prog_put(old_prog);
out_unlock:
rtnl_unlock();
return err;
}
static const struct bpf_link_ops bpf_xdp_link_lops = {
.release = bpf_xdp_link_release,
.dealloc = bpf_xdp_link_dealloc,
.detach = bpf_xdp_link_detach,
.show_fdinfo = bpf_xdp_link_show_fdinfo,
.fill_link_info = bpf_xdp_link_fill_link_info,
.update_prog = bpf_xdp_link_update,
};
int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct net *net = current->nsproxy->net_ns;
struct bpf_link_primer link_primer;
struct bpf_xdp_link *link;
struct net_device *dev;
int err, fd;
rtnl_lock();
dev = dev_get_by_index(net, attr->link_create.target_ifindex);
if (!dev) {
rtnl_unlock();
return -EINVAL;
}
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
err = -ENOMEM;
goto unlock;
}
bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
link->dev = dev;
link->flags = attr->link_create.flags;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
goto unlock;
}
err = dev_xdp_attach_link(dev, NULL, link);
rtnl_unlock();
if (err) {
link->dev = NULL;
bpf_link_cleanup(&link_primer);
goto out_put_dev;
}
fd = bpf_link_settle(&link_primer);
/* link itself doesn't hold dev's refcnt to not complicate shutdown */
dev_put(dev);
return fd;
unlock:
rtnl_unlock();
out_put_dev:
dev_put(dev);
return err;
}
/**
* dev_change_xdp_fd - set or clear a bpf program for a device rx path
* @dev: device
* @extack: netlink extended ack
* @fd: new program fd or negative value to clear
* @expected_fd: old program fd that userspace expects to replace or clear
* @flags: xdp-related flags
*
* Set or clear a bpf program for a device
*/
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
int fd, int expected_fd, u32 flags)
{
enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
struct bpf_prog *new_prog = NULL, *old_prog = NULL;
int err;
ASSERT_RTNL();
if (fd >= 0) {
new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
mode != XDP_MODE_SKB);
if (IS_ERR(new_prog))
return PTR_ERR(new_prog);
}
if (expected_fd >= 0) {
old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
mode != XDP_MODE_SKB);
if (IS_ERR(old_prog)) {
err = PTR_ERR(old_prog);
old_prog = NULL;
goto err_out;
}
}
err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
err_out:
if (err && new_prog)
bpf_prog_put(new_prog);
if (old_prog)
bpf_prog_put(old_prog);
return err;
}
/**
* dev_new_index - allocate an ifindex
* @net: the applicable net namespace
*
* Returns a suitable unique value for a new device interface
* number. The caller must hold the rtnl semaphore or the
* dev_base_lock to be sure it remains unique.
*/
static int dev_new_index(struct net *net)
{
int ifindex = net->ifindex;
for (;;) {
if (++ifindex <= 0)
ifindex = 1;
if (!__dev_get_by_index(net, ifindex))
return net->ifindex = ifindex;
}
}
/* Delayed registration/unregisteration */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
static void net_set_todo(struct net_device *dev)
{
list_add_tail(&dev->todo_list, &net_todo_list);
dev_net(dev)->dev_unreg_count++;
}
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
struct net_device *upper, netdev_features_t features)
{
netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
netdev_features_t feature;
int feature_bit;
for_each_netdev_feature(upper_disables, feature_bit) {
feature = __NETIF_F_BIT(feature_bit);
if (!(upper->wanted_features & feature)
&& (features & feature)) {
netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
&feature, upper->name);
features &= ~feature;
}
}
return features;
}
static void netdev_sync_lower_features(struct net_device *upper,
struct net_device *lower, netdev_features_t features)
{
netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
netdev_features_t feature;
int feature_bit;
for_each_netdev_feature(upper_disables, feature_bit) {
feature = __NETIF_F_BIT(feature_bit);
if (!(features & feature) && (lower->features & feature)) {
netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
&feature, lower->name);
lower->wanted_features &= ~feature;
__netdev_update_features(lower);
if (unlikely(lower->features & feature))
netdev_WARN(upper, "failed to disable %pNF on %s!\n",
&feature, lower->name);
else
netdev_features_change(lower);
}
}
}
static netdev_features_t netdev_fix_features(struct net_device *dev,
netdev_features_t features)
{
/* Fix illegal checksum combinations */
if ((features & NETIF_F_HW_CSUM) &&
(features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
netdev_warn(dev, "mixed HW and IP checksum settings.\n");
features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
}
/* TSO requires that SG is present as well. */
if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
features &= ~NETIF_F_ALL_TSO;
}
if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
!(features & NETIF_F_IP_CSUM)) {
netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
features &= ~NETIF_F_TSO;
features &= ~NETIF_F_TSO_ECN;
}
if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
!(features & NETIF_F_IPV6_CSUM)) {
netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
features &= ~NETIF_F_TSO6;
}
/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
features &= ~NETIF_F_TSO_MANGLEID;
/* TSO ECN requires that TSO is present as well. */
if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
features &= ~NETIF_F_TSO_ECN;
/* Software GSO depends on SG. */
if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
features &= ~NETIF_F_GSO;
}
/* GSO partial features require GSO partial be set */
if ((features & dev->gso_partial_features) &&
!(features & NETIF_F_GSO_PARTIAL)) {
netdev_dbg(dev,
"Dropping partially supported GSO features since no GSO partial.\n");
features &= ~dev->gso_partial_features;
}
if (!(features & NETIF_F_RXCSUM)) {
/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
* successfully merged by hardware must also have the
* checksum verified by hardware. If the user does not
* want to enable RXCSUM, logically, we should disable GRO_HW.
*/
if (features & NETIF_F_GRO_HW) {
netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
features &= ~NETIF_F_GRO_HW;
}
}
/* LRO/HW-GRO features cannot be combined with RX-FCS */
if (features & NETIF_F_RXFCS) {
if (features & NETIF_F_LRO) {
netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
features &= ~NETIF_F_LRO;
}
if (features & NETIF_F_GRO_HW) {
netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
features &= ~NETIF_F_GRO_HW;
}
}
if (features & NETIF_F_HW_TLS_TX) {
bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
bool hw_csum = features & NETIF_F_HW_CSUM;
if (!ip_csum && !hw_csum) {
netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
features &= ~NETIF_F_HW_TLS_TX;
}
}
if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
features &= ~NETIF_F_HW_TLS_RX;
}
return features;
}
int __netdev_update_features(struct net_device *dev)
{
struct net_device *upper, *lower;
netdev_features_t features;
struct list_head *iter;
int err = -1;
ASSERT_RTNL();
features = netdev_get_wanted_features(dev);
if (dev->netdev_ops->ndo_fix_features)
features = dev->netdev_ops->ndo_fix_features(dev, features);
/* driver might be less strict about feature dependencies */
features = netdev_fix_features(dev, features);
/* some features can't be enabled if they're off on an upper device */
netdev_for_each_upper_dev_rcu(dev, upper, iter)
features = netdev_sync_upper_features(dev, upper, features);
if (dev->features == features)
goto sync_lower;
netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
&dev->features, &features);
if (dev->netdev_ops->ndo_set_features)
err = dev->netdev_ops->ndo_set_features(dev, features);
else
err = 0;
if (unlikely(err < 0)) {
netdev_err(dev,
"set_features() failed (%d); wanted %pNF, left %pNF\n",
err, &features, &dev->features);
/* return non-0 since some features might have changed and
* it's better to fire a spurious notification than miss it
*/
return -1;
}
sync_lower:
/* some features must be disabled on lower devices when disabled
* on an upper device (think: bonding master or bridge)
*/
netdev_for_each_lower_dev(dev, lower, iter)
netdev_sync_lower_features(dev, lower, features);
if (!err) {
netdev_features_t diff = features ^ dev->features;
if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
/* udp_tunnel_{get,drop}_rx_info both need
* NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
* device, or they won't do anything.
* Thus we need to update dev->features
* *before* calling udp_tunnel_get_rx_info,
* but *after* calling udp_tunnel_drop_rx_info.
*/
if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
dev->features = features;
udp_tunnel_get_rx_info(dev);
} else {
udp_tunnel_drop_rx_info(dev);
}
}
if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
dev->features = features;
err |= vlan_get_rx_ctag_filter_info(dev);
} else {
vlan_drop_rx_ctag_filter_info(dev);
}
}
if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
dev->features = features;
err |= vlan_get_rx_stag_filter_info(dev);
} else {
vlan_drop_rx_stag_filter_info(dev);
}
}
dev->features = features;
}
return err < 0 ? 0 : 1;
}
/**
* netdev_update_features - recalculate device features
* @dev: the device to check
*
* Recalculate dev->features set and send notifications if it
* has changed. Should be called after driver or hardware dependent
* conditions might have changed that influence the features.
*/
void netdev_update_features(struct net_device *dev)
{
if (__netdev_update_features(dev))
netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
/**
* netdev_change_features - recalculate device features
* @dev: the device to check
*
* Recalculate dev->features set and send notifications even
* if they have not changed. Should be called instead of
* netdev_update_features() if also dev->vlan_features might
* have changed to allow the changes to be propagated to stacked
* VLAN devices.
*/
void netdev_change_features(struct net_device *dev)
{
__netdev_update_features(dev);
netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
/**
* netif_stacked_transfer_operstate - transfer operstate
* @rootdev: the root or lower level device to transfer state from
* @dev: the device to transfer operstate to
*
* Transfer operational state from root to device. This is normally
* called when a stacking relationship exists between the root
* device and the device(a leaf device).
*/
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
struct net_device *dev)
{
if (rootdev->operstate == IF_OPER_DORMANT)
netif_dormant_on(dev);
else
netif_dormant_off(dev);
if (rootdev->operstate == IF_OPER_TESTING)
netif_testing_on(dev);
else
netif_testing_off(dev);
if (netif_carrier_ok(rootdev))
netif_carrier_on(dev);
else
netif_carrier_off(dev);
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
static int netif_alloc_rx_queues(struct net_device *dev)
{
unsigned int i, count = dev->num_rx_queues;
struct netdev_rx_queue *rx;
size_t sz = count * sizeof(*rx);
int err = 0;
BUG_ON(count < 1);
rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!rx)
return -ENOMEM;
dev->_rx = rx;
for (i = 0; i < count; i++) {
rx[i].dev = dev;
/* XDP RX-queue setup */
err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
if (err < 0)
goto err_rxq_info;
}
return 0;
err_rxq_info:
/* Rollback successful reg's and free other resources */
while (i--)
xdp_rxq_info_unreg(&rx[i].xdp_rxq);
kvfree(dev->_rx);
dev->_rx = NULL;
return err;
}
static void netif_free_rx_queues(struct net_device *dev)
{
unsigned int i, count = dev->num_rx_queues;
/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
if (!dev->_rx)
return;
for (i = 0; i < count; i++)
xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
kvfree(dev->_rx);
}
static void netdev_init_one_queue(struct net_device *dev,
struct netdev_queue *queue, void *_unused)
{
/* Initialize queue lock */
spin_lock_init(&queue->_xmit_lock);
netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
queue->xmit_lock_owner = -1;
netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
queue->dev = dev;
#ifdef CONFIG_BQL
dql_init(&queue->dql, HZ);
#endif
}
static void netif_free_tx_queues(struct net_device *dev)
{
kvfree(dev->_tx);
}
static int netif_alloc_netdev_queues(struct net_device *dev)
{
unsigned int count = dev->num_tx_queues;
struct netdev_queue *tx;
size_t sz = count * sizeof(*tx);
if (count < 1 || count > 0xffff)
return -EINVAL;
tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!tx)
return -ENOMEM;
dev->_tx = tx;
netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
spin_lock_init(&dev->tx_global_lock);
return 0;
}
void netif_tx_stop_all_queues(struct net_device *dev)
{
unsigned int i;
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
netif_tx_stop_queue(txq);
}
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);
/**
* register_netdevice - register a network device
* @dev: device to register
*
* Take a completed network device structure and add it to the kernel
* interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
* chain. 0 is returned on success. A negative errno code is returned
* on a failure to set up the device, or if the name is a duplicate.
*
* Callers must hold the rtnl semaphore. You may want
* register_netdev() instead of this.
*
* BUGS:
* The locking appears insufficient to guarantee two parallel registers
* will not get the same name.
*/
int register_netdevice(struct net_device *dev)
{
int ret;
struct net *net = dev_net(dev);
BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
NETDEV_FEATURE_COUNT);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
might_sleep();
/* When net_device's are persistent, this will be fatal. */
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
ret = ethtool_check_ops(dev->ethtool_ops);
if (ret)
return ret;
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
ret = dev_get_valid_name(net, dev, dev->name);
if (ret < 0)
goto out;
ret = -ENOMEM;
dev->name_node = netdev_name_node_head_alloc(dev);
if (!dev->name_node)
goto out;
/* Init, if this function is available */
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev);
if (ret) {
if (ret > 0)
ret = -EIO;
goto err_free_name;
}
}
if (((dev->hw_features | dev->features) &
NETIF_F_HW_VLAN_CTAG_FILTER) &&
(!dev->netdev_ops->ndo_vlan_rx_add_vid ||
!dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
ret = -EINVAL;
goto err_uninit;
}
ret = -EBUSY;
if (!dev->ifindex)
dev->ifindex = dev_new_index(net);
else if (__dev_get_by_index(net, dev->ifindex))
goto err_uninit;
/* Transfer changeable features to wanted_features and enable
* software offloads (GSO and GRO).
*/
dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
dev->features |= NETIF_F_SOFT_FEATURES;
if (dev->udp_tunnel_nic_info) {
dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
}
dev->wanted_features = dev->features & dev->hw_features;
if (!(dev->flags & IFF_LOOPBACK))
dev->hw_features |= NETIF_F_NOCACHE_COPY;
/* If IPv4 TCP segmentation offload is supported we should also
* allow the device to enable segmenting the frame with the option
* of ignoring a static IP ID value. This doesn't enable the
* feature itself but allows the user to enable it later.
*/
if (dev->hw_features & NETIF_F_TSO)
dev->hw_features |= NETIF_F_TSO_MANGLEID;
if (dev->vlan_features & NETIF_F_TSO)
dev->vlan_features |= NETIF_F_TSO_MANGLEID;
if (dev->mpls_features & NETIF_F_TSO)
dev->mpls_features |= NETIF_F_TSO_MANGLEID;
if (dev->hw_enc_features & NETIF_F_TSO)
dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
dev->vlan_features |= NETIF_F_HIGHDMA;
/* Make NETIF_F_SG inheritable to tunnel devices.
*/
dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
/* Make NETIF_F_SG inheritable to MPLS.
*/
dev->mpls_features |= NETIF_F_SG;
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
ret = notifier_to_errno(ret);
if (ret)
goto err_uninit;
ret = netdev_register_kobject(dev);
if (ret) {
dev->reg_state = NETREG_UNREGISTERED;
goto err_uninit;
}
dev->reg_state = NETREG_REGISTERED;
__netdev_update_features(dev);
/*
* Default initial state at registry is that the
* device is present.
*/
set_bit(__LINK_STATE_PRESENT, &dev->state);
linkwatch_init_dev(dev);
dev_init_scheduler(dev);
dev_hold(dev);
list_netdevice(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
/* If the device has permanent device address, driver should
* set dev_addr and also addr_assign_type should be set to
* NET_ADDR_PERM (default value).
*/
if (dev->addr_assign_type == NET_ADDR_PERM)
memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
/* Notify protocols, that a new device appeared. */
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
ret = notifier_to_errno(ret);
if (ret) {
/* Expect explicit free_netdev() on failure */
dev->needs_free_netdev = false;
unregister_netdevice_queue(dev, NULL);
goto out;
}
/*
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
if (!dev->rtnl_link_ops ||
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
out:
return ret;
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
if (dev->priv_destructor)
dev->priv_destructor(dev);
err_free_name:
netdev_name_node_free(dev->name_node);
goto out;
}
EXPORT_SYMBOL(register_netdevice);
/**
* init_dummy_netdev - init a dummy network device for NAPI
* @dev: device to init
*
* This takes a network device structure and initialize the minimum
* amount of fields so it can be used to schedule NAPI polls without
* registering a full blown interface. This is to be used by drivers
* that need to tie several hardware interfaces to a single NAPI
* poll scheduler due to HW limitations.
*/
int init_dummy_netdev(struct net_device *dev)
{
/* Clear everything. Note we don't initialize spinlocks
* are they aren't supposed to be taken by any of the
* NAPI code and this dummy netdev is supposed to be
* only ever used for NAPI polls
*/
memset(dev, 0, sizeof(struct net_device));
/* make sure we BUG if trying to hit standard
* register/unregister code path
*/
dev->reg_state = NETREG_DUMMY;
/* NAPI wants this */
INIT_LIST_HEAD(&dev->napi_list);
/* a dummy interface is started by default */
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
/* napi_busy_loop stats accounting wants this */
dev_net_set(dev, &init_net);
/* Note : We dont allocate pcpu_refcnt for dummy devices,
* because users of this 'device' dont need to change
* its refcount.
*/
return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);
/**
* register_netdev - register a network device
* @dev: device to register
*
* Take a completed network device structure and add it to the kernel
* interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
* chain. 0 is returned on success. A negative errno code is returned
* on a failure to set up the device, or if the name is a duplicate.
*
* This is a wrapper around register_netdevice that takes the rtnl semaphore
* and expands the device name if you passed a format string to
* alloc_netdev.
*/
int register_netdev(struct net_device *dev)
{
int err;
if (rtnl_lock_killable())
return -EINTR;
err = register_netdevice(dev);
rtnl_unlock();
return err;
}
EXPORT_SYMBOL(register_netdev);
int netdev_refcnt_read(const struct net_device *dev)
{
#ifdef CONFIG_PCPU_DEV_REFCNT
int i, refcnt = 0;
for_each_possible_cpu(i)
refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
return refcnt;
#else
return refcount_read(&dev->dev_refcnt);
#endif
}
EXPORT_SYMBOL(netdev_refcnt_read);
int netdev_unregister_timeout_secs __read_mostly = 10;
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
* netdev_wait_allrefs - wait until all references are gone.
* @dev: target net_device
*
* This is called when unregistering network devices.
*
* Any protocol or device that holds a reference should register
* for netdevice notification, and cleanup and put back the
* reference if they receive an UNREGISTER event.
* We can get stuck here if buggy protocols don't correctly
* call dev_put.
*/
static void netdev_wait_allrefs(struct net_device *dev)
{
unsigned long rebroadcast_time, warning_time;
int wait = 0, refcnt;
linkwatch_forget_dev(dev);
rebroadcast_time = warning_time = jiffies;
refcnt = netdev_refcnt_read(dev);
while (refcnt != 1) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock();
/* Rebroadcast unregister notification */
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
__rtnl_unlock();
rcu_barrier();
rtnl_lock();
if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
&dev->state)) {
/* We must not have linkwatch events
* pending on unregister. If this
* happens, we simply run the queue
* unscheduled, resulting in a noop
* for this device.
*/
linkwatch_run_queue();
}
__rtnl_unlock();
rebroadcast_time = jiffies;
}
if (!wait) { rcu_barrier();
wait = WAIT_REFS_MIN_MSECS;
} else {
msleep(wait);
wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
}
refcnt = netdev_refcnt_read(dev);
if (refcnt != 1 &&
time_after(jiffies, warning_time +
netdev_unregister_timeout_secs * HZ)) {
pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
dev->name, refcnt);
warning_time = jiffies;
}
}
}
/* The sequence is:
*
* rtnl_lock();
* ...
* register_netdevice(x1);
* register_netdevice(x2);
* ...
* unregister_netdevice(y1);
* unregister_netdevice(y2);
* ...
* rtnl_unlock();
* free_netdev(y1);
* free_netdev(y2);
*
* We are invoked by rtnl_unlock().
* This allows us to deal with problems:
* 1) We can delete sysfs objects which invoke hotplug
* without deadlocking with linkwatch via keventd.
* 2) Since we run with the RTNL semaphore not held, we can sleep
* safely in order to wait for the netdev refcnt to drop to zero.
*
* We must not return until all unregister events added during
* the interval the lock was held have been completed.
*/
void netdev_run_todo(void)
{
struct list_head list;
#ifdef CONFIG_LOCKDEP
struct list_head unlink_list;
list_replace_init(&net_unlink_list, &unlink_list);
while (!list_empty(&unlink_list)) {
struct net_device *dev = list_first_entry(&unlink_list,
struct net_device,
unlink_list);
list_del_init(&dev->unlink_list);
dev->nested_level = dev->lower_level - 1;
}
#endif
/* Snapshot list, allow later requests */
list_replace_init(&net_todo_list, &list);
__rtnl_unlock();
/* Wait for rcu callbacks to finish before next phase */
if (!list_empty(&list))
rcu_barrier(); while (!list_empty(&list)) {
struct net_device *dev
= list_first_entry(&list, struct net_device, todo_list);
list_del(&dev->todo_list);
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
pr_err("network todo '%s' but state %d\n",
dev->name, dev->reg_state);
dump_stack();
continue;
}
dev->reg_state = NETREG_UNREGISTERED;
netdev_wait_allrefs(dev);
/* paranoia */
BUG_ON(netdev_refcnt_read(dev) != 1); BUG_ON(!list_empty(&dev->ptype_all)); BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr));
#if IS_ENABLED(CONFIG_DECNET)
WARN_ON(dev->dn_ptr);
#endif
if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) free_netdev(dev);
/* Report a network device has been unregistered */
rtnl_lock();
dev_net(dev)->dev_unreg_count--;
__rtnl_unlock();
wake_up(&netdev_unregistering_wq);
/* Free network device */
kobject_put(&dev->dev.kobj);
}
}
/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
* all the same fields in the same order as net_device_stats, with only
* the type differing, but rtnl_link_stats64 may have additional fields
* at the end for newer counters.
*/
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
const struct net_device_stats *netdev_stats)
{
#if BITS_PER_LONG == 64
BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
/* zero out counters that only exist in rtnl_link_stats64 */
memset((char *)stats64 + sizeof(*netdev_stats), 0,
sizeof(*stats64) - sizeof(*netdev_stats));
#else
size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
const unsigned long *src = (const unsigned long *)netdev_stats;
u64 *dst = (u64 *)stats64;
BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
for (i = 0; i < n; i++)
dst[i] = src[i];
/* zero out counters that only exist in rtnl_link_stats64 */
memset((char *)stats64 + n * sizeof(u64), 0,
sizeof(*stats64) - n * sizeof(u64));
#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
/**
* dev_get_stats - get network device statistics
* @dev: device to get statistics from
* @storage: place to store stats
*
* Get network statistics from device. Return @storage.
* The device driver may provide its own method by setting
* dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
* otherwise the internal statistics structure is used.
*/
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
struct rtnl_link_stats64 *storage)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
ops->ndo_get_stats64(dev, storage);
} else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
} else {
netdev_stats_to_stats64(storage, &dev->stats);
}
storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
/**
* dev_fetch_sw_netstats - get per-cpu network device statistics
* @s: place to store stats
* @netstats: per-cpu network stats to read from
*
* Read per-cpu network statistics and populate the related fields in @s.
*/
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
const struct pcpu_sw_netstats __percpu *netstats)
{
int cpu;
for_each_possible_cpu(cpu) {
const struct pcpu_sw_netstats *stats;
struct pcpu_sw_netstats tmp;
unsigned int start;
stats = per_cpu_ptr(netstats, cpu);
do {
start = u64_stats_fetch_begin_irq(&stats->syncp);
tmp.rx_packets = stats->rx_packets;
tmp.rx_bytes = stats->rx_bytes;
tmp.tx_packets = stats->tx_packets;
tmp.tx_bytes = stats->tx_bytes;
} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
s->rx_packets += tmp.rx_packets;
s->rx_bytes += tmp.rx_bytes;
s->tx_packets += tmp.tx_packets;
s->tx_bytes += tmp.tx_bytes;
}
}
EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
/**
* dev_get_tstats64 - ndo_get_stats64 implementation
* @dev: device to get statistics from
* @s: place to store stats
*
* Populate @s from dev->stats and dev->tstats. Can be used as
* ndo_get_stats64() callback.
*/
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
{
netdev_stats_to_stats64(s, &dev->stats);
dev_fetch_sw_netstats(s, dev->tstats);
}
EXPORT_SYMBOL_GPL(dev_get_tstats64);
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
struct netdev_queue *queue = dev_ingress_queue(dev);
#ifdef CONFIG_NET_CLS_ACT
if (queue)
return queue;
queue = kzalloc(sizeof(*queue), GFP_KERNEL);
if (!queue)
return NULL;
netdev_init_one_queue(dev, queue, NULL);
RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
queue->qdisc_sleeping = &noop_qdisc;
rcu_assign_pointer(dev->ingress_queue, queue);
#endif
return queue;
}
static const struct ethtool_ops default_ethtool_ops;
void netdev_set_default_ethtool_ops(struct net_device *dev,
const struct ethtool_ops *ops)
{
if (dev->ethtool_ops == &default_ethtool_ops)
dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
void netdev_freemem(struct net_device *dev)
{
char *addr = (char *)dev - dev->padded;
kvfree(addr);
}
/**
* alloc_netdev_mqs - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @name_assign_type: origin of device name
* @setup: callback to initialize device
* @txqs: the number of TX subqueues to allocate
* @rxqs: the number of RX subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
* and performs basic initialization. Also allocates subqueue structs
* for each queue on the device.
*/
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned char name_assign_type,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
unsigned int alloc_size;
struct net_device *p;
BUG_ON(strlen(name) >= sizeof(dev->name));
if (txqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
return NULL;
}
if (rxqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
return NULL;
}
alloc_size = sizeof(struct net_device);
if (sizeof_priv) {
/* ensure 32-byte alignment of private area */
alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
alloc_size += sizeof_priv;
}
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1;
p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!p)
return NULL;
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
#ifdef CONFIG_PCPU_DEV_REFCNT
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_dev;
dev_hold(dev);
#else
refcount_set(&dev->dev_refcnt, 1);
#endif
if (dev_addr_init(dev))
goto free_pcpu;
dev_mc_init(dev);
dev_uc_init(dev);
dev_net_set(dev, &init_net);
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
dev->upper_level = 1;
dev->lower_level = 1;
#ifdef CONFIG_LOCKDEP
dev->nested_level = 0;
INIT_LIST_HEAD(&dev->unlink_list);
#endif
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
INIT_LIST_HEAD(&dev->close_list);
INIT_LIST_HEAD(&dev->link_watch_list);
INIT_LIST_HEAD(&dev->adj_list.upper);
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
INIT_LIST_HEAD(&dev->net_notifier_list);
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
if (!dev->tx_queue_len) {
dev->priv_flags |= IFF_NO_QUEUE;
dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
}
dev->num_tx_queues = txqs;
dev->real_num_tx_queues = txqs;
if (netif_alloc_netdev_queues(dev))
goto free_all;
dev->num_rx_queues = rxqs;
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev))
goto free_all;
strcpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
nf_hook_ingress_init(dev);
return dev;
free_all:
free_netdev(dev);
return NULL;
free_pcpu:
#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
free_dev:
#endif
netdev_freemem(dev);
return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
/**
* free_netdev - free network device
* @dev: device
*
* This function does the last stage of destroying an allocated device
* interface. The reference to the device object is released. If this
* is the last reference then it will be freed.Must be called in process
* context.
*/
void free_netdev(struct net_device *dev)
{
struct napi_struct *p, *n;
might_sleep();
/* When called immediately after register_netdevice() failed the unwind
* handling may still be dismantling the device. Handle that case by
* deferring the free.
*/
if (dev->reg_state == NETREG_UNREGISTERING) {
ASSERT_RTNL();
dev->needs_free_netdev = true;
return;
}
netif_free_tx_queues(dev);
netif_free_rx_queues(dev);
kfree(rcu_dereference_protected(dev->ingress_queue, 1));
/* Flush device addresses */
dev_addr_flush(dev);
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
#endif
free_percpu(dev->xdp_bulkq);
dev->xdp_bulkq = NULL;
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
netdev_freemem(dev);
return;
}
BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
dev->reg_state = NETREG_RELEASED;
/* will free via device release */
put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
/**
* synchronize_net - Synchronize with packet receive processing
*
* Wait for packets currently being received to be done.
* Does not block later packets from starting.
*/
void synchronize_net(void)
{
might_sleep();
if (rtnl_is_locked())
synchronize_rcu_expedited();
else
synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
/**
* unregister_netdevice_queue - remove device from the kernel
* @dev: device
* @head: list
*
* This function shuts down a device interface and removes it
* from the kernel tables.
* If head not NULL, device is queued to be unregistered later.
*
* Callers must hold the rtnl semaphore. You may want
* unregister_netdev() instead of this.
*/
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
ASSERT_RTNL();
if (head) {
list_move_tail(&dev->unreg_list, head);
} else {
LIST_HEAD(single);
list_add(&dev->unreg_list, &single);
unregister_netdevice_many(&single);
}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
/**
* unregister_netdevice_many - unregister many devices
* @head: list of devices
*
* Note: As most callers use a stack allocated list_head,
* we force a list_del() to make sure stack wont be corrupted later.
*/
void unregister_netdevice_many(struct list_head *head)
{
struct net_device *dev, *tmp;
LIST_HEAD(close_head);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
if (list_empty(head))
return;
list_for_each_entry_safe(dev, tmp, head, unreg_list) {
/* Some devices call without registering
* for initialization unwind. Remove those
* devices and proceed with the remaining.
*/
if (dev->reg_state == NETREG_UNINITIALIZED) {
pr_debug("unregister_netdevice: device %s/%p never was registered\n",
dev->name, dev);
WARN_ON(1);
list_del(&dev->unreg_list);
continue;
}
dev->dismantle = true;
BUG_ON(dev->reg_state != NETREG_REGISTERED);
}
/* If device is running, close it first. */
list_for_each_entry(dev, head, unreg_list)
list_add_tail(&dev->close_list, &close_head);
dev_close_many(&close_head, true);
list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
unlist_netdevice(dev);
dev->reg_state = NETREG_UNREGISTERING;
}
flush_all_backlogs();
synchronize_net();
list_for_each_entry(dev, head, unreg_list) {
struct sk_buff *skb = NULL;
/* Shutdown queueing discipline. */
dev_shutdown(dev);
dev_xdp_uninstall(dev);
/* Notify protocols, that we are about to destroy
* this device. They should clean all the things.
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
if (!dev->rtnl_link_ops ||
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
GFP_KERNEL, NULL, 0);
/*
* Flush the unicast and multicast chains
*/
dev_uc_flush(dev);
dev_mc_flush(dev);
netdev_name_node_alt_flush(dev);
netdev_name_node_free(dev->name_node);
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
if (skb)
rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
/* Notifier chain MUST detach us all upper devices. */
WARN_ON(netdev_has_any_upper_dev(dev));
WARN_ON(netdev_has_any_lower_dev(dev));
/* Remove entries from kobject tree */
netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
/* Remove XPS queueing entries */
netif_reset_xps_queues_gt(dev, 0);
#endif
}
synchronize_net();
list_for_each_entry(dev, head, unreg_list) {
dev_put(dev);
net_set_todo(dev);
}
list_del(head);
}
EXPORT_SYMBOL(unregister_netdevice_many);
/**
* unregister_netdev - remove device from the kernel
* @dev: device
*
* This function shuts down a device interface and removes it
* from the kernel tables.
*
* This is just a wrapper for unregister_netdevice that takes
* the rtnl semaphore. In general you want to use this and not
* unregister_netdevice.
*/
void unregister_netdev(struct net_device *dev)
{
rtnl_lock();
unregister_netdevice(dev);
rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
/**
* __dev_change_net_namespace - move device to different nethost namespace
* @dev: device
* @net: network namespace
* @pat: If not NULL name pattern to try if the current device name
* is already taken in the destination network namespace.
* @new_ifindex: If not zero, specifies device index in the target
* namespace.
*
* This function shuts down a device interface and moves it
* to a new network namespace. On success 0 is returned, on
* a failure a netagive errno code is returned.
*
* Callers must hold the rtnl semaphore.
*/
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
const char *pat, int new_ifindex)
{
struct net *net_old = dev_net(dev);
int err, new_nsid;
ASSERT_RTNL();
/* Don't allow namespace local devices to be moved. */
err = -EINVAL;
if (dev->features & NETIF_F_NETNS_LOCAL)
goto out;
/* Ensure the device has been registrered */
if (dev->reg_state != NETREG_REGISTERED)
goto out;
/* Get out if there is nothing todo */
err = 0;
if (net_eq(net_old, net))
goto out;
/* Pick the destination device name, and ensure
* we can use it in the destination network namespace.
*/
err = -EEXIST;
if (__dev_get_by_name(net, dev->name)) {
/* We get here if we can't use the current device name */
if (!pat)
goto out;
err = dev_get_valid_name(net, dev, pat);
if (err < 0)
goto out;
}
/* Check that new_ifindex isn't used yet. */
err = -EBUSY;
if (new_ifindex && __dev_get_by_index(net, new_ifindex))
goto out;
/*
* And now a mini version of register_netdevice unregister_netdevice.
*/
/* If device is running close it first. */
dev_close(dev);
/* And unlink it from device chain */
unlist_netdevice(dev);
synchronize_net();
/* Shutdown queueing discipline. */
dev_shutdown(dev);
/* Notify protocols, that we are about to destroy
* this device. They should clean all the things.
*
* Note that dev->reg_state stays at NETREG_REGISTERED.
* This is wanted because this way 8021q and macvlan know
* the device is just moving and can keep their slaves up.
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
rcu_barrier();
new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
/* If there is an ifindex conflict assign a new one */
if (!new_ifindex) {
if (__dev_get_by_index(net, dev->ifindex))
new_ifindex = dev_new_index(net);
else
new_ifindex = dev->ifindex;
}
rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
new_ifindex);
/*
* Flush the unicast and multicast chains
*/
dev_uc_flush(dev);
dev_mc_flush(dev);
/* Send a netdev-removed uevent to the old namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
netdev_adjacent_del_links(dev);
/* Move per-net netdevice notifiers that are following the netdevice */
move_netdevice_notifiers_dev_net(dev, net);
/* Actually switch the network namespace */
dev_net_set(dev, net);
dev->ifindex = new_ifindex;
/* Send a netdev-add uevent to the new namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
netdev_adjacent_add_links(dev);
/* Fixup kobjects */
err = device_rename(&dev->dev, dev->name);
WARN_ON(err);
/* Adapt owner in case owning user namespace of target network
* namespace is different from the original one.
*/
err = netdev_change_owner(dev, net_old, net);
WARN_ON(err);
/* Add the device back in the hashes */
list_netdevice(dev);
/* Notify protocols, that a new device appeared. */
call_netdevice_notifiers(NETDEV_REGISTER, dev);
/*
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
synchronize_net();
err = 0;
out:
return err;
}
EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
static int dev_cpu_dead(unsigned int oldcpu)
{
struct sk_buff **list_skb;
struct sk_buff *skb;
unsigned int cpu;
struct softnet_data *sd, *oldsd, *remsd = NULL;
local_irq_disable();
cpu = smp_processor_id();
sd = &per_cpu(softnet_data, cpu);
oldsd = &per_cpu(softnet_data, oldcpu);
/* Find end of our completion_queue. */
list_skb = &sd->completion_queue;
while (*list_skb)
list_skb = &(*list_skb)->next;
/* Append completion queue from offline CPU. */
*list_skb = oldsd->completion_queue;
oldsd->completion_queue = NULL;
/* Append output queue from offline CPU. */
if (oldsd->output_queue) {
*sd->output_queue_tailp = oldsd->output_queue;
sd->output_queue_tailp = oldsd->output_queue_tailp;
oldsd->output_queue = NULL;
oldsd->output_queue_tailp = &oldsd->output_queue;
}
/* Append NAPI poll list from offline CPU, with one exception :
* process_backlog() must be called by cpu owning percpu backlog.
* We properly handle process_queue & input_pkt_queue later.
*/
while (!list_empty(&oldsd->poll_list)) {
struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
struct napi_struct,
poll_list);
list_del_init(&napi->poll_list);
if (napi->poll == process_backlog)
napi->state = 0;
else
____napi_schedule(sd, napi);
}
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
#ifdef CONFIG_RPS
remsd = oldsd->rps_ipi_list;
oldsd->rps_ipi_list = NULL;
#endif
/* send out pending IPI's on offline CPU */
net_rps_send_ipi(remsd);
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
netif_rx_ni(skb);
input_queue_head_incr(oldsd);
}
while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx_ni(skb);
input_queue_head_incr(oldsd);
}
return 0;
}
/**
* netdev_increment_features - increment feature set by one
* @all: current feature set
* @one: new feature set
* @mask: mask feature set
*
* Computes a new feature set after adding a device with feature set
* @one to the master device with current feature set @all. Will not
* enable anything that is off in @mask. Returns the new feature set.
*/
netdev_features_t netdev_increment_features(netdev_features_t all,
netdev_features_t one, netdev_features_t mask)
{
if (mask & NETIF_F_HW_CSUM)
mask |= NETIF_F_CSUM_MASK;
mask |= NETIF_F_VLAN_CHALLENGED;
all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
all &= one | ~NETIF_F_ALL_FOR_ALL;
/* If one device supports hw checksumming, set for all. */
if (all & NETIF_F_HW_CSUM)
all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
return all;
}
EXPORT_SYMBOL(netdev_increment_features);
static struct hlist_head * __net_init netdev_create_hash(void)
{
int i;
struct hlist_head *hash;
hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
if (hash != NULL)
for (i = 0; i < NETDEV_HASHENTRIES; i++)
INIT_HLIST_HEAD(&hash[i]);
return hash;
}
/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
BUILD_BUG_ON(GRO_HASH_BUCKETS >
8 * sizeof_field(struct napi_struct, gro_bitmask));
INIT_LIST_HEAD(&net->dev_base_head);
net->dev_name_head = netdev_create_hash();
if (net->dev_name_head == NULL)
goto err_name;
net->dev_index_head = netdev_create_hash();
if (net->dev_index_head == NULL)
goto err_idx;
RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
return 0;
err_idx:
kfree(net->dev_name_head);
err_name:
return -ENOMEM;
}
/**
* netdev_drivername - network driver for the device
* @dev: network device
*
* Determine network driver for device.
*/
const char *netdev_drivername(const struct net_device *dev)
{
const struct device_driver *driver;
const struct device *parent;
const char *empty = "";
parent = dev->dev.parent;
if (!parent)
return empty;
driver = parent->driver;
if (driver && driver->name)
return driver->name;
return empty;
}
static void __netdev_printk(const char *level, const struct net_device *dev,
struct va_format *vaf)
{
if (dev && dev->dev.parent) {
dev_printk_emit(level[1] - '0',
dev->dev.parent,
"%s %s %s%s: %pV",
dev_driver_string(dev->dev.parent),
dev_name(dev->dev.parent),
netdev_name(dev), netdev_reg_state(dev),
vaf);
} else if (dev) {
printk("%s%s%s: %pV",
level, netdev_name(dev), netdev_reg_state(dev), vaf);
} else {
printk("%s(NULL net_device): %pV", level, vaf);
}
}
void netdev_printk(const char *level, const struct net_device *dev,
const char *format, ...)
{
struct va_format vaf;
va_list args;
va_start(args, format);
vaf.fmt = format;
vaf.va = &args;
__netdev_printk(level, dev, &vaf);
va_end(args);
}
EXPORT_SYMBOL(netdev_printk);
#define define_netdev_printk_level(func, level) \
void func(const struct net_device *dev, const char *fmt, ...) \
{ \
struct va_format vaf; \
va_list args; \
\
va_start(args, fmt); \
\
vaf.fmt = fmt; \
vaf.va = &args; \
\
__netdev_printk(level, dev, &vaf); \
\
va_end(args); \
} \
EXPORT_SYMBOL(func);
define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
static void __net_exit netdev_exit(struct net *net)
{
kfree(net->dev_name_head);
kfree(net->dev_index_head);
if (net != &init_net)
WARN_ON_ONCE(!list_empty(&net->dev_base_head));
}
static struct pernet_operations __net_initdata netdev_net_ops = {
.init = netdev_init,
.exit = netdev_exit,
};
static void __net_exit default_device_exit(struct net *net)
{
struct net_device *dev, *aux;
/*
* Push all migratable network devices back to the
* initial network namespace
*/
rtnl_lock();
for_each_netdev_safe(net, dev, aux) {
int err;
char fb_name[IFNAMSIZ];
/* Ignore unmoveable devices (i.e. loopback) */
if (dev->features & NETIF_F_NETNS_LOCAL)
continue;
/* Leave virtual devices for the generic cleanup */
if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
continue;
/* Push remaining network devices to init_net */
snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
if (__dev_get_by_name(&init_net, fb_name))
snprintf(fb_name, IFNAMSIZ, "dev%%d");
err = dev_change_net_namespace(dev, &init_net, fb_name);
if (err) {
pr_emerg("%s: failed to move %s to init_net: %d\n",
__func__, dev->name, err);
BUG();
}
}
rtnl_unlock();
}
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
/* Return with the rtnl_lock held when there are no network
* devices unregistering in any network namespace in net_list.
*/
struct net *net;
bool unregistering;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
add_wait_queue(&netdev_unregistering_wq, &wait);
for (;;) {
unregistering = false;
rtnl_lock();
list_for_each_entry(net, net_list, exit_list) {
if (net->dev_unreg_count > 0) {
unregistering = true;
break;
}
}
if (!unregistering)
break;
__rtnl_unlock();
wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
remove_wait_queue(&netdev_unregistering_wq, &wait);
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
/* At exit all network devices most be removed from a network
* namespace. Do this in the reverse order of registration.
* Do this across as many network namespaces as possible to
* improve batching efficiency.
*/
struct net_device *dev;
struct net *net;
LIST_HEAD(dev_kill_list);
/* To prevent network device cleanup code from dereferencing
* loopback devices or network devices that have been freed
* wait here for all pending unregistrations to complete,
* before unregistring the loopback device and allowing the
* network namespace be freed.
*
* The netdev todo list containing all network devices
* unregistrations that happen in default_device_exit_batch
* will run in the rtnl_unlock() at the end of
* default_device_exit_batch.
*/
rtnl_lock_unregistering(net_list);
list_for_each_entry(net, net_list, exit_list) {
for_each_netdev_reverse(net, dev) {
if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
else
unregister_netdevice_queue(dev, &dev_kill_list);
}
}
unregister_netdevice_many(&dev_kill_list);
rtnl_unlock();
}
static struct pernet_operations __net_initdata default_device_ops = {
.exit = default_device_exit,
.exit_batch = default_device_exit_batch,
};
/*
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
* present) and leaves us with a valid list of present and active devices.
*
*/
/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
*/
static int __init net_dev_init(void)
{
int i, rc = -ENOMEM;
BUG_ON(!dev_boot_phase);
if (dev_proc_init())
goto out;
if (netdev_kobject_init())
goto out;
INIT_LIST_HEAD(&ptype_all);
for (i = 0; i < PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&ptype_base[i]);
INIT_LIST_HEAD(&offload_base);
if (register_pernet_subsys(&netdev_net_ops))
goto out;
/*
* Initialise the packet receive queues.
*/
for_each_possible_cpu(i) {
struct work_struct *flush = per_cpu_ptr(&flush_works, i);
struct softnet_data *sd = &per_cpu(softnet_data, i);
INIT_WORK(flush, flush_backlog);
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
skb_queue_head_init(&sd->xfrm_backlog);
#endif
INIT_LIST_HEAD(&sd->poll_list);
sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
sd->cpu = i;
#endif
init_gro_hash(&sd->backlog);
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
}
dev_boot_phase = 0;
/* The loopback device is special if any other network devices
* is present in a network namespace the loopback device must
* be present. Since we now dynamically allocate and free the
* loopback device ensure this invariant is maintained by
* keeping the loopback device as the first device on the
* list of network devices. Ensuring the loopback devices
* is the first device that appears and the last network device
* that disappears.
*/
if (register_pernet_device(&loopback_net_ops))
goto out;
if (register_pernet_device(&default_device_ops))
goto out;
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
NULL, dev_cpu_dead);
WARN_ON(rc < 0);
rc = 0;
out:
return rc;
}
subsys_initcall(net_dev_init);
// SPDX-License-Identifier: GPL-2.0
/*
* fs/mpage.c
*
* Copyright (C) 2002, Linus Torvalds.
*
* Contains functions related to preparing and submitting BIOs which contain
* multiple pagecache pages.
*
* 15May2002 Andrew Morton
* Initial version
* 27Jun2002 axboe@suse.de
* use bio_add_page() to build bio's just the right size
*/
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/kdev_t.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/cleancache.h>
#include "internal.h"
/*
* I/O completion handler for multipage BIOs.
*
* The mpage code never puts partial pages into a BIO (except for end-of-file).
* If a page does not map to a contiguous run of blocks then it simply falls
* back to block_read_full_page().
*
* Why is this? If a page's completion depends on a number of different BIOs
* which can complete in any order (or at the same time) then determining the
* status of that page is hard. See end_buffer_async_read() for the details.
* There is no point in duplicating all that complexity.
*/
static void mpage_end_io(struct bio *bio)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bv, bio, iter_all) {
struct page *page = bv->bv_page;
page_endio(page, bio_op(bio),
blk_status_to_errno(bio->bi_status));
}
bio_put(bio);
}
static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio)
{
bio->bi_end_io = mpage_end_io;
bio_set_op_attrs(bio, op, op_flags);
guard_bio_eod(bio);
submit_bio(bio);
return NULL;
}
static struct bio *
mpage_alloc(struct block_device *bdev,
sector_t first_sector, int nr_vecs,
gfp_t gfp_flags)
{
struct bio *bio;
/* Restrict the given (page cache) mask for slab allocations */
gfp_flags &= GFP_KERNEL;
bio = bio_alloc(gfp_flags, nr_vecs);
if (bio == NULL && (current->flags & PF_MEMALLOC)) {
while (!bio && (nr_vecs /= 2))
bio = bio_alloc(gfp_flags, nr_vecs);
}
if (bio) {
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = first_sector;
}
return bio;
}
/*
* support function for mpage_readahead. The fs supplied get_block might
* return an up to date buffer. This is used to map that buffer into
* the page, which allows readpage to avoid triggering a duplicate call
* to get_block.
*
* The idea is to avoid adding buffers to pages that don't already have
* them. So when the buffer is up to date and the page size == block size,
* this marks the page up to date instead of adding new buffers.
*/
static void
map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
{
struct inode *inode = page->mapping->host;
struct buffer_head *page_bh, *head;
int block = 0;
if (!page_has_buffers(page)) {
/*
* don't make any buffers if there is only one buffer on
* the page and the page just needs to be set up to date
*/
if (inode->i_blkbits == PAGE_SHIFT &&
buffer_uptodate(bh)) {
SetPageUptodate(page);
return;
}
create_empty_buffers(page, i_blocksize(inode), 0);
}
head = page_buffers(page);
page_bh = head;
do {
if (block == page_block) { page_bh->b_state = bh->b_state;
page_bh->b_bdev = bh->b_bdev;
page_bh->b_blocknr = bh->b_blocknr;
break;
}
page_bh = page_bh->b_this_page;
block++;
} while (page_bh != head);
}
struct mpage_readpage_args {
struct bio *bio;
struct page *page;
unsigned int nr_pages;
bool is_readahead;
sector_t last_block_in_bio;
struct buffer_head map_bh;
unsigned long first_logical_block;
get_block_t *get_block;
};
/*
* This is the worker routine which does all the work of mapping the disk
* blocks and constructs largest possible bios, submits them for IO if the
* blocks are not contiguous on the disk.
*
* We pass a buffer_head back and forth and use its buffer_mapped() flag to
* represent the validity of its disk mapping and to decide when to do the next
* get_block() call.
*/
static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
{
struct page *page = args->page;
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
struct buffer_head *map_bh = &args->map_bh;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
sector_t blocks[MAX_BUF_PER_PAGE];
unsigned page_block;
unsigned first_hole = blocks_per_page;
struct block_device *bdev = NULL;
int length;
int fully_mapped = 1;
int op_flags;
unsigned nblocks;
unsigned relative_block;
gfp_t gfp;
if (args->is_readahead) {
op_flags = REQ_RAHEAD;
gfp = readahead_gfp_mask(page->mapping);
} else {
op_flags = 0;
gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
}
if (page_has_buffers(page))
goto confused;
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + args->nr_pages * blocks_per_page;
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
page_block = 0;
/*
* Map blocks using the result from the previous get_blocks call first.
*/
nblocks = map_bh->b_size >> blkbits;
if (buffer_mapped(map_bh) &&
block_in_file > args->first_logical_block &&
block_in_file < (args->first_logical_block + nblocks)) {
unsigned map_offset = block_in_file - args->first_logical_block;
unsigned last = nblocks - map_offset;
for (relative_block = 0; ; relative_block++) {
if (relative_block == last) {
clear_buffer_mapped(map_bh);
break;
}
if (page_block == blocks_per_page)
break;
blocks[page_block] = map_bh->b_blocknr + map_offset +
relative_block;
page_block++;
block_in_file++;
}
bdev = map_bh->b_bdev;
}
/*
* Then do more get_blocks calls until we are done with this page.
*/
map_bh->b_page = page; while (page_block < blocks_per_page) { map_bh->b_state = 0; map_bh->b_size = 0;
if (block_in_file < last_block) {
map_bh->b_size = (last_block-block_in_file) << blkbits;
if (args->get_block(inode, block_in_file, map_bh, 0))
goto confused;
args->first_logical_block = block_in_file;
}
if (!buffer_mapped(map_bh)) {
fully_mapped = 0;
if (first_hole == blocks_per_page)
first_hole = page_block;
page_block++;
block_in_file++;
continue;
}
/* some filesystems will copy data into the page during
* the get_block call, in which case we don't want to
* read it again. map_buffer_to_page copies the data
* we just collected from get_block into the page's buffers
* so readpage doesn't have to repeat the get_block call
*/
if (buffer_uptodate(map_bh)) {
map_buffer_to_page(page, map_bh, page_block);
goto confused;
}
if (first_hole != blocks_per_page)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
goto confused;
nblocks = map_bh->b_size >> blkbits;
for (relative_block = 0; ; relative_block++) {
if (relative_block == nblocks) {
clear_buffer_mapped(map_bh);
break;
} else if (page_block == blocks_per_page)
break;
blocks[page_block] = map_bh->b_blocknr+relative_block;
page_block++;
block_in_file++;
}
bdev = map_bh->b_bdev;
}
if (first_hole != blocks_per_page) {
zero_user_segment(page, first_hole << blkbits, PAGE_SIZE);
if (first_hole == 0) {
SetPageUptodate(page);
unlock_page(page);
goto out;
}
} else if (fully_mapped) {
SetPageMappedToDisk(page);
}
if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
cleancache_get_page(page) == 0) {
SetPageUptodate(page);
goto confused;
}
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
alloc_new:
if (args->bio == NULL) { if (first_hole == blocks_per_page) { if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
page))
goto out;
}
args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
bio_max_segs(args->nr_pages), gfp);
if (args->bio == NULL)
goto confused;
}
length = first_hole << blkbits; if (bio_add_page(args->bio, page, length, 0) < length) { args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
goto alloc_new;
}
relative_block = block_in_file - args->first_logical_block;
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
(first_hole != blocks_per_page))
args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
else
args->last_block_in_bio = blocks[blocks_per_page - 1];
out:
return args->bio;
confused:
if (args->bio)
args->bio = mpage_bio_submit(REQ_OP_READ, op_flags, args->bio);
if (!PageUptodate(page))
block_read_full_page(page, args->get_block);
else
unlock_page(page);
goto out;
}
/**
* mpage_readahead - start reads against pages
* @rac: Describes which pages to read.
* @get_block: The filesystem's block mapper function.
*
* This function walks the pages and the blocks within each page, building and
* emitting large BIOs.
*
* If anything unusual happens, such as:
*
* - encountering a page which has buffers
* - encountering a page which has a non-hole after a hole
* - encountering a page with non-contiguous blocks
*
* then this code just gives up and calls the buffer_head-based read function.
* It does handle a page which has holes at the end - that is a common case:
* the end-of-file on blocksize < PAGE_SIZE setups.
*
* BH_Boundary explanation:
*
* There is a problem. The mpage read code assembles several pages, gets all
* their disk mappings, and then submits them all. That's fine, but obtaining
* the disk mappings may require I/O. Reads of indirect blocks, for example.
*
* So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
* submitted in the following order:
*
* 12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
*
* because the indirect block has to be read to get the mappings of blocks
* 13,14,15,16. Obviously, this impacts performance.
*
* So what we do it to allow the filesystem's get_block() function to set
* BH_Boundary when it maps block 11. BH_Boundary says: mapping of the block
* after this one will require I/O against a block which is probably close to
* this one. So you should push what I/O you have currently accumulated.
*
* This all causes the disk requests to be issued in the correct order.
*/
void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
{
struct page *page;
struct mpage_readpage_args args = {
.get_block = get_block,
.is_readahead = true,
};
while ((page = readahead_page(rac))) {
prefetchw(&page->flags);
args.page = page;
args.nr_pages = readahead_count(rac);
args.bio = do_mpage_readpage(&args);
put_page(page);
}
if (args.bio)
mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio);
}
EXPORT_SYMBOL(mpage_readahead);
/*
* This isn't called much at all
*/
int mpage_readpage(struct page *page, get_block_t get_block)
{
struct mpage_readpage_args args = {
.page = page,
.nr_pages = 1,
.get_block = get_block,
};
args.bio = do_mpage_readpage(&args);
if (args.bio)
mpage_bio_submit(REQ_OP_READ, 0, args.bio);
return 0;
}
EXPORT_SYMBOL(mpage_readpage);
/*
* Writing is not so simple.
*
* If the page has buffers then they will be used for obtaining the disk
* mapping. We only support pages which are fully mapped-and-dirty, with a
* special case for pages which are unmapped at the end: end-of-file.
*
* If the page has no buffers (preferred) then the page is mapped here.
*
* If all blocks are found to be contiguous then the page can go into the
* BIO. Otherwise fall back to the mapping's writepage().
*
* FIXME: This code wants an estimate of how many pages are still to be
* written, so it can intelligently allocate a suitably-sized BIO. For now,
* just allocate full-size (16-page) BIOs.
*/
struct mpage_data {
struct bio *bio;
sector_t last_block_in_bio;
get_block_t *get_block;
unsigned use_writepage;
};
/*
* We have our BIO, so we can now mark the buffers clean. Make
* sure to only clean buffers which we know we'll be writing.
*/
static void clean_buffers(struct page *page, unsigned first_unmapped)
{
unsigned buffer_counter = 0;
struct buffer_head *bh, *head;
if (!page_has_buffers(page))
return;
head = page_buffers(page);
bh = head;
do {
if (buffer_counter++ == first_unmapped)
break;
clear_buffer_dirty(bh);
bh = bh->b_this_page;
} while (bh != head);
/*
* we cannot drop the bh if the page is not uptodate or a concurrent
* readpage would fail to serialize with the bh and it would read from
* disk before we reach the platter.
*/
if (buffer_heads_over_limit && PageUptodate(page))
try_to_free_buffers(page);
}
/*
* For situations where we want to clean all buffers attached to a page.
* We don't need to calculate how many buffers are attached to the page,
* we just need to specify a number larger than the maximum number of buffers.
*/
void clean_page_buffers(struct page *page)
{
clean_buffers(page, ~0U);
}
static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
void *data)
{
struct mpage_data *mpd = data;
struct bio *bio = mpd->bio;
struct address_space *mapping = page->mapping;
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
unsigned long end_index;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
sector_t last_block;
sector_t block_in_file;
sector_t blocks[MAX_BUF_PER_PAGE];
unsigned page_block;
unsigned first_unmapped = blocks_per_page;
struct block_device *bdev = NULL;
int boundary = 0;
sector_t boundary_block = 0;
struct block_device *boundary_bdev = NULL;
int length;
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
int op_flags = wbc_to_write_flags(wbc);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
/* If they're all mapped and dirty, do it */
page_block = 0;
do {
BUG_ON(buffer_locked(bh));
if (!buffer_mapped(bh)) {
/*
* unmapped dirty buffers are created by
* __set_page_dirty_buffers -> mmapped data
*/
if (buffer_dirty(bh))
goto confused;
if (first_unmapped == blocks_per_page)
first_unmapped = page_block;
continue;
}
if (first_unmapped != blocks_per_page)
goto confused; /* hole -> non-hole */
if (!buffer_dirty(bh) || !buffer_uptodate(bh))
goto confused;
if (page_block) {
if (bh->b_blocknr != blocks[page_block-1] + 1)
goto confused;
}
blocks[page_block++] = bh->b_blocknr;
boundary = buffer_boundary(bh);
if (boundary) {
boundary_block = bh->b_blocknr;
boundary_bdev = bh->b_bdev;
}
bdev = bh->b_bdev;
} while ((bh = bh->b_this_page) != head);
if (first_unmapped)
goto page_is_mapped;
/*
* Page has buffers, but they are all unmapped. The page was
* created by pagein or read over a hole which was handled by
* block_read_full_page(). If this address_space is also
* using mpage_readahead then this can rarely happen.
*/
goto confused;
}
/*
* The page has no buffers: map it to disk
*/
BUG_ON(!PageUptodate(page));
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = (i_size - 1) >> blkbits;
map_bh.b_page = page;
for (page_block = 0; page_block < blocks_per_page; ) {
map_bh.b_state = 0;
map_bh.b_size = 1 << blkbits;
if (mpd->get_block(inode, block_in_file, &map_bh, 1))
goto confused;
if (buffer_new(&map_bh))
clean_bdev_bh_alias(&map_bh);
if (buffer_boundary(&map_bh)) {
boundary_block = map_bh.b_blocknr;
boundary_bdev = map_bh.b_bdev;
}
if (page_block) {
if (map_bh.b_blocknr != blocks[page_block-1] + 1)
goto confused;
}
blocks[page_block++] = map_bh.b_blocknr;
boundary = buffer_boundary(&map_bh);
bdev = map_bh.b_bdev;
if (block_in_file == last_block)
break;
block_in_file++;
}
BUG_ON(page_block == 0);
first_unmapped = page_block;
page_is_mapped:
end_index = i_size >> PAGE_SHIFT;
if (page->index >= end_index) {
/*
* The page straddles i_size. It must be zeroed out on each
* and every writepage invocation because it may be mmapped.
* "A file is mapped in multiples of the page size. For a file
* that is not a multiple of the page size, the remaining memory
* is zeroed when mapped, and writes to that region are not
* written out to the file."
*/
unsigned offset = i_size & (PAGE_SIZE - 1);
if (page->index > end_index || !offset)
goto confused;
zero_user_segment(page, offset, PAGE_SIZE);
}
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
alloc_new:
if (bio == NULL) {
if (first_unmapped == blocks_per_page) {
if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9),
page, wbc))
goto out;
}
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
BIO_MAX_VECS, GFP_NOFS|__GFP_HIGH);
if (bio == NULL)
goto confused;
wbc_init_bio(wbc, bio);
bio->bi_write_hint = inode->i_write_hint;
}
/*
* Must try to add the page before marking the buffer clean or
* the confused fail path above (OOM) will be very confused when
* it finds all bh marked clean (i.e. it will not write anything)
*/
wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
goto alloc_new;
}
clean_buffers(page, first_unmapped);
BUG_ON(PageWriteback(page));
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
}
} else {
mpd->last_block_in_bio = blocks[blocks_per_page - 1];
}
goto out;
confused:
if (bio)
bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
} else {
ret = -EAGAIN;
goto out;
}
/*
* The caller has a ref on the inode, so *mapping is stable
*/
mapping_set_error(mapping, ret);
out:
mpd->bio = bio;
return ret;
}
/**
* mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
* @mapping: address space structure to write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
* @get_block: the filesystem's block mapper function.
* If this is NULL then use a_ops->writepage. Otherwise, go
* direct-to-BIO.
*
* This is a library function, which implements the writepages()
* address_space_operation.
*
* If a page is already under I/O, generic_writepages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
* and msync() need to guarantee that all the data which was dirty at the time
* the call was made get new I/O started against them. If wbc->sync_mode is
* WB_SYNC_ALL then we were called for data integrity and we must wait for
* existing IO to complete.
*/
int
mpage_writepages(struct address_space *mapping,
struct writeback_control *wbc, get_block_t get_block)
{
struct blk_plug plug;
int ret;
blk_start_plug(&plug);
if (!get_block)
ret = generic_writepages(mapping, wbc);
else {
struct mpage_data mpd = {
.bio = NULL,
.last_block_in_bio = 0,
.get_block = get_block,
.use_writepage = 1,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
if (mpd.bio) {
int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
REQ_SYNC : 0);
mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
}
}
blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL(mpage_writepages);
int mpage_writepage(struct page *page, get_block_t get_block,
struct writeback_control *wbc)
{
struct mpage_data mpd = {
.bio = NULL,
.last_block_in_bio = 0,
.get_block = get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
if (mpd.bio) {
int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
REQ_SYNC : 0);
mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
}
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/bitmap.h>
#include <linux/bug.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/xarray.h>
/**
* idr_alloc_u32() - Allocate an ID.
* @idr: IDR handle.
* @ptr: Pointer to be associated with the new ID.
* @nextid: Pointer to an ID.
* @max: The maximum ID to allocate (inclusive).
* @gfp: Memory allocation flags.
*
* Allocates an unused ID in the range specified by @nextid and @max.
* Note that @max is inclusive whereas the @end parameter to idr_alloc()
* is exclusive. The new ID is assigned to @nextid before the pointer
* is inserted into the IDR, so if @nextid points into the object pointed
* to by @ptr, a concurrent lookup will not find an uninitialised ID.
*
* The caller should provide their own locking to ensure that two
* concurrent modifications to the IDR are not possible. Read-only
* accesses to the IDR may be done under the RCU read lock or may
* exclude simultaneous writers.
*
* Return: 0 if an ID was allocated, -ENOMEM if memory allocation failed,
* or -ENOSPC if no free IDs could be found. If an error occurred,
* @nextid is unchanged.
*/
int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,
unsigned long max, gfp_t gfp)
{
struct radix_tree_iter iter;
void __rcu **slot;
unsigned int base = idr->idr_base;
unsigned int id = *nextid;
if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR)))
idr->idr_rt.xa_flags |= IDR_RT_MARKER;
id = (id < base) ? 0 : id - base;
radix_tree_iter_init(&iter, id);
slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base);
if (IS_ERR(slot))
return PTR_ERR(slot);
*nextid = iter.index + base;
/* there is a memory barrier inside radix_tree_iter_replace() */
radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);
return 0;
}
EXPORT_SYMBOL_GPL(idr_alloc_u32);
/**
* idr_alloc() - Allocate an ID.
* @idr: IDR handle.
* @ptr: Pointer to be associated with the new ID.
* @start: The minimum ID (inclusive).
* @end: The maximum ID (exclusive).
* @gfp: Memory allocation flags.
*
* Allocates an unused ID in the range specified by @start and @end. If
* @end is <= 0, it is treated as one larger than %INT_MAX. This allows
* callers to use @start + N as @end as long as N is within integer range.
*
* The caller should provide their own locking to ensure that two
* concurrent modifications to the IDR are not possible. Read-only
* accesses to the IDR may be done under the RCU read lock or may
* exclude simultaneous writers.
*
* Return: The newly allocated ID, -ENOMEM if memory allocation failed,
* or -ENOSPC if no free IDs could be found.
*/
int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
u32 id = start;
int ret;
if (WARN_ON_ONCE(start < 0))
return -EINVAL;
ret = idr_alloc_u32(idr, ptr, &id, end > 0 ? end - 1 : INT_MAX, gfp);
if (ret)
return ret;
return id;
}
EXPORT_SYMBOL_GPL(idr_alloc);
/**
* idr_alloc_cyclic() - Allocate an ID cyclically.
* @idr: IDR handle.
* @ptr: Pointer to be associated with the new ID.
* @start: The minimum ID (inclusive).
* @end: The maximum ID (exclusive).
* @gfp: Memory allocation flags.
*
* Allocates an unused ID in the range specified by @nextid and @end. If
* @end is <= 0, it is treated as one larger than %INT_MAX. This allows
* callers to use @start + N as @end as long as N is within integer range.
* The search for an unused ID will start at the last ID allocated and will
* wrap around to @start if no free IDs are found before reaching @end.
*
* The caller should provide their own locking to ensure that two
* concurrent modifications to the IDR are not possible. Read-only
* accesses to the IDR may be done under the RCU read lock or may
* exclude simultaneous writers.
*
* Return: The newly allocated ID, -ENOMEM if memory allocation failed,
* or -ENOSPC if no free IDs could be found.
*/
int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
u32 id = idr->idr_next; int err, max = end > 0 ? end - 1 : INT_MAX; if ((int)id < start) id = start; err = idr_alloc_u32(idr, ptr, &id, max, gfp); if ((err == -ENOSPC) && (id > start)) { id = start;
err = idr_alloc_u32(idr, ptr, &id, max, gfp);
}
if (err)
return err;
idr->idr_next = id + 1; return id;
}
EXPORT_SYMBOL(idr_alloc_cyclic);
/**
* idr_remove() - Remove an ID from the IDR.
* @idr: IDR handle.
* @id: Pointer ID.
*
* Removes this ID from the IDR. If the ID was not previously in the IDR,
* this function returns %NULL.
*
* Since this function modifies the IDR, the caller should provide their
* own locking to ensure that concurrent modification of the same IDR is
* not possible.
*
* Return: The pointer formerly associated with this ID.
*/
void *idr_remove(struct idr *idr, unsigned long id)
{
return radix_tree_delete_item(&idr->idr_rt, id - idr->idr_base, NULL);
}
EXPORT_SYMBOL_GPL(idr_remove);
/**
* idr_find() - Return pointer for given ID.
* @idr: IDR handle.
* @id: Pointer ID.
*
* Looks up the pointer associated with this ID. A %NULL pointer may
* indicate that @id is not allocated or that the %NULL pointer was
* associated with this ID.
*
* This function can be called under rcu_read_lock(), given that the leaf
* pointers lifetimes are correctly managed.
*
* Return: The pointer associated with this ID.
*/
void *idr_find(const struct idr *idr, unsigned long id)
{
return radix_tree_lookup(&idr->idr_rt, id - idr->idr_base);
}
EXPORT_SYMBOL_GPL(idr_find);
/**
* idr_for_each() - Iterate through all stored pointers.
* @idr: IDR handle.
* @fn: Function to be called for each pointer.
* @data: Data passed to callback function.
*
* The callback function will be called for each entry in @idr, passing
* the ID, the entry and @data.
*
* If @fn returns anything other than %0, the iteration stops and that
* value is returned from this function.
*
* idr_for_each() can be called concurrently with idr_alloc() and
* idr_remove() if protected by RCU. Newly added entries may not be
* seen and deleted entries may be seen, but adding and removing entries
* will not cause other entries to be skipped, nor spurious ones to be seen.
*/
int idr_for_each(const struct idr *idr,
int (*fn)(int id, void *p, void *data), void *data)
{
struct radix_tree_iter iter;
void __rcu **slot;
int base = idr->idr_base; radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) {
int ret;
unsigned long id = iter.index + base; if (WARN_ON_ONCE(id > INT_MAX))
break;
ret = fn(id, rcu_dereference_raw(*slot), data);
if (ret)
return ret;
}
return 0;
}
EXPORT_SYMBOL(idr_for_each);
/**
* idr_get_next_ul() - Find next populated entry.
* @idr: IDR handle.
* @nextid: Pointer to an ID.
*
* Returns the next populated entry in the tree with an ID greater than
* or equal to the value pointed to by @nextid. On exit, @nextid is updated
* to the ID of the found value. To use in a loop, the value pointed to by
* nextid must be incremented by the user.
*/
void *idr_get_next_ul(struct idr *idr, unsigned long *nextid)
{
struct radix_tree_iter iter;
void __rcu **slot;
void *entry = NULL;
unsigned long base = idr->idr_base;
unsigned long id = *nextid;
id = (id < base) ? 0 : id - base;
radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, id) {
entry = rcu_dereference_raw(*slot);
if (!entry)
continue;
if (!xa_is_internal(entry))
break;
if (slot != &idr->idr_rt.xa_head && !xa_is_retry(entry))
break;
slot = radix_tree_iter_retry(&iter);
}
if (!slot)
return NULL;
*nextid = iter.index + base;
return entry;
}
EXPORT_SYMBOL(idr_get_next_ul);
/**
* idr_get_next() - Find next populated entry.
* @idr: IDR handle.
* @nextid: Pointer to an ID.
*
* Returns the next populated entry in the tree with an ID greater than
* or equal to the value pointed to by @nextid. On exit, @nextid is updated
* to the ID of the found value. To use in a loop, the value pointed to by
* nextid must be incremented by the user.
*/
void *idr_get_next(struct idr *idr, int *nextid)
{
unsigned long id = *nextid;
void *entry = idr_get_next_ul(idr, &id);
if (WARN_ON_ONCE(id > INT_MAX))
return NULL;
*nextid = id;
return entry;
}
EXPORT_SYMBOL(idr_get_next);
/**
* idr_replace() - replace pointer for given ID.
* @idr: IDR handle.
* @ptr: New pointer to associate with the ID.
* @id: ID to change.
*
* Replace the pointer registered with an ID and return the old value.
* This function can be called under the RCU read lock concurrently with
* idr_alloc() and idr_remove() (as long as the ID being removed is not
* the one being replaced!).
*
* Returns: the old value on success. %-ENOENT indicates that @id was not
* found. %-EINVAL indicates that @ptr was not valid.
*/
void *idr_replace(struct idr *idr, void *ptr, unsigned long id)
{
struct radix_tree_node *node;
void __rcu **slot = NULL;
void *entry;
id -= idr->idr_base;
entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot);
if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE))
return ERR_PTR(-ENOENT);
__radix_tree_replace(&idr->idr_rt, node, slot, ptr);
return entry;
}
EXPORT_SYMBOL(idr_replace);
/**
* DOC: IDA description
*
* The IDA is an ID allocator which does not provide the ability to
* associate an ID with a pointer. As such, it only needs to store one
* bit per ID, and so is more space efficient than an IDR. To use an IDA,
* define it using DEFINE_IDA() (or embed a &struct ida in a data structure,
* then initialise it using ida_init()). To allocate a new ID, call
* ida_alloc(), ida_alloc_min(), ida_alloc_max() or ida_alloc_range().
* To free an ID, call ida_free().
*
* ida_destroy() can be used to dispose of an IDA without needing to
* free the individual IDs in it. You can use ida_is_empty() to find
* out whether the IDA has any IDs currently allocated.
*
* The IDA handles its own locking. It is safe to call any of the IDA
* functions without synchronisation in your code.
*
* IDs are currently limited to the range [0-INT_MAX]. If this is an awkward
* limitation, it should be quite straightforward to raise the maximum.
*/
/*
* Developer's notes:
*
* The IDA uses the functionality provided by the XArray to store bitmaps in
* each entry. The XA_FREE_MARK is only cleared when all bits in the bitmap
* have been set.
*
* I considered telling the XArray that each slot is an order-10 node
* and indexing by bit number, but the XArray can't allow a single multi-index
* entry in the head, which would significantly increase memory consumption
* for the IDA. So instead we divide the index by the number of bits in the
* leaf bitmap before doing a radix tree lookup.
*
* As an optimisation, if there are only a few low bits set in any given
* leaf, instead of allocating a 128-byte bitmap, we store the bits
* as a value entry. Value entries never have the XA_FREE_MARK cleared
* because we can always convert them into a bitmap entry.
*
* It would be possible to optimise further; once we've run out of a
* single 128-byte bitmap, we currently switch to a 576-byte node, put
* the 128-byte bitmap in the first entry and then start allocating extra
* 128-byte entries. We could instead use the 512 bytes of the node's
* data as a bitmap before moving to that scheme. I do not believe this
* is a worthwhile optimisation; Rasmus Villemoes surveyed the current
* users of the IDA and almost none of them use more than 1024 entries.
* Those that do use more than the 8192 IDs that the 512 bytes would
* provide.
*
* The IDA always uses a lock to alloc/free. If we add a 'test_bit'
* equivalent, it will still need locking. Going to RCU lookup would require
* using RCU to free bitmaps, and that's not trivial without embedding an
* RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte
* bitmap, which is excessive.
*/
/**
* ida_alloc_range() - Allocate an unused ID.
* @ida: IDA handle.
* @min: Lowest ID to allocate.
* @max: Highest ID to allocate.
* @gfp: Memory allocation flags.
*
* Allocate an ID between @min and @max, inclusive. The allocated ID will
* not exceed %INT_MAX, even if @max is larger.
*
* Context: Any context. It is safe to call this function without
* locking in your code.
* Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
* or %-ENOSPC if there are no free IDs.
*/
int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max,
gfp_t gfp)
{
XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS);
unsigned bit = min % IDA_BITMAP_BITS;
unsigned long flags;
struct ida_bitmap *bitmap, *alloc = NULL;
if ((int)min < 0)
return -ENOSPC;
if ((int)max < 0)
max = INT_MAX;
retry:
xas_lock_irqsave(&xas, flags);
next:
bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK);
if (xas.xa_index > min / IDA_BITMAP_BITS)
bit = 0;
if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
goto nospc;
if (xa_is_value(bitmap)) {
unsigned long tmp = xa_to_value(bitmap);
if (bit < BITS_PER_XA_VALUE) {
bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit);
if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
goto nospc;
if (bit < BITS_PER_XA_VALUE) { tmp |= 1UL << bit;
xas_store(&xas, xa_mk_value(tmp));
goto out;
}
}
bitmap = alloc;
if (!bitmap)
bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
if (!bitmap)
goto alloc;
bitmap->bitmap[0] = tmp;
xas_store(&xas, bitmap);
if (xas_error(&xas)) {
bitmap->bitmap[0] = 0;
goto out;
}
}
if (bitmap) { bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit);
if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
goto nospc;
if (bit == IDA_BITMAP_BITS)
goto next;
__set_bit(bit, bitmap->bitmap);
if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS))
xas_clear_mark(&xas, XA_FREE_MARK);
} else {
if (bit < BITS_PER_XA_VALUE) { bitmap = xa_mk_value(1UL << bit);
} else {
bitmap = alloc;
if (!bitmap)
bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
if (!bitmap)
goto alloc;
__set_bit(bit, bitmap->bitmap);
}
xas_store(&xas, bitmap);
}
out:
xas_unlock_irqrestore(&xas, flags);
if (xas_nomem(&xas, gfp)) {
xas.xa_index = min / IDA_BITMAP_BITS;
bit = min % IDA_BITMAP_BITS;
goto retry;
}
if (bitmap != alloc) kfree(alloc); if (xas_error(&xas))
return xas_error(&xas);
return xas.xa_index * IDA_BITMAP_BITS + bit;
alloc:
xas_unlock_irqrestore(&xas, flags);
alloc = kzalloc(sizeof(*bitmap), gfp);
if (!alloc)
return -ENOMEM;
xas_set(&xas, min / IDA_BITMAP_BITS);
bit = min % IDA_BITMAP_BITS;
goto retry;
nospc:
xas_unlock_irqrestore(&xas, flags);
kfree(alloc);
return -ENOSPC;
}
EXPORT_SYMBOL(ida_alloc_range);
/**
* ida_free() - Release an allocated ID.
* @ida: IDA handle.
* @id: Previously allocated ID.
*
* Context: Any context. It is safe to call this function without
* locking in your code.
*/
void ida_free(struct ida *ida, unsigned int id)
{
XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS);
unsigned bit = id % IDA_BITMAP_BITS;
struct ida_bitmap *bitmap;
unsigned long flags;
BUG_ON((int)id < 0); xas_lock_irqsave(&xas, flags);
bitmap = xas_load(&xas);
if (xa_is_value(bitmap)) {
unsigned long v = xa_to_value(bitmap);
if (bit >= BITS_PER_XA_VALUE)
goto err;
if (!(v & (1UL << bit)))
goto err;
v &= ~(1UL << bit);
if (!v)
goto delete;
xas_store(&xas, xa_mk_value(v));
} else {
if (!test_bit(bit, bitmap->bitmap))
goto err;
__clear_bit(bit, bitmap->bitmap);
xas_set_mark(&xas, XA_FREE_MARK);
if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) {
kfree(bitmap);
delete:
xas_store(&xas, NULL);
}
}
xas_unlock_irqrestore(&xas, flags);
return;
err:
xas_unlock_irqrestore(&xas, flags); WARN(1, "ida_free called for id=%d which is not allocated.\n", id);
}
EXPORT_SYMBOL(ida_free);
/**
* ida_destroy() - Free all IDs.
* @ida: IDA handle.
*
* Calling this function frees all IDs and releases all resources used
* by an IDA. When this call returns, the IDA is empty and can be reused
* or freed. If the IDA is already empty, there is no need to call this
* function.
*
* Context: Any context. It is safe to call this function without
* locking in your code.
*/
void ida_destroy(struct ida *ida)
{
XA_STATE(xas, &ida->xa, 0);
struct ida_bitmap *bitmap;
unsigned long flags;
xas_lock_irqsave(&xas, flags);
xas_for_each(&xas, bitmap, ULONG_MAX) {
if (!xa_is_value(bitmap))
kfree(bitmap);
xas_store(&xas, NULL);
}
xas_unlock_irqrestore(&xas, flags);
}
EXPORT_SYMBOL(ida_destroy);
#ifndef __KERNEL__
extern void xa_dump_index(unsigned long index, unsigned int shift);
#define IDA_CHUNK_SHIFT ilog2(IDA_BITMAP_BITS)
static void ida_dump_entry(void *entry, unsigned long index)
{
unsigned long i;
if (!entry)
return;
if (xa_is_node(entry)) {
struct xa_node *node = xa_to_node(entry);
unsigned int shift = node->shift + IDA_CHUNK_SHIFT +
XA_CHUNK_SHIFT;
xa_dump_index(index * IDA_BITMAP_BITS, shift);
xa_dump_node(node);
for (i = 0; i < XA_CHUNK_SIZE; i++)
ida_dump_entry(node->slots[i],
index | (i << node->shift));
} else if (xa_is_value(entry)) {
xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG));
pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry);
} else {
struct ida_bitmap *bitmap = entry;
xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT);
pr_cont("bitmap: %p data", bitmap);
for (i = 0; i < IDA_BITMAP_LONGS; i++)
pr_cont(" %lx", bitmap->bitmap[i]);
pr_cont("\n");
}
}
static void ida_dump(struct ida *ida)
{
struct xarray *xa = &ida->xa;
pr_debug("ida: %p node %p free %d\n", ida, xa->xa_head,
xa->xa_flags >> ROOT_TAG_SHIFT);
ida_dump_entry(xa->xa_head, 0);
}
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/fs_struct.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include "mount.h"
struct prepend_buffer {
char *buf;
int len;
};
#define DECLARE_BUFFER(__name, __buf, __len) \
struct prepend_buffer __name = {.buf = __buf + __len, .len = __len}
static char *extract_string(struct prepend_buffer *p)
{
if (likely(p->len >= 0))
return p->buf;
return ERR_PTR(-ENAMETOOLONG);
}
static bool prepend_char(struct prepend_buffer *p, unsigned char c)
{
if (likely(p->len > 0)) { p->len--;
*--p->buf = c;
return true;
}
p->len = -1;
return false;
}
/*
* The source of the prepend data can be an optimistoc load
* of a dentry name and length. And because we don't hold any
* locks, the length and the pointer to the name may not be
* in sync if a concurrent rename happens, and the kernel
* copy might fault as a result.
*
* The end result will correct itself when we check the
* rename sequence count, but we need to be able to handle
* the fault gracefully.
*/
static bool prepend_copy(void *dst, const void *src, int len)
{
if (unlikely(copy_from_kernel_nofault(dst, src, len))) { memset(dst, 'x', len);
return false;
}
return true;
}
static bool prepend(struct prepend_buffer *p, const char *str, int namelen)
{
// Already overflowed?
if (p->len < 0)
return false;
// Will overflow?
if (p->len < namelen) {
// Fill as much as possible from the end of the name
str += namelen - p->len;
p->buf -= p->len;
prepend_copy(p->buf, str, p->len);
p->len = -1;
return false;
}
// Fits fully
p->len -= namelen;
p->buf -= namelen;
return prepend_copy(p->buf, str, namelen);
}
/**
* prepend_name - prepend a pathname in front of current buffer pointer
* @buffer: buffer pointer
* @buflen: allocated length of the buffer
* @name: name string and length qstr structure
*
* With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
* make sure that either the old or the new name pointer and length are
* fetched. However, there may be mismatch between length and pointer.
* But since the length cannot be trusted, we need to copy the name very
* carefully when doing the prepend_copy(). It also prepends "/" at
* the beginning of the name. The sequence number check at the caller will
* retry it again when a d_move() does happen. So any garbage in the buffer
* due to mismatched pointer and length will be discarded.
*
* Load acquire is needed to make sure that we see the new name data even
* if we might get the length wrong.
*/
static bool prepend_name(struct prepend_buffer *p, const struct qstr *name)
{
const char *dname = smp_load_acquire(&name->name); /* ^^^ */
u32 dlen = READ_ONCE(name->len);
return prepend(p, dname, dlen) && prepend_char(p, '/');
}
static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
const struct path *root, struct prepend_buffer *p)
{
while (dentry != root->dentry || &mnt->mnt != root->mnt) {
const struct dentry *parent = READ_ONCE(dentry->d_parent);
if (dentry == mnt->mnt.mnt_root) {
struct mount *m = READ_ONCE(mnt->mnt_parent);
struct mnt_namespace *mnt_ns;
if (likely(mnt != m)) {
dentry = READ_ONCE(mnt->mnt_mountpoint);
mnt = m;
continue;
}
/* Global root */
mnt_ns = READ_ONCE(mnt->mnt_ns);
/* open-coded is_mounted() to use local mnt_ns */
if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
return 1; // absolute root
else
return 2; // detached or not attached yet
}
if (unlikely(dentry == parent))
/* Escaped? */
return 3;
prefetch(parent);
if (!prepend_name(p, &dentry->d_name))
break;
dentry = parent;
}
return 0;
}
/**
* prepend_path - Prepend path string to a buffer
* @path: the dentry/vfsmount to report
* @root: root vfsmnt/dentry
* @buffer: pointer to the end of the buffer
* @buflen: pointer to buffer length
*
* The function will first try to write out the pathname without taking any
* lock other than the RCU read lock to make sure that dentries won't go away.
* It only checks the sequence number of the global rename_lock as any change
* in the dentry's d_seq will be preceded by changes in the rename_lock
* sequence number. If the sequence number had been changed, it will restart
* the whole pathname back-tracing sequence again by taking the rename_lock.
* In this case, there is no need to take the RCU read lock as the recursive
* parent pointer references will keep the dentry chain alive as long as no
* rename operation is performed.
*/
static int prepend_path(const struct path *path,
const struct path *root,
struct prepend_buffer *p)
{
unsigned seq, m_seq = 0;
struct prepend_buffer b;
int error;
rcu_read_lock();
restart_mnt:
read_seqbegin_or_lock(&mount_lock, &m_seq);
seq = 0;
rcu_read_lock();
restart:
b = *p;
read_seqbegin_or_lock(&rename_lock, &seq);
error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b);
if (!(seq & 1))
rcu_read_unlock();
if (need_seqretry(&rename_lock, seq)) {
seq = 1;
goto restart;
}
done_seqretry(&rename_lock, seq);
if (!(m_seq & 1))
rcu_read_unlock();
if (need_seqretry(&mount_lock, m_seq)) {
m_seq = 1;
goto restart_mnt;
}
done_seqretry(&mount_lock, m_seq);
if (unlikely(error == 3))
b = *p;
if (b.len == p->len)
prepend_char(&b, '/');
*p = b;
return error;
}
/**
* __d_path - return the path of a dentry
* @path: the dentry/vfsmount to report
* @root: root vfsmnt/dentry
* @buf: buffer to return value in
* @buflen: buffer length
*
* Convert a dentry into an ASCII path name.
*
* Returns a pointer into the buffer or an error code if the
* path was too long.
*
* "buflen" should be positive.
*
* If the path is not reachable from the supplied root, return %NULL.
*/
char *__d_path(const struct path *path,
const struct path *root,
char *buf, int buflen)
{
DECLARE_BUFFER(b, buf, buflen);
prepend_char(&b, 0);
if (unlikely(prepend_path(path, root, &b) > 0))
return NULL;
return extract_string(&b);
}
char *d_absolute_path(const struct path *path,
char *buf, int buflen)
{
struct path root = {};
DECLARE_BUFFER(b, buf, buflen);
prepend_char(&b, 0);
if (unlikely(prepend_path(path, &root, &b) > 1))
return ERR_PTR(-EINVAL);
return extract_string(&b);
}
static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
{
unsigned seq;
do {
seq = read_seqcount_begin(&fs->seq);
*root = fs->root;
} while (read_seqcount_retry(&fs->seq, seq));
}
/**
* d_path - return the path of a dentry
* @path: path to report
* @buf: buffer to return value in
* @buflen: buffer length
*
* Convert a dentry into an ASCII path name. If the entry has been deleted
* the string " (deleted)" is appended. Note that this is ambiguous.
*
* Returns a pointer into the buffer or an error code if the path was
* too long. Note: Callers should use the returned pointer, not the passed
* in buffer, to use the name! The implementation often starts at an offset
* into the buffer, and may leave 0 bytes at the start.
*
* "buflen" should be positive.
*/
char *d_path(const struct path *path, char *buf, int buflen)
{
DECLARE_BUFFER(b, buf, buflen);
struct path root;
/*
* We have various synthetic filesystems that never get mounted. On
* these filesystems dentries are never used for lookup purposes, and
* thus don't need to be hashed. They also don't need a name until a
* user wants to identify the object in /proc/pid/fd/. The little hack
* below allows us to generate a name for these objects on demand:
*
* Some pseudo inodes are mountable. When they are mounted
* path->dentry == path->mnt->mnt_root. In that case don't call d_dname
* and instead have d_path return the mounted path.
*/
if (path->dentry->d_op && path->dentry->d_op->d_dname &&
(!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
rcu_read_lock();
get_fs_root_rcu(current->fs, &root);
if (unlikely(d_unlinked(path->dentry)))
prepend(&b, " (deleted)", 11);
else
prepend_char(&b, 0);
prepend_path(path, &root, &b);
rcu_read_unlock();
return extract_string(&b);
}
EXPORT_SYMBOL(d_path);
/*
* Helper function for dentry_operations.d_dname() members
*/
char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
const char *fmt, ...)
{
va_list args;
char temp[64];
int sz;
va_start(args, fmt);
sz = vsnprintf(temp, sizeof(temp), fmt, args) + 1;
va_end(args);
if (sz > sizeof(temp) || sz > buflen)
return ERR_PTR(-ENAMETOOLONG);
buffer += buflen - sz;
return memcpy(buffer, temp, sz);
}
char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
{
DECLARE_BUFFER(b, buffer, buflen);
/* these dentries are never renamed, so d_lock is not needed */
prepend(&b, " (deleted)", 11);
prepend(&b, dentry->d_name.name, dentry->d_name.len);
prepend_char(&b, '/');
return extract_string(&b);
}
/*
* Write full pathname from the root of the filesystem into the buffer.
*/
static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p)
{
const struct dentry *dentry;
struct prepend_buffer b;
int seq = 0;
rcu_read_lock();
restart:
dentry = d;
b = *p;
read_seqbegin_or_lock(&rename_lock, &seq);
while (!IS_ROOT(dentry)) {
const struct dentry *parent = dentry->d_parent;
prefetch(parent);
if (!prepend_name(&b, &dentry->d_name))
break;
dentry = parent;
}
if (!(seq & 1))
rcu_read_unlock();
if (need_seqretry(&rename_lock, seq)) {
seq = 1;
goto restart;
}
done_seqretry(&rename_lock, seq);
if (b.len == p->len)
prepend_char(&b, '/');
return extract_string(&b);
}
char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen)
{
DECLARE_BUFFER(b, buf, buflen);
prepend_char(&b, 0);
return __dentry_path(dentry, &b);
}
EXPORT_SYMBOL(dentry_path_raw);
char *dentry_path(const struct dentry *dentry, char *buf, int buflen)
{
DECLARE_BUFFER(b, buf, buflen);
if (unlikely(d_unlinked(dentry)))
prepend(&b, "//deleted", 10);
else
prepend_char(&b, 0);
return __dentry_path(dentry, &b);
}
static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
struct path *pwd)
{
unsigned seq;
do {
seq = read_seqcount_begin(&fs->seq);
*root = fs->root;
*pwd = fs->pwd;
} while (read_seqcount_retry(&fs->seq, seq));
}
/*
* NOTE! The user-level library version returns a
* character pointer. The kernel system call just
* returns the length of the buffer filled (which
* includes the ending '\0' character), or a negative
* error value. So libc would do something like
*
* char *getcwd(char * buf, size_t size)
* {
* int retval;
*
* retval = sys_getcwd(buf, size);
* if (retval >= 0)
* return buf;
* errno = -retval;
* return NULL;
* }
*/
SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
{
int error;
struct path pwd, root;
char *page = __getname();
if (!page)
return -ENOMEM;
rcu_read_lock();
get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
if (unlikely(d_unlinked(pwd.dentry))) {
rcu_read_unlock();
error = -ENOENT;
} else {
unsigned len;
DECLARE_BUFFER(b, page, PATH_MAX);
prepend_char(&b, 0);
if (unlikely(prepend_path(&pwd, &root, &b) > 0))
prepend(&b, "(unreachable)", 13);
rcu_read_unlock();
len = PATH_MAX - b.len;
if (unlikely(len > PATH_MAX))
error = -ENAMETOOLONG;
else if (unlikely(len > size))
error = -ERANGE;
else if (copy_to_user(buf, b.buf, len))
error = -EFAULT;
else
error = len;
}
__putname(page);
return error;
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_BL_H
#define _LINUX_LIST_BL_H
#include <linux/list.h>
#include <linux/bit_spinlock.h>
/*
* Special version of lists, where head of the list has a lock in the lowest
* bit. This is useful for scalable hash tables without increasing memory
* footprint overhead.
*
* For modification operations, the 0 bit of hlist_bl_head->first
* pointer must be set.
*
* With some small modifications, this can easily be adapted to store several
* arbitrary bits (not just a single lock bit), if the need arises to store
* some fast and compact auxiliary data.
*/
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
#define LIST_BL_LOCKMASK 1UL
#else
#define LIST_BL_LOCKMASK 0UL
#endif
#ifdef CONFIG_DEBUG_LIST
#define LIST_BL_BUG_ON(x) BUG_ON(x)
#else
#define LIST_BL_BUG_ON(x)
#endif
struct hlist_bl_head {
struct hlist_bl_node *first;
};
struct hlist_bl_node {
struct hlist_bl_node *next, **pprev;
};
#define INIT_HLIST_BL_HEAD(ptr) \
((ptr)->first = NULL)
static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
{
h->next = NULL;
h->pprev = NULL;
}
#define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member)
static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h)
{
return !h->pprev;
}
static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
{
return (struct hlist_bl_node *)
((unsigned long)h->first & ~LIST_BL_LOCKMASK);
}
static inline void hlist_bl_set_first(struct hlist_bl_head *h,
struct hlist_bl_node *n)
{
LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
LIST_BL_LOCKMASK);
h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK);
}
static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
{
return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK);
}
static inline void hlist_bl_add_head(struct hlist_bl_node *n,
struct hlist_bl_head *h)
{
struct hlist_bl_node *first = hlist_bl_first(h);
n->next = first;
if (first)
first->pprev = &n->next;
n->pprev = &h->first;
hlist_bl_set_first(h, n);
}
static inline void hlist_bl_add_before(struct hlist_bl_node *n,
struct hlist_bl_node *next)
{
struct hlist_bl_node **pprev = next->pprev;
n->pprev = pprev;
n->next = next;
next->pprev = &n->next;
/* pprev may be `first`, so be careful not to lose the lock bit */
WRITE_ONCE(*pprev,
(struct hlist_bl_node *)
((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK)));
}
static inline void hlist_bl_add_behind(struct hlist_bl_node *n,
struct hlist_bl_node *prev)
{
n->next = prev->next;
n->pprev = &prev->next;
prev->next = n;
if (n->next)
n->next->pprev = &n->next;
}
static inline void __hlist_bl_del(struct hlist_bl_node *n)
{
struct hlist_bl_node *next = n->next;
struct hlist_bl_node **pprev = n->pprev;
LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
/* pprev may be `first`, so be careful not to lose the lock bit */
WRITE_ONCE(*pprev,
(struct hlist_bl_node *)
((unsigned long)next |
((unsigned long)*pprev & LIST_BL_LOCKMASK)));
if (next)
next->pprev = pprev;
}
static inline void hlist_bl_del(struct hlist_bl_node *n)
{
__hlist_bl_del(n);
n->next = LIST_POISON1;
n->pprev = LIST_POISON2;
}
static inline void hlist_bl_del_init(struct hlist_bl_node *n)
{
if (!hlist_bl_unhashed(n)) {
__hlist_bl_del(n);
INIT_HLIST_BL_NODE(n);
}
}
static inline void hlist_bl_lock(struct hlist_bl_head *b)
{
bit_spin_lock(0, (unsigned long *)b);
}
static inline void hlist_bl_unlock(struct hlist_bl_head *b)
{
__bit_spin_unlock(0, (unsigned long *)b);
}
static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
{
return bit_spin_is_locked(0, (unsigned long *)b);
}
/**
* hlist_bl_for_each_entry - iterate over list of given type
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct hlist_node to use as a loop cursor.
* @head: the head for your list.
* @member: the name of the hlist_node within the struct.
*
*/
#define hlist_bl_for_each_entry(tpos, pos, head, member) \
for (pos = hlist_bl_first(head); \
pos && \
({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \
pos = pos->next)
/**
* hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry
* @tpos: the type * to use as a loop cursor.
* @pos: the &struct hlist_node to use as a loop cursor.
* @n: another &struct hlist_node to use as temporary storage
* @head: the head for your list.
* @member: the name of the hlist_node within the struct.
*/
#define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member) \
for (pos = hlist_bl_first(head); \
pos && ({ n = pos->next; 1; }) && \
({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \
pos = n)
#endif
// SPDX-License-Identifier: GPL-2.0-or-later
/* Provide a way to create a superblock configuration context within the kernel
* that allows a superblock to be set up prior to mounting.
*
* Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/security.h>
#include <linux/mnt_namespace.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <net/net_namespace.h>
#include <asm/sections.h>
#include "mount.h"
#include "internal.h"
enum legacy_fs_param {
LEGACY_FS_UNSET_PARAMS,
LEGACY_FS_MONOLITHIC_PARAMS,
LEGACY_FS_INDIVIDUAL_PARAMS,
};
struct legacy_fs_context {
char *legacy_data; /* Data page for legacy filesystems */
size_t data_size;
enum legacy_fs_param param_type;
};
static int legacy_init_fs_context(struct fs_context *fc);
static const struct constant_table common_set_sb_flag[] = {
{ "dirsync", SB_DIRSYNC },
{ "lazytime", SB_LAZYTIME },
{ "mand", SB_MANDLOCK },
{ "ro", SB_RDONLY },
{ "sync", SB_SYNCHRONOUS },
{ },
};
static const struct constant_table common_clear_sb_flag[] = {
{ "async", SB_SYNCHRONOUS },
{ "nolazytime", SB_LAZYTIME },
{ "nomand", SB_MANDLOCK },
{ "rw", SB_RDONLY },
{ },
};
/*
* Check for a common mount option that manipulates s_flags.
*/
static int vfs_parse_sb_flag(struct fs_context *fc, const char *key)
{
unsigned int token;
token = lookup_constant(common_set_sb_flag, key, 0);
if (token) {
fc->sb_flags |= token;
fc->sb_flags_mask |= token;
return 0;
}
token = lookup_constant(common_clear_sb_flag, key, 0);
if (token) {
fc->sb_flags &= ~token;
fc->sb_flags_mask |= token;
return 0;
}
return -ENOPARAM;
}
/**
* vfs_parse_fs_param_source - Handle setting "source" via parameter
* @fc: The filesystem context to modify
* @param: The parameter
*
* This is a simple helper for filesystems to verify that the "source" they
* accept is sane.
*
* Returns 0 on success, -ENOPARAM if this is not "source" parameter, and
* -EINVAL otherwise. In the event of failure, supplementary error information
* is logged.
*/
int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param)
{
if (strcmp(param->key, "source") != 0)
return -ENOPARAM;
if (param->type != fs_value_is_string) return invalf(fc, "Non-string source"); if (fc->source) return invalf(fc, "Multiple sources"); fc->source = param->string;
param->string = NULL;
return 0;
}
EXPORT_SYMBOL(vfs_parse_fs_param_source);
/**
* vfs_parse_fs_param - Add a single parameter to a superblock config
* @fc: The filesystem context to modify
* @param: The parameter
*
* A single mount option in string form is applied to the filesystem context
* being set up. Certain standard options (for example "ro") are translated
* into flag bits without going to the filesystem. The active security module
* is allowed to observe and poach options. Any other options are passed over
* to the filesystem to parse.
*
* This may be called multiple times for a context.
*
* Returns 0 on success and a negative error code on failure. In the event of
* failure, supplementary error information may have been set.
*/
int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
{
int ret;
if (!param->key) return invalf(fc, "Unnamed parameter\n");
ret = vfs_parse_sb_flag(fc, param->key);
if (ret != -ENOPARAM)
return ret;
ret = security_fs_context_parse_param(fc, param);
if (ret != -ENOPARAM)
/* Param belongs to the LSM or is disallowed by the LSM; so
* don't pass to the FS.
*/
return ret;
if (fc->ops->parse_param) { ret = fc->ops->parse_param(fc, param);
if (ret != -ENOPARAM)
return ret;
}
/* If the filesystem doesn't take any arguments, give it the
* default handling of source.
*/
ret = vfs_parse_fs_param_source(fc, param);
if (ret != -ENOPARAM)
return ret;
return invalf(fc, "%s: Unknown parameter '%s'",
fc->fs_type->name, param->key);
}
EXPORT_SYMBOL(vfs_parse_fs_param);
/**
* vfs_parse_fs_string - Convenience function to just parse a string.
*/
int vfs_parse_fs_string(struct fs_context *fc, const char *key,
const char *value, size_t v_size)
{
int ret;
struct fs_parameter param = {
.key = key,
.type = fs_value_is_flag,
.size = v_size,
};
if (value) {
param.string = kmemdup_nul(value, v_size, GFP_KERNEL);
if (!param.string)
return -ENOMEM;
param.type = fs_value_is_string;
}
ret = vfs_parse_fs_param(fc, ¶m);
kfree(param.string);
return ret;
}
EXPORT_SYMBOL(vfs_parse_fs_string);
/**
* generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
* @ctx: The superblock configuration to fill in.
* @data: The data to parse
*
* Parse a blob of data that's in key[=val][,key[=val]]* form. This can be
* called from the ->monolithic_mount_data() fs_context operation.
*
* Returns 0 on success or the error returned by the ->parse_option() fs_context
* operation on failure.
*/
int generic_parse_monolithic(struct fs_context *fc, void *data)
{
char *options = data, *key;
int ret = 0;
if (!options)
return 0;
ret = security_sb_eat_lsm_opts(options, &fc->security);
if (ret)
return ret;
while ((key = strsep(&options, ",")) != NULL) {
if (*key) {
size_t v_len = 0;
char *value = strchr(key, '=');
if (value) {
if (value == key)
continue;
*value++ = 0;
v_len = strlen(value);
}
ret = vfs_parse_fs_string(fc, key, value, v_len);
if (ret < 0)
break;
}
}
return ret;
}
EXPORT_SYMBOL(generic_parse_monolithic);
/**
* alloc_fs_context - Create a filesystem context.
* @fs_type: The filesystem type.
* @reference: The dentry from which this one derives (or NULL)
* @sb_flags: Filesystem/superblock flags (SB_*)
* @sb_flags_mask: Applicable members of @sb_flags
* @purpose: The purpose that this configuration shall be used for.
*
* Open a filesystem and create a mount context. The mount context is
* initialised with the supplied flags and, if a submount/automount from
* another superblock (referred to by @reference) is supplied, may have
* parameters such as namespaces copied across from that superblock.
*/
static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
struct dentry *reference,
unsigned int sb_flags,
unsigned int sb_flags_mask,
enum fs_context_purpose purpose)
{
int (*init_fs_context)(struct fs_context *);
struct fs_context *fc;
int ret = -ENOMEM;
fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT);
if (!fc)
return ERR_PTR(-ENOMEM);
fc->purpose = purpose;
fc->sb_flags = sb_flags;
fc->sb_flags_mask = sb_flags_mask;
fc->fs_type = get_filesystem(fs_type);
fc->cred = get_current_cred(); fc->net_ns = get_net(current->nsproxy->net_ns);
fc->log.prefix = fs_type->name;
mutex_init(&fc->uapi_mutex);
switch (purpose) {
case FS_CONTEXT_FOR_MOUNT:
fc->user_ns = get_user_ns(fc->cred->user_ns);
break;
case FS_CONTEXT_FOR_SUBMOUNT:
fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
break;
case FS_CONTEXT_FOR_RECONFIGURE:
atomic_inc(&reference->d_sb->s_active);
fc->user_ns = get_user_ns(reference->d_sb->s_user_ns);
fc->root = dget(reference);
break;
}
/* TODO: Make all filesystems support this unconditionally */
init_fs_context = fc->fs_type->init_fs_context;
if (!init_fs_context)
init_fs_context = legacy_init_fs_context;
ret = init_fs_context(fc);
if (ret < 0)
goto err_fc;
fc->need_free = true; return fc;
err_fc:
put_fs_context(fc);
return ERR_PTR(ret);
}
struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
unsigned int sb_flags)
{
return alloc_fs_context(fs_type, NULL, sb_flags, 0,
FS_CONTEXT_FOR_MOUNT);
}
EXPORT_SYMBOL(fs_context_for_mount);
struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
unsigned int sb_flags,
unsigned int sb_flags_mask)
{
return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags,
sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE);
}
EXPORT_SYMBOL(fs_context_for_reconfigure);
struct fs_context *fs_context_for_submount(struct file_system_type *type,
struct dentry *reference)
{
return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT);
}
EXPORT_SYMBOL(fs_context_for_submount);
void fc_drop_locked(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
dput(fc->root);
fc->root = NULL;
deactivate_locked_super(sb);
}
static void legacy_fs_context_free(struct fs_context *fc);
/**
* vfs_dup_fc_config: Duplicate a filesystem context.
* @src_fc: The context to copy.
*/
struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
{
struct fs_context *fc;
int ret;
if (!src_fc->ops->dup)
return ERR_PTR(-EOPNOTSUPP);
fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL);
if (!fc)
return ERR_PTR(-ENOMEM);
mutex_init(&fc->uapi_mutex);
fc->fs_private = NULL;
fc->s_fs_info = NULL;
fc->source = NULL;
fc->security = NULL;
get_filesystem(fc->fs_type);
get_net(fc->net_ns);
get_user_ns(fc->user_ns);
get_cred(fc->cred);
if (fc->log.log)
refcount_inc(&fc->log.log->usage);
/* Can't call put until we've called ->dup */
ret = fc->ops->dup(fc, src_fc);
if (ret < 0)
goto err_fc;
ret = security_fs_context_dup(fc, src_fc);
if (ret < 0)
goto err_fc;
return fc;
err_fc:
put_fs_context(fc);
return ERR_PTR(ret);
}
EXPORT_SYMBOL(vfs_dup_fs_context);
/**
* logfc - Log a message to a filesystem context
* @fc: The filesystem context to log to.
* @fmt: The format of the buffer.
*/
void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...)
{
va_list va;
struct va_format vaf = {.fmt = fmt, .va = &va};
va_start(va, fmt);
if (!log) {
switch (level) {
case 'w':
printk(KERN_WARNING "%s%s%pV\n", prefix ? prefix : "",
prefix ? ": " : "", &vaf);
break;
case 'e':
printk(KERN_ERR "%s%s%pV\n", prefix ? prefix : "",
prefix ? ": " : "", &vaf);
break;
default:
printk(KERN_NOTICE "%s%s%pV\n", prefix ? prefix : "",
prefix ? ": " : "", &vaf);
break;
}
} else {
unsigned int logsize = ARRAY_SIZE(log->buffer);
u8 index;
char *q = kasprintf(GFP_KERNEL, "%c %s%s%pV\n", level,
prefix ? prefix : "",
prefix ? ": " : "", &vaf);
index = log->head & (logsize - 1);
BUILD_BUG_ON(sizeof(log->head) != sizeof(u8) ||
sizeof(log->tail) != sizeof(u8));
if ((u8)(log->head - log->tail) == logsize) {
/* The buffer is full, discard the oldest message */
if (log->need_free & (1 << index)) kfree(log->buffer[index]); log->tail++;
}
log->buffer[index] = q ? q : "OOM: Can't store error string";
if (q)
log->need_free |= 1 << index;
else
log->need_free &= ~(1 << index);
log->head++;
}
va_end(va);
}
EXPORT_SYMBOL(logfc);
/*
* Free a logging structure.
*/
static void put_fc_log(struct fs_context *fc)
{
struct fc_log *log = fc->log.log;
int i;
if (log) {
if (refcount_dec_and_test(&log->usage)) {
fc->log.log = NULL;
for (i = 0; i <= 7; i++) if (log->need_free & (1 << i)) kfree(log->buffer[i]); kfree(log);
}
}
}
/**
* put_fs_context - Dispose of a superblock configuration context.
* @fc: The context to dispose of.
*/
void put_fs_context(struct fs_context *fc)
{
struct super_block *sb;
if (fc->root) { sb = fc->root->d_sb;
dput(fc->root);
fc->root = NULL;
deactivate_super(sb);
}
if (fc->need_free && fc->ops && fc->ops->free) fc->ops->free(fc); security_free_mnt_opts(&fc->security);
put_net(fc->net_ns);
put_user_ns(fc->user_ns);
put_cred(fc->cred);
put_fc_log(fc);
put_filesystem(fc->fs_type);
kfree(fc->source);
kfree(fc);
}
EXPORT_SYMBOL(put_fs_context);
/*
* Free the config for a filesystem that doesn't support fs_context.
*/
static void legacy_fs_context_free(struct fs_context *fc)
{
struct legacy_fs_context *ctx = fc->fs_private;
if (ctx) {
if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) kfree(ctx->legacy_data); kfree(ctx);
}
}
/*
* Duplicate a legacy config.
*/
static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
{
struct legacy_fs_context *ctx;
struct legacy_fs_context *src_ctx = src_fc->fs_private;
ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) {
ctx->legacy_data = kmemdup(src_ctx->legacy_data,
src_ctx->data_size, GFP_KERNEL);
if (!ctx->legacy_data) {
kfree(ctx);
return -ENOMEM;
}
}
fc->fs_private = ctx;
return 0;
}
/*
* Add a parameter to a legacy config. We build up a comma-separated list of
* options.
*/
static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct legacy_fs_context *ctx = fc->fs_private; unsigned int size = ctx->data_size;
size_t len = 0;
int ret;
ret = vfs_parse_fs_param_source(fc, param);
if (ret != -ENOPARAM)
return ret;
if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); switch (param->type) {
case fs_value_is_string:
len = 1 + param->size;
fallthrough;
case fs_value_is_flag:
len += strlen(param->key);
break;
default:
return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported",
param->key);
}
if (size + len + 2 > PAGE_SIZE)
return invalf(fc, "VFS: Legacy: Cumulative options too large"); if (strchr(param->key, ',') ||
(param->type == fs_value_is_string &&
memchr(param->string, ',', param->size))) return invalf(fc, "VFS: Legacy: Option '%s' contained comma",
param->key);
if (!ctx->legacy_data) {
ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!ctx->legacy_data)
return -ENOMEM;
}
ctx->legacy_data[size++] = ',';
len = strlen(param->key);
memcpy(ctx->legacy_data + size, param->key, len);
size += len;
if (param->type == fs_value_is_string) {
ctx->legacy_data[size++] = '=';
memcpy(ctx->legacy_data + size, param->string, param->size);
size += param->size;
}
ctx->legacy_data[size] = '\0';
ctx->data_size = size;
ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS;
return 0;
}
/*
* Add monolithic mount data.
*/
static int legacy_parse_monolithic(struct fs_context *fc, void *data)
{
struct legacy_fs_context *ctx = fc->fs_private;
if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) {
pr_warn("VFS: Can't mix monolithic and individual options\n");
return -EINVAL;
}
ctx->legacy_data = data;
ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS;
if (!ctx->legacy_data)
return 0; if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA)
return 0;
return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security);
}
/*
* Get a mountable root with the legacy mount command.
*/
static int legacy_get_tree(struct fs_context *fc)
{
struct legacy_fs_context *ctx = fc->fs_private;
struct super_block *sb;
struct dentry *root;
root = fc->fs_type->mount(fc->fs_type, fc->sb_flags,
fc->source, ctx->legacy_data);
if (IS_ERR(root))
return PTR_ERR(root);
sb = root->d_sb; BUG_ON(!sb); fc->root = root; return 0;
}
/*
* Handle remount.
*/
static int legacy_reconfigure(struct fs_context *fc)
{
struct legacy_fs_context *ctx = fc->fs_private; struct super_block *sb = fc->root->d_sb;
if (!sb->s_op->remount_fs)
return 0;
return sb->s_op->remount_fs(sb, &fc->sb_flags,
ctx ? ctx->legacy_data : NULL);
}
const struct fs_context_operations legacy_fs_context_ops = {
.free = legacy_fs_context_free,
.dup = legacy_fs_context_dup,
.parse_param = legacy_parse_param,
.parse_monolithic = legacy_parse_monolithic,
.get_tree = legacy_get_tree,
.reconfigure = legacy_reconfigure,
};
/*
* Initialise a legacy context for a filesystem that doesn't support
* fs_context.
*/
static int legacy_init_fs_context(struct fs_context *fc)
{
fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT);
if (!fc->fs_private)
return -ENOMEM;
fc->ops = &legacy_fs_context_ops; return 0;
}
int parse_monolithic_mount_data(struct fs_context *fc, void *data)
{
int (*monolithic_mount_data)(struct fs_context *, void *);
monolithic_mount_data = fc->ops->parse_monolithic;
if (!monolithic_mount_data)
monolithic_mount_data = generic_parse_monolithic;
return monolithic_mount_data(fc, data);
}
/*
* Clean up a context after performing an action on it and put it into a state
* from where it can be used to reconfigure a superblock.
*
* Note that here we do only the parts that can't fail; the rest is in
* finish_clean_context() below and in between those fs_context is marked
* FS_CONTEXT_AWAITING_RECONF. The reason for splitup is that after
* successful mount or remount we need to report success to userland.
* Trying to do full reinit (for the sake of possible subsequent remount)
* and failing to allocate memory would've put us into a nasty situation.
* So here we only discard the old state and reinitialization is left
* until we actually try to reconfigure.
*/
void vfs_clean_context(struct fs_context *fc)
{
if (fc->need_free && fc->ops && fc->ops->free)
fc->ops->free(fc);
fc->need_free = false;
fc->fs_private = NULL;
fc->s_fs_info = NULL;
fc->sb_flags = 0;
security_free_mnt_opts(&fc->security);
kfree(fc->source);
fc->source = NULL;
fc->purpose = FS_CONTEXT_FOR_RECONFIGURE;
fc->phase = FS_CONTEXT_AWAITING_RECONF;
}
int finish_clean_context(struct fs_context *fc)
{
int error;
if (fc->phase != FS_CONTEXT_AWAITING_RECONF)
return 0;
if (fc->fs_type->init_fs_context)
error = fc->fs_type->init_fs_context(fc);
else
error = legacy_init_fs_context(fc);
if (unlikely(error)) {
fc->phase = FS_CONTEXT_FAILED;
return error;
}
fc->need_free = true;
fc->phase = FS_CONTEXT_RECONF_PARAMS;
return 0;
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_CPUMASK_H
#define __LINUX_CPUMASK_H
/*
* Cpumasks provide a bitmap suitable for representing the
* set of CPU's in a system, one bit position per CPU number. In general,
* only nr_cpu_ids (<= NR_CPUS) bits are valid.
*/
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/bitmap.h>
#include <linux/atomic.h>
#include <linux/bug.h>
/* Don't assign or return these: may not be this big! */
typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
/**
* cpumask_bits - get the bits in a cpumask
* @maskp: the struct cpumask *
*
* You should only assume nr_cpu_ids bits of this mask are valid. This is
* a macro so it's const-correct.
*/
#define cpumask_bits(maskp) ((maskp)->bits)
/**
* cpumask_pr_args - printf args to output a cpumask
* @maskp: cpumask to be printed
*
* Can be used to provide arguments for '%*pb[l]' when printing a cpumask.
*/
#define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp)
#if NR_CPUS == 1
#define nr_cpu_ids 1U
#else
extern unsigned int nr_cpu_ids;
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
/* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also,
* not all bits may be allocated. */
#define nr_cpumask_bits nr_cpu_ids
#else
#define nr_cpumask_bits ((unsigned int)NR_CPUS)
#endif
/*
* The following particular system cpumasks and operations manage
* possible, present, active and online cpus.
*
* cpu_possible_mask- has bit 'cpu' set iff cpu is populatable
* cpu_present_mask - has bit 'cpu' set iff cpu is populated
* cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler
* cpu_active_mask - has bit 'cpu' set iff cpu available to migration
*
* If !CONFIG_HOTPLUG_CPU, present == possible, and active == online.
*
* The cpu_possible_mask is fixed at boot time, as the set of CPU id's
* that it is possible might ever be plugged in at anytime during the
* life of that system boot. The cpu_present_mask is dynamic(*),
* representing which CPUs are currently plugged in. And
* cpu_online_mask is the dynamic subset of cpu_present_mask,
* indicating those CPUs available for scheduling.
*
* If HOTPLUG is enabled, then cpu_possible_mask is forced to have
* all NR_CPUS bits set, otherwise it is just the set of CPUs that
* ACPI reports present at boot.
*
* If HOTPLUG is enabled, then cpu_present_mask varies dynamically,
* depending on what ACPI reports as currently plugged in, otherwise
* cpu_present_mask is just a copy of cpu_possible_mask.
*
* (*) Well, cpu_present_mask is dynamic in the hotplug case. If not
* hotplug, it's a copy of cpu_possible_mask, hence fixed at boot.
*
* Subtleties:
* 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
* assumption that their single CPU is online. The UP
* cpu_{online,possible,present}_masks are placebos. Changing them
* will have no useful affect on the following num_*_cpus()
* and cpu_*() macros in the UP case. This ugliness is a UP
* optimization - don't waste any instructions or memory references
* asking if you're online or how many CPUs there are if there is
* only one CPU.
*/
extern struct cpumask __cpu_possible_mask;
extern struct cpumask __cpu_online_mask;
extern struct cpumask __cpu_present_mask;
extern struct cpumask __cpu_active_mask;
extern struct cpumask __cpu_dying_mask;
#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
#define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask)
#define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask)
#define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask)
#define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask)
extern atomic_t __num_online_cpus;
extern cpumask_t cpus_booted_once_mask;
static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
{
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
WARN_ON_ONCE(cpu >= bits);
#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
}
/* verify cpu argument to cpumask_* operators */
static inline unsigned int cpumask_check(unsigned int cpu)
{
cpu_max_bits_warn(cpu, nr_cpumask_bits);
return cpu;
}
#if NR_CPUS == 1
/* Uniprocessor. Assume all masks are "1". */
static inline unsigned int cpumask_first(const struct cpumask *srcp)
{
return 0;
}
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
return 0;
}
/* Valid inputs for n are -1 and 0. */
static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
return n+1;
}
static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
{
return n+1;
}
static inline unsigned int cpumask_next_and(int n,
const struct cpumask *srcp,
const struct cpumask *andp)
{
return n+1;
}
static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask,
int start, bool wrap)
{
/* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */
return (wrap && n == 0);
}
/* cpu must be a valid cpu, ie 0, so there's no other choice. */
static inline unsigned int cpumask_any_but(const struct cpumask *mask,
unsigned int cpu)
{
return 1;
}
static inline unsigned int cpumask_local_spread(unsigned int i, int node)
{
return 0;
}
static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
const struct cpumask *src2p) {
return cpumask_next_and(-1, src1p, src2p);
}
static inline int cpumask_any_distribute(const struct cpumask *srcp)
{
return cpumask_first(srcp);
}
#define for_each_cpu(cpu, mask) \
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
#define for_each_cpu_not(cpu, mask) \
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
#define for_each_cpu_wrap(cpu, mask, start) \
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start))
#define for_each_cpu_and(cpu, mask1, mask2) \
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2)
#else
/**
* cpumask_first - get the first cpu in a cpumask
* @srcp: the cpumask pointer
*
* Returns >= nr_cpu_ids if no cpus set.
*/
static inline unsigned int cpumask_first(const struct cpumask *srcp)
{
return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
}
/**
* cpumask_last - get the last CPU in a cpumask
* @srcp: - the cpumask pointer
*
* Returns >= nr_cpumask_bits if no CPUs set.
*/
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
}
unsigned int __pure cpumask_next(int n, const struct cpumask *srcp);
/**
* cpumask_next_zero - get the next unset cpu in a cpumask
* @n: the cpu prior to the place to search (ie. return will be > @n)
* @srcp: the cpumask pointer
*
* Returns >= nr_cpu_ids if no further cpus unset.
*/
static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
{
/* -1 is a legal arg here. */
if (n != -1)
cpumask_check(n);
return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
}
int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
unsigned int cpumask_local_spread(unsigned int i, int node);
int cpumask_any_and_distribute(const struct cpumask *src1p,
const struct cpumask *src2p);
int cpumask_any_distribute(const struct cpumask *srcp);
/**
* for_each_cpu - iterate over every cpu in a mask
* @cpu: the (optionally unsigned) integer iterator
* @mask: the cpumask pointer
*
* After the loop, cpu is >= nr_cpu_ids.
*/
#define for_each_cpu(cpu, mask) \
for ((cpu) = -1; \
(cpu) = cpumask_next((cpu), (mask)), \
(cpu) < nr_cpu_ids;)
/**
* for_each_cpu_not - iterate over every cpu in a complemented mask
* @cpu: the (optionally unsigned) integer iterator
* @mask: the cpumask pointer
*
* After the loop, cpu is >= nr_cpu_ids.
*/
#define for_each_cpu_not(cpu, mask) \
for ((cpu) = -1; \
(cpu) = cpumask_next_zero((cpu), (mask)), \
(cpu) < nr_cpu_ids;)
extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
/**
* for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
* @cpu: the (optionally unsigned) integer iterator
* @mask: the cpumask pointer
* @start: the start location
*
* The implementation does not assume any bit in @mask is set (including @start).
*
* After the loop, cpu is >= nr_cpu_ids.
*/
#define for_each_cpu_wrap(cpu, mask, start) \
for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \
(cpu) < nr_cpumask_bits; \
(cpu) = cpumask_next_wrap((cpu), (mask), (start), true))
/**
* for_each_cpu_and - iterate over every cpu in both masks
* @cpu: the (optionally unsigned) integer iterator
* @mask1: the first cpumask pointer
* @mask2: the second cpumask pointer
*
* This saves a temporary CPU mask in many places. It is equivalent to:
* struct cpumask tmp;
* cpumask_and(&tmp, &mask1, &mask2);
* for_each_cpu(cpu, &tmp)
* ...
*
* After the loop, cpu is >= nr_cpu_ids.
*/
#define for_each_cpu_and(cpu, mask1, mask2) \
for ((cpu) = -1; \
(cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \
(cpu) < nr_cpu_ids;)
#endif /* SMP */
#define CPU_BITS_NONE \
{ \
[0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \
}
#define CPU_BITS_CPU0 \
{ \
[0] = 1UL \
}
/**
* cpumask_set_cpu - set a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
* @dstp: the cpumask pointer
*/
static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
set_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
__set_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
/**
* cpumask_clear_cpu - clear a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
* @dstp: the cpumask pointer
*/
static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
__clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
}
/**
* cpumask_test_cpu - test for a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
* @cpumask: the cpumask pointer
*
* Returns 1 if @cpu is set in @cpumask, else returns 0
*/
static inline int cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
{
return test_bit(cpumask_check(cpu), cpumask_bits((cpumask)));
}
/**
* cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
* @cpumask: the cpumask pointer
*
* Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0
*
* test_and_set_bit wrapper for cpumasks.
*/
static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
{
return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask));
}
/**
* cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask
* @cpu: cpu number (< nr_cpu_ids)
* @cpumask: the cpumask pointer
*
* Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0
*
* test_and_clear_bit wrapper for cpumasks.
*/
static inline int cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
{
return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask));
}
/**
* cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
* @dstp: the cpumask pointer
*/
static inline void cpumask_setall(struct cpumask *dstp)
{
bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
}
/**
* cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask
* @dstp: the cpumask pointer
*/
static inline void cpumask_clear(struct cpumask *dstp)
{
bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits);
}
/**
* cpumask_and - *dstp = *src1p & *src2p
* @dstp: the cpumask result
* @src1p: the first input
* @src2p: the second input
*
* If *@dstp is empty, returns 0, else returns 1
*/
static inline int cpumask_and(struct cpumask *dstp,
const struct cpumask *src1p,
const struct cpumask *src2p)
{
return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p),
cpumask_bits(src2p), nr_cpumask_bits);
}
/**
* cpumask_or - *dstp = *src1p | *src2p
* @dstp: the cpumask result
* @src1p: the first input
* @src2p: the second input
*/
static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
const struct cpumask *src2p)
{
bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p),
cpumask_bits(src2p), nr_cpumask_bits);
}
/**
* cpumask_xor - *dstp = *src1p ^ *src2p
* @dstp: the cpumask result
* @src1p: the first input
* @src2p: the second input
*/
static inline void cpumask_xor(struct cpumask *dstp,
const struct cpumask *src1p,
const struct cpumask *src2p)
{
bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p),
cpumask_bits(src2p), nr_cpumask_bits);
}
/**
* cpumask_andnot - *dstp = *src1p & ~*src2p
* @dstp: the cpumask result
* @src1p: the first input
* @src2p: the second input
*
* If *@dstp is empty, returns 0, else returns 1
*/
static inline int cpumask_andnot(struct cpumask *dstp,
const struct cpumask *src1p,
const struct cpumask *src2p)
{
return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p),
cpumask_bits(src2p), nr_cpumask_bits);
}
/**
* cpumask_complement - *dstp = ~*srcp
* @dstp: the cpumask result
* @srcp: the input to invert
*/
static inline void cpumask_complement(struct cpumask *dstp,
const struct cpumask *srcp)
{
bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp),
nr_cpumask_bits);
}
/**
* cpumask_equal - *src1p == *src2p
* @src1p: the first input
* @src2p: the second input
*/
static inline bool cpumask_equal(const struct cpumask *src1p,
const struct cpumask *src2p)
{
return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p),
nr_cpumask_bits);
}
/**
* cpumask_or_equal - *src1p | *src2p == *src3p
* @src1p: the first input
* @src2p: the second input
* @src3p: the third input
*/
static inline bool cpumask_or_equal(const struct cpumask *src1p,
const struct cpumask *src2p,
const struct cpumask *src3p)
{
return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p),
cpumask_bits(src3p), nr_cpumask_bits);
}
/**
* cpumask_intersects - (*src1p & *src2p) != 0
* @src1p: the first input
* @src2p: the second input
*/
static inline bool cpumask_intersects(const struct cpumask *src1p,
const struct cpumask *src2p)
{
return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p),
nr_cpumask_bits);
}
/**
* cpumask_subset - (*src1p & ~*src2p) == 0
* @src1p: the first input
* @src2p: the second input
*
* Returns 1 if *@src1p is a subset of *@src2p, else returns 0
*/
static inline int cpumask_subset(const struct cpumask *src1p,
const struct cpumask *src2p)
{
return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p),
nr_cpumask_bits);
}
/**
* cpumask_empty - *srcp == 0
* @srcp: the cpumask to that all cpus < nr_cpu_ids are clear.
*/
static inline bool cpumask_empty(const struct cpumask *srcp)
{
return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits);
}
/**
* cpumask_full - *srcp == 0xFFFFFFFF...
* @srcp: the cpumask to that all cpus < nr_cpu_ids are set.
*/
static inline bool cpumask_full(const struct cpumask *srcp)
{
return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits);
}
/**
* cpumask_weight - Count of bits in *srcp
* @srcp: the cpumask to count bits (< nr_cpu_ids) in.
*/
static inline unsigned int cpumask_weight(const struct cpumask *srcp)
{
return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits);
}
/**
* cpumask_shift_right - *dstp = *srcp >> n
* @dstp: the cpumask result
* @srcp: the input to shift
* @n: the number of bits to shift by
*/
static inline void cpumask_shift_right(struct cpumask *dstp,
const struct cpumask *srcp, int n)
{
bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n,
nr_cpumask_bits);
}
/**
* cpumask_shift_left - *dstp = *srcp << n
* @dstp: the cpumask result
* @srcp: the input to shift
* @n: the number of bits to shift by
*/
static inline void cpumask_shift_left(struct cpumask *dstp,
const struct cpumask *srcp, int n)
{
bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n,
nr_cpumask_bits);
}
/**
* cpumask_copy - *dstp = *srcp
* @dstp: the result
* @srcp: the input cpumask
*/
static inline void cpumask_copy(struct cpumask *dstp,
const struct cpumask *srcp)
{
bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits);
}
/**
* cpumask_any - pick a "random" cpu from *srcp
* @srcp: the input cpumask
*
* Returns >= nr_cpu_ids if no cpus set.
*/
#define cpumask_any(srcp) cpumask_first(srcp)
/**
* cpumask_first_and - return the first cpu from *srcp1 & *srcp2
* @src1p: the first input
* @src2p: the second input
*
* Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and().
*/
#define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p))
/**
* cpumask_any_and - pick a "random" cpu from *mask1 & *mask2
* @mask1: the first input cpumask
* @mask2: the second input cpumask
*
* Returns >= nr_cpu_ids if no cpus set.
*/
#define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2))
/**
* cpumask_of - the cpumask containing just a given cpu
* @cpu: the cpu (<= nr_cpu_ids)
*/
#define cpumask_of(cpu) (get_cpu_mask(cpu))
/**
* cpumask_parse_user - extract a cpumask from a user string
* @buf: the buffer to extract from
* @len: the length of the buffer
* @dstp: the cpumask to set.
*
* Returns -errno, or 0 for success.
*/
static inline int cpumask_parse_user(const char __user *buf, int len,
struct cpumask *dstp)
{
return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits);
}
/**
* cpumask_parselist_user - extract a cpumask from a user string
* @buf: the buffer to extract from
* @len: the length of the buffer
* @dstp: the cpumask to set.
*
* Returns -errno, or 0 for success.
*/
static inline int cpumask_parselist_user(const char __user *buf, int len,
struct cpumask *dstp)
{
return bitmap_parselist_user(buf, len, cpumask_bits(dstp),
nr_cpumask_bits);
}
/**
* cpumask_parse - extract a cpumask from a string
* @buf: the buffer to extract from
* @dstp: the cpumask to set.
*
* Returns -errno, or 0 for success.
*/
static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
{
return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits);
}
/**
* cpulist_parse - extract a cpumask from a user string of ranges
* @buf: the buffer to extract from
* @dstp: the cpumask to set.
*
* Returns -errno, or 0 for success.
*/
static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
{
return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits);
}
/**
* cpumask_size - size to allocate for a 'struct cpumask' in bytes
*/
static inline unsigned int cpumask_size(void)
{
return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long);
}
/*
* cpumask_var_t: struct cpumask for stack usage.
*
* Oh, the wicked games we play! In order to make kernel coding a
* little more difficult, we typedef cpumask_var_t to an array or a
* pointer: doing &mask on an array is a noop, so it still works.
*
* ie.
* cpumask_var_t tmpmask;
* if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
* return -ENOMEM;
*
* ... use 'tmpmask' like a normal struct cpumask * ...
*
* free_cpumask_var(tmpmask);
*
*
* However, one notable exception is there. alloc_cpumask_var() allocates
* only nr_cpumask_bits bits (in the other hand, real cpumask_t always has
* NR_CPUS bits). Therefore you don't have to dereference cpumask_var_t.
*
* cpumask_var_t tmpmask;
* if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
* return -ENOMEM;
*
* var = *tmpmask;
*
* This code makes NR_CPUS length memcopy and brings to a memory corruption.
* cpumask_copy() provide safe copy functionality.
*
* Note that there is another evil here: If you define a cpumask_var_t
* as a percpu variable then the way to obtain the address of the cpumask
* structure differently influences what this_cpu_* operation needs to be
* used. Please use this_cpu_cpumask_var_t in those cases. The direct use
* of this_cpu_ptr() or this_cpu_read() will lead to failures when the
* other type of cpumask_var_t implementation is configured.
*
* Please also note that __cpumask_var_read_mostly can be used to declare
* a cpumask_var_t variable itself (not its content) as read mostly.
*/
#ifdef CONFIG_CPUMASK_OFFSTACK
typedef struct cpumask *cpumask_var_t;
#define this_cpu_cpumask_var_ptr(x) this_cpu_read(x)
#define __cpumask_var_read_mostly __read_mostly
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
void free_cpumask_var(cpumask_var_t mask);
void free_bootmem_cpumask_var(cpumask_var_t mask);
static inline bool cpumask_available(cpumask_var_t mask)
{
return mask != NULL;
}
#else
typedef struct cpumask cpumask_var_t[1];
#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
#define __cpumask_var_read_mostly
static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
return true;
}
static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
int node)
{
return true;
}
static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
cpumask_clear(*mask);
return true;
}
static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
int node)
{
cpumask_clear(*mask);
return true;
}
static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
}
static inline void free_cpumask_var(cpumask_var_t mask)
{
}
static inline void free_bootmem_cpumask_var(cpumask_var_t mask)
{
}
static inline bool cpumask_available(cpumask_var_t mask)
{
return true;
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
/* It's common to want to use cpu_all_mask in struct member initializers,
* so it has to refer to an address rather than a pointer. */
extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
#define cpu_all_mask to_cpumask(cpu_all_bits)
/* First bits of cpu_bit_bitmap are in fact unset. */
#define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])
#define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
#define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask)
#define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask)
/* Wrappers for arch boot code to manipulate normally-constant masks */
void init_cpu_present(const struct cpumask *src);
void init_cpu_possible(const struct cpumask *src);
void init_cpu_online(const struct cpumask *src);
static inline void reset_cpu_possible_mask(void)
{
bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS);
}
static inline void
set_cpu_possible(unsigned int cpu, bool possible)
{
if (possible)
cpumask_set_cpu(cpu, &__cpu_possible_mask);
else
cpumask_clear_cpu(cpu, &__cpu_possible_mask);
}
static inline void
set_cpu_present(unsigned int cpu, bool present)
{
if (present)
cpumask_set_cpu(cpu, &__cpu_present_mask);
else
cpumask_clear_cpu(cpu, &__cpu_present_mask);
}
void set_cpu_online(unsigned int cpu, bool online);
static inline void
set_cpu_active(unsigned int cpu, bool active)
{
if (active)
cpumask_set_cpu(cpu, &__cpu_active_mask);
else
cpumask_clear_cpu(cpu, &__cpu_active_mask);
}
static inline void
set_cpu_dying(unsigned int cpu, bool dying)
{
if (dying)
cpumask_set_cpu(cpu, &__cpu_dying_mask);
else
cpumask_clear_cpu(cpu, &__cpu_dying_mask);
}
/**
* to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
* @bitmap: the bitmap
*
* There are a few places where cpumask_var_t isn't appropriate and
* static cpumasks must be used (eg. very early boot), yet we don't
* expose the definition of 'struct cpumask'.
*
* This does the conversion, and can be used as a constant initializer.
*/
#define to_cpumask(bitmap) \
((struct cpumask *)(1 ? (bitmap) \
: (void *)sizeof(__check_is_bitmap(bitmap))))
static inline int __check_is_bitmap(const unsigned long *bitmap)
{
return 1;
}
/*
* Special-case data structure for "single bit set only" constant CPU masks.
*
* We pre-generate all the 64 (or 32) possible bit positions, with enough
* padding to the left and the right, and return the constant pointer
* appropriately offset.
*/
extern const unsigned long
cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];
static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
{
const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
p -= cpu / BITS_PER_LONG;
return to_cpumask(p);
}
#if NR_CPUS > 1
/**
* num_online_cpus() - Read the number of online CPUs
*
* Despite the fact that __num_online_cpus is of type atomic_t, this
* interface gives only a momentary snapshot and is not protected against
* concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
* region.
*/
static inline unsigned int num_online_cpus(void)
{
return atomic_read(&__num_online_cpus);
}
#define num_possible_cpus() cpumask_weight(cpu_possible_mask)
#define num_present_cpus() cpumask_weight(cpu_present_mask)
#define num_active_cpus() cpumask_weight(cpu_active_mask)
static inline bool cpu_online(unsigned int cpu)
{
return cpumask_test_cpu(cpu, cpu_online_mask);
}
static inline bool cpu_possible(unsigned int cpu)
{
return cpumask_test_cpu(cpu, cpu_possible_mask);
}
static inline bool cpu_present(unsigned int cpu)
{
return cpumask_test_cpu(cpu, cpu_present_mask);
}
static inline bool cpu_active(unsigned int cpu)
{
return cpumask_test_cpu(cpu, cpu_active_mask);
}
static inline bool cpu_dying(unsigned int cpu)
{
return cpumask_test_cpu(cpu, cpu_dying_mask);
}
#else
#define num_online_cpus() 1U
#define num_possible_cpus() 1U
#define num_present_cpus() 1U
#define num_active_cpus() 1U
static inline bool cpu_online(unsigned int cpu)
{
return cpu == 0;
}
static inline bool cpu_possible(unsigned int cpu)
{
return cpu == 0;
}
static inline bool cpu_present(unsigned int cpu)
{
return cpu == 0;
}
static inline bool cpu_active(unsigned int cpu)
{
return cpu == 0;
}
static inline bool cpu_dying(unsigned int cpu)
{
return false;
}
#endif /* NR_CPUS > 1 */
#define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
#if NR_CPUS <= BITS_PER_LONG
#define CPU_BITS_ALL \
{ \
[BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \
}
#else /* NR_CPUS > BITS_PER_LONG */
#define CPU_BITS_ALL \
{ \
[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \
[BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \
}
#endif /* NR_CPUS > BITS_PER_LONG */
/**
* cpumap_print_to_pagebuf - copies the cpumask into the buffer either
* as comma-separated list of cpus or hex values of cpumask
* @list: indicates whether the cpumap must be list
* @mask: the cpumask to copy
* @buf: the buffer to copy into
*
* Returns the length of the (null-terminated) @buf string, zero if
* nothing is copied.
*/
static inline ssize_t
cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
{
return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask),
nr_cpu_ids);
}
/**
* cpumap_print_bitmask_to_buf - copies the cpumask into the buffer as
* hex values of cpumask
*
* @buf: the buffer to copy into
* @mask: the cpumask to copy
* @off: in the string from which we are copying, we copy to @buf
* @count: the maximum number of bytes to print
*
* The function prints the cpumask into the buffer as hex values of
* cpumask; Typically used by bin_attribute to export cpumask bitmask
* ABI.
*
* Returns the length of how many bytes have been copied, excluding
* terminating '\0'.
*/
static inline ssize_t
cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
loff_t off, size_t count)
{
return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask),
nr_cpu_ids, off, count) - 1;
}
/**
* cpumap_print_list_to_buf - copies the cpumask into the buffer as
* comma-separated list of cpus
*
* Everything is same with the above cpumap_print_bitmask_to_buf()
* except the print format.
*/
static inline ssize_t
cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
loff_t off, size_t count)
{
return bitmap_print_list_to_buf(buf, cpumask_bits(mask),
nr_cpu_ids, off, count) - 1;
}
#if NR_CPUS <= BITS_PER_LONG
#define CPU_MASK_ALL \
(cpumask_t) { { \
[BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \
} }
#else
#define CPU_MASK_ALL \
(cpumask_t) { { \
[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \
[BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \
} }
#endif /* NR_CPUS > BITS_PER_LONG */
#define CPU_MASK_NONE \
(cpumask_t) { { \
[0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \
} }
#define CPU_MASK_CPU0 \
(cpumask_t) { { \
[0] = 1UL \
} }
#endif /* __LINUX_CPUMASK_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/exit.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/mmu_context.h>
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
detach_pid(p, PIDTYPE_PID);
if (group_dead) {
detach_pid(p, PIDTYPE_TGID);
detach_pid(p, PIDTYPE_PGID);
detach_pid(p, PIDTYPE_SID);
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
}
list_del_rcu(&p->thread_group);
list_del_rcu(&p->thread_node);
}
/*
* This function expects the tasklist_lock write-locked.
*/
static void __exit_signal(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
struct tty_struct *tty;
u64 utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
lockdep_tasklist_lock_is_held());
spin_lock(&sighand->siglock);
#ifdef CONFIG_POSIX_TIMERS
posix_cpu_timers_exit(tsk);
if (group_dead)
posix_cpu_timers_exit_group(tsk);
#endif
if (group_dead) {
tty = sig->tty;
sig->tty = NULL;
} else {
/*
* If there is any task waiting for the group exit
* then notify it:
*/
if (sig->notify_count > 0 && !--sig->notify_count)
wake_up_process(sig->group_exit_task);
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
}
add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
sizeof(unsigned long long));
/*
* Accumulate here the counters for all threads as they die. We could
* skip the group leader because it is the last user of signal_struct,
* but we want to avoid the race with thread_group_cputime() which can
* see the empty ->thread_head list.
*/
task_cputime(tsk, &utime, &stime);
write_seqlock(&sig->stats_lock);
sig->utime += utime;
sig->stime += stime;
sig->gtime += task_gtime(tsk);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
sig->nivcsw += tsk->nivcsw;
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
__unhash_process(tsk, group_dead);
write_sequnlock(&sig->stats_lock);
/*
* Do this under ->siglock, we can race with another thread
* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
*/
flush_sigqueue(&tsk->pending);
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
if (group_dead) {
flush_sigqueue(&sig->shared_pending);
tty_kref_put(tty);
}
}
static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
}
void put_task_struct_rcu_user(struct task_struct *task)
{
if (refcount_dec_and_test(&task->rcu_users))
call_rcu(&task->rcu, delayed_put_task_struct);
}
void release_task(struct task_struct *p)
{
struct task_struct *leader;
struct pid *thread_pid;
int zap_leader;
repeat:
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
rcu_read_unlock();
cgroup_release(p);
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
thread_pid = get_pid(p->thread_pid);
__exit_signal(p);
/*
* If we are the last non-leader member of the thread
* group, and the leader is zombie, then notify the
* group leader's parent process. (if it wants notification.)
*/
zap_leader = 0;
leader = p->group_leader;
if (leader != p && thread_group_empty(leader)
&& leader->exit_state == EXIT_ZOMBIE) {
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
* then we are the one who should release the leader.
*/
zap_leader = do_notify_parent(leader, leader->exit_signal);
if (zap_leader)
leader->exit_state = EXIT_DEAD;
}
write_unlock_irq(&tasklist_lock);
seccomp_filter_release(p);
proc_flush_pid(thread_pid);
put_pid(thread_pid);
release_thread(p);
put_task_struct_rcu_user(p);
p = leader;
if (unlikely(zap_leader))
goto repeat;
}
int rcuwait_wake_up(struct rcuwait *w)
{
int ret = 0;
struct task_struct *task;
rcu_read_lock();
/*
* Order condition vs @task, such that everything prior to the load
* of @task is visible. This is the condition as to why the user called
* rcuwait_wake() in the first place. Pairs with set_current_state()
* barrier (A) in rcuwait_wait_event().
*
* WAIT WAKE
* [S] tsk = current [S] cond = true
* MB (A) MB (B)
* [L] cond [L] tsk
*/
smp_mb(); /* (B) */
task = rcu_dereference(w->task);
if (task)
ret = wake_up_process(task);
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);
/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
* by terminal-generated stop signals. Newly orphaned process groups are
* to receive a SIGHUP and a SIGCONT.
*
* "I ask you, have you ever known what it is to be an orphan?"
*/
static int will_become_orphaned_pgrp(struct pid *pgrp,
struct task_struct *ignored_task)
{
struct task_struct *p;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
if ((p == ignored_task) ||
(p->exit_state && thread_group_empty(p)) ||
is_global_init(p->real_parent))
continue;
if (task_pgrp(p->real_parent) != pgrp &&
task_session(p->real_parent) == task_session(p))
return 0;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
return 1;
}
int is_current_pgrp_orphaned(void)
{
int retval;
read_lock(&tasklist_lock);
retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
read_unlock(&tasklist_lock);
return retval;
}
static bool has_stopped_jobs(struct pid *pgrp)
{
struct task_struct *p;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
return true;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
return false;
}
/*
* Check to see if any process groups have become orphaned as
* a result of our exiting, and if they have any stopped jobs,
* send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*/
static void
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
{
struct pid *pgrp = task_pgrp(tsk);
struct task_struct *ignored_task = tsk;
if (!parent)
/* exit: our father is in a different pgrp than
* we are and we were the only connection outside.
*/
parent = tsk->real_parent;
else
/* reparent: our child is in a different pgrp than
* we are, and it was the only connection outside.
*/
ignored_task = NULL;
if (task_pgrp(parent) != pgrp &&
task_session(parent) == task_session(tsk) &&
will_become_orphaned_pgrp(pgrp, ignored_task) &&
has_stopped_jobs(pgrp)) {
__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
}
}
#ifdef CONFIG_MEMCG
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
*/
void mm_update_next_owner(struct mm_struct *mm)
{
struct task_struct *c, *g, *p = current;
retry:
/*
* If the exiting or execing task is not the owner, it's
* someone else's problem.
*/
if (mm->owner != p)
return;
/*
* The current owner is exiting/execing and there are no other
* candidates. Do not leave the mm pointing to a possibly
* freed task structure.
*/
if (atomic_read(&mm->mm_users) <= 1) {
WRITE_ONCE(mm->owner, NULL);
return;
}
read_lock(&tasklist_lock);
/*
* Search in the children
*/
list_for_each_entry(c, &p->children, sibling) {
if (c->mm == mm)
goto assign_new_owner;
}
/*
* Search in the siblings
*/
list_for_each_entry(c, &p->real_parent->children, sibling) {
if (c->mm == mm)
goto assign_new_owner;
}
/*
* Search through everything else, we should not get here often.
*/
for_each_process(g) {
if (g->flags & PF_KTHREAD)
continue;
for_each_thread(g, c) {
if (c->mm == mm)
goto assign_new_owner;
if (c->mm)
break;
}
}
read_unlock(&tasklist_lock);
/*
* We found no owner yet mm_users > 1: this implies that we are
* most likely racing with swapoff (try_to_unuse()) or /proc or
* ptrace or page migration (get_task_mm()). Mark owner as NULL.
*/
WRITE_ONCE(mm->owner, NULL);
return;
assign_new_owner:
BUG_ON(c == p);
get_task_struct(c);
/*
* The task_lock protects c->mm from changing.
* We always want mm->owner->mm == mm
*/
task_lock(c);
/*
* Delay read_unlock() till we have the task_lock()
* to ensure that c does not slip away underneath us
*/
read_unlock(&tasklist_lock);
if (c->mm != mm) {
task_unlock(c);
put_task_struct(c);
goto retry;
}
WRITE_ONCE(mm->owner, c);
task_unlock(c);
put_task_struct(c);
}
#endif /* CONFIG_MEMCG */
/*
* Turn us into a lazy TLB process if we
* aren't already..
*/
static void exit_mm(void)
{
struct mm_struct *mm = current->mm;
struct core_state *core_state;
exit_mm_release(current, mm);
if (!mm)
return;
sync_mm_rss(mm);
/*
* Serialize with any possible pending coredump.
* We must hold mmap_lock around checking core_state
* and clearing tsk->mm. The core-inducing thread
* will increment ->nr_threads for each thread in the
* group with ->mm != NULL.
*/
mmap_read_lock(mm);
core_state = mm->core_state;
if (core_state) {
struct core_thread self;
mmap_read_unlock(mm);
self.task = current;
if (self.task->flags & PF_SIGNALED)
self.next = xchg(&core_state->dumper.next, &self);
else
self.task = NULL;
/*
* Implies mb(), the result of xchg() must be visible
* to core_state->dumper.
*/
if (atomic_dec_and_test(&core_state->nr_threads))
complete(&core_state->startup);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
freezable_schedule();
}
__set_current_state(TASK_RUNNING);
mmap_read_lock(mm);
}
mmgrab(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
/*
* When a thread stops operating on an address space, the loop
* in membarrier_private_expedited() may not observe that
* tsk->mm, and the loop in membarrier_global_expedited() may
* not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
* rq->membarrier_state, so those would not issue an IPI.
* Membarrier requires a memory barrier after accessing
* user-space memory, before clearing tsk->mm or the
* rq->membarrier_state.
*/
smp_mb__after_spinlock();
local_irq_disable();
current->mm = NULL;
membarrier_update_current_mm(NULL);
enter_lazy_tlb(mm, current);
local_irq_enable();
task_unlock(current);
mmap_read_unlock(mm);
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
exit_oom_victim();
}
static struct task_struct *find_alive_thread(struct task_struct *p)
{
struct task_struct *t;
for_each_thread(p, t) {
if (!(t->flags & PF_EXITING))
return t;
}
return NULL;
}
static struct task_struct *find_child_reaper(struct task_struct *father,
struct list_head *dead)
__releases(&tasklist_lock)
__acquires(&tasklist_lock)
{
struct pid_namespace *pid_ns = task_active_pid_ns(father);
struct task_struct *reaper = pid_ns->child_reaper;
struct task_struct *p, *n;
if (likely(reaper != father))
return reaper;
reaper = find_alive_thread(father);
if (reaper) {
pid_ns->child_reaper = reaper;
return reaper;
}
write_unlock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
release_task(p);
}
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
return father;
}
/*
* When we die, we re-parent all our children, and try to:
* 1. give them to another thread in our thread group, if such a member exists
* 2. give it to the first ancestor process which prctl'd itself as a
* child_subreaper for its children (like a service manager)
* 3. give it to the init process (PID 1) in our pid namespace
*/
static struct task_struct *find_new_reaper(struct task_struct *father,
struct task_struct *child_reaper)
{
struct task_struct *thread, *reaper;
thread = find_alive_thread(father);
if (thread)
return thread;
if (father->signal->has_child_subreaper) {
unsigned int ns_level = task_pid(father)->level;
/*
* Find the first ->is_child_subreaper ancestor in our pid_ns.
* We can't check reaper != child_reaper to ensure we do not
* cross the namespaces, the exiting parent could be injected
* by setns() + fork().
* We check pid->level, this is slightly more efficient than
* task_active_pid_ns(reaper) != task_active_pid_ns(father).
*/
for (reaper = father->real_parent;
task_pid(reaper)->level == ns_level;
reaper = reaper->real_parent) {
if (reaper == &init_task)
break;
if (!reaper->signal->is_child_subreaper)
continue;
thread = find_alive_thread(reaper);
if (thread)
return thread;
}
}
return child_reaper;
}
/*
* Any that need to be release_task'd are put on the @dead list.
*/
static void reparent_leader(struct task_struct *father, struct task_struct *p,
struct list_head *dead)
{
if (unlikely(p->exit_state == EXIT_DEAD))
return;
/* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
if (!p->ptrace &&
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
if (do_notify_parent(p, p->exit_signal)) {
p->exit_state = EXIT_DEAD;
list_add(&p->ptrace_entry, dead);
}
}
kill_orphaned_pgrp(p, father);
}
/*
* This does two things:
*
* A. Make init inherit all the child processes
* B. Check to see if any process groups have become orphaned
* as a result of our exiting, and if they have any stopped
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*/
static void forget_original_parent(struct task_struct *father,
struct list_head *dead)
{
struct task_struct *p, *t, *reaper;
if (unlikely(!list_empty(&father->ptraced)))
exit_ptrace(father, dead);
/* Can drop and reacquire tasklist_lock */
reaper = find_child_reaper(father, dead);
if (list_empty(&father->children))
return;
reaper = find_new_reaper(father, reaper);
list_for_each_entry(p, &father->children, sibling) {
for_each_thread(p, t) {
RCU_INIT_POINTER(t->real_parent, reaper);
BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
if (likely(!t->ptrace))
t->parent = t->real_parent;
if (t->pdeath_signal)
group_send_sig_info(t->pdeath_signal,
SEND_SIG_NOINFO, t,
PIDTYPE_TGID);
}
/*
* If this is a threaded reparent there is no need to
* notify anyone anything has happened.
*/
if (!same_thread_group(reaper, father))
reparent_leader(father, p, dead);
}
list_splice_tail_init(&father->children, &reaper->children);
}
/*
* Send signals to all our closest relatives so that they know
* to properly mourn us..
*/
static void exit_notify(struct task_struct *tsk, int group_dead)
{
bool autoreap;
struct task_struct *p, *n;
LIST_HEAD(dead);
write_lock_irq(&tasklist_lock);
forget_original_parent(tsk, &dead);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
tsk->exit_state = EXIT_ZOMBIE;
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
!ptrace_reparented(tsk) ?
tsk->exit_signal : SIGCHLD;
autoreap = do_notify_parent(tsk, sig);
} else if (thread_group_leader(tsk)) {
autoreap = thread_group_empty(tsk) &&
do_notify_parent(tsk, tsk->exit_signal);
} else {
autoreap = true;
}
if (autoreap) {
tsk->exit_state = EXIT_DEAD;
list_add(&tsk->ptrace_entry, &dead);
}
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
write_unlock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
release_task(p);
}
}
#ifdef CONFIG_DEBUG_STACK_USAGE
static void check_stack_usage(void)
{
static DEFINE_SPINLOCK(low_water_lock);
static int lowest_to_date = THREAD_SIZE;
unsigned long free;
free = stack_not_used(current);
if (free >= lowest_to_date)
return;
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
}
#else
static inline void check_stack_usage(void) {}
#endif
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
/*
* We can get here from a kernel oops, sometimes with preemption off.
* Start by checking for critical errors.
* Then fix up important state like USER_DS and preemption.
* Then do everything else.
*/
WARN_ON(blk_needs_flush_plug(tsk));
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
/*
* If do_exit is called because this processes oopsed, it's possible
* that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
* continuing. Amongst other possible reasons, this is to prevent
* mm_release()->clear_child_tid() from writing to a user-controlled
* kernel address.
*/
force_uaccess_begin();
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
preempt_count());
preempt_count_set(PREEMPT_ENABLED);
}
profile_task_exit(tsk);
kcov_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
/*
* We're taking recursive faults here in do_exit. Safest is to just
* leave this task alone and wait for reboot.
*/
if (unlikely(tsk->flags & PF_EXITING)) {
pr_alert("Fixing recursive fault but reboot is needed!\n");
futex_exit_recursive(tsk);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
}
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk->mm);
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
/*
* If the last thread of global init has exited, panic
* immediately to get a useable coredump.
*/
if (unlikely(is_global_init(tsk)))
panic("Attempted to kill init! exitcode=0x%08x\n",
tsk->signal->group_exit_code ?: (int)code);
#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
#endif
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
}
acct_collect(code, group_dead);
if (group_dead)
tty_audit_exit();
audit_free(tsk);
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
exit_mm();
if (group_dead)
acct_process();
trace_sched_process_exit(tsk);
exit_sem(tsk);
exit_shm(tsk);
exit_files(tsk);
exit_fs(tsk);
if (group_dead)
disassociate_ctty(1);
exit_task_namespaces(tsk);
exit_task_work(tsk);
exit_thread(tsk);
/*
* Flush inherited counters to the parent - before the parent
* gets woken up by child-exit notifications.
*
* because of cgroup mode, must be called before cgroup_exit()
*/
perf_event_exit_task(tsk);
sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
flush_ptrace_hw_breakpoint(tsk);
exit_tasks_rcu_start();
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
if (unlikely(current->pi_state_cache))
kfree(current->pi_state_cache);
#endif
/*
* Make sure we are holding no locks:
*/
debug_check_no_locks_held();
if (tsk->io_context)
exit_io_context(tsk);
if (tsk->splice_pipe)
free_pipe_info(tsk->splice_pipe);
if (tsk->task_frag.page)
put_page(tsk->task_frag.page);
validate_creds_for_do_exit(tsk);
check_stack_usage();
preempt_disable();
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
exit_tasks_rcu_finish();
lockdep_free_task(tsk);
do_task_dead();
}
EXPORT_SYMBOL_GPL(do_exit);
void complete_and_exit(struct completion *comp, long code)
{
if (comp)
complete(comp);
do_exit(code);
}
EXPORT_SYMBOL(complete_and_exit);
SYSCALL_DEFINE1(exit, int, error_code)
{
do_exit((error_code&0xff)<<8);
}
/*
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
void
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
BUG_ON(exit_code & 0x80); /* core dumps don't get here */
if (signal_group_exit(sig))
exit_code = sig->group_exit_code;
else if (!thread_group_empty(current)) {
struct sighand_struct *const sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
if (signal_group_exit(sig))
/* Another thread got here before we took the lock. */
exit_code = sig->group_exit_code;
else {
sig->group_exit_code = exit_code;
sig->flags = SIGNAL_GROUP_EXIT;
zap_other_threads(current);
}
spin_unlock_irq(&sighand->siglock);
}
do_exit(exit_code);
/* NOTREACHED */
}
/*
* this kills every thread in the thread group. Note that any externally
* wait4()-ing process will get the correct exit code - even if this
* thread is not the thread group leader.
*/
SYSCALL_DEFINE1(exit_group, int, error_code)
{
do_group_exit((error_code & 0xff) << 8);
/* NOTREACHED */
return 0;
}
struct waitid_info {
pid_t pid;
uid_t uid;
int status;
int cause;
};
struct wait_opts {
enum pid_type wo_type;
int wo_flags;
struct pid *wo_pid;
struct waitid_info *wo_info;
int wo_stat;
struct rusage *wo_rusage;
wait_queue_entry_t child_wait;
int notask_error;
};
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
return wo->wo_type == PIDTYPE_MAX ||
task_pid_type(p, wo->wo_type) == wo->wo_pid;
}
static int
eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
if (!eligible_pid(wo, p))
return 0;
/*
* Wait for all children (clone and not) if __WALL is set or
* if it is traced by us.
*/
if (ptrace || (wo->wo_flags & __WALL))
return 1;
/*
* Otherwise, wait for clone children *only* if __WCLONE is set;
* otherwise, wait for non-clone children *only*.
*
* Note: a "clone" child here is one that reports to its parent
* using a signal other than SIGCHLD, or a non-leader thread which
* we can only see if it is traced by us.
*/
if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
return 0;
return 1;
}
/*
* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
int state, status;
pid_t pid = task_pid_vnr(p);
uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
struct waitid_info *infop;
if (!likely(wo->wo_flags & WEXITED))
return 0;
if (unlikely(wo->wo_flags & WNOWAIT)) {
status = p->exit_code;
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
if (wo->wo_rusage)
getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
put_task_struct(p);
goto out_info;
}
/*
* Move the task's state to DEAD/TRACE, only one thread can do this.
*/
state = (ptrace_reparented(p) && thread_group_leader(p)) ?
EXIT_TRACE : EXIT_DEAD;
if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
return 0;
/*
* We own this thread, nobody else can reap it.
*/
read_unlock(&tasklist_lock);
sched_annotate_sleep();
/*
* Check thread_group_leader() to exclude the traced sub-threads.
*/
if (state == EXIT_DEAD && thread_group_leader(p)) {
struct signal_struct *sig = p->signal;
struct signal_struct *psig = current->signal;
unsigned long maxrss;
u64 tgutime, tgstime;
/*
* The resource counters for the group leader are in its
* own task_struct. Those for dead threads in the group
* are in its signal_struct, as are those for the child
* processes it has previously reaped. All these
* accumulate in the parent's signal_struct c* fields.
*
* We don't bother to take a lock here to protect these
* p->signal fields because the whole thread group is dead
* and nobody can change them.
*
* psig->stats_lock also protects us from our sub-theads
* which can reap other children at the same time. Until
* we change k_getrusage()-like users to rely on this lock
* we have to take ->siglock as well.
*
* We use thread_group_cputime_adjusted() to get times for
* the thread group, which consolidates times for all threads
* in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
spin_lock_irq(¤t->sighand->siglock);
write_seqlock(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
p->maj_flt + sig->maj_flt + sig->cmaj_flt;
psig->cnvcsw +=
p->nvcsw + sig->nvcsw + sig->cnvcsw;
psig->cnivcsw +=
p->nivcsw + sig->nivcsw + sig->cnivcsw;
psig->cinblock +=
task_io_get_inblock(p) +
sig->inblock + sig->cinblock;
psig->coublock +=
task_io_get_oublock(p) +
sig->oublock + sig->coublock;
maxrss = max(sig->maxrss, sig->cmaxrss);
if (psig->cmaxrss < maxrss)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
write_sequnlock(&psig->stats_lock);
spin_unlock_irq(¤t->sighand->siglock);
}
if (wo->wo_rusage)
getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
wo->wo_stat = status;
if (state == EXIT_TRACE) {
write_lock_irq(&tasklist_lock);
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
/* If parent wants a zombie, don't release it now */
state = EXIT_ZOMBIE;
if (do_notify_parent(p, p->exit_signal))
state = EXIT_DEAD;
p->exit_state = state;
write_unlock_irq(&tasklist_lock);
}
if (state == EXIT_DEAD)
release_task(p);
out_info:
infop = wo->wo_info;
if (infop) {
if ((status & 0x7f) == 0) {
infop->cause = CLD_EXITED;
infop->status = status >> 8;
} else {
infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
infop->status = status & 0x7f;
}
infop->pid = pid;
infop->uid = uid;
}
return pid;
}
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
return &p->signal->group_exit_code;
}
return NULL;
}
/**
* wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
* @wo: wait options
* @ptrace: is the wait for ptrace
* @p: task to wait for
*
* Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
*
* CONTEXT:
* read_lock(&tasklist_lock), which is released if return value is
* non-zero. Also, grabs and releases @p->sighand->siglock.
*
* RETURNS:
* 0 if wait condition didn't exist and search for other wait conditions
* should continue. Non-zero return, -errno on failure and @p's pid on
* success, implies that tasklist_lock is released and wait condition
* search should terminate.
*/
static int wait_task_stopped(struct wait_opts *wo,
int ptrace, struct task_struct *p)
{
struct waitid_info *infop;
int exit_code, *p_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;
/*
* Traditionally we see ptrace'd stopped tasks regardless of options.
*/
if (!ptrace && !(wo->wo_flags & WUNTRACED))
return 0;
if (!task_stopped_code(p, ptrace))
return 0;
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);
p_code = task_stopped_code(p, ptrace);
if (unlikely(!p_code))
goto unlock_sig;
exit_code = *p_code;
if (!exit_code)
goto unlock_sig;
if (!unlikely(wo->wo_flags & WNOWAIT))
*p_code = 0;
uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
spin_unlock_irq(&p->sighand->siglock);
if (!exit_code)
return 0;
/*
* Now we are pretty sure this task is interesting.
* Make sure it doesn't get reaped out from under us while we
* give up the lock and then examine it below. We don't want to
* keep holding onto the tasklist_lock while we call getrusage and
* possibly take page faults for user memory.
*/
get_task_struct(p);
pid = task_pid_vnr(p);
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
sched_annotate_sleep();
if (wo->wo_rusage)
getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
put_task_struct(p);
if (likely(!(wo->wo_flags & WNOWAIT)))
wo->wo_stat = (exit_code << 8) | 0x7f;
infop = wo->wo_info;
if (infop) {
infop->cause = why;
infop->status = exit_code;
infop->pid = pid;
infop->uid = uid;
}
return pid;
}
/*
* Handle do_wait work for one task in a live, non-stopped state.
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
struct waitid_info *infop;
pid_t pid;
uid_t uid;
if (!unlikely(wo->wo_flags & WCONTINUED))
return 0;
if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
return 0;
spin_lock_irq(&p->sighand->siglock);
/* Re-check with the lock held. */
if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
spin_unlock_irq(&p->sighand->siglock);
return 0;
}
if (!unlikely(wo->wo_flags & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
uid = from_kuid_munged(current_user_ns(), task_uid(p));
spin_unlock_irq(&p->sighand->siglock);
pid = task_pid_vnr(p);
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
if (wo->wo_rusage)
getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
put_task_struct(p);
infop = wo->wo_info;
if (!infop) {
wo->wo_stat = 0xffff;
} else {
infop->cause = CLD_CONTINUED;
infop->pid = pid;
infop->uid = uid;
infop->status = SIGCONT;
}
return pid;
}
/*
* Consider @p for a wait by @parent.
*
* -ECHILD should be in ->notask_error before the first call.
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue;
* then ->notask_error is 0 if @p is an eligible child,
* or still -ECHILD.
*/
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
/*
* We can race with wait_task_zombie() from another thread.
* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
* can't confuse the checks below.
*/
int exit_state = READ_ONCE(p->exit_state);
int ret;
if (unlikely(exit_state == EXIT_DEAD))
return 0;
ret = eligible_child(wo, ptrace, p);
if (!ret)
return ret;
if (unlikely(exit_state == EXIT_TRACE)) {
/*
* ptrace == 0 means we are the natural parent. In this case
* we should clear notask_error, debugger will notify us.
*/
if (likely(!ptrace))
wo->notask_error = 0;
return 0;
}
if (likely(!ptrace) && unlikely(p->ptrace)) {
/*
* If it is traced by its real parent's group, just pretend
* the caller is ptrace_do_wait() and reap this child if it
* is zombie.
*
* This also hides group stop state from real parent; otherwise
* a single stop can be reported twice as group and ptrace stop.
* If a ptracer wants to distinguish these two events for its
* own children it should create a separate process which takes
* the role of real parent.
*/
if (!ptrace_reparented(p))
ptrace = 1;
}
/* slay zombie? */
if (exit_state == EXIT_ZOMBIE) {
/* we don't reap group leaders with subthreads */
if (!delay_group_leader(p)) {
/*
* A zombie ptracee is only visible to its ptracer.
* Notification and reaping will be cascaded to the
* real parent when the ptracer detaches.
*/
if (unlikely(ptrace) || likely(!p->ptrace))
return wait_task_zombie(wo, p);
}
/*
* Allow access to stopped/continued state via zombie by
* falling through. Clearing of notask_error is complex.
*
* When !@ptrace:
*
* If WEXITED is set, notask_error should naturally be
* cleared. If not, subset of WSTOPPED|WCONTINUED is set,
* so, if there are live subthreads, there are events to
* wait for. If all subthreads are dead, it's still safe
* to clear - this function will be called again in finite
* amount time once all the subthreads are released and
* will then return without clearing.
*
* When @ptrace:
*
* Stopped state is per-task and thus can't change once the
* target task dies. Only continued and exited can happen.
* Clear notask_error if WCONTINUED | WEXITED.
*/
if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
wo->notask_error = 0;
} else {
/*
* @p is alive and it's gonna stop, continue or exit, so
* there always is something to wait for.
*/
wo->notask_error = 0;
}
/*
* Wait for stopped. Depending on @ptrace, different stopped state
* is used and the two don't interact with each other.
*/
ret = wait_task_stopped(wo, ptrace, p);
if (ret)
return ret;
/*
* Wait for continued. There's only one continued state and the
* ptracer can consume it which can confuse the real parent. Don't
* use WCONTINUED from ptracer. You don't need or want it.
*/
return wait_task_continued(wo, p);
}
/*
* Do the work of do_wait() for one thread in the group, @tsk.
*
* -ECHILD should be in ->notask_error before the first call.
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue; then
* ->notask_error is 0 if there were any eligible children,
* or still -ECHILD.
*/
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
struct task_struct *p;
list_for_each_entry(p, &tsk->children, sibling) {
int ret = wait_consider_task(wo, 0, p);
if (ret)
return ret;
}
return 0;
}
static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
{
struct task_struct *p;
list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
int ret = wait_consider_task(wo, 1, p);
if (ret)
return ret;
}
return 0;
}
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
struct wait_opts *wo = container_of(wait, struct wait_opts,
child_wait);
struct task_struct *p = key;
if (!eligible_pid(wo, p))
return 0;
if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
return 0;
return default_wake_function(wait, mode, sync, key);
}
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
__wake_up_sync_key(&parent->signal->wait_chldexit,
TASK_INTERRUPTIBLE, p);
}
static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
struct task_struct *target)
{
struct task_struct *parent =
!ptrace ? target->real_parent : target->parent;
return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
same_thread_group(current, parent));
}
/*
* Optimization for waiting on PIDTYPE_PID. No need to iterate through child
* and tracee lists to find the target task.
*/
static int do_wait_pid(struct wait_opts *wo)
{
bool ptrace;
struct task_struct *target;
int retval;
ptrace = false;
target = pid_task(wo->wo_pid, PIDTYPE_TGID);
if (target && is_effectively_child(wo, ptrace, target)) {
retval = wait_consider_task(wo, ptrace, target);
if (retval)
return retval;
}
ptrace = true;
target = pid_task(wo->wo_pid, PIDTYPE_PID);
if (target && target->ptrace &&
is_effectively_child(wo, ptrace, target)) {
retval = wait_consider_task(wo, ptrace, target);
if (retval)
return retval;
}
return 0;
}
static long do_wait(struct wait_opts *wo)
{
int retval;
trace_sched_process_wait(wo->wo_pid);
init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
wo->child_wait.private = current;
add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait);
repeat:
/*
* If there is nothing that can match our criteria, just get out.
* We will clear ->notask_error to zero if we see any child that
* might later match our criteria, even if we are not able to reap
* it yet.
*/
wo->notask_error = -ECHILD;
if ((wo->wo_type < PIDTYPE_MAX) &&
(!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
goto notask;
set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
if (wo->wo_type == PIDTYPE_PID) {
retval = do_wait_pid(wo);
if (retval)
goto end;
} else {
struct task_struct *tsk = current;
do {
retval = do_wait_thread(wo, tsk);
if (retval)
goto end;
retval = ptrace_do_wait(wo, tsk);
if (retval)
goto end;
if (wo->wo_flags & __WNOTHREAD)
break;
} while_each_thread(current, tsk);
}
read_unlock(&tasklist_lock);
notask:
retval = wo->notask_error;
if (!retval && !(wo->wo_flags & WNOHANG)) {
retval = -ERESTARTSYS;
if (!signal_pending(current)) {
schedule();
goto repeat;
}
}
end:
__set_current_state(TASK_RUNNING);
remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait);
return retval;
}
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
int options, struct rusage *ru)
{
struct wait_opts wo;
struct pid *pid = NULL;
enum pid_type type;
long ret;
unsigned int f_flags = 0;
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
return -EINVAL;
switch (which) {
case P_ALL:
type = PIDTYPE_MAX;
break;
case P_PID:
type = PIDTYPE_PID;
if (upid <= 0)
return -EINVAL;
pid = find_get_pid(upid);
break;
case P_PGID:
type = PIDTYPE_PGID;
if (upid < 0)
return -EINVAL;
if (upid)
pid = find_get_pid(upid);
else
pid = get_task_pid(current, PIDTYPE_PGID);
break;
case P_PIDFD:
type = PIDTYPE_PID;
if (upid < 0)
return -EINVAL;
pid = pidfd_get_pid(upid, &f_flags);
if (IS_ERR(pid))
return PTR_ERR(pid);
break;
default:
return -EINVAL;
}
wo.wo_type = type;
wo.wo_pid = pid;
wo.wo_flags = options;
wo.wo_info = infop;
wo.wo_rusage = ru;
if (f_flags & O_NONBLOCK)
wo.wo_flags |= WNOHANG;
ret = do_wait(&wo);
if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
ret = -EAGAIN;
put_pid(pid);
return ret;
}
SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
infop, int, options, struct rusage __user *, ru)
{
struct rusage r;
struct waitid_info info = {.status = 0};
long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
int signo = 0;
if (err > 0) {
signo = SIGCHLD;
err = 0;
if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
return -EFAULT;
}
if (!infop)
return err;
if (!user_write_access_begin(infop, sizeof(*infop)))
return -EFAULT;
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
unsafe_put_user(info.cause, &infop->si_code, Efault);
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
user_write_access_end();
return err;
Efault:
user_write_access_end();
return -EFAULT;
}
long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
struct rusage *ru)
{
struct wait_opts wo;
struct pid *pid = NULL;
enum pid_type type;
long ret;
if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
/* -INT_MIN is not defined */
if (upid == INT_MIN)
return -ESRCH;
if (upid == -1)
type = PIDTYPE_MAX;
else if (upid < 0) {
type = PIDTYPE_PGID;
pid = find_get_pid(-upid);
} else if (upid == 0) {
type = PIDTYPE_PGID;
pid = get_task_pid(current, PIDTYPE_PGID);
} else /* upid > 0 */ {
type = PIDTYPE_PID;
pid = find_get_pid(upid);
}
wo.wo_type = type;
wo.wo_pid = pid;
wo.wo_flags = options | WEXITED;
wo.wo_info = NULL;
wo.wo_stat = 0;
wo.wo_rusage = ru;
ret = do_wait(&wo);
put_pid(pid);
if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
ret = -EFAULT;
return ret;
}
int kernel_wait(pid_t pid, int *stat)
{
struct wait_opts wo = {
.wo_type = PIDTYPE_PID,
.wo_pid = find_get_pid(pid),
.wo_flags = WEXITED,
};
int ret;
ret = do_wait(&wo);
if (ret > 0 && wo.wo_stat)
*stat = wo.wo_stat;
put_pid(wo.wo_pid);
return ret;
}
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
int, options, struct rusage __user *, ru)
{
struct rusage r;
long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
if (err > 0) {
if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
return -EFAULT;
}
return err;
}
#ifdef __ARCH_WANT_SYS_WAITPID
/*
* sys_waitpid() remains for compatibility. waitpid() should be
* implemented by calling sys_wait4() from libc.a.
*/
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
return kernel_wait4(pid, stat_addr, options, NULL);
}
#endif
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(wait4,
compat_pid_t, pid,
compat_uint_t __user *, stat_addr,
int, options,
struct compat_rusage __user *, ru)
{
struct rusage r;
long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
if (err > 0) {
if (ru && put_compat_rusage(&r, ru))
return -EFAULT;
}
return err;
}
COMPAT_SYSCALL_DEFINE5(waitid,
int, which, compat_pid_t, pid,
struct compat_siginfo __user *, infop, int, options,
struct compat_rusage __user *, uru)
{
struct rusage ru;
struct waitid_info info = {.status = 0};
long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
int signo = 0;
if (err > 0) {
signo = SIGCHLD;
err = 0;
if (uru) {
/* kernel_waitid() overwrites everything in ru */
if (COMPAT_USE_64BIT_TIME)
err = copy_to_user(uru, &ru, sizeof(ru));
else
err = put_compat_rusage(&ru, uru);
if (err)
return -EFAULT;
}
}
if (!infop)
return err;
if (!user_write_access_begin(infop, sizeof(*infop)))
return -EFAULT;
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
unsafe_put_user(info.cause, &infop->si_code, Efault);
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
user_write_access_end();
return err;
Efault:
user_write_access_end();
return -EFAULT;
}
#endif
/**
* thread_group_exited - check that a thread group has exited
* @pid: tgid of thread group to be checked.
*
* Test if the thread group represented by tgid has exited (all
* threads are zombies, dead or completely gone).
*
* Return: true if the thread group has exited. false otherwise.
*/
bool thread_group_exited(struct pid *pid)
{
struct task_struct *task;
bool exited;
rcu_read_lock();
task = pid_task(pid, PIDTYPE_PID);
exited = !task ||
(READ_ONCE(task->exit_state) && thread_group_empty(task));
rcu_read_unlock();
return exited;
}
EXPORT_SYMBOL(thread_group_exited);
__weak void abort(void)
{
BUG();
/* if that doesn't kill us, halt */
panic("Oops failed to kill thread");
}
EXPORT_SYMBOL(abort);
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Hash: Hash algorithms under the crypto API
*
* Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
*/
#ifndef _CRYPTO_HASH_H
#define _CRYPTO_HASH_H
#include <linux/crypto.h>
#include <linux/string.h>
struct crypto_ahash;
/**
* DOC: Message Digest Algorithm Definitions
*
* These data structures define modular message digest algorithm
* implementations, managed via crypto_register_ahash(),
* crypto_register_shash(), crypto_unregister_ahash() and
* crypto_unregister_shash().
*/
/**
* struct hash_alg_common - define properties of message digest
* @digestsize: Size of the result of the transformation. A buffer of this size
* must be available to the @final and @finup calls, so they can
* store the resulting hash into it. For various predefined sizes,
* search include/crypto/ using
* git grep _DIGEST_SIZE include/crypto.
* @statesize: Size of the block for partial state of the transformation. A
* buffer of this size must be passed to the @export function as it
* will save the partial state of the transformation into it. On the
* other side, the @import function will load the state from a
* buffer of this size as well.
* @base: Start of data structure of cipher algorithm. The common data
* structure of crypto_alg contains information common to all ciphers.
* The hash_alg_common data structure now adds the hash-specific
* information.
*/
struct hash_alg_common {
unsigned int digestsize;
unsigned int statesize;
struct crypto_alg base;
};
struct ahash_request {
struct crypto_async_request base;
unsigned int nbytes;
struct scatterlist *src;
u8 *result;
/* This field may only be used by the ahash API code. */
void *priv;
void *__ctx[] CRYPTO_MINALIGN_ATTR;
};
/**
* struct ahash_alg - asynchronous message digest definition
* @init: **[mandatory]** Initialize the transformation context. Intended only to initialize the
* state of the HASH transformation at the beginning. This shall fill in
* the internal structures used during the entire duration of the whole
* transformation. No data processing happens at this point. Driver code
* implementation must not use req->result.
* @update: **[mandatory]** Push a chunk of data into the driver for transformation. This
* function actually pushes blocks of data from upper layers into the
* driver, which then passes those to the hardware as seen fit. This
* function must not finalize the HASH transformation by calculating the
* final message digest as this only adds more data into the
* transformation. This function shall not modify the transformation
* context, as this function may be called in parallel with the same
* transformation object. Data processing can happen synchronously
* [SHASH] or asynchronously [AHASH] at this point. Driver must not use
* req->result.
* @final: **[mandatory]** Retrieve result from the driver. This function finalizes the
* transformation and retrieves the resulting hash from the driver and
* pushes it back to upper layers. No data processing happens at this
* point unless hardware requires it to finish the transformation
* (then the data buffered by the device driver is processed).
* @finup: **[optional]** Combination of @update and @final. This function is effectively a
* combination of @update and @final calls issued in sequence. As some
* hardware cannot do @update and @final separately, this callback was
* added to allow such hardware to be used at least by IPsec. Data
* processing can happen synchronously [SHASH] or asynchronously [AHASH]
* at this point.
* @digest: Combination of @init and @update and @final. This function
* effectively behaves as the entire chain of operations, @init,
* @update and @final issued in sequence. Just like @finup, this was
* added for hardware which cannot do even the @finup, but can only do
* the whole transformation in one run. Data processing can happen
* synchronously [SHASH] or asynchronously [AHASH] at this point.
* @setkey: Set optional key used by the hashing algorithm. Intended to push
* optional key used by the hashing algorithm from upper layers into
* the driver. This function can store the key in the transformation
* context or can outright program it into the hardware. In the former
* case, one must be careful to program the key into the hardware at
* appropriate time and one must be careful that .setkey() can be
* called multiple times during the existence of the transformation
* object. Not all hashing algorithms do implement this function as it
* is only needed for keyed message digests. SHAx/MDx/CRCx do NOT
* implement this function. HMAC(MDx)/HMAC(SHAx)/CMAC(AES) do implement
* this function. This function must be called before any other of the
* @init, @update, @final, @finup, @digest is called. No data
* processing happens at this point.
* @export: Export partial state of the transformation. This function dumps the
* entire state of the ongoing transformation into a provided block of
* data so it can be @import 'ed back later on. This is useful in case
* you want to save partial result of the transformation after
* processing certain amount of data and reload this partial result
* multiple times later on for multiple re-use. No data processing
* happens at this point. Driver must not use req->result.
* @import: Import partial state of the transformation. This function loads the
* entire state of the ongoing transformation from a provided block of
* data so the transformation can continue from this point onward. No
* data processing happens at this point. Driver must not use
* req->result.
* @init_tfm: Initialize the cryptographic transformation object.
* This function is called only once at the instantiation
* time, right after the transformation context was
* allocated. In case the cryptographic hardware has
* some special requirements which need to be handled
* by software, this function shall check for the precise
* requirement of the transformation and put any software
* fallbacks in place.
* @exit_tfm: Deinitialize the cryptographic transformation object.
* This is a counterpart to @init_tfm, used to remove
* various changes set in @init_tfm.
* @halg: see struct hash_alg_common
*/
struct ahash_alg {
int (*init)(struct ahash_request *req);
int (*update)(struct ahash_request *req);
int (*final)(struct ahash_request *req);
int (*finup)(struct ahash_request *req);
int (*digest)(struct ahash_request *req);
int (*export)(struct ahash_request *req, void *out);
int (*import)(struct ahash_request *req, const void *in);
int (*setkey)(struct crypto_ahash *tfm, const u8 *key,
unsigned int keylen);
int (*init_tfm)(struct crypto_ahash *tfm);
void (*exit_tfm)(struct crypto_ahash *tfm);
struct hash_alg_common halg;
};
struct shash_desc {
struct crypto_shash *tfm;
void *__ctx[] __aligned(ARCH_SLAB_MINALIGN);
};
#define HASH_MAX_DIGESTSIZE 64
/*
* Worst case is hmac(sha3-224-generic). Its context is a nested 'shash_desc'
* containing a 'struct sha3_state'.
*/
#define HASH_MAX_DESCSIZE (sizeof(struct shash_desc) + 360)
#define HASH_MAX_STATESIZE 512
#define SHASH_DESC_ON_STACK(shash, ctx) \
char __##shash##_desc[sizeof(struct shash_desc) + HASH_MAX_DESCSIZE] \
__aligned(__alignof__(struct shash_desc)); \
struct shash_desc *shash = (struct shash_desc *)__##shash##_desc
/**
* struct shash_alg - synchronous message digest definition
* @init: see struct ahash_alg
* @update: see struct ahash_alg
* @final: see struct ahash_alg
* @finup: see struct ahash_alg
* @digest: see struct ahash_alg
* @export: see struct ahash_alg
* @import: see struct ahash_alg
* @setkey: see struct ahash_alg
* @init_tfm: Initialize the cryptographic transformation object.
* This function is called only once at the instantiation
* time, right after the transformation context was
* allocated. In case the cryptographic hardware has
* some special requirements which need to be handled
* by software, this function shall check for the precise
* requirement of the transformation and put any software
* fallbacks in place.
* @exit_tfm: Deinitialize the cryptographic transformation object.
* This is a counterpart to @init_tfm, used to remove
* various changes set in @init_tfm.
* @digestsize: see struct ahash_alg
* @statesize: see struct ahash_alg
* @descsize: Size of the operational state for the message digest. This state
* size is the memory size that needs to be allocated for
* shash_desc.__ctx
* @base: internally used
*/
struct shash_alg {
int (*init)(struct shash_desc *desc);
int (*update)(struct shash_desc *desc, const u8 *data,
unsigned int len);
int (*final)(struct shash_desc *desc, u8 *out);
int (*finup)(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out);
int (*digest)(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out);
int (*export)(struct shash_desc *desc, void *out);
int (*import)(struct shash_desc *desc, const void *in);
int (*setkey)(struct crypto_shash *tfm, const u8 *key,
unsigned int keylen);
int (*init_tfm)(struct crypto_shash *tfm);
void (*exit_tfm)(struct crypto_shash *tfm);
unsigned int descsize;
/* These fields must match hash_alg_common. */
unsigned int digestsize
__attribute__ ((aligned(__alignof__(struct hash_alg_common))));
unsigned int statesize;
struct crypto_alg base;
};
struct crypto_ahash {
int (*init)(struct ahash_request *req);
int (*update)(struct ahash_request *req);
int (*final)(struct ahash_request *req);
int (*finup)(struct ahash_request *req);
int (*digest)(struct ahash_request *req);
int (*export)(struct ahash_request *req, void *out);
int (*import)(struct ahash_request *req, const void *in);
int (*setkey)(struct crypto_ahash *tfm, const u8 *key,
unsigned int keylen);
unsigned int reqsize;
struct crypto_tfm base;
};
struct crypto_shash {
unsigned int descsize;
struct crypto_tfm base;
};
/**
* DOC: Asynchronous Message Digest API
*
* The asynchronous message digest API is used with the ciphers of type
* CRYPTO_ALG_TYPE_AHASH (listed as type "ahash" in /proc/crypto)
*
* The asynchronous cipher operation discussion provided for the
* CRYPTO_ALG_TYPE_SKCIPHER API applies here as well.
*/
static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm)
{
return container_of(tfm, struct crypto_ahash, base);
}
/**
* crypto_alloc_ahash() - allocate ahash cipher handle
* @alg_name: is the cra_name / name or cra_driver_name / driver name of the
* ahash cipher
* @type: specifies the type of the cipher
* @mask: specifies the mask for the cipher
*
* Allocate a cipher handle for an ahash. The returned struct
* crypto_ahash is the cipher handle that is required for any subsequent
* API invocation for that ahash.
*
* Return: allocated cipher handle in case of success; IS_ERR() is true in case
* of an error, PTR_ERR() returns the error code.
*/
struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type,
u32 mask);
static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm)
{
return &tfm->base;
}
/**
* crypto_free_ahash() - zeroize and free the ahash handle
* @tfm: cipher handle to be freed
*
* If @tfm is a NULL or error pointer, this function does nothing.
*/
static inline void crypto_free_ahash(struct crypto_ahash *tfm)
{
crypto_destroy_tfm(tfm, crypto_ahash_tfm(tfm));
}
/**
* crypto_has_ahash() - Search for the availability of an ahash.
* @alg_name: is the cra_name / name or cra_driver_name / driver name of the
* ahash
* @type: specifies the type of the ahash
* @mask: specifies the mask for the ahash
*
* Return: true when the ahash is known to the kernel crypto API; false
* otherwise
*/
int crypto_has_ahash(const char *alg_name, u32 type, u32 mask);
static inline const char *crypto_ahash_alg_name(struct crypto_ahash *tfm)
{
return crypto_tfm_alg_name(crypto_ahash_tfm(tfm));
}
static inline const char *crypto_ahash_driver_name(struct crypto_ahash *tfm)
{
return crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm));
}
static inline unsigned int crypto_ahash_alignmask(
struct crypto_ahash *tfm)
{
return crypto_tfm_alg_alignmask(crypto_ahash_tfm(tfm));
}
/**
* crypto_ahash_blocksize() - obtain block size for cipher
* @tfm: cipher handle
*
* The block size for the message digest cipher referenced with the cipher
* handle is returned.
*
* Return: block size of cipher
*/
static inline unsigned int crypto_ahash_blocksize(struct crypto_ahash *tfm)
{
return crypto_tfm_alg_blocksize(crypto_ahash_tfm(tfm));
}
static inline struct hash_alg_common *__crypto_hash_alg_common(
struct crypto_alg *alg)
{
return container_of(alg, struct hash_alg_common, base);
}
static inline struct hash_alg_common *crypto_hash_alg_common(
struct crypto_ahash *tfm)
{
return __crypto_hash_alg_common(crypto_ahash_tfm(tfm)->__crt_alg);
}
/**
* crypto_ahash_digestsize() - obtain message digest size
* @tfm: cipher handle
*
* The size for the message digest created by the message digest cipher
* referenced with the cipher handle is returned.
*
*
* Return: message digest size of cipher
*/
static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm)
{
return crypto_hash_alg_common(tfm)->digestsize;
}
/**
* crypto_ahash_statesize() - obtain size of the ahash state
* @tfm: cipher handle
*
* Return the size of the ahash state. With the crypto_ahash_export()
* function, the caller can export the state into a buffer whose size is
* defined with this function.
*
* Return: size of the ahash state
*/
static inline unsigned int crypto_ahash_statesize(struct crypto_ahash *tfm)
{
return crypto_hash_alg_common(tfm)->statesize;
}
static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm)
{
return crypto_tfm_get_flags(crypto_ahash_tfm(tfm));
}
static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags)
{
crypto_tfm_set_flags(crypto_ahash_tfm(tfm), flags);
}
static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags)
{
crypto_tfm_clear_flags(crypto_ahash_tfm(tfm), flags);
}
/**
* crypto_ahash_reqtfm() - obtain cipher handle from request
* @req: asynchronous request handle that contains the reference to the ahash
* cipher handle
*
* Return the ahash cipher handle that is registered with the asynchronous
* request handle ahash_request.
*
* Return: ahash cipher handle
*/
static inline struct crypto_ahash *crypto_ahash_reqtfm(
struct ahash_request *req)
{
return __crypto_ahash_cast(req->base.tfm);
}
/**
* crypto_ahash_reqsize() - obtain size of the request data structure
* @tfm: cipher handle
*
* Return: size of the request data
*/
static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm)
{
return tfm->reqsize;
}
static inline void *ahash_request_ctx(struct ahash_request *req)
{
return req->__ctx;
}
/**
* crypto_ahash_setkey - set key for cipher handle
* @tfm: cipher handle
* @key: buffer holding the key
* @keylen: length of the key in bytes
*
* The caller provided key is set for the ahash cipher. The cipher
* handle must point to a keyed hash in order for this function to succeed.
*
* Return: 0 if the setting of the key was successful; < 0 if an error occurred
*/
int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key,
unsigned int keylen);
/**
* crypto_ahash_finup() - update and finalize message digest
* @req: reference to the ahash_request handle that holds all information
* needed to perform the cipher operation
*
* This function is a "short-hand" for the function calls of
* crypto_ahash_update and crypto_ahash_final. The parameters have the same
* meaning as discussed for those separate functions.
*
* Return: see crypto_ahash_final()
*/
int crypto_ahash_finup(struct ahash_request *req);
/**
* crypto_ahash_final() - calculate message digest
* @req: reference to the ahash_request handle that holds all information
* needed to perform the cipher operation
*
* Finalize the message digest operation and create the message digest
* based on all data added to the cipher handle. The message digest is placed
* into the output buffer registered with the ahash_request handle.
*
* Return:
* 0 if the message digest was successfully calculated;
* -EINPROGRESS if data is fed into hardware (DMA) or queued for later;
* -EBUSY if queue is full and request should be resubmitted later;
* other < 0 if an error occurred
*/
int crypto_ahash_final(struct ahash_request *req);
/**
* crypto_ahash_digest() - calculate message digest for a buffer
* @req: reference to the ahash_request handle that holds all information
* needed to perform the cipher operation
*
* This function is a "short-hand" for the function calls of crypto_ahash_init,
* crypto_ahash_update and crypto_ahash_final. The parameters have the same
* meaning as discussed for those separate three functions.
*
* Return: see crypto_ahash_final()
*/
int crypto_ahash_digest(struct ahash_request *req);
/**
* crypto_ahash_export() - extract current message digest state
* @req: reference to the ahash_request handle whose state is exported
* @out: output buffer of sufficient size that can hold the hash state
*
* This function exports the hash state of the ahash_request handle into the
* caller-allocated output buffer out which must have sufficient size (e.g. by
* calling crypto_ahash_statesize()).
*
* Return: 0 if the export was successful; < 0 if an error occurred
*/
static inline int crypto_ahash_export(struct ahash_request *req, void *out)
{
return crypto_ahash_reqtfm(req)->export(req, out);
}
/**
* crypto_ahash_import() - import message digest state
* @req: reference to ahash_request handle the state is imported into
* @in: buffer holding the state
*
* This function imports the hash state into the ahash_request handle from the
* input buffer. That buffer should have been generated with the
* crypto_ahash_export function.
*
* Return: 0 if the import was successful; < 0 if an error occurred
*/
static inline int crypto_ahash_import(struct ahash_request *req, const void *in)
{
struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
return -ENOKEY;
return tfm->import(req, in);
}
/**
* crypto_ahash_init() - (re)initialize message digest handle
* @req: ahash_request handle that already is initialized with all necessary
* data using the ahash_request_* API functions
*
* The call (re-)initializes the message digest referenced by the ahash_request
* handle. Any potentially existing state created by previous operations is
* discarded.
*
* Return: see crypto_ahash_final()
*/
static inline int crypto_ahash_init(struct ahash_request *req)
{
struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
return -ENOKEY;
return tfm->init(req);
}
/**
* crypto_ahash_update() - add data to message digest for processing
* @req: ahash_request handle that was previously initialized with the
* crypto_ahash_init call.
*
* Updates the message digest state of the &ahash_request handle. The input data
* is pointed to by the scatter/gather list registered in the &ahash_request
* handle
*
* Return: see crypto_ahash_final()
*/
static inline int crypto_ahash_update(struct ahash_request *req)
{
struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
struct crypto_alg *alg = tfm->base.__crt_alg;
unsigned int nbytes = req->nbytes;
int ret;
crypto_stats_get(alg);
ret = crypto_ahash_reqtfm(req)->update(req);
crypto_stats_ahash_update(nbytes, ret, alg);
return ret;
}
/**
* DOC: Asynchronous Hash Request Handle
*
* The &ahash_request data structure contains all pointers to data
* required for the asynchronous cipher operation. This includes the cipher
* handle (which can be used by multiple &ahash_request instances), pointer
* to plaintext and the message digest output buffer, asynchronous callback
* function, etc. It acts as a handle to the ahash_request_* API calls in a
* similar way as ahash handle to the crypto_ahash_* API calls.
*/
/**
* ahash_request_set_tfm() - update cipher handle reference in request
* @req: request handle to be modified
* @tfm: cipher handle that shall be added to the request handle
*
* Allow the caller to replace the existing ahash handle in the request
* data structure with a different one.
*/
static inline void ahash_request_set_tfm(struct ahash_request *req,
struct crypto_ahash *tfm)
{
req->base.tfm = crypto_ahash_tfm(tfm);
}
/**
* ahash_request_alloc() - allocate request data structure
* @tfm: cipher handle to be registered with the request
* @gfp: memory allocation flag that is handed to kmalloc by the API call.
*
* Allocate the request data structure that must be used with the ahash
* message digest API calls. During
* the allocation, the provided ahash handle
* is registered in the request data structure.
*
* Return: allocated request handle in case of success, or NULL if out of memory
*/
static inline struct ahash_request *ahash_request_alloc(
struct crypto_ahash *tfm, gfp_t gfp)
{
struct ahash_request *req;
req = kmalloc(sizeof(struct ahash_request) +
crypto_ahash_reqsize(tfm), gfp);
if (likely(req))
ahash_request_set_tfm(req, tfm);
return req;
}
/**
* ahash_request_free() - zeroize and free the request data structure
* @req: request data structure cipher handle to be freed
*/
static inline void ahash_request_free(struct ahash_request *req)
{
kfree_sensitive(req);
}
static inline void ahash_request_zero(struct ahash_request *req)
{
memzero_explicit(req, sizeof(*req) +
crypto_ahash_reqsize(crypto_ahash_reqtfm(req)));
}
static inline struct ahash_request *ahash_request_cast(
struct crypto_async_request *req)
{
return container_of(req, struct ahash_request, base);
}
/**
* ahash_request_set_callback() - set asynchronous callback function
* @req: request handle
* @flags: specify zero or an ORing of the flags
* CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and
* increase the wait queue beyond the initial maximum size;
* CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep
* @compl: callback function pointer to be registered with the request handle
* @data: The data pointer refers to memory that is not used by the kernel
* crypto API, but provided to the callback function for it to use. Here,
* the caller can provide a reference to memory the callback function can
* operate on. As the callback function is invoked asynchronously to the
* related functionality, it may need to access data structures of the
* related functionality which can be referenced using this pointer. The
* callback function can access the memory via the "data" field in the
* &crypto_async_request data structure provided to the callback function.
*
* This function allows setting the callback function that is triggered once
* the cipher operation completes.
*
* The callback function is registered with the &ahash_request handle and
* must comply with the following template::
*
* void callback_function(struct crypto_async_request *req, int error)
*/
static inline void ahash_request_set_callback(struct ahash_request *req,
u32 flags,
crypto_completion_t compl,
void *data)
{
req->base.complete = compl;
req->base.data = data;
req->base.flags = flags;
}
/**
* ahash_request_set_crypt() - set data buffers
* @req: ahash_request handle to be updated
* @src: source scatter/gather list
* @result: buffer that is filled with the message digest -- the caller must
* ensure that the buffer has sufficient space by, for example, calling
* crypto_ahash_digestsize()
* @nbytes: number of bytes to process from the source scatter/gather list
*
* By using this call, the caller references the source scatter/gather list.
* The source scatter/gather list points to the data the message digest is to
* be calculated for.
*/
static inline void ahash_request_set_crypt(struct ahash_request *req,
struct scatterlist *src, u8 *result,
unsigned int nbytes)
{
req->src = src;
req->nbytes = nbytes;
req->result = result;
}
/**
* DOC: Synchronous Message Digest API
*
* The synchronous message digest API is used with the ciphers of type
* CRYPTO_ALG_TYPE_SHASH (listed as type "shash" in /proc/crypto)
*
* The message digest API is able to maintain state information for the
* caller.
*
* The synchronous message digest API can store user-related context in its
* shash_desc request data structure.
*/
/**
* crypto_alloc_shash() - allocate message digest handle
* @alg_name: is the cra_name / name or cra_driver_name / driver name of the
* message digest cipher
* @type: specifies the type of the cipher
* @mask: specifies the mask for the cipher
*
* Allocate a cipher handle for a message digest. The returned &struct
* crypto_shash is the cipher handle that is required for any subsequent
* API invocation for that message digest.
*
* Return: allocated cipher handle in case of success; IS_ERR() is true in case
* of an error, PTR_ERR() returns the error code.
*/
struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
u32 mask);
static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
{
return &tfm->base;
}
/**
* crypto_free_shash() - zeroize and free the message digest handle
* @tfm: cipher handle to be freed
*
* If @tfm is a NULL or error pointer, this function does nothing.
*/
static inline void crypto_free_shash(struct crypto_shash *tfm)
{
crypto_destroy_tfm(tfm, crypto_shash_tfm(tfm));
}
static inline const char *crypto_shash_alg_name(struct crypto_shash *tfm)
{
return crypto_tfm_alg_name(crypto_shash_tfm(tfm));
}
static inline const char *crypto_shash_driver_name(struct crypto_shash *tfm)
{
return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm));
}
static inline unsigned int crypto_shash_alignmask(
struct crypto_shash *tfm)
{
return crypto_tfm_alg_alignmask(crypto_shash_tfm(tfm));
}
/**
* crypto_shash_blocksize() - obtain block size for cipher
* @tfm: cipher handle
*
* The block size for the message digest cipher referenced with the cipher
* handle is returned.
*
* Return: block size of cipher
*/
static inline unsigned int crypto_shash_blocksize(struct crypto_shash *tfm)
{
return crypto_tfm_alg_blocksize(crypto_shash_tfm(tfm));
}
static inline struct shash_alg *__crypto_shash_alg(struct crypto_alg *alg)
{
return container_of(alg, struct shash_alg, base);
}
static inline struct shash_alg *crypto_shash_alg(struct crypto_shash *tfm)
{
return __crypto_shash_alg(crypto_shash_tfm(tfm)->__crt_alg);
}
/**
* crypto_shash_digestsize() - obtain message digest size
* @tfm: cipher handle
*
* The size for the message digest created by the message digest cipher
* referenced with the cipher handle is returned.
*
* Return: digest size of cipher
*/
static inline unsigned int crypto_shash_digestsize(struct crypto_shash *tfm)
{
return crypto_shash_alg(tfm)->digestsize;
}
static inline unsigned int crypto_shash_statesize(struct crypto_shash *tfm)
{
return crypto_shash_alg(tfm)->statesize;
}
static inline u32 crypto_shash_get_flags(struct crypto_shash *tfm)
{
return crypto_tfm_get_flags(crypto_shash_tfm(tfm));
}
static inline void crypto_shash_set_flags(struct crypto_shash *tfm, u32 flags)
{
crypto_tfm_set_flags(crypto_shash_tfm(tfm), flags);
}
static inline void crypto_shash_clear_flags(struct crypto_shash *tfm, u32 flags)
{
crypto_tfm_clear_flags(crypto_shash_tfm(tfm), flags);
}
/**
* crypto_shash_descsize() - obtain the operational state size
* @tfm: cipher handle
*
* The size of the operational state the cipher needs during operation is
* returned for the hash referenced with the cipher handle. This size is
* required to calculate the memory requirements to allow the caller allocating
* sufficient memory for operational state.
*
* The operational state is defined with struct shash_desc where the size of
* that data structure is to be calculated as
* sizeof(struct shash_desc) + crypto_shash_descsize(alg)
*
* Return: size of the operational state
*/
static inline unsigned int crypto_shash_descsize(struct crypto_shash *tfm)
{
return tfm->descsize;
}
static inline void *shash_desc_ctx(struct shash_desc *desc)
{
return desc->__ctx;
}
/**
* crypto_shash_setkey() - set key for message digest
* @tfm: cipher handle
* @key: buffer holding the key
* @keylen: length of the key in bytes
*
* The caller provided key is set for the keyed message digest cipher. The
* cipher handle must point to a keyed message digest cipher in order for this
* function to succeed.
*
* Context: Any context.
* Return: 0 if the setting of the key was successful; < 0 if an error occurred
*/
int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
unsigned int keylen);
/**
* crypto_shash_digest() - calculate message digest for buffer
* @desc: see crypto_shash_final()
* @data: see crypto_shash_update()
* @len: see crypto_shash_update()
* @out: see crypto_shash_final()
*
* This function is a "short-hand" for the function calls of crypto_shash_init,
* crypto_shash_update and crypto_shash_final. The parameters have the same
* meaning as discussed for those separate three functions.
*
* Context: Any context.
* Return: 0 if the message digest creation was successful; < 0 if an error
* occurred
*/
int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out);
/**
* crypto_shash_tfm_digest() - calculate message digest for buffer
* @tfm: hash transformation object
* @data: see crypto_shash_update()
* @len: see crypto_shash_update()
* @out: see crypto_shash_final()
*
* This is a simplified version of crypto_shash_digest() for users who don't
* want to allocate their own hash descriptor (shash_desc). Instead,
* crypto_shash_tfm_digest() takes a hash transformation object (crypto_shash)
* directly, and it allocates a hash descriptor on the stack internally.
* Note that this stack allocation may be fairly large.
*
* Context: Any context.
* Return: 0 on success; < 0 if an error occurred.
*/
int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data,
unsigned int len, u8 *out);
/**
* crypto_shash_export() - extract operational state for message digest
* @desc: reference to the operational state handle whose state is exported
* @out: output buffer of sufficient size that can hold the hash state
*
* This function exports the hash state of the operational state handle into the
* caller-allocated output buffer out which must have sufficient size (e.g. by
* calling crypto_shash_descsize).
*
* Context: Any context.
* Return: 0 if the export creation was successful; < 0 if an error occurred
*/
static inline int crypto_shash_export(struct shash_desc *desc, void *out)
{
return crypto_shash_alg(desc->tfm)->export(desc, out);
}
/**
* crypto_shash_import() - import operational state
* @desc: reference to the operational state handle the state imported into
* @in: buffer holding the state
*
* This function imports the hash state into the operational state handle from
* the input buffer. That buffer should have been generated with the
* crypto_ahash_export function.
*
* Context: Any context.
* Return: 0 if the import was successful; < 0 if an error occurred
*/
static inline int crypto_shash_import(struct shash_desc *desc, const void *in)
{
struct crypto_shash *tfm = desc->tfm;
if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
return -ENOKEY;
return crypto_shash_alg(tfm)->import(desc, in);
}
/**
* crypto_shash_init() - (re)initialize message digest
* @desc: operational state handle that is already filled
*
* The call (re-)initializes the message digest referenced by the
* operational state handle. Any potentially existing state created by
* previous operations is discarded.
*
* Context: Any context.
* Return: 0 if the message digest initialization was successful; < 0 if an
* error occurred
*/
static inline int crypto_shash_init(struct shash_desc *desc)
{
struct crypto_shash *tfm = desc->tfm;
if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
return -ENOKEY;
return crypto_shash_alg(tfm)->init(desc);
}
/**
* crypto_shash_update() - add data to message digest for processing
* @desc: operational state handle that is already initialized
* @data: input data to be added to the message digest
* @len: length of the input data
*
* Updates the message digest state of the operational state handle.
*
* Context: Any context.
* Return: 0 if the message digest update was successful; < 0 if an error
* occurred
*/
int crypto_shash_update(struct shash_desc *desc, const u8 *data,
unsigned int len);
/**
* crypto_shash_final() - calculate message digest
* @desc: operational state handle that is already filled with data
* @out: output buffer filled with the message digest
*
* Finalize the message digest operation and create the message digest
* based on all data added to the cipher handle. The message digest is placed
* into the output buffer. The caller must ensure that the output buffer is
* large enough by using crypto_shash_digestsize.
*
* Context: Any context.
* Return: 0 if the message digest creation was successful; < 0 if an error
* occurred
*/
int crypto_shash_final(struct shash_desc *desc, u8 *out);
/**
* crypto_shash_finup() - calculate message digest of buffer
* @desc: see crypto_shash_final()
* @data: see crypto_shash_update()
* @len: see crypto_shash_update()
* @out: see crypto_shash_final()
*
* This function is a "short-hand" for the function calls of
* crypto_shash_update and crypto_shash_final. The parameters have the same
* meaning as discussed for those separate functions.
*
* Context: Any context.
* Return: 0 if the message digest creation was successful; < 0 if an error
* occurred
*/
int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out);
static inline void shash_desc_zero(struct shash_desc *desc)
{
memzero_explicit(desc,
sizeof(*desc) + crypto_shash_descsize(desc->tfm));
}
#endif /* _CRYPTO_HASH_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple NUMA memory policy for the Linux kernel.
*
* Copyright 2003,2004 Andi Kleen, SuSE Labs.
* (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
*
* NUMA policy allows the user to give hints in which node(s) memory should
* be allocated.
*
* Support four policies per VMA and per process:
*
* The VMA policy has priority over the process policy for a page fault.
*
* interleave Allocate memory interleaved over a set of nodes,
* with normal fallback if it fails.
* For VMA based allocations this interleaves based on the
* offset into the backing object or offset into the mapping
* for anonymous memory. For process policy an process counter
* is used.
*
* bind Only allocate memory on a specific set of nodes,
* no fallback.
* FIXME: memory is allocated starting with the first node
* to the last. It would be better if bind would truly restrict
* the allocation to memory nodes instead
*
* preferred Try a specific node first before normal fallback.
* As a special case NUMA_NO_NODE here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
* process policy.
*
* preferred many Try a set of nodes first before normal fallback. This is
* similar to preferred without the special case.
*
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
*
* The process policy is applied for most non interrupt memory allocations
* in that process' context. Interrupts ignore the policies and always
* try to allocate on the local CPU. The VMA policy is only applied for memory
* allocations for a VMA in the VM.
*
* Currently there are a few corner cases in swapping where the policy
* is not applied, but the majority should be handled. When process policy
* is used it is not remembered over swap outs/swap ins.
*
* Only the highest zone in the zone hierarchy gets policied. Allocations
* requesting a lower zone just use default policy. This implies that
* on systems with highmem kernel lowmem allocation don't get policied.
* Same with GFP_DMA allocations.
*
* For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
* all users and remembered even when nobody has memory mapped.
*/
/* Notebook:
fix mmap readahead to honour policy and enable policy for any page cache
object
statistics for bigpages
global policy for page cache? currently it uses process policy. Requires
first item above.
handle mremap for shared memory (currently ignored for the policy)
grows down?
make bind policy root only? It can trigger oom much faster and the
kernel is not always grateful with that.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
#include <asm/tlbflush.h>
#include <linux/uaccess.h>
#include "internal.h"
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
/* Highest zone. An specific allocation for a zone below that is not
policied. */
enum zone_type policy_zone = 0;
/*
* run-time system-wide default policy => local allocation
*/
static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
.mode = MPOL_LOCAL,
};
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/**
* numa_map_to_online_node - Find closest online node
* @node: Node id to start the search
*
* Lookup the next closest node by distance if @nid is not online.
*/
int numa_map_to_online_node(int node)
{
int min_dist = INT_MAX, dist, n, min_node;
if (node == NUMA_NO_NODE || node_online(node))
return node;
min_node = node;
for_each_online_node(n) {
dist = node_distance(node, n);
if (dist < min_dist) {
min_dist = dist;
min_node = n;
}
}
return min_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
int node;
if (pol)
return pol;
node = numa_node_id();
if (node != NUMA_NO_NODE) {
pol = &preferred_node_policy[node];
/* preferred_node_policy is not initialised early in boot */
if (pol->mode) return pol;
}
return &default_policy;
}
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
return pol->flags & MPOL_MODE_FLAGS;
}
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
const nodemask_t *rel)
{
nodemask_t tmp;
nodes_fold(tmp, *orig, nodes_weight(*rel));
nodes_onto(*ret, tmp, *rel);
}
static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
pol->nodes = *nodes;
return 0;
}
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
nodes_clear(pol->nodes);
node_set(first_node(*nodes), pol->nodes);
return 0;
}
/*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes
* parameter with respect to the policy mode and flags.
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
* and mempolicy. May also be called holding the mmap_lock for write.
*/
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
int ret;
/*
* Default (pol==NULL) resp. local memory policies are not a
* subject of any remapping. They also do not need any special
* constructor.
*/
if (!pol || pol->mode == MPOL_LOCAL)
return 0;
/* Check N_MEMORY */
nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_MEMORY]);
VM_BUG_ON(!nodes);
if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
else
nodes_and(nsc->mask2, *nodes, nsc->mask1);
if (mpol_store_user_nodemask(pol))
pol->w.user_nodemask = *nodes;
else
pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
return ret;
}
/*
* This function just creates a new policy, does some check and simple
* initialization. You must invoke mpol_set_nodemask() to set nodes.
*/
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *policy;
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
return NULL;
}
VM_BUG_ON(!nodes);
/*
* MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
* MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
* All other modes require a valid pointer to a non-empty nodemask.
*/
if (mode == MPOL_PREFERRED) {
if (nodes_empty(*nodes)) {
if (((flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
mode = MPOL_LOCAL;
}
} else if (mode == MPOL_LOCAL) {
if (!nodes_empty(*nodes) ||
(flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES))
return ERR_PTR(-EINVAL);
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
atomic_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
return policy;
}
/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
if (!atomic_dec_and_test(&p->refcnt))
return;
kmem_cache_free(policy_cache, p);
}
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
nodemask_t tmp;
if (pol->flags & MPOL_F_STATIC_NODES)
nodes_and(tmp, pol->w.user_nodemask, *nodes);
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
*nodes);
pol->w.cpuset_mems_allowed = *nodes;
}
if (nodes_empty(tmp))
tmp = *nodes;
pol->nodes = tmp;
}
static void mpol_rebind_preferred(struct mempolicy *pol,
const nodemask_t *nodes)
{
pol->w.cpuset_mems_allowed = *nodes;
}
/*
* mpol_rebind_policy - Migrate a policy to a different set of nodes
*
* Per-vma policies are protected by mmap_lock. Allocations using per-task
* policies are protected by task->mems_allowed_seq to prevent a premature
* OOM/allocation failure due to parallel nodemask modification.
*/
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol)
return;
if (!mpol_store_user_nodemask(pol) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
mpol_ops[pol->mode].rebind(pol, newmask);
}
/*
* Wrapper for mpol_rebind_policy() that just requires task
* pointer, and updates task mempolicy.
*
* Called with task's alloc_lock held.
*/
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
mpol_rebind_policy(tsk->mempolicy, new);
}
/*
* Rebind each vma in mm to new nodemask.
*
* Call holding a reference to mm. Takes mm->mmap_lock during call.
*/
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
mmap_write_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next)
mpol_rebind_policy(vma->vm_policy, new);
mmap_write_unlock(mm);
}
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
[MPOL_DEFAULT] = {
.rebind = mpol_rebind_default,
},
[MPOL_INTERLEAVE] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_PREFERRED] = {
.create = mpol_new_preferred,
.rebind = mpol_rebind_preferred,
},
[MPOL_BIND] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_LOCAL] = {
.rebind = mpol_rebind_default,
},
[MPOL_PREFERRED_MANY] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_preferred,
},
};
static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
struct queue_pages {
struct list_head *pagelist;
unsigned long flags;
nodemask_t *nmask;
unsigned long start;
unsigned long end;
struct vm_area_struct *first;
};
/*
* Check if the page's nid is in qp->nmask.
*
* If MPOL_MF_INVERT is set in qp->flags, check if the nid is
* in the invert of qp->nmask.
*/
static inline bool queue_pages_required(struct page *page,
struct queue_pages *qp)
{
int nid = page_to_nid(page);
unsigned long flags = qp->flags;
return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}
/*
* queue_pages_pmd() has four possible return values:
* 0 - pages are placed on the right node or queued successfully, or
* special page is met, i.e. huge zero page.
* 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* 2 - THP was split.
* -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
* existing page was already on a node that does not follow the
* policy.
*/
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
__releases(ptl)
{
int ret = 0;
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags;
if (unlikely(is_pmd_migration_entry(*pmd))) {
ret = -EIO;
goto unlock;
}
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
walk->action = ACTION_CONTINUE;
goto out;
}
if (!queue_pages_required(page, qp))
goto unlock;
flags = qp->flags;
/* go to thp migration */
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
if (!vma_migratable(walk->vma) ||
migrate_page_add(page, qp->pagelist, flags)) {
ret = 1;
goto unlock;
}
} else
ret = -EIO;
unlock:
spin_unlock(ptl);
out:
return ret;
}
/*
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
*
* queue_pages_pte_range() has three possible return values:
* 0 - pages are placed on the right node or queued successfully, or
* special page is met, i.e. zero page.
* 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* -EIO - only MPOL_MF_STRICT was specified and an existing page was already
* on a node that does not follow the policy.
*/
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
int ret;
bool has_unmovable = false;
pte_t *pte, *mapped_pte;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
if (ret != 2)
return ret;
}
/* THP was split, fall through to pte walk */
if (pmd_trans_unstable(pmd))
return 0;
mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
if (!page)
continue;
/*
* vm_normal_page() filters out zero pages, but there might
* still be PageReserved pages to skip, perhaps in a VDSO.
*/
if (PageReserved(page))
continue;
if (!queue_pages_required(page, qp))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
/* MPOL_MF_STRICT must be specified if we get here */
if (!vma_migratable(vma)) {
has_unmovable = true;
break;
}
/*
* Do not abort immediately since there may be
* temporary off LRU pages in the range. Still
* need migrate other LRU pages.
*/
if (migrate_page_add(page, qp->pagelist, flags))
has_unmovable = true;
} else
break;
}
pte_unmap_unlock(mapped_pte, ptl);
cond_resched();
if (has_unmovable)
return 1;
return addr != end ? -EIO : 0;
}
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
unsigned long flags = (qp->flags & MPOL_MF_VALID);
struct page *page;
spinlock_t *ptl;
pte_t entry;
ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
entry = huge_ptep_get(pte);
if (!pte_present(entry))
goto unlock;
page = pte_page(entry);
if (!queue_pages_required(page, qp))
goto unlock;
if (flags == MPOL_MF_STRICT) {
/*
* STRICT alone means only detecting misplaced page and no
* need to further check other vma.
*/
ret = -EIO;
goto unlock;
}
if (!vma_migratable(walk->vma)) {
/*
* Must be STRICT with MOVE*, otherwise .test_walk() have
* stopped walking current vma.
* Detecting misplaced page but allow migrating pages which
* have been queued.
*/
ret = 1;
goto unlock;
}
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
if (!isolate_huge_page(page, qp->pagelist) &&
(flags & MPOL_MF_STRICT))
/*
* Failed to isolate page but allow migrating pages
* which have been queued.
*/
ret = 1;
}
unlock:
spin_unlock(ptl);
#else
BUG();
#endif
return ret;
}
#ifdef CONFIG_NUMA_BALANCING
/*
* This is used to mark a range of virtual addresses to be inaccessible.
* These are later cleared by a NUMA hinting fault. Depending on these
* faults, pages may be migrated for better NUMA placement.
*
* This is assuming that NUMA faults are handled using PROT_NONE. If
* an architecture makes a different choice, it will need further
* changes to the core.
*/
unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
int nr_updated;
nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct queue_pages *qp = walk->private;
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
/* range check first */
VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
if (!qp->first) {
qp->first = vma;
if (!(flags & MPOL_MF_DISCONTIG_OK) &&
(qp->start < vma->vm_start))
/* hole at head side of range */
return -EFAULT;
}
if (!(flags & MPOL_MF_DISCONTIG_OK) &&
((vma->vm_end < qp->end) &&
(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
/* hole at middle or tail of range */
return -EFAULT;
/*
* Need check MPOL_MF_STRICT to return -EIO if possible
* regardless of vma_migratable
*/
if (!vma_migratable(vma) &&
!(flags & MPOL_MF_STRICT))
return 1;
if (endvma > end)
endvma = end;
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
!(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
}
/* queue pages from current vma */
if (flags & MPOL_MF_VALID)
return 0;
return 1;
}
static const struct mm_walk_ops queue_pages_walk_ops = {
.hugetlb_entry = queue_pages_hugetlb,
.pmd_entry = queue_pages_pte_range,
.test_walk = queue_pages_test_walk,
};
/*
* Walk through page tables and collect pages to be migrated.
*
* If pages found in a given range are on a set of nodes (determined by
* @nodes and @flags,) it's isolated and queued to the pagelist which is
* passed via @private.
*
* queue_pages_range() has three possible return values:
* 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* 0 - queue pages successfully or no misplaced page.
* errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
* memory range specified by nodemask and maxnode points outside
* your accessible address space (-EFAULT)
*/
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
nodemask_t *nodes, unsigned long flags,
struct list_head *pagelist)
{
int err;
struct queue_pages qp = {
.pagelist = pagelist,
.flags = flags,
.nmask = nodes,
.start = start,
.end = end,
.first = NULL,
};
err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
if (!qp.first)
/* whole range in hole */
err = -EFAULT;
return err;
}
/*
* Apply policy to a single VMA
* This must be called with the mmap_lock held for writing.
*/
static int vma_replace_policy(struct vm_area_struct *vma,
struct mempolicy *pol)
{
int err;
struct mempolicy *old;
struct mempolicy *new;
pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
vma->vm_ops, vma->vm_file,
vma->vm_ops ? vma->vm_ops->set_policy : NULL);
new = mpol_dup(pol);
if (IS_ERR(new))
return PTR_ERR(new);
if (vma->vm_ops && vma->vm_ops->set_policy) {
err = vma->vm_ops->set_policy(vma, new);
if (err)
goto err_out;
}
old = vma->vm_policy;
vma->vm_policy = new; /* protected by mmap_lock */
mpol_put(old);
return 0;
err_out:
mpol_put(new);
return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
pgoff_t pgoff;
unsigned long vmstart;
unsigned long vmend;
vma = find_vma(mm, start);
VM_BUG_ON(!vma);
prev = vma->vm_prev;
if (start > vma->vm_start)
prev = vma;
for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
if (mpol_equal(vma_policy(vma), new_pol))
continue;
pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
new_pol, vma->vm_userfaultfd_ctx);
if (prev) {
vma = prev;
goto replace;
}
if (vma->vm_start != vmstart) {
err = split_vma(vma->vm_mm, vma, vmstart, 1);
if (err)
goto out;
}
if (vma->vm_end != vmend) {
err = split_vma(vma->vm_mm, vma, vmend, 0);
if (err)
goto out;
}
replace:
err = vma_replace_policy(vma, new_pol);
if (err)
goto out;
}
out:
return err;
}
/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *new, *old;
NODEMASK_SCRATCH(scratch);
int ret;
if (!scratch)
return -ENOMEM;
new = mpol_new(mode, flags, nodes);
if (IS_ERR(new)) {
ret = PTR_ERR(new);
goto out;
}
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
mpol_put(new);
goto out;
}
task_lock(current);
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
current->il_prev = MAX_NUMNODES-1;
task_unlock(current);
mpol_put(old);
ret = 0;
out:
NODEMASK_SCRATCH_FREE(scratch);
return ret;
}
/*
* Return nodemask for policy for get_mempolicy() query
*
* Called with task's alloc_lock held
*/
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
nodes_clear(*nodes);
if (p == &default_policy)
return;
switch (p->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
*nodes = p->nodes;
break;
case MPOL_LOCAL:
/* return empty node mask for local allocation */
break;
default:
BUG();
}
}
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
struct page *p = NULL;
int err;
int locked = 1;
err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
if (err > 0) {
err = page_to_nid(p);
put_page(p);
}
if (locked)
mmap_read_unlock(mm);
return err;
}
/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
unsigned long addr, unsigned long flags)
{
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
if (flags & MPOL_F_MEMS_ALLOWED) {
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
*policy = 0; /* just so it's initialized */
task_lock(current);
*nmask = cpuset_current_mems_allowed;
task_unlock(current);
return 0;
}
if (flags & MPOL_F_ADDR) {
/*
* Do NOT fall back to task policy if the
* vma/shared policy at addr is NULL. We
* want to return MPOL_DEFAULT in this case.
*/
mmap_read_lock(mm);
vma = vma_lookup(mm, addr);
if (!vma) {
mmap_read_unlock(mm);
return -EFAULT;
}
if (vma->vm_ops && vma->vm_ops->get_policy)
pol = vma->vm_ops->get_policy(vma, addr);
else
pol = vma->vm_policy;
} else if (addr)
return -EINVAL;
if (!pol)
pol = &default_policy; /* indicates default behavior */
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
/*
* Take a refcount on the mpol, lookup_node()
* will drop the mmap_lock, so after calling
* lookup_node() only "pol" remains valid, "vma"
* is stale.
*/
pol_refcount = pol;
vma = NULL;
mpol_get(pol);
err = lookup_node(mm, addr);
if (err < 0)
goto out;
*policy = err;
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
*policy = next_node_in(current->il_prev, pol->nodes);
} else {
err = -EINVAL;
goto out;
}
} else {
*policy = pol == &default_policy ? MPOL_DEFAULT :
pol->mode;
/*
* Internal mempolicy flags must be masked off before exposing
* the policy to userspace.
*/
*policy |= (pol->flags & MPOL_MODE_FLAGS);
}
err = 0;
if (nmask) {
if (mpol_store_user_nodemask(pol)) {
*nmask = pol->w.user_nodemask;
} else {
task_lock(current);
get_policy_nodemask(pol, nmask);
task_unlock(current);
}
}
out:
mpol_cond_put(pol);
if (vma)
mmap_read_unlock(mm);
if (pol_refcount)
mpol_put(pol_refcount);
return err;
}
#ifdef CONFIG_MIGRATION
/*
* page migration, thp tail pages can be passed.
*/
static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
struct page *head = compound_head(page);
/*
* Avoid migrating a page that is shared with others.
*/
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
if (!isolate_lru_page(head)) {
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON + page_is_file_lru(head),
thp_nr_pages(head));
} else if (flags & MPOL_MF_STRICT) {
/*
* Non-movable page may reach here. And, there may be
* temporary off LRU pages or non-LRU movable pages.
* Treat them as unmovable pages since they can't be
* isolated, so they can't be moved at the moment. It
* should return -EIO for this case too.
*/
return -EIO;
}
}
return 0;
}
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
*/
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
int flags)
{
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
struct migration_target_control mtc = {
.nid = dest,
.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
};
nodes_clear(nmask);
node_set(source, nmask);
/*
* This does not "check" the range but isolates all pages that
* need migration. Between passing in the full user address
* space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
*/
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, alloc_migration_target, NULL,
(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
if (err)
putback_movable_pages(&pagelist);
}
return err;
}
/*
* Move pages between the two nodesets so as to preserve the physical
* layout as much as possible.
*
* Returns the number of page that could not be moved.
*/
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
int busy = 0;
int err = 0;
nodemask_t tmp;
lru_cache_disable();
mmap_read_lock(mm);
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
* bit in 'tmp', and return that <source, dest> pair for migration.
* The pair of nodemasks 'to' and 'from' define the map.
*
* If no pair of bits is found that way, fallback to picking some
* pair of 'source' and 'dest' bits that are not the same. If the
* 'source' and 'dest' bits are the same, this represents a node
* that will be migrating to itself, so no pages need move.
*
* If no bits are left in 'tmp', or if all remaining bits left
* in 'tmp' correspond to the same bit in 'to', return false
* (nothing left to migrate).
*
* This lets us pick a pair of nodes to migrate between, such that
* if possible the dest node is not already occupied by some other
* source node, minimizing the risk of overloading the memory on a
* node that would happen if we migrated incoming memory to a node
* before migrating outgoing memory source that same node.
*
* A single scan of tmp is sufficient. As we go, we remember the
* most recent <s, d> pair that moved (s != d). If we find a pair
* that not only moved, but what's better, moved to an empty slot
* (d is not set in tmp), then we break out then, with that pair.
* Otherwise when we finish scanning from_tmp, we at least have the
* most recent <s, d> pair that moved. If we get all the way through
* the scan of tmp without finding any node that moved, much less
* moved to an empty node, then there is nothing left worth migrating.
*/
tmp = *from;
while (!nodes_empty(tmp)) {
int s, d;
int source = NUMA_NO_NODE;
int dest = 0;
for_each_node_mask(s, tmp) {
/*
* do_migrate_pages() tries to maintain the relative
* node relationship of the pages established between
* threads and memory areas.
*
* However if the number of source nodes is not equal to
* the number of destination nodes we can not preserve
* this node relative relationship. In that case, skip
* copying memory from a node that is in the destination
* mask.
*
* Example: [2,3,4] -> [3,4,5] moves everything.
* [0-7] - > [3,4,5] moves only 0,1,2,6,7.
*/
if ((nodes_weight(*from) != nodes_weight(*to)) &&
(node_isset(s, *to)))
continue;
d = node_remap(s, *from, *to);
if (s == d)
continue;
source = s; /* Node moved. Memorize */
dest = d;
/* dest not in remaining from nodes? */
if (!node_isset(dest, tmp))
break;
}
if (source == NUMA_NO_NODE)
break;
node_clear(source, tmp);
err = migrate_to_node(mm, source, dest, flags);
if (err > 0)
busy += err;
if (err < 0)
break;
}
mmap_read_unlock(mm);
lru_cache_enable();
if (err < 0)
return err;
return busy;
}
/*
* Allocate a new page for page migration based on vma policy.
* Start by assuming the page is mapped by the same vma as contains @start.
* Search forward from there, if not. N.B., this assumes that the
* list of pages handed to migrate_pages()--which is how we get here--
* is in virtual address order.
*/
static struct page *new_page(struct page *page, unsigned long start)
{
struct vm_area_struct *vma;
unsigned long address;
vma = find_vma(current->mm, start);
while (vma) {
address = page_address_in_vma(page, vma);
if (address != -EFAULT)
break;
vma = vma->vm_next;
}
if (PageHuge(page)) {
return alloc_huge_page_vma(page_hstate(compound_head(page)),
vma, address);
} else if (PageTransHuge(page)) {
struct page *thp;
thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
HPAGE_PMD_ORDER);
if (!thp)
return NULL;
prep_transhuge_page(thp);
return thp;
}
/*
* if !vma, alloc_page_vma() will use task or system default policy
*/
return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
vma, address);
}
#else
static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
return -EIO;
}
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
return -ENOSYS;
}
static struct page *new_page(struct page *page, unsigned long start)
{
return NULL;
}
#endif
static long do_mbind(unsigned long start, unsigned long len,
unsigned short mode, unsigned short mode_flags,
nodemask_t *nmask, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
int err;
int ret;
LIST_HEAD(pagelist);
if (flags & ~(unsigned long)MPOL_MF_VALID)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
if (start & ~PAGE_MASK)
return -EINVAL;
if (mode == MPOL_DEFAULT)
flags &= ~MPOL_MF_STRICT;
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
end = start + len;
if (end < start)
return -EINVAL;
if (end == start)
return 0;
new = mpol_new(mode, mode_flags, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
if (flags & MPOL_MF_LAZY)
new->flags |= MPOL_F_MOF;
/*
* If we are using the default policy then operation
* on discontinuous address spaces is okay after all
*/
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
start, start + len, mode, mode_flags,
nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
lru_cache_disable();
}
{
NODEMASK_SCRATCH(scratch);
if (scratch) {
mmap_write_lock(mm);
err = mpol_set_nodemask(new, nmask, scratch);
if (err)
mmap_write_unlock(mm);
} else
err = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
}
if (err)
goto mpol_out;
ret = queue_pages_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
if (ret < 0) {
err = ret;
goto up_out;
}
err = mbind_range(mm, start, end, new);
if (!err) {
int nr_failed = 0;
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_page, NULL,
start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
if (nr_failed)
putback_movable_pages(&pagelist);
}
if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
err = -EIO;
} else {
up_out:
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
}
mmap_write_unlock(mm);
mpol_out:
mpol_put(new);
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
lru_cache_enable();
return err;
}
/*
* User space interface with variable sized bitmaps for nodelists.
*/
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned long nlongs = BITS_TO_LONGS(maxnode);
int ret;
if (in_compat_syscall())
ret = compat_get_bitmap(mask,
(const compat_ulong_t __user *)nmask,
maxnode);
else
ret = copy_from_user(mask, nmask,
nlongs * sizeof(unsigned long));
if (ret)
return -EFAULT;
if (maxnode % BITS_PER_LONG)
mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
return 0;
}
/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode)
{
--maxnode;
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
return 0;
if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
return -EINVAL;
/*
* When the user specified more nodes than supported just check
* if the non supported part is all zero, one word at a time,
* starting at the end.
*/
while (maxnode > MAX_NUMNODES) {
unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
unsigned long t;
if (get_bitmap(&t, &nmask[maxnode / BITS_PER_LONG], bits))
return -EFAULT;
if (maxnode - bits >= MAX_NUMNODES) {
maxnode -= bits;
} else {
maxnode = MAX_NUMNODES;
t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
}
if (t)
return -EINVAL;
}
return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
bool compat = in_compat_syscall();
if (compat)
nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
return -EINVAL;
if (clear_user((char __user *)mask + nbytes, copy - nbytes))
return -EFAULT;
copy = nbytes;
maxnode = nr_node_ids;
}
if (compat)
return compat_put_bitmap((compat_ulong_t __user *)mask,
nodes_addr(*nodes), maxnode);
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
*flags = *mode & MPOL_MODE_FLAGS;
*mode &= ~MPOL_MODE_FLAGS;
if ((unsigned int)(*mode) >= MPOL_MAX)
return -EINVAL;
if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
if (*flags & MPOL_F_NUMA_BALANCING) {
if (*mode != MPOL_BIND)
return -EINVAL;
*flags |= (MPOL_F_MOF | MPOL_F_MORON);
}
return 0;
}
static long kernel_mbind(unsigned long start, unsigned long len,
unsigned long mode, const unsigned long __user *nmask,
unsigned long maxnode, unsigned int flags)
{
unsigned short mode_flags;
nodemask_t nodes;
int lmode = mode;
int err;
start = untagged_addr(start);
err = sanitize_mpol_flags(&lmode, &mode_flags);
if (err)
return err;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
unsigned long, mode, const unsigned long __user *, nmask,
unsigned long, maxnode, unsigned int, flags)
{
return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned short mode_flags;
nodemask_t nodes;
int lmode = mode;
int err;
err = sanitize_mpol_flags(&lmode, &mode_flags);
if (err)
return err;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
return do_set_mempolicy(lmode, mode_flags, &nodes);
}
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
unsigned long, maxnode)
{
return kernel_set_mempolicy(mode, nmask, maxnode);
}
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
const unsigned long __user *old_nodes,
const unsigned long __user *new_nodes)
{
struct mm_struct *mm = NULL;
struct task_struct *task;
nodemask_t task_nodes;
int err;
nodemask_t *old;
nodemask_t *new;
NODEMASK_SCRATCH(scratch);
if (!scratch)
return -ENOMEM;
old = &scratch->mask1;
new = &scratch->mask2;
err = get_nodes(old, old_nodes, maxnode);
if (err)
goto out;
err = get_nodes(new, new_nodes, maxnode);
if (err)
goto out;
/* Find the mm_struct */
rcu_read_lock();
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
rcu_read_unlock();
err = -ESRCH;
goto out;
}
get_task_struct(task);
err = -EINVAL;
/*
* Check if this process has the right to modify the specified process.
* Use the regular "ptrace_may_access()" checks.
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
err = -EPERM;
goto out_put;
}
rcu_read_unlock();
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out_put;
}
task_nodes = cpuset_mems_allowed(current);
nodes_and(*new, *new, task_nodes);
if (nodes_empty(*new))
goto out_put;
err = security_task_movememory(task);
if (err)
goto out_put;
mm = get_task_mm(task);
put_task_struct(task);
if (!mm) {
err = -EINVAL;
goto out;
}
err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
mmput(mm);
out:
NODEMASK_SCRATCH_FREE(scratch);
return err;
out_put:
put_task_struct(task);
goto out;
}
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
const unsigned long __user *, old_nodes,
const unsigned long __user *, new_nodes)
{
return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
unsigned long __user *nmask,
unsigned long maxnode,
unsigned long addr,
unsigned long flags)
{
int err;
int pval;
nodemask_t nodes;
if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
addr = untagged_addr(addr);
err = do_get_mempolicy(&pval, &nodes, addr, flags);
if (err)
return err;
if (policy && put_user(pval, policy))
return -EFAULT;
if (nmask)
err = copy_nodes_to_user(nmask, maxnode, &nodes);
return err;
}
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
unsigned long __user *, nmask, unsigned long, maxnode,
unsigned long, addr, unsigned long, flags)
{
return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
bool vma_migratable(struct vm_area_struct *vma)
{
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
return false;
/*
* DAX device mappings require predictable access latency, so avoid
* incurring periodic faults.
*/
if (vma_is_dax(vma))
return false;
if (is_vm_hugetlb_page(vma) &&
!hugepage_migration_supported(hstate_vma(vma)))
return false;
/*
* Migration allocates pages in the highest zone. If we cannot
* do so then migration (at least from node to node) is not
* possible.
*/
if (vma->vm_file &&
gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
< policy_zone)
return false;
return true;
}
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
struct mempolicy *pol = NULL;
if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { pol = vma->vm_ops->get_policy(vma, addr); } else if (vma->vm_policy) {
pol = vma->vm_policy;
/*
* shmem_alloc_page() passes MPOL_F_SHARED policy with
* a pseudo vma whose vma->vm_ops=NULL. Take a reference
* count on these policies which will be dropped by
* mpol_cond_put() later
*/
if (mpol_needs_cond_ref(pol))
mpol_get(pol);
}
}
return pol;
}
/*
* get_vma_policy(@vma, @addr)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup
*
* Returns effective policy for a VMA at specified address.
* Falls back to current->mempolicy or system default policy, as necessary.
* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
* count--added by the get_policy() vm_op, as appropriate--to protect against
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
*/
static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
struct mempolicy *pol = __get_vma_policy(vma, addr);
if (!pol)
pol = get_task_policy(current);
return pol;
}
bool vma_policy_mof(struct vm_area_struct *vma)
{
struct mempolicy *pol;
if (vma->vm_ops && vma->vm_ops->get_policy) {
bool ret = false;
pol = vma->vm_ops->get_policy(vma, vma->vm_start);
if (pol && (pol->flags & MPOL_F_MOF))
ret = true;
mpol_cond_put(pol);
return ret;
}
pol = vma->vm_policy;
if (!pol)
pol = get_task_policy(current);
return pol->flags & MPOL_F_MOF;
}
static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
/*
* if policy->nodes has movable memory only,
* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
*
* policy->nodes is intersect with node_states[N_MEMORY].
* so if the following test fails, it implies
* policy->nodes has movable memory only.
*/
if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
dynamic_policy_zone = ZONE_MOVABLE;
return zone >= dynamic_policy_zone;
}
/*
* Return a nodemask representing a mempolicy for filtering nodes for
* page allocation
*/
nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
int mode = policy->mode;
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(mode == MPOL_BIND) &&
apply_policy_zone(policy, gfp_zone(gfp)) &&
cpuset_nodemask_valid_mems_allowed(&policy->nodes))
return &policy->nodes;
if (mode == MPOL_PREFERRED_MANY) return &policy->nodes; return NULL;
}
/*
* Return the preferred node id for 'prefer' mempolicy, and return
* the given id for all other policies.
*
* policy_node() is always coupled with policy_nodemask(), which
* secures the nodemask limit for 'bind' and 'prefer-many' policy.
*/
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
if (policy->mode == MPOL_PREFERRED) {
nd = first_node(policy->nodes);
} else {
/*
* __GFP_THISNODE shouldn't even be used with the bind policy
* because we might easily break the expectation to stay on the
* requested node and not break the policy.
*/
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
}
return nd;
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
unsigned next;
struct task_struct *me = current;
next = next_node_in(me->il_prev, policy->nodes);
if (next < MAX_NUMNODES)
me->il_prev = next;
return next;
}
/*
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
*/
unsigned int mempolicy_slab_node(void)
{
struct mempolicy *policy;
int node = numa_mem_id();
if (!in_task())
return node;
policy = current->mempolicy;
if (!policy)
return node;
switch (policy->mode) {
case MPOL_PREFERRED:
return first_node(policy->nodes);
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
case MPOL_BIND:
case MPOL_PREFERRED_MANY:
{
struct zoneref *z;
/*
* Follow bind policy behavior and start allocation at the
* first node.
*/
struct zonelist *zonelist;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
&policy->nodes);
return z->zone ? zone_to_nid(z->zone) : node;
}
case MPOL_LOCAL:
return node;
default:
BUG();
}
}
/*
* Do static interleaving for a VMA with known offset @n. Returns the n'th
* node in pol->nodes (starting from n=0), wrapping around if n exceeds the
* number of present nodes.
*/
static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
nodemask_t nodemask = pol->nodes;
unsigned int target, nnodes;
int i;
int nid;
/*
* The barrier will stabilize the nodemask in a register or on
* the stack so that it will stop changing under the code.
*
* Between first_node() and next_node(), pol->nodes could be changed
* by other threads. So we put pol->nodes in a local stack.
*/
barrier();
nnodes = nodes_weight(nodemask);
if (!nnodes)
return numa_node_id();
target = (unsigned int)n % nnodes;
nid = first_node(nodemask);
for (i = 0; i < target; i++)
nid = next_node(nid, nodemask);
return nid;
}
/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long addr, int shift)
{
if (vma) {
unsigned long off;
/*
* for small pages, there is no difference between
* shift and PAGE_SHIFT, so the bit-shift is safe.
* for huge pages, since vm_pgoff is in units of small
* pages, we need to shift off the always 0 bits to get
* a useful offset.
*/
BUG_ON(shift < PAGE_SHIFT); off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
off += (addr - vma->vm_start) >> shift;
return offset_il_node(pol, off);
} else
return interleave_nodes(pol);
}
#ifdef CONFIG_HUGETLBFS
/*
* huge_node(@vma, @addr, @gfp_flags, @mpol)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup and interleave policy
* @gfp_flags: for requested zone
* @mpol: pointer to mempolicy pointer for reference counted mempolicy
* @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
*
* Returns a nid suitable for a huge page allocation and a pointer
* to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'bind' or 'prefer-many', returns a pointer
* to the mempolicy's @nodemask for filtering the zonelist.
*
* Must be protected by read_mems_allowed_begin()
*/
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
{
int nid;
int mode;
*mpol = get_vma_policy(vma, addr);
*nodemask = NULL;
mode = (*mpol)->mode;
if (unlikely(mode == MPOL_INTERLEAVE)) {
nid = interleave_nid(*mpol, vma, addr,
huge_page_shift(hstate_vma(vma)));
} else {
nid = policy_node(gfp_flags, *mpol, numa_node_id());
if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
*nodemask = &(*mpol)->nodes;
}
return nid;
}
/*
* init_nodemask_of_mempolicy
*
* If the current task's mempolicy is "default" [NULL], return 'false'
* to indicate default policy. Otherwise, extract the policy nodemask
* for 'bind' or 'interleave' policy into the argument nodemask, or
* initialize the argument nodemask to contain the single node for
* 'preferred' or 'local' policy and return 'true' to indicate presence
* of non-default mempolicy.
*
* We don't bother with reference counting the mempolicy [mpol_get/put]
* because the current task is examining it's own mempolicy and a task's
* mempolicy is only ever changed by the task itself.
*
* N.B., it is the caller's responsibility to free a returned nodemask.
*/
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
struct mempolicy *mempolicy;
if (!(mask && current->mempolicy))
return false;
task_lock(current);
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
*mask = mempolicy->nodes;
break;
case MPOL_LOCAL:
init_nodemask_of_node(mask, numa_node_id());
break;
default:
BUG();
}
task_unlock(current);
return true;
}
#endif
/*
* mempolicy_in_oom_domain
*
* If tsk's mempolicy is "bind", check for intersection between mask and
* the policy nodemask. Otherwise, return true for all other policies
* including "interleave", as a tsk with "interleave" policy may have
* memory allocated from all nodes in system.
*
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
*/
bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask)
{
struct mempolicy *mempolicy;
bool ret = true;
if (!mask)
return ret;
task_lock(tsk);
mempolicy = tsk->mempolicy;
if (mempolicy && mempolicy->mode == MPOL_BIND)
ret = nodes_intersects(mempolicy->nodes, *mask);
task_unlock(tsk);
return ret;
}
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
unsigned nid)
{
struct page *page;
page = __alloc_pages(gfp, order, nid, NULL);
/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
if (!static_branch_likely(&vm_numa_stat_key))
return page;
if (page && page_to_nid(page) == nid) {
preempt_disable();
__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
preempt_enable();
}
return page;
}
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
int nid, struct mempolicy *pol)
{
struct page *page;
gfp_t preferred_gfp;
/*
* This is a two pass approach. The first pass will only try the
* preferred nodes but skip the direct reclaim and allow the
* allocation to fail, while the second pass will try all the
* nodes in system.
*/
preferred_gfp = gfp | __GFP_NOWARN;
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
if (!page)
page = __alloc_pages(gfp, order, numa_node_id(), NULL);
return page;
}
/**
* alloc_pages_vma - Allocate a page for a VMA.
* @gfp: GFP flags.
* @order: Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual address of the allocation. Must be inside @vma.
* @node: Which node to prefer for allocation (modulo policy).
* @hugepage: For hugepages try only the preferred node if possible.
*
* Allocate a page for a specific address in @vma, using the appropriate
* NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
* of the mm_struct of the VMA to prevent it from going away. Should be
* used for all allocations for pages that will be mapped into user space.
*
* Return: The page on success or NULL if allocation fails.
*/
struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node, bool hugepage)
{
struct mempolicy *pol;
struct page *page;
int preferred_nid;
nodemask_t *nmask;
pol = get_vma_policy(vma, addr);
if (pol->mode == MPOL_INTERLEAVE) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
goto out;
}
if (pol->mode == MPOL_PREFERRED_MANY) { page = alloc_pages_preferred_many(gfp, order, node, pol);
mpol_cond_put(pol);
goto out;
}
if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
int hpage_node = node;
/*
* For hugepage allocation and non-interleave policy which
* allows the current node (or other explicitly preferred
* node) we only try to allocate from the current/preferred
* node and don't fall back to other nodes, as the cost of
* remote accesses would likely offset THP benefits.
*
* If the policy is interleave or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
if (pol->mode == MPOL_PREFERRED)
hpage_node = first_node(pol->nodes);
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
mpol_cond_put(pol);
/*
* First, try to allocate THP only on local node, but
* don't reclaim unnecessarily, just compact.
*/
page = __alloc_pages_node(hpage_node,
gfp | __GFP_THISNODE | __GFP_NORETRY, order);
/*
* If hugepage allocations are configured to always
* synchronous compact or the vma has been madvised
* to prefer hugepage backing, retry allowing remote
* memory with both reclaim and compact as well.
*/
if (!page && (gfp & __GFP_DIRECT_RECLAIM))
page = __alloc_pages(gfp, order, hpage_node, nmask);
goto out;
}
}
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
page = __alloc_pages(gfp, order, preferred_nid, nmask);
mpol_cond_put(pol);
out:
return page;}
EXPORT_SYMBOL(alloc_pages_vma);
/**
* alloc_pages - Allocate pages.
* @gfp: GFP flags.
* @order: Power of two of number of pages to allocate.
*
* Allocate 1 << @order contiguous pages. The physical address of the
* first page is naturally aligned (eg an order-3 allocation will be aligned
* to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
* process is honoured when in process context.
*
* Context: Can be called from any context, providing the appropriate GFP
* flags are used.
* Return: The page on success or NULL if allocation fails.
*/
struct page *alloc_pages(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = &default_policy;
struct page *page;
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
pol = get_task_policy(current);
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else if (pol->mode == MPOL_PREFERRED_MANY)
page = alloc_pages_preferred_many(gfp, order,
numa_node_id(), pol);
else
page = __alloc_pages(gfp, order,
policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
return page;
}
EXPORT_SYMBOL(alloc_pages);
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
struct mempolicy *pol = mpol_dup(vma_policy(src));
if (IS_ERR(pol))
return PTR_ERR(pol);
dst->vm_policy = pol;
return 0;
}
/*
* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
* rebinds the mempolicy its copying by calling mpol_rebind_policy()
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
*
* current's mempolicy may be rebinded by the other task(the task that changes
* cpuset's mems), so we needn't do rebind work for current task.
*/
/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!new)
return ERR_PTR(-ENOMEM);
/* task's mempolicy is protected by alloc_lock */
if (old == current->mempolicy) {
task_lock(current);
*new = *old;
task_unlock(current);
} else
*new = *old;
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
mpol_rebind_policy(new, &mems);
}
atomic_set(&new->refcnt, 1);
return new;
}
/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
if (!a || !b)
return false;
if (a->mode != b->mode)
return false;
if (a->flags != b->flags)
return false;
if (mpol_store_user_nodemask(a))
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
return false;
switch (a->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
return !!nodes_equal(a->nodes, b->nodes);
case MPOL_LOCAL:
return true;
default:
BUG();
return false;
}
}
/*
* Shared memory backing store policy support.
*
* Remember policies even when nobody has shared memory mapped.
* The policies are kept in Red-Black tree linked from the inode.
* They are protected by the sp->lock rwlock, which should be held
* for any accesses to the tree.
*/
/*
* lookup first element intersecting start-end. Caller holds sp->lock for
* reading or for writing
*/
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
struct rb_node *n = sp->root.rb_node;
while (n) {
struct sp_node *p = rb_entry(n, struct sp_node, nd);
if (start >= p->end)
n = n->rb_right;
else if (end <= p->start)
n = n->rb_left;
else
break;
}
if (!n)
return NULL;
for (;;) {
struct sp_node *w = NULL;
struct rb_node *prev = rb_prev(n);
if (!prev)
break;
w = rb_entry(prev, struct sp_node, nd);
if (w->end <= start)
break;
n = prev;
}
return rb_entry(n, struct sp_node, nd);
}
/*
* Insert a new shared policy into the list. Caller holds sp->lock for
* writing.
*/
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
struct rb_node **p = &sp->root.rb_node;
struct rb_node *parent = NULL;
struct sp_node *nd;
while (*p) {
parent = *p;
nd = rb_entry(parent, struct sp_node, nd);
if (new->start < nd->start)
p = &(*p)->rb_left;
else if (new->end > nd->end)
p = &(*p)->rb_right;
else
BUG();
}
rb_link_node(&new->nd, parent, p);
rb_insert_color(&new->nd, &sp->root);
pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
new->policy ? new->policy->mode : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
struct mempolicy *pol = NULL;
struct sp_node *sn;
if (!sp->root.rb_node)
return NULL;
read_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
read_unlock(&sp->lock);
return pol;
}
static void sp_free(struct sp_node *n)
{
mpol_put(n->policy);
kmem_cache_free(sn_cache, n);
}
/**
* mpol_misplaced - check whether current page node is valid in policy
*
* @page: page to be checked
* @vma: vm area where page mapped
* @addr: virtual address where page mapped
*
* Lookup current policy node id for vma,addr and "compare to" page's
* node id. Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*
* Return: NUMA_NO_NODE if the page is in a node that is valid for this
* policy, or a suitable node ID to allocate a replacement page from.
*/
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol;
struct zoneref *z;
int curnid = page_to_nid(page);
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
int polnid = NUMA_NO_NODE;
int ret = NUMA_NO_NODE;
pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
goto out;
switch (pol->mode) {
case MPOL_INTERLEAVE:
pgoff = vma->vm_pgoff;
pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
polnid = offset_il_node(pol, pgoff);
break;
case MPOL_PREFERRED:
if (node_isset(curnid, pol->nodes))
goto out;
polnid = first_node(pol->nodes);
break;
case MPOL_LOCAL:
polnid = numa_node_id();
break;
case MPOL_BIND:
/* Optimize placement among multiple nodes via NUMA balancing */
if (pol->flags & MPOL_F_MORON) {
if (node_isset(thisnid, pol->nodes))
break;
goto out;
}
fallthrough;
case MPOL_PREFERRED_MANY:
/*
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
*/
if (node_isset(curnid, pol->nodes))
goto out;
z = first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->nodes);
polnid = zone_to_nid(z->zone);
break;
default:
BUG();
}
/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
polnid = thisnid;
if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
goto out;
}
if (curnid != polnid)
ret = polnid;
out:
mpol_cond_put(pol);
return ret;
}
/*
* Drop the (possibly final) reference to task->mempolicy. It needs to be
* dropped after task->mempolicy is set to NULL so that any allocation done as
* part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
* policy.
*/
void mpol_put_task_policy(struct task_struct *task)
{
struct mempolicy *pol;
task_lock(task);
pol = task->mempolicy;
task->mempolicy = NULL;
task_unlock(task);
mpol_put(pol);
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
sp_free(n);
}
static void sp_node_init(struct sp_node *node, unsigned long start,
unsigned long end, struct mempolicy *pol)
{
node->start = start;
node->end = end;
node->policy = pol;
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
{
struct sp_node *n;
struct mempolicy *newpol;
n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n)
return NULL;
newpol = mpol_dup(pol);
if (IS_ERR(newpol)) {
kmem_cache_free(sn_cache, n);
return NULL;
}
newpol->flags |= MPOL_F_SHARED;
sp_node_init(n, start, end, newpol);
return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
struct sp_node *n;
struct sp_node *n_new = NULL;
struct mempolicy *mpol_new = NULL;
int ret = 0;
restart:
write_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
struct rb_node *next = rb_next(&n->nd);
if (n->start >= start) {
if (n->end <= end)
sp_delete(sp, n);
else
n->start = end;
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
if (!n_new)
goto alloc_new;
*mpol_new = *n->policy;
atomic_set(&mpol_new->refcnt, 1);
sp_node_init(n_new, end, n->end, mpol_new);
n->end = start;
sp_insert(sp, n_new);
n_new = NULL;
mpol_new = NULL;
break;
} else
n->end = start;
}
if (!next)
break;
n = rb_entry(next, struct sp_node, nd);
}
if (new)
sp_insert(sp, new);
write_unlock(&sp->lock);
ret = 0;
err_out:
if (mpol_new)
mpol_put(mpol_new);
if (n_new)
kmem_cache_free(sn_cache, n_new);
return ret;
alloc_new:
write_unlock(&sp->lock);
ret = -ENOMEM;
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n_new)
goto err_out;
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
atomic_set(&mpol_new->refcnt, 1);
goto restart;
}
/**
* mpol_shared_policy_init - initialize shared policy for inode
* @sp: pointer to inode shared policy
* @mpol: struct mempolicy to install
*
* Install non-NULL @mpol in inode's shared policy rb-tree.
* On entry, the current task has a reference on a non-NULL @mpol.
* This must be released on exit.
* This is called at get_inode() calls and we can use GFP_KERNEL.
*/
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
rwlock_init(&sp->lock);
if (mpol) {
struct vm_area_struct pvma;
struct mempolicy *new;
NODEMASK_SCRATCH(scratch);
if (!scratch)
goto put_mpol;
/* contextualize the tmpfs mount point mempolicy */
new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
if (IS_ERR(new))
goto free_scratch; /* no valid nodemask intersection */
task_lock(current);
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
task_unlock(current);
if (ret)
goto put_new;
/* Create pseudo-vma that contains just the policy */
vma_init(&pvma, NULL);
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
put_new:
mpol_put(new); /* drop initial ref */
free_scratch:
NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
mpol_put(mpol); /* drop our incoming ref on sb mpol */
}
}
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma, struct mempolicy *npol)
{
int err;
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);
pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
vma->vm_pgoff,
sz, npol ? npol->mode : -1,
npol ? npol->flags : -1,
npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
if (!new)
return -ENOMEM;
}
err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
if (err && new)
sp_free(new);
return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
struct sp_node *n;
struct rb_node *next;
if (!p->root.rb_node)
return;
write_lock(&p->lock);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
sp_delete(p, n);
}
write_unlock(&p->lock);
}
#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;
static void __init check_numabalancing_enable(void)
{
bool numabalancing_default = false;
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
numabalancing_default = true;
/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
if (numabalancing_override)
set_numabalancing_state(numabalancing_override == 1);
if (num_online_nodes() > 1 && !numabalancing_override) {
pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
}
static int __init setup_numabalancing(char *str)
{
int ret = 0;
if (!str)
goto out;
if (!strcmp(str, "enable")) {
numabalancing_override = 1;
ret = 1;
} else if (!strcmp(str, "disable")) {
numabalancing_override = -1;
ret = 1;
}
out:
if (!ret)
pr_warn("Unable to parse numa_balancing=\n");
return ret;
}
__setup("numa_balancing=", setup_numabalancing);
#else
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
nodemask_t interleave_nodes;
unsigned long largest = 0;
int nid, prefer = 0;
policy_cache = kmem_cache_create("numa_policy",
sizeof(struct mempolicy),
0, SLAB_PANIC, NULL);
sn_cache = kmem_cache_create("shared_policy_node",
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
for_each_node(nid) {
preferred_node_policy[nid] = (struct mempolicy) {
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
.nodes = nodemask_of_node(nid),
};
}
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
* fall back to the largest node if they're all smaller.
*/
nodes_clear(interleave_nodes);
for_each_node_state(nid, N_MEMORY) {
unsigned long total_pages = node_present_pages(nid);
/* Preserve the largest node */
if (largest < total_pages) {
largest = total_pages;
prefer = nid;
}
/* Interleave this node? */
if ((total_pages << PAGE_SHIFT) >= (16 << 20))
node_set(nid, interleave_nodes);
}
/* All too small, use the largest */
if (unlikely(nodes_empty(interleave_nodes)))
node_set(prefer, interleave_nodes);
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
pr_err("%s: interleaving failed\n", __func__);
check_numabalancing_enable();
}
/* Reset policy of current process to default */
void numa_default_policy(void)
{
do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
/*
* Parse and format mempolicy from/to strings
*/
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
[MPOL_LOCAL] = "local",
[MPOL_PREFERRED_MANY] = "prefer (many)",
};
#ifdef CONFIG_TMPFS
/**
* mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
* @str: string containing mempolicy to parse
* @mpol: pointer to struct mempolicy pointer, returned on success.
*
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
* On success, returns 0, else 1
*/
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
struct mempolicy *new = NULL;
unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
int err = 1, mode;
if (flags)
*flags++ = '\0'; /* terminate mode string */
if (nodelist) {
/* NUL-terminate mode or flags string */
*nodelist++ = '\0';
if (nodelist_parse(nodelist, nodes))
goto out;
if (!nodes_subset(nodes, node_states[N_MEMORY]))
goto out;
} else
nodes_clear(nodes);
mode = match_string(policy_modes, MPOL_MAX, str);
if (mode < 0)
goto out;
switch (mode) {
case MPOL_PREFERRED:
/*
* Insist on a nodelist of one node only, although later
* we use first_node(nodes) to grab a single node, so here
* nodelist (or nodes) cannot be empty.
*/
if (nodelist) {
char *rest = nodelist;
while (isdigit(*rest))
rest++;
if (*rest)
goto out;
if (nodes_empty(nodes))
goto out;
}
break;
case MPOL_INTERLEAVE:
/*
* Default to online nodes with memory if no nodelist
*/
if (!nodelist)
nodes = node_states[N_MEMORY];
break;
case MPOL_LOCAL:
/*
* Don't allow a nodelist; mpol_new() checks flags
*/
if (nodelist)
goto out;
break;
case MPOL_DEFAULT:
/*
* Insist on a empty nodelist
*/
if (!nodelist)
err = 0;
goto out;
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
/*
* Insist on a nodelist
*/
if (!nodelist)
goto out;
}
mode_flags = 0;
if (flags) {
/*
* Currently, we only support two mutually exclusive
* mode flags.
*/
if (!strcmp(flags, "static"))
mode_flags |= MPOL_F_STATIC_NODES;
else if (!strcmp(flags, "relative"))
mode_flags |= MPOL_F_RELATIVE_NODES;
else
goto out;
}
new = mpol_new(mode, mode_flags, &nodes);
if (IS_ERR(new))
goto out;
/*
* Save nodes for mpol_to_str() to show the tmpfs mount options
* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
*/
if (mode != MPOL_PREFERRED) {
new->nodes = nodes;
} else if (nodelist) {
nodes_clear(new->nodes);
node_set(first_node(nodes), new->nodes);
} else {
new->mode = MPOL_LOCAL;
}
/*
* Save nodes for contextualization: this will be used to "clone"
* the mempolicy in a specific context [cpuset] at a later time.
*/
new->w.user_nodemask = nodes;
err = 0;
out:
/* Restore string for error message */
if (nodelist)
*--nodelist = ':';
if (flags)
*--flags = '=';
if (!err)
*mpol = new;
return err;
}
#endif /* CONFIG_TMPFS */
/**
* mpol_to_str - format a mempolicy structure for printing
* @buffer: to contain formatted mempolicy string
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
*
* Convert @pol into a string. If @buffer is too short, truncate the string.
* Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
* longest flag, "relative", and to display at least a few node ids.
*/
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
nodemask_t nodes = NODE_MASK_NONE;
unsigned short mode = MPOL_DEFAULT;
unsigned short flags = 0;
if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
mode = pol->mode;
flags = pol->flags;
}
switch (mode) {
case MPOL_DEFAULT:
case MPOL_LOCAL:
break;
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
nodes = pol->nodes;
break;
default:
WARN_ON_ONCE(1);
snprintf(p, maxlen, "unknown");
return;
}
p += snprintf(p, maxlen, "%s", policy_modes[mode]);
if (flags & MPOL_MODE_FLAGS) {
p += snprintf(p, buffer + maxlen - p, "=");
/*
* Currently, the only defined flags are mutually exclusive
*/
if (flags & MPOL_F_STATIC_NODES)
p += snprintf(p, buffer + maxlen - p, "static");
else if (flags & MPOL_F_RELATIVE_NODES)
p += snprintf(p, buffer + maxlen - p, "relative");
}
if (!nodes_empty(nodes))
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
bool numa_demotion_enabled = false;
#ifdef CONFIG_SYSFS
static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%s\n",
numa_demotion_enabled? "true" : "false");
}
static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
numa_demotion_enabled = true;
else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
numa_demotion_enabled = false;
else
return -EINVAL;
return count;
}
static struct kobj_attribute numa_demotion_enabled_attr =
__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
numa_demotion_enabled_store);
static struct attribute *numa_attrs[] = {
&numa_demotion_enabled_attr.attr,
NULL,
};
static const struct attribute_group numa_attr_group = {
.attrs = numa_attrs,
};
static int __init numa_init_sysfs(void)
{
int err;
struct kobject *numa_kobj;
numa_kobj = kobject_create_and_add("numa", mm_kobj);
if (!numa_kobj) {
pr_err("failed to create numa kobject\n");
return -ENOMEM;
}
err = sysfs_create_group(numa_kobj, &numa_attr_group);
if (err) {
pr_err("failed to register numa group\n");
goto delete_obj;
}
return 0;
delete_obj:
kobject_put(numa_kobj);
return err;
}
subsys_initcall(numa_init_sysfs);
#endif
// SPDX-License-Identifier: GPL-2.0-or-later
/* Common capabilities, needed by capability.o.
*/
#include <linux/capability.h>
#include <linux/audit.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/lsm_hooks.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/ptrace.h>
#include <linux/xattr.h>
#include <linux/hugetlb.h>
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/prctl.h>
#include <linux/securebits.h>
#include <linux/user_namespace.h>
#include <linux/binfmts.h>
#include <linux/personality.h>
/*
* If a non-root user executes a setuid-root binary in
* !secure(SECURE_NOROOT) mode, then we raise capabilities.
* However if fE is also set, then the intent is for only
* the file capabilities to be applied, and the setuid-root
* bit is left on either to change the uid (plausible) or
* to get full privilege on a kernel without file capabilities
* support. So in that case we do not raise capabilities.
*
* Warn if that happens, once per boot.
*/
static void warn_setuid_and_fcaps_mixed(const char *fname)
{
static int warned;
if (!warned) {
printk(KERN_INFO "warning: `%s' has both setuid-root and"
" effective capabilities. Therefore not raising all"
" capabilities.\n", fname);
warned = 1;
}
}
/**
* cap_capable - Determine whether a task has a particular effective capability
* @cred: The credentials to use
* @targ_ns: The user namespace in which we need the capability
* @cap: The capability to check for
* @opts: Bitmask of options defined in include/linux/security.h
*
* Determine whether the nominated task has the specified capability amongst
* its effective set, returning 0 if it does, -ve if it does not.
*
* NOTE WELL: cap_has_capability() cannot be used like the kernel's capable()
* and has_capability() functions. That is, it has the reverse semantics:
* cap_has_capability() returns 0 when a task has a capability, but the
* kernel's capable() and has_capability() returns 1 for this case.
*/
int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
int cap, unsigned int opts)
{
struct user_namespace *ns = targ_ns;
/* See if cred has the capability in the target user namespace
* by examining the target user namespace and all of the target
* user namespace's parents.
*/
for (;;) {
/* Do we have the necessary capabilities? */
if (ns == cred->user_ns) return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
/*
* If we're already at a lower level than we're looking for,
* we're done searching.
*/
if (ns->level <= cred->user_ns->level)
return -EPERM;
/*
* The owner of the user namespace in the parent of the
* user namespace has all caps.
*/
if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
return 0;
/*
* If you have a capability in a parent user ns, then you have
* it over all children user namespaces as well.
*/
ns = ns->parent;
}
/* We never get here */
}
/**
* cap_settime - Determine whether the current process may set the system clock
* @ts: The time to set
* @tz: The timezone to set
*
* Determine whether the current process may set the system clock and timezone
* information, returning 0 if permission granted, -ve if denied.
*/
int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
{
if (!capable(CAP_SYS_TIME))
return -EPERM;
return 0;
}
/**
* cap_ptrace_access_check - Determine whether the current process may access
* another
* @child: The process to be accessed
* @mode: The mode of attachment.
*
* If we are in the same or an ancestor user_ns and have all the target
* task's capabilities, then ptrace access is allowed.
* If we have the ptrace capability to the target user_ns, then ptrace
* access is allowed.
* Else denied.
*
* Determine whether a process may access another, returning 0 if permission
* granted, -ve if denied.
*/
int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
int ret = 0;
const struct cred *cred, *child_cred;
const kernel_cap_t *caller_caps;
rcu_read_lock();
cred = current_cred();
child_cred = __task_cred(child);
if (mode & PTRACE_MODE_FSCREDS)
caller_caps = &cred->cap_effective;
else
caller_caps = &cred->cap_permitted;
if (cred->user_ns == child_cred->user_ns &&
cap_issubset(child_cred->cap_permitted, *caller_caps))
goto out;
if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE))
goto out;
ret = -EPERM;
out:
rcu_read_unlock();
return ret;
}
/**
* cap_ptrace_traceme - Determine whether another process may trace the current
* @parent: The task proposed to be the tracer
*
* If parent is in the same or an ancestor user_ns and has all current's
* capabilities, then ptrace access is allowed.
* If parent has the ptrace capability to current's user_ns, then ptrace
* access is allowed.
* Else denied.
*
* Determine whether the nominated task is permitted to trace the current
* process, returning 0 if permission is granted, -ve if denied.
*/
int cap_ptrace_traceme(struct task_struct *parent)
{
int ret = 0;
const struct cred *cred, *child_cred;
rcu_read_lock();
cred = __task_cred(parent);
child_cred = current_cred();
if (cred->user_ns == child_cred->user_ns &&
cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
goto out;
if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE))
goto out;
ret = -EPERM;
out:
rcu_read_unlock();
return ret;
}
/**
* cap_capget - Retrieve a task's capability sets
* @target: The task from which to retrieve the capability sets
* @effective: The place to record the effective set
* @inheritable: The place to record the inheritable set
* @permitted: The place to record the permitted set
*
* This function retrieves the capabilities of the nominated task and returns
* them to the caller.
*/
int cap_capget(struct task_struct *target, kernel_cap_t *effective,
kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
const struct cred *cred;
/* Derived from kernel/capability.c:sys_capget. */
rcu_read_lock();
cred = __task_cred(target);
*effective = cred->cap_effective;
*inheritable = cred->cap_inheritable;
*permitted = cred->cap_permitted;
rcu_read_unlock();
return 0;
}
/*
* Determine whether the inheritable capabilities are limited to the old
* permitted set. Returns 1 if they are limited, 0 if they are not.
*/
static inline int cap_inh_is_capped(void)
{
/* they are so limited unless the current task has the CAP_SETPCAP
* capability
*/
if (cap_capable(current_cred(), current_cred()->user_ns,
CAP_SETPCAP, CAP_OPT_NONE) == 0)
return 0;
return 1;
}
/**
* cap_capset - Validate and apply proposed changes to current's capabilities
* @new: The proposed new credentials; alterations should be made here
* @old: The current task's current credentials
* @effective: A pointer to the proposed new effective capabilities set
* @inheritable: A pointer to the proposed new inheritable capabilities set
* @permitted: A pointer to the proposed new permitted capabilities set
*
* This function validates and applies a proposed mass change to the current
* process's capability sets. The changes are made to the proposed new
* credentials, and assuming no error, will be committed by the caller of LSM.
*/
int cap_capset(struct cred *new,
const struct cred *old,
const kernel_cap_t *effective,
const kernel_cap_t *inheritable,
const kernel_cap_t *permitted)
{
if (cap_inh_is_capped() &&
!cap_issubset(*inheritable,
cap_combine(old->cap_inheritable,
old->cap_permitted)))
/* incapable of using this inheritable set */
return -EPERM;
if (!cap_issubset(*inheritable,
cap_combine(old->cap_inheritable,
old->cap_bset)))
/* no new pI capabilities outside bounding set */
return -EPERM;
/* verify restrictions on target's new Permitted set */
if (!cap_issubset(*permitted, old->cap_permitted))
return -EPERM;
/* verify the _new_Effective_ is a subset of the _new_Permitted_ */
if (!cap_issubset(*effective, *permitted))
return -EPERM;
new->cap_effective = *effective;
new->cap_inheritable = *inheritable;
new->cap_permitted = *permitted;
/*
* Mask off ambient bits that are no longer both permitted and
* inheritable.
*/
new->cap_ambient = cap_intersect(new->cap_ambient,
cap_intersect(*permitted,
*inheritable));
if (WARN_ON(!cap_ambient_invariant_ok(new)))
return -EINVAL;
return 0;
}
/**
* cap_inode_need_killpriv - Determine if inode change affects privileges
* @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV
*
* Determine if an inode having a change applied that's marked ATTR_KILL_PRIV
* affects the security markings on that inode, and if it is, should
* inode_killpriv() be invoked or the change rejected.
*
* Return: 1 if security.capability has a value, meaning inode_killpriv()
* is required, 0 otherwise, meaning inode_killpriv() is not required.
*/
int cap_inode_need_killpriv(struct dentry *dentry)
{
struct inode *inode = d_backing_inode(dentry);
int error;
error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0);
return error > 0;
}
/**
* cap_inode_killpriv - Erase the security markings on an inode
*
* @mnt_userns: user namespace of the mount the inode was found from
* @dentry: The inode/dentry to alter
*
* Erase the privilege-enhancing security markings on an inode.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs init_user_ns.
*
* Return: 0 if successful, -ve on error.
*/
int cap_inode_killpriv(struct user_namespace *mnt_userns, struct dentry *dentry)
{
int error;
error = __vfs_removexattr(mnt_userns, dentry, XATTR_NAME_CAPS);
if (error == -EOPNOTSUPP)
error = 0;
return error;
}
static bool rootid_owns_currentns(kuid_t kroot)
{
struct user_namespace *ns;
if (!uid_valid(kroot))
return false;
for (ns = current_user_ns(); ; ns = ns->parent) {
if (from_kuid(ns, kroot) == 0)
return true;
if (ns == &init_user_ns)
break;
}
return false;
}
static __u32 sansflags(__u32 m)
{
return m & ~VFS_CAP_FLAGS_EFFECTIVE;
}
static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
{
if (size != XATTR_CAPS_SZ_2)
return false;
return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
}
static bool is_v3header(size_t size, const struct vfs_cap_data *cap)
{
if (size != XATTR_CAPS_SZ_3)
return false;
return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
}
/*
* getsecurity: We are called for security.* before any attempt to read the
* xattr from the inode itself.
*
* This gives us a chance to read the on-disk value and convert it. If we
* return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
*
* Note we are not called by vfs_getxattr_alloc(), but that is only called
* by the integrity subsystem, which really wants the unconverted values -
* so that's good.
*/
int cap_inode_getsecurity(struct user_namespace *mnt_userns,
struct inode *inode, const char *name, void **buffer,
bool alloc)
{
int size, ret;
kuid_t kroot;
u32 nsmagic, magic;
uid_t root, mappedroot;
char *tmpbuf = NULL;
struct vfs_cap_data *cap;
struct vfs_ns_cap_data *nscap = NULL;
struct dentry *dentry;
struct user_namespace *fs_ns;
if (strcmp(name, "capability") != 0)
return -EOPNOTSUPP;
dentry = d_find_any_alias(inode);
if (!dentry)
return -EINVAL;
size = sizeof(struct vfs_ns_cap_data);
ret = (int)vfs_getxattr_alloc(mnt_userns, dentry, XATTR_NAME_CAPS,
&tmpbuf, size, GFP_NOFS);
dput(dentry);
if (ret < 0 || !tmpbuf)
return ret;
fs_ns = inode->i_sb->s_user_ns;
cap = (struct vfs_cap_data *) tmpbuf;
if (is_v2header((size_t) ret, cap)) {
root = 0;
} else if (is_v3header((size_t) ret, cap)) {
nscap = (struct vfs_ns_cap_data *) tmpbuf;
root = le32_to_cpu(nscap->rootid);
} else {
size = -EINVAL;
goto out_free;
}
kroot = make_kuid(fs_ns, root);
/* If this is an idmapped mount shift the kuid. */
kroot = kuid_into_mnt(mnt_userns, kroot);
/* If the root kuid maps to a valid uid in current ns, then return
* this as a nscap. */
mappedroot = from_kuid(current_user_ns(), kroot);
if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
size = sizeof(struct vfs_ns_cap_data);
if (alloc) {
if (!nscap) {
/* v2 -> v3 conversion */
nscap = kzalloc(size, GFP_ATOMIC);
if (!nscap) {
size = -ENOMEM;
goto out_free;
}
nsmagic = VFS_CAP_REVISION_3;
magic = le32_to_cpu(cap->magic_etc);
if (magic & VFS_CAP_FLAGS_EFFECTIVE)
nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
nscap->magic_etc = cpu_to_le32(nsmagic);
} else {
/* use allocated v3 buffer */
tmpbuf = NULL;
}
nscap->rootid = cpu_to_le32(mappedroot);
*buffer = nscap;
}
goto out_free;
}
if (!rootid_owns_currentns(kroot)) {
size = -EOVERFLOW;
goto out_free;
}
/* This comes from a parent namespace. Return as a v2 capability */
size = sizeof(struct vfs_cap_data);
if (alloc) {
if (nscap) {
/* v3 -> v2 conversion */
cap = kzalloc(size, GFP_ATOMIC);
if (!cap) {
size = -ENOMEM;
goto out_free;
}
magic = VFS_CAP_REVISION_2;
nsmagic = le32_to_cpu(nscap->magic_etc);
if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
magic |= VFS_CAP_FLAGS_EFFECTIVE;
memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
cap->magic_etc = cpu_to_le32(magic);
} else {
/* use unconverted v2 */
tmpbuf = NULL;
}
*buffer = cap;
}
out_free:
kfree(tmpbuf);
return size;
}
/**
* rootid_from_xattr - translate root uid of vfs caps
*
* @value: vfs caps value which may be modified by this function
* @size: size of @ivalue
* @task_ns: user namespace of the caller
* @mnt_userns: user namespace of the mount the inode was found from
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs init_user_ns.
*/
static kuid_t rootid_from_xattr(const void *value, size_t size,
struct user_namespace *task_ns,
struct user_namespace *mnt_userns)
{
const struct vfs_ns_cap_data *nscap = value;
kuid_t rootkid;
uid_t rootid = 0;
if (size == XATTR_CAPS_SZ_3)
rootid = le32_to_cpu(nscap->rootid);
rootkid = make_kuid(task_ns, rootid);
return kuid_from_mnt(mnt_userns, rootkid);
}
static bool validheader(size_t size, const struct vfs_cap_data *cap)
{
return is_v2header(size, cap) || is_v3header(size, cap);
}
/**
* cap_convert_nscap - check vfs caps
*
* @mnt_userns: user namespace of the mount the inode was found from
* @dentry: used to retrieve inode to check permissions on
* @ivalue: vfs caps value which may be modified by this function
* @size: size of @ivalue
*
* User requested a write of security.capability. If needed, update the
* xattr to change from v2 to v3, or to fixup the v3 rootid.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs init_user_ns.
*
* Return: On success, return the new size; on error, return < 0.
*/
int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
const void **ivalue, size_t size)
{
struct vfs_ns_cap_data *nscap;
uid_t nsrootid;
const struct vfs_cap_data *cap = *ivalue;
__u32 magic, nsmagic;
struct inode *inode = d_backing_inode(dentry);
struct user_namespace *task_ns = current_user_ns(),
*fs_ns = inode->i_sb->s_user_ns;
kuid_t rootid;
size_t newsize;
if (!*ivalue)
return -EINVAL;
if (!validheader(size, cap))
return -EINVAL;
if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
return -EPERM;
if (size == XATTR_CAPS_SZ_2 && (mnt_userns == &init_user_ns)) if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
/* user is privileged, just write the v2 */
return size;
rootid = rootid_from_xattr(*ivalue, size, task_ns, mnt_userns);
if (!uid_valid(rootid))
return -EINVAL;
nsrootid = from_kuid(fs_ns, rootid);
if (nsrootid == -1)
return -EINVAL;
newsize = sizeof(struct vfs_ns_cap_data);
nscap = kmalloc(newsize, GFP_ATOMIC);
if (!nscap)
return -ENOMEM;
nscap->rootid = cpu_to_le32(nsrootid);
nsmagic = VFS_CAP_REVISION_3;
magic = le32_to_cpu(cap->magic_etc);
if (magic & VFS_CAP_FLAGS_EFFECTIVE)
nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
nscap->magic_etc = cpu_to_le32(nsmagic);
memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
*ivalue = nscap;
return newsize;
}
/*
* Calculate the new process capability sets from the capability sets attached
* to a file.
*/
static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
struct linux_binprm *bprm,
bool *effective,
bool *has_fcap)
{
struct cred *new = bprm->cred;
unsigned i;
int ret = 0;
if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
*effective = true;
if (caps->magic_etc & VFS_CAP_REVISION_MASK)
*has_fcap = true;
CAP_FOR_EACH_U32(i) {
__u32 permitted = caps->permitted.cap[i];
__u32 inheritable = caps->inheritable.cap[i];
/*
* pP' = (X & fP) | (pI & fI)
* The addition of pA' is handled later.
*/
new->cap_permitted.cap[i] =
(new->cap_bset.cap[i] & permitted) |
(new->cap_inheritable.cap[i] & inheritable);
if (permitted & ~new->cap_permitted.cap[i])
/* insufficient to execute correctly */
ret = -EPERM;
}
/*
* For legacy apps, with no internal support for recognizing they
* do not have enough capabilities, we return an error if they are
* missing some "forced" (aka file-permitted) capabilities.
*/
return *effective ? ret : 0;
}
/**
* get_vfs_caps_from_disk - retrieve vfs caps from disk
*
* @mnt_userns: user namespace of the mount the inode was found from
* @dentry: dentry from which @inode is retrieved
* @cpu_caps: vfs capabilities
*
* Extract the on-exec-apply capability sets for an executable file.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs init_user_ns.
*/
int get_vfs_caps_from_disk(struct user_namespace *mnt_userns,
const struct dentry *dentry,
struct cpu_vfs_cap_data *cpu_caps)
{
struct inode *inode = d_backing_inode(dentry);
__u32 magic_etc;
unsigned tocopy, i;
int size;
struct vfs_ns_cap_data data, *nscaps = &data;
struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
kuid_t rootkuid;
struct user_namespace *fs_ns;
memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
if (!inode)
return -ENODATA;
fs_ns = inode->i_sb->s_user_ns;
size = __vfs_getxattr((struct dentry *)dentry, inode,
XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
if (size == -ENODATA || size == -EOPNOTSUPP)
/* no data, that's ok */
return -ENODATA;
if (size < 0)
return size;
if (size < sizeof(magic_etc))
return -EINVAL;
cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
rootkuid = make_kuid(fs_ns, 0);
switch (magic_etc & VFS_CAP_REVISION_MASK) {
case VFS_CAP_REVISION_1:
if (size != XATTR_CAPS_SZ_1)
return -EINVAL;
tocopy = VFS_CAP_U32_1;
break;
case VFS_CAP_REVISION_2:
if (size != XATTR_CAPS_SZ_2)
return -EINVAL;
tocopy = VFS_CAP_U32_2;
break;
case VFS_CAP_REVISION_3:
if (size != XATTR_CAPS_SZ_3)
return -EINVAL;
tocopy = VFS_CAP_U32_3;
rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
break;
default:
return -EINVAL;
}
/* Limit the caps to the mounter of the filesystem
* or the more limited uid specified in the xattr.
*/
rootkuid = kuid_into_mnt(mnt_userns, rootkuid);
if (!rootid_owns_currentns(rootkuid))
return -ENODATA;
CAP_FOR_EACH_U32(i) {
if (i >= tocopy)
break;
cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
}
cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
cpu_caps->rootid = rootkuid;
return 0;
}
/*
* Attempt to get the on-exec apply capability sets for an executable file from
* its xattrs and, if present, apply them to the proposed credentials being
* constructed by execve().
*/
static int get_file_caps(struct linux_binprm *bprm, struct file *file,
bool *effective, bool *has_fcap)
{
int rc = 0;
struct cpu_vfs_cap_data vcaps;
cap_clear(bprm->cred->cap_permitted);
if (!file_caps_enabled)
return 0;
if (!mnt_may_suid(file->f_path.mnt))
return 0;
/*
* This check is redundant with mnt_may_suid() but is kept to make
* explicit that capability bits are limited to s_user_ns and its
* descendants.
*/
if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
return 0;
rc = get_vfs_caps_from_disk(file_mnt_user_ns(file),
file->f_path.dentry, &vcaps);
if (rc < 0) {
if (rc == -EINVAL)
printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
bprm->filename);
else if (rc == -ENODATA)
rc = 0;
goto out;
}
rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap);
out:
if (rc)
cap_clear(bprm->cred->cap_permitted);
return rc;
}
static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); }
static inline bool __is_real(kuid_t uid, struct cred *cred)
{ return uid_eq(cred->uid, uid); }
static inline bool __is_eff(kuid_t uid, struct cred *cred)
{ return uid_eq(cred->euid, uid); }
static inline bool __is_suid(kuid_t uid, struct cred *cred)
{ return !__is_real(uid, cred) && __is_eff(uid, cred); }
/*
* handle_privileged_root - Handle case of privileged root
* @bprm: The execution parameters, including the proposed creds
* @has_fcap: Are any file capabilities set?
* @effective: Do we have effective root privilege?
* @root_uid: This namespace' root UID WRT initial USER namespace
*
* Handle the case where root is privileged and hasn't been neutered by
* SECURE_NOROOT. If file capabilities are set, they won't be combined with
* set UID root and nothing is changed. If we are root, cap_permitted is
* updated. If we have become set UID root, the effective bit is set.
*/
static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
bool *effective, kuid_t root_uid)
{
const struct cred *old = current_cred();
struct cred *new = bprm->cred;
if (!root_privileged())
return;
/*
* If the legacy file capability is set, then don't set privs
* for a setuid root binary run by a non-root user. Do set it
* for a root user just to cause least surprise to an admin.
*/
if (has_fcap && __is_suid(root_uid, new)) {
warn_setuid_and_fcaps_mixed(bprm->filename);
return;
}
/*
* To support inheritance of root-permissions and suid-root
* executables under compatibility mode, we override the
* capability sets for the file.
*/
if (__is_eff(root_uid, new) || __is_real(root_uid, new)) {
/* pP' = (cap_bset & ~0) | (pI & ~0) */
new->cap_permitted = cap_combine(old->cap_bset,
old->cap_inheritable);
}
/*
* If only the real uid is 0, we do not set the effective bit.
*/
if (__is_eff(root_uid, new))
*effective = true;
}
#define __cap_gained(field, target, source) \
!cap_issubset(target->cap_##field, source->cap_##field)
#define __cap_grew(target, source, cred) \
!cap_issubset(cred->cap_##target, cred->cap_##source)
#define __cap_full(field, cred) \
cap_issubset(CAP_FULL_SET, cred->cap_##field)
static inline bool __is_setuid(struct cred *new, const struct cred *old)
{ return !uid_eq(new->euid, old->uid); }
static inline bool __is_setgid(struct cred *new, const struct cred *old)
{ return !gid_eq(new->egid, old->gid); }
/*
* 1) Audit candidate if current->cap_effective is set
*
* We do not bother to audit if 3 things are true:
* 1) cap_effective has all caps
* 2) we became root *OR* are were already root
* 3) root is supposed to have all caps (SECURE_NOROOT)
* Since this is just a normal root execing a process.
*
* Number 1 above might fail if you don't have a full bset, but I think
* that is interesting information to audit.
*
* A number of other conditions require logging:
* 2) something prevented setuid root getting all caps
* 3) non-setuid root gets fcaps
* 4) non-setuid root gets ambient
*/
static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
kuid_t root, bool has_fcap)
{
bool ret = false;
if ((__cap_grew(effective, ambient, new) &&
!(__cap_full(effective, new) &&
(__is_eff(root, new) || __is_real(root, new)) &&
root_privileged())) ||
(root_privileged() &&
__is_suid(root, new) &&
!__cap_full(effective, new)) ||
(!__is_setuid(new, old) &&
((has_fcap &&
__cap_gained(permitted, new, old)) ||
__cap_gained(ambient, new, old))))
ret = true;
return ret;
}
/**
* cap_bprm_creds_from_file - Set up the proposed credentials for execve().
* @bprm: The execution parameters, including the proposed creds
* @file: The file to pull the credentials from
*
* Set up the proposed credentials for a new execution context being
* constructed by execve(). The proposed creds in @bprm->cred is altered,
* which won't take effect immediately.
*
* Return: 0 if successful, -ve on error.
*/
int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
{
/* Process setpcap binaries and capabilities for uid 0 */
const struct cred *old = current_cred();
struct cred *new = bprm->cred;
bool effective = false, has_fcap = false, is_setid;
int ret;
kuid_t root_uid;
if (WARN_ON(!cap_ambient_invariant_ok(old)))
return -EPERM;
ret = get_file_caps(bprm, file, &effective, &has_fcap);
if (ret < 0)
return ret;
root_uid = make_kuid(new->user_ns, 0);
handle_privileged_root(bprm, has_fcap, &effective, root_uid);
/* if we have fs caps, clear dangerous personality flags */
if (__cap_gained(permitted, new, old))
bprm->per_clear |= PER_CLEAR_ON_SETID;
/* Don't let someone trace a set[ug]id/setpcap binary with the revised
* credentials unless they have the appropriate permit.
*
* In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
*/
is_setid = __is_setuid(new, old) || __is_setgid(new, old);
if ((is_setid || __cap_gained(permitted, new, old)) &&
((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
!ptracer_capable(current, new->user_ns))) {
/* downgrade; they get no more than they had, and maybe less */
if (!ns_capable(new->user_ns, CAP_SETUID) ||
(bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
new->euid = new->uid;
new->egid = new->gid;
}
new->cap_permitted = cap_intersect(new->cap_permitted,
old->cap_permitted);
}
new->suid = new->fsuid = new->euid;
new->sgid = new->fsgid = new->egid;
/* File caps or setid cancels ambient. */
if (has_fcap || is_setid)
cap_clear(new->cap_ambient);
/*
* Now that we've computed pA', update pP' to give:
* pP' = (X & fP) | (pI & fI) | pA'
*/
new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);
/*
* Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set,
* this is the same as pE' = (fE ? pP' : 0) | pA'.
*/
if (effective)
new->cap_effective = new->cap_permitted;
else
new->cap_effective = new->cap_ambient;
if (WARN_ON(!cap_ambient_invariant_ok(new)))
return -EPERM;
if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
ret = audit_log_bprm_fcaps(bprm, new, old);
if (ret < 0)
return ret;
}
new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
if (WARN_ON(!cap_ambient_invariant_ok(new)))
return -EPERM;
/* Check for privilege-elevated exec. */
if (is_setid ||
(!__is_real(root_uid, new) &&
(effective ||
__cap_grew(permitted, ambient, new))))
bprm->secureexec = 1;
return 0;
}
/**
* cap_inode_setxattr - Determine whether an xattr may be altered
* @dentry: The inode/dentry being altered
* @name: The name of the xattr to be changed
* @value: The value that the xattr will be changed to
* @size: The size of value
* @flags: The replacement flag
*
* Determine whether an xattr may be altered or set on an inode, returning 0 if
* permission is granted, -ve if denied.
*
* This is used to make sure security xattrs don't get updated or set by those
* who aren't privileged to do so.
*/
int cap_inode_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
/* Ignore non-security xattrs */
if (strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN) != 0)
return 0;
/*
* For XATTR_NAME_CAPS the check will be done in
* cap_convert_nscap(), called by setxattr()
*/
if (strcmp(name, XATTR_NAME_CAPS) == 0)
return 0;
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
/**
* cap_inode_removexattr - Determine whether an xattr may be removed
*
* @mnt_userns: User namespace of the mount the inode was found from
* @dentry: The inode/dentry being altered
* @name: The name of the xattr to be changed
*
* Determine whether an xattr may be removed from an inode, returning 0 if
* permission is granted, -ve if denied.
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs init_user_ns.
*
* This is used to make sure security xattrs don't get removed by those who
* aren't privileged to remove them.
*/
int cap_inode_removexattr(struct user_namespace *mnt_userns,
struct dentry *dentry, const char *name)
{
struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
/* Ignore non-security xattrs */
if (strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN) != 0)
return 0;
if (strcmp(name, XATTR_NAME_CAPS) == 0) {
/* security.capability gets namespaced */
struct inode *inode = d_backing_inode(dentry);
if (!inode)
return -EINVAL;
if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
return -EPERM;
return 0;
}
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
/*
* cap_emulate_setxuid() fixes the effective / permitted capabilities of
* a process after a call to setuid, setreuid, or setresuid.
*
* 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
* {r,e,s}uid != 0, the permitted and effective capabilities are
* cleared.
*
* 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
* capabilities of the process are cleared.
*
* 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
* capabilities are set to the permitted capabilities.
*
* fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
* never happen.
*
* -astor
*
* cevans - New behaviour, Oct '99
* A process may, via prctl(), elect to keep its capabilities when it
* calls setuid() and switches away from uid==0. Both permitted and
* effective sets will be retained.
* Without this change, it was impossible for a daemon to drop only some
* of its privilege. The call to setuid(!=0) would drop all privileges!
* Keeping uid 0 is not an option because uid 0 owns too many vital
* files..
* Thanks to Olaf Kirch and Peter Benie for spotting this.
*/
static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
{
kuid_t root_uid = make_kuid(old->user_ns, 0);
if ((uid_eq(old->uid, root_uid) ||
uid_eq(old->euid, root_uid) ||
uid_eq(old->suid, root_uid)) &&
(!uid_eq(new->uid, root_uid) &&
!uid_eq(new->euid, root_uid) &&
!uid_eq(new->suid, root_uid))) {
if (!issecure(SECURE_KEEP_CAPS)) {
cap_clear(new->cap_permitted);
cap_clear(new->cap_effective);
}
/*
* Pre-ambient programs expect setresuid to nonroot followed
* by exec to drop capabilities. We should make sure that
* this remains the case.
*/
cap_clear(new->cap_ambient);
}
if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
cap_clear(new->cap_effective);
if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid))
new->cap_effective = new->cap_permitted;
}
/**
* cap_task_fix_setuid - Fix up the results of setuid() call
* @new: The proposed credentials
* @old: The current task's current credentials
* @flags: Indications of what has changed
*
* Fix up the results of setuid() call before the credential changes are
* actually applied.
*
* Return: 0 to grant the changes, -ve to deny them.
*/
int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
{
switch (flags) {
case LSM_SETID_RE:
case LSM_SETID_ID:
case LSM_SETID_RES:
/* juggle the capabilities to follow [RES]UID changes unless
* otherwise suppressed */
if (!issecure(SECURE_NO_SETUID_FIXUP))
cap_emulate_setxuid(new, old);
break;
case LSM_SETID_FS:
/* juggle the capabilties to follow FSUID changes, unless
* otherwise suppressed
*
* FIXME - is fsuser used for all CAP_FS_MASK capabilities?
* if not, we might be a bit too harsh here.
*/
if (!issecure(SECURE_NO_SETUID_FIXUP)) {
kuid_t root_uid = make_kuid(old->user_ns, 0);
if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid))
new->cap_effective =
cap_drop_fs_set(new->cap_effective);
if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid))
new->cap_effective =
cap_raise_fs_set(new->cap_effective,
new->cap_permitted);
}
break;
default:
return -EINVAL;
}
return 0;
}
/*
* Rationale: code calling task_setscheduler, task_setioprio, and
* task_setnice, assumes that
* . if capable(cap_sys_nice), then those actions should be allowed
* . if not capable(cap_sys_nice), but acting on your own processes,
* then those actions should be allowed
* This is insufficient now since you can call code without suid, but
* yet with increased caps.
* So we check for increased caps on the target process.
*/
static int cap_safe_nice(struct task_struct *p)
{
int is_subset, ret = 0;
rcu_read_lock();
is_subset = cap_issubset(__task_cred(p)->cap_permitted,
current_cred()->cap_permitted);
if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
ret = -EPERM;
rcu_read_unlock();
return ret;
}
/**
* cap_task_setscheduler - Detemine if scheduler policy change is permitted
* @p: The task to affect
*
* Detemine if the requested scheduler policy change is permitted for the
* specified task.
*
* Return: 0 if permission is granted, -ve if denied.
*/
int cap_task_setscheduler(struct task_struct *p)
{
return cap_safe_nice(p);
}
/**
* cap_task_setioprio - Detemine if I/O priority change is permitted
* @p: The task to affect
* @ioprio: The I/O priority to set
*
* Detemine if the requested I/O priority change is permitted for the specified
* task.
*
* Return: 0 if permission is granted, -ve if denied.
*/
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
return cap_safe_nice(p);
}
/**
* cap_task_setnice - Detemine if task priority change is permitted
* @p: The task to affect
* @nice: The nice value to set
*
* Detemine if the requested task priority change is permitted for the
* specified task.
*
* Return: 0 if permission is granted, -ve if denied.
*/
int cap_task_setnice(struct task_struct *p, int nice)
{
return cap_safe_nice(p);
}
/*
* Implement PR_CAPBSET_DROP. Attempt to remove the specified capability from
* the current task's bounding set. Returns 0 on success, -ve on error.
*/
static int cap_prctl_drop(unsigned long cap)
{
struct cred *new;
if (!ns_capable(current_user_ns(), CAP_SETPCAP))
return -EPERM;
if (!cap_valid(cap))
return -EINVAL;
new = prepare_creds();
if (!new)
return -ENOMEM;
cap_lower(new->cap_bset, cap);
return commit_creds(new);
}
/**
* cap_task_prctl - Implement process control functions for this security module
* @option: The process control function requested
* @arg2: The argument data for this function
* @arg3: The argument data for this function
* @arg4: The argument data for this function
* @arg5: The argument data for this function
*
* Allow process control functions (sys_prctl()) to alter capabilities; may
* also deny access to other functions not otherwise implemented here.
*
* Return: 0 or +ve on success, -ENOSYS if this function is not implemented
* here, other -ve on error. If -ENOSYS is returned, sys_prctl() and other LSM
* modules will consider performing the function.
*/
int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
const struct cred *old = current_cred();
struct cred *new;
switch (option) {
case PR_CAPBSET_READ:
if (!cap_valid(arg2))
return -EINVAL;
return !!cap_raised(old->cap_bset, arg2);
case PR_CAPBSET_DROP:
return cap_prctl_drop(arg2);
/*
* The next four prctl's remain to assist with transitioning a
* system from legacy UID=0 based privilege (when filesystem
* capabilities are not in use) to a system using filesystem
* capabilities only - as the POSIX.1e draft intended.
*
* Note:
*
* PR_SET_SECUREBITS =
* issecure_mask(SECURE_KEEP_CAPS_LOCKED)
* | issecure_mask(SECURE_NOROOT)
* | issecure_mask(SECURE_NOROOT_LOCKED)
* | issecure_mask(SECURE_NO_SETUID_FIXUP)
* | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
*
* will ensure that the current process and all of its
* children will be locked into a pure
* capability-based-privilege environment.
*/
case PR_SET_SECUREBITS:
if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
& (old->securebits ^ arg2)) /*[1]*/
|| ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/
|| (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/
|| (cap_capable(current_cred(),
current_cred()->user_ns,
CAP_SETPCAP,
CAP_OPT_NONE) != 0) /*[4]*/
/*
* [1] no changing of bits that are locked
* [2] no unlocking of locks
* [3] no setting of unsupported bits
* [4] doing anything requires privilege (go read about
* the "sendmail capabilities bug")
*/
)
/* cannot change a locked bit */
return -EPERM;
new = prepare_creds();
if (!new)
return -ENOMEM;
new->securebits = arg2;
return commit_creds(new);
case PR_GET_SECUREBITS:
return old->securebits;
case PR_GET_KEEPCAPS:
return !!issecure(SECURE_KEEP_CAPS);
case PR_SET_KEEPCAPS:
if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
return -EINVAL;
if (issecure(SECURE_KEEP_CAPS_LOCKED))
return -EPERM;
new = prepare_creds();
if (!new)
return -ENOMEM;
if (arg2)
new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
else
new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
return commit_creds(new);
case PR_CAP_AMBIENT:
if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
if (arg3 | arg4 | arg5)
return -EINVAL;
new = prepare_creds();
if (!new)
return -ENOMEM;
cap_clear(new->cap_ambient);
return commit_creds(new);
}
if (((!cap_valid(arg3)) | arg4 | arg5))
return -EINVAL;
if (arg2 == PR_CAP_AMBIENT_IS_SET) {
return !!cap_raised(current_cred()->cap_ambient, arg3);
} else if (arg2 != PR_CAP_AMBIENT_RAISE &&
arg2 != PR_CAP_AMBIENT_LOWER) {
return -EINVAL;
} else {
if (arg2 == PR_CAP_AMBIENT_RAISE &&
(!cap_raised(current_cred()->cap_permitted, arg3) ||
!cap_raised(current_cred()->cap_inheritable,
arg3) ||
issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
return -EPERM;
new = prepare_creds();
if (!new)
return -ENOMEM;
if (arg2 == PR_CAP_AMBIENT_RAISE)
cap_raise(new->cap_ambient, arg3);
else
cap_lower(new->cap_ambient, arg3);
return commit_creds(new);
}
default:
/* No functionality available - continue with default */
return -ENOSYS;
}
}
/**
* cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
* @mm: The VM space in which the new mapping is to be made
* @pages: The size of the mapping
*
* Determine whether the allocation of a new virtual mapping by the current
* task is permitted.
*
* Return: 1 if permission is granted, 0 if not.
*/
int cap_vm_enough_memory(struct mm_struct *mm, long pages)
{
int cap_sys_admin = 0;
if (cap_capable(current_cred(), &init_user_ns,
CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0)
cap_sys_admin = 1;
return cap_sys_admin;
}
/**
* cap_mmap_addr - check if able to map given addr
* @addr: address attempting to be mapped
*
* If the process is attempting to map memory below dac_mmap_min_addr they need
* CAP_SYS_RAWIO. The other parameters to this function are unused by the
* capability security module.
*
* Return: 0 if this mapping should be allowed or -EPERM if not.
*/
int cap_mmap_addr(unsigned long addr)
{
int ret = 0;
if (addr < dac_mmap_min_addr) {
ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
CAP_OPT_NONE);
/* set PF_SUPERPRIV if it turns out we allow the low mmap */
if (ret == 0)
current->flags |= PF_SUPERPRIV;
}
return ret;
}
int cap_mmap_file(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags)
{
return 0;
}
#ifdef CONFIG_SECURITY
static struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(capable, cap_capable),
LSM_HOOK_INIT(settime, cap_settime),
LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
LSM_HOOK_INIT(capget, cap_capget),
LSM_HOOK_INIT(capset, cap_capset),
LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
LSM_HOOK_INIT(mmap_file, cap_mmap_file),
LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
LSM_HOOK_INIT(task_prctl, cap_task_prctl),
LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
LSM_HOOK_INIT(task_setnice, cap_task_setnice),
LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
};
static int __init capability_init(void)
{
security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
"capability");
return 0;
}
DEFINE_LSM(capability) = {
.name = "capability",
.order = LSM_ORDER_FIRST,
.init = capability_init,
};
#endif /* CONFIG_SECURITY */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SIGNAL_H
#define _LINUX_SCHED_SIGNAL_H
#include <linux/rculist.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>
#include <asm/ptrace.h>
/*
* Types defining task->signal and task->sighand and APIs using them:
*/
struct sighand_struct {
spinlock_t siglock;
refcount_t count;
wait_queue_head_t signalfd_wqh;
struct k_sigaction action[_NSIG];
};
/*
* Per-process accounting stats:
*/
struct pacct_struct {
int ac_flag;
long ac_exitcode;
unsigned long ac_mem;
u64 ac_utime, ac_stime;
unsigned long ac_minflt, ac_majflt;
};
struct cpu_itimer {
u64 expires;
u64 incr;
};
/*
* This is the atomic variant of task_cputime, which can be used for
* storing and updating task_cputime statistics without locking.
*/
struct task_cputime_atomic {
atomic64_t utime;
atomic64_t stime;
atomic64_t sum_exec_runtime;
};
#define INIT_CPUTIME_ATOMIC \
(struct task_cputime_atomic) { \
.utime = ATOMIC64_INIT(0), \
.stime = ATOMIC64_INIT(0), \
.sum_exec_runtime = ATOMIC64_INIT(0), \
}
/**
* struct thread_group_cputimer - thread group interval timer counts
* @cputime_atomic: atomic thread group interval timers.
*
* This structure contains the version of task_cputime, above, that is
* used for thread group CPU timer calculations.
*/
struct thread_group_cputimer {
struct task_cputime_atomic cputime_atomic;
};
struct multiprocess_signals {
sigset_t signal;
struct hlist_node node;
};
/*
* NOTE! "signal_struct" does not have its own
* locking, because a shared signal_struct always
* implies a shared sighand_struct, so locking
* sighand_struct is always a proper superset of
* the locking of signal_struct.
*/
struct signal_struct {
refcount_t sigcnt;
atomic_t live;
int nr_threads;
struct list_head thread_head;
wait_queue_head_t wait_chldexit; /* for wait4() */
/* current thread group signal load-balancing target: */
struct task_struct *curr_target;
/* shared signal handling: */
struct sigpending shared_pending;
/* For collecting multiprocess signals during fork */
struct hlist_head multiprocess;
/* thread group exit support */
int group_exit_code;
/* overloaded:
* - notify group_exit_task when ->count is equal to notify_count
* - everyone except group_exit_task is stopped during signal delivery
* of fatal signals, group_exit_task processes the signal.
*/
int notify_count;
struct task_struct *group_exit_task;
/* thread group stop support, overloads group_exit_code too */
int group_stop_count;
unsigned int flags; /* see SIGNAL_* flags below */
/*
* PR_SET_CHILD_SUBREAPER marks a process, like a service
* manager, to re-parent orphan (double-forking) child processes
* to this process instead of 'init'. The service manager is
* able to receive SIGCHLD signals and is able to investigate
* the process until it calls wait(). All children of this
* process will inherit a flag if they should look for a
* child_subreaper process at exit.
*/
unsigned int is_child_subreaper:1;
unsigned int has_child_subreaper:1;
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
int posix_timer_id;
struct list_head posix_timers;
/* ITIMER_REAL timer for the process */
struct hrtimer real_timer;
ktime_t it_real_incr;
/*
* ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
* CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
* values are defined to 0 and 1 respectively
*/
struct cpu_itimer it[2];
/*
* Thread group totals for process CPU timers.
* See thread_group_cputimer(), et al, for details.
*/
struct thread_group_cputimer cputimer;
#endif
/* Empty if CONFIG_POSIX_TIMERS=n */
struct posix_cputimers posix_cputimers;
/* PID/PID hash table linkage. */
struct pid *pids[PIDTYPE_MAX];
#ifdef CONFIG_NO_HZ_FULL
atomic_t tick_dep_mask;
#endif
struct pid *tty_old_pgrp;
/* boolean value for session group leader */
int leader;
struct tty_struct *tty; /* NULL if no tty */
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup *autogroup;
#endif
/*
* Cumulative resource counters for dead threads in the group,
* and for reaped dead child processes forked by this group.
* Live threads maintain their own counters and add to these
* in __exit_signal, except for the group leader.
*/
seqlock_t stats_lock;
u64 utime, stime, cutime, cstime;
u64 gtime;
u64 cgtime;
struct prev_cputime prev_cputime;
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
unsigned long inblock, oublock, cinblock, coublock;
unsigned long maxrss, cmaxrss;
struct task_io_accounting ioac;
/*
* Cumulative ns of schedule CPU time fo dead threads in the
* group, not including a zombie group leader, (This only differs
* from jiffies_to_ns(utime + stime) if sched_clock uses something
* other than jiffies.)
*/
unsigned long long sum_sched_runtime;
/*
* We don't bother to synchronize most readers of this at all,
* because there is no reader checking a limit that actually needs
* to get both rlim_cur and rlim_max atomically, and either one
* alone is a single word that can safely be read normally.
* getrlimit/setrlimit use task_lock(current->group_leader) to
* protect this instead of the siglock, because they really
* have no need to disable irqs.
*/
struct rlimit rlim[RLIM_NLIMITS];
#ifdef CONFIG_BSD_PROCESS_ACCT
struct pacct_struct pacct; /* per-process accounting information */
#endif
#ifdef CONFIG_TASKSTATS
struct taskstats *stats;
#endif
#ifdef CONFIG_AUDIT
unsigned audit_tty;
struct tty_audit_buf *tty_audit_buf;
#endif
/*
* Thread is the potential origin of an oom condition; kill first on
* oom
*/
bool oom_flag_origin;
short oom_score_adj; /* OOM kill score adjustment */
short oom_score_adj_min; /* OOM kill score adjustment min value.
* Only settable by CAP_SYS_RESOURCE. */
struct mm_struct *oom_mm; /* recorded mm when the thread group got
* killed by the oom killer */
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
* (notably. ptrace)
* Deprecated do not use in new code.
* Use exec_update_lock instead.
*/
struct rw_semaphore exec_update_lock; /* Held while task_struct is
* being updated during exec,
* and may have inconsistent
* permissions.
*/
} __randomize_layout;
/*
* Bits in flags field of signal_struct.
*/
#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */
#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */
/*
* Pending notifications to parent.
*/
#define SIGNAL_CLD_STOPPED 0x00000010
#define SIGNAL_CLD_CONTINUED 0x00000020
#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */
#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \
SIGNAL_STOP_CONTINUED)
static inline void signal_set_stop_flags(struct signal_struct *sig,
unsigned int flags)
{
WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags;
}
/* If true, all threads except ->group_exit_task have pending SIGKILL */
static inline int signal_group_exit(const struct signal_struct *sig)
{
return (sig->flags & SIGNAL_GROUP_EXIT) ||
(sig->group_exit_task != NULL);
}
extern void flush_signals(struct task_struct *);
extern void ignore_signals(struct task_struct *);
extern void flush_signal_handlers(struct task_struct *, int force_default);
extern int dequeue_signal(struct task_struct *task,
sigset_t *mask, kernel_siginfo_t *info);
static inline int kernel_dequeue_signal(void)
{
struct task_struct *task = current;
kernel_siginfo_t __info;
int ret;
spin_lock_irq(&task->sighand->siglock);
ret = dequeue_signal(task, &task->blocked, &__info);
spin_unlock_irq(&task->sighand->siglock);
return ret;
}
static inline void kernel_signal_stop(void)
{
spin_lock_irq(¤t->sighand->siglock);
if (current->jobctl & JOBCTL_STOP_DEQUEUED)
set_special_state(TASK_STOPPED);
spin_unlock_irq(¤t->sighand->siglock);
schedule();
}
#ifdef __ia64__
# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3
#else
# define ___ARCH_SI_IA64(_a1, _a2, _a3)
#endif
int force_sig_fault_to_task(int sig, int code, void __user *addr
___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
, struct task_struct *t);
int force_sig_fault(int sig, int code, void __user *addr
___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr));
int send_sig_fault(int sig, int code, void __user *addr
___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
, struct task_struct *t);
int force_sig_mceerr(int code, void __user *, short);
int send_sig_mceerr(int code, void __user *, short, struct task_struct *);
int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper);
int force_sig_pkuerr(void __user *addr, u32 pkey);
int force_sig_perf(void __user *addr, u32 type, u64 sig_data);
int force_sig_ptrace_errno_trap(int errno, void __user *addr);
int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno);
int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
struct task_struct *t);
int force_sig_seccomp(int syscall, int reason, bool force_coredump);
extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern void force_sigsegv(int sig);
extern int force_sig_info(struct kernel_siginfo *);
extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *,
const struct cred *);
extern int kill_pgrp(struct pid *pid, int sig, int priv);
extern int kill_pid(struct pid *pid, int sig, int priv);
extern __must_check bool do_notify_parent(struct task_struct *, int);
extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
extern void force_sig(int);
extern void force_fatal_sig(int);
extern void force_exit_sig(int);
extern int send_sig(int, struct task_struct *, int);
extern int zap_other_threads(struct task_struct *p);
extern struct sigqueue *sigqueue_alloc(void);
extern void sigqueue_free(struct sigqueue *);
extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type);
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
static inline int restart_syscall(void)
{
set_tsk_thread_flag(current, TIF_SIGPENDING);
return -ERESTARTNOINTR;
}
static inline int task_sigpending(struct task_struct *p)
{
return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
}
static inline int signal_pending(struct task_struct *p)
{
/*
* TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same
* behavior in terms of ensuring that we break out of wait loops
* so that notify signal callbacks can be processed.
*/
if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL)))
return 1;
return task_sigpending(p);
}
static inline int __fatal_signal_pending(struct task_struct *p)
{
return unlikely(sigismember(&p->pending.signal, SIGKILL));
}
static inline int fatal_signal_pending(struct task_struct *p)
{
return task_sigpending(p) && __fatal_signal_pending(p);
}
static inline int signal_pending_state(unsigned int state, struct task_struct *p)
{
if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
return 0;
if (!signal_pending(p))
return 0;
return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
}
/*
* This should only be used in fault handlers to decide whether we
* should stop the current fault routine to handle the signals
* instead, especially with the case where we've got interrupted with
* a VM_FAULT_RETRY.
*/
static inline bool fault_signal_pending(vm_fault_t fault_flags,
struct pt_regs *regs)
{
return unlikely((fault_flags & VM_FAULT_RETRY) &&
(fatal_signal_pending(current) ||
(user_mode(regs) && signal_pending(current))));
}
/*
* Reevaluate whether the task has signals pending delivery.
* Wake the task if so.
* This is required every time the blocked sigset_t changes.
* callers must hold sighand->siglock.
*/
extern void recalc_sigpending_and_wake(struct task_struct *t);
extern void recalc_sigpending(void);
extern void calculate_sigpending(void);
extern void signal_wake_up_state(struct task_struct *t, unsigned int state);
static inline void signal_wake_up(struct task_struct *t, bool resume)
{
signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
}
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
}
void task_join_group_stop(struct task_struct *task);
#ifdef TIF_RESTORE_SIGMASK
/*
* Legacy restore_sigmask accessors. These are inefficient on
* SMP architectures because they require atomic operations.
*/
/**
* set_restore_sigmask() - make sure saved_sigmask processing gets done
*
* This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code
* will run before returning to user mode, to process the flag. For
* all callers, TIF_SIGPENDING is already set or it's no harm to set
* it. TIF_RESTORE_SIGMASK need not be in the set of bits that the
* arch code will notice on return to user mode, in case those bits
* are scarce. We set TIF_SIGPENDING here to ensure that the arch
* signal code always gets run when TIF_RESTORE_SIGMASK is set.
*/
static inline void set_restore_sigmask(void)
{
set_thread_flag(TIF_RESTORE_SIGMASK);
}
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}
static inline void clear_restore_sigmask(void)
{
clear_thread_flag(TIF_RESTORE_SIGMASK);
}
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}
static inline bool test_restore_sigmask(void)
{
return test_thread_flag(TIF_RESTORE_SIGMASK);
}
static inline bool test_and_clear_restore_sigmask(void)
{
return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK);
}
#else /* TIF_RESTORE_SIGMASK */
/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */
static inline void set_restore_sigmask(void)
{
current->restore_sigmask = true;
}
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
task->restore_sigmask = false;
}
static inline void clear_restore_sigmask(void)
{
current->restore_sigmask = false;
}
static inline bool test_restore_sigmask(void)
{
return current->restore_sigmask;
}
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
return task->restore_sigmask;
}
static inline bool test_and_clear_restore_sigmask(void)
{
if (!current->restore_sigmask)
return false;
current->restore_sigmask = false;
return true;
}
#endif
static inline void restore_saved_sigmask(void)
{
if (test_and_clear_restore_sigmask())
__set_current_blocked(¤t->saved_sigmask);
}
extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);
static inline void restore_saved_sigmask_unless(bool interrupted)
{
if (interrupted)
WARN_ON(!signal_pending(current));
else
restore_saved_sigmask();
}
static inline sigset_t *sigmask_to_save(void)
{
sigset_t *res = ¤t->blocked;
if (unlikely(test_restore_sigmask()))
res = ¤t->saved_sigmask;
return res;
}
static inline int kill_cad_pid(int sig, int priv)
{
return kill_pid(cad_pid, sig, priv);
}
/* These can be the second arg to send_sig_info/send_group_sig_info. */
#define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0)
#define SEND_SIG_PRIV ((struct kernel_siginfo *) 1)
static inline int __on_sig_stack(unsigned long sp)
{
#ifdef CONFIG_STACK_GROWSUP
return sp >= current->sas_ss_sp &&
sp - current->sas_ss_sp < current->sas_ss_size;
#else
return sp > current->sas_ss_sp &&
sp - current->sas_ss_sp <= current->sas_ss_size;
#endif
}
/*
* True if we are on the alternate signal stack.
*/
static inline int on_sig_stack(unsigned long sp)
{
/*
* If the signal stack is SS_AUTODISARM then, by construction, we
* can't be on the signal stack unless user code deliberately set
* SS_AUTODISARM when we were already on it.
*
* This improves reliability: if user state gets corrupted such that
* the stack pointer points very close to the end of the signal stack,
* then this check will enable the signal to be handled anyway.
*/
if (current->sas_ss_flags & SS_AUTODISARM)
return 0;
return __on_sig_stack(sp);
}
static inline int sas_ss_flags(unsigned long sp)
{
if (!current->sas_ss_size)
return SS_DISABLE;
return on_sig_stack(sp) ? SS_ONSTACK : 0;
}
static inline void sas_ss_reset(struct task_struct *p)
{
p->sas_ss_sp = 0;
p->sas_ss_size = 0;
p->sas_ss_flags = SS_DISABLE;
}
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
{
if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp))
#ifdef CONFIG_STACK_GROWSUP
return current->sas_ss_sp;
#else
return current->sas_ss_sp + current->sas_ss_size;
#endif
return sp;
}
extern void __cleanup_sighand(struct sighand_struct *);
extern void flush_itimer_signals(void);
#define tasklist_empty() \
list_empty(&init_task.tasks)
#define next_task(p) \
list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
#define for_each_process(p) \
for (p = &init_task ; (p = next_task(p)) != &init_task ; )
extern bool current_is_single_threaded(void);
/*
* Careful: do_each_thread/while_each_thread is a double loop so
* 'break' will not work as expected - use goto instead.
*/
#define do_each_thread(g, t) \
for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
#define while_each_thread(g, t) \
while ((t = next_thread(t)) != g)
#define __for_each_thread(signal, t) \
list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
#define for_each_thread(p, t) \
__for_each_thread((p)->signal, t)
/* Careful: this is a double loop, 'break' won't work as expected. */
#define for_each_process_thread(p, t) \
for_each_process(p) for_each_thread(p, t)
typedef int (*proc_visitor)(struct task_struct *p, void *data);
void walk_process_tree(struct task_struct *top, proc_visitor, void *);
static inline
struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
{
struct pid *pid;
if (type == PIDTYPE_PID)
pid = task_pid(task);
else
pid = task->signal->pids[type];
return pid;
}
static inline struct pid *task_tgid(struct task_struct *task)
{
return task->signal->pids[PIDTYPE_TGID];
}
/*
* Without tasklist or RCU lock it is not safe to dereference
* the result of task_pgrp/task_session even if task == current,
* we can race with another thread doing sys_setsid/sys_setpgid.
*/
static inline struct pid *task_pgrp(struct task_struct *task)
{
return task->signal->pids[PIDTYPE_PGID];
}
static inline struct pid *task_session(struct task_struct *task)
{
return task->signal->pids[PIDTYPE_SID];
}
static inline int get_nr_threads(struct task_struct *task)
{
return task->signal->nr_threads;
}
static inline bool thread_group_leader(struct task_struct *p)
{
return p->exit_signal >= 0;
}
static inline
bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
{
return p1->signal == p2->signal;
}
static inline struct task_struct *next_thread(const struct task_struct *p)
{
return list_entry_rcu(p->thread_group.next,
struct task_struct, thread_group);
}
static inline int thread_group_empty(struct task_struct *p)
{
return list_empty(&p->thread_group);
}
#define delay_group_leader(p) \
(thread_group_leader(p) && !thread_group_empty(p))
extern bool thread_group_exited(struct pid *pid);
extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
unsigned long *flags);
static inline struct sighand_struct *lock_task_sighand(struct task_struct *task,
unsigned long *flags)
{
struct sighand_struct *ret;
ret = __lock_task_sighand(task, flags);
(void)__cond_lock(&task->sighand->siglock, ret);
return ret;
}
static inline void unlock_task_sighand(struct task_struct *task,
unsigned long *flags)
{
spin_unlock_irqrestore(&task->sighand->siglock, *flags);
}
#ifdef CONFIG_LOCKDEP
extern void lockdep_assert_task_sighand_held(struct task_struct *task);
#else
static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { }
#endif
static inline unsigned long task_rlimit(const struct task_struct *task,
unsigned int limit)
{
return READ_ONCE(task->signal->rlim[limit].rlim_cur);
}
static inline unsigned long task_rlimit_max(const struct task_struct *task,
unsigned int limit)
{
return READ_ONCE(task->signal->rlim[limit].rlim_max);
}
static inline unsigned long rlimit(unsigned int limit)
{
return task_rlimit(current, limit);
}
static inline unsigned long rlimit_max(unsigned int limit)
{
return task_rlimit_max(current, limit);
}
#endif /* _LINUX_SCHED_SIGNAL_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
* Copyright (c) 2016-2021 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
#include "trace.h"
#include "../internal.h"
/*
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
#define IOMAP_DIO_WRITE_FUA (1 << 28)
#define IOMAP_DIO_NEED_SYNC (1 << 29)
#define IOMAP_DIO_WRITE (1 << 30)
#define IOMAP_DIO_DIRTY (1 << 31)
struct iomap_dio {
struct kiocb *iocb;
const struct iomap_dio_ops *dops;
loff_t i_size;
loff_t size;
atomic_t ref;
unsigned flags;
int error;
size_t done_before;
bool wait_for_completion;
union {
/* used during submission and for synchronous completion: */
struct {
struct iov_iter *iter;
struct task_struct *waiter;
struct request_queue *last_queue;
blk_qc_t cookie;
} submit;
/* used for aio completion: */
struct {
struct work_struct work;
} aio;
};
};
int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
{
struct request_queue *q = READ_ONCE(kiocb->private);
if (!q)
return 0;
return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
}
EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
atomic_inc(&dio->ref);
if (dio->iocb->ki_flags & IOCB_HIPRI)
bio_set_polled(bio, dio->iocb);
dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev); if (dio->dops && dio->dops->submit_io) dio->submit.cookie = dio->dops->submit_io(iter, bio, pos);
else
dio->submit.cookie = submit_bio(bio);
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
const struct iomap_dio_ops *dops = dio->dops;
struct kiocb *iocb = dio->iocb;
struct inode *inode = file_inode(iocb->ki_filp);
loff_t offset = iocb->ki_pos;
ssize_t ret = dio->error;
if (dops && dops->end_io) ret = dops->end_io(iocb, dio->size, ret, dio->flags); if (likely(!ret)) { ret = dio->size;
/* check for short read */
if (offset + ret > dio->i_size &&
!(dio->flags & IOMAP_DIO_WRITE)) ret = dio->i_size - offset; iocb->ki_pos += ret;
}
/*
* Try again to invalidate clean pages which might have been cached by
* non-direct readahead, or faulted in by get_user_pages() if the source
* of the write was an mmap'ed region of the file we're writing. Either
* one is a pretty crazy thing to do, so we don't support it 100%. If
* this invalidation fails, tough, the write still worked...
*
* And this page cache invalidation has to be after ->end_io(), as some
* filesystems convert unwritten extents to real allocations in
* ->end_io() when necessary, otherwise a racing buffer read would cache
* zeros from unwritten extents.
*/
if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
int err;
err = invalidate_inode_pages2_range(inode->i_mapping,
offset >> PAGE_SHIFT,
(offset + dio->size - 1) >> PAGE_SHIFT);
if (err)
dio_warn_stale_pagecache(iocb->ki_filp);
}
inode_dio_end(file_inode(iocb->ki_filp));
/*
* If this is a DSYNC write, make sure we push it to stable storage now
* that we've written data.
*/
if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
ret = generic_write_sync(iocb, ret);
if (ret > 0)
ret += dio->done_before; kfree(dio);
return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);
static void iomap_dio_complete_work(struct work_struct *work)
{
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
struct kiocb *iocb = dio->iocb;
iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
}
/*
* Set an error in the dio if none is set yet. We have to use cmpxchg
* as the submission context and the completion context(s) can race to
* update the error.
*/
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
cmpxchg(&dio->error, 0, ret);
}
static void iomap_dio_bio_end_io(struct bio *bio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
if (bio->bi_status)
iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
if (atomic_dec_and_test(&dio->ref)) {
if (dio->wait_for_completion) {
struct task_struct *waiter = dio->submit.waiter;
WRITE_ONCE(dio->submit.waiter, NULL);
blk_wake_io_task(waiter);
} else if (dio->flags & IOMAP_DIO_WRITE) {
struct inode *inode = file_inode(dio->iocb->ki_filp);
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
} else {
iomap_dio_complete_work(&dio->aio.work);
}
}
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
bio_release_pages(bio, false);
bio_put(bio);
}
}
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
struct page *page = ZERO_PAGE(0);
int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
bio = bio_alloc(GFP_KERNEL, 1);
bio_set_dev(bio, iter->iomap.bdev);
bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
get_page(page);
__bio_add_page(bio, page, len, 0);
bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
iomap_dio_submit_bio(iter, dio, bio, pos);
}
/*
* Figure out the bio's operation flags from the dio request, the
* mapping, and whether or not we want FUA. Note that we can end up
* clearing the WRITE_FUA flag in the dio request.
*/
static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio,
const struct iomap *iomap, bool use_fua)
{
unsigned int opflags = REQ_SYNC | REQ_IDLE;
if (!(dio->flags & IOMAP_DIO_WRITE)) { WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND);
return REQ_OP_READ;
}
if (iomap->flags & IOMAP_F_ZONE_APPEND)
opflags |= REQ_OP_ZONE_APPEND;
else
opflags |= REQ_OP_WRITE;
if (use_fua) opflags |= REQ_FUA;
else
dio->flags &= ~IOMAP_DIO_WRITE_FUA;
return opflags;
}
static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
struct iomap_dio *dio)
{
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
unsigned int fs_block_size = i_blocksize(inode), pad;
unsigned int align = iov_iter_alignment(dio->submit.iter);
loff_t length = iomap_length(iter);
loff_t pos = iter->pos;
unsigned int bio_opf;
struct bio *bio;
bool need_zeroout = false;
bool use_fua = false;
int nr_pages, ret = 0;
size_t copied = 0;
size_t orig_count;
if ((pos | length | align) & ((1 << blkbits) - 1))
return -EINVAL;
if (iomap->type == IOMAP_UNWRITTEN) { dio->flags |= IOMAP_DIO_UNWRITTEN;
need_zeroout = true;
}
if (iomap->flags & IOMAP_F_SHARED) dio->flags |= IOMAP_DIO_COW; if (iomap->flags & IOMAP_F_NEW) {
need_zeroout = true;
} else if (iomap->type == IOMAP_MAPPED) {
/*
* Use a FUA write if we need datasync semantics, this is a pure
* data IO that doesn't require any metadata updates (including
* after IO completion such as unwritten extent conversion) and
* the underlying device supports FUA. This allows us to avoid
* cache flushes on IO completion.
*/
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && (dio->flags & IOMAP_DIO_WRITE_FUA) && blk_queue_fua(bdev_get_queue(iomap->bdev)))
use_fua = true;
}
/*
* Save the original count and trim the iter to just the extent we
* are operating on right now. The iter will be re-expanded once
* we are done.
*/
orig_count = iov_iter_count(dio->submit.iter);
iov_iter_truncate(dio->submit.iter, length);
if (!iov_iter_count(dio->submit.iter))
goto out;
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
pad = pos & (fs_block_size - 1);
if (pad)
iomap_dio_zero(iter, dio, pos - pad, pad);
}
/*
* Set the operation flags early so that bio_iov_iter_get_pages
* can set up the page vector appropriately for a ZONE_APPEND
* operation.
*/
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
size_t n;
if (dio->error) { iov_iter_revert(dio->submit.iter, copied);
copied = ret = 0;
goto out;
}
bio = bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_write_hint = dio->iocb->ki_hint;
bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
bio->bi_opf = bio_opf;
ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
if (unlikely(ret)) {
/*
* We have to stop part way through an IO. We must fall
* through to the sub-block tail zeroing here, otherwise
* this short IO may expose stale data in the tail of
* the block we haven't written data to.
*/
bio_put(bio);
goto zero_tail;
}
n = bio->bi_iter.bi_size;
if (dio->flags & IOMAP_DIO_WRITE) {
task_io_account_write(n);
} else {
if (dio->flags & IOMAP_DIO_DIRTY) bio_set_pages_dirty(bio);
}
dio->size += n;
copied += n;
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
BIO_MAX_VECS);
iomap_dio_submit_bio(iter, dio, bio, pos);
pos += n;
} while (nr_pages);
/*
* We need to zeroout the tail of a sub-block write if the extent type
* requires zeroing or the write extends beyond EOF. If we don't zero
* the block tail in the latter case, we can expose stale data via mmap
* reads of the EOF block.
*/
zero_tail:
if (need_zeroout || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
/* zero out from the end of the write to the end of the block */
pad = pos & (fs_block_size - 1);
if (pad)
iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
}
out:
/* Undo iter limitation to current extent */
iov_iter_reexpand(dio->submit.iter, orig_count - copied); if (copied) return copied;
return ret;
}
static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
struct iomap_dio *dio)
{
loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
dio->size += length;
if (!length)
return -EFAULT;
return length;
}
static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
struct iomap_dio *dio)
{
const struct iomap *iomap = &iomi->iomap;
struct iov_iter *iter = dio->submit.iter;
void *inline_data = iomap_inline_data(iomap, iomi->pos);
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
size_t copied;
if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
return -EIO;
if (dio->flags & IOMAP_DIO_WRITE) { loff_t size = iomi->inode->i_size;
if (pos > size)
memset(iomap_inline_data(iomap, size), 0, pos - size);
copied = copy_from_iter(inline_data, length, iter);
if (copied) {
if (pos + copied > size) i_size_write(iomi->inode, pos + copied);
mark_inode_dirty(iomi->inode);
}
} else {
copied = copy_to_iter(inline_data, length, iter);
}
dio->size += copied;
if (!copied)
return -EFAULT;
return copied;
}
static loff_t iomap_dio_iter(const struct iomap_iter *iter,
struct iomap_dio *dio)
{
switch (iter->iomap.type) {
case IOMAP_HOLE:
if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
return -EIO;
return iomap_dio_hole_iter(iter, dio);
case IOMAP_UNWRITTEN:
if (!(dio->flags & IOMAP_DIO_WRITE)) return iomap_dio_hole_iter(iter, dio);
return iomap_dio_bio_iter(iter, dio);
case IOMAP_MAPPED:
return iomap_dio_bio_iter(iter, dio);
case IOMAP_INLINE:
return iomap_dio_inline_iter(iter, dio);
case IOMAP_DELALLOC:
/*
* DIO is not serialised against mmap() access at all, and so
* if the page_mkwrite occurs between the writeback and the
* iomap_iter() call in the DIO path, then it will see the
* DELALLOC block that the page-mkwrite allocated.
*/
pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
dio->iocb->ki_filp, current->comm);
return -EIO;
default:
WARN_ON_ONCE(1);
return -EIO;
}
}
/*
* iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
* is being issued as AIO or not. This allows us to optimise pure data writes
* to use REQ_FUA rather than requiring generic_write_sync() to issue a
* REQ_FLUSH post write. This is slightly tricky because a single request here
* can be mapped into multiple disjoint IOs and only a subset of the IOs issued
* may be pure data writes. In that case, we still need to do a full data sync
* completion.
*
* When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
* __iomap_dio_rw can return a partial result if it encounters a non-resident
* page in @iter after preparing a transfer. In that case, the non-resident
* pages can be faulted in and the request resumed with @done_before set to the
* number of bytes previously transferred. The request will then complete with
* the correct total number of bytes transferred; this is essential for
* completing partial requests asynchronously.
*
* Returns -ENOTBLK In case of a page invalidation invalidation failure for
* writes. The callers needs to fall back to buffered I/O in this case.
*/
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, size_t done_before)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
struct iomap_iter iomi = {
.inode = inode,
.pos = iocb->ki_pos,
.len = iov_iter_count(iter),
.flags = IOMAP_DIRECT,
};
loff_t end = iomi.pos + iomi.len - 1, ret = 0;
bool wait_for_completion =
is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
struct blk_plug plug;
struct iomap_dio *dio;
if (!iomi.len)
return NULL;
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
if (!dio)
return ERR_PTR(-ENOMEM);
dio->iocb = iocb;
atomic_set(&dio->ref, 1);
dio->size = 0;
dio->i_size = i_size_read(inode);
dio->dops = dops;
dio->error = 0;
dio->flags = 0;
dio->done_before = done_before;
dio->submit.iter = iter;
dio->submit.waiter = current;
dio->submit.cookie = BLK_QC_T_NONE;
dio->submit.last_queue = NULL;
if (iov_iter_rw(iter) == READ) {
if (iomi.pos >= dio->i_size)
goto out_free_dio;
if (iocb->ki_flags & IOCB_NOWAIT) { if (filemap_range_needs_writeback(mapping, iomi.pos,
end)) {
ret = -EAGAIN;
goto out_free_dio;
}
iomi.flags |= IOMAP_NOWAIT;
}
if (iter_is_iovec(iter))
dio->flags |= IOMAP_DIO_DIRTY;
} else {
iomi.flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (filemap_range_has_page(mapping, iomi.pos, end)) {
ret = -EAGAIN;
goto out_free_dio;
}
iomi.flags |= IOMAP_NOWAIT;
}
/* for data sync or sync, we need sync completion processing */
if (iocb->ki_flags & IOCB_DSYNC) dio->flags |= IOMAP_DIO_NEED_SYNC;
/*
* For datasync only writes, we optimistically try using FUA for
* this IO. Any non-FUA write that occurs will clear this flag,
* hence we know before completion whether a cache flush is
* necessary.
*/
if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) dio->flags |= IOMAP_DIO_WRITE_FUA;
}
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
if (iomi.pos >= dio->i_size || iomi.pos + iomi.len > dio->i_size)
goto out_free_dio;
iomi.flags |= IOMAP_OVERWRITE_ONLY;
}
ret = filemap_write_and_wait_range(mapping, iomi.pos, end);
if (ret)
goto out_free_dio;
if (iov_iter_rw(iter) == WRITE) {
/*
* Try to invalidate cache pages for the range we are writing.
* If this invalidation fails, let the caller fall back to
* buffered I/O.
*/
if (invalidate_inode_pages2_range(mapping,
iomi.pos >> PAGE_SHIFT, end >> PAGE_SHIFT)) { trace_iomap_dio_invalidate_fail(inode, iomi.pos,
iomi.len);
ret = -ENOTBLK;
goto out_free_dio;
}
if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) { ret = sb_init_dio_done_wq(inode->i_sb);
if (ret < 0)
goto out_free_dio;
}
}
inode_dio_begin(inode);
blk_start_plug(&plug);
while ((ret = iomap_iter(&iomi, ops)) > 0) iomi.processed = iomap_dio_iter(&iomi, dio); blk_finish_plug(&plug);
/*
* We only report that we've read data up to i_size.
* Revert iter to a state corresponding to that as some callers (such
* as the splice code) rely on it.
*/
if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size) iov_iter_revert(iter, iomi.pos - dio->i_size); if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) { if (!(iocb->ki_flags & IOCB_NOWAIT))
wait_for_completion = true;
ret = 0;
}
/* magic error code to fall back to buffered I/O */
if (ret == -ENOTBLK) {
wait_for_completion = true;
ret = 0;
}
if (ret < 0)
iomap_dio_set_error(dio, ret);
/*
* If all the writes we issued were FUA, we don't need to flush the
* cache on IO completion. Clear the sync flag for this case.
*/
if (dio->flags & IOMAP_DIO_WRITE_FUA) dio->flags &= ~IOMAP_DIO_NEED_SYNC; WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
WRITE_ONCE(iocb->private, dio->submit.last_queue);
/*
* We are about to drop our additional submission reference, which
* might be the last reference to the dio. There are three different
* ways we can progress here:
*
* (a) If this is the last reference we will always complete and free
* the dio ourselves.
* (b) If this is not the last reference, and we serve an asynchronous
* iocb, we must never touch the dio after the decrement, the
* I/O completion handler will complete and free it.
* (c) If this is not the last reference, but we serve a synchronous
* iocb, the I/O completion handler will wake us up on the drop
* of the final reference, and we will complete and free it here
* after we got woken by the I/O completion handler.
*/
dio->wait_for_completion = wait_for_completion;
if (!atomic_dec_and_test(&dio->ref)) {
if (!wait_for_completion)
return ERR_PTR(-EIOCBQUEUED);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(dio->submit.waiter))
break;
if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || !blk_poll(dio->submit.last_queue,
dio->submit.cookie, true))
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
}
return dio;
out_free_dio:
kfree(dio); if (ret)
return ERR_PTR(ret);
return NULL;
}
EXPORT_SYMBOL_GPL(__iomap_dio_rw);
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, size_t done_before)
{
struct iomap_dio *dio;
dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
if (IS_ERR_OR_NULL(dio))
return PTR_ERR_OR_ZERO(dio);
return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
*
* No idle tick implementation for low and high resolution timers
*
* Started by: Thomas Gleixner and Ingo Molnar
*/
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/nmi.h>
#include <linux/profile.h>
#include <linux/sched/signal.h>
#include <linux/sched/clock.h>
#include <linux/sched/stat.h>
#include <linux/sched/nohz.h>
#include <linux/sched/loadavg.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
#include <linux/mm.h>
#include <asm/irq_regs.h>
#include "tick-internal.h"
#include <trace/events/timer.h>
/*
* Per-CPU nohz control structure
*/
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
struct tick_sched *tick_get_tick_sched(int cpu)
{
return &per_cpu(tick_cpu_sched, cpu);
}
#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
/*
* The time, when the last jiffy update happened. Write access must hold
* jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
* consistent view of jiffies and last_jiffies_update.
*/
static ktime_t last_jiffies_update;
/*
* Must be called with interrupts disabled !
*/
static void tick_do_update_jiffies64(ktime_t now)
{
unsigned long ticks = 1;
ktime_t delta, nextp;
/*
* 64bit can do a quick check without holding jiffies lock and
* without looking at the sequence count. The smp_load_acquire()
* pairs with the update done later in this function.
*
* 32bit cannot do that because the store of tick_next_period
* consists of two 32bit stores and the first store could move it
* to a random point in the future.
*/
if (IS_ENABLED(CONFIG_64BIT)) {
if (ktime_before(now, smp_load_acquire(&tick_next_period)))
return;
} else {
unsigned int seq;
/*
* Avoid contention on jiffies_lock and protect the quick
* check with the sequence count.
*/
do {
seq = read_seqcount_begin(&jiffies_seq);
nextp = tick_next_period;
} while (read_seqcount_retry(&jiffies_seq, seq));
if (ktime_before(now, nextp))
return;
}
/* Quick check failed, i.e. update is required. */
raw_spin_lock(&jiffies_lock);
/*
* Reevaluate with the lock held. Another CPU might have done the
* update already.
*/
if (ktime_before(now, tick_next_period)) {
raw_spin_unlock(&jiffies_lock);
return;
}
write_seqcount_begin(&jiffies_seq);
delta = ktime_sub(now, tick_next_period);
if (unlikely(delta >= TICK_NSEC)) {
/* Slow path for long idle sleep times */
s64 incr = TICK_NSEC;
ticks += ktime_divns(delta, incr);
last_jiffies_update = ktime_add_ns(last_jiffies_update,
incr * ticks);
} else {
last_jiffies_update = ktime_add_ns(last_jiffies_update,
TICK_NSEC);
}
/* Advance jiffies to complete the jiffies_seq protected job */
jiffies_64 += ticks;
/*
* Keep the tick_next_period variable up to date.
*/
nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);
if (IS_ENABLED(CONFIG_64BIT)) {
/*
* Pairs with smp_load_acquire() in the lockless quick
* check above and ensures that the update to jiffies_64 is
* not reordered vs. the store to tick_next_period, neither
* by the compiler nor by the CPU.
*/
smp_store_release(&tick_next_period, nextp);
} else {
/*
* A plain store is good enough on 32bit as the quick check
* above is protected by the sequence count.
*/
tick_next_period = nextp;
}
/*
* Release the sequence count. calc_global_load() below is not
* protected by it, but jiffies_lock needs to be held to prevent
* concurrent invocations.
*/
write_seqcount_end(&jiffies_seq);
calc_global_load();
raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
/*
* Initialize and return retrieve the jiffies update.
*/
static ktime_t tick_init_jiffy_update(void)
{
ktime_t period;
raw_spin_lock(&jiffies_lock);
write_seqcount_begin(&jiffies_seq);
/* Did we start the jiffies update yet ? */
if (last_jiffies_update == 0)
last_jiffies_update = tick_next_period;
period = last_jiffies_update;
write_seqcount_end(&jiffies_seq);
raw_spin_unlock(&jiffies_lock);
return period;
}
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
int cpu = smp_processor_id();
#ifdef CONFIG_NO_HZ_COMMON
/*
* Check if the do_timer duty was dropped. We don't care about
* concurrency: This happens only when the CPU in charge went
* into a long sleep. If two CPUs happen to assign themselves to
* this duty, then the jiffies update is still serialized by
* jiffies_lock.
*
* If nohz_full is enabled, this should not happen because the
* tick_do_timer_cpu never relinquishes.
*/
if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
WARN_ON_ONCE(tick_nohz_full_running);
#endif
tick_do_timer_cpu = cpu;
}
#endif
/* Check, if the jiffies need an update */
if (tick_do_timer_cpu == cpu)
tick_do_update_jiffies64(now);
if (ts->inidle)
ts->got_idle_tick = 1;
}
static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
#ifdef CONFIG_NO_HZ_COMMON
/*
* When we are idle and the tick is stopped, we have to touch
* the watchdog as we might not schedule for a really long
* time. This happens on complete idle SMP systems while
* waiting on the login prompt. We also increment the "start of
* idle" jiffy stamp so the idle accounting adjustment we do
* when we go busy again does not account too much ticks.
*/
if (ts->tick_stopped) {
touch_softlockup_watchdog_sched();
if (is_idle_task(current))
ts->idle_jiffies++;
/*
* In case the current tick fired too early past its expected
* expiration, make sure we don't bypass the next clock reprogramming
* to the same deadline.
*/
ts->next_tick = 0;
}
#endif
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
}
#endif
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
bool tick_nohz_full_running;
EXPORT_SYMBOL_GPL(tick_nohz_full_running);
static atomic_t tick_dep_mask;
static bool check_tick_dependency(atomic_t *dep)
{
int val = atomic_read(dep);
if (val & TICK_DEP_MASK_POSIX_TIMER) {
trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
return true;
}
if (val & TICK_DEP_MASK_PERF_EVENTS) {
trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
return true;
}
if (val & TICK_DEP_MASK_SCHED) {
trace_tick_stop(0, TICK_DEP_MASK_SCHED);
return true;
}
if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
return true;
}
if (val & TICK_DEP_MASK_RCU) {
trace_tick_stop(0, TICK_DEP_MASK_RCU);
return true;
}
return false;
}
static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
lockdep_assert_irqs_disabled();
if (unlikely(!cpu_online(cpu)))
return false;
if (check_tick_dependency(&tick_dep_mask))
return false;
if (check_tick_dependency(&ts->tick_dep_mask))
return false;
if (check_tick_dependency(¤t->tick_dep_mask))
return false;
if (check_tick_dependency(¤t->signal->tick_dep_mask))
return false;
return true;
}
static void nohz_full_kick_func(struct irq_work *work)
{
/* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
IRQ_WORK_INIT_HARD(nohz_full_kick_func);
/*
* Kick this CPU if it's full dynticks in order to force it to
* re-evaluate its dependency on the tick and restart it if necessary.
* This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
* is NMI safe.
*/
static void tick_nohz_full_kick(void)
{
if (!tick_nohz_full_cpu(smp_processor_id()))
return;
irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
}
/*
* Kick the CPU if it's full dynticks in order to force it to
* re-evaluate its dependency on the tick and restart it if necessary.
*/
void tick_nohz_full_kick_cpu(int cpu)
{
if (!tick_nohz_full_cpu(cpu))
return;
irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}
static void tick_nohz_kick_task(struct task_struct *tsk)
{
int cpu;
/*
* If the task is not running, run_posix_cpu_timers()
* has nothing to elapse, IPI can then be spared.
*
* activate_task() STORE p->tick_dep_mask
* STORE p->on_rq
* __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or())
* LOCK rq->lock LOAD p->on_rq
* smp_mb__after_spin_lock()
* tick_nohz_task_switch()
* LOAD p->tick_dep_mask
*/
if (!sched_task_on_rq(tsk))
return;
/*
* If the task concurrently migrates to another CPU,
* we guarantee it sees the new tick dependency upon
* schedule.
*
* set_task_cpu(p, cpu);
* STORE p->cpu = @cpu
* __schedule() (switch to task 'p')
* LOCK rq->lock
* smp_mb__after_spin_lock() STORE p->tick_dep_mask
* tick_nohz_task_switch() smp_mb() (atomic_fetch_or())
* LOAD p->tick_dep_mask LOAD p->cpu
*/
cpu = task_cpu(tsk);
preempt_disable();
if (cpu_online(cpu))
tick_nohz_full_kick_cpu(cpu);
preempt_enable();
}
/*
* Kick all full dynticks CPUs in order to force these to re-evaluate
* their dependency on the tick and restart it if necessary.
*/
static void tick_nohz_full_kick_all(void)
{
int cpu;
if (!tick_nohz_full_running)
return;
preempt_disable();
for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
tick_nohz_full_kick_cpu(cpu);
preempt_enable();
}
static void tick_nohz_dep_set_all(atomic_t *dep,
enum tick_dep_bits bit)
{
int prev;
prev = atomic_fetch_or(BIT(bit), dep);
if (!prev)
tick_nohz_full_kick_all();
}
/*
* Set a global tick dependency. Used by perf events that rely on freq and
* by unstable clock.
*/
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
tick_nohz_dep_set_all(&tick_dep_mask, bit);
}
void tick_nohz_dep_clear(enum tick_dep_bits bit)
{
atomic_andnot(BIT(bit), &tick_dep_mask);
}
/*
* Set per-CPU tick dependency. Used by scheduler and perf events in order to
* manage events throttling.
*/
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
int prev;
struct tick_sched *ts;
ts = per_cpu_ptr(&tick_cpu_sched, cpu);
prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask);
if (!prev) {
preempt_disable();
/* Perf needs local kick that is NMI safe */
if (cpu == smp_processor_id()) {
tick_nohz_full_kick();
} else {
/* Remote irq work not NMI-safe */
if (!WARN_ON_ONCE(in_nmi()))
tick_nohz_full_kick_cpu(cpu);
}
preempt_enable();
}
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);
void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
/*
* Set a per-task tick dependency. RCU need this. Also posix CPU timers
* in order to elapse per task timers.
*/
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
tick_nohz_kick_task(tsk);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);
/*
* Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
* per process timers.
*/
void tick_nohz_dep_set_signal(struct task_struct *tsk,
enum tick_dep_bits bit)
{
int prev;
struct signal_struct *sig = tsk->signal;
prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
if (!prev) {
struct task_struct *t;
lockdep_assert_held(&tsk->sighand->siglock);
__for_each_thread(sig, t)
tick_nohz_kick_task(t);
}
}
void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
atomic_andnot(BIT(bit), &sig->tick_dep_mask);
}
/*
* Re-evaluate the need for the tick as we switch the current task.
* It might need the tick due to per task/process properties:
* perf events, posix CPU timers, ...
*/
void __tick_nohz_task_switch(void)
{
struct tick_sched *ts;
if (!tick_nohz_full_cpu(smp_processor_id()))
return;
ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->tick_stopped) {
if (atomic_read(¤t->tick_dep_mask) ||
atomic_read(¤t->signal->tick_dep_mask))
tick_nohz_full_kick();
}
}
/* Get the boot-time nohz CPU list from the kernel parameters. */
void __init tick_nohz_full_setup(cpumask_var_t cpumask)
{
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
cpumask_copy(tick_nohz_full_mask, cpumask);
tick_nohz_full_running = true;
}
EXPORT_SYMBOL_GPL(tick_nohz_full_setup);
static int tick_nohz_cpu_down(unsigned int cpu)
{
/*
* The tick_do_timer_cpu CPU handles housekeeping duty (unbound
* timers, workqueues, timekeeping, ...) on behalf of full dynticks
* CPUs. It must remain online when nohz full is enabled.
*/
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
return -EBUSY;
return 0;
}
void __init tick_nohz_init(void)
{
int cpu, ret;
if (!tick_nohz_full_running)
return;
/*
* Full dynticks uses irq work to drive the tick rescheduling on safe
* locking contexts. But then we need irq work to raise its own
* interrupts to avoid circular dependency on the tick
*/
if (!arch_irq_work_has_interrupt()) {
pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
cpumask_clear(tick_nohz_full_mask);
tick_nohz_full_running = false;
return;
}
if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
!IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
pr_warn("NO_HZ: Clearing %d from nohz_full range "
"for timekeeping\n", cpu);
cpumask_clear_cpu(cpu, tick_nohz_full_mask);
}
}
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
"kernel/nohz:predown", NULL,
tick_nohz_cpu_down);
WARN_ON(ret < 0);
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
}
#endif
/*
* NOHZ - aka dynamic tick functionality
*/
#ifdef CONFIG_NO_HZ_COMMON
/*
* NO HZ enabled ?
*/
bool tick_nohz_enabled __read_mostly = true;
unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
static int __init setup_tick_nohz(char *str)
{
return (kstrtobool(str, &tick_nohz_enabled) == 0);
}
__setup("nohz=", setup_tick_nohz);
bool tick_nohz_tick_stopped(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
return ts->tick_stopped;
}
bool tick_nohz_tick_stopped_cpu(int cpu)
{
struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
return ts->tick_stopped;
}
/**
* tick_nohz_update_jiffies - update jiffies when idle was interrupted
*
* Called from interrupt entry when the CPU was idle
*
* In case the sched_tick was stopped on this CPU, we have to check if jiffies
* must be updated. Otherwise an interrupt handler could use a stale jiffy
* value. We do this unconditionally on any CPU, as we don't know whether the
* CPU, which has the update task assigned is in a long sleep.
*/
static void tick_nohz_update_jiffies(ktime_t now)
{
unsigned long flags;
__this_cpu_write(tick_cpu_sched.idle_waketime, now);
local_irq_save(flags);
tick_do_update_jiffies64(now);
local_irq_restore(flags);
touch_softlockup_watchdog_sched();
}
/*
* Updates the per-CPU time idle statistics counters
*/
static void
update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
{
ktime_t delta;
if (ts->idle_active) {
delta = ktime_sub(now, ts->idle_entrytime);
if (nr_iowait_cpu(cpu) > 0)
ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
else
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
ts->idle_entrytime = now;
}
if (last_update_time)
*last_update_time = ktime_to_us(now);
}
static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
update_ts_time_stats(smp_processor_id(), ts, now, NULL);
ts->idle_active = 0;
sched_clock_idle_wakeup_event();
}
static void tick_nohz_start_idle(struct tick_sched *ts)
{
ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
sched_clock_idle_sleep_event();
}
/**
* get_cpu_idle_time_us - get the total idle time of a CPU
* @cpu: CPU number to query
* @last_update_time: variable to store update time in. Do not update
* counters if NULL.
*
* Return the cumulative idle time (since boot) for a given
* CPU, in microseconds.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
*
* This function returns -1 if NOHZ is not enabled.
*/
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, idle;
if (!tick_nohz_active)
return -1;
now = ktime_get();
if (last_update_time) {
update_ts_time_stats(cpu, ts, now, last_update_time);
idle = ts->idle_sleeptime;
} else {
if (ts->idle_active && !nr_iowait_cpu(cpu)) {
ktime_t delta = ktime_sub(now, ts->idle_entrytime);
idle = ktime_add(ts->idle_sleeptime, delta);
} else {
idle = ts->idle_sleeptime;
}
}
return ktime_to_us(idle);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
/**
* get_cpu_iowait_time_us - get the total iowait time of a CPU
* @cpu: CPU number to query
* @last_update_time: variable to store update time in. Do not update
* counters if NULL.
*
* Return the cumulative iowait time (since boot) for a given
* CPU, in microseconds.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
*
* This function returns -1 if NOHZ is not enabled.
*/
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, iowait;
if (!tick_nohz_active)
return -1;
now = ktime_get();
if (last_update_time) {
update_ts_time_stats(cpu, ts, now, last_update_time);
iowait = ts->iowait_sleeptime;
} else {
if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
ktime_t delta = ktime_sub(now, ts->idle_entrytime);
iowait = ktime_add(ts->iowait_sleeptime, delta);
} else {
iowait = ts->iowait_sleeptime;
}
}
return ktime_to_us(iowait);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
hrtimer_cancel(&ts->sched_timer);
hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
/* Forward the time to expire in the future */
hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
hrtimer_start_expires(&ts->sched_timer,
HRTIMER_MODE_ABS_PINNED_HARD);
} else {
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
/*
* Reset to make sure next tick stop doesn't get fooled by past
* cached clock deadline.
*/
ts->next_tick = 0;
}
static inline bool local_timer_softirq_pending(void)
{
return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
}
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
unsigned long basejiff;
unsigned int seq;
/* Read jiffies and the time when jiffies were updated last */
do {
seq = read_seqcount_begin(&jiffies_seq);
basemono = last_jiffies_update;
basejiff = jiffies;
} while (read_seqcount_retry(&jiffies_seq, seq));
ts->last_jiffies = basejiff;
ts->timer_expires_base = basemono;
/*
* Keep the periodic tick, when RCU, architecture or irq_work
* requests it.
* Aside of that check whether the local timer softirq is
* pending. If so its a bad idea to call get_next_timer_interrupt()
* because there is an already expired timer, so it will request
* immediate expiry, which rearms the hardware timer with a
* minimal delta which brings us back to this place
* immediately. Lather, rinse and repeat...
*/
if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
/*
* Get the next pending timer. If high resolution
* timers are enabled this only takes the timer wheel
* timers into account. If high resolution timers are
* disabled this also looks at the next expiring
* hrtimer.
*/
next_tmr = get_next_timer_interrupt(basejiff, basemono);
ts->next_timer = next_tmr;
/* Take the next rcu event into account */
next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
}
/*
* If the tick is due in the next period, keep it ticking or
* force prod the timer.
*/
delta = next_tick - basemono;
if (delta <= (u64)TICK_NSEC) {
/*
* Tell the timer code that the base is not idle, i.e. undo
* the effect of get_next_timer_interrupt():
*/
timer_clear_idle();
/*
* We've not stopped the tick yet, and there's a timer in the
* next period, so no point in stopping it either, bail.
*/
if (!ts->tick_stopped) {
ts->timer_expires = 0;
goto out;
}
}
/*
* If this CPU is the one which had the do_timer() duty last, we limit
* the sleep time to the timekeeping max_deferment value.
* Otherwise we can sleep as long as we want.
*/
delta = timekeeping_max_deferment();
if (cpu != tick_do_timer_cpu &&
(tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last))
delta = KTIME_MAX;
/* Calculate the next expiry time */
if (delta < (KTIME_MAX - basemono))
expires = basemono + delta;
else
expires = KTIME_MAX;
ts->timer_expires = min_t(u64, expires, next_tick);
out:
return ts->timer_expires;
}
static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
u64 basemono = ts->timer_expires_base;
u64 expires = ts->timer_expires;
ktime_t tick = expires;
/* Make sure we won't be trying to stop it twice in a row. */
ts->timer_expires_base = 0;
/*
* If this CPU is the one which updates jiffies, then give up
* the assignment and let it be taken by the CPU which runs
* the tick timer next, which might be this CPU as well. If we
* don't drop this here the jiffies might be stale and
* do_timer() never invoked. Keep track of the fact that it
* was the one which had the do_timer() duty last.
*/
if (cpu == tick_do_timer_cpu) {
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
ts->do_timer_last = 1;
} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
ts->do_timer_last = 0;
}
/* Skip reprogram of event if its not changed */
if (ts->tick_stopped && (expires == ts->next_tick)) {
/* Sanity check: make sure clockevent is actually programmed */
if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
return;
WARN_ON_ONCE(1);
printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
basemono, ts->next_tick, dev->next_event,
hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer));
}
/*
* nohz_stop_sched_tick can be called several times before
* the nohz_restart_sched_tick is called. This happens when
* interrupts arrive which do not cause a reschedule. In the
* first call we save the current tick time, so we can restart
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
quiet_vmstat();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
ts->next_tick = tick;
/*
* If the expiration time == KTIME_MAX, then we simply stop
* the tick timer.
*/
if (unlikely(expires == KTIME_MAX)) {
if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
hrtimer_cancel(&ts->sched_timer);
return;
}
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
hrtimer_start(&ts->sched_timer, tick,
HRTIMER_MODE_ABS_PINNED_HARD);
} else {
hrtimer_set_expires(&ts->sched_timer, tick);
tick_program_event(tick, 1);
}
}
static void tick_nohz_retain_tick(struct tick_sched *ts)
{
ts->timer_expires_base = 0;
}
#ifdef CONFIG_NO_HZ_FULL
static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu)
{
if (tick_nohz_next_event(ts, cpu))
tick_nohz_stop_tick(ts, cpu);
else
tick_nohz_retain_tick(ts);
}
#endif /* CONFIG_NO_HZ_FULL */
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
tick_do_update_jiffies64(now);
/*
* Clear the timer idle flag, so we avoid IPIs on remote queueing and
* the clock forward checks in the enqueue path:
*/
timer_clear_idle();
calc_load_nohz_stop();
touch_softlockup_watchdog_sched();
/*
* Cancel the scheduled timer and restore the tick
*/
ts->tick_stopped = 0;
tick_nohz_restart(ts, now);
}
static void __tick_nohz_full_update_tick(struct tick_sched *ts,
ktime_t now)
{
#ifdef CONFIG_NO_HZ_FULL
int cpu = smp_processor_id();
if (can_stop_full_tick(cpu, ts))
tick_nohz_stop_sched_tick(ts, cpu);
else if (ts->tick_stopped)
tick_nohz_restart_sched_tick(ts, now);
#endif
}
static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
if (!tick_nohz_full_cpu(smp_processor_id()))
return;
if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
return;
__tick_nohz_full_update_tick(ts, ktime_get());
}
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
/*
* If this CPU is offline and it is the one which updates
* jiffies, then give up the assignment and let it be taken by
* the CPU which runs the tick timer next. If we don't drop
* this here the jiffies might be stale and do_timer() never
* invoked.
*/
if (unlikely(!cpu_online(cpu))) {
if (cpu == tick_do_timer_cpu)
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
/*
* Make sure the CPU doesn't get fooled by obsolete tick
* deadline if it comes back online later.
*/
ts->next_tick = 0;
return false;
}
if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
return false;
if (need_resched())
return false;
if (unlikely(local_softirq_pending())) {
static int ratelimit;
if (ratelimit < 10 && !local_bh_blocked() &&
(local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n",
(unsigned int) local_softirq_pending());
ratelimit++;
}
return false;
}
if (tick_nohz_full_enabled()) {
/*
* Keep the tick alive to guarantee timekeeping progression
* if there are full dynticks CPUs around
*/
if (tick_do_timer_cpu == cpu)
return false;
/* Should not happen for nohz-full */
if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
return false;
}
return true;
}
static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
{
ktime_t expires;
int cpu = smp_processor_id();
/*
* If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
* tick timer expiration time is known already.
*/
if (ts->timer_expires_base)
expires = ts->timer_expires;
else if (can_stop_idle_tick(cpu, ts))
expires = tick_nohz_next_event(ts, cpu);
else
return;
ts->idle_calls++;
if (expires > 0LL) {
int was_stopped = ts->tick_stopped;
tick_nohz_stop_tick(ts, cpu);
ts->idle_sleeps++;
ts->idle_expires = expires;
if (!was_stopped && ts->tick_stopped) {
ts->idle_jiffies = ts->last_jiffies;
nohz_balance_enter_idle(cpu);
}
} else {
tick_nohz_retain_tick(ts);
}
}
/**
* tick_nohz_idle_stop_tick - stop the idle tick from the idle task
*
* When the next event is more than a tick into the future, stop the idle tick
*/
void tick_nohz_idle_stop_tick(void)
{
__tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
}
void tick_nohz_idle_retain_tick(void)
{
tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
/*
* Undo the effect of get_next_timer_interrupt() called from
* tick_nohz_next_event().
*/
timer_clear_idle();
}
/**
* tick_nohz_idle_enter - prepare for entering idle on the current CPU
*
* Called when we start the idle loop.
*/
void tick_nohz_idle_enter(void)
{
struct tick_sched *ts;
lockdep_assert_irqs_enabled();
local_irq_disable();
ts = this_cpu_ptr(&tick_cpu_sched);
WARN_ON_ONCE(ts->timer_expires_base);
ts->inidle = 1;
tick_nohz_start_idle(ts);
local_irq_enable();
}
/**
* tick_nohz_irq_exit - update next tick event from interrupt exit
*
* When an interrupt fires while we are idle and it doesn't cause
* a reschedule, it may still add, modify or delete a timer, enqueue
* an RCU callback, etc...
* So we need to re-calculate and reprogram the next tick event.
*/
void tick_nohz_irq_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->inidle)
tick_nohz_start_idle(ts);
else
tick_nohz_full_update_tick(ts);
}
/**
* tick_nohz_idle_got_tick - Check whether or not the tick handler has run
*/
bool tick_nohz_idle_got_tick(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->got_idle_tick) {
ts->got_idle_tick = 0;
return true;
}
return false;
}
/**
* tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
* or the tick, whatever that expires first. Note that, if the tick has been
* stopped, it returns the next hrtimer.
*
* Called from power state control code with interrupts disabled
*/
ktime_t tick_nohz_get_next_hrtimer(void)
{
return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
}
/**
* tick_nohz_get_sleep_length - return the expected length of the current sleep
* @delta_next: duration until the next event if the tick cannot be stopped
*
* Called from power state control code with interrupts disabled.
*
* The return value of this function and/or the value returned by it through the
* @delta_next pointer can be negative which must be taken into account by its
* callers.
*/
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
int cpu = smp_processor_id();
/*
* The idle entry time is expected to be a sufficient approximation of
* the current time at this point.
*/
ktime_t now = ts->idle_entrytime;
ktime_t next_event;
WARN_ON_ONCE(!ts->inidle);
*delta_next = ktime_sub(dev->next_event, now);
if (!can_stop_idle_tick(cpu, ts))
return *delta_next;
next_event = tick_nohz_next_event(ts, cpu);
if (!next_event)
return *delta_next;
/*
* If the next highres timer to expire is earlier than next_event, the
* idle governor needs to know that.
*/
next_event = min_t(u64, next_event,
hrtimer_next_event_without(&ts->sched_timer));
return ktime_sub(next_event, now);
}
/**
* tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
* for a particular CPU.
*
* Called from the schedutil frequency scaling governor in scheduler context.
*/
unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
{
struct tick_sched *ts = tick_get_tick_sched(cpu);
return ts->idle_calls;
}
/**
* tick_nohz_get_idle_calls - return the current idle calls counter value
*
* Called from the schedutil frequency scaling governor in scheduler context.
*/
unsigned long tick_nohz_get_idle_calls(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
return ts->idle_calls;
}
static void tick_nohz_account_idle_time(struct tick_sched *ts,
ktime_t now)
{
unsigned long ticks;
ts->idle_exittime = now;
if (vtime_accounting_enabled_this_cpu())
return;
/*
* We stopped the tick in idle. Update process times would miss the
* time we slept as update_process_times does only a 1 tick
* accounting. Enforce that this is accounted to idle !
*/
ticks = jiffies - ts->idle_jiffies;
/*
* We might be one off. Do not randomly account a huge number of ticks!
*/
if (ticks && ticks < LONG_MAX)
account_idle_ticks(ticks);
}
void tick_nohz_idle_restart_tick(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->tick_stopped) {
ktime_t now = ktime_get();
tick_nohz_restart_sched_tick(ts, now);
tick_nohz_account_idle_time(ts, now);
}
}
static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
{
if (tick_nohz_full_cpu(smp_processor_id()))
__tick_nohz_full_update_tick(ts, now);
else
tick_nohz_restart_sched_tick(ts, now);
tick_nohz_account_idle_time(ts, now);
}
/**
* tick_nohz_idle_exit - restart the idle tick from the idle task
*
* Restart the idle tick when the CPU is woken up from idle
* This also exit the RCU extended quiescent state. The CPU
* can use RCU again after this function is called.
*/
void tick_nohz_idle_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
bool idle_active, tick_stopped;
ktime_t now;
local_irq_disable();
WARN_ON_ONCE(!ts->inidle);
WARN_ON_ONCE(ts->timer_expires_base);
ts->inidle = 0;
idle_active = ts->idle_active;
tick_stopped = ts->tick_stopped;
if (idle_active || tick_stopped)
now = ktime_get();
if (idle_active)
tick_nohz_stop_idle(ts, now);
if (tick_stopped)
tick_nohz_idle_update_tick(ts, now);
local_irq_enable();
}
/*
* The nohz low res interrupt handler
*/
static void tick_nohz_handler(struct clock_event_device *dev)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
dev->next_event = KTIME_MAX;
tick_sched_do_timer(ts, now);
tick_sched_handle(ts, regs);
/* No need to reprogram if we are running tickless */
if (unlikely(ts->tick_stopped))
return;
hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
{
if (!tick_nohz_enabled)
return;
ts->nohz_mode = mode;
/* One update is enough */
if (!test_and_set_bit(0, &tick_nohz_active))
timers_update_nohz();
}
/**
* tick_nohz_switch_to_nohz - switch to nohz mode
*/
static void tick_nohz_switch_to_nohz(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t next;
if (!tick_nohz_enabled)
return;
if (tick_switch_to_oneshot(tick_nohz_handler))
return;
/*
* Recycle the hrtimer in ts, so we can share the
* hrtimer_forward with the highres code.
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
/* Get the next period */
next = tick_init_jiffy_update();
hrtimer_set_expires(&ts->sched_timer, next);
hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
static inline void tick_nohz_irq_enter(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now;
if (!ts->idle_active && !ts->tick_stopped)
return;
now = ktime_get();
if (ts->idle_active)
tick_nohz_stop_idle(ts, now);
if (ts->tick_stopped)
tick_nohz_update_jiffies(now);
}
#else
static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
* Called from irq_enter to notify about the possible interruption of idle()
*/
void tick_irq_enter(void)
{
tick_check_oneshot_broadcast_this_cpu();
tick_nohz_irq_enter();
}
/*
* High resolution timer specific code
*/
#ifdef CONFIG_HIGH_RES_TIMERS
/*
* We rearm the timer until we get disabled by the idle code.
* Called with interrupts disabled.
*/
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
struct tick_sched *ts =
container_of(timer, struct tick_sched, sched_timer);
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
tick_sched_do_timer(ts, now);
/*
* Do not call, when we are not in irq context and have
* no valid regs pointer
*/
if (regs)
tick_sched_handle(ts, regs);
else
ts->next_tick = 0;
/* No need to reprogram if we are in idle or full dynticks mode */
if (unlikely(ts->tick_stopped))
return HRTIMER_NORESTART;
hrtimer_forward(timer, now, TICK_NSEC);
return HRTIMER_RESTART;
}
static int sched_skew_tick;
static int __init skew_tick(char *str)
{
get_option(&str, &sched_skew_tick);
return 0;
}
early_param("skew_tick", skew_tick);
/**
* tick_setup_sched_timer - setup the tick emulation timer
*/
void tick_setup_sched_timer(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now = ktime_get();
/*
* Emulate tick processing via per-CPU hrtimers:
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
ts->sched_timer.function = tick_sched_timer;
/* Get the next period (per-CPU) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
/* Offset the tick to avert jiffies_lock contention. */
if (sched_skew_tick) {
u64 offset = TICK_NSEC >> 1;
do_div(offset, num_possible_cpus());
offset *= smp_processor_id();
hrtimer_add_expires_ns(&ts->sched_timer, offset);
}
hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
}
#endif /* HIGH_RES_TIMERS */
#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
void tick_cancel_sched_timer(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
# ifdef CONFIG_HIGH_RES_TIMERS
if (ts->sched_timer.base)
hrtimer_cancel(&ts->sched_timer);
# endif
memset(ts, 0, sizeof(*ts));
}
#endif
/**
* Async notification about clocksource changes
*/
void tick_clock_notify(void)
{
int cpu;
for_each_possible_cpu(cpu)
set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}
/*
* Async notification about clock event changes
*/
void tick_oneshot_notify(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
set_bit(0, &ts->check_clocks);
}
/**
* Check, if a change happened, which makes oneshot possible.
*
* Called cyclic from the hrtimer softirq (driven by the timer
* softirq) allow_nohz signals, that we can switch into low-res nohz
* mode, because high resolution timers are disabled (either compile
* or runtime). Called with interrupts disabled.
*/
int tick_check_oneshot_change(int allow_nohz)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (!test_and_clear_bit(0, &ts->check_clocks))
return 0;
if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
return 0;
if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
return 0;
if (!allow_nohz)
return 1;
tick_nohz_switch_to_nohz();
return 0;
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_64_H
#define _ASM_X86_PAGE_64_H
#include <asm/page_64_types.h>
#ifndef __ASSEMBLY__
#include <asm/alternative.h>
/* duplicated to the one in bootmem.h */
extern unsigned long max_pfn;
extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
static inline unsigned long __phys_addr_nodebug(unsigned long x)
{
unsigned long y = x - __START_KERNEL_map;
/* use the carry flag to determine if x was < __START_KERNEL_map */
x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));
return x;
}
#ifdef CONFIG_DEBUG_VIRTUAL
extern unsigned long __phys_addr(unsigned long);
extern unsigned long __phys_addr_symbol(unsigned long);
#else
#define __phys_addr(x) __phys_addr_nodebug(x)
#define __phys_addr_symbol(x) \
((unsigned long)(x) - __START_KERNEL_map + phys_base)
#endif
#define __phys_reloc_hide(x) (x)
#ifdef CONFIG_FLATMEM
#define pfn_valid(pfn) ((pfn) < max_pfn)
#endif
void clear_page_orig(void *page);
void clear_page_rep(void *page);
void clear_page_erms(void *page);
static inline void clear_page(void *page)
{
alternative_call_2(clear_page_orig,
clear_page_rep, X86_FEATURE_REP_GOOD,
clear_page_erms, X86_FEATURE_ERMS,
"=D" (page),
"0" (page)
: "cc", "memory", "rax", "rcx");
}
void copy_page(void *to, void *from);
#ifdef CONFIG_X86_5LEVEL
/*
* User space process size. This is the first address outside the user range.
* There are a few constraints that determine this:
*
* On Intel CPUs, if a SYSCALL instruction is at the highest canonical
* address, then that syscall will enter the kernel with a
* non-canonical return address, and SYSRET will explode dangerously.
* We avoid this particular problem by preventing anything
* from being mapped at the maximum canonical address.
*
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the
* CPUs malfunction if they execute code from the highest canonical page.
* They'll speculate right off the end of the canonical space, and
* bad things happen. This is worked around in the same way as the
* Intel problem.
*
* With page table isolation enabled, we map the LDT in ... [stay tuned]
*/
static __always_inline unsigned long task_size_max(void)
{
unsigned long ret;
alternative_io("movq %[small],%0","movq %[large],%0",
X86_FEATURE_LA57,
"=r" (ret),
[small] "i" ((1ul << 47)-PAGE_SIZE),
[large] "i" ((1ul << 56)-PAGE_SIZE));
return ret;
}
#endif /* CONFIG_X86_5LEVEL */
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_VSYSCALL_EMULATION
# define __HAVE_ARCH_GATE_AREA 1
#endif
#endif /* _ASM_X86_PAGE_64_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
* A generic implementation of binary search for the Linux kernel
*
* Copyright (C) 2008-2009 Ksplice, Inc.
* Author: Tim Abbott <tabbott@ksplice.com>
*/
#include <linux/export.h>
#include <linux/bsearch.h>
#include <linux/kprobes.h>
/*
* bsearch - binary search an array of elements
* @key: pointer to item being searched for
* @base: pointer to first element to search
* @num: number of elements
* @size: size of each element
* @cmp: pointer to comparison function
*
* This function does a binary search on the given array. The
* contents of the array should already be in ascending sorted order
* under the provided comparison function.
*
* Note that the key need not have the same type as the elements in
* the array, e.g. key could be a string and the comparison function
* could compare the string with the struct's name field. However, if
* the key and elements in the array are of the same type, you can use
* the same comparison function for both sort() and bsearch().
*/
void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
return __inline_bsearch(key, base, num, size, cmp);
}
EXPORT_SYMBOL(bsearch);
NOKPROBE_SYMBOL(bsearch);
// SPDX-License-Identifier: GPL-2.0
/*
* SUCS NET3:
*
* Generic datagram handling routines. These are generic for all
* protocols. Possibly a generic IP version on top of these would
* make sense. Not tonight however 8-).
* This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
* NetROM layer all have identical poll code and mostly
* identical recvmsg() code. So we share it here. The poll was
* shared before but buried in udp.c so I moved it.
*
* Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
* udp.c code)
*
* Fixes:
* Alan Cox : NULL return from skb_peek_copy()
* understood
* Alan Cox : Rewrote skb_read_datagram to avoid the
* skb_peek_copy stuff.
* Alan Cox : Added support for SOCK_SEQPACKET.
* IPX can no longer use the SO_TYPE hack
* but AX.25 now works right, and SPX is
* feasible.
* Alan Cox : Fixed write poll of non IP protocol
* crash.
* Florian La Roche: Changed for my new skbuff handling.
* Darryl Miles : Fixed non-blocking SOCK_SEQPACKET.
* Linus Torvalds : BSD semantic fixes.
* Alan Cox : Datagram iovec handling
* Darryl Miles : Fixed non-blocking SOCK_STREAM.
* Alan Cox : POSIXisms
* Pete Wyckoff : Unconnected accept() fix.
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/indirect_call_wrapper.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
#include "datagram.h"
/*
* Is a socket 'connection oriented' ?
*/
static inline int connection_based(struct sock *sk)
{
return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}
static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
void *key)
{
/*
* Avoid a wakeup if event not interesting for us
*/
if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
return 0;
return autoremove_wake_function(wait, mode, sync, key);}
/*
* Wait for the last received packet to be different from skb
*/
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
int *err, long *timeo_p,
const struct sk_buff *skb)
{
int error;
DEFINE_WAIT_FUNC(wait, receiver_wake_function);
prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
/* Socket errors? */
error = sock_error(sk);
if (error)
goto out_err;
if (READ_ONCE(queue->prev) != skb)
goto out;
/* Socket shut down? */
if (sk->sk_shutdown & RCV_SHUTDOWN)
goto out_noerr;
/* Sequenced packets can come disconnected.
* If so we report the problem
*/
error = -ENOTCONN;
if (connection_based(sk) &&
!(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
goto out_err;
/* handle signals */
if (signal_pending(current))
goto interrupted;
error = 0;
*timeo_p = schedule_timeout(*timeo_p);
out:
finish_wait(sk_sleep(sk), &wait);
return error;
interrupted:
error = sock_intr_errno(*timeo_p);
out_err:
*err = error;
goto out;
out_noerr:
*err = 0;
error = 1;
goto out;
}
EXPORT_SYMBOL(__skb_wait_for_more_packets);
static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
{
struct sk_buff *nskb;
if (skb->peeked)
return skb;
/* We have to unshare an skb before modifying it. */
if (!skb_shared(skb))
goto done;
nskb = skb_clone(skb, GFP_ATOMIC);
if (!nskb)
return ERR_PTR(-ENOMEM);
skb->prev->next = nskb;
skb->next->prev = nskb;
nskb->prev = skb->prev;
nskb->next = skb->next;
consume_skb(skb);
skb = nskb;
done:
skb->peeked = 1;
return skb;
}
struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
struct sk_buff_head *queue,
unsigned int flags,
int *off, int *err,
struct sk_buff **last)
{
bool peek_at_off = false;
struct sk_buff *skb;
int _off = 0;
if (unlikely(flags & MSG_PEEK && *off >= 0)) {
peek_at_off = true;
_off = *off;
}
*last = queue->prev; skb_queue_walk(queue, skb) {
if (flags & MSG_PEEK) {
if (peek_at_off && _off >= skb->len && (_off || skb->peeked)) { _off -= skb->len;
continue;
}
if (!skb->len) {
skb = skb_set_peeked(skb);
if (IS_ERR(skb)) {
*err = PTR_ERR(skb); return NULL;
}
}
refcount_inc(&skb->users);
} else {
__skb_unlink(skb, queue);
}
*off = _off;
return skb;
}
return NULL;
}
/**
* __skb_try_recv_datagram - Receive a datagram skbuff
* @sk: socket
* @queue: socket queue from which to receive
* @flags: MSG\_ flags
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
* @err: error code returned
* @last: set to last peeked message to inform the wait function
* what to look for when peeking
*
* Get a datagram skbuff, understands the peeking, nonblocking wakeups
* and possible races. This replaces identical code in packet, raw and
* udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
* the long standing peek and read race for datagram sockets. If you
* alter this routine remember it must be re-entrant.
*
* This function will lock the socket if a skb is returned, so
* the caller needs to unlock the socket in that case (usually by
* calling skb_free_datagram). Returns NULL with @err set to
* -EAGAIN if no data was available or to some other value if an
* error was detected.
*
* * It does not lock socket since today. This function is
* * free of race conditions. This measure should/can improve
* * significantly datagram socket latencies at high loads,
* * when data copying to user space takes lots of time.
* * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
* * 8) Great win.)
* * --ANK (980729)
*
* The order of the tests when we find no data waiting are specified
* quite explicitly by POSIX 1003.1g, don't change them without having
* the standard around please.
*/
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
struct sk_buff_head *queue,
unsigned int flags, int *off, int *err,
struct sk_buff **last)
{
struct sk_buff *skb;
unsigned long cpu_flags;
/*
* Caller is allowed not to check sk->sk_err before skb_recv_datagram()
*/
int error = sock_error(sk);
if (error)
goto no_packet;
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
*
* Look at current nfs client by the way...
* However, this function was correct in any case. 8)
*/
spin_lock_irqsave(&queue->lock, cpu_flags);
skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error,
last);
spin_unlock_irqrestore(&queue->lock, cpu_flags);
if (error)
goto no_packet;
if (skb)
return skb;
if (!sk_can_busy_loop(sk))
break;
sk_busy_loop(sk, flags & MSG_DONTWAIT); } while (READ_ONCE(queue->prev) != *last); error = -EAGAIN;
no_packet:
*err = error; return NULL;
}
EXPORT_SYMBOL(__skb_try_recv_datagram);
struct sk_buff *__skb_recv_datagram(struct sock *sk,
struct sk_buff_head *sk_queue,
unsigned int flags, int *off, int *err)
{
struct sk_buff *skb, *last;
long timeo;
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err,
&last);
if (skb)
return skb;
if (*err != -EAGAIN)
break;
} while (timeo && !__skb_wait_for_more_packets(sk, sk_queue, err,
&timeo, last));
return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
int noblock, int *err)
{
int off = 0; return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags | (noblock ? MSG_DONTWAIT : 0),
&off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
consume_skb(skb);
sk_mem_reclaim_partial(sk);
}
EXPORT_SYMBOL(skb_free_datagram);
void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
{
bool slow;
if (!skb_unref(skb)) {
sk_peek_offset_bwd(sk, len);
return;
}
slow = lock_sock_fast(sk);
sk_peek_offset_bwd(sk, len);
skb_orphan(skb);
sk_mem_reclaim_partial(sk);
unlock_sock_fast(sk, slow);
/* skb is now orphaned, can be freed outside of locked section */
__kfree_skb(skb);
}
EXPORT_SYMBOL(__skb_free_datagram_locked);
int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
struct sk_buff *skb, unsigned int flags,
void (*destructor)(struct sock *sk,
struct sk_buff *skb))
{
int err = 0;
if (flags & MSG_PEEK) {
err = -ENOENT;
spin_lock_bh(&sk_queue->lock);
if (skb->next) {
__skb_unlink(skb, sk_queue);
refcount_dec(&skb->users);
if (destructor)
destructor(sk, skb);
err = 0;
}
spin_unlock_bh(&sk_queue->lock);
}
atomic_inc(&sk->sk_drops);
return err;
}
EXPORT_SYMBOL(__sk_queue_drop_skb);
/**
* skb_kill_datagram - Free a datagram skbuff forcibly
* @sk: socket
* @skb: datagram skbuff
* @flags: MSG\_ flags
*
* This function frees a datagram skbuff that was received by
* skb_recv_datagram. The flags argument must match the one
* used for skb_recv_datagram.
*
* If the MSG_PEEK flag is set, and the packet is still on the
* receive queue of the socket, it will be taken off the queue
* before it is freed.
*
* This function currently only disables BH when acquiring the
* sk_receive_queue lock. Therefore it must not be used in a
* context where that lock is acquired in an IRQ context.
*
* It returns 0 if the packet was removed by us.
*/
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags,
NULL);
kfree_skb(skb);
sk_mem_reclaim_partial(sk);
return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr,
size_t bytes,
void *data __always_unused,
struct iov_iter *i));
static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len, bool fault_short,
size_t (*cb)(const void *, size_t, void *,
struct iov_iter *), void *data)
{
int start = skb_headlen(skb);
int i, copy = start - offset, start_off = offset, n;
struct sk_buff *frag_iter;
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
skb->data + offset, copy, data, to);
offset += n;
if (n != copy)
goto short_copy;
if ((len -= copy) == 0) return 0;
}
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len); end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
struct page *page = skb_frag_page(frag);
u8 *vaddr = kmap(page);
if (copy > len)
copy = len;
n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
vaddr + skb_frag_off(frag) + offset - start,
copy, data, to);
kunmap(page);
offset += n;
if (n != copy)
goto short_copy;
if (!(len -= copy))
return 0;
}
start = end;
}
skb_walk_frags(skb, frag_iter) {
int end;
WARN_ON(start > offset + len); end = start + frag_iter->len;
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
if (__skb_datagram_iter(frag_iter, offset - start,
to, copy, fault_short, cb, data))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
}
start = end;
}
if (!len)
return 0;
/* This is not really a user copy fault, but rather someone
* gave us a bogus length on the skb. We should probably
* print a warning here as it may indicate a kernel bug.
*/
fault:
iov_iter_revert(to, offset - start_off);
return -EFAULT;
short_copy:
if (fault_short || iov_iter_count(to))
goto fault;
return 0;
}
/**
* skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator
* and update a hash.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
* @to: iovec iterator to copy to
* @len: amount of data to copy from buffer to iovec
* @hash: hash request to update
*/
int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len,
struct ahash_request *hash)
{
return __skb_datagram_iter(skb, offset, to, len, true,
hash_and_copy_to_iter, hash);
}
EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter);
static size_t simple_copy_to_iter(const void *addr, size_t bytes,
void *data __always_unused, struct iov_iter *i)
{
return copy_to_iter(addr, bytes, i);
}
/**
* skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
* @to: iovec iterator to copy to
* @len: amount of data to copy from buffer to iovec
*/
int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len)
{
trace_skb_copy_datagram_iovec(skb, len);
return __skb_datagram_iter(skb, offset, to, len, false,
simple_copy_to_iter, NULL);
}
EXPORT_SYMBOL(skb_copy_datagram_iter);
/**
* skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying to
* @from: the copy source
* @len: amount of data to copy to buffer from iovec
*
* Returns 0 or -EFAULT.
*/
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
struct iov_iter *from,
int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
if (copy_from_iter(skb->data + offset, copy, from) != copy)
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
}
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
size_t copied;
if (copy > len)
copy = len;
copied = copy_page_from_iter(skb_frag_page(frag),
skb_frag_off(frag) + offset - start,
copy, from);
if (copied != copy)
goto fault;
if (!(len -= copy))
return 0;
offset += copy;
}
start = end;
}
skb_walk_frags(skb, frag_iter) {
int end;
WARN_ON(start > offset + len);
end = start + frag_iter->len;
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
if (skb_copy_datagram_from_iter(frag_iter,
offset - start,
from, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
}
start = end;
}
if (!len)
return 0;
fault:
return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, size_t length)
{
int frag = skb_shinfo(skb)->nr_frags;
while (length && iov_iter_count(from)) {
struct page *pages[MAX_SKB_FRAGS];
struct page *last_head = NULL;
size_t start;
ssize_t copied;
unsigned long truesize;
int refs, n = 0;
if (frag == MAX_SKB_FRAGS)
return -EMSGSIZE;
copied = iov_iter_get_pages(from, pages, length,
MAX_SKB_FRAGS - frag, &start);
if (copied < 0)
return -EFAULT;
iov_iter_advance(from, copied);
length -= copied;
truesize = PAGE_ALIGN(copied + start);
skb->data_len += copied;
skb->len += copied;
skb->truesize += truesize;
if (sk && sk->sk_type == SOCK_STREAM) {
sk_wmem_queued_add(sk, truesize);
sk_mem_charge(sk, truesize);
} else {
refcount_add(truesize, &skb->sk->sk_wmem_alloc);
}
for (refs = 0; copied != 0; start = 0) {
int size = min_t(int, copied, PAGE_SIZE - start);
struct page *head = compound_head(pages[n]);
start += (pages[n] - head) << PAGE_SHIFT;
copied -= size;
n++;
if (frag) {
skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1];
if (head == skb_frag_page(last) &&
start == skb_frag_off(last) + skb_frag_size(last)) {
skb_frag_size_add(last, size);
/* We combined this page, we need to release
* a reference. Since compound pages refcount
* is shared among many pages, batch the refcount
* adjustments to limit false sharing.
*/
last_head = head;
refs++;
continue;
}
}
if (refs) {
page_ref_sub(last_head, refs);
refs = 0;
}
skb_fill_page_desc(skb, frag++, head, start, size);
}
if (refs)
page_ref_sub(last_head, refs);
}
return 0;
}
EXPORT_SYMBOL(__zerocopy_sg_from_iter);
/**
* zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
* @skb: buffer to copy
* @from: the source to copy from
*
* The function will first copy up to headlen, and then pin the userspace
* pages and build frags through them.
*
* Returns 0, -EFAULT or -EMSGSIZE.
*/
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
{
int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));
/* copy up to skb headlen */
if (skb_copy_datagram_from_iter(skb, 0, from, copy))
return -EFAULT;
return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
/**
* skb_copy_and_csum_datagram - Copy datagram to an iovec iterator
* and update a checksum.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
* @to: iovec iterator to copy to
* @len: amount of data to copy from buffer to iovec
* @csump: checksum pointer
*/
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len,
__wsum *csump)
{
struct csum_state csdata = { .csum = *csump };
int ret;
ret = __skb_datagram_iter(skb, offset, to, len, true,
csum_and_copy_to_iter, &csdata);
if (ret)
return ret;
*csump = csdata.csum;
return 0;
}
/**
* skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
* @skb: skbuff
* @hlen: hardware length
* @msg: destination
*
* Caller _must_ check that skb will fit to this iovec.
*
* Returns: 0 - success.
* -EINVAL - checksum failure.
* -EFAULT - fault during copy.
*/
int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
int hlen, struct msghdr *msg)
{
__wsum csum;
int chunk = skb->len - hlen;
if (!chunk)
return 0;
if (msg_data_left(msg) < chunk) {
if (__skb_checksum_complete(skb))
return -EINVAL;
if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
goto fault;
} else {
csum = csum_partial(skb->data, hlen, skb->csum);
if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
chunk, &csum))
goto fault;
if (csum_fold(csum)) {
iov_iter_revert(&msg->msg_iter, chunk);
return -EINVAL;
}
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
netdev_rx_csum_fault(NULL, skb);
}
return 0;
fault:
return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
/**
* datagram_poll - generic datagram poll
* @file: file struct
* @sock: socket
* @wait: poll table
*
* Datagram poll: Again totally generic. This also handles
* sequenced packet sockets providing the socket receive queue
* is only ever holding data ready to receive.
*
* Note: when you *don't* use this routine for this protocol,
* and you use a different write policy from sock_writeable()
* then please supply your own write_space callback.
*/
__poll_t datagram_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
struct sock *sk = sock->sk;
__poll_t mask;
sock_poll_wait(file, sock, wait);
mask = 0;
/* exceptional events? */
if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR |
(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= EPOLLHUP;
/* readable? */
if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* Connection-based need to check for termination and startup */
if (connection_based(sk)) {
if (sk->sk_state == TCP_CLOSE)
mask |= EPOLLHUP;
/* connection hasn't started yet? */
if (sk->sk_state == TCP_SYN_SENT)
return mask;
}
/* writable? */
if (sock_writeable(sk))
mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
else
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
return mask;
}
EXPORT_SYMBOL(datagram_poll);
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* internal.h: mm/ internal definitions
*
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/tracepoint-defs.h>
/*
* The set of flags that only affect watermark checking and reclaim
* behaviour. This is used by the MM to obey the caller constraints
* about IO, FS and watermark checking while ignoring placement
* hints such as HIGHMEM usage.
*/
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
__GFP_ATOMIC)
/* The GFP flags allowed during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
/* Control allocation cpuset and node placement constraints */
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
void page_writeback_init(void);
vm_fault_t do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details);
void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
unsigned long lookahead_size);
void force_page_cache_ra(struct readahead_control *, unsigned long nr);
static inline void force_page_cache_readahead(struct address_space *mapping,
struct file *file, pgoff_t index, unsigned long nr_to_read)
{
DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
force_page_cache_ra(&ractl, nr_to_read);
}
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
/**
* page_evictable - test whether a page is evictable
* @page: the page to test
*
* Test whether page is evictable--i.e., should be placed on active/inactive
* lists vs unevictable list.
*
* Reasons page might not be evictable:
* (1) page's mapping marked unevictable
* (2) page is part of an mlocked VMA
*
*/
static inline bool page_evictable(struct page *page)
{
bool ret;
/* Prevent address_space of inode and swap cache from being freed */
rcu_read_lock();
ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
rcu_read_unlock();
return ret;
}
/*
* Turn a non-refcounted page (->_refcount == 0) into refcounted with
* a count of one.
*/
static inline void set_page_refcounted(struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(page_ref_count(page), page);
set_page_count(page, 1);
}
extern unsigned long highest_memmap_pfn;
/*
* Maximum number of reclaim retries without progress before the OOM
* killer is consider the only way forward.
*/
#define MAX_RECLAIM_RETRIES 16
/*
* in mm/vmscan.c:
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
/*
* in mm/rmap.c:
*/
extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
/*
* in mm/memcontrol.c:
*/
extern bool cgroup_memory_nokmem;
/*
* in mm/page_alloc.c
*/
/*
* Structure for holding the mostly immutable allocation parameters passed
* between functions involved in allocations, including the alloc_pages*
* family of functions.
*
* nodemask, migratetype and highest_zoneidx are initialized only once in
* __alloc_pages() and then never change.
*
* zonelist, preferred_zone and highest_zoneidx are set first in
* __alloc_pages() for the fast path, and might be later changed
* in __alloc_pages_slowpath(). All other functions pass the whole structure
* by a const pointer.
*/
struct alloc_context {
struct zonelist *zonelist;
nodemask_t *nodemask;
struct zoneref *preferred_zoneref;
int migratetype;
/*
* highest_zoneidx represents highest usable zone index of
* the allocation request. Due to the nature of the zone,
* memory on lower zone than the highest_zoneidx will be
* protected by lowmem_reserve[highest_zoneidx].
*
* highest_zoneidx is also used by reclaim/compaction to limit
* the target zone since higher zone than this index cannot be
* usable for this allocation request.
*/
enum zone_type highest_zoneidx;
bool spread_dirty_pages;
};
/*
* Locate the struct page for both the matching buddy in our
* pair (buddy1) and the combined O(n+1) page they form (page).
*
* 1) Any buddy B1 will have an order O twin B2 which satisfies
* the following equation:
* B2 = B1 ^ (1 << O)
* For example, if the starting buddy (buddy2) is #8 its order
* 1 buddy is #10:
* B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
*
* 2) Any buddy B will have an order O+1 parent P which
* satisfies the following equation:
* P = B & ~(1 << O)
*
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
*/
static inline unsigned long
__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
return page_pfn ^ (1 << order);
}
extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
unsigned long end_pfn, struct zone *zone);
static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
unsigned long end_pfn, struct zone *zone)
{
if (zone->contiguous)
return pfn_to_page(start_pfn);
return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
}
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __putback_isolated_page(struct page *page, unsigned int order,
int mt);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
extern void free_unref_page(struct page *page, unsigned int order);
extern void free_unref_page_list(struct list_head *list);
extern void zone_pcp_update(struct zone *zone, int cpu_online);
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr,
int nid, bool exact_nid);
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
* in mm/compaction.c
*/
/*
* compact_control is used to track pages being migrated and the free pages
* they are being migrated to during memory compaction. The free_pfn starts
* at the end of a zone and migrate_pfn begins at the start. Movable pages
* are moved to the end of a zone during a compaction run and the run
* completes when free_pfn <= migrate_pfn
*/
struct compact_control {
struct list_head freepages; /* List of free pages to migrate to */
struct list_head migratepages; /* List of pages being migrated */
unsigned int nr_freepages; /* Number of isolated free pages */
unsigned int nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
/*
* Acts as an in/out parameter to page isolation for migration.
* isolate_migratepages uses it as a search base.
* isolate_migratepages_block will update the value to the next pfn
* after the last isolated one.
*/
unsigned long migrate_pfn;
unsigned long fast_start_pfn; /* a pfn to start linear scan from */
struct zone *zone;
unsigned long total_migrate_scanned;
unsigned long total_free_scanned;
unsigned short fast_search_fail;/* failures to use free list searches */
short search_order; /* order to start a fast search at */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
int order; /* order a direct compactor needs */
int migratetype; /* migratetype of direct compactor */
const unsigned int alloc_flags; /* alloc flags of a direct compactor */
const int highest_zoneidx; /* zone index of a direct compactor */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool no_set_skip_hint; /* Don't mark blocks for skipping */
bool ignore_block_suitable; /* Scan blocks considered unsuitable */
bool direct_compaction; /* False from kcompactd or /proc/... */
bool proactive_compaction; /* kcompactd proactive compaction */
bool whole_zone; /* Whole zone should/has been scanned */
bool contended; /* Signal lock or sched contention */
bool rescan; /* Rescanning the same pageblock */
bool alloc_contig; /* alloc_contig_range allocation */
};
/*
* Used in direct compaction when a page should be taken from the freelists
* immediately when one is created during the free path.
*/
struct capture_control {
struct compact_control *cc;
struct page *page;
};
unsigned long
isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn);
int
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
#endif
int find_suitable_fallback(struct free_area *area, unsigned int order,
int migratetype, bool only_stealable, bool *can_steal);
/*
* This function returns the order of a free page in the buddy system. In
* general, page_zone(page)->lock must be held by the caller to prevent the
* page from being allocated in parallel and returning garbage as the order.
* If a caller does not hold page_zone(page)->lock, it must guarantee that the
* page cannot be allocated or merged in parallel. Alternatively, it must
* handle invalid values gracefully, and use buddy_order_unsafe() below.
*/
static inline unsigned int buddy_order(struct page *page)
{
/* PageBuddy() must be checked by the caller */
return page_private(page);
}
/*
* Like buddy_order(), but for callers who cannot afford to hold the zone lock.
* PageBuddy() should be checked first by the caller to minimize race window,
* and invalid values must be handled gracefully.
*
* READ_ONCE is used so that if the caller assigns the result into a local
* variable and e.g. tests it for valid range before using, the compiler cannot
* decide to remove the variable and inline the page_private(page) multiple
* times, potentially observing different values in the tests and the actual
* use of the result.
*/
#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
/*
* These three helpers classifies VMAs for virtual memory accounting.
*/
/*
* Executable code area - executable, not writable, not stack
*/
static inline bool is_exec_mapping(vm_flags_t flags)
{
return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
}
/*
* Stack area - automatically grows in one direction
*
* VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
* do_mmap() forbids all other combinations.
*/
static inline bool is_stack_mapping(vm_flags_t flags)
{
return (flags & VM_STACK) == VM_STACK;
}
/*
* Data area - private, writable, not stack
*/
static inline bool is_data_mapping(vm_flags_t flags)
{
return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
}
/* mm/util.c */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev);
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
#ifdef CONFIG_MMU
extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *locked);
extern long faultin_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
bool write, int *locked);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
{
munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
}
/*
* must be called with vma's mmap_lock held for read or write, and page locked.
*/
extern void mlock_vma_page(struct page *page);
extern unsigned int munlock_vma_page(struct page *page);
extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
unsigned long len);
/*
* Clear the page's PageMlocked(). This can be useful in a situation where
* we want to unconditionally remove a page from the pagecache -- e.g.,
* on truncation or freeing.
*
* It is legal to call this function for any page, mlocked or not.
* If called for a page that is still mapped by mlocked vmas, all we do
* is revert to lazy LRU behaviour -- semantics are not broken.
*/
extern void clear_page_mlock(struct page *page);
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
/*
* At what user virtual address is page expected in vma?
* Returns -EFAULT if all of the page is outside the range of vma.
* If page is a compound head, the entire compound page is considered.
*/
static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
pgoff_t pgoff;
unsigned long address;
VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
pgoff = page_to_pgoff(page);
if (pgoff >= vma->vm_pgoff) {
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address >= vma->vm_end)
address = -EFAULT;
} else if (PageHead(page) &&
pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
/* Test above avoids possibility of wrap to 0 on 32-bit */
address = vma->vm_start;
} else {
address = -EFAULT;
}
return address;
}
/*
* Then at what user virtual address will none of the page be found in vma?
* Assumes that vma_address() already returned a good starting address.
* If page is a compound head, the entire compound page is considered.
*/
static inline unsigned long
vma_address_end(struct page *page, struct vm_area_struct *vma)
{
pgoff_t pgoff;
unsigned long address;
VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
pgoff = page_to_pgoff(page) + compound_nr(page);
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address > vma->vm_end)
address = vma->vm_end;
return address;
}
static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
struct file *fpin)
{
int flags = vmf->flags;
if (fpin)
return fpin;
/*
* FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
* anything, so we only pin the file and drop the mmap_lock if only
* FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
*/
if (fault_flag_allow_retry_first(flags) &&
!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
fpin = get_file(vmf->vma->vm_file);
mmap_read_unlock(vmf->vma->vm_mm);
}
return fpin;
}
#else /* !CONFIG_MMU */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
#endif /* !CONFIG_MMU */
/*
* Return the mem_map entry representing the 'offset' subpage within
* the maximally aligned gigantic page 'base'. Handle any discontiguity
* in the mem_map at MAX_ORDER_NR_PAGES boundaries.
*/
static inline struct page *mem_map_offset(struct page *base, int offset)
{
if (unlikely(offset >= MAX_ORDER_NR_PAGES))
return nth_page(base, offset);
return base + offset;
}
/*
* Iterator over all subpages within the maximally aligned gigantic
* page 'base'. Handle any discontiguity in the mem_map.
*/
static inline struct page *mem_map_next(struct page *iter,
struct page *base, int offset)
{
if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
unsigned long pfn = page_to_pfn(base) + offset;
if (!pfn_valid(pfn))
return NULL;
return pfn_to_page(pfn);
}
return iter + 1;
}
/* Memory initialisation debug and verification */
enum mminit_level {
MMINIT_WARNING,
MMINIT_VERIFY,
MMINIT_TRACE
};
#ifdef CONFIG_DEBUG_MEMORY_INIT
extern int mminit_loglevel;
#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
if (level < mminit_loglevel) { \
if (level <= MMINIT_WARNING) \
pr_warn("mminit::" prefix " " fmt, ##arg); \
else \
printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
} \
} while (0)
extern void mminit_verify_pageflags_layout(void);
extern void mminit_verify_zonelist(void);
#else
static inline void mminit_dprintk(enum mminit_level level,
const char *prefix, const char *fmt, ...)
{
}
static inline void mminit_verify_pageflags_layout(void)
{
}
static inline void mminit_verify_zonelist(void)
{
}
#endif /* CONFIG_DEBUG_MEMORY_INIT */
/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
#if defined(CONFIG_SPARSEMEM)
extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn);
#else
static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
}
#endif /* CONFIG_SPARSEMEM */
#define NODE_RECLAIM_NOSCAN -2
#define NODE_RECLAIM_FULL -1
#define NODE_RECLAIM_SOME 0
#define NODE_RECLAIM_SUCCESS 1
#ifdef CONFIG_NUMA
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
unsigned int order)
{
return NODE_RECLAIM_NOSCAN;
}
static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
{
return NUMA_NO_NODE;
}
#endif
extern int hwpoison_filter(struct page *p);
extern u32 hwpoison_filter_dev_major;
extern u32 hwpoison_filter_dev_minor;
extern u64 hwpoison_filter_flags_mask;
extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;
extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
extern void set_pageblock_order(void);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *page_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN WMARK_MIN
#define ALLOC_WMARK_LOW WMARK_LOW
#define ALLOC_WMARK_HIGH WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
/*
* Only MMU archs have async oom victim reclaim - aka oom_reaper so we
* cannot assume a reduced access to memory reserves is sufficient for
* !MMU
*/
#ifdef CONFIG_MMU
#define ALLOC_OOM 0x08
#else
#define ALLOC_OOM ALLOC_NO_WATERMARKS
#endif
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
#ifdef CONFIG_ZONE_DMA32
#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */
#else
#define ALLOC_NOFRAGMENT 0x0
#endif
#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
enum ttu_flags;
struct tlbflush_unmap_batch;
/*
* only for MM internal work items which do not depend on
* any allocations or locks which might depend on allocations
*/
extern struct workqueue_struct *mm_percpu_wq;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
void flush_tlb_batched_pending(struct mm_struct *mm);
#else
static inline void try_to_unmap_flush(void)
{
}
static inline void try_to_unmap_flush_dirty(void)
{
}
static inline void flush_tlb_batched_pending(struct mm_struct *mm)
{
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];
static inline bool is_migrate_highatomic(enum migratetype migratetype)
{
return migratetype == MIGRATE_HIGHATOMIC;
}
static inline bool is_migrate_highatomic_page(struct page *page)
{
return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
}
void setup_zone_pageset(struct zone *zone);
struct migration_target_control {
int nid; /* preferred node id */
nodemask_t *nmask;
gfp_t gfp_mask;
};
/*
* mm/vmalloc.c
*/
#ifdef CONFIG_MMU
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift);
#else
static inline
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
return -EINVAL;
}
#endif
void vunmap_range_noflush(unsigned long start, unsigned long end);
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);
#endif /* __MM_INTERNAL_H */
#ifndef _LINUX_JHASH_H
#define _LINUX_JHASH_H
/* jhash.h: Jenkins hash support.
*
* Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
*
* https://burtleburtle.net/bob/hash/
*
* These are the credits from Bob's sources:
*
* lookup3.c, by Bob Jenkins, May 2006, Public Domain.
*
* These are functions for producing 32-bit hashes for hash table lookup.
* hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
* are externally useful functions. Routines to test the hash are included
* if SELF_TEST is defined. You can use this free for any purpose. It's in
* the public domain. It has no warranty.
*
* Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org)
*
* I've modified Bob's hash to be useful in the Linux kernel, and
* any bugs present are my fault.
* Jozsef
*/
#include <linux/bitops.h>
#include <linux/unaligned/packed_struct.h>
/* Best hash sizes are of power of two */
#define jhash_size(n) ((u32)1<<(n))
/* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */
#define jhash_mask(n) (jhash_size(n)-1)
/* __jhash_mix -- mix 3 32-bit values reversibly. */
#define __jhash_mix(a, b, c) \
{ \
a -= c; a ^= rol32(c, 4); c += b; \
b -= a; b ^= rol32(a, 6); a += c; \
c -= b; c ^= rol32(b, 8); b += a; \
a -= c; a ^= rol32(c, 16); c += b; \
b -= a; b ^= rol32(a, 19); a += c; \
c -= b; c ^= rol32(b, 4); b += a; \
}
/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
#define __jhash_final(a, b, c) \
{ \
c ^= b; c -= rol32(b, 14); \
a ^= c; a -= rol32(c, 11); \
b ^= a; b -= rol32(a, 25); \
c ^= b; c -= rol32(b, 16); \
a ^= c; a -= rol32(c, 4); \
b ^= a; b -= rol32(a, 14); \
c ^= b; c -= rol32(b, 24); \
}
/* An arbitrary initial parameter */
#define JHASH_INITVAL 0xdeadbeef
/* jhash - hash an arbitrary key
* @k: sequence of bytes as key
* @length: the length of the key
* @initval: the previous hash, or an arbitray value
*
* The generic version, hashes an arbitrary sequence of bytes.
* No alignment or length assumptions are made about the input key.
*
* Returns the hash value of the key. The result depends on endianness.
*/
static inline u32 jhash(const void *key, u32 length, u32 initval)
{
u32 a, b, c;
const u8 *k = key;
/* Set up the internal state */
a = b = c = JHASH_INITVAL + length + initval;
/* All but the last block: affect some 32 bits of (a,b,c) */
while (length > 12) {
a += __get_unaligned_cpu32(k);
b += __get_unaligned_cpu32(k + 4);
c += __get_unaligned_cpu32(k + 8);
__jhash_mix(a, b, c);
length -= 12;
k += 12;
}
/* Last block: affect all 32 bits of (c) */
switch (length) { case 12: c += (u32)k[11]<<24; fallthrough; case 11: c += (u32)k[10]<<16; fallthrough; case 10: c += (u32)k[9]<<8; fallthrough; case 9: c += k[8]; fallthrough; case 8: b += (u32)k[7]<<24; fallthrough; case 7: b += (u32)k[6]<<16; fallthrough; case 6: b += (u32)k[5]<<8; fallthrough; case 5: b += k[4]; fallthrough; case 4: a += (u32)k[3]<<24; fallthrough; case 3: a += (u32)k[2]<<16; fallthrough; case 2: a += (u32)k[1]<<8; fallthrough; case 1: a += k[0];
__jhash_final(a, b, c);
break;
case 0: /* Nothing left to add */
break;
}
return c;
}
/* jhash2 - hash an array of u32's
* @k: the key which must be an array of u32's
* @length: the number of u32's in the key
* @initval: the previous hash, or an arbitray value
*
* Returns the hash value of the key.
*/
static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
{
u32 a, b, c;
/* Set up the internal state */
a = b = c = JHASH_INITVAL + (length<<2) + initval;
/* Handle most of the key */
while (length > 3) {
a += k[0];
b += k[1];
c += k[2];
__jhash_mix(a, b, c);
length -= 3;
k += 3;
}
/* Handle the last 3 u32's */
switch (length) {
case 3: c += k[2]; fallthrough;
case 2: b += k[1]; fallthrough;
case 1: a += k[0];
__jhash_final(a, b, c);
break;
case 0: /* Nothing left to add */
break;
}
return c;
}
/* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */
static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
{
a += initval;
b += initval;
c += initval;
__jhash_final(a, b, c);
return c;
}
static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
{
return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2));
}
static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
{
return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
}
static inline u32 jhash_1word(u32 a, u32 initval)
{
return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2));
}
#endif /* _LINUX_JHASH_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PIPE_FS_I_H
#define _LINUX_PIPE_FS_I_H
#define PIPE_DEF_BUFFERS 16
#define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */
#define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */
#define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */
#define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */
#define PIPE_BUF_FLAG_CAN_MERGE 0x10 /* can merge buffers */
#define PIPE_BUF_FLAG_WHOLE 0x20 /* read() must return entire buffer or error */
#ifdef CONFIG_WATCH_QUEUE
#define PIPE_BUF_FLAG_LOSS 0x40 /* Message loss happened after this buffer */
#endif
/**
* struct pipe_buffer - a linux kernel pipe buffer
* @page: the page containing the data for the pipe buffer
* @offset: offset of data inside the @page
* @len: length of data inside the @page
* @ops: operations associated with this buffer. See @pipe_buf_operations.
* @flags: pipe buffer flags. See above.
* @private: private data owned by the ops.
**/
struct pipe_buffer {
struct page *page;
unsigned int offset, len;
const struct pipe_buf_operations *ops;
unsigned int flags;
unsigned long private;
};
/**
* struct pipe_inode_info - a linux kernel pipe
* @mutex: mutex protecting the whole thing
* @rd_wait: reader wait point in case of empty pipe
* @wr_wait: writer wait point in case of full pipe
* @head: The point of buffer production
* @tail: The point of buffer consumption
* @note_loss: The next read() should insert a data-lost message
* @max_usage: The maximum number of slots that may be used in the ring
* @ring_size: total number of buffers (should be a power of 2)
* @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
* @tmp_page: cached released page
* @readers: number of current readers of this pipe
* @writers: number of current writers of this pipe
* @files: number of struct file referring this pipe (protected by ->i_lock)
* @r_counter: reader counter
* @w_counter: writer counter
* @poll_usage: is this pipe used for epoll, which has crazy wakeups?
* @fasync_readers: reader side fasync
* @fasync_writers: writer side fasync
* @bufs: the circular array of pipe buffers
* @user: the user who created this pipe
* @watch_queue: If this pipe is a watch_queue, this is the stuff for that
**/
struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
unsigned int head;
unsigned int tail;
unsigned int max_usage;
unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
bool note_loss;
#endif
unsigned int nr_accounted;
unsigned int readers;
unsigned int writers;
unsigned int files;
unsigned int r_counter;
unsigned int w_counter;
unsigned int poll_usage;
struct page *tmp_page;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
struct pipe_buffer *bufs;
struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
struct watch_queue *watch_queue;
#endif
};
/*
* Note on the nesting of these functions:
*
* ->confirm()
* ->try_steal()
*
* That is, ->try_steal() must be called on a confirmed buffer. See below for
* the meaning of each operation. Also see the kerneldoc in fs/pipe.c for the
* pipe and generic variants of these hooks.
*/
struct pipe_buf_operations {
/*
* ->confirm() verifies that the data in the pipe buffer is there
* and that the contents are good. If the pages in the pipe belong
* to a file system, we may need to wait for IO completion in this
* hook. Returns 0 for good, or a negative error value in case of
* error. If not present all pages are considered good.
*/
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);
/*
* When the contents of this pipe buffer has been completely
* consumed by a reader, ->release() is called.
*/
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
/*
* Attempt to take ownership of the pipe buffer and its contents.
* ->try_steal() returns %true for success, in which case the contents
* of the pipe (the buf->page) is locked and now completely owned by the
* caller. The page may then be transferred to a different mapping, the
* most often used case is insertion into different file address space
* cache.
*/
bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);
/*
* Get a reference to the pipe buffer.
*/
bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};
/**
* pipe_empty - Return true if the pipe is empty
* @head: The pipe ring head pointer
* @tail: The pipe ring tail pointer
*/
static inline bool pipe_empty(unsigned int head, unsigned int tail)
{
return head == tail;
}
/**
* pipe_occupancy - Return number of slots used in the pipe
* @head: The pipe ring head pointer
* @tail: The pipe ring tail pointer
*/
static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
{
return head - tail;
}
/**
* pipe_full - Return true if the pipe is full
* @head: The pipe ring head pointer
* @tail: The pipe ring tail pointer
* @limit: The maximum amount of slots available.
*/
static inline bool pipe_full(unsigned int head, unsigned int tail,
unsigned int limit)
{
return pipe_occupancy(head, tail) >= limit;
}
/**
* pipe_space_for_user - Return number of slots available to userspace
* @head: The pipe ring head pointer
* @tail: The pipe ring tail pointer
* @pipe: The pipe info structure
*/
static inline unsigned int pipe_space_for_user(unsigned int head, unsigned int tail,
struct pipe_inode_info *pipe)
{
unsigned int p_occupancy, p_space;
p_occupancy = pipe_occupancy(head, tail);
if (p_occupancy >= pipe->max_usage)
return 0;
p_space = pipe->ring_size - p_occupancy;
if (p_space > pipe->max_usage)
p_space = pipe->max_usage;
return p_space;
}
/**
* pipe_buf_get - get a reference to a pipe_buffer
* @pipe: the pipe that the buffer belongs to
* @buf: the buffer to get a reference to
*
* Return: %true if the reference was successfully obtained.
*/
static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
return buf->ops->get(pipe, buf);
}
/**
* pipe_buf_release - put a reference to a pipe_buffer
* @pipe: the pipe that the buffer belongs to
* @buf: the buffer to put a reference to
*/
static inline void pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
const struct pipe_buf_operations *ops = buf->ops;
buf->ops = NULL;
ops->release(pipe, buf);
}
/**
* pipe_buf_confirm - verify contents of the pipe buffer
* @pipe: the pipe that the buffer belongs to
* @buf: the buffer to confirm
*/
static inline int pipe_buf_confirm(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
if (!buf->ops->confirm)
return 0;
return buf->ops->confirm(pipe, buf);
}
/**
* pipe_buf_try_steal - attempt to take ownership of a pipe_buffer
* @pipe: the pipe that the buffer belongs to
* @buf: the buffer to attempt to steal
*/
static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
if (!buf->ops->try_steal)
return false;
return buf->ops->try_steal(pipe, buf);
}
/* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
memory allocation, whereas PIPE_BUF makes atomicity guarantees. */
#define PIPE_SIZE PAGE_SIZE
/* Pipe lock and unlock operations */
void pipe_lock(struct pipe_inode_info *);
void pipe_unlock(struct pipe_inode_info *);
void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
extern unsigned int pipe_max_size;
extern unsigned long pipe_user_pages_hard;
extern unsigned long pipe_user_pages_soft;
/* Wait for a pipe to be readable/writable while dropping the pipe lock */
void pipe_wait_readable(struct pipe_inode_info *);
void pipe_wait_writable(struct pipe_inode_info *);
struct pipe_inode_info *alloc_pipe_info(void);
void free_pipe_info(struct pipe_inode_info *);
/* Generic pipe buffer ops functions */
bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
bool generic_pipe_buf_try_steal(struct pipe_inode_info *, struct pipe_buffer *);
void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
extern const struct pipe_buf_operations nosteal_pipe_buf_ops;
#ifdef CONFIG_WATCH_QUEUE
unsigned long account_pipe_buffers(struct user_struct *user,
unsigned long old, unsigned long new);
bool too_many_pipe_buffers_soft(unsigned long user_bufs);
bool too_many_pipe_buffers_hard(unsigned long user_bufs);
bool pipe_is_unprivileged_user(void);
#endif
/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
#ifdef CONFIG_WATCH_QUEUE
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots);
#endif
long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice);
int create_pipe_files(struct file **, int);
unsigned int round_pipe_size(unsigned long size);
#endif
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NetLabel Kernel API
*
* This file defines the kernel API for the NetLabel system. The NetLabel
* system manages static and dynamic label mappings for network protocols such
* as CIPSO and RIPSO.
*
* Author: Paul Moore <paul@paul-moore.com>
*/
/*
* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/audit.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
#include <net/calipso.h>
#include <asm/bug.h>
#include <linux/atomic.h>
#include "netlabel_domainhash.h"
#include "netlabel_unlabeled.h"
#include "netlabel_cipso_v4.h"
#include "netlabel_calipso.h"
#include "netlabel_user.h"
#include "netlabel_mgmt.h"
#include "netlabel_addrlist.h"
/*
* Configuration Functions
*/
/**
* netlbl_cfg_map_del - Remove a NetLabel/LSM domain mapping
* @domain: the domain mapping to remove
* @family: address family
* @addr: IP address
* @mask: IP address mask
* @audit_info: NetLabel audit information
*
* Description:
* Removes a NetLabel/LSM domain mapping. A @domain value of NULL causes the
* default domain mapping to be removed. Returns zero on success, negative
* values on failure.
*
*/
int netlbl_cfg_map_del(const char *domain,
u16 family,
const void *addr,
const void *mask,
struct netlbl_audit *audit_info)
{
if (addr == NULL && mask == NULL) {
return netlbl_domhsh_remove(domain, family, audit_info);
} else if (addr != NULL && mask != NULL) {
switch (family) {
case AF_INET:
return netlbl_domhsh_remove_af4(domain, addr, mask,
audit_info);
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
return netlbl_domhsh_remove_af6(domain, addr, mask,
audit_info);
#endif /* IPv6 */
default:
return -EPFNOSUPPORT;
}
} else
return -EINVAL;
}
/**
* netlbl_cfg_unlbl_map_add - Add a new unlabeled mapping
* @domain: the domain mapping to add
* @family: address family
* @addr: IP address
* @mask: IP address mask
* @audit_info: NetLabel audit information
*
* Description:
* Adds a new unlabeled NetLabel/LSM domain mapping. A @domain value of NULL
* causes a new default domain mapping to be added. Returns zero on success,
* negative values on failure.
*
*/
int netlbl_cfg_unlbl_map_add(const char *domain,
u16 family,
const void *addr,
const void *mask,
struct netlbl_audit *audit_info)
{
int ret_val = -ENOMEM;
struct netlbl_dom_map *entry;
struct netlbl_domaddr_map *addrmap = NULL;
struct netlbl_domaddr4_map *map4 = NULL;
struct netlbl_domaddr6_map *map6 = NULL;
entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
if (entry == NULL)
return -ENOMEM;
if (domain != NULL) {
entry->domain = kstrdup(domain, GFP_ATOMIC);
if (entry->domain == NULL)
goto cfg_unlbl_map_add_failure;
}
entry->family = family;
if (addr == NULL && mask == NULL)
entry->def.type = NETLBL_NLTYPE_UNLABELED;
else if (addr != NULL && mask != NULL) {
addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC);
if (addrmap == NULL)
goto cfg_unlbl_map_add_failure;
INIT_LIST_HEAD(&addrmap->list4);
INIT_LIST_HEAD(&addrmap->list6);
switch (family) {
case AF_INET: {
const struct in_addr *addr4 = addr;
const struct in_addr *mask4 = mask;
map4 = kzalloc(sizeof(*map4), GFP_ATOMIC);
if (map4 == NULL)
goto cfg_unlbl_map_add_failure;
map4->def.type = NETLBL_NLTYPE_UNLABELED;
map4->list.addr = addr4->s_addr & mask4->s_addr;
map4->list.mask = mask4->s_addr;
map4->list.valid = 1;
ret_val = netlbl_af4list_add(&map4->list,
&addrmap->list4);
if (ret_val != 0)
goto cfg_unlbl_map_add_failure;
break;
}
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6: {
const struct in6_addr *addr6 = addr;
const struct in6_addr *mask6 = mask;
map6 = kzalloc(sizeof(*map6), GFP_ATOMIC);
if (map6 == NULL)
goto cfg_unlbl_map_add_failure;
map6->def.type = NETLBL_NLTYPE_UNLABELED;
map6->list.addr = *addr6;
map6->list.addr.s6_addr32[0] &= mask6->s6_addr32[0];
map6->list.addr.s6_addr32[1] &= mask6->s6_addr32[1];
map6->list.addr.s6_addr32[2] &= mask6->s6_addr32[2];
map6->list.addr.s6_addr32[3] &= mask6->s6_addr32[3];
map6->list.mask = *mask6;
map6->list.valid = 1;
ret_val = netlbl_af6list_add(&map6->list,
&addrmap->list6);
if (ret_val != 0)
goto cfg_unlbl_map_add_failure;
break;
}
#endif /* IPv6 */
default:
goto cfg_unlbl_map_add_failure;
}
entry->def.addrsel = addrmap;
entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
} else {
ret_val = -EINVAL;
goto cfg_unlbl_map_add_failure;
}
ret_val = netlbl_domhsh_add(entry, audit_info);
if (ret_val != 0)
goto cfg_unlbl_map_add_failure;
return 0;
cfg_unlbl_map_add_failure:
kfree(entry->domain);
kfree(entry);
kfree(addrmap);
kfree(map4);
kfree(map6);
return ret_val;
}
/**
* netlbl_cfg_unlbl_static_add - Adds a new static label
* @net: network namespace
* @dev_name: interface name
* @addr: IP address in network byte order (struct in[6]_addr)
* @mask: address mask in network byte order (struct in[6]_addr)
* @family: address family
* @secid: LSM secid value for the entry
* @audit_info: NetLabel audit information
*
* Description:
* Adds a new NetLabel static label to be used when protocol provided labels
* are not present on incoming traffic. If @dev_name is NULL then the default
* interface will be used. Returns zero on success, negative values on failure.
*
*/
int netlbl_cfg_unlbl_static_add(struct net *net,
const char *dev_name,
const void *addr,
const void *mask,
u16 family,
u32 secid,
struct netlbl_audit *audit_info)
{
u32 addr_len;
switch (family) {
case AF_INET:
addr_len = sizeof(struct in_addr);
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
addr_len = sizeof(struct in6_addr);
break;
#endif /* IPv6 */
default:
return -EPFNOSUPPORT;
}
return netlbl_unlhsh_add(net,
dev_name, addr, mask, addr_len,
secid, audit_info);
}
/**
* netlbl_cfg_unlbl_static_del - Removes an existing static label
* @net: network namespace
* @dev_name: interface name
* @addr: IP address in network byte order (struct in[6]_addr)
* @mask: address mask in network byte order (struct in[6]_addr)
* @family: address family
* @audit_info: NetLabel audit information
*
* Description:
* Removes an existing NetLabel static label used when protocol provided labels
* are not present on incoming traffic. If @dev_name is NULL then the default
* interface will be used. Returns zero on success, negative values on failure.
*
*/
int netlbl_cfg_unlbl_static_del(struct net *net,
const char *dev_name,
const void *addr,
const void *mask,
u16 family,
struct netlbl_audit *audit_info)
{
u32 addr_len;
switch (family) {
case AF_INET:
addr_len = sizeof(struct in_addr);
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
addr_len = sizeof(struct in6_addr);
break;
#endif /* IPv6 */
default:
return -EPFNOSUPPORT;
}
return netlbl_unlhsh_remove(net,
dev_name, addr, mask, addr_len,
audit_info);
}
/**
* netlbl_cfg_cipsov4_add - Add a new CIPSOv4 DOI definition
* @doi_def: CIPSO DOI definition
* @audit_info: NetLabel audit information
*
* Description:
* Add a new CIPSO DOI definition as defined by @doi_def. Returns zero on
* success and negative values on failure.
*
*/
int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def,
struct netlbl_audit *audit_info)
{
return cipso_v4_doi_add(doi_def, audit_info);
}
/**
* netlbl_cfg_cipsov4_del - Remove an existing CIPSOv4 DOI definition
* @doi: CIPSO DOI
* @audit_info: NetLabel audit information
*
* Description:
* Remove an existing CIPSO DOI definition matching @doi. Returns zero on
* success and negative values on failure.
*
*/
void netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info)
{
cipso_v4_doi_remove(doi, audit_info);
}
/**
* netlbl_cfg_cipsov4_map_add - Add a new CIPSOv4 DOI mapping
* @doi: the CIPSO DOI
* @domain: the domain mapping to add
* @addr: IP address
* @mask: IP address mask
* @audit_info: NetLabel audit information
*
* Description:
* Add a new NetLabel/LSM domain mapping for the given CIPSO DOI to the NetLabel
* subsystem. A @domain value of NULL adds a new default domain mapping.
* Returns zero on success, negative values on failure.
*
*/
int netlbl_cfg_cipsov4_map_add(u32 doi,
const char *domain,
const struct in_addr *addr,
const struct in_addr *mask,
struct netlbl_audit *audit_info)
{
int ret_val = -ENOMEM;
struct cipso_v4_doi *doi_def;
struct netlbl_dom_map *entry;
struct netlbl_domaddr_map *addrmap = NULL;
struct netlbl_domaddr4_map *addrinfo = NULL;
doi_def = cipso_v4_doi_getdef(doi);
if (doi_def == NULL)
return -ENOENT;
entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
if (entry == NULL)
goto out_entry;
entry->family = AF_INET;
if (domain != NULL) {
entry->domain = kstrdup(domain, GFP_ATOMIC);
if (entry->domain == NULL)
goto out_domain;
}
if (addr == NULL && mask == NULL) {
entry->def.cipso = doi_def;
entry->def.type = NETLBL_NLTYPE_CIPSOV4;
} else if (addr != NULL && mask != NULL) {
addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC);
if (addrmap == NULL)
goto out_addrmap;
INIT_LIST_HEAD(&addrmap->list4);
INIT_LIST_HEAD(&addrmap->list6);
addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC);
if (addrinfo == NULL)
goto out_addrinfo;
addrinfo->def.cipso = doi_def;
addrinfo->def.type = NETLBL_NLTYPE_CIPSOV4;
addrinfo->list.addr = addr->s_addr & mask->s_addr;
addrinfo->list.mask = mask->s_addr;
addrinfo->list.valid = 1;
ret_val = netlbl_af4list_add(&addrinfo->list, &addrmap->list4);
if (ret_val != 0)
goto cfg_cipsov4_map_add_failure;
entry->def.addrsel = addrmap;
entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
} else {
ret_val = -EINVAL;
goto out_addrmap;
}
ret_val = netlbl_domhsh_add(entry, audit_info);
if (ret_val != 0)
goto cfg_cipsov4_map_add_failure;
return 0;
cfg_cipsov4_map_add_failure:
kfree(addrinfo);
out_addrinfo:
kfree(addrmap);
out_addrmap:
kfree(entry->domain);
out_domain:
kfree(entry);
out_entry:
cipso_v4_doi_putdef(doi_def);
return ret_val;
}
/**
* netlbl_cfg_calipso_add - Add a new CALIPSO DOI definition
* @doi_def: CALIPSO DOI definition
* @audit_info: NetLabel audit information
*
* Description:
* Add a new CALIPSO DOI definition as defined by @doi_def. Returns zero on
* success and negative values on failure.
*
*/
int netlbl_cfg_calipso_add(struct calipso_doi *doi_def,
struct netlbl_audit *audit_info)
{
#if IS_ENABLED(CONFIG_IPV6)
return calipso_doi_add(doi_def, audit_info);
#else /* IPv6 */
return -ENOSYS;
#endif /* IPv6 */
}
/**
* netlbl_cfg_calipso_del - Remove an existing CALIPSO DOI definition
* @doi: CALIPSO DOI
* @audit_info: NetLabel audit information
*
* Description:
* Remove an existing CALIPSO DOI definition matching @doi. Returns zero on
* success and negative values on failure.
*
*/
void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info)
{
#if IS_ENABLED(CONFIG_IPV6)
calipso_doi_remove(doi, audit_info);
#endif /* IPv6 */
}
/**
* netlbl_cfg_calipso_map_add - Add a new CALIPSO DOI mapping
* @doi: the CALIPSO DOI
* @domain: the domain mapping to add
* @addr: IP address
* @mask: IP address mask
* @audit_info: NetLabel audit information
*
* Description:
* Add a new NetLabel/LSM domain mapping for the given CALIPSO DOI to the
* NetLabel subsystem. A @domain value of NULL adds a new default domain
* mapping. Returns zero on success, negative values on failure.
*
*/
int netlbl_cfg_calipso_map_add(u32 doi,
const char *domain,
const struct in6_addr *addr,
const struct in6_addr *mask,
struct netlbl_audit *audit_info)
{
#if IS_ENABLED(CONFIG_IPV6)
int ret_val = -ENOMEM;
struct calipso_doi *doi_def;
struct netlbl_dom_map *entry;
struct netlbl_domaddr_map *addrmap = NULL;
struct netlbl_domaddr6_map *addrinfo = NULL;
doi_def = calipso_doi_getdef(doi);
if (doi_def == NULL)
return -ENOENT;
entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
if (entry == NULL)
goto out_entry;
entry->family = AF_INET6;
if (domain != NULL) {
entry->domain = kstrdup(domain, GFP_ATOMIC);
if (entry->domain == NULL)
goto out_domain;
}
if (addr == NULL && mask == NULL) {
entry->def.calipso = doi_def;
entry->def.type = NETLBL_NLTYPE_CALIPSO;
} else if (addr != NULL && mask != NULL) {
addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC);
if (addrmap == NULL)
goto out_addrmap;
INIT_LIST_HEAD(&addrmap->list4);
INIT_LIST_HEAD(&addrmap->list6);
addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC);
if (addrinfo == NULL)
goto out_addrinfo;
addrinfo->def.calipso = doi_def;
addrinfo->def.type = NETLBL_NLTYPE_CALIPSO;
addrinfo->list.addr = *addr;
addrinfo->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
addrinfo->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
addrinfo->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
addrinfo->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
addrinfo->list.mask = *mask;
addrinfo->list.valid = 1;
ret_val = netlbl_af6list_add(&addrinfo->list, &addrmap->list6);
if (ret_val != 0)
goto cfg_calipso_map_add_failure;
entry->def.addrsel = addrmap;
entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
} else {
ret_val = -EINVAL;
goto out_addrmap;
}
ret_val = netlbl_domhsh_add(entry, audit_info);
if (ret_val != 0)
goto cfg_calipso_map_add_failure;
return 0;
cfg_calipso_map_add_failure:
kfree(addrinfo);
out_addrinfo:
kfree(addrmap);
out_addrmap:
kfree(entry->domain);
out_domain:
kfree(entry);
out_entry:
calipso_doi_putdef(doi_def);
return ret_val;
#else /* IPv6 */
return -ENOSYS;
#endif /* IPv6 */
}
/*
* Security Attribute Functions
*/
#define _CM_F_NONE 0x00000000
#define _CM_F_ALLOC 0x00000001
#define _CM_F_WALK 0x00000002
/**
* _netlbl_catmap_getnode - Get a individual node from a catmap
* @catmap: pointer to the category bitmap
* @offset: the requested offset
* @cm_flags: catmap flags, see _CM_F_*
* @gfp_flags: memory allocation flags
*
* Description:
* Iterate through the catmap looking for the node associated with @offset.
* If the _CM_F_ALLOC flag is set in @cm_flags and there is no associated node,
* one will be created and inserted into the catmap. If the _CM_F_WALK flag is
* set in @cm_flags and there is no associated node, the next highest node will
* be returned. Returns a pointer to the node on success, NULL on failure.
*
*/
static struct netlbl_lsm_catmap *_netlbl_catmap_getnode(
struct netlbl_lsm_catmap **catmap,
u32 offset,
unsigned int cm_flags,
gfp_t gfp_flags)
{
struct netlbl_lsm_catmap *iter = *catmap;
struct netlbl_lsm_catmap *prev = NULL;
if (iter == NULL)
goto catmap_getnode_alloc;
if (offset < iter->startbit)
goto catmap_getnode_walk;
while (iter && offset >= (iter->startbit + NETLBL_CATMAP_SIZE)) {
prev = iter;
iter = iter->next;
}
if (iter == NULL || offset < iter->startbit)
goto catmap_getnode_walk;
return iter;
catmap_getnode_walk:
if (cm_flags & _CM_F_WALK)
return iter;
catmap_getnode_alloc:
if (!(cm_flags & _CM_F_ALLOC))
return NULL;
iter = netlbl_catmap_alloc(gfp_flags);
if (iter == NULL)
return NULL;
iter->startbit = offset & ~(NETLBL_CATMAP_SIZE - 1);
if (prev == NULL) {
iter->next = *catmap;
*catmap = iter;
} else {
iter->next = prev->next;
prev->next = iter;
}
return iter;
}
/**
* netlbl_catmap_walk - Walk a LSM secattr catmap looking for a bit
* @catmap: the category bitmap
* @offset: the offset to start searching at, in bits
*
* Description:
* This function walks a LSM secattr category bitmap starting at @offset and
* returns the spot of the first set bit or -ENOENT if no bits are set.
*
*/
int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset)
{
struct netlbl_lsm_catmap *iter;
u32 idx;
u32 bit;
NETLBL_CATMAP_MAPTYPE bitmap;
iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0);
if (iter == NULL)
return -ENOENT;
if (offset > iter->startbit) {
offset -= iter->startbit;
idx = offset / NETLBL_CATMAP_MAPSIZE;
bit = offset % NETLBL_CATMAP_MAPSIZE;
} else {
idx = 0;
bit = 0;
}
bitmap = iter->bitmap[idx] >> bit;
for (;;) {
if (bitmap != 0) {
while ((bitmap & NETLBL_CATMAP_BIT) == 0) {
bitmap >>= 1;
bit++;
}
return iter->startbit +
(NETLBL_CATMAP_MAPSIZE * idx) + bit;
}
if (++idx >= NETLBL_CATMAP_MAPCNT) {
if (iter->next != NULL) {
iter = iter->next;
idx = 0;
} else
return -ENOENT;
}
bitmap = iter->bitmap[idx];
bit = 0;
}
return -ENOENT;
}
EXPORT_SYMBOL(netlbl_catmap_walk);
/**
* netlbl_catmap_walkrng - Find the end of a string of set bits
* @catmap: the category bitmap
* @offset: the offset to start searching at, in bits
*
* Description:
* This function walks a LSM secattr category bitmap starting at @offset and
* returns the spot of the first cleared bit or -ENOENT if the offset is past
* the end of the bitmap.
*
*/
int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset)
{
struct netlbl_lsm_catmap *iter;
struct netlbl_lsm_catmap *prev = NULL;
u32 idx;
u32 bit;
NETLBL_CATMAP_MAPTYPE bitmask;
NETLBL_CATMAP_MAPTYPE bitmap;
iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0);
if (iter == NULL)
return -ENOENT;
if (offset > iter->startbit) {
offset -= iter->startbit;
idx = offset / NETLBL_CATMAP_MAPSIZE;
bit = offset % NETLBL_CATMAP_MAPSIZE;
} else {
idx = 0;
bit = 0;
}
bitmask = NETLBL_CATMAP_BIT << bit;
for (;;) {
bitmap = iter->bitmap[idx];
while (bitmask != 0 && (bitmap & bitmask) != 0) {
bitmask <<= 1;
bit++;
}
if (prev && idx == 0 && bit == 0)
return prev->startbit + NETLBL_CATMAP_SIZE - 1;
else if (bitmask != 0)
return iter->startbit +
(NETLBL_CATMAP_MAPSIZE * idx) + bit - 1;
else if (++idx >= NETLBL_CATMAP_MAPCNT) {
if (iter->next == NULL)
return iter->startbit + NETLBL_CATMAP_SIZE - 1;
prev = iter;
iter = iter->next;
idx = 0;
}
bitmask = NETLBL_CATMAP_BIT;
bit = 0;
}
return -ENOENT;
}
/**
* netlbl_catmap_getlong - Export an unsigned long bitmap
* @catmap: pointer to the category bitmap
* @offset: pointer to the requested offset
* @bitmap: the exported bitmap
*
* Description:
* Export a bitmap with an offset greater than or equal to @offset and return
* it in @bitmap. The @offset must be aligned to an unsigned long and will be
* updated on return if different from what was requested; if the catmap is
* empty at the requested offset and beyond, the @offset is set to (u32)-1.
* Returns zero on success, negative values on failure.
*
*/
int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap,
u32 *offset,
unsigned long *bitmap)
{
struct netlbl_lsm_catmap *iter;
u32 off = *offset;
u32 idx;
/* only allow aligned offsets */
if ((off & (BITS_PER_LONG - 1)) != 0)
return -EINVAL;
/* a null catmap is equivalent to an empty one */
if (!catmap) {
*offset = (u32)-1;
return 0;
}
if (off < catmap->startbit) {
off = catmap->startbit;
*offset = off;
}
iter = _netlbl_catmap_getnode(&catmap, off, _CM_F_WALK, 0);
if (iter == NULL) {
*offset = (u32)-1;
return 0;
}
if (off < iter->startbit) {
*offset = iter->startbit;
off = 0;
} else
off -= iter->startbit;
idx = off / NETLBL_CATMAP_MAPSIZE;
*bitmap = iter->bitmap[idx] >> (off % NETLBL_CATMAP_MAPSIZE);
return 0;
}
/**
* netlbl_catmap_setbit - Set a bit in a LSM secattr catmap
* @catmap: pointer to the category bitmap
* @bit: the bit to set
* @flags: memory allocation flags
*
* Description:
* Set the bit specified by @bit in @catmap. Returns zero on success,
* negative values on failure.
*
*/
int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap,
u32 bit,
gfp_t flags)
{
struct netlbl_lsm_catmap *iter;
u32 idx;
iter = _netlbl_catmap_getnode(catmap, bit, _CM_F_ALLOC, flags);
if (iter == NULL)
return -ENOMEM;
bit -= iter->startbit;
idx = bit / NETLBL_CATMAP_MAPSIZE;
iter->bitmap[idx] |= NETLBL_CATMAP_BIT << (bit % NETLBL_CATMAP_MAPSIZE);
return 0;
}
EXPORT_SYMBOL(netlbl_catmap_setbit);
/**
* netlbl_catmap_setrng - Set a range of bits in a LSM secattr catmap
* @catmap: pointer to the category bitmap
* @start: the starting bit
* @end: the last bit in the string
* @flags: memory allocation flags
*
* Description:
* Set a range of bits, starting at @start and ending with @end. Returns zero
* on success, negative values on failure.
*
*/
int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap,
u32 start,
u32 end,
gfp_t flags)
{
int rc = 0;
u32 spot = start;
while (rc == 0 && spot <= end) {
if (((spot & (BITS_PER_LONG - 1)) == 0) &&
((end - spot) > BITS_PER_LONG)) {
rc = netlbl_catmap_setlong(catmap,
spot,
(unsigned long)-1,
flags);
spot += BITS_PER_LONG;
} else
rc = netlbl_catmap_setbit(catmap, spot++, flags);
}
return rc;
}
/**
* netlbl_catmap_setlong - Import an unsigned long bitmap
* @catmap: pointer to the category bitmap
* @offset: offset to the start of the imported bitmap
* @bitmap: the bitmap to import
* @flags: memory allocation flags
*
* Description:
* Import the bitmap specified in @bitmap into @catmap, using the offset
* in @offset. The offset must be aligned to an unsigned long. Returns zero
* on success, negative values on failure.
*
*/
int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
u32 offset,
unsigned long bitmap,
gfp_t flags)
{
struct netlbl_lsm_catmap *iter;
u32 idx;
/* only allow aligned offsets */
if ((offset & (BITS_PER_LONG - 1)) != 0)
return -EINVAL;
iter = _netlbl_catmap_getnode(catmap, offset, _CM_F_ALLOC, flags);
if (iter == NULL)
return -ENOMEM;
offset -= iter->startbit;
idx = offset / NETLBL_CATMAP_MAPSIZE;
iter->bitmap[idx] |= bitmap << (offset % NETLBL_CATMAP_MAPSIZE);
return 0;
}
/* Bitmap functions
*/
/**
* netlbl_bitmap_walk - Walk a bitmap looking for a bit
* @bitmap: the bitmap
* @bitmap_len: length in bits
* @offset: starting offset
* @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
*
* Description:
* Starting at @offset, walk the bitmap from left to right until either the
* desired bit is found or we reach the end. Return the bit offset, -1 if
* not found, or -2 if error.
*/
int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len,
u32 offset, u8 state)
{
u32 bit_spot;
u32 byte_offset;
unsigned char bitmask;
unsigned char byte;
if (offset >= bitmap_len)
return -1;
byte_offset = offset / 8;
byte = bitmap[byte_offset];
bit_spot = offset;
bitmask = 0x80 >> (offset % 8);
while (bit_spot < bitmap_len) {
if ((state && (byte & bitmask) == bitmask) ||
(state == 0 && (byte & bitmask) == 0))
return bit_spot;
if (++bit_spot >= bitmap_len)
return -1;
bitmask >>= 1;
if (bitmask == 0) {
byte = bitmap[++byte_offset];
bitmask = 0x80;
}
}
return -1;
}
EXPORT_SYMBOL(netlbl_bitmap_walk);
/**
* netlbl_bitmap_setbit - Sets a single bit in a bitmap
* @bitmap: the bitmap
* @bit: the bit
* @state: if non-zero, set the bit (1) else clear the bit (0)
*
* Description:
* Set a single bit in the bitmask. Returns zero on success, negative values
* on error.
*/
void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state)
{
u32 byte_spot;
u8 bitmask;
/* gcc always rounds to zero when doing integer division */
byte_spot = bit / 8;
bitmask = 0x80 >> (bit % 8);
if (state)
bitmap[byte_spot] |= bitmask;
else
bitmap[byte_spot] &= ~bitmask;
}
EXPORT_SYMBOL(netlbl_bitmap_setbit);
/*
* LSM Functions
*/
/**
* netlbl_enabled - Determine if the NetLabel subsystem is enabled
*
* Description:
* The LSM can use this function to determine if it should use NetLabel
* security attributes in it's enforcement mechanism. Currently, NetLabel is
* considered to be enabled when it's configuration contains a valid setup for
* at least one labeled protocol (i.e. NetLabel can understand incoming
* labeled packets of at least one type); otherwise NetLabel is considered to
* be disabled.
*
*/
int netlbl_enabled(void)
{
/* At some point we probably want to expose this mechanism to the user
* as well so that admins can toggle NetLabel regardless of the
* configuration */
return (atomic_read(&netlabel_mgmt_protocount) > 0);
}
/**
* netlbl_sock_setattr - Label a socket using the correct protocol
* @sk: the socket to label
* @family: protocol family
* @secattr: the security attributes
*
* Description:
* Attach the correct label to the given socket using the security attributes
* specified in @secattr. This function requires exclusive access to @sk,
* which means it either needs to be in the process of being created or locked.
* Returns zero on success, -EDESTADDRREQ if the domain is configured to use
* network address selectors (can't blindly label the socket), and negative
* values on all other failures.
*
*/
int netlbl_sock_setattr(struct sock *sk,
u16 family,
const struct netlbl_lsm_secattr *secattr)
{
int ret_val;
struct netlbl_dom_map *dom_entry;
rcu_read_lock();
dom_entry = netlbl_domhsh_getentry(secattr->domain, family);
if (dom_entry == NULL) {
ret_val = -ENOENT;
goto socket_setattr_return;
}
switch (family) {
case AF_INET:
switch (dom_entry->def.type) {
case NETLBL_NLTYPE_ADDRSELECT:
ret_val = -EDESTADDRREQ;
break;
case NETLBL_NLTYPE_CIPSOV4:
ret_val = cipso_v4_sock_setattr(sk,
dom_entry->def.cipso,
secattr);
break;
case NETLBL_NLTYPE_UNLABELED:
ret_val = 0;
break;
default:
ret_val = -ENOENT;
}
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
switch (dom_entry->def.type) {
case NETLBL_NLTYPE_ADDRSELECT:
ret_val = -EDESTADDRREQ;
break;
case NETLBL_NLTYPE_CALIPSO:
ret_val = calipso_sock_setattr(sk,
dom_entry->def.calipso,
secattr);
break;
case NETLBL_NLTYPE_UNLABELED:
ret_val = 0;
break;
default:
ret_val = -ENOENT;
}
break;
#endif /* IPv6 */
default:
ret_val = -EPROTONOSUPPORT;
}
socket_setattr_return:
rcu_read_unlock();
return ret_val;
}
/**
* netlbl_sock_delattr - Delete all the NetLabel labels on a socket
* @sk: the socket
*
* Description:
* Remove all the NetLabel labeling from @sk. The caller is responsible for
* ensuring that @sk is locked.
*
*/
void netlbl_sock_delattr(struct sock *sk)
{
switch (sk->sk_family) {
case AF_INET:
cipso_v4_sock_delattr(sk);
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
calipso_sock_delattr(sk);
break;
#endif /* IPv6 */
}
}
/**
* netlbl_sock_getattr - Determine the security attributes of a sock
* @sk: the sock
* @secattr: the security attributes
*
* Description:
* Examines the given sock to see if any NetLabel style labeling has been
* applied to the sock, if so it parses the socket label and returns the
* security attributes in @secattr. Returns zero on success, negative values
* on failure.
*
*/
int netlbl_sock_getattr(struct sock *sk,
struct netlbl_lsm_secattr *secattr)
{
int ret_val;
switch (sk->sk_family) {
case AF_INET:
ret_val = cipso_v4_sock_getattr(sk, secattr);
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
ret_val = calipso_sock_getattr(sk, secattr);
break;
#endif /* IPv6 */
default:
ret_val = -EPROTONOSUPPORT;
}
return ret_val;
}
/**
* netlbl_conn_setattr - Label a connected socket using the correct protocol
* @sk: the socket to label
* @addr: the destination address
* @secattr: the security attributes
*
* Description:
* Attach the correct label to the given connected socket using the security
* attributes specified in @secattr. The caller is responsible for ensuring
* that @sk is locked. Returns zero on success, negative values on failure.
*
*/
int netlbl_conn_setattr(struct sock *sk,
struct sockaddr *addr,
const struct netlbl_lsm_secattr *secattr)
{
int ret_val;
struct sockaddr_in *addr4;
#if IS_ENABLED(CONFIG_IPV6)
struct sockaddr_in6 *addr6;
#endif
struct netlbl_dommap_def *entry;
rcu_read_lock();
switch (addr->sa_family) {
case AF_INET:
addr4 = (struct sockaddr_in *)addr;
entry = netlbl_domhsh_getentry_af4(secattr->domain,
addr4->sin_addr.s_addr);
if (entry == NULL) {
ret_val = -ENOENT;
goto conn_setattr_return;
}
switch (entry->type) {
case NETLBL_NLTYPE_CIPSOV4:
ret_val = cipso_v4_sock_setattr(sk,
entry->cipso, secattr);
break;
case NETLBL_NLTYPE_UNLABELED:
/* just delete the protocols we support for right now
* but we could remove other protocols if needed */
netlbl_sock_delattr(sk);
ret_val = 0;
break;
default:
ret_val = -ENOENT;
}
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
addr6 = (struct sockaddr_in6 *)addr;
entry = netlbl_domhsh_getentry_af6(secattr->domain,
&addr6->sin6_addr);
if (entry == NULL) {
ret_val = -ENOENT;
goto conn_setattr_return;
}
switch (entry->type) {
case NETLBL_NLTYPE_CALIPSO:
ret_val = calipso_sock_setattr(sk,
entry->calipso, secattr);
break;
case NETLBL_NLTYPE_UNLABELED:
/* just delete the protocols we support for right now
* but we could remove other protocols if needed */
netlbl_sock_delattr(sk);
ret_val = 0;
break;
default:
ret_val = -ENOENT;
}
break;
#endif /* IPv6 */
default:
ret_val = -EPROTONOSUPPORT;
}
conn_setattr_return:
rcu_read_unlock();
return ret_val;
}
/**
* netlbl_req_setattr - Label a request socket using the correct protocol
* @req: the request socket to label
* @secattr: the security attributes
*
* Description:
* Attach the correct label to the given socket using the security attributes
* specified in @secattr. Returns zero on success, negative values on failure.
*
*/
int netlbl_req_setattr(struct request_sock *req,
const struct netlbl_lsm_secattr *secattr)
{
int ret_val;
struct netlbl_dommap_def *entry;
struct inet_request_sock *ireq = inet_rsk(req);
rcu_read_lock();
switch (req->rsk_ops->family) {
case AF_INET:
entry = netlbl_domhsh_getentry_af4(secattr->domain,
ireq->ir_rmt_addr);
if (entry == NULL) {
ret_val = -ENOENT;
goto req_setattr_return;
}
switch (entry->type) {
case NETLBL_NLTYPE_CIPSOV4:
ret_val = cipso_v4_req_setattr(req,
entry->cipso, secattr);
break;
case NETLBL_NLTYPE_UNLABELED:
netlbl_req_delattr(req);
ret_val = 0;
break;
default:
ret_val = -ENOENT;
}
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
entry = netlbl_domhsh_getentry_af6(secattr->domain,
&ireq->ir_v6_rmt_addr);
if (entry == NULL) {
ret_val = -ENOENT;
goto req_setattr_return;
}
switch (entry->type) {
case NETLBL_NLTYPE_CALIPSO:
ret_val = calipso_req_setattr(req,
entry->calipso, secattr);
break;
case NETLBL_NLTYPE_UNLABELED:
netlbl_req_delattr(req);
ret_val = 0;
break;
default:
ret_val = -ENOENT;
}
break;
#endif /* IPv6 */
default:
ret_val = -EPROTONOSUPPORT;
}
req_setattr_return:
rcu_read_unlock();
return ret_val;
}
/**
* netlbl_req_delattr - Delete all the NetLabel labels on a socket
* @req: the socket
*
* Description:
* Remove all the NetLabel labeling from @req.
*
*/
void netlbl_req_delattr(struct request_sock *req)
{
switch (req->rsk_ops->family) {
case AF_INET:
cipso_v4_req_delattr(req);
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
calipso_req_delattr(req);
break;
#endif /* IPv6 */
}
}
/**
* netlbl_skbuff_setattr - Label a packet using the correct protocol
* @skb: the packet
* @family: protocol family
* @secattr: the security attributes
*
* Description:
* Attach the correct label to the given packet using the security attributes
* specified in @secattr. Returns zero on success, negative values on failure.
*
*/
int netlbl_skbuff_setattr(struct sk_buff *skb,
u16 family,
const struct netlbl_lsm_secattr *secattr)
{
int ret_val;
struct iphdr *hdr4;
#if IS_ENABLED(CONFIG_IPV6)
struct ipv6hdr *hdr6;
#endif
struct netlbl_dommap_def *entry;
rcu_read_lock();
switch (family) {
case AF_INET:
hdr4 = ip_hdr(skb);
entry = netlbl_domhsh_getentry_af4(secattr->domain,
hdr4->daddr);
if (entry == NULL) {
ret_val = -ENOENT;
goto skbuff_setattr_return;
}
switch (entry->type) {
case NETLBL_NLTYPE_CIPSOV4:
ret_val = cipso_v4_skbuff_setattr(skb, entry->cipso,
secattr);
break;
case NETLBL_NLTYPE_UNLABELED:
/* just delete the protocols we support for right now
* but we could remove other protocols if needed */
ret_val = cipso_v4_skbuff_delattr(skb);
break;
default:
ret_val = -ENOENT;
}
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
hdr6 = ipv6_hdr(skb);
entry = netlbl_domhsh_getentry_af6(secattr->domain,
&hdr6->daddr);
if (entry == NULL) {
ret_val = -ENOENT;
goto skbuff_setattr_return;
}
switch (entry->type) {
case NETLBL_NLTYPE_CALIPSO:
ret_val = calipso_skbuff_setattr(skb, entry->calipso,
secattr);
break;
case NETLBL_NLTYPE_UNLABELED:
/* just delete the protocols we support for right now
* but we could remove other protocols if needed */
ret_val = calipso_skbuff_delattr(skb);
break;
default:
ret_val = -ENOENT;
}
break;
#endif /* IPv6 */
default:
ret_val = -EPROTONOSUPPORT;
}
skbuff_setattr_return:
rcu_read_unlock();
return ret_val;
}
/**
* netlbl_skbuff_getattr - Determine the security attributes of a packet
* @skb: the packet
* @family: protocol family
* @secattr: the security attributes
*
* Description:
* Examines the given packet to see if a recognized form of packet labeling
* is present, if so it parses the packet label and returns the security
* attributes in @secattr. Returns zero on success, negative values on
* failure.
*
*/
int netlbl_skbuff_getattr(const struct sk_buff *skb,
u16 family,
struct netlbl_lsm_secattr *secattr)
{
unsigned char *ptr;
switch (family) {
case AF_INET:
ptr = cipso_v4_optptr(skb);
if (ptr && cipso_v4_getattr(ptr, secattr) == 0)
return 0;
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
ptr = calipso_optptr(skb);
if (ptr && calipso_getattr(ptr, secattr) == 0)
return 0;
break;
#endif /* IPv6 */
}
return netlbl_unlabel_getattr(skb, family, secattr);
}
/**
* netlbl_skbuff_err - Handle a LSM error on a sk_buff
* @skb: the packet
* @family: the family
* @error: the error code
* @gateway: true if host is acting as a gateway, false otherwise
*
* Description:
* Deal with a LSM problem when handling the packet in @skb, typically this is
* a permission denied problem (-EACCES). The correct action is determined
* according to the packet's labeling protocol.
*
*/
void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway)
{
switch (family) {
case AF_INET:
if (cipso_v4_optptr(skb))
cipso_v4_error(skb, error, gateway);
break;
}
}
/**
* netlbl_cache_invalidate - Invalidate all of the NetLabel protocol caches
*
* Description:
* For all of the NetLabel protocols that support some form of label mapping
* cache, invalidate the cache. Returns zero on success, negative values on
* error.
*
*/
void netlbl_cache_invalidate(void)
{
cipso_v4_cache_invalidate();
#if IS_ENABLED(CONFIG_IPV6)
calipso_cache_invalidate();
#endif /* IPv6 */
}
/**
* netlbl_cache_add - Add an entry to a NetLabel protocol cache
* @skb: the packet
* @family: the family
* @secattr: the packet's security attributes
*
* Description:
* Add the LSM security attributes for the given packet to the underlying
* NetLabel protocol's label mapping cache. Returns zero on success, negative
* values on error.
*
*/
int netlbl_cache_add(const struct sk_buff *skb, u16 family,
const struct netlbl_lsm_secattr *secattr)
{
unsigned char *ptr;
if ((secattr->flags & NETLBL_SECATTR_CACHE) == 0)
return -ENOMSG;
switch (family) {
case AF_INET:
ptr = cipso_v4_optptr(skb);
if (ptr)
return cipso_v4_cache_add(ptr, secattr);
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
ptr = calipso_optptr(skb);
if (ptr)
return calipso_cache_add(ptr, secattr);
break;
#endif /* IPv6 */
}
return -ENOMSG;
}
/*
* Protocol Engine Functions
*/
/**
* netlbl_audit_start - Start an audit message
* @type: audit message type
* @audit_info: NetLabel audit information
*
* Description:
* Start an audit message using the type specified in @type and fill the audit
* message with some fields common to all NetLabel audit messages. This
* function should only be used by protocol engines, not LSMs. Returns a
* pointer to the audit buffer on success, NULL on failure.
*
*/
struct audit_buffer *netlbl_audit_start(int type,
struct netlbl_audit *audit_info)
{
return netlbl_audit_start_common(type, audit_info);
}
EXPORT_SYMBOL(netlbl_audit_start);
/*
* Setup Functions
*/
/**
* netlbl_init - Initialize NetLabel
*
* Description:
* Perform the required NetLabel initialization before first use.
*
*/
static int __init netlbl_init(void)
{
int ret_val;
printk(KERN_INFO "NetLabel: Initializing\n");
printk(KERN_INFO "NetLabel: domain hash size = %u\n",
(1 << NETLBL_DOMHSH_BITSIZE));
printk(KERN_INFO "NetLabel: protocols = UNLABELED CIPSOv4 CALIPSO\n");
ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE);
if (ret_val != 0)
goto init_failure;
ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE);
if (ret_val != 0)
goto init_failure;
ret_val = netlbl_netlink_init();
if (ret_val != 0)
goto init_failure;
ret_val = netlbl_unlabel_defconf();
if (ret_val != 0)
goto init_failure;
printk(KERN_INFO "NetLabel: unlabeled traffic allowed by default\n");
return 0;
init_failure:
panic("NetLabel: failed to initialize properly (%d)\n", ret_val);
}
subsys_initcall(netlbl_init);
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Global definitions for the Ethernet IEEE 802.3 interface.
*
* Version: @(#)if_ether.h 1.0.1a 02/08/94
*
* Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Donald Becker, <becker@super.org>
* Alan Cox, <alan@lxorguk.ukuu.org.uk>
* Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk>
*/
#ifndef _LINUX_IF_ETHER_H
#define _LINUX_IF_ETHER_H
#include <linux/skbuff.h>
#include <uapi/linux/if_ether.h>
static inline struct ethhdr *eth_hdr(const struct sk_buff *skb)
{
return (struct ethhdr *)skb_mac_header(skb);
}
/* Prefer this version in TX path, instead of
* skb_reset_mac_header() + eth_hdr()
*/
static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb)
{
return (struct ethhdr *)skb->data;
}
static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb)
{
return (struct ethhdr *)skb_inner_mac_header(skb);
}
int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);
extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len);
#endif /* _LINUX_IF_ETHER_H */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2008 Oracle. All rights reserved.
*/
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "ordered-data.h"
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
#include "zoned.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
const char* btrfs_compress_type2str(enum btrfs_compression_type type)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB:
case BTRFS_COMPRESS_LZO:
case BTRFS_COMPRESS_ZSTD:
case BTRFS_COMPRESS_NONE:
return btrfs_compress_types[type];
default:
break;
}
return NULL;
}
bool btrfs_compress_is_valid_type(const char *str, size_t len)
{
int i;
for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
size_t comp_len = strlen(btrfs_compress_types[i]);
if (len < comp_len)
continue;
if (!strncmp(btrfs_compress_types[i], str, comp_len))
return true;
}
return false;
}
static int compression_compress_pages(int type, struct list_head *ws,
struct address_space *mapping, u64 start, struct page **pages,
unsigned long *out_pages, unsigned long *total_in,
unsigned long *total_out)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB:
return zlib_compress_pages(ws, mapping, start, pages,
out_pages, total_in, total_out);
case BTRFS_COMPRESS_LZO:
return lzo_compress_pages(ws, mapping, start, pages,
out_pages, total_in, total_out);
case BTRFS_COMPRESS_ZSTD:
return zstd_compress_pages(ws, mapping, start, pages,
out_pages, total_in, total_out);
case BTRFS_COMPRESS_NONE:
default:
/*
* This can happen when compression races with remount setting
* it to 'no compress', while caller doesn't call
* inode_need_compress() to check if we really need to
* compress.
*
* Not a big deal, just need to inform caller that we
* haven't allocated any pages yet.
*/
*out_pages = 0;
return -E2BIG;
}
}
static int compression_decompress_bio(int type, struct list_head *ws,
struct compressed_bio *cb)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
case BTRFS_COMPRESS_NONE:
default:
/*
* This can't happen, the type is validated several times
* before we get here.
*/
BUG();
}
}
static int compression_decompress(int type, struct list_head *ws,
unsigned char *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
start_byte, srclen, destlen);
case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
start_byte, srclen, destlen);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
start_byte, srclen, destlen);
case BTRFS_COMPRESS_NONE:
default:
/*
* This can't happen, the type is validated several times
* before we get here.
*/
BUG();
}
}
static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
unsigned long disk_size)
{
return sizeof(struct compressed_bio) +
(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
}
static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
u64 disk_start)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
const u32 csum_size = fs_info->csum_size;
const u32 sectorsize = fs_info->sectorsize;
struct page *page;
unsigned int i;
char *kaddr;
u8 csum[BTRFS_CSUM_SIZE];
struct compressed_bio *cb = bio->bi_private;
u8 *cb_sum = cb->sums;
if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM))
return 0;
shash->tfm = fs_info->csum_shash;
for (i = 0; i < cb->nr_pages; i++) {
u32 pg_offset;
u32 bytes_left = PAGE_SIZE;
page = cb->compressed_pages[i];
/* Determine the remaining bytes inside the page first */
if (i == cb->nr_pages - 1)
bytes_left = cb->compressed_len - i * PAGE_SIZE;
/* Hash through the page sector by sector */
for (pg_offset = 0; pg_offset < bytes_left;
pg_offset += sectorsize) {
kaddr = kmap_atomic(page);
crypto_shash_digest(shash, kaddr + pg_offset,
sectorsize, csum);
kunmap_atomic(kaddr);
if (memcmp(&csum, cb_sum, csum_size) != 0) {
btrfs_print_data_csum_error(inode, disk_start,
csum, cb_sum, cb->mirror_num);
if (btrfs_io_bio(bio)->device)
btrfs_dev_stat_inc_and_print(
btrfs_io_bio(bio)->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
return -EIO;
}
cb_sum += csum_size;
disk_start += sectorsize;
}
}
return 0;
}
/* when we finish reading compressed pages from the disk, we
* decompress them and then run the bio end_io routines on the
* decompressed pages (in the inode address space).
*
* This allows the checksumming and other IO error handling routines
* to work normally
*
* The compressed pages are freed here, and it must be run
* in process context
*/
static void end_compressed_bio_read(struct bio *bio)
{
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
struct page *page;
unsigned int index;
unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
int ret = 0;
if (bio->bi_status)
cb->errors = 1;
/* if there are more bios still pending for this compressed
* extent, just exit
*/
if (!refcount_dec_and_test(&cb->pending_bios))
goto out;
/*
* Record the correct mirror_num in cb->orig_bio so that
* read-repair can work properly.
*/
btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
cb->mirror_num = mirror;
/*
* Some IO in this cb have failed, just skip checksum as there
* is no way it could be correct.
*/
if (cb->errors == 1)
goto csum_failed;
inode = cb->inode;
ret = check_compressed_csum(BTRFS_I(inode), bio,
bio->bi_iter.bi_sector << 9);
if (ret)
goto csum_failed;
/* ok, we're the last bio for this extent, lets start
* the decompression.
*/
ret = btrfs_decompress_bio(cb);
csum_failed:
if (ret)
cb->errors = 1;
/* release the compressed pages */
index = 0;
for (index = 0; index < cb->nr_pages; index++) {
page = cb->compressed_pages[index];
page->mapping = NULL;
put_page(page);
}
/* do io completion on the original bio */
if (cb->errors) {
bio_io_error(cb->orig_bio);
} else {
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
/*
* we have verified the checksum already, set page
* checked so the end_io handlers know about it
*/
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
SetPageChecked(bvec->bv_page);
bio_endio(cb->orig_bio);
}
/* finally free the cb struct */
kfree(cb->compressed_pages);
kfree(cb);
out:
bio_put(bio);
}
/*
* Clear the writeback bits on all of the file
* pages for a compressed write
*/
static noinline void end_compressed_writeback(struct inode *inode,
const struct compressed_bio *cb)
{
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct page *pages[16];
unsigned long nr_pages = end_index - index + 1;
int i;
int ret;
if (cb->errors)
mapping_set_error(inode->i_mapping, -EIO);
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
min_t(unsigned long,
nr_pages, ARRAY_SIZE(pages)), pages);
if (ret == 0) {
nr_pages -= 1;
index += 1;
continue;
}
for (i = 0; i < ret; i++) {
if (cb->errors)
SetPageError(pages[i]);
end_page_writeback(pages[i]);
put_page(pages[i]);
}
nr_pages -= ret;
index += ret;
}
/* the inode may be gone now */
}
/*
* do the cleanup once all the compressed pages hit the disk.
* This will clear writeback on the file pages and free the compressed
* pages.
*
* This also calls the writeback end hooks for the file pages so that
* metadata and checksums can be updated in the file.
*/
static void end_compressed_bio_write(struct bio *bio)
{
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
struct page *page;
unsigned int index;
if (bio->bi_status)
cb->errors = 1;
/* if there are more bios still pending for this compressed
* extent, just exit
*/
if (!refcount_dec_and_test(&cb->pending_bios))
goto out;
/* ok, we're the last bio for this extent, step one is to
* call back into the FS and do all the end_io operations
*/
inode = cb->inode;
btrfs_record_physical_zoned(inode, cb->start, bio);
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
!cb->errors);
end_compressed_writeback(inode, cb);
/* note, our inode could be gone now */
/*
* release the compressed pages, these came from alloc_page and
* are not attached to the inode at all
*/
index = 0;
for (index = 0; index < cb->nr_pages; index++) {
page = cb->compressed_pages[index];
page->mapping = NULL;
put_page(page);
}
/* finally free the cb struct */
kfree(cb->compressed_pages);
kfree(cb);
out:
bio_put(bio);
}
/*
* worker function to build and submit bios for previously compressed pages.
* The corresponding pages in the inode should be marked for writeback
* and the compressed pages should have a reference on them for dropping
* when the IO is complete.
*
* This also checksums the file bytes and gets things ready for
* the end io hooks.
*/
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int len, u64 disk_start,
unsigned int compressed_len,
struct page **compressed_pages,
unsigned int nr_pages,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
struct compressed_bio *cb;
unsigned long bytes_left;
int pg_index = 0;
struct page *page;
u64 first_byte = disk_start;
blk_status_t ret;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
const bool use_append = btrfs_use_zone_append(inode, disk_start);
const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
WARN_ON(!PAGE_ALIGNED(start));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
cb->mirror_num = 0;
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
bio = btrfs_bio_alloc(first_byte);
bio->bi_opf = bio_op | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
if (use_append) {
struct btrfs_device *device;
device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
if (IS_ERR(device)) {
kfree(cb);
bio_put(bio);
return BLK_STS_NOTSUPP;
}
bio_set_dev(bio, device->bdev);
}
if (blkcg_css) {
bio->bi_opf |= REQ_CGROUP_PUNT;
kthread_associate_blkcg(blkcg_css);
}
refcount_set(&cb->pending_bios, 1);
/* create and submit bios for the compressed pages */
bytes_left = compressed_len;
for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
int submit = 0;
int len = 0;
page = compressed_pages[pg_index];
page->mapping = inode->vfs_inode.i_mapping;
if (bio->bi_iter.bi_size)
submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
0);
/*
* Page can only be added to bio if the current bio fits in
* stripe.
*/
if (!submit) {
if (pg_index == 0 && use_append)
len = bio_add_zone_append_page(bio, page,
PAGE_SIZE, 0);
else
len = bio_add_page(bio, page, PAGE_SIZE, 0);
}
page->mapping = NULL;
if (submit || len < PAGE_SIZE) {
/*
* inc the count before we submit the bio so
* we know the end IO handler won't happen before
* we inc the count. Otherwise, the cb might get
* freed before we're done setting it up
*/
refcount_inc(&cb->pending_bios);
ret = btrfs_bio_wq_end_io(fs_info, bio,
BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, start, 1);
BUG_ON(ret); /* -ENOMEM */
}
ret = btrfs_map_bio(fs_info, bio, 0);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
}
bio = btrfs_bio_alloc(first_byte);
bio->bi_opf = bio_op | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
if (blkcg_css)
bio->bi_opf |= REQ_CGROUP_PUNT;
/*
* Use bio_add_page() to ensure the bio has at least one
* page.
*/
bio_add_page(bio, page, PAGE_SIZE, 0);
}
if (bytes_left < PAGE_SIZE) {
btrfs_info(fs_info,
"bytes left %lu compress len %u nr %u",
bytes_left, cb->compressed_len, cb->nr_pages);
}
bytes_left -= PAGE_SIZE;
first_byte += PAGE_SIZE;
cond_resched();
}
ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, start, 1);
BUG_ON(ret); /* -ENOMEM */
}
ret = btrfs_map_bio(fs_info, bio, 0);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
}
if (blkcg_css)
kthread_associate_blkcg(NULL);
return 0;
}
static u64 bio_end_offset(struct bio *bio)
{
struct bio_vec *last = bio_last_bvec_all(bio);
return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
struct compressed_bio *cb)
{
unsigned long end_index;
unsigned long pg_index;
u64 last_offset;
u64 isize = i_size_read(inode);
int ret;
struct page *page;
struct extent_map *em;
struct address_space *mapping = inode->i_mapping;
struct extent_map_tree *em_tree;
struct extent_io_tree *tree;
u64 end;
int misses = 0;
last_offset = bio_end_offset(cb->orig_bio);
em_tree = &BTRFS_I(inode)->extent_tree;
tree = &BTRFS_I(inode)->io_tree;
if (isize == 0)
return 0;
/*
* For current subpage support, we only support 64K page size,
* which means maximum compressed extent size (128K) is just 2x page
* size.
* This makes readahead less effective, so here disable readahead for
* subpage for now, until full compressed write is supported.
*/
if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
return 0;
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
while (last_offset < compressed_end) {
pg_index = last_offset >> PAGE_SHIFT;
if (pg_index > end_index)
break;
page = xa_load(&mapping->i_pages, pg_index);
if (page && !xa_is_value(page)) {
misses++;
if (misses > 4)
break;
goto next;
}
page = __page_cache_alloc(mapping_gfp_constraint(mapping,
~__GFP_FS));
if (!page)
break;
if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
put_page(page);
goto next;
}
/*
* at this point, we have a locked page in the page cache
* for these bytes in the file. But, we have to make
* sure they map to this compressed extent on disk.
*/
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);
put_page(page);
break;
}
end = last_offset + PAGE_SIZE - 1;
lock_extent(tree, last_offset, end);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
PAGE_SIZE);
read_unlock(&em_tree->lock);
if (!em || last_offset < em->start ||
(last_offset + PAGE_SIZE > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
unlock_extent(tree, last_offset, end);
unlock_page(page);
put_page(page);
break;
}
free_extent_map(em);
if (page->index == end_index) {
size_t zero_offset = offset_in_page(isize);
if (zero_offset) {
int zeros;
zeros = PAGE_SIZE - zero_offset;
memzero_page(page, zero_offset, zeros);
flush_dcache_page(page);
}
}
ret = bio_add_page(cb->orig_bio, page,
PAGE_SIZE, 0);
if (ret == PAGE_SIZE) {
put_page(page);
} else {
unlock_extent(tree, last_offset, end);
unlock_page(page);
put_page(page);
break;
}
next:
last_offset += PAGE_SIZE;
}
return 0;
}
/*
* for a compressed read, the bio we get passed has all the inode pages
* in it. We don't actually do IO on those pages but allocate new ones
* to hold the compressed pages on disk.
*
* bio->bi_iter.bi_sector points to the compressed extent on disk
* bio->bi_io_vec points to all of the inode pages
*
* After the compressed pages are read, we copy the bytes into the
* bio we were passed and then call the bio end_io calls
*/
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
unsigned int compressed_len;
unsigned int nr_pages;
unsigned int pg_index;
struct page *page;
struct bio *comp_bio;
u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
u64 file_offset;
u64 em_len;
u64 em_start;
struct extent_map *em;
blk_status_t ret = BLK_STS_RESOURCE;
int faili = 0;
u8 *sums;
em_tree = &BTRFS_I(inode)->extent_tree;
file_offset = bio_first_bvec_all(bio)->bv_offset +
page_offset(bio_first_page_all(bio));
/* we need the actual starting offset of this extent in the file */
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
read_unlock(&em_tree->lock);
if (!em)
return BLK_STS_IOERR;
ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
goto out;
refcount_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
cb->mirror_num = mirror_num;
sums = cb->sums;
cb->start = em->orig_start;
em_len = em->len;
em_start = em->start;
free_extent_map(em);
em = NULL;
cb->len = bio->bi_iter.bi_size;
cb->compressed_len = compressed_len;
cb->compress_type = extent_compress_type(bio_flags);
cb->orig_bio = bio;
nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
GFP_NOFS);
if (!cb->compressed_pages)
goto fail1;
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
if (!cb->compressed_pages[pg_index]) {
faili = pg_index - 1;
ret = BLK_STS_RESOURCE;
goto fail2;
}
}
faili = nr_pages - 1;
cb->nr_pages = nr_pages;
add_ra_bio_pages(inode, em_start + em_len, cb);
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
comp_bio = btrfs_bio_alloc(cur_disk_byte);
comp_bio->bi_opf = REQ_OP_READ;
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
refcount_set(&cb->pending_bios, 1);
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
u32 pg_len = PAGE_SIZE;
int submit = 0;
/*
* To handle subpage case, we need to make sure the bio only
* covers the range we need.
*
* If we're at the last page, truncate the length to only cover
* the remaining part.
*/
if (pg_index == nr_pages - 1)
pg_len = min_t(u32, PAGE_SIZE,
compressed_len - pg_index * PAGE_SIZE);
page = cb->compressed_pages[pg_index];
page->mapping = inode->i_mapping;
page->index = em_start >> PAGE_SHIFT;
if (comp_bio->bi_iter.bi_size)
submit = btrfs_bio_fits_in_stripe(page, pg_len,
comp_bio, 0);
page->mapping = NULL;
if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
unsigned int nr_sectors;
ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
/*
* inc the count before we submit the bio so
* we know the end IO handler won't happen before
* we inc the count. Otherwise, the cb might get
* freed before we're done setting it up
*/
refcount_inc(&cb->pending_bios);
ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
BUG_ON(ret); /* -ENOMEM */
nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
fs_info->sectorsize);
sums += fs_info->csum_size * nr_sectors;
ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret) {
comp_bio->bi_status = ret;
bio_endio(comp_bio);
}
comp_bio = btrfs_bio_alloc(cur_disk_byte);
comp_bio->bi_opf = REQ_OP_READ;
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
bio_add_page(comp_bio, page, pg_len, 0);
}
cur_disk_byte += pg_len;
}
ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret) {
comp_bio->bi_status = ret;
bio_endio(comp_bio);
}
return 0;
fail2:
while (faili >= 0) {
__free_page(cb->compressed_pages[faili]);
faili--;
}
kfree(cb->compressed_pages);
fail1:
kfree(cb);
out:
free_extent_map(em);
return ret;
}
/*
* Heuristic uses systematic sampling to collect data from the input data
* range, the logic can be tuned by the following constants:
*
* @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample
* @SAMPLING_INTERVAL - range from which the sampled data can be collected
*/
#define SAMPLING_READ_SIZE (16)
#define SAMPLING_INTERVAL (256)
/*
* For statistical analysis of the input data we consider bytes that form a
* Galois Field of 256 objects. Each object has an attribute count, ie. how
* many times the object appeared in the sample.
*/
#define BUCKET_SIZE (256)
/*
* The size of the sample is based on a statistical sampling rule of thumb.
* The common way is to perform sampling tests as long as the number of
* elements in each cell is at least 5.
*
* Instead of 5, we choose 32 to obtain more accurate results.
* If the data contain the maximum number of symbols, which is 256, we obtain a
* sample size bound by 8192.
*
* For a sample of at most 8KB of data per data range: 16 consecutive bytes
* from up to 512 locations.
*/
#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \
SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
struct bucket_item {
u32 count;
};
struct heuristic_ws {
/* Partial copy of input data */
u8 *sample;
u32 sample_size;
/* Buckets store counters for each byte value */
struct bucket_item *bucket;
/* Sorting buffer */
struct bucket_item *bucket_b;
struct list_head list;
};
static struct workspace_manager heuristic_wsm;
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
workspace = list_entry(ws, struct heuristic_ws, list);
kvfree(workspace->sample);
kfree(workspace->bucket);
kfree(workspace->bucket_b);
kfree(workspace);
}
static struct list_head *alloc_heuristic_ws(unsigned int level)
{
struct heuristic_ws *ws;
ws = kzalloc(sizeof(*ws), GFP_KERNEL);
if (!ws)
return ERR_PTR(-ENOMEM);
ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
if (!ws->sample)
goto fail;
ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
if (!ws->bucket)
goto fail;
ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
if (!ws->bucket_b)
goto fail;
INIT_LIST_HEAD(&ws->list);
return &ws->list;
fail:
free_heuristic_ws(&ws->list);
return ERR_PTR(-ENOMEM);
}
const struct btrfs_compress_op btrfs_heuristic_compress = {
.workspace_manager = &heuristic_wsm,
};
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
/* The heuristic is represented as compression type 0 */
&btrfs_heuristic_compress,
&btrfs_zlib_compress,
&btrfs_lzo_compress,
&btrfs_zstd_compress,
};
static struct list_head *alloc_workspace(int type, unsigned int level)
{
switch (type) {
case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level);
case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
default:
/*
* This can't happen, the type is validated several times
* before we get here.
*/
BUG();
}
}
static void free_workspace(int type, struct list_head *ws)
{
switch (type) {
case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws);
case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
default:
/*
* This can't happen, the type is validated several times
* before we get here.
*/
BUG();
}
}
static void btrfs_init_workspace_manager(int type)
{
struct workspace_manager *wsm;
struct list_head *workspace;
wsm = btrfs_compress_op[type]->workspace_manager;
INIT_LIST_HEAD(&wsm->idle_ws);
spin_lock_init(&wsm->ws_lock);
atomic_set(&wsm->total_ws, 0);
init_waitqueue_head(&wsm->ws_wait);
/*
* Preallocate one workspace for each compression type so we can
* guarantee forward progress in the worst case
*/
workspace = alloc_workspace(type, 0);
if (IS_ERR(workspace)) {
pr_warn(
"BTRFS: cannot preallocate compression workspace, will try later\n");
} else {
atomic_set(&wsm->total_ws, 1);
wsm->free_ws = 1;
list_add(workspace, &wsm->idle_ws);
}
}
static void btrfs_cleanup_workspace_manager(int type)
{
struct workspace_manager *wsman;
struct list_head *ws;
wsman = btrfs_compress_op[type]->workspace_manager;
while (!list_empty(&wsman->idle_ws)) {
ws = wsman->idle_ws.next;
list_del(ws);
free_workspace(type, ws);
atomic_dec(&wsman->total_ws);
}
}
/*
* This finds an available workspace or allocates a new one.
* If it's not possible to allocate a new one, waits until there's one.
* Preallocation makes a forward progress guarantees and we do not return
* errors.
*/
struct list_head *btrfs_get_workspace(int type, unsigned int level)
{
struct workspace_manager *wsm;
struct list_head *workspace;
int cpus = num_online_cpus();
unsigned nofs_flag;
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
wsm = btrfs_compress_op[type]->workspace_manager;
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
ws_wait = &wsm->ws_wait;
free_ws = &wsm->free_ws;
again:
spin_lock(ws_lock);
if (!list_empty(idle_ws)) {
workspace = idle_ws->next;
list_del(workspace);
(*free_ws)--;
spin_unlock(ws_lock);
return workspace;
}
if (atomic_read(total_ws) > cpus) {
DEFINE_WAIT(wait);
spin_unlock(ws_lock);
prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
if (atomic_read(total_ws) > cpus && !*free_ws) schedule(); finish_wait(ws_wait, &wait);
goto again;
}
atomic_inc(total_ws);
spin_unlock(ws_lock);
/*
* Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
* to turn it off here because we might get called from the restricted
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
workspace = alloc_workspace(type, level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
atomic_dec(total_ws);
wake_up(ws_wait);
/*
* Do not return the error but go back to waiting. There's a
* workspace preallocated for each type and the compression
* time is bounded so we get to a workspace eventually. This
* makes our caller's life easier.
*
* To prevent silent and low-probability deadlocks (when the
* initial preallocation fails), check if there are any
* workspaces at all.
*/
if (atomic_read(total_ws) == 0) {
static DEFINE_RATELIMIT_STATE(_rs,
/* once per minute */ 60 * HZ,
/* no burst */ 1);
if (__ratelimit(&_rs)) {
pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
}
}
goto again;
}
return workspace;
}
static struct list_head *get_workspace(int type, int level)
{
switch (type) { case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level); case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level); case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level); case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
default:
/*
* This can't happen, the type is validated several times
* before we get here.
*/
BUG();
}
}
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
void btrfs_put_workspace(int type, struct list_head *ws)
{
struct workspace_manager *wsm;
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
wsm = btrfs_compress_op[type]->workspace_manager;
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws; ws_wait = &wsm->ws_wait;
free_ws = &wsm->free_ws;
spin_lock(ws_lock);
if (*free_ws <= num_online_cpus()) {
list_add(ws, idle_ws);
(*free_ws)++;
spin_unlock(ws_lock);
goto wake;
}
spin_unlock(ws_lock);
free_workspace(type, ws);
atomic_dec(total_ws);
wake:
cond_wake_up(ws_wait);
}
static void put_workspace(int type, struct list_head *ws)
{
switch (type) { case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws); case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws); case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws); case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
default:
/*
* This can't happen, the type is validated several times
* before we get here.
*/
BUG();
}
}
/*
* Adjust @level according to the limits of the compression algorithm or
* fallback to default
*/
static unsigned int btrfs_compress_set_level(int type, unsigned level)
{
const struct btrfs_compress_op *ops = btrfs_compress_op[type];
if (level == 0)
level = ops->default_level;
else
level = min(level, ops->max_level);
return level;
}
/*
* Given an address space and start and length, compress the bytes into @pages
* that are allocated on demand.
*
* @type_level is encoded algorithm and level, where level 0 means whatever
* default the algorithm chooses and is opaque here;
* - compression algo are 0-3
* - the level are bits 4-7
*
* @out_pages is an in/out parameter, holds maximum number of pages to allocate
* and returns number of actually allocated pages
*
* @total_in is used to return the number of bytes actually read. It
* may be smaller than the input length if we had to exit early because we
* ran out of room in the pages array or because we cross the
* max_out threshold.
*
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
*/
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
u64 start, struct page **pages,
unsigned long *out_pages,
unsigned long *total_in,
unsigned long *total_out)
{
int type = btrfs_compress_type(type_level);
int level = btrfs_compress_level(type_level);
struct list_head *workspace;
int ret;
level = btrfs_compress_set_level(type, level);
workspace = get_workspace(type, level);
ret = compression_compress_pages(type, workspace, mapping, start, pages,
out_pages, total_in, total_out);
put_workspace(type, workspace);
return ret;
}
static int btrfs_decompress_bio(struct compressed_bio *cb)
{
struct list_head *workspace;
int ret;
int type = cb->compress_type;
workspace = get_workspace(type, 0);
ret = compression_decompress_bio(type, workspace, cb);
put_workspace(type, workspace);
return ret;
}
/*
* a less complex decompression routine. Our compressed data fits in a
* single page, and we want to read a single page out of it.
* start_byte tells us the offset into the compressed data we're interested in
*/
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen)
{
struct list_head *workspace;
int ret;
workspace = get_workspace(type, 0);
ret = compression_decompress(type, workspace, data_in, dest_page,
start_byte, srclen, destlen);
put_workspace(type, workspace);
return ret;
}
void __init btrfs_init_compress(void)
{
btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
zstd_init_workspace_manager();
}
void __cold btrfs_exit_compress(void)
{
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
zstd_cleanup_workspace_manager();
}
/*
* Copy decompressed data from working buffer to pages.
*
* @buf: The decompressed data buffer
* @buf_len: The decompressed data length
* @decompressed: Number of bytes that are already decompressed inside the
* compressed extent
* @cb: The compressed extent descriptor
* @orig_bio: The original bio that the caller wants to read for
*
* An easier to understand graph is like below:
*
* |<- orig_bio ->| |<- orig_bio->|
* |<------- full decompressed extent ----->|
* |<----------- @cb range ---->|
* | |<-- @buf_len -->|
* |<--- @decompressed --->|
*
* Note that, @cb can be a subpage of the full decompressed extent, but
* @cb->start always has the same as the orig_file_offset value of the full
* decompressed extent.
*
* When reading compressed extent, we have to read the full compressed extent,
* while @orig_bio may only want part of the range.
* Thus this function will ensure only data covered by @orig_bio will be copied
* to.
*
* Return 0 if we have copied all needed contents for @orig_bio.
* Return >0 if we need continue decompress.
*/
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed)
{
struct bio *orig_bio = cb->orig_bio;
/* Offset inside the full decompressed extent */
u32 cur_offset;
cur_offset = decompressed;
/* The main loop to do the copy */
while (cur_offset < decompressed + buf_len) {
struct bio_vec bvec;
size_t copy_len;
u32 copy_start;
/* Offset inside the full decompressed extent */
u32 bvec_offset;
bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
/*
* cb->start may underflow, but subtracting that value can still
* give us correct offset inside the full decompressed extent.
*/
bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
/* Haven't reached the bvec range, exit */
if (decompressed + buf_len <= bvec_offset)
return 1;
copy_start = max(cur_offset, bvec_offset);
copy_len = min(bvec_offset + bvec.bv_len,
decompressed + buf_len) - copy_start;
ASSERT(copy_len);
/*
* Extra range check to ensure we didn't go beyond
* @buf + @buf_len.
*/
ASSERT(copy_start - decompressed < buf_len);
memcpy_to_page(bvec.bv_page, bvec.bv_offset,
buf + copy_start - decompressed, copy_len);
flush_dcache_page(bvec.bv_page);
cur_offset += copy_len;
bio_advance(orig_bio, copy_len);
/* Finished the bio */
if (!orig_bio->bi_iter.bi_size)
return 0;
}
return 1;
}
/*
* Shannon Entropy calculation
*
* Pure byte distribution analysis fails to determine compressibility of data.
* Try calculating entropy to estimate the average minimum number of bits
* needed to encode the sampled data.
*
* For convenience, return the percentage of needed bits, instead of amount of
* bits directly.
*
* @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
* and can be compressible with high probability
*
* @ENTROPY_LVL_HIGH - data are not compressible with high probability
*
* Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
*/
#define ENTROPY_LVL_ACEPTABLE (65)
#define ENTROPY_LVL_HIGH (80)
/*
* For increasead precision in shannon_entropy calculation,
* let's do pow(n, M) to save more digits after comma:
*
* - maximum int bit length is 64
* - ilog2(MAX_SAMPLE_SIZE) -> 13
* - 13 * 4 = 52 < 64 -> M = 4
*
* So use pow(n, 4).
*/
static inline u32 ilog2_w(u64 n)
{
return ilog2(n * n * n * n);
}
static u32 shannon_entropy(struct heuristic_ws *ws)
{
const u32 entropy_max = 8 * ilog2_w(2);
u32 entropy_sum = 0;
u32 p, p_base, sz_base;
u32 i;
sz_base = ilog2_w(ws->sample_size);
for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
p = ws->bucket[i].count;
p_base = ilog2_w(p);
entropy_sum += p * (sz_base - p_base);
}
entropy_sum /= ws->sample_size;
return entropy_sum * 100 / entropy_max;
}
#define RADIX_BASE 4U
#define COUNTERS_SIZE (1U << RADIX_BASE)
static u8 get4bits(u64 num, int shift) {
u8 low4bits;
num >>= shift;
/* Reverse order */
low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
return low4bits;
}
/*
* Use 4 bits as radix base
* Use 16 u32 counters for calculating new position in buf array
*
* @array - array that will be sorted
* @array_buf - buffer array to store sorting results
* must be equal in size to @array
* @num - array size
*/
static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
int num)
{
u64 max_num;
u64 buf_num;
u32 counters[COUNTERS_SIZE];
u32 new_addr;
u32 addr;
int bitlen;
int shift;
int i;
/*
* Try avoid useless loop iterations for small numbers stored in big
* counters. Example: 48 33 4 ... in 64bit array
*/
max_num = array[0].count;
for (i = 1; i < num; i++) {
buf_num = array[i].count;
if (buf_num > max_num)
max_num = buf_num;
}
buf_num = ilog2(max_num);
bitlen = ALIGN(buf_num, RADIX_BASE * 2);
shift = 0;
while (shift < bitlen) { memset(counters, 0, sizeof(counters));
for (i = 0; i < num; i++) {
buf_num = array[i].count;
addr = get4bits(buf_num, shift);
counters[addr]++;
}
for (i = 1; i < COUNTERS_SIZE; i++)
counters[i] += counters[i - 1];
for (i = num - 1; i >= 0; i--) {
buf_num = array[i].count;
addr = get4bits(buf_num, shift);
counters[addr]--;
new_addr = counters[addr];
array_buf[new_addr] = array[i];
}
shift += RADIX_BASE;
/*
* Normal radix expects to move data from a temporary array, to
* the main one. But that requires some CPU time. Avoid that
* by doing another sort iteration to original array instead of
* memcpy()
*/
memset(counters, 0, sizeof(counters));
for (i = 0; i < num; i ++) {
buf_num = array_buf[i].count;
addr = get4bits(buf_num, shift);
counters[addr]++;
}
for (i = 1; i < COUNTERS_SIZE; i++)
counters[i] += counters[i - 1];
for (i = num - 1; i >= 0; i--) {
buf_num = array_buf[i].count;
addr = get4bits(buf_num, shift);
counters[addr]--;
new_addr = counters[addr];
array[new_addr] = array_buf[i];
}
shift += RADIX_BASE;
}
}
/*
* Size of the core byte set - how many bytes cover 90% of the sample
*
* There are several types of structured binary data that use nearly all byte
* values. The distribution can be uniform and counts in all buckets will be
* nearly the same (eg. encrypted data). Unlikely to be compressible.
*
* Other possibility is normal (Gaussian) distribution, where the data could
* be potentially compressible, but we have to take a few more steps to decide
* how much.
*
* @BYTE_CORE_SET_LOW - main part of byte values repeated frequently,
* compression algo can easy fix that
* @BYTE_CORE_SET_HIGH - data have uniform distribution and with high
* probability is not compressible
*/
#define BYTE_CORE_SET_LOW (64)
#define BYTE_CORE_SET_HIGH (200)
static int byte_core_set_size(struct heuristic_ws *ws)
{
u32 i;
u32 coreset_sum = 0;
const u32 core_set_threshold = ws->sample_size * 90 / 100;
struct bucket_item *bucket = ws->bucket;
/* Sort in reverse order */
radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);
for (i = 0; i < BYTE_CORE_SET_LOW; i++)
coreset_sum += bucket[i].count; if (coreset_sum > core_set_threshold)
return i;
for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) { coreset_sum += bucket[i].count;
if (coreset_sum > core_set_threshold)
break;
}
return i;
}
/*
* Count byte values in buckets.
* This heuristic can detect textual data (configs, xml, json, html, etc).
* Because in most text-like data byte set is restricted to limited number of
* possible characters, and that restriction in most cases makes data easy to
* compress.
*
* @BYTE_SET_THRESHOLD - consider all data within this byte set size:
* less - compressible
* more - need additional analysis
*/
#define BYTE_SET_THRESHOLD (64)
static u32 byte_set_size(const struct heuristic_ws *ws)
{
u32 i;
u32 byte_set_size = 0;
for (i = 0; i < BYTE_SET_THRESHOLD; i++) { if (ws->bucket[i].count > 0) byte_set_size++;
}
/*
* Continue collecting count of byte values in buckets. If the byte
* set size is bigger then the threshold, it's pointless to continue,
* the detection technique would fail for this type of data.
*/
for (; i < BUCKET_SIZE; i++) { if (ws->bucket[i].count > 0) { byte_set_size++;
if (byte_set_size > BYTE_SET_THRESHOLD)
return byte_set_size;
}
}
return byte_set_size;
}
static bool sample_repeated_patterns(struct heuristic_ws *ws)
{
const u32 half_of_sample = ws->sample_size / 2;
const u8 *data = ws->sample;
return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
}
static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
struct heuristic_ws *ws)
{
struct page *page;
u64 index, index_end;
u32 i, curr_sample_pos;
u8 *in_data;
/*
* Compression handles the input data by chunks of 128KiB
* (defined by BTRFS_MAX_UNCOMPRESSED)
*
* We do the same for the heuristic and loop over the whole range.
*
* MAX_SAMPLE_SIZE - calculated under assumption that heuristic will
* process no more than BTRFS_MAX_UNCOMPRESSED at a time.
*/
if (end - start > BTRFS_MAX_UNCOMPRESSED)
end = start + BTRFS_MAX_UNCOMPRESSED; index = start >> PAGE_SHIFT;
index_end = end >> PAGE_SHIFT;
/* Don't miss unaligned end */
if (!IS_ALIGNED(end, PAGE_SIZE))
index_end++;
curr_sample_pos = 0;
while (index < index_end) { page = find_get_page(inode->i_mapping, index);
in_data = kmap_local_page(page);
/* Handle case where the start is not aligned to PAGE_SIZE */
i = start % PAGE_SIZE;
while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
/* Don't sample any garbage from the last page */
if (start > end - SAMPLING_READ_SIZE)
break;
memcpy(&ws->sample[curr_sample_pos], &in_data[i],
SAMPLING_READ_SIZE);
i += SAMPLING_INTERVAL;
start += SAMPLING_INTERVAL;
curr_sample_pos += SAMPLING_READ_SIZE;
}
kunmap_local(in_data);
put_page(page);
index++;
}
ws->sample_size = curr_sample_pos;
}
/*
* Compression heuristic.
*
* For now is's a naive and optimistic 'return true', we'll extend the logic to
* quickly (compared to direct compression) detect data characteristics
* (compressible/uncompressible) to avoid wasting CPU time on uncompressible
* data.
*
* The following types of analysis can be performed:
* - detect mostly zero data
* - detect data with low "byte set" size (text, etc)
* - detect data with low/high "core byte" set
*
* Return non-zero if the compression should be done, 0 otherwise.
*/
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
{
struct list_head *ws_list = get_workspace(0, 0);
struct heuristic_ws *ws;
u32 i;
u8 byte;
int ret = 0;
ws = list_entry(ws_list, struct heuristic_ws, list);
heuristic_collect_sample(inode, start, end, ws);
if (sample_repeated_patterns(ws)) {
ret = 1;
goto out;
}
memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE);
for (i = 0; i < ws->sample_size; i++) {
byte = ws->sample[i];
ws->bucket[byte].count++;
}
i = byte_set_size(ws);
if (i < BYTE_SET_THRESHOLD) {
ret = 2;
goto out;
}
i = byte_core_set_size(ws);
if (i <= BYTE_CORE_SET_LOW) {
ret = 3;
goto out;
}
if (i >= BYTE_CORE_SET_HIGH) {
ret = 0;
goto out;
}
i = shannon_entropy(ws);
if (i <= ENTROPY_LVL_ACEPTABLE) {
ret = 4;
goto out;
}
/*
* For the levels below ENTROPY_LVL_HIGH, additional analysis would be
* needed to give green light to compression.
*
* For now just assume that compression at that level is not worth the
* resources because:
*
* 1. it is possible to defrag the data later
*
* 2. the data would turn out to be hardly compressible, eg. 150 byte
* values, every bucket has counter at level ~54. The heuristic would
* be confused. This can happen when data have some internal repeated
* patterns like "abbacbbc...". This can be detected by analyzing
* pairs of bytes, which is too costly.
*/
if (i < ENTROPY_LVL_HIGH) {
ret = 5;
goto out;
} else {
ret = 0;
goto out;
}
out:
put_workspace(0, ws_list);
return ret;
}
/*
* Convert the compression suffix (eg. after "zlib" starting with ":") to
* level, unrecognized string will set the default level
*/
unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
{
unsigned int level = 0;
int ret;
if (!type)
return 0;
if (str[0] == ':') { ret = kstrtouint(str + 1, 10, &level);
if (ret)
level = 0;
}
level = btrfs_compress_set_level(type, level);
return level;
}
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef LLIST_H
#define LLIST_H
/*
* Lock-less NULL terminated single linked list
*
* Cases where locking is not needed:
* If there are multiple producers and multiple consumers, llist_add can be
* used in producers and llist_del_all can be used in consumers simultaneously
* without locking. Also a single consumer can use llist_del_first while
* multiple producers simultaneously use llist_add, without any locking.
*
* Cases where locking is needed:
* If we have multiple consumers with llist_del_first used in one consumer, and
* llist_del_first or llist_del_all used in other consumers, then a lock is
* needed. This is because llist_del_first depends on list->first->next not
* changing, but without lock protection, there's no way to be sure about that
* if a preemption happens in the middle of the delete operation and on being
* preempted back, the list->first is the same as before causing the cmpxchg in
* llist_del_first to succeed. For example, while a llist_del_first operation
* is in progress in one consumer, then a llist_del_first, llist_add,
* llist_add (or llist_del_all, llist_add, llist_add) sequence in another
* consumer may cause violations.
*
* This can be summarized as follows:
*
* | add | del_first | del_all
* add | - | - | -
* del_first | | L | L
* del_all | | | -
*
* Where, a particular row's operation can happen concurrently with a column's
* operation, with "-" being no lock needed, while "L" being lock is needed.
*
* The list entries deleted via llist_del_all can be traversed with
* traversing function such as llist_for_each etc. But the list
* entries can not be traversed safely before deleted from the list.
* The order of deleted entries is from the newest to the oldest added
* one. If you want to traverse from the oldest to the newest, you
* must reverse the order by yourself before traversing.
*
* The basic atomic operation of this list is cmpxchg on long. On
* architectures that don't have NMI-safe cmpxchg implementation, the
* list can NOT be used in NMI handlers. So code that uses the list in
* an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
*
* Copyright 2010,2011 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*/
#include <linux/atomic.h>
#include <linux/kernel.h>
struct llist_head {
struct llist_node *first;
};
struct llist_node {
struct llist_node *next;
};
#define LLIST_HEAD_INIT(name) { NULL }
#define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name)
/**
* init_llist_head - initialize lock-less list head
* @head: the head for your lock-less list
*/
static inline void init_llist_head(struct llist_head *list)
{
list->first = NULL;
}
/**
* llist_entry - get the struct of this entry
* @ptr: the &struct llist_node pointer.
* @type: the type of the struct this is embedded in.
* @member: the name of the llist_node within the struct.
*/
#define llist_entry(ptr, type, member) \
container_of(ptr, type, member)
/**
* member_address_is_nonnull - check whether the member address is not NULL
* @ptr: the object pointer (struct type * that contains the llist_node)
* @member: the name of the llist_node within the struct.
*
* This macro is conceptually the same as
* &ptr->member != NULL
* but it works around the fact that compilers can decide that taking a member
* address is never a NULL pointer.
*
* Real objects that start at a high address and have a member at NULL are
* unlikely to exist, but such pointers may be returned e.g. by the
* container_of() macro.
*/
#define member_address_is_nonnull(ptr, member) \
((uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member) != 0)
/**
* llist_for_each - iterate over some deleted entries of a lock-less list
* @pos: the &struct llist_node to use as a loop cursor
* @node: the first entry of deleted list entries
*
* In general, some entries of the lock-less list can be traversed
* safely only after being deleted from list, so start with an entry
* instead of list head.
*
* If being used on entries deleted from lock-less list directly, the
* traverse order is from the newest to the oldest added entry. If
* you want to traverse from the oldest to the newest, you must
* reverse the order by yourself before traversing.
*/
#define llist_for_each(pos, node) \
for ((pos) = (node); pos; (pos) = (pos)->next)
/**
* llist_for_each_safe - iterate over some deleted entries of a lock-less list
* safe against removal of list entry
* @pos: the &struct llist_node to use as a loop cursor
* @n: another &struct llist_node to use as temporary storage
* @node: the first entry of deleted list entries
*
* In general, some entries of the lock-less list can be traversed
* safely only after being deleted from list, so start with an entry
* instead of list head.
*
* If being used on entries deleted from lock-less list directly, the
* traverse order is from the newest to the oldest added entry. If
* you want to traverse from the oldest to the newest, you must
* reverse the order by yourself before traversing.
*/
#define llist_for_each_safe(pos, n, node) \
for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n))
/**
* llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
* @pos: the type * to use as a loop cursor.
* @node: the fist entry of deleted list entries.
* @member: the name of the llist_node with the struct.
*
* In general, some entries of the lock-less list can be traversed
* safely only after being removed from list, so start with an entry
* instead of list head.
*
* If being used on entries deleted from lock-less list directly, the
* traverse order is from the newest to the oldest added entry. If
* you want to traverse from the oldest to the newest, you must
* reverse the order by yourself before traversing.
*/
#define llist_for_each_entry(pos, node, member) \
for ((pos) = llist_entry((node), typeof(*(pos)), member); \
member_address_is_nonnull(pos, member); \
(pos) = llist_entry((pos)->member.next, typeof(*(pos)), member))
/**
* llist_for_each_entry_safe - iterate over some deleted entries of lock-less list of given type
* safe against removal of list entry
* @pos: the type * to use as a loop cursor.
* @n: another type * to use as temporary storage
* @node: the first entry of deleted list entries.
* @member: the name of the llist_node with the struct.
*
* In general, some entries of the lock-less list can be traversed
* safely only after being removed from list, so start with an entry
* instead of list head.
*
* If being used on entries deleted from lock-less list directly, the
* traverse order is from the newest to the oldest added entry. If
* you want to traverse from the oldest to the newest, you must
* reverse the order by yourself before traversing.
*/
#define llist_for_each_entry_safe(pos, n, node, member) \
for (pos = llist_entry((node), typeof(*pos), member); \
member_address_is_nonnull(pos, member) && \
(n = llist_entry(pos->member.next, typeof(*n), member), true); \
pos = n)
/**
* llist_empty - tests whether a lock-less list is empty
* @head: the list to test
*
* Not guaranteed to be accurate or up to date. Just a quick way to
* test whether the list is empty without deleting something from the
* list.
*/
static inline bool llist_empty(const struct llist_head *head)
{
return READ_ONCE(head->first) == NULL;
}
static inline struct llist_node *llist_next(struct llist_node *node)
{
return node->next;
}
extern bool llist_add_batch(struct llist_node *new_first,
struct llist_node *new_last,
struct llist_head *head);
static inline bool __llist_add_batch(struct llist_node *new_first,
struct llist_node *new_last,
struct llist_head *head)
{
new_last->next = head->first;
head->first = new_first;
return new_last->next == NULL;
}
/**
* llist_add - add a new entry
* @new: new entry to be added
* @head: the head for your lock-less list
*
* Returns true if the list was empty prior to adding this entry.
*/
static inline bool llist_add(struct llist_node *new, struct llist_head *head)
{
return llist_add_batch(new, new, head);
}
static inline bool __llist_add(struct llist_node *new, struct llist_head *head)
{
return __llist_add_batch(new, new, head);
}
/**
* llist_del_all - delete all entries from lock-less list
* @head: the head of lock-less list to delete all entries
*
* If list is empty, return NULL, otherwise, delete all entries and
* return the pointer to the first entry. The order of entries
* deleted is from the newest to the oldest added one.
*/
static inline struct llist_node *llist_del_all(struct llist_head *head)
{
return xchg(&head->first, NULL);
}
static inline struct llist_node *__llist_del_all(struct llist_head *head)
{
struct llist_node *first = head->first;
head->first = NULL;
return first;
}
extern struct llist_node *llist_del_first(struct llist_head *head);
struct llist_node *llist_reverse_order(struct llist_node *head);
#endif /* LLIST_H */
/* inffast.c -- fast decoding
* Copyright (C) 1995-2004 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <linux/zutil.h>
#include "inftrees.h"
#include "inflate.h"
#include "inffast.h"
#ifndef ASMINF
union uu {
unsigned short us;
unsigned char b[2];
};
/* Endian independent version */
static inline unsigned short
get_unaligned16(const unsigned short *p)
{
union uu mm;
unsigned char *b = (unsigned char *)p;
mm.b[0] = b[0];
mm.b[1] = b[1];
return mm.us;
}
/*
Decode literal, length, and distance codes and write out the resulting
literal and match bytes until either not enough input or output is
available, an end-of-block is encountered, or a data error is encountered.
When large enough input and output buffers are supplied to inflate(), for
example, a 16K input buffer and a 64K output buffer, more than 95% of the
inflate execution time is spent in this routine.
Entry assumptions:
state->mode == LEN
strm->avail_in >= 6
strm->avail_out >= 258
start >= strm->avail_out
state->bits < 8
On return, state->mode is one of:
LEN -- ran out of enough output space or enough available input
TYPE -- reached end of block code, inflate() to interpret next block
BAD -- error in block data
Notes:
- The maximum input bits used by a length/distance pair is 15 bits for the
length code, 5 bits for the length extra, 15 bits for the distance code,
and 13 bits for the distance extra. This totals 48 bits, or six bytes.
Therefore if strm->avail_in >= 6, then there is enough input to avoid
checking for available input while decoding.
- The maximum bytes that a single length/distance pair can output is 258
bytes, which is the maximum length that can be coded. inflate_fast()
requires strm->avail_out >= 258 for each loop to avoid checking for
output space.
- @start: inflate()'s starting value for strm->avail_out
*/
void inflate_fast(z_streamp strm, unsigned start)
{
struct inflate_state *state;
const unsigned char *in; /* local strm->next_in */
const unsigned char *last; /* while in < last, enough input available */
unsigned char *out; /* local strm->next_out */
unsigned char *beg; /* inflate()'s initial strm->next_out */
unsigned char *end; /* while out < end, enough space available */
#ifdef INFLATE_STRICT
unsigned dmax; /* maximum distance from zlib header */
#endif
unsigned wsize; /* window size or zero if not using window */
unsigned whave; /* valid bytes in the window */
unsigned write; /* window write index */
unsigned char *window; /* allocated sliding window, if wsize != 0 */
unsigned long hold; /* local strm->hold */
unsigned bits; /* local strm->bits */
code const *lcode; /* local strm->lencode */
code const *dcode; /* local strm->distcode */
unsigned lmask; /* mask for first level of length codes */
unsigned dmask; /* mask for first level of distance codes */
code this; /* retrieved table entry */
unsigned op; /* code bits, operation, extra bits, or */
/* window position, window bytes to copy */
unsigned len; /* match length, unused bytes */
unsigned dist; /* match distance */
unsigned char *from; /* where to copy match from */
/* copy state to local variables */
state = (struct inflate_state *)strm->state;
in = strm->next_in;
last = in + (strm->avail_in - 5);
out = strm->next_out;
beg = out - (start - strm->avail_out);
end = out + (strm->avail_out - 257);
#ifdef INFLATE_STRICT
dmax = state->dmax;
#endif
wsize = state->wsize;
whave = state->whave;
write = state->write;
window = state->window;
hold = state->hold;
bits = state->bits;
lcode = state->lencode;
dcode = state->distcode;
lmask = (1U << state->lenbits) - 1;
dmask = (1U << state->distbits) - 1;
/* decode literals and length/distances until end-of-block or not enough
input data or output space */
do {
if (bits < 15) { hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
this = lcode[hold & lmask];
dolen:
op = (unsigned)(this.bits);
hold >>= op;
bits -= op;
op = (unsigned)(this.op);
if (op == 0) { /* literal */
*out++ = (unsigned char)(this.val);
}
else if (op & 16) { /* length base */ len = (unsigned)(this.val); op &= 15; /* number of extra bits */
if (op) {
if (bits < op) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
len += (unsigned)hold & ((1U << op) - 1);
hold >>= op;
bits -= op;
}
if (bits < 15) { hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
this = dcode[hold & dmask];
dodist:
op = (unsigned)(this.bits);
hold >>= op;
bits -= op;
op = (unsigned)(this.op);
if (op & 16) { /* distance base */
dist = (unsigned)(this.val);
op &= 15; /* number of extra bits */
if (bits < op) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
if (bits < op) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
}
dist += (unsigned)hold & ((1U << op) - 1);
#ifdef INFLATE_STRICT
if (dist > dmax) {
strm->msg = (char *)"invalid distance too far back";
state->mode = BAD;
break;
}
#endif
hold >>= op;
bits -= op;
op = (unsigned)(out - beg); /* max distance in output */
if (dist > op) { /* see if copy from window */
op = dist - op; /* distance back in window */
if (op > whave) {
strm->msg = (char *)"invalid distance too far back";
state->mode = BAD;
break;
}
from = window;
if (write == 0) { /* very common case */ from += wsize - op;
if (op < len) { /* some from window */
len -= op;
do {
*out++ = *from++;
} while (--op);
from = out - dist; /* rest from output */
}
}
else if (write < op) { /* wrap around window */ from += wsize + write - op;
op -= write;
if (op < len) { /* some from end of window */
len -= op;
do {
*out++ = *from++;
} while (--op);
from = window;
if (write < len) { /* some from start of window */
op = write;
len -= op;
do {
*out++ = *from++;
} while (--op);
from = out - dist; /* rest from output */
}
}
}
else { /* contiguous in window */
from += write - op;
if (op < len) { /* some from window */
len -= op;
do {
*out++ = *from++;
} while (--op);
from = out - dist; /* rest from output */
}
}
while (len > 2) { *out++ = *from++;
*out++ = *from++;
*out++ = *from++;
len -= 3;
}
if (len) { *out++ = *from++;
if (len > 1)
*out++ = *from++;
}
}
else {
unsigned short *sout;
unsigned long loops;
from = out - dist; /* copy direct from output */
/* minimum length is three */
/* Align out addr */
if (!((long)(out - 1) & 1)) {
*out++ = *from++;
len--;
}
sout = (unsigned short *)(out);
if (dist > 2) {
unsigned short *sfrom;
sfrom = (unsigned short *)(from);
loops = len >> 1;
do {
if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
*sout++ = *sfrom++;
else
*sout++ = get_unaligned16(sfrom++);
} while (--loops);
out = (unsigned char *)sout;
from = (unsigned char *)sfrom;
} else { /* dist == 1 or dist == 2 */
unsigned short pat16;
pat16 = *(sout-1);
if (dist == 1) {
union uu mm;
/* copy one char pattern to both bytes */
mm.us = pat16;
mm.b[0] = mm.b[1];
pat16 = mm.us;
}
loops = len >> 1;
do
*sout++ = pat16;
while (--loops);
out = (unsigned char *)sout;
}
if (len & 1) *out++ = *from++;
}
}
else if ((op & 64) == 0) { /* 2nd level distance code */ this = dcode[this.val + (hold & ((1U << op) - 1))];
goto dodist;
}
else {
strm->msg = (char *)"invalid distance code";
state->mode = BAD;
break;
}
}
else if ((op & 64) == 0) { /* 2nd level length code */ this = lcode[this.val + (hold & ((1U << op) - 1))];
goto dolen;
}
else if (op & 32) { /* end-of-block */ state->mode = TYPE;
break;
}
else {
strm->msg = (char *)"invalid literal/length code";
state->mode = BAD;
break;
}
} while (in < last && out < end);
/* return unused bytes (on entry, bits < 8, so in won't go too far back) */
len = bits >> 3;
in -= len;
bits -= len << 3;
hold &= (1U << bits) - 1;
/* update state and return */
strm->next_in = in;
strm->next_out = out;
strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); strm->avail_out = (unsigned)(out < end ? 257 + (end - out) : 257 - (out - end));
state->hold = hold;
state->bits = bits;
return;
}
/*
inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
- Using bit fields for code structure
- Different op definition to avoid & for extra bits (do & for table bits)
- Three separate decoding do-loops for direct, window, and write == 0
- Special case for distance > 1 copies to do overlapped load and store copy
- Explicit branch predictions (based on measured branch probabilities)
- Deferring match copy and interspersed it with decoding subsequent codes
- Swapping literal/length else
- Swapping window/direct else
- Larger unrolled copy loops (three is about right)
- Moving len -= 3 statement into middle of loop
*/
#endif /* !ASMINF */
// SPDX-License-Identifier: GPL-2.0
/*
* Devices PM QoS constraints management
*
* Copyright (C) 2011 Texas Instruments, Inc.
*
* This module exposes the interface to kernel space for specifying
* per-device PM QoS dependencies. It provides infrastructure for registration
* of:
*
* Dependents on a QoS value : register requests
* Watchers of QoS value : get notified when target QoS value changes
*
* This QoS design is best effort based. Dependents register their QoS needs.
* Watchers register to keep track of the current QoS needs of the system.
* Watchers can register a per-device notification callback using the
* dev_pm_qos_*_notifier API. The notification chain data is stored in the
* per-device constraint data struct.
*
* Note about the per-device constraint data struct allocation:
* . The per-device constraints data struct ptr is stored into the device
* dev_pm_info.
* . To minimize the data usage by the per-device constraints, the data struct
* is only allocated at the first call to dev_pm_qos_add_request.
* . The data is later free'd when the device is removed from the system.
* . A global mutex protects the constraints users from the data being
* allocated and free'd.
*/
#include <linux/pm_qos.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/export.h>
#include <linux/pm_runtime.h>
#include <linux/err.h>
#include <trace/events/power.h>
#include "power.h"
static DEFINE_MUTEX(dev_pm_qos_mtx);
static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx);
/**
* __dev_pm_qos_flags - Check PM QoS flags for a given device.
* @dev: Device to check the PM QoS flags for.
* @mask: Flags to check against.
*
* This routine must be called with dev->power.lock held.
*/
enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask)
{
struct dev_pm_qos *qos = dev->power.qos;
struct pm_qos_flags *pqf;
s32 val;
lockdep_assert_held(&dev->power.lock);
if (IS_ERR_OR_NULL(qos))
return PM_QOS_FLAGS_UNDEFINED;
pqf = &qos->flags;
if (list_empty(&pqf->list))
return PM_QOS_FLAGS_UNDEFINED;
val = pqf->effective_flags & mask;
if (val)
return (val == mask) ? PM_QOS_FLAGS_ALL : PM_QOS_FLAGS_SOME;
return PM_QOS_FLAGS_NONE;
}
/**
* dev_pm_qos_flags - Check PM QoS flags for a given device (locked).
* @dev: Device to check the PM QoS flags for.
* @mask: Flags to check against.
*/
enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask)
{
unsigned long irqflags;
enum pm_qos_flags_status ret;
spin_lock_irqsave(&dev->power.lock, irqflags);
ret = __dev_pm_qos_flags(dev, mask);
spin_unlock_irqrestore(&dev->power.lock, irqflags);
return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_flags);
/**
* __dev_pm_qos_resume_latency - Get resume latency constraint for a given device.
* @dev: Device to get the PM QoS constraint value for.
*
* This routine must be called with dev->power.lock held.
*/
s32 __dev_pm_qos_resume_latency(struct device *dev)
{
lockdep_assert_held(&dev->power.lock);
return dev_pm_qos_raw_resume_latency(dev);
}
/**
* dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked).
* @dev: Device to get the PM QoS constraint value for.
* @type: QoS request type.
*/
s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type)
{
struct dev_pm_qos *qos = dev->power.qos;
unsigned long flags;
s32 ret;
spin_lock_irqsave(&dev->power.lock, flags);
switch (type) {
case DEV_PM_QOS_RESUME_LATENCY:
ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT
: pm_qos_read_value(&qos->resume_latency);
break;
case DEV_PM_QOS_MIN_FREQUENCY:
ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE
: freq_qos_read_value(&qos->freq, FREQ_QOS_MIN);
break;
case DEV_PM_QOS_MAX_FREQUENCY:
ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE
: freq_qos_read_value(&qos->freq, FREQ_QOS_MAX);
break;
default:
WARN_ON(1);
ret = 0;
}
spin_unlock_irqrestore(&dev->power.lock, flags);
return ret;
}
/**
* apply_constraint - Add/modify/remove device PM QoS request.
* @req: Constraint request to apply
* @action: Action to perform (add/update/remove).
* @value: Value to assign to the QoS request.
*
* Internal function to update the constraints list using the PM QoS core
* code and if needed call the per-device callbacks.
*/
static int apply_constraint(struct dev_pm_qos_request *req,
enum pm_qos_req_action action, s32 value)
{
struct dev_pm_qos *qos = req->dev->power.qos;
int ret;
switch(req->type) {
case DEV_PM_QOS_RESUME_LATENCY:
if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0))
value = 0;
ret = pm_qos_update_target(&qos->resume_latency,
&req->data.pnode, action, value);
break;
case DEV_PM_QOS_LATENCY_TOLERANCE:
ret = pm_qos_update_target(&qos->latency_tolerance,
&req->data.pnode, action, value);
if (ret) {
value = pm_qos_read_value(&qos->latency_tolerance);
req->dev->power.set_latency_tolerance(req->dev, value);
}
break;
case DEV_PM_QOS_MIN_FREQUENCY:
case DEV_PM_QOS_MAX_FREQUENCY:
ret = freq_qos_apply(&req->data.freq, action, value);
break;
case DEV_PM_QOS_FLAGS:
ret = pm_qos_update_flags(&qos->flags, &req->data.flr,
action, value);
break;
default:
ret = -EINVAL;
}
return ret;
}
/*
* dev_pm_qos_constraints_allocate
* @dev: device to allocate data for
*
* Called at the first call to add_request, for constraint data allocation
* Must be called with the dev_pm_qos_mtx mutex held
*/
static int dev_pm_qos_constraints_allocate(struct device *dev)
{
struct dev_pm_qos *qos;
struct pm_qos_constraints *c;
struct blocking_notifier_head *n;
qos = kzalloc(sizeof(*qos), GFP_KERNEL);
if (!qos)
return -ENOMEM;
n = kzalloc(3 * sizeof(*n), GFP_KERNEL);
if (!n) {
kfree(qos);
return -ENOMEM;
}
c = &qos->resume_latency;
plist_head_init(&c->list);
c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE;
c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE;
c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT;
c->type = PM_QOS_MIN;
c->notifiers = n;
BLOCKING_INIT_NOTIFIER_HEAD(n);
c = &qos->latency_tolerance;
plist_head_init(&c->list);
c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE;
c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE;
c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT;
c->type = PM_QOS_MIN;
freq_constraints_init(&qos->freq);
INIT_LIST_HEAD(&qos->flags.list);
spin_lock_irq(&dev->power.lock);
dev->power.qos = qos;
spin_unlock_irq(&dev->power.lock);
return 0;
}
static void __dev_pm_qos_hide_latency_limit(struct device *dev);
static void __dev_pm_qos_hide_flags(struct device *dev);
/**
* dev_pm_qos_constraints_destroy
* @dev: target device
*
* Called from the device PM subsystem on device removal under device_pm_lock().
*/
void dev_pm_qos_constraints_destroy(struct device *dev)
{
struct dev_pm_qos *qos;
struct dev_pm_qos_request *req, *tmp;
struct pm_qos_constraints *c;
struct pm_qos_flags *f;
mutex_lock(&dev_pm_qos_sysfs_mtx);
/*
* If the device's PM QoS resume latency limit or PM QoS flags have been
* exposed to user space, they have to be hidden at this point.
*/
pm_qos_sysfs_remove_resume_latency(dev);
pm_qos_sysfs_remove_flags(dev);
mutex_lock(&dev_pm_qos_mtx);
__dev_pm_qos_hide_latency_limit(dev);
__dev_pm_qos_hide_flags(dev);
qos = dev->power.qos;
if (!qos)
goto out;
/* Flush the constraints lists for the device. */
c = &qos->resume_latency;
plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) {
/*
* Update constraints list and call the notification
* callbacks if needed
*/
apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
}
c = &qos->latency_tolerance;
plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
}
c = &qos->freq.min_freq;
plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ,
PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
}
c = &qos->freq.max_freq;
plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ,
PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
}
f = &qos->flags;
list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
}
spin_lock_irq(&dev->power.lock);
dev->power.qos = ERR_PTR(-ENODEV);
spin_unlock_irq(&dev->power.lock);
kfree(qos->resume_latency.notifiers);
kfree(qos);
out:
mutex_unlock(&dev_pm_qos_mtx);
mutex_unlock(&dev_pm_qos_sysfs_mtx);
}
static bool dev_pm_qos_invalid_req_type(struct device *dev,
enum dev_pm_qos_req_type type)
{
return type == DEV_PM_QOS_LATENCY_TOLERANCE &&
!dev->power.set_latency_tolerance;
}
static int __dev_pm_qos_add_request(struct device *dev,
struct dev_pm_qos_request *req,
enum dev_pm_qos_req_type type, s32 value)
{
int ret = 0;
if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type))
return -EINVAL;
if (WARN(dev_pm_qos_request_active(req),
"%s() called for already added request\n", __func__))
return -EINVAL;
if (IS_ERR(dev->power.qos))
ret = -ENODEV;
else if (!dev->power.qos)
ret = dev_pm_qos_constraints_allocate(dev);
trace_dev_pm_qos_add_request(dev_name(dev), type, value);
if (ret)
return ret;
req->dev = dev;
req->type = type;
if (req->type == DEV_PM_QOS_MIN_FREQUENCY)
ret = freq_qos_add_request(&dev->power.qos->freq,
&req->data.freq,
FREQ_QOS_MIN, value);
else if (req->type == DEV_PM_QOS_MAX_FREQUENCY)
ret = freq_qos_add_request(&dev->power.qos->freq,
&req->data.freq,
FREQ_QOS_MAX, value);
else
ret = apply_constraint(req, PM_QOS_ADD_REQ, value);
return ret;
}
/**
* dev_pm_qos_add_request - inserts new qos request into the list
* @dev: target device for the constraint
* @req: pointer to a preallocated handle
* @type: type of the request
* @value: defines the qos request
*
* This function inserts a new entry in the device constraints list of
* requested qos performance characteristics. It recomputes the aggregate
* QoS expectations of parameters and initializes the dev_pm_qos_request
* handle. Caller needs to save this handle for later use in updates and
* removal.
*
* Returns 1 if the aggregated constraint value has changed,
* 0 if the aggregated constraint value has not changed,
* -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory
* to allocate for data structures, -ENODEV if the device has just been removed
* from the system.
*
* Callers should ensure that the target device is not RPM_SUSPENDED before
* using this function for requests of type DEV_PM_QOS_FLAGS.
*/
int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req,
enum dev_pm_qos_req_type type, s32 value)
{
int ret;
mutex_lock(&dev_pm_qos_mtx);
ret = __dev_pm_qos_add_request(dev, req, type, value);
mutex_unlock(&dev_pm_qos_mtx);
return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_add_request);
/**
* __dev_pm_qos_update_request - Modify an existing device PM QoS request.
* @req : PM QoS request to modify.
* @new_value: New value to request.
*/
static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req,
s32 new_value)
{
s32 curr_value;
int ret = 0;
if (!req) /*guard against callers passing in null */
return -EINVAL;
if (WARN(!dev_pm_qos_request_active(req),
"%s() called for unknown object\n", __func__))
return -EINVAL;
if (IS_ERR_OR_NULL(req->dev->power.qos))
return -ENODEV;
switch(req->type) {
case DEV_PM_QOS_RESUME_LATENCY:
case DEV_PM_QOS_LATENCY_TOLERANCE:
curr_value = req->data.pnode.prio;
break;
case DEV_PM_QOS_MIN_FREQUENCY:
case DEV_PM_QOS_MAX_FREQUENCY:
curr_value = req->data.freq.pnode.prio;
break;
case DEV_PM_QOS_FLAGS:
curr_value = req->data.flr.flags;
break;
default:
return -EINVAL;
}
trace_dev_pm_qos_update_request(dev_name(req->dev), req->type,
new_value);
if (curr_value != new_value)
ret = apply_constraint(req, PM_QOS_UPDATE_REQ, new_value);
return ret;
}
/**
* dev_pm_qos_update_request - modifies an existing qos request
* @req : handle to list element holding a dev_pm_qos request to use
* @new_value: defines the qos request
*
* Updates an existing dev PM qos request along with updating the
* target value.
*
* Attempts are made to make this code callable on hot code paths.
*
* Returns 1 if the aggregated constraint value has changed,
* 0 if the aggregated constraint value has not changed,
* -EINVAL in case of wrong parameters, -ENODEV if the device has been
* removed from the system
*
* Callers should ensure that the target device is not RPM_SUSPENDED before
* using this function for requests of type DEV_PM_QOS_FLAGS.
*/
int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value)
{
int ret;
mutex_lock(&dev_pm_qos_mtx);
ret = __dev_pm_qos_update_request(req, new_value);
mutex_unlock(&dev_pm_qos_mtx);
return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_update_request);
static int __dev_pm_qos_remove_request(struct dev_pm_qos_request *req)
{
int ret;
if (!req) /*guard against callers passing in null */
return -EINVAL;
if (WARN(!dev_pm_qos_request_active(req),
"%s() called for unknown object\n", __func__))
return -EINVAL;
if (IS_ERR_OR_NULL(req->dev->power.qos))
return -ENODEV;
trace_dev_pm_qos_remove_request(dev_name(req->dev), req->type,
PM_QOS_DEFAULT_VALUE);
ret = apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
return ret;
}
/**
* dev_pm_qos_remove_request - modifies an existing qos request
* @req: handle to request list element
*
* Will remove pm qos request from the list of constraints and
* recompute the current target value. Call this on slow code paths.
*
* Returns 1 if the aggregated constraint value has changed,
* 0 if the aggregated constraint value has not changed,
* -EINVAL in case of wrong parameters, -ENODEV if the device has been
* removed from the system
*
* Callers should ensure that the target device is not RPM_SUSPENDED before
* using this function for requests of type DEV_PM_QOS_FLAGS.
*/
int dev_pm_qos_remove_request(struct dev_pm_qos_request *req)
{
int ret;
mutex_lock(&dev_pm_qos_mtx);
ret = __dev_pm_qos_remove_request(req);
mutex_unlock(&dev_pm_qos_mtx);
return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request);
/**
* dev_pm_qos_add_notifier - sets notification entry for changes to target value
* of per-device PM QoS constraints
*
* @dev: target device for the constraint
* @notifier: notifier block managed by caller.
* @type: request type.
*
* Will register the notifier into a notification chain that gets called
* upon changes to the target value for the device.
*
* If the device's constraints object doesn't exist when this routine is called,
* it will be created (or error code will be returned if that fails).
*/
int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier,
enum dev_pm_qos_req_type type)
{
int ret = 0;
mutex_lock(&dev_pm_qos_mtx);
if (IS_ERR(dev->power.qos))
ret = -ENODEV;
else if (!dev->power.qos)
ret = dev_pm_qos_constraints_allocate(dev);
if (ret)
goto unlock;
switch (type) {
case DEV_PM_QOS_RESUME_LATENCY:
ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers,
notifier);
break;
case DEV_PM_QOS_MIN_FREQUENCY:
ret = freq_qos_add_notifier(&dev->power.qos->freq,
FREQ_QOS_MIN, notifier);
break;
case DEV_PM_QOS_MAX_FREQUENCY:
ret = freq_qos_add_notifier(&dev->power.qos->freq,
FREQ_QOS_MAX, notifier);
break;
default:
WARN_ON(1);
ret = -EINVAL;
}
unlock:
mutex_unlock(&dev_pm_qos_mtx);
return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier);
/**
* dev_pm_qos_remove_notifier - deletes notification for changes to target value
* of per-device PM QoS constraints
*
* @dev: target device for the constraint
* @notifier: notifier block to be removed.
* @type: request type.
*
* Will remove the notifier from the notification chain that gets called
* upon changes to the target value.
*/
int dev_pm_qos_remove_notifier(struct device *dev,
struct notifier_block *notifier,
enum dev_pm_qos_req_type type)
{
int ret = 0;
mutex_lock(&dev_pm_qos_mtx);
/* Silently return if the constraints object is not present. */
if (IS_ERR_OR_NULL(dev->power.qos))
goto unlock;
switch (type) {
case DEV_PM_QOS_RESUME_LATENCY:
ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers,
notifier);
break;
case DEV_PM_QOS_MIN_FREQUENCY:
ret = freq_qos_remove_notifier(&dev->power.qos->freq,
FREQ_QOS_MIN, notifier);
break;
case DEV_PM_QOS_MAX_FREQUENCY:
ret = freq_qos_remove_notifier(&dev->power.qos->freq,
FREQ_QOS_MAX, notifier);
break;
default:
WARN_ON(1);
ret = -EINVAL;
}
unlock:
mutex_unlock(&dev_pm_qos_mtx);
return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier);
/**
* dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor.
* @dev: Device whose ancestor to add the request for.
* @req: Pointer to the preallocated handle.
* @type: Type of the request.
* @value: Constraint latency value.
*/
int dev_pm_qos_add_ancestor_request(struct device *dev,
struct dev_pm_qos_request *req,
enum dev_pm_qos_req_type type, s32 value)
{
struct device *ancestor = dev->parent;
int ret = -ENODEV;
switch (type) {
case DEV_PM_QOS_RESUME_LATENCY:
while (ancestor && !ancestor->power.ignore_children)
ancestor = ancestor->parent;
break;
case DEV_PM_QOS_LATENCY_TOLERANCE:
while (ancestor && !ancestor->power.set_latency_tolerance)
ancestor = ancestor->parent;
break;
default:
ancestor = NULL;
}
if (ancestor)
ret = dev_pm_qos_add_request(ancestor, req, type, value);
if (ret < 0)
req->dev = NULL;
return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_add_ancestor_request);
static void __dev_pm_qos_drop_user_request(struct device *dev,
enum dev_pm_qos_req_type type)
{
struct dev_pm_qos_request *req = NULL;
switch(type) {
case DEV_PM_QOS_RESUME_LATENCY:
req = dev->power.qos->resume_latency_req;
dev->power.qos->resume_latency_req = NULL;
break;
case DEV_PM_QOS_LATENCY_TOLERANCE:
req = dev->power.qos->latency_tolerance_req;
dev->power.qos->latency_tolerance_req = NULL;
break;
case DEV_PM_QOS_FLAGS:
req = dev->power.qos->flags_req;
dev->power.qos->flags_req = NULL;
break;
default:
WARN_ON(1);
return;
}
__dev_pm_qos_remove_request(req);
kfree(req);
}
static void dev_pm_qos_drop_user_request(struct device *dev,
enum dev_pm_qos_req_type type)
{
mutex_lock(&dev_pm_qos_mtx);
__dev_pm_qos_drop_user_request(dev, type);
mutex_unlock(&dev_pm_qos_mtx);
}
/**
* dev_pm_qos_expose_latency_limit - Expose PM QoS latency limit to user space.
* @dev: Device whose PM QoS latency limit is to be exposed to user space.
* @value: Initial value of the latency limit.
*/
int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value)
{
struct dev_pm_qos_request *req;
int ret;
if (!device_is_registered(dev) || value < 0)
return -EINVAL;
req = kzalloc(sizeof(*req), GFP_KERNEL);
if (!req)
return -ENOMEM;
ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_RESUME_LATENCY, value);
if (ret < 0) {
kfree(req);
return ret;
}
mutex_lock(&dev_pm_qos_sysfs_mtx);
mutex_lock(&dev_pm_qos_mtx);
if (IS_ERR_OR_NULL(dev->power.qos))
ret = -ENODEV;
else if (dev->power.qos->resume_latency_req)
ret = -EEXIST;
if (ret < 0) {
__dev_pm_qos_remove_request(req);
kfree(req);
mutex_unlock(&dev_pm_qos_mtx);
goto out;
}
dev->power.qos->resume_latency_req = req;
mutex_unlock(&dev_pm_qos_mtx);
ret = pm_qos_sysfs_add_resume_latency(dev);
if (ret)
dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY);
out:
mutex_unlock(&dev_pm_qos_sysfs_mtx);
return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_limit);
static void __dev_pm_qos_hide_latency_limit(struct device *dev)
{
if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->resume_latency_req)
__dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY);
}
/**
* dev_pm_qos_hide_latency_limit - Hide PM QoS latency limit from user space.
* @dev: Device whose PM QoS latency limit is to be hidden from user space.
*/
void dev_pm_qos_hide_latency_limit(struct device *dev)
{
mutex_lock(&dev_pm_qos_sysfs_mtx);
pm_qos_sysfs_remove_resume_latency(dev);
mutex_lock(&dev_pm_qos_mtx);
__dev_pm_qos_hide_latency_limit(dev);
mutex_unlock(&dev_pm_qos_mtx);
mutex_unlock(&dev_pm_qos_sysfs_mtx);
}
EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_limit);
/**
* dev_pm_qos_expose_flags - Expose PM QoS flags of a device to user space.
* @dev: Device whose PM QoS flags are to be exposed to user space.
* @val: Initial values of the flags.
*/
int dev_pm_qos_expose_flags(struct device *dev, s32 val)
{
struct dev_pm_qos_request *req;
int ret;
if (!device_is_registered(dev))
return -EINVAL;
req = kzalloc(sizeof(*req), GFP_KERNEL);
if (!req)
return -ENOMEM;
ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_FLAGS, val);
if (ret < 0) {
kfree(req);
return ret;
}
pm_runtime_get_sync(dev);
mutex_lock(&dev_pm_qos_sysfs_mtx);
mutex_lock(&dev_pm_qos_mtx);
if (IS_ERR_OR_NULL(dev->power.qos))
ret = -ENODEV;
else if (dev->power.qos->flags_req)
ret = -EEXIST;
if (ret < 0) {
__dev_pm_qos_remove_request(req);
kfree(req);
mutex_unlock(&dev_pm_qos_mtx);
goto out;
}
dev->power.qos->flags_req = req;
mutex_unlock(&dev_pm_qos_mtx);
ret = pm_qos_sysfs_add_flags(dev);
if (ret)
dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS);
out:
mutex_unlock(&dev_pm_qos_sysfs_mtx);
pm_runtime_put(dev);
return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_expose_flags);
static void __dev_pm_qos_hide_flags(struct device *dev)
{
if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->flags_req)
__dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS);
}
/**
* dev_pm_qos_hide_flags - Hide PM QoS flags of a device from user space.
* @dev: Device whose PM QoS flags are to be hidden from user space.
*/
void dev_pm_qos_hide_flags(struct device *dev)
{
pm_runtime_get_sync(dev);
mutex_lock(&dev_pm_qos_sysfs_mtx);
pm_qos_sysfs_remove_flags(dev);
mutex_lock(&dev_pm_qos_mtx);
__dev_pm_qos_hide_flags(dev);
mutex_unlock(&dev_pm_qos_mtx);
mutex_unlock(&dev_pm_qos_sysfs_mtx);
pm_runtime_put(dev);
}
EXPORT_SYMBOL_GPL(dev_pm_qos_hide_flags);
/**
* dev_pm_qos_update_flags - Update PM QoS flags request owned by user space.
* @dev: Device to update the PM QoS flags request for.
* @mask: Flags to set/clear.
* @set: Whether to set or clear the flags (true means set).
*/
int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set)
{
s32 value;
int ret;
pm_runtime_get_sync(dev);
mutex_lock(&dev_pm_qos_mtx);
if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->flags_req) {
ret = -EINVAL;
goto out;
}
value = dev_pm_qos_requested_flags(dev);
if (set)
value |= mask;
else
value &= ~mask;
ret = __dev_pm_qos_update_request(dev->power.qos->flags_req, value);
out:
mutex_unlock(&dev_pm_qos_mtx);
pm_runtime_put(dev);
return ret;
}
/**
* dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance.
* @dev: Device to obtain the user space latency tolerance for.
*/
s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev)
{
s32 ret;
mutex_lock(&dev_pm_qos_mtx);
ret = IS_ERR_OR_NULL(dev->power.qos)
|| !dev->power.qos->latency_tolerance_req ?
PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT :
dev->power.qos->latency_tolerance_req->data.pnode.prio;
mutex_unlock(&dev_pm_qos_mtx);
return ret;
}
/**
* dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance.
* @dev: Device to update the user space latency tolerance for.
* @val: New user space latency tolerance for @dev (negative values disable).
*/
int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val)
{
int ret;
mutex_lock(&dev_pm_qos_mtx);
if (IS_ERR_OR_NULL(dev->power.qos)
|| !dev->power.qos->latency_tolerance_req) {
struct dev_pm_qos_request *req;
if (val < 0) {
if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT)
ret = 0;
else
ret = -EINVAL;
goto out;
}
req = kzalloc(sizeof(*req), GFP_KERNEL);
if (!req) {
ret = -ENOMEM;
goto out;
}
ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val);
if (ret < 0) {
kfree(req);
goto out;
}
dev->power.qos->latency_tolerance_req = req;
} else {
if (val < 0) {
__dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE);
ret = 0;
} else {
ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val);
}
}
out:
mutex_unlock(&dev_pm_qos_mtx);
return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance);
/**
* dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace
* @dev: Device whose latency tolerance to expose
*/
int dev_pm_qos_expose_latency_tolerance(struct device *dev)
{
int ret;
if (!dev->power.set_latency_tolerance)
return -EINVAL;
mutex_lock(&dev_pm_qos_sysfs_mtx);
ret = pm_qos_sysfs_add_latency_tolerance(dev);
mutex_unlock(&dev_pm_qos_sysfs_mtx);
return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance);
/**
* dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace
* @dev: Device whose latency tolerance to hide
*/
void dev_pm_qos_hide_latency_tolerance(struct device *dev)
{
mutex_lock(&dev_pm_qos_sysfs_mtx);
pm_qos_sysfs_remove_latency_tolerance(dev);
mutex_unlock(&dev_pm_qos_sysfs_mtx);
/* Remove the request from user space now */
pm_runtime_get_sync(dev);
dev_pm_qos_update_user_latency_tolerance(dev,
PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT);
pm_runtime_put(dev);
}
EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance);
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1995 Linus Torvalds
* Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
* Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
*/
#include <linux/sched.h> /* test_thread_flag(), ... */
#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/extable.h> /* search_exception_tables */
#include <linux/memblock.h> /* max_low_pfn */
#include <linux/kfence.h> /* kfence_handle_page_fault */
#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
#include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
#include <asm/kvm_para.h> /* kvm_handle_async_pf */
#include <asm/vdso.h> /* fixup_vdso_exception() */
#include <asm/irq_stack.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
/*
* Returns 0 if mmiotrace is disabled, or if the fault is not
* handled by mmiotrace:
*/
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
if (unlikely(is_kmmio_active()))
if (kmmio_handler(regs, addr) == 1)
return -1;
return 0;
}
/*
* Prefetch quirks:
*
* 32-bit mode:
*
* Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
* Check that here and ignore it. This is AMD erratum #91.
*
* 64-bit mode:
*
* Sometimes the CPU reports invalid exceptions on prefetch.
* Check that here and ignore it.
*
* Opcode checker based on code by Richard Brunner.
*/
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
unsigned char opcode, int *prefetch)
{
unsigned char instr_hi = opcode & 0xf0;
unsigned char instr_lo = opcode & 0x0f;
switch (instr_hi) {
case 0x20:
case 0x30:
/*
* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
* In X86_64 long mode, the CPU will signal invalid
* opcode if some of these prefixes are present so
* X86_64 will never get here anyway
*/
return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
case 0x40:
/*
* In 64-bit mode 0x40..0x4F are valid REX prefixes
*/
return (!user_mode(regs) || user_64bit_mode(regs));
#endif
case 0x60:
/* 0x64 thru 0x67 are valid prefixes in all modes. */
return (instr_lo & 0xC) == 0x4;
case 0xF0:
/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
return !instr_lo || (instr_lo>>1) == 1;
case 0x00:
/* Prefetch instruction is 0x0F0D or 0x0F18 */
if (get_kernel_nofault(opcode, instr))
return 0;
*prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18);
return 0;
default:
return 0;
}
}
static bool is_amd_k8_pre_npt(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
c->x86_vendor == X86_VENDOR_AMD &&
c->x86 == 0xf && c->x86_model < 0x40);
}
static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
unsigned char *max_instr;
unsigned char *instr;
int prefetch = 0;
/* Erratum #91 affects AMD K8, pre-NPT CPUs */
if (!is_amd_k8_pre_npt())
return 0;
/*
* If it was a exec (instruction fetch) fault on NX page, then
* do not ignore the fault:
*/
if (error_code & X86_PF_INSTR)
return 0;
instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
/*
* This code has historically always bailed out if IP points to a
* not-present page (e.g. due to a race). No one has ever
* complained about this.
*/
pagefault_disable();
while (instr < max_instr) {
unsigned char opcode;
if (user_mode(regs)) {
if (get_user(opcode, instr))
break;
} else {
if (get_kernel_nofault(opcode, instr))
break;
}
instr++; if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
break;
}
pagefault_enable();
return prefetch;
}
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
unsigned index = pgd_index(address);
pgd_t *pgd_k;
p4d_t *p4d, *p4d_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
pgd += index;
pgd_k = init_mm.pgd + index;
if (!pgd_present(*pgd_k))
return NULL;
/*
* set_pgd(pgd, *pgd_k); here would be useless on PAE
* and redundant with the set_pmd() on non-PAE. As would
* set_p4d/set_pud.
*/
p4d = p4d_offset(pgd, address);
p4d_k = p4d_offset(pgd_k, address);
if (!p4d_present(*p4d_k))
return NULL;
pud = pud_offset(p4d, address);
pud_k = pud_offset(p4d_k, address);
if (!pud_present(*pud_k))
return NULL;
pmd = pmd_offset(pud, address);
pmd_k = pmd_offset(pud_k, address);
if (pmd_present(*pmd) != pmd_present(*pmd_k))
set_pmd(pmd, *pmd_k);
if (!pmd_present(*pmd_k))
return NULL;
else
BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));
return pmd_k;
}
/*
* Handle a fault on the vmalloc or module mapping area
*
* This is needed because there is a race condition between the time
* when the vmalloc mapping code updates the PMD to the point in time
* where it synchronizes this update with the other page-tables in the
* system.
*
* In this race window another thread/CPU can map an area on the same
* PMD, finds it already present and does not synchronize it with the
* rest of the system yet. As a result v[mz]alloc might return areas
* which are not mapped in every page-table in the system, causing an
* unhandled page-fault when they are accessed.
*/
static noinline int vmalloc_fault(unsigned long address)
{
unsigned long pgd_paddr;
pmd_t *pmd_k;
pte_t *pte_k;
/* Make sure we are in vmalloc area: */
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
*
* Do _not_ use "current" here. We might be inside
* an interrupt in the middle of a task switch..
*/
pgd_paddr = read_cr3_pa();
pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
if (!pmd_k)
return -1;
if (pmd_large(*pmd_k))
return 0;
pte_k = pte_offset_kernel(pmd_k, address);
if (!pte_present(*pte_k))
return -1;
return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
unsigned long addr;
for (addr = start & PMD_MASK;
addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
addr += PMD_SIZE) {
struct page *page;
spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
spinlock_t *pgt_lock;
/* the pgt_lock only for Xen */
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
vmalloc_sync_one(page_address(page), addr);
spin_unlock(pgt_lock);
}
spin_unlock(&pgd_lock);
}
}
static bool low_pfn(unsigned long pfn)
{
return pfn < max_low_pfn;
}
static void dump_pagetable(unsigned long address)
{
pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = &base[pgd_index(address)];
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
#ifdef CONFIG_X86_PAE
pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
goto out;
#define pr_pde pr_cont
#else
#define pr_pde pr_info
#endif
p4d = p4d_offset(pgd, address);
pud = pud_offset(p4d, address);
pmd = pmd_offset(pud, address);
pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde
/*
* We must not directly access the pte in the highpte
* case if the page table is located in highmem.
* And let's rather not kmap-atomic the pte, just in case
* it's allocated already:
*/
if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
goto out;
pte = pte_offset_kernel(pmd, address);
pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
pr_cont("\n");
}
#else /* CONFIG_X86_64: */
#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif
static int bad_address(void *p)
{
unsigned long dummy;
return get_kernel_nofault(dummy, (unsigned long *)p);
}
static void dump_pagetable(unsigned long address)
{
pgd_t *base = __va(read_cr3_pa());
pgd_t *pgd = base + pgd_index(address);
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
if (bad_address(pgd))
goto bad;
pr_info("PGD %lx ", pgd_val(*pgd));
if (!pgd_present(*pgd))
goto out;
p4d = p4d_offset(pgd, address);
if (bad_address(p4d))
goto bad;
pr_cont("P4D %lx ", p4d_val(*p4d));
if (!p4d_present(*p4d) || p4d_large(*p4d))
goto out;
pud = pud_offset(p4d, address);
if (bad_address(pud))
goto bad;
pr_cont("PUD %lx ", pud_val(*pud));
if (!pud_present(*pud) || pud_large(*pud))
goto out;
pmd = pmd_offset(pud, address);
if (bad_address(pmd))
goto bad;
pr_cont("PMD %lx ", pmd_val(*pmd));
if (!pmd_present(*pmd) || pmd_large(*pmd))
goto out;
pte = pte_offset_kernel(pmd, address);
if (bad_address(pte))
goto bad;
pr_cont("PTE %lx", pte_val(*pte));
out:
pr_cont("\n");
return;
bad:
pr_info("BAD\n");
}
#endif /* CONFIG_X86_64 */
/*
* Workaround for K8 erratum #93 & buggy BIOS.
*
* BIOS SMM functions are required to use a specific workaround
* to avoid corruption of the 64bit RIP register on C stepping K8.
*
* A lot of BIOS that didn't get tested properly miss this.
*
* The OS sees this as a page fault with the upper 32bits of RIP cleared.
* Try to work around it here.
*
* Note we only handle faults in kernel here.
* Does nothing on 32-bit.
*/
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
|| boot_cpu_data.x86 != 0xf)
return 0;
if (user_mode(regs))
return 0;
if (address != regs->ip)
return 0;
if ((address >> 32) != 0)
return 0;
address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { printk_once(errata93_warning); regs->ip = address; return 1;
}
#endif
return 0;
}
/*
* Work around K8 erratum #100 K8 in compat mode occasionally jumps
* to illegal addresses >4GB.
*
* We catch this in the page fault handler because these addresses
* are not reachable. Just detect this case and return. Any code
* segment in LDT is compatibility mode.
*/
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
return 1;
#endif
return 0;
}
/* Pentium F0 0F C7 C8 bug workaround: */
static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
idt_is_f00f_address(address)) {
handle_invalid_op(regs);
return 1;
}
#endif
return 0;
}
static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
{
u32 offset = (index >> 3) * sizeof(struct desc_struct);
unsigned long addr;
struct ldttss_desc desc;
if (index == 0) {
pr_alert("%s: NULL\n", name);
return;
}
if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
return;
}
if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
sizeof(struct ldttss_desc))) {
pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
name, index);
return;
}
addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
#ifdef CONFIG_X86_64
addr |= ((u64)desc.base3 << 32);
#endif
pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
}
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
if (!oops_may_print())
return;
if (error_code & X86_PF_INSTR) {
unsigned int level;
pgd_t *pgd;
pte_t *pte;
pgd = __va(read_cr3_pa());
pgd += pgd_index(address);
pte = lookup_address_in_pgd(pgd, address, &level);
if (pte && pte_present(*pte) && !pte_exec(*pte))
pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
from_kuid(&init_user_ns, current_uid()));
if (pte && pte_present(*pte) && pte_exec(*pte) &&
(pgd_flags(*pgd) & _PAGE_USER) &&
(__read_cr4() & X86_CR4_SMEP))
pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
from_kuid(&init_user_ns, current_uid()));
}
if (address < PAGE_SIZE && !user_mode(regs))
pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
(void *)address);
else
pr_alert("BUG: unable to handle page fault for address: %px\n",
(void *)address);
pr_alert("#PF: %s %s in %s mode\n",
(error_code & X86_PF_USER) ? "user" : "supervisor",
(error_code & X86_PF_INSTR) ? "instruction fetch" :
(error_code & X86_PF_WRITE) ? "write access" :
"read access",
user_mode(regs) ? "user" : "kernel");
pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
!(error_code & X86_PF_PROT) ? "not-present page" :
(error_code & X86_PF_RSVD) ? "reserved bit violation" :
(error_code & X86_PF_PK) ? "protection keys violation" :
"permissions violation");
if (!(error_code & X86_PF_USER) && user_mode(regs)) {
struct desc_ptr idt, gdt;
u16 ldtr, tr;
/*
* This can happen for quite a few reasons. The more obvious
* ones are faults accessing the GDT, or LDT. Perhaps
* surprisingly, if the CPU tries to deliver a benign or
* contributory exception from user code and gets a page fault
* during delivery, the page fault can be delivered as though
* it originated directly from user code. This could happen
* due to wrong permissions on the IDT, GDT, LDT, TSS, or
* kernel or IST stack.
*/
store_idt(&idt);
/* Usable even on Xen PV -- it's just slow. */
native_store_gdt(&gdt);
pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
idt.address, idt.size, gdt.address, gdt.size);
store_ldt(ldtr);
show_ldttss(&gdt, "LDTR", ldtr);
store_tr(tr);
show_ldttss(&gdt, "TR", tr);
}
dump_pagetable(address);
}
static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
struct task_struct *tsk;
unsigned long flags;
int sig;
flags = oops_begin();
tsk = current;
sig = SIGKILL;
printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
tsk->comm, address);
dump_pagetable(address);
if (__die("Bad pagetable", regs, error_code))
sig = 0;
oops_end(flags, regs, sig);
}
static void sanitize_error_code(unsigned long address,
unsigned long *error_code)
{
/*
* To avoid leaking information about the kernel page
* table layout, pretend that user-mode accesses to
* kernel addresses are always protection faults.
*
* NB: This means that failed vsyscalls with vsyscall=none
* will have the PROT bit. This doesn't leak any
* information and does not appear to cause any problems.
*/
if (address >= TASK_SIZE_MAX)
*error_code |= X86_PF_PROT;
}
static void set_signal_archinfo(unsigned long address,
unsigned long error_code)
{
struct task_struct *tsk = current;
tsk->thread.trap_nr = X86_TRAP_PF;
tsk->thread.error_code = error_code | X86_PF_USER;
tsk->thread.cr2 = address;
}
static noinline void
page_fault_oops(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
#ifdef CONFIG_VMAP_STACK
struct stack_info info;
#endif
unsigned long flags;
int sig;
if (user_mode(regs)) {
/*
* Implicit kernel access from user mode? Skip the stack
* overflow and EFI special cases.
*/
goto oops;
}
#ifdef CONFIG_VMAP_STACK
/*
* Stack overflow? During boot, we can fault near the initial
* stack in the direct map, but that's not an overflow -- check
* that we're in vmalloc space to avoid this.
*/
if (is_vmalloc_addr((void *)address) &&
get_stack_guard_info((void *)address, &info)) {
/*
* We're likely to be running with very little stack space
* left. It's plausible that we'd hit this condition but
* double-fault even before we get this far, in which case
* we're fine: the double-fault handler will deal with it.
*
* We don't want to make it all the way into the oops code
* and then double-fault, though, because we're likely to
* break the console driver and lose most of the stack dump.
*/
call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
handle_stack_overflow,
ASM_CALL_ARG3,
, [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));
unreachable();
}
#endif
/*
* Buggy firmware could access regions which might page fault. If
* this happens, EFI has a special OOPS path that will try to
* avoid hanging the system.
*/
if (IS_ENABLED(CONFIG_EFI))
efi_crash_gracefully_on_page_fault(address);
/* Only not-present faults should be handled by KFENCE. */
if (!(error_code & X86_PF_PROT) &&
kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
return;
oops:
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice:
*/
flags = oops_begin();
show_fault_oops(regs, error_code, address);
if (task_stack_end_corrupted(current))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
sig = SIGKILL;
if (__die("Oops", regs, error_code))
sig = 0;
/* Executive summary in case the body of the oops scrolled away */
printk(KERN_DEFAULT "CR2: %016lx\n", address);
oops_end(flags, regs, sig);
}
static noinline void
kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int signal, int si_code,
u32 pkey)
{
WARN_ON_ONCE(user_mode(regs));
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
/*
* Any interrupt that takes a fault gets the fixup. This makes
* the below recursive fault logic only apply to a faults from
* task context.
*/
if (in_interrupt())
return;
/*
* Per the above we're !in_interrupt(), aka. task context.
*
* In this case we need to make sure we're not recursively
* faulting through the emulate_vsyscall() logic.
*/
if (current->thread.sig_on_uaccess_err && signal) {
sanitize_error_code(address, &error_code);
set_signal_archinfo(address, error_code);
if (si_code == SEGV_PKUERR) {
force_sig_pkuerr((void __user *)address, pkey);
} else {
/* XXX: hwpoison faults will set the wrong code. */
force_sig_fault(signal, si_code, (void __user *)address);
}
}
/*
* Barring that, we can do the fixup and be happy.
*/
return;
}
/*
* AMD erratum #91 manifests as a spurious page fault on a PREFETCH
* instruction.
*/
if (is_prefetch(regs, error_code, address))
return;
page_fault_oops(regs, error_code, address);
}
/*
* Print out info about fatal segfaults, if the show_unhandled_signals
* sysctl is set:
*/
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
unsigned long address, struct task_struct *tsk)
{
const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
if (!unhandled_signal(tsk, SIGSEGV))
return;
if (!printk_ratelimit())
return;
printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
loglvl, tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);
print_vma_addr(KERN_CONT " in ", regs->ip);
printk(KERN_CONT "\n");
show_opcodes(regs, loglvl);
}
/*
* The (legacy) vsyscall page is the long page in the kernel portion
* of the address space that has user-accessible permissions.
*/
static bool is_vsyscall_vaddr(unsigned long vaddr)
{
return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
}
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 pkey, int si_code)
{
struct task_struct *tsk = current;
if (!user_mode(regs)) {
kernelmode_fixup_or_oops(regs, error_code, address,
SIGSEGV, si_code, pkey);
return;
}
if (!(error_code & X86_PF_USER)) {
/* Implicit user access to kernel memory -- just oops */
page_fault_oops(regs, error_code, address);
return;
}
/*
* User mode accesses just cause a SIGSEGV.
* It's possible to have interrupts off here:
*/
local_irq_enable();
/*
* Valid to do another page fault here because this one came
* from user space:
*/
if (is_prefetch(regs, error_code, address))
return;
if (is_errata100(regs, address))
return;
sanitize_error_code(address, &error_code);
if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
return;
if (likely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
set_signal_archinfo(address, error_code);
if (si_code == SEGV_PKUERR)
force_sig_pkuerr((void __user *)address, pkey);
else
force_sig_fault(SIGSEGV, si_code, (void __user *)address);
local_irq_disable();
}
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
__bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
}
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
unsigned long address, u32 pkey, int si_code)
{
struct mm_struct *mm = current->mm;
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
mmap_read_unlock(mm);
__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}
static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
__bad_area(regs, error_code, address, 0, SEGV_MAPERR);
}
static inline bool bad_area_access_from_pkeys(unsigned long error_code,
struct vm_area_struct *vma)
{
/* This code is always called on the current mm */
bool foreign = false;
if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return false;
if (error_code & X86_PF_PK)
return true;
/* this checks permission keys on the VMA: */
if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), (error_code & X86_PF_INSTR), foreign))
return true;
return false;
}
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, struct vm_area_struct *vma)
{
/*
* This OSPKE check is not strictly necessary at runtime.
* But, doing it this way allows compiler optimizations
* if pkeys are compiled out.
*/
if (bad_area_access_from_pkeys(error_code, vma)) {
/*
* A protection key fault means that the PKRU value did not allow
* access to some PTE. Userspace can figure out what PKRU was
* from the XSAVE state. This function captures the pkey from
* the vma and passes it to userspace so userspace can discover
* which protection key was set on the PTE.
*
* If we get here, we know that the hardware signaled a X86_PF_PK
* fault and that there was a VMA once we got in the fault
* handler. It does *not* guarantee that the VMA we find here
* was the one that we faulted on.
*
* 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
* 2. T1 : set PKRU to deny access to pkey=4, touches page
* 3. T1 : faults...
* 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
* 5. T1 : enters fault handler, takes mmap_lock, etc...
* 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
* faulted on a pte with its pkey=4.
*/
u32 pkey = vma_pkey(vma);
__bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
} else {
__bad_area(regs, error_code, address, 0, SEGV_ACCERR);
}
}
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
vm_fault_t fault)
{
/* Kernel mode? Handle exceptions or die: */
if (!user_mode(regs)) {
kernelmode_fixup_or_oops(regs, error_code, address,
SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY);
return;
}
/* User-space => ok to do another page fault: */
if (is_prefetch(regs, error_code, address))
return;
sanitize_error_code(address, &error_code);
if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
return;
set_signal_archinfo(address, error_code);
#ifdef CONFIG_MEMORY_FAILURE
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
struct task_struct *tsk = current;
unsigned lsb = 0;
pr_err(
"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
tsk->comm, tsk->pid, address);
if (fault & VM_FAULT_HWPOISON_LARGE)
lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
if (fault & VM_FAULT_HWPOISON)
lsb = PAGE_SHIFT;
force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
return;
}
#endif
force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
return 0;
if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
return 0;
return 1;
}
/*
* Handle a spurious fault caused by a stale TLB entry.
*
* This allows us to lazily refresh the TLB when increasing the
* permissions of a kernel page (RO -> RW or NX -> X). Doing it
* eagerly is very expensive since that implies doing a full
* cross-processor TLB flush, even if no stale TLB entries exist
* on other processors.
*
* Spurious faults may only occur if the TLB contains an entry with
* fewer permission than the page table entry. Non-present (P = 0)
* and reserved bit (R = 1) faults are never spurious.
*
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*
* Returns non-zero if a spurious fault was handled, zero otherwise.
*
* See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
* (Optional Invalidation).
*/
static noinline int
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
int ret;
/*
* Only writes to RO or instruction fetches from NX may cause
* spurious faults.
*
* These could be from user or supervisor accesses but the TLB
* is only lazily flushed after a kernel mapping protection
* change, so user accesses are not expected to cause spurious
* faults.
*/
if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
error_code != (X86_PF_INSTR | X86_PF_PROT))
return 0;
pgd = init_mm.pgd + pgd_index(address); if (!pgd_present(*pgd))
return 0;
p4d = p4d_offset(pgd, address);
if (!p4d_present(*p4d))
return 0;
if (p4d_large(*p4d))
return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
if (pud_large(*pud))
return spurious_kernel_fault_check(error_code, (pte_t *) pud);
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
if (pmd_large(*pmd))
return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
pte = pte_offset_kernel(pmd, address);
if (!pte_present(*pte))
return 0;
ret = spurious_kernel_fault_check(error_code, pte);
if (!ret)
return 0;
/*
* Make sure we have permissions in PMD.
* If not, then there's a bug in the page tables:
*/
ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
return ret;
}
NOKPROBE_SYMBOL(spurious_kernel_fault);
int show_unhandled_signals = 1;
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
/* This is only called for the current mm, so: */
bool foreign = false;
/*
* Read or write was blocked by protection keys. This is
* always an unconditional error and can never result in
* a follow-up action to resolve the fault, like a COW.
*/
if (error_code & X86_PF_PK)
return 1;
/*
* SGX hardware blocked the access. This usually happens
* when the enclave memory contents have been destroyed, like
* after a suspend/resume cycle. In any case, the kernel can't
* fix the cause of the fault. Handle the fault as an access
* error even in cases where no actual access violation
* occurred. This allows userspace to rebuild the enclave in
* response to the signal.
*/
if (unlikely(error_code & X86_PF_SGX))
return 1;
/*
* Make sure to check the VMA so that we do not perform
* faults just to hit a X86_PF_PK as soon as we fill in a
* page.
*/
if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
(error_code & X86_PF_INSTR), foreign))
return 1;
if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
return 1;
return 0;
}
/* read, present: */
if (unlikely(error_code & X86_PF_PROT))
return 1;
/* read, not present: */
if (unlikely(!vma_is_accessible(vma)))
return 1;
return 0;
}
bool fault_in_kernel_space(unsigned long address)
{
/*
* On 64-bit systems, the vsyscall page is at an address above
* TASK_SIZE_MAX, but is not considered part of the kernel
* address space.
*/
if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
return false;
return address >= TASK_SIZE_MAX;}
/*
* Called for all faults where 'address' is part of the kernel address
* space. Might get called for faults that originate from *code* that
* ran in userspace or the kernel.
*/
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
/*
* Protection keys exceptions only happen on user pages. We
* have no user pages in the kernel portion of the address
* space, so do not expect them here.
*/
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
#ifdef CONFIG_X86_32
/*
* We can fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*
* Before doing this on-demand faulting, ensure that the
* fault is not any of the following:
* 1. A fault on a PTE with a reserved bit set.
* 2. A fault caused by a user-mode access. (Do not demand-
* fault kernel memory due to user-mode accesses).
* 3. A fault caused by a page-level protection violation.
* (A demand fault would be on a non-present page which
* would have X86_PF_PROT==0).
*
* This is only needed to close a race condition on x86-32 in
* the vmalloc mapping/unmapping code. See the comment above
* vmalloc_fault() for details. On x86-64 the race does not
* exist as the vmalloc mappings don't need to be synchronized
* there.
*/
if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
}
#endif
if (is_f00f_bug(regs, hw_error_code, address))
return;
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
/* kprobes don't want to hook the spurious faults: */
if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
return;
/*
* Note, despite being a "bad area", there are quite a few
* acceptable reasons to get here, such as erratum fixups
* and handling kernel code that can fault, like get_user().
*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
bad_area_nosemaphore(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);
/*
* Handle faults in the user portion of the address space. Nothing in here
* should check X86_PF_USER without a specific justification: for almost
* all purposes, we should treat a normal kernel access to user memory
* (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
* The one exception is AC flag handling, which is, per the x86
* architecture, special for WRUSS.
*/
static inline
void do_user_addr_fault(struct pt_regs *regs,
unsigned long error_code,
unsigned long address)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
vm_fault_t fault;
unsigned int flags = FAULT_FLAG_DEFAULT;
tsk = current;
mm = tsk->mm;
if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
/*
* Whoops, this is kernel mode code trying to execute from
* user memory. Unless this is AMD erratum #93, which
* corrupts RIP such that it looks like a user address,
* this is unrecoverable. Don't even try to look up the
* VMA or look for extable entries.
*/
if (is_errata93(regs, address))
return;
page_fault_oops(regs, error_code, address);
return;
}
/* kprobes don't want to hook the spurious faults: */
if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
return;
/*
* Reserved bits are never expected to be set on
* entries in the user portion of the page tables.
*/
if (unlikely(error_code & X86_PF_RSVD)) pgtable_bad(regs, error_code, address);
/*
* If SMAP is on, check for invalid kernel (supervisor) access to user
* pages in the user address space. The odd case here is WRUSS,
* which, according to the preliminary documentation, does not respect
* SMAP and will have the USER bit set so, in all cases, SMAP
* enforcement appears to be consistent with the USER bit.
*/
if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
!(error_code & X86_PF_USER) &&
!(regs->flags & X86_EFLAGS_AC))) {
/*
* No extable entry here. This was a kernel access to an
* invalid pointer. get_kernel_nofault() will not get here.
*/
page_fault_oops(regs, error_code, address);
return;
}
/*
* If we're in an interrupt, have no user context or are running
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) { bad_area_nosemaphore(regs, error_code, address);
return;
}
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
*
* User-mode registers count as a user access even for any
* potential system fault or CPU buglet:
*/
if (user_mode(regs)) {
local_irq_enable();
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
/*
* Faults in the vsyscall page might need emulation. The
* vsyscall page is at a high address (>PAGE_OFFSET), but is
* considered to be part of the user address space.
*
* The vsyscall page does not have a "real" VMA, so do this
* emulation before we go searching for VMAs.
*
* PKRU never rejects instruction fetches, so we don't need
* to consider the PF_PK bit.
*/
if (is_vsyscall_vaddr(address)) {
if (emulate_vsyscall(error_code, regs, address))
return;
}
#endif
/*
* Kernel-mode access to the user address space should only occur
* on well-defined single instructions listed in the exception
* tables. But, an erroneous kernel fault occurring outside one of
* those areas which also holds mmap_lock might deadlock attempting
* to validate the fault against the address space.
*
* Only do the expensive exception table search when we might be at
* risk of a deadlock. This happens if we
* 1. Failed to acquire mmap_lock, and
* 2. The access did not originate in userspace.
*/
if (unlikely(!mmap_read_trylock(mm))) { if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
/*
* Fault from code in kernel from
* which we do not expect faults.
*/
bad_area_nosemaphore(regs, error_code, address);
return;
}
retry:
mmap_read_lock(mm);
} else {
/*
* The above down_read_trylock() might have succeeded in
* which case we'll have missed the might_sleep() from
* down_read():
*/
might_sleep();
}
vma = find_vma(mm, address);
if (unlikely(!vma)) {
bad_area(regs, error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
bad_area(regs, error_code, address);
return;
}
if (unlikely(expand_stack(vma, address))) {
bad_area(regs, error_code, address);
return;
}
/*
* Ok, we have a good vm_area for this memory access, so
* we can handle it..
*/
good_area:
if (unlikely(access_error(error_code, vma))) {
bad_area_access_error(regs, error_code, address, vma);
return;
}
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
* we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
*
* Note that handle_userfault() may also release and reacquire mmap_lock
* (and not return with VM_FAULT_RETRY), when returning to userland to
* repeat the page fault later with a VM_FAULT_NOPAGE retval
* (potentially after handling any pending signal during the return to
* userland). The return to userland is identified whenever
* FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
*/
fault = handle_mm_fault(vma, address, flags, regs);
if (fault_signal_pending(fault, regs)) {
/*
* Quick path to respond to signals. The core mm code
* has unlocked the mm for us if we get here.
*/
if (!user_mode(regs))
kernelmode_fixup_or_oops(regs, error_code, address,
SIGBUS, BUS_ADRERR,
ARCH_DEFAULT_PKEY);
return;
}
/*
* If we need to retry the mmap_lock has already been released,
* and if there is a fatal signal pending there is no guarantee
* that we made any progress. Handle this case first.
*/
if (unlikely((fault & VM_FAULT_RETRY) &&
(flags & FAULT_FLAG_ALLOW_RETRY))) {
flags |= FAULT_FLAG_TRIED;
goto retry;
}
mmap_read_unlock(mm);
if (likely(!(fault & VM_FAULT_ERROR)))
return;
if (fatal_signal_pending(current) && !user_mode(regs)) {
kernelmode_fixup_or_oops(regs, error_code, address,
0, 0, ARCH_DEFAULT_PKEY);
return;
}
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
if (!user_mode(regs)) {
kernelmode_fixup_or_oops(regs, error_code, address,
SIGSEGV, SEGV_MAPERR,
ARCH_DEFAULT_PKEY);
return;
}
/*
* We ran out of memory, call the OOM killer, and return the
* userspace (which will retry the fault, or kill us if we got
* oom-killed):
*/
pagefault_out_of_memory();
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
do_sigbus(regs, error_code, address, fault);
else if (fault & VM_FAULT_SIGSEGV)
bad_area_nosemaphore(regs, error_code, address);
else
BUG();
}
}
NOKPROBE_SYMBOL(do_user_addr_fault);
static __always_inline void
trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
if (!trace_pagefault_enabled())
return;
if (user_mode(regs))
trace_page_fault_user(address, regs, error_code);
else
trace_page_fault_kernel(address, regs, error_code);
}
static __always_inline void
handle_page_fault(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
trace_page_fault_entries(regs, error_code, address);
if (unlikely(kmmio_fault(regs, address)))
return;
/* Was the fault on kernel-controlled part of the address space? */
if (unlikely(fault_in_kernel_space(address))) {
do_kern_addr_fault(regs, error_code, address);
} else {
do_user_addr_fault(regs, error_code, address);
/*
* User address page fault handling might have reenabled
* interrupts. Fixing up all potential exit points of
* do_user_addr_fault() and its leaf functions is just not
* doable w/o creating an unholy mess or turning the code
* upside down.
*/
local_irq_disable();
}
}
DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
{
unsigned long address = read_cr2();
irqentry_state_t state;
prefetchw(¤t->mm->mmap_lock);
/*
* KVM uses #PF vector to deliver 'page not present' events to guests
* (asynchronous page fault mechanism). The event happens when a
* userspace task is trying to access some valid (from guest's point of
* view) memory which is not currently mapped by the host (e.g. the
* memory is swapped out). Note, the corresponding "page ready" event
* which is injected when the memory becomes available, is delivered via
* an interrupt mechanism and not a #PF exception
* (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
*
* We are relying on the interrupted context being sane (valid RSP,
* relevant locks not held, etc.), which is fine as long as the
* interrupted context had IF=1. We are also relying on the KVM
* async pf type field and CR2 being read consistently instead of
* getting values from real and async page faults mixed up.
*
* Fingers crossed.
*
* The async #PF handling code takes care of idtentry handling
* itself.
*/
if (kvm_handle_async_pf(regs, (u32)address))
return;
/*
* Entry handling for valid #PF from kernel mode is slightly
* different: RCU is already watching and rcu_irq_enter() must not
* be invoked because a kernel fault on a user space address might
* sleep.
*
* In case the fault hit a RCU idle region the conditional entry
* code reenabled RCU to avoid subsequent wreckage which helps
* debuggability.
*/
state = irqentry_enter(regs);
instrumentation_begin();
handle_page_fault(regs, error_code, address);
instrumentation_end();
irqentry_exit(regs, state);
}
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"
#include <linux/sched/cputime.h>
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}
/**
* cgroup_rstat_updated - keep track of updated rstat_cpu
* @cgrp: target cgroup
* @cpu: cpu on which rstat_cpu was updated
*
* @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
* rstat_cpu->updated_children list. See the comment on top of
* cgroup_rstat_cpu definition for details.
*/
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
unsigned long flags;
/*
* Speculative already-on-list test. This may race leading to
* temporary inaccuracies, which is fine.
*
* Because @parent's updated_children is terminated with @parent
* instead of NULL, we can tell whether @cgrp is on the list by
* testing the next pointer for NULL.
*/
if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
return;
raw_spin_lock_irqsave(cpu_lock, flags);
/* put @cgrp and all ancestors on the corresponding updated lists */
while (true) {
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_rstat_cpu *prstatc;
/*
* Both additions and removals are bottom-up. If a cgroup
* is already in the tree, all ancestors are.
*/
if (rstatc->updated_next)
break;
/* Root has no parent to link it to, but mark it busy */
if (!parent) {
rstatc->updated_next = cgrp;
break;
}
prstatc = cgroup_rstat_cpu(parent, cpu);
rstatc->updated_next = prstatc->updated_children;
prstatc->updated_children = cgrp;
cgrp = parent;
}
raw_spin_unlock_irqrestore(cpu_lock, flags);
}
/**
* cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
* @pos: current position
* @root: root of the tree to traversal
* @cpu: target cpu
*
* Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
* the traversal and %NULL return indicates the end. During traversal,
* each returned cgroup is unlinked from the tree. Must be called with the
* matching cgroup_rstat_cpu_lock held.
*
* The only ordering guarantee is that, for a parent and a child pair
* covered by a given traversal, if a child is visited, its parent is
* guaranteed to be visited afterwards.
*/
static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_rstat_cpu *rstatc;
if (pos == root)
return NULL;
/*
* We're gonna walk down to the first leaf and visit/remove it. We
* can pick whatever unvisited node as the starting point.
*/
if (!pos)
pos = root;
else
pos = cgroup_parent(pos);
/* walk down to the first leaf */
while (true) {
rstatc = cgroup_rstat_cpu(pos, cpu);
if (rstatc->updated_children == pos)
break;
pos = rstatc->updated_children;
}
/*
* Unlink @pos from the tree. As the updated_children list is
* singly linked, we have to walk it to find the removal point.
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
if (rstatc->updated_next) {
struct cgroup *parent = cgroup_parent(pos);
if (parent) {
struct cgroup_rstat_cpu *prstatc;
struct cgroup **nextp;
prstatc = cgroup_rstat_cpu(parent, cpu);
nextp = &prstatc->updated_children;
while (true) {
struct cgroup_rstat_cpu *nrstatc;
nrstatc = cgroup_rstat_cpu(*nextp, cpu);
if (*nextp == pos)
break;
WARN_ON_ONCE(*nextp == parent);
nextp = &nrstatc->updated_next;
}
*nextp = rstatc->updated_next;
}
rstatc->updated_next = NULL;
return pos;
}
/* only happens for @root */
return NULL;
}
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
int cpu;
lockdep_assert_held(&cgroup_rstat_lock);
for_each_possible_cpu(cpu) {
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
cpu);
struct cgroup *pos = NULL;
raw_spin_lock(cpu_lock);
while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
rcu_read_lock();
list_for_each_entry_rcu(css, &pos->rstat_css_list,
rstat_css_node)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
raw_spin_unlock(cpu_lock);
/* if @may_sleep, play nice and yield if necessary */
if (may_sleep && (need_resched() ||
spin_needbreak(&cgroup_rstat_lock))) {
spin_unlock_irq(&cgroup_rstat_lock);
if (!cond_resched())
cpu_relax();
spin_lock_irq(&cgroup_rstat_lock);
}
}
}
/**
* cgroup_rstat_flush - flush stats in @cgrp's subtree
* @cgrp: target cgroup
*
* Collect all per-cpu stats in @cgrp's subtree into the global counters
* and propagate them upwards. After this function returns, all cgroups in
* the subtree have up-to-date ->stat.
*
* This also gets all cgroups in the subtree including @cgrp off the
* ->updated_children lists.
*
* This function may block.
*/
void cgroup_rstat_flush(struct cgroup *cgrp)
{
might_sleep();
spin_lock_irq(&cgroup_rstat_lock);
cgroup_rstat_flush_locked(cgrp, true);
spin_unlock_irq(&cgroup_rstat_lock);
}
/**
* cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
* @cgrp: target cgroup
*
* This function can be called from any context.
*/
void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
{
unsigned long flags;
spin_lock_irqsave(&cgroup_rstat_lock, flags);
cgroup_rstat_flush_locked(cgrp, false);
spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
}
/**
* cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
* @cgrp: target cgroup
*
* Flush stats in @cgrp's subtree and prevent further flushes. Must be
* paired with cgroup_rstat_flush_release().
*
* This function may block.
*/
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
__acquires(&cgroup_rstat_lock)
{
might_sleep();
spin_lock_irq(&cgroup_rstat_lock);
cgroup_rstat_flush_locked(cgrp, true);
}
/**
* cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
*/
void cgroup_rstat_flush_release(void)
__releases(&cgroup_rstat_lock)
{
spin_unlock_irq(&cgroup_rstat_lock);
}
int cgroup_rstat_init(struct cgroup *cgrp)
{
int cpu;
/* the root cgrp has rstat_cpu preallocated */
if (!cgrp->rstat_cpu) {
cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
if (!cgrp->rstat_cpu)
return -ENOMEM;
}
/* ->updated_children list is self terminated */
for_each_possible_cpu(cpu) {
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
rstatc->updated_children = cgrp;
u64_stats_init(&rstatc->bsync);
}
return 0;
}
void cgroup_rstat_exit(struct cgroup *cgrp)
{
int cpu;
cgroup_rstat_flush(cgrp);
/* sanity check */
for_each_possible_cpu(cpu) {
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
WARN_ON_ONCE(rstatc->updated_next))
return;
}
free_percpu(cgrp->rstat_cpu);
cgrp->rstat_cpu = NULL;
}
void __init cgroup_rstat_boot(void)
{
int cpu;
for_each_possible_cpu(cpu)
raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}
/*
* Functions for cgroup basic resource statistics implemented on top of
* rstat.
*/
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
struct cgroup_base_stat *src_bstat)
{
dst_bstat->cputime.utime += src_bstat->cputime.utime;
dst_bstat->cputime.stime += src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
struct cgroup_base_stat *src_bstat)
{
dst_bstat->cputime.utime -= src_bstat->cputime.utime;
dst_bstat->cputime.stime -= src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_base_stat cur, delta;
unsigned seq;
/* Root-level stats are sourced from system-wide CPU stats */
if (!parent)
return;
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
cur.cputime = rstatc->bstat.cputime;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
/* propagate percpu delta to global */
delta = cur;
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
/* propagate global delta to parent (unless that's root) */
if (cgroup_parent(parent)) {
delta = cgrp->bstat;
cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
cgroup_base_stat_add(&parent->bstat, &delta);
cgroup_base_stat_add(&cgrp->last_bstat, &delta);
}
}
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
struct cgroup_rstat_cpu *rstatc;
rstatc = get_cpu_ptr(cgrp->rstat_cpu);
*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
return rstatc;
}
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
struct cgroup_rstat_cpu *rstatc,
unsigned long flags)
{
u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
cgroup_rstat_updated(cgrp, smp_processor_id());
put_cpu_ptr(rstatc);
}
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
unsigned long flags;
rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
unsigned long flags;
rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
case CPUTIME_USER:
case CPUTIME_NICE:
rstatc->bstat.cputime.utime += delta_exec;
break;
case CPUTIME_SYSTEM:
case CPUTIME_IRQ:
case CPUTIME_SOFTIRQ:
rstatc->bstat.cputime.stime += delta_exec;
break;
default:
break;
}
cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
/*
* compute the cputime for the root cgroup by getting the per cpu data
* at a global level, then categorizing the fields in a manner consistent
* with how it is done by __cgroup_account_cputime_field for each bit of
* cpu time attributed to a cgroup.
*/
static void root_cgroup_cputime(struct task_cputime *cputime)
{
int i;
cputime->stime = 0;
cputime->utime = 0;
cputime->sum_exec_runtime = 0;
for_each_possible_cpu(i) {
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
u64 user = 0;
u64 sys = 0;
kcpustat_cpu_fetch(&kcpustat, i);
user += cpustat[CPUTIME_USER];
user += cpustat[CPUTIME_NICE];
cputime->utime += user;
sys += cpustat[CPUTIME_SYSTEM];
sys += cpustat[CPUTIME_IRQ];
sys += cpustat[CPUTIME_SOFTIRQ];
cputime->stime += sys;
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
}
}
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
struct task_cputime cputime;
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
cgroup_rstat_flush_release();
} else {
root_cgroup_cputime(&cputime);
usage = cputime.sum_exec_runtime;
utime = cputime.utime;
stime = cputime.stime;
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
}
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Routines having to do with the 'struct sk_buff' memory handlers.
*
* Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
* Florian La Roche <rzsfl@rz.uni-sb.de>
*
* Fixes:
* Alan Cox : Fixed the worst of the load
* balancer bugs.
* Dave Platt : Interrupt stacking fix.
* Richard Kooijman : Timestamp fixes.
* Alan Cox : Changed buffer format.
* Alan Cox : destructor hook for AF_UNIX etc.
* Linus Torvalds : Better skb_clone.
* Alan Cox : Added skb_copy.
* Alan Cox : Added all the changed routines Linus
* only put in the headers
* Ray VanTassle : Fixed --skb->lock in free
* Alan Cox : skb_copy copy arp field
* Andi Kleen : slabified it.
* Robert Olsson : Removed skb_head_pool
*
* NOTE:
* The __skb_ routines should be called with interrupts
* disabled, or you better be *real* sure that the operation is atomic
* with respect to whatever list is being frobbed (e.g. via lock_sock()
* or via disabling bottom half handlers, etc).
*/
/*
* The functions in this file will not compile correctly with gcc 2.4.x
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/page_pool.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include "datagram.h"
#include "sock_destructor.h"
struct kmem_cache *skbuff_head_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);
/**
* skb_panic - private function for out-of-line support
* @skb: buffer
* @sz: size
* @addr: address
* @msg: skb_over_panic or skb_under_panic
*
* Out-of-line support for skb_put() and skb_push().
* Called via the wrapper skb_over_panic() or skb_under_panic().
* Keep out of line to prevent kernel bloat.
* __builtin_return_address is not used because it is not always reliable.
*/
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
const char msg[])
{
pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
msg, addr, skb->len, sz, skb->head, skb->data,
(unsigned long)skb->tail, (unsigned long)skb->end,
skb->dev ? skb->dev->name : "<NULL>");
BUG();
}
static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
skb_panic(skb, sz, addr, __func__);
}
static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
skb_panic(skb, sz, addr, __func__);
}
#define NAPI_SKB_CACHE_SIZE 64
#define NAPI_SKB_CACHE_BULK 16
#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
struct napi_alloc_cache {
struct page_frag_cache page;
unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask,
unsigned int align_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask);
}
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
fragsz = SKB_DATA_ALIGN(fragsz);
return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
}
EXPORT_SYMBOL(__napi_alloc_frag_align);
void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
struct page_frag_cache *nc;
void *data;
fragsz = SKB_DATA_ALIGN(fragsz);
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
} else {
local_bh_disable();
data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask);
local_bh_enable();
}
return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);
static struct sk_buff *napi_skb_cache_get(void)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
if (unlikely(!nc->skb_count))
nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
GFP_ATOMIC,
NAPI_SKB_CACHE_BULK,
nc->skb_cache);
if (unlikely(!nc->skb_count))
return NULL;
skb = nc->skb_cache[--nc->skb_count];
kasan_unpoison_object_data(skbuff_head_cache, skb);
return skb;
}
/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
unsigned int frag_size)
{
struct skb_shared_info *shinfo;
unsigned int size = frag_size ? : ksize(data); size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
/* Assumes caller memset cleared SKB */
skb->truesize = SKB_TRUESIZE(size);
refcount_set(&skb->users, 1);
skb->head = data;
skb->data = data;
skb_reset_tail_pointer(skb);
skb_set_end_offset(skb, size);
skb->mac_header = (typeof(skb->mac_header))~0U;
skb->transport_header = (typeof(skb->transport_header))~0U;
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(&shinfo->dataref, 1);
skb_set_kcov_handle(skb, kcov_common_handle());
}
/**
* __build_skb - build a network buffer
* @data: data buffer provided by caller
* @frag_size: size of data, or 0 if head was kmalloced
*
* Allocate a new &sk_buff. Caller provides space holding head and
* skb_shared_info. @data must have been allocated by kmalloc() only if
* @frag_size is 0, otherwise data should come from the page allocator
* or vmalloc()
* The return is the new skb buffer.
* On a failure the return is %NULL, and @data is not freed.
* Notes :
* Before IO, driver allocates only data buffer where NIC put incoming frame
* Driver should add room at head (NET_SKB_PAD) and
* MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
* After IO, driver calls build_skb(), to allocate sk_buff and populate it
* before giving packet to stack.
* RX rings only contains data buffers, not full skbs.
*/
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb;
skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
if (unlikely(!skb))
return NULL;
memset(skb, 0, offsetof(struct sk_buff, tail));
__build_skb_around(skb, data, frag_size);
return skb;
}
/* build_skb() is wrapper over __build_skb(), that specifically
* takes care of skb->head and skb->pfmemalloc
* This means that if @frag_size is not zero, then @data must be backed
* by a page fragment, not kmalloc() or vmalloc()
*/
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb = __build_skb(data, frag_size);
if (skb && frag_size) {
skb->head_frag = 1;
if (page_is_pfmemalloc(virt_to_head_page(data)))
skb->pfmemalloc = 1;
}
return skb;
}
EXPORT_SYMBOL(build_skb);
/**
* build_skb_around - build a network buffer around provided skb
* @skb: sk_buff provide by caller, must be memset cleared
* @data: data buffer provided by caller
* @frag_size: size of data, or 0 if head was kmalloced
*/
struct sk_buff *build_skb_around(struct sk_buff *skb,
void *data, unsigned int frag_size)
{
if (unlikely(!skb))
return NULL;
__build_skb_around(skb, data, frag_size);
if (frag_size) {
skb->head_frag = 1;
if (page_is_pfmemalloc(virt_to_head_page(data)))
skb->pfmemalloc = 1;
}
return skb;
}
EXPORT_SYMBOL(build_skb_around);
/**
* __napi_build_skb - build a network buffer
* @data: data buffer provided by caller
* @frag_size: size of data, or 0 if head was kmalloced
*
* Version of __build_skb() that uses NAPI percpu caches to obtain
* skbuff_head instead of inplace allocation.
*
* Returns a new &sk_buff on success, %NULL on allocation failure.
*/
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb;
skb = napi_skb_cache_get();
if (unlikely(!skb))
return NULL;
memset(skb, 0, offsetof(struct sk_buff, tail));
__build_skb_around(skb, data, frag_size);
return skb;
}
/**
* napi_build_skb - build a network buffer
* @data: data buffer provided by caller
* @frag_size: size of data, or 0 if head was kmalloced
*
* Version of __napi_build_skb() that takes care of skb->head_frag
* and skb->pfmemalloc when the data is a page or page fragment.
*
* Returns a new &sk_buff on success, %NULL on allocation failure.
*/
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb = __napi_build_skb(data, frag_size);
if (likely(skb) && frag_size) {
skb->head_frag = 1;
skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
}
return skb;
}
EXPORT_SYMBOL(napi_build_skb);
/*
* kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
* the caller if emergency pfmemalloc reserves are being used. If it is and
* the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
* may be used. Otherwise, the packet data may be discarded until enough
* memory is free
*/
static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
bool *pfmemalloc)
{
void *obj;
bool ret_pfmemalloc = false;
/*
* Try a regular allocation, when that fails and we're not entitled
* to the reserves, fail.
*/
obj = kmalloc_node_track_caller(size,
flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
node);
if (obj || !(gfp_pfmemalloc_allowed(flags)))
goto out;
/* Try again but now we are using pfmemalloc reserves */
ret_pfmemalloc = true;
obj = kmalloc_node_track_caller(size, flags, node);
out:
if (pfmemalloc) *pfmemalloc = ret_pfmemalloc; return obj;
}
/* Allocate a new skbuff. We do this ourselves so we can fill in a few
* 'private' fields and also do memory statistics to find all the
* [BEEP] leaks.
*
*/
/**
* __alloc_skb - allocate a network buffer
* @size: size to allocate
* @gfp_mask: allocation mask
* @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
* instead of head cache and allocate a cloned (child) skb.
* If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
* allocations in case the data is required for writeback
* @node: numa node to allocate memory on
*
* Allocate a new &sk_buff. The returned buffer has no headroom and a
* tail room of at least size bytes. The object has a reference count
* of one. The return is the buffer. On a failure the return is %NULL.
*
* Buffers may only be allocated from interrupts using a @gfp_mask of
* %GFP_ATOMIC.
*/
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
int flags, int node)
{
struct kmem_cache *cache;
struct sk_buff *skb;
u8 *data;
bool pfmemalloc;
cache = (flags & SKB_ALLOC_FCLONE) ? skbuff_fclone_cache : skbuff_head_cache; if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) gfp_mask |= __GFP_MEMALLOC;
/* Get the HEAD */
if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
likely(node == NUMA_NO_NODE || node == numa_mem_id()))
skb = napi_skb_cache_get();
else
skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); if (unlikely(!skb))
return NULL;
prefetchw(skb);
/* We do our best to align skb_shared_info on a separate cache
* line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
* aligned memory blocks, unless SLUB/SLAB debug is enabled.
* Both skb->head and skb_shared_info are cache line aligned.
*/
size = SKB_DATA_ALIGN(size);
size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
if (unlikely(!data))
goto nodata;
/* kmalloc(size) might give us more room than requested.
* Put skb_shared_info exactly at the end of allocated zone,
* to allow max possible filling before reallocation.
*/
size = SKB_WITH_OVERHEAD(ksize(data));
prefetchw(data + size);
/*
* Only clear those fields we need to clear, not those that we will
* actually initialise below. Hence, don't put any more fields after
* the tail pointer in struct sk_buff!
*/
memset(skb, 0, offsetof(struct sk_buff, tail));
__build_skb_around(skb, data, 0);
skb->pfmemalloc = pfmemalloc;
if (flags & SKB_ALLOC_FCLONE) {
struct sk_buff_fclones *fclones;
fclones = container_of(skb, struct sk_buff_fclones, skb1);
skb->fclone = SKB_FCLONE_ORIG;
refcount_set(&fclones->fclone_ref, 1);
fclones->skb2.fclone = SKB_FCLONE_CLONE;
}
return skb;
nodata:
kmem_cache_free(cache, skb);
return NULL;
}
EXPORT_SYMBOL(__alloc_skb);
/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
* @dev: network device to receive on
* @len: length to allocate
* @gfp_mask: get_free_pages mask, passed to alloc_skb
*
* Allocate a new &sk_buff and assign it a usage count of one. The
* buffer has NET_SKB_PAD headroom built in. Users should allocate
* the headroom they think they need without accounting for the
* built in space. The built in space is used for optimisations.
*
* %NULL is returned if there is no free memory.
*/
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
gfp_t gfp_mask)
{
struct page_frag_cache *nc;
struct sk_buff *skb;
bool pfmemalloc;
void *data;
len += NET_SKB_PAD;
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
*/
if (len <= SKB_WITH_OVERHEAD(1024) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
goto skb_fail;
goto skb_success;
}
len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
len = SKB_DATA_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc(nc, len, gfp_mask);
pfmemalloc = nc->pfmemalloc;
} else {
local_bh_disable();
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc(nc, len, gfp_mask);
pfmemalloc = nc->pfmemalloc;
local_bh_enable();
}
if (unlikely(!data))
return NULL;
skb = __build_skb(data, len);
if (unlikely(!skb)) {
skb_free_frag(data);
return NULL;
}
if (pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
skb_success:
skb_reserve(skb, NET_SKB_PAD);
skb->dev = dev;
skb_fail:
return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
/**
* __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
* @napi: napi instance this buffer was allocated for
* @len: length to allocate
* @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
*
* Allocate a new sk_buff for use in NAPI receive. This buffer will
* attempt to allocate the head from a special reserved region used
* only for NAPI Rx allocation. By doing this we can save several
* CPU cycles by avoiding having to disable and re-enable IRQs.
*
* %NULL is returned if there is no free memory.
*/
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
gfp_t gfp_mask)
{
struct napi_alloc_cache *nc;
struct sk_buff *skb;
void *data;
len += NET_SKB_PAD + NET_IP_ALIGN;
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
*/
if (len <= SKB_WITH_OVERHEAD(1024) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
NUMA_NO_NODE);
if (!skb)
goto skb_fail;
goto skb_success;
}
nc = this_cpu_ptr(&napi_alloc_cache);
len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
len = SKB_DATA_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
data = page_frag_alloc(&nc->page, len, gfp_mask);
if (unlikely(!data))
return NULL;
skb = __napi_build_skb(data, len);
if (unlikely(!skb)) {
skb_free_frag(data);
return NULL;
}
if (nc->page.pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
skb_success:
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
skb->dev = napi->dev;
skb_fail:
return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size, unsigned int truesize)
{
skb_fill_page_desc(skb, i, page, off, size);
skb->len += size;
skb->data_len += size;
skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);
void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
unsigned int truesize)
{
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
skb_frag_size_add(frag, size);
skb->len += size;
skb->data_len += size;
skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);
static void skb_drop_list(struct sk_buff **listp)
{
kfree_skb_list(*listp);
*listp = NULL;
}
static inline void skb_drop_fraglist(struct sk_buff *skb)
{
skb_drop_list(&skb_shinfo(skb)->frag_list);
}
static void skb_clone_fraglist(struct sk_buff *skb)
{
struct sk_buff *list;
skb_walk_frags(skb, list)
skb_get(list);
}
static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;
if (skb->head_frag) {
if (skb_pp_recycle(skb, head))
return;
skb_free_frag(head);
} else {
kfree(head);
}
}
static void skb_release_data(struct sk_buff *skb)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int i;
if (skb->cloned &&
atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
&shinfo->dataref))
goto exit;
skb_zcopy_clear(skb, true);
for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); if (shinfo->frag_list)
kfree_skb_list(shinfo->frag_list);
skb_free_head(skb);
exit:
/* When we clone an SKB we copy the reycling bit. The pp_recycle
* bit is only set on the head though, so in order to avoid races
* while trying to recycle fragments on __skb_frag_unref() we need
* to make one SKB responsible for triggering the recycle path.
* So disable the recycling bit if an SKB is cloned and we have
* additional references to to the fragmented part of the SKB.
* Eventually the last SKB will have the recycling bit set and it's
* dataref set to 0, which will trigger the recycling
*/
skb->pp_recycle = 0;
}
/*
* Free an skbuff by memory without cleaning the state.
*/
static void kfree_skbmem(struct sk_buff *skb)
{
struct sk_buff_fclones *fclones;
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
kmem_cache_free(skbuff_head_cache, skb);
return;
case SKB_FCLONE_ORIG:
fclones = container_of(skb, struct sk_buff_fclones, skb1);
/* We usually free the clone (TX completion) before original skb
* This test would have no chance to be true for the clone,
* while here, branch prediction will be good.
*/
if (refcount_read(&fclones->fclone_ref) == 1)
goto fastpath;
break;
default: /* SKB_FCLONE_CLONE */
fclones = container_of(skb, struct sk_buff_fclones, skb2);
break;
}
if (!refcount_dec_and_test(&fclones->fclone_ref))
return;
fastpath:
kmem_cache_free(skbuff_fclone_cache, fclones);
}
void skb_release_head_state(struct sk_buff *skb)
{
skb_dst_drop(skb);
if (skb->destructor) { WARN_ON(in_hardirq()); skb->destructor(skb);
}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
nf_conntrack_put(skb_nfct(skb));
#endif
skb_ext_put(skb);
}
/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
skb_release_head_state(skb);
if (likely(skb->head))
skb_release_data(skb);
}
/**
* __kfree_skb - private function
* @skb: buffer
*
* Free an sk_buff. Release anything attached to the buffer.
* Clean the state. This is an internal helper function. Users should
* always call kfree_skb
*/
void __kfree_skb(struct sk_buff *skb)
{
skb_release_all(skb);
kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
/**
* kfree_skb - free an sk_buff
* @skb: buffer to free
*
* Drop a reference to the buffer and free it if the usage count has
* hit zero.
*/
void kfree_skb(struct sk_buff *skb)
{
if (!skb_unref(skb))
return;
trace_kfree_skb(skb, __builtin_return_address(0));
__kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb);
void kfree_skb_list(struct sk_buff *segs)
{
while (segs) {
struct sk_buff *next = segs->next;
kfree_skb(segs);
segs = next;
}
}
EXPORT_SYMBOL(kfree_skb_list);
/* Dump skb information and contents.
*
* Must only be called from net_ratelimit()-ed paths.
*
* Dumps whole packets if full_pkt, only headers otherwise.
*/
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
{
struct skb_shared_info *sh = skb_shinfo(skb);
struct net_device *dev = skb->dev;
struct sock *sk = skb->sk;
struct sk_buff *list_skb;
bool has_mac, has_trans;
int headroom, tailroom;
int i, len, seg_len;
if (full_pkt)
len = skb->len;
else
len = min_t(int, skb->len, MAX_HEADER + 128);
headroom = skb_headroom(skb);
tailroom = skb_tailroom(skb);
has_mac = skb_mac_header_was_set(skb);
has_trans = skb_transport_header_was_set(skb);
printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
"mac=(%d,%d) net=(%d,%d) trans=%d\n"
"shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
"csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
"hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
level, skb->len, headroom, skb_headlen(skb), tailroom,
has_mac ? skb->mac_header : -1,
has_mac ? skb_mac_header_len(skb) : -1,
skb->network_header,
has_trans ? skb_network_header_len(skb) : -1,
has_trans ? skb->transport_header : -1,
sh->tx_flags, sh->nr_frags,
sh->gso_size, sh->gso_type, sh->gso_segs,
skb->csum, skb->ip_summed, skb->csum_complete_sw,
skb->csum_valid, skb->csum_level,
skb->hash, skb->sw_hash, skb->l4_hash,
ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
if (dev)
printk("%sdev name=%s feat=%pNF\n",
level, dev->name, &dev->features);
if (sk)
printk("%ssk family=%hu type=%u proto=%u\n",
level, sk->sk_family, sk->sk_type, sk->sk_protocol);
if (full_pkt && headroom)
print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
16, 1, skb->head, headroom, false);
seg_len = min_t(int, skb_headlen(skb), len);
if (seg_len)
print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
16, 1, skb->data, seg_len, false);
len -= seg_len;
if (full_pkt && tailroom)
print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
16, 1, skb_tail_pointer(skb), tailroom, false);
for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
u32 p_off, p_len, copied;
struct page *p;
u8 *vaddr;
skb_frag_foreach_page(frag, skb_frag_off(frag),
skb_frag_size(frag), p, p_off, p_len,
copied) {
seg_len = min_t(int, p_len, len);
vaddr = kmap_atomic(p);
print_hex_dump(level, "skb frag: ",
DUMP_PREFIX_OFFSET,
16, 1, vaddr + p_off, seg_len, false);
kunmap_atomic(vaddr);
len -= seg_len;
if (!len)
break;
}
}
if (full_pkt && skb_has_frag_list(skb)) {
printk("skb fraglist:\n");
skb_walk_frags(skb, list_skb)
skb_dump(level, list_skb, true);
}
}
EXPORT_SYMBOL(skb_dump);
/**
* skb_tx_error - report an sk_buff xmit error
* @skb: buffer that triggered an error
*
* Report xmit error if a device callback is tracking this skb.
* skb must be freed afterwards.
*/
void skb_tx_error(struct sk_buff *skb)
{
skb_zcopy_clear(skb, true);
}
EXPORT_SYMBOL(skb_tx_error);
#ifdef CONFIG_TRACEPOINTS
/**
* consume_skb - free an skbuff
* @skb: buffer to free
*
* Drop a ref to the buffer and free it if the usage count has hit zero
* Functions identically to kfree_skb, but kfree_skb assumes that the frame
* is being dropped after a failure and notes that
*/
void consume_skb(struct sk_buff *skb)
{
if (!skb_unref(skb))
return;
trace_consume_skb(skb);
__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
#endif
/**
* __consume_stateless_skb - free an skbuff, assuming it is stateless
* @skb: buffer to free
*
* Alike consume_skb(), but this variant assumes that this is the last
* skb reference and all the head states have been already dropped
*/
void __consume_stateless_skb(struct sk_buff *skb)
{
trace_consume_skb(skb);
skb_release_data(skb);
kfree_skbmem(skb);
}
static void napi_skb_cache_put(struct sk_buff *skb)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
u32 i;
kasan_poison_object_data(skbuff_head_cache, skb);
nc->skb_cache[nc->skb_count++] = skb;
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
kasan_unpoison_object_data(skbuff_head_cache,
nc->skb_cache[i]);
kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
nc->skb_cache + NAPI_SKB_CACHE_HALF);
nc->skb_count = NAPI_SKB_CACHE_HALF;
}
}
void __kfree_skb_defer(struct sk_buff *skb)
{
skb_release_all(skb);
napi_skb_cache_put(skb);
}
void napi_skb_free_stolen_head(struct sk_buff *skb)
{
if (unlikely(skb->slow_gro)) {
nf_reset_ct(skb);
skb_dst_drop(skb);
skb_ext_put(skb);
skb_orphan(skb);
skb->slow_gro = 0;
}
napi_skb_cache_put(skb);
}
void napi_consume_skb(struct sk_buff *skb, int budget)
{
/* Zero budget indicate non-NAPI context called us, like netpoll */
if (unlikely(!budget)) {
dev_consume_skb_any(skb);
return;
}
lockdep_assert_in_softirq();
if (!skb_unref(skb))
return;
/* if reaching here SKB is ready to free */
trace_consume_skb(skb);
/* if SKB is a clone, don't handle this case */
if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
__kfree_skb(skb);
return;
}
skb_release_all(skb);
napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
/* Make sure a field is enclosed inside headers_start/headers_end section */
#define CHECK_SKB_FIELD(field) \
BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
offsetof(struct sk_buff, headers_start)); \
BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
offsetof(struct sk_buff, headers_end)); \
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
new->tstamp = old->tstamp;
/* We do not copy old->sk */
new->dev = old->dev;
memcpy(new->cb, old->cb, sizeof(old->cb));
skb_dst_copy(new, old);
__skb_ext_copy(new, old);
__nf_copy(new, old, false);
/* Note : this field could be in headers_start/headers_end section
* It is not yet because we do not want to have a 16 bit hole
*/
new->queue_mapping = old->queue_mapping;
memcpy(&new->headers_start, &old->headers_start,
offsetof(struct sk_buff, headers_end) -
offsetof(struct sk_buff, headers_start));
CHECK_SKB_FIELD(protocol);
CHECK_SKB_FIELD(csum);
CHECK_SKB_FIELD(hash);
CHECK_SKB_FIELD(priority);
CHECK_SKB_FIELD(skb_iif);
CHECK_SKB_FIELD(vlan_proto);
CHECK_SKB_FIELD(vlan_tci);
CHECK_SKB_FIELD(transport_header);
CHECK_SKB_FIELD(network_header);
CHECK_SKB_FIELD(mac_header);
CHECK_SKB_FIELD(inner_protocol);
CHECK_SKB_FIELD(inner_transport_header);
CHECK_SKB_FIELD(inner_network_header);
CHECK_SKB_FIELD(inner_mac_header);
CHECK_SKB_FIELD(mark);
#ifdef CONFIG_NETWORK_SECMARK
CHECK_SKB_FIELD(secmark);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
CHECK_SKB_FIELD(napi_id);
#endif
#ifdef CONFIG_XPS
CHECK_SKB_FIELD(sender_cpu);
#endif
#ifdef CONFIG_NET_SCHED
CHECK_SKB_FIELD(tc_index);
#endif
}
/*
* You should not add any new code to this function. Add it to
* __copy_skb_header above instead.
*/
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x
n->next = n->prev = NULL;
n->sk = NULL;
__copy_skb_header(n, skb);
C(len);
C(data_len);
C(mac_len);
n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
n->cloned = 1;
n->nohdr = 0;
n->peeked = 0;
C(pfmemalloc);
C(pp_recycle);
n->destructor = NULL;
C(tail);
C(end);
C(head);
C(head_frag);
C(data);
C(truesize);
refcount_set(&n->users, 1);
atomic_inc(&(skb_shinfo(skb)->dataref));
skb->cloned = 1;
return n;
#undef C
}
/**
* alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
* @first: first sk_buff of the msg
*/
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
{
struct sk_buff *n;
n = alloc_skb(0, GFP_ATOMIC);
if (!n)
return NULL;
n->len = first->len;
n->data_len = first->len;
n->truesize = first->truesize;
skb_shinfo(n)->frag_list = first;
__copy_skb_header(n, first);
n->destructor = NULL;
return n;
}
EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
/**
* skb_morph - morph one skb into another
* @dst: the skb to receive the contents
* @src: the skb to supply the contents
*
* This is identical to skb_clone except that the target skb is
* supplied by the user.
*
* The target skb is returned upon exit.
*/
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
skb_release_all(dst);
return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
unsigned long max_pg, num_pg, new_pg, old_pg;
struct user_struct *user;
if (capable(CAP_IPC_LOCK) || !size)
return 0;
num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */
max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
user = mmp->user ? : current_user();
do {
old_pg = atomic_long_read(&user->locked_vm);
new_pg = old_pg + num_pg;
if (new_pg > max_pg)
return -ENOBUFS;
} while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
old_pg);
if (!mmp->user) {
mmp->user = get_uid(user);
mmp->num_pg = num_pg;
} else {
mmp->num_pg += num_pg;
}
return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
if (mmp->user) {
atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
free_uid(mmp->user);
}
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
{
struct ubuf_info *uarg;
struct sk_buff *skb;
WARN_ON_ONCE(!in_task());
skb = sock_omalloc(sk, 0, GFP_KERNEL);
if (!skb)
return NULL;
BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
uarg = (void *)skb->cb;
uarg->mmp.user = NULL;
if (mm_account_pinned_pages(&uarg->mmp, size)) {
kfree_skb(skb);
return NULL;
}
uarg->callback = msg_zerocopy_callback;
uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
uarg->flags = SKBFL_ZEROCOPY_FRAG;
refcount_set(&uarg->refcnt, 1);
sock_hold(sk);
return uarg;
}
EXPORT_SYMBOL_GPL(msg_zerocopy_alloc);
static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
{
return container_of((void *)uarg, struct sk_buff, cb);
}
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
struct ubuf_info *uarg)
{
if (uarg) {
const u32 byte_limit = 1 << 19; /* limit to a few TSO */
u32 bytelen, next;
/* realloc only when socket is locked (TCP, UDP cork),
* so uarg->len and sk_zckey access is serialized
*/
if (!sock_owned_by_user(sk)) {
WARN_ON_ONCE(1);
return NULL;
}
bytelen = uarg->bytelen + size;
if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
/* TCP can create new skb to attach new uarg */
if (sk->sk_type == SOCK_STREAM)
goto new_alloc;
return NULL;
}
next = (u32)atomic_read(&sk->sk_zckey);
if ((u32)(uarg->id + uarg->len) == next) {
if (mm_account_pinned_pages(&uarg->mmp, size))
return NULL;
uarg->len++;
uarg->bytelen = bytelen;
atomic_set(&sk->sk_zckey, ++next);
/* no extra ref when appending to datagram (MSG_MORE) */
if (sk->sk_type == SOCK_STREAM)
net_zcopy_get(uarg);
return uarg;
}
}
new_alloc:
return msg_zerocopy_alloc(sk, size);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
u32 old_lo, old_hi;
u64 sum_len;
old_lo = serr->ee.ee_info;
old_hi = serr->ee.ee_data;
sum_len = old_hi - old_lo + 1ULL + len;
if (sum_len >= (1ULL << 32))
return false;
if (lo != old_hi + 1)
return false;
serr->ee.ee_data += len;
return true;
}
static void __msg_zerocopy_callback(struct ubuf_info *uarg)
{
struct sk_buff *tail, *skb = skb_from_uarg(uarg);
struct sock_exterr_skb *serr;
struct sock *sk = skb->sk;
struct sk_buff_head *q;
unsigned long flags;
bool is_zerocopy;
u32 lo, hi;
u16 len;
mm_unaccount_pinned_pages(&uarg->mmp);
/* if !len, there was only 1 call, and it was aborted
* so do not queue a completion notification
*/
if (!uarg->len || sock_flag(sk, SOCK_DEAD))
goto release;
len = uarg->len;
lo = uarg->id;
hi = uarg->id + len - 1;
is_zerocopy = uarg->zerocopy;
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = 0;
serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
serr->ee.ee_data = hi;
serr->ee.ee_info = lo;
if (!is_zerocopy)
serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
q = &sk->sk_error_queue;
spin_lock_irqsave(&q->lock, flags);
tail = skb_peek_tail(q);
if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
!skb_zerocopy_notify_extend(tail, lo, len)) {
__skb_queue_tail(q, skb);
skb = NULL;
}
spin_unlock_irqrestore(&q->lock, flags);
sk_error_report(sk);
release:
consume_skb(skb);
sock_put(sk);
}
void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
bool success)
{
uarg->zerocopy = uarg->zerocopy & success;
if (refcount_dec_and_test(&uarg->refcnt))
__msg_zerocopy_callback(uarg);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
struct sock *sk = skb_from_uarg(uarg)->sk;
atomic_dec(&sk->sk_zckey);
uarg->len--;
if (have_uref)
msg_zerocopy_callback(NULL, uarg, true);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
{
return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
struct ubuf_info *uarg)
{
struct ubuf_info *orig_uarg = skb_zcopy(skb);
struct iov_iter orig_iter = msg->msg_iter;
int err, orig_len = skb->len;
/* An skb can only point to one uarg. This edge case happens when
* TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
*/
if (orig_uarg && uarg != orig_uarg)
return -EEXIST;
err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
struct sock *save_sk = skb->sk;
/* Streams do not free skb on error. Reset to prev state. */
msg->msg_iter = orig_iter;
skb->sk = sk;
___pskb_trim(skb, orig_len);
skb->sk = save_sk;
return err;
}
skb_zcopy_set(skb, uarg, NULL);
return skb->len - orig_len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
gfp_t gfp_mask)
{
if (skb_zcopy(orig)) {
if (skb_zcopy(nskb)) {
/* !gfp_mask callers are verified to !skb_zcopy(nskb) */
if (!gfp_mask) {
WARN_ON_ONCE(1);
return -ENOMEM;
}
if (skb_uarg(nskb) == skb_uarg(orig))
return 0;
if (skb_copy_ubufs(nskb, GFP_ATOMIC))
return -EIO;
}
skb_zcopy_set(nskb, skb_uarg(orig), NULL);
}
return 0;
}
/**
* skb_copy_ubufs - copy userspace skb frags buffers to kernel
* @skb: the skb to modify
* @gfp_mask: allocation priority
*
* This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
* It will copy all frags into kernel and drop the reference
* to userspace pages.
*
* If this function is called from an interrupt gfp_mask() must be
* %GFP_ATOMIC.
*
* Returns 0 on success or a negative error code on failure
* to allocate kernel memory to copy to.
*/
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
int num_frags = skb_shinfo(skb)->nr_frags;
struct page *page, *head = NULL;
int i, new_frags;
u32 d_off;
if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
return -EINVAL;
if (!num_frags)
goto release;
new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
for (i = 0; i < new_frags; i++) {
page = alloc_page(gfp_mask);
if (!page) {
while (head) {
struct page *next = (struct page *)page_private(head);
put_page(head);
head = next;
}
return -ENOMEM;
}
set_page_private(page, (unsigned long)head);
head = page;
}
page = head;
d_off = 0;
for (i = 0; i < num_frags; i++) {
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
u32 p_off, p_len, copied;
struct page *p;
u8 *vaddr;
skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
p, p_off, p_len, copied) {
u32 copy, done = 0;
vaddr = kmap_atomic(p);
while (done < p_len) {
if (d_off == PAGE_SIZE) {
d_off = 0;
page = (struct page *)page_private(page);
}
copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
memcpy(page_address(page) + d_off,
vaddr + p_off + done, copy);
done += copy;
d_off += copy;
}
kunmap_atomic(vaddr);
}
}
/* skb frags release userspace buffers */
for (i = 0; i < num_frags; i++)
skb_frag_unref(skb, i);
/* skb frags point to kernel buffers */
for (i = 0; i < new_frags - 1; i++) {
__skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
head = (struct page *)page_private(head);
}
__skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
skb_shinfo(skb)->nr_frags = new_frags;
release:
skb_zcopy_clear(skb, false);
return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);
/**
* skb_clone - duplicate an sk_buff
* @skb: buffer to clone
* @gfp_mask: allocation priority
*
* Duplicate an &sk_buff. The new one is not owned by a socket. Both
* copies share the same packet data but not structure. The new
* buffer has a reference count of 1. If the allocation fails the
* function returns %NULL otherwise the new buffer is returned.
*
* If this function is called from an interrupt gfp_mask() must be
* %GFP_ATOMIC.
*/
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
struct sk_buff_fclones *fclones = container_of(skb,
struct sk_buff_fclones,
skb1);
struct sk_buff *n;
if (skb_orphan_frags(skb, gfp_mask))
return NULL;
if (skb->fclone == SKB_FCLONE_ORIG &&
refcount_read(&fclones->fclone_ref) == 1) {
n = &fclones->skb2;
refcount_set(&fclones->fclone_ref, 2);
} else {
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC; n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
if (!n)
return NULL;
n->fclone = SKB_FCLONE_UNAVAILABLE;
}
return __skb_clone(n, skb);
}
EXPORT_SYMBOL(skb_clone);
void skb_headers_offset_update(struct sk_buff *skb, int off)
{
/* Only adjust this if it actually is csum_start rather than csum */
if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start += off;
/* {transport,network,mac}_header and tail are relative to skb->head */
skb->transport_header += off;
skb->network_header += off;
if (skb_mac_header_was_set(skb))
skb->mac_header += off; skb->inner_transport_header += off;
skb->inner_network_header += off;
skb->inner_mac_header += off;
}
EXPORT_SYMBOL(skb_headers_offset_update);
void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
__copy_skb_header(new, old);
skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
EXPORT_SYMBOL(skb_copy_header);
static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
if (skb_pfmemalloc(skb))
return SKB_ALLOC_RX;
return 0;
}
/**
* skb_copy - create private copy of an sk_buff
* @skb: buffer to copy
* @gfp_mask: allocation priority
*
* Make a copy of both an &sk_buff and its data. This is used when the
* caller wishes to modify the data and needs a private copy of the
* data to alter. Returns %NULL on failure or the pointer to the buffer
* on success. The returned buffer has a reference count of 1.
*
* As by-product this function converts non-linear &sk_buff to linear
* one, so that &sk_buff becomes completely private and caller is allowed
* to modify all the data of returned buffer. This means that this
* function is not recommended for use in circumstances when only
* header is going to be modified. Use pskb_copy() instead.
*/
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
int headerlen = skb_headroom(skb);
unsigned int size = skb_end_offset(skb) + skb->data_len;
struct sk_buff *n = __alloc_skb(size, gfp_mask,
skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
return NULL;
/* Set the data pointer */
skb_reserve(n, headerlen);
/* Set the tail pointer and length */
skb_put(n, skb->len);
BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
skb_copy_header(n, skb);
return n;
}
EXPORT_SYMBOL(skb_copy);
/**
* __pskb_copy_fclone - create copy of an sk_buff with private head.
* @skb: buffer to copy
* @headroom: headroom of new skb
* @gfp_mask: allocation priority
* @fclone: if true allocate the copy of the skb from the fclone
* cache instead of the head cache; it is recommended to set this
* to true for the cases where the copy will likely be cloned
*
* Make a copy of both an &sk_buff and part of its data, located
* in header. Fragmented data remain shared. This is used when
* the caller wishes to modify only header of &sk_buff and needs
* private copy of the header to alter. Returns %NULL on failure
* or the pointer to the buffer on success.
* The returned buffer has a reference count of 1.
*/
struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
gfp_t gfp_mask, bool fclone)
{
unsigned int size = skb_headlen(skb) + headroom;
int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
if (!n)
goto out;
/* Set the data pointer */
skb_reserve(n, headroom);
/* Set the tail pointer and length */
skb_put(n, skb_headlen(skb));
/* Copy the bytes */
skb_copy_from_linear_data(skb, n->data, n->len);
n->truesize += skb->data_len;
n->data_len = skb->data_len;
n->len = skb->len;
if (skb_shinfo(skb)->nr_frags) {
int i;
if (skb_orphan_frags(skb, gfp_mask) ||
skb_zerocopy_clone(n, skb, gfp_mask)) {
kfree_skb(n);
n = NULL;
goto out;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
skb_frag_ref(skb, i);
}
skb_shinfo(n)->nr_frags = i;
}
if (skb_has_frag_list(skb)) {
skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
skb_clone_fraglist(n);
}
skb_copy_header(n, skb);
out:
return n;
}
EXPORT_SYMBOL(__pskb_copy_fclone);
/**
* pskb_expand_head - reallocate header of &sk_buff
* @skb: buffer to reallocate
* @nhead: room to add at head
* @ntail: room to add at tail
* @gfp_mask: allocation priority
*
* Expands (or creates identical copy, if @nhead and @ntail are zero)
* header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
* reference count of 1. Returns zero in the case of success or error,
* if expansion failed. In the last case, &sk_buff is not changed.
*
* All the pointers pointing into skb header may change and must be
* reloaded after call to this function.
*/
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
gfp_t gfp_mask)
{
int i, osize = skb_end_offset(skb);
int size = osize + nhead + ntail;
long off;
u8 *data;
BUG_ON(nhead < 0); BUG_ON(skb_shared(skb)); size = SKB_DATA_ALIGN(size);
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC; data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
size = SKB_WITH_OVERHEAD(ksize(data));
/* Copy only real data... and, alas, header. This should be
* optimized for the cases when header is void.
*/
memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
memcpy((struct skb_shared_info *)(data + size),
skb_shinfo(skb),
offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
/*
* if shinfo is shared we must drop the old head gracefully, but if it
* is not we can just drop the old head and let the existing refcount
* be since all we did is relocate the values
*/
if (skb_cloned(skb)) {
if (skb_orphan_frags(skb, gfp_mask))
goto nofrags;
if (skb_zcopy(skb))
refcount_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb); skb_release_data(skb);
} else {
skb_free_head(skb);
}
off = (data + nhead) - skb->head;
skb->head = data;
skb->head_frag = 0;
skb->data += off;
skb_set_end_offset(skb, size);
#ifdef NET_SKBUFF_DATA_USES_OFFSET
off = nhead;
#endif
skb->tail += off;
skb_headers_offset_update(skb, nhead);
skb->cloned = 0;
skb->hdr_len = 0;
skb->nohdr = 0;
atomic_set(&skb_shinfo(skb)->dataref, 1);
skb_metadata_clear(skb);
/* It is not generally safe to change skb->truesize.
* For the moment, we really care of rx path, or
* when skb is orphaned (not attached to a socket).
*/
if (!skb->sk || skb->destructor == sock_edemux) skb->truesize += size - osize;
return 0;
nofrags:
kfree(data);
nodata:
return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);
/* Make private copy of skb with writable head and some headroom */
struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
struct sk_buff *skb2;
int delta = headroom - skb_headroom(skb);
if (delta <= 0)
skb2 = pskb_copy(skb, GFP_ATOMIC);
else {
skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
GFP_ATOMIC)) {
kfree_skb(skb2);
skb2 = NULL;
}
}
return skb2;
}
EXPORT_SYMBOL(skb_realloc_headroom);
int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
{
unsigned int saved_end_offset, saved_truesize;
struct skb_shared_info *shinfo;
int res;
saved_end_offset = skb_end_offset(skb);
saved_truesize = skb->truesize;
res = pskb_expand_head(skb, 0, 0, pri);
if (res)
return res;
skb->truesize = saved_truesize;
if (likely(skb_end_offset(skb) == saved_end_offset))
return 0;
shinfo = skb_shinfo(skb);
/* We are about to change back skb->end,
* we need to move skb_shinfo() to its new location.
*/
memmove(skb->head + saved_end_offset,
shinfo,
offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));
skb_set_end_offset(skb, saved_end_offset);
return 0;
}
/**
* skb_expand_head - reallocate header of &sk_buff
* @skb: buffer to reallocate
* @headroom: needed headroom
*
* Unlike skb_realloc_headroom, this one does not allocate a new skb
* if possible; copies skb->sk to new skb as needed
* and frees original skb in case of failures.
*
* It expect increased headroom and generates warning otherwise.
*/
struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
{
int delta = headroom - skb_headroom(skb);
int osize = skb_end_offset(skb);
struct sock *sk = skb->sk;
if (WARN_ONCE(delta <= 0,
"%s is expecting an increase in the headroom", __func__))
return skb;
delta = SKB_DATA_ALIGN(delta);
/* pskb_expand_head() might crash, if skb is shared. */
if (skb_shared(skb) || !is_skb_wmem(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!nskb))
goto fail;
if (sk) skb_set_owner_w(nskb, sk); consume_skb(skb);
skb = nskb;
}
if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC))
goto fail;
if (sk && is_skb_wmem(skb)) { delta = skb_end_offset(skb) - osize;
refcount_add(delta, &sk->sk_wmem_alloc);
skb->truesize += delta;
}
return skb;
fail:
kfree_skb(skb);
return NULL;
}
EXPORT_SYMBOL(skb_expand_head);
/**
* skb_copy_expand - copy and expand sk_buff
* @skb: buffer to copy
* @newheadroom: new free bytes at head
* @newtailroom: new free bytes at tail
* @gfp_mask: allocation priority
*
* Make a copy of both an &sk_buff and its data and while doing so
* allocate additional space.
*
* This is used when the caller wishes to modify the data and needs a
* private copy of the data to alter as well as more space for new fields.
* Returns %NULL on failure or the pointer to the buffer
* on success. The returned buffer has a reference count of 1.
*
* You must pass %GFP_ATOMIC as the allocation priority if this function
* is called from an interrupt.
*/
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
int newheadroom, int newtailroom,
gfp_t gfp_mask)
{
/*
* Allocate the copy buffer
*/
struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
gfp_mask, skb_alloc_rx_flag(skb),
NUMA_NO_NODE);
int oldheadroom = skb_headroom(skb);
int head_copy_len, head_copy_off;
if (!n)
return NULL;
skb_reserve(n, newheadroom);
/* Set the tail pointer and length */
skb_put(n, skb->len);
head_copy_len = oldheadroom;
head_copy_off = 0;
if (newheadroom <= head_copy_len)
head_copy_len = newheadroom;
else
head_copy_off = newheadroom - head_copy_len;
/* Copy the linear header and data. */
BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
skb->len + head_copy_len));
skb_copy_header(n, skb);
skb_headers_offset_update(n, newheadroom - oldheadroom);
return n;
}
EXPORT_SYMBOL(skb_copy_expand);
/**
* __skb_pad - zero pad the tail of an skb
* @skb: buffer to pad
* @pad: space to pad
* @free_on_error: free buffer on error
*
* Ensure that a buffer is followed by a padding area that is zero
* filled. Used by network drivers which may DMA or transfer data
* beyond the buffer end onto the wire.
*
* May return error in out of memory cases. The skb is freed on error
* if @free_on_error is true.
*/
int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
{
int err;
int ntail;
/* If the skbuff is non linear tailroom is always zero.. */
if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
memset(skb->data+skb->len, 0, pad);
return 0;
}
ntail = skb->data_len + pad - (skb->end - skb->tail);
if (likely(skb_cloned(skb) || ntail > 0)) {
err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
if (unlikely(err))
goto free_skb;
}
/* FIXME: The use of this function with non-linear skb's really needs
* to be audited.
*/
err = skb_linearize(skb);
if (unlikely(err))
goto free_skb;
memset(skb->data + skb->len, 0, pad);
return 0;
free_skb:
if (free_on_error)
kfree_skb(skb);
return err;
}
EXPORT_SYMBOL(__skb_pad);
/**
* pskb_put - add data to the tail of a potentially fragmented buffer
* @skb: start of the buffer to use
* @tail: tail fragment of the buffer to use
* @len: amount of data to add
*
* This function extends the used data area of the potentially
* fragmented buffer. @tail must be the last fragment of @skb -- or
* @skb itself. If this would exceed the total buffer size the kernel
* will panic. A pointer to the first byte of the extra data is
* returned.
*/
void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
{
if (tail != skb) {
skb->data_len += len;
skb->len += len;
}
return skb_put(tail, len);
}
EXPORT_SYMBOL_GPL(pskb_put);
/**
* skb_put - add data to a buffer
* @skb: buffer to use
* @len: amount of data to add
*
* This function extends the used data area of the buffer. If this would
* exceed the total buffer size the kernel will panic. A pointer to the
* first byte of the extra data is returned.
*/
void *skb_put(struct sk_buff *skb, unsigned int len)
{
void *tmp = skb_tail_pointer(skb); SKB_LINEAR_ASSERT(skb); skb->tail += len;
skb->len += len;
if (unlikely(skb->tail > skb->end))
skb_over_panic(skb, len, __builtin_return_address(0));
return tmp;
}
EXPORT_SYMBOL(skb_put);
/**
* skb_push - add data to the start of a buffer
* @skb: buffer to use
* @len: amount of data to add
*
* This function extends the used data area of the buffer at the buffer
* start. If this would exceed the total buffer headroom the kernel will
* panic. A pointer to the first byte of the extra data is returned.
*/
void *skb_push(struct sk_buff *skb, unsigned int len)
{
skb->data -= len;
skb->len += len;
if (unlikely(skb->data < skb->head))
skb_under_panic(skb, len, __builtin_return_address(0));
return skb->data;
}
EXPORT_SYMBOL(skb_push);
/**
* skb_pull - remove data from the start of a buffer
* @skb: buffer to use
* @len: amount of data to remove
*
* This function removes data from the start of a buffer, returning
* the memory to the headroom. A pointer to the next data in the buffer
* is returned. Once the data has been pulled future pushes will overwrite
* the old data.
*/
void *skb_pull(struct sk_buff *skb, unsigned int len)
{
return skb_pull_inline(skb, len);
}
EXPORT_SYMBOL(skb_pull);
/**
* skb_trim - remove end from a buffer
* @skb: buffer to alter
* @len: new length
*
* Cut the length of a buffer down by removing data from the tail. If
* the buffer is already under the length specified it is not modified.
* The skb must be linear.
*/
void skb_trim(struct sk_buff *skb, unsigned int len)
{
if (skb->len > len)
__skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);
/* Trims skb to length len. It can change skb pointers.
*/
int ___pskb_trim(struct sk_buff *skb, unsigned int len)
{
struct sk_buff **fragp;
struct sk_buff *frag;
int offset = skb_headlen(skb);
int nfrags = skb_shinfo(skb)->nr_frags;
int i;
int err;
if (skb_cloned(skb) &&
unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
return err;
i = 0;
if (offset >= len)
goto drop_pages;
for (; i < nfrags; i++) {
int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (end < len) {
offset = end;
continue;
}
skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
drop_pages:
skb_shinfo(skb)->nr_frags = i;
for (; i < nfrags; i++)
skb_frag_unref(skb, i);
if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
goto done;
}
for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
fragp = &frag->next) {
int end = offset + frag->len;
if (skb_shared(frag)) {
struct sk_buff *nfrag;
nfrag = skb_clone(frag, GFP_ATOMIC);
if (unlikely(!nfrag))
return -ENOMEM;
nfrag->next = frag->next;
consume_skb(frag);
frag = nfrag;
*fragp = frag;
}
if (end < len) {
offset = end;
continue;
}
if (end > len &&
unlikely((err = pskb_trim(frag, len - offset))))
return err;
if (frag->next)
skb_drop_list(&frag->next);
break;
}
done:
if (len > skb_headlen(skb)) {
skb->data_len -= skb->len - len;
skb->len = len;
} else {
skb->len = len;
skb->data_len = 0;
skb_set_tail_pointer(skb, len);
}
if (!skb->sk || skb->destructor == sock_edemux)
skb_condense(skb);
return 0;
}
EXPORT_SYMBOL(___pskb_trim);
/* Note : use pskb_trim_rcsum() instead of calling this directly
*/
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
{
if (skb->ip_summed == CHECKSUM_COMPLETE) {
int delta = skb->len - len;
skb->csum = csum_block_sub(skb->csum,
skb_checksum(skb, len, delta, 0),
len);
} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
if (offset + sizeof(__sum16) > hdlen)
return -EINVAL;
}
return __pskb_trim(skb, len);
}
EXPORT_SYMBOL(pskb_trim_rcsum_slow);
/**
* __pskb_pull_tail - advance tail of skb header
* @skb: buffer to reallocate
* @delta: number of bytes to advance tail
*
* The function makes a sense only on a fragmented &sk_buff,
* it expands header moving its tail forward and copying necessary
* data from fragmented part.
*
* &sk_buff MUST have reference count of 1.
*
* Returns %NULL (and &sk_buff does not change) if pull failed
* or value of new tail of skb in the case of success.
*
* All the pointers pointing into skb header may change and must be
* reloaded after call to this function.
*/
/* Moves tail of skb head forward, copying data from fragmented part,
* when it is necessary.
* 1. It may fail due to malloc failure.
* 2. It may change skb pointers.
*
* It is pretty complicated. Luckily, it is called only in exceptional cases.
*/
void *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
/* If skb has not enough free space at tail, get new one
* plus 128 bytes for future expansions. If we have enough
* room at tail, reallocate without expansion only if skb is cloned.
*/
int i, k, eat = (skb->tail + delta) - skb->end;
if (eat > 0 || skb_cloned(skb)) {
if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
GFP_ATOMIC))
return NULL;
}
BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
skb_tail_pointer(skb), delta));
/* Optimization: no fragments, no reasons to preestimate
* size of pulled pages. Superb.
*/
if (!skb_has_frag_list(skb))
goto pull_pages;
/* Estimate size of pulled pages. */
eat = delta;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (size >= eat)
goto pull_pages;
eat -= size;
}
/* If we need update frag list, we are in troubles.
* Certainly, it is possible to add an offset to skb data,
* but taking into account that pulling is expected to
* be very rare operation, it is worth to fight against
* further bloating skb head and crucify ourselves here instead.
* Pure masohism, indeed. 8)8)
*/
if (eat) {
struct sk_buff *list = skb_shinfo(skb)->frag_list;
struct sk_buff *clone = NULL;
struct sk_buff *insp = NULL;
do {
if (list->len <= eat) {
/* Eaten as whole. */
eat -= list->len;
list = list->next;
insp = list;
} else {
/* Eaten partially. */
if (skb_shared(list)) {
/* Sucks! We need to fork list. :-( */
clone = skb_clone(list, GFP_ATOMIC);
if (!clone)
return NULL;
insp = list->next;
list = clone;
} else {
/* This may be pulled without
* problems. */
insp = list;
}
if (!pskb_pull(list, eat)) {
kfree_skb(clone);
return NULL;
}
break;
}
} while (eat);
/* Free pulled out fragments. */
while ((list = skb_shinfo(skb)->frag_list) != insp) {
skb_shinfo(skb)->frag_list = list->next;
consume_skb(list);
}
/* And insert new clone at head. */
if (clone) {
clone->next = list;
skb_shinfo(skb)->frag_list = clone;
}
}
/* Success! Now we may commit changes to skb data. */
pull_pages:
eat = delta;
k = 0;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (size <= eat) {
skb_frag_unref(skb, i);
eat -= size;
} else {
skb_frag_t *frag = &skb_shinfo(skb)->frags[k];
*frag = skb_shinfo(skb)->frags[i];
if (eat) {
skb_frag_off_add(frag, eat);
skb_frag_size_sub(frag, eat);
if (!i)
goto end;
eat = 0;
}
k++;
}
}
skb_shinfo(skb)->nr_frags = k;
end:
skb->tail += delta;
skb->data_len -= delta;
if (!skb->data_len)
skb_zcopy_clear(skb, false);
return skb_tail_pointer(skb);
}
EXPORT_SYMBOL(__pskb_pull_tail);
/**
* skb_copy_bits - copy bits from skb to kernel buffer
* @skb: source skb
* @offset: offset in source
* @to: destination buffer
* @len: number of bytes to copy
*
* Copy the specified number of bytes from the source skb to the
* destination buffer.
*
* CAUTION ! :
* If its prototype is ever changed,
* check arch/{*}/net/{*}.S files,
* since it is called from BPF assembly code.
*/
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
int start = skb_headlen(skb);
struct sk_buff *frag_iter;
int i, copy;
if (offset > (int)skb->len - len)
goto fault;
/* Copy header. */
if ((copy = start - offset) > 0) { if (copy > len)
copy = len;
skb_copy_from_linear_data_offset(skb, offset, to, copy);
if ((len -= copy) == 0)
return 0; offset += copy;
to += copy;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len); end = start + skb_frag_size(f);
if ((copy = end - offset) > 0) {
u32 p_off, p_len, copied;
struct page *p;
u8 *vaddr;
if (copy > len)
copy = len;
skb_frag_foreach_page(f,
skb_frag_off(f) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
memcpy(to + copied, vaddr + p_off, p_len);
kunmap_atomic(vaddr);
}
if ((len -= copy) == 0)
return 0;
offset += copy;
to += copy;
}
start = end;
}
skb_walk_frags(skb, frag_iter) {
int end;
WARN_ON(start > offset + len); end = start + frag_iter->len;
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
if (skb_copy_bits(frag_iter, offset - start, to, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
to += copy;
}
start = end;
}
if (!len)
return 0;
fault:
return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_bits);
/*
* Callback from splice_to_pipe(), if we need to release some pages
* at the end of the spd in case we error'ed out in filling the pipe.
*/
static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
put_page(spd->pages[i]);
}
static struct page *linear_to_page(struct page *page, unsigned int *len,
unsigned int *offset,
struct sock *sk)
{
struct page_frag *pfrag = sk_page_frag(sk);
if (!sk_page_frag_refill(sk, pfrag))
return NULL;
*len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
memcpy(page_address(pfrag->page) + pfrag->offset,
page_address(page) + *offset, *len);
*offset = pfrag->offset;
pfrag->offset += *len;
return pfrag->page;
}
static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
struct page *page,
unsigned int offset)
{
return spd->nr_pages &&
spd->pages[spd->nr_pages - 1] == page &&
(spd->partial[spd->nr_pages - 1].offset +
spd->partial[spd->nr_pages - 1].len == offset);
}
/*
* Fill page/offset/length into spd, if it can hold more pages.
*/
static bool spd_fill_page(struct splice_pipe_desc *spd,
struct pipe_inode_info *pipe, struct page *page,
unsigned int *len, unsigned int offset,
bool linear,
struct sock *sk)
{
if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
return true;
if (linear) {
page = linear_to_page(page, len, &offset, sk);
if (!page)
return true;
}
if (spd_can_coalesce(spd, page, offset)) {
spd->partial[spd->nr_pages - 1].len += *len;
return false;
}
get_page(page);
spd->pages[spd->nr_pages] = page;
spd->partial[spd->nr_pages].len = *len;
spd->partial[spd->nr_pages].offset = offset;
spd->nr_pages++;
return false;
}
static bool __splice_segment(struct page *page, unsigned int poff,
unsigned int plen, unsigned int *off,
unsigned int *len,
struct splice_pipe_desc *spd, bool linear,
struct sock *sk,
struct pipe_inode_info *pipe)
{
if (!*len)
return true;
/* skip this segment if already processed */
if (*off >= plen) {
*off -= plen;
return false;
}
/* ignore any bits we already processed */
poff += *off;
plen -= *off;
*off = 0;
do {
unsigned int flen = min(*len, plen);
if (spd_fill_page(spd, pipe, page, &flen, poff,
linear, sk))
return true;
poff += flen;
plen -= flen;
*len -= flen;
} while (*len && plen);
return false;
}
/*
* Map linear and fragment data from the skb to spd. It reports true if the
* pipe is full or if we already spliced the requested length.
*/
static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
unsigned int *offset, unsigned int *len,
struct splice_pipe_desc *spd, struct sock *sk)
{
int seg;
struct sk_buff *iter;
/* map the linear part :
* If skb->head_frag is set, this 'linear' part is backed by a
* fragment, and if the head is not shared with any clones then
* we can avoid a copy since we own the head portion of this page.
*/
if (__splice_segment(virt_to_page(skb->data),
(unsigned long) skb->data & (PAGE_SIZE - 1),
skb_headlen(skb),
offset, len, spd,
skb_head_is_locked(skb),
sk, pipe))
return true;
/*
* then map the fragments
*/
for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
if (__splice_segment(skb_frag_page(f),
skb_frag_off(f), skb_frag_size(f),
offset, len, spd, false, sk, pipe))
return true;
}
skb_walk_frags(skb, iter) {
if (*offset >= iter->len) {
*offset -= iter->len;
continue;
}
/* __skb_splice_bits() only fails if the output has no room
* left, so no point in going over the frag_list for the error
* case.
*/
if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
return true;
}
return false;
}
/*
* Map data from the skb to a pipe. Should handle both the linear part,
* the fragments, and the frag list.
*/
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int tlen,
unsigned int flags)
{
struct partial_page partial[MAX_SKB_FRAGS];
struct page *pages[MAX_SKB_FRAGS];
struct splice_pipe_desc spd = {
.pages = pages,
.partial = partial,
.nr_pages_max = MAX_SKB_FRAGS,
.ops = &nosteal_pipe_buf_ops,
.spd_release = sock_spd_release,
};
int ret = 0;
__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
if (spd.nr_pages)
ret = splice_to_pipe(pipe, &spd);
return ret;
}
EXPORT_SYMBOL_GPL(skb_splice_bits);
static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg,
struct kvec *vec, size_t num, size_t size)
{
struct socket *sock = sk->sk_socket;
if (!sock)
return -EINVAL;
return kernel_sendmsg(sock, msg, vec, num, size);
}
static int sendpage_unlocked(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
struct socket *sock = sk->sk_socket;
if (!sock)
return -EINVAL;
return kernel_sendpage(sock, page, offset, size, flags);
}
typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg,
struct kvec *vec, size_t num, size_t size);
typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset,
size_t size, int flags);
static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
int len, sendmsg_func sendmsg, sendpage_func sendpage)
{
unsigned int orig_len = len;
struct sk_buff *head = skb;
unsigned short fragidx;
int slen, ret;
do_frag_list:
/* Deal with head data */
while (offset < skb_headlen(skb) && len) {
struct kvec kv;
struct msghdr msg;
slen = min_t(int, len, skb_headlen(skb) - offset);
kv.iov_base = skb->data + offset;
kv.iov_len = slen;
memset(&msg, 0, sizeof(msg));
msg.msg_flags = MSG_DONTWAIT;
ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked,
sendmsg_unlocked, sk, &msg, &kv, 1, slen);
if (ret <= 0)
goto error;
offset += ret;
len -= ret;
}
/* All the data was skb head? */
if (!len)
goto out;
/* Make offset relative to start of frags */
offset -= skb_headlen(skb);
/* Find where we are in frag list */
for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
if (offset < skb_frag_size(frag))
break;
offset -= skb_frag_size(frag);
}
for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
slen = min_t(size_t, len, skb_frag_size(frag) - offset);
while (slen) {
ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked,
sendpage_unlocked, sk,
skb_frag_page(frag),
skb_frag_off(frag) + offset,
slen, MSG_DONTWAIT);
if (ret <= 0)
goto error;
len -= ret;
offset += ret;
slen -= ret;
}
offset = 0;
}
if (len) {
/* Process any frag lists */
if (skb == head) {
if (skb_has_frag_list(skb)) {
skb = skb_shinfo(skb)->frag_list;
goto do_frag_list;
}
} else if (skb->next) {
skb = skb->next;
goto do_frag_list;
}
}
out:
return orig_len - len;
error:
return orig_len == len ? ret : orig_len - len;
}
/* Send skb data on a socket. Socket must be locked. */
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
int len)
{
return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked,
kernel_sendpage_locked);
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);
/* Send skb data on a socket. Socket must be unlocked. */
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
{
return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked,
sendpage_unlocked);
}
/**
* skb_store_bits - store bits from kernel buffer to skb
* @skb: destination buffer
* @offset: offset in destination
* @from: source buffer
* @len: number of bytes to copy
*
* Copy the specified number of bytes from the source buffer to the
* destination skb. This function handles all the messy bits of
* traversing fragment lists and such.
*/
int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
{
int start = skb_headlen(skb);
struct sk_buff *frag_iter;
int i, copy;
if (offset > (int)skb->len - len)
goto fault;
if ((copy = start - offset) > 0) { if (copy > len)
copy = len;
skb_copy_to_linear_data_offset(skb, offset, from, copy);
if ((len -= copy) == 0)
return 0; offset += copy;
from += copy;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
int end;
WARN_ON(start > offset + len); end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
u32 p_off, p_len, copied;
struct page *p;
u8 *vaddr;
if (copy > len)
copy = len;
skb_frag_foreach_page(frag,
skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
memcpy(vaddr + p_off, from + copied, p_len);
kunmap_atomic(vaddr);
}
if ((len -= copy) == 0)
return 0;
offset += copy;
from += copy;
}
start = end;
}
skb_walk_frags(skb, frag_iter) {
int end;
WARN_ON(start > offset + len); end = start + frag_iter->len;
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
if (skb_store_bits(frag_iter, offset - start,
from, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
from += copy;
}
start = end;
}
if (!len)
return 0;
fault:
return -EFAULT;
}
EXPORT_SYMBOL(skb_store_bits);
/* Checksum skb data. */
__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
__wsum csum, const struct skb_checksum_ops *ops)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
int pos = 0;
/* Checksum header. */
if (copy > 0) {
if (copy > len)
copy = len;
csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
pos = copy;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len); end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
u32 p_off, p_len, copied;
struct page *p;
__wsum csum2;
u8 *vaddr;
if (copy > len)
copy = len;
skb_frag_foreach_page(frag,
skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
csum2 = INDIRECT_CALL_1(ops->update,
csum_partial_ext,
vaddr + p_off, p_len, 0);
kunmap_atomic(vaddr);
csum = INDIRECT_CALL_1(ops->combine,
csum_block_add_ext, csum,
csum2, pos, p_len);
pos += p_len;
}
if (!(len -= copy))
return csum;
offset += copy;
}
start = end;
}
skb_walk_frags(skb, frag_iter) {
int end;
WARN_ON(start > offset + len); end = start + frag_iter->len;
if ((copy = end - offset) > 0) {
__wsum csum2;
if (copy > len)
copy = len;
csum2 = __skb_checksum(frag_iter, offset - start,
copy, 0, ops);
csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
csum, csum2, pos, copy);
if ((len -= copy) == 0)
return csum;
offset += copy;
pos += copy;
}
start = end;
}
BUG_ON(len);
return csum;
}
EXPORT_SYMBOL(__skb_checksum);
__wsum skb_checksum(const struct sk_buff *skb, int offset,
int len, __wsum csum)
{
const struct skb_checksum_ops ops = {
.update = csum_partial_ext,
.combine = csum_block_add_ext,
};
return __skb_checksum(skb, offset, len, csum, &ops);
}
EXPORT_SYMBOL(skb_checksum);
/* Both of above in one bottle. */
__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
u8 *to, int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
int pos = 0;
__wsum csum = 0;
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
csum = csum_partial_copy_nocheck(skb->data + offset, to,
copy);
if ((len -= copy) == 0)
return csum;
offset += copy;
to += copy;
pos = copy;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
WARN_ON(start > offset + len);
end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if ((copy = end - offset) > 0) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
u32 p_off, p_len, copied;
struct page *p;
__wsum csum2;
u8 *vaddr;
if (copy > len)
copy = len;
skb_frag_foreach_page(frag,
skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
csum2 = csum_partial_copy_nocheck(vaddr + p_off,
to + copied,
p_len);
kunmap_atomic(vaddr);
csum = csum_block_add(csum, csum2, pos);
pos += p_len;
}
if (!(len -= copy))
return csum;
offset += copy;
to += copy;
}
start = end;
}
skb_walk_frags(skb, frag_iter) {
__wsum csum2;
int end;
WARN_ON(start > offset + len); end = start + frag_iter->len;
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
csum2 = skb_copy_and_csum_bits(frag_iter,
offset - start,
to, copy);
csum = csum_block_add(csum, csum2, pos);
if ((len -= copy) == 0)
return csum;
offset += copy;
to += copy;
pos += copy;
}
start = end;
}
BUG_ON(len);
return csum;
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
__sum16 sum;
sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
/* See comments in __skb_checksum_complete(). */
if (likely(!sum)) {
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev, skb);
}
if (!skb_shared(skb))
skb->csum_valid = !sum;
return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);
/* This function assumes skb->csum already holds pseudo header's checksum,
* which has been changed from the hardware checksum, for example, by
* __skb_checksum_validate_complete(). And, the original skb->csum must
* have been validated unsuccessfully for CHECKSUM_COMPLETE case.
*
* It returns non-zero if the recomputed checksum is still invalid, otherwise
* zero. The new checksum is stored back into skb->csum unless the skb is
* shared.
*/
__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
__wsum csum;
__sum16 sum;
csum = skb_checksum(skb, 0, skb->len, 0);
sum = csum_fold(csum_add(skb->csum, csum));
/* This check is inverted, because we already knew the hardware
* checksum is invalid before calling this function. So, if the
* re-computed checksum is valid instead, then we have a mismatch
* between the original skb->csum and skb_checksum(). This means either
* the original hardware checksum is incorrect or we screw up skb->csum
* when moving skb->data around.
*/
if (likely(!sum)) {
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev, skb);
}
if (!skb_shared(skb)) {
/* Save full packet checksum */
skb->csum = csum;
skb->ip_summed = CHECKSUM_COMPLETE;
skb->csum_complete_sw = 1;
skb->csum_valid = !sum;
}
return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete);
static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
{
net_warn_ratelimited(
"%s: attempt to compute crc32c without libcrc32c.ko\n",
__func__);
return 0;
}
static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
int offset, int len)
{
net_warn_ratelimited(
"%s: attempt to compute crc32c without libcrc32c.ko\n",
__func__);
return 0;
}
static const struct skb_checksum_ops default_crc32c_ops = {
.update = warn_crc32c_csum_update,
.combine = warn_crc32c_csum_combine,
};
const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
&default_crc32c_ops;
EXPORT_SYMBOL(crc32c_csum_stub);
/**
* skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
* @from: source buffer
*
* Calculates the amount of linear headroom needed in the 'to' skb passed
* into skb_zerocopy().
*/
unsigned int
skb_zerocopy_headlen(const struct sk_buff *from)
{
unsigned int hlen = 0;
if (!from->head_frag ||
skb_headlen(from) < L1_CACHE_BYTES ||
skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) {
hlen = skb_headlen(from);
if (!hlen)
hlen = from->len;
}
if (skb_has_frag_list(from))
hlen = from->len;
return hlen;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
/**
* skb_zerocopy - Zero copy skb to skb
* @to: destination buffer
* @from: source buffer
* @len: number of bytes to copy from source buffer
* @hlen: size of linear headroom in destination buffer
*
* Copies up to `len` bytes from `from` to `to` by creating references
* to the frags in the source buffer.
*
* The `hlen` as calculated by skb_zerocopy_headlen() specifies the
* headroom in the `to` buffer.
*
* Return value:
* 0: everything is OK
* -ENOMEM: couldn't orphan frags of @from due to lack of memory
* -EFAULT: skb_copy_bits() found some problem with skb geometry
*/
int
skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
{
int i, j = 0;
int plen = 0; /* length of skb->head fragment */
int ret;
struct page *page;
unsigned int offset;
BUG_ON(!from->head_frag && !hlen);
/* dont bother with small payloads */
if (len <= skb_tailroom(to))
return skb_copy_bits(from, 0, skb_put(to, len), len);
if (hlen) {
ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
if (unlikely(ret))
return ret;
len -= hlen;
} else {
plen = min_t(int, skb_headlen(from), len);
if (plen) {
page = virt_to_head_page(from->head);
offset = from->data - (unsigned char *)page_address(page);
__skb_fill_page_desc(to, 0, page, offset, plen);
get_page(page);
j = 1;
len -= plen;
}
}
to->truesize += len + plen;
to->len += len + plen;
to->data_len += len + plen;
if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
skb_tx_error(from);
return -ENOMEM;
}
skb_zerocopy_clone(to, from, GFP_ATOMIC);
for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
int size;
if (!len)
break;
skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
len);
skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
len -= size;
skb_frag_ref(to, j);
j++;
}
skb_shinfo(to)->nr_frags = j;
return 0;
}
EXPORT_SYMBOL_GPL(skb_zerocopy);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
__wsum csum;
long csstart;
if (skb->ip_summed == CHECKSUM_PARTIAL)
csstart = skb_checksum_start_offset(skb);
else
csstart = skb_headlen(skb);
BUG_ON(csstart > skb_headlen(skb));
skb_copy_from_linear_data(skb, to, csstart);
csum = 0;
if (csstart != skb->len)
csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
skb->len - csstart);
if (skb->ip_summed == CHECKSUM_PARTIAL) {
long csstuff = csstart + skb->csum_offset;
*((__sum16 *)(to + csstuff)) = csum_fold(csum);
}
}
EXPORT_SYMBOL(skb_copy_and_csum_dev);
/**
* skb_dequeue - remove from the head of the queue
* @list: list to dequeue from
*
* Remove the head of the list. The list lock is taken so the function
* may be used safely with other locking list functions. The head item is
* returned or %NULL if the list is empty.
*/
struct sk_buff *skb_dequeue(struct sk_buff_head *list)
{
unsigned long flags;
struct sk_buff *result;
spin_lock_irqsave(&list->lock, flags);
result = __skb_dequeue(list);
spin_unlock_irqrestore(&list->lock, flags);
return result;
}
EXPORT_SYMBOL(skb_dequeue);
/**
* skb_dequeue_tail - remove from the tail of the queue
* @list: list to dequeue from
*
* Remove the tail of the list. The list lock is taken so the function
* may be used safely with other locking list functions. The tail item is
* returned or %NULL if the list is empty.
*/
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
{
unsigned long flags;
struct sk_buff *result;
spin_lock_irqsave(&list->lock, flags);
result = __skb_dequeue_tail(list);
spin_unlock_irqrestore(&list->lock, flags);
return result;
}
EXPORT_SYMBOL(skb_dequeue_tail);
/**
* skb_queue_purge - empty a list
* @list: list to empty
*
* Delete all buffers on an &sk_buff list. Each buffer is removed from
* the list and one reference dropped. This function takes the list
* lock and is atomic with respect to other list locking functions.
*/
void skb_queue_purge(struct sk_buff_head *list)
{
struct sk_buff *skb;
while ((skb = skb_dequeue(list)) != NULL) kfree_skb(skb);
}
EXPORT_SYMBOL(skb_queue_purge);
/**
* skb_rbtree_purge - empty a skb rbtree
* @root: root of the rbtree to empty
* Return value: the sum of truesizes of all purged skbs.
*
* Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
* the list and one reference dropped. This function does not take
* any lock. Synchronization should be handled by the caller (e.g., TCP
* out-of-order queue is protected by the socket lock).
*/
unsigned int skb_rbtree_purge(struct rb_root *root)
{
struct rb_node *p = rb_first(root);
unsigned int sum = 0;
while (p) {
struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
p = rb_next(p);
rb_erase(&skb->rbnode, root);
sum += skb->truesize;
kfree_skb(skb);
}
return sum;
}
/**
* skb_queue_head - queue a buffer at the list head
* @list: list to use
* @newsk: buffer to queue
*
* Queue a buffer at the start of the list. This function takes the
* list lock and can be used safely with other locking &sk_buff functions
* safely.
*
* A buffer cannot be placed on two lists at the same time.
*/
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
{
unsigned long flags;
spin_lock_irqsave(&list->lock, flags);
__skb_queue_head(list, newsk);
spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_queue_head);
/**
* skb_queue_tail - queue a buffer at the list tail
* @list: list to use
* @newsk: buffer to queue
*
* Queue a buffer at the tail of the list. This function takes the
* list lock and can be used safely with other locking &sk_buff functions
* safely.
*
* A buffer cannot be placed on two lists at the same time.
*/
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
{
unsigned long flags;
spin_lock_irqsave(&list->lock, flags);
__skb_queue_tail(list, newsk);
spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_queue_tail);
/**
* skb_unlink - remove a buffer from a list
* @skb: buffer to remove
* @list: list to use
*
* Remove a packet from a list. The list locks are taken and this
* function is atomic with respect to other list locked calls
*
* You must know what list the SKB is on.
*/
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
unsigned long flags;
spin_lock_irqsave(&list->lock, flags);
__skb_unlink(skb, list);
spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_unlink);
/**
* skb_append - append a buffer
* @old: buffer to insert after
* @newsk: buffer to insert
* @list: list to use
*
* Place a packet after a given packet in a list. The list locks are taken
* and this function is atomic with respect to other list locked calls.
* A buffer cannot be placed on two lists at the same time.
*/
void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
unsigned long flags;
spin_lock_irqsave(&list->lock, flags);
__skb_queue_after(list, old, newsk);
spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_append);
static inline void skb_split_inside_header(struct sk_buff *skb,
struct sk_buff* skb1,
const u32 len, const int pos)
{
int i;
skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
pos - len);
/* And move data appendix as is. */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
skb_shinfo(skb)->nr_frags = 0;
skb1->data_len = skb->data_len;
skb1->len += skb1->data_len;
skb->data_len = 0;
skb->len = len;
skb_set_tail_pointer(skb, len);
}
static inline void skb_split_no_header(struct sk_buff *skb,
struct sk_buff* skb1,
const u32 len, int pos)
{
int i, k = 0;
const int nfrags = skb_shinfo(skb)->nr_frags;
skb_shinfo(skb)->nr_frags = 0;
skb1->len = skb1->data_len = skb->len - len;
skb->len = len;
skb->data_len = len - pos;
for (i = 0; i < nfrags; i++) {
int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (pos + size > len) {
skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
if (pos < len) {
/* Split frag.
* We have two variants in this case:
* 1. Move all the frag to the second
* part, if it is possible. F.e.
* this approach is mandatory for TUX,
* where splitting is expensive.
* 2. Split is accurately. We make this.
*/
skb_frag_ref(skb, i);
skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
skb_shinfo(skb)->nr_frags++;
}
k++;
} else
skb_shinfo(skb)->nr_frags++;
pos += size;
}
skb_shinfo(skb1)->nr_frags = k;
}
/**
* skb_split - Split fragmented skb to two parts at length len.
* @skb: the buffer to split
* @skb1: the buffer to receive the second part
* @len: new length for skb
*/
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
int pos = skb_headlen(skb);
skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
skb_zerocopy_clone(skb1, skb, 0);
if (len < pos) /* Split line is inside header. */
skb_split_inside_header(skb, skb1, len, pos);
else /* Second chunk has no header, nothing to copy. */
skb_split_no_header(skb, skb1, len, pos);
}
EXPORT_SYMBOL(skb_split);
/* Shifting from/to a cloned skb is a no-go.
*
* Caller cannot keep skb_shinfo related pointers past calling here!
*/
static int skb_prepare_for_shift(struct sk_buff *skb)
{
return skb_unclone_keeptruesize(skb, GFP_ATOMIC);
}
/**
* skb_shift - Shifts paged data partially from skb to another
* @tgt: buffer into which tail data gets added
* @skb: buffer from which the paged data comes from
* @shiftlen: shift up to this many bytes
*
* Attempts to shift up to shiftlen worth of bytes, which may be less than
* the length of the skb, from skb to tgt. Returns number bytes shifted.
* It's up to caller to free skb if everything was shifted.
*
* If @tgt runs out of frags, the whole operation is aborted.
*
* Skb cannot include anything else but paged data while tgt is allowed
* to have non-paged data as well.
*
* TODO: full sized shift could be optimized but that would need
* specialized skb free'er to handle frags without up-to-date nr_frags.
*/
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
int from, to, merge, todo;
skb_frag_t *fragfrom, *fragto;
BUG_ON(shiftlen > skb->len);
if (skb_headlen(skb))
return 0;
if (skb_zcopy(tgt) || skb_zcopy(skb))
return 0;
todo = shiftlen;
from = 0;
to = skb_shinfo(tgt)->nr_frags;
fragfrom = &skb_shinfo(skb)->frags[from];
/* Actual merge is delayed until the point when we know we can
* commit all, so that we don't have to undo partial changes
*/
if (!to ||
!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
skb_frag_off(fragfrom))) {
merge = -1;
} else {
merge = to - 1;
todo -= skb_frag_size(fragfrom);
if (todo < 0) {
if (skb_prepare_for_shift(skb) ||
skb_prepare_for_shift(tgt))
return 0;
/* All previous frag pointers might be stale! */
fragfrom = &skb_shinfo(skb)->frags[from];
fragto = &skb_shinfo(tgt)->frags[merge];
skb_frag_size_add(fragto, shiftlen);
skb_frag_size_sub(fragfrom, shiftlen);
skb_frag_off_add(fragfrom, shiftlen);
goto onlymerged;
}
from++;
}
/* Skip full, not-fitting skb to avoid expensive operations */
if ((shiftlen == skb->len) &&
(skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
return 0;
if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
return 0;
while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
if (to == MAX_SKB_FRAGS)
return 0;
fragfrom = &skb_shinfo(skb)->frags[from];
fragto = &skb_shinfo(tgt)->frags[to];
if (todo >= skb_frag_size(fragfrom)) {
*fragto = *fragfrom;
todo -= skb_frag_size(fragfrom);
from++;
to++;
} else {
__skb_frag_ref(fragfrom);
skb_frag_page_copy(fragto, fragfrom);
skb_frag_off_copy(fragto, fragfrom);
skb_frag_size_set(fragto, todo);
skb_frag_off_add(fragfrom, todo);
skb_frag_size_sub(fragfrom, todo);
todo = 0;
to++;
break;
}
}
/* Ready to "commit" this state change to tgt */
skb_shinfo(tgt)->nr_frags = to;
if (merge >= 0) {
fragfrom = &skb_shinfo(skb)->frags[0];
fragto = &skb_shinfo(tgt)->frags[merge];
skb_frag_size_add(fragto, skb_frag_size(fragfrom));
__skb_frag_unref(fragfrom, skb->pp_recycle);
}
/* Reposition in the original skb */
to = 0;
while (from < skb_shinfo(skb)->nr_frags)
skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
skb_shinfo(skb)->nr_frags = to;
BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
onlymerged:
/* Most likely the tgt won't ever need its checksum anymore, skb on
* the other hand might need it if it needs to be resent
*/
tgt->ip_summed = CHECKSUM_PARTIAL;
skb->ip_summed = CHECKSUM_PARTIAL;
/* Yak, is it really working this way? Some helper please? */
skb->len -= shiftlen;
skb->data_len -= shiftlen;
skb->truesize -= shiftlen;
tgt->len += shiftlen;
tgt->data_len += shiftlen;
tgt->truesize += shiftlen;
return shiftlen;
}
/**
* skb_prepare_seq_read - Prepare a sequential read of skb data
* @skb: the buffer to read
* @from: lower offset of data to be read
* @to: upper offset of data to be read
* @st: state variable
*
* Initializes the specified state variable. Must be called before
* invoking skb_seq_read() for the first time.
*/
void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
unsigned int to, struct skb_seq_state *st)
{
st->lower_offset = from;
st->upper_offset = to;
st->root_skb = st->cur_skb = skb;
st->frag_idx = st->stepped_offset = 0;
st->frag_data = NULL;
st->frag_off = 0;
}
EXPORT_SYMBOL(skb_prepare_seq_read);
/**
* skb_seq_read - Sequentially read skb data
* @consumed: number of bytes consumed by the caller so far
* @data: destination pointer for data to be returned
* @st: state variable
*
* Reads a block of skb data at @consumed relative to the
* lower offset specified to skb_prepare_seq_read(). Assigns
* the head of the data block to @data and returns the length
* of the block or 0 if the end of the skb data or the upper
* offset has been reached.
*
* The caller is not required to consume all of the data
* returned, i.e. @consumed is typically set to the number
* of bytes already consumed and the next call to
* skb_seq_read() will return the remaining part of the block.
*
* Note 1: The size of each block of data returned can be arbitrary,
* this limitation is the cost for zerocopy sequential
* reads of potentially non linear data.
*
* Note 2: Fragment lists within fragments are not implemented
* at the moment, state->root_skb could be replaced with
* a stack for this purpose.
*/
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
struct skb_seq_state *st)
{
unsigned int block_limit, abs_offset = consumed + st->lower_offset;
skb_frag_t *frag;
if (unlikely(abs_offset >= st->upper_offset)) {
if (st->frag_data) {
kunmap_atomic(st->frag_data);
st->frag_data = NULL;
}
return 0;
}
next_skb:
block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
if (abs_offset < block_limit && !st->frag_data) {
*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
return block_limit - abs_offset;
}
if (st->frag_idx == 0 && !st->frag_data)
st->stepped_offset += skb_headlen(st->cur_skb);
while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
unsigned int pg_idx, pg_off, pg_sz;
frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
pg_idx = 0;
pg_off = skb_frag_off(frag);
pg_sz = skb_frag_size(frag);
if (skb_frag_must_loop(skb_frag_page(frag))) {
pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
pg_off = offset_in_page(pg_off + st->frag_off);
pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
PAGE_SIZE - pg_off);
}
block_limit = pg_sz + st->stepped_offset;
if (abs_offset < block_limit) {
if (!st->frag_data)
st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);
*data = (u8 *)st->frag_data + pg_off +
(abs_offset - st->stepped_offset);
return block_limit - abs_offset;
}
if (st->frag_data) {
kunmap_atomic(st->frag_data);
st->frag_data = NULL;
}
st->stepped_offset += pg_sz;
st->frag_off += pg_sz;
if (st->frag_off == skb_frag_size(frag)) {
st->frag_off = 0;
st->frag_idx++;
}
}
if (st->frag_data) {
kunmap_atomic(st->frag_data);
st->frag_data = NULL;
}
if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
st->frag_idx = 0;
goto next_skb;
} else if (st->cur_skb->next) {
st->cur_skb = st->cur_skb->next;
st->frag_idx = 0;
goto next_skb;
}
return 0;
}
EXPORT_SYMBOL(skb_seq_read);
/**
* skb_abort_seq_read - Abort a sequential read of skb data
* @st: state variable
*
* Must be called if skb_seq_read() was not called until it
* returned 0.
*/
void skb_abort_seq_read(struct skb_seq_state *st)
{
if (st->frag_data)
kunmap_atomic(st->frag_data);
}
EXPORT_SYMBOL(skb_abort_seq_read);
#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
struct ts_config *conf,
struct ts_state *state)
{
return skb_seq_read(offset, text, TS_SKB_CB(state));
}
static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
skb_abort_seq_read(TS_SKB_CB(state));
}
/**
* skb_find_text - Find a text pattern in skb data
* @skb: the buffer to look in
* @from: search offset
* @to: search limit
* @config: textsearch configuration
*
* Finds a pattern in the skb data according to the specified
* textsearch configuration. Use textsearch_next() to retrieve
* subsequent occurrences of the pattern. Returns the offset
* to the first occurrence or UINT_MAX if no match was found.
*/
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
unsigned int to, struct ts_config *config)
{
struct ts_state state;
unsigned int ret;
BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb));
config->get_next_block = skb_ts_get_next_block;
config->finish = skb_ts_finish;
skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
ret = textsearch_find(config, &state);
return (ret <= to - from ? ret : UINT_MAX);
}
EXPORT_SYMBOL(skb_find_text);
int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
int offset, size_t size)
{
int i = skb_shinfo(skb)->nr_frags;
if (skb_can_coalesce(skb, i, page, offset)) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
} else if (i < MAX_SKB_FRAGS) {
get_page(page);
skb_fill_page_desc(skb, i, page, offset, size);
} else {
return -EMSGSIZE;
}
return 0;
}
EXPORT_SYMBOL_GPL(skb_append_pagefrags);
/**
* skb_pull_rcsum - pull skb and update receive checksum
* @skb: buffer to update
* @len: length of data pulled
*
* This function performs an skb_pull on the packet and updates
* the CHECKSUM_COMPLETE checksum. It should be used on
* receive path processing instead of skb_pull unless you know
* that the checksum difference is zero (e.g., a valid IP header)
* or you are setting ip_summed to CHECKSUM_NONE.
*/
void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
{
unsigned char *data = skb->data;
BUG_ON(len > skb->len);
__skb_pull(skb, len);
skb_postpull_rcsum(skb, data, len);
return skb->data;
}
EXPORT_SYMBOL_GPL(skb_pull_rcsum);
static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
{
skb_frag_t head_frag;
struct page *page;
page = virt_to_head_page(frag_skb->head);
__skb_frag_set_page(&head_frag, page);
skb_frag_off_set(&head_frag, frag_skb->data -
(unsigned char *)page_address(page));
skb_frag_size_set(&head_frag, skb_headlen(frag_skb));
return head_frag;
}
struct sk_buff *skb_segment_list(struct sk_buff *skb,
netdev_features_t features,
unsigned int offset)
{
struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
unsigned int tnl_hlen = skb_tnl_header_len(skb);
unsigned int delta_truesize = 0;
unsigned int delta_len = 0;
struct sk_buff *tail = NULL;
struct sk_buff *nskb, *tmp;
int err;
skb_push(skb, -skb_network_offset(skb) + offset);
skb_shinfo(skb)->frag_list = NULL;
do {
nskb = list_skb;
list_skb = list_skb->next;
err = 0;
delta_truesize += nskb->truesize;
if (skb_shared(nskb)) {
tmp = skb_clone(nskb, GFP_ATOMIC);
if (tmp) {
consume_skb(nskb);
nskb = tmp;
err = skb_unclone(nskb, GFP_ATOMIC);
} else {
err = -ENOMEM;
}
}
if (!tail)
skb->next = nskb;
else
tail->next = nskb;
if (unlikely(err)) {
nskb->next = list_skb;
goto err_linearize;
}
tail = nskb;
delta_len += nskb->len;
skb_push(nskb, -skb_network_offset(nskb) + offset);
skb_release_head_state(nskb);
__copy_skb_header(nskb, skb);
skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
skb_copy_from_linear_data_offset(skb, -tnl_hlen,
nskb->data - tnl_hlen,
offset + tnl_hlen);
if (skb_needs_linearize(nskb, features) &&
__skb_linearize(nskb))
goto err_linearize;
} while (list_skb);
skb->truesize = skb->truesize - delta_truesize;
skb->data_len = skb->data_len - delta_len;
skb->len = skb->len - delta_len;
skb_gso_reset(skb);
skb->prev = tail;
if (skb_needs_linearize(skb, features) &&
__skb_linearize(skb))
goto err_linearize;
skb_get(skb);
return skb;
err_linearize:
kfree_skb_list(skb->next);
skb->next = NULL;
return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(skb_segment_list);
int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
{
if (unlikely(p->len + skb->len >= 65536))
return -E2BIG;
if (NAPI_GRO_CB(p)->last == p)
skb_shinfo(p)->frag_list = skb;
else
NAPI_GRO_CB(p)->last->next = skb;
skb_pull(skb, skb_gro_offset(skb));
NAPI_GRO_CB(p)->last = skb;
NAPI_GRO_CB(p)->count++;
p->data_len += skb->len;
/* sk owenrship - if any - completely transferred to the aggregated packet */
skb->destructor = NULL;
p->truesize += skb->truesize;
p->len += skb->len;
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
}
/**
* skb_segment - Perform protocol segmentation on skb.
* @head_skb: buffer to segment
* @features: features for the output path (see dev->features)
*
* This function performs segmentation on the given skb. It returns
* a pointer to the first in a list of new skbs for the segments.
* In case of error it returns ERR_PTR(err).
*/
struct sk_buff *skb_segment(struct sk_buff *head_skb,
netdev_features_t features)
{
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
skb_frag_t *frag = skb_shinfo(head_skb)->frags;
unsigned int mss = skb_shinfo(head_skb)->gso_size;
unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
struct sk_buff *frag_skb = head_skb;
unsigned int offset = doffset;
unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
unsigned int partial_segs = 0;
unsigned int headroom;
unsigned int len = head_skb->len;
__be16 proto;
bool csum, sg;
int nfrags = skb_shinfo(head_skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
int pos;
if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) &&
(skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) {
/* gso_size is untrusted, and we have a frag_list with a linear
* non head_frag head.
*
* (we assume checking the first list_skb member suffices;
* i.e if either of the list_skb members have non head_frag
* head, then the first one has too).
*
* If head_skb's headlen does not fit requested gso_size, it
* means that the frag_list members do NOT terminate on exact
* gso_size boundaries. Hence we cannot perform skb_frag_t page
* sharing. Therefore we must fallback to copying the frag_list
* skbs; we do so by disabling SG.
*/
if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb))
features &= ~NETIF_F_SG;
}
__skb_push(head_skb, doffset);
proto = skb_network_protocol(head_skb, NULL);
if (unlikely(!proto))
return ERR_PTR(-EINVAL);
sg = !!(features & NETIF_F_SG);
csum = !!can_checksum_protocol(features, proto);
if (sg && csum && (mss != GSO_BY_FRAGS)) {
if (!(features & NETIF_F_GSO_PARTIAL)) {
struct sk_buff *iter;
unsigned int frag_len;
if (!list_skb ||
!net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
goto normal;
/* If we get here then all the required
* GSO features except frag_list are supported.
* Try to split the SKB to multiple GSO SKBs
* with no frag_list.
* Currently we can do that only when the buffers don't
* have a linear part and all the buffers except
* the last are of the same length.
*/
frag_len = list_skb->len;
skb_walk_frags(head_skb, iter) {
if (frag_len != iter->len && iter->next)
goto normal;
if (skb_headlen(iter) && !iter->head_frag)
goto normal;
len -= iter->len;
}
if (len != frag_len)
goto normal;
}
/* GSO partial only requires that we trim off any excess that
* doesn't fit into an MSS sized block, so take care of that
* now.
*/
partial_segs = len / mss;
if (partial_segs > 1)
mss *= partial_segs;
else
partial_segs = 0;
}
normal:
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
do {
struct sk_buff *nskb;
skb_frag_t *nskb_frag;
int hsize;
int size;
if (unlikely(mss == GSO_BY_FRAGS)) {
len = list_skb->len;
} else {
len = head_skb->len - offset;
if (len > mss)
len = mss;
}
hsize = skb_headlen(head_skb) - offset;
if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
(skb_headlen(list_skb) == len || sg)) {
BUG_ON(skb_headlen(list_skb) > len);
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;
pos += skb_headlen(list_skb);
while (pos < offset + len) {
BUG_ON(i >= nfrags);
size = skb_frag_size(frag);
if (pos + size > offset + len)
break;
i++;
pos += size;
frag++;
}
nskb = skb_clone(list_skb, GFP_ATOMIC);
list_skb = list_skb->next;
if (unlikely(!nskb))
goto err;
if (unlikely(pskb_trim(nskb, len))) {
kfree_skb(nskb);
goto err;
}
hsize = skb_end_offset(nskb);
if (skb_cow_head(nskb, doffset + headroom)) {
kfree_skb(nskb);
goto err;
}
nskb->truesize += skb_end_offset(nskb) - hsize;
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
if (hsize < 0)
hsize = 0;
if (hsize > len || !sg)
hsize = len;
nskb = __alloc_skb(hsize + doffset + headroom,
GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
NUMA_NO_NODE);
if (unlikely(!nskb))
goto err;
skb_reserve(nskb, headroom);
__skb_put(nskb, doffset);
}
if (segs)
tail->next = nskb;
else
segs = nskb;
tail = nskb;
__copy_skb_header(nskb, head_skb);
skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
skb_reset_mac_len(nskb);
skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
nskb->data - tnl_hlen,
doffset + tnl_hlen);
if (nskb->len == len + doffset)
goto perform_csum_check;
if (!sg) {
if (!csum) {
if (!nskb->remcsum_offload)
nskb->ip_summed = CHECKSUM_NONE;
SKB_GSO_CB(nskb)->csum =
skb_copy_and_csum_bits(head_skb, offset,
skb_put(nskb,
len),
len);
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
} else {
skb_copy_bits(head_skb, offset,
skb_put(nskb, len),
len);
}
continue;
}
nskb_frag = skb_shinfo(nskb)->frags;
skb_copy_from_linear_data_offset(head_skb, offset,
skb_put(nskb, hsize), hsize);
skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
SKBFL_SHARED_FRAG;
if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
goto err;
while (pos < offset + len) {
if (i >= nfrags) {
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;
if (!skb_headlen(list_skb)) {
BUG_ON(!nfrags);
} else {
BUG_ON(!list_skb->head_frag);
/* to make room for head_frag. */
i--;
frag--;
}
if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
skb_zerocopy_clone(nskb, frag_skb,
GFP_ATOMIC))
goto err;
list_skb = list_skb->next;
}
if (unlikely(skb_shinfo(nskb)->nr_frags >=
MAX_SKB_FRAGS)) {
net_warn_ratelimited(
"skb_segment: too many frags: %u %u\n",
pos, mss);
err = -EINVAL;
goto err;
}
*nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
__skb_frag_ref(nskb_frag);
size = skb_frag_size(nskb_frag);
if (pos < offset) {
skb_frag_off_add(nskb_frag, offset - pos);
skb_frag_size_sub(nskb_frag, offset - pos);
}
skb_shinfo(nskb)->nr_frags++;
if (pos + size <= offset + len) {
i++;
frag++;
pos += size;
} else {
skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
goto skip_fraglist;
}
nskb_frag++;
}
skip_fraglist:
nskb->data_len = len - hsize;
nskb->len += nskb->data_len;
nskb->truesize += nskb->data_len;
perform_csum_check:
if (!csum) {
if (skb_has_shared_frag(nskb) &&
__skb_linearize(nskb))
goto err;
if (!nskb->remcsum_offload)
nskb->ip_summed = CHECKSUM_NONE;
SKB_GSO_CB(nskb)->csum =
skb_checksum(nskb, doffset,
nskb->len - doffset, 0);
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
}
} while ((offset += len) < head_skb->len);
/* Some callers want to get the end of the list.
* Put it in segs->prev to avoid walking the list.
* (see validate_xmit_skb_list() for example)
*/
segs->prev = tail;
if (partial_segs) {
struct sk_buff *iter;
int type = skb_shinfo(head_skb)->gso_type;
unsigned short gso_size = skb_shinfo(head_skb)->gso_size;
/* Update type to add partial and then remove dodgy if set */
type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
type &= ~SKB_GSO_DODGY;
/* Update GSO info and prepare to start updating headers on
* our way back down the stack of protocols.
*/
for (iter = segs; iter; iter = iter->next) {
skb_shinfo(iter)->gso_size = gso_size;
skb_shinfo(iter)->gso_segs = partial_segs;
skb_shinfo(iter)->gso_type = type;
SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
}
if (tail->len - doffset <= gso_size)
skb_shinfo(tail)->gso_size = 0;
else if (tail != segs)
skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
}
/* Following permits correct backpressure, for protocols
* using skb_set_owner_w().
* Idea is to tranfert ownership from head_skb to last segment.
*/
if (head_skb->destructor == sock_wfree) {
swap(tail->truesize, head_skb->truesize);
swap(tail->destructor, head_skb->destructor);
swap(tail->sk, head_skb->sk);
}
return segs;
err:
kfree_skb_list(segs);
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(skb_segment);
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
{
struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
unsigned int offset = skb_gro_offset(skb);
unsigned int headlen = skb_headlen(skb);
unsigned int len = skb_gro_len(skb);
unsigned int delta_truesize;
unsigned int new_truesize;
struct sk_buff *lp;
if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
return -E2BIG;
lp = NAPI_GRO_CB(p)->last;
pinfo = skb_shinfo(lp);
if (headlen <= offset) {
skb_frag_t *frag;
skb_frag_t *frag2;
int i = skbinfo->nr_frags;
int nr_frags = pinfo->nr_frags + i;
if (nr_frags > MAX_SKB_FRAGS)
goto merge;
offset -= headlen;
pinfo->nr_frags = nr_frags;
skbinfo->nr_frags = 0;
frag = pinfo->frags + nr_frags;
frag2 = skbinfo->frags + i;
do {
*--frag = *--frag2;
} while (--i);
skb_frag_off_add(frag, offset);
skb_frag_size_sub(frag, offset);
/* all fragments truesize : remove (head size + sk_buff) */
new_truesize = SKB_TRUESIZE(skb_end_offset(skb));
delta_truesize = skb->truesize - new_truesize;
skb->truesize = new_truesize;
skb->len -= skb->data_len;
skb->data_len = 0;
NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
goto done;
} else if (skb->head_frag) {
int nr_frags = pinfo->nr_frags;
skb_frag_t *frag = pinfo->frags + nr_frags;
struct page *page = virt_to_head_page(skb->head);
unsigned int first_size = headlen - offset;
unsigned int first_offset;
if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
goto merge;
first_offset = skb->data -
(unsigned char *)page_address(page) +
offset;
pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
__skb_frag_set_page(frag, page);
skb_frag_off_set(frag, first_offset);
skb_frag_size_set(frag, first_size);
memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
/* We dont need to clear skbinfo->nr_frags here */
new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
delta_truesize = skb->truesize - new_truesize;
skb->truesize = new_truesize;
NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
goto done;
}
merge:
/* sk owenrship - if any - completely transferred to the aggregated packet */
skb->destructor = NULL;
delta_truesize = skb->truesize;
if (offset > headlen) {
unsigned int eat = offset - headlen;
skb_frag_off_add(&skbinfo->frags[0], eat);
skb_frag_size_sub(&skbinfo->frags[0], eat);
skb->data_len -= eat;
skb->len -= eat;
offset = headlen;
}
__skb_pull(skb, offset);
if (NAPI_GRO_CB(p)->last == p)
skb_shinfo(p)->frag_list = skb;
else
NAPI_GRO_CB(p)->last->next = skb;
NAPI_GRO_CB(p)->last = skb;
__skb_header_release(skb);
lp = p;
done:
NAPI_GRO_CB(p)->count++;
p->data_len += len;
p->truesize += delta_truesize;
p->len += len;
if (lp != p) {
lp->data_len += len;
lp->truesize += delta_truesize;
lp->len += len;
}
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
}
#ifdef CONFIG_SKB_EXTENSIONS
#define SKB_EXT_ALIGN_VALUE 8
#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)
static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
[SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
#endif
#ifdef CONFIG_XFRM
[SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
[TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
#endif
#if IS_ENABLED(CONFIG_MPTCP)
[SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
#endif
};
static __always_inline unsigned int skb_ext_total_length(void)
{
return SKB_EXT_CHUNKSIZEOF(struct skb_ext) +
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
#endif
#ifdef CONFIG_XFRM
skb_ext_type_len[SKB_EXT_SEC_PATH] +
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
skb_ext_type_len[TC_SKB_EXT] +
#endif
#if IS_ENABLED(CONFIG_MPTCP)
skb_ext_type_len[SKB_EXT_MPTCP] +
#endif
0;
}
static void skb_extensions_init(void)
{
BUILD_BUG_ON(SKB_EXT_NUM >= 8);
BUILD_BUG_ON(skb_ext_total_length() > 255);
skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache",
SKB_EXT_ALIGN_VALUE * skb_ext_total_length(),
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
}
#else
static void skb_extensions_init(void) {}
#endif
void __init skb_init(void)
{
skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
sizeof(struct sk_buff),
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
offsetof(struct sk_buff, cb),
sizeof_field(struct sk_buff, cb),
NULL);
skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
sizeof(struct sk_buff_fclones),
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
skb_extensions_init();
}
static int
__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
unsigned int recursion_level)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
int elt = 0;
if (unlikely(recursion_level >= 24))
return -EMSGSIZE;
if (copy > 0) {
if (copy > len)
copy = len;
sg_set_buf(sg, skb->data + offset, copy);
elt++;
if ((len -= copy) == 0)
return elt;
offset += copy;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
WARN_ON(start > offset + len);
end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if ((copy = end - offset) > 0) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (unlikely(elt && sg_is_last(&sg[elt - 1])))
return -EMSGSIZE;
if (copy > len)
copy = len;
sg_set_page(&sg[elt], skb_frag_page(frag), copy,
skb_frag_off(frag) + offset - start);
elt++;
if (!(len -= copy))
return elt;
offset += copy;
}
start = end;
}
skb_walk_frags(skb, frag_iter) {
int end, ret;
WARN_ON(start > offset + len);
end = start + frag_iter->len;
if ((copy = end - offset) > 0) {
if (unlikely(elt && sg_is_last(&sg[elt - 1])))
return -EMSGSIZE;
if (copy > len)
copy = len;
ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
copy, recursion_level + 1);
if (unlikely(ret < 0))
return ret;
elt += ret;
if ((len -= copy) == 0)
return elt;
offset += copy;
}
start = end;
}
BUG_ON(len);
return elt;
}
/**
* skb_to_sgvec - Fill a scatter-gather list from a socket buffer
* @skb: Socket buffer containing the buffers to be mapped
* @sg: The scatter-gather list to map into
* @offset: The offset into the buffer's contents to start mapping
* @len: Length of buffer space to be mapped
*
* Fill the specified scatter-gather list with mappings/pointers into a
* region of the buffer space attached to a socket buffer. Returns either
* the number of scatterlist items used, or -EMSGSIZE if the contents
* could not fit.
*/
int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
if (nsg <= 0)
return nsg;
sg_mark_end(&sg[nsg - 1]);
return nsg;
}
EXPORT_SYMBOL_GPL(skb_to_sgvec);
/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given
* sglist without mark the sg which contain last skb data as the end.
* So the caller can mannipulate sg list as will when padding new data after
* the first call without calling sg_unmark_end to expend sg list.
*
* Scenario to use skb_to_sgvec_nomark:
* 1. sg_init_table
* 2. skb_to_sgvec_nomark(payload1)
* 3. skb_to_sgvec_nomark(payload2)
*
* This is equivalent to:
* 1. sg_init_table
* 2. skb_to_sgvec(payload1)
* 3. sg_unmark_end
* 4. skb_to_sgvec(payload2)
*
* When mapping mutilple payload conditionally, skb_to_sgvec_nomark
* is more preferable.
*/
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
int offset, int len)
{
return __skb_to_sgvec(skb, sg, offset, len, 0);
}
EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
/**
* skb_cow_data - Check that a socket buffer's data buffers are writable
* @skb: The socket buffer to check.
* @tailbits: Amount of trailing space to be added
* @trailer: Returned pointer to the skb where the @tailbits space begins
*
* Make sure that the data buffers attached to a socket buffer are
* writable. If they are not, private copies are made of the data buffers
* and the socket buffer is set to use these instead.
*
* If @tailbits is given, make sure that there is space to write @tailbits
* bytes of data beyond current end of socket buffer. @trailer will be
* set to point to the skb in which this space begins.
*
* The number of scatterlist elements required to completely map the
* COW'd and extended socket buffer will be returned.
*/
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
{
int copyflag;
int elt;
struct sk_buff *skb1, **skb_p;
/* If skb is cloned or its head is paged, reallocate
* head pulling out all the pages (pages are considered not writable
* at the moment even if they are anonymous).
*/
if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
!__pskb_pull_tail(skb, __skb_pagelen(skb)))
return -ENOMEM;
/* Easy case. Most of packets will go this way. */
if (!skb_has_frag_list(skb)) {
/* A little of trouble, not enough of space for trailer.
* This should not happen, when stack is tuned to generate
* good frames. OK, on miss we reallocate and reserve even more
* space, 128 bytes is fair. */
if (skb_tailroom(skb) < tailbits &&
pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
return -ENOMEM;
/* Voila! */
*trailer = skb;
return 1;
}
/* Misery. We are in troubles, going to mincer fragments... */
elt = 1;
skb_p = &skb_shinfo(skb)->frag_list;
copyflag = 0;
while ((skb1 = *skb_p) != NULL) {
int ntail = 0;
/* The fragment is partially pulled by someone,
* this can happen on input. Copy it and everything
* after it. */
if (skb_shared(skb1))
copyflag = 1;
/* If the skb is the last, worry about trailer. */
if (skb1->next == NULL && tailbits) {
if (skb_shinfo(skb1)->nr_frags ||
skb_has_frag_list(skb1) ||
skb_tailroom(skb1) < tailbits)
ntail = tailbits + 128;
}
if (copyflag ||
skb_cloned(skb1) ||
ntail ||
skb_shinfo(skb1)->nr_frags ||
skb_has_frag_list(skb1)) {
struct sk_buff *skb2;
/* Fuck, we are miserable poor guys... */
if (ntail == 0)
skb2 = skb_copy(skb1, GFP_ATOMIC);
else
skb2 = skb_copy_expand(skb1,
skb_headroom(skb1),
ntail,
GFP_ATOMIC);
if (unlikely(skb2 == NULL))
return -ENOMEM;
if (skb1->sk)
skb_set_owner_w(skb2, skb1->sk);
/* Looking around. Are we still alive?
* OK, link new skb, drop old one */
skb2->next = skb1->next;
*skb_p = skb2;
kfree_skb(skb1);
skb1 = skb2;
}
elt++;
*trailer = skb1;
skb_p = &skb1->next;
}
return elt;
}
EXPORT_SYMBOL_GPL(skb_cow_data);
static void sock_rmem_free(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}
static void skb_set_err_queue(struct sk_buff *skb)
{
/* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
* So, it is safe to (mis)use it to mark skbs on the error queue.
*/
skb->pkt_type = PACKET_OUTGOING;
BUILD_BUG_ON(PACKET_OUTGOING == 0);
}
/*
* Note: We dont mem charge error packets (no sk_forward_alloc changes)
*/
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
(unsigned int)READ_ONCE(sk->sk_rcvbuf))
return -ENOMEM;
skb_orphan(skb);
skb->sk = sk;
skb->destructor = sock_rmem_free;
atomic_add(skb->truesize, &sk->sk_rmem_alloc);
skb_set_err_queue(skb);
/* before exiting rcu section, make sure dst is refcounted */
skb_dst_force(skb);
skb_queue_tail(&sk->sk_error_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
sk_error_report(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);
static bool is_icmp_err_skb(const struct sk_buff *skb)
{
return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
}
struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
struct sk_buff_head *q = &sk->sk_error_queue;
struct sk_buff *skb, *skb_next = NULL;
bool icmp_next = false;
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
skb = __skb_dequeue(q);
if (skb && (skb_next = skb_peek(q))) {
icmp_next = is_icmp_err_skb(skb_next);
if (icmp_next)
sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
}
spin_unlock_irqrestore(&q->lock, flags);
if (is_icmp_err_skb(skb) && !icmp_next)
sk->sk_err = 0;
if (skb_next)
sk_error_report(sk);
return skb;
}
EXPORT_SYMBOL(sock_dequeue_err_skb);
/**
* skb_clone_sk - create clone of skb, and take reference to socket
* @skb: the skb to clone
*
* This function creates a clone of a buffer that holds a reference on
* sk_refcnt. Buffers created via this function are meant to be
* returned using sock_queue_err_skb, or free via kfree_skb.
*
* When passing buffers allocated with this function to sock_queue_err_skb
* it is necessary to wrap the call with sock_hold/sock_put in order to
* prevent the socket from being released prior to being enqueued on
* the sk_error_queue.
*/
struct sk_buff *skb_clone_sk(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct sk_buff *clone;
if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
return NULL;
clone = skb_clone(skb, GFP_ATOMIC);
if (!clone) {
sock_put(sk);
return NULL;
}
clone->sk = sk;
clone->destructor = sock_efree;
return clone;
}
EXPORT_SYMBOL(skb_clone_sk);
static void __skb_complete_tx_timestamp(struct sk_buff *skb,
struct sock *sk,
int tstype,
bool opt_stats)
{
struct sock_exterr_skb *serr;
int err;
BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
serr->ee.ee_info = tstype;
serr->opt_stats = opt_stats;
serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
if (sk->sk_protocol == IPPROTO_TCP &&
sk->sk_type == SOCK_STREAM)
serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
}
err = sock_queue_err_skb(sk, skb);
if (err)
kfree_skb(skb);
}
static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
bool ret;
if (likely(sysctl_tstamp_allow_data || tsonly))
return true;
read_lock_bh(&sk->sk_callback_lock);
ret = sk->sk_socket && sk->sk_socket->file &&
file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW);
read_unlock_bh(&sk->sk_callback_lock);
return ret;
}
void skb_complete_tx_timestamp(struct sk_buff *skb,
struct skb_shared_hwtstamps *hwtstamps)
{
struct sock *sk = skb->sk;
if (!skb_may_tx_timestamp(sk, false))
goto err;
/* Take a reference to prevent skb_orphan() from freeing the socket,
* but only if the socket refcount is not zero.
*/
if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
*skb_hwtstamps(skb) = *hwtstamps;
__skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
sock_put(sk);
return;
}
err:
kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
void __skb_tstamp_tx(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
struct sock *sk, int tstype)
{
struct sk_buff *skb;
bool tsonly, opt_stats = false;
if (!sk)
return;
if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
return;
tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
if (!skb_may_tx_timestamp(sk, tsonly))
return;
if (tsonly) {
#ifdef CONFIG_INET
if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
sk->sk_protocol == IPPROTO_TCP &&
sk->sk_type == SOCK_STREAM) {
skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
ack_skb);
opt_stats = true;
} else
#endif
skb = alloc_skb(0, GFP_ATOMIC);
} else {
skb = skb_clone(orig_skb, GFP_ATOMIC);
}
if (!skb)
return;
if (tsonly) {
skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
SKBTX_ANY_TSTAMP;
skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
}
if (hwtstamps)
*skb_hwtstamps(skb) = *hwtstamps;
else
skb->tstamp = ktime_get_real();
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
void skb_tstamp_tx(struct sk_buff *orig_skb,
struct skb_shared_hwtstamps *hwtstamps)
{
return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
SCM_TSTAMP_SND);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
{
struct sock *sk = skb->sk;
struct sock_exterr_skb *serr;
int err = 1;
skb->wifi_acked_valid = 1;
skb->wifi_acked = acked;
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
/* Take a reference to prevent skb_orphan() from freeing the socket,
* but only if the socket refcount is not zero.
*/
if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
err = sock_queue_err_skb(sk, skb);
sock_put(sk);
}
if (err)
kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
/**
* skb_partial_csum_set - set up and verify partial csum values for packet
* @skb: the skb to set
* @start: the number of bytes after skb->data to start checksumming.
* @off: the offset from start to place the checksum.
*
* For untrusted partially-checksummed packets, we need to make sure the values
* for skb->csum_start and skb->csum_offset are valid so we don't oops.
*
* This function checks and sets those values and skb->ip_summed: if this
* returns false you should drop the packet.
*/
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
{
u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
u32 csum_start = skb_headroom(skb) + (u32)start;
if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) {
net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
start, off, skb_headroom(skb), skb_headlen(skb));
return false;
}
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = csum_start;
skb->csum_offset = off;
skb_set_transport_header(skb, start);
return true;
}
EXPORT_SYMBOL_GPL(skb_partial_csum_set);
static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
unsigned int max)
{
if (skb_headlen(skb) >= len)
return 0;
/* If we need to pullup then pullup to the max, so we
* won't need to do it again.
*/
if (max > skb->len)
max = skb->len;
if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
return -ENOMEM;
if (skb_headlen(skb) < len)
return -EPROTO;
return 0;
}
#define MAX_TCP_HDR_LEN (15 * 4)
static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
typeof(IPPROTO_IP) proto,
unsigned int off)
{
int err;
switch (proto) {
case IPPROTO_TCP:
err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
off + MAX_TCP_HDR_LEN);
if (!err && !skb_partial_csum_set(skb, off,
offsetof(struct tcphdr,
check)))
err = -EPROTO;
return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
case IPPROTO_UDP:
err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
off + sizeof(struct udphdr));
if (!err && !skb_partial_csum_set(skb, off,
offsetof(struct udphdr,
check)))
err = -EPROTO;
return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
}
return ERR_PTR(-EPROTO);
}
/* This value should be large enough to cover a tagged ethernet header plus
* maximally sized IP and TCP or UDP headers.
*/
#define MAX_IP_HDR_LEN 128
static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
{
unsigned int off;
bool fragment;
__sum16 *csum;
int err;
fragment = false;
err = skb_maybe_pull_tail(skb,
sizeof(struct iphdr),
MAX_IP_HDR_LEN);
if (err < 0)
goto out;
if (ip_is_fragment(ip_hdr(skb)))
fragment = true;
off = ip_hdrlen(skb);
err = -EPROTO;
if (fragment)
goto out;
csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
if (IS_ERR(csum))
return PTR_ERR(csum);
if (recalculate)
*csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr,
skb->len - off,
ip_hdr(skb)->protocol, 0);
err = 0;
out:
return err;
}
/* This value should be large enough to cover a tagged ethernet header plus
* an IPv6 header, all options, and a maximal TCP or UDP header.
*/
#define MAX_IPV6_HDR_LEN 256
#define OPT_HDR(type, skb, off) \
(type *)(skb_network_header(skb) + (off))
static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
{
int err;
u8 nexthdr;
unsigned int off;
unsigned int len;
bool fragment;
bool done;
__sum16 *csum;
fragment = false;
done = false;
off = sizeof(struct ipv6hdr);
err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
if (err < 0)
goto out;
nexthdr = ipv6_hdr(skb)->nexthdr;
len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
while (off <= len && !done) {
switch (nexthdr) {
case IPPROTO_DSTOPTS:
case IPPROTO_HOPOPTS:
case IPPROTO_ROUTING: {
struct ipv6_opt_hdr *hp;
err = skb_maybe_pull_tail(skb,
off +
sizeof(struct ipv6_opt_hdr),
MAX_IPV6_HDR_LEN);
if (err < 0)
goto out;
hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
nexthdr = hp->nexthdr;
off += ipv6_optlen(hp);
break;
}
case IPPROTO_AH: {
struct ip_auth_hdr *hp;
err = skb_maybe_pull_tail(skb,
off +
sizeof(struct ip_auth_hdr),
MAX_IPV6_HDR_LEN);
if (err < 0)
goto out;
hp = OPT_HDR(struct ip_auth_hdr, skb, off);
nexthdr = hp->nexthdr;
off += ipv6_authlen(hp);
break;
}
case IPPROTO_FRAGMENT: {
struct frag_hdr *hp;
err = skb_maybe_pull_tail(skb,
off +
sizeof(struct frag_hdr),
MAX_IPV6_HDR_LEN);
if (err < 0)
goto out;
hp = OPT_HDR(struct frag_hdr, skb, off);
if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
fragment = true;
nexthdr = hp->nexthdr;
off += sizeof(struct frag_hdr);
break;
}
default:
done = true;
break;
}
}
err = -EPROTO;
if (!done || fragment)
goto out;
csum = skb_checksum_setup_ip(skb, nexthdr, off);
if (IS_ERR(csum))
return PTR_ERR(csum);
if (recalculate)
*csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
&ipv6_hdr(skb)->daddr,
skb->len - off, nexthdr, 0);
err = 0;
out:
return err;
}
/**
* skb_checksum_setup - set up partial checksum offset
* @skb: the skb to set up
* @recalculate: if true the pseudo-header checksum will be recalculated
*/
int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
{
int err;
switch (skb->protocol) {
case htons(ETH_P_IP):
err = skb_checksum_setup_ipv4(skb, recalculate);
break;
case htons(ETH_P_IPV6):
err = skb_checksum_setup_ipv6(skb, recalculate);
break;
default:
err = -EPROTO;
break;
}
return err;
}
EXPORT_SYMBOL(skb_checksum_setup);
/**
* skb_checksum_maybe_trim - maybe trims the given skb
* @skb: the skb to check
* @transport_len: the data length beyond the network header
*
* Checks whether the given skb has data beyond the given transport length.
* If so, returns a cloned skb trimmed to this transport length.
* Otherwise returns the provided skb. Returns NULL in error cases
* (e.g. transport_len exceeds skb length or out-of-memory).
*
* Caller needs to set the skb transport header and free any returned skb if it
* differs from the provided skb.
*/
static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
unsigned int transport_len)
{
struct sk_buff *skb_chk;
unsigned int len = skb_transport_offset(skb) + transport_len;
int ret;
if (skb->len < len)
return NULL;
else if (skb->len == len)
return skb;
skb_chk = skb_clone(skb, GFP_ATOMIC);
if (!skb_chk)
return NULL;
ret = pskb_trim_rcsum(skb_chk, len);
if (ret) {
kfree_skb(skb_chk);
return NULL;
}
return skb_chk;
}
/**
* skb_checksum_trimmed - validate checksum of an skb
* @skb: the skb to check
* @transport_len: the data length beyond the network header
* @skb_chkf: checksum function to use
*
* Applies the given checksum function skb_chkf to the provided skb.
* Returns a checked and maybe trimmed skb. Returns NULL on error.
*
* If the skb has data beyond the given transport length, then a
* trimmed & cloned skb is checked and returned.
*
* Caller needs to set the skb transport header and free any returned skb if it
* differs from the provided skb.
*/
struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
unsigned int transport_len,
__sum16(*skb_chkf)(struct sk_buff *skb))
{
struct sk_buff *skb_chk;
unsigned int offset = skb_transport_offset(skb);
__sum16 ret;
skb_chk = skb_checksum_maybe_trim(skb, transport_len);
if (!skb_chk)
goto err;
if (!pskb_may_pull(skb_chk, offset))
goto err;
skb_pull_rcsum(skb_chk, offset);
ret = skb_chkf(skb_chk);
skb_push_rcsum(skb_chk, offset);
if (ret)
goto err;
return skb_chk;
err:
if (skb_chk && skb_chk != skb)
kfree_skb(skb_chk);
return NULL;
}
EXPORT_SYMBOL(skb_checksum_trimmed);
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
skb->dev->name);
}
EXPORT_SYMBOL(__skb_warn_lro_forwarding);
void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
if (head_stolen) {
skb_release_head_state(skb);
kmem_cache_free(skbuff_head_cache, skb);
} else {
__kfree_skb(skb);
}
}
EXPORT_SYMBOL(kfree_skb_partial);
/**
* skb_try_coalesce - try to merge skb to prior one
* @to: prior buffer
* @from: buffer to add
* @fragstolen: pointer to boolean
* @delta_truesize: how much more was allocated than was requested
*/
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
bool *fragstolen, int *delta_truesize)
{
struct skb_shared_info *to_shinfo, *from_shinfo;
int i, delta, len = from->len;
*fragstolen = false;
if (skb_cloned(to))
return false;
/* In general, avoid mixing slab allocated and page_pool allocated
* pages within the same SKB. However when @to is not pp_recycle and
* @from is cloned, we can transition frag pages from page_pool to
* reference counted.
*
* On the other hand, don't allow coalescing two pp_recycle SKBs if
* @from is cloned, in case the SKB is using page_pool fragment
* references (PP_FLAG_PAGE_FRAG). Since we only take full page
* references for cloned SKBs at the moment that would result in
* inconsistent reference counts.
*/
if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from)))
return false;
if (len <= skb_tailroom(to)) {
if (len)
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
*delta_truesize = 0;
return true;
}
to_shinfo = skb_shinfo(to);
from_shinfo = skb_shinfo(from);
if (to_shinfo->frag_list || from_shinfo->frag_list)
return false;
if (skb_zcopy(to) || skb_zcopy(from))
return false;
if (skb_headlen(from) != 0) {
struct page *page;
unsigned int offset;
if (to_shinfo->nr_frags +
from_shinfo->nr_frags >= MAX_SKB_FRAGS)
return false;
if (skb_head_is_locked(from))
return false;
delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
page = virt_to_head_page(from->head);
offset = from->data - (unsigned char *)page_address(page);
skb_fill_page_desc(to, to_shinfo->nr_frags,
page, offset, skb_headlen(from));
*fragstolen = true;
} else {
if (to_shinfo->nr_frags +
from_shinfo->nr_frags > MAX_SKB_FRAGS)
return false;
delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
}
WARN_ON_ONCE(delta < len);
memcpy(to_shinfo->frags + to_shinfo->nr_frags,
from_shinfo->frags,
from_shinfo->nr_frags * sizeof(skb_frag_t));
to_shinfo->nr_frags += from_shinfo->nr_frags;
if (!skb_cloned(from))
from_shinfo->nr_frags = 0;
/* if the skb is not cloned this does nothing
* since we set nr_frags to 0.
*/
for (i = 0; i < from_shinfo->nr_frags; i++)
__skb_frag_ref(&from_shinfo->frags[i]);
to->truesize += delta;
to->len += len;
to->data_len += len;
*delta_truesize = delta;
return true;
}
EXPORT_SYMBOL(skb_try_coalesce);
/**
* skb_scrub_packet - scrub an skb
*
* @skb: buffer to clean
* @xnet: packet is crossing netns
*
* skb_scrub_packet can be used after encapsulating or decapsulting a packet
* into/from a tunnel. Some information have to be cleared during these
* operations.
* skb_scrub_packet can also be used to clean a skb before injecting it in
* another namespace (@xnet == true). We have to clear all information in the
* skb that could impact namespace isolation.
*/
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
skb->pkt_type = PACKET_HOST;
skb->skb_iif = 0;
skb->ignore_df = 0;
skb_dst_drop(skb);
skb_ext_reset(skb);
nf_reset_ct(skb);
nf_reset_trace(skb);
#ifdef CONFIG_NET_SWITCHDEV
skb->offload_fwd_mark = 0;
skb->offload_l3_fwd_mark = 0;
#endif
if (!xnet)
return;
ipvs_reset(skb);
skb->mark = 0;
skb->tstamp = 0;
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);
/**
* skb_gso_transport_seglen - Return length of individual segments of a gso packet
*
* @skb: GSO skb
*
* skb_gso_transport_seglen is used to determine the real size of the
* individual segments, including Layer4 headers (TCP/UDP).
*
* The MAC/L2 or network (IP, IPv6) headers are not accounted for.
*/
static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
{
const struct skb_shared_info *shinfo = skb_shinfo(skb);
unsigned int thlen = 0;
if (skb->encapsulation) {
thlen = skb_inner_transport_header(skb) -
skb_transport_header(skb);
if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
thlen += inner_tcp_hdrlen(skb);
} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
thlen = tcp_hdrlen(skb);
} else if (unlikely(skb_is_gso_sctp(skb))) {
thlen = sizeof(struct sctphdr);
} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
thlen = sizeof(struct udphdr);
}
/* UFO sets gso_size to the size of the fragmentation
* payload, i.e. the size of the L4 (UDP) header is already
* accounted for.
*/
return thlen + shinfo->gso_size;
}
/**
* skb_gso_network_seglen - Return length of individual segments of a gso packet
*
* @skb: GSO skb
*
* skb_gso_network_seglen is used to determine the real size of the
* individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
*
* The MAC/L2 header is not accounted for.
*/
static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
{
unsigned int hdr_len = skb_transport_header(skb) -
skb_network_header(skb);
return hdr_len + skb_gso_transport_seglen(skb);
}
/**
* skb_gso_mac_seglen - Return length of individual segments of a gso packet
*
* @skb: GSO skb
*
* skb_gso_mac_seglen is used to determine the real size of the
* individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
* headers (TCP/UDP).
*/
static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
{
unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
return hdr_len + skb_gso_transport_seglen(skb);
}
/**
* skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
*
* There are a couple of instances where we have a GSO skb, and we
* want to determine what size it would be after it is segmented.
*
* We might want to check:
* - L3+L4+payload size (e.g. IP forwarding)
* - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
*
* This is a helper to do that correctly considering GSO_BY_FRAGS.
*
* @skb: GSO skb
*
* @seg_len: The segmented length (from skb_gso_*_seglen). In the
* GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
*
* @max_len: The maximum permissible length.
*
* Returns true if the segmented length <= max length.
*/
static inline bool skb_gso_size_check(const struct sk_buff *skb,
unsigned int seg_len,
unsigned int max_len) {
const struct skb_shared_info *shinfo = skb_shinfo(skb);
const struct sk_buff *iter;
if (shinfo->gso_size != GSO_BY_FRAGS)
return seg_len <= max_len;
/* Undo this so we can re-use header sizes */
seg_len -= GSO_BY_FRAGS;
skb_walk_frags(skb, iter) {
if (seg_len + skb_headlen(iter) > max_len)
return false;
}
return true;
}
/**
* skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
*
* @skb: GSO skb
* @mtu: MTU to validate against
*
* skb_gso_validate_network_len validates if a given skb will fit a
* wanted MTU once split. It considers L3 headers, L4 headers, and the
* payload.
*/
bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
{
return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
}
EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
/**
* skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
*
* @skb: GSO skb
* @len: length to validate against
*
* skb_gso_validate_mac_len validates if a given skb will fit a wanted
* length once split, including L2, L3 and L4 headers and the payload.
*/
bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
{
return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len);
}
EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
{
int mac_len, meta_len;
void *meta;
if (skb_cow(skb, skb_headroom(skb)) < 0) {
kfree_skb(skb);
return NULL;
}
mac_len = skb->data - skb_mac_header(skb);
if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) {
memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
mac_len - VLAN_HLEN - ETH_TLEN);
}
meta_len = skb_metadata_len(skb);
if (meta_len) {
meta = skb_metadata_end(skb) - meta_len;
memmove(meta + VLAN_HLEN, meta, meta_len);
}
skb->mac_header += VLAN_HLEN;
return skb;
}
struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
{
struct vlan_hdr *vhdr;
u16 vlan_tci;
if (unlikely(skb_vlan_tag_present(skb))) {
/* vlan_tci is already set-up so leave this for another time */
return skb;
}
skb = skb_share_check(skb, GFP_ATOMIC);
if (unlikely(!skb))
goto err_free;
/* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
goto err_free;
vhdr = (struct vlan_hdr *)skb->data;
vlan_tci = ntohs(vhdr->h_vlan_TCI);
__vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
skb_pull_rcsum(skb, VLAN_HLEN);
vlan_set_encap_proto(skb, vhdr);
skb = skb_reorder_vlan_header(skb);
if (unlikely(!skb))
goto err_free;
skb_reset_network_header(skb);
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
return skb;
err_free:
kfree_skb(skb);
return NULL;
}
EXPORT_SYMBOL(skb_vlan_untag);
int skb_ensure_writable(struct sk_buff *skb, int write_len)
{
if (!pskb_may_pull(skb, write_len))
return -ENOMEM;
if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
return 0; return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
}
EXPORT_SYMBOL(skb_ensure_writable);
/* remove VLAN header from packet and update csum accordingly.
* expects a non skb_vlan_tag_present skb with a vlan tag payload
*/
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
struct vlan_hdr *vhdr;
int offset = skb->data - skb_mac_header(skb);
int err;
if (WARN_ONCE(offset,
"__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
offset)) {
return -EINVAL;
}
err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
if (unlikely(err))
return err;
skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
*vlan_tci = ntohs(vhdr->h_vlan_TCI);
memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
__skb_pull(skb, VLAN_HLEN);
vlan_set_encap_proto(skb, vhdr);
skb->mac_header += VLAN_HLEN;
if (skb_network_offset(skb) < ETH_HLEN)
skb_set_network_header(skb, ETH_HLEN);
skb_reset_mac_len(skb);
return err;
}
EXPORT_SYMBOL(__skb_vlan_pop);
/* Pop a vlan tag either from hwaccel or from payload.
* Expects skb->data at mac header.
*/
int skb_vlan_pop(struct sk_buff *skb)
{
u16 vlan_tci;
__be16 vlan_proto;
int err;
if (likely(skb_vlan_tag_present(skb))) {
__vlan_hwaccel_clear_tag(skb);
} else {
if (unlikely(!eth_type_vlan(skb->protocol)))
return 0;
err = __skb_vlan_pop(skb, &vlan_tci);
if (err)
return err;
}
/* move next vlan tag to hw accel tag */
if (likely(!eth_type_vlan(skb->protocol)))
return 0;
vlan_proto = skb->protocol;
err = __skb_vlan_pop(skb, &vlan_tci);
if (unlikely(err))
return err;
__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
return 0;
}
EXPORT_SYMBOL(skb_vlan_pop);
/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
* Expects skb->data at mac header.
*/
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
{
if (skb_vlan_tag_present(skb)) {
int offset = skb->data - skb_mac_header(skb);
int err;
if (WARN_ONCE(offset,
"skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
offset)) {
return -EINVAL;
}
err = __vlan_insert_tag(skb, skb->vlan_proto,
skb_vlan_tag_get(skb));
if (err)
return err;
skb->protocol = skb->vlan_proto;
skb->mac_len += VLAN_HLEN;
skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
}
__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
return 0;
}
EXPORT_SYMBOL(skb_vlan_push);
/**
* skb_eth_pop() - Drop the Ethernet header at the head of a packet
*
* @skb: Socket buffer to modify
*
* Drop the Ethernet header of @skb.
*
* Expects that skb->data points to the mac header and that no VLAN tags are
* present.
*
* Returns 0 on success, -errno otherwise.
*/
int skb_eth_pop(struct sk_buff *skb)
{
if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) ||
skb_network_offset(skb) < ETH_HLEN)
return -EPROTO;
skb_pull_rcsum(skb, ETH_HLEN);
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);
return 0;
}
EXPORT_SYMBOL(skb_eth_pop);
/**
* skb_eth_push() - Add a new Ethernet header at the head of a packet
*
* @skb: Socket buffer to modify
* @dst: Destination MAC address of the new header
* @src: Source MAC address of the new header
*
* Prepend @skb with a new Ethernet header.
*
* Expects that skb->data points to the mac header, which must be empty.
*
* Returns 0 on success, -errno otherwise.
*/
int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
const unsigned char *src)
{
struct ethhdr *eth;
int err;
if (skb_network_offset(skb) || skb_vlan_tag_present(skb))
return -EPROTO;
err = skb_cow_head(skb, sizeof(*eth));
if (err < 0)
return err;
skb_push(skb, sizeof(*eth));
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);
eth = eth_hdr(skb);
ether_addr_copy(eth->h_dest, dst);
ether_addr_copy(eth->h_source, src);
eth->h_proto = skb->protocol;
skb_postpush_rcsum(skb, eth, sizeof(*eth));
return 0;
}
EXPORT_SYMBOL(skb_eth_push);
/* Update the ethertype of hdr and the skb csum value if required. */
static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
__be16 ethertype)
{
if (skb->ip_summed == CHECKSUM_COMPLETE) {
__be16 diff[] = { ~hdr->h_proto, ethertype };
skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
}
hdr->h_proto = ethertype;
}
/**
* skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
* the packet
*
* @skb: buffer
* @mpls_lse: MPLS label stack entry to push
* @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
* @mac_len: length of the MAC header
* @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
* ethernet
*
* Expects skb->data at mac header.
*
* Returns 0 on success, -errno otherwise.
*/
int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
int mac_len, bool ethernet)
{
struct mpls_shim_hdr *lse;
int err;
if (unlikely(!eth_p_mpls(mpls_proto)))
return -EINVAL;
/* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
if (skb->encapsulation)
return -EINVAL;
err = skb_cow_head(skb, MPLS_HLEN);
if (unlikely(err))
return err;
if (!skb->inner_protocol) {
skb_set_inner_network_header(skb, skb_network_offset(skb));
skb_set_inner_protocol(skb, skb->protocol);
}
skb_push(skb, MPLS_HLEN);
memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
mac_len);
skb_reset_mac_header(skb);
skb_set_network_header(skb, mac_len);
skb_reset_mac_len(skb);
lse = mpls_hdr(skb);
lse->label_stack_entry = mpls_lse;
skb_postpush_rcsum(skb, lse, MPLS_HLEN);
if (ethernet && mac_len >= ETH_HLEN)
skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
skb->protocol = mpls_proto;
return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_push);
/**
* skb_mpls_pop() - pop the outermost MPLS header
*
* @skb: buffer
* @next_proto: ethertype of header after popped MPLS header
* @mac_len: length of the MAC header
* @ethernet: flag to indicate if the packet is ethernet
*
* Expects skb->data at mac header.
*
* Returns 0 on success, -errno otherwise.
*/
int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
bool ethernet)
{
int err;
if (unlikely(!eth_p_mpls(skb->protocol)))
return 0;
err = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
if (unlikely(err))
return err;
skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
mac_len);
__skb_pull(skb, MPLS_HLEN);
skb_reset_mac_header(skb);
skb_set_network_header(skb, mac_len);
if (ethernet && mac_len >= ETH_HLEN) {
struct ethhdr *hdr;
/* use mpls_hdr() to get ethertype to account for VLANs. */
hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
skb_mod_eth_type(skb, hdr, next_proto);
}
skb->protocol = next_proto;
return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_pop);
/**
* skb_mpls_update_lse() - modify outermost MPLS header and update csum
*
* @skb: buffer
* @mpls_lse: new MPLS label stack entry to update to
*
* Expects skb->data at mac header.
*
* Returns 0 on success, -errno otherwise.
*/
int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
{
int err;
if (unlikely(!eth_p_mpls(skb->protocol)))
return -EINVAL;
err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
if (unlikely(err))
return err;
if (skb->ip_summed == CHECKSUM_COMPLETE) {
__be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse };
skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
}
mpls_hdr(skb)->label_stack_entry = mpls_lse;
return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
/**
* skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
*
* @skb: buffer
*
* Expects skb->data at mac header.
*
* Returns 0 on success, -errno otherwise.
*/
int skb_mpls_dec_ttl(struct sk_buff *skb)
{
u32 lse;
u8 ttl;
if (unlikely(!eth_p_mpls(skb->protocol)))
return -EINVAL;
if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
return -ENOMEM;
lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
if (!--ttl)
return -EINVAL;
lse &= ~MPLS_LS_TTL_MASK;
lse |= ttl << MPLS_LS_TTL_SHIFT;
return skb_mpls_update_lse(skb, cpu_to_be32(lse));
}
EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
/**
* alloc_skb_with_frags - allocate skb with page frags
*
* @header_len: size of linear part
* @data_len: needed length in frags
* @max_page_order: max page order desired.
* @errcode: pointer to error code if any
* @gfp_mask: allocation mask
*
* This can be used to allocate a paged skb, given a maximal order for frags.
*/
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
unsigned long data_len,
int max_page_order,
int *errcode,
gfp_t gfp_mask)
{
int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
unsigned long chunk;
struct sk_buff *skb;
struct page *page;
int i;
*errcode = -EMSGSIZE;
/* Note this test could be relaxed, if we succeed to allocate
* high order pages...
*/
if (npages > MAX_SKB_FRAGS)
return NULL;
*errcode = -ENOBUFS;
skb = alloc_skb(header_len, gfp_mask);
if (!skb)
return NULL;
skb->truesize += npages << PAGE_SHIFT;
for (i = 0; npages > 0; i++) {
int order = max_page_order;
while (order) { if (npages >= 1 << order) { page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
__GFP_COMP |
__GFP_NOWARN,
order);
if (page)
goto fill_page;
/* Do not retry other high order allocations */
order = 1;
max_page_order = 0;
}
order--;
}
page = alloc_page(gfp_mask);
if (!page)
goto failure;
fill_page:
chunk = min_t(unsigned long, data_len,
PAGE_SIZE << order);
skb_fill_page_desc(skb, i, page, 0, chunk);
data_len -= chunk;
npages -= 1 << order;
}
return skb;
failure:
kfree_skb(skb);
return NULL;
}
EXPORT_SYMBOL(alloc_skb_with_frags);
/* carve out the first off bytes from skb when off < headlen */
static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
const int headlen, gfp_t gfp_mask)
{
int i;
int size = skb_end_offset(skb);
int new_hlen = headlen - off;
u8 *data;
size = SKB_DATA_ALIGN(size);
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
data = kmalloc_reserve(size +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
size = SKB_WITH_OVERHEAD(ksize(data));
/* Copy real data, and all frags */
skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
skb->len -= off;
memcpy((struct skb_shared_info *)(data + size),
skb_shinfo(skb),
offsetof(struct skb_shared_info,
frags[skb_shinfo(skb)->nr_frags]));
if (skb_cloned(skb)) {
/* drop the old head gracefully */
if (skb_orphan_frags(skb, gfp_mask)) {
kfree(data);
return -ENOMEM;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
skb_release_data(skb);
} else {
/* we can reuse existing recount- all we did was
* relocate values
*/
skb_free_head(skb);
}
skb->head = data;
skb->data = data;
skb->head_frag = 0;
skb_set_end_offset(skb, size);
skb_set_tail_pointer(skb, skb_headlen(skb));
skb_headers_offset_update(skb, 0);
skb->cloned = 0;
skb->hdr_len = 0;
skb->nohdr = 0;
atomic_set(&skb_shinfo(skb)->dataref, 1);
return 0;
}
static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
/* carve out the first eat bytes from skb's frag_list. May recurse into
* pskb_carve()
*/
static int pskb_carve_frag_list(struct sk_buff *skb,
struct skb_shared_info *shinfo, int eat,
gfp_t gfp_mask)
{
struct sk_buff *list = shinfo->frag_list;
struct sk_buff *clone = NULL;
struct sk_buff *insp = NULL;
do {
if (!list) {
pr_err("Not enough bytes to eat. Want %d\n", eat);
return -EFAULT;
}
if (list->len <= eat) {
/* Eaten as whole. */
eat -= list->len;
list = list->next;
insp = list;
} else {
/* Eaten partially. */
if (skb_shared(list)) {
clone = skb_clone(list, gfp_mask);
if (!clone)
return -ENOMEM;
insp = list->next;
list = clone;
} else {
/* This may be pulled without problems. */
insp = list;
}
if (pskb_carve(list, eat, gfp_mask) < 0) {
kfree_skb(clone);
return -ENOMEM;
}
break;
}
} while (eat);
/* Free pulled out fragments. */
while ((list = shinfo->frag_list) != insp) {
shinfo->frag_list = list->next;
consume_skb(list);
}
/* And insert new clone at head. */
if (clone) {
clone->next = list;
shinfo->frag_list = clone;
}
return 0;
}
/* carve off first len bytes from skb. Split line (off) is in the
* non-linear part of skb
*/
static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
int pos, gfp_t gfp_mask)
{
int i, k = 0;
int size = skb_end_offset(skb);
u8 *data;
const int nfrags = skb_shinfo(skb)->nr_frags;
struct skb_shared_info *shinfo;
size = SKB_DATA_ALIGN(size);
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
data = kmalloc_reserve(size +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
size = SKB_WITH_OVERHEAD(ksize(data));
memcpy((struct skb_shared_info *)(data + size),
skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
if (skb_orphan_frags(skb, gfp_mask)) {
kfree(data);
return -ENOMEM;
}
shinfo = (struct skb_shared_info *)(data + size);
for (i = 0; i < nfrags; i++) {
int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (pos + fsize > off) {
shinfo->frags[k] = skb_shinfo(skb)->frags[i];
if (pos < off) {
/* Split frag.
* We have two variants in this case:
* 1. Move all the frag to the second
* part, if it is possible. F.e.
* this approach is mandatory for TUX,
* where splitting is expensive.
* 2. Split is accurately. We make this.
*/
skb_frag_off_add(&shinfo->frags[0], off - pos);
skb_frag_size_sub(&shinfo->frags[0], off - pos);
}
skb_frag_ref(skb, i);
k++;
}
pos += fsize;
}
shinfo->nr_frags = k;
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
/* split line is in frag list */
if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
if (skb_has_frag_list(skb))
kfree_skb_list(skb_shinfo(skb)->frag_list);
kfree(data);
return -ENOMEM;
}
skb_release_data(skb);
skb->head = data;
skb->head_frag = 0;
skb->data = data;
skb_set_end_offset(skb, size);
skb_reset_tail_pointer(skb);
skb_headers_offset_update(skb, 0);
skb->cloned = 0;
skb->hdr_len = 0;
skb->nohdr = 0;
skb->len -= off;
skb->data_len = skb->len;
atomic_set(&skb_shinfo(skb)->dataref, 1);
return 0;
}
/* remove len bytes from the beginning of the skb */
static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
{
int headlen = skb_headlen(skb);
if (len < headlen)
return pskb_carve_inside_header(skb, len, headlen, gfp);
else
return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
}
/* Extract to_copy bytes starting at off from skb, and return this in
* a new skb
*/
struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
int to_copy, gfp_t gfp)
{
struct sk_buff *clone = skb_clone(skb, gfp);
if (!clone)
return NULL;
if (pskb_carve(clone, off, gfp) < 0 ||
pskb_trim(clone, to_copy)) {
kfree_skb(clone);
return NULL;
}
return clone;
}
EXPORT_SYMBOL(pskb_extract);
/**
* skb_condense - try to get rid of fragments/frag_list if possible
* @skb: buffer
*
* Can be used to save memory before skb is added to a busy queue.
* If packet has bytes in frags and enough tail room in skb->head,
* pull all of them, so that we can free the frags right now and adjust
* truesize.
* Notes:
* We do not reallocate skb->head thus can not fail.
* Caller must re-evaluate skb->truesize if needed.
*/
void skb_condense(struct sk_buff *skb)
{
if (skb->data_len) {
if (skb->data_len > skb->end - skb->tail ||
skb_cloned(skb))
return;
/* Nice, we can free page frag(s) right now */
__pskb_pull_tail(skb, skb->data_len);
}
/* At this point, skb->truesize might be over estimated,
* because skb had a fragment, and fragments do not tell
* their truesize.
* When we pulled its content into skb->head, fragment
* was freed, but __pskb_pull_tail() could not possibly
* adjust skb->truesize, not knowing the frag truesize.
*/
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
}
#ifdef CONFIG_SKB_EXTENSIONS
static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
{
return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
}
/**
* __skb_ext_alloc - allocate a new skb extensions storage
*
* @flags: See kmalloc().
*
* Returns the newly allocated pointer. The pointer can later attached to a
* skb via __skb_ext_set().
* Note: caller must handle the skb_ext as an opaque data.
*/
struct skb_ext *__skb_ext_alloc(gfp_t flags)
{
struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
if (new) {
memset(new->offset, 0, sizeof(new->offset));
refcount_set(&new->refcnt, 1);
}
return new;
}
static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
unsigned int old_active)
{
struct skb_ext *new;
if (refcount_read(&old->refcnt) == 1)
return old;
new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
if (!new)
return NULL;
memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
refcount_set(&new->refcnt, 1);
#ifdef CONFIG_XFRM
if (old_active & (1 << SKB_EXT_SEC_PATH)) {
struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
unsigned int i;
for (i = 0; i < sp->len; i++)
xfrm_state_hold(sp->xvec[i]);
}
#endif
__skb_ext_put(old);
return new;
}
/**
* __skb_ext_set - attach the specified extension storage to this skb
* @skb: buffer
* @id: extension id
* @ext: extension storage previously allocated via __skb_ext_alloc()
*
* Existing extensions, if any, are cleared.
*
* Returns the pointer to the extension.
*/
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
struct skb_ext *ext)
{
unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);
skb_ext_put(skb);
newlen = newoff + skb_ext_type_len[id];
ext->chunks = newlen;
ext->offset[id] = newoff;
skb->extensions = ext;
skb->active_extensions = 1 << id;
return skb_ext_get_ptr(ext, id);
}
/**
* skb_ext_add - allocate space for given extension, COW if needed
* @skb: buffer
* @id: extension to allocate space for
*
* Allocates enough space for the given extension.
* If the extension is already present, a pointer to that extension
* is returned.
*
* If the skb was cloned, COW applies and the returned memory can be
* modified without changing the extension space of clones buffers.
*
* Returns pointer to the extension or NULL on allocation failure.
*/
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
{
struct skb_ext *new, *old = NULL;
unsigned int newlen, newoff;
if (skb->active_extensions) {
old = skb->extensions;
new = skb_ext_maybe_cow(old, skb->active_extensions);
if (!new)
return NULL;
if (__skb_ext_exist(new, id))
goto set_active;
newoff = new->chunks;
} else {
newoff = SKB_EXT_CHUNKSIZEOF(*new);
new = __skb_ext_alloc(GFP_ATOMIC);
if (!new)
return NULL;
}
newlen = newoff + skb_ext_type_len[id];
new->chunks = newlen;
new->offset[id] = newoff;
set_active:
skb->slow_gro = 1;
skb->extensions = new;
skb->active_extensions |= 1 << id;
return skb_ext_get_ptr(new, id);
}
EXPORT_SYMBOL(skb_ext_add);
#ifdef CONFIG_XFRM
static void skb_ext_put_sp(struct sec_path *sp)
{
unsigned int i;
for (i = 0; i < sp->len; i++)
xfrm_state_put(sp->xvec[i]);
}
#endif
void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
struct skb_ext *ext = skb->extensions;
skb->active_extensions &= ~(1 << id);
if (skb->active_extensions == 0) {
skb->extensions = NULL;
__skb_ext_put(ext);
#ifdef CONFIG_XFRM
} else if (id == SKB_EXT_SEC_PATH &&
refcount_read(&ext->refcnt) == 1) {
struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);
skb_ext_put_sp(sp);
sp->len = 0;
#endif
}
}
EXPORT_SYMBOL(__skb_ext_del);
void __skb_ext_put(struct skb_ext *ext)
{
/* If this is last clone, nothing can increment
* it after check passes. Avoids one atomic op.
*/
if (refcount_read(&ext->refcnt) == 1)
goto free_now;
if (!refcount_dec_and_test(&ext->refcnt))
return;
free_now:
#ifdef CONFIG_XFRM
if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
#endif
kmem_cache_free(skbuff_ext_cache, ext);
}
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/parser.h>
#include <linux/ctype.h>
#include <linux/namei.h>
#include <linux/miscdevice.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/cleancache.h>
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/btrfs.h>
#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "props.h"
#include "xattr.h"
#include "volumes.h"
#include "export.h"
#include "compression.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "free-space-cache.h"
#include "backref.h"
#include "space-info.h"
#include "sysfs.h"
#include "zoned.h"
#include "tests/btrfs-tests.h"
#include "block-group.h"
#include "discard.h"
#include "qgroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
/*
* Types for mounting the default subvolume and a subvolume explicitly
* requested by subvol=/path. That way the callchain is straightforward and we
* don't have to play tricks with the mount options and recursive calls to
* btrfs_mount.
*
* The new btrfs_root_fs_type also servers as a tag for the bdev_holder.
*/
static struct file_system_type btrfs_fs_type;
static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
/*
* Generally the error codes correspond to their respective errors, but there
* are a few special cases.
*
* EUCLEAN: Any sort of corruption that we encounter. The tree-checker for
* instance will return EUCLEAN if any of the blocks are corrupted in
* a way that is problematic. We want to reserve EUCLEAN for these
* sort of corruptions.
*
* EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
* need to use EROFS for this case. We will have no idea of the
* original failure, that will have been reported at the time we tripped
* over the error. Each subsequent error that doesn't have any context
* of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
*/
const char * __attribute_const__ btrfs_decode_error(int errno)
{
char *errstr = "unknown";
switch (errno) {
case -ENOENT: /* -2 */
errstr = "No such entry";
break;
case -EIO: /* -5 */
errstr = "IO failure";
break;
case -ENOMEM: /* -12*/
errstr = "Out of memory";
break;
case -EEXIST: /* -17 */
errstr = "Object already exists";
break;
case -ENOSPC: /* -28 */
errstr = "No space left";
break;
case -EROFS: /* -30 */
errstr = "Readonly filesystem";
break;
case -EOPNOTSUPP: /* -95 */
errstr = "Operation not supported";
break;
case -EUCLEAN: /* -117 */
errstr = "Filesystem corrupted";
break;
case -EDQUOT: /* -122 */
errstr = "Quota exceeded";
break;
}
return errstr;
}
/*
* __btrfs_handle_fs_error decodes expected errors from the caller and
* invokes the appropriate error response.
*/
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
#ifdef CONFIG_PRINTK
const char *errstr;
#endif
/*
* Special case: if the error is EROFS, and we're already
* under SB_RDONLY, then it is safe here.
*/
if (errno == -EROFS && sb_rdonly(sb))
return;
#ifdef CONFIG_PRINTK
errstr = btrfs_decode_error(errno);
if (fmt) {
struct va_format vaf;
va_list args;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
sb->s_id, function, line, errno, errstr, &vaf);
va_end(args);
} else {
pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
sb->s_id, function, line, errno, errstr);
}
#endif
/*
* Today we only save the error info to memory. Long term we'll
* also send it down to the disk
*/
set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
/* Don't go through full error handling during mount */
if (!(sb->s_flags & SB_BORN))
return;
if (sb_rdonly(sb))
return;
btrfs_discard_stop(fs_info);
/* btrfs handle error by forcing the filesystem readonly */
btrfs_set_sb_rdonly(sb);
btrfs_info(fs_info, "forced readonly");
/*
* Note that a running device replace operation is not canceled here
* although there is no way to update the progress. It would add the
* risk of a deadlock, therefore the canceling is omitted. The only
* penalty is that some I/O remains active until the procedure
* completes. The next time when the filesystem is mounted writable
* again, the device replace operation continues.
*/
}
#ifdef CONFIG_PRINTK
static const char * const logtypes[] = {
"emergency",
"alert",
"critical",
"error",
"warning",
"notice",
"info",
"debug",
};
/*
* Use one ratelimit state per log level so that a flood of less important
* messages doesn't cause more important ones to be dropped.
*/
static struct ratelimit_state printk_limits[] = {
RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
va_list args;
int kern_level;
const char *type = logtypes[4];
struct ratelimit_state *ratelimit = &printk_limits[4];
va_start(args, fmt);
while ((kern_level = printk_get_level(fmt)) != 0) {
size_t size = printk_skip_level(fmt) - fmt;
if (kern_level >= '0' && kern_level <= '7') {
memcpy(lvl, fmt, size);
lvl[size] = '\0';
type = logtypes[kern_level - '0'];
ratelimit = &printk_limits[kern_level - '0'];
}
fmt += size;
}
vaf.fmt = fmt;
vaf.va = &args;
if (__ratelimit(ratelimit)) {
if (fs_info) printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
fs_info->sb->s_id, &vaf);
else
printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
}
va_end(args);
}
#endif
#if BITS_PER_LONG == 32
void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
{
if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
btrfs_warn(fs_info,
"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
BTRFS_32BIT_MAX_FILE_SIZE >> 40);
btrfs_warn(fs_info,
"please consider upgrading to 64bit kernel/hardware");
}
}
void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
{
if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
btrfs_err(fs_info, "reached 32bit limit for logical addresses");
btrfs_err(fs_info,
"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
BTRFS_32BIT_MAX_FILE_SIZE >> 40);
btrfs_err(fs_info,
"please consider upgrading to 64bit kernel/hardware");
}
}
#endif
/*
* We only mark the transaction aborted and then set the file system read-only.
* This will prevent new transactions from starting or trying to join this
* one.
*
* This means that error recovery at the call site is limited to freeing
* any local memory allocations and passing the error code up without
* further cleanup. The transaction should complete as it normally would
* in the call path but will return -EIO.
*
* We'll complete the cleanup in btrfs_end_transaction and
* btrfs_commit_transaction.
*/
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
unsigned int line, int errno)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
WRITE_ONCE(trans->aborted, errno);
WRITE_ONCE(trans->transaction->aborted, errno);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
__btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
}
/*
* __btrfs_panic decodes unexpected, fatal errors from the caller,
* issues an alert, and either panics or BUGs, depending on mount options.
*/
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
char *s_id = "<unknown>";
const char *errstr;
struct va_format vaf = { .fmt = fmt };
va_list args;
if (fs_info)
s_id = fs_info->sb->s_id;
va_start(args, fmt);
vaf.va = &args;
errstr = btrfs_decode_error(errno);
if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
s_id, function, line, &vaf, errno, errstr);
btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
function, line, &vaf, errno, errstr);
va_end(args);
/* Caller calls BUG() */
}
static void btrfs_put_super(struct super_block *sb)
{
close_ctree(btrfs_sb(sb));
}
enum {
Opt_acl, Opt_noacl,
Opt_clear_cache,
Opt_commit_interval,
Opt_compress,
Opt_compress_force,
Opt_compress_force_type,
Opt_compress_type,
Opt_degraded,
Opt_device,
Opt_fatal_errors,
Opt_flushoncommit, Opt_noflushoncommit,
Opt_max_inline,
Opt_barrier, Opt_nobarrier,
Opt_datacow, Opt_nodatacow,
Opt_datasum, Opt_nodatasum,
Opt_defrag, Opt_nodefrag,
Opt_discard, Opt_nodiscard,
Opt_discard_mode,
Opt_norecovery,
Opt_ratio,
Opt_rescan_uuid_tree,
Opt_skip_balance,
Opt_space_cache, Opt_no_space_cache,
Opt_space_cache_version,
Opt_ssd, Opt_nossd,
Opt_ssd_spread, Opt_nossd_spread,
Opt_subvol,
Opt_subvol_empty,
Opt_subvolid,
Opt_thread_pool,
Opt_treelog, Opt_notreelog,
Opt_user_subvol_rm_allowed,
/* Rescue options */
Opt_rescue,
Opt_usebackuproot,
Opt_nologreplay,
Opt_ignorebadroots,
Opt_ignoredatacsums,
Opt_rescue_all,
/* Deprecated options */
Opt_recovery,
Opt_inode_cache, Opt_noinode_cache,
/* Debugging options */
Opt_check_integrity,
Opt_check_integrity_including_extent_data,
Opt_check_integrity_print_mask,
Opt_enospc_debug, Opt_noenospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
#endif
Opt_err,
};
static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_clear_cache, "clear_cache"},
{Opt_commit_interval, "commit=%u"},
{Opt_compress, "compress"},
{Opt_compress_type, "compress=%s"},
{Opt_compress_force, "compress-force"},
{Opt_compress_force_type, "compress-force=%s"},
{Opt_degraded, "degraded"},
{Opt_device, "device=%s"},
{Opt_fatal_errors, "fatal_errors=%s"},
{Opt_flushoncommit, "flushoncommit"},
{Opt_noflushoncommit, "noflushoncommit"},
{Opt_inode_cache, "inode_cache"},
{Opt_noinode_cache, "noinode_cache"},
{Opt_max_inline, "max_inline=%s"},
{Opt_barrier, "barrier"},
{Opt_nobarrier, "nobarrier"},
{Opt_datacow, "datacow"},
{Opt_nodatacow, "nodatacow"},
{Opt_datasum, "datasum"},
{Opt_nodatasum, "nodatasum"},
{Opt_defrag, "autodefrag"},
{Opt_nodefrag, "noautodefrag"},
{Opt_discard, "discard"},
{Opt_discard_mode, "discard=%s"},
{Opt_nodiscard, "nodiscard"},
{Opt_norecovery, "norecovery"},
{Opt_ratio, "metadata_ratio=%u"},
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
{Opt_skip_balance, "skip_balance"},
{Opt_space_cache, "space_cache"},
{Opt_no_space_cache, "nospace_cache"},
{Opt_space_cache_version, "space_cache=%s"},
{Opt_ssd, "ssd"},
{Opt_nossd, "nossd"},
{Opt_ssd_spread, "ssd_spread"},
{Opt_nossd_spread, "nossd_spread"},
{Opt_subvol, "subvol=%s"},
{Opt_subvol_empty, "subvol="},
{Opt_subvolid, "subvolid=%s"},
{Opt_thread_pool, "thread_pool=%u"},
{Opt_treelog, "treelog"},
{Opt_notreelog, "notreelog"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
/* Rescue options */
{Opt_rescue, "rescue=%s"},
/* Deprecated, with alias rescue=nologreplay */
{Opt_nologreplay, "nologreplay"},
/* Deprecated, with alias rescue=usebackuproot */
{Opt_usebackuproot, "usebackuproot"},
/* Deprecated options */
{Opt_recovery, "recovery"},
/* Debugging options */
{Opt_check_integrity, "check_int"},
{Opt_check_integrity_including_extent_data, "check_int_data"},
{Opt_check_integrity_print_mask, "check_int_print_mask=%u"},
{Opt_enospc_debug, "enospc_debug"},
{Opt_noenospc_debug, "noenospc_debug"},
#ifdef CONFIG_BTRFS_DEBUG
{Opt_fragment_data, "fragment=data"},
{Opt_fragment_metadata, "fragment=metadata"},
{Opt_fragment_all, "fragment=all"},
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
{Opt_ref_verify, "ref_verify"},
#endif
{Opt_err, NULL},
};
static const match_table_t rescue_tokens = {
{Opt_usebackuproot, "usebackuproot"},
{Opt_nologreplay, "nologreplay"},
{Opt_ignorebadroots, "ignorebadroots"},
{Opt_ignorebadroots, "ibadroots"},
{Opt_ignoredatacsums, "ignoredatacsums"},
{Opt_ignoredatacsums, "idatacsums"},
{Opt_rescue_all, "all"},
{Opt_err, NULL},
};
static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
const char *opt_name)
{
if (fs_info->mount_opt & opt) { btrfs_err(fs_info, "%s must be used with ro mount option",
opt_name);
return true;
}
return false;
}
static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
{
char *opts;
char *orig;
char *p;
substring_t args[MAX_OPT_ARGS];
int ret = 0;
opts = kstrdup(options, GFP_KERNEL);
if (!opts)
return -ENOMEM;
orig = opts;
while ((p = strsep(&opts, ":")) != NULL) {
int token;
if (!*p)
continue;
token = match_token(p, rescue_tokens, args);
switch (token){
case Opt_usebackuproot:
btrfs_info(info,
"trying to use backup root at mount time");
btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
break;
case Opt_nologreplay:
btrfs_set_and_info(info, NOLOGREPLAY,
"disabling log replay at mount time");
break;
case Opt_ignorebadroots:
btrfs_set_and_info(info, IGNOREBADROOTS,
"ignoring bad roots");
break;
case Opt_ignoredatacsums:
btrfs_set_and_info(info, IGNOREDATACSUMS,
"ignoring data csums");
break;
case Opt_rescue_all:
btrfs_info(info, "enabling all of the rescue options");
btrfs_set_and_info(info, IGNOREDATACSUMS,
"ignoring data csums");
btrfs_set_and_info(info, IGNOREBADROOTS,
"ignoring bad roots");
btrfs_set_and_info(info, NOLOGREPLAY,
"disabling log replay at mount time");
break;
case Opt_err:
btrfs_info(info, "unrecognized rescue option '%s'", p);
ret = -EINVAL;
goto out;
default:
break;
}
}
out:
kfree(orig);
return ret;
}
/*
* Regular mount options parser. Everything that is needed only when
* reading in a new superblock is parsed here.
* XXX JDM: This needs to be cleaned up for remount.
*/
int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags)
{
substring_t args[MAX_OPT_ARGS];
char *p, *num;
int intarg;
int ret = 0;
char *compress_type;
bool compress_force = false;
enum btrfs_compression_type saved_compress_type;
int saved_compress_level;
bool saved_compress_force;
int no_compress = 0;
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); else if (btrfs_free_space_cache_v1_active(info)) { if (btrfs_is_zoned(info)) { btrfs_info(info,
"zoned: clearing existing space cache");
btrfs_set_super_cache_generation(info->super_copy, 0);
} else {
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
}
}
/*
* Even the options are empty, we still need to do extra check
* against new flags
*/
if (!options)
goto check;
while ((p = strsep(&options, ",")) != NULL) {
int token;
if (!*p)
continue;
token = match_token(p, tokens, args); switch (token) {
case Opt_degraded:
btrfs_info(info, "allowing degraded mounts");
btrfs_set_opt(info->mount_opt, DEGRADED);
break;
case Opt_subvol:
case Opt_subvol_empty:
case Opt_subvolid:
case Opt_device:
/*
* These are parsed by btrfs_parse_subvol_options or
* btrfs_parse_device_options and can be ignored here.
*/
break;
case Opt_nodatasum:
btrfs_set_and_info(info, NODATASUM,
"setting nodatasum");
break;
case Opt_datasum:
if (btrfs_test_opt(info, NODATASUM)) { if (btrfs_test_opt(info, NODATACOW)) btrfs_info(info,
"setting datasum, datacow enabled");
else
btrfs_info(info, "setting datasum");
}
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
break;
case Opt_nodatacow:
if (!btrfs_test_opt(info, NODATACOW)) { if (!btrfs_test_opt(info, COMPRESS) ||
!btrfs_test_opt(info, FORCE_COMPRESS)) {
btrfs_info(info,
"setting nodatacow, compression disabled");
} else {
btrfs_info(info, "setting nodatacow");
}
}
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
btrfs_set_opt(info->mount_opt, NODATACOW);
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
case Opt_datacow:
btrfs_clear_and_info(info, NODATACOW,
"setting datacow");
break;
case Opt_compress_force:
case Opt_compress_force_type:
compress_force = true;
fallthrough;
case Opt_compress:
case Opt_compress_type:
saved_compress_type = btrfs_test_opt(info,
COMPRESS) ?
info->compress_type : BTRFS_COMPRESS_NONE;
saved_compress_force =
btrfs_test_opt(info, FORCE_COMPRESS);
saved_compress_level = info->compress_level;
if (token == Opt_compress ||
token == Opt_compress_force ||
strncmp(args[0].from, "zlib", 4) == 0) {
compress_type = "zlib";
info->compress_type = BTRFS_COMPRESS_ZLIB;
info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
/*
* args[0] contains uninitialized data since
* for these tokens we don't expect any
* parameter.
*/
if (token != Opt_compress &&
token != Opt_compress_force)
info->compress_level =
btrfs_compress_str2level(
BTRFS_COMPRESS_ZLIB,
args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
no_compress = 0;
} else if (strncmp(args[0].from, "lzo", 3) == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
info->compress_level = 0;
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
btrfs_set_fs_incompat(info, COMPRESS_LZO);
no_compress = 0;
} else if (strncmp(args[0].from, "zstd", 4) == 0) {
compress_type = "zstd";
info->compress_type = BTRFS_COMPRESS_ZSTD;
info->compress_level =
btrfs_compress_str2level(
BTRFS_COMPRESS_ZSTD,
args[0].from + 4);
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
no_compress = 0;
} else if (strncmp(args[0].from, "no", 2) == 0) {
compress_type = "no";
info->compress_level = 0;
info->compress_type = 0;
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
compress_force = false;
no_compress++;
} else {
ret = -EINVAL;
goto out;
}
if (compress_force) { btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
} else {
/*
* If we remount from compress-force=xxx to
* compress=xxx, we need clear FORCE_COMPRESS
* flag, otherwise, there is no way for users
* to disable forcible compression separately.
*/
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
if (no_compress == 1) {
btrfs_info(info, "use no compression"); } else if ((info->compress_type != saved_compress_type) || (compress_force != saved_compress_force) ||
(info->compress_level != saved_compress_level)) {
btrfs_info(info, "%s %s compression, level %d",
(compress_force) ? "force" : "use",
compress_type, info->compress_level);
}
compress_force = false;
break;
case Opt_ssd:
btrfs_set_and_info(info, SSD,
"enabling ssd optimizations");
btrfs_clear_opt(info->mount_opt, NOSSD);
break;
case Opt_ssd_spread:
btrfs_set_and_info(info, SSD,
"enabling ssd optimizations");
btrfs_set_and_info(info, SSD_SPREAD,
"using spread ssd allocation scheme");
btrfs_clear_opt(info->mount_opt, NOSSD);
break;
case Opt_nossd:
btrfs_set_opt(info->mount_opt, NOSSD); btrfs_clear_and_info(info, SSD,
"not using ssd optimizations");
fallthrough;
case Opt_nossd_spread:
btrfs_clear_and_info(info, SSD_SPREAD,
"not using spread ssd allocation scheme");
break;
case Opt_barrier:
btrfs_clear_and_info(info, NOBARRIER,
"turning on barriers");
break;
case Opt_nobarrier:
btrfs_set_and_info(info, NOBARRIER,
"turning off barriers");
break;
case Opt_thread_pool:
ret = match_int(&args[0], &intarg);
if (ret) {
goto out;
} else if (intarg == 0) {
ret = -EINVAL;
goto out;
}
info->thread_pool_size = intarg;
break;
case Opt_max_inline:
num = match_strdup(&args[0]);
if (num) {
info->max_inline = memparse(num, NULL);
kfree(num);
if (info->max_inline) {
info->max_inline = min_t(u64,
info->max_inline,
info->sectorsize);
}
btrfs_info(info, "max_inline at %llu",
info->max_inline);
} else {
ret = -ENOMEM;
goto out;
}
break;
case Opt_acl:
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
info->sb->s_flags |= SB_POSIXACL;
break;
#else
btrfs_err(info, "support for ACL not compiled in!");
ret = -EINVAL;
goto out;
#endif
case Opt_noacl:
info->sb->s_flags &= ~SB_POSIXACL;
break;
case Opt_notreelog:
btrfs_set_and_info(info, NOTREELOG,
"disabling tree log");
break;
case Opt_treelog:
btrfs_clear_and_info(info, NOTREELOG,
"enabling tree log");
break;
case Opt_norecovery:
case Opt_nologreplay:
btrfs_warn(info,
"'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
btrfs_set_and_info(info, NOLOGREPLAY,
"disabling log replay at mount time");
break;
case Opt_flushoncommit:
btrfs_set_and_info(info, FLUSHONCOMMIT,
"turning on flush-on-commit");
break;
case Opt_noflushoncommit:
btrfs_clear_and_info(info, FLUSHONCOMMIT,
"turning off flush-on-commit");
break;
case Opt_ratio:
ret = match_int(&args[0], &intarg);
if (ret)
goto out;
info->metadata_ratio = intarg;
btrfs_info(info, "metadata ratio %u",
info->metadata_ratio);
break;
case Opt_discard:
case Opt_discard_mode:
if (token == Opt_discard || strcmp(args[0].from, "sync") == 0) { btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC); btrfs_set_and_info(info, DISCARD_SYNC,
"turning on sync discard");
} else if (strcmp(args[0].from, "async") == 0) { btrfs_clear_opt(info->mount_opt, DISCARD_SYNC); btrfs_set_and_info(info, DISCARD_ASYNC,
"turning on async discard");
} else {
ret = -EINVAL;
goto out;
}
break;
case Opt_nodiscard:
btrfs_clear_and_info(info, DISCARD_SYNC,
"turning off discard");
btrfs_clear_and_info(info, DISCARD_ASYNC,
"turning off async discard");
break;
case Opt_space_cache:
case Opt_space_cache_version:
if (token == Opt_space_cache || strcmp(args[0].from, "v1") == 0) { btrfs_clear_opt(info->mount_opt,
FREE_SPACE_TREE);
btrfs_set_and_info(info, SPACE_CACHE,
"enabling disk space caching");
} else if (strcmp(args[0].from, "v2") == 0) { btrfs_clear_opt(info->mount_opt,
SPACE_CACHE);
btrfs_set_and_info(info, FREE_SPACE_TREE,
"enabling free space tree");
} else {
ret = -EINVAL;
goto out;
}
break;
case Opt_rescan_uuid_tree:
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
if (btrfs_test_opt(info, SPACE_CACHE)) { btrfs_clear_and_info(info, SPACE_CACHE,
"disabling disk space caching");
}
if (btrfs_test_opt(info, FREE_SPACE_TREE)) { btrfs_clear_and_info(info, FREE_SPACE_TREE,
"disabling free space tree");
}
break;
case Opt_inode_cache:
case Opt_noinode_cache:
btrfs_warn(info,
"the 'inode_cache' option is deprecated and has no effect since 5.11");
break;
case Opt_clear_cache:
btrfs_set_and_info(info, CLEAR_CACHE,
"force clearing of disk cache");
break;
case Opt_user_subvol_rm_allowed:
btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
break;
case Opt_enospc_debug:
btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
break;
case Opt_noenospc_debug:
btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
break;
case Opt_defrag:
btrfs_set_and_info(info, AUTO_DEFRAG,
"enabling auto defrag");
break;
case Opt_nodefrag:
btrfs_clear_and_info(info, AUTO_DEFRAG,
"disabling auto defrag");
break;
case Opt_recovery:
case Opt_usebackuproot:
btrfs_warn(info,
"'%s' is deprecated, use 'rescue=usebackuproot' instead",
token == Opt_recovery ? "recovery" :
"usebackuproot");
btrfs_info(info,
"trying to use backup root at mount time");
btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
break;
case Opt_skip_balance:
btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
break;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
case Opt_check_integrity_including_extent_data:
btrfs_info(info,
"enabling check integrity including extent data");
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA);
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
case Opt_check_integrity:
btrfs_info(info, "enabling check integrity");
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
case Opt_check_integrity_print_mask:
ret = match_int(&args[0], &intarg);
if (ret)
goto out;
info->check_integrity_print_mask = intarg;
btrfs_info(info, "check_integrity_print_mask 0x%x",
info->check_integrity_print_mask);
break;
#else
case Opt_check_integrity_including_extent_data:
case Opt_check_integrity:
case Opt_check_integrity_print_mask:
btrfs_err(info,
"support for check_integrity* not compiled in!");
ret = -EINVAL;
goto out;
#endif
case Opt_fatal_errors:
if (strcmp(args[0].from, "panic") == 0) btrfs_set_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
else if (strcmp(args[0].from, "bug") == 0) btrfs_clear_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
else {
ret = -EINVAL;
goto out;
}
break;
case Opt_commit_interval:
intarg = 0;
ret = match_int(&args[0], &intarg);
if (ret)
goto out;
if (intarg == 0) { btrfs_info(info,
"using default commit interval %us",
BTRFS_DEFAULT_COMMIT_INTERVAL);
intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
} else if (intarg > 300) { btrfs_warn(info, "excessive commit interval %d",
intarg);
}
info->commit_interval = intarg;
break;
case Opt_rescue:
ret = parse_rescue_options(info, args[0].from);
if (ret < 0)
goto out;
break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
btrfs_info(info, "fragmenting all space");
btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
break;
case Opt_fragment_metadata:
btrfs_info(info, "fragmenting metadata");
btrfs_set_opt(info->mount_opt,
FRAGMENT_METADATA);
break;
case Opt_fragment_data:
btrfs_info(info, "fragmenting data");
btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
break;
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
case Opt_ref_verify:
btrfs_info(info, "doing ref verification");
btrfs_set_opt(info->mount_opt, REF_VERIFY);
break;
#endif
case Opt_err:
btrfs_err(info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
goto out;
default:
break;
}
}
check:
/* We're read-only, don't have to check. */
if (new_flags & SB_RDONLY)
goto out;
if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
ret = -EINVAL;
out:
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && !btrfs_test_opt(info, FREE_SPACE_TREE) &&
!btrfs_test_opt(info, CLEAR_CACHE)) {
btrfs_err(info, "cannot disable free space tree");
ret = -EINVAL;
}
if (!ret)
ret = btrfs_check_mountopts_zoned(info);
if (!ret && btrfs_test_opt(info, SPACE_CACHE)) btrfs_info(info, "disk space caching is enabled"); if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) btrfs_info(info, "using free space tree"); return ret;
}
/*
* Parse mount options that are required early in the mount process.
*
* All other options will be parsed on much later in the mount process and
* only when we need to allocate a new super block.
*/
static int btrfs_parse_device_options(const char *options, fmode_t flags,
void *holder)
{
substring_t args[MAX_OPT_ARGS];
char *device_name, *opts, *orig, *p;
struct btrfs_device *device = NULL;
int error = 0;
lockdep_assert_held(&uuid_mutex);
if (!options)
return 0;
/*
* strsep changes the string, duplicate it because btrfs_parse_options
* gets called later
*/
opts = kstrdup(options, GFP_KERNEL);
if (!opts)
return -ENOMEM;
orig = opts;
while ((p = strsep(&opts, ",")) != NULL) {
int token;
if (!*p)
continue;
token = match_token(p, tokens, args);
if (token == Opt_device) {
device_name = match_strdup(&args[0]);
if (!device_name) {
error = -ENOMEM;
goto out;
}
device = btrfs_scan_one_device(device_name, flags,
holder);
kfree(device_name);
if (IS_ERR(device)) {
error = PTR_ERR(device);
goto out;
}
}
}
out:
kfree(orig);
return error;
}
/*
* Parse mount options that are related to subvolume id
*
* The value is later passed to mount_subvol()
*/
static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
u64 *subvol_objectid)
{
substring_t args[MAX_OPT_ARGS];
char *opts, *orig, *p;
int error = 0;
u64 subvolid;
if (!options)
return 0;
/*
* strsep changes the string, duplicate it because
* btrfs_parse_device_options gets called later
*/
opts = kstrdup(options, GFP_KERNEL);
if (!opts)
return -ENOMEM;
orig = opts;
while ((p = strsep(&opts, ",")) != NULL) {
int token;
if (!*p)
continue;
token = match_token(p, tokens, args);
switch (token) {
case Opt_subvol:
kfree(*subvol_name);
*subvol_name = match_strdup(&args[0]);
if (!*subvol_name) {
error = -ENOMEM;
goto out;
}
break;
case Opt_subvolid:
error = match_u64(&args[0], &subvolid);
if (error)
goto out;
/* we want the original fs_tree */
if (subvolid == 0) subvolid = BTRFS_FS_TREE_OBJECTID;
*subvol_objectid = subvolid;
break;
default:
break;
}
}
out:
kfree(orig);
return error;
}
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
u64 subvol_objectid)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_root *fs_root = NULL;
struct btrfs_root_ref *root_ref;
struct btrfs_inode_ref *inode_ref;
struct btrfs_key key;
struct btrfs_path *path = NULL;
char *name = NULL, *ptr;
u64 dirid;
int len;
int ret;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto err;
}
name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!name) {
ret = -ENOMEM;
goto err;
}
ptr = name + PATH_MAX - 1;
ptr[0] = '\0';
/*
* Walk up the subvolume trees in the tree of tree roots by root
* backrefs until we hit the top-level subvolume.
*/
while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
key.objectid = subvol_objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
ret = btrfs_search_backwards(root, &key, path);
if (ret < 0) {
goto err;
} else if (ret > 0) {
ret = -ENOENT;
goto err;
}
subvol_objectid = key.offset;
root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_root_ref);
len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
ptr -= len + 1;
if (ptr < name) {
ret = -ENAMETOOLONG;
goto err;
}
read_extent_buffer(path->nodes[0], ptr + 1,
(unsigned long)(root_ref + 1), len);
ptr[0] = '/';
dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
btrfs_release_path(path);
fs_root = btrfs_get_fs_root(fs_info, subvol_objectid, true);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
fs_root = NULL;
goto err;
}
/*
* Walk up the filesystem tree by inode refs until we hit the
* root directory.
*/
while (dirid != BTRFS_FIRST_FREE_OBJECTID) { key.objectid = dirid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
ret = btrfs_search_backwards(fs_root, &key, path);
if (ret < 0) {
goto err;
} else if (ret > 0) {
ret = -ENOENT;
goto err;
}
dirid = key.offset;
inode_ref = btrfs_item_ptr(path->nodes[0],
path->slots[0],
struct btrfs_inode_ref);
len = btrfs_inode_ref_name_len(path->nodes[0],
inode_ref);
ptr -= len + 1;
if (ptr < name) {
ret = -ENAMETOOLONG;
goto err;
}
read_extent_buffer(path->nodes[0], ptr + 1,
(unsigned long)(inode_ref + 1), len);
ptr[0] = '/';
btrfs_release_path(path);
}
btrfs_put_root(fs_root);
fs_root = NULL;
}
btrfs_free_path(path);
if (ptr == name + PATH_MAX - 1) {
name[0] = '/';
name[1] = '\0';
} else {
memmove(name, ptr, name + PATH_MAX - ptr);
}
return name;
err:
btrfs_put_root(fs_root);
btrfs_free_path(path);
kfree(name);
return ERR_PTR(ret);
}
static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_dir_item *di;
struct btrfs_path *path;
struct btrfs_key location;
u64 dir_id;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/*
* Find the "default" dir item which points to the root item that we
* will mount by default if we haven't been given a specific subvolume
* to mount.
*/
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
return PTR_ERR(di);
}
if (!di) {
/*
* Ok the default dir item isn't there. This is weird since
* it's always been there, but don't freak out, just try and
* mount the top-level subvolume.
*/
btrfs_free_path(path);
*objectid = BTRFS_FS_TREE_OBJECTID;
return 0;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
btrfs_free_path(path);
*objectid = location.objectid;
return 0;
}
static int btrfs_fill_super(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
void *data)
{
struct inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
int err;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_magic = BTRFS_SUPER_MAGIC;
sb->s_op = &btrfs_super_ops;
sb->s_d_op = &btrfs_dentry_operations;
sb->s_export_op = &btrfs_export_ops;
#ifdef CONFIG_FS_VERITY
sb->s_vop = &btrfs_verityops;
#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
sb->s_flags |= SB_POSIXACL;
#endif
sb->s_flags |= SB_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
err = super_setup_bdi(sb);
if (err) {
btrfs_err(fs_info, "super_setup_bdi failed");
return err;
}
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
btrfs_err(fs_info, "open_ctree failed");
return err;
}
inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail_close;
}
sb->s_root = d_make_root(inode);
if (!sb->s_root) {
err = -ENOMEM;
goto fail_close;
}
cleancache_init_fs(sb);
sb->s_flags |= SB_ACTIVE;
return 0;
fail_close:
close_ctree(fs_info);
return err;
}
int btrfs_sync_fs(struct super_block *sb, int wait)
{
struct btrfs_trans_handle *trans;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
trace_btrfs_sync_fs(fs_info, wait);
if (!wait) { filemap_flush(fs_info->btree_inode->i_mapping);
return 0;
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
/* no transaction, don't bother */
if (PTR_ERR(trans) == -ENOENT) {
/*
* Exit unless we have some pending changes
* that need to go through commit
*/
if (fs_info->pending_changes == 0)
return 0;
/*
* A non-blocking test if the fs is frozen. We must not
* start a new transaction here otherwise a deadlock
* happens. The pending operations are delayed to the
* next commit after thawing.
*/
if (sb_start_write_trylock(sb))
sb_end_write(sb);
else
return 0;
trans = btrfs_start_transaction(root, 0);
}
if (IS_ERR(trans))
return PTR_ERR(trans);
}
return btrfs_commit_transaction(trans);
}
static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed)
{
seq_printf(seq, "%s%s", (*printed) ? ":" : ",rescue=", s);
*printed = true;
}
static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
const char *subvol_name;
bool printed = false;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
if (btrfs_test_opt(info, NODATASUM))
seq_puts(seq, ",nodatasum");
if (btrfs_test_opt(info, NODATACOW))
seq_puts(seq, ",nodatacow");
if (btrfs_test_opt(info, NOBARRIER))
seq_puts(seq, ",nobarrier");
if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
seq_printf(seq, ",max_inline=%llu", info->max_inline);
if (info->thread_pool_size != min_t(unsigned long,
num_online_cpus() + 2, 8))
seq_printf(seq, ",thread_pool=%u", info->thread_pool_size);
if (btrfs_test_opt(info, COMPRESS)) {
compress_type = btrfs_compress_type2str(info->compress_type);
if (btrfs_test_opt(info, FORCE_COMPRESS))
seq_printf(seq, ",compress-force=%s", compress_type);
else
seq_printf(seq, ",compress=%s", compress_type);
if (info->compress_level)
seq_printf(seq, ":%d", info->compress_level);
}
if (btrfs_test_opt(info, NOSSD))
seq_puts(seq, ",nossd");
if (btrfs_test_opt(info, SSD_SPREAD))
seq_puts(seq, ",ssd_spread");
else if (btrfs_test_opt(info, SSD))
seq_puts(seq, ",ssd");
if (btrfs_test_opt(info, NOTREELOG))
seq_puts(seq, ",notreelog");
if (btrfs_test_opt(info, NOLOGREPLAY))
print_rescue_option(seq, "nologreplay", &printed);
if (btrfs_test_opt(info, USEBACKUPROOT))
print_rescue_option(seq, "usebackuproot", &printed);
if (btrfs_test_opt(info, IGNOREBADROOTS))
print_rescue_option(seq, "ignorebadroots", &printed);
if (btrfs_test_opt(info, IGNOREDATACSUMS))
print_rescue_option(seq, "ignoredatacsums", &printed);
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD_SYNC))
seq_puts(seq, ",discard");
if (btrfs_test_opt(info, DISCARD_ASYNC))
seq_puts(seq, ",discard=async");
if (!(info->sb->s_flags & SB_POSIXACL))
seq_puts(seq, ",noacl");
if (btrfs_free_space_cache_v1_active(info))
seq_puts(seq, ",space_cache");
else if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
seq_puts(seq, ",space_cache=v2");
else
seq_puts(seq, ",nospace_cache");
if (btrfs_test_opt(info, RESCAN_UUID_TREE))
seq_puts(seq, ",rescan_uuid_tree");
if (btrfs_test_opt(info, CLEAR_CACHE))
seq_puts(seq, ",clear_cache");
if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED))
seq_puts(seq, ",user_subvol_rm_allowed");
if (btrfs_test_opt(info, ENOSPC_DEBUG))
seq_puts(seq, ",enospc_debug");
if (btrfs_test_opt(info, AUTO_DEFRAG))
seq_puts(seq, ",autodefrag");
if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA))
seq_puts(seq, ",check_int_data");
else if (btrfs_test_opt(info, CHECK_INTEGRITY))
seq_puts(seq, ",check_int");
if (info->check_integrity_print_mask)
seq_printf(seq, ",check_int_print_mask=%d",
info->check_integrity_print_mask);
#endif
if (info->metadata_ratio)
seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%u", info->commit_interval);
#ifdef CONFIG_BTRFS_DEBUG
if (btrfs_test_opt(info, FRAGMENT_DATA))
seq_puts(seq, ",fragment=data");
if (btrfs_test_opt(info, FRAGMENT_METADATA))
seq_puts(seq, ",fragment=metadata");
#endif
if (btrfs_test_opt(info, REF_VERIFY))
seq_puts(seq, ",ref_verify");
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
subvol_name = btrfs_get_subvol_name_from_objectid(info,
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
if (!IS_ERR(subvol_name)) {
seq_puts(seq, ",subvol=");
seq_escape(seq, subvol_name, " \t\n\\");
kfree(subvol_name);
}
return 0;
}
static int btrfs_test_super(struct super_block *s, void *data)
{
struct btrfs_fs_info *p = data;
struct btrfs_fs_info *fs_info = btrfs_sb(s);
return fs_info->fs_devices == p->fs_devices;
}
static int btrfs_set_super(struct super_block *s, void *data)
{
int err = set_anon_super(s, data);
if (!err)
s->s_fs_info = data; return err;
}
/*
* subvolumes are identified by ino 256
*/
static inline int is_subvolume_inode(struct inode *inode)
{
if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return 1;
return 0;
}
static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
struct vfsmount *mnt)
{
struct dentry *root;
int ret;
if (!subvol_name) { if (!subvol_objectid) { ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
&subvol_objectid);
if (ret) {
root = ERR_PTR(ret);
goto out;
}
}
subvol_name = btrfs_get_subvol_name_from_objectid(
btrfs_sb(mnt->mnt_sb), subvol_objectid);
if (IS_ERR(subvol_name)) {
root = ERR_CAST(subvol_name);
subvol_name = NULL;
goto out;
}
}
root = mount_subtree(mnt, subvol_name);
/* mount_subtree() drops our reference on the vfsmount. */
mnt = NULL;
if (!IS_ERR(root)) {
struct super_block *s = root->d_sb;
struct btrfs_fs_info *fs_info = btrfs_sb(s);
struct inode *root_inode = d_inode(root);
u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
ret = 0;
if (!is_subvolume_inode(root_inode)) {
btrfs_err(fs_info, "'%s' is not a valid subvolume",
subvol_name);
ret = -EINVAL;
}
if (subvol_objectid && root_objectid != subvol_objectid) {
/*
* This will also catch a race condition where a
* subvolume which was passed by ID is renamed and
* another subvolume is renamed over the old location.
*/
btrfs_err(fs_info,
"subvol '%s' does not match subvolid %llu",
subvol_name, subvol_objectid);
ret = -EINVAL;
}
if (ret) {
dput(root);
root = ERR_PTR(ret);
deactivate_locked_super(s);
}
}
out:
mntput(mnt);
kfree(subvol_name);
return root;
}
/*
* Find a superblock for the given device / mount point.
*
* Note: This is based on mount_bdev from fs/super.c with a few additions
* for multiple device setup. Make sure to keep it in sync.
*/
static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
int flags, const char *device_name, void *data)
{
struct block_device *bdev = NULL;
struct super_block *s;
struct btrfs_device *device = NULL;
struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_fs_info *fs_info = NULL;
void *new_sec_opts = NULL;
fmode_t mode = FMODE_READ;
int error = 0;
if (!(flags & SB_RDONLY))
mode |= FMODE_WRITE;
if (data) { error = security_sb_eat_lsm_opts(data, &new_sec_opts);
if (error)
return ERR_PTR(error);
}
/*
* Setup a dummy root and fs_info for test/set super. This is because
* we don't actually fill this stuff out until open_ctree, but we need
* then open_ctree will properly initialize the file system specific
* settings later. btrfs_init_fs_info initializes the static elements
* of the fs_info (locks and such) to make cleanup easier if we find a
* superblock with our given fs_devices later on at sget() time.
*/
fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
if (!fs_info) {
error = -ENOMEM;
goto error_sec_opts;
}
btrfs_init_fs_info(fs_info);
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
if (!fs_info->super_copy || !fs_info->super_for_commit) {
error = -ENOMEM;
goto error_fs_info;
}
mutex_lock(&uuid_mutex);
error = btrfs_parse_device_options(data, mode, fs_type);
if (error) { mutex_unlock(&uuid_mutex);
goto error_fs_info;
}
device = btrfs_scan_one_device(device_name, mode, fs_type);
if (IS_ERR(device)) {
mutex_unlock(&uuid_mutex);
error = PTR_ERR(device);
goto error_fs_info;
}
fs_devices = device->fs_devices;
fs_info->fs_devices = fs_devices;
error = btrfs_open_devices(fs_devices, mode, fs_type);
mutex_unlock(&uuid_mutex);
if (error)
goto error_fs_info;
if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
error = -EACCES;
goto error_close_devices;
}
bdev = fs_devices->latest_dev->bdev;
s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
fs_info);
if (IS_ERR(s)) {
error = PTR_ERR(s);
goto error_close_devices;
}
if (s->s_root) { btrfs_close_devices(fs_devices);
btrfs_free_fs_info(fs_info);
if ((flags ^ s->s_flags) & SB_RDONLY)
error = -EBUSY;
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
btrfs_sb(s)->bdev_holder = fs_type;
if (!strstr(crc32c_impl(), "generic"))
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
error = btrfs_fill_super(s, fs_devices, data);
}
if (!error)
error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL); security_free_mnt_opts(&new_sec_opts);
if (error) {
deactivate_locked_super(s);
return ERR_PTR(error);
}
return dget(s->s_root);
error_close_devices:
btrfs_close_devices(fs_devices);
error_fs_info:
btrfs_free_fs_info(fs_info);
error_sec_opts:
security_free_mnt_opts(&new_sec_opts); return ERR_PTR(error);
}
/*
* Mount function which is called by VFS layer.
*
* In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
* which needs vfsmount* of device's root (/). This means device's root has to
* be mounted internally in any case.
*
* Operation flow:
* 1. Parse subvol id related options for later use in mount_subvol().
*
* 2. Mount device's root (/) by calling vfs_kern_mount().
*
* NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
* first place. In order to avoid calling btrfs_mount() again, we use
* different file_system_type which is not registered to VFS by
* register_filesystem() (btrfs_root_fs_type). As a result,
* btrfs_mount_root() is called. The return value will be used by
* mount_subtree() in mount_subvol().
*
* 3. Call mount_subvol() to get the dentry of subvolume. Since there is
* "btrfs subvolume set-default", mount_subvol() is called always.
*/
static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
const char *device_name, void *data)
{
struct vfsmount *mnt_root;
struct dentry *root;
char *subvol_name = NULL;
u64 subvol_objectid = 0;
int error = 0;
error = btrfs_parse_subvol_options(data, &subvol_name,
&subvol_objectid);
if (error) { kfree(subvol_name); return ERR_PTR(error);
}
/* mount device's root (/) */
mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
if (flags & SB_RDONLY) { mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
flags & ~SB_RDONLY, device_name, data);
} else {
mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
flags | SB_RDONLY, device_name, data);
if (IS_ERR(mnt_root)) {
root = ERR_CAST(mnt_root);
kfree(subvol_name);
goto out;
}
down_write(&mnt_root->mnt_sb->s_umount);
error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
up_write(&mnt_root->mnt_sb->s_umount);
if (error < 0) {
root = ERR_PTR(error);
mntput(mnt_root);
kfree(subvol_name);
goto out;
}
}
}
if (IS_ERR(mnt_root)) {
root = ERR_CAST(mnt_root);
kfree(subvol_name);
goto out;
}
/* mount_subvol() will free subvol_name and mnt_root */
root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
out:
return root;
}
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
u32 new_pool_size, u32 old_pool_size)
{
if (new_pool_size == old_pool_size)
return;
fs_info->thread_pool_size = new_pool_size;
btrfs_info(fs_info, "resize thread pool %d -> %d",
old_pool_size, new_pool_size);
btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
new_pool_size);
}
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
unsigned long old_opts, int flags)
{
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
(flags & SB_RDONLY))) {
/* wait for any defraggers to finish */
wait_event(fs_info->transaction_wait,
(atomic_read(&fs_info->defrag_running) == 0));
if (flags & SB_RDONLY) sync_filesystem(fs_info->sb);
}
}
static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
unsigned long old_opts)
{
const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
/*
* We need to cleanup all defragable inodes if the autodefragment is
* close or the filesystem is read only.
*/
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
(!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || sb_rdonly(fs_info->sb))) { btrfs_cleanup_defrag_inodes(fs_info);
}
/* If we toggled discard async */
if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_resume(fs_info); else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
!btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_cleanup(fs_info);
/* If we toggled space cache */
if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
}
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
unsigned old_flags = sb->s_flags;
unsigned long old_opts = fs_info->mount_opt;
unsigned long old_compress_type = fs_info->compress_type;
u64 old_max_inline = fs_info->max_inline;
u32 old_thread_pool_size = fs_info->thread_pool_size;
u32 old_metadata_ratio = fs_info->metadata_ratio;
int ret;
sync_filesystem(sb);
set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
if (data) {
void *new_sec_opts = NULL;
ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
if (!ret)
ret = security_sb_remount(sb, new_sec_opts); security_free_mnt_opts(&new_sec_opts); if (ret) goto restore;
}
ret = btrfs_parse_options(fs_info, data, *flags);
if (ret)
goto restore;
btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && (!sb_rdonly(sb) || (*flags & SB_RDONLY))) { btrfs_warn(fs_info,
"remount supports changing free space tree only from ro to rw");
/* Make sure free space cache options match the state on disk */
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
}
if (btrfs_free_space_cache_v1_active(fs_info)) { btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
}
}
if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
if (*flags & SB_RDONLY) {
/*
* this also happens on 'umount -rf' or on shutdown, when
* the filesystem is busy.
*/
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
btrfs_discard_cleanup(fs_info);
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
/* avoid complains from lockdep et al. */
up(&fs_info->uuid_tree_rescan_sem);
btrfs_set_sb_rdonly(sb);
/*
* Setting SB_RDONLY will put the cleaner thread to
* sleep at the next loop if it's already active.
* If it's already asleep, we'll leave unused block
* groups on disk until we're mounted read-write again
* unless we clean them up here.
*/
btrfs_delete_unused_bgs(fs_info);
/*
* The cleaner task could be already running before we set the
* flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).
* We must make sure that after we finish the remount, i.e. after
* we call btrfs_commit_super(), the cleaner can no longer start
* a transaction - either because it was dropping a dead root,
* running delayed iputs or deleting an unused block group (the
* cleaner picked a block group from the list of unused block
* groups before we were able to in the previous call to
* btrfs_delete_unused_bgs()).
*/
wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING,
TASK_UNINTERRUPTIBLE);
/*
* We've set the superblock to RO mode, so we might have made
* the cleaner task sleep without running all pending delayed
* iputs. Go through all the delayed iputs here, so that if an
* unmount happens without remounting RW we don't end up at
* finishing close_ctree() with a non-empty list of delayed
* iputs.
*/
btrfs_run_delayed_iputs(fs_info);
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
btrfs_pause_balance(fs_info);
/*
* Pause the qgroup rescan worker if it is running. We don't want
* it to be still running after we are in RO mode, as after that,
* by the time we unmount, it might have left a transaction open,
* so we would leak the transaction and/or crash.
*/
btrfs_qgroup_wait_for_completion(fs_info, false);
ret = btrfs_commit_super(fs_info);
if (ret)
goto restore;
} else {
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
btrfs_err(fs_info,
"Remounting read-write after error is not allowed");
ret = -EINVAL;
goto restore;
}
if (fs_info->fs_devices->rw_devices == 0) {
ret = -EACCES;
goto restore;
}
if (!btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info,
"too many missing devices, writable remount is not allowed");
ret = -EACCES;
goto restore;
}
if (btrfs_super_log_root(fs_info->super_copy) != 0) { btrfs_warn(fs_info,
"mount required to replay tree-log, cannot remount read-write");
ret = -EINVAL;
goto restore;
}
/*
* NOTE: when remounting with a change that does writes, don't
* put it anywhere above this point, as we are not sure to be
* safe to write until we pass the above checks.
*/
ret = btrfs_start_pre_rw_mount(fs_info);
if (ret)
goto restore;
btrfs_clear_sb_rdonly(sb);
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
}
out:
/*
* We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
* since the absence of the flag means it can be toggled off by remount.
*/
*flags |= SB_I_VERSION;
wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
btrfs_clear_oneshot_options(fs_info);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
return 0;
restore:
/* We've hit an error - don't reset SB_RDONLY */
if (sb_rdonly(sb)) old_flags |= SB_RDONLY; if (!(old_flags & SB_RDONLY))
clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
sb->s_flags = old_flags;
fs_info->mount_opt = old_opts;
fs_info->compress_type = old_compress_type;
fs_info->max_inline = old_max_inline;
btrfs_resize_thread_pool(fs_info,
old_thread_pool_size, fs_info->thread_pool_size);
fs_info->metadata_ratio = old_metadata_ratio;
btrfs_remount_cleanup(fs_info, old_opts);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
return ret;
}
/* Used to sort the devices by max_avail(descending sort) */
static int btrfs_cmp_device_free_bytes(const void *a, const void *b)
{
const struct btrfs_device_info *dev_info1 = a;
const struct btrfs_device_info *dev_info2 = b;
if (dev_info1->max_avail > dev_info2->max_avail)
return -1;
else if (dev_info1->max_avail < dev_info2->max_avail)
return 1;
return 0;
}
/*
* sort the devices by max_avail, in which max free extent size of each device
* is stored.(Descending Sort)
*/
static inline void btrfs_descending_sort_devices(
struct btrfs_device_info *devices,
size_t nr_devices)
{
sort(devices, nr_devices, sizeof(struct btrfs_device_info),
btrfs_cmp_device_free_bytes, NULL);
}
/*
* The helper to calc the free space on the devices that can be used to store
* file data.
*/
static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
u64 *free_bytes)
{
struct btrfs_device_info *devices_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 type;
u64 avail_space;
u64 min_stripe_size;
int num_stripes = 1;
int i = 0, nr_devices;
const struct btrfs_raid_attr *rattr;
/*
* We aren't under the device list lock, so this is racy-ish, but good
* enough for our purposes.
*/
nr_devices = fs_info->fs_devices->open_devices;
if (!nr_devices) {
smp_mb();
nr_devices = fs_info->fs_devices->open_devices;
ASSERT(nr_devices);
if (!nr_devices) {
*free_bytes = 0;
return 0;
}
}
devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
GFP_KERNEL);
if (!devices_info)
return -ENOMEM;
/* calc min stripe number for data space allocation */
type = btrfs_data_alloc_profile(fs_info);
rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)];
if (type & BTRFS_BLOCK_GROUP_RAID0)
num_stripes = nr_devices;
else if (type & BTRFS_BLOCK_GROUP_RAID1)
num_stripes = 2;
else if (type & BTRFS_BLOCK_GROUP_RAID1C3)
num_stripes = 3;
else if (type & BTRFS_BLOCK_GROUP_RAID1C4)
num_stripes = 4;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = 4;
/* Adjust for more than 1 stripe per device */
min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN;
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&device->dev_state) ||
!device->bdev ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
continue;
if (i >= nr_devices)
break;
avail_space = device->total_bytes - device->bytes_used;
/* align with stripe_len */
avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN);
/*
* In order to avoid overwriting the superblock on the drive,
* btrfs starts at an offset of at least 1MB when doing chunk
* allocation.
*
* This ensures we have at least min_stripe_size free space
* after excluding 1MB.
*/
if (avail_space <= SZ_1M + min_stripe_size)
continue;
avail_space -= SZ_1M;
devices_info[i].dev = device;
devices_info[i].max_avail = avail_space;
i++;
}
rcu_read_unlock();
nr_devices = i;
btrfs_descending_sort_devices(devices_info, nr_devices);
i = nr_devices - 1;
avail_space = 0;
while (nr_devices >= rattr->devs_min) {
num_stripes = min(num_stripes, nr_devices);
if (devices_info[i].max_avail >= min_stripe_size) {
int j;
u64 alloc_size;
avail_space += devices_info[i].max_avail * num_stripes;
alloc_size = devices_info[i].max_avail;
for (j = i + 1 - num_stripes; j <= i; j++)
devices_info[j].max_avail -= alloc_size;
}
i--;
nr_devices--;
}
kfree(devices_info);
*free_bytes = avail_space;
return 0;
}
/*
* Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
*
* If there's a redundant raid level at DATA block groups, use the respective
* multiplier to scale the sizes.
*
* Unused device space usage is based on simulating the chunk allocator
* algorithm that respects the device sizes and order of allocations. This is
* a close approximation of the actual use but there are other factors that may
* change the result (like a new metadata chunk).
*
* If metadata is exhausted, f_bavail will be 0.
*/
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
struct btrfs_super_block *disk_super = fs_info->super_copy;
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
u64 total_free_meta = 0;
u32 bits = fs_info->sectorsize_bits;
__be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
u64 thresh = 0;
int mixed = 0;
list_for_each_entry(found, &fs_info->space_info, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
int i;
total_free_data += found->disk_total - found->disk_used;
total_free_data -=
btrfs_account_ro_block_groups_free_space(found);
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
if (!list_empty(&found->block_groups[i]))
factor = btrfs_bg_type_to_factor(
btrfs_raid_array[i].bg_flag);
}
}
/*
* Metadata in mixed block goup profiles are accounted in data
*/
if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA)
mixed = 1;
else
total_free_meta += found->disk_total -
found->disk_used;
}
total_used += found->disk_used;
}
buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
buf->f_blocks >>= bits;
buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
/* Account global block reserve as used, it's in logical size already */
spin_lock(&block_rsv->lock);
/* Mixed block groups accounting is not byte-accurate, avoid overflow */
if (buf->f_bfree >= block_rsv->size >> bits)
buf->f_bfree -= block_rsv->size >> bits;
else
buf->f_bfree = 0;
spin_unlock(&block_rsv->lock);
buf->f_bavail = div_u64(total_free_data, factor);
ret = btrfs_calc_avail_data_space(fs_info, &total_free_data);
if (ret)
return ret;
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
/*
* We calculate the remaining metadata space minus global reserve. If
* this is (supposedly) smaller than zero, there's no space. But this
* does not hold in practice, the exhausted state happens where's still
* some positive delta. So we apply some guesswork and compare the
* delta to a 4M threshold. (Practically observed delta was ~2M.)
*
* We probably cannot calculate the exact threshold value because this
* depends on the internal reservations requested by various
* operations, so some operations that consume a few metadata will
* succeed even if the Avail is zero. But this is better than the other
* way around.
*/
thresh = SZ_4M;
/*
* We only want to claim there's no available space if we can no longer
* allocate chunks for our metadata profile and our global reserve will
* not fit in the free metadata space. If we aren't ->full then we
* still can allocate chunks and thus are fine using the currently
* calculated f_bavail.
*/
if (!mixed && block_rsv->space_info->full &&
total_free_meta - thresh < block_rsv->size)
buf->f_bavail = 0;
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_namelen = BTRFS_NAME_LEN;
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
on a big-endian or little-endian host */
buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
/* Mask in the root object ID too, to disambiguate subvols */
buf->f_fsid.val[0] ^=
BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
buf->f_fsid.val[1] ^=
BTRFS_I(d_inode(dentry))->root->root_key.objectid;
return 0;
}
static void btrfs_kill_super(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
kill_anon_super(sb);
btrfs_free_fs_info(fs_info);
}
static struct file_system_type btrfs_fs_type = {
.owner = THIS_MODULE,
.name = "btrfs",
.mount = btrfs_mount,
.kill_sb = btrfs_kill_super,
.fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
};
static struct file_system_type btrfs_root_fs_type = {
.owner = THIS_MODULE,
.name = "btrfs",
.mount = btrfs_mount_root,
.kill_sb = btrfs_kill_super,
.fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("btrfs");
static int btrfs_control_open(struct inode *inode, struct file *file)
{
/*
* The control file's private_data is used to hold the
* transaction when it is started and is used to keep
* track of whether a transaction is already in progress.
*/
file->private_data = NULL;
return 0;
}
/*
* Used by /dev/btrfs-control for devices ioctls.
*/
static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct btrfs_ioctl_vol_args *vol;
struct btrfs_device *device = NULL;
int ret = -ENOTTY;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
vol = memdup_user((void __user *)arg, sizeof(*vol));
if (IS_ERR(vol))
return PTR_ERR(vol);
vol->name[BTRFS_PATH_NAME_MAX] = '\0';
switch (cmd) {
case BTRFS_IOC_SCAN_DEV:
mutex_lock(&uuid_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,
&btrfs_root_fs_type);
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
case BTRFS_IOC_FORGET_DEV:
ret = btrfs_forget_devices(vol->name);
break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,
&btrfs_root_fs_type);
if (IS_ERR(device)) {
mutex_unlock(&uuid_mutex);
ret = PTR_ERR(device);
break;
}
ret = !(device->fs_devices->num_devices ==
device->fs_devices->total_devices);
mutex_unlock(&uuid_mutex);
break;
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
ret = btrfs_ioctl_get_supported_features((void __user*)arg);
break;
}
kfree(vol);
return ret;
}
static int btrfs_freeze(struct super_block *sb)
{
struct btrfs_trans_handle *trans;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
set_bit(BTRFS_FS_FROZEN, &fs_info->flags);
/*
* We don't need a barrier here, we'll wait for any transaction that
* could be in progress on other threads (and do delayed iputs that
* we want to avoid on a frozen filesystem), or do the commit
* ourselves.
*/
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
/* no transaction, don't bother */
if (PTR_ERR(trans) == -ENOENT)
return 0;
return PTR_ERR(trans);
}
return btrfs_commit_transaction(trans);
}
static int btrfs_unfreeze(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
return 0;
}
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
/*
* There should be always a valid pointer in latest_dev, it may be stale
* for a short moment in case it's being deleted but still valid until
* the end of RCU grace period.
*/
rcu_read_lock();
seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
rcu_read_unlock();
return 0;
}
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.evict_inode = btrfs_evict_inode,
.put_super = btrfs_put_super,
.sync_fs = btrfs_sync_fs,
.show_options = btrfs_show_options,
.show_devname = btrfs_show_devname,
.alloc_inode = btrfs_alloc_inode,
.destroy_inode = btrfs_destroy_inode,
.free_inode = btrfs_free_inode,
.statfs = btrfs_statfs,
.remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
.unfreeze_fs = btrfs_unfreeze,
};
static const struct file_operations btrfs_ctl_fops = {
.open = btrfs_control_open,
.unlocked_ioctl = btrfs_control_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.owner = THIS_MODULE,
.llseek = noop_llseek,
};
static struct miscdevice btrfs_misc = {
.minor = BTRFS_MINOR,
.name = "btrfs-control",
.fops = &btrfs_ctl_fops
};
MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
MODULE_ALIAS("devname:btrfs-control");
static int __init btrfs_interface_init(void)
{
return misc_register(&btrfs_misc);
}
static __cold void btrfs_interface_exit(void)
{
misc_deregister(&btrfs_misc);
}
static void __init btrfs_print_mod_info(void)
{
static const char options[] = ""
#ifdef CONFIG_BTRFS_DEBUG
", debug=on"
#endif
#ifdef CONFIG_BTRFS_ASSERT
", assert=on"
#endif
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
", integrity-checker=on"
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
", ref-verify=on"
#endif
#ifdef CONFIG_BLK_DEV_ZONED
", zoned=yes"
#else
", zoned=no"
#endif
#ifdef CONFIG_FS_VERITY
", fsverity=yes"
#else
", fsverity=no"
#endif
;
pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
}
static int __init init_btrfs_fs(void)
{
int err;
btrfs_props_init();
err = btrfs_init_sysfs();
if (err)
return err;
btrfs_init_compress();
err = btrfs_init_cachep();
if (err)
goto free_compress;
err = extent_io_init();
if (err)
goto free_cachep;
err = extent_state_cache_init();
if (err)
goto free_extent_io;
err = extent_map_init();
if (err)
goto free_extent_state_cache;
err = ordered_data_init();
if (err)
goto free_extent_map;
err = btrfs_delayed_inode_init();
if (err)
goto free_ordered_data;
err = btrfs_auto_defrag_init();
if (err)
goto free_delayed_inode;
err = btrfs_delayed_ref_init();
if (err)
goto free_auto_defrag;
err = btrfs_prelim_ref_init();
if (err)
goto free_delayed_ref;
err = btrfs_end_io_wq_init();
if (err)
goto free_prelim_ref;
err = btrfs_interface_init();
if (err)
goto free_end_io_wq;
btrfs_print_mod_info();
err = btrfs_run_sanity_tests();
if (err)
goto unregister_ioctl;
err = register_filesystem(&btrfs_fs_type);
if (err)
goto unregister_ioctl;
return 0;
unregister_ioctl:
btrfs_interface_exit();
free_end_io_wq:
btrfs_end_io_wq_exit();
free_prelim_ref:
btrfs_prelim_ref_exit();
free_delayed_ref:
btrfs_delayed_ref_exit();
free_auto_defrag:
btrfs_auto_defrag_exit();
free_delayed_inode:
btrfs_delayed_inode_exit();
free_ordered_data:
ordered_data_exit();
free_extent_map:
extent_map_exit();
free_extent_state_cache:
extent_state_cache_exit();
free_extent_io:
extent_io_exit();
free_cachep:
btrfs_destroy_cachep();
free_compress:
btrfs_exit_compress();
btrfs_exit_sysfs();
return err;
}
static void __exit exit_btrfs_fs(void)
{
btrfs_destroy_cachep();
btrfs_delayed_ref_exit();
btrfs_auto_defrag_exit();
btrfs_delayed_inode_exit();
btrfs_prelim_ref_exit();
ordered_data_exit();
extent_map_exit();
extent_state_cache_exit();
extent_io_exit();
btrfs_interface_exit();
btrfs_end_io_wq_exit();
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
btrfs_exit_compress();
}
late_initcall(init_btrfs_fs);
module_exit(exit_btrfs_fs)
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
MODULE_SOFTDEP("pre: xxhash64");
MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: blake2b-256");
// SPDX-License-Identifier: GPL-2.0
/*
* drivers/base/power/main.c - Where the driver meets power management.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
*
* The driver model core calls device_pm_add() when a device is registered.
* This will initialize the embedded device_pm_info object in the device
* and add it to the list of power-controlled devices. sysfs entries for
* controlling device power management will also be added.
*
* A separate list is used for keeping track of power info, because the power
* domain dependencies may differ from the ancestral dependencies that the
* subsystem list maintains.
*/
#define pr_fmt(fmt) "PM: " fmt
#define dev_fmt pr_fmt
#include <linux/device.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/pm.h>
#include <linux/pm_runtime.h>
#include <linux/pm-trace.h>
#include <linux/pm_wakeirq.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/async.h>
#include <linux/suspend.h>
#include <trace/events/power.h>
#include <linux/cpufreq.h>
#include <linux/cpuidle.h>
#include <linux/devfreq.h>
#include <linux/timer.h>
#include "../base.h"
#include "power.h"
typedef int (*pm_callback_t)(struct device *);
#define list_for_each_entry_rcu_locked(pos, head, member) \
list_for_each_entry_rcu(pos, head, member, \
device_links_read_lock_held())
/*
* The entries in the dpm_list list are in a depth first order, simply
* because children are guaranteed to be discovered after parents, and
* are inserted at the back of the list on discovery.
*
* Since device_pm_add() may be called with a device lock held,
* we must never try to acquire a device lock while holding
* dpm_list_mutex.
*/
LIST_HEAD(dpm_list);
static LIST_HEAD(dpm_prepared_list);
static LIST_HEAD(dpm_suspended_list);
static LIST_HEAD(dpm_late_early_list);
static LIST_HEAD(dpm_noirq_list);
struct suspend_stats suspend_stats;
static DEFINE_MUTEX(dpm_list_mtx);
static pm_message_t pm_transition;
static int async_error;
static const char *pm_verb(int event)
{
switch (event) {
case PM_EVENT_SUSPEND:
return "suspend";
case PM_EVENT_RESUME:
return "resume";
case PM_EVENT_FREEZE:
return "freeze";
case PM_EVENT_QUIESCE:
return "quiesce";
case PM_EVENT_HIBERNATE:
return "hibernate";
case PM_EVENT_THAW:
return "thaw";
case PM_EVENT_RESTORE:
return "restore";
case PM_EVENT_RECOVER:
return "recover";
default:
return "(unknown PM event)";
}
}
/**
* device_pm_sleep_init - Initialize system suspend-related device fields.
* @dev: Device object being initialized.
*/
void device_pm_sleep_init(struct device *dev)
{
dev->power.is_prepared = false;
dev->power.is_suspended = false;
dev->power.is_noirq_suspended = false;
dev->power.is_late_suspended = false;
init_completion(&dev->power.completion);
complete_all(&dev->power.completion);
dev->power.wakeup = NULL;
INIT_LIST_HEAD(&dev->power.entry);
}
/**
* device_pm_lock - Lock the list of active devices used by the PM core.
*/
void device_pm_lock(void)
{
mutex_lock(&dpm_list_mtx);
}
/**
* device_pm_unlock - Unlock the list of active devices used by the PM core.
*/
void device_pm_unlock(void)
{
mutex_unlock(&dpm_list_mtx);
}
/**
* device_pm_add - Add a device to the PM core's list of active devices.
* @dev: Device to add to the list.
*/
void device_pm_add(struct device *dev)
{
/* Skip PM setup/initialization. */
if (device_pm_not_required(dev))
return;
pr_debug("Adding info for %s:%s\n",
dev->bus ? dev->bus->name : "No Bus", dev_name(dev));
device_pm_check_callbacks(dev);
mutex_lock(&dpm_list_mtx);
if (dev->parent && dev->parent->power.is_prepared)
dev_warn(dev, "parent %s should not be sleeping\n",
dev_name(dev->parent));
list_add_tail(&dev->power.entry, &dpm_list);
dev->power.in_dpm_list = true;
mutex_unlock(&dpm_list_mtx);
}
/**
* device_pm_remove - Remove a device from the PM core's list of active devices.
* @dev: Device to be removed from the list.
*/
void device_pm_remove(struct device *dev)
{
if (device_pm_not_required(dev))
return;
pr_debug("Removing info for %s:%s\n",
dev->bus ? dev->bus->name : "No Bus", dev_name(dev));
complete_all(&dev->power.completion);
mutex_lock(&dpm_list_mtx);
list_del_init(&dev->power.entry);
dev->power.in_dpm_list = false;
mutex_unlock(&dpm_list_mtx);
device_wakeup_disable(dev);
pm_runtime_remove(dev);
device_pm_check_callbacks(dev);
}
/**
* device_pm_move_before - Move device in the PM core's list of active devices.
* @deva: Device to move in dpm_list.
* @devb: Device @deva should come before.
*/
void device_pm_move_before(struct device *deva, struct device *devb)
{
pr_debug("Moving %s:%s before %s:%s\n",
deva->bus ? deva->bus->name : "No Bus", dev_name(deva),
devb->bus ? devb->bus->name : "No Bus", dev_name(devb));
/* Delete deva from dpm_list and reinsert before devb. */
list_move_tail(&deva->power.entry, &devb->power.entry);
}
/**
* device_pm_move_after - Move device in the PM core's list of active devices.
* @deva: Device to move in dpm_list.
* @devb: Device @deva should come after.
*/
void device_pm_move_after(struct device *deva, struct device *devb)
{
pr_debug("Moving %s:%s after %s:%s\n",
deva->bus ? deva->bus->name : "No Bus", dev_name(deva),
devb->bus ? devb->bus->name : "No Bus", dev_name(devb));
/* Delete deva from dpm_list and reinsert after devb. */
list_move(&deva->power.entry, &devb->power.entry);
}
/**
* device_pm_move_last - Move device to end of the PM core's list of devices.
* @dev: Device to move in dpm_list.
*/
void device_pm_move_last(struct device *dev)
{
pr_debug("Moving %s:%s to end of list\n",
dev->bus ? dev->bus->name : "No Bus", dev_name(dev));
list_move_tail(&dev->power.entry, &dpm_list);
}
static ktime_t initcall_debug_start(struct device *dev, void *cb)
{
if (!pm_print_times_enabled)
return 0;
dev_info(dev, "calling %pS @ %i, parent: %s\n", cb,
task_pid_nr(current),
dev->parent ? dev_name(dev->parent) : "none");
return ktime_get();
}
static void initcall_debug_report(struct device *dev, ktime_t calltime,
void *cb, int error)
{
ktime_t rettime;
if (!pm_print_times_enabled)
return;
rettime = ktime_get();
dev_info(dev, "%pS returned %d after %Ld usecs\n", cb, error,
(unsigned long long)ktime_us_delta(rettime, calltime));
}
/**
* dpm_wait - Wait for a PM operation to complete.
* @dev: Device to wait for.
* @async: If unset, wait only if the device's power.async_suspend flag is set.
*/
static void dpm_wait(struct device *dev, bool async)
{
if (!dev)
return;
if (async || (pm_async_enabled && dev->power.async_suspend))
wait_for_completion(&dev->power.completion);
}
static int dpm_wait_fn(struct device *dev, void *async_ptr)
{
dpm_wait(dev, *((bool *)async_ptr));
return 0;
}
static void dpm_wait_for_children(struct device *dev, bool async)
{
device_for_each_child(dev, &async, dpm_wait_fn);
}
static void dpm_wait_for_suppliers(struct device *dev, bool async)
{
struct device_link *link;
int idx;
idx = device_links_read_lock();
/*
* If the supplier goes away right after we've checked the link to it,
* we'll wait for its completion to change the state, but that's fine,
* because the only things that will block as a result are the SRCU
* callbacks freeing the link objects for the links in the list we're
* walking.
*/
list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node)
if (READ_ONCE(link->status) != DL_STATE_DORMANT)
dpm_wait(link->supplier, async);
device_links_read_unlock(idx);
}
static bool dpm_wait_for_superior(struct device *dev, bool async)
{
struct device *parent;
/*
* If the device is resumed asynchronously and the parent's callback
* deletes both the device and the parent itself, the parent object may
* be freed while this function is running, so avoid that by reference
* counting the parent once more unless the device has been deleted
* already (in which case return right away).
*/
mutex_lock(&dpm_list_mtx);
if (!device_pm_initialized(dev)) {
mutex_unlock(&dpm_list_mtx);
return false;
}
parent = get_device(dev->parent);
mutex_unlock(&dpm_list_mtx);
dpm_wait(parent, async);
put_device(parent);
dpm_wait_for_suppliers(dev, async);
/*
* If the parent's callback has deleted the device, attempting to resume
* it would be invalid, so avoid doing that then.
*/
return device_pm_initialized(dev);
}
static void dpm_wait_for_consumers(struct device *dev, bool async)
{
struct device_link *link;
int idx;
idx = device_links_read_lock();
/*
* The status of a device link can only be changed from "dormant" by a
* probe, but that cannot happen during system suspend/resume. In
* theory it can change to "dormant" at that time, but then it is
* reasonable to wait for the target device anyway (eg. if it goes
* away, it's better to wait for it to go away completely and then
* continue instead of trying to continue in parallel with its
* unregistration).
*/
list_for_each_entry_rcu_locked(link, &dev->links.consumers, s_node)
if (READ_ONCE(link->status) != DL_STATE_DORMANT)
dpm_wait(link->consumer, async);
device_links_read_unlock(idx);
}
static void dpm_wait_for_subordinate(struct device *dev, bool async)
{
dpm_wait_for_children(dev, async);
dpm_wait_for_consumers(dev, async);
}
/**
* pm_op - Return the PM operation appropriate for given PM event.
* @ops: PM operations to choose from.
* @state: PM transition of the system being carried out.
*/
static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state)
{
switch (state.event) {
#ifdef CONFIG_SUSPEND
case PM_EVENT_SUSPEND:
return ops->suspend;
case PM_EVENT_RESUME:
return ops->resume;
#endif /* CONFIG_SUSPEND */
#ifdef CONFIG_HIBERNATE_CALLBACKS
case PM_EVENT_FREEZE:
case PM_EVENT_QUIESCE:
return ops->freeze;
case PM_EVENT_HIBERNATE:
return ops->poweroff;
case PM_EVENT_THAW:
case PM_EVENT_RECOVER:
return ops->thaw;
case PM_EVENT_RESTORE:
return ops->restore;
#endif /* CONFIG_HIBERNATE_CALLBACKS */
}
return NULL;
}
/**
* pm_late_early_op - Return the PM operation appropriate for given PM event.
* @ops: PM operations to choose from.
* @state: PM transition of the system being carried out.
*
* Runtime PM is disabled for @dev while this function is being executed.
*/
static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops,
pm_message_t state)
{
switch (state.event) {
#ifdef CONFIG_SUSPEND
case PM_EVENT_SUSPEND:
return ops->suspend_late;
case PM_EVENT_RESUME:
return ops->resume_early;
#endif /* CONFIG_SUSPEND */
#ifdef CONFIG_HIBERNATE_CALLBACKS
case PM_EVENT_FREEZE:
case PM_EVENT_QUIESCE:
return ops->freeze_late;
case PM_EVENT_HIBERNATE:
return ops->poweroff_late;
case PM_EVENT_THAW:
case PM_EVENT_RECOVER:
return ops->thaw_early;
case PM_EVENT_RESTORE:
return ops->restore_early;
#endif /* CONFIG_HIBERNATE_CALLBACKS */
}
return NULL;
}
/**
* pm_noirq_op - Return the PM operation appropriate for given PM event.
* @ops: PM operations to choose from.
* @state: PM transition of the system being carried out.
*
* The driver of @dev will not receive interrupts while this function is being
* executed.
*/
static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t state)
{
switch (state.event) {
#ifdef CONFIG_SUSPEND
case PM_EVENT_SUSPEND:
return ops->suspend_noirq;
case PM_EVENT_RESUME:
return ops->resume_noirq;
#endif /* CONFIG_SUSPEND */
#ifdef CONFIG_HIBERNATE_CALLBACKS
case PM_EVENT_FREEZE:
case PM_EVENT_QUIESCE:
return ops->freeze_noirq;
case PM_EVENT_HIBERNATE:
return ops->poweroff_noirq;
case PM_EVENT_THAW:
case PM_EVENT_RECOVER:
return ops->thaw_noirq;
case PM_EVENT_RESTORE:
return ops->restore_noirq;
#endif /* CONFIG_HIBERNATE_CALLBACKS */
}
return NULL;
}
static void pm_dev_dbg(struct device *dev, pm_message_t state, const char *info)
{
dev_dbg(dev, "%s%s%s driver flags: %x\n", info, pm_verb(state.event),
((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ?
", may wakeup" : "", dev->power.driver_flags);
}
static void pm_dev_err(struct device *dev, pm_message_t state, const char *info,
int error)
{
dev_err(dev, "failed to %s%s: error %d\n", pm_verb(state.event), info,
error);
}
static void dpm_show_time(ktime_t starttime, pm_message_t state, int error,
const char *info)
{
ktime_t calltime;
u64 usecs64;
int usecs;
calltime = ktime_get();
usecs64 = ktime_to_ns(ktime_sub(calltime, starttime));
do_div(usecs64, NSEC_PER_USEC);
usecs = usecs64;
if (usecs == 0)
usecs = 1;
pm_pr_dbg("%s%s%s of devices %s after %ld.%03ld msecs\n",
info ?: "", info ? " " : "", pm_verb(state.event),
error ? "aborted" : "complete",
usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC);
}
static int dpm_run_callback(pm_callback_t cb, struct device *dev,
pm_message_t state, const char *info)
{
ktime_t calltime;
int error;
if (!cb)
return 0;
calltime = initcall_debug_start(dev, cb);
pm_dev_dbg(dev, state, info);
trace_device_pm_callback_start(dev, info, state.event);
error = cb(dev);
trace_device_pm_callback_end(dev, error);
suspend_report_result(cb, error);
initcall_debug_report(dev, calltime, cb, error);
return error;
}
#ifdef CONFIG_DPM_WATCHDOG
struct dpm_watchdog {
struct device *dev;
struct task_struct *tsk;
struct timer_list timer;
};
#define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \
struct dpm_watchdog wd
/**
* dpm_watchdog_handler - Driver suspend / resume watchdog handler.
* @t: The timer that PM watchdog depends on.
*
* Called when a driver has timed out suspending or resuming.
* There's not much we can do here to recover so panic() to
* capture a crash-dump in pstore.
*/
static void dpm_watchdog_handler(struct timer_list *t)
{
struct dpm_watchdog *wd = from_timer(wd, t, timer);
dev_emerg(wd->dev, "**** DPM device timeout ****\n");
show_stack(wd->tsk, NULL, KERN_EMERG);
panic("%s %s: unrecoverable failure\n",
dev_driver_string(wd->dev), dev_name(wd->dev));
}
/**
* dpm_watchdog_set - Enable pm watchdog for given device.
* @wd: Watchdog. Must be allocated on the stack.
* @dev: Device to handle.
*/
static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev)
{
struct timer_list *timer = &wd->timer;
wd->dev = dev;
wd->tsk = current;
timer_setup_on_stack(timer, dpm_watchdog_handler, 0);
/* use same timeout value for both suspend and resume */
timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT;
add_timer(timer);
}
/**
* dpm_watchdog_clear - Disable suspend/resume watchdog.
* @wd: Watchdog to disable.
*/
static void dpm_watchdog_clear(struct dpm_watchdog *wd)
{
struct timer_list *timer = &wd->timer;
del_timer_sync(timer);
destroy_timer_on_stack(timer);
}
#else
#define DECLARE_DPM_WATCHDOG_ON_STACK(wd)
#define dpm_watchdog_set(x, y)
#define dpm_watchdog_clear(x)
#endif
/*------------------------- Resume routines -------------------------*/
/**
* dev_pm_skip_resume - System-wide device resume optimization check.
* @dev: Target device.
*
* Return:
* - %false if the transition under way is RESTORE.
* - Return value of dev_pm_skip_suspend() if the transition under way is THAW.
* - The logical negation of %power.must_resume otherwise (that is, when the
* transition under way is RESUME).
*/
bool dev_pm_skip_resume(struct device *dev)
{
if (pm_transition.event == PM_EVENT_RESTORE)
return false;
if (pm_transition.event == PM_EVENT_THAW)
return dev_pm_skip_suspend(dev);
return !dev->power.must_resume;
}
/**
* device_resume_noirq - Execute a "noirq resume" callback for given device.
* @dev: Device to handle.
* @state: PM transition of the system being carried out.
* @async: If true, the device is being resumed asynchronously.
*
* The driver of @dev will not receive interrupts while this function is being
* executed.
*/
static int device_resume_noirq(struct device *dev, pm_message_t state, bool async)
{
pm_callback_t callback = NULL;
const char *info = NULL;
bool skip_resume;
int error = 0;
TRACE_DEVICE(dev);
TRACE_RESUME(0);
if (dev->power.syscore || dev->power.direct_complete)
goto Out;
if (!dev->power.is_noirq_suspended)
goto Out;
if (!dpm_wait_for_superior(dev, async))
goto Out;
skip_resume = dev_pm_skip_resume(dev);
/*
* If the driver callback is skipped below or by the middle layer
* callback and device_resume_early() also skips the driver callback for
* this device later, it needs to appear as "suspended" to PM-runtime,
* so change its status accordingly.
*
* Otherwise, the device is going to be resumed, so set its PM-runtime
* status to "active", but do that only if DPM_FLAG_SMART_SUSPEND is set
* to avoid confusing drivers that don't use it.
*/
if (skip_resume)
pm_runtime_set_suspended(dev);
else if (dev_pm_skip_suspend(dev))
pm_runtime_set_active(dev);
if (dev->pm_domain) {
info = "noirq power domain ";
callback = pm_noirq_op(&dev->pm_domain->ops, state);
} else if (dev->type && dev->type->pm) {
info = "noirq type ";
callback = pm_noirq_op(dev->type->pm, state);
} else if (dev->class && dev->class->pm) {
info = "noirq class ";
callback = pm_noirq_op(dev->class->pm, state);
} else if (dev->bus && dev->bus->pm) {
info = "noirq bus ";
callback = pm_noirq_op(dev->bus->pm, state);
}
if (callback)
goto Run;
if (skip_resume)
goto Skip;
if (dev->driver && dev->driver->pm) {
info = "noirq driver ";
callback = pm_noirq_op(dev->driver->pm, state);
}
Run:
error = dpm_run_callback(callback, dev, state, info);
Skip:
dev->power.is_noirq_suspended = false;
Out:
complete_all(&dev->power.completion);
TRACE_RESUME(error);
return error;
}
static bool is_async(struct device *dev)
{
return dev->power.async_suspend && pm_async_enabled
&& !pm_trace_is_enabled();
}
static bool dpm_async_fn(struct device *dev, async_func_t func)
{
reinit_completion(&dev->power.completion);
if (is_async(dev)) {
get_device(dev);
async_schedule_dev(func, dev);
return true;
}
return false;
}
static void async_resume_noirq(void *data, async_cookie_t cookie)
{
struct device *dev = (struct device *)data;
int error;
error = device_resume_noirq(dev, pm_transition, true);
if (error)
pm_dev_err(dev, pm_transition, " async", error);
put_device(dev);
}
static void dpm_noirq_resume_devices(pm_message_t state)
{
struct device *dev;
ktime_t starttime = ktime_get();
trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true);
mutex_lock(&dpm_list_mtx);
pm_transition = state;
/*
* Advanced the async threads upfront,
* in case the starting of async threads is
* delayed by non-async resuming devices.
*/
list_for_each_entry(dev, &dpm_noirq_list, power.entry)
dpm_async_fn(dev, async_resume_noirq);
while (!list_empty(&dpm_noirq_list)) {
dev = to_device(dpm_noirq_list.next);
get_device(dev);
list_move_tail(&dev->power.entry, &dpm_late_early_list);
mutex_unlock(&dpm_list_mtx);
if (!is_async(dev)) {
int error;
error = device_resume_noirq(dev, state, false);
if (error) {
suspend_stats.failed_resume_noirq++;
dpm_save_failed_step(SUSPEND_RESUME_NOIRQ);
dpm_save_failed_dev(dev_name(dev));
pm_dev_err(dev, state, " noirq", error);
}
}
put_device(dev);
mutex_lock(&dpm_list_mtx);
}
mutex_unlock(&dpm_list_mtx);
async_synchronize_full();
dpm_show_time(starttime, state, 0, "noirq");
trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false);
}
/**
* dpm_resume_noirq - Execute "noirq resume" callbacks for all devices.
* @state: PM transition of the system being carried out.
*
* Invoke the "noirq" resume callbacks for all devices in dpm_noirq_list and
* allow device drivers' interrupt handlers to be called.
*/
void dpm_resume_noirq(pm_message_t state)
{
dpm_noirq_resume_devices(state);
resume_device_irqs();
device_wakeup_disarm_wake_irqs();
cpuidle_resume();
}
/**
* device_resume_early - Execute an "early resume" callback for given device.
* @dev: Device to handle.
* @state: PM transition of the system being carried out.
* @async: If true, the device is being resumed asynchronously.
*
* Runtime PM is disabled for @dev while this function is being executed.
*/
static int device_resume_early(struct device *dev, pm_message_t state, bool async)
{
pm_callback_t callback = NULL;
const char *info = NULL;
int error = 0;
TRACE_DEVICE(dev);
TRACE_RESUME(0);
if (dev->power.syscore || dev->power.direct_complete)
goto Out;
if (!dev->power.is_late_suspended)
goto Out;
if (!dpm_wait_for_superior(dev, async))
goto Out;
if (dev->pm_domain) {
info = "early power domain ";
callback = pm_late_early_op(&dev->pm_domain->ops, state);
} else if (dev->type && dev->type->pm) {
info = "early type ";
callback = pm_late_early_op(dev->type->pm, state);
} else if (dev->class && dev->class->pm) {
info = "early class ";
callback = pm_late_early_op(dev->class->pm, state);
} else if (dev->bus && dev->bus->pm) {
info = "early bus ";
callback = pm_late_early_op(dev->bus->pm, state);
}
if (callback)
goto Run;
if (dev_pm_skip_resume(dev))
goto Skip;
if (dev->driver && dev->driver->pm) {
info = "early driver ";
callback = pm_late_early_op(dev->driver->pm, state);
}
Run:
error = dpm_run_callback(callback, dev, state, info);
Skip:
dev->power.is_late_suspended = false;
Out:
TRACE_RESUME(error);
pm_runtime_enable(dev);
complete_all(&dev->power.completion);
return error;
}
static void async_resume_early(void *data, async_cookie_t cookie)
{
struct device *dev = (struct device *)data;
int error;
error = device_resume_early(dev, pm_transition, true);
if (error)
pm_dev_err(dev, pm_transition, " async", error);
put_device(dev);
}
/**
* dpm_resume_early - Execute "early resume" callbacks for all devices.
* @state: PM transition of the system being carried out.
*/
void dpm_resume_early(pm_message_t state)
{
struct device *dev;
ktime_t starttime = ktime_get();
trace_suspend_resume(TPS("dpm_resume_early"), state.event, true);
mutex_lock(&dpm_list_mtx);
pm_transition = state;
/*
* Advanced the async threads upfront,
* in case the starting of async threads is
* delayed by non-async resuming devices.
*/
list_for_each_entry(dev, &dpm_late_early_list, power.entry)
dpm_async_fn(dev, async_resume_early);
while (!list_empty(&dpm_late_early_list)) {
dev = to_device(dpm_late_early_list.next);
get_device(dev);
list_move_tail(&dev->power.entry, &dpm_suspended_list);
mutex_unlock(&dpm_list_mtx);
if (!is_async(dev)) {
int error;
error = device_resume_early(dev, state, false);
if (error) {
suspend_stats.failed_resume_early++;
dpm_save_failed_step(SUSPEND_RESUME_EARLY);
dpm_save_failed_dev(dev_name(dev));
pm_dev_err(dev, state, " early", error);
}
}
put_device(dev);
mutex_lock(&dpm_list_mtx);
}
mutex_unlock(&dpm_list_mtx);
async_synchronize_full();
dpm_show_time(starttime, state, 0, "early");
trace_suspend_resume(TPS("dpm_resume_early"), state.event, false);
}
/**
* dpm_resume_start - Execute "noirq" and "early" device callbacks.
* @state: PM transition of the system being carried out.
*/
void dpm_resume_start(pm_message_t state)
{
dpm_resume_noirq(state);
dpm_resume_early(state);
}
EXPORT_SYMBOL_GPL(dpm_resume_start);
/**
* device_resume - Execute "resume" callbacks for given device.
* @dev: Device to handle.
* @state: PM transition of the system being carried out.
* @async: If true, the device is being resumed asynchronously.
*/
static int device_resume(struct device *dev, pm_message_t state, bool async)
{
pm_callback_t callback = NULL;
const char *info = NULL;
int error = 0;
DECLARE_DPM_WATCHDOG_ON_STACK(wd);
TRACE_DEVICE(dev);
TRACE_RESUME(0);
if (dev->power.syscore)
goto Complete;
if (dev->power.direct_complete) {
/* Match the pm_runtime_disable() in __device_suspend(). */
pm_runtime_enable(dev);
goto Complete;
}
if (!dpm_wait_for_superior(dev, async))
goto Complete;
dpm_watchdog_set(&wd, dev);
device_lock(dev);
/*
* This is a fib. But we'll allow new children to be added below
* a resumed device, even if the device hasn't been completed yet.
*/
dev->power.is_prepared = false;
if (!dev->power.is_suspended)
goto Unlock;
if (dev->pm_domain) {
info = "power domain ";
callback = pm_op(&dev->pm_domain->ops, state);
goto Driver;
}
if (dev->type && dev->type->pm) {
info = "type ";
callback = pm_op(dev->type->pm, state);
goto Driver;
}
if (dev->class && dev->class->pm) {
info = "class ";
callback = pm_op(dev->class->pm, state);
goto Driver;
}
if (dev->bus) {
if (dev->bus->pm) {
info = "bus ";
callback = pm_op(dev->bus->pm, state);
} else if (dev->bus->resume) {
info = "legacy bus ";
callback = dev->bus->resume;
goto End;
}
}
Driver:
if (!callback && dev->driver && dev->driver->pm) {
info = "driver ";
callback = pm_op(dev->driver->pm, state);
}
End:
error = dpm_run_callback(callback, dev, state, info);
dev->power.is_suspended = false;
Unlock:
device_unlock(dev);
dpm_watchdog_clear(&wd);
Complete:
complete_all(&dev->power.completion);
TRACE_RESUME(error);
return error;
}
static void async_resume(void *data, async_cookie_t cookie)
{
struct device *dev = (struct device *)data;
int error;
error = device_resume(dev, pm_transition, true);
if (error)
pm_dev_err(dev, pm_transition, " async", error);
put_device(dev);
}
/**
* dpm_resume - Execute "resume" callbacks for non-sysdev devices.
* @state: PM transition of the system being carried out.
*
* Execute the appropriate "resume" callback for all devices whose status
* indicates that they are suspended.
*/
void dpm_resume(pm_message_t state)
{
struct device *dev;
ktime_t starttime = ktime_get();
trace_suspend_resume(TPS("dpm_resume"), state.event, true);
might_sleep();
mutex_lock(&dpm_list_mtx);
pm_transition = state;
async_error = 0;
list_for_each_entry(dev, &dpm_suspended_list, power.entry)
dpm_async_fn(dev, async_resume);
while (!list_empty(&dpm_suspended_list)) {
dev = to_device(dpm_suspended_list.next);
get_device(dev);
if (!is_async(dev)) {
int error;
mutex_unlock(&dpm_list_mtx);
error = device_resume(dev, state, false);
if (error) {
suspend_stats.failed_resume++;
dpm_save_failed_step(SUSPEND_RESUME);
dpm_save_failed_dev(dev_name(dev));
pm_dev_err(dev, state, "", error);
}
mutex_lock(&dpm_list_mtx);
}
if (!list_empty(&dev->power.entry))
list_move_tail(&dev->power.entry, &dpm_prepared_list);
mutex_unlock(&dpm_list_mtx);
put_device(dev);
mutex_lock(&dpm_list_mtx);
}
mutex_unlock(&dpm_list_mtx);
async_synchronize_full();
dpm_show_time(starttime, state, 0, NULL);
cpufreq_resume();
devfreq_resume();
trace_suspend_resume(TPS("dpm_resume"), state.event, false);
}
/**
* device_complete - Complete a PM transition for given device.
* @dev: Device to handle.
* @state: PM transition of the system being carried out.
*/
static void device_complete(struct device *dev, pm_message_t state)
{
void (*callback)(struct device *) = NULL;
const char *info = NULL;
if (dev->power.syscore)
goto out;
device_lock(dev);
if (dev->pm_domain) {
info = "completing power domain ";
callback = dev->pm_domain->ops.complete;
} else if (dev->type && dev->type->pm) {
info = "completing type ";
callback = dev->type->pm->complete;
} else if (dev->class && dev->class->pm) {
info = "completing class ";
callback = dev->class->pm->complete;
} else if (dev->bus && dev->bus->pm) {
info = "completing bus ";
callback = dev->bus->pm->complete;
}
if (!callback && dev->driver && dev->driver->pm) {
info = "completing driver ";
callback = dev->driver->pm->complete;
}
if (callback) {
pm_dev_dbg(dev, state, info);
callback(dev);
}
device_unlock(dev);
out:
pm_runtime_put(dev);
}
/**
* dpm_complete - Complete a PM transition for all non-sysdev devices.
* @state: PM transition of the system being carried out.
*
* Execute the ->complete() callbacks for all devices whose PM status is not
* DPM_ON (this allows new devices to be registered).
*/
void dpm_complete(pm_message_t state)
{
struct list_head list;
trace_suspend_resume(TPS("dpm_complete"), state.event, true);
might_sleep();
INIT_LIST_HEAD(&list);
mutex_lock(&dpm_list_mtx);
while (!list_empty(&dpm_prepared_list)) {
struct device *dev = to_device(dpm_prepared_list.prev);
get_device(dev);
dev->power.is_prepared = false;
list_move(&dev->power.entry, &list);
mutex_unlock(&dpm_list_mtx);
trace_device_pm_callback_start(dev, "", state.event);
device_complete(dev, state);
trace_device_pm_callback_end(dev, 0);
put_device(dev);
mutex_lock(&dpm_list_mtx);
}
list_splice(&list, &dpm_list);
mutex_unlock(&dpm_list_mtx);
/* Allow device probing and trigger re-probing of deferred devices */
device_unblock_probing();
trace_suspend_resume(TPS("dpm_complete"), state.event, false);
}
/**
* dpm_resume_end - Execute "resume" callbacks and complete system transition.
* @state: PM transition of the system being carried out.
*
* Execute "resume" callbacks for all devices and complete the PM transition of
* the system.
*/
void dpm_resume_end(pm_message_t state)
{
dpm_resume(state);
dpm_complete(state);
}
EXPORT_SYMBOL_GPL(dpm_resume_end);
/*------------------------- Suspend routines -------------------------*/
/**
* resume_event - Return a "resume" message for given "suspend" sleep state.
* @sleep_state: PM message representing a sleep state.
*
* Return a PM message representing the resume event corresponding to given
* sleep state.
*/
static pm_message_t resume_event(pm_message_t sleep_state)
{
switch (sleep_state.event) {
case PM_EVENT_SUSPEND:
return PMSG_RESUME;
case PM_EVENT_FREEZE:
case PM_EVENT_QUIESCE:
return PMSG_RECOVER;
case PM_EVENT_HIBERNATE:
return PMSG_RESTORE;
}
return PMSG_ON;
}
static void dpm_superior_set_must_resume(struct device *dev)
{
struct device_link *link;
int idx;
if (dev->parent)
dev->parent->power.must_resume = true;
idx = device_links_read_lock();
list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node)
link->supplier->power.must_resume = true;
device_links_read_unlock(idx);
}
/**
* __device_suspend_noirq - Execute a "noirq suspend" callback for given device.
* @dev: Device to handle.
* @state: PM transition of the system being carried out.
* @async: If true, the device is being suspended asynchronously.
*
* The driver of @dev will not receive interrupts while this function is being
* executed.
*/
static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool async)
{
pm_callback_t callback = NULL;
const char *info = NULL;
int error = 0;
TRACE_DEVICE(dev);
TRACE_SUSPEND(0);
dpm_wait_for_subordinate(dev, async);
if (async_error)
goto Complete;
if (dev->power.syscore || dev->power.direct_complete)
goto Complete;
if (dev->pm_domain) {
info = "noirq power domain ";
callback = pm_noirq_op(&dev->pm_domain->ops, state);
} else if (dev->type && dev->type->pm) {
info = "noirq type ";
callback = pm_noirq_op(dev->type->pm, state);
} else if (dev->class && dev->class->pm) {
info = "noirq class ";
callback = pm_noirq_op(dev->class->pm, state);
} else if (dev->bus && dev->bus->pm) {
info = "noirq bus ";
callback = pm_noirq_op(dev->bus->pm, state);
}
if (callback)
goto Run;
if (dev_pm_skip_suspend(dev))
goto Skip;
if (dev->driver && dev->driver->pm) {
info = "noirq driver ";
callback = pm_noirq_op(dev->driver->pm, state);
}
Run:
error = dpm_run_callback(callback, dev, state, info);
if (error) {
async_error = error;
goto Complete;
}
Skip:
dev->power.is_noirq_suspended = true;
/*
* Skipping the resume of devices that were in use right before the
* system suspend (as indicated by their PM-runtime usage counters)
* would be suboptimal. Also resume them if doing that is not allowed
* to be skipped.
*/
if (atomic_read(&dev->power.usage_count) > 1 ||
!(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) &&
dev->power.may_skip_resume))
dev->power.must_resume = true;
if (dev->power.must_resume)
dpm_superior_set_must_resume(dev);
Complete:
complete_all(&dev->power.completion);
TRACE_SUSPEND(error);
return error;
}
static void async_suspend_noirq(void *data, async_cookie_t cookie)
{
struct device *dev = (struct device *)data;
int error;
error = __device_suspend_noirq(dev, pm_transition, true);
if (error) {
dpm_save_failed_dev(dev_name(dev));
pm_dev_err(dev, pm_transition, " async", error);
}
put_device(dev);
}
static int device_suspend_noirq(struct device *dev)
{
if (dpm_async_fn(dev, async_suspend_noirq))
return 0;
return __device_suspend_noirq(dev, pm_transition, false);
}
static int dpm_noirq_suspend_devices(pm_message_t state)
{
ktime_t starttime = ktime_get();
int error = 0;
trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true);
mutex_lock(&dpm_list_mtx);
pm_transition = state;
async_error = 0;
while (!list_empty(&dpm_late_early_list)) {
struct device *dev = to_device(dpm_late_early_list.prev);
get_device(dev);
mutex_unlock(&dpm_list_mtx);
error = device_suspend_noirq(dev);
mutex_lock(&dpm_list_mtx);
if (error) {
pm_dev_err(dev, state, " noirq", error);
dpm_save_failed_dev(dev_name(dev));
} else if (!list_empty(&dev->power.entry)) {
list_move(&dev->power.entry, &dpm_noirq_list);
}
mutex_unlock(&dpm_list_mtx);
put_device(dev);
mutex_lock(&dpm_list_mtx);
if (error || async_error)
break;
}
mutex_unlock(&dpm_list_mtx);
async_synchronize_full();
if (!error)
error = async_error;
if (error) {
suspend_stats.failed_suspend_noirq++;
dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ);
}
dpm_show_time(starttime, state, error, "noirq");
trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false);
return error;
}
/**
* dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices.
* @state: PM transition of the system being carried out.
*
* Prevent device drivers' interrupt handlers from being called and invoke
* "noirq" suspend callbacks for all non-sysdev devices.
*/
int dpm_suspend_noirq(pm_message_t state)
{
int ret;
cpuidle_pause();
device_wakeup_arm_wake_irqs();
suspend_device_irqs();
ret = dpm_noirq_suspend_devices(state);
if (ret)
dpm_resume_noirq(resume_event(state));
return ret;
}
static void dpm_propagate_wakeup_to_parent(struct device *dev)
{
struct device *parent = dev->parent;
if (!parent)
return;
spin_lock_irq(&parent->power.lock);
if (device_wakeup_path(dev) && !parent->power.ignore_children)
parent->power.wakeup_path = true;
spin_unlock_irq(&parent->power.lock);
}
/**
* __device_suspend_late - Execute a "late suspend" callback for given device.
* @dev: Device to handle.
* @state: PM transition of the system being carried out.
* @async: If true, the device is being suspended asynchronously.
*
* Runtime PM is disabled for @dev while this function is being executed.
*/
static int __device_suspend_late(struct device *dev, pm_message_t state, bool async)
{
pm_callback_t callback = NULL;
const char *info = NULL;
int error = 0;
TRACE_DEVICE(dev);
TRACE_SUSPEND(0);
__pm_runtime_disable(dev, false);
dpm_wait_for_subordinate(dev, async);
if (async_error)
goto Complete;
if (pm_wakeup_pending()) {
async_error = -EBUSY;
goto Complete;
}
if (dev->power.syscore || dev->power.direct_complete)
goto Complete;
if (dev->pm_domain) {
info = "late power domain ";
callback = pm_late_early_op(&dev->pm_domain->ops, state);
} else if (dev->type && dev->type->pm) {
info = "late type ";
callback = pm_late_early_op(dev->type->pm, state);
} else if (dev->class && dev->class->pm) {
info = "late class ";
callback = pm_late_early_op(dev->class->pm, state);
} else if (dev->bus && dev->bus->pm) {
info = "late bus ";
callback = pm_late_early_op(dev->bus->pm, state);
}
if (callback)
goto Run;
if (dev_pm_skip_suspend(dev))
goto Skip;
if (dev->driver && dev->driver->pm) {
info = "late driver ";
callback = pm_late_early_op(dev->driver->pm, state);
}
Run:
error = dpm_run_callback(callback, dev, state, info);
if (error) {
async_error = error;
goto Complete;
}
dpm_propagate_wakeup_to_parent(dev);
Skip:
dev->power.is_late_suspended = true;
Complete:
TRACE_SUSPEND(error);
complete_all(&dev->power.completion);
return error;
}
static void async_suspend_late(void *data, async_cookie_t cookie)
{
struct device *dev = (struct device *)data;
int error;
error = __device_suspend_late(dev, pm_transition, true);
if (error) {
dpm_save_failed_dev(dev_name(dev));
pm_dev_err(dev, pm_transition, " async", error);
}
put_device(dev);
}
static int device_suspend_late(struct device *dev)
{
if (dpm_async_fn(dev, async_suspend_late))
return 0;
return __device_suspend_late(dev, pm_transition, false);
}
/**
* dpm_suspend_late - Execute "late suspend" callbacks for all devices.
* @state: PM transition of the system being carried out.
*/
int dpm_suspend_late(pm_message_t state)
{
ktime_t starttime = ktime_get();
int error = 0;
trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true);
mutex_lock(&dpm_list_mtx);
pm_transition = state;
async_error = 0;
while (!list_empty(&dpm_suspended_list)) {
struct device *dev = to_device(dpm_suspended_list.prev);
get_device(dev);
mutex_unlock(&dpm_list_mtx);
error = device_suspend_late(dev);
mutex_lock(&dpm_list_mtx);
if (!list_empty(&dev->power.entry))
list_move(&dev->power.entry, &dpm_late_early_list);
if (error) {
pm_dev_err(dev, state, " late", error);
dpm_save_failed_dev(dev_name(dev));
}
mutex_unlock(&dpm_list_mtx);
put_device(dev);
mutex_lock(&dpm_list_mtx);
if (error || async_error)
break;
}
mutex_unlock(&dpm_list_mtx);
async_synchronize_full();
if (!error)
error = async_error;
if (error) {
suspend_stats.failed_suspend_late++;
dpm_save_failed_step(SUSPEND_SUSPEND_LATE);
dpm_resume_early(resume_event(state));
}
dpm_show_time(starttime, state, error, "late");
trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false);
return error;
}
/**
* dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks.
* @state: PM transition of the system being carried out.
*/
int dpm_suspend_end(pm_message_t state)
{
ktime_t starttime = ktime_get();
int error;
error = dpm_suspend_late(state);
if (error)
goto out;
error = dpm_suspend_noirq(state);
if (error)
dpm_resume_early(resume_event(state));
out:
dpm_show_time(starttime, state, error, "end");
return error;
}
EXPORT_SYMBOL_GPL(dpm_suspend_end);
/**
* legacy_suspend - Execute a legacy (bus or class) suspend callback for device.
* @dev: Device to suspend.
* @state: PM transition of the system being carried out.
* @cb: Suspend callback to execute.
* @info: string description of caller.
*/
static int legacy_suspend(struct device *dev, pm_message_t state,
int (*cb)(struct device *dev, pm_message_t state),
const char *info)
{
int error;
ktime_t calltime;
calltime = initcall_debug_start(dev, cb);
trace_device_pm_callback_start(dev, info, state.event);
error = cb(dev, state);
trace_device_pm_callback_end(dev, error);
suspend_report_result(cb, error);
initcall_debug_report(dev, calltime, cb, error);
return error;
}
static void dpm_clear_superiors_direct_complete(struct device *dev)
{
struct device_link *link;
int idx;
if (dev->parent) {
spin_lock_irq(&dev->parent->power.lock);
dev->parent->power.direct_complete = false;
spin_unlock_irq(&dev->parent->power.lock);
}
idx = device_links_read_lock();
list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) {
spin_lock_irq(&link->supplier->power.lock);
link->supplier->power.direct_complete = false;
spin_unlock_irq(&link->supplier->power.lock);
}
device_links_read_unlock(idx);
}
/**
* __device_suspend - Execute "suspend" callbacks for given device.
* @dev: Device to handle.
* @state: PM transition of the system being carried out.
* @async: If true, the device is being suspended asynchronously.
*/
static int __device_suspend(struct device *dev, pm_message_t state, bool async)
{
pm_callback_t callback = NULL;
const char *info = NULL;
int error = 0;
DECLARE_DPM_WATCHDOG_ON_STACK(wd);
TRACE_DEVICE(dev);
TRACE_SUSPEND(0);
dpm_wait_for_subordinate(dev, async);
if (async_error) {
dev->power.direct_complete = false;
goto Complete;
}
/*
* Wait for possible runtime PM transitions of the device in progress
* to complete and if there's a runtime resume request pending for it,
* resume it before proceeding with invoking the system-wide suspend
* callbacks for it.
*
* If the system-wide suspend callbacks below change the configuration
* of the device, they must disable runtime PM for it or otherwise
* ensure that its runtime-resume callbacks will not be confused by that
* change in case they are invoked going forward.
*/
pm_runtime_barrier(dev);
if (pm_wakeup_pending()) {
dev->power.direct_complete = false;
async_error = -EBUSY;
goto Complete;
}
if (dev->power.syscore)
goto Complete;
/* Avoid direct_complete to let wakeup_path propagate. */
if (device_may_wakeup(dev) || device_wakeup_path(dev))
dev->power.direct_complete = false;
if (dev->power.direct_complete) {
if (pm_runtime_status_suspended(dev)) {
pm_runtime_disable(dev);
if (pm_runtime_status_suspended(dev)) {
pm_dev_dbg(dev, state, "direct-complete ");
goto Complete;
}
pm_runtime_enable(dev);
}
dev->power.direct_complete = false;
}
dev->power.may_skip_resume = true;
dev->power.must_resume = !dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME);
dpm_watchdog_set(&wd, dev);
device_lock(dev);
if (dev->pm_domain) {
info = "power domain ";
callback = pm_op(&dev->pm_domain->ops, state);
goto Run;
}
if (dev->type && dev->type->pm) {
info = "type ";
callback = pm_op(dev->type->pm, state);
goto Run;
}
if (dev->class && dev->class->pm) {
info = "class ";
callback = pm_op(dev->class->pm, state);
goto Run;
}
if (dev->bus) {
if (dev->bus->pm) {
info = "bus ";
callback = pm_op(dev->bus->pm, state);
} else if (dev->bus->suspend) {
pm_dev_dbg(dev, state, "legacy bus ");
error = legacy_suspend(dev, state, dev->bus->suspend,
"legacy bus ");
goto End;
}
}
Run:
if (!callback && dev->driver && dev->driver->pm) {
info = "driver ";
callback = pm_op(dev->driver->pm, state);
}
error = dpm_run_callback(callback, dev, state, info);
End:
if (!error) {
dev->power.is_suspended = true;
if (device_may_wakeup(dev))
dev->power.wakeup_path = true;
dpm_propagate_wakeup_to_parent(dev);
dpm_clear_superiors_direct_complete(dev);
}
device_unlock(dev);
dpm_watchdog_clear(&wd);
Complete:
if (error)
async_error = error;
complete_all(&dev->power.completion);
TRACE_SUSPEND(error);
return error;
}
static void async_suspend(void *data, async_cookie_t cookie)
{
struct device *dev = (struct device *)data;
int error;
error = __device_suspend(dev, pm_transition, true);
if (error) {
dpm_save_failed_dev(dev_name(dev));
pm_dev_err(dev, pm_transition, " async", error);
}
put_device(dev);
}
static int device_suspend(struct device *dev)
{
if (dpm_async_fn(dev, async_suspend))
return 0;
return __device_suspend(dev, pm_transition, false);
}
/**
* dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices.
* @state: PM transition of the system being carried out.
*/
int dpm_suspend(pm_message_t state)
{
ktime_t starttime = ktime_get();
int error = 0;
trace_suspend_resume(TPS("dpm_suspend"), state.event, true);
might_sleep();
devfreq_suspend();
cpufreq_suspend();
mutex_lock(&dpm_list_mtx);
pm_transition = state;
async_error = 0;
while (!list_empty(&dpm_prepared_list)) {
struct device *dev = to_device(dpm_prepared_list.prev);
get_device(dev);
mutex_unlock(&dpm_list_mtx);
error = device_suspend(dev);
mutex_lock(&dpm_list_mtx);
if (error) {
pm_dev_err(dev, state, "", error);
dpm_save_failed_dev(dev_name(dev));
} else if (!list_empty(&dev->power.entry)) {
list_move(&dev->power.entry, &dpm_suspended_list);
}
mutex_unlock(&dpm_list_mtx);
put_device(dev);
mutex_lock(&dpm_list_mtx);
if (error || async_error)
break;
}
mutex_unlock(&dpm_list_mtx);
async_synchronize_full();
if (!error)
error = async_error;
if (error) {
suspend_stats.failed_suspend++;
dpm_save_failed_step(SUSPEND_SUSPEND);
}
dpm_show_time(starttime, state, error, NULL);
trace_suspend_resume(TPS("dpm_suspend"), state.event, false);
return error;
}
/**
* device_prepare - Prepare a device for system power transition.
* @dev: Device to handle.
* @state: PM transition of the system being carried out.
*
* Execute the ->prepare() callback(s) for given device. No new children of the
* device may be registered after this function has returned.
*/
static int device_prepare(struct device *dev, pm_message_t state)
{
int (*callback)(struct device *) = NULL;
int ret = 0;
/*
* If a device's parent goes into runtime suspend at the wrong time,
* it won't be possible to resume the device. To prevent this we
* block runtime suspend here, during the prepare phase, and allow
* it again during the complete phase.
*/
pm_runtime_get_noresume(dev);
if (dev->power.syscore)
return 0;
device_lock(dev);
dev->power.wakeup_path = false;
if (dev->power.no_pm_callbacks)
goto unlock;
if (dev->pm_domain)
callback = dev->pm_domain->ops.prepare;
else if (dev->type && dev->type->pm)
callback = dev->type->pm->prepare;
else if (dev->class && dev->class->pm)
callback = dev->class->pm->prepare;
else if (dev->bus && dev->bus->pm)
callback = dev->bus->pm->prepare;
if (!callback && dev->driver && dev->driver->pm)
callback = dev->driver->pm->prepare;
if (callback)
ret = callback(dev);
unlock:
device_unlock(dev);
if (ret < 0) {
suspend_report_result(callback, ret);
pm_runtime_put(dev);
return ret;
}
/*
* A positive return value from ->prepare() means "this device appears
* to be runtime-suspended and its state is fine, so if it really is
* runtime-suspended, you can leave it in that state provided that you
* will do the same thing with all of its descendants". This only
* applies to suspend transitions, however.
*/
spin_lock_irq(&dev->power.lock);
dev->power.direct_complete = state.event == PM_EVENT_SUSPEND &&
(ret > 0 || dev->power.no_pm_callbacks) &&
!dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE);
spin_unlock_irq(&dev->power.lock);
return 0;
}
/**
* dpm_prepare - Prepare all non-sysdev devices for a system PM transition.
* @state: PM transition of the system being carried out.
*
* Execute the ->prepare() callback(s) for all devices.
*/
int dpm_prepare(pm_message_t state)
{
int error = 0;
trace_suspend_resume(TPS("dpm_prepare"), state.event, true);
might_sleep();
/*
* Give a chance for the known devices to complete their probes, before
* disable probing of devices. This sync point is important at least
* at boot time + hibernation restore.
*/
wait_for_device_probe();
/*
* It is unsafe if probing of devices will happen during suspend or
* hibernation and system behavior will be unpredictable in this case.
* So, let's prohibit device's probing here and defer their probes
* instead. The normal behavior will be restored in dpm_complete().
*/
device_block_probing();
mutex_lock(&dpm_list_mtx);
while (!list_empty(&dpm_list) && !error) {
struct device *dev = to_device(dpm_list.next);
get_device(dev);
mutex_unlock(&dpm_list_mtx);
trace_device_pm_callback_start(dev, "", state.event);
error = device_prepare(dev, state);
trace_device_pm_callback_end(dev, error);
mutex_lock(&dpm_list_mtx);
if (!error) {
dev->power.is_prepared = true;
if (!list_empty(&dev->power.entry))
list_move_tail(&dev->power.entry, &dpm_prepared_list);
} else if (error == -EAGAIN) {
error = 0;
} else {
dev_info(dev, "not prepared for power transition: code %d\n",
error);
}
mutex_unlock(&dpm_list_mtx);
put_device(dev);
mutex_lock(&dpm_list_mtx);
}
mutex_unlock(&dpm_list_mtx);
trace_suspend_resume(TPS("dpm_prepare"), state.event, false);
return error;
}
/**
* dpm_suspend_start - Prepare devices for PM transition and suspend them.
* @state: PM transition of the system being carried out.
*
* Prepare all non-sysdev devices for system PM transition and execute "suspend"
* callbacks for them.
*/
int dpm_suspend_start(pm_message_t state)
{
ktime_t starttime = ktime_get();
int error;
error = dpm_prepare(state);
if (error) {
suspend_stats.failed_prepare++;
dpm_save_failed_step(SUSPEND_PREPARE);
} else
error = dpm_suspend(state);
dpm_show_time(starttime, state, error, "start");
return error;
}
EXPORT_SYMBOL_GPL(dpm_suspend_start);
void __suspend_report_result(const char *function, void *fn, int ret)
{
if (ret)
pr_err("%s(): %pS returns %d\n", function, fn, ret);
}
EXPORT_SYMBOL_GPL(__suspend_report_result);
/**
* device_pm_wait_for_dev - Wait for suspend/resume of a device to complete.
* @subordinate: Device that needs to wait for @dev.
* @dev: Device to wait for.
*/
int device_pm_wait_for_dev(struct device *subordinate, struct device *dev)
{
dpm_wait(dev, subordinate->power.async_suspend);
return async_error;
}
EXPORT_SYMBOL_GPL(device_pm_wait_for_dev);
/**
* dpm_for_each_dev - device iterator.
* @data: data for the callback.
* @fn: function to be called for each device.
*
* Iterate over devices in dpm_list, and call @fn for each device,
* passing it @data.
*/
void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *))
{
struct device *dev;
if (!fn)
return;
device_pm_lock();
list_for_each_entry(dev, &dpm_list, power.entry)
fn(dev, data);
device_pm_unlock();
}
EXPORT_SYMBOL_GPL(dpm_for_each_dev);
static bool pm_ops_is_empty(const struct dev_pm_ops *ops)
{
if (!ops)
return true;
return !ops->prepare && !ops->suspend && !ops->suspend_late && !ops->suspend_noirq && !ops->resume_noirq && !ops->resume_early && !ops->resume && !ops->complete;
}
void device_pm_check_callbacks(struct device *dev)
{
unsigned long flags;
spin_lock_irqsave(&dev->power.lock, flags);
dev->power.no_pm_callbacks = (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && !dev->bus->suspend && !dev->bus->resume)) && (!dev->class || pm_ops_is_empty(dev->class->pm)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && !dev->driver->suspend && !dev->driver->resume));
spin_unlock_irqrestore(&dev->power.lock, flags);
}
bool dev_pm_skip_suspend(struct device *dev)
{
return dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) &&
pm_runtime_status_suspended(dev);
}